{ "best_global_step": 20468, "best_metric": 0.11281616985797882, "best_model_checkpoint": "saves_bts_preliminary/base/llama-3.2-1b-instruct/train_qqp_42_1779207273/checkpoint-20468", "epoch": 5.0, "eval_steps": 10234, "global_step": 204665, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00012215083184716487, "grad_norm": 436.6144714355469, "learning_rate": 3.908731128157522e-10, "loss": 1.5874, "num_input_tokens_seen": 3200, "step": 5 }, { "epoch": 0.00024430166369432974, "grad_norm": 601.8504028320312, "learning_rate": 8.794645038354424e-10, "loss": 1.5585, "num_input_tokens_seen": 6528, "step": 10 }, { "epoch": 0.00036645249554149463, "grad_norm": 531.4441528320312, "learning_rate": 1.3680558948551327e-09, "loss": 1.5632, "num_input_tokens_seen": 9856, "step": 15 }, { "epoch": 0.0004886033273886595, "grad_norm": 527.0687255859375, "learning_rate": 1.8566472858748227e-09, "loss": 1.6307, "num_input_tokens_seen": 12864, "step": 20 }, { "epoch": 0.0006107541592358244, "grad_norm": 550.139892578125, "learning_rate": 2.345238676894513e-09, "loss": 1.559, "num_input_tokens_seen": 16768, "step": 25 }, { "epoch": 0.0007329049910829893, "grad_norm": 502.5528259277344, "learning_rate": 2.833830067914203e-09, "loss": 1.604, "num_input_tokens_seen": 19840, "step": 30 }, { "epoch": 0.0008550558229301541, "grad_norm": 617.6123046875, "learning_rate": 3.3224214589338933e-09, "loss": 1.4891, "num_input_tokens_seen": 23488, "step": 35 }, { "epoch": 0.000977206654777319, "grad_norm": 457.3122863769531, "learning_rate": 3.811012849953584e-09, "loss": 1.63, "num_input_tokens_seen": 26816, "step": 40 }, { "epoch": 0.001099357486624484, "grad_norm": 443.7096252441406, "learning_rate": 4.299604240973273e-09, "loss": 1.6983, "num_input_tokens_seen": 30464, "step": 45 }, { "epoch": 0.0012215083184716488, "grad_norm": 616.4371948242188, "learning_rate": 4.788195631992964e-09, "loss": 1.6001, "num_input_tokens_seen": 33856, "step": 50 }, { "epoch": 0.0013436591503188137, "grad_norm": 567.6763305664062, "learning_rate": 5.276787023012655e-09, "loss": 1.6222, "num_input_tokens_seen": 37248, "step": 55 }, { "epoch": 0.0014658099821659785, "grad_norm": 493.2958984375, "learning_rate": 5.7653784140323445e-09, "loss": 1.3772, "num_input_tokens_seen": 40832, "step": 60 }, { "epoch": 0.0015879608140131434, "grad_norm": 565.9519653320312, "learning_rate": 6.253969805052035e-09, "loss": 1.7392, "num_input_tokens_seen": 44032, "step": 65 }, { "epoch": 0.0017101116458603082, "grad_norm": 430.8811340332031, "learning_rate": 6.7425611960717245e-09, "loss": 1.5064, "num_input_tokens_seen": 46912, "step": 70 }, { "epoch": 0.0018322624777074733, "grad_norm": 476.9154052734375, "learning_rate": 7.231152587091415e-09, "loss": 1.5169, "num_input_tokens_seen": 49792, "step": 75 }, { "epoch": 0.001954413309554638, "grad_norm": 484.8668518066406, "learning_rate": 7.719743978111105e-09, "loss": 1.6833, "num_input_tokens_seen": 52800, "step": 80 }, { "epoch": 0.002076564141401803, "grad_norm": 539.1660766601562, "learning_rate": 8.208335369130795e-09, "loss": 1.7706, "num_input_tokens_seen": 56704, "step": 85 }, { "epoch": 0.002198714973248968, "grad_norm": 551.7511596679688, "learning_rate": 8.696926760150486e-09, "loss": 1.5604, "num_input_tokens_seen": 60096, "step": 90 }, { "epoch": 0.0023208658050961326, "grad_norm": 461.3867492675781, "learning_rate": 9.185518151170177e-09, "loss": 1.5772, "num_input_tokens_seen": 63296, "step": 95 }, { "epoch": 0.0024430166369432977, "grad_norm": 624.7465209960938, "learning_rate": 9.674109542189865e-09, "loss": 1.4907, "num_input_tokens_seen": 66816, "step": 100 }, { "epoch": 0.0025651674687904623, "grad_norm": 368.129638671875, "learning_rate": 1.0162700933209557e-08, "loss": 1.5834, "num_input_tokens_seen": 70272, "step": 105 }, { "epoch": 0.0026873183006376274, "grad_norm": 483.0669860839844, "learning_rate": 1.0651292324229246e-08, "loss": 1.6402, "num_input_tokens_seen": 73344, "step": 110 }, { "epoch": 0.0028094691324847924, "grad_norm": 391.2715148925781, "learning_rate": 1.1139883715248937e-08, "loss": 1.3031, "num_input_tokens_seen": 76480, "step": 115 }, { "epoch": 0.002931619964331957, "grad_norm": 569.3486328125, "learning_rate": 1.1628475106268627e-08, "loss": 1.8053, "num_input_tokens_seen": 79936, "step": 120 }, { "epoch": 0.003053770796179122, "grad_norm": 522.3450317382812, "learning_rate": 1.2117066497288317e-08, "loss": 1.5657, "num_input_tokens_seen": 83200, "step": 125 }, { "epoch": 0.0031759216280262867, "grad_norm": 409.33636474609375, "learning_rate": 1.2605657888308008e-08, "loss": 1.4574, "num_input_tokens_seen": 86720, "step": 130 }, { "epoch": 0.003298072459873452, "grad_norm": 452.3154296875, "learning_rate": 1.3094249279327697e-08, "loss": 1.4738, "num_input_tokens_seen": 89984, "step": 135 }, { "epoch": 0.0034202232917206164, "grad_norm": 526.6522827148438, "learning_rate": 1.3582840670347389e-08, "loss": 1.5492, "num_input_tokens_seen": 93120, "step": 140 }, { "epoch": 0.0035423741235677815, "grad_norm": 398.2308654785156, "learning_rate": 1.4071432061367078e-08, "loss": 1.5314, "num_input_tokens_seen": 97088, "step": 145 }, { "epoch": 0.0036645249554149465, "grad_norm": 432.50579833984375, "learning_rate": 1.4560023452386768e-08, "loss": 1.3488, "num_input_tokens_seen": 100160, "step": 150 }, { "epoch": 0.003786675787262111, "grad_norm": 392.37359619140625, "learning_rate": 1.504861484340646e-08, "loss": 1.3233, "num_input_tokens_seen": 103488, "step": 155 }, { "epoch": 0.003908826619109276, "grad_norm": 406.7005920410156, "learning_rate": 1.553720623442615e-08, "loss": 1.4491, "num_input_tokens_seen": 106816, "step": 160 }, { "epoch": 0.004030977450956441, "grad_norm": 440.5775451660156, "learning_rate": 1.6025797625445838e-08, "loss": 1.327, "num_input_tokens_seen": 110208, "step": 165 }, { "epoch": 0.004153128282803606, "grad_norm": 404.6108093261719, "learning_rate": 1.651438901646553e-08, "loss": 1.3947, "num_input_tokens_seen": 113408, "step": 170 }, { "epoch": 0.0042752791146507705, "grad_norm": 452.1753234863281, "learning_rate": 1.700298040748522e-08, "loss": 1.2108, "num_input_tokens_seen": 116288, "step": 175 }, { "epoch": 0.004397429946497936, "grad_norm": 450.4519348144531, "learning_rate": 1.749157179850491e-08, "loss": 1.3352, "num_input_tokens_seen": 119296, "step": 180 }, { "epoch": 0.004519580778345101, "grad_norm": 426.2240295410156, "learning_rate": 1.79801631895246e-08, "loss": 1.3764, "num_input_tokens_seen": 122624, "step": 185 }, { "epoch": 0.004641731610192265, "grad_norm": 432.450439453125, "learning_rate": 1.8468754580544288e-08, "loss": 1.4257, "num_input_tokens_seen": 126144, "step": 190 }, { "epoch": 0.00476388244203943, "grad_norm": 438.7889709472656, "learning_rate": 1.8957345971563982e-08, "loss": 1.3153, "num_input_tokens_seen": 131648, "step": 195 }, { "epoch": 0.004886033273886595, "grad_norm": 564.0687866210938, "learning_rate": 1.944593736258367e-08, "loss": 1.3291, "num_input_tokens_seen": 134912, "step": 200 }, { "epoch": 0.00500818410573376, "grad_norm": 329.5206604003906, "learning_rate": 1.9934528753603358e-08, "loss": 1.0956, "num_input_tokens_seen": 138304, "step": 205 }, { "epoch": 0.005130334937580925, "grad_norm": 308.6362609863281, "learning_rate": 2.0423120144623053e-08, "loss": 0.9825, "num_input_tokens_seen": 141888, "step": 210 }, { "epoch": 0.00525248576942809, "grad_norm": 405.0592956542969, "learning_rate": 2.091171153564274e-08, "loss": 0.9918, "num_input_tokens_seen": 145088, "step": 215 }, { "epoch": 0.005374636601275255, "grad_norm": 299.959228515625, "learning_rate": 2.1400302926662432e-08, "loss": 0.9288, "num_input_tokens_seen": 148480, "step": 220 }, { "epoch": 0.005496787433122419, "grad_norm": 297.4017333984375, "learning_rate": 2.1888894317682123e-08, "loss": 0.8789, "num_input_tokens_seen": 152256, "step": 225 }, { "epoch": 0.005618938264969585, "grad_norm": 289.1470947265625, "learning_rate": 2.237748570870181e-08, "loss": 0.9644, "num_input_tokens_seen": 155584, "step": 230 }, { "epoch": 0.0057410890968167495, "grad_norm": 353.2586364746094, "learning_rate": 2.2866077099721502e-08, "loss": 0.9388, "num_input_tokens_seen": 159104, "step": 235 }, { "epoch": 0.005863239928663914, "grad_norm": 282.7951965332031, "learning_rate": 2.335466849074119e-08, "loss": 1.0616, "num_input_tokens_seen": 162176, "step": 240 }, { "epoch": 0.005985390760511079, "grad_norm": 192.14768981933594, "learning_rate": 2.384325988176088e-08, "loss": 0.7593, "num_input_tokens_seen": 165440, "step": 245 }, { "epoch": 0.006107541592358244, "grad_norm": 278.6020812988281, "learning_rate": 2.4331851272780573e-08, "loss": 0.8342, "num_input_tokens_seen": 169088, "step": 250 }, { "epoch": 0.006229692424205409, "grad_norm": 374.0990905761719, "learning_rate": 2.482044266380026e-08, "loss": 0.8074, "num_input_tokens_seen": 172480, "step": 255 }, { "epoch": 0.0063518432560525735, "grad_norm": 201.84979248046875, "learning_rate": 2.5309034054819955e-08, "loss": 0.7764, "num_input_tokens_seen": 175680, "step": 260 }, { "epoch": 0.006473994087899739, "grad_norm": 327.6642150878906, "learning_rate": 2.5797625445839643e-08, "loss": 0.7722, "num_input_tokens_seen": 179008, "step": 265 }, { "epoch": 0.006596144919746904, "grad_norm": 248.3802490234375, "learning_rate": 2.628621683685933e-08, "loss": 0.7188, "num_input_tokens_seen": 182528, "step": 270 }, { "epoch": 0.006718295751594068, "grad_norm": 121.84048461914062, "learning_rate": 2.6774808227879026e-08, "loss": 0.4984, "num_input_tokens_seen": 186112, "step": 275 }, { "epoch": 0.006840446583441233, "grad_norm": 130.44570922851562, "learning_rate": 2.7263399618898714e-08, "loss": 0.4016, "num_input_tokens_seen": 189760, "step": 280 }, { "epoch": 0.006962597415288398, "grad_norm": 161.82168579101562, "learning_rate": 2.7751991009918405e-08, "loss": 0.4292, "num_input_tokens_seen": 193216, "step": 285 }, { "epoch": 0.007084748247135563, "grad_norm": 110.50860595703125, "learning_rate": 2.8240582400938093e-08, "loss": 0.3719, "num_input_tokens_seen": 196800, "step": 290 }, { "epoch": 0.007206899078982728, "grad_norm": 31.40703010559082, "learning_rate": 2.8729173791957784e-08, "loss": 0.266, "num_input_tokens_seen": 200000, "step": 295 }, { "epoch": 0.007329049910829893, "grad_norm": 96.29593658447266, "learning_rate": 2.9217765182977475e-08, "loss": 0.3052, "num_input_tokens_seen": 203392, "step": 300 }, { "epoch": 0.007451200742677058, "grad_norm": 96.10907745361328, "learning_rate": 2.9706356573997163e-08, "loss": 0.3312, "num_input_tokens_seen": 206656, "step": 305 }, { "epoch": 0.007573351574524222, "grad_norm": 32.14572525024414, "learning_rate": 3.0194947965016854e-08, "loss": 0.2604, "num_input_tokens_seen": 210368, "step": 310 }, { "epoch": 0.007695502406371387, "grad_norm": 61.403629302978516, "learning_rate": 3.068353935603654e-08, "loss": 0.3638, "num_input_tokens_seen": 213760, "step": 315 }, { "epoch": 0.007817653238218552, "grad_norm": 41.64677047729492, "learning_rate": 3.117213074705624e-08, "loss": 0.2793, "num_input_tokens_seen": 217152, "step": 320 }, { "epoch": 0.007939804070065717, "grad_norm": 59.9798583984375, "learning_rate": 3.1660722138075925e-08, "loss": 0.2819, "num_input_tokens_seen": 220928, "step": 325 }, { "epoch": 0.008061954901912883, "grad_norm": 45.30350875854492, "learning_rate": 3.214931352909562e-08, "loss": 0.2614, "num_input_tokens_seen": 225024, "step": 330 }, { "epoch": 0.008184105733760046, "grad_norm": 82.58984375, "learning_rate": 3.263790492011531e-08, "loss": 0.2997, "num_input_tokens_seen": 228416, "step": 335 }, { "epoch": 0.008306256565607212, "grad_norm": 63.688331604003906, "learning_rate": 3.3126496311134995e-08, "loss": 0.2678, "num_input_tokens_seen": 231552, "step": 340 }, { "epoch": 0.008428407397454377, "grad_norm": 56.17790222167969, "learning_rate": 3.361508770215469e-08, "loss": 0.2656, "num_input_tokens_seen": 234752, "step": 345 }, { "epoch": 0.008550558229301541, "grad_norm": 21.489974975585938, "learning_rate": 3.410367909317437e-08, "loss": 0.2684, "num_input_tokens_seen": 238016, "step": 350 }, { "epoch": 0.008672709061148707, "grad_norm": 24.420082092285156, "learning_rate": 3.4592270484194066e-08, "loss": 0.3036, "num_input_tokens_seen": 241536, "step": 355 }, { "epoch": 0.008794859892995872, "grad_norm": 40.783447265625, "learning_rate": 3.508086187521376e-08, "loss": 0.2641, "num_input_tokens_seen": 244928, "step": 360 }, { "epoch": 0.008917010724843036, "grad_norm": 28.45875358581543, "learning_rate": 3.556945326623344e-08, "loss": 0.2976, "num_input_tokens_seen": 248576, "step": 365 }, { "epoch": 0.009039161556690201, "grad_norm": 139.9940185546875, "learning_rate": 3.6058044657253136e-08, "loss": 0.3249, "num_input_tokens_seen": 251712, "step": 370 }, { "epoch": 0.009161312388537367, "grad_norm": 37.76359939575195, "learning_rate": 3.654663604827283e-08, "loss": 0.255, "num_input_tokens_seen": 255232, "step": 375 }, { "epoch": 0.00928346322038453, "grad_norm": 60.50453567504883, "learning_rate": 3.703522743929252e-08, "loss": 0.3003, "num_input_tokens_seen": 258880, "step": 380 }, { "epoch": 0.009405614052231696, "grad_norm": 55.451576232910156, "learning_rate": 3.7523818830312206e-08, "loss": 0.2777, "num_input_tokens_seen": 262336, "step": 385 }, { "epoch": 0.00952776488407886, "grad_norm": 74.9093246459961, "learning_rate": 3.80124102213319e-08, "loss": 0.2748, "num_input_tokens_seen": 265664, "step": 390 }, { "epoch": 0.009649915715926025, "grad_norm": 64.74765014648438, "learning_rate": 3.850100161235159e-08, "loss": 0.2474, "num_input_tokens_seen": 269056, "step": 395 }, { "epoch": 0.00977206654777319, "grad_norm": 52.95093536376953, "learning_rate": 3.898959300337128e-08, "loss": 0.2575, "num_input_tokens_seen": 272640, "step": 400 }, { "epoch": 0.009894217379620355, "grad_norm": 86.30628204345703, "learning_rate": 3.947818439439097e-08, "loss": 0.2952, "num_input_tokens_seen": 276032, "step": 405 }, { "epoch": 0.01001636821146752, "grad_norm": 41.927181243896484, "learning_rate": 3.996677578541066e-08, "loss": 0.2455, "num_input_tokens_seen": 279488, "step": 410 }, { "epoch": 0.010138519043314686, "grad_norm": 36.422889709472656, "learning_rate": 4.045536717643035e-08, "loss": 0.2665, "num_input_tokens_seen": 282752, "step": 415 }, { "epoch": 0.01026066987516185, "grad_norm": 105.75531768798828, "learning_rate": 4.094395856745004e-08, "loss": 0.244, "num_input_tokens_seen": 285952, "step": 420 }, { "epoch": 0.010382820707009015, "grad_norm": 115.46588897705078, "learning_rate": 4.143254995846973e-08, "loss": 0.2549, "num_input_tokens_seen": 289024, "step": 425 }, { "epoch": 0.01050497153885618, "grad_norm": 55.69449234008789, "learning_rate": 4.192114134948942e-08, "loss": 0.2537, "num_input_tokens_seen": 292352, "step": 430 }, { "epoch": 0.010627122370703344, "grad_norm": 37.521759033203125, "learning_rate": 4.240973274050911e-08, "loss": 0.2425, "num_input_tokens_seen": 295616, "step": 435 }, { "epoch": 0.01074927320255051, "grad_norm": 53.086669921875, "learning_rate": 4.28983241315288e-08, "loss": 0.2613, "num_input_tokens_seen": 299200, "step": 440 }, { "epoch": 0.010871424034397675, "grad_norm": 27.426273345947266, "learning_rate": 4.3386915522548495e-08, "loss": 0.2972, "num_input_tokens_seen": 302272, "step": 445 }, { "epoch": 0.010993574866244839, "grad_norm": 28.601957321166992, "learning_rate": 4.3875506913568176e-08, "loss": 0.2497, "num_input_tokens_seen": 305600, "step": 450 }, { "epoch": 0.011115725698092004, "grad_norm": 83.53900146484375, "learning_rate": 4.436409830458787e-08, "loss": 0.2463, "num_input_tokens_seen": 309760, "step": 455 }, { "epoch": 0.01123787652993917, "grad_norm": 36.838340759277344, "learning_rate": 4.4852689695607565e-08, "loss": 0.2568, "num_input_tokens_seen": 313856, "step": 460 }, { "epoch": 0.011360027361786334, "grad_norm": 37.25075149536133, "learning_rate": 4.5341281086627246e-08, "loss": 0.2602, "num_input_tokens_seen": 317696, "step": 465 }, { "epoch": 0.011482178193633499, "grad_norm": 53.051753997802734, "learning_rate": 4.582987247764694e-08, "loss": 0.2595, "num_input_tokens_seen": 320704, "step": 470 }, { "epoch": 0.011604329025480663, "grad_norm": 37.922607421875, "learning_rate": 4.6318463868666636e-08, "loss": 0.2515, "num_input_tokens_seen": 324160, "step": 475 }, { "epoch": 0.011726479857327828, "grad_norm": 40.48796081542969, "learning_rate": 4.680705525968632e-08, "loss": 0.2352, "num_input_tokens_seen": 327936, "step": 480 }, { "epoch": 0.011848630689174994, "grad_norm": 31.205713272094727, "learning_rate": 4.729564665070601e-08, "loss": 0.2296, "num_input_tokens_seen": 331264, "step": 485 }, { "epoch": 0.011970781521022157, "grad_norm": 108.14125061035156, "learning_rate": 4.7784238041725706e-08, "loss": 0.2708, "num_input_tokens_seen": 334592, "step": 490 }, { "epoch": 0.012092932352869323, "grad_norm": 19.461109161376953, "learning_rate": 4.8272829432745394e-08, "loss": 0.2384, "num_input_tokens_seen": 338688, "step": 495 }, { "epoch": 0.012215083184716488, "grad_norm": 97.08906555175781, "learning_rate": 4.876142082376508e-08, "loss": 0.2568, "num_input_tokens_seen": 341888, "step": 500 }, { "epoch": 0.012337234016563652, "grad_norm": 49.344146728515625, "learning_rate": 4.9250012214784776e-08, "loss": 0.2555, "num_input_tokens_seen": 345088, "step": 505 }, { "epoch": 0.012459384848410818, "grad_norm": 32.22880172729492, "learning_rate": 4.9738603605804464e-08, "loss": 0.2275, "num_input_tokens_seen": 348480, "step": 510 }, { "epoch": 0.012581535680257983, "grad_norm": 68.86043548583984, "learning_rate": 5.022719499682415e-08, "loss": 0.2045, "num_input_tokens_seen": 351936, "step": 515 }, { "epoch": 0.012703686512105147, "grad_norm": 18.139911651611328, "learning_rate": 5.071578638784385e-08, "loss": 0.2334, "num_input_tokens_seen": 355712, "step": 520 }, { "epoch": 0.012825837343952312, "grad_norm": 17.971744537353516, "learning_rate": 5.1204377778863535e-08, "loss": 0.2546, "num_input_tokens_seen": 359232, "step": 525 }, { "epoch": 0.012947988175799478, "grad_norm": 152.04522705078125, "learning_rate": 5.169296916988322e-08, "loss": 0.2338, "num_input_tokens_seen": 362624, "step": 530 }, { "epoch": 0.013070139007646642, "grad_norm": 22.91851234436035, "learning_rate": 5.218156056090292e-08, "loss": 0.2105, "num_input_tokens_seen": 366336, "step": 535 }, { "epoch": 0.013192289839493807, "grad_norm": 29.61043357849121, "learning_rate": 5.2670151951922605e-08, "loss": 0.1841, "num_input_tokens_seen": 370112, "step": 540 }, { "epoch": 0.013314440671340971, "grad_norm": 27.556501388549805, "learning_rate": 5.315874334294229e-08, "loss": 0.1867, "num_input_tokens_seen": 373120, "step": 545 }, { "epoch": 0.013436591503188136, "grad_norm": 48.11063766479492, "learning_rate": 5.364733473396198e-08, "loss": 0.2397, "num_input_tokens_seen": 376768, "step": 550 }, { "epoch": 0.013558742335035302, "grad_norm": 16.25624656677246, "learning_rate": 5.4135926124981675e-08, "loss": 0.1875, "num_input_tokens_seen": 380544, "step": 555 }, { "epoch": 0.013680893166882466, "grad_norm": 26.067886352539062, "learning_rate": 5.462451751600137e-08, "loss": 0.224, "num_input_tokens_seen": 383552, "step": 560 }, { "epoch": 0.013803043998729631, "grad_norm": 61.1496467590332, "learning_rate": 5.511310890702105e-08, "loss": 0.2137, "num_input_tokens_seen": 387456, "step": 565 }, { "epoch": 0.013925194830576797, "grad_norm": 23.894100189208984, "learning_rate": 5.5601700298040746e-08, "loss": 0.1913, "num_input_tokens_seen": 391616, "step": 570 }, { "epoch": 0.01404734566242396, "grad_norm": 26.000904083251953, "learning_rate": 5.609029168906044e-08, "loss": 0.1981, "num_input_tokens_seen": 395392, "step": 575 }, { "epoch": 0.014169496494271126, "grad_norm": 56.60203170776367, "learning_rate": 5.657888308008012e-08, "loss": 0.1901, "num_input_tokens_seen": 400384, "step": 580 }, { "epoch": 0.014291647326118291, "grad_norm": 71.11356353759766, "learning_rate": 5.7067474471099816e-08, "loss": 0.2619, "num_input_tokens_seen": 403392, "step": 585 }, { "epoch": 0.014413798157965455, "grad_norm": 23.953935623168945, "learning_rate": 5.755606586211951e-08, "loss": 0.1605, "num_input_tokens_seen": 406592, "step": 590 }, { "epoch": 0.01453594898981262, "grad_norm": 69.51998138427734, "learning_rate": 5.804465725313919e-08, "loss": 0.1769, "num_input_tokens_seen": 409920, "step": 595 }, { "epoch": 0.014658099821659786, "grad_norm": 25.43825912475586, "learning_rate": 5.853324864415889e-08, "loss": 0.2082, "num_input_tokens_seen": 413440, "step": 600 }, { "epoch": 0.01478025065350695, "grad_norm": 39.317501068115234, "learning_rate": 5.902184003517858e-08, "loss": 0.1786, "num_input_tokens_seen": 416896, "step": 605 }, { "epoch": 0.014902401485354115, "grad_norm": 34.33527374267578, "learning_rate": 5.951043142619827e-08, "loss": 0.182, "num_input_tokens_seen": 420416, "step": 610 }, { "epoch": 0.015024552317201281, "grad_norm": 21.858205795288086, "learning_rate": 5.999902281721795e-08, "loss": 0.1692, "num_input_tokens_seen": 423936, "step": 615 }, { "epoch": 0.015146703149048445, "grad_norm": 32.28944396972656, "learning_rate": 6.048761420823765e-08, "loss": 0.2483, "num_input_tokens_seen": 427776, "step": 620 }, { "epoch": 0.01526885398089561, "grad_norm": 52.35483932495117, "learning_rate": 6.097620559925734e-08, "loss": 0.1979, "num_input_tokens_seen": 431616, "step": 625 }, { "epoch": 0.015391004812742774, "grad_norm": 16.09345245361328, "learning_rate": 6.146479699027702e-08, "loss": 0.2048, "num_input_tokens_seen": 434944, "step": 630 }, { "epoch": 0.01551315564458994, "grad_norm": 19.902141571044922, "learning_rate": 6.195338838129672e-08, "loss": 0.2058, "num_input_tokens_seen": 438272, "step": 635 }, { "epoch": 0.015635306476437103, "grad_norm": 20.941940307617188, "learning_rate": 6.244197977231641e-08, "loss": 0.2192, "num_input_tokens_seen": 441792, "step": 640 }, { "epoch": 0.01575745730828427, "grad_norm": 26.100826263427734, "learning_rate": 6.293057116333609e-08, "loss": 0.2148, "num_input_tokens_seen": 444864, "step": 645 }, { "epoch": 0.015879608140131434, "grad_norm": 78.05301666259766, "learning_rate": 6.34191625543558e-08, "loss": 0.153, "num_input_tokens_seen": 448512, "step": 650 }, { "epoch": 0.0160017589719786, "grad_norm": 32.38904571533203, "learning_rate": 6.390775394537548e-08, "loss": 0.1999, "num_input_tokens_seen": 451456, "step": 655 }, { "epoch": 0.016123909803825765, "grad_norm": 88.80257415771484, "learning_rate": 6.439634533639516e-08, "loss": 0.205, "num_input_tokens_seen": 454912, "step": 660 }, { "epoch": 0.01624606063567293, "grad_norm": 20.400436401367188, "learning_rate": 6.488493672741487e-08, "loss": 0.1934, "num_input_tokens_seen": 458688, "step": 665 }, { "epoch": 0.016368211467520093, "grad_norm": 59.74924850463867, "learning_rate": 6.537352811843455e-08, "loss": 0.1651, "num_input_tokens_seen": 461504, "step": 670 }, { "epoch": 0.016490362299367258, "grad_norm": 88.08880615234375, "learning_rate": 6.586211950945423e-08, "loss": 0.2466, "num_input_tokens_seen": 464512, "step": 675 }, { "epoch": 0.016612513131214424, "grad_norm": 25.69615936279297, "learning_rate": 6.635071090047394e-08, "loss": 0.2258, "num_input_tokens_seen": 467584, "step": 680 }, { "epoch": 0.01673466396306159, "grad_norm": 54.38239288330078, "learning_rate": 6.683930229149362e-08, "loss": 0.2461, "num_input_tokens_seen": 470848, "step": 685 }, { "epoch": 0.016856814794908755, "grad_norm": 136.9866943359375, "learning_rate": 6.73278936825133e-08, "loss": 0.216, "num_input_tokens_seen": 473920, "step": 690 }, { "epoch": 0.016978965626755917, "grad_norm": 44.14219665527344, "learning_rate": 6.781648507353301e-08, "loss": 0.2186, "num_input_tokens_seen": 477184, "step": 695 }, { "epoch": 0.017101116458603082, "grad_norm": 53.0874137878418, "learning_rate": 6.830507646455269e-08, "loss": 0.2767, "num_input_tokens_seen": 482688, "step": 700 }, { "epoch": 0.017223267290450248, "grad_norm": 41.03855514526367, "learning_rate": 6.879366785557237e-08, "loss": 0.1857, "num_input_tokens_seen": 485888, "step": 705 }, { "epoch": 0.017345418122297413, "grad_norm": 35.35697555541992, "learning_rate": 6.928225924659208e-08, "loss": 0.206, "num_input_tokens_seen": 489536, "step": 710 }, { "epoch": 0.01746756895414458, "grad_norm": 41.01392364501953, "learning_rate": 6.977085063761176e-08, "loss": 0.1713, "num_input_tokens_seen": 493056, "step": 715 }, { "epoch": 0.017589719785991744, "grad_norm": 80.70348358154297, "learning_rate": 7.025944202863144e-08, "loss": 0.2117, "num_input_tokens_seen": 496256, "step": 720 }, { "epoch": 0.017711870617838906, "grad_norm": 56.13608169555664, "learning_rate": 7.074803341965115e-08, "loss": 0.1537, "num_input_tokens_seen": 500160, "step": 725 }, { "epoch": 0.01783402144968607, "grad_norm": 78.34611511230469, "learning_rate": 7.123662481067083e-08, "loss": 0.2369, "num_input_tokens_seen": 503488, "step": 730 }, { "epoch": 0.017956172281533237, "grad_norm": 35.684146881103516, "learning_rate": 7.172521620169051e-08, "loss": 0.2371, "num_input_tokens_seen": 506560, "step": 735 }, { "epoch": 0.018078323113380403, "grad_norm": 27.17791748046875, "learning_rate": 7.221380759271022e-08, "loss": 0.1702, "num_input_tokens_seen": 509888, "step": 740 }, { "epoch": 0.018200473945227568, "grad_norm": 47.520076751708984, "learning_rate": 7.27023989837299e-08, "loss": 0.2225, "num_input_tokens_seen": 512960, "step": 745 }, { "epoch": 0.018322624777074734, "grad_norm": 33.98930358886719, "learning_rate": 7.31909903747496e-08, "loss": 0.2422, "num_input_tokens_seen": 516416, "step": 750 }, { "epoch": 0.018444775608921896, "grad_norm": 82.38633728027344, "learning_rate": 7.367958176576929e-08, "loss": 0.2441, "num_input_tokens_seen": 519680, "step": 755 }, { "epoch": 0.01856692644076906, "grad_norm": 45.27608108520508, "learning_rate": 7.416817315678897e-08, "loss": 0.1828, "num_input_tokens_seen": 522944, "step": 760 }, { "epoch": 0.018689077272616227, "grad_norm": 24.22097396850586, "learning_rate": 7.465676454780867e-08, "loss": 0.1292, "num_input_tokens_seen": 526144, "step": 765 }, { "epoch": 0.018811228104463392, "grad_norm": 79.6414566040039, "learning_rate": 7.514535593882836e-08, "loss": 0.1908, "num_input_tokens_seen": 529152, "step": 770 }, { "epoch": 0.018933378936310558, "grad_norm": 120.36722564697266, "learning_rate": 7.563394732984804e-08, "loss": 0.2619, "num_input_tokens_seen": 532608, "step": 775 }, { "epoch": 0.01905552976815772, "grad_norm": 141.484130859375, "learning_rate": 7.612253872086774e-08, "loss": 0.1843, "num_input_tokens_seen": 536320, "step": 780 }, { "epoch": 0.019177680600004885, "grad_norm": 50.55957794189453, "learning_rate": 7.661113011188742e-08, "loss": 0.1081, "num_input_tokens_seen": 539520, "step": 785 }, { "epoch": 0.01929983143185205, "grad_norm": 95.52667236328125, "learning_rate": 7.709972150290711e-08, "loss": 0.2077, "num_input_tokens_seen": 542848, "step": 790 }, { "epoch": 0.019421982263699216, "grad_norm": 58.965370178222656, "learning_rate": 7.758831289392681e-08, "loss": 0.1838, "num_input_tokens_seen": 546304, "step": 795 }, { "epoch": 0.01954413309554638, "grad_norm": 50.59172058105469, "learning_rate": 7.807690428494649e-08, "loss": 0.2594, "num_input_tokens_seen": 549120, "step": 800 }, { "epoch": 0.019666283927393547, "grad_norm": 71.519775390625, "learning_rate": 7.856549567596618e-08, "loss": 0.1731, "num_input_tokens_seen": 552832, "step": 805 }, { "epoch": 0.01978843475924071, "grad_norm": 29.882675170898438, "learning_rate": 7.905408706698588e-08, "loss": 0.1918, "num_input_tokens_seen": 556224, "step": 810 }, { "epoch": 0.019910585591087875, "grad_norm": 46.32589340209961, "learning_rate": 7.954267845800556e-08, "loss": 0.1981, "num_input_tokens_seen": 559552, "step": 815 }, { "epoch": 0.02003273642293504, "grad_norm": 21.45256233215332, "learning_rate": 8.003126984902525e-08, "loss": 0.1791, "num_input_tokens_seen": 562752, "step": 820 }, { "epoch": 0.020154887254782206, "grad_norm": 80.42591094970703, "learning_rate": 8.051986124004495e-08, "loss": 0.1723, "num_input_tokens_seen": 566080, "step": 825 }, { "epoch": 0.02027703808662937, "grad_norm": 31.974721908569336, "learning_rate": 8.100845263106463e-08, "loss": 0.2432, "num_input_tokens_seen": 569408, "step": 830 }, { "epoch": 0.020399188918476537, "grad_norm": 23.089448928833008, "learning_rate": 8.149704402208433e-08, "loss": 0.1827, "num_input_tokens_seen": 572928, "step": 835 }, { "epoch": 0.0205213397503237, "grad_norm": 53.952659606933594, "learning_rate": 8.198563541310402e-08, "loss": 0.256, "num_input_tokens_seen": 576448, "step": 840 }, { "epoch": 0.020643490582170864, "grad_norm": 49.31153106689453, "learning_rate": 8.24742268041237e-08, "loss": 0.2329, "num_input_tokens_seen": 579712, "step": 845 }, { "epoch": 0.02076564141401803, "grad_norm": 26.863073348999023, "learning_rate": 8.29628181951434e-08, "loss": 0.1903, "num_input_tokens_seen": 583104, "step": 850 }, { "epoch": 0.020887792245865195, "grad_norm": 176.17437744140625, "learning_rate": 8.345140958616309e-08, "loss": 0.2145, "num_input_tokens_seen": 586560, "step": 855 }, { "epoch": 0.02100994307771236, "grad_norm": 75.82144927978516, "learning_rate": 8.394000097718277e-08, "loss": 0.1631, "num_input_tokens_seen": 590336, "step": 860 }, { "epoch": 0.021132093909559523, "grad_norm": 50.85761260986328, "learning_rate": 8.442859236820248e-08, "loss": 0.2079, "num_input_tokens_seen": 593728, "step": 865 }, { "epoch": 0.021254244741406688, "grad_norm": 33.126644134521484, "learning_rate": 8.491718375922216e-08, "loss": 0.1904, "num_input_tokens_seen": 597056, "step": 870 }, { "epoch": 0.021376395573253854, "grad_norm": 38.85331344604492, "learning_rate": 8.540577515024184e-08, "loss": 0.1724, "num_input_tokens_seen": 600384, "step": 875 }, { "epoch": 0.02149854640510102, "grad_norm": 31.606834411621094, "learning_rate": 8.589436654126155e-08, "loss": 0.137, "num_input_tokens_seen": 603968, "step": 880 }, { "epoch": 0.021620697236948185, "grad_norm": 72.67549896240234, "learning_rate": 8.638295793228123e-08, "loss": 0.2714, "num_input_tokens_seen": 607360, "step": 885 }, { "epoch": 0.02174284806879535, "grad_norm": 35.970947265625, "learning_rate": 8.687154932330091e-08, "loss": 0.2029, "num_input_tokens_seen": 611008, "step": 890 }, { "epoch": 0.021864998900642512, "grad_norm": 43.83979034423828, "learning_rate": 8.736014071432062e-08, "loss": 0.267, "num_input_tokens_seen": 614080, "step": 895 }, { "epoch": 0.021987149732489678, "grad_norm": 43.2574577331543, "learning_rate": 8.78487321053403e-08, "loss": 0.2438, "num_input_tokens_seen": 617408, "step": 900 }, { "epoch": 0.022109300564336843, "grad_norm": 25.0458984375, "learning_rate": 8.833732349635998e-08, "loss": 0.2295, "num_input_tokens_seen": 621056, "step": 905 }, { "epoch": 0.02223145139618401, "grad_norm": 79.44489288330078, "learning_rate": 8.882591488737969e-08, "loss": 0.1759, "num_input_tokens_seen": 624064, "step": 910 }, { "epoch": 0.022353602228031174, "grad_norm": 126.1421127319336, "learning_rate": 8.931450627839937e-08, "loss": 0.1532, "num_input_tokens_seen": 627520, "step": 915 }, { "epoch": 0.02247575305987834, "grad_norm": 57.74498748779297, "learning_rate": 8.980309766941905e-08, "loss": 0.1624, "num_input_tokens_seen": 631168, "step": 920 }, { "epoch": 0.0225979038917255, "grad_norm": 91.72770690917969, "learning_rate": 9.029168906043876e-08, "loss": 0.1618, "num_input_tokens_seen": 634240, "step": 925 }, { "epoch": 0.022720054723572667, "grad_norm": 32.56623458862305, "learning_rate": 9.078028045145844e-08, "loss": 0.1677, "num_input_tokens_seen": 637696, "step": 930 }, { "epoch": 0.022842205555419832, "grad_norm": 51.39784622192383, "learning_rate": 9.126887184247812e-08, "loss": 0.1632, "num_input_tokens_seen": 641216, "step": 935 }, { "epoch": 0.022964356387266998, "grad_norm": 81.02871704101562, "learning_rate": 9.175746323349783e-08, "loss": 0.1576, "num_input_tokens_seen": 644544, "step": 940 }, { "epoch": 0.023086507219114163, "grad_norm": 54.13098907470703, "learning_rate": 9.224605462451751e-08, "loss": 0.2266, "num_input_tokens_seen": 647552, "step": 945 }, { "epoch": 0.023208658050961326, "grad_norm": 20.77579689025879, "learning_rate": 9.27346460155372e-08, "loss": 0.2041, "num_input_tokens_seen": 651264, "step": 950 }, { "epoch": 0.02333080888280849, "grad_norm": 88.34909057617188, "learning_rate": 9.32232374065569e-08, "loss": 0.1553, "num_input_tokens_seen": 654528, "step": 955 }, { "epoch": 0.023452959714655656, "grad_norm": 65.97145080566406, "learning_rate": 9.371182879757658e-08, "loss": 0.2058, "num_input_tokens_seen": 658048, "step": 960 }, { "epoch": 0.023575110546502822, "grad_norm": 106.45439147949219, "learning_rate": 9.420042018859626e-08, "loss": 0.1971, "num_input_tokens_seen": 661312, "step": 965 }, { "epoch": 0.023697261378349987, "grad_norm": 55.05768585205078, "learning_rate": 9.468901157961596e-08, "loss": 0.1805, "num_input_tokens_seen": 664128, "step": 970 }, { "epoch": 0.023819412210197153, "grad_norm": 78.96724700927734, "learning_rate": 9.517760297063565e-08, "loss": 0.2021, "num_input_tokens_seen": 667328, "step": 975 }, { "epoch": 0.023941563042044315, "grad_norm": 164.31275939941406, "learning_rate": 9.566619436165535e-08, "loss": 0.1853, "num_input_tokens_seen": 670464, "step": 980 }, { "epoch": 0.02406371387389148, "grad_norm": 54.7586555480957, "learning_rate": 9.615478575267503e-08, "loss": 0.1505, "num_input_tokens_seen": 673984, "step": 985 }, { "epoch": 0.024185864705738646, "grad_norm": 41.857547760009766, "learning_rate": 9.664337714369472e-08, "loss": 0.1624, "num_input_tokens_seen": 677056, "step": 990 }, { "epoch": 0.02430801553758581, "grad_norm": 81.85855865478516, "learning_rate": 9.713196853471442e-08, "loss": 0.1981, "num_input_tokens_seen": 680704, "step": 995 }, { "epoch": 0.024430166369432977, "grad_norm": 40.62771987915039, "learning_rate": 9.76205599257341e-08, "loss": 0.2078, "num_input_tokens_seen": 683968, "step": 1000 }, { "epoch": 0.024552317201280142, "grad_norm": 82.70243835449219, "learning_rate": 9.81091513167538e-08, "loss": 0.1137, "num_input_tokens_seen": 687552, "step": 1005 }, { "epoch": 0.024674468033127304, "grad_norm": 57.07538604736328, "learning_rate": 9.859774270777349e-08, "loss": 0.1055, "num_input_tokens_seen": 690624, "step": 1010 }, { "epoch": 0.02479661886497447, "grad_norm": 108.51544952392578, "learning_rate": 9.908633409879317e-08, "loss": 0.1542, "num_input_tokens_seen": 694016, "step": 1015 }, { "epoch": 0.024918769696821635, "grad_norm": 94.2655029296875, "learning_rate": 9.957492548981286e-08, "loss": 0.1864, "num_input_tokens_seen": 697152, "step": 1020 }, { "epoch": 0.0250409205286688, "grad_norm": 49.298973083496094, "learning_rate": 1.0006351688083256e-07, "loss": 0.2495, "num_input_tokens_seen": 700672, "step": 1025 }, { "epoch": 0.025163071360515966, "grad_norm": 151.5254669189453, "learning_rate": 1.0055210827185224e-07, "loss": 0.1592, "num_input_tokens_seen": 704256, "step": 1030 }, { "epoch": 0.02528522219236313, "grad_norm": 86.0088882446289, "learning_rate": 1.0104069966287194e-07, "loss": 0.1961, "num_input_tokens_seen": 707712, "step": 1035 }, { "epoch": 0.025407373024210294, "grad_norm": 58.221675872802734, "learning_rate": 1.0152929105389163e-07, "loss": 0.1893, "num_input_tokens_seen": 711232, "step": 1040 }, { "epoch": 0.02552952385605746, "grad_norm": 94.9822998046875, "learning_rate": 1.0201788244491131e-07, "loss": 0.1576, "num_input_tokens_seen": 714560, "step": 1045 }, { "epoch": 0.025651674687904625, "grad_norm": 23.00892448425293, "learning_rate": 1.02506473835931e-07, "loss": 0.1167, "num_input_tokens_seen": 718144, "step": 1050 }, { "epoch": 0.02577382551975179, "grad_norm": 64.81526184082031, "learning_rate": 1.029950652269507e-07, "loss": 0.1414, "num_input_tokens_seen": 721600, "step": 1055 }, { "epoch": 0.025895976351598956, "grad_norm": 143.27706909179688, "learning_rate": 1.0348365661797038e-07, "loss": 0.3008, "num_input_tokens_seen": 724928, "step": 1060 }, { "epoch": 0.026018127183446118, "grad_norm": 167.41702270507812, "learning_rate": 1.0397224800899008e-07, "loss": 0.218, "num_input_tokens_seen": 728704, "step": 1065 }, { "epoch": 0.026140278015293283, "grad_norm": 19.40850830078125, "learning_rate": 1.0446083940000977e-07, "loss": 0.1445, "num_input_tokens_seen": 731968, "step": 1070 }, { "epoch": 0.02626242884714045, "grad_norm": 110.09046936035156, "learning_rate": 1.0494943079102945e-07, "loss": 0.1742, "num_input_tokens_seen": 734976, "step": 1075 }, { "epoch": 0.026384579678987614, "grad_norm": 169.50914001464844, "learning_rate": 1.0543802218204915e-07, "loss": 0.2347, "num_input_tokens_seen": 738176, "step": 1080 }, { "epoch": 0.02650673051083478, "grad_norm": 37.02994155883789, "learning_rate": 1.0592661357306884e-07, "loss": 0.1286, "num_input_tokens_seen": 742080, "step": 1085 }, { "epoch": 0.026628881342681942, "grad_norm": 42.34803009033203, "learning_rate": 1.0641520496408852e-07, "loss": 0.1454, "num_input_tokens_seen": 745152, "step": 1090 }, { "epoch": 0.026751032174529107, "grad_norm": 60.0620231628418, "learning_rate": 1.0690379635510823e-07, "loss": 0.2185, "num_input_tokens_seen": 748608, "step": 1095 }, { "epoch": 0.026873183006376273, "grad_norm": 115.29000091552734, "learning_rate": 1.0739238774612791e-07, "loss": 0.2498, "num_input_tokens_seen": 752192, "step": 1100 }, { "epoch": 0.02699533383822344, "grad_norm": 37.201820373535156, "learning_rate": 1.0788097913714759e-07, "loss": 0.2033, "num_input_tokens_seen": 756096, "step": 1105 }, { "epoch": 0.027117484670070604, "grad_norm": 39.57706069946289, "learning_rate": 1.083695705281673e-07, "loss": 0.1271, "num_input_tokens_seen": 759296, "step": 1110 }, { "epoch": 0.02723963550191777, "grad_norm": 32.63840103149414, "learning_rate": 1.0885816191918698e-07, "loss": 0.1702, "num_input_tokens_seen": 762368, "step": 1115 }, { "epoch": 0.02736178633376493, "grad_norm": 236.14743041992188, "learning_rate": 1.0934675331020666e-07, "loss": 0.1896, "num_input_tokens_seen": 765568, "step": 1120 }, { "epoch": 0.027483937165612097, "grad_norm": 117.73133850097656, "learning_rate": 1.0983534470122637e-07, "loss": 0.1672, "num_input_tokens_seen": 768960, "step": 1125 }, { "epoch": 0.027606087997459262, "grad_norm": 47.201210021972656, "learning_rate": 1.1032393609224605e-07, "loss": 0.1767, "num_input_tokens_seen": 772352, "step": 1130 }, { "epoch": 0.027728238829306428, "grad_norm": 95.38451385498047, "learning_rate": 1.1081252748326573e-07, "loss": 0.2014, "num_input_tokens_seen": 775616, "step": 1135 }, { "epoch": 0.027850389661153593, "grad_norm": 70.23417663574219, "learning_rate": 1.1130111887428544e-07, "loss": 0.2096, "num_input_tokens_seen": 778880, "step": 1140 }, { "epoch": 0.02797254049300076, "grad_norm": 115.88958740234375, "learning_rate": 1.1178971026530512e-07, "loss": 0.161, "num_input_tokens_seen": 781824, "step": 1145 }, { "epoch": 0.02809469132484792, "grad_norm": 71.50474548339844, "learning_rate": 1.122783016563248e-07, "loss": 0.1846, "num_input_tokens_seen": 785152, "step": 1150 }, { "epoch": 0.028216842156695086, "grad_norm": 93.5502700805664, "learning_rate": 1.1276689304734451e-07, "loss": 0.2438, "num_input_tokens_seen": 788160, "step": 1155 }, { "epoch": 0.028338992988542252, "grad_norm": 137.8145294189453, "learning_rate": 1.1325548443836419e-07, "loss": 0.1536, "num_input_tokens_seen": 791808, "step": 1160 }, { "epoch": 0.028461143820389417, "grad_norm": 121.41155242919922, "learning_rate": 1.1374407582938387e-07, "loss": 0.1637, "num_input_tokens_seen": 795392, "step": 1165 }, { "epoch": 0.028583294652236583, "grad_norm": 95.23970031738281, "learning_rate": 1.1423266722040357e-07, "loss": 0.2223, "num_input_tokens_seen": 799168, "step": 1170 }, { "epoch": 0.028705445484083745, "grad_norm": 89.20221710205078, "learning_rate": 1.1472125861142326e-07, "loss": 0.2553, "num_input_tokens_seen": 802624, "step": 1175 }, { "epoch": 0.02882759631593091, "grad_norm": 78.99765014648438, "learning_rate": 1.1520985000244294e-07, "loss": 0.1225, "num_input_tokens_seen": 806400, "step": 1180 }, { "epoch": 0.028949747147778076, "grad_norm": 81.3134994506836, "learning_rate": 1.1569844139346264e-07, "loss": 0.1684, "num_input_tokens_seen": 810048, "step": 1185 }, { "epoch": 0.02907189797962524, "grad_norm": 69.7149429321289, "learning_rate": 1.1618703278448233e-07, "loss": 0.1959, "num_input_tokens_seen": 813248, "step": 1190 }, { "epoch": 0.029194048811472407, "grad_norm": 25.824722290039062, "learning_rate": 1.1667562417550202e-07, "loss": 0.1737, "num_input_tokens_seen": 816320, "step": 1195 }, { "epoch": 0.029316199643319572, "grad_norm": 26.50715446472168, "learning_rate": 1.1716421556652171e-07, "loss": 0.1948, "num_input_tokens_seen": 819584, "step": 1200 }, { "epoch": 0.029438350475166734, "grad_norm": 107.89574432373047, "learning_rate": 1.176528069575414e-07, "loss": 0.1995, "num_input_tokens_seen": 822400, "step": 1205 }, { "epoch": 0.0295605013070139, "grad_norm": 28.458755493164062, "learning_rate": 1.181413983485611e-07, "loss": 0.1063, "num_input_tokens_seen": 825728, "step": 1210 }, { "epoch": 0.029682652138861065, "grad_norm": 87.51367950439453, "learning_rate": 1.1862998973958078e-07, "loss": 0.2259, "num_input_tokens_seen": 828992, "step": 1215 }, { "epoch": 0.02980480297070823, "grad_norm": 91.14592742919922, "learning_rate": 1.1911858113060047e-07, "loss": 0.2222, "num_input_tokens_seen": 831936, "step": 1220 }, { "epoch": 0.029926953802555396, "grad_norm": 154.0288848876953, "learning_rate": 1.1960717252162018e-07, "loss": 0.2204, "num_input_tokens_seen": 835520, "step": 1225 }, { "epoch": 0.030049104634402562, "grad_norm": 110.25627136230469, "learning_rate": 1.2009576391263986e-07, "loss": 0.147, "num_input_tokens_seen": 838976, "step": 1230 }, { "epoch": 0.030171255466249724, "grad_norm": 39.7508659362793, "learning_rate": 1.2058435530365955e-07, "loss": 0.2507, "num_input_tokens_seen": 842368, "step": 1235 }, { "epoch": 0.03029340629809689, "grad_norm": 153.55630493164062, "learning_rate": 1.2107294669467925e-07, "loss": 0.1361, "num_input_tokens_seen": 845632, "step": 1240 }, { "epoch": 0.030415557129944055, "grad_norm": 138.62791442871094, "learning_rate": 1.2156153808569893e-07, "loss": 0.1966, "num_input_tokens_seen": 848832, "step": 1245 }, { "epoch": 0.03053770796179122, "grad_norm": 40.55439758300781, "learning_rate": 1.2205012947671862e-07, "loss": 0.1246, "num_input_tokens_seen": 852416, "step": 1250 }, { "epoch": 0.030659858793638386, "grad_norm": 140.769775390625, "learning_rate": 1.225387208677383e-07, "loss": 0.1815, "num_input_tokens_seen": 855808, "step": 1255 }, { "epoch": 0.030782009625485548, "grad_norm": 133.49143981933594, "learning_rate": 1.23027312258758e-07, "loss": 0.1261, "num_input_tokens_seen": 859008, "step": 1260 }, { "epoch": 0.030904160457332713, "grad_norm": 30.480289459228516, "learning_rate": 1.2351590364977769e-07, "loss": 0.0881, "num_input_tokens_seen": 862976, "step": 1265 }, { "epoch": 0.03102631128917988, "grad_norm": 44.4013671875, "learning_rate": 1.2400449504079737e-07, "loss": 0.2887, "num_input_tokens_seen": 866240, "step": 1270 }, { "epoch": 0.031148462121027044, "grad_norm": 83.47310638427734, "learning_rate": 1.2449308643181707e-07, "loss": 0.202, "num_input_tokens_seen": 869376, "step": 1275 }, { "epoch": 0.031270612952874206, "grad_norm": 83.847900390625, "learning_rate": 1.2498167782283676e-07, "loss": 0.1258, "num_input_tokens_seen": 872896, "step": 1280 }, { "epoch": 0.031392763784721375, "grad_norm": 147.70680236816406, "learning_rate": 1.2547026921385644e-07, "loss": 0.1506, "num_input_tokens_seen": 876416, "step": 1285 }, { "epoch": 0.03151491461656854, "grad_norm": 115.61612701416016, "learning_rate": 1.2595886060487615e-07, "loss": 0.2235, "num_input_tokens_seen": 879680, "step": 1290 }, { "epoch": 0.031637065448415706, "grad_norm": 119.4736328125, "learning_rate": 1.2644745199589583e-07, "loss": 0.1283, "num_input_tokens_seen": 883008, "step": 1295 }, { "epoch": 0.03175921628026287, "grad_norm": 43.5504035949707, "learning_rate": 1.269360433869155e-07, "loss": 0.2082, "num_input_tokens_seen": 886848, "step": 1300 }, { "epoch": 0.03188136711211003, "grad_norm": 124.5676040649414, "learning_rate": 1.2742463477793522e-07, "loss": 0.1701, "num_input_tokens_seen": 890176, "step": 1305 }, { "epoch": 0.0320035179439572, "grad_norm": 39.65419006347656, "learning_rate": 1.279132261689549e-07, "loss": 0.1525, "num_input_tokens_seen": 893184, "step": 1310 }, { "epoch": 0.03212566877580436, "grad_norm": 131.31710815429688, "learning_rate": 1.2840181755997458e-07, "loss": 0.1911, "num_input_tokens_seen": 896448, "step": 1315 }, { "epoch": 0.03224781960765153, "grad_norm": 267.3889465332031, "learning_rate": 1.2889040895099429e-07, "loss": 0.2272, "num_input_tokens_seen": 899840, "step": 1320 }, { "epoch": 0.03236997043949869, "grad_norm": 104.76681518554688, "learning_rate": 1.2937900034201397e-07, "loss": 0.1963, "num_input_tokens_seen": 902912, "step": 1325 }, { "epoch": 0.03249212127134586, "grad_norm": 31.133392333984375, "learning_rate": 1.2986759173303365e-07, "loss": 0.1968, "num_input_tokens_seen": 906688, "step": 1330 }, { "epoch": 0.03261427210319302, "grad_norm": 158.30206298828125, "learning_rate": 1.3035618312405336e-07, "loss": 0.3204, "num_input_tokens_seen": 910272, "step": 1335 }, { "epoch": 0.032736422935040185, "grad_norm": 117.29009246826172, "learning_rate": 1.3084477451507304e-07, "loss": 0.1802, "num_input_tokens_seen": 913472, "step": 1340 }, { "epoch": 0.032858573766887354, "grad_norm": 35.10454559326172, "learning_rate": 1.3133336590609272e-07, "loss": 0.1534, "num_input_tokens_seen": 916800, "step": 1345 }, { "epoch": 0.032980724598734516, "grad_norm": 66.56099700927734, "learning_rate": 1.3182195729711243e-07, "loss": 0.1086, "num_input_tokens_seen": 920384, "step": 1350 }, { "epoch": 0.033102875430581685, "grad_norm": 26.81028175354004, "learning_rate": 1.323105486881321e-07, "loss": 0.1473, "num_input_tokens_seen": 923456, "step": 1355 }, { "epoch": 0.03322502626242885, "grad_norm": 123.34690856933594, "learning_rate": 1.327991400791518e-07, "loss": 0.1892, "num_input_tokens_seen": 926848, "step": 1360 }, { "epoch": 0.03334717709427601, "grad_norm": 27.120784759521484, "learning_rate": 1.332877314701715e-07, "loss": 0.1372, "num_input_tokens_seen": 930560, "step": 1365 }, { "epoch": 0.03346932792612318, "grad_norm": 70.49962615966797, "learning_rate": 1.3377632286119118e-07, "loss": 0.2135, "num_input_tokens_seen": 934144, "step": 1370 }, { "epoch": 0.03359147875797034, "grad_norm": 45.35801696777344, "learning_rate": 1.3426491425221086e-07, "loss": 0.1484, "num_input_tokens_seen": 937088, "step": 1375 }, { "epoch": 0.03371362958981751, "grad_norm": 63.658634185791016, "learning_rate": 1.3475350564323057e-07, "loss": 0.2285, "num_input_tokens_seen": 940544, "step": 1380 }, { "epoch": 0.03383578042166467, "grad_norm": 62.41775131225586, "learning_rate": 1.3524209703425025e-07, "loss": 0.1582, "num_input_tokens_seen": 944192, "step": 1385 }, { "epoch": 0.03395793125351183, "grad_norm": 59.16685104370117, "learning_rate": 1.3573068842526993e-07, "loss": 0.1455, "num_input_tokens_seen": 947712, "step": 1390 }, { "epoch": 0.034080082085359, "grad_norm": 36.50937271118164, "learning_rate": 1.3621927981628964e-07, "loss": 0.193, "num_input_tokens_seen": 950784, "step": 1395 }, { "epoch": 0.034202232917206164, "grad_norm": 17.540971755981445, "learning_rate": 1.3670787120730932e-07, "loss": 0.1433, "num_input_tokens_seen": 954432, "step": 1400 }, { "epoch": 0.03432438374905333, "grad_norm": 82.31527709960938, "learning_rate": 1.37196462598329e-07, "loss": 0.1772, "num_input_tokens_seen": 957504, "step": 1405 }, { "epoch": 0.034446534580900495, "grad_norm": 201.3554229736328, "learning_rate": 1.376850539893487e-07, "loss": 0.1693, "num_input_tokens_seen": 960448, "step": 1410 }, { "epoch": 0.034568685412747664, "grad_norm": 87.75347137451172, "learning_rate": 1.381736453803684e-07, "loss": 0.1895, "num_input_tokens_seen": 963840, "step": 1415 }, { "epoch": 0.034690836244594826, "grad_norm": 111.88566589355469, "learning_rate": 1.3866223677138807e-07, "loss": 0.174, "num_input_tokens_seen": 967232, "step": 1420 }, { "epoch": 0.03481298707644199, "grad_norm": 114.49220275878906, "learning_rate": 1.3915082816240778e-07, "loss": 0.2, "num_input_tokens_seen": 971008, "step": 1425 }, { "epoch": 0.03493513790828916, "grad_norm": 30.848787307739258, "learning_rate": 1.3963941955342746e-07, "loss": 0.1766, "num_input_tokens_seen": 974784, "step": 1430 }, { "epoch": 0.03505728874013632, "grad_norm": 77.15776062011719, "learning_rate": 1.4012801094444714e-07, "loss": 0.243, "num_input_tokens_seen": 977792, "step": 1435 }, { "epoch": 0.03517943957198349, "grad_norm": 58.27136993408203, "learning_rate": 1.4061660233546685e-07, "loss": 0.14, "num_input_tokens_seen": 981056, "step": 1440 }, { "epoch": 0.03530159040383065, "grad_norm": 48.294612884521484, "learning_rate": 1.4110519372648653e-07, "loss": 0.1969, "num_input_tokens_seen": 984192, "step": 1445 }, { "epoch": 0.03542374123567781, "grad_norm": 29.22694969177246, "learning_rate": 1.415937851175062e-07, "loss": 0.1798, "num_input_tokens_seen": 987520, "step": 1450 }, { "epoch": 0.03554589206752498, "grad_norm": 66.20771789550781, "learning_rate": 1.4208237650852592e-07, "loss": 0.1784, "num_input_tokens_seen": 991168, "step": 1455 }, { "epoch": 0.03566804289937214, "grad_norm": 78.16349029541016, "learning_rate": 1.425709678995456e-07, "loss": 0.1598, "num_input_tokens_seen": 994496, "step": 1460 }, { "epoch": 0.03579019373121931, "grad_norm": 176.6148223876953, "learning_rate": 1.4305955929056528e-07, "loss": 0.1563, "num_input_tokens_seen": 998208, "step": 1465 }, { "epoch": 0.035912344563066474, "grad_norm": 46.208099365234375, "learning_rate": 1.43548150681585e-07, "loss": 0.159, "num_input_tokens_seen": 1001984, "step": 1470 }, { "epoch": 0.036034495394913636, "grad_norm": 50.82776641845703, "learning_rate": 1.4403674207260467e-07, "loss": 0.229, "num_input_tokens_seen": 1004992, "step": 1475 }, { "epoch": 0.036156646226760805, "grad_norm": 77.68282318115234, "learning_rate": 1.4452533346362435e-07, "loss": 0.1666, "num_input_tokens_seen": 1008320, "step": 1480 }, { "epoch": 0.03627879705860797, "grad_norm": 63.0606575012207, "learning_rate": 1.4501392485464406e-07, "loss": 0.1998, "num_input_tokens_seen": 1011776, "step": 1485 }, { "epoch": 0.036400947890455136, "grad_norm": 21.415904998779297, "learning_rate": 1.4550251624566374e-07, "loss": 0.1194, "num_input_tokens_seen": 1014976, "step": 1490 }, { "epoch": 0.0365230987223023, "grad_norm": 95.43403625488281, "learning_rate": 1.4599110763668342e-07, "loss": 0.1989, "num_input_tokens_seen": 1018496, "step": 1495 }, { "epoch": 0.03664524955414947, "grad_norm": 77.62108612060547, "learning_rate": 1.4647969902770313e-07, "loss": 0.1353, "num_input_tokens_seen": 1021696, "step": 1500 }, { "epoch": 0.03676740038599663, "grad_norm": 140.04039001464844, "learning_rate": 1.4696829041872284e-07, "loss": 0.2075, "num_input_tokens_seen": 1025344, "step": 1505 }, { "epoch": 0.03688955121784379, "grad_norm": 73.93856048583984, "learning_rate": 1.474568818097425e-07, "loss": 0.1738, "num_input_tokens_seen": 1028672, "step": 1510 }, { "epoch": 0.03701170204969096, "grad_norm": 157.8907928466797, "learning_rate": 1.479454732007622e-07, "loss": 0.1116, "num_input_tokens_seen": 1031936, "step": 1515 }, { "epoch": 0.03713385288153812, "grad_norm": 80.8382568359375, "learning_rate": 1.484340645917819e-07, "loss": 0.1733, "num_input_tokens_seen": 1035200, "step": 1520 }, { "epoch": 0.03725600371338529, "grad_norm": 90.218505859375, "learning_rate": 1.4892265598280156e-07, "loss": 0.1415, "num_input_tokens_seen": 1038656, "step": 1525 }, { "epoch": 0.03737815454523245, "grad_norm": 52.294525146484375, "learning_rate": 1.4941124737382127e-07, "loss": 0.138, "num_input_tokens_seen": 1041984, "step": 1530 }, { "epoch": 0.037500305377079615, "grad_norm": 103.6115951538086, "learning_rate": 1.4989983876484098e-07, "loss": 0.1636, "num_input_tokens_seen": 1045184, "step": 1535 }, { "epoch": 0.037622456208926784, "grad_norm": 96.00193786621094, "learning_rate": 1.5038843015586063e-07, "loss": 0.1178, "num_input_tokens_seen": 1048640, "step": 1540 }, { "epoch": 0.037744607040773946, "grad_norm": 29.498638153076172, "learning_rate": 1.5087702154688034e-07, "loss": 0.1089, "num_input_tokens_seen": 1052352, "step": 1545 }, { "epoch": 0.037866757872621115, "grad_norm": 93.26831817626953, "learning_rate": 1.5136561293790005e-07, "loss": 0.1198, "num_input_tokens_seen": 1056192, "step": 1550 }, { "epoch": 0.03798890870446828, "grad_norm": 136.19781494140625, "learning_rate": 1.518542043289197e-07, "loss": 0.2486, "num_input_tokens_seen": 1059392, "step": 1555 }, { "epoch": 0.03811105953631544, "grad_norm": 156.92518615722656, "learning_rate": 1.523427957199394e-07, "loss": 0.1491, "num_input_tokens_seen": 1062400, "step": 1560 }, { "epoch": 0.03823321036816261, "grad_norm": 77.09222412109375, "learning_rate": 1.5283138711095912e-07, "loss": 0.1352, "num_input_tokens_seen": 1065792, "step": 1565 }, { "epoch": 0.03835536120000977, "grad_norm": 208.2629852294922, "learning_rate": 1.5331997850197878e-07, "loss": 0.2562, "num_input_tokens_seen": 1068864, "step": 1570 }, { "epoch": 0.03847751203185694, "grad_norm": 46.75325012207031, "learning_rate": 1.5380856989299848e-07, "loss": 0.1579, "num_input_tokens_seen": 1071872, "step": 1575 }, { "epoch": 0.0385996628637041, "grad_norm": 71.7177734375, "learning_rate": 1.542971612840182e-07, "loss": 0.1577, "num_input_tokens_seen": 1074880, "step": 1580 }, { "epoch": 0.03872181369555127, "grad_norm": 79.63215637207031, "learning_rate": 1.5478575267503785e-07, "loss": 0.1595, "num_input_tokens_seen": 1079168, "step": 1585 }, { "epoch": 0.03884396452739843, "grad_norm": 233.81239318847656, "learning_rate": 1.5527434406605755e-07, "loss": 0.2213, "num_input_tokens_seen": 1082496, "step": 1590 }, { "epoch": 0.038966115359245594, "grad_norm": 154.5967559814453, "learning_rate": 1.5576293545707726e-07, "loss": 0.1883, "num_input_tokens_seen": 1085824, "step": 1595 }, { "epoch": 0.03908826619109276, "grad_norm": 81.72132873535156, "learning_rate": 1.5625152684809692e-07, "loss": 0.1329, "num_input_tokens_seen": 1089920, "step": 1600 }, { "epoch": 0.039210417022939925, "grad_norm": 64.67450714111328, "learning_rate": 1.5674011823911662e-07, "loss": 0.1146, "num_input_tokens_seen": 1093760, "step": 1605 }, { "epoch": 0.039332567854787094, "grad_norm": 65.7629165649414, "learning_rate": 1.5722870963013633e-07, "loss": 0.1238, "num_input_tokens_seen": 1097216, "step": 1610 }, { "epoch": 0.039454718686634256, "grad_norm": 99.66740417480469, "learning_rate": 1.5771730102115599e-07, "loss": 0.1502, "num_input_tokens_seen": 1100608, "step": 1615 }, { "epoch": 0.03957686951848142, "grad_norm": 82.75312805175781, "learning_rate": 1.582058924121757e-07, "loss": 0.1395, "num_input_tokens_seen": 1104064, "step": 1620 }, { "epoch": 0.03969902035032859, "grad_norm": 49.08173370361328, "learning_rate": 1.586944838031954e-07, "loss": 0.1604, "num_input_tokens_seen": 1107584, "step": 1625 }, { "epoch": 0.03982117118217575, "grad_norm": 43.87813949584961, "learning_rate": 1.5918307519421506e-07, "loss": 0.0803, "num_input_tokens_seen": 1110656, "step": 1630 }, { "epoch": 0.03994332201402292, "grad_norm": 122.28240203857422, "learning_rate": 1.5967166658523476e-07, "loss": 0.158, "num_input_tokens_seen": 1113792, "step": 1635 }, { "epoch": 0.04006547284587008, "grad_norm": 71.87992858886719, "learning_rate": 1.6016025797625445e-07, "loss": 0.1535, "num_input_tokens_seen": 1117440, "step": 1640 }, { "epoch": 0.04018762367771724, "grad_norm": 49.055503845214844, "learning_rate": 1.6064884936727413e-07, "loss": 0.1075, "num_input_tokens_seen": 1120960, "step": 1645 }, { "epoch": 0.04030977450956441, "grad_norm": 38.739166259765625, "learning_rate": 1.6113744075829384e-07, "loss": 0.063, "num_input_tokens_seen": 1124032, "step": 1650 }, { "epoch": 0.04043192534141157, "grad_norm": 29.723430633544922, "learning_rate": 1.6162603214931352e-07, "loss": 0.0726, "num_input_tokens_seen": 1127424, "step": 1655 }, { "epoch": 0.04055407617325874, "grad_norm": 115.54558563232422, "learning_rate": 1.621146235403332e-07, "loss": 0.2601, "num_input_tokens_seen": 1130624, "step": 1660 }, { "epoch": 0.040676227005105904, "grad_norm": 70.30089569091797, "learning_rate": 1.626032149313529e-07, "loss": 0.2492, "num_input_tokens_seen": 1134272, "step": 1665 }, { "epoch": 0.04079837783695307, "grad_norm": 72.49864196777344, "learning_rate": 1.630918063223726e-07, "loss": 0.1489, "num_input_tokens_seen": 1137792, "step": 1670 }, { "epoch": 0.040920528668800235, "grad_norm": 59.99082565307617, "learning_rate": 1.6358039771339227e-07, "loss": 0.1915, "num_input_tokens_seen": 1141056, "step": 1675 }, { "epoch": 0.0410426795006474, "grad_norm": 129.36558532714844, "learning_rate": 1.6406898910441198e-07, "loss": 0.2433, "num_input_tokens_seen": 1144576, "step": 1680 }, { "epoch": 0.041164830332494566, "grad_norm": 44.43220901489258, "learning_rate": 1.6455758049543166e-07, "loss": 0.1577, "num_input_tokens_seen": 1147840, "step": 1685 }, { "epoch": 0.04128698116434173, "grad_norm": 104.24834442138672, "learning_rate": 1.6504617188645134e-07, "loss": 0.105, "num_input_tokens_seen": 1151552, "step": 1690 }, { "epoch": 0.0414091319961889, "grad_norm": 87.4388198852539, "learning_rate": 1.6553476327747105e-07, "loss": 0.1276, "num_input_tokens_seen": 1154560, "step": 1695 }, { "epoch": 0.04153128282803606, "grad_norm": 81.07981872558594, "learning_rate": 1.6602335466849073e-07, "loss": 0.1723, "num_input_tokens_seen": 1157760, "step": 1700 }, { "epoch": 0.04165343365988322, "grad_norm": 237.32518005371094, "learning_rate": 1.665119460595104e-07, "loss": 0.2233, "num_input_tokens_seen": 1161472, "step": 1705 }, { "epoch": 0.04177558449173039, "grad_norm": 151.68650817871094, "learning_rate": 1.6700053745053012e-07, "loss": 0.1753, "num_input_tokens_seen": 1164480, "step": 1710 }, { "epoch": 0.04189773532357755, "grad_norm": 131.32754516601562, "learning_rate": 1.674891288415498e-07, "loss": 0.1315, "num_input_tokens_seen": 1168384, "step": 1715 }, { "epoch": 0.04201988615542472, "grad_norm": 108.83721160888672, "learning_rate": 1.6797772023256948e-07, "loss": 0.158, "num_input_tokens_seen": 1171648, "step": 1720 }, { "epoch": 0.04214203698727188, "grad_norm": 85.09517669677734, "learning_rate": 1.684663116235892e-07, "loss": 0.1922, "num_input_tokens_seen": 1174976, "step": 1725 }, { "epoch": 0.042264187819119045, "grad_norm": 47.35398483276367, "learning_rate": 1.6895490301460887e-07, "loss": 0.2365, "num_input_tokens_seen": 1178176, "step": 1730 }, { "epoch": 0.042386338650966214, "grad_norm": 70.96749114990234, "learning_rate": 1.6944349440562858e-07, "loss": 0.114, "num_input_tokens_seen": 1181696, "step": 1735 }, { "epoch": 0.042508489482813376, "grad_norm": 106.02008056640625, "learning_rate": 1.6993208579664826e-07, "loss": 0.16, "num_input_tokens_seen": 1185216, "step": 1740 }, { "epoch": 0.042630640314660545, "grad_norm": 127.25152587890625, "learning_rate": 1.7042067718766794e-07, "loss": 0.1776, "num_input_tokens_seen": 1188672, "step": 1745 }, { "epoch": 0.04275279114650771, "grad_norm": 33.11648178100586, "learning_rate": 1.7090926857868765e-07, "loss": 0.1947, "num_input_tokens_seen": 1192512, "step": 1750 }, { "epoch": 0.042874941978354876, "grad_norm": 75.8089599609375, "learning_rate": 1.7139785996970733e-07, "loss": 0.1722, "num_input_tokens_seen": 1195520, "step": 1755 }, { "epoch": 0.04299709281020204, "grad_norm": 75.60196685791016, "learning_rate": 1.71886451360727e-07, "loss": 0.1517, "num_input_tokens_seen": 1199040, "step": 1760 }, { "epoch": 0.0431192436420492, "grad_norm": 40.18544387817383, "learning_rate": 1.7237504275174672e-07, "loss": 0.1429, "num_input_tokens_seen": 1202240, "step": 1765 }, { "epoch": 0.04324139447389637, "grad_norm": 61.27647018432617, "learning_rate": 1.728636341427664e-07, "loss": 0.1743, "num_input_tokens_seen": 1205504, "step": 1770 }, { "epoch": 0.04336354530574353, "grad_norm": 145.07838439941406, "learning_rate": 1.7335222553378608e-07, "loss": 0.1751, "num_input_tokens_seen": 1208896, "step": 1775 }, { "epoch": 0.0434856961375907, "grad_norm": 23.42119789123535, "learning_rate": 1.738408169248058e-07, "loss": 0.1709, "num_input_tokens_seen": 1212864, "step": 1780 }, { "epoch": 0.04360784696943786, "grad_norm": 24.543663024902344, "learning_rate": 1.7432940831582547e-07, "loss": 0.0921, "num_input_tokens_seen": 1216192, "step": 1785 }, { "epoch": 0.043729997801285024, "grad_norm": 79.07312774658203, "learning_rate": 1.7481799970684515e-07, "loss": 0.2697, "num_input_tokens_seen": 1219264, "step": 1790 }, { "epoch": 0.04385214863313219, "grad_norm": 27.745731353759766, "learning_rate": 1.7530659109786486e-07, "loss": 0.1171, "num_input_tokens_seen": 1222400, "step": 1795 }, { "epoch": 0.043974299464979355, "grad_norm": 96.10833740234375, "learning_rate": 1.7579518248888454e-07, "loss": 0.1516, "num_input_tokens_seen": 1225472, "step": 1800 }, { "epoch": 0.044096450296826524, "grad_norm": 31.986623764038086, "learning_rate": 1.7628377387990422e-07, "loss": 0.2495, "num_input_tokens_seen": 1228736, "step": 1805 }, { "epoch": 0.044218601128673686, "grad_norm": 23.220766067504883, "learning_rate": 1.7677236527092393e-07, "loss": 0.098, "num_input_tokens_seen": 1232960, "step": 1810 }, { "epoch": 0.04434075196052085, "grad_norm": 26.95801544189453, "learning_rate": 1.772609566619436e-07, "loss": 0.1844, "num_input_tokens_seen": 1236480, "step": 1815 }, { "epoch": 0.04446290279236802, "grad_norm": 69.52427673339844, "learning_rate": 1.777495480529633e-07, "loss": 0.0747, "num_input_tokens_seen": 1240448, "step": 1820 }, { "epoch": 0.04458505362421518, "grad_norm": 56.518741607666016, "learning_rate": 1.78238139443983e-07, "loss": 0.1022, "num_input_tokens_seen": 1243648, "step": 1825 }, { "epoch": 0.04470720445606235, "grad_norm": 31.176511764526367, "learning_rate": 1.7872673083500268e-07, "loss": 0.1376, "num_input_tokens_seen": 1247040, "step": 1830 }, { "epoch": 0.04482935528790951, "grad_norm": 76.87928771972656, "learning_rate": 1.7921532222602236e-07, "loss": 0.2258, "num_input_tokens_seen": 1250816, "step": 1835 }, { "epoch": 0.04495150611975668, "grad_norm": 54.71757888793945, "learning_rate": 1.7970391361704207e-07, "loss": 0.1251, "num_input_tokens_seen": 1254656, "step": 1840 }, { "epoch": 0.04507365695160384, "grad_norm": 24.595806121826172, "learning_rate": 1.8019250500806175e-07, "loss": 0.1035, "num_input_tokens_seen": 1258176, "step": 1845 }, { "epoch": 0.045195807783451, "grad_norm": 98.47230529785156, "learning_rate": 1.8068109639908143e-07, "loss": 0.1309, "num_input_tokens_seen": 1261248, "step": 1850 }, { "epoch": 0.04531795861529817, "grad_norm": 139.2314453125, "learning_rate": 1.8116968779010114e-07, "loss": 0.2031, "num_input_tokens_seen": 1264576, "step": 1855 }, { "epoch": 0.045440109447145334, "grad_norm": 67.46702575683594, "learning_rate": 1.8165827918112082e-07, "loss": 0.1752, "num_input_tokens_seen": 1267776, "step": 1860 }, { "epoch": 0.0455622602789925, "grad_norm": 78.09505462646484, "learning_rate": 1.821468705721405e-07, "loss": 0.1825, "num_input_tokens_seen": 1271040, "step": 1865 }, { "epoch": 0.045684411110839665, "grad_norm": 28.8896427154541, "learning_rate": 1.826354619631602e-07, "loss": 0.099, "num_input_tokens_seen": 1274496, "step": 1870 }, { "epoch": 0.04580656194268683, "grad_norm": 71.47722625732422, "learning_rate": 1.831240533541799e-07, "loss": 0.2014, "num_input_tokens_seen": 1277824, "step": 1875 }, { "epoch": 0.045928712774533996, "grad_norm": 23.266725540161133, "learning_rate": 1.8361264474519957e-07, "loss": 0.1235, "num_input_tokens_seen": 1280832, "step": 1880 }, { "epoch": 0.04605086360638116, "grad_norm": 111.74454498291016, "learning_rate": 1.8410123613621928e-07, "loss": 0.122, "num_input_tokens_seen": 1284160, "step": 1885 }, { "epoch": 0.04617301443822833, "grad_norm": 34.12566375732422, "learning_rate": 1.8458982752723896e-07, "loss": 0.1444, "num_input_tokens_seen": 1287168, "step": 1890 }, { "epoch": 0.04629516527007549, "grad_norm": 48.650390625, "learning_rate": 1.8507841891825864e-07, "loss": 0.093, "num_input_tokens_seen": 1290880, "step": 1895 }, { "epoch": 0.04641731610192265, "grad_norm": 55.02333450317383, "learning_rate": 1.8556701030927835e-07, "loss": 0.1654, "num_input_tokens_seen": 1294656, "step": 1900 }, { "epoch": 0.04653946693376982, "grad_norm": 94.02783203125, "learning_rate": 1.8605560170029803e-07, "loss": 0.1705, "num_input_tokens_seen": 1297728, "step": 1905 }, { "epoch": 0.04666161776561698, "grad_norm": 78.1871566772461, "learning_rate": 1.8654419309131771e-07, "loss": 0.1407, "num_input_tokens_seen": 1300928, "step": 1910 }, { "epoch": 0.04678376859746415, "grad_norm": 117.9102554321289, "learning_rate": 1.8703278448233742e-07, "loss": 0.2134, "num_input_tokens_seen": 1303808, "step": 1915 }, { "epoch": 0.04690591942931131, "grad_norm": 41.647972106933594, "learning_rate": 1.875213758733571e-07, "loss": 0.1497, "num_input_tokens_seen": 1307648, "step": 1920 }, { "epoch": 0.04702807026115848, "grad_norm": 55.67569351196289, "learning_rate": 1.8800996726437678e-07, "loss": 0.1378, "num_input_tokens_seen": 1310848, "step": 1925 }, { "epoch": 0.047150221093005644, "grad_norm": 48.958621978759766, "learning_rate": 1.884985586553965e-07, "loss": 0.2215, "num_input_tokens_seen": 1314432, "step": 1930 }, { "epoch": 0.047272371924852806, "grad_norm": 41.2342643737793, "learning_rate": 1.8898715004641617e-07, "loss": 0.2139, "num_input_tokens_seen": 1318080, "step": 1935 }, { "epoch": 0.047394522756699975, "grad_norm": 92.6572036743164, "learning_rate": 1.8947574143743585e-07, "loss": 0.2444, "num_input_tokens_seen": 1321024, "step": 1940 }, { "epoch": 0.04751667358854714, "grad_norm": 26.353044509887695, "learning_rate": 1.8996433282845556e-07, "loss": 0.1379, "num_input_tokens_seen": 1324736, "step": 1945 }, { "epoch": 0.047638824420394306, "grad_norm": 75.55083465576172, "learning_rate": 1.9045292421947524e-07, "loss": 0.1321, "num_input_tokens_seen": 1328000, "step": 1950 }, { "epoch": 0.04776097525224147, "grad_norm": 33.58150100708008, "learning_rate": 1.9094151561049492e-07, "loss": 0.124, "num_input_tokens_seen": 1331008, "step": 1955 }, { "epoch": 0.04788312608408863, "grad_norm": 79.53211212158203, "learning_rate": 1.9143010700151463e-07, "loss": 0.1557, "num_input_tokens_seen": 1334208, "step": 1960 }, { "epoch": 0.0480052769159358, "grad_norm": 56.7999153137207, "learning_rate": 1.9191869839253434e-07, "loss": 0.1513, "num_input_tokens_seen": 1337472, "step": 1965 }, { "epoch": 0.04812742774778296, "grad_norm": 48.73048400878906, "learning_rate": 1.92407289783554e-07, "loss": 0.2155, "num_input_tokens_seen": 1340928, "step": 1970 }, { "epoch": 0.04824957857963013, "grad_norm": 34.07157516479492, "learning_rate": 1.928958811745737e-07, "loss": 0.1602, "num_input_tokens_seen": 1344128, "step": 1975 }, { "epoch": 0.04837172941147729, "grad_norm": 69.41503143310547, "learning_rate": 1.933844725655934e-07, "loss": 0.0793, "num_input_tokens_seen": 1347904, "step": 1980 }, { "epoch": 0.048493880243324454, "grad_norm": 82.7708740234375, "learning_rate": 1.9387306395661307e-07, "loss": 0.1941, "num_input_tokens_seen": 1351552, "step": 1985 }, { "epoch": 0.04861603107517162, "grad_norm": 70.6302719116211, "learning_rate": 1.9436165534763277e-07, "loss": 0.1792, "num_input_tokens_seen": 1354816, "step": 1990 }, { "epoch": 0.048738181907018785, "grad_norm": 80.82976531982422, "learning_rate": 1.9485024673865248e-07, "loss": 0.2255, "num_input_tokens_seen": 1357824, "step": 1995 }, { "epoch": 0.048860332738865954, "grad_norm": 113.82199096679688, "learning_rate": 1.9533883812967214e-07, "loss": 0.1439, "num_input_tokens_seen": 1360896, "step": 2000 }, { "epoch": 0.048982483570713116, "grad_norm": 55.600555419921875, "learning_rate": 1.9582742952069184e-07, "loss": 0.1135, "num_input_tokens_seen": 1364288, "step": 2005 }, { "epoch": 0.049104634402560285, "grad_norm": 101.28015899658203, "learning_rate": 1.9631602091171155e-07, "loss": 0.1285, "num_input_tokens_seen": 1367424, "step": 2010 }, { "epoch": 0.04922678523440745, "grad_norm": 68.34232330322266, "learning_rate": 1.968046123027312e-07, "loss": 0.149, "num_input_tokens_seen": 1370880, "step": 2015 }, { "epoch": 0.04934893606625461, "grad_norm": 45.278873443603516, "learning_rate": 1.9729320369375091e-07, "loss": 0.0844, "num_input_tokens_seen": 1374272, "step": 2020 }, { "epoch": 0.04947108689810178, "grad_norm": 49.61530303955078, "learning_rate": 1.977817950847706e-07, "loss": 0.1647, "num_input_tokens_seen": 1377600, "step": 2025 }, { "epoch": 0.04959323772994894, "grad_norm": 28.334949493408203, "learning_rate": 1.9827038647579028e-07, "loss": 0.1265, "num_input_tokens_seen": 1380544, "step": 2030 }, { "epoch": 0.04971538856179611, "grad_norm": 98.55791473388672, "learning_rate": 1.9875897786680998e-07, "loss": 0.1444, "num_input_tokens_seen": 1384064, "step": 2035 }, { "epoch": 0.04983753939364327, "grad_norm": 121.94667053222656, "learning_rate": 1.9924756925782967e-07, "loss": 0.1977, "num_input_tokens_seen": 1387264, "step": 2040 }, { "epoch": 0.04995969022549043, "grad_norm": 129.62332153320312, "learning_rate": 1.9973616064884935e-07, "loss": 0.2471, "num_input_tokens_seen": 1390464, "step": 2045 }, { "epoch": 0.0500818410573376, "grad_norm": 50.89700698852539, "learning_rate": 2.0022475203986905e-07, "loss": 0.1957, "num_input_tokens_seen": 1394368, "step": 2050 }, { "epoch": 0.050203991889184764, "grad_norm": 37.54369354248047, "learning_rate": 2.0071334343088874e-07, "loss": 0.2043, "num_input_tokens_seen": 1397824, "step": 2055 }, { "epoch": 0.05032614272103193, "grad_norm": 22.28923988342285, "learning_rate": 2.0120193482190842e-07, "loss": 0.0892, "num_input_tokens_seen": 1401664, "step": 2060 }, { "epoch": 0.050448293552879095, "grad_norm": 44.674705505371094, "learning_rate": 2.0169052621292813e-07, "loss": 0.1588, "num_input_tokens_seen": 1405504, "step": 2065 }, { "epoch": 0.05057044438472626, "grad_norm": 61.58700180053711, "learning_rate": 2.021791176039478e-07, "loss": 0.131, "num_input_tokens_seen": 1408768, "step": 2070 }, { "epoch": 0.050692595216573426, "grad_norm": 20.806472778320312, "learning_rate": 2.026677089949675e-07, "loss": 0.1556, "num_input_tokens_seen": 1411968, "step": 2075 }, { "epoch": 0.05081474604842059, "grad_norm": 24.93768310546875, "learning_rate": 2.031563003859872e-07, "loss": 0.1472, "num_input_tokens_seen": 1415168, "step": 2080 }, { "epoch": 0.05093689688026776, "grad_norm": 45.42394256591797, "learning_rate": 2.0364489177700688e-07, "loss": 0.149, "num_input_tokens_seen": 1418688, "step": 2085 }, { "epoch": 0.05105904771211492, "grad_norm": 54.84282302856445, "learning_rate": 2.0413348316802656e-07, "loss": 0.1877, "num_input_tokens_seen": 1422144, "step": 2090 }, { "epoch": 0.05118119854396209, "grad_norm": 49.84492492675781, "learning_rate": 2.0462207455904627e-07, "loss": 0.1552, "num_input_tokens_seen": 1425792, "step": 2095 }, { "epoch": 0.05130334937580925, "grad_norm": 79.23111724853516, "learning_rate": 2.0511066595006595e-07, "loss": 0.181, "num_input_tokens_seen": 1429184, "step": 2100 }, { "epoch": 0.05142550020765641, "grad_norm": 77.6365737915039, "learning_rate": 2.0559925734108563e-07, "loss": 0.2218, "num_input_tokens_seen": 1432192, "step": 2105 }, { "epoch": 0.05154765103950358, "grad_norm": 34.6860237121582, "learning_rate": 2.0608784873210534e-07, "loss": 0.0809, "num_input_tokens_seen": 1435520, "step": 2110 }, { "epoch": 0.05166980187135074, "grad_norm": 57.39856719970703, "learning_rate": 2.0657644012312502e-07, "loss": 0.1314, "num_input_tokens_seen": 1438848, "step": 2115 }, { "epoch": 0.05179195270319791, "grad_norm": 76.45271301269531, "learning_rate": 2.070650315141447e-07, "loss": 0.1002, "num_input_tokens_seen": 1442432, "step": 2120 }, { "epoch": 0.051914103535045074, "grad_norm": 64.7785873413086, "learning_rate": 2.075536229051644e-07, "loss": 0.1973, "num_input_tokens_seen": 1445568, "step": 2125 }, { "epoch": 0.052036254366892236, "grad_norm": 129.55709838867188, "learning_rate": 2.080422142961841e-07, "loss": 0.223, "num_input_tokens_seen": 1449152, "step": 2130 }, { "epoch": 0.052158405198739405, "grad_norm": 67.76712799072266, "learning_rate": 2.0853080568720377e-07, "loss": 0.1381, "num_input_tokens_seen": 1452224, "step": 2135 }, { "epoch": 0.05228055603058657, "grad_norm": 65.36962890625, "learning_rate": 2.0901939707822348e-07, "loss": 0.1132, "num_input_tokens_seen": 1456064, "step": 2140 }, { "epoch": 0.052402706862433736, "grad_norm": 21.638248443603516, "learning_rate": 2.0950798846924316e-07, "loss": 0.1221, "num_input_tokens_seen": 1459648, "step": 2145 }, { "epoch": 0.0525248576942809, "grad_norm": 41.81351852416992, "learning_rate": 2.0999657986026284e-07, "loss": 0.1435, "num_input_tokens_seen": 1463296, "step": 2150 }, { "epoch": 0.05264700852612806, "grad_norm": 49.325618743896484, "learning_rate": 2.1048517125128255e-07, "loss": 0.0786, "num_input_tokens_seen": 1466752, "step": 2155 }, { "epoch": 0.05276915935797523, "grad_norm": 70.45987701416016, "learning_rate": 2.1097376264230223e-07, "loss": 0.2243, "num_input_tokens_seen": 1469760, "step": 2160 }, { "epoch": 0.05289131018982239, "grad_norm": 16.85369300842285, "learning_rate": 2.114623540333219e-07, "loss": 0.2312, "num_input_tokens_seen": 1473408, "step": 2165 }, { "epoch": 0.05301346102166956, "grad_norm": 48.095367431640625, "learning_rate": 2.1195094542434162e-07, "loss": 0.1557, "num_input_tokens_seen": 1476736, "step": 2170 }, { "epoch": 0.05313561185351672, "grad_norm": 122.31871032714844, "learning_rate": 2.124395368153613e-07, "loss": 0.1821, "num_input_tokens_seen": 1479936, "step": 2175 }, { "epoch": 0.053257762685363884, "grad_norm": 94.89913177490234, "learning_rate": 2.1292812820638098e-07, "loss": 0.1161, "num_input_tokens_seen": 1483328, "step": 2180 }, { "epoch": 0.05337991351721105, "grad_norm": 68.65199279785156, "learning_rate": 2.134167195974007e-07, "loss": 0.1854, "num_input_tokens_seen": 1486592, "step": 2185 }, { "epoch": 0.053502064349058215, "grad_norm": 26.472394943237305, "learning_rate": 2.1390531098842037e-07, "loss": 0.0968, "num_input_tokens_seen": 1489792, "step": 2190 }, { "epoch": 0.053624215180905384, "grad_norm": 25.325607299804688, "learning_rate": 2.1439390237944008e-07, "loss": 0.1555, "num_input_tokens_seen": 1492928, "step": 2195 }, { "epoch": 0.053746366012752546, "grad_norm": 97.38853454589844, "learning_rate": 2.1488249377045976e-07, "loss": 0.1967, "num_input_tokens_seen": 1496512, "step": 2200 }, { "epoch": 0.053868516844599715, "grad_norm": 23.16677474975586, "learning_rate": 2.1537108516147944e-07, "loss": 0.1324, "num_input_tokens_seen": 1499840, "step": 2205 }, { "epoch": 0.05399066767644688, "grad_norm": 21.841686248779297, "learning_rate": 2.1585967655249915e-07, "loss": 0.1308, "num_input_tokens_seen": 1502976, "step": 2210 }, { "epoch": 0.05411281850829404, "grad_norm": 23.00255584716797, "learning_rate": 2.1634826794351883e-07, "loss": 0.24, "num_input_tokens_seen": 1506432, "step": 2215 }, { "epoch": 0.05423496934014121, "grad_norm": 48.22821044921875, "learning_rate": 2.168368593345385e-07, "loss": 0.2346, "num_input_tokens_seen": 1509888, "step": 2220 }, { "epoch": 0.05435712017198837, "grad_norm": 38.02397155761719, "learning_rate": 2.1732545072555822e-07, "loss": 0.2164, "num_input_tokens_seen": 1513024, "step": 2225 }, { "epoch": 0.05447927100383554, "grad_norm": 52.66263961791992, "learning_rate": 2.178140421165779e-07, "loss": 0.2196, "num_input_tokens_seen": 1516544, "step": 2230 }, { "epoch": 0.0546014218356827, "grad_norm": 28.82169532775879, "learning_rate": 2.1830263350759758e-07, "loss": 0.1195, "num_input_tokens_seen": 1520576, "step": 2235 }, { "epoch": 0.05472357266752986, "grad_norm": 52.429527282714844, "learning_rate": 2.187912248986173e-07, "loss": 0.1513, "num_input_tokens_seen": 1523712, "step": 2240 }, { "epoch": 0.05484572349937703, "grad_norm": 13.38976764678955, "learning_rate": 2.1927981628963697e-07, "loss": 0.0863, "num_input_tokens_seen": 1527808, "step": 2245 }, { "epoch": 0.054967874331224194, "grad_norm": 44.932308197021484, "learning_rate": 2.1976840768065665e-07, "loss": 0.232, "num_input_tokens_seen": 1531264, "step": 2250 }, { "epoch": 0.05509002516307136, "grad_norm": 30.4183292388916, "learning_rate": 2.2025699907167636e-07, "loss": 0.1553, "num_input_tokens_seen": 1534912, "step": 2255 }, { "epoch": 0.055212175994918525, "grad_norm": 44.02510452270508, "learning_rate": 2.2074559046269604e-07, "loss": 0.14, "num_input_tokens_seen": 1538176, "step": 2260 }, { "epoch": 0.05533432682676569, "grad_norm": 75.7412109375, "learning_rate": 2.2123418185371572e-07, "loss": 0.1312, "num_input_tokens_seen": 1541760, "step": 2265 }, { "epoch": 0.055456477658612856, "grad_norm": 100.07758331298828, "learning_rate": 2.2172277324473543e-07, "loss": 0.2186, "num_input_tokens_seen": 1545152, "step": 2270 }, { "epoch": 0.05557862849046002, "grad_norm": 50.55472183227539, "learning_rate": 2.222113646357551e-07, "loss": 0.1079, "num_input_tokens_seen": 1548096, "step": 2275 }, { "epoch": 0.05570077932230719, "grad_norm": 33.335689544677734, "learning_rate": 2.226999560267748e-07, "loss": 0.0889, "num_input_tokens_seen": 1551168, "step": 2280 }, { "epoch": 0.05582293015415435, "grad_norm": 134.88308715820312, "learning_rate": 2.231885474177945e-07, "loss": 0.2402, "num_input_tokens_seen": 1554432, "step": 2285 }, { "epoch": 0.05594508098600152, "grad_norm": 53.62267303466797, "learning_rate": 2.2367713880881418e-07, "loss": 0.1479, "num_input_tokens_seen": 1557568, "step": 2290 }, { "epoch": 0.05606723181784868, "grad_norm": 10.846335411071777, "learning_rate": 2.2416573019983386e-07, "loss": 0.1292, "num_input_tokens_seen": 1561152, "step": 2295 }, { "epoch": 0.05618938264969584, "grad_norm": 56.55923843383789, "learning_rate": 2.2465432159085357e-07, "loss": 0.1956, "num_input_tokens_seen": 1564160, "step": 2300 }, { "epoch": 0.05631153348154301, "grad_norm": 41.92458724975586, "learning_rate": 2.2514291298187325e-07, "loss": 0.101, "num_input_tokens_seen": 1567616, "step": 2305 }, { "epoch": 0.05643368431339017, "grad_norm": 21.67095375061035, "learning_rate": 2.2563150437289293e-07, "loss": 0.1142, "num_input_tokens_seen": 1570944, "step": 2310 }, { "epoch": 0.05655583514523734, "grad_norm": 56.14408493041992, "learning_rate": 2.2612009576391264e-07, "loss": 0.1082, "num_input_tokens_seen": 1574208, "step": 2315 }, { "epoch": 0.056677985977084504, "grad_norm": 61.64763259887695, "learning_rate": 2.2660868715493232e-07, "loss": 0.1785, "num_input_tokens_seen": 1577664, "step": 2320 }, { "epoch": 0.056800136808931666, "grad_norm": 33.018280029296875, "learning_rate": 2.27097278545952e-07, "loss": 0.1554, "num_input_tokens_seen": 1580992, "step": 2325 }, { "epoch": 0.056922287640778835, "grad_norm": 33.67935562133789, "learning_rate": 2.275858699369717e-07, "loss": 0.1593, "num_input_tokens_seen": 1583808, "step": 2330 }, { "epoch": 0.057044438472626, "grad_norm": 111.91667175292969, "learning_rate": 2.280744613279914e-07, "loss": 0.1516, "num_input_tokens_seen": 1587136, "step": 2335 }, { "epoch": 0.057166589304473166, "grad_norm": 33.9388542175293, "learning_rate": 2.2856305271901107e-07, "loss": 0.0974, "num_input_tokens_seen": 1590592, "step": 2340 }, { "epoch": 0.05728874013632033, "grad_norm": 31.756982803344727, "learning_rate": 2.2905164411003078e-07, "loss": 0.1008, "num_input_tokens_seen": 1593984, "step": 2345 }, { "epoch": 0.05741089096816749, "grad_norm": 16.234106063842773, "learning_rate": 2.2954023550105044e-07, "loss": 0.1619, "num_input_tokens_seen": 1597248, "step": 2350 }, { "epoch": 0.05753304180001466, "grad_norm": 57.85862731933594, "learning_rate": 2.3002882689207014e-07, "loss": 0.0828, "num_input_tokens_seen": 1600512, "step": 2355 }, { "epoch": 0.05765519263186182, "grad_norm": 96.33238220214844, "learning_rate": 2.3051741828308985e-07, "loss": 0.229, "num_input_tokens_seen": 1603520, "step": 2360 }, { "epoch": 0.05777734346370899, "grad_norm": 133.5282440185547, "learning_rate": 2.310060096741095e-07, "loss": 0.2023, "num_input_tokens_seen": 1606848, "step": 2365 }, { "epoch": 0.05789949429555615, "grad_norm": 25.754037857055664, "learning_rate": 2.3149460106512921e-07, "loss": 0.1168, "num_input_tokens_seen": 1610688, "step": 2370 }, { "epoch": 0.05802164512740332, "grad_norm": 64.81108856201172, "learning_rate": 2.3198319245614892e-07, "loss": 0.0904, "num_input_tokens_seen": 1614336, "step": 2375 }, { "epoch": 0.05814379595925048, "grad_norm": 21.53687858581543, "learning_rate": 2.3247178384716858e-07, "loss": 0.1832, "num_input_tokens_seen": 1617216, "step": 2380 }, { "epoch": 0.058265946791097645, "grad_norm": 33.07185363769531, "learning_rate": 2.3296037523818829e-07, "loss": 0.157, "num_input_tokens_seen": 1620544, "step": 2385 }, { "epoch": 0.058388097622944814, "grad_norm": 31.172517776489258, "learning_rate": 2.33448966629208e-07, "loss": 0.1079, "num_input_tokens_seen": 1623680, "step": 2390 }, { "epoch": 0.058510248454791976, "grad_norm": 30.034866333007812, "learning_rate": 2.3393755802022765e-07, "loss": 0.1297, "num_input_tokens_seen": 1626816, "step": 2395 }, { "epoch": 0.058632399286639145, "grad_norm": 41.092018127441406, "learning_rate": 2.3442614941124736e-07, "loss": 0.2271, "num_input_tokens_seen": 1630336, "step": 2400 }, { "epoch": 0.05875455011848631, "grad_norm": 17.320775985717773, "learning_rate": 2.3491474080226706e-07, "loss": 0.1235, "num_input_tokens_seen": 1634304, "step": 2405 }, { "epoch": 0.05887670095033347, "grad_norm": 34.5215950012207, "learning_rate": 2.3540333219328672e-07, "loss": 0.1444, "num_input_tokens_seen": 1637760, "step": 2410 }, { "epoch": 0.05899885178218064, "grad_norm": 60.64455795288086, "learning_rate": 2.3589192358430643e-07, "loss": 0.1087, "num_input_tokens_seen": 1640896, "step": 2415 }, { "epoch": 0.0591210026140278, "grad_norm": 44.74037551879883, "learning_rate": 2.3638051497532613e-07, "loss": 0.1224, "num_input_tokens_seen": 1644352, "step": 2420 }, { "epoch": 0.05924315344587497, "grad_norm": 59.06151580810547, "learning_rate": 2.3686910636634582e-07, "loss": 0.2265, "num_input_tokens_seen": 1648320, "step": 2425 }, { "epoch": 0.05936530427772213, "grad_norm": 37.72708511352539, "learning_rate": 2.373576977573655e-07, "loss": 0.2262, "num_input_tokens_seen": 1651712, "step": 2430 }, { "epoch": 0.05948745510956929, "grad_norm": 79.2681884765625, "learning_rate": 2.378462891483852e-07, "loss": 0.1233, "num_input_tokens_seen": 1654848, "step": 2435 }, { "epoch": 0.05960960594141646, "grad_norm": 35.262062072753906, "learning_rate": 2.3833488053940489e-07, "loss": 0.187, "num_input_tokens_seen": 1658240, "step": 2440 }, { "epoch": 0.059731756773263624, "grad_norm": 23.657838821411133, "learning_rate": 2.3882347193042457e-07, "loss": 0.1262, "num_input_tokens_seen": 1661440, "step": 2445 }, { "epoch": 0.05985390760511079, "grad_norm": 30.22248649597168, "learning_rate": 2.3931206332144425e-07, "loss": 0.0881, "num_input_tokens_seen": 1664704, "step": 2450 }, { "epoch": 0.059976058436957955, "grad_norm": 30.023712158203125, "learning_rate": 2.39800654712464e-07, "loss": 0.1146, "num_input_tokens_seen": 1668352, "step": 2455 }, { "epoch": 0.060098209268805124, "grad_norm": 55.32008361816406, "learning_rate": 2.4028924610348366e-07, "loss": 0.198, "num_input_tokens_seen": 1672256, "step": 2460 }, { "epoch": 0.060220360100652286, "grad_norm": 56.359771728515625, "learning_rate": 2.4077783749450335e-07, "loss": 0.2135, "num_input_tokens_seen": 1675008, "step": 2465 }, { "epoch": 0.06034251093249945, "grad_norm": 79.16679382324219, "learning_rate": 2.41266428885523e-07, "loss": 0.1464, "num_input_tokens_seen": 1678528, "step": 2470 }, { "epoch": 0.06046466176434662, "grad_norm": 58.615352630615234, "learning_rate": 2.417550202765427e-07, "loss": 0.1469, "num_input_tokens_seen": 1681536, "step": 2475 }, { "epoch": 0.06058681259619378, "grad_norm": 77.35822296142578, "learning_rate": 2.422436116675624e-07, "loss": 0.2015, "num_input_tokens_seen": 1684544, "step": 2480 }, { "epoch": 0.06070896342804095, "grad_norm": 78.36398315429688, "learning_rate": 2.427322030585821e-07, "loss": 0.2508, "num_input_tokens_seen": 1688000, "step": 2485 }, { "epoch": 0.06083111425988811, "grad_norm": 46.09014129638672, "learning_rate": 2.432207944496018e-07, "loss": 0.1095, "num_input_tokens_seen": 1691264, "step": 2490 }, { "epoch": 0.06095326509173527, "grad_norm": 34.414459228515625, "learning_rate": 2.437093858406215e-07, "loss": 0.0887, "num_input_tokens_seen": 1694656, "step": 2495 }, { "epoch": 0.06107541592358244, "grad_norm": 33.107975006103516, "learning_rate": 2.4419797723164117e-07, "loss": 0.1181, "num_input_tokens_seen": 1698304, "step": 2500 }, { "epoch": 0.0611975667554296, "grad_norm": 75.07171630859375, "learning_rate": 2.4468656862266085e-07, "loss": 0.1827, "num_input_tokens_seen": 1701376, "step": 2505 }, { "epoch": 0.06131971758727677, "grad_norm": 54.65150451660156, "learning_rate": 2.4517516001368053e-07, "loss": 0.1661, "num_input_tokens_seen": 1704320, "step": 2510 }, { "epoch": 0.061441868419123934, "grad_norm": 21.785341262817383, "learning_rate": 2.4566375140470026e-07, "loss": 0.0947, "num_input_tokens_seen": 1707840, "step": 2515 }, { "epoch": 0.061564019250971096, "grad_norm": 25.026840209960938, "learning_rate": 2.4615234279571995e-07, "loss": 0.0512, "num_input_tokens_seen": 1711232, "step": 2520 }, { "epoch": 0.061686170082818265, "grad_norm": 22.339088439941406, "learning_rate": 2.4664093418673963e-07, "loss": 0.1004, "num_input_tokens_seen": 1714944, "step": 2525 }, { "epoch": 0.06180832091466543, "grad_norm": 24.17763328552246, "learning_rate": 2.471295255777593e-07, "loss": 0.0867, "num_input_tokens_seen": 1718336, "step": 2530 }, { "epoch": 0.061930471746512596, "grad_norm": 52.764808654785156, "learning_rate": 2.47618116968779e-07, "loss": 0.1462, "num_input_tokens_seen": 1721536, "step": 2535 }, { "epoch": 0.06205262257835976, "grad_norm": 50.03315353393555, "learning_rate": 2.4810670835979867e-07, "loss": 0.1976, "num_input_tokens_seen": 1724928, "step": 2540 }, { "epoch": 0.06217477341020693, "grad_norm": 42.78987121582031, "learning_rate": 2.485952997508184e-07, "loss": 0.1417, "num_input_tokens_seen": 1728064, "step": 2545 }, { "epoch": 0.06229692424205409, "grad_norm": 85.56526947021484, "learning_rate": 2.4908389114183803e-07, "loss": 0.142, "num_input_tokens_seen": 1731392, "step": 2550 }, { "epoch": 0.06241907507390125, "grad_norm": 34.91035461425781, "learning_rate": 2.4957248253285777e-07, "loss": 0.1403, "num_input_tokens_seen": 1734912, "step": 2555 }, { "epoch": 0.06254122590574841, "grad_norm": 102.80146026611328, "learning_rate": 2.5006107392387745e-07, "loss": 0.138, "num_input_tokens_seen": 1737984, "step": 2560 }, { "epoch": 0.06266337673759559, "grad_norm": 36.7209587097168, "learning_rate": 2.5054966531489713e-07, "loss": 0.2083, "num_input_tokens_seen": 1741504, "step": 2565 }, { "epoch": 0.06278552756944275, "grad_norm": 51.84968185424805, "learning_rate": 2.510382567059168e-07, "loss": 0.1017, "num_input_tokens_seen": 1745152, "step": 2570 }, { "epoch": 0.06290767840128991, "grad_norm": 54.773658752441406, "learning_rate": 2.5152684809693655e-07, "loss": 0.1951, "num_input_tokens_seen": 1748480, "step": 2575 }, { "epoch": 0.06302982923313707, "grad_norm": 35.93278884887695, "learning_rate": 2.5201543948795623e-07, "loss": 0.1033, "num_input_tokens_seen": 1751936, "step": 2580 }, { "epoch": 0.06315198006498424, "grad_norm": 39.635047912597656, "learning_rate": 2.525040308789759e-07, "loss": 0.257, "num_input_tokens_seen": 1755200, "step": 2585 }, { "epoch": 0.06327413089683141, "grad_norm": 27.207603454589844, "learning_rate": 2.529926222699956e-07, "loss": 0.1453, "num_input_tokens_seen": 1758656, "step": 2590 }, { "epoch": 0.06339628172867857, "grad_norm": 33.18776321411133, "learning_rate": 2.5348121366101527e-07, "loss": 0.1116, "num_input_tokens_seen": 1761664, "step": 2595 }, { "epoch": 0.06351843256052574, "grad_norm": 19.57836151123047, "learning_rate": 2.5396980505203495e-07, "loss": 0.0599, "num_input_tokens_seen": 1764992, "step": 2600 }, { "epoch": 0.0636405833923729, "grad_norm": 19.68585205078125, "learning_rate": 2.544583964430547e-07, "loss": 0.0763, "num_input_tokens_seen": 1768512, "step": 2605 }, { "epoch": 0.06376273422422006, "grad_norm": 60.64818572998047, "learning_rate": 2.5494698783407437e-07, "loss": 0.1781, "num_input_tokens_seen": 1771776, "step": 2610 }, { "epoch": 0.06388488505606724, "grad_norm": 51.9759635925293, "learning_rate": 2.5543557922509405e-07, "loss": 0.1103, "num_input_tokens_seen": 1775488, "step": 2615 }, { "epoch": 0.0640070358879144, "grad_norm": 61.86332321166992, "learning_rate": 2.5592417061611373e-07, "loss": 0.1475, "num_input_tokens_seen": 1778880, "step": 2620 }, { "epoch": 0.06412918671976156, "grad_norm": 45.70049285888672, "learning_rate": 2.564127620071334e-07, "loss": 0.1251, "num_input_tokens_seen": 1782848, "step": 2625 }, { "epoch": 0.06425133755160872, "grad_norm": 9.266349792480469, "learning_rate": 2.569013533981531e-07, "loss": 0.1095, "num_input_tokens_seen": 1786240, "step": 2630 }, { "epoch": 0.06437348838345588, "grad_norm": 47.50636672973633, "learning_rate": 2.5738994478917283e-07, "loss": 0.1151, "num_input_tokens_seen": 1789760, "step": 2635 }, { "epoch": 0.06449563921530306, "grad_norm": 36.173431396484375, "learning_rate": 2.578785361801925e-07, "loss": 0.0562, "num_input_tokens_seen": 1793536, "step": 2640 }, { "epoch": 0.06461779004715022, "grad_norm": 48.79774856567383, "learning_rate": 2.583671275712122e-07, "loss": 0.1524, "num_input_tokens_seen": 1796928, "step": 2645 }, { "epoch": 0.06473994087899738, "grad_norm": 75.12857055664062, "learning_rate": 2.5885571896223187e-07, "loss": 0.2607, "num_input_tokens_seen": 1800384, "step": 2650 }, { "epoch": 0.06486209171084455, "grad_norm": 75.68853759765625, "learning_rate": 2.5934431035325155e-07, "loss": 0.1124, "num_input_tokens_seen": 1803648, "step": 2655 }, { "epoch": 0.06498424254269172, "grad_norm": 42.324928283691406, "learning_rate": 2.5983290174427123e-07, "loss": 0.1881, "num_input_tokens_seen": 1806784, "step": 2660 }, { "epoch": 0.06510639337453888, "grad_norm": 92.44508361816406, "learning_rate": 2.6032149313529097e-07, "loss": 0.3077, "num_input_tokens_seen": 1809728, "step": 2665 }, { "epoch": 0.06522854420638605, "grad_norm": 103.27250671386719, "learning_rate": 2.6081008452631065e-07, "loss": 0.1816, "num_input_tokens_seen": 1813056, "step": 2670 }, { "epoch": 0.06535069503823321, "grad_norm": 15.108059883117676, "learning_rate": 2.6129867591733033e-07, "loss": 0.1059, "num_input_tokens_seen": 1816448, "step": 2675 }, { "epoch": 0.06547284587008037, "grad_norm": 14.266429901123047, "learning_rate": 2.6178726730835e-07, "loss": 0.08, "num_input_tokens_seen": 1819712, "step": 2680 }, { "epoch": 0.06559499670192755, "grad_norm": 79.78052520751953, "learning_rate": 2.622758586993697e-07, "loss": 0.2463, "num_input_tokens_seen": 1822784, "step": 2685 }, { "epoch": 0.06571714753377471, "grad_norm": 69.0220718383789, "learning_rate": 2.627644500903894e-07, "loss": 0.1522, "num_input_tokens_seen": 1826048, "step": 2690 }, { "epoch": 0.06583929836562187, "grad_norm": 2.8440940380096436, "learning_rate": 2.632530414814091e-07, "loss": 0.1671, "num_input_tokens_seen": 1829056, "step": 2695 }, { "epoch": 0.06596144919746903, "grad_norm": 19.573835372924805, "learning_rate": 2.637416328724288e-07, "loss": 0.1142, "num_input_tokens_seen": 1832576, "step": 2700 }, { "epoch": 0.0660836000293162, "grad_norm": 50.86659622192383, "learning_rate": 2.6423022426344847e-07, "loss": 0.2161, "num_input_tokens_seen": 1836224, "step": 2705 }, { "epoch": 0.06620575086116337, "grad_norm": 48.93684005737305, "learning_rate": 2.6471881565446815e-07, "loss": 0.0834, "num_input_tokens_seen": 1839424, "step": 2710 }, { "epoch": 0.06632790169301053, "grad_norm": 112.5682144165039, "learning_rate": 2.6520740704548783e-07, "loss": 0.1834, "num_input_tokens_seen": 1842432, "step": 2715 }, { "epoch": 0.0664500525248577, "grad_norm": 100.8270034790039, "learning_rate": 2.656959984365075e-07, "loss": 0.1845, "num_input_tokens_seen": 1845952, "step": 2720 }, { "epoch": 0.06657220335670486, "grad_norm": 3.2667179107666016, "learning_rate": 2.6618458982752725e-07, "loss": 0.1038, "num_input_tokens_seen": 1849600, "step": 2725 }, { "epoch": 0.06669435418855202, "grad_norm": 65.88026428222656, "learning_rate": 2.6667318121854693e-07, "loss": 0.139, "num_input_tokens_seen": 1852992, "step": 2730 }, { "epoch": 0.0668165050203992, "grad_norm": 21.890911102294922, "learning_rate": 2.671617726095666e-07, "loss": 0.1293, "num_input_tokens_seen": 1855872, "step": 2735 }, { "epoch": 0.06693865585224636, "grad_norm": 41.880271911621094, "learning_rate": 2.676503640005863e-07, "loss": 0.1518, "num_input_tokens_seen": 1859200, "step": 2740 }, { "epoch": 0.06706080668409352, "grad_norm": 22.694137573242188, "learning_rate": 2.68138955391606e-07, "loss": 0.1194, "num_input_tokens_seen": 1863168, "step": 2745 }, { "epoch": 0.06718295751594068, "grad_norm": 36.17405319213867, "learning_rate": 2.6862754678262566e-07, "loss": 0.173, "num_input_tokens_seen": 1866112, "step": 2750 }, { "epoch": 0.06730510834778784, "grad_norm": 41.965919494628906, "learning_rate": 2.691161381736454e-07, "loss": 0.1099, "num_input_tokens_seen": 1869376, "step": 2755 }, { "epoch": 0.06742725917963502, "grad_norm": 96.62606811523438, "learning_rate": 2.6960472956466507e-07, "loss": 0.1761, "num_input_tokens_seen": 1873152, "step": 2760 }, { "epoch": 0.06754941001148218, "grad_norm": 42.52037048339844, "learning_rate": 2.7009332095568475e-07, "loss": 0.1735, "num_input_tokens_seen": 1876992, "step": 2765 }, { "epoch": 0.06767156084332934, "grad_norm": 42.74496841430664, "learning_rate": 2.705819123467045e-07, "loss": 0.158, "num_input_tokens_seen": 1880320, "step": 2770 }, { "epoch": 0.0677937116751765, "grad_norm": 24.743160247802734, "learning_rate": 2.710705037377241e-07, "loss": 0.0991, "num_input_tokens_seen": 1883328, "step": 2775 }, { "epoch": 0.06791586250702367, "grad_norm": 37.85517120361328, "learning_rate": 2.715590951287438e-07, "loss": 0.2085, "num_input_tokens_seen": 1886784, "step": 2780 }, { "epoch": 0.06803801333887084, "grad_norm": 17.8824462890625, "learning_rate": 2.7204768651976353e-07, "loss": 0.1438, "num_input_tokens_seen": 1890176, "step": 2785 }, { "epoch": 0.068160164170718, "grad_norm": 13.706899642944336, "learning_rate": 2.725362779107832e-07, "loss": 0.0999, "num_input_tokens_seen": 1893376, "step": 2790 }, { "epoch": 0.06828231500256517, "grad_norm": 13.36403751373291, "learning_rate": 2.730248693018029e-07, "loss": 0.1893, "num_input_tokens_seen": 1896768, "step": 2795 }, { "epoch": 0.06840446583441233, "grad_norm": 39.36397171020508, "learning_rate": 2.7351346069282263e-07, "loss": 0.189, "num_input_tokens_seen": 1899904, "step": 2800 }, { "epoch": 0.06852661666625949, "grad_norm": 28.411413192749023, "learning_rate": 2.7400205208384226e-07, "loss": 0.1138, "num_input_tokens_seen": 1903488, "step": 2805 }, { "epoch": 0.06864876749810667, "grad_norm": 43.3648681640625, "learning_rate": 2.7449064347486194e-07, "loss": 0.1319, "num_input_tokens_seen": 1907392, "step": 2810 }, { "epoch": 0.06877091832995383, "grad_norm": 72.1719741821289, "learning_rate": 2.7497923486588167e-07, "loss": 0.1733, "num_input_tokens_seen": 1911040, "step": 2815 }, { "epoch": 0.06889306916180099, "grad_norm": 25.68684959411621, "learning_rate": 2.7546782625690135e-07, "loss": 0.1078, "num_input_tokens_seen": 1914048, "step": 2820 }, { "epoch": 0.06901521999364815, "grad_norm": 30.87330436706543, "learning_rate": 2.7595641764792103e-07, "loss": 0.1456, "num_input_tokens_seen": 1917120, "step": 2825 }, { "epoch": 0.06913737082549533, "grad_norm": 38.771766662597656, "learning_rate": 2.7644500903894077e-07, "loss": 0.238, "num_input_tokens_seen": 1920448, "step": 2830 }, { "epoch": 0.06925952165734249, "grad_norm": 7.507987976074219, "learning_rate": 2.769336004299604e-07, "loss": 0.2612, "num_input_tokens_seen": 1923968, "step": 2835 }, { "epoch": 0.06938167248918965, "grad_norm": 36.60184860229492, "learning_rate": 2.774221918209801e-07, "loss": 0.1823, "num_input_tokens_seen": 1926912, "step": 2840 }, { "epoch": 0.06950382332103681, "grad_norm": 30.51137351989746, "learning_rate": 2.779107832119998e-07, "loss": 0.0955, "num_input_tokens_seen": 1930304, "step": 2845 }, { "epoch": 0.06962597415288398, "grad_norm": 25.373884201049805, "learning_rate": 2.783993746030195e-07, "loss": 0.0827, "num_input_tokens_seen": 1933696, "step": 2850 }, { "epoch": 0.06974812498473115, "grad_norm": 45.243255615234375, "learning_rate": 2.788879659940392e-07, "loss": 0.1063, "num_input_tokens_seen": 1937472, "step": 2855 }, { "epoch": 0.06987027581657831, "grad_norm": 87.1949691772461, "learning_rate": 2.793765573850589e-07, "loss": 0.1743, "num_input_tokens_seen": 1940992, "step": 2860 }, { "epoch": 0.06999242664842548, "grad_norm": 17.872364044189453, "learning_rate": 2.7986514877607854e-07, "loss": 0.1996, "num_input_tokens_seen": 1944704, "step": 2865 }, { "epoch": 0.07011457748027264, "grad_norm": 33.64302444458008, "learning_rate": 2.803537401670982e-07, "loss": 0.2116, "num_input_tokens_seen": 1947840, "step": 2870 }, { "epoch": 0.0702367283121198, "grad_norm": 8.390338897705078, "learning_rate": 2.8084233155811795e-07, "loss": 0.1368, "num_input_tokens_seen": 1951168, "step": 2875 }, { "epoch": 0.07035887914396698, "grad_norm": 12.758993148803711, "learning_rate": 2.8133092294913764e-07, "loss": 0.087, "num_input_tokens_seen": 1954624, "step": 2880 }, { "epoch": 0.07048102997581414, "grad_norm": 52.06711959838867, "learning_rate": 2.818195143401573e-07, "loss": 0.1982, "num_input_tokens_seen": 1958016, "step": 2885 }, { "epoch": 0.0706031808076613, "grad_norm": 28.96224594116211, "learning_rate": 2.8230810573117705e-07, "loss": 0.2, "num_input_tokens_seen": 1961152, "step": 2890 }, { "epoch": 0.07072533163950846, "grad_norm": 34.215213775634766, "learning_rate": 2.827966971221967e-07, "loss": 0.1631, "num_input_tokens_seen": 1964224, "step": 2895 }, { "epoch": 0.07084748247135562, "grad_norm": 64.00146484375, "learning_rate": 2.8328528851321636e-07, "loss": 0.1541, "num_input_tokens_seen": 1967104, "step": 2900 }, { "epoch": 0.0709696333032028, "grad_norm": 12.239304542541504, "learning_rate": 2.837738799042361e-07, "loss": 0.1006, "num_input_tokens_seen": 1970944, "step": 2905 }, { "epoch": 0.07109178413504996, "grad_norm": 49.136531829833984, "learning_rate": 2.842624712952558e-07, "loss": 0.1888, "num_input_tokens_seen": 1974016, "step": 2910 }, { "epoch": 0.07121393496689712, "grad_norm": 33.941627502441406, "learning_rate": 2.8475106268627546e-07, "loss": 0.0826, "num_input_tokens_seen": 1977664, "step": 2915 }, { "epoch": 0.07133608579874429, "grad_norm": 100.75859069824219, "learning_rate": 2.852396540772952e-07, "loss": 0.2103, "num_input_tokens_seen": 1980928, "step": 2920 }, { "epoch": 0.07145823663059145, "grad_norm": 53.549861907958984, "learning_rate": 2.857282454683148e-07, "loss": 0.1023, "num_input_tokens_seen": 1984256, "step": 2925 }, { "epoch": 0.07158038746243862, "grad_norm": 23.475570678710938, "learning_rate": 2.862168368593345e-07, "loss": 0.0925, "num_input_tokens_seen": 1987904, "step": 2930 }, { "epoch": 0.07170253829428579, "grad_norm": 9.245582580566406, "learning_rate": 2.867054282503542e-07, "loss": 0.0685, "num_input_tokens_seen": 1991104, "step": 2935 }, { "epoch": 0.07182468912613295, "grad_norm": 38.58662033081055, "learning_rate": 2.871940196413739e-07, "loss": 0.1111, "num_input_tokens_seen": 1994624, "step": 2940 }, { "epoch": 0.07194683995798011, "grad_norm": 9.25555419921875, "learning_rate": 2.876826110323936e-07, "loss": 0.1573, "num_input_tokens_seen": 1997696, "step": 2945 }, { "epoch": 0.07206899078982727, "grad_norm": 62.213035583496094, "learning_rate": 2.8817120242341333e-07, "loss": 0.125, "num_input_tokens_seen": 2000960, "step": 2950 }, { "epoch": 0.07219114162167445, "grad_norm": 47.498817443847656, "learning_rate": 2.8865979381443296e-07, "loss": 0.2306, "num_input_tokens_seen": 2004224, "step": 2955 }, { "epoch": 0.07231329245352161, "grad_norm": 34.238121032714844, "learning_rate": 2.8914838520545264e-07, "loss": 0.2147, "num_input_tokens_seen": 2007680, "step": 2960 }, { "epoch": 0.07243544328536877, "grad_norm": 53.50193786621094, "learning_rate": 2.896369765964723e-07, "loss": 0.249, "num_input_tokens_seen": 2011008, "step": 2965 }, { "epoch": 0.07255759411721593, "grad_norm": 27.394147872924805, "learning_rate": 2.9012556798749206e-07, "loss": 0.183, "num_input_tokens_seen": 2014464, "step": 2970 }, { "epoch": 0.0726797449490631, "grad_norm": 35.457420349121094, "learning_rate": 2.9061415937851174e-07, "loss": 0.1893, "num_input_tokens_seen": 2018240, "step": 2975 }, { "epoch": 0.07280189578091027, "grad_norm": 34.14503860473633, "learning_rate": 2.911027507695314e-07, "loss": 0.1779, "num_input_tokens_seen": 2021440, "step": 2980 }, { "epoch": 0.07292404661275743, "grad_norm": 82.81512451171875, "learning_rate": 2.915913421605511e-07, "loss": 0.2133, "num_input_tokens_seen": 2024832, "step": 2985 }, { "epoch": 0.0730461974446046, "grad_norm": 16.721773147583008, "learning_rate": 2.920799335515708e-07, "loss": 0.1293, "num_input_tokens_seen": 2028480, "step": 2990 }, { "epoch": 0.07316834827645176, "grad_norm": 32.263912200927734, "learning_rate": 2.9256852494259046e-07, "loss": 0.1734, "num_input_tokens_seen": 2031808, "step": 2995 }, { "epoch": 0.07329049910829893, "grad_norm": 23.13930892944336, "learning_rate": 2.930571163336102e-07, "loss": 0.1972, "num_input_tokens_seen": 2034496, "step": 3000 }, { "epoch": 0.0734126499401461, "grad_norm": 34.838260650634766, "learning_rate": 2.935457077246299e-07, "loss": 0.1524, "num_input_tokens_seen": 2037504, "step": 3005 }, { "epoch": 0.07353480077199326, "grad_norm": 56.00746154785156, "learning_rate": 2.9403429911564956e-07, "loss": 0.1649, "num_input_tokens_seen": 2040512, "step": 3010 }, { "epoch": 0.07365695160384042, "grad_norm": 29.56755256652832, "learning_rate": 2.945228905066693e-07, "loss": 0.1462, "num_input_tokens_seen": 2043712, "step": 3015 }, { "epoch": 0.07377910243568758, "grad_norm": 28.033449172973633, "learning_rate": 2.950114818976889e-07, "loss": 0.1338, "num_input_tokens_seen": 2047104, "step": 3020 }, { "epoch": 0.07390125326753476, "grad_norm": 20.67574119567871, "learning_rate": 2.955000732887086e-07, "loss": 0.1718, "num_input_tokens_seen": 2050944, "step": 3025 }, { "epoch": 0.07402340409938192, "grad_norm": 30.444156646728516, "learning_rate": 2.9598866467972834e-07, "loss": 0.0823, "num_input_tokens_seen": 2054016, "step": 3030 }, { "epoch": 0.07414555493122908, "grad_norm": 9.620954513549805, "learning_rate": 2.96477256070748e-07, "loss": 0.091, "num_input_tokens_seen": 2056896, "step": 3035 }, { "epoch": 0.07426770576307624, "grad_norm": 25.14250373840332, "learning_rate": 2.969658474617677e-07, "loss": 0.1407, "num_input_tokens_seen": 2060224, "step": 3040 }, { "epoch": 0.0743898565949234, "grad_norm": 69.50040435791016, "learning_rate": 2.9745443885278744e-07, "loss": 0.1161, "num_input_tokens_seen": 2063808, "step": 3045 }, { "epoch": 0.07451200742677058, "grad_norm": 58.84195327758789, "learning_rate": 2.9794303024380706e-07, "loss": 0.2451, "num_input_tokens_seen": 2067072, "step": 3050 }, { "epoch": 0.07463415825861774, "grad_norm": 15.835522651672363, "learning_rate": 2.9843162163482675e-07, "loss": 0.1461, "num_input_tokens_seen": 2070720, "step": 3055 }, { "epoch": 0.0747563090904649, "grad_norm": 52.69871139526367, "learning_rate": 2.989202130258465e-07, "loss": 0.0868, "num_input_tokens_seen": 2074176, "step": 3060 }, { "epoch": 0.07487845992231207, "grad_norm": 75.31199645996094, "learning_rate": 2.9940880441686616e-07, "loss": 0.2104, "num_input_tokens_seen": 2077376, "step": 3065 }, { "epoch": 0.07500061075415923, "grad_norm": 25.50840950012207, "learning_rate": 2.9989739580788584e-07, "loss": 0.1225, "num_input_tokens_seen": 2080896, "step": 3070 }, { "epoch": 0.0751227615860064, "grad_norm": 38.57832717895508, "learning_rate": 3.003859871989056e-07, "loss": 0.167, "num_input_tokens_seen": 2084096, "step": 3075 }, { "epoch": 0.07524491241785357, "grad_norm": 53.26701736450195, "learning_rate": 3.008745785899252e-07, "loss": 0.2051, "num_input_tokens_seen": 2087296, "step": 3080 }, { "epoch": 0.07536706324970073, "grad_norm": 6.57878303527832, "learning_rate": 3.013631699809449e-07, "loss": 0.1622, "num_input_tokens_seen": 2090624, "step": 3085 }, { "epoch": 0.07548921408154789, "grad_norm": 44.716312408447266, "learning_rate": 3.018517613719646e-07, "loss": 0.0648, "num_input_tokens_seen": 2094848, "step": 3090 }, { "epoch": 0.07561136491339505, "grad_norm": 29.319324493408203, "learning_rate": 3.023403527629843e-07, "loss": 0.1455, "num_input_tokens_seen": 2098048, "step": 3095 }, { "epoch": 0.07573351574524223, "grad_norm": 15.153791427612305, "learning_rate": 3.02828944154004e-07, "loss": 0.076, "num_input_tokens_seen": 2101888, "step": 3100 }, { "epoch": 0.07585566657708939, "grad_norm": 30.897048950195312, "learning_rate": 3.033175355450237e-07, "loss": 0.2214, "num_input_tokens_seen": 2104960, "step": 3105 }, { "epoch": 0.07597781740893655, "grad_norm": 23.50116539001465, "learning_rate": 3.0380612693604335e-07, "loss": 0.1427, "num_input_tokens_seen": 2107968, "step": 3110 }, { "epoch": 0.07609996824078372, "grad_norm": 33.305538177490234, "learning_rate": 3.0429471832706303e-07, "loss": 0.1003, "num_input_tokens_seen": 2111168, "step": 3115 }, { "epoch": 0.07622211907263088, "grad_norm": 31.173358917236328, "learning_rate": 3.0478330971808276e-07, "loss": 0.1005, "num_input_tokens_seen": 2114496, "step": 3120 }, { "epoch": 0.07634426990447805, "grad_norm": 4.587650775909424, "learning_rate": 3.0527190110910244e-07, "loss": 0.0685, "num_input_tokens_seen": 2117952, "step": 3125 }, { "epoch": 0.07646642073632522, "grad_norm": 59.983375549316406, "learning_rate": 3.057604925001221e-07, "loss": 0.2478, "num_input_tokens_seen": 2121088, "step": 3130 }, { "epoch": 0.07658857156817238, "grad_norm": 26.86677360534668, "learning_rate": 3.0624908389114186e-07, "loss": 0.1306, "num_input_tokens_seen": 2124224, "step": 3135 }, { "epoch": 0.07671072240001954, "grad_norm": 34.66873550415039, "learning_rate": 3.067376752821615e-07, "loss": 0.1123, "num_input_tokens_seen": 2127296, "step": 3140 }, { "epoch": 0.0768328732318667, "grad_norm": 18.21170425415039, "learning_rate": 3.0722626667318117e-07, "loss": 0.1153, "num_input_tokens_seen": 2131008, "step": 3145 }, { "epoch": 0.07695502406371388, "grad_norm": 94.64302062988281, "learning_rate": 3.077148580642009e-07, "loss": 0.2439, "num_input_tokens_seen": 2134016, "step": 3150 }, { "epoch": 0.07707717489556104, "grad_norm": 46.07650375366211, "learning_rate": 3.082034494552206e-07, "loss": 0.1683, "num_input_tokens_seen": 2138048, "step": 3155 }, { "epoch": 0.0771993257274082, "grad_norm": 32.91551208496094, "learning_rate": 3.0869204084624027e-07, "loss": 0.099, "num_input_tokens_seen": 2141504, "step": 3160 }, { "epoch": 0.07732147655925536, "grad_norm": 55.034549713134766, "learning_rate": 3.0918063223726e-07, "loss": 0.1558, "num_input_tokens_seen": 2144576, "step": 3165 }, { "epoch": 0.07744362739110254, "grad_norm": 20.479995727539062, "learning_rate": 3.0966922362827963e-07, "loss": 0.1402, "num_input_tokens_seen": 2147904, "step": 3170 }, { "epoch": 0.0775657782229497, "grad_norm": 20.71714210510254, "learning_rate": 3.101578150192993e-07, "loss": 0.1425, "num_input_tokens_seen": 2151360, "step": 3175 }, { "epoch": 0.07768792905479686, "grad_norm": 21.133060455322266, "learning_rate": 3.1064640641031904e-07, "loss": 0.1607, "num_input_tokens_seen": 2154816, "step": 3180 }, { "epoch": 0.07781007988664403, "grad_norm": 18.6674747467041, "learning_rate": 3.111349978013387e-07, "loss": 0.1372, "num_input_tokens_seen": 2157952, "step": 3185 }, { "epoch": 0.07793223071849119, "grad_norm": 49.647891998291016, "learning_rate": 3.116235891923584e-07, "loss": 0.139, "num_input_tokens_seen": 2161216, "step": 3190 }, { "epoch": 0.07805438155033836, "grad_norm": 25.552492141723633, "learning_rate": 3.1211218058337814e-07, "loss": 0.1316, "num_input_tokens_seen": 2164416, "step": 3195 }, { "epoch": 0.07817653238218553, "grad_norm": 60.401676177978516, "learning_rate": 3.1260077197439777e-07, "loss": 0.1019, "num_input_tokens_seen": 2167680, "step": 3200 }, { "epoch": 0.07829868321403269, "grad_norm": 83.02261352539062, "learning_rate": 3.1308936336541745e-07, "loss": 0.177, "num_input_tokens_seen": 2170688, "step": 3205 }, { "epoch": 0.07842083404587985, "grad_norm": 31.337995529174805, "learning_rate": 3.135779547564372e-07, "loss": 0.0725, "num_input_tokens_seen": 2173696, "step": 3210 }, { "epoch": 0.07854298487772701, "grad_norm": 27.961589813232422, "learning_rate": 3.1406654614745687e-07, "loss": 0.1236, "num_input_tokens_seen": 2177344, "step": 3215 }, { "epoch": 0.07866513570957419, "grad_norm": 31.159038543701172, "learning_rate": 3.1455513753847655e-07, "loss": 0.13, "num_input_tokens_seen": 2180672, "step": 3220 }, { "epoch": 0.07878728654142135, "grad_norm": 57.42909622192383, "learning_rate": 3.150437289294963e-07, "loss": 0.1509, "num_input_tokens_seen": 2183808, "step": 3225 }, { "epoch": 0.07890943737326851, "grad_norm": 51.559627532958984, "learning_rate": 3.1553232032051596e-07, "loss": 0.1114, "num_input_tokens_seen": 2187584, "step": 3230 }, { "epoch": 0.07903158820511567, "grad_norm": 55.43442916870117, "learning_rate": 3.160209117115356e-07, "loss": 0.1227, "num_input_tokens_seen": 2190784, "step": 3235 }, { "epoch": 0.07915373903696284, "grad_norm": 42.4616813659668, "learning_rate": 3.165095031025553e-07, "loss": 0.1845, "num_input_tokens_seen": 2194112, "step": 3240 }, { "epoch": 0.07927588986881001, "grad_norm": 55.628013610839844, "learning_rate": 3.16998094493575e-07, "loss": 0.1198, "num_input_tokens_seen": 2197504, "step": 3245 }, { "epoch": 0.07939804070065717, "grad_norm": 34.316688537597656, "learning_rate": 3.174866858845947e-07, "loss": 0.1323, "num_input_tokens_seen": 2200512, "step": 3250 }, { "epoch": 0.07952019153250434, "grad_norm": 164.48052978515625, "learning_rate": 3.179752772756144e-07, "loss": 0.1792, "num_input_tokens_seen": 2204224, "step": 3255 }, { "epoch": 0.0796423423643515, "grad_norm": 91.04350280761719, "learning_rate": 3.184638686666341e-07, "loss": 0.1659, "num_input_tokens_seen": 2207488, "step": 3260 }, { "epoch": 0.07976449319619866, "grad_norm": 8.240289688110352, "learning_rate": 3.1895246005765373e-07, "loss": 0.1968, "num_input_tokens_seen": 2210496, "step": 3265 }, { "epoch": 0.07988664402804584, "grad_norm": 21.83321762084961, "learning_rate": 3.1944105144867347e-07, "loss": 0.0935, "num_input_tokens_seen": 2213824, "step": 3270 }, { "epoch": 0.080008794859893, "grad_norm": 38.04369354248047, "learning_rate": 3.1992964283969315e-07, "loss": 0.1406, "num_input_tokens_seen": 2217600, "step": 3275 }, { "epoch": 0.08013094569174016, "grad_norm": 54.49981689453125, "learning_rate": 3.2041823423071283e-07, "loss": 0.1532, "num_input_tokens_seen": 2221056, "step": 3280 }, { "epoch": 0.08025309652358732, "grad_norm": 38.160335540771484, "learning_rate": 3.2090682562173256e-07, "loss": 0.094, "num_input_tokens_seen": 2224448, "step": 3285 }, { "epoch": 0.08037524735543448, "grad_norm": 33.39901351928711, "learning_rate": 3.2139541701275224e-07, "loss": 0.1089, "num_input_tokens_seen": 2228160, "step": 3290 }, { "epoch": 0.08049739818728166, "grad_norm": 32.578369140625, "learning_rate": 3.2188400840377187e-07, "loss": 0.127, "num_input_tokens_seen": 2231488, "step": 3295 }, { "epoch": 0.08061954901912882, "grad_norm": 57.24115753173828, "learning_rate": 3.223725997947916e-07, "loss": 0.0842, "num_input_tokens_seen": 2235008, "step": 3300 }, { "epoch": 0.08074169985097598, "grad_norm": 71.96141052246094, "learning_rate": 3.228611911858113e-07, "loss": 0.2433, "num_input_tokens_seen": 2238016, "step": 3305 }, { "epoch": 0.08086385068282315, "grad_norm": 5.054819583892822, "learning_rate": 3.2334978257683097e-07, "loss": 0.1516, "num_input_tokens_seen": 2241664, "step": 3310 }, { "epoch": 0.08098600151467031, "grad_norm": 73.97016906738281, "learning_rate": 3.238383739678507e-07, "loss": 0.2021, "num_input_tokens_seen": 2244864, "step": 3315 }, { "epoch": 0.08110815234651748, "grad_norm": 13.356393814086914, "learning_rate": 3.243269653588704e-07, "loss": 0.0696, "num_input_tokens_seen": 2248320, "step": 3320 }, { "epoch": 0.08123030317836465, "grad_norm": 36.295074462890625, "learning_rate": 3.2481555674989e-07, "loss": 0.088, "num_input_tokens_seen": 2251712, "step": 3325 }, { "epoch": 0.08135245401021181, "grad_norm": 89.90282440185547, "learning_rate": 3.2530414814090975e-07, "loss": 0.1939, "num_input_tokens_seen": 2254976, "step": 3330 }, { "epoch": 0.08147460484205897, "grad_norm": 22.770984649658203, "learning_rate": 3.2579273953192943e-07, "loss": 0.2149, "num_input_tokens_seen": 2258304, "step": 3335 }, { "epoch": 0.08159675567390615, "grad_norm": 9.757627487182617, "learning_rate": 3.262813309229491e-07, "loss": 0.1501, "num_input_tokens_seen": 2261632, "step": 3340 }, { "epoch": 0.08171890650575331, "grad_norm": 39.16585922241211, "learning_rate": 3.2676992231396884e-07, "loss": 0.1428, "num_input_tokens_seen": 2264576, "step": 3345 }, { "epoch": 0.08184105733760047, "grad_norm": 54.00513458251953, "learning_rate": 3.272585137049885e-07, "loss": 0.2068, "num_input_tokens_seen": 2267520, "step": 3350 }, { "epoch": 0.08196320816944763, "grad_norm": 17.601160049438477, "learning_rate": 3.2774710509600815e-07, "loss": 0.0769, "num_input_tokens_seen": 2271552, "step": 3355 }, { "epoch": 0.0820853590012948, "grad_norm": 43.5909538269043, "learning_rate": 3.282356964870279e-07, "loss": 0.0671, "num_input_tokens_seen": 2275072, "step": 3360 }, { "epoch": 0.08220750983314197, "grad_norm": 36.28582763671875, "learning_rate": 3.2872428787804757e-07, "loss": 0.1913, "num_input_tokens_seen": 2278912, "step": 3365 }, { "epoch": 0.08232966066498913, "grad_norm": 48.7681999206543, "learning_rate": 3.2921287926906725e-07, "loss": 0.1205, "num_input_tokens_seen": 2282624, "step": 3370 }, { "epoch": 0.0824518114968363, "grad_norm": 6.605459690093994, "learning_rate": 3.29701470660087e-07, "loss": 0.192, "num_input_tokens_seen": 2286336, "step": 3375 }, { "epoch": 0.08257396232868346, "grad_norm": 38.756752014160156, "learning_rate": 3.3019006205110667e-07, "loss": 0.1847, "num_input_tokens_seen": 2289792, "step": 3380 }, { "epoch": 0.08269611316053062, "grad_norm": 37.65235900878906, "learning_rate": 3.306786534421263e-07, "loss": 0.1688, "num_input_tokens_seen": 2293248, "step": 3385 }, { "epoch": 0.0828182639923778, "grad_norm": 46.75678253173828, "learning_rate": 3.3116724483314603e-07, "loss": 0.1734, "num_input_tokens_seen": 2296512, "step": 3390 }, { "epoch": 0.08294041482422496, "grad_norm": 21.31465721130371, "learning_rate": 3.316558362241657e-07, "loss": 0.08, "num_input_tokens_seen": 2299904, "step": 3395 }, { "epoch": 0.08306256565607212, "grad_norm": 45.592308044433594, "learning_rate": 3.321444276151854e-07, "loss": 0.0663, "num_input_tokens_seen": 2303296, "step": 3400 }, { "epoch": 0.08318471648791928, "grad_norm": 42.98056411743164, "learning_rate": 3.326330190062051e-07, "loss": 0.1287, "num_input_tokens_seen": 2307072, "step": 3405 }, { "epoch": 0.08330686731976644, "grad_norm": 50.305381774902344, "learning_rate": 3.331216103972248e-07, "loss": 0.1319, "num_input_tokens_seen": 2310144, "step": 3410 }, { "epoch": 0.08342901815161362, "grad_norm": 91.99374389648438, "learning_rate": 3.3361020178824444e-07, "loss": 0.1975, "num_input_tokens_seen": 2313472, "step": 3415 }, { "epoch": 0.08355116898346078, "grad_norm": 27.918872833251953, "learning_rate": 3.3409879317926417e-07, "loss": 0.077, "num_input_tokens_seen": 2316672, "step": 3420 }, { "epoch": 0.08367331981530794, "grad_norm": 24.6630859375, "learning_rate": 3.3458738457028385e-07, "loss": 0.0846, "num_input_tokens_seen": 2319872, "step": 3425 }, { "epoch": 0.0837954706471551, "grad_norm": 12.345520973205566, "learning_rate": 3.3507597596130353e-07, "loss": 0.0846, "num_input_tokens_seen": 2323328, "step": 3430 }, { "epoch": 0.08391762147900227, "grad_norm": 29.239362716674805, "learning_rate": 3.3556456735232327e-07, "loss": 0.1385, "num_input_tokens_seen": 2326208, "step": 3435 }, { "epoch": 0.08403977231084944, "grad_norm": 18.62710189819336, "learning_rate": 3.3605315874334295e-07, "loss": 0.0969, "num_input_tokens_seen": 2329536, "step": 3440 }, { "epoch": 0.0841619231426966, "grad_norm": 87.40646362304688, "learning_rate": 3.365417501343626e-07, "loss": 0.1586, "num_input_tokens_seen": 2333440, "step": 3445 }, { "epoch": 0.08428407397454377, "grad_norm": 77.6432113647461, "learning_rate": 3.370303415253823e-07, "loss": 0.1598, "num_input_tokens_seen": 2336640, "step": 3450 }, { "epoch": 0.08440622480639093, "grad_norm": 41.03227615356445, "learning_rate": 3.37518932916402e-07, "loss": 0.1757, "num_input_tokens_seen": 2339904, "step": 3455 }, { "epoch": 0.08452837563823809, "grad_norm": 23.494503021240234, "learning_rate": 3.3800752430742167e-07, "loss": 0.0961, "num_input_tokens_seen": 2343808, "step": 3460 }, { "epoch": 0.08465052647008527, "grad_norm": 11.61443042755127, "learning_rate": 3.384961156984414e-07, "loss": 0.2321, "num_input_tokens_seen": 2347264, "step": 3465 }, { "epoch": 0.08477267730193243, "grad_norm": 26.035037994384766, "learning_rate": 3.389847070894611e-07, "loss": 0.187, "num_input_tokens_seen": 2350464, "step": 3470 }, { "epoch": 0.08489482813377959, "grad_norm": 52.27019500732422, "learning_rate": 3.3947329848048077e-07, "loss": 0.2035, "num_input_tokens_seen": 2353536, "step": 3475 }, { "epoch": 0.08501697896562675, "grad_norm": 22.691511154174805, "learning_rate": 3.3996188987150045e-07, "loss": 0.1203, "num_input_tokens_seen": 2356864, "step": 3480 }, { "epoch": 0.08513912979747391, "grad_norm": 37.64372253417969, "learning_rate": 3.4045048126252013e-07, "loss": 0.0895, "num_input_tokens_seen": 2360192, "step": 3485 }, { "epoch": 0.08526128062932109, "grad_norm": 36.15673065185547, "learning_rate": 3.409390726535398e-07, "loss": 0.0905, "num_input_tokens_seen": 2363520, "step": 3490 }, { "epoch": 0.08538343146116825, "grad_norm": 42.125545501708984, "learning_rate": 3.4142766404455955e-07, "loss": 0.0922, "num_input_tokens_seen": 2366656, "step": 3495 }, { "epoch": 0.08550558229301541, "grad_norm": 49.36576843261719, "learning_rate": 3.4191625543557923e-07, "loss": 0.1001, "num_input_tokens_seen": 2369920, "step": 3500 }, { "epoch": 0.08562773312486258, "grad_norm": 7.662629127502441, "learning_rate": 3.424048468265989e-07, "loss": 0.1379, "num_input_tokens_seen": 2373312, "step": 3505 }, { "epoch": 0.08574988395670975, "grad_norm": 79.47669982910156, "learning_rate": 3.428934382176186e-07, "loss": 0.1846, "num_input_tokens_seen": 2376448, "step": 3510 }, { "epoch": 0.08587203478855691, "grad_norm": 57.955997467041016, "learning_rate": 3.433820296086383e-07, "loss": 0.1408, "num_input_tokens_seen": 2380160, "step": 3515 }, { "epoch": 0.08599418562040408, "grad_norm": 23.892112731933594, "learning_rate": 3.4387062099965796e-07, "loss": 0.156, "num_input_tokens_seen": 2383616, "step": 3520 }, { "epoch": 0.08611633645225124, "grad_norm": 4.1693830490112305, "learning_rate": 3.443592123906777e-07, "loss": 0.0705, "num_input_tokens_seen": 2386944, "step": 3525 }, { "epoch": 0.0862384872840984, "grad_norm": 38.16275405883789, "learning_rate": 3.4484780378169737e-07, "loss": 0.0863, "num_input_tokens_seen": 2390336, "step": 3530 }, { "epoch": 0.08636063811594558, "grad_norm": 35.907779693603516, "learning_rate": 3.4533639517271705e-07, "loss": 0.1225, "num_input_tokens_seen": 2393600, "step": 3535 }, { "epoch": 0.08648278894779274, "grad_norm": 37.99102783203125, "learning_rate": 3.4582498656373673e-07, "loss": 0.1515, "num_input_tokens_seen": 2396608, "step": 3540 }, { "epoch": 0.0866049397796399, "grad_norm": 53.43939208984375, "learning_rate": 3.463135779547564e-07, "loss": 0.1262, "num_input_tokens_seen": 2399680, "step": 3545 }, { "epoch": 0.08672709061148706, "grad_norm": 21.493928909301758, "learning_rate": 3.468021693457761e-07, "loss": 0.0761, "num_input_tokens_seen": 2402432, "step": 3550 }, { "epoch": 0.08684924144333422, "grad_norm": 21.230327606201172, "learning_rate": 3.4729076073679583e-07, "loss": 0.2028, "num_input_tokens_seen": 2405504, "step": 3555 }, { "epoch": 0.0869713922751814, "grad_norm": 55.68876647949219, "learning_rate": 3.477793521278155e-07, "loss": 0.1644, "num_input_tokens_seen": 2409024, "step": 3560 }, { "epoch": 0.08709354310702856, "grad_norm": 90.71858215332031, "learning_rate": 3.482679435188352e-07, "loss": 0.1153, "num_input_tokens_seen": 2412416, "step": 3565 }, { "epoch": 0.08721569393887572, "grad_norm": 106.14388275146484, "learning_rate": 3.487565349098549e-07, "loss": 0.1801, "num_input_tokens_seen": 2416192, "step": 3570 }, { "epoch": 0.08733784477072289, "grad_norm": 27.10405158996582, "learning_rate": 3.4924512630087456e-07, "loss": 0.1723, "num_input_tokens_seen": 2419840, "step": 3575 }, { "epoch": 0.08745999560257005, "grad_norm": 26.839338302612305, "learning_rate": 3.4973371769189424e-07, "loss": 0.0996, "num_input_tokens_seen": 2422976, "step": 3580 }, { "epoch": 0.08758214643441722, "grad_norm": 64.43603515625, "learning_rate": 3.5022230908291397e-07, "loss": 0.1036, "num_input_tokens_seen": 2426496, "step": 3585 }, { "epoch": 0.08770429726626439, "grad_norm": 39.594139099121094, "learning_rate": 3.5071090047393365e-07, "loss": 0.0971, "num_input_tokens_seen": 2429376, "step": 3590 }, { "epoch": 0.08782644809811155, "grad_norm": 41.60728073120117, "learning_rate": 3.5119949186495333e-07, "loss": 0.1917, "num_input_tokens_seen": 2432576, "step": 3595 }, { "epoch": 0.08794859892995871, "grad_norm": 47.88832473754883, "learning_rate": 3.51688083255973e-07, "loss": 0.1268, "num_input_tokens_seen": 2436096, "step": 3600 }, { "epoch": 0.08807074976180587, "grad_norm": 66.10047149658203, "learning_rate": 3.521766746469927e-07, "loss": 0.1253, "num_input_tokens_seen": 2440064, "step": 3605 }, { "epoch": 0.08819290059365305, "grad_norm": 36.456153869628906, "learning_rate": 3.526652660380124e-07, "loss": 0.1452, "num_input_tokens_seen": 2443200, "step": 3610 }, { "epoch": 0.08831505142550021, "grad_norm": 14.999835968017578, "learning_rate": 3.531538574290321e-07, "loss": 0.2102, "num_input_tokens_seen": 2446848, "step": 3615 }, { "epoch": 0.08843720225734737, "grad_norm": 33.82028579711914, "learning_rate": 3.536424488200518e-07, "loss": 0.131, "num_input_tokens_seen": 2450368, "step": 3620 }, { "epoch": 0.08855935308919453, "grad_norm": 27.221302032470703, "learning_rate": 3.541310402110715e-07, "loss": 0.109, "num_input_tokens_seen": 2454016, "step": 3625 }, { "epoch": 0.0886815039210417, "grad_norm": 57.56504440307617, "learning_rate": 3.5461963160209116e-07, "loss": 0.1683, "num_input_tokens_seen": 2457472, "step": 3630 }, { "epoch": 0.08880365475288887, "grad_norm": 26.43320083618164, "learning_rate": 3.5510822299311084e-07, "loss": 0.0988, "num_input_tokens_seen": 2461312, "step": 3635 }, { "epoch": 0.08892580558473603, "grad_norm": 23.146326065063477, "learning_rate": 3.555968143841305e-07, "loss": 0.1124, "num_input_tokens_seen": 2464512, "step": 3640 }, { "epoch": 0.0890479564165832, "grad_norm": 42.45225524902344, "learning_rate": 3.5608540577515025e-07, "loss": 0.1564, "num_input_tokens_seen": 2468288, "step": 3645 }, { "epoch": 0.08917010724843036, "grad_norm": 38.197105407714844, "learning_rate": 3.5657399716616993e-07, "loss": 0.1885, "num_input_tokens_seen": 2471424, "step": 3650 }, { "epoch": 0.08929225808027752, "grad_norm": 13.764596939086914, "learning_rate": 3.570625885571896e-07, "loss": 0.0834, "num_input_tokens_seen": 2475008, "step": 3655 }, { "epoch": 0.0894144089121247, "grad_norm": 43.19955825805664, "learning_rate": 3.5755117994820924e-07, "loss": 0.1104, "num_input_tokens_seen": 2478080, "step": 3660 }, { "epoch": 0.08953655974397186, "grad_norm": 17.45258140563965, "learning_rate": 3.58039771339229e-07, "loss": 0.136, "num_input_tokens_seen": 2481472, "step": 3665 }, { "epoch": 0.08965871057581902, "grad_norm": 36.92076110839844, "learning_rate": 3.5852836273024866e-07, "loss": 0.2465, "num_input_tokens_seen": 2484736, "step": 3670 }, { "epoch": 0.08978086140766618, "grad_norm": 42.25662612915039, "learning_rate": 3.590169541212684e-07, "loss": 0.0973, "num_input_tokens_seen": 2487872, "step": 3675 }, { "epoch": 0.08990301223951336, "grad_norm": 43.27426528930664, "learning_rate": 3.595055455122881e-07, "loss": 0.1254, "num_input_tokens_seen": 2491392, "step": 3680 }, { "epoch": 0.09002516307136052, "grad_norm": 56.761863708496094, "learning_rate": 3.5999413690330776e-07, "loss": 0.1701, "num_input_tokens_seen": 2494784, "step": 3685 }, { "epoch": 0.09014731390320768, "grad_norm": 16.943342208862305, "learning_rate": 3.604827282943275e-07, "loss": 0.0829, "num_input_tokens_seen": 2497984, "step": 3690 }, { "epoch": 0.09026946473505484, "grad_norm": 36.46076965332031, "learning_rate": 3.609713196853471e-07, "loss": 0.2115, "num_input_tokens_seen": 2500992, "step": 3695 }, { "epoch": 0.090391615566902, "grad_norm": 30.30020523071289, "learning_rate": 3.614599110763668e-07, "loss": 0.1073, "num_input_tokens_seen": 2504384, "step": 3700 }, { "epoch": 0.09051376639874918, "grad_norm": 25.540746688842773, "learning_rate": 3.619485024673865e-07, "loss": 0.1459, "num_input_tokens_seen": 2508096, "step": 3705 }, { "epoch": 0.09063591723059634, "grad_norm": 83.55743408203125, "learning_rate": 3.624370938584062e-07, "loss": 0.2189, "num_input_tokens_seen": 2511232, "step": 3710 }, { "epoch": 0.0907580680624435, "grad_norm": 20.501020431518555, "learning_rate": 3.629256852494259e-07, "loss": 0.1667, "num_input_tokens_seen": 2514240, "step": 3715 }, { "epoch": 0.09088021889429067, "grad_norm": 64.41683959960938, "learning_rate": 3.6341427664044563e-07, "loss": 0.1413, "num_input_tokens_seen": 2518144, "step": 3720 }, { "epoch": 0.09100236972613783, "grad_norm": 57.561256408691406, "learning_rate": 3.6390286803146526e-07, "loss": 0.1179, "num_input_tokens_seen": 2521344, "step": 3725 }, { "epoch": 0.091124520557985, "grad_norm": 44.93265914916992, "learning_rate": 3.6439145942248494e-07, "loss": 0.2325, "num_input_tokens_seen": 2524288, "step": 3730 }, { "epoch": 0.09124667138983217, "grad_norm": 54.41051483154297, "learning_rate": 3.648800508135046e-07, "loss": 0.1381, "num_input_tokens_seen": 2528064, "step": 3735 }, { "epoch": 0.09136882222167933, "grad_norm": 65.68891906738281, "learning_rate": 3.6536864220452436e-07, "loss": 0.1633, "num_input_tokens_seen": 2531392, "step": 3740 }, { "epoch": 0.09149097305352649, "grad_norm": 39.20538330078125, "learning_rate": 3.6585723359554404e-07, "loss": 0.0835, "num_input_tokens_seen": 2534720, "step": 3745 }, { "epoch": 0.09161312388537365, "grad_norm": 43.629066467285156, "learning_rate": 3.663458249865637e-07, "loss": 0.2312, "num_input_tokens_seen": 2538304, "step": 3750 }, { "epoch": 0.09173527471722083, "grad_norm": 28.365129470825195, "learning_rate": 3.668344163775834e-07, "loss": 0.1795, "num_input_tokens_seen": 2541696, "step": 3755 }, { "epoch": 0.09185742554906799, "grad_norm": 4.946837425231934, "learning_rate": 3.673230077686031e-07, "loss": 0.1341, "num_input_tokens_seen": 2545216, "step": 3760 }, { "epoch": 0.09197957638091515, "grad_norm": 51.57637405395508, "learning_rate": 3.6781159915962276e-07, "loss": 0.126, "num_input_tokens_seen": 2548480, "step": 3765 }, { "epoch": 0.09210172721276232, "grad_norm": 20.10322380065918, "learning_rate": 3.683001905506425e-07, "loss": 0.0626, "num_input_tokens_seen": 2551808, "step": 3770 }, { "epoch": 0.09222387804460948, "grad_norm": 51.054447174072266, "learning_rate": 3.687887819416622e-07, "loss": 0.1606, "num_input_tokens_seen": 2555200, "step": 3775 }, { "epoch": 0.09234602887645665, "grad_norm": 37.11040115356445, "learning_rate": 3.6927737333268186e-07, "loss": 0.1056, "num_input_tokens_seen": 2558336, "step": 3780 }, { "epoch": 0.09246817970830382, "grad_norm": 35.62503433227539, "learning_rate": 3.6976596472370154e-07, "loss": 0.148, "num_input_tokens_seen": 2561856, "step": 3785 }, { "epoch": 0.09259033054015098, "grad_norm": 27.626007080078125, "learning_rate": 3.702545561147212e-07, "loss": 0.2257, "num_input_tokens_seen": 2565120, "step": 3790 }, { "epoch": 0.09271248137199814, "grad_norm": 81.18321990966797, "learning_rate": 3.707431475057409e-07, "loss": 0.1475, "num_input_tokens_seen": 2568704, "step": 3795 }, { "epoch": 0.0928346322038453, "grad_norm": 25.010517120361328, "learning_rate": 3.7123173889676064e-07, "loss": 0.1184, "num_input_tokens_seen": 2572288, "step": 3800 }, { "epoch": 0.09295678303569248, "grad_norm": 67.2930679321289, "learning_rate": 3.717203302877803e-07, "loss": 0.1522, "num_input_tokens_seen": 2575872, "step": 3805 }, { "epoch": 0.09307893386753964, "grad_norm": 20.680904388427734, "learning_rate": 3.722089216788e-07, "loss": 0.1005, "num_input_tokens_seen": 2578624, "step": 3810 }, { "epoch": 0.0932010846993868, "grad_norm": 48.87485885620117, "learning_rate": 3.726975130698197e-07, "loss": 0.075, "num_input_tokens_seen": 2581888, "step": 3815 }, { "epoch": 0.09332323553123396, "grad_norm": 29.187978744506836, "learning_rate": 3.7318610446083936e-07, "loss": 0.1201, "num_input_tokens_seen": 2585216, "step": 3820 }, { "epoch": 0.09344538636308113, "grad_norm": 25.896148681640625, "learning_rate": 3.7367469585185904e-07, "loss": 0.1435, "num_input_tokens_seen": 2588480, "step": 3825 }, { "epoch": 0.0935675371949283, "grad_norm": 32.884918212890625, "learning_rate": 3.741632872428788e-07, "loss": 0.1068, "num_input_tokens_seen": 2591488, "step": 3830 }, { "epoch": 0.09368968802677546, "grad_norm": 4.826634407043457, "learning_rate": 3.7465187863389846e-07, "loss": 0.0608, "num_input_tokens_seen": 2594560, "step": 3835 }, { "epoch": 0.09381183885862263, "grad_norm": 45.7083625793457, "learning_rate": 3.7514047002491814e-07, "loss": 0.166, "num_input_tokens_seen": 2597952, "step": 3840 }, { "epoch": 0.09393398969046979, "grad_norm": 76.23258972167969, "learning_rate": 3.756290614159378e-07, "loss": 0.2698, "num_input_tokens_seen": 2602112, "step": 3845 }, { "epoch": 0.09405614052231696, "grad_norm": 49.7977294921875, "learning_rate": 3.761176528069575e-07, "loss": 0.1339, "num_input_tokens_seen": 2605184, "step": 3850 }, { "epoch": 0.09417829135416413, "grad_norm": 29.901966094970703, "learning_rate": 3.766062441979772e-07, "loss": 0.1611, "num_input_tokens_seen": 2608384, "step": 3855 }, { "epoch": 0.09430044218601129, "grad_norm": 99.41666412353516, "learning_rate": 3.770948355889969e-07, "loss": 0.0959, "num_input_tokens_seen": 2611968, "step": 3860 }, { "epoch": 0.09442259301785845, "grad_norm": 55.669315338134766, "learning_rate": 3.775834269800166e-07, "loss": 0.3012, "num_input_tokens_seen": 2615168, "step": 3865 }, { "epoch": 0.09454474384970561, "grad_norm": 40.56696701049805, "learning_rate": 3.780720183710363e-07, "loss": 0.1067, "num_input_tokens_seen": 2618688, "step": 3870 }, { "epoch": 0.09466689468155279, "grad_norm": 35.744956970214844, "learning_rate": 3.7856060976205596e-07, "loss": 0.1673, "num_input_tokens_seen": 2622080, "step": 3875 }, { "epoch": 0.09478904551339995, "grad_norm": 21.277301788330078, "learning_rate": 3.7904920115307564e-07, "loss": 0.1458, "num_input_tokens_seen": 2625216, "step": 3880 }, { "epoch": 0.09491119634524711, "grad_norm": 23.259803771972656, "learning_rate": 3.795377925440953e-07, "loss": 0.0596, "num_input_tokens_seen": 2628736, "step": 3885 }, { "epoch": 0.09503334717709427, "grad_norm": 48.48746871948242, "learning_rate": 3.8002638393511506e-07, "loss": 0.0805, "num_input_tokens_seen": 2631744, "step": 3890 }, { "epoch": 0.09515549800894144, "grad_norm": 98.93216705322266, "learning_rate": 3.8051497532613474e-07, "loss": 0.1671, "num_input_tokens_seen": 2634688, "step": 3895 }, { "epoch": 0.09527764884078861, "grad_norm": 37.49694061279297, "learning_rate": 3.810035667171544e-07, "loss": 0.1827, "num_input_tokens_seen": 2637824, "step": 3900 }, { "epoch": 0.09539979967263577, "grad_norm": 84.37271881103516, "learning_rate": 3.814921581081741e-07, "loss": 0.1409, "num_input_tokens_seen": 2640768, "step": 3905 }, { "epoch": 0.09552195050448294, "grad_norm": 53.07040786743164, "learning_rate": 3.819807494991938e-07, "loss": 0.256, "num_input_tokens_seen": 2643648, "step": 3910 }, { "epoch": 0.0956441013363301, "grad_norm": 3.5294644832611084, "learning_rate": 3.8246934089021347e-07, "loss": 0.155, "num_input_tokens_seen": 2647296, "step": 3915 }, { "epoch": 0.09576625216817726, "grad_norm": 43.76532745361328, "learning_rate": 3.829579322812332e-07, "loss": 0.1863, "num_input_tokens_seen": 2650368, "step": 3920 }, { "epoch": 0.09588840300002444, "grad_norm": 19.401376724243164, "learning_rate": 3.834465236722529e-07, "loss": 0.1506, "num_input_tokens_seen": 2653696, "step": 3925 }, { "epoch": 0.0960105538318716, "grad_norm": 53.53800964355469, "learning_rate": 3.8393511506327256e-07, "loss": 0.1153, "num_input_tokens_seen": 2656896, "step": 3930 }, { "epoch": 0.09613270466371876, "grad_norm": 19.80342674255371, "learning_rate": 3.844237064542923e-07, "loss": 0.0772, "num_input_tokens_seen": 2660160, "step": 3935 }, { "epoch": 0.09625485549556592, "grad_norm": 58.60802459716797, "learning_rate": 3.8491229784531193e-07, "loss": 0.1788, "num_input_tokens_seen": 2663232, "step": 3940 }, { "epoch": 0.09637700632741308, "grad_norm": 31.286151885986328, "learning_rate": 3.854008892363316e-07, "loss": 0.0492, "num_input_tokens_seen": 2666432, "step": 3945 }, { "epoch": 0.09649915715926026, "grad_norm": 32.254730224609375, "learning_rate": 3.8588948062735134e-07, "loss": 0.2631, "num_input_tokens_seen": 2669504, "step": 3950 }, { "epoch": 0.09662130799110742, "grad_norm": 24.599687576293945, "learning_rate": 3.86378072018371e-07, "loss": 0.0781, "num_input_tokens_seen": 2672896, "step": 3955 }, { "epoch": 0.09674345882295458, "grad_norm": 19.06269645690918, "learning_rate": 3.868666634093907e-07, "loss": 0.0871, "num_input_tokens_seen": 2676096, "step": 3960 }, { "epoch": 0.09686560965480175, "grad_norm": 24.946197509765625, "learning_rate": 3.8735525480041044e-07, "loss": 0.0841, "num_input_tokens_seen": 2679168, "step": 3965 }, { "epoch": 0.09698776048664891, "grad_norm": 118.59291076660156, "learning_rate": 3.8784384619143007e-07, "loss": 0.18, "num_input_tokens_seen": 2682240, "step": 3970 }, { "epoch": 0.09710991131849608, "grad_norm": 52.05529022216797, "learning_rate": 3.8833243758244975e-07, "loss": 0.2067, "num_input_tokens_seen": 2686016, "step": 3975 }, { "epoch": 0.09723206215034325, "grad_norm": 41.86702346801758, "learning_rate": 3.888210289734695e-07, "loss": 0.0556, "num_input_tokens_seen": 2689088, "step": 3980 }, { "epoch": 0.09735421298219041, "grad_norm": 66.6626205444336, "learning_rate": 3.8930962036448916e-07, "loss": 0.215, "num_input_tokens_seen": 2692416, "step": 3985 }, { "epoch": 0.09747636381403757, "grad_norm": 14.627568244934082, "learning_rate": 3.8979821175550885e-07, "loss": 0.1439, "num_input_tokens_seen": 2695744, "step": 3990 }, { "epoch": 0.09759851464588473, "grad_norm": 1.7436994314193726, "learning_rate": 3.902868031465286e-07, "loss": 0.1125, "num_input_tokens_seen": 2698880, "step": 3995 }, { "epoch": 0.09772066547773191, "grad_norm": 17.24464988708496, "learning_rate": 3.907753945375482e-07, "loss": 0.1243, "num_input_tokens_seen": 2703040, "step": 4000 }, { "epoch": 0.09784281630957907, "grad_norm": 4.9105682373046875, "learning_rate": 3.912639859285679e-07, "loss": 0.137, "num_input_tokens_seen": 2706816, "step": 4005 }, { "epoch": 0.09796496714142623, "grad_norm": 44.11586380004883, "learning_rate": 3.917525773195876e-07, "loss": 0.1429, "num_input_tokens_seen": 2710336, "step": 4010 }, { "epoch": 0.0980871179732734, "grad_norm": 1.7405118942260742, "learning_rate": 3.922411687106073e-07, "loss": 0.0883, "num_input_tokens_seen": 2713344, "step": 4015 }, { "epoch": 0.09820926880512057, "grad_norm": 2.1863999366760254, "learning_rate": 3.92729760101627e-07, "loss": 0.1977, "num_input_tokens_seen": 2716736, "step": 4020 }, { "epoch": 0.09833141963696773, "grad_norm": 65.2334976196289, "learning_rate": 3.932183514926467e-07, "loss": 0.2043, "num_input_tokens_seen": 2720640, "step": 4025 }, { "epoch": 0.0984535704688149, "grad_norm": 88.74542236328125, "learning_rate": 3.9370694288366635e-07, "loss": 0.257, "num_input_tokens_seen": 2723840, "step": 4030 }, { "epoch": 0.09857572130066206, "grad_norm": 9.849947929382324, "learning_rate": 3.9419553427468603e-07, "loss": 0.094, "num_input_tokens_seen": 2726976, "step": 4035 }, { "epoch": 0.09869787213250922, "grad_norm": 69.36286163330078, "learning_rate": 3.9468412566570576e-07, "loss": 0.2761, "num_input_tokens_seen": 2730368, "step": 4040 }, { "epoch": 0.0988200229643564, "grad_norm": 28.743057250976562, "learning_rate": 3.9517271705672545e-07, "loss": 0.1444, "num_input_tokens_seen": 2733504, "step": 4045 }, { "epoch": 0.09894217379620356, "grad_norm": 25.775449752807617, "learning_rate": 3.9566130844774513e-07, "loss": 0.1586, "num_input_tokens_seen": 2736704, "step": 4050 }, { "epoch": 0.09906432462805072, "grad_norm": 16.260456085205078, "learning_rate": 3.9614989983876486e-07, "loss": 0.0806, "num_input_tokens_seen": 2739904, "step": 4055 }, { "epoch": 0.09918647545989788, "grad_norm": 29.549999237060547, "learning_rate": 3.966384912297845e-07, "loss": 0.0966, "num_input_tokens_seen": 2743424, "step": 4060 }, { "epoch": 0.09930862629174504, "grad_norm": 43.79568099975586, "learning_rate": 3.9712708262080417e-07, "loss": 0.0883, "num_input_tokens_seen": 2746560, "step": 4065 }, { "epoch": 0.09943077712359222, "grad_norm": 31.825456619262695, "learning_rate": 3.976156740118239e-07, "loss": 0.2489, "num_input_tokens_seen": 2750528, "step": 4070 }, { "epoch": 0.09955292795543938, "grad_norm": 29.87146759033203, "learning_rate": 3.981042654028436e-07, "loss": 0.1553, "num_input_tokens_seen": 2754048, "step": 4075 }, { "epoch": 0.09967507878728654, "grad_norm": 104.86454010009766, "learning_rate": 3.9859285679386327e-07, "loss": 0.2288, "num_input_tokens_seen": 2757632, "step": 4080 }, { "epoch": 0.0997972296191337, "grad_norm": 39.14055252075195, "learning_rate": 3.99081448184883e-07, "loss": 0.1119, "num_input_tokens_seen": 2761408, "step": 4085 }, { "epoch": 0.09991938045098087, "grad_norm": 58.73264694213867, "learning_rate": 3.9957003957590263e-07, "loss": 0.1223, "num_input_tokens_seen": 2764608, "step": 4090 }, { "epoch": 0.10004153128282804, "grad_norm": 29.07638931274414, "learning_rate": 4.000586309669223e-07, "loss": 0.1659, "num_input_tokens_seen": 2767616, "step": 4095 }, { "epoch": 0.1001636821146752, "grad_norm": 4.408840179443359, "learning_rate": 4.0054722235794205e-07, "loss": 0.0863, "num_input_tokens_seen": 2770688, "step": 4100 }, { "epoch": 0.10028583294652237, "grad_norm": 36.848358154296875, "learning_rate": 4.0103581374896173e-07, "loss": 0.1221, "num_input_tokens_seen": 2773824, "step": 4105 }, { "epoch": 0.10040798377836953, "grad_norm": 5.810712814331055, "learning_rate": 4.015244051399814e-07, "loss": 0.0746, "num_input_tokens_seen": 2777344, "step": 4110 }, { "epoch": 0.10053013461021669, "grad_norm": 1.0392895936965942, "learning_rate": 4.0201299653100114e-07, "loss": 0.0951, "num_input_tokens_seen": 2781632, "step": 4115 }, { "epoch": 0.10065228544206387, "grad_norm": 50.974159240722656, "learning_rate": 4.0250158792202077e-07, "loss": 0.2181, "num_input_tokens_seen": 2785024, "step": 4120 }, { "epoch": 0.10077443627391103, "grad_norm": 22.851579666137695, "learning_rate": 4.0299017931304045e-07, "loss": 0.2985, "num_input_tokens_seen": 2788416, "step": 4125 }, { "epoch": 0.10089658710575819, "grad_norm": 43.48515319824219, "learning_rate": 4.034787707040602e-07, "loss": 0.1828, "num_input_tokens_seen": 2791872, "step": 4130 }, { "epoch": 0.10101873793760535, "grad_norm": 15.764205932617188, "learning_rate": 4.0396736209507987e-07, "loss": 0.0847, "num_input_tokens_seen": 2795200, "step": 4135 }, { "epoch": 0.10114088876945251, "grad_norm": 52.164432525634766, "learning_rate": 4.0445595348609955e-07, "loss": 0.1232, "num_input_tokens_seen": 2798784, "step": 4140 }, { "epoch": 0.10126303960129969, "grad_norm": 5.731689929962158, "learning_rate": 4.049445448771193e-07, "loss": 0.1623, "num_input_tokens_seen": 2802432, "step": 4145 }, { "epoch": 0.10138519043314685, "grad_norm": 60.42582702636719, "learning_rate": 4.0543313626813897e-07, "loss": 0.1074, "num_input_tokens_seen": 2806144, "step": 4150 }, { "epoch": 0.10150734126499401, "grad_norm": 58.094215393066406, "learning_rate": 4.059217276591586e-07, "loss": 0.116, "num_input_tokens_seen": 2809472, "step": 4155 }, { "epoch": 0.10162949209684118, "grad_norm": 41.62969970703125, "learning_rate": 4.0641031905017833e-07, "loss": 0.1349, "num_input_tokens_seen": 2813632, "step": 4160 }, { "epoch": 0.10175164292868834, "grad_norm": 21.467771530151367, "learning_rate": 4.06898910441198e-07, "loss": 0.1785, "num_input_tokens_seen": 2816576, "step": 4165 }, { "epoch": 0.10187379376053551, "grad_norm": 38.030250549316406, "learning_rate": 4.073875018322177e-07, "loss": 0.1915, "num_input_tokens_seen": 2819904, "step": 4170 }, { "epoch": 0.10199594459238268, "grad_norm": 46.13201904296875, "learning_rate": 4.078760932232374e-07, "loss": 0.09, "num_input_tokens_seen": 2823360, "step": 4175 }, { "epoch": 0.10211809542422984, "grad_norm": 36.75332260131836, "learning_rate": 4.083646846142571e-07, "loss": 0.1443, "num_input_tokens_seen": 2826752, "step": 4180 }, { "epoch": 0.102240246256077, "grad_norm": 54.063507080078125, "learning_rate": 4.0885327600527673e-07, "loss": 0.1226, "num_input_tokens_seen": 2830016, "step": 4185 }, { "epoch": 0.10236239708792418, "grad_norm": 31.851612091064453, "learning_rate": 4.0934186739629647e-07, "loss": 0.1663, "num_input_tokens_seen": 2833408, "step": 4190 }, { "epoch": 0.10248454791977134, "grad_norm": 50.87443923950195, "learning_rate": 4.0983045878731615e-07, "loss": 0.1124, "num_input_tokens_seen": 2836480, "step": 4195 }, { "epoch": 0.1026066987516185, "grad_norm": 26.74880599975586, "learning_rate": 4.1031905017833583e-07, "loss": 0.1346, "num_input_tokens_seen": 2839744, "step": 4200 }, { "epoch": 0.10272884958346566, "grad_norm": 35.637325286865234, "learning_rate": 4.1080764156935557e-07, "loss": 0.1257, "num_input_tokens_seen": 2842880, "step": 4205 }, { "epoch": 0.10285100041531282, "grad_norm": 8.508408546447754, "learning_rate": 4.1129623296037525e-07, "loss": 0.0673, "num_input_tokens_seen": 2846272, "step": 4210 }, { "epoch": 0.10297315124716, "grad_norm": 10.577549934387207, "learning_rate": 4.117848243513949e-07, "loss": 0.0861, "num_input_tokens_seen": 2849536, "step": 4215 }, { "epoch": 0.10309530207900716, "grad_norm": 5.944873809814453, "learning_rate": 4.122734157424146e-07, "loss": 0.1275, "num_input_tokens_seen": 2852800, "step": 4220 }, { "epoch": 0.10321745291085432, "grad_norm": 25.301698684692383, "learning_rate": 4.127620071334343e-07, "loss": 0.1408, "num_input_tokens_seen": 2856000, "step": 4225 }, { "epoch": 0.10333960374270149, "grad_norm": 56.8901481628418, "learning_rate": 4.1325059852445397e-07, "loss": 0.1893, "num_input_tokens_seen": 2858944, "step": 4230 }, { "epoch": 0.10346175457454865, "grad_norm": 38.725032806396484, "learning_rate": 4.137391899154737e-07, "loss": 0.1007, "num_input_tokens_seen": 2862208, "step": 4235 }, { "epoch": 0.10358390540639582, "grad_norm": 27.819990158081055, "learning_rate": 4.142277813064934e-07, "loss": 0.0926, "num_input_tokens_seen": 2865408, "step": 4240 }, { "epoch": 0.10370605623824299, "grad_norm": 18.120283126831055, "learning_rate": 4.14716372697513e-07, "loss": 0.1382, "num_input_tokens_seen": 2869120, "step": 4245 }, { "epoch": 0.10382820707009015, "grad_norm": 13.521397590637207, "learning_rate": 4.1520496408853275e-07, "loss": 0.0831, "num_input_tokens_seen": 2872448, "step": 4250 }, { "epoch": 0.10395035790193731, "grad_norm": 33.140411376953125, "learning_rate": 4.1569355547955243e-07, "loss": 0.2276, "num_input_tokens_seen": 2875776, "step": 4255 }, { "epoch": 0.10407250873378447, "grad_norm": 39.06216049194336, "learning_rate": 4.161821468705721e-07, "loss": 0.1392, "num_input_tokens_seen": 2878720, "step": 4260 }, { "epoch": 0.10419465956563165, "grad_norm": 39.4327507019043, "learning_rate": 4.1667073826159185e-07, "loss": 0.154, "num_input_tokens_seen": 2882432, "step": 4265 }, { "epoch": 0.10431681039747881, "grad_norm": 18.142154693603516, "learning_rate": 4.1715932965261153e-07, "loss": 0.1137, "num_input_tokens_seen": 2885824, "step": 4270 }, { "epoch": 0.10443896122932597, "grad_norm": 33.220359802246094, "learning_rate": 4.1764792104363116e-07, "loss": 0.15, "num_input_tokens_seen": 2889408, "step": 4275 }, { "epoch": 0.10456111206117313, "grad_norm": 48.25017166137695, "learning_rate": 4.181365124346509e-07, "loss": 0.221, "num_input_tokens_seen": 2892416, "step": 4280 }, { "epoch": 0.1046832628930203, "grad_norm": 38.912113189697266, "learning_rate": 4.1862510382567057e-07, "loss": 0.1015, "num_input_tokens_seen": 2895552, "step": 4285 }, { "epoch": 0.10480541372486747, "grad_norm": 24.962831497192383, "learning_rate": 4.1911369521669025e-07, "loss": 0.1397, "num_input_tokens_seen": 2899264, "step": 4290 }, { "epoch": 0.10492756455671463, "grad_norm": 14.322883605957031, "learning_rate": 4.1960228660771e-07, "loss": 0.0841, "num_input_tokens_seen": 2902464, "step": 4295 }, { "epoch": 0.1050497153885618, "grad_norm": 26.260435104370117, "learning_rate": 4.2009087799872967e-07, "loss": 0.1638, "num_input_tokens_seen": 2906304, "step": 4300 }, { "epoch": 0.10517186622040896, "grad_norm": 11.948546409606934, "learning_rate": 4.205794693897493e-07, "loss": 0.1394, "num_input_tokens_seen": 2909632, "step": 4305 }, { "epoch": 0.10529401705225612, "grad_norm": 34.74366760253906, "learning_rate": 4.2106806078076903e-07, "loss": 0.0915, "num_input_tokens_seen": 2913280, "step": 4310 }, { "epoch": 0.1054161678841033, "grad_norm": 24.318824768066406, "learning_rate": 4.215566521717887e-07, "loss": 0.1507, "num_input_tokens_seen": 2916672, "step": 4315 }, { "epoch": 0.10553831871595046, "grad_norm": 20.615177154541016, "learning_rate": 4.220452435628084e-07, "loss": 0.1375, "num_input_tokens_seen": 2921472, "step": 4320 }, { "epoch": 0.10566046954779762, "grad_norm": 55.968509674072266, "learning_rate": 4.2253383495382813e-07, "loss": 0.1117, "num_input_tokens_seen": 2924736, "step": 4325 }, { "epoch": 0.10578262037964478, "grad_norm": 63.45806884765625, "learning_rate": 4.230224263448478e-07, "loss": 0.1468, "num_input_tokens_seen": 2927936, "step": 4330 }, { "epoch": 0.10590477121149194, "grad_norm": 32.341922760009766, "learning_rate": 4.2351101773586744e-07, "loss": 0.1953, "num_input_tokens_seen": 2931456, "step": 4335 }, { "epoch": 0.10602692204333912, "grad_norm": 38.54914093017578, "learning_rate": 4.2399960912688717e-07, "loss": 0.1076, "num_input_tokens_seen": 2934656, "step": 4340 }, { "epoch": 0.10614907287518628, "grad_norm": 31.62356948852539, "learning_rate": 4.2448820051790685e-07, "loss": 0.145, "num_input_tokens_seen": 2937728, "step": 4345 }, { "epoch": 0.10627122370703344, "grad_norm": 69.52967834472656, "learning_rate": 4.2497679190892654e-07, "loss": 0.1833, "num_input_tokens_seen": 2940992, "step": 4350 }, { "epoch": 0.1063933745388806, "grad_norm": 35.780120849609375, "learning_rate": 4.2546538329994627e-07, "loss": 0.0839, "num_input_tokens_seen": 2944640, "step": 4355 }, { "epoch": 0.10651552537072777, "grad_norm": 19.068248748779297, "learning_rate": 4.2595397469096595e-07, "loss": 0.0561, "num_input_tokens_seen": 2948224, "step": 4360 }, { "epoch": 0.10663767620257494, "grad_norm": 53.031036376953125, "learning_rate": 4.264425660819856e-07, "loss": 0.1876, "num_input_tokens_seen": 2952320, "step": 4365 }, { "epoch": 0.1067598270344221, "grad_norm": 15.228533744812012, "learning_rate": 4.269311574730053e-07, "loss": 0.0542, "num_input_tokens_seen": 2955456, "step": 4370 }, { "epoch": 0.10688197786626927, "grad_norm": 28.26727294921875, "learning_rate": 4.27419748864025e-07, "loss": 0.1754, "num_input_tokens_seen": 2958848, "step": 4375 }, { "epoch": 0.10700412869811643, "grad_norm": 6.667966842651367, "learning_rate": 4.279083402550447e-07, "loss": 0.0646, "num_input_tokens_seen": 2962816, "step": 4380 }, { "epoch": 0.1071262795299636, "grad_norm": 25.915861129760742, "learning_rate": 4.283969316460644e-07, "loss": 0.133, "num_input_tokens_seen": 2965888, "step": 4385 }, { "epoch": 0.10724843036181077, "grad_norm": 37.18895721435547, "learning_rate": 4.288855230370841e-07, "loss": 0.1174, "num_input_tokens_seen": 2969600, "step": 4390 }, { "epoch": 0.10737058119365793, "grad_norm": 47.06808090209961, "learning_rate": 4.2937411442810377e-07, "loss": 0.051, "num_input_tokens_seen": 2972864, "step": 4395 }, { "epoch": 0.10749273202550509, "grad_norm": 5.6044921875, "learning_rate": 4.2986270581912345e-07, "loss": 0.198, "num_input_tokens_seen": 2976192, "step": 4400 }, { "epoch": 0.10761488285735225, "grad_norm": 43.25118637084961, "learning_rate": 4.3035129721014314e-07, "loss": 0.2707, "num_input_tokens_seen": 2979840, "step": 4405 }, { "epoch": 0.10773703368919943, "grad_norm": 47.26470947265625, "learning_rate": 4.308398886011628e-07, "loss": 0.149, "num_input_tokens_seen": 2983168, "step": 4410 }, { "epoch": 0.10785918452104659, "grad_norm": 19.98095703125, "learning_rate": 4.3132847999218255e-07, "loss": 0.0545, "num_input_tokens_seen": 2987072, "step": 4415 }, { "epoch": 0.10798133535289375, "grad_norm": 32.365257263183594, "learning_rate": 4.3181707138320223e-07, "loss": 0.129, "num_input_tokens_seen": 2990144, "step": 4420 }, { "epoch": 0.10810348618474092, "grad_norm": 47.49005889892578, "learning_rate": 4.323056627742219e-07, "loss": 0.2385, "num_input_tokens_seen": 2994176, "step": 4425 }, { "epoch": 0.10822563701658808, "grad_norm": 47.387611389160156, "learning_rate": 4.3279425416524154e-07, "loss": 0.125, "num_input_tokens_seen": 2997248, "step": 4430 }, { "epoch": 0.10834778784843525, "grad_norm": 42.15059280395508, "learning_rate": 4.332828455562613e-07, "loss": 0.2284, "num_input_tokens_seen": 3000832, "step": 4435 }, { "epoch": 0.10846993868028242, "grad_norm": 39.20221710205078, "learning_rate": 4.3377143694728096e-07, "loss": 0.2434, "num_input_tokens_seen": 3004096, "step": 4440 }, { "epoch": 0.10859208951212958, "grad_norm": 21.04318618774414, "learning_rate": 4.342600283383007e-07, "loss": 0.2088, "num_input_tokens_seen": 3007296, "step": 4445 }, { "epoch": 0.10871424034397674, "grad_norm": 26.36186408996582, "learning_rate": 4.3474861972932037e-07, "loss": 0.1412, "num_input_tokens_seen": 3010304, "step": 4450 }, { "epoch": 0.1088363911758239, "grad_norm": 52.20693588256836, "learning_rate": 4.3523721112034005e-07, "loss": 0.2006, "num_input_tokens_seen": 3013696, "step": 4455 }, { "epoch": 0.10895854200767108, "grad_norm": 17.572002410888672, "learning_rate": 4.357258025113597e-07, "loss": 0.1489, "num_input_tokens_seen": 3016704, "step": 4460 }, { "epoch": 0.10908069283951824, "grad_norm": 40.26177215576172, "learning_rate": 4.362143939023794e-07, "loss": 0.1612, "num_input_tokens_seen": 3020032, "step": 4465 }, { "epoch": 0.1092028436713654, "grad_norm": 51.259647369384766, "learning_rate": 4.367029852933991e-07, "loss": 0.1526, "num_input_tokens_seen": 3023232, "step": 4470 }, { "epoch": 0.10932499450321256, "grad_norm": 32.641841888427734, "learning_rate": 4.371915766844188e-07, "loss": 0.1318, "num_input_tokens_seen": 3026432, "step": 4475 }, { "epoch": 0.10944714533505973, "grad_norm": 6.420051574707031, "learning_rate": 4.376801680754385e-07, "loss": 0.0994, "num_input_tokens_seen": 3029568, "step": 4480 }, { "epoch": 0.1095692961669069, "grad_norm": 24.219253540039062, "learning_rate": 4.381687594664582e-07, "loss": 0.166, "num_input_tokens_seen": 3032576, "step": 4485 }, { "epoch": 0.10969144699875406, "grad_norm": 13.019012451171875, "learning_rate": 4.386573508574778e-07, "loss": 0.0961, "num_input_tokens_seen": 3035904, "step": 4490 }, { "epoch": 0.10981359783060123, "grad_norm": 88.06275939941406, "learning_rate": 4.3914594224849756e-07, "loss": 0.1549, "num_input_tokens_seen": 3040064, "step": 4495 }, { "epoch": 0.10993574866244839, "grad_norm": 52.31646728515625, "learning_rate": 4.3963453363951724e-07, "loss": 0.1122, "num_input_tokens_seen": 3043648, "step": 4500 }, { "epoch": 0.11005789949429555, "grad_norm": 15.511794090270996, "learning_rate": 4.401231250305369e-07, "loss": 0.1506, "num_input_tokens_seen": 3046656, "step": 4505 }, { "epoch": 0.11018005032614273, "grad_norm": 9.491758346557617, "learning_rate": 4.4061171642155665e-07, "loss": 0.1529, "num_input_tokens_seen": 3050304, "step": 4510 }, { "epoch": 0.11030220115798989, "grad_norm": 28.71082878112793, "learning_rate": 4.4110030781257634e-07, "loss": 0.1095, "num_input_tokens_seen": 3053952, "step": 4515 }, { "epoch": 0.11042435198983705, "grad_norm": 61.650177001953125, "learning_rate": 4.4158889920359596e-07, "loss": 0.2287, "num_input_tokens_seen": 3057152, "step": 4520 }, { "epoch": 0.11054650282168421, "grad_norm": 18.630380630493164, "learning_rate": 4.420774905946157e-07, "loss": 0.1766, "num_input_tokens_seen": 3060416, "step": 4525 }, { "epoch": 0.11066865365353137, "grad_norm": 23.90188980102539, "learning_rate": 4.425660819856354e-07, "loss": 0.145, "num_input_tokens_seen": 3063616, "step": 4530 }, { "epoch": 0.11079080448537855, "grad_norm": 56.79856491088867, "learning_rate": 4.4305467337665506e-07, "loss": 0.107, "num_input_tokens_seen": 3067072, "step": 4535 }, { "epoch": 0.11091295531722571, "grad_norm": 50.20392608642578, "learning_rate": 4.435432647676748e-07, "loss": 0.1432, "num_input_tokens_seen": 3070592, "step": 4540 }, { "epoch": 0.11103510614907287, "grad_norm": 5.514303684234619, "learning_rate": 4.440318561586945e-07, "loss": 0.1287, "num_input_tokens_seen": 3074176, "step": 4545 }, { "epoch": 0.11115725698092004, "grad_norm": 47.270328521728516, "learning_rate": 4.445204475497141e-07, "loss": 0.1766, "num_input_tokens_seen": 3077632, "step": 4550 }, { "epoch": 0.11127940781276721, "grad_norm": 9.4541015625, "learning_rate": 4.4500903894073384e-07, "loss": 0.1604, "num_input_tokens_seen": 3081088, "step": 4555 }, { "epoch": 0.11140155864461437, "grad_norm": 11.66859245300293, "learning_rate": 4.454976303317535e-07, "loss": 0.0403, "num_input_tokens_seen": 3084224, "step": 4560 }, { "epoch": 0.11152370947646154, "grad_norm": 55.1706428527832, "learning_rate": 4.459862217227732e-07, "loss": 0.1672, "num_input_tokens_seen": 3087424, "step": 4565 }, { "epoch": 0.1116458603083087, "grad_norm": 26.333547592163086, "learning_rate": 4.4647481311379294e-07, "loss": 0.2273, "num_input_tokens_seen": 3091008, "step": 4570 }, { "epoch": 0.11176801114015586, "grad_norm": 53.830989837646484, "learning_rate": 4.469634045048126e-07, "loss": 0.1293, "num_input_tokens_seen": 3094016, "step": 4575 }, { "epoch": 0.11189016197200304, "grad_norm": 48.61365509033203, "learning_rate": 4.4745199589583225e-07, "loss": 0.13, "num_input_tokens_seen": 3097536, "step": 4580 }, { "epoch": 0.1120123128038502, "grad_norm": 13.673266410827637, "learning_rate": 4.47940587286852e-07, "loss": 0.0926, "num_input_tokens_seen": 3100800, "step": 4585 }, { "epoch": 0.11213446363569736, "grad_norm": 11.077170372009277, "learning_rate": 4.4842917867787166e-07, "loss": 0.1141, "num_input_tokens_seen": 3104128, "step": 4590 }, { "epoch": 0.11225661446754452, "grad_norm": 23.086936950683594, "learning_rate": 4.4891777006889134e-07, "loss": 0.1559, "num_input_tokens_seen": 3107456, "step": 4595 }, { "epoch": 0.11237876529939168, "grad_norm": 12.579133033752441, "learning_rate": 4.494063614599111e-07, "loss": 0.1458, "num_input_tokens_seen": 3111296, "step": 4600 }, { "epoch": 0.11250091613123886, "grad_norm": 31.168546676635742, "learning_rate": 4.4989495285093076e-07, "loss": 0.1157, "num_input_tokens_seen": 3114752, "step": 4605 }, { "epoch": 0.11262306696308602, "grad_norm": 34.17367172241211, "learning_rate": 4.5038354424195044e-07, "loss": 0.1587, "num_input_tokens_seen": 3117824, "step": 4610 }, { "epoch": 0.11274521779493318, "grad_norm": 6.410637378692627, "learning_rate": 4.508721356329701e-07, "loss": 0.0831, "num_input_tokens_seen": 3121792, "step": 4615 }, { "epoch": 0.11286736862678035, "grad_norm": 40.198307037353516, "learning_rate": 4.513607270239898e-07, "loss": 0.0569, "num_input_tokens_seen": 3125248, "step": 4620 }, { "epoch": 0.11298951945862751, "grad_norm": 4.963204383850098, "learning_rate": 4.518493184150095e-07, "loss": 0.1616, "num_input_tokens_seen": 3128640, "step": 4625 }, { "epoch": 0.11311167029047468, "grad_norm": 42.212196350097656, "learning_rate": 4.523379098060292e-07, "loss": 0.1618, "num_input_tokens_seen": 3131776, "step": 4630 }, { "epoch": 0.11323382112232185, "grad_norm": 31.58185386657715, "learning_rate": 4.528265011970489e-07, "loss": 0.1112, "num_input_tokens_seen": 3135168, "step": 4635 }, { "epoch": 0.11335597195416901, "grad_norm": 57.309425354003906, "learning_rate": 4.533150925880686e-07, "loss": 0.247, "num_input_tokens_seen": 3138688, "step": 4640 }, { "epoch": 0.11347812278601617, "grad_norm": 41.127506256103516, "learning_rate": 4.5380368397908826e-07, "loss": 0.1139, "num_input_tokens_seen": 3141952, "step": 4645 }, { "epoch": 0.11360027361786333, "grad_norm": 61.593666076660156, "learning_rate": 4.5429227537010794e-07, "loss": 0.1915, "num_input_tokens_seen": 3145280, "step": 4650 }, { "epoch": 0.11372242444971051, "grad_norm": 26.210472106933594, "learning_rate": 4.547808667611276e-07, "loss": 0.1277, "num_input_tokens_seen": 3148736, "step": 4655 }, { "epoch": 0.11384457528155767, "grad_norm": 40.950233459472656, "learning_rate": 4.5526945815214736e-07, "loss": 0.121, "num_input_tokens_seen": 3152320, "step": 4660 }, { "epoch": 0.11396672611340483, "grad_norm": 29.210453033447266, "learning_rate": 4.5575804954316704e-07, "loss": 0.11, "num_input_tokens_seen": 3155904, "step": 4665 }, { "epoch": 0.114088876945252, "grad_norm": 19.651756286621094, "learning_rate": 4.562466409341867e-07, "loss": 0.1673, "num_input_tokens_seen": 3159488, "step": 4670 }, { "epoch": 0.11421102777709916, "grad_norm": 30.511878967285156, "learning_rate": 4.567352323252064e-07, "loss": 0.2023, "num_input_tokens_seen": 3162752, "step": 4675 }, { "epoch": 0.11433317860894633, "grad_norm": 27.229795455932617, "learning_rate": 4.572238237162261e-07, "loss": 0.134, "num_input_tokens_seen": 3165888, "step": 4680 }, { "epoch": 0.1144553294407935, "grad_norm": 20.034862518310547, "learning_rate": 4.5771241510724577e-07, "loss": 0.1225, "num_input_tokens_seen": 3169472, "step": 4685 }, { "epoch": 0.11457748027264066, "grad_norm": 21.58957862854004, "learning_rate": 4.582010064982655e-07, "loss": 0.1116, "num_input_tokens_seen": 3172608, "step": 4690 }, { "epoch": 0.11469963110448782, "grad_norm": 30.950048446655273, "learning_rate": 4.586895978892852e-07, "loss": 0.0563, "num_input_tokens_seen": 3175680, "step": 4695 }, { "epoch": 0.11482178193633498, "grad_norm": 37.59648895263672, "learning_rate": 4.5917818928030486e-07, "loss": 0.1314, "num_input_tokens_seen": 3178944, "step": 4700 }, { "epoch": 0.11494393276818216, "grad_norm": 54.260047912597656, "learning_rate": 4.5966678067132454e-07, "loss": 0.1484, "num_input_tokens_seen": 3182272, "step": 4705 }, { "epoch": 0.11506608360002932, "grad_norm": 35.50208282470703, "learning_rate": 4.601553720623442e-07, "loss": 0.2703, "num_input_tokens_seen": 3185536, "step": 4710 }, { "epoch": 0.11518823443187648, "grad_norm": 10.326123237609863, "learning_rate": 4.606439634533639e-07, "loss": 0.0611, "num_input_tokens_seen": 3188928, "step": 4715 }, { "epoch": 0.11531038526372364, "grad_norm": 22.59487533569336, "learning_rate": 4.6113255484438364e-07, "loss": 0.1684, "num_input_tokens_seen": 3192832, "step": 4720 }, { "epoch": 0.11543253609557082, "grad_norm": 33.71693801879883, "learning_rate": 4.616211462354033e-07, "loss": 0.1242, "num_input_tokens_seen": 3196416, "step": 4725 }, { "epoch": 0.11555468692741798, "grad_norm": 16.002641677856445, "learning_rate": 4.62109737626423e-07, "loss": 0.0917, "num_input_tokens_seen": 3200000, "step": 4730 }, { "epoch": 0.11567683775926514, "grad_norm": 7.192635536193848, "learning_rate": 4.625983290174427e-07, "loss": 0.0778, "num_input_tokens_seen": 3203456, "step": 4735 }, { "epoch": 0.1157989885911123, "grad_norm": 10.36010456085205, "learning_rate": 4.6308692040846237e-07, "loss": 0.1563, "num_input_tokens_seen": 3206720, "step": 4740 }, { "epoch": 0.11592113942295947, "grad_norm": 26.770761489868164, "learning_rate": 4.6357551179948205e-07, "loss": 0.1827, "num_input_tokens_seen": 3210176, "step": 4745 }, { "epoch": 0.11604329025480664, "grad_norm": 45.02693176269531, "learning_rate": 4.640641031905018e-07, "loss": 0.1546, "num_input_tokens_seen": 3213056, "step": 4750 }, { "epoch": 0.1161654410866538, "grad_norm": 23.50362205505371, "learning_rate": 4.6455269458152146e-07, "loss": 0.206, "num_input_tokens_seen": 3216064, "step": 4755 }, { "epoch": 0.11628759191850097, "grad_norm": 5.375946521759033, "learning_rate": 4.6504128597254114e-07, "loss": 0.075, "num_input_tokens_seen": 3219264, "step": 4760 }, { "epoch": 0.11640974275034813, "grad_norm": 6.029793739318848, "learning_rate": 4.655298773635608e-07, "loss": 0.0856, "num_input_tokens_seen": 3222784, "step": 4765 }, { "epoch": 0.11653189358219529, "grad_norm": 43.54521179199219, "learning_rate": 4.660184687545805e-07, "loss": 0.2882, "num_input_tokens_seen": 3225856, "step": 4770 }, { "epoch": 0.11665404441404247, "grad_norm": 49.9635124206543, "learning_rate": 4.665070601456002e-07, "loss": 0.1822, "num_input_tokens_seen": 3229120, "step": 4775 }, { "epoch": 0.11677619524588963, "grad_norm": 31.54857063293457, "learning_rate": 4.669956515366199e-07, "loss": 0.0688, "num_input_tokens_seen": 3233024, "step": 4780 }, { "epoch": 0.11689834607773679, "grad_norm": 39.51899337768555, "learning_rate": 4.674842429276396e-07, "loss": 0.0929, "num_input_tokens_seen": 3236608, "step": 4785 }, { "epoch": 0.11702049690958395, "grad_norm": 32.86260223388672, "learning_rate": 4.679728343186593e-07, "loss": 0.0559, "num_input_tokens_seen": 3239808, "step": 4790 }, { "epoch": 0.11714264774143111, "grad_norm": 18.63022804260254, "learning_rate": 4.6846142570967897e-07, "loss": 0.1595, "num_input_tokens_seen": 3243072, "step": 4795 }, { "epoch": 0.11726479857327829, "grad_norm": 34.65690612792969, "learning_rate": 4.6895001710069865e-07, "loss": 0.1409, "num_input_tokens_seen": 3246208, "step": 4800 }, { "epoch": 0.11738694940512545, "grad_norm": 50.13557052612305, "learning_rate": 4.6943860849171833e-07, "loss": 0.1298, "num_input_tokens_seen": 3250496, "step": 4805 }, { "epoch": 0.11750910023697261, "grad_norm": 5.652262210845947, "learning_rate": 4.6992719988273806e-07, "loss": 0.1475, "num_input_tokens_seen": 3254208, "step": 4810 }, { "epoch": 0.11763125106881978, "grad_norm": 26.71537208557129, "learning_rate": 4.7041579127375774e-07, "loss": 0.0992, "num_input_tokens_seen": 3257600, "step": 4815 }, { "epoch": 0.11775340190066694, "grad_norm": 30.07550048828125, "learning_rate": 4.709043826647774e-07, "loss": 0.2301, "num_input_tokens_seen": 3260928, "step": 4820 }, { "epoch": 0.11787555273251411, "grad_norm": 20.74995994567871, "learning_rate": 4.713929740557971e-07, "loss": 0.0816, "num_input_tokens_seen": 3264192, "step": 4825 }, { "epoch": 0.11799770356436128, "grad_norm": 47.43042755126953, "learning_rate": 4.718815654468168e-07, "loss": 0.0875, "num_input_tokens_seen": 3267392, "step": 4830 }, { "epoch": 0.11811985439620844, "grad_norm": 1.182123064994812, "learning_rate": 4.7237015683783647e-07, "loss": 0.1291, "num_input_tokens_seen": 3270272, "step": 4835 }, { "epoch": 0.1182420052280556, "grad_norm": 59.24190902709961, "learning_rate": 4.728587482288562e-07, "loss": 0.1015, "num_input_tokens_seen": 3273408, "step": 4840 }, { "epoch": 0.11836415605990276, "grad_norm": 52.56169509887695, "learning_rate": 4.733473396198759e-07, "loss": 0.1972, "num_input_tokens_seen": 3276800, "step": 4845 }, { "epoch": 0.11848630689174994, "grad_norm": 27.176464080810547, "learning_rate": 4.7383593101089557e-07, "loss": 0.0876, "num_input_tokens_seen": 3279936, "step": 4850 }, { "epoch": 0.1186084577235971, "grad_norm": 32.131317138671875, "learning_rate": 4.743245224019153e-07, "loss": 0.1071, "num_input_tokens_seen": 3283456, "step": 4855 }, { "epoch": 0.11873060855544426, "grad_norm": 43.500160217285156, "learning_rate": 4.7481311379293493e-07, "loss": 0.16, "num_input_tokens_seen": 3286848, "step": 4860 }, { "epoch": 0.11885275938729142, "grad_norm": 58.2629280090332, "learning_rate": 4.753017051839546e-07, "loss": 0.1569, "num_input_tokens_seen": 3290432, "step": 4865 }, { "epoch": 0.11897491021913859, "grad_norm": 0.8482736945152283, "learning_rate": 4.7579029657497434e-07, "loss": 0.1321, "num_input_tokens_seen": 3293760, "step": 4870 }, { "epoch": 0.11909706105098576, "grad_norm": 28.063852310180664, "learning_rate": 4.76278887965994e-07, "loss": 0.1658, "num_input_tokens_seen": 3296768, "step": 4875 }, { "epoch": 0.11921921188283292, "grad_norm": 42.519065856933594, "learning_rate": 4.767674793570137e-07, "loss": 0.2226, "num_input_tokens_seen": 3299968, "step": 4880 }, { "epoch": 0.11934136271468009, "grad_norm": 17.598634719848633, "learning_rate": 4.772560707480334e-07, "loss": 0.1144, "num_input_tokens_seen": 3303488, "step": 4885 }, { "epoch": 0.11946351354652725, "grad_norm": 35.20301818847656, "learning_rate": 4.777446621390531e-07, "loss": 0.1006, "num_input_tokens_seen": 3306880, "step": 4890 }, { "epoch": 0.11958566437837442, "grad_norm": 4.097479343414307, "learning_rate": 4.782332535300728e-07, "loss": 0.1341, "num_input_tokens_seen": 3310336, "step": 4895 }, { "epoch": 0.11970781521022159, "grad_norm": 19.13021469116211, "learning_rate": 4.787218449210924e-07, "loss": 0.1649, "num_input_tokens_seen": 3313856, "step": 4900 }, { "epoch": 0.11982996604206875, "grad_norm": 16.483863830566406, "learning_rate": 4.792104363121121e-07, "loss": 0.1185, "num_input_tokens_seen": 3317120, "step": 4905 }, { "epoch": 0.11995211687391591, "grad_norm": 6.986123561859131, "learning_rate": 4.796990277031319e-07, "loss": 0.1056, "num_input_tokens_seen": 3320768, "step": 4910 }, { "epoch": 0.12007426770576307, "grad_norm": 28.12428092956543, "learning_rate": 4.801876190941516e-07, "loss": 0.0633, "num_input_tokens_seen": 3324288, "step": 4915 }, { "epoch": 0.12019641853761025, "grad_norm": 14.742039680480957, "learning_rate": 4.806762104851712e-07, "loss": 0.1208, "num_input_tokens_seen": 3327680, "step": 4920 }, { "epoch": 0.12031856936945741, "grad_norm": 8.952460289001465, "learning_rate": 4.811648018761909e-07, "loss": 0.1176, "num_input_tokens_seen": 3331008, "step": 4925 }, { "epoch": 0.12044072020130457, "grad_norm": 39.11952590942383, "learning_rate": 4.816533932672106e-07, "loss": 0.151, "num_input_tokens_seen": 3334272, "step": 4930 }, { "epoch": 0.12056287103315173, "grad_norm": 27.464441299438477, "learning_rate": 4.821419846582303e-07, "loss": 0.0791, "num_input_tokens_seen": 3337728, "step": 4935 }, { "epoch": 0.1206850218649989, "grad_norm": 37.72605514526367, "learning_rate": 4.8263057604925e-07, "loss": 0.1625, "num_input_tokens_seen": 3341056, "step": 4940 }, { "epoch": 0.12080717269684607, "grad_norm": 31.29315185546875, "learning_rate": 4.831191674402697e-07, "loss": 0.14, "num_input_tokens_seen": 3344704, "step": 4945 }, { "epoch": 0.12092932352869323, "grad_norm": 37.559452056884766, "learning_rate": 4.836077588312894e-07, "loss": 0.2221, "num_input_tokens_seen": 3347648, "step": 4950 }, { "epoch": 0.1210514743605404, "grad_norm": 31.414438247680664, "learning_rate": 4.84096350222309e-07, "loss": 0.0901, "num_input_tokens_seen": 3350976, "step": 4955 }, { "epoch": 0.12117362519238756, "grad_norm": 27.78361701965332, "learning_rate": 4.845849416133287e-07, "loss": 0.2851, "num_input_tokens_seen": 3354112, "step": 4960 }, { "epoch": 0.12129577602423472, "grad_norm": 24.971927642822266, "learning_rate": 4.850735330043484e-07, "loss": 0.1281, "num_input_tokens_seen": 3357696, "step": 4965 }, { "epoch": 0.1214179268560819, "grad_norm": 26.934574127197266, "learning_rate": 4.855621243953682e-07, "loss": 0.1526, "num_input_tokens_seen": 3361024, "step": 4970 }, { "epoch": 0.12154007768792906, "grad_norm": 56.78594207763672, "learning_rate": 4.860507157863879e-07, "loss": 0.2137, "num_input_tokens_seen": 3364032, "step": 4975 }, { "epoch": 0.12166222851977622, "grad_norm": 7.1227521896362305, "learning_rate": 4.865393071774074e-07, "loss": 0.1128, "num_input_tokens_seen": 3367744, "step": 4980 }, { "epoch": 0.12178437935162338, "grad_norm": 13.746490478515625, "learning_rate": 4.870278985684272e-07, "loss": 0.1315, "num_input_tokens_seen": 3370880, "step": 4985 }, { "epoch": 0.12190653018347054, "grad_norm": 19.91473388671875, "learning_rate": 4.875164899594469e-07, "loss": 0.0926, "num_input_tokens_seen": 3374720, "step": 4990 }, { "epoch": 0.12202868101531772, "grad_norm": 17.86402130126953, "learning_rate": 4.880050813504666e-07, "loss": 0.0964, "num_input_tokens_seen": 3377984, "step": 4995 }, { "epoch": 0.12215083184716488, "grad_norm": 17.522184371948242, "learning_rate": 4.884936727414863e-07, "loss": 0.0963, "num_input_tokens_seen": 3381376, "step": 5000 }, { "epoch": 0.12227298267901204, "grad_norm": 12.18857192993164, "learning_rate": 4.88982264132506e-07, "loss": 0.1253, "num_input_tokens_seen": 3384832, "step": 5005 }, { "epoch": 0.1223951335108592, "grad_norm": 54.867347717285156, "learning_rate": 4.894708555235256e-07, "loss": 0.1456, "num_input_tokens_seen": 3388480, "step": 5010 }, { "epoch": 0.12251728434270637, "grad_norm": 67.53438568115234, "learning_rate": 4.899594469145453e-07, "loss": 0.1403, "num_input_tokens_seen": 3391744, "step": 5015 }, { "epoch": 0.12263943517455354, "grad_norm": 3.713974952697754, "learning_rate": 4.90448038305565e-07, "loss": 0.1419, "num_input_tokens_seen": 3395136, "step": 5020 }, { "epoch": 0.1227615860064007, "grad_norm": 12.415149688720703, "learning_rate": 4.909366296965847e-07, "loss": 0.1163, "num_input_tokens_seen": 3398272, "step": 5025 }, { "epoch": 0.12288373683824787, "grad_norm": 14.186080932617188, "learning_rate": 4.914252210876045e-07, "loss": 0.1212, "num_input_tokens_seen": 3402112, "step": 5030 }, { "epoch": 0.12300588767009503, "grad_norm": 18.723878860473633, "learning_rate": 4.919138124786241e-07, "loss": 0.1536, "num_input_tokens_seen": 3405440, "step": 5035 }, { "epoch": 0.12312803850194219, "grad_norm": 7.663374423980713, "learning_rate": 4.924024038696437e-07, "loss": 0.0942, "num_input_tokens_seen": 3408384, "step": 5040 }, { "epoch": 0.12325018933378937, "grad_norm": 36.07602310180664, "learning_rate": 4.928909952606635e-07, "loss": 0.157, "num_input_tokens_seen": 3411968, "step": 5045 }, { "epoch": 0.12337234016563653, "grad_norm": 33.824764251708984, "learning_rate": 4.933795866516832e-07, "loss": 0.1502, "num_input_tokens_seen": 3415360, "step": 5050 }, { "epoch": 0.12349449099748369, "grad_norm": 36.74732971191406, "learning_rate": 4.938681780427029e-07, "loss": 0.1455, "num_input_tokens_seen": 3418560, "step": 5055 }, { "epoch": 0.12361664182933085, "grad_norm": 29.577823638916016, "learning_rate": 4.943567694337226e-07, "loss": 0.1418, "num_input_tokens_seen": 3421888, "step": 5060 }, { "epoch": 0.12373879266117803, "grad_norm": 17.719083786010742, "learning_rate": 4.948453608247422e-07, "loss": 0.1016, "num_input_tokens_seen": 3425536, "step": 5065 }, { "epoch": 0.12386094349302519, "grad_norm": 20.10811424255371, "learning_rate": 4.953339522157619e-07, "loss": 0.0608, "num_input_tokens_seen": 3429248, "step": 5070 }, { "epoch": 0.12398309432487235, "grad_norm": 17.528579711914062, "learning_rate": 4.958225436067816e-07, "loss": 0.1167, "num_input_tokens_seen": 3432512, "step": 5075 }, { "epoch": 0.12410524515671952, "grad_norm": 17.629924774169922, "learning_rate": 4.963111349978013e-07, "loss": 0.1865, "num_input_tokens_seen": 3435712, "step": 5080 }, { "epoch": 0.12422739598856668, "grad_norm": 23.14451789855957, "learning_rate": 4.96799726388821e-07, "loss": 0.0807, "num_input_tokens_seen": 3438848, "step": 5085 }, { "epoch": 0.12434954682041385, "grad_norm": 57.59390640258789, "learning_rate": 4.972883177798407e-07, "loss": 0.0983, "num_input_tokens_seen": 3442368, "step": 5090 }, { "epoch": 0.12447169765226102, "grad_norm": 6.212153911590576, "learning_rate": 4.977769091708604e-07, "loss": 0.0881, "num_input_tokens_seen": 3445760, "step": 5095 }, { "epoch": 0.12459384848410818, "grad_norm": 32.92991256713867, "learning_rate": 4.982655005618801e-07, "loss": 0.1125, "num_input_tokens_seen": 3448768, "step": 5100 }, { "epoch": 0.12471599931595534, "grad_norm": 43.08573532104492, "learning_rate": 4.987540919528998e-07, "loss": 0.1208, "num_input_tokens_seen": 3451968, "step": 5105 }, { "epoch": 0.1248381501478025, "grad_norm": 62.34801483154297, "learning_rate": 4.992426833439195e-07, "loss": 0.1621, "num_input_tokens_seen": 3455680, "step": 5110 }, { "epoch": 0.12496030097964968, "grad_norm": 2.1716251373291016, "learning_rate": 4.997312747349392e-07, "loss": 0.0785, "num_input_tokens_seen": 3458816, "step": 5115 }, { "epoch": 0.12508245181149683, "grad_norm": 41.48759841918945, "learning_rate": 5.002198661259588e-07, "loss": 0.2852, "num_input_tokens_seen": 3461824, "step": 5120 }, { "epoch": 0.125204602643344, "grad_norm": 48.85375213623047, "learning_rate": 5.007084575169785e-07, "loss": 0.1053, "num_input_tokens_seen": 3466304, "step": 5125 }, { "epoch": 0.12532675347519118, "grad_norm": 5.803524017333984, "learning_rate": 5.011970489079982e-07, "loss": 0.1095, "num_input_tokens_seen": 3469696, "step": 5130 }, { "epoch": 0.12544890430703834, "grad_norm": 31.555641174316406, "learning_rate": 5.01685640299018e-07, "loss": 0.1306, "num_input_tokens_seen": 3473216, "step": 5135 }, { "epoch": 0.1255710551388855, "grad_norm": 49.93914031982422, "learning_rate": 5.021742316900376e-07, "loss": 0.113, "num_input_tokens_seen": 3476352, "step": 5140 }, { "epoch": 0.12569320597073266, "grad_norm": 3.528350353240967, "learning_rate": 5.026628230810573e-07, "loss": 0.0987, "num_input_tokens_seen": 3479680, "step": 5145 }, { "epoch": 0.12581535680257983, "grad_norm": 18.18247413635254, "learning_rate": 5.03151414472077e-07, "loss": 0.0808, "num_input_tokens_seen": 3483392, "step": 5150 }, { "epoch": 0.125937507634427, "grad_norm": 23.490751266479492, "learning_rate": 5.036400058630966e-07, "loss": 0.0885, "num_input_tokens_seen": 3486720, "step": 5155 }, { "epoch": 0.12605965846627415, "grad_norm": 34.18044662475586, "learning_rate": 5.041285972541164e-07, "loss": 0.0894, "num_input_tokens_seen": 3489920, "step": 5160 }, { "epoch": 0.1261818092981213, "grad_norm": 62.92462158203125, "learning_rate": 5.046171886451361e-07, "loss": 0.1762, "num_input_tokens_seen": 3493248, "step": 5165 }, { "epoch": 0.12630396012996847, "grad_norm": 44.83457946777344, "learning_rate": 5.051057800361558e-07, "loss": 0.0735, "num_input_tokens_seen": 3496640, "step": 5170 }, { "epoch": 0.12642611096181566, "grad_norm": 22.707056045532227, "learning_rate": 5.055943714271754e-07, "loss": 0.1073, "num_input_tokens_seen": 3499648, "step": 5175 }, { "epoch": 0.12654826179366283, "grad_norm": 33.12401580810547, "learning_rate": 5.060829628181951e-07, "loss": 0.0827, "num_input_tokens_seen": 3502592, "step": 5180 }, { "epoch": 0.12667041262551, "grad_norm": 33.55727005004883, "learning_rate": 5.065715542092148e-07, "loss": 0.1814, "num_input_tokens_seen": 3505984, "step": 5185 }, { "epoch": 0.12679256345735715, "grad_norm": 57.06978225708008, "learning_rate": 5.070601456002345e-07, "loss": 0.2021, "num_input_tokens_seen": 3509120, "step": 5190 }, { "epoch": 0.1269147142892043, "grad_norm": 3.5166196823120117, "learning_rate": 5.075487369912543e-07, "loss": 0.1381, "num_input_tokens_seen": 3512192, "step": 5195 }, { "epoch": 0.12703686512105147, "grad_norm": 19.803998947143555, "learning_rate": 5.080373283822738e-07, "loss": 0.1622, "num_input_tokens_seen": 3515200, "step": 5200 }, { "epoch": 0.12715901595289864, "grad_norm": 25.440887451171875, "learning_rate": 5.085259197732936e-07, "loss": 0.1644, "num_input_tokens_seen": 3518272, "step": 5205 }, { "epoch": 0.1272811667847458, "grad_norm": 24.9045352935791, "learning_rate": 5.090145111643133e-07, "loss": 0.0878, "num_input_tokens_seen": 3521600, "step": 5210 }, { "epoch": 0.12740331761659296, "grad_norm": 49.746337890625, "learning_rate": 5.095031025553329e-07, "loss": 0.1429, "num_input_tokens_seen": 3524544, "step": 5215 }, { "epoch": 0.12752546844844012, "grad_norm": 12.738018035888672, "learning_rate": 5.099916939463527e-07, "loss": 0.1227, "num_input_tokens_seen": 3528000, "step": 5220 }, { "epoch": 0.1276476192802873, "grad_norm": 19.108055114746094, "learning_rate": 5.104802853373724e-07, "loss": 0.1693, "num_input_tokens_seen": 3531072, "step": 5225 }, { "epoch": 0.12776977011213447, "grad_norm": 28.770009994506836, "learning_rate": 5.10968876728392e-07, "loss": 0.1392, "num_input_tokens_seen": 3534912, "step": 5230 }, { "epoch": 0.12789192094398164, "grad_norm": 61.38738250732422, "learning_rate": 5.114574681194117e-07, "loss": 0.1636, "num_input_tokens_seen": 3538048, "step": 5235 }, { "epoch": 0.1280140717758288, "grad_norm": 13.256145477294922, "learning_rate": 5.119460595104314e-07, "loss": 0.1147, "num_input_tokens_seen": 3541248, "step": 5240 }, { "epoch": 0.12813622260767596, "grad_norm": 10.175660133361816, "learning_rate": 5.124346509014511e-07, "loss": 0.1623, "num_input_tokens_seen": 3544640, "step": 5245 }, { "epoch": 0.12825837343952312, "grad_norm": 52.19990158081055, "learning_rate": 5.129232422924708e-07, "loss": 0.1917, "num_input_tokens_seen": 3548160, "step": 5250 }, { "epoch": 0.12838052427137028, "grad_norm": 67.88219451904297, "learning_rate": 5.134118336834905e-07, "loss": 0.3176, "num_input_tokens_seen": 3551488, "step": 5255 }, { "epoch": 0.12850267510321745, "grad_norm": 55.14279556274414, "learning_rate": 5.139004250745101e-07, "loss": 0.1757, "num_input_tokens_seen": 3555456, "step": 5260 }, { "epoch": 0.1286248259350646, "grad_norm": 31.440595626831055, "learning_rate": 5.143890164655299e-07, "loss": 0.0561, "num_input_tokens_seen": 3558848, "step": 5265 }, { "epoch": 0.12874697676691177, "grad_norm": 27.637584686279297, "learning_rate": 5.148776078565496e-07, "loss": 0.1261, "num_input_tokens_seen": 3562624, "step": 5270 }, { "epoch": 0.12886912759875896, "grad_norm": 22.20782470703125, "learning_rate": 5.153661992475692e-07, "loss": 0.0671, "num_input_tokens_seen": 3566336, "step": 5275 }, { "epoch": 0.12899127843060612, "grad_norm": 19.314735412597656, "learning_rate": 5.15854790638589e-07, "loss": 0.0612, "num_input_tokens_seen": 3569280, "step": 5280 }, { "epoch": 0.12911342926245328, "grad_norm": 56.35293197631836, "learning_rate": 5.163433820296086e-07, "loss": 0.1168, "num_input_tokens_seen": 3572160, "step": 5285 }, { "epoch": 0.12923558009430045, "grad_norm": 47.87625503540039, "learning_rate": 5.168319734206283e-07, "loss": 0.1233, "num_input_tokens_seen": 3575360, "step": 5290 }, { "epoch": 0.1293577309261476, "grad_norm": 34.83906936645508, "learning_rate": 5.17320564811648e-07, "loss": 0.2233, "num_input_tokens_seen": 3578816, "step": 5295 }, { "epoch": 0.12947988175799477, "grad_norm": 25.64273452758789, "learning_rate": 5.178091562026678e-07, "loss": 0.1171, "num_input_tokens_seen": 3582208, "step": 5300 }, { "epoch": 0.12960203258984193, "grad_norm": 65.27552795410156, "learning_rate": 5.182977475936874e-07, "loss": 0.1628, "num_input_tokens_seen": 3585472, "step": 5305 }, { "epoch": 0.1297241834216891, "grad_norm": 25.95465660095215, "learning_rate": 5.18786338984707e-07, "loss": 0.2128, "num_input_tokens_seen": 3588480, "step": 5310 }, { "epoch": 0.12984633425353626, "grad_norm": 19.13545036315918, "learning_rate": 5.192749303757268e-07, "loss": 0.2443, "num_input_tokens_seen": 3591872, "step": 5315 }, { "epoch": 0.12996848508538345, "grad_norm": 11.983221054077148, "learning_rate": 5.197635217667464e-07, "loss": 0.106, "num_input_tokens_seen": 3595328, "step": 5320 }, { "epoch": 0.1300906359172306, "grad_norm": 17.041215896606445, "learning_rate": 5.202521131577662e-07, "loss": 0.0865, "num_input_tokens_seen": 3599168, "step": 5325 }, { "epoch": 0.13021278674907777, "grad_norm": 6.825228691101074, "learning_rate": 5.207407045487859e-07, "loss": 0.1366, "num_input_tokens_seen": 3602176, "step": 5330 }, { "epoch": 0.13033493758092493, "grad_norm": 24.280309677124023, "learning_rate": 5.212292959398054e-07, "loss": 0.0841, "num_input_tokens_seen": 3606144, "step": 5335 }, { "epoch": 0.1304570884127721, "grad_norm": 58.35236358642578, "learning_rate": 5.217178873308252e-07, "loss": 0.1475, "num_input_tokens_seen": 3609344, "step": 5340 }, { "epoch": 0.13057923924461926, "grad_norm": 34.24288558959961, "learning_rate": 5.222064787218449e-07, "loss": 0.0872, "num_input_tokens_seen": 3612608, "step": 5345 }, { "epoch": 0.13070139007646642, "grad_norm": 40.71889114379883, "learning_rate": 5.226950701128646e-07, "loss": 0.2108, "num_input_tokens_seen": 3615680, "step": 5350 }, { "epoch": 0.13082354090831358, "grad_norm": 27.078983306884766, "learning_rate": 5.231836615038843e-07, "loss": 0.1861, "num_input_tokens_seen": 3618944, "step": 5355 }, { "epoch": 0.13094569174016074, "grad_norm": 19.30623435974121, "learning_rate": 5.236722528949041e-07, "loss": 0.0905, "num_input_tokens_seen": 3622080, "step": 5360 }, { "epoch": 0.1310678425720079, "grad_norm": 17.23054313659668, "learning_rate": 5.241608442859236e-07, "loss": 0.1503, "num_input_tokens_seen": 3625280, "step": 5365 }, { "epoch": 0.1311899934038551, "grad_norm": 34.495113372802734, "learning_rate": 5.246494356769433e-07, "loss": 0.2217, "num_input_tokens_seen": 3628672, "step": 5370 }, { "epoch": 0.13131214423570226, "grad_norm": 26.085792541503906, "learning_rate": 5.251380270679631e-07, "loss": 0.1845, "num_input_tokens_seen": 3631744, "step": 5375 }, { "epoch": 0.13143429506754942, "grad_norm": 11.248368263244629, "learning_rate": 5.256266184589827e-07, "loss": 0.0909, "num_input_tokens_seen": 3635008, "step": 5380 }, { "epoch": 0.13155644589939658, "grad_norm": 22.01517105102539, "learning_rate": 5.261152098500025e-07, "loss": 0.1601, "num_input_tokens_seen": 3639040, "step": 5385 }, { "epoch": 0.13167859673124374, "grad_norm": 16.220895767211914, "learning_rate": 5.266038012410222e-07, "loss": 0.1047, "num_input_tokens_seen": 3642048, "step": 5390 }, { "epoch": 0.1318007475630909, "grad_norm": 35.09331512451172, "learning_rate": 5.270923926320417e-07, "loss": 0.1907, "num_input_tokens_seen": 3645504, "step": 5395 }, { "epoch": 0.13192289839493807, "grad_norm": 18.162155151367188, "learning_rate": 5.275809840230615e-07, "loss": 0.0751, "num_input_tokens_seen": 3648640, "step": 5400 }, { "epoch": 0.13204504922678523, "grad_norm": 35.38753128051758, "learning_rate": 5.280695754140812e-07, "loss": 0.1491, "num_input_tokens_seen": 3651904, "step": 5405 }, { "epoch": 0.1321672000586324, "grad_norm": 25.72626304626465, "learning_rate": 5.285581668051009e-07, "loss": 0.1306, "num_input_tokens_seen": 3654912, "step": 5410 }, { "epoch": 0.13228935089047955, "grad_norm": 39.30805587768555, "learning_rate": 5.290467581961206e-07, "loss": 0.1601, "num_input_tokens_seen": 3658112, "step": 5415 }, { "epoch": 0.13241150172232674, "grad_norm": 18.81882095336914, "learning_rate": 5.295353495871403e-07, "loss": 0.1715, "num_input_tokens_seen": 3661440, "step": 5420 }, { "epoch": 0.1325336525541739, "grad_norm": 31.62189483642578, "learning_rate": 5.300239409781599e-07, "loss": 0.1183, "num_input_tokens_seen": 3664768, "step": 5425 }, { "epoch": 0.13265580338602107, "grad_norm": 24.074796676635742, "learning_rate": 5.305125323691796e-07, "loss": 0.1052, "num_input_tokens_seen": 3668800, "step": 5430 }, { "epoch": 0.13277795421786823, "grad_norm": 1.8361209630966187, "learning_rate": 5.310011237601994e-07, "loss": 0.0333, "num_input_tokens_seen": 3671680, "step": 5435 }, { "epoch": 0.1329001050497154, "grad_norm": 76.28353118896484, "learning_rate": 5.31489715151219e-07, "loss": 0.123, "num_input_tokens_seen": 3675456, "step": 5440 }, { "epoch": 0.13302225588156255, "grad_norm": 10.621566772460938, "learning_rate": 5.319783065422388e-07, "loss": 0.0356, "num_input_tokens_seen": 3679040, "step": 5445 }, { "epoch": 0.1331444067134097, "grad_norm": 8.068056106567383, "learning_rate": 5.324668979332584e-07, "loss": 0.0859, "num_input_tokens_seen": 3682816, "step": 5450 }, { "epoch": 0.13326655754525688, "grad_norm": 26.7534122467041, "learning_rate": 5.32955489324278e-07, "loss": 0.2202, "num_input_tokens_seen": 3685952, "step": 5455 }, { "epoch": 0.13338870837710404, "grad_norm": 39.93231964111328, "learning_rate": 5.334440807152978e-07, "loss": 0.1361, "num_input_tokens_seen": 3689024, "step": 5460 }, { "epoch": 0.1335108592089512, "grad_norm": 4.293823719024658, "learning_rate": 5.339326721063175e-07, "loss": 0.1323, "num_input_tokens_seen": 3692544, "step": 5465 }, { "epoch": 0.1336330100407984, "grad_norm": 1.931091070175171, "learning_rate": 5.344212634973372e-07, "loss": 0.1305, "num_input_tokens_seen": 3696000, "step": 5470 }, { "epoch": 0.13375516087264555, "grad_norm": 73.22601318359375, "learning_rate": 5.349098548883568e-07, "loss": 0.2171, "num_input_tokens_seen": 3699136, "step": 5475 }, { "epoch": 0.1338773117044927, "grad_norm": 29.357030868530273, "learning_rate": 5.353984462793766e-07, "loss": 0.3066, "num_input_tokens_seen": 3702528, "step": 5480 }, { "epoch": 0.13399946253633988, "grad_norm": 27.00177764892578, "learning_rate": 5.358870376703962e-07, "loss": 0.2215, "num_input_tokens_seen": 3705984, "step": 5485 }, { "epoch": 0.13412161336818704, "grad_norm": 15.332404136657715, "learning_rate": 5.363756290614159e-07, "loss": 0.238, "num_input_tokens_seen": 3709184, "step": 5490 }, { "epoch": 0.1342437642000342, "grad_norm": 31.646596908569336, "learning_rate": 5.368642204524357e-07, "loss": 0.1463, "num_input_tokens_seen": 3712320, "step": 5495 }, { "epoch": 0.13436591503188136, "grad_norm": 25.798009872436523, "learning_rate": 5.373528118434552e-07, "loss": 0.0943, "num_input_tokens_seen": 3715392, "step": 5500 }, { "epoch": 0.13448806586372852, "grad_norm": 29.131210327148438, "learning_rate": 5.37841403234475e-07, "loss": 0.143, "num_input_tokens_seen": 3718528, "step": 5505 }, { "epoch": 0.13461021669557569, "grad_norm": 23.474040985107422, "learning_rate": 5.383299946254947e-07, "loss": 0.1029, "num_input_tokens_seen": 3721600, "step": 5510 }, { "epoch": 0.13473236752742287, "grad_norm": 19.45317268371582, "learning_rate": 5.388185860165143e-07, "loss": 0.1197, "num_input_tokens_seen": 3725312, "step": 5515 }, { "epoch": 0.13485451835927004, "grad_norm": 16.48733901977539, "learning_rate": 5.393071774075341e-07, "loss": 0.1296, "num_input_tokens_seen": 3728448, "step": 5520 }, { "epoch": 0.1349766691911172, "grad_norm": 48.141746520996094, "learning_rate": 5.397957687985537e-07, "loss": 0.1721, "num_input_tokens_seen": 3732160, "step": 5525 }, { "epoch": 0.13509882002296436, "grad_norm": 37.35277557373047, "learning_rate": 5.402843601895734e-07, "loss": 0.1383, "num_input_tokens_seen": 3735552, "step": 5530 }, { "epoch": 0.13522097085481152, "grad_norm": 14.689126968383789, "learning_rate": 5.407729515805931e-07, "loss": 0.1532, "num_input_tokens_seen": 3738944, "step": 5535 }, { "epoch": 0.13534312168665869, "grad_norm": 18.417491912841797, "learning_rate": 5.412615429716129e-07, "loss": 0.177, "num_input_tokens_seen": 3742720, "step": 5540 }, { "epoch": 0.13546527251850585, "grad_norm": 20.64642333984375, "learning_rate": 5.417501343626325e-07, "loss": 0.1112, "num_input_tokens_seen": 3745984, "step": 5545 }, { "epoch": 0.135587423350353, "grad_norm": 35.43182373046875, "learning_rate": 5.422387257536522e-07, "loss": 0.0857, "num_input_tokens_seen": 3749696, "step": 5550 }, { "epoch": 0.13570957418220017, "grad_norm": 17.774368286132812, "learning_rate": 5.42727317144672e-07, "loss": 0.0957, "num_input_tokens_seen": 3753152, "step": 5555 }, { "epoch": 0.13583172501404733, "grad_norm": 3.5985748767852783, "learning_rate": 5.432159085356915e-07, "loss": 0.0524, "num_input_tokens_seen": 3756800, "step": 5560 }, { "epoch": 0.13595387584589452, "grad_norm": 34.646461486816406, "learning_rate": 5.437044999267113e-07, "loss": 0.1252, "num_input_tokens_seen": 3760896, "step": 5565 }, { "epoch": 0.13607602667774168, "grad_norm": 2.2413535118103027, "learning_rate": 5.441930913177309e-07, "loss": 0.0735, "num_input_tokens_seen": 3764032, "step": 5570 }, { "epoch": 0.13619817750958885, "grad_norm": 20.598520278930664, "learning_rate": 5.446816827087507e-07, "loss": 0.0557, "num_input_tokens_seen": 3767488, "step": 5575 }, { "epoch": 0.136320328341436, "grad_norm": 30.10392189025879, "learning_rate": 5.451702740997704e-07, "loss": 0.2006, "num_input_tokens_seen": 3770944, "step": 5580 }, { "epoch": 0.13644247917328317, "grad_norm": 32.69186782836914, "learning_rate": 5.456588654907899e-07, "loss": 0.1705, "num_input_tokens_seen": 3774208, "step": 5585 }, { "epoch": 0.13656463000513033, "grad_norm": 38.73693084716797, "learning_rate": 5.461474568818097e-07, "loss": 0.1656, "num_input_tokens_seen": 3777920, "step": 5590 }, { "epoch": 0.1366867808369775, "grad_norm": 6.445084571838379, "learning_rate": 5.466360482728294e-07, "loss": 0.0259, "num_input_tokens_seen": 3781504, "step": 5595 }, { "epoch": 0.13680893166882466, "grad_norm": 60.09385681152344, "learning_rate": 5.471246396638492e-07, "loss": 0.1219, "num_input_tokens_seen": 3785280, "step": 5600 }, { "epoch": 0.13693108250067182, "grad_norm": 29.345659255981445, "learning_rate": 5.476132310548688e-07, "loss": 0.1452, "num_input_tokens_seen": 3791168, "step": 5605 }, { "epoch": 0.13705323333251898, "grad_norm": 14.892522811889648, "learning_rate": 5.481018224458884e-07, "loss": 0.0706, "num_input_tokens_seen": 3794432, "step": 5610 }, { "epoch": 0.13717538416436617, "grad_norm": 1.7643539905548096, "learning_rate": 5.485904138369081e-07, "loss": 0.0769, "num_input_tokens_seen": 3797504, "step": 5615 }, { "epoch": 0.13729753499621333, "grad_norm": 27.91608238220215, "learning_rate": 5.490790052279278e-07, "loss": 0.1494, "num_input_tokens_seen": 3801280, "step": 5620 }, { "epoch": 0.1374196858280605, "grad_norm": 32.55119705200195, "learning_rate": 5.495675966189476e-07, "loss": 0.1295, "num_input_tokens_seen": 3804736, "step": 5625 }, { "epoch": 0.13754183665990766, "grad_norm": 43.087921142578125, "learning_rate": 5.500561880099672e-07, "loss": 0.1242, "num_input_tokens_seen": 3808512, "step": 5630 }, { "epoch": 0.13766398749175482, "grad_norm": 55.47489929199219, "learning_rate": 5.50544779400987e-07, "loss": 0.161, "num_input_tokens_seen": 3811776, "step": 5635 }, { "epoch": 0.13778613832360198, "grad_norm": 23.946348190307617, "learning_rate": 5.510333707920066e-07, "loss": 0.1648, "num_input_tokens_seen": 3815232, "step": 5640 }, { "epoch": 0.13790828915544914, "grad_norm": 37.26478958129883, "learning_rate": 5.515219621830262e-07, "loss": 0.0756, "num_input_tokens_seen": 3818816, "step": 5645 }, { "epoch": 0.1380304399872963, "grad_norm": 46.365379333496094, "learning_rate": 5.52010553574046e-07, "loss": 0.147, "num_input_tokens_seen": 3821952, "step": 5650 }, { "epoch": 0.13815259081914347, "grad_norm": 12.609604835510254, "learning_rate": 5.524991449650657e-07, "loss": 0.0421, "num_input_tokens_seen": 3825408, "step": 5655 }, { "epoch": 0.13827474165099066, "grad_norm": 25.285354614257812, "learning_rate": 5.529877363560854e-07, "loss": 0.0934, "num_input_tokens_seen": 3828864, "step": 5660 }, { "epoch": 0.13839689248283782, "grad_norm": 16.68872833251953, "learning_rate": 5.53476327747105e-07, "loss": 0.0789, "num_input_tokens_seen": 3832064, "step": 5665 }, { "epoch": 0.13851904331468498, "grad_norm": 66.28137969970703, "learning_rate": 5.539649191381247e-07, "loss": 0.0692, "num_input_tokens_seen": 3835264, "step": 5670 }, { "epoch": 0.13864119414653214, "grad_norm": 0.4072583317756653, "learning_rate": 5.544535105291444e-07, "loss": 0.1247, "num_input_tokens_seen": 3838528, "step": 5675 }, { "epoch": 0.1387633449783793, "grad_norm": 72.48626708984375, "learning_rate": 5.549421019201641e-07, "loss": 0.1948, "num_input_tokens_seen": 3841984, "step": 5680 }, { "epoch": 0.13888549581022647, "grad_norm": 70.16142272949219, "learning_rate": 5.554306933111839e-07, "loss": 0.1382, "num_input_tokens_seen": 3845440, "step": 5685 }, { "epoch": 0.13900764664207363, "grad_norm": 42.96235656738281, "learning_rate": 5.559192847022035e-07, "loss": 0.1565, "num_input_tokens_seen": 3848704, "step": 5690 }, { "epoch": 0.1391297974739208, "grad_norm": 86.99781036376953, "learning_rate": 5.564078760932232e-07, "loss": 0.1449, "num_input_tokens_seen": 3852480, "step": 5695 }, { "epoch": 0.13925194830576795, "grad_norm": 43.10320281982422, "learning_rate": 5.568964674842429e-07, "loss": 0.2077, "num_input_tokens_seen": 3855616, "step": 5700 }, { "epoch": 0.13937409913761512, "grad_norm": 1.7821024656295776, "learning_rate": 5.573850588752625e-07, "loss": 0.0602, "num_input_tokens_seen": 3859200, "step": 5705 }, { "epoch": 0.1394962499694623, "grad_norm": 38.42781066894531, "learning_rate": 5.578736502662823e-07, "loss": 0.2262, "num_input_tokens_seen": 3862784, "step": 5710 }, { "epoch": 0.13961840080130947, "grad_norm": 48.475135803222656, "learning_rate": 5.58362241657302e-07, "loss": 0.0682, "num_input_tokens_seen": 3865920, "step": 5715 }, { "epoch": 0.13974055163315663, "grad_norm": 9.301112174987793, "learning_rate": 5.588508330483217e-07, "loss": 0.036, "num_input_tokens_seen": 3869632, "step": 5720 }, { "epoch": 0.1398627024650038, "grad_norm": 8.468901634216309, "learning_rate": 5.593394244393413e-07, "loss": 0.1062, "num_input_tokens_seen": 3872576, "step": 5725 }, { "epoch": 0.13998485329685095, "grad_norm": 52.738311767578125, "learning_rate": 5.59828015830361e-07, "loss": 0.2479, "num_input_tokens_seen": 3876288, "step": 5730 }, { "epoch": 0.14010700412869811, "grad_norm": 40.101619720458984, "learning_rate": 5.603166072213807e-07, "loss": 0.138, "num_input_tokens_seen": 3879360, "step": 5735 }, { "epoch": 0.14022915496054528, "grad_norm": 0.8611541390419006, "learning_rate": 5.608051986124004e-07, "loss": 0.1795, "num_input_tokens_seen": 3883200, "step": 5740 }, { "epoch": 0.14035130579239244, "grad_norm": 24.446453094482422, "learning_rate": 5.612937900034202e-07, "loss": 0.1743, "num_input_tokens_seen": 3887296, "step": 5745 }, { "epoch": 0.1404734566242396, "grad_norm": 17.659931182861328, "learning_rate": 5.617823813944397e-07, "loss": 0.1256, "num_input_tokens_seen": 3890880, "step": 5750 }, { "epoch": 0.14059560745608676, "grad_norm": 32.55177307128906, "learning_rate": 5.622709727854595e-07, "loss": 0.1736, "num_input_tokens_seen": 3894208, "step": 5755 }, { "epoch": 0.14071775828793395, "grad_norm": 14.735184669494629, "learning_rate": 5.627595641764792e-07, "loss": 0.1214, "num_input_tokens_seen": 3897536, "step": 5760 }, { "epoch": 0.14083990911978111, "grad_norm": 21.811237335205078, "learning_rate": 5.632481555674988e-07, "loss": 0.1589, "num_input_tokens_seen": 3900480, "step": 5765 }, { "epoch": 0.14096205995162828, "grad_norm": 18.260709762573242, "learning_rate": 5.637367469585186e-07, "loss": 0.1105, "num_input_tokens_seen": 3903552, "step": 5770 }, { "epoch": 0.14108421078347544, "grad_norm": 13.663732528686523, "learning_rate": 5.642253383495383e-07, "loss": 0.0396, "num_input_tokens_seen": 3906688, "step": 5775 }, { "epoch": 0.1412063616153226, "grad_norm": 4.043575763702393, "learning_rate": 5.647139297405579e-07, "loss": 0.1107, "num_input_tokens_seen": 3910080, "step": 5780 }, { "epoch": 0.14132851244716976, "grad_norm": 14.13771915435791, "learning_rate": 5.652025211315776e-07, "loss": 0.1385, "num_input_tokens_seen": 3912960, "step": 5785 }, { "epoch": 0.14145066327901692, "grad_norm": 59.14686584472656, "learning_rate": 5.656911125225974e-07, "loss": 0.1114, "num_input_tokens_seen": 3915904, "step": 5790 }, { "epoch": 0.1415728141108641, "grad_norm": 1.1978075504302979, "learning_rate": 5.66179703913617e-07, "loss": 0.1246, "num_input_tokens_seen": 3919104, "step": 5795 }, { "epoch": 0.14169496494271125, "grad_norm": 26.907318115234375, "learning_rate": 5.666682953046367e-07, "loss": 0.2857, "num_input_tokens_seen": 3922496, "step": 5800 }, { "epoch": 0.1418171157745584, "grad_norm": 70.06200408935547, "learning_rate": 5.671568866956564e-07, "loss": 0.0916, "num_input_tokens_seen": 3925824, "step": 5805 }, { "epoch": 0.1419392666064056, "grad_norm": 5.980767250061035, "learning_rate": 5.67645478086676e-07, "loss": 0.1194, "num_input_tokens_seen": 3929344, "step": 5810 }, { "epoch": 0.14206141743825276, "grad_norm": 39.06313705444336, "learning_rate": 5.681340694776958e-07, "loss": 0.1381, "num_input_tokens_seen": 3932608, "step": 5815 }, { "epoch": 0.14218356827009992, "grad_norm": 40.51398849487305, "learning_rate": 5.686226608687155e-07, "loss": 0.1014, "num_input_tokens_seen": 3935936, "step": 5820 }, { "epoch": 0.1423057191019471, "grad_norm": 18.47413444519043, "learning_rate": 5.691112522597351e-07, "loss": 0.129, "num_input_tokens_seen": 3939648, "step": 5825 }, { "epoch": 0.14242786993379425, "grad_norm": 18.596654891967773, "learning_rate": 5.695998436507549e-07, "loss": 0.0574, "num_input_tokens_seen": 3942976, "step": 5830 }, { "epoch": 0.1425500207656414, "grad_norm": 33.14046096801758, "learning_rate": 5.700884350417745e-07, "loss": 0.1438, "num_input_tokens_seen": 3946048, "step": 5835 }, { "epoch": 0.14267217159748857, "grad_norm": 20.379133224487305, "learning_rate": 5.705770264327942e-07, "loss": 0.1404, "num_input_tokens_seen": 3949568, "step": 5840 }, { "epoch": 0.14279432242933573, "grad_norm": 10.639100074768066, "learning_rate": 5.710656178238139e-07, "loss": 0.1302, "num_input_tokens_seen": 3953088, "step": 5845 }, { "epoch": 0.1429164732611829, "grad_norm": 42.40608215332031, "learning_rate": 5.715542092148337e-07, "loss": 0.063, "num_input_tokens_seen": 3956416, "step": 5850 }, { "epoch": 0.1430386240930301, "grad_norm": 35.883018493652344, "learning_rate": 5.720428006058533e-07, "loss": 0.1239, "num_input_tokens_seen": 3959616, "step": 5855 }, { "epoch": 0.14316077492487725, "grad_norm": 32.85486602783203, "learning_rate": 5.725313919968729e-07, "loss": 0.0998, "num_input_tokens_seen": 3962944, "step": 5860 }, { "epoch": 0.1432829257567244, "grad_norm": 27.143404006958008, "learning_rate": 5.730199833878927e-07, "loss": 0.1661, "num_input_tokens_seen": 3966528, "step": 5865 }, { "epoch": 0.14340507658857157, "grad_norm": 25.951753616333008, "learning_rate": 5.735085747789123e-07, "loss": 0.0673, "num_input_tokens_seen": 3969728, "step": 5870 }, { "epoch": 0.14352722742041873, "grad_norm": 36.098236083984375, "learning_rate": 5.739971661699321e-07, "loss": 0.1448, "num_input_tokens_seen": 3973056, "step": 5875 }, { "epoch": 0.1436493782522659, "grad_norm": 1.0197221040725708, "learning_rate": 5.744857575609518e-07, "loss": 0.063, "num_input_tokens_seen": 3976192, "step": 5880 }, { "epoch": 0.14377152908411306, "grad_norm": 60.88951873779297, "learning_rate": 5.749743489519713e-07, "loss": 0.2091, "num_input_tokens_seen": 3980480, "step": 5885 }, { "epoch": 0.14389367991596022, "grad_norm": 72.92743682861328, "learning_rate": 5.754629403429911e-07, "loss": 0.15, "num_input_tokens_seen": 3984000, "step": 5890 }, { "epoch": 0.14401583074780738, "grad_norm": 2.2103939056396484, "learning_rate": 5.759515317340108e-07, "loss": 0.1052, "num_input_tokens_seen": 3987264, "step": 5895 }, { "epoch": 0.14413798157965454, "grad_norm": 27.705286026000977, "learning_rate": 5.764401231250305e-07, "loss": 0.0707, "num_input_tokens_seen": 3990784, "step": 5900 }, { "epoch": 0.14426013241150173, "grad_norm": 31.684846878051758, "learning_rate": 5.769287145160502e-07, "loss": 0.1064, "num_input_tokens_seen": 3994304, "step": 5905 }, { "epoch": 0.1443822832433489, "grad_norm": 13.381579399108887, "learning_rate": 5.7741730590707e-07, "loss": 0.0527, "num_input_tokens_seen": 3997504, "step": 5910 }, { "epoch": 0.14450443407519606, "grad_norm": 38.11252975463867, "learning_rate": 5.779058972980895e-07, "loss": 0.1981, "num_input_tokens_seen": 4000832, "step": 5915 }, { "epoch": 0.14462658490704322, "grad_norm": 65.77743530273438, "learning_rate": 5.783944886891092e-07, "loss": 0.0958, "num_input_tokens_seen": 4004352, "step": 5920 }, { "epoch": 0.14474873573889038, "grad_norm": 28.584915161132812, "learning_rate": 5.78883080080129e-07, "loss": 0.1188, "num_input_tokens_seen": 4007552, "step": 5925 }, { "epoch": 0.14487088657073754, "grad_norm": 58.487369537353516, "learning_rate": 5.793716714711486e-07, "loss": 0.1757, "num_input_tokens_seen": 4010944, "step": 5930 }, { "epoch": 0.1449930374025847, "grad_norm": 52.59897232055664, "learning_rate": 5.798602628621684e-07, "loss": 0.0767, "num_input_tokens_seen": 4014528, "step": 5935 }, { "epoch": 0.14511518823443187, "grad_norm": 15.854714393615723, "learning_rate": 5.80348854253188e-07, "loss": 0.1921, "num_input_tokens_seen": 4018368, "step": 5940 }, { "epoch": 0.14523733906627903, "grad_norm": 4.226526260375977, "learning_rate": 5.808374456442076e-07, "loss": 0.0764, "num_input_tokens_seen": 4021888, "step": 5945 }, { "epoch": 0.1453594898981262, "grad_norm": 40.54411315917969, "learning_rate": 5.813260370352274e-07, "loss": 0.0979, "num_input_tokens_seen": 4025024, "step": 5950 }, { "epoch": 0.14548164072997338, "grad_norm": 2.4570257663726807, "learning_rate": 5.818146284262471e-07, "loss": 0.1028, "num_input_tokens_seen": 4028416, "step": 5955 }, { "epoch": 0.14560379156182054, "grad_norm": 29.194568634033203, "learning_rate": 5.823032198172668e-07, "loss": 0.1325, "num_input_tokens_seen": 4031488, "step": 5960 }, { "epoch": 0.1457259423936677, "grad_norm": 87.35152435302734, "learning_rate": 5.827918112082865e-07, "loss": 0.305, "num_input_tokens_seen": 4035008, "step": 5965 }, { "epoch": 0.14584809322551487, "grad_norm": 76.78803253173828, "learning_rate": 5.832804025993062e-07, "loss": 0.1991, "num_input_tokens_seen": 4038272, "step": 5970 }, { "epoch": 0.14597024405736203, "grad_norm": 54.32789611816406, "learning_rate": 5.837689939903258e-07, "loss": 0.1803, "num_input_tokens_seen": 4041536, "step": 5975 }, { "epoch": 0.1460923948892092, "grad_norm": 45.33341598510742, "learning_rate": 5.842575853813455e-07, "loss": 0.2937, "num_input_tokens_seen": 4044608, "step": 5980 }, { "epoch": 0.14621454572105635, "grad_norm": 4.984194755554199, "learning_rate": 5.847461767723653e-07, "loss": 0.0937, "num_input_tokens_seen": 4048064, "step": 5985 }, { "epoch": 0.14633669655290352, "grad_norm": 22.13462257385254, "learning_rate": 5.852347681633849e-07, "loss": 0.098, "num_input_tokens_seen": 4051392, "step": 5990 }, { "epoch": 0.14645884738475068, "grad_norm": 66.0450439453125, "learning_rate": 5.857233595544047e-07, "loss": 0.1357, "num_input_tokens_seen": 4055232, "step": 5995 }, { "epoch": 0.14658099821659787, "grad_norm": 8.29734992980957, "learning_rate": 5.862119509454243e-07, "loss": 0.1316, "num_input_tokens_seen": 4058112, "step": 6000 }, { "epoch": 0.14670314904844503, "grad_norm": 41.1361083984375, "learning_rate": 5.86700542336444e-07, "loss": 0.1374, "num_input_tokens_seen": 4061056, "step": 6005 }, { "epoch": 0.1468252998802922, "grad_norm": 7.894331932067871, "learning_rate": 5.871891337274637e-07, "loss": 0.112, "num_input_tokens_seen": 4064640, "step": 6010 }, { "epoch": 0.14694745071213935, "grad_norm": 4.169567108154297, "learning_rate": 5.876777251184834e-07, "loss": 0.1227, "num_input_tokens_seen": 4067904, "step": 6015 }, { "epoch": 0.14706960154398652, "grad_norm": 46.6483154296875, "learning_rate": 5.881663165095031e-07, "loss": 0.213, "num_input_tokens_seen": 4071232, "step": 6020 }, { "epoch": 0.14719175237583368, "grad_norm": 41.06544494628906, "learning_rate": 5.886549079005227e-07, "loss": 0.0935, "num_input_tokens_seen": 4074496, "step": 6025 }, { "epoch": 0.14731390320768084, "grad_norm": 31.227313995361328, "learning_rate": 5.891434992915425e-07, "loss": 0.1533, "num_input_tokens_seen": 4077888, "step": 6030 }, { "epoch": 0.147436054039528, "grad_norm": 20.721681594848633, "learning_rate": 5.896320906825621e-07, "loss": 0.1824, "num_input_tokens_seen": 4080960, "step": 6035 }, { "epoch": 0.14755820487137516, "grad_norm": 46.61140060424805, "learning_rate": 5.901206820735818e-07, "loss": 0.1432, "num_input_tokens_seen": 4084416, "step": 6040 }, { "epoch": 0.14768035570322233, "grad_norm": 19.30379295349121, "learning_rate": 5.906092734646016e-07, "loss": 0.0937, "num_input_tokens_seen": 4087744, "step": 6045 }, { "epoch": 0.14780250653506952, "grad_norm": 21.93967056274414, "learning_rate": 5.910978648556211e-07, "loss": 0.1235, "num_input_tokens_seen": 4091200, "step": 6050 }, { "epoch": 0.14792465736691668, "grad_norm": 20.221229553222656, "learning_rate": 5.915864562466409e-07, "loss": 0.0512, "num_input_tokens_seen": 4094336, "step": 6055 }, { "epoch": 0.14804680819876384, "grad_norm": 28.617212295532227, "learning_rate": 5.920750476376606e-07, "loss": 0.1712, "num_input_tokens_seen": 4098048, "step": 6060 }, { "epoch": 0.148168959030611, "grad_norm": 41.802513122558594, "learning_rate": 5.925636390286803e-07, "loss": 0.1851, "num_input_tokens_seen": 4101760, "step": 6065 }, { "epoch": 0.14829110986245816, "grad_norm": 14.1126708984375, "learning_rate": 5.930522304197e-07, "loss": 0.1798, "num_input_tokens_seen": 4105408, "step": 6070 }, { "epoch": 0.14841326069430533, "grad_norm": 62.553184509277344, "learning_rate": 5.935408218107197e-07, "loss": 0.1347, "num_input_tokens_seen": 4109312, "step": 6075 }, { "epoch": 0.1485354115261525, "grad_norm": 20.548730850219727, "learning_rate": 5.940294132017393e-07, "loss": 0.1295, "num_input_tokens_seen": 4112640, "step": 6080 }, { "epoch": 0.14865756235799965, "grad_norm": 14.097305297851562, "learning_rate": 5.94518004592759e-07, "loss": 0.1796, "num_input_tokens_seen": 4115904, "step": 6085 }, { "epoch": 0.1487797131898468, "grad_norm": 3.2791802883148193, "learning_rate": 5.950065959837788e-07, "loss": 0.0824, "num_input_tokens_seen": 4119360, "step": 6090 }, { "epoch": 0.14890186402169397, "grad_norm": 2.169283390045166, "learning_rate": 5.954951873747984e-07, "loss": 0.1154, "num_input_tokens_seen": 4122816, "step": 6095 }, { "epoch": 0.14902401485354116, "grad_norm": 23.00556755065918, "learning_rate": 5.959837787658181e-07, "loss": 0.1101, "num_input_tokens_seen": 4126144, "step": 6100 }, { "epoch": 0.14914616568538833, "grad_norm": 38.71464920043945, "learning_rate": 5.964723701568379e-07, "loss": 0.0952, "num_input_tokens_seen": 4129216, "step": 6105 }, { "epoch": 0.1492683165172355, "grad_norm": 31.493146896362305, "learning_rate": 5.969609615478574e-07, "loss": 0.1513, "num_input_tokens_seen": 4132032, "step": 6110 }, { "epoch": 0.14939046734908265, "grad_norm": 43.34003448486328, "learning_rate": 5.974495529388772e-07, "loss": 0.1582, "num_input_tokens_seen": 4135616, "step": 6115 }, { "epoch": 0.1495126181809298, "grad_norm": 52.86286926269531, "learning_rate": 5.979381443298969e-07, "loss": 0.1166, "num_input_tokens_seen": 4139136, "step": 6120 }, { "epoch": 0.14963476901277697, "grad_norm": 17.101299285888672, "learning_rate": 5.984267357209166e-07, "loss": 0.223, "num_input_tokens_seen": 4142464, "step": 6125 }, { "epoch": 0.14975691984462414, "grad_norm": 23.224618911743164, "learning_rate": 5.989153271119363e-07, "loss": 0.063, "num_input_tokens_seen": 4145920, "step": 6130 }, { "epoch": 0.1498790706764713, "grad_norm": 6.051283836364746, "learning_rate": 5.994039185029559e-07, "loss": 0.0648, "num_input_tokens_seen": 4149568, "step": 6135 }, { "epoch": 0.15000122150831846, "grad_norm": 22.5601806640625, "learning_rate": 5.998925098939756e-07, "loss": 0.2033, "num_input_tokens_seen": 4153088, "step": 6140 }, { "epoch": 0.15012337234016562, "grad_norm": 42.55513000488281, "learning_rate": 6.003811012849953e-07, "loss": 0.1186, "num_input_tokens_seen": 4156544, "step": 6145 }, { "epoch": 0.1502455231720128, "grad_norm": 50.73028564453125, "learning_rate": 6.008696926760151e-07, "loss": 0.086, "num_input_tokens_seen": 4159872, "step": 6150 }, { "epoch": 0.15036767400385997, "grad_norm": 19.807815551757812, "learning_rate": 6.013582840670347e-07, "loss": 0.1547, "num_input_tokens_seen": 4162880, "step": 6155 }, { "epoch": 0.15048982483570714, "grad_norm": 31.513826370239258, "learning_rate": 6.018468754580543e-07, "loss": 0.1011, "num_input_tokens_seen": 4166208, "step": 6160 }, { "epoch": 0.1506119756675543, "grad_norm": 3.1334080696105957, "learning_rate": 6.023354668490741e-07, "loss": 0.0551, "num_input_tokens_seen": 4169472, "step": 6165 }, { "epoch": 0.15073412649940146, "grad_norm": 43.65282440185547, "learning_rate": 6.028240582400937e-07, "loss": 0.1453, "num_input_tokens_seen": 4172544, "step": 6170 }, { "epoch": 0.15085627733124862, "grad_norm": 25.209524154663086, "learning_rate": 6.033126496311135e-07, "loss": 0.2232, "num_input_tokens_seen": 4176384, "step": 6175 }, { "epoch": 0.15097842816309578, "grad_norm": 25.307361602783203, "learning_rate": 6.038012410221332e-07, "loss": 0.0965, "num_input_tokens_seen": 4179904, "step": 6180 }, { "epoch": 0.15110057899494295, "grad_norm": 13.137029647827148, "learning_rate": 6.042898324131529e-07, "loss": 0.0665, "num_input_tokens_seen": 4183104, "step": 6185 }, { "epoch": 0.1512227298267901, "grad_norm": 2.0337600708007812, "learning_rate": 6.047784238041725e-07, "loss": 0.1853, "num_input_tokens_seen": 4186624, "step": 6190 }, { "epoch": 0.1513448806586373, "grad_norm": 28.856618881225586, "learning_rate": 6.052670151951922e-07, "loss": 0.1416, "num_input_tokens_seen": 4189568, "step": 6195 }, { "epoch": 0.15146703149048446, "grad_norm": 7.899556636810303, "learning_rate": 6.057556065862119e-07, "loss": 0.0886, "num_input_tokens_seen": 4192960, "step": 6200 }, { "epoch": 0.15158918232233162, "grad_norm": 30.406021118164062, "learning_rate": 6.062441979772316e-07, "loss": 0.1363, "num_input_tokens_seen": 4196032, "step": 6205 }, { "epoch": 0.15171133315417878, "grad_norm": 42.989532470703125, "learning_rate": 6.067327893682514e-07, "loss": 0.1335, "num_input_tokens_seen": 4199488, "step": 6210 }, { "epoch": 0.15183348398602595, "grad_norm": 5.426946640014648, "learning_rate": 6.07221380759271e-07, "loss": 0.1709, "num_input_tokens_seen": 4202816, "step": 6215 }, { "epoch": 0.1519556348178731, "grad_norm": 3.0106301307678223, "learning_rate": 6.077099721502907e-07, "loss": 0.1961, "num_input_tokens_seen": 4205760, "step": 6220 }, { "epoch": 0.15207778564972027, "grad_norm": 56.07905578613281, "learning_rate": 6.081985635413104e-07, "loss": 0.1633, "num_input_tokens_seen": 4209152, "step": 6225 }, { "epoch": 0.15219993648156743, "grad_norm": 35.35799026489258, "learning_rate": 6.0868715493233e-07, "loss": 0.2286, "num_input_tokens_seen": 4212288, "step": 6230 }, { "epoch": 0.1523220873134146, "grad_norm": 20.754413604736328, "learning_rate": 6.091757463233498e-07, "loss": 0.1305, "num_input_tokens_seen": 4215744, "step": 6235 }, { "epoch": 0.15244423814526176, "grad_norm": 15.790669441223145, "learning_rate": 6.096643377143695e-07, "loss": 0.1388, "num_input_tokens_seen": 4219008, "step": 6240 }, { "epoch": 0.15256638897710895, "grad_norm": 18.692581176757812, "learning_rate": 6.101529291053891e-07, "loss": 0.0912, "num_input_tokens_seen": 4222272, "step": 6245 }, { "epoch": 0.1526885398089561, "grad_norm": 33.882667541503906, "learning_rate": 6.106415204964088e-07, "loss": 0.1363, "num_input_tokens_seen": 4225792, "step": 6250 }, { "epoch": 0.15281069064080327, "grad_norm": 14.197614669799805, "learning_rate": 6.111301118874285e-07, "loss": 0.1123, "num_input_tokens_seen": 4228800, "step": 6255 }, { "epoch": 0.15293284147265043, "grad_norm": 42.19538116455078, "learning_rate": 6.116187032784482e-07, "loss": 0.1594, "num_input_tokens_seen": 4232000, "step": 6260 }, { "epoch": 0.1530549923044976, "grad_norm": 10.284985542297363, "learning_rate": 6.121072946694679e-07, "loss": 0.1191, "num_input_tokens_seen": 4235328, "step": 6265 }, { "epoch": 0.15317714313634476, "grad_norm": 31.913787841796875, "learning_rate": 6.125958860604877e-07, "loss": 0.1284, "num_input_tokens_seen": 4238784, "step": 6270 }, { "epoch": 0.15329929396819192, "grad_norm": 46.03422927856445, "learning_rate": 6.130844774515072e-07, "loss": 0.161, "num_input_tokens_seen": 4242112, "step": 6275 }, { "epoch": 0.15342144480003908, "grad_norm": 47.99618911743164, "learning_rate": 6.13573068842527e-07, "loss": 0.1836, "num_input_tokens_seen": 4245568, "step": 6280 }, { "epoch": 0.15354359563188624, "grad_norm": 10.648454666137695, "learning_rate": 6.140616602335467e-07, "loss": 0.1363, "num_input_tokens_seen": 4248896, "step": 6285 }, { "epoch": 0.1536657464637334, "grad_norm": 16.757213592529297, "learning_rate": 6.145502516245663e-07, "loss": 0.1812, "num_input_tokens_seen": 4252096, "step": 6290 }, { "epoch": 0.1537878972955806, "grad_norm": 27.234424591064453, "learning_rate": 6.150388430155861e-07, "loss": 0.1504, "num_input_tokens_seen": 4255424, "step": 6295 }, { "epoch": 0.15391004812742776, "grad_norm": 17.769634246826172, "learning_rate": 6.155274344066057e-07, "loss": 0.1123, "num_input_tokens_seen": 4258816, "step": 6300 }, { "epoch": 0.15403219895927492, "grad_norm": 18.891216278076172, "learning_rate": 6.160160257976254e-07, "loss": 0.0778, "num_input_tokens_seen": 4262272, "step": 6305 }, { "epoch": 0.15415434979112208, "grad_norm": 5.673632621765137, "learning_rate": 6.165046171886451e-07, "loss": 0.1027, "num_input_tokens_seen": 4265472, "step": 6310 }, { "epoch": 0.15427650062296924, "grad_norm": 21.022520065307617, "learning_rate": 6.169932085796648e-07, "loss": 0.0871, "num_input_tokens_seen": 4268416, "step": 6315 }, { "epoch": 0.1543986514548164, "grad_norm": 8.996367454528809, "learning_rate": 6.174817999706845e-07, "loss": 0.1307, "num_input_tokens_seen": 4271232, "step": 6320 }, { "epoch": 0.15452080228666357, "grad_norm": 35.42950439453125, "learning_rate": 6.179703913617041e-07, "loss": 0.0743, "num_input_tokens_seen": 4274624, "step": 6325 }, { "epoch": 0.15464295311851073, "grad_norm": 14.378525733947754, "learning_rate": 6.184589827527239e-07, "loss": 0.0905, "num_input_tokens_seen": 4277824, "step": 6330 }, { "epoch": 0.1547651039503579, "grad_norm": 25.608861923217773, "learning_rate": 6.189475741437435e-07, "loss": 0.1555, "num_input_tokens_seen": 4280768, "step": 6335 }, { "epoch": 0.15488725478220508, "grad_norm": 11.071989059448242, "learning_rate": 6.194361655347633e-07, "loss": 0.1338, "num_input_tokens_seen": 4283776, "step": 6340 }, { "epoch": 0.15500940561405224, "grad_norm": 30.60003662109375, "learning_rate": 6.19924756925783e-07, "loss": 0.0846, "num_input_tokens_seen": 4287168, "step": 6345 }, { "epoch": 0.1551315564458994, "grad_norm": 54.509544372558594, "learning_rate": 6.204133483168026e-07, "loss": 0.0945, "num_input_tokens_seen": 4290496, "step": 6350 }, { "epoch": 0.15525370727774657, "grad_norm": 45.7187385559082, "learning_rate": 6.209019397078223e-07, "loss": 0.1313, "num_input_tokens_seen": 4294464, "step": 6355 }, { "epoch": 0.15537585810959373, "grad_norm": 20.216039657592773, "learning_rate": 6.21390531098842e-07, "loss": 0.113, "num_input_tokens_seen": 4298048, "step": 6360 }, { "epoch": 0.1554980089414409, "grad_norm": 25.05139923095703, "learning_rate": 6.218791224898617e-07, "loss": 0.0559, "num_input_tokens_seen": 4300928, "step": 6365 }, { "epoch": 0.15562015977328805, "grad_norm": 52.927974700927734, "learning_rate": 6.223677138808814e-07, "loss": 0.1924, "num_input_tokens_seen": 4304320, "step": 6370 }, { "epoch": 0.15574231060513521, "grad_norm": 3.7601611614227295, "learning_rate": 6.228563052719011e-07, "loss": 0.0631, "num_input_tokens_seen": 4307584, "step": 6375 }, { "epoch": 0.15586446143698238, "grad_norm": 96.52560424804688, "learning_rate": 6.233448966629207e-07, "loss": 0.2685, "num_input_tokens_seen": 4310528, "step": 6380 }, { "epoch": 0.15598661226882954, "grad_norm": 10.017736434936523, "learning_rate": 6.238334880539404e-07, "loss": 0.1225, "num_input_tokens_seen": 4313664, "step": 6385 }, { "epoch": 0.15610876310067673, "grad_norm": 38.85869598388672, "learning_rate": 6.243220794449602e-07, "loss": 0.0834, "num_input_tokens_seen": 4316992, "step": 6390 }, { "epoch": 0.1562309139325239, "grad_norm": 20.78007698059082, "learning_rate": 6.248106708359798e-07, "loss": 0.1642, "num_input_tokens_seen": 4320320, "step": 6395 }, { "epoch": 0.15635306476437105, "grad_norm": 3.2366719245910645, "learning_rate": 6.252992622269996e-07, "loss": 0.1726, "num_input_tokens_seen": 4323840, "step": 6400 }, { "epoch": 0.15647521559621821, "grad_norm": 15.642763137817383, "learning_rate": 6.257878536180193e-07, "loss": 0.0551, "num_input_tokens_seen": 4327104, "step": 6405 }, { "epoch": 0.15659736642806538, "grad_norm": 22.45438575744629, "learning_rate": 6.262764450090388e-07, "loss": 0.1901, "num_input_tokens_seen": 4330560, "step": 6410 }, { "epoch": 0.15671951725991254, "grad_norm": 53.00764465332031, "learning_rate": 6.267650364000586e-07, "loss": 0.1802, "num_input_tokens_seen": 4333952, "step": 6415 }, { "epoch": 0.1568416680917597, "grad_norm": 15.467325210571289, "learning_rate": 6.272536277910783e-07, "loss": 0.0586, "num_input_tokens_seen": 4337280, "step": 6420 }, { "epoch": 0.15696381892360686, "grad_norm": 1.292763352394104, "learning_rate": 6.27742219182098e-07, "loss": 0.0911, "num_input_tokens_seen": 4340736, "step": 6425 }, { "epoch": 0.15708596975545402, "grad_norm": 24.337791442871094, "learning_rate": 6.282308105731177e-07, "loss": 0.1349, "num_input_tokens_seen": 4343808, "step": 6430 }, { "epoch": 0.1572081205873012, "grad_norm": 54.08406066894531, "learning_rate": 6.287194019641373e-07, "loss": 0.1372, "num_input_tokens_seen": 4347136, "step": 6435 }, { "epoch": 0.15733027141914838, "grad_norm": 18.846115112304688, "learning_rate": 6.29207993355157e-07, "loss": 0.0889, "num_input_tokens_seen": 4350336, "step": 6440 }, { "epoch": 0.15745242225099554, "grad_norm": 24.113834381103516, "learning_rate": 6.296965847461767e-07, "loss": 0.099, "num_input_tokens_seen": 4353600, "step": 6445 }, { "epoch": 0.1575745730828427, "grad_norm": 33.02431106567383, "learning_rate": 6.301851761371965e-07, "loss": 0.2071, "num_input_tokens_seen": 4357056, "step": 6450 }, { "epoch": 0.15769672391468986, "grad_norm": 27.968963623046875, "learning_rate": 6.306737675282161e-07, "loss": 0.2063, "num_input_tokens_seen": 4360704, "step": 6455 }, { "epoch": 0.15781887474653702, "grad_norm": 16.813751220703125, "learning_rate": 6.311623589192359e-07, "loss": 0.1524, "num_input_tokens_seen": 4364352, "step": 6460 }, { "epoch": 0.1579410255783842, "grad_norm": 25.05374526977539, "learning_rate": 6.316509503102555e-07, "loss": 0.1161, "num_input_tokens_seen": 4368192, "step": 6465 }, { "epoch": 0.15806317641023135, "grad_norm": 23.216142654418945, "learning_rate": 6.321395417012751e-07, "loss": 0.1139, "num_input_tokens_seen": 4371456, "step": 6470 }, { "epoch": 0.1581853272420785, "grad_norm": 33.85103225708008, "learning_rate": 6.326281330922949e-07, "loss": 0.1646, "num_input_tokens_seen": 4374400, "step": 6475 }, { "epoch": 0.15830747807392567, "grad_norm": 21.796924591064453, "learning_rate": 6.331167244833146e-07, "loss": 0.1281, "num_input_tokens_seen": 4377792, "step": 6480 }, { "epoch": 0.15842962890577283, "grad_norm": 37.538352966308594, "learning_rate": 6.336053158743343e-07, "loss": 0.1209, "num_input_tokens_seen": 4381120, "step": 6485 }, { "epoch": 0.15855177973762002, "grad_norm": 17.142486572265625, "learning_rate": 6.34093907265354e-07, "loss": 0.2482, "num_input_tokens_seen": 4384448, "step": 6490 }, { "epoch": 0.1586739305694672, "grad_norm": 6.992273807525635, "learning_rate": 6.345824986563737e-07, "loss": 0.113, "num_input_tokens_seen": 4387520, "step": 6495 }, { "epoch": 0.15879608140131435, "grad_norm": 12.753617286682129, "learning_rate": 6.350710900473933e-07, "loss": 0.0973, "num_input_tokens_seen": 4391040, "step": 6500 }, { "epoch": 0.1589182322331615, "grad_norm": 6.668143272399902, "learning_rate": 6.35559681438413e-07, "loss": 0.151, "num_input_tokens_seen": 4394880, "step": 6505 }, { "epoch": 0.15904038306500867, "grad_norm": 21.62032699584961, "learning_rate": 6.360482728294328e-07, "loss": 0.0759, "num_input_tokens_seen": 4397824, "step": 6510 }, { "epoch": 0.15916253389685583, "grad_norm": 5.089215278625488, "learning_rate": 6.365368642204524e-07, "loss": 0.0951, "num_input_tokens_seen": 4401152, "step": 6515 }, { "epoch": 0.159284684728703, "grad_norm": 45.12503433227539, "learning_rate": 6.370254556114721e-07, "loss": 0.0996, "num_input_tokens_seen": 4404352, "step": 6520 }, { "epoch": 0.15940683556055016, "grad_norm": 37.46071243286133, "learning_rate": 6.375140470024918e-07, "loss": 0.1367, "num_input_tokens_seen": 4407488, "step": 6525 }, { "epoch": 0.15952898639239732, "grad_norm": 31.072433471679688, "learning_rate": 6.380026383935114e-07, "loss": 0.0958, "num_input_tokens_seen": 4410816, "step": 6530 }, { "epoch": 0.1596511372242445, "grad_norm": 12.748991012573242, "learning_rate": 6.384912297845312e-07, "loss": 0.015, "num_input_tokens_seen": 4414016, "step": 6535 }, { "epoch": 0.15977328805609167, "grad_norm": 47.43623733520508, "learning_rate": 6.389798211755509e-07, "loss": 0.271, "num_input_tokens_seen": 4417472, "step": 6540 }, { "epoch": 0.15989543888793883, "grad_norm": 43.680030822753906, "learning_rate": 6.394684125665705e-07, "loss": 0.1196, "num_input_tokens_seen": 4420544, "step": 6545 }, { "epoch": 0.160017589719786, "grad_norm": 1.3316268920898438, "learning_rate": 6.399570039575902e-07, "loss": 0.0459, "num_input_tokens_seen": 4423936, "step": 6550 }, { "epoch": 0.16013974055163316, "grad_norm": 41.480247497558594, "learning_rate": 6.4044559534861e-07, "loss": 0.1126, "num_input_tokens_seen": 4427520, "step": 6555 }, { "epoch": 0.16026189138348032, "grad_norm": 75.16388702392578, "learning_rate": 6.409341867396296e-07, "loss": 0.1337, "num_input_tokens_seen": 4431296, "step": 6560 }, { "epoch": 0.16038404221532748, "grad_norm": 42.94929885864258, "learning_rate": 6.414227781306493e-07, "loss": 0.2227, "num_input_tokens_seen": 4434176, "step": 6565 }, { "epoch": 0.16050619304717464, "grad_norm": 96.06494903564453, "learning_rate": 6.419113695216691e-07, "loss": 0.0667, "num_input_tokens_seen": 4437568, "step": 6570 }, { "epoch": 0.1606283438790218, "grad_norm": 25.993467330932617, "learning_rate": 6.423999609126886e-07, "loss": 0.0748, "num_input_tokens_seen": 4440960, "step": 6575 }, { "epoch": 0.16075049471086897, "grad_norm": 18.6013240814209, "learning_rate": 6.428885523037084e-07, "loss": 0.1785, "num_input_tokens_seen": 4444224, "step": 6580 }, { "epoch": 0.16087264554271616, "grad_norm": 16.228498458862305, "learning_rate": 6.433771436947281e-07, "loss": 0.1116, "num_input_tokens_seen": 4447424, "step": 6585 }, { "epoch": 0.16099479637456332, "grad_norm": 56.65265655517578, "learning_rate": 6.438657350857477e-07, "loss": 0.178, "num_input_tokens_seen": 4450688, "step": 6590 }, { "epoch": 0.16111694720641048, "grad_norm": 89.38908386230469, "learning_rate": 6.443543264767675e-07, "loss": 0.2095, "num_input_tokens_seen": 4454336, "step": 6595 }, { "epoch": 0.16123909803825764, "grad_norm": 23.842357635498047, "learning_rate": 6.448429178677871e-07, "loss": 0.0842, "num_input_tokens_seen": 4457792, "step": 6600 }, { "epoch": 0.1613612488701048, "grad_norm": 14.655643463134766, "learning_rate": 6.453315092588068e-07, "loss": 0.1051, "num_input_tokens_seen": 4460992, "step": 6605 }, { "epoch": 0.16148339970195197, "grad_norm": 38.159698486328125, "learning_rate": 6.458201006498265e-07, "loss": 0.1712, "num_input_tokens_seen": 4464576, "step": 6610 }, { "epoch": 0.16160555053379913, "grad_norm": 29.461435317993164, "learning_rate": 6.463086920408463e-07, "loss": 0.1776, "num_input_tokens_seen": 4467584, "step": 6615 }, { "epoch": 0.1617277013656463, "grad_norm": 38.899383544921875, "learning_rate": 6.467972834318659e-07, "loss": 0.2165, "num_input_tokens_seen": 4470848, "step": 6620 }, { "epoch": 0.16184985219749345, "grad_norm": 15.065881729125977, "learning_rate": 6.472858748228856e-07, "loss": 0.2397, "num_input_tokens_seen": 4474368, "step": 6625 }, { "epoch": 0.16197200302934062, "grad_norm": 47.70626449584961, "learning_rate": 6.477744662139053e-07, "loss": 0.1456, "num_input_tokens_seen": 4478208, "step": 6630 }, { "epoch": 0.1620941538611878, "grad_norm": 10.539874076843262, "learning_rate": 6.482630576049249e-07, "loss": 0.1784, "num_input_tokens_seen": 4481664, "step": 6635 }, { "epoch": 0.16221630469303497, "grad_norm": 13.360639572143555, "learning_rate": 6.487516489959447e-07, "loss": 0.1122, "num_input_tokens_seen": 4484736, "step": 6640 }, { "epoch": 0.16233845552488213, "grad_norm": 4.426427364349365, "learning_rate": 6.492402403869644e-07, "loss": 0.0607, "num_input_tokens_seen": 4489024, "step": 6645 }, { "epoch": 0.1624606063567293, "grad_norm": 3.5760512351989746, "learning_rate": 6.49728831777984e-07, "loss": 0.0822, "num_input_tokens_seen": 4492352, "step": 6650 }, { "epoch": 0.16258275718857645, "grad_norm": 29.747814178466797, "learning_rate": 6.502174231690037e-07, "loss": 0.1032, "num_input_tokens_seen": 4495360, "step": 6655 }, { "epoch": 0.16270490802042362, "grad_norm": 19.472917556762695, "learning_rate": 6.507060145600234e-07, "loss": 0.0636, "num_input_tokens_seen": 4498752, "step": 6660 }, { "epoch": 0.16282705885227078, "grad_norm": 37.16254425048828, "learning_rate": 6.511946059510431e-07, "loss": 0.2362, "num_input_tokens_seen": 4501760, "step": 6665 }, { "epoch": 0.16294920968411794, "grad_norm": 71.30066680908203, "learning_rate": 6.516831973420628e-07, "loss": 0.243, "num_input_tokens_seen": 4505344, "step": 6670 }, { "epoch": 0.1630713605159651, "grad_norm": 2.2536447048187256, "learning_rate": 6.521717887330826e-07, "loss": 0.093, "num_input_tokens_seen": 4508672, "step": 6675 }, { "epoch": 0.1631935113478123, "grad_norm": 3.9860222339630127, "learning_rate": 6.526603801241022e-07, "loss": 0.1341, "num_input_tokens_seen": 4512768, "step": 6680 }, { "epoch": 0.16331566217965945, "grad_norm": 21.329500198364258, "learning_rate": 6.531489715151218e-07, "loss": 0.1077, "num_input_tokens_seen": 4515840, "step": 6685 }, { "epoch": 0.16343781301150662, "grad_norm": 15.191680908203125, "learning_rate": 6.536375629061416e-07, "loss": 0.1545, "num_input_tokens_seen": 4519232, "step": 6690 }, { "epoch": 0.16355996384335378, "grad_norm": 52.87079620361328, "learning_rate": 6.541261542971612e-07, "loss": 0.1031, "num_input_tokens_seen": 4522816, "step": 6695 }, { "epoch": 0.16368211467520094, "grad_norm": 20.58293914794922, "learning_rate": 6.54614745688181e-07, "loss": 0.1403, "num_input_tokens_seen": 4525888, "step": 6700 }, { "epoch": 0.1638042655070481, "grad_norm": 10.931684494018555, "learning_rate": 6.551033370792007e-07, "loss": 0.1446, "num_input_tokens_seen": 4529280, "step": 6705 }, { "epoch": 0.16392641633889526, "grad_norm": 68.2391357421875, "learning_rate": 6.555919284702204e-07, "loss": 0.186, "num_input_tokens_seen": 4532544, "step": 6710 }, { "epoch": 0.16404856717074243, "grad_norm": 28.83963394165039, "learning_rate": 6.5608051986124e-07, "loss": 0.1599, "num_input_tokens_seen": 4535360, "step": 6715 }, { "epoch": 0.1641707180025896, "grad_norm": 10.062387466430664, "learning_rate": 6.565691112522597e-07, "loss": 0.1133, "num_input_tokens_seen": 4538368, "step": 6720 }, { "epoch": 0.16429286883443675, "grad_norm": 35.1144905090332, "learning_rate": 6.570577026432794e-07, "loss": 0.0491, "num_input_tokens_seen": 4541632, "step": 6725 }, { "epoch": 0.16441501966628394, "grad_norm": 49.85148620605469, "learning_rate": 6.575462940342991e-07, "loss": 0.1565, "num_input_tokens_seen": 4544512, "step": 6730 }, { "epoch": 0.1645371704981311, "grad_norm": 16.927494049072266, "learning_rate": 6.580348854253189e-07, "loss": 0.102, "num_input_tokens_seen": 4547776, "step": 6735 }, { "epoch": 0.16465932132997826, "grad_norm": 61.242252349853516, "learning_rate": 6.585234768163384e-07, "loss": 0.1985, "num_input_tokens_seen": 4550912, "step": 6740 }, { "epoch": 0.16478147216182543, "grad_norm": 14.769601821899414, "learning_rate": 6.590120682073581e-07, "loss": 0.0903, "num_input_tokens_seen": 4554112, "step": 6745 }, { "epoch": 0.1649036229936726, "grad_norm": 14.11997127532959, "learning_rate": 6.595006595983779e-07, "loss": 0.18, "num_input_tokens_seen": 4557120, "step": 6750 }, { "epoch": 0.16502577382551975, "grad_norm": 17.178272247314453, "learning_rate": 6.599892509893975e-07, "loss": 0.0676, "num_input_tokens_seen": 4560576, "step": 6755 }, { "epoch": 0.1651479246573669, "grad_norm": 16.599720001220703, "learning_rate": 6.604778423804173e-07, "loss": 0.1992, "num_input_tokens_seen": 4564096, "step": 6760 }, { "epoch": 0.16527007548921407, "grad_norm": 14.66077995300293, "learning_rate": 6.60966433771437e-07, "loss": 0.1163, "num_input_tokens_seen": 4567104, "step": 6765 }, { "epoch": 0.16539222632106124, "grad_norm": 14.158099174499512, "learning_rate": 6.614550251624566e-07, "loss": 0.1756, "num_input_tokens_seen": 4570496, "step": 6770 }, { "epoch": 0.1655143771529084, "grad_norm": 7.5408034324646, "learning_rate": 6.619436165534763e-07, "loss": 0.0301, "num_input_tokens_seen": 4576000, "step": 6775 }, { "epoch": 0.1656365279847556, "grad_norm": 2.0234808921813965, "learning_rate": 6.62432207944496e-07, "loss": 0.1214, "num_input_tokens_seen": 4579392, "step": 6780 }, { "epoch": 0.16575867881660275, "grad_norm": 36.175838470458984, "learning_rate": 6.629207993355157e-07, "loss": 0.0789, "num_input_tokens_seen": 4582656, "step": 6785 }, { "epoch": 0.1658808296484499, "grad_norm": 48.25929641723633, "learning_rate": 6.634093907265354e-07, "loss": 0.1426, "num_input_tokens_seen": 4586304, "step": 6790 }, { "epoch": 0.16600298048029707, "grad_norm": 20.59220314025879, "learning_rate": 6.638979821175551e-07, "loss": 0.0402, "num_input_tokens_seen": 4589376, "step": 6795 }, { "epoch": 0.16612513131214424, "grad_norm": 50.40332794189453, "learning_rate": 6.643865735085747e-07, "loss": 0.1095, "num_input_tokens_seen": 4592704, "step": 6800 }, { "epoch": 0.1662472821439914, "grad_norm": 22.707887649536133, "learning_rate": 6.648751648995944e-07, "loss": 0.1261, "num_input_tokens_seen": 4596672, "step": 6805 }, { "epoch": 0.16636943297583856, "grad_norm": 27.34088897705078, "learning_rate": 6.653637562906142e-07, "loss": 0.1199, "num_input_tokens_seen": 4600128, "step": 6810 }, { "epoch": 0.16649158380768572, "grad_norm": 15.953385353088379, "learning_rate": 6.658523476816338e-07, "loss": 0.1429, "num_input_tokens_seen": 4603200, "step": 6815 }, { "epoch": 0.16661373463953288, "grad_norm": 12.182960510253906, "learning_rate": 6.663409390726536e-07, "loss": 0.1214, "num_input_tokens_seen": 4606208, "step": 6820 }, { "epoch": 0.16673588547138005, "grad_norm": 9.563040733337402, "learning_rate": 6.668295304636732e-07, "loss": 0.0518, "num_input_tokens_seen": 4609280, "step": 6825 }, { "epoch": 0.16685803630322724, "grad_norm": 44.647499084472656, "learning_rate": 6.673181218546929e-07, "loss": 0.1066, "num_input_tokens_seen": 4612352, "step": 6830 }, { "epoch": 0.1669801871350744, "grad_norm": 9.651607513427734, "learning_rate": 6.678067132457126e-07, "loss": 0.1821, "num_input_tokens_seen": 4615744, "step": 6835 }, { "epoch": 0.16710233796692156, "grad_norm": 103.38037872314453, "learning_rate": 6.682953046367323e-07, "loss": 0.3838, "num_input_tokens_seen": 4619392, "step": 6840 }, { "epoch": 0.16722448879876872, "grad_norm": 56.43678665161133, "learning_rate": 6.68783896027752e-07, "loss": 0.0645, "num_input_tokens_seen": 4622336, "step": 6845 }, { "epoch": 0.16734663963061588, "grad_norm": 27.00754165649414, "learning_rate": 6.692724874187716e-07, "loss": 0.1702, "num_input_tokens_seen": 4625280, "step": 6850 }, { "epoch": 0.16746879046246305, "grad_norm": 12.663601875305176, "learning_rate": 6.697610788097914e-07, "loss": 0.2138, "num_input_tokens_seen": 4628544, "step": 6855 }, { "epoch": 0.1675909412943102, "grad_norm": 43.01512908935547, "learning_rate": 6.70249670200811e-07, "loss": 0.1501, "num_input_tokens_seen": 4632384, "step": 6860 }, { "epoch": 0.16771309212615737, "grad_norm": 4.006659984588623, "learning_rate": 6.707382615918307e-07, "loss": 0.0843, "num_input_tokens_seen": 4635712, "step": 6865 }, { "epoch": 0.16783524295800453, "grad_norm": 5.144809722900391, "learning_rate": 6.712268529828505e-07, "loss": 0.0591, "num_input_tokens_seen": 4638848, "step": 6870 }, { "epoch": 0.16795739378985172, "grad_norm": 0.25560063123703003, "learning_rate": 6.7171544437387e-07, "loss": 0.0569, "num_input_tokens_seen": 4641728, "step": 6875 }, { "epoch": 0.16807954462169888, "grad_norm": 85.87944030761719, "learning_rate": 6.722040357648898e-07, "loss": 0.2355, "num_input_tokens_seen": 4644992, "step": 6880 }, { "epoch": 0.16820169545354605, "grad_norm": 36.92861557006836, "learning_rate": 6.726926271559095e-07, "loss": 0.1886, "num_input_tokens_seen": 4647872, "step": 6885 }, { "epoch": 0.1683238462853932, "grad_norm": 6.552468776702881, "learning_rate": 6.731812185469292e-07, "loss": 0.1627, "num_input_tokens_seen": 4651520, "step": 6890 }, { "epoch": 0.16844599711724037, "grad_norm": 20.896957397460938, "learning_rate": 6.736698099379489e-07, "loss": 0.1422, "num_input_tokens_seen": 4655488, "step": 6895 }, { "epoch": 0.16856814794908753, "grad_norm": 61.85845947265625, "learning_rate": 6.741584013289686e-07, "loss": 0.2005, "num_input_tokens_seen": 4658944, "step": 6900 }, { "epoch": 0.1686902987809347, "grad_norm": 4.090170383453369, "learning_rate": 6.746469927199882e-07, "loss": 0.2195, "num_input_tokens_seen": 4661952, "step": 6905 }, { "epoch": 0.16881244961278186, "grad_norm": 52.27777862548828, "learning_rate": 6.751355841110079e-07, "loss": 0.2332, "num_input_tokens_seen": 4665472, "step": 6910 }, { "epoch": 0.16893460044462902, "grad_norm": 24.961549758911133, "learning_rate": 6.756241755020277e-07, "loss": 0.1836, "num_input_tokens_seen": 4668800, "step": 6915 }, { "epoch": 0.16905675127647618, "grad_norm": 31.262836456298828, "learning_rate": 6.761127668930473e-07, "loss": 0.132, "num_input_tokens_seen": 4672512, "step": 6920 }, { "epoch": 0.16917890210832337, "grad_norm": 25.470243453979492, "learning_rate": 6.766013582840671e-07, "loss": 0.1959, "num_input_tokens_seen": 4675904, "step": 6925 }, { "epoch": 0.16930105294017053, "grad_norm": 16.708919525146484, "learning_rate": 6.770899496750868e-07, "loss": 0.0929, "num_input_tokens_seen": 4679424, "step": 6930 }, { "epoch": 0.1694232037720177, "grad_norm": 6.275186061859131, "learning_rate": 6.775785410661063e-07, "loss": 0.1339, "num_input_tokens_seen": 4683520, "step": 6935 }, { "epoch": 0.16954535460386486, "grad_norm": 23.944786071777344, "learning_rate": 6.780671324571261e-07, "loss": 0.138, "num_input_tokens_seen": 4686848, "step": 6940 }, { "epoch": 0.16966750543571202, "grad_norm": 8.790188789367676, "learning_rate": 6.785557238481458e-07, "loss": 0.068, "num_input_tokens_seen": 4690304, "step": 6945 }, { "epoch": 0.16978965626755918, "grad_norm": 23.236326217651367, "learning_rate": 6.790443152391655e-07, "loss": 0.0549, "num_input_tokens_seen": 4693632, "step": 6950 }, { "epoch": 0.16991180709940634, "grad_norm": 24.558181762695312, "learning_rate": 6.795329066301852e-07, "loss": 0.104, "num_input_tokens_seen": 4696832, "step": 6955 }, { "epoch": 0.1700339579312535, "grad_norm": 2.767685890197754, "learning_rate": 6.800214980212048e-07, "loss": 0.1351, "num_input_tokens_seen": 4700672, "step": 6960 }, { "epoch": 0.17015610876310067, "grad_norm": 9.79178524017334, "learning_rate": 6.805100894122245e-07, "loss": 0.1338, "num_input_tokens_seen": 4704384, "step": 6965 }, { "epoch": 0.17027825959494783, "grad_norm": 14.672940254211426, "learning_rate": 6.809986808032442e-07, "loss": 0.0839, "num_input_tokens_seen": 4707712, "step": 6970 }, { "epoch": 0.17040041042679502, "grad_norm": 10.246041297912598, "learning_rate": 6.81487272194264e-07, "loss": 0.0875, "num_input_tokens_seen": 4710720, "step": 6975 }, { "epoch": 0.17052256125864218, "grad_norm": 1.6319217681884766, "learning_rate": 6.819758635852836e-07, "loss": 0.0822, "num_input_tokens_seen": 4714048, "step": 6980 }, { "epoch": 0.17064471209048934, "grad_norm": 23.715749740600586, "learning_rate": 6.824644549763034e-07, "loss": 0.1104, "num_input_tokens_seen": 4717312, "step": 6985 }, { "epoch": 0.1707668629223365, "grad_norm": 29.73052406311035, "learning_rate": 6.82953046367323e-07, "loss": 0.0742, "num_input_tokens_seen": 4721152, "step": 6990 }, { "epoch": 0.17088901375418367, "grad_norm": 20.4119873046875, "learning_rate": 6.834416377583426e-07, "loss": 0.1043, "num_input_tokens_seen": 4724416, "step": 6995 }, { "epoch": 0.17101116458603083, "grad_norm": 37.61855697631836, "learning_rate": 6.839302291493624e-07, "loss": 0.1324, "num_input_tokens_seen": 4727872, "step": 7000 }, { "epoch": 0.171133315417878, "grad_norm": 44.900047302246094, "learning_rate": 6.844188205403821e-07, "loss": 0.2628, "num_input_tokens_seen": 4731328, "step": 7005 }, { "epoch": 0.17125546624972515, "grad_norm": 23.994895935058594, "learning_rate": 6.849074119314018e-07, "loss": 0.1661, "num_input_tokens_seen": 4734912, "step": 7010 }, { "epoch": 0.17137761708157231, "grad_norm": 36.63328170776367, "learning_rate": 6.853960033224214e-07, "loss": 0.0935, "num_input_tokens_seen": 4738816, "step": 7015 }, { "epoch": 0.1714997679134195, "grad_norm": 31.2292423248291, "learning_rate": 6.85884594713441e-07, "loss": 0.1215, "num_input_tokens_seen": 4741888, "step": 7020 }, { "epoch": 0.17162191874526667, "grad_norm": 27.03329086303711, "learning_rate": 6.863731861044608e-07, "loss": 0.1481, "num_input_tokens_seen": 4745152, "step": 7025 }, { "epoch": 0.17174406957711383, "grad_norm": 10.102874755859375, "learning_rate": 6.868617774954805e-07, "loss": 0.1287, "num_input_tokens_seen": 4748608, "step": 7030 }, { "epoch": 0.171866220408961, "grad_norm": 57.4080696105957, "learning_rate": 6.873503688865003e-07, "loss": 0.1947, "num_input_tokens_seen": 4751552, "step": 7035 }, { "epoch": 0.17198837124080815, "grad_norm": 19.714679718017578, "learning_rate": 6.878389602775198e-07, "loss": 0.0742, "num_input_tokens_seen": 4754688, "step": 7040 }, { "epoch": 0.17211052207265531, "grad_norm": 26.733041763305664, "learning_rate": 6.883275516685396e-07, "loss": 0.1217, "num_input_tokens_seen": 4757760, "step": 7045 }, { "epoch": 0.17223267290450248, "grad_norm": 16.901596069335938, "learning_rate": 6.888161430595593e-07, "loss": 0.2113, "num_input_tokens_seen": 4760896, "step": 7050 }, { "epoch": 0.17235482373634964, "grad_norm": 26.491451263427734, "learning_rate": 6.893047344505789e-07, "loss": 0.1273, "num_input_tokens_seen": 4763904, "step": 7055 }, { "epoch": 0.1724769745681968, "grad_norm": 34.51689529418945, "learning_rate": 6.897933258415987e-07, "loss": 0.1319, "num_input_tokens_seen": 4767680, "step": 7060 }, { "epoch": 0.17259912540004396, "grad_norm": 22.6583251953125, "learning_rate": 6.902819172326183e-07, "loss": 0.1141, "num_input_tokens_seen": 4771200, "step": 7065 }, { "epoch": 0.17272127623189115, "grad_norm": 16.140857696533203, "learning_rate": 6.90770508623638e-07, "loss": 0.0704, "num_input_tokens_seen": 4774592, "step": 7070 }, { "epoch": 0.17284342706373831, "grad_norm": 28.333457946777344, "learning_rate": 6.912591000146577e-07, "loss": 0.1583, "num_input_tokens_seen": 4777600, "step": 7075 }, { "epoch": 0.17296557789558548, "grad_norm": 22.110576629638672, "learning_rate": 6.917476914056773e-07, "loss": 0.0724, "num_input_tokens_seen": 4780736, "step": 7080 }, { "epoch": 0.17308772872743264, "grad_norm": 22.722089767456055, "learning_rate": 6.922362827966971e-07, "loss": 0.196, "num_input_tokens_seen": 4783936, "step": 7085 }, { "epoch": 0.1732098795592798, "grad_norm": 5.605364799499512, "learning_rate": 6.927248741877168e-07, "loss": 0.088, "num_input_tokens_seen": 4787328, "step": 7090 }, { "epoch": 0.17333203039112696, "grad_norm": 36.73191833496094, "learning_rate": 6.932134655787366e-07, "loss": 0.0997, "num_input_tokens_seen": 4790528, "step": 7095 }, { "epoch": 0.17345418122297412, "grad_norm": 4.203964710235596, "learning_rate": 6.937020569697561e-07, "loss": 0.0858, "num_input_tokens_seen": 4793728, "step": 7100 }, { "epoch": 0.1735763320548213, "grad_norm": 21.231687545776367, "learning_rate": 6.941906483607759e-07, "loss": 0.1983, "num_input_tokens_seen": 4797312, "step": 7105 }, { "epoch": 0.17369848288666845, "grad_norm": 20.893413543701172, "learning_rate": 6.946792397517955e-07, "loss": 0.1578, "num_input_tokens_seen": 4801344, "step": 7110 }, { "epoch": 0.1738206337185156, "grad_norm": 17.35948944091797, "learning_rate": 6.951678311428152e-07, "loss": 0.0992, "num_input_tokens_seen": 4804928, "step": 7115 }, { "epoch": 0.1739427845503628, "grad_norm": 12.587772369384766, "learning_rate": 6.95656422533835e-07, "loss": 0.101, "num_input_tokens_seen": 4808832, "step": 7120 }, { "epoch": 0.17406493538220996, "grad_norm": 66.40949249267578, "learning_rate": 6.961450139248545e-07, "loss": 0.3265, "num_input_tokens_seen": 4811648, "step": 7125 }, { "epoch": 0.17418708621405712, "grad_norm": 19.095821380615234, "learning_rate": 6.966336053158743e-07, "loss": 0.0718, "num_input_tokens_seen": 4815360, "step": 7130 }, { "epoch": 0.1743092370459043, "grad_norm": 21.798635482788086, "learning_rate": 6.97122196706894e-07, "loss": 0.0902, "num_input_tokens_seen": 4818816, "step": 7135 }, { "epoch": 0.17443138787775145, "grad_norm": 30.94866943359375, "learning_rate": 6.976107880979138e-07, "loss": 0.1007, "num_input_tokens_seen": 4822464, "step": 7140 }, { "epoch": 0.1745535387095986, "grad_norm": 6.085841655731201, "learning_rate": 6.980993794889334e-07, "loss": 0.177, "num_input_tokens_seen": 4825664, "step": 7145 }, { "epoch": 0.17467568954144577, "grad_norm": 25.90575408935547, "learning_rate": 6.98587970879953e-07, "loss": 0.1748, "num_input_tokens_seen": 4828480, "step": 7150 }, { "epoch": 0.17479784037329293, "grad_norm": 40.3851203918457, "learning_rate": 6.990765622709727e-07, "loss": 0.1292, "num_input_tokens_seen": 4832000, "step": 7155 }, { "epoch": 0.1749199912051401, "grad_norm": 29.815210342407227, "learning_rate": 6.995651536619924e-07, "loss": 0.1327, "num_input_tokens_seen": 4835584, "step": 7160 }, { "epoch": 0.17504214203698726, "grad_norm": 9.371803283691406, "learning_rate": 7.000537450530122e-07, "loss": 0.1325, "num_input_tokens_seen": 4838848, "step": 7165 }, { "epoch": 0.17516429286883445, "grad_norm": 23.05524444580078, "learning_rate": 7.005423364440318e-07, "loss": 0.0945, "num_input_tokens_seen": 4842048, "step": 7170 }, { "epoch": 0.1752864437006816, "grad_norm": 13.318825721740723, "learning_rate": 7.010309278350515e-07, "loss": 0.0969, "num_input_tokens_seen": 4845632, "step": 7175 }, { "epoch": 0.17540859453252877, "grad_norm": 50.86394119262695, "learning_rate": 7.015195192260712e-07, "loss": 0.1881, "num_input_tokens_seen": 4848832, "step": 7180 }, { "epoch": 0.17553074536437593, "grad_norm": 6.3671040534973145, "learning_rate": 7.020081106170908e-07, "loss": 0.1016, "num_input_tokens_seen": 4851968, "step": 7185 }, { "epoch": 0.1756528961962231, "grad_norm": 28.17583656311035, "learning_rate": 7.024967020081106e-07, "loss": 0.0988, "num_input_tokens_seen": 4855360, "step": 7190 }, { "epoch": 0.17577504702807026, "grad_norm": 30.14576530456543, "learning_rate": 7.029852933991303e-07, "loss": 0.1393, "num_input_tokens_seen": 4858496, "step": 7195 }, { "epoch": 0.17589719785991742, "grad_norm": 21.089799880981445, "learning_rate": 7.0347388479015e-07, "loss": 0.0923, "num_input_tokens_seen": 4862016, "step": 7200 }, { "epoch": 0.17601934869176458, "grad_norm": 32.99907302856445, "learning_rate": 7.039624761811696e-07, "loss": 0.1801, "num_input_tokens_seen": 4865216, "step": 7205 }, { "epoch": 0.17614149952361174, "grad_norm": 6.302981376647949, "learning_rate": 7.044510675721893e-07, "loss": 0.0319, "num_input_tokens_seen": 4868352, "step": 7210 }, { "epoch": 0.17626365035545893, "grad_norm": 9.754927635192871, "learning_rate": 7.04939658963209e-07, "loss": 0.082, "num_input_tokens_seen": 4871744, "step": 7215 }, { "epoch": 0.1763858011873061, "grad_norm": 1.676681637763977, "learning_rate": 7.054282503542287e-07, "loss": 0.0897, "num_input_tokens_seen": 4875520, "step": 7220 }, { "epoch": 0.17650795201915326, "grad_norm": 23.12895393371582, "learning_rate": 7.059168417452485e-07, "loss": 0.0919, "num_input_tokens_seen": 4878912, "step": 7225 }, { "epoch": 0.17663010285100042, "grad_norm": 32.92357635498047, "learning_rate": 7.064054331362681e-07, "loss": 0.0748, "num_input_tokens_seen": 4882368, "step": 7230 }, { "epoch": 0.17675225368284758, "grad_norm": 37.07939910888672, "learning_rate": 7.068940245272877e-07, "loss": 0.0947, "num_input_tokens_seen": 4885504, "step": 7235 }, { "epoch": 0.17687440451469474, "grad_norm": 0.42080605030059814, "learning_rate": 7.073826159183075e-07, "loss": 0.0184, "num_input_tokens_seen": 4889344, "step": 7240 }, { "epoch": 0.1769965553465419, "grad_norm": 34.457584381103516, "learning_rate": 7.078712073093271e-07, "loss": 0.0587, "num_input_tokens_seen": 4892480, "step": 7245 }, { "epoch": 0.17711870617838907, "grad_norm": 83.84002685546875, "learning_rate": 7.083597987003469e-07, "loss": 0.1063, "num_input_tokens_seen": 4895936, "step": 7250 }, { "epoch": 0.17724085701023623, "grad_norm": 30.910654067993164, "learning_rate": 7.088483900913666e-07, "loss": 0.2293, "num_input_tokens_seen": 4899200, "step": 7255 }, { "epoch": 0.1773630078420834, "grad_norm": 24.910533905029297, "learning_rate": 7.093369814823862e-07, "loss": 0.1442, "num_input_tokens_seen": 4902464, "step": 7260 }, { "epoch": 0.17748515867393058, "grad_norm": 16.055313110351562, "learning_rate": 7.098255728734059e-07, "loss": 0.35, "num_input_tokens_seen": 4906240, "step": 7265 }, { "epoch": 0.17760730950577774, "grad_norm": 19.099559783935547, "learning_rate": 7.103141642644256e-07, "loss": 0.0915, "num_input_tokens_seen": 4909760, "step": 7270 }, { "epoch": 0.1777294603376249, "grad_norm": 34.7686882019043, "learning_rate": 7.108027556554453e-07, "loss": 0.1682, "num_input_tokens_seen": 4912768, "step": 7275 }, { "epoch": 0.17785161116947207, "grad_norm": 12.762410163879395, "learning_rate": 7.11291347046465e-07, "loss": 0.0714, "num_input_tokens_seen": 4915968, "step": 7280 }, { "epoch": 0.17797376200131923, "grad_norm": 1.6703814268112183, "learning_rate": 7.117799384374848e-07, "loss": 0.211, "num_input_tokens_seen": 4918976, "step": 7285 }, { "epoch": 0.1780959128331664, "grad_norm": 40.02713394165039, "learning_rate": 7.122685298285043e-07, "loss": 0.1074, "num_input_tokens_seen": 4922816, "step": 7290 }, { "epoch": 0.17821806366501355, "grad_norm": 34.362022399902344, "learning_rate": 7.12757121219524e-07, "loss": 0.1798, "num_input_tokens_seen": 4925824, "step": 7295 }, { "epoch": 0.17834021449686072, "grad_norm": 26.875831604003906, "learning_rate": 7.132457126105438e-07, "loss": 0.1859, "num_input_tokens_seen": 4929664, "step": 7300 }, { "epoch": 0.17846236532870788, "grad_norm": 1.096500039100647, "learning_rate": 7.137343040015634e-07, "loss": 0.0412, "num_input_tokens_seen": 4933184, "step": 7305 }, { "epoch": 0.17858451616055504, "grad_norm": 4.698089122772217, "learning_rate": 7.142228953925832e-07, "loss": 0.0465, "num_input_tokens_seen": 4936320, "step": 7310 }, { "epoch": 0.17870666699240223, "grad_norm": 1.2416880130767822, "learning_rate": 7.147114867836028e-07, "loss": 0.0908, "num_input_tokens_seen": 4940160, "step": 7315 }, { "epoch": 0.1788288178242494, "grad_norm": 24.97760772705078, "learning_rate": 7.152000781746225e-07, "loss": 0.1059, "num_input_tokens_seen": 4943680, "step": 7320 }, { "epoch": 0.17895096865609655, "grad_norm": 31.947988510131836, "learning_rate": 7.156886695656422e-07, "loss": 0.0785, "num_input_tokens_seen": 4946560, "step": 7325 }, { "epoch": 0.17907311948794372, "grad_norm": 35.131752014160156, "learning_rate": 7.161772609566619e-07, "loss": 0.1813, "num_input_tokens_seen": 4949760, "step": 7330 }, { "epoch": 0.17919527031979088, "grad_norm": 58.12864685058594, "learning_rate": 7.166658523476816e-07, "loss": 0.0774, "num_input_tokens_seen": 4953024, "step": 7335 }, { "epoch": 0.17931742115163804, "grad_norm": 18.54363441467285, "learning_rate": 7.171544437387013e-07, "loss": 0.095, "num_input_tokens_seen": 4956672, "step": 7340 }, { "epoch": 0.1794395719834852, "grad_norm": 24.689943313598633, "learning_rate": 7.17643035129721e-07, "loss": 0.2202, "num_input_tokens_seen": 4959936, "step": 7345 }, { "epoch": 0.17956172281533236, "grad_norm": 6.816656589508057, "learning_rate": 7.181316265207406e-07, "loss": 0.0811, "num_input_tokens_seen": 4963392, "step": 7350 }, { "epoch": 0.17968387364717953, "grad_norm": 19.98248863220215, "learning_rate": 7.186202179117603e-07, "loss": 0.2623, "num_input_tokens_seen": 4966720, "step": 7355 }, { "epoch": 0.17980602447902672, "grad_norm": 33.617252349853516, "learning_rate": 7.191088093027801e-07, "loss": 0.1531, "num_input_tokens_seen": 4969792, "step": 7360 }, { "epoch": 0.17992817531087388, "grad_norm": 56.086578369140625, "learning_rate": 7.195974006937997e-07, "loss": 0.1272, "num_input_tokens_seen": 4973184, "step": 7365 }, { "epoch": 0.18005032614272104, "grad_norm": 20.100421905517578, "learning_rate": 7.200859920848194e-07, "loss": 0.1201, "num_input_tokens_seen": 4976192, "step": 7370 }, { "epoch": 0.1801724769745682, "grad_norm": 47.291358947753906, "learning_rate": 7.205745834758391e-07, "loss": 0.1259, "num_input_tokens_seen": 4979584, "step": 7375 }, { "epoch": 0.18029462780641536, "grad_norm": 32.62343978881836, "learning_rate": 7.210631748668588e-07, "loss": 0.1405, "num_input_tokens_seen": 4982784, "step": 7380 }, { "epoch": 0.18041677863826253, "grad_norm": 2.495743751525879, "learning_rate": 7.215517662578785e-07, "loss": 0.1383, "num_input_tokens_seen": 4986240, "step": 7385 }, { "epoch": 0.1805389294701097, "grad_norm": 27.57878875732422, "learning_rate": 7.220403576488982e-07, "loss": 0.133, "num_input_tokens_seen": 4989760, "step": 7390 }, { "epoch": 0.18066108030195685, "grad_norm": 31.568477630615234, "learning_rate": 7.225289490399179e-07, "loss": 0.1669, "num_input_tokens_seen": 4993216, "step": 7395 }, { "epoch": 0.180783231133804, "grad_norm": 13.473135948181152, "learning_rate": 7.230175404309375e-07, "loss": 0.2285, "num_input_tokens_seen": 4996672, "step": 7400 }, { "epoch": 0.18090538196565117, "grad_norm": 23.307954788208008, "learning_rate": 7.235061318219573e-07, "loss": 0.1672, "num_input_tokens_seen": 5000320, "step": 7405 }, { "epoch": 0.18102753279749836, "grad_norm": 23.983013153076172, "learning_rate": 7.239947232129769e-07, "loss": 0.0913, "num_input_tokens_seen": 5003392, "step": 7410 }, { "epoch": 0.18114968362934553, "grad_norm": 13.432177543640137, "learning_rate": 7.244833146039967e-07, "loss": 0.1249, "num_input_tokens_seen": 5006528, "step": 7415 }, { "epoch": 0.1812718344611927, "grad_norm": 22.402496337890625, "learning_rate": 7.249719059950164e-07, "loss": 0.1705, "num_input_tokens_seen": 5009536, "step": 7420 }, { "epoch": 0.18139398529303985, "grad_norm": 9.332347869873047, "learning_rate": 7.254604973860359e-07, "loss": 0.1101, "num_input_tokens_seen": 5013184, "step": 7425 }, { "epoch": 0.181516136124887, "grad_norm": 2.7800843715667725, "learning_rate": 7.259490887770557e-07, "loss": 0.0673, "num_input_tokens_seen": 5016320, "step": 7430 }, { "epoch": 0.18163828695673417, "grad_norm": 5.731550216674805, "learning_rate": 7.264376801680754e-07, "loss": 0.085, "num_input_tokens_seen": 5019456, "step": 7435 }, { "epoch": 0.18176043778858134, "grad_norm": 32.029327392578125, "learning_rate": 7.269262715590951e-07, "loss": 0.1711, "num_input_tokens_seen": 5022720, "step": 7440 }, { "epoch": 0.1818825886204285, "grad_norm": 22.998117446899414, "learning_rate": 7.274148629501148e-07, "loss": 0.0828, "num_input_tokens_seen": 5025664, "step": 7445 }, { "epoch": 0.18200473945227566, "grad_norm": 44.311195373535156, "learning_rate": 7.279034543411345e-07, "loss": 0.1175, "num_input_tokens_seen": 5029120, "step": 7450 }, { "epoch": 0.18212689028412282, "grad_norm": 2.4467546939849854, "learning_rate": 7.283920457321541e-07, "loss": 0.0589, "num_input_tokens_seen": 5032448, "step": 7455 }, { "epoch": 0.18224904111597, "grad_norm": 17.926509857177734, "learning_rate": 7.288806371231738e-07, "loss": 0.1039, "num_input_tokens_seen": 5035456, "step": 7460 }, { "epoch": 0.18237119194781717, "grad_norm": 19.704965591430664, "learning_rate": 7.293692285141936e-07, "loss": 0.2372, "num_input_tokens_seen": 5038656, "step": 7465 }, { "epoch": 0.18249334277966434, "grad_norm": 6.128961086273193, "learning_rate": 7.298578199052132e-07, "loss": 0.1839, "num_input_tokens_seen": 5042304, "step": 7470 }, { "epoch": 0.1826154936115115, "grad_norm": 25.417865753173828, "learning_rate": 7.30346411296233e-07, "loss": 0.1081, "num_input_tokens_seen": 5045440, "step": 7475 }, { "epoch": 0.18273764444335866, "grad_norm": 18.716413497924805, "learning_rate": 7.308350026872526e-07, "loss": 0.2091, "num_input_tokens_seen": 5048576, "step": 7480 }, { "epoch": 0.18285979527520582, "grad_norm": 13.868701934814453, "learning_rate": 7.313235940782722e-07, "loss": 0.0761, "num_input_tokens_seen": 5051840, "step": 7485 }, { "epoch": 0.18298194610705298, "grad_norm": 10.58272647857666, "learning_rate": 7.31812185469292e-07, "loss": 0.2031, "num_input_tokens_seen": 5054976, "step": 7490 }, { "epoch": 0.18310409693890015, "grad_norm": 54.09087371826172, "learning_rate": 7.323007768603117e-07, "loss": 0.1034, "num_input_tokens_seen": 5058560, "step": 7495 }, { "epoch": 0.1832262477707473, "grad_norm": 26.250953674316406, "learning_rate": 7.327893682513314e-07, "loss": 0.0831, "num_input_tokens_seen": 5061568, "step": 7500 }, { "epoch": 0.18334839860259447, "grad_norm": 13.708479881286621, "learning_rate": 7.332779596423511e-07, "loss": 0.0543, "num_input_tokens_seen": 5064768, "step": 7505 }, { "epoch": 0.18347054943444166, "grad_norm": 16.8136043548584, "learning_rate": 7.337665510333707e-07, "loss": 0.0832, "num_input_tokens_seen": 5070336, "step": 7510 }, { "epoch": 0.18359270026628882, "grad_norm": 10.423450469970703, "learning_rate": 7.342551424243904e-07, "loss": 0.0929, "num_input_tokens_seen": 5073472, "step": 7515 }, { "epoch": 0.18371485109813598, "grad_norm": 22.869081497192383, "learning_rate": 7.347437338154101e-07, "loss": 0.0853, "num_input_tokens_seen": 5076992, "step": 7520 }, { "epoch": 0.18383700192998315, "grad_norm": 16.256441116333008, "learning_rate": 7.352323252064299e-07, "loss": 0.1682, "num_input_tokens_seen": 5080320, "step": 7525 }, { "epoch": 0.1839591527618303, "grad_norm": 29.337858200073242, "learning_rate": 7.357209165974495e-07, "loss": 0.1089, "num_input_tokens_seen": 5083584, "step": 7530 }, { "epoch": 0.18408130359367747, "grad_norm": 25.76587677001953, "learning_rate": 7.362095079884692e-07, "loss": 0.2105, "num_input_tokens_seen": 5086912, "step": 7535 }, { "epoch": 0.18420345442552463, "grad_norm": 4.57034969329834, "learning_rate": 7.366980993794889e-07, "loss": 0.1018, "num_input_tokens_seen": 5090240, "step": 7540 }, { "epoch": 0.1843256052573718, "grad_norm": 26.5098819732666, "learning_rate": 7.371866907705085e-07, "loss": 0.1192, "num_input_tokens_seen": 5093632, "step": 7545 }, { "epoch": 0.18444775608921896, "grad_norm": 34.24437713623047, "learning_rate": 7.376752821615283e-07, "loss": 0.1163, "num_input_tokens_seen": 5097792, "step": 7550 }, { "epoch": 0.18456990692106615, "grad_norm": 28.638696670532227, "learning_rate": 7.38163873552548e-07, "loss": 0.098, "num_input_tokens_seen": 5100992, "step": 7555 }, { "epoch": 0.1846920577529133, "grad_norm": 23.46084213256836, "learning_rate": 7.386524649435677e-07, "loss": 0.1937, "num_input_tokens_seen": 5104128, "step": 7560 }, { "epoch": 0.18481420858476047, "grad_norm": 30.672029495239258, "learning_rate": 7.391410563345873e-07, "loss": 0.1587, "num_input_tokens_seen": 5107328, "step": 7565 }, { "epoch": 0.18493635941660763, "grad_norm": 42.915653228759766, "learning_rate": 7.39629647725607e-07, "loss": 0.117, "num_input_tokens_seen": 5110272, "step": 7570 }, { "epoch": 0.1850585102484548, "grad_norm": 18.438732147216797, "learning_rate": 7.401182391166267e-07, "loss": 0.1317, "num_input_tokens_seen": 5113792, "step": 7575 }, { "epoch": 0.18518066108030196, "grad_norm": 16.258872985839844, "learning_rate": 7.406068305076464e-07, "loss": 0.2001, "num_input_tokens_seen": 5116992, "step": 7580 }, { "epoch": 0.18530281191214912, "grad_norm": 33.11271286010742, "learning_rate": 7.410954218986662e-07, "loss": 0.1577, "num_input_tokens_seen": 5120192, "step": 7585 }, { "epoch": 0.18542496274399628, "grad_norm": 23.325136184692383, "learning_rate": 7.415840132896857e-07, "loss": 0.0709, "num_input_tokens_seen": 5123776, "step": 7590 }, { "epoch": 0.18554711357584344, "grad_norm": 12.562117576599121, "learning_rate": 7.420726046807055e-07, "loss": 0.1228, "num_input_tokens_seen": 5127360, "step": 7595 }, { "epoch": 0.1856692644076906, "grad_norm": 7.058622360229492, "learning_rate": 7.425611960717252e-07, "loss": 0.0538, "num_input_tokens_seen": 5130432, "step": 7600 }, { "epoch": 0.1857914152395378, "grad_norm": 46.326969146728516, "learning_rate": 7.430497874627448e-07, "loss": 0.0803, "num_input_tokens_seen": 5133632, "step": 7605 }, { "epoch": 0.18591356607138496, "grad_norm": 26.77501106262207, "learning_rate": 7.435383788537646e-07, "loss": 0.1216, "num_input_tokens_seen": 5136576, "step": 7610 }, { "epoch": 0.18603571690323212, "grad_norm": 12.72427749633789, "learning_rate": 7.440269702447843e-07, "loss": 0.1015, "num_input_tokens_seen": 5139840, "step": 7615 }, { "epoch": 0.18615786773507928, "grad_norm": 3.829493522644043, "learning_rate": 7.445155616358039e-07, "loss": 0.0851, "num_input_tokens_seen": 5142976, "step": 7620 }, { "epoch": 0.18628001856692644, "grad_norm": 7.829679489135742, "learning_rate": 7.450041530268236e-07, "loss": 0.0948, "num_input_tokens_seen": 5146368, "step": 7625 }, { "epoch": 0.1864021693987736, "grad_norm": 56.75333023071289, "learning_rate": 7.454927444178434e-07, "loss": 0.1405, "num_input_tokens_seen": 5150080, "step": 7630 }, { "epoch": 0.18652432023062077, "grad_norm": 32.06376647949219, "learning_rate": 7.45981335808863e-07, "loss": 0.1601, "num_input_tokens_seen": 5153280, "step": 7635 }, { "epoch": 0.18664647106246793, "grad_norm": 19.36145782470703, "learning_rate": 7.464699271998827e-07, "loss": 0.1544, "num_input_tokens_seen": 5156032, "step": 7640 }, { "epoch": 0.1867686218943151, "grad_norm": 17.868488311767578, "learning_rate": 7.469585185909024e-07, "loss": 0.0794, "num_input_tokens_seen": 5159296, "step": 7645 }, { "epoch": 0.18689077272616225, "grad_norm": 43.9721794128418, "learning_rate": 7.47447109981922e-07, "loss": 0.172, "num_input_tokens_seen": 5162752, "step": 7650 }, { "epoch": 0.18701292355800944, "grad_norm": 4.353318691253662, "learning_rate": 7.479357013729418e-07, "loss": 0.3807, "num_input_tokens_seen": 5166080, "step": 7655 }, { "epoch": 0.1871350743898566, "grad_norm": 7.278509616851807, "learning_rate": 7.484242927639615e-07, "loss": 0.1094, "num_input_tokens_seen": 5169536, "step": 7660 }, { "epoch": 0.18725722522170377, "grad_norm": 7.651449203491211, "learning_rate": 7.489128841549811e-07, "loss": 0.0831, "num_input_tokens_seen": 5172928, "step": 7665 }, { "epoch": 0.18737937605355093, "grad_norm": 22.87514877319336, "learning_rate": 7.494014755460009e-07, "loss": 0.1211, "num_input_tokens_seen": 5176448, "step": 7670 }, { "epoch": 0.1875015268853981, "grad_norm": 21.99827766418457, "learning_rate": 7.498900669370205e-07, "loss": 0.2272, "num_input_tokens_seen": 5180160, "step": 7675 }, { "epoch": 0.18762367771724525, "grad_norm": 59.72435760498047, "learning_rate": 7.503786583280402e-07, "loss": 0.1483, "num_input_tokens_seen": 5183360, "step": 7680 }, { "epoch": 0.18774582854909241, "grad_norm": 13.781630516052246, "learning_rate": 7.508672497190599e-07, "loss": 0.1039, "num_input_tokens_seen": 5187264, "step": 7685 }, { "epoch": 0.18786797938093958, "grad_norm": 10.817276954650879, "learning_rate": 7.513558411100797e-07, "loss": 0.0943, "num_input_tokens_seen": 5190528, "step": 7690 }, { "epoch": 0.18799013021278674, "grad_norm": 44.46306610107422, "learning_rate": 7.518444325010993e-07, "loss": 0.1337, "num_input_tokens_seen": 5193536, "step": 7695 }, { "epoch": 0.18811228104463393, "grad_norm": 20.650449752807617, "learning_rate": 7.523330238921189e-07, "loss": 0.0767, "num_input_tokens_seen": 5197056, "step": 7700 }, { "epoch": 0.1882344318764811, "grad_norm": 19.215862274169922, "learning_rate": 7.528216152831387e-07, "loss": 0.1273, "num_input_tokens_seen": 5200832, "step": 7705 }, { "epoch": 0.18835658270832825, "grad_norm": 41.045352935791016, "learning_rate": 7.533102066741583e-07, "loss": 0.1268, "num_input_tokens_seen": 5204352, "step": 7710 }, { "epoch": 0.1884787335401754, "grad_norm": 38.370384216308594, "learning_rate": 7.537987980651781e-07, "loss": 0.1765, "num_input_tokens_seen": 5207488, "step": 7715 }, { "epoch": 0.18860088437202258, "grad_norm": 6.754038333892822, "learning_rate": 7.542873894561978e-07, "loss": 0.0459, "num_input_tokens_seen": 5211008, "step": 7720 }, { "epoch": 0.18872303520386974, "grad_norm": 3.6415226459503174, "learning_rate": 7.547759808472174e-07, "loss": 0.1811, "num_input_tokens_seen": 5214400, "step": 7725 }, { "epoch": 0.1888451860357169, "grad_norm": 53.11391830444336, "learning_rate": 7.552645722382371e-07, "loss": 0.1476, "num_input_tokens_seen": 5217536, "step": 7730 }, { "epoch": 0.18896733686756406, "grad_norm": 7.384988784790039, "learning_rate": 7.557531636292568e-07, "loss": 0.0754, "num_input_tokens_seen": 5220800, "step": 7735 }, { "epoch": 0.18908948769941122, "grad_norm": 43.585693359375, "learning_rate": 7.562417550202765e-07, "loss": 0.1234, "num_input_tokens_seen": 5224064, "step": 7740 }, { "epoch": 0.18921163853125839, "grad_norm": 5.436645984649658, "learning_rate": 7.567303464112962e-07, "loss": 0.124, "num_input_tokens_seen": 5227712, "step": 7745 }, { "epoch": 0.18933378936310558, "grad_norm": 9.57922649383545, "learning_rate": 7.57218937802316e-07, "loss": 0.0938, "num_input_tokens_seen": 5231168, "step": 7750 }, { "epoch": 0.18945594019495274, "grad_norm": 41.084922790527344, "learning_rate": 7.577075291933355e-07, "loss": 0.0911, "num_input_tokens_seen": 5234432, "step": 7755 }, { "epoch": 0.1895780910267999, "grad_norm": 32.1443977355957, "learning_rate": 7.581961205843552e-07, "loss": 0.1401, "num_input_tokens_seen": 5237696, "step": 7760 }, { "epoch": 0.18970024185864706, "grad_norm": 42.15285110473633, "learning_rate": 7.58684711975375e-07, "loss": 0.0962, "num_input_tokens_seen": 5243264, "step": 7765 }, { "epoch": 0.18982239269049422, "grad_norm": 3.755997657775879, "learning_rate": 7.591733033663946e-07, "loss": 0.1051, "num_input_tokens_seen": 5246272, "step": 7770 }, { "epoch": 0.18994454352234139, "grad_norm": 33.203025817871094, "learning_rate": 7.596618947574144e-07, "loss": 0.1633, "num_input_tokens_seen": 5249664, "step": 7775 }, { "epoch": 0.19006669435418855, "grad_norm": 22.799898147583008, "learning_rate": 7.601504861484341e-07, "loss": 0.1516, "num_input_tokens_seen": 5253056, "step": 7780 }, { "epoch": 0.1901888451860357, "grad_norm": 8.654549598693848, "learning_rate": 7.606390775394536e-07, "loss": 0.0891, "num_input_tokens_seen": 5256448, "step": 7785 }, { "epoch": 0.19031099601788287, "grad_norm": 58.40709686279297, "learning_rate": 7.611276689304734e-07, "loss": 0.1017, "num_input_tokens_seen": 5259712, "step": 7790 }, { "epoch": 0.19043314684973003, "grad_norm": 1.5954786539077759, "learning_rate": 7.616162603214931e-07, "loss": 0.039, "num_input_tokens_seen": 5263040, "step": 7795 }, { "epoch": 0.19055529768157722, "grad_norm": 2.5210037231445312, "learning_rate": 7.621048517125128e-07, "loss": 0.1871, "num_input_tokens_seen": 5266432, "step": 7800 }, { "epoch": 0.19067744851342439, "grad_norm": 43.504371643066406, "learning_rate": 7.625934431035325e-07, "loss": 0.2188, "num_input_tokens_seen": 5269952, "step": 7805 }, { "epoch": 0.19079959934527155, "grad_norm": 25.404308319091797, "learning_rate": 7.630820344945523e-07, "loss": 0.052, "num_input_tokens_seen": 5273088, "step": 7810 }, { "epoch": 0.1909217501771187, "grad_norm": 55.287723541259766, "learning_rate": 7.635706258855718e-07, "loss": 0.2614, "num_input_tokens_seen": 5276800, "step": 7815 }, { "epoch": 0.19104390100896587, "grad_norm": 1.3680720329284668, "learning_rate": 7.640592172765915e-07, "loss": 0.1203, "num_input_tokens_seen": 5280384, "step": 7820 }, { "epoch": 0.19116605184081303, "grad_norm": 66.30671691894531, "learning_rate": 7.645478086676113e-07, "loss": 0.1966, "num_input_tokens_seen": 5283584, "step": 7825 }, { "epoch": 0.1912882026726602, "grad_norm": 0.8676133155822754, "learning_rate": 7.650364000586309e-07, "loss": 0.0947, "num_input_tokens_seen": 5286976, "step": 7830 }, { "epoch": 0.19141035350450736, "grad_norm": 34.26639175415039, "learning_rate": 7.655249914496507e-07, "loss": 0.1253, "num_input_tokens_seen": 5290432, "step": 7835 }, { "epoch": 0.19153250433635452, "grad_norm": 48.03748321533203, "learning_rate": 7.660135828406703e-07, "loss": 0.1757, "num_input_tokens_seen": 5294144, "step": 7840 }, { "epoch": 0.19165465516820168, "grad_norm": 33.33087158203125, "learning_rate": 7.6650217423169e-07, "loss": 0.1266, "num_input_tokens_seen": 5297344, "step": 7845 }, { "epoch": 0.19177680600004887, "grad_norm": 46.95682907104492, "learning_rate": 7.669907656227097e-07, "loss": 0.0449, "num_input_tokens_seen": 5300992, "step": 7850 }, { "epoch": 0.19189895683189603, "grad_norm": 1.6643356084823608, "learning_rate": 7.674793570137294e-07, "loss": 0.1091, "num_input_tokens_seen": 5304384, "step": 7855 }, { "epoch": 0.1920211076637432, "grad_norm": 60.34025192260742, "learning_rate": 7.679679484047491e-07, "loss": 0.3198, "num_input_tokens_seen": 5307840, "step": 7860 }, { "epoch": 0.19214325849559036, "grad_norm": 35.65785598754883, "learning_rate": 7.684565397957687e-07, "loss": 0.2131, "num_input_tokens_seen": 5310720, "step": 7865 }, { "epoch": 0.19226540932743752, "grad_norm": 35.110530853271484, "learning_rate": 7.689451311867885e-07, "loss": 0.2205, "num_input_tokens_seen": 5314304, "step": 7870 }, { "epoch": 0.19238756015928468, "grad_norm": 13.194342613220215, "learning_rate": 7.694337225778081e-07, "loss": 0.1237, "num_input_tokens_seen": 5317376, "step": 7875 }, { "epoch": 0.19250971099113184, "grad_norm": 26.445087432861328, "learning_rate": 7.699223139688278e-07, "loss": 0.0954, "num_input_tokens_seen": 5320704, "step": 7880 }, { "epoch": 0.192631861822979, "grad_norm": 2.860102891921997, "learning_rate": 7.704109053598476e-07, "loss": 0.085, "num_input_tokens_seen": 5323776, "step": 7885 }, { "epoch": 0.19275401265482617, "grad_norm": 9.533358573913574, "learning_rate": 7.708994967508672e-07, "loss": 0.1251, "num_input_tokens_seen": 5326912, "step": 7890 }, { "epoch": 0.19287616348667336, "grad_norm": 22.648160934448242, "learning_rate": 7.713880881418869e-07, "loss": 0.1403, "num_input_tokens_seen": 5330176, "step": 7895 }, { "epoch": 0.19299831431852052, "grad_norm": 25.47882652282715, "learning_rate": 7.718766795329066e-07, "loss": 0.2507, "num_input_tokens_seen": 5333376, "step": 7900 }, { "epoch": 0.19312046515036768, "grad_norm": 14.389740943908691, "learning_rate": 7.723652709239263e-07, "loss": 0.1507, "num_input_tokens_seen": 5336896, "step": 7905 }, { "epoch": 0.19324261598221484, "grad_norm": 27.832984924316406, "learning_rate": 7.72853862314946e-07, "loss": 0.1085, "num_input_tokens_seen": 5340032, "step": 7910 }, { "epoch": 0.193364766814062, "grad_norm": 37.951759338378906, "learning_rate": 7.733424537059657e-07, "loss": 0.1381, "num_input_tokens_seen": 5343104, "step": 7915 }, { "epoch": 0.19348691764590917, "grad_norm": 20.33961296081543, "learning_rate": 7.738310450969853e-07, "loss": 0.0414, "num_input_tokens_seen": 5346752, "step": 7920 }, { "epoch": 0.19360906847775633, "grad_norm": 3.8267455101013184, "learning_rate": 7.74319636488005e-07, "loss": 0.1919, "num_input_tokens_seen": 5350016, "step": 7925 }, { "epoch": 0.1937312193096035, "grad_norm": 40.605690002441406, "learning_rate": 7.748082278790248e-07, "loss": 0.1732, "num_input_tokens_seen": 5353216, "step": 7930 }, { "epoch": 0.19385337014145065, "grad_norm": 41.19668197631836, "learning_rate": 7.752968192700444e-07, "loss": 0.3179, "num_input_tokens_seen": 5356864, "step": 7935 }, { "epoch": 0.19397552097329782, "grad_norm": 25.50882911682129, "learning_rate": 7.757854106610641e-07, "loss": 0.0838, "num_input_tokens_seen": 5360192, "step": 7940 }, { "epoch": 0.194097671805145, "grad_norm": 9.193754196166992, "learning_rate": 7.762740020520839e-07, "loss": 0.1119, "num_input_tokens_seen": 5363648, "step": 7945 }, { "epoch": 0.19421982263699217, "grad_norm": 44.46554946899414, "learning_rate": 7.767625934431034e-07, "loss": 0.1827, "num_input_tokens_seen": 5366528, "step": 7950 }, { "epoch": 0.19434197346883933, "grad_norm": 22.7675838470459, "learning_rate": 7.772511848341232e-07, "loss": 0.1905, "num_input_tokens_seen": 5370112, "step": 7955 }, { "epoch": 0.1944641243006865, "grad_norm": 22.128459930419922, "learning_rate": 7.777397762251429e-07, "loss": 0.0704, "num_input_tokens_seen": 5373760, "step": 7960 }, { "epoch": 0.19458627513253365, "grad_norm": 28.93324089050293, "learning_rate": 7.782283676161626e-07, "loss": 0.153, "num_input_tokens_seen": 5379520, "step": 7965 }, { "epoch": 0.19470842596438082, "grad_norm": 16.455291748046875, "learning_rate": 7.787169590071823e-07, "loss": 0.0725, "num_input_tokens_seen": 5382464, "step": 7970 }, { "epoch": 0.19483057679622798, "grad_norm": 40.39324951171875, "learning_rate": 7.792055503982019e-07, "loss": 0.1389, "num_input_tokens_seen": 5385792, "step": 7975 }, { "epoch": 0.19495272762807514, "grad_norm": 30.548707962036133, "learning_rate": 7.796941417892216e-07, "loss": 0.1474, "num_input_tokens_seen": 5389120, "step": 7980 }, { "epoch": 0.1950748784599223, "grad_norm": 21.405261993408203, "learning_rate": 7.801827331802413e-07, "loss": 0.0684, "num_input_tokens_seen": 5392576, "step": 7985 }, { "epoch": 0.19519702929176946, "grad_norm": 27.85494613647461, "learning_rate": 7.806713245712611e-07, "loss": 0.1549, "num_input_tokens_seen": 5396160, "step": 7990 }, { "epoch": 0.19531918012361665, "grad_norm": 14.183422088623047, "learning_rate": 7.811599159622807e-07, "loss": 0.0496, "num_input_tokens_seen": 5399424, "step": 7995 }, { "epoch": 0.19544133095546382, "grad_norm": 17.311738967895508, "learning_rate": 7.816485073533004e-07, "loss": 0.1259, "num_input_tokens_seen": 5402752, "step": 8000 }, { "epoch": 0.19556348178731098, "grad_norm": 29.035099029541016, "learning_rate": 7.821370987443201e-07, "loss": 0.0967, "num_input_tokens_seen": 5405952, "step": 8005 }, { "epoch": 0.19568563261915814, "grad_norm": 26.67888069152832, "learning_rate": 7.826256901353397e-07, "loss": 0.1424, "num_input_tokens_seen": 5409472, "step": 8010 }, { "epoch": 0.1958077834510053, "grad_norm": 42.170406341552734, "learning_rate": 7.831142815263595e-07, "loss": 0.0976, "num_input_tokens_seen": 5412800, "step": 8015 }, { "epoch": 0.19592993428285246, "grad_norm": 73.1128158569336, "learning_rate": 7.836028729173792e-07, "loss": 0.133, "num_input_tokens_seen": 5416256, "step": 8020 }, { "epoch": 0.19605208511469963, "grad_norm": 34.78676223754883, "learning_rate": 7.840914643083989e-07, "loss": 0.3602, "num_input_tokens_seen": 5419520, "step": 8025 }, { "epoch": 0.1961742359465468, "grad_norm": 8.561169624328613, "learning_rate": 7.845800556994185e-07, "loss": 0.1214, "num_input_tokens_seen": 5422976, "step": 8030 }, { "epoch": 0.19629638677839395, "grad_norm": 32.9570198059082, "learning_rate": 7.850686470904382e-07, "loss": 0.0961, "num_input_tokens_seen": 5426176, "step": 8035 }, { "epoch": 0.19641853761024114, "grad_norm": 28.533235549926758, "learning_rate": 7.855572384814579e-07, "loss": 0.282, "num_input_tokens_seen": 5429696, "step": 8040 }, { "epoch": 0.1965406884420883, "grad_norm": 22.516508102416992, "learning_rate": 7.860458298724776e-07, "loss": 0.0883, "num_input_tokens_seen": 5432832, "step": 8045 }, { "epoch": 0.19666283927393546, "grad_norm": 3.684361696243286, "learning_rate": 7.865344212634974e-07, "loss": 0.0787, "num_input_tokens_seen": 5436160, "step": 8050 }, { "epoch": 0.19678499010578263, "grad_norm": 19.151451110839844, "learning_rate": 7.87023012654517e-07, "loss": 0.137, "num_input_tokens_seen": 5439168, "step": 8055 }, { "epoch": 0.1969071409376298, "grad_norm": 2.493832588195801, "learning_rate": 7.875116040455367e-07, "loss": 0.0778, "num_input_tokens_seen": 5442496, "step": 8060 }, { "epoch": 0.19702929176947695, "grad_norm": 58.97475051879883, "learning_rate": 7.880001954365564e-07, "loss": 0.1975, "num_input_tokens_seen": 5446144, "step": 8065 }, { "epoch": 0.1971514426013241, "grad_norm": 29.96076011657715, "learning_rate": 7.88488786827576e-07, "loss": 0.2025, "num_input_tokens_seen": 5449408, "step": 8070 }, { "epoch": 0.19727359343317127, "grad_norm": 4.311563014984131, "learning_rate": 7.889773782185958e-07, "loss": 0.1698, "num_input_tokens_seen": 5453056, "step": 8075 }, { "epoch": 0.19739574426501844, "grad_norm": 56.86328887939453, "learning_rate": 7.894659696096155e-07, "loss": 0.3359, "num_input_tokens_seen": 5456064, "step": 8080 }, { "epoch": 0.1975178950968656, "grad_norm": 14.863036155700684, "learning_rate": 7.899545610006351e-07, "loss": 0.1443, "num_input_tokens_seen": 5459264, "step": 8085 }, { "epoch": 0.1976400459287128, "grad_norm": 10.164009094238281, "learning_rate": 7.904431523916548e-07, "loss": 0.0892, "num_input_tokens_seen": 5462976, "step": 8090 }, { "epoch": 0.19776219676055995, "grad_norm": 29.270055770874023, "learning_rate": 7.909317437826745e-07, "loss": 0.2065, "num_input_tokens_seen": 5466304, "step": 8095 }, { "epoch": 0.1978843475924071, "grad_norm": 28.83013343811035, "learning_rate": 7.914203351736942e-07, "loss": 0.0773, "num_input_tokens_seen": 5469632, "step": 8100 }, { "epoch": 0.19800649842425427, "grad_norm": 17.059755325317383, "learning_rate": 7.919089265647139e-07, "loss": 0.0448, "num_input_tokens_seen": 5472768, "step": 8105 }, { "epoch": 0.19812864925610144, "grad_norm": 13.976848602294922, "learning_rate": 7.923975179557337e-07, "loss": 0.1155, "num_input_tokens_seen": 5476032, "step": 8110 }, { "epoch": 0.1982508000879486, "grad_norm": 19.26152229309082, "learning_rate": 7.928861093467532e-07, "loss": 0.2373, "num_input_tokens_seen": 5479296, "step": 8115 }, { "epoch": 0.19837295091979576, "grad_norm": 13.917434692382812, "learning_rate": 7.93374700737773e-07, "loss": 0.217, "num_input_tokens_seen": 5483072, "step": 8120 }, { "epoch": 0.19849510175164292, "grad_norm": 2.5597445964813232, "learning_rate": 7.938632921287927e-07, "loss": 0.084, "num_input_tokens_seen": 5486208, "step": 8125 }, { "epoch": 0.19861725258349008, "grad_norm": 13.927988052368164, "learning_rate": 7.943518835198123e-07, "loss": 0.0706, "num_input_tokens_seen": 5489280, "step": 8130 }, { "epoch": 0.19873940341533725, "grad_norm": 14.691224098205566, "learning_rate": 7.948404749108321e-07, "loss": 0.1443, "num_input_tokens_seen": 5492608, "step": 8135 }, { "epoch": 0.19886155424718444, "grad_norm": 16.61770248413086, "learning_rate": 7.953290663018517e-07, "loss": 0.1016, "num_input_tokens_seen": 5495744, "step": 8140 }, { "epoch": 0.1989837050790316, "grad_norm": 13.769593238830566, "learning_rate": 7.958176576928714e-07, "loss": 0.0974, "num_input_tokens_seen": 5499520, "step": 8145 }, { "epoch": 0.19910585591087876, "grad_norm": 47.58414077758789, "learning_rate": 7.963062490838911e-07, "loss": 0.096, "num_input_tokens_seen": 5502976, "step": 8150 }, { "epoch": 0.19922800674272592, "grad_norm": 0.6372967958450317, "learning_rate": 7.967948404749108e-07, "loss": 0.1639, "num_input_tokens_seen": 5505984, "step": 8155 }, { "epoch": 0.19935015757457308, "grad_norm": 17.84566879272461, "learning_rate": 7.972834318659305e-07, "loss": 0.0544, "num_input_tokens_seen": 5509760, "step": 8160 }, { "epoch": 0.19947230840642025, "grad_norm": 43.51220703125, "learning_rate": 7.977720232569502e-07, "loss": 0.2632, "num_input_tokens_seen": 5512704, "step": 8165 }, { "epoch": 0.1995944592382674, "grad_norm": 0.9529390335083008, "learning_rate": 7.982606146479699e-07, "loss": 0.0972, "num_input_tokens_seen": 5516096, "step": 8170 }, { "epoch": 0.19971661007011457, "grad_norm": 18.115739822387695, "learning_rate": 7.987492060389895e-07, "loss": 0.2251, "num_input_tokens_seen": 5519872, "step": 8175 }, { "epoch": 0.19983876090196173, "grad_norm": 41.57394027709961, "learning_rate": 7.992377974300093e-07, "loss": 0.1235, "num_input_tokens_seen": 5523264, "step": 8180 }, { "epoch": 0.1999609117338089, "grad_norm": 18.623090744018555, "learning_rate": 7.99726388821029e-07, "loss": 0.083, "num_input_tokens_seen": 5526272, "step": 8185 }, { "epoch": 0.20008306256565608, "grad_norm": 1.6735551357269287, "learning_rate": 8.002149802120486e-07, "loss": 0.1533, "num_input_tokens_seen": 5529792, "step": 8190 }, { "epoch": 0.20020521339750325, "grad_norm": 70.54412078857422, "learning_rate": 8.007035716030683e-07, "loss": 0.092, "num_input_tokens_seen": 5532992, "step": 8195 }, { "epoch": 0.2003273642293504, "grad_norm": 29.043832778930664, "learning_rate": 8.01192162994088e-07, "loss": 0.0971, "num_input_tokens_seen": 5536640, "step": 8200 }, { "epoch": 0.20044951506119757, "grad_norm": 6.162371635437012, "learning_rate": 8.016807543851077e-07, "loss": 0.0859, "num_input_tokens_seen": 5539968, "step": 8205 }, { "epoch": 0.20057166589304473, "grad_norm": 37.265079498291016, "learning_rate": 8.021693457761274e-07, "loss": 0.1228, "num_input_tokens_seen": 5542912, "step": 8210 }, { "epoch": 0.2006938167248919, "grad_norm": 34.13664245605469, "learning_rate": 8.026579371671471e-07, "loss": 0.1139, "num_input_tokens_seen": 5545728, "step": 8215 }, { "epoch": 0.20081596755673906, "grad_norm": 19.852344512939453, "learning_rate": 8.031465285581668e-07, "loss": 0.1316, "num_input_tokens_seen": 5548864, "step": 8220 }, { "epoch": 0.20093811838858622, "grad_norm": 9.395726203918457, "learning_rate": 8.036351199491864e-07, "loss": 0.1198, "num_input_tokens_seen": 5551744, "step": 8225 }, { "epoch": 0.20106026922043338, "grad_norm": 10.440095901489258, "learning_rate": 8.041237113402062e-07, "loss": 0.1258, "num_input_tokens_seen": 5555968, "step": 8230 }, { "epoch": 0.20118242005228057, "grad_norm": 50.22675704956055, "learning_rate": 8.046123027312258e-07, "loss": 0.1186, "num_input_tokens_seen": 5559360, "step": 8235 }, { "epoch": 0.20130457088412773, "grad_norm": 38.31979751586914, "learning_rate": 8.051008941222456e-07, "loss": 0.0886, "num_input_tokens_seen": 5563008, "step": 8240 }, { "epoch": 0.2014267217159749, "grad_norm": 8.49152946472168, "learning_rate": 8.055894855132653e-07, "loss": 0.113, "num_input_tokens_seen": 5566464, "step": 8245 }, { "epoch": 0.20154887254782206, "grad_norm": 7.730495929718018, "learning_rate": 8.060780769042848e-07, "loss": 0.0882, "num_input_tokens_seen": 5569408, "step": 8250 }, { "epoch": 0.20167102337966922, "grad_norm": 35.468379974365234, "learning_rate": 8.065666682953046e-07, "loss": 0.1449, "num_input_tokens_seen": 5573056, "step": 8255 }, { "epoch": 0.20179317421151638, "grad_norm": 20.88719367980957, "learning_rate": 8.070552596863243e-07, "loss": 0.1259, "num_input_tokens_seen": 5576512, "step": 8260 }, { "epoch": 0.20191532504336354, "grad_norm": 21.56291389465332, "learning_rate": 8.07543851077344e-07, "loss": 0.2285, "num_input_tokens_seen": 5579584, "step": 8265 }, { "epoch": 0.2020374758752107, "grad_norm": 21.869901657104492, "learning_rate": 8.080324424683637e-07, "loss": 0.1879, "num_input_tokens_seen": 5582784, "step": 8270 }, { "epoch": 0.20215962670705787, "grad_norm": 5.118806838989258, "learning_rate": 8.085210338593834e-07, "loss": 0.1162, "num_input_tokens_seen": 5585920, "step": 8275 }, { "epoch": 0.20228177753890503, "grad_norm": 27.39159393310547, "learning_rate": 8.09009625250403e-07, "loss": 0.1593, "num_input_tokens_seen": 5589504, "step": 8280 }, { "epoch": 0.20240392837075222, "grad_norm": 5.3895158767700195, "learning_rate": 8.094982166414227e-07, "loss": 0.082, "num_input_tokens_seen": 5592576, "step": 8285 }, { "epoch": 0.20252607920259938, "grad_norm": 16.00442123413086, "learning_rate": 8.099868080324425e-07, "loss": 0.1413, "num_input_tokens_seen": 5595648, "step": 8290 }, { "epoch": 0.20264823003444654, "grad_norm": 14.546874046325684, "learning_rate": 8.104753994234621e-07, "loss": 0.079, "num_input_tokens_seen": 5599104, "step": 8295 }, { "epoch": 0.2027703808662937, "grad_norm": 0.4530252516269684, "learning_rate": 8.109639908144819e-07, "loss": 0.0501, "num_input_tokens_seen": 5602432, "step": 8300 }, { "epoch": 0.20289253169814087, "grad_norm": 21.29576301574707, "learning_rate": 8.114525822055015e-07, "loss": 0.1506, "num_input_tokens_seen": 5605504, "step": 8305 }, { "epoch": 0.20301468252998803, "grad_norm": 0.7760035991668701, "learning_rate": 8.119411735965211e-07, "loss": 0.1131, "num_input_tokens_seen": 5609216, "step": 8310 }, { "epoch": 0.2031368333618352, "grad_norm": 23.73055648803711, "learning_rate": 8.124297649875409e-07, "loss": 0.1505, "num_input_tokens_seen": 5613248, "step": 8315 }, { "epoch": 0.20325898419368235, "grad_norm": 26.038301467895508, "learning_rate": 8.129183563785606e-07, "loss": 0.0719, "num_input_tokens_seen": 5616832, "step": 8320 }, { "epoch": 0.2033811350255295, "grad_norm": 73.4818115234375, "learning_rate": 8.134069477695803e-07, "loss": 0.1667, "num_input_tokens_seen": 5620800, "step": 8325 }, { "epoch": 0.20350328585737668, "grad_norm": 30.03678321838379, "learning_rate": 8.138955391606e-07, "loss": 0.1361, "num_input_tokens_seen": 5624448, "step": 8330 }, { "epoch": 0.20362543668922387, "grad_norm": 28.63085174560547, "learning_rate": 8.143841305516197e-07, "loss": 0.1627, "num_input_tokens_seen": 5627712, "step": 8335 }, { "epoch": 0.20374758752107103, "grad_norm": 0.3213491141796112, "learning_rate": 8.148727219426393e-07, "loss": 0.1031, "num_input_tokens_seen": 5631040, "step": 8340 }, { "epoch": 0.2038697383529182, "grad_norm": 21.672334671020508, "learning_rate": 8.15361313333659e-07, "loss": 0.1729, "num_input_tokens_seen": 5634624, "step": 8345 }, { "epoch": 0.20399188918476535, "grad_norm": 3.710036277770996, "learning_rate": 8.158499047246788e-07, "loss": 0.1207, "num_input_tokens_seen": 5638080, "step": 8350 }, { "epoch": 0.2041140400166125, "grad_norm": 7.503905296325684, "learning_rate": 8.163384961156984e-07, "loss": 0.1336, "num_input_tokens_seen": 5641536, "step": 8355 }, { "epoch": 0.20423619084845968, "grad_norm": 3.1992719173431396, "learning_rate": 8.168270875067181e-07, "loss": 0.0766, "num_input_tokens_seen": 5644544, "step": 8360 }, { "epoch": 0.20435834168030684, "grad_norm": 21.4776554107666, "learning_rate": 8.173156788977378e-07, "loss": 0.1265, "num_input_tokens_seen": 5647936, "step": 8365 }, { "epoch": 0.204480492512154, "grad_norm": 23.008224487304688, "learning_rate": 8.178042702887574e-07, "loss": 0.1076, "num_input_tokens_seen": 5651584, "step": 8370 }, { "epoch": 0.20460264334400116, "grad_norm": 12.869095802307129, "learning_rate": 8.182928616797772e-07, "loss": 0.0794, "num_input_tokens_seen": 5654784, "step": 8375 }, { "epoch": 0.20472479417584835, "grad_norm": 51.182090759277344, "learning_rate": 8.187814530707969e-07, "loss": 0.2906, "num_input_tokens_seen": 5657984, "step": 8380 }, { "epoch": 0.2048469450076955, "grad_norm": 38.773929595947266, "learning_rate": 8.192700444618166e-07, "loss": 0.1992, "num_input_tokens_seen": 5661248, "step": 8385 }, { "epoch": 0.20496909583954268, "grad_norm": 45.63922882080078, "learning_rate": 8.197586358528362e-07, "loss": 0.1088, "num_input_tokens_seen": 5664384, "step": 8390 }, { "epoch": 0.20509124667138984, "grad_norm": 16.322813034057617, "learning_rate": 8.20247227243856e-07, "loss": 0.1534, "num_input_tokens_seen": 5667840, "step": 8395 }, { "epoch": 0.205213397503237, "grad_norm": 20.92977523803711, "learning_rate": 8.207358186348756e-07, "loss": 0.082, "num_input_tokens_seen": 5670848, "step": 8400 }, { "epoch": 0.20533554833508416, "grad_norm": 22.75577735900879, "learning_rate": 8.212244100258953e-07, "loss": 0.1136, "num_input_tokens_seen": 5674176, "step": 8405 }, { "epoch": 0.20545769916693132, "grad_norm": 18.192934036254883, "learning_rate": 8.217130014169151e-07, "loss": 0.1473, "num_input_tokens_seen": 5677376, "step": 8410 }, { "epoch": 0.20557984999877849, "grad_norm": 11.31889820098877, "learning_rate": 8.222015928079346e-07, "loss": 0.083, "num_input_tokens_seen": 5680640, "step": 8415 }, { "epoch": 0.20570200083062565, "grad_norm": 13.893218040466309, "learning_rate": 8.226901841989544e-07, "loss": 0.1466, "num_input_tokens_seen": 5684288, "step": 8420 }, { "epoch": 0.2058241516624728, "grad_norm": 16.788461685180664, "learning_rate": 8.231787755899741e-07, "loss": 0.1573, "num_input_tokens_seen": 5687680, "step": 8425 }, { "epoch": 0.20594630249432, "grad_norm": 25.608034133911133, "learning_rate": 8.236673669809937e-07, "loss": 0.0939, "num_input_tokens_seen": 5691328, "step": 8430 }, { "epoch": 0.20606845332616716, "grad_norm": 59.523643493652344, "learning_rate": 8.241559583720135e-07, "loss": 0.1198, "num_input_tokens_seen": 5694336, "step": 8435 }, { "epoch": 0.20619060415801432, "grad_norm": 38.94904327392578, "learning_rate": 8.246445497630332e-07, "loss": 0.2146, "num_input_tokens_seen": 5697344, "step": 8440 }, { "epoch": 0.20631275498986149, "grad_norm": 10.258160591125488, "learning_rate": 8.251331411540528e-07, "loss": 0.1408, "num_input_tokens_seen": 5700800, "step": 8445 }, { "epoch": 0.20643490582170865, "grad_norm": 13.627341270446777, "learning_rate": 8.256217325450725e-07, "loss": 0.1243, "num_input_tokens_seen": 5704640, "step": 8450 }, { "epoch": 0.2065570566535558, "grad_norm": 15.296594619750977, "learning_rate": 8.261103239360923e-07, "loss": 0.1146, "num_input_tokens_seen": 5708352, "step": 8455 }, { "epoch": 0.20667920748540297, "grad_norm": 19.404726028442383, "learning_rate": 8.265989153271119e-07, "loss": 0.1354, "num_input_tokens_seen": 5711744, "step": 8460 }, { "epoch": 0.20680135831725013, "grad_norm": 28.34406280517578, "learning_rate": 8.270875067181316e-07, "loss": 0.1455, "num_input_tokens_seen": 5714880, "step": 8465 }, { "epoch": 0.2069235091490973, "grad_norm": 4.979307651519775, "learning_rate": 8.275760981091513e-07, "loss": 0.0521, "num_input_tokens_seen": 5718208, "step": 8470 }, { "epoch": 0.20704565998094446, "grad_norm": 20.692550659179688, "learning_rate": 8.280646895001709e-07, "loss": 0.0445, "num_input_tokens_seen": 5721408, "step": 8475 }, { "epoch": 0.20716781081279165, "grad_norm": 1.2701551914215088, "learning_rate": 8.285532808911907e-07, "loss": 0.1252, "num_input_tokens_seen": 5724800, "step": 8480 }, { "epoch": 0.2072899616446388, "grad_norm": 67.5052719116211, "learning_rate": 8.290418722822104e-07, "loss": 0.2579, "num_input_tokens_seen": 5728000, "step": 8485 }, { "epoch": 0.20741211247648597, "grad_norm": 1.723806619644165, "learning_rate": 8.2953046367323e-07, "loss": 0.0754, "num_input_tokens_seen": 5731456, "step": 8490 }, { "epoch": 0.20753426330833313, "grad_norm": 11.588358879089355, "learning_rate": 8.300190550642498e-07, "loss": 0.1841, "num_input_tokens_seen": 5734400, "step": 8495 }, { "epoch": 0.2076564141401803, "grad_norm": 41.95431900024414, "learning_rate": 8.305076464552694e-07, "loss": 0.1131, "num_input_tokens_seen": 5737920, "step": 8500 }, { "epoch": 0.20777856497202746, "grad_norm": 33.22675704956055, "learning_rate": 8.309962378462891e-07, "loss": 0.1254, "num_input_tokens_seen": 5741056, "step": 8505 }, { "epoch": 0.20790071580387462, "grad_norm": 40.123233795166016, "learning_rate": 8.314848292373088e-07, "loss": 0.1324, "num_input_tokens_seen": 5744576, "step": 8510 }, { "epoch": 0.20802286663572178, "grad_norm": 12.709420204162598, "learning_rate": 8.319734206283286e-07, "loss": 0.197, "num_input_tokens_seen": 5748480, "step": 8515 }, { "epoch": 0.20814501746756894, "grad_norm": 0.15196305513381958, "learning_rate": 8.324620120193482e-07, "loss": 0.1648, "num_input_tokens_seen": 5752000, "step": 8520 }, { "epoch": 0.2082671682994161, "grad_norm": 0.7214540243148804, "learning_rate": 8.329506034103678e-07, "loss": 0.2212, "num_input_tokens_seen": 5755648, "step": 8525 }, { "epoch": 0.2083893191312633, "grad_norm": 31.200590133666992, "learning_rate": 8.334391948013876e-07, "loss": 0.2119, "num_input_tokens_seen": 5759040, "step": 8530 }, { "epoch": 0.20851146996311046, "grad_norm": 51.559303283691406, "learning_rate": 8.339277861924072e-07, "loss": 0.0682, "num_input_tokens_seen": 5762496, "step": 8535 }, { "epoch": 0.20863362079495762, "grad_norm": 9.00422191619873, "learning_rate": 8.34416377583427e-07, "loss": 0.0486, "num_input_tokens_seen": 5765312, "step": 8540 }, { "epoch": 0.20875577162680478, "grad_norm": 5.614874362945557, "learning_rate": 8.349049689744467e-07, "loss": 0.1615, "num_input_tokens_seen": 5768768, "step": 8545 }, { "epoch": 0.20887792245865194, "grad_norm": 13.906744003295898, "learning_rate": 8.353935603654664e-07, "loss": 0.0643, "num_input_tokens_seen": 5771968, "step": 8550 }, { "epoch": 0.2090000732904991, "grad_norm": 0.11603455245494843, "learning_rate": 8.35882151756486e-07, "loss": 0.2121, "num_input_tokens_seen": 5775296, "step": 8555 }, { "epoch": 0.20912222412234627, "grad_norm": 29.86203956604004, "learning_rate": 8.363707431475056e-07, "loss": 0.0623, "num_input_tokens_seen": 5778048, "step": 8560 }, { "epoch": 0.20924437495419343, "grad_norm": 14.665985107421875, "learning_rate": 8.368593345385254e-07, "loss": 0.1852, "num_input_tokens_seen": 5781312, "step": 8565 }, { "epoch": 0.2093665257860406, "grad_norm": 11.448308944702148, "learning_rate": 8.373479259295451e-07, "loss": 0.0873, "num_input_tokens_seen": 5784704, "step": 8570 }, { "epoch": 0.20948867661788778, "grad_norm": 13.049193382263184, "learning_rate": 8.378365173205649e-07, "loss": 0.0896, "num_input_tokens_seen": 5788224, "step": 8575 }, { "epoch": 0.20961082744973494, "grad_norm": 22.919639587402344, "learning_rate": 8.383251087115844e-07, "loss": 0.1468, "num_input_tokens_seen": 5791488, "step": 8580 }, { "epoch": 0.2097329782815821, "grad_norm": 15.393905639648438, "learning_rate": 8.388137001026041e-07, "loss": 0.1301, "num_input_tokens_seen": 5794752, "step": 8585 }, { "epoch": 0.20985512911342927, "grad_norm": 7.1114583015441895, "learning_rate": 8.393022914936239e-07, "loss": 0.077, "num_input_tokens_seen": 5798336, "step": 8590 }, { "epoch": 0.20997727994527643, "grad_norm": 32.48029327392578, "learning_rate": 8.397908828846435e-07, "loss": 0.1719, "num_input_tokens_seen": 5802048, "step": 8595 }, { "epoch": 0.2100994307771236, "grad_norm": 10.279617309570312, "learning_rate": 8.402794742756633e-07, "loss": 0.1771, "num_input_tokens_seen": 5805312, "step": 8600 }, { "epoch": 0.21022158160897075, "grad_norm": 6.890974521636963, "learning_rate": 8.407680656666829e-07, "loss": 0.102, "num_input_tokens_seen": 5808448, "step": 8605 }, { "epoch": 0.21034373244081792, "grad_norm": 29.157899856567383, "learning_rate": 8.412566570577026e-07, "loss": 0.1211, "num_input_tokens_seen": 5811776, "step": 8610 }, { "epoch": 0.21046588327266508, "grad_norm": 11.824888229370117, "learning_rate": 8.417452484487223e-07, "loss": 0.0809, "num_input_tokens_seen": 5815232, "step": 8615 }, { "epoch": 0.21058803410451224, "grad_norm": 16.32293128967285, "learning_rate": 8.422338398397419e-07, "loss": 0.0449, "num_input_tokens_seen": 5818560, "step": 8620 }, { "epoch": 0.21071018493635943, "grad_norm": 36.444828033447266, "learning_rate": 8.427224312307617e-07, "loss": 0.1412, "num_input_tokens_seen": 5822272, "step": 8625 }, { "epoch": 0.2108323357682066, "grad_norm": 23.423480987548828, "learning_rate": 8.432110226217814e-07, "loss": 0.1022, "num_input_tokens_seen": 5825600, "step": 8630 }, { "epoch": 0.21095448660005375, "grad_norm": 36.47949981689453, "learning_rate": 8.436996140128011e-07, "loss": 0.1595, "num_input_tokens_seen": 5828672, "step": 8635 }, { "epoch": 0.21107663743190092, "grad_norm": 20.2095890045166, "learning_rate": 8.441882054038207e-07, "loss": 0.0977, "num_input_tokens_seen": 5831552, "step": 8640 }, { "epoch": 0.21119878826374808, "grad_norm": 9.84154224395752, "learning_rate": 8.446767967948404e-07, "loss": 0.1556, "num_input_tokens_seen": 5834368, "step": 8645 }, { "epoch": 0.21132093909559524, "grad_norm": 1.6117609739303589, "learning_rate": 8.451653881858601e-07, "loss": 0.2125, "num_input_tokens_seen": 5837440, "step": 8650 }, { "epoch": 0.2114430899274424, "grad_norm": 2.4990859031677246, "learning_rate": 8.456539795768798e-07, "loss": 0.1083, "num_input_tokens_seen": 5840640, "step": 8655 }, { "epoch": 0.21156524075928956, "grad_norm": 28.441207885742188, "learning_rate": 8.461425709678996e-07, "loss": 0.1269, "num_input_tokens_seen": 5843712, "step": 8660 }, { "epoch": 0.21168739159113673, "grad_norm": 1.2140380144119263, "learning_rate": 8.466311623589191e-07, "loss": 0.1657, "num_input_tokens_seen": 5847104, "step": 8665 }, { "epoch": 0.2118095424229839, "grad_norm": 21.97617530822754, "learning_rate": 8.471197537499389e-07, "loss": 0.0945, "num_input_tokens_seen": 5850432, "step": 8670 }, { "epoch": 0.21193169325483108, "grad_norm": 46.13754653930664, "learning_rate": 8.476083451409586e-07, "loss": 0.0999, "num_input_tokens_seen": 5853824, "step": 8675 }, { "epoch": 0.21205384408667824, "grad_norm": 11.853911399841309, "learning_rate": 8.480969365319782e-07, "loss": 0.0518, "num_input_tokens_seen": 5857472, "step": 8680 }, { "epoch": 0.2121759949185254, "grad_norm": 21.628971099853516, "learning_rate": 8.48585527922998e-07, "loss": 0.1515, "num_input_tokens_seen": 5861312, "step": 8685 }, { "epoch": 0.21229814575037256, "grad_norm": 21.702733993530273, "learning_rate": 8.490741193140176e-07, "loss": 0.1207, "num_input_tokens_seen": 5864832, "step": 8690 }, { "epoch": 0.21242029658221973, "grad_norm": 22.620609283447266, "learning_rate": 8.495627107050373e-07, "loss": 0.0963, "num_input_tokens_seen": 5867776, "step": 8695 }, { "epoch": 0.2125424474140669, "grad_norm": 21.320842742919922, "learning_rate": 8.50051302096057e-07, "loss": 0.1344, "num_input_tokens_seen": 5870848, "step": 8700 }, { "epoch": 0.21266459824591405, "grad_norm": 10.63657283782959, "learning_rate": 8.505398934870767e-07, "loss": 0.1278, "num_input_tokens_seen": 5874560, "step": 8705 }, { "epoch": 0.2127867490777612, "grad_norm": 21.64299964904785, "learning_rate": 8.510284848780964e-07, "loss": 0.098, "num_input_tokens_seen": 5877824, "step": 8710 }, { "epoch": 0.21290889990960837, "grad_norm": 32.38628387451172, "learning_rate": 8.51517076269116e-07, "loss": 0.1258, "num_input_tokens_seen": 5880704, "step": 8715 }, { "epoch": 0.21303105074145554, "grad_norm": 31.72425651550293, "learning_rate": 8.520056676601358e-07, "loss": 0.1202, "num_input_tokens_seen": 5883904, "step": 8720 }, { "epoch": 0.21315320157330273, "grad_norm": 10.176417350769043, "learning_rate": 8.524942590511554e-07, "loss": 0.0826, "num_input_tokens_seen": 5887296, "step": 8725 }, { "epoch": 0.2132753524051499, "grad_norm": 32.1301383972168, "learning_rate": 8.529828504421752e-07, "loss": 0.0917, "num_input_tokens_seen": 5890304, "step": 8730 }, { "epoch": 0.21339750323699705, "grad_norm": 22.305946350097656, "learning_rate": 8.534714418331949e-07, "loss": 0.2655, "num_input_tokens_seen": 5893632, "step": 8735 }, { "epoch": 0.2135196540688442, "grad_norm": 1.0712743997573853, "learning_rate": 8.539600332242145e-07, "loss": 0.0348, "num_input_tokens_seen": 5896960, "step": 8740 }, { "epoch": 0.21364180490069137, "grad_norm": 21.419750213623047, "learning_rate": 8.544486246152342e-07, "loss": 0.0938, "num_input_tokens_seen": 5900224, "step": 8745 }, { "epoch": 0.21376395573253854, "grad_norm": 31.946155548095703, "learning_rate": 8.549372160062539e-07, "loss": 0.1702, "num_input_tokens_seen": 5903360, "step": 8750 }, { "epoch": 0.2138861065643857, "grad_norm": 27.752010345458984, "learning_rate": 8.554258073972736e-07, "loss": 0.1216, "num_input_tokens_seen": 5906432, "step": 8755 }, { "epoch": 0.21400825739623286, "grad_norm": 24.327363967895508, "learning_rate": 8.559143987882933e-07, "loss": 0.0419, "num_input_tokens_seen": 5909568, "step": 8760 }, { "epoch": 0.21413040822808002, "grad_norm": 60.757179260253906, "learning_rate": 8.564029901793131e-07, "loss": 0.1031, "num_input_tokens_seen": 5913280, "step": 8765 }, { "epoch": 0.2142525590599272, "grad_norm": 25.384445190429688, "learning_rate": 8.568915815703327e-07, "loss": 0.0859, "num_input_tokens_seen": 5916352, "step": 8770 }, { "epoch": 0.21437470989177437, "grad_norm": 25.03833770751953, "learning_rate": 8.573801729613523e-07, "loss": 0.1947, "num_input_tokens_seen": 5919232, "step": 8775 }, { "epoch": 0.21449686072362154, "grad_norm": 25.255443572998047, "learning_rate": 8.578687643523721e-07, "loss": 0.2758, "num_input_tokens_seen": 5921984, "step": 8780 }, { "epoch": 0.2146190115554687, "grad_norm": 22.09063720703125, "learning_rate": 8.583573557433917e-07, "loss": 0.1436, "num_input_tokens_seen": 5925504, "step": 8785 }, { "epoch": 0.21474116238731586, "grad_norm": 22.207136154174805, "learning_rate": 8.588459471344115e-07, "loss": 0.0936, "num_input_tokens_seen": 5928384, "step": 8790 }, { "epoch": 0.21486331321916302, "grad_norm": 3.0565848350524902, "learning_rate": 8.593345385254312e-07, "loss": 0.0995, "num_input_tokens_seen": 5931776, "step": 8795 }, { "epoch": 0.21498546405101018, "grad_norm": 28.339794158935547, "learning_rate": 8.598231299164507e-07, "loss": 0.1071, "num_input_tokens_seen": 5934784, "step": 8800 }, { "epoch": 0.21510761488285735, "grad_norm": 34.90781784057617, "learning_rate": 8.603117213074705e-07, "loss": 0.1719, "num_input_tokens_seen": 5938560, "step": 8805 }, { "epoch": 0.2152297657147045, "grad_norm": 7.775395393371582, "learning_rate": 8.608003126984902e-07, "loss": 0.169, "num_input_tokens_seen": 5941824, "step": 8810 }, { "epoch": 0.21535191654655167, "grad_norm": 19.345985412597656, "learning_rate": 8.612889040895099e-07, "loss": 0.2145, "num_input_tokens_seen": 5945152, "step": 8815 }, { "epoch": 0.21547406737839886, "grad_norm": 25.269824981689453, "learning_rate": 8.617774954805296e-07, "loss": 0.1402, "num_input_tokens_seen": 5948992, "step": 8820 }, { "epoch": 0.21559621821024602, "grad_norm": 23.3660888671875, "learning_rate": 8.622660868715494e-07, "loss": 0.1242, "num_input_tokens_seen": 5952256, "step": 8825 }, { "epoch": 0.21571836904209318, "grad_norm": 20.00462532043457, "learning_rate": 8.627546782625689e-07, "loss": 0.1165, "num_input_tokens_seen": 5955648, "step": 8830 }, { "epoch": 0.21584051987394035, "grad_norm": 4.807917594909668, "learning_rate": 8.632432696535886e-07, "loss": 0.0812, "num_input_tokens_seen": 5959040, "step": 8835 }, { "epoch": 0.2159626707057875, "grad_norm": 5.229181289672852, "learning_rate": 8.637318610446084e-07, "loss": 0.1652, "num_input_tokens_seen": 5962560, "step": 8840 }, { "epoch": 0.21608482153763467, "grad_norm": 18.924203872680664, "learning_rate": 8.64220452435628e-07, "loss": 0.1325, "num_input_tokens_seen": 5965952, "step": 8845 }, { "epoch": 0.21620697236948183, "grad_norm": 2.562269926071167, "learning_rate": 8.647090438266478e-07, "loss": 0.0561, "num_input_tokens_seen": 5969280, "step": 8850 }, { "epoch": 0.216329123201329, "grad_norm": 37.25053787231445, "learning_rate": 8.651976352176674e-07, "loss": 0.1161, "num_input_tokens_seen": 5972480, "step": 8855 }, { "epoch": 0.21645127403317616, "grad_norm": 9.465447425842285, "learning_rate": 8.65686226608687e-07, "loss": 0.1002, "num_input_tokens_seen": 5975872, "step": 8860 }, { "epoch": 0.21657342486502332, "grad_norm": 13.03969669342041, "learning_rate": 8.661748179997068e-07, "loss": 0.1086, "num_input_tokens_seen": 5978944, "step": 8865 }, { "epoch": 0.2166955756968705, "grad_norm": 39.545013427734375, "learning_rate": 8.666634093907265e-07, "loss": 0.135, "num_input_tokens_seen": 5982656, "step": 8870 }, { "epoch": 0.21681772652871767, "grad_norm": 22.047517776489258, "learning_rate": 8.671520007817462e-07, "loss": 0.0856, "num_input_tokens_seen": 5985664, "step": 8875 }, { "epoch": 0.21693987736056483, "grad_norm": 0.7058843374252319, "learning_rate": 8.676405921727659e-07, "loss": 0.1775, "num_input_tokens_seen": 5989120, "step": 8880 }, { "epoch": 0.217062028192412, "grad_norm": 29.685516357421875, "learning_rate": 8.681291835637856e-07, "loss": 0.0496, "num_input_tokens_seen": 5992768, "step": 8885 }, { "epoch": 0.21718417902425916, "grad_norm": 30.59242820739746, "learning_rate": 8.686177749548052e-07, "loss": 0.2056, "num_input_tokens_seen": 5996288, "step": 8890 }, { "epoch": 0.21730632985610632, "grad_norm": 7.976591110229492, "learning_rate": 8.691063663458249e-07, "loss": 0.1601, "num_input_tokens_seen": 5999808, "step": 8895 }, { "epoch": 0.21742848068795348, "grad_norm": 39.25209426879883, "learning_rate": 8.695949577368447e-07, "loss": 0.1511, "num_input_tokens_seen": 6002944, "step": 8900 }, { "epoch": 0.21755063151980064, "grad_norm": 27.88641357421875, "learning_rate": 8.700835491278643e-07, "loss": 0.1434, "num_input_tokens_seen": 6006080, "step": 8905 }, { "epoch": 0.2176727823516478, "grad_norm": 1.7723349332809448, "learning_rate": 8.70572140518884e-07, "loss": 0.0495, "num_input_tokens_seen": 6009600, "step": 8910 }, { "epoch": 0.217794933183495, "grad_norm": 25.451271057128906, "learning_rate": 8.710607319099037e-07, "loss": 0.0918, "num_input_tokens_seen": 6012736, "step": 8915 }, { "epoch": 0.21791708401534216, "grad_norm": 6.012026309967041, "learning_rate": 8.715493233009233e-07, "loss": 0.0888, "num_input_tokens_seen": 6016064, "step": 8920 }, { "epoch": 0.21803923484718932, "grad_norm": 3.7859063148498535, "learning_rate": 8.720379146919431e-07, "loss": 0.1266, "num_input_tokens_seen": 6019328, "step": 8925 }, { "epoch": 0.21816138567903648, "grad_norm": 15.164278984069824, "learning_rate": 8.725265060829628e-07, "loss": 0.1356, "num_input_tokens_seen": 6023040, "step": 8930 }, { "epoch": 0.21828353651088364, "grad_norm": 17.575706481933594, "learning_rate": 8.730150974739825e-07, "loss": 0.1061, "num_input_tokens_seen": 6026496, "step": 8935 }, { "epoch": 0.2184056873427308, "grad_norm": 1.0441234111785889, "learning_rate": 8.735036888650021e-07, "loss": 0.0705, "num_input_tokens_seen": 6030080, "step": 8940 }, { "epoch": 0.21852783817457797, "grad_norm": 10.35993766784668, "learning_rate": 8.739922802560219e-07, "loss": 0.105, "num_input_tokens_seen": 6033408, "step": 8945 }, { "epoch": 0.21864998900642513, "grad_norm": 38.489383697509766, "learning_rate": 8.744808716470415e-07, "loss": 0.156, "num_input_tokens_seen": 6037120, "step": 8950 }, { "epoch": 0.2187721398382723, "grad_norm": 4.143792629241943, "learning_rate": 8.749694630380612e-07, "loss": 0.0852, "num_input_tokens_seen": 6040640, "step": 8955 }, { "epoch": 0.21889429067011945, "grad_norm": 48.676483154296875, "learning_rate": 8.75458054429081e-07, "loss": 0.1522, "num_input_tokens_seen": 6044096, "step": 8960 }, { "epoch": 0.21901644150196664, "grad_norm": 34.4107780456543, "learning_rate": 8.759466458201005e-07, "loss": 0.1482, "num_input_tokens_seen": 6047488, "step": 8965 }, { "epoch": 0.2191385923338138, "grad_norm": 29.14084815979004, "learning_rate": 8.764352372111203e-07, "loss": 0.1361, "num_input_tokens_seen": 6051072, "step": 8970 }, { "epoch": 0.21926074316566097, "grad_norm": 26.460193634033203, "learning_rate": 8.7692382860214e-07, "loss": 0.1871, "num_input_tokens_seen": 6054080, "step": 8975 }, { "epoch": 0.21938289399750813, "grad_norm": 5.312205791473389, "learning_rate": 8.774124199931597e-07, "loss": 0.0459, "num_input_tokens_seen": 6057856, "step": 8980 }, { "epoch": 0.2195050448293553, "grad_norm": 21.948001861572266, "learning_rate": 8.779010113841794e-07, "loss": 0.1141, "num_input_tokens_seen": 6061120, "step": 8985 }, { "epoch": 0.21962719566120245, "grad_norm": 34.10026168823242, "learning_rate": 8.78389602775199e-07, "loss": 0.142, "num_input_tokens_seen": 6064512, "step": 8990 }, { "epoch": 0.2197493464930496, "grad_norm": 12.825250625610352, "learning_rate": 8.788781941662187e-07, "loss": 0.0917, "num_input_tokens_seen": 6067904, "step": 8995 }, { "epoch": 0.21987149732489678, "grad_norm": 17.931827545166016, "learning_rate": 8.793667855572384e-07, "loss": 0.0879, "num_input_tokens_seen": 6071040, "step": 9000 }, { "epoch": 0.21999364815674394, "grad_norm": 4.5756731033325195, "learning_rate": 8.798553769482582e-07, "loss": 0.0989, "num_input_tokens_seen": 6074240, "step": 9005 }, { "epoch": 0.2201157989885911, "grad_norm": 31.641714096069336, "learning_rate": 8.803439683392778e-07, "loss": 0.2064, "num_input_tokens_seen": 6077696, "step": 9010 }, { "epoch": 0.2202379498204383, "grad_norm": 20.369218826293945, "learning_rate": 8.808325597302975e-07, "loss": 0.0578, "num_input_tokens_seen": 6080960, "step": 9015 }, { "epoch": 0.22036010065228545, "grad_norm": 21.847148895263672, "learning_rate": 8.813211511213172e-07, "loss": 0.0985, "num_input_tokens_seen": 6084736, "step": 9020 }, { "epoch": 0.2204822514841326, "grad_norm": 8.247864723205566, "learning_rate": 8.818097425123368e-07, "loss": 0.2164, "num_input_tokens_seen": 6087872, "step": 9025 }, { "epoch": 0.22060440231597978, "grad_norm": 17.761096954345703, "learning_rate": 8.822983339033566e-07, "loss": 0.0508, "num_input_tokens_seen": 6091264, "step": 9030 }, { "epoch": 0.22072655314782694, "grad_norm": 25.695043563842773, "learning_rate": 8.827869252943763e-07, "loss": 0.0883, "num_input_tokens_seen": 6094784, "step": 9035 }, { "epoch": 0.2208487039796741, "grad_norm": 25.67939567565918, "learning_rate": 8.83275516685396e-07, "loss": 0.132, "num_input_tokens_seen": 6098560, "step": 9040 }, { "epoch": 0.22097085481152126, "grad_norm": 38.35128402709961, "learning_rate": 8.837641080764157e-07, "loss": 0.1693, "num_input_tokens_seen": 6101952, "step": 9045 }, { "epoch": 0.22109300564336842, "grad_norm": 41.83271789550781, "learning_rate": 8.842526994674353e-07, "loss": 0.1167, "num_input_tokens_seen": 6105280, "step": 9050 }, { "epoch": 0.22121515647521559, "grad_norm": 31.90433120727539, "learning_rate": 8.84741290858455e-07, "loss": 0.1305, "num_input_tokens_seen": 6108224, "step": 9055 }, { "epoch": 0.22133730730706275, "grad_norm": 23.579755783081055, "learning_rate": 8.852298822494747e-07, "loss": 0.1659, "num_input_tokens_seen": 6111488, "step": 9060 }, { "epoch": 0.22145945813890994, "grad_norm": 32.75372314453125, "learning_rate": 8.857184736404945e-07, "loss": 0.1377, "num_input_tokens_seen": 6114752, "step": 9065 }, { "epoch": 0.2215816089707571, "grad_norm": 30.386550903320312, "learning_rate": 8.862070650315141e-07, "loss": 0.13, "num_input_tokens_seen": 6117824, "step": 9070 }, { "epoch": 0.22170375980260426, "grad_norm": 30.29648208618164, "learning_rate": 8.866956564225337e-07, "loss": 0.1427, "num_input_tokens_seen": 6121216, "step": 9075 }, { "epoch": 0.22182591063445142, "grad_norm": 44.06056213378906, "learning_rate": 8.871842478135535e-07, "loss": 0.2194, "num_input_tokens_seen": 6124736, "step": 9080 }, { "epoch": 0.22194806146629859, "grad_norm": 42.379302978515625, "learning_rate": 8.876728392045731e-07, "loss": 0.1909, "num_input_tokens_seen": 6128192, "step": 9085 }, { "epoch": 0.22207021229814575, "grad_norm": 2.4536609649658203, "learning_rate": 8.881614305955929e-07, "loss": 0.0626, "num_input_tokens_seen": 6131456, "step": 9090 }, { "epoch": 0.2221923631299929, "grad_norm": 23.942848205566406, "learning_rate": 8.886500219866126e-07, "loss": 0.1528, "num_input_tokens_seen": 6135104, "step": 9095 }, { "epoch": 0.22231451396184007, "grad_norm": 10.383399963378906, "learning_rate": 8.891386133776323e-07, "loss": 0.0779, "num_input_tokens_seen": 6138560, "step": 9100 }, { "epoch": 0.22243666479368723, "grad_norm": 17.084259033203125, "learning_rate": 8.896272047686519e-07, "loss": 0.168, "num_input_tokens_seen": 6141760, "step": 9105 }, { "epoch": 0.22255881562553442, "grad_norm": 34.821205139160156, "learning_rate": 8.901157961596716e-07, "loss": 0.1326, "num_input_tokens_seen": 6145280, "step": 9110 }, { "epoch": 0.22268096645738159, "grad_norm": 16.09421157836914, "learning_rate": 8.906043875506913e-07, "loss": 0.2119, "num_input_tokens_seen": 6148608, "step": 9115 }, { "epoch": 0.22280311728922875, "grad_norm": 11.02747631072998, "learning_rate": 8.91092978941711e-07, "loss": 0.1082, "num_input_tokens_seen": 6152128, "step": 9120 }, { "epoch": 0.2229252681210759, "grad_norm": 39.81501007080078, "learning_rate": 8.915815703327308e-07, "loss": 0.1069, "num_input_tokens_seen": 6155456, "step": 9125 }, { "epoch": 0.22304741895292307, "grad_norm": 18.303213119506836, "learning_rate": 8.920701617237503e-07, "loss": 0.0835, "num_input_tokens_seen": 6159360, "step": 9130 }, { "epoch": 0.22316956978477023, "grad_norm": 12.419539451599121, "learning_rate": 8.9255875311477e-07, "loss": 0.101, "num_input_tokens_seen": 6162944, "step": 9135 }, { "epoch": 0.2232917206166174, "grad_norm": 4.48032283782959, "learning_rate": 8.930473445057898e-07, "loss": 0.12, "num_input_tokens_seen": 6165952, "step": 9140 }, { "epoch": 0.22341387144846456, "grad_norm": 15.183457374572754, "learning_rate": 8.935359358968094e-07, "loss": 0.1024, "num_input_tokens_seen": 6169600, "step": 9145 }, { "epoch": 0.22353602228031172, "grad_norm": 16.72601318359375, "learning_rate": 8.940245272878292e-07, "loss": 0.0809, "num_input_tokens_seen": 6172992, "step": 9150 }, { "epoch": 0.22365817311215888, "grad_norm": 2.3916523456573486, "learning_rate": 8.945131186788489e-07, "loss": 0.0282, "num_input_tokens_seen": 6176064, "step": 9155 }, { "epoch": 0.22378032394400607, "grad_norm": 25.425764083862305, "learning_rate": 8.950017100698685e-07, "loss": 0.0747, "num_input_tokens_seen": 6179648, "step": 9160 }, { "epoch": 0.22390247477585323, "grad_norm": 0.3232182264328003, "learning_rate": 8.954903014608882e-07, "loss": 0.0868, "num_input_tokens_seen": 6183360, "step": 9165 }, { "epoch": 0.2240246256077004, "grad_norm": 21.200733184814453, "learning_rate": 8.959788928519079e-07, "loss": 0.1656, "num_input_tokens_seen": 6186368, "step": 9170 }, { "epoch": 0.22414677643954756, "grad_norm": 26.374645233154297, "learning_rate": 8.964674842429276e-07, "loss": 0.1054, "num_input_tokens_seen": 6190208, "step": 9175 }, { "epoch": 0.22426892727139472, "grad_norm": 2.9865565299987793, "learning_rate": 8.969560756339473e-07, "loss": 0.1436, "num_input_tokens_seen": 6193728, "step": 9180 }, { "epoch": 0.22439107810324188, "grad_norm": 33.91632080078125, "learning_rate": 8.97444667024967e-07, "loss": 0.1397, "num_input_tokens_seen": 6196864, "step": 9185 }, { "epoch": 0.22451322893508904, "grad_norm": 9.978410720825195, "learning_rate": 8.979332584159866e-07, "loss": 0.0629, "num_input_tokens_seen": 6200384, "step": 9190 }, { "epoch": 0.2246353797669362, "grad_norm": 21.483123779296875, "learning_rate": 8.984218498070063e-07, "loss": 0.1048, "num_input_tokens_seen": 6204032, "step": 9195 }, { "epoch": 0.22475753059878337, "grad_norm": 20.44054412841797, "learning_rate": 8.989104411980261e-07, "loss": 0.0767, "num_input_tokens_seen": 6206848, "step": 9200 }, { "epoch": 0.22487968143063053, "grad_norm": 1.512670874595642, "learning_rate": 8.993990325890457e-07, "loss": 0.083, "num_input_tokens_seen": 6210496, "step": 9205 }, { "epoch": 0.22500183226247772, "grad_norm": 14.777823448181152, "learning_rate": 8.998876239800655e-07, "loss": 0.1987, "num_input_tokens_seen": 6213632, "step": 9210 }, { "epoch": 0.22512398309432488, "grad_norm": 27.269006729125977, "learning_rate": 9.003762153710851e-07, "loss": 0.1313, "num_input_tokens_seen": 6217088, "step": 9215 }, { "epoch": 0.22524613392617204, "grad_norm": 9.171672821044922, "learning_rate": 9.008648067621048e-07, "loss": 0.0453, "num_input_tokens_seen": 6220224, "step": 9220 }, { "epoch": 0.2253682847580192, "grad_norm": 39.435176849365234, "learning_rate": 9.013533981531245e-07, "loss": 0.1343, "num_input_tokens_seen": 6223808, "step": 9225 }, { "epoch": 0.22549043558986637, "grad_norm": 32.36396789550781, "learning_rate": 9.018419895441442e-07, "loss": 0.1499, "num_input_tokens_seen": 6227904, "step": 9230 }, { "epoch": 0.22561258642171353, "grad_norm": 32.112342834472656, "learning_rate": 9.023305809351639e-07, "loss": 0.1896, "num_input_tokens_seen": 6231232, "step": 9235 }, { "epoch": 0.2257347372535607, "grad_norm": 15.149031639099121, "learning_rate": 9.028191723261835e-07, "loss": 0.1545, "num_input_tokens_seen": 6234944, "step": 9240 }, { "epoch": 0.22585688808540785, "grad_norm": 14.708928108215332, "learning_rate": 9.033077637172033e-07, "loss": 0.0667, "num_input_tokens_seen": 6237696, "step": 9245 }, { "epoch": 0.22597903891725502, "grad_norm": 17.376279830932617, "learning_rate": 9.037963551082229e-07, "loss": 0.0615, "num_input_tokens_seen": 6240896, "step": 9250 }, { "epoch": 0.2261011897491022, "grad_norm": 20.528581619262695, "learning_rate": 9.042849464992427e-07, "loss": 0.0874, "num_input_tokens_seen": 6244160, "step": 9255 }, { "epoch": 0.22622334058094937, "grad_norm": 35.168731689453125, "learning_rate": 9.047735378902624e-07, "loss": 0.1483, "num_input_tokens_seen": 6247040, "step": 9260 }, { "epoch": 0.22634549141279653, "grad_norm": 29.01551055908203, "learning_rate": 9.05262129281282e-07, "loss": 0.1543, "num_input_tokens_seen": 6250752, "step": 9265 }, { "epoch": 0.2264676422446437, "grad_norm": 19.882156372070312, "learning_rate": 9.057507206723017e-07, "loss": 0.1016, "num_input_tokens_seen": 6253824, "step": 9270 }, { "epoch": 0.22658979307649085, "grad_norm": 8.662846565246582, "learning_rate": 9.062393120633214e-07, "loss": 0.1943, "num_input_tokens_seen": 6257344, "step": 9275 }, { "epoch": 0.22671194390833802, "grad_norm": 53.7777099609375, "learning_rate": 9.067279034543411e-07, "loss": 0.0592, "num_input_tokens_seen": 6260672, "step": 9280 }, { "epoch": 0.22683409474018518, "grad_norm": 19.83289337158203, "learning_rate": 9.072164948453608e-07, "loss": 0.1493, "num_input_tokens_seen": 6263872, "step": 9285 }, { "epoch": 0.22695624557203234, "grad_norm": 8.226678848266602, "learning_rate": 9.077050862363805e-07, "loss": 0.1311, "num_input_tokens_seen": 6266944, "step": 9290 }, { "epoch": 0.2270783964038795, "grad_norm": 36.27229309082031, "learning_rate": 9.081936776274001e-07, "loss": 0.1872, "num_input_tokens_seen": 6270144, "step": 9295 }, { "epoch": 0.22720054723572666, "grad_norm": 39.83232498168945, "learning_rate": 9.086822690184198e-07, "loss": 0.1492, "num_input_tokens_seen": 6273536, "step": 9300 }, { "epoch": 0.22732269806757385, "grad_norm": 17.2782039642334, "learning_rate": 9.091708604094396e-07, "loss": 0.0562, "num_input_tokens_seen": 6276608, "step": 9305 }, { "epoch": 0.22744484889942101, "grad_norm": 4.376680850982666, "learning_rate": 9.096594518004592e-07, "loss": 0.2263, "num_input_tokens_seen": 6280128, "step": 9310 }, { "epoch": 0.22756699973126818, "grad_norm": 28.051551818847656, "learning_rate": 9.10148043191479e-07, "loss": 0.1597, "num_input_tokens_seen": 6284032, "step": 9315 }, { "epoch": 0.22768915056311534, "grad_norm": 19.270139694213867, "learning_rate": 9.106366345824987e-07, "loss": 0.1385, "num_input_tokens_seen": 6287552, "step": 9320 }, { "epoch": 0.2278113013949625, "grad_norm": 14.105006217956543, "learning_rate": 9.111252259735182e-07, "loss": 0.1459, "num_input_tokens_seen": 6291200, "step": 9325 }, { "epoch": 0.22793345222680966, "grad_norm": 49.273494720458984, "learning_rate": 9.11613817364538e-07, "loss": 0.1791, "num_input_tokens_seen": 6294336, "step": 9330 }, { "epoch": 0.22805560305865683, "grad_norm": 1.47030770778656, "learning_rate": 9.121024087555577e-07, "loss": 0.1285, "num_input_tokens_seen": 6297856, "step": 9335 }, { "epoch": 0.228177753890504, "grad_norm": 25.866708755493164, "learning_rate": 9.125910001465774e-07, "loss": 0.1282, "num_input_tokens_seen": 6301440, "step": 9340 }, { "epoch": 0.22829990472235115, "grad_norm": 13.393543243408203, "learning_rate": 9.130795915375971e-07, "loss": 0.119, "num_input_tokens_seen": 6305024, "step": 9345 }, { "epoch": 0.2284220555541983, "grad_norm": 19.900217056274414, "learning_rate": 9.135681829286167e-07, "loss": 0.0722, "num_input_tokens_seen": 6308224, "step": 9350 }, { "epoch": 0.2285442063860455, "grad_norm": 24.861238479614258, "learning_rate": 9.140567743196364e-07, "loss": 0.1011, "num_input_tokens_seen": 6311808, "step": 9355 }, { "epoch": 0.22866635721789266, "grad_norm": 30.723133087158203, "learning_rate": 9.145453657106561e-07, "loss": 0.1862, "num_input_tokens_seen": 6315200, "step": 9360 }, { "epoch": 0.22878850804973982, "grad_norm": 27.590089797973633, "learning_rate": 9.150339571016759e-07, "loss": 0.0647, "num_input_tokens_seen": 6318592, "step": 9365 }, { "epoch": 0.228910658881587, "grad_norm": 4.14084529876709, "learning_rate": 9.155225484926955e-07, "loss": 0.0721, "num_input_tokens_seen": 6321600, "step": 9370 }, { "epoch": 0.22903280971343415, "grad_norm": 11.999881744384766, "learning_rate": 9.160111398837153e-07, "loss": 0.064, "num_input_tokens_seen": 6324608, "step": 9375 }, { "epoch": 0.2291549605452813, "grad_norm": 20.66971206665039, "learning_rate": 9.164997312747349e-07, "loss": 0.1505, "num_input_tokens_seen": 6327936, "step": 9380 }, { "epoch": 0.22927711137712847, "grad_norm": 42.416725158691406, "learning_rate": 9.169883226657545e-07, "loss": 0.3507, "num_input_tokens_seen": 6330752, "step": 9385 }, { "epoch": 0.22939926220897564, "grad_norm": 27.1995849609375, "learning_rate": 9.174769140567743e-07, "loss": 0.1794, "num_input_tokens_seen": 6334336, "step": 9390 }, { "epoch": 0.2295214130408228, "grad_norm": 10.624309539794922, "learning_rate": 9.17965505447794e-07, "loss": 0.1112, "num_input_tokens_seen": 6337728, "step": 9395 }, { "epoch": 0.22964356387266996, "grad_norm": 19.080080032348633, "learning_rate": 9.184540968388137e-07, "loss": 0.0702, "num_input_tokens_seen": 6340992, "step": 9400 }, { "epoch": 0.22976571470451715, "grad_norm": 5.388887882232666, "learning_rate": 9.189426882298333e-07, "loss": 0.068, "num_input_tokens_seen": 6344000, "step": 9405 }, { "epoch": 0.2298878655363643, "grad_norm": 23.71079444885254, "learning_rate": 9.19431279620853e-07, "loss": 0.1371, "num_input_tokens_seen": 6347200, "step": 9410 }, { "epoch": 0.23001001636821147, "grad_norm": 7.927711009979248, "learning_rate": 9.199198710118727e-07, "loss": 0.1161, "num_input_tokens_seen": 6350848, "step": 9415 }, { "epoch": 0.23013216720005863, "grad_norm": 15.976344108581543, "learning_rate": 9.204084624028924e-07, "loss": 0.1124, "num_input_tokens_seen": 6354496, "step": 9420 }, { "epoch": 0.2302543180319058, "grad_norm": 9.561179161071777, "learning_rate": 9.208970537939122e-07, "loss": 0.054, "num_input_tokens_seen": 6358016, "step": 9425 }, { "epoch": 0.23037646886375296, "grad_norm": 48.035709381103516, "learning_rate": 9.213856451849317e-07, "loss": 0.1934, "num_input_tokens_seen": 6361536, "step": 9430 }, { "epoch": 0.23049861969560012, "grad_norm": 2.9448719024658203, "learning_rate": 9.218742365759515e-07, "loss": 0.1495, "num_input_tokens_seen": 6364544, "step": 9435 }, { "epoch": 0.23062077052744728, "grad_norm": 1.32425856590271, "learning_rate": 9.223628279669712e-07, "loss": 0.1165, "num_input_tokens_seen": 6368832, "step": 9440 }, { "epoch": 0.23074292135929445, "grad_norm": 20.775575637817383, "learning_rate": 9.228514193579908e-07, "loss": 0.0733, "num_input_tokens_seen": 6372160, "step": 9445 }, { "epoch": 0.23086507219114163, "grad_norm": 9.844050407409668, "learning_rate": 9.233400107490106e-07, "loss": 0.136, "num_input_tokens_seen": 6375552, "step": 9450 }, { "epoch": 0.2309872230229888, "grad_norm": 11.708232879638672, "learning_rate": 9.238286021400303e-07, "loss": 0.0873, "num_input_tokens_seen": 6378944, "step": 9455 }, { "epoch": 0.23110937385483596, "grad_norm": 27.45308494567871, "learning_rate": 9.243171935310499e-07, "loss": 0.1288, "num_input_tokens_seen": 6381888, "step": 9460 }, { "epoch": 0.23123152468668312, "grad_norm": 17.817472457885742, "learning_rate": 9.248057849220696e-07, "loss": 0.1454, "num_input_tokens_seen": 6385088, "step": 9465 }, { "epoch": 0.23135367551853028, "grad_norm": 57.0069580078125, "learning_rate": 9.252943763130894e-07, "loss": 0.0999, "num_input_tokens_seen": 6388224, "step": 9470 }, { "epoch": 0.23147582635037744, "grad_norm": 36.93852615356445, "learning_rate": 9.25782967704109e-07, "loss": 0.1227, "num_input_tokens_seen": 6391552, "step": 9475 }, { "epoch": 0.2315979771822246, "grad_norm": 0.25835201144218445, "learning_rate": 9.262715590951287e-07, "loss": 0.0858, "num_input_tokens_seen": 6395200, "step": 9480 }, { "epoch": 0.23172012801407177, "grad_norm": 43.729427337646484, "learning_rate": 9.267601504861485e-07, "loss": 0.0834, "num_input_tokens_seen": 6398976, "step": 9485 }, { "epoch": 0.23184227884591893, "grad_norm": 10.132993698120117, "learning_rate": 9.27248741877168e-07, "loss": 0.1258, "num_input_tokens_seen": 6402304, "step": 9490 }, { "epoch": 0.2319644296777661, "grad_norm": 17.87611198425293, "learning_rate": 9.277373332681878e-07, "loss": 0.1547, "num_input_tokens_seen": 6405696, "step": 9495 }, { "epoch": 0.23208658050961328, "grad_norm": 61.92757034301758, "learning_rate": 9.282259246592075e-07, "loss": 0.269, "num_input_tokens_seen": 6409280, "step": 9500 }, { "epoch": 0.23220873134146044, "grad_norm": 19.06719398498535, "learning_rate": 9.287145160502271e-07, "loss": 0.1537, "num_input_tokens_seen": 6413504, "step": 9505 }, { "epoch": 0.2323308821733076, "grad_norm": 75.03028869628906, "learning_rate": 9.292031074412469e-07, "loss": 0.0875, "num_input_tokens_seen": 6417088, "step": 9510 }, { "epoch": 0.23245303300515477, "grad_norm": 81.23187255859375, "learning_rate": 9.296916988322665e-07, "loss": 0.123, "num_input_tokens_seen": 6420352, "step": 9515 }, { "epoch": 0.23257518383700193, "grad_norm": 17.87434959411621, "learning_rate": 9.301802902232862e-07, "loss": 0.1393, "num_input_tokens_seen": 6424192, "step": 9520 }, { "epoch": 0.2326973346688491, "grad_norm": 40.12156677246094, "learning_rate": 9.306688816143059e-07, "loss": 0.1615, "num_input_tokens_seen": 6427520, "step": 9525 }, { "epoch": 0.23281948550069625, "grad_norm": 2.44599986076355, "learning_rate": 9.311574730053257e-07, "loss": 0.0897, "num_input_tokens_seen": 6431104, "step": 9530 }, { "epoch": 0.23294163633254342, "grad_norm": 11.04211711883545, "learning_rate": 9.316460643963453e-07, "loss": 0.1335, "num_input_tokens_seen": 6434688, "step": 9535 }, { "epoch": 0.23306378716439058, "grad_norm": 46.168495178222656, "learning_rate": 9.32134655787365e-07, "loss": 0.1245, "num_input_tokens_seen": 6438080, "step": 9540 }, { "epoch": 0.23318593799623774, "grad_norm": 14.355307579040527, "learning_rate": 9.326232471783847e-07, "loss": 0.0828, "num_input_tokens_seen": 6441408, "step": 9545 }, { "epoch": 0.23330808882808493, "grad_norm": 24.860210418701172, "learning_rate": 9.331118385694043e-07, "loss": 0.1374, "num_input_tokens_seen": 6444992, "step": 9550 }, { "epoch": 0.2334302396599321, "grad_norm": 32.60927200317383, "learning_rate": 9.336004299604241e-07, "loss": 0.1874, "num_input_tokens_seen": 6448768, "step": 9555 }, { "epoch": 0.23355239049177925, "grad_norm": 6.585495948791504, "learning_rate": 9.340890213514438e-07, "loss": 0.125, "num_input_tokens_seen": 6451840, "step": 9560 }, { "epoch": 0.23367454132362642, "grad_norm": 7.540535926818848, "learning_rate": 9.345776127424634e-07, "loss": 0.1218, "num_input_tokens_seen": 6455296, "step": 9565 }, { "epoch": 0.23379669215547358, "grad_norm": 26.24741554260254, "learning_rate": 9.350662041334831e-07, "loss": 0.1065, "num_input_tokens_seen": 6458496, "step": 9570 }, { "epoch": 0.23391884298732074, "grad_norm": 22.11855697631836, "learning_rate": 9.355547955245028e-07, "loss": 0.1746, "num_input_tokens_seen": 6461760, "step": 9575 }, { "epoch": 0.2340409938191679, "grad_norm": 6.021768093109131, "learning_rate": 9.360433869155225e-07, "loss": 0.0387, "num_input_tokens_seen": 6465472, "step": 9580 }, { "epoch": 0.23416314465101506, "grad_norm": 13.273813247680664, "learning_rate": 9.365319783065422e-07, "loss": 0.1766, "num_input_tokens_seen": 6468544, "step": 9585 }, { "epoch": 0.23428529548286223, "grad_norm": 1.5607789754867554, "learning_rate": 9.37020569697562e-07, "loss": 0.0384, "num_input_tokens_seen": 6472128, "step": 9590 }, { "epoch": 0.23440744631470942, "grad_norm": 28.771175384521484, "learning_rate": 9.375091610885816e-07, "loss": 0.307, "num_input_tokens_seen": 6475392, "step": 9595 }, { "epoch": 0.23452959714655658, "grad_norm": 31.59319496154785, "learning_rate": 9.379977524796012e-07, "loss": 0.159, "num_input_tokens_seen": 6478720, "step": 9600 }, { "epoch": 0.23465174797840374, "grad_norm": 37.83382034301758, "learning_rate": 9.38486343870621e-07, "loss": 0.2908, "num_input_tokens_seen": 6482688, "step": 9605 }, { "epoch": 0.2347738988102509, "grad_norm": 30.60443878173828, "learning_rate": 9.389749352616406e-07, "loss": 0.1376, "num_input_tokens_seen": 6486016, "step": 9610 }, { "epoch": 0.23489604964209806, "grad_norm": 11.49390983581543, "learning_rate": 9.394635266526604e-07, "loss": 0.0965, "num_input_tokens_seen": 6489280, "step": 9615 }, { "epoch": 0.23501820047394523, "grad_norm": 24.10028648376465, "learning_rate": 9.399521180436801e-07, "loss": 0.0924, "num_input_tokens_seen": 6492416, "step": 9620 }, { "epoch": 0.2351403513057924, "grad_norm": 39.35089874267578, "learning_rate": 9.404407094346996e-07, "loss": 0.1259, "num_input_tokens_seen": 6495744, "step": 9625 }, { "epoch": 0.23526250213763955, "grad_norm": 23.8758544921875, "learning_rate": 9.409293008257194e-07, "loss": 0.1663, "num_input_tokens_seen": 6499136, "step": 9630 }, { "epoch": 0.2353846529694867, "grad_norm": 6.008744716644287, "learning_rate": 9.414178922167391e-07, "loss": 0.0544, "num_input_tokens_seen": 6502528, "step": 9635 }, { "epoch": 0.23550680380133387, "grad_norm": 30.769689559936523, "learning_rate": 9.419064836077588e-07, "loss": 0.2107, "num_input_tokens_seen": 6506112, "step": 9640 }, { "epoch": 0.23562895463318106, "grad_norm": 21.367860794067383, "learning_rate": 9.423950749987785e-07, "loss": 0.0687, "num_input_tokens_seen": 6509696, "step": 9645 }, { "epoch": 0.23575110546502823, "grad_norm": 1.8322582244873047, "learning_rate": 9.428836663897983e-07, "loss": 0.0455, "num_input_tokens_seen": 6513408, "step": 9650 }, { "epoch": 0.2358732562968754, "grad_norm": 0.16293790936470032, "learning_rate": 9.433722577808178e-07, "loss": 0.0809, "num_input_tokens_seen": 6516864, "step": 9655 }, { "epoch": 0.23599540712872255, "grad_norm": 19.307575225830078, "learning_rate": 9.438608491718375e-07, "loss": 0.2114, "num_input_tokens_seen": 6519872, "step": 9660 }, { "epoch": 0.2361175579605697, "grad_norm": 1.3225882053375244, "learning_rate": 9.443494405628573e-07, "loss": 0.1159, "num_input_tokens_seen": 6522944, "step": 9665 }, { "epoch": 0.23623970879241687, "grad_norm": 67.67269897460938, "learning_rate": 9.448380319538769e-07, "loss": 0.1237, "num_input_tokens_seen": 6526080, "step": 9670 }, { "epoch": 0.23636185962426404, "grad_norm": 0.4603472948074341, "learning_rate": 9.453266233448967e-07, "loss": 0.0633, "num_input_tokens_seen": 6529536, "step": 9675 }, { "epoch": 0.2364840104561112, "grad_norm": 7.607626914978027, "learning_rate": 9.458152147359163e-07, "loss": 0.1407, "num_input_tokens_seen": 6533056, "step": 9680 }, { "epoch": 0.23660616128795836, "grad_norm": 46.98657989501953, "learning_rate": 9.46303806126936e-07, "loss": 0.1077, "num_input_tokens_seen": 6536384, "step": 9685 }, { "epoch": 0.23672831211980552, "grad_norm": 2.524136781692505, "learning_rate": 9.467923975179557e-07, "loss": 0.0448, "num_input_tokens_seen": 6540160, "step": 9690 }, { "epoch": 0.2368504629516527, "grad_norm": 37.658634185791016, "learning_rate": 9.472809889089754e-07, "loss": 0.239, "num_input_tokens_seen": 6544000, "step": 9695 }, { "epoch": 0.23697261378349987, "grad_norm": 2.243898630142212, "learning_rate": 9.477695802999951e-07, "loss": 0.2482, "num_input_tokens_seen": 6547584, "step": 9700 }, { "epoch": 0.23709476461534704, "grad_norm": 5.683414459228516, "learning_rate": 9.482581716910148e-07, "loss": 0.1092, "num_input_tokens_seen": 6550656, "step": 9705 }, { "epoch": 0.2372169154471942, "grad_norm": 4.624309539794922, "learning_rate": 9.487467630820345e-07, "loss": 0.0269, "num_input_tokens_seen": 6553920, "step": 9710 }, { "epoch": 0.23733906627904136, "grad_norm": 0.224063441157341, "learning_rate": 9.492353544730541e-07, "loss": 0.2107, "num_input_tokens_seen": 6557312, "step": 9715 }, { "epoch": 0.23746121711088852, "grad_norm": 1.4567283391952515, "learning_rate": 9.497239458640738e-07, "loss": 0.1803, "num_input_tokens_seen": 6560512, "step": 9720 }, { "epoch": 0.23758336794273568, "grad_norm": 12.508830070495605, "learning_rate": 9.502125372550936e-07, "loss": 0.0741, "num_input_tokens_seen": 6564480, "step": 9725 }, { "epoch": 0.23770551877458285, "grad_norm": 27.259662628173828, "learning_rate": 9.507011286461132e-07, "loss": 0.1461, "num_input_tokens_seen": 6567808, "step": 9730 }, { "epoch": 0.23782766960643, "grad_norm": 2.4258787631988525, "learning_rate": 9.511897200371329e-07, "loss": 0.0947, "num_input_tokens_seen": 6571200, "step": 9735 }, { "epoch": 0.23794982043827717, "grad_norm": 14.623574256896973, "learning_rate": 9.516783114281526e-07, "loss": 0.0608, "num_input_tokens_seen": 6574912, "step": 9740 }, { "epoch": 0.23807197127012436, "grad_norm": 32.38935852050781, "learning_rate": 9.521669028191723e-07, "loss": 0.1259, "num_input_tokens_seen": 6578432, "step": 9745 }, { "epoch": 0.23819412210197152, "grad_norm": 18.282604217529297, "learning_rate": 9.52655494210192e-07, "loss": 0.1251, "num_input_tokens_seen": 6581760, "step": 9750 }, { "epoch": 0.23831627293381868, "grad_norm": 52.684356689453125, "learning_rate": 9.531440856012117e-07, "loss": 0.0943, "num_input_tokens_seen": 6585280, "step": 9755 }, { "epoch": 0.23843842376566585, "grad_norm": 25.119375228881836, "learning_rate": 9.536326769922314e-07, "loss": 0.0738, "num_input_tokens_seen": 6588416, "step": 9760 }, { "epoch": 0.238560574597513, "grad_norm": 44.266971588134766, "learning_rate": 9.54121268383251e-07, "loss": 0.1411, "num_input_tokens_seen": 6591808, "step": 9765 }, { "epoch": 0.23868272542936017, "grad_norm": 61.14725112915039, "learning_rate": 9.546098597742707e-07, "loss": 0.2692, "num_input_tokens_seen": 6595328, "step": 9770 }, { "epoch": 0.23880487626120733, "grad_norm": 73.36861419677734, "learning_rate": 9.550984511652904e-07, "loss": 0.1692, "num_input_tokens_seen": 6598464, "step": 9775 }, { "epoch": 0.2389270270930545, "grad_norm": 50.22648239135742, "learning_rate": 9.5558704255631e-07, "loss": 0.1712, "num_input_tokens_seen": 6601984, "step": 9780 }, { "epoch": 0.23904917792490166, "grad_norm": 22.295808792114258, "learning_rate": 9.560756339473298e-07, "loss": 0.0982, "num_input_tokens_seen": 6605184, "step": 9785 }, { "epoch": 0.23917132875674885, "grad_norm": 16.143505096435547, "learning_rate": 9.565642253383494e-07, "loss": 0.3098, "num_input_tokens_seen": 6608384, "step": 9790 }, { "epoch": 0.239293479588596, "grad_norm": 2.035536766052246, "learning_rate": 9.570528167293691e-07, "loss": 0.0284, "num_input_tokens_seen": 6611776, "step": 9795 }, { "epoch": 0.23941563042044317, "grad_norm": 17.494781494140625, "learning_rate": 9.575414081203888e-07, "loss": 0.0463, "num_input_tokens_seen": 6615168, "step": 9800 }, { "epoch": 0.23953778125229033, "grad_norm": 23.606449127197266, "learning_rate": 9.580299995114087e-07, "loss": 0.2122, "num_input_tokens_seen": 6618496, "step": 9805 }, { "epoch": 0.2396599320841375, "grad_norm": 13.581524848937988, "learning_rate": 9.585185909024282e-07, "loss": 0.0728, "num_input_tokens_seen": 6621888, "step": 9810 }, { "epoch": 0.23978208291598466, "grad_norm": 25.172103881835938, "learning_rate": 9.590071822934478e-07, "loss": 0.1752, "num_input_tokens_seen": 6625152, "step": 9815 }, { "epoch": 0.23990423374783182, "grad_norm": 19.240901947021484, "learning_rate": 9.594957736844677e-07, "loss": 0.0904, "num_input_tokens_seen": 6628736, "step": 9820 }, { "epoch": 0.24002638457967898, "grad_norm": 14.154614448547363, "learning_rate": 9.599843650754872e-07, "loss": 0.1229, "num_input_tokens_seen": 6632192, "step": 9825 }, { "epoch": 0.24014853541152614, "grad_norm": 8.691808700561523, "learning_rate": 9.60472956466507e-07, "loss": 0.0957, "num_input_tokens_seen": 6635648, "step": 9830 }, { "epoch": 0.2402706862433733, "grad_norm": 70.14839172363281, "learning_rate": 9.609615478575268e-07, "loss": 0.178, "num_input_tokens_seen": 6639104, "step": 9835 }, { "epoch": 0.2403928370752205, "grad_norm": 13.37100887298584, "learning_rate": 9.614501392485463e-07, "loss": 0.2036, "num_input_tokens_seen": 6642752, "step": 9840 }, { "epoch": 0.24051498790706766, "grad_norm": 9.792305946350098, "learning_rate": 9.619387306395661e-07, "loss": 0.0657, "num_input_tokens_seen": 6646208, "step": 9845 }, { "epoch": 0.24063713873891482, "grad_norm": 34.969730377197266, "learning_rate": 9.624273220305858e-07, "loss": 0.122, "num_input_tokens_seen": 6649600, "step": 9850 }, { "epoch": 0.24075928957076198, "grad_norm": 23.51738929748535, "learning_rate": 9.629159134216055e-07, "loss": 0.1153, "num_input_tokens_seen": 6653248, "step": 9855 }, { "epoch": 0.24088144040260914, "grad_norm": 51.84558868408203, "learning_rate": 9.634045048126252e-07, "loss": 0.0749, "num_input_tokens_seen": 6656512, "step": 9860 }, { "epoch": 0.2410035912344563, "grad_norm": 25.279094696044922, "learning_rate": 9.638930962036449e-07, "loss": 0.1768, "num_input_tokens_seen": 6660160, "step": 9865 }, { "epoch": 0.24112574206630347, "grad_norm": 10.345446586608887, "learning_rate": 9.643816875946646e-07, "loss": 0.185, "num_input_tokens_seen": 6663168, "step": 9870 }, { "epoch": 0.24124789289815063, "grad_norm": 51.44864273071289, "learning_rate": 9.648702789856842e-07, "loss": 0.1417, "num_input_tokens_seen": 6666496, "step": 9875 }, { "epoch": 0.2413700437299978, "grad_norm": 39.88957595825195, "learning_rate": 9.65358870376704e-07, "loss": 0.1139, "num_input_tokens_seen": 6669504, "step": 9880 }, { "epoch": 0.24149219456184495, "grad_norm": 2.5328776836395264, "learning_rate": 9.658474617677236e-07, "loss": 0.1812, "num_input_tokens_seen": 6673152, "step": 9885 }, { "epoch": 0.24161434539369214, "grad_norm": 22.786235809326172, "learning_rate": 9.663360531587433e-07, "loss": 0.062, "num_input_tokens_seen": 6676608, "step": 9890 }, { "epoch": 0.2417364962255393, "grad_norm": 24.119340896606445, "learning_rate": 9.66824644549763e-07, "loss": 0.0548, "num_input_tokens_seen": 6679488, "step": 9895 }, { "epoch": 0.24185864705738647, "grad_norm": 40.44359588623047, "learning_rate": 9.673132359407826e-07, "loss": 0.2652, "num_input_tokens_seen": 6682624, "step": 9900 }, { "epoch": 0.24198079788923363, "grad_norm": 2.0219240188598633, "learning_rate": 9.678018273318023e-07, "loss": 0.0769, "num_input_tokens_seen": 6685952, "step": 9905 }, { "epoch": 0.2421029487210808, "grad_norm": 21.13841438293457, "learning_rate": 9.68290418722822e-07, "loss": 0.1698, "num_input_tokens_seen": 6689920, "step": 9910 }, { "epoch": 0.24222509955292795, "grad_norm": 14.74864387512207, "learning_rate": 9.687790101138417e-07, "loss": 0.146, "num_input_tokens_seen": 6693184, "step": 9915 }, { "epoch": 0.24234725038477511, "grad_norm": 2.7721073627471924, "learning_rate": 9.692676015048614e-07, "loss": 0.0271, "num_input_tokens_seen": 6696320, "step": 9920 }, { "epoch": 0.24246940121662228, "grad_norm": 33.85337448120117, "learning_rate": 9.697561928958813e-07, "loss": 0.0984, "num_input_tokens_seen": 6699136, "step": 9925 }, { "epoch": 0.24259155204846944, "grad_norm": 49.046958923339844, "learning_rate": 9.702447842869007e-07, "loss": 0.113, "num_input_tokens_seen": 6703360, "step": 9930 }, { "epoch": 0.24271370288031663, "grad_norm": 9.85345458984375, "learning_rate": 9.707333756779204e-07, "loss": 0.1492, "num_input_tokens_seen": 6706752, "step": 9935 }, { "epoch": 0.2428358537121638, "grad_norm": 17.741451263427734, "learning_rate": 9.712219670689403e-07, "loss": 0.1364, "num_input_tokens_seen": 6710144, "step": 9940 }, { "epoch": 0.24295800454401095, "grad_norm": 5.377196788787842, "learning_rate": 9.717105584599598e-07, "loss": 0.2191, "num_input_tokens_seen": 6713088, "step": 9945 }, { "epoch": 0.24308015537585811, "grad_norm": 43.747596740722656, "learning_rate": 9.721991498509797e-07, "loss": 0.1625, "num_input_tokens_seen": 6716352, "step": 9950 }, { "epoch": 0.24320230620770528, "grad_norm": 29.562347412109375, "learning_rate": 9.726877412419993e-07, "loss": 0.1616, "num_input_tokens_seen": 6719488, "step": 9955 }, { "epoch": 0.24332445703955244, "grad_norm": 11.152497291564941, "learning_rate": 9.73176332633019e-07, "loss": 0.0587, "num_input_tokens_seen": 6722880, "step": 9960 }, { "epoch": 0.2434466078713996, "grad_norm": 11.652654647827148, "learning_rate": 9.736649240240387e-07, "loss": 0.1104, "num_input_tokens_seen": 6726784, "step": 9965 }, { "epoch": 0.24356875870324676, "grad_norm": 15.605438232421875, "learning_rate": 9.741535154150584e-07, "loss": 0.076, "num_input_tokens_seen": 6730560, "step": 9970 }, { "epoch": 0.24369090953509392, "grad_norm": 47.092041015625, "learning_rate": 9.74642106806078e-07, "loss": 0.0844, "num_input_tokens_seen": 6734144, "step": 9975 }, { "epoch": 0.2438130603669411, "grad_norm": 2.1623594760894775, "learning_rate": 9.751306981970978e-07, "loss": 0.0711, "num_input_tokens_seen": 6737536, "step": 9980 }, { "epoch": 0.24393521119878828, "grad_norm": 0.6756454706192017, "learning_rate": 9.756192895881174e-07, "loss": 0.0827, "num_input_tokens_seen": 6740864, "step": 9985 }, { "epoch": 0.24405736203063544, "grad_norm": 36.02912139892578, "learning_rate": 9.761078809791371e-07, "loss": 0.119, "num_input_tokens_seen": 6744256, "step": 9990 }, { "epoch": 0.2441795128624826, "grad_norm": 62.33737564086914, "learning_rate": 9.765964723701568e-07, "loss": 0.1792, "num_input_tokens_seen": 6747904, "step": 9995 }, { "epoch": 0.24430166369432976, "grad_norm": 39.74384307861328, "learning_rate": 9.770850637611765e-07, "loss": 0.1775, "num_input_tokens_seen": 6751232, "step": 10000 }, { "epoch": 0.24442381452617692, "grad_norm": 37.42814636230469, "learning_rate": 9.775736551521962e-07, "loss": 0.2521, "num_input_tokens_seen": 6754368, "step": 10005 }, { "epoch": 0.2445459653580241, "grad_norm": 30.444894790649414, "learning_rate": 9.780622465432158e-07, "loss": 0.0733, "num_input_tokens_seen": 6757632, "step": 10010 }, { "epoch": 0.24466811618987125, "grad_norm": 25.95069122314453, "learning_rate": 9.785508379342355e-07, "loss": 0.2123, "num_input_tokens_seen": 6761024, "step": 10015 }, { "epoch": 0.2447902670217184, "grad_norm": 33.53807067871094, "learning_rate": 9.790394293252552e-07, "loss": 0.1199, "num_input_tokens_seen": 6764480, "step": 10020 }, { "epoch": 0.24491241785356557, "grad_norm": 16.67969512939453, "learning_rate": 9.795280207162749e-07, "loss": 0.0975, "num_input_tokens_seen": 6767680, "step": 10025 }, { "epoch": 0.24503456868541273, "grad_norm": 29.962873458862305, "learning_rate": 9.800166121072946e-07, "loss": 0.1569, "num_input_tokens_seen": 6771008, "step": 10030 }, { "epoch": 0.24515671951725992, "grad_norm": 1.9181692600250244, "learning_rate": 9.805052034983142e-07, "loss": 0.1284, "num_input_tokens_seen": 6774080, "step": 10035 }, { "epoch": 0.2452788703491071, "grad_norm": 6.126092433929443, "learning_rate": 9.80993794889334e-07, "loss": 0.0622, "num_input_tokens_seen": 6777472, "step": 10040 }, { "epoch": 0.24540102118095425, "grad_norm": 40.68521499633789, "learning_rate": 9.814823862803538e-07, "loss": 0.087, "num_input_tokens_seen": 6780736, "step": 10045 }, { "epoch": 0.2455231720128014, "grad_norm": 6.570443153381348, "learning_rate": 9.819709776713733e-07, "loss": 0.192, "num_input_tokens_seen": 6783872, "step": 10050 }, { "epoch": 0.24564532284464857, "grad_norm": 30.137697219848633, "learning_rate": 9.82459569062393e-07, "loss": 0.1749, "num_input_tokens_seen": 6787328, "step": 10055 }, { "epoch": 0.24576747367649573, "grad_norm": 33.942325592041016, "learning_rate": 9.829481604534129e-07, "loss": 0.1122, "num_input_tokens_seen": 6791296, "step": 10060 }, { "epoch": 0.2458896245083429, "grad_norm": 16.92814064025879, "learning_rate": 9.834367518444323e-07, "loss": 0.0593, "num_input_tokens_seen": 6794624, "step": 10065 }, { "epoch": 0.24601177534019006, "grad_norm": 22.280366897583008, "learning_rate": 9.839253432354522e-07, "loss": 0.092, "num_input_tokens_seen": 6797504, "step": 10070 }, { "epoch": 0.24613392617203722, "grad_norm": 29.49980926513672, "learning_rate": 9.84413934626472e-07, "loss": 0.0173, "num_input_tokens_seen": 6800960, "step": 10075 }, { "epoch": 0.24625607700388438, "grad_norm": 6.354789733886719, "learning_rate": 9.849025260174916e-07, "loss": 0.0718, "num_input_tokens_seen": 6804416, "step": 10080 }, { "epoch": 0.24637822783573157, "grad_norm": 14.301334381103516, "learning_rate": 9.853911174085113e-07, "loss": 0.0776, "num_input_tokens_seen": 6808128, "step": 10085 }, { "epoch": 0.24650037866757873, "grad_norm": 2.1689467430114746, "learning_rate": 9.85879708799531e-07, "loss": 0.0546, "num_input_tokens_seen": 6811584, "step": 10090 }, { "epoch": 0.2466225294994259, "grad_norm": 33.23162078857422, "learning_rate": 9.863683001905506e-07, "loss": 0.1041, "num_input_tokens_seen": 6815232, "step": 10095 }, { "epoch": 0.24674468033127306, "grad_norm": 25.61594581604004, "learning_rate": 9.868568915815703e-07, "loss": 0.1665, "num_input_tokens_seen": 6818816, "step": 10100 }, { "epoch": 0.24686683116312022, "grad_norm": 13.704983711242676, "learning_rate": 9.8734548297259e-07, "loss": 0.2181, "num_input_tokens_seen": 6822080, "step": 10105 }, { "epoch": 0.24698898199496738, "grad_norm": 28.8029727935791, "learning_rate": 9.878340743636097e-07, "loss": 0.1648, "num_input_tokens_seen": 6825216, "step": 10110 }, { "epoch": 0.24711113282681454, "grad_norm": 6.627451419830322, "learning_rate": 9.883226657546294e-07, "loss": 0.2491, "num_input_tokens_seen": 6828608, "step": 10115 }, { "epoch": 0.2472332836586617, "grad_norm": 32.42625427246094, "learning_rate": 9.88811257145649e-07, "loss": 0.281, "num_input_tokens_seen": 6831872, "step": 10120 }, { "epoch": 0.24735543449050887, "grad_norm": 5.586592674255371, "learning_rate": 9.892998485366687e-07, "loss": 0.0527, "num_input_tokens_seen": 6835456, "step": 10125 }, { "epoch": 0.24747758532235606, "grad_norm": 15.273069381713867, "learning_rate": 9.897884399276884e-07, "loss": 0.1532, "num_input_tokens_seen": 6839616, "step": 10130 }, { "epoch": 0.24759973615420322, "grad_norm": 29.001066207885742, "learning_rate": 9.90277031318708e-07, "loss": 0.1079, "num_input_tokens_seen": 6842880, "step": 10135 }, { "epoch": 0.24772188698605038, "grad_norm": 0.7507160902023315, "learning_rate": 9.907656227097278e-07, "loss": 0.075, "num_input_tokens_seen": 6846400, "step": 10140 }, { "epoch": 0.24784403781789754, "grad_norm": 19.052486419677734, "learning_rate": 9.912542141007474e-07, "loss": 0.1472, "num_input_tokens_seen": 6849728, "step": 10145 }, { "epoch": 0.2479661886497447, "grad_norm": 42.05897903442383, "learning_rate": 9.917428054917671e-07, "loss": 0.1916, "num_input_tokens_seen": 6852544, "step": 10150 }, { "epoch": 0.24808833948159187, "grad_norm": 11.443634986877441, "learning_rate": 9.922313968827868e-07, "loss": 0.1914, "num_input_tokens_seen": 6855936, "step": 10155 }, { "epoch": 0.24821049031343903, "grad_norm": 24.85001564025879, "learning_rate": 9.927199882738065e-07, "loss": 0.1312, "num_input_tokens_seen": 6859584, "step": 10160 }, { "epoch": 0.2483326411452862, "grad_norm": 15.051764488220215, "learning_rate": 9.932085796648264e-07, "loss": 0.2342, "num_input_tokens_seen": 6862720, "step": 10165 }, { "epoch": 0.24845479197713335, "grad_norm": 15.476161003112793, "learning_rate": 9.936971710558459e-07, "loss": 0.0421, "num_input_tokens_seen": 6866368, "step": 10170 }, { "epoch": 0.24857694280898052, "grad_norm": 19.41805648803711, "learning_rate": 9.941857624468657e-07, "loss": 0.0989, "num_input_tokens_seen": 6869568, "step": 10175 }, { "epoch": 0.2486990936408277, "grad_norm": 37.16714096069336, "learning_rate": 9.946743538378854e-07, "loss": 0.0829, "num_input_tokens_seen": 6872832, "step": 10180 }, { "epoch": 0.24882124447267487, "grad_norm": 23.69383430480957, "learning_rate": 9.95162945228905e-07, "loss": 0.1695, "num_input_tokens_seen": 6876672, "step": 10185 }, { "epoch": 0.24894339530452203, "grad_norm": 32.78408432006836, "learning_rate": 9.956515366199248e-07, "loss": 0.1031, "num_input_tokens_seen": 6880000, "step": 10190 }, { "epoch": 0.2490655461363692, "grad_norm": 28.071138381958008, "learning_rate": 9.961401280109445e-07, "loss": 0.0678, "num_input_tokens_seen": 6883520, "step": 10195 }, { "epoch": 0.24918769696821635, "grad_norm": 14.612953186035156, "learning_rate": 9.966287194019642e-07, "loss": 0.107, "num_input_tokens_seen": 6887168, "step": 10200 }, { "epoch": 0.24930984780006352, "grad_norm": 29.333614349365234, "learning_rate": 9.971173107929838e-07, "loss": 0.0546, "num_input_tokens_seen": 6890944, "step": 10205 }, { "epoch": 0.24943199863191068, "grad_norm": 11.473103523254395, "learning_rate": 9.976059021840035e-07, "loss": 0.1747, "num_input_tokens_seen": 6894080, "step": 10210 }, { "epoch": 0.24955414946375784, "grad_norm": 33.84881591796875, "learning_rate": 9.980944935750232e-07, "loss": 0.1655, "num_input_tokens_seen": 6897408, "step": 10215 }, { "epoch": 0.249676300295605, "grad_norm": 2.882136821746826, "learning_rate": 9.985830849660429e-07, "loss": 0.1251, "num_input_tokens_seen": 6900608, "step": 10220 }, { "epoch": 0.24979845112745216, "grad_norm": 18.558082580566406, "learning_rate": 9.990716763570626e-07, "loss": 0.1599, "num_input_tokens_seen": 6904192, "step": 10225 }, { "epoch": 0.24992060195929935, "grad_norm": 20.312400817871094, "learning_rate": 9.995602677480822e-07, "loss": 0.1388, "num_input_tokens_seen": 6908032, "step": 10230 }, { "epoch": 0.2500183226247771, "eval_loss": 0.150357186794281, "eval_runtime": 47.7693, "eval_samples_per_second": 761.681, "eval_steps_per_second": 95.228, "num_input_tokens_seen": 6910656, "step": 10234 }, { "epoch": 0.2500427527911465, "grad_norm": 30.07940101623535, "learning_rate": 1.000048859139102e-06, "loss": 0.2217, "num_input_tokens_seen": 6911360, "step": 10235 }, { "epoch": 0.25016490362299365, "grad_norm": 11.564278602600098, "learning_rate": 1.0005374505301216e-06, "loss": 0.1544, "num_input_tokens_seen": 6914624, "step": 10240 }, { "epoch": 0.25028705445484084, "grad_norm": 30.50282859802246, "learning_rate": 1.0010260419211413e-06, "loss": 0.1194, "num_input_tokens_seen": 6918144, "step": 10245 }, { "epoch": 0.250409205286688, "grad_norm": 29.134689331054688, "learning_rate": 1.001514633312161e-06, "loss": 0.0953, "num_input_tokens_seen": 6921408, "step": 10250 }, { "epoch": 0.25053135611853516, "grad_norm": 7.919709205627441, "learning_rate": 1.0020032247031806e-06, "loss": 0.0691, "num_input_tokens_seen": 6925248, "step": 10255 }, { "epoch": 0.25065350695038235, "grad_norm": 59.279876708984375, "learning_rate": 1.0024918160942003e-06, "loss": 0.1579, "num_input_tokens_seen": 6928512, "step": 10260 }, { "epoch": 0.2507756577822295, "grad_norm": 32.2935676574707, "learning_rate": 1.0029804074852202e-06, "loss": 0.1694, "num_input_tokens_seen": 6931904, "step": 10265 }, { "epoch": 0.2508978086140767, "grad_norm": 21.012094497680664, "learning_rate": 1.0034689988762397e-06, "loss": 0.1749, "num_input_tokens_seen": 6935552, "step": 10270 }, { "epoch": 0.2510199594459238, "grad_norm": 37.00103759765625, "learning_rate": 1.0039575902672594e-06, "loss": 0.1575, "num_input_tokens_seen": 6939200, "step": 10275 }, { "epoch": 0.251142110277771, "grad_norm": 34.77095413208008, "learning_rate": 1.0044461816582793e-06, "loss": 0.1271, "num_input_tokens_seen": 6942400, "step": 10280 }, { "epoch": 0.25126426110961814, "grad_norm": 4.355811595916748, "learning_rate": 1.0049347730492987e-06, "loss": 0.1603, "num_input_tokens_seen": 6945344, "step": 10285 }, { "epoch": 0.2513864119414653, "grad_norm": 24.556846618652344, "learning_rate": 1.0054233644403184e-06, "loss": 0.1221, "num_input_tokens_seen": 6948672, "step": 10290 }, { "epoch": 0.25150856277331246, "grad_norm": 12.190361976623535, "learning_rate": 1.0059119558313383e-06, "loss": 0.1968, "num_input_tokens_seen": 6952192, "step": 10295 }, { "epoch": 0.25163071360515965, "grad_norm": 5.837921142578125, "learning_rate": 1.0064005472223578e-06, "loss": 0.0807, "num_input_tokens_seen": 6955328, "step": 10300 }, { "epoch": 0.25175286443700684, "grad_norm": 2.7097465991973877, "learning_rate": 1.0068891386133775e-06, "loss": 0.166, "num_input_tokens_seen": 6958336, "step": 10305 }, { "epoch": 0.251875015268854, "grad_norm": 19.389720916748047, "learning_rate": 1.0073777300043974e-06, "loss": 0.1161, "num_input_tokens_seen": 6961920, "step": 10310 }, { "epoch": 0.25199716610070116, "grad_norm": 7.943970680236816, "learning_rate": 1.007866321395417e-06, "loss": 0.1002, "num_input_tokens_seen": 6965184, "step": 10315 }, { "epoch": 0.2521193169325483, "grad_norm": 10.147026062011719, "learning_rate": 1.0083549127864365e-06, "loss": 0.1075, "num_input_tokens_seen": 6968896, "step": 10320 }, { "epoch": 0.2522414677643955, "grad_norm": 19.060894012451172, "learning_rate": 1.0088435041774564e-06, "loss": 0.1228, "num_input_tokens_seen": 6972352, "step": 10325 }, { "epoch": 0.2523636185962426, "grad_norm": 24.950483322143555, "learning_rate": 1.009332095568476e-06, "loss": 0.0779, "num_input_tokens_seen": 6975680, "step": 10330 }, { "epoch": 0.2524857694280898, "grad_norm": 74.65290069580078, "learning_rate": 1.0098206869594955e-06, "loss": 0.1664, "num_input_tokens_seen": 6978816, "step": 10335 }, { "epoch": 0.25260792025993695, "grad_norm": 23.90727996826172, "learning_rate": 1.0103092783505154e-06, "loss": 0.1542, "num_input_tokens_seen": 6982464, "step": 10340 }, { "epoch": 0.25273007109178414, "grad_norm": 36.87062454223633, "learning_rate": 1.0107978697415351e-06, "loss": 0.1632, "num_input_tokens_seen": 6985664, "step": 10345 }, { "epoch": 0.2528522219236313, "grad_norm": 23.86003303527832, "learning_rate": 1.0112864611325548e-06, "loss": 0.0819, "num_input_tokens_seen": 6988608, "step": 10350 }, { "epoch": 0.25297437275547846, "grad_norm": 4.453189373016357, "learning_rate": 1.0117750525235745e-06, "loss": 0.0703, "num_input_tokens_seen": 6992064, "step": 10355 }, { "epoch": 0.25309652358732565, "grad_norm": 23.15815544128418, "learning_rate": 1.0122636439145942e-06, "loss": 0.1059, "num_input_tokens_seen": 6995456, "step": 10360 }, { "epoch": 0.2532186744191728, "grad_norm": 29.278385162353516, "learning_rate": 1.0127522353056138e-06, "loss": 0.0914, "num_input_tokens_seen": 6999488, "step": 10365 }, { "epoch": 0.25334082525102, "grad_norm": 23.133424758911133, "learning_rate": 1.0132408266966335e-06, "loss": 0.1598, "num_input_tokens_seen": 7002496, "step": 10370 }, { "epoch": 0.2534629760828671, "grad_norm": 16.84296989440918, "learning_rate": 1.0137294180876532e-06, "loss": 0.2147, "num_input_tokens_seen": 7005696, "step": 10375 }, { "epoch": 0.2535851269147143, "grad_norm": 26.553749084472656, "learning_rate": 1.0142180094786729e-06, "loss": 0.2037, "num_input_tokens_seen": 7009216, "step": 10380 }, { "epoch": 0.25370727774656143, "grad_norm": 0.6738439798355103, "learning_rate": 1.0147066008696928e-06, "loss": 0.0145, "num_input_tokens_seen": 7012544, "step": 10385 }, { "epoch": 0.2538294285784086, "grad_norm": 1.1998552083969116, "learning_rate": 1.0151951922607123e-06, "loss": 0.1001, "num_input_tokens_seen": 7015936, "step": 10390 }, { "epoch": 0.25395157941025576, "grad_norm": 3.0031771659851074, "learning_rate": 1.015683783651732e-06, "loss": 0.104, "num_input_tokens_seen": 7019328, "step": 10395 }, { "epoch": 0.25407373024210295, "grad_norm": 7.232554912567139, "learning_rate": 1.0161723750427518e-06, "loss": 0.1893, "num_input_tokens_seen": 7022784, "step": 10400 }, { "epoch": 0.25419588107395014, "grad_norm": 2.9085607528686523, "learning_rate": 1.0166609664337713e-06, "loss": 0.0968, "num_input_tokens_seen": 7025856, "step": 10405 }, { "epoch": 0.25431803190579727, "grad_norm": 38.883914947509766, "learning_rate": 1.017149557824791e-06, "loss": 0.1591, "num_input_tokens_seen": 7029376, "step": 10410 }, { "epoch": 0.25444018273764446, "grad_norm": 2.6737284660339355, "learning_rate": 1.0176381492158109e-06, "loss": 0.1463, "num_input_tokens_seen": 7032832, "step": 10415 }, { "epoch": 0.2545623335694916, "grad_norm": 2.1448378562927246, "learning_rate": 1.0181267406068306e-06, "loss": 0.0194, "num_input_tokens_seen": 7036096, "step": 10420 }, { "epoch": 0.2546844844013388, "grad_norm": 18.835430145263672, "learning_rate": 1.01861533199785e-06, "loss": 0.1483, "num_input_tokens_seen": 7039360, "step": 10425 }, { "epoch": 0.2548066352331859, "grad_norm": 27.005678176879883, "learning_rate": 1.01910392338887e-06, "loss": 0.1019, "num_input_tokens_seen": 7042496, "step": 10430 }, { "epoch": 0.2549287860650331, "grad_norm": 41.107025146484375, "learning_rate": 1.0195925147798896e-06, "loss": 0.3398, "num_input_tokens_seen": 7045632, "step": 10435 }, { "epoch": 0.25505093689688024, "grad_norm": 10.689356803894043, "learning_rate": 1.020081106170909e-06, "loss": 0.1183, "num_input_tokens_seen": 7049024, "step": 10440 }, { "epoch": 0.25517308772872743, "grad_norm": 36.1721305847168, "learning_rate": 1.020569697561929e-06, "loss": 0.0926, "num_input_tokens_seen": 7053056, "step": 10445 }, { "epoch": 0.2552952385605746, "grad_norm": 27.044384002685547, "learning_rate": 1.0210582889529486e-06, "loss": 0.1251, "num_input_tokens_seen": 7056128, "step": 10450 }, { "epoch": 0.25541738939242176, "grad_norm": 1.3893952369689941, "learning_rate": 1.0215468803439681e-06, "loss": 0.0557, "num_input_tokens_seen": 7059584, "step": 10455 }, { "epoch": 0.25553954022426895, "grad_norm": 1.2650136947631836, "learning_rate": 1.022035471734988e-06, "loss": 0.0973, "num_input_tokens_seen": 7063104, "step": 10460 }, { "epoch": 0.2556616910561161, "grad_norm": 8.588451385498047, "learning_rate": 1.0225240631260077e-06, "loss": 0.1217, "num_input_tokens_seen": 7066368, "step": 10465 }, { "epoch": 0.25578384188796327, "grad_norm": 60.455047607421875, "learning_rate": 1.0230126545170274e-06, "loss": 0.1888, "num_input_tokens_seen": 7069504, "step": 10470 }, { "epoch": 0.2559059927198104, "grad_norm": 22.16314125061035, "learning_rate": 1.023501245908047e-06, "loss": 0.1105, "num_input_tokens_seen": 7073152, "step": 10475 }, { "epoch": 0.2560281435516576, "grad_norm": 36.67524337768555, "learning_rate": 1.0239898372990667e-06, "loss": 0.1404, "num_input_tokens_seen": 7076608, "step": 10480 }, { "epoch": 0.25615029438350473, "grad_norm": 0.3097395896911621, "learning_rate": 1.0244784286900864e-06, "loss": 0.1086, "num_input_tokens_seen": 7080000, "step": 10485 }, { "epoch": 0.2562724452153519, "grad_norm": 49.033294677734375, "learning_rate": 1.024967020081106e-06, "loss": 0.1311, "num_input_tokens_seen": 7083264, "step": 10490 }, { "epoch": 0.2563945960471991, "grad_norm": 21.20745086669922, "learning_rate": 1.0254556114721258e-06, "loss": 0.1045, "num_input_tokens_seen": 7086528, "step": 10495 }, { "epoch": 0.25651674687904624, "grad_norm": 8.799860954284668, "learning_rate": 1.0259442028631455e-06, "loss": 0.2151, "num_input_tokens_seen": 7089600, "step": 10500 }, { "epoch": 0.25663889771089343, "grad_norm": 21.061410903930664, "learning_rate": 1.0264327942541653e-06, "loss": 0.19, "num_input_tokens_seen": 7092672, "step": 10505 }, { "epoch": 0.25676104854274057, "grad_norm": 0.9606673717498779, "learning_rate": 1.0269213856451848e-06, "loss": 0.1237, "num_input_tokens_seen": 7096128, "step": 10510 }, { "epoch": 0.25688319937458776, "grad_norm": 5.484951972961426, "learning_rate": 1.0274099770362045e-06, "loss": 0.0726, "num_input_tokens_seen": 7099712, "step": 10515 }, { "epoch": 0.2570053502064349, "grad_norm": 0.6776230335235596, "learning_rate": 1.0278985684272244e-06, "loss": 0.0353, "num_input_tokens_seen": 7103040, "step": 10520 }, { "epoch": 0.2571275010382821, "grad_norm": 27.392627716064453, "learning_rate": 1.0283871598182439e-06, "loss": 0.14, "num_input_tokens_seen": 7106304, "step": 10525 }, { "epoch": 0.2572496518701292, "grad_norm": 0.6827471852302551, "learning_rate": 1.0288757512092635e-06, "loss": 0.1152, "num_input_tokens_seen": 7109824, "step": 10530 }, { "epoch": 0.2573718027019764, "grad_norm": 0.37764617800712585, "learning_rate": 1.0293643426002834e-06, "loss": 0.1283, "num_input_tokens_seen": 7112896, "step": 10535 }, { "epoch": 0.25749395353382354, "grad_norm": 37.246009826660156, "learning_rate": 1.0298529339913031e-06, "loss": 0.1707, "num_input_tokens_seen": 7116096, "step": 10540 }, { "epoch": 0.25761610436567073, "grad_norm": 24.440879821777344, "learning_rate": 1.0303415253823226e-06, "loss": 0.1114, "num_input_tokens_seen": 7119488, "step": 10545 }, { "epoch": 0.2577382551975179, "grad_norm": 32.95541763305664, "learning_rate": 1.0308301167733425e-06, "loss": 0.3013, "num_input_tokens_seen": 7122688, "step": 10550 }, { "epoch": 0.25786040602936505, "grad_norm": 24.378509521484375, "learning_rate": 1.0313187081643622e-06, "loss": 0.1704, "num_input_tokens_seen": 7126080, "step": 10555 }, { "epoch": 0.25798255686121224, "grad_norm": 13.029778480529785, "learning_rate": 1.0318072995553816e-06, "loss": 0.1579, "num_input_tokens_seen": 7129280, "step": 10560 }, { "epoch": 0.2581047076930594, "grad_norm": 20.938175201416016, "learning_rate": 1.0322958909464015e-06, "loss": 0.0861, "num_input_tokens_seen": 7132672, "step": 10565 }, { "epoch": 0.25822685852490657, "grad_norm": 9.321990013122559, "learning_rate": 1.0327844823374212e-06, "loss": 0.1046, "num_input_tokens_seen": 7136320, "step": 10570 }, { "epoch": 0.2583490093567537, "grad_norm": 15.518908500671387, "learning_rate": 1.0332730737284407e-06, "loss": 0.1541, "num_input_tokens_seen": 7139712, "step": 10575 }, { "epoch": 0.2584711601886009, "grad_norm": 15.851611137390137, "learning_rate": 1.0337616651194606e-06, "loss": 0.1296, "num_input_tokens_seen": 7142784, "step": 10580 }, { "epoch": 0.258593311020448, "grad_norm": 6.366760730743408, "learning_rate": 1.0342502565104803e-06, "loss": 0.0574, "num_input_tokens_seen": 7146112, "step": 10585 }, { "epoch": 0.2587154618522952, "grad_norm": 16.666038513183594, "learning_rate": 1.0347388479015e-06, "loss": 0.0815, "num_input_tokens_seen": 7149248, "step": 10590 }, { "epoch": 0.2588376126841424, "grad_norm": 10.800519943237305, "learning_rate": 1.0352274392925196e-06, "loss": 0.1847, "num_input_tokens_seen": 7152320, "step": 10595 }, { "epoch": 0.25895976351598954, "grad_norm": 17.76580238342285, "learning_rate": 1.0357160306835393e-06, "loss": 0.0792, "num_input_tokens_seen": 7155392, "step": 10600 }, { "epoch": 0.25908191434783673, "grad_norm": 13.790797233581543, "learning_rate": 1.036204622074559e-06, "loss": 0.1473, "num_input_tokens_seen": 7159424, "step": 10605 }, { "epoch": 0.25920406517968386, "grad_norm": 19.42371368408203, "learning_rate": 1.0366932134655787e-06, "loss": 0.1038, "num_input_tokens_seen": 7162752, "step": 10610 }, { "epoch": 0.25932621601153105, "grad_norm": 26.431636810302734, "learning_rate": 1.0371818048565983e-06, "loss": 0.0652, "num_input_tokens_seen": 7166080, "step": 10615 }, { "epoch": 0.2594483668433782, "grad_norm": 16.103227615356445, "learning_rate": 1.037670396247618e-06, "loss": 0.0808, "num_input_tokens_seen": 7169280, "step": 10620 }, { "epoch": 0.2595705176752254, "grad_norm": 27.495723724365234, "learning_rate": 1.038158987638638e-06, "loss": 0.1462, "num_input_tokens_seen": 7172480, "step": 10625 }, { "epoch": 0.2596926685070725, "grad_norm": 1.79203200340271, "learning_rate": 1.0386475790296574e-06, "loss": 0.0809, "num_input_tokens_seen": 7176128, "step": 10630 }, { "epoch": 0.2598148193389197, "grad_norm": 28.41242218017578, "learning_rate": 1.039136170420677e-06, "loss": 0.1069, "num_input_tokens_seen": 7179136, "step": 10635 }, { "epoch": 0.2599369701707669, "grad_norm": 82.02660369873047, "learning_rate": 1.039624761811697e-06, "loss": 0.2446, "num_input_tokens_seen": 7182272, "step": 10640 }, { "epoch": 0.260059121002614, "grad_norm": 15.813164710998535, "learning_rate": 1.0401133532027164e-06, "loss": 0.0538, "num_input_tokens_seen": 7185088, "step": 10645 }, { "epoch": 0.2601812718344612, "grad_norm": 3.777578592300415, "learning_rate": 1.0406019445937361e-06, "loss": 0.1061, "num_input_tokens_seen": 7188480, "step": 10650 }, { "epoch": 0.26030342266630835, "grad_norm": 0.10652507096529007, "learning_rate": 1.041090535984756e-06, "loss": 0.0527, "num_input_tokens_seen": 7191552, "step": 10655 }, { "epoch": 0.26042557349815554, "grad_norm": 44.32902526855469, "learning_rate": 1.0415791273757757e-06, "loss": 0.1326, "num_input_tokens_seen": 7194752, "step": 10660 }, { "epoch": 0.2605477243300027, "grad_norm": 3.918788433074951, "learning_rate": 1.0420677187667952e-06, "loss": 0.3496, "num_input_tokens_seen": 7198528, "step": 10665 }, { "epoch": 0.26066987516184986, "grad_norm": 42.468475341796875, "learning_rate": 1.042556310157815e-06, "loss": 0.3142, "num_input_tokens_seen": 7201728, "step": 10670 }, { "epoch": 0.260792025993697, "grad_norm": 2.682720899581909, "learning_rate": 1.0430449015488347e-06, "loss": 0.1681, "num_input_tokens_seen": 7205120, "step": 10675 }, { "epoch": 0.2609141768255442, "grad_norm": 12.731380462646484, "learning_rate": 1.0435334929398542e-06, "loss": 0.0965, "num_input_tokens_seen": 7208576, "step": 10680 }, { "epoch": 0.2610363276573913, "grad_norm": 37.0477409362793, "learning_rate": 1.044022084330874e-06, "loss": 0.3016, "num_input_tokens_seen": 7211584, "step": 10685 }, { "epoch": 0.2611584784892385, "grad_norm": 3.5659420490264893, "learning_rate": 1.0445106757218938e-06, "loss": 0.1004, "num_input_tokens_seen": 7214976, "step": 10690 }, { "epoch": 0.2612806293210857, "grad_norm": 6.397414207458496, "learning_rate": 1.0449992671129135e-06, "loss": 0.0837, "num_input_tokens_seen": 7218240, "step": 10695 }, { "epoch": 0.26140278015293283, "grad_norm": 17.796171188354492, "learning_rate": 1.0454878585039331e-06, "loss": 0.1402, "num_input_tokens_seen": 7221952, "step": 10700 }, { "epoch": 0.26152493098478, "grad_norm": 34.56057357788086, "learning_rate": 1.0459764498949528e-06, "loss": 0.0783, "num_input_tokens_seen": 7225344, "step": 10705 }, { "epoch": 0.26164708181662716, "grad_norm": 22.077072143554688, "learning_rate": 1.0464650412859725e-06, "loss": 0.0884, "num_input_tokens_seen": 7228800, "step": 10710 }, { "epoch": 0.26176923264847435, "grad_norm": 9.046226501464844, "learning_rate": 1.0469536326769922e-06, "loss": 0.1519, "num_input_tokens_seen": 7232192, "step": 10715 }, { "epoch": 0.2618913834803215, "grad_norm": 10.881168365478516, "learning_rate": 1.0474422240680119e-06, "loss": 0.0741, "num_input_tokens_seen": 7235648, "step": 10720 }, { "epoch": 0.26201353431216867, "grad_norm": 2.611872673034668, "learning_rate": 1.0479308154590315e-06, "loss": 0.104, "num_input_tokens_seen": 7238976, "step": 10725 }, { "epoch": 0.2621356851440158, "grad_norm": 28.027048110961914, "learning_rate": 1.0484194068500512e-06, "loss": 0.1013, "num_input_tokens_seen": 7242304, "step": 10730 }, { "epoch": 0.262257835975863, "grad_norm": 0.523321807384491, "learning_rate": 1.048907998241071e-06, "loss": 0.1474, "num_input_tokens_seen": 7245632, "step": 10735 }, { "epoch": 0.2623799868077102, "grad_norm": 28.843177795410156, "learning_rate": 1.0493965896320906e-06, "loss": 0.2086, "num_input_tokens_seen": 7248896, "step": 10740 }, { "epoch": 0.2625021376395573, "grad_norm": 8.959317207336426, "learning_rate": 1.0498851810231105e-06, "loss": 0.1583, "num_input_tokens_seen": 7252352, "step": 10745 }, { "epoch": 0.2626242884714045, "grad_norm": 23.193830490112305, "learning_rate": 1.05037377241413e-06, "loss": 0.1325, "num_input_tokens_seen": 7255552, "step": 10750 }, { "epoch": 0.26274643930325164, "grad_norm": 0.8629907965660095, "learning_rate": 1.0508623638051496e-06, "loss": 0.106, "num_input_tokens_seen": 7259008, "step": 10755 }, { "epoch": 0.26286859013509883, "grad_norm": 68.59699249267578, "learning_rate": 1.0513509551961695e-06, "loss": 0.1296, "num_input_tokens_seen": 7262464, "step": 10760 }, { "epoch": 0.26299074096694597, "grad_norm": 18.427112579345703, "learning_rate": 1.051839546587189e-06, "loss": 0.0801, "num_input_tokens_seen": 7265792, "step": 10765 }, { "epoch": 0.26311289179879316, "grad_norm": 14.622143745422363, "learning_rate": 1.0523281379782087e-06, "loss": 0.086, "num_input_tokens_seen": 7269376, "step": 10770 }, { "epoch": 0.2632350426306403, "grad_norm": 28.404579162597656, "learning_rate": 1.0528167293692286e-06, "loss": 0.1174, "num_input_tokens_seen": 7272832, "step": 10775 }, { "epoch": 0.2633571934624875, "grad_norm": 29.52048683166504, "learning_rate": 1.0533053207602482e-06, "loss": 0.0803, "num_input_tokens_seen": 7276096, "step": 10780 }, { "epoch": 0.26347934429433467, "grad_norm": 5.9442548751831055, "learning_rate": 1.0537939121512677e-06, "loss": 0.0768, "num_input_tokens_seen": 7279488, "step": 10785 }, { "epoch": 0.2636014951261818, "grad_norm": 12.992175102233887, "learning_rate": 1.0542825035422876e-06, "loss": 0.1289, "num_input_tokens_seen": 7282944, "step": 10790 }, { "epoch": 0.263723645958029, "grad_norm": 4.273730278015137, "learning_rate": 1.0547710949333073e-06, "loss": 0.0317, "num_input_tokens_seen": 7286464, "step": 10795 }, { "epoch": 0.26384579678987613, "grad_norm": 32.03951644897461, "learning_rate": 1.0552596863243268e-06, "loss": 0.1985, "num_input_tokens_seen": 7289792, "step": 10800 }, { "epoch": 0.2639679476217233, "grad_norm": 27.004234313964844, "learning_rate": 1.0557482777153467e-06, "loss": 0.2123, "num_input_tokens_seen": 7293120, "step": 10805 }, { "epoch": 0.26409009845357045, "grad_norm": 22.78232765197754, "learning_rate": 1.0562368691063663e-06, "loss": 0.0863, "num_input_tokens_seen": 7296640, "step": 10810 }, { "epoch": 0.26421224928541764, "grad_norm": 12.611019134521484, "learning_rate": 1.056725460497386e-06, "loss": 0.1017, "num_input_tokens_seen": 7299584, "step": 10815 }, { "epoch": 0.2643344001172648, "grad_norm": 28.07274627685547, "learning_rate": 1.0572140518884057e-06, "loss": 0.0895, "num_input_tokens_seen": 7302528, "step": 10820 }, { "epoch": 0.26445655094911197, "grad_norm": 11.911782264709473, "learning_rate": 1.0577026432794254e-06, "loss": 0.2, "num_input_tokens_seen": 7305728, "step": 10825 }, { "epoch": 0.2645787017809591, "grad_norm": 27.716493606567383, "learning_rate": 1.058191234670445e-06, "loss": 0.1894, "num_input_tokens_seen": 7309312, "step": 10830 }, { "epoch": 0.2647008526128063, "grad_norm": 53.70561218261719, "learning_rate": 1.0586798260614647e-06, "loss": 0.1635, "num_input_tokens_seen": 7312448, "step": 10835 }, { "epoch": 0.2648230034446535, "grad_norm": 20.634546279907227, "learning_rate": 1.0591684174524844e-06, "loss": 0.1332, "num_input_tokens_seen": 7315904, "step": 10840 }, { "epoch": 0.2649451542765006, "grad_norm": 20.69364356994629, "learning_rate": 1.059657008843504e-06, "loss": 0.0642, "num_input_tokens_seen": 7319360, "step": 10845 }, { "epoch": 0.2650673051083478, "grad_norm": 29.68886375427246, "learning_rate": 1.060145600234524e-06, "loss": 0.1647, "num_input_tokens_seen": 7322816, "step": 10850 }, { "epoch": 0.26518945594019494, "grad_norm": 18.294721603393555, "learning_rate": 1.0606341916255435e-06, "loss": 0.1161, "num_input_tokens_seen": 7326784, "step": 10855 }, { "epoch": 0.26531160677204213, "grad_norm": 12.947708129882812, "learning_rate": 1.0611227830165631e-06, "loss": 0.1226, "num_input_tokens_seen": 7330304, "step": 10860 }, { "epoch": 0.26543375760388926, "grad_norm": 9.835976600646973, "learning_rate": 1.061611374407583e-06, "loss": 0.1123, "num_input_tokens_seen": 7333632, "step": 10865 }, { "epoch": 0.26555590843573645, "grad_norm": 24.71727180480957, "learning_rate": 1.0620999657986025e-06, "loss": 0.1671, "num_input_tokens_seen": 7337280, "step": 10870 }, { "epoch": 0.2656780592675836, "grad_norm": 29.23613166809082, "learning_rate": 1.0625885571896222e-06, "loss": 0.1121, "num_input_tokens_seen": 7340480, "step": 10875 }, { "epoch": 0.2658002100994308, "grad_norm": 27.944616317749023, "learning_rate": 1.063077148580642e-06, "loss": 0.1282, "num_input_tokens_seen": 7343424, "step": 10880 }, { "epoch": 0.26592236093127797, "grad_norm": 24.89887046813965, "learning_rate": 1.0635657399716616e-06, "loss": 0.1091, "num_input_tokens_seen": 7346752, "step": 10885 }, { "epoch": 0.2660445117631251, "grad_norm": 21.505664825439453, "learning_rate": 1.0640543313626812e-06, "loss": 0.1666, "num_input_tokens_seen": 7350144, "step": 10890 }, { "epoch": 0.2661666625949723, "grad_norm": 1.815690040588379, "learning_rate": 1.0645429227537011e-06, "loss": 0.0835, "num_input_tokens_seen": 7353344, "step": 10895 }, { "epoch": 0.2662888134268194, "grad_norm": 2.3935718536376953, "learning_rate": 1.0650315141447208e-06, "loss": 0.1354, "num_input_tokens_seen": 7356544, "step": 10900 }, { "epoch": 0.2664109642586666, "grad_norm": 0.4179496169090271, "learning_rate": 1.0655201055357403e-06, "loss": 0.147, "num_input_tokens_seen": 7359872, "step": 10905 }, { "epoch": 0.26653311509051375, "grad_norm": 11.624155044555664, "learning_rate": 1.0660086969267602e-06, "loss": 0.1455, "num_input_tokens_seen": 7363392, "step": 10910 }, { "epoch": 0.26665526592236094, "grad_norm": 17.070810317993164, "learning_rate": 1.0664972883177799e-06, "loss": 0.066, "num_input_tokens_seen": 7366912, "step": 10915 }, { "epoch": 0.2667774167542081, "grad_norm": 19.962379455566406, "learning_rate": 1.0669858797087993e-06, "loss": 0.0645, "num_input_tokens_seen": 7370176, "step": 10920 }, { "epoch": 0.26689956758605526, "grad_norm": 31.91449546813965, "learning_rate": 1.0674744710998192e-06, "loss": 0.1761, "num_input_tokens_seen": 7373440, "step": 10925 }, { "epoch": 0.2670217184179024, "grad_norm": 20.309640884399414, "learning_rate": 1.067963062490839e-06, "loss": 0.0981, "num_input_tokens_seen": 7377216, "step": 10930 }, { "epoch": 0.2671438692497496, "grad_norm": 17.986515045166016, "learning_rate": 1.0684516538818586e-06, "loss": 0.0754, "num_input_tokens_seen": 7380480, "step": 10935 }, { "epoch": 0.2672660200815968, "grad_norm": 27.000978469848633, "learning_rate": 1.0689402452728783e-06, "loss": 0.0664, "num_input_tokens_seen": 7383872, "step": 10940 }, { "epoch": 0.2673881709134439, "grad_norm": 10.778860092163086, "learning_rate": 1.069428836663898e-06, "loss": 0.1116, "num_input_tokens_seen": 7387008, "step": 10945 }, { "epoch": 0.2675103217452911, "grad_norm": 26.65261459350586, "learning_rate": 1.0699174280549176e-06, "loss": 0.2517, "num_input_tokens_seen": 7390272, "step": 10950 }, { "epoch": 0.26763247257713824, "grad_norm": 43.30821990966797, "learning_rate": 1.0704060194459373e-06, "loss": 0.1716, "num_input_tokens_seen": 7393856, "step": 10955 }, { "epoch": 0.2677546234089854, "grad_norm": 11.451996803283691, "learning_rate": 1.070894610836957e-06, "loss": 0.1175, "num_input_tokens_seen": 7396992, "step": 10960 }, { "epoch": 0.26787677424083256, "grad_norm": 40.32914352416992, "learning_rate": 1.0713832022279767e-06, "loss": 0.2216, "num_input_tokens_seen": 7400448, "step": 10965 }, { "epoch": 0.26799892507267975, "grad_norm": 38.25727081298828, "learning_rate": 1.0718717936189966e-06, "loss": 0.1282, "num_input_tokens_seen": 7403904, "step": 10970 }, { "epoch": 0.2681210759045269, "grad_norm": 7.2183756828308105, "learning_rate": 1.072360385010016e-06, "loss": 0.0799, "num_input_tokens_seen": 7406976, "step": 10975 }, { "epoch": 0.2682432267363741, "grad_norm": 3.240387201309204, "learning_rate": 1.0728489764010357e-06, "loss": 0.0994, "num_input_tokens_seen": 7410176, "step": 10980 }, { "epoch": 0.26836537756822126, "grad_norm": 6.279014587402344, "learning_rate": 1.0733375677920556e-06, "loss": 0.069, "num_input_tokens_seen": 7413760, "step": 10985 }, { "epoch": 0.2684875284000684, "grad_norm": 18.114967346191406, "learning_rate": 1.073826159183075e-06, "loss": 0.2193, "num_input_tokens_seen": 7417216, "step": 10990 }, { "epoch": 0.2686096792319156, "grad_norm": 21.088268280029297, "learning_rate": 1.0743147505740948e-06, "loss": 0.1294, "num_input_tokens_seen": 7420672, "step": 10995 }, { "epoch": 0.2687318300637627, "grad_norm": 30.939210891723633, "learning_rate": 1.0748033419651146e-06, "loss": 0.1481, "num_input_tokens_seen": 7423808, "step": 11000 }, { "epoch": 0.2688539808956099, "grad_norm": 13.835090637207031, "learning_rate": 1.0752919333561341e-06, "loss": 0.1, "num_input_tokens_seen": 7427328, "step": 11005 }, { "epoch": 0.26897613172745705, "grad_norm": 34.25198745727539, "learning_rate": 1.0757805247471538e-06, "loss": 0.1661, "num_input_tokens_seen": 7430592, "step": 11010 }, { "epoch": 0.26909828255930424, "grad_norm": 35.57109451293945, "learning_rate": 1.0762691161381737e-06, "loss": 0.1138, "num_input_tokens_seen": 7433664, "step": 11015 }, { "epoch": 0.26922043339115137, "grad_norm": 2.9439427852630615, "learning_rate": 1.0767577075291934e-06, "loss": 0.1723, "num_input_tokens_seen": 7436416, "step": 11020 }, { "epoch": 0.26934258422299856, "grad_norm": 27.516061782836914, "learning_rate": 1.0772462989202128e-06, "loss": 0.0646, "num_input_tokens_seen": 7439616, "step": 11025 }, { "epoch": 0.26946473505484575, "grad_norm": 31.73204231262207, "learning_rate": 1.0777348903112327e-06, "loss": 0.1593, "num_input_tokens_seen": 7442624, "step": 11030 }, { "epoch": 0.2695868858866929, "grad_norm": 25.48242950439453, "learning_rate": 1.0782234817022524e-06, "loss": 0.1045, "num_input_tokens_seen": 7445888, "step": 11035 }, { "epoch": 0.2697090367185401, "grad_norm": 3.6421027183532715, "learning_rate": 1.0787120730932719e-06, "loss": 0.0894, "num_input_tokens_seen": 7448960, "step": 11040 }, { "epoch": 0.2698311875503872, "grad_norm": 2.8793671131134033, "learning_rate": 1.0792006644842918e-06, "loss": 0.0651, "num_input_tokens_seen": 7452544, "step": 11045 }, { "epoch": 0.2699533383822344, "grad_norm": 75.24100494384766, "learning_rate": 1.0796892558753115e-06, "loss": 0.1117, "num_input_tokens_seen": 7455680, "step": 11050 }, { "epoch": 0.27007548921408153, "grad_norm": 26.78316879272461, "learning_rate": 1.0801778472663311e-06, "loss": 0.12, "num_input_tokens_seen": 7459008, "step": 11055 }, { "epoch": 0.2701976400459287, "grad_norm": 17.241432189941406, "learning_rate": 1.0806664386573508e-06, "loss": 0.0946, "num_input_tokens_seen": 7462464, "step": 11060 }, { "epoch": 0.27031979087777586, "grad_norm": 30.588302612304688, "learning_rate": 1.0811550300483705e-06, "loss": 0.179, "num_input_tokens_seen": 7465664, "step": 11065 }, { "epoch": 0.27044194170962305, "grad_norm": 11.447126388549805, "learning_rate": 1.0816436214393902e-06, "loss": 0.0426, "num_input_tokens_seen": 7469056, "step": 11070 }, { "epoch": 0.2705640925414702, "grad_norm": 26.24823760986328, "learning_rate": 1.0821322128304099e-06, "loss": 0.1197, "num_input_tokens_seen": 7472512, "step": 11075 }, { "epoch": 0.27068624337331737, "grad_norm": 12.120738983154297, "learning_rate": 1.0826208042214295e-06, "loss": 0.0911, "num_input_tokens_seen": 7475392, "step": 11080 }, { "epoch": 0.27080839420516456, "grad_norm": 23.091278076171875, "learning_rate": 1.0831093956124492e-06, "loss": 0.1618, "num_input_tokens_seen": 7478912, "step": 11085 }, { "epoch": 0.2709305450370117, "grad_norm": 23.355270385742188, "learning_rate": 1.0835979870034691e-06, "loss": 0.1194, "num_input_tokens_seen": 7482048, "step": 11090 }, { "epoch": 0.2710526958688589, "grad_norm": 19.180255889892578, "learning_rate": 1.0840865783944886e-06, "loss": 0.1323, "num_input_tokens_seen": 7485248, "step": 11095 }, { "epoch": 0.271174846700706, "grad_norm": 35.145591735839844, "learning_rate": 1.0845751697855083e-06, "loss": 0.2519, "num_input_tokens_seen": 7488576, "step": 11100 }, { "epoch": 0.2712969975325532, "grad_norm": 4.28073787689209, "learning_rate": 1.0850637611765282e-06, "loss": 0.0914, "num_input_tokens_seen": 7491904, "step": 11105 }, { "epoch": 0.27141914836440034, "grad_norm": 1.7749961614608765, "learning_rate": 1.0855523525675476e-06, "loss": 0.1573, "num_input_tokens_seen": 7496064, "step": 11110 }, { "epoch": 0.27154129919624753, "grad_norm": 18.251876831054688, "learning_rate": 1.0860409439585673e-06, "loss": 0.0401, "num_input_tokens_seen": 7499328, "step": 11115 }, { "epoch": 0.27166345002809467, "grad_norm": 29.7154598236084, "learning_rate": 1.0865295353495872e-06, "loss": 0.1413, "num_input_tokens_seen": 7503040, "step": 11120 }, { "epoch": 0.27178560085994186, "grad_norm": 22.521013259887695, "learning_rate": 1.0870181267406069e-06, "loss": 0.2132, "num_input_tokens_seen": 7506368, "step": 11125 }, { "epoch": 0.27190775169178905, "grad_norm": 21.76904296875, "learning_rate": 1.0875067181316264e-06, "loss": 0.1462, "num_input_tokens_seen": 7509888, "step": 11130 }, { "epoch": 0.2720299025236362, "grad_norm": 26.12929916381836, "learning_rate": 1.0879953095226463e-06, "loss": 0.118, "num_input_tokens_seen": 7513728, "step": 11135 }, { "epoch": 0.27215205335548337, "grad_norm": 49.16400909423828, "learning_rate": 1.088483900913666e-06, "loss": 0.1055, "num_input_tokens_seen": 7517376, "step": 11140 }, { "epoch": 0.2722742041873305, "grad_norm": 23.947162628173828, "learning_rate": 1.0889724923046854e-06, "loss": 0.2541, "num_input_tokens_seen": 7520576, "step": 11145 }, { "epoch": 0.2723963550191777, "grad_norm": 28.64226531982422, "learning_rate": 1.0894610836957053e-06, "loss": 0.1983, "num_input_tokens_seen": 7523584, "step": 11150 }, { "epoch": 0.27251850585102483, "grad_norm": 30.37683868408203, "learning_rate": 1.089949675086725e-06, "loss": 0.067, "num_input_tokens_seen": 7526976, "step": 11155 }, { "epoch": 0.272640656682872, "grad_norm": 32.52040100097656, "learning_rate": 1.0904382664777444e-06, "loss": 0.0961, "num_input_tokens_seen": 7529920, "step": 11160 }, { "epoch": 0.27276280751471915, "grad_norm": 3.7241928577423096, "learning_rate": 1.0909268578687643e-06, "loss": 0.0953, "num_input_tokens_seen": 7533312, "step": 11165 }, { "epoch": 0.27288495834656634, "grad_norm": 11.024883270263672, "learning_rate": 1.091415449259784e-06, "loss": 0.0257, "num_input_tokens_seen": 7537536, "step": 11170 }, { "epoch": 0.27300710917841353, "grad_norm": 29.855224609375, "learning_rate": 1.0919040406508037e-06, "loss": 0.1563, "num_input_tokens_seen": 7540736, "step": 11175 }, { "epoch": 0.27312926001026067, "grad_norm": 25.81363296508789, "learning_rate": 1.0923926320418234e-06, "loss": 0.1178, "num_input_tokens_seen": 7544256, "step": 11180 }, { "epoch": 0.27325141084210786, "grad_norm": 16.829345703125, "learning_rate": 1.092881223432843e-06, "loss": 0.2243, "num_input_tokens_seen": 7547648, "step": 11185 }, { "epoch": 0.273373561673955, "grad_norm": 45.78081130981445, "learning_rate": 1.0933698148238627e-06, "loss": 0.192, "num_input_tokens_seen": 7551488, "step": 11190 }, { "epoch": 0.2734957125058022, "grad_norm": 34.01533508300781, "learning_rate": 1.0938584062148824e-06, "loss": 0.1384, "num_input_tokens_seen": 7554752, "step": 11195 }, { "epoch": 0.2736178633376493, "grad_norm": 19.07633399963379, "learning_rate": 1.0943469976059021e-06, "loss": 0.0948, "num_input_tokens_seen": 7558080, "step": 11200 }, { "epoch": 0.2737400141694965, "grad_norm": 0.7051904201507568, "learning_rate": 1.0948355889969218e-06, "loss": 0.1033, "num_input_tokens_seen": 7561472, "step": 11205 }, { "epoch": 0.27386216500134364, "grad_norm": 1.2588499784469604, "learning_rate": 1.0953241803879417e-06, "loss": 0.1728, "num_input_tokens_seen": 7565248, "step": 11210 }, { "epoch": 0.27398431583319083, "grad_norm": 4.5054097175598145, "learning_rate": 1.0958127717789612e-06, "loss": 0.0578, "num_input_tokens_seen": 7568384, "step": 11215 }, { "epoch": 0.27410646666503796, "grad_norm": 20.64672088623047, "learning_rate": 1.0963013631699808e-06, "loss": 0.1279, "num_input_tokens_seen": 7571776, "step": 11220 }, { "epoch": 0.27422861749688515, "grad_norm": 4.561092376708984, "learning_rate": 1.0967899545610007e-06, "loss": 0.0493, "num_input_tokens_seen": 7574784, "step": 11225 }, { "epoch": 0.27435076832873234, "grad_norm": 46.156089782714844, "learning_rate": 1.0972785459520202e-06, "loss": 0.1948, "num_input_tokens_seen": 7577984, "step": 11230 }, { "epoch": 0.2744729191605795, "grad_norm": 29.57512664794922, "learning_rate": 1.0977671373430399e-06, "loss": 0.0757, "num_input_tokens_seen": 7580992, "step": 11235 }, { "epoch": 0.27459506999242667, "grad_norm": 32.9466667175293, "learning_rate": 1.0982557287340598e-06, "loss": 0.1289, "num_input_tokens_seen": 7584256, "step": 11240 }, { "epoch": 0.2747172208242738, "grad_norm": 18.003232955932617, "learning_rate": 1.0987443201250795e-06, "loss": 0.2638, "num_input_tokens_seen": 7587392, "step": 11245 }, { "epoch": 0.274839371656121, "grad_norm": 35.77971649169922, "learning_rate": 1.099232911516099e-06, "loss": 0.1724, "num_input_tokens_seen": 7590528, "step": 11250 }, { "epoch": 0.2749615224879681, "grad_norm": 5.708560466766357, "learning_rate": 1.0997215029071188e-06, "loss": 0.0775, "num_input_tokens_seen": 7593792, "step": 11255 }, { "epoch": 0.2750836733198153, "grad_norm": 0.5288552641868591, "learning_rate": 1.1002100942981385e-06, "loss": 0.1246, "num_input_tokens_seen": 7597120, "step": 11260 }, { "epoch": 0.27520582415166245, "grad_norm": 56.88216018676758, "learning_rate": 1.100698685689158e-06, "loss": 0.1392, "num_input_tokens_seen": 7600256, "step": 11265 }, { "epoch": 0.27532797498350964, "grad_norm": 45.19654083251953, "learning_rate": 1.1011872770801779e-06, "loss": 0.1305, "num_input_tokens_seen": 7603200, "step": 11270 }, { "epoch": 0.2754501258153568, "grad_norm": 27.574857711791992, "learning_rate": 1.1016758684711975e-06, "loss": 0.1257, "num_input_tokens_seen": 7606400, "step": 11275 }, { "epoch": 0.27557227664720396, "grad_norm": 21.560970306396484, "learning_rate": 1.1021644598622172e-06, "loss": 0.2147, "num_input_tokens_seen": 7609600, "step": 11280 }, { "epoch": 0.27569442747905115, "grad_norm": 22.572582244873047, "learning_rate": 1.102653051253237e-06, "loss": 0.1249, "num_input_tokens_seen": 7612736, "step": 11285 }, { "epoch": 0.2758165783108983, "grad_norm": 37.17354965209961, "learning_rate": 1.1031416426442566e-06, "loss": 0.0402, "num_input_tokens_seen": 7616128, "step": 11290 }, { "epoch": 0.2759387291427455, "grad_norm": 46.42210388183594, "learning_rate": 1.1036302340352763e-06, "loss": 0.1116, "num_input_tokens_seen": 7619264, "step": 11295 }, { "epoch": 0.2760608799745926, "grad_norm": 30.43973731994629, "learning_rate": 1.104118825426296e-06, "loss": 0.1853, "num_input_tokens_seen": 7622400, "step": 11300 }, { "epoch": 0.2761830308064398, "grad_norm": 0.16094528138637543, "learning_rate": 1.1046074168173156e-06, "loss": 0.134, "num_input_tokens_seen": 7625408, "step": 11305 }, { "epoch": 0.27630518163828693, "grad_norm": 16.4888858795166, "learning_rate": 1.1050960082083353e-06, "loss": 0.1378, "num_input_tokens_seen": 7628992, "step": 11310 }, { "epoch": 0.2764273324701341, "grad_norm": 15.717243194580078, "learning_rate": 1.105584599599355e-06, "loss": 0.1405, "num_input_tokens_seen": 7632384, "step": 11315 }, { "epoch": 0.2765494833019813, "grad_norm": 36.28303527832031, "learning_rate": 1.1060731909903747e-06, "loss": 0.0711, "num_input_tokens_seen": 7635264, "step": 11320 }, { "epoch": 0.27667163413382845, "grad_norm": 43.312347412109375, "learning_rate": 1.1065617823813944e-06, "loss": 0.133, "num_input_tokens_seen": 7638272, "step": 11325 }, { "epoch": 0.27679378496567564, "grad_norm": 5.552413463592529, "learning_rate": 1.1070503737724142e-06, "loss": 0.1541, "num_input_tokens_seen": 7641536, "step": 11330 }, { "epoch": 0.27691593579752277, "grad_norm": 34.17829513549805, "learning_rate": 1.1075389651634337e-06, "loss": 0.1017, "num_input_tokens_seen": 7645120, "step": 11335 }, { "epoch": 0.27703808662936996, "grad_norm": 23.244646072387695, "learning_rate": 1.1080275565544534e-06, "loss": 0.0997, "num_input_tokens_seen": 7648768, "step": 11340 }, { "epoch": 0.2771602374612171, "grad_norm": 26.523557662963867, "learning_rate": 1.1085161479454733e-06, "loss": 0.2157, "num_input_tokens_seen": 7651776, "step": 11345 }, { "epoch": 0.2772823882930643, "grad_norm": 1.2325987815856934, "learning_rate": 1.1090047393364928e-06, "loss": 0.0374, "num_input_tokens_seen": 7655168, "step": 11350 }, { "epoch": 0.2774045391249114, "grad_norm": 13.653995513916016, "learning_rate": 1.1094933307275124e-06, "loss": 0.0361, "num_input_tokens_seen": 7658752, "step": 11355 }, { "epoch": 0.2775266899567586, "grad_norm": 30.004316329956055, "learning_rate": 1.1099819221185323e-06, "loss": 0.1332, "num_input_tokens_seen": 7662080, "step": 11360 }, { "epoch": 0.27764884078860574, "grad_norm": 17.038942337036133, "learning_rate": 1.110470513509552e-06, "loss": 0.2056, "num_input_tokens_seen": 7665280, "step": 11365 }, { "epoch": 0.27777099162045293, "grad_norm": 29.278806686401367, "learning_rate": 1.1109591049005715e-06, "loss": 0.09, "num_input_tokens_seen": 7668672, "step": 11370 }, { "epoch": 0.2778931424523001, "grad_norm": 30.497360229492188, "learning_rate": 1.1114476962915914e-06, "loss": 0.1548, "num_input_tokens_seen": 7671872, "step": 11375 }, { "epoch": 0.27801529328414726, "grad_norm": 0.15975019335746765, "learning_rate": 1.111936287682611e-06, "loss": 0.08, "num_input_tokens_seen": 7675072, "step": 11380 }, { "epoch": 0.27813744411599445, "grad_norm": 46.03263854980469, "learning_rate": 1.1124248790736305e-06, "loss": 0.089, "num_input_tokens_seen": 7677952, "step": 11385 }, { "epoch": 0.2782595949478416, "grad_norm": 26.398767471313477, "learning_rate": 1.1129134704646504e-06, "loss": 0.0878, "num_input_tokens_seen": 7681088, "step": 11390 }, { "epoch": 0.27838174577968877, "grad_norm": 46.81501770019531, "learning_rate": 1.11340206185567e-06, "loss": 0.0657, "num_input_tokens_seen": 7684288, "step": 11395 }, { "epoch": 0.2785038966115359, "grad_norm": 49.09954071044922, "learning_rate": 1.1138906532466898e-06, "loss": 0.0987, "num_input_tokens_seen": 7687360, "step": 11400 }, { "epoch": 0.2786260474433831, "grad_norm": 25.029376983642578, "learning_rate": 1.1143792446377095e-06, "loss": 0.1251, "num_input_tokens_seen": 7690432, "step": 11405 }, { "epoch": 0.27874819827523023, "grad_norm": 40.152469635009766, "learning_rate": 1.1148678360287291e-06, "loss": 0.1636, "num_input_tokens_seen": 7693888, "step": 11410 }, { "epoch": 0.2788703491070774, "grad_norm": 42.61248779296875, "learning_rate": 1.1153564274197488e-06, "loss": 0.1634, "num_input_tokens_seen": 7697664, "step": 11415 }, { "epoch": 0.2789924999389246, "grad_norm": 27.95641326904297, "learning_rate": 1.1158450188107685e-06, "loss": 0.136, "num_input_tokens_seen": 7700800, "step": 11420 }, { "epoch": 0.27911465077077174, "grad_norm": 23.90423583984375, "learning_rate": 1.1163336102017882e-06, "loss": 0.0537, "num_input_tokens_seen": 7703680, "step": 11425 }, { "epoch": 0.27923680160261893, "grad_norm": 17.14470100402832, "learning_rate": 1.1168222015928079e-06, "loss": 0.2367, "num_input_tokens_seen": 7707456, "step": 11430 }, { "epoch": 0.27935895243446607, "grad_norm": 24.315845489501953, "learning_rate": 1.1173107929838276e-06, "loss": 0.1418, "num_input_tokens_seen": 7710720, "step": 11435 }, { "epoch": 0.27948110326631326, "grad_norm": 28.46575355529785, "learning_rate": 1.1177993843748472e-06, "loss": 0.2804, "num_input_tokens_seen": 7713792, "step": 11440 }, { "epoch": 0.2796032540981604, "grad_norm": 23.07647705078125, "learning_rate": 1.118287975765867e-06, "loss": 0.1272, "num_input_tokens_seen": 7716736, "step": 11445 }, { "epoch": 0.2797254049300076, "grad_norm": 17.141408920288086, "learning_rate": 1.1187765671568868e-06, "loss": 0.1038, "num_input_tokens_seen": 7720000, "step": 11450 }, { "epoch": 0.2798475557618547, "grad_norm": 13.484521865844727, "learning_rate": 1.1192651585479063e-06, "loss": 0.1069, "num_input_tokens_seen": 7723456, "step": 11455 }, { "epoch": 0.2799697065937019, "grad_norm": 14.077258110046387, "learning_rate": 1.119753749938926e-06, "loss": 0.0691, "num_input_tokens_seen": 7726464, "step": 11460 }, { "epoch": 0.2800918574255491, "grad_norm": 11.436298370361328, "learning_rate": 1.1202423413299459e-06, "loss": 0.1096, "num_input_tokens_seen": 7729920, "step": 11465 }, { "epoch": 0.28021400825739623, "grad_norm": 25.3713436126709, "learning_rate": 1.1207309327209653e-06, "loss": 0.0664, "num_input_tokens_seen": 7733184, "step": 11470 }, { "epoch": 0.2803361590892434, "grad_norm": 14.680299758911133, "learning_rate": 1.121219524111985e-06, "loss": 0.11, "num_input_tokens_seen": 7736384, "step": 11475 }, { "epoch": 0.28045830992109055, "grad_norm": 0.4156033396720886, "learning_rate": 1.121708115503005e-06, "loss": 0.1157, "num_input_tokens_seen": 7740096, "step": 11480 }, { "epoch": 0.28058046075293774, "grad_norm": 15.260326385498047, "learning_rate": 1.1221967068940246e-06, "loss": 0.0868, "num_input_tokens_seen": 7743744, "step": 11485 }, { "epoch": 0.2807026115847849, "grad_norm": 30.606847763061523, "learning_rate": 1.122685298285044e-06, "loss": 0.1729, "num_input_tokens_seen": 7747264, "step": 11490 }, { "epoch": 0.28082476241663207, "grad_norm": 0.62032550573349, "learning_rate": 1.123173889676064e-06, "loss": 0.1499, "num_input_tokens_seen": 7750400, "step": 11495 }, { "epoch": 0.2809469132484792, "grad_norm": 0.2940658926963806, "learning_rate": 1.1236624810670836e-06, "loss": 0.1297, "num_input_tokens_seen": 7753600, "step": 11500 }, { "epoch": 0.2810690640803264, "grad_norm": 11.697123527526855, "learning_rate": 1.124151072458103e-06, "loss": 0.1523, "num_input_tokens_seen": 7756928, "step": 11505 }, { "epoch": 0.2811912149121735, "grad_norm": 21.05707359313965, "learning_rate": 1.124639663849123e-06, "loss": 0.1703, "num_input_tokens_seen": 7759872, "step": 11510 }, { "epoch": 0.2813133657440207, "grad_norm": 3.735363721847534, "learning_rate": 1.1251282552401427e-06, "loss": 0.0938, "num_input_tokens_seen": 7763584, "step": 11515 }, { "epoch": 0.2814355165758679, "grad_norm": 3.861677646636963, "learning_rate": 1.1256168466311623e-06, "loss": 0.1084, "num_input_tokens_seen": 7766720, "step": 11520 }, { "epoch": 0.28155766740771504, "grad_norm": 27.23649024963379, "learning_rate": 1.126105438022182e-06, "loss": 0.2459, "num_input_tokens_seen": 7769856, "step": 11525 }, { "epoch": 0.28167981823956223, "grad_norm": 3.6670424938201904, "learning_rate": 1.1265940294132017e-06, "loss": 0.1084, "num_input_tokens_seen": 7772992, "step": 11530 }, { "epoch": 0.28180196907140936, "grad_norm": 3.219539165496826, "learning_rate": 1.1270826208042214e-06, "loss": 0.1102, "num_input_tokens_seen": 7776320, "step": 11535 }, { "epoch": 0.28192411990325655, "grad_norm": 18.450578689575195, "learning_rate": 1.127571212195241e-06, "loss": 0.1183, "num_input_tokens_seen": 7779520, "step": 11540 }, { "epoch": 0.2820462707351037, "grad_norm": 4.513743877410889, "learning_rate": 1.1280598035862608e-06, "loss": 0.0264, "num_input_tokens_seen": 7783296, "step": 11545 }, { "epoch": 0.2821684215669509, "grad_norm": 18.675949096679688, "learning_rate": 1.1285483949772804e-06, "loss": 0.0725, "num_input_tokens_seen": 7786304, "step": 11550 }, { "epoch": 0.282290572398798, "grad_norm": 18.002805709838867, "learning_rate": 1.1290369863683003e-06, "loss": 0.0659, "num_input_tokens_seen": 7790592, "step": 11555 }, { "epoch": 0.2824127232306452, "grad_norm": 3.769627094268799, "learning_rate": 1.1295255777593198e-06, "loss": 0.2091, "num_input_tokens_seen": 7794432, "step": 11560 }, { "epoch": 0.2825348740624924, "grad_norm": 33.11570739746094, "learning_rate": 1.1300141691503395e-06, "loss": 0.056, "num_input_tokens_seen": 7797888, "step": 11565 }, { "epoch": 0.2826570248943395, "grad_norm": 0.7942283749580383, "learning_rate": 1.1305027605413594e-06, "loss": 0.1011, "num_input_tokens_seen": 7800896, "step": 11570 }, { "epoch": 0.2827791757261867, "grad_norm": 25.094234466552734, "learning_rate": 1.1309913519323788e-06, "loss": 0.144, "num_input_tokens_seen": 7803904, "step": 11575 }, { "epoch": 0.28290132655803385, "grad_norm": 0.5091480016708374, "learning_rate": 1.1314799433233985e-06, "loss": 0.0422, "num_input_tokens_seen": 7807360, "step": 11580 }, { "epoch": 0.28302347738988104, "grad_norm": 34.18690490722656, "learning_rate": 1.1319685347144184e-06, "loss": 0.1381, "num_input_tokens_seen": 7810560, "step": 11585 }, { "epoch": 0.2831456282217282, "grad_norm": 3.1453356742858887, "learning_rate": 1.1324571261054379e-06, "loss": 0.0882, "num_input_tokens_seen": 7814400, "step": 11590 }, { "epoch": 0.28326777905357536, "grad_norm": 62.70692443847656, "learning_rate": 1.1329457174964576e-06, "loss": 0.1955, "num_input_tokens_seen": 7817728, "step": 11595 }, { "epoch": 0.2833899298854225, "grad_norm": 1.02499520778656, "learning_rate": 1.1334343088874775e-06, "loss": 0.0308, "num_input_tokens_seen": 7820928, "step": 11600 }, { "epoch": 0.2835120807172697, "grad_norm": 11.048792839050293, "learning_rate": 1.1339229002784971e-06, "loss": 0.1583, "num_input_tokens_seen": 7824128, "step": 11605 }, { "epoch": 0.2836342315491168, "grad_norm": 36.67851257324219, "learning_rate": 1.1344114916695166e-06, "loss": 0.2019, "num_input_tokens_seen": 7827712, "step": 11610 }, { "epoch": 0.283756382380964, "grad_norm": 96.36241149902344, "learning_rate": 1.1349000830605365e-06, "loss": 0.097, "num_input_tokens_seen": 7830720, "step": 11615 }, { "epoch": 0.2838785332128112, "grad_norm": 14.921711921691895, "learning_rate": 1.1353886744515562e-06, "loss": 0.065, "num_input_tokens_seen": 7834816, "step": 11620 }, { "epoch": 0.28400068404465834, "grad_norm": 1.4795444011688232, "learning_rate": 1.1358772658425757e-06, "loss": 0.1565, "num_input_tokens_seen": 7838080, "step": 11625 }, { "epoch": 0.2841228348765055, "grad_norm": 12.521466255187988, "learning_rate": 1.1363658572335956e-06, "loss": 0.0938, "num_input_tokens_seen": 7841280, "step": 11630 }, { "epoch": 0.28424498570835266, "grad_norm": 15.332459449768066, "learning_rate": 1.1368544486246152e-06, "loss": 0.1394, "num_input_tokens_seen": 7844224, "step": 11635 }, { "epoch": 0.28436713654019985, "grad_norm": 10.226259231567383, "learning_rate": 1.137343040015635e-06, "loss": 0.1112, "num_input_tokens_seen": 7847488, "step": 11640 }, { "epoch": 0.284489287372047, "grad_norm": 33.42863082885742, "learning_rate": 1.1378316314066546e-06, "loss": 0.146, "num_input_tokens_seen": 7850816, "step": 11645 }, { "epoch": 0.2846114382038942, "grad_norm": 29.166101455688477, "learning_rate": 1.1383202227976743e-06, "loss": 0.1581, "num_input_tokens_seen": 7853952, "step": 11650 }, { "epoch": 0.2847335890357413, "grad_norm": 11.100362777709961, "learning_rate": 1.138808814188694e-06, "loss": 0.1123, "num_input_tokens_seen": 7857344, "step": 11655 }, { "epoch": 0.2848557398675885, "grad_norm": 27.956087112426758, "learning_rate": 1.1392974055797136e-06, "loss": 0.0412, "num_input_tokens_seen": 7860928, "step": 11660 }, { "epoch": 0.2849778906994357, "grad_norm": 45.29127502441406, "learning_rate": 1.1397859969707333e-06, "loss": 0.126, "num_input_tokens_seen": 7865024, "step": 11665 }, { "epoch": 0.2851000415312828, "grad_norm": 3.9593372344970703, "learning_rate": 1.140274588361753e-06, "loss": 0.0255, "num_input_tokens_seen": 7868672, "step": 11670 }, { "epoch": 0.28522219236313, "grad_norm": 12.407175064086914, "learning_rate": 1.1407631797527729e-06, "loss": 0.1446, "num_input_tokens_seen": 7871680, "step": 11675 }, { "epoch": 0.28534434319497715, "grad_norm": 23.719539642333984, "learning_rate": 1.1412517711437924e-06, "loss": 0.1934, "num_input_tokens_seen": 7875136, "step": 11680 }, { "epoch": 0.28546649402682434, "grad_norm": 25.424650192260742, "learning_rate": 1.141740362534812e-06, "loss": 0.1407, "num_input_tokens_seen": 7878528, "step": 11685 }, { "epoch": 0.28558864485867147, "grad_norm": 30.113014221191406, "learning_rate": 1.142228953925832e-06, "loss": 0.0432, "num_input_tokens_seen": 7881984, "step": 11690 }, { "epoch": 0.28571079569051866, "grad_norm": 62.609764099121094, "learning_rate": 1.1427175453168514e-06, "loss": 0.0923, "num_input_tokens_seen": 7885184, "step": 11695 }, { "epoch": 0.2858329465223658, "grad_norm": 19.35966682434082, "learning_rate": 1.143206136707871e-06, "loss": 0.2471, "num_input_tokens_seen": 7889088, "step": 11700 }, { "epoch": 0.285955097354213, "grad_norm": 1.5210208892822266, "learning_rate": 1.143694728098891e-06, "loss": 0.0819, "num_input_tokens_seen": 7892928, "step": 11705 }, { "epoch": 0.2860772481860602, "grad_norm": 25.11351776123047, "learning_rate": 1.1441833194899105e-06, "loss": 0.0932, "num_input_tokens_seen": 7896320, "step": 11710 }, { "epoch": 0.2861993990179073, "grad_norm": 22.904691696166992, "learning_rate": 1.1446719108809301e-06, "loss": 0.2197, "num_input_tokens_seen": 7899776, "step": 11715 }, { "epoch": 0.2863215498497545, "grad_norm": 3.6096725463867188, "learning_rate": 1.14516050227195e-06, "loss": 0.0535, "num_input_tokens_seen": 7903424, "step": 11720 }, { "epoch": 0.28644370068160163, "grad_norm": 11.384994506835938, "learning_rate": 1.1456490936629697e-06, "loss": 0.1282, "num_input_tokens_seen": 7906432, "step": 11725 }, { "epoch": 0.2865658515134488, "grad_norm": 11.53609561920166, "learning_rate": 1.1461376850539892e-06, "loss": 0.0956, "num_input_tokens_seen": 7909760, "step": 11730 }, { "epoch": 0.28668800234529596, "grad_norm": 20.164472579956055, "learning_rate": 1.146626276445009e-06, "loss": 0.1069, "num_input_tokens_seen": 7912960, "step": 11735 }, { "epoch": 0.28681015317714315, "grad_norm": 32.91118621826172, "learning_rate": 1.1471148678360288e-06, "loss": 0.1146, "num_input_tokens_seen": 7916800, "step": 11740 }, { "epoch": 0.2869323040089903, "grad_norm": 17.6117000579834, "learning_rate": 1.1476034592270482e-06, "loss": 0.1474, "num_input_tokens_seen": 7919808, "step": 11745 }, { "epoch": 0.28705445484083747, "grad_norm": 23.570358276367188, "learning_rate": 1.1480920506180681e-06, "loss": 0.0573, "num_input_tokens_seen": 7923200, "step": 11750 }, { "epoch": 0.2871766056726846, "grad_norm": 6.977721691131592, "learning_rate": 1.1485806420090878e-06, "loss": 0.214, "num_input_tokens_seen": 7926208, "step": 11755 }, { "epoch": 0.2872987565045318, "grad_norm": 5.850998401641846, "learning_rate": 1.1490692334001075e-06, "loss": 0.0956, "num_input_tokens_seen": 7929664, "step": 11760 }, { "epoch": 0.287420907336379, "grad_norm": 26.8560791015625, "learning_rate": 1.1495578247911272e-06, "loss": 0.0706, "num_input_tokens_seen": 7933312, "step": 11765 }, { "epoch": 0.2875430581682261, "grad_norm": 8.291579246520996, "learning_rate": 1.1500464161821468e-06, "loss": 0.2371, "num_input_tokens_seen": 7936896, "step": 11770 }, { "epoch": 0.2876652090000733, "grad_norm": 23.933063507080078, "learning_rate": 1.1505350075731665e-06, "loss": 0.1282, "num_input_tokens_seen": 7940672, "step": 11775 }, { "epoch": 0.28778735983192044, "grad_norm": 3.920012950897217, "learning_rate": 1.1510235989641862e-06, "loss": 0.1103, "num_input_tokens_seen": 7944064, "step": 11780 }, { "epoch": 0.28790951066376763, "grad_norm": 31.8831844329834, "learning_rate": 1.1515121903552059e-06, "loss": 0.0834, "num_input_tokens_seen": 7947456, "step": 11785 }, { "epoch": 0.28803166149561477, "grad_norm": 20.2723445892334, "learning_rate": 1.1520007817462256e-06, "loss": 0.1646, "num_input_tokens_seen": 7950528, "step": 11790 }, { "epoch": 0.28815381232746196, "grad_norm": 24.096601486206055, "learning_rate": 1.1524893731372455e-06, "loss": 0.1808, "num_input_tokens_seen": 7953728, "step": 11795 }, { "epoch": 0.2882759631593091, "grad_norm": 25.648883819580078, "learning_rate": 1.152977964528265e-06, "loss": 0.0937, "num_input_tokens_seen": 7957248, "step": 11800 }, { "epoch": 0.2883981139911563, "grad_norm": 1.3611674308776855, "learning_rate": 1.1534665559192846e-06, "loss": 0.1189, "num_input_tokens_seen": 7960320, "step": 11805 }, { "epoch": 0.28852026482300347, "grad_norm": 14.197624206542969, "learning_rate": 1.1539551473103045e-06, "loss": 0.1614, "num_input_tokens_seen": 7963264, "step": 11810 }, { "epoch": 0.2886424156548506, "grad_norm": 27.47894859313965, "learning_rate": 1.154443738701324e-06, "loss": 0.1142, "num_input_tokens_seen": 7966464, "step": 11815 }, { "epoch": 0.2887645664866978, "grad_norm": 24.667394638061523, "learning_rate": 1.1549323300923437e-06, "loss": 0.1608, "num_input_tokens_seen": 7970240, "step": 11820 }, { "epoch": 0.2888867173185449, "grad_norm": 36.73628234863281, "learning_rate": 1.1554209214833635e-06, "loss": 0.2324, "num_input_tokens_seen": 7973632, "step": 11825 }, { "epoch": 0.2890088681503921, "grad_norm": 18.555866241455078, "learning_rate": 1.1559095128743832e-06, "loss": 0.1112, "num_input_tokens_seen": 7976832, "step": 11830 }, { "epoch": 0.28913101898223925, "grad_norm": 49.42720031738281, "learning_rate": 1.1563981042654027e-06, "loss": 0.0498, "num_input_tokens_seen": 7981120, "step": 11835 }, { "epoch": 0.28925316981408644, "grad_norm": 54.10476303100586, "learning_rate": 1.1568866956564226e-06, "loss": 0.127, "num_input_tokens_seen": 7984320, "step": 11840 }, { "epoch": 0.2893753206459336, "grad_norm": 1.2627156972885132, "learning_rate": 1.1573752870474423e-06, "loss": 0.0945, "num_input_tokens_seen": 7987712, "step": 11845 }, { "epoch": 0.28949747147778077, "grad_norm": 23.782079696655273, "learning_rate": 1.1578638784384617e-06, "loss": 0.2029, "num_input_tokens_seen": 7991104, "step": 11850 }, { "epoch": 0.28961962230962796, "grad_norm": 56.127593994140625, "learning_rate": 1.1583524698294816e-06, "loss": 0.1268, "num_input_tokens_seen": 7994112, "step": 11855 }, { "epoch": 0.2897417731414751, "grad_norm": 1.2978109121322632, "learning_rate": 1.1588410612205013e-06, "loss": 0.1884, "num_input_tokens_seen": 7997568, "step": 11860 }, { "epoch": 0.2898639239733223, "grad_norm": 27.923282623291016, "learning_rate": 1.1593296526115208e-06, "loss": 0.1154, "num_input_tokens_seen": 8000832, "step": 11865 }, { "epoch": 0.2899860748051694, "grad_norm": 33.25914001464844, "learning_rate": 1.1598182440025407e-06, "loss": 0.0996, "num_input_tokens_seen": 8004096, "step": 11870 }, { "epoch": 0.2901082256370166, "grad_norm": 14.677995681762695, "learning_rate": 1.1603068353935604e-06, "loss": 0.0903, "num_input_tokens_seen": 8007040, "step": 11875 }, { "epoch": 0.29023037646886374, "grad_norm": 47.54405212402344, "learning_rate": 1.16079542678458e-06, "loss": 0.1717, "num_input_tokens_seen": 8009984, "step": 11880 }, { "epoch": 0.2903525273007109, "grad_norm": 93.65504455566406, "learning_rate": 1.1612840181755997e-06, "loss": 0.1056, "num_input_tokens_seen": 8013312, "step": 11885 }, { "epoch": 0.29047467813255806, "grad_norm": 0.011830669827759266, "learning_rate": 1.1617726095666194e-06, "loss": 0.0957, "num_input_tokens_seen": 8016640, "step": 11890 }, { "epoch": 0.29059682896440525, "grad_norm": 2.5089898109436035, "learning_rate": 1.162261200957639e-06, "loss": 0.107, "num_input_tokens_seen": 8020096, "step": 11895 }, { "epoch": 0.2907189797962524, "grad_norm": 28.145421981811523, "learning_rate": 1.1627497923486588e-06, "loss": 0.0793, "num_input_tokens_seen": 8024000, "step": 11900 }, { "epoch": 0.2908411306280996, "grad_norm": 60.22999572753906, "learning_rate": 1.1632383837396784e-06, "loss": 0.0996, "num_input_tokens_seen": 8027520, "step": 11905 }, { "epoch": 0.29096328145994677, "grad_norm": 11.291279792785645, "learning_rate": 1.1637269751306981e-06, "loss": 0.0543, "num_input_tokens_seen": 8031232, "step": 11910 }, { "epoch": 0.2910854322917939, "grad_norm": 49.44854736328125, "learning_rate": 1.164215566521718e-06, "loss": 0.1778, "num_input_tokens_seen": 8034368, "step": 11915 }, { "epoch": 0.2912075831236411, "grad_norm": 34.739707946777344, "learning_rate": 1.1647041579127375e-06, "loss": 0.1668, "num_input_tokens_seen": 8037440, "step": 11920 }, { "epoch": 0.2913297339554882, "grad_norm": 10.186627388000488, "learning_rate": 1.1651927493037572e-06, "loss": 0.2086, "num_input_tokens_seen": 8041152, "step": 11925 }, { "epoch": 0.2914518847873354, "grad_norm": 14.452888488769531, "learning_rate": 1.165681340694777e-06, "loss": 0.1015, "num_input_tokens_seen": 8044608, "step": 11930 }, { "epoch": 0.29157403561918255, "grad_norm": 1.4567221403121948, "learning_rate": 1.1661699320857965e-06, "loss": 0.0934, "num_input_tokens_seen": 8048064, "step": 11935 }, { "epoch": 0.29169618645102974, "grad_norm": 22.50691032409668, "learning_rate": 1.1666585234768162e-06, "loss": 0.1407, "num_input_tokens_seen": 8051392, "step": 11940 }, { "epoch": 0.29181833728287687, "grad_norm": 32.28067398071289, "learning_rate": 1.1671471148678361e-06, "loss": 0.1593, "num_input_tokens_seen": 8054656, "step": 11945 }, { "epoch": 0.29194048811472406, "grad_norm": 20.15083122253418, "learning_rate": 1.1676357062588558e-06, "loss": 0.0877, "num_input_tokens_seen": 8058176, "step": 11950 }, { "epoch": 0.29206263894657125, "grad_norm": 20.036026000976562, "learning_rate": 1.1681242976498753e-06, "loss": 0.0686, "num_input_tokens_seen": 8062208, "step": 11955 }, { "epoch": 0.2921847897784184, "grad_norm": 11.90791130065918, "learning_rate": 1.1686128890408952e-06, "loss": 0.1233, "num_input_tokens_seen": 8066560, "step": 11960 }, { "epoch": 0.2923069406102656, "grad_norm": 10.932024002075195, "learning_rate": 1.1691014804319148e-06, "loss": 0.1607, "num_input_tokens_seen": 8069824, "step": 11965 }, { "epoch": 0.2924290914421127, "grad_norm": 26.350589752197266, "learning_rate": 1.1695900718229343e-06, "loss": 0.2533, "num_input_tokens_seen": 8073216, "step": 11970 }, { "epoch": 0.2925512422739599, "grad_norm": 2.1203091144561768, "learning_rate": 1.1700786632139542e-06, "loss": 0.104, "num_input_tokens_seen": 8076416, "step": 11975 }, { "epoch": 0.29267339310580703, "grad_norm": 9.464871406555176, "learning_rate": 1.1705672546049739e-06, "loss": 0.0888, "num_input_tokens_seen": 8079680, "step": 11980 }, { "epoch": 0.2927955439376542, "grad_norm": 27.66861915588379, "learning_rate": 1.1710558459959936e-06, "loss": 0.1051, "num_input_tokens_seen": 8083136, "step": 11985 }, { "epoch": 0.29291769476950136, "grad_norm": 1.5247504711151123, "learning_rate": 1.1715444373870132e-06, "loss": 0.1162, "num_input_tokens_seen": 8086336, "step": 11990 }, { "epoch": 0.29303984560134855, "grad_norm": 11.192178726196289, "learning_rate": 1.172033028778033e-06, "loss": 0.1732, "num_input_tokens_seen": 8089536, "step": 11995 }, { "epoch": 0.29316199643319574, "grad_norm": 8.231268882751465, "learning_rate": 1.1725216201690526e-06, "loss": 0.1816, "num_input_tokens_seen": 8092928, "step": 12000 }, { "epoch": 0.29328414726504287, "grad_norm": 13.495226860046387, "learning_rate": 1.1730102115600723e-06, "loss": 0.0593, "num_input_tokens_seen": 8095872, "step": 12005 }, { "epoch": 0.29340629809689006, "grad_norm": 11.1940336227417, "learning_rate": 1.173498802951092e-06, "loss": 0.1083, "num_input_tokens_seen": 8099008, "step": 12010 }, { "epoch": 0.2935284489287372, "grad_norm": 10.936391830444336, "learning_rate": 1.1739873943421116e-06, "loss": 0.07, "num_input_tokens_seen": 8102208, "step": 12015 }, { "epoch": 0.2936505997605844, "grad_norm": 26.182369232177734, "learning_rate": 1.1744759857331313e-06, "loss": 0.094, "num_input_tokens_seen": 8105472, "step": 12020 }, { "epoch": 0.2937727505924315, "grad_norm": 21.47533416748047, "learning_rate": 1.174964577124151e-06, "loss": 0.1558, "num_input_tokens_seen": 8108928, "step": 12025 }, { "epoch": 0.2938949014242787, "grad_norm": 9.534170150756836, "learning_rate": 1.1754531685151707e-06, "loss": 0.0828, "num_input_tokens_seen": 8112320, "step": 12030 }, { "epoch": 0.29401705225612584, "grad_norm": 44.23080825805664, "learning_rate": 1.1759417599061906e-06, "loss": 0.201, "num_input_tokens_seen": 8115776, "step": 12035 }, { "epoch": 0.29413920308797303, "grad_norm": 16.109403610229492, "learning_rate": 1.17643035129721e-06, "loss": 0.1382, "num_input_tokens_seen": 8118784, "step": 12040 }, { "epoch": 0.29426135391982017, "grad_norm": 9.764286994934082, "learning_rate": 1.1769189426882297e-06, "loss": 0.1141, "num_input_tokens_seen": 8122368, "step": 12045 }, { "epoch": 0.29438350475166736, "grad_norm": 21.418296813964844, "learning_rate": 1.1774075340792496e-06, "loss": 0.1614, "num_input_tokens_seen": 8125568, "step": 12050 }, { "epoch": 0.29450565558351455, "grad_norm": 4.4351677894592285, "learning_rate": 1.177896125470269e-06, "loss": 0.1131, "num_input_tokens_seen": 8128768, "step": 12055 }, { "epoch": 0.2946278064153617, "grad_norm": 19.241079330444336, "learning_rate": 1.1783847168612888e-06, "loss": 0.112, "num_input_tokens_seen": 8132096, "step": 12060 }, { "epoch": 0.29474995724720887, "grad_norm": 35.76866912841797, "learning_rate": 1.1788733082523087e-06, "loss": 0.0945, "num_input_tokens_seen": 8135360, "step": 12065 }, { "epoch": 0.294872108079056, "grad_norm": 23.178844451904297, "learning_rate": 1.1793618996433284e-06, "loss": 0.1098, "num_input_tokens_seen": 8138560, "step": 12070 }, { "epoch": 0.2949942589109032, "grad_norm": 4.689840316772461, "learning_rate": 1.1798504910343478e-06, "loss": 0.0424, "num_input_tokens_seen": 8141952, "step": 12075 }, { "epoch": 0.29511640974275033, "grad_norm": 23.47108268737793, "learning_rate": 1.1803390824253677e-06, "loss": 0.0183, "num_input_tokens_seen": 8145472, "step": 12080 }, { "epoch": 0.2952385605745975, "grad_norm": 45.8759651184082, "learning_rate": 1.1808276738163874e-06, "loss": 0.1725, "num_input_tokens_seen": 8148544, "step": 12085 }, { "epoch": 0.29536071140644465, "grad_norm": 66.73482513427734, "learning_rate": 1.1813162652074069e-06, "loss": 0.1385, "num_input_tokens_seen": 8152192, "step": 12090 }, { "epoch": 0.29548286223829184, "grad_norm": 86.15653991699219, "learning_rate": 1.1818048565984268e-06, "loss": 0.2717, "num_input_tokens_seen": 8155008, "step": 12095 }, { "epoch": 0.29560501307013903, "grad_norm": 72.50959777832031, "learning_rate": 1.1822934479894464e-06, "loss": 0.2778, "num_input_tokens_seen": 8158592, "step": 12100 }, { "epoch": 0.29572716390198617, "grad_norm": 25.257177352905273, "learning_rate": 1.1827820393804661e-06, "loss": 0.109, "num_input_tokens_seen": 8162048, "step": 12105 }, { "epoch": 0.29584931473383336, "grad_norm": 18.258644104003906, "learning_rate": 1.1832706307714858e-06, "loss": 0.2265, "num_input_tokens_seen": 8165312, "step": 12110 }, { "epoch": 0.2959714655656805, "grad_norm": 35.079010009765625, "learning_rate": 1.1837592221625055e-06, "loss": 0.0558, "num_input_tokens_seen": 8168768, "step": 12115 }, { "epoch": 0.2960936163975277, "grad_norm": 23.954132080078125, "learning_rate": 1.1842478135535252e-06, "loss": 0.2307, "num_input_tokens_seen": 8172288, "step": 12120 }, { "epoch": 0.2962157672293748, "grad_norm": 24.541086196899414, "learning_rate": 1.1847364049445448e-06, "loss": 0.13, "num_input_tokens_seen": 8175808, "step": 12125 }, { "epoch": 0.296337918061222, "grad_norm": 54.54975128173828, "learning_rate": 1.1852249963355645e-06, "loss": 0.1947, "num_input_tokens_seen": 8179136, "step": 12130 }, { "epoch": 0.29646006889306914, "grad_norm": 21.549692153930664, "learning_rate": 1.1857135877265842e-06, "loss": 0.2153, "num_input_tokens_seen": 8182528, "step": 12135 }, { "epoch": 0.29658221972491633, "grad_norm": 15.662209510803223, "learning_rate": 1.1862021791176037e-06, "loss": 0.1133, "num_input_tokens_seen": 8185536, "step": 12140 }, { "epoch": 0.2967043705567635, "grad_norm": 14.91734790802002, "learning_rate": 1.1866907705086236e-06, "loss": 0.0854, "num_input_tokens_seen": 8188864, "step": 12145 }, { "epoch": 0.29682652138861065, "grad_norm": 3.287745475769043, "learning_rate": 1.1871793618996433e-06, "loss": 0.086, "num_input_tokens_seen": 8192192, "step": 12150 }, { "epoch": 0.29694867222045784, "grad_norm": 0.6978838443756104, "learning_rate": 1.1876679532906631e-06, "loss": 0.1689, "num_input_tokens_seen": 8195136, "step": 12155 }, { "epoch": 0.297070823052305, "grad_norm": 1.291856288909912, "learning_rate": 1.1881565446816826e-06, "loss": 0.1316, "num_input_tokens_seen": 8198592, "step": 12160 }, { "epoch": 0.29719297388415217, "grad_norm": 15.981237411499023, "learning_rate": 1.1886451360727023e-06, "loss": 0.1468, "num_input_tokens_seen": 8201920, "step": 12165 }, { "epoch": 0.2973151247159993, "grad_norm": 8.645312309265137, "learning_rate": 1.1891337274637222e-06, "loss": 0.1071, "num_input_tokens_seen": 8205824, "step": 12170 }, { "epoch": 0.2974372755478465, "grad_norm": 0.7097776532173157, "learning_rate": 1.1896223188547417e-06, "loss": 0.1096, "num_input_tokens_seen": 8209216, "step": 12175 }, { "epoch": 0.2975594263796936, "grad_norm": 10.166160583496094, "learning_rate": 1.1901109102457613e-06, "loss": 0.1757, "num_input_tokens_seen": 8212672, "step": 12180 }, { "epoch": 0.2976815772115408, "grad_norm": 38.063968658447266, "learning_rate": 1.1905995016367812e-06, "loss": 0.1019, "num_input_tokens_seen": 8215744, "step": 12185 }, { "epoch": 0.29780372804338795, "grad_norm": 16.18952178955078, "learning_rate": 1.191088093027801e-06, "loss": 0.1574, "num_input_tokens_seen": 8219200, "step": 12190 }, { "epoch": 0.29792587887523514, "grad_norm": 30.04621124267578, "learning_rate": 1.1915766844188204e-06, "loss": 0.0352, "num_input_tokens_seen": 8222784, "step": 12195 }, { "epoch": 0.29804802970708233, "grad_norm": 29.685110092163086, "learning_rate": 1.1920652758098403e-06, "loss": 0.1086, "num_input_tokens_seen": 8226368, "step": 12200 }, { "epoch": 0.29817018053892946, "grad_norm": 13.526518821716309, "learning_rate": 1.19255386720086e-06, "loss": 0.194, "num_input_tokens_seen": 8229440, "step": 12205 }, { "epoch": 0.29829233137077665, "grad_norm": 32.48648452758789, "learning_rate": 1.1930424585918794e-06, "loss": 0.0659, "num_input_tokens_seen": 8232768, "step": 12210 }, { "epoch": 0.2984144822026238, "grad_norm": 30.82636833190918, "learning_rate": 1.1935310499828993e-06, "loss": 0.3557, "num_input_tokens_seen": 8235968, "step": 12215 }, { "epoch": 0.298536633034471, "grad_norm": 25.68510627746582, "learning_rate": 1.194019641373919e-06, "loss": 0.1073, "num_input_tokens_seen": 8239040, "step": 12220 }, { "epoch": 0.2986587838663181, "grad_norm": 27.209468841552734, "learning_rate": 1.1945082327649387e-06, "loss": 0.0733, "num_input_tokens_seen": 8242112, "step": 12225 }, { "epoch": 0.2987809346981653, "grad_norm": 18.996612548828125, "learning_rate": 1.1949968241559582e-06, "loss": 0.0708, "num_input_tokens_seen": 8245888, "step": 12230 }, { "epoch": 0.29890308553001244, "grad_norm": 34.09287643432617, "learning_rate": 1.195485415546978e-06, "loss": 0.0598, "num_input_tokens_seen": 8249344, "step": 12235 }, { "epoch": 0.2990252363618596, "grad_norm": 34.00294494628906, "learning_rate": 1.1959740069379977e-06, "loss": 0.1704, "num_input_tokens_seen": 8252992, "step": 12240 }, { "epoch": 0.2991473871937068, "grad_norm": 4.011249542236328, "learning_rate": 1.1964625983290172e-06, "loss": 0.0935, "num_input_tokens_seen": 8256832, "step": 12245 }, { "epoch": 0.29926953802555395, "grad_norm": 50.23273849487305, "learning_rate": 1.196951189720037e-06, "loss": 0.0363, "num_input_tokens_seen": 8260352, "step": 12250 }, { "epoch": 0.29939168885740114, "grad_norm": 55.2866096496582, "learning_rate": 1.1974397811110568e-06, "loss": 0.0674, "num_input_tokens_seen": 8263680, "step": 12255 }, { "epoch": 0.2995138396892483, "grad_norm": 126.1909408569336, "learning_rate": 1.1979283725020767e-06, "loss": 0.1637, "num_input_tokens_seen": 8266944, "step": 12260 }, { "epoch": 0.29963599052109546, "grad_norm": 0.3145304024219513, "learning_rate": 1.1984169638930961e-06, "loss": 0.2747, "num_input_tokens_seen": 8269952, "step": 12265 }, { "epoch": 0.2997581413529426, "grad_norm": 51.68240737915039, "learning_rate": 1.1989055552841158e-06, "loss": 0.2082, "num_input_tokens_seen": 8273472, "step": 12270 }, { "epoch": 0.2998802921847898, "grad_norm": 3.867333173751831, "learning_rate": 1.1993941466751357e-06, "loss": 0.2137, "num_input_tokens_seen": 8276416, "step": 12275 }, { "epoch": 0.3000024430166369, "grad_norm": 64.91502380371094, "learning_rate": 1.1998827380661552e-06, "loss": 0.0824, "num_input_tokens_seen": 8279488, "step": 12280 }, { "epoch": 0.3001245938484841, "grad_norm": 12.365072250366211, "learning_rate": 1.2003713294571749e-06, "loss": 0.2047, "num_input_tokens_seen": 8283072, "step": 12285 }, { "epoch": 0.30024674468033125, "grad_norm": 17.204029083251953, "learning_rate": 1.2008599208481948e-06, "loss": 0.1809, "num_input_tokens_seen": 8286208, "step": 12290 }, { "epoch": 0.30036889551217844, "grad_norm": 44.201969146728516, "learning_rate": 1.2013485122392142e-06, "loss": 0.2269, "num_input_tokens_seen": 8289216, "step": 12295 }, { "epoch": 0.3004910463440256, "grad_norm": 26.522666931152344, "learning_rate": 1.201837103630234e-06, "loss": 0.1067, "num_input_tokens_seen": 8292672, "step": 12300 }, { "epoch": 0.30061319717587276, "grad_norm": 15.294692993164062, "learning_rate": 1.2023256950212538e-06, "loss": 0.1393, "num_input_tokens_seen": 8296576, "step": 12305 }, { "epoch": 0.30073534800771995, "grad_norm": 2.377216339111328, "learning_rate": 1.2028142864122735e-06, "loss": 0.1101, "num_input_tokens_seen": 8299776, "step": 12310 }, { "epoch": 0.3008574988395671, "grad_norm": 3.2018935680389404, "learning_rate": 1.203302877803293e-06, "loss": 0.1248, "num_input_tokens_seen": 8303616, "step": 12315 }, { "epoch": 0.3009796496714143, "grad_norm": 1.4899169206619263, "learning_rate": 1.2037914691943128e-06, "loss": 0.044, "num_input_tokens_seen": 8306816, "step": 12320 }, { "epoch": 0.3011018005032614, "grad_norm": 15.029468536376953, "learning_rate": 1.2042800605853325e-06, "loss": 0.0833, "num_input_tokens_seen": 8310144, "step": 12325 }, { "epoch": 0.3012239513351086, "grad_norm": 18.200010299682617, "learning_rate": 1.204768651976352e-06, "loss": 0.0851, "num_input_tokens_seen": 8313408, "step": 12330 }, { "epoch": 0.30134610216695573, "grad_norm": 17.035367965698242, "learning_rate": 1.2052572433673717e-06, "loss": 0.0985, "num_input_tokens_seen": 8316608, "step": 12335 }, { "epoch": 0.3014682529988029, "grad_norm": 32.94978713989258, "learning_rate": 1.2057458347583916e-06, "loss": 0.1569, "num_input_tokens_seen": 8320576, "step": 12340 }, { "epoch": 0.3015904038306501, "grad_norm": 18.8249568939209, "learning_rate": 1.2062344261494112e-06, "loss": 0.1164, "num_input_tokens_seen": 8323520, "step": 12345 }, { "epoch": 0.30171255466249725, "grad_norm": 0.9230943322181702, "learning_rate": 1.2067230175404307e-06, "loss": 0.0494, "num_input_tokens_seen": 8327040, "step": 12350 }, { "epoch": 0.30183470549434444, "grad_norm": 18.419689178466797, "learning_rate": 1.2072116089314506e-06, "loss": 0.2087, "num_input_tokens_seen": 8330240, "step": 12355 }, { "epoch": 0.30195685632619157, "grad_norm": 5.805023670196533, "learning_rate": 1.2077002003224703e-06, "loss": 0.0641, "num_input_tokens_seen": 8333312, "step": 12360 }, { "epoch": 0.30207900715803876, "grad_norm": 10.554583549499512, "learning_rate": 1.2081887917134898e-06, "loss": 0.1007, "num_input_tokens_seen": 8336576, "step": 12365 }, { "epoch": 0.3022011579898859, "grad_norm": 25.36183738708496, "learning_rate": 1.2086773831045097e-06, "loss": 0.2068, "num_input_tokens_seen": 8340288, "step": 12370 }, { "epoch": 0.3023233088217331, "grad_norm": 39.68854904174805, "learning_rate": 1.2091659744955293e-06, "loss": 0.1953, "num_input_tokens_seen": 8343552, "step": 12375 }, { "epoch": 0.3024454596535802, "grad_norm": 1.862717628479004, "learning_rate": 1.2096545658865492e-06, "loss": 0.1275, "num_input_tokens_seen": 8347136, "step": 12380 }, { "epoch": 0.3025676104854274, "grad_norm": 34.478878021240234, "learning_rate": 1.2101431572775687e-06, "loss": 0.2043, "num_input_tokens_seen": 8351104, "step": 12385 }, { "epoch": 0.3026897613172746, "grad_norm": 5.264378070831299, "learning_rate": 1.2106317486685884e-06, "loss": 0.1308, "num_input_tokens_seen": 8354176, "step": 12390 }, { "epoch": 0.30281191214912173, "grad_norm": 2.541132688522339, "learning_rate": 1.2111203400596083e-06, "loss": 0.0783, "num_input_tokens_seen": 8357888, "step": 12395 }, { "epoch": 0.3029340629809689, "grad_norm": 18.312211990356445, "learning_rate": 1.2116089314506277e-06, "loss": 0.0698, "num_input_tokens_seen": 8361024, "step": 12400 }, { "epoch": 0.30305621381281606, "grad_norm": 18.090362548828125, "learning_rate": 1.2120975228416474e-06, "loss": 0.0714, "num_input_tokens_seen": 8364224, "step": 12405 }, { "epoch": 0.30317836464466325, "grad_norm": 23.060293197631836, "learning_rate": 1.2125861142326673e-06, "loss": 0.184, "num_input_tokens_seen": 8367296, "step": 12410 }, { "epoch": 0.3033005154765104, "grad_norm": 29.778749465942383, "learning_rate": 1.2130747056236868e-06, "loss": 0.1278, "num_input_tokens_seen": 8370304, "step": 12415 }, { "epoch": 0.30342266630835757, "grad_norm": 20.16764259338379, "learning_rate": 1.2135632970147065e-06, "loss": 0.1006, "num_input_tokens_seen": 8373504, "step": 12420 }, { "epoch": 0.3035448171402047, "grad_norm": 14.994250297546387, "learning_rate": 1.2140518884057261e-06, "loss": 0.1913, "num_input_tokens_seen": 8377024, "step": 12425 }, { "epoch": 0.3036669679720519, "grad_norm": 39.417724609375, "learning_rate": 1.214540479796746e-06, "loss": 0.1267, "num_input_tokens_seen": 8380160, "step": 12430 }, { "epoch": 0.303789118803899, "grad_norm": 20.88626480102539, "learning_rate": 1.2150290711877655e-06, "loss": 0.1016, "num_input_tokens_seen": 8383424, "step": 12435 }, { "epoch": 0.3039112696357462, "grad_norm": 4.2171149253845215, "learning_rate": 1.2155176625787852e-06, "loss": 0.0801, "num_input_tokens_seen": 8387072, "step": 12440 }, { "epoch": 0.3040334204675934, "grad_norm": 4.536027908325195, "learning_rate": 1.216006253969805e-06, "loss": 0.0782, "num_input_tokens_seen": 8390656, "step": 12445 }, { "epoch": 0.30415557129944054, "grad_norm": 1.3891936540603638, "learning_rate": 1.2164948453608246e-06, "loss": 0.0775, "num_input_tokens_seen": 8394304, "step": 12450 }, { "epoch": 0.30427772213128773, "grad_norm": 13.561484336853027, "learning_rate": 1.2169834367518442e-06, "loss": 0.243, "num_input_tokens_seen": 8397632, "step": 12455 }, { "epoch": 0.30439987296313487, "grad_norm": 21.17917823791504, "learning_rate": 1.2174720281428641e-06, "loss": 0.2416, "num_input_tokens_seen": 8401024, "step": 12460 }, { "epoch": 0.30452202379498206, "grad_norm": 0.6886555552482605, "learning_rate": 1.2179606195338838e-06, "loss": 0.0535, "num_input_tokens_seen": 8404416, "step": 12465 }, { "epoch": 0.3046441746268292, "grad_norm": 32.76136779785156, "learning_rate": 1.2184492109249033e-06, "loss": 0.151, "num_input_tokens_seen": 8408448, "step": 12470 }, { "epoch": 0.3047663254586764, "grad_norm": 1.4499430656433105, "learning_rate": 1.2189378023159232e-06, "loss": 0.1923, "num_input_tokens_seen": 8412416, "step": 12475 }, { "epoch": 0.3048884762905235, "grad_norm": 22.090356826782227, "learning_rate": 1.2194263937069429e-06, "loss": 0.1284, "num_input_tokens_seen": 8415296, "step": 12480 }, { "epoch": 0.3050106271223707, "grad_norm": 8.059381484985352, "learning_rate": 1.2199149850979623e-06, "loss": 0.0634, "num_input_tokens_seen": 8418240, "step": 12485 }, { "epoch": 0.3051327779542179, "grad_norm": 33.66912841796875, "learning_rate": 1.2204035764889822e-06, "loss": 0.108, "num_input_tokens_seen": 8421952, "step": 12490 }, { "epoch": 0.305254928786065, "grad_norm": 18.86580467224121, "learning_rate": 1.220892167880002e-06, "loss": 0.0815, "num_input_tokens_seen": 8425024, "step": 12495 }, { "epoch": 0.3053770796179122, "grad_norm": 32.913883209228516, "learning_rate": 1.2213807592710218e-06, "loss": 0.1205, "num_input_tokens_seen": 8428352, "step": 12500 }, { "epoch": 0.30549923044975935, "grad_norm": 0.14717574417591095, "learning_rate": 1.2218693506620413e-06, "loss": 0.0864, "num_input_tokens_seen": 8431488, "step": 12505 }, { "epoch": 0.30562138128160654, "grad_norm": 0.622601330280304, "learning_rate": 1.222357942053061e-06, "loss": 0.0892, "num_input_tokens_seen": 8434944, "step": 12510 }, { "epoch": 0.3057435321134537, "grad_norm": 0.17498765885829926, "learning_rate": 1.2228465334440806e-06, "loss": 0.0486, "num_input_tokens_seen": 8438272, "step": 12515 }, { "epoch": 0.30586568294530087, "grad_norm": 14.947955131530762, "learning_rate": 1.2233351248351003e-06, "loss": 0.1224, "num_input_tokens_seen": 8441792, "step": 12520 }, { "epoch": 0.305987833777148, "grad_norm": 8.9529447555542, "learning_rate": 1.22382371622612e-06, "loss": 0.2268, "num_input_tokens_seen": 8444864, "step": 12525 }, { "epoch": 0.3061099846089952, "grad_norm": 14.22401237487793, "learning_rate": 1.2243123076171397e-06, "loss": 0.107, "num_input_tokens_seen": 8448320, "step": 12530 }, { "epoch": 0.3062321354408424, "grad_norm": 32.58435821533203, "learning_rate": 1.2248008990081596e-06, "loss": 0.1659, "num_input_tokens_seen": 8451904, "step": 12535 }, { "epoch": 0.3063542862726895, "grad_norm": 42.00128173828125, "learning_rate": 1.225289490399179e-06, "loss": 0.1476, "num_input_tokens_seen": 8455552, "step": 12540 }, { "epoch": 0.3064764371045367, "grad_norm": 0.3023090064525604, "learning_rate": 1.2257780817901987e-06, "loss": 0.041, "num_input_tokens_seen": 8458752, "step": 12545 }, { "epoch": 0.30659858793638384, "grad_norm": 30.7508487701416, "learning_rate": 1.2262666731812186e-06, "loss": 0.1281, "num_input_tokens_seen": 8461888, "step": 12550 }, { "epoch": 0.306720738768231, "grad_norm": 16.92136001586914, "learning_rate": 1.226755264572238e-06, "loss": 0.0974, "num_input_tokens_seen": 8465536, "step": 12555 }, { "epoch": 0.30684288960007816, "grad_norm": 1.042176604270935, "learning_rate": 1.2272438559632578e-06, "loss": 0.1158, "num_input_tokens_seen": 8468608, "step": 12560 }, { "epoch": 0.30696504043192535, "grad_norm": 0.04367939010262489, "learning_rate": 1.2277324473542776e-06, "loss": 0.058, "num_input_tokens_seen": 8472192, "step": 12565 }, { "epoch": 0.3070871912637725, "grad_norm": 18.037769317626953, "learning_rate": 1.2282210387452971e-06, "loss": 0.067, "num_input_tokens_seen": 8476224, "step": 12570 }, { "epoch": 0.3072093420956197, "grad_norm": 1.1944955587387085, "learning_rate": 1.2287096301363168e-06, "loss": 0.2403, "num_input_tokens_seen": 8479680, "step": 12575 }, { "epoch": 0.3073314929274668, "grad_norm": 28.309049606323242, "learning_rate": 1.2291982215273367e-06, "loss": 0.1631, "num_input_tokens_seen": 8483392, "step": 12580 }, { "epoch": 0.307453643759314, "grad_norm": 32.31857681274414, "learning_rate": 1.2296868129183564e-06, "loss": 0.1993, "num_input_tokens_seen": 8487232, "step": 12585 }, { "epoch": 0.3075757945911612, "grad_norm": 26.554603576660156, "learning_rate": 1.2301754043093758e-06, "loss": 0.1225, "num_input_tokens_seen": 8490944, "step": 12590 }, { "epoch": 0.3076979454230083, "grad_norm": 28.783294677734375, "learning_rate": 1.2306639957003957e-06, "loss": 0.0538, "num_input_tokens_seen": 8494336, "step": 12595 }, { "epoch": 0.3078200962548555, "grad_norm": 34.531803131103516, "learning_rate": 1.2311525870914154e-06, "loss": 0.1477, "num_input_tokens_seen": 8497472, "step": 12600 }, { "epoch": 0.30794224708670265, "grad_norm": 20.531259536743164, "learning_rate": 1.2316411784824349e-06, "loss": 0.0984, "num_input_tokens_seen": 8501056, "step": 12605 }, { "epoch": 0.30806439791854984, "grad_norm": 1.207240104675293, "learning_rate": 1.2321297698734548e-06, "loss": 0.0836, "num_input_tokens_seen": 8504320, "step": 12610 }, { "epoch": 0.30818654875039697, "grad_norm": 24.03017807006836, "learning_rate": 1.2326183612644745e-06, "loss": 0.249, "num_input_tokens_seen": 8507648, "step": 12615 }, { "epoch": 0.30830869958224416, "grad_norm": 39.64318084716797, "learning_rate": 1.2331069526554941e-06, "loss": 0.1641, "num_input_tokens_seen": 8510720, "step": 12620 }, { "epoch": 0.3084308504140913, "grad_norm": 35.29051971435547, "learning_rate": 1.2335955440465138e-06, "loss": 0.1154, "num_input_tokens_seen": 8513984, "step": 12625 }, { "epoch": 0.3085530012459385, "grad_norm": 1.6317697763442993, "learning_rate": 1.2340841354375335e-06, "loss": 0.1706, "num_input_tokens_seen": 8517312, "step": 12630 }, { "epoch": 0.3086751520777857, "grad_norm": 11.825055122375488, "learning_rate": 1.2345727268285532e-06, "loss": 0.025, "num_input_tokens_seen": 8520768, "step": 12635 }, { "epoch": 0.3087973029096328, "grad_norm": 11.842734336853027, "learning_rate": 1.2350613182195729e-06, "loss": 0.1898, "num_input_tokens_seen": 8523712, "step": 12640 }, { "epoch": 0.30891945374148, "grad_norm": 66.35395050048828, "learning_rate": 1.2355499096105926e-06, "loss": 0.2702, "num_input_tokens_seen": 8526784, "step": 12645 }, { "epoch": 0.30904160457332713, "grad_norm": 21.204254150390625, "learning_rate": 1.2360385010016122e-06, "loss": 0.0935, "num_input_tokens_seen": 8530048, "step": 12650 }, { "epoch": 0.3091637554051743, "grad_norm": 41.15862274169922, "learning_rate": 1.2365270923926321e-06, "loss": 0.1544, "num_input_tokens_seen": 8533056, "step": 12655 }, { "epoch": 0.30928590623702146, "grad_norm": 34.149261474609375, "learning_rate": 1.2370156837836516e-06, "loss": 0.1554, "num_input_tokens_seen": 8536320, "step": 12660 }, { "epoch": 0.30940805706886865, "grad_norm": 7.996495723724365, "learning_rate": 1.2375042751746713e-06, "loss": 0.1172, "num_input_tokens_seen": 8539392, "step": 12665 }, { "epoch": 0.3095302079007158, "grad_norm": 27.449047088623047, "learning_rate": 1.2379928665656912e-06, "loss": 0.1463, "num_input_tokens_seen": 8543168, "step": 12670 }, { "epoch": 0.30965235873256297, "grad_norm": 32.094703674316406, "learning_rate": 1.2384814579567106e-06, "loss": 0.1172, "num_input_tokens_seen": 8546496, "step": 12675 }, { "epoch": 0.30977450956441016, "grad_norm": 10.903212547302246, "learning_rate": 1.2389700493477303e-06, "loss": 0.0659, "num_input_tokens_seen": 8550336, "step": 12680 }, { "epoch": 0.3098966603962573, "grad_norm": 3.5534703731536865, "learning_rate": 1.2394586407387502e-06, "loss": 0.1353, "num_input_tokens_seen": 8553600, "step": 12685 }, { "epoch": 0.3100188112281045, "grad_norm": 6.912077903747559, "learning_rate": 1.23994723212977e-06, "loss": 0.1039, "num_input_tokens_seen": 8557184, "step": 12690 }, { "epoch": 0.3101409620599516, "grad_norm": 32.508079528808594, "learning_rate": 1.2404358235207894e-06, "loss": 0.1146, "num_input_tokens_seen": 8561152, "step": 12695 }, { "epoch": 0.3102631128917988, "grad_norm": 22.530750274658203, "learning_rate": 1.2409244149118093e-06, "loss": 0.1317, "num_input_tokens_seen": 8564992, "step": 12700 }, { "epoch": 0.31038526372364594, "grad_norm": 23.627466201782227, "learning_rate": 1.241413006302829e-06, "loss": 0.1771, "num_input_tokens_seen": 8568448, "step": 12705 }, { "epoch": 0.31050741455549313, "grad_norm": 3.3501780033111572, "learning_rate": 1.2419015976938484e-06, "loss": 0.0726, "num_input_tokens_seen": 8572224, "step": 12710 }, { "epoch": 0.31062956538734027, "grad_norm": 20.607057571411133, "learning_rate": 1.2423901890848683e-06, "loss": 0.1894, "num_input_tokens_seen": 8576064, "step": 12715 }, { "epoch": 0.31075171621918746, "grad_norm": 23.418180465698242, "learning_rate": 1.242878780475888e-06, "loss": 0.1499, "num_input_tokens_seen": 8579840, "step": 12720 }, { "epoch": 0.3108738670510346, "grad_norm": 0.8048353791236877, "learning_rate": 1.2433673718669075e-06, "loss": 0.021, "num_input_tokens_seen": 8583488, "step": 12725 }, { "epoch": 0.3109960178828818, "grad_norm": 36.71179962158203, "learning_rate": 1.2438559632579273e-06, "loss": 0.2072, "num_input_tokens_seen": 8586688, "step": 12730 }, { "epoch": 0.31111816871472897, "grad_norm": 5.470469951629639, "learning_rate": 1.244344554648947e-06, "loss": 0.0856, "num_input_tokens_seen": 8589888, "step": 12735 }, { "epoch": 0.3112403195465761, "grad_norm": 20.87204360961914, "learning_rate": 1.2448331460399667e-06, "loss": 0.1108, "num_input_tokens_seen": 8593088, "step": 12740 }, { "epoch": 0.3113624703784233, "grad_norm": 32.569461822509766, "learning_rate": 1.2453217374309864e-06, "loss": 0.0851, "num_input_tokens_seen": 8596288, "step": 12745 }, { "epoch": 0.31148462121027043, "grad_norm": 2.8659555912017822, "learning_rate": 1.245810328822006e-06, "loss": 0.1089, "num_input_tokens_seen": 8600000, "step": 12750 }, { "epoch": 0.3116067720421176, "grad_norm": 39.90515899658203, "learning_rate": 1.2462989202130258e-06, "loss": 0.0856, "num_input_tokens_seen": 8602944, "step": 12755 }, { "epoch": 0.31172892287396475, "grad_norm": 1.3492740392684937, "learning_rate": 1.2467875116040454e-06, "loss": 0.1233, "num_input_tokens_seen": 8606336, "step": 12760 }, { "epoch": 0.31185107370581194, "grad_norm": 21.49714469909668, "learning_rate": 1.2472761029950651e-06, "loss": 0.185, "num_input_tokens_seen": 8609536, "step": 12765 }, { "epoch": 0.3119732245376591, "grad_norm": 1.6193816661834717, "learning_rate": 1.2477646943860848e-06, "loss": 0.1804, "num_input_tokens_seen": 8612672, "step": 12770 }, { "epoch": 0.31209537536950627, "grad_norm": 19.069721221923828, "learning_rate": 1.2482532857771047e-06, "loss": 0.1443, "num_input_tokens_seen": 8616128, "step": 12775 }, { "epoch": 0.31221752620135346, "grad_norm": 21.02128791809082, "learning_rate": 1.2487418771681242e-06, "loss": 0.1616, "num_input_tokens_seen": 8619584, "step": 12780 }, { "epoch": 0.3123396770332006, "grad_norm": 21.080299377441406, "learning_rate": 1.2492304685591438e-06, "loss": 0.1156, "num_input_tokens_seen": 8623040, "step": 12785 }, { "epoch": 0.3124618278650478, "grad_norm": 15.542115211486816, "learning_rate": 1.2497190599501637e-06, "loss": 0.1364, "num_input_tokens_seen": 8626432, "step": 12790 }, { "epoch": 0.3125839786968949, "grad_norm": 26.10782814025879, "learning_rate": 1.2502076513411832e-06, "loss": 0.1408, "num_input_tokens_seen": 8630400, "step": 12795 }, { "epoch": 0.3127061295287421, "grad_norm": 17.829648971557617, "learning_rate": 1.2506962427322029e-06, "loss": 0.0587, "num_input_tokens_seen": 8633536, "step": 12800 }, { "epoch": 0.31282828036058924, "grad_norm": 25.496232986450195, "learning_rate": 1.2511848341232228e-06, "loss": 0.1218, "num_input_tokens_seen": 8636992, "step": 12805 }, { "epoch": 0.31295043119243643, "grad_norm": 11.133720397949219, "learning_rate": 1.2516734255142425e-06, "loss": 0.0968, "num_input_tokens_seen": 8640448, "step": 12810 }, { "epoch": 0.31307258202428356, "grad_norm": 11.136938095092773, "learning_rate": 1.252162016905262e-06, "loss": 0.1877, "num_input_tokens_seen": 8643456, "step": 12815 }, { "epoch": 0.31319473285613075, "grad_norm": 24.134342193603516, "learning_rate": 1.2526506082962818e-06, "loss": 0.1171, "num_input_tokens_seen": 8647104, "step": 12820 }, { "epoch": 0.31331688368797794, "grad_norm": 30.42660903930664, "learning_rate": 1.2531391996873015e-06, "loss": 0.1403, "num_input_tokens_seen": 8650624, "step": 12825 }, { "epoch": 0.3134390345198251, "grad_norm": 26.292850494384766, "learning_rate": 1.253627791078321e-06, "loss": 0.0747, "num_input_tokens_seen": 8654144, "step": 12830 }, { "epoch": 0.31356118535167227, "grad_norm": 36.40958786010742, "learning_rate": 1.2541163824693409e-06, "loss": 0.0731, "num_input_tokens_seen": 8657792, "step": 12835 }, { "epoch": 0.3136833361835194, "grad_norm": 55.101200103759766, "learning_rate": 1.2546049738603605e-06, "loss": 0.1482, "num_input_tokens_seen": 8661056, "step": 12840 }, { "epoch": 0.3138054870153666, "grad_norm": 3.0964555740356445, "learning_rate": 1.25509356525138e-06, "loss": 0.0934, "num_input_tokens_seen": 8664448, "step": 12845 }, { "epoch": 0.3139276378472137, "grad_norm": 33.852081298828125, "learning_rate": 1.2555821566424e-06, "loss": 0.2028, "num_input_tokens_seen": 8667840, "step": 12850 }, { "epoch": 0.3140497886790609, "grad_norm": 3.9148716926574707, "learning_rate": 1.2560707480334196e-06, "loss": 0.1502, "num_input_tokens_seen": 8671744, "step": 12855 }, { "epoch": 0.31417193951090805, "grad_norm": 0.09759137779474258, "learning_rate": 1.2565593394244393e-06, "loss": 0.0424, "num_input_tokens_seen": 8675200, "step": 12860 }, { "epoch": 0.31429409034275524, "grad_norm": 7.122541904449463, "learning_rate": 1.257047930815459e-06, "loss": 0.0822, "num_input_tokens_seen": 8678464, "step": 12865 }, { "epoch": 0.3144162411746024, "grad_norm": 0.599284827709198, "learning_rate": 1.2575365222064786e-06, "loss": 0.128, "num_input_tokens_seen": 8681600, "step": 12870 }, { "epoch": 0.31453839200644956, "grad_norm": 19.219327926635742, "learning_rate": 1.2580251135974983e-06, "loss": 0.1034, "num_input_tokens_seen": 8685120, "step": 12875 }, { "epoch": 0.31466054283829675, "grad_norm": 26.379104614257812, "learning_rate": 1.258513704988518e-06, "loss": 0.2057, "num_input_tokens_seen": 8688448, "step": 12880 }, { "epoch": 0.3147826936701439, "grad_norm": 34.90458297729492, "learning_rate": 1.2590022963795377e-06, "loss": 0.1319, "num_input_tokens_seen": 8691584, "step": 12885 }, { "epoch": 0.3149048445019911, "grad_norm": 3.304243564605713, "learning_rate": 1.2594908877705574e-06, "loss": 0.0822, "num_input_tokens_seen": 8695104, "step": 12890 }, { "epoch": 0.3150269953338382, "grad_norm": 0.8440029621124268, "learning_rate": 1.2599794791615773e-06, "loss": 0.1366, "num_input_tokens_seen": 8698944, "step": 12895 }, { "epoch": 0.3151491461656854, "grad_norm": 10.932198524475098, "learning_rate": 1.2604680705525967e-06, "loss": 0.1846, "num_input_tokens_seen": 8702784, "step": 12900 }, { "epoch": 0.31527129699753254, "grad_norm": 11.124720573425293, "learning_rate": 1.2609566619436164e-06, "loss": 0.0956, "num_input_tokens_seen": 8706560, "step": 12905 }, { "epoch": 0.3153934478293797, "grad_norm": 25.000829696655273, "learning_rate": 1.2614452533346363e-06, "loss": 0.1179, "num_input_tokens_seen": 8709632, "step": 12910 }, { "epoch": 0.31551559866122686, "grad_norm": 31.354000091552734, "learning_rate": 1.2619338447256558e-06, "loss": 0.1559, "num_input_tokens_seen": 8712960, "step": 12915 }, { "epoch": 0.31563774949307405, "grad_norm": 3.164998769760132, "learning_rate": 1.2624224361166754e-06, "loss": 0.1198, "num_input_tokens_seen": 8716928, "step": 12920 }, { "epoch": 0.31575990032492124, "grad_norm": 8.413860321044922, "learning_rate": 1.2629110275076953e-06, "loss": 0.1578, "num_input_tokens_seen": 8720320, "step": 12925 }, { "epoch": 0.3158820511567684, "grad_norm": 2.5737593173980713, "learning_rate": 1.263399618898715e-06, "loss": 0.0998, "num_input_tokens_seen": 8723776, "step": 12930 }, { "epoch": 0.31600420198861556, "grad_norm": 56.24729537963867, "learning_rate": 1.2638882102897345e-06, "loss": 0.1375, "num_input_tokens_seen": 8727424, "step": 12935 }, { "epoch": 0.3161263528204627, "grad_norm": 22.61892318725586, "learning_rate": 1.2643768016807544e-06, "loss": 0.1309, "num_input_tokens_seen": 8730432, "step": 12940 }, { "epoch": 0.3162485036523099, "grad_norm": 31.44400405883789, "learning_rate": 1.264865393071774e-06, "loss": 0.1737, "num_input_tokens_seen": 8733952, "step": 12945 }, { "epoch": 0.316370654484157, "grad_norm": 16.67905616760254, "learning_rate": 1.2653539844627935e-06, "loss": 0.0793, "num_input_tokens_seen": 8736960, "step": 12950 }, { "epoch": 0.3164928053160042, "grad_norm": 10.576042175292969, "learning_rate": 1.2658425758538134e-06, "loss": 0.075, "num_input_tokens_seen": 8740160, "step": 12955 }, { "epoch": 0.31661495614785135, "grad_norm": 9.4652681350708, "learning_rate": 1.2663311672448331e-06, "loss": 0.1446, "num_input_tokens_seen": 8743744, "step": 12960 }, { "epoch": 0.31673710697969854, "grad_norm": 10.283336639404297, "learning_rate": 1.2668197586358528e-06, "loss": 0.0679, "num_input_tokens_seen": 8746880, "step": 12965 }, { "epoch": 0.31685925781154567, "grad_norm": 7.207754611968994, "learning_rate": 1.2673083500268725e-06, "loss": 0.117, "num_input_tokens_seen": 8750272, "step": 12970 }, { "epoch": 0.31698140864339286, "grad_norm": 1.7486845254898071, "learning_rate": 1.2677969414178922e-06, "loss": 0.1294, "num_input_tokens_seen": 8753728, "step": 12975 }, { "epoch": 0.31710355947524005, "grad_norm": 7.140497207641602, "learning_rate": 1.2682855328089118e-06, "loss": 0.1858, "num_input_tokens_seen": 8757248, "step": 12980 }, { "epoch": 0.3172257103070872, "grad_norm": 15.590055465698242, "learning_rate": 1.2687741241999315e-06, "loss": 0.1265, "num_input_tokens_seen": 8760512, "step": 12985 }, { "epoch": 0.3173478611389344, "grad_norm": 12.499231338500977, "learning_rate": 1.2692627155909512e-06, "loss": 0.1477, "num_input_tokens_seen": 8763584, "step": 12990 }, { "epoch": 0.3174700119707815, "grad_norm": 1.9817485809326172, "learning_rate": 1.2697513069819709e-06, "loss": 0.045, "num_input_tokens_seen": 8766912, "step": 12995 }, { "epoch": 0.3175921628026287, "grad_norm": 34.18552017211914, "learning_rate": 1.2702398983729906e-06, "loss": 0.0796, "num_input_tokens_seen": 8770624, "step": 13000 }, { "epoch": 0.31771431363447583, "grad_norm": 5.509322643280029, "learning_rate": 1.2707284897640102e-06, "loss": 0.067, "num_input_tokens_seen": 8773696, "step": 13005 }, { "epoch": 0.317836464466323, "grad_norm": 11.841221809387207, "learning_rate": 1.27121708115503e-06, "loss": 0.254, "num_input_tokens_seen": 8777280, "step": 13010 }, { "epoch": 0.31795861529817016, "grad_norm": 20.776254653930664, "learning_rate": 1.2717056725460498e-06, "loss": 0.1108, "num_input_tokens_seen": 8780224, "step": 13015 }, { "epoch": 0.31808076613001735, "grad_norm": 0.46001580357551575, "learning_rate": 1.2721942639370693e-06, "loss": 0.138, "num_input_tokens_seen": 8783552, "step": 13020 }, { "epoch": 0.31820291696186453, "grad_norm": 5.034506797790527, "learning_rate": 1.272682855328089e-06, "loss": 0.1391, "num_input_tokens_seen": 8786816, "step": 13025 }, { "epoch": 0.31832506779371167, "grad_norm": 8.84393310546875, "learning_rate": 1.2731714467191089e-06, "loss": 0.0396, "num_input_tokens_seen": 8790016, "step": 13030 }, { "epoch": 0.31844721862555886, "grad_norm": 23.799724578857422, "learning_rate": 1.2736600381101283e-06, "loss": 0.1898, "num_input_tokens_seen": 8793216, "step": 13035 }, { "epoch": 0.318569369457406, "grad_norm": 33.30390930175781, "learning_rate": 1.274148629501148e-06, "loss": 0.0986, "num_input_tokens_seen": 8796352, "step": 13040 }, { "epoch": 0.3186915202892532, "grad_norm": 19.92257308959961, "learning_rate": 1.274637220892168e-06, "loss": 0.0983, "num_input_tokens_seen": 8799360, "step": 13045 }, { "epoch": 0.3188136711211003, "grad_norm": 0.6676555871963501, "learning_rate": 1.2751258122831876e-06, "loss": 0.0892, "num_input_tokens_seen": 8802304, "step": 13050 }, { "epoch": 0.3189358219529475, "grad_norm": 25.848312377929688, "learning_rate": 1.275614403674207e-06, "loss": 0.1344, "num_input_tokens_seen": 8805312, "step": 13055 }, { "epoch": 0.31905797278479464, "grad_norm": 13.977712631225586, "learning_rate": 1.276102995065227e-06, "loss": 0.1088, "num_input_tokens_seen": 8808256, "step": 13060 }, { "epoch": 0.31918012361664183, "grad_norm": 7.164159774780273, "learning_rate": 1.2765915864562466e-06, "loss": 0.0799, "num_input_tokens_seen": 8811648, "step": 13065 }, { "epoch": 0.319302274448489, "grad_norm": 34.81647491455078, "learning_rate": 1.277080177847266e-06, "loss": 0.079, "num_input_tokens_seen": 8814784, "step": 13070 }, { "epoch": 0.31942442528033616, "grad_norm": 38.616493225097656, "learning_rate": 1.277568769238286e-06, "loss": 0.0918, "num_input_tokens_seen": 8817920, "step": 13075 }, { "epoch": 0.31954657611218334, "grad_norm": 13.898828506469727, "learning_rate": 1.2780573606293057e-06, "loss": 0.0483, "num_input_tokens_seen": 8821248, "step": 13080 }, { "epoch": 0.3196687269440305, "grad_norm": 35.597225189208984, "learning_rate": 1.2785459520203254e-06, "loss": 0.0595, "num_input_tokens_seen": 8824704, "step": 13085 }, { "epoch": 0.31979087777587767, "grad_norm": 39.44450759887695, "learning_rate": 1.279034543411345e-06, "loss": 0.1064, "num_input_tokens_seen": 8828032, "step": 13090 }, { "epoch": 0.3199130286077248, "grad_norm": 3.0465939044952393, "learning_rate": 1.2795231348023647e-06, "loss": 0.1114, "num_input_tokens_seen": 8831040, "step": 13095 }, { "epoch": 0.320035179439572, "grad_norm": 10.89338207244873, "learning_rate": 1.2800117261933844e-06, "loss": 0.3178, "num_input_tokens_seen": 8834240, "step": 13100 }, { "epoch": 0.3201573302714191, "grad_norm": 40.36931610107422, "learning_rate": 1.280500317584404e-06, "loss": 0.1605, "num_input_tokens_seen": 8837696, "step": 13105 }, { "epoch": 0.3202794811032663, "grad_norm": 32.20518493652344, "learning_rate": 1.2809889089754238e-06, "loss": 0.1192, "num_input_tokens_seen": 8840960, "step": 13110 }, { "epoch": 0.32040163193511345, "grad_norm": 17.135501861572266, "learning_rate": 1.2814775003664434e-06, "loss": 0.0765, "num_input_tokens_seen": 8844032, "step": 13115 }, { "epoch": 0.32052378276696064, "grad_norm": 3.2122209072113037, "learning_rate": 1.2819660917574633e-06, "loss": 0.0768, "num_input_tokens_seen": 8847424, "step": 13120 }, { "epoch": 0.32064593359880783, "grad_norm": 41.46558380126953, "learning_rate": 1.2824546831484828e-06, "loss": 0.142, "num_input_tokens_seen": 8850560, "step": 13125 }, { "epoch": 0.32076808443065497, "grad_norm": 20.672643661499023, "learning_rate": 1.2829432745395025e-06, "loss": 0.0831, "num_input_tokens_seen": 8854016, "step": 13130 }, { "epoch": 0.32089023526250215, "grad_norm": 0.6543686389923096, "learning_rate": 1.2834318659305224e-06, "loss": 0.1633, "num_input_tokens_seen": 8857280, "step": 13135 }, { "epoch": 0.3210123860943493, "grad_norm": 23.34600067138672, "learning_rate": 1.2839204573215418e-06, "loss": 0.0974, "num_input_tokens_seen": 8860672, "step": 13140 }, { "epoch": 0.3211345369261965, "grad_norm": 24.48014259338379, "learning_rate": 1.2844090487125615e-06, "loss": 0.0883, "num_input_tokens_seen": 8863808, "step": 13145 }, { "epoch": 0.3212566877580436, "grad_norm": 37.018714904785156, "learning_rate": 1.2848976401035814e-06, "loss": 0.2072, "num_input_tokens_seen": 8867392, "step": 13150 }, { "epoch": 0.3213788385898908, "grad_norm": 28.14137077331543, "learning_rate": 1.2853862314946009e-06, "loss": 0.1008, "num_input_tokens_seen": 8870592, "step": 13155 }, { "epoch": 0.32150098942173794, "grad_norm": 19.779029846191406, "learning_rate": 1.2858748228856206e-06, "loss": 0.087, "num_input_tokens_seen": 8873856, "step": 13160 }, { "epoch": 0.3216231402535851, "grad_norm": 0.5094308257102966, "learning_rate": 1.2863634142766405e-06, "loss": 0.1631, "num_input_tokens_seen": 8877056, "step": 13165 }, { "epoch": 0.3217452910854323, "grad_norm": 21.85708236694336, "learning_rate": 1.2868520056676601e-06, "loss": 0.1666, "num_input_tokens_seen": 8880320, "step": 13170 }, { "epoch": 0.32186744191727945, "grad_norm": 11.574304580688477, "learning_rate": 1.2873405970586796e-06, "loss": 0.1232, "num_input_tokens_seen": 8883520, "step": 13175 }, { "epoch": 0.32198959274912664, "grad_norm": 0.3177202045917511, "learning_rate": 1.2878291884496995e-06, "loss": 0.0721, "num_input_tokens_seen": 8886464, "step": 13180 }, { "epoch": 0.3221117435809738, "grad_norm": 0.4103322923183441, "learning_rate": 1.2883177798407192e-06, "loss": 0.1252, "num_input_tokens_seen": 8889664, "step": 13185 }, { "epoch": 0.32223389441282096, "grad_norm": 40.97935485839844, "learning_rate": 1.2888063712317387e-06, "loss": 0.0993, "num_input_tokens_seen": 8892736, "step": 13190 }, { "epoch": 0.3223560452446681, "grad_norm": 24.75423240661621, "learning_rate": 1.2892949626227586e-06, "loss": 0.1207, "num_input_tokens_seen": 8895936, "step": 13195 }, { "epoch": 0.3224781960765153, "grad_norm": 31.197431564331055, "learning_rate": 1.2897835540137782e-06, "loss": 0.1758, "num_input_tokens_seen": 8899264, "step": 13200 }, { "epoch": 0.3226003469083624, "grad_norm": 6.995538234710693, "learning_rate": 1.290272145404798e-06, "loss": 0.1376, "num_input_tokens_seen": 8902528, "step": 13205 }, { "epoch": 0.3227224977402096, "grad_norm": 4.579649448394775, "learning_rate": 1.2907607367958176e-06, "loss": 0.0735, "num_input_tokens_seen": 8905792, "step": 13210 }, { "epoch": 0.3228446485720568, "grad_norm": 26.562294006347656, "learning_rate": 1.2912493281868373e-06, "loss": 0.1756, "num_input_tokens_seen": 8908928, "step": 13215 }, { "epoch": 0.32296679940390394, "grad_norm": 46.12147903442383, "learning_rate": 1.291737919577857e-06, "loss": 0.2033, "num_input_tokens_seen": 8912128, "step": 13220 }, { "epoch": 0.3230889502357511, "grad_norm": 7.420219898223877, "learning_rate": 1.2922265109688766e-06, "loss": 0.0812, "num_input_tokens_seen": 8915136, "step": 13225 }, { "epoch": 0.32321110106759826, "grad_norm": 29.414127349853516, "learning_rate": 1.2927151023598963e-06, "loss": 0.1386, "num_input_tokens_seen": 8918400, "step": 13230 }, { "epoch": 0.32333325189944545, "grad_norm": 38.283241271972656, "learning_rate": 1.293203693750916e-06, "loss": 0.1314, "num_input_tokens_seen": 8921280, "step": 13235 }, { "epoch": 0.3234554027312926, "grad_norm": 46.68159103393555, "learning_rate": 1.293692285141936e-06, "loss": 0.1797, "num_input_tokens_seen": 8924288, "step": 13240 }, { "epoch": 0.3235775535631398, "grad_norm": 12.005806922912598, "learning_rate": 1.2941808765329554e-06, "loss": 0.1386, "num_input_tokens_seen": 8928192, "step": 13245 }, { "epoch": 0.3236997043949869, "grad_norm": 28.0427188873291, "learning_rate": 1.294669467923975e-06, "loss": 0.068, "num_input_tokens_seen": 8931968, "step": 13250 }, { "epoch": 0.3238218552268341, "grad_norm": 33.40077209472656, "learning_rate": 1.295158059314995e-06, "loss": 0.2441, "num_input_tokens_seen": 8935168, "step": 13255 }, { "epoch": 0.32394400605868123, "grad_norm": 29.742095947265625, "learning_rate": 1.2956466507060144e-06, "loss": 0.1352, "num_input_tokens_seen": 8938240, "step": 13260 }, { "epoch": 0.3240661568905284, "grad_norm": 23.64820098876953, "learning_rate": 1.296135242097034e-06, "loss": 0.1099, "num_input_tokens_seen": 8941696, "step": 13265 }, { "epoch": 0.3241883077223756, "grad_norm": 17.8576717376709, "learning_rate": 1.296623833488054e-06, "loss": 0.0451, "num_input_tokens_seen": 8945408, "step": 13270 }, { "epoch": 0.32431045855422275, "grad_norm": 11.507369995117188, "learning_rate": 1.2971124248790735e-06, "loss": 0.1093, "num_input_tokens_seen": 8948288, "step": 13275 }, { "epoch": 0.32443260938606994, "grad_norm": 7.504124164581299, "learning_rate": 1.2976010162700931e-06, "loss": 0.0524, "num_input_tokens_seen": 8951808, "step": 13280 }, { "epoch": 0.32455476021791707, "grad_norm": 17.95069122314453, "learning_rate": 1.298089607661113e-06, "loss": 0.0399, "num_input_tokens_seen": 8955456, "step": 13285 }, { "epoch": 0.32467691104976426, "grad_norm": 31.09222984313965, "learning_rate": 1.2985781990521327e-06, "loss": 0.1968, "num_input_tokens_seen": 8958720, "step": 13290 }, { "epoch": 0.3247990618816114, "grad_norm": 13.580461502075195, "learning_rate": 1.2990667904431522e-06, "loss": 0.1475, "num_input_tokens_seen": 8961984, "step": 13295 }, { "epoch": 0.3249212127134586, "grad_norm": 36.72895050048828, "learning_rate": 1.299555381834172e-06, "loss": 0.1347, "num_input_tokens_seen": 8965888, "step": 13300 }, { "epoch": 0.3250433635453057, "grad_norm": 29.59313201904297, "learning_rate": 1.3000439732251918e-06, "loss": 0.2356, "num_input_tokens_seen": 8969152, "step": 13305 }, { "epoch": 0.3251655143771529, "grad_norm": 28.670122146606445, "learning_rate": 1.3005325646162112e-06, "loss": 0.2769, "num_input_tokens_seen": 8972224, "step": 13310 }, { "epoch": 0.3252876652090001, "grad_norm": 9.596991539001465, "learning_rate": 1.3010211560072311e-06, "loss": 0.1549, "num_input_tokens_seen": 8975360, "step": 13315 }, { "epoch": 0.32540981604084723, "grad_norm": 4.092257976531982, "learning_rate": 1.3015097473982508e-06, "loss": 0.0598, "num_input_tokens_seen": 8978560, "step": 13320 }, { "epoch": 0.3255319668726944, "grad_norm": 22.807069778442383, "learning_rate": 1.3019983387892705e-06, "loss": 0.1031, "num_input_tokens_seen": 8981504, "step": 13325 }, { "epoch": 0.32565411770454156, "grad_norm": 14.356558799743652, "learning_rate": 1.3024869301802902e-06, "loss": 0.1894, "num_input_tokens_seen": 8984640, "step": 13330 }, { "epoch": 0.32577626853638875, "grad_norm": 3.0045337677001953, "learning_rate": 1.3029755215713098e-06, "loss": 0.0675, "num_input_tokens_seen": 8987648, "step": 13335 }, { "epoch": 0.3258984193682359, "grad_norm": 1.4146621227264404, "learning_rate": 1.3034641129623295e-06, "loss": 0.0484, "num_input_tokens_seen": 8991232, "step": 13340 }, { "epoch": 0.32602057020008307, "grad_norm": 8.540959358215332, "learning_rate": 1.3039527043533492e-06, "loss": 0.0963, "num_input_tokens_seen": 8994432, "step": 13345 }, { "epoch": 0.3261427210319302, "grad_norm": 49.685516357421875, "learning_rate": 1.3044412957443689e-06, "loss": 0.173, "num_input_tokens_seen": 8998080, "step": 13350 }, { "epoch": 0.3262648718637774, "grad_norm": 42.36631774902344, "learning_rate": 1.3049298871353886e-06, "loss": 0.1508, "num_input_tokens_seen": 9001344, "step": 13355 }, { "epoch": 0.3263870226956246, "grad_norm": 12.289406776428223, "learning_rate": 1.3054184785264085e-06, "loss": 0.1075, "num_input_tokens_seen": 9004928, "step": 13360 }, { "epoch": 0.3265091735274717, "grad_norm": 55.62197494506836, "learning_rate": 1.305907069917428e-06, "loss": 0.1445, "num_input_tokens_seen": 9007936, "step": 13365 }, { "epoch": 0.3266313243593189, "grad_norm": 25.463848114013672, "learning_rate": 1.3063956613084476e-06, "loss": 0.0846, "num_input_tokens_seen": 9011264, "step": 13370 }, { "epoch": 0.32675347519116604, "grad_norm": 44.88855743408203, "learning_rate": 1.3068842526994675e-06, "loss": 0.1817, "num_input_tokens_seen": 9014528, "step": 13375 }, { "epoch": 0.32687562602301323, "grad_norm": 25.74319839477539, "learning_rate": 1.307372844090487e-06, "loss": 0.1482, "num_input_tokens_seen": 9017536, "step": 13380 }, { "epoch": 0.32699777685486037, "grad_norm": 25.14618682861328, "learning_rate": 1.3078614354815067e-06, "loss": 0.1093, "num_input_tokens_seen": 9020480, "step": 13385 }, { "epoch": 0.32711992768670756, "grad_norm": 14.147246360778809, "learning_rate": 1.3083500268725265e-06, "loss": 0.0675, "num_input_tokens_seen": 9023808, "step": 13390 }, { "epoch": 0.3272420785185547, "grad_norm": 3.24316668510437, "learning_rate": 1.3088386182635462e-06, "loss": 0.2227, "num_input_tokens_seen": 9027200, "step": 13395 }, { "epoch": 0.3273642293504019, "grad_norm": 36.026851654052734, "learning_rate": 1.3093272096545657e-06, "loss": 0.132, "num_input_tokens_seen": 9030400, "step": 13400 }, { "epoch": 0.327486380182249, "grad_norm": 0.8746715188026428, "learning_rate": 1.3098158010455856e-06, "loss": 0.0713, "num_input_tokens_seen": 9033920, "step": 13405 }, { "epoch": 0.3276085310140962, "grad_norm": 11.483881950378418, "learning_rate": 1.3103043924366053e-06, "loss": 0.1003, "num_input_tokens_seen": 9037504, "step": 13410 }, { "epoch": 0.3277306818459434, "grad_norm": 31.39374542236328, "learning_rate": 1.3107929838276247e-06, "loss": 0.1388, "num_input_tokens_seen": 9040384, "step": 13415 }, { "epoch": 0.32785283267779053, "grad_norm": 0.4166433811187744, "learning_rate": 1.3112815752186446e-06, "loss": 0.0525, "num_input_tokens_seen": 9043712, "step": 13420 }, { "epoch": 0.3279749835096377, "grad_norm": 24.85698890686035, "learning_rate": 1.3117701666096643e-06, "loss": 0.2262, "num_input_tokens_seen": 9046336, "step": 13425 }, { "epoch": 0.32809713434148485, "grad_norm": 1.9511191844940186, "learning_rate": 1.3122587580006838e-06, "loss": 0.1109, "num_input_tokens_seen": 9049536, "step": 13430 }, { "epoch": 0.32821928517333204, "grad_norm": 2.2473626136779785, "learning_rate": 1.3127473493917037e-06, "loss": 0.0949, "num_input_tokens_seen": 9052928, "step": 13435 }, { "epoch": 0.3283414360051792, "grad_norm": 17.47495460510254, "learning_rate": 1.3132359407827234e-06, "loss": 0.1478, "num_input_tokens_seen": 9056384, "step": 13440 }, { "epoch": 0.32846358683702637, "grad_norm": 8.121326446533203, "learning_rate": 1.313724532173743e-06, "loss": 0.0471, "num_input_tokens_seen": 9059840, "step": 13445 }, { "epoch": 0.3285857376688735, "grad_norm": 13.05697250366211, "learning_rate": 1.3142131235647627e-06, "loss": 0.1651, "num_input_tokens_seen": 9063680, "step": 13450 }, { "epoch": 0.3287078885007207, "grad_norm": 16.482269287109375, "learning_rate": 1.3147017149557824e-06, "loss": 0.0945, "num_input_tokens_seen": 9067520, "step": 13455 }, { "epoch": 0.3288300393325679, "grad_norm": 0.4536917805671692, "learning_rate": 1.315190306346802e-06, "loss": 0.072, "num_input_tokens_seen": 9070656, "step": 13460 }, { "epoch": 0.328952190164415, "grad_norm": 27.94501304626465, "learning_rate": 1.3156788977378218e-06, "loss": 0.1405, "num_input_tokens_seen": 9074240, "step": 13465 }, { "epoch": 0.3290743409962622, "grad_norm": 25.520456314086914, "learning_rate": 1.3161674891288414e-06, "loss": 0.0691, "num_input_tokens_seen": 9077568, "step": 13470 }, { "epoch": 0.32919649182810934, "grad_norm": 0.49751996994018555, "learning_rate": 1.3166560805198611e-06, "loss": 0.0812, "num_input_tokens_seen": 9081024, "step": 13475 }, { "epoch": 0.32931864265995653, "grad_norm": 18.828784942626953, "learning_rate": 1.317144671910881e-06, "loss": 0.0681, "num_input_tokens_seen": 9084288, "step": 13480 }, { "epoch": 0.32944079349180366, "grad_norm": 3.951866388320923, "learning_rate": 1.3176332633019005e-06, "loss": 0.0192, "num_input_tokens_seen": 9087424, "step": 13485 }, { "epoch": 0.32956294432365085, "grad_norm": 2.443051338195801, "learning_rate": 1.3181218546929202e-06, "loss": 0.1937, "num_input_tokens_seen": 9090688, "step": 13490 }, { "epoch": 0.329685095155498, "grad_norm": 48.771732330322266, "learning_rate": 1.31861044608394e-06, "loss": 0.1272, "num_input_tokens_seen": 9093824, "step": 13495 }, { "epoch": 0.3298072459873452, "grad_norm": 39.788150787353516, "learning_rate": 1.3190990374749595e-06, "loss": 0.1441, "num_input_tokens_seen": 9097088, "step": 13500 }, { "epoch": 0.3299293968191923, "grad_norm": 22.174091339111328, "learning_rate": 1.3195876288659792e-06, "loss": 0.163, "num_input_tokens_seen": 9100736, "step": 13505 }, { "epoch": 0.3300515476510395, "grad_norm": 29.268009185791016, "learning_rate": 1.3200762202569991e-06, "loss": 0.1131, "num_input_tokens_seen": 9104192, "step": 13510 }, { "epoch": 0.3301736984828867, "grad_norm": 0.8035185933113098, "learning_rate": 1.3205648116480188e-06, "loss": 0.0136, "num_input_tokens_seen": 9107584, "step": 13515 }, { "epoch": 0.3302958493147338, "grad_norm": 11.216259002685547, "learning_rate": 1.3210534030390383e-06, "loss": 0.0706, "num_input_tokens_seen": 9110784, "step": 13520 }, { "epoch": 0.330418000146581, "grad_norm": 1.113582968711853, "learning_rate": 1.3215419944300582e-06, "loss": 0.1026, "num_input_tokens_seen": 9114240, "step": 13525 }, { "epoch": 0.33054015097842815, "grad_norm": 15.959999084472656, "learning_rate": 1.3220305858210778e-06, "loss": 0.1298, "num_input_tokens_seen": 9117568, "step": 13530 }, { "epoch": 0.33066230181027534, "grad_norm": 18.487342834472656, "learning_rate": 1.3225191772120973e-06, "loss": 0.0658, "num_input_tokens_seen": 9121152, "step": 13535 }, { "epoch": 0.3307844526421225, "grad_norm": 34.5030632019043, "learning_rate": 1.3230077686031172e-06, "loss": 0.1518, "num_input_tokens_seen": 9124352, "step": 13540 }, { "epoch": 0.33090660347396966, "grad_norm": 80.67737579345703, "learning_rate": 1.3234963599941369e-06, "loss": 0.1158, "num_input_tokens_seen": 9127488, "step": 13545 }, { "epoch": 0.3310287543058168, "grad_norm": 35.42460250854492, "learning_rate": 1.3239849513851564e-06, "loss": 0.0693, "num_input_tokens_seen": 9131584, "step": 13550 }, { "epoch": 0.331150905137664, "grad_norm": 12.657451629638672, "learning_rate": 1.3244735427761762e-06, "loss": 0.1466, "num_input_tokens_seen": 9135104, "step": 13555 }, { "epoch": 0.3312730559695112, "grad_norm": 36.274234771728516, "learning_rate": 1.324962134167196e-06, "loss": 0.2083, "num_input_tokens_seen": 9138880, "step": 13560 }, { "epoch": 0.3313952068013583, "grad_norm": 69.3318862915039, "learning_rate": 1.3254507255582156e-06, "loss": 0.087, "num_input_tokens_seen": 9141952, "step": 13565 }, { "epoch": 0.3315173576332055, "grad_norm": 37.832054138183594, "learning_rate": 1.3259393169492353e-06, "loss": 0.2453, "num_input_tokens_seen": 9145024, "step": 13570 }, { "epoch": 0.33163950846505263, "grad_norm": 0.09780533611774445, "learning_rate": 1.326427908340255e-06, "loss": 0.2447, "num_input_tokens_seen": 9148096, "step": 13575 }, { "epoch": 0.3317616592968998, "grad_norm": 0.24856971204280853, "learning_rate": 1.3269164997312747e-06, "loss": 0.1698, "num_input_tokens_seen": 9151296, "step": 13580 }, { "epoch": 0.33188381012874696, "grad_norm": 18.92795181274414, "learning_rate": 1.3274050911222943e-06, "loss": 0.1941, "num_input_tokens_seen": 9154560, "step": 13585 }, { "epoch": 0.33200596096059415, "grad_norm": 43.682830810546875, "learning_rate": 1.327893682513314e-06, "loss": 0.1002, "num_input_tokens_seen": 9157888, "step": 13590 }, { "epoch": 0.3321281117924413, "grad_norm": 1.1924934387207031, "learning_rate": 1.3283822739043337e-06, "loss": 0.1172, "num_input_tokens_seen": 9161472, "step": 13595 }, { "epoch": 0.3322502626242885, "grad_norm": 0.16776405274868011, "learning_rate": 1.3288708652953536e-06, "loss": 0.0366, "num_input_tokens_seen": 9164864, "step": 13600 }, { "epoch": 0.33237241345613566, "grad_norm": 16.99744415283203, "learning_rate": 1.329359456686373e-06, "loss": 0.073, "num_input_tokens_seen": 9168576, "step": 13605 }, { "epoch": 0.3324945642879828, "grad_norm": 0.9078207015991211, "learning_rate": 1.3298480480773927e-06, "loss": 0.2594, "num_input_tokens_seen": 9171648, "step": 13610 }, { "epoch": 0.33261671511983, "grad_norm": 45.5037841796875, "learning_rate": 1.3303366394684126e-06, "loss": 0.2359, "num_input_tokens_seen": 9174848, "step": 13615 }, { "epoch": 0.3327388659516771, "grad_norm": 43.5709342956543, "learning_rate": 1.330825230859432e-06, "loss": 0.187, "num_input_tokens_seen": 9177984, "step": 13620 }, { "epoch": 0.3328610167835243, "grad_norm": 19.13081932067871, "learning_rate": 1.3313138222504518e-06, "loss": 0.1991, "num_input_tokens_seen": 9181440, "step": 13625 }, { "epoch": 0.33298316761537144, "grad_norm": 59.84111404418945, "learning_rate": 1.3318024136414717e-06, "loss": 0.1258, "num_input_tokens_seen": 9184704, "step": 13630 }, { "epoch": 0.33310531844721863, "grad_norm": 25.472261428833008, "learning_rate": 1.3322910050324914e-06, "loss": 0.1824, "num_input_tokens_seen": 9187904, "step": 13635 }, { "epoch": 0.33322746927906577, "grad_norm": 10.703448295593262, "learning_rate": 1.3327795964235108e-06, "loss": 0.1674, "num_input_tokens_seen": 9191168, "step": 13640 }, { "epoch": 0.33334962011091296, "grad_norm": 15.315190315246582, "learning_rate": 1.3332681878145307e-06, "loss": 0.164, "num_input_tokens_seen": 9194752, "step": 13645 }, { "epoch": 0.3334717709427601, "grad_norm": 15.319101333618164, "learning_rate": 1.3337567792055504e-06, "loss": 0.1096, "num_input_tokens_seen": 9198528, "step": 13650 }, { "epoch": 0.3335939217746073, "grad_norm": 13.643003463745117, "learning_rate": 1.3342453705965699e-06, "loss": 0.0589, "num_input_tokens_seen": 9201728, "step": 13655 }, { "epoch": 0.3337160726064545, "grad_norm": 15.755820274353027, "learning_rate": 1.3347339619875898e-06, "loss": 0.1301, "num_input_tokens_seen": 9204800, "step": 13660 }, { "epoch": 0.3338382234383016, "grad_norm": 20.200284957885742, "learning_rate": 1.3352225533786094e-06, "loss": 0.129, "num_input_tokens_seen": 9208192, "step": 13665 }, { "epoch": 0.3339603742701488, "grad_norm": 21.0355281829834, "learning_rate": 1.3357111447696291e-06, "loss": 0.1446, "num_input_tokens_seen": 9211328, "step": 13670 }, { "epoch": 0.33408252510199593, "grad_norm": 10.959577560424805, "learning_rate": 1.3361997361606488e-06, "loss": 0.1586, "num_input_tokens_seen": 9214720, "step": 13675 }, { "epoch": 0.3342046759338431, "grad_norm": 45.59690475463867, "learning_rate": 1.3366883275516685e-06, "loss": 0.1188, "num_input_tokens_seen": 9217856, "step": 13680 }, { "epoch": 0.33432682676569025, "grad_norm": 16.36484146118164, "learning_rate": 1.3371769189426882e-06, "loss": 0.0507, "num_input_tokens_seen": 9221312, "step": 13685 }, { "epoch": 0.33444897759753744, "grad_norm": 0.2937169075012207, "learning_rate": 1.3376655103337079e-06, "loss": 0.0902, "num_input_tokens_seen": 9224896, "step": 13690 }, { "epoch": 0.3345711284293846, "grad_norm": 13.3306884765625, "learning_rate": 1.3381541017247275e-06, "loss": 0.0699, "num_input_tokens_seen": 9227904, "step": 13695 }, { "epoch": 0.33469327926123177, "grad_norm": 1.268502950668335, "learning_rate": 1.3386426931157472e-06, "loss": 0.054, "num_input_tokens_seen": 9231360, "step": 13700 }, { "epoch": 0.33481543009307896, "grad_norm": 40.71323776245117, "learning_rate": 1.339131284506767e-06, "loss": 0.1826, "num_input_tokens_seen": 9235072, "step": 13705 }, { "epoch": 0.3349375809249261, "grad_norm": 0.662493109703064, "learning_rate": 1.3396198758977866e-06, "loss": 0.1498, "num_input_tokens_seen": 9238464, "step": 13710 }, { "epoch": 0.3350597317567733, "grad_norm": 8.303804397583008, "learning_rate": 1.3401084672888063e-06, "loss": 0.1819, "num_input_tokens_seen": 9241920, "step": 13715 }, { "epoch": 0.3351818825886204, "grad_norm": 8.60287094116211, "learning_rate": 1.3405970586798262e-06, "loss": 0.1203, "num_input_tokens_seen": 9245056, "step": 13720 }, { "epoch": 0.3353040334204676, "grad_norm": 13.480615615844727, "learning_rate": 1.3410856500708456e-06, "loss": 0.1672, "num_input_tokens_seen": 9248832, "step": 13725 }, { "epoch": 0.33542618425231474, "grad_norm": 8.956427574157715, "learning_rate": 1.3415742414618653e-06, "loss": 0.201, "num_input_tokens_seen": 9251968, "step": 13730 }, { "epoch": 0.33554833508416193, "grad_norm": 9.379366874694824, "learning_rate": 1.3420628328528852e-06, "loss": 0.1651, "num_input_tokens_seen": 9255360, "step": 13735 }, { "epoch": 0.33567048591600906, "grad_norm": 12.51583194732666, "learning_rate": 1.3425514242439047e-06, "loss": 0.1308, "num_input_tokens_seen": 9258560, "step": 13740 }, { "epoch": 0.33579263674785625, "grad_norm": 28.122478485107422, "learning_rate": 1.3430400156349243e-06, "loss": 0.2076, "num_input_tokens_seen": 9261824, "step": 13745 }, { "epoch": 0.33591478757970344, "grad_norm": 7.3910746574401855, "learning_rate": 1.3435286070259442e-06, "loss": 0.111, "num_input_tokens_seen": 9265088, "step": 13750 }, { "epoch": 0.3360369384115506, "grad_norm": 14.954057693481445, "learning_rate": 1.344017198416964e-06, "loss": 0.0729, "num_input_tokens_seen": 9268416, "step": 13755 }, { "epoch": 0.33615908924339777, "grad_norm": 21.911819458007812, "learning_rate": 1.3445057898079834e-06, "loss": 0.1038, "num_input_tokens_seen": 9271936, "step": 13760 }, { "epoch": 0.3362812400752449, "grad_norm": 19.510435104370117, "learning_rate": 1.3449943811990033e-06, "loss": 0.18, "num_input_tokens_seen": 9275392, "step": 13765 }, { "epoch": 0.3364033909070921, "grad_norm": 13.656414985656738, "learning_rate": 1.345482972590023e-06, "loss": 0.1319, "num_input_tokens_seen": 9278720, "step": 13770 }, { "epoch": 0.3365255417389392, "grad_norm": 4.69479513168335, "learning_rate": 1.3459715639810424e-06, "loss": 0.1157, "num_input_tokens_seen": 9281792, "step": 13775 }, { "epoch": 0.3366476925707864, "grad_norm": 21.458940505981445, "learning_rate": 1.3464601553720623e-06, "loss": 0.0984, "num_input_tokens_seen": 9285056, "step": 13780 }, { "epoch": 0.33676984340263355, "grad_norm": 1.911492943763733, "learning_rate": 1.346948746763082e-06, "loss": 0.0676, "num_input_tokens_seen": 9288512, "step": 13785 }, { "epoch": 0.33689199423448074, "grad_norm": 10.871212005615234, "learning_rate": 1.3474373381541017e-06, "loss": 0.1568, "num_input_tokens_seen": 9291456, "step": 13790 }, { "epoch": 0.3370141450663279, "grad_norm": 1.0117473602294922, "learning_rate": 1.3479259295451214e-06, "loss": 0.0388, "num_input_tokens_seen": 9294912, "step": 13795 }, { "epoch": 0.33713629589817506, "grad_norm": 0.2364530861377716, "learning_rate": 1.348414520936141e-06, "loss": 0.1024, "num_input_tokens_seen": 9298304, "step": 13800 }, { "epoch": 0.33725844673002225, "grad_norm": 20.05010414123535, "learning_rate": 1.3489031123271607e-06, "loss": 0.0944, "num_input_tokens_seen": 9301632, "step": 13805 }, { "epoch": 0.3373805975618694, "grad_norm": 20.302865982055664, "learning_rate": 1.3493917037181804e-06, "loss": 0.1152, "num_input_tokens_seen": 9304896, "step": 13810 }, { "epoch": 0.3375027483937166, "grad_norm": 0.9043674468994141, "learning_rate": 1.3498802951092e-06, "loss": 0.19, "num_input_tokens_seen": 9308096, "step": 13815 }, { "epoch": 0.3376248992255637, "grad_norm": 0.870741069316864, "learning_rate": 1.3503688865002198e-06, "loss": 0.0909, "num_input_tokens_seen": 9311616, "step": 13820 }, { "epoch": 0.3377470500574109, "grad_norm": 29.548307418823242, "learning_rate": 1.3508574778912397e-06, "loss": 0.2014, "num_input_tokens_seen": 9314880, "step": 13825 }, { "epoch": 0.33786920088925804, "grad_norm": 23.69499397277832, "learning_rate": 1.3513460692822591e-06, "loss": 0.1298, "num_input_tokens_seen": 9319424, "step": 13830 }, { "epoch": 0.3379913517211052, "grad_norm": 8.086859703063965, "learning_rate": 1.3518346606732788e-06, "loss": 0.1577, "num_input_tokens_seen": 9322560, "step": 13835 }, { "epoch": 0.33811350255295236, "grad_norm": 46.39087677001953, "learning_rate": 1.3523232520642987e-06, "loss": 0.1624, "num_input_tokens_seen": 9326080, "step": 13840 }, { "epoch": 0.33823565338479955, "grad_norm": 35.82646179199219, "learning_rate": 1.3528118434553182e-06, "loss": 0.2945, "num_input_tokens_seen": 9328960, "step": 13845 }, { "epoch": 0.33835780421664674, "grad_norm": 7.0568766593933105, "learning_rate": 1.3533004348463379e-06, "loss": 0.0515, "num_input_tokens_seen": 9332288, "step": 13850 }, { "epoch": 0.3384799550484939, "grad_norm": 1.6458518505096436, "learning_rate": 1.3537890262373578e-06, "loss": 0.0948, "num_input_tokens_seen": 9335424, "step": 13855 }, { "epoch": 0.33860210588034106, "grad_norm": 14.4976806640625, "learning_rate": 1.3542776176283772e-06, "loss": 0.1205, "num_input_tokens_seen": 9338624, "step": 13860 }, { "epoch": 0.3387242567121882, "grad_norm": 0.4415428340435028, "learning_rate": 1.354766209019397e-06, "loss": 0.0804, "num_input_tokens_seen": 9342016, "step": 13865 }, { "epoch": 0.3388464075440354, "grad_norm": 6.9508161544799805, "learning_rate": 1.3552548004104168e-06, "loss": 0.0663, "num_input_tokens_seen": 9345408, "step": 13870 }, { "epoch": 0.3389685583758825, "grad_norm": 10.175514221191406, "learning_rate": 1.3557433918014365e-06, "loss": 0.2014, "num_input_tokens_seen": 9348736, "step": 13875 }, { "epoch": 0.3390907092077297, "grad_norm": 2.260913848876953, "learning_rate": 1.356231983192456e-06, "loss": 0.1201, "num_input_tokens_seen": 9352064, "step": 13880 }, { "epoch": 0.33921286003957685, "grad_norm": 33.67397689819336, "learning_rate": 1.3567205745834758e-06, "loss": 0.1617, "num_input_tokens_seen": 9356032, "step": 13885 }, { "epoch": 0.33933501087142404, "grad_norm": 13.100915908813477, "learning_rate": 1.3572091659744955e-06, "loss": 0.0969, "num_input_tokens_seen": 9359808, "step": 13890 }, { "epoch": 0.3394571617032712, "grad_norm": 20.006126403808594, "learning_rate": 1.357697757365515e-06, "loss": 0.1939, "num_input_tokens_seen": 9362880, "step": 13895 }, { "epoch": 0.33957931253511836, "grad_norm": 24.357524871826172, "learning_rate": 1.3581863487565349e-06, "loss": 0.1736, "num_input_tokens_seen": 9366336, "step": 13900 }, { "epoch": 0.33970146336696555, "grad_norm": 2.007045030593872, "learning_rate": 1.3586749401475546e-06, "loss": 0.0832, "num_input_tokens_seen": 9370240, "step": 13905 }, { "epoch": 0.3398236141988127, "grad_norm": 38.24091339111328, "learning_rate": 1.3591635315385743e-06, "loss": 0.1163, "num_input_tokens_seen": 9373504, "step": 13910 }, { "epoch": 0.3399457650306599, "grad_norm": 17.034334182739258, "learning_rate": 1.359652122929594e-06, "loss": 0.1155, "num_input_tokens_seen": 9376704, "step": 13915 }, { "epoch": 0.340067915862507, "grad_norm": 49.082942962646484, "learning_rate": 1.3601407143206136e-06, "loss": 0.1426, "num_input_tokens_seen": 9380096, "step": 13920 }, { "epoch": 0.3401900666943542, "grad_norm": 18.470176696777344, "learning_rate": 1.3606293057116333e-06, "loss": 0.0888, "num_input_tokens_seen": 9383936, "step": 13925 }, { "epoch": 0.34031221752620133, "grad_norm": 46.603797912597656, "learning_rate": 1.361117897102653e-06, "loss": 0.2428, "num_input_tokens_seen": 9387072, "step": 13930 }, { "epoch": 0.3404343683580485, "grad_norm": 36.19292449951172, "learning_rate": 1.3616064884936727e-06, "loss": 0.0733, "num_input_tokens_seen": 9390208, "step": 13935 }, { "epoch": 0.34055651918989566, "grad_norm": 33.56175231933594, "learning_rate": 1.3620950798846923e-06, "loss": 0.2286, "num_input_tokens_seen": 9393280, "step": 13940 }, { "epoch": 0.34067867002174285, "grad_norm": 11.620241165161133, "learning_rate": 1.3625836712757122e-06, "loss": 0.195, "num_input_tokens_seen": 9396928, "step": 13945 }, { "epoch": 0.34080082085359004, "grad_norm": 5.474987030029297, "learning_rate": 1.3630722626667317e-06, "loss": 0.1635, "num_input_tokens_seen": 9400512, "step": 13950 }, { "epoch": 0.34092297168543717, "grad_norm": 31.326126098632812, "learning_rate": 1.3635608540577514e-06, "loss": 0.0758, "num_input_tokens_seen": 9403968, "step": 13955 }, { "epoch": 0.34104512251728436, "grad_norm": 4.288376808166504, "learning_rate": 1.3640494454487713e-06, "loss": 0.0502, "num_input_tokens_seen": 9406848, "step": 13960 }, { "epoch": 0.3411672733491315, "grad_norm": 42.29257583618164, "learning_rate": 1.3645380368397907e-06, "loss": 0.0588, "num_input_tokens_seen": 9410304, "step": 13965 }, { "epoch": 0.3412894241809787, "grad_norm": 26.356138229370117, "learning_rate": 1.3650266282308104e-06, "loss": 0.1204, "num_input_tokens_seen": 9413504, "step": 13970 }, { "epoch": 0.3414115750128258, "grad_norm": 17.461332321166992, "learning_rate": 1.3655152196218303e-06, "loss": 0.0598, "num_input_tokens_seen": 9416512, "step": 13975 }, { "epoch": 0.341533725844673, "grad_norm": 25.22222328186035, "learning_rate": 1.3660038110128498e-06, "loss": 0.1496, "num_input_tokens_seen": 9419840, "step": 13980 }, { "epoch": 0.34165587667652014, "grad_norm": 2.4524993896484375, "learning_rate": 1.3664924024038695e-06, "loss": 0.1019, "num_input_tokens_seen": 9423168, "step": 13985 }, { "epoch": 0.34177802750836733, "grad_norm": 55.20090103149414, "learning_rate": 1.3669809937948894e-06, "loss": 0.1098, "num_input_tokens_seen": 9426752, "step": 13990 }, { "epoch": 0.3419001783402145, "grad_norm": 38.13460922241211, "learning_rate": 1.367469585185909e-06, "loss": 0.1414, "num_input_tokens_seen": 9430464, "step": 13995 }, { "epoch": 0.34202232917206166, "grad_norm": 10.772130966186523, "learning_rate": 1.3679581765769285e-06, "loss": 0.1338, "num_input_tokens_seen": 9434752, "step": 14000 }, { "epoch": 0.34214448000390885, "grad_norm": 7.737995624542236, "learning_rate": 1.3684467679679484e-06, "loss": 0.1397, "num_input_tokens_seen": 9438144, "step": 14005 }, { "epoch": 0.342266630835756, "grad_norm": 21.433128356933594, "learning_rate": 1.368935359358968e-06, "loss": 0.186, "num_input_tokens_seen": 9441472, "step": 14010 }, { "epoch": 0.34238878166760317, "grad_norm": 14.350296020507812, "learning_rate": 1.3694239507499876e-06, "loss": 0.1268, "num_input_tokens_seen": 9444544, "step": 14015 }, { "epoch": 0.3425109324994503, "grad_norm": 10.877737998962402, "learning_rate": 1.3699125421410075e-06, "loss": 0.1247, "num_input_tokens_seen": 9447808, "step": 14020 }, { "epoch": 0.3426330833312975, "grad_norm": 32.232540130615234, "learning_rate": 1.3704011335320271e-06, "loss": 0.2446, "num_input_tokens_seen": 9450688, "step": 14025 }, { "epoch": 0.34275523416314463, "grad_norm": 11.100104331970215, "learning_rate": 1.3708897249230468e-06, "loss": 0.1617, "num_input_tokens_seen": 9453824, "step": 14030 }, { "epoch": 0.3428773849949918, "grad_norm": 17.309906005859375, "learning_rate": 1.3713783163140665e-06, "loss": 0.1629, "num_input_tokens_seen": 9457408, "step": 14035 }, { "epoch": 0.342999535826839, "grad_norm": 15.123571395874023, "learning_rate": 1.3718669077050862e-06, "loss": 0.1217, "num_input_tokens_seen": 9461376, "step": 14040 }, { "epoch": 0.34312168665868614, "grad_norm": 1.2097609043121338, "learning_rate": 1.3723554990961059e-06, "loss": 0.0887, "num_input_tokens_seen": 9464576, "step": 14045 }, { "epoch": 0.34324383749053333, "grad_norm": 2.189056158065796, "learning_rate": 1.3728440904871255e-06, "loss": 0.0384, "num_input_tokens_seen": 9467776, "step": 14050 }, { "epoch": 0.34336598832238047, "grad_norm": 24.694095611572266, "learning_rate": 1.3733326818781452e-06, "loss": 0.1598, "num_input_tokens_seen": 9471040, "step": 14055 }, { "epoch": 0.34348813915422766, "grad_norm": 29.479421615600586, "learning_rate": 1.373821273269165e-06, "loss": 0.0941, "num_input_tokens_seen": 9474368, "step": 14060 }, { "epoch": 0.3436102899860748, "grad_norm": 11.847206115722656, "learning_rate": 1.3743098646601848e-06, "loss": 0.133, "num_input_tokens_seen": 9477632, "step": 14065 }, { "epoch": 0.343732440817922, "grad_norm": 20.486820220947266, "learning_rate": 1.3747984560512043e-06, "loss": 0.0703, "num_input_tokens_seen": 9481280, "step": 14070 }, { "epoch": 0.3438545916497691, "grad_norm": 3.3354222774505615, "learning_rate": 1.375287047442224e-06, "loss": 0.1171, "num_input_tokens_seen": 9483968, "step": 14075 }, { "epoch": 0.3439767424816163, "grad_norm": 16.6711368560791, "learning_rate": 1.3757756388332438e-06, "loss": 0.1291, "num_input_tokens_seen": 9487040, "step": 14080 }, { "epoch": 0.34409889331346344, "grad_norm": 32.15079879760742, "learning_rate": 1.3762642302242633e-06, "loss": 0.2305, "num_input_tokens_seen": 9490752, "step": 14085 }, { "epoch": 0.34422104414531063, "grad_norm": 16.164844512939453, "learning_rate": 1.376752821615283e-06, "loss": 0.1599, "num_input_tokens_seen": 9494016, "step": 14090 }, { "epoch": 0.3443431949771578, "grad_norm": 21.418575286865234, "learning_rate": 1.3772414130063029e-06, "loss": 0.0997, "num_input_tokens_seen": 9497408, "step": 14095 }, { "epoch": 0.34446534580900495, "grad_norm": 4.597272872924805, "learning_rate": 1.3777300043973226e-06, "loss": 0.0827, "num_input_tokens_seen": 9500928, "step": 14100 }, { "epoch": 0.34458749664085214, "grad_norm": 36.62041473388672, "learning_rate": 1.378218595788342e-06, "loss": 0.2393, "num_input_tokens_seen": 9504128, "step": 14105 }, { "epoch": 0.3447096474726993, "grad_norm": 21.164810180664062, "learning_rate": 1.378707187179362e-06, "loss": 0.1463, "num_input_tokens_seen": 9507456, "step": 14110 }, { "epoch": 0.34483179830454647, "grad_norm": 12.85429573059082, "learning_rate": 1.3791957785703816e-06, "loss": 0.0823, "num_input_tokens_seen": 9511104, "step": 14115 }, { "epoch": 0.3449539491363936, "grad_norm": 12.703222274780273, "learning_rate": 1.379684369961401e-06, "loss": 0.0464, "num_input_tokens_seen": 9514112, "step": 14120 }, { "epoch": 0.3450760999682408, "grad_norm": 1.6505541801452637, "learning_rate": 1.380172961352421e-06, "loss": 0.1753, "num_input_tokens_seen": 9517184, "step": 14125 }, { "epoch": 0.3451982508000879, "grad_norm": 1.279025912284851, "learning_rate": 1.3806615527434407e-06, "loss": 0.0295, "num_input_tokens_seen": 9520448, "step": 14130 }, { "epoch": 0.3453204016319351, "grad_norm": 37.95659637451172, "learning_rate": 1.3811501441344601e-06, "loss": 0.3037, "num_input_tokens_seen": 9523776, "step": 14135 }, { "epoch": 0.3454425524637823, "grad_norm": 10.611307144165039, "learning_rate": 1.38163873552548e-06, "loss": 0.1079, "num_input_tokens_seen": 9527360, "step": 14140 }, { "epoch": 0.34556470329562944, "grad_norm": 3.302593946456909, "learning_rate": 1.3821273269164997e-06, "loss": 0.0986, "num_input_tokens_seen": 9530560, "step": 14145 }, { "epoch": 0.34568685412747663, "grad_norm": 36.47740173339844, "learning_rate": 1.3826159183075194e-06, "loss": 0.075, "num_input_tokens_seen": 9533888, "step": 14150 }, { "epoch": 0.34580900495932376, "grad_norm": 0.7400091886520386, "learning_rate": 1.383104509698539e-06, "loss": 0.1583, "num_input_tokens_seen": 9538176, "step": 14155 }, { "epoch": 0.34593115579117095, "grad_norm": 1.301757574081421, "learning_rate": 1.3835931010895587e-06, "loss": 0.0705, "num_input_tokens_seen": 9541760, "step": 14160 }, { "epoch": 0.3460533066230181, "grad_norm": 27.92353630065918, "learning_rate": 1.3840816924805784e-06, "loss": 0.0922, "num_input_tokens_seen": 9544960, "step": 14165 }, { "epoch": 0.3461754574548653, "grad_norm": 47.610252380371094, "learning_rate": 1.384570283871598e-06, "loss": 0.251, "num_input_tokens_seen": 9548224, "step": 14170 }, { "epoch": 0.3462976082867124, "grad_norm": 33.565673828125, "learning_rate": 1.3850588752626178e-06, "loss": 0.0849, "num_input_tokens_seen": 9552000, "step": 14175 }, { "epoch": 0.3464197591185596, "grad_norm": 45.37564468383789, "learning_rate": 1.3855474666536375e-06, "loss": 0.2377, "num_input_tokens_seen": 9555072, "step": 14180 }, { "epoch": 0.34654190995040673, "grad_norm": 10.524803161621094, "learning_rate": 1.3860360580446574e-06, "loss": 0.0824, "num_input_tokens_seen": 9557952, "step": 14185 }, { "epoch": 0.3466640607822539, "grad_norm": 6.826573848724365, "learning_rate": 1.3865246494356768e-06, "loss": 0.1271, "num_input_tokens_seen": 9561024, "step": 14190 }, { "epoch": 0.3467862116141011, "grad_norm": 42.382469177246094, "learning_rate": 1.3870132408266965e-06, "loss": 0.1956, "num_input_tokens_seen": 9564736, "step": 14195 }, { "epoch": 0.34690836244594825, "grad_norm": 9.132492065429688, "learning_rate": 1.3875018322177164e-06, "loss": 0.1167, "num_input_tokens_seen": 9567808, "step": 14200 }, { "epoch": 0.34703051327779544, "grad_norm": 5.075997829437256, "learning_rate": 1.3879904236087359e-06, "loss": 0.0684, "num_input_tokens_seen": 9570880, "step": 14205 }, { "epoch": 0.3471526641096426, "grad_norm": 29.276092529296875, "learning_rate": 1.3884790149997556e-06, "loss": 0.1585, "num_input_tokens_seen": 9574720, "step": 14210 }, { "epoch": 0.34727481494148976, "grad_norm": 24.11640167236328, "learning_rate": 1.3889676063907754e-06, "loss": 0.1083, "num_input_tokens_seen": 9578560, "step": 14215 }, { "epoch": 0.3473969657733369, "grad_norm": 0.9906240701675415, "learning_rate": 1.3894561977817951e-06, "loss": 0.047, "num_input_tokens_seen": 9581568, "step": 14220 }, { "epoch": 0.3475191166051841, "grad_norm": 1.8317592144012451, "learning_rate": 1.3899447891728146e-06, "loss": 0.0997, "num_input_tokens_seen": 9585408, "step": 14225 }, { "epoch": 0.3476412674370312, "grad_norm": 23.83988380432129, "learning_rate": 1.3904333805638345e-06, "loss": 0.0707, "num_input_tokens_seen": 9588864, "step": 14230 }, { "epoch": 0.3477634182688784, "grad_norm": 2.099985122680664, "learning_rate": 1.3909219719548542e-06, "loss": 0.1108, "num_input_tokens_seen": 9592064, "step": 14235 }, { "epoch": 0.3478855691007256, "grad_norm": 15.784367561340332, "learning_rate": 1.3914105633458736e-06, "loss": 0.1077, "num_input_tokens_seen": 9595328, "step": 14240 }, { "epoch": 0.34800771993257273, "grad_norm": 103.47981262207031, "learning_rate": 1.3918991547368935e-06, "loss": 0.2657, "num_input_tokens_seen": 9598592, "step": 14245 }, { "epoch": 0.3481298707644199, "grad_norm": 58.79197692871094, "learning_rate": 1.3923877461279132e-06, "loss": 0.1663, "num_input_tokens_seen": 9601856, "step": 14250 }, { "epoch": 0.34825202159626706, "grad_norm": 0.24049213528633118, "learning_rate": 1.3928763375189327e-06, "loss": 0.0336, "num_input_tokens_seen": 9605184, "step": 14255 }, { "epoch": 0.34837417242811425, "grad_norm": 51.701271057128906, "learning_rate": 1.3933649289099526e-06, "loss": 0.1935, "num_input_tokens_seen": 9608256, "step": 14260 }, { "epoch": 0.3484963232599614, "grad_norm": 16.26445960998535, "learning_rate": 1.3938535203009723e-06, "loss": 0.1625, "num_input_tokens_seen": 9611328, "step": 14265 }, { "epoch": 0.3486184740918086, "grad_norm": 13.481324195861816, "learning_rate": 1.394342111691992e-06, "loss": 0.1454, "num_input_tokens_seen": 9614400, "step": 14270 }, { "epoch": 0.3487406249236557, "grad_norm": 23.933629989624023, "learning_rate": 1.3948307030830116e-06, "loss": 0.1765, "num_input_tokens_seen": 9618304, "step": 14275 }, { "epoch": 0.3488627757555029, "grad_norm": 11.629081726074219, "learning_rate": 1.3953192944740313e-06, "loss": 0.097, "num_input_tokens_seen": 9622016, "step": 14280 }, { "epoch": 0.3489849265873501, "grad_norm": 14.450347900390625, "learning_rate": 1.395807885865051e-06, "loss": 0.1955, "num_input_tokens_seen": 9625344, "step": 14285 }, { "epoch": 0.3491070774191972, "grad_norm": 28.15384864807129, "learning_rate": 1.3962964772560707e-06, "loss": 0.1924, "num_input_tokens_seen": 9629056, "step": 14290 }, { "epoch": 0.3492292282510444, "grad_norm": 13.48028564453125, "learning_rate": 1.3967850686470903e-06, "loss": 0.2156, "num_input_tokens_seen": 9632384, "step": 14295 }, { "epoch": 0.34935137908289154, "grad_norm": 2.4608681201934814, "learning_rate": 1.39727366003811e-06, "loss": 0.0258, "num_input_tokens_seen": 9636224, "step": 14300 }, { "epoch": 0.34947352991473873, "grad_norm": 16.832611083984375, "learning_rate": 1.39776225142913e-06, "loss": 0.0459, "num_input_tokens_seen": 9639680, "step": 14305 }, { "epoch": 0.34959568074658587, "grad_norm": 25.56661605834961, "learning_rate": 1.3982508428201494e-06, "loss": 0.0851, "num_input_tokens_seen": 9642816, "step": 14310 }, { "epoch": 0.34971783157843306, "grad_norm": 2.605492353439331, "learning_rate": 1.398739434211169e-06, "loss": 0.1033, "num_input_tokens_seen": 9645952, "step": 14315 }, { "epoch": 0.3498399824102802, "grad_norm": 0.7130058407783508, "learning_rate": 1.399228025602189e-06, "loss": 0.1258, "num_input_tokens_seen": 9649152, "step": 14320 }, { "epoch": 0.3499621332421274, "grad_norm": 28.27107810974121, "learning_rate": 1.3997166169932084e-06, "loss": 0.0797, "num_input_tokens_seen": 9652544, "step": 14325 }, { "epoch": 0.3500842840739745, "grad_norm": 40.19394302368164, "learning_rate": 1.4002052083842281e-06, "loss": 0.145, "num_input_tokens_seen": 9656320, "step": 14330 }, { "epoch": 0.3502064349058217, "grad_norm": 26.140825271606445, "learning_rate": 1.400693799775248e-06, "loss": 0.1833, "num_input_tokens_seen": 9659264, "step": 14335 }, { "epoch": 0.3503285857376689, "grad_norm": 36.82887649536133, "learning_rate": 1.4011823911662677e-06, "loss": 0.2014, "num_input_tokens_seen": 9662528, "step": 14340 }, { "epoch": 0.35045073656951603, "grad_norm": 6.582906246185303, "learning_rate": 1.4016709825572872e-06, "loss": 0.0795, "num_input_tokens_seen": 9666240, "step": 14345 }, { "epoch": 0.3505728874013632, "grad_norm": 51.33736801147461, "learning_rate": 1.402159573948307e-06, "loss": 0.0894, "num_input_tokens_seen": 9669376, "step": 14350 }, { "epoch": 0.35069503823321035, "grad_norm": 2.4924840927124023, "learning_rate": 1.4026481653393267e-06, "loss": 0.0369, "num_input_tokens_seen": 9672768, "step": 14355 }, { "epoch": 0.35081718906505754, "grad_norm": 13.934281349182129, "learning_rate": 1.4031367567303462e-06, "loss": 0.1528, "num_input_tokens_seen": 9675840, "step": 14360 }, { "epoch": 0.3509393398969047, "grad_norm": 16.30111312866211, "learning_rate": 1.403625348121366e-06, "loss": 0.1661, "num_input_tokens_seen": 9678976, "step": 14365 }, { "epoch": 0.35106149072875187, "grad_norm": 1.643057942390442, "learning_rate": 1.4041139395123858e-06, "loss": 0.0757, "num_input_tokens_seen": 9682496, "step": 14370 }, { "epoch": 0.351183641560599, "grad_norm": 26.52783203125, "learning_rate": 1.4046025309034055e-06, "loss": 0.1252, "num_input_tokens_seen": 9685568, "step": 14375 }, { "epoch": 0.3513057923924462, "grad_norm": 31.023881912231445, "learning_rate": 1.4050911222944251e-06, "loss": 0.183, "num_input_tokens_seen": 9689344, "step": 14380 }, { "epoch": 0.3514279432242934, "grad_norm": 53.98402786254883, "learning_rate": 1.4055797136854448e-06, "loss": 0.0644, "num_input_tokens_seen": 9692288, "step": 14385 }, { "epoch": 0.3515500940561405, "grad_norm": 32.249046325683594, "learning_rate": 1.4060683050764645e-06, "loss": 0.1095, "num_input_tokens_seen": 9695744, "step": 14390 }, { "epoch": 0.3516722448879877, "grad_norm": 0.3559386432170868, "learning_rate": 1.4065568964674842e-06, "loss": 0.1922, "num_input_tokens_seen": 9698880, "step": 14395 }, { "epoch": 0.35179439571983484, "grad_norm": 1.8468283414840698, "learning_rate": 1.4070454878585039e-06, "loss": 0.1522, "num_input_tokens_seen": 9702400, "step": 14400 }, { "epoch": 0.35191654655168203, "grad_norm": 1.7591185569763184, "learning_rate": 1.4075340792495235e-06, "loss": 0.059, "num_input_tokens_seen": 9705792, "step": 14405 }, { "epoch": 0.35203869738352916, "grad_norm": 15.38747787475586, "learning_rate": 1.4080226706405432e-06, "loss": 0.0717, "num_input_tokens_seen": 9709056, "step": 14410 }, { "epoch": 0.35216084821537635, "grad_norm": 2.455747127532959, "learning_rate": 1.408511262031563e-06, "loss": 0.113, "num_input_tokens_seen": 9712512, "step": 14415 }, { "epoch": 0.3522829990472235, "grad_norm": 12.720258712768555, "learning_rate": 1.4089998534225826e-06, "loss": 0.2413, "num_input_tokens_seen": 9716544, "step": 14420 }, { "epoch": 0.3524051498790707, "grad_norm": 20.368738174438477, "learning_rate": 1.4094884448136025e-06, "loss": 0.126, "num_input_tokens_seen": 9720000, "step": 14425 }, { "epoch": 0.35252730071091787, "grad_norm": 30.44447898864746, "learning_rate": 1.409977036204622e-06, "loss": 0.186, "num_input_tokens_seen": 9723520, "step": 14430 }, { "epoch": 0.352649451542765, "grad_norm": 8.672028541564941, "learning_rate": 1.4104656275956416e-06, "loss": 0.1164, "num_input_tokens_seen": 9726400, "step": 14435 }, { "epoch": 0.3527716023746122, "grad_norm": 23.961200714111328, "learning_rate": 1.4109542189866615e-06, "loss": 0.1667, "num_input_tokens_seen": 9729472, "step": 14440 }, { "epoch": 0.3528937532064593, "grad_norm": 17.22600746154785, "learning_rate": 1.411442810377681e-06, "loss": 0.0485, "num_input_tokens_seen": 9733056, "step": 14445 }, { "epoch": 0.3530159040383065, "grad_norm": 1.832526445388794, "learning_rate": 1.4119314017687007e-06, "loss": 0.0319, "num_input_tokens_seen": 9737088, "step": 14450 }, { "epoch": 0.35313805487015365, "grad_norm": 21.00478172302246, "learning_rate": 1.4124199931597206e-06, "loss": 0.0899, "num_input_tokens_seen": 9740480, "step": 14455 }, { "epoch": 0.35326020570200084, "grad_norm": 11.341621398925781, "learning_rate": 1.4129085845507403e-06, "loss": 0.1556, "num_input_tokens_seen": 9743744, "step": 14460 }, { "epoch": 0.353382356533848, "grad_norm": 22.92499542236328, "learning_rate": 1.4133971759417597e-06, "loss": 0.0558, "num_input_tokens_seen": 9746944, "step": 14465 }, { "epoch": 0.35350450736569516, "grad_norm": 21.124168395996094, "learning_rate": 1.4138857673327796e-06, "loss": 0.0949, "num_input_tokens_seen": 9750144, "step": 14470 }, { "epoch": 0.3536266581975423, "grad_norm": 6.369211673736572, "learning_rate": 1.4143743587237993e-06, "loss": 0.1063, "num_input_tokens_seen": 9753984, "step": 14475 }, { "epoch": 0.3537488090293895, "grad_norm": 10.5155611038208, "learning_rate": 1.4148629501148188e-06, "loss": 0.1582, "num_input_tokens_seen": 9757376, "step": 14480 }, { "epoch": 0.3538709598612367, "grad_norm": 12.186725616455078, "learning_rate": 1.4153515415058387e-06, "loss": 0.047, "num_input_tokens_seen": 9760576, "step": 14485 }, { "epoch": 0.3539931106930838, "grad_norm": 3.1093106269836426, "learning_rate": 1.4158401328968583e-06, "loss": 0.0452, "num_input_tokens_seen": 9763520, "step": 14490 }, { "epoch": 0.354115261524931, "grad_norm": 24.648107528686523, "learning_rate": 1.416328724287878e-06, "loss": 0.1651, "num_input_tokens_seen": 9767040, "step": 14495 }, { "epoch": 0.35423741235677814, "grad_norm": 1.211809515953064, "learning_rate": 1.4168173156788977e-06, "loss": 0.1228, "num_input_tokens_seen": 9770368, "step": 14500 }, { "epoch": 0.3543595631886253, "grad_norm": 0.7195934057235718, "learning_rate": 1.4173059070699174e-06, "loss": 0.0829, "num_input_tokens_seen": 9773888, "step": 14505 }, { "epoch": 0.35448171402047246, "grad_norm": 19.63247299194336, "learning_rate": 1.417794498460937e-06, "loss": 0.1262, "num_input_tokens_seen": 9777216, "step": 14510 }, { "epoch": 0.35460386485231965, "grad_norm": 36.69753646850586, "learning_rate": 1.4182830898519568e-06, "loss": 0.0925, "num_input_tokens_seen": 9780544, "step": 14515 }, { "epoch": 0.3547260156841668, "grad_norm": 0.07466413825750351, "learning_rate": 1.4187716812429764e-06, "loss": 0.1258, "num_input_tokens_seen": 9783680, "step": 14520 }, { "epoch": 0.354848166516014, "grad_norm": 24.7801570892334, "learning_rate": 1.4192602726339961e-06, "loss": 0.2356, "num_input_tokens_seen": 9786880, "step": 14525 }, { "epoch": 0.35497031734786116, "grad_norm": 23.782604217529297, "learning_rate": 1.419748864025016e-06, "loss": 0.1121, "num_input_tokens_seen": 9790144, "step": 14530 }, { "epoch": 0.3550924681797083, "grad_norm": 16.412891387939453, "learning_rate": 1.4202374554160355e-06, "loss": 0.1484, "num_input_tokens_seen": 9794240, "step": 14535 }, { "epoch": 0.3552146190115555, "grad_norm": 21.980588912963867, "learning_rate": 1.4207260468070552e-06, "loss": 0.1654, "num_input_tokens_seen": 9797632, "step": 14540 }, { "epoch": 0.3553367698434026, "grad_norm": 22.10216522216797, "learning_rate": 1.421214638198075e-06, "loss": 0.0966, "num_input_tokens_seen": 9801152, "step": 14545 }, { "epoch": 0.3554589206752498, "grad_norm": 28.08307456970215, "learning_rate": 1.4217032295890945e-06, "loss": 0.1549, "num_input_tokens_seen": 9804352, "step": 14550 }, { "epoch": 0.35558107150709695, "grad_norm": 17.134769439697266, "learning_rate": 1.4221918209801142e-06, "loss": 0.0237, "num_input_tokens_seen": 9808512, "step": 14555 }, { "epoch": 0.35570322233894414, "grad_norm": 26.15998649597168, "learning_rate": 1.422680412371134e-06, "loss": 0.1175, "num_input_tokens_seen": 9811328, "step": 14560 }, { "epoch": 0.35582537317079127, "grad_norm": 16.268836975097656, "learning_rate": 1.4231690037621536e-06, "loss": 0.1035, "num_input_tokens_seen": 9815104, "step": 14565 }, { "epoch": 0.35594752400263846, "grad_norm": 6.109137535095215, "learning_rate": 1.4236575951531732e-06, "loss": 0.1708, "num_input_tokens_seen": 9818688, "step": 14570 }, { "epoch": 0.35606967483448565, "grad_norm": 1.4849107265472412, "learning_rate": 1.4241461865441931e-06, "loss": 0.0521, "num_input_tokens_seen": 9822464, "step": 14575 }, { "epoch": 0.3561918256663328, "grad_norm": 23.204818725585938, "learning_rate": 1.4246347779352128e-06, "loss": 0.182, "num_input_tokens_seen": 9825728, "step": 14580 }, { "epoch": 0.35631397649818, "grad_norm": 15.597477912902832, "learning_rate": 1.4251233693262323e-06, "loss": 0.1155, "num_input_tokens_seen": 9829184, "step": 14585 }, { "epoch": 0.3564361273300271, "grad_norm": 2.52561616897583, "learning_rate": 1.4256119607172522e-06, "loss": 0.1183, "num_input_tokens_seen": 9832704, "step": 14590 }, { "epoch": 0.3565582781618743, "grad_norm": 15.523015022277832, "learning_rate": 1.4261005521082719e-06, "loss": 0.2253, "num_input_tokens_seen": 9835776, "step": 14595 }, { "epoch": 0.35668042899372143, "grad_norm": 26.568798065185547, "learning_rate": 1.4265891434992913e-06, "loss": 0.1039, "num_input_tokens_seen": 9839168, "step": 14600 }, { "epoch": 0.3568025798255686, "grad_norm": 12.2405366897583, "learning_rate": 1.4270777348903112e-06, "loss": 0.1266, "num_input_tokens_seen": 9842112, "step": 14605 }, { "epoch": 0.35692473065741576, "grad_norm": 10.383499145507812, "learning_rate": 1.427566326281331e-06, "loss": 0.0948, "num_input_tokens_seen": 9846016, "step": 14610 }, { "epoch": 0.35704688148926295, "grad_norm": 6.05488395690918, "learning_rate": 1.4280549176723506e-06, "loss": 0.1952, "num_input_tokens_seen": 9849280, "step": 14615 }, { "epoch": 0.3571690323211101, "grad_norm": 12.122830390930176, "learning_rate": 1.4285435090633703e-06, "loss": 0.0812, "num_input_tokens_seen": 9852224, "step": 14620 }, { "epoch": 0.35729118315295727, "grad_norm": 10.15803050994873, "learning_rate": 1.42903210045439e-06, "loss": 0.1073, "num_input_tokens_seen": 9855552, "step": 14625 }, { "epoch": 0.35741333398480446, "grad_norm": 21.924358367919922, "learning_rate": 1.4295206918454096e-06, "loss": 0.1347, "num_input_tokens_seen": 9858560, "step": 14630 }, { "epoch": 0.3575354848166516, "grad_norm": 22.783618927001953, "learning_rate": 1.4300092832364293e-06, "loss": 0.1511, "num_input_tokens_seen": 9862528, "step": 14635 }, { "epoch": 0.3576576356484988, "grad_norm": 52.09235763549805, "learning_rate": 1.430497874627449e-06, "loss": 0.1841, "num_input_tokens_seen": 9865920, "step": 14640 }, { "epoch": 0.3577797864803459, "grad_norm": 24.848285675048828, "learning_rate": 1.4309864660184687e-06, "loss": 0.0648, "num_input_tokens_seen": 9869440, "step": 14645 }, { "epoch": 0.3579019373121931, "grad_norm": 14.198959350585938, "learning_rate": 1.4314750574094886e-06, "loss": 0.1556, "num_input_tokens_seen": 9872320, "step": 14650 }, { "epoch": 0.35802408814404024, "grad_norm": 5.674281120300293, "learning_rate": 1.431963648800508e-06, "loss": 0.1211, "num_input_tokens_seen": 9875968, "step": 14655 }, { "epoch": 0.35814623897588743, "grad_norm": 25.904781341552734, "learning_rate": 1.4324522401915277e-06, "loss": 0.1172, "num_input_tokens_seen": 9879040, "step": 14660 }, { "epoch": 0.35826838980773457, "grad_norm": 7.980208396911621, "learning_rate": 1.4329408315825476e-06, "loss": 0.091, "num_input_tokens_seen": 9882368, "step": 14665 }, { "epoch": 0.35839054063958176, "grad_norm": 10.887474060058594, "learning_rate": 1.433429422973567e-06, "loss": 0.1043, "num_input_tokens_seen": 9885504, "step": 14670 }, { "epoch": 0.35851269147142895, "grad_norm": 4.5630316734313965, "learning_rate": 1.4339180143645868e-06, "loss": 0.0735, "num_input_tokens_seen": 9889088, "step": 14675 }, { "epoch": 0.3586348423032761, "grad_norm": 21.410566329956055, "learning_rate": 1.4344066057556067e-06, "loss": 0.0663, "num_input_tokens_seen": 9892352, "step": 14680 }, { "epoch": 0.35875699313512327, "grad_norm": 6.099719047546387, "learning_rate": 1.4348951971466261e-06, "loss": 0.0196, "num_input_tokens_seen": 9895872, "step": 14685 }, { "epoch": 0.3588791439669704, "grad_norm": 17.751115798950195, "learning_rate": 1.4353837885376458e-06, "loss": 0.0654, "num_input_tokens_seen": 9899200, "step": 14690 }, { "epoch": 0.3590012947988176, "grad_norm": 0.37945330142974854, "learning_rate": 1.4358723799286657e-06, "loss": 0.143, "num_input_tokens_seen": 9903104, "step": 14695 }, { "epoch": 0.35912344563066473, "grad_norm": 13.141345977783203, "learning_rate": 1.4363609713196854e-06, "loss": 0.1784, "num_input_tokens_seen": 9906176, "step": 14700 }, { "epoch": 0.3592455964625119, "grad_norm": 22.139732360839844, "learning_rate": 1.4368495627107049e-06, "loss": 0.0943, "num_input_tokens_seen": 9909504, "step": 14705 }, { "epoch": 0.35936774729435905, "grad_norm": 14.856293678283691, "learning_rate": 1.4373381541017247e-06, "loss": 0.1623, "num_input_tokens_seen": 9913088, "step": 14710 }, { "epoch": 0.35948989812620624, "grad_norm": 20.756912231445312, "learning_rate": 1.4378267454927444e-06, "loss": 0.216, "num_input_tokens_seen": 9916608, "step": 14715 }, { "epoch": 0.35961204895805343, "grad_norm": 7.858725070953369, "learning_rate": 1.438315336883764e-06, "loss": 0.2218, "num_input_tokens_seen": 9919936, "step": 14720 }, { "epoch": 0.35973419978990057, "grad_norm": 22.180316925048828, "learning_rate": 1.4388039282747838e-06, "loss": 0.1147, "num_input_tokens_seen": 9922752, "step": 14725 }, { "epoch": 0.35985635062174776, "grad_norm": 14.053325653076172, "learning_rate": 1.4392925196658035e-06, "loss": 0.1799, "num_input_tokens_seen": 9926144, "step": 14730 }, { "epoch": 0.3599785014535949, "grad_norm": 13.91140365600586, "learning_rate": 1.4397811110568232e-06, "loss": 0.0715, "num_input_tokens_seen": 9929344, "step": 14735 }, { "epoch": 0.3601006522854421, "grad_norm": 13.437630653381348, "learning_rate": 1.4402697024478428e-06, "loss": 0.101, "num_input_tokens_seen": 9932416, "step": 14740 }, { "epoch": 0.3602228031172892, "grad_norm": 2.004776954650879, "learning_rate": 1.4407582938388625e-06, "loss": 0.0896, "num_input_tokens_seen": 9936000, "step": 14745 }, { "epoch": 0.3603449539491364, "grad_norm": 24.528547286987305, "learning_rate": 1.4412468852298822e-06, "loss": 0.0944, "num_input_tokens_seen": 9938816, "step": 14750 }, { "epoch": 0.36046710478098354, "grad_norm": 24.631078720092773, "learning_rate": 1.4417354766209019e-06, "loss": 0.1834, "num_input_tokens_seen": 9942080, "step": 14755 }, { "epoch": 0.36058925561283073, "grad_norm": 25.52674102783203, "learning_rate": 1.4422240680119216e-06, "loss": 0.0603, "num_input_tokens_seen": 9945216, "step": 14760 }, { "epoch": 0.36071140644467786, "grad_norm": 7.100739479064941, "learning_rate": 1.4427126594029412e-06, "loss": 0.0395, "num_input_tokens_seen": 9948416, "step": 14765 }, { "epoch": 0.36083355727652505, "grad_norm": 41.38105010986328, "learning_rate": 1.4432012507939611e-06, "loss": 0.174, "num_input_tokens_seen": 9952128, "step": 14770 }, { "epoch": 0.36095570810837224, "grad_norm": 10.956008911132812, "learning_rate": 1.4436898421849806e-06, "loss": 0.201, "num_input_tokens_seen": 9955584, "step": 14775 }, { "epoch": 0.3610778589402194, "grad_norm": 19.014673233032227, "learning_rate": 1.4441784335760003e-06, "loss": 0.1133, "num_input_tokens_seen": 9959296, "step": 14780 }, { "epoch": 0.36120000977206657, "grad_norm": 16.54399871826172, "learning_rate": 1.4446670249670202e-06, "loss": 0.0877, "num_input_tokens_seen": 9962816, "step": 14785 }, { "epoch": 0.3613221606039137, "grad_norm": 0.5795342922210693, "learning_rate": 1.4451556163580396e-06, "loss": 0.0743, "num_input_tokens_seen": 9965952, "step": 14790 }, { "epoch": 0.3614443114357609, "grad_norm": 23.187488555908203, "learning_rate": 1.4456442077490593e-06, "loss": 0.1787, "num_input_tokens_seen": 9969344, "step": 14795 }, { "epoch": 0.361566462267608, "grad_norm": 22.39498519897461, "learning_rate": 1.4461327991400792e-06, "loss": 0.1723, "num_input_tokens_seen": 9972672, "step": 14800 }, { "epoch": 0.3616886130994552, "grad_norm": 7.977668762207031, "learning_rate": 1.446621390531099e-06, "loss": 0.0897, "num_input_tokens_seen": 9975616, "step": 14805 }, { "epoch": 0.36181076393130235, "grad_norm": 40.34514617919922, "learning_rate": 1.4471099819221184e-06, "loss": 0.1675, "num_input_tokens_seen": 9979648, "step": 14810 }, { "epoch": 0.36193291476314954, "grad_norm": 38.52743911743164, "learning_rate": 1.4475985733131383e-06, "loss": 0.1314, "num_input_tokens_seen": 9983168, "step": 14815 }, { "epoch": 0.36205506559499673, "grad_norm": 14.206141471862793, "learning_rate": 1.448087164704158e-06, "loss": 0.1952, "num_input_tokens_seen": 9986624, "step": 14820 }, { "epoch": 0.36217721642684386, "grad_norm": 36.0733757019043, "learning_rate": 1.4485757560951774e-06, "loss": 0.0967, "num_input_tokens_seen": 9990080, "step": 14825 }, { "epoch": 0.36229936725869105, "grad_norm": 3.471787691116333, "learning_rate": 1.4490643474861973e-06, "loss": 0.1674, "num_input_tokens_seen": 9993088, "step": 14830 }, { "epoch": 0.3624215180905382, "grad_norm": 1.9856698513031006, "learning_rate": 1.449552938877217e-06, "loss": 0.1011, "num_input_tokens_seen": 9996096, "step": 14835 }, { "epoch": 0.3625436689223854, "grad_norm": 22.805315017700195, "learning_rate": 1.4500415302682365e-06, "loss": 0.065, "num_input_tokens_seen": 9999744, "step": 14840 }, { "epoch": 0.3626658197542325, "grad_norm": 4.531700611114502, "learning_rate": 1.4505301216592564e-06, "loss": 0.0536, "num_input_tokens_seen": 10002944, "step": 14845 }, { "epoch": 0.3627879705860797, "grad_norm": 4.974030494689941, "learning_rate": 1.451018713050276e-06, "loss": 0.1733, "num_input_tokens_seen": 10006336, "step": 14850 }, { "epoch": 0.36291012141792683, "grad_norm": 15.88058853149414, "learning_rate": 1.4515073044412957e-06, "loss": 0.1593, "num_input_tokens_seen": 10009344, "step": 14855 }, { "epoch": 0.363032272249774, "grad_norm": 61.39813995361328, "learning_rate": 1.4519958958323154e-06, "loss": 0.1427, "num_input_tokens_seen": 10012352, "step": 14860 }, { "epoch": 0.36315442308162116, "grad_norm": 1.7952611446380615, "learning_rate": 1.452484487223335e-06, "loss": 0.1268, "num_input_tokens_seen": 10015360, "step": 14865 }, { "epoch": 0.36327657391346835, "grad_norm": 12.197927474975586, "learning_rate": 1.4529730786143548e-06, "loss": 0.084, "num_input_tokens_seen": 10019392, "step": 14870 }, { "epoch": 0.36339872474531554, "grad_norm": 19.5502872467041, "learning_rate": 1.4534616700053744e-06, "loss": 0.156, "num_input_tokens_seen": 10022912, "step": 14875 }, { "epoch": 0.36352087557716267, "grad_norm": 26.997758865356445, "learning_rate": 1.4539502613963941e-06, "loss": 0.059, "num_input_tokens_seen": 10026304, "step": 14880 }, { "epoch": 0.36364302640900986, "grad_norm": 1.9117192029953003, "learning_rate": 1.4544388527874138e-06, "loss": 0.1032, "num_input_tokens_seen": 10029568, "step": 14885 }, { "epoch": 0.363765177240857, "grad_norm": 37.157466888427734, "learning_rate": 1.4549274441784337e-06, "loss": 0.1649, "num_input_tokens_seen": 10033088, "step": 14890 }, { "epoch": 0.3638873280727042, "grad_norm": 30.732894897460938, "learning_rate": 1.4554160355694532e-06, "loss": 0.0947, "num_input_tokens_seen": 10036672, "step": 14895 }, { "epoch": 0.3640094789045513, "grad_norm": 4.4297404289245605, "learning_rate": 1.4559046269604728e-06, "loss": 0.2256, "num_input_tokens_seen": 10040000, "step": 14900 }, { "epoch": 0.3641316297363985, "grad_norm": 0.5586517453193665, "learning_rate": 1.4563932183514927e-06, "loss": 0.0815, "num_input_tokens_seen": 10043072, "step": 14905 }, { "epoch": 0.36425378056824564, "grad_norm": 0.9251121878623962, "learning_rate": 1.4568818097425122e-06, "loss": 0.2063, "num_input_tokens_seen": 10046336, "step": 14910 }, { "epoch": 0.36437593140009283, "grad_norm": 31.80733299255371, "learning_rate": 1.4573704011335319e-06, "loss": 0.1413, "num_input_tokens_seen": 10049152, "step": 14915 }, { "epoch": 0.36449808223194, "grad_norm": 23.533954620361328, "learning_rate": 1.4578589925245518e-06, "loss": 0.0804, "num_input_tokens_seen": 10052224, "step": 14920 }, { "epoch": 0.36462023306378716, "grad_norm": 6.359363079071045, "learning_rate": 1.4583475839155715e-06, "loss": 0.0736, "num_input_tokens_seen": 10055488, "step": 14925 }, { "epoch": 0.36474238389563435, "grad_norm": 2.5986242294311523, "learning_rate": 1.458836175306591e-06, "loss": 0.0469, "num_input_tokens_seen": 10058752, "step": 14930 }, { "epoch": 0.3648645347274815, "grad_norm": 30.787466049194336, "learning_rate": 1.4593247666976108e-06, "loss": 0.2228, "num_input_tokens_seen": 10062080, "step": 14935 }, { "epoch": 0.36498668555932867, "grad_norm": 10.304224014282227, "learning_rate": 1.4598133580886305e-06, "loss": 0.1705, "num_input_tokens_seen": 10065408, "step": 14940 }, { "epoch": 0.3651088363911758, "grad_norm": 3.115684747695923, "learning_rate": 1.46030194947965e-06, "loss": 0.0597, "num_input_tokens_seen": 10068480, "step": 14945 }, { "epoch": 0.365230987223023, "grad_norm": 28.490812301635742, "learning_rate": 1.4607905408706699e-06, "loss": 0.2122, "num_input_tokens_seen": 10071488, "step": 14950 }, { "epoch": 0.36535313805487013, "grad_norm": 3.1588618755340576, "learning_rate": 1.4612791322616896e-06, "loss": 0.1656, "num_input_tokens_seen": 10075072, "step": 14955 }, { "epoch": 0.3654752888867173, "grad_norm": 12.030863761901855, "learning_rate": 1.4617677236527092e-06, "loss": 0.157, "num_input_tokens_seen": 10078400, "step": 14960 }, { "epoch": 0.3655974397185645, "grad_norm": 9.774025917053223, "learning_rate": 1.462256315043729e-06, "loss": 0.0806, "num_input_tokens_seen": 10081472, "step": 14965 }, { "epoch": 0.36571959055041164, "grad_norm": 24.714754104614258, "learning_rate": 1.4627449064347486e-06, "loss": 0.1563, "num_input_tokens_seen": 10084608, "step": 14970 }, { "epoch": 0.36584174138225883, "grad_norm": 4.036285877227783, "learning_rate": 1.4632334978257683e-06, "loss": 0.1024, "num_input_tokens_seen": 10088000, "step": 14975 }, { "epoch": 0.36596389221410597, "grad_norm": 11.198826789855957, "learning_rate": 1.463722089216788e-06, "loss": 0.185, "num_input_tokens_seen": 10091520, "step": 14980 }, { "epoch": 0.36608604304595316, "grad_norm": 16.43745994567871, "learning_rate": 1.4642106806078076e-06, "loss": 0.1085, "num_input_tokens_seen": 10094656, "step": 14985 }, { "epoch": 0.3662081938778003, "grad_norm": 33.093833923339844, "learning_rate": 1.4646992719988273e-06, "loss": 0.154, "num_input_tokens_seen": 10097792, "step": 14990 }, { "epoch": 0.3663303447096475, "grad_norm": 6.07112979888916, "learning_rate": 1.465187863389847e-06, "loss": 0.0851, "num_input_tokens_seen": 10101184, "step": 14995 }, { "epoch": 0.3664524955414946, "grad_norm": 18.167476654052734, "learning_rate": 1.4656764547808667e-06, "loss": 0.1695, "num_input_tokens_seen": 10104256, "step": 15000 }, { "epoch": 0.3665746463733418, "grad_norm": 4.230319499969482, "learning_rate": 1.4661650461718864e-06, "loss": 0.0803, "num_input_tokens_seen": 10107328, "step": 15005 }, { "epoch": 0.36669679720518894, "grad_norm": 1.558373212814331, "learning_rate": 1.4666536375629063e-06, "loss": 0.0945, "num_input_tokens_seen": 10110912, "step": 15010 }, { "epoch": 0.36681894803703613, "grad_norm": 3.3440184593200684, "learning_rate": 1.4671422289539257e-06, "loss": 0.108, "num_input_tokens_seen": 10114176, "step": 15015 }, { "epoch": 0.3669410988688833, "grad_norm": 2.4016432762145996, "learning_rate": 1.4676308203449454e-06, "loss": 0.0921, "num_input_tokens_seen": 10117248, "step": 15020 }, { "epoch": 0.36706324970073045, "grad_norm": 11.902193069458008, "learning_rate": 1.4681194117359653e-06, "loss": 0.0643, "num_input_tokens_seen": 10120768, "step": 15025 }, { "epoch": 0.36718540053257764, "grad_norm": 14.054060935974121, "learning_rate": 1.4686080031269848e-06, "loss": 0.0781, "num_input_tokens_seen": 10123840, "step": 15030 }, { "epoch": 0.3673075513644248, "grad_norm": 2.9538633823394775, "learning_rate": 1.4690965945180045e-06, "loss": 0.0874, "num_input_tokens_seen": 10127104, "step": 15035 }, { "epoch": 0.36742970219627197, "grad_norm": 10.337444305419922, "learning_rate": 1.4695851859090243e-06, "loss": 0.098, "num_input_tokens_seen": 10130688, "step": 15040 }, { "epoch": 0.3675518530281191, "grad_norm": 19.6004695892334, "learning_rate": 1.470073777300044e-06, "loss": 0.1618, "num_input_tokens_seen": 10134144, "step": 15045 }, { "epoch": 0.3676740038599663, "grad_norm": 11.789777755737305, "learning_rate": 1.4705623686910635e-06, "loss": 0.0887, "num_input_tokens_seen": 10137344, "step": 15050 }, { "epoch": 0.3677961546918134, "grad_norm": 29.683469772338867, "learning_rate": 1.4710509600820834e-06, "loss": 0.2011, "num_input_tokens_seen": 10140800, "step": 15055 }, { "epoch": 0.3679183055236606, "grad_norm": 30.46916961669922, "learning_rate": 1.471539551473103e-06, "loss": 0.1338, "num_input_tokens_seen": 10143936, "step": 15060 }, { "epoch": 0.3680404563555078, "grad_norm": 10.194910049438477, "learning_rate": 1.4720281428641225e-06, "loss": 0.0552, "num_input_tokens_seen": 10147392, "step": 15065 }, { "epoch": 0.36816260718735494, "grad_norm": 23.93616485595703, "learning_rate": 1.4725167342551424e-06, "loss": 0.1437, "num_input_tokens_seen": 10150400, "step": 15070 }, { "epoch": 0.36828475801920213, "grad_norm": 7.265649795532227, "learning_rate": 1.4730053256461621e-06, "loss": 0.0665, "num_input_tokens_seen": 10153728, "step": 15075 }, { "epoch": 0.36840690885104926, "grad_norm": 23.494064331054688, "learning_rate": 1.4734939170371818e-06, "loss": 0.064, "num_input_tokens_seen": 10157056, "step": 15080 }, { "epoch": 0.36852905968289645, "grad_norm": 17.327404022216797, "learning_rate": 1.4739825084282015e-06, "loss": 0.1049, "num_input_tokens_seen": 10161152, "step": 15085 }, { "epoch": 0.3686512105147436, "grad_norm": 15.256969451904297, "learning_rate": 1.4744710998192212e-06, "loss": 0.2415, "num_input_tokens_seen": 10164480, "step": 15090 }, { "epoch": 0.3687733613465908, "grad_norm": 17.404699325561523, "learning_rate": 1.4749596912102408e-06, "loss": 0.154, "num_input_tokens_seen": 10167360, "step": 15095 }, { "epoch": 0.3688955121784379, "grad_norm": 0.9550981521606445, "learning_rate": 1.4754482826012605e-06, "loss": 0.0262, "num_input_tokens_seen": 10170752, "step": 15100 }, { "epoch": 0.3690176630102851, "grad_norm": 16.822263717651367, "learning_rate": 1.4759368739922802e-06, "loss": 0.1336, "num_input_tokens_seen": 10174336, "step": 15105 }, { "epoch": 0.3691398138421323, "grad_norm": 2.805422782897949, "learning_rate": 1.4764254653832999e-06, "loss": 0.0717, "num_input_tokens_seen": 10177408, "step": 15110 }, { "epoch": 0.3692619646739794, "grad_norm": 1.0176135301589966, "learning_rate": 1.4769140567743196e-06, "loss": 0.2487, "num_input_tokens_seen": 10181440, "step": 15115 }, { "epoch": 0.3693841155058266, "grad_norm": 1.3316823244094849, "learning_rate": 1.4774026481653392e-06, "loss": 0.0709, "num_input_tokens_seen": 10184640, "step": 15120 }, { "epoch": 0.36950626633767375, "grad_norm": 20.177623748779297, "learning_rate": 1.477891239556359e-06, "loss": 0.049, "num_input_tokens_seen": 10187584, "step": 15125 }, { "epoch": 0.36962841716952094, "grad_norm": 19.99055290222168, "learning_rate": 1.4783798309473788e-06, "loss": 0.1035, "num_input_tokens_seen": 10190848, "step": 15130 }, { "epoch": 0.3697505680013681, "grad_norm": 19.489837646484375, "learning_rate": 1.4788684223383983e-06, "loss": 0.0623, "num_input_tokens_seen": 10194304, "step": 15135 }, { "epoch": 0.36987271883321526, "grad_norm": 20.77276039123535, "learning_rate": 1.479357013729418e-06, "loss": 0.1515, "num_input_tokens_seen": 10197696, "step": 15140 }, { "epoch": 0.3699948696650624, "grad_norm": 4.104226589202881, "learning_rate": 1.4798456051204379e-06, "loss": 0.2384, "num_input_tokens_seen": 10200576, "step": 15145 }, { "epoch": 0.3701170204969096, "grad_norm": 45.69715881347656, "learning_rate": 1.4803341965114573e-06, "loss": 0.0572, "num_input_tokens_seen": 10203840, "step": 15150 }, { "epoch": 0.3702391713287567, "grad_norm": 29.123130798339844, "learning_rate": 1.480822787902477e-06, "loss": 0.0828, "num_input_tokens_seen": 10207488, "step": 15155 }, { "epoch": 0.3703613221606039, "grad_norm": 35.56546401977539, "learning_rate": 1.481311379293497e-06, "loss": 0.1452, "num_input_tokens_seen": 10211008, "step": 15160 }, { "epoch": 0.3704834729924511, "grad_norm": 27.84103012084961, "learning_rate": 1.4817999706845166e-06, "loss": 0.17, "num_input_tokens_seen": 10214336, "step": 15165 }, { "epoch": 0.37060562382429824, "grad_norm": 0.9257592558860779, "learning_rate": 1.482288562075536e-06, "loss": 0.1446, "num_input_tokens_seen": 10217856, "step": 15170 }, { "epoch": 0.3707277746561454, "grad_norm": 18.228384017944336, "learning_rate": 1.482777153466556e-06, "loss": 0.0167, "num_input_tokens_seen": 10221696, "step": 15175 }, { "epoch": 0.37084992548799256, "grad_norm": 31.668554306030273, "learning_rate": 1.4832657448575756e-06, "loss": 0.2496, "num_input_tokens_seen": 10225088, "step": 15180 }, { "epoch": 0.37097207631983975, "grad_norm": 16.257932662963867, "learning_rate": 1.483754336248595e-06, "loss": 0.0886, "num_input_tokens_seen": 10228160, "step": 15185 }, { "epoch": 0.3710942271516869, "grad_norm": 20.400663375854492, "learning_rate": 1.484242927639615e-06, "loss": 0.0719, "num_input_tokens_seen": 10231744, "step": 15190 }, { "epoch": 0.3712163779835341, "grad_norm": 48.53895950317383, "learning_rate": 1.4847315190306347e-06, "loss": 0.2309, "num_input_tokens_seen": 10235520, "step": 15195 }, { "epoch": 0.3713385288153812, "grad_norm": 25.890565872192383, "learning_rate": 1.4852201104216544e-06, "loss": 0.1409, "num_input_tokens_seen": 10238848, "step": 15200 }, { "epoch": 0.3714606796472284, "grad_norm": 0.1722467839717865, "learning_rate": 1.485708701812674e-06, "loss": 0.0193, "num_input_tokens_seen": 10242304, "step": 15205 }, { "epoch": 0.3715828304790756, "grad_norm": 1.651286244392395, "learning_rate": 1.4861972932036937e-06, "loss": 0.1968, "num_input_tokens_seen": 10245440, "step": 15210 }, { "epoch": 0.3717049813109227, "grad_norm": 33.69007110595703, "learning_rate": 1.4866858845947134e-06, "loss": 0.1411, "num_input_tokens_seen": 10248768, "step": 15215 }, { "epoch": 0.3718271321427699, "grad_norm": 4.165786266326904, "learning_rate": 1.4871744759857329e-06, "loss": 0.0839, "num_input_tokens_seen": 10252160, "step": 15220 }, { "epoch": 0.37194928297461705, "grad_norm": 3.8775460720062256, "learning_rate": 1.4876630673767528e-06, "loss": 0.2086, "num_input_tokens_seen": 10255168, "step": 15225 }, { "epoch": 0.37207143380646424, "grad_norm": 19.101089477539062, "learning_rate": 1.4881516587677724e-06, "loss": 0.2026, "num_input_tokens_seen": 10259648, "step": 15230 }, { "epoch": 0.37219358463831137, "grad_norm": 6.220980167388916, "learning_rate": 1.4886402501587923e-06, "loss": 0.0053, "num_input_tokens_seen": 10263424, "step": 15235 }, { "epoch": 0.37231573547015856, "grad_norm": 3.341536283493042, "learning_rate": 1.4891288415498118e-06, "loss": 0.1144, "num_input_tokens_seen": 10266496, "step": 15240 }, { "epoch": 0.3724378863020057, "grad_norm": 0.19938251376152039, "learning_rate": 1.4896174329408315e-06, "loss": 0.0223, "num_input_tokens_seen": 10270144, "step": 15245 }, { "epoch": 0.3725600371338529, "grad_norm": 18.8046932220459, "learning_rate": 1.4901060243318514e-06, "loss": 0.2106, "num_input_tokens_seen": 10273984, "step": 15250 }, { "epoch": 0.3726821879657001, "grad_norm": 5.193131923675537, "learning_rate": 1.4905946157228709e-06, "loss": 0.1617, "num_input_tokens_seen": 10277056, "step": 15255 }, { "epoch": 0.3728043387975472, "grad_norm": 32.93628692626953, "learning_rate": 1.4910832071138905e-06, "loss": 0.0294, "num_input_tokens_seen": 10280384, "step": 15260 }, { "epoch": 0.3729264896293944, "grad_norm": 6.409894943237305, "learning_rate": 1.4915717985049104e-06, "loss": 0.118, "num_input_tokens_seen": 10283520, "step": 15265 }, { "epoch": 0.37304864046124153, "grad_norm": 0.4226660132408142, "learning_rate": 1.49206038989593e-06, "loss": 0.1248, "num_input_tokens_seen": 10286784, "step": 15270 }, { "epoch": 0.3731707912930887, "grad_norm": 31.093536376953125, "learning_rate": 1.4925489812869496e-06, "loss": 0.2098, "num_input_tokens_seen": 10290048, "step": 15275 }, { "epoch": 0.37329294212493586, "grad_norm": 7.129814147949219, "learning_rate": 1.4930375726779695e-06, "loss": 0.2542, "num_input_tokens_seen": 10293312, "step": 15280 }, { "epoch": 0.37341509295678305, "grad_norm": 24.767244338989258, "learning_rate": 1.4935261640689892e-06, "loss": 0.1726, "num_input_tokens_seen": 10296448, "step": 15285 }, { "epoch": 0.3735372437886302, "grad_norm": 20.03944206237793, "learning_rate": 1.4940147554600086e-06, "loss": 0.257, "num_input_tokens_seen": 10299520, "step": 15290 }, { "epoch": 0.37365939462047737, "grad_norm": 3.744540214538574, "learning_rate": 1.4945033468510285e-06, "loss": 0.0406, "num_input_tokens_seen": 10302976, "step": 15295 }, { "epoch": 0.3737815454523245, "grad_norm": 28.81475830078125, "learning_rate": 1.4949919382420482e-06, "loss": 0.1918, "num_input_tokens_seen": 10306432, "step": 15300 }, { "epoch": 0.3739036962841717, "grad_norm": 1.5936713218688965, "learning_rate": 1.4954805296330677e-06, "loss": 0.1485, "num_input_tokens_seen": 10310016, "step": 15305 }, { "epoch": 0.3740258471160189, "grad_norm": 15.979466438293457, "learning_rate": 1.4959691210240873e-06, "loss": 0.1358, "num_input_tokens_seen": 10313408, "step": 15310 }, { "epoch": 0.374147997947866, "grad_norm": 0.897377073764801, "learning_rate": 1.4964577124151072e-06, "loss": 0.0169, "num_input_tokens_seen": 10316992, "step": 15315 }, { "epoch": 0.3742701487797132, "grad_norm": 14.27642822265625, "learning_rate": 1.496946303806127e-06, "loss": 0.1957, "num_input_tokens_seen": 10320384, "step": 15320 }, { "epoch": 0.37439229961156034, "grad_norm": 21.05185890197754, "learning_rate": 1.4974348951971464e-06, "loss": 0.1696, "num_input_tokens_seen": 10323712, "step": 15325 }, { "epoch": 0.37451445044340753, "grad_norm": 11.088553428649902, "learning_rate": 1.4979234865881663e-06, "loss": 0.1251, "num_input_tokens_seen": 10326848, "step": 15330 }, { "epoch": 0.37463660127525467, "grad_norm": 61.10281753540039, "learning_rate": 1.498412077979186e-06, "loss": 0.1813, "num_input_tokens_seen": 10330112, "step": 15335 }, { "epoch": 0.37475875210710186, "grad_norm": 14.84995174407959, "learning_rate": 1.4989006693702054e-06, "loss": 0.0824, "num_input_tokens_seen": 10333696, "step": 15340 }, { "epoch": 0.374880902938949, "grad_norm": 0.3659310042858124, "learning_rate": 1.4993892607612253e-06, "loss": 0.0672, "num_input_tokens_seen": 10337024, "step": 15345 }, { "epoch": 0.3750030537707962, "grad_norm": 20.780288696289062, "learning_rate": 1.499877852152245e-06, "loss": 0.066, "num_input_tokens_seen": 10340480, "step": 15350 }, { "epoch": 0.37512520460264337, "grad_norm": 27.659393310546875, "learning_rate": 1.500366443543265e-06, "loss": 0.2135, "num_input_tokens_seen": 10343872, "step": 15355 }, { "epoch": 0.3752473554344905, "grad_norm": 0.8734622597694397, "learning_rate": 1.5008550349342844e-06, "loss": 0.0905, "num_input_tokens_seen": 10347200, "step": 15360 }, { "epoch": 0.3753695062663377, "grad_norm": 14.381644248962402, "learning_rate": 1.501343626325304e-06, "loss": 0.1545, "num_input_tokens_seen": 10350464, "step": 15365 }, { "epoch": 0.37549165709818483, "grad_norm": 7.951765060424805, "learning_rate": 1.501832217716324e-06, "loss": 0.1257, "num_input_tokens_seen": 10353728, "step": 15370 }, { "epoch": 0.375613807930032, "grad_norm": 18.489259719848633, "learning_rate": 1.5023208091073434e-06, "loss": 0.0401, "num_input_tokens_seen": 10356672, "step": 15375 }, { "epoch": 0.37573595876187915, "grad_norm": 20.716266632080078, "learning_rate": 1.502809400498363e-06, "loss": 0.1308, "num_input_tokens_seen": 10359808, "step": 15380 }, { "epoch": 0.37585810959372634, "grad_norm": 2.1580662727355957, "learning_rate": 1.503297991889383e-06, "loss": 0.116, "num_input_tokens_seen": 10363136, "step": 15385 }, { "epoch": 0.3759802604255735, "grad_norm": 50.68510055541992, "learning_rate": 1.5037865832804025e-06, "loss": 0.1752, "num_input_tokens_seen": 10366528, "step": 15390 }, { "epoch": 0.37610241125742067, "grad_norm": 52.328433990478516, "learning_rate": 1.5042751746714221e-06, "loss": 0.287, "num_input_tokens_seen": 10370240, "step": 15395 }, { "epoch": 0.37622456208926786, "grad_norm": 12.517891883850098, "learning_rate": 1.5047637660624418e-06, "loss": 0.1475, "num_input_tokens_seen": 10373888, "step": 15400 }, { "epoch": 0.376346712921115, "grad_norm": 15.019147872924805, "learning_rate": 1.5052523574534617e-06, "loss": 0.1297, "num_input_tokens_seen": 10377472, "step": 15405 }, { "epoch": 0.3764688637529622, "grad_norm": 23.52758026123047, "learning_rate": 1.5057409488444812e-06, "loss": 0.1229, "num_input_tokens_seen": 10380672, "step": 15410 }, { "epoch": 0.3765910145848093, "grad_norm": 9.015829086303711, "learning_rate": 1.5062295402355009e-06, "loss": 0.1197, "num_input_tokens_seen": 10384128, "step": 15415 }, { "epoch": 0.3767131654166565, "grad_norm": 11.400869369506836, "learning_rate": 1.5067181316265208e-06, "loss": 0.142, "num_input_tokens_seen": 10386816, "step": 15420 }, { "epoch": 0.37683531624850364, "grad_norm": 3.6362061500549316, "learning_rate": 1.5072067230175402e-06, "loss": 0.0599, "num_input_tokens_seen": 10390336, "step": 15425 }, { "epoch": 0.3769574670803508, "grad_norm": 20.962465286254883, "learning_rate": 1.50769531440856e-06, "loss": 0.1482, "num_input_tokens_seen": 10394112, "step": 15430 }, { "epoch": 0.37707961791219796, "grad_norm": 15.88277530670166, "learning_rate": 1.5081839057995798e-06, "loss": 0.1473, "num_input_tokens_seen": 10398016, "step": 15435 }, { "epoch": 0.37720176874404515, "grad_norm": 0.9228224158287048, "learning_rate": 1.5086724971905995e-06, "loss": 0.099, "num_input_tokens_seen": 10401280, "step": 15440 }, { "epoch": 0.3773239195758923, "grad_norm": 1.427764654159546, "learning_rate": 1.509161088581619e-06, "loss": 0.1114, "num_input_tokens_seen": 10404736, "step": 15445 }, { "epoch": 0.3774460704077395, "grad_norm": 8.329426765441895, "learning_rate": 1.5096496799726388e-06, "loss": 0.1868, "num_input_tokens_seen": 10408064, "step": 15450 }, { "epoch": 0.37756822123958667, "grad_norm": 2.8574776649475098, "learning_rate": 1.5101382713636585e-06, "loss": 0.171, "num_input_tokens_seen": 10411456, "step": 15455 }, { "epoch": 0.3776903720714338, "grad_norm": 0.10268853604793549, "learning_rate": 1.510626862754678e-06, "loss": 0.048, "num_input_tokens_seen": 10414592, "step": 15460 }, { "epoch": 0.377812522903281, "grad_norm": 33.580810546875, "learning_rate": 1.5111154541456979e-06, "loss": 0.1467, "num_input_tokens_seen": 10417984, "step": 15465 }, { "epoch": 0.3779346737351281, "grad_norm": 6.88870906829834, "learning_rate": 1.5116040455367176e-06, "loss": 0.0913, "num_input_tokens_seen": 10421568, "step": 15470 }, { "epoch": 0.3780568245669753, "grad_norm": 2.1941027641296387, "learning_rate": 1.5120926369277375e-06, "loss": 0.0812, "num_input_tokens_seen": 10424896, "step": 15475 }, { "epoch": 0.37817897539882245, "grad_norm": 54.10744857788086, "learning_rate": 1.512581228318757e-06, "loss": 0.2021, "num_input_tokens_seen": 10428544, "step": 15480 }, { "epoch": 0.37830112623066964, "grad_norm": 28.76828384399414, "learning_rate": 1.5130698197097766e-06, "loss": 0.1401, "num_input_tokens_seen": 10431744, "step": 15485 }, { "epoch": 0.37842327706251677, "grad_norm": 0.7208065986633301, "learning_rate": 1.5135584111007965e-06, "loss": 0.0534, "num_input_tokens_seen": 10435712, "step": 15490 }, { "epoch": 0.37854542789436396, "grad_norm": 1.000946283340454, "learning_rate": 1.514047002491816e-06, "loss": 0.1069, "num_input_tokens_seen": 10439040, "step": 15495 }, { "epoch": 0.37866757872621115, "grad_norm": 0.2939513325691223, "learning_rate": 1.5145355938828357e-06, "loss": 0.0527, "num_input_tokens_seen": 10442304, "step": 15500 }, { "epoch": 0.3787897295580583, "grad_norm": 0.05420077219605446, "learning_rate": 1.5150241852738553e-06, "loss": 0.0106, "num_input_tokens_seen": 10445696, "step": 15505 }, { "epoch": 0.3789118803899055, "grad_norm": 51.432655334472656, "learning_rate": 1.5155127766648752e-06, "loss": 0.2094, "num_input_tokens_seen": 10448896, "step": 15510 }, { "epoch": 0.3790340312217526, "grad_norm": 32.62167739868164, "learning_rate": 1.5160013680558947e-06, "loss": 0.0971, "num_input_tokens_seen": 10452352, "step": 15515 }, { "epoch": 0.3791561820535998, "grad_norm": 22.451175689697266, "learning_rate": 1.5164899594469144e-06, "loss": 0.2383, "num_input_tokens_seen": 10456000, "step": 15520 }, { "epoch": 0.37927833288544693, "grad_norm": 0.42879417538642883, "learning_rate": 1.5169785508379343e-06, "loss": 0.1404, "num_input_tokens_seen": 10459968, "step": 15525 }, { "epoch": 0.3794004837172941, "grad_norm": 25.073436737060547, "learning_rate": 1.5174671422289538e-06, "loss": 0.1199, "num_input_tokens_seen": 10463488, "step": 15530 }, { "epoch": 0.37952263454914126, "grad_norm": 24.64297103881836, "learning_rate": 1.5179557336199734e-06, "loss": 0.0706, "num_input_tokens_seen": 10467072, "step": 15535 }, { "epoch": 0.37964478538098845, "grad_norm": 15.014025688171387, "learning_rate": 1.5184443250109933e-06, "loss": 0.1237, "num_input_tokens_seen": 10470272, "step": 15540 }, { "epoch": 0.3797669362128356, "grad_norm": 1.430694341659546, "learning_rate": 1.5189329164020128e-06, "loss": 0.1375, "num_input_tokens_seen": 10473920, "step": 15545 }, { "epoch": 0.37988908704468277, "grad_norm": 33.992801666259766, "learning_rate": 1.5194215077930325e-06, "loss": 0.0761, "num_input_tokens_seen": 10477312, "step": 15550 }, { "epoch": 0.38001123787652996, "grad_norm": 48.1543083190918, "learning_rate": 1.5199100991840524e-06, "loss": 0.0856, "num_input_tokens_seen": 10480576, "step": 15555 }, { "epoch": 0.3801333887083771, "grad_norm": 2.4673454761505127, "learning_rate": 1.520398690575072e-06, "loss": 0.0892, "num_input_tokens_seen": 10483776, "step": 15560 }, { "epoch": 0.3802555395402243, "grad_norm": 77.2197036743164, "learning_rate": 1.5208872819660915e-06, "loss": 0.1898, "num_input_tokens_seen": 10487040, "step": 15565 }, { "epoch": 0.3803776903720714, "grad_norm": 11.173587799072266, "learning_rate": 1.5213758733571114e-06, "loss": 0.1709, "num_input_tokens_seen": 10490752, "step": 15570 }, { "epoch": 0.3804998412039186, "grad_norm": 25.323055267333984, "learning_rate": 1.521864464748131e-06, "loss": 0.1493, "num_input_tokens_seen": 10493888, "step": 15575 }, { "epoch": 0.38062199203576574, "grad_norm": 13.963493347167969, "learning_rate": 1.5223530561391506e-06, "loss": 0.1155, "num_input_tokens_seen": 10497728, "step": 15580 }, { "epoch": 0.38074414286761293, "grad_norm": 47.22227478027344, "learning_rate": 1.5228416475301705e-06, "loss": 0.1956, "num_input_tokens_seen": 10501248, "step": 15585 }, { "epoch": 0.38086629369946007, "grad_norm": 1.2307871580123901, "learning_rate": 1.5233302389211901e-06, "loss": 0.1305, "num_input_tokens_seen": 10504576, "step": 15590 }, { "epoch": 0.38098844453130726, "grad_norm": 14.952744483947754, "learning_rate": 1.5238188303122098e-06, "loss": 0.0638, "num_input_tokens_seen": 10507712, "step": 15595 }, { "epoch": 0.38111059536315445, "grad_norm": 23.147411346435547, "learning_rate": 1.5243074217032295e-06, "loss": 0.2178, "num_input_tokens_seen": 10511552, "step": 15600 }, { "epoch": 0.3812327461950016, "grad_norm": 23.532026290893555, "learning_rate": 1.5247960130942492e-06, "loss": 0.1734, "num_input_tokens_seen": 10514432, "step": 15605 }, { "epoch": 0.38135489702684877, "grad_norm": 8.446444511413574, "learning_rate": 1.5252846044852689e-06, "loss": 0.1081, "num_input_tokens_seen": 10517632, "step": 15610 }, { "epoch": 0.3814770478586959, "grad_norm": 57.32693862915039, "learning_rate": 1.5257731958762885e-06, "loss": 0.2098, "num_input_tokens_seen": 10521024, "step": 15615 }, { "epoch": 0.3815991986905431, "grad_norm": 21.209300994873047, "learning_rate": 1.5262617872673082e-06, "loss": 0.1375, "num_input_tokens_seen": 10524608, "step": 15620 }, { "epoch": 0.38172134952239023, "grad_norm": 27.126405715942383, "learning_rate": 1.526750378658328e-06, "loss": 0.0546, "num_input_tokens_seen": 10528256, "step": 15625 }, { "epoch": 0.3818435003542374, "grad_norm": 26.83810806274414, "learning_rate": 1.5272389700493478e-06, "loss": 0.1676, "num_input_tokens_seen": 10531200, "step": 15630 }, { "epoch": 0.38196565118608455, "grad_norm": 9.054481506347656, "learning_rate": 1.5277275614403673e-06, "loss": 0.159, "num_input_tokens_seen": 10534656, "step": 15635 }, { "epoch": 0.38208780201793174, "grad_norm": 27.62286949157715, "learning_rate": 1.528216152831387e-06, "loss": 0.079, "num_input_tokens_seen": 10537920, "step": 15640 }, { "epoch": 0.38220995284977893, "grad_norm": 18.78369140625, "learning_rate": 1.5287047442224068e-06, "loss": 0.1409, "num_input_tokens_seen": 10541504, "step": 15645 }, { "epoch": 0.38233210368162607, "grad_norm": 3.5283448696136475, "learning_rate": 1.5291933356134263e-06, "loss": 0.1738, "num_input_tokens_seen": 10544384, "step": 15650 }, { "epoch": 0.38245425451347326, "grad_norm": 21.821565628051758, "learning_rate": 1.529681927004446e-06, "loss": 0.1478, "num_input_tokens_seen": 10547712, "step": 15655 }, { "epoch": 0.3825764053453204, "grad_norm": 33.52692794799805, "learning_rate": 1.5301705183954659e-06, "loss": 0.2437, "num_input_tokens_seen": 10550784, "step": 15660 }, { "epoch": 0.3826985561771676, "grad_norm": 5.589592456817627, "learning_rate": 1.5306591097864856e-06, "loss": 0.0959, "num_input_tokens_seen": 10554304, "step": 15665 }, { "epoch": 0.3828207070090147, "grad_norm": 27.160764694213867, "learning_rate": 1.531147701177505e-06, "loss": 0.1287, "num_input_tokens_seen": 10557760, "step": 15670 }, { "epoch": 0.3829428578408619, "grad_norm": 15.26628589630127, "learning_rate": 1.531636292568525e-06, "loss": 0.1185, "num_input_tokens_seen": 10561152, "step": 15675 }, { "epoch": 0.38306500867270904, "grad_norm": 12.412528038024902, "learning_rate": 1.5321248839595446e-06, "loss": 0.1117, "num_input_tokens_seen": 10564544, "step": 15680 }, { "epoch": 0.38318715950455623, "grad_norm": 1.3267569541931152, "learning_rate": 1.532613475350564e-06, "loss": 0.1692, "num_input_tokens_seen": 10567552, "step": 15685 }, { "epoch": 0.38330931033640336, "grad_norm": 3.8919661045074463, "learning_rate": 1.533102066741584e-06, "loss": 0.1682, "num_input_tokens_seen": 10570624, "step": 15690 }, { "epoch": 0.38343146116825055, "grad_norm": 0.884100615978241, "learning_rate": 1.5335906581326037e-06, "loss": 0.0318, "num_input_tokens_seen": 10573952, "step": 15695 }, { "epoch": 0.38355361200009774, "grad_norm": 18.8651065826416, "learning_rate": 1.5340792495236231e-06, "loss": 0.0905, "num_input_tokens_seen": 10577344, "step": 15700 }, { "epoch": 0.3836757628319449, "grad_norm": 22.760498046875, "learning_rate": 1.534567840914643e-06, "loss": 0.0864, "num_input_tokens_seen": 10580480, "step": 15705 }, { "epoch": 0.38379791366379207, "grad_norm": 30.443721771240234, "learning_rate": 1.5350564323056627e-06, "loss": 0.2148, "num_input_tokens_seen": 10584064, "step": 15710 }, { "epoch": 0.3839200644956392, "grad_norm": 5.712250709533691, "learning_rate": 1.5355450236966824e-06, "loss": 0.1423, "num_input_tokens_seen": 10587776, "step": 15715 }, { "epoch": 0.3840422153274864, "grad_norm": 24.226604461669922, "learning_rate": 1.536033615087702e-06, "loss": 0.1419, "num_input_tokens_seen": 10591296, "step": 15720 }, { "epoch": 0.3841643661593335, "grad_norm": 26.179553985595703, "learning_rate": 1.5365222064787217e-06, "loss": 0.1796, "num_input_tokens_seen": 10594368, "step": 15725 }, { "epoch": 0.3842865169911807, "grad_norm": 11.114547729492188, "learning_rate": 1.5370107978697414e-06, "loss": 0.1257, "num_input_tokens_seen": 10597376, "step": 15730 }, { "epoch": 0.38440866782302785, "grad_norm": 2.732433795928955, "learning_rate": 1.5374993892607611e-06, "loss": 0.1822, "num_input_tokens_seen": 10600704, "step": 15735 }, { "epoch": 0.38453081865487504, "grad_norm": 30.16254425048828, "learning_rate": 1.5379879806517808e-06, "loss": 0.2058, "num_input_tokens_seen": 10604352, "step": 15740 }, { "epoch": 0.38465296948672223, "grad_norm": 12.766703605651855, "learning_rate": 1.5384765720428005e-06, "loss": 0.0936, "num_input_tokens_seen": 10607680, "step": 15745 }, { "epoch": 0.38477512031856936, "grad_norm": 18.774023056030273, "learning_rate": 1.5389651634338204e-06, "loss": 0.1499, "num_input_tokens_seen": 10611072, "step": 15750 }, { "epoch": 0.38489727115041655, "grad_norm": 16.16313934326172, "learning_rate": 1.5394537548248398e-06, "loss": 0.1205, "num_input_tokens_seen": 10614272, "step": 15755 }, { "epoch": 0.3850194219822637, "grad_norm": 11.570549964904785, "learning_rate": 1.5399423462158595e-06, "loss": 0.0851, "num_input_tokens_seen": 10617088, "step": 15760 }, { "epoch": 0.3851415728141109, "grad_norm": 18.607933044433594, "learning_rate": 1.5404309376068794e-06, "loss": 0.1307, "num_input_tokens_seen": 10619904, "step": 15765 }, { "epoch": 0.385263723645958, "grad_norm": 18.242626190185547, "learning_rate": 1.5409195289978989e-06, "loss": 0.1781, "num_input_tokens_seen": 10623488, "step": 15770 }, { "epoch": 0.3853858744778052, "grad_norm": 7.264750003814697, "learning_rate": 1.5414081203889186e-06, "loss": 0.1018, "num_input_tokens_seen": 10626880, "step": 15775 }, { "epoch": 0.38550802530965234, "grad_norm": 35.27031326293945, "learning_rate": 1.5418967117799385e-06, "loss": 0.1506, "num_input_tokens_seen": 10630208, "step": 15780 }, { "epoch": 0.3856301761414995, "grad_norm": 2.565133571624756, "learning_rate": 1.5423853031709581e-06, "loss": 0.043, "num_input_tokens_seen": 10633216, "step": 15785 }, { "epoch": 0.3857523269733467, "grad_norm": 14.965524673461914, "learning_rate": 1.5428738945619776e-06, "loss": 0.1396, "num_input_tokens_seen": 10636672, "step": 15790 }, { "epoch": 0.38587447780519385, "grad_norm": 14.533952713012695, "learning_rate": 1.5433624859529975e-06, "loss": 0.1323, "num_input_tokens_seen": 10639936, "step": 15795 }, { "epoch": 0.38599662863704104, "grad_norm": 11.508635520935059, "learning_rate": 1.5438510773440172e-06, "loss": 0.1178, "num_input_tokens_seen": 10643136, "step": 15800 }, { "epoch": 0.3861187794688882, "grad_norm": 19.610515594482422, "learning_rate": 1.5443396687350366e-06, "loss": 0.086, "num_input_tokens_seen": 10646528, "step": 15805 }, { "epoch": 0.38624093030073536, "grad_norm": 1.9880129098892212, "learning_rate": 1.5448282601260565e-06, "loss": 0.0228, "num_input_tokens_seen": 10649984, "step": 15810 }, { "epoch": 0.3863630811325825, "grad_norm": 17.143823623657227, "learning_rate": 1.5453168515170762e-06, "loss": 0.0876, "num_input_tokens_seen": 10653120, "step": 15815 }, { "epoch": 0.3864852319644297, "grad_norm": 21.041353225708008, "learning_rate": 1.5458054429080957e-06, "loss": 0.1813, "num_input_tokens_seen": 10656256, "step": 15820 }, { "epoch": 0.3866073827962768, "grad_norm": 8.247550010681152, "learning_rate": 1.5462940342991156e-06, "loss": 0.1512, "num_input_tokens_seen": 10659328, "step": 15825 }, { "epoch": 0.386729533628124, "grad_norm": 4.186298370361328, "learning_rate": 1.5467826256901353e-06, "loss": 0.0881, "num_input_tokens_seen": 10662848, "step": 15830 }, { "epoch": 0.38685168445997115, "grad_norm": 7.553508281707764, "learning_rate": 1.547271217081155e-06, "loss": 0.3, "num_input_tokens_seen": 10666112, "step": 15835 }, { "epoch": 0.38697383529181834, "grad_norm": 2.3813862800598145, "learning_rate": 1.5477598084721746e-06, "loss": 0.1846, "num_input_tokens_seen": 10669184, "step": 15840 }, { "epoch": 0.3870959861236655, "grad_norm": 5.976868152618408, "learning_rate": 1.5482483998631943e-06, "loss": 0.0803, "num_input_tokens_seen": 10672512, "step": 15845 }, { "epoch": 0.38721813695551266, "grad_norm": 9.132421493530273, "learning_rate": 1.548736991254214e-06, "loss": 0.1346, "num_input_tokens_seen": 10675392, "step": 15850 }, { "epoch": 0.38734028778735985, "grad_norm": 16.281137466430664, "learning_rate": 1.5492255826452337e-06, "loss": 0.1351, "num_input_tokens_seen": 10678720, "step": 15855 }, { "epoch": 0.387462438619207, "grad_norm": 14.683212280273438, "learning_rate": 1.5497141740362534e-06, "loss": 0.0553, "num_input_tokens_seen": 10682176, "step": 15860 }, { "epoch": 0.3875845894510542, "grad_norm": 20.908401489257812, "learning_rate": 1.550202765427273e-06, "loss": 0.0425, "num_input_tokens_seen": 10685696, "step": 15865 }, { "epoch": 0.3877067402829013, "grad_norm": 4.737698554992676, "learning_rate": 1.550691356818293e-06, "loss": 0.0721, "num_input_tokens_seen": 10689216, "step": 15870 }, { "epoch": 0.3878288911147485, "grad_norm": 10.016496658325195, "learning_rate": 1.5511799482093124e-06, "loss": 0.1571, "num_input_tokens_seen": 10692288, "step": 15875 }, { "epoch": 0.38795104194659563, "grad_norm": 10.282059669494629, "learning_rate": 1.551668539600332e-06, "loss": 0.0994, "num_input_tokens_seen": 10695680, "step": 15880 }, { "epoch": 0.3880731927784428, "grad_norm": 21.099702835083008, "learning_rate": 1.552157130991352e-06, "loss": 0.2294, "num_input_tokens_seen": 10698624, "step": 15885 }, { "epoch": 0.38819534361029, "grad_norm": 11.00361442565918, "learning_rate": 1.5526457223823714e-06, "loss": 0.2457, "num_input_tokens_seen": 10702208, "step": 15890 }, { "epoch": 0.38831749444213715, "grad_norm": 1.1076817512512207, "learning_rate": 1.5531343137733911e-06, "loss": 0.0488, "num_input_tokens_seen": 10705216, "step": 15895 }, { "epoch": 0.38843964527398434, "grad_norm": 10.160321235656738, "learning_rate": 1.553622905164411e-06, "loss": 0.081, "num_input_tokens_seen": 10708160, "step": 15900 }, { "epoch": 0.38856179610583147, "grad_norm": 0.8632951378822327, "learning_rate": 1.5541114965554307e-06, "loss": 0.1093, "num_input_tokens_seen": 10712384, "step": 15905 }, { "epoch": 0.38868394693767866, "grad_norm": 34.629905700683594, "learning_rate": 1.5546000879464502e-06, "loss": 0.1033, "num_input_tokens_seen": 10716096, "step": 15910 }, { "epoch": 0.3888060977695258, "grad_norm": 17.712677001953125, "learning_rate": 1.55508867933747e-06, "loss": 0.1871, "num_input_tokens_seen": 10719232, "step": 15915 }, { "epoch": 0.388928248601373, "grad_norm": 0.24668991565704346, "learning_rate": 1.5555772707284897e-06, "loss": 0.0706, "num_input_tokens_seen": 10722560, "step": 15920 }, { "epoch": 0.3890503994332201, "grad_norm": 17.20937156677246, "learning_rate": 1.5560658621195092e-06, "loss": 0.0899, "num_input_tokens_seen": 10725504, "step": 15925 }, { "epoch": 0.3891725502650673, "grad_norm": 6.1287994384765625, "learning_rate": 1.556554453510529e-06, "loss": 0.2236, "num_input_tokens_seen": 10728704, "step": 15930 }, { "epoch": 0.3892947010969145, "grad_norm": 29.251150131225586, "learning_rate": 1.5570430449015488e-06, "loss": 0.148, "num_input_tokens_seen": 10732096, "step": 15935 }, { "epoch": 0.38941685192876163, "grad_norm": 0.6747702956199646, "learning_rate": 1.5575316362925685e-06, "loss": 0.0168, "num_input_tokens_seen": 10735360, "step": 15940 }, { "epoch": 0.3895390027606088, "grad_norm": 44.67377853393555, "learning_rate": 1.5580202276835881e-06, "loss": 0.2019, "num_input_tokens_seen": 10738688, "step": 15945 }, { "epoch": 0.38966115359245596, "grad_norm": 47.97969436645508, "learning_rate": 1.5585088190746078e-06, "loss": 0.0239, "num_input_tokens_seen": 10741632, "step": 15950 }, { "epoch": 0.38978330442430315, "grad_norm": 29.029560089111328, "learning_rate": 1.5589974104656275e-06, "loss": 0.1504, "num_input_tokens_seen": 10745472, "step": 15955 }, { "epoch": 0.3899054552561503, "grad_norm": 11.867027282714844, "learning_rate": 1.5594860018566472e-06, "loss": 0.1547, "num_input_tokens_seen": 10748864, "step": 15960 }, { "epoch": 0.39002760608799747, "grad_norm": 16.851787567138672, "learning_rate": 1.5599745932476669e-06, "loss": 0.1754, "num_input_tokens_seen": 10752128, "step": 15965 }, { "epoch": 0.3901497569198446, "grad_norm": 2.712797164916992, "learning_rate": 1.5604631846386866e-06, "loss": 0.1719, "num_input_tokens_seen": 10755840, "step": 15970 }, { "epoch": 0.3902719077516918, "grad_norm": 20.49784278869629, "learning_rate": 1.5609517760297062e-06, "loss": 0.1273, "num_input_tokens_seen": 10759040, "step": 15975 }, { "epoch": 0.3903940585835389, "grad_norm": 3.896900177001953, "learning_rate": 1.561440367420726e-06, "loss": 0.1223, "num_input_tokens_seen": 10762176, "step": 15980 }, { "epoch": 0.3905162094153861, "grad_norm": 17.352317810058594, "learning_rate": 1.5619289588117456e-06, "loss": 0.1119, "num_input_tokens_seen": 10765696, "step": 15985 }, { "epoch": 0.3906383602472333, "grad_norm": 1.630355954170227, "learning_rate": 1.5624175502027655e-06, "loss": 0.1843, "num_input_tokens_seen": 10769024, "step": 15990 }, { "epoch": 0.39076051107908044, "grad_norm": 4.977283000946045, "learning_rate": 1.562906141593785e-06, "loss": 0.1047, "num_input_tokens_seen": 10772800, "step": 15995 }, { "epoch": 0.39088266191092763, "grad_norm": 5.900393009185791, "learning_rate": 1.5633947329848046e-06, "loss": 0.0575, "num_input_tokens_seen": 10776704, "step": 16000 }, { "epoch": 0.39100481274277477, "grad_norm": 21.47952651977539, "learning_rate": 1.5638833243758245e-06, "loss": 0.071, "num_input_tokens_seen": 10780416, "step": 16005 }, { "epoch": 0.39112696357462196, "grad_norm": 16.857471466064453, "learning_rate": 1.564371915766844e-06, "loss": 0.1896, "num_input_tokens_seen": 10783552, "step": 16010 }, { "epoch": 0.3912491144064691, "grad_norm": 12.78052806854248, "learning_rate": 1.5648605071578637e-06, "loss": 0.1082, "num_input_tokens_seen": 10786432, "step": 16015 }, { "epoch": 0.3913712652383163, "grad_norm": 18.435972213745117, "learning_rate": 1.5653490985488836e-06, "loss": 0.1499, "num_input_tokens_seen": 10789504, "step": 16020 }, { "epoch": 0.3914934160701634, "grad_norm": 14.367976188659668, "learning_rate": 1.5658376899399033e-06, "loss": 0.1097, "num_input_tokens_seen": 10792768, "step": 16025 }, { "epoch": 0.3916155669020106, "grad_norm": 35.80486297607422, "learning_rate": 1.5663262813309227e-06, "loss": 0.0895, "num_input_tokens_seen": 10796288, "step": 16030 }, { "epoch": 0.3917377177338578, "grad_norm": 52.145294189453125, "learning_rate": 1.5668148727219426e-06, "loss": 0.2037, "num_input_tokens_seen": 10800064, "step": 16035 }, { "epoch": 0.3918598685657049, "grad_norm": 50.63142395019531, "learning_rate": 1.5673034641129623e-06, "loss": 0.155, "num_input_tokens_seen": 10803456, "step": 16040 }, { "epoch": 0.3919820193975521, "grad_norm": 3.279127836227417, "learning_rate": 1.5677920555039818e-06, "loss": 0.1418, "num_input_tokens_seen": 10806784, "step": 16045 }, { "epoch": 0.39210417022939925, "grad_norm": 39.02543258666992, "learning_rate": 1.5682806468950017e-06, "loss": 0.1995, "num_input_tokens_seen": 10810496, "step": 16050 }, { "epoch": 0.39222632106124644, "grad_norm": 13.758199691772461, "learning_rate": 1.5687692382860213e-06, "loss": 0.0676, "num_input_tokens_seen": 10813760, "step": 16055 }, { "epoch": 0.3923484718930936, "grad_norm": 6.683971881866455, "learning_rate": 1.569257829677041e-06, "loss": 0.111, "num_input_tokens_seen": 10817088, "step": 16060 }, { "epoch": 0.39247062272494077, "grad_norm": 15.666305541992188, "learning_rate": 1.5697464210680607e-06, "loss": 0.1503, "num_input_tokens_seen": 10820480, "step": 16065 }, { "epoch": 0.3925927735567879, "grad_norm": 10.361749649047852, "learning_rate": 1.5702350124590804e-06, "loss": 0.2075, "num_input_tokens_seen": 10823680, "step": 16070 }, { "epoch": 0.3927149243886351, "grad_norm": 42.955177307128906, "learning_rate": 1.5707236038501e-06, "loss": 0.1805, "num_input_tokens_seen": 10827456, "step": 16075 }, { "epoch": 0.3928370752204823, "grad_norm": 16.565195083618164, "learning_rate": 1.5712121952411198e-06, "loss": 0.1784, "num_input_tokens_seen": 10830784, "step": 16080 }, { "epoch": 0.3929592260523294, "grad_norm": 15.590194702148438, "learning_rate": 1.5717007866321394e-06, "loss": 0.1074, "num_input_tokens_seen": 10834112, "step": 16085 }, { "epoch": 0.3930813768841766, "grad_norm": 1.7364860773086548, "learning_rate": 1.5721893780231591e-06, "loss": 0.1116, "num_input_tokens_seen": 10837312, "step": 16090 }, { "epoch": 0.39320352771602374, "grad_norm": 13.02885627746582, "learning_rate": 1.5726779694141788e-06, "loss": 0.1407, "num_input_tokens_seen": 10840448, "step": 16095 }, { "epoch": 0.3933256785478709, "grad_norm": 12.833983421325684, "learning_rate": 1.5731665608051985e-06, "loss": 0.0713, "num_input_tokens_seen": 10843648, "step": 16100 }, { "epoch": 0.39344782937971806, "grad_norm": 14.190690040588379, "learning_rate": 1.5736551521962182e-06, "loss": 0.0996, "num_input_tokens_seen": 10847168, "step": 16105 }, { "epoch": 0.39356998021156525, "grad_norm": 36.03334045410156, "learning_rate": 1.574143743587238e-06, "loss": 0.1066, "num_input_tokens_seen": 10850240, "step": 16110 }, { "epoch": 0.3936921310434124, "grad_norm": 17.66339111328125, "learning_rate": 1.5746323349782575e-06, "loss": 0.07, "num_input_tokens_seen": 10853632, "step": 16115 }, { "epoch": 0.3938142818752596, "grad_norm": 16.764387130737305, "learning_rate": 1.5751209263692772e-06, "loss": 0.2055, "num_input_tokens_seen": 10857472, "step": 16120 }, { "epoch": 0.3939364327071067, "grad_norm": 24.138320922851562, "learning_rate": 1.575609517760297e-06, "loss": 0.2077, "num_input_tokens_seen": 10860864, "step": 16125 }, { "epoch": 0.3940585835389539, "grad_norm": 1.7676178216934204, "learning_rate": 1.5760981091513166e-06, "loss": 0.0593, "num_input_tokens_seen": 10864256, "step": 16130 }, { "epoch": 0.3941807343708011, "grad_norm": 15.465826034545898, "learning_rate": 1.5765867005423362e-06, "loss": 0.075, "num_input_tokens_seen": 10867328, "step": 16135 }, { "epoch": 0.3943028852026482, "grad_norm": 31.692584991455078, "learning_rate": 1.5770752919333561e-06, "loss": 0.1361, "num_input_tokens_seen": 10870720, "step": 16140 }, { "epoch": 0.3944250360344954, "grad_norm": 5.40956449508667, "learning_rate": 1.5775638833243758e-06, "loss": 0.1391, "num_input_tokens_seen": 10874176, "step": 16145 }, { "epoch": 0.39454718686634255, "grad_norm": 12.23112678527832, "learning_rate": 1.5780524747153953e-06, "loss": 0.1064, "num_input_tokens_seen": 10877312, "step": 16150 }, { "epoch": 0.39466933769818974, "grad_norm": 11.061859130859375, "learning_rate": 1.5785410661064152e-06, "loss": 0.0347, "num_input_tokens_seen": 10880384, "step": 16155 }, { "epoch": 0.39479148853003687, "grad_norm": 29.293039321899414, "learning_rate": 1.5790296574974349e-06, "loss": 0.1328, "num_input_tokens_seen": 10883648, "step": 16160 }, { "epoch": 0.39491363936188406, "grad_norm": 16.09516143798828, "learning_rate": 1.5795182488884543e-06, "loss": 0.043, "num_input_tokens_seen": 10887104, "step": 16165 }, { "epoch": 0.3950357901937312, "grad_norm": 17.577499389648438, "learning_rate": 1.5800068402794742e-06, "loss": 0.135, "num_input_tokens_seen": 10890560, "step": 16170 }, { "epoch": 0.3951579410255784, "grad_norm": 45.8845100402832, "learning_rate": 1.580495431670494e-06, "loss": 0.1755, "num_input_tokens_seen": 10893696, "step": 16175 }, { "epoch": 0.3952800918574256, "grad_norm": 26.468963623046875, "learning_rate": 1.5809840230615136e-06, "loss": 0.0819, "num_input_tokens_seen": 10896768, "step": 16180 }, { "epoch": 0.3954022426892727, "grad_norm": 1.9360413551330566, "learning_rate": 1.5814726144525333e-06, "loss": 0.0533, "num_input_tokens_seen": 10899840, "step": 16185 }, { "epoch": 0.3955243935211199, "grad_norm": 1.5298985242843628, "learning_rate": 1.581961205843553e-06, "loss": 0.1159, "num_input_tokens_seen": 10902656, "step": 16190 }, { "epoch": 0.39564654435296703, "grad_norm": 15.531476974487305, "learning_rate": 1.5824497972345726e-06, "loss": 0.1456, "num_input_tokens_seen": 10905856, "step": 16195 }, { "epoch": 0.3957686951848142, "grad_norm": 13.77304458618164, "learning_rate": 1.5829383886255923e-06, "loss": 0.2502, "num_input_tokens_seen": 10909120, "step": 16200 }, { "epoch": 0.39589084601666136, "grad_norm": 19.4348201751709, "learning_rate": 1.583426980016612e-06, "loss": 0.1608, "num_input_tokens_seen": 10912384, "step": 16205 }, { "epoch": 0.39601299684850855, "grad_norm": 0.1257823258638382, "learning_rate": 1.5839155714076317e-06, "loss": 0.0518, "num_input_tokens_seen": 10915840, "step": 16210 }, { "epoch": 0.3961351476803557, "grad_norm": 31.687843322753906, "learning_rate": 1.5844041627986516e-06, "loss": 0.1392, "num_input_tokens_seen": 10919168, "step": 16215 }, { "epoch": 0.39625729851220287, "grad_norm": 9.020833015441895, "learning_rate": 1.584892754189671e-06, "loss": 0.1297, "num_input_tokens_seen": 10922176, "step": 16220 }, { "epoch": 0.39637944934405, "grad_norm": 17.56090545654297, "learning_rate": 1.5853813455806907e-06, "loss": 0.0987, "num_input_tokens_seen": 10925504, "step": 16225 }, { "epoch": 0.3965016001758972, "grad_norm": 25.40732192993164, "learning_rate": 1.5858699369717106e-06, "loss": 0.0924, "num_input_tokens_seen": 10928384, "step": 16230 }, { "epoch": 0.3966237510077444, "grad_norm": 21.291419982910156, "learning_rate": 1.58635852836273e-06, "loss": 0.1234, "num_input_tokens_seen": 10931392, "step": 16235 }, { "epoch": 0.3967459018395915, "grad_norm": 20.114309310913086, "learning_rate": 1.5868471197537498e-06, "loss": 0.0636, "num_input_tokens_seen": 10934912, "step": 16240 }, { "epoch": 0.3968680526714387, "grad_norm": 23.1328067779541, "learning_rate": 1.5873357111447697e-06, "loss": 0.1387, "num_input_tokens_seen": 10938112, "step": 16245 }, { "epoch": 0.39699020350328584, "grad_norm": 13.599346160888672, "learning_rate": 1.5878243025357891e-06, "loss": 0.1428, "num_input_tokens_seen": 10942144, "step": 16250 }, { "epoch": 0.39711235433513303, "grad_norm": 23.25737190246582, "learning_rate": 1.5883128939268088e-06, "loss": 0.1288, "num_input_tokens_seen": 10945280, "step": 16255 }, { "epoch": 0.39723450516698017, "grad_norm": 10.759117126464844, "learning_rate": 1.5888014853178287e-06, "loss": 0.1731, "num_input_tokens_seen": 10949056, "step": 16260 }, { "epoch": 0.39735665599882736, "grad_norm": 24.098670959472656, "learning_rate": 1.5892900767088484e-06, "loss": 0.0676, "num_input_tokens_seen": 10952384, "step": 16265 }, { "epoch": 0.3974788068306745, "grad_norm": 37.097408294677734, "learning_rate": 1.5897786680998679e-06, "loss": 0.0302, "num_input_tokens_seen": 10955712, "step": 16270 }, { "epoch": 0.3976009576625217, "grad_norm": 41.81599044799805, "learning_rate": 1.5902672594908877e-06, "loss": 0.0313, "num_input_tokens_seen": 10959040, "step": 16275 }, { "epoch": 0.39772310849436887, "grad_norm": 14.4953031539917, "learning_rate": 1.5907558508819074e-06, "loss": 0.1402, "num_input_tokens_seen": 10962240, "step": 16280 }, { "epoch": 0.397845259326216, "grad_norm": 1.8347476720809937, "learning_rate": 1.591244442272927e-06, "loss": 0.0433, "num_input_tokens_seen": 10965504, "step": 16285 }, { "epoch": 0.3979674101580632, "grad_norm": 50.44518280029297, "learning_rate": 1.5917330336639468e-06, "loss": 0.1955, "num_input_tokens_seen": 10968960, "step": 16290 }, { "epoch": 0.39808956098991033, "grad_norm": 47.06920623779297, "learning_rate": 1.5922216250549665e-06, "loss": 0.1945, "num_input_tokens_seen": 10972672, "step": 16295 }, { "epoch": 0.3982117118217575, "grad_norm": 32.896270751953125, "learning_rate": 1.5927102164459862e-06, "loss": 0.2091, "num_input_tokens_seen": 10975680, "step": 16300 }, { "epoch": 0.39833386265360465, "grad_norm": 31.728307723999023, "learning_rate": 1.5931988078370058e-06, "loss": 0.135, "num_input_tokens_seen": 10978688, "step": 16305 }, { "epoch": 0.39845601348545184, "grad_norm": 1.8147532939910889, "learning_rate": 1.5936873992280255e-06, "loss": 0.1638, "num_input_tokens_seen": 10981696, "step": 16310 }, { "epoch": 0.398578164317299, "grad_norm": 47.015018463134766, "learning_rate": 1.5941759906190452e-06, "loss": 0.1767, "num_input_tokens_seen": 10985024, "step": 16315 }, { "epoch": 0.39870031514914617, "grad_norm": 20.603796005249023, "learning_rate": 1.5946645820100649e-06, "loss": 0.128, "num_input_tokens_seen": 10988416, "step": 16320 }, { "epoch": 0.39882246598099336, "grad_norm": 0.8213732838630676, "learning_rate": 1.5951531734010846e-06, "loss": 0.1098, "num_input_tokens_seen": 10992000, "step": 16325 }, { "epoch": 0.3989446168128405, "grad_norm": 25.38956069946289, "learning_rate": 1.5956417647921042e-06, "loss": 0.1098, "num_input_tokens_seen": 10995264, "step": 16330 }, { "epoch": 0.3990667676446877, "grad_norm": 9.087261199951172, "learning_rate": 1.5961303561831241e-06, "loss": 0.0463, "num_input_tokens_seen": 10998464, "step": 16335 }, { "epoch": 0.3991889184765348, "grad_norm": 23.549148559570312, "learning_rate": 1.5966189475741436e-06, "loss": 0.1157, "num_input_tokens_seen": 11001600, "step": 16340 }, { "epoch": 0.399311069308382, "grad_norm": 44.26289749145508, "learning_rate": 1.5971075389651633e-06, "loss": 0.053, "num_input_tokens_seen": 11004928, "step": 16345 }, { "epoch": 0.39943322014022914, "grad_norm": 30.471595764160156, "learning_rate": 1.5975961303561832e-06, "loss": 0.1173, "num_input_tokens_seen": 11008064, "step": 16350 }, { "epoch": 0.39955537097207633, "grad_norm": 14.239789962768555, "learning_rate": 1.5980847217472026e-06, "loss": 0.2029, "num_input_tokens_seen": 11011392, "step": 16355 }, { "epoch": 0.39967752180392346, "grad_norm": 14.216521263122559, "learning_rate": 1.5985733131382223e-06, "loss": 0.2419, "num_input_tokens_seen": 11014912, "step": 16360 }, { "epoch": 0.39979967263577065, "grad_norm": 34.19306945800781, "learning_rate": 1.5990619045292422e-06, "loss": 0.1646, "num_input_tokens_seen": 11018496, "step": 16365 }, { "epoch": 0.3999218234676178, "grad_norm": 0.6496175527572632, "learning_rate": 1.599550495920262e-06, "loss": 0.0541, "num_input_tokens_seen": 11022016, "step": 16370 }, { "epoch": 0.400043974299465, "grad_norm": 24.54413604736328, "learning_rate": 1.6000390873112814e-06, "loss": 0.0528, "num_input_tokens_seen": 11025088, "step": 16375 }, { "epoch": 0.40016612513131217, "grad_norm": 6.894566059112549, "learning_rate": 1.6005276787023013e-06, "loss": 0.0828, "num_input_tokens_seen": 11027968, "step": 16380 }, { "epoch": 0.4002882759631593, "grad_norm": 0.47866374254226685, "learning_rate": 1.601016270093321e-06, "loss": 0.1717, "num_input_tokens_seen": 11031296, "step": 16385 }, { "epoch": 0.4004104267950065, "grad_norm": 6.847443580627441, "learning_rate": 1.6015048614843404e-06, "loss": 0.0743, "num_input_tokens_seen": 11034688, "step": 16390 }, { "epoch": 0.4005325776268536, "grad_norm": 21.000062942504883, "learning_rate": 1.6019934528753603e-06, "loss": 0.134, "num_input_tokens_seen": 11038016, "step": 16395 }, { "epoch": 0.4006547284587008, "grad_norm": 33.84720230102539, "learning_rate": 1.60248204426638e-06, "loss": 0.1923, "num_input_tokens_seen": 11041472, "step": 16400 }, { "epoch": 0.40077687929054795, "grad_norm": 34.48461151123047, "learning_rate": 1.6029706356573995e-06, "loss": 0.1039, "num_input_tokens_seen": 11044864, "step": 16405 }, { "epoch": 0.40089903012239514, "grad_norm": 0.1740226000547409, "learning_rate": 1.6034592270484194e-06, "loss": 0.1081, "num_input_tokens_seen": 11048384, "step": 16410 }, { "epoch": 0.4010211809542423, "grad_norm": 33.16176223754883, "learning_rate": 1.603947818439439e-06, "loss": 0.0626, "num_input_tokens_seen": 11051712, "step": 16415 }, { "epoch": 0.40114333178608946, "grad_norm": 10.99461555480957, "learning_rate": 1.6044364098304587e-06, "loss": 0.0369, "num_input_tokens_seen": 11055552, "step": 16420 }, { "epoch": 0.40126548261793665, "grad_norm": 11.003421783447266, "learning_rate": 1.6049250012214784e-06, "loss": 0.1517, "num_input_tokens_seen": 11058880, "step": 16425 }, { "epoch": 0.4013876334497838, "grad_norm": 34.61371994018555, "learning_rate": 1.605413592612498e-06, "loss": 0.1362, "num_input_tokens_seen": 11062208, "step": 16430 }, { "epoch": 0.401509784281631, "grad_norm": 12.106389999389648, "learning_rate": 1.6059021840035178e-06, "loss": 0.2636, "num_input_tokens_seen": 11066048, "step": 16435 }, { "epoch": 0.4016319351134781, "grad_norm": 33.2092170715332, "learning_rate": 1.6063907753945374e-06, "loss": 0.3135, "num_input_tokens_seen": 11069824, "step": 16440 }, { "epoch": 0.4017540859453253, "grad_norm": 31.4134521484375, "learning_rate": 1.6068793667855571e-06, "loss": 0.1552, "num_input_tokens_seen": 11073088, "step": 16445 }, { "epoch": 0.40187623677717244, "grad_norm": 38.61223220825195, "learning_rate": 1.6073679581765768e-06, "loss": 0.1472, "num_input_tokens_seen": 11076864, "step": 16450 }, { "epoch": 0.4019983876090196, "grad_norm": 0.4963672161102295, "learning_rate": 1.6078565495675967e-06, "loss": 0.0921, "num_input_tokens_seen": 11080256, "step": 16455 }, { "epoch": 0.40212053844086676, "grad_norm": 36.525779724121094, "learning_rate": 1.6083451409586162e-06, "loss": 0.1323, "num_input_tokens_seen": 11083712, "step": 16460 }, { "epoch": 0.40224268927271395, "grad_norm": 14.989439964294434, "learning_rate": 1.6088337323496359e-06, "loss": 0.1764, "num_input_tokens_seen": 11087040, "step": 16465 }, { "epoch": 0.40236484010456114, "grad_norm": 15.095987319946289, "learning_rate": 1.6093223237406557e-06, "loss": 0.1396, "num_input_tokens_seen": 11090240, "step": 16470 }, { "epoch": 0.4024869909364083, "grad_norm": 15.008681297302246, "learning_rate": 1.6098109151316752e-06, "loss": 0.0541, "num_input_tokens_seen": 11093632, "step": 16475 }, { "epoch": 0.40260914176825546, "grad_norm": 28.64449691772461, "learning_rate": 1.610299506522695e-06, "loss": 0.1336, "num_input_tokens_seen": 11096768, "step": 16480 }, { "epoch": 0.4027312926001026, "grad_norm": 20.235105514526367, "learning_rate": 1.6107880979137148e-06, "loss": 0.0992, "num_input_tokens_seen": 11099968, "step": 16485 }, { "epoch": 0.4028534434319498, "grad_norm": 29.153833389282227, "learning_rate": 1.6112766893047345e-06, "loss": 0.1079, "num_input_tokens_seen": 11103232, "step": 16490 }, { "epoch": 0.4029755942637969, "grad_norm": 13.984383583068848, "learning_rate": 1.611765280695754e-06, "loss": 0.1288, "num_input_tokens_seen": 11106048, "step": 16495 }, { "epoch": 0.4030977450956441, "grad_norm": 22.53131866455078, "learning_rate": 1.6122538720867738e-06, "loss": 0.0708, "num_input_tokens_seen": 11109248, "step": 16500 }, { "epoch": 0.40321989592749125, "grad_norm": 21.125389099121094, "learning_rate": 1.6127424634777935e-06, "loss": 0.1343, "num_input_tokens_seen": 11112384, "step": 16505 }, { "epoch": 0.40334204675933844, "grad_norm": 0.2616809606552124, "learning_rate": 1.613231054868813e-06, "loss": 0.075, "num_input_tokens_seen": 11116096, "step": 16510 }, { "epoch": 0.40346419759118557, "grad_norm": 23.93106460571289, "learning_rate": 1.6137196462598329e-06, "loss": 0.1278, "num_input_tokens_seen": 11119232, "step": 16515 }, { "epoch": 0.40358634842303276, "grad_norm": 19.08568572998047, "learning_rate": 1.6142082376508526e-06, "loss": 0.1306, "num_input_tokens_seen": 11123200, "step": 16520 }, { "epoch": 0.40370849925487995, "grad_norm": 11.759346008300781, "learning_rate": 1.614696829041872e-06, "loss": 0.0965, "num_input_tokens_seen": 11126528, "step": 16525 }, { "epoch": 0.4038306500867271, "grad_norm": 1.0693756341934204, "learning_rate": 1.615185420432892e-06, "loss": 0.1082, "num_input_tokens_seen": 11130048, "step": 16530 }, { "epoch": 0.4039528009185743, "grad_norm": 29.432117462158203, "learning_rate": 1.6156740118239116e-06, "loss": 0.1653, "num_input_tokens_seen": 11133248, "step": 16535 }, { "epoch": 0.4040749517504214, "grad_norm": 15.797622680664062, "learning_rate": 1.6161626032149313e-06, "loss": 0.198, "num_input_tokens_seen": 11136576, "step": 16540 }, { "epoch": 0.4041971025822686, "grad_norm": 2.2607970237731934, "learning_rate": 1.616651194605951e-06, "loss": 0.0332, "num_input_tokens_seen": 11139968, "step": 16545 }, { "epoch": 0.40431925341411573, "grad_norm": 26.23784637451172, "learning_rate": 1.6171397859969706e-06, "loss": 0.1053, "num_input_tokens_seen": 11143104, "step": 16550 }, { "epoch": 0.4044414042459629, "grad_norm": 19.77354621887207, "learning_rate": 1.6176283773879903e-06, "loss": 0.1145, "num_input_tokens_seen": 11146816, "step": 16555 }, { "epoch": 0.40456355507781006, "grad_norm": 20.735017776489258, "learning_rate": 1.61811696877901e-06, "loss": 0.058, "num_input_tokens_seen": 11150016, "step": 16560 }, { "epoch": 0.40468570590965725, "grad_norm": 40.133853912353516, "learning_rate": 1.6186055601700297e-06, "loss": 0.3602, "num_input_tokens_seen": 11153280, "step": 16565 }, { "epoch": 0.40480785674150443, "grad_norm": 17.214582443237305, "learning_rate": 1.6190941515610494e-06, "loss": 0.2336, "num_input_tokens_seen": 11156352, "step": 16570 }, { "epoch": 0.40493000757335157, "grad_norm": 29.597932815551758, "learning_rate": 1.6195827429520693e-06, "loss": 0.1717, "num_input_tokens_seen": 11159872, "step": 16575 }, { "epoch": 0.40505215840519876, "grad_norm": 13.730437278747559, "learning_rate": 1.6200713343430887e-06, "loss": 0.0624, "num_input_tokens_seen": 11163072, "step": 16580 }, { "epoch": 0.4051743092370459, "grad_norm": 25.89980697631836, "learning_rate": 1.6205599257341084e-06, "loss": 0.1306, "num_input_tokens_seen": 11166336, "step": 16585 }, { "epoch": 0.4052964600688931, "grad_norm": 14.32689094543457, "learning_rate": 1.6210485171251283e-06, "loss": 0.1343, "num_input_tokens_seen": 11169856, "step": 16590 }, { "epoch": 0.4054186109007402, "grad_norm": 16.669872283935547, "learning_rate": 1.6215371085161478e-06, "loss": 0.1087, "num_input_tokens_seen": 11172992, "step": 16595 }, { "epoch": 0.4055407617325874, "grad_norm": 26.87580680847168, "learning_rate": 1.6220256999071675e-06, "loss": 0.0759, "num_input_tokens_seen": 11176320, "step": 16600 }, { "epoch": 0.40566291256443454, "grad_norm": 2.764390468597412, "learning_rate": 1.6225142912981874e-06, "loss": 0.0217, "num_input_tokens_seen": 11179648, "step": 16605 }, { "epoch": 0.40578506339628173, "grad_norm": 23.70087432861328, "learning_rate": 1.623002882689207e-06, "loss": 0.1103, "num_input_tokens_seen": 11182656, "step": 16610 }, { "epoch": 0.4059072142281289, "grad_norm": 44.54327392578125, "learning_rate": 1.6234914740802265e-06, "loss": 0.225, "num_input_tokens_seen": 11186240, "step": 16615 }, { "epoch": 0.40602936505997606, "grad_norm": 0.4073219895362854, "learning_rate": 1.6239800654712464e-06, "loss": 0.1125, "num_input_tokens_seen": 11190016, "step": 16620 }, { "epoch": 0.40615151589182324, "grad_norm": 13.439949035644531, "learning_rate": 1.624468656862266e-06, "loss": 0.0681, "num_input_tokens_seen": 11193408, "step": 16625 }, { "epoch": 0.4062736667236704, "grad_norm": 28.58769416809082, "learning_rate": 1.6249572482532855e-06, "loss": 0.1086, "num_input_tokens_seen": 11196416, "step": 16630 }, { "epoch": 0.40639581755551757, "grad_norm": 0.441162645816803, "learning_rate": 1.6254458396443054e-06, "loss": 0.163, "num_input_tokens_seen": 11199808, "step": 16635 }, { "epoch": 0.4065179683873647, "grad_norm": 43.27127456665039, "learning_rate": 1.6259344310353251e-06, "loss": 0.2066, "num_input_tokens_seen": 11203072, "step": 16640 }, { "epoch": 0.4066401192192119, "grad_norm": 25.73714828491211, "learning_rate": 1.6264230224263448e-06, "loss": 0.0939, "num_input_tokens_seen": 11206400, "step": 16645 }, { "epoch": 0.406762270051059, "grad_norm": 11.029995918273926, "learning_rate": 1.6269116138173645e-06, "loss": 0.1402, "num_input_tokens_seen": 11209600, "step": 16650 }, { "epoch": 0.4068844208829062, "grad_norm": 24.101545333862305, "learning_rate": 1.6274002052083842e-06, "loss": 0.1372, "num_input_tokens_seen": 11212736, "step": 16655 }, { "epoch": 0.40700657171475335, "grad_norm": 25.160419464111328, "learning_rate": 1.6278887965994038e-06, "loss": 0.1845, "num_input_tokens_seen": 11216064, "step": 16660 }, { "epoch": 0.40712872254660054, "grad_norm": 25.641063690185547, "learning_rate": 1.6283773879904235e-06, "loss": 0.1796, "num_input_tokens_seen": 11219264, "step": 16665 }, { "epoch": 0.40725087337844773, "grad_norm": 14.866890907287598, "learning_rate": 1.6288659793814432e-06, "loss": 0.0825, "num_input_tokens_seen": 11222784, "step": 16670 }, { "epoch": 0.40737302421029487, "grad_norm": 24.974517822265625, "learning_rate": 1.6293545707724629e-06, "loss": 0.1176, "num_input_tokens_seen": 11226048, "step": 16675 }, { "epoch": 0.40749517504214205, "grad_norm": 0.3464401662349701, "learning_rate": 1.6298431621634826e-06, "loss": 0.0873, "num_input_tokens_seen": 11229568, "step": 16680 }, { "epoch": 0.4076173258739892, "grad_norm": 2.9293575286865234, "learning_rate": 1.6303317535545023e-06, "loss": 0.0537, "num_input_tokens_seen": 11233024, "step": 16685 }, { "epoch": 0.4077394767058364, "grad_norm": 19.832813262939453, "learning_rate": 1.630820344945522e-06, "loss": 0.0696, "num_input_tokens_seen": 11236032, "step": 16690 }, { "epoch": 0.4078616275376835, "grad_norm": 25.053936004638672, "learning_rate": 1.6313089363365418e-06, "loss": 0.1806, "num_input_tokens_seen": 11239104, "step": 16695 }, { "epoch": 0.4079837783695307, "grad_norm": 28.535404205322266, "learning_rate": 1.6317975277275613e-06, "loss": 0.149, "num_input_tokens_seen": 11242368, "step": 16700 }, { "epoch": 0.40810592920137784, "grad_norm": 3.1667778491973877, "learning_rate": 1.632286119118581e-06, "loss": 0.049, "num_input_tokens_seen": 11245632, "step": 16705 }, { "epoch": 0.408228080033225, "grad_norm": 19.77776336669922, "learning_rate": 1.6327747105096009e-06, "loss": 0.088, "num_input_tokens_seen": 11249344, "step": 16710 }, { "epoch": 0.4083502308650722, "grad_norm": 3.902125597000122, "learning_rate": 1.6332633019006203e-06, "loss": 0.0908, "num_input_tokens_seen": 11253056, "step": 16715 }, { "epoch": 0.40847238169691935, "grad_norm": 6.418919563293457, "learning_rate": 1.63375189329164e-06, "loss": 0.1675, "num_input_tokens_seen": 11256256, "step": 16720 }, { "epoch": 0.40859453252876654, "grad_norm": 5.540237903594971, "learning_rate": 1.63424048468266e-06, "loss": 0.0575, "num_input_tokens_seen": 11259904, "step": 16725 }, { "epoch": 0.4087166833606137, "grad_norm": 4.5995330810546875, "learning_rate": 1.6347290760736796e-06, "loss": 0.0636, "num_input_tokens_seen": 11263296, "step": 16730 }, { "epoch": 0.40883883419246086, "grad_norm": 6.6088643074035645, "learning_rate": 1.635217667464699e-06, "loss": 0.1156, "num_input_tokens_seen": 11266560, "step": 16735 }, { "epoch": 0.408960985024308, "grad_norm": 14.79891586303711, "learning_rate": 1.635706258855719e-06, "loss": 0.171, "num_input_tokens_seen": 11270592, "step": 16740 }, { "epoch": 0.4090831358561552, "grad_norm": 19.40209197998047, "learning_rate": 1.6361948502467386e-06, "loss": 0.1687, "num_input_tokens_seen": 11274048, "step": 16745 }, { "epoch": 0.4092052866880023, "grad_norm": 36.3360595703125, "learning_rate": 1.6366834416377581e-06, "loss": 0.1968, "num_input_tokens_seen": 11277760, "step": 16750 }, { "epoch": 0.4093274375198495, "grad_norm": 2.9016504287719727, "learning_rate": 1.637172033028778e-06, "loss": 0.1212, "num_input_tokens_seen": 11281344, "step": 16755 }, { "epoch": 0.4094495883516967, "grad_norm": 0.9625802636146545, "learning_rate": 1.6376606244197977e-06, "loss": 0.1246, "num_input_tokens_seen": 11285184, "step": 16760 }, { "epoch": 0.40957173918354384, "grad_norm": 0.9809098839759827, "learning_rate": 1.6381492158108174e-06, "loss": 0.1465, "num_input_tokens_seen": 11288576, "step": 16765 }, { "epoch": 0.409693890015391, "grad_norm": 16.860475540161133, "learning_rate": 1.638637807201837e-06, "loss": 0.1732, "num_input_tokens_seen": 11291968, "step": 16770 }, { "epoch": 0.40981604084723816, "grad_norm": 11.833168983459473, "learning_rate": 1.6391263985928567e-06, "loss": 0.1714, "num_input_tokens_seen": 11295680, "step": 16775 }, { "epoch": 0.40993819167908535, "grad_norm": 34.927284240722656, "learning_rate": 1.6396149899838764e-06, "loss": 0.1803, "num_input_tokens_seen": 11299264, "step": 16780 }, { "epoch": 0.4100603425109325, "grad_norm": 21.84634017944336, "learning_rate": 1.640103581374896e-06, "loss": 0.1094, "num_input_tokens_seen": 11302720, "step": 16785 }, { "epoch": 0.4101824933427797, "grad_norm": 14.141610145568848, "learning_rate": 1.6405921727659158e-06, "loss": 0.1998, "num_input_tokens_seen": 11305664, "step": 16790 }, { "epoch": 0.4103046441746268, "grad_norm": 10.894978523254395, "learning_rate": 1.6410807641569355e-06, "loss": 0.067, "num_input_tokens_seen": 11309056, "step": 16795 }, { "epoch": 0.410426795006474, "grad_norm": 10.09036922454834, "learning_rate": 1.6415693555479553e-06, "loss": 0.1192, "num_input_tokens_seen": 11312576, "step": 16800 }, { "epoch": 0.41054894583832113, "grad_norm": 19.356460571289062, "learning_rate": 1.6420579469389748e-06, "loss": 0.1092, "num_input_tokens_seen": 11315648, "step": 16805 }, { "epoch": 0.4106710966701683, "grad_norm": 3.952444076538086, "learning_rate": 1.6425465383299945e-06, "loss": 0.1387, "num_input_tokens_seen": 11318848, "step": 16810 }, { "epoch": 0.4107932475020155, "grad_norm": 26.717870712280273, "learning_rate": 1.6430351297210144e-06, "loss": 0.1777, "num_input_tokens_seen": 11322048, "step": 16815 }, { "epoch": 0.41091539833386265, "grad_norm": 1.2356115579605103, "learning_rate": 1.6435237211120339e-06, "loss": 0.125, "num_input_tokens_seen": 11325376, "step": 16820 }, { "epoch": 0.41103754916570984, "grad_norm": 28.821317672729492, "learning_rate": 1.6440123125030535e-06, "loss": 0.151, "num_input_tokens_seen": 11328704, "step": 16825 }, { "epoch": 0.41115969999755697, "grad_norm": 0.7280601263046265, "learning_rate": 1.6445009038940734e-06, "loss": 0.0885, "num_input_tokens_seen": 11331776, "step": 16830 }, { "epoch": 0.41128185082940416, "grad_norm": 6.10056209564209, "learning_rate": 1.644989495285093e-06, "loss": 0.0768, "num_input_tokens_seen": 11335680, "step": 16835 }, { "epoch": 0.4114040016612513, "grad_norm": 6.15144157409668, "learning_rate": 1.6454780866761126e-06, "loss": 0.159, "num_input_tokens_seen": 11338624, "step": 16840 }, { "epoch": 0.4115261524930985, "grad_norm": 0.3368801176548004, "learning_rate": 1.6459666780671325e-06, "loss": 0.0856, "num_input_tokens_seen": 11341888, "step": 16845 }, { "epoch": 0.4116483033249456, "grad_norm": 12.542458534240723, "learning_rate": 1.6464552694581522e-06, "loss": 0.139, "num_input_tokens_seen": 11345728, "step": 16850 }, { "epoch": 0.4117704541567928, "grad_norm": 2.5582902431488037, "learning_rate": 1.6469438608491716e-06, "loss": 0.1357, "num_input_tokens_seen": 11349184, "step": 16855 }, { "epoch": 0.41189260498864, "grad_norm": 18.847349166870117, "learning_rate": 1.6474324522401915e-06, "loss": 0.1509, "num_input_tokens_seen": 11352384, "step": 16860 }, { "epoch": 0.41201475582048713, "grad_norm": 30.65785026550293, "learning_rate": 1.6479210436312112e-06, "loss": 0.1452, "num_input_tokens_seen": 11355904, "step": 16865 }, { "epoch": 0.4121369066523343, "grad_norm": 7.287036418914795, "learning_rate": 1.6484096350222307e-06, "loss": 0.0671, "num_input_tokens_seen": 11359040, "step": 16870 }, { "epoch": 0.41225905748418146, "grad_norm": 9.512429237365723, "learning_rate": 1.6488982264132506e-06, "loss": 0.1257, "num_input_tokens_seen": 11361984, "step": 16875 }, { "epoch": 0.41238120831602865, "grad_norm": 3.455568790435791, "learning_rate": 1.6493868178042702e-06, "loss": 0.066, "num_input_tokens_seen": 11366016, "step": 16880 }, { "epoch": 0.4125033591478758, "grad_norm": 0.9491122364997864, "learning_rate": 1.64987540919529e-06, "loss": 0.1641, "num_input_tokens_seen": 11369984, "step": 16885 }, { "epoch": 0.41262550997972297, "grad_norm": 4.618907928466797, "learning_rate": 1.6503640005863096e-06, "loss": 0.1858, "num_input_tokens_seen": 11373312, "step": 16890 }, { "epoch": 0.4127476608115701, "grad_norm": 8.991425514221191, "learning_rate": 1.6508525919773293e-06, "loss": 0.1818, "num_input_tokens_seen": 11376448, "step": 16895 }, { "epoch": 0.4128698116434173, "grad_norm": 5.761434555053711, "learning_rate": 1.651341183368349e-06, "loss": 0.0363, "num_input_tokens_seen": 11379904, "step": 16900 }, { "epoch": 0.41299196247526443, "grad_norm": 37.52250671386719, "learning_rate": 1.6518297747593687e-06, "loss": 0.1037, "num_input_tokens_seen": 11383360, "step": 16905 }, { "epoch": 0.4131141133071116, "grad_norm": 26.177513122558594, "learning_rate": 1.6523183661503883e-06, "loss": 0.2688, "num_input_tokens_seen": 11386624, "step": 16910 }, { "epoch": 0.4132362641389588, "grad_norm": 10.464012145996094, "learning_rate": 1.652806957541408e-06, "loss": 0.0447, "num_input_tokens_seen": 11390208, "step": 16915 }, { "epoch": 0.41335841497080594, "grad_norm": 15.334251403808594, "learning_rate": 1.653295548932428e-06, "loss": 0.1803, "num_input_tokens_seen": 11394048, "step": 16920 }, { "epoch": 0.41348056580265313, "grad_norm": 13.855216026306152, "learning_rate": 1.6537841403234474e-06, "loss": 0.0391, "num_input_tokens_seen": 11397376, "step": 16925 }, { "epoch": 0.41360271663450027, "grad_norm": 26.607481002807617, "learning_rate": 1.654272731714467e-06, "loss": 0.126, "num_input_tokens_seen": 11400960, "step": 16930 }, { "epoch": 0.41372486746634746, "grad_norm": 23.822050094604492, "learning_rate": 1.654761323105487e-06, "loss": 0.0747, "num_input_tokens_seen": 11404160, "step": 16935 }, { "epoch": 0.4138470182981946, "grad_norm": 28.980283737182617, "learning_rate": 1.6552499144965064e-06, "loss": 0.1278, "num_input_tokens_seen": 11407488, "step": 16940 }, { "epoch": 0.4139691691300418, "grad_norm": 14.651640892028809, "learning_rate": 1.655738505887526e-06, "loss": 0.0899, "num_input_tokens_seen": 11410944, "step": 16945 }, { "epoch": 0.4140913199618889, "grad_norm": 24.085874557495117, "learning_rate": 1.656227097278546e-06, "loss": 0.1208, "num_input_tokens_seen": 11414848, "step": 16950 }, { "epoch": 0.4142134707937361, "grad_norm": 1.1301546096801758, "learning_rate": 1.6567156886695655e-06, "loss": 0.0931, "num_input_tokens_seen": 11418176, "step": 16955 }, { "epoch": 0.4143356216255833, "grad_norm": 0.6874004006385803, "learning_rate": 1.6572042800605851e-06, "loss": 0.3264, "num_input_tokens_seen": 11421504, "step": 16960 }, { "epoch": 0.41445777245743043, "grad_norm": 4.807636260986328, "learning_rate": 1.657692871451605e-06, "loss": 0.0632, "num_input_tokens_seen": 11424896, "step": 16965 }, { "epoch": 0.4145799232892776, "grad_norm": 24.025726318359375, "learning_rate": 1.6581814628426247e-06, "loss": 0.2217, "num_input_tokens_seen": 11428096, "step": 16970 }, { "epoch": 0.41470207412112475, "grad_norm": 17.033933639526367, "learning_rate": 1.6586700542336442e-06, "loss": 0.1892, "num_input_tokens_seen": 11431872, "step": 16975 }, { "epoch": 0.41482422495297194, "grad_norm": 0.6783763766288757, "learning_rate": 1.659158645624664e-06, "loss": 0.118, "num_input_tokens_seen": 11434944, "step": 16980 }, { "epoch": 0.4149463757848191, "grad_norm": 20.161361694335938, "learning_rate": 1.6596472370156838e-06, "loss": 0.1374, "num_input_tokens_seen": 11438528, "step": 16985 }, { "epoch": 0.41506852661666627, "grad_norm": 17.77075958251953, "learning_rate": 1.6601358284067032e-06, "loss": 0.1233, "num_input_tokens_seen": 11441472, "step": 16990 }, { "epoch": 0.4151906774485134, "grad_norm": 23.082717895507812, "learning_rate": 1.6606244197977231e-06, "loss": 0.0969, "num_input_tokens_seen": 11444800, "step": 16995 }, { "epoch": 0.4153128282803606, "grad_norm": 37.304412841796875, "learning_rate": 1.6611130111887428e-06, "loss": 0.1357, "num_input_tokens_seen": 11448576, "step": 17000 }, { "epoch": 0.4154349791122078, "grad_norm": 2.0742123126983643, "learning_rate": 1.6616016025797625e-06, "loss": 0.1851, "num_input_tokens_seen": 11451968, "step": 17005 }, { "epoch": 0.4155571299440549, "grad_norm": 10.271985054016113, "learning_rate": 1.6620901939707822e-06, "loss": 0.0463, "num_input_tokens_seen": 11455232, "step": 17010 }, { "epoch": 0.4156792807759021, "grad_norm": 1.929952621459961, "learning_rate": 1.6625787853618019e-06, "loss": 0.0914, "num_input_tokens_seen": 11458240, "step": 17015 }, { "epoch": 0.41580143160774924, "grad_norm": 0.3577294945716858, "learning_rate": 1.6630673767528215e-06, "loss": 0.1541, "num_input_tokens_seen": 11461376, "step": 17020 }, { "epoch": 0.41592358243959643, "grad_norm": 0.5475679039955139, "learning_rate": 1.6635559681438412e-06, "loss": 0.1685, "num_input_tokens_seen": 11464576, "step": 17025 }, { "epoch": 0.41604573327144356, "grad_norm": 2.176693916320801, "learning_rate": 1.664044559534861e-06, "loss": 0.1494, "num_input_tokens_seen": 11468032, "step": 17030 }, { "epoch": 0.41616788410329075, "grad_norm": 26.510526657104492, "learning_rate": 1.6645331509258806e-06, "loss": 0.0575, "num_input_tokens_seen": 11472192, "step": 17035 }, { "epoch": 0.4162900349351379, "grad_norm": 11.13955307006836, "learning_rate": 1.6650217423169005e-06, "loss": 0.1208, "num_input_tokens_seen": 11475328, "step": 17040 }, { "epoch": 0.4164121857669851, "grad_norm": 4.4048590660095215, "learning_rate": 1.66551033370792e-06, "loss": 0.1505, "num_input_tokens_seen": 11478336, "step": 17045 }, { "epoch": 0.4165343365988322, "grad_norm": 15.270955085754395, "learning_rate": 1.6659989250989396e-06, "loss": 0.1621, "num_input_tokens_seen": 11481920, "step": 17050 }, { "epoch": 0.4166564874306794, "grad_norm": 20.76490020751953, "learning_rate": 1.6664875164899595e-06, "loss": 0.187, "num_input_tokens_seen": 11484992, "step": 17055 }, { "epoch": 0.4167786382625266, "grad_norm": 9.951630592346191, "learning_rate": 1.666976107880979e-06, "loss": 0.0291, "num_input_tokens_seen": 11488384, "step": 17060 }, { "epoch": 0.4169007890943737, "grad_norm": 22.358646392822266, "learning_rate": 1.6674646992719987e-06, "loss": 0.1093, "num_input_tokens_seen": 11491712, "step": 17065 }, { "epoch": 0.4170229399262209, "grad_norm": 7.269237518310547, "learning_rate": 1.6679532906630186e-06, "loss": 0.0532, "num_input_tokens_seen": 11495232, "step": 17070 }, { "epoch": 0.41714509075806805, "grad_norm": 50.06626892089844, "learning_rate": 1.6684418820540382e-06, "loss": 0.2184, "num_input_tokens_seen": 11498304, "step": 17075 }, { "epoch": 0.41726724158991524, "grad_norm": 23.30377960205078, "learning_rate": 1.6689304734450577e-06, "loss": 0.1244, "num_input_tokens_seen": 11501632, "step": 17080 }, { "epoch": 0.4173893924217624, "grad_norm": 28.480815887451172, "learning_rate": 1.6694190648360776e-06, "loss": 0.0564, "num_input_tokens_seen": 11505152, "step": 17085 }, { "epoch": 0.41751154325360956, "grad_norm": 14.612655639648438, "learning_rate": 1.6699076562270973e-06, "loss": 0.0455, "num_input_tokens_seen": 11508352, "step": 17090 }, { "epoch": 0.4176336940854567, "grad_norm": 52.15671920776367, "learning_rate": 1.6703962476181168e-06, "loss": 0.138, "num_input_tokens_seen": 11512192, "step": 17095 }, { "epoch": 0.4177558449173039, "grad_norm": 41.2117919921875, "learning_rate": 1.6708848390091366e-06, "loss": 0.1228, "num_input_tokens_seen": 11515456, "step": 17100 }, { "epoch": 0.4178779957491511, "grad_norm": 29.704301834106445, "learning_rate": 1.6713734304001563e-06, "loss": 0.1136, "num_input_tokens_seen": 11518336, "step": 17105 }, { "epoch": 0.4180001465809982, "grad_norm": 15.26375675201416, "learning_rate": 1.6718620217911758e-06, "loss": 0.0804, "num_input_tokens_seen": 11521920, "step": 17110 }, { "epoch": 0.4181222974128454, "grad_norm": 13.994719505310059, "learning_rate": 1.6723506131821957e-06, "loss": 0.0496, "num_input_tokens_seen": 11525248, "step": 17115 }, { "epoch": 0.41824444824469253, "grad_norm": 30.99201202392578, "learning_rate": 1.6728392045732154e-06, "loss": 0.055, "num_input_tokens_seen": 11528256, "step": 17120 }, { "epoch": 0.4183665990765397, "grad_norm": 24.87019920349121, "learning_rate": 1.673327795964235e-06, "loss": 0.1625, "num_input_tokens_seen": 11531264, "step": 17125 }, { "epoch": 0.41848874990838686, "grad_norm": 38.52754211425781, "learning_rate": 1.6738163873552547e-06, "loss": 0.1851, "num_input_tokens_seen": 11534784, "step": 17130 }, { "epoch": 0.41861090074023405, "grad_norm": 3.06817889213562, "learning_rate": 1.6743049787462744e-06, "loss": 0.2029, "num_input_tokens_seen": 11538880, "step": 17135 }, { "epoch": 0.4187330515720812, "grad_norm": 11.713780403137207, "learning_rate": 1.674793570137294e-06, "loss": 0.2042, "num_input_tokens_seen": 11542080, "step": 17140 }, { "epoch": 0.4188552024039284, "grad_norm": 14.065862655639648, "learning_rate": 1.6752821615283138e-06, "loss": 0.0884, "num_input_tokens_seen": 11545664, "step": 17145 }, { "epoch": 0.41897735323577556, "grad_norm": 14.169081687927246, "learning_rate": 1.6757707529193335e-06, "loss": 0.0796, "num_input_tokens_seen": 11549056, "step": 17150 }, { "epoch": 0.4190995040676227, "grad_norm": 25.049545288085938, "learning_rate": 1.6762593443103531e-06, "loss": 0.166, "num_input_tokens_seen": 11552000, "step": 17155 }, { "epoch": 0.4192216548994699, "grad_norm": 2.066195487976074, "learning_rate": 1.676747935701373e-06, "loss": 0.2232, "num_input_tokens_seen": 11555008, "step": 17160 }, { "epoch": 0.419343805731317, "grad_norm": 25.07562828063965, "learning_rate": 1.6772365270923925e-06, "loss": 0.1723, "num_input_tokens_seen": 11558720, "step": 17165 }, { "epoch": 0.4194659565631642, "grad_norm": 21.18793296813965, "learning_rate": 1.6777251184834122e-06, "loss": 0.0894, "num_input_tokens_seen": 11562048, "step": 17170 }, { "epoch": 0.41958810739501134, "grad_norm": 3.6823861598968506, "learning_rate": 1.678213709874432e-06, "loss": 0.0682, "num_input_tokens_seen": 11565632, "step": 17175 }, { "epoch": 0.41971025822685853, "grad_norm": 12.41185188293457, "learning_rate": 1.6787023012654515e-06, "loss": 0.1125, "num_input_tokens_seen": 11569152, "step": 17180 }, { "epoch": 0.41983240905870567, "grad_norm": 31.988386154174805, "learning_rate": 1.6791908926564712e-06, "loss": 0.18, "num_input_tokens_seen": 11572416, "step": 17185 }, { "epoch": 0.41995455989055286, "grad_norm": 22.454463958740234, "learning_rate": 1.6796794840474911e-06, "loss": 0.0782, "num_input_tokens_seen": 11575552, "step": 17190 }, { "epoch": 0.4200767107224, "grad_norm": 10.569482803344727, "learning_rate": 1.6801680754385108e-06, "loss": 0.0783, "num_input_tokens_seen": 11578688, "step": 17195 }, { "epoch": 0.4201988615542472, "grad_norm": 10.280340194702148, "learning_rate": 1.6806566668295303e-06, "loss": 0.2211, "num_input_tokens_seen": 11581632, "step": 17200 }, { "epoch": 0.4203210123860944, "grad_norm": 23.099761962890625, "learning_rate": 1.6811452582205502e-06, "loss": 0.2273, "num_input_tokens_seen": 11584896, "step": 17205 }, { "epoch": 0.4204431632179415, "grad_norm": 18.303823471069336, "learning_rate": 1.6816338496115698e-06, "loss": 0.0593, "num_input_tokens_seen": 11588416, "step": 17210 }, { "epoch": 0.4205653140497887, "grad_norm": 1.9946376085281372, "learning_rate": 1.6821224410025893e-06, "loss": 0.0771, "num_input_tokens_seen": 11591488, "step": 17215 }, { "epoch": 0.42068746488163583, "grad_norm": 19.712543487548828, "learning_rate": 1.6826110323936092e-06, "loss": 0.094, "num_input_tokens_seen": 11594944, "step": 17220 }, { "epoch": 0.420809615713483, "grad_norm": 41.17828369140625, "learning_rate": 1.6830996237846289e-06, "loss": 0.1358, "num_input_tokens_seen": 11597952, "step": 17225 }, { "epoch": 0.42093176654533015, "grad_norm": 22.785945892333984, "learning_rate": 1.6835882151756484e-06, "loss": 0.1798, "num_input_tokens_seen": 11601344, "step": 17230 }, { "epoch": 0.42105391737717734, "grad_norm": 15.529670715332031, "learning_rate": 1.6840768065666683e-06, "loss": 0.1284, "num_input_tokens_seen": 11604864, "step": 17235 }, { "epoch": 0.4211760682090245, "grad_norm": 1.1814509630203247, "learning_rate": 1.684565397957688e-06, "loss": 0.0769, "num_input_tokens_seen": 11608320, "step": 17240 }, { "epoch": 0.42129821904087167, "grad_norm": 28.040863037109375, "learning_rate": 1.6850539893487076e-06, "loss": 0.1262, "num_input_tokens_seen": 11611968, "step": 17245 }, { "epoch": 0.42142036987271886, "grad_norm": 27.960603713989258, "learning_rate": 1.6855425807397273e-06, "loss": 0.1179, "num_input_tokens_seen": 11615872, "step": 17250 }, { "epoch": 0.421542520704566, "grad_norm": 0.27788135409355164, "learning_rate": 1.686031172130747e-06, "loss": 0.0897, "num_input_tokens_seen": 11619136, "step": 17255 }, { "epoch": 0.4216646715364132, "grad_norm": 9.456171989440918, "learning_rate": 1.6865197635217667e-06, "loss": 0.1211, "num_input_tokens_seen": 11622400, "step": 17260 }, { "epoch": 0.4217868223682603, "grad_norm": 15.458897590637207, "learning_rate": 1.6870083549127863e-06, "loss": 0.117, "num_input_tokens_seen": 11625600, "step": 17265 }, { "epoch": 0.4219089732001075, "grad_norm": 63.20888137817383, "learning_rate": 1.687496946303806e-06, "loss": 0.2705, "num_input_tokens_seen": 11628928, "step": 17270 }, { "epoch": 0.42203112403195464, "grad_norm": 23.697669982910156, "learning_rate": 1.6879855376948257e-06, "loss": 0.1376, "num_input_tokens_seen": 11632192, "step": 17275 }, { "epoch": 0.42215327486380183, "grad_norm": 23.548437118530273, "learning_rate": 1.6884741290858456e-06, "loss": 0.109, "num_input_tokens_seen": 11635200, "step": 17280 }, { "epoch": 0.42227542569564896, "grad_norm": 21.84796714782715, "learning_rate": 1.688962720476865e-06, "loss": 0.0928, "num_input_tokens_seen": 11638336, "step": 17285 }, { "epoch": 0.42239757652749615, "grad_norm": 18.382299423217773, "learning_rate": 1.6894513118678847e-06, "loss": 0.0988, "num_input_tokens_seen": 11642048, "step": 17290 }, { "epoch": 0.42251972735934334, "grad_norm": 25.732805252075195, "learning_rate": 1.6899399032589046e-06, "loss": 0.182, "num_input_tokens_seen": 11645248, "step": 17295 }, { "epoch": 0.4226418781911905, "grad_norm": 21.091764450073242, "learning_rate": 1.6904284946499241e-06, "loss": 0.1811, "num_input_tokens_seen": 11648768, "step": 17300 }, { "epoch": 0.42276402902303767, "grad_norm": 0.5553004145622253, "learning_rate": 1.6909170860409438e-06, "loss": 0.0477, "num_input_tokens_seen": 11652032, "step": 17305 }, { "epoch": 0.4228861798548848, "grad_norm": 1.810340166091919, "learning_rate": 1.6914056774319637e-06, "loss": 0.082, "num_input_tokens_seen": 11655104, "step": 17310 }, { "epoch": 0.423008330686732, "grad_norm": 15.941764831542969, "learning_rate": 1.6918942688229834e-06, "loss": 0.2205, "num_input_tokens_seen": 11658624, "step": 17315 }, { "epoch": 0.4231304815185791, "grad_norm": 21.933530807495117, "learning_rate": 1.6923828602140028e-06, "loss": 0.1187, "num_input_tokens_seen": 11661440, "step": 17320 }, { "epoch": 0.4232526323504263, "grad_norm": 43.517852783203125, "learning_rate": 1.6928714516050227e-06, "loss": 0.0832, "num_input_tokens_seen": 11664512, "step": 17325 }, { "epoch": 0.42337478318227345, "grad_norm": 3.793125867843628, "learning_rate": 1.6933600429960424e-06, "loss": 0.2175, "num_input_tokens_seen": 11667648, "step": 17330 }, { "epoch": 0.42349693401412064, "grad_norm": 8.742578506469727, "learning_rate": 1.6938486343870619e-06, "loss": 0.1996, "num_input_tokens_seen": 11670784, "step": 17335 }, { "epoch": 0.4236190848459678, "grad_norm": 3.419423818588257, "learning_rate": 1.6943372257780818e-06, "loss": 0.0694, "num_input_tokens_seen": 11673792, "step": 17340 }, { "epoch": 0.42374123567781496, "grad_norm": 14.705204963684082, "learning_rate": 1.6948258171691015e-06, "loss": 0.0343, "num_input_tokens_seen": 11677568, "step": 17345 }, { "epoch": 0.42386338650966215, "grad_norm": 32.34300994873047, "learning_rate": 1.6953144085601211e-06, "loss": 0.1287, "num_input_tokens_seen": 11681216, "step": 17350 }, { "epoch": 0.4239855373415093, "grad_norm": 25.323711395263672, "learning_rate": 1.6958029999511408e-06, "loss": 0.2124, "num_input_tokens_seen": 11684544, "step": 17355 }, { "epoch": 0.4241076881733565, "grad_norm": 7.455917835235596, "learning_rate": 1.6962915913421605e-06, "loss": 0.1128, "num_input_tokens_seen": 11687744, "step": 17360 }, { "epoch": 0.4242298390052036, "grad_norm": 4.69118070602417, "learning_rate": 1.6967801827331802e-06, "loss": 0.0695, "num_input_tokens_seen": 11691136, "step": 17365 }, { "epoch": 0.4243519898370508, "grad_norm": 2.9468350410461426, "learning_rate": 1.6972687741241999e-06, "loss": 0.0546, "num_input_tokens_seen": 11694592, "step": 17370 }, { "epoch": 0.42447414066889794, "grad_norm": 8.724267959594727, "learning_rate": 1.6977573655152195e-06, "loss": 0.1033, "num_input_tokens_seen": 11697856, "step": 17375 }, { "epoch": 0.4245962915007451, "grad_norm": 17.574451446533203, "learning_rate": 1.6982459569062392e-06, "loss": 0.0761, "num_input_tokens_seen": 11701696, "step": 17380 }, { "epoch": 0.42471844233259226, "grad_norm": 30.870471954345703, "learning_rate": 1.698734548297259e-06, "loss": 0.1001, "num_input_tokens_seen": 11704832, "step": 17385 }, { "epoch": 0.42484059316443945, "grad_norm": 1.6396796703338623, "learning_rate": 1.6992231396882786e-06, "loss": 0.0872, "num_input_tokens_seen": 11708096, "step": 17390 }, { "epoch": 0.42496274399628664, "grad_norm": 10.13924789428711, "learning_rate": 1.6997117310792983e-06, "loss": 0.1583, "num_input_tokens_seen": 11710976, "step": 17395 }, { "epoch": 0.4250848948281338, "grad_norm": 10.56521987915039, "learning_rate": 1.7002003224703182e-06, "loss": 0.0655, "num_input_tokens_seen": 11714560, "step": 17400 }, { "epoch": 0.42520704565998096, "grad_norm": 13.599597930908203, "learning_rate": 1.7006889138613376e-06, "loss": 0.2183, "num_input_tokens_seen": 11717760, "step": 17405 }, { "epoch": 0.4253291964918281, "grad_norm": 29.13743019104004, "learning_rate": 1.7011775052523573e-06, "loss": 0.1628, "num_input_tokens_seen": 11721152, "step": 17410 }, { "epoch": 0.4254513473236753, "grad_norm": 15.540716171264648, "learning_rate": 1.7016660966433772e-06, "loss": 0.0634, "num_input_tokens_seen": 11724224, "step": 17415 }, { "epoch": 0.4255734981555224, "grad_norm": 36.84275436401367, "learning_rate": 1.7021546880343967e-06, "loss": 0.0711, "num_input_tokens_seen": 11727296, "step": 17420 }, { "epoch": 0.4256956489873696, "grad_norm": 27.168283462524414, "learning_rate": 1.7026432794254164e-06, "loss": 0.0998, "num_input_tokens_seen": 11730624, "step": 17425 }, { "epoch": 0.42581779981921675, "grad_norm": 1.0942851305007935, "learning_rate": 1.7031318708164362e-06, "loss": 0.1372, "num_input_tokens_seen": 11733888, "step": 17430 }, { "epoch": 0.42593995065106394, "grad_norm": 29.49009132385254, "learning_rate": 1.703620462207456e-06, "loss": 0.1501, "num_input_tokens_seen": 11737152, "step": 17435 }, { "epoch": 0.42606210148291107, "grad_norm": 37.950321197509766, "learning_rate": 1.7041090535984754e-06, "loss": 0.2104, "num_input_tokens_seen": 11740352, "step": 17440 }, { "epoch": 0.42618425231475826, "grad_norm": 5.7830915451049805, "learning_rate": 1.7045976449894953e-06, "loss": 0.1267, "num_input_tokens_seen": 11744320, "step": 17445 }, { "epoch": 0.42630640314660545, "grad_norm": 27.485319137573242, "learning_rate": 1.705086236380515e-06, "loss": 0.0531, "num_input_tokens_seen": 11747328, "step": 17450 }, { "epoch": 0.4264285539784526, "grad_norm": 10.673401832580566, "learning_rate": 1.7055748277715344e-06, "loss": 0.1527, "num_input_tokens_seen": 11750272, "step": 17455 }, { "epoch": 0.4265507048102998, "grad_norm": 47.06698226928711, "learning_rate": 1.7060634191625543e-06, "loss": 0.1504, "num_input_tokens_seen": 11753856, "step": 17460 }, { "epoch": 0.4266728556421469, "grad_norm": 5.912046909332275, "learning_rate": 1.706552010553574e-06, "loss": 0.0409, "num_input_tokens_seen": 11756928, "step": 17465 }, { "epoch": 0.4267950064739941, "grad_norm": 1.2672039270401, "learning_rate": 1.7070406019445937e-06, "loss": 0.2459, "num_input_tokens_seen": 11760128, "step": 17470 }, { "epoch": 0.42691715730584123, "grad_norm": 15.835184097290039, "learning_rate": 1.7075291933356134e-06, "loss": 0.0789, "num_input_tokens_seen": 11763584, "step": 17475 }, { "epoch": 0.4270393081376884, "grad_norm": 10.859064102172852, "learning_rate": 1.708017784726633e-06, "loss": 0.1138, "num_input_tokens_seen": 11766912, "step": 17480 }, { "epoch": 0.42716145896953556, "grad_norm": 22.323833465576172, "learning_rate": 1.7085063761176527e-06, "loss": 0.0573, "num_input_tokens_seen": 11771008, "step": 17485 }, { "epoch": 0.42728360980138275, "grad_norm": 8.541205406188965, "learning_rate": 1.7089949675086724e-06, "loss": 0.1585, "num_input_tokens_seen": 11773952, "step": 17490 }, { "epoch": 0.42740576063322994, "grad_norm": 0.24411046504974365, "learning_rate": 1.7094835588996921e-06, "loss": 0.0502, "num_input_tokens_seen": 11777344, "step": 17495 }, { "epoch": 0.42752791146507707, "grad_norm": 1.567487120628357, "learning_rate": 1.7099721502907118e-06, "loss": 0.1001, "num_input_tokens_seen": 11780928, "step": 17500 }, { "epoch": 0.42765006229692426, "grad_norm": 18.680326461791992, "learning_rate": 1.7104607416817317e-06, "loss": 0.1082, "num_input_tokens_seen": 11784512, "step": 17505 }, { "epoch": 0.4277722131287714, "grad_norm": 33.931095123291016, "learning_rate": 1.7109493330727512e-06, "loss": 0.1218, "num_input_tokens_seen": 11787776, "step": 17510 }, { "epoch": 0.4278943639606186, "grad_norm": 24.828832626342773, "learning_rate": 1.7114379244637708e-06, "loss": 0.1005, "num_input_tokens_seen": 11791360, "step": 17515 }, { "epoch": 0.4280165147924657, "grad_norm": 26.003156661987305, "learning_rate": 1.7119265158547907e-06, "loss": 0.0767, "num_input_tokens_seen": 11794560, "step": 17520 }, { "epoch": 0.4281386656243129, "grad_norm": 30.900161743164062, "learning_rate": 1.7124151072458102e-06, "loss": 0.1079, "num_input_tokens_seen": 11797632, "step": 17525 }, { "epoch": 0.42826081645616004, "grad_norm": 14.616107940673828, "learning_rate": 1.7129036986368299e-06, "loss": 0.1193, "num_input_tokens_seen": 11801024, "step": 17530 }, { "epoch": 0.42838296728800723, "grad_norm": 31.8143367767334, "learning_rate": 1.7133922900278498e-06, "loss": 0.157, "num_input_tokens_seen": 11804736, "step": 17535 }, { "epoch": 0.4285051181198544, "grad_norm": 26.70197105407715, "learning_rate": 1.7138808814188692e-06, "loss": 0.235, "num_input_tokens_seen": 11808512, "step": 17540 }, { "epoch": 0.42862726895170156, "grad_norm": 15.07053279876709, "learning_rate": 1.714369472809889e-06, "loss": 0.1438, "num_input_tokens_seen": 11811584, "step": 17545 }, { "epoch": 0.42874941978354875, "grad_norm": 0.40839216113090515, "learning_rate": 1.7148580642009088e-06, "loss": 0.1372, "num_input_tokens_seen": 11814848, "step": 17550 }, { "epoch": 0.4288715706153959, "grad_norm": 19.45197105407715, "learning_rate": 1.7153466555919285e-06, "loss": 0.0802, "num_input_tokens_seen": 11818368, "step": 17555 }, { "epoch": 0.42899372144724307, "grad_norm": 0.8354930877685547, "learning_rate": 1.715835246982948e-06, "loss": 0.133, "num_input_tokens_seen": 11821696, "step": 17560 }, { "epoch": 0.4291158722790902, "grad_norm": 40.36201095581055, "learning_rate": 1.7163238383739679e-06, "loss": 0.1478, "num_input_tokens_seen": 11825088, "step": 17565 }, { "epoch": 0.4292380231109374, "grad_norm": 0.6440602540969849, "learning_rate": 1.7168124297649875e-06, "loss": 0.0598, "num_input_tokens_seen": 11828480, "step": 17570 }, { "epoch": 0.42936017394278453, "grad_norm": 36.08130645751953, "learning_rate": 1.717301021156007e-06, "loss": 0.1228, "num_input_tokens_seen": 11831936, "step": 17575 }, { "epoch": 0.4294823247746317, "grad_norm": 23.151386260986328, "learning_rate": 1.717789612547027e-06, "loss": 0.0906, "num_input_tokens_seen": 11835328, "step": 17580 }, { "epoch": 0.42960447560647885, "grad_norm": 0.3527524173259735, "learning_rate": 1.7182782039380466e-06, "loss": 0.078, "num_input_tokens_seen": 11838720, "step": 17585 }, { "epoch": 0.42972662643832604, "grad_norm": 1.2807598114013672, "learning_rate": 1.7187667953290663e-06, "loss": 0.0597, "num_input_tokens_seen": 11842112, "step": 17590 }, { "epoch": 0.42984877727017323, "grad_norm": 36.731380462646484, "learning_rate": 1.719255386720086e-06, "loss": 0.2555, "num_input_tokens_seen": 11845760, "step": 17595 }, { "epoch": 0.42997092810202037, "grad_norm": 1.584189534187317, "learning_rate": 1.7197439781111056e-06, "loss": 0.2048, "num_input_tokens_seen": 11849088, "step": 17600 }, { "epoch": 0.43009307893386756, "grad_norm": 35.07202911376953, "learning_rate": 1.7202325695021253e-06, "loss": 0.1001, "num_input_tokens_seen": 11852160, "step": 17605 }, { "epoch": 0.4302152297657147, "grad_norm": 3.500735282897949, "learning_rate": 1.720721160893145e-06, "loss": 0.0159, "num_input_tokens_seen": 11855232, "step": 17610 }, { "epoch": 0.4303373805975619, "grad_norm": 33.144378662109375, "learning_rate": 1.7212097522841647e-06, "loss": 0.149, "num_input_tokens_seen": 11858432, "step": 17615 }, { "epoch": 0.430459531429409, "grad_norm": 2.410709857940674, "learning_rate": 1.7216983436751844e-06, "loss": 0.1561, "num_input_tokens_seen": 11862336, "step": 17620 }, { "epoch": 0.4305816822612562, "grad_norm": 45.39206314086914, "learning_rate": 1.7221869350662042e-06, "loss": 0.1548, "num_input_tokens_seen": 11865344, "step": 17625 }, { "epoch": 0.43070383309310334, "grad_norm": 10.103440284729004, "learning_rate": 1.7226755264572237e-06, "loss": 0.1538, "num_input_tokens_seen": 11868672, "step": 17630 }, { "epoch": 0.43082598392495053, "grad_norm": 66.29302978515625, "learning_rate": 1.7231641178482434e-06, "loss": 0.1825, "num_input_tokens_seen": 11871872, "step": 17635 }, { "epoch": 0.4309481347567977, "grad_norm": 16.076980590820312, "learning_rate": 1.7236527092392633e-06, "loss": 0.169, "num_input_tokens_seen": 11875328, "step": 17640 }, { "epoch": 0.43107028558864485, "grad_norm": 20.99383544921875, "learning_rate": 1.7241413006302828e-06, "loss": 0.1005, "num_input_tokens_seen": 11878592, "step": 17645 }, { "epoch": 0.43119243642049204, "grad_norm": 2.398129940032959, "learning_rate": 1.7246298920213024e-06, "loss": 0.0513, "num_input_tokens_seen": 11881984, "step": 17650 }, { "epoch": 0.4313145872523392, "grad_norm": 38.867679595947266, "learning_rate": 1.7251184834123223e-06, "loss": 0.1343, "num_input_tokens_seen": 11884992, "step": 17655 }, { "epoch": 0.43143673808418637, "grad_norm": 13.77950668334961, "learning_rate": 1.7256070748033418e-06, "loss": 0.1457, "num_input_tokens_seen": 11888384, "step": 17660 }, { "epoch": 0.4315588889160335, "grad_norm": 16.17148208618164, "learning_rate": 1.7260956661943615e-06, "loss": 0.1399, "num_input_tokens_seen": 11891584, "step": 17665 }, { "epoch": 0.4316810397478807, "grad_norm": 22.251768112182617, "learning_rate": 1.7265842575853814e-06, "loss": 0.1125, "num_input_tokens_seen": 11894656, "step": 17670 }, { "epoch": 0.4318031905797278, "grad_norm": 23.052682876586914, "learning_rate": 1.727072848976401e-06, "loss": 0.1155, "num_input_tokens_seen": 11897600, "step": 17675 }, { "epoch": 0.431925341411575, "grad_norm": 1.7083724737167358, "learning_rate": 1.7275614403674205e-06, "loss": 0.0801, "num_input_tokens_seen": 11900864, "step": 17680 }, { "epoch": 0.4320474922434222, "grad_norm": 19.45560646057129, "learning_rate": 1.7280500317584404e-06, "loss": 0.2049, "num_input_tokens_seen": 11903744, "step": 17685 }, { "epoch": 0.43216964307526934, "grad_norm": 30.353389739990234, "learning_rate": 1.72853862314946e-06, "loss": 0.0588, "num_input_tokens_seen": 11907456, "step": 17690 }, { "epoch": 0.43229179390711653, "grad_norm": 1.1827892065048218, "learning_rate": 1.7290272145404796e-06, "loss": 0.1682, "num_input_tokens_seen": 11910912, "step": 17695 }, { "epoch": 0.43241394473896366, "grad_norm": 36.37336349487305, "learning_rate": 1.7295158059314995e-06, "loss": 0.1615, "num_input_tokens_seen": 11914176, "step": 17700 }, { "epoch": 0.43253609557081085, "grad_norm": 22.18280601501465, "learning_rate": 1.7300043973225191e-06, "loss": 0.1076, "num_input_tokens_seen": 11917184, "step": 17705 }, { "epoch": 0.432658246402658, "grad_norm": 0.602412223815918, "learning_rate": 1.7304929887135388e-06, "loss": 0.1363, "num_input_tokens_seen": 11920192, "step": 17710 }, { "epoch": 0.4327803972345052, "grad_norm": 33.112857818603516, "learning_rate": 1.7309815801045585e-06, "loss": 0.134, "num_input_tokens_seen": 11923392, "step": 17715 }, { "epoch": 0.4329025480663523, "grad_norm": 14.708128929138184, "learning_rate": 1.7314701714955782e-06, "loss": 0.1268, "num_input_tokens_seen": 11926656, "step": 17720 }, { "epoch": 0.4330246988981995, "grad_norm": 0.573269784450531, "learning_rate": 1.7319587628865979e-06, "loss": 0.121, "num_input_tokens_seen": 11930432, "step": 17725 }, { "epoch": 0.43314684973004663, "grad_norm": 18.703655242919922, "learning_rate": 1.7324473542776176e-06, "loss": 0.0776, "num_input_tokens_seen": 11933952, "step": 17730 }, { "epoch": 0.4332690005618938, "grad_norm": 11.47458267211914, "learning_rate": 1.7329359456686372e-06, "loss": 0.1244, "num_input_tokens_seen": 11937088, "step": 17735 }, { "epoch": 0.433391151393741, "grad_norm": 18.38603973388672, "learning_rate": 1.733424537059657e-06, "loss": 0.1057, "num_input_tokens_seen": 11940032, "step": 17740 }, { "epoch": 0.43351330222558815, "grad_norm": 17.476255416870117, "learning_rate": 1.7339131284506768e-06, "loss": 0.0861, "num_input_tokens_seen": 11943680, "step": 17745 }, { "epoch": 0.43363545305743534, "grad_norm": 0.9486721754074097, "learning_rate": 1.7344017198416963e-06, "loss": 0.0573, "num_input_tokens_seen": 11947520, "step": 17750 }, { "epoch": 0.4337576038892825, "grad_norm": 0.8889973163604736, "learning_rate": 1.734890311232716e-06, "loss": 0.0298, "num_input_tokens_seen": 11950720, "step": 17755 }, { "epoch": 0.43387975472112966, "grad_norm": 1.1656432151794434, "learning_rate": 1.7353789026237359e-06, "loss": 0.1699, "num_input_tokens_seen": 11954048, "step": 17760 }, { "epoch": 0.4340019055529768, "grad_norm": 4.165920257568359, "learning_rate": 1.7358674940147553e-06, "loss": 0.0999, "num_input_tokens_seen": 11957248, "step": 17765 }, { "epoch": 0.434124056384824, "grad_norm": 46.445533752441406, "learning_rate": 1.736356085405775e-06, "loss": 0.1424, "num_input_tokens_seen": 11960640, "step": 17770 }, { "epoch": 0.4342462072166711, "grad_norm": 55.10757064819336, "learning_rate": 1.736844676796795e-06, "loss": 0.212, "num_input_tokens_seen": 11963968, "step": 17775 }, { "epoch": 0.4343683580485183, "grad_norm": 37.553836822509766, "learning_rate": 1.7373332681878146e-06, "loss": 0.2234, "num_input_tokens_seen": 11967104, "step": 17780 }, { "epoch": 0.4344905088803655, "grad_norm": 16.70302391052246, "learning_rate": 1.737821859578834e-06, "loss": 0.055, "num_input_tokens_seen": 11970304, "step": 17785 }, { "epoch": 0.43461265971221263, "grad_norm": 0.09657532721757889, "learning_rate": 1.738310450969854e-06, "loss": 0.1663, "num_input_tokens_seen": 11974144, "step": 17790 }, { "epoch": 0.4347348105440598, "grad_norm": 1.6909934282302856, "learning_rate": 1.7387990423608736e-06, "loss": 0.1038, "num_input_tokens_seen": 11977280, "step": 17795 }, { "epoch": 0.43485696137590696, "grad_norm": 16.23759651184082, "learning_rate": 1.739287633751893e-06, "loss": 0.0562, "num_input_tokens_seen": 11981184, "step": 17800 }, { "epoch": 0.43497911220775415, "grad_norm": 0.7810158133506775, "learning_rate": 1.739776225142913e-06, "loss": 0.028, "num_input_tokens_seen": 11984320, "step": 17805 }, { "epoch": 0.4351012630396013, "grad_norm": 17.658899307250977, "learning_rate": 1.7402648165339327e-06, "loss": 0.054, "num_input_tokens_seen": 11987712, "step": 17810 }, { "epoch": 0.4352234138714485, "grad_norm": 15.886202812194824, "learning_rate": 1.7407534079249521e-06, "loss": 0.1537, "num_input_tokens_seen": 11991104, "step": 17815 }, { "epoch": 0.4353455647032956, "grad_norm": 32.44907760620117, "learning_rate": 1.741241999315972e-06, "loss": 0.08, "num_input_tokens_seen": 11994688, "step": 17820 }, { "epoch": 0.4354677155351428, "grad_norm": 7.900591850280762, "learning_rate": 1.7417305907069917e-06, "loss": 0.0853, "num_input_tokens_seen": 11998400, "step": 17825 }, { "epoch": 0.43558986636699, "grad_norm": 20.13414192199707, "learning_rate": 1.7422191820980114e-06, "loss": 0.2011, "num_input_tokens_seen": 12001280, "step": 17830 }, { "epoch": 0.4357120171988371, "grad_norm": 16.002933502197266, "learning_rate": 1.742707773489031e-06, "loss": 0.0902, "num_input_tokens_seen": 12004480, "step": 17835 }, { "epoch": 0.4358341680306843, "grad_norm": 12.876802444458008, "learning_rate": 1.7431963648800508e-06, "loss": 0.2976, "num_input_tokens_seen": 12008128, "step": 17840 }, { "epoch": 0.43595631886253144, "grad_norm": 24.299243927001953, "learning_rate": 1.7436849562710704e-06, "loss": 0.0274, "num_input_tokens_seen": 12011456, "step": 17845 }, { "epoch": 0.43607846969437863, "grad_norm": 40.15287399291992, "learning_rate": 1.7441735476620901e-06, "loss": 0.1043, "num_input_tokens_seen": 12015040, "step": 17850 }, { "epoch": 0.43620062052622577, "grad_norm": 5.354029655456543, "learning_rate": 1.7446621390531098e-06, "loss": 0.0902, "num_input_tokens_seen": 12018112, "step": 17855 }, { "epoch": 0.43632277135807296, "grad_norm": 16.343725204467773, "learning_rate": 1.7451507304441295e-06, "loss": 0.0827, "num_input_tokens_seen": 12021504, "step": 17860 }, { "epoch": 0.4364449221899201, "grad_norm": 19.908218383789062, "learning_rate": 1.7456393218351494e-06, "loss": 0.2358, "num_input_tokens_seen": 12024704, "step": 17865 }, { "epoch": 0.4365670730217673, "grad_norm": 20.10467529296875, "learning_rate": 1.7461279132261688e-06, "loss": 0.1437, "num_input_tokens_seen": 12028544, "step": 17870 }, { "epoch": 0.4366892238536144, "grad_norm": 0.4849933981895447, "learning_rate": 1.7466165046171885e-06, "loss": 0.1021, "num_input_tokens_seen": 12032192, "step": 17875 }, { "epoch": 0.4368113746854616, "grad_norm": 24.99449920654297, "learning_rate": 1.7471050960082084e-06, "loss": 0.1488, "num_input_tokens_seen": 12035520, "step": 17880 }, { "epoch": 0.4369335255173088, "grad_norm": 46.49135208129883, "learning_rate": 1.7475936873992279e-06, "loss": 0.2402, "num_input_tokens_seen": 12038848, "step": 17885 }, { "epoch": 0.43705567634915593, "grad_norm": 29.723981857299805, "learning_rate": 1.7480822787902476e-06, "loss": 0.084, "num_input_tokens_seen": 12042240, "step": 17890 }, { "epoch": 0.4371778271810031, "grad_norm": 2.592606544494629, "learning_rate": 1.7485708701812675e-06, "loss": 0.0383, "num_input_tokens_seen": 12046400, "step": 17895 }, { "epoch": 0.43729997801285025, "grad_norm": 33.512306213378906, "learning_rate": 1.7490594615722871e-06, "loss": 0.0683, "num_input_tokens_seen": 12050048, "step": 17900 }, { "epoch": 0.43742212884469744, "grad_norm": 25.43195915222168, "learning_rate": 1.7495480529633066e-06, "loss": 0.1424, "num_input_tokens_seen": 12053120, "step": 17905 }, { "epoch": 0.4375442796765446, "grad_norm": 0.5998150110244751, "learning_rate": 1.7500366443543265e-06, "loss": 0.0343, "num_input_tokens_seen": 12056704, "step": 17910 }, { "epoch": 0.43766643050839177, "grad_norm": 2.1911861896514893, "learning_rate": 1.7505252357453462e-06, "loss": 0.0722, "num_input_tokens_seen": 12059712, "step": 17915 }, { "epoch": 0.4377885813402389, "grad_norm": 28.80990219116211, "learning_rate": 1.7510138271363657e-06, "loss": 0.0684, "num_input_tokens_seen": 12063168, "step": 17920 }, { "epoch": 0.4379107321720861, "grad_norm": 8.22693157196045, "learning_rate": 1.7515024185273855e-06, "loss": 0.0728, "num_input_tokens_seen": 12066560, "step": 17925 }, { "epoch": 0.4380328830039333, "grad_norm": 3.5628767013549805, "learning_rate": 1.7519910099184052e-06, "loss": 0.1294, "num_input_tokens_seen": 12069760, "step": 17930 }, { "epoch": 0.4381550338357804, "grad_norm": 9.691713333129883, "learning_rate": 1.7524796013094247e-06, "loss": 0.1249, "num_input_tokens_seen": 12073088, "step": 17935 }, { "epoch": 0.4382771846676276, "grad_norm": 17.05838394165039, "learning_rate": 1.7529681927004446e-06, "loss": 0.1269, "num_input_tokens_seen": 12076096, "step": 17940 }, { "epoch": 0.43839933549947474, "grad_norm": 3.224658966064453, "learning_rate": 1.7534567840914643e-06, "loss": 0.0412, "num_input_tokens_seen": 12079680, "step": 17945 }, { "epoch": 0.43852148633132193, "grad_norm": 15.042247772216797, "learning_rate": 1.753945375482484e-06, "loss": 0.0797, "num_input_tokens_seen": 12082880, "step": 17950 }, { "epoch": 0.43864363716316906, "grad_norm": 21.121103286743164, "learning_rate": 1.7544339668735036e-06, "loss": 0.0568, "num_input_tokens_seen": 12085888, "step": 17955 }, { "epoch": 0.43876578799501625, "grad_norm": 12.655285835266113, "learning_rate": 1.7549225582645233e-06, "loss": 0.1215, "num_input_tokens_seen": 12089408, "step": 17960 }, { "epoch": 0.4388879388268634, "grad_norm": 2.449303150177002, "learning_rate": 1.755411149655543e-06, "loss": 0.0904, "num_input_tokens_seen": 12092160, "step": 17965 }, { "epoch": 0.4390100896587106, "grad_norm": 18.23929786682129, "learning_rate": 1.7558997410465627e-06, "loss": 0.1017, "num_input_tokens_seen": 12095616, "step": 17970 }, { "epoch": 0.43913224049055777, "grad_norm": 6.55383825302124, "learning_rate": 1.7563883324375824e-06, "loss": 0.1135, "num_input_tokens_seen": 12098496, "step": 17975 }, { "epoch": 0.4392543913224049, "grad_norm": 20.092472076416016, "learning_rate": 1.756876923828602e-06, "loss": 0.1535, "num_input_tokens_seen": 12101824, "step": 17980 }, { "epoch": 0.4393765421542521, "grad_norm": 15.990132331848145, "learning_rate": 1.757365515219622e-06, "loss": 0.0734, "num_input_tokens_seen": 12105152, "step": 17985 }, { "epoch": 0.4394986929860992, "grad_norm": 0.2111085206270218, "learning_rate": 1.7578541066106414e-06, "loss": 0.1458, "num_input_tokens_seen": 12108992, "step": 17990 }, { "epoch": 0.4396208438179464, "grad_norm": 12.769996643066406, "learning_rate": 1.758342698001661e-06, "loss": 0.1572, "num_input_tokens_seen": 12112192, "step": 17995 }, { "epoch": 0.43974299464979355, "grad_norm": 25.63375473022461, "learning_rate": 1.758831289392681e-06, "loss": 0.1239, "num_input_tokens_seen": 12115904, "step": 18000 }, { "epoch": 0.43986514548164074, "grad_norm": 36.48350524902344, "learning_rate": 1.7593198807837004e-06, "loss": 0.1095, "num_input_tokens_seen": 12119744, "step": 18005 }, { "epoch": 0.4399872963134879, "grad_norm": 11.921119689941406, "learning_rate": 1.7598084721747201e-06, "loss": 0.0698, "num_input_tokens_seen": 12122880, "step": 18010 }, { "epoch": 0.44010944714533506, "grad_norm": 27.85514259338379, "learning_rate": 1.76029706356574e-06, "loss": 0.1874, "num_input_tokens_seen": 12126080, "step": 18015 }, { "epoch": 0.4402315979771822, "grad_norm": 13.67387866973877, "learning_rate": 1.7607856549567597e-06, "loss": 0.0898, "num_input_tokens_seen": 12129536, "step": 18020 }, { "epoch": 0.4403537488090294, "grad_norm": 9.061943054199219, "learning_rate": 1.7612742463477792e-06, "loss": 0.0822, "num_input_tokens_seen": 12132992, "step": 18025 }, { "epoch": 0.4404758996408766, "grad_norm": 40.795631408691406, "learning_rate": 1.761762837738799e-06, "loss": 0.1154, "num_input_tokens_seen": 12136320, "step": 18030 }, { "epoch": 0.4405980504727237, "grad_norm": 17.49411964416504, "learning_rate": 1.7622514291298187e-06, "loss": 0.1134, "num_input_tokens_seen": 12139712, "step": 18035 }, { "epoch": 0.4407202013045709, "grad_norm": 4.147327423095703, "learning_rate": 1.7627400205208382e-06, "loss": 0.1963, "num_input_tokens_seen": 12142656, "step": 18040 }, { "epoch": 0.44084235213641804, "grad_norm": 57.400169372558594, "learning_rate": 1.7632286119118581e-06, "loss": 0.2346, "num_input_tokens_seen": 12145984, "step": 18045 }, { "epoch": 0.4409645029682652, "grad_norm": 15.839056968688965, "learning_rate": 1.7637172033028778e-06, "loss": 0.128, "num_input_tokens_seen": 12149568, "step": 18050 }, { "epoch": 0.44108665380011236, "grad_norm": 14.90942668914795, "learning_rate": 1.7642057946938975e-06, "loss": 0.054, "num_input_tokens_seen": 12153216, "step": 18055 }, { "epoch": 0.44120880463195955, "grad_norm": 10.704864501953125, "learning_rate": 1.7646943860849172e-06, "loss": 0.1968, "num_input_tokens_seen": 12156992, "step": 18060 }, { "epoch": 0.4413309554638067, "grad_norm": 15.071078300476074, "learning_rate": 1.7651829774759368e-06, "loss": 0.0838, "num_input_tokens_seen": 12160448, "step": 18065 }, { "epoch": 0.4414531062956539, "grad_norm": 15.569796562194824, "learning_rate": 1.7656715688669565e-06, "loss": 0.1253, "num_input_tokens_seen": 12163520, "step": 18070 }, { "epoch": 0.44157525712750106, "grad_norm": 9.158796310424805, "learning_rate": 1.7661601602579762e-06, "loss": 0.0315, "num_input_tokens_seen": 12167488, "step": 18075 }, { "epoch": 0.4416974079593482, "grad_norm": 16.890132904052734, "learning_rate": 1.7666487516489959e-06, "loss": 0.0407, "num_input_tokens_seen": 12170944, "step": 18080 }, { "epoch": 0.4418195587911954, "grad_norm": 9.779267311096191, "learning_rate": 1.7671373430400156e-06, "loss": 0.1787, "num_input_tokens_seen": 12174336, "step": 18085 }, { "epoch": 0.4419417096230425, "grad_norm": 5.361752033233643, "learning_rate": 1.7676259344310352e-06, "loss": 0.0386, "num_input_tokens_seen": 12177472, "step": 18090 }, { "epoch": 0.4420638604548897, "grad_norm": 21.10449981689453, "learning_rate": 1.768114525822055e-06, "loss": 0.1516, "num_input_tokens_seen": 12180480, "step": 18095 }, { "epoch": 0.44218601128673685, "grad_norm": 2.0351269245147705, "learning_rate": 1.7686031172130746e-06, "loss": 0.1046, "num_input_tokens_seen": 12183808, "step": 18100 }, { "epoch": 0.44230816211858404, "grad_norm": 44.6690673828125, "learning_rate": 1.7690917086040945e-06, "loss": 0.2344, "num_input_tokens_seen": 12187008, "step": 18105 }, { "epoch": 0.44243031295043117, "grad_norm": 1.5464696884155273, "learning_rate": 1.769580299995114e-06, "loss": 0.0702, "num_input_tokens_seen": 12190464, "step": 18110 }, { "epoch": 0.44255246378227836, "grad_norm": 0.9766054749488831, "learning_rate": 1.7700688913861336e-06, "loss": 0.2018, "num_input_tokens_seen": 12194624, "step": 18115 }, { "epoch": 0.4426746146141255, "grad_norm": 22.986719131469727, "learning_rate": 1.7705574827771535e-06, "loss": 0.2166, "num_input_tokens_seen": 12198016, "step": 18120 }, { "epoch": 0.4427967654459727, "grad_norm": 1.6592448949813843, "learning_rate": 1.771046074168173e-06, "loss": 0.0513, "num_input_tokens_seen": 12201088, "step": 18125 }, { "epoch": 0.4429189162778199, "grad_norm": 10.565286636352539, "learning_rate": 1.7715346655591927e-06, "loss": 0.1119, "num_input_tokens_seen": 12204288, "step": 18130 }, { "epoch": 0.443041067109667, "grad_norm": 37.32499313354492, "learning_rate": 1.7720232569502126e-06, "loss": 0.2426, "num_input_tokens_seen": 12207488, "step": 18135 }, { "epoch": 0.4431632179415142, "grad_norm": 27.269149780273438, "learning_rate": 1.7725118483412323e-06, "loss": 0.2005, "num_input_tokens_seen": 12210560, "step": 18140 }, { "epoch": 0.44328536877336133, "grad_norm": 0.24170862138271332, "learning_rate": 1.7730004397322517e-06, "loss": 0.0823, "num_input_tokens_seen": 12213824, "step": 18145 }, { "epoch": 0.4434075196052085, "grad_norm": 24.47719383239746, "learning_rate": 1.7734890311232716e-06, "loss": 0.0994, "num_input_tokens_seen": 12217280, "step": 18150 }, { "epoch": 0.44352967043705566, "grad_norm": 10.670074462890625, "learning_rate": 1.7739776225142913e-06, "loss": 0.1956, "num_input_tokens_seen": 12220544, "step": 18155 }, { "epoch": 0.44365182126890285, "grad_norm": 19.86406135559082, "learning_rate": 1.7744662139053108e-06, "loss": 0.109, "num_input_tokens_seen": 12223616, "step": 18160 }, { "epoch": 0.44377397210075, "grad_norm": 13.4468412399292, "learning_rate": 1.7749548052963307e-06, "loss": 0.168, "num_input_tokens_seen": 12227136, "step": 18165 }, { "epoch": 0.44389612293259717, "grad_norm": 15.348857879638672, "learning_rate": 1.7754433966873504e-06, "loss": 0.1427, "num_input_tokens_seen": 12229952, "step": 18170 }, { "epoch": 0.44401827376444436, "grad_norm": 27.562042236328125, "learning_rate": 1.77593198807837e-06, "loss": 0.0636, "num_input_tokens_seen": 12233472, "step": 18175 }, { "epoch": 0.4441404245962915, "grad_norm": 3.60552716255188, "learning_rate": 1.7764205794693897e-06, "loss": 0.1436, "num_input_tokens_seen": 12236864, "step": 18180 }, { "epoch": 0.4442625754281387, "grad_norm": 43.09302520751953, "learning_rate": 1.7769091708604094e-06, "loss": 0.1426, "num_input_tokens_seen": 12240192, "step": 18185 }, { "epoch": 0.4443847262599858, "grad_norm": 9.167168617248535, "learning_rate": 1.777397762251429e-06, "loss": 0.1146, "num_input_tokens_seen": 12243328, "step": 18190 }, { "epoch": 0.444506877091833, "grad_norm": 11.480422019958496, "learning_rate": 1.7778863536424485e-06, "loss": 0.0868, "num_input_tokens_seen": 12246656, "step": 18195 }, { "epoch": 0.44462902792368014, "grad_norm": 24.931589126586914, "learning_rate": 1.7783749450334684e-06, "loss": 0.089, "num_input_tokens_seen": 12250176, "step": 18200 }, { "epoch": 0.44475117875552733, "grad_norm": 0.5531277060508728, "learning_rate": 1.7788635364244881e-06, "loss": 0.2103, "num_input_tokens_seen": 12253376, "step": 18205 }, { "epoch": 0.44487332958737447, "grad_norm": 4.852066516876221, "learning_rate": 1.779352127815508e-06, "loss": 0.1752, "num_input_tokens_seen": 12256512, "step": 18210 }, { "epoch": 0.44499548041922166, "grad_norm": 17.739526748657227, "learning_rate": 1.7798407192065275e-06, "loss": 0.0764, "num_input_tokens_seen": 12260032, "step": 18215 }, { "epoch": 0.44511763125106885, "grad_norm": 0.7192504405975342, "learning_rate": 1.7803293105975472e-06, "loss": 0.0446, "num_input_tokens_seen": 12263360, "step": 18220 }, { "epoch": 0.445239782082916, "grad_norm": 21.309751510620117, "learning_rate": 1.780817901988567e-06, "loss": 0.238, "num_input_tokens_seen": 12267008, "step": 18225 }, { "epoch": 0.44536193291476317, "grad_norm": 29.178531646728516, "learning_rate": 1.7813064933795865e-06, "loss": 0.1511, "num_input_tokens_seen": 12270400, "step": 18230 }, { "epoch": 0.4454840837466103, "grad_norm": 16.01999282836914, "learning_rate": 1.7817950847706062e-06, "loss": 0.0897, "num_input_tokens_seen": 12273792, "step": 18235 }, { "epoch": 0.4456062345784575, "grad_norm": 13.02960205078125, "learning_rate": 1.782283676161626e-06, "loss": 0.1574, "num_input_tokens_seen": 12276992, "step": 18240 }, { "epoch": 0.44572838541030463, "grad_norm": 5.595557689666748, "learning_rate": 1.7827722675526456e-06, "loss": 0.1676, "num_input_tokens_seen": 12280192, "step": 18245 }, { "epoch": 0.4458505362421518, "grad_norm": 17.469158172607422, "learning_rate": 1.7832608589436653e-06, "loss": 0.1209, "num_input_tokens_seen": 12283264, "step": 18250 }, { "epoch": 0.44597268707399895, "grad_norm": 22.485258102416992, "learning_rate": 1.7837494503346851e-06, "loss": 0.0612, "num_input_tokens_seen": 12286528, "step": 18255 }, { "epoch": 0.44609483790584614, "grad_norm": 18.569324493408203, "learning_rate": 1.7842380417257048e-06, "loss": 0.106, "num_input_tokens_seen": 12289856, "step": 18260 }, { "epoch": 0.4462169887376933, "grad_norm": 16.557361602783203, "learning_rate": 1.7847266331167243e-06, "loss": 0.08, "num_input_tokens_seen": 12292928, "step": 18265 }, { "epoch": 0.44633913956954047, "grad_norm": 13.699399948120117, "learning_rate": 1.7852152245077442e-06, "loss": 0.1225, "num_input_tokens_seen": 12296000, "step": 18270 }, { "epoch": 0.44646129040138766, "grad_norm": 20.304798126220703, "learning_rate": 1.7857038158987639e-06, "loss": 0.0658, "num_input_tokens_seen": 12299200, "step": 18275 }, { "epoch": 0.4465834412332348, "grad_norm": 8.509125709533691, "learning_rate": 1.7861924072897833e-06, "loss": 0.1892, "num_input_tokens_seen": 12302528, "step": 18280 }, { "epoch": 0.446705592065082, "grad_norm": 1.5570862293243408, "learning_rate": 1.7866809986808032e-06, "loss": 0.1002, "num_input_tokens_seen": 12305408, "step": 18285 }, { "epoch": 0.4468277428969291, "grad_norm": 4.416943073272705, "learning_rate": 1.787169590071823e-06, "loss": 0.0617, "num_input_tokens_seen": 12308608, "step": 18290 }, { "epoch": 0.4469498937287763, "grad_norm": 8.536765098571777, "learning_rate": 1.7876581814628426e-06, "loss": 0.071, "num_input_tokens_seen": 12312000, "step": 18295 }, { "epoch": 0.44707204456062344, "grad_norm": 20.15012550354004, "learning_rate": 1.788146772853862e-06, "loss": 0.1079, "num_input_tokens_seen": 12314944, "step": 18300 }, { "epoch": 0.44719419539247063, "grad_norm": 1.5687205791473389, "learning_rate": 1.788635364244882e-06, "loss": 0.095, "num_input_tokens_seen": 12318144, "step": 18305 }, { "epoch": 0.44731634622431776, "grad_norm": 12.047454833984375, "learning_rate": 1.7891239556359016e-06, "loss": 0.095, "num_input_tokens_seen": 12321344, "step": 18310 }, { "epoch": 0.44743849705616495, "grad_norm": 23.377124786376953, "learning_rate": 1.7896125470269211e-06, "loss": 0.1015, "num_input_tokens_seen": 12324672, "step": 18315 }, { "epoch": 0.44756064788801214, "grad_norm": 29.494647979736328, "learning_rate": 1.790101138417941e-06, "loss": 0.2122, "num_input_tokens_seen": 12327744, "step": 18320 }, { "epoch": 0.4476827987198593, "grad_norm": 0.5062751770019531, "learning_rate": 1.7905897298089607e-06, "loss": 0.1254, "num_input_tokens_seen": 12331136, "step": 18325 }, { "epoch": 0.44780494955170647, "grad_norm": 28.779891967773438, "learning_rate": 1.7910783211999806e-06, "loss": 0.1622, "num_input_tokens_seen": 12334528, "step": 18330 }, { "epoch": 0.4479271003835536, "grad_norm": 19.23775863647461, "learning_rate": 1.791566912591e-06, "loss": 0.2092, "num_input_tokens_seen": 12338112, "step": 18335 }, { "epoch": 0.4480492512154008, "grad_norm": 24.164121627807617, "learning_rate": 1.7920555039820197e-06, "loss": 0.0867, "num_input_tokens_seen": 12341632, "step": 18340 }, { "epoch": 0.4481714020472479, "grad_norm": 5.992556095123291, "learning_rate": 1.7925440953730396e-06, "loss": 0.0842, "num_input_tokens_seen": 12344576, "step": 18345 }, { "epoch": 0.4482935528790951, "grad_norm": 3.7169806957244873, "learning_rate": 1.793032686764059e-06, "loss": 0.0792, "num_input_tokens_seen": 12347776, "step": 18350 }, { "epoch": 0.44841570371094225, "grad_norm": 15.159074783325195, "learning_rate": 1.7935212781550788e-06, "loss": 0.1075, "num_input_tokens_seen": 12351296, "step": 18355 }, { "epoch": 0.44853785454278944, "grad_norm": 28.605384826660156, "learning_rate": 1.7940098695460987e-06, "loss": 0.1293, "num_input_tokens_seen": 12355008, "step": 18360 }, { "epoch": 0.44866000537463663, "grad_norm": 35.69192123413086, "learning_rate": 1.7944984609371181e-06, "loss": 0.0894, "num_input_tokens_seen": 12358272, "step": 18365 }, { "epoch": 0.44878215620648376, "grad_norm": 39.226112365722656, "learning_rate": 1.7949870523281378e-06, "loss": 0.1345, "num_input_tokens_seen": 12361408, "step": 18370 }, { "epoch": 0.44890430703833095, "grad_norm": 20.16849708557129, "learning_rate": 1.7954756437191577e-06, "loss": 0.1673, "num_input_tokens_seen": 12364800, "step": 18375 }, { "epoch": 0.4490264578701781, "grad_norm": 27.406200408935547, "learning_rate": 1.7959642351101774e-06, "loss": 0.0761, "num_input_tokens_seen": 12368064, "step": 18380 }, { "epoch": 0.4491486087020253, "grad_norm": 22.962316513061523, "learning_rate": 1.7964528265011969e-06, "loss": 0.2545, "num_input_tokens_seen": 12371584, "step": 18385 }, { "epoch": 0.4492707595338724, "grad_norm": 3.987178087234497, "learning_rate": 1.7969414178922165e-06, "loss": 0.0337, "num_input_tokens_seen": 12374592, "step": 18390 }, { "epoch": 0.4493929103657196, "grad_norm": 1.4741876125335693, "learning_rate": 1.7974300092832364e-06, "loss": 0.1143, "num_input_tokens_seen": 12377920, "step": 18395 }, { "epoch": 0.44951506119756673, "grad_norm": 0.08817076683044434, "learning_rate": 1.797918600674256e-06, "loss": 0.1046, "num_input_tokens_seen": 12380992, "step": 18400 }, { "epoch": 0.4496372120294139, "grad_norm": 12.917234420776367, "learning_rate": 1.7984071920652756e-06, "loss": 0.2225, "num_input_tokens_seen": 12384448, "step": 18405 }, { "epoch": 0.44975936286126106, "grad_norm": 7.490536689758301, "learning_rate": 1.7988957834562955e-06, "loss": 0.1052, "num_input_tokens_seen": 12387584, "step": 18410 }, { "epoch": 0.44988151369310825, "grad_norm": 34.98502731323242, "learning_rate": 1.7993843748473152e-06, "loss": 0.1601, "num_input_tokens_seen": 12391040, "step": 18415 }, { "epoch": 0.45000366452495544, "grad_norm": 45.478694915771484, "learning_rate": 1.7998729662383346e-06, "loss": 0.0576, "num_input_tokens_seen": 12394048, "step": 18420 }, { "epoch": 0.45012581535680257, "grad_norm": 0.9002415537834167, "learning_rate": 1.8003615576293545e-06, "loss": 0.1271, "num_input_tokens_seen": 12396928, "step": 18425 }, { "epoch": 0.45024796618864976, "grad_norm": 14.245929718017578, "learning_rate": 1.8008501490203742e-06, "loss": 0.042, "num_input_tokens_seen": 12400128, "step": 18430 }, { "epoch": 0.4503701170204969, "grad_norm": 21.265151977539062, "learning_rate": 1.8013387404113937e-06, "loss": 0.1996, "num_input_tokens_seen": 12403520, "step": 18435 }, { "epoch": 0.4504922678523441, "grad_norm": 27.13143539428711, "learning_rate": 1.8018273318024136e-06, "loss": 0.1522, "num_input_tokens_seen": 12406912, "step": 18440 }, { "epoch": 0.4506144186841912, "grad_norm": 25.90831184387207, "learning_rate": 1.8023159231934333e-06, "loss": 0.1372, "num_input_tokens_seen": 12410240, "step": 18445 }, { "epoch": 0.4507365695160384, "grad_norm": 27.729671478271484, "learning_rate": 1.8028045145844531e-06, "loss": 0.023, "num_input_tokens_seen": 12413120, "step": 18450 }, { "epoch": 0.45085872034788554, "grad_norm": 20.380538940429688, "learning_rate": 1.8032931059754726e-06, "loss": 0.1611, "num_input_tokens_seen": 12416256, "step": 18455 }, { "epoch": 0.45098087117973273, "grad_norm": 7.213114261627197, "learning_rate": 1.8037816973664923e-06, "loss": 0.0895, "num_input_tokens_seen": 12419584, "step": 18460 }, { "epoch": 0.4511030220115799, "grad_norm": 66.10938262939453, "learning_rate": 1.8042702887575122e-06, "loss": 0.0713, "num_input_tokens_seen": 12422976, "step": 18465 }, { "epoch": 0.45122517284342706, "grad_norm": 26.906949996948242, "learning_rate": 1.8047588801485317e-06, "loss": 0.1842, "num_input_tokens_seen": 12426432, "step": 18470 }, { "epoch": 0.45134732367527425, "grad_norm": 37.6614875793457, "learning_rate": 1.8052474715395513e-06, "loss": 0.122, "num_input_tokens_seen": 12429504, "step": 18475 }, { "epoch": 0.4514694745071214, "grad_norm": 5.725609302520752, "learning_rate": 1.805736062930571e-06, "loss": 0.0757, "num_input_tokens_seen": 12432576, "step": 18480 }, { "epoch": 0.45159162533896857, "grad_norm": 42.64928436279297, "learning_rate": 1.806224654321591e-06, "loss": 0.0932, "num_input_tokens_seen": 12435904, "step": 18485 }, { "epoch": 0.4517137761708157, "grad_norm": 1.5770378112792969, "learning_rate": 1.8067132457126104e-06, "loss": 0.0791, "num_input_tokens_seen": 12439872, "step": 18490 }, { "epoch": 0.4518359270026629, "grad_norm": 19.790578842163086, "learning_rate": 1.80720183710363e-06, "loss": 0.2214, "num_input_tokens_seen": 12443328, "step": 18495 }, { "epoch": 0.45195807783451003, "grad_norm": 5.598482608795166, "learning_rate": 1.80769042849465e-06, "loss": 0.1355, "num_input_tokens_seen": 12446656, "step": 18500 }, { "epoch": 0.4520802286663572, "grad_norm": 36.39820098876953, "learning_rate": 1.8081790198856694e-06, "loss": 0.1599, "num_input_tokens_seen": 12450112, "step": 18505 }, { "epoch": 0.4522023794982044, "grad_norm": 7.050886631011963, "learning_rate": 1.8086676112766891e-06, "loss": 0.1091, "num_input_tokens_seen": 12453568, "step": 18510 }, { "epoch": 0.45232453033005154, "grad_norm": 5.478260517120361, "learning_rate": 1.809156202667709e-06, "loss": 0.1155, "num_input_tokens_seen": 12457088, "step": 18515 }, { "epoch": 0.45244668116189873, "grad_norm": 4.716951370239258, "learning_rate": 1.8096447940587285e-06, "loss": 0.1302, "num_input_tokens_seen": 12460352, "step": 18520 }, { "epoch": 0.45256883199374587, "grad_norm": 10.715855598449707, "learning_rate": 1.8101333854497482e-06, "loss": 0.152, "num_input_tokens_seen": 12464512, "step": 18525 }, { "epoch": 0.45269098282559306, "grad_norm": 2.5258378982543945, "learning_rate": 1.810621976840768e-06, "loss": 0.0808, "num_input_tokens_seen": 12468160, "step": 18530 }, { "epoch": 0.4528131336574402, "grad_norm": 6.279117584228516, "learning_rate": 1.8111105682317877e-06, "loss": 0.1997, "num_input_tokens_seen": 12471744, "step": 18535 }, { "epoch": 0.4529352844892874, "grad_norm": 10.042743682861328, "learning_rate": 1.8115991596228072e-06, "loss": 0.0638, "num_input_tokens_seen": 12475584, "step": 18540 }, { "epoch": 0.4530574353211345, "grad_norm": 9.592670440673828, "learning_rate": 1.812087751013827e-06, "loss": 0.1659, "num_input_tokens_seen": 12479488, "step": 18545 }, { "epoch": 0.4531795861529817, "grad_norm": 17.198471069335938, "learning_rate": 1.8125763424048468e-06, "loss": 0.1573, "num_input_tokens_seen": 12482496, "step": 18550 }, { "epoch": 0.45330173698482884, "grad_norm": 6.23707914352417, "learning_rate": 1.8130649337958662e-06, "loss": 0.0731, "num_input_tokens_seen": 12486272, "step": 18555 }, { "epoch": 0.45342388781667603, "grad_norm": 29.733585357666016, "learning_rate": 1.8135535251868861e-06, "loss": 0.2805, "num_input_tokens_seen": 12489472, "step": 18560 }, { "epoch": 0.4535460386485232, "grad_norm": 21.276653289794922, "learning_rate": 1.8140421165779058e-06, "loss": 0.1171, "num_input_tokens_seen": 12492608, "step": 18565 }, { "epoch": 0.45366818948037035, "grad_norm": 2.3936610221862793, "learning_rate": 1.8145307079689255e-06, "loss": 0.1096, "num_input_tokens_seen": 12496256, "step": 18570 }, { "epoch": 0.45379034031221754, "grad_norm": 3.278949737548828, "learning_rate": 1.8150192993599452e-06, "loss": 0.1323, "num_input_tokens_seen": 12499712, "step": 18575 }, { "epoch": 0.4539124911440647, "grad_norm": 3.4298598766326904, "learning_rate": 1.8155078907509649e-06, "loss": 0.1229, "num_input_tokens_seen": 12503296, "step": 18580 }, { "epoch": 0.45403464197591187, "grad_norm": 15.00065803527832, "learning_rate": 1.8159964821419845e-06, "loss": 0.1623, "num_input_tokens_seen": 12507264, "step": 18585 }, { "epoch": 0.454156792807759, "grad_norm": 8.928850173950195, "learning_rate": 1.8164850735330042e-06, "loss": 0.0724, "num_input_tokens_seen": 12510528, "step": 18590 }, { "epoch": 0.4542789436396062, "grad_norm": 14.59353256225586, "learning_rate": 1.816973664924024e-06, "loss": 0.094, "num_input_tokens_seen": 12514304, "step": 18595 }, { "epoch": 0.4544010944714533, "grad_norm": 18.910980224609375, "learning_rate": 1.8174622563150436e-06, "loss": 0.1304, "num_input_tokens_seen": 12518528, "step": 18600 }, { "epoch": 0.4545232453033005, "grad_norm": 42.9864616394043, "learning_rate": 1.8179508477060635e-06, "loss": 0.1311, "num_input_tokens_seen": 12522240, "step": 18605 }, { "epoch": 0.4546453961351477, "grad_norm": 15.668207168579102, "learning_rate": 1.818439439097083e-06, "loss": 0.1302, "num_input_tokens_seen": 12525568, "step": 18610 }, { "epoch": 0.45476754696699484, "grad_norm": 0.8417151570320129, "learning_rate": 1.8189280304881026e-06, "loss": 0.0112, "num_input_tokens_seen": 12528768, "step": 18615 }, { "epoch": 0.45488969779884203, "grad_norm": 18.940181732177734, "learning_rate": 1.8194166218791225e-06, "loss": 0.0795, "num_input_tokens_seen": 12532224, "step": 18620 }, { "epoch": 0.45501184863068916, "grad_norm": 40.989173889160156, "learning_rate": 1.819905213270142e-06, "loss": 0.1319, "num_input_tokens_seen": 12535360, "step": 18625 }, { "epoch": 0.45513399946253635, "grad_norm": 4.111881256103516, "learning_rate": 1.8203938046611617e-06, "loss": 0.1793, "num_input_tokens_seen": 12538624, "step": 18630 }, { "epoch": 0.4552561502943835, "grad_norm": 6.590780735015869, "learning_rate": 1.8208823960521816e-06, "loss": 0.1235, "num_input_tokens_seen": 12542080, "step": 18635 }, { "epoch": 0.4553783011262307, "grad_norm": 41.6611213684082, "learning_rate": 1.8213709874432012e-06, "loss": 0.1727, "num_input_tokens_seen": 12546048, "step": 18640 }, { "epoch": 0.4555004519580778, "grad_norm": 3.607111692428589, "learning_rate": 1.8218595788342207e-06, "loss": 0.2543, "num_input_tokens_seen": 12549248, "step": 18645 }, { "epoch": 0.455622602789925, "grad_norm": 37.54118347167969, "learning_rate": 1.8223481702252406e-06, "loss": 0.2284, "num_input_tokens_seen": 12552640, "step": 18650 }, { "epoch": 0.4557447536217722, "grad_norm": 18.619918823242188, "learning_rate": 1.8228367616162603e-06, "loss": 0.0855, "num_input_tokens_seen": 12555968, "step": 18655 }, { "epoch": 0.4558669044536193, "grad_norm": 3.5834906101226807, "learning_rate": 1.8233253530072798e-06, "loss": 0.0782, "num_input_tokens_seen": 12559296, "step": 18660 }, { "epoch": 0.4559890552854665, "grad_norm": 16.49788475036621, "learning_rate": 1.8238139443982997e-06, "loss": 0.1598, "num_input_tokens_seen": 12562368, "step": 18665 }, { "epoch": 0.45611120611731365, "grad_norm": 1.2870237827301025, "learning_rate": 1.8243025357893193e-06, "loss": 0.0274, "num_input_tokens_seen": 12565696, "step": 18670 }, { "epoch": 0.45623335694916084, "grad_norm": 1.1009882688522339, "learning_rate": 1.8247911271803388e-06, "loss": 0.0718, "num_input_tokens_seen": 12569408, "step": 18675 }, { "epoch": 0.456355507781008, "grad_norm": 0.1918848603963852, "learning_rate": 1.8252797185713587e-06, "loss": 0.1147, "num_input_tokens_seen": 12572864, "step": 18680 }, { "epoch": 0.45647765861285516, "grad_norm": 47.69165802001953, "learning_rate": 1.8257683099623784e-06, "loss": 0.1923, "num_input_tokens_seen": 12576448, "step": 18685 }, { "epoch": 0.4565998094447023, "grad_norm": 0.2405889481306076, "learning_rate": 1.826256901353398e-06, "loss": 0.3076, "num_input_tokens_seen": 12579584, "step": 18690 }, { "epoch": 0.4567219602765495, "grad_norm": 39.91948699951172, "learning_rate": 1.8267454927444177e-06, "loss": 0.1864, "num_input_tokens_seen": 12582976, "step": 18695 }, { "epoch": 0.4568441111083966, "grad_norm": 18.106645584106445, "learning_rate": 1.8272340841354374e-06, "loss": 0.1133, "num_input_tokens_seen": 12586432, "step": 18700 }, { "epoch": 0.4569662619402438, "grad_norm": 14.690360069274902, "learning_rate": 1.827722675526457e-06, "loss": 0.1829, "num_input_tokens_seen": 12589184, "step": 18705 }, { "epoch": 0.457088412772091, "grad_norm": 0.1765415072441101, "learning_rate": 1.8282112669174768e-06, "loss": 0.1619, "num_input_tokens_seen": 12592768, "step": 18710 }, { "epoch": 0.45721056360393814, "grad_norm": 17.927274703979492, "learning_rate": 1.8286998583084965e-06, "loss": 0.1941, "num_input_tokens_seen": 12595840, "step": 18715 }, { "epoch": 0.4573327144357853, "grad_norm": 20.598888397216797, "learning_rate": 1.8291884496995161e-06, "loss": 0.1251, "num_input_tokens_seen": 12599168, "step": 18720 }, { "epoch": 0.45745486526763246, "grad_norm": 5.029316425323486, "learning_rate": 1.829677041090536e-06, "loss": 0.1294, "num_input_tokens_seen": 12602560, "step": 18725 }, { "epoch": 0.45757701609947965, "grad_norm": 32.626529693603516, "learning_rate": 1.8301656324815555e-06, "loss": 0.087, "num_input_tokens_seen": 12606208, "step": 18730 }, { "epoch": 0.4576991669313268, "grad_norm": 20.20982551574707, "learning_rate": 1.8306542238725752e-06, "loss": 0.0713, "num_input_tokens_seen": 12609600, "step": 18735 }, { "epoch": 0.457821317763174, "grad_norm": 10.495606422424316, "learning_rate": 1.831142815263595e-06, "loss": 0.1419, "num_input_tokens_seen": 12612864, "step": 18740 }, { "epoch": 0.4579434685950211, "grad_norm": 28.25389289855957, "learning_rate": 1.8316314066546146e-06, "loss": 0.0825, "num_input_tokens_seen": 12616192, "step": 18745 }, { "epoch": 0.4580656194268683, "grad_norm": 17.525806427001953, "learning_rate": 1.8321199980456342e-06, "loss": 0.0839, "num_input_tokens_seen": 12619584, "step": 18750 }, { "epoch": 0.4581877702587155, "grad_norm": 0.5782161951065063, "learning_rate": 1.8326085894366541e-06, "loss": 0.0678, "num_input_tokens_seen": 12623232, "step": 18755 }, { "epoch": 0.4583099210905626, "grad_norm": 21.926904678344727, "learning_rate": 1.8330971808276738e-06, "loss": 0.1289, "num_input_tokens_seen": 12627200, "step": 18760 }, { "epoch": 0.4584320719224098, "grad_norm": 0.8156311511993408, "learning_rate": 1.8335857722186933e-06, "loss": 0.2559, "num_input_tokens_seen": 12630528, "step": 18765 }, { "epoch": 0.45855422275425695, "grad_norm": 10.35279369354248, "learning_rate": 1.8340743636097132e-06, "loss": 0.0332, "num_input_tokens_seen": 12633536, "step": 18770 }, { "epoch": 0.45867637358610414, "grad_norm": 20.163043975830078, "learning_rate": 1.8345629550007329e-06, "loss": 0.1104, "num_input_tokens_seen": 12636864, "step": 18775 }, { "epoch": 0.45879852441795127, "grad_norm": 20.536880493164062, "learning_rate": 1.8350515463917523e-06, "loss": 0.1016, "num_input_tokens_seen": 12640576, "step": 18780 }, { "epoch": 0.45892067524979846, "grad_norm": 9.669657707214355, "learning_rate": 1.8355401377827722e-06, "loss": 0.0848, "num_input_tokens_seen": 12643968, "step": 18785 }, { "epoch": 0.4590428260816456, "grad_norm": 7.986501216888428, "learning_rate": 1.836028729173792e-06, "loss": 0.1632, "num_input_tokens_seen": 12647040, "step": 18790 }, { "epoch": 0.4591649769134928, "grad_norm": 11.392936706542969, "learning_rate": 1.8365173205648114e-06, "loss": 0.2297, "num_input_tokens_seen": 12650240, "step": 18795 }, { "epoch": 0.4592871277453399, "grad_norm": 36.556251525878906, "learning_rate": 1.8370059119558313e-06, "loss": 0.1519, "num_input_tokens_seen": 12653312, "step": 18800 }, { "epoch": 0.4594092785771871, "grad_norm": 47.813777923583984, "learning_rate": 1.837494503346851e-06, "loss": 0.0764, "num_input_tokens_seen": 12656832, "step": 18805 }, { "epoch": 0.4595314294090343, "grad_norm": 39.372642517089844, "learning_rate": 1.8379830947378706e-06, "loss": 0.2496, "num_input_tokens_seen": 12660288, "step": 18810 }, { "epoch": 0.45965358024088143, "grad_norm": 9.365509986877441, "learning_rate": 1.8384716861288903e-06, "loss": 0.1129, "num_input_tokens_seen": 12663680, "step": 18815 }, { "epoch": 0.4597757310727286, "grad_norm": 16.465686798095703, "learning_rate": 1.83896027751991e-06, "loss": 0.1077, "num_input_tokens_seen": 12667200, "step": 18820 }, { "epoch": 0.45989788190457576, "grad_norm": 15.674369812011719, "learning_rate": 1.8394488689109297e-06, "loss": 0.0589, "num_input_tokens_seen": 12671040, "step": 18825 }, { "epoch": 0.46002003273642295, "grad_norm": 18.266897201538086, "learning_rate": 1.8399374603019493e-06, "loss": 0.129, "num_input_tokens_seen": 12674496, "step": 18830 }, { "epoch": 0.4601421835682701, "grad_norm": 25.660734176635742, "learning_rate": 1.840426051692969e-06, "loss": 0.098, "num_input_tokens_seen": 12677504, "step": 18835 }, { "epoch": 0.46026433440011727, "grad_norm": 1.7656751871109009, "learning_rate": 1.8409146430839887e-06, "loss": 0.1474, "num_input_tokens_seen": 12680640, "step": 18840 }, { "epoch": 0.4603864852319644, "grad_norm": 20.569786071777344, "learning_rate": 1.8414032344750086e-06, "loss": 0.0447, "num_input_tokens_seen": 12684160, "step": 18845 }, { "epoch": 0.4605086360638116, "grad_norm": 4.835124969482422, "learning_rate": 1.841891825866028e-06, "loss": 0.0752, "num_input_tokens_seen": 12687424, "step": 18850 }, { "epoch": 0.4606307868956588, "grad_norm": 13.825311660766602, "learning_rate": 1.8423804172570478e-06, "loss": 0.0254, "num_input_tokens_seen": 12690816, "step": 18855 }, { "epoch": 0.4607529377275059, "grad_norm": 32.11545944213867, "learning_rate": 1.8428690086480676e-06, "loss": 0.1209, "num_input_tokens_seen": 12694016, "step": 18860 }, { "epoch": 0.4608750885593531, "grad_norm": 44.318389892578125, "learning_rate": 1.8433576000390871e-06, "loss": 0.0971, "num_input_tokens_seen": 12697728, "step": 18865 }, { "epoch": 0.46099723939120024, "grad_norm": 21.114696502685547, "learning_rate": 1.8438461914301068e-06, "loss": 0.1475, "num_input_tokens_seen": 12701632, "step": 18870 }, { "epoch": 0.46111939022304743, "grad_norm": 26.01463508605957, "learning_rate": 1.8443347828211267e-06, "loss": 0.1265, "num_input_tokens_seen": 12705024, "step": 18875 }, { "epoch": 0.46124154105489457, "grad_norm": 44.7984733581543, "learning_rate": 1.8448233742121464e-06, "loss": 0.2305, "num_input_tokens_seen": 12708480, "step": 18880 }, { "epoch": 0.46136369188674176, "grad_norm": 18.585708618164062, "learning_rate": 1.8453119656031658e-06, "loss": 0.3209, "num_input_tokens_seen": 12711936, "step": 18885 }, { "epoch": 0.4614858427185889, "grad_norm": 7.216206073760986, "learning_rate": 1.8458005569941857e-06, "loss": 0.1201, "num_input_tokens_seen": 12715904, "step": 18890 }, { "epoch": 0.4616079935504361, "grad_norm": 24.87909698486328, "learning_rate": 1.8462891483852054e-06, "loss": 0.0603, "num_input_tokens_seen": 12719744, "step": 18895 }, { "epoch": 0.46173014438228327, "grad_norm": 1.087585687637329, "learning_rate": 1.8467777397762249e-06, "loss": 0.1219, "num_input_tokens_seen": 12723200, "step": 18900 }, { "epoch": 0.4618522952141304, "grad_norm": 14.558207511901855, "learning_rate": 1.8472663311672448e-06, "loss": 0.1796, "num_input_tokens_seen": 12726208, "step": 18905 }, { "epoch": 0.4619744460459776, "grad_norm": 14.949580192565918, "learning_rate": 1.8477549225582645e-06, "loss": 0.1212, "num_input_tokens_seen": 12729088, "step": 18910 }, { "epoch": 0.46209659687782473, "grad_norm": 3.4832208156585693, "learning_rate": 1.8482435139492841e-06, "loss": 0.0825, "num_input_tokens_seen": 12732864, "step": 18915 }, { "epoch": 0.4622187477096719, "grad_norm": 23.411306381225586, "learning_rate": 1.8487321053403038e-06, "loss": 0.1234, "num_input_tokens_seen": 12736256, "step": 18920 }, { "epoch": 0.46234089854151905, "grad_norm": 3.059123992919922, "learning_rate": 1.8492206967313235e-06, "loss": 0.0573, "num_input_tokens_seen": 12739520, "step": 18925 }, { "epoch": 0.46246304937336624, "grad_norm": 28.218448638916016, "learning_rate": 1.8497092881223432e-06, "loss": 0.1303, "num_input_tokens_seen": 12743104, "step": 18930 }, { "epoch": 0.4625852002052134, "grad_norm": 22.900333404541016, "learning_rate": 1.8501978795133629e-06, "loss": 0.1818, "num_input_tokens_seen": 12746432, "step": 18935 }, { "epoch": 0.46270735103706057, "grad_norm": 15.471698760986328, "learning_rate": 1.8506864709043825e-06, "loss": 0.0869, "num_input_tokens_seen": 12749568, "step": 18940 }, { "epoch": 0.4628295018689077, "grad_norm": 19.799257278442383, "learning_rate": 1.8511750622954022e-06, "loss": 0.109, "num_input_tokens_seen": 12752768, "step": 18945 }, { "epoch": 0.4629516527007549, "grad_norm": 15.0982666015625, "learning_rate": 1.851663653686422e-06, "loss": 0.1561, "num_input_tokens_seen": 12756480, "step": 18950 }, { "epoch": 0.4630738035326021, "grad_norm": 0.41449591517448425, "learning_rate": 1.8521522450774416e-06, "loss": 0.1497, "num_input_tokens_seen": 12759872, "step": 18955 }, { "epoch": 0.4631959543644492, "grad_norm": 31.08782386779785, "learning_rate": 1.8526408364684613e-06, "loss": 0.1316, "num_input_tokens_seen": 12763264, "step": 18960 }, { "epoch": 0.4633181051962964, "grad_norm": 2.4979774951934814, "learning_rate": 1.8531294278594812e-06, "loss": 0.0591, "num_input_tokens_seen": 12766720, "step": 18965 }, { "epoch": 0.46344025602814354, "grad_norm": 30.141979217529297, "learning_rate": 1.8536180192505006e-06, "loss": 0.1837, "num_input_tokens_seen": 12770240, "step": 18970 }, { "epoch": 0.4635624068599907, "grad_norm": 8.070691108703613, "learning_rate": 1.8541066106415203e-06, "loss": 0.0643, "num_input_tokens_seen": 12774080, "step": 18975 }, { "epoch": 0.46368455769183786, "grad_norm": 44.15768051147461, "learning_rate": 1.8545952020325402e-06, "loss": 0.1817, "num_input_tokens_seen": 12777280, "step": 18980 }, { "epoch": 0.46380670852368505, "grad_norm": 24.315988540649414, "learning_rate": 1.8550837934235597e-06, "loss": 0.1293, "num_input_tokens_seen": 12781056, "step": 18985 }, { "epoch": 0.4639288593555322, "grad_norm": 18.086763381958008, "learning_rate": 1.8555723848145794e-06, "loss": 0.139, "num_input_tokens_seen": 12784384, "step": 18990 }, { "epoch": 0.4640510101873794, "grad_norm": 23.693946838378906, "learning_rate": 1.8560609762055993e-06, "loss": 0.1052, "num_input_tokens_seen": 12787520, "step": 18995 }, { "epoch": 0.46417316101922657, "grad_norm": 2.2032880783081055, "learning_rate": 1.856549567596619e-06, "loss": 0.1275, "num_input_tokens_seen": 12791040, "step": 19000 }, { "epoch": 0.4642953118510737, "grad_norm": 21.85564613342285, "learning_rate": 1.8570381589876384e-06, "loss": 0.0531, "num_input_tokens_seen": 12794432, "step": 19005 }, { "epoch": 0.4644174626829209, "grad_norm": 33.41852951049805, "learning_rate": 1.8575267503786583e-06, "loss": 0.1065, "num_input_tokens_seen": 12797632, "step": 19010 }, { "epoch": 0.464539613514768, "grad_norm": 67.6316909790039, "learning_rate": 1.858015341769678e-06, "loss": 0.1797, "num_input_tokens_seen": 12801152, "step": 19015 }, { "epoch": 0.4646617643466152, "grad_norm": 2.0205495357513428, "learning_rate": 1.8585039331606974e-06, "loss": 0.0999, "num_input_tokens_seen": 12804864, "step": 19020 }, { "epoch": 0.46478391517846235, "grad_norm": 1.9264830350875854, "learning_rate": 1.8589925245517173e-06, "loss": 0.0713, "num_input_tokens_seen": 12808256, "step": 19025 }, { "epoch": 0.46490606601030954, "grad_norm": 17.218624114990234, "learning_rate": 1.859481115942737e-06, "loss": 0.1033, "num_input_tokens_seen": 12812224, "step": 19030 }, { "epoch": 0.46502821684215667, "grad_norm": 37.7265510559082, "learning_rate": 1.8599697073337567e-06, "loss": 0.2061, "num_input_tokens_seen": 12815616, "step": 19035 }, { "epoch": 0.46515036767400386, "grad_norm": 2.984351873397827, "learning_rate": 1.8604582987247764e-06, "loss": 0.1223, "num_input_tokens_seen": 12819392, "step": 19040 }, { "epoch": 0.46527251850585105, "grad_norm": 10.050679206848145, "learning_rate": 1.860946890115796e-06, "loss": 0.0222, "num_input_tokens_seen": 12822528, "step": 19045 }, { "epoch": 0.4653946693376982, "grad_norm": 35.5335578918457, "learning_rate": 1.8614354815068157e-06, "loss": 0.1632, "num_input_tokens_seen": 12825664, "step": 19050 }, { "epoch": 0.4655168201695454, "grad_norm": 3.787121534347534, "learning_rate": 1.8619240728978354e-06, "loss": 0.0751, "num_input_tokens_seen": 12829184, "step": 19055 }, { "epoch": 0.4656389710013925, "grad_norm": 23.502910614013672, "learning_rate": 1.8624126642888551e-06, "loss": 0.1655, "num_input_tokens_seen": 12832256, "step": 19060 }, { "epoch": 0.4657611218332397, "grad_norm": 45.55996322631836, "learning_rate": 1.8629012556798748e-06, "loss": 0.1237, "num_input_tokens_seen": 12835520, "step": 19065 }, { "epoch": 0.46588327266508683, "grad_norm": 3.102910280227661, "learning_rate": 1.8633898470708945e-06, "loss": 0.1247, "num_input_tokens_seen": 12838656, "step": 19070 }, { "epoch": 0.466005423496934, "grad_norm": 17.618194580078125, "learning_rate": 1.8638784384619142e-06, "loss": 0.0818, "num_input_tokens_seen": 12841984, "step": 19075 }, { "epoch": 0.46612757432878116, "grad_norm": 8.12735652923584, "learning_rate": 1.8643670298529338e-06, "loss": 0.1951, "num_input_tokens_seen": 12845952, "step": 19080 }, { "epoch": 0.46624972516062835, "grad_norm": 49.927486419677734, "learning_rate": 1.8648556212439537e-06, "loss": 0.1492, "num_input_tokens_seen": 12849216, "step": 19085 }, { "epoch": 0.4663718759924755, "grad_norm": 25.571504592895508, "learning_rate": 1.8653442126349732e-06, "loss": 0.0904, "num_input_tokens_seen": 12852352, "step": 19090 }, { "epoch": 0.46649402682432267, "grad_norm": 15.531042098999023, "learning_rate": 1.8658328040259929e-06, "loss": 0.1296, "num_input_tokens_seen": 12856000, "step": 19095 }, { "epoch": 0.46661617765616986, "grad_norm": 11.947113037109375, "learning_rate": 1.8663213954170128e-06, "loss": 0.0455, "num_input_tokens_seen": 12859008, "step": 19100 }, { "epoch": 0.466738328488017, "grad_norm": 12.411914825439453, "learning_rate": 1.8668099868080322e-06, "loss": 0.1772, "num_input_tokens_seen": 12862464, "step": 19105 }, { "epoch": 0.4668604793198642, "grad_norm": 11.80659294128418, "learning_rate": 1.867298578199052e-06, "loss": 0.0789, "num_input_tokens_seen": 12865792, "step": 19110 }, { "epoch": 0.4669826301517113, "grad_norm": 0.7526969313621521, "learning_rate": 1.8677871695900718e-06, "loss": 0.0677, "num_input_tokens_seen": 12869376, "step": 19115 }, { "epoch": 0.4671047809835585, "grad_norm": 8.360123634338379, "learning_rate": 1.8682757609810915e-06, "loss": 0.1072, "num_input_tokens_seen": 12872896, "step": 19120 }, { "epoch": 0.46722693181540564, "grad_norm": 8.367980003356934, "learning_rate": 1.868764352372111e-06, "loss": 0.1762, "num_input_tokens_seen": 12876608, "step": 19125 }, { "epoch": 0.46734908264725283, "grad_norm": 3.0096700191497803, "learning_rate": 1.8692529437631309e-06, "loss": 0.1542, "num_input_tokens_seen": 12879936, "step": 19130 }, { "epoch": 0.46747123347909997, "grad_norm": 73.32782745361328, "learning_rate": 1.8697415351541505e-06, "loss": 0.1688, "num_input_tokens_seen": 12883392, "step": 19135 }, { "epoch": 0.46759338431094716, "grad_norm": 1.9654453992843628, "learning_rate": 1.87023012654517e-06, "loss": 0.1618, "num_input_tokens_seen": 12886784, "step": 19140 }, { "epoch": 0.46771553514279435, "grad_norm": 11.835115432739258, "learning_rate": 1.87071871793619e-06, "loss": 0.1077, "num_input_tokens_seen": 12890048, "step": 19145 }, { "epoch": 0.4678376859746415, "grad_norm": 0.22136236727237701, "learning_rate": 1.8712073093272096e-06, "loss": 0.0752, "num_input_tokens_seen": 12893312, "step": 19150 }, { "epoch": 0.46795983680648867, "grad_norm": 32.926368713378906, "learning_rate": 1.8716959007182293e-06, "loss": 0.1155, "num_input_tokens_seen": 12896704, "step": 19155 }, { "epoch": 0.4680819876383358, "grad_norm": 1.585870623588562, "learning_rate": 1.872184492109249e-06, "loss": 0.0286, "num_input_tokens_seen": 12900096, "step": 19160 }, { "epoch": 0.468204138470183, "grad_norm": 21.62451171875, "learning_rate": 1.8726730835002686e-06, "loss": 0.1852, "num_input_tokens_seen": 12903040, "step": 19165 }, { "epoch": 0.46832628930203013, "grad_norm": 10.86733627319336, "learning_rate": 1.8731616748912883e-06, "loss": 0.1419, "num_input_tokens_seen": 12906432, "step": 19170 }, { "epoch": 0.4684484401338773, "grad_norm": 51.00568771362305, "learning_rate": 1.873650266282308e-06, "loss": 0.106, "num_input_tokens_seen": 12909376, "step": 19175 }, { "epoch": 0.46857059096572445, "grad_norm": 41.37013626098633, "learning_rate": 1.8741388576733277e-06, "loss": 0.1177, "num_input_tokens_seen": 12912896, "step": 19180 }, { "epoch": 0.46869274179757164, "grad_norm": 14.004033088684082, "learning_rate": 1.8746274490643474e-06, "loss": 0.1659, "num_input_tokens_seen": 12916096, "step": 19185 }, { "epoch": 0.46881489262941883, "grad_norm": 51.04507827758789, "learning_rate": 1.8751160404553672e-06, "loss": 0.126, "num_input_tokens_seen": 12919168, "step": 19190 }, { "epoch": 0.46893704346126597, "grad_norm": 36.705265045166016, "learning_rate": 1.8756046318463867e-06, "loss": 0.233, "num_input_tokens_seen": 12922496, "step": 19195 }, { "epoch": 0.46905919429311316, "grad_norm": 9.139296531677246, "learning_rate": 1.8760932232374064e-06, "loss": 0.2077, "num_input_tokens_seen": 12926016, "step": 19200 }, { "epoch": 0.4691813451249603, "grad_norm": 11.94605541229248, "learning_rate": 1.8765818146284263e-06, "loss": 0.1411, "num_input_tokens_seen": 12929472, "step": 19205 }, { "epoch": 0.4693034959568075, "grad_norm": 4.031689643859863, "learning_rate": 1.8770704060194458e-06, "loss": 0.0664, "num_input_tokens_seen": 12933056, "step": 19210 }, { "epoch": 0.4694256467886546, "grad_norm": 11.11829948425293, "learning_rate": 1.8775589974104654e-06, "loss": 0.0971, "num_input_tokens_seen": 12936384, "step": 19215 }, { "epoch": 0.4695477976205018, "grad_norm": 3.863867998123169, "learning_rate": 1.8780475888014853e-06, "loss": 0.0367, "num_input_tokens_seen": 12939648, "step": 19220 }, { "epoch": 0.46966994845234894, "grad_norm": 20.8947696685791, "learning_rate": 1.8785361801925048e-06, "loss": 0.0736, "num_input_tokens_seen": 12943104, "step": 19225 }, { "epoch": 0.46979209928419613, "grad_norm": 17.901206970214844, "learning_rate": 1.8790247715835245e-06, "loss": 0.0748, "num_input_tokens_seen": 12946304, "step": 19230 }, { "epoch": 0.46991425011604326, "grad_norm": 37.201446533203125, "learning_rate": 1.8795133629745444e-06, "loss": 0.044, "num_input_tokens_seen": 12949248, "step": 19235 }, { "epoch": 0.47003640094789045, "grad_norm": 34.428733825683594, "learning_rate": 1.880001954365564e-06, "loss": 0.2714, "num_input_tokens_seen": 12952512, "step": 19240 }, { "epoch": 0.47015855177973764, "grad_norm": 60.48667526245117, "learning_rate": 1.8804905457565835e-06, "loss": 0.1009, "num_input_tokens_seen": 12955712, "step": 19245 }, { "epoch": 0.4702807026115848, "grad_norm": 6.710975646972656, "learning_rate": 1.8809791371476034e-06, "loss": 0.0945, "num_input_tokens_seen": 12959424, "step": 19250 }, { "epoch": 0.47040285344343197, "grad_norm": 14.131996154785156, "learning_rate": 1.881467728538623e-06, "loss": 0.1379, "num_input_tokens_seen": 12962880, "step": 19255 }, { "epoch": 0.4705250042752791, "grad_norm": 15.032042503356934, "learning_rate": 1.8819563199296426e-06, "loss": 0.1535, "num_input_tokens_seen": 12966464, "step": 19260 }, { "epoch": 0.4706471551071263, "grad_norm": 5.918193817138672, "learning_rate": 1.8824449113206625e-06, "loss": 0.0838, "num_input_tokens_seen": 12970048, "step": 19265 }, { "epoch": 0.4707693059389734, "grad_norm": 32.9735221862793, "learning_rate": 1.8829335027116821e-06, "loss": 0.1308, "num_input_tokens_seen": 12973760, "step": 19270 }, { "epoch": 0.4708914567708206, "grad_norm": 0.23837028443813324, "learning_rate": 1.8834220941027018e-06, "loss": 0.0659, "num_input_tokens_seen": 12977600, "step": 19275 }, { "epoch": 0.47101360760266775, "grad_norm": 32.31455993652344, "learning_rate": 1.8839106854937215e-06, "loss": 0.2964, "num_input_tokens_seen": 12980864, "step": 19280 }, { "epoch": 0.47113575843451494, "grad_norm": 26.42384910583496, "learning_rate": 1.8843992768847412e-06, "loss": 0.0543, "num_input_tokens_seen": 12984256, "step": 19285 }, { "epoch": 0.47125790926636213, "grad_norm": 0.42388415336608887, "learning_rate": 1.8848878682757609e-06, "loss": 0.0297, "num_input_tokens_seen": 12987456, "step": 19290 }, { "epoch": 0.47138006009820926, "grad_norm": 1.073360562324524, "learning_rate": 1.8853764596667806e-06, "loss": 0.1323, "num_input_tokens_seen": 12990976, "step": 19295 }, { "epoch": 0.47150221093005645, "grad_norm": 36.52716064453125, "learning_rate": 1.8858650510578002e-06, "loss": 0.0875, "num_input_tokens_seen": 12994368, "step": 19300 }, { "epoch": 0.4716243617619036, "grad_norm": 18.84002113342285, "learning_rate": 1.88635364244882e-06, "loss": 0.2024, "num_input_tokens_seen": 12997376, "step": 19305 }, { "epoch": 0.4717465125937508, "grad_norm": 9.292922019958496, "learning_rate": 1.8868422338398398e-06, "loss": 0.259, "num_input_tokens_seen": 13000448, "step": 19310 }, { "epoch": 0.4718686634255979, "grad_norm": 57.36844253540039, "learning_rate": 1.8873308252308593e-06, "loss": 0.1971, "num_input_tokens_seen": 13003840, "step": 19315 }, { "epoch": 0.4719908142574451, "grad_norm": 22.608705520629883, "learning_rate": 1.887819416621879e-06, "loss": 0.2406, "num_input_tokens_seen": 13007424, "step": 19320 }, { "epoch": 0.47211296508929224, "grad_norm": 33.60593032836914, "learning_rate": 1.8883080080128989e-06, "loss": 0.1605, "num_input_tokens_seen": 13010816, "step": 19325 }, { "epoch": 0.4722351159211394, "grad_norm": 0.326453298330307, "learning_rate": 1.8887965994039183e-06, "loss": 0.1467, "num_input_tokens_seen": 13014080, "step": 19330 }, { "epoch": 0.4723572667529866, "grad_norm": 30.46360206604004, "learning_rate": 1.889285190794938e-06, "loss": 0.0574, "num_input_tokens_seen": 13017344, "step": 19335 }, { "epoch": 0.47247941758483375, "grad_norm": 31.23234748840332, "learning_rate": 1.889773782185958e-06, "loss": 0.1405, "num_input_tokens_seen": 13020608, "step": 19340 }, { "epoch": 0.47260156841668094, "grad_norm": 30.33367919921875, "learning_rate": 1.8902623735769776e-06, "loss": 0.1287, "num_input_tokens_seen": 13024128, "step": 19345 }, { "epoch": 0.4727237192485281, "grad_norm": 0.36472418904304504, "learning_rate": 1.890750964967997e-06, "loss": 0.1658, "num_input_tokens_seen": 13027584, "step": 19350 }, { "epoch": 0.47284587008037526, "grad_norm": 9.631101608276367, "learning_rate": 1.891239556359017e-06, "loss": 0.2057, "num_input_tokens_seen": 13030592, "step": 19355 }, { "epoch": 0.4729680209122224, "grad_norm": 19.1376895904541, "learning_rate": 1.8917281477500366e-06, "loss": 0.1622, "num_input_tokens_seen": 13033792, "step": 19360 }, { "epoch": 0.4730901717440696, "grad_norm": 5.452675819396973, "learning_rate": 1.892216739141056e-06, "loss": 0.086, "num_input_tokens_seen": 13036800, "step": 19365 }, { "epoch": 0.4732123225759167, "grad_norm": 16.26456642150879, "learning_rate": 1.892705330532076e-06, "loss": 0.1403, "num_input_tokens_seen": 13040128, "step": 19370 }, { "epoch": 0.4733344734077639, "grad_norm": 13.444612503051758, "learning_rate": 1.8931939219230957e-06, "loss": 0.1043, "num_input_tokens_seen": 13043584, "step": 19375 }, { "epoch": 0.47345662423961105, "grad_norm": 13.632548332214355, "learning_rate": 1.8936825133141151e-06, "loss": 0.1329, "num_input_tokens_seen": 13046656, "step": 19380 }, { "epoch": 0.47357877507145824, "grad_norm": 1.3274794816970825, "learning_rate": 1.894171104705135e-06, "loss": 0.0519, "num_input_tokens_seen": 13049792, "step": 19385 }, { "epoch": 0.4737009259033054, "grad_norm": 1.07449209690094, "learning_rate": 1.8946596960961547e-06, "loss": 0.1164, "num_input_tokens_seen": 13052736, "step": 19390 }, { "epoch": 0.47382307673515256, "grad_norm": 26.944843292236328, "learning_rate": 1.8951482874871744e-06, "loss": 0.1841, "num_input_tokens_seen": 13055616, "step": 19395 }, { "epoch": 0.47394522756699975, "grad_norm": 12.593786239624023, "learning_rate": 1.895636878878194e-06, "loss": 0.1359, "num_input_tokens_seen": 13058880, "step": 19400 }, { "epoch": 0.4740673783988469, "grad_norm": 10.367599487304688, "learning_rate": 1.8961254702692138e-06, "loss": 0.1169, "num_input_tokens_seen": 13062080, "step": 19405 }, { "epoch": 0.4741895292306941, "grad_norm": 16.240982055664062, "learning_rate": 1.8966140616602334e-06, "loss": 0.0242, "num_input_tokens_seen": 13065728, "step": 19410 }, { "epoch": 0.4743116800625412, "grad_norm": 2.3575432300567627, "learning_rate": 1.8971026530512531e-06, "loss": 0.0707, "num_input_tokens_seen": 13068992, "step": 19415 }, { "epoch": 0.4744338308943884, "grad_norm": 46.091365814208984, "learning_rate": 1.8975912444422728e-06, "loss": 0.2041, "num_input_tokens_seen": 13072384, "step": 19420 }, { "epoch": 0.47455598172623553, "grad_norm": 4.177423477172852, "learning_rate": 1.8980798358332925e-06, "loss": 0.0601, "num_input_tokens_seen": 13075648, "step": 19425 }, { "epoch": 0.4746781325580827, "grad_norm": 32.177249908447266, "learning_rate": 1.8985684272243124e-06, "loss": 0.1482, "num_input_tokens_seen": 13079040, "step": 19430 }, { "epoch": 0.4748002833899299, "grad_norm": 23.470182418823242, "learning_rate": 1.8990570186153318e-06, "loss": 0.0946, "num_input_tokens_seen": 13082304, "step": 19435 }, { "epoch": 0.47492243422177705, "grad_norm": 11.740708351135254, "learning_rate": 1.8995456100063515e-06, "loss": 0.1373, "num_input_tokens_seen": 13085632, "step": 19440 }, { "epoch": 0.47504458505362424, "grad_norm": 1.594720721244812, "learning_rate": 1.9000342013973714e-06, "loss": 0.1384, "num_input_tokens_seen": 13089024, "step": 19445 }, { "epoch": 0.47516673588547137, "grad_norm": 2.3788228034973145, "learning_rate": 1.9005227927883909e-06, "loss": 0.0781, "num_input_tokens_seen": 13092480, "step": 19450 }, { "epoch": 0.47528888671731856, "grad_norm": 13.307036399841309, "learning_rate": 1.9010113841794106e-06, "loss": 0.1162, "num_input_tokens_seen": 13096064, "step": 19455 }, { "epoch": 0.4754110375491657, "grad_norm": 19.755977630615234, "learning_rate": 1.9014999755704305e-06, "loss": 0.1173, "num_input_tokens_seen": 13099648, "step": 19460 }, { "epoch": 0.4755331883810129, "grad_norm": 9.011610984802246, "learning_rate": 1.9019885669614501e-06, "loss": 0.1658, "num_input_tokens_seen": 13102848, "step": 19465 }, { "epoch": 0.47565533921286, "grad_norm": 8.883974075317383, "learning_rate": 1.9024771583524696e-06, "loss": 0.0847, "num_input_tokens_seen": 13106688, "step": 19470 }, { "epoch": 0.4757774900447072, "grad_norm": 26.36639976501465, "learning_rate": 1.9029657497434895e-06, "loss": 0.1892, "num_input_tokens_seen": 13110272, "step": 19475 }, { "epoch": 0.47589964087655434, "grad_norm": 6.351770401000977, "learning_rate": 1.9034543411345092e-06, "loss": 0.1094, "num_input_tokens_seen": 13113280, "step": 19480 }, { "epoch": 0.47602179170840153, "grad_norm": 3.0226128101348877, "learning_rate": 1.9039429325255287e-06, "loss": 0.0507, "num_input_tokens_seen": 13116480, "step": 19485 }, { "epoch": 0.4761439425402487, "grad_norm": 49.442291259765625, "learning_rate": 1.9044315239165486e-06, "loss": 0.1353, "num_input_tokens_seen": 13119936, "step": 19490 }, { "epoch": 0.47626609337209586, "grad_norm": 27.841060638427734, "learning_rate": 1.9049201153075682e-06, "loss": 0.0437, "num_input_tokens_seen": 13123392, "step": 19495 }, { "epoch": 0.47638824420394305, "grad_norm": 31.20875358581543, "learning_rate": 1.9054087066985877e-06, "loss": 0.1317, "num_input_tokens_seen": 13126464, "step": 19500 }, { "epoch": 0.4765103950357902, "grad_norm": 10.526351928710938, "learning_rate": 1.9058972980896076e-06, "loss": 0.2263, "num_input_tokens_seen": 13130048, "step": 19505 }, { "epoch": 0.47663254586763737, "grad_norm": 0.17358553409576416, "learning_rate": 1.9063858894806273e-06, "loss": 0.1022, "num_input_tokens_seen": 13133440, "step": 19510 }, { "epoch": 0.4767546966994845, "grad_norm": 15.227017402648926, "learning_rate": 1.906874480871647e-06, "loss": 0.1827, "num_input_tokens_seen": 13136448, "step": 19515 }, { "epoch": 0.4768768475313317, "grad_norm": 35.180110931396484, "learning_rate": 1.9073630722626666e-06, "loss": 0.1598, "num_input_tokens_seen": 13139584, "step": 19520 }, { "epoch": 0.4769989983631788, "grad_norm": 46.55698776245117, "learning_rate": 1.9078516636536863e-06, "loss": 0.1595, "num_input_tokens_seen": 13142784, "step": 19525 }, { "epoch": 0.477121149195026, "grad_norm": 2.008786201477051, "learning_rate": 1.908340255044706e-06, "loss": 0.2117, "num_input_tokens_seen": 13146112, "step": 19530 }, { "epoch": 0.4772433000268732, "grad_norm": 30.464975357055664, "learning_rate": 1.9088288464357257e-06, "loss": 0.2164, "num_input_tokens_seen": 13149504, "step": 19535 }, { "epoch": 0.47736545085872034, "grad_norm": 6.6905364990234375, "learning_rate": 1.9093174378267454e-06, "loss": 0.0371, "num_input_tokens_seen": 13152704, "step": 19540 }, { "epoch": 0.47748760169056753, "grad_norm": 14.021597862243652, "learning_rate": 1.909806029217765e-06, "loss": 0.1501, "num_input_tokens_seen": 13156160, "step": 19545 }, { "epoch": 0.47760975252241467, "grad_norm": 12.774880409240723, "learning_rate": 1.9102946206087847e-06, "loss": 0.0894, "num_input_tokens_seen": 13159616, "step": 19550 }, { "epoch": 0.47773190335426186, "grad_norm": 23.132492065429688, "learning_rate": 1.9107832119998044e-06, "loss": 0.1337, "num_input_tokens_seen": 13162624, "step": 19555 }, { "epoch": 0.477854054186109, "grad_norm": 20.382837295532227, "learning_rate": 1.911271803390824e-06, "loss": 0.0509, "num_input_tokens_seen": 13165760, "step": 19560 }, { "epoch": 0.4779762050179562, "grad_norm": 0.5650962591171265, "learning_rate": 1.9117603947818438e-06, "loss": 0.115, "num_input_tokens_seen": 13169024, "step": 19565 }, { "epoch": 0.4780983558498033, "grad_norm": 39.08525085449219, "learning_rate": 1.9122489861728635e-06, "loss": 0.2129, "num_input_tokens_seen": 13171968, "step": 19570 }, { "epoch": 0.4782205066816505, "grad_norm": 5.242583751678467, "learning_rate": 1.912737577563883e-06, "loss": 0.0831, "num_input_tokens_seen": 13175424, "step": 19575 }, { "epoch": 0.4783426575134977, "grad_norm": 18.628211975097656, "learning_rate": 1.913226168954903e-06, "loss": 0.1144, "num_input_tokens_seen": 13178880, "step": 19580 }, { "epoch": 0.4784648083453448, "grad_norm": 4.374256610870361, "learning_rate": 1.913714760345923e-06, "loss": 0.0943, "num_input_tokens_seen": 13182208, "step": 19585 }, { "epoch": 0.478586959177192, "grad_norm": 1.0530842542648315, "learning_rate": 1.914203351736942e-06, "loss": 0.0779, "num_input_tokens_seen": 13185600, "step": 19590 }, { "epoch": 0.47870911000903915, "grad_norm": 31.666772842407227, "learning_rate": 1.914691943127962e-06, "loss": 0.0657, "num_input_tokens_seen": 13188928, "step": 19595 }, { "epoch": 0.47883126084088634, "grad_norm": 9.453741073608398, "learning_rate": 1.915180534518982e-06, "loss": 0.2389, "num_input_tokens_seen": 13192064, "step": 19600 }, { "epoch": 0.4789534116727335, "grad_norm": 28.946199417114258, "learning_rate": 1.9156691259100012e-06, "loss": 0.1893, "num_input_tokens_seen": 13195712, "step": 19605 }, { "epoch": 0.47907556250458067, "grad_norm": 17.196870803833008, "learning_rate": 1.916157717301021e-06, "loss": 0.262, "num_input_tokens_seen": 13198656, "step": 19610 }, { "epoch": 0.4791977133364278, "grad_norm": 14.507476806640625, "learning_rate": 1.916646308692041e-06, "loss": 0.1524, "num_input_tokens_seen": 13201984, "step": 19615 }, { "epoch": 0.479319864168275, "grad_norm": 41.175994873046875, "learning_rate": 1.9171349000830607e-06, "loss": 0.0971, "num_input_tokens_seen": 13205312, "step": 19620 }, { "epoch": 0.4794420150001221, "grad_norm": 15.390592575073242, "learning_rate": 1.91762349147408e-06, "loss": 0.1953, "num_input_tokens_seen": 13208192, "step": 19625 }, { "epoch": 0.4795641658319693, "grad_norm": 13.96489143371582, "learning_rate": 1.9181120828651e-06, "loss": 0.1565, "num_input_tokens_seen": 13211520, "step": 19630 }, { "epoch": 0.4796863166638165, "grad_norm": 10.897221565246582, "learning_rate": 1.9186006742561197e-06, "loss": 0.1034, "num_input_tokens_seen": 13214848, "step": 19635 }, { "epoch": 0.47980846749566364, "grad_norm": 12.923073768615723, "learning_rate": 1.919089265647139e-06, "loss": 0.1566, "num_input_tokens_seen": 13218304, "step": 19640 }, { "epoch": 0.4799306183275108, "grad_norm": 15.097844123840332, "learning_rate": 1.919577857038159e-06, "loss": 0.139, "num_input_tokens_seen": 13221696, "step": 19645 }, { "epoch": 0.48005276915935796, "grad_norm": 3.387833595275879, "learning_rate": 1.9200664484291788e-06, "loss": 0.0572, "num_input_tokens_seen": 13224896, "step": 19650 }, { "epoch": 0.48017491999120515, "grad_norm": 9.004018783569336, "learning_rate": 1.920555039820198e-06, "loss": 0.1403, "num_input_tokens_seen": 13227968, "step": 19655 }, { "epoch": 0.4802970708230523, "grad_norm": 3.3921761512756348, "learning_rate": 1.921043631211218e-06, "loss": 0.0733, "num_input_tokens_seen": 13231232, "step": 19660 }, { "epoch": 0.4804192216548995, "grad_norm": 1.3995615243911743, "learning_rate": 1.921532222602238e-06, "loss": 0.088, "num_input_tokens_seen": 13234560, "step": 19665 }, { "epoch": 0.4805413724867466, "grad_norm": 16.81403923034668, "learning_rate": 1.9220208139932575e-06, "loss": 0.0543, "num_input_tokens_seen": 13237696, "step": 19670 }, { "epoch": 0.4806635233185938, "grad_norm": 0.8026001453399658, "learning_rate": 1.922509405384277e-06, "loss": 0.1292, "num_input_tokens_seen": 13241216, "step": 19675 }, { "epoch": 0.480785674150441, "grad_norm": 80.34431457519531, "learning_rate": 1.922997996775297e-06, "loss": 0.1134, "num_input_tokens_seen": 13244736, "step": 19680 }, { "epoch": 0.4809078249822881, "grad_norm": 87.73758697509766, "learning_rate": 1.9234865881663165e-06, "loss": 0.2556, "num_input_tokens_seen": 13248000, "step": 19685 }, { "epoch": 0.4810299758141353, "grad_norm": 25.267589569091797, "learning_rate": 1.923975179557336e-06, "loss": 0.2345, "num_input_tokens_seen": 13251008, "step": 19690 }, { "epoch": 0.48115212664598245, "grad_norm": 35.30669021606445, "learning_rate": 1.924463770948356e-06, "loss": 0.1163, "num_input_tokens_seen": 13254336, "step": 19695 }, { "epoch": 0.48127427747782964, "grad_norm": 15.753335952758789, "learning_rate": 1.9249523623393756e-06, "loss": 0.1854, "num_input_tokens_seen": 13258048, "step": 19700 }, { "epoch": 0.48139642830967677, "grad_norm": 21.301998138427734, "learning_rate": 1.9254409537303953e-06, "loss": 0.3226, "num_input_tokens_seen": 13261632, "step": 19705 }, { "epoch": 0.48151857914152396, "grad_norm": 25.74140739440918, "learning_rate": 1.925929545121415e-06, "loss": 0.1299, "num_input_tokens_seen": 13265024, "step": 19710 }, { "epoch": 0.4816407299733711, "grad_norm": 23.157045364379883, "learning_rate": 1.9264181365124346e-06, "loss": 0.1003, "num_input_tokens_seen": 13268416, "step": 19715 }, { "epoch": 0.4817628808052183, "grad_norm": 44.55127716064453, "learning_rate": 1.9269067279034543e-06, "loss": 0.16, "num_input_tokens_seen": 13272064, "step": 19720 }, { "epoch": 0.4818850316370655, "grad_norm": 24.022293090820312, "learning_rate": 1.927395319294474e-06, "loss": 0.1596, "num_input_tokens_seen": 13275712, "step": 19725 }, { "epoch": 0.4820071824689126, "grad_norm": 18.870023727416992, "learning_rate": 1.9278839106854937e-06, "loss": 0.1389, "num_input_tokens_seen": 13278976, "step": 19730 }, { "epoch": 0.4821293333007598, "grad_norm": 1.568988561630249, "learning_rate": 1.9283725020765134e-06, "loss": 0.0965, "num_input_tokens_seen": 13282496, "step": 19735 }, { "epoch": 0.48225148413260693, "grad_norm": 33.636409759521484, "learning_rate": 1.928861093467533e-06, "loss": 0.1027, "num_input_tokens_seen": 13285568, "step": 19740 }, { "epoch": 0.4823736349644541, "grad_norm": 22.591812133789062, "learning_rate": 1.9293496848585527e-06, "loss": 0.0695, "num_input_tokens_seen": 13288960, "step": 19745 }, { "epoch": 0.48249578579630126, "grad_norm": 2.1983540058135986, "learning_rate": 1.9298382762495724e-06, "loss": 0.109, "num_input_tokens_seen": 13292224, "step": 19750 }, { "epoch": 0.48261793662814845, "grad_norm": 0.34230145812034607, "learning_rate": 1.930326867640592e-06, "loss": 0.1414, "num_input_tokens_seen": 13296064, "step": 19755 }, { "epoch": 0.4827400874599956, "grad_norm": 0.6198939681053162, "learning_rate": 1.9308154590316118e-06, "loss": 0.0575, "num_input_tokens_seen": 13299648, "step": 19760 }, { "epoch": 0.48286223829184277, "grad_norm": 0.22586674988269806, "learning_rate": 1.9313040504226314e-06, "loss": 0.1743, "num_input_tokens_seen": 13302912, "step": 19765 }, { "epoch": 0.4829843891236899, "grad_norm": 13.703788757324219, "learning_rate": 1.931792641813651e-06, "loss": 0.0443, "num_input_tokens_seen": 13306368, "step": 19770 }, { "epoch": 0.4831065399555371, "grad_norm": 9.638721466064453, "learning_rate": 1.932281233204671e-06, "loss": 0.0543, "num_input_tokens_seen": 13309376, "step": 19775 }, { "epoch": 0.4832286907873843, "grad_norm": 34.85390090942383, "learning_rate": 1.9327698245956905e-06, "loss": 0.1372, "num_input_tokens_seen": 13312640, "step": 19780 }, { "epoch": 0.4833508416192314, "grad_norm": 40.97196960449219, "learning_rate": 1.93325841598671e-06, "loss": 0.0512, "num_input_tokens_seen": 13315776, "step": 19785 }, { "epoch": 0.4834729924510786, "grad_norm": 46.25328063964844, "learning_rate": 1.93374700737773e-06, "loss": 0.0811, "num_input_tokens_seen": 13319616, "step": 19790 }, { "epoch": 0.48359514328292574, "grad_norm": 4.664642333984375, "learning_rate": 1.9342355987687495e-06, "loss": 0.1762, "num_input_tokens_seen": 13323008, "step": 19795 }, { "epoch": 0.48371729411477293, "grad_norm": 10.510961532592773, "learning_rate": 1.9347241901597692e-06, "loss": 0.0668, "num_input_tokens_seen": 13326400, "step": 19800 }, { "epoch": 0.48383944494662007, "grad_norm": 57.567710876464844, "learning_rate": 1.935212781550789e-06, "loss": 0.2493, "num_input_tokens_seen": 13329728, "step": 19805 }, { "epoch": 0.48396159577846726, "grad_norm": 63.7676887512207, "learning_rate": 1.9357013729418086e-06, "loss": 0.1694, "num_input_tokens_seen": 13333120, "step": 19810 }, { "epoch": 0.4840837466103144, "grad_norm": 16.2789306640625, "learning_rate": 1.9361899643328283e-06, "loss": 0.1478, "num_input_tokens_seen": 13336576, "step": 19815 }, { "epoch": 0.4842058974421616, "grad_norm": 2.470158100128174, "learning_rate": 1.936678555723848e-06, "loss": 0.0925, "num_input_tokens_seen": 13339712, "step": 19820 }, { "epoch": 0.48432804827400877, "grad_norm": 10.53795337677002, "learning_rate": 1.937167147114868e-06, "loss": 0.089, "num_input_tokens_seen": 13343168, "step": 19825 }, { "epoch": 0.4844501991058559, "grad_norm": 2.1699910163879395, "learning_rate": 1.9376557385058873e-06, "loss": 0.1069, "num_input_tokens_seen": 13346240, "step": 19830 }, { "epoch": 0.4845723499377031, "grad_norm": 9.327561378479004, "learning_rate": 1.938144329896907e-06, "loss": 0.0559, "num_input_tokens_seen": 13349888, "step": 19835 }, { "epoch": 0.48469450076955023, "grad_norm": 8.914467811584473, "learning_rate": 1.938632921287927e-06, "loss": 0.0851, "num_input_tokens_seen": 13353472, "step": 19840 }, { "epoch": 0.4848166516013974, "grad_norm": 10.970634460449219, "learning_rate": 1.9391215126789463e-06, "loss": 0.1351, "num_input_tokens_seen": 13356672, "step": 19845 }, { "epoch": 0.48493880243324455, "grad_norm": 0.31943902373313904, "learning_rate": 1.939610104069966e-06, "loss": 0.2086, "num_input_tokens_seen": 13360064, "step": 19850 }, { "epoch": 0.48506095326509174, "grad_norm": 30.93590545654297, "learning_rate": 1.940098695460986e-06, "loss": 0.0905, "num_input_tokens_seen": 13363392, "step": 19855 }, { "epoch": 0.4851831040969389, "grad_norm": 12.492888450622559, "learning_rate": 1.940587286852006e-06, "loss": 0.1664, "num_input_tokens_seen": 13366656, "step": 19860 }, { "epoch": 0.48530525492878607, "grad_norm": 12.074633598327637, "learning_rate": 1.941075878243025e-06, "loss": 0.0714, "num_input_tokens_seen": 13370240, "step": 19865 }, { "epoch": 0.48542740576063326, "grad_norm": 13.443814277648926, "learning_rate": 1.941564469634045e-06, "loss": 0.1149, "num_input_tokens_seen": 13373568, "step": 19870 }, { "epoch": 0.4855495565924804, "grad_norm": 13.719369888305664, "learning_rate": 1.942053061025065e-06, "loss": 0.1532, "num_input_tokens_seen": 13377024, "step": 19875 }, { "epoch": 0.4856717074243276, "grad_norm": 27.99519157409668, "learning_rate": 1.942541652416084e-06, "loss": 0.133, "num_input_tokens_seen": 13380800, "step": 19880 }, { "epoch": 0.4857938582561747, "grad_norm": 0.44706669449806213, "learning_rate": 1.943030243807104e-06, "loss": 0.1333, "num_input_tokens_seen": 13384320, "step": 19885 }, { "epoch": 0.4859160090880219, "grad_norm": 0.12808431684970856, "learning_rate": 1.943518835198124e-06, "loss": 0.1726, "num_input_tokens_seen": 13387648, "step": 19890 }, { "epoch": 0.48603815991986904, "grad_norm": 0.382138192653656, "learning_rate": 1.9440074265891436e-06, "loss": 0.1649, "num_input_tokens_seen": 13391232, "step": 19895 }, { "epoch": 0.48616031075171623, "grad_norm": 20.595565795898438, "learning_rate": 1.944496017980163e-06, "loss": 0.0477, "num_input_tokens_seen": 13395136, "step": 19900 }, { "epoch": 0.48628246158356336, "grad_norm": 26.79218292236328, "learning_rate": 1.944984609371183e-06, "loss": 0.1885, "num_input_tokens_seen": 13398400, "step": 19905 }, { "epoch": 0.48640461241541055, "grad_norm": 15.850521087646484, "learning_rate": 1.9454732007622026e-06, "loss": 0.22, "num_input_tokens_seen": 13401792, "step": 19910 }, { "epoch": 0.4865267632472577, "grad_norm": 0.2584015130996704, "learning_rate": 1.945961792153222e-06, "loss": 0.1027, "num_input_tokens_seen": 13405120, "step": 19915 }, { "epoch": 0.4866489140791049, "grad_norm": 20.799280166625977, "learning_rate": 1.946450383544242e-06, "loss": 0.064, "num_input_tokens_seen": 13409088, "step": 19920 }, { "epoch": 0.48677106491095207, "grad_norm": 0.3154126703739166, "learning_rate": 1.9469389749352617e-06, "loss": 0.0144, "num_input_tokens_seen": 13412736, "step": 19925 }, { "epoch": 0.4868932157427992, "grad_norm": 14.14961051940918, "learning_rate": 1.947427566326281e-06, "loss": 0.0679, "num_input_tokens_seen": 13415872, "step": 19930 }, { "epoch": 0.4870153665746464, "grad_norm": 24.828821182250977, "learning_rate": 1.947916157717301e-06, "loss": 0.1872, "num_input_tokens_seen": 13419136, "step": 19935 }, { "epoch": 0.4871375174064935, "grad_norm": 14.158141136169434, "learning_rate": 1.9484047491083207e-06, "loss": 0.3057, "num_input_tokens_seen": 13422848, "step": 19940 }, { "epoch": 0.4872596682383407, "grad_norm": 14.543496131896973, "learning_rate": 1.9488933404993404e-06, "loss": 0.1605, "num_input_tokens_seen": 13426816, "step": 19945 }, { "epoch": 0.48738181907018785, "grad_norm": 8.337520599365234, "learning_rate": 1.94938193189036e-06, "loss": 0.0222, "num_input_tokens_seen": 13429824, "step": 19950 }, { "epoch": 0.48750396990203504, "grad_norm": 35.78062057495117, "learning_rate": 1.9498705232813798e-06, "loss": 0.1204, "num_input_tokens_seen": 13433088, "step": 19955 }, { "epoch": 0.4876261207338822, "grad_norm": 11.560881614685059, "learning_rate": 1.9503591146723994e-06, "loss": 0.1232, "num_input_tokens_seen": 13436416, "step": 19960 }, { "epoch": 0.48774827156572936, "grad_norm": 9.376676559448242, "learning_rate": 1.950847706063419e-06, "loss": 0.0839, "num_input_tokens_seen": 13439936, "step": 19965 }, { "epoch": 0.48787042239757655, "grad_norm": 0.3136023283004761, "learning_rate": 1.951336297454439e-06, "loss": 0.0839, "num_input_tokens_seen": 13443200, "step": 19970 }, { "epoch": 0.4879925732294237, "grad_norm": 11.248197555541992, "learning_rate": 1.9518248888454585e-06, "loss": 0.0941, "num_input_tokens_seen": 13446400, "step": 19975 }, { "epoch": 0.4881147240612709, "grad_norm": 2.7925679683685303, "learning_rate": 1.952313480236478e-06, "loss": 0.0274, "num_input_tokens_seen": 13450240, "step": 19980 }, { "epoch": 0.488236874893118, "grad_norm": 1.5199958086013794, "learning_rate": 1.952802071627498e-06, "loss": 0.0952, "num_input_tokens_seen": 13453376, "step": 19985 }, { "epoch": 0.4883590257249652, "grad_norm": 0.21349725127220154, "learning_rate": 1.9532906630185175e-06, "loss": 0.1549, "num_input_tokens_seen": 13456512, "step": 19990 }, { "epoch": 0.48848117655681234, "grad_norm": 28.426145553588867, "learning_rate": 1.953779254409537e-06, "loss": 0.1634, "num_input_tokens_seen": 13459712, "step": 19995 }, { "epoch": 0.4886033273886595, "grad_norm": 18.666841506958008, "learning_rate": 1.954267845800557e-06, "loss": 0.1312, "num_input_tokens_seen": 13462976, "step": 20000 }, { "epoch": 0.48872547822050666, "grad_norm": 1.9791568517684937, "learning_rate": 1.9547564371915766e-06, "loss": 0.159, "num_input_tokens_seen": 13466432, "step": 20005 }, { "epoch": 0.48884762905235385, "grad_norm": 1.8186808824539185, "learning_rate": 1.9552450285825963e-06, "loss": 0.1475, "num_input_tokens_seen": 13470016, "step": 20010 }, { "epoch": 0.48896977988420104, "grad_norm": 5.565760135650635, "learning_rate": 1.955733619973616e-06, "loss": 0.1175, "num_input_tokens_seen": 13474368, "step": 20015 }, { "epoch": 0.4890919307160482, "grad_norm": 40.79534149169922, "learning_rate": 1.9562222113646356e-06, "loss": 0.1288, "num_input_tokens_seen": 13477696, "step": 20020 }, { "epoch": 0.48921408154789536, "grad_norm": 17.945369720458984, "learning_rate": 1.9567108027556553e-06, "loss": 0.1885, "num_input_tokens_seen": 13481152, "step": 20025 }, { "epoch": 0.4893362323797425, "grad_norm": 11.218626022338867, "learning_rate": 1.957199394146675e-06, "loss": 0.0811, "num_input_tokens_seen": 13484608, "step": 20030 }, { "epoch": 0.4894583832115897, "grad_norm": 13.835955619812012, "learning_rate": 1.9576879855376947e-06, "loss": 0.1319, "num_input_tokens_seen": 13487808, "step": 20035 }, { "epoch": 0.4895805340434368, "grad_norm": 18.503908157348633, "learning_rate": 1.9581765769287143e-06, "loss": 0.136, "num_input_tokens_seen": 13491008, "step": 20040 }, { "epoch": 0.489702684875284, "grad_norm": 4.769625663757324, "learning_rate": 1.958665168319734e-06, "loss": 0.0896, "num_input_tokens_seen": 13494016, "step": 20045 }, { "epoch": 0.48982483570713115, "grad_norm": 6.78912353515625, "learning_rate": 1.959153759710754e-06, "loss": 0.1986, "num_input_tokens_seen": 13497600, "step": 20050 }, { "epoch": 0.48994698653897834, "grad_norm": 13.250788688659668, "learning_rate": 1.9596423511017734e-06, "loss": 0.1285, "num_input_tokens_seen": 13500672, "step": 20055 }, { "epoch": 0.49006913737082547, "grad_norm": 2.766676664352417, "learning_rate": 1.960130942492793e-06, "loss": 0.0348, "num_input_tokens_seen": 13503808, "step": 20060 }, { "epoch": 0.49019128820267266, "grad_norm": 21.56344985961914, "learning_rate": 1.9606195338838127e-06, "loss": 0.0635, "num_input_tokens_seen": 13507200, "step": 20065 }, { "epoch": 0.49031343903451985, "grad_norm": 1.6514256000518799, "learning_rate": 1.9611081252748324e-06, "loss": 0.1066, "num_input_tokens_seen": 13510464, "step": 20070 }, { "epoch": 0.490435589866367, "grad_norm": 2.146864652633667, "learning_rate": 1.961596716665852e-06, "loss": 0.0995, "num_input_tokens_seen": 13513472, "step": 20075 }, { "epoch": 0.4905577406982142, "grad_norm": 13.342808723449707, "learning_rate": 1.962085308056872e-06, "loss": 0.2648, "num_input_tokens_seen": 13516672, "step": 20080 }, { "epoch": 0.4906798915300613, "grad_norm": 17.412988662719727, "learning_rate": 1.9625738994478915e-06, "loss": 0.1242, "num_input_tokens_seen": 13519808, "step": 20085 }, { "epoch": 0.4908020423619085, "grad_norm": 57.60816955566406, "learning_rate": 1.963062490838911e-06, "loss": 0.0692, "num_input_tokens_seen": 13522816, "step": 20090 }, { "epoch": 0.49092419319375563, "grad_norm": 24.054847717285156, "learning_rate": 1.963551082229931e-06, "loss": 0.3476, "num_input_tokens_seen": 13526208, "step": 20095 }, { "epoch": 0.4910463440256028, "grad_norm": 24.362337112426758, "learning_rate": 1.964039673620951e-06, "loss": 0.3069, "num_input_tokens_seen": 13529664, "step": 20100 }, { "epoch": 0.49116849485744996, "grad_norm": 11.493230819702148, "learning_rate": 1.96452826501197e-06, "loss": 0.0861, "num_input_tokens_seen": 13533184, "step": 20105 }, { "epoch": 0.49129064568929715, "grad_norm": 29.886674880981445, "learning_rate": 1.96501685640299e-06, "loss": 0.1571, "num_input_tokens_seen": 13536704, "step": 20110 }, { "epoch": 0.49141279652114433, "grad_norm": 14.273351669311523, "learning_rate": 1.96550544779401e-06, "loss": 0.0438, "num_input_tokens_seen": 13540224, "step": 20115 }, { "epoch": 0.49153494735299147, "grad_norm": 19.529502868652344, "learning_rate": 1.9659940391850292e-06, "loss": 0.1241, "num_input_tokens_seen": 13543808, "step": 20120 }, { "epoch": 0.49165709818483866, "grad_norm": 8.674605369567871, "learning_rate": 1.966482630576049e-06, "loss": 0.0822, "num_input_tokens_seen": 13547264, "step": 20125 }, { "epoch": 0.4917792490166858, "grad_norm": 34.01549530029297, "learning_rate": 1.966971221967069e-06, "loss": 0.1256, "num_input_tokens_seen": 13550400, "step": 20130 }, { "epoch": 0.491901399848533, "grad_norm": 23.52373695373535, "learning_rate": 1.9674598133580887e-06, "loss": 0.0989, "num_input_tokens_seen": 13554048, "step": 20135 }, { "epoch": 0.4920235506803801, "grad_norm": 17.888687133789062, "learning_rate": 1.967948404749108e-06, "loss": 0.0538, "num_input_tokens_seen": 13557248, "step": 20140 }, { "epoch": 0.4921457015122273, "grad_norm": 31.919679641723633, "learning_rate": 1.968436996140128e-06, "loss": 0.1511, "num_input_tokens_seen": 13560960, "step": 20145 }, { "epoch": 0.49226785234407444, "grad_norm": 16.944351196289062, "learning_rate": 1.9689255875311478e-06, "loss": 0.1369, "num_input_tokens_seen": 13563776, "step": 20150 }, { "epoch": 0.49239000317592163, "grad_norm": 40.24680709838867, "learning_rate": 1.969414178922167e-06, "loss": 0.1426, "num_input_tokens_seen": 13566976, "step": 20155 }, { "epoch": 0.49251215400776877, "grad_norm": 22.800159454345703, "learning_rate": 1.969902770313187e-06, "loss": 0.0422, "num_input_tokens_seen": 13570304, "step": 20160 }, { "epoch": 0.49263430483961596, "grad_norm": 14.297897338867188, "learning_rate": 1.970391361704207e-06, "loss": 0.2077, "num_input_tokens_seen": 13574080, "step": 20165 }, { "epoch": 0.49275645567146314, "grad_norm": 28.413267135620117, "learning_rate": 1.9708799530952265e-06, "loss": 0.1064, "num_input_tokens_seen": 13577344, "step": 20170 }, { "epoch": 0.4928786065033103, "grad_norm": 20.27289581298828, "learning_rate": 1.971368544486246e-06, "loss": 0.0783, "num_input_tokens_seen": 13580480, "step": 20175 }, { "epoch": 0.49300075733515747, "grad_norm": 3.2449045181274414, "learning_rate": 1.971857135877266e-06, "loss": 0.1319, "num_input_tokens_seen": 13583424, "step": 20180 }, { "epoch": 0.4931229081670046, "grad_norm": 23.341785430908203, "learning_rate": 1.9723457272682855e-06, "loss": 0.1384, "num_input_tokens_seen": 13586624, "step": 20185 }, { "epoch": 0.4932450589988518, "grad_norm": 17.50263214111328, "learning_rate": 1.972834318659305e-06, "loss": 0.0931, "num_input_tokens_seen": 13589888, "step": 20190 }, { "epoch": 0.4933672098306989, "grad_norm": 11.289210319519043, "learning_rate": 1.973322910050325e-06, "loss": 0.1851, "num_input_tokens_seen": 13593280, "step": 20195 }, { "epoch": 0.4934893606625461, "grad_norm": 12.591771125793457, "learning_rate": 1.9738115014413446e-06, "loss": 0.0802, "num_input_tokens_seen": 13596608, "step": 20200 }, { "epoch": 0.49361151149439325, "grad_norm": 16.865806579589844, "learning_rate": 1.9743000928323642e-06, "loss": 0.0851, "num_input_tokens_seen": 13599936, "step": 20205 }, { "epoch": 0.49373366232624044, "grad_norm": 17.790006637573242, "learning_rate": 1.974788684223384e-06, "loss": 0.1579, "num_input_tokens_seen": 13603072, "step": 20210 }, { "epoch": 0.49385581315808763, "grad_norm": 31.209308624267578, "learning_rate": 1.9752772756144036e-06, "loss": 0.1716, "num_input_tokens_seen": 13606656, "step": 20215 }, { "epoch": 0.49397796398993477, "grad_norm": 25.8765926361084, "learning_rate": 1.9757658670054233e-06, "loss": 0.1867, "num_input_tokens_seen": 13610048, "step": 20220 }, { "epoch": 0.49410011482178195, "grad_norm": 17.896934509277344, "learning_rate": 1.976254458396443e-06, "loss": 0.095, "num_input_tokens_seen": 13613504, "step": 20225 }, { "epoch": 0.4942222656536291, "grad_norm": 2.6187055110931396, "learning_rate": 1.9767430497874627e-06, "loss": 0.1053, "num_input_tokens_seen": 13617472, "step": 20230 }, { "epoch": 0.4943444164854763, "grad_norm": 14.588163375854492, "learning_rate": 1.9772316411784823e-06, "loss": 0.0849, "num_input_tokens_seen": 13621312, "step": 20235 }, { "epoch": 0.4944665673173234, "grad_norm": 4.078947067260742, "learning_rate": 1.977720232569502e-06, "loss": 0.0733, "num_input_tokens_seen": 13624640, "step": 20240 }, { "epoch": 0.4945887181491706, "grad_norm": 20.883548736572266, "learning_rate": 1.9782088239605217e-06, "loss": 0.1103, "num_input_tokens_seen": 13628032, "step": 20245 }, { "epoch": 0.49471086898101774, "grad_norm": 6.015990257263184, "learning_rate": 1.9786974153515414e-06, "loss": 0.153, "num_input_tokens_seen": 13631296, "step": 20250 }, { "epoch": 0.4948330198128649, "grad_norm": 2.3922202587127686, "learning_rate": 1.979186006742561e-06, "loss": 0.0764, "num_input_tokens_seen": 13634816, "step": 20255 }, { "epoch": 0.4949551706447121, "grad_norm": 27.54640007019043, "learning_rate": 1.9796745981335807e-06, "loss": 0.075, "num_input_tokens_seen": 13637952, "step": 20260 }, { "epoch": 0.49507732147655925, "grad_norm": 8.645411491394043, "learning_rate": 1.9801631895246004e-06, "loss": 0.1036, "num_input_tokens_seen": 13641344, "step": 20265 }, { "epoch": 0.49519947230840644, "grad_norm": 15.165426254272461, "learning_rate": 1.98065178091562e-06, "loss": 0.0937, "num_input_tokens_seen": 13644992, "step": 20270 }, { "epoch": 0.4953216231402536, "grad_norm": 23.38542938232422, "learning_rate": 1.9811403723066398e-06, "loss": 0.0303, "num_input_tokens_seen": 13648256, "step": 20275 }, { "epoch": 0.49544377397210076, "grad_norm": 2.478635311126709, "learning_rate": 1.9816289636976595e-06, "loss": 0.0959, "num_input_tokens_seen": 13651776, "step": 20280 }, { "epoch": 0.4955659248039479, "grad_norm": 27.348520278930664, "learning_rate": 1.982117555088679e-06, "loss": 0.1028, "num_input_tokens_seen": 13655232, "step": 20285 }, { "epoch": 0.4956880756357951, "grad_norm": 33.220062255859375, "learning_rate": 1.982606146479699e-06, "loss": 0.1079, "num_input_tokens_seen": 13659264, "step": 20290 }, { "epoch": 0.4958102264676422, "grad_norm": 12.448121070861816, "learning_rate": 1.9830947378707185e-06, "loss": 0.11, "num_input_tokens_seen": 13663232, "step": 20295 }, { "epoch": 0.4959323772994894, "grad_norm": 14.38774299621582, "learning_rate": 1.983583329261738e-06, "loss": 0.1488, "num_input_tokens_seen": 13666496, "step": 20300 }, { "epoch": 0.49605452813133655, "grad_norm": 16.634559631347656, "learning_rate": 1.984071920652758e-06, "loss": 0.1363, "num_input_tokens_seen": 13669952, "step": 20305 }, { "epoch": 0.49617667896318374, "grad_norm": 0.1701052337884903, "learning_rate": 1.9845605120437776e-06, "loss": 0.0476, "num_input_tokens_seen": 13673984, "step": 20310 }, { "epoch": 0.4962988297950309, "grad_norm": 14.704794883728027, "learning_rate": 1.9850491034347972e-06, "loss": 0.0905, "num_input_tokens_seen": 13677696, "step": 20315 }, { "epoch": 0.49642098062687806, "grad_norm": 4.579434871673584, "learning_rate": 1.985537694825817e-06, "loss": 0.1117, "num_input_tokens_seen": 13680960, "step": 20320 }, { "epoch": 0.49654313145872525, "grad_norm": 3.7386457920074463, "learning_rate": 1.986026286216837e-06, "loss": 0.0668, "num_input_tokens_seen": 13684480, "step": 20325 }, { "epoch": 0.4966652822905724, "grad_norm": 5.1689276695251465, "learning_rate": 1.9865148776078563e-06, "loss": 0.0343, "num_input_tokens_seen": 13687872, "step": 20330 }, { "epoch": 0.4967874331224196, "grad_norm": 12.612316131591797, "learning_rate": 1.987003468998876e-06, "loss": 0.0929, "num_input_tokens_seen": 13691072, "step": 20335 }, { "epoch": 0.4969095839542667, "grad_norm": 0.49819138646125793, "learning_rate": 1.987492060389896e-06, "loss": 0.0431, "num_input_tokens_seen": 13694720, "step": 20340 }, { "epoch": 0.4970317347861139, "grad_norm": 46.28751754760742, "learning_rate": 1.9879806517809153e-06, "loss": 0.1056, "num_input_tokens_seen": 13697984, "step": 20345 }, { "epoch": 0.49715388561796103, "grad_norm": 33.69471740722656, "learning_rate": 1.988469243171935e-06, "loss": 0.1068, "num_input_tokens_seen": 13701184, "step": 20350 }, { "epoch": 0.4972760364498082, "grad_norm": 30.221221923828125, "learning_rate": 1.988957834562955e-06, "loss": 0.1515, "num_input_tokens_seen": 13704704, "step": 20355 }, { "epoch": 0.4973981872816554, "grad_norm": 3.9779465198516846, "learning_rate": 1.9894464259539744e-06, "loss": 0.1188, "num_input_tokens_seen": 13708096, "step": 20360 }, { "epoch": 0.49752033811350255, "grad_norm": 14.227984428405762, "learning_rate": 1.989935017344994e-06, "loss": 0.1672, "num_input_tokens_seen": 13711488, "step": 20365 }, { "epoch": 0.49764248894534974, "grad_norm": 18.283992767333984, "learning_rate": 1.990423608736014e-06, "loss": 0.087, "num_input_tokens_seen": 13714944, "step": 20370 }, { "epoch": 0.49776463977719687, "grad_norm": 13.039130210876465, "learning_rate": 1.990912200127034e-06, "loss": 0.1452, "num_input_tokens_seen": 13718656, "step": 20375 }, { "epoch": 0.49788679060904406, "grad_norm": 14.41788387298584, "learning_rate": 1.991400791518053e-06, "loss": 0.2331, "num_input_tokens_seen": 13721472, "step": 20380 }, { "epoch": 0.4980089414408912, "grad_norm": 16.680484771728516, "learning_rate": 1.991889382909073e-06, "loss": 0.0584, "num_input_tokens_seen": 13725248, "step": 20385 }, { "epoch": 0.4981310922727384, "grad_norm": 33.19993209838867, "learning_rate": 1.992377974300093e-06, "loss": 0.1784, "num_input_tokens_seen": 13729088, "step": 20390 }, { "epoch": 0.4982532431045855, "grad_norm": 3.5821926593780518, "learning_rate": 1.992866565691112e-06, "loss": 0.1174, "num_input_tokens_seen": 13732608, "step": 20395 }, { "epoch": 0.4983753939364327, "grad_norm": 22.986421585083008, "learning_rate": 1.9933551570821322e-06, "loss": 0.2099, "num_input_tokens_seen": 13736128, "step": 20400 }, { "epoch": 0.4984975447682799, "grad_norm": 15.068368911743164, "learning_rate": 1.993843748473152e-06, "loss": 0.0965, "num_input_tokens_seen": 13739456, "step": 20405 }, { "epoch": 0.49861969560012703, "grad_norm": 19.621456146240234, "learning_rate": 1.9943323398641716e-06, "loss": 0.1392, "num_input_tokens_seen": 13742528, "step": 20410 }, { "epoch": 0.4987418464319742, "grad_norm": 4.565086841583252, "learning_rate": 1.9948209312551913e-06, "loss": 0.0692, "num_input_tokens_seen": 13745600, "step": 20415 }, { "epoch": 0.49886399726382136, "grad_norm": 7.886087417602539, "learning_rate": 1.995309522646211e-06, "loss": 0.0727, "num_input_tokens_seen": 13748864, "step": 20420 }, { "epoch": 0.49898614809566855, "grad_norm": 26.047780990600586, "learning_rate": 1.9957981140372307e-06, "loss": 0.0934, "num_input_tokens_seen": 13753472, "step": 20425 }, { "epoch": 0.4991082989275157, "grad_norm": 27.23785972595215, "learning_rate": 1.9962867054282503e-06, "loss": 0.0655, "num_input_tokens_seen": 13756352, "step": 20430 }, { "epoch": 0.49923044975936287, "grad_norm": 23.27521324157715, "learning_rate": 1.99677529681927e-06, "loss": 0.1874, "num_input_tokens_seen": 13759872, "step": 20435 }, { "epoch": 0.49935260059121, "grad_norm": 24.3862361907959, "learning_rate": 1.9972638882102897e-06, "loss": 0.1828, "num_input_tokens_seen": 13763008, "step": 20440 }, { "epoch": 0.4994747514230572, "grad_norm": 0.43370214104652405, "learning_rate": 1.9977524796013094e-06, "loss": 0.2337, "num_input_tokens_seen": 13766144, "step": 20445 }, { "epoch": 0.49959690225490433, "grad_norm": 23.112834930419922, "learning_rate": 1.998241070992329e-06, "loss": 0.0943, "num_input_tokens_seen": 13769408, "step": 20450 }, { "epoch": 0.4997190530867515, "grad_norm": 16.897188186645508, "learning_rate": 1.9987296623833487e-06, "loss": 0.1177, "num_input_tokens_seen": 13772608, "step": 20455 }, { "epoch": 0.4998412039185987, "grad_norm": 18.243366241455078, "learning_rate": 1.9992182537743684e-06, "loss": 0.1733, "num_input_tokens_seen": 13775680, "step": 20460 }, { "epoch": 0.49996335475044584, "grad_norm": 18.259639739990234, "learning_rate": 1.999706845165388e-06, "loss": 0.1021, "num_input_tokens_seen": 13778880, "step": 20465 }, { "epoch": 0.5000366452495542, "eval_loss": 0.11281616985797882, "eval_runtime": 47.6807, "eval_samples_per_second": 763.096, "eval_steps_per_second": 95.405, "num_input_tokens_seen": 13780928, "step": 20468 }, { "epoch": 0.500085505582293, "grad_norm": 19.301481246948242, "learning_rate": 1.9999999994182183e-06, "loss": 0.1982, "num_input_tokens_seen": 13782144, "step": 20470 }, { "epoch": 0.5002076564141402, "grad_norm": 4.169570446014404, "learning_rate": 1.9999999928731765e-06, "loss": 0.0798, "num_input_tokens_seen": 13785600, "step": 20475 }, { "epoch": 0.5003298072459873, "grad_norm": 5.7112860679626465, "learning_rate": 1.9999999790558656e-06, "loss": 0.1128, "num_input_tokens_seen": 13788800, "step": 20480 }, { "epoch": 0.5004519580778345, "grad_norm": 21.82765769958496, "learning_rate": 1.9999999579662855e-06, "loss": 0.1589, "num_input_tokens_seen": 13792256, "step": 20485 }, { "epoch": 0.5005741089096817, "grad_norm": 17.94799041748047, "learning_rate": 1.999999929604437e-06, "loss": 0.1436, "num_input_tokens_seen": 13795264, "step": 20490 }, { "epoch": 0.5006962597415289, "grad_norm": 3.4309074878692627, "learning_rate": 1.99999989397032e-06, "loss": 0.0407, "num_input_tokens_seen": 13799040, "step": 20495 }, { "epoch": 0.500818410573376, "grad_norm": 11.802199363708496, "learning_rate": 1.9999998510639352e-06, "loss": 0.0523, "num_input_tokens_seen": 13801984, "step": 20500 }, { "epoch": 0.5009405614052231, "grad_norm": 2.4828591346740723, "learning_rate": 1.999999800885282e-06, "loss": 0.0662, "num_input_tokens_seen": 13805440, "step": 20505 }, { "epoch": 0.5010627122370703, "grad_norm": 15.110939979553223, "learning_rate": 1.9999997434343614e-06, "loss": 0.1458, "num_input_tokens_seen": 13808384, "step": 20510 }, { "epoch": 0.5011848630689175, "grad_norm": 4.889379501342773, "learning_rate": 1.9999996787111737e-06, "loss": 0.122, "num_input_tokens_seen": 13811456, "step": 20515 }, { "epoch": 0.5013070139007647, "grad_norm": 13.039811134338379, "learning_rate": 1.9999996067157195e-06, "loss": 0.115, "num_input_tokens_seen": 13814464, "step": 20520 }, { "epoch": 0.5014291647326118, "grad_norm": 18.891918182373047, "learning_rate": 1.999999527447999e-06, "loss": 0.0875, "num_input_tokens_seen": 13817920, "step": 20525 }, { "epoch": 0.501551315564459, "grad_norm": 19.57078742980957, "learning_rate": 1.9999994409080134e-06, "loss": 0.0648, "num_input_tokens_seen": 13820992, "step": 20530 }, { "epoch": 0.5016734663963062, "grad_norm": 25.669971466064453, "learning_rate": 1.9999993470957628e-06, "loss": 0.1201, "num_input_tokens_seen": 13824576, "step": 20535 }, { "epoch": 0.5017956172281534, "grad_norm": 0.14215537905693054, "learning_rate": 1.9999992460112477e-06, "loss": 0.0275, "num_input_tokens_seen": 13827520, "step": 20540 }, { "epoch": 0.5019177680600004, "grad_norm": 6.8776092529296875, "learning_rate": 1.999999137654469e-06, "loss": 0.0616, "num_input_tokens_seen": 13830528, "step": 20545 }, { "epoch": 0.5020399188918476, "grad_norm": 16.818981170654297, "learning_rate": 1.999999022025428e-06, "loss": 0.0757, "num_input_tokens_seen": 13833984, "step": 20550 }, { "epoch": 0.5021620697236948, "grad_norm": 6.901256084442139, "learning_rate": 1.999998899124125e-06, "loss": 0.2078, "num_input_tokens_seen": 13837120, "step": 20555 }, { "epoch": 0.502284220555542, "grad_norm": 1.4620168209075928, "learning_rate": 1.999998768950561e-06, "loss": 0.0793, "num_input_tokens_seen": 13840512, "step": 20560 }, { "epoch": 0.5024063713873892, "grad_norm": 9.964655876159668, "learning_rate": 1.999998631504737e-06, "loss": 0.1634, "num_input_tokens_seen": 13844096, "step": 20565 }, { "epoch": 0.5025285222192363, "grad_norm": 1.1515722274780273, "learning_rate": 1.9999984867866536e-06, "loss": 0.1221, "num_input_tokens_seen": 13847168, "step": 20570 }, { "epoch": 0.5026506730510835, "grad_norm": 25.116619110107422, "learning_rate": 1.999998334796313e-06, "loss": 0.2725, "num_input_tokens_seen": 13850496, "step": 20575 }, { "epoch": 0.5027728238829307, "grad_norm": 5.306918144226074, "learning_rate": 1.9999981755337148e-06, "loss": 0.2294, "num_input_tokens_seen": 13853568, "step": 20580 }, { "epoch": 0.5028949747147778, "grad_norm": 7.240964889526367, "learning_rate": 1.9999980089988614e-06, "loss": 0.1153, "num_input_tokens_seen": 13857216, "step": 20585 }, { "epoch": 0.5030171255466249, "grad_norm": 21.508577346801758, "learning_rate": 1.9999978351917536e-06, "loss": 0.0272, "num_input_tokens_seen": 13860608, "step": 20590 }, { "epoch": 0.5031392763784721, "grad_norm": 0.7625007033348083, "learning_rate": 1.999997654112392e-06, "loss": 0.0345, "num_input_tokens_seen": 13864448, "step": 20595 }, { "epoch": 0.5032614272103193, "grad_norm": 24.227930068969727, "learning_rate": 1.9999974657607787e-06, "loss": 0.0895, "num_input_tokens_seen": 13867904, "step": 20600 }, { "epoch": 0.5033835780421665, "grad_norm": 46.96719741821289, "learning_rate": 1.999997270136915e-06, "loss": 0.2186, "num_input_tokens_seen": 13871744, "step": 20605 }, { "epoch": 0.5035057288740137, "grad_norm": 24.50628662109375, "learning_rate": 1.9999970672408025e-06, "loss": 0.0855, "num_input_tokens_seen": 13875136, "step": 20610 }, { "epoch": 0.5036278797058608, "grad_norm": 8.925004005432129, "learning_rate": 1.999996857072442e-06, "loss": 0.1191, "num_input_tokens_seen": 13878464, "step": 20615 }, { "epoch": 0.503750030537708, "grad_norm": 24.1169490814209, "learning_rate": 1.9999966396318354e-06, "loss": 0.1111, "num_input_tokens_seen": 13881856, "step": 20620 }, { "epoch": 0.5038721813695551, "grad_norm": 8.494999885559082, "learning_rate": 1.9999964149189844e-06, "loss": 0.0529, "num_input_tokens_seen": 13885248, "step": 20625 }, { "epoch": 0.5039943322014023, "grad_norm": 12.20933723449707, "learning_rate": 1.99999618293389e-06, "loss": 0.0501, "num_input_tokens_seen": 13888640, "step": 20630 }, { "epoch": 0.5041164830332494, "grad_norm": 22.99600601196289, "learning_rate": 1.999995943676555e-06, "loss": 0.2339, "num_input_tokens_seen": 13891712, "step": 20635 }, { "epoch": 0.5042386338650966, "grad_norm": 0.8674524426460266, "learning_rate": 1.9999956971469804e-06, "loss": 0.0618, "num_input_tokens_seen": 13895360, "step": 20640 }, { "epoch": 0.5043607846969438, "grad_norm": 3.513089656829834, "learning_rate": 1.9999954433451676e-06, "loss": 0.1164, "num_input_tokens_seen": 13898240, "step": 20645 }, { "epoch": 0.504482935528791, "grad_norm": 0.6792322993278503, "learning_rate": 1.9999951822711196e-06, "loss": 0.1748, "num_input_tokens_seen": 13901888, "step": 20650 }, { "epoch": 0.5046050863606382, "grad_norm": 10.531157493591309, "learning_rate": 1.9999949139248376e-06, "loss": 0.1279, "num_input_tokens_seen": 13904896, "step": 20655 }, { "epoch": 0.5047272371924852, "grad_norm": 16.625782012939453, "learning_rate": 1.9999946383063233e-06, "loss": 0.0989, "num_input_tokens_seen": 13907584, "step": 20660 }, { "epoch": 0.5048493880243324, "grad_norm": 39.52769088745117, "learning_rate": 1.9999943554155792e-06, "loss": 0.0806, "num_input_tokens_seen": 13911104, "step": 20665 }, { "epoch": 0.5049715388561796, "grad_norm": 11.926505088806152, "learning_rate": 1.999994065252607e-06, "loss": 0.1024, "num_input_tokens_seen": 13914496, "step": 20670 }, { "epoch": 0.5050936896880268, "grad_norm": 18.042837142944336, "learning_rate": 1.9999937678174095e-06, "loss": 0.0824, "num_input_tokens_seen": 13917632, "step": 20675 }, { "epoch": 0.5052158405198739, "grad_norm": 8.337514877319336, "learning_rate": 1.999993463109988e-06, "loss": 0.0504, "num_input_tokens_seen": 13921280, "step": 20680 }, { "epoch": 0.5053379913517211, "grad_norm": 25.05613136291504, "learning_rate": 1.9999931511303454e-06, "loss": 0.089, "num_input_tokens_seen": 13924928, "step": 20685 }, { "epoch": 0.5054601421835683, "grad_norm": 28.33561897277832, "learning_rate": 1.999992831878483e-06, "loss": 0.0856, "num_input_tokens_seen": 13927744, "step": 20690 }, { "epoch": 0.5055822930154155, "grad_norm": 24.654722213745117, "learning_rate": 1.9999925053544042e-06, "loss": 0.2061, "num_input_tokens_seen": 13930944, "step": 20695 }, { "epoch": 0.5057044438472627, "grad_norm": 10.103041648864746, "learning_rate": 1.999992171558111e-06, "loss": 0.1686, "num_input_tokens_seen": 13933952, "step": 20700 }, { "epoch": 0.5058265946791097, "grad_norm": 18.388208389282227, "learning_rate": 1.9999918304896055e-06, "loss": 0.1257, "num_input_tokens_seen": 13937088, "step": 20705 }, { "epoch": 0.5059487455109569, "grad_norm": 0.4605598747730255, "learning_rate": 1.9999914821488907e-06, "loss": 0.0984, "num_input_tokens_seen": 13940096, "step": 20710 }, { "epoch": 0.5060708963428041, "grad_norm": 27.575864791870117, "learning_rate": 1.9999911265359687e-06, "loss": 0.153, "num_input_tokens_seen": 13943360, "step": 20715 }, { "epoch": 0.5061930471746513, "grad_norm": 5.479243755340576, "learning_rate": 1.9999907636508423e-06, "loss": 0.0386, "num_input_tokens_seen": 13947136, "step": 20720 }, { "epoch": 0.5063151980064984, "grad_norm": 15.30483341217041, "learning_rate": 1.999990393493514e-06, "loss": 0.0095, "num_input_tokens_seen": 13950784, "step": 20725 }, { "epoch": 0.5064373488383456, "grad_norm": 18.604610443115234, "learning_rate": 1.9999900160639867e-06, "loss": 0.0739, "num_input_tokens_seen": 13954048, "step": 20730 }, { "epoch": 0.5065594996701928, "grad_norm": 0.8294208645820618, "learning_rate": 1.999989631362263e-06, "loss": 0.0863, "num_input_tokens_seen": 13957568, "step": 20735 }, { "epoch": 0.50668165050204, "grad_norm": 6.405564785003662, "learning_rate": 1.999989239388346e-06, "loss": 0.1229, "num_input_tokens_seen": 13960896, "step": 20740 }, { "epoch": 0.5068038013338871, "grad_norm": 141.67640686035156, "learning_rate": 1.999988840142238e-06, "loss": 0.1095, "num_input_tokens_seen": 13964096, "step": 20745 }, { "epoch": 0.5069259521657342, "grad_norm": 18.958993911743164, "learning_rate": 1.999988433623942e-06, "loss": 0.18, "num_input_tokens_seen": 13967488, "step": 20750 }, { "epoch": 0.5070481029975814, "grad_norm": 36.95416259765625, "learning_rate": 1.9999880198334615e-06, "loss": 0.22, "num_input_tokens_seen": 13970496, "step": 20755 }, { "epoch": 0.5071702538294286, "grad_norm": 0.5629600286483765, "learning_rate": 1.999987598770799e-06, "loss": 0.1957, "num_input_tokens_seen": 13973824, "step": 20760 }, { "epoch": 0.5072924046612758, "grad_norm": 40.47126007080078, "learning_rate": 1.999987170435958e-06, "loss": 0.2626, "num_input_tokens_seen": 13976768, "step": 20765 }, { "epoch": 0.5074145554931229, "grad_norm": 15.845697402954102, "learning_rate": 1.999986734828941e-06, "loss": 0.1409, "num_input_tokens_seen": 13979712, "step": 20770 }, { "epoch": 0.50753670632497, "grad_norm": 19.21963119506836, "learning_rate": 1.9999862919497516e-06, "loss": 0.2365, "num_input_tokens_seen": 13982848, "step": 20775 }, { "epoch": 0.5076588571568172, "grad_norm": 24.437908172607422, "learning_rate": 1.9999858417983926e-06, "loss": 0.1471, "num_input_tokens_seen": 13985984, "step": 20780 }, { "epoch": 0.5077810079886644, "grad_norm": 23.66278839111328, "learning_rate": 1.999985384374868e-06, "loss": 0.1118, "num_input_tokens_seen": 13989632, "step": 20785 }, { "epoch": 0.5079031588205115, "grad_norm": 28.34226417541504, "learning_rate": 1.9999849196791806e-06, "loss": 0.0515, "num_input_tokens_seen": 13993280, "step": 20790 }, { "epoch": 0.5080253096523587, "grad_norm": 2.3390464782714844, "learning_rate": 1.999984447711334e-06, "loss": 0.0598, "num_input_tokens_seen": 13996544, "step": 20795 }, { "epoch": 0.5081474604842059, "grad_norm": 5.47605037689209, "learning_rate": 1.9999839684713317e-06, "loss": 0.1112, "num_input_tokens_seen": 13999680, "step": 20800 }, { "epoch": 0.5082696113160531, "grad_norm": 0.12189412862062454, "learning_rate": 1.9999834819591767e-06, "loss": 0.1447, "num_input_tokens_seen": 14002752, "step": 20805 }, { "epoch": 0.5083917621479003, "grad_norm": 29.23406982421875, "learning_rate": 1.9999829881748725e-06, "loss": 0.164, "num_input_tokens_seen": 14005760, "step": 20810 }, { "epoch": 0.5085139129797474, "grad_norm": 21.18992805480957, "learning_rate": 1.9999824871184237e-06, "loss": 0.1718, "num_input_tokens_seen": 14009088, "step": 20815 }, { "epoch": 0.5086360638115945, "grad_norm": 20.333215713500977, "learning_rate": 1.999981978789833e-06, "loss": 0.085, "num_input_tokens_seen": 14012544, "step": 20820 }, { "epoch": 0.5087582146434417, "grad_norm": 11.833674430847168, "learning_rate": 1.999981463189105e-06, "loss": 0.1174, "num_input_tokens_seen": 14016064, "step": 20825 }, { "epoch": 0.5088803654752889, "grad_norm": 33.08906555175781, "learning_rate": 1.999980940316242e-06, "loss": 0.0498, "num_input_tokens_seen": 14019520, "step": 20830 }, { "epoch": 0.509002516307136, "grad_norm": 24.263809204101562, "learning_rate": 1.9999804101712487e-06, "loss": 0.1354, "num_input_tokens_seen": 14022528, "step": 20835 }, { "epoch": 0.5091246671389832, "grad_norm": 0.9982167482376099, "learning_rate": 1.9999798727541293e-06, "loss": 0.1094, "num_input_tokens_seen": 14025856, "step": 20840 }, { "epoch": 0.5092468179708304, "grad_norm": 15.391318321228027, "learning_rate": 1.9999793280648873e-06, "loss": 0.1613, "num_input_tokens_seen": 14029056, "step": 20845 }, { "epoch": 0.5093689688026776, "grad_norm": 4.251640319824219, "learning_rate": 1.9999787761035265e-06, "loss": 0.0166, "num_input_tokens_seen": 14032448, "step": 20850 }, { "epoch": 0.5094911196345248, "grad_norm": 17.91326904296875, "learning_rate": 1.9999782168700507e-06, "loss": 0.1246, "num_input_tokens_seen": 14035712, "step": 20855 }, { "epoch": 0.5096132704663718, "grad_norm": 8.680523872375488, "learning_rate": 1.999977650364465e-06, "loss": 0.1157, "num_input_tokens_seen": 14039168, "step": 20860 }, { "epoch": 0.509735421298219, "grad_norm": 3.187202215194702, "learning_rate": 1.9999770765867725e-06, "loss": 0.0137, "num_input_tokens_seen": 14042624, "step": 20865 }, { "epoch": 0.5098575721300662, "grad_norm": 10.735001564025879, "learning_rate": 1.9999764955369777e-06, "loss": 0.2449, "num_input_tokens_seen": 14046016, "step": 20870 }, { "epoch": 0.5099797229619134, "grad_norm": 6.723996639251709, "learning_rate": 1.9999759072150852e-06, "loss": 0.1225, "num_input_tokens_seen": 14049472, "step": 20875 }, { "epoch": 0.5101018737937605, "grad_norm": 18.004169464111328, "learning_rate": 1.9999753116210986e-06, "loss": 0.117, "num_input_tokens_seen": 14053056, "step": 20880 }, { "epoch": 0.5102240246256077, "grad_norm": 21.24515151977539, "learning_rate": 1.999974708755023e-06, "loss": 0.148, "num_input_tokens_seen": 14056512, "step": 20885 }, { "epoch": 0.5103461754574549, "grad_norm": 21.89750099182129, "learning_rate": 1.9999740986168617e-06, "loss": 0.1261, "num_input_tokens_seen": 14059904, "step": 20890 }, { "epoch": 0.510468326289302, "grad_norm": 16.346757888793945, "learning_rate": 1.9999734812066203e-06, "loss": 0.0516, "num_input_tokens_seen": 14063744, "step": 20895 }, { "epoch": 0.5105904771211492, "grad_norm": 19.123580932617188, "learning_rate": 1.9999728565243025e-06, "loss": 0.1038, "num_input_tokens_seen": 14067264, "step": 20900 }, { "epoch": 0.5107126279529963, "grad_norm": 43.41123580932617, "learning_rate": 1.9999722245699134e-06, "loss": 0.1236, "num_input_tokens_seen": 14070656, "step": 20905 }, { "epoch": 0.5108347787848435, "grad_norm": 16.726806640625, "learning_rate": 1.999971585343457e-06, "loss": 0.1776, "num_input_tokens_seen": 14074048, "step": 20910 }, { "epoch": 0.5109569296166907, "grad_norm": 0.432910293340683, "learning_rate": 1.999970938844939e-06, "loss": 0.0515, "num_input_tokens_seen": 14077312, "step": 20915 }, { "epoch": 0.5110790804485379, "grad_norm": 28.468435287475586, "learning_rate": 1.9999702850743623e-06, "loss": 0.1006, "num_input_tokens_seen": 14080192, "step": 20920 }, { "epoch": 0.511201231280385, "grad_norm": 24.04837417602539, "learning_rate": 1.9999696240317335e-06, "loss": 0.085, "num_input_tokens_seen": 14083392, "step": 20925 }, { "epoch": 0.5113233821122322, "grad_norm": 43.350826263427734, "learning_rate": 1.9999689557170562e-06, "loss": 0.1012, "num_input_tokens_seen": 14087296, "step": 20930 }, { "epoch": 0.5114455329440794, "grad_norm": 43.21065139770508, "learning_rate": 1.999968280130336e-06, "loss": 0.0761, "num_input_tokens_seen": 14090624, "step": 20935 }, { "epoch": 0.5115676837759265, "grad_norm": 8.68442440032959, "learning_rate": 1.9999675972715774e-06, "loss": 0.0917, "num_input_tokens_seen": 14094272, "step": 20940 }, { "epoch": 0.5116898346077737, "grad_norm": 10.342585563659668, "learning_rate": 1.9999669071407856e-06, "loss": 0.2236, "num_input_tokens_seen": 14097280, "step": 20945 }, { "epoch": 0.5118119854396208, "grad_norm": 0.07209543883800507, "learning_rate": 1.9999662097379652e-06, "loss": 0.0352, "num_input_tokens_seen": 14100480, "step": 20950 }, { "epoch": 0.511934136271468, "grad_norm": 49.62693405151367, "learning_rate": 1.9999655050631218e-06, "loss": 0.1376, "num_input_tokens_seen": 14103680, "step": 20955 }, { "epoch": 0.5120562871033152, "grad_norm": 18.054792404174805, "learning_rate": 1.99996479311626e-06, "loss": 0.3091, "num_input_tokens_seen": 14106944, "step": 20960 }, { "epoch": 0.5121784379351624, "grad_norm": 4.0170722007751465, "learning_rate": 1.9999640738973856e-06, "loss": 0.2194, "num_input_tokens_seen": 14110336, "step": 20965 }, { "epoch": 0.5123005887670095, "grad_norm": 8.806989669799805, "learning_rate": 1.9999633474065034e-06, "loss": 0.1982, "num_input_tokens_seen": 14113728, "step": 20970 }, { "epoch": 0.5124227395988566, "grad_norm": 2.0078670978546143, "learning_rate": 1.9999626136436185e-06, "loss": 0.0321, "num_input_tokens_seen": 14116864, "step": 20975 }, { "epoch": 0.5125448904307038, "grad_norm": 13.33098316192627, "learning_rate": 1.9999618726087373e-06, "loss": 0.1371, "num_input_tokens_seen": 14120000, "step": 20980 }, { "epoch": 0.512667041262551, "grad_norm": 2.074002742767334, "learning_rate": 1.999961124301864e-06, "loss": 0.0286, "num_input_tokens_seen": 14123200, "step": 20985 }, { "epoch": 0.5127891920943982, "grad_norm": 3.3571574687957764, "learning_rate": 1.9999603687230037e-06, "loss": 0.1198, "num_input_tokens_seen": 14126720, "step": 20990 }, { "epoch": 0.5129113429262453, "grad_norm": 28.249244689941406, "learning_rate": 1.9999596058721634e-06, "loss": 0.1523, "num_input_tokens_seen": 14130048, "step": 20995 }, { "epoch": 0.5130334937580925, "grad_norm": 23.86388397216797, "learning_rate": 1.999958835749348e-06, "loss": 0.1031, "num_input_tokens_seen": 14133184, "step": 21000 }, { "epoch": 0.5131556445899397, "grad_norm": 23.31868553161621, "learning_rate": 1.9999580583545625e-06, "loss": 0.1792, "num_input_tokens_seen": 14136768, "step": 21005 }, { "epoch": 0.5132777954217869, "grad_norm": 0.21132534742355347, "learning_rate": 1.9999572736878134e-06, "loss": 0.0683, "num_input_tokens_seen": 14140160, "step": 21010 }, { "epoch": 0.5133999462536339, "grad_norm": 23.453798294067383, "learning_rate": 1.999956481749106e-06, "loss": 0.1488, "num_input_tokens_seen": 14143744, "step": 21015 }, { "epoch": 0.5135220970854811, "grad_norm": 14.550957679748535, "learning_rate": 1.999955682538446e-06, "loss": 0.0436, "num_input_tokens_seen": 14147072, "step": 21020 }, { "epoch": 0.5136442479173283, "grad_norm": 4.155126571655273, "learning_rate": 1.9999548760558395e-06, "loss": 0.0647, "num_input_tokens_seen": 14149952, "step": 21025 }, { "epoch": 0.5137663987491755, "grad_norm": 10.049470901489258, "learning_rate": 1.9999540623012917e-06, "loss": 0.1996, "num_input_tokens_seen": 14153216, "step": 21030 }, { "epoch": 0.5138885495810227, "grad_norm": 0.2116897702217102, "learning_rate": 1.9999532412748093e-06, "loss": 0.0563, "num_input_tokens_seen": 14156352, "step": 21035 }, { "epoch": 0.5140107004128698, "grad_norm": 52.188636779785156, "learning_rate": 1.9999524129763983e-06, "loss": 0.1079, "num_input_tokens_seen": 14159808, "step": 21040 }, { "epoch": 0.514132851244717, "grad_norm": 27.014354705810547, "learning_rate": 1.999951577406064e-06, "loss": 0.1015, "num_input_tokens_seen": 14163328, "step": 21045 }, { "epoch": 0.5142550020765642, "grad_norm": 11.840407371520996, "learning_rate": 1.9999507345638132e-06, "loss": 0.1002, "num_input_tokens_seen": 14166976, "step": 21050 }, { "epoch": 0.5143771529084114, "grad_norm": 40.124752044677734, "learning_rate": 1.9999498844496515e-06, "loss": 0.1809, "num_input_tokens_seen": 14170048, "step": 21055 }, { "epoch": 0.5144993037402584, "grad_norm": 60.805362701416016, "learning_rate": 1.999949027063585e-06, "loss": 0.2079, "num_input_tokens_seen": 14173312, "step": 21060 }, { "epoch": 0.5146214545721056, "grad_norm": 73.4646224975586, "learning_rate": 1.999948162405621e-06, "loss": 0.2025, "num_input_tokens_seen": 14176832, "step": 21065 }, { "epoch": 0.5147436054039528, "grad_norm": 39.85783386230469, "learning_rate": 1.9999472904757644e-06, "loss": 0.1187, "num_input_tokens_seen": 14179712, "step": 21070 }, { "epoch": 0.5148657562358, "grad_norm": 0.24146099388599396, "learning_rate": 1.999946411274022e-06, "loss": 0.0518, "num_input_tokens_seen": 14182912, "step": 21075 }, { "epoch": 0.5149879070676471, "grad_norm": 27.32053565979004, "learning_rate": 1.9999455248004005e-06, "loss": 0.057, "num_input_tokens_seen": 14185984, "step": 21080 }, { "epoch": 0.5151100578994943, "grad_norm": 31.766202926635742, "learning_rate": 1.9999446310549067e-06, "loss": 0.2056, "num_input_tokens_seen": 14189504, "step": 21085 }, { "epoch": 0.5152322087313415, "grad_norm": 3.2112789154052734, "learning_rate": 1.9999437300375457e-06, "loss": 0.0752, "num_input_tokens_seen": 14192640, "step": 21090 }, { "epoch": 0.5153543595631886, "grad_norm": 31.221282958984375, "learning_rate": 1.9999428217483256e-06, "loss": 0.1698, "num_input_tokens_seen": 14196160, "step": 21095 }, { "epoch": 0.5154765103950358, "grad_norm": 4.7101359367370605, "learning_rate": 1.9999419061872526e-06, "loss": 0.1077, "num_input_tokens_seen": 14200000, "step": 21100 }, { "epoch": 0.5155986612268829, "grad_norm": 26.364850997924805, "learning_rate": 1.9999409833543327e-06, "loss": 0.1557, "num_input_tokens_seen": 14203200, "step": 21105 }, { "epoch": 0.5157208120587301, "grad_norm": 0.626477062702179, "learning_rate": 1.9999400532495728e-06, "loss": 0.0347, "num_input_tokens_seen": 14206720, "step": 21110 }, { "epoch": 0.5158429628905773, "grad_norm": 10.539756774902344, "learning_rate": 1.99993911587298e-06, "loss": 0.2105, "num_input_tokens_seen": 14210048, "step": 21115 }, { "epoch": 0.5159651137224245, "grad_norm": 3.132664203643799, "learning_rate": 1.9999381712245613e-06, "loss": 0.2227, "num_input_tokens_seen": 14213504, "step": 21120 }, { "epoch": 0.5160872645542716, "grad_norm": 31.730087280273438, "learning_rate": 1.999937219304323e-06, "loss": 0.1199, "num_input_tokens_seen": 14217088, "step": 21125 }, { "epoch": 0.5162094153861188, "grad_norm": 0.2630053162574768, "learning_rate": 1.999936260112272e-06, "loss": 0.0566, "num_input_tokens_seen": 14220608, "step": 21130 }, { "epoch": 0.5163315662179659, "grad_norm": 33.741310119628906, "learning_rate": 1.999935293648416e-06, "loss": 0.2041, "num_input_tokens_seen": 14224128, "step": 21135 }, { "epoch": 0.5164537170498131, "grad_norm": 39.81195831298828, "learning_rate": 1.9999343199127616e-06, "loss": 0.126, "num_input_tokens_seen": 14227136, "step": 21140 }, { "epoch": 0.5165758678816603, "grad_norm": 37.612003326416016, "learning_rate": 1.9999333389053157e-06, "loss": 0.1738, "num_input_tokens_seen": 14230528, "step": 21145 }, { "epoch": 0.5166980187135074, "grad_norm": 35.23280715942383, "learning_rate": 1.999932350626086e-06, "loss": 0.1324, "num_input_tokens_seen": 14233920, "step": 21150 }, { "epoch": 0.5168201695453546, "grad_norm": 0.28483980894088745, "learning_rate": 1.999931355075079e-06, "loss": 0.0723, "num_input_tokens_seen": 14236928, "step": 21155 }, { "epoch": 0.5169423203772018, "grad_norm": 27.012868881225586, "learning_rate": 1.999930352252302e-06, "loss": 0.1253, "num_input_tokens_seen": 14240512, "step": 21160 }, { "epoch": 0.517064471209049, "grad_norm": 3.816581964492798, "learning_rate": 1.999929342157763e-06, "loss": 0.0818, "num_input_tokens_seen": 14243968, "step": 21165 }, { "epoch": 0.517186622040896, "grad_norm": 18.7530574798584, "learning_rate": 1.9999283247914684e-06, "loss": 0.1234, "num_input_tokens_seen": 14247040, "step": 21170 }, { "epoch": 0.5173087728727432, "grad_norm": 0.8007626533508301, "learning_rate": 1.9999273001534265e-06, "loss": 0.058, "num_input_tokens_seen": 14250112, "step": 21175 }, { "epoch": 0.5174309237045904, "grad_norm": 2.104343891143799, "learning_rate": 1.999926268243644e-06, "loss": 0.0567, "num_input_tokens_seen": 14254144, "step": 21180 }, { "epoch": 0.5175530745364376, "grad_norm": 31.472410202026367, "learning_rate": 1.999925229062129e-06, "loss": 0.0362, "num_input_tokens_seen": 14257344, "step": 21185 }, { "epoch": 0.5176752253682848, "grad_norm": 0.5018709301948547, "learning_rate": 1.9999241826088883e-06, "loss": 0.0792, "num_input_tokens_seen": 14260672, "step": 21190 }, { "epoch": 0.5177973762001319, "grad_norm": 13.426669120788574, "learning_rate": 1.9999231288839303e-06, "loss": 0.0565, "num_input_tokens_seen": 14263936, "step": 21195 }, { "epoch": 0.5179195270319791, "grad_norm": 4.62431526184082, "learning_rate": 1.9999220678872626e-06, "loss": 0.1347, "num_input_tokens_seen": 14267264, "step": 21200 }, { "epoch": 0.5180416778638263, "grad_norm": 30.063844680786133, "learning_rate": 1.9999209996188924e-06, "loss": 0.2543, "num_input_tokens_seen": 14270720, "step": 21205 }, { "epoch": 0.5181638286956735, "grad_norm": 4.680274486541748, "learning_rate": 1.9999199240788282e-06, "loss": 0.22, "num_input_tokens_seen": 14274112, "step": 21210 }, { "epoch": 0.5182859795275205, "grad_norm": 67.60924530029297, "learning_rate": 1.999918841267077e-06, "loss": 0.199, "num_input_tokens_seen": 14277376, "step": 21215 }, { "epoch": 0.5184081303593677, "grad_norm": 3.24830961227417, "learning_rate": 1.9999177511836467e-06, "loss": 0.134, "num_input_tokens_seen": 14280896, "step": 21220 }, { "epoch": 0.5185302811912149, "grad_norm": 24.1802921295166, "learning_rate": 1.9999166538285463e-06, "loss": 0.1493, "num_input_tokens_seen": 14284224, "step": 21225 }, { "epoch": 0.5186524320230621, "grad_norm": 7.4385504722595215, "learning_rate": 1.9999155492017824e-06, "loss": 0.0893, "num_input_tokens_seen": 14287488, "step": 21230 }, { "epoch": 0.5187745828549093, "grad_norm": 0.4136931002140045, "learning_rate": 1.999914437303364e-06, "loss": 0.0955, "num_input_tokens_seen": 14290560, "step": 21235 }, { "epoch": 0.5188967336867564, "grad_norm": 13.242329597473145, "learning_rate": 1.9999133181332984e-06, "loss": 0.0706, "num_input_tokens_seen": 14294272, "step": 21240 }, { "epoch": 0.5190188845186036, "grad_norm": 13.405590057373047, "learning_rate": 1.9999121916915948e-06, "loss": 0.1165, "num_input_tokens_seen": 14298048, "step": 21245 }, { "epoch": 0.5191410353504508, "grad_norm": 23.771085739135742, "learning_rate": 1.9999110579782607e-06, "loss": 0.0853, "num_input_tokens_seen": 14301184, "step": 21250 }, { "epoch": 0.5192631861822979, "grad_norm": 10.134591102600098, "learning_rate": 1.9999099169933046e-06, "loss": 0.0565, "num_input_tokens_seen": 14304128, "step": 21255 }, { "epoch": 0.519385337014145, "grad_norm": 25.851348876953125, "learning_rate": 1.999908768736734e-06, "loss": 0.0888, "num_input_tokens_seen": 14307648, "step": 21260 }, { "epoch": 0.5195074878459922, "grad_norm": 23.614892959594727, "learning_rate": 1.9999076132085582e-06, "loss": 0.1576, "num_input_tokens_seen": 14310592, "step": 21265 }, { "epoch": 0.5196296386778394, "grad_norm": 4.225952625274658, "learning_rate": 1.9999064504087853e-06, "loss": 0.1492, "num_input_tokens_seen": 14313728, "step": 21270 }, { "epoch": 0.5197517895096866, "grad_norm": 29.918014526367188, "learning_rate": 1.9999052803374243e-06, "loss": 0.2074, "num_input_tokens_seen": 14317056, "step": 21275 }, { "epoch": 0.5198739403415338, "grad_norm": 23.858491897583008, "learning_rate": 1.9999041029944827e-06, "loss": 0.1367, "num_input_tokens_seen": 14319936, "step": 21280 }, { "epoch": 0.5199960911733809, "grad_norm": 8.586871147155762, "learning_rate": 1.9999029183799696e-06, "loss": 0.1315, "num_input_tokens_seen": 14323072, "step": 21285 }, { "epoch": 0.520118242005228, "grad_norm": 6.292385101318359, "learning_rate": 1.9999017264938933e-06, "loss": 0.1303, "num_input_tokens_seen": 14326336, "step": 21290 }, { "epoch": 0.5202403928370752, "grad_norm": 5.245234489440918, "learning_rate": 1.9999005273362628e-06, "loss": 0.0619, "num_input_tokens_seen": 14329792, "step": 21295 }, { "epoch": 0.5203625436689224, "grad_norm": 3.494668960571289, "learning_rate": 1.9998993209070865e-06, "loss": 0.1505, "num_input_tokens_seen": 14332864, "step": 21300 }, { "epoch": 0.5204846945007695, "grad_norm": 5.071745872497559, "learning_rate": 1.9998981072063738e-06, "loss": 0.1236, "num_input_tokens_seen": 14336256, "step": 21305 }, { "epoch": 0.5206068453326167, "grad_norm": 10.048419952392578, "learning_rate": 1.999896886234133e-06, "loss": 0.1244, "num_input_tokens_seen": 14339904, "step": 21310 }, { "epoch": 0.5207289961644639, "grad_norm": 15.470492362976074, "learning_rate": 1.9998956579903733e-06, "loss": 0.2658, "num_input_tokens_seen": 14343424, "step": 21315 }, { "epoch": 0.5208511469963111, "grad_norm": 18.155609130859375, "learning_rate": 1.999894422475103e-06, "loss": 0.0877, "num_input_tokens_seen": 14346880, "step": 21320 }, { "epoch": 0.5209732978281582, "grad_norm": 0.9843332171440125, "learning_rate": 1.9998931796883316e-06, "loss": 0.0885, "num_input_tokens_seen": 14350336, "step": 21325 }, { "epoch": 0.5210954486600053, "grad_norm": 25.281551361083984, "learning_rate": 1.9998919296300684e-06, "loss": 0.1958, "num_input_tokens_seen": 14353600, "step": 21330 }, { "epoch": 0.5212175994918525, "grad_norm": 21.323984146118164, "learning_rate": 1.9998906723003216e-06, "loss": 0.194, "num_input_tokens_seen": 14356928, "step": 21335 }, { "epoch": 0.5213397503236997, "grad_norm": 3.6292245388031006, "learning_rate": 1.9998894076991015e-06, "loss": 0.1148, "num_input_tokens_seen": 14361024, "step": 21340 }, { "epoch": 0.5214619011555469, "grad_norm": 1.0466417074203491, "learning_rate": 1.9998881358264165e-06, "loss": 0.1752, "num_input_tokens_seen": 14364416, "step": 21345 }, { "epoch": 0.521584051987394, "grad_norm": 15.274713516235352, "learning_rate": 1.9998868566822756e-06, "loss": 0.1184, "num_input_tokens_seen": 14367936, "step": 21350 }, { "epoch": 0.5217062028192412, "grad_norm": 14.416821479797363, "learning_rate": 1.9998855702666892e-06, "loss": 0.0963, "num_input_tokens_seen": 14371392, "step": 21355 }, { "epoch": 0.5218283536510884, "grad_norm": 5.276510238647461, "learning_rate": 1.9998842765796656e-06, "loss": 0.0451, "num_input_tokens_seen": 14374976, "step": 21360 }, { "epoch": 0.5219505044829356, "grad_norm": 3.4116029739379883, "learning_rate": 1.9998829756212147e-06, "loss": 0.0914, "num_input_tokens_seen": 14378752, "step": 21365 }, { "epoch": 0.5220726553147826, "grad_norm": 5.307886600494385, "learning_rate": 1.999881667391346e-06, "loss": 0.1199, "num_input_tokens_seen": 14381760, "step": 21370 }, { "epoch": 0.5221948061466298, "grad_norm": 9.030430793762207, "learning_rate": 1.9998803518900687e-06, "loss": 0.1328, "num_input_tokens_seen": 14385344, "step": 21375 }, { "epoch": 0.522316956978477, "grad_norm": 67.31908416748047, "learning_rate": 1.999879029117393e-06, "loss": 0.2202, "num_input_tokens_seen": 14389120, "step": 21380 }, { "epoch": 0.5224391078103242, "grad_norm": 1.338660478591919, "learning_rate": 1.9998776990733274e-06, "loss": 0.0601, "num_input_tokens_seen": 14392832, "step": 21385 }, { "epoch": 0.5225612586421714, "grad_norm": 13.9439115524292, "learning_rate": 1.9998763617578825e-06, "loss": 0.2013, "num_input_tokens_seen": 14396544, "step": 21390 }, { "epoch": 0.5226834094740185, "grad_norm": 6.826702117919922, "learning_rate": 1.999875017171068e-06, "loss": 0.0907, "num_input_tokens_seen": 14399552, "step": 21395 }, { "epoch": 0.5228055603058657, "grad_norm": 36.58500671386719, "learning_rate": 1.999873665312893e-06, "loss": 0.1445, "num_input_tokens_seen": 14402752, "step": 21400 }, { "epoch": 0.5229277111377129, "grad_norm": 3.132948160171509, "learning_rate": 1.9998723061833676e-06, "loss": 0.174, "num_input_tokens_seen": 14406144, "step": 21405 }, { "epoch": 0.52304986196956, "grad_norm": 17.92841148376465, "learning_rate": 1.9998709397825024e-06, "loss": 0.0878, "num_input_tokens_seen": 14409344, "step": 21410 }, { "epoch": 0.5231720128014071, "grad_norm": 9.672657012939453, "learning_rate": 1.999869566110307e-06, "loss": 0.067, "num_input_tokens_seen": 14412608, "step": 21415 }, { "epoch": 0.5232941636332543, "grad_norm": 15.635049819946289, "learning_rate": 1.9998681851667903e-06, "loss": 0.1183, "num_input_tokens_seen": 14415744, "step": 21420 }, { "epoch": 0.5234163144651015, "grad_norm": 9.510732650756836, "learning_rate": 1.999866796951964e-06, "loss": 0.1525, "num_input_tokens_seen": 14418752, "step": 21425 }, { "epoch": 0.5235384652969487, "grad_norm": 16.59809684753418, "learning_rate": 1.999865401465837e-06, "loss": 0.2345, "num_input_tokens_seen": 14421696, "step": 21430 }, { "epoch": 0.5236606161287959, "grad_norm": 17.911949157714844, "learning_rate": 1.9998639987084204e-06, "loss": 0.1961, "num_input_tokens_seen": 14424960, "step": 21435 }, { "epoch": 0.523782766960643, "grad_norm": 2.501218557357788, "learning_rate": 1.9998625886797235e-06, "loss": 0.0777, "num_input_tokens_seen": 14428608, "step": 21440 }, { "epoch": 0.5239049177924902, "grad_norm": 1.355796456336975, "learning_rate": 1.9998611713797566e-06, "loss": 0.1527, "num_input_tokens_seen": 14431680, "step": 21445 }, { "epoch": 0.5240270686243373, "grad_norm": 19.79297637939453, "learning_rate": 1.9998597468085306e-06, "loss": 0.148, "num_input_tokens_seen": 14434944, "step": 21450 }, { "epoch": 0.5241492194561845, "grad_norm": 18.022729873657227, "learning_rate": 1.9998583149660558e-06, "loss": 0.1219, "num_input_tokens_seen": 14437952, "step": 21455 }, { "epoch": 0.5242713702880316, "grad_norm": 18.11164093017578, "learning_rate": 1.999856875852342e-06, "loss": 0.1344, "num_input_tokens_seen": 14440896, "step": 21460 }, { "epoch": 0.5243935211198788, "grad_norm": 2.50659441947937, "learning_rate": 1.9998554294674005e-06, "loss": 0.0374, "num_input_tokens_seen": 14444096, "step": 21465 }, { "epoch": 0.524515671951726, "grad_norm": 6.1895365715026855, "learning_rate": 1.9998539758112413e-06, "loss": 0.2057, "num_input_tokens_seen": 14447232, "step": 21470 }, { "epoch": 0.5246378227835732, "grad_norm": 28.5371036529541, "learning_rate": 1.999852514883875e-06, "loss": 0.1155, "num_input_tokens_seen": 14450624, "step": 21475 }, { "epoch": 0.5247599736154204, "grad_norm": 19.53989601135254, "learning_rate": 1.999851046685312e-06, "loss": 0.142, "num_input_tokens_seen": 14453824, "step": 21480 }, { "epoch": 0.5248821244472675, "grad_norm": 5.390257835388184, "learning_rate": 1.9998495712155638e-06, "loss": 0.0319, "num_input_tokens_seen": 14457088, "step": 21485 }, { "epoch": 0.5250042752791146, "grad_norm": 0.5065277218818665, "learning_rate": 1.9998480884746403e-06, "loss": 0.0508, "num_input_tokens_seen": 14460032, "step": 21490 }, { "epoch": 0.5251264261109618, "grad_norm": 0.2909560203552246, "learning_rate": 1.9998465984625526e-06, "loss": 0.1255, "num_input_tokens_seen": 14463744, "step": 21495 }, { "epoch": 0.525248576942809, "grad_norm": 31.88382911682129, "learning_rate": 1.9998451011793113e-06, "loss": 0.1094, "num_input_tokens_seen": 14467072, "step": 21500 }, { "epoch": 0.5253707277746561, "grad_norm": 7.6989898681640625, "learning_rate": 1.999843596624928e-06, "loss": 0.0689, "num_input_tokens_seen": 14470272, "step": 21505 }, { "epoch": 0.5254928786065033, "grad_norm": 2.5370209217071533, "learning_rate": 1.999842084799413e-06, "loss": 0.012, "num_input_tokens_seen": 14474048, "step": 21510 }, { "epoch": 0.5256150294383505, "grad_norm": 31.16155433654785, "learning_rate": 1.999840565702777e-06, "loss": 0.0521, "num_input_tokens_seen": 14477760, "step": 21515 }, { "epoch": 0.5257371802701977, "grad_norm": 0.8267857432365417, "learning_rate": 1.9998390393350318e-06, "loss": 0.3199, "num_input_tokens_seen": 14481216, "step": 21520 }, { "epoch": 0.5258593311020449, "grad_norm": 14.096746444702148, "learning_rate": 1.9998375056961877e-06, "loss": 0.0071, "num_input_tokens_seen": 14484416, "step": 21525 }, { "epoch": 0.5259814819338919, "grad_norm": 10.87549114227295, "learning_rate": 1.999835964786257e-06, "loss": 0.1347, "num_input_tokens_seen": 14488000, "step": 21530 }, { "epoch": 0.5261036327657391, "grad_norm": 48.07831954956055, "learning_rate": 1.99983441660525e-06, "loss": 0.2018, "num_input_tokens_seen": 14491264, "step": 21535 }, { "epoch": 0.5262257835975863, "grad_norm": 0.5932395458221436, "learning_rate": 1.9998328611531783e-06, "loss": 0.1217, "num_input_tokens_seen": 14494784, "step": 21540 }, { "epoch": 0.5263479344294335, "grad_norm": 28.88714599609375, "learning_rate": 1.9998312984300527e-06, "loss": 0.2129, "num_input_tokens_seen": 14497856, "step": 21545 }, { "epoch": 0.5264700852612806, "grad_norm": 1.3318819999694824, "learning_rate": 1.9998297284358854e-06, "loss": 0.1674, "num_input_tokens_seen": 14501440, "step": 21550 }, { "epoch": 0.5265922360931278, "grad_norm": 0.5254182815551758, "learning_rate": 1.9998281511706874e-06, "loss": 0.1212, "num_input_tokens_seen": 14504832, "step": 21555 }, { "epoch": 0.526714386924975, "grad_norm": 1.647344708442688, "learning_rate": 1.99982656663447e-06, "loss": 0.0934, "num_input_tokens_seen": 14508352, "step": 21560 }, { "epoch": 0.5268365377568222, "grad_norm": 1.297066569328308, "learning_rate": 1.999824974827245e-06, "loss": 0.1684, "num_input_tokens_seen": 14511552, "step": 21565 }, { "epoch": 0.5269586885886693, "grad_norm": 41.3592643737793, "learning_rate": 1.9998233757490237e-06, "loss": 0.1697, "num_input_tokens_seen": 14514752, "step": 21570 }, { "epoch": 0.5270808394205164, "grad_norm": 8.706243515014648, "learning_rate": 1.9998217693998177e-06, "loss": 0.0283, "num_input_tokens_seen": 14517824, "step": 21575 }, { "epoch": 0.5272029902523636, "grad_norm": 9.767743110656738, "learning_rate": 1.9998201557796395e-06, "loss": 0.0999, "num_input_tokens_seen": 14521216, "step": 21580 }, { "epoch": 0.5273251410842108, "grad_norm": 32.12539291381836, "learning_rate": 1.9998185348885e-06, "loss": 0.0718, "num_input_tokens_seen": 14524800, "step": 21585 }, { "epoch": 0.527447291916058, "grad_norm": 1.750207543373108, "learning_rate": 1.999816906726411e-06, "loss": 0.0716, "num_input_tokens_seen": 14528640, "step": 21590 }, { "epoch": 0.5275694427479051, "grad_norm": 13.995022773742676, "learning_rate": 1.9998152712933846e-06, "loss": 0.0889, "num_input_tokens_seen": 14531968, "step": 21595 }, { "epoch": 0.5276915935797523, "grad_norm": 36.821590423583984, "learning_rate": 1.9998136285894326e-06, "loss": 0.0779, "num_input_tokens_seen": 14535296, "step": 21600 }, { "epoch": 0.5278137444115995, "grad_norm": 2.830824613571167, "learning_rate": 1.999811978614567e-06, "loss": 0.1281, "num_input_tokens_seen": 14538944, "step": 21605 }, { "epoch": 0.5279358952434466, "grad_norm": 17.418807983398438, "learning_rate": 1.9998103213687994e-06, "loss": 0.1101, "num_input_tokens_seen": 14542464, "step": 21610 }, { "epoch": 0.5280580460752937, "grad_norm": 25.319671630859375, "learning_rate": 1.9998086568521426e-06, "loss": 0.1037, "num_input_tokens_seen": 14545664, "step": 21615 }, { "epoch": 0.5281801969071409, "grad_norm": 0.47470173239707947, "learning_rate": 1.9998069850646084e-06, "loss": 0.2195, "num_input_tokens_seen": 14549120, "step": 21620 }, { "epoch": 0.5283023477389881, "grad_norm": 52.587093353271484, "learning_rate": 1.999805306006209e-06, "loss": 0.1886, "num_input_tokens_seen": 14552320, "step": 21625 }, { "epoch": 0.5284244985708353, "grad_norm": 0.40930500626564026, "learning_rate": 1.9998036196769564e-06, "loss": 0.1231, "num_input_tokens_seen": 14555520, "step": 21630 }, { "epoch": 0.5285466494026825, "grad_norm": 10.084890365600586, "learning_rate": 1.9998019260768626e-06, "loss": 0.0935, "num_input_tokens_seen": 14558848, "step": 21635 }, { "epoch": 0.5286688002345296, "grad_norm": 15.713746070861816, "learning_rate": 1.9998002252059406e-06, "loss": 0.247, "num_input_tokens_seen": 14562176, "step": 21640 }, { "epoch": 0.5287909510663767, "grad_norm": 12.560420036315918, "learning_rate": 1.9997985170642025e-06, "loss": 0.1526, "num_input_tokens_seen": 14565760, "step": 21645 }, { "epoch": 0.5289131018982239, "grad_norm": 27.058290481567383, "learning_rate": 1.9997968016516606e-06, "loss": 0.067, "num_input_tokens_seen": 14569408, "step": 21650 }, { "epoch": 0.5290352527300711, "grad_norm": 7.252396106719971, "learning_rate": 1.9997950789683277e-06, "loss": 0.0626, "num_input_tokens_seen": 14573504, "step": 21655 }, { "epoch": 0.5291574035619182, "grad_norm": 35.32759475708008, "learning_rate": 1.9997933490142156e-06, "loss": 0.0374, "num_input_tokens_seen": 14576704, "step": 21660 }, { "epoch": 0.5292795543937654, "grad_norm": 17.713476181030273, "learning_rate": 1.9997916117893374e-06, "loss": 0.084, "num_input_tokens_seen": 14580864, "step": 21665 }, { "epoch": 0.5294017052256126, "grad_norm": 22.07648468017578, "learning_rate": 1.999789867293706e-06, "loss": 0.0969, "num_input_tokens_seen": 14584320, "step": 21670 }, { "epoch": 0.5295238560574598, "grad_norm": 0.4838829040527344, "learning_rate": 1.9997881155273336e-06, "loss": 0.1203, "num_input_tokens_seen": 14587328, "step": 21675 }, { "epoch": 0.529646006889307, "grad_norm": 31.40498924255371, "learning_rate": 1.999786356490233e-06, "loss": 0.188, "num_input_tokens_seen": 14590720, "step": 21680 }, { "epoch": 0.529768157721154, "grad_norm": 13.535494804382324, "learning_rate": 1.999784590182417e-06, "loss": 0.1405, "num_input_tokens_seen": 14593728, "step": 21685 }, { "epoch": 0.5298903085530012, "grad_norm": 1.923133373260498, "learning_rate": 1.999782816603899e-06, "loss": 0.1511, "num_input_tokens_seen": 14597440, "step": 21690 }, { "epoch": 0.5300124593848484, "grad_norm": 18.139060974121094, "learning_rate": 1.9997810357546913e-06, "loss": 0.0933, "num_input_tokens_seen": 14600704, "step": 21695 }, { "epoch": 0.5301346102166956, "grad_norm": 1.0402435064315796, "learning_rate": 1.999779247634807e-06, "loss": 0.0149, "num_input_tokens_seen": 14603968, "step": 21700 }, { "epoch": 0.5302567610485427, "grad_norm": 1.9129348993301392, "learning_rate": 1.9997774522442587e-06, "loss": 0.0334, "num_input_tokens_seen": 14607488, "step": 21705 }, { "epoch": 0.5303789118803899, "grad_norm": 23.60872459411621, "learning_rate": 1.9997756495830606e-06, "loss": 0.1878, "num_input_tokens_seen": 14610880, "step": 21710 }, { "epoch": 0.5305010627122371, "grad_norm": 10.163957595825195, "learning_rate": 1.9997738396512243e-06, "loss": 0.2545, "num_input_tokens_seen": 14614336, "step": 21715 }, { "epoch": 0.5306232135440843, "grad_norm": 57.82450485229492, "learning_rate": 1.9997720224487642e-06, "loss": 0.1535, "num_input_tokens_seen": 14617600, "step": 21720 }, { "epoch": 0.5307453643759314, "grad_norm": 17.80607795715332, "learning_rate": 1.999770197975693e-06, "loss": 0.1522, "num_input_tokens_seen": 14620480, "step": 21725 }, { "epoch": 0.5308675152077785, "grad_norm": 8.344657897949219, "learning_rate": 1.999768366232024e-06, "loss": 0.0956, "num_input_tokens_seen": 14623808, "step": 21730 }, { "epoch": 0.5309896660396257, "grad_norm": 29.846019744873047, "learning_rate": 1.9997665272177706e-06, "loss": 0.1534, "num_input_tokens_seen": 14626944, "step": 21735 }, { "epoch": 0.5311118168714729, "grad_norm": 10.817802429199219, "learning_rate": 1.999764680932946e-06, "loss": 0.1231, "num_input_tokens_seen": 14629824, "step": 21740 }, { "epoch": 0.5312339677033201, "grad_norm": 35.45216369628906, "learning_rate": 1.9997628273775635e-06, "loss": 0.1844, "num_input_tokens_seen": 14633536, "step": 21745 }, { "epoch": 0.5313561185351672, "grad_norm": 17.77213478088379, "learning_rate": 1.999760966551637e-06, "loss": 0.1802, "num_input_tokens_seen": 14636480, "step": 21750 }, { "epoch": 0.5314782693670144, "grad_norm": 8.96764087677002, "learning_rate": 1.99975909845518e-06, "loss": 0.1074, "num_input_tokens_seen": 14640000, "step": 21755 }, { "epoch": 0.5316004201988616, "grad_norm": 0.3799511790275574, "learning_rate": 1.999757223088206e-06, "loss": 0.0523, "num_input_tokens_seen": 14643520, "step": 21760 }, { "epoch": 0.5317225710307087, "grad_norm": 6.152425765991211, "learning_rate": 1.9997553404507284e-06, "loss": 0.1024, "num_input_tokens_seen": 14646720, "step": 21765 }, { "epoch": 0.5318447218625559, "grad_norm": 1.7916256189346313, "learning_rate": 1.9997534505427607e-06, "loss": 0.0662, "num_input_tokens_seen": 14649920, "step": 21770 }, { "epoch": 0.531966872694403, "grad_norm": 26.20082664489746, "learning_rate": 1.9997515533643176e-06, "loss": 0.1042, "num_input_tokens_seen": 14653120, "step": 21775 }, { "epoch": 0.5320890235262502, "grad_norm": 3.543252468109131, "learning_rate": 1.999749648915412e-06, "loss": 0.1285, "num_input_tokens_seen": 14656704, "step": 21780 }, { "epoch": 0.5322111743580974, "grad_norm": 11.144600868225098, "learning_rate": 1.999747737196058e-06, "loss": 0.1138, "num_input_tokens_seen": 14659968, "step": 21785 }, { "epoch": 0.5323333251899446, "grad_norm": 5.704688549041748, "learning_rate": 1.9997458182062695e-06, "loss": 0.1466, "num_input_tokens_seen": 14663168, "step": 21790 }, { "epoch": 0.5324554760217917, "grad_norm": 13.300514221191406, "learning_rate": 1.999743891946061e-06, "loss": 0.1678, "num_input_tokens_seen": 14666880, "step": 21795 }, { "epoch": 0.5325776268536389, "grad_norm": 27.650346755981445, "learning_rate": 1.999741958415446e-06, "loss": 0.0876, "num_input_tokens_seen": 14670016, "step": 21800 }, { "epoch": 0.532699777685486, "grad_norm": 1.8453861474990845, "learning_rate": 1.999740017614438e-06, "loss": 0.1508, "num_input_tokens_seen": 14673152, "step": 21805 }, { "epoch": 0.5328219285173332, "grad_norm": 28.67298126220703, "learning_rate": 1.999738069543052e-06, "loss": 0.0931, "num_input_tokens_seen": 14676800, "step": 21810 }, { "epoch": 0.5329440793491804, "grad_norm": 14.552690505981445, "learning_rate": 1.9997361142013016e-06, "loss": 0.0399, "num_input_tokens_seen": 14680320, "step": 21815 }, { "epoch": 0.5330662301810275, "grad_norm": 23.46329116821289, "learning_rate": 1.9997341515892016e-06, "loss": 0.1401, "num_input_tokens_seen": 14683264, "step": 21820 }, { "epoch": 0.5331883810128747, "grad_norm": 12.535654067993164, "learning_rate": 1.9997321817067662e-06, "loss": 0.135, "num_input_tokens_seen": 14686656, "step": 21825 }, { "epoch": 0.5333105318447219, "grad_norm": 25.697513580322266, "learning_rate": 1.999730204554009e-06, "loss": 0.1525, "num_input_tokens_seen": 14690304, "step": 21830 }, { "epoch": 0.5334326826765691, "grad_norm": 2.353025436401367, "learning_rate": 1.999728220130945e-06, "loss": 0.1182, "num_input_tokens_seen": 14693632, "step": 21835 }, { "epoch": 0.5335548335084161, "grad_norm": 7.448736667633057, "learning_rate": 1.9997262284375886e-06, "loss": 0.0625, "num_input_tokens_seen": 14696768, "step": 21840 }, { "epoch": 0.5336769843402633, "grad_norm": 17.941051483154297, "learning_rate": 1.999724229473954e-06, "loss": 0.0942, "num_input_tokens_seen": 14699904, "step": 21845 }, { "epoch": 0.5337991351721105, "grad_norm": 11.695417404174805, "learning_rate": 1.999722223240056e-06, "loss": 0.1101, "num_input_tokens_seen": 14703936, "step": 21850 }, { "epoch": 0.5339212860039577, "grad_norm": 0.5295389294624329, "learning_rate": 1.999720209735909e-06, "loss": 0.0913, "num_input_tokens_seen": 14707264, "step": 21855 }, { "epoch": 0.5340434368358048, "grad_norm": 19.66712188720703, "learning_rate": 1.9997181889615277e-06, "loss": 0.1847, "num_input_tokens_seen": 14710720, "step": 21860 }, { "epoch": 0.534165587667652, "grad_norm": 8.764230728149414, "learning_rate": 1.999716160916927e-06, "loss": 0.0553, "num_input_tokens_seen": 14713920, "step": 21865 }, { "epoch": 0.5342877384994992, "grad_norm": 20.96441078186035, "learning_rate": 1.9997141256021214e-06, "loss": 0.1577, "num_input_tokens_seen": 14717184, "step": 21870 }, { "epoch": 0.5344098893313464, "grad_norm": 21.725391387939453, "learning_rate": 1.999712083017126e-06, "loss": 0.1194, "num_input_tokens_seen": 14720512, "step": 21875 }, { "epoch": 0.5345320401631936, "grad_norm": 0.7325649857521057, "learning_rate": 1.999710033161955e-06, "loss": 0.0821, "num_input_tokens_seen": 14723776, "step": 21880 }, { "epoch": 0.5346541909950406, "grad_norm": 4.8892083168029785, "learning_rate": 1.9997079760366242e-06, "loss": 0.0919, "num_input_tokens_seen": 14726784, "step": 21885 }, { "epoch": 0.5347763418268878, "grad_norm": 4.73694372177124, "learning_rate": 1.999705911641148e-06, "loss": 0.0458, "num_input_tokens_seen": 14730368, "step": 21890 }, { "epoch": 0.534898492658735, "grad_norm": 0.25473085045814514, "learning_rate": 1.9997038399755416e-06, "loss": 0.1431, "num_input_tokens_seen": 14733888, "step": 21895 }, { "epoch": 0.5350206434905822, "grad_norm": 12.120611190795898, "learning_rate": 1.99970176103982e-06, "loss": 0.1114, "num_input_tokens_seen": 14736960, "step": 21900 }, { "epoch": 0.5351427943224293, "grad_norm": 17.077253341674805, "learning_rate": 1.999699674833998e-06, "loss": 0.2132, "num_input_tokens_seen": 14740160, "step": 21905 }, { "epoch": 0.5352649451542765, "grad_norm": 7.404626369476318, "learning_rate": 1.9996975813580913e-06, "loss": 0.1527, "num_input_tokens_seen": 14743744, "step": 21910 }, { "epoch": 0.5353870959861237, "grad_norm": 2.2422728538513184, "learning_rate": 1.9996954806121145e-06, "loss": 0.0924, "num_input_tokens_seen": 14747648, "step": 21915 }, { "epoch": 0.5355092468179709, "grad_norm": 12.412498474121094, "learning_rate": 1.999693372596084e-06, "loss": 0.2639, "num_input_tokens_seen": 14750912, "step": 21920 }, { "epoch": 0.535631397649818, "grad_norm": 12.730358123779297, "learning_rate": 1.999691257310014e-06, "loss": 0.1055, "num_input_tokens_seen": 14754880, "step": 21925 }, { "epoch": 0.5357535484816651, "grad_norm": 16.045204162597656, "learning_rate": 1.99968913475392e-06, "loss": 0.1137, "num_input_tokens_seen": 14758336, "step": 21930 }, { "epoch": 0.5358756993135123, "grad_norm": 3.7376608848571777, "learning_rate": 1.9996870049278183e-06, "loss": 0.1668, "num_input_tokens_seen": 14762048, "step": 21935 }, { "epoch": 0.5359978501453595, "grad_norm": 21.60993194580078, "learning_rate": 1.9996848678317236e-06, "loss": 0.1627, "num_input_tokens_seen": 14765248, "step": 21940 }, { "epoch": 0.5361200009772067, "grad_norm": 18.804241180419922, "learning_rate": 1.9996827234656515e-06, "loss": 0.2253, "num_input_tokens_seen": 14768320, "step": 21945 }, { "epoch": 0.5362421518090538, "grad_norm": 17.075807571411133, "learning_rate": 1.999680571829618e-06, "loss": 0.1673, "num_input_tokens_seen": 14771840, "step": 21950 }, { "epoch": 0.536364302640901, "grad_norm": 17.029102325439453, "learning_rate": 1.9996784129236383e-06, "loss": 0.0802, "num_input_tokens_seen": 14775744, "step": 21955 }, { "epoch": 0.5364864534727481, "grad_norm": 9.187379837036133, "learning_rate": 1.999676246747728e-06, "loss": 0.0289, "num_input_tokens_seen": 14779776, "step": 21960 }, { "epoch": 0.5366086043045953, "grad_norm": 31.00434684753418, "learning_rate": 1.9996740733019037e-06, "loss": 0.2489, "num_input_tokens_seen": 14783168, "step": 21965 }, { "epoch": 0.5367307551364425, "grad_norm": 20.447832107543945, "learning_rate": 1.9996718925861805e-06, "loss": 0.0224, "num_input_tokens_seen": 14786240, "step": 21970 }, { "epoch": 0.5368529059682896, "grad_norm": 27.77326202392578, "learning_rate": 1.9996697046005746e-06, "loss": 0.1144, "num_input_tokens_seen": 14789248, "step": 21975 }, { "epoch": 0.5369750568001368, "grad_norm": 0.8397197127342224, "learning_rate": 1.9996675093451014e-06, "loss": 0.0216, "num_input_tokens_seen": 14792128, "step": 21980 }, { "epoch": 0.537097207631984, "grad_norm": 30.160160064697266, "learning_rate": 1.9996653068197774e-06, "loss": 0.0429, "num_input_tokens_seen": 14795200, "step": 21985 }, { "epoch": 0.5372193584638312, "grad_norm": 20.93111228942871, "learning_rate": 1.999663097024618e-06, "loss": 0.1353, "num_input_tokens_seen": 14798656, "step": 21990 }, { "epoch": 0.5373415092956783, "grad_norm": 15.375563621520996, "learning_rate": 1.9996608799596402e-06, "loss": 0.0957, "num_input_tokens_seen": 14802112, "step": 21995 }, { "epoch": 0.5374636601275254, "grad_norm": 31.491281509399414, "learning_rate": 1.9996586556248593e-06, "loss": 0.1855, "num_input_tokens_seen": 14805568, "step": 22000 }, { "epoch": 0.5375858109593726, "grad_norm": 16.63956642150879, "learning_rate": 1.999656424020292e-06, "loss": 0.1993, "num_input_tokens_seen": 14809280, "step": 22005 }, { "epoch": 0.5377079617912198, "grad_norm": 56.18572998046875, "learning_rate": 1.999654185145954e-06, "loss": 0.3984, "num_input_tokens_seen": 14812288, "step": 22010 }, { "epoch": 0.537830112623067, "grad_norm": 14.070927619934082, "learning_rate": 1.9996519390018626e-06, "loss": 0.1776, "num_input_tokens_seen": 14815296, "step": 22015 }, { "epoch": 0.5379522634549141, "grad_norm": 7.646341323852539, "learning_rate": 1.9996496855880327e-06, "loss": 0.1282, "num_input_tokens_seen": 14818368, "step": 22020 }, { "epoch": 0.5380744142867613, "grad_norm": 3.7417798042297363, "learning_rate": 1.9996474249044816e-06, "loss": 0.0895, "num_input_tokens_seen": 14821888, "step": 22025 }, { "epoch": 0.5381965651186085, "grad_norm": 19.079700469970703, "learning_rate": 1.999645156951226e-06, "loss": 0.0929, "num_input_tokens_seen": 14824960, "step": 22030 }, { "epoch": 0.5383187159504557, "grad_norm": 4.660904407501221, "learning_rate": 1.999642881728281e-06, "loss": 0.0822, "num_input_tokens_seen": 14828672, "step": 22035 }, { "epoch": 0.5384408667823027, "grad_norm": 17.99866485595703, "learning_rate": 1.9996405992356648e-06, "loss": 0.129, "num_input_tokens_seen": 14832192, "step": 22040 }, { "epoch": 0.5385630176141499, "grad_norm": 2.2056124210357666, "learning_rate": 1.999638309473393e-06, "loss": 0.1256, "num_input_tokens_seen": 14835776, "step": 22045 }, { "epoch": 0.5386851684459971, "grad_norm": 14.415079116821289, "learning_rate": 1.999636012441483e-06, "loss": 0.0424, "num_input_tokens_seen": 14839296, "step": 22050 }, { "epoch": 0.5388073192778443, "grad_norm": 2.106815814971924, "learning_rate": 1.9996337081399508e-06, "loss": 0.0114, "num_input_tokens_seen": 14842496, "step": 22055 }, { "epoch": 0.5389294701096915, "grad_norm": 0.8564073443412781, "learning_rate": 1.9996313965688134e-06, "loss": 0.2205, "num_input_tokens_seen": 14845760, "step": 22060 }, { "epoch": 0.5390516209415386, "grad_norm": 5.316920757293701, "learning_rate": 1.9996290777280873e-06, "loss": 0.0879, "num_input_tokens_seen": 14849088, "step": 22065 }, { "epoch": 0.5391737717733858, "grad_norm": 5.526222229003906, "learning_rate": 1.99962675161779e-06, "loss": 0.0739, "num_input_tokens_seen": 14852416, "step": 22070 }, { "epoch": 0.539295922605233, "grad_norm": 33.81382369995117, "learning_rate": 1.9996244182379376e-06, "loss": 0.2628, "num_input_tokens_seen": 14856000, "step": 22075 }, { "epoch": 0.5394180734370801, "grad_norm": 0.3070218861103058, "learning_rate": 1.9996220775885484e-06, "loss": 0.1335, "num_input_tokens_seen": 14859200, "step": 22080 }, { "epoch": 0.5395402242689272, "grad_norm": 20.04486656188965, "learning_rate": 1.999619729669638e-06, "loss": 0.0093, "num_input_tokens_seen": 14862464, "step": 22085 }, { "epoch": 0.5396623751007744, "grad_norm": 19.408672332763672, "learning_rate": 1.999617374481224e-06, "loss": 0.0814, "num_input_tokens_seen": 14865728, "step": 22090 }, { "epoch": 0.5397845259326216, "grad_norm": 10.519676208496094, "learning_rate": 1.999615012023324e-06, "loss": 0.1876, "num_input_tokens_seen": 14869056, "step": 22095 }, { "epoch": 0.5399066767644688, "grad_norm": 35.646114349365234, "learning_rate": 1.9996126422959544e-06, "loss": 0.238, "num_input_tokens_seen": 14872704, "step": 22100 }, { "epoch": 0.540028827596316, "grad_norm": 17.06508445739746, "learning_rate": 1.9996102652991332e-06, "loss": 0.1577, "num_input_tokens_seen": 14876032, "step": 22105 }, { "epoch": 0.5401509784281631, "grad_norm": 2.0623927116394043, "learning_rate": 1.9996078810328767e-06, "loss": 0.1087, "num_input_tokens_seen": 14880448, "step": 22110 }, { "epoch": 0.5402731292600103, "grad_norm": 19.45257568359375, "learning_rate": 1.9996054894972035e-06, "loss": 0.156, "num_input_tokens_seen": 14883584, "step": 22115 }, { "epoch": 0.5403952800918574, "grad_norm": 26.047109603881836, "learning_rate": 1.99960309069213e-06, "loss": 0.1056, "num_input_tokens_seen": 14888960, "step": 22120 }, { "epoch": 0.5405174309237046, "grad_norm": 28.765560150146484, "learning_rate": 1.999600684617674e-06, "loss": 0.1507, "num_input_tokens_seen": 14892544, "step": 22125 }, { "epoch": 0.5406395817555517, "grad_norm": 31.686187744140625, "learning_rate": 1.999598271273853e-06, "loss": 0.1527, "num_input_tokens_seen": 14896064, "step": 22130 }, { "epoch": 0.5407617325873989, "grad_norm": 40.38404083251953, "learning_rate": 1.9995958506606843e-06, "loss": 0.1882, "num_input_tokens_seen": 14899072, "step": 22135 }, { "epoch": 0.5408838834192461, "grad_norm": 0.6900488138198853, "learning_rate": 1.999593422778186e-06, "loss": 0.0974, "num_input_tokens_seen": 14902528, "step": 22140 }, { "epoch": 0.5410060342510933, "grad_norm": 2.6025142669677734, "learning_rate": 1.9995909876263753e-06, "loss": 0.1271, "num_input_tokens_seen": 14905792, "step": 22145 }, { "epoch": 0.5411281850829404, "grad_norm": 9.224105834960938, "learning_rate": 1.99958854520527e-06, "loss": 0.1134, "num_input_tokens_seen": 14909184, "step": 22150 }, { "epoch": 0.5412503359147876, "grad_norm": 3.023387908935547, "learning_rate": 1.9995860955148884e-06, "loss": 0.097, "num_input_tokens_seen": 14912512, "step": 22155 }, { "epoch": 0.5413724867466347, "grad_norm": 1.4665299654006958, "learning_rate": 1.999583638555247e-06, "loss": 0.0944, "num_input_tokens_seen": 14915712, "step": 22160 }, { "epoch": 0.5414946375784819, "grad_norm": 21.681215286254883, "learning_rate": 1.999581174326365e-06, "loss": 0.0748, "num_input_tokens_seen": 14919168, "step": 22165 }, { "epoch": 0.5416167884103291, "grad_norm": 35.79258346557617, "learning_rate": 1.99957870282826e-06, "loss": 0.1841, "num_input_tokens_seen": 14923072, "step": 22170 }, { "epoch": 0.5417389392421762, "grad_norm": 26.38593864440918, "learning_rate": 1.99957622406095e-06, "loss": 0.1218, "num_input_tokens_seen": 14926720, "step": 22175 }, { "epoch": 0.5418610900740234, "grad_norm": 1.901475191116333, "learning_rate": 1.9995737380244523e-06, "loss": 0.0952, "num_input_tokens_seen": 14929856, "step": 22180 }, { "epoch": 0.5419832409058706, "grad_norm": 14.192605972290039, "learning_rate": 1.999571244718786e-06, "loss": 0.1337, "num_input_tokens_seen": 14933056, "step": 22185 }, { "epoch": 0.5421053917377178, "grad_norm": 12.209223747253418, "learning_rate": 1.9995687441439685e-06, "loss": 0.104, "num_input_tokens_seen": 14936320, "step": 22190 }, { "epoch": 0.5422275425695648, "grad_norm": 0.9199336767196655, "learning_rate": 1.9995662363000184e-06, "loss": 0.0855, "num_input_tokens_seen": 14939264, "step": 22195 }, { "epoch": 0.542349693401412, "grad_norm": 23.20319938659668, "learning_rate": 1.999563721186953e-06, "loss": 0.1127, "num_input_tokens_seen": 14942976, "step": 22200 }, { "epoch": 0.5424718442332592, "grad_norm": 2.204730272293091, "learning_rate": 1.9995611988047926e-06, "loss": 0.0067, "num_input_tokens_seen": 14946304, "step": 22205 }, { "epoch": 0.5425939950651064, "grad_norm": 15.761679649353027, "learning_rate": 1.9995586691535537e-06, "loss": 0.1645, "num_input_tokens_seen": 14950336, "step": 22210 }, { "epoch": 0.5427161458969536, "grad_norm": 9.355384826660156, "learning_rate": 1.999556132233255e-06, "loss": 0.0654, "num_input_tokens_seen": 14953600, "step": 22215 }, { "epoch": 0.5428382967288007, "grad_norm": 4.341427326202393, "learning_rate": 1.9995535880439158e-06, "loss": 0.1185, "num_input_tokens_seen": 14956736, "step": 22220 }, { "epoch": 0.5429604475606479, "grad_norm": 7.728878498077393, "learning_rate": 1.999551036585554e-06, "loss": 0.0389, "num_input_tokens_seen": 14959808, "step": 22225 }, { "epoch": 0.5430825983924951, "grad_norm": 3.8340866565704346, "learning_rate": 1.999548477858188e-06, "loss": 0.1265, "num_input_tokens_seen": 14963008, "step": 22230 }, { "epoch": 0.5432047492243423, "grad_norm": 23.842802047729492, "learning_rate": 1.9995459118618364e-06, "loss": 0.08, "num_input_tokens_seen": 14965952, "step": 22235 }, { "epoch": 0.5433269000561893, "grad_norm": 44.32612228393555, "learning_rate": 1.9995433385965187e-06, "loss": 0.2909, "num_input_tokens_seen": 14969216, "step": 22240 }, { "epoch": 0.5434490508880365, "grad_norm": 11.29781723022461, "learning_rate": 1.9995407580622526e-06, "loss": 0.2292, "num_input_tokens_seen": 14972864, "step": 22245 }, { "epoch": 0.5435712017198837, "grad_norm": 31.859079360961914, "learning_rate": 1.9995381702590572e-06, "loss": 0.0804, "num_input_tokens_seen": 14976192, "step": 22250 }, { "epoch": 0.5436933525517309, "grad_norm": 26.400442123413086, "learning_rate": 1.9995355751869517e-06, "loss": 0.2389, "num_input_tokens_seen": 14979520, "step": 22255 }, { "epoch": 0.5438155033835781, "grad_norm": 15.490806579589844, "learning_rate": 1.9995329728459545e-06, "loss": 0.1369, "num_input_tokens_seen": 14982592, "step": 22260 }, { "epoch": 0.5439376542154252, "grad_norm": 32.64035415649414, "learning_rate": 1.999530363236085e-06, "loss": 0.113, "num_input_tokens_seen": 14986240, "step": 22265 }, { "epoch": 0.5440598050472724, "grad_norm": 15.278639793395996, "learning_rate": 1.9995277463573612e-06, "loss": 0.0672, "num_input_tokens_seen": 14989568, "step": 22270 }, { "epoch": 0.5441819558791195, "grad_norm": 17.714778900146484, "learning_rate": 1.999525122209803e-06, "loss": 0.0729, "num_input_tokens_seen": 14993280, "step": 22275 }, { "epoch": 0.5443041067109667, "grad_norm": 15.31547737121582, "learning_rate": 1.9995224907934295e-06, "loss": 0.07, "num_input_tokens_seen": 14997440, "step": 22280 }, { "epoch": 0.5444262575428138, "grad_norm": 22.41041374206543, "learning_rate": 1.9995198521082594e-06, "loss": 0.0847, "num_input_tokens_seen": 15000960, "step": 22285 }, { "epoch": 0.544548408374661, "grad_norm": 10.246545791625977, "learning_rate": 1.999517206154312e-06, "loss": 0.0509, "num_input_tokens_seen": 15004096, "step": 22290 }, { "epoch": 0.5446705592065082, "grad_norm": 22.861297607421875, "learning_rate": 1.999514552931607e-06, "loss": 0.1495, "num_input_tokens_seen": 15007680, "step": 22295 }, { "epoch": 0.5447927100383554, "grad_norm": 35.24309158325195, "learning_rate": 1.9995118924401632e-06, "loss": 0.151, "num_input_tokens_seen": 15011072, "step": 22300 }, { "epoch": 0.5449148608702026, "grad_norm": 0.9900000095367432, "learning_rate": 1.99950922468e-06, "loss": 0.1137, "num_input_tokens_seen": 15014208, "step": 22305 }, { "epoch": 0.5450370117020497, "grad_norm": 13.241475105285645, "learning_rate": 1.9995065496511367e-06, "loss": 0.0952, "num_input_tokens_seen": 15017344, "step": 22310 }, { "epoch": 0.5451591625338968, "grad_norm": 35.15651321411133, "learning_rate": 1.9995038673535933e-06, "loss": 0.1833, "num_input_tokens_seen": 15020992, "step": 22315 }, { "epoch": 0.545281313365744, "grad_norm": 11.369232177734375, "learning_rate": 1.9995011777873887e-06, "loss": 0.0792, "num_input_tokens_seen": 15024000, "step": 22320 }, { "epoch": 0.5454034641975912, "grad_norm": 8.909126281738281, "learning_rate": 1.999498480952543e-06, "loss": 0.1104, "num_input_tokens_seen": 15027840, "step": 22325 }, { "epoch": 0.5455256150294383, "grad_norm": 37.6539306640625, "learning_rate": 1.999495776849075e-06, "loss": 0.2498, "num_input_tokens_seen": 15031104, "step": 22330 }, { "epoch": 0.5456477658612855, "grad_norm": 9.263086318969727, "learning_rate": 1.999493065477005e-06, "loss": 0.1117, "num_input_tokens_seen": 15034432, "step": 22335 }, { "epoch": 0.5457699166931327, "grad_norm": 8.902159690856934, "learning_rate": 1.999490346836353e-06, "loss": 0.0532, "num_input_tokens_seen": 15037824, "step": 22340 }, { "epoch": 0.5458920675249799, "grad_norm": 32.49031448364258, "learning_rate": 1.999487620927138e-06, "loss": 0.0565, "num_input_tokens_seen": 15040960, "step": 22345 }, { "epoch": 0.5460142183568271, "grad_norm": 11.853031158447266, "learning_rate": 1.9994848877493806e-06, "loss": 0.0731, "num_input_tokens_seen": 15043904, "step": 22350 }, { "epoch": 0.5461363691886741, "grad_norm": 1.538690209388733, "learning_rate": 1.9994821473031e-06, "loss": 0.1369, "num_input_tokens_seen": 15046848, "step": 22355 }, { "epoch": 0.5462585200205213, "grad_norm": 3.006669521331787, "learning_rate": 1.9994793995883165e-06, "loss": 0.1618, "num_input_tokens_seen": 15050176, "step": 22360 }, { "epoch": 0.5463806708523685, "grad_norm": 2.1008987426757812, "learning_rate": 1.9994766446050497e-06, "loss": 0.1655, "num_input_tokens_seen": 15053632, "step": 22365 }, { "epoch": 0.5465028216842157, "grad_norm": 26.6789608001709, "learning_rate": 1.9994738823533203e-06, "loss": 0.0954, "num_input_tokens_seen": 15057088, "step": 22370 }, { "epoch": 0.5466249725160628, "grad_norm": 15.680182456970215, "learning_rate": 1.9994711128331474e-06, "loss": 0.0837, "num_input_tokens_seen": 15060224, "step": 22375 }, { "epoch": 0.54674712334791, "grad_norm": 4.9395318031311035, "learning_rate": 1.9994683360445522e-06, "loss": 0.1012, "num_input_tokens_seen": 15063424, "step": 22380 }, { "epoch": 0.5468692741797572, "grad_norm": 10.766144752502441, "learning_rate": 1.9994655519875546e-06, "loss": 0.0936, "num_input_tokens_seen": 15066368, "step": 22385 }, { "epoch": 0.5469914250116044, "grad_norm": 11.765340805053711, "learning_rate": 1.9994627606621745e-06, "loss": 0.0797, "num_input_tokens_seen": 15070016, "step": 22390 }, { "epoch": 0.5471135758434514, "grad_norm": 12.449207305908203, "learning_rate": 1.999459962068432e-06, "loss": 0.2556, "num_input_tokens_seen": 15073152, "step": 22395 }, { "epoch": 0.5472357266752986, "grad_norm": 19.93573760986328, "learning_rate": 1.9994571562063483e-06, "loss": 0.2031, "num_input_tokens_seen": 15076480, "step": 22400 }, { "epoch": 0.5473578775071458, "grad_norm": 3.5067667961120605, "learning_rate": 1.999454343075943e-06, "loss": 0.1312, "num_input_tokens_seen": 15079680, "step": 22405 }, { "epoch": 0.547480028338993, "grad_norm": 9.037195205688477, "learning_rate": 1.9994515226772373e-06, "loss": 0.089, "num_input_tokens_seen": 15082752, "step": 22410 }, { "epoch": 0.5476021791708402, "grad_norm": 18.904560089111328, "learning_rate": 1.9994486950102512e-06, "loss": 0.0578, "num_input_tokens_seen": 15086016, "step": 22415 }, { "epoch": 0.5477243300026873, "grad_norm": 0.3442266285419464, "learning_rate": 1.9994458600750054e-06, "loss": 0.0682, "num_input_tokens_seen": 15089408, "step": 22420 }, { "epoch": 0.5478464808345345, "grad_norm": 11.249054908752441, "learning_rate": 1.99944301787152e-06, "loss": 0.1, "num_input_tokens_seen": 15092544, "step": 22425 }, { "epoch": 0.5479686316663817, "grad_norm": 27.72504234313965, "learning_rate": 1.999440168399817e-06, "loss": 0.1168, "num_input_tokens_seen": 15096000, "step": 22430 }, { "epoch": 0.5480907824982288, "grad_norm": 22.7575626373291, "learning_rate": 1.9994373116599155e-06, "loss": 0.0899, "num_input_tokens_seen": 15099136, "step": 22435 }, { "epoch": 0.5482129333300759, "grad_norm": 24.837921142578125, "learning_rate": 1.9994344476518376e-06, "loss": 0.1968, "num_input_tokens_seen": 15102720, "step": 22440 }, { "epoch": 0.5483350841619231, "grad_norm": 44.41413879394531, "learning_rate": 1.9994315763756033e-06, "loss": 0.1988, "num_input_tokens_seen": 15106432, "step": 22445 }, { "epoch": 0.5484572349937703, "grad_norm": 15.867010116577148, "learning_rate": 1.9994286978312338e-06, "loss": 0.0623, "num_input_tokens_seen": 15109888, "step": 22450 }, { "epoch": 0.5485793858256175, "grad_norm": 2.3468613624572754, "learning_rate": 1.99942581201875e-06, "loss": 0.121, "num_input_tokens_seen": 15113472, "step": 22455 }, { "epoch": 0.5487015366574647, "grad_norm": 28.272499084472656, "learning_rate": 1.9994229189381726e-06, "loss": 0.0837, "num_input_tokens_seen": 15116544, "step": 22460 }, { "epoch": 0.5488236874893118, "grad_norm": 64.57221984863281, "learning_rate": 1.9994200185895233e-06, "loss": 0.1636, "num_input_tokens_seen": 15120000, "step": 22465 }, { "epoch": 0.548945838321159, "grad_norm": 29.440067291259766, "learning_rate": 1.9994171109728227e-06, "loss": 0.1474, "num_input_tokens_seen": 15123136, "step": 22470 }, { "epoch": 0.5490679891530061, "grad_norm": 31.00202178955078, "learning_rate": 1.999414196088092e-06, "loss": 0.1026, "num_input_tokens_seen": 15126592, "step": 22475 }, { "epoch": 0.5491901399848533, "grad_norm": 9.43714427947998, "learning_rate": 1.9994112739353526e-06, "loss": 0.0944, "num_input_tokens_seen": 15129984, "step": 22480 }, { "epoch": 0.5493122908167004, "grad_norm": 30.70720863342285, "learning_rate": 1.9994083445146255e-06, "loss": 0.1571, "num_input_tokens_seen": 15133504, "step": 22485 }, { "epoch": 0.5494344416485476, "grad_norm": 30.060976028442383, "learning_rate": 1.999405407825932e-06, "loss": 0.0858, "num_input_tokens_seen": 15136576, "step": 22490 }, { "epoch": 0.5495565924803948, "grad_norm": 22.569185256958008, "learning_rate": 1.999402463869294e-06, "loss": 0.1534, "num_input_tokens_seen": 15140224, "step": 22495 }, { "epoch": 0.549678743312242, "grad_norm": 4.827227592468262, "learning_rate": 1.9993995126447325e-06, "loss": 0.1497, "num_input_tokens_seen": 15143488, "step": 22500 }, { "epoch": 0.5498008941440892, "grad_norm": 35.69637680053711, "learning_rate": 1.9993965541522684e-06, "loss": 0.1645, "num_input_tokens_seen": 15146816, "step": 22505 }, { "epoch": 0.5499230449759362, "grad_norm": 1.8859728574752808, "learning_rate": 1.999393588391924e-06, "loss": 0.1014, "num_input_tokens_seen": 15150208, "step": 22510 }, { "epoch": 0.5500451958077834, "grad_norm": 12.349617004394531, "learning_rate": 1.9993906153637204e-06, "loss": 0.0376, "num_input_tokens_seen": 15153664, "step": 22515 }, { "epoch": 0.5501673466396306, "grad_norm": 49.3721809387207, "learning_rate": 1.9993876350676796e-06, "loss": 0.0523, "num_input_tokens_seen": 15157248, "step": 22520 }, { "epoch": 0.5502894974714778, "grad_norm": 20.546186447143555, "learning_rate": 1.999384647503823e-06, "loss": 0.1262, "num_input_tokens_seen": 15160256, "step": 22525 }, { "epoch": 0.5504116483033249, "grad_norm": 17.570690155029297, "learning_rate": 1.9993816526721725e-06, "loss": 0.2036, "num_input_tokens_seen": 15163712, "step": 22530 }, { "epoch": 0.5505337991351721, "grad_norm": 33.04792785644531, "learning_rate": 1.9993786505727503e-06, "loss": 0.0762, "num_input_tokens_seen": 15166912, "step": 22535 }, { "epoch": 0.5506559499670193, "grad_norm": 1.1662955284118652, "learning_rate": 1.9993756412055773e-06, "loss": 0.1405, "num_input_tokens_seen": 15170432, "step": 22540 }, { "epoch": 0.5507781007988665, "grad_norm": 15.58870792388916, "learning_rate": 1.999372624570676e-06, "loss": 0.2726, "num_input_tokens_seen": 15173760, "step": 22545 }, { "epoch": 0.5509002516307137, "grad_norm": 24.114355087280273, "learning_rate": 1.999369600668068e-06, "loss": 0.0718, "num_input_tokens_seen": 15176704, "step": 22550 }, { "epoch": 0.5510224024625607, "grad_norm": 9.267765998840332, "learning_rate": 1.9993665694977755e-06, "loss": 0.122, "num_input_tokens_seen": 15180096, "step": 22555 }, { "epoch": 0.5511445532944079, "grad_norm": 0.6705265045166016, "learning_rate": 1.9993635310598207e-06, "loss": 0.0551, "num_input_tokens_seen": 15183168, "step": 22560 }, { "epoch": 0.5512667041262551, "grad_norm": 1.3118199110031128, "learning_rate": 1.9993604853542254e-06, "loss": 0.0613, "num_input_tokens_seen": 15186560, "step": 22565 }, { "epoch": 0.5513888549581023, "grad_norm": 0.7696697115898132, "learning_rate": 1.9993574323810115e-06, "loss": 0.0921, "num_input_tokens_seen": 15189824, "step": 22570 }, { "epoch": 0.5515110057899494, "grad_norm": 15.944690704345703, "learning_rate": 1.999354372140202e-06, "loss": 0.1304, "num_input_tokens_seen": 15192960, "step": 22575 }, { "epoch": 0.5516331566217966, "grad_norm": 28.77052879333496, "learning_rate": 1.9993513046318186e-06, "loss": 0.1192, "num_input_tokens_seen": 15196544, "step": 22580 }, { "epoch": 0.5517553074536438, "grad_norm": 34.70576858520508, "learning_rate": 1.9993482298558836e-06, "loss": 0.1632, "num_input_tokens_seen": 15199552, "step": 22585 }, { "epoch": 0.551877458285491, "grad_norm": 15.37161922454834, "learning_rate": 1.99934514781242e-06, "loss": 0.0365, "num_input_tokens_seen": 15202880, "step": 22590 }, { "epoch": 0.5519996091173381, "grad_norm": 1.150341272354126, "learning_rate": 1.999342058501449e-06, "loss": 0.1904, "num_input_tokens_seen": 15205952, "step": 22595 }, { "epoch": 0.5521217599491852, "grad_norm": 14.19490909576416, "learning_rate": 1.999338961922994e-06, "loss": 0.0517, "num_input_tokens_seen": 15208896, "step": 22600 }, { "epoch": 0.5522439107810324, "grad_norm": 10.007599830627441, "learning_rate": 1.9993358580770774e-06, "loss": 0.2134, "num_input_tokens_seen": 15212224, "step": 22605 }, { "epoch": 0.5523660616128796, "grad_norm": 13.70540714263916, "learning_rate": 1.9993327469637215e-06, "loss": 0.1293, "num_input_tokens_seen": 15215360, "step": 22610 }, { "epoch": 0.5524882124447268, "grad_norm": 0.5830227136611938, "learning_rate": 1.9993296285829492e-06, "loss": 0.0241, "num_input_tokens_seen": 15219136, "step": 22615 }, { "epoch": 0.5526103632765739, "grad_norm": 0.17814908921718597, "learning_rate": 1.999326502934783e-06, "loss": 0.1111, "num_input_tokens_seen": 15222720, "step": 22620 }, { "epoch": 0.5527325141084211, "grad_norm": 31.84588623046875, "learning_rate": 1.9993233700192454e-06, "loss": 0.1049, "num_input_tokens_seen": 15225856, "step": 22625 }, { "epoch": 0.5528546649402682, "grad_norm": 8.72294807434082, "learning_rate": 1.99932022983636e-06, "loss": 0.1601, "num_input_tokens_seen": 15229120, "step": 22630 }, { "epoch": 0.5529768157721154, "grad_norm": 30.907611846923828, "learning_rate": 1.9993170823861488e-06, "loss": 0.2518, "num_input_tokens_seen": 15232384, "step": 22635 }, { "epoch": 0.5530989666039626, "grad_norm": 0.607671856880188, "learning_rate": 1.999313927668635e-06, "loss": 0.0603, "num_input_tokens_seen": 15235904, "step": 22640 }, { "epoch": 0.5532211174358097, "grad_norm": 3.4030096530914307, "learning_rate": 1.9993107656838415e-06, "loss": 0.0919, "num_input_tokens_seen": 15239296, "step": 22645 }, { "epoch": 0.5533432682676569, "grad_norm": 7.613509178161621, "learning_rate": 1.9993075964317912e-06, "loss": 0.092, "num_input_tokens_seen": 15242624, "step": 22650 }, { "epoch": 0.5534654190995041, "grad_norm": 24.67089080810547, "learning_rate": 1.999304419912508e-06, "loss": 0.0941, "num_input_tokens_seen": 15246464, "step": 22655 }, { "epoch": 0.5535875699313513, "grad_norm": 83.90634155273438, "learning_rate": 1.9993012361260134e-06, "loss": 0.1296, "num_input_tokens_seen": 15249984, "step": 22660 }, { "epoch": 0.5537097207631984, "grad_norm": 23.177980422973633, "learning_rate": 1.999298045072332e-06, "loss": 0.1609, "num_input_tokens_seen": 15253184, "step": 22665 }, { "epoch": 0.5538318715950455, "grad_norm": 19.674943923950195, "learning_rate": 1.999294846751486e-06, "loss": 0.1717, "num_input_tokens_seen": 15257280, "step": 22670 }, { "epoch": 0.5539540224268927, "grad_norm": 61.09177780151367, "learning_rate": 1.9992916411634995e-06, "loss": 0.1045, "num_input_tokens_seen": 15260288, "step": 22675 }, { "epoch": 0.5540761732587399, "grad_norm": 0.37195536494255066, "learning_rate": 1.9992884283083954e-06, "loss": 0.0683, "num_input_tokens_seen": 15263424, "step": 22680 }, { "epoch": 0.554198324090587, "grad_norm": 21.757293701171875, "learning_rate": 1.9992852081861967e-06, "loss": 0.345, "num_input_tokens_seen": 15267008, "step": 22685 }, { "epoch": 0.5543204749224342, "grad_norm": 0.7218241095542908, "learning_rate": 1.9992819807969275e-06, "loss": 0.1971, "num_input_tokens_seen": 15270208, "step": 22690 }, { "epoch": 0.5544426257542814, "grad_norm": 17.101781845092773, "learning_rate": 1.9992787461406107e-06, "loss": 0.0953, "num_input_tokens_seen": 15274048, "step": 22695 }, { "epoch": 0.5545647765861286, "grad_norm": 36.492496490478516, "learning_rate": 1.9992755042172705e-06, "loss": 0.1193, "num_input_tokens_seen": 15277760, "step": 22700 }, { "epoch": 0.5546869274179758, "grad_norm": 1.468518614768982, "learning_rate": 1.9992722550269296e-06, "loss": 0.0082, "num_input_tokens_seen": 15281664, "step": 22705 }, { "epoch": 0.5548090782498228, "grad_norm": 33.85728454589844, "learning_rate": 1.9992689985696123e-06, "loss": 0.1653, "num_input_tokens_seen": 15284928, "step": 22710 }, { "epoch": 0.55493122908167, "grad_norm": 0.2482694536447525, "learning_rate": 1.999265734845342e-06, "loss": 0.0878, "num_input_tokens_seen": 15288448, "step": 22715 }, { "epoch": 0.5550533799135172, "grad_norm": 17.697750091552734, "learning_rate": 1.9992624638541425e-06, "loss": 0.2041, "num_input_tokens_seen": 15291200, "step": 22720 }, { "epoch": 0.5551755307453644, "grad_norm": 6.032927989959717, "learning_rate": 1.9992591855960377e-06, "loss": 0.1671, "num_input_tokens_seen": 15294592, "step": 22725 }, { "epoch": 0.5552976815772115, "grad_norm": 37.14913558959961, "learning_rate": 1.9992559000710514e-06, "loss": 0.1456, "num_input_tokens_seen": 15297600, "step": 22730 }, { "epoch": 0.5554198324090587, "grad_norm": 1.839289903640747, "learning_rate": 1.9992526072792077e-06, "loss": 0.1581, "num_input_tokens_seen": 15300992, "step": 22735 }, { "epoch": 0.5555419832409059, "grad_norm": 16.924457550048828, "learning_rate": 1.9992493072205298e-06, "loss": 0.2398, "num_input_tokens_seen": 15304448, "step": 22740 }, { "epoch": 0.5556641340727531, "grad_norm": 0.8010534048080444, "learning_rate": 1.999245999895042e-06, "loss": 0.1129, "num_input_tokens_seen": 15307584, "step": 22745 }, { "epoch": 0.5557862849046002, "grad_norm": 10.311168670654297, "learning_rate": 1.999242685302769e-06, "loss": 0.1458, "num_input_tokens_seen": 15310848, "step": 22750 }, { "epoch": 0.5559084357364473, "grad_norm": 18.564916610717773, "learning_rate": 1.9992393634437343e-06, "loss": 0.0751, "num_input_tokens_seen": 15314624, "step": 22755 }, { "epoch": 0.5560305865682945, "grad_norm": 0.27975115180015564, "learning_rate": 1.999236034317962e-06, "loss": 0.0323, "num_input_tokens_seen": 15317888, "step": 22760 }, { "epoch": 0.5561527374001417, "grad_norm": 1.1480165719985962, "learning_rate": 1.9992326979254764e-06, "loss": 0.1697, "num_input_tokens_seen": 15321216, "step": 22765 }, { "epoch": 0.5562748882319889, "grad_norm": 11.348422050476074, "learning_rate": 1.9992293542663023e-06, "loss": 0.097, "num_input_tokens_seen": 15324416, "step": 22770 }, { "epoch": 0.556397039063836, "grad_norm": 28.212505340576172, "learning_rate": 1.999226003340463e-06, "loss": 0.1083, "num_input_tokens_seen": 15327936, "step": 22775 }, { "epoch": 0.5565191898956832, "grad_norm": 16.56534194946289, "learning_rate": 1.999222645147984e-06, "loss": 0.1821, "num_input_tokens_seen": 15331264, "step": 22780 }, { "epoch": 0.5566413407275304, "grad_norm": 8.404296875, "learning_rate": 1.999219279688889e-06, "loss": 0.08, "num_input_tokens_seen": 15334400, "step": 22785 }, { "epoch": 0.5567634915593775, "grad_norm": 8.955206871032715, "learning_rate": 1.999215906963203e-06, "loss": 0.112, "num_input_tokens_seen": 15337792, "step": 22790 }, { "epoch": 0.5568856423912247, "grad_norm": 1.1889548301696777, "learning_rate": 1.9992125269709494e-06, "loss": 0.0274, "num_input_tokens_seen": 15341504, "step": 22795 }, { "epoch": 0.5570077932230718, "grad_norm": 12.320734977722168, "learning_rate": 1.9992091397121536e-06, "loss": 0.0542, "num_input_tokens_seen": 15344704, "step": 22800 }, { "epoch": 0.557129944054919, "grad_norm": 0.4388432204723358, "learning_rate": 1.999205745186841e-06, "loss": 0.1122, "num_input_tokens_seen": 15347648, "step": 22805 }, { "epoch": 0.5572520948867662, "grad_norm": 1.2065421342849731, "learning_rate": 1.9992023433950346e-06, "loss": 0.0918, "num_input_tokens_seen": 15351360, "step": 22810 }, { "epoch": 0.5573742457186134, "grad_norm": 0.21142229437828064, "learning_rate": 1.9991989343367604e-06, "loss": 0.2024, "num_input_tokens_seen": 15354624, "step": 22815 }, { "epoch": 0.5574963965504605, "grad_norm": 1.033294916152954, "learning_rate": 1.9991955180120426e-06, "loss": 0.1199, "num_input_tokens_seen": 15357952, "step": 22820 }, { "epoch": 0.5576185473823076, "grad_norm": 59.05493927001953, "learning_rate": 1.9991920944209065e-06, "loss": 0.1254, "num_input_tokens_seen": 15361408, "step": 22825 }, { "epoch": 0.5577406982141548, "grad_norm": 21.44487190246582, "learning_rate": 1.9991886635633768e-06, "loss": 0.1779, "num_input_tokens_seen": 15364672, "step": 22830 }, { "epoch": 0.557862849046002, "grad_norm": 2.3592638969421387, "learning_rate": 1.9991852254394783e-06, "loss": 0.146, "num_input_tokens_seen": 15367936, "step": 22835 }, { "epoch": 0.5579849998778492, "grad_norm": 30.120269775390625, "learning_rate": 1.9991817800492357e-06, "loss": 0.1583, "num_input_tokens_seen": 15370880, "step": 22840 }, { "epoch": 0.5581071507096963, "grad_norm": 21.18474578857422, "learning_rate": 1.999178327392675e-06, "loss": 0.0518, "num_input_tokens_seen": 15374144, "step": 22845 }, { "epoch": 0.5582293015415435, "grad_norm": 0.8629884719848633, "learning_rate": 1.9991748674698202e-06, "loss": 0.0418, "num_input_tokens_seen": 15376960, "step": 22850 }, { "epoch": 0.5583514523733907, "grad_norm": 9.767329216003418, "learning_rate": 1.9991714002806977e-06, "loss": 0.1065, "num_input_tokens_seen": 15380608, "step": 22855 }, { "epoch": 0.5584736032052379, "grad_norm": 11.950826644897461, "learning_rate": 1.9991679258253314e-06, "loss": 0.1124, "num_input_tokens_seen": 15383872, "step": 22860 }, { "epoch": 0.558595754037085, "grad_norm": 22.139690399169922, "learning_rate": 1.9991644441037476e-06, "loss": 0.1763, "num_input_tokens_seen": 15387072, "step": 22865 }, { "epoch": 0.5587179048689321, "grad_norm": 7.289351940155029, "learning_rate": 1.9991609551159713e-06, "loss": 0.0812, "num_input_tokens_seen": 15390528, "step": 22870 }, { "epoch": 0.5588400557007793, "grad_norm": 17.15496826171875, "learning_rate": 1.9991574588620274e-06, "loss": 0.3103, "num_input_tokens_seen": 15393856, "step": 22875 }, { "epoch": 0.5589622065326265, "grad_norm": 56.3012580871582, "learning_rate": 1.999153955341942e-06, "loss": 0.2112, "num_input_tokens_seen": 15397440, "step": 22880 }, { "epoch": 0.5590843573644737, "grad_norm": 2.765378475189209, "learning_rate": 1.99915044455574e-06, "loss": 0.0452, "num_input_tokens_seen": 15400512, "step": 22885 }, { "epoch": 0.5592065081963208, "grad_norm": 1.1748318672180176, "learning_rate": 1.999146926503448e-06, "loss": 0.1835, "num_input_tokens_seen": 15403712, "step": 22890 }, { "epoch": 0.559328659028168, "grad_norm": 29.10700225830078, "learning_rate": 1.9991434011850897e-06, "loss": 0.1538, "num_input_tokens_seen": 15406976, "step": 22895 }, { "epoch": 0.5594508098600152, "grad_norm": 43.47568893432617, "learning_rate": 1.9991398686006927e-06, "loss": 0.1319, "num_input_tokens_seen": 15410432, "step": 22900 }, { "epoch": 0.5595729606918624, "grad_norm": 20.069847106933594, "learning_rate": 1.9991363287502816e-06, "loss": 0.1377, "num_input_tokens_seen": 15414016, "step": 22905 }, { "epoch": 0.5596951115237094, "grad_norm": 9.078425407409668, "learning_rate": 1.999132781633882e-06, "loss": 0.2197, "num_input_tokens_seen": 15417344, "step": 22910 }, { "epoch": 0.5598172623555566, "grad_norm": 31.116849899291992, "learning_rate": 1.9991292272515204e-06, "loss": 0.1525, "num_input_tokens_seen": 15421248, "step": 22915 }, { "epoch": 0.5599394131874038, "grad_norm": 3.8848142623901367, "learning_rate": 1.9991256656032224e-06, "loss": 0.1281, "num_input_tokens_seen": 15424320, "step": 22920 }, { "epoch": 0.560061564019251, "grad_norm": 13.48864459991455, "learning_rate": 1.999122096689014e-06, "loss": 0.1147, "num_input_tokens_seen": 15427264, "step": 22925 }, { "epoch": 0.5601837148510982, "grad_norm": 20.86384391784668, "learning_rate": 1.9991185205089206e-06, "loss": 0.0943, "num_input_tokens_seen": 15430912, "step": 22930 }, { "epoch": 0.5603058656829453, "grad_norm": 23.013959884643555, "learning_rate": 1.9991149370629684e-06, "loss": 0.0882, "num_input_tokens_seen": 15434496, "step": 22935 }, { "epoch": 0.5604280165147925, "grad_norm": 2.0098519325256348, "learning_rate": 1.999111346351184e-06, "loss": 0.1102, "num_input_tokens_seen": 15438016, "step": 22940 }, { "epoch": 0.5605501673466396, "grad_norm": 13.733756065368652, "learning_rate": 1.9991077483735934e-06, "loss": 0.2042, "num_input_tokens_seen": 15441216, "step": 22945 }, { "epoch": 0.5606723181784868, "grad_norm": 8.212313652038574, "learning_rate": 1.9991041431302224e-06, "loss": 0.0349, "num_input_tokens_seen": 15444544, "step": 22950 }, { "epoch": 0.5607944690103339, "grad_norm": 33.92264175415039, "learning_rate": 1.9991005306210967e-06, "loss": 0.2182, "num_input_tokens_seen": 15448256, "step": 22955 }, { "epoch": 0.5609166198421811, "grad_norm": 9.712660789489746, "learning_rate": 1.999096910846244e-06, "loss": 0.1111, "num_input_tokens_seen": 15451584, "step": 22960 }, { "epoch": 0.5610387706740283, "grad_norm": 16.679542541503906, "learning_rate": 1.999093283805689e-06, "loss": 0.093, "num_input_tokens_seen": 15455232, "step": 22965 }, { "epoch": 0.5611609215058755, "grad_norm": 12.44924545288086, "learning_rate": 1.99908964949946e-06, "loss": 0.1077, "num_input_tokens_seen": 15458432, "step": 22970 }, { "epoch": 0.5612830723377226, "grad_norm": 1.9566651582717896, "learning_rate": 1.9990860079275818e-06, "loss": 0.2061, "num_input_tokens_seen": 15461312, "step": 22975 }, { "epoch": 0.5614052231695698, "grad_norm": 13.28664493560791, "learning_rate": 1.9990823590900812e-06, "loss": 0.0531, "num_input_tokens_seen": 15465088, "step": 22980 }, { "epoch": 0.561527374001417, "grad_norm": 1.9221934080123901, "learning_rate": 1.9990787029869853e-06, "loss": 0.1349, "num_input_tokens_seen": 15468928, "step": 22985 }, { "epoch": 0.5616495248332641, "grad_norm": 15.852862358093262, "learning_rate": 1.9990750396183203e-06, "loss": 0.113, "num_input_tokens_seen": 15472512, "step": 22990 }, { "epoch": 0.5617716756651113, "grad_norm": 29.440183639526367, "learning_rate": 1.999071368984113e-06, "loss": 0.0624, "num_input_tokens_seen": 15476096, "step": 22995 }, { "epoch": 0.5618938264969584, "grad_norm": 30.857269287109375, "learning_rate": 1.9990676910843897e-06, "loss": 0.1023, "num_input_tokens_seen": 15479168, "step": 23000 }, { "epoch": 0.5620159773288056, "grad_norm": 4.187375068664551, "learning_rate": 1.9990640059191775e-06, "loss": 0.0564, "num_input_tokens_seen": 15483072, "step": 23005 }, { "epoch": 0.5621381281606528, "grad_norm": 0.8417240381240845, "learning_rate": 1.999060313488503e-06, "loss": 0.0212, "num_input_tokens_seen": 15487360, "step": 23010 }, { "epoch": 0.5622602789925, "grad_norm": 36.2202033996582, "learning_rate": 1.9990566137923935e-06, "loss": 0.1005, "num_input_tokens_seen": 15491072, "step": 23015 }, { "epoch": 0.562382429824347, "grad_norm": 1.2734776735305786, "learning_rate": 1.9990529068308755e-06, "loss": 0.1663, "num_input_tokens_seen": 15494336, "step": 23020 }, { "epoch": 0.5625045806561942, "grad_norm": 23.2691593170166, "learning_rate": 1.999049192603976e-06, "loss": 0.0967, "num_input_tokens_seen": 15497920, "step": 23025 }, { "epoch": 0.5626267314880414, "grad_norm": 40.35521697998047, "learning_rate": 1.999045471111722e-06, "loss": 0.0999, "num_input_tokens_seen": 15501184, "step": 23030 }, { "epoch": 0.5627488823198886, "grad_norm": 2.3825042247772217, "learning_rate": 1.999041742354141e-06, "loss": 0.128, "num_input_tokens_seen": 15505280, "step": 23035 }, { "epoch": 0.5628710331517358, "grad_norm": 11.271585464477539, "learning_rate": 1.9990380063312596e-06, "loss": 0.1172, "num_input_tokens_seen": 15508864, "step": 23040 }, { "epoch": 0.5629931839835829, "grad_norm": 29.12808609008789, "learning_rate": 1.999034263043105e-06, "loss": 0.2176, "num_input_tokens_seen": 15511808, "step": 23045 }, { "epoch": 0.5631153348154301, "grad_norm": 37.746665954589844, "learning_rate": 1.999030512489704e-06, "loss": 0.0788, "num_input_tokens_seen": 15515072, "step": 23050 }, { "epoch": 0.5632374856472773, "grad_norm": 13.444511413574219, "learning_rate": 1.9990267546710853e-06, "loss": 0.0788, "num_input_tokens_seen": 15518336, "step": 23055 }, { "epoch": 0.5633596364791245, "grad_norm": 28.247379302978516, "learning_rate": 1.9990229895872747e-06, "loss": 0.0714, "num_input_tokens_seen": 15521344, "step": 23060 }, { "epoch": 0.5634817873109715, "grad_norm": 34.90865707397461, "learning_rate": 1.9990192172383004e-06, "loss": 0.1301, "num_input_tokens_seen": 15524672, "step": 23065 }, { "epoch": 0.5636039381428187, "grad_norm": 1.4494937658309937, "learning_rate": 1.99901543762419e-06, "loss": 0.1546, "num_input_tokens_seen": 15528384, "step": 23070 }, { "epoch": 0.5637260889746659, "grad_norm": 0.2677226960659027, "learning_rate": 1.99901165074497e-06, "loss": 0.0498, "num_input_tokens_seen": 15531456, "step": 23075 }, { "epoch": 0.5638482398065131, "grad_norm": 57.566551208496094, "learning_rate": 1.999007856600669e-06, "loss": 0.0804, "num_input_tokens_seen": 15534656, "step": 23080 }, { "epoch": 0.5639703906383603, "grad_norm": 0.6210819482803345, "learning_rate": 1.999004055191314e-06, "loss": 0.0787, "num_input_tokens_seen": 15537664, "step": 23085 }, { "epoch": 0.5640925414702074, "grad_norm": 60.58045196533203, "learning_rate": 1.9990002465169333e-06, "loss": 0.1669, "num_input_tokens_seen": 15541056, "step": 23090 }, { "epoch": 0.5642146923020546, "grad_norm": 25.531545639038086, "learning_rate": 1.9989964305775535e-06, "loss": 0.116, "num_input_tokens_seen": 15545024, "step": 23095 }, { "epoch": 0.5643368431339018, "grad_norm": 0.1737961322069168, "learning_rate": 1.998992607373203e-06, "loss": 0.1408, "num_input_tokens_seen": 15548608, "step": 23100 }, { "epoch": 0.564458993965749, "grad_norm": 5.435920715332031, "learning_rate": 1.9989887769039097e-06, "loss": 0.0136, "num_input_tokens_seen": 15551680, "step": 23105 }, { "epoch": 0.564581144797596, "grad_norm": 21.51247787475586, "learning_rate": 1.9989849391697013e-06, "loss": 0.1633, "num_input_tokens_seen": 15555008, "step": 23110 }, { "epoch": 0.5647032956294432, "grad_norm": 40.0219612121582, "learning_rate": 1.9989810941706056e-06, "loss": 0.2074, "num_input_tokens_seen": 15558720, "step": 23115 }, { "epoch": 0.5648254464612904, "grad_norm": 0.7361969947814941, "learning_rate": 1.998977241906651e-06, "loss": 0.0749, "num_input_tokens_seen": 15562048, "step": 23120 }, { "epoch": 0.5649475972931376, "grad_norm": 21.490753173828125, "learning_rate": 1.9989733823778653e-06, "loss": 0.2976, "num_input_tokens_seen": 15564992, "step": 23125 }, { "epoch": 0.5650697481249848, "grad_norm": 7.7838454246521, "learning_rate": 1.998969515584276e-06, "loss": 0.1892, "num_input_tokens_seen": 15569152, "step": 23130 }, { "epoch": 0.5651918989568319, "grad_norm": 25.361003875732422, "learning_rate": 1.9989656415259118e-06, "loss": 0.2072, "num_input_tokens_seen": 15572928, "step": 23135 }, { "epoch": 0.565314049788679, "grad_norm": 4.220102310180664, "learning_rate": 1.998961760202801e-06, "loss": 0.1046, "num_input_tokens_seen": 15576256, "step": 23140 }, { "epoch": 0.5654362006205262, "grad_norm": 14.987222671508789, "learning_rate": 1.9989578716149713e-06, "loss": 0.213, "num_input_tokens_seen": 15579072, "step": 23145 }, { "epoch": 0.5655583514523734, "grad_norm": 1.6472913026809692, "learning_rate": 1.9989539757624515e-06, "loss": 0.0259, "num_input_tokens_seen": 15582912, "step": 23150 }, { "epoch": 0.5656805022842205, "grad_norm": 29.221179962158203, "learning_rate": 1.9989500726452697e-06, "loss": 0.1362, "num_input_tokens_seen": 15586496, "step": 23155 }, { "epoch": 0.5658026531160677, "grad_norm": 0.8034424185752869, "learning_rate": 1.9989461622634543e-06, "loss": 0.0701, "num_input_tokens_seen": 15589632, "step": 23160 }, { "epoch": 0.5659248039479149, "grad_norm": 6.2514238357543945, "learning_rate": 1.998942244617034e-06, "loss": 0.0172, "num_input_tokens_seen": 15593088, "step": 23165 }, { "epoch": 0.5660469547797621, "grad_norm": 12.599471092224121, "learning_rate": 1.998938319706036e-06, "loss": 0.2101, "num_input_tokens_seen": 15596352, "step": 23170 }, { "epoch": 0.5661691056116093, "grad_norm": 0.26789960265159607, "learning_rate": 1.9989343875304908e-06, "loss": 0.2097, "num_input_tokens_seen": 15600064, "step": 23175 }, { "epoch": 0.5662912564434563, "grad_norm": 0.5412006378173828, "learning_rate": 1.998930448090426e-06, "loss": 0.0918, "num_input_tokens_seen": 15603520, "step": 23180 }, { "epoch": 0.5664134072753035, "grad_norm": 0.15992608666419983, "learning_rate": 1.99892650138587e-06, "loss": 0.0316, "num_input_tokens_seen": 15607040, "step": 23185 }, { "epoch": 0.5665355581071507, "grad_norm": 2.5661678314208984, "learning_rate": 1.998922547416852e-06, "loss": 0.082, "num_input_tokens_seen": 15610368, "step": 23190 }, { "epoch": 0.5666577089389979, "grad_norm": 33.32538604736328, "learning_rate": 1.9989185861834003e-06, "loss": 0.1905, "num_input_tokens_seen": 15613696, "step": 23195 }, { "epoch": 0.566779859770845, "grad_norm": 2.2595510482788086, "learning_rate": 1.998914617685544e-06, "loss": 0.0609, "num_input_tokens_seen": 15617408, "step": 23200 }, { "epoch": 0.5669020106026922, "grad_norm": 55.58548355102539, "learning_rate": 1.998910641923312e-06, "loss": 0.1334, "num_input_tokens_seen": 15620480, "step": 23205 }, { "epoch": 0.5670241614345394, "grad_norm": 21.146512985229492, "learning_rate": 1.9989066588967333e-06, "loss": 0.1426, "num_input_tokens_seen": 15623424, "step": 23210 }, { "epoch": 0.5671463122663866, "grad_norm": 41.37771987915039, "learning_rate": 1.9989026686058365e-06, "loss": 0.1113, "num_input_tokens_seen": 15626624, "step": 23215 }, { "epoch": 0.5672684630982336, "grad_norm": 31.741256713867188, "learning_rate": 1.998898671050651e-06, "loss": 0.0721, "num_input_tokens_seen": 15630144, "step": 23220 }, { "epoch": 0.5673906139300808, "grad_norm": 0.1878584921360016, "learning_rate": 1.9988946662312052e-06, "loss": 0.1699, "num_input_tokens_seen": 15633472, "step": 23225 }, { "epoch": 0.567512764761928, "grad_norm": 33.57866287231445, "learning_rate": 1.9988906541475292e-06, "loss": 0.2271, "num_input_tokens_seen": 15636928, "step": 23230 }, { "epoch": 0.5676349155937752, "grad_norm": 10.572066307067871, "learning_rate": 1.9988866347996517e-06, "loss": 0.144, "num_input_tokens_seen": 15640256, "step": 23235 }, { "epoch": 0.5677570664256224, "grad_norm": 1.2487159967422485, "learning_rate": 1.9988826081876018e-06, "loss": 0.0924, "num_input_tokens_seen": 15643456, "step": 23240 }, { "epoch": 0.5678792172574695, "grad_norm": 22.059751510620117, "learning_rate": 1.9988785743114087e-06, "loss": 0.1721, "num_input_tokens_seen": 15646400, "step": 23245 }, { "epoch": 0.5680013680893167, "grad_norm": 35.92079162597656, "learning_rate": 1.9988745331711022e-06, "loss": 0.0163, "num_input_tokens_seen": 15649472, "step": 23250 }, { "epoch": 0.5681235189211639, "grad_norm": 0.7913476228713989, "learning_rate": 1.9988704847667115e-06, "loss": 0.062, "num_input_tokens_seen": 15653120, "step": 23255 }, { "epoch": 0.568245669753011, "grad_norm": 1.6168322563171387, "learning_rate": 1.9988664290982657e-06, "loss": 0.108, "num_input_tokens_seen": 15657088, "step": 23260 }, { "epoch": 0.5683678205848581, "grad_norm": 46.63363265991211, "learning_rate": 1.998862366165795e-06, "loss": 0.2226, "num_input_tokens_seen": 15660288, "step": 23265 }, { "epoch": 0.5684899714167053, "grad_norm": 33.23601150512695, "learning_rate": 1.998858295969328e-06, "loss": 0.1452, "num_input_tokens_seen": 15663424, "step": 23270 }, { "epoch": 0.5686121222485525, "grad_norm": 2.1423802375793457, "learning_rate": 1.998854218508895e-06, "loss": 0.0867, "num_input_tokens_seen": 15666496, "step": 23275 }, { "epoch": 0.5687342730803997, "grad_norm": 0.09010536223649979, "learning_rate": 1.9988501337845256e-06, "loss": 0.0866, "num_input_tokens_seen": 15670272, "step": 23280 }, { "epoch": 0.5688564239122469, "grad_norm": 2.580996036529541, "learning_rate": 1.9988460417962494e-06, "loss": 0.0921, "num_input_tokens_seen": 15673600, "step": 23285 }, { "epoch": 0.568978574744094, "grad_norm": 26.938867568969727, "learning_rate": 1.998841942544096e-06, "loss": 0.0575, "num_input_tokens_seen": 15676864, "step": 23290 }, { "epoch": 0.5691007255759412, "grad_norm": 66.43077850341797, "learning_rate": 1.9988378360280955e-06, "loss": 0.1449, "num_input_tokens_seen": 15680320, "step": 23295 }, { "epoch": 0.5692228764077883, "grad_norm": 7.964977264404297, "learning_rate": 1.9988337222482776e-06, "loss": 0.198, "num_input_tokens_seen": 15683328, "step": 23300 }, { "epoch": 0.5693450272396355, "grad_norm": 37.287174224853516, "learning_rate": 1.998829601204672e-06, "loss": 0.1606, "num_input_tokens_seen": 15686656, "step": 23305 }, { "epoch": 0.5694671780714826, "grad_norm": 1.1268587112426758, "learning_rate": 1.998825472897309e-06, "loss": 0.0729, "num_input_tokens_seen": 15690368, "step": 23310 }, { "epoch": 0.5695893289033298, "grad_norm": 2.2257397174835205, "learning_rate": 1.9988213373262183e-06, "loss": 0.1514, "num_input_tokens_seen": 15693504, "step": 23315 }, { "epoch": 0.569711479735177, "grad_norm": 1.2565116882324219, "learning_rate": 1.9988171944914305e-06, "loss": 0.0697, "num_input_tokens_seen": 15696640, "step": 23320 }, { "epoch": 0.5698336305670242, "grad_norm": 70.08368682861328, "learning_rate": 1.998813044392975e-06, "loss": 0.2161, "num_input_tokens_seen": 15700096, "step": 23325 }, { "epoch": 0.5699557813988714, "grad_norm": 8.830229759216309, "learning_rate": 1.9988088870308824e-06, "loss": 0.0429, "num_input_tokens_seen": 15703424, "step": 23330 }, { "epoch": 0.5700779322307185, "grad_norm": 76.5599136352539, "learning_rate": 1.9988047224051835e-06, "loss": 0.208, "num_input_tokens_seen": 15707008, "step": 23335 }, { "epoch": 0.5702000830625656, "grad_norm": 9.068888664245605, "learning_rate": 1.9988005505159078e-06, "loss": 0.1379, "num_input_tokens_seen": 15710592, "step": 23340 }, { "epoch": 0.5703222338944128, "grad_norm": 45.0673942565918, "learning_rate": 1.9987963713630856e-06, "loss": 0.4155, "num_input_tokens_seen": 15714304, "step": 23345 }, { "epoch": 0.57044438472626, "grad_norm": 36.7464714050293, "learning_rate": 1.9987921849467476e-06, "loss": 0.2139, "num_input_tokens_seen": 15717824, "step": 23350 }, { "epoch": 0.5705665355581071, "grad_norm": 0.4737611711025238, "learning_rate": 1.998787991266924e-06, "loss": 0.0858, "num_input_tokens_seen": 15721216, "step": 23355 }, { "epoch": 0.5706886863899543, "grad_norm": 21.547607421875, "learning_rate": 1.998783790323646e-06, "loss": 0.1141, "num_input_tokens_seen": 15724352, "step": 23360 }, { "epoch": 0.5708108372218015, "grad_norm": 12.531356811523438, "learning_rate": 1.998779582116943e-06, "loss": 0.107, "num_input_tokens_seen": 15727808, "step": 23365 }, { "epoch": 0.5709329880536487, "grad_norm": 5.0288872718811035, "learning_rate": 1.9987753666468473e-06, "loss": 0.1105, "num_input_tokens_seen": 15730752, "step": 23370 }, { "epoch": 0.5710551388854959, "grad_norm": 0.6317335963249207, "learning_rate": 1.9987711439133877e-06, "loss": 0.0375, "num_input_tokens_seen": 15734016, "step": 23375 }, { "epoch": 0.5711772897173429, "grad_norm": 29.76620864868164, "learning_rate": 1.9987669139165955e-06, "loss": 0.1535, "num_input_tokens_seen": 15737664, "step": 23380 }, { "epoch": 0.5712994405491901, "grad_norm": 0.38784685730934143, "learning_rate": 1.998762676656502e-06, "loss": 0.07, "num_input_tokens_seen": 15741120, "step": 23385 }, { "epoch": 0.5714215913810373, "grad_norm": 26.450305938720703, "learning_rate": 1.9987584321331377e-06, "loss": 0.1223, "num_input_tokens_seen": 15744320, "step": 23390 }, { "epoch": 0.5715437422128845, "grad_norm": 15.62353229522705, "learning_rate": 1.9987541803465335e-06, "loss": 0.0944, "num_input_tokens_seen": 15747776, "step": 23395 }, { "epoch": 0.5716658930447316, "grad_norm": 0.3525586426258087, "learning_rate": 1.9987499212967205e-06, "loss": 0.0861, "num_input_tokens_seen": 15751552, "step": 23400 }, { "epoch": 0.5717880438765788, "grad_norm": 0.8962100148200989, "learning_rate": 1.998745654983729e-06, "loss": 0.1516, "num_input_tokens_seen": 15754752, "step": 23405 }, { "epoch": 0.571910194708426, "grad_norm": 3.1474618911743164, "learning_rate": 1.9987413814075907e-06, "loss": 0.1224, "num_input_tokens_seen": 15757888, "step": 23410 }, { "epoch": 0.5720323455402732, "grad_norm": 0.26637038588523865, "learning_rate": 1.998737100568336e-06, "loss": 0.0904, "num_input_tokens_seen": 15761152, "step": 23415 }, { "epoch": 0.5721544963721203, "grad_norm": 16.29987144470215, "learning_rate": 1.998732812465997e-06, "loss": 0.1148, "num_input_tokens_seen": 15764672, "step": 23420 }, { "epoch": 0.5722766472039674, "grad_norm": 0.1825859397649765, "learning_rate": 1.9987285171006042e-06, "loss": 0.0455, "num_input_tokens_seen": 15768064, "step": 23425 }, { "epoch": 0.5723987980358146, "grad_norm": 12.438610076904297, "learning_rate": 1.998724214472189e-06, "loss": 0.1486, "num_input_tokens_seen": 15771328, "step": 23430 }, { "epoch": 0.5725209488676618, "grad_norm": 12.719995498657227, "learning_rate": 1.9987199045807823e-06, "loss": 0.2083, "num_input_tokens_seen": 15774528, "step": 23435 }, { "epoch": 0.572643099699509, "grad_norm": 34.43893814086914, "learning_rate": 1.9987155874264166e-06, "loss": 0.1026, "num_input_tokens_seen": 15777728, "step": 23440 }, { "epoch": 0.5727652505313561, "grad_norm": 8.031460762023926, "learning_rate": 1.998711263009122e-06, "loss": 0.0887, "num_input_tokens_seen": 15781120, "step": 23445 }, { "epoch": 0.5728874013632033, "grad_norm": 36.22572326660156, "learning_rate": 1.9987069313289307e-06, "loss": 0.2051, "num_input_tokens_seen": 15784320, "step": 23450 }, { "epoch": 0.5730095521950505, "grad_norm": 0.5932744741439819, "learning_rate": 1.9987025923858736e-06, "loss": 0.2077, "num_input_tokens_seen": 15787584, "step": 23455 }, { "epoch": 0.5731317030268976, "grad_norm": 26.168432235717773, "learning_rate": 1.998698246179983e-06, "loss": 0.3206, "num_input_tokens_seen": 15790976, "step": 23460 }, { "epoch": 0.5732538538587448, "grad_norm": 26.840835571289062, "learning_rate": 1.9986938927112903e-06, "loss": 0.1039, "num_input_tokens_seen": 15794368, "step": 23465 }, { "epoch": 0.5733760046905919, "grad_norm": 6.414973735809326, "learning_rate": 1.998689531979827e-06, "loss": 0.0355, "num_input_tokens_seen": 15797952, "step": 23470 }, { "epoch": 0.5734981555224391, "grad_norm": 12.151657104492188, "learning_rate": 1.998685163985624e-06, "loss": 0.1528, "num_input_tokens_seen": 15800960, "step": 23475 }, { "epoch": 0.5736203063542863, "grad_norm": 2.4973652362823486, "learning_rate": 1.9986807887287145e-06, "loss": 0.0738, "num_input_tokens_seen": 15805248, "step": 23480 }, { "epoch": 0.5737424571861335, "grad_norm": 8.367063522338867, "learning_rate": 1.99867640620913e-06, "loss": 0.0482, "num_input_tokens_seen": 15808384, "step": 23485 }, { "epoch": 0.5738646080179806, "grad_norm": 10.651350975036621, "learning_rate": 1.9986720164269014e-06, "loss": 0.1362, "num_input_tokens_seen": 15811392, "step": 23490 }, { "epoch": 0.5739867588498277, "grad_norm": 21.571401596069336, "learning_rate": 1.998667619382062e-06, "loss": 0.195, "num_input_tokens_seen": 15814784, "step": 23495 }, { "epoch": 0.5741089096816749, "grad_norm": 32.89027404785156, "learning_rate": 1.998663215074642e-06, "loss": 0.1152, "num_input_tokens_seen": 15818304, "step": 23500 }, { "epoch": 0.5742310605135221, "grad_norm": 18.197383880615234, "learning_rate": 1.9986588035046755e-06, "loss": 0.1916, "num_input_tokens_seen": 15821632, "step": 23505 }, { "epoch": 0.5743532113453692, "grad_norm": 16.271366119384766, "learning_rate": 1.998654384672193e-06, "loss": 0.1102, "num_input_tokens_seen": 15825280, "step": 23510 }, { "epoch": 0.5744753621772164, "grad_norm": 20.66217041015625, "learning_rate": 1.9986499585772275e-06, "loss": 0.2022, "num_input_tokens_seen": 15828672, "step": 23515 }, { "epoch": 0.5745975130090636, "grad_norm": 12.838661193847656, "learning_rate": 1.998645525219811e-06, "loss": 0.1016, "num_input_tokens_seen": 15831808, "step": 23520 }, { "epoch": 0.5747196638409108, "grad_norm": 27.447620391845703, "learning_rate": 1.9986410845999752e-06, "loss": 0.2264, "num_input_tokens_seen": 15835072, "step": 23525 }, { "epoch": 0.574841814672758, "grad_norm": 20.3715763092041, "learning_rate": 1.998636636717753e-06, "loss": 0.0441, "num_input_tokens_seen": 15838144, "step": 23530 }, { "epoch": 0.574963965504605, "grad_norm": 16.542808532714844, "learning_rate": 1.9986321815731766e-06, "loss": 0.1378, "num_input_tokens_seen": 15841728, "step": 23535 }, { "epoch": 0.5750861163364522, "grad_norm": 23.26079559326172, "learning_rate": 1.998627719166278e-06, "loss": 0.0454, "num_input_tokens_seen": 15845504, "step": 23540 }, { "epoch": 0.5752082671682994, "grad_norm": 16.223146438598633, "learning_rate": 1.9986232494970908e-06, "loss": 0.0466, "num_input_tokens_seen": 15849600, "step": 23545 }, { "epoch": 0.5753304180001466, "grad_norm": 28.47535514831543, "learning_rate": 1.9986187725656466e-06, "loss": 0.0798, "num_input_tokens_seen": 15852736, "step": 23550 }, { "epoch": 0.5754525688319937, "grad_norm": 21.106651306152344, "learning_rate": 1.9986142883719774e-06, "loss": 0.1278, "num_input_tokens_seen": 15856064, "step": 23555 }, { "epoch": 0.5755747196638409, "grad_norm": 0.2558057904243469, "learning_rate": 1.998609796916117e-06, "loss": 0.063, "num_input_tokens_seen": 15859648, "step": 23560 }, { "epoch": 0.5756968704956881, "grad_norm": 24.275699615478516, "learning_rate": 1.998605298198098e-06, "loss": 0.0897, "num_input_tokens_seen": 15862976, "step": 23565 }, { "epoch": 0.5758190213275353, "grad_norm": 14.609818458557129, "learning_rate": 1.9986007922179523e-06, "loss": 0.0718, "num_input_tokens_seen": 15866496, "step": 23570 }, { "epoch": 0.5759411721593825, "grad_norm": 0.0939621701836586, "learning_rate": 1.9985962789757126e-06, "loss": 0.101, "num_input_tokens_seen": 15869888, "step": 23575 }, { "epoch": 0.5760633229912295, "grad_norm": 3.2896945476531982, "learning_rate": 1.9985917584714126e-06, "loss": 0.0232, "num_input_tokens_seen": 15872960, "step": 23580 }, { "epoch": 0.5761854738230767, "grad_norm": 4.536641597747803, "learning_rate": 1.998587230705085e-06, "loss": 0.1324, "num_input_tokens_seen": 15876160, "step": 23585 }, { "epoch": 0.5763076246549239, "grad_norm": 0.8103224635124207, "learning_rate": 1.9985826956767618e-06, "loss": 0.0985, "num_input_tokens_seen": 15879360, "step": 23590 }, { "epoch": 0.5764297754867711, "grad_norm": 24.116247177124023, "learning_rate": 1.998578153386477e-06, "loss": 0.1614, "num_input_tokens_seen": 15883008, "step": 23595 }, { "epoch": 0.5765519263186182, "grad_norm": 0.26677149534225464, "learning_rate": 1.9985736038342634e-06, "loss": 0.0492, "num_input_tokens_seen": 15886400, "step": 23600 }, { "epoch": 0.5766740771504654, "grad_norm": 16.559232711791992, "learning_rate": 1.9985690470201537e-06, "loss": 0.0798, "num_input_tokens_seen": 15889920, "step": 23605 }, { "epoch": 0.5767962279823126, "grad_norm": 45.85171127319336, "learning_rate": 1.9985644829441816e-06, "loss": 0.1079, "num_input_tokens_seen": 15893248, "step": 23610 }, { "epoch": 0.5769183788141597, "grad_norm": 20.431299209594727, "learning_rate": 1.9985599116063796e-06, "loss": 0.1202, "num_input_tokens_seen": 15896000, "step": 23615 }, { "epoch": 0.5770405296460069, "grad_norm": 14.595911979675293, "learning_rate": 1.9985553330067816e-06, "loss": 0.1365, "num_input_tokens_seen": 15901504, "step": 23620 }, { "epoch": 0.577162680477854, "grad_norm": 49.17565155029297, "learning_rate": 1.9985507471454207e-06, "loss": 0.1305, "num_input_tokens_seen": 15904832, "step": 23625 }, { "epoch": 0.5772848313097012, "grad_norm": 14.491154670715332, "learning_rate": 1.9985461540223303e-06, "loss": 0.212, "num_input_tokens_seen": 15908224, "step": 23630 }, { "epoch": 0.5774069821415484, "grad_norm": 25.94092559814453, "learning_rate": 1.9985415536375434e-06, "loss": 0.1236, "num_input_tokens_seen": 15911744, "step": 23635 }, { "epoch": 0.5775291329733956, "grad_norm": 12.70712947845459, "learning_rate": 1.998536945991094e-06, "loss": 0.3236, "num_input_tokens_seen": 15914816, "step": 23640 }, { "epoch": 0.5776512838052427, "grad_norm": 35.25370407104492, "learning_rate": 1.9985323310830152e-06, "loss": 0.0493, "num_input_tokens_seen": 15917888, "step": 23645 }, { "epoch": 0.5777734346370899, "grad_norm": 43.0831413269043, "learning_rate": 1.9985277089133405e-06, "loss": 0.094, "num_input_tokens_seen": 15921536, "step": 23650 }, { "epoch": 0.577895585468937, "grad_norm": 21.820783615112305, "learning_rate": 1.998523079482104e-06, "loss": 0.0982, "num_input_tokens_seen": 15924800, "step": 23655 }, { "epoch": 0.5780177363007842, "grad_norm": 10.404958724975586, "learning_rate": 1.998518442789339e-06, "loss": 0.2915, "num_input_tokens_seen": 15928000, "step": 23660 }, { "epoch": 0.5781398871326314, "grad_norm": 1.6005030870437622, "learning_rate": 1.9985137988350795e-06, "loss": 0.2105, "num_input_tokens_seen": 15931328, "step": 23665 }, { "epoch": 0.5782620379644785, "grad_norm": 2.767606258392334, "learning_rate": 1.998509147619359e-06, "loss": 0.0554, "num_input_tokens_seen": 15934592, "step": 23670 }, { "epoch": 0.5783841887963257, "grad_norm": 34.56588363647461, "learning_rate": 1.998504489142211e-06, "loss": 0.1021, "num_input_tokens_seen": 15937984, "step": 23675 }, { "epoch": 0.5785063396281729, "grad_norm": 11.574126243591309, "learning_rate": 1.9984998234036704e-06, "loss": 0.08, "num_input_tokens_seen": 15941568, "step": 23680 }, { "epoch": 0.5786284904600201, "grad_norm": 64.17758178710938, "learning_rate": 1.9984951504037704e-06, "loss": 0.1458, "num_input_tokens_seen": 15945280, "step": 23685 }, { "epoch": 0.5787506412918672, "grad_norm": 10.47195053100586, "learning_rate": 1.998490470142545e-06, "loss": 0.0967, "num_input_tokens_seen": 15948288, "step": 23690 }, { "epoch": 0.5788727921237143, "grad_norm": 33.58171844482422, "learning_rate": 1.9984857826200284e-06, "loss": 0.1066, "num_input_tokens_seen": 15952064, "step": 23695 }, { "epoch": 0.5789949429555615, "grad_norm": 25.143238067626953, "learning_rate": 1.998481087836254e-06, "loss": 0.2611, "num_input_tokens_seen": 15956032, "step": 23700 }, { "epoch": 0.5791170937874087, "grad_norm": 14.816621780395508, "learning_rate": 1.9984763857912573e-06, "loss": 0.1465, "num_input_tokens_seen": 15959360, "step": 23705 }, { "epoch": 0.5792392446192559, "grad_norm": 9.596741676330566, "learning_rate": 1.998471676485072e-06, "loss": 0.0293, "num_input_tokens_seen": 15962752, "step": 23710 }, { "epoch": 0.579361395451103, "grad_norm": 24.233652114868164, "learning_rate": 1.9984669599177315e-06, "loss": 0.1096, "num_input_tokens_seen": 15965888, "step": 23715 }, { "epoch": 0.5794835462829502, "grad_norm": 26.97178077697754, "learning_rate": 1.9984622360892707e-06, "loss": 0.2157, "num_input_tokens_seen": 15969216, "step": 23720 }, { "epoch": 0.5796056971147974, "grad_norm": 27.27730941772461, "learning_rate": 1.998457504999724e-06, "loss": 0.1427, "num_input_tokens_seen": 15972672, "step": 23725 }, { "epoch": 0.5797278479466446, "grad_norm": 24.768224716186523, "learning_rate": 1.9984527666491262e-06, "loss": 0.0804, "num_input_tokens_seen": 15976320, "step": 23730 }, { "epoch": 0.5798499987784916, "grad_norm": 32.22639465332031, "learning_rate": 1.998448021037511e-06, "loss": 0.1042, "num_input_tokens_seen": 15979392, "step": 23735 }, { "epoch": 0.5799721496103388, "grad_norm": 33.8211669921875, "learning_rate": 1.998443268164913e-06, "loss": 0.141, "num_input_tokens_seen": 15982784, "step": 23740 }, { "epoch": 0.580094300442186, "grad_norm": 2.913667678833008, "learning_rate": 1.998438508031368e-06, "loss": 0.0586, "num_input_tokens_seen": 15986624, "step": 23745 }, { "epoch": 0.5802164512740332, "grad_norm": 3.4265143871307373, "learning_rate": 1.9984337406369084e-06, "loss": 0.0323, "num_input_tokens_seen": 15989696, "step": 23750 }, { "epoch": 0.5803386021058803, "grad_norm": 29.47190284729004, "learning_rate": 1.9984289659815707e-06, "loss": 0.061, "num_input_tokens_seen": 15993280, "step": 23755 }, { "epoch": 0.5804607529377275, "grad_norm": 15.11453628540039, "learning_rate": 1.998424184065389e-06, "loss": 0.0845, "num_input_tokens_seen": 15996096, "step": 23760 }, { "epoch": 0.5805829037695747, "grad_norm": 23.776002883911133, "learning_rate": 1.998419394888398e-06, "loss": 0.1935, "num_input_tokens_seen": 15999680, "step": 23765 }, { "epoch": 0.5807050546014219, "grad_norm": 0.5569315552711487, "learning_rate": 1.998414598450633e-06, "loss": 0.0199, "num_input_tokens_seen": 16003072, "step": 23770 }, { "epoch": 0.580827205433269, "grad_norm": 1.0250940322875977, "learning_rate": 1.998409794752128e-06, "loss": 0.0787, "num_input_tokens_seen": 16006400, "step": 23775 }, { "epoch": 0.5809493562651161, "grad_norm": 2.4078798294067383, "learning_rate": 1.9984049837929183e-06, "loss": 0.0284, "num_input_tokens_seen": 16009600, "step": 23780 }, { "epoch": 0.5810715070969633, "grad_norm": 9.696256637573242, "learning_rate": 1.9984001655730397e-06, "loss": 0.1318, "num_input_tokens_seen": 16013248, "step": 23785 }, { "epoch": 0.5811936579288105, "grad_norm": 44.90831756591797, "learning_rate": 1.998395340092526e-06, "loss": 0.126, "num_input_tokens_seen": 16016320, "step": 23790 }, { "epoch": 0.5813158087606577, "grad_norm": 20.846860885620117, "learning_rate": 1.998390507351413e-06, "loss": 0.0767, "num_input_tokens_seen": 16019968, "step": 23795 }, { "epoch": 0.5814379595925048, "grad_norm": 38.1915397644043, "learning_rate": 1.9983856673497357e-06, "loss": 0.0845, "num_input_tokens_seen": 16023232, "step": 23800 }, { "epoch": 0.581560110424352, "grad_norm": 32.8565559387207, "learning_rate": 1.9983808200875295e-06, "loss": 0.0591, "num_input_tokens_seen": 16026304, "step": 23805 }, { "epoch": 0.5816822612561992, "grad_norm": 0.4736187756061554, "learning_rate": 1.9983759655648293e-06, "loss": 0.1458, "num_input_tokens_seen": 16029824, "step": 23810 }, { "epoch": 0.5818044120880463, "grad_norm": 0.13305804133415222, "learning_rate": 1.9983711037816705e-06, "loss": 0.0809, "num_input_tokens_seen": 16034176, "step": 23815 }, { "epoch": 0.5819265629198935, "grad_norm": 45.56284713745117, "learning_rate": 1.9983662347380883e-06, "loss": 0.095, "num_input_tokens_seen": 16037824, "step": 23820 }, { "epoch": 0.5820487137517406, "grad_norm": 8.108329772949219, "learning_rate": 1.9983613584341184e-06, "loss": 0.1983, "num_input_tokens_seen": 16040960, "step": 23825 }, { "epoch": 0.5821708645835878, "grad_norm": 8.468049049377441, "learning_rate": 1.998356474869796e-06, "loss": 0.0863, "num_input_tokens_seen": 16044608, "step": 23830 }, { "epoch": 0.582293015415435, "grad_norm": 1.143520474433899, "learning_rate": 1.9983515840451574e-06, "loss": 0.1203, "num_input_tokens_seen": 16048192, "step": 23835 }, { "epoch": 0.5824151662472822, "grad_norm": 23.47208595275879, "learning_rate": 1.998346685960237e-06, "loss": 0.121, "num_input_tokens_seen": 16051456, "step": 23840 }, { "epoch": 0.5825373170791293, "grad_norm": 7.825652122497559, "learning_rate": 1.9983417806150716e-06, "loss": 0.0554, "num_input_tokens_seen": 16054976, "step": 23845 }, { "epoch": 0.5826594679109764, "grad_norm": 16.5784854888916, "learning_rate": 1.998336868009696e-06, "loss": 0.0686, "num_input_tokens_seen": 16058240, "step": 23850 }, { "epoch": 0.5827816187428236, "grad_norm": 40.77692413330078, "learning_rate": 1.998331948144146e-06, "loss": 0.1335, "num_input_tokens_seen": 16061312, "step": 23855 }, { "epoch": 0.5829037695746708, "grad_norm": 36.636295318603516, "learning_rate": 1.9983270210184573e-06, "loss": 0.0715, "num_input_tokens_seen": 16064768, "step": 23860 }, { "epoch": 0.583025920406518, "grad_norm": 0.427692711353302, "learning_rate": 1.998322086632666e-06, "loss": 0.082, "num_input_tokens_seen": 16068416, "step": 23865 }, { "epoch": 0.5831480712383651, "grad_norm": 71.27447509765625, "learning_rate": 1.9983171449868086e-06, "loss": 0.2176, "num_input_tokens_seen": 16071488, "step": 23870 }, { "epoch": 0.5832702220702123, "grad_norm": 0.7104368805885315, "learning_rate": 1.9983121960809198e-06, "loss": 0.1019, "num_input_tokens_seen": 16075200, "step": 23875 }, { "epoch": 0.5833923729020595, "grad_norm": 2.0664944648742676, "learning_rate": 1.9983072399150367e-06, "loss": 0.0461, "num_input_tokens_seen": 16078656, "step": 23880 }, { "epoch": 0.5835145237339067, "grad_norm": 17.349943161010742, "learning_rate": 1.9983022764891943e-06, "loss": 0.119, "num_input_tokens_seen": 16082112, "step": 23885 }, { "epoch": 0.5836366745657537, "grad_norm": 9.098965644836426, "learning_rate": 1.9982973058034297e-06, "loss": 0.1487, "num_input_tokens_seen": 16085376, "step": 23890 }, { "epoch": 0.5837588253976009, "grad_norm": 13.251080513000488, "learning_rate": 1.998292327857778e-06, "loss": 0.1018, "num_input_tokens_seen": 16088576, "step": 23895 }, { "epoch": 0.5838809762294481, "grad_norm": 23.803606033325195, "learning_rate": 1.998287342652277e-06, "loss": 0.0956, "num_input_tokens_seen": 16091904, "step": 23900 }, { "epoch": 0.5840031270612953, "grad_norm": 9.364514350891113, "learning_rate": 1.998282350186961e-06, "loss": 0.1887, "num_input_tokens_seen": 16094912, "step": 23905 }, { "epoch": 0.5841252778931425, "grad_norm": 2.0720276832580566, "learning_rate": 1.998277350461868e-06, "loss": 0.0914, "num_input_tokens_seen": 16097856, "step": 23910 }, { "epoch": 0.5842474287249896, "grad_norm": 19.379125595092773, "learning_rate": 1.998272343477033e-06, "loss": 0.1373, "num_input_tokens_seen": 16101632, "step": 23915 }, { "epoch": 0.5843695795568368, "grad_norm": 9.671746253967285, "learning_rate": 1.998267329232493e-06, "loss": 0.193, "num_input_tokens_seen": 16104704, "step": 23920 }, { "epoch": 0.584491730388684, "grad_norm": 4.722978591918945, "learning_rate": 1.9982623077282846e-06, "loss": 0.1243, "num_input_tokens_seen": 16107968, "step": 23925 }, { "epoch": 0.5846138812205312, "grad_norm": 6.806799411773682, "learning_rate": 1.9982572789644442e-06, "loss": 0.1061, "num_input_tokens_seen": 16111488, "step": 23930 }, { "epoch": 0.5847360320523782, "grad_norm": 0.32206377387046814, "learning_rate": 1.9982522429410085e-06, "loss": 0.0316, "num_input_tokens_seen": 16115136, "step": 23935 }, { "epoch": 0.5848581828842254, "grad_norm": 2.084754228591919, "learning_rate": 1.998247199658014e-06, "loss": 0.0877, "num_input_tokens_seen": 16118848, "step": 23940 }, { "epoch": 0.5849803337160726, "grad_norm": 10.631635665893555, "learning_rate": 1.9982421491154973e-06, "loss": 0.0641, "num_input_tokens_seen": 16122432, "step": 23945 }, { "epoch": 0.5851024845479198, "grad_norm": 43.512943267822266, "learning_rate": 1.998237091313495e-06, "loss": 0.1244, "num_input_tokens_seen": 16125888, "step": 23950 }, { "epoch": 0.585224635379767, "grad_norm": 0.6749215126037598, "learning_rate": 1.9982320262520445e-06, "loss": 0.1109, "num_input_tokens_seen": 16129408, "step": 23955 }, { "epoch": 0.5853467862116141, "grad_norm": 36.5847053527832, "learning_rate": 1.998226953931182e-06, "loss": 0.0638, "num_input_tokens_seen": 16133376, "step": 23960 }, { "epoch": 0.5854689370434613, "grad_norm": 0.6342559456825256, "learning_rate": 1.9982218743509445e-06, "loss": 0.0532, "num_input_tokens_seen": 16136640, "step": 23965 }, { "epoch": 0.5855910878753084, "grad_norm": 41.035850524902344, "learning_rate": 1.9982167875113692e-06, "loss": 0.2682, "num_input_tokens_seen": 16140096, "step": 23970 }, { "epoch": 0.5857132387071556, "grad_norm": 5.5960235595703125, "learning_rate": 1.9982116934124925e-06, "loss": 0.0938, "num_input_tokens_seen": 16143488, "step": 23975 }, { "epoch": 0.5858353895390027, "grad_norm": 27.44890022277832, "learning_rate": 1.9982065920543524e-06, "loss": 0.2205, "num_input_tokens_seen": 16147264, "step": 23980 }, { "epoch": 0.5859575403708499, "grad_norm": 6.895400047302246, "learning_rate": 1.9982014834369853e-06, "loss": 0.0564, "num_input_tokens_seen": 16150592, "step": 23985 }, { "epoch": 0.5860796912026971, "grad_norm": 35.16048812866211, "learning_rate": 1.9981963675604286e-06, "loss": 0.1804, "num_input_tokens_seen": 16153664, "step": 23990 }, { "epoch": 0.5862018420345443, "grad_norm": 15.177404403686523, "learning_rate": 1.9981912444247195e-06, "loss": 0.0559, "num_input_tokens_seen": 16156800, "step": 23995 }, { "epoch": 0.5863239928663915, "grad_norm": 39.688987731933594, "learning_rate": 1.9981861140298948e-06, "loss": 0.1858, "num_input_tokens_seen": 16160064, "step": 24000 }, { "epoch": 0.5864461436982386, "grad_norm": 11.082685470581055, "learning_rate": 1.9981809763759926e-06, "loss": 0.1304, "num_input_tokens_seen": 16163648, "step": 24005 }, { "epoch": 0.5865682945300857, "grad_norm": 17.348304748535156, "learning_rate": 1.9981758314630495e-06, "loss": 0.2359, "num_input_tokens_seen": 16166720, "step": 24010 }, { "epoch": 0.5866904453619329, "grad_norm": 19.414400100708008, "learning_rate": 1.998170679291104e-06, "loss": 0.116, "num_input_tokens_seen": 16170112, "step": 24015 }, { "epoch": 0.5868125961937801, "grad_norm": 37.96230697631836, "learning_rate": 1.9981655198601918e-06, "loss": 0.1495, "num_input_tokens_seen": 16173696, "step": 24020 }, { "epoch": 0.5869347470256272, "grad_norm": 9.247244834899902, "learning_rate": 1.9981603531703526e-06, "loss": 0.0887, "num_input_tokens_seen": 16177024, "step": 24025 }, { "epoch": 0.5870568978574744, "grad_norm": 15.933195114135742, "learning_rate": 1.998155179221622e-06, "loss": 0.0788, "num_input_tokens_seen": 16180608, "step": 24030 }, { "epoch": 0.5871790486893216, "grad_norm": 19.613866806030273, "learning_rate": 1.9981499980140386e-06, "loss": 0.1616, "num_input_tokens_seen": 16184128, "step": 24035 }, { "epoch": 0.5873011995211688, "grad_norm": 22.159255981445312, "learning_rate": 1.99814480954764e-06, "loss": 0.1168, "num_input_tokens_seen": 16187392, "step": 24040 }, { "epoch": 0.5874233503530158, "grad_norm": 17.167551040649414, "learning_rate": 1.998139613822464e-06, "loss": 0.1582, "num_input_tokens_seen": 16190912, "step": 24045 }, { "epoch": 0.587545501184863, "grad_norm": 15.00357437133789, "learning_rate": 1.998134410838548e-06, "loss": 0.0451, "num_input_tokens_seen": 16194176, "step": 24050 }, { "epoch": 0.5876676520167102, "grad_norm": 10.094125747680664, "learning_rate": 1.9981292005959305e-06, "loss": 0.1327, "num_input_tokens_seen": 16197952, "step": 24055 }, { "epoch": 0.5877898028485574, "grad_norm": 0.6391386985778809, "learning_rate": 1.998123983094649e-06, "loss": 0.0967, "num_input_tokens_seen": 16201600, "step": 24060 }, { "epoch": 0.5879119536804046, "grad_norm": 26.472667694091797, "learning_rate": 1.998118758334741e-06, "loss": 0.0567, "num_input_tokens_seen": 16205440, "step": 24065 }, { "epoch": 0.5880341045122517, "grad_norm": 16.453083038330078, "learning_rate": 1.998113526316245e-06, "loss": 0.0901, "num_input_tokens_seen": 16208640, "step": 24070 }, { "epoch": 0.5881562553440989, "grad_norm": 21.841779708862305, "learning_rate": 1.998108287039199e-06, "loss": 0.1703, "num_input_tokens_seen": 16211968, "step": 24075 }, { "epoch": 0.5882784061759461, "grad_norm": 17.799901962280273, "learning_rate": 1.998103040503641e-06, "loss": 0.0712, "num_input_tokens_seen": 16215296, "step": 24080 }, { "epoch": 0.5884005570077933, "grad_norm": 1.8213036060333252, "learning_rate": 1.9980977867096097e-06, "loss": 0.0626, "num_input_tokens_seen": 16218560, "step": 24085 }, { "epoch": 0.5885227078396403, "grad_norm": 9.8761625289917, "learning_rate": 1.9980925256571424e-06, "loss": 0.1004, "num_input_tokens_seen": 16221504, "step": 24090 }, { "epoch": 0.5886448586714875, "grad_norm": 58.34889602661133, "learning_rate": 1.9980872573462783e-06, "loss": 0.1884, "num_input_tokens_seen": 16225088, "step": 24095 }, { "epoch": 0.5887670095033347, "grad_norm": 23.790300369262695, "learning_rate": 1.9980819817770546e-06, "loss": 0.0542, "num_input_tokens_seen": 16228928, "step": 24100 }, { "epoch": 0.5888891603351819, "grad_norm": 22.176443099975586, "learning_rate": 1.9980766989495107e-06, "loss": 0.0811, "num_input_tokens_seen": 16232064, "step": 24105 }, { "epoch": 0.5890113111670291, "grad_norm": 12.136371612548828, "learning_rate": 1.9980714088636844e-06, "loss": 0.1497, "num_input_tokens_seen": 16235136, "step": 24110 }, { "epoch": 0.5891334619988762, "grad_norm": 16.363262176513672, "learning_rate": 1.9980661115196145e-06, "loss": 0.0565, "num_input_tokens_seen": 16239168, "step": 24115 }, { "epoch": 0.5892556128307234, "grad_norm": 1.3040426969528198, "learning_rate": 1.998060806917339e-06, "loss": 0.14, "num_input_tokens_seen": 16242368, "step": 24120 }, { "epoch": 0.5893777636625706, "grad_norm": 0.18406778573989868, "learning_rate": 1.9980554950568973e-06, "loss": 0.0388, "num_input_tokens_seen": 16245888, "step": 24125 }, { "epoch": 0.5894999144944177, "grad_norm": 2.5366475582122803, "learning_rate": 1.9980501759383276e-06, "loss": 0.1424, "num_input_tokens_seen": 16249152, "step": 24130 }, { "epoch": 0.5896220653262648, "grad_norm": 4.652827739715576, "learning_rate": 1.9980448495616685e-06, "loss": 0.0769, "num_input_tokens_seen": 16252416, "step": 24135 }, { "epoch": 0.589744216158112, "grad_norm": 24.142471313476562, "learning_rate": 1.9980395159269586e-06, "loss": 0.206, "num_input_tokens_seen": 16255872, "step": 24140 }, { "epoch": 0.5898663669899592, "grad_norm": 0.5410691499710083, "learning_rate": 1.9980341750342372e-06, "loss": 0.121, "num_input_tokens_seen": 16259968, "step": 24145 }, { "epoch": 0.5899885178218064, "grad_norm": 14.989289283752441, "learning_rate": 1.9980288268835425e-06, "loss": 0.2076, "num_input_tokens_seen": 16263488, "step": 24150 }, { "epoch": 0.5901106686536536, "grad_norm": 10.395974159240723, "learning_rate": 1.998023471474914e-06, "loss": 0.0236, "num_input_tokens_seen": 16266368, "step": 24155 }, { "epoch": 0.5902328194855007, "grad_norm": 26.658100128173828, "learning_rate": 1.9980181088083903e-06, "loss": 0.1343, "num_input_tokens_seen": 16269760, "step": 24160 }, { "epoch": 0.5903549703173478, "grad_norm": 1.5090116262435913, "learning_rate": 1.9980127388840106e-06, "loss": 0.1414, "num_input_tokens_seen": 16273216, "step": 24165 }, { "epoch": 0.590477121149195, "grad_norm": 12.554591178894043, "learning_rate": 1.9980073617018135e-06, "loss": 0.0678, "num_input_tokens_seen": 16276288, "step": 24170 }, { "epoch": 0.5905992719810422, "grad_norm": 12.531062126159668, "learning_rate": 1.9980019772618387e-06, "loss": 0.0568, "num_input_tokens_seen": 16279872, "step": 24175 }, { "epoch": 0.5907214228128893, "grad_norm": 25.284109115600586, "learning_rate": 1.997996585564125e-06, "loss": 0.1065, "num_input_tokens_seen": 16283136, "step": 24180 }, { "epoch": 0.5908435736447365, "grad_norm": 9.407654762268066, "learning_rate": 1.997991186608712e-06, "loss": 0.1299, "num_input_tokens_seen": 16286720, "step": 24185 }, { "epoch": 0.5909657244765837, "grad_norm": 1.669158697128296, "learning_rate": 1.9979857803956383e-06, "loss": 0.0945, "num_input_tokens_seen": 16289792, "step": 24190 }, { "epoch": 0.5910878753084309, "grad_norm": 16.71095848083496, "learning_rate": 1.9979803669249434e-06, "loss": 0.2258, "num_input_tokens_seen": 16293184, "step": 24195 }, { "epoch": 0.5912100261402781, "grad_norm": 21.24453353881836, "learning_rate": 1.9979749461966672e-06, "loss": 0.1589, "num_input_tokens_seen": 16296448, "step": 24200 }, { "epoch": 0.5913321769721251, "grad_norm": 23.289432525634766, "learning_rate": 1.997969518210849e-06, "loss": 0.2786, "num_input_tokens_seen": 16299904, "step": 24205 }, { "epoch": 0.5914543278039723, "grad_norm": 18.00309181213379, "learning_rate": 1.9979640829675273e-06, "loss": 0.0697, "num_input_tokens_seen": 16303424, "step": 24210 }, { "epoch": 0.5915764786358195, "grad_norm": 17.946182250976562, "learning_rate": 1.997958640466743e-06, "loss": 0.0782, "num_input_tokens_seen": 16306560, "step": 24215 }, { "epoch": 0.5916986294676667, "grad_norm": 7.101922512054443, "learning_rate": 1.997953190708535e-06, "loss": 0.2103, "num_input_tokens_seen": 16309632, "step": 24220 }, { "epoch": 0.5918207802995138, "grad_norm": 4.2682013511657715, "learning_rate": 1.9979477336929426e-06, "loss": 0.0855, "num_input_tokens_seen": 16312704, "step": 24225 }, { "epoch": 0.591942931131361, "grad_norm": 23.60270118713379, "learning_rate": 1.9979422694200062e-06, "loss": 0.0863, "num_input_tokens_seen": 16316160, "step": 24230 }, { "epoch": 0.5920650819632082, "grad_norm": 0.4715448319911957, "learning_rate": 1.997936797889765e-06, "loss": 0.0656, "num_input_tokens_seen": 16319872, "step": 24235 }, { "epoch": 0.5921872327950554, "grad_norm": 6.69528341293335, "learning_rate": 1.997931319102259e-06, "loss": 0.0183, "num_input_tokens_seen": 16322752, "step": 24240 }, { "epoch": 0.5923093836269026, "grad_norm": 8.694469451904297, "learning_rate": 1.9979258330575283e-06, "loss": 0.1014, "num_input_tokens_seen": 16326208, "step": 24245 }, { "epoch": 0.5924315344587496, "grad_norm": 17.657978057861328, "learning_rate": 1.9979203397556124e-06, "loss": 0.0631, "num_input_tokens_seen": 16329664, "step": 24250 }, { "epoch": 0.5925536852905968, "grad_norm": 16.77488899230957, "learning_rate": 1.997914839196551e-06, "loss": 0.1876, "num_input_tokens_seen": 16332800, "step": 24255 }, { "epoch": 0.592675836122444, "grad_norm": 1.8182240724563599, "learning_rate": 1.997909331380385e-06, "loss": 0.1368, "num_input_tokens_seen": 16336512, "step": 24260 }, { "epoch": 0.5927979869542912, "grad_norm": 26.796363830566406, "learning_rate": 1.997903816307154e-06, "loss": 0.0687, "num_input_tokens_seen": 16340160, "step": 24265 }, { "epoch": 0.5929201377861383, "grad_norm": 26.58648109436035, "learning_rate": 1.9978982939768975e-06, "loss": 0.2328, "num_input_tokens_seen": 16343680, "step": 24270 }, { "epoch": 0.5930422886179855, "grad_norm": 6.620156288146973, "learning_rate": 1.9978927643896567e-06, "loss": 0.0459, "num_input_tokens_seen": 16347200, "step": 24275 }, { "epoch": 0.5931644394498327, "grad_norm": 15.268500328063965, "learning_rate": 1.9978872275454713e-06, "loss": 0.1105, "num_input_tokens_seen": 16350272, "step": 24280 }, { "epoch": 0.5932865902816798, "grad_norm": 2.916973352432251, "learning_rate": 1.997881683444381e-06, "loss": 0.1195, "num_input_tokens_seen": 16353728, "step": 24285 }, { "epoch": 0.593408741113527, "grad_norm": 24.336292266845703, "learning_rate": 1.997876132086427e-06, "loss": 0.1208, "num_input_tokens_seen": 16357888, "step": 24290 }, { "epoch": 0.5935308919453741, "grad_norm": 7.969274997711182, "learning_rate": 1.99787057347165e-06, "loss": 0.1052, "num_input_tokens_seen": 16361280, "step": 24295 }, { "epoch": 0.5936530427772213, "grad_norm": 18.69225311279297, "learning_rate": 1.9978650076000887e-06, "loss": 0.1275, "num_input_tokens_seen": 16365312, "step": 24300 }, { "epoch": 0.5937751936090685, "grad_norm": 18.079143524169922, "learning_rate": 1.9978594344717855e-06, "loss": 0.0785, "num_input_tokens_seen": 16368832, "step": 24305 }, { "epoch": 0.5938973444409157, "grad_norm": 38.38662338256836, "learning_rate": 1.99785385408678e-06, "loss": 0.0691, "num_input_tokens_seen": 16372480, "step": 24310 }, { "epoch": 0.5940194952727628, "grad_norm": 0.4318760633468628, "learning_rate": 1.9978482664451126e-06, "loss": 0.1628, "num_input_tokens_seen": 16375424, "step": 24315 }, { "epoch": 0.59414164610461, "grad_norm": 3.622248411178589, "learning_rate": 1.997842671546824e-06, "loss": 0.1075, "num_input_tokens_seen": 16378624, "step": 24320 }, { "epoch": 0.5942637969364571, "grad_norm": 0.4291881024837494, "learning_rate": 1.997837069391956e-06, "loss": 0.0945, "num_input_tokens_seen": 16382592, "step": 24325 }, { "epoch": 0.5943859477683043, "grad_norm": 20.735027313232422, "learning_rate": 1.997831459980548e-06, "loss": 0.1419, "num_input_tokens_seen": 16386176, "step": 24330 }, { "epoch": 0.5945080986001514, "grad_norm": 23.61785125732422, "learning_rate": 1.997825843312641e-06, "loss": 0.1901, "num_input_tokens_seen": 16389376, "step": 24335 }, { "epoch": 0.5946302494319986, "grad_norm": 9.151907920837402, "learning_rate": 1.997820219388276e-06, "loss": 0.2088, "num_input_tokens_seen": 16392832, "step": 24340 }, { "epoch": 0.5947524002638458, "grad_norm": 11.61373233795166, "learning_rate": 1.997814588207494e-06, "loss": 0.0537, "num_input_tokens_seen": 16396544, "step": 24345 }, { "epoch": 0.594874551095693, "grad_norm": 10.756101608276367, "learning_rate": 1.9978089497703366e-06, "loss": 0.1235, "num_input_tokens_seen": 16399680, "step": 24350 }, { "epoch": 0.5949967019275402, "grad_norm": 19.338497161865234, "learning_rate": 1.9978033040768435e-06, "loss": 0.1369, "num_input_tokens_seen": 16403264, "step": 24355 }, { "epoch": 0.5951188527593873, "grad_norm": 8.648137092590332, "learning_rate": 1.9977976511270564e-06, "loss": 0.1138, "num_input_tokens_seen": 16406464, "step": 24360 }, { "epoch": 0.5952410035912344, "grad_norm": 22.793720245361328, "learning_rate": 1.9977919909210167e-06, "loss": 0.1247, "num_input_tokens_seen": 16409856, "step": 24365 }, { "epoch": 0.5953631544230816, "grad_norm": 12.735052108764648, "learning_rate": 1.997786323458765e-06, "loss": 0.0726, "num_input_tokens_seen": 16413120, "step": 24370 }, { "epoch": 0.5954853052549288, "grad_norm": 11.214115142822266, "learning_rate": 1.997780648740343e-06, "loss": 0.1003, "num_input_tokens_seen": 16416512, "step": 24375 }, { "epoch": 0.5956074560867759, "grad_norm": 14.181450843811035, "learning_rate": 1.997774966765792e-06, "loss": 0.1277, "num_input_tokens_seen": 16420736, "step": 24380 }, { "epoch": 0.5957296069186231, "grad_norm": 12.658021926879883, "learning_rate": 1.9977692775351525e-06, "loss": 0.0907, "num_input_tokens_seen": 16424384, "step": 24385 }, { "epoch": 0.5958517577504703, "grad_norm": 1.3953849077224731, "learning_rate": 1.997763581048467e-06, "loss": 0.0685, "num_input_tokens_seen": 16427520, "step": 24390 }, { "epoch": 0.5959739085823175, "grad_norm": 8.562386512756348, "learning_rate": 1.997757877305776e-06, "loss": 0.0535, "num_input_tokens_seen": 16430912, "step": 24395 }, { "epoch": 0.5960960594141647, "grad_norm": 31.926748275756836, "learning_rate": 1.997752166307121e-06, "loss": 0.1696, "num_input_tokens_seen": 16434048, "step": 24400 }, { "epoch": 0.5962182102460117, "grad_norm": 24.208181381225586, "learning_rate": 1.9977464480525447e-06, "loss": 0.1728, "num_input_tokens_seen": 16437120, "step": 24405 }, { "epoch": 0.5963403610778589, "grad_norm": 0.18723651766777039, "learning_rate": 1.997740722542087e-06, "loss": 0.0879, "num_input_tokens_seen": 16440256, "step": 24410 }, { "epoch": 0.5964625119097061, "grad_norm": 20.13685417175293, "learning_rate": 1.9977349897757913e-06, "loss": 0.1947, "num_input_tokens_seen": 16443520, "step": 24415 }, { "epoch": 0.5965846627415533, "grad_norm": 23.72542953491211, "learning_rate": 1.9977292497536976e-06, "loss": 0.0704, "num_input_tokens_seen": 16446720, "step": 24420 }, { "epoch": 0.5967068135734004, "grad_norm": 3.0584630966186523, "learning_rate": 1.997723502475849e-06, "loss": 0.1163, "num_input_tokens_seen": 16450112, "step": 24425 }, { "epoch": 0.5968289644052476, "grad_norm": 0.9413558840751648, "learning_rate": 1.9977177479422865e-06, "loss": 0.0953, "num_input_tokens_seen": 16453760, "step": 24430 }, { "epoch": 0.5969511152370948, "grad_norm": 18.52378273010254, "learning_rate": 1.997711986153052e-06, "loss": 0.1416, "num_input_tokens_seen": 16457280, "step": 24435 }, { "epoch": 0.597073266068942, "grad_norm": 7.341268539428711, "learning_rate": 1.997706217108188e-06, "loss": 0.073, "num_input_tokens_seen": 16460544, "step": 24440 }, { "epoch": 0.5971954169007891, "grad_norm": 44.4236946105957, "learning_rate": 1.997700440807736e-06, "loss": 0.1074, "num_input_tokens_seen": 16463552, "step": 24445 }, { "epoch": 0.5973175677326362, "grad_norm": 16.1765193939209, "learning_rate": 1.9976946572517377e-06, "loss": 0.2375, "num_input_tokens_seen": 16467008, "step": 24450 }, { "epoch": 0.5974397185644834, "grad_norm": 0.35032007098197937, "learning_rate": 1.997688866440236e-06, "loss": 0.0157, "num_input_tokens_seen": 16470784, "step": 24455 }, { "epoch": 0.5975618693963306, "grad_norm": 36.04292297363281, "learning_rate": 1.997683068373272e-06, "loss": 0.0783, "num_input_tokens_seen": 16473856, "step": 24460 }, { "epoch": 0.5976840202281778, "grad_norm": 11.974666595458984, "learning_rate": 1.997677263050889e-06, "loss": 0.0781, "num_input_tokens_seen": 16476992, "step": 24465 }, { "epoch": 0.5978061710600249, "grad_norm": 12.146720886230469, "learning_rate": 1.997671450473128e-06, "loss": 0.1661, "num_input_tokens_seen": 16480064, "step": 24470 }, { "epoch": 0.5979283218918721, "grad_norm": 32.76498794555664, "learning_rate": 1.997665630640032e-06, "loss": 0.1997, "num_input_tokens_seen": 16483200, "step": 24475 }, { "epoch": 0.5980504727237193, "grad_norm": 0.38879939913749695, "learning_rate": 1.9976598035516433e-06, "loss": 0.1039, "num_input_tokens_seen": 16486208, "step": 24480 }, { "epoch": 0.5981726235555664, "grad_norm": 11.602495193481445, "learning_rate": 1.997653969208004e-06, "loss": 0.1477, "num_input_tokens_seen": 16489664, "step": 24485 }, { "epoch": 0.5982947743874136, "grad_norm": 10.761492729187012, "learning_rate": 1.9976481276091572e-06, "loss": 0.1914, "num_input_tokens_seen": 16493184, "step": 24490 }, { "epoch": 0.5984169252192607, "grad_norm": 15.484816551208496, "learning_rate": 1.9976422787551443e-06, "loss": 0.0836, "num_input_tokens_seen": 16496448, "step": 24495 }, { "epoch": 0.5985390760511079, "grad_norm": 15.162385940551758, "learning_rate": 1.9976364226460087e-06, "loss": 0.1352, "num_input_tokens_seen": 16499648, "step": 24500 }, { "epoch": 0.5986612268829551, "grad_norm": 1.365733027458191, "learning_rate": 1.9976305592817928e-06, "loss": 0.152, "num_input_tokens_seen": 16502656, "step": 24505 }, { "epoch": 0.5987833777148023, "grad_norm": 12.582515716552734, "learning_rate": 1.997624688662539e-06, "loss": 0.1363, "num_input_tokens_seen": 16505728, "step": 24510 }, { "epoch": 0.5989055285466494, "grad_norm": 10.660904884338379, "learning_rate": 1.99761881078829e-06, "loss": 0.1303, "num_input_tokens_seen": 16509440, "step": 24515 }, { "epoch": 0.5990276793784965, "grad_norm": 22.079450607299805, "learning_rate": 1.9976129256590885e-06, "loss": 0.1228, "num_input_tokens_seen": 16512320, "step": 24520 }, { "epoch": 0.5991498302103437, "grad_norm": 4.311355113983154, "learning_rate": 1.997607033274978e-06, "loss": 0.0758, "num_input_tokens_seen": 16515456, "step": 24525 }, { "epoch": 0.5992719810421909, "grad_norm": 2.3310272693634033, "learning_rate": 1.9976011336360005e-06, "loss": 0.0758, "num_input_tokens_seen": 16518656, "step": 24530 }, { "epoch": 0.5993941318740381, "grad_norm": 18.549488067626953, "learning_rate": 1.9975952267421995e-06, "loss": 0.0999, "num_input_tokens_seen": 16521984, "step": 24535 }, { "epoch": 0.5995162827058852, "grad_norm": 19.15254020690918, "learning_rate": 1.9975893125936176e-06, "loss": 0.1037, "num_input_tokens_seen": 16525376, "step": 24540 }, { "epoch": 0.5996384335377324, "grad_norm": 3.7639474868774414, "learning_rate": 1.9975833911902975e-06, "loss": 0.2703, "num_input_tokens_seen": 16528512, "step": 24545 }, { "epoch": 0.5997605843695796, "grad_norm": 1.803524136543274, "learning_rate": 1.997577462532283e-06, "loss": 0.058, "num_input_tokens_seen": 16531840, "step": 24550 }, { "epoch": 0.5998827352014268, "grad_norm": 18.983144760131836, "learning_rate": 1.997571526619617e-06, "loss": 0.1103, "num_input_tokens_seen": 16535680, "step": 24555 }, { "epoch": 0.6000048860332738, "grad_norm": 13.398836135864258, "learning_rate": 1.9975655834523426e-06, "loss": 0.0944, "num_input_tokens_seen": 16539072, "step": 24560 }, { "epoch": 0.600127036865121, "grad_norm": 18.70598602294922, "learning_rate": 1.9975596330305027e-06, "loss": 0.1462, "num_input_tokens_seen": 16542464, "step": 24565 }, { "epoch": 0.6002491876969682, "grad_norm": 18.957441329956055, "learning_rate": 1.997553675354141e-06, "loss": 0.0323, "num_input_tokens_seen": 16545920, "step": 24570 }, { "epoch": 0.6003713385288154, "grad_norm": 14.466696739196777, "learning_rate": 1.9975477104233005e-06, "loss": 0.0757, "num_input_tokens_seen": 16549184, "step": 24575 }, { "epoch": 0.6004934893606625, "grad_norm": 16.620651245117188, "learning_rate": 1.9975417382380247e-06, "loss": 0.1313, "num_input_tokens_seen": 16552128, "step": 24580 }, { "epoch": 0.6006156401925097, "grad_norm": 26.417072296142578, "learning_rate": 1.997535758798357e-06, "loss": 0.1199, "num_input_tokens_seen": 16555712, "step": 24585 }, { "epoch": 0.6007377910243569, "grad_norm": 9.686941146850586, "learning_rate": 1.9975297721043413e-06, "loss": 0.1158, "num_input_tokens_seen": 16559104, "step": 24590 }, { "epoch": 0.6008599418562041, "grad_norm": 28.315837860107422, "learning_rate": 1.9975237781560205e-06, "loss": 0.0862, "num_input_tokens_seen": 16562880, "step": 24595 }, { "epoch": 0.6009820926880513, "grad_norm": 22.558536529541016, "learning_rate": 1.997517776953439e-06, "loss": 0.1336, "num_input_tokens_seen": 16566080, "step": 24600 }, { "epoch": 0.6011042435198983, "grad_norm": 24.751571655273438, "learning_rate": 1.9975117684966394e-06, "loss": 0.1872, "num_input_tokens_seen": 16569792, "step": 24605 }, { "epoch": 0.6012263943517455, "grad_norm": 0.8444401621818542, "learning_rate": 1.997505752785666e-06, "loss": 0.0663, "num_input_tokens_seen": 16572736, "step": 24610 }, { "epoch": 0.6013485451835927, "grad_norm": 2.5160603523254395, "learning_rate": 1.9974997298205624e-06, "loss": 0.1085, "num_input_tokens_seen": 16576192, "step": 24615 }, { "epoch": 0.6014706960154399, "grad_norm": 10.282331466674805, "learning_rate": 1.9974936996013727e-06, "loss": 0.2155, "num_input_tokens_seen": 16579328, "step": 24620 }, { "epoch": 0.601592846847287, "grad_norm": 2.0695960521698, "learning_rate": 1.9974876621281407e-06, "loss": 0.0528, "num_input_tokens_seen": 16582464, "step": 24625 }, { "epoch": 0.6017149976791342, "grad_norm": 42.36409378051758, "learning_rate": 1.9974816174009096e-06, "loss": 0.169, "num_input_tokens_seen": 16585856, "step": 24630 }, { "epoch": 0.6018371485109814, "grad_norm": 0.8332366943359375, "learning_rate": 1.9974755654197244e-06, "loss": 0.126, "num_input_tokens_seen": 16589568, "step": 24635 }, { "epoch": 0.6019592993428285, "grad_norm": 0.6155003309249878, "learning_rate": 1.9974695061846283e-06, "loss": 0.1085, "num_input_tokens_seen": 16593088, "step": 24640 }, { "epoch": 0.6020814501746757, "grad_norm": 1.1654919385910034, "learning_rate": 1.9974634396956656e-06, "loss": 0.1587, "num_input_tokens_seen": 16597376, "step": 24645 }, { "epoch": 0.6022036010065228, "grad_norm": 0.6619818210601807, "learning_rate": 1.9974573659528805e-06, "loss": 0.1954, "num_input_tokens_seen": 16600704, "step": 24650 }, { "epoch": 0.60232575183837, "grad_norm": 22.623855590820312, "learning_rate": 1.9974512849563174e-06, "loss": 0.117, "num_input_tokens_seen": 16604416, "step": 24655 }, { "epoch": 0.6024479026702172, "grad_norm": 2.9463284015655518, "learning_rate": 1.9974451967060204e-06, "loss": 0.1157, "num_input_tokens_seen": 16607680, "step": 24660 }, { "epoch": 0.6025700535020644, "grad_norm": 3.2619848251342773, "learning_rate": 1.997439101202033e-06, "loss": 0.0785, "num_input_tokens_seen": 16610752, "step": 24665 }, { "epoch": 0.6026922043339115, "grad_norm": 28.431598663330078, "learning_rate": 1.9974329984444007e-06, "loss": 0.1308, "num_input_tokens_seen": 16614336, "step": 24670 }, { "epoch": 0.6028143551657587, "grad_norm": 1.6486842632293701, "learning_rate": 1.997426888433167e-06, "loss": 0.0228, "num_input_tokens_seen": 16617856, "step": 24675 }, { "epoch": 0.6029365059976058, "grad_norm": 0.8875168561935425, "learning_rate": 1.9974207711683772e-06, "loss": 0.1213, "num_input_tokens_seen": 16621120, "step": 24680 }, { "epoch": 0.603058656829453, "grad_norm": 11.766212463378906, "learning_rate": 1.9974146466500746e-06, "loss": 0.1865, "num_input_tokens_seen": 16624320, "step": 24685 }, { "epoch": 0.6031808076613002, "grad_norm": 19.604454040527344, "learning_rate": 1.997408514878305e-06, "loss": 0.202, "num_input_tokens_seen": 16627456, "step": 24690 }, { "epoch": 0.6033029584931473, "grad_norm": 24.02836036682129, "learning_rate": 1.997402375853112e-06, "loss": 0.2102, "num_input_tokens_seen": 16630848, "step": 24695 }, { "epoch": 0.6034251093249945, "grad_norm": 1.0167436599731445, "learning_rate": 1.997396229574541e-06, "loss": 0.0908, "num_input_tokens_seen": 16634176, "step": 24700 }, { "epoch": 0.6035472601568417, "grad_norm": 23.059194564819336, "learning_rate": 1.9973900760426364e-06, "loss": 0.1419, "num_input_tokens_seen": 16637824, "step": 24705 }, { "epoch": 0.6036694109886889, "grad_norm": 4.2288498878479, "learning_rate": 1.9973839152574425e-06, "loss": 0.104, "num_input_tokens_seen": 16641344, "step": 24710 }, { "epoch": 0.603791561820536, "grad_norm": 0.7364501953125, "learning_rate": 1.9973777472190046e-06, "loss": 0.0662, "num_input_tokens_seen": 16644480, "step": 24715 }, { "epoch": 0.6039137126523831, "grad_norm": 12.857014656066895, "learning_rate": 1.9973715719273677e-06, "loss": 0.0551, "num_input_tokens_seen": 16647808, "step": 24720 }, { "epoch": 0.6040358634842303, "grad_norm": 5.512856960296631, "learning_rate": 1.9973653893825762e-06, "loss": 0.0722, "num_input_tokens_seen": 16651264, "step": 24725 }, { "epoch": 0.6041580143160775, "grad_norm": 41.143104553222656, "learning_rate": 1.9973591995846755e-06, "loss": 0.1888, "num_input_tokens_seen": 16654400, "step": 24730 }, { "epoch": 0.6042801651479247, "grad_norm": 5.552034854888916, "learning_rate": 1.9973530025337105e-06, "loss": 0.066, "num_input_tokens_seen": 16657856, "step": 24735 }, { "epoch": 0.6044023159797718, "grad_norm": 2.2578725814819336, "learning_rate": 1.997346798229726e-06, "loss": 0.0497, "num_input_tokens_seen": 16660864, "step": 24740 }, { "epoch": 0.604524466811619, "grad_norm": 10.679595947265625, "learning_rate": 1.9973405866727673e-06, "loss": 0.1068, "num_input_tokens_seen": 16664320, "step": 24745 }, { "epoch": 0.6046466176434662, "grad_norm": 8.024046897888184, "learning_rate": 1.99733436786288e-06, "loss": 0.1451, "num_input_tokens_seen": 16667648, "step": 24750 }, { "epoch": 0.6047687684753134, "grad_norm": 0.018951889127492905, "learning_rate": 1.997328141800109e-06, "loss": 0.0484, "num_input_tokens_seen": 16671424, "step": 24755 }, { "epoch": 0.6048909193071604, "grad_norm": 0.1066962331533432, "learning_rate": 1.997321908484499e-06, "loss": 0.2128, "num_input_tokens_seen": 16675264, "step": 24760 }, { "epoch": 0.6050130701390076, "grad_norm": 23.60710334777832, "learning_rate": 1.997315667916096e-06, "loss": 0.2069, "num_input_tokens_seen": 16678976, "step": 24765 }, { "epoch": 0.6051352209708548, "grad_norm": 12.42275333404541, "learning_rate": 1.997309420094945e-06, "loss": 0.0918, "num_input_tokens_seen": 16682624, "step": 24770 }, { "epoch": 0.605257371802702, "grad_norm": 3.2556633949279785, "learning_rate": 1.9973031650210922e-06, "loss": 0.1285, "num_input_tokens_seen": 16685952, "step": 24775 }, { "epoch": 0.6053795226345492, "grad_norm": 16.80837631225586, "learning_rate": 1.997296902694582e-06, "loss": 0.081, "num_input_tokens_seen": 16690112, "step": 24780 }, { "epoch": 0.6055016734663963, "grad_norm": 17.06452178955078, "learning_rate": 1.997290633115461e-06, "loss": 0.1219, "num_input_tokens_seen": 16693696, "step": 24785 }, { "epoch": 0.6056238242982435, "grad_norm": 7.428202152252197, "learning_rate": 1.9972843562837737e-06, "loss": 0.1666, "num_input_tokens_seen": 16696768, "step": 24790 }, { "epoch": 0.6057459751300907, "grad_norm": 19.68970489501953, "learning_rate": 1.997278072199567e-06, "loss": 0.0921, "num_input_tokens_seen": 16700032, "step": 24795 }, { "epoch": 0.6058681259619378, "grad_norm": 7.9083027839660645, "learning_rate": 1.997271780862885e-06, "loss": 0.0925, "num_input_tokens_seen": 16703744, "step": 24800 }, { "epoch": 0.6059902767937849, "grad_norm": 7.435184001922607, "learning_rate": 1.9972654822737753e-06, "loss": 0.085, "num_input_tokens_seen": 16707264, "step": 24805 }, { "epoch": 0.6061124276256321, "grad_norm": 16.882427215576172, "learning_rate": 1.997259176432282e-06, "loss": 0.0473, "num_input_tokens_seen": 16710976, "step": 24810 }, { "epoch": 0.6062345784574793, "grad_norm": 22.054662704467773, "learning_rate": 1.997252863338452e-06, "loss": 0.0786, "num_input_tokens_seen": 16714560, "step": 24815 }, { "epoch": 0.6063567292893265, "grad_norm": 4.894041061401367, "learning_rate": 1.9972465429923315e-06, "loss": 0.0628, "num_input_tokens_seen": 16718144, "step": 24820 }, { "epoch": 0.6064788801211737, "grad_norm": 7.330321788787842, "learning_rate": 1.997240215393965e-06, "loss": 0.1397, "num_input_tokens_seen": 16721344, "step": 24825 }, { "epoch": 0.6066010309530208, "grad_norm": 11.391115188598633, "learning_rate": 1.9972338805434002e-06, "loss": 0.1264, "num_input_tokens_seen": 16724480, "step": 24830 }, { "epoch": 0.606723181784868, "grad_norm": 10.962759971618652, "learning_rate": 1.9972275384406823e-06, "loss": 0.0505, "num_input_tokens_seen": 16727808, "step": 24835 }, { "epoch": 0.6068453326167151, "grad_norm": 14.033049583435059, "learning_rate": 1.997221189085857e-06, "loss": 0.0473, "num_input_tokens_seen": 16730752, "step": 24840 }, { "epoch": 0.6069674834485623, "grad_norm": 8.855101585388184, "learning_rate": 1.9972148324789714e-06, "loss": 0.1355, "num_input_tokens_seen": 16734144, "step": 24845 }, { "epoch": 0.6070896342804094, "grad_norm": 19.79100799560547, "learning_rate": 1.9972084686200712e-06, "loss": 0.2389, "num_input_tokens_seen": 16737792, "step": 24850 }, { "epoch": 0.6072117851122566, "grad_norm": 12.284224510192871, "learning_rate": 1.997202097509203e-06, "loss": 0.2192, "num_input_tokens_seen": 16740928, "step": 24855 }, { "epoch": 0.6073339359441038, "grad_norm": 17.570894241333008, "learning_rate": 1.997195719146413e-06, "loss": 0.1178, "num_input_tokens_seen": 16744256, "step": 24860 }, { "epoch": 0.607456086775951, "grad_norm": 12.302034378051758, "learning_rate": 1.9971893335317472e-06, "loss": 0.1032, "num_input_tokens_seen": 16747776, "step": 24865 }, { "epoch": 0.607578237607798, "grad_norm": 17.891658782958984, "learning_rate": 1.997182940665252e-06, "loss": 0.1418, "num_input_tokens_seen": 16751168, "step": 24870 }, { "epoch": 0.6077003884396452, "grad_norm": 7.210624694824219, "learning_rate": 1.997176540546975e-06, "loss": 0.1081, "num_input_tokens_seen": 16754624, "step": 24875 }, { "epoch": 0.6078225392714924, "grad_norm": 16.911026000976562, "learning_rate": 1.997170133176962e-06, "loss": 0.0947, "num_input_tokens_seen": 16758016, "step": 24880 }, { "epoch": 0.6079446901033396, "grad_norm": 8.62895679473877, "learning_rate": 1.9971637185552593e-06, "loss": 0.0705, "num_input_tokens_seen": 16761344, "step": 24885 }, { "epoch": 0.6080668409351868, "grad_norm": 18.918378829956055, "learning_rate": 1.997157296681914e-06, "loss": 0.1168, "num_input_tokens_seen": 16764480, "step": 24890 }, { "epoch": 0.6081889917670339, "grad_norm": 8.970353126525879, "learning_rate": 1.997150867556972e-06, "loss": 0.1466, "num_input_tokens_seen": 16767936, "step": 24895 }, { "epoch": 0.6083111425988811, "grad_norm": 17.729246139526367, "learning_rate": 1.997144431180481e-06, "loss": 0.0522, "num_input_tokens_seen": 16771328, "step": 24900 }, { "epoch": 0.6084332934307283, "grad_norm": 25.908403396606445, "learning_rate": 1.9971379875524876e-06, "loss": 0.1292, "num_input_tokens_seen": 16774400, "step": 24905 }, { "epoch": 0.6085554442625755, "grad_norm": 22.828269958496094, "learning_rate": 1.9971315366730388e-06, "loss": 0.13, "num_input_tokens_seen": 16777536, "step": 24910 }, { "epoch": 0.6086775950944225, "grad_norm": 13.071663856506348, "learning_rate": 1.997125078542181e-06, "loss": 0.0675, "num_input_tokens_seen": 16780416, "step": 24915 }, { "epoch": 0.6087997459262697, "grad_norm": 17.404088973999023, "learning_rate": 1.9971186131599617e-06, "loss": 0.0505, "num_input_tokens_seen": 16783360, "step": 24920 }, { "epoch": 0.6089218967581169, "grad_norm": 6.976062297821045, "learning_rate": 1.9971121405264275e-06, "loss": 0.1188, "num_input_tokens_seen": 16786496, "step": 24925 }, { "epoch": 0.6090440475899641, "grad_norm": 2.1634817123413086, "learning_rate": 1.997105660641625e-06, "loss": 0.0607, "num_input_tokens_seen": 16789504, "step": 24930 }, { "epoch": 0.6091661984218113, "grad_norm": 12.604896545410156, "learning_rate": 1.997099173505603e-06, "loss": 0.0866, "num_input_tokens_seen": 16793088, "step": 24935 }, { "epoch": 0.6092883492536584, "grad_norm": 1.5911040306091309, "learning_rate": 1.997092679118407e-06, "loss": 0.0902, "num_input_tokens_seen": 16796864, "step": 24940 }, { "epoch": 0.6094105000855056, "grad_norm": 11.825122833251953, "learning_rate": 1.9970861774800848e-06, "loss": 0.1309, "num_input_tokens_seen": 16800256, "step": 24945 }, { "epoch": 0.6095326509173528, "grad_norm": 20.49022102355957, "learning_rate": 1.9970796685906838e-06, "loss": 0.066, "num_input_tokens_seen": 16803648, "step": 24950 }, { "epoch": 0.6096548017492, "grad_norm": 12.80346393585205, "learning_rate": 1.9970731524502517e-06, "loss": 0.0865, "num_input_tokens_seen": 16807232, "step": 24955 }, { "epoch": 0.609776952581047, "grad_norm": 0.6074082851409912, "learning_rate": 1.9970666290588348e-06, "loss": 0.1032, "num_input_tokens_seen": 16810624, "step": 24960 }, { "epoch": 0.6098991034128942, "grad_norm": 37.6062126159668, "learning_rate": 1.9970600984164817e-06, "loss": 0.2635, "num_input_tokens_seen": 16813696, "step": 24965 }, { "epoch": 0.6100212542447414, "grad_norm": 25.862201690673828, "learning_rate": 1.9970535605232394e-06, "loss": 0.137, "num_input_tokens_seen": 16816960, "step": 24970 }, { "epoch": 0.6101434050765886, "grad_norm": 15.700074195861816, "learning_rate": 1.9970470153791553e-06, "loss": 0.0676, "num_input_tokens_seen": 16820096, "step": 24975 }, { "epoch": 0.6102655559084358, "grad_norm": 22.41241455078125, "learning_rate": 1.997040462984277e-06, "loss": 0.1488, "num_input_tokens_seen": 16823232, "step": 24980 }, { "epoch": 0.6103877067402829, "grad_norm": 7.360961437225342, "learning_rate": 1.997033903338652e-06, "loss": 0.0996, "num_input_tokens_seen": 16826304, "step": 24985 }, { "epoch": 0.61050985757213, "grad_norm": 19.96744728088379, "learning_rate": 1.9970273364423292e-06, "loss": 0.198, "num_input_tokens_seen": 16829760, "step": 24990 }, { "epoch": 0.6106320084039772, "grad_norm": 21.676393508911133, "learning_rate": 1.9970207622953547e-06, "loss": 0.1193, "num_input_tokens_seen": 16832832, "step": 24995 }, { "epoch": 0.6107541592358244, "grad_norm": 5.05864143371582, "learning_rate": 1.9970141808977773e-06, "loss": 0.0615, "num_input_tokens_seen": 16836416, "step": 25000 }, { "epoch": 0.6108763100676715, "grad_norm": 31.107868194580078, "learning_rate": 1.9970075922496444e-06, "loss": 0.1696, "num_input_tokens_seen": 16839616, "step": 25005 }, { "epoch": 0.6109984608995187, "grad_norm": 20.692991256713867, "learning_rate": 1.9970009963510044e-06, "loss": 0.0412, "num_input_tokens_seen": 16843008, "step": 25010 }, { "epoch": 0.6111206117313659, "grad_norm": 31.31917381286621, "learning_rate": 1.9969943932019047e-06, "loss": 0.1899, "num_input_tokens_seen": 16846464, "step": 25015 }, { "epoch": 0.6112427625632131, "grad_norm": 18.618642807006836, "learning_rate": 1.996987782802394e-06, "loss": 0.0507, "num_input_tokens_seen": 16849984, "step": 25020 }, { "epoch": 0.6113649133950603, "grad_norm": 0.8102744221687317, "learning_rate": 1.9969811651525196e-06, "loss": 0.0448, "num_input_tokens_seen": 16853184, "step": 25025 }, { "epoch": 0.6114870642269074, "grad_norm": 0.0666152685880661, "learning_rate": 1.9969745402523303e-06, "loss": 0.1035, "num_input_tokens_seen": 16856640, "step": 25030 }, { "epoch": 0.6116092150587545, "grad_norm": 41.11848831176758, "learning_rate": 1.9969679081018737e-06, "loss": 0.0872, "num_input_tokens_seen": 16859712, "step": 25035 }, { "epoch": 0.6117313658906017, "grad_norm": 9.146672248840332, "learning_rate": 1.9969612687011987e-06, "loss": 0.1359, "num_input_tokens_seen": 16863040, "step": 25040 }, { "epoch": 0.6118535167224489, "grad_norm": 22.66708755493164, "learning_rate": 1.996954622050353e-06, "loss": 0.205, "num_input_tokens_seen": 16866176, "step": 25045 }, { "epoch": 0.611975667554296, "grad_norm": 11.185577392578125, "learning_rate": 1.996947968149385e-06, "loss": 0.0858, "num_input_tokens_seen": 16869632, "step": 25050 }, { "epoch": 0.6120978183861432, "grad_norm": 4.112269878387451, "learning_rate": 1.9969413069983435e-06, "loss": 0.0653, "num_input_tokens_seen": 16873024, "step": 25055 }, { "epoch": 0.6122199692179904, "grad_norm": 10.49512004852295, "learning_rate": 1.9969346385972764e-06, "loss": 0.14, "num_input_tokens_seen": 16876736, "step": 25060 }, { "epoch": 0.6123421200498376, "grad_norm": 24.078441619873047, "learning_rate": 1.9969279629462327e-06, "loss": 0.2184, "num_input_tokens_seen": 16880128, "step": 25065 }, { "epoch": 0.6124642708816848, "grad_norm": 10.833511352539062, "learning_rate": 1.9969212800452608e-06, "loss": 0.066, "num_input_tokens_seen": 16883072, "step": 25070 }, { "epoch": 0.6125864217135318, "grad_norm": 18.679733276367188, "learning_rate": 1.996914589894409e-06, "loss": 0.0957, "num_input_tokens_seen": 16886464, "step": 25075 }, { "epoch": 0.612708572545379, "grad_norm": 4.080750942230225, "learning_rate": 1.9969078924937263e-06, "loss": 0.0604, "num_input_tokens_seen": 16890112, "step": 25080 }, { "epoch": 0.6128307233772262, "grad_norm": 33.040748596191406, "learning_rate": 1.9969011878432608e-06, "loss": 0.1052, "num_input_tokens_seen": 16893632, "step": 25085 }, { "epoch": 0.6129528742090734, "grad_norm": 27.59091567993164, "learning_rate": 1.996894475943062e-06, "loss": 0.1961, "num_input_tokens_seen": 16896832, "step": 25090 }, { "epoch": 0.6130750250409205, "grad_norm": 8.494720458984375, "learning_rate": 1.996887756793179e-06, "loss": 0.237, "num_input_tokens_seen": 16900224, "step": 25095 }, { "epoch": 0.6131971758727677, "grad_norm": 9.366389274597168, "learning_rate": 1.9968810303936593e-06, "loss": 0.0974, "num_input_tokens_seen": 16903552, "step": 25100 }, { "epoch": 0.6133193267046149, "grad_norm": 27.643888473510742, "learning_rate": 1.996874296744553e-06, "loss": 0.1542, "num_input_tokens_seen": 16906688, "step": 25105 }, { "epoch": 0.613441477536462, "grad_norm": 24.457929611206055, "learning_rate": 1.9968675558459085e-06, "loss": 0.0714, "num_input_tokens_seen": 16909760, "step": 25110 }, { "epoch": 0.6135636283683091, "grad_norm": 2.291285753250122, "learning_rate": 1.9968608076977753e-06, "loss": 0.0904, "num_input_tokens_seen": 16913088, "step": 25115 }, { "epoch": 0.6136857792001563, "grad_norm": 9.119710922241211, "learning_rate": 1.996854052300202e-06, "loss": 0.1161, "num_input_tokens_seen": 16916160, "step": 25120 }, { "epoch": 0.6138079300320035, "grad_norm": 16.056711196899414, "learning_rate": 1.996847289653238e-06, "loss": 0.034, "num_input_tokens_seen": 16919744, "step": 25125 }, { "epoch": 0.6139300808638507, "grad_norm": 14.756852149963379, "learning_rate": 1.996840519756932e-06, "loss": 0.1062, "num_input_tokens_seen": 16923264, "step": 25130 }, { "epoch": 0.6140522316956979, "grad_norm": 13.74815845489502, "learning_rate": 1.996833742611334e-06, "loss": 0.153, "num_input_tokens_seen": 16926272, "step": 25135 }, { "epoch": 0.614174382527545, "grad_norm": 12.079375267028809, "learning_rate": 1.996826958216493e-06, "loss": 0.1932, "num_input_tokens_seen": 16929728, "step": 25140 }, { "epoch": 0.6142965333593922, "grad_norm": 14.67190933227539, "learning_rate": 1.996820166572458e-06, "loss": 0.0886, "num_input_tokens_seen": 16933376, "step": 25145 }, { "epoch": 0.6144186841912394, "grad_norm": 4.266879081726074, "learning_rate": 1.996813367679279e-06, "loss": 0.0943, "num_input_tokens_seen": 16936448, "step": 25150 }, { "epoch": 0.6145408350230865, "grad_norm": 14.214287757873535, "learning_rate": 1.9968065615370046e-06, "loss": 0.1077, "num_input_tokens_seen": 16939968, "step": 25155 }, { "epoch": 0.6146629858549336, "grad_norm": 1.067999243736267, "learning_rate": 1.996799748145685e-06, "loss": 0.1043, "num_input_tokens_seen": 16943232, "step": 25160 }, { "epoch": 0.6147851366867808, "grad_norm": 9.434431076049805, "learning_rate": 1.9967929275053695e-06, "loss": 0.1222, "num_input_tokens_seen": 16945984, "step": 25165 }, { "epoch": 0.614907287518628, "grad_norm": 3.1949663162231445, "learning_rate": 1.996786099616108e-06, "loss": 0.2028, "num_input_tokens_seen": 16949376, "step": 25170 }, { "epoch": 0.6150294383504752, "grad_norm": 15.2947359085083, "learning_rate": 1.9967792644779496e-06, "loss": 0.1709, "num_input_tokens_seen": 16952512, "step": 25175 }, { "epoch": 0.6151515891823224, "grad_norm": 18.07305908203125, "learning_rate": 1.9967724220909444e-06, "loss": 0.154, "num_input_tokens_seen": 16955840, "step": 25180 }, { "epoch": 0.6152737400141695, "grad_norm": 16.07686424255371, "learning_rate": 1.996765572455142e-06, "loss": 0.0944, "num_input_tokens_seen": 16959616, "step": 25185 }, { "epoch": 0.6153958908460166, "grad_norm": 36.54639434814453, "learning_rate": 1.996758715570592e-06, "loss": 0.147, "num_input_tokens_seen": 16962880, "step": 25190 }, { "epoch": 0.6155180416778638, "grad_norm": 9.950146675109863, "learning_rate": 1.9967518514373447e-06, "loss": 0.1115, "num_input_tokens_seen": 16965952, "step": 25195 }, { "epoch": 0.615640192509711, "grad_norm": 22.124109268188477, "learning_rate": 1.9967449800554497e-06, "loss": 0.0778, "num_input_tokens_seen": 16969280, "step": 25200 }, { "epoch": 0.6157623433415581, "grad_norm": 17.615293502807617, "learning_rate": 1.996738101424957e-06, "loss": 0.1822, "num_input_tokens_seen": 16972736, "step": 25205 }, { "epoch": 0.6158844941734053, "grad_norm": 15.409132957458496, "learning_rate": 1.9967312155459175e-06, "loss": 0.1052, "num_input_tokens_seen": 16976064, "step": 25210 }, { "epoch": 0.6160066450052525, "grad_norm": 23.066360473632812, "learning_rate": 1.99672432241838e-06, "loss": 0.1949, "num_input_tokens_seen": 16979328, "step": 25215 }, { "epoch": 0.6161287958370997, "grad_norm": 13.9695463180542, "learning_rate": 1.9967174220423954e-06, "loss": 0.1043, "num_input_tokens_seen": 16982464, "step": 25220 }, { "epoch": 0.6162509466689469, "grad_norm": 0.6445543169975281, "learning_rate": 1.996710514418013e-06, "loss": 0.1102, "num_input_tokens_seen": 16986048, "step": 25225 }, { "epoch": 0.6163730975007939, "grad_norm": 15.676551818847656, "learning_rate": 1.996703599545284e-06, "loss": 0.134, "num_input_tokens_seen": 16989312, "step": 25230 }, { "epoch": 0.6164952483326411, "grad_norm": 1.9470303058624268, "learning_rate": 1.996696677424259e-06, "loss": 0.0338, "num_input_tokens_seen": 16993088, "step": 25235 }, { "epoch": 0.6166173991644883, "grad_norm": 13.360188484191895, "learning_rate": 1.996689748054987e-06, "loss": 0.1011, "num_input_tokens_seen": 16996672, "step": 25240 }, { "epoch": 0.6167395499963355, "grad_norm": 11.234295845031738, "learning_rate": 1.996682811437519e-06, "loss": 0.1098, "num_input_tokens_seen": 17000128, "step": 25245 }, { "epoch": 0.6168617008281826, "grad_norm": 6.157711505889893, "learning_rate": 1.9966758675719057e-06, "loss": 0.0585, "num_input_tokens_seen": 17003712, "step": 25250 }, { "epoch": 0.6169838516600298, "grad_norm": 10.527791023254395, "learning_rate": 1.996668916458197e-06, "loss": 0.0604, "num_input_tokens_seen": 17007872, "step": 25255 }, { "epoch": 0.617106002491877, "grad_norm": 1.8814789056777954, "learning_rate": 1.9966619580964446e-06, "loss": 0.0614, "num_input_tokens_seen": 17011392, "step": 25260 }, { "epoch": 0.6172281533237242, "grad_norm": 22.949785232543945, "learning_rate": 1.996654992486698e-06, "loss": 0.1322, "num_input_tokens_seen": 17014720, "step": 25265 }, { "epoch": 0.6173503041555713, "grad_norm": 15.85322093963623, "learning_rate": 1.9966480196290087e-06, "loss": 0.0883, "num_input_tokens_seen": 17018304, "step": 25270 }, { "epoch": 0.6174724549874184, "grad_norm": 11.826990127563477, "learning_rate": 1.996641039523426e-06, "loss": 0.1705, "num_input_tokens_seen": 17021888, "step": 25275 }, { "epoch": 0.6175946058192656, "grad_norm": 0.42177316546440125, "learning_rate": 1.9966340521700024e-06, "loss": 0.1127, "num_input_tokens_seen": 17025280, "step": 25280 }, { "epoch": 0.6177167566511128, "grad_norm": 0.6035380959510803, "learning_rate": 1.9966270575687876e-06, "loss": 0.021, "num_input_tokens_seen": 17029056, "step": 25285 }, { "epoch": 0.61783890748296, "grad_norm": 6.330459117889404, "learning_rate": 1.996620055719833e-06, "loss": 0.0255, "num_input_tokens_seen": 17032384, "step": 25290 }, { "epoch": 0.6179610583148071, "grad_norm": 0.0786670371890068, "learning_rate": 1.9966130466231886e-06, "loss": 0.081, "num_input_tokens_seen": 17035392, "step": 25295 }, { "epoch": 0.6180832091466543, "grad_norm": 4.98344087600708, "learning_rate": 1.996606030278907e-06, "loss": 0.1271, "num_input_tokens_seen": 17038848, "step": 25300 }, { "epoch": 0.6182053599785015, "grad_norm": 27.002077102661133, "learning_rate": 1.9965990066870374e-06, "loss": 0.1147, "num_input_tokens_seen": 17042816, "step": 25305 }, { "epoch": 0.6183275108103486, "grad_norm": 14.993169784545898, "learning_rate": 1.9965919758476325e-06, "loss": 0.0699, "num_input_tokens_seen": 17045824, "step": 25310 }, { "epoch": 0.6184496616421958, "grad_norm": 20.037349700927734, "learning_rate": 1.9965849377607423e-06, "loss": 0.1055, "num_input_tokens_seen": 17049152, "step": 25315 }, { "epoch": 0.6185718124740429, "grad_norm": 15.71719741821289, "learning_rate": 1.9965778924264183e-06, "loss": 0.1039, "num_input_tokens_seen": 17052416, "step": 25320 }, { "epoch": 0.6186939633058901, "grad_norm": 9.317479133605957, "learning_rate": 1.996570839844712e-06, "loss": 0.0592, "num_input_tokens_seen": 17055872, "step": 25325 }, { "epoch": 0.6188161141377373, "grad_norm": 2.1688971519470215, "learning_rate": 1.9965637800156747e-06, "loss": 0.1786, "num_input_tokens_seen": 17058880, "step": 25330 }, { "epoch": 0.6189382649695845, "grad_norm": 23.790361404418945, "learning_rate": 1.9965567129393576e-06, "loss": 0.0409, "num_input_tokens_seen": 17062208, "step": 25335 }, { "epoch": 0.6190604158014316, "grad_norm": 0.34484490752220154, "learning_rate": 1.9965496386158117e-06, "loss": 0.1413, "num_input_tokens_seen": 17065920, "step": 25340 }, { "epoch": 0.6191825666332788, "grad_norm": 1.4263343811035156, "learning_rate": 1.996542557045089e-06, "loss": 0.1079, "num_input_tokens_seen": 17069568, "step": 25345 }, { "epoch": 0.6193047174651259, "grad_norm": 34.41661071777344, "learning_rate": 1.9965354682272405e-06, "loss": 0.2241, "num_input_tokens_seen": 17073216, "step": 25350 }, { "epoch": 0.6194268682969731, "grad_norm": 0.44979265332221985, "learning_rate": 1.9965283721623185e-06, "loss": 0.1075, "num_input_tokens_seen": 17076928, "step": 25355 }, { "epoch": 0.6195490191288203, "grad_norm": 3.4772098064422607, "learning_rate": 1.9965212688503736e-06, "loss": 0.0357, "num_input_tokens_seen": 17080000, "step": 25360 }, { "epoch": 0.6196711699606674, "grad_norm": 20.216279983520508, "learning_rate": 1.9965141582914583e-06, "loss": 0.0264, "num_input_tokens_seen": 17083456, "step": 25365 }, { "epoch": 0.6197933207925146, "grad_norm": 41.14569854736328, "learning_rate": 1.996507040485624e-06, "loss": 0.1875, "num_input_tokens_seen": 17087616, "step": 25370 }, { "epoch": 0.6199154716243618, "grad_norm": 13.675374984741211, "learning_rate": 1.9964999154329224e-06, "loss": 0.1642, "num_input_tokens_seen": 17090688, "step": 25375 }, { "epoch": 0.620037622456209, "grad_norm": 0.7034504413604736, "learning_rate": 1.9964927831334056e-06, "loss": 0.1116, "num_input_tokens_seen": 17094272, "step": 25380 }, { "epoch": 0.620159773288056, "grad_norm": 33.59286880493164, "learning_rate": 1.996485643587125e-06, "loss": 0.1427, "num_input_tokens_seen": 17097344, "step": 25385 }, { "epoch": 0.6202819241199032, "grad_norm": 9.86114501953125, "learning_rate": 1.996478496794133e-06, "loss": 0.1185, "num_input_tokens_seen": 17101120, "step": 25390 }, { "epoch": 0.6204040749517504, "grad_norm": 11.22215461730957, "learning_rate": 1.9964713427544813e-06, "loss": 0.0828, "num_input_tokens_seen": 17104192, "step": 25395 }, { "epoch": 0.6205262257835976, "grad_norm": 22.48546028137207, "learning_rate": 1.996464181468222e-06, "loss": 0.1144, "num_input_tokens_seen": 17108096, "step": 25400 }, { "epoch": 0.6206483766154447, "grad_norm": 2.969916582107544, "learning_rate": 1.9964570129354066e-06, "loss": 0.2225, "num_input_tokens_seen": 17111104, "step": 25405 }, { "epoch": 0.6207705274472919, "grad_norm": 3.541419506072998, "learning_rate": 1.9964498371560886e-06, "loss": 0.0343, "num_input_tokens_seen": 17114432, "step": 25410 }, { "epoch": 0.6208926782791391, "grad_norm": 29.178781509399414, "learning_rate": 1.9964426541303186e-06, "loss": 0.1387, "num_input_tokens_seen": 17117440, "step": 25415 }, { "epoch": 0.6210148291109863, "grad_norm": 2.5262792110443115, "learning_rate": 1.9964354638581503e-06, "loss": 0.1161, "num_input_tokens_seen": 17120896, "step": 25420 }, { "epoch": 0.6211369799428335, "grad_norm": 0.37980780005455017, "learning_rate": 1.996428266339635e-06, "loss": 0.0561, "num_input_tokens_seen": 17124416, "step": 25425 }, { "epoch": 0.6212591307746805, "grad_norm": 26.190187454223633, "learning_rate": 1.9964210615748255e-06, "loss": 0.1095, "num_input_tokens_seen": 17127808, "step": 25430 }, { "epoch": 0.6213812816065277, "grad_norm": 0.6196547150611877, "learning_rate": 1.996413849563774e-06, "loss": 0.0779, "num_input_tokens_seen": 17131008, "step": 25435 }, { "epoch": 0.6215034324383749, "grad_norm": 0.7363035678863525, "learning_rate": 1.9964066303065325e-06, "loss": 0.0774, "num_input_tokens_seen": 17133952, "step": 25440 }, { "epoch": 0.6216255832702221, "grad_norm": 1.5437747240066528, "learning_rate": 1.9963994038031546e-06, "loss": 0.0625, "num_input_tokens_seen": 17137600, "step": 25445 }, { "epoch": 0.6217477341020692, "grad_norm": 83.51165008544922, "learning_rate": 1.996392170053692e-06, "loss": 0.2014, "num_input_tokens_seen": 17140864, "step": 25450 }, { "epoch": 0.6218698849339164, "grad_norm": 0.4975201189517975, "learning_rate": 1.9963849290581974e-06, "loss": 0.0841, "num_input_tokens_seen": 17144256, "step": 25455 }, { "epoch": 0.6219920357657636, "grad_norm": 2.202531576156616, "learning_rate": 1.996377680816724e-06, "loss": 0.1522, "num_input_tokens_seen": 17147776, "step": 25460 }, { "epoch": 0.6221141865976108, "grad_norm": 17.93608283996582, "learning_rate": 1.9963704253293237e-06, "loss": 0.1654, "num_input_tokens_seen": 17151232, "step": 25465 }, { "epoch": 0.6222363374294579, "grad_norm": 0.37878966331481934, "learning_rate": 1.99636316259605e-06, "loss": 0.1012, "num_input_tokens_seen": 17154624, "step": 25470 }, { "epoch": 0.622358488261305, "grad_norm": 15.742181777954102, "learning_rate": 1.9963558926169552e-06, "loss": 0.1666, "num_input_tokens_seen": 17158080, "step": 25475 }, { "epoch": 0.6224806390931522, "grad_norm": 0.2844081521034241, "learning_rate": 1.9963486153920925e-06, "loss": 0.0974, "num_input_tokens_seen": 17161920, "step": 25480 }, { "epoch": 0.6226027899249994, "grad_norm": 0.17397256195545197, "learning_rate": 1.9963413309215143e-06, "loss": 0.1068, "num_input_tokens_seen": 17165248, "step": 25485 }, { "epoch": 0.6227249407568466, "grad_norm": 9.662365913391113, "learning_rate": 1.9963340392052744e-06, "loss": 0.1319, "num_input_tokens_seen": 17168576, "step": 25490 }, { "epoch": 0.6228470915886937, "grad_norm": 43.83585739135742, "learning_rate": 1.9963267402434253e-06, "loss": 0.1814, "num_input_tokens_seen": 17172032, "step": 25495 }, { "epoch": 0.6229692424205409, "grad_norm": 22.395421981811523, "learning_rate": 1.99631943403602e-06, "loss": 0.0794, "num_input_tokens_seen": 17175552, "step": 25500 }, { "epoch": 0.623091393252388, "grad_norm": 13.20292854309082, "learning_rate": 1.996312120583112e-06, "loss": 0.0511, "num_input_tokens_seen": 17178880, "step": 25505 }, { "epoch": 0.6232135440842352, "grad_norm": 1.7054976224899292, "learning_rate": 1.996304799884754e-06, "loss": 0.1083, "num_input_tokens_seen": 17182208, "step": 25510 }, { "epoch": 0.6233356949160824, "grad_norm": 1.7298126220703125, "learning_rate": 1.996297471941e-06, "loss": 0.0684, "num_input_tokens_seen": 17185600, "step": 25515 }, { "epoch": 0.6234578457479295, "grad_norm": 23.883642196655273, "learning_rate": 1.9962901367519023e-06, "loss": 0.071, "num_input_tokens_seen": 17189184, "step": 25520 }, { "epoch": 0.6235799965797767, "grad_norm": 20.736419677734375, "learning_rate": 1.996282794317515e-06, "loss": 0.1737, "num_input_tokens_seen": 17193152, "step": 25525 }, { "epoch": 0.6237021474116239, "grad_norm": 9.693829536437988, "learning_rate": 1.996275444637891e-06, "loss": 0.1166, "num_input_tokens_seen": 17196672, "step": 25530 }, { "epoch": 0.6238242982434711, "grad_norm": 22.503318786621094, "learning_rate": 1.9962680877130842e-06, "loss": 0.1957, "num_input_tokens_seen": 17200000, "step": 25535 }, { "epoch": 0.6239464490753182, "grad_norm": 15.77971363067627, "learning_rate": 1.996260723543148e-06, "loss": 0.1698, "num_input_tokens_seen": 17203392, "step": 25540 }, { "epoch": 0.6240685999071653, "grad_norm": 52.5566291809082, "learning_rate": 1.996253352128136e-06, "loss": 0.1698, "num_input_tokens_seen": 17206592, "step": 25545 }, { "epoch": 0.6241907507390125, "grad_norm": 4.498231410980225, "learning_rate": 1.996245973468101e-06, "loss": 0.1594, "num_input_tokens_seen": 17210048, "step": 25550 }, { "epoch": 0.6243129015708597, "grad_norm": 6.834176063537598, "learning_rate": 1.9962385875630977e-06, "loss": 0.1584, "num_input_tokens_seen": 17214848, "step": 25555 }, { "epoch": 0.6244350524027069, "grad_norm": 20.276023864746094, "learning_rate": 1.9962311944131796e-06, "loss": 0.0985, "num_input_tokens_seen": 17218304, "step": 25560 }, { "epoch": 0.624557203234554, "grad_norm": 9.306912422180176, "learning_rate": 1.9962237940184003e-06, "loss": 0.0956, "num_input_tokens_seen": 17221632, "step": 25565 }, { "epoch": 0.6246793540664012, "grad_norm": 14.916698455810547, "learning_rate": 1.9962163863788134e-06, "loss": 0.1397, "num_input_tokens_seen": 17225152, "step": 25570 }, { "epoch": 0.6248015048982484, "grad_norm": 21.793596267700195, "learning_rate": 1.996208971494473e-06, "loss": 0.0717, "num_input_tokens_seen": 17229056, "step": 25575 }, { "epoch": 0.6249236557300956, "grad_norm": 1.2358403205871582, "learning_rate": 1.9962015493654334e-06, "loss": 0.0514, "num_input_tokens_seen": 17232320, "step": 25580 }, { "epoch": 0.6250458065619426, "grad_norm": 9.152735710144043, "learning_rate": 1.9961941199917477e-06, "loss": 0.1642, "num_input_tokens_seen": 17235328, "step": 25585 }, { "epoch": 0.6251679573937898, "grad_norm": 5.852708339691162, "learning_rate": 1.9961866833734705e-06, "loss": 0.0463, "num_input_tokens_seen": 17238336, "step": 25590 }, { "epoch": 0.625290108225637, "grad_norm": 16.526809692382812, "learning_rate": 1.996179239510656e-06, "loss": 0.1545, "num_input_tokens_seen": 17241728, "step": 25595 }, { "epoch": 0.6254122590574842, "grad_norm": 24.526718139648438, "learning_rate": 1.996171788403358e-06, "loss": 0.1015, "num_input_tokens_seen": 17245248, "step": 25600 }, { "epoch": 0.6255344098893314, "grad_norm": 3.7110772132873535, "learning_rate": 1.996164330051631e-06, "loss": 0.1408, "num_input_tokens_seen": 17248960, "step": 25605 }, { "epoch": 0.6256565607211785, "grad_norm": 28.51833152770996, "learning_rate": 1.996156864455529e-06, "loss": 0.0738, "num_input_tokens_seen": 17252352, "step": 25610 }, { "epoch": 0.6257787115530257, "grad_norm": 14.709949493408203, "learning_rate": 1.996149391615106e-06, "loss": 0.1187, "num_input_tokens_seen": 17255488, "step": 25615 }, { "epoch": 0.6259008623848729, "grad_norm": 17.817319869995117, "learning_rate": 1.996141911530417e-06, "loss": 0.0899, "num_input_tokens_seen": 17258752, "step": 25620 }, { "epoch": 0.62602301321672, "grad_norm": 17.193923950195312, "learning_rate": 1.996134424201516e-06, "loss": 0.1487, "num_input_tokens_seen": 17262272, "step": 25625 }, { "epoch": 0.6261451640485671, "grad_norm": 22.279172897338867, "learning_rate": 1.9961269296284574e-06, "loss": 0.1187, "num_input_tokens_seen": 17265600, "step": 25630 }, { "epoch": 0.6262673148804143, "grad_norm": 23.591035842895508, "learning_rate": 1.9961194278112963e-06, "loss": 0.072, "num_input_tokens_seen": 17268992, "step": 25635 }, { "epoch": 0.6263894657122615, "grad_norm": 10.646063804626465, "learning_rate": 1.9961119187500867e-06, "loss": 0.1307, "num_input_tokens_seen": 17273152, "step": 25640 }, { "epoch": 0.6265116165441087, "grad_norm": 13.584372520446777, "learning_rate": 1.996104402444883e-06, "loss": 0.0832, "num_input_tokens_seen": 17276480, "step": 25645 }, { "epoch": 0.6266337673759559, "grad_norm": 8.617551803588867, "learning_rate": 1.99609687889574e-06, "loss": 0.1212, "num_input_tokens_seen": 17279488, "step": 25650 }, { "epoch": 0.626755918207803, "grad_norm": 30.092082977294922, "learning_rate": 1.996089348102713e-06, "loss": 0.1022, "num_input_tokens_seen": 17283328, "step": 25655 }, { "epoch": 0.6268780690396502, "grad_norm": 26.2705135345459, "learning_rate": 1.996081810065856e-06, "loss": 0.0568, "num_input_tokens_seen": 17286848, "step": 25660 }, { "epoch": 0.6270002198714973, "grad_norm": 11.056796073913574, "learning_rate": 1.9960742647852246e-06, "loss": 0.1768, "num_input_tokens_seen": 17291072, "step": 25665 }, { "epoch": 0.6271223707033445, "grad_norm": 22.28460121154785, "learning_rate": 1.9960667122608732e-06, "loss": 0.1485, "num_input_tokens_seen": 17294208, "step": 25670 }, { "epoch": 0.6272445215351916, "grad_norm": 10.146092414855957, "learning_rate": 1.996059152492856e-06, "loss": 0.1853, "num_input_tokens_seen": 17298112, "step": 25675 }, { "epoch": 0.6273666723670388, "grad_norm": 12.486412048339844, "learning_rate": 1.9960515854812298e-06, "loss": 0.1721, "num_input_tokens_seen": 17301184, "step": 25680 }, { "epoch": 0.627488823198886, "grad_norm": 12.366875648498535, "learning_rate": 1.996044011226048e-06, "loss": 0.1373, "num_input_tokens_seen": 17304512, "step": 25685 }, { "epoch": 0.6276109740307332, "grad_norm": 12.222225189208984, "learning_rate": 1.996036429727366e-06, "loss": 0.0416, "num_input_tokens_seen": 17307520, "step": 25690 }, { "epoch": 0.6277331248625803, "grad_norm": 11.332962036132812, "learning_rate": 1.99602884098524e-06, "loss": 0.152, "num_input_tokens_seen": 17311104, "step": 25695 }, { "epoch": 0.6278552756944275, "grad_norm": 13.932801246643066, "learning_rate": 1.9960212449997238e-06, "loss": 0.0961, "num_input_tokens_seen": 17314560, "step": 25700 }, { "epoch": 0.6279774265262746, "grad_norm": 12.53373908996582, "learning_rate": 1.996013641770873e-06, "loss": 0.1072, "num_input_tokens_seen": 17317888, "step": 25705 }, { "epoch": 0.6280995773581218, "grad_norm": 7.917189598083496, "learning_rate": 1.9960060312987434e-06, "loss": 0.0653, "num_input_tokens_seen": 17320896, "step": 25710 }, { "epoch": 0.628221728189969, "grad_norm": 25.79001808166504, "learning_rate": 1.9959984135833902e-06, "loss": 0.1914, "num_input_tokens_seen": 17324160, "step": 25715 }, { "epoch": 0.6283438790218161, "grad_norm": 16.6901912689209, "learning_rate": 1.9959907886248686e-06, "loss": 0.1041, "num_input_tokens_seen": 17327168, "step": 25720 }, { "epoch": 0.6284660298536633, "grad_norm": 11.882837295532227, "learning_rate": 1.9959831564232335e-06, "loss": 0.1079, "num_input_tokens_seen": 17330624, "step": 25725 }, { "epoch": 0.6285881806855105, "grad_norm": 15.25439453125, "learning_rate": 1.9959755169785417e-06, "loss": 0.0786, "num_input_tokens_seen": 17333632, "step": 25730 }, { "epoch": 0.6287103315173577, "grad_norm": 9.499648094177246, "learning_rate": 1.995967870290848e-06, "loss": 0.0607, "num_input_tokens_seen": 17337024, "step": 25735 }, { "epoch": 0.6288324823492047, "grad_norm": 12.742144584655762, "learning_rate": 1.9959602163602077e-06, "loss": 0.0855, "num_input_tokens_seen": 17340352, "step": 25740 }, { "epoch": 0.6289546331810519, "grad_norm": 10.112947463989258, "learning_rate": 1.9959525551866767e-06, "loss": 0.2015, "num_input_tokens_seen": 17343616, "step": 25745 }, { "epoch": 0.6290767840128991, "grad_norm": 0.5093333721160889, "learning_rate": 1.9959448867703115e-06, "loss": 0.1225, "num_input_tokens_seen": 17346880, "step": 25750 }, { "epoch": 0.6291989348447463, "grad_norm": 11.308860778808594, "learning_rate": 1.995937211111167e-06, "loss": 0.1385, "num_input_tokens_seen": 17350080, "step": 25755 }, { "epoch": 0.6293210856765935, "grad_norm": 15.600391387939453, "learning_rate": 1.9959295282092987e-06, "loss": 0.1036, "num_input_tokens_seen": 17353152, "step": 25760 }, { "epoch": 0.6294432365084406, "grad_norm": 10.270207405090332, "learning_rate": 1.9959218380647638e-06, "loss": 0.1537, "num_input_tokens_seen": 17356736, "step": 25765 }, { "epoch": 0.6295653873402878, "grad_norm": 29.24824333190918, "learning_rate": 1.995914140677617e-06, "loss": 0.1194, "num_input_tokens_seen": 17360192, "step": 25770 }, { "epoch": 0.629687538172135, "grad_norm": 2.259371519088745, "learning_rate": 1.9959064360479144e-06, "loss": 0.0897, "num_input_tokens_seen": 17366144, "step": 25775 }, { "epoch": 0.6298096890039822, "grad_norm": 0.8481370806694031, "learning_rate": 1.9958987241757126e-06, "loss": 0.1683, "num_input_tokens_seen": 17369408, "step": 25780 }, { "epoch": 0.6299318398358292, "grad_norm": 12.3348388671875, "learning_rate": 1.9958910050610674e-06, "loss": 0.0886, "num_input_tokens_seen": 17372928, "step": 25785 }, { "epoch": 0.6300539906676764, "grad_norm": 22.768659591674805, "learning_rate": 1.995883278704035e-06, "loss": 0.0823, "num_input_tokens_seen": 17376448, "step": 25790 }, { "epoch": 0.6301761414995236, "grad_norm": 1.0633013248443604, "learning_rate": 1.9958755451046716e-06, "loss": 0.1744, "num_input_tokens_seen": 17379776, "step": 25795 }, { "epoch": 0.6302982923313708, "grad_norm": 47.173545837402344, "learning_rate": 1.9958678042630333e-06, "loss": 0.1318, "num_input_tokens_seen": 17382976, "step": 25800 }, { "epoch": 0.630420443163218, "grad_norm": 28.43817138671875, "learning_rate": 1.9958600561791765e-06, "loss": 0.0919, "num_input_tokens_seen": 17386368, "step": 25805 }, { "epoch": 0.6305425939950651, "grad_norm": 11.135772705078125, "learning_rate": 1.9958523008531574e-06, "loss": 0.1327, "num_input_tokens_seen": 17389376, "step": 25810 }, { "epoch": 0.6306647448269123, "grad_norm": 13.410459518432617, "learning_rate": 1.9958445382850325e-06, "loss": 0.1494, "num_input_tokens_seen": 17392768, "step": 25815 }, { "epoch": 0.6307868956587595, "grad_norm": 15.97334098815918, "learning_rate": 1.9958367684748585e-06, "loss": 0.1626, "num_input_tokens_seen": 17395648, "step": 25820 }, { "epoch": 0.6309090464906066, "grad_norm": 25.007976531982422, "learning_rate": 1.9958289914226917e-06, "loss": 0.097, "num_input_tokens_seen": 17398784, "step": 25825 }, { "epoch": 0.6310311973224537, "grad_norm": 22.77874755859375, "learning_rate": 1.9958212071285885e-06, "loss": 0.1565, "num_input_tokens_seen": 17402112, "step": 25830 }, { "epoch": 0.6311533481543009, "grad_norm": 22.907556533813477, "learning_rate": 1.9958134155926055e-06, "loss": 0.1212, "num_input_tokens_seen": 17405568, "step": 25835 }, { "epoch": 0.6312754989861481, "grad_norm": 13.266105651855469, "learning_rate": 1.9958056168147996e-06, "loss": 0.0662, "num_input_tokens_seen": 17408640, "step": 25840 }, { "epoch": 0.6313976498179953, "grad_norm": 14.850981712341309, "learning_rate": 1.9957978107952275e-06, "loss": 0.1154, "num_input_tokens_seen": 17412096, "step": 25845 }, { "epoch": 0.6315198006498425, "grad_norm": 6.523622989654541, "learning_rate": 1.995789997533946e-06, "loss": 0.124, "num_input_tokens_seen": 17415168, "step": 25850 }, { "epoch": 0.6316419514816896, "grad_norm": 28.513198852539062, "learning_rate": 1.995782177031011e-06, "loss": 0.1397, "num_input_tokens_seen": 17418368, "step": 25855 }, { "epoch": 0.6317641023135367, "grad_norm": 5.756598472595215, "learning_rate": 1.995774349286481e-06, "loss": 0.1161, "num_input_tokens_seen": 17421824, "step": 25860 }, { "epoch": 0.6318862531453839, "grad_norm": 9.733460426330566, "learning_rate": 1.995766514300412e-06, "loss": 0.0884, "num_input_tokens_seen": 17425024, "step": 25865 }, { "epoch": 0.6320084039772311, "grad_norm": 1.50833261013031, "learning_rate": 1.995758672072861e-06, "loss": 0.1705, "num_input_tokens_seen": 17428096, "step": 25870 }, { "epoch": 0.6321305548090782, "grad_norm": 3.84963059425354, "learning_rate": 1.995750822603885e-06, "loss": 0.0775, "num_input_tokens_seen": 17431360, "step": 25875 }, { "epoch": 0.6322527056409254, "grad_norm": 11.950239181518555, "learning_rate": 1.9957429658935415e-06, "loss": 0.1135, "num_input_tokens_seen": 17434816, "step": 25880 }, { "epoch": 0.6323748564727726, "grad_norm": 8.948583602905273, "learning_rate": 1.995735101941887e-06, "loss": 0.1094, "num_input_tokens_seen": 17438336, "step": 25885 }, { "epoch": 0.6324970073046198, "grad_norm": 27.25109100341797, "learning_rate": 1.995727230748979e-06, "loss": 0.0563, "num_input_tokens_seen": 17441536, "step": 25890 }, { "epoch": 0.632619158136467, "grad_norm": 1.836618423461914, "learning_rate": 1.995719352314875e-06, "loss": 0.1082, "num_input_tokens_seen": 17444992, "step": 25895 }, { "epoch": 0.632741308968314, "grad_norm": 12.529705047607422, "learning_rate": 1.995711466639632e-06, "loss": 0.1079, "num_input_tokens_seen": 17448192, "step": 25900 }, { "epoch": 0.6328634598001612, "grad_norm": 7.985225200653076, "learning_rate": 1.9957035737233072e-06, "loss": 0.0583, "num_input_tokens_seen": 17451456, "step": 25905 }, { "epoch": 0.6329856106320084, "grad_norm": 13.829870223999023, "learning_rate": 1.9956956735659583e-06, "loss": 0.0693, "num_input_tokens_seen": 17455168, "step": 25910 }, { "epoch": 0.6331077614638556, "grad_norm": 31.966915130615234, "learning_rate": 1.9956877661676427e-06, "loss": 0.0966, "num_input_tokens_seen": 17458688, "step": 25915 }, { "epoch": 0.6332299122957027, "grad_norm": 30.319499969482422, "learning_rate": 1.9956798515284178e-06, "loss": 0.171, "num_input_tokens_seen": 17462080, "step": 25920 }, { "epoch": 0.6333520631275499, "grad_norm": 35.57968521118164, "learning_rate": 1.9956719296483414e-06, "loss": 0.0922, "num_input_tokens_seen": 17465472, "step": 25925 }, { "epoch": 0.6334742139593971, "grad_norm": 7.720409870147705, "learning_rate": 1.9956640005274708e-06, "loss": 0.1561, "num_input_tokens_seen": 17469184, "step": 25930 }, { "epoch": 0.6335963647912443, "grad_norm": 27.032007217407227, "learning_rate": 1.9956560641658635e-06, "loss": 0.154, "num_input_tokens_seen": 17473344, "step": 25935 }, { "epoch": 0.6337185156230913, "grad_norm": 8.09611988067627, "learning_rate": 1.995648120563578e-06, "loss": 0.0313, "num_input_tokens_seen": 17476992, "step": 25940 }, { "epoch": 0.6338406664549385, "grad_norm": 17.054302215576172, "learning_rate": 1.9956401697206712e-06, "loss": 0.1835, "num_input_tokens_seen": 17480192, "step": 25945 }, { "epoch": 0.6339628172867857, "grad_norm": 27.591604232788086, "learning_rate": 1.9956322116372013e-06, "loss": 0.1739, "num_input_tokens_seen": 17483712, "step": 25950 }, { "epoch": 0.6340849681186329, "grad_norm": 31.46297264099121, "learning_rate": 1.9956242463132265e-06, "loss": 0.1891, "num_input_tokens_seen": 17486464, "step": 25955 }, { "epoch": 0.6342071189504801, "grad_norm": 20.843233108520508, "learning_rate": 1.9956162737488043e-06, "loss": 0.1866, "num_input_tokens_seen": 17490240, "step": 25960 }, { "epoch": 0.6343292697823272, "grad_norm": 6.045319557189941, "learning_rate": 1.9956082939439923e-06, "loss": 0.0477, "num_input_tokens_seen": 17493504, "step": 25965 }, { "epoch": 0.6344514206141744, "grad_norm": 1.8652979135513306, "learning_rate": 1.99560030689885e-06, "loss": 0.1017, "num_input_tokens_seen": 17496704, "step": 25970 }, { "epoch": 0.6345735714460216, "grad_norm": 17.12334632873535, "learning_rate": 1.9955923126134336e-06, "loss": 0.0958, "num_input_tokens_seen": 17499712, "step": 25975 }, { "epoch": 0.6346957222778687, "grad_norm": 1.6485508680343628, "learning_rate": 1.995584311087802e-06, "loss": 0.0124, "num_input_tokens_seen": 17503360, "step": 25980 }, { "epoch": 0.6348178731097158, "grad_norm": 24.18747329711914, "learning_rate": 1.995576302322014e-06, "loss": 0.0913, "num_input_tokens_seen": 17506944, "step": 25985 }, { "epoch": 0.634940023941563, "grad_norm": 0.19784998893737793, "learning_rate": 1.995568286316127e-06, "loss": 0.0773, "num_input_tokens_seen": 17510144, "step": 25990 }, { "epoch": 0.6350621747734102, "grad_norm": 1.5027730464935303, "learning_rate": 1.9955602630702004e-06, "loss": 0.1737, "num_input_tokens_seen": 17513216, "step": 25995 }, { "epoch": 0.6351843256052574, "grad_norm": 22.669113159179688, "learning_rate": 1.995552232584291e-06, "loss": 0.1516, "num_input_tokens_seen": 17516608, "step": 26000 }, { "epoch": 0.6353064764371046, "grad_norm": 0.15162105858325958, "learning_rate": 1.9955441948584584e-06, "loss": 0.1055, "num_input_tokens_seen": 17519872, "step": 26005 }, { "epoch": 0.6354286272689517, "grad_norm": 2.181324005126953, "learning_rate": 1.9955361498927604e-06, "loss": 0.1741, "num_input_tokens_seen": 17524032, "step": 26010 }, { "epoch": 0.6355507781007989, "grad_norm": 13.539152145385742, "learning_rate": 1.995528097687256e-06, "loss": 0.081, "num_input_tokens_seen": 17527168, "step": 26015 }, { "epoch": 0.635672928932646, "grad_norm": 5.645986557006836, "learning_rate": 1.995520038242003e-06, "loss": 0.1234, "num_input_tokens_seen": 17530240, "step": 26020 }, { "epoch": 0.6357950797644932, "grad_norm": 11.694540977478027, "learning_rate": 1.995511971557061e-06, "loss": 0.1828, "num_input_tokens_seen": 17533440, "step": 26025 }, { "epoch": 0.6359172305963403, "grad_norm": 23.707487106323242, "learning_rate": 1.9955038976324882e-06, "loss": 0.1446, "num_input_tokens_seen": 17536768, "step": 26030 }, { "epoch": 0.6360393814281875, "grad_norm": 0.40436723828315735, "learning_rate": 1.995495816468343e-06, "loss": 0.022, "num_input_tokens_seen": 17540352, "step": 26035 }, { "epoch": 0.6361615322600347, "grad_norm": 0.8619459867477417, "learning_rate": 1.9954877280646847e-06, "loss": 0.0648, "num_input_tokens_seen": 17543616, "step": 26040 }, { "epoch": 0.6362836830918819, "grad_norm": 12.597610473632812, "learning_rate": 1.995479632421572e-06, "loss": 0.0706, "num_input_tokens_seen": 17547520, "step": 26045 }, { "epoch": 0.6364058339237291, "grad_norm": 2.8348464965820312, "learning_rate": 1.9954715295390634e-06, "loss": 0.1602, "num_input_tokens_seen": 17550592, "step": 26050 }, { "epoch": 0.6365279847555761, "grad_norm": 0.8372215032577515, "learning_rate": 1.995463419417218e-06, "loss": 0.0806, "num_input_tokens_seen": 17553472, "step": 26055 }, { "epoch": 0.6366501355874233, "grad_norm": 0.1670871526002884, "learning_rate": 1.9954553020560952e-06, "loss": 0.1826, "num_input_tokens_seen": 17556928, "step": 26060 }, { "epoch": 0.6367722864192705, "grad_norm": 2.472623348236084, "learning_rate": 1.9954471774557536e-06, "loss": 0.0413, "num_input_tokens_seen": 17560384, "step": 26065 }, { "epoch": 0.6368944372511177, "grad_norm": 28.401121139526367, "learning_rate": 1.995439045616252e-06, "loss": 0.1716, "num_input_tokens_seen": 17563584, "step": 26070 }, { "epoch": 0.6370165880829648, "grad_norm": 36.34634780883789, "learning_rate": 1.9954309065376504e-06, "loss": 0.1879, "num_input_tokens_seen": 17566784, "step": 26075 }, { "epoch": 0.637138738914812, "grad_norm": 19.85298728942871, "learning_rate": 1.9954227602200075e-06, "loss": 0.1128, "num_input_tokens_seen": 17569984, "step": 26080 }, { "epoch": 0.6372608897466592, "grad_norm": 13.330507278442383, "learning_rate": 1.995414606663382e-06, "loss": 0.099, "num_input_tokens_seen": 17573504, "step": 26085 }, { "epoch": 0.6373830405785064, "grad_norm": 38.39118576049805, "learning_rate": 1.995406445867834e-06, "loss": 0.1222, "num_input_tokens_seen": 17577472, "step": 26090 }, { "epoch": 0.6375051914103536, "grad_norm": 17.758310317993164, "learning_rate": 1.9953982778334232e-06, "loss": 0.047, "num_input_tokens_seen": 17581504, "step": 26095 }, { "epoch": 0.6376273422422006, "grad_norm": 7.126058101654053, "learning_rate": 1.995390102560208e-06, "loss": 0.1131, "num_input_tokens_seen": 17584512, "step": 26100 }, { "epoch": 0.6377494930740478, "grad_norm": 34.98472595214844, "learning_rate": 1.995381920048248e-06, "loss": 0.3489, "num_input_tokens_seen": 17587648, "step": 26105 }, { "epoch": 0.637871643905895, "grad_norm": 64.70793151855469, "learning_rate": 1.995373730297603e-06, "loss": 0.12, "num_input_tokens_seen": 17591296, "step": 26110 }, { "epoch": 0.6379937947377422, "grad_norm": 21.996980667114258, "learning_rate": 1.9953655333083325e-06, "loss": 0.2038, "num_input_tokens_seen": 17594368, "step": 26115 }, { "epoch": 0.6381159455695893, "grad_norm": 18.887956619262695, "learning_rate": 1.995357329080496e-06, "loss": 0.0196, "num_input_tokens_seen": 17597440, "step": 26120 }, { "epoch": 0.6382380964014365, "grad_norm": 20.388181686401367, "learning_rate": 1.995349117614154e-06, "loss": 0.1449, "num_input_tokens_seen": 17600512, "step": 26125 }, { "epoch": 0.6383602472332837, "grad_norm": 11.004735946655273, "learning_rate": 1.995340898909365e-06, "loss": 0.0482, "num_input_tokens_seen": 17603584, "step": 26130 }, { "epoch": 0.6384823980651309, "grad_norm": 1.7154580354690552, "learning_rate": 1.9953326729661894e-06, "loss": 0.132, "num_input_tokens_seen": 17606720, "step": 26135 }, { "epoch": 0.638604548896978, "grad_norm": 7.153088569641113, "learning_rate": 1.9953244397846867e-06, "loss": 0.0638, "num_input_tokens_seen": 17610176, "step": 26140 }, { "epoch": 0.6387266997288251, "grad_norm": 0.09430580586194992, "learning_rate": 1.995316199364917e-06, "loss": 0.1026, "num_input_tokens_seen": 17613440, "step": 26145 }, { "epoch": 0.6388488505606723, "grad_norm": 32.89512634277344, "learning_rate": 1.9953079517069404e-06, "loss": 0.1488, "num_input_tokens_seen": 17616448, "step": 26150 }, { "epoch": 0.6389710013925195, "grad_norm": 27.483251571655273, "learning_rate": 1.9952996968108163e-06, "loss": 0.1919, "num_input_tokens_seen": 17619968, "step": 26155 }, { "epoch": 0.6390931522243667, "grad_norm": 4.14311408996582, "learning_rate": 1.9952914346766055e-06, "loss": 0.0421, "num_input_tokens_seen": 17623040, "step": 26160 }, { "epoch": 0.6392153030562138, "grad_norm": 26.047630310058594, "learning_rate": 1.9952831653043673e-06, "loss": 0.1205, "num_input_tokens_seen": 17626560, "step": 26165 }, { "epoch": 0.639337453888061, "grad_norm": 8.898675918579102, "learning_rate": 1.9952748886941623e-06, "loss": 0.1684, "num_input_tokens_seen": 17629888, "step": 26170 }, { "epoch": 0.6394596047199081, "grad_norm": 6.229605197906494, "learning_rate": 1.995266604846051e-06, "loss": 0.12, "num_input_tokens_seen": 17633600, "step": 26175 }, { "epoch": 0.6395817555517553, "grad_norm": 44.44392395019531, "learning_rate": 1.9952583137600927e-06, "loss": 0.0854, "num_input_tokens_seen": 17637184, "step": 26180 }, { "epoch": 0.6397039063836025, "grad_norm": 13.342550277709961, "learning_rate": 1.995250015436349e-06, "loss": 0.1761, "num_input_tokens_seen": 17640384, "step": 26185 }, { "epoch": 0.6398260572154496, "grad_norm": 21.162038803100586, "learning_rate": 1.9952417098748787e-06, "loss": 0.1808, "num_input_tokens_seen": 17643456, "step": 26190 }, { "epoch": 0.6399482080472968, "grad_norm": 10.54422664642334, "learning_rate": 1.9952333970757437e-06, "loss": 0.1093, "num_input_tokens_seen": 17646912, "step": 26195 }, { "epoch": 0.640070358879144, "grad_norm": 7.449054718017578, "learning_rate": 1.995225077039003e-06, "loss": 0.0242, "num_input_tokens_seen": 17650304, "step": 26200 }, { "epoch": 0.6401925097109912, "grad_norm": 19.278846740722656, "learning_rate": 1.9952167497647183e-06, "loss": 0.1729, "num_input_tokens_seen": 17653504, "step": 26205 }, { "epoch": 0.6403146605428383, "grad_norm": 7.0912628173828125, "learning_rate": 1.9952084152529496e-06, "loss": 0.0961, "num_input_tokens_seen": 17656768, "step": 26210 }, { "epoch": 0.6404368113746854, "grad_norm": 12.94286060333252, "learning_rate": 1.9952000735037577e-06, "loss": 0.0957, "num_input_tokens_seen": 17660032, "step": 26215 }, { "epoch": 0.6405589622065326, "grad_norm": 0.9649350643157959, "learning_rate": 1.995191724517203e-06, "loss": 0.1386, "num_input_tokens_seen": 17663424, "step": 26220 }, { "epoch": 0.6406811130383798, "grad_norm": 0.6071916222572327, "learning_rate": 1.9951833682933468e-06, "loss": 0.0311, "num_input_tokens_seen": 17666496, "step": 26225 }, { "epoch": 0.6408032638702269, "grad_norm": 17.107208251953125, "learning_rate": 1.995175004832249e-06, "loss": 0.0999, "num_input_tokens_seen": 17669632, "step": 26230 }, { "epoch": 0.6409254147020741, "grad_norm": 32.13701629638672, "learning_rate": 1.9951666341339717e-06, "loss": 0.1355, "num_input_tokens_seen": 17673408, "step": 26235 }, { "epoch": 0.6410475655339213, "grad_norm": 37.73835372924805, "learning_rate": 1.9951582561985743e-06, "loss": 0.1531, "num_input_tokens_seen": 17676416, "step": 26240 }, { "epoch": 0.6411697163657685, "grad_norm": 22.128427505493164, "learning_rate": 1.995149871026118e-06, "loss": 0.1753, "num_input_tokens_seen": 17679744, "step": 26245 }, { "epoch": 0.6412918671976157, "grad_norm": 10.967851638793945, "learning_rate": 1.995141478616665e-06, "loss": 0.1188, "num_input_tokens_seen": 17682944, "step": 26250 }, { "epoch": 0.6414140180294627, "grad_norm": 13.98348331451416, "learning_rate": 1.995133078970275e-06, "loss": 0.1189, "num_input_tokens_seen": 17686336, "step": 26255 }, { "epoch": 0.6415361688613099, "grad_norm": 5.867333889007568, "learning_rate": 1.99512467208701e-06, "loss": 0.099, "num_input_tokens_seen": 17689792, "step": 26260 }, { "epoch": 0.6416583196931571, "grad_norm": 12.464974403381348, "learning_rate": 1.9951162579669306e-06, "loss": 0.0971, "num_input_tokens_seen": 17693888, "step": 26265 }, { "epoch": 0.6417804705250043, "grad_norm": 26.993873596191406, "learning_rate": 1.995107836610098e-06, "loss": 0.1654, "num_input_tokens_seen": 17697152, "step": 26270 }, { "epoch": 0.6419026213568514, "grad_norm": 2.5690789222717285, "learning_rate": 1.9950994080165736e-06, "loss": 0.0956, "num_input_tokens_seen": 17700224, "step": 26275 }, { "epoch": 0.6420247721886986, "grad_norm": 11.626635551452637, "learning_rate": 1.9950909721864184e-06, "loss": 0.116, "num_input_tokens_seen": 17703488, "step": 26280 }, { "epoch": 0.6421469230205458, "grad_norm": 0.5176971554756165, "learning_rate": 1.9950825291196944e-06, "loss": 0.0592, "num_input_tokens_seen": 17706688, "step": 26285 }, { "epoch": 0.642269073852393, "grad_norm": 32.07765579223633, "learning_rate": 1.995074078816462e-06, "loss": 0.0388, "num_input_tokens_seen": 17709888, "step": 26290 }, { "epoch": 0.6423912246842401, "grad_norm": 0.510884702205658, "learning_rate": 1.9950656212767844e-06, "loss": 0.0883, "num_input_tokens_seen": 17713536, "step": 26295 }, { "epoch": 0.6425133755160872, "grad_norm": 13.728007316589355, "learning_rate": 1.995057156500721e-06, "loss": 0.1991, "num_input_tokens_seen": 17716928, "step": 26300 }, { "epoch": 0.6426355263479344, "grad_norm": 9.4790678024292, "learning_rate": 1.9950486844883348e-06, "loss": 0.134, "num_input_tokens_seen": 17720128, "step": 26305 }, { "epoch": 0.6427576771797816, "grad_norm": 29.495006561279297, "learning_rate": 1.9950402052396866e-06, "loss": 0.1259, "num_input_tokens_seen": 17723520, "step": 26310 }, { "epoch": 0.6428798280116288, "grad_norm": 33.22211837768555, "learning_rate": 1.9950317187548385e-06, "loss": 0.202, "num_input_tokens_seen": 17727232, "step": 26315 }, { "epoch": 0.6430019788434759, "grad_norm": 13.970088005065918, "learning_rate": 1.995023225033852e-06, "loss": 0.1676, "num_input_tokens_seen": 17730496, "step": 26320 }, { "epoch": 0.6431241296753231, "grad_norm": 12.428041458129883, "learning_rate": 1.9950147240767895e-06, "loss": 0.3496, "num_input_tokens_seen": 17734336, "step": 26325 }, { "epoch": 0.6432462805071703, "grad_norm": 13.682167053222656, "learning_rate": 1.9950062158837118e-06, "loss": 0.0899, "num_input_tokens_seen": 17737792, "step": 26330 }, { "epoch": 0.6433684313390174, "grad_norm": 16.482370376586914, "learning_rate": 1.9949977004546814e-06, "loss": 0.1277, "num_input_tokens_seen": 17741312, "step": 26335 }, { "epoch": 0.6434905821708646, "grad_norm": 3.444857358932495, "learning_rate": 1.99498917778976e-06, "loss": 0.1018, "num_input_tokens_seen": 17744640, "step": 26340 }, { "epoch": 0.6436127330027117, "grad_norm": 20.923669815063477, "learning_rate": 1.9949806478890095e-06, "loss": 0.143, "num_input_tokens_seen": 17748096, "step": 26345 }, { "epoch": 0.6437348838345589, "grad_norm": 3.6736643314361572, "learning_rate": 1.9949721107524924e-06, "loss": 0.0545, "num_input_tokens_seen": 17751424, "step": 26350 }, { "epoch": 0.6438570346664061, "grad_norm": 0.2368733286857605, "learning_rate": 1.9949635663802705e-06, "loss": 0.1333, "num_input_tokens_seen": 17754560, "step": 26355 }, { "epoch": 0.6439791854982533, "grad_norm": 2.8474624156951904, "learning_rate": 1.994955014772406e-06, "loss": 0.0863, "num_input_tokens_seen": 17757952, "step": 26360 }, { "epoch": 0.6441013363301004, "grad_norm": 31.334247589111328, "learning_rate": 1.9949464559289607e-06, "loss": 0.2483, "num_input_tokens_seen": 17761472, "step": 26365 }, { "epoch": 0.6442234871619476, "grad_norm": 8.788104057312012, "learning_rate": 1.9949378898499974e-06, "loss": 0.0981, "num_input_tokens_seen": 17764608, "step": 26370 }, { "epoch": 0.6443456379937947, "grad_norm": 0.9639668464660645, "learning_rate": 1.9949293165355783e-06, "loss": 0.1006, "num_input_tokens_seen": 17767872, "step": 26375 }, { "epoch": 0.6444677888256419, "grad_norm": 13.94326114654541, "learning_rate": 1.9949207359857656e-06, "loss": 0.0816, "num_input_tokens_seen": 17771136, "step": 26380 }, { "epoch": 0.6445899396574891, "grad_norm": 8.497142791748047, "learning_rate": 1.9949121482006216e-06, "loss": 0.1563, "num_input_tokens_seen": 17774464, "step": 26385 }, { "epoch": 0.6447120904893362, "grad_norm": 5.612656116485596, "learning_rate": 1.9949035531802086e-06, "loss": 0.0626, "num_input_tokens_seen": 17777664, "step": 26390 }, { "epoch": 0.6448342413211834, "grad_norm": 24.19043731689453, "learning_rate": 1.9948949509245897e-06, "loss": 0.1521, "num_input_tokens_seen": 17780672, "step": 26395 }, { "epoch": 0.6449563921530306, "grad_norm": 4.007930278778076, "learning_rate": 1.994886341433827e-06, "loss": 0.1073, "num_input_tokens_seen": 17783744, "step": 26400 }, { "epoch": 0.6450785429848778, "grad_norm": 24.8145809173584, "learning_rate": 1.994877724707983e-06, "loss": 0.225, "num_input_tokens_seen": 17787904, "step": 26405 }, { "epoch": 0.6452006938167248, "grad_norm": 12.867918014526367, "learning_rate": 1.994869100747121e-06, "loss": 0.044, "num_input_tokens_seen": 17791424, "step": 26410 }, { "epoch": 0.645322844648572, "grad_norm": 1.9785236120224, "learning_rate": 1.994860469551303e-06, "loss": 0.0557, "num_input_tokens_seen": 17794624, "step": 26415 }, { "epoch": 0.6454449954804192, "grad_norm": 11.472222328186035, "learning_rate": 1.9948518311205925e-06, "loss": 0.1954, "num_input_tokens_seen": 17797888, "step": 26420 }, { "epoch": 0.6455671463122664, "grad_norm": 2.1420435905456543, "learning_rate": 1.9948431854550517e-06, "loss": 0.084, "num_input_tokens_seen": 17801216, "step": 26425 }, { "epoch": 0.6456892971441136, "grad_norm": 9.826559066772461, "learning_rate": 1.9948345325547433e-06, "loss": 0.0945, "num_input_tokens_seen": 17804352, "step": 26430 }, { "epoch": 0.6458114479759607, "grad_norm": 2.30534291267395, "learning_rate": 1.994825872419731e-06, "loss": 0.0385, "num_input_tokens_seen": 17807616, "step": 26435 }, { "epoch": 0.6459335988078079, "grad_norm": 11.928797721862793, "learning_rate": 1.994817205050077e-06, "loss": 0.1779, "num_input_tokens_seen": 17810752, "step": 26440 }, { "epoch": 0.6460557496396551, "grad_norm": 0.038654062896966934, "learning_rate": 1.9948085304458453e-06, "loss": 0.2423, "num_input_tokens_seen": 17813824, "step": 26445 }, { "epoch": 0.6461779004715023, "grad_norm": 2.4233062267303467, "learning_rate": 1.994799848607098e-06, "loss": 0.0702, "num_input_tokens_seen": 17816960, "step": 26450 }, { "epoch": 0.6463000513033493, "grad_norm": 0.9603357315063477, "learning_rate": 1.9947911595338986e-06, "loss": 0.1297, "num_input_tokens_seen": 17820032, "step": 26455 }, { "epoch": 0.6464222021351965, "grad_norm": 0.8634178042411804, "learning_rate": 1.9947824632263102e-06, "loss": 0.0541, "num_input_tokens_seen": 17823680, "step": 26460 }, { "epoch": 0.6465443529670437, "grad_norm": 36.10398864746094, "learning_rate": 1.9947737596843964e-06, "loss": 0.1016, "num_input_tokens_seen": 17826880, "step": 26465 }, { "epoch": 0.6466665037988909, "grad_norm": 0.6066563725471497, "learning_rate": 1.9947650489082207e-06, "loss": 0.1124, "num_input_tokens_seen": 17830656, "step": 26470 }, { "epoch": 0.646788654630738, "grad_norm": 0.11005598306655884, "learning_rate": 1.9947563308978453e-06, "loss": 0.1057, "num_input_tokens_seen": 17834048, "step": 26475 }, { "epoch": 0.6469108054625852, "grad_norm": 0.32724529504776, "learning_rate": 1.9947476056533347e-06, "loss": 0.1586, "num_input_tokens_seen": 17837504, "step": 26480 }, { "epoch": 0.6470329562944324, "grad_norm": 28.255603790283203, "learning_rate": 1.994738873174752e-06, "loss": 0.1599, "num_input_tokens_seen": 17840704, "step": 26485 }, { "epoch": 0.6471551071262795, "grad_norm": 8.742977142333984, "learning_rate": 1.9947301334621603e-06, "loss": 0.1022, "num_input_tokens_seen": 17844736, "step": 26490 }, { "epoch": 0.6472772579581267, "grad_norm": 28.56509017944336, "learning_rate": 1.9947213865156237e-06, "loss": 0.1502, "num_input_tokens_seen": 17848384, "step": 26495 }, { "epoch": 0.6473994087899738, "grad_norm": 6.090130805969238, "learning_rate": 1.994712632335206e-06, "loss": 0.0923, "num_input_tokens_seen": 17851584, "step": 26500 }, { "epoch": 0.647521559621821, "grad_norm": 0.3652131259441376, "learning_rate": 1.9947038709209696e-06, "loss": 0.1213, "num_input_tokens_seen": 17855040, "step": 26505 }, { "epoch": 0.6476437104536682, "grad_norm": 14.173577308654785, "learning_rate": 1.99469510227298e-06, "loss": 0.1322, "num_input_tokens_seen": 17858880, "step": 26510 }, { "epoch": 0.6477658612855154, "grad_norm": 10.040029525756836, "learning_rate": 1.9946863263912995e-06, "loss": 0.0871, "num_input_tokens_seen": 17861952, "step": 26515 }, { "epoch": 0.6478880121173625, "grad_norm": 18.37601661682129, "learning_rate": 1.9946775432759927e-06, "loss": 0.0217, "num_input_tokens_seen": 17865024, "step": 26520 }, { "epoch": 0.6480101629492097, "grad_norm": 0.8798476457595825, "learning_rate": 1.994668752927123e-06, "loss": 0.0965, "num_input_tokens_seen": 17867968, "step": 26525 }, { "epoch": 0.6481323137810568, "grad_norm": 1.0580397844314575, "learning_rate": 1.9946599553447554e-06, "loss": 0.0301, "num_input_tokens_seen": 17871744, "step": 26530 }, { "epoch": 0.648254464612904, "grad_norm": 27.158893585205078, "learning_rate": 1.9946511505289524e-06, "loss": 0.0995, "num_input_tokens_seen": 17876992, "step": 26535 }, { "epoch": 0.6483766154447512, "grad_norm": 4.429322719573975, "learning_rate": 1.9946423384797785e-06, "loss": 0.0387, "num_input_tokens_seen": 17880064, "step": 26540 }, { "epoch": 0.6484987662765983, "grad_norm": 12.550520896911621, "learning_rate": 1.9946335191972986e-06, "loss": 0.0889, "num_input_tokens_seen": 17883840, "step": 26545 }, { "epoch": 0.6486209171084455, "grad_norm": 59.04765701293945, "learning_rate": 1.9946246926815758e-06, "loss": 0.1818, "num_input_tokens_seen": 17887424, "step": 26550 }, { "epoch": 0.6487430679402927, "grad_norm": 56.20634841918945, "learning_rate": 1.994615858932675e-06, "loss": 0.2036, "num_input_tokens_seen": 17890752, "step": 26555 }, { "epoch": 0.6488652187721399, "grad_norm": 29.603675842285156, "learning_rate": 1.99460701795066e-06, "loss": 0.1904, "num_input_tokens_seen": 17894528, "step": 26560 }, { "epoch": 0.648987369603987, "grad_norm": 30.748910903930664, "learning_rate": 1.994598169735595e-06, "loss": 0.1425, "num_input_tokens_seen": 17897600, "step": 26565 }, { "epoch": 0.6491095204358341, "grad_norm": 46.07027053833008, "learning_rate": 1.994589314287545e-06, "loss": 0.2702, "num_input_tokens_seen": 17901248, "step": 26570 }, { "epoch": 0.6492316712676813, "grad_norm": 1.5052845478057861, "learning_rate": 1.9945804516065737e-06, "loss": 0.1467, "num_input_tokens_seen": 17904448, "step": 26575 }, { "epoch": 0.6493538220995285, "grad_norm": 26.738313674926758, "learning_rate": 1.9945715816927464e-06, "loss": 0.1725, "num_input_tokens_seen": 17907968, "step": 26580 }, { "epoch": 0.6494759729313757, "grad_norm": 6.558767795562744, "learning_rate": 1.9945627045461263e-06, "loss": 0.027, "num_input_tokens_seen": 17911232, "step": 26585 }, { "epoch": 0.6495981237632228, "grad_norm": 11.350381851196289, "learning_rate": 1.9945538201667792e-06, "loss": 0.1729, "num_input_tokens_seen": 17914496, "step": 26590 }, { "epoch": 0.64972027459507, "grad_norm": 4.032224178314209, "learning_rate": 1.9945449285547694e-06, "loss": 0.0971, "num_input_tokens_seen": 17917760, "step": 26595 }, { "epoch": 0.6498424254269172, "grad_norm": 22.45256996154785, "learning_rate": 1.9945360297101607e-06, "loss": 0.1367, "num_input_tokens_seen": 17921280, "step": 26600 }, { "epoch": 0.6499645762587644, "grad_norm": 9.463356971740723, "learning_rate": 1.994527123633019e-06, "loss": 0.1497, "num_input_tokens_seen": 17924288, "step": 26605 }, { "epoch": 0.6500867270906114, "grad_norm": 0.3891908824443817, "learning_rate": 1.9945182103234085e-06, "loss": 0.0734, "num_input_tokens_seen": 17928192, "step": 26610 }, { "epoch": 0.6502088779224586, "grad_norm": 1.3331596851348877, "learning_rate": 1.9945092897813937e-06, "loss": 0.0735, "num_input_tokens_seen": 17931584, "step": 26615 }, { "epoch": 0.6503310287543058, "grad_norm": 8.364001274108887, "learning_rate": 1.99450036200704e-06, "loss": 0.0445, "num_input_tokens_seen": 17935296, "step": 26620 }, { "epoch": 0.650453179586153, "grad_norm": 3.21260929107666, "learning_rate": 1.9944914270004126e-06, "loss": 0.132, "num_input_tokens_seen": 17938496, "step": 26625 }, { "epoch": 0.6505753304180002, "grad_norm": 19.593875885009766, "learning_rate": 1.994482484761576e-06, "loss": 0.1255, "num_input_tokens_seen": 17941440, "step": 26630 }, { "epoch": 0.6506974812498473, "grad_norm": 11.708880424499512, "learning_rate": 1.994473535290595e-06, "loss": 0.1735, "num_input_tokens_seen": 17944576, "step": 26635 }, { "epoch": 0.6508196320816945, "grad_norm": 8.608492851257324, "learning_rate": 1.994464578587535e-06, "loss": 0.0996, "num_input_tokens_seen": 17948288, "step": 26640 }, { "epoch": 0.6509417829135417, "grad_norm": 3.7675461769104004, "learning_rate": 1.9944556146524613e-06, "loss": 0.1026, "num_input_tokens_seen": 17951360, "step": 26645 }, { "epoch": 0.6510639337453888, "grad_norm": 0.9686684608459473, "learning_rate": 1.9944466434854386e-06, "loss": 0.0945, "num_input_tokens_seen": 17954944, "step": 26650 }, { "epoch": 0.6511860845772359, "grad_norm": 1.992281198501587, "learning_rate": 1.9944376650865325e-06, "loss": 0.117, "num_input_tokens_seen": 17958080, "step": 26655 }, { "epoch": 0.6513082354090831, "grad_norm": 23.731996536254883, "learning_rate": 1.994428679455808e-06, "loss": 0.1298, "num_input_tokens_seen": 17961664, "step": 26660 }, { "epoch": 0.6514303862409303, "grad_norm": 0.7598839998245239, "learning_rate": 1.9944196865933313e-06, "loss": 0.0816, "num_input_tokens_seen": 17964672, "step": 26665 }, { "epoch": 0.6515525370727775, "grad_norm": 23.543251037597656, "learning_rate": 1.994410686499167e-06, "loss": 0.2047, "num_input_tokens_seen": 17967808, "step": 26670 }, { "epoch": 0.6516746879046247, "grad_norm": 14.285406112670898, "learning_rate": 1.9944016791733806e-06, "loss": 0.1808, "num_input_tokens_seen": 17971200, "step": 26675 }, { "epoch": 0.6517968387364718, "grad_norm": 9.57889461517334, "learning_rate": 1.9943926646160377e-06, "loss": 0.077, "num_input_tokens_seen": 17974528, "step": 26680 }, { "epoch": 0.651918989568319, "grad_norm": 9.059340476989746, "learning_rate": 1.994383642827204e-06, "loss": 0.0653, "num_input_tokens_seen": 17977984, "step": 26685 }, { "epoch": 0.6520411404001661, "grad_norm": 15.571893692016602, "learning_rate": 1.9943746138069446e-06, "loss": 0.1219, "num_input_tokens_seen": 17980800, "step": 26690 }, { "epoch": 0.6521632912320133, "grad_norm": 7.93010139465332, "learning_rate": 1.9943655775553257e-06, "loss": 0.1261, "num_input_tokens_seen": 17984128, "step": 26695 }, { "epoch": 0.6522854420638604, "grad_norm": 9.046224594116211, "learning_rate": 1.9943565340724133e-06, "loss": 0.0388, "num_input_tokens_seen": 17987776, "step": 26700 }, { "epoch": 0.6524075928957076, "grad_norm": 19.525766372680664, "learning_rate": 1.994347483358272e-06, "loss": 0.1485, "num_input_tokens_seen": 17991168, "step": 26705 }, { "epoch": 0.6525297437275548, "grad_norm": 15.36963176727295, "learning_rate": 1.9943384254129693e-06, "loss": 0.1009, "num_input_tokens_seen": 17994432, "step": 26710 }, { "epoch": 0.652651894559402, "grad_norm": 19.087261199951172, "learning_rate": 1.9943293602365694e-06, "loss": 0.3058, "num_input_tokens_seen": 17997696, "step": 26715 }, { "epoch": 0.6527740453912492, "grad_norm": 36.39265823364258, "learning_rate": 1.994320287829139e-06, "loss": 0.1395, "num_input_tokens_seen": 18001408, "step": 26720 }, { "epoch": 0.6528961962230962, "grad_norm": 17.34744644165039, "learning_rate": 1.9943112081907443e-06, "loss": 0.0725, "num_input_tokens_seen": 18004480, "step": 26725 }, { "epoch": 0.6530183470549434, "grad_norm": 8.52084732055664, "learning_rate": 1.9943021213214508e-06, "loss": 0.0555, "num_input_tokens_seen": 18007616, "step": 26730 }, { "epoch": 0.6531404978867906, "grad_norm": 8.265021324157715, "learning_rate": 1.994293027221325e-06, "loss": 0.0594, "num_input_tokens_seen": 18011072, "step": 26735 }, { "epoch": 0.6532626487186378, "grad_norm": 20.10539436340332, "learning_rate": 1.994283925890433e-06, "loss": 0.1316, "num_input_tokens_seen": 18013824, "step": 26740 }, { "epoch": 0.6533847995504849, "grad_norm": 16.809226989746094, "learning_rate": 1.9942748173288408e-06, "loss": 0.0942, "num_input_tokens_seen": 18017024, "step": 26745 }, { "epoch": 0.6535069503823321, "grad_norm": 11.143219947814941, "learning_rate": 1.9942657015366145e-06, "loss": 0.1295, "num_input_tokens_seen": 18020736, "step": 26750 }, { "epoch": 0.6536291012141793, "grad_norm": 19.93425178527832, "learning_rate": 1.9942565785138207e-06, "loss": 0.0743, "num_input_tokens_seen": 18024000, "step": 26755 }, { "epoch": 0.6537512520460265, "grad_norm": 2.00927996635437, "learning_rate": 1.994247448260526e-06, "loss": 0.1127, "num_input_tokens_seen": 18027072, "step": 26760 }, { "epoch": 0.6538734028778735, "grad_norm": 34.23299026489258, "learning_rate": 1.994238310776796e-06, "loss": 0.1408, "num_input_tokens_seen": 18030464, "step": 26765 }, { "epoch": 0.6539955537097207, "grad_norm": 9.078926086425781, "learning_rate": 1.9942291660626974e-06, "loss": 0.1588, "num_input_tokens_seen": 18033856, "step": 26770 }, { "epoch": 0.6541177045415679, "grad_norm": 13.535733222961426, "learning_rate": 1.9942200141182973e-06, "loss": 0.0567, "num_input_tokens_seen": 18037056, "step": 26775 }, { "epoch": 0.6542398553734151, "grad_norm": 1.5711990594863892, "learning_rate": 1.9942108549436617e-06, "loss": 0.0857, "num_input_tokens_seen": 18040064, "step": 26780 }, { "epoch": 0.6543620062052623, "grad_norm": 0.7724623084068298, "learning_rate": 1.9942016885388575e-06, "loss": 0.0756, "num_input_tokens_seen": 18043456, "step": 26785 }, { "epoch": 0.6544841570371094, "grad_norm": 25.43880844116211, "learning_rate": 1.994192514903951e-06, "loss": 0.0706, "num_input_tokens_seen": 18046656, "step": 26790 }, { "epoch": 0.6546063078689566, "grad_norm": 18.88958740234375, "learning_rate": 1.994183334039009e-06, "loss": 0.0439, "num_input_tokens_seen": 18050624, "step": 26795 }, { "epoch": 0.6547284587008038, "grad_norm": 25.788715362548828, "learning_rate": 1.9941741459440987e-06, "loss": 0.0505, "num_input_tokens_seen": 18054720, "step": 26800 }, { "epoch": 0.654850609532651, "grad_norm": 0.5160311460494995, "learning_rate": 1.9941649506192866e-06, "loss": 0.1256, "num_input_tokens_seen": 18057856, "step": 26805 }, { "epoch": 0.654972760364498, "grad_norm": 37.93132781982422, "learning_rate": 1.994155748064639e-06, "loss": 0.1938, "num_input_tokens_seen": 18061568, "step": 26810 }, { "epoch": 0.6550949111963452, "grad_norm": 9.682503700256348, "learning_rate": 1.994146538280224e-06, "loss": 0.1578, "num_input_tokens_seen": 18065408, "step": 26815 }, { "epoch": 0.6552170620281924, "grad_norm": 32.54736328125, "learning_rate": 1.994137321266108e-06, "loss": 0.0594, "num_input_tokens_seen": 18069184, "step": 26820 }, { "epoch": 0.6553392128600396, "grad_norm": 3.4581313133239746, "learning_rate": 1.994128097022358e-06, "loss": 0.0316, "num_input_tokens_seen": 18073344, "step": 26825 }, { "epoch": 0.6554613636918868, "grad_norm": 29.386049270629883, "learning_rate": 1.9941188655490406e-06, "loss": 0.2249, "num_input_tokens_seen": 18077056, "step": 26830 }, { "epoch": 0.6555835145237339, "grad_norm": 0.32932719588279724, "learning_rate": 1.994109626846224e-06, "loss": 0.1366, "num_input_tokens_seen": 18080576, "step": 26835 }, { "epoch": 0.6557056653555811, "grad_norm": 4.738411903381348, "learning_rate": 1.9941003809139746e-06, "loss": 0.0704, "num_input_tokens_seen": 18084032, "step": 26840 }, { "epoch": 0.6558278161874282, "grad_norm": 38.715606689453125, "learning_rate": 1.99409112775236e-06, "loss": 0.0993, "num_input_tokens_seen": 18087104, "step": 26845 }, { "epoch": 0.6559499670192754, "grad_norm": 24.37164878845215, "learning_rate": 1.994081867361447e-06, "loss": 0.1199, "num_input_tokens_seen": 18090432, "step": 26850 }, { "epoch": 0.6560721178511225, "grad_norm": 4.3538899421691895, "learning_rate": 1.9940725997413037e-06, "loss": 0.2395, "num_input_tokens_seen": 18093632, "step": 26855 }, { "epoch": 0.6561942686829697, "grad_norm": 0.20880497992038727, "learning_rate": 1.994063324891997e-06, "loss": 0.1031, "num_input_tokens_seen": 18096768, "step": 26860 }, { "epoch": 0.6563164195148169, "grad_norm": 17.8417911529541, "learning_rate": 1.9940540428135942e-06, "loss": 0.107, "num_input_tokens_seen": 18100160, "step": 26865 }, { "epoch": 0.6564385703466641, "grad_norm": 2.653338670730591, "learning_rate": 1.9940447535061627e-06, "loss": 0.126, "num_input_tokens_seen": 18103616, "step": 26870 }, { "epoch": 0.6565607211785113, "grad_norm": 10.069886207580566, "learning_rate": 1.994035456969771e-06, "loss": 0.1211, "num_input_tokens_seen": 18107392, "step": 26875 }, { "epoch": 0.6566828720103584, "grad_norm": 0.5909165143966675, "learning_rate": 1.994026153204486e-06, "loss": 0.0588, "num_input_tokens_seen": 18111552, "step": 26880 }, { "epoch": 0.6568050228422055, "grad_norm": 12.051314353942871, "learning_rate": 1.9940168422103752e-06, "loss": 0.052, "num_input_tokens_seen": 18114944, "step": 26885 }, { "epoch": 0.6569271736740527, "grad_norm": 16.518218994140625, "learning_rate": 1.9940075239875068e-06, "loss": 0.1785, "num_input_tokens_seen": 18118080, "step": 26890 }, { "epoch": 0.6570493245058999, "grad_norm": 0.5696353316307068, "learning_rate": 1.993998198535948e-06, "loss": 0.0843, "num_input_tokens_seen": 18121664, "step": 26895 }, { "epoch": 0.657171475337747, "grad_norm": 12.237276077270508, "learning_rate": 1.993988865855767e-06, "loss": 0.0735, "num_input_tokens_seen": 18124928, "step": 26900 }, { "epoch": 0.6572936261695942, "grad_norm": 18.05430793762207, "learning_rate": 1.9939795259470324e-06, "loss": 0.2564, "num_input_tokens_seen": 18128128, "step": 26905 }, { "epoch": 0.6574157770014414, "grad_norm": 2.649308919906616, "learning_rate": 1.9939701788098104e-06, "loss": 0.0907, "num_input_tokens_seen": 18131328, "step": 26910 }, { "epoch": 0.6575379278332886, "grad_norm": 17.577932357788086, "learning_rate": 1.99396082444417e-06, "loss": 0.098, "num_input_tokens_seen": 18134976, "step": 26915 }, { "epoch": 0.6576600786651358, "grad_norm": 1.0797964334487915, "learning_rate": 1.99395146285018e-06, "loss": 0.0583, "num_input_tokens_seen": 18138432, "step": 26920 }, { "epoch": 0.6577822294969828, "grad_norm": 21.48768424987793, "learning_rate": 1.993942094027907e-06, "loss": 0.151, "num_input_tokens_seen": 18141952, "step": 26925 }, { "epoch": 0.65790438032883, "grad_norm": 54.25938415527344, "learning_rate": 1.9939327179774198e-06, "loss": 0.1185, "num_input_tokens_seen": 18145920, "step": 26930 }, { "epoch": 0.6580265311606772, "grad_norm": 11.435982704162598, "learning_rate": 1.9939233346987863e-06, "loss": 0.1919, "num_input_tokens_seen": 18149440, "step": 26935 }, { "epoch": 0.6581486819925244, "grad_norm": 10.653462409973145, "learning_rate": 1.993913944192075e-06, "loss": 0.1845, "num_input_tokens_seen": 18152704, "step": 26940 }, { "epoch": 0.6582708328243715, "grad_norm": 16.521217346191406, "learning_rate": 1.9939045464573544e-06, "loss": 0.2145, "num_input_tokens_seen": 18155776, "step": 26945 }, { "epoch": 0.6583929836562187, "grad_norm": 21.310832977294922, "learning_rate": 1.993895141494693e-06, "loss": 0.1141, "num_input_tokens_seen": 18158912, "step": 26950 }, { "epoch": 0.6585151344880659, "grad_norm": 32.2432975769043, "learning_rate": 1.993885729304158e-06, "loss": 0.1262, "num_input_tokens_seen": 18162752, "step": 26955 }, { "epoch": 0.6586372853199131, "grad_norm": 0.8311634659767151, "learning_rate": 1.993876309885819e-06, "loss": 0.1028, "num_input_tokens_seen": 18166272, "step": 26960 }, { "epoch": 0.6587594361517602, "grad_norm": 15.096733093261719, "learning_rate": 1.993866883239744e-06, "loss": 0.1589, "num_input_tokens_seen": 18169600, "step": 26965 }, { "epoch": 0.6588815869836073, "grad_norm": 10.199955940246582, "learning_rate": 1.9938574493660023e-06, "loss": 0.0782, "num_input_tokens_seen": 18173568, "step": 26970 }, { "epoch": 0.6590037378154545, "grad_norm": 17.658369064331055, "learning_rate": 1.9938480082646613e-06, "loss": 0.1405, "num_input_tokens_seen": 18176576, "step": 26975 }, { "epoch": 0.6591258886473017, "grad_norm": 27.87171173095703, "learning_rate": 1.9938385599357907e-06, "loss": 0.0897, "num_input_tokens_seen": 18179840, "step": 26980 }, { "epoch": 0.6592480394791489, "grad_norm": 2.524864912033081, "learning_rate": 1.9938291043794585e-06, "loss": 0.1164, "num_input_tokens_seen": 18182784, "step": 26985 }, { "epoch": 0.659370190310996, "grad_norm": 17.257301330566406, "learning_rate": 1.9938196415957334e-06, "loss": 0.1907, "num_input_tokens_seen": 18185984, "step": 26990 }, { "epoch": 0.6594923411428432, "grad_norm": 0.8043541312217712, "learning_rate": 1.9938101715846853e-06, "loss": 0.0442, "num_input_tokens_seen": 18189056, "step": 26995 }, { "epoch": 0.6596144919746904, "grad_norm": 18.458202362060547, "learning_rate": 1.993800694346382e-06, "loss": 0.1052, "num_input_tokens_seen": 18192384, "step": 27000 }, { "epoch": 0.6597366428065375, "grad_norm": 12.032055854797363, "learning_rate": 1.9937912098808927e-06, "loss": 0.0961, "num_input_tokens_seen": 18195712, "step": 27005 }, { "epoch": 0.6598587936383846, "grad_norm": 8.875457763671875, "learning_rate": 1.9937817181882864e-06, "loss": 0.1263, "num_input_tokens_seen": 18198848, "step": 27010 }, { "epoch": 0.6599809444702318, "grad_norm": 1.453246831893921, "learning_rate": 1.9937722192686323e-06, "loss": 0.0561, "num_input_tokens_seen": 18202112, "step": 27015 }, { "epoch": 0.660103095302079, "grad_norm": 3.466822385787964, "learning_rate": 1.9937627131219995e-06, "loss": 0.071, "num_input_tokens_seen": 18205696, "step": 27020 }, { "epoch": 0.6602252461339262, "grad_norm": 40.0861930847168, "learning_rate": 1.9937531997484566e-06, "loss": 0.2277, "num_input_tokens_seen": 18209152, "step": 27025 }, { "epoch": 0.6603473969657734, "grad_norm": 1.530218243598938, "learning_rate": 1.993743679148073e-06, "loss": 0.1101, "num_input_tokens_seen": 18212352, "step": 27030 }, { "epoch": 0.6604695477976205, "grad_norm": 16.85233497619629, "learning_rate": 1.9937341513209183e-06, "loss": 0.1003, "num_input_tokens_seen": 18215680, "step": 27035 }, { "epoch": 0.6605916986294676, "grad_norm": 1.5461503267288208, "learning_rate": 1.9937246162670614e-06, "loss": 0.0563, "num_input_tokens_seen": 18219072, "step": 27040 }, { "epoch": 0.6607138494613148, "grad_norm": 11.609980583190918, "learning_rate": 1.9937150739865723e-06, "loss": 0.105, "num_input_tokens_seen": 18222464, "step": 27045 }, { "epoch": 0.660836000293162, "grad_norm": 17.705228805541992, "learning_rate": 1.9937055244795195e-06, "loss": 0.0846, "num_input_tokens_seen": 18226112, "step": 27050 }, { "epoch": 0.6609581511250091, "grad_norm": 10.436869621276855, "learning_rate": 1.993695967745973e-06, "loss": 0.1617, "num_input_tokens_seen": 18229184, "step": 27055 }, { "epoch": 0.6610803019568563, "grad_norm": 30.66120147705078, "learning_rate": 1.993686403786002e-06, "loss": 0.0614, "num_input_tokens_seen": 18232320, "step": 27060 }, { "epoch": 0.6612024527887035, "grad_norm": 5.639974594116211, "learning_rate": 1.9936768325996763e-06, "loss": 0.0612, "num_input_tokens_seen": 18235584, "step": 27065 }, { "epoch": 0.6613246036205507, "grad_norm": 0.6442601084709167, "learning_rate": 1.9936672541870656e-06, "loss": 0.0812, "num_input_tokens_seen": 18238656, "step": 27070 }, { "epoch": 0.6614467544523979, "grad_norm": 2.0177724361419678, "learning_rate": 1.993657668548239e-06, "loss": 0.0975, "num_input_tokens_seen": 18242240, "step": 27075 }, { "epoch": 0.661568905284245, "grad_norm": 0.5144067406654358, "learning_rate": 1.993648075683267e-06, "loss": 0.1443, "num_input_tokens_seen": 18245952, "step": 27080 }, { "epoch": 0.6616910561160921, "grad_norm": 3.1164534091949463, "learning_rate": 1.9936384755922185e-06, "loss": 0.1316, "num_input_tokens_seen": 18249088, "step": 27085 }, { "epoch": 0.6618132069479393, "grad_norm": 0.15728719532489777, "learning_rate": 1.993628868275164e-06, "loss": 0.0605, "num_input_tokens_seen": 18252672, "step": 27090 }, { "epoch": 0.6619353577797865, "grad_norm": 22.022380828857422, "learning_rate": 1.9936192537321733e-06, "loss": 0.2245, "num_input_tokens_seen": 18255744, "step": 27095 }, { "epoch": 0.6620575086116336, "grad_norm": 5.147634029388428, "learning_rate": 1.993609631963316e-06, "loss": 0.0751, "num_input_tokens_seen": 18259584, "step": 27100 }, { "epoch": 0.6621796594434808, "grad_norm": 2.5567846298217773, "learning_rate": 1.993600002968662e-06, "loss": 0.1795, "num_input_tokens_seen": 18262656, "step": 27105 }, { "epoch": 0.662301810275328, "grad_norm": 0.2086157351732254, "learning_rate": 1.9935903667482815e-06, "loss": 0.1977, "num_input_tokens_seen": 18266112, "step": 27110 }, { "epoch": 0.6624239611071752, "grad_norm": 25.842384338378906, "learning_rate": 1.993580723302245e-06, "loss": 0.1104, "num_input_tokens_seen": 18269248, "step": 27115 }, { "epoch": 0.6625461119390224, "grad_norm": 24.809724807739258, "learning_rate": 1.9935710726306215e-06, "loss": 0.1705, "num_input_tokens_seen": 18272640, "step": 27120 }, { "epoch": 0.6626682627708694, "grad_norm": 26.71112060546875, "learning_rate": 1.9935614147334825e-06, "loss": 0.116, "num_input_tokens_seen": 18276032, "step": 27125 }, { "epoch": 0.6627904136027166, "grad_norm": 6.851133346557617, "learning_rate": 1.9935517496108976e-06, "loss": 0.1621, "num_input_tokens_seen": 18279744, "step": 27130 }, { "epoch": 0.6629125644345638, "grad_norm": 19.518213272094727, "learning_rate": 1.9935420772629374e-06, "loss": 0.0469, "num_input_tokens_seen": 18282688, "step": 27135 }, { "epoch": 0.663034715266411, "grad_norm": 10.472986221313477, "learning_rate": 1.9935323976896713e-06, "loss": 0.1928, "num_input_tokens_seen": 18286080, "step": 27140 }, { "epoch": 0.6631568660982581, "grad_norm": 2.578057050704956, "learning_rate": 1.993522710891171e-06, "loss": 0.1215, "num_input_tokens_seen": 18289664, "step": 27145 }, { "epoch": 0.6632790169301053, "grad_norm": 5.96986722946167, "learning_rate": 1.993513016867506e-06, "loss": 0.0822, "num_input_tokens_seen": 18292608, "step": 27150 }, { "epoch": 0.6634011677619525, "grad_norm": 21.24750518798828, "learning_rate": 1.993503315618747e-06, "loss": 0.0923, "num_input_tokens_seen": 18295808, "step": 27155 }, { "epoch": 0.6635233185937996, "grad_norm": 15.410693168640137, "learning_rate": 1.993493607144965e-06, "loss": 0.1647, "num_input_tokens_seen": 18299264, "step": 27160 }, { "epoch": 0.6636454694256468, "grad_norm": 10.475720405578613, "learning_rate": 1.9934838914462303e-06, "loss": 0.1891, "num_input_tokens_seen": 18302336, "step": 27165 }, { "epoch": 0.6637676202574939, "grad_norm": 16.02503776550293, "learning_rate": 1.9934741685226133e-06, "loss": 0.0913, "num_input_tokens_seen": 18305792, "step": 27170 }, { "epoch": 0.6638897710893411, "grad_norm": 14.892638206481934, "learning_rate": 1.993464438374185e-06, "loss": 0.0621, "num_input_tokens_seen": 18308928, "step": 27175 }, { "epoch": 0.6640119219211883, "grad_norm": 33.46926498413086, "learning_rate": 1.993454701001016e-06, "loss": 0.1885, "num_input_tokens_seen": 18312064, "step": 27180 }, { "epoch": 0.6641340727530355, "grad_norm": 18.051313400268555, "learning_rate": 1.9934449564031774e-06, "loss": 0.1044, "num_input_tokens_seen": 18315712, "step": 27185 }, { "epoch": 0.6642562235848826, "grad_norm": 30.73536491394043, "learning_rate": 1.99343520458074e-06, "loss": 0.1608, "num_input_tokens_seen": 18319296, "step": 27190 }, { "epoch": 0.6643783744167298, "grad_norm": 14.0807466506958, "learning_rate": 1.993425445533774e-06, "loss": 0.0892, "num_input_tokens_seen": 18322688, "step": 27195 }, { "epoch": 0.664500525248577, "grad_norm": 17.66388702392578, "learning_rate": 1.993415679262351e-06, "loss": 0.031, "num_input_tokens_seen": 18326272, "step": 27200 }, { "epoch": 0.6646226760804241, "grad_norm": 2.5478765964508057, "learning_rate": 1.9934059057665428e-06, "loss": 0.1511, "num_input_tokens_seen": 18329216, "step": 27205 }, { "epoch": 0.6647448269122713, "grad_norm": 0.9261932373046875, "learning_rate": 1.993396125046419e-06, "loss": 0.0901, "num_input_tokens_seen": 18332864, "step": 27210 }, { "epoch": 0.6648669777441184, "grad_norm": 15.885643005371094, "learning_rate": 1.9933863371020515e-06, "loss": 0.1088, "num_input_tokens_seen": 18336064, "step": 27215 }, { "epoch": 0.6649891285759656, "grad_norm": 14.773920059204102, "learning_rate": 1.9933765419335114e-06, "loss": 0.0837, "num_input_tokens_seen": 18339584, "step": 27220 }, { "epoch": 0.6651112794078128, "grad_norm": 16.5922794342041, "learning_rate": 1.9933667395408703e-06, "loss": 0.1089, "num_input_tokens_seen": 18342656, "step": 27225 }, { "epoch": 0.66523343023966, "grad_norm": 19.870290756225586, "learning_rate": 1.9933569299241987e-06, "loss": 0.1059, "num_input_tokens_seen": 18346112, "step": 27230 }, { "epoch": 0.665355581071507, "grad_norm": 19.615880966186523, "learning_rate": 1.993347113083568e-06, "loss": 0.0226, "num_input_tokens_seen": 18349568, "step": 27235 }, { "epoch": 0.6654777319033542, "grad_norm": 0.1664038896560669, "learning_rate": 1.9933372890190503e-06, "loss": 0.024, "num_input_tokens_seen": 18352832, "step": 27240 }, { "epoch": 0.6655998827352014, "grad_norm": 26.959386825561523, "learning_rate": 1.9933274577307167e-06, "loss": 0.1599, "num_input_tokens_seen": 18355840, "step": 27245 }, { "epoch": 0.6657220335670486, "grad_norm": 40.0178108215332, "learning_rate": 1.993317619218639e-06, "loss": 0.176, "num_input_tokens_seen": 18359424, "step": 27250 }, { "epoch": 0.6658441843988958, "grad_norm": 35.85573959350586, "learning_rate": 1.9933077734828877e-06, "loss": 0.2333, "num_input_tokens_seen": 18362816, "step": 27255 }, { "epoch": 0.6659663352307429, "grad_norm": 33.02347946166992, "learning_rate": 1.993297920523535e-06, "loss": 0.088, "num_input_tokens_seen": 18366528, "step": 27260 }, { "epoch": 0.6660884860625901, "grad_norm": 16.668325424194336, "learning_rate": 1.9932880603406533e-06, "loss": 0.1244, "num_input_tokens_seen": 18370048, "step": 27265 }, { "epoch": 0.6662106368944373, "grad_norm": 0.7546607255935669, "learning_rate": 1.9932781929343135e-06, "loss": 0.0695, "num_input_tokens_seen": 18373120, "step": 27270 }, { "epoch": 0.6663327877262845, "grad_norm": 8.710541725158691, "learning_rate": 1.9932683183045872e-06, "loss": 0.182, "num_input_tokens_seen": 18376512, "step": 27275 }, { "epoch": 0.6664549385581315, "grad_norm": 12.80578899383545, "learning_rate": 1.993258436451547e-06, "loss": 0.1501, "num_input_tokens_seen": 18379776, "step": 27280 }, { "epoch": 0.6665770893899787, "grad_norm": 9.051602363586426, "learning_rate": 1.993248547375264e-06, "loss": 0.1145, "num_input_tokens_seen": 18383168, "step": 27285 }, { "epoch": 0.6666992402218259, "grad_norm": 14.54477596282959, "learning_rate": 1.9932386510758102e-06, "loss": 0.0978, "num_input_tokens_seen": 18386560, "step": 27290 }, { "epoch": 0.6668213910536731, "grad_norm": 29.884191513061523, "learning_rate": 1.9932287475532582e-06, "loss": 0.1649, "num_input_tokens_seen": 18390208, "step": 27295 }, { "epoch": 0.6669435418855202, "grad_norm": 42.955299377441406, "learning_rate": 1.9932188368076793e-06, "loss": 0.3528, "num_input_tokens_seen": 18393600, "step": 27300 }, { "epoch": 0.6670656927173674, "grad_norm": 11.649964332580566, "learning_rate": 1.993208918839146e-06, "loss": 0.1349, "num_input_tokens_seen": 18397184, "step": 27305 }, { "epoch": 0.6671878435492146, "grad_norm": 19.09230613708496, "learning_rate": 1.99319899364773e-06, "loss": 0.1671, "num_input_tokens_seen": 18401728, "step": 27310 }, { "epoch": 0.6673099943810618, "grad_norm": 16.074893951416016, "learning_rate": 1.9931890612335044e-06, "loss": 0.2605, "num_input_tokens_seen": 18405184, "step": 27315 }, { "epoch": 0.667432145212909, "grad_norm": 20.0411319732666, "learning_rate": 1.9931791215965405e-06, "loss": 0.0884, "num_input_tokens_seen": 18408256, "step": 27320 }, { "epoch": 0.667554296044756, "grad_norm": 5.844738006591797, "learning_rate": 1.993169174736911e-06, "loss": 0.1059, "num_input_tokens_seen": 18411264, "step": 27325 }, { "epoch": 0.6676764468766032, "grad_norm": 10.027684211730957, "learning_rate": 1.993159220654688e-06, "loss": 0.0379, "num_input_tokens_seen": 18414848, "step": 27330 }, { "epoch": 0.6677985977084504, "grad_norm": 1.0045827627182007, "learning_rate": 1.993149259349944e-06, "loss": 0.1058, "num_input_tokens_seen": 18418304, "step": 27335 }, { "epoch": 0.6679207485402976, "grad_norm": 6.239989757537842, "learning_rate": 1.9931392908227515e-06, "loss": 0.061, "num_input_tokens_seen": 18421696, "step": 27340 }, { "epoch": 0.6680428993721447, "grad_norm": 21.569900512695312, "learning_rate": 1.993129315073183e-06, "loss": 0.1927, "num_input_tokens_seen": 18424960, "step": 27345 }, { "epoch": 0.6681650502039919, "grad_norm": 4.123659610748291, "learning_rate": 1.993119332101311e-06, "loss": 0.0656, "num_input_tokens_seen": 18428032, "step": 27350 }, { "epoch": 0.668287201035839, "grad_norm": 3.6173691749572754, "learning_rate": 1.993109341907208e-06, "loss": 0.0642, "num_input_tokens_seen": 18431616, "step": 27355 }, { "epoch": 0.6684093518676862, "grad_norm": 30.157012939453125, "learning_rate": 1.993099344490947e-06, "loss": 0.1862, "num_input_tokens_seen": 18434560, "step": 27360 }, { "epoch": 0.6685315026995334, "grad_norm": 11.95051383972168, "learning_rate": 1.9930893398526e-06, "loss": 0.1303, "num_input_tokens_seen": 18437696, "step": 27365 }, { "epoch": 0.6686536535313805, "grad_norm": 24.992183685302734, "learning_rate": 1.9930793279922408e-06, "loss": 0.102, "num_input_tokens_seen": 18440896, "step": 27370 }, { "epoch": 0.6687758043632277, "grad_norm": 31.214475631713867, "learning_rate": 1.993069308909941e-06, "loss": 0.0309, "num_input_tokens_seen": 18443968, "step": 27375 }, { "epoch": 0.6688979551950749, "grad_norm": 19.314767837524414, "learning_rate": 1.9930592826057746e-06, "loss": 0.1691, "num_input_tokens_seen": 18447424, "step": 27380 }, { "epoch": 0.6690201060269221, "grad_norm": 1.9460688829421997, "learning_rate": 1.993049249079814e-06, "loss": 0.1198, "num_input_tokens_seen": 18450944, "step": 27385 }, { "epoch": 0.6691422568587692, "grad_norm": 36.64439392089844, "learning_rate": 1.9930392083321315e-06, "loss": 0.2652, "num_input_tokens_seen": 18453952, "step": 27390 }, { "epoch": 0.6692644076906163, "grad_norm": 0.25118666887283325, "learning_rate": 1.9930291603628013e-06, "loss": 0.0694, "num_input_tokens_seen": 18457088, "step": 27395 }, { "epoch": 0.6693865585224635, "grad_norm": 13.27421760559082, "learning_rate": 1.993019105171896e-06, "loss": 0.0814, "num_input_tokens_seen": 18459904, "step": 27400 }, { "epoch": 0.6695087093543107, "grad_norm": 23.91324234008789, "learning_rate": 1.9930090427594885e-06, "loss": 0.0772, "num_input_tokens_seen": 18463232, "step": 27405 }, { "epoch": 0.6696308601861579, "grad_norm": 2.134026050567627, "learning_rate": 1.992998973125652e-06, "loss": 0.0064, "num_input_tokens_seen": 18466816, "step": 27410 }, { "epoch": 0.669753011018005, "grad_norm": 0.7445598840713501, "learning_rate": 1.9929888962704603e-06, "loss": 0.0142, "num_input_tokens_seen": 18470080, "step": 27415 }, { "epoch": 0.6698751618498522, "grad_norm": 34.15476608276367, "learning_rate": 1.992978812193986e-06, "loss": 0.0758, "num_input_tokens_seen": 18473664, "step": 27420 }, { "epoch": 0.6699973126816994, "grad_norm": 0.30861836671829224, "learning_rate": 1.9929687208963026e-06, "loss": 0.0811, "num_input_tokens_seen": 18476928, "step": 27425 }, { "epoch": 0.6701194635135466, "grad_norm": 0.0861566960811615, "learning_rate": 1.992958622377484e-06, "loss": 0.1545, "num_input_tokens_seen": 18480320, "step": 27430 }, { "epoch": 0.6702416143453936, "grad_norm": 26.23560905456543, "learning_rate": 1.9929485166376026e-06, "loss": 0.1974, "num_input_tokens_seen": 18483840, "step": 27435 }, { "epoch": 0.6703637651772408, "grad_norm": 0.39746278524398804, "learning_rate": 1.992938403676733e-06, "loss": 0.07, "num_input_tokens_seen": 18486912, "step": 27440 }, { "epoch": 0.670485916009088, "grad_norm": 12.6868257522583, "learning_rate": 1.9929282834949483e-06, "loss": 0.1578, "num_input_tokens_seen": 18489984, "step": 27445 }, { "epoch": 0.6706080668409352, "grad_norm": 26.02521324157715, "learning_rate": 1.9929181560923217e-06, "loss": 0.2339, "num_input_tokens_seen": 18493312, "step": 27450 }, { "epoch": 0.6707302176727824, "grad_norm": 0.4023156464099884, "learning_rate": 1.9929080214689274e-06, "loss": 0.0862, "num_input_tokens_seen": 18496832, "step": 27455 }, { "epoch": 0.6708523685046295, "grad_norm": 18.912397384643555, "learning_rate": 1.992897879624839e-06, "loss": 0.1234, "num_input_tokens_seen": 18500160, "step": 27460 }, { "epoch": 0.6709745193364767, "grad_norm": 47.363975524902344, "learning_rate": 1.99288773056013e-06, "loss": 0.1291, "num_input_tokens_seen": 18503936, "step": 27465 }, { "epoch": 0.6710966701683239, "grad_norm": 9.934727668762207, "learning_rate": 1.9928775742748747e-06, "loss": 0.1425, "num_input_tokens_seen": 18507776, "step": 27470 }, { "epoch": 0.671218821000171, "grad_norm": 10.783501625061035, "learning_rate": 1.9928674107691463e-06, "loss": 0.0539, "num_input_tokens_seen": 18511104, "step": 27475 }, { "epoch": 0.6713409718320181, "grad_norm": 12.993548393249512, "learning_rate": 1.992857240043019e-06, "loss": 0.1931, "num_input_tokens_seen": 18514304, "step": 27480 }, { "epoch": 0.6714631226638653, "grad_norm": 20.122446060180664, "learning_rate": 1.992847062096567e-06, "loss": 0.0949, "num_input_tokens_seen": 18517696, "step": 27485 }, { "epoch": 0.6715852734957125, "grad_norm": 13.75194263458252, "learning_rate": 1.9928368769298636e-06, "loss": 0.1095, "num_input_tokens_seen": 18520896, "step": 27490 }, { "epoch": 0.6717074243275597, "grad_norm": 6.688014507293701, "learning_rate": 1.992826684542984e-06, "loss": 0.0394, "num_input_tokens_seen": 18524288, "step": 27495 }, { "epoch": 0.6718295751594069, "grad_norm": 4.681891918182373, "learning_rate": 1.9928164849360018e-06, "loss": 0.2707, "num_input_tokens_seen": 18527680, "step": 27500 }, { "epoch": 0.671951725991254, "grad_norm": 9.065070152282715, "learning_rate": 1.9928062781089906e-06, "loss": 0.0718, "num_input_tokens_seen": 18531328, "step": 27505 }, { "epoch": 0.6720738768231012, "grad_norm": 25.745248794555664, "learning_rate": 1.9927960640620256e-06, "loss": 0.2083, "num_input_tokens_seen": 18535040, "step": 27510 }, { "epoch": 0.6721960276549483, "grad_norm": 13.078929901123047, "learning_rate": 1.9927858427951804e-06, "loss": 0.0871, "num_input_tokens_seen": 18538304, "step": 27515 }, { "epoch": 0.6723181784867955, "grad_norm": 11.144567489624023, "learning_rate": 1.9927756143085293e-06, "loss": 0.099, "num_input_tokens_seen": 18541568, "step": 27520 }, { "epoch": 0.6724403293186426, "grad_norm": 11.036469459533691, "learning_rate": 1.9927653786021466e-06, "loss": 0.0368, "num_input_tokens_seen": 18545536, "step": 27525 }, { "epoch": 0.6725624801504898, "grad_norm": 8.302674293518066, "learning_rate": 1.992755135676108e-06, "loss": 0.1327, "num_input_tokens_seen": 18548864, "step": 27530 }, { "epoch": 0.672684630982337, "grad_norm": 14.096845626831055, "learning_rate": 1.9927448855304862e-06, "loss": 0.0681, "num_input_tokens_seen": 18552192, "step": 27535 }, { "epoch": 0.6728067818141842, "grad_norm": 9.845850944519043, "learning_rate": 1.992734628165357e-06, "loss": 0.06, "num_input_tokens_seen": 18555264, "step": 27540 }, { "epoch": 0.6729289326460314, "grad_norm": 14.49575138092041, "learning_rate": 1.9927243635807948e-06, "loss": 0.1047, "num_input_tokens_seen": 18558976, "step": 27545 }, { "epoch": 0.6730510834778785, "grad_norm": 6.2803730964660645, "learning_rate": 1.9927140917768736e-06, "loss": 0.0696, "num_input_tokens_seen": 18562304, "step": 27550 }, { "epoch": 0.6731732343097256, "grad_norm": 7.356665134429932, "learning_rate": 1.9927038127536686e-06, "loss": 0.0841, "num_input_tokens_seen": 18565888, "step": 27555 }, { "epoch": 0.6732953851415728, "grad_norm": 9.611376762390137, "learning_rate": 1.9926935265112543e-06, "loss": 0.0578, "num_input_tokens_seen": 18569536, "step": 27560 }, { "epoch": 0.67341753597342, "grad_norm": 24.40125846862793, "learning_rate": 1.992683233049706e-06, "loss": 0.0541, "num_input_tokens_seen": 18572608, "step": 27565 }, { "epoch": 0.6735396868052671, "grad_norm": 39.64024353027344, "learning_rate": 1.992672932369098e-06, "loss": 0.2431, "num_input_tokens_seen": 18576448, "step": 27570 }, { "epoch": 0.6736618376371143, "grad_norm": 8.382988929748535, "learning_rate": 1.9926626244695056e-06, "loss": 0.1644, "num_input_tokens_seen": 18579776, "step": 27575 }, { "epoch": 0.6737839884689615, "grad_norm": 4.696765422821045, "learning_rate": 1.9926523093510034e-06, "loss": 0.1107, "num_input_tokens_seen": 18582784, "step": 27580 }, { "epoch": 0.6739061393008087, "grad_norm": 21.366455078125, "learning_rate": 1.992641987013667e-06, "loss": 0.1118, "num_input_tokens_seen": 18585728, "step": 27585 }, { "epoch": 0.6740282901326557, "grad_norm": 42.7935791015625, "learning_rate": 1.9926316574575707e-06, "loss": 0.064, "num_input_tokens_seen": 18588800, "step": 27590 }, { "epoch": 0.6741504409645029, "grad_norm": 26.108415603637695, "learning_rate": 1.99262132068279e-06, "loss": 0.2027, "num_input_tokens_seen": 18592000, "step": 27595 }, { "epoch": 0.6742725917963501, "grad_norm": 10.856925010681152, "learning_rate": 1.9926109766894003e-06, "loss": 0.1205, "num_input_tokens_seen": 18595584, "step": 27600 }, { "epoch": 0.6743947426281973, "grad_norm": 16.994104385375977, "learning_rate": 1.992600625477476e-06, "loss": 0.1196, "num_input_tokens_seen": 18599296, "step": 27605 }, { "epoch": 0.6745168934600445, "grad_norm": 28.180084228515625, "learning_rate": 1.992590267047094e-06, "loss": 0.1573, "num_input_tokens_seen": 18602304, "step": 27610 }, { "epoch": 0.6746390442918916, "grad_norm": 1.7627689838409424, "learning_rate": 1.992579901398328e-06, "loss": 0.0645, "num_input_tokens_seen": 18605632, "step": 27615 }, { "epoch": 0.6747611951237388, "grad_norm": 15.111403465270996, "learning_rate": 1.992569528531254e-06, "loss": 0.1136, "num_input_tokens_seen": 18609152, "step": 27620 }, { "epoch": 0.674883345955586, "grad_norm": 16.666126251220703, "learning_rate": 1.9925591484459474e-06, "loss": 0.1165, "num_input_tokens_seen": 18612928, "step": 27625 }, { "epoch": 0.6750054967874332, "grad_norm": 10.067097663879395, "learning_rate": 1.992548761142484e-06, "loss": 0.1876, "num_input_tokens_seen": 18616768, "step": 27630 }, { "epoch": 0.6751276476192802, "grad_norm": 8.604232788085938, "learning_rate": 1.9925383666209387e-06, "loss": 0.0641, "num_input_tokens_seen": 18619840, "step": 27635 }, { "epoch": 0.6752497984511274, "grad_norm": 2.173659324645996, "learning_rate": 1.9925279648813875e-06, "loss": 0.0321, "num_input_tokens_seen": 18623104, "step": 27640 }, { "epoch": 0.6753719492829746, "grad_norm": 24.729249954223633, "learning_rate": 1.992517555923906e-06, "loss": 0.1091, "num_input_tokens_seen": 18626496, "step": 27645 }, { "epoch": 0.6754941001148218, "grad_norm": 3.241520643234253, "learning_rate": 1.99250713974857e-06, "loss": 0.075, "num_input_tokens_seen": 18629632, "step": 27650 }, { "epoch": 0.675616250946669, "grad_norm": 39.44907760620117, "learning_rate": 1.992496716355455e-06, "loss": 0.0667, "num_input_tokens_seen": 18632896, "step": 27655 }, { "epoch": 0.6757384017785161, "grad_norm": 15.261924743652344, "learning_rate": 1.9924862857446374e-06, "loss": 0.1238, "num_input_tokens_seen": 18636416, "step": 27660 }, { "epoch": 0.6758605526103633, "grad_norm": 32.404754638671875, "learning_rate": 1.9924758479161916e-06, "loss": 0.1344, "num_input_tokens_seen": 18639296, "step": 27665 }, { "epoch": 0.6759827034422105, "grad_norm": 5.737441539764404, "learning_rate": 1.992465402870195e-06, "loss": 0.1151, "num_input_tokens_seen": 18643072, "step": 27670 }, { "epoch": 0.6761048542740576, "grad_norm": 0.8999488353729248, "learning_rate": 1.9924549506067236e-06, "loss": 0.0603, "num_input_tokens_seen": 18646336, "step": 27675 }, { "epoch": 0.6762270051059047, "grad_norm": 11.820429801940918, "learning_rate": 1.992444491125852e-06, "loss": 0.0828, "num_input_tokens_seen": 18650112, "step": 27680 }, { "epoch": 0.6763491559377519, "grad_norm": 29.754968643188477, "learning_rate": 1.9924340244276576e-06, "loss": 0.1562, "num_input_tokens_seen": 18653632, "step": 27685 }, { "epoch": 0.6764713067695991, "grad_norm": 9.74490737915039, "learning_rate": 1.992423550512216e-06, "loss": 0.1278, "num_input_tokens_seen": 18656704, "step": 27690 }, { "epoch": 0.6765934576014463, "grad_norm": 10.454187393188477, "learning_rate": 1.9924130693796034e-06, "loss": 0.1062, "num_input_tokens_seen": 18660224, "step": 27695 }, { "epoch": 0.6767156084332935, "grad_norm": 3.680006742477417, "learning_rate": 1.9924025810298957e-06, "loss": 0.0639, "num_input_tokens_seen": 18663680, "step": 27700 }, { "epoch": 0.6768377592651406, "grad_norm": 18.555456161499023, "learning_rate": 1.99239208546317e-06, "loss": 0.1237, "num_input_tokens_seen": 18666880, "step": 27705 }, { "epoch": 0.6769599100969877, "grad_norm": 26.601221084594727, "learning_rate": 1.9923815826795018e-06, "loss": 0.0874, "num_input_tokens_seen": 18670464, "step": 27710 }, { "epoch": 0.6770820609288349, "grad_norm": 12.380290031433105, "learning_rate": 1.992371072678968e-06, "loss": 0.0819, "num_input_tokens_seen": 18673920, "step": 27715 }, { "epoch": 0.6772042117606821, "grad_norm": 9.469911575317383, "learning_rate": 1.9923605554616447e-06, "loss": 0.1129, "num_input_tokens_seen": 18677632, "step": 27720 }, { "epoch": 0.6773263625925292, "grad_norm": 26.473451614379883, "learning_rate": 1.9923500310276085e-06, "loss": 0.0451, "num_input_tokens_seen": 18681088, "step": 27725 }, { "epoch": 0.6774485134243764, "grad_norm": 23.648157119750977, "learning_rate": 1.9923394993769362e-06, "loss": 0.0925, "num_input_tokens_seen": 18684160, "step": 27730 }, { "epoch": 0.6775706642562236, "grad_norm": 28.7203426361084, "learning_rate": 1.992328960509704e-06, "loss": 0.143, "num_input_tokens_seen": 18687296, "step": 27735 }, { "epoch": 0.6776928150880708, "grad_norm": 24.867673873901367, "learning_rate": 1.9923184144259886e-06, "loss": 0.3171, "num_input_tokens_seen": 18690496, "step": 27740 }, { "epoch": 0.677814965919918, "grad_norm": 3.4889917373657227, "learning_rate": 1.992307861125867e-06, "loss": 0.1289, "num_input_tokens_seen": 18693824, "step": 27745 }, { "epoch": 0.677937116751765, "grad_norm": 5.78082799911499, "learning_rate": 1.9922973006094156e-06, "loss": 0.1006, "num_input_tokens_seen": 18697024, "step": 27750 }, { "epoch": 0.6780592675836122, "grad_norm": 20.06686019897461, "learning_rate": 1.9922867328767114e-06, "loss": 0.1345, "num_input_tokens_seen": 18699968, "step": 27755 }, { "epoch": 0.6781814184154594, "grad_norm": 7.188560485839844, "learning_rate": 1.992276157927831e-06, "loss": 0.0691, "num_input_tokens_seen": 18703296, "step": 27760 }, { "epoch": 0.6783035692473066, "grad_norm": 0.8174273371696472, "learning_rate": 1.9922655757628516e-06, "loss": 0.067, "num_input_tokens_seen": 18706496, "step": 27765 }, { "epoch": 0.6784257200791537, "grad_norm": 3.802631378173828, "learning_rate": 1.9922549863818504e-06, "loss": 0.0486, "num_input_tokens_seen": 18709696, "step": 27770 }, { "epoch": 0.6785478709110009, "grad_norm": 26.586246490478516, "learning_rate": 1.9922443897849037e-06, "loss": 0.0436, "num_input_tokens_seen": 18712704, "step": 27775 }, { "epoch": 0.6786700217428481, "grad_norm": 27.751142501831055, "learning_rate": 1.9922337859720887e-06, "loss": 0.0707, "num_input_tokens_seen": 18715648, "step": 27780 }, { "epoch": 0.6787921725746953, "grad_norm": 1.3806562423706055, "learning_rate": 1.992223174943483e-06, "loss": 0.1024, "num_input_tokens_seen": 18719040, "step": 27785 }, { "epoch": 0.6789143234065425, "grad_norm": 16.745941162109375, "learning_rate": 1.992212556699164e-06, "loss": 0.0903, "num_input_tokens_seen": 18722176, "step": 27790 }, { "epoch": 0.6790364742383895, "grad_norm": 35.78384780883789, "learning_rate": 1.9922019312392077e-06, "loss": 0.0914, "num_input_tokens_seen": 18725952, "step": 27795 }, { "epoch": 0.6791586250702367, "grad_norm": 16.95638656616211, "learning_rate": 1.992191298563692e-06, "loss": 0.0705, "num_input_tokens_seen": 18729152, "step": 27800 }, { "epoch": 0.6792807759020839, "grad_norm": 19.122446060180664, "learning_rate": 1.9921806586726946e-06, "loss": 0.0881, "num_input_tokens_seen": 18732480, "step": 27805 }, { "epoch": 0.6794029267339311, "grad_norm": 33.02607345581055, "learning_rate": 1.9921700115662927e-06, "loss": 0.1584, "num_input_tokens_seen": 18735488, "step": 27810 }, { "epoch": 0.6795250775657782, "grad_norm": 0.3400420546531677, "learning_rate": 1.992159357244564e-06, "loss": 0.1024, "num_input_tokens_seen": 18738688, "step": 27815 }, { "epoch": 0.6796472283976254, "grad_norm": 19.92205238342285, "learning_rate": 1.9921486957075847e-06, "loss": 0.1267, "num_input_tokens_seen": 18742336, "step": 27820 }, { "epoch": 0.6797693792294726, "grad_norm": 14.314276695251465, "learning_rate": 1.9921380269554337e-06, "loss": 0.1554, "num_input_tokens_seen": 18745856, "step": 27825 }, { "epoch": 0.6798915300613197, "grad_norm": 6.991714000701904, "learning_rate": 1.992127350988188e-06, "loss": 0.0212, "num_input_tokens_seen": 18748928, "step": 27830 }, { "epoch": 0.6800136808931668, "grad_norm": 2.587913751602173, "learning_rate": 1.9921166678059255e-06, "loss": 0.1102, "num_input_tokens_seen": 18752704, "step": 27835 }, { "epoch": 0.680135831725014, "grad_norm": 14.374567985534668, "learning_rate": 1.9921059774087234e-06, "loss": 0.0976, "num_input_tokens_seen": 18756288, "step": 27840 }, { "epoch": 0.6802579825568612, "grad_norm": 30.722314834594727, "learning_rate": 1.9920952797966598e-06, "loss": 0.1522, "num_input_tokens_seen": 18759360, "step": 27845 }, { "epoch": 0.6803801333887084, "grad_norm": 19.905860900878906, "learning_rate": 1.992084574969813e-06, "loss": 0.1346, "num_input_tokens_seen": 18762304, "step": 27850 }, { "epoch": 0.6805022842205556, "grad_norm": 1.7042980194091797, "learning_rate": 1.99207386292826e-06, "loss": 0.0575, "num_input_tokens_seen": 18765952, "step": 27855 }, { "epoch": 0.6806244350524027, "grad_norm": 26.2788143157959, "learning_rate": 1.992063143672079e-06, "loss": 0.0994, "num_input_tokens_seen": 18769344, "step": 27860 }, { "epoch": 0.6807465858842499, "grad_norm": 6.333975791931152, "learning_rate": 1.9920524172013482e-06, "loss": 0.0941, "num_input_tokens_seen": 18772480, "step": 27865 }, { "epoch": 0.680868736716097, "grad_norm": 20.061012268066406, "learning_rate": 1.9920416835161453e-06, "loss": 0.0403, "num_input_tokens_seen": 18775872, "step": 27870 }, { "epoch": 0.6809908875479442, "grad_norm": 17.780420303344727, "learning_rate": 1.9920309426165485e-06, "loss": 0.2086, "num_input_tokens_seen": 18779584, "step": 27875 }, { "epoch": 0.6811130383797913, "grad_norm": 23.597476959228516, "learning_rate": 1.992020194502635e-06, "loss": 0.1813, "num_input_tokens_seen": 18783168, "step": 27880 }, { "epoch": 0.6812351892116385, "grad_norm": 1.0889683961868286, "learning_rate": 1.992009439174485e-06, "loss": 0.0425, "num_input_tokens_seen": 18787072, "step": 27885 }, { "epoch": 0.6813573400434857, "grad_norm": 33.030147552490234, "learning_rate": 1.9919986766321754e-06, "loss": 0.0671, "num_input_tokens_seen": 18790272, "step": 27890 }, { "epoch": 0.6814794908753329, "grad_norm": 15.32840347290039, "learning_rate": 1.991987906875784e-06, "loss": 0.0871, "num_input_tokens_seen": 18793472, "step": 27895 }, { "epoch": 0.6816016417071801, "grad_norm": 41.0756721496582, "learning_rate": 1.9919771299053902e-06, "loss": 0.2074, "num_input_tokens_seen": 18797056, "step": 27900 }, { "epoch": 0.6817237925390272, "grad_norm": 17.466093063354492, "learning_rate": 1.991966345721072e-06, "loss": 0.1312, "num_input_tokens_seen": 18801088, "step": 27905 }, { "epoch": 0.6818459433708743, "grad_norm": 9.908241271972656, "learning_rate": 1.9919555543229072e-06, "loss": 0.0392, "num_input_tokens_seen": 18804224, "step": 27910 }, { "epoch": 0.6819680942027215, "grad_norm": 27.994863510131836, "learning_rate": 1.991944755710975e-06, "loss": 0.1424, "num_input_tokens_seen": 18807360, "step": 27915 }, { "epoch": 0.6820902450345687, "grad_norm": 0.7694000005722046, "learning_rate": 1.9919339498853537e-06, "loss": 0.169, "num_input_tokens_seen": 18810304, "step": 27920 }, { "epoch": 0.6822123958664158, "grad_norm": 2.132155418395996, "learning_rate": 1.9919231368461224e-06, "loss": 0.0517, "num_input_tokens_seen": 18813568, "step": 27925 }, { "epoch": 0.682334546698263, "grad_norm": 39.752418518066406, "learning_rate": 1.9919123165933586e-06, "loss": 0.1451, "num_input_tokens_seen": 18816640, "step": 27930 }, { "epoch": 0.6824566975301102, "grad_norm": 10.385972023010254, "learning_rate": 1.9919014891271423e-06, "loss": 0.0729, "num_input_tokens_seen": 18820032, "step": 27935 }, { "epoch": 0.6825788483619574, "grad_norm": 23.70648193359375, "learning_rate": 1.9918906544475507e-06, "loss": 0.082, "num_input_tokens_seen": 18823040, "step": 27940 }, { "epoch": 0.6827009991938046, "grad_norm": 25.676509857177734, "learning_rate": 1.9918798125546643e-06, "loss": 0.1848, "num_input_tokens_seen": 18826048, "step": 27945 }, { "epoch": 0.6828231500256516, "grad_norm": 34.59342575073242, "learning_rate": 1.991868963448561e-06, "loss": 0.1456, "num_input_tokens_seen": 18829504, "step": 27950 }, { "epoch": 0.6829453008574988, "grad_norm": 6.966836452484131, "learning_rate": 1.9918581071293196e-06, "loss": 0.0392, "num_input_tokens_seen": 18832768, "step": 27955 }, { "epoch": 0.683067451689346, "grad_norm": 7.659114360809326, "learning_rate": 1.9918472435970194e-06, "loss": 0.0952, "num_input_tokens_seen": 18836352, "step": 27960 }, { "epoch": 0.6831896025211932, "grad_norm": 21.104238510131836, "learning_rate": 1.991836372851739e-06, "loss": 0.0636, "num_input_tokens_seen": 18839488, "step": 27965 }, { "epoch": 0.6833117533530403, "grad_norm": 26.50173568725586, "learning_rate": 1.9918254948935576e-06, "loss": 0.1515, "num_input_tokens_seen": 18842432, "step": 27970 }, { "epoch": 0.6834339041848875, "grad_norm": 34.945533752441406, "learning_rate": 1.991814609722555e-06, "loss": 0.1336, "num_input_tokens_seen": 18845760, "step": 27975 }, { "epoch": 0.6835560550167347, "grad_norm": 0.18488149344921112, "learning_rate": 1.9918037173388098e-06, "loss": 0.0967, "num_input_tokens_seen": 18849152, "step": 27980 }, { "epoch": 0.6836782058485819, "grad_norm": 40.403011322021484, "learning_rate": 1.9917928177424005e-06, "loss": 0.0754, "num_input_tokens_seen": 18853184, "step": 27985 }, { "epoch": 0.683800356680429, "grad_norm": 4.494559288024902, "learning_rate": 1.9917819109334074e-06, "loss": 0.057, "num_input_tokens_seen": 18856448, "step": 27990 }, { "epoch": 0.6839225075122761, "grad_norm": 6.149030685424805, "learning_rate": 1.9917709969119097e-06, "loss": 0.0696, "num_input_tokens_seen": 18859392, "step": 27995 }, { "epoch": 0.6840446583441233, "grad_norm": 30.208892822265625, "learning_rate": 1.9917600756779866e-06, "loss": 0.1231, "num_input_tokens_seen": 18862848, "step": 28000 }, { "epoch": 0.6841668091759705, "grad_norm": 15.175532341003418, "learning_rate": 1.9917491472317173e-06, "loss": 0.1157, "num_input_tokens_seen": 18866048, "step": 28005 }, { "epoch": 0.6842889600078177, "grad_norm": 22.627941131591797, "learning_rate": 1.9917382115731814e-06, "loss": 0.0812, "num_input_tokens_seen": 18869696, "step": 28010 }, { "epoch": 0.6844111108396648, "grad_norm": 46.66557693481445, "learning_rate": 1.9917272687024586e-06, "loss": 0.171, "num_input_tokens_seen": 18872960, "step": 28015 }, { "epoch": 0.684533261671512, "grad_norm": 15.672872543334961, "learning_rate": 1.9917163186196284e-06, "loss": 0.1007, "num_input_tokens_seen": 18876480, "step": 28020 }, { "epoch": 0.6846554125033592, "grad_norm": 26.201805114746094, "learning_rate": 1.99170536132477e-06, "loss": 0.1806, "num_input_tokens_seen": 18880000, "step": 28025 }, { "epoch": 0.6847775633352063, "grad_norm": 1.5974482297897339, "learning_rate": 1.991694396817964e-06, "loss": 0.1536, "num_input_tokens_seen": 18883328, "step": 28030 }, { "epoch": 0.6848997141670535, "grad_norm": 30.57724952697754, "learning_rate": 1.991683425099289e-06, "loss": 0.0765, "num_input_tokens_seen": 18886464, "step": 28035 }, { "epoch": 0.6850218649989006, "grad_norm": 37.85762405395508, "learning_rate": 1.991672446168826e-06, "loss": 0.2011, "num_input_tokens_seen": 18889408, "step": 28040 }, { "epoch": 0.6851440158307478, "grad_norm": 23.697330474853516, "learning_rate": 1.9916614600266543e-06, "loss": 0.2166, "num_input_tokens_seen": 18892416, "step": 28045 }, { "epoch": 0.685266166662595, "grad_norm": 14.929670333862305, "learning_rate": 1.991650466672853e-06, "loss": 0.1019, "num_input_tokens_seen": 18895680, "step": 28050 }, { "epoch": 0.6853883174944422, "grad_norm": 15.712745666503906, "learning_rate": 1.9916394661075037e-06, "loss": 0.234, "num_input_tokens_seen": 18899072, "step": 28055 }, { "epoch": 0.6855104683262893, "grad_norm": 4.213320255279541, "learning_rate": 1.991628458330685e-06, "loss": 0.0497, "num_input_tokens_seen": 18902784, "step": 28060 }, { "epoch": 0.6856326191581364, "grad_norm": 11.769521713256836, "learning_rate": 1.9916174433424774e-06, "loss": 0.0878, "num_input_tokens_seen": 18905984, "step": 28065 }, { "epoch": 0.6857547699899836, "grad_norm": 8.652548789978027, "learning_rate": 1.991606421142961e-06, "loss": 0.0997, "num_input_tokens_seen": 18908992, "step": 28070 }, { "epoch": 0.6858769208218308, "grad_norm": 0.8606699705123901, "learning_rate": 1.991595391732216e-06, "loss": 0.0843, "num_input_tokens_seen": 18912256, "step": 28075 }, { "epoch": 0.685999071653678, "grad_norm": 5.10831356048584, "learning_rate": 1.991584355110323e-06, "loss": 0.0455, "num_input_tokens_seen": 18915584, "step": 28080 }, { "epoch": 0.6861212224855251, "grad_norm": 21.28264617919922, "learning_rate": 1.9915733112773613e-06, "loss": 0.1036, "num_input_tokens_seen": 18918784, "step": 28085 }, { "epoch": 0.6862433733173723, "grad_norm": 0.4145818054676056, "learning_rate": 1.9915622602334122e-06, "loss": 0.0086, "num_input_tokens_seen": 18921984, "step": 28090 }, { "epoch": 0.6863655241492195, "grad_norm": 45.218753814697266, "learning_rate": 1.9915512019785556e-06, "loss": 0.2377, "num_input_tokens_seen": 18925312, "step": 28095 }, { "epoch": 0.6864876749810667, "grad_norm": 24.27002716064453, "learning_rate": 1.9915401365128715e-06, "loss": 0.1546, "num_input_tokens_seen": 18928576, "step": 28100 }, { "epoch": 0.6866098258129137, "grad_norm": 0.8130379319190979, "learning_rate": 1.991529063836441e-06, "loss": 0.1028, "num_input_tokens_seen": 18932096, "step": 28105 }, { "epoch": 0.6867319766447609, "grad_norm": 12.115750312805176, "learning_rate": 1.991517983949345e-06, "loss": 0.1427, "num_input_tokens_seen": 18935360, "step": 28110 }, { "epoch": 0.6868541274766081, "grad_norm": 0.17904898524284363, "learning_rate": 1.991506896851663e-06, "loss": 0.0468, "num_input_tokens_seen": 18938688, "step": 28115 }, { "epoch": 0.6869762783084553, "grad_norm": 10.688509941101074, "learning_rate": 1.9914958025434764e-06, "loss": 0.1421, "num_input_tokens_seen": 18942080, "step": 28120 }, { "epoch": 0.6870984291403024, "grad_norm": 33.10630416870117, "learning_rate": 1.9914847010248657e-06, "loss": 0.0983, "num_input_tokens_seen": 18945152, "step": 28125 }, { "epoch": 0.6872205799721496, "grad_norm": 7.811311721801758, "learning_rate": 1.9914735922959116e-06, "loss": 0.0689, "num_input_tokens_seen": 18948224, "step": 28130 }, { "epoch": 0.6873427308039968, "grad_norm": 5.051802635192871, "learning_rate": 1.9914624763566946e-06, "loss": 0.0645, "num_input_tokens_seen": 18951680, "step": 28135 }, { "epoch": 0.687464881635844, "grad_norm": 12.035042762756348, "learning_rate": 1.991451353207296e-06, "loss": 0.0881, "num_input_tokens_seen": 18957248, "step": 28140 }, { "epoch": 0.6875870324676912, "grad_norm": 8.856138229370117, "learning_rate": 1.9914402228477962e-06, "loss": 0.1386, "num_input_tokens_seen": 18960640, "step": 28145 }, { "epoch": 0.6877091832995382, "grad_norm": 0.4886510670185089, "learning_rate": 1.991429085278277e-06, "loss": 0.1298, "num_input_tokens_seen": 18963840, "step": 28150 }, { "epoch": 0.6878313341313854, "grad_norm": 1.587204098701477, "learning_rate": 1.9914179404988185e-06, "loss": 0.0415, "num_input_tokens_seen": 18967616, "step": 28155 }, { "epoch": 0.6879534849632326, "grad_norm": 1.0043456554412842, "learning_rate": 1.991406788509502e-06, "loss": 0.0961, "num_input_tokens_seen": 18972928, "step": 28160 }, { "epoch": 0.6880756357950798, "grad_norm": 15.976723670959473, "learning_rate": 1.991395629310409e-06, "loss": 0.157, "num_input_tokens_seen": 18976192, "step": 28165 }, { "epoch": 0.6881977866269269, "grad_norm": 14.203861236572266, "learning_rate": 1.99138446290162e-06, "loss": 0.0917, "num_input_tokens_seen": 18979392, "step": 28170 }, { "epoch": 0.6883199374587741, "grad_norm": 4.559242248535156, "learning_rate": 1.9913732892832166e-06, "loss": 0.0725, "num_input_tokens_seen": 18982464, "step": 28175 }, { "epoch": 0.6884420882906213, "grad_norm": 32.41259765625, "learning_rate": 1.9913621084552797e-06, "loss": 0.156, "num_input_tokens_seen": 18985856, "step": 28180 }, { "epoch": 0.6885642391224684, "grad_norm": 9.131508827209473, "learning_rate": 1.9913509204178913e-06, "loss": 0.1084, "num_input_tokens_seen": 18989248, "step": 28185 }, { "epoch": 0.6886863899543156, "grad_norm": 11.379778861999512, "learning_rate": 1.9913397251711323e-06, "loss": 0.0997, "num_input_tokens_seen": 18992960, "step": 28190 }, { "epoch": 0.6888085407861627, "grad_norm": 8.573840141296387, "learning_rate": 1.991328522715084e-06, "loss": 0.0419, "num_input_tokens_seen": 18996352, "step": 28195 }, { "epoch": 0.6889306916180099, "grad_norm": 14.748952865600586, "learning_rate": 1.9913173130498283e-06, "loss": 0.2366, "num_input_tokens_seen": 19000064, "step": 28200 }, { "epoch": 0.6890528424498571, "grad_norm": 20.12869644165039, "learning_rate": 1.9913060961754463e-06, "loss": 0.069, "num_input_tokens_seen": 19003648, "step": 28205 }, { "epoch": 0.6891749932817043, "grad_norm": 1.0335286855697632, "learning_rate": 1.99129487209202e-06, "loss": 0.1106, "num_input_tokens_seen": 19007360, "step": 28210 }, { "epoch": 0.6892971441135514, "grad_norm": 32.707645416259766, "learning_rate": 1.9912836407996307e-06, "loss": 0.209, "num_input_tokens_seen": 19010304, "step": 28215 }, { "epoch": 0.6894192949453986, "grad_norm": 27.200157165527344, "learning_rate": 1.9912724022983597e-06, "loss": 0.1874, "num_input_tokens_seen": 19013696, "step": 28220 }, { "epoch": 0.6895414457772457, "grad_norm": 0.2330540120601654, "learning_rate": 1.9912611565882894e-06, "loss": 0.0766, "num_input_tokens_seen": 19016960, "step": 28225 }, { "epoch": 0.6896635966090929, "grad_norm": 0.19438402354717255, "learning_rate": 1.9912499036695016e-06, "loss": 0.0774, "num_input_tokens_seen": 19020736, "step": 28230 }, { "epoch": 0.6897857474409401, "grad_norm": 13.885992050170898, "learning_rate": 1.991238643542078e-06, "loss": 0.064, "num_input_tokens_seen": 19024256, "step": 28235 }, { "epoch": 0.6899078982727872, "grad_norm": 10.976266860961914, "learning_rate": 1.9912273762061e-06, "loss": 0.1147, "num_input_tokens_seen": 19027392, "step": 28240 }, { "epoch": 0.6900300491046344, "grad_norm": 32.6606330871582, "learning_rate": 1.9912161016616496e-06, "loss": 0.105, "num_input_tokens_seen": 19031168, "step": 28245 }, { "epoch": 0.6901521999364816, "grad_norm": 13.941425323486328, "learning_rate": 1.99120481990881e-06, "loss": 0.2081, "num_input_tokens_seen": 19034112, "step": 28250 }, { "epoch": 0.6902743507683288, "grad_norm": 8.773990631103516, "learning_rate": 1.991193530947662e-06, "loss": 0.1315, "num_input_tokens_seen": 19037248, "step": 28255 }, { "epoch": 0.6903965016001758, "grad_norm": 23.475357055664062, "learning_rate": 1.9911822347782876e-06, "loss": 0.1358, "num_input_tokens_seen": 19040320, "step": 28260 }, { "epoch": 0.690518652432023, "grad_norm": 19.392465591430664, "learning_rate": 1.9911709314007696e-06, "loss": 0.073, "num_input_tokens_seen": 19043648, "step": 28265 }, { "epoch": 0.6906408032638702, "grad_norm": 27.202695846557617, "learning_rate": 1.99115962081519e-06, "loss": 0.0217, "num_input_tokens_seen": 19047296, "step": 28270 }, { "epoch": 0.6907629540957174, "grad_norm": 0.36758825182914734, "learning_rate": 1.991148303021631e-06, "loss": 0.1819, "num_input_tokens_seen": 19050816, "step": 28275 }, { "epoch": 0.6908851049275646, "grad_norm": 15.720108985900879, "learning_rate": 1.9911369780201754e-06, "loss": 0.1465, "num_input_tokens_seen": 19054016, "step": 28280 }, { "epoch": 0.6910072557594117, "grad_norm": 3.562192440032959, "learning_rate": 1.991125645810905e-06, "loss": 0.0116, "num_input_tokens_seen": 19057088, "step": 28285 }, { "epoch": 0.6911294065912589, "grad_norm": 14.69986343383789, "learning_rate": 1.991114306393902e-06, "loss": 0.1489, "num_input_tokens_seen": 19060544, "step": 28290 }, { "epoch": 0.6912515574231061, "grad_norm": 1.1109415292739868, "learning_rate": 1.991102959769249e-06, "loss": 0.0615, "num_input_tokens_seen": 19063936, "step": 28295 }, { "epoch": 0.6913737082549533, "grad_norm": 13.292527198791504, "learning_rate": 1.991091605937029e-06, "loss": 0.3578, "num_input_tokens_seen": 19067264, "step": 28300 }, { "epoch": 0.6914958590868003, "grad_norm": 16.39297866821289, "learning_rate": 1.9910802448973245e-06, "loss": 0.1859, "num_input_tokens_seen": 19070272, "step": 28305 }, { "epoch": 0.6916180099186475, "grad_norm": 25.160232543945312, "learning_rate": 1.9910688766502177e-06, "loss": 0.1424, "num_input_tokens_seen": 19073408, "step": 28310 }, { "epoch": 0.6917401607504947, "grad_norm": 21.078258514404297, "learning_rate": 1.9910575011957914e-06, "loss": 0.0961, "num_input_tokens_seen": 19076544, "step": 28315 }, { "epoch": 0.6918623115823419, "grad_norm": 75.08121490478516, "learning_rate": 1.9910461185341287e-06, "loss": 0.0969, "num_input_tokens_seen": 19080256, "step": 28320 }, { "epoch": 0.6919844624141891, "grad_norm": 1.2669124603271484, "learning_rate": 1.9910347286653116e-06, "loss": 0.0501, "num_input_tokens_seen": 19083776, "step": 28325 }, { "epoch": 0.6921066132460362, "grad_norm": 5.270265102386475, "learning_rate": 1.9910233315894237e-06, "loss": 0.0835, "num_input_tokens_seen": 19087552, "step": 28330 }, { "epoch": 0.6922287640778834, "grad_norm": 0.8477705121040344, "learning_rate": 1.9910119273065474e-06, "loss": 0.1434, "num_input_tokens_seen": 19090688, "step": 28335 }, { "epoch": 0.6923509149097306, "grad_norm": 26.06892967224121, "learning_rate": 1.991000515816766e-06, "loss": 0.1814, "num_input_tokens_seen": 19093888, "step": 28340 }, { "epoch": 0.6924730657415777, "grad_norm": 14.113260269165039, "learning_rate": 1.990989097120162e-06, "loss": 0.2087, "num_input_tokens_seen": 19097152, "step": 28345 }, { "epoch": 0.6925952165734248, "grad_norm": 0.316220223903656, "learning_rate": 1.990977671216819e-06, "loss": 0.0672, "num_input_tokens_seen": 19100608, "step": 28350 }, { "epoch": 0.692717367405272, "grad_norm": 13.8766450881958, "learning_rate": 1.9909662381068195e-06, "loss": 0.3032, "num_input_tokens_seen": 19103936, "step": 28355 }, { "epoch": 0.6928395182371192, "grad_norm": 24.313175201416016, "learning_rate": 1.9909547977902473e-06, "loss": 0.1386, "num_input_tokens_seen": 19107136, "step": 28360 }, { "epoch": 0.6929616690689664, "grad_norm": 23.778879165649414, "learning_rate": 1.9909433502671853e-06, "loss": 0.0419, "num_input_tokens_seen": 19110400, "step": 28365 }, { "epoch": 0.6930838199008135, "grad_norm": 26.069019317626953, "learning_rate": 1.9909318955377165e-06, "loss": 0.1421, "num_input_tokens_seen": 19113920, "step": 28370 }, { "epoch": 0.6932059707326607, "grad_norm": 18.443458557128906, "learning_rate": 1.9909204336019247e-06, "loss": 0.1806, "num_input_tokens_seen": 19117696, "step": 28375 }, { "epoch": 0.6933281215645078, "grad_norm": 15.086554527282715, "learning_rate": 1.990908964459893e-06, "loss": 0.127, "num_input_tokens_seen": 19120832, "step": 28380 }, { "epoch": 0.693450272396355, "grad_norm": 35.456085205078125, "learning_rate": 1.9908974881117042e-06, "loss": 0.0871, "num_input_tokens_seen": 19124224, "step": 28385 }, { "epoch": 0.6935724232282022, "grad_norm": 26.71317481994629, "learning_rate": 1.990886004557443e-06, "loss": 0.1087, "num_input_tokens_seen": 19127232, "step": 28390 }, { "epoch": 0.6936945740600493, "grad_norm": 25.975605010986328, "learning_rate": 1.990874513797192e-06, "loss": 0.0849, "num_input_tokens_seen": 19130816, "step": 28395 }, { "epoch": 0.6938167248918965, "grad_norm": 11.141236305236816, "learning_rate": 1.990863015831035e-06, "loss": 0.1112, "num_input_tokens_seen": 19133888, "step": 28400 }, { "epoch": 0.6939388757237437, "grad_norm": 41.74148178100586, "learning_rate": 1.990851510659056e-06, "loss": 0.2546, "num_input_tokens_seen": 19136832, "step": 28405 }, { "epoch": 0.6940610265555909, "grad_norm": 26.85130500793457, "learning_rate": 1.990839998281338e-06, "loss": 0.1442, "num_input_tokens_seen": 19140032, "step": 28410 }, { "epoch": 0.694183177387438, "grad_norm": 18.30794906616211, "learning_rate": 1.9908284786979647e-06, "loss": 0.1615, "num_input_tokens_seen": 19142912, "step": 28415 }, { "epoch": 0.6943053282192851, "grad_norm": 14.795523643493652, "learning_rate": 1.9908169519090208e-06, "loss": 0.1455, "num_input_tokens_seen": 19145920, "step": 28420 }, { "epoch": 0.6944274790511323, "grad_norm": 2.048726797103882, "learning_rate": 1.990805417914589e-06, "loss": 0.1582, "num_input_tokens_seen": 19148992, "step": 28425 }, { "epoch": 0.6945496298829795, "grad_norm": 14.487932205200195, "learning_rate": 1.9907938767147542e-06, "loss": 0.1379, "num_input_tokens_seen": 19152448, "step": 28430 }, { "epoch": 0.6946717807148267, "grad_norm": 20.653125762939453, "learning_rate": 1.9907823283095998e-06, "loss": 0.0703, "num_input_tokens_seen": 19155776, "step": 28435 }, { "epoch": 0.6947939315466738, "grad_norm": 15.435851097106934, "learning_rate": 1.9907707726992095e-06, "loss": 0.1394, "num_input_tokens_seen": 19158848, "step": 28440 }, { "epoch": 0.694916082378521, "grad_norm": 15.184447288513184, "learning_rate": 1.9907592098836678e-06, "loss": 0.1629, "num_input_tokens_seen": 19162240, "step": 28445 }, { "epoch": 0.6950382332103682, "grad_norm": 14.811301231384277, "learning_rate": 1.9907476398630584e-06, "loss": 0.0741, "num_input_tokens_seen": 19165696, "step": 28450 }, { "epoch": 0.6951603840422154, "grad_norm": 17.973371505737305, "learning_rate": 1.990736062637466e-06, "loss": 0.128, "num_input_tokens_seen": 19168768, "step": 28455 }, { "epoch": 0.6952825348740624, "grad_norm": 19.12712860107422, "learning_rate": 1.9907244782069745e-06, "loss": 0.184, "num_input_tokens_seen": 19172544, "step": 28460 }, { "epoch": 0.6954046857059096, "grad_norm": 7.91143798828125, "learning_rate": 1.990712886571668e-06, "loss": 0.0412, "num_input_tokens_seen": 19176128, "step": 28465 }, { "epoch": 0.6955268365377568, "grad_norm": 7.214994430541992, "learning_rate": 1.990701287731631e-06, "loss": 0.1263, "num_input_tokens_seen": 19179776, "step": 28470 }, { "epoch": 0.695648987369604, "grad_norm": 28.023527145385742, "learning_rate": 1.9906896816869475e-06, "loss": 0.1573, "num_input_tokens_seen": 19183360, "step": 28475 }, { "epoch": 0.6957711382014512, "grad_norm": 2.206017255783081, "learning_rate": 1.9906780684377025e-06, "loss": 0.0671, "num_input_tokens_seen": 19186432, "step": 28480 }, { "epoch": 0.6958932890332983, "grad_norm": 17.74364471435547, "learning_rate": 1.99066644798398e-06, "loss": 0.1002, "num_input_tokens_seen": 19189760, "step": 28485 }, { "epoch": 0.6960154398651455, "grad_norm": 0.5346532464027405, "learning_rate": 1.9906548203258644e-06, "loss": 0.0669, "num_input_tokens_seen": 19192896, "step": 28490 }, { "epoch": 0.6961375906969927, "grad_norm": 10.89641284942627, "learning_rate": 1.990643185463441e-06, "loss": 0.158, "num_input_tokens_seen": 19196480, "step": 28495 }, { "epoch": 0.6962597415288398, "grad_norm": 17.990570068359375, "learning_rate": 1.9906315433967937e-06, "loss": 0.1286, "num_input_tokens_seen": 19200128, "step": 28500 }, { "epoch": 0.6963818923606869, "grad_norm": 23.91288948059082, "learning_rate": 1.990619894126007e-06, "loss": 0.125, "num_input_tokens_seen": 19203328, "step": 28505 }, { "epoch": 0.6965040431925341, "grad_norm": 10.834264755249023, "learning_rate": 1.9906082376511665e-06, "loss": 0.1438, "num_input_tokens_seen": 19206400, "step": 28510 }, { "epoch": 0.6966261940243813, "grad_norm": 2.1280264854431152, "learning_rate": 1.9905965739723563e-06, "loss": 0.0225, "num_input_tokens_seen": 19209920, "step": 28515 }, { "epoch": 0.6967483448562285, "grad_norm": 0.8384555578231812, "learning_rate": 1.9905849030896614e-06, "loss": 0.0699, "num_input_tokens_seen": 19213120, "step": 28520 }, { "epoch": 0.6968704956880757, "grad_norm": 6.136308670043945, "learning_rate": 1.9905732250031664e-06, "loss": 0.1218, "num_input_tokens_seen": 19216512, "step": 28525 }, { "epoch": 0.6969926465199228, "grad_norm": 23.7427921295166, "learning_rate": 1.9905615397129565e-06, "loss": 0.0611, "num_input_tokens_seen": 19219328, "step": 28530 }, { "epoch": 0.69711479735177, "grad_norm": 8.574586868286133, "learning_rate": 1.9905498472191168e-06, "loss": 0.1589, "num_input_tokens_seen": 19222528, "step": 28535 }, { "epoch": 0.6972369481836171, "grad_norm": 2.952967643737793, "learning_rate": 1.9905381475217323e-06, "loss": 0.1138, "num_input_tokens_seen": 19226048, "step": 28540 }, { "epoch": 0.6973590990154643, "grad_norm": 25.55811882019043, "learning_rate": 1.990526440620888e-06, "loss": 0.0534, "num_input_tokens_seen": 19229184, "step": 28545 }, { "epoch": 0.6974812498473114, "grad_norm": 24.35697364807129, "learning_rate": 1.9905147265166686e-06, "loss": 0.2159, "num_input_tokens_seen": 19232704, "step": 28550 }, { "epoch": 0.6976034006791586, "grad_norm": 1.1305099725723267, "learning_rate": 1.99050300520916e-06, "loss": 0.1376, "num_input_tokens_seen": 19236544, "step": 28555 }, { "epoch": 0.6977255515110058, "grad_norm": 5.858948230743408, "learning_rate": 1.9904912766984472e-06, "loss": 0.062, "num_input_tokens_seen": 19239616, "step": 28560 }, { "epoch": 0.697847702342853, "grad_norm": 20.73585319519043, "learning_rate": 1.990479540984615e-06, "loss": 0.0789, "num_input_tokens_seen": 19243136, "step": 28565 }, { "epoch": 0.6979698531747002, "grad_norm": 0.351258784532547, "learning_rate": 1.9904677980677496e-06, "loss": 0.1022, "num_input_tokens_seen": 19246400, "step": 28570 }, { "epoch": 0.6980920040065473, "grad_norm": 0.48112377524375916, "learning_rate": 1.990456047947936e-06, "loss": 0.0678, "num_input_tokens_seen": 19249472, "step": 28575 }, { "epoch": 0.6982141548383944, "grad_norm": 35.932861328125, "learning_rate": 1.9904442906252594e-06, "loss": 0.1318, "num_input_tokens_seen": 19252992, "step": 28580 }, { "epoch": 0.6983363056702416, "grad_norm": 0.44541868567466736, "learning_rate": 1.9904325260998055e-06, "loss": 0.1667, "num_input_tokens_seen": 19256064, "step": 28585 }, { "epoch": 0.6984584565020888, "grad_norm": 1.5130938291549683, "learning_rate": 1.99042075437166e-06, "loss": 0.1057, "num_input_tokens_seen": 19259456, "step": 28590 }, { "epoch": 0.6985806073339359, "grad_norm": 32.22797393798828, "learning_rate": 1.9904089754409083e-06, "loss": 0.1113, "num_input_tokens_seen": 19262272, "step": 28595 }, { "epoch": 0.6987027581657831, "grad_norm": 0.38581711053848267, "learning_rate": 1.990397189307636e-06, "loss": 0.0824, "num_input_tokens_seen": 19265664, "step": 28600 }, { "epoch": 0.6988249089976303, "grad_norm": 0.6255455613136292, "learning_rate": 1.9903853959719293e-06, "loss": 0.0588, "num_input_tokens_seen": 19269632, "step": 28605 }, { "epoch": 0.6989470598294775, "grad_norm": 19.804824829101562, "learning_rate": 1.9903735954338736e-06, "loss": 0.156, "num_input_tokens_seen": 19272768, "step": 28610 }, { "epoch": 0.6990692106613247, "grad_norm": 15.568331718444824, "learning_rate": 1.9903617876935544e-06, "loss": 0.2028, "num_input_tokens_seen": 19275904, "step": 28615 }, { "epoch": 0.6991913614931717, "grad_norm": 30.59419059753418, "learning_rate": 1.990349972751058e-06, "loss": 0.2168, "num_input_tokens_seen": 19279104, "step": 28620 }, { "epoch": 0.6993135123250189, "grad_norm": 11.51990795135498, "learning_rate": 1.9903381506064704e-06, "loss": 0.1528, "num_input_tokens_seen": 19282624, "step": 28625 }, { "epoch": 0.6994356631568661, "grad_norm": 18.640762329101562, "learning_rate": 1.9903263212598772e-06, "loss": 0.0853, "num_input_tokens_seen": 19285696, "step": 28630 }, { "epoch": 0.6995578139887133, "grad_norm": 0.11530325561761856, "learning_rate": 1.990314484711365e-06, "loss": 0.0641, "num_input_tokens_seen": 19288960, "step": 28635 }, { "epoch": 0.6996799648205604, "grad_norm": 25.19300079345703, "learning_rate": 1.990302640961019e-06, "loss": 0.1388, "num_input_tokens_seen": 19292032, "step": 28640 }, { "epoch": 0.6998021156524076, "grad_norm": 57.09481430053711, "learning_rate": 1.990290790008926e-06, "loss": 0.1323, "num_input_tokens_seen": 19294976, "step": 28645 }, { "epoch": 0.6999242664842548, "grad_norm": 12.756439208984375, "learning_rate": 1.9902789318551727e-06, "loss": 0.0876, "num_input_tokens_seen": 19298240, "step": 28650 }, { "epoch": 0.700046417316102, "grad_norm": 59.20932388305664, "learning_rate": 1.990267066499844e-06, "loss": 0.1469, "num_input_tokens_seen": 19301440, "step": 28655 }, { "epoch": 0.700168568147949, "grad_norm": 1.153745174407959, "learning_rate": 1.9902551939430266e-06, "loss": 0.0732, "num_input_tokens_seen": 19304640, "step": 28660 }, { "epoch": 0.7002907189797962, "grad_norm": 22.583284378051758, "learning_rate": 1.9902433141848076e-06, "loss": 0.0645, "num_input_tokens_seen": 19307968, "step": 28665 }, { "epoch": 0.7004128698116434, "grad_norm": 13.325112342834473, "learning_rate": 1.9902314272252724e-06, "loss": 0.0677, "num_input_tokens_seen": 19311104, "step": 28670 }, { "epoch": 0.7005350206434906, "grad_norm": 0.29962795972824097, "learning_rate": 1.9902195330645084e-06, "loss": 0.0831, "num_input_tokens_seen": 19314304, "step": 28675 }, { "epoch": 0.7006571714753378, "grad_norm": 29.55235481262207, "learning_rate": 1.9902076317026014e-06, "loss": 0.1596, "num_input_tokens_seen": 19317760, "step": 28680 }, { "epoch": 0.7007793223071849, "grad_norm": 16.093425750732422, "learning_rate": 1.990195723139638e-06, "loss": 0.1439, "num_input_tokens_seen": 19321344, "step": 28685 }, { "epoch": 0.7009014731390321, "grad_norm": 38.62454605102539, "learning_rate": 1.990183807375705e-06, "loss": 0.2093, "num_input_tokens_seen": 19325312, "step": 28690 }, { "epoch": 0.7010236239708793, "grad_norm": 1.2527316808700562, "learning_rate": 1.9901718844108894e-06, "loss": 0.1465, "num_input_tokens_seen": 19328896, "step": 28695 }, { "epoch": 0.7011457748027264, "grad_norm": 31.273618698120117, "learning_rate": 1.9901599542452773e-06, "loss": 0.083, "num_input_tokens_seen": 19332480, "step": 28700 }, { "epoch": 0.7012679256345735, "grad_norm": 1.2295677661895752, "learning_rate": 1.9901480168789554e-06, "loss": 0.0821, "num_input_tokens_seen": 19335680, "step": 28705 }, { "epoch": 0.7013900764664207, "grad_norm": 0.519361138343811, "learning_rate": 1.990136072312011e-06, "loss": 0.0284, "num_input_tokens_seen": 19339136, "step": 28710 }, { "epoch": 0.7015122272982679, "grad_norm": 25.611343383789062, "learning_rate": 1.9901241205445313e-06, "loss": 0.197, "num_input_tokens_seen": 19342592, "step": 28715 }, { "epoch": 0.7016343781301151, "grad_norm": 1.822134256362915, "learning_rate": 1.990112161576602e-06, "loss": 0.0998, "num_input_tokens_seen": 19346496, "step": 28720 }, { "epoch": 0.7017565289619623, "grad_norm": 11.004918098449707, "learning_rate": 1.990100195408311e-06, "loss": 0.1877, "num_input_tokens_seen": 19349952, "step": 28725 }, { "epoch": 0.7018786797938094, "grad_norm": 8.324329376220703, "learning_rate": 1.9900882220397454e-06, "loss": 0.1481, "num_input_tokens_seen": 19353408, "step": 28730 }, { "epoch": 0.7020008306256565, "grad_norm": 21.26910400390625, "learning_rate": 1.9900762414709913e-06, "loss": 0.1623, "num_input_tokens_seen": 19357120, "step": 28735 }, { "epoch": 0.7021229814575037, "grad_norm": 24.641504287719727, "learning_rate": 1.990064253702137e-06, "loss": 0.1476, "num_input_tokens_seen": 19360640, "step": 28740 }, { "epoch": 0.7022451322893509, "grad_norm": 100.31071472167969, "learning_rate": 1.990052258733269e-06, "loss": 0.0906, "num_input_tokens_seen": 19364672, "step": 28745 }, { "epoch": 0.702367283121198, "grad_norm": 0.5302855372428894, "learning_rate": 1.9900402565644745e-06, "loss": 0.0672, "num_input_tokens_seen": 19367680, "step": 28750 }, { "epoch": 0.7024894339530452, "grad_norm": 34.36920928955078, "learning_rate": 1.9900282471958413e-06, "loss": 0.1317, "num_input_tokens_seen": 19370624, "step": 28755 }, { "epoch": 0.7026115847848924, "grad_norm": 36.110626220703125, "learning_rate": 1.990016230627456e-06, "loss": 0.1801, "num_input_tokens_seen": 19373568, "step": 28760 }, { "epoch": 0.7027337356167396, "grad_norm": 1.0250879526138306, "learning_rate": 1.9900042068594066e-06, "loss": 0.1105, "num_input_tokens_seen": 19377024, "step": 28765 }, { "epoch": 0.7028558864485868, "grad_norm": 18.993581771850586, "learning_rate": 1.98999217589178e-06, "loss": 0.1156, "num_input_tokens_seen": 19380032, "step": 28770 }, { "epoch": 0.7029780372804338, "grad_norm": 19.046401977539062, "learning_rate": 1.9899801377246645e-06, "loss": 0.1092, "num_input_tokens_seen": 19383680, "step": 28775 }, { "epoch": 0.703100188112281, "grad_norm": 15.642848014831543, "learning_rate": 1.989968092358147e-06, "loss": 0.2078, "num_input_tokens_seen": 19386944, "step": 28780 }, { "epoch": 0.7032223389441282, "grad_norm": 4.28993558883667, "learning_rate": 1.9899560397923154e-06, "loss": 0.1218, "num_input_tokens_seen": 19390272, "step": 28785 }, { "epoch": 0.7033444897759754, "grad_norm": 0.3806248903274536, "learning_rate": 1.9899439800272568e-06, "loss": 0.1744, "num_input_tokens_seen": 19393344, "step": 28790 }, { "epoch": 0.7034666406078225, "grad_norm": 21.923810958862305, "learning_rate": 1.9899319130630597e-06, "loss": 0.0851, "num_input_tokens_seen": 19396992, "step": 28795 }, { "epoch": 0.7035887914396697, "grad_norm": 17.54344367980957, "learning_rate": 1.989919838899811e-06, "loss": 0.1174, "num_input_tokens_seen": 19400320, "step": 28800 }, { "epoch": 0.7037109422715169, "grad_norm": 0.955232560634613, "learning_rate": 1.9899077575376e-06, "loss": 0.1332, "num_input_tokens_seen": 19404032, "step": 28805 }, { "epoch": 0.7038330931033641, "grad_norm": 23.174301147460938, "learning_rate": 1.9898956689765127e-06, "loss": 0.1948, "num_input_tokens_seen": 19407360, "step": 28810 }, { "epoch": 0.7039552439352113, "grad_norm": 7.8368072509765625, "learning_rate": 1.989883573216638e-06, "loss": 0.0506, "num_input_tokens_seen": 19410560, "step": 28815 }, { "epoch": 0.7040773947670583, "grad_norm": 26.670555114746094, "learning_rate": 1.9898714702580637e-06, "loss": 0.0926, "num_input_tokens_seen": 19413760, "step": 28820 }, { "epoch": 0.7041995455989055, "grad_norm": 21.16333770751953, "learning_rate": 1.9898593601008776e-06, "loss": 0.1177, "num_input_tokens_seen": 19417216, "step": 28825 }, { "epoch": 0.7043216964307527, "grad_norm": 30.32469940185547, "learning_rate": 1.9898472427451684e-06, "loss": 0.0938, "num_input_tokens_seen": 19420480, "step": 28830 }, { "epoch": 0.7044438472625999, "grad_norm": 25.04443359375, "learning_rate": 1.989835118191024e-06, "loss": 0.197, "num_input_tokens_seen": 19424000, "step": 28835 }, { "epoch": 0.704565998094447, "grad_norm": 1.6711410284042358, "learning_rate": 1.989822986438532e-06, "loss": 0.0714, "num_input_tokens_seen": 19427648, "step": 28840 }, { "epoch": 0.7046881489262942, "grad_norm": 31.52394676208496, "learning_rate": 1.9898108474877805e-06, "loss": 0.0972, "num_input_tokens_seen": 19430912, "step": 28845 }, { "epoch": 0.7048102997581414, "grad_norm": 20.945629119873047, "learning_rate": 1.989798701338859e-06, "loss": 0.1386, "num_input_tokens_seen": 19434432, "step": 28850 }, { "epoch": 0.7049324505899885, "grad_norm": 11.488555908203125, "learning_rate": 1.989786547991855e-06, "loss": 0.2156, "num_input_tokens_seen": 19437824, "step": 28855 }, { "epoch": 0.7050546014218357, "grad_norm": 7.017218112945557, "learning_rate": 1.989774387446857e-06, "loss": 0.1754, "num_input_tokens_seen": 19441344, "step": 28860 }, { "epoch": 0.7051767522536828, "grad_norm": 0.6078383326530457, "learning_rate": 1.9897622197039533e-06, "loss": 0.0371, "num_input_tokens_seen": 19445056, "step": 28865 }, { "epoch": 0.70529890308553, "grad_norm": 8.64908504486084, "learning_rate": 1.9897500447632326e-06, "loss": 0.1202, "num_input_tokens_seen": 19448192, "step": 28870 }, { "epoch": 0.7054210539173772, "grad_norm": 16.532529830932617, "learning_rate": 1.9897378626247835e-06, "loss": 0.1468, "num_input_tokens_seen": 19451776, "step": 28875 }, { "epoch": 0.7055432047492244, "grad_norm": 1.9767354726791382, "learning_rate": 1.9897256732886943e-06, "loss": 0.0516, "num_input_tokens_seen": 19454848, "step": 28880 }, { "epoch": 0.7056653555810715, "grad_norm": 18.877872467041016, "learning_rate": 1.989713476755054e-06, "loss": 0.1063, "num_input_tokens_seen": 19458624, "step": 28885 }, { "epoch": 0.7057875064129187, "grad_norm": 6.534497261047363, "learning_rate": 1.9897012730239508e-06, "loss": 0.0761, "num_input_tokens_seen": 19461760, "step": 28890 }, { "epoch": 0.7059096572447658, "grad_norm": 12.019935607910156, "learning_rate": 1.989689062095474e-06, "loss": 0.1534, "num_input_tokens_seen": 19465600, "step": 28895 }, { "epoch": 0.706031808076613, "grad_norm": 6.496730327606201, "learning_rate": 1.989676843969712e-06, "loss": 0.0526, "num_input_tokens_seen": 19469504, "step": 28900 }, { "epoch": 0.7061539589084602, "grad_norm": 0.14786797761917114, "learning_rate": 1.9896646186467537e-06, "loss": 0.1339, "num_input_tokens_seen": 19472192, "step": 28905 }, { "epoch": 0.7062761097403073, "grad_norm": 14.278701782226562, "learning_rate": 1.9896523861266882e-06, "loss": 0.1101, "num_input_tokens_seen": 19476096, "step": 28910 }, { "epoch": 0.7063982605721545, "grad_norm": 13.4442720413208, "learning_rate": 1.9896401464096045e-06, "loss": 0.0629, "num_input_tokens_seen": 19479232, "step": 28915 }, { "epoch": 0.7065204114040017, "grad_norm": 17.962377548217773, "learning_rate": 1.9896278994955914e-06, "loss": 0.1961, "num_input_tokens_seen": 19482112, "step": 28920 }, { "epoch": 0.7066425622358489, "grad_norm": 14.128437042236328, "learning_rate": 1.9896156453847383e-06, "loss": 0.0798, "num_input_tokens_seen": 19485376, "step": 28925 }, { "epoch": 0.706764713067696, "grad_norm": 4.156303882598877, "learning_rate": 1.9896033840771333e-06, "loss": 0.0248, "num_input_tokens_seen": 19488768, "step": 28930 }, { "epoch": 0.7068868638995431, "grad_norm": 9.4829740524292, "learning_rate": 1.989591115572867e-06, "loss": 0.1189, "num_input_tokens_seen": 19492160, "step": 28935 }, { "epoch": 0.7070090147313903, "grad_norm": 4.698817729949951, "learning_rate": 1.9895788398720276e-06, "loss": 0.0783, "num_input_tokens_seen": 19495616, "step": 28940 }, { "epoch": 0.7071311655632375, "grad_norm": 17.0122013092041, "learning_rate": 1.9895665569747047e-06, "loss": 0.0378, "num_input_tokens_seen": 19498880, "step": 28945 }, { "epoch": 0.7072533163950846, "grad_norm": 15.190747261047363, "learning_rate": 1.989554266880988e-06, "loss": 0.0813, "num_input_tokens_seen": 19501952, "step": 28950 }, { "epoch": 0.7073754672269318, "grad_norm": 28.33185386657715, "learning_rate": 1.9895419695909663e-06, "loss": 0.0586, "num_input_tokens_seen": 19505792, "step": 28955 }, { "epoch": 0.707497618058779, "grad_norm": 6.6051225662231445, "learning_rate": 1.989529665104729e-06, "loss": 0.0589, "num_input_tokens_seen": 19509504, "step": 28960 }, { "epoch": 0.7076197688906262, "grad_norm": 10.518942832946777, "learning_rate": 1.989517353422366e-06, "loss": 0.1812, "num_input_tokens_seen": 19512640, "step": 28965 }, { "epoch": 0.7077419197224734, "grad_norm": 24.557565689086914, "learning_rate": 1.989505034543967e-06, "loss": 0.1463, "num_input_tokens_seen": 19516032, "step": 28970 }, { "epoch": 0.7078640705543204, "grad_norm": 36.49713897705078, "learning_rate": 1.989492708469621e-06, "loss": 0.191, "num_input_tokens_seen": 19519296, "step": 28975 }, { "epoch": 0.7079862213861676, "grad_norm": 1.5474926233291626, "learning_rate": 1.9894803751994176e-06, "loss": 0.1693, "num_input_tokens_seen": 19522688, "step": 28980 }, { "epoch": 0.7081083722180148, "grad_norm": 0.5776304006576538, "learning_rate": 1.989468034733447e-06, "loss": 0.0771, "num_input_tokens_seen": 19526720, "step": 28985 }, { "epoch": 0.708230523049862, "grad_norm": 4.457348346710205, "learning_rate": 1.989455687071799e-06, "loss": 0.1575, "num_input_tokens_seen": 19529856, "step": 28990 }, { "epoch": 0.7083526738817091, "grad_norm": 0.3430345952510834, "learning_rate": 1.9894433322145624e-06, "loss": 0.0408, "num_input_tokens_seen": 19533824, "step": 28995 }, { "epoch": 0.7084748247135563, "grad_norm": 1.5313557386398315, "learning_rate": 1.9894309701618285e-06, "loss": 0.075, "num_input_tokens_seen": 19537216, "step": 29000 }, { "epoch": 0.7085969755454035, "grad_norm": 19.49930191040039, "learning_rate": 1.989418600913686e-06, "loss": 0.1021, "num_input_tokens_seen": 19540288, "step": 29005 }, { "epoch": 0.7087191263772507, "grad_norm": 23.722192764282227, "learning_rate": 1.9894062244702258e-06, "loss": 0.0327, "num_input_tokens_seen": 19544064, "step": 29010 }, { "epoch": 0.7088412772090978, "grad_norm": 3.8661394119262695, "learning_rate": 1.989393840831537e-06, "loss": 0.089, "num_input_tokens_seen": 19547648, "step": 29015 }, { "epoch": 0.7089634280409449, "grad_norm": 13.596760749816895, "learning_rate": 1.98938144999771e-06, "loss": 0.1718, "num_input_tokens_seen": 19551488, "step": 29020 }, { "epoch": 0.7090855788727921, "grad_norm": 33.69784927368164, "learning_rate": 1.989369051968835e-06, "loss": 0.1411, "num_input_tokens_seen": 19554560, "step": 29025 }, { "epoch": 0.7092077297046393, "grad_norm": 21.88158416748047, "learning_rate": 1.9893566467450024e-06, "loss": 0.1518, "num_input_tokens_seen": 19557952, "step": 29030 }, { "epoch": 0.7093298805364865, "grad_norm": 2.869716167449951, "learning_rate": 1.989344234326302e-06, "loss": 0.0834, "num_input_tokens_seen": 19561088, "step": 29035 }, { "epoch": 0.7094520313683336, "grad_norm": 10.745006561279297, "learning_rate": 1.989331814712824e-06, "loss": 0.2314, "num_input_tokens_seen": 19564800, "step": 29040 }, { "epoch": 0.7095741822001808, "grad_norm": 30.001081466674805, "learning_rate": 1.9893193879046594e-06, "loss": 0.1122, "num_input_tokens_seen": 19568384, "step": 29045 }, { "epoch": 0.709696333032028, "grad_norm": 30.0404109954834, "learning_rate": 1.989306953901898e-06, "loss": 0.1931, "num_input_tokens_seen": 19571456, "step": 29050 }, { "epoch": 0.7098184838638751, "grad_norm": 43.12938690185547, "learning_rate": 1.9892945127046304e-06, "loss": 0.1065, "num_input_tokens_seen": 19575040, "step": 29055 }, { "epoch": 0.7099406346957223, "grad_norm": 27.3835506439209, "learning_rate": 1.989282064312947e-06, "loss": 0.0938, "num_input_tokens_seen": 19577920, "step": 29060 }, { "epoch": 0.7100627855275694, "grad_norm": 19.477853775024414, "learning_rate": 1.989269608726938e-06, "loss": 0.0894, "num_input_tokens_seen": 19581312, "step": 29065 }, { "epoch": 0.7101849363594166, "grad_norm": 23.02320098876953, "learning_rate": 1.9892571459466945e-06, "loss": 0.1275, "num_input_tokens_seen": 19584640, "step": 29070 }, { "epoch": 0.7103070871912638, "grad_norm": 20.052583694458008, "learning_rate": 1.9892446759723073e-06, "loss": 0.1724, "num_input_tokens_seen": 19588224, "step": 29075 }, { "epoch": 0.710429238023111, "grad_norm": 3.1162588596343994, "learning_rate": 1.989232198803866e-06, "loss": 0.1144, "num_input_tokens_seen": 19591744, "step": 29080 }, { "epoch": 0.710551388854958, "grad_norm": 15.635176658630371, "learning_rate": 1.9892197144414627e-06, "loss": 0.0349, "num_input_tokens_seen": 19595136, "step": 29085 }, { "epoch": 0.7106735396868052, "grad_norm": 3.7856173515319824, "learning_rate": 1.9892072228851876e-06, "loss": 0.0683, "num_input_tokens_seen": 19598464, "step": 29090 }, { "epoch": 0.7107956905186524, "grad_norm": 15.923710823059082, "learning_rate": 1.9891947241351313e-06, "loss": 0.0898, "num_input_tokens_seen": 19602048, "step": 29095 }, { "epoch": 0.7109178413504996, "grad_norm": 31.128541946411133, "learning_rate": 1.989182218191385e-06, "loss": 0.1653, "num_input_tokens_seen": 19605696, "step": 29100 }, { "epoch": 0.7110399921823468, "grad_norm": 31.127042770385742, "learning_rate": 1.9891697050540395e-06, "loss": 0.0267, "num_input_tokens_seen": 19608896, "step": 29105 }, { "epoch": 0.7111621430141939, "grad_norm": 19.851909637451172, "learning_rate": 1.9891571847231858e-06, "loss": 0.1332, "num_input_tokens_seen": 19612672, "step": 29110 }, { "epoch": 0.7112842938460411, "grad_norm": 27.556503295898438, "learning_rate": 1.989144657198915e-06, "loss": 0.185, "num_input_tokens_seen": 19616768, "step": 29115 }, { "epoch": 0.7114064446778883, "grad_norm": 1.0737075805664062, "learning_rate": 1.989132122481318e-06, "loss": 0.0358, "num_input_tokens_seen": 19619776, "step": 29120 }, { "epoch": 0.7115285955097355, "grad_norm": 67.04922485351562, "learning_rate": 1.9891195805704865e-06, "loss": 0.1996, "num_input_tokens_seen": 19623040, "step": 29125 }, { "epoch": 0.7116507463415825, "grad_norm": 9.727737426757812, "learning_rate": 1.9891070314665114e-06, "loss": 0.1877, "num_input_tokens_seen": 19625984, "step": 29130 }, { "epoch": 0.7117728971734297, "grad_norm": 0.47230786085128784, "learning_rate": 1.9890944751694838e-06, "loss": 0.1285, "num_input_tokens_seen": 19630272, "step": 29135 }, { "epoch": 0.7118950480052769, "grad_norm": 12.917232513427734, "learning_rate": 1.989081911679495e-06, "loss": 0.1313, "num_input_tokens_seen": 19633472, "step": 29140 }, { "epoch": 0.7120171988371241, "grad_norm": 0.19916245341300964, "learning_rate": 1.9890693409966366e-06, "loss": 0.1579, "num_input_tokens_seen": 19636416, "step": 29145 }, { "epoch": 0.7121393496689713, "grad_norm": 12.989840507507324, "learning_rate": 1.9890567631209996e-06, "loss": 0.1654, "num_input_tokens_seen": 19639616, "step": 29150 }, { "epoch": 0.7122615005008184, "grad_norm": 15.532271385192871, "learning_rate": 1.9890441780526764e-06, "loss": 0.2201, "num_input_tokens_seen": 19643456, "step": 29155 }, { "epoch": 0.7123836513326656, "grad_norm": 31.146207809448242, "learning_rate": 1.9890315857917577e-06, "loss": 0.0386, "num_input_tokens_seen": 19646784, "step": 29160 }, { "epoch": 0.7125058021645128, "grad_norm": 21.65290069580078, "learning_rate": 1.9890189863383354e-06, "loss": 0.1972, "num_input_tokens_seen": 19650112, "step": 29165 }, { "epoch": 0.71262795299636, "grad_norm": 13.02177906036377, "learning_rate": 1.9890063796925006e-06, "loss": 0.1494, "num_input_tokens_seen": 19653312, "step": 29170 }, { "epoch": 0.712750103828207, "grad_norm": 22.124685287475586, "learning_rate": 1.988993765854346e-06, "loss": 0.2056, "num_input_tokens_seen": 19656576, "step": 29175 }, { "epoch": 0.7128722546600542, "grad_norm": 1.5387930870056152, "learning_rate": 1.9889811448239625e-06, "loss": 0.1209, "num_input_tokens_seen": 19659520, "step": 29180 }, { "epoch": 0.7129944054919014, "grad_norm": 15.83926773071289, "learning_rate": 1.9889685166014417e-06, "loss": 0.1168, "num_input_tokens_seen": 19662912, "step": 29185 }, { "epoch": 0.7131165563237486, "grad_norm": 4.195895195007324, "learning_rate": 1.988955881186876e-06, "loss": 0.102, "num_input_tokens_seen": 19666176, "step": 29190 }, { "epoch": 0.7132387071555957, "grad_norm": 11.080657005310059, "learning_rate": 1.9889432385803574e-06, "loss": 0.0322, "num_input_tokens_seen": 19669888, "step": 29195 }, { "epoch": 0.7133608579874429, "grad_norm": 16.482271194458008, "learning_rate": 1.9889305887819776e-06, "loss": 0.1683, "num_input_tokens_seen": 19672960, "step": 29200 }, { "epoch": 0.71348300881929, "grad_norm": 15.85693359375, "learning_rate": 1.9889179317918285e-06, "loss": 0.1291, "num_input_tokens_seen": 19676224, "step": 29205 }, { "epoch": 0.7136051596511372, "grad_norm": 1.0122528076171875, "learning_rate": 1.988905267610002e-06, "loss": 0.0831, "num_input_tokens_seen": 19680000, "step": 29210 }, { "epoch": 0.7137273104829844, "grad_norm": 28.881397247314453, "learning_rate": 1.9888925962365907e-06, "loss": 0.083, "num_input_tokens_seen": 19683328, "step": 29215 }, { "epoch": 0.7138494613148315, "grad_norm": 18.924196243286133, "learning_rate": 1.9888799176716866e-06, "loss": 0.1372, "num_input_tokens_seen": 19686592, "step": 29220 }, { "epoch": 0.7139716121466787, "grad_norm": 2.5882859230041504, "learning_rate": 1.988867231915381e-06, "loss": 0.1241, "num_input_tokens_seen": 19689728, "step": 29225 }, { "epoch": 0.7140937629785259, "grad_norm": 39.74570846557617, "learning_rate": 1.9888545389677675e-06, "loss": 0.1089, "num_input_tokens_seen": 19692864, "step": 29230 }, { "epoch": 0.7142159138103731, "grad_norm": 8.0762939453125, "learning_rate": 1.9888418388289376e-06, "loss": 0.1428, "num_input_tokens_seen": 19695872, "step": 29235 }, { "epoch": 0.7143380646422202, "grad_norm": 1.500272512435913, "learning_rate": 1.988829131498984e-06, "loss": 0.0693, "num_input_tokens_seen": 19699328, "step": 29240 }, { "epoch": 0.7144602154740674, "grad_norm": 3.4893500804901123, "learning_rate": 1.9888164169779992e-06, "loss": 0.0795, "num_input_tokens_seen": 19702848, "step": 29245 }, { "epoch": 0.7145823663059145, "grad_norm": 20.037160873413086, "learning_rate": 1.9888036952660754e-06, "loss": 0.0835, "num_input_tokens_seen": 19706048, "step": 29250 }, { "epoch": 0.7147045171377617, "grad_norm": 0.3974679112434387, "learning_rate": 1.9887909663633047e-06, "loss": 0.0817, "num_input_tokens_seen": 19709440, "step": 29255 }, { "epoch": 0.7148266679696089, "grad_norm": 15.502142906188965, "learning_rate": 1.9887782302697803e-06, "loss": 0.0933, "num_input_tokens_seen": 19712704, "step": 29260 }, { "epoch": 0.714948818801456, "grad_norm": 1.3267139196395874, "learning_rate": 1.988765486985595e-06, "loss": 0.0394, "num_input_tokens_seen": 19715968, "step": 29265 }, { "epoch": 0.7150709696333032, "grad_norm": 2.264620542526245, "learning_rate": 1.988752736510841e-06, "loss": 0.0985, "num_input_tokens_seen": 19718784, "step": 29270 }, { "epoch": 0.7151931204651504, "grad_norm": 33.954647064208984, "learning_rate": 1.9887399788456113e-06, "loss": 0.1016, "num_input_tokens_seen": 19722368, "step": 29275 }, { "epoch": 0.7153152712969976, "grad_norm": 10.680964469909668, "learning_rate": 1.988727213989998e-06, "loss": 0.1463, "num_input_tokens_seen": 19726016, "step": 29280 }, { "epoch": 0.7154374221288446, "grad_norm": 8.014860153198242, "learning_rate": 1.9887144419440948e-06, "loss": 0.2273, "num_input_tokens_seen": 19729280, "step": 29285 }, { "epoch": 0.7155595729606918, "grad_norm": 11.822001457214355, "learning_rate": 1.9887016627079946e-06, "loss": 0.0731, "num_input_tokens_seen": 19732736, "step": 29290 }, { "epoch": 0.715681723792539, "grad_norm": 23.019365310668945, "learning_rate": 1.9886888762817897e-06, "loss": 0.1675, "num_input_tokens_seen": 19736832, "step": 29295 }, { "epoch": 0.7158038746243862, "grad_norm": 1.2372710704803467, "learning_rate": 1.988676082665573e-06, "loss": 0.1408, "num_input_tokens_seen": 19740224, "step": 29300 }, { "epoch": 0.7159260254562334, "grad_norm": 9.24066162109375, "learning_rate": 1.9886632818594384e-06, "loss": 0.1125, "num_input_tokens_seen": 19743872, "step": 29305 }, { "epoch": 0.7160481762880805, "grad_norm": 14.921540260314941, "learning_rate": 1.988650473863478e-06, "loss": 0.0717, "num_input_tokens_seen": 19747136, "step": 29310 }, { "epoch": 0.7161703271199277, "grad_norm": 1.0777639150619507, "learning_rate": 1.988637658677786e-06, "loss": 0.0765, "num_input_tokens_seen": 19750336, "step": 29315 }, { "epoch": 0.7162924779517749, "grad_norm": 18.06236457824707, "learning_rate": 1.9886248363024545e-06, "loss": 0.1244, "num_input_tokens_seen": 19754112, "step": 29320 }, { "epoch": 0.716414628783622, "grad_norm": 0.44852542877197266, "learning_rate": 1.9886120067375777e-06, "loss": 0.0769, "num_input_tokens_seen": 19757376, "step": 29325 }, { "epoch": 0.7165367796154691, "grad_norm": 3.454195022583008, "learning_rate": 1.9885991699832483e-06, "loss": 0.0164, "num_input_tokens_seen": 19760576, "step": 29330 }, { "epoch": 0.7166589304473163, "grad_norm": 26.183717727661133, "learning_rate": 1.98858632603956e-06, "loss": 0.1532, "num_input_tokens_seen": 19763776, "step": 29335 }, { "epoch": 0.7167810812791635, "grad_norm": 10.307730674743652, "learning_rate": 1.988573474906606e-06, "loss": 0.1228, "num_input_tokens_seen": 19767872, "step": 29340 }, { "epoch": 0.7169032321110107, "grad_norm": 26.782163619995117, "learning_rate": 1.9885606165844796e-06, "loss": 0.2179, "num_input_tokens_seen": 19771200, "step": 29345 }, { "epoch": 0.7170253829428579, "grad_norm": 11.243006706237793, "learning_rate": 1.9885477510732745e-06, "loss": 0.0435, "num_input_tokens_seen": 19774400, "step": 29350 }, { "epoch": 0.717147533774705, "grad_norm": 17.485429763793945, "learning_rate": 1.9885348783730843e-06, "loss": 0.1557, "num_input_tokens_seen": 19778368, "step": 29355 }, { "epoch": 0.7172696846065522, "grad_norm": 30.79226303100586, "learning_rate": 1.9885219984840027e-06, "loss": 0.0719, "num_input_tokens_seen": 19781824, "step": 29360 }, { "epoch": 0.7173918354383994, "grad_norm": 14.088363647460938, "learning_rate": 1.9885091114061233e-06, "loss": 0.0791, "num_input_tokens_seen": 19785152, "step": 29365 }, { "epoch": 0.7175139862702465, "grad_norm": 0.36788198351860046, "learning_rate": 1.9884962171395396e-06, "loss": 0.0856, "num_input_tokens_seen": 19788288, "step": 29370 }, { "epoch": 0.7176361371020936, "grad_norm": 5.974476337432861, "learning_rate": 1.9884833156843457e-06, "loss": 0.0835, "num_input_tokens_seen": 19792192, "step": 29375 }, { "epoch": 0.7177582879339408, "grad_norm": 16.720901489257812, "learning_rate": 1.988470407040635e-06, "loss": 0.2071, "num_input_tokens_seen": 19795520, "step": 29380 }, { "epoch": 0.717880438765788, "grad_norm": 0.36717942357063293, "learning_rate": 1.988457491208502e-06, "loss": 0.0197, "num_input_tokens_seen": 19798720, "step": 29385 }, { "epoch": 0.7180025895976352, "grad_norm": 2.1908514499664307, "learning_rate": 1.9884445681880402e-06, "loss": 0.0873, "num_input_tokens_seen": 19801792, "step": 29390 }, { "epoch": 0.7181247404294824, "grad_norm": 26.066774368286133, "learning_rate": 1.9884316379793435e-06, "loss": 0.1053, "num_input_tokens_seen": 19804928, "step": 29395 }, { "epoch": 0.7182468912613295, "grad_norm": 0.9114089608192444, "learning_rate": 1.9884187005825058e-06, "loss": 0.0368, "num_input_tokens_seen": 19808576, "step": 29400 }, { "epoch": 0.7183690420931766, "grad_norm": 30.349382400512695, "learning_rate": 1.988405755997622e-06, "loss": 0.0519, "num_input_tokens_seen": 19812288, "step": 29405 }, { "epoch": 0.7184911929250238, "grad_norm": 1.4230334758758545, "learning_rate": 1.9883928042247856e-06, "loss": 0.1122, "num_input_tokens_seen": 19815744, "step": 29410 }, { "epoch": 0.718613343756871, "grad_norm": 11.513659477233887, "learning_rate": 1.9883798452640904e-06, "loss": 0.1189, "num_input_tokens_seen": 19818880, "step": 29415 }, { "epoch": 0.7187354945887181, "grad_norm": 31.712440490722656, "learning_rate": 1.9883668791156316e-06, "loss": 0.3849, "num_input_tokens_seen": 19821952, "step": 29420 }, { "epoch": 0.7188576454205653, "grad_norm": 37.41859817504883, "learning_rate": 1.988353905779503e-06, "loss": 0.1492, "num_input_tokens_seen": 19825024, "step": 29425 }, { "epoch": 0.7189797962524125, "grad_norm": 29.80327796936035, "learning_rate": 1.9883409252557987e-06, "loss": 0.1241, "num_input_tokens_seen": 19828288, "step": 29430 }, { "epoch": 0.7191019470842597, "grad_norm": 5.770595550537109, "learning_rate": 1.9883279375446135e-06, "loss": 0.2149, "num_input_tokens_seen": 19831552, "step": 29435 }, { "epoch": 0.7192240979161069, "grad_norm": 10.977890014648438, "learning_rate": 1.9883149426460416e-06, "loss": 0.125, "num_input_tokens_seen": 19834688, "step": 29440 }, { "epoch": 0.7193462487479539, "grad_norm": 6.461169719696045, "learning_rate": 1.9883019405601775e-06, "loss": 0.0197, "num_input_tokens_seen": 19837952, "step": 29445 }, { "epoch": 0.7194683995798011, "grad_norm": 28.704113006591797, "learning_rate": 1.988288931287116e-06, "loss": 0.1573, "num_input_tokens_seen": 19841216, "step": 29450 }, { "epoch": 0.7195905504116483, "grad_norm": 0.888616681098938, "learning_rate": 1.9882759148269517e-06, "loss": 0.1007, "num_input_tokens_seen": 19844864, "step": 29455 }, { "epoch": 0.7197127012434955, "grad_norm": 7.894357204437256, "learning_rate": 1.988262891179779e-06, "loss": 0.075, "num_input_tokens_seen": 19848000, "step": 29460 }, { "epoch": 0.7198348520753426, "grad_norm": 25.15194320678711, "learning_rate": 1.988249860345693e-06, "loss": 0.0291, "num_input_tokens_seen": 19851456, "step": 29465 }, { "epoch": 0.7199570029071898, "grad_norm": 16.579662322998047, "learning_rate": 1.9882368223247883e-06, "loss": 0.1826, "num_input_tokens_seen": 19854656, "step": 29470 }, { "epoch": 0.720079153739037, "grad_norm": 0.5325263142585754, "learning_rate": 1.988223777117159e-06, "loss": 0.1084, "num_input_tokens_seen": 19857472, "step": 29475 }, { "epoch": 0.7202013045708842, "grad_norm": 24.2345027923584, "learning_rate": 1.988210724722901e-06, "loss": 0.104, "num_input_tokens_seen": 19861184, "step": 29480 }, { "epoch": 0.7203234554027312, "grad_norm": 2.030306339263916, "learning_rate": 1.988197665142109e-06, "loss": 0.1386, "num_input_tokens_seen": 19864640, "step": 29485 }, { "epoch": 0.7204456062345784, "grad_norm": 21.660234451293945, "learning_rate": 1.9881845983748774e-06, "loss": 0.1442, "num_input_tokens_seen": 19868160, "step": 29490 }, { "epoch": 0.7205677570664256, "grad_norm": 15.106928825378418, "learning_rate": 1.988171524421302e-06, "loss": 0.16, "num_input_tokens_seen": 19871552, "step": 29495 }, { "epoch": 0.7206899078982728, "grad_norm": 0.21059578657150269, "learning_rate": 1.9881584432814767e-06, "loss": 0.1296, "num_input_tokens_seen": 19874688, "step": 29500 }, { "epoch": 0.72081205873012, "grad_norm": 12.154464721679688, "learning_rate": 1.988145354955498e-06, "loss": 0.103, "num_input_tokens_seen": 19878592, "step": 29505 }, { "epoch": 0.7209342095619671, "grad_norm": 18.481788635253906, "learning_rate": 1.9881322594434606e-06, "loss": 0.1291, "num_input_tokens_seen": 19881792, "step": 29510 }, { "epoch": 0.7210563603938143, "grad_norm": 0.16375590860843658, "learning_rate": 1.9881191567454594e-06, "loss": 0.096, "num_input_tokens_seen": 19884800, "step": 29515 }, { "epoch": 0.7211785112256615, "grad_norm": 22.23729133605957, "learning_rate": 1.98810604686159e-06, "loss": 0.195, "num_input_tokens_seen": 19887808, "step": 29520 }, { "epoch": 0.7213006620575086, "grad_norm": 14.942244529724121, "learning_rate": 1.9880929297919476e-06, "loss": 0.0749, "num_input_tokens_seen": 19890816, "step": 29525 }, { "epoch": 0.7214228128893557, "grad_norm": 3.9260544776916504, "learning_rate": 1.988079805536628e-06, "loss": 0.179, "num_input_tokens_seen": 19894336, "step": 29530 }, { "epoch": 0.7215449637212029, "grad_norm": 7.509902000427246, "learning_rate": 1.988066674095726e-06, "loss": 0.1218, "num_input_tokens_seen": 19897728, "step": 29535 }, { "epoch": 0.7216671145530501, "grad_norm": 10.838415145874023, "learning_rate": 1.988053535469337e-06, "loss": 0.0808, "num_input_tokens_seen": 19901056, "step": 29540 }, { "epoch": 0.7217892653848973, "grad_norm": 17.19542694091797, "learning_rate": 1.9880403896575573e-06, "loss": 0.1164, "num_input_tokens_seen": 19904384, "step": 29545 }, { "epoch": 0.7219114162167445, "grad_norm": 27.95970916748047, "learning_rate": 1.9880272366604824e-06, "loss": 0.141, "num_input_tokens_seen": 19907904, "step": 29550 }, { "epoch": 0.7220335670485916, "grad_norm": 13.673565864562988, "learning_rate": 1.9880140764782074e-06, "loss": 0.1567, "num_input_tokens_seen": 19911296, "step": 29555 }, { "epoch": 0.7221557178804388, "grad_norm": 42.8197135925293, "learning_rate": 1.9880009091108284e-06, "loss": 0.1494, "num_input_tokens_seen": 19914496, "step": 29560 }, { "epoch": 0.7222778687122859, "grad_norm": 12.1239652633667, "learning_rate": 1.9879877345584412e-06, "loss": 0.0898, "num_input_tokens_seen": 19917760, "step": 29565 }, { "epoch": 0.7224000195441331, "grad_norm": 29.40520668029785, "learning_rate": 1.987974552821141e-06, "loss": 0.0657, "num_input_tokens_seen": 19921216, "step": 29570 }, { "epoch": 0.7225221703759802, "grad_norm": 16.474172592163086, "learning_rate": 1.9879613638990247e-06, "loss": 0.0873, "num_input_tokens_seen": 19924864, "step": 29575 }, { "epoch": 0.7226443212078274, "grad_norm": 8.598320007324219, "learning_rate": 1.987948167792187e-06, "loss": 0.1493, "num_input_tokens_seen": 19928064, "step": 29580 }, { "epoch": 0.7227664720396746, "grad_norm": 9.523024559020996, "learning_rate": 1.9879349645007246e-06, "loss": 0.0707, "num_input_tokens_seen": 19931584, "step": 29585 }, { "epoch": 0.7228886228715218, "grad_norm": 22.509151458740234, "learning_rate": 1.9879217540247338e-06, "loss": 0.195, "num_input_tokens_seen": 19935040, "step": 29590 }, { "epoch": 0.723010773703369, "grad_norm": 19.896160125732422, "learning_rate": 1.9879085363643102e-06, "loss": 0.1682, "num_input_tokens_seen": 19938560, "step": 29595 }, { "epoch": 0.723132924535216, "grad_norm": 1.9905105829238892, "learning_rate": 1.9878953115195498e-06, "loss": 0.0726, "num_input_tokens_seen": 19941696, "step": 29600 }, { "epoch": 0.7232550753670632, "grad_norm": 13.280790328979492, "learning_rate": 1.987882079490549e-06, "loss": 0.1861, "num_input_tokens_seen": 19945984, "step": 29605 }, { "epoch": 0.7233772261989104, "grad_norm": 4.532742977142334, "learning_rate": 1.9878688402774042e-06, "loss": 0.0521, "num_input_tokens_seen": 19949184, "step": 29610 }, { "epoch": 0.7234993770307576, "grad_norm": 23.19292449951172, "learning_rate": 1.9878555938802115e-06, "loss": 0.0757, "num_input_tokens_seen": 19952320, "step": 29615 }, { "epoch": 0.7236215278626047, "grad_norm": 0.6940127015113831, "learning_rate": 1.987842340299067e-06, "loss": 0.0802, "num_input_tokens_seen": 19955392, "step": 29620 }, { "epoch": 0.7237436786944519, "grad_norm": 0.5754919648170471, "learning_rate": 1.9878290795340674e-06, "loss": 0.1038, "num_input_tokens_seen": 19958912, "step": 29625 }, { "epoch": 0.7238658295262991, "grad_norm": 6.234220027923584, "learning_rate": 1.9878158115853088e-06, "loss": 0.0661, "num_input_tokens_seen": 19962368, "step": 29630 }, { "epoch": 0.7239879803581463, "grad_norm": 26.875852584838867, "learning_rate": 1.9878025364528883e-06, "loss": 0.1026, "num_input_tokens_seen": 19965760, "step": 29635 }, { "epoch": 0.7241101311899935, "grad_norm": 23.62150764465332, "learning_rate": 1.987789254136902e-06, "loss": 0.1179, "num_input_tokens_seen": 19969216, "step": 29640 }, { "epoch": 0.7242322820218405, "grad_norm": 0.09608346223831177, "learning_rate": 1.987775964637447e-06, "loss": 0.1176, "num_input_tokens_seen": 19972800, "step": 29645 }, { "epoch": 0.7243544328536877, "grad_norm": 0.43758726119995117, "learning_rate": 1.9877626679546185e-06, "loss": 0.0449, "num_input_tokens_seen": 19976192, "step": 29650 }, { "epoch": 0.7244765836855349, "grad_norm": 18.002891540527344, "learning_rate": 1.987749364088515e-06, "loss": 0.073, "num_input_tokens_seen": 19979648, "step": 29655 }, { "epoch": 0.7245987345173821, "grad_norm": 19.831453323364258, "learning_rate": 1.987736053039232e-06, "loss": 0.1078, "num_input_tokens_seen": 19983296, "step": 29660 }, { "epoch": 0.7247208853492292, "grad_norm": 11.848029136657715, "learning_rate": 1.987722734806867e-06, "loss": 0.1702, "num_input_tokens_seen": 19986944, "step": 29665 }, { "epoch": 0.7248430361810764, "grad_norm": 0.2218346893787384, "learning_rate": 1.9877094093915166e-06, "loss": 0.1022, "num_input_tokens_seen": 19990528, "step": 29670 }, { "epoch": 0.7249651870129236, "grad_norm": 6.095734596252441, "learning_rate": 1.9876960767932775e-06, "loss": 0.0221, "num_input_tokens_seen": 19993664, "step": 29675 }, { "epoch": 0.7250873378447708, "grad_norm": 35.75461196899414, "learning_rate": 1.9876827370122472e-06, "loss": 0.1655, "num_input_tokens_seen": 19997440, "step": 29680 }, { "epoch": 0.7252094886766179, "grad_norm": 54.55160903930664, "learning_rate": 1.987669390048522e-06, "loss": 0.1892, "num_input_tokens_seen": 20000704, "step": 29685 }, { "epoch": 0.725331639508465, "grad_norm": 0.1276654452085495, "learning_rate": 1.9876560359021997e-06, "loss": 0.0763, "num_input_tokens_seen": 20003904, "step": 29690 }, { "epoch": 0.7254537903403122, "grad_norm": 53.620574951171875, "learning_rate": 1.9876426745733768e-06, "loss": 0.1608, "num_input_tokens_seen": 20007040, "step": 29695 }, { "epoch": 0.7255759411721594, "grad_norm": 5.176275730133057, "learning_rate": 1.9876293060621507e-06, "loss": 0.1339, "num_input_tokens_seen": 20010496, "step": 29700 }, { "epoch": 0.7256980920040066, "grad_norm": 0.024253157898783684, "learning_rate": 1.987615930368619e-06, "loss": 0.0026, "num_input_tokens_seen": 20013696, "step": 29705 }, { "epoch": 0.7258202428358537, "grad_norm": 70.7147445678711, "learning_rate": 1.987602547492878e-06, "loss": 0.0895, "num_input_tokens_seen": 20016960, "step": 29710 }, { "epoch": 0.7259423936677009, "grad_norm": 24.393190383911133, "learning_rate": 1.987589157435026e-06, "loss": 0.2477, "num_input_tokens_seen": 20020288, "step": 29715 }, { "epoch": 0.726064544499548, "grad_norm": 0.23399071395397186, "learning_rate": 1.98757576019516e-06, "loss": 0.1913, "num_input_tokens_seen": 20023360, "step": 29720 }, { "epoch": 0.7261866953313952, "grad_norm": 26.293479919433594, "learning_rate": 1.9875623557733777e-06, "loss": 0.0992, "num_input_tokens_seen": 20026688, "step": 29725 }, { "epoch": 0.7263088461632423, "grad_norm": 9.954926490783691, "learning_rate": 1.9875489441697764e-06, "loss": 0.1794, "num_input_tokens_seen": 20029952, "step": 29730 }, { "epoch": 0.7264309969950895, "grad_norm": 22.676538467407227, "learning_rate": 1.987535525384453e-06, "loss": 0.0604, "num_input_tokens_seen": 20033536, "step": 29735 }, { "epoch": 0.7265531478269367, "grad_norm": 18.85304832458496, "learning_rate": 1.9875220994175058e-06, "loss": 0.0791, "num_input_tokens_seen": 20036928, "step": 29740 }, { "epoch": 0.7266752986587839, "grad_norm": 3.8162927627563477, "learning_rate": 1.987508666269033e-06, "loss": 0.0879, "num_input_tokens_seen": 20039936, "step": 29745 }, { "epoch": 0.7267974494906311, "grad_norm": 12.169044494628906, "learning_rate": 1.987495225939131e-06, "loss": 0.1627, "num_input_tokens_seen": 20043456, "step": 29750 }, { "epoch": 0.7269196003224782, "grad_norm": 0.9503796100616455, "learning_rate": 1.987481778427898e-06, "loss": 0.0406, "num_input_tokens_seen": 20046592, "step": 29755 }, { "epoch": 0.7270417511543253, "grad_norm": 13.596330642700195, "learning_rate": 1.9874683237354317e-06, "loss": 0.0608, "num_input_tokens_seen": 20050112, "step": 29760 }, { "epoch": 0.7271639019861725, "grad_norm": 1.3646011352539062, "learning_rate": 1.9874548618618303e-06, "loss": 0.0841, "num_input_tokens_seen": 20053312, "step": 29765 }, { "epoch": 0.7272860528180197, "grad_norm": 45.84081268310547, "learning_rate": 1.987441392807192e-06, "loss": 0.1837, "num_input_tokens_seen": 20056768, "step": 29770 }, { "epoch": 0.7274082036498668, "grad_norm": 0.8793035745620728, "learning_rate": 1.987427916571614e-06, "loss": 0.1468, "num_input_tokens_seen": 20059840, "step": 29775 }, { "epoch": 0.727530354481714, "grad_norm": 7.483636856079102, "learning_rate": 1.9874144331551946e-06, "loss": 0.1255, "num_input_tokens_seen": 20063424, "step": 29780 }, { "epoch": 0.7276525053135612, "grad_norm": 18.731956481933594, "learning_rate": 1.9874009425580317e-06, "loss": 0.2606, "num_input_tokens_seen": 20066752, "step": 29785 }, { "epoch": 0.7277746561454084, "grad_norm": 0.3487199544906616, "learning_rate": 1.9873874447802236e-06, "loss": 0.0847, "num_input_tokens_seen": 20069696, "step": 29790 }, { "epoch": 0.7278968069772556, "grad_norm": 30.68777084350586, "learning_rate": 1.9873739398218687e-06, "loss": 0.1515, "num_input_tokens_seen": 20072896, "step": 29795 }, { "epoch": 0.7280189578091026, "grad_norm": 4.902219295501709, "learning_rate": 1.9873604276830647e-06, "loss": 0.1143, "num_input_tokens_seen": 20076096, "step": 29800 }, { "epoch": 0.7281411086409498, "grad_norm": 23.401994705200195, "learning_rate": 1.9873469083639103e-06, "loss": 0.1678, "num_input_tokens_seen": 20079424, "step": 29805 }, { "epoch": 0.728263259472797, "grad_norm": 17.54922103881836, "learning_rate": 1.9873333818645033e-06, "loss": 0.1527, "num_input_tokens_seen": 20082752, "step": 29810 }, { "epoch": 0.7283854103046442, "grad_norm": 13.365433692932129, "learning_rate": 1.987319848184943e-06, "loss": 0.0715, "num_input_tokens_seen": 20086528, "step": 29815 }, { "epoch": 0.7285075611364913, "grad_norm": 13.830577850341797, "learning_rate": 1.987306307325327e-06, "loss": 0.0958, "num_input_tokens_seen": 20089984, "step": 29820 }, { "epoch": 0.7286297119683385, "grad_norm": 14.710626602172852, "learning_rate": 1.9872927592857535e-06, "loss": 0.0715, "num_input_tokens_seen": 20093504, "step": 29825 }, { "epoch": 0.7287518628001857, "grad_norm": 0.9895759224891663, "learning_rate": 1.987279204066322e-06, "loss": 0.0669, "num_input_tokens_seen": 20096640, "step": 29830 }, { "epoch": 0.7288740136320329, "grad_norm": 16.83676528930664, "learning_rate": 1.98726564166713e-06, "loss": 0.05, "num_input_tokens_seen": 20099776, "step": 29835 }, { "epoch": 0.72899616446388, "grad_norm": 19.734445571899414, "learning_rate": 1.987252072088277e-06, "loss": 0.1161, "num_input_tokens_seen": 20102784, "step": 29840 }, { "epoch": 0.7291183152957271, "grad_norm": 27.47193145751953, "learning_rate": 1.9872384953298615e-06, "loss": 0.2201, "num_input_tokens_seen": 20105920, "step": 29845 }, { "epoch": 0.7292404661275743, "grad_norm": 0.11942492425441742, "learning_rate": 1.987224911391982e-06, "loss": 0.0443, "num_input_tokens_seen": 20109248, "step": 29850 }, { "epoch": 0.7293626169594215, "grad_norm": 0.13697798550128937, "learning_rate": 1.987211320274738e-06, "loss": 0.0409, "num_input_tokens_seen": 20112448, "step": 29855 }, { "epoch": 0.7294847677912687, "grad_norm": 0.028840813785791397, "learning_rate": 1.987197721978227e-06, "loss": 0.1766, "num_input_tokens_seen": 20115520, "step": 29860 }, { "epoch": 0.7296069186231158, "grad_norm": 18.40993881225586, "learning_rate": 1.987184116502549e-06, "loss": 0.2882, "num_input_tokens_seen": 20118784, "step": 29865 }, { "epoch": 0.729729069454963, "grad_norm": 23.006805419921875, "learning_rate": 1.9871705038478025e-06, "loss": 0.1965, "num_input_tokens_seen": 20121600, "step": 29870 }, { "epoch": 0.7298512202868102, "grad_norm": 0.6357484459877014, "learning_rate": 1.9871568840140863e-06, "loss": 0.1923, "num_input_tokens_seen": 20124672, "step": 29875 }, { "epoch": 0.7299733711186573, "grad_norm": 0.2545313537120819, "learning_rate": 1.9871432570015e-06, "loss": 0.0443, "num_input_tokens_seen": 20128128, "step": 29880 }, { "epoch": 0.7300955219505045, "grad_norm": 5.443319320678711, "learning_rate": 1.9871296228101426e-06, "loss": 0.1228, "num_input_tokens_seen": 20131328, "step": 29885 }, { "epoch": 0.7302176727823516, "grad_norm": 0.4481469988822937, "learning_rate": 1.9871159814401127e-06, "loss": 0.0921, "num_input_tokens_seen": 20134784, "step": 29890 }, { "epoch": 0.7303398236141988, "grad_norm": 17.447044372558594, "learning_rate": 1.9871023328915102e-06, "loss": 0.1063, "num_input_tokens_seen": 20137856, "step": 29895 }, { "epoch": 0.730461974446046, "grad_norm": 3.0904958248138428, "learning_rate": 1.987088677164434e-06, "loss": 0.0474, "num_input_tokens_seen": 20141376, "step": 29900 }, { "epoch": 0.7305841252778932, "grad_norm": 19.669401168823242, "learning_rate": 1.9870750142589835e-06, "loss": 0.076, "num_input_tokens_seen": 20145408, "step": 29905 }, { "epoch": 0.7307062761097403, "grad_norm": 0.0858718529343605, "learning_rate": 1.987061344175258e-06, "loss": 0.0828, "num_input_tokens_seen": 20148416, "step": 29910 }, { "epoch": 0.7308284269415875, "grad_norm": 15.151061058044434, "learning_rate": 1.9870476669133566e-06, "loss": 0.2602, "num_input_tokens_seen": 20151552, "step": 29915 }, { "epoch": 0.7309505777734346, "grad_norm": 0.39679262042045593, "learning_rate": 1.987033982473379e-06, "loss": 0.1584, "num_input_tokens_seen": 20154752, "step": 29920 }, { "epoch": 0.7310727286052818, "grad_norm": 24.496591567993164, "learning_rate": 1.9870202908554253e-06, "loss": 0.2126, "num_input_tokens_seen": 20158080, "step": 29925 }, { "epoch": 0.731194879437129, "grad_norm": 16.306676864624023, "learning_rate": 1.9870065920595942e-06, "loss": 0.069, "num_input_tokens_seen": 20161792, "step": 29930 }, { "epoch": 0.7313170302689761, "grad_norm": 51.136375427246094, "learning_rate": 1.986992886085986e-06, "loss": 0.1502, "num_input_tokens_seen": 20164800, "step": 29935 }, { "epoch": 0.7314391811008233, "grad_norm": 14.362354278564453, "learning_rate": 1.9869791729347e-06, "loss": 0.1002, "num_input_tokens_seen": 20167744, "step": 29940 }, { "epoch": 0.7315613319326705, "grad_norm": 24.119476318359375, "learning_rate": 1.986965452605836e-06, "loss": 0.1387, "num_input_tokens_seen": 20171072, "step": 29945 }, { "epoch": 0.7316834827645177, "grad_norm": 5.217164516448975, "learning_rate": 1.9869517250994932e-06, "loss": 0.0676, "num_input_tokens_seen": 20174656, "step": 29950 }, { "epoch": 0.7318056335963647, "grad_norm": 9.99504566192627, "learning_rate": 1.9869379904157724e-06, "loss": 0.1438, "num_input_tokens_seen": 20177984, "step": 29955 }, { "epoch": 0.7319277844282119, "grad_norm": 0.7991713285446167, "learning_rate": 1.9869242485547734e-06, "loss": 0.0504, "num_input_tokens_seen": 20181504, "step": 29960 }, { "epoch": 0.7320499352600591, "grad_norm": 16.180362701416016, "learning_rate": 1.9869104995165957e-06, "loss": 0.1732, "num_input_tokens_seen": 20184704, "step": 29965 }, { "epoch": 0.7321720860919063, "grad_norm": 10.701885223388672, "learning_rate": 1.986896743301339e-06, "loss": 0.0326, "num_input_tokens_seen": 20188608, "step": 29970 }, { "epoch": 0.7322942369237535, "grad_norm": 14.758902549743652, "learning_rate": 1.986882979909104e-06, "loss": 0.1004, "num_input_tokens_seen": 20192384, "step": 29975 }, { "epoch": 0.7324163877556006, "grad_norm": 36.978084564208984, "learning_rate": 1.9868692093399905e-06, "loss": 0.1361, "num_input_tokens_seen": 20195264, "step": 29980 }, { "epoch": 0.7325385385874478, "grad_norm": 4.0124993324279785, "learning_rate": 1.986855431594099e-06, "loss": 0.0881, "num_input_tokens_seen": 20199232, "step": 29985 }, { "epoch": 0.732660689419295, "grad_norm": 1.9948008060455322, "learning_rate": 1.986841646671529e-06, "loss": 0.0377, "num_input_tokens_seen": 20203264, "step": 29990 }, { "epoch": 0.7327828402511422, "grad_norm": 0.7837508916854858, "learning_rate": 1.986827854572381e-06, "loss": 0.1068, "num_input_tokens_seen": 20206592, "step": 29995 }, { "epoch": 0.7329049910829892, "grad_norm": 13.117432594299316, "learning_rate": 1.9868140552967555e-06, "loss": 0.1614, "num_input_tokens_seen": 20209856, "step": 30000 }, { "epoch": 0.7330271419148364, "grad_norm": 37.95238494873047, "learning_rate": 1.986800248844753e-06, "loss": 0.0859, "num_input_tokens_seen": 20213376, "step": 30005 }, { "epoch": 0.7331492927466836, "grad_norm": 17.042789459228516, "learning_rate": 1.9867864352164735e-06, "loss": 0.1404, "num_input_tokens_seen": 20216704, "step": 30010 }, { "epoch": 0.7332714435785308, "grad_norm": 16.459827423095703, "learning_rate": 1.9867726144120173e-06, "loss": 0.1487, "num_input_tokens_seen": 20219968, "step": 30015 }, { "epoch": 0.7333935944103779, "grad_norm": 2.686231851577759, "learning_rate": 1.9867587864314858e-06, "loss": 0.0727, "num_input_tokens_seen": 20223424, "step": 30020 }, { "epoch": 0.7335157452422251, "grad_norm": 7.266627788543701, "learning_rate": 1.9867449512749787e-06, "loss": 0.095, "num_input_tokens_seen": 20226688, "step": 30025 }, { "epoch": 0.7336378960740723, "grad_norm": 13.3108491897583, "learning_rate": 1.986731108942597e-06, "loss": 0.0888, "num_input_tokens_seen": 20229888, "step": 30030 }, { "epoch": 0.7337600469059194, "grad_norm": 0.9927554130554199, "learning_rate": 1.9867172594344415e-06, "loss": 0.0408, "num_input_tokens_seen": 20233344, "step": 30035 }, { "epoch": 0.7338821977377666, "grad_norm": 13.185794830322266, "learning_rate": 1.986703402750612e-06, "loss": 0.0662, "num_input_tokens_seen": 20236736, "step": 30040 }, { "epoch": 0.7340043485696137, "grad_norm": 0.6203752160072327, "learning_rate": 1.9866895388912107e-06, "loss": 0.1591, "num_input_tokens_seen": 20240128, "step": 30045 }, { "epoch": 0.7341264994014609, "grad_norm": 1.171931266784668, "learning_rate": 1.9866756678563375e-06, "loss": 0.1238, "num_input_tokens_seen": 20243072, "step": 30050 }, { "epoch": 0.7342486502333081, "grad_norm": 30.23444366455078, "learning_rate": 1.9866617896460936e-06, "loss": 0.1967, "num_input_tokens_seen": 20246400, "step": 30055 }, { "epoch": 0.7343708010651553, "grad_norm": 20.61321258544922, "learning_rate": 1.9866479042605794e-06, "loss": 0.1251, "num_input_tokens_seen": 20249792, "step": 30060 }, { "epoch": 0.7344929518970024, "grad_norm": 3.040947198867798, "learning_rate": 1.9866340116998965e-06, "loss": 0.0821, "num_input_tokens_seen": 20253376, "step": 30065 }, { "epoch": 0.7346151027288496, "grad_norm": 0.0874570682644844, "learning_rate": 1.986620111964146e-06, "loss": 0.0891, "num_input_tokens_seen": 20256832, "step": 30070 }, { "epoch": 0.7347372535606967, "grad_norm": 22.917316436767578, "learning_rate": 1.986606205053428e-06, "loss": 0.3907, "num_input_tokens_seen": 20260352, "step": 30075 }, { "epoch": 0.7348594043925439, "grad_norm": 0.2937084138393402, "learning_rate": 1.9865922909678444e-06, "loss": 0.0612, "num_input_tokens_seen": 20263616, "step": 30080 }, { "epoch": 0.7349815552243911, "grad_norm": 0.6326661705970764, "learning_rate": 1.9865783697074965e-06, "loss": 0.1698, "num_input_tokens_seen": 20266880, "step": 30085 }, { "epoch": 0.7351037060562382, "grad_norm": 15.97380256652832, "learning_rate": 1.9865644412724857e-06, "loss": 0.0698, "num_input_tokens_seen": 20270336, "step": 30090 }, { "epoch": 0.7352258568880854, "grad_norm": 14.575950622558594, "learning_rate": 1.9865505056629122e-06, "loss": 0.1139, "num_input_tokens_seen": 20273408, "step": 30095 }, { "epoch": 0.7353480077199326, "grad_norm": 12.12072467803955, "learning_rate": 1.9865365628788787e-06, "loss": 0.193, "num_input_tokens_seen": 20276736, "step": 30100 }, { "epoch": 0.7354701585517798, "grad_norm": 7.838393211364746, "learning_rate": 1.9865226129204858e-06, "loss": 0.0898, "num_input_tokens_seen": 20280256, "step": 30105 }, { "epoch": 0.7355923093836269, "grad_norm": 14.239495277404785, "learning_rate": 1.9865086557878348e-06, "loss": 0.0696, "num_input_tokens_seen": 20283456, "step": 30110 }, { "epoch": 0.735714460215474, "grad_norm": 13.856727600097656, "learning_rate": 1.9864946914810278e-06, "loss": 0.1215, "num_input_tokens_seen": 20286720, "step": 30115 }, { "epoch": 0.7358366110473212, "grad_norm": 0.6099316477775574, "learning_rate": 1.986480720000166e-06, "loss": 0.1099, "num_input_tokens_seen": 20289920, "step": 30120 }, { "epoch": 0.7359587618791684, "grad_norm": 25.46120834350586, "learning_rate": 1.986466741345351e-06, "loss": 0.1411, "num_input_tokens_seen": 20293312, "step": 30125 }, { "epoch": 0.7360809127110156, "grad_norm": 15.407801628112793, "learning_rate": 1.9864527555166844e-06, "loss": 0.196, "num_input_tokens_seen": 20296576, "step": 30130 }, { "epoch": 0.7362030635428627, "grad_norm": 14.528338432312012, "learning_rate": 1.986438762514269e-06, "loss": 0.0724, "num_input_tokens_seen": 20300352, "step": 30135 }, { "epoch": 0.7363252143747099, "grad_norm": 10.5165433883667, "learning_rate": 1.9864247623382046e-06, "loss": 0.1201, "num_input_tokens_seen": 20303616, "step": 30140 }, { "epoch": 0.7364473652065571, "grad_norm": 0.36507347226142883, "learning_rate": 1.986410754988594e-06, "loss": 0.039, "num_input_tokens_seen": 20306880, "step": 30145 }, { "epoch": 0.7365695160384043, "grad_norm": 0.10817314684391022, "learning_rate": 1.9863967404655397e-06, "loss": 0.1245, "num_input_tokens_seen": 20310464, "step": 30150 }, { "epoch": 0.7366916668702513, "grad_norm": 26.634794235229492, "learning_rate": 1.9863827187691423e-06, "loss": 0.2526, "num_input_tokens_seen": 20313728, "step": 30155 }, { "epoch": 0.7368138177020985, "grad_norm": 94.8357925415039, "learning_rate": 1.986368689899505e-06, "loss": 0.0782, "num_input_tokens_seen": 20316800, "step": 30160 }, { "epoch": 0.7369359685339457, "grad_norm": 0.20598088204860687, "learning_rate": 1.9863546538567292e-06, "loss": 0.0681, "num_input_tokens_seen": 20320320, "step": 30165 }, { "epoch": 0.7370581193657929, "grad_norm": 1.645045518875122, "learning_rate": 1.9863406106409165e-06, "loss": 0.0784, "num_input_tokens_seen": 20323648, "step": 30170 }, { "epoch": 0.7371802701976401, "grad_norm": 0.7895174622535706, "learning_rate": 1.9863265602521703e-06, "loss": 0.0968, "num_input_tokens_seen": 20327104, "step": 30175 }, { "epoch": 0.7373024210294872, "grad_norm": 3.127288818359375, "learning_rate": 1.9863125026905917e-06, "loss": 0.1154, "num_input_tokens_seen": 20330432, "step": 30180 }, { "epoch": 0.7374245718613344, "grad_norm": 0.4002511203289032, "learning_rate": 1.9862984379562833e-06, "loss": 0.218, "num_input_tokens_seen": 20333696, "step": 30185 }, { "epoch": 0.7375467226931816, "grad_norm": 24.047679901123047, "learning_rate": 1.9862843660493475e-06, "loss": 0.1431, "num_input_tokens_seen": 20337088, "step": 30190 }, { "epoch": 0.7376688735250287, "grad_norm": 0.9673386216163635, "learning_rate": 1.9862702869698865e-06, "loss": 0.1025, "num_input_tokens_seen": 20340032, "step": 30195 }, { "epoch": 0.7377910243568758, "grad_norm": 6.145616054534912, "learning_rate": 1.986256200718003e-06, "loss": 0.0184, "num_input_tokens_seen": 20343296, "step": 30200 }, { "epoch": 0.737913175188723, "grad_norm": 20.293169021606445, "learning_rate": 1.9862421072937986e-06, "loss": 0.2651, "num_input_tokens_seen": 20346304, "step": 30205 }, { "epoch": 0.7380353260205702, "grad_norm": 25.896528244018555, "learning_rate": 1.9862280066973765e-06, "loss": 0.0288, "num_input_tokens_seen": 20349888, "step": 30210 }, { "epoch": 0.7381574768524174, "grad_norm": 0.5437335968017578, "learning_rate": 1.9862138989288393e-06, "loss": 0.1911, "num_input_tokens_seen": 20353408, "step": 30215 }, { "epoch": 0.7382796276842646, "grad_norm": 56.40694808959961, "learning_rate": 1.986199783988289e-06, "loss": 0.1025, "num_input_tokens_seen": 20356416, "step": 30220 }, { "epoch": 0.7384017785161117, "grad_norm": 0.2323615700006485, "learning_rate": 1.9861856618758292e-06, "loss": 0.2797, "num_input_tokens_seen": 20359360, "step": 30225 }, { "epoch": 0.7385239293479589, "grad_norm": 54.13261413574219, "learning_rate": 1.9861715325915612e-06, "loss": 0.137, "num_input_tokens_seen": 20362560, "step": 30230 }, { "epoch": 0.738646080179806, "grad_norm": 35.675174713134766, "learning_rate": 1.986157396135589e-06, "loss": 0.1808, "num_input_tokens_seen": 20365760, "step": 30235 }, { "epoch": 0.7387682310116532, "grad_norm": 0.4263570308685303, "learning_rate": 1.986143252508015e-06, "loss": 0.111, "num_input_tokens_seen": 20368832, "step": 30240 }, { "epoch": 0.7388903818435003, "grad_norm": 0.35891321301460266, "learning_rate": 1.986129101708942e-06, "loss": 0.0594, "num_input_tokens_seen": 20371904, "step": 30245 }, { "epoch": 0.7390125326753475, "grad_norm": 17.240028381347656, "learning_rate": 1.986114943738473e-06, "loss": 0.1885, "num_input_tokens_seen": 20375168, "step": 30250 }, { "epoch": 0.7391346835071947, "grad_norm": 42.6297721862793, "learning_rate": 1.986100778596711e-06, "loss": 0.0938, "num_input_tokens_seen": 20378496, "step": 30255 }, { "epoch": 0.7392568343390419, "grad_norm": 19.804243087768555, "learning_rate": 1.9860866062837584e-06, "loss": 0.0634, "num_input_tokens_seen": 20382080, "step": 30260 }, { "epoch": 0.7393789851708891, "grad_norm": 11.977760314941406, "learning_rate": 1.986072426799719e-06, "loss": 0.0541, "num_input_tokens_seen": 20385664, "step": 30265 }, { "epoch": 0.7395011360027361, "grad_norm": 1.364763617515564, "learning_rate": 1.9860582401446957e-06, "loss": 0.191, "num_input_tokens_seen": 20388992, "step": 30270 }, { "epoch": 0.7396232868345833, "grad_norm": 0.07516565173864365, "learning_rate": 1.986044046318792e-06, "loss": 0.0786, "num_input_tokens_seen": 20392448, "step": 30275 }, { "epoch": 0.7397454376664305, "grad_norm": 9.209480285644531, "learning_rate": 1.986029845322111e-06, "loss": 0.113, "num_input_tokens_seen": 20395584, "step": 30280 }, { "epoch": 0.7398675884982777, "grad_norm": 19.559255599975586, "learning_rate": 1.986015637154755e-06, "loss": 0.142, "num_input_tokens_seen": 20399040, "step": 30285 }, { "epoch": 0.7399897393301248, "grad_norm": 9.277286529541016, "learning_rate": 1.9860014218168283e-06, "loss": 0.1325, "num_input_tokens_seen": 20402496, "step": 30290 }, { "epoch": 0.740111890161972, "grad_norm": 0.35132232308387756, "learning_rate": 1.985987199308434e-06, "loss": 0.1215, "num_input_tokens_seen": 20405952, "step": 30295 }, { "epoch": 0.7402340409938192, "grad_norm": 13.239752769470215, "learning_rate": 1.985972969629676e-06, "loss": 0.1274, "num_input_tokens_seen": 20409600, "step": 30300 }, { "epoch": 0.7403561918256664, "grad_norm": 18.34958839416504, "learning_rate": 1.985958732780657e-06, "loss": 0.207, "num_input_tokens_seen": 20413184, "step": 30305 }, { "epoch": 0.7404783426575134, "grad_norm": 25.13006019592285, "learning_rate": 1.985944488761481e-06, "loss": 0.1524, "num_input_tokens_seen": 20416768, "step": 30310 }, { "epoch": 0.7406004934893606, "grad_norm": 2.601231336593628, "learning_rate": 1.9859302375722514e-06, "loss": 0.1147, "num_input_tokens_seen": 20420096, "step": 30315 }, { "epoch": 0.7407226443212078, "grad_norm": 22.842487335205078, "learning_rate": 1.985915979213072e-06, "loss": 0.1366, "num_input_tokens_seen": 20422912, "step": 30320 }, { "epoch": 0.740844795153055, "grad_norm": 9.892926216125488, "learning_rate": 1.9859017136840465e-06, "loss": 0.1295, "num_input_tokens_seen": 20426304, "step": 30325 }, { "epoch": 0.7409669459849022, "grad_norm": 2.6345763206481934, "learning_rate": 1.9858874409852786e-06, "loss": 0.0794, "num_input_tokens_seen": 20429696, "step": 30330 }, { "epoch": 0.7410890968167493, "grad_norm": 4.601977825164795, "learning_rate": 1.9858731611168713e-06, "loss": 0.1226, "num_input_tokens_seen": 20433024, "step": 30335 }, { "epoch": 0.7412112476485965, "grad_norm": 1.0514150857925415, "learning_rate": 1.9858588740789304e-06, "loss": 0.0475, "num_input_tokens_seen": 20436416, "step": 30340 }, { "epoch": 0.7413333984804437, "grad_norm": 1.2894636392593384, "learning_rate": 1.985844579871558e-06, "loss": 0.0868, "num_input_tokens_seen": 20439872, "step": 30345 }, { "epoch": 0.7414555493122909, "grad_norm": 2.993844747543335, "learning_rate": 1.9858302784948587e-06, "loss": 0.0766, "num_input_tokens_seen": 20443200, "step": 30350 }, { "epoch": 0.7415777001441379, "grad_norm": 1.2760772705078125, "learning_rate": 1.9858159699489364e-06, "loss": 0.0788, "num_input_tokens_seen": 20446592, "step": 30355 }, { "epoch": 0.7416998509759851, "grad_norm": 16.633935928344727, "learning_rate": 1.9858016542338954e-06, "loss": 0.116, "num_input_tokens_seen": 20449920, "step": 30360 }, { "epoch": 0.7418220018078323, "grad_norm": 0.8385209441184998, "learning_rate": 1.9857873313498394e-06, "loss": 0.0391, "num_input_tokens_seen": 20453376, "step": 30365 }, { "epoch": 0.7419441526396795, "grad_norm": 20.808696746826172, "learning_rate": 1.9857730012968727e-06, "loss": 0.1101, "num_input_tokens_seen": 20456832, "step": 30370 }, { "epoch": 0.7420663034715267, "grad_norm": 3.597788095474243, "learning_rate": 1.9857586640750997e-06, "loss": 0.1149, "num_input_tokens_seen": 20460096, "step": 30375 }, { "epoch": 0.7421884543033738, "grad_norm": 7.343143463134766, "learning_rate": 1.985744319684625e-06, "loss": 0.0846, "num_input_tokens_seen": 20463744, "step": 30380 }, { "epoch": 0.742310605135221, "grad_norm": 42.95801544189453, "learning_rate": 1.985729968125552e-06, "loss": 0.1214, "num_input_tokens_seen": 20467264, "step": 30385 }, { "epoch": 0.7424327559670681, "grad_norm": 15.052309036254883, "learning_rate": 1.9857156093979857e-06, "loss": 0.1327, "num_input_tokens_seen": 20470720, "step": 30390 }, { "epoch": 0.7425549067989153, "grad_norm": 45.49873733520508, "learning_rate": 1.9857012435020303e-06, "loss": 0.0937, "num_input_tokens_seen": 20473728, "step": 30395 }, { "epoch": 0.7426770576307624, "grad_norm": 11.375425338745117, "learning_rate": 1.98568687043779e-06, "loss": 0.0842, "num_input_tokens_seen": 20476992, "step": 30400 }, { "epoch": 0.7427992084626096, "grad_norm": 2.5061888694763184, "learning_rate": 1.98567249020537e-06, "loss": 0.1288, "num_input_tokens_seen": 20480512, "step": 30405 }, { "epoch": 0.7429213592944568, "grad_norm": 20.130294799804688, "learning_rate": 1.9856581028048746e-06, "loss": 0.1023, "num_input_tokens_seen": 20483776, "step": 30410 }, { "epoch": 0.743043510126304, "grad_norm": 0.7941417694091797, "learning_rate": 1.9856437082364084e-06, "loss": 0.01, "num_input_tokens_seen": 20487168, "step": 30415 }, { "epoch": 0.7431656609581512, "grad_norm": 20.287015914916992, "learning_rate": 1.9856293065000763e-06, "loss": 0.1419, "num_input_tokens_seen": 20490432, "step": 30420 }, { "epoch": 0.7432878117899983, "grad_norm": 34.13698196411133, "learning_rate": 1.9856148975959824e-06, "loss": 0.233, "num_input_tokens_seen": 20493888, "step": 30425 }, { "epoch": 0.7434099626218454, "grad_norm": 17.93484878540039, "learning_rate": 1.9856004815242317e-06, "loss": 0.1279, "num_input_tokens_seen": 20496960, "step": 30430 }, { "epoch": 0.7435321134536926, "grad_norm": 8.161019325256348, "learning_rate": 1.9855860582849293e-06, "loss": 0.1186, "num_input_tokens_seen": 20500480, "step": 30435 }, { "epoch": 0.7436542642855398, "grad_norm": 1.821930170059204, "learning_rate": 1.9855716278781802e-06, "loss": 0.0902, "num_input_tokens_seen": 20503808, "step": 30440 }, { "epoch": 0.7437764151173869, "grad_norm": 1.025917410850525, "learning_rate": 1.985557190304089e-06, "loss": 0.0876, "num_input_tokens_seen": 20507072, "step": 30445 }, { "epoch": 0.7438985659492341, "grad_norm": 9.403602600097656, "learning_rate": 1.985542745562761e-06, "loss": 0.1953, "num_input_tokens_seen": 20510336, "step": 30450 }, { "epoch": 0.7440207167810813, "grad_norm": 8.631169319152832, "learning_rate": 1.9855282936543007e-06, "loss": 0.1771, "num_input_tokens_seen": 20513472, "step": 30455 }, { "epoch": 0.7441428676129285, "grad_norm": 16.620887756347656, "learning_rate": 1.985513834578814e-06, "loss": 0.1286, "num_input_tokens_seen": 20517312, "step": 30460 }, { "epoch": 0.7442650184447757, "grad_norm": 0.690407395362854, "learning_rate": 1.9854993683364056e-06, "loss": 0.0696, "num_input_tokens_seen": 20520832, "step": 30465 }, { "epoch": 0.7443871692766227, "grad_norm": 8.413174629211426, "learning_rate": 1.9854848949271804e-06, "loss": 0.0721, "num_input_tokens_seen": 20523904, "step": 30470 }, { "epoch": 0.7445093201084699, "grad_norm": 19.46962547302246, "learning_rate": 1.985470414351244e-06, "loss": 0.0882, "num_input_tokens_seen": 20527424, "step": 30475 }, { "epoch": 0.7446314709403171, "grad_norm": 3.141707420349121, "learning_rate": 1.985455926608702e-06, "loss": 0.1156, "num_input_tokens_seen": 20530816, "step": 30480 }, { "epoch": 0.7447536217721643, "grad_norm": 46.364749908447266, "learning_rate": 1.985441431699659e-06, "loss": 0.1396, "num_input_tokens_seen": 20534208, "step": 30485 }, { "epoch": 0.7448757726040114, "grad_norm": 5.581932544708252, "learning_rate": 1.9854269296242216e-06, "loss": 0.0677, "num_input_tokens_seen": 20537664, "step": 30490 }, { "epoch": 0.7449979234358586, "grad_norm": 21.660884857177734, "learning_rate": 1.9854124203824936e-06, "loss": 0.084, "num_input_tokens_seen": 20540672, "step": 30495 }, { "epoch": 0.7451200742677058, "grad_norm": 8.933382034301758, "learning_rate": 1.985397903974582e-06, "loss": 0.0468, "num_input_tokens_seen": 20543808, "step": 30500 }, { "epoch": 0.745242225099553, "grad_norm": 0.280394971370697, "learning_rate": 1.985383380400592e-06, "loss": 0.196, "num_input_tokens_seen": 20547200, "step": 30505 }, { "epoch": 0.7453643759314001, "grad_norm": 28.417062759399414, "learning_rate": 1.9853688496606286e-06, "loss": 0.2718, "num_input_tokens_seen": 20550720, "step": 30510 }, { "epoch": 0.7454865267632472, "grad_norm": 36.462669372558594, "learning_rate": 1.985354311754798e-06, "loss": 0.0283, "num_input_tokens_seen": 20553792, "step": 30515 }, { "epoch": 0.7456086775950944, "grad_norm": 29.662124633789062, "learning_rate": 1.985339766683206e-06, "loss": 0.0974, "num_input_tokens_seen": 20557120, "step": 30520 }, { "epoch": 0.7457308284269416, "grad_norm": 18.273874282836914, "learning_rate": 1.985325214445958e-06, "loss": 0.0844, "num_input_tokens_seen": 20560256, "step": 30525 }, { "epoch": 0.7458529792587888, "grad_norm": 6.227776527404785, "learning_rate": 1.98531065504316e-06, "loss": 0.0386, "num_input_tokens_seen": 20563584, "step": 30530 }, { "epoch": 0.7459751300906359, "grad_norm": 13.516763687133789, "learning_rate": 1.985296088474918e-06, "loss": 0.2243, "num_input_tokens_seen": 20566912, "step": 30535 }, { "epoch": 0.7460972809224831, "grad_norm": 13.371734619140625, "learning_rate": 1.9852815147413376e-06, "loss": 0.2049, "num_input_tokens_seen": 20570176, "step": 30540 }, { "epoch": 0.7462194317543303, "grad_norm": 1.8927969932556152, "learning_rate": 1.985266933842525e-06, "loss": 0.0925, "num_input_tokens_seen": 20573440, "step": 30545 }, { "epoch": 0.7463415825861774, "grad_norm": 21.35140037536621, "learning_rate": 1.9852523457785864e-06, "loss": 0.1478, "num_input_tokens_seen": 20576768, "step": 30550 }, { "epoch": 0.7464637334180245, "grad_norm": 1.4200108051300049, "learning_rate": 1.985237750549628e-06, "loss": 0.077, "num_input_tokens_seen": 20580416, "step": 30555 }, { "epoch": 0.7465858842498717, "grad_norm": 15.096092224121094, "learning_rate": 1.9852231481557556e-06, "loss": 0.1403, "num_input_tokens_seen": 20583488, "step": 30560 }, { "epoch": 0.7467080350817189, "grad_norm": 12.31279182434082, "learning_rate": 1.985208538597075e-06, "loss": 0.1793, "num_input_tokens_seen": 20587520, "step": 30565 }, { "epoch": 0.7468301859135661, "grad_norm": 1.6888927221298218, "learning_rate": 1.9851939218736937e-06, "loss": 0.0606, "num_input_tokens_seen": 20591232, "step": 30570 }, { "epoch": 0.7469523367454133, "grad_norm": 14.612831115722656, "learning_rate": 1.9851792979857166e-06, "loss": 0.0999, "num_input_tokens_seen": 20594816, "step": 30575 }, { "epoch": 0.7470744875772604, "grad_norm": 15.661672592163086, "learning_rate": 1.9851646669332507e-06, "loss": 0.1943, "num_input_tokens_seen": 20597888, "step": 30580 }, { "epoch": 0.7471966384091075, "grad_norm": 50.38776397705078, "learning_rate": 1.9851500287164023e-06, "loss": 0.1239, "num_input_tokens_seen": 20601472, "step": 30585 }, { "epoch": 0.7473187892409547, "grad_norm": 5.186892032623291, "learning_rate": 1.985135383335278e-06, "loss": 0.1193, "num_input_tokens_seen": 20604864, "step": 30590 }, { "epoch": 0.7474409400728019, "grad_norm": 9.449992179870605, "learning_rate": 1.9851207307899847e-06, "loss": 0.0523, "num_input_tokens_seen": 20608384, "step": 30595 }, { "epoch": 0.747563090904649, "grad_norm": 16.765907287597656, "learning_rate": 1.985106071080628e-06, "loss": 0.0525, "num_input_tokens_seen": 20611712, "step": 30600 }, { "epoch": 0.7476852417364962, "grad_norm": 4.915185451507568, "learning_rate": 1.985091404207315e-06, "loss": 0.0778, "num_input_tokens_seen": 20615104, "step": 30605 }, { "epoch": 0.7478073925683434, "grad_norm": 10.675619125366211, "learning_rate": 1.9850767301701523e-06, "loss": 0.0828, "num_input_tokens_seen": 20618432, "step": 30610 }, { "epoch": 0.7479295434001906, "grad_norm": 9.624067306518555, "learning_rate": 1.985062048969247e-06, "loss": 0.1881, "num_input_tokens_seen": 20621760, "step": 30615 }, { "epoch": 0.7480516942320378, "grad_norm": 20.003002166748047, "learning_rate": 1.985047360604705e-06, "loss": 0.1091, "num_input_tokens_seen": 20625280, "step": 30620 }, { "epoch": 0.7481738450638848, "grad_norm": 17.2797794342041, "learning_rate": 1.9850326650766343e-06, "loss": 0.0786, "num_input_tokens_seen": 20628608, "step": 30625 }, { "epoch": 0.748295995895732, "grad_norm": 2.4404211044311523, "learning_rate": 1.985017962385141e-06, "loss": 0.0865, "num_input_tokens_seen": 20631744, "step": 30630 }, { "epoch": 0.7484181467275792, "grad_norm": 4.35370397567749, "learning_rate": 1.985003252530332e-06, "loss": 0.0841, "num_input_tokens_seen": 20635200, "step": 30635 }, { "epoch": 0.7485402975594264, "grad_norm": 19.579072952270508, "learning_rate": 1.984988535512314e-06, "loss": 0.0835, "num_input_tokens_seen": 20638912, "step": 30640 }, { "epoch": 0.7486624483912735, "grad_norm": 12.48481273651123, "learning_rate": 1.984973811331195e-06, "loss": 0.098, "num_input_tokens_seen": 20642368, "step": 30645 }, { "epoch": 0.7487845992231207, "grad_norm": 2.2552871704101562, "learning_rate": 1.9849590799870813e-06, "loss": 0.1273, "num_input_tokens_seen": 20646336, "step": 30650 }, { "epoch": 0.7489067500549679, "grad_norm": 17.537343978881836, "learning_rate": 1.98494434148008e-06, "loss": 0.0843, "num_input_tokens_seen": 20649600, "step": 30655 }, { "epoch": 0.7490289008868151, "grad_norm": 13.28173542022705, "learning_rate": 1.984929595810299e-06, "loss": 0.0638, "num_input_tokens_seen": 20653184, "step": 30660 }, { "epoch": 0.7491510517186623, "grad_norm": 18.33385467529297, "learning_rate": 1.984914842977845e-06, "loss": 0.1266, "num_input_tokens_seen": 20656192, "step": 30665 }, { "epoch": 0.7492732025505093, "grad_norm": 29.58997917175293, "learning_rate": 1.984900082982825e-06, "loss": 0.2753, "num_input_tokens_seen": 20659136, "step": 30670 }, { "epoch": 0.7493953533823565, "grad_norm": 17.214855194091797, "learning_rate": 1.9848853158253472e-06, "loss": 0.061, "num_input_tokens_seen": 20662400, "step": 30675 }, { "epoch": 0.7495175042142037, "grad_norm": 33.4477653503418, "learning_rate": 1.984870541505518e-06, "loss": 0.2228, "num_input_tokens_seen": 20665856, "step": 30680 }, { "epoch": 0.7496396550460509, "grad_norm": 38.7508430480957, "learning_rate": 1.9848557600234453e-06, "loss": 0.0521, "num_input_tokens_seen": 20669440, "step": 30685 }, { "epoch": 0.749761805877898, "grad_norm": 28.749181747436523, "learning_rate": 1.984840971379237e-06, "loss": 0.1166, "num_input_tokens_seen": 20672640, "step": 30690 }, { "epoch": 0.7498839567097452, "grad_norm": 3.055140733718872, "learning_rate": 1.9848261755730002e-06, "loss": 0.0697, "num_input_tokens_seen": 20676160, "step": 30695 }, { "epoch": 0.7500061075415924, "grad_norm": 10.008118629455566, "learning_rate": 1.9848113726048427e-06, "loss": 0.0459, "num_input_tokens_seen": 20679424, "step": 30700 }, { "epoch": 0.7500549678743312, "eval_loss": 0.1214575320482254, "eval_runtime": 48.0351, "eval_samples_per_second": 757.467, "eval_steps_per_second": 94.702, "num_input_tokens_seen": 20680640, "step": 30702 }, { "epoch": 0.7501282583734395, "grad_norm": 32.10566329956055, "learning_rate": 1.9847965624748717e-06, "loss": 0.0963, "num_input_tokens_seen": 20683008, "step": 30705 }, { "epoch": 0.7502504092052867, "grad_norm": 0.3510077893733978, "learning_rate": 1.9847817451831952e-06, "loss": 0.0733, "num_input_tokens_seen": 20686400, "step": 30710 }, { "epoch": 0.7503725600371338, "grad_norm": 12.46418285369873, "learning_rate": 1.9847669207299212e-06, "loss": 0.1472, "num_input_tokens_seen": 20690048, "step": 30715 }, { "epoch": 0.750494710868981, "grad_norm": 0.30937764048576355, "learning_rate": 1.984752089115157e-06, "loss": 0.1717, "num_input_tokens_seen": 20693824, "step": 30720 }, { "epoch": 0.7506168617008282, "grad_norm": 1.3294833898544312, "learning_rate": 1.9847372503390106e-06, "loss": 0.0115, "num_input_tokens_seen": 20697344, "step": 30725 }, { "epoch": 0.7507390125326754, "grad_norm": 2.612206220626831, "learning_rate": 1.984722404401591e-06, "loss": 0.0988, "num_input_tokens_seen": 20700992, "step": 30730 }, { "epoch": 0.7508611633645225, "grad_norm": 18.258258819580078, "learning_rate": 1.9847075513030042e-06, "loss": 0.1538, "num_input_tokens_seen": 20704384, "step": 30735 }, { "epoch": 0.7509833141963697, "grad_norm": 0.13408470153808594, "learning_rate": 1.9846926910433597e-06, "loss": 0.0825, "num_input_tokens_seen": 20707968, "step": 30740 }, { "epoch": 0.7511054650282168, "grad_norm": 76.22888946533203, "learning_rate": 1.984677823622765e-06, "loss": 0.2069, "num_input_tokens_seen": 20711680, "step": 30745 }, { "epoch": 0.751227615860064, "grad_norm": 17.194808959960938, "learning_rate": 1.9846629490413284e-06, "loss": 0.1362, "num_input_tokens_seen": 20714880, "step": 30750 }, { "epoch": 0.7513497666919112, "grad_norm": 12.739709854125977, "learning_rate": 1.9846480672991576e-06, "loss": 0.1142, "num_input_tokens_seen": 20718720, "step": 30755 }, { "epoch": 0.7514719175237583, "grad_norm": 0.08701960742473602, "learning_rate": 1.9846331783963618e-06, "loss": 0.1044, "num_input_tokens_seen": 20722112, "step": 30760 }, { "epoch": 0.7515940683556055, "grad_norm": 28.54833984375, "learning_rate": 1.9846182823330483e-06, "loss": 0.1727, "num_input_tokens_seen": 20725312, "step": 30765 }, { "epoch": 0.7517162191874527, "grad_norm": 32.205657958984375, "learning_rate": 1.984603379109326e-06, "loss": 0.0193, "num_input_tokens_seen": 20729664, "step": 30770 }, { "epoch": 0.7518383700192999, "grad_norm": 24.44585418701172, "learning_rate": 1.984588468725303e-06, "loss": 0.2256, "num_input_tokens_seen": 20732992, "step": 30775 }, { "epoch": 0.751960520851147, "grad_norm": 18.783945083618164, "learning_rate": 1.984573551181088e-06, "loss": 0.1115, "num_input_tokens_seen": 20736128, "step": 30780 }, { "epoch": 0.7520826716829941, "grad_norm": 36.28510665893555, "learning_rate": 1.984558626476789e-06, "loss": 0.0376, "num_input_tokens_seen": 20739456, "step": 30785 }, { "epoch": 0.7522048225148413, "grad_norm": 9.365072250366211, "learning_rate": 1.984543694612515e-06, "loss": 0.071, "num_input_tokens_seen": 20742976, "step": 30790 }, { "epoch": 0.7523269733466885, "grad_norm": 33.965946197509766, "learning_rate": 1.9845287555883745e-06, "loss": 0.0298, "num_input_tokens_seen": 20746304, "step": 30795 }, { "epoch": 0.7524491241785357, "grad_norm": 41.173213958740234, "learning_rate": 1.984513809404476e-06, "loss": 0.0712, "num_input_tokens_seen": 20749568, "step": 30800 }, { "epoch": 0.7525712750103828, "grad_norm": 23.996231079101562, "learning_rate": 1.9844988560609287e-06, "loss": 0.1266, "num_input_tokens_seen": 20753024, "step": 30805 }, { "epoch": 0.75269342584223, "grad_norm": 19.44346046447754, "learning_rate": 1.98448389555784e-06, "loss": 0.0992, "num_input_tokens_seen": 20756224, "step": 30810 }, { "epoch": 0.7528155766740772, "grad_norm": 23.903226852416992, "learning_rate": 1.9844689278953204e-06, "loss": 0.1103, "num_input_tokens_seen": 20759424, "step": 30815 }, { "epoch": 0.7529377275059244, "grad_norm": 14.277491569519043, "learning_rate": 1.984453953073478e-06, "loss": 0.123, "num_input_tokens_seen": 20763072, "step": 30820 }, { "epoch": 0.7530598783377714, "grad_norm": 17.374441146850586, "learning_rate": 1.984438971092421e-06, "loss": 0.0925, "num_input_tokens_seen": 20766208, "step": 30825 }, { "epoch": 0.7531820291696186, "grad_norm": 9.210493087768555, "learning_rate": 1.9844239819522595e-06, "loss": 0.1563, "num_input_tokens_seen": 20770496, "step": 30830 }, { "epoch": 0.7533041800014658, "grad_norm": 38.678043365478516, "learning_rate": 1.984408985653102e-06, "loss": 0.155, "num_input_tokens_seen": 20773696, "step": 30835 }, { "epoch": 0.753426330833313, "grad_norm": 6.5732622146606445, "learning_rate": 1.9843939821950577e-06, "loss": 0.0861, "num_input_tokens_seen": 20776896, "step": 30840 }, { "epoch": 0.7535484816651601, "grad_norm": 26.16443634033203, "learning_rate": 1.9843789715782356e-06, "loss": 0.1297, "num_input_tokens_seen": 20780224, "step": 30845 }, { "epoch": 0.7536706324970073, "grad_norm": 10.182201385498047, "learning_rate": 1.984363953802744e-06, "loss": 0.0626, "num_input_tokens_seen": 20783488, "step": 30850 }, { "epoch": 0.7537927833288545, "grad_norm": 27.792400360107422, "learning_rate": 1.984348928868694e-06, "loss": 0.2264, "num_input_tokens_seen": 20787136, "step": 30855 }, { "epoch": 0.7539149341607017, "grad_norm": 16.452760696411133, "learning_rate": 1.9843338967761934e-06, "loss": 0.1147, "num_input_tokens_seen": 20790912, "step": 30860 }, { "epoch": 0.7540370849925488, "grad_norm": 2.219255208969116, "learning_rate": 1.984318857525352e-06, "loss": 0.1371, "num_input_tokens_seen": 20794112, "step": 30865 }, { "epoch": 0.7541592358243959, "grad_norm": 0.26177316904067993, "learning_rate": 1.9843038111162796e-06, "loss": 0.3025, "num_input_tokens_seen": 20797184, "step": 30870 }, { "epoch": 0.7542813866562431, "grad_norm": 0.38541722297668457, "learning_rate": 1.9842887575490844e-06, "loss": 0.0441, "num_input_tokens_seen": 20800576, "step": 30875 }, { "epoch": 0.7544035374880903, "grad_norm": 10.842700004577637, "learning_rate": 1.9842736968238773e-06, "loss": 0.1573, "num_input_tokens_seen": 20803968, "step": 30880 }, { "epoch": 0.7545256883199375, "grad_norm": 14.55325698852539, "learning_rate": 1.9842586289407665e-06, "loss": 0.1318, "num_input_tokens_seen": 20807488, "step": 30885 }, { "epoch": 0.7546478391517846, "grad_norm": 28.751131057739258, "learning_rate": 1.9842435538998627e-06, "loss": 0.1698, "num_input_tokens_seen": 20811008, "step": 30890 }, { "epoch": 0.7547699899836318, "grad_norm": 17.204044342041016, "learning_rate": 1.9842284717012743e-06, "loss": 0.1536, "num_input_tokens_seen": 20814464, "step": 30895 }, { "epoch": 0.754892140815479, "grad_norm": 33.994510650634766, "learning_rate": 1.984213382345112e-06, "loss": 0.1537, "num_input_tokens_seen": 20817664, "step": 30900 }, { "epoch": 0.7550142916473261, "grad_norm": 16.428762435913086, "learning_rate": 1.984198285831486e-06, "loss": 0.1269, "num_input_tokens_seen": 20821184, "step": 30905 }, { "epoch": 0.7551364424791733, "grad_norm": 17.435991287231445, "learning_rate": 1.9841831821605045e-06, "loss": 0.1899, "num_input_tokens_seen": 20825216, "step": 30910 }, { "epoch": 0.7552585933110204, "grad_norm": 1.3063730001449585, "learning_rate": 1.9841680713322786e-06, "loss": 0.1374, "num_input_tokens_seen": 20829056, "step": 30915 }, { "epoch": 0.7553807441428676, "grad_norm": 2.680938959121704, "learning_rate": 1.984152953346918e-06, "loss": 0.1459, "num_input_tokens_seen": 20832512, "step": 30920 }, { "epoch": 0.7555028949747148, "grad_norm": 0.2480306476354599, "learning_rate": 1.984137828204532e-06, "loss": 0.1336, "num_input_tokens_seen": 20835968, "step": 30925 }, { "epoch": 0.755625045806562, "grad_norm": 5.193900108337402, "learning_rate": 1.9841226959052314e-06, "loss": 0.0356, "num_input_tokens_seen": 20839680, "step": 30930 }, { "epoch": 0.7557471966384091, "grad_norm": 1.7502617835998535, "learning_rate": 1.9841075564491253e-06, "loss": 0.1557, "num_input_tokens_seen": 20843456, "step": 30935 }, { "epoch": 0.7558693474702562, "grad_norm": 11.336082458496094, "learning_rate": 1.984092409836325e-06, "loss": 0.0466, "num_input_tokens_seen": 20846976, "step": 30940 }, { "epoch": 0.7559914983021034, "grad_norm": 9.426225662231445, "learning_rate": 1.984077256066939e-06, "loss": 0.0731, "num_input_tokens_seen": 20850048, "step": 30945 }, { "epoch": 0.7561136491339506, "grad_norm": 24.966283798217773, "learning_rate": 1.9840620951410797e-06, "loss": 0.1131, "num_input_tokens_seen": 20853632, "step": 30950 }, { "epoch": 0.7562357999657978, "grad_norm": 0.18268482387065887, "learning_rate": 1.9840469270588557e-06, "loss": 0.0763, "num_input_tokens_seen": 20857024, "step": 30955 }, { "epoch": 0.7563579507976449, "grad_norm": 10.175745964050293, "learning_rate": 1.9840317518203773e-06, "loss": 0.0524, "num_input_tokens_seen": 20860416, "step": 30960 }, { "epoch": 0.7564801016294921, "grad_norm": 30.9974365234375, "learning_rate": 1.984016569425756e-06, "loss": 0.1901, "num_input_tokens_seen": 20864064, "step": 30965 }, { "epoch": 0.7566022524613393, "grad_norm": 22.473278045654297, "learning_rate": 1.984001379875101e-06, "loss": 0.1206, "num_input_tokens_seen": 20867264, "step": 30970 }, { "epoch": 0.7567244032931865, "grad_norm": 21.42226791381836, "learning_rate": 1.9839861831685235e-06, "loss": 0.0689, "num_input_tokens_seen": 20870784, "step": 30975 }, { "epoch": 0.7568465541250335, "grad_norm": 12.539742469787598, "learning_rate": 1.983970979306134e-06, "loss": 0.0657, "num_input_tokens_seen": 20873856, "step": 30980 }, { "epoch": 0.7569687049568807, "grad_norm": 3.040371894836426, "learning_rate": 1.983955768288043e-06, "loss": 0.1117, "num_input_tokens_seen": 20876864, "step": 30985 }, { "epoch": 0.7570908557887279, "grad_norm": 20.58167266845703, "learning_rate": 1.9839405501143606e-06, "loss": 0.1582, "num_input_tokens_seen": 20880192, "step": 30990 }, { "epoch": 0.7572130066205751, "grad_norm": 21.035499572753906, "learning_rate": 1.983925324785198e-06, "loss": 0.202, "num_input_tokens_seen": 20883584, "step": 30995 }, { "epoch": 0.7573351574524223, "grad_norm": 20.17402458190918, "learning_rate": 1.983910092300666e-06, "loss": 0.166, "num_input_tokens_seen": 20886336, "step": 31000 }, { "epoch": 0.7574573082842694, "grad_norm": 22.944866180419922, "learning_rate": 1.983894852660875e-06, "loss": 0.0651, "num_input_tokens_seen": 20889728, "step": 31005 }, { "epoch": 0.7575794591161166, "grad_norm": 2.4674253463745117, "learning_rate": 1.983879605865936e-06, "loss": 0.0693, "num_input_tokens_seen": 20893184, "step": 31010 }, { "epoch": 0.7577016099479638, "grad_norm": 1.2100327014923096, "learning_rate": 1.9838643519159596e-06, "loss": 0.1108, "num_input_tokens_seen": 20896384, "step": 31015 }, { "epoch": 0.757823760779811, "grad_norm": 16.186931610107422, "learning_rate": 1.9838490908110573e-06, "loss": 0.1034, "num_input_tokens_seen": 20899776, "step": 31020 }, { "epoch": 0.757945911611658, "grad_norm": 29.93137550354004, "learning_rate": 1.9838338225513397e-06, "loss": 0.2739, "num_input_tokens_seen": 20902848, "step": 31025 }, { "epoch": 0.7580680624435052, "grad_norm": 12.030484199523926, "learning_rate": 1.9838185471369182e-06, "loss": 0.1084, "num_input_tokens_seen": 20906368, "step": 31030 }, { "epoch": 0.7581902132753524, "grad_norm": 0.8099299669265747, "learning_rate": 1.9838032645679033e-06, "loss": 0.1732, "num_input_tokens_seen": 20909952, "step": 31035 }, { "epoch": 0.7583123641071996, "grad_norm": 0.6834255456924438, "learning_rate": 1.9837879748444065e-06, "loss": 0.0867, "num_input_tokens_seen": 20913152, "step": 31040 }, { "epoch": 0.7584345149390468, "grad_norm": 18.76382064819336, "learning_rate": 1.983772677966539e-06, "loss": 0.1665, "num_input_tokens_seen": 20916608, "step": 31045 }, { "epoch": 0.7585566657708939, "grad_norm": 25.39693832397461, "learning_rate": 1.983757373934412e-06, "loss": 0.1466, "num_input_tokens_seen": 20920512, "step": 31050 }, { "epoch": 0.7586788166027411, "grad_norm": 1.737460970878601, "learning_rate": 1.983742062748137e-06, "loss": 0.0726, "num_input_tokens_seen": 20924032, "step": 31055 }, { "epoch": 0.7588009674345882, "grad_norm": 13.280280113220215, "learning_rate": 1.9837267444078245e-06, "loss": 0.0878, "num_input_tokens_seen": 20927424, "step": 31060 }, { "epoch": 0.7589231182664354, "grad_norm": 10.46796989440918, "learning_rate": 1.9837114189135867e-06, "loss": 0.0989, "num_input_tokens_seen": 20931136, "step": 31065 }, { "epoch": 0.7590452690982825, "grad_norm": 26.009592056274414, "learning_rate": 1.9836960862655352e-06, "loss": 0.1624, "num_input_tokens_seen": 20935040, "step": 31070 }, { "epoch": 0.7591674199301297, "grad_norm": 17.237714767456055, "learning_rate": 1.9836807464637814e-06, "loss": 0.0539, "num_input_tokens_seen": 20938304, "step": 31075 }, { "epoch": 0.7592895707619769, "grad_norm": 33.32792282104492, "learning_rate": 1.983665399508436e-06, "loss": 0.1256, "num_input_tokens_seen": 20941568, "step": 31080 }, { "epoch": 0.7594117215938241, "grad_norm": 44.93278121948242, "learning_rate": 1.9836500453996116e-06, "loss": 0.1151, "num_input_tokens_seen": 20944896, "step": 31085 }, { "epoch": 0.7595338724256712, "grad_norm": 10.59104061126709, "learning_rate": 1.9836346841374192e-06, "loss": 0.0971, "num_input_tokens_seen": 20948160, "step": 31090 }, { "epoch": 0.7596560232575184, "grad_norm": 38.38275146484375, "learning_rate": 1.9836193157219713e-06, "loss": 0.2196, "num_input_tokens_seen": 20951360, "step": 31095 }, { "epoch": 0.7597781740893655, "grad_norm": 17.22246742248535, "learning_rate": 1.983603940153379e-06, "loss": 0.1416, "num_input_tokens_seen": 20954752, "step": 31100 }, { "epoch": 0.7599003249212127, "grad_norm": 8.937365531921387, "learning_rate": 1.983588557431754e-06, "loss": 0.0766, "num_input_tokens_seen": 20957952, "step": 31105 }, { "epoch": 0.7600224757530599, "grad_norm": 2.2201955318450928, "learning_rate": 1.983573167557209e-06, "loss": 0.1191, "num_input_tokens_seen": 20961152, "step": 31110 }, { "epoch": 0.760144626584907, "grad_norm": 2.811094045639038, "learning_rate": 1.9835577705298545e-06, "loss": 0.1919, "num_input_tokens_seen": 20964224, "step": 31115 }, { "epoch": 0.7602667774167542, "grad_norm": 0.6476378440856934, "learning_rate": 1.983542366349804e-06, "loss": 0.0693, "num_input_tokens_seen": 20967552, "step": 31120 }, { "epoch": 0.7603889282486014, "grad_norm": 1.8725968599319458, "learning_rate": 1.9835269550171687e-06, "loss": 0.1099, "num_input_tokens_seen": 20970496, "step": 31125 }, { "epoch": 0.7605110790804486, "grad_norm": 5.410181999206543, "learning_rate": 1.983511536532061e-06, "loss": 0.1083, "num_input_tokens_seen": 20973632, "step": 31130 }, { "epoch": 0.7606332299122956, "grad_norm": 16.716602325439453, "learning_rate": 1.983496110894593e-06, "loss": 0.1891, "num_input_tokens_seen": 20977152, "step": 31135 }, { "epoch": 0.7607553807441428, "grad_norm": 0.11880694329738617, "learning_rate": 1.9834806781048764e-06, "loss": 0.0849, "num_input_tokens_seen": 20980736, "step": 31140 }, { "epoch": 0.76087753157599, "grad_norm": 2.018104314804077, "learning_rate": 1.983465238163024e-06, "loss": 0.0771, "num_input_tokens_seen": 20983936, "step": 31145 }, { "epoch": 0.7609996824078372, "grad_norm": 0.5362045168876648, "learning_rate": 1.9834497910691478e-06, "loss": 0.044, "num_input_tokens_seen": 20988224, "step": 31150 }, { "epoch": 0.7611218332396844, "grad_norm": 32.71379852294922, "learning_rate": 1.98343433682336e-06, "loss": 0.0602, "num_input_tokens_seen": 20991552, "step": 31155 }, { "epoch": 0.7612439840715315, "grad_norm": 21.509185791015625, "learning_rate": 1.9834188754257733e-06, "loss": 0.2165, "num_input_tokens_seen": 20994688, "step": 31160 }, { "epoch": 0.7613661349033787, "grad_norm": 37.06200408935547, "learning_rate": 1.9834034068765e-06, "loss": 0.0566, "num_input_tokens_seen": 20997952, "step": 31165 }, { "epoch": 0.7614882857352259, "grad_norm": 5.213249683380127, "learning_rate": 1.983387931175653e-06, "loss": 0.1226, "num_input_tokens_seen": 21001216, "step": 31170 }, { "epoch": 0.7616104365670731, "grad_norm": 10.156071662902832, "learning_rate": 1.983372448323344e-06, "loss": 0.0843, "num_input_tokens_seen": 21004736, "step": 31175 }, { "epoch": 0.7617325873989201, "grad_norm": 18.279315948486328, "learning_rate": 1.983356958319686e-06, "loss": 0.0738, "num_input_tokens_seen": 21008064, "step": 31180 }, { "epoch": 0.7618547382307673, "grad_norm": 3.7977609634399414, "learning_rate": 1.9833414611647925e-06, "loss": 0.044, "num_input_tokens_seen": 21011136, "step": 31185 }, { "epoch": 0.7619768890626145, "grad_norm": 21.37438201904297, "learning_rate": 1.9833259568587744e-06, "loss": 0.1582, "num_input_tokens_seen": 21014400, "step": 31190 }, { "epoch": 0.7620990398944617, "grad_norm": 14.26490306854248, "learning_rate": 1.983310445401746e-06, "loss": 0.0659, "num_input_tokens_seen": 21018112, "step": 31195 }, { "epoch": 0.7622211907263089, "grad_norm": 36.653236389160156, "learning_rate": 1.9832949267938195e-06, "loss": 0.0602, "num_input_tokens_seen": 21021120, "step": 31200 }, { "epoch": 0.762343341558156, "grad_norm": 1.8205872774124146, "learning_rate": 1.9832794010351077e-06, "loss": 0.1516, "num_input_tokens_seen": 21024384, "step": 31205 }, { "epoch": 0.7624654923900032, "grad_norm": 7.122799873352051, "learning_rate": 1.9832638681257238e-06, "loss": 0.1687, "num_input_tokens_seen": 21027328, "step": 31210 }, { "epoch": 0.7625876432218504, "grad_norm": 62.234737396240234, "learning_rate": 1.9832483280657805e-06, "loss": 0.1582, "num_input_tokens_seen": 21030912, "step": 31215 }, { "epoch": 0.7627097940536975, "grad_norm": 0.24601157009601593, "learning_rate": 1.983232780855391e-06, "loss": 0.0989, "num_input_tokens_seen": 21034688, "step": 31220 }, { "epoch": 0.7628319448855446, "grad_norm": 28.595394134521484, "learning_rate": 1.983217226494668e-06, "loss": 0.1562, "num_input_tokens_seen": 21037760, "step": 31225 }, { "epoch": 0.7629540957173918, "grad_norm": 10.716483116149902, "learning_rate": 1.983201664983725e-06, "loss": 0.1243, "num_input_tokens_seen": 21040896, "step": 31230 }, { "epoch": 0.763076246549239, "grad_norm": 0.8800622820854187, "learning_rate": 1.9831860963226754e-06, "loss": 0.1733, "num_input_tokens_seen": 21044864, "step": 31235 }, { "epoch": 0.7631983973810862, "grad_norm": 13.254878044128418, "learning_rate": 1.9831705205116317e-06, "loss": 0.1372, "num_input_tokens_seen": 21048128, "step": 31240 }, { "epoch": 0.7633205482129334, "grad_norm": 30.35858917236328, "learning_rate": 1.9831549375507076e-06, "loss": 0.1415, "num_input_tokens_seen": 21051968, "step": 31245 }, { "epoch": 0.7634426990447805, "grad_norm": 1.8185697793960571, "learning_rate": 1.983139347440016e-06, "loss": 0.0339, "num_input_tokens_seen": 21055296, "step": 31250 }, { "epoch": 0.7635648498766276, "grad_norm": 0.16570863127708435, "learning_rate": 1.983123750179671e-06, "loss": 0.04, "num_input_tokens_seen": 21058432, "step": 31255 }, { "epoch": 0.7636870007084748, "grad_norm": 52.01047897338867, "learning_rate": 1.9831081457697856e-06, "loss": 0.0793, "num_input_tokens_seen": 21061760, "step": 31260 }, { "epoch": 0.763809151540322, "grad_norm": 22.145647048950195, "learning_rate": 1.9830925342104736e-06, "loss": 0.1415, "num_input_tokens_seen": 21065088, "step": 31265 }, { "epoch": 0.7639313023721691, "grad_norm": 11.057469367980957, "learning_rate": 1.983076915501848e-06, "loss": 0.1292, "num_input_tokens_seen": 21068864, "step": 31270 }, { "epoch": 0.7640534532040163, "grad_norm": 51.785011291503906, "learning_rate": 1.9830612896440226e-06, "loss": 0.1785, "num_input_tokens_seen": 21072192, "step": 31275 }, { "epoch": 0.7641756040358635, "grad_norm": 43.112247467041016, "learning_rate": 1.983045656637111e-06, "loss": 0.158, "num_input_tokens_seen": 21075072, "step": 31280 }, { "epoch": 0.7642977548677107, "grad_norm": 32.858177185058594, "learning_rate": 1.9830300164812273e-06, "loss": 0.1041, "num_input_tokens_seen": 21079232, "step": 31285 }, { "epoch": 0.7644199056995579, "grad_norm": 0.2882313132286072, "learning_rate": 1.9830143691764846e-06, "loss": 0.0476, "num_input_tokens_seen": 21082560, "step": 31290 }, { "epoch": 0.764542056531405, "grad_norm": 1.6282767057418823, "learning_rate": 1.9829987147229974e-06, "loss": 0.0529, "num_input_tokens_seen": 21086144, "step": 31295 }, { "epoch": 0.7646642073632521, "grad_norm": 21.46419906616211, "learning_rate": 1.982983053120879e-06, "loss": 0.2246, "num_input_tokens_seen": 21089600, "step": 31300 }, { "epoch": 0.7647863581950993, "grad_norm": 31.539697647094727, "learning_rate": 1.9829673843702434e-06, "loss": 0.0652, "num_input_tokens_seen": 21093056, "step": 31305 }, { "epoch": 0.7649085090269465, "grad_norm": 12.277565956115723, "learning_rate": 1.9829517084712045e-06, "loss": 0.172, "num_input_tokens_seen": 21096256, "step": 31310 }, { "epoch": 0.7650306598587936, "grad_norm": 21.35080337524414, "learning_rate": 1.9829360254238767e-06, "loss": 0.1213, "num_input_tokens_seen": 21099520, "step": 31315 }, { "epoch": 0.7651528106906408, "grad_norm": 19.8055419921875, "learning_rate": 1.9829203352283735e-06, "loss": 0.1974, "num_input_tokens_seen": 21102592, "step": 31320 }, { "epoch": 0.765274961522488, "grad_norm": 9.7948637008667, "learning_rate": 1.982904637884809e-06, "loss": 0.1138, "num_input_tokens_seen": 21105920, "step": 31325 }, { "epoch": 0.7653971123543352, "grad_norm": 28.4554443359375, "learning_rate": 1.982888933393298e-06, "loss": 0.1358, "num_input_tokens_seen": 21109504, "step": 31330 }, { "epoch": 0.7655192631861824, "grad_norm": 0.3278570771217346, "learning_rate": 1.982873221753954e-06, "loss": 0.1178, "num_input_tokens_seen": 21112576, "step": 31335 }, { "epoch": 0.7656414140180294, "grad_norm": 14.416967391967773, "learning_rate": 1.982857502966892e-06, "loss": 0.1292, "num_input_tokens_seen": 21115840, "step": 31340 }, { "epoch": 0.7657635648498766, "grad_norm": 13.378105163574219, "learning_rate": 1.9828417770322255e-06, "loss": 0.1554, "num_input_tokens_seen": 21119296, "step": 31345 }, { "epoch": 0.7658857156817238, "grad_norm": 1.7292650938034058, "learning_rate": 1.9828260439500694e-06, "loss": 0.0441, "num_input_tokens_seen": 21122944, "step": 31350 }, { "epoch": 0.766007866513571, "grad_norm": 10.598871231079102, "learning_rate": 1.9828103037205376e-06, "loss": 0.1804, "num_input_tokens_seen": 21126016, "step": 31355 }, { "epoch": 0.7661300173454181, "grad_norm": 15.65273666381836, "learning_rate": 1.9827945563437455e-06, "loss": 0.1402, "num_input_tokens_seen": 21129344, "step": 31360 }, { "epoch": 0.7662521681772653, "grad_norm": 0.42412981390953064, "learning_rate": 1.9827788018198067e-06, "loss": 0.0889, "num_input_tokens_seen": 21132288, "step": 31365 }, { "epoch": 0.7663743190091125, "grad_norm": 10.7468843460083, "learning_rate": 1.9827630401488365e-06, "loss": 0.093, "num_input_tokens_seen": 21135616, "step": 31370 }, { "epoch": 0.7664964698409596, "grad_norm": 23.37337303161621, "learning_rate": 1.9827472713309486e-06, "loss": 0.1495, "num_input_tokens_seen": 21138816, "step": 31375 }, { "epoch": 0.7666186206728067, "grad_norm": 7.535536766052246, "learning_rate": 1.9827314953662584e-06, "loss": 0.1836, "num_input_tokens_seen": 21141952, "step": 31380 }, { "epoch": 0.7667407715046539, "grad_norm": 0.7598505616188049, "learning_rate": 1.9827157122548806e-06, "loss": 0.0087, "num_input_tokens_seen": 21145920, "step": 31385 }, { "epoch": 0.7668629223365011, "grad_norm": 22.626522064208984, "learning_rate": 1.98269992199693e-06, "loss": 0.1052, "num_input_tokens_seen": 21149120, "step": 31390 }, { "epoch": 0.7669850731683483, "grad_norm": 37.745269775390625, "learning_rate": 1.982684124592521e-06, "loss": 0.119, "num_input_tokens_seen": 21152128, "step": 31395 }, { "epoch": 0.7671072240001955, "grad_norm": 3.7507152557373047, "learning_rate": 1.9826683200417684e-06, "loss": 0.0412, "num_input_tokens_seen": 21155520, "step": 31400 }, { "epoch": 0.7672293748320426, "grad_norm": 29.963966369628906, "learning_rate": 1.982652508344788e-06, "loss": 0.2221, "num_input_tokens_seen": 21159040, "step": 31405 }, { "epoch": 0.7673515256638898, "grad_norm": 8.20248794555664, "learning_rate": 1.982636689501694e-06, "loss": 0.1157, "num_input_tokens_seen": 21162432, "step": 31410 }, { "epoch": 0.767473676495737, "grad_norm": 12.889708518981934, "learning_rate": 1.9826208635126017e-06, "loss": 0.0612, "num_input_tokens_seen": 21166208, "step": 31415 }, { "epoch": 0.7675958273275841, "grad_norm": 0.2748701870441437, "learning_rate": 1.9826050303776265e-06, "loss": 0.0213, "num_input_tokens_seen": 21169856, "step": 31420 }, { "epoch": 0.7677179781594312, "grad_norm": 10.752982139587402, "learning_rate": 1.982589190096883e-06, "loss": 0.1161, "num_input_tokens_seen": 21173504, "step": 31425 }, { "epoch": 0.7678401289912784, "grad_norm": 4.767438888549805, "learning_rate": 1.9825733426704867e-06, "loss": 0.0755, "num_input_tokens_seen": 21177088, "step": 31430 }, { "epoch": 0.7679622798231256, "grad_norm": 18.476245880126953, "learning_rate": 1.9825574880985525e-06, "loss": 0.1222, "num_input_tokens_seen": 21180544, "step": 31435 }, { "epoch": 0.7680844306549728, "grad_norm": 28.4121036529541, "learning_rate": 1.982541626381196e-06, "loss": 0.08, "num_input_tokens_seen": 21183616, "step": 31440 }, { "epoch": 0.76820658148682, "grad_norm": 0.08580884337425232, "learning_rate": 1.9825257575185326e-06, "loss": 0.0965, "num_input_tokens_seen": 21187456, "step": 31445 }, { "epoch": 0.768328732318667, "grad_norm": 12.523801803588867, "learning_rate": 1.9825098815106777e-06, "loss": 0.0524, "num_input_tokens_seen": 21190656, "step": 31450 }, { "epoch": 0.7684508831505142, "grad_norm": 21.91022300720215, "learning_rate": 1.982493998357747e-06, "loss": 0.1822, "num_input_tokens_seen": 21193920, "step": 31455 }, { "epoch": 0.7685730339823614, "grad_norm": 14.95090103149414, "learning_rate": 1.982478108059855e-06, "loss": 0.0697, "num_input_tokens_seen": 21196864, "step": 31460 }, { "epoch": 0.7686951848142086, "grad_norm": 17.197738647460938, "learning_rate": 1.982462210617118e-06, "loss": 0.1942, "num_input_tokens_seen": 21200064, "step": 31465 }, { "epoch": 0.7688173356460557, "grad_norm": 18.8331241607666, "learning_rate": 1.982446306029652e-06, "loss": 0.0911, "num_input_tokens_seen": 21203456, "step": 31470 }, { "epoch": 0.7689394864779029, "grad_norm": 54.55223846435547, "learning_rate": 1.982430394297572e-06, "loss": 0.1893, "num_input_tokens_seen": 21206464, "step": 31475 }, { "epoch": 0.7690616373097501, "grad_norm": 0.18907591700553894, "learning_rate": 1.9824144754209944e-06, "loss": 0.0927, "num_input_tokens_seen": 21209856, "step": 31480 }, { "epoch": 0.7691837881415973, "grad_norm": 21.11206817626953, "learning_rate": 1.982398549400034e-06, "loss": 0.1205, "num_input_tokens_seen": 21212672, "step": 31485 }, { "epoch": 0.7693059389734445, "grad_norm": 60.83143997192383, "learning_rate": 1.982382616234807e-06, "loss": 0.0765, "num_input_tokens_seen": 21216448, "step": 31490 }, { "epoch": 0.7694280898052915, "grad_norm": 14.315446853637695, "learning_rate": 1.98236667592543e-06, "loss": 0.1077, "num_input_tokens_seen": 21219392, "step": 31495 }, { "epoch": 0.7695502406371387, "grad_norm": 6.813923358917236, "learning_rate": 1.9823507284720174e-06, "loss": 0.0534, "num_input_tokens_seen": 21222464, "step": 31500 }, { "epoch": 0.7696723914689859, "grad_norm": 30.06726837158203, "learning_rate": 1.9823347738746868e-06, "loss": 0.2408, "num_input_tokens_seen": 21225600, "step": 31505 }, { "epoch": 0.7697945423008331, "grad_norm": 0.7989888787269592, "learning_rate": 1.9823188121335535e-06, "loss": 0.1162, "num_input_tokens_seen": 21228928, "step": 31510 }, { "epoch": 0.7699166931326802, "grad_norm": 6.25483512878418, "learning_rate": 1.9823028432487332e-06, "loss": 0.1069, "num_input_tokens_seen": 21232576, "step": 31515 }, { "epoch": 0.7700388439645274, "grad_norm": 0.3138057589530945, "learning_rate": 1.982286867220343e-06, "loss": 0.1632, "num_input_tokens_seen": 21236032, "step": 31520 }, { "epoch": 0.7701609947963746, "grad_norm": 27.970476150512695, "learning_rate": 1.9822708840484976e-06, "loss": 0.1045, "num_input_tokens_seen": 21239744, "step": 31525 }, { "epoch": 0.7702831456282218, "grad_norm": 46.2231559753418, "learning_rate": 1.9822548937333148e-06, "loss": 0.1838, "num_input_tokens_seen": 21243008, "step": 31530 }, { "epoch": 0.770405296460069, "grad_norm": 29.226469039916992, "learning_rate": 1.98223889627491e-06, "loss": 0.0838, "num_input_tokens_seen": 21246272, "step": 31535 }, { "epoch": 0.770527447291916, "grad_norm": 15.603208541870117, "learning_rate": 1.9822228916733996e-06, "loss": 0.1898, "num_input_tokens_seen": 21249600, "step": 31540 }, { "epoch": 0.7706495981237632, "grad_norm": 12.144451141357422, "learning_rate": 1.9822068799289003e-06, "loss": 0.1083, "num_input_tokens_seen": 21252992, "step": 31545 }, { "epoch": 0.7707717489556104, "grad_norm": 3.6689765453338623, "learning_rate": 1.982190861041529e-06, "loss": 0.0789, "num_input_tokens_seen": 21256448, "step": 31550 }, { "epoch": 0.7708938997874576, "grad_norm": 12.056143760681152, "learning_rate": 1.9821748350114004e-06, "loss": 0.0737, "num_input_tokens_seen": 21259392, "step": 31555 }, { "epoch": 0.7710160506193047, "grad_norm": 25.346725463867188, "learning_rate": 1.982158801838633e-06, "loss": 0.0581, "num_input_tokens_seen": 21262848, "step": 31560 }, { "epoch": 0.7711382014511519, "grad_norm": 7.405025005340576, "learning_rate": 1.9821427615233427e-06, "loss": 0.1367, "num_input_tokens_seen": 21266048, "step": 31565 }, { "epoch": 0.771260352282999, "grad_norm": 0.4394875168800354, "learning_rate": 1.9821267140656457e-06, "loss": 0.1666, "num_input_tokens_seen": 21269120, "step": 31570 }, { "epoch": 0.7713825031148462, "grad_norm": 30.82460594177246, "learning_rate": 1.982110659465659e-06, "loss": 0.1126, "num_input_tokens_seen": 21272320, "step": 31575 }, { "epoch": 0.7715046539466934, "grad_norm": 8.263945579528809, "learning_rate": 1.9820945977235e-06, "loss": 0.164, "num_input_tokens_seen": 21275328, "step": 31580 }, { "epoch": 0.7716268047785405, "grad_norm": 17.32110595703125, "learning_rate": 1.9820785288392844e-06, "loss": 0.0813, "num_input_tokens_seen": 21278720, "step": 31585 }, { "epoch": 0.7717489556103877, "grad_norm": 4.869380474090576, "learning_rate": 1.98206245281313e-06, "loss": 0.0579, "num_input_tokens_seen": 21282048, "step": 31590 }, { "epoch": 0.7718711064422349, "grad_norm": 0.19078025221824646, "learning_rate": 1.982046369645153e-06, "loss": 0.1305, "num_input_tokens_seen": 21285440, "step": 31595 }, { "epoch": 0.7719932572740821, "grad_norm": 24.013418197631836, "learning_rate": 1.9820302793354704e-06, "loss": 0.1274, "num_input_tokens_seen": 21288448, "step": 31600 }, { "epoch": 0.7721154081059292, "grad_norm": 0.5197308659553528, "learning_rate": 1.9820141818842e-06, "loss": 0.1899, "num_input_tokens_seen": 21291776, "step": 31605 }, { "epoch": 0.7722375589377763, "grad_norm": 12.339673042297363, "learning_rate": 1.981998077291458e-06, "loss": 0.1039, "num_input_tokens_seen": 21295360, "step": 31610 }, { "epoch": 0.7723597097696235, "grad_norm": 17.625242233276367, "learning_rate": 1.981981965557362e-06, "loss": 0.1433, "num_input_tokens_seen": 21298240, "step": 31615 }, { "epoch": 0.7724818606014707, "grad_norm": 0.9017745852470398, "learning_rate": 1.981965846682029e-06, "loss": 0.0809, "num_input_tokens_seen": 21301568, "step": 31620 }, { "epoch": 0.7726040114333178, "grad_norm": 1.5347234010696411, "learning_rate": 1.981949720665576e-06, "loss": 0.1297, "num_input_tokens_seen": 21305024, "step": 31625 }, { "epoch": 0.772726162265165, "grad_norm": 1.6608084440231323, "learning_rate": 1.981933587508121e-06, "loss": 0.0577, "num_input_tokens_seen": 21308224, "step": 31630 }, { "epoch": 0.7728483130970122, "grad_norm": 12.739276885986328, "learning_rate": 1.9819174472097807e-06, "loss": 0.171, "num_input_tokens_seen": 21311552, "step": 31635 }, { "epoch": 0.7729704639288594, "grad_norm": 19.08176612854004, "learning_rate": 1.9819012997706727e-06, "loss": 0.0992, "num_input_tokens_seen": 21314688, "step": 31640 }, { "epoch": 0.7730926147607066, "grad_norm": 2.2028987407684326, "learning_rate": 1.981885145190914e-06, "loss": 0.1189, "num_input_tokens_seen": 21317952, "step": 31645 }, { "epoch": 0.7732147655925536, "grad_norm": 0.6936241388320923, "learning_rate": 1.981868983470623e-06, "loss": 0.0643, "num_input_tokens_seen": 21321024, "step": 31650 }, { "epoch": 0.7733369164244008, "grad_norm": 0.8882758021354675, "learning_rate": 1.981852814609916e-06, "loss": 0.1486, "num_input_tokens_seen": 21324288, "step": 31655 }, { "epoch": 0.773459067256248, "grad_norm": 27.76462745666504, "learning_rate": 1.981836638608911e-06, "loss": 0.1112, "num_input_tokens_seen": 21327808, "step": 31660 }, { "epoch": 0.7735812180880952, "grad_norm": 20.353557586669922, "learning_rate": 1.981820455467727e-06, "loss": 0.1939, "num_input_tokens_seen": 21330944, "step": 31665 }, { "epoch": 0.7737033689199423, "grad_norm": 1.4390501976013184, "learning_rate": 1.9818042651864797e-06, "loss": 0.1189, "num_input_tokens_seen": 21334208, "step": 31670 }, { "epoch": 0.7738255197517895, "grad_norm": 6.415053844451904, "learning_rate": 1.981788067765288e-06, "loss": 0.0309, "num_input_tokens_seen": 21337088, "step": 31675 }, { "epoch": 0.7739476705836367, "grad_norm": 1.232407808303833, "learning_rate": 1.9817718632042695e-06, "loss": 0.1, "num_input_tokens_seen": 21340608, "step": 31680 }, { "epoch": 0.7740698214154839, "grad_norm": 0.8504520654678345, "learning_rate": 1.981755651503542e-06, "loss": 0.0482, "num_input_tokens_seen": 21343808, "step": 31685 }, { "epoch": 0.774191972247331, "grad_norm": 3.254563093185425, "learning_rate": 1.981739432663223e-06, "loss": 0.0957, "num_input_tokens_seen": 21347264, "step": 31690 }, { "epoch": 0.7743141230791781, "grad_norm": 7.742297649383545, "learning_rate": 1.981723206683431e-06, "loss": 0.0278, "num_input_tokens_seen": 21350912, "step": 31695 }, { "epoch": 0.7744362739110253, "grad_norm": 27.937456130981445, "learning_rate": 1.981706973564284e-06, "loss": 0.2443, "num_input_tokens_seen": 21354368, "step": 31700 }, { "epoch": 0.7745584247428725, "grad_norm": 6.21821403503418, "learning_rate": 1.9816907333058993e-06, "loss": 0.0087, "num_input_tokens_seen": 21357888, "step": 31705 }, { "epoch": 0.7746805755747197, "grad_norm": 34.79883575439453, "learning_rate": 1.981674485908396e-06, "loss": 0.1654, "num_input_tokens_seen": 21361344, "step": 31710 }, { "epoch": 0.7748027264065668, "grad_norm": 22.730897903442383, "learning_rate": 1.9816582313718917e-06, "loss": 0.0697, "num_input_tokens_seen": 21364224, "step": 31715 }, { "epoch": 0.774924877238414, "grad_norm": 14.0791654586792, "learning_rate": 1.9816419696965045e-06, "loss": 0.1227, "num_input_tokens_seen": 21367424, "step": 31720 }, { "epoch": 0.7750470280702612, "grad_norm": 19.73372459411621, "learning_rate": 1.9816257008823532e-06, "loss": 0.0791, "num_input_tokens_seen": 21370560, "step": 31725 }, { "epoch": 0.7751691789021083, "grad_norm": 0.3301796317100525, "learning_rate": 1.9816094249295557e-06, "loss": 0.0378, "num_input_tokens_seen": 21374720, "step": 31730 }, { "epoch": 0.7752913297339555, "grad_norm": 38.05630874633789, "learning_rate": 1.98159314183823e-06, "loss": 0.1237, "num_input_tokens_seen": 21378112, "step": 31735 }, { "epoch": 0.7754134805658026, "grad_norm": 2.712581157684326, "learning_rate": 1.9815768516084956e-06, "loss": 0.063, "num_input_tokens_seen": 21381504, "step": 31740 }, { "epoch": 0.7755356313976498, "grad_norm": 36.26502227783203, "learning_rate": 1.9815605542404698e-06, "loss": 0.2259, "num_input_tokens_seen": 21385344, "step": 31745 }, { "epoch": 0.775657782229497, "grad_norm": 10.401970863342285, "learning_rate": 1.981544249734272e-06, "loss": 0.0612, "num_input_tokens_seen": 21388736, "step": 31750 }, { "epoch": 0.7757799330613442, "grad_norm": 35.05522537231445, "learning_rate": 1.98152793809002e-06, "loss": 0.1576, "num_input_tokens_seen": 21391872, "step": 31755 }, { "epoch": 0.7759020838931913, "grad_norm": 6.847743988037109, "learning_rate": 1.9815116193078333e-06, "loss": 0.0493, "num_input_tokens_seen": 21395520, "step": 31760 }, { "epoch": 0.7760242347250385, "grad_norm": 34.985145568847656, "learning_rate": 1.98149529338783e-06, "loss": 0.0779, "num_input_tokens_seen": 21399296, "step": 31765 }, { "epoch": 0.7761463855568856, "grad_norm": 8.233406066894531, "learning_rate": 1.981478960330129e-06, "loss": 0.1058, "num_input_tokens_seen": 21402880, "step": 31770 }, { "epoch": 0.7762685363887328, "grad_norm": 14.839694023132324, "learning_rate": 1.9814626201348484e-06, "loss": 0.0803, "num_input_tokens_seen": 21406336, "step": 31775 }, { "epoch": 0.77639068722058, "grad_norm": 28.401588439941406, "learning_rate": 1.9814462728021084e-06, "loss": 0.1543, "num_input_tokens_seen": 21409536, "step": 31780 }, { "epoch": 0.7765128380524271, "grad_norm": 77.49806213378906, "learning_rate": 1.981429918332027e-06, "loss": 0.133, "num_input_tokens_seen": 21413056, "step": 31785 }, { "epoch": 0.7766349888842743, "grad_norm": 7.4428815841674805, "learning_rate": 1.981413556724723e-06, "loss": 0.0619, "num_input_tokens_seen": 21416896, "step": 31790 }, { "epoch": 0.7767571397161215, "grad_norm": 23.340656280517578, "learning_rate": 1.9813971879803155e-06, "loss": 0.1494, "num_input_tokens_seen": 21420096, "step": 31795 }, { "epoch": 0.7768792905479687, "grad_norm": 0.6986697316169739, "learning_rate": 1.9813808120989238e-06, "loss": 0.0836, "num_input_tokens_seen": 21423424, "step": 31800 }, { "epoch": 0.7770014413798157, "grad_norm": 49.14704132080078, "learning_rate": 1.981364429080667e-06, "loss": 0.0432, "num_input_tokens_seen": 21426688, "step": 31805 }, { "epoch": 0.7771235922116629, "grad_norm": 0.13445626199245453, "learning_rate": 1.9813480389256643e-06, "loss": 0.1327, "num_input_tokens_seen": 21430016, "step": 31810 }, { "epoch": 0.7772457430435101, "grad_norm": 15.11719799041748, "learning_rate": 1.9813316416340345e-06, "loss": 0.1589, "num_input_tokens_seen": 21433856, "step": 31815 }, { "epoch": 0.7773678938753573, "grad_norm": 49.894168853759766, "learning_rate": 1.981315237205897e-06, "loss": 0.1383, "num_input_tokens_seen": 21437120, "step": 31820 }, { "epoch": 0.7774900447072045, "grad_norm": 0.596743106842041, "learning_rate": 1.9812988256413715e-06, "loss": 0.1419, "num_input_tokens_seen": 21440704, "step": 31825 }, { "epoch": 0.7776121955390516, "grad_norm": 8.4089937210083, "learning_rate": 1.9812824069405766e-06, "loss": 0.2094, "num_input_tokens_seen": 21444416, "step": 31830 }, { "epoch": 0.7777343463708988, "grad_norm": 20.740663528442383, "learning_rate": 1.981265981103632e-06, "loss": 0.1335, "num_input_tokens_seen": 21447616, "step": 31835 }, { "epoch": 0.777856497202746, "grad_norm": 13.120567321777344, "learning_rate": 1.9812495481306577e-06, "loss": 0.0973, "num_input_tokens_seen": 21450752, "step": 31840 }, { "epoch": 0.7779786480345932, "grad_norm": 26.166528701782227, "learning_rate": 1.9812331080217726e-06, "loss": 0.1563, "num_input_tokens_seen": 21453888, "step": 31845 }, { "epoch": 0.7781007988664402, "grad_norm": 19.302303314208984, "learning_rate": 1.9812166607770965e-06, "loss": 0.0927, "num_input_tokens_seen": 21457216, "step": 31850 }, { "epoch": 0.7782229496982874, "grad_norm": 3.334254264831543, "learning_rate": 1.981200206396749e-06, "loss": 0.155, "num_input_tokens_seen": 21461248, "step": 31855 }, { "epoch": 0.7783451005301346, "grad_norm": 36.11018753051758, "learning_rate": 1.981183744880849e-06, "loss": 0.0909, "num_input_tokens_seen": 21464960, "step": 31860 }, { "epoch": 0.7784672513619818, "grad_norm": 1.2233189344406128, "learning_rate": 1.9811672762295176e-06, "loss": 0.0728, "num_input_tokens_seen": 21468032, "step": 31865 }, { "epoch": 0.778589402193829, "grad_norm": 12.21750259399414, "learning_rate": 1.9811508004428737e-06, "loss": 0.0325, "num_input_tokens_seen": 21471744, "step": 31870 }, { "epoch": 0.7787115530256761, "grad_norm": 7.178306579589844, "learning_rate": 1.981134317521037e-06, "loss": 0.129, "num_input_tokens_seen": 21475584, "step": 31875 }, { "epoch": 0.7788337038575233, "grad_norm": 20.816633224487305, "learning_rate": 1.981117827464128e-06, "loss": 0.1179, "num_input_tokens_seen": 21478720, "step": 31880 }, { "epoch": 0.7789558546893705, "grad_norm": 0.3289129436016083, "learning_rate": 1.981101330272266e-06, "loss": 0.0984, "num_input_tokens_seen": 21482112, "step": 31885 }, { "epoch": 0.7790780055212176, "grad_norm": 2.1822543144226074, "learning_rate": 1.9810848259455716e-06, "loss": 0.0813, "num_input_tokens_seen": 21485696, "step": 31890 }, { "epoch": 0.7792001563530647, "grad_norm": 12.473344802856445, "learning_rate": 1.981068314484164e-06, "loss": 0.0913, "num_input_tokens_seen": 21489216, "step": 31895 }, { "epoch": 0.7793223071849119, "grad_norm": 30.03060531616211, "learning_rate": 1.981051795888164e-06, "loss": 0.1576, "num_input_tokens_seen": 21492800, "step": 31900 }, { "epoch": 0.7794444580167591, "grad_norm": 0.5922881960868835, "learning_rate": 1.9810352701576917e-06, "loss": 0.1572, "num_input_tokens_seen": 21495744, "step": 31905 }, { "epoch": 0.7795666088486063, "grad_norm": 0.5877641439437866, "learning_rate": 1.981018737292867e-06, "loss": 0.133, "num_input_tokens_seen": 21498944, "step": 31910 }, { "epoch": 0.7796887596804534, "grad_norm": 14.157615661621094, "learning_rate": 1.98100219729381e-06, "loss": 0.0848, "num_input_tokens_seen": 21502400, "step": 31915 }, { "epoch": 0.7798109105123006, "grad_norm": 8.548436164855957, "learning_rate": 1.980985650160641e-06, "loss": 0.2272, "num_input_tokens_seen": 21505856, "step": 31920 }, { "epoch": 0.7799330613441477, "grad_norm": 19.105016708374023, "learning_rate": 1.9809690958934804e-06, "loss": 0.0443, "num_input_tokens_seen": 21509056, "step": 31925 }, { "epoch": 0.7800552121759949, "grad_norm": 0.6176168322563171, "learning_rate": 1.980952534492449e-06, "loss": 0.1034, "num_input_tokens_seen": 21512640, "step": 31930 }, { "epoch": 0.7801773630078421, "grad_norm": 0.15723645687103271, "learning_rate": 1.980935965957667e-06, "loss": 0.1021, "num_input_tokens_seen": 21515904, "step": 31935 }, { "epoch": 0.7802995138396892, "grad_norm": 16.714370727539062, "learning_rate": 1.9809193902892548e-06, "loss": 0.1275, "num_input_tokens_seen": 21519040, "step": 31940 }, { "epoch": 0.7804216646715364, "grad_norm": 2.1100945472717285, "learning_rate": 1.980902807487333e-06, "loss": 0.054, "num_input_tokens_seen": 21522368, "step": 31945 }, { "epoch": 0.7805438155033836, "grad_norm": 0.33257073163986206, "learning_rate": 1.980886217552022e-06, "loss": 0.0054, "num_input_tokens_seen": 21525696, "step": 31950 }, { "epoch": 0.7806659663352308, "grad_norm": 4.166648864746094, "learning_rate": 1.9808696204834427e-06, "loss": 0.0705, "num_input_tokens_seen": 21529344, "step": 31955 }, { "epoch": 0.7807881171670779, "grad_norm": 21.16596221923828, "learning_rate": 1.9808530162817153e-06, "loss": 0.1702, "num_input_tokens_seen": 21532992, "step": 31960 }, { "epoch": 0.780910267998925, "grad_norm": 7.611130237579346, "learning_rate": 1.9808364049469613e-06, "loss": 0.0475, "num_input_tokens_seen": 21536512, "step": 31965 }, { "epoch": 0.7810324188307722, "grad_norm": 16.877761840820312, "learning_rate": 1.980819786479301e-06, "loss": 0.2233, "num_input_tokens_seen": 21540096, "step": 31970 }, { "epoch": 0.7811545696626194, "grad_norm": 31.75934600830078, "learning_rate": 1.9808031608788557e-06, "loss": 0.116, "num_input_tokens_seen": 21543936, "step": 31975 }, { "epoch": 0.7812767204944666, "grad_norm": 2.284510612487793, "learning_rate": 1.980786528145746e-06, "loss": 0.1117, "num_input_tokens_seen": 21547328, "step": 31980 }, { "epoch": 0.7813988713263137, "grad_norm": 121.72061157226562, "learning_rate": 1.9807698882800924e-06, "loss": 0.1172, "num_input_tokens_seen": 21550592, "step": 31985 }, { "epoch": 0.7815210221581609, "grad_norm": 18.22707748413086, "learning_rate": 1.9807532412820165e-06, "loss": 0.1161, "num_input_tokens_seen": 21554048, "step": 31990 }, { "epoch": 0.7816431729900081, "grad_norm": 27.85599708557129, "learning_rate": 1.9807365871516394e-06, "loss": 0.1212, "num_input_tokens_seen": 21557440, "step": 31995 }, { "epoch": 0.7817653238218553, "grad_norm": 0.6961256861686707, "learning_rate": 1.9807199258890823e-06, "loss": 0.0333, "num_input_tokens_seen": 21561280, "step": 32000 }, { "epoch": 0.7818874746537023, "grad_norm": 12.567249298095703, "learning_rate": 1.980703257494466e-06, "loss": 0.0678, "num_input_tokens_seen": 21564672, "step": 32005 }, { "epoch": 0.7820096254855495, "grad_norm": 2.788113594055176, "learning_rate": 1.9806865819679116e-06, "loss": 0.0862, "num_input_tokens_seen": 21568704, "step": 32010 }, { "epoch": 0.7821317763173967, "grad_norm": 10.924982070922852, "learning_rate": 1.9806698993095407e-06, "loss": 0.026, "num_input_tokens_seen": 21572160, "step": 32015 }, { "epoch": 0.7822539271492439, "grad_norm": 25.12057876586914, "learning_rate": 1.9806532095194742e-06, "loss": 0.247, "num_input_tokens_seen": 21575680, "step": 32020 }, { "epoch": 0.7823760779810911, "grad_norm": 18.78164291381836, "learning_rate": 1.980636512597834e-06, "loss": 0.1509, "num_input_tokens_seen": 21579072, "step": 32025 }, { "epoch": 0.7824982288129382, "grad_norm": 59.385494232177734, "learning_rate": 1.9806198085447417e-06, "loss": 0.1559, "num_input_tokens_seen": 21582656, "step": 32030 }, { "epoch": 0.7826203796447854, "grad_norm": 0.6709699034690857, "learning_rate": 1.9806030973603183e-06, "loss": 0.1118, "num_input_tokens_seen": 21585600, "step": 32035 }, { "epoch": 0.7827425304766326, "grad_norm": 40.532989501953125, "learning_rate": 1.980586379044685e-06, "loss": 0.1132, "num_input_tokens_seen": 21588928, "step": 32040 }, { "epoch": 0.7828646813084797, "grad_norm": 16.257848739624023, "learning_rate": 1.9805696535979643e-06, "loss": 0.1005, "num_input_tokens_seen": 21591872, "step": 32045 }, { "epoch": 0.7829868321403268, "grad_norm": 18.62839698791504, "learning_rate": 1.980552921020277e-06, "loss": 0.0973, "num_input_tokens_seen": 21595456, "step": 32050 }, { "epoch": 0.783108982972174, "grad_norm": 13.70004940032959, "learning_rate": 1.980536181311745e-06, "loss": 0.0618, "num_input_tokens_seen": 21598208, "step": 32055 }, { "epoch": 0.7832311338040212, "grad_norm": 1.1185194253921509, "learning_rate": 1.9805194344724906e-06, "loss": 0.148, "num_input_tokens_seen": 21601152, "step": 32060 }, { "epoch": 0.7833532846358684, "grad_norm": 9.538461685180664, "learning_rate": 1.980502680502635e-06, "loss": 0.0887, "num_input_tokens_seen": 21604224, "step": 32065 }, { "epoch": 0.7834754354677156, "grad_norm": 17.74492645263672, "learning_rate": 1.9804859194023e-06, "loss": 0.1157, "num_input_tokens_seen": 21607168, "step": 32070 }, { "epoch": 0.7835975862995627, "grad_norm": 17.56431770324707, "learning_rate": 1.980469151171608e-06, "loss": 0.1083, "num_input_tokens_seen": 21610176, "step": 32075 }, { "epoch": 0.7837197371314099, "grad_norm": 30.301050186157227, "learning_rate": 1.9804523758106805e-06, "loss": 0.1411, "num_input_tokens_seen": 21613184, "step": 32080 }, { "epoch": 0.783841887963257, "grad_norm": 34.66545104980469, "learning_rate": 1.9804355933196397e-06, "loss": 0.1508, "num_input_tokens_seen": 21616384, "step": 32085 }, { "epoch": 0.7839640387951042, "grad_norm": 0.4128032922744751, "learning_rate": 1.9804188036986068e-06, "loss": 0.0559, "num_input_tokens_seen": 21619840, "step": 32090 }, { "epoch": 0.7840861896269513, "grad_norm": 18.219940185546875, "learning_rate": 1.9804020069477058e-06, "loss": 0.0879, "num_input_tokens_seen": 21622976, "step": 32095 }, { "epoch": 0.7842083404587985, "grad_norm": 45.373016357421875, "learning_rate": 1.980385203067057e-06, "loss": 0.0968, "num_input_tokens_seen": 21626432, "step": 32100 }, { "epoch": 0.7843304912906457, "grad_norm": 0.6460981369018555, "learning_rate": 1.9803683920567832e-06, "loss": 0.0054, "num_input_tokens_seen": 21629632, "step": 32105 }, { "epoch": 0.7844526421224929, "grad_norm": 24.669296264648438, "learning_rate": 1.9803515739170073e-06, "loss": 0.1798, "num_input_tokens_seen": 21632768, "step": 32110 }, { "epoch": 0.7845747929543401, "grad_norm": 40.41068649291992, "learning_rate": 1.9803347486478508e-06, "loss": 0.0212, "num_input_tokens_seen": 21635904, "step": 32115 }, { "epoch": 0.7846969437861872, "grad_norm": 49.69608688354492, "learning_rate": 1.980317916249436e-06, "loss": 0.1661, "num_input_tokens_seen": 21639168, "step": 32120 }, { "epoch": 0.7848190946180343, "grad_norm": 32.63628005981445, "learning_rate": 1.9803010767218864e-06, "loss": 0.1302, "num_input_tokens_seen": 21642432, "step": 32125 }, { "epoch": 0.7849412454498815, "grad_norm": 0.2649306356906891, "learning_rate": 1.980284230065323e-06, "loss": 0.0673, "num_input_tokens_seen": 21645888, "step": 32130 }, { "epoch": 0.7850633962817287, "grad_norm": 21.959501266479492, "learning_rate": 1.9802673762798696e-06, "loss": 0.3088, "num_input_tokens_seen": 21649216, "step": 32135 }, { "epoch": 0.7851855471135758, "grad_norm": 1.1638773679733276, "learning_rate": 1.9802505153656477e-06, "loss": 0.1599, "num_input_tokens_seen": 21652416, "step": 32140 }, { "epoch": 0.785307697945423, "grad_norm": 1.0158441066741943, "learning_rate": 1.9802336473227804e-06, "loss": 0.0481, "num_input_tokens_seen": 21655808, "step": 32145 }, { "epoch": 0.7854298487772702, "grad_norm": 4.161768436431885, "learning_rate": 1.9802167721513906e-06, "loss": 0.2222, "num_input_tokens_seen": 21658880, "step": 32150 }, { "epoch": 0.7855519996091174, "grad_norm": 7.720676898956299, "learning_rate": 1.9801998898516006e-06, "loss": 0.1291, "num_input_tokens_seen": 21662464, "step": 32155 }, { "epoch": 0.7856741504409646, "grad_norm": 0.2764137387275696, "learning_rate": 1.9801830004235333e-06, "loss": 0.089, "num_input_tokens_seen": 21665664, "step": 32160 }, { "epoch": 0.7857963012728116, "grad_norm": 37.62343978881836, "learning_rate": 1.9801661038673123e-06, "loss": 0.11, "num_input_tokens_seen": 21668928, "step": 32165 }, { "epoch": 0.7859184521046588, "grad_norm": 13.400060653686523, "learning_rate": 1.9801492001830587e-06, "loss": 0.0739, "num_input_tokens_seen": 21672832, "step": 32170 }, { "epoch": 0.786040602936506, "grad_norm": 7.543498992919922, "learning_rate": 1.980132289370897e-06, "loss": 0.0536, "num_input_tokens_seen": 21676608, "step": 32175 }, { "epoch": 0.7861627537683532, "grad_norm": 3.021576404571533, "learning_rate": 1.98011537143095e-06, "loss": 0.1529, "num_input_tokens_seen": 21679680, "step": 32180 }, { "epoch": 0.7862849046002003, "grad_norm": 17.60646629333496, "learning_rate": 1.98009844636334e-06, "loss": 0.1392, "num_input_tokens_seen": 21682624, "step": 32185 }, { "epoch": 0.7864070554320475, "grad_norm": 18.22474479675293, "learning_rate": 1.9800815141681902e-06, "loss": 0.1319, "num_input_tokens_seen": 21685632, "step": 32190 }, { "epoch": 0.7865292062638947, "grad_norm": 27.5200138092041, "learning_rate": 1.9800645748456247e-06, "loss": 0.2551, "num_input_tokens_seen": 21689024, "step": 32195 }, { "epoch": 0.7866513570957419, "grad_norm": 18.015851974487305, "learning_rate": 1.9800476283957656e-06, "loss": 0.101, "num_input_tokens_seen": 21692416, "step": 32200 }, { "epoch": 0.7867735079275889, "grad_norm": 33.747230529785156, "learning_rate": 1.9800306748187367e-06, "loss": 0.105, "num_input_tokens_seen": 21695552, "step": 32205 }, { "epoch": 0.7868956587594361, "grad_norm": 25.726058959960938, "learning_rate": 1.9800137141146612e-06, "loss": 0.0228, "num_input_tokens_seen": 21698752, "step": 32210 }, { "epoch": 0.7870178095912833, "grad_norm": 31.176944732666016, "learning_rate": 1.979996746283662e-06, "loss": 0.131, "num_input_tokens_seen": 21702336, "step": 32215 }, { "epoch": 0.7871399604231305, "grad_norm": 0.5230793952941895, "learning_rate": 1.9799797713258634e-06, "loss": 0.0525, "num_input_tokens_seen": 21705856, "step": 32220 }, { "epoch": 0.7872621112549777, "grad_norm": 27.23691177368164, "learning_rate": 1.9799627892413876e-06, "loss": 0.11, "num_input_tokens_seen": 21709056, "step": 32225 }, { "epoch": 0.7873842620868248, "grad_norm": 34.13618087768555, "learning_rate": 1.9799458000303594e-06, "loss": 0.1699, "num_input_tokens_seen": 21712832, "step": 32230 }, { "epoch": 0.787506412918672, "grad_norm": 14.356066703796387, "learning_rate": 1.9799288036929016e-06, "loss": 0.1034, "num_input_tokens_seen": 21716288, "step": 32235 }, { "epoch": 0.7876285637505192, "grad_norm": 16.639842987060547, "learning_rate": 1.979911800229138e-06, "loss": 0.0651, "num_input_tokens_seen": 21719872, "step": 32240 }, { "epoch": 0.7877507145823663, "grad_norm": 25.77016830444336, "learning_rate": 1.979894789639192e-06, "loss": 0.1812, "num_input_tokens_seen": 21723264, "step": 32245 }, { "epoch": 0.7878728654142134, "grad_norm": 15.297296524047852, "learning_rate": 1.9798777719231882e-06, "loss": 0.0791, "num_input_tokens_seen": 21726912, "step": 32250 }, { "epoch": 0.7879950162460606, "grad_norm": 1.2768000364303589, "learning_rate": 1.979860747081249e-06, "loss": 0.0353, "num_input_tokens_seen": 21730880, "step": 32255 }, { "epoch": 0.7881171670779078, "grad_norm": 0.267890602350235, "learning_rate": 1.979843715113499e-06, "loss": 0.1154, "num_input_tokens_seen": 21734080, "step": 32260 }, { "epoch": 0.788239317909755, "grad_norm": 22.803361892700195, "learning_rate": 1.9798266760200623e-06, "loss": 0.2999, "num_input_tokens_seen": 21737088, "step": 32265 }, { "epoch": 0.7883614687416022, "grad_norm": 1.6341105699539185, "learning_rate": 1.979809629801062e-06, "loss": 0.0086, "num_input_tokens_seen": 21740288, "step": 32270 }, { "epoch": 0.7884836195734493, "grad_norm": 36.74137878417969, "learning_rate": 1.979792576456623e-06, "loss": 0.1003, "num_input_tokens_seen": 21743424, "step": 32275 }, { "epoch": 0.7886057704052964, "grad_norm": 28.10059356689453, "learning_rate": 1.9797755159868687e-06, "loss": 0.2502, "num_input_tokens_seen": 21746368, "step": 32280 }, { "epoch": 0.7887279212371436, "grad_norm": 18.822345733642578, "learning_rate": 1.979758448391923e-06, "loss": 0.0359, "num_input_tokens_seen": 21749952, "step": 32285 }, { "epoch": 0.7888500720689908, "grad_norm": 16.408477783203125, "learning_rate": 1.9797413736719105e-06, "loss": 0.1076, "num_input_tokens_seen": 21753408, "step": 32290 }, { "epoch": 0.7889722229008379, "grad_norm": 0.689196765422821, "learning_rate": 1.9797242918269553e-06, "loss": 0.1068, "num_input_tokens_seen": 21756992, "step": 32295 }, { "epoch": 0.7890943737326851, "grad_norm": 1.9564344882965088, "learning_rate": 1.9797072028571816e-06, "loss": 0.0652, "num_input_tokens_seen": 21760704, "step": 32300 }, { "epoch": 0.7892165245645323, "grad_norm": 17.53862190246582, "learning_rate": 1.9796901067627135e-06, "loss": 0.1639, "num_input_tokens_seen": 21763904, "step": 32305 }, { "epoch": 0.7893386753963795, "grad_norm": 53.596839904785156, "learning_rate": 1.9796730035436756e-06, "loss": 0.1575, "num_input_tokens_seen": 21767168, "step": 32310 }, { "epoch": 0.7894608262282267, "grad_norm": 0.7122951149940491, "learning_rate": 1.9796558932001923e-06, "loss": 0.09, "num_input_tokens_seen": 21770688, "step": 32315 }, { "epoch": 0.7895829770600737, "grad_norm": 30.160568237304688, "learning_rate": 1.9796387757323873e-06, "loss": 0.1841, "num_input_tokens_seen": 21774144, "step": 32320 }, { "epoch": 0.7897051278919209, "grad_norm": 3.537285089492798, "learning_rate": 1.979621651140386e-06, "loss": 0.0075, "num_input_tokens_seen": 21777344, "step": 32325 }, { "epoch": 0.7898272787237681, "grad_norm": 14.99126148223877, "learning_rate": 1.9796045194243123e-06, "loss": 0.166, "num_input_tokens_seen": 21780544, "step": 32330 }, { "epoch": 0.7899494295556153, "grad_norm": 17.25547981262207, "learning_rate": 1.9795873805842914e-06, "loss": 0.121, "num_input_tokens_seen": 21783872, "step": 32335 }, { "epoch": 0.7900715803874624, "grad_norm": 39.5103759765625, "learning_rate": 1.9795702346204473e-06, "loss": 0.1063, "num_input_tokens_seen": 21787264, "step": 32340 }, { "epoch": 0.7901937312193096, "grad_norm": 34.86042404174805, "learning_rate": 1.9795530815329053e-06, "loss": 0.1067, "num_input_tokens_seen": 21790336, "step": 32345 }, { "epoch": 0.7903158820511568, "grad_norm": 39.395835876464844, "learning_rate": 1.979535921321789e-06, "loss": 0.0771, "num_input_tokens_seen": 21793536, "step": 32350 }, { "epoch": 0.790438032883004, "grad_norm": 0.45142436027526855, "learning_rate": 1.979518753987225e-06, "loss": 0.0313, "num_input_tokens_seen": 21796352, "step": 32355 }, { "epoch": 0.7905601837148512, "grad_norm": 0.10581724345684052, "learning_rate": 1.979501579529337e-06, "loss": 0.0862, "num_input_tokens_seen": 21800064, "step": 32360 }, { "epoch": 0.7906823345466982, "grad_norm": 28.593807220458984, "learning_rate": 1.9794843979482495e-06, "loss": 0.1273, "num_input_tokens_seen": 21803136, "step": 32365 }, { "epoch": 0.7908044853785454, "grad_norm": 19.38428497314453, "learning_rate": 1.9794672092440884e-06, "loss": 0.2033, "num_input_tokens_seen": 21806464, "step": 32370 }, { "epoch": 0.7909266362103926, "grad_norm": 21.668018341064453, "learning_rate": 1.9794500134169783e-06, "loss": 0.0742, "num_input_tokens_seen": 21809664, "step": 32375 }, { "epoch": 0.7910487870422398, "grad_norm": 20.23079490661621, "learning_rate": 1.979432810467044e-06, "loss": 0.1761, "num_input_tokens_seen": 21812800, "step": 32380 }, { "epoch": 0.7911709378740869, "grad_norm": 29.678831100463867, "learning_rate": 1.9794156003944115e-06, "loss": 0.0591, "num_input_tokens_seen": 21816576, "step": 32385 }, { "epoch": 0.7912930887059341, "grad_norm": 21.465499877929688, "learning_rate": 1.979398383199205e-06, "loss": 0.1364, "num_input_tokens_seen": 21819968, "step": 32390 }, { "epoch": 0.7914152395377813, "grad_norm": 22.944368362426758, "learning_rate": 1.9793811588815496e-06, "loss": 0.1386, "num_input_tokens_seen": 21823488, "step": 32395 }, { "epoch": 0.7915373903696284, "grad_norm": 22.3177547454834, "learning_rate": 1.9793639274415716e-06, "loss": 0.1915, "num_input_tokens_seen": 21826816, "step": 32400 }, { "epoch": 0.7916595412014756, "grad_norm": 1.5493758916854858, "learning_rate": 1.9793466888793953e-06, "loss": 0.109, "num_input_tokens_seen": 21830208, "step": 32405 }, { "epoch": 0.7917816920333227, "grad_norm": 0.8863944411277771, "learning_rate": 1.9793294431951467e-06, "loss": 0.0359, "num_input_tokens_seen": 21833344, "step": 32410 }, { "epoch": 0.7919038428651699, "grad_norm": 13.146297454833984, "learning_rate": 1.979312190388951e-06, "loss": 0.1417, "num_input_tokens_seen": 21836352, "step": 32415 }, { "epoch": 0.7920259936970171, "grad_norm": 2.506059408187866, "learning_rate": 1.9792949304609336e-06, "loss": 0.1282, "num_input_tokens_seen": 21840000, "step": 32420 }, { "epoch": 0.7921481445288643, "grad_norm": 24.569217681884766, "learning_rate": 1.97927766341122e-06, "loss": 0.1626, "num_input_tokens_seen": 21843264, "step": 32425 }, { "epoch": 0.7922702953607114, "grad_norm": 9.783016204833984, "learning_rate": 1.9792603892399357e-06, "loss": 0.1898, "num_input_tokens_seen": 21847168, "step": 32430 }, { "epoch": 0.7923924461925586, "grad_norm": 19.442590713500977, "learning_rate": 1.9792431079472067e-06, "loss": 0.1549, "num_input_tokens_seen": 21851136, "step": 32435 }, { "epoch": 0.7925145970244057, "grad_norm": 2.4780938625335693, "learning_rate": 1.9792258195331583e-06, "loss": 0.1009, "num_input_tokens_seen": 21854720, "step": 32440 }, { "epoch": 0.7926367478562529, "grad_norm": 8.407721519470215, "learning_rate": 1.9792085239979163e-06, "loss": 0.0791, "num_input_tokens_seen": 21858176, "step": 32445 }, { "epoch": 0.7927588986881, "grad_norm": 7.173499584197998, "learning_rate": 1.9791912213416065e-06, "loss": 0.2024, "num_input_tokens_seen": 21861376, "step": 32450 }, { "epoch": 0.7928810495199472, "grad_norm": 1.6724401712417603, "learning_rate": 1.9791739115643547e-06, "loss": 0.0583, "num_input_tokens_seen": 21864448, "step": 32455 }, { "epoch": 0.7930032003517944, "grad_norm": 15.159879684448242, "learning_rate": 1.9791565946662875e-06, "loss": 0.1266, "num_input_tokens_seen": 21868096, "step": 32460 }, { "epoch": 0.7931253511836416, "grad_norm": 22.336624145507812, "learning_rate": 1.9791392706475298e-06, "loss": 0.1452, "num_input_tokens_seen": 21871296, "step": 32465 }, { "epoch": 0.7932475020154888, "grad_norm": 0.22137446701526642, "learning_rate": 1.979121939508208e-06, "loss": 0.0596, "num_input_tokens_seen": 21875072, "step": 32470 }, { "epoch": 0.7933696528473358, "grad_norm": 2.6097412109375, "learning_rate": 1.9791046012484478e-06, "loss": 0.1029, "num_input_tokens_seen": 21878528, "step": 32475 }, { "epoch": 0.793491803679183, "grad_norm": 10.868502616882324, "learning_rate": 1.9790872558683755e-06, "loss": 0.0529, "num_input_tokens_seen": 21882176, "step": 32480 }, { "epoch": 0.7936139545110302, "grad_norm": 0.6876401305198669, "learning_rate": 1.9790699033681176e-06, "loss": 0.1334, "num_input_tokens_seen": 21885760, "step": 32485 }, { "epoch": 0.7937361053428774, "grad_norm": 31.104290008544922, "learning_rate": 1.9790525437478002e-06, "loss": 0.3129, "num_input_tokens_seen": 21889152, "step": 32490 }, { "epoch": 0.7938582561747245, "grad_norm": 2.89729380607605, "learning_rate": 1.9790351770075492e-06, "loss": 0.0127, "num_input_tokens_seen": 21892352, "step": 32495 }, { "epoch": 0.7939804070065717, "grad_norm": 8.085244178771973, "learning_rate": 1.9790178031474912e-06, "loss": 0.1781, "num_input_tokens_seen": 21895744, "step": 32500 }, { "epoch": 0.7941025578384189, "grad_norm": 25.23969078063965, "learning_rate": 1.9790004221677524e-06, "loss": 0.0983, "num_input_tokens_seen": 21898944, "step": 32505 }, { "epoch": 0.7942247086702661, "grad_norm": 12.462812423706055, "learning_rate": 1.978983034068459e-06, "loss": 0.1732, "num_input_tokens_seen": 21902400, "step": 32510 }, { "epoch": 0.7943468595021133, "grad_norm": 1.5091313123703003, "learning_rate": 1.9789656388497376e-06, "loss": 0.1392, "num_input_tokens_seen": 21905728, "step": 32515 }, { "epoch": 0.7944690103339603, "grad_norm": 6.18993616104126, "learning_rate": 1.978948236511715e-06, "loss": 0.1258, "num_input_tokens_seen": 21909184, "step": 32520 }, { "epoch": 0.7945911611658075, "grad_norm": 11.886087417602539, "learning_rate": 1.9789308270545175e-06, "loss": 0.0554, "num_input_tokens_seen": 21912256, "step": 32525 }, { "epoch": 0.7947133119976547, "grad_norm": 30.678869247436523, "learning_rate": 1.9789134104782716e-06, "loss": 0.0334, "num_input_tokens_seen": 21915904, "step": 32530 }, { "epoch": 0.7948354628295019, "grad_norm": 9.77381420135498, "learning_rate": 1.9788959867831043e-06, "loss": 0.2204, "num_input_tokens_seen": 21919232, "step": 32535 }, { "epoch": 0.794957613661349, "grad_norm": 0.5201031565666199, "learning_rate": 1.9788785559691417e-06, "loss": 0.1402, "num_input_tokens_seen": 21922048, "step": 32540 }, { "epoch": 0.7950797644931962, "grad_norm": 0.22782576084136963, "learning_rate": 1.9788611180365114e-06, "loss": 0.0586, "num_input_tokens_seen": 21925568, "step": 32545 }, { "epoch": 0.7952019153250434, "grad_norm": 30.4063663482666, "learning_rate": 1.9788436729853395e-06, "loss": 0.2114, "num_input_tokens_seen": 21928704, "step": 32550 }, { "epoch": 0.7953240661568906, "grad_norm": 39.70443344116211, "learning_rate": 1.978826220815753e-06, "loss": 0.1629, "num_input_tokens_seen": 21932032, "step": 32555 }, { "epoch": 0.7954462169887377, "grad_norm": 14.86407470703125, "learning_rate": 1.9788087615278793e-06, "loss": 0.1327, "num_input_tokens_seen": 21935808, "step": 32560 }, { "epoch": 0.7955683678205848, "grad_norm": 2.5342135429382324, "learning_rate": 1.9787912951218447e-06, "loss": 0.0649, "num_input_tokens_seen": 21939264, "step": 32565 }, { "epoch": 0.795690518652432, "grad_norm": 13.98967170715332, "learning_rate": 1.978773821597777e-06, "loss": 0.1148, "num_input_tokens_seen": 21942848, "step": 32570 }, { "epoch": 0.7958126694842792, "grad_norm": 0.22576695680618286, "learning_rate": 1.978756340955802e-06, "loss": 0.0996, "num_input_tokens_seen": 21946048, "step": 32575 }, { "epoch": 0.7959348203161264, "grad_norm": 13.705849647521973, "learning_rate": 1.9787388531960488e-06, "loss": 0.0786, "num_input_tokens_seen": 21949696, "step": 32580 }, { "epoch": 0.7960569711479735, "grad_norm": 0.5072705149650574, "learning_rate": 1.9787213583186423e-06, "loss": 0.0497, "num_input_tokens_seen": 21953088, "step": 32585 }, { "epoch": 0.7961791219798207, "grad_norm": 9.198322296142578, "learning_rate": 1.9787038563237117e-06, "loss": 0.1569, "num_input_tokens_seen": 21956224, "step": 32590 }, { "epoch": 0.7963012728116678, "grad_norm": 0.8958269357681274, "learning_rate": 1.978686347211383e-06, "loss": 0.0616, "num_input_tokens_seen": 21959360, "step": 32595 }, { "epoch": 0.796423423643515, "grad_norm": 38.93531036376953, "learning_rate": 1.9786688309817836e-06, "loss": 0.1849, "num_input_tokens_seen": 21962496, "step": 32600 }, { "epoch": 0.7965455744753622, "grad_norm": 17.935117721557617, "learning_rate": 1.978651307635042e-06, "loss": 0.1986, "num_input_tokens_seen": 21966016, "step": 32605 }, { "epoch": 0.7966677253072093, "grad_norm": 8.136306762695312, "learning_rate": 1.9786337771712845e-06, "loss": 0.114, "num_input_tokens_seen": 21969792, "step": 32610 }, { "epoch": 0.7967898761390565, "grad_norm": 0.9560222029685974, "learning_rate": 1.9786162395906388e-06, "loss": 0.1428, "num_input_tokens_seen": 21973184, "step": 32615 }, { "epoch": 0.7969120269709037, "grad_norm": 10.984770774841309, "learning_rate": 1.9785986948932324e-06, "loss": 0.1996, "num_input_tokens_seen": 21976768, "step": 32620 }, { "epoch": 0.7970341778027509, "grad_norm": 37.20198440551758, "learning_rate": 1.9785811430791932e-06, "loss": 0.0833, "num_input_tokens_seen": 21980480, "step": 32625 }, { "epoch": 0.797156328634598, "grad_norm": 0.6844213604927063, "learning_rate": 1.9785635841486492e-06, "loss": 0.0975, "num_input_tokens_seen": 21984320, "step": 32630 }, { "epoch": 0.7972784794664451, "grad_norm": 15.415057182312012, "learning_rate": 1.9785460181017274e-06, "loss": 0.0554, "num_input_tokens_seen": 21987776, "step": 32635 }, { "epoch": 0.7974006302982923, "grad_norm": 17.4122257232666, "learning_rate": 1.9785284449385557e-06, "loss": 0.1462, "num_input_tokens_seen": 21991360, "step": 32640 }, { "epoch": 0.7975227811301395, "grad_norm": 0.6733130812644958, "learning_rate": 1.978510864659262e-06, "loss": 0.1373, "num_input_tokens_seen": 21994816, "step": 32645 }, { "epoch": 0.7976449319619867, "grad_norm": 28.956544876098633, "learning_rate": 1.978493277263974e-06, "loss": 0.1573, "num_input_tokens_seen": 21998208, "step": 32650 }, { "epoch": 0.7977670827938338, "grad_norm": 26.73524284362793, "learning_rate": 1.97847568275282e-06, "loss": 0.081, "num_input_tokens_seen": 22002048, "step": 32655 }, { "epoch": 0.797889233625681, "grad_norm": 0.4550372064113617, "learning_rate": 1.978458081125927e-06, "loss": 0.0815, "num_input_tokens_seen": 22005056, "step": 32660 }, { "epoch": 0.7980113844575282, "grad_norm": 33.69190979003906, "learning_rate": 1.978440472383424e-06, "loss": 0.1989, "num_input_tokens_seen": 22008320, "step": 32665 }, { "epoch": 0.7981335352893754, "grad_norm": 16.381532669067383, "learning_rate": 1.978422856525439e-06, "loss": 0.1015, "num_input_tokens_seen": 22011008, "step": 32670 }, { "epoch": 0.7982556861212224, "grad_norm": 26.662216186523438, "learning_rate": 1.9784052335520997e-06, "loss": 0.1529, "num_input_tokens_seen": 22014016, "step": 32675 }, { "epoch": 0.7983778369530696, "grad_norm": 8.46114730834961, "learning_rate": 1.978387603463534e-06, "loss": 0.0387, "num_input_tokens_seen": 22017472, "step": 32680 }, { "epoch": 0.7984999877849168, "grad_norm": 0.5248132348060608, "learning_rate": 1.978369966259871e-06, "loss": 0.0815, "num_input_tokens_seen": 22020544, "step": 32685 }, { "epoch": 0.798622138616764, "grad_norm": 10.598689079284668, "learning_rate": 1.978352321941238e-06, "loss": 0.0838, "num_input_tokens_seen": 22023744, "step": 32690 }, { "epoch": 0.7987442894486112, "grad_norm": 0.6789451241493225, "learning_rate": 1.978334670507764e-06, "loss": 0.0937, "num_input_tokens_seen": 22026944, "step": 32695 }, { "epoch": 0.7988664402804583, "grad_norm": 25.466279983520508, "learning_rate": 1.9783170119595775e-06, "loss": 0.1423, "num_input_tokens_seen": 22030144, "step": 32700 }, { "epoch": 0.7989885911123055, "grad_norm": 18.387006759643555, "learning_rate": 1.978299346296806e-06, "loss": 0.1669, "num_input_tokens_seen": 22033408, "step": 32705 }, { "epoch": 0.7991107419441527, "grad_norm": 13.358413696289062, "learning_rate": 1.9782816735195786e-06, "loss": 0.1384, "num_input_tokens_seen": 22036928, "step": 32710 }, { "epoch": 0.7992328927759998, "grad_norm": 10.378283500671387, "learning_rate": 1.978263993628024e-06, "loss": 0.201, "num_input_tokens_seen": 22040512, "step": 32715 }, { "epoch": 0.7993550436078469, "grad_norm": 0.48704561591148376, "learning_rate": 1.9782463066222702e-06, "loss": 0.0564, "num_input_tokens_seen": 22043776, "step": 32720 }, { "epoch": 0.7994771944396941, "grad_norm": 35.04000473022461, "learning_rate": 1.9782286125024464e-06, "loss": 0.1006, "num_input_tokens_seen": 22046784, "step": 32725 }, { "epoch": 0.7995993452715413, "grad_norm": 23.447757720947266, "learning_rate": 1.9782109112686812e-06, "loss": 0.125, "num_input_tokens_seen": 22049856, "step": 32730 }, { "epoch": 0.7997214961033885, "grad_norm": 27.612590789794922, "learning_rate": 1.9781932029211027e-06, "loss": 0.1607, "num_input_tokens_seen": 22053056, "step": 32735 }, { "epoch": 0.7998436469352356, "grad_norm": 24.697265625, "learning_rate": 1.9781754874598403e-06, "loss": 0.1071, "num_input_tokens_seen": 22056128, "step": 32740 }, { "epoch": 0.7999657977670828, "grad_norm": 43.93508529663086, "learning_rate": 1.978157764885023e-06, "loss": 0.1883, "num_input_tokens_seen": 22059520, "step": 32745 }, { "epoch": 0.80008794859893, "grad_norm": 2.7973132133483887, "learning_rate": 1.9781400351967787e-06, "loss": 0.0534, "num_input_tokens_seen": 22063168, "step": 32750 }, { "epoch": 0.8002100994307771, "grad_norm": 21.7699031829834, "learning_rate": 1.9781222983952374e-06, "loss": 0.199, "num_input_tokens_seen": 22066496, "step": 32755 }, { "epoch": 0.8003322502626243, "grad_norm": 0.8064271807670593, "learning_rate": 1.9781045544805273e-06, "loss": 0.0776, "num_input_tokens_seen": 22069696, "step": 32760 }, { "epoch": 0.8004544010944714, "grad_norm": 2.368868589401245, "learning_rate": 1.9780868034527783e-06, "loss": 0.0598, "num_input_tokens_seen": 22073472, "step": 32765 }, { "epoch": 0.8005765519263186, "grad_norm": 22.38524627685547, "learning_rate": 1.9780690453121185e-06, "loss": 0.1173, "num_input_tokens_seen": 22076352, "step": 32770 }, { "epoch": 0.8006987027581658, "grad_norm": 3.6442208290100098, "learning_rate": 1.978051280058678e-06, "loss": 0.0787, "num_input_tokens_seen": 22079808, "step": 32775 }, { "epoch": 0.800820853590013, "grad_norm": 17.011329650878906, "learning_rate": 1.978033507692585e-06, "loss": 0.1477, "num_input_tokens_seen": 22082944, "step": 32780 }, { "epoch": 0.8009430044218601, "grad_norm": 21.807342529296875, "learning_rate": 1.9780157282139697e-06, "loss": 0.1523, "num_input_tokens_seen": 22086336, "step": 32785 }, { "epoch": 0.8010651552537073, "grad_norm": 2.756991386413574, "learning_rate": 1.977997941622961e-06, "loss": 0.0947, "num_input_tokens_seen": 22089280, "step": 32790 }, { "epoch": 0.8011873060855544, "grad_norm": 34.63652420043945, "learning_rate": 1.9779801479196877e-06, "loss": 0.1376, "num_input_tokens_seen": 22093120, "step": 32795 }, { "epoch": 0.8013094569174016, "grad_norm": 0.5275918245315552, "learning_rate": 1.97796234710428e-06, "loss": 0.1301, "num_input_tokens_seen": 22096640, "step": 32800 }, { "epoch": 0.8014316077492488, "grad_norm": 29.089365005493164, "learning_rate": 1.9779445391768673e-06, "loss": 0.1151, "num_input_tokens_seen": 22099776, "step": 32805 }, { "epoch": 0.8015537585810959, "grad_norm": 0.1555165946483612, "learning_rate": 1.9779267241375786e-06, "loss": 0.0475, "num_input_tokens_seen": 22102912, "step": 32810 }, { "epoch": 0.8016759094129431, "grad_norm": 7.828727722167969, "learning_rate": 1.977908901986544e-06, "loss": 0.0806, "num_input_tokens_seen": 22106304, "step": 32815 }, { "epoch": 0.8017980602447903, "grad_norm": 14.934768676757812, "learning_rate": 1.9778910727238925e-06, "loss": 0.0678, "num_input_tokens_seen": 22109568, "step": 32820 }, { "epoch": 0.8019202110766375, "grad_norm": 20.33085823059082, "learning_rate": 1.9778732363497544e-06, "loss": 0.1423, "num_input_tokens_seen": 22112960, "step": 32825 }, { "epoch": 0.8020423619084845, "grad_norm": 18.72711181640625, "learning_rate": 1.977855392864259e-06, "loss": 0.0479, "num_input_tokens_seen": 22116096, "step": 32830 }, { "epoch": 0.8021645127403317, "grad_norm": 0.35270586609840393, "learning_rate": 1.977837542267536e-06, "loss": 0.0336, "num_input_tokens_seen": 22119552, "step": 32835 }, { "epoch": 0.8022866635721789, "grad_norm": 6.33324670791626, "learning_rate": 1.9778196845597157e-06, "loss": 0.0892, "num_input_tokens_seen": 22122752, "step": 32840 }, { "epoch": 0.8024088144040261, "grad_norm": 16.176767349243164, "learning_rate": 1.977801819740928e-06, "loss": 0.0598, "num_input_tokens_seen": 22125952, "step": 32845 }, { "epoch": 0.8025309652358733, "grad_norm": 25.649085998535156, "learning_rate": 1.9777839478113015e-06, "loss": 0.134, "num_input_tokens_seen": 22129152, "step": 32850 }, { "epoch": 0.8026531160677204, "grad_norm": 0.7280669808387756, "learning_rate": 1.9777660687709678e-06, "loss": 0.0702, "num_input_tokens_seen": 22132416, "step": 32855 }, { "epoch": 0.8027752668995676, "grad_norm": 10.644025802612305, "learning_rate": 1.977748182620056e-06, "loss": 0.1017, "num_input_tokens_seen": 22135808, "step": 32860 }, { "epoch": 0.8028974177314148, "grad_norm": 0.5144332647323608, "learning_rate": 1.9777302893586966e-06, "loss": 0.1627, "num_input_tokens_seen": 22138944, "step": 32865 }, { "epoch": 0.803019568563262, "grad_norm": 2.8789901733398438, "learning_rate": 1.9777123889870197e-06, "loss": 0.0645, "num_input_tokens_seen": 22142272, "step": 32870 }, { "epoch": 0.803141719395109, "grad_norm": 17.88456916809082, "learning_rate": 1.9776944815051547e-06, "loss": 0.1231, "num_input_tokens_seen": 22145472, "step": 32875 }, { "epoch": 0.8032638702269562, "grad_norm": 21.952150344848633, "learning_rate": 1.977676566913233e-06, "loss": 0.0995, "num_input_tokens_seen": 22148608, "step": 32880 }, { "epoch": 0.8033860210588034, "grad_norm": 0.8212730884552002, "learning_rate": 1.9776586452113842e-06, "loss": 0.1121, "num_input_tokens_seen": 22152064, "step": 32885 }, { "epoch": 0.8035081718906506, "grad_norm": 21.854928970336914, "learning_rate": 1.977640716399739e-06, "loss": 0.098, "num_input_tokens_seen": 22155328, "step": 32890 }, { "epoch": 0.8036303227224978, "grad_norm": 7.417600631713867, "learning_rate": 1.9776227804784275e-06, "loss": 0.0745, "num_input_tokens_seen": 22158912, "step": 32895 }, { "epoch": 0.8037524735543449, "grad_norm": 0.8115556240081787, "learning_rate": 1.9776048374475794e-06, "loss": 0.0898, "num_input_tokens_seen": 22162112, "step": 32900 }, { "epoch": 0.8038746243861921, "grad_norm": 40.99019241333008, "learning_rate": 1.9775868873073267e-06, "loss": 0.1985, "num_input_tokens_seen": 22165504, "step": 32905 }, { "epoch": 0.8039967752180393, "grad_norm": 1.1838963031768799, "learning_rate": 1.9775689300577995e-06, "loss": 0.038, "num_input_tokens_seen": 22168832, "step": 32910 }, { "epoch": 0.8041189260498864, "grad_norm": 16.456296920776367, "learning_rate": 1.9775509656991277e-06, "loss": 0.1404, "num_input_tokens_seen": 22172096, "step": 32915 }, { "epoch": 0.8042410768817335, "grad_norm": 18.349414825439453, "learning_rate": 1.977532994231442e-06, "loss": 0.0826, "num_input_tokens_seen": 22175360, "step": 32920 }, { "epoch": 0.8043632277135807, "grad_norm": 6.988818168640137, "learning_rate": 1.9775150156548743e-06, "loss": 0.0411, "num_input_tokens_seen": 22179008, "step": 32925 }, { "epoch": 0.8044853785454279, "grad_norm": 106.58795166015625, "learning_rate": 1.977497029969554e-06, "loss": 0.1473, "num_input_tokens_seen": 22182336, "step": 32930 }, { "epoch": 0.8046075293772751, "grad_norm": 22.680753707885742, "learning_rate": 1.977479037175612e-06, "loss": 0.0851, "num_input_tokens_seen": 22185472, "step": 32935 }, { "epoch": 0.8047296802091223, "grad_norm": 8.83067512512207, "learning_rate": 1.97746103727318e-06, "loss": 0.2425, "num_input_tokens_seen": 22189120, "step": 32940 }, { "epoch": 0.8048518310409694, "grad_norm": 27.481521606445312, "learning_rate": 1.9774430302623887e-06, "loss": 0.1671, "num_input_tokens_seen": 22192768, "step": 32945 }, { "epoch": 0.8049739818728165, "grad_norm": 41.317142486572266, "learning_rate": 1.977425016143368e-06, "loss": 0.2252, "num_input_tokens_seen": 22196288, "step": 32950 }, { "epoch": 0.8050961327046637, "grad_norm": 45.449501037597656, "learning_rate": 1.9774069949162504e-06, "loss": 0.1459, "num_input_tokens_seen": 22199744, "step": 32955 }, { "epoch": 0.8052182835365109, "grad_norm": 0.5199189186096191, "learning_rate": 1.9773889665811657e-06, "loss": 0.132, "num_input_tokens_seen": 22203520, "step": 32960 }, { "epoch": 0.805340434368358, "grad_norm": 18.819425582885742, "learning_rate": 1.977370931138246e-06, "loss": 0.0947, "num_input_tokens_seen": 22206976, "step": 32965 }, { "epoch": 0.8054625852002052, "grad_norm": 36.9178352355957, "learning_rate": 1.977352888587622e-06, "loss": 0.2265, "num_input_tokens_seen": 22210496, "step": 32970 }, { "epoch": 0.8055847360320524, "grad_norm": 18.44637680053711, "learning_rate": 1.9773348389294243e-06, "loss": 0.09, "num_input_tokens_seen": 22213760, "step": 32975 }, { "epoch": 0.8057068868638996, "grad_norm": 1.2746877670288086, "learning_rate": 1.9773167821637855e-06, "loss": 0.1072, "num_input_tokens_seen": 22217152, "step": 32980 }, { "epoch": 0.8058290376957467, "grad_norm": 2.1266651153564453, "learning_rate": 1.977298718290836e-06, "loss": 0.1145, "num_input_tokens_seen": 22221056, "step": 32985 }, { "epoch": 0.8059511885275938, "grad_norm": 0.1620056927204132, "learning_rate": 1.9772806473107072e-06, "loss": 0.0897, "num_input_tokens_seen": 22224320, "step": 32990 }, { "epoch": 0.806073339359441, "grad_norm": 40.33462905883789, "learning_rate": 1.977262569223531e-06, "loss": 0.1855, "num_input_tokens_seen": 22227840, "step": 32995 }, { "epoch": 0.8061954901912882, "grad_norm": 0.31348946690559387, "learning_rate": 1.977244484029438e-06, "loss": 0.0983, "num_input_tokens_seen": 22230976, "step": 33000 }, { "epoch": 0.8063176410231354, "grad_norm": 28.071033477783203, "learning_rate": 1.9772263917285606e-06, "loss": 0.1356, "num_input_tokens_seen": 22234432, "step": 33005 }, { "epoch": 0.8064397918549825, "grad_norm": 10.321966171264648, "learning_rate": 1.97720829232103e-06, "loss": 0.1449, "num_input_tokens_seen": 22237824, "step": 33010 }, { "epoch": 0.8065619426868297, "grad_norm": 0.7909249067306519, "learning_rate": 1.9771901858069778e-06, "loss": 0.1319, "num_input_tokens_seen": 22241088, "step": 33015 }, { "epoch": 0.8066840935186769, "grad_norm": 20.836950302124023, "learning_rate": 1.9771720721865355e-06, "loss": 0.1683, "num_input_tokens_seen": 22244352, "step": 33020 }, { "epoch": 0.8068062443505241, "grad_norm": 3.560326099395752, "learning_rate": 1.977153951459836e-06, "loss": 0.1275, "num_input_tokens_seen": 22247552, "step": 33025 }, { "epoch": 0.8069283951823711, "grad_norm": 0.5384361147880554, "learning_rate": 1.977135823627009e-06, "loss": 0.083, "num_input_tokens_seen": 22251456, "step": 33030 }, { "epoch": 0.8070505460142183, "grad_norm": 0.37456443905830383, "learning_rate": 1.9771176886881883e-06, "loss": 0.1143, "num_input_tokens_seen": 22254656, "step": 33035 }, { "epoch": 0.8071726968460655, "grad_norm": 6.5614237785339355, "learning_rate": 1.9770995466435044e-06, "loss": 0.0217, "num_input_tokens_seen": 22257728, "step": 33040 }, { "epoch": 0.8072948476779127, "grad_norm": 42.75928497314453, "learning_rate": 1.97708139749309e-06, "loss": 0.1372, "num_input_tokens_seen": 22260992, "step": 33045 }, { "epoch": 0.8074169985097599, "grad_norm": 0.1671907603740692, "learning_rate": 1.977063241237077e-06, "loss": 0.1286, "num_input_tokens_seen": 22264128, "step": 33050 }, { "epoch": 0.807539149341607, "grad_norm": 10.650352478027344, "learning_rate": 1.9770450778755972e-06, "loss": 0.103, "num_input_tokens_seen": 22267392, "step": 33055 }, { "epoch": 0.8076613001734542, "grad_norm": 16.05209732055664, "learning_rate": 1.9770269074087825e-06, "loss": 0.1014, "num_input_tokens_seen": 22270720, "step": 33060 }, { "epoch": 0.8077834510053014, "grad_norm": 0.3560777008533478, "learning_rate": 1.9770087298367657e-06, "loss": 0.0469, "num_input_tokens_seen": 22273856, "step": 33065 }, { "epoch": 0.8079056018371485, "grad_norm": 0.26360440254211426, "learning_rate": 1.976990545159679e-06, "loss": 0.088, "num_input_tokens_seen": 22277248, "step": 33070 }, { "epoch": 0.8080277526689956, "grad_norm": 7.372500419616699, "learning_rate": 1.9769723533776537e-06, "loss": 0.1035, "num_input_tokens_seen": 22280512, "step": 33075 }, { "epoch": 0.8081499035008428, "grad_norm": 12.81210708618164, "learning_rate": 1.9769541544908228e-06, "loss": 0.1061, "num_input_tokens_seen": 22283776, "step": 33080 }, { "epoch": 0.80827205433269, "grad_norm": 16.666837692260742, "learning_rate": 1.9769359484993183e-06, "loss": 0.1643, "num_input_tokens_seen": 22288128, "step": 33085 }, { "epoch": 0.8083942051645372, "grad_norm": 0.25864091515541077, "learning_rate": 1.976917735403273e-06, "loss": 0.0042, "num_input_tokens_seen": 22291456, "step": 33090 }, { "epoch": 0.8085163559963844, "grad_norm": 6.298408508300781, "learning_rate": 1.976899515202819e-06, "loss": 0.2322, "num_input_tokens_seen": 22294528, "step": 33095 }, { "epoch": 0.8086385068282315, "grad_norm": 0.8649539351463318, "learning_rate": 1.9768812878980896e-06, "loss": 0.0393, "num_input_tokens_seen": 22297472, "step": 33100 }, { "epoch": 0.8087606576600787, "grad_norm": 0.1666417419910431, "learning_rate": 1.9768630534892164e-06, "loss": 0.1869, "num_input_tokens_seen": 22300672, "step": 33105 }, { "epoch": 0.8088828084919258, "grad_norm": 20.9919376373291, "learning_rate": 1.976844811976332e-06, "loss": 0.0332, "num_input_tokens_seen": 22304320, "step": 33110 }, { "epoch": 0.809004959323773, "grad_norm": 17.010021209716797, "learning_rate": 1.97682656335957e-06, "loss": 0.1254, "num_input_tokens_seen": 22307200, "step": 33115 }, { "epoch": 0.8091271101556201, "grad_norm": 17.23734474182129, "learning_rate": 1.976808307639062e-06, "loss": 0.1566, "num_input_tokens_seen": 22310464, "step": 33120 }, { "epoch": 0.8092492609874673, "grad_norm": 19.25299644470215, "learning_rate": 1.976790044814941e-06, "loss": 0.1316, "num_input_tokens_seen": 22313792, "step": 33125 }, { "epoch": 0.8093714118193145, "grad_norm": 1.0612061023712158, "learning_rate": 1.976771774887341e-06, "loss": 0.0509, "num_input_tokens_seen": 22317376, "step": 33130 }, { "epoch": 0.8094935626511617, "grad_norm": 30.042892456054688, "learning_rate": 1.976753497856393e-06, "loss": 0.1537, "num_input_tokens_seen": 22321024, "step": 33135 }, { "epoch": 0.8096157134830089, "grad_norm": 0.48264002799987793, "learning_rate": 1.9767352137222313e-06, "loss": 0.0427, "num_input_tokens_seen": 22324288, "step": 33140 }, { "epoch": 0.809737864314856, "grad_norm": 14.422656059265137, "learning_rate": 1.9767169224849884e-06, "loss": 0.1251, "num_input_tokens_seen": 22327360, "step": 33145 }, { "epoch": 0.8098600151467031, "grad_norm": 19.12939453125, "learning_rate": 1.9766986241447975e-06, "loss": 0.1213, "num_input_tokens_seen": 22330688, "step": 33150 }, { "epoch": 0.8099821659785503, "grad_norm": 27.57765007019043, "learning_rate": 1.9766803187017914e-06, "loss": 0.1751, "num_input_tokens_seen": 22334272, "step": 33155 }, { "epoch": 0.8101043168103975, "grad_norm": 0.18605254590511322, "learning_rate": 1.976662006156103e-06, "loss": 0.0847, "num_input_tokens_seen": 22337600, "step": 33160 }, { "epoch": 0.8102264676422446, "grad_norm": 23.411142349243164, "learning_rate": 1.9766436865078663e-06, "loss": 0.1183, "num_input_tokens_seen": 22340672, "step": 33165 }, { "epoch": 0.8103486184740918, "grad_norm": 47.72703170776367, "learning_rate": 1.9766253597572136e-06, "loss": 0.2254, "num_input_tokens_seen": 22343936, "step": 33170 }, { "epoch": 0.810470769305939, "grad_norm": 12.8462553024292, "learning_rate": 1.9766070259042786e-06, "loss": 0.1445, "num_input_tokens_seen": 22347008, "step": 33175 }, { "epoch": 0.8105929201377862, "grad_norm": 19.043668746948242, "learning_rate": 1.976588684949195e-06, "loss": 0.1202, "num_input_tokens_seen": 22350016, "step": 33180 }, { "epoch": 0.8107150709696334, "grad_norm": 8.385092735290527, "learning_rate": 1.9765703368920958e-06, "loss": 0.1223, "num_input_tokens_seen": 22353408, "step": 33185 }, { "epoch": 0.8108372218014804, "grad_norm": 10.078439712524414, "learning_rate": 1.976551981733114e-06, "loss": 0.0386, "num_input_tokens_seen": 22356736, "step": 33190 }, { "epoch": 0.8109593726333276, "grad_norm": 12.328940391540527, "learning_rate": 1.9765336194723836e-06, "loss": 0.1587, "num_input_tokens_seen": 22360384, "step": 33195 }, { "epoch": 0.8110815234651748, "grad_norm": 0.8422468900680542, "learning_rate": 1.9765152501100386e-06, "loss": 0.1238, "num_input_tokens_seen": 22363904, "step": 33200 }, { "epoch": 0.811203674297022, "grad_norm": 2.3422234058380127, "learning_rate": 1.9764968736462116e-06, "loss": 0.0939, "num_input_tokens_seen": 22367488, "step": 33205 }, { "epoch": 0.8113258251288691, "grad_norm": 0.6595805287361145, "learning_rate": 1.9764784900810367e-06, "loss": 0.0102, "num_input_tokens_seen": 22370560, "step": 33210 }, { "epoch": 0.8114479759607163, "grad_norm": 2.1569666862487793, "learning_rate": 1.9764600994146474e-06, "loss": 0.0797, "num_input_tokens_seen": 22374336, "step": 33215 }, { "epoch": 0.8115701267925635, "grad_norm": 15.887921333312988, "learning_rate": 1.976441701647178e-06, "loss": 0.1435, "num_input_tokens_seen": 22377792, "step": 33220 }, { "epoch": 0.8116922776244107, "grad_norm": 0.7121223211288452, "learning_rate": 1.976423296778762e-06, "loss": 0.082, "num_input_tokens_seen": 22380800, "step": 33225 }, { "epoch": 0.8118144284562578, "grad_norm": 33.41606903076172, "learning_rate": 1.976404884809533e-06, "loss": 0.1396, "num_input_tokens_seen": 22384128, "step": 33230 }, { "epoch": 0.8119365792881049, "grad_norm": 31.53432846069336, "learning_rate": 1.9763864657396247e-06, "loss": 0.0456, "num_input_tokens_seen": 22388032, "step": 33235 }, { "epoch": 0.8120587301199521, "grad_norm": 0.27628836035728455, "learning_rate": 1.976368039569172e-06, "loss": 0.1648, "num_input_tokens_seen": 22391424, "step": 33240 }, { "epoch": 0.8121808809517993, "grad_norm": 42.41301345825195, "learning_rate": 1.976349606298308e-06, "loss": 0.1858, "num_input_tokens_seen": 22395136, "step": 33245 }, { "epoch": 0.8123030317836465, "grad_norm": 5.459909439086914, "learning_rate": 1.9763311659271672e-06, "loss": 0.0211, "num_input_tokens_seen": 22398592, "step": 33250 }, { "epoch": 0.8124251826154936, "grad_norm": 14.837047576904297, "learning_rate": 1.976312718455883e-06, "loss": 0.1305, "num_input_tokens_seen": 22402368, "step": 33255 }, { "epoch": 0.8125473334473408, "grad_norm": 33.13792037963867, "learning_rate": 1.976294263884591e-06, "loss": 0.0898, "num_input_tokens_seen": 22406144, "step": 33260 }, { "epoch": 0.812669484279188, "grad_norm": 1.2484996318817139, "learning_rate": 1.9762758022134236e-06, "loss": 0.0859, "num_input_tokens_seen": 22409408, "step": 33265 }, { "epoch": 0.8127916351110351, "grad_norm": 5.0173258781433105, "learning_rate": 1.9762573334425164e-06, "loss": 0.1138, "num_input_tokens_seen": 22412736, "step": 33270 }, { "epoch": 0.8129137859428822, "grad_norm": 25.858051300048828, "learning_rate": 1.9762388575720035e-06, "loss": 0.1897, "num_input_tokens_seen": 22415936, "step": 33275 }, { "epoch": 0.8130359367747294, "grad_norm": 13.351430892944336, "learning_rate": 1.976220374602019e-06, "loss": 0.2099, "num_input_tokens_seen": 22419456, "step": 33280 }, { "epoch": 0.8131580876065766, "grad_norm": 21.063621520996094, "learning_rate": 1.976201884532697e-06, "loss": 0.2129, "num_input_tokens_seen": 22422720, "step": 33285 }, { "epoch": 0.8132802384384238, "grad_norm": 13.48155689239502, "learning_rate": 1.9761833873641727e-06, "loss": 0.0426, "num_input_tokens_seen": 22427072, "step": 33290 }, { "epoch": 0.813402389270271, "grad_norm": 0.40031149983406067, "learning_rate": 1.9761648830965804e-06, "loss": 0.0406, "num_input_tokens_seen": 22430656, "step": 33295 }, { "epoch": 0.813524540102118, "grad_norm": 40.440528869628906, "learning_rate": 1.976146371730054e-06, "loss": 0.1687, "num_input_tokens_seen": 22433920, "step": 33300 }, { "epoch": 0.8136466909339652, "grad_norm": 0.33487313985824585, "learning_rate": 1.976127853264729e-06, "loss": 0.0645, "num_input_tokens_seen": 22437632, "step": 33305 }, { "epoch": 0.8137688417658124, "grad_norm": 41.765663146972656, "learning_rate": 1.9761093277007394e-06, "loss": 0.1122, "num_input_tokens_seen": 22440704, "step": 33310 }, { "epoch": 0.8138909925976596, "grad_norm": 11.581916809082031, "learning_rate": 1.9760907950382204e-06, "loss": 0.1018, "num_input_tokens_seen": 22444032, "step": 33315 }, { "epoch": 0.8140131434295067, "grad_norm": 1.0770961046218872, "learning_rate": 1.9760722552773066e-06, "loss": 0.0939, "num_input_tokens_seen": 22447488, "step": 33320 }, { "epoch": 0.8141352942613539, "grad_norm": 13.341182708740234, "learning_rate": 1.976053708418133e-06, "loss": 0.1864, "num_input_tokens_seen": 22450880, "step": 33325 }, { "epoch": 0.8142574450932011, "grad_norm": 2.477572441101074, "learning_rate": 1.9760351544608343e-06, "loss": 0.0875, "num_input_tokens_seen": 22453888, "step": 33330 }, { "epoch": 0.8143795959250483, "grad_norm": 0.025869879871606827, "learning_rate": 1.976016593405545e-06, "loss": 0.1386, "num_input_tokens_seen": 22457152, "step": 33335 }, { "epoch": 0.8145017467568955, "grad_norm": 38.82683563232422, "learning_rate": 1.9759980252524007e-06, "loss": 0.1697, "num_input_tokens_seen": 22460288, "step": 33340 }, { "epoch": 0.8146238975887425, "grad_norm": 11.620903015136719, "learning_rate": 1.9759794500015365e-06, "loss": 0.0614, "num_input_tokens_seen": 22463424, "step": 33345 }, { "epoch": 0.8147460484205897, "grad_norm": 1.6850146055221558, "learning_rate": 1.9759608676530872e-06, "loss": 0.1574, "num_input_tokens_seen": 22466432, "step": 33350 }, { "epoch": 0.8148681992524369, "grad_norm": 9.8765869140625, "learning_rate": 1.975942278207188e-06, "loss": 0.0647, "num_input_tokens_seen": 22469440, "step": 33355 }, { "epoch": 0.8149903500842841, "grad_norm": 33.7421760559082, "learning_rate": 1.9759236816639733e-06, "loss": 0.0556, "num_input_tokens_seen": 22472256, "step": 33360 }, { "epoch": 0.8151125009161312, "grad_norm": 17.823688507080078, "learning_rate": 1.97590507802358e-06, "loss": 0.0245, "num_input_tokens_seen": 22475904, "step": 33365 }, { "epoch": 0.8152346517479784, "grad_norm": 3.1829872131347656, "learning_rate": 1.9758864672861423e-06, "loss": 0.0503, "num_input_tokens_seen": 22479040, "step": 33370 }, { "epoch": 0.8153568025798256, "grad_norm": 31.175722122192383, "learning_rate": 1.9758678494517957e-06, "loss": 0.2484, "num_input_tokens_seen": 22482816, "step": 33375 }, { "epoch": 0.8154789534116728, "grad_norm": 8.305288314819336, "learning_rate": 1.9758492245206754e-06, "loss": 0.1218, "num_input_tokens_seen": 22485824, "step": 33380 }, { "epoch": 0.81560110424352, "grad_norm": 28.994346618652344, "learning_rate": 1.975830592492917e-06, "loss": 0.1422, "num_input_tokens_seen": 22489024, "step": 33385 }, { "epoch": 0.815723255075367, "grad_norm": 36.34081268310547, "learning_rate": 1.9758119533686565e-06, "loss": 0.0978, "num_input_tokens_seen": 22492480, "step": 33390 }, { "epoch": 0.8158454059072142, "grad_norm": 0.08920477330684662, "learning_rate": 1.9757933071480285e-06, "loss": 0.0414, "num_input_tokens_seen": 22495616, "step": 33395 }, { "epoch": 0.8159675567390614, "grad_norm": 21.8082275390625, "learning_rate": 1.9757746538311694e-06, "loss": 0.0998, "num_input_tokens_seen": 22499072, "step": 33400 }, { "epoch": 0.8160897075709086, "grad_norm": 0.20586279034614563, "learning_rate": 1.9757559934182146e-06, "loss": 0.0641, "num_input_tokens_seen": 22502656, "step": 33405 }, { "epoch": 0.8162118584027557, "grad_norm": 25.77524757385254, "learning_rate": 1.9757373259092998e-06, "loss": 0.1152, "num_input_tokens_seen": 22506112, "step": 33410 }, { "epoch": 0.8163340092346029, "grad_norm": 0.29158711433410645, "learning_rate": 1.9757186513045604e-06, "loss": 0.1796, "num_input_tokens_seen": 22509376, "step": 33415 }, { "epoch": 0.81645616006645, "grad_norm": 2.5156776905059814, "learning_rate": 1.975699969604133e-06, "loss": 0.1909, "num_input_tokens_seen": 22513024, "step": 33420 }, { "epoch": 0.8165783108982972, "grad_norm": 4.3752121925354, "learning_rate": 1.9756812808081527e-06, "loss": 0.1162, "num_input_tokens_seen": 22516544, "step": 33425 }, { "epoch": 0.8167004617301444, "grad_norm": 38.32486343383789, "learning_rate": 1.975662584916756e-06, "loss": 0.0691, "num_input_tokens_seen": 22519808, "step": 33430 }, { "epoch": 0.8168226125619915, "grad_norm": 43.266361236572266, "learning_rate": 1.975643881930078e-06, "loss": 0.1068, "num_input_tokens_seen": 22523264, "step": 33435 }, { "epoch": 0.8169447633938387, "grad_norm": 22.88056755065918, "learning_rate": 1.9756251718482558e-06, "loss": 0.0976, "num_input_tokens_seen": 22526592, "step": 33440 }, { "epoch": 0.8170669142256859, "grad_norm": 0.9959394335746765, "learning_rate": 1.975606454671425e-06, "loss": 0.0408, "num_input_tokens_seen": 22530304, "step": 33445 }, { "epoch": 0.8171890650575331, "grad_norm": 2.919494152069092, "learning_rate": 1.975587730399721e-06, "loss": 0.1699, "num_input_tokens_seen": 22533568, "step": 33450 }, { "epoch": 0.8173112158893802, "grad_norm": 50.999454498291016, "learning_rate": 1.9755689990332813e-06, "loss": 0.1904, "num_input_tokens_seen": 22536704, "step": 33455 }, { "epoch": 0.8174333667212274, "grad_norm": 0.2796314060688019, "learning_rate": 1.975550260572241e-06, "loss": 0.0707, "num_input_tokens_seen": 22540416, "step": 33460 }, { "epoch": 0.8175555175530745, "grad_norm": 40.962093353271484, "learning_rate": 1.975531515016737e-06, "loss": 0.0283, "num_input_tokens_seen": 22543680, "step": 33465 }, { "epoch": 0.8176776683849217, "grad_norm": 3.3434290885925293, "learning_rate": 1.9755127623669053e-06, "loss": 0.1277, "num_input_tokens_seen": 22547264, "step": 33470 }, { "epoch": 0.8177998192167689, "grad_norm": 4.9675703048706055, "learning_rate": 1.9754940026228826e-06, "loss": 0.1602, "num_input_tokens_seen": 22550784, "step": 33475 }, { "epoch": 0.817921970048616, "grad_norm": 17.3238468170166, "learning_rate": 1.975475235784805e-06, "loss": 0.2102, "num_input_tokens_seen": 22554048, "step": 33480 }, { "epoch": 0.8180441208804632, "grad_norm": 34.665199279785156, "learning_rate": 1.975456461852809e-06, "loss": 0.0947, "num_input_tokens_seen": 22556864, "step": 33485 }, { "epoch": 0.8181662717123104, "grad_norm": 19.935836791992188, "learning_rate": 1.9754376808270316e-06, "loss": 0.0851, "num_input_tokens_seen": 22560192, "step": 33490 }, { "epoch": 0.8182884225441576, "grad_norm": 4.1569647789001465, "learning_rate": 1.975418892707609e-06, "loss": 0.1027, "num_input_tokens_seen": 22563584, "step": 33495 }, { "epoch": 0.8184105733760046, "grad_norm": 0.48780179023742676, "learning_rate": 1.975400097494678e-06, "loss": 0.0578, "num_input_tokens_seen": 22566720, "step": 33500 }, { "epoch": 0.8185327242078518, "grad_norm": 2.4946086406707764, "learning_rate": 1.9753812951883744e-06, "loss": 0.1196, "num_input_tokens_seen": 22570048, "step": 33505 }, { "epoch": 0.818654875039699, "grad_norm": 16.733564376831055, "learning_rate": 1.9753624857888362e-06, "loss": 0.0979, "num_input_tokens_seen": 22573376, "step": 33510 }, { "epoch": 0.8187770258715462, "grad_norm": 4.550495624542236, "learning_rate": 1.9753436692961992e-06, "loss": 0.0704, "num_input_tokens_seen": 22576704, "step": 33515 }, { "epoch": 0.8188991767033934, "grad_norm": 0.7165770530700684, "learning_rate": 1.975324845710601e-06, "loss": 0.1453, "num_input_tokens_seen": 22579840, "step": 33520 }, { "epoch": 0.8190213275352405, "grad_norm": 16.152631759643555, "learning_rate": 1.9753060150321786e-06, "loss": 0.051, "num_input_tokens_seen": 22583104, "step": 33525 }, { "epoch": 0.8191434783670877, "grad_norm": 7.786255359649658, "learning_rate": 1.975287177261068e-06, "loss": 0.1462, "num_input_tokens_seen": 22587008, "step": 33530 }, { "epoch": 0.8192656291989349, "grad_norm": 19.497220993041992, "learning_rate": 1.975268332397407e-06, "loss": 0.1047, "num_input_tokens_seen": 22590528, "step": 33535 }, { "epoch": 0.819387780030782, "grad_norm": 0.2000948041677475, "learning_rate": 1.975249480441332e-06, "loss": 0.0521, "num_input_tokens_seen": 22593600, "step": 33540 }, { "epoch": 0.8195099308626291, "grad_norm": 45.75498580932617, "learning_rate": 1.975230621392981e-06, "loss": 0.1804, "num_input_tokens_seen": 22596672, "step": 33545 }, { "epoch": 0.8196320816944763, "grad_norm": 7.454740047454834, "learning_rate": 1.9752117552524905e-06, "loss": 0.0828, "num_input_tokens_seen": 22600000, "step": 33550 }, { "epoch": 0.8197542325263235, "grad_norm": 15.156696319580078, "learning_rate": 1.9751928820199976e-06, "loss": 0.1668, "num_input_tokens_seen": 22603392, "step": 33555 }, { "epoch": 0.8198763833581707, "grad_norm": 0.2669239938259125, "learning_rate": 1.97517400169564e-06, "loss": 0.1005, "num_input_tokens_seen": 22606784, "step": 33560 }, { "epoch": 0.8199985341900178, "grad_norm": 3.1211867332458496, "learning_rate": 1.9751551142795545e-06, "loss": 0.1006, "num_input_tokens_seen": 22610112, "step": 33565 }, { "epoch": 0.820120685021865, "grad_norm": 21.486875534057617, "learning_rate": 1.975136219771879e-06, "loss": 0.1059, "num_input_tokens_seen": 22613312, "step": 33570 }, { "epoch": 0.8202428358537122, "grad_norm": 0.183716282248497, "learning_rate": 1.97511731817275e-06, "loss": 0.1553, "num_input_tokens_seen": 22616448, "step": 33575 }, { "epoch": 0.8203649866855593, "grad_norm": 41.991580963134766, "learning_rate": 1.9750984094823065e-06, "loss": 0.1131, "num_input_tokens_seen": 22620032, "step": 33580 }, { "epoch": 0.8204871375174065, "grad_norm": 1.0276497602462769, "learning_rate": 1.9750794937006847e-06, "loss": 0.0494, "num_input_tokens_seen": 22623296, "step": 33585 }, { "epoch": 0.8206092883492536, "grad_norm": 13.941420555114746, "learning_rate": 1.9750605708280224e-06, "loss": 0.1359, "num_input_tokens_seen": 22626560, "step": 33590 }, { "epoch": 0.8207314391811008, "grad_norm": 27.367237091064453, "learning_rate": 1.9750416408644573e-06, "loss": 0.0691, "num_input_tokens_seen": 22629696, "step": 33595 }, { "epoch": 0.820853590012948, "grad_norm": 8.477364540100098, "learning_rate": 1.9750227038101273e-06, "loss": 0.0392, "num_input_tokens_seen": 22632832, "step": 33600 }, { "epoch": 0.8209757408447952, "grad_norm": 21.650747299194336, "learning_rate": 1.9750037596651702e-06, "loss": 0.0865, "num_input_tokens_seen": 22636224, "step": 33605 }, { "epoch": 0.8210978916766423, "grad_norm": 9.361983299255371, "learning_rate": 1.974984808429723e-06, "loss": 0.2274, "num_input_tokens_seen": 22639744, "step": 33610 }, { "epoch": 0.8212200425084895, "grad_norm": 23.482181549072266, "learning_rate": 1.9749658501039247e-06, "loss": 0.1166, "num_input_tokens_seen": 22643456, "step": 33615 }, { "epoch": 0.8213421933403366, "grad_norm": 16.769548416137695, "learning_rate": 1.974946884687912e-06, "loss": 0.1583, "num_input_tokens_seen": 22647104, "step": 33620 }, { "epoch": 0.8214643441721838, "grad_norm": 1.3182966709136963, "learning_rate": 1.9749279121818236e-06, "loss": 0.1557, "num_input_tokens_seen": 22650240, "step": 33625 }, { "epoch": 0.821586495004031, "grad_norm": 31.42665672302246, "learning_rate": 1.974908932585797e-06, "loss": 0.1548, "num_input_tokens_seen": 22653440, "step": 33630 }, { "epoch": 0.8217086458358781, "grad_norm": 5.566858768463135, "learning_rate": 1.9748899458999706e-06, "loss": 0.0919, "num_input_tokens_seen": 22656960, "step": 33635 }, { "epoch": 0.8218307966677253, "grad_norm": 0.2816479802131653, "learning_rate": 1.974870952124482e-06, "loss": 0.0835, "num_input_tokens_seen": 22660160, "step": 33640 }, { "epoch": 0.8219529474995725, "grad_norm": 4.983752250671387, "learning_rate": 1.9748519512594697e-06, "loss": 0.1284, "num_input_tokens_seen": 22663360, "step": 33645 }, { "epoch": 0.8220750983314197, "grad_norm": 2.6677727699279785, "learning_rate": 1.974832943305072e-06, "loss": 0.0585, "num_input_tokens_seen": 22666624, "step": 33650 }, { "epoch": 0.8221972491632668, "grad_norm": 6.633481025695801, "learning_rate": 1.974813928261427e-06, "loss": 0.0665, "num_input_tokens_seen": 22670080, "step": 33655 }, { "epoch": 0.8223193999951139, "grad_norm": 1.056289792060852, "learning_rate": 1.9747949061286724e-06, "loss": 0.0723, "num_input_tokens_seen": 22673088, "step": 33660 }, { "epoch": 0.8224415508269611, "grad_norm": 20.022216796875, "learning_rate": 1.9747758769069477e-06, "loss": 0.1446, "num_input_tokens_seen": 22676288, "step": 33665 }, { "epoch": 0.8225637016588083, "grad_norm": 7.566336154937744, "learning_rate": 1.9747568405963902e-06, "loss": 0.3086, "num_input_tokens_seen": 22679552, "step": 33670 }, { "epoch": 0.8226858524906555, "grad_norm": 23.478660583496094, "learning_rate": 1.974737797197139e-06, "loss": 0.1643, "num_input_tokens_seen": 22682752, "step": 33675 }, { "epoch": 0.8228080033225026, "grad_norm": 4.6182355880737305, "learning_rate": 1.9747187467093324e-06, "loss": 0.0487, "num_input_tokens_seen": 22685696, "step": 33680 }, { "epoch": 0.8229301541543498, "grad_norm": 1.7821851968765259, "learning_rate": 1.9746996891331086e-06, "loss": 0.1025, "num_input_tokens_seen": 22689536, "step": 33685 }, { "epoch": 0.823052304986197, "grad_norm": 28.503374099731445, "learning_rate": 1.974680624468607e-06, "loss": 0.1342, "num_input_tokens_seen": 22692928, "step": 33690 }, { "epoch": 0.8231744558180442, "grad_norm": 0.768397867679596, "learning_rate": 1.974661552715965e-06, "loss": 0.1626, "num_input_tokens_seen": 22695936, "step": 33695 }, { "epoch": 0.8232966066498912, "grad_norm": 0.2706652879714966, "learning_rate": 1.9746424738753225e-06, "loss": 0.0827, "num_input_tokens_seen": 22698816, "step": 33700 }, { "epoch": 0.8234187574817384, "grad_norm": 6.46179723739624, "learning_rate": 1.974623387946818e-06, "loss": 0.0661, "num_input_tokens_seen": 22702528, "step": 33705 }, { "epoch": 0.8235409083135856, "grad_norm": 17.515541076660156, "learning_rate": 1.97460429493059e-06, "loss": 0.0431, "num_input_tokens_seen": 22705856, "step": 33710 }, { "epoch": 0.8236630591454328, "grad_norm": 21.961942672729492, "learning_rate": 1.974585194826777e-06, "loss": 0.1104, "num_input_tokens_seen": 22708992, "step": 33715 }, { "epoch": 0.82378520997728, "grad_norm": 15.293503761291504, "learning_rate": 1.9745660876355187e-06, "loss": 0.1426, "num_input_tokens_seen": 22712128, "step": 33720 }, { "epoch": 0.8239073608091271, "grad_norm": 6.357548236846924, "learning_rate": 1.9745469733569536e-06, "loss": 0.0338, "num_input_tokens_seen": 22715392, "step": 33725 }, { "epoch": 0.8240295116409743, "grad_norm": 29.685779571533203, "learning_rate": 1.9745278519912206e-06, "loss": 0.1035, "num_input_tokens_seen": 22719232, "step": 33730 }, { "epoch": 0.8241516624728215, "grad_norm": 46.714149475097656, "learning_rate": 1.9745087235384596e-06, "loss": 0.1042, "num_input_tokens_seen": 22722240, "step": 33735 }, { "epoch": 0.8242738133046686, "grad_norm": 15.977401733398438, "learning_rate": 1.9744895879988085e-06, "loss": 0.1429, "num_input_tokens_seen": 22725632, "step": 33740 }, { "epoch": 0.8243959641365157, "grad_norm": 51.249656677246094, "learning_rate": 1.974470445372407e-06, "loss": 0.194, "num_input_tokens_seen": 22728640, "step": 33745 }, { "epoch": 0.8245181149683629, "grad_norm": 18.208505630493164, "learning_rate": 1.9744512956593943e-06, "loss": 0.0791, "num_input_tokens_seen": 22731904, "step": 33750 }, { "epoch": 0.8246402658002101, "grad_norm": 11.103679656982422, "learning_rate": 1.97443213885991e-06, "loss": 0.1784, "num_input_tokens_seen": 22735040, "step": 33755 }, { "epoch": 0.8247624166320573, "grad_norm": 16.442033767700195, "learning_rate": 1.9744129749740925e-06, "loss": 0.196, "num_input_tokens_seen": 22738688, "step": 33760 }, { "epoch": 0.8248845674639045, "grad_norm": 4.221897602081299, "learning_rate": 1.974393804002082e-06, "loss": 0.0673, "num_input_tokens_seen": 22741696, "step": 33765 }, { "epoch": 0.8250067182957516, "grad_norm": 28.284793853759766, "learning_rate": 1.974374625944018e-06, "loss": 0.1446, "num_input_tokens_seen": 22745216, "step": 33770 }, { "epoch": 0.8251288691275988, "grad_norm": 13.259180068969727, "learning_rate": 1.9743554408000394e-06, "loss": 0.1043, "num_input_tokens_seen": 22748352, "step": 33775 }, { "epoch": 0.8252510199594459, "grad_norm": 11.632582664489746, "learning_rate": 1.974336248570286e-06, "loss": 0.156, "num_input_tokens_seen": 22751744, "step": 33780 }, { "epoch": 0.8253731707912931, "grad_norm": 0.9505820870399475, "learning_rate": 1.9743170492548974e-06, "loss": 0.1163, "num_input_tokens_seen": 22755712, "step": 33785 }, { "epoch": 0.8254953216231402, "grad_norm": 7.451425552368164, "learning_rate": 1.9742978428540132e-06, "loss": 0.126, "num_input_tokens_seen": 22759232, "step": 33790 }, { "epoch": 0.8256174724549874, "grad_norm": 15.902039527893066, "learning_rate": 1.974278629367773e-06, "loss": 0.0943, "num_input_tokens_seen": 22762688, "step": 33795 }, { "epoch": 0.8257396232868346, "grad_norm": 17.254871368408203, "learning_rate": 1.974259408796317e-06, "loss": 0.0327, "num_input_tokens_seen": 22766144, "step": 33800 }, { "epoch": 0.8258617741186818, "grad_norm": 12.781633377075195, "learning_rate": 1.9742401811397834e-06, "loss": 0.0922, "num_input_tokens_seen": 22769216, "step": 33805 }, { "epoch": 0.8259839249505289, "grad_norm": 4.0394182205200195, "learning_rate": 1.9742209463983143e-06, "loss": 0.0987, "num_input_tokens_seen": 22772864, "step": 33810 }, { "epoch": 0.826106075782376, "grad_norm": 23.54717445373535, "learning_rate": 1.9742017045720474e-06, "loss": 0.0945, "num_input_tokens_seen": 22775872, "step": 33815 }, { "epoch": 0.8262282266142232, "grad_norm": 41.05208969116211, "learning_rate": 1.9741824556611245e-06, "loss": 0.049, "num_input_tokens_seen": 22779456, "step": 33820 }, { "epoch": 0.8263503774460704, "grad_norm": 7.894565582275391, "learning_rate": 1.9741631996656846e-06, "loss": 0.0323, "num_input_tokens_seen": 22782528, "step": 33825 }, { "epoch": 0.8264725282779176, "grad_norm": 13.551430702209473, "learning_rate": 1.9741439365858677e-06, "loss": 0.1273, "num_input_tokens_seen": 22785664, "step": 33830 }, { "epoch": 0.8265946791097647, "grad_norm": 51.72720718383789, "learning_rate": 1.974124666421814e-06, "loss": 0.1618, "num_input_tokens_seen": 22788992, "step": 33835 }, { "epoch": 0.8267168299416119, "grad_norm": 0.09919314086437225, "learning_rate": 1.974105389173664e-06, "loss": 0.1561, "num_input_tokens_seen": 22792064, "step": 33840 }, { "epoch": 0.8268389807734591, "grad_norm": 0.31595703959465027, "learning_rate": 1.974086104841557e-06, "loss": 0.0267, "num_input_tokens_seen": 22795584, "step": 33845 }, { "epoch": 0.8269611316053063, "grad_norm": 14.558001518249512, "learning_rate": 1.974066813425635e-06, "loss": 0.1838, "num_input_tokens_seen": 22799104, "step": 33850 }, { "epoch": 0.8270832824371533, "grad_norm": 4.938451766967773, "learning_rate": 1.9740475149260364e-06, "loss": 0.0809, "num_input_tokens_seen": 22802112, "step": 33855 }, { "epoch": 0.8272054332690005, "grad_norm": 1.022783637046814, "learning_rate": 1.974028209342902e-06, "loss": 0.0087, "num_input_tokens_seen": 22805824, "step": 33860 }, { "epoch": 0.8273275841008477, "grad_norm": 18.273351669311523, "learning_rate": 1.974008896676373e-06, "loss": 0.0718, "num_input_tokens_seen": 22808768, "step": 33865 }, { "epoch": 0.8274497349326949, "grad_norm": 6.545739650726318, "learning_rate": 1.973989576926589e-06, "loss": 0.1391, "num_input_tokens_seen": 22812096, "step": 33870 }, { "epoch": 0.8275718857645421, "grad_norm": 9.688802719116211, "learning_rate": 1.973970250093691e-06, "loss": 0.1762, "num_input_tokens_seen": 22815296, "step": 33875 }, { "epoch": 0.8276940365963892, "grad_norm": 40.52885818481445, "learning_rate": 1.9739509161778196e-06, "loss": 0.1399, "num_input_tokens_seen": 22818944, "step": 33880 }, { "epoch": 0.8278161874282364, "grad_norm": 16.736942291259766, "learning_rate": 1.9739315751791146e-06, "loss": 0.084, "num_input_tokens_seen": 22822272, "step": 33885 }, { "epoch": 0.8279383382600836, "grad_norm": 0.25654950737953186, "learning_rate": 1.973912227097718e-06, "loss": 0.0412, "num_input_tokens_seen": 22825792, "step": 33890 }, { "epoch": 0.8280604890919308, "grad_norm": 5.899317741394043, "learning_rate": 1.9738928719337695e-06, "loss": 0.1633, "num_input_tokens_seen": 22829632, "step": 33895 }, { "epoch": 0.8281826399237778, "grad_norm": 36.036251068115234, "learning_rate": 1.97387350968741e-06, "loss": 0.1873, "num_input_tokens_seen": 22833152, "step": 33900 }, { "epoch": 0.828304790755625, "grad_norm": 0.2765234410762787, "learning_rate": 1.97385414035878e-06, "loss": 0.1141, "num_input_tokens_seen": 22836736, "step": 33905 }, { "epoch": 0.8284269415874722, "grad_norm": 9.578060150146484, "learning_rate": 1.973834763948021e-06, "loss": 0.0695, "num_input_tokens_seen": 22840064, "step": 33910 }, { "epoch": 0.8285490924193194, "grad_norm": 1.5241056680679321, "learning_rate": 1.9738153804552734e-06, "loss": 0.0851, "num_input_tokens_seen": 22843520, "step": 33915 }, { "epoch": 0.8286712432511666, "grad_norm": 5.904448986053467, "learning_rate": 1.973795989880679e-06, "loss": 0.1975, "num_input_tokens_seen": 22846720, "step": 33920 }, { "epoch": 0.8287933940830137, "grad_norm": 36.777313232421875, "learning_rate": 1.973776592224378e-06, "loss": 0.0485, "num_input_tokens_seen": 22850048, "step": 33925 }, { "epoch": 0.8289155449148609, "grad_norm": 2.7984845638275146, "learning_rate": 1.9737571874865115e-06, "loss": 0.0265, "num_input_tokens_seen": 22853504, "step": 33930 }, { "epoch": 0.829037695746708, "grad_norm": 0.4170108437538147, "learning_rate": 1.973737775667221e-06, "loss": 0.0968, "num_input_tokens_seen": 22856512, "step": 33935 }, { "epoch": 0.8291598465785552, "grad_norm": 15.714227676391602, "learning_rate": 1.9737183567666478e-06, "loss": 0.1644, "num_input_tokens_seen": 22860032, "step": 33940 }, { "epoch": 0.8292819974104023, "grad_norm": 7.653704643249512, "learning_rate": 1.9736989307849323e-06, "loss": 0.183, "num_input_tokens_seen": 22863424, "step": 33945 }, { "epoch": 0.8294041482422495, "grad_norm": 16.792009353637695, "learning_rate": 1.9736794977222166e-06, "loss": 0.155, "num_input_tokens_seen": 22866496, "step": 33950 }, { "epoch": 0.8295262990740967, "grad_norm": 0.18280255794525146, "learning_rate": 1.9736600575786413e-06, "loss": 0.0301, "num_input_tokens_seen": 22869632, "step": 33955 }, { "epoch": 0.8296484499059439, "grad_norm": 1.9841983318328857, "learning_rate": 1.973640610354349e-06, "loss": 0.1194, "num_input_tokens_seen": 22873216, "step": 33960 }, { "epoch": 0.8297706007377911, "grad_norm": 1.050477385520935, "learning_rate": 1.9736211560494796e-06, "loss": 0.1718, "num_input_tokens_seen": 22876480, "step": 33965 }, { "epoch": 0.8298927515696382, "grad_norm": 15.280632972717285, "learning_rate": 1.9736016946641755e-06, "loss": 0.1535, "num_input_tokens_seen": 22879552, "step": 33970 }, { "epoch": 0.8300149024014853, "grad_norm": 1.5406540632247925, "learning_rate": 1.973582226198578e-06, "loss": 0.2542, "num_input_tokens_seen": 22882816, "step": 33975 }, { "epoch": 0.8301370532333325, "grad_norm": 4.964468955993652, "learning_rate": 1.9735627506528284e-06, "loss": 0.1118, "num_input_tokens_seen": 22886208, "step": 33980 }, { "epoch": 0.8302592040651797, "grad_norm": 10.970595359802246, "learning_rate": 1.973543268027069e-06, "loss": 0.2168, "num_input_tokens_seen": 22889600, "step": 33985 }, { "epoch": 0.8303813548970268, "grad_norm": 12.386752128601074, "learning_rate": 1.9735237783214413e-06, "loss": 0.0727, "num_input_tokens_seen": 22893312, "step": 33990 }, { "epoch": 0.830503505728874, "grad_norm": 20.162940979003906, "learning_rate": 1.973504281536086e-06, "loss": 0.1407, "num_input_tokens_seen": 22896512, "step": 33995 }, { "epoch": 0.8306256565607212, "grad_norm": 15.259532928466797, "learning_rate": 1.9734847776711465e-06, "loss": 0.1526, "num_input_tokens_seen": 22900608, "step": 34000 }, { "epoch": 0.8307478073925684, "grad_norm": 0.4917971193790436, "learning_rate": 1.973465266726764e-06, "loss": 0.0175, "num_input_tokens_seen": 22904000, "step": 34005 }, { "epoch": 0.8308699582244156, "grad_norm": 9.38034725189209, "learning_rate": 1.9734457487030792e-06, "loss": 0.0464, "num_input_tokens_seen": 22907712, "step": 34010 }, { "epoch": 0.8309921090562626, "grad_norm": 5.182671546936035, "learning_rate": 1.973426223600236e-06, "loss": 0.0883, "num_input_tokens_seen": 22911232, "step": 34015 }, { "epoch": 0.8311142598881098, "grad_norm": 13.00207233428955, "learning_rate": 1.9734066914183752e-06, "loss": 0.0874, "num_input_tokens_seen": 22914624, "step": 34020 }, { "epoch": 0.831236410719957, "grad_norm": 2.0715930461883545, "learning_rate": 1.973387152157639e-06, "loss": 0.0753, "num_input_tokens_seen": 22918208, "step": 34025 }, { "epoch": 0.8313585615518042, "grad_norm": 26.91462516784668, "learning_rate": 1.97336760581817e-06, "loss": 0.2246, "num_input_tokens_seen": 22921664, "step": 34030 }, { "epoch": 0.8314807123836513, "grad_norm": 20.108482360839844, "learning_rate": 1.9733480524001096e-06, "loss": 0.1399, "num_input_tokens_seen": 22925184, "step": 34035 }, { "epoch": 0.8316028632154985, "grad_norm": 14.097265243530273, "learning_rate": 1.9733284919036007e-06, "loss": 0.1742, "num_input_tokens_seen": 22928640, "step": 34040 }, { "epoch": 0.8317250140473457, "grad_norm": 29.254487991333008, "learning_rate": 1.9733089243287847e-06, "loss": 0.1754, "num_input_tokens_seen": 22931968, "step": 34045 }, { "epoch": 0.8318471648791929, "grad_norm": 12.961664199829102, "learning_rate": 1.973289349675805e-06, "loss": 0.0714, "num_input_tokens_seen": 22935168, "step": 34050 }, { "epoch": 0.83196931571104, "grad_norm": 1.3785406351089478, "learning_rate": 1.973269767944803e-06, "loss": 0.2318, "num_input_tokens_seen": 22938944, "step": 34055 }, { "epoch": 0.8320914665428871, "grad_norm": 22.194021224975586, "learning_rate": 1.9732501791359217e-06, "loss": 0.1794, "num_input_tokens_seen": 22942016, "step": 34060 }, { "epoch": 0.8322136173747343, "grad_norm": 5.744136810302734, "learning_rate": 1.973230583249303e-06, "loss": 0.0536, "num_input_tokens_seen": 22945152, "step": 34065 }, { "epoch": 0.8323357682065815, "grad_norm": 16.040979385375977, "learning_rate": 1.97321098028509e-06, "loss": 0.081, "num_input_tokens_seen": 22948544, "step": 34070 }, { "epoch": 0.8324579190384287, "grad_norm": 5.4098100662231445, "learning_rate": 1.973191370243425e-06, "loss": 0.0337, "num_input_tokens_seen": 22951936, "step": 34075 }, { "epoch": 0.8325800698702758, "grad_norm": 0.48113128542900085, "learning_rate": 1.9731717531244507e-06, "loss": 0.0878, "num_input_tokens_seen": 22955520, "step": 34080 }, { "epoch": 0.832702220702123, "grad_norm": 11.145101547241211, "learning_rate": 1.973152128928309e-06, "loss": 0.0361, "num_input_tokens_seen": 22958656, "step": 34085 }, { "epoch": 0.8328243715339702, "grad_norm": 10.848712921142578, "learning_rate": 1.973132497655144e-06, "loss": 0.0429, "num_input_tokens_seen": 22962240, "step": 34090 }, { "epoch": 0.8329465223658173, "grad_norm": 0.10347910970449448, "learning_rate": 1.9731128593050974e-06, "loss": 0.0689, "num_input_tokens_seen": 22965568, "step": 34095 }, { "epoch": 0.8330686731976644, "grad_norm": 26.27857780456543, "learning_rate": 1.9730932138783122e-06, "loss": 0.0238, "num_input_tokens_seen": 22969152, "step": 34100 }, { "epoch": 0.8331908240295116, "grad_norm": 15.342934608459473, "learning_rate": 1.973073561374932e-06, "loss": 0.2501, "num_input_tokens_seen": 22972928, "step": 34105 }, { "epoch": 0.8333129748613588, "grad_norm": 13.802165985107422, "learning_rate": 1.9730539017950986e-06, "loss": 0.2352, "num_input_tokens_seen": 22976256, "step": 34110 }, { "epoch": 0.833435125693206, "grad_norm": 28.344444274902344, "learning_rate": 1.9730342351389555e-06, "loss": 0.1129, "num_input_tokens_seen": 22979648, "step": 34115 }, { "epoch": 0.8335572765250532, "grad_norm": 4.112672805786133, "learning_rate": 1.973014561406646e-06, "loss": 0.1751, "num_input_tokens_seen": 22982784, "step": 34120 }, { "epoch": 0.8336794273569003, "grad_norm": 44.83839416503906, "learning_rate": 1.972994880598313e-06, "loss": 0.1128, "num_input_tokens_seen": 22985728, "step": 34125 }, { "epoch": 0.8338015781887474, "grad_norm": 1.7575098276138306, "learning_rate": 1.9729751927140994e-06, "loss": 0.0816, "num_input_tokens_seen": 22989056, "step": 34130 }, { "epoch": 0.8339237290205946, "grad_norm": 18.21506690979004, "learning_rate": 1.9729554977541484e-06, "loss": 0.1597, "num_input_tokens_seen": 22992832, "step": 34135 }, { "epoch": 0.8340458798524418, "grad_norm": 1.0433268547058105, "learning_rate": 1.9729357957186034e-06, "loss": 0.0803, "num_input_tokens_seen": 22996096, "step": 34140 }, { "epoch": 0.8341680306842889, "grad_norm": 2.801239013671875, "learning_rate": 1.972916086607607e-06, "loss": 0.1056, "num_input_tokens_seen": 23000128, "step": 34145 }, { "epoch": 0.8342901815161361, "grad_norm": 32.306697845458984, "learning_rate": 1.9728963704213044e-06, "loss": 0.059, "num_input_tokens_seen": 23003136, "step": 34150 }, { "epoch": 0.8344123323479833, "grad_norm": 0.4662260413169861, "learning_rate": 1.9728766471598367e-06, "loss": 0.0162, "num_input_tokens_seen": 23006656, "step": 34155 }, { "epoch": 0.8345344831798305, "grad_norm": 42.929996490478516, "learning_rate": 1.972856916823349e-06, "loss": 0.0996, "num_input_tokens_seen": 23010304, "step": 34160 }, { "epoch": 0.8346566340116777, "grad_norm": 42.26788330078125, "learning_rate": 1.9728371794119836e-06, "loss": 0.2583, "num_input_tokens_seen": 23013440, "step": 34165 }, { "epoch": 0.8347787848435247, "grad_norm": 26.08919334411621, "learning_rate": 1.9728174349258844e-06, "loss": 0.2494, "num_input_tokens_seen": 23016576, "step": 34170 }, { "epoch": 0.8349009356753719, "grad_norm": 26.989700317382812, "learning_rate": 1.972797683365196e-06, "loss": 0.1302, "num_input_tokens_seen": 23019648, "step": 34175 }, { "epoch": 0.8350230865072191, "grad_norm": 6.1088972091674805, "learning_rate": 1.9727779247300606e-06, "loss": 0.0458, "num_input_tokens_seen": 23022912, "step": 34180 }, { "epoch": 0.8351452373390663, "grad_norm": 40.35777282714844, "learning_rate": 1.9727581590206226e-06, "loss": 0.0924, "num_input_tokens_seen": 23026624, "step": 34185 }, { "epoch": 0.8352673881709134, "grad_norm": 0.571607768535614, "learning_rate": 1.9727383862370255e-06, "loss": 0.1005, "num_input_tokens_seen": 23029824, "step": 34190 }, { "epoch": 0.8353895390027606, "grad_norm": 22.8275146484375, "learning_rate": 1.9727186063794133e-06, "loss": 0.1247, "num_input_tokens_seen": 23033408, "step": 34195 }, { "epoch": 0.8355116898346078, "grad_norm": 0.1954413503408432, "learning_rate": 1.9726988194479303e-06, "loss": 0.097, "num_input_tokens_seen": 23036928, "step": 34200 }, { "epoch": 0.835633840666455, "grad_norm": 20.129854202270508, "learning_rate": 1.9726790254427194e-06, "loss": 0.1309, "num_input_tokens_seen": 23040704, "step": 34205 }, { "epoch": 0.8357559914983022, "grad_norm": 0.10747195780277252, "learning_rate": 1.972659224363925e-06, "loss": 0.0704, "num_input_tokens_seen": 23044288, "step": 34210 }, { "epoch": 0.8358781423301492, "grad_norm": 0.35761135816574097, "learning_rate": 1.9726394162116907e-06, "loss": 0.1503, "num_input_tokens_seen": 23047552, "step": 34215 }, { "epoch": 0.8360002931619964, "grad_norm": 16.95950698852539, "learning_rate": 1.9726196009861614e-06, "loss": 0.1158, "num_input_tokens_seen": 23050944, "step": 34220 }, { "epoch": 0.8361224439938436, "grad_norm": 9.001870155334473, "learning_rate": 1.972599778687481e-06, "loss": 0.1044, "num_input_tokens_seen": 23054208, "step": 34225 }, { "epoch": 0.8362445948256908, "grad_norm": 16.461078643798828, "learning_rate": 1.972579949315793e-06, "loss": 0.0879, "num_input_tokens_seen": 23057472, "step": 34230 }, { "epoch": 0.8363667456575379, "grad_norm": 16.833782196044922, "learning_rate": 1.972560112871242e-06, "loss": 0.1846, "num_input_tokens_seen": 23061184, "step": 34235 }, { "epoch": 0.8364888964893851, "grad_norm": 45.61178970336914, "learning_rate": 1.972540269353972e-06, "loss": 0.1902, "num_input_tokens_seen": 23064256, "step": 34240 }, { "epoch": 0.8366110473212323, "grad_norm": 12.094864845275879, "learning_rate": 1.9725204187641282e-06, "loss": 0.0897, "num_input_tokens_seen": 23067584, "step": 34245 }, { "epoch": 0.8367331981530794, "grad_norm": 12.797513961791992, "learning_rate": 1.9725005611018544e-06, "loss": 0.1588, "num_input_tokens_seen": 23070912, "step": 34250 }, { "epoch": 0.8368553489849266, "grad_norm": 20.513973236083984, "learning_rate": 1.9724806963672947e-06, "loss": 0.0433, "num_input_tokens_seen": 23074560, "step": 34255 }, { "epoch": 0.8369774998167737, "grad_norm": 14.594740867614746, "learning_rate": 1.972460824560594e-06, "loss": 0.0958, "num_input_tokens_seen": 23078592, "step": 34260 }, { "epoch": 0.8370996506486209, "grad_norm": 4.112400531768799, "learning_rate": 1.972440945681896e-06, "loss": 0.0886, "num_input_tokens_seen": 23081920, "step": 34265 }, { "epoch": 0.8372218014804681, "grad_norm": 2.1494662761688232, "learning_rate": 1.9724210597313463e-06, "loss": 0.1852, "num_input_tokens_seen": 23084928, "step": 34270 }, { "epoch": 0.8373439523123153, "grad_norm": 3.337203025817871, "learning_rate": 1.972401166709089e-06, "loss": 0.0785, "num_input_tokens_seen": 23087936, "step": 34275 }, { "epoch": 0.8374661031441624, "grad_norm": 10.968202590942383, "learning_rate": 1.9723812666152695e-06, "loss": 0.1037, "num_input_tokens_seen": 23091072, "step": 34280 }, { "epoch": 0.8375882539760096, "grad_norm": 9.930312156677246, "learning_rate": 1.9723613594500312e-06, "loss": 0.0931, "num_input_tokens_seen": 23094528, "step": 34285 }, { "epoch": 0.8377104048078567, "grad_norm": 22.673933029174805, "learning_rate": 1.9723414452135197e-06, "loss": 0.1358, "num_input_tokens_seen": 23098560, "step": 34290 }, { "epoch": 0.8378325556397039, "grad_norm": 18.80763053894043, "learning_rate": 1.9723215239058797e-06, "loss": 0.0928, "num_input_tokens_seen": 23102080, "step": 34295 }, { "epoch": 0.8379547064715511, "grad_norm": 1.3128330707550049, "learning_rate": 1.972301595527256e-06, "loss": 0.0881, "num_input_tokens_seen": 23105728, "step": 34300 }, { "epoch": 0.8380768573033982, "grad_norm": 4.349679946899414, "learning_rate": 1.9722816600777937e-06, "loss": 0.1869, "num_input_tokens_seen": 23109184, "step": 34305 }, { "epoch": 0.8381990081352454, "grad_norm": 1.3009531497955322, "learning_rate": 1.972261717557638e-06, "loss": 0.0709, "num_input_tokens_seen": 23112576, "step": 34310 }, { "epoch": 0.8383211589670926, "grad_norm": 39.514305114746094, "learning_rate": 1.972241767966933e-06, "loss": 0.0985, "num_input_tokens_seen": 23116096, "step": 34315 }, { "epoch": 0.8384433097989398, "grad_norm": 25.719438552856445, "learning_rate": 1.9722218113058246e-06, "loss": 0.1932, "num_input_tokens_seen": 23119488, "step": 34320 }, { "epoch": 0.8385654606307869, "grad_norm": 32.30820083618164, "learning_rate": 1.9722018475744573e-06, "loss": 0.1424, "num_input_tokens_seen": 23122816, "step": 34325 }, { "epoch": 0.838687611462634, "grad_norm": 9.513768196105957, "learning_rate": 1.972181876772977e-06, "loss": 0.0381, "num_input_tokens_seen": 23125888, "step": 34330 }, { "epoch": 0.8388097622944812, "grad_norm": 20.49899673461914, "learning_rate": 1.9721618989015285e-06, "loss": 0.0856, "num_input_tokens_seen": 23129280, "step": 34335 }, { "epoch": 0.8389319131263284, "grad_norm": 48.79369354248047, "learning_rate": 1.972141913960257e-06, "loss": 0.1908, "num_input_tokens_seen": 23132608, "step": 34340 }, { "epoch": 0.8390540639581755, "grad_norm": 17.959671020507812, "learning_rate": 1.9721219219493087e-06, "loss": 0.0528, "num_input_tokens_seen": 23136000, "step": 34345 }, { "epoch": 0.8391762147900227, "grad_norm": 5.5302815437316895, "learning_rate": 1.972101922868828e-06, "loss": 0.1189, "num_input_tokens_seen": 23139520, "step": 34350 }, { "epoch": 0.8392983656218699, "grad_norm": 13.573472023010254, "learning_rate": 1.9720819167189605e-06, "loss": 0.0453, "num_input_tokens_seen": 23143168, "step": 34355 }, { "epoch": 0.8394205164537171, "grad_norm": 36.57072448730469, "learning_rate": 1.972061903499852e-06, "loss": 0.0662, "num_input_tokens_seen": 23146688, "step": 34360 }, { "epoch": 0.8395426672855643, "grad_norm": 0.20541389286518097, "learning_rate": 1.972041883211648e-06, "loss": 0.1032, "num_input_tokens_seen": 23149888, "step": 34365 }, { "epoch": 0.8396648181174113, "grad_norm": 0.509107232093811, "learning_rate": 1.9720218558544937e-06, "loss": 0.1527, "num_input_tokens_seen": 23153344, "step": 34370 }, { "epoch": 0.8397869689492585, "grad_norm": 72.95315551757812, "learning_rate": 1.972001821428535e-06, "loss": 0.1562, "num_input_tokens_seen": 23156608, "step": 34375 }, { "epoch": 0.8399091197811057, "grad_norm": 31.27646255493164, "learning_rate": 1.9719817799339178e-06, "loss": 0.0158, "num_input_tokens_seen": 23159936, "step": 34380 }, { "epoch": 0.8400312706129529, "grad_norm": 1.4220023155212402, "learning_rate": 1.9719617313707875e-06, "loss": 0.1794, "num_input_tokens_seen": 23163136, "step": 34385 }, { "epoch": 0.8401534214448, "grad_norm": 40.343875885009766, "learning_rate": 1.9719416757392906e-06, "loss": 0.1996, "num_input_tokens_seen": 23166336, "step": 34390 }, { "epoch": 0.8402755722766472, "grad_norm": 27.2926025390625, "learning_rate": 1.9719216130395718e-06, "loss": 0.1569, "num_input_tokens_seen": 23169536, "step": 34395 }, { "epoch": 0.8403977231084944, "grad_norm": 5.333216190338135, "learning_rate": 1.9719015432717776e-06, "loss": 0.1087, "num_input_tokens_seen": 23172544, "step": 34400 }, { "epoch": 0.8405198739403416, "grad_norm": 15.621480941772461, "learning_rate": 1.9718814664360543e-06, "loss": 0.0696, "num_input_tokens_seen": 23175936, "step": 34405 }, { "epoch": 0.8406420247721887, "grad_norm": 0.1253618448972702, "learning_rate": 1.9718613825325474e-06, "loss": 0.1507, "num_input_tokens_seen": 23179136, "step": 34410 }, { "epoch": 0.8407641756040358, "grad_norm": 35.372894287109375, "learning_rate": 1.971841291561403e-06, "loss": 0.1766, "num_input_tokens_seen": 23182464, "step": 34415 }, { "epoch": 0.840886326435883, "grad_norm": 21.857149124145508, "learning_rate": 1.9718211935227676e-06, "loss": 0.0337, "num_input_tokens_seen": 23186624, "step": 34420 }, { "epoch": 0.8410084772677302, "grad_norm": 5.667567729949951, "learning_rate": 1.971801088416787e-06, "loss": 0.0837, "num_input_tokens_seen": 23190016, "step": 34425 }, { "epoch": 0.8411306280995774, "grad_norm": 32.595516204833984, "learning_rate": 1.9717809762436075e-06, "loss": 0.065, "num_input_tokens_seen": 23193344, "step": 34430 }, { "epoch": 0.8412527789314245, "grad_norm": 11.109789848327637, "learning_rate": 1.9717608570033755e-06, "loss": 0.0964, "num_input_tokens_seen": 23196928, "step": 34435 }, { "epoch": 0.8413749297632717, "grad_norm": 0.47146910429000854, "learning_rate": 1.971740730696237e-06, "loss": 0.0474, "num_input_tokens_seen": 23200192, "step": 34440 }, { "epoch": 0.8414970805951189, "grad_norm": 11.289263725280762, "learning_rate": 1.9717205973223386e-06, "loss": 0.1256, "num_input_tokens_seen": 23203200, "step": 34445 }, { "epoch": 0.841619231426966, "grad_norm": 1.9000232219696045, "learning_rate": 1.9717004568818266e-06, "loss": 0.0673, "num_input_tokens_seen": 23206464, "step": 34450 }, { "epoch": 0.8417413822588132, "grad_norm": 47.20060729980469, "learning_rate": 1.9716803093748474e-06, "loss": 0.074, "num_input_tokens_seen": 23209600, "step": 34455 }, { "epoch": 0.8418635330906603, "grad_norm": 5.45139217376709, "learning_rate": 1.971660154801548e-06, "loss": 0.1398, "num_input_tokens_seen": 23212800, "step": 34460 }, { "epoch": 0.8419856839225075, "grad_norm": 1.3095818758010864, "learning_rate": 1.9716399931620743e-06, "loss": 0.0769, "num_input_tokens_seen": 23215872, "step": 34465 }, { "epoch": 0.8421078347543547, "grad_norm": 0.6733562350273132, "learning_rate": 1.9716198244565734e-06, "loss": 0.1463, "num_input_tokens_seen": 23219456, "step": 34470 }, { "epoch": 0.8422299855862019, "grad_norm": 0.19409097731113434, "learning_rate": 1.9715996486851915e-06, "loss": 0.0977, "num_input_tokens_seen": 23222912, "step": 34475 }, { "epoch": 0.842352136418049, "grad_norm": 1.5221052169799805, "learning_rate": 1.971579465848076e-06, "loss": 0.0233, "num_input_tokens_seen": 23225920, "step": 34480 }, { "epoch": 0.8424742872498961, "grad_norm": 44.901756286621094, "learning_rate": 1.971559275945373e-06, "loss": 0.0681, "num_input_tokens_seen": 23228992, "step": 34485 }, { "epoch": 0.8425964380817433, "grad_norm": 79.00640106201172, "learning_rate": 1.9715390789772297e-06, "loss": 0.2153, "num_input_tokens_seen": 23232064, "step": 34490 }, { "epoch": 0.8427185889135905, "grad_norm": 20.789770126342773, "learning_rate": 1.971518874943793e-06, "loss": 0.1866, "num_input_tokens_seen": 23235392, "step": 34495 }, { "epoch": 0.8428407397454377, "grad_norm": 2.384317398071289, "learning_rate": 1.97149866384521e-06, "loss": 0.0846, "num_input_tokens_seen": 23239040, "step": 34500 }, { "epoch": 0.8429628905772848, "grad_norm": 0.427656352519989, "learning_rate": 1.971478445681627e-06, "loss": 0.1035, "num_input_tokens_seen": 23242496, "step": 34505 }, { "epoch": 0.843085041409132, "grad_norm": 1.450196385383606, "learning_rate": 1.9714582204531916e-06, "loss": 0.0301, "num_input_tokens_seen": 23245696, "step": 34510 }, { "epoch": 0.8432071922409792, "grad_norm": 7.800384998321533, "learning_rate": 1.9714379881600507e-06, "loss": 0.1464, "num_input_tokens_seen": 23249152, "step": 34515 }, { "epoch": 0.8433293430728264, "grad_norm": 19.53229522705078, "learning_rate": 1.9714177488023514e-06, "loss": 0.3484, "num_input_tokens_seen": 23251968, "step": 34520 }, { "epoch": 0.8434514939046734, "grad_norm": 0.45127061009407043, "learning_rate": 1.971397502380241e-06, "loss": 0.0453, "num_input_tokens_seen": 23254976, "step": 34525 }, { "epoch": 0.8435736447365206, "grad_norm": 0.17473919689655304, "learning_rate": 1.971377248893867e-06, "loss": 0.0724, "num_input_tokens_seen": 23258048, "step": 34530 }, { "epoch": 0.8436957955683678, "grad_norm": 9.839377403259277, "learning_rate": 1.971356988343376e-06, "loss": 0.1655, "num_input_tokens_seen": 23262144, "step": 34535 }, { "epoch": 0.843817946400215, "grad_norm": 33.3941764831543, "learning_rate": 1.971336720728916e-06, "loss": 0.0349, "num_input_tokens_seen": 23266176, "step": 34540 }, { "epoch": 0.8439400972320622, "grad_norm": 26.51119613647461, "learning_rate": 1.9713164460506337e-06, "loss": 0.122, "num_input_tokens_seen": 23269440, "step": 34545 }, { "epoch": 0.8440622480639093, "grad_norm": 11.167991638183594, "learning_rate": 1.971296164308677e-06, "loss": 0.1521, "num_input_tokens_seen": 23272768, "step": 34550 }, { "epoch": 0.8441843988957565, "grad_norm": 27.68284797668457, "learning_rate": 1.971275875503194e-06, "loss": 0.1201, "num_input_tokens_seen": 23276288, "step": 34555 }, { "epoch": 0.8443065497276037, "grad_norm": 5.643763542175293, "learning_rate": 1.9712555796343307e-06, "loss": 0.0251, "num_input_tokens_seen": 23279424, "step": 34560 }, { "epoch": 0.8444287005594509, "grad_norm": 5.355016708374023, "learning_rate": 1.9712352767022364e-06, "loss": 0.0856, "num_input_tokens_seen": 23282816, "step": 34565 }, { "epoch": 0.8445508513912979, "grad_norm": 19.52223014831543, "learning_rate": 1.971214966707057e-06, "loss": 0.0897, "num_input_tokens_seen": 23286208, "step": 34570 }, { "epoch": 0.8446730022231451, "grad_norm": 32.717464447021484, "learning_rate": 1.971194649648942e-06, "loss": 0.0634, "num_input_tokens_seen": 23289536, "step": 34575 }, { "epoch": 0.8447951530549923, "grad_norm": 31.83834457397461, "learning_rate": 1.971174325528038e-06, "loss": 0.0682, "num_input_tokens_seen": 23293248, "step": 34580 }, { "epoch": 0.8449173038868395, "grad_norm": 43.34263229370117, "learning_rate": 1.971153994344493e-06, "loss": 0.1036, "num_input_tokens_seen": 23296128, "step": 34585 }, { "epoch": 0.8450394547186867, "grad_norm": 24.69635581970215, "learning_rate": 1.971133656098455e-06, "loss": 0.1706, "num_input_tokens_seen": 23299136, "step": 34590 }, { "epoch": 0.8451616055505338, "grad_norm": 8.943610191345215, "learning_rate": 1.9711133107900715e-06, "loss": 0.2031, "num_input_tokens_seen": 23302208, "step": 34595 }, { "epoch": 0.845283756382381, "grad_norm": 22.528676986694336, "learning_rate": 1.971092958419491e-06, "loss": 0.0252, "num_input_tokens_seen": 23305344, "step": 34600 }, { "epoch": 0.8454059072142281, "grad_norm": 0.08624982088804245, "learning_rate": 1.971072598986862e-06, "loss": 0.151, "num_input_tokens_seen": 23308480, "step": 34605 }, { "epoch": 0.8455280580460753, "grad_norm": 16.910186767578125, "learning_rate": 1.971052232492331e-06, "loss": 0.1835, "num_input_tokens_seen": 23311744, "step": 34610 }, { "epoch": 0.8456502088779224, "grad_norm": 1.0254402160644531, "learning_rate": 1.9710318589360476e-06, "loss": 0.0506, "num_input_tokens_seen": 23314880, "step": 34615 }, { "epoch": 0.8457723597097696, "grad_norm": 58.980690002441406, "learning_rate": 1.971011478318159e-06, "loss": 0.1065, "num_input_tokens_seen": 23318208, "step": 34620 }, { "epoch": 0.8458945105416168, "grad_norm": 25.34699821472168, "learning_rate": 1.970991090638814e-06, "loss": 0.1044, "num_input_tokens_seen": 23321600, "step": 34625 }, { "epoch": 0.846016661373464, "grad_norm": 33.81626510620117, "learning_rate": 1.9709706958981602e-06, "loss": 0.1925, "num_input_tokens_seen": 23325312, "step": 34630 }, { "epoch": 0.8461388122053111, "grad_norm": 0.3212079405784607, "learning_rate": 1.9709502940963468e-06, "loss": 0.1667, "num_input_tokens_seen": 23329216, "step": 34635 }, { "epoch": 0.8462609630371583, "grad_norm": 23.59168815612793, "learning_rate": 1.9709298852335214e-06, "loss": 0.0684, "num_input_tokens_seen": 23333120, "step": 34640 }, { "epoch": 0.8463831138690054, "grad_norm": 1.1817677021026611, "learning_rate": 1.9709094693098328e-06, "loss": 0.1576, "num_input_tokens_seen": 23336768, "step": 34645 }, { "epoch": 0.8465052647008526, "grad_norm": 0.2409643828868866, "learning_rate": 1.970889046325429e-06, "loss": 0.1096, "num_input_tokens_seen": 23340608, "step": 34650 }, { "epoch": 0.8466274155326998, "grad_norm": 22.38766860961914, "learning_rate": 1.97086861628046e-06, "loss": 0.1202, "num_input_tokens_seen": 23343808, "step": 34655 }, { "epoch": 0.8467495663645469, "grad_norm": 0.2981724739074707, "learning_rate": 1.9708481791750726e-06, "loss": 0.0725, "num_input_tokens_seen": 23347328, "step": 34660 }, { "epoch": 0.8468717171963941, "grad_norm": 28.143102645874023, "learning_rate": 1.970827735009416e-06, "loss": 0.2058, "num_input_tokens_seen": 23351232, "step": 34665 }, { "epoch": 0.8469938680282413, "grad_norm": 0.41867315769195557, "learning_rate": 1.970807283783639e-06, "loss": 0.1002, "num_input_tokens_seen": 23354624, "step": 34670 }, { "epoch": 0.8471160188600885, "grad_norm": 0.2982100546360016, "learning_rate": 1.9707868254978904e-06, "loss": 0.1547, "num_input_tokens_seen": 23358016, "step": 34675 }, { "epoch": 0.8472381696919355, "grad_norm": 2.644249677658081, "learning_rate": 1.970766360152319e-06, "loss": 0.0944, "num_input_tokens_seen": 23361600, "step": 34680 }, { "epoch": 0.8473603205237827, "grad_norm": 0.44487500190734863, "learning_rate": 1.9707458877470735e-06, "loss": 0.1293, "num_input_tokens_seen": 23364928, "step": 34685 }, { "epoch": 0.8474824713556299, "grad_norm": 30.027263641357422, "learning_rate": 1.970725408282303e-06, "loss": 0.0862, "num_input_tokens_seen": 23368256, "step": 34690 }, { "epoch": 0.8476046221874771, "grad_norm": 22.388761520385742, "learning_rate": 1.970704921758156e-06, "loss": 0.1469, "num_input_tokens_seen": 23372160, "step": 34695 }, { "epoch": 0.8477267730193243, "grad_norm": 0.25965288281440735, "learning_rate": 1.9706844281747817e-06, "loss": 0.1028, "num_input_tokens_seen": 23375424, "step": 34700 }, { "epoch": 0.8478489238511714, "grad_norm": 0.06832029670476913, "learning_rate": 1.970663927532329e-06, "loss": 0.0444, "num_input_tokens_seen": 23379072, "step": 34705 }, { "epoch": 0.8479710746830186, "grad_norm": 0.2789788544178009, "learning_rate": 1.9706434198309472e-06, "loss": 0.0764, "num_input_tokens_seen": 23382912, "step": 34710 }, { "epoch": 0.8480932255148658, "grad_norm": 4.7952446937561035, "learning_rate": 1.9706229050707855e-06, "loss": 0.1532, "num_input_tokens_seen": 23385920, "step": 34715 }, { "epoch": 0.848215376346713, "grad_norm": 0.19278299808502197, "learning_rate": 1.9706023832519932e-06, "loss": 0.0107, "num_input_tokens_seen": 23389248, "step": 34720 }, { "epoch": 0.84833752717856, "grad_norm": 29.569690704345703, "learning_rate": 1.970581854374719e-06, "loss": 0.2432, "num_input_tokens_seen": 23392320, "step": 34725 }, { "epoch": 0.8484596780104072, "grad_norm": 55.5435791015625, "learning_rate": 1.9705613184391124e-06, "loss": 0.0913, "num_input_tokens_seen": 23395392, "step": 34730 }, { "epoch": 0.8485818288422544, "grad_norm": 27.48069953918457, "learning_rate": 1.970540775445323e-06, "loss": 0.0944, "num_input_tokens_seen": 23398592, "step": 34735 }, { "epoch": 0.8487039796741016, "grad_norm": 15.223067283630371, "learning_rate": 1.9705202253935e-06, "loss": 0.1853, "num_input_tokens_seen": 23402112, "step": 34740 }, { "epoch": 0.8488261305059488, "grad_norm": 55.700496673583984, "learning_rate": 1.970499668283793e-06, "loss": 0.1728, "num_input_tokens_seen": 23405376, "step": 34745 }, { "epoch": 0.8489482813377959, "grad_norm": 0.5155206322669983, "learning_rate": 1.9704791041163514e-06, "loss": 0.0532, "num_input_tokens_seen": 23409024, "step": 34750 }, { "epoch": 0.8490704321696431, "grad_norm": 29.440126419067383, "learning_rate": 1.9704585328913247e-06, "loss": 0.1421, "num_input_tokens_seen": 23412224, "step": 34755 }, { "epoch": 0.8491925830014903, "grad_norm": 7.78788948059082, "learning_rate": 1.9704379546088626e-06, "loss": 0.1233, "num_input_tokens_seen": 23415552, "step": 34760 }, { "epoch": 0.8493147338333374, "grad_norm": 1.2759686708450317, "learning_rate": 1.9704173692691142e-06, "loss": 0.077, "num_input_tokens_seen": 23419328, "step": 34765 }, { "epoch": 0.8494368846651845, "grad_norm": 13.394909858703613, "learning_rate": 1.9703967768722305e-06, "loss": 0.1388, "num_input_tokens_seen": 23422656, "step": 34770 }, { "epoch": 0.8495590354970317, "grad_norm": 39.994712829589844, "learning_rate": 1.97037617741836e-06, "loss": 0.073, "num_input_tokens_seen": 23425920, "step": 34775 }, { "epoch": 0.8496811863288789, "grad_norm": 14.517159461975098, "learning_rate": 1.9703555709076528e-06, "loss": 0.1236, "num_input_tokens_seen": 23429696, "step": 34780 }, { "epoch": 0.8498033371607261, "grad_norm": 1.9515538215637207, "learning_rate": 1.9703349573402587e-06, "loss": 0.045, "num_input_tokens_seen": 23433600, "step": 34785 }, { "epoch": 0.8499254879925733, "grad_norm": 15.69411563873291, "learning_rate": 1.970314336716328e-06, "loss": 0.0993, "num_input_tokens_seen": 23437184, "step": 34790 }, { "epoch": 0.8500476388244204, "grad_norm": 69.99845886230469, "learning_rate": 1.9702937090360107e-06, "loss": 0.0301, "num_input_tokens_seen": 23440768, "step": 34795 }, { "epoch": 0.8501697896562675, "grad_norm": 6.935617446899414, "learning_rate": 1.9702730742994566e-06, "loss": 0.113, "num_input_tokens_seen": 23443776, "step": 34800 }, { "epoch": 0.8502919404881147, "grad_norm": 24.21588134765625, "learning_rate": 1.9702524325068156e-06, "loss": 0.1212, "num_input_tokens_seen": 23447360, "step": 34805 }, { "epoch": 0.8504140913199619, "grad_norm": 14.074463844299316, "learning_rate": 1.9702317836582378e-06, "loss": 0.1604, "num_input_tokens_seen": 23450880, "step": 34810 }, { "epoch": 0.850536242151809, "grad_norm": 47.931419372558594, "learning_rate": 1.9702111277538737e-06, "loss": 0.113, "num_input_tokens_seen": 23454720, "step": 34815 }, { "epoch": 0.8506583929836562, "grad_norm": 10.696855545043945, "learning_rate": 1.970190464793873e-06, "loss": 0.1991, "num_input_tokens_seen": 23457920, "step": 34820 }, { "epoch": 0.8507805438155034, "grad_norm": 7.281139373779297, "learning_rate": 1.9701697947783866e-06, "loss": 0.1783, "num_input_tokens_seen": 23461504, "step": 34825 }, { "epoch": 0.8509026946473506, "grad_norm": 7.84771203994751, "learning_rate": 1.9701491177075645e-06, "loss": 0.1332, "num_input_tokens_seen": 23464960, "step": 34830 }, { "epoch": 0.8510248454791978, "grad_norm": 19.61056137084961, "learning_rate": 1.9701284335815573e-06, "loss": 0.0566, "num_input_tokens_seen": 23468480, "step": 34835 }, { "epoch": 0.8511469963110448, "grad_norm": 4.750822067260742, "learning_rate": 1.9701077424005148e-06, "loss": 0.0771, "num_input_tokens_seen": 23472192, "step": 34840 }, { "epoch": 0.851269147142892, "grad_norm": 2.461702823638916, "learning_rate": 1.970087044164588e-06, "loss": 0.0344, "num_input_tokens_seen": 23475520, "step": 34845 }, { "epoch": 0.8513912979747392, "grad_norm": 2.9755985736846924, "learning_rate": 1.970066338873927e-06, "loss": 0.1812, "num_input_tokens_seen": 23478528, "step": 34850 }, { "epoch": 0.8515134488065864, "grad_norm": 15.43201732635498, "learning_rate": 1.9700456265286827e-06, "loss": 0.0944, "num_input_tokens_seen": 23482240, "step": 34855 }, { "epoch": 0.8516355996384335, "grad_norm": 0.9599284529685974, "learning_rate": 1.970024907129006e-06, "loss": 0.0797, "num_input_tokens_seen": 23485568, "step": 34860 }, { "epoch": 0.8517577504702807, "grad_norm": 7.2150492668151855, "learning_rate": 1.9700041806750477e-06, "loss": 0.0867, "num_input_tokens_seen": 23488832, "step": 34865 }, { "epoch": 0.8518799013021279, "grad_norm": 22.719148635864258, "learning_rate": 1.9699834471669574e-06, "loss": 0.1091, "num_input_tokens_seen": 23492672, "step": 34870 }, { "epoch": 0.8520020521339751, "grad_norm": 46.67292785644531, "learning_rate": 1.9699627066048868e-06, "loss": 0.0721, "num_input_tokens_seen": 23496000, "step": 34875 }, { "epoch": 0.8521242029658221, "grad_norm": 0.11148344725370407, "learning_rate": 1.9699419589889863e-06, "loss": 0.0762, "num_input_tokens_seen": 23499968, "step": 34880 }, { "epoch": 0.8522463537976693, "grad_norm": 20.346290588378906, "learning_rate": 1.9699212043194075e-06, "loss": 0.1507, "num_input_tokens_seen": 23503168, "step": 34885 }, { "epoch": 0.8523685046295165, "grad_norm": 52.53269577026367, "learning_rate": 1.9699004425963003e-06, "loss": 0.1667, "num_input_tokens_seen": 23506368, "step": 34890 }, { "epoch": 0.8524906554613637, "grad_norm": 1.005509376525879, "learning_rate": 1.9698796738198163e-06, "loss": 0.1714, "num_input_tokens_seen": 23509440, "step": 34895 }, { "epoch": 0.8526128062932109, "grad_norm": 6.515542030334473, "learning_rate": 1.9698588979901064e-06, "loss": 0.0708, "num_input_tokens_seen": 23512640, "step": 34900 }, { "epoch": 0.852734957125058, "grad_norm": 10.081113815307617, "learning_rate": 1.969838115107322e-06, "loss": 0.0841, "num_input_tokens_seen": 23515968, "step": 34905 }, { "epoch": 0.8528571079569052, "grad_norm": 42.108604431152344, "learning_rate": 1.9698173251716138e-06, "loss": 0.0816, "num_input_tokens_seen": 23519360, "step": 34910 }, { "epoch": 0.8529792587887524, "grad_norm": 38.693603515625, "learning_rate": 1.9697965281831335e-06, "loss": 0.1015, "num_input_tokens_seen": 23522496, "step": 34915 }, { "epoch": 0.8531014096205995, "grad_norm": 0.6154043078422546, "learning_rate": 1.9697757241420315e-06, "loss": 0.0559, "num_input_tokens_seen": 23526272, "step": 34920 }, { "epoch": 0.8532235604524466, "grad_norm": 0.0829826146364212, "learning_rate": 1.96975491304846e-06, "loss": 0.0682, "num_input_tokens_seen": 23529536, "step": 34925 }, { "epoch": 0.8533457112842938, "grad_norm": 13.311882972717285, "learning_rate": 1.9697340949025697e-06, "loss": 0.0585, "num_input_tokens_seen": 23532672, "step": 34930 }, { "epoch": 0.853467862116141, "grad_norm": 27.622638702392578, "learning_rate": 1.9697132697045126e-06, "loss": 0.0794, "num_input_tokens_seen": 23536384, "step": 34935 }, { "epoch": 0.8535900129479882, "grad_norm": 12.490110397338867, "learning_rate": 1.969692437454439e-06, "loss": 0.1116, "num_input_tokens_seen": 23540096, "step": 34940 }, { "epoch": 0.8537121637798354, "grad_norm": 24.080808639526367, "learning_rate": 1.969671598152502e-06, "loss": 0.1717, "num_input_tokens_seen": 23544256, "step": 34945 }, { "epoch": 0.8538343146116825, "grad_norm": 25.397422790527344, "learning_rate": 1.9696507517988517e-06, "loss": 0.1252, "num_input_tokens_seen": 23547392, "step": 34950 }, { "epoch": 0.8539564654435297, "grad_norm": 4.807941436767578, "learning_rate": 1.9696298983936408e-06, "loss": 0.1007, "num_input_tokens_seen": 23550592, "step": 34955 }, { "epoch": 0.8540786162753768, "grad_norm": 6.345379829406738, "learning_rate": 1.9696090379370203e-06, "loss": 0.0744, "num_input_tokens_seen": 23554688, "step": 34960 }, { "epoch": 0.854200767107224, "grad_norm": 0.38732364773750305, "learning_rate": 1.9695881704291423e-06, "loss": 0.145, "num_input_tokens_seen": 23558464, "step": 34965 }, { "epoch": 0.8543229179390711, "grad_norm": 1.831109881401062, "learning_rate": 1.969567295870158e-06, "loss": 0.0465, "num_input_tokens_seen": 23561216, "step": 34970 }, { "epoch": 0.8544450687709183, "grad_norm": 0.9281952977180481, "learning_rate": 1.9695464142602195e-06, "loss": 0.0931, "num_input_tokens_seen": 23565056, "step": 34975 }, { "epoch": 0.8545672196027655, "grad_norm": 0.18389178812503815, "learning_rate": 1.9695255255994788e-06, "loss": 0.0676, "num_input_tokens_seen": 23568320, "step": 34980 }, { "epoch": 0.8546893704346127, "grad_norm": 46.50515365600586, "learning_rate": 1.969504629888088e-06, "loss": 0.0429, "num_input_tokens_seen": 23571840, "step": 34985 }, { "epoch": 0.8548115212664599, "grad_norm": 29.618240356445312, "learning_rate": 1.9694837271261985e-06, "loss": 0.1533, "num_input_tokens_seen": 23575232, "step": 34990 }, { "epoch": 0.854933672098307, "grad_norm": 29.626087188720703, "learning_rate": 1.9694628173139626e-06, "loss": 0.1684, "num_input_tokens_seen": 23579072, "step": 34995 }, { "epoch": 0.8550558229301541, "grad_norm": 21.039466857910156, "learning_rate": 1.969441900451532e-06, "loss": 0.1663, "num_input_tokens_seen": 23582464, "step": 35000 }, { "epoch": 0.8551779737620013, "grad_norm": 9.503751754760742, "learning_rate": 1.96942097653906e-06, "loss": 0.2365, "num_input_tokens_seen": 23585600, "step": 35005 }, { "epoch": 0.8553001245938485, "grad_norm": 18.334392547607422, "learning_rate": 1.969400045576697e-06, "loss": 0.1384, "num_input_tokens_seen": 23589376, "step": 35010 }, { "epoch": 0.8554222754256956, "grad_norm": 18.294281005859375, "learning_rate": 1.969379107564597e-06, "loss": 0.0798, "num_input_tokens_seen": 23592384, "step": 35015 }, { "epoch": 0.8555444262575428, "grad_norm": 14.820401191711426, "learning_rate": 1.9693581625029108e-06, "loss": 0.0671, "num_input_tokens_seen": 23595904, "step": 35020 }, { "epoch": 0.85566657708939, "grad_norm": 13.708227157592773, "learning_rate": 1.9693372103917913e-06, "loss": 0.0827, "num_input_tokens_seen": 23598976, "step": 35025 }, { "epoch": 0.8557887279212372, "grad_norm": 15.605507850646973, "learning_rate": 1.969316251231391e-06, "loss": 0.1113, "num_input_tokens_seen": 23602368, "step": 35030 }, { "epoch": 0.8559108787530844, "grad_norm": 31.886110305786133, "learning_rate": 1.9692952850218624e-06, "loss": 0.1937, "num_input_tokens_seen": 23606272, "step": 35035 }, { "epoch": 0.8560330295849314, "grad_norm": 32.31554412841797, "learning_rate": 1.9692743117633576e-06, "loss": 0.1756, "num_input_tokens_seen": 23609856, "step": 35040 }, { "epoch": 0.8561551804167786, "grad_norm": 2.543952703475952, "learning_rate": 1.969253331456029e-06, "loss": 0.0429, "num_input_tokens_seen": 23613248, "step": 35045 }, { "epoch": 0.8562773312486258, "grad_norm": 15.292976379394531, "learning_rate": 1.96923234410003e-06, "loss": 0.0925, "num_input_tokens_seen": 23616256, "step": 35050 }, { "epoch": 0.856399482080473, "grad_norm": 33.541866302490234, "learning_rate": 1.9692113496955124e-06, "loss": 0.2194, "num_input_tokens_seen": 23619392, "step": 35055 }, { "epoch": 0.8565216329123201, "grad_norm": 13.459238052368164, "learning_rate": 1.9691903482426295e-06, "loss": 0.0587, "num_input_tokens_seen": 23623552, "step": 35060 }, { "epoch": 0.8566437837441673, "grad_norm": 1.0485910177230835, "learning_rate": 1.9691693397415333e-06, "loss": 0.0937, "num_input_tokens_seen": 23627200, "step": 35065 }, { "epoch": 0.8567659345760145, "grad_norm": 18.415218353271484, "learning_rate": 1.9691483241923776e-06, "loss": 0.154, "num_input_tokens_seen": 23630272, "step": 35070 }, { "epoch": 0.8568880854078617, "grad_norm": 23.374282836914062, "learning_rate": 1.969127301595314e-06, "loss": 0.0931, "num_input_tokens_seen": 23633408, "step": 35075 }, { "epoch": 0.8570102362397088, "grad_norm": 13.124170303344727, "learning_rate": 1.9691062719504962e-06, "loss": 0.2184, "num_input_tokens_seen": 23636928, "step": 35080 }, { "epoch": 0.8571323870715559, "grad_norm": 0.5865902900695801, "learning_rate": 1.969085235258077e-06, "loss": 0.0106, "num_input_tokens_seen": 23640384, "step": 35085 }, { "epoch": 0.8572545379034031, "grad_norm": 1.6755985021591187, "learning_rate": 1.969064191518209e-06, "loss": 0.0371, "num_input_tokens_seen": 23644416, "step": 35090 }, { "epoch": 0.8573766887352503, "grad_norm": 15.779130935668945, "learning_rate": 1.969043140731046e-06, "loss": 0.0719, "num_input_tokens_seen": 23647488, "step": 35095 }, { "epoch": 0.8574988395670975, "grad_norm": 18.528223037719727, "learning_rate": 1.96902208289674e-06, "loss": 0.105, "num_input_tokens_seen": 23651264, "step": 35100 }, { "epoch": 0.8576209903989446, "grad_norm": 9.245097160339355, "learning_rate": 1.9690010180154454e-06, "loss": 0.0889, "num_input_tokens_seen": 23654784, "step": 35105 }, { "epoch": 0.8577431412307918, "grad_norm": 7.738526344299316, "learning_rate": 1.9689799460873147e-06, "loss": 0.16, "num_input_tokens_seen": 23658368, "step": 35110 }, { "epoch": 0.857865292062639, "grad_norm": 1.2937458753585815, "learning_rate": 1.968958867112501e-06, "loss": 0.02, "num_input_tokens_seen": 23662144, "step": 35115 }, { "epoch": 0.8579874428944861, "grad_norm": 1.212963581085205, "learning_rate": 1.9689377810911577e-06, "loss": 0.1565, "num_input_tokens_seen": 23665472, "step": 35120 }, { "epoch": 0.8581095937263333, "grad_norm": 24.34621238708496, "learning_rate": 1.9689166880234385e-06, "loss": 0.046, "num_input_tokens_seen": 23668864, "step": 35125 }, { "epoch": 0.8582317445581804, "grad_norm": 12.43953800201416, "learning_rate": 1.9688955879094966e-06, "loss": 0.1591, "num_input_tokens_seen": 23672128, "step": 35130 }, { "epoch": 0.8583538953900276, "grad_norm": 21.296579360961914, "learning_rate": 1.9688744807494853e-06, "loss": 0.071, "num_input_tokens_seen": 23675648, "step": 35135 }, { "epoch": 0.8584760462218748, "grad_norm": 1.4470196962356567, "learning_rate": 1.968853366543558e-06, "loss": 0.0692, "num_input_tokens_seen": 23679232, "step": 35140 }, { "epoch": 0.858598197053722, "grad_norm": 23.92950439453125, "learning_rate": 1.9688322452918686e-06, "loss": 0.148, "num_input_tokens_seen": 23682176, "step": 35145 }, { "epoch": 0.8587203478855691, "grad_norm": 0.055989764630794525, "learning_rate": 1.9688111169945706e-06, "loss": 0.0971, "num_input_tokens_seen": 23685440, "step": 35150 }, { "epoch": 0.8588424987174162, "grad_norm": 34.370025634765625, "learning_rate": 1.9687899816518173e-06, "loss": 0.1121, "num_input_tokens_seen": 23688512, "step": 35155 }, { "epoch": 0.8589646495492634, "grad_norm": 23.321041107177734, "learning_rate": 1.968768839263763e-06, "loss": 0.0545, "num_input_tokens_seen": 23691840, "step": 35160 }, { "epoch": 0.8590868003811106, "grad_norm": 11.707565307617188, "learning_rate": 1.968747689830561e-06, "loss": 0.182, "num_input_tokens_seen": 23695488, "step": 35165 }, { "epoch": 0.8592089512129577, "grad_norm": 0.9634722471237183, "learning_rate": 1.968726533352365e-06, "loss": 0.1623, "num_input_tokens_seen": 23699136, "step": 35170 }, { "epoch": 0.8593311020448049, "grad_norm": 33.24665832519531, "learning_rate": 1.9687053698293293e-06, "loss": 0.315, "num_input_tokens_seen": 23702400, "step": 35175 }, { "epoch": 0.8594532528766521, "grad_norm": 20.123992919921875, "learning_rate": 1.9686841992616077e-06, "loss": 0.0951, "num_input_tokens_seen": 23705728, "step": 35180 }, { "epoch": 0.8595754037084993, "grad_norm": 48.02340316772461, "learning_rate": 1.968663021649354e-06, "loss": 0.2311, "num_input_tokens_seen": 23709120, "step": 35185 }, { "epoch": 0.8596975545403465, "grad_norm": 2.7457668781280518, "learning_rate": 1.9686418369927224e-06, "loss": 0.1489, "num_input_tokens_seen": 23712512, "step": 35190 }, { "epoch": 0.8598197053721935, "grad_norm": 19.584701538085938, "learning_rate": 1.9686206452918667e-06, "loss": 0.0941, "num_input_tokens_seen": 23715904, "step": 35195 }, { "epoch": 0.8599418562040407, "grad_norm": 14.326571464538574, "learning_rate": 1.968599446546941e-06, "loss": 0.0694, "num_input_tokens_seen": 23720000, "step": 35200 }, { "epoch": 0.8600640070358879, "grad_norm": 27.21695327758789, "learning_rate": 1.9685782407580995e-06, "loss": 0.1236, "num_input_tokens_seen": 23723200, "step": 35205 }, { "epoch": 0.8601861578677351, "grad_norm": 26.761911392211914, "learning_rate": 1.9685570279254967e-06, "loss": 0.0757, "num_input_tokens_seen": 23726656, "step": 35210 }, { "epoch": 0.8603083086995822, "grad_norm": 20.203405380249023, "learning_rate": 1.9685358080492865e-06, "loss": 0.0956, "num_input_tokens_seen": 23730048, "step": 35215 }, { "epoch": 0.8604304595314294, "grad_norm": 36.64809799194336, "learning_rate": 1.9685145811296237e-06, "loss": 0.074, "num_input_tokens_seen": 23733312, "step": 35220 }, { "epoch": 0.8605526103632766, "grad_norm": 6.595607757568359, "learning_rate": 1.9684933471666626e-06, "loss": 0.1863, "num_input_tokens_seen": 23736640, "step": 35225 }, { "epoch": 0.8606747611951238, "grad_norm": 60.852046966552734, "learning_rate": 1.968472106160557e-06, "loss": 0.0825, "num_input_tokens_seen": 23740416, "step": 35230 }, { "epoch": 0.860796912026971, "grad_norm": 36.590965270996094, "learning_rate": 1.9684508581114616e-06, "loss": 0.2678, "num_input_tokens_seen": 23743680, "step": 35235 }, { "epoch": 0.860919062858818, "grad_norm": 1.6028282642364502, "learning_rate": 1.9684296030195317e-06, "loss": 0.1087, "num_input_tokens_seen": 23747072, "step": 35240 }, { "epoch": 0.8610412136906652, "grad_norm": 1.05069899559021, "learning_rate": 1.9684083408849206e-06, "loss": 0.0148, "num_input_tokens_seen": 23750592, "step": 35245 }, { "epoch": 0.8611633645225124, "grad_norm": 51.769962310791016, "learning_rate": 1.968387071707784e-06, "loss": 0.2289, "num_input_tokens_seen": 23753408, "step": 35250 }, { "epoch": 0.8612855153543596, "grad_norm": 19.687625885009766, "learning_rate": 1.9683657954882757e-06, "loss": 0.1386, "num_input_tokens_seen": 23756480, "step": 35255 }, { "epoch": 0.8614076661862067, "grad_norm": 1.0411657094955444, "learning_rate": 1.968344512226551e-06, "loss": 0.1047, "num_input_tokens_seen": 23759808, "step": 35260 }, { "epoch": 0.8615298170180539, "grad_norm": 0.37326228618621826, "learning_rate": 1.9683232219227646e-06, "loss": 0.1218, "num_input_tokens_seen": 23763200, "step": 35265 }, { "epoch": 0.8616519678499011, "grad_norm": 4.8047027587890625, "learning_rate": 1.9683019245770717e-06, "loss": 0.1086, "num_input_tokens_seen": 23766464, "step": 35270 }, { "epoch": 0.8617741186817482, "grad_norm": 21.94441032409668, "learning_rate": 1.9682806201896264e-06, "loss": 0.234, "num_input_tokens_seen": 23770240, "step": 35275 }, { "epoch": 0.8618962695135954, "grad_norm": 36.986934661865234, "learning_rate": 1.968259308760584e-06, "loss": 0.0527, "num_input_tokens_seen": 23773632, "step": 35280 }, { "epoch": 0.8620184203454425, "grad_norm": 0.33001336455345154, "learning_rate": 1.9682379902900995e-06, "loss": 0.0753, "num_input_tokens_seen": 23777408, "step": 35285 }, { "epoch": 0.8621405711772897, "grad_norm": 41.58378219604492, "learning_rate": 1.968216664778328e-06, "loss": 0.1124, "num_input_tokens_seen": 23780864, "step": 35290 }, { "epoch": 0.8622627220091369, "grad_norm": 7.427133083343506, "learning_rate": 1.9681953322254243e-06, "loss": 0.184, "num_input_tokens_seen": 23784320, "step": 35295 }, { "epoch": 0.8623848728409841, "grad_norm": 38.231021881103516, "learning_rate": 1.968173992631544e-06, "loss": 0.0935, "num_input_tokens_seen": 23788608, "step": 35300 }, { "epoch": 0.8625070236728312, "grad_norm": 23.69070816040039, "learning_rate": 1.968152645996842e-06, "loss": 0.2388, "num_input_tokens_seen": 23792000, "step": 35305 }, { "epoch": 0.8626291745046784, "grad_norm": 13.36784839630127, "learning_rate": 1.9681312923214734e-06, "loss": 0.1465, "num_input_tokens_seen": 23795584, "step": 35310 }, { "epoch": 0.8627513253365255, "grad_norm": 1.5608197450637817, "learning_rate": 1.9681099316055934e-06, "loss": 0.1328, "num_input_tokens_seen": 23798720, "step": 35315 }, { "epoch": 0.8628734761683727, "grad_norm": 0.44920867681503296, "learning_rate": 1.968088563849358e-06, "loss": 0.2706, "num_input_tokens_seen": 23801728, "step": 35320 }, { "epoch": 0.8629956270002199, "grad_norm": 0.6280757188796997, "learning_rate": 1.968067189052922e-06, "loss": 0.078, "num_input_tokens_seen": 23805504, "step": 35325 }, { "epoch": 0.863117777832067, "grad_norm": 40.889556884765625, "learning_rate": 1.968045807216441e-06, "loss": 0.17, "num_input_tokens_seen": 23809344, "step": 35330 }, { "epoch": 0.8632399286639142, "grad_norm": 9.817305564880371, "learning_rate": 1.968024418340071e-06, "loss": 0.2026, "num_input_tokens_seen": 23812416, "step": 35335 }, { "epoch": 0.8633620794957614, "grad_norm": 10.511480331420898, "learning_rate": 1.968003022423966e-06, "loss": 0.1147, "num_input_tokens_seen": 23815744, "step": 35340 }, { "epoch": 0.8634842303276086, "grad_norm": 8.885690689086914, "learning_rate": 1.9679816194682834e-06, "loss": 0.1263, "num_input_tokens_seen": 23819200, "step": 35345 }, { "epoch": 0.8636063811594556, "grad_norm": 0.8410807847976685, "learning_rate": 1.967960209473178e-06, "loss": 0.0971, "num_input_tokens_seen": 23822528, "step": 35350 }, { "epoch": 0.8637285319913028, "grad_norm": 0.631669819355011, "learning_rate": 1.9679387924388058e-06, "loss": 0.1648, "num_input_tokens_seen": 23826240, "step": 35355 }, { "epoch": 0.86385068282315, "grad_norm": 22.954504013061523, "learning_rate": 1.967917368365322e-06, "loss": 0.0552, "num_input_tokens_seen": 23829184, "step": 35360 }, { "epoch": 0.8639728336549972, "grad_norm": 10.804740905761719, "learning_rate": 1.9678959372528828e-06, "loss": 0.1812, "num_input_tokens_seen": 23833088, "step": 35365 }, { "epoch": 0.8640949844868444, "grad_norm": 26.449541091918945, "learning_rate": 1.967874499101644e-06, "loss": 0.1272, "num_input_tokens_seen": 23836160, "step": 35370 }, { "epoch": 0.8642171353186915, "grad_norm": 0.31604552268981934, "learning_rate": 1.967853053911762e-06, "loss": 0.0314, "num_input_tokens_seen": 23839808, "step": 35375 }, { "epoch": 0.8643392861505387, "grad_norm": 11.573125839233398, "learning_rate": 1.967831601683392e-06, "loss": 0.1877, "num_input_tokens_seen": 23842816, "step": 35380 }, { "epoch": 0.8644614369823859, "grad_norm": 8.36064338684082, "learning_rate": 1.96781014241669e-06, "loss": 0.0687, "num_input_tokens_seen": 23846144, "step": 35385 }, { "epoch": 0.8645835878142331, "grad_norm": 16.87027359008789, "learning_rate": 1.9677886761118126e-06, "loss": 0.1174, "num_input_tokens_seen": 23849536, "step": 35390 }, { "epoch": 0.8647057386460801, "grad_norm": 3.9122729301452637, "learning_rate": 1.9677672027689156e-06, "loss": 0.177, "num_input_tokens_seen": 23853120, "step": 35395 }, { "epoch": 0.8648278894779273, "grad_norm": 7.3564066886901855, "learning_rate": 1.9677457223881553e-06, "loss": 0.2031, "num_input_tokens_seen": 23856768, "step": 35400 }, { "epoch": 0.8649500403097745, "grad_norm": 6.73646354675293, "learning_rate": 1.967724234969688e-06, "loss": 0.1071, "num_input_tokens_seen": 23860096, "step": 35405 }, { "epoch": 0.8650721911416217, "grad_norm": 9.172672271728516, "learning_rate": 1.967702740513669e-06, "loss": 0.1915, "num_input_tokens_seen": 23863296, "step": 35410 }, { "epoch": 0.8651943419734689, "grad_norm": 2.225639820098877, "learning_rate": 1.967681239020256e-06, "loss": 0.0757, "num_input_tokens_seen": 23866368, "step": 35415 }, { "epoch": 0.865316492805316, "grad_norm": 52.951171875, "learning_rate": 1.9676597304896046e-06, "loss": 0.1594, "num_input_tokens_seen": 23869568, "step": 35420 }, { "epoch": 0.8654386436371632, "grad_norm": 45.615421295166016, "learning_rate": 1.967638214921871e-06, "loss": 0.1046, "num_input_tokens_seen": 23873024, "step": 35425 }, { "epoch": 0.8655607944690104, "grad_norm": 0.6920754909515381, "learning_rate": 1.967616692317213e-06, "loss": 0.0774, "num_input_tokens_seen": 23876544, "step": 35430 }, { "epoch": 0.8656829453008575, "grad_norm": 0.914369523525238, "learning_rate": 1.9675951626757854e-06, "loss": 0.1239, "num_input_tokens_seen": 23880192, "step": 35435 }, { "epoch": 0.8658050961327046, "grad_norm": 8.883461952209473, "learning_rate": 1.9675736259977455e-06, "loss": 0.0905, "num_input_tokens_seen": 23883904, "step": 35440 }, { "epoch": 0.8659272469645518, "grad_norm": 0.3117360472679138, "learning_rate": 1.9675520822832504e-06, "loss": 0.0623, "num_input_tokens_seen": 23887296, "step": 35445 }, { "epoch": 0.866049397796399, "grad_norm": 28.35218048095703, "learning_rate": 1.967530531532456e-06, "loss": 0.0776, "num_input_tokens_seen": 23890560, "step": 35450 }, { "epoch": 0.8661715486282462, "grad_norm": 17.423656463623047, "learning_rate": 1.967508973745519e-06, "loss": 0.0951, "num_input_tokens_seen": 23893632, "step": 35455 }, { "epoch": 0.8662936994600933, "grad_norm": 5.609574794769287, "learning_rate": 1.967487408922597e-06, "loss": 0.0601, "num_input_tokens_seen": 23896640, "step": 35460 }, { "epoch": 0.8664158502919405, "grad_norm": 0.16245517134666443, "learning_rate": 1.9674658370638462e-06, "loss": 0.1787, "num_input_tokens_seen": 23900416, "step": 35465 }, { "epoch": 0.8665380011237876, "grad_norm": 12.920045852661133, "learning_rate": 1.9674442581694238e-06, "loss": 0.2974, "num_input_tokens_seen": 23903808, "step": 35470 }, { "epoch": 0.8666601519556348, "grad_norm": 0.45196205377578735, "learning_rate": 1.967422672239487e-06, "loss": 0.0575, "num_input_tokens_seen": 23907392, "step": 35475 }, { "epoch": 0.866782302787482, "grad_norm": 0.5795376896858215, "learning_rate": 1.967401079274191e-06, "loss": 0.1018, "num_input_tokens_seen": 23911168, "step": 35480 }, { "epoch": 0.8669044536193291, "grad_norm": 8.955952644348145, "learning_rate": 1.967379479273695e-06, "loss": 0.138, "num_input_tokens_seen": 23914304, "step": 35485 }, { "epoch": 0.8670266044511763, "grad_norm": 0.7784607410430908, "learning_rate": 1.9673578722381552e-06, "loss": 0.1295, "num_input_tokens_seen": 23917888, "step": 35490 }, { "epoch": 0.8671487552830235, "grad_norm": 3.1278843879699707, "learning_rate": 1.967336258167729e-06, "loss": 0.065, "num_input_tokens_seen": 23921472, "step": 35495 }, { "epoch": 0.8672709061148707, "grad_norm": 0.8848108053207397, "learning_rate": 1.9673146370625727e-06, "loss": 0.0754, "num_input_tokens_seen": 23924736, "step": 35500 }, { "epoch": 0.8673930569467178, "grad_norm": 41.079925537109375, "learning_rate": 1.9672930089228448e-06, "loss": 0.1791, "num_input_tokens_seen": 23927744, "step": 35505 }, { "epoch": 0.867515207778565, "grad_norm": 14.143448829650879, "learning_rate": 1.9672713737487018e-06, "loss": 0.1933, "num_input_tokens_seen": 23930944, "step": 35510 }, { "epoch": 0.8676373586104121, "grad_norm": 56.091392517089844, "learning_rate": 1.967249731540301e-06, "loss": 0.0915, "num_input_tokens_seen": 23934336, "step": 35515 }, { "epoch": 0.8677595094422593, "grad_norm": 28.087440490722656, "learning_rate": 1.9672280822978e-06, "loss": 0.093, "num_input_tokens_seen": 23937856, "step": 35520 }, { "epoch": 0.8678816602741065, "grad_norm": 1.3667480945587158, "learning_rate": 1.9672064260213565e-06, "loss": 0.1593, "num_input_tokens_seen": 23941312, "step": 35525 }, { "epoch": 0.8680038111059536, "grad_norm": 24.725561141967773, "learning_rate": 1.9671847627111273e-06, "loss": 0.1556, "num_input_tokens_seen": 23944256, "step": 35530 }, { "epoch": 0.8681259619378008, "grad_norm": 2.158342123031616, "learning_rate": 1.967163092367271e-06, "loss": 0.0886, "num_input_tokens_seen": 23947840, "step": 35535 }, { "epoch": 0.868248112769648, "grad_norm": 29.596519470214844, "learning_rate": 1.9671414149899438e-06, "loss": 0.0558, "num_input_tokens_seen": 23951040, "step": 35540 }, { "epoch": 0.8683702636014952, "grad_norm": 50.23384094238281, "learning_rate": 1.9671197305793044e-06, "loss": 0.1845, "num_input_tokens_seen": 23953984, "step": 35545 }, { "epoch": 0.8684924144333422, "grad_norm": 0.8262698650360107, "learning_rate": 1.9670980391355104e-06, "loss": 0.0353, "num_input_tokens_seen": 23957184, "step": 35550 }, { "epoch": 0.8686145652651894, "grad_norm": 24.694656372070312, "learning_rate": 1.9670763406587192e-06, "loss": 0.1873, "num_input_tokens_seen": 23960640, "step": 35555 }, { "epoch": 0.8687367160970366, "grad_norm": 0.3127105236053467, "learning_rate": 1.967054635149089e-06, "loss": 0.0577, "num_input_tokens_seen": 23963904, "step": 35560 }, { "epoch": 0.8688588669288838, "grad_norm": 12.424739837646484, "learning_rate": 1.967032922606777e-06, "loss": 0.121, "num_input_tokens_seen": 23967104, "step": 35565 }, { "epoch": 0.868981017760731, "grad_norm": 9.423223495483398, "learning_rate": 1.9670112030319412e-06, "loss": 0.0403, "num_input_tokens_seen": 23970240, "step": 35570 }, { "epoch": 0.8691031685925781, "grad_norm": 15.91766357421875, "learning_rate": 1.9669894764247403e-06, "loss": 0.187, "num_input_tokens_seen": 23973760, "step": 35575 }, { "epoch": 0.8692253194244253, "grad_norm": 13.034796714782715, "learning_rate": 1.966967742785332e-06, "loss": 0.1906, "num_input_tokens_seen": 23976960, "step": 35580 }, { "epoch": 0.8693474702562725, "grad_norm": 0.9588425755500793, "learning_rate": 1.966946002113874e-06, "loss": 0.0527, "num_input_tokens_seen": 23980352, "step": 35585 }, { "epoch": 0.8694696210881196, "grad_norm": 8.451891899108887, "learning_rate": 1.9669242544105245e-06, "loss": 0.0576, "num_input_tokens_seen": 23983872, "step": 35590 }, { "epoch": 0.8695917719199667, "grad_norm": 2.5281105041503906, "learning_rate": 1.966902499675442e-06, "loss": 0.0556, "num_input_tokens_seen": 23987520, "step": 35595 }, { "epoch": 0.8697139227518139, "grad_norm": 8.671502113342285, "learning_rate": 1.9668807379087843e-06, "loss": 0.1358, "num_input_tokens_seen": 23991232, "step": 35600 }, { "epoch": 0.8698360735836611, "grad_norm": 3.297170639038086, "learning_rate": 1.9668589691107096e-06, "loss": 0.1904, "num_input_tokens_seen": 23994816, "step": 35605 }, { "epoch": 0.8699582244155083, "grad_norm": 37.647254943847656, "learning_rate": 1.966837193281377e-06, "loss": 0.0965, "num_input_tokens_seen": 23997952, "step": 35610 }, { "epoch": 0.8700803752473555, "grad_norm": 20.210281372070312, "learning_rate": 1.9668154104209438e-06, "loss": 0.0625, "num_input_tokens_seen": 24001088, "step": 35615 }, { "epoch": 0.8702025260792026, "grad_norm": 1.265897512435913, "learning_rate": 1.966793620529569e-06, "loss": 0.1092, "num_input_tokens_seen": 24004992, "step": 35620 }, { "epoch": 0.8703246769110498, "grad_norm": 0.6645774841308594, "learning_rate": 1.9667718236074106e-06, "loss": 0.1368, "num_input_tokens_seen": 24008960, "step": 35625 }, { "epoch": 0.870446827742897, "grad_norm": 1.741341233253479, "learning_rate": 1.966750019654628e-06, "loss": 0.0791, "num_input_tokens_seen": 24012352, "step": 35630 }, { "epoch": 0.8705689785747441, "grad_norm": 0.28083935379981995, "learning_rate": 1.966728208671379e-06, "loss": 0.0905, "num_input_tokens_seen": 24015936, "step": 35635 }, { "epoch": 0.8706911294065912, "grad_norm": 22.628009796142578, "learning_rate": 1.9667063906578226e-06, "loss": 0.1447, "num_input_tokens_seen": 24019200, "step": 35640 }, { "epoch": 0.8708132802384384, "grad_norm": 0.0826408714056015, "learning_rate": 1.966684565614117e-06, "loss": 0.219, "num_input_tokens_seen": 24022592, "step": 35645 }, { "epoch": 0.8709354310702856, "grad_norm": 18.106290817260742, "learning_rate": 1.9666627335404214e-06, "loss": 0.1551, "num_input_tokens_seen": 24025984, "step": 35650 }, { "epoch": 0.8710575819021328, "grad_norm": 51.1240119934082, "learning_rate": 1.9666408944368948e-06, "loss": 0.1494, "num_input_tokens_seen": 24030080, "step": 35655 }, { "epoch": 0.87117973273398, "grad_norm": 1.2830450534820557, "learning_rate": 1.966619048303695e-06, "loss": 0.0704, "num_input_tokens_seen": 24033344, "step": 35660 }, { "epoch": 0.871301883565827, "grad_norm": 15.251612663269043, "learning_rate": 1.966597195140982e-06, "loss": 0.1243, "num_input_tokens_seen": 24036480, "step": 35665 }, { "epoch": 0.8714240343976742, "grad_norm": 10.089798927307129, "learning_rate": 1.966575334948914e-06, "loss": 0.267, "num_input_tokens_seen": 24039552, "step": 35670 }, { "epoch": 0.8715461852295214, "grad_norm": 2.0029706954956055, "learning_rate": 1.9665534677276502e-06, "loss": 0.0807, "num_input_tokens_seen": 24042816, "step": 35675 }, { "epoch": 0.8716683360613686, "grad_norm": 0.9593586921691895, "learning_rate": 1.9665315934773495e-06, "loss": 0.1276, "num_input_tokens_seen": 24046016, "step": 35680 }, { "epoch": 0.8717904868932157, "grad_norm": 16.183738708496094, "learning_rate": 1.966509712198171e-06, "loss": 0.083, "num_input_tokens_seen": 24049664, "step": 35685 }, { "epoch": 0.8719126377250629, "grad_norm": 10.109683990478516, "learning_rate": 1.966487823890274e-06, "loss": 0.1309, "num_input_tokens_seen": 24053248, "step": 35690 }, { "epoch": 0.8720347885569101, "grad_norm": 25.217132568359375, "learning_rate": 1.966465928553818e-06, "loss": 0.0964, "num_input_tokens_seen": 24056896, "step": 35695 }, { "epoch": 0.8721569393887573, "grad_norm": 11.477954864501953, "learning_rate": 1.9664440261889614e-06, "loss": 0.0599, "num_input_tokens_seen": 24060480, "step": 35700 }, { "epoch": 0.8722790902206043, "grad_norm": 19.288284301757812, "learning_rate": 1.966422116795864e-06, "loss": 0.076, "num_input_tokens_seen": 24063616, "step": 35705 }, { "epoch": 0.8724012410524515, "grad_norm": 2.0639655590057373, "learning_rate": 1.966400200374685e-06, "loss": 0.0638, "num_input_tokens_seen": 24067136, "step": 35710 }, { "epoch": 0.8725233918842987, "grad_norm": 9.67798137664795, "learning_rate": 1.9663782769255837e-06, "loss": 0.0867, "num_input_tokens_seen": 24070784, "step": 35715 }, { "epoch": 0.8726455427161459, "grad_norm": 30.0969295501709, "learning_rate": 1.9663563464487197e-06, "loss": 0.0867, "num_input_tokens_seen": 24074240, "step": 35720 }, { "epoch": 0.8727676935479931, "grad_norm": 42.898590087890625, "learning_rate": 1.9663344089442524e-06, "loss": 0.0729, "num_input_tokens_seen": 24077952, "step": 35725 }, { "epoch": 0.8728898443798402, "grad_norm": 2.0448718070983887, "learning_rate": 1.9663124644123416e-06, "loss": 0.2075, "num_input_tokens_seen": 24081344, "step": 35730 }, { "epoch": 0.8730119952116874, "grad_norm": 0.13667216897010803, "learning_rate": 1.9662905128531464e-06, "loss": 0.0498, "num_input_tokens_seen": 24084608, "step": 35735 }, { "epoch": 0.8731341460435346, "grad_norm": 3.772686243057251, "learning_rate": 1.966268554266827e-06, "loss": 0.0882, "num_input_tokens_seen": 24088064, "step": 35740 }, { "epoch": 0.8732562968753818, "grad_norm": 28.097793579101562, "learning_rate": 1.9662465886535426e-06, "loss": 0.1407, "num_input_tokens_seen": 24091776, "step": 35745 }, { "epoch": 0.8733784477072288, "grad_norm": 20.13945198059082, "learning_rate": 1.966224616013453e-06, "loss": 0.2092, "num_input_tokens_seen": 24095104, "step": 35750 }, { "epoch": 0.873500598539076, "grad_norm": 2.7947781085968018, "learning_rate": 1.9662026363467183e-06, "loss": 0.1278, "num_input_tokens_seen": 24098112, "step": 35755 }, { "epoch": 0.8736227493709232, "grad_norm": 11.682032585144043, "learning_rate": 1.966180649653498e-06, "loss": 0.2807, "num_input_tokens_seen": 24101568, "step": 35760 }, { "epoch": 0.8737449002027704, "grad_norm": 26.217021942138672, "learning_rate": 1.966158655933952e-06, "loss": 0.0905, "num_input_tokens_seen": 24104960, "step": 35765 }, { "epoch": 0.8738670510346176, "grad_norm": 8.426963806152344, "learning_rate": 1.966136655188241e-06, "loss": 0.1174, "num_input_tokens_seen": 24108480, "step": 35770 }, { "epoch": 0.8739892018664647, "grad_norm": 10.167957305908203, "learning_rate": 1.966114647416524e-06, "loss": 0.0826, "num_input_tokens_seen": 24111552, "step": 35775 }, { "epoch": 0.8741113526983119, "grad_norm": 13.67619800567627, "learning_rate": 1.9660926326189613e-06, "loss": 0.0744, "num_input_tokens_seen": 24114752, "step": 35780 }, { "epoch": 0.874233503530159, "grad_norm": 37.29083251953125, "learning_rate": 1.9660706107957134e-06, "loss": 0.1183, "num_input_tokens_seen": 24118016, "step": 35785 }, { "epoch": 0.8743556543620062, "grad_norm": 22.16767120361328, "learning_rate": 1.9660485819469395e-06, "loss": 0.174, "num_input_tokens_seen": 24121280, "step": 35790 }, { "epoch": 0.8744778051938533, "grad_norm": 0.2441328465938568, "learning_rate": 1.9660265460728015e-06, "loss": 0.0505, "num_input_tokens_seen": 24125312, "step": 35795 }, { "epoch": 0.8745999560257005, "grad_norm": 47.24000930786133, "learning_rate": 1.9660045031734578e-06, "loss": 0.0931, "num_input_tokens_seen": 24128256, "step": 35800 }, { "epoch": 0.8747221068575477, "grad_norm": 2.6885123252868652, "learning_rate": 1.96598245324907e-06, "loss": 0.1435, "num_input_tokens_seen": 24131584, "step": 35805 }, { "epoch": 0.8748442576893949, "grad_norm": 5.6425089836120605, "learning_rate": 1.965960396299798e-06, "loss": 0.0891, "num_input_tokens_seen": 24134912, "step": 35810 }, { "epoch": 0.8749664085212421, "grad_norm": 13.333474159240723, "learning_rate": 1.9659383323258017e-06, "loss": 0.1344, "num_input_tokens_seen": 24138304, "step": 35815 }, { "epoch": 0.8750885593530892, "grad_norm": 0.8057913184165955, "learning_rate": 1.9659162613272424e-06, "loss": 0.0714, "num_input_tokens_seen": 24141760, "step": 35820 }, { "epoch": 0.8752107101849363, "grad_norm": 30.415504455566406, "learning_rate": 1.9658941833042804e-06, "loss": 0.1576, "num_input_tokens_seen": 24144960, "step": 35825 }, { "epoch": 0.8753328610167835, "grad_norm": 5.3278326988220215, "learning_rate": 1.9658720982570757e-06, "loss": 0.0207, "num_input_tokens_seen": 24147904, "step": 35830 }, { "epoch": 0.8754550118486307, "grad_norm": 4.785195350646973, "learning_rate": 1.9658500061857897e-06, "loss": 0.1306, "num_input_tokens_seen": 24151104, "step": 35835 }, { "epoch": 0.8755771626804778, "grad_norm": 49.18978500366211, "learning_rate": 1.9658279070905825e-06, "loss": 0.1516, "num_input_tokens_seen": 24154432, "step": 35840 }, { "epoch": 0.875699313512325, "grad_norm": 11.545149803161621, "learning_rate": 1.9658058009716147e-06, "loss": 0.0642, "num_input_tokens_seen": 24157696, "step": 35845 }, { "epoch": 0.8758214643441722, "grad_norm": 10.62991714477539, "learning_rate": 1.965783687829048e-06, "loss": 0.0856, "num_input_tokens_seen": 24161152, "step": 35850 }, { "epoch": 0.8759436151760194, "grad_norm": 0.5424090623855591, "learning_rate": 1.965761567663042e-06, "loss": 0.2556, "num_input_tokens_seen": 24164416, "step": 35855 }, { "epoch": 0.8760657660078666, "grad_norm": 9.843696594238281, "learning_rate": 1.9657394404737582e-06, "loss": 0.0845, "num_input_tokens_seen": 24168000, "step": 35860 }, { "epoch": 0.8761879168397136, "grad_norm": 0.7926238179206848, "learning_rate": 1.9657173062613575e-06, "loss": 0.2504, "num_input_tokens_seen": 24171584, "step": 35865 }, { "epoch": 0.8763100676715608, "grad_norm": 18.188486099243164, "learning_rate": 1.965695165026001e-06, "loss": 0.1544, "num_input_tokens_seen": 24175168, "step": 35870 }, { "epoch": 0.876432218503408, "grad_norm": 1.0848253965377808, "learning_rate": 1.9656730167678494e-06, "loss": 0.0603, "num_input_tokens_seen": 24178432, "step": 35875 }, { "epoch": 0.8765543693352552, "grad_norm": 8.33212661743164, "learning_rate": 1.965650861487064e-06, "loss": 0.0844, "num_input_tokens_seen": 24181696, "step": 35880 }, { "epoch": 0.8766765201671023, "grad_norm": 8.785030364990234, "learning_rate": 1.9656286991838056e-06, "loss": 0.2487, "num_input_tokens_seen": 24184832, "step": 35885 }, { "epoch": 0.8767986709989495, "grad_norm": 34.26165771484375, "learning_rate": 1.9656065298582355e-06, "loss": 0.0238, "num_input_tokens_seen": 24188224, "step": 35890 }, { "epoch": 0.8769208218307967, "grad_norm": 27.247255325317383, "learning_rate": 1.9655843535105154e-06, "loss": 0.0667, "num_input_tokens_seen": 24191872, "step": 35895 }, { "epoch": 0.8770429726626439, "grad_norm": 4.9799957275390625, "learning_rate": 1.9655621701408057e-06, "loss": 0.063, "num_input_tokens_seen": 24195648, "step": 35900 }, { "epoch": 0.877165123494491, "grad_norm": 56.0616569519043, "learning_rate": 1.9655399797492683e-06, "loss": 0.1216, "num_input_tokens_seen": 24198976, "step": 35905 }, { "epoch": 0.8772872743263381, "grad_norm": 13.826311111450195, "learning_rate": 1.9655177823360645e-06, "loss": 0.0618, "num_input_tokens_seen": 24202304, "step": 35910 }, { "epoch": 0.8774094251581853, "grad_norm": 10.57601261138916, "learning_rate": 1.9654955779013557e-06, "loss": 0.1367, "num_input_tokens_seen": 24205504, "step": 35915 }, { "epoch": 0.8775315759900325, "grad_norm": 1.1815544366836548, "learning_rate": 1.9654733664453037e-06, "loss": 0.1328, "num_input_tokens_seen": 24209344, "step": 35920 }, { "epoch": 0.8776537268218797, "grad_norm": 12.932337760925293, "learning_rate": 1.9654511479680693e-06, "loss": 0.0406, "num_input_tokens_seen": 24212736, "step": 35925 }, { "epoch": 0.8777758776537268, "grad_norm": 17.29469871520996, "learning_rate": 1.9654289224698144e-06, "loss": 0.1151, "num_input_tokens_seen": 24216000, "step": 35930 }, { "epoch": 0.877898028485574, "grad_norm": 1.935131549835205, "learning_rate": 1.965406689950701e-06, "loss": 0.1295, "num_input_tokens_seen": 24219392, "step": 35935 }, { "epoch": 0.8780201793174212, "grad_norm": 0.7120591402053833, "learning_rate": 1.9653844504108906e-06, "loss": 0.1675, "num_input_tokens_seen": 24222784, "step": 35940 }, { "epoch": 0.8781423301492683, "grad_norm": 21.83226776123047, "learning_rate": 1.965362203850545e-06, "loss": 0.1002, "num_input_tokens_seen": 24225664, "step": 35945 }, { "epoch": 0.8782644809811155, "grad_norm": 0.8217087388038635, "learning_rate": 1.965339950269825e-06, "loss": 0.0102, "num_input_tokens_seen": 24229312, "step": 35950 }, { "epoch": 0.8783866318129626, "grad_norm": 17.43366050720215, "learning_rate": 1.9653176896688936e-06, "loss": 0.0219, "num_input_tokens_seen": 24232576, "step": 35955 }, { "epoch": 0.8785087826448098, "grad_norm": 15.183777809143066, "learning_rate": 1.965295422047912e-06, "loss": 0.0591, "num_input_tokens_seen": 24236096, "step": 35960 }, { "epoch": 0.878630933476657, "grad_norm": 10.468469619750977, "learning_rate": 1.965273147407043e-06, "loss": 0.1248, "num_input_tokens_seen": 24239680, "step": 35965 }, { "epoch": 0.8787530843085042, "grad_norm": 0.05369218438863754, "learning_rate": 1.965250865746448e-06, "loss": 0.0906, "num_input_tokens_seen": 24242816, "step": 35970 }, { "epoch": 0.8788752351403513, "grad_norm": 0.46325352787971497, "learning_rate": 1.9652285770662893e-06, "loss": 0.0539, "num_input_tokens_seen": 24246208, "step": 35975 }, { "epoch": 0.8789973859721985, "grad_norm": 47.057804107666016, "learning_rate": 1.965206281366728e-06, "loss": 0.088, "num_input_tokens_seen": 24249152, "step": 35980 }, { "epoch": 0.8791195368040456, "grad_norm": 24.42066192626953, "learning_rate": 1.9651839786479276e-06, "loss": 0.1986, "num_input_tokens_seen": 24252608, "step": 35985 }, { "epoch": 0.8792416876358928, "grad_norm": 1.8329250812530518, "learning_rate": 1.9651616689100498e-06, "loss": 0.1155, "num_input_tokens_seen": 24255680, "step": 35990 }, { "epoch": 0.8793638384677399, "grad_norm": 2.121000051498413, "learning_rate": 1.9651393521532563e-06, "loss": 0.1162, "num_input_tokens_seen": 24258880, "step": 35995 }, { "epoch": 0.8794859892995871, "grad_norm": 9.572766304016113, "learning_rate": 1.9651170283777098e-06, "loss": 0.19, "num_input_tokens_seen": 24262464, "step": 36000 }, { "epoch": 0.8796081401314343, "grad_norm": 23.756149291992188, "learning_rate": 1.9650946975835733e-06, "loss": 0.1447, "num_input_tokens_seen": 24265792, "step": 36005 }, { "epoch": 0.8797302909632815, "grad_norm": 12.380399703979492, "learning_rate": 1.9650723597710078e-06, "loss": 0.2048, "num_input_tokens_seen": 24268800, "step": 36010 }, { "epoch": 0.8798524417951287, "grad_norm": 18.560951232910156, "learning_rate": 1.965050014940177e-06, "loss": 0.1781, "num_input_tokens_seen": 24272192, "step": 36015 }, { "epoch": 0.8799745926269757, "grad_norm": 28.321802139282227, "learning_rate": 1.9650276630912427e-06, "loss": 0.0397, "num_input_tokens_seen": 24275840, "step": 36020 }, { "epoch": 0.8800967434588229, "grad_norm": 1.0200899839401245, "learning_rate": 1.965005304224368e-06, "loss": 0.1337, "num_input_tokens_seen": 24279296, "step": 36025 }, { "epoch": 0.8802188942906701, "grad_norm": 7.544238090515137, "learning_rate": 1.964982938339715e-06, "loss": 0.3578, "num_input_tokens_seen": 24282240, "step": 36030 }, { "epoch": 0.8803410451225173, "grad_norm": 23.50800323486328, "learning_rate": 1.9649605654374466e-06, "loss": 0.0886, "num_input_tokens_seen": 24285696, "step": 36035 }, { "epoch": 0.8804631959543644, "grad_norm": 0.366159588098526, "learning_rate": 1.964938185517725e-06, "loss": 0.0541, "num_input_tokens_seen": 24288704, "step": 36040 }, { "epoch": 0.8805853467862116, "grad_norm": 34.502220153808594, "learning_rate": 1.964915798580714e-06, "loss": 0.0778, "num_input_tokens_seen": 24291520, "step": 36045 }, { "epoch": 0.8807074976180588, "grad_norm": 32.538902282714844, "learning_rate": 1.9648934046265755e-06, "loss": 0.1126, "num_input_tokens_seen": 24294720, "step": 36050 }, { "epoch": 0.880829648449906, "grad_norm": 50.61782455444336, "learning_rate": 1.9648710036554726e-06, "loss": 0.1484, "num_input_tokens_seen": 24297984, "step": 36055 }, { "epoch": 0.8809517992817532, "grad_norm": 1.2384830713272095, "learning_rate": 1.9648485956675683e-06, "loss": 0.1794, "num_input_tokens_seen": 24301312, "step": 36060 }, { "epoch": 0.8810739501136002, "grad_norm": 14.007159233093262, "learning_rate": 1.9648261806630255e-06, "loss": 0.1554, "num_input_tokens_seen": 24304640, "step": 36065 }, { "epoch": 0.8811961009454474, "grad_norm": 24.65819549560547, "learning_rate": 1.964803758642007e-06, "loss": 0.0985, "num_input_tokens_seen": 24308480, "step": 36070 }, { "epoch": 0.8813182517772946, "grad_norm": 14.141555786132812, "learning_rate": 1.9647813296046766e-06, "loss": 0.1324, "num_input_tokens_seen": 24311680, "step": 36075 }, { "epoch": 0.8814404026091418, "grad_norm": 11.87048053741455, "learning_rate": 1.964758893551196e-06, "loss": 0.1911, "num_input_tokens_seen": 24314880, "step": 36080 }, { "epoch": 0.8815625534409889, "grad_norm": 19.78986358642578, "learning_rate": 1.96473645048173e-06, "loss": 0.0895, "num_input_tokens_seen": 24318400, "step": 36085 }, { "epoch": 0.8816847042728361, "grad_norm": 11.281794548034668, "learning_rate": 1.964714000396441e-06, "loss": 0.105, "num_input_tokens_seen": 24321536, "step": 36090 }, { "epoch": 0.8818068551046833, "grad_norm": 4.544544696807861, "learning_rate": 1.964691543295492e-06, "loss": 0.0495, "num_input_tokens_seen": 24324864, "step": 36095 }, { "epoch": 0.8819290059365305, "grad_norm": 13.858755111694336, "learning_rate": 1.964669079179047e-06, "loss": 0.0606, "num_input_tokens_seen": 24328000, "step": 36100 }, { "epoch": 0.8820511567683776, "grad_norm": 18.83074378967285, "learning_rate": 1.9646466080472686e-06, "loss": 0.0605, "num_input_tokens_seen": 24331008, "step": 36105 }, { "epoch": 0.8821733076002247, "grad_norm": 8.569042205810547, "learning_rate": 1.9646241299003206e-06, "loss": 0.1002, "num_input_tokens_seen": 24334080, "step": 36110 }, { "epoch": 0.8822954584320719, "grad_norm": 26.459529876708984, "learning_rate": 1.9646016447383665e-06, "loss": 0.1031, "num_input_tokens_seen": 24337472, "step": 36115 }, { "epoch": 0.8824176092639191, "grad_norm": 38.5592155456543, "learning_rate": 1.96457915256157e-06, "loss": 0.1362, "num_input_tokens_seen": 24340928, "step": 36120 }, { "epoch": 0.8825397600957663, "grad_norm": 17.598421096801758, "learning_rate": 1.9645566533700945e-06, "loss": 0.1261, "num_input_tokens_seen": 24344512, "step": 36125 }, { "epoch": 0.8826619109276134, "grad_norm": 5.009764671325684, "learning_rate": 1.9645341471641036e-06, "loss": 0.0493, "num_input_tokens_seen": 24347776, "step": 36130 }, { "epoch": 0.8827840617594606, "grad_norm": 8.929144859313965, "learning_rate": 1.964511633943761e-06, "loss": 0.0642, "num_input_tokens_seen": 24350976, "step": 36135 }, { "epoch": 0.8829062125913077, "grad_norm": 0.8995026350021362, "learning_rate": 1.9644891137092298e-06, "loss": 0.1568, "num_input_tokens_seen": 24354240, "step": 36140 }, { "epoch": 0.8830283634231549, "grad_norm": 0.6001954078674316, "learning_rate": 1.9644665864606747e-06, "loss": 0.0523, "num_input_tokens_seen": 24357312, "step": 36145 }, { "epoch": 0.8831505142550021, "grad_norm": 9.831707000732422, "learning_rate": 1.9644440521982594e-06, "loss": 0.1478, "num_input_tokens_seen": 24361024, "step": 36150 }, { "epoch": 0.8832726650868492, "grad_norm": 0.6541392207145691, "learning_rate": 1.9644215109221475e-06, "loss": 0.1884, "num_input_tokens_seen": 24364480, "step": 36155 }, { "epoch": 0.8833948159186964, "grad_norm": 7.485659122467041, "learning_rate": 1.9643989626325024e-06, "loss": 0.1435, "num_input_tokens_seen": 24367616, "step": 36160 }, { "epoch": 0.8835169667505436, "grad_norm": 15.767600059509277, "learning_rate": 1.9643764073294893e-06, "loss": 0.2141, "num_input_tokens_seen": 24370880, "step": 36165 }, { "epoch": 0.8836391175823908, "grad_norm": 21.12200164794922, "learning_rate": 1.9643538450132713e-06, "loss": 0.1681, "num_input_tokens_seen": 24374144, "step": 36170 }, { "epoch": 0.8837612684142379, "grad_norm": 31.78057289123535, "learning_rate": 1.964331275684013e-06, "loss": 0.0899, "num_input_tokens_seen": 24377344, "step": 36175 }, { "epoch": 0.883883419246085, "grad_norm": 0.7445668578147888, "learning_rate": 1.964308699341878e-06, "loss": 0.0782, "num_input_tokens_seen": 24380480, "step": 36180 }, { "epoch": 0.8840055700779322, "grad_norm": 4.764310836791992, "learning_rate": 1.964286115987031e-06, "loss": 0.0687, "num_input_tokens_seen": 24384000, "step": 36185 }, { "epoch": 0.8841277209097794, "grad_norm": 0.4389335513114929, "learning_rate": 1.9642635256196356e-06, "loss": 0.0591, "num_input_tokens_seen": 24387520, "step": 36190 }, { "epoch": 0.8842498717416266, "grad_norm": 16.75498390197754, "learning_rate": 1.9642409282398573e-06, "loss": 0.1161, "num_input_tokens_seen": 24391424, "step": 36195 }, { "epoch": 0.8843720225734737, "grad_norm": 35.38811111450195, "learning_rate": 1.964218323847859e-06, "loss": 0.1667, "num_input_tokens_seen": 24394560, "step": 36200 }, { "epoch": 0.8844941734053209, "grad_norm": 6.369503498077393, "learning_rate": 1.9641957124438058e-06, "loss": 0.0912, "num_input_tokens_seen": 24398080, "step": 36205 }, { "epoch": 0.8846163242371681, "grad_norm": 27.021238327026367, "learning_rate": 1.9641730940278617e-06, "loss": 0.097, "num_input_tokens_seen": 24401408, "step": 36210 }, { "epoch": 0.8847384750690153, "grad_norm": 17.28885269165039, "learning_rate": 1.964150468600192e-06, "loss": 0.1731, "num_input_tokens_seen": 24404736, "step": 36215 }, { "epoch": 0.8848606259008623, "grad_norm": 1.7445902824401855, "learning_rate": 1.964127836160961e-06, "loss": 0.0174, "num_input_tokens_seen": 24408000, "step": 36220 }, { "epoch": 0.8849827767327095, "grad_norm": 10.038021087646484, "learning_rate": 1.964105196710332e-06, "loss": 0.1235, "num_input_tokens_seen": 24411136, "step": 36225 }, { "epoch": 0.8851049275645567, "grad_norm": 25.007415771484375, "learning_rate": 1.9640825502484716e-06, "loss": 0.0469, "num_input_tokens_seen": 24414720, "step": 36230 }, { "epoch": 0.8852270783964039, "grad_norm": 16.24190330505371, "learning_rate": 1.9640598967755435e-06, "loss": 0.1697, "num_input_tokens_seen": 24418112, "step": 36235 }, { "epoch": 0.885349229228251, "grad_norm": 22.841869354248047, "learning_rate": 1.9640372362917128e-06, "loss": 0.1409, "num_input_tokens_seen": 24421184, "step": 36240 }, { "epoch": 0.8854713800600982, "grad_norm": 0.6201678514480591, "learning_rate": 1.964014568797143e-06, "loss": 0.0835, "num_input_tokens_seen": 24424448, "step": 36245 }, { "epoch": 0.8855935308919454, "grad_norm": 17.243867874145508, "learning_rate": 1.963991894292001e-06, "loss": 0.1017, "num_input_tokens_seen": 24427968, "step": 36250 }, { "epoch": 0.8857156817237926, "grad_norm": 40.84526443481445, "learning_rate": 1.9639692127764504e-06, "loss": 0.1615, "num_input_tokens_seen": 24431808, "step": 36255 }, { "epoch": 0.8858378325556397, "grad_norm": 12.356513023376465, "learning_rate": 1.9639465242506563e-06, "loss": 0.1469, "num_input_tokens_seen": 24435648, "step": 36260 }, { "epoch": 0.8859599833874868, "grad_norm": 11.762965202331543, "learning_rate": 1.9639238287147836e-06, "loss": 0.084, "num_input_tokens_seen": 24438720, "step": 36265 }, { "epoch": 0.886082134219334, "grad_norm": 0.16327081620693207, "learning_rate": 1.963901126168998e-06, "loss": 0.1053, "num_input_tokens_seen": 24442240, "step": 36270 }, { "epoch": 0.8862042850511812, "grad_norm": 7.906080722808838, "learning_rate": 1.9638784166134636e-06, "loss": 0.0492, "num_input_tokens_seen": 24445184, "step": 36275 }, { "epoch": 0.8863264358830284, "grad_norm": 3.6426432132720947, "learning_rate": 1.9638557000483463e-06, "loss": 0.0825, "num_input_tokens_seen": 24449280, "step": 36280 }, { "epoch": 0.8864485867148755, "grad_norm": 1.1091848611831665, "learning_rate": 1.963832976473811e-06, "loss": 0.0363, "num_input_tokens_seen": 24452608, "step": 36285 }, { "epoch": 0.8865707375467227, "grad_norm": 26.937637329101562, "learning_rate": 1.9638102458900236e-06, "loss": 0.0744, "num_input_tokens_seen": 24455616, "step": 36290 }, { "epoch": 0.8866928883785699, "grad_norm": 5.537595272064209, "learning_rate": 1.963787508297148e-06, "loss": 0.0468, "num_input_tokens_seen": 24459072, "step": 36295 }, { "epoch": 0.886815039210417, "grad_norm": 1.8377037048339844, "learning_rate": 1.9637647636953513e-06, "loss": 0.1084, "num_input_tokens_seen": 24462400, "step": 36300 }, { "epoch": 0.8869371900422642, "grad_norm": 2.740536689758301, "learning_rate": 1.9637420120847976e-06, "loss": 0.1197, "num_input_tokens_seen": 24465152, "step": 36305 }, { "epoch": 0.8870593408741113, "grad_norm": 1.6049529314041138, "learning_rate": 1.9637192534656528e-06, "loss": 0.0815, "num_input_tokens_seen": 24468160, "step": 36310 }, { "epoch": 0.8871814917059585, "grad_norm": 0.09656000882387161, "learning_rate": 1.9636964878380824e-06, "loss": 0.1247, "num_input_tokens_seen": 24470976, "step": 36315 }, { "epoch": 0.8873036425378057, "grad_norm": 0.3467960059642792, "learning_rate": 1.963673715202252e-06, "loss": 0.063, "num_input_tokens_seen": 24474752, "step": 36320 }, { "epoch": 0.8874257933696529, "grad_norm": 10.848044395446777, "learning_rate": 1.9636509355583268e-06, "loss": 0.1894, "num_input_tokens_seen": 24478464, "step": 36325 }, { "epoch": 0.8875479442015, "grad_norm": 19.020444869995117, "learning_rate": 1.9636281489064732e-06, "loss": 0.0863, "num_input_tokens_seen": 24481856, "step": 36330 }, { "epoch": 0.8876700950333472, "grad_norm": 30.788227081298828, "learning_rate": 1.9636053552468565e-06, "loss": 0.122, "num_input_tokens_seen": 24485120, "step": 36335 }, { "epoch": 0.8877922458651943, "grad_norm": 15.392607688903809, "learning_rate": 1.963582554579642e-06, "loss": 0.0674, "num_input_tokens_seen": 24488576, "step": 36340 }, { "epoch": 0.8879143966970415, "grad_norm": 11.759366035461426, "learning_rate": 1.9635597469049963e-06, "loss": 0.1177, "num_input_tokens_seen": 24492096, "step": 36345 }, { "epoch": 0.8880365475288887, "grad_norm": 11.81397533416748, "learning_rate": 1.9635369322230852e-06, "loss": 0.2048, "num_input_tokens_seen": 24495424, "step": 36350 }, { "epoch": 0.8881586983607358, "grad_norm": 11.798083305358887, "learning_rate": 1.9635141105340742e-06, "loss": 0.1205, "num_input_tokens_seen": 24498880, "step": 36355 }, { "epoch": 0.888280849192583, "grad_norm": 14.193384170532227, "learning_rate": 1.9634912818381293e-06, "loss": 0.1286, "num_input_tokens_seen": 24502336, "step": 36360 }, { "epoch": 0.8884030000244302, "grad_norm": 1.707037091255188, "learning_rate": 1.963468446135417e-06, "loss": 0.2572, "num_input_tokens_seen": 24505408, "step": 36365 }, { "epoch": 0.8885251508562774, "grad_norm": 11.981134414672852, "learning_rate": 1.9634456034261025e-06, "loss": 0.1699, "num_input_tokens_seen": 24509056, "step": 36370 }, { "epoch": 0.8886473016881244, "grad_norm": 0.7846937775611877, "learning_rate": 1.963422753710353e-06, "loss": 0.0129, "num_input_tokens_seen": 24512128, "step": 36375 }, { "epoch": 0.8887694525199716, "grad_norm": 10.940505027770996, "learning_rate": 1.9633998969883335e-06, "loss": 0.1042, "num_input_tokens_seen": 24515648, "step": 36380 }, { "epoch": 0.8888916033518188, "grad_norm": 13.387989044189453, "learning_rate": 1.963377033260211e-06, "loss": 0.0286, "num_input_tokens_seen": 24518848, "step": 36385 }, { "epoch": 0.889013754183666, "grad_norm": 1.929477572441101, "learning_rate": 1.963354162526152e-06, "loss": 0.0204, "num_input_tokens_seen": 24523072, "step": 36390 }, { "epoch": 0.8891359050155132, "grad_norm": 1.1878111362457275, "learning_rate": 1.963331284786322e-06, "loss": 0.1326, "num_input_tokens_seen": 24526976, "step": 36395 }, { "epoch": 0.8892580558473603, "grad_norm": 11.675740242004395, "learning_rate": 1.9633084000408883e-06, "loss": 0.1465, "num_input_tokens_seen": 24529984, "step": 36400 }, { "epoch": 0.8893802066792075, "grad_norm": 12.716568946838379, "learning_rate": 1.9632855082900163e-06, "loss": 0.0548, "num_input_tokens_seen": 24533120, "step": 36405 }, { "epoch": 0.8895023575110547, "grad_norm": 0.6486952900886536, "learning_rate": 1.9632626095338735e-06, "loss": 0.0965, "num_input_tokens_seen": 24536576, "step": 36410 }, { "epoch": 0.8896245083429019, "grad_norm": 0.16108357906341553, "learning_rate": 1.963239703772625e-06, "loss": 0.0375, "num_input_tokens_seen": 24539968, "step": 36415 }, { "epoch": 0.8897466591747489, "grad_norm": 0.5907543301582336, "learning_rate": 1.963216791006439e-06, "loss": 0.1732, "num_input_tokens_seen": 24543296, "step": 36420 }, { "epoch": 0.8898688100065961, "grad_norm": 34.28215026855469, "learning_rate": 1.9631938712354815e-06, "loss": 0.1472, "num_input_tokens_seen": 24546368, "step": 36425 }, { "epoch": 0.8899909608384433, "grad_norm": 17.69078254699707, "learning_rate": 1.9631709444599187e-06, "loss": 0.2331, "num_input_tokens_seen": 24549952, "step": 36430 }, { "epoch": 0.8901131116702905, "grad_norm": 0.20918723940849304, "learning_rate": 1.963148010679918e-06, "loss": 0.1199, "num_input_tokens_seen": 24553088, "step": 36435 }, { "epoch": 0.8902352625021377, "grad_norm": 17.097530364990234, "learning_rate": 1.963125069895646e-06, "loss": 0.1703, "num_input_tokens_seen": 24556288, "step": 36440 }, { "epoch": 0.8903574133339848, "grad_norm": 1.6573656797409058, "learning_rate": 1.9631021221072693e-06, "loss": 0.0402, "num_input_tokens_seen": 24559360, "step": 36445 }, { "epoch": 0.890479564165832, "grad_norm": 8.112147331237793, "learning_rate": 1.9630791673149546e-06, "loss": 0.0887, "num_input_tokens_seen": 24562624, "step": 36450 }, { "epoch": 0.8906017149976792, "grad_norm": 2.700888156890869, "learning_rate": 1.9630562055188697e-06, "loss": 0.0766, "num_input_tokens_seen": 24566336, "step": 36455 }, { "epoch": 0.8907238658295263, "grad_norm": 25.177814483642578, "learning_rate": 1.963033236719181e-06, "loss": 0.1509, "num_input_tokens_seen": 24569856, "step": 36460 }, { "epoch": 0.8908460166613734, "grad_norm": 2.9992785453796387, "learning_rate": 1.963010260916055e-06, "loss": 0.0133, "num_input_tokens_seen": 24573440, "step": 36465 }, { "epoch": 0.8909681674932206, "grad_norm": 36.274112701416016, "learning_rate": 1.9629872781096597e-06, "loss": 0.214, "num_input_tokens_seen": 24576768, "step": 36470 }, { "epoch": 0.8910903183250678, "grad_norm": 1.0101815462112427, "learning_rate": 1.9629642883001624e-06, "loss": 0.2347, "num_input_tokens_seen": 24579904, "step": 36475 }, { "epoch": 0.891212469156915, "grad_norm": 6.083399295806885, "learning_rate": 1.962941291487729e-06, "loss": 0.0076, "num_input_tokens_seen": 24583040, "step": 36480 }, { "epoch": 0.8913346199887622, "grad_norm": 27.649301528930664, "learning_rate": 1.962918287672528e-06, "loss": 0.1516, "num_input_tokens_seen": 24586432, "step": 36485 }, { "epoch": 0.8914567708206093, "grad_norm": 0.9985415935516357, "learning_rate": 1.962895276854726e-06, "loss": 0.1095, "num_input_tokens_seen": 24590080, "step": 36490 }, { "epoch": 0.8915789216524564, "grad_norm": 34.66645812988281, "learning_rate": 1.9628722590344905e-06, "loss": 0.1351, "num_input_tokens_seen": 24593920, "step": 36495 }, { "epoch": 0.8917010724843036, "grad_norm": 24.54486656188965, "learning_rate": 1.9628492342119892e-06, "loss": 0.1019, "num_input_tokens_seen": 24597568, "step": 36500 }, { "epoch": 0.8918232233161508, "grad_norm": 10.25721263885498, "learning_rate": 1.9628262023873893e-06, "loss": 0.1333, "num_input_tokens_seen": 24600896, "step": 36505 }, { "epoch": 0.8919453741479979, "grad_norm": 8.492855072021484, "learning_rate": 1.962803163560858e-06, "loss": 0.0895, "num_input_tokens_seen": 24604288, "step": 36510 }, { "epoch": 0.8920675249798451, "grad_norm": 18.10352325439453, "learning_rate": 1.9627801177325635e-06, "loss": 0.14, "num_input_tokens_seen": 24607552, "step": 36515 }, { "epoch": 0.8921896758116923, "grad_norm": 23.538808822631836, "learning_rate": 1.9627570649026726e-06, "loss": 0.1, "num_input_tokens_seen": 24610944, "step": 36520 }, { "epoch": 0.8923118266435395, "grad_norm": 5.911246299743652, "learning_rate": 1.9627340050713535e-06, "loss": 0.0558, "num_input_tokens_seen": 24614016, "step": 36525 }, { "epoch": 0.8924339774753866, "grad_norm": 15.545068740844727, "learning_rate": 1.9627109382387743e-06, "loss": 0.1489, "num_input_tokens_seen": 24617344, "step": 36530 }, { "epoch": 0.8925561283072337, "grad_norm": 25.84769058227539, "learning_rate": 1.9626878644051014e-06, "loss": 0.1632, "num_input_tokens_seen": 24621376, "step": 36535 }, { "epoch": 0.8926782791390809, "grad_norm": 20.257658004760742, "learning_rate": 1.962664783570504e-06, "loss": 0.1333, "num_input_tokens_seen": 24624896, "step": 36540 }, { "epoch": 0.8928004299709281, "grad_norm": 15.873553276062012, "learning_rate": 1.962641695735149e-06, "loss": 0.0889, "num_input_tokens_seen": 24628096, "step": 36545 }, { "epoch": 0.8929225808027753, "grad_norm": 1.569334864616394, "learning_rate": 1.962618600899205e-06, "loss": 0.0207, "num_input_tokens_seen": 24631296, "step": 36550 }, { "epoch": 0.8930447316346224, "grad_norm": 11.255741119384766, "learning_rate": 1.9625954990628396e-06, "loss": 0.0303, "num_input_tokens_seen": 24634560, "step": 36555 }, { "epoch": 0.8931668824664696, "grad_norm": 19.29964256286621, "learning_rate": 1.9625723902262205e-06, "loss": 0.1629, "num_input_tokens_seen": 24638016, "step": 36560 }, { "epoch": 0.8932890332983168, "grad_norm": 28.87752342224121, "learning_rate": 1.9625492743895166e-06, "loss": 0.0578, "num_input_tokens_seen": 24641536, "step": 36565 }, { "epoch": 0.893411184130164, "grad_norm": 10.156953811645508, "learning_rate": 1.962526151552895e-06, "loss": 0.0397, "num_input_tokens_seen": 24644608, "step": 36570 }, { "epoch": 0.893533334962011, "grad_norm": 28.341548919677734, "learning_rate": 1.9625030217165243e-06, "loss": 0.1283, "num_input_tokens_seen": 24648000, "step": 36575 }, { "epoch": 0.8936554857938582, "grad_norm": 14.793594360351562, "learning_rate": 1.962479884880573e-06, "loss": 0.0591, "num_input_tokens_seen": 24651712, "step": 36580 }, { "epoch": 0.8937776366257054, "grad_norm": 41.583290100097656, "learning_rate": 1.962456741045209e-06, "loss": 0.1834, "num_input_tokens_seen": 24654528, "step": 36585 }, { "epoch": 0.8938997874575526, "grad_norm": 0.33410540223121643, "learning_rate": 1.9624335902106005e-06, "loss": 0.0691, "num_input_tokens_seen": 24657920, "step": 36590 }, { "epoch": 0.8940219382893998, "grad_norm": 11.096932411193848, "learning_rate": 1.9624104323769167e-06, "loss": 0.096, "num_input_tokens_seen": 24661632, "step": 36595 }, { "epoch": 0.8941440891212469, "grad_norm": 17.332895278930664, "learning_rate": 1.962387267544325e-06, "loss": 0.1534, "num_input_tokens_seen": 24664960, "step": 36600 }, { "epoch": 0.8942662399530941, "grad_norm": 62.588287353515625, "learning_rate": 1.962364095712994e-06, "loss": 0.1002, "num_input_tokens_seen": 24668480, "step": 36605 }, { "epoch": 0.8943883907849413, "grad_norm": 47.255455017089844, "learning_rate": 1.962340916883093e-06, "loss": 0.1503, "num_input_tokens_seen": 24672064, "step": 36610 }, { "epoch": 0.8945105416167884, "grad_norm": 15.527400970458984, "learning_rate": 1.9623177310547898e-06, "loss": 0.0608, "num_input_tokens_seen": 24675392, "step": 36615 }, { "epoch": 0.8946326924486355, "grad_norm": 1.301206111907959, "learning_rate": 1.9622945382282526e-06, "loss": 0.2064, "num_input_tokens_seen": 24678528, "step": 36620 }, { "epoch": 0.8947548432804827, "grad_norm": 1.0320180654525757, "learning_rate": 1.9622713384036517e-06, "loss": 0.1047, "num_input_tokens_seen": 24681984, "step": 36625 }, { "epoch": 0.8948769941123299, "grad_norm": 1.01441490650177, "learning_rate": 1.962248131581154e-06, "loss": 0.0891, "num_input_tokens_seen": 24685376, "step": 36630 }, { "epoch": 0.8949991449441771, "grad_norm": 0.6056206822395325, "learning_rate": 1.962224917760929e-06, "loss": 0.0213, "num_input_tokens_seen": 24688576, "step": 36635 }, { "epoch": 0.8951212957760243, "grad_norm": 33.90119552612305, "learning_rate": 1.9622016969431465e-06, "loss": 0.1229, "num_input_tokens_seen": 24692160, "step": 36640 }, { "epoch": 0.8952434466078714, "grad_norm": 22.38607406616211, "learning_rate": 1.9621784691279737e-06, "loss": 0.0524, "num_input_tokens_seen": 24695424, "step": 36645 }, { "epoch": 0.8953655974397186, "grad_norm": 30.155961990356445, "learning_rate": 1.962155234315581e-06, "loss": 0.0897, "num_input_tokens_seen": 24698496, "step": 36650 }, { "epoch": 0.8954877482715657, "grad_norm": 13.02365779876709, "learning_rate": 1.962131992506136e-06, "loss": 0.1008, "num_input_tokens_seen": 24701440, "step": 36655 }, { "epoch": 0.8956098991034129, "grad_norm": 0.4950132966041565, "learning_rate": 1.9621087436998083e-06, "loss": 0.1414, "num_input_tokens_seen": 24704576, "step": 36660 }, { "epoch": 0.89573204993526, "grad_norm": 51.34440994262695, "learning_rate": 1.9620854878967675e-06, "loss": 0.0464, "num_input_tokens_seen": 24707392, "step": 36665 }, { "epoch": 0.8958542007671072, "grad_norm": 26.741159439086914, "learning_rate": 1.9620622250971817e-06, "loss": 0.1001, "num_input_tokens_seen": 24710976, "step": 36670 }, { "epoch": 0.8959763515989544, "grad_norm": 55.84865951538086, "learning_rate": 1.9620389553012214e-06, "loss": 0.1594, "num_input_tokens_seen": 24713984, "step": 36675 }, { "epoch": 0.8960985024308016, "grad_norm": 12.856290817260742, "learning_rate": 1.9620156785090544e-06, "loss": 0.0624, "num_input_tokens_seen": 24717504, "step": 36680 }, { "epoch": 0.8962206532626488, "grad_norm": 29.340457916259766, "learning_rate": 1.961992394720851e-06, "loss": 0.1965, "num_input_tokens_seen": 24721024, "step": 36685 }, { "epoch": 0.8963428040944958, "grad_norm": 1.2017858028411865, "learning_rate": 1.9619691039367798e-06, "loss": 0.0845, "num_input_tokens_seen": 24724928, "step": 36690 }, { "epoch": 0.896464954926343, "grad_norm": 0.448015421628952, "learning_rate": 1.961945806157011e-06, "loss": 0.0655, "num_input_tokens_seen": 24728448, "step": 36695 }, { "epoch": 0.8965871057581902, "grad_norm": 0.06794452667236328, "learning_rate": 1.9619225013817133e-06, "loss": 0.1331, "num_input_tokens_seen": 24732032, "step": 36700 }, { "epoch": 0.8967092565900374, "grad_norm": 8.08639144897461, "learning_rate": 1.9618991896110565e-06, "loss": 0.0997, "num_input_tokens_seen": 24735488, "step": 36705 }, { "epoch": 0.8968314074218845, "grad_norm": 0.6464542746543884, "learning_rate": 1.96187587084521e-06, "loss": 0.0924, "num_input_tokens_seen": 24739008, "step": 36710 }, { "epoch": 0.8969535582537317, "grad_norm": 22.804977416992188, "learning_rate": 1.9618525450843432e-06, "loss": 0.0832, "num_input_tokens_seen": 24742080, "step": 36715 }, { "epoch": 0.8970757090855789, "grad_norm": 12.513947486877441, "learning_rate": 1.9618292123286264e-06, "loss": 0.2628, "num_input_tokens_seen": 24745792, "step": 36720 }, { "epoch": 0.8971978599174261, "grad_norm": 7.804723262786865, "learning_rate": 1.961805872578229e-06, "loss": 0.0718, "num_input_tokens_seen": 24748928, "step": 36725 }, { "epoch": 0.8973200107492733, "grad_norm": 19.41393280029297, "learning_rate": 1.9617825258333204e-06, "loss": 0.0482, "num_input_tokens_seen": 24752256, "step": 36730 }, { "epoch": 0.8974421615811203, "grad_norm": 62.86684036254883, "learning_rate": 1.9617591720940703e-06, "loss": 0.049, "num_input_tokens_seen": 24755776, "step": 36735 }, { "epoch": 0.8975643124129675, "grad_norm": 38.602210998535156, "learning_rate": 1.961735811360649e-06, "loss": 0.0901, "num_input_tokens_seen": 24759296, "step": 36740 }, { "epoch": 0.8976864632448147, "grad_norm": 56.08420944213867, "learning_rate": 1.9617124436332263e-06, "loss": 0.1186, "num_input_tokens_seen": 24762688, "step": 36745 }, { "epoch": 0.8978086140766619, "grad_norm": 6.372490406036377, "learning_rate": 1.961689068911972e-06, "loss": 0.1298, "num_input_tokens_seen": 24765952, "step": 36750 }, { "epoch": 0.897930764908509, "grad_norm": 23.063125610351562, "learning_rate": 1.9616656871970562e-06, "loss": 0.0264, "num_input_tokens_seen": 24769472, "step": 36755 }, { "epoch": 0.8980529157403562, "grad_norm": 50.10791015625, "learning_rate": 1.9616422984886485e-06, "loss": 0.1623, "num_input_tokens_seen": 24773184, "step": 36760 }, { "epoch": 0.8981750665722034, "grad_norm": 29.54539680480957, "learning_rate": 1.96161890278692e-06, "loss": 0.2083, "num_input_tokens_seen": 24776320, "step": 36765 }, { "epoch": 0.8982972174040506, "grad_norm": 19.126075744628906, "learning_rate": 1.9615955000920396e-06, "loss": 0.0645, "num_input_tokens_seen": 24779776, "step": 36770 }, { "epoch": 0.8984193682358977, "grad_norm": 14.978646278381348, "learning_rate": 1.9615720904041785e-06, "loss": 0.1415, "num_input_tokens_seen": 24782976, "step": 36775 }, { "epoch": 0.8985415190677448, "grad_norm": 2.023380756378174, "learning_rate": 1.9615486737235064e-06, "loss": 0.1298, "num_input_tokens_seen": 24786176, "step": 36780 }, { "epoch": 0.898663669899592, "grad_norm": 2.8862316608428955, "learning_rate": 1.9615252500501936e-06, "loss": 0.0636, "num_input_tokens_seen": 24789824, "step": 36785 }, { "epoch": 0.8987858207314392, "grad_norm": 21.79472541809082, "learning_rate": 1.9615018193844108e-06, "loss": 0.0881, "num_input_tokens_seen": 24793280, "step": 36790 }, { "epoch": 0.8989079715632864, "grad_norm": 24.159772872924805, "learning_rate": 1.961478381726328e-06, "loss": 0.1375, "num_input_tokens_seen": 24796800, "step": 36795 }, { "epoch": 0.8990301223951335, "grad_norm": 20.305831909179688, "learning_rate": 1.9614549370761153e-06, "loss": 0.1272, "num_input_tokens_seen": 24800064, "step": 36800 }, { "epoch": 0.8991522732269807, "grad_norm": 23.235565185546875, "learning_rate": 1.9614314854339445e-06, "loss": 0.1704, "num_input_tokens_seen": 24803456, "step": 36805 }, { "epoch": 0.8992744240588278, "grad_norm": 0.05056135356426239, "learning_rate": 1.961408026799985e-06, "loss": 0.0022, "num_input_tokens_seen": 24806656, "step": 36810 }, { "epoch": 0.899396574890675, "grad_norm": 13.105710983276367, "learning_rate": 1.961384561174408e-06, "loss": 0.1131, "num_input_tokens_seen": 24809536, "step": 36815 }, { "epoch": 0.8995187257225221, "grad_norm": 0.12696394324302673, "learning_rate": 1.9613610885573835e-06, "loss": 0.0057, "num_input_tokens_seen": 24812800, "step": 36820 }, { "epoch": 0.8996408765543693, "grad_norm": 70.6884536743164, "learning_rate": 1.961337608949083e-06, "loss": 0.1304, "num_input_tokens_seen": 24816320, "step": 36825 }, { "epoch": 0.8997630273862165, "grad_norm": 12.785313606262207, "learning_rate": 1.9613141223496763e-06, "loss": 0.1914, "num_input_tokens_seen": 24819392, "step": 36830 }, { "epoch": 0.8998851782180637, "grad_norm": 21.849977493286133, "learning_rate": 1.961290628759335e-06, "loss": 0.0568, "num_input_tokens_seen": 24822592, "step": 36835 }, { "epoch": 0.9000073290499109, "grad_norm": 2.245121479034424, "learning_rate": 1.9612671281782297e-06, "loss": 0.0487, "num_input_tokens_seen": 24825664, "step": 36840 }, { "epoch": 0.900129479881758, "grad_norm": 0.4319484829902649, "learning_rate": 1.9612436206065313e-06, "loss": 0.1083, "num_input_tokens_seen": 24828864, "step": 36845 }, { "epoch": 0.9002516307136051, "grad_norm": 24.330896377563477, "learning_rate": 1.961220106044411e-06, "loss": 0.1028, "num_input_tokens_seen": 24832256, "step": 36850 }, { "epoch": 0.9003737815454523, "grad_norm": 1.2126699686050415, "learning_rate": 1.961196584492039e-06, "loss": 0.076, "num_input_tokens_seen": 24835520, "step": 36855 }, { "epoch": 0.9004959323772995, "grad_norm": 0.764924168586731, "learning_rate": 1.9611730559495876e-06, "loss": 0.071, "num_input_tokens_seen": 24838784, "step": 36860 }, { "epoch": 0.9006180832091466, "grad_norm": 26.342575073242188, "learning_rate": 1.9611495204172266e-06, "loss": 0.2169, "num_input_tokens_seen": 24842368, "step": 36865 }, { "epoch": 0.9007402340409938, "grad_norm": 0.21320796012878418, "learning_rate": 1.961125977895128e-06, "loss": 0.0943, "num_input_tokens_seen": 24845952, "step": 36870 }, { "epoch": 0.900862384872841, "grad_norm": 15.532743453979492, "learning_rate": 1.961102428383463e-06, "loss": 0.147, "num_input_tokens_seen": 24849216, "step": 36875 }, { "epoch": 0.9009845357046882, "grad_norm": 11.373522758483887, "learning_rate": 1.9610788718824024e-06, "loss": 0.0963, "num_input_tokens_seen": 24853056, "step": 36880 }, { "epoch": 0.9011066865365354, "grad_norm": 9.247547149658203, "learning_rate": 1.9610553083921176e-06, "loss": 0.1742, "num_input_tokens_seen": 24856448, "step": 36885 }, { "epoch": 0.9012288373683824, "grad_norm": 3.3656091690063477, "learning_rate": 1.9610317379127803e-06, "loss": 0.0552, "num_input_tokens_seen": 24859520, "step": 36890 }, { "epoch": 0.9013509882002296, "grad_norm": 0.7792104482650757, "learning_rate": 1.9610081604445618e-06, "loss": 0.1553, "num_input_tokens_seen": 24862656, "step": 36895 }, { "epoch": 0.9014731390320768, "grad_norm": 0.8532900214195251, "learning_rate": 1.9609845759876332e-06, "loss": 0.04, "num_input_tokens_seen": 24866112, "step": 36900 }, { "epoch": 0.901595289863924, "grad_norm": 1.0743188858032227, "learning_rate": 1.9609609845421666e-06, "loss": 0.1122, "num_input_tokens_seen": 24869312, "step": 36905 }, { "epoch": 0.9017174406957711, "grad_norm": 0.17355743050575256, "learning_rate": 1.960937386108333e-06, "loss": 0.1537, "num_input_tokens_seen": 24872576, "step": 36910 }, { "epoch": 0.9018395915276183, "grad_norm": 0.4766935706138611, "learning_rate": 1.9609137806863044e-06, "loss": 0.0725, "num_input_tokens_seen": 24876032, "step": 36915 }, { "epoch": 0.9019617423594655, "grad_norm": 0.37082409858703613, "learning_rate": 1.9608901682762522e-06, "loss": 0.2087, "num_input_tokens_seen": 24879168, "step": 36920 }, { "epoch": 0.9020838931913127, "grad_norm": 44.21205520629883, "learning_rate": 1.9608665488783485e-06, "loss": 0.1464, "num_input_tokens_seen": 24883008, "step": 36925 }, { "epoch": 0.9022060440231598, "grad_norm": 17.51647186279297, "learning_rate": 1.960842922492765e-06, "loss": 0.1137, "num_input_tokens_seen": 24886400, "step": 36930 }, { "epoch": 0.9023281948550069, "grad_norm": 4.397872447967529, "learning_rate": 1.9608192891196725e-06, "loss": 0.0908, "num_input_tokens_seen": 24890240, "step": 36935 }, { "epoch": 0.9024503456868541, "grad_norm": 29.296829223632812, "learning_rate": 1.9607956487592446e-06, "loss": 0.113, "num_input_tokens_seen": 24893696, "step": 36940 }, { "epoch": 0.9025724965187013, "grad_norm": 0.3690081536769867, "learning_rate": 1.960772001411652e-06, "loss": 0.0707, "num_input_tokens_seen": 24897088, "step": 36945 }, { "epoch": 0.9026946473505485, "grad_norm": 28.767560958862305, "learning_rate": 1.9607483470770667e-06, "loss": 0.1639, "num_input_tokens_seen": 24900288, "step": 36950 }, { "epoch": 0.9028167981823956, "grad_norm": 62.34882736206055, "learning_rate": 1.960724685755661e-06, "loss": 0.1239, "num_input_tokens_seen": 24904640, "step": 36955 }, { "epoch": 0.9029389490142428, "grad_norm": 18.672874450683594, "learning_rate": 1.9607010174476073e-06, "loss": 0.2087, "num_input_tokens_seen": 24907968, "step": 36960 }, { "epoch": 0.90306109984609, "grad_norm": 22.128129959106445, "learning_rate": 1.9606773421530774e-06, "loss": 0.0568, "num_input_tokens_seen": 24911744, "step": 36965 }, { "epoch": 0.9031832506779371, "grad_norm": 29.346067428588867, "learning_rate": 1.960653659872243e-06, "loss": 0.156, "num_input_tokens_seen": 24915328, "step": 36970 }, { "epoch": 0.9033054015097843, "grad_norm": 13.030354499816895, "learning_rate": 1.9606299706052775e-06, "loss": 0.2079, "num_input_tokens_seen": 24918464, "step": 36975 }, { "epoch": 0.9034275523416314, "grad_norm": 0.45906662940979004, "learning_rate": 1.960606274352352e-06, "loss": 0.0784, "num_input_tokens_seen": 24921664, "step": 36980 }, { "epoch": 0.9035497031734786, "grad_norm": 3.491370916366577, "learning_rate": 1.9605825711136395e-06, "loss": 0.0685, "num_input_tokens_seen": 24925248, "step": 36985 }, { "epoch": 0.9036718540053258, "grad_norm": 9.43264389038086, "learning_rate": 1.9605588608893124e-06, "loss": 0.1453, "num_input_tokens_seen": 24928448, "step": 36990 }, { "epoch": 0.903794004837173, "grad_norm": 11.920999526977539, "learning_rate": 1.9605351436795426e-06, "loss": 0.1158, "num_input_tokens_seen": 24932160, "step": 36995 }, { "epoch": 0.9039161556690201, "grad_norm": 0.8874409198760986, "learning_rate": 1.960511419484503e-06, "loss": 0.0426, "num_input_tokens_seen": 24935424, "step": 37000 }, { "epoch": 0.9040383065008673, "grad_norm": 22.666275024414062, "learning_rate": 1.9604876883043655e-06, "loss": 0.0423, "num_input_tokens_seen": 24938752, "step": 37005 }, { "epoch": 0.9041604573327144, "grad_norm": 18.00115203857422, "learning_rate": 1.960463950139304e-06, "loss": 0.1684, "num_input_tokens_seen": 24942272, "step": 37010 }, { "epoch": 0.9042826081645616, "grad_norm": 0.41937220096588135, "learning_rate": 1.96044020498949e-06, "loss": 0.1104, "num_input_tokens_seen": 24945728, "step": 37015 }, { "epoch": 0.9044047589964088, "grad_norm": 0.23672303557395935, "learning_rate": 1.9604164528550966e-06, "loss": 0.0333, "num_input_tokens_seen": 24949248, "step": 37020 }, { "epoch": 0.9045269098282559, "grad_norm": 20.977340698242188, "learning_rate": 1.960392693736296e-06, "loss": 0.1655, "num_input_tokens_seen": 24952896, "step": 37025 }, { "epoch": 0.9046490606601031, "grad_norm": 41.60050964355469, "learning_rate": 1.9603689276332617e-06, "loss": 0.1639, "num_input_tokens_seen": 24956096, "step": 37030 }, { "epoch": 0.9047712114919503, "grad_norm": 37.09358215332031, "learning_rate": 1.960345154546166e-06, "loss": 0.0335, "num_input_tokens_seen": 24959104, "step": 37035 }, { "epoch": 0.9048933623237975, "grad_norm": 9.280909538269043, "learning_rate": 1.9603213744751824e-06, "loss": 0.1253, "num_input_tokens_seen": 24962816, "step": 37040 }, { "epoch": 0.9050155131556445, "grad_norm": 0.6137846112251282, "learning_rate": 1.9602975874204836e-06, "loss": 0.2109, "num_input_tokens_seen": 24966528, "step": 37045 }, { "epoch": 0.9051376639874917, "grad_norm": 14.2232666015625, "learning_rate": 1.960273793382242e-06, "loss": 0.1559, "num_input_tokens_seen": 24970624, "step": 37050 }, { "epoch": 0.9052598148193389, "grad_norm": 8.7384672164917, "learning_rate": 1.9602499923606314e-06, "loss": 0.1607, "num_input_tokens_seen": 24974272, "step": 37055 }, { "epoch": 0.9053819656511861, "grad_norm": 4.077075004577637, "learning_rate": 1.960226184355824e-06, "loss": 0.1161, "num_input_tokens_seen": 24978048, "step": 37060 }, { "epoch": 0.9055041164830332, "grad_norm": 8.053204536437988, "learning_rate": 1.9602023693679942e-06, "loss": 0.1868, "num_input_tokens_seen": 24981184, "step": 37065 }, { "epoch": 0.9056262673148804, "grad_norm": 11.40694522857666, "learning_rate": 1.9601785473973145e-06, "loss": 0.0782, "num_input_tokens_seen": 24984832, "step": 37070 }, { "epoch": 0.9057484181467276, "grad_norm": 18.37397575378418, "learning_rate": 1.9601547184439577e-06, "loss": 0.1352, "num_input_tokens_seen": 24988288, "step": 37075 }, { "epoch": 0.9058705689785748, "grad_norm": 13.701996803283691, "learning_rate": 1.960130882508098e-06, "loss": 0.1083, "num_input_tokens_seen": 24992192, "step": 37080 }, { "epoch": 0.905992719810422, "grad_norm": 0.5049362778663635, "learning_rate": 1.960107039589908e-06, "loss": 0.1329, "num_input_tokens_seen": 24995584, "step": 37085 }, { "epoch": 0.906114870642269, "grad_norm": 15.812434196472168, "learning_rate": 1.9600831896895615e-06, "loss": 0.1082, "num_input_tokens_seen": 24998976, "step": 37090 }, { "epoch": 0.9062370214741162, "grad_norm": 11.897115707397461, "learning_rate": 1.9600593328072317e-06, "loss": 0.068, "num_input_tokens_seen": 25002496, "step": 37095 }, { "epoch": 0.9063591723059634, "grad_norm": 12.684399604797363, "learning_rate": 1.960035468943092e-06, "loss": 0.1702, "num_input_tokens_seen": 25005952, "step": 37100 }, { "epoch": 0.9064813231378106, "grad_norm": 34.3539924621582, "learning_rate": 1.9600115980973167e-06, "loss": 0.1441, "num_input_tokens_seen": 25009408, "step": 37105 }, { "epoch": 0.9066034739696577, "grad_norm": 1.0309791564941406, "learning_rate": 1.9599877202700784e-06, "loss": 0.0769, "num_input_tokens_seen": 25012416, "step": 37110 }, { "epoch": 0.9067256248015049, "grad_norm": 0.7294164896011353, "learning_rate": 1.9599638354615517e-06, "loss": 0.0935, "num_input_tokens_seen": 25015744, "step": 37115 }, { "epoch": 0.9068477756333521, "grad_norm": 9.473251342773438, "learning_rate": 1.959939943671909e-06, "loss": 0.131, "num_input_tokens_seen": 25019904, "step": 37120 }, { "epoch": 0.9069699264651992, "grad_norm": 15.669844627380371, "learning_rate": 1.9599160449013255e-06, "loss": 0.2101, "num_input_tokens_seen": 25023104, "step": 37125 }, { "epoch": 0.9070920772970464, "grad_norm": 18.923765182495117, "learning_rate": 1.959892139149974e-06, "loss": 0.0628, "num_input_tokens_seen": 25026112, "step": 37130 }, { "epoch": 0.9072142281288935, "grad_norm": 18.780458450317383, "learning_rate": 1.9598682264180288e-06, "loss": 0.0905, "num_input_tokens_seen": 25029632, "step": 37135 }, { "epoch": 0.9073363789607407, "grad_norm": 53.54373550415039, "learning_rate": 1.959844306705664e-06, "loss": 0.156, "num_input_tokens_seen": 25032896, "step": 37140 }, { "epoch": 0.9074585297925879, "grad_norm": 32.853431701660156, "learning_rate": 1.9598203800130524e-06, "loss": 0.052, "num_input_tokens_seen": 25036352, "step": 37145 }, { "epoch": 0.9075806806244351, "grad_norm": 31.716951370239258, "learning_rate": 1.9597964463403695e-06, "loss": 0.0664, "num_input_tokens_seen": 25039488, "step": 37150 }, { "epoch": 0.9077028314562822, "grad_norm": 31.609790802001953, "learning_rate": 1.9597725056877886e-06, "loss": 0.1988, "num_input_tokens_seen": 25042816, "step": 37155 }, { "epoch": 0.9078249822881294, "grad_norm": 0.3431050479412079, "learning_rate": 1.959748558055484e-06, "loss": 0.0328, "num_input_tokens_seen": 25045888, "step": 37160 }, { "epoch": 0.9079471331199765, "grad_norm": 26.98533821105957, "learning_rate": 1.9597246034436293e-06, "loss": 0.1926, "num_input_tokens_seen": 25049536, "step": 37165 }, { "epoch": 0.9080692839518237, "grad_norm": 14.834943771362305, "learning_rate": 1.9597006418523995e-06, "loss": 0.1487, "num_input_tokens_seen": 25053248, "step": 37170 }, { "epoch": 0.9081914347836709, "grad_norm": 16.529705047607422, "learning_rate": 1.9596766732819684e-06, "loss": 0.2229, "num_input_tokens_seen": 25056576, "step": 37175 }, { "epoch": 0.908313585615518, "grad_norm": 0.9060618877410889, "learning_rate": 1.9596526977325106e-06, "loss": 0.0712, "num_input_tokens_seen": 25059968, "step": 37180 }, { "epoch": 0.9084357364473652, "grad_norm": 14.722132682800293, "learning_rate": 1.9596287152042e-06, "loss": 0.1904, "num_input_tokens_seen": 25063296, "step": 37185 }, { "epoch": 0.9085578872792124, "grad_norm": 17.283632278442383, "learning_rate": 1.9596047256972114e-06, "loss": 0.0991, "num_input_tokens_seen": 25066240, "step": 37190 }, { "epoch": 0.9086800381110596, "grad_norm": 7.278754711151123, "learning_rate": 1.959580729211719e-06, "loss": 0.0686, "num_input_tokens_seen": 25069824, "step": 37195 }, { "epoch": 0.9088021889429067, "grad_norm": 2.6354358196258545, "learning_rate": 1.9595567257478974e-06, "loss": 0.022, "num_input_tokens_seen": 25073664, "step": 37200 }, { "epoch": 0.9089243397747538, "grad_norm": 21.20435905456543, "learning_rate": 1.9595327153059214e-06, "loss": 0.1277, "num_input_tokens_seen": 25076672, "step": 37205 }, { "epoch": 0.909046490606601, "grad_norm": 2.3268589973449707, "learning_rate": 1.9595086978859653e-06, "loss": 0.1181, "num_input_tokens_seen": 25080064, "step": 37210 }, { "epoch": 0.9091686414384482, "grad_norm": 8.44967269897461, "learning_rate": 1.959484673488204e-06, "loss": 0.1047, "num_input_tokens_seen": 25083392, "step": 37215 }, { "epoch": 0.9092907922702954, "grad_norm": 3.236764907836914, "learning_rate": 1.9594606421128123e-06, "loss": 0.0515, "num_input_tokens_seen": 25086912, "step": 37220 }, { "epoch": 0.9094129431021425, "grad_norm": 30.90077018737793, "learning_rate": 1.9594366037599645e-06, "loss": 0.1087, "num_input_tokens_seen": 25090560, "step": 37225 }, { "epoch": 0.9095350939339897, "grad_norm": 19.804662704467773, "learning_rate": 1.959412558429835e-06, "loss": 0.1335, "num_input_tokens_seen": 25093760, "step": 37230 }, { "epoch": 0.9096572447658369, "grad_norm": 22.87166976928711, "learning_rate": 1.9593885061226002e-06, "loss": 0.041, "num_input_tokens_seen": 25097216, "step": 37235 }, { "epoch": 0.9097793955976841, "grad_norm": 0.5402935147285461, "learning_rate": 1.959364446838434e-06, "loss": 0.1368, "num_input_tokens_seen": 25100160, "step": 37240 }, { "epoch": 0.9099015464295311, "grad_norm": 0.44300487637519836, "learning_rate": 1.9593403805775113e-06, "loss": 0.0885, "num_input_tokens_seen": 25104320, "step": 37245 }, { "epoch": 0.9100236972613783, "grad_norm": 26.597187042236328, "learning_rate": 1.9593163073400075e-06, "loss": 0.1744, "num_input_tokens_seen": 25107584, "step": 37250 }, { "epoch": 0.9101458480932255, "grad_norm": 35.40641403198242, "learning_rate": 1.9592922271260973e-06, "loss": 0.098, "num_input_tokens_seen": 25111232, "step": 37255 }, { "epoch": 0.9102679989250727, "grad_norm": 0.5295836329460144, "learning_rate": 1.959268139935956e-06, "loss": 0.1152, "num_input_tokens_seen": 25114560, "step": 37260 }, { "epoch": 0.9103901497569199, "grad_norm": 14.122322082519531, "learning_rate": 1.959244045769759e-06, "loss": 0.0152, "num_input_tokens_seen": 25118208, "step": 37265 }, { "epoch": 0.910512300588767, "grad_norm": 14.80065631866455, "learning_rate": 1.9592199446276812e-06, "loss": 0.0609, "num_input_tokens_seen": 25121280, "step": 37270 }, { "epoch": 0.9106344514206142, "grad_norm": 31.571918487548828, "learning_rate": 1.959195836509898e-06, "loss": 0.0292, "num_input_tokens_seen": 25124928, "step": 37275 }, { "epoch": 0.9107566022524614, "grad_norm": 23.620386123657227, "learning_rate": 1.9591717214165844e-06, "loss": 0.2271, "num_input_tokens_seen": 25128320, "step": 37280 }, { "epoch": 0.9108787530843085, "grad_norm": 34.001434326171875, "learning_rate": 1.9591475993479162e-06, "loss": 0.1586, "num_input_tokens_seen": 25131776, "step": 37285 }, { "epoch": 0.9110009039161556, "grad_norm": 11.455076217651367, "learning_rate": 1.959123470304069e-06, "loss": 0.1356, "num_input_tokens_seen": 25135296, "step": 37290 }, { "epoch": 0.9111230547480028, "grad_norm": 0.1452919840812683, "learning_rate": 1.9590993342852175e-06, "loss": 0.0101, "num_input_tokens_seen": 25138624, "step": 37295 }, { "epoch": 0.91124520557985, "grad_norm": 4.947422504425049, "learning_rate": 1.959075191291538e-06, "loss": 0.1217, "num_input_tokens_seen": 25142016, "step": 37300 }, { "epoch": 0.9113673564116972, "grad_norm": 1.445472240447998, "learning_rate": 1.959051041323206e-06, "loss": 0.0603, "num_input_tokens_seen": 25145792, "step": 37305 }, { "epoch": 0.9114895072435444, "grad_norm": 1.9578611850738525, "learning_rate": 1.959026884380396e-06, "loss": 0.1354, "num_input_tokens_seen": 25149568, "step": 37310 }, { "epoch": 0.9116116580753915, "grad_norm": 31.493968963623047, "learning_rate": 1.959002720463285e-06, "loss": 0.0892, "num_input_tokens_seen": 25152768, "step": 37315 }, { "epoch": 0.9117338089072387, "grad_norm": 0.8048754930496216, "learning_rate": 1.958978549572048e-06, "loss": 0.1312, "num_input_tokens_seen": 25155776, "step": 37320 }, { "epoch": 0.9118559597390858, "grad_norm": 16.5799617767334, "learning_rate": 1.958954371706862e-06, "loss": 0.106, "num_input_tokens_seen": 25159360, "step": 37325 }, { "epoch": 0.911978110570933, "grad_norm": 3.2302496433258057, "learning_rate": 1.9589301868679013e-06, "loss": 0.0733, "num_input_tokens_seen": 25162752, "step": 37330 }, { "epoch": 0.9121002614027801, "grad_norm": 0.7612953186035156, "learning_rate": 1.958905995055342e-06, "loss": 0.1728, "num_input_tokens_seen": 25165824, "step": 37335 }, { "epoch": 0.9122224122346273, "grad_norm": 18.163131713867188, "learning_rate": 1.9588817962693607e-06, "loss": 0.2231, "num_input_tokens_seen": 25169152, "step": 37340 }, { "epoch": 0.9123445630664745, "grad_norm": 6.804000377655029, "learning_rate": 1.9588575905101333e-06, "loss": 0.0484, "num_input_tokens_seen": 25173120, "step": 37345 }, { "epoch": 0.9124667138983217, "grad_norm": 16.38357925415039, "learning_rate": 1.958833377777835e-06, "loss": 0.1025, "num_input_tokens_seen": 25176320, "step": 37350 }, { "epoch": 0.9125888647301688, "grad_norm": 0.268574595451355, "learning_rate": 1.958809158072643e-06, "loss": 0.094, "num_input_tokens_seen": 25179968, "step": 37355 }, { "epoch": 0.912711015562016, "grad_norm": 18.270381927490234, "learning_rate": 1.958784931394733e-06, "loss": 0.0401, "num_input_tokens_seen": 25183296, "step": 37360 }, { "epoch": 0.9128331663938631, "grad_norm": 40.83668518066406, "learning_rate": 1.958760697744281e-06, "loss": 0.2674, "num_input_tokens_seen": 25187008, "step": 37365 }, { "epoch": 0.9129553172257103, "grad_norm": 59.549259185791016, "learning_rate": 1.958736457121463e-06, "loss": 0.0754, "num_input_tokens_seen": 25190656, "step": 37370 }, { "epoch": 0.9130774680575575, "grad_norm": 0.25304514169692993, "learning_rate": 1.958712209526456e-06, "loss": 0.1197, "num_input_tokens_seen": 25194112, "step": 37375 }, { "epoch": 0.9131996188894046, "grad_norm": 12.690157890319824, "learning_rate": 1.9586879549594356e-06, "loss": 0.0632, "num_input_tokens_seen": 25197120, "step": 37380 }, { "epoch": 0.9133217697212518, "grad_norm": 37.6353645324707, "learning_rate": 1.958663693420579e-06, "loss": 0.1644, "num_input_tokens_seen": 25200448, "step": 37385 }, { "epoch": 0.913443920553099, "grad_norm": 27.67104721069336, "learning_rate": 1.9586394249100616e-06, "loss": 0.0693, "num_input_tokens_seen": 25203648, "step": 37390 }, { "epoch": 0.9135660713849462, "grad_norm": 15.691805839538574, "learning_rate": 1.9586151494280604e-06, "loss": 0.0771, "num_input_tokens_seen": 25207104, "step": 37395 }, { "epoch": 0.9136882222167932, "grad_norm": 31.615686416625977, "learning_rate": 1.9585908669747523e-06, "loss": 0.2961, "num_input_tokens_seen": 25210560, "step": 37400 }, { "epoch": 0.9138103730486404, "grad_norm": 5.969202995300293, "learning_rate": 1.958566577550314e-06, "loss": 0.0501, "num_input_tokens_seen": 25213888, "step": 37405 }, { "epoch": 0.9139325238804876, "grad_norm": 3.6108410358428955, "learning_rate": 1.958542281154921e-06, "loss": 0.1711, "num_input_tokens_seen": 25217344, "step": 37410 }, { "epoch": 0.9140546747123348, "grad_norm": 0.9648376107215881, "learning_rate": 1.9585179777887514e-06, "loss": 0.1616, "num_input_tokens_seen": 25220416, "step": 37415 }, { "epoch": 0.914176825544182, "grad_norm": 0.8474432229995728, "learning_rate": 1.9584936674519806e-06, "loss": 0.0618, "num_input_tokens_seen": 25224064, "step": 37420 }, { "epoch": 0.9142989763760291, "grad_norm": 11.26650333404541, "learning_rate": 1.9584693501447863e-06, "loss": 0.0479, "num_input_tokens_seen": 25227072, "step": 37425 }, { "epoch": 0.9144211272078763, "grad_norm": 6.293632507324219, "learning_rate": 1.958445025867345e-06, "loss": 0.0213, "num_input_tokens_seen": 25230528, "step": 37430 }, { "epoch": 0.9145432780397235, "grad_norm": 11.405076026916504, "learning_rate": 1.9584206946198342e-06, "loss": 0.1662, "num_input_tokens_seen": 25233984, "step": 37435 }, { "epoch": 0.9146654288715707, "grad_norm": 0.46462714672088623, "learning_rate": 1.9583963564024297e-06, "loss": 0.0253, "num_input_tokens_seen": 25237184, "step": 37440 }, { "epoch": 0.9147875797034177, "grad_norm": 21.904184341430664, "learning_rate": 1.9583720112153094e-06, "loss": 0.1111, "num_input_tokens_seen": 25240576, "step": 37445 }, { "epoch": 0.9149097305352649, "grad_norm": 9.3959321975708, "learning_rate": 1.95834765905865e-06, "loss": 0.1968, "num_input_tokens_seen": 25243712, "step": 37450 }, { "epoch": 0.9150318813671121, "grad_norm": 24.887296676635742, "learning_rate": 1.958323299932629e-06, "loss": 0.1636, "num_input_tokens_seen": 25247232, "step": 37455 }, { "epoch": 0.9151540321989593, "grad_norm": 0.8720492720603943, "learning_rate": 1.9582989338374227e-06, "loss": 0.0255, "num_input_tokens_seen": 25250880, "step": 37460 }, { "epoch": 0.9152761830308065, "grad_norm": 34.30413055419922, "learning_rate": 1.958274560773209e-06, "loss": 0.081, "num_input_tokens_seen": 25254592, "step": 37465 }, { "epoch": 0.9153983338626536, "grad_norm": 13.46987533569336, "learning_rate": 1.958250180740165e-06, "loss": 0.0502, "num_input_tokens_seen": 25258048, "step": 37470 }, { "epoch": 0.9155204846945008, "grad_norm": 9.968213081359863, "learning_rate": 1.958225793738468e-06, "loss": 0.1089, "num_input_tokens_seen": 25261248, "step": 37475 }, { "epoch": 0.915642635526348, "grad_norm": 14.56130313873291, "learning_rate": 1.958201399768295e-06, "loss": 0.1609, "num_input_tokens_seen": 25265024, "step": 37480 }, { "epoch": 0.9157647863581951, "grad_norm": 0.8725386261940002, "learning_rate": 1.958176998829824e-06, "loss": 0.2316, "num_input_tokens_seen": 25268032, "step": 37485 }, { "epoch": 0.9158869371900422, "grad_norm": 2.6830427646636963, "learning_rate": 1.958152590923232e-06, "loss": 0.0842, "num_input_tokens_seen": 25271360, "step": 37490 }, { "epoch": 0.9160090880218894, "grad_norm": 20.3107967376709, "learning_rate": 1.958128176048697e-06, "loss": 0.0642, "num_input_tokens_seen": 25274816, "step": 37495 }, { "epoch": 0.9161312388537366, "grad_norm": 0.6933755874633789, "learning_rate": 1.9581037542063955e-06, "loss": 0.0984, "num_input_tokens_seen": 25278208, "step": 37500 }, { "epoch": 0.9162533896855838, "grad_norm": 8.6947021484375, "learning_rate": 1.958079325396506e-06, "loss": 0.1362, "num_input_tokens_seen": 25281600, "step": 37505 }, { "epoch": 0.916375540517431, "grad_norm": 2.2591030597686768, "learning_rate": 1.9580548896192066e-06, "loss": 0.0776, "num_input_tokens_seen": 25284864, "step": 37510 }, { "epoch": 0.916497691349278, "grad_norm": 0.5930300354957581, "learning_rate": 1.9580304468746736e-06, "loss": 0.1038, "num_input_tokens_seen": 25288192, "step": 37515 }, { "epoch": 0.9166198421811252, "grad_norm": 41.463714599609375, "learning_rate": 1.958005997163086e-06, "loss": 0.1667, "num_input_tokens_seen": 25291584, "step": 37520 }, { "epoch": 0.9167419930129724, "grad_norm": 15.766650199890137, "learning_rate": 1.9579815404846207e-06, "loss": 0.0361, "num_input_tokens_seen": 25295040, "step": 37525 }, { "epoch": 0.9168641438448196, "grad_norm": 21.16053581237793, "learning_rate": 1.957957076839456e-06, "loss": 0.1552, "num_input_tokens_seen": 25298432, "step": 37530 }, { "epoch": 0.9169862946766667, "grad_norm": 0.31387093663215637, "learning_rate": 1.95793260622777e-06, "loss": 0.0896, "num_input_tokens_seen": 25301632, "step": 37535 }, { "epoch": 0.9171084455085139, "grad_norm": 0.2981453537940979, "learning_rate": 1.95790812864974e-06, "loss": 0.134, "num_input_tokens_seen": 25305536, "step": 37540 }, { "epoch": 0.9172305963403611, "grad_norm": 15.161022186279297, "learning_rate": 1.9578836441055453e-06, "loss": 0.1619, "num_input_tokens_seen": 25309504, "step": 37545 }, { "epoch": 0.9173527471722083, "grad_norm": 34.08779525756836, "learning_rate": 1.9578591525953625e-06, "loss": 0.0792, "num_input_tokens_seen": 25313152, "step": 37550 }, { "epoch": 0.9174748980040555, "grad_norm": 18.717666625976562, "learning_rate": 1.9578346541193705e-06, "loss": 0.1003, "num_input_tokens_seen": 25316352, "step": 37555 }, { "epoch": 0.9175970488359025, "grad_norm": 23.372556686401367, "learning_rate": 1.957810148677747e-06, "loss": 0.2018, "num_input_tokens_seen": 25319680, "step": 37560 }, { "epoch": 0.9177191996677497, "grad_norm": 1.9030033349990845, "learning_rate": 1.957785636270671e-06, "loss": 0.0991, "num_input_tokens_seen": 25323584, "step": 37565 }, { "epoch": 0.9178413504995969, "grad_norm": 19.270458221435547, "learning_rate": 1.95776111689832e-06, "loss": 0.1829, "num_input_tokens_seen": 25326912, "step": 37570 }, { "epoch": 0.9179635013314441, "grad_norm": 26.126998901367188, "learning_rate": 1.957736590560872e-06, "loss": 0.1812, "num_input_tokens_seen": 25330112, "step": 37575 }, { "epoch": 0.9180856521632912, "grad_norm": 10.202178001403809, "learning_rate": 1.9577120572585067e-06, "loss": 0.1119, "num_input_tokens_seen": 25333184, "step": 37580 }, { "epoch": 0.9182078029951384, "grad_norm": 7.249847412109375, "learning_rate": 1.9576875169914016e-06, "loss": 0.0318, "num_input_tokens_seen": 25336640, "step": 37585 }, { "epoch": 0.9183299538269856, "grad_norm": 1.9853745698928833, "learning_rate": 1.957662969759735e-06, "loss": 0.0357, "num_input_tokens_seen": 25340096, "step": 37590 }, { "epoch": 0.9184521046588328, "grad_norm": 18.176387786865234, "learning_rate": 1.957638415563686e-06, "loss": 0.1304, "num_input_tokens_seen": 25343360, "step": 37595 }, { "epoch": 0.9185742554906798, "grad_norm": 17.54328727722168, "learning_rate": 1.9576138544034327e-06, "loss": 0.1276, "num_input_tokens_seen": 25347200, "step": 37600 }, { "epoch": 0.918696406322527, "grad_norm": 0.22984708845615387, "learning_rate": 1.9575892862791537e-06, "loss": 0.086, "num_input_tokens_seen": 25350656, "step": 37605 }, { "epoch": 0.9188185571543742, "grad_norm": 23.025104522705078, "learning_rate": 1.9575647111910276e-06, "loss": 0.1472, "num_input_tokens_seen": 25353536, "step": 37610 }, { "epoch": 0.9189407079862214, "grad_norm": 1.0077747106552124, "learning_rate": 1.957540129139234e-06, "loss": 0.2079, "num_input_tokens_seen": 25356928, "step": 37615 }, { "epoch": 0.9190628588180686, "grad_norm": 26.906963348388672, "learning_rate": 1.957515540123951e-06, "loss": 0.224, "num_input_tokens_seen": 25360064, "step": 37620 }, { "epoch": 0.9191850096499157, "grad_norm": 0.5911832451820374, "learning_rate": 1.9574909441453573e-06, "loss": 0.0315, "num_input_tokens_seen": 25363136, "step": 37625 }, { "epoch": 0.9193071604817629, "grad_norm": 30.879526138305664, "learning_rate": 1.957466341203632e-06, "loss": 0.0691, "num_input_tokens_seen": 25366592, "step": 37630 }, { "epoch": 0.91942931131361, "grad_norm": 23.551034927368164, "learning_rate": 1.9574417312989535e-06, "loss": 0.1469, "num_input_tokens_seen": 25370304, "step": 37635 }, { "epoch": 0.9195514621454572, "grad_norm": 9.431242942810059, "learning_rate": 1.9574171144315016e-06, "loss": 0.1158, "num_input_tokens_seen": 25373248, "step": 37640 }, { "epoch": 0.9196736129773043, "grad_norm": 1.5377452373504639, "learning_rate": 1.957392490601455e-06, "loss": 0.0728, "num_input_tokens_seen": 25376832, "step": 37645 }, { "epoch": 0.9197957638091515, "grad_norm": 7.705243110656738, "learning_rate": 1.9573678598089924e-06, "loss": 0.0135, "num_input_tokens_seen": 25380288, "step": 37650 }, { "epoch": 0.9199179146409987, "grad_norm": 6.997256278991699, "learning_rate": 1.9573432220542933e-06, "loss": 0.0112, "num_input_tokens_seen": 25383808, "step": 37655 }, { "epoch": 0.9200400654728459, "grad_norm": 0.8887893557548523, "learning_rate": 1.957318577337537e-06, "loss": 0.1431, "num_input_tokens_seen": 25386880, "step": 37660 }, { "epoch": 0.9201622163046931, "grad_norm": 1.3595986366271973, "learning_rate": 1.9572939256589025e-06, "loss": 0.138, "num_input_tokens_seen": 25390720, "step": 37665 }, { "epoch": 0.9202843671365402, "grad_norm": 0.3079362213611603, "learning_rate": 1.957269267018569e-06, "loss": 0.0168, "num_input_tokens_seen": 25394496, "step": 37670 }, { "epoch": 0.9204065179683873, "grad_norm": 0.5935457348823547, "learning_rate": 1.957244601416716e-06, "loss": 0.0043, "num_input_tokens_seen": 25398080, "step": 37675 }, { "epoch": 0.9205286688002345, "grad_norm": 10.200531959533691, "learning_rate": 1.957219928853523e-06, "loss": 0.2152, "num_input_tokens_seen": 25401664, "step": 37680 }, { "epoch": 0.9206508196320817, "grad_norm": 37.06196212768555, "learning_rate": 1.9571952493291685e-06, "loss": 0.1617, "num_input_tokens_seen": 25405568, "step": 37685 }, { "epoch": 0.9207729704639288, "grad_norm": 9.282423973083496, "learning_rate": 1.957170562843833e-06, "loss": 0.1938, "num_input_tokens_seen": 25409344, "step": 37690 }, { "epoch": 0.920895121295776, "grad_norm": 9.238424301147461, "learning_rate": 1.957145869397696e-06, "loss": 0.0743, "num_input_tokens_seen": 25412928, "step": 37695 }, { "epoch": 0.9210172721276232, "grad_norm": 24.76751708984375, "learning_rate": 1.9571211689909366e-06, "loss": 0.1165, "num_input_tokens_seen": 25416256, "step": 37700 }, { "epoch": 0.9211394229594704, "grad_norm": 2.6283040046691895, "learning_rate": 1.9570964616237348e-06, "loss": 0.0228, "num_input_tokens_seen": 25419904, "step": 37705 }, { "epoch": 0.9212615737913176, "grad_norm": 0.2652167081832886, "learning_rate": 1.9570717472962697e-06, "loss": 0.1253, "num_input_tokens_seen": 25423040, "step": 37710 }, { "epoch": 0.9213837246231646, "grad_norm": 16.254348754882812, "learning_rate": 1.9570470260087217e-06, "loss": 0.2298, "num_input_tokens_seen": 25426560, "step": 37715 }, { "epoch": 0.9215058754550118, "grad_norm": 0.7373325228691101, "learning_rate": 1.9570222977612704e-06, "loss": 0.059, "num_input_tokens_seen": 25430016, "step": 37720 }, { "epoch": 0.921628026286859, "grad_norm": 20.672805786132812, "learning_rate": 1.9569975625540954e-06, "loss": 0.1113, "num_input_tokens_seen": 25433536, "step": 37725 }, { "epoch": 0.9217501771187062, "grad_norm": 11.524206161499023, "learning_rate": 1.9569728203873767e-06, "loss": 0.0467, "num_input_tokens_seen": 25436992, "step": 37730 }, { "epoch": 0.9218723279505533, "grad_norm": 32.22142028808594, "learning_rate": 1.9569480712612943e-06, "loss": 0.0898, "num_input_tokens_seen": 25440128, "step": 37735 }, { "epoch": 0.9219944787824005, "grad_norm": 72.72855377197266, "learning_rate": 1.956923315176028e-06, "loss": 0.127, "num_input_tokens_seen": 25443648, "step": 37740 }, { "epoch": 0.9221166296142477, "grad_norm": 17.22328758239746, "learning_rate": 1.956898552131758e-06, "loss": 0.0535, "num_input_tokens_seen": 25446784, "step": 37745 }, { "epoch": 0.9222387804460949, "grad_norm": 16.77092933654785, "learning_rate": 1.9568737821286645e-06, "loss": 0.1066, "num_input_tokens_seen": 25450240, "step": 37750 }, { "epoch": 0.922360931277942, "grad_norm": 0.3280671536922455, "learning_rate": 1.9568490051669276e-06, "loss": 0.0087, "num_input_tokens_seen": 25453824, "step": 37755 }, { "epoch": 0.9224830821097891, "grad_norm": 22.226842880249023, "learning_rate": 1.9568242212467273e-06, "loss": 0.0612, "num_input_tokens_seen": 25457984, "step": 37760 }, { "epoch": 0.9226052329416363, "grad_norm": 0.11983904242515564, "learning_rate": 1.9567994303682437e-06, "loss": 0.1227, "num_input_tokens_seen": 25461376, "step": 37765 }, { "epoch": 0.9227273837734835, "grad_norm": 1.0264025926589966, "learning_rate": 1.9567746325316575e-06, "loss": 0.096, "num_input_tokens_seen": 25464704, "step": 37770 }, { "epoch": 0.9228495346053307, "grad_norm": 30.15180778503418, "learning_rate": 1.956749827737149e-06, "loss": 0.2156, "num_input_tokens_seen": 25468032, "step": 37775 }, { "epoch": 0.9229716854371778, "grad_norm": 56.08074188232422, "learning_rate": 1.956725015984898e-06, "loss": 0.1741, "num_input_tokens_seen": 25471744, "step": 37780 }, { "epoch": 0.923093836269025, "grad_norm": 8.849523544311523, "learning_rate": 1.956700197275086e-06, "loss": 0.165, "num_input_tokens_seen": 25475200, "step": 37785 }, { "epoch": 0.9232159871008722, "grad_norm": 10.861885070800781, "learning_rate": 1.9566753716078922e-06, "loss": 0.1686, "num_input_tokens_seen": 25478720, "step": 37790 }, { "epoch": 0.9233381379327193, "grad_norm": 8.872599601745605, "learning_rate": 1.9566505389834978e-06, "loss": 0.094, "num_input_tokens_seen": 25482368, "step": 37795 }, { "epoch": 0.9234602887645665, "grad_norm": 0.5979471802711487, "learning_rate": 1.9566256994020833e-06, "loss": 0.0531, "num_input_tokens_seen": 25485568, "step": 37800 }, { "epoch": 0.9235824395964136, "grad_norm": 0.13665713369846344, "learning_rate": 1.95660085286383e-06, "loss": 0.1456, "num_input_tokens_seen": 25488704, "step": 37805 }, { "epoch": 0.9237045904282608, "grad_norm": 0.41918784379959106, "learning_rate": 1.956575999368918e-06, "loss": 0.1239, "num_input_tokens_seen": 25491968, "step": 37810 }, { "epoch": 0.923826741260108, "grad_norm": 16.839773178100586, "learning_rate": 1.9565511389175273e-06, "loss": 0.1556, "num_input_tokens_seen": 25496576, "step": 37815 }, { "epoch": 0.9239488920919552, "grad_norm": 1.8773021697998047, "learning_rate": 1.9565262715098396e-06, "loss": 0.1001, "num_input_tokens_seen": 25499968, "step": 37820 }, { "epoch": 0.9240710429238023, "grad_norm": 0.22760513424873352, "learning_rate": 1.9565013971460362e-06, "loss": 0.069, "num_input_tokens_seen": 25503232, "step": 37825 }, { "epoch": 0.9241931937556495, "grad_norm": 28.925373077392578, "learning_rate": 1.956476515826297e-06, "loss": 0.1173, "num_input_tokens_seen": 25506560, "step": 37830 }, { "epoch": 0.9243153445874966, "grad_norm": 21.988372802734375, "learning_rate": 1.9564516275508033e-06, "loss": 0.15, "num_input_tokens_seen": 25509952, "step": 37835 }, { "epoch": 0.9244374954193438, "grad_norm": 5.837953090667725, "learning_rate": 1.956426732319736e-06, "loss": 0.1084, "num_input_tokens_seen": 25513408, "step": 37840 }, { "epoch": 0.924559646251191, "grad_norm": 6.572620868682861, "learning_rate": 1.9564018301332765e-06, "loss": 0.1143, "num_input_tokens_seen": 25516864, "step": 37845 }, { "epoch": 0.9246817970830381, "grad_norm": 0.9621378183364868, "learning_rate": 1.9563769209916055e-06, "loss": 0.206, "num_input_tokens_seen": 25520128, "step": 37850 }, { "epoch": 0.9248039479148853, "grad_norm": 16.34810447692871, "learning_rate": 1.9563520048949043e-06, "loss": 0.0725, "num_input_tokens_seen": 25523776, "step": 37855 }, { "epoch": 0.9249260987467325, "grad_norm": 18.789794921875, "learning_rate": 1.956327081843354e-06, "loss": 0.0724, "num_input_tokens_seen": 25526784, "step": 37860 }, { "epoch": 0.9250482495785797, "grad_norm": 0.49031341075897217, "learning_rate": 1.9563021518371363e-06, "loss": 0.0253, "num_input_tokens_seen": 25530240, "step": 37865 }, { "epoch": 0.9251704004104268, "grad_norm": 8.614137649536133, "learning_rate": 1.9562772148764317e-06, "loss": 0.1545, "num_input_tokens_seen": 25534208, "step": 37870 }, { "epoch": 0.9252925512422739, "grad_norm": 2.4033591747283936, "learning_rate": 1.9562522709614223e-06, "loss": 0.0044, "num_input_tokens_seen": 25537344, "step": 37875 }, { "epoch": 0.9254147020741211, "grad_norm": 0.6801878213882446, "learning_rate": 1.956227320092289e-06, "loss": 0.0535, "num_input_tokens_seen": 25541376, "step": 37880 }, { "epoch": 0.9255368529059683, "grad_norm": 17.762008666992188, "learning_rate": 1.9562023622692132e-06, "loss": 0.0822, "num_input_tokens_seen": 25544576, "step": 37885 }, { "epoch": 0.9256590037378154, "grad_norm": 0.22601798176765442, "learning_rate": 1.9561773974923774e-06, "loss": 0.0883, "num_input_tokens_seen": 25548288, "step": 37890 }, { "epoch": 0.9257811545696626, "grad_norm": 20.17107582092285, "learning_rate": 1.9561524257619617e-06, "loss": 0.0122, "num_input_tokens_seen": 25552128, "step": 37895 }, { "epoch": 0.9259033054015098, "grad_norm": 43.49441146850586, "learning_rate": 1.9561274470781485e-06, "loss": 0.1289, "num_input_tokens_seen": 25555712, "step": 37900 }, { "epoch": 0.926025456233357, "grad_norm": 20.65340232849121, "learning_rate": 1.9561024614411197e-06, "loss": 0.3542, "num_input_tokens_seen": 25558848, "step": 37905 }, { "epoch": 0.9261476070652042, "grad_norm": 24.24153709411621, "learning_rate": 1.956077468851056e-06, "loss": 0.1869, "num_input_tokens_seen": 25562176, "step": 37910 }, { "epoch": 0.9262697578970512, "grad_norm": 27.763818740844727, "learning_rate": 1.9560524693081405e-06, "loss": 0.1002, "num_input_tokens_seen": 25565248, "step": 37915 }, { "epoch": 0.9263919087288984, "grad_norm": 27.984113693237305, "learning_rate": 1.956027462812554e-06, "loss": 0.2334, "num_input_tokens_seen": 25568128, "step": 37920 }, { "epoch": 0.9265140595607456, "grad_norm": 24.396575927734375, "learning_rate": 1.9560024493644786e-06, "loss": 0.0868, "num_input_tokens_seen": 25571072, "step": 37925 }, { "epoch": 0.9266362103925928, "grad_norm": 19.87697982788086, "learning_rate": 1.955977428964096e-06, "loss": 0.1437, "num_input_tokens_seen": 25574528, "step": 37930 }, { "epoch": 0.9267583612244399, "grad_norm": 15.917418479919434, "learning_rate": 1.9559524016115887e-06, "loss": 0.159, "num_input_tokens_seen": 25577920, "step": 37935 }, { "epoch": 0.9268805120562871, "grad_norm": 6.159919261932373, "learning_rate": 1.9559273673071384e-06, "loss": 0.1351, "num_input_tokens_seen": 25581312, "step": 37940 }, { "epoch": 0.9270026628881343, "grad_norm": 9.192456245422363, "learning_rate": 1.955902326050927e-06, "loss": 0.0348, "num_input_tokens_seen": 25584768, "step": 37945 }, { "epoch": 0.9271248137199815, "grad_norm": 0.9583147764205933, "learning_rate": 1.9558772778431373e-06, "loss": 0.0667, "num_input_tokens_seen": 25588096, "step": 37950 }, { "epoch": 0.9272469645518286, "grad_norm": 0.444980263710022, "learning_rate": 1.9558522226839506e-06, "loss": 0.0897, "num_input_tokens_seen": 25591744, "step": 37955 }, { "epoch": 0.9273691153836757, "grad_norm": 77.95771789550781, "learning_rate": 1.955827160573549e-06, "loss": 0.1594, "num_input_tokens_seen": 25595264, "step": 37960 }, { "epoch": 0.9274912662155229, "grad_norm": 11.438605308532715, "learning_rate": 1.9558020915121157e-06, "loss": 0.1122, "num_input_tokens_seen": 25598464, "step": 37965 }, { "epoch": 0.9276134170473701, "grad_norm": 2.1093642711639404, "learning_rate": 1.9557770154998326e-06, "loss": 0.1329, "num_input_tokens_seen": 25601856, "step": 37970 }, { "epoch": 0.9277355678792173, "grad_norm": 14.986445426940918, "learning_rate": 1.9557519325368818e-06, "loss": 0.2071, "num_input_tokens_seen": 25605440, "step": 37975 }, { "epoch": 0.9278577187110644, "grad_norm": 72.62251281738281, "learning_rate": 1.955726842623446e-06, "loss": 0.121, "num_input_tokens_seen": 25608704, "step": 37980 }, { "epoch": 0.9279798695429116, "grad_norm": 28.621503829956055, "learning_rate": 1.9557017457597073e-06, "loss": 0.1347, "num_input_tokens_seen": 25611904, "step": 37985 }, { "epoch": 0.9281020203747588, "grad_norm": 38.730899810791016, "learning_rate": 1.9556766419458487e-06, "loss": 0.1338, "num_input_tokens_seen": 25614976, "step": 37990 }, { "epoch": 0.9282241712066059, "grad_norm": 14.130492210388184, "learning_rate": 1.955651531182052e-06, "loss": 0.0874, "num_input_tokens_seen": 25618496, "step": 37995 }, { "epoch": 0.9283463220384531, "grad_norm": 13.125405311584473, "learning_rate": 1.955626413468501e-06, "loss": 0.1874, "num_input_tokens_seen": 25621504, "step": 38000 }, { "epoch": 0.9284684728703002, "grad_norm": 24.891145706176758, "learning_rate": 1.9556012888053775e-06, "loss": 0.1164, "num_input_tokens_seen": 25625088, "step": 38005 }, { "epoch": 0.9285906237021474, "grad_norm": 17.112619400024414, "learning_rate": 1.955576157192864e-06, "loss": 0.2076, "num_input_tokens_seen": 25628480, "step": 38010 }, { "epoch": 0.9287127745339946, "grad_norm": 14.229789733886719, "learning_rate": 1.9555510186311445e-06, "loss": 0.135, "num_input_tokens_seen": 25632000, "step": 38015 }, { "epoch": 0.9288349253658418, "grad_norm": 7.512895584106445, "learning_rate": 1.9555258731204e-06, "loss": 0.1191, "num_input_tokens_seen": 25635264, "step": 38020 }, { "epoch": 0.9289570761976889, "grad_norm": 17.83555030822754, "learning_rate": 1.955500720660815e-06, "loss": 0.2239, "num_input_tokens_seen": 25638656, "step": 38025 }, { "epoch": 0.929079227029536, "grad_norm": 10.23780632019043, "learning_rate": 1.9554755612525716e-06, "loss": 0.0959, "num_input_tokens_seen": 25641920, "step": 38030 }, { "epoch": 0.9292013778613832, "grad_norm": 21.389467239379883, "learning_rate": 1.9554503948958525e-06, "loss": 0.074, "num_input_tokens_seen": 25645760, "step": 38035 }, { "epoch": 0.9293235286932304, "grad_norm": 12.27301025390625, "learning_rate": 1.955425221590842e-06, "loss": 0.149, "num_input_tokens_seen": 25649216, "step": 38040 }, { "epoch": 0.9294456795250776, "grad_norm": 1.4360170364379883, "learning_rate": 1.9554000413377218e-06, "loss": 0.0588, "num_input_tokens_seen": 25652160, "step": 38045 }, { "epoch": 0.9295678303569247, "grad_norm": 23.02863883972168, "learning_rate": 1.9553748541366755e-06, "loss": 0.1236, "num_input_tokens_seen": 25655424, "step": 38050 }, { "epoch": 0.9296899811887719, "grad_norm": 13.383890151977539, "learning_rate": 1.9553496599878865e-06, "loss": 0.0575, "num_input_tokens_seen": 25659072, "step": 38055 }, { "epoch": 0.9298121320206191, "grad_norm": 11.99791431427002, "learning_rate": 1.9553244588915375e-06, "loss": 0.0297, "num_input_tokens_seen": 25662080, "step": 38060 }, { "epoch": 0.9299342828524663, "grad_norm": 0.2585084140300751, "learning_rate": 1.9552992508478124e-06, "loss": 0.0378, "num_input_tokens_seen": 25665152, "step": 38065 }, { "epoch": 0.9300564336843133, "grad_norm": 19.530658721923828, "learning_rate": 1.955274035856894e-06, "loss": 0.1424, "num_input_tokens_seen": 25668800, "step": 38070 }, { "epoch": 0.9301785845161605, "grad_norm": 14.212098121643066, "learning_rate": 1.955248813918966e-06, "loss": 0.1317, "num_input_tokens_seen": 25672064, "step": 38075 }, { "epoch": 0.9303007353480077, "grad_norm": 3.2175631523132324, "learning_rate": 1.9552235850342115e-06, "loss": 0.073, "num_input_tokens_seen": 25675520, "step": 38080 }, { "epoch": 0.9304228861798549, "grad_norm": 83.01322174072266, "learning_rate": 1.955198349202814e-06, "loss": 0.1656, "num_input_tokens_seen": 25678912, "step": 38085 }, { "epoch": 0.9305450370117021, "grad_norm": 1.3272134065628052, "learning_rate": 1.9551731064249577e-06, "loss": 0.0914, "num_input_tokens_seen": 25682112, "step": 38090 }, { "epoch": 0.9306671878435492, "grad_norm": 22.99603271484375, "learning_rate": 1.9551478567008254e-06, "loss": 0.2259, "num_input_tokens_seen": 25685120, "step": 38095 }, { "epoch": 0.9307893386753964, "grad_norm": 1.4854393005371094, "learning_rate": 1.955122600030601e-06, "loss": 0.1399, "num_input_tokens_seen": 25687936, "step": 38100 }, { "epoch": 0.9309114895072436, "grad_norm": 9.913447380065918, "learning_rate": 1.9550973364144683e-06, "loss": 0.2016, "num_input_tokens_seen": 25691264, "step": 38105 }, { "epoch": 0.9310336403390908, "grad_norm": 12.232608795166016, "learning_rate": 1.9550720658526106e-06, "loss": 0.2067, "num_input_tokens_seen": 25694528, "step": 38110 }, { "epoch": 0.9311557911709378, "grad_norm": 4.132025241851807, "learning_rate": 1.9550467883452123e-06, "loss": 0.0582, "num_input_tokens_seen": 25698112, "step": 38115 }, { "epoch": 0.931277942002785, "grad_norm": 8.121847152709961, "learning_rate": 1.955021503892457e-06, "loss": 0.1722, "num_input_tokens_seen": 25701440, "step": 38120 }, { "epoch": 0.9314000928346322, "grad_norm": 8.623042106628418, "learning_rate": 1.9549962124945276e-06, "loss": 0.1415, "num_input_tokens_seen": 25704640, "step": 38125 }, { "epoch": 0.9315222436664794, "grad_norm": 0.715641438961029, "learning_rate": 1.9549709141516097e-06, "loss": 0.1866, "num_input_tokens_seen": 25707904, "step": 38130 }, { "epoch": 0.9316443944983266, "grad_norm": 1.868943452835083, "learning_rate": 1.9549456088638863e-06, "loss": 0.1155, "num_input_tokens_seen": 25710912, "step": 38135 }, { "epoch": 0.9317665453301737, "grad_norm": 9.373357772827148, "learning_rate": 1.954920296631541e-06, "loss": 0.0837, "num_input_tokens_seen": 25714560, "step": 38140 }, { "epoch": 0.9318886961620209, "grad_norm": 10.353510856628418, "learning_rate": 1.9548949774547593e-06, "loss": 0.0481, "num_input_tokens_seen": 25717824, "step": 38145 }, { "epoch": 0.932010846993868, "grad_norm": 30.065528869628906, "learning_rate": 1.954869651333724e-06, "loss": 0.1506, "num_input_tokens_seen": 25720896, "step": 38150 }, { "epoch": 0.9321329978257152, "grad_norm": 10.553356170654297, "learning_rate": 1.95484431826862e-06, "loss": 0.0912, "num_input_tokens_seen": 25724160, "step": 38155 }, { "epoch": 0.9322551486575623, "grad_norm": 1.0528043508529663, "learning_rate": 1.9548189782596308e-06, "loss": 0.0612, "num_input_tokens_seen": 25727360, "step": 38160 }, { "epoch": 0.9323772994894095, "grad_norm": 1.273256540298462, "learning_rate": 1.9547936313069416e-06, "loss": 0.134, "num_input_tokens_seen": 25730752, "step": 38165 }, { "epoch": 0.9324994503212567, "grad_norm": 22.34047508239746, "learning_rate": 1.9547682774107368e-06, "loss": 0.1049, "num_input_tokens_seen": 25733888, "step": 38170 }, { "epoch": 0.9326216011531039, "grad_norm": 0.28361839056015015, "learning_rate": 1.954742916571199e-06, "loss": 0.0668, "num_input_tokens_seen": 25737216, "step": 38175 }, { "epoch": 0.932743751984951, "grad_norm": 21.001340866088867, "learning_rate": 1.954717548788515e-06, "loss": 0.045, "num_input_tokens_seen": 25740096, "step": 38180 }, { "epoch": 0.9328659028167982, "grad_norm": 44.88008499145508, "learning_rate": 1.954692174062868e-06, "loss": 0.141, "num_input_tokens_seen": 25743040, "step": 38185 }, { "epoch": 0.9329880536486453, "grad_norm": 27.96893310546875, "learning_rate": 1.9546667923944424e-06, "loss": 0.2554, "num_input_tokens_seen": 25746368, "step": 38190 }, { "epoch": 0.9331102044804925, "grad_norm": 14.065231323242188, "learning_rate": 1.954641403783423e-06, "loss": 0.115, "num_input_tokens_seen": 25749568, "step": 38195 }, { "epoch": 0.9332323553123397, "grad_norm": 0.27887821197509766, "learning_rate": 1.9546160082299952e-06, "loss": 0.1343, "num_input_tokens_seen": 25753216, "step": 38200 }, { "epoch": 0.9333545061441868, "grad_norm": 0.0033274723682552576, "learning_rate": 1.954590605734343e-06, "loss": 0.1828, "num_input_tokens_seen": 25756800, "step": 38205 }, { "epoch": 0.933476656976034, "grad_norm": 7.273612976074219, "learning_rate": 1.9545651962966507e-06, "loss": 0.0638, "num_input_tokens_seen": 25759872, "step": 38210 }, { "epoch": 0.9335988078078812, "grad_norm": 6.034097671508789, "learning_rate": 1.9545397799171034e-06, "loss": 0.0519, "num_input_tokens_seen": 25763456, "step": 38215 }, { "epoch": 0.9337209586397284, "grad_norm": 8.82055950164795, "learning_rate": 1.9545143565958865e-06, "loss": 0.076, "num_input_tokens_seen": 25766720, "step": 38220 }, { "epoch": 0.9338431094715754, "grad_norm": 21.622636795043945, "learning_rate": 1.954488926333184e-06, "loss": 0.0628, "num_input_tokens_seen": 25769728, "step": 38225 }, { "epoch": 0.9339652603034226, "grad_norm": 6.863642692565918, "learning_rate": 1.954463489129182e-06, "loss": 0.0699, "num_input_tokens_seen": 25773056, "step": 38230 }, { "epoch": 0.9340874111352698, "grad_norm": 14.245474815368652, "learning_rate": 1.9544380449840645e-06, "loss": 0.1508, "num_input_tokens_seen": 25776640, "step": 38235 }, { "epoch": 0.934209561967117, "grad_norm": 11.309356689453125, "learning_rate": 1.9544125938980164e-06, "loss": 0.0959, "num_input_tokens_seen": 25779776, "step": 38240 }, { "epoch": 0.9343317127989642, "grad_norm": 0.8810875415802002, "learning_rate": 1.9543871358712237e-06, "loss": 0.0699, "num_input_tokens_seen": 25782848, "step": 38245 }, { "epoch": 0.9344538636308113, "grad_norm": 20.196170806884766, "learning_rate": 1.954361670903871e-06, "loss": 0.0802, "num_input_tokens_seen": 25786048, "step": 38250 }, { "epoch": 0.9345760144626585, "grad_norm": 2.611332416534424, "learning_rate": 1.9543361989961432e-06, "loss": 0.1007, "num_input_tokens_seen": 25789184, "step": 38255 }, { "epoch": 0.9346981652945057, "grad_norm": 14.635353088378906, "learning_rate": 1.954310720148226e-06, "loss": 0.1763, "num_input_tokens_seen": 25792704, "step": 38260 }, { "epoch": 0.9348203161263529, "grad_norm": 0.8673586845397949, "learning_rate": 1.954285234360305e-06, "loss": 0.1349, "num_input_tokens_seen": 25796224, "step": 38265 }, { "epoch": 0.9349424669581999, "grad_norm": 19.911422729492188, "learning_rate": 1.9542597416325647e-06, "loss": 0.262, "num_input_tokens_seen": 25799424, "step": 38270 }, { "epoch": 0.9350646177900471, "grad_norm": 11.518757820129395, "learning_rate": 1.954234241965191e-06, "loss": 0.1736, "num_input_tokens_seen": 25802880, "step": 38275 }, { "epoch": 0.9351867686218943, "grad_norm": 39.08585739135742, "learning_rate": 1.9542087353583694e-06, "loss": 0.1193, "num_input_tokens_seen": 25806400, "step": 38280 }, { "epoch": 0.9353089194537415, "grad_norm": 17.186824798583984, "learning_rate": 1.9541832218122846e-06, "loss": 0.0837, "num_input_tokens_seen": 25809856, "step": 38285 }, { "epoch": 0.9354310702855887, "grad_norm": 12.267500877380371, "learning_rate": 1.9541577013271233e-06, "loss": 0.0656, "num_input_tokens_seen": 25812992, "step": 38290 }, { "epoch": 0.9355532211174358, "grad_norm": 13.140397071838379, "learning_rate": 1.9541321739030703e-06, "loss": 0.0843, "num_input_tokens_seen": 25816128, "step": 38295 }, { "epoch": 0.935675371949283, "grad_norm": 26.44617462158203, "learning_rate": 1.954106639540312e-06, "loss": 0.1442, "num_input_tokens_seen": 25819392, "step": 38300 }, { "epoch": 0.9357975227811302, "grad_norm": 1.3397661447525024, "learning_rate": 1.954081098239033e-06, "loss": 0.055, "num_input_tokens_seen": 25822912, "step": 38305 }, { "epoch": 0.9359196736129773, "grad_norm": 12.671271324157715, "learning_rate": 1.9540555499994197e-06, "loss": 0.2042, "num_input_tokens_seen": 25826048, "step": 38310 }, { "epoch": 0.9360418244448244, "grad_norm": 22.01853370666504, "learning_rate": 1.954029994821658e-06, "loss": 0.2489, "num_input_tokens_seen": 25829120, "step": 38315 }, { "epoch": 0.9361639752766716, "grad_norm": 12.638045310974121, "learning_rate": 1.9540044327059336e-06, "loss": 0.1075, "num_input_tokens_seen": 25832320, "step": 38320 }, { "epoch": 0.9362861261085188, "grad_norm": 13.656826972961426, "learning_rate": 1.9539788636524326e-06, "loss": 0.1052, "num_input_tokens_seen": 25835584, "step": 38325 }, { "epoch": 0.936408276940366, "grad_norm": 0.3497765064239502, "learning_rate": 1.9539532876613404e-06, "loss": 0.0676, "num_input_tokens_seen": 25838720, "step": 38330 }, { "epoch": 0.9365304277722132, "grad_norm": 18.81248664855957, "learning_rate": 1.9539277047328433e-06, "loss": 0.0935, "num_input_tokens_seen": 25842304, "step": 38335 }, { "epoch": 0.9366525786040603, "grad_norm": 0.9873834848403931, "learning_rate": 1.9539021148671274e-06, "loss": 0.0629, "num_input_tokens_seen": 25845504, "step": 38340 }, { "epoch": 0.9367747294359074, "grad_norm": 25.43363380432129, "learning_rate": 1.953876518064379e-06, "loss": 0.2716, "num_input_tokens_seen": 25848512, "step": 38345 }, { "epoch": 0.9368968802677546, "grad_norm": 2.0168118476867676, "learning_rate": 1.9538509143247834e-06, "loss": 0.0158, "num_input_tokens_seen": 25851968, "step": 38350 }, { "epoch": 0.9370190310996018, "grad_norm": 8.315200805664062, "learning_rate": 1.953825303648528e-06, "loss": 0.0655, "num_input_tokens_seen": 25854976, "step": 38355 }, { "epoch": 0.9371411819314489, "grad_norm": 2.901071786880493, "learning_rate": 1.9537996860357983e-06, "loss": 0.0877, "num_input_tokens_seen": 25858176, "step": 38360 }, { "epoch": 0.9372633327632961, "grad_norm": 19.516382217407227, "learning_rate": 1.9537740614867806e-06, "loss": 0.077, "num_input_tokens_seen": 25862080, "step": 38365 }, { "epoch": 0.9373854835951433, "grad_norm": 10.567694664001465, "learning_rate": 1.953748430001661e-06, "loss": 0.1235, "num_input_tokens_seen": 25865472, "step": 38370 }, { "epoch": 0.9375076344269905, "grad_norm": 28.031982421875, "learning_rate": 1.9537227915806273e-06, "loss": 0.2081, "num_input_tokens_seen": 25869376, "step": 38375 }, { "epoch": 0.9376297852588377, "grad_norm": 27.6629638671875, "learning_rate": 1.953697146223864e-06, "loss": 0.0825, "num_input_tokens_seen": 25872576, "step": 38380 }, { "epoch": 0.9377519360906847, "grad_norm": 21.2484130859375, "learning_rate": 1.953671493931559e-06, "loss": 0.1068, "num_input_tokens_seen": 25875776, "step": 38385 }, { "epoch": 0.9378740869225319, "grad_norm": 5.103590488433838, "learning_rate": 1.9536458347038986e-06, "loss": 0.127, "num_input_tokens_seen": 25879232, "step": 38390 }, { "epoch": 0.9379962377543791, "grad_norm": 17.484661102294922, "learning_rate": 1.9536201685410687e-06, "loss": 0.1061, "num_input_tokens_seen": 25882112, "step": 38395 }, { "epoch": 0.9381183885862263, "grad_norm": 5.376446723937988, "learning_rate": 1.9535944954432564e-06, "loss": 0.0392, "num_input_tokens_seen": 25885568, "step": 38400 }, { "epoch": 0.9382405394180734, "grad_norm": 10.969557762145996, "learning_rate": 1.953568815410649e-06, "loss": 0.0759, "num_input_tokens_seen": 25889088, "step": 38405 }, { "epoch": 0.9383626902499206, "grad_norm": 39.0699577331543, "learning_rate": 1.953543128443432e-06, "loss": 0.1701, "num_input_tokens_seen": 25892480, "step": 38410 }, { "epoch": 0.9384848410817678, "grad_norm": 19.995424270629883, "learning_rate": 1.9535174345417936e-06, "loss": 0.106, "num_input_tokens_seen": 25895296, "step": 38415 }, { "epoch": 0.938606991913615, "grad_norm": 10.260697364807129, "learning_rate": 1.9534917337059194e-06, "loss": 0.152, "num_input_tokens_seen": 25898944, "step": 38420 }, { "epoch": 0.938729142745462, "grad_norm": 14.371946334838867, "learning_rate": 1.9534660259359976e-06, "loss": 0.2075, "num_input_tokens_seen": 25902208, "step": 38425 }, { "epoch": 0.9388512935773092, "grad_norm": 1.466731071472168, "learning_rate": 1.9534403112322137e-06, "loss": 0.0861, "num_input_tokens_seen": 25905216, "step": 38430 }, { "epoch": 0.9389734444091564, "grad_norm": 17.70195770263672, "learning_rate": 1.9534145895947557e-06, "loss": 0.1235, "num_input_tokens_seen": 25908352, "step": 38435 }, { "epoch": 0.9390955952410036, "grad_norm": 8.004560470581055, "learning_rate": 1.95338886102381e-06, "loss": 0.1441, "num_input_tokens_seen": 25912128, "step": 38440 }, { "epoch": 0.9392177460728508, "grad_norm": 12.48849105834961, "learning_rate": 1.9533631255195643e-06, "loss": 0.0406, "num_input_tokens_seen": 25915456, "step": 38445 }, { "epoch": 0.9393398969046979, "grad_norm": 19.446035385131836, "learning_rate": 1.9533373830822056e-06, "loss": 0.0632, "num_input_tokens_seen": 25918784, "step": 38450 }, { "epoch": 0.9394620477365451, "grad_norm": 17.027902603149414, "learning_rate": 1.953311633711921e-06, "loss": 0.1228, "num_input_tokens_seen": 25922048, "step": 38455 }, { "epoch": 0.9395841985683923, "grad_norm": 14.439995765686035, "learning_rate": 1.953285877408898e-06, "loss": 0.0834, "num_input_tokens_seen": 25925120, "step": 38460 }, { "epoch": 0.9397063494002394, "grad_norm": 1.492466926574707, "learning_rate": 1.9532601141733232e-06, "loss": 0.0435, "num_input_tokens_seen": 25928832, "step": 38465 }, { "epoch": 0.9398285002320865, "grad_norm": 1.7578264474868774, "learning_rate": 1.953234344005385e-06, "loss": 0.0766, "num_input_tokens_seen": 25932224, "step": 38470 }, { "epoch": 0.9399506510639337, "grad_norm": 17.704456329345703, "learning_rate": 1.95320856690527e-06, "loss": 0.1421, "num_input_tokens_seen": 25935616, "step": 38475 }, { "epoch": 0.9400728018957809, "grad_norm": 20.333786010742188, "learning_rate": 1.953182782873166e-06, "loss": 0.147, "num_input_tokens_seen": 25938688, "step": 38480 }, { "epoch": 0.9401949527276281, "grad_norm": 17.079853057861328, "learning_rate": 1.95315699190926e-06, "loss": 0.0807, "num_input_tokens_seen": 25942080, "step": 38485 }, { "epoch": 0.9403171035594753, "grad_norm": 10.847369194030762, "learning_rate": 1.9531311940137404e-06, "loss": 0.1887, "num_input_tokens_seen": 25945344, "step": 38490 }, { "epoch": 0.9404392543913224, "grad_norm": 8.297080993652344, "learning_rate": 1.9531053891867944e-06, "loss": 0.1246, "num_input_tokens_seen": 25949056, "step": 38495 }, { "epoch": 0.9405614052231696, "grad_norm": 13.134936332702637, "learning_rate": 1.9530795774286096e-06, "loss": 0.072, "num_input_tokens_seen": 25952640, "step": 38500 }, { "epoch": 0.9406835560550167, "grad_norm": 5.523425579071045, "learning_rate": 1.9530537587393735e-06, "loss": 0.1588, "num_input_tokens_seen": 25955776, "step": 38505 }, { "epoch": 0.9408057068868639, "grad_norm": 17.084697723388672, "learning_rate": 1.9530279331192747e-06, "loss": 0.0621, "num_input_tokens_seen": 25958976, "step": 38510 }, { "epoch": 0.940927857718711, "grad_norm": 23.85431480407715, "learning_rate": 1.9530021005685e-06, "loss": 0.0771, "num_input_tokens_seen": 25962624, "step": 38515 }, { "epoch": 0.9410500085505582, "grad_norm": 20.88892364501953, "learning_rate": 1.952976261087238e-06, "loss": 0.0954, "num_input_tokens_seen": 25965824, "step": 38520 }, { "epoch": 0.9411721593824054, "grad_norm": 15.951557159423828, "learning_rate": 1.9529504146756757e-06, "loss": 0.0415, "num_input_tokens_seen": 25969536, "step": 38525 }, { "epoch": 0.9412943102142526, "grad_norm": 2.573011636734009, "learning_rate": 1.952924561334002e-06, "loss": 0.0443, "num_input_tokens_seen": 25972672, "step": 38530 }, { "epoch": 0.9414164610460998, "grad_norm": 0.32983043789863586, "learning_rate": 1.952898701062405e-06, "loss": 0.1361, "num_input_tokens_seen": 25976000, "step": 38535 }, { "epoch": 0.9415386118779469, "grad_norm": 12.805581092834473, "learning_rate": 1.952872833861072e-06, "loss": 0.128, "num_input_tokens_seen": 25979840, "step": 38540 }, { "epoch": 0.941660762709794, "grad_norm": 9.775003433227539, "learning_rate": 1.9528469597301915e-06, "loss": 0.0814, "num_input_tokens_seen": 25982720, "step": 38545 }, { "epoch": 0.9417829135416412, "grad_norm": 5.329129219055176, "learning_rate": 1.9528210786699516e-06, "loss": 0.0383, "num_input_tokens_seen": 25986112, "step": 38550 }, { "epoch": 0.9419050643734884, "grad_norm": 1.2658426761627197, "learning_rate": 1.9527951906805405e-06, "loss": 0.1121, "num_input_tokens_seen": 25989376, "step": 38555 }, { "epoch": 0.9420272152053355, "grad_norm": 1.926620364189148, "learning_rate": 1.9527692957621466e-06, "loss": 0.051, "num_input_tokens_seen": 25992832, "step": 38560 }, { "epoch": 0.9421493660371827, "grad_norm": 0.40974071621894836, "learning_rate": 1.952743393914958e-06, "loss": 0.1363, "num_input_tokens_seen": 25996160, "step": 38565 }, { "epoch": 0.9422715168690299, "grad_norm": 18.52965545654297, "learning_rate": 1.952717485139163e-06, "loss": 0.0882, "num_input_tokens_seen": 25999424, "step": 38570 }, { "epoch": 0.9423936677008771, "grad_norm": 25.709993362426758, "learning_rate": 1.9526915694349508e-06, "loss": 0.2171, "num_input_tokens_seen": 26003200, "step": 38575 }, { "epoch": 0.9425158185327243, "grad_norm": 25.383703231811523, "learning_rate": 1.9526656468025087e-06, "loss": 0.1181, "num_input_tokens_seen": 26006400, "step": 38580 }, { "epoch": 0.9426379693645713, "grad_norm": 46.224788665771484, "learning_rate": 1.9526397172420262e-06, "loss": 0.1448, "num_input_tokens_seen": 26010560, "step": 38585 }, { "epoch": 0.9427601201964185, "grad_norm": 11.900120735168457, "learning_rate": 1.9526137807536914e-06, "loss": 0.1124, "num_input_tokens_seen": 26013952, "step": 38590 }, { "epoch": 0.9428822710282657, "grad_norm": 3.8370819091796875, "learning_rate": 1.9525878373376925e-06, "loss": 0.1023, "num_input_tokens_seen": 26017536, "step": 38595 }, { "epoch": 0.9430044218601129, "grad_norm": 3.7985072135925293, "learning_rate": 1.952561886994219e-06, "loss": 0.0568, "num_input_tokens_seen": 26020992, "step": 38600 }, { "epoch": 0.94312657269196, "grad_norm": 14.372045516967773, "learning_rate": 1.952535929723459e-06, "loss": 0.1432, "num_input_tokens_seen": 26024192, "step": 38605 }, { "epoch": 0.9432487235238072, "grad_norm": 13.311012268066406, "learning_rate": 1.9525099655256017e-06, "loss": 0.1304, "num_input_tokens_seen": 26027264, "step": 38610 }, { "epoch": 0.9433708743556544, "grad_norm": 37.545902252197266, "learning_rate": 1.9524839944008356e-06, "loss": 0.2119, "num_input_tokens_seen": 26030784, "step": 38615 }, { "epoch": 0.9434930251875016, "grad_norm": 2.7573108673095703, "learning_rate": 1.9524580163493504e-06, "loss": 0.1403, "num_input_tokens_seen": 26034880, "step": 38620 }, { "epoch": 0.9436151760193487, "grad_norm": 6.662299156188965, "learning_rate": 1.9524320313713333e-06, "loss": 0.1103, "num_input_tokens_seen": 26038464, "step": 38625 }, { "epoch": 0.9437373268511958, "grad_norm": 0.49036359786987305, "learning_rate": 1.952406039466975e-06, "loss": 0.1217, "num_input_tokens_seen": 26041920, "step": 38630 }, { "epoch": 0.943859477683043, "grad_norm": 1.0396976470947266, "learning_rate": 1.9523800406364637e-06, "loss": 0.0789, "num_input_tokens_seen": 26045376, "step": 38635 }, { "epoch": 0.9439816285148902, "grad_norm": 3.0205793380737305, "learning_rate": 1.952354034879988e-06, "loss": 0.1065, "num_input_tokens_seen": 26048576, "step": 38640 }, { "epoch": 0.9441037793467374, "grad_norm": 33.249732971191406, "learning_rate": 1.9523280221977383e-06, "loss": 0.0972, "num_input_tokens_seen": 26051712, "step": 38645 }, { "epoch": 0.9442259301785845, "grad_norm": 9.237648010253906, "learning_rate": 1.9523020025899027e-06, "loss": 0.1515, "num_input_tokens_seen": 26054784, "step": 38650 }, { "epoch": 0.9443480810104317, "grad_norm": 12.187664031982422, "learning_rate": 1.952275976056671e-06, "loss": 0.0833, "num_input_tokens_seen": 26057792, "step": 38655 }, { "epoch": 0.9444702318422789, "grad_norm": 26.437292098999023, "learning_rate": 1.9522499425982325e-06, "loss": 0.1098, "num_input_tokens_seen": 26061120, "step": 38660 }, { "epoch": 0.944592382674126, "grad_norm": 9.192244529724121, "learning_rate": 1.9522239022147756e-06, "loss": 0.0668, "num_input_tokens_seen": 26064128, "step": 38665 }, { "epoch": 0.9447145335059732, "grad_norm": 23.44373893737793, "learning_rate": 1.952197854906491e-06, "loss": 0.1106, "num_input_tokens_seen": 26067840, "step": 38670 }, { "epoch": 0.9448366843378203, "grad_norm": 10.469454765319824, "learning_rate": 1.9521718006735673e-06, "loss": 0.1118, "num_input_tokens_seen": 26071040, "step": 38675 }, { "epoch": 0.9449588351696675, "grad_norm": 1.4989031553268433, "learning_rate": 1.952145739516194e-06, "loss": 0.0134, "num_input_tokens_seen": 26074752, "step": 38680 }, { "epoch": 0.9450809860015147, "grad_norm": 1.131640076637268, "learning_rate": 1.9521196714345607e-06, "loss": 0.1698, "num_input_tokens_seen": 26078080, "step": 38685 }, { "epoch": 0.9452031368333619, "grad_norm": 2.65283465385437, "learning_rate": 1.9520935964288574e-06, "loss": 0.0761, "num_input_tokens_seen": 26082496, "step": 38690 }, { "epoch": 0.945325287665209, "grad_norm": 15.337343215942383, "learning_rate": 1.9520675144992734e-06, "loss": 0.0998, "num_input_tokens_seen": 26085952, "step": 38695 }, { "epoch": 0.9454474384970561, "grad_norm": 16.278797149658203, "learning_rate": 1.952041425645998e-06, "loss": 0.0951, "num_input_tokens_seen": 26089280, "step": 38700 }, { "epoch": 0.9455695893289033, "grad_norm": 0.27807071805000305, "learning_rate": 1.9520153298692215e-06, "loss": 0.1408, "num_input_tokens_seen": 26092352, "step": 38705 }, { "epoch": 0.9456917401607505, "grad_norm": 5.690235137939453, "learning_rate": 1.9519892271691335e-06, "loss": 0.1871, "num_input_tokens_seen": 26095872, "step": 38710 }, { "epoch": 0.9458138909925976, "grad_norm": 31.965085983276367, "learning_rate": 1.951963117545924e-06, "loss": 0.1672, "num_input_tokens_seen": 26099200, "step": 38715 }, { "epoch": 0.9459360418244448, "grad_norm": 12.31370735168457, "learning_rate": 1.9519370009997825e-06, "loss": 0.1707, "num_input_tokens_seen": 26102272, "step": 38720 }, { "epoch": 0.946058192656292, "grad_norm": 2.0016095638275146, "learning_rate": 1.951910877530899e-06, "loss": 0.0986, "num_input_tokens_seen": 26105792, "step": 38725 }, { "epoch": 0.9461803434881392, "grad_norm": 3.6056859493255615, "learning_rate": 1.9518847471394633e-06, "loss": 0.0627, "num_input_tokens_seen": 26109760, "step": 38730 }, { "epoch": 0.9463024943199864, "grad_norm": 7.464121341705322, "learning_rate": 1.951858609825666e-06, "loss": 0.0619, "num_input_tokens_seen": 26112960, "step": 38735 }, { "epoch": 0.9464246451518334, "grad_norm": 14.840576171875, "learning_rate": 1.9518324655896967e-06, "loss": 0.0992, "num_input_tokens_seen": 26115968, "step": 38740 }, { "epoch": 0.9465467959836806, "grad_norm": 1.0094752311706543, "learning_rate": 1.9518063144317457e-06, "loss": 0.1284, "num_input_tokens_seen": 26119360, "step": 38745 }, { "epoch": 0.9466689468155278, "grad_norm": 14.64438247680664, "learning_rate": 1.9517801563520037e-06, "loss": 0.0379, "num_input_tokens_seen": 26122816, "step": 38750 }, { "epoch": 0.946791097647375, "grad_norm": 0.7392638325691223, "learning_rate": 1.95175399135066e-06, "loss": 0.0435, "num_input_tokens_seen": 26126400, "step": 38755 }, { "epoch": 0.9469132484792221, "grad_norm": 39.79733657836914, "learning_rate": 1.951727819427905e-06, "loss": 0.1659, "num_input_tokens_seen": 26129472, "step": 38760 }, { "epoch": 0.9470353993110693, "grad_norm": 43.59321212768555, "learning_rate": 1.9517016405839296e-06, "loss": 0.2924, "num_input_tokens_seen": 26132672, "step": 38765 }, { "epoch": 0.9471575501429165, "grad_norm": 23.428409576416016, "learning_rate": 1.951675454818924e-06, "loss": 0.1363, "num_input_tokens_seen": 26136064, "step": 38770 }, { "epoch": 0.9472797009747637, "grad_norm": 17.838272094726562, "learning_rate": 1.9516492621330785e-06, "loss": 0.193, "num_input_tokens_seen": 26139520, "step": 38775 }, { "epoch": 0.9474018518066109, "grad_norm": 13.88142204284668, "learning_rate": 1.9516230625265835e-06, "loss": 0.1092, "num_input_tokens_seen": 26143360, "step": 38780 }, { "epoch": 0.9475240026384579, "grad_norm": 3.652280569076538, "learning_rate": 1.9515968559996295e-06, "loss": 0.1376, "num_input_tokens_seen": 26146624, "step": 38785 }, { "epoch": 0.9476461534703051, "grad_norm": 16.653409957885742, "learning_rate": 1.9515706425524075e-06, "loss": 0.1381, "num_input_tokens_seen": 26149952, "step": 38790 }, { "epoch": 0.9477683043021523, "grad_norm": 11.144438743591309, "learning_rate": 1.9515444221851075e-06, "loss": 0.1833, "num_input_tokens_seen": 26153088, "step": 38795 }, { "epoch": 0.9478904551339995, "grad_norm": 1.472706913948059, "learning_rate": 1.951518194897921e-06, "loss": 0.0472, "num_input_tokens_seen": 26157312, "step": 38800 }, { "epoch": 0.9480126059658466, "grad_norm": 2.764831304550171, "learning_rate": 1.9514919606910378e-06, "loss": 0.062, "num_input_tokens_seen": 26160320, "step": 38805 }, { "epoch": 0.9481347567976938, "grad_norm": 7.44785737991333, "learning_rate": 1.9514657195646492e-06, "loss": 0.0837, "num_input_tokens_seen": 26163968, "step": 38810 }, { "epoch": 0.948256907629541, "grad_norm": 5.181417465209961, "learning_rate": 1.9514394715189464e-06, "loss": 0.0822, "num_input_tokens_seen": 26167296, "step": 38815 }, { "epoch": 0.9483790584613881, "grad_norm": 0.2639814615249634, "learning_rate": 1.9514132165541194e-06, "loss": 0.0713, "num_input_tokens_seen": 26170816, "step": 38820 }, { "epoch": 0.9485012092932353, "grad_norm": 1.6539617776870728, "learning_rate": 1.95138695467036e-06, "loss": 0.109, "num_input_tokens_seen": 26174720, "step": 38825 }, { "epoch": 0.9486233601250824, "grad_norm": 16.51839828491211, "learning_rate": 1.951360685867858e-06, "loss": 0.073, "num_input_tokens_seen": 26177984, "step": 38830 }, { "epoch": 0.9487455109569296, "grad_norm": 27.56429100036621, "learning_rate": 1.951334410146806e-06, "loss": 0.0984, "num_input_tokens_seen": 26181568, "step": 38835 }, { "epoch": 0.9488676617887768, "grad_norm": 15.006582260131836, "learning_rate": 1.951308127507394e-06, "loss": 0.1195, "num_input_tokens_seen": 26185024, "step": 38840 }, { "epoch": 0.948989812620624, "grad_norm": 11.61131477355957, "learning_rate": 1.9512818379498135e-06, "loss": 0.1248, "num_input_tokens_seen": 26188352, "step": 38845 }, { "epoch": 0.9491119634524711, "grad_norm": 18.227680206298828, "learning_rate": 1.9512555414742557e-06, "loss": 0.0851, "num_input_tokens_seen": 26191616, "step": 38850 }, { "epoch": 0.9492341142843183, "grad_norm": 0.6630993485450745, "learning_rate": 1.9512292380809116e-06, "loss": 0.1412, "num_input_tokens_seen": 26195840, "step": 38855 }, { "epoch": 0.9493562651161654, "grad_norm": 0.3163129687309265, "learning_rate": 1.9512029277699726e-06, "loss": 0.1485, "num_input_tokens_seen": 26199296, "step": 38860 }, { "epoch": 0.9494784159480126, "grad_norm": 43.869964599609375, "learning_rate": 1.95117661054163e-06, "loss": 0.2412, "num_input_tokens_seen": 26202304, "step": 38865 }, { "epoch": 0.9496005667798598, "grad_norm": 1.4705532789230347, "learning_rate": 1.9511502863960755e-06, "loss": 0.1503, "num_input_tokens_seen": 26205568, "step": 38870 }, { "epoch": 0.9497227176117069, "grad_norm": 29.34637451171875, "learning_rate": 1.9511239553334998e-06, "loss": 0.0839, "num_input_tokens_seen": 26208960, "step": 38875 }, { "epoch": 0.9498448684435541, "grad_norm": 7.019904613494873, "learning_rate": 1.9510976173540953e-06, "loss": 0.0943, "num_input_tokens_seen": 26212032, "step": 38880 }, { "epoch": 0.9499670192754013, "grad_norm": 18.73554229736328, "learning_rate": 1.951071272458053e-06, "loss": 0.1021, "num_input_tokens_seen": 26216064, "step": 38885 }, { "epoch": 0.9500891701072485, "grad_norm": 4.497707843780518, "learning_rate": 1.9510449206455644e-06, "loss": 0.0941, "num_input_tokens_seen": 26219136, "step": 38890 }, { "epoch": 0.9502113209390955, "grad_norm": 0.6487525105476379, "learning_rate": 1.9510185619168216e-06, "loss": 0.0121, "num_input_tokens_seen": 26222336, "step": 38895 }, { "epoch": 0.9503334717709427, "grad_norm": 23.14124298095703, "learning_rate": 1.9509921962720163e-06, "loss": 0.1273, "num_input_tokens_seen": 26225408, "step": 38900 }, { "epoch": 0.9504556226027899, "grad_norm": 1.1773689985275269, "learning_rate": 1.9509658237113394e-06, "loss": 0.0294, "num_input_tokens_seen": 26228480, "step": 38905 }, { "epoch": 0.9505777734346371, "grad_norm": 2.263134717941284, "learning_rate": 1.9509394442349836e-06, "loss": 0.1051, "num_input_tokens_seen": 26231488, "step": 38910 }, { "epoch": 0.9506999242664843, "grad_norm": 1.4543462991714478, "learning_rate": 1.9509130578431405e-06, "loss": 0.1074, "num_input_tokens_seen": 26234944, "step": 38915 }, { "epoch": 0.9508220750983314, "grad_norm": 25.77239990234375, "learning_rate": 1.9508866645360018e-06, "loss": 0.1077, "num_input_tokens_seen": 26238208, "step": 38920 }, { "epoch": 0.9509442259301786, "grad_norm": 44.907447814941406, "learning_rate": 1.9508602643137593e-06, "loss": 0.1287, "num_input_tokens_seen": 26241344, "step": 38925 }, { "epoch": 0.9510663767620258, "grad_norm": 25.440441131591797, "learning_rate": 1.950833857176605e-06, "loss": 0.0629, "num_input_tokens_seen": 26244544, "step": 38930 }, { "epoch": 0.951188527593873, "grad_norm": 2.791712999343872, "learning_rate": 1.9508074431247316e-06, "loss": 0.0859, "num_input_tokens_seen": 26247744, "step": 38935 }, { "epoch": 0.95131067842572, "grad_norm": 26.376630783081055, "learning_rate": 1.950781022158331e-06, "loss": 0.1654, "num_input_tokens_seen": 26251392, "step": 38940 }, { "epoch": 0.9514328292575672, "grad_norm": 8.187178611755371, "learning_rate": 1.950754594277594e-06, "loss": 0.0961, "num_input_tokens_seen": 26254592, "step": 38945 }, { "epoch": 0.9515549800894144, "grad_norm": 0.9078636765480042, "learning_rate": 1.9507281594827147e-06, "loss": 0.0185, "num_input_tokens_seen": 26258176, "step": 38950 }, { "epoch": 0.9516771309212616, "grad_norm": 1.331112027168274, "learning_rate": 1.9507017177738845e-06, "loss": 0.0591, "num_input_tokens_seen": 26261888, "step": 38955 }, { "epoch": 0.9517992817531087, "grad_norm": 0.7054332494735718, "learning_rate": 1.9506752691512955e-06, "loss": 0.0263, "num_input_tokens_seen": 26264768, "step": 38960 }, { "epoch": 0.9519214325849559, "grad_norm": 1.5211477279663086, "learning_rate": 1.9506488136151403e-06, "loss": 0.0116, "num_input_tokens_seen": 26268096, "step": 38965 }, { "epoch": 0.9520435834168031, "grad_norm": 39.79738235473633, "learning_rate": 1.9506223511656113e-06, "loss": 0.3299, "num_input_tokens_seen": 26271488, "step": 38970 }, { "epoch": 0.9521657342486503, "grad_norm": 0.21544885635375977, "learning_rate": 1.9505958818029006e-06, "loss": 0.2123, "num_input_tokens_seen": 26274624, "step": 38975 }, { "epoch": 0.9522878850804974, "grad_norm": 2.099090337753296, "learning_rate": 1.9505694055272012e-06, "loss": 0.0553, "num_input_tokens_seen": 26277824, "step": 38980 }, { "epoch": 0.9524100359123445, "grad_norm": 31.99783706665039, "learning_rate": 1.9505429223387055e-06, "loss": 0.0606, "num_input_tokens_seen": 26280960, "step": 38985 }, { "epoch": 0.9525321867441917, "grad_norm": 1.8545265197753906, "learning_rate": 1.9505164322376056e-06, "loss": 0.1311, "num_input_tokens_seen": 26284416, "step": 38990 }, { "epoch": 0.9526543375760389, "grad_norm": 24.32158660888672, "learning_rate": 1.950489935224095e-06, "loss": 0.2482, "num_input_tokens_seen": 26287872, "step": 38995 }, { "epoch": 0.9527764884078861, "grad_norm": 15.769022941589355, "learning_rate": 1.9504634312983655e-06, "loss": 0.1956, "num_input_tokens_seen": 26291136, "step": 39000 }, { "epoch": 0.9528986392397332, "grad_norm": 15.915133476257324, "learning_rate": 1.9504369204606107e-06, "loss": 0.1214, "num_input_tokens_seen": 26294208, "step": 39005 }, { "epoch": 0.9530207900715804, "grad_norm": 21.899938583374023, "learning_rate": 1.950410402711023e-06, "loss": 0.1255, "num_input_tokens_seen": 26297024, "step": 39010 }, { "epoch": 0.9531429409034275, "grad_norm": 10.060094833374023, "learning_rate": 1.950383878049795e-06, "loss": 0.0935, "num_input_tokens_seen": 26300736, "step": 39015 }, { "epoch": 0.9532650917352747, "grad_norm": 24.96803855895996, "learning_rate": 1.9503573464771197e-06, "loss": 0.0614, "num_input_tokens_seen": 26304128, "step": 39020 }, { "epoch": 0.9533872425671219, "grad_norm": 0.47129759192466736, "learning_rate": 1.9503308079931904e-06, "loss": 0.081, "num_input_tokens_seen": 26307264, "step": 39025 }, { "epoch": 0.953509393398969, "grad_norm": 2.5152175426483154, "learning_rate": 1.9503042625981994e-06, "loss": 0.1635, "num_input_tokens_seen": 26310784, "step": 39030 }, { "epoch": 0.9536315442308162, "grad_norm": 19.16029930114746, "learning_rate": 1.9502777102923407e-06, "loss": 0.1404, "num_input_tokens_seen": 26313856, "step": 39035 }, { "epoch": 0.9537536950626634, "grad_norm": 1.4574499130249023, "learning_rate": 1.950251151075807e-06, "loss": 0.038, "num_input_tokens_seen": 26317312, "step": 39040 }, { "epoch": 0.9538758458945106, "grad_norm": 0.34809938073158264, "learning_rate": 1.950224584948791e-06, "loss": 0.2091, "num_input_tokens_seen": 26320704, "step": 39045 }, { "epoch": 0.9539979967263577, "grad_norm": 0.9987199902534485, "learning_rate": 1.9501980119114863e-06, "loss": 0.0838, "num_input_tokens_seen": 26323968, "step": 39050 }, { "epoch": 0.9541201475582048, "grad_norm": 0.7526427507400513, "learning_rate": 1.9501714319640863e-06, "loss": 0.0687, "num_input_tokens_seen": 26327232, "step": 39055 }, { "epoch": 0.954242298390052, "grad_norm": 8.890820503234863, "learning_rate": 1.950144845106784e-06, "loss": 0.1605, "num_input_tokens_seen": 26330432, "step": 39060 }, { "epoch": 0.9543644492218992, "grad_norm": 3.9952099323272705, "learning_rate": 1.950118251339773e-06, "loss": 0.1054, "num_input_tokens_seen": 26333760, "step": 39065 }, { "epoch": 0.9544866000537464, "grad_norm": 3.4970591068267822, "learning_rate": 1.950091650663246e-06, "loss": 0.0807, "num_input_tokens_seen": 26337280, "step": 39070 }, { "epoch": 0.9546087508855935, "grad_norm": 0.29328471422195435, "learning_rate": 1.950065043077397e-06, "loss": 0.0553, "num_input_tokens_seen": 26340800, "step": 39075 }, { "epoch": 0.9547309017174407, "grad_norm": 0.6970962882041931, "learning_rate": 1.95003842858242e-06, "loss": 0.0795, "num_input_tokens_seen": 26344320, "step": 39080 }, { "epoch": 0.9548530525492879, "grad_norm": 4.4035444259643555, "learning_rate": 1.9500118071785072e-06, "loss": 0.1429, "num_input_tokens_seen": 26348864, "step": 39085 }, { "epoch": 0.9549752033811351, "grad_norm": 39.284542083740234, "learning_rate": 1.949985178865854e-06, "loss": 0.0758, "num_input_tokens_seen": 26352192, "step": 39090 }, { "epoch": 0.9550973542129821, "grad_norm": 0.41842687129974365, "learning_rate": 1.9499585436446522e-06, "loss": 0.0745, "num_input_tokens_seen": 26355136, "step": 39095 }, { "epoch": 0.9552195050448293, "grad_norm": 39.18877410888672, "learning_rate": 1.949931901515097e-06, "loss": 0.1496, "num_input_tokens_seen": 26358464, "step": 39100 }, { "epoch": 0.9553416558766765, "grad_norm": 3.0865681171417236, "learning_rate": 1.949905252477381e-06, "loss": 0.0301, "num_input_tokens_seen": 26362112, "step": 39105 }, { "epoch": 0.9554638067085237, "grad_norm": 43.33049774169922, "learning_rate": 1.949878596531699e-06, "loss": 0.2, "num_input_tokens_seen": 26365824, "step": 39110 }, { "epoch": 0.9555859575403709, "grad_norm": 26.941396713256836, "learning_rate": 1.9498519336782445e-06, "loss": 0.0912, "num_input_tokens_seen": 26369280, "step": 39115 }, { "epoch": 0.955708108372218, "grad_norm": 56.721160888671875, "learning_rate": 1.9498252639172107e-06, "loss": 0.0706, "num_input_tokens_seen": 26372672, "step": 39120 }, { "epoch": 0.9558302592040652, "grad_norm": 8.66711139678955, "learning_rate": 1.9497985872487926e-06, "loss": 0.1052, "num_input_tokens_seen": 26376000, "step": 39125 }, { "epoch": 0.9559524100359124, "grad_norm": 0.9714411497116089, "learning_rate": 1.949771903673183e-06, "loss": 0.1322, "num_input_tokens_seen": 26379264, "step": 39130 }, { "epoch": 0.9560745608677595, "grad_norm": 0.6735852360725403, "learning_rate": 1.949745213190577e-06, "loss": 0.1274, "num_input_tokens_seen": 26382592, "step": 39135 }, { "epoch": 0.9561967116996066, "grad_norm": 1.7462108135223389, "learning_rate": 1.9497185158011687e-06, "loss": 0.0558, "num_input_tokens_seen": 26385536, "step": 39140 }, { "epoch": 0.9563188625314538, "grad_norm": 42.575992584228516, "learning_rate": 1.9496918115051516e-06, "loss": 0.0973, "num_input_tokens_seen": 26389248, "step": 39145 }, { "epoch": 0.956441013363301, "grad_norm": 22.976924896240234, "learning_rate": 1.9496651003027204e-06, "loss": 0.1796, "num_input_tokens_seen": 26392384, "step": 39150 }, { "epoch": 0.9565631641951482, "grad_norm": 12.775447845458984, "learning_rate": 1.949638382194069e-06, "loss": 0.1227, "num_input_tokens_seen": 26395520, "step": 39155 }, { "epoch": 0.9566853150269954, "grad_norm": 7.335485458374023, "learning_rate": 1.949611657179392e-06, "loss": 0.0913, "num_input_tokens_seen": 26399104, "step": 39160 }, { "epoch": 0.9568074658588425, "grad_norm": 0.8135385513305664, "learning_rate": 1.9495849252588835e-06, "loss": 0.0726, "num_input_tokens_seen": 26402560, "step": 39165 }, { "epoch": 0.9569296166906897, "grad_norm": 0.200018972158432, "learning_rate": 1.9495581864327378e-06, "loss": 0.2176, "num_input_tokens_seen": 26406272, "step": 39170 }, { "epoch": 0.9570517675225368, "grad_norm": 5.996309280395508, "learning_rate": 1.94953144070115e-06, "loss": 0.1195, "num_input_tokens_seen": 26409536, "step": 39175 }, { "epoch": 0.957173918354384, "grad_norm": 33.560302734375, "learning_rate": 1.949504688064314e-06, "loss": 0.0522, "num_input_tokens_seen": 26412736, "step": 39180 }, { "epoch": 0.9572960691862311, "grad_norm": 7.365058898925781, "learning_rate": 1.949477928522424e-06, "loss": 0.1914, "num_input_tokens_seen": 26416320, "step": 39185 }, { "epoch": 0.9574182200180783, "grad_norm": 4.897334575653076, "learning_rate": 1.949451162075676e-06, "loss": 0.0892, "num_input_tokens_seen": 26419456, "step": 39190 }, { "epoch": 0.9575403708499255, "grad_norm": 0.699462354183197, "learning_rate": 1.9494243887242634e-06, "loss": 0.0619, "num_input_tokens_seen": 26423488, "step": 39195 }, { "epoch": 0.9576625216817727, "grad_norm": 0.3113349378108978, "learning_rate": 1.9493976084683814e-06, "loss": 0.136, "num_input_tokens_seen": 26427264, "step": 39200 }, { "epoch": 0.9577846725136199, "grad_norm": 0.5005393028259277, "learning_rate": 1.949370821308224e-06, "loss": 0.0504, "num_input_tokens_seen": 26430592, "step": 39205 }, { "epoch": 0.957906823345467, "grad_norm": 24.41374969482422, "learning_rate": 1.9493440272439873e-06, "loss": 0.0703, "num_input_tokens_seen": 26433856, "step": 39210 }, { "epoch": 0.9580289741773141, "grad_norm": 20.471982955932617, "learning_rate": 1.9493172262758656e-06, "loss": 0.1319, "num_input_tokens_seen": 26436992, "step": 39215 }, { "epoch": 0.9581511250091613, "grad_norm": 34.05270767211914, "learning_rate": 1.9492904184040532e-06, "loss": 0.1383, "num_input_tokens_seen": 26440128, "step": 39220 }, { "epoch": 0.9582732758410085, "grad_norm": 6.32810640335083, "learning_rate": 1.9492636036287457e-06, "loss": 0.0417, "num_input_tokens_seen": 26443584, "step": 39225 }, { "epoch": 0.9583954266728556, "grad_norm": 22.411970138549805, "learning_rate": 1.9492367819501383e-06, "loss": 0.0836, "num_input_tokens_seen": 26446912, "step": 39230 }, { "epoch": 0.9585175775047028, "grad_norm": 18.24030876159668, "learning_rate": 1.9492099533684254e-06, "loss": 0.0688, "num_input_tokens_seen": 26449792, "step": 39235 }, { "epoch": 0.95863972833655, "grad_norm": 31.872873306274414, "learning_rate": 1.949183117883802e-06, "loss": 0.1123, "num_input_tokens_seen": 26453120, "step": 39240 }, { "epoch": 0.9587618791683972, "grad_norm": 22.10487174987793, "learning_rate": 1.9491562754964644e-06, "loss": 0.1026, "num_input_tokens_seen": 26456384, "step": 39245 }, { "epoch": 0.9588840300002442, "grad_norm": 0.6037797331809998, "learning_rate": 1.949129426206607e-06, "loss": 0.1303, "num_input_tokens_seen": 26460352, "step": 39250 }, { "epoch": 0.9590061808320914, "grad_norm": 67.08470916748047, "learning_rate": 1.949102570014425e-06, "loss": 0.1797, "num_input_tokens_seen": 26463680, "step": 39255 }, { "epoch": 0.9591283316639386, "grad_norm": 35.059085845947266, "learning_rate": 1.9490757069201135e-06, "loss": 0.0993, "num_input_tokens_seen": 26466816, "step": 39260 }, { "epoch": 0.9592504824957858, "grad_norm": 19.97450065612793, "learning_rate": 1.9490488369238686e-06, "loss": 0.1357, "num_input_tokens_seen": 26470016, "step": 39265 }, { "epoch": 0.959372633327633, "grad_norm": 0.4946680963039398, "learning_rate": 1.949021960025885e-06, "loss": 0.057, "num_input_tokens_seen": 26473344, "step": 39270 }, { "epoch": 0.9594947841594801, "grad_norm": 17.229921340942383, "learning_rate": 1.9489950762263584e-06, "loss": 0.0984, "num_input_tokens_seen": 26476800, "step": 39275 }, { "epoch": 0.9596169349913273, "grad_norm": 14.1148681640625, "learning_rate": 1.948968185525485e-06, "loss": 0.114, "num_input_tokens_seen": 26480384, "step": 39280 }, { "epoch": 0.9597390858231745, "grad_norm": 0.3873974680900574, "learning_rate": 1.9489412879234587e-06, "loss": 0.0964, "num_input_tokens_seen": 26483584, "step": 39285 }, { "epoch": 0.9598612366550217, "grad_norm": 11.539669036865234, "learning_rate": 1.9489143834204768e-06, "loss": 0.0955, "num_input_tokens_seen": 26487168, "step": 39290 }, { "epoch": 0.9599833874868687, "grad_norm": 0.8252934813499451, "learning_rate": 1.948887472016734e-06, "loss": 0.2023, "num_input_tokens_seen": 26490624, "step": 39295 }, { "epoch": 0.9601055383187159, "grad_norm": 0.29018840193748474, "learning_rate": 1.9488605537124267e-06, "loss": 0.0522, "num_input_tokens_seen": 26493952, "step": 39300 }, { "epoch": 0.9602276891505631, "grad_norm": 2.5185937881469727, "learning_rate": 1.94883362850775e-06, "loss": 0.0684, "num_input_tokens_seen": 26497216, "step": 39305 }, { "epoch": 0.9603498399824103, "grad_norm": 3.989112377166748, "learning_rate": 1.9488066964029e-06, "loss": 0.1094, "num_input_tokens_seen": 26500288, "step": 39310 }, { "epoch": 0.9604719908142575, "grad_norm": 11.158829689025879, "learning_rate": 1.948779757398072e-06, "loss": 0.0557, "num_input_tokens_seen": 26503680, "step": 39315 }, { "epoch": 0.9605941416461046, "grad_norm": 33.47761917114258, "learning_rate": 1.948752811493463e-06, "loss": 0.1563, "num_input_tokens_seen": 26506944, "step": 39320 }, { "epoch": 0.9607162924779518, "grad_norm": 21.618824005126953, "learning_rate": 1.9487258586892685e-06, "loss": 0.1214, "num_input_tokens_seen": 26510272, "step": 39325 }, { "epoch": 0.960838443309799, "grad_norm": 0.47546103596687317, "learning_rate": 1.948698898985684e-06, "loss": 0.1588, "num_input_tokens_seen": 26513408, "step": 39330 }, { "epoch": 0.9609605941416461, "grad_norm": 3.9736697673797607, "learning_rate": 1.948671932382906e-06, "loss": 0.1465, "num_input_tokens_seen": 26516736, "step": 39335 }, { "epoch": 0.9610827449734932, "grad_norm": 11.857345581054688, "learning_rate": 1.9486449588811304e-06, "loss": 0.0844, "num_input_tokens_seen": 26520384, "step": 39340 }, { "epoch": 0.9612048958053404, "grad_norm": 18.82761001586914, "learning_rate": 1.948617978480554e-06, "loss": 0.0408, "num_input_tokens_seen": 26523776, "step": 39345 }, { "epoch": 0.9613270466371876, "grad_norm": 38.90220642089844, "learning_rate": 1.9485909911813717e-06, "loss": 0.1468, "num_input_tokens_seen": 26527616, "step": 39350 }, { "epoch": 0.9614491974690348, "grad_norm": 5.618608474731445, "learning_rate": 1.9485639969837815e-06, "loss": 0.0336, "num_input_tokens_seen": 26530880, "step": 39355 }, { "epoch": 0.961571348300882, "grad_norm": 1.9378278255462646, "learning_rate": 1.948536995887978e-06, "loss": 0.1567, "num_input_tokens_seen": 26533952, "step": 39360 }, { "epoch": 0.9616934991327291, "grad_norm": 0.18136677145957947, "learning_rate": 1.948509987894159e-06, "loss": 0.0393, "num_input_tokens_seen": 26537216, "step": 39365 }, { "epoch": 0.9618156499645762, "grad_norm": 16.893386840820312, "learning_rate": 1.9484829730025195e-06, "loss": 0.1268, "num_input_tokens_seen": 26540672, "step": 39370 }, { "epoch": 0.9619378007964234, "grad_norm": 11.266032218933105, "learning_rate": 1.9484559512132575e-06, "loss": 0.2364, "num_input_tokens_seen": 26544000, "step": 39375 }, { "epoch": 0.9620599516282706, "grad_norm": 0.9606216549873352, "learning_rate": 1.948428922526568e-06, "loss": 0.1005, "num_input_tokens_seen": 26547264, "step": 39380 }, { "epoch": 0.9621821024601177, "grad_norm": 1.0269383192062378, "learning_rate": 1.9484018869426487e-06, "loss": 0.0725, "num_input_tokens_seen": 26550528, "step": 39385 }, { "epoch": 0.9623042532919649, "grad_norm": 13.481785774230957, "learning_rate": 1.9483748444616957e-06, "loss": 0.1236, "num_input_tokens_seen": 26553856, "step": 39390 }, { "epoch": 0.9624264041238121, "grad_norm": 1.561326265335083, "learning_rate": 1.9483477950839057e-06, "loss": 0.1777, "num_input_tokens_seen": 26557248, "step": 39395 }, { "epoch": 0.9625485549556593, "grad_norm": 28.832462310791016, "learning_rate": 1.9483207388094756e-06, "loss": 0.1267, "num_input_tokens_seen": 26560960, "step": 39400 }, { "epoch": 0.9626707057875065, "grad_norm": 15.7662992477417, "learning_rate": 1.948293675638602e-06, "loss": 0.1065, "num_input_tokens_seen": 26564288, "step": 39405 }, { "epoch": 0.9627928566193535, "grad_norm": 14.944591522216797, "learning_rate": 1.9482666055714816e-06, "loss": 0.0593, "num_input_tokens_seen": 26567616, "step": 39410 }, { "epoch": 0.9629150074512007, "grad_norm": 7.434094429016113, "learning_rate": 1.9482395286083116e-06, "loss": 0.091, "num_input_tokens_seen": 26570752, "step": 39415 }, { "epoch": 0.9630371582830479, "grad_norm": 11.297898292541504, "learning_rate": 1.948212444749289e-06, "loss": 0.1468, "num_input_tokens_seen": 26573760, "step": 39420 }, { "epoch": 0.9631593091148951, "grad_norm": 48.35212326049805, "learning_rate": 1.9481853539946098e-06, "loss": 0.192, "num_input_tokens_seen": 26576832, "step": 39425 }, { "epoch": 0.9632814599467422, "grad_norm": 19.07402801513672, "learning_rate": 1.948158256344472e-06, "loss": 0.109, "num_input_tokens_seen": 26580608, "step": 39430 }, { "epoch": 0.9634036107785894, "grad_norm": 24.01344108581543, "learning_rate": 1.948131151799072e-06, "loss": 0.085, "num_input_tokens_seen": 26583488, "step": 39435 }, { "epoch": 0.9635257616104366, "grad_norm": 16.847387313842773, "learning_rate": 1.9481040403586074e-06, "loss": 0.1276, "num_input_tokens_seen": 26587200, "step": 39440 }, { "epoch": 0.9636479124422838, "grad_norm": 12.600052833557129, "learning_rate": 1.948076922023275e-06, "loss": 0.0837, "num_input_tokens_seen": 26590656, "step": 39445 }, { "epoch": 0.963770063274131, "grad_norm": 4.261289596557617, "learning_rate": 1.948049796793273e-06, "loss": 0.1127, "num_input_tokens_seen": 26593984, "step": 39450 }, { "epoch": 0.963892214105978, "grad_norm": 28.10150909423828, "learning_rate": 1.9480226646687976e-06, "loss": 0.0991, "num_input_tokens_seen": 26597376, "step": 39455 }, { "epoch": 0.9640143649378252, "grad_norm": 29.681474685668945, "learning_rate": 1.947995525650046e-06, "loss": 0.0466, "num_input_tokens_seen": 26600256, "step": 39460 }, { "epoch": 0.9641365157696724, "grad_norm": 24.66652488708496, "learning_rate": 1.947968379737216e-06, "loss": 0.1007, "num_input_tokens_seen": 26603904, "step": 39465 }, { "epoch": 0.9642586666015196, "grad_norm": 0.7605220079421997, "learning_rate": 1.947941226930505e-06, "loss": 0.1234, "num_input_tokens_seen": 26607552, "step": 39470 }, { "epoch": 0.9643808174333667, "grad_norm": 0.7828469276428223, "learning_rate": 1.947914067230111e-06, "loss": 0.1156, "num_input_tokens_seen": 26610688, "step": 39475 }, { "epoch": 0.9645029682652139, "grad_norm": 1.205389142036438, "learning_rate": 1.9478869006362305e-06, "loss": 0.0695, "num_input_tokens_seen": 26614016, "step": 39480 }, { "epoch": 0.9646251190970611, "grad_norm": 1.7988033294677734, "learning_rate": 1.9478597271490614e-06, "loss": 0.0485, "num_input_tokens_seen": 26617408, "step": 39485 }, { "epoch": 0.9647472699289082, "grad_norm": 7.042235851287842, "learning_rate": 1.9478325467688013e-06, "loss": 0.0856, "num_input_tokens_seen": 26620736, "step": 39490 }, { "epoch": 0.9648694207607553, "grad_norm": 0.9602611660957336, "learning_rate": 1.9478053594956484e-06, "loss": 0.1242, "num_input_tokens_seen": 26624512, "step": 39495 }, { "epoch": 0.9649915715926025, "grad_norm": 2.938580274581909, "learning_rate": 1.9477781653297996e-06, "loss": 0.1077, "num_input_tokens_seen": 26627776, "step": 39500 }, { "epoch": 0.9651137224244497, "grad_norm": 43.11700439453125, "learning_rate": 1.9477509642714535e-06, "loss": 0.1278, "num_input_tokens_seen": 26631552, "step": 39505 }, { "epoch": 0.9652358732562969, "grad_norm": 20.765621185302734, "learning_rate": 1.947723756320807e-06, "loss": 0.1753, "num_input_tokens_seen": 26634560, "step": 39510 }, { "epoch": 0.9653580240881441, "grad_norm": 20.449045181274414, "learning_rate": 1.9476965414780587e-06, "loss": 0.1075, "num_input_tokens_seen": 26637632, "step": 39515 }, { "epoch": 0.9654801749199912, "grad_norm": 16.410585403442383, "learning_rate": 1.9476693197434063e-06, "loss": 0.2478, "num_input_tokens_seen": 26640832, "step": 39520 }, { "epoch": 0.9656023257518384, "grad_norm": 5.675082206726074, "learning_rate": 1.9476420911170478e-06, "loss": 0.0597, "num_input_tokens_seen": 26644096, "step": 39525 }, { "epoch": 0.9657244765836855, "grad_norm": 29.463457107543945, "learning_rate": 1.947614855599181e-06, "loss": 0.0845, "num_input_tokens_seen": 26647104, "step": 39530 }, { "epoch": 0.9658466274155327, "grad_norm": 11.038426399230957, "learning_rate": 1.947587613190004e-06, "loss": 0.0957, "num_input_tokens_seen": 26650112, "step": 39535 }, { "epoch": 0.9659687782473798, "grad_norm": 2.4716062545776367, "learning_rate": 1.947560363889715e-06, "loss": 0.0864, "num_input_tokens_seen": 26653056, "step": 39540 }, { "epoch": 0.966090929079227, "grad_norm": 4.964685440063477, "learning_rate": 1.9475331076985124e-06, "loss": 0.0976, "num_input_tokens_seen": 26656576, "step": 39545 }, { "epoch": 0.9662130799110742, "grad_norm": 19.059255599975586, "learning_rate": 1.947505844616594e-06, "loss": 0.115, "num_input_tokens_seen": 26659776, "step": 39550 }, { "epoch": 0.9663352307429214, "grad_norm": 39.80873107910156, "learning_rate": 1.9474785746441584e-06, "loss": 0.1513, "num_input_tokens_seen": 26663296, "step": 39555 }, { "epoch": 0.9664573815747686, "grad_norm": 0.6026878356933594, "learning_rate": 1.9474512977814034e-06, "loss": 0.1737, "num_input_tokens_seen": 26667008, "step": 39560 }, { "epoch": 0.9665795324066156, "grad_norm": 11.629830360412598, "learning_rate": 1.947424014028528e-06, "loss": 0.116, "num_input_tokens_seen": 26670464, "step": 39565 }, { "epoch": 0.9667016832384628, "grad_norm": 0.3201778829097748, "learning_rate": 1.9473967233857306e-06, "loss": 0.0084, "num_input_tokens_seen": 26673792, "step": 39570 }, { "epoch": 0.96682383407031, "grad_norm": 27.04475212097168, "learning_rate": 1.947369425853209e-06, "loss": 0.2801, "num_input_tokens_seen": 26676928, "step": 39575 }, { "epoch": 0.9669459849021572, "grad_norm": 0.3024914264678955, "learning_rate": 1.9473421214311624e-06, "loss": 0.0528, "num_input_tokens_seen": 26680256, "step": 39580 }, { "epoch": 0.9670681357340043, "grad_norm": 12.081028938293457, "learning_rate": 1.947314810119789e-06, "loss": 0.1016, "num_input_tokens_seen": 26683456, "step": 39585 }, { "epoch": 0.9671902865658515, "grad_norm": 9.354941368103027, "learning_rate": 1.947287491919287e-06, "loss": 0.1592, "num_input_tokens_seen": 26687040, "step": 39590 }, { "epoch": 0.9673124373976987, "grad_norm": 5.755664348602295, "learning_rate": 1.947260166829856e-06, "loss": 0.037, "num_input_tokens_seen": 26690880, "step": 39595 }, { "epoch": 0.9674345882295459, "grad_norm": 0.8709821105003357, "learning_rate": 1.9472328348516942e-06, "loss": 0.1352, "num_input_tokens_seen": 26694400, "step": 39600 }, { "epoch": 0.9675567390613931, "grad_norm": 0.0869455561041832, "learning_rate": 1.947205495985001e-06, "loss": 0.1238, "num_input_tokens_seen": 26697664, "step": 39605 }, { "epoch": 0.9676788898932401, "grad_norm": 18.37799072265625, "learning_rate": 1.947178150229974e-06, "loss": 0.0828, "num_input_tokens_seen": 26700672, "step": 39610 }, { "epoch": 0.9678010407250873, "grad_norm": 15.37241268157959, "learning_rate": 1.9471507975868133e-06, "loss": 0.0915, "num_input_tokens_seen": 26703424, "step": 39615 }, { "epoch": 0.9679231915569345, "grad_norm": 29.29631996154785, "learning_rate": 1.9471234380557166e-06, "loss": 0.0577, "num_input_tokens_seen": 26706496, "step": 39620 }, { "epoch": 0.9680453423887817, "grad_norm": 16.976232528686523, "learning_rate": 1.947096071636884e-06, "loss": 0.0826, "num_input_tokens_seen": 26710272, "step": 39625 }, { "epoch": 0.9681674932206288, "grad_norm": 0.2124803066253662, "learning_rate": 1.9470686983305137e-06, "loss": 0.2906, "num_input_tokens_seen": 26713344, "step": 39630 }, { "epoch": 0.968289644052476, "grad_norm": 14.788725852966309, "learning_rate": 1.9470413181368055e-06, "loss": 0.0887, "num_input_tokens_seen": 26717056, "step": 39635 }, { "epoch": 0.9684117948843232, "grad_norm": 2.2631194591522217, "learning_rate": 1.9470139310559575e-06, "loss": 0.1129, "num_input_tokens_seen": 26720448, "step": 39640 }, { "epoch": 0.9685339457161704, "grad_norm": 0.33006513118743896, "learning_rate": 1.9469865370881697e-06, "loss": 0.1292, "num_input_tokens_seen": 26723584, "step": 39645 }, { "epoch": 0.9686560965480175, "grad_norm": 37.54838180541992, "learning_rate": 1.946959136233641e-06, "loss": 0.1745, "num_input_tokens_seen": 26726656, "step": 39650 }, { "epoch": 0.9687782473798646, "grad_norm": 2.1035213470458984, "learning_rate": 1.946931728492571e-06, "loss": 0.0521, "num_input_tokens_seen": 26729728, "step": 39655 }, { "epoch": 0.9689003982117118, "grad_norm": 15.783363342285156, "learning_rate": 1.9469043138651593e-06, "loss": 0.1553, "num_input_tokens_seen": 26733312, "step": 39660 }, { "epoch": 0.969022549043559, "grad_norm": 10.670489311218262, "learning_rate": 1.9468768923516038e-06, "loss": 0.1097, "num_input_tokens_seen": 26736832, "step": 39665 }, { "epoch": 0.9691446998754062, "grad_norm": 27.422231674194336, "learning_rate": 1.9468494639521054e-06, "loss": 0.1189, "num_input_tokens_seen": 26740352, "step": 39670 }, { "epoch": 0.9692668507072533, "grad_norm": 0.7498286962509155, "learning_rate": 1.9468220286668627e-06, "loss": 0.0273, "num_input_tokens_seen": 26743680, "step": 39675 }, { "epoch": 0.9693890015391005, "grad_norm": 12.412466049194336, "learning_rate": 1.9467945864960756e-06, "loss": 0.1094, "num_input_tokens_seen": 26747072, "step": 39680 }, { "epoch": 0.9695111523709476, "grad_norm": 15.174398422241211, "learning_rate": 1.946767137439944e-06, "loss": 0.075, "num_input_tokens_seen": 26749824, "step": 39685 }, { "epoch": 0.9696333032027948, "grad_norm": 5.013666152954102, "learning_rate": 1.9467396814986667e-06, "loss": 0.0608, "num_input_tokens_seen": 26753792, "step": 39690 }, { "epoch": 0.969755454034642, "grad_norm": 0.363828182220459, "learning_rate": 1.946712218672444e-06, "loss": 0.0149, "num_input_tokens_seen": 26757056, "step": 39695 }, { "epoch": 0.9698776048664891, "grad_norm": 17.466541290283203, "learning_rate": 1.9466847489614752e-06, "loss": 0.1228, "num_input_tokens_seen": 26760512, "step": 39700 }, { "epoch": 0.9699997556983363, "grad_norm": 0.7071266174316406, "learning_rate": 1.9466572723659605e-06, "loss": 0.0055, "num_input_tokens_seen": 26763712, "step": 39705 }, { "epoch": 0.9701219065301835, "grad_norm": 16.308212280273438, "learning_rate": 1.9466297888860996e-06, "loss": 0.0845, "num_input_tokens_seen": 26766848, "step": 39710 }, { "epoch": 0.9702440573620307, "grad_norm": 8.815410614013672, "learning_rate": 1.9466022985220923e-06, "loss": 0.2151, "num_input_tokens_seen": 26770240, "step": 39715 }, { "epoch": 0.9703662081938778, "grad_norm": 22.193788528442383, "learning_rate": 1.946574801274138e-06, "loss": 0.1373, "num_input_tokens_seen": 26773504, "step": 39720 }, { "epoch": 0.970488359025725, "grad_norm": 0.9276455044746399, "learning_rate": 1.9465472971424373e-06, "loss": 0.0279, "num_input_tokens_seen": 26776960, "step": 39725 }, { "epoch": 0.9706105098575721, "grad_norm": 0.254749059677124, "learning_rate": 1.9465197861271904e-06, "loss": 0.1373, "num_input_tokens_seen": 26780224, "step": 39730 }, { "epoch": 0.9707326606894193, "grad_norm": 4.772306442260742, "learning_rate": 1.9464922682285966e-06, "loss": 0.0407, "num_input_tokens_seen": 26783936, "step": 39735 }, { "epoch": 0.9708548115212665, "grad_norm": 21.679712295532227, "learning_rate": 1.946464743446857e-06, "loss": 0.1103, "num_input_tokens_seen": 26787136, "step": 39740 }, { "epoch": 0.9709769623531136, "grad_norm": 0.8357428312301636, "learning_rate": 1.9464372117821707e-06, "loss": 0.0649, "num_input_tokens_seen": 26790656, "step": 39745 }, { "epoch": 0.9710991131849608, "grad_norm": 9.33323860168457, "learning_rate": 1.9464096732347386e-06, "loss": 0.1237, "num_input_tokens_seen": 26793856, "step": 39750 }, { "epoch": 0.971221264016808, "grad_norm": 0.6812987923622131, "learning_rate": 1.9463821278047607e-06, "loss": 0.0979, "num_input_tokens_seen": 26797312, "step": 39755 }, { "epoch": 0.9713434148486552, "grad_norm": 0.21187125146389008, "learning_rate": 1.9463545754924376e-06, "loss": 0.0288, "num_input_tokens_seen": 26800896, "step": 39760 }, { "epoch": 0.9714655656805022, "grad_norm": 28.24985694885254, "learning_rate": 1.9463270162979697e-06, "loss": 0.2784, "num_input_tokens_seen": 26804032, "step": 39765 }, { "epoch": 0.9715877165123494, "grad_norm": 6.990946292877197, "learning_rate": 1.9462994502215565e-06, "loss": 0.0278, "num_input_tokens_seen": 26807424, "step": 39770 }, { "epoch": 0.9717098673441966, "grad_norm": 13.497817039489746, "learning_rate": 1.9462718772634e-06, "loss": 0.1526, "num_input_tokens_seen": 26810560, "step": 39775 }, { "epoch": 0.9718320181760438, "grad_norm": 19.58252716064453, "learning_rate": 1.9462442974236996e-06, "loss": 0.0776, "num_input_tokens_seen": 26813824, "step": 39780 }, { "epoch": 0.9719541690078909, "grad_norm": 14.153072357177734, "learning_rate": 1.946216710702656e-06, "loss": 0.1508, "num_input_tokens_seen": 26817088, "step": 39785 }, { "epoch": 0.9720763198397381, "grad_norm": 6.661437034606934, "learning_rate": 1.94618911710047e-06, "loss": 0.1146, "num_input_tokens_seen": 26820224, "step": 39790 }, { "epoch": 0.9721984706715853, "grad_norm": 24.22982406616211, "learning_rate": 1.946161516617342e-06, "loss": 0.0732, "num_input_tokens_seen": 26823488, "step": 39795 }, { "epoch": 0.9723206215034325, "grad_norm": 38.69376754760742, "learning_rate": 1.9461339092534733e-06, "loss": 0.0899, "num_input_tokens_seen": 26826752, "step": 39800 }, { "epoch": 0.9724427723352796, "grad_norm": 35.627620697021484, "learning_rate": 1.9461062950090645e-06, "loss": 0.1212, "num_input_tokens_seen": 26829952, "step": 39805 }, { "epoch": 0.9725649231671267, "grad_norm": 12.235479354858398, "learning_rate": 1.946078673884316e-06, "loss": 0.0809, "num_input_tokens_seen": 26832640, "step": 39810 }, { "epoch": 0.9726870739989739, "grad_norm": 24.54392433166504, "learning_rate": 1.9460510458794286e-06, "loss": 0.0563, "num_input_tokens_seen": 26835776, "step": 39815 }, { "epoch": 0.9728092248308211, "grad_norm": 5.012967109680176, "learning_rate": 1.9460234109946044e-06, "loss": 0.1205, "num_input_tokens_seen": 26838912, "step": 39820 }, { "epoch": 0.9729313756626683, "grad_norm": 23.825641632080078, "learning_rate": 1.9459957692300426e-06, "loss": 0.0838, "num_input_tokens_seen": 26842112, "step": 39825 }, { "epoch": 0.9730535264945154, "grad_norm": 1.0864224433898926, "learning_rate": 1.9459681205859457e-06, "loss": 0.135, "num_input_tokens_seen": 26845184, "step": 39830 }, { "epoch": 0.9731756773263626, "grad_norm": 16.24464988708496, "learning_rate": 1.945940465062514e-06, "loss": 0.0454, "num_input_tokens_seen": 26848704, "step": 39835 }, { "epoch": 0.9732978281582098, "grad_norm": 31.012704849243164, "learning_rate": 1.945912802659949e-06, "loss": 0.0868, "num_input_tokens_seen": 26852032, "step": 39840 }, { "epoch": 0.9734199789900569, "grad_norm": 37.21706008911133, "learning_rate": 1.9458851333784514e-06, "loss": 0.0862, "num_input_tokens_seen": 26855424, "step": 39845 }, { "epoch": 0.9735421298219041, "grad_norm": 30.55225944519043, "learning_rate": 1.945857457218223e-06, "loss": 0.1357, "num_input_tokens_seen": 26858944, "step": 39850 }, { "epoch": 0.9736642806537512, "grad_norm": 15.448716163635254, "learning_rate": 1.945829774179464e-06, "loss": 0.2237, "num_input_tokens_seen": 26862208, "step": 39855 }, { "epoch": 0.9737864314855984, "grad_norm": 0.22018280625343323, "learning_rate": 1.9458020842623774e-06, "loss": 0.0235, "num_input_tokens_seen": 26865728, "step": 39860 }, { "epoch": 0.9739085823174456, "grad_norm": 12.636298179626465, "learning_rate": 1.9457743874671633e-06, "loss": 0.0422, "num_input_tokens_seen": 26869312, "step": 39865 }, { "epoch": 0.9740307331492928, "grad_norm": 28.28765869140625, "learning_rate": 1.9457466837940234e-06, "loss": 0.1107, "num_input_tokens_seen": 26872704, "step": 39870 }, { "epoch": 0.9741528839811399, "grad_norm": 18.39106559753418, "learning_rate": 1.9457189732431594e-06, "loss": 0.1688, "num_input_tokens_seen": 26876032, "step": 39875 }, { "epoch": 0.974275034812987, "grad_norm": 11.407864570617676, "learning_rate": 1.9456912558147724e-06, "loss": 0.1856, "num_input_tokens_seen": 26879360, "step": 39880 }, { "epoch": 0.9743971856448342, "grad_norm": 3.375286102294922, "learning_rate": 1.9456635315090645e-06, "loss": 0.1668, "num_input_tokens_seen": 26882752, "step": 39885 }, { "epoch": 0.9745193364766814, "grad_norm": 7.9621100425720215, "learning_rate": 1.945635800326237e-06, "loss": 0.1114, "num_input_tokens_seen": 26886272, "step": 39890 }, { "epoch": 0.9746414873085286, "grad_norm": 10.258283615112305, "learning_rate": 1.9456080622664913e-06, "loss": 0.1352, "num_input_tokens_seen": 26889344, "step": 39895 }, { "epoch": 0.9747636381403757, "grad_norm": 0.11732304841279984, "learning_rate": 1.94558031733003e-06, "loss": 0.1157, "num_input_tokens_seen": 26892544, "step": 39900 }, { "epoch": 0.9748857889722229, "grad_norm": 1.7938843965530396, "learning_rate": 1.9455525655170537e-06, "loss": 0.0424, "num_input_tokens_seen": 26895936, "step": 39905 }, { "epoch": 0.9750079398040701, "grad_norm": 0.14183376729488373, "learning_rate": 1.9455248068277653e-06, "loss": 0.0701, "num_input_tokens_seen": 26899008, "step": 39910 }, { "epoch": 0.9751300906359173, "grad_norm": 19.030942916870117, "learning_rate": 1.945497041262366e-06, "loss": 0.1555, "num_input_tokens_seen": 26902208, "step": 39915 }, { "epoch": 0.9752522414677643, "grad_norm": 9.254307746887207, "learning_rate": 1.945469268821058e-06, "loss": 0.1173, "num_input_tokens_seen": 26905408, "step": 39920 }, { "epoch": 0.9753743922996115, "grad_norm": 11.228315353393555, "learning_rate": 1.945441489504043e-06, "loss": 0.1268, "num_input_tokens_seen": 26908608, "step": 39925 }, { "epoch": 0.9754965431314587, "grad_norm": 16.239818572998047, "learning_rate": 1.9454137033115234e-06, "loss": 0.0952, "num_input_tokens_seen": 26911680, "step": 39930 }, { "epoch": 0.9756186939633059, "grad_norm": 12.501836776733398, "learning_rate": 1.9453859102437007e-06, "loss": 0.1024, "num_input_tokens_seen": 26915264, "step": 39935 }, { "epoch": 0.9757408447951531, "grad_norm": 25.5905818939209, "learning_rate": 1.945358110300778e-06, "loss": 0.1325, "num_input_tokens_seen": 26918848, "step": 39940 }, { "epoch": 0.9758629956270002, "grad_norm": 15.425862312316895, "learning_rate": 1.9453303034829563e-06, "loss": 0.1706, "num_input_tokens_seen": 26922304, "step": 39945 }, { "epoch": 0.9759851464588474, "grad_norm": 1.7378052473068237, "learning_rate": 1.9453024897904387e-06, "loss": 0.0871, "num_input_tokens_seen": 26925952, "step": 39950 }, { "epoch": 0.9761072972906946, "grad_norm": 25.161956787109375, "learning_rate": 1.9452746692234267e-06, "loss": 0.2264, "num_input_tokens_seen": 26929280, "step": 39955 }, { "epoch": 0.9762294481225418, "grad_norm": 15.880363464355469, "learning_rate": 1.9452468417821235e-06, "loss": 0.0949, "num_input_tokens_seen": 26932224, "step": 39960 }, { "epoch": 0.9763515989543888, "grad_norm": 1.2579472064971924, "learning_rate": 1.945219007466731e-06, "loss": 0.1502, "num_input_tokens_seen": 26935360, "step": 39965 }, { "epoch": 0.976473749786236, "grad_norm": 8.38668155670166, "learning_rate": 1.9451911662774515e-06, "loss": 0.0737, "num_input_tokens_seen": 26938944, "step": 39970 }, { "epoch": 0.9765959006180832, "grad_norm": 8.972485542297363, "learning_rate": 1.9451633182144875e-06, "loss": 0.18, "num_input_tokens_seen": 26942336, "step": 39975 }, { "epoch": 0.9767180514499304, "grad_norm": 10.496006965637207, "learning_rate": 1.9451354632780418e-06, "loss": 0.1195, "num_input_tokens_seen": 26945664, "step": 39980 }, { "epoch": 0.9768402022817776, "grad_norm": 0.6256728768348694, "learning_rate": 1.9451076014683166e-06, "loss": 0.0777, "num_input_tokens_seen": 26949568, "step": 39985 }, { "epoch": 0.9769623531136247, "grad_norm": 13.370022773742676, "learning_rate": 1.945079732785515e-06, "loss": 0.0916, "num_input_tokens_seen": 26952960, "step": 39990 }, { "epoch": 0.9770845039454719, "grad_norm": 22.760894775390625, "learning_rate": 1.9450518572298394e-06, "loss": 0.09, "num_input_tokens_seen": 26955904, "step": 39995 }, { "epoch": 0.977206654777319, "grad_norm": 1.959449291229248, "learning_rate": 1.945023974801492e-06, "loss": 0.0635, "num_input_tokens_seen": 26959872, "step": 40000 }, { "epoch": 0.9773288056091662, "grad_norm": 0.4411034882068634, "learning_rate": 1.9449960855006766e-06, "loss": 0.1043, "num_input_tokens_seen": 26963264, "step": 40005 }, { "epoch": 0.9774509564410133, "grad_norm": 33.63774108886719, "learning_rate": 1.9449681893275956e-06, "loss": 0.0797, "num_input_tokens_seen": 26966528, "step": 40010 }, { "epoch": 0.9775731072728605, "grad_norm": 0.423960417509079, "learning_rate": 1.9449402862824512e-06, "loss": 0.0364, "num_input_tokens_seen": 26970048, "step": 40015 }, { "epoch": 0.9776952581047077, "grad_norm": 33.858604431152344, "learning_rate": 1.944912376365447e-06, "loss": 0.2936, "num_input_tokens_seen": 26972928, "step": 40020 }, { "epoch": 0.9778174089365549, "grad_norm": 0.678591251373291, "learning_rate": 1.9448844595767865e-06, "loss": 0.0958, "num_input_tokens_seen": 26976256, "step": 40025 }, { "epoch": 0.9779395597684021, "grad_norm": 13.827244758605957, "learning_rate": 1.9448565359166715e-06, "loss": 0.2362, "num_input_tokens_seen": 26979648, "step": 40030 }, { "epoch": 0.9780617106002492, "grad_norm": 17.79885482788086, "learning_rate": 1.9448286053853054e-06, "loss": 0.0897, "num_input_tokens_seen": 26982848, "step": 40035 }, { "epoch": 0.9781838614320963, "grad_norm": 9.928762435913086, "learning_rate": 1.944800667982892e-06, "loss": 0.1079, "num_input_tokens_seen": 26986176, "step": 40040 }, { "epoch": 0.9783060122639435, "grad_norm": 13.501751899719238, "learning_rate": 1.944772723709634e-06, "loss": 0.1416, "num_input_tokens_seen": 26989312, "step": 40045 }, { "epoch": 0.9784281630957907, "grad_norm": 37.86274719238281, "learning_rate": 1.9447447725657346e-06, "loss": 0.0931, "num_input_tokens_seen": 26992640, "step": 40050 }, { "epoch": 0.9785503139276378, "grad_norm": 0.30410051345825195, "learning_rate": 1.944716814551397e-06, "loss": 0.081, "num_input_tokens_seen": 26996032, "step": 40055 }, { "epoch": 0.978672464759485, "grad_norm": 19.66057014465332, "learning_rate": 1.944688849666825e-06, "loss": 0.052, "num_input_tokens_seen": 26999360, "step": 40060 }, { "epoch": 0.9787946155913322, "grad_norm": 42.28346252441406, "learning_rate": 1.944660877912221e-06, "loss": 0.1148, "num_input_tokens_seen": 27002496, "step": 40065 }, { "epoch": 0.9789167664231794, "grad_norm": 4.766364574432373, "learning_rate": 1.9446328992877896e-06, "loss": 0.0271, "num_input_tokens_seen": 27005760, "step": 40070 }, { "epoch": 0.9790389172550265, "grad_norm": 7.8795013427734375, "learning_rate": 1.944604913793733e-06, "loss": 0.224, "num_input_tokens_seen": 27009344, "step": 40075 }, { "epoch": 0.9791610680868736, "grad_norm": 15.261124610900879, "learning_rate": 1.944576921430256e-06, "loss": 0.0777, "num_input_tokens_seen": 27012480, "step": 40080 }, { "epoch": 0.9792832189187208, "grad_norm": 0.24406935274600983, "learning_rate": 1.944548922197561e-06, "loss": 0.0154, "num_input_tokens_seen": 27015552, "step": 40085 }, { "epoch": 0.979405369750568, "grad_norm": 17.68901252746582, "learning_rate": 1.9445209160958526e-06, "loss": 0.0631, "num_input_tokens_seen": 27018560, "step": 40090 }, { "epoch": 0.9795275205824152, "grad_norm": 47.638511657714844, "learning_rate": 1.9444929031253337e-06, "loss": 0.3232, "num_input_tokens_seen": 27021888, "step": 40095 }, { "epoch": 0.9796496714142623, "grad_norm": 11.55770206451416, "learning_rate": 1.944464883286209e-06, "loss": 0.0867, "num_input_tokens_seen": 27025152, "step": 40100 }, { "epoch": 0.9797718222461095, "grad_norm": 11.932427406311035, "learning_rate": 1.9444368565786813e-06, "loss": 0.0918, "num_input_tokens_seen": 27028224, "step": 40105 }, { "epoch": 0.9798939730779567, "grad_norm": 6.641244888305664, "learning_rate": 1.9444088230029548e-06, "loss": 0.0568, "num_input_tokens_seen": 27031552, "step": 40110 }, { "epoch": 0.9800161239098039, "grad_norm": 23.15635871887207, "learning_rate": 1.944380782559233e-06, "loss": 0.0588, "num_input_tokens_seen": 27035072, "step": 40115 }, { "epoch": 0.9801382747416509, "grad_norm": 10.944313049316406, "learning_rate": 1.944352735247721e-06, "loss": 0.1429, "num_input_tokens_seen": 27038784, "step": 40120 }, { "epoch": 0.9802604255734981, "grad_norm": 8.97407341003418, "learning_rate": 1.944324681068621e-06, "loss": 0.144, "num_input_tokens_seen": 27041920, "step": 40125 }, { "epoch": 0.9803825764053453, "grad_norm": 24.198251724243164, "learning_rate": 1.944296620022138e-06, "loss": 0.1506, "num_input_tokens_seen": 27045184, "step": 40130 }, { "epoch": 0.9805047272371925, "grad_norm": 3.9922945499420166, "learning_rate": 1.944268552108476e-06, "loss": 0.0484, "num_input_tokens_seen": 27048448, "step": 40135 }, { "epoch": 0.9806268780690397, "grad_norm": 24.19867706298828, "learning_rate": 1.9442404773278396e-06, "loss": 0.1924, "num_input_tokens_seen": 27051968, "step": 40140 }, { "epoch": 0.9807490289008868, "grad_norm": 0.7200508713722229, "learning_rate": 1.9442123956804323e-06, "loss": 0.1174, "num_input_tokens_seen": 27055168, "step": 40145 }, { "epoch": 0.980871179732734, "grad_norm": 12.604074478149414, "learning_rate": 1.9441843071664584e-06, "loss": 0.0951, "num_input_tokens_seen": 27058688, "step": 40150 }, { "epoch": 0.9809933305645812, "grad_norm": 14.674346923828125, "learning_rate": 1.9441562117861224e-06, "loss": 0.1467, "num_input_tokens_seen": 27061760, "step": 40155 }, { "epoch": 0.9811154813964283, "grad_norm": 6.677186489105225, "learning_rate": 1.944128109539628e-06, "loss": 0.0662, "num_input_tokens_seen": 27065344, "step": 40160 }, { "epoch": 0.9812376322282754, "grad_norm": 6.980493068695068, "learning_rate": 1.9441000004271805e-06, "loss": 0.0604, "num_input_tokens_seen": 27068352, "step": 40165 }, { "epoch": 0.9813597830601226, "grad_norm": 14.983774185180664, "learning_rate": 1.944071884448984e-06, "loss": 0.0954, "num_input_tokens_seen": 27071744, "step": 40170 }, { "epoch": 0.9814819338919698, "grad_norm": 15.543222427368164, "learning_rate": 1.9440437616052425e-06, "loss": 0.1245, "num_input_tokens_seen": 27074944, "step": 40175 }, { "epoch": 0.981604084723817, "grad_norm": 1.5099848508834839, "learning_rate": 1.944015631896161e-06, "loss": 0.0687, "num_input_tokens_seen": 27078272, "step": 40180 }, { "epoch": 0.9817262355556642, "grad_norm": 20.4335994720459, "learning_rate": 1.9439874953219437e-06, "loss": 0.1081, "num_input_tokens_seen": 27081600, "step": 40185 }, { "epoch": 0.9818483863875113, "grad_norm": 1.7662246227264404, "learning_rate": 1.9439593518827955e-06, "loss": 0.0673, "num_input_tokens_seen": 27084480, "step": 40190 }, { "epoch": 0.9819705372193585, "grad_norm": 8.809418678283691, "learning_rate": 1.9439312015789213e-06, "loss": 0.0639, "num_input_tokens_seen": 27087552, "step": 40195 }, { "epoch": 0.9820926880512056, "grad_norm": 5.3898749351501465, "learning_rate": 1.9439030444105253e-06, "loss": 0.0221, "num_input_tokens_seen": 27090496, "step": 40200 }, { "epoch": 0.9822148388830528, "grad_norm": 20.789827346801758, "learning_rate": 1.9438748803778123e-06, "loss": 0.1202, "num_input_tokens_seen": 27093888, "step": 40205 }, { "epoch": 0.9823369897148999, "grad_norm": 15.2289400100708, "learning_rate": 1.943846709480988e-06, "loss": 0.0858, "num_input_tokens_seen": 27097024, "step": 40210 }, { "epoch": 0.9824591405467471, "grad_norm": 0.4235302209854126, "learning_rate": 1.9438185317202557e-06, "loss": 0.0748, "num_input_tokens_seen": 27100544, "step": 40215 }, { "epoch": 0.9825812913785943, "grad_norm": 22.20836067199707, "learning_rate": 1.9437903470958216e-06, "loss": 0.1275, "num_input_tokens_seen": 27104000, "step": 40220 }, { "epoch": 0.9827034422104415, "grad_norm": 28.422733306884766, "learning_rate": 1.94376215560789e-06, "loss": 0.0914, "num_input_tokens_seen": 27106944, "step": 40225 }, { "epoch": 0.9828255930422887, "grad_norm": 12.55518627166748, "learning_rate": 1.9437339572566666e-06, "loss": 0.1515, "num_input_tokens_seen": 27109952, "step": 40230 }, { "epoch": 0.9829477438741357, "grad_norm": 34.5494384765625, "learning_rate": 1.9437057520423557e-06, "loss": 0.2121, "num_input_tokens_seen": 27113024, "step": 40235 }, { "epoch": 0.9830698947059829, "grad_norm": 0.9313546419143677, "learning_rate": 1.9436775399651628e-06, "loss": 0.1038, "num_input_tokens_seen": 27116160, "step": 40240 }, { "epoch": 0.9831920455378301, "grad_norm": 0.2228740155696869, "learning_rate": 1.9436493210252932e-06, "loss": 0.115, "num_input_tokens_seen": 27119424, "step": 40245 }, { "epoch": 0.9833141963696773, "grad_norm": 18.93325424194336, "learning_rate": 1.9436210952229517e-06, "loss": 0.0432, "num_input_tokens_seen": 27122368, "step": 40250 }, { "epoch": 0.9834363472015244, "grad_norm": 9.483077049255371, "learning_rate": 1.943592862558344e-06, "loss": 0.1868, "num_input_tokens_seen": 27125952, "step": 40255 }, { "epoch": 0.9835584980333716, "grad_norm": 59.740150451660156, "learning_rate": 1.943564623031675e-06, "loss": 0.1231, "num_input_tokens_seen": 27129280, "step": 40260 }, { "epoch": 0.9836806488652188, "grad_norm": 3.750798463821411, "learning_rate": 1.9435363766431504e-06, "loss": 0.1181, "num_input_tokens_seen": 27132736, "step": 40265 }, { "epoch": 0.983802799697066, "grad_norm": 0.2580016255378723, "learning_rate": 1.9435081233929755e-06, "loss": 0.016, "num_input_tokens_seen": 27136384, "step": 40270 }, { "epoch": 0.9839249505289132, "grad_norm": 15.4879150390625, "learning_rate": 1.9434798632813556e-06, "loss": 0.0667, "num_input_tokens_seen": 27139904, "step": 40275 }, { "epoch": 0.9840471013607602, "grad_norm": 1.0932239294052124, "learning_rate": 1.9434515963084965e-06, "loss": 0.078, "num_input_tokens_seen": 27143488, "step": 40280 }, { "epoch": 0.9841692521926074, "grad_norm": 28.132898330688477, "learning_rate": 1.943423322474603e-06, "loss": 0.1724, "num_input_tokens_seen": 27146688, "step": 40285 }, { "epoch": 0.9842914030244546, "grad_norm": 34.23074722290039, "learning_rate": 1.9433950417798823e-06, "loss": 0.1108, "num_input_tokens_seen": 27149824, "step": 40290 }, { "epoch": 0.9844135538563018, "grad_norm": 16.52518653869629, "learning_rate": 1.9433667542245385e-06, "loss": 0.1305, "num_input_tokens_seen": 27153280, "step": 40295 }, { "epoch": 0.9845357046881489, "grad_norm": 19.801593780517578, "learning_rate": 1.9433384598087784e-06, "loss": 0.1398, "num_input_tokens_seen": 27156416, "step": 40300 }, { "epoch": 0.9846578555199961, "grad_norm": 1.1172616481781006, "learning_rate": 1.943310158532807e-06, "loss": 0.0858, "num_input_tokens_seen": 27159616, "step": 40305 }, { "epoch": 0.9847800063518433, "grad_norm": 29.34340476989746, "learning_rate": 1.9432818503968304e-06, "loss": 0.1594, "num_input_tokens_seen": 27163072, "step": 40310 }, { "epoch": 0.9849021571836905, "grad_norm": 0.5346962809562683, "learning_rate": 1.9432535354010542e-06, "loss": 0.062, "num_input_tokens_seen": 27166400, "step": 40315 }, { "epoch": 0.9850243080155375, "grad_norm": 17.92679214477539, "learning_rate": 1.943225213545685e-06, "loss": 0.1924, "num_input_tokens_seen": 27169600, "step": 40320 }, { "epoch": 0.9851464588473847, "grad_norm": 10.966693878173828, "learning_rate": 1.9431968848309287e-06, "loss": 0.1132, "num_input_tokens_seen": 27173376, "step": 40325 }, { "epoch": 0.9852686096792319, "grad_norm": 26.890884399414062, "learning_rate": 1.9431685492569907e-06, "loss": 0.1546, "num_input_tokens_seen": 27176640, "step": 40330 }, { "epoch": 0.9853907605110791, "grad_norm": 0.18494495749473572, "learning_rate": 1.943140206824077e-06, "loss": 0.0869, "num_input_tokens_seen": 27179840, "step": 40335 }, { "epoch": 0.9855129113429263, "grad_norm": 0.3660151958465576, "learning_rate": 1.943111857532394e-06, "loss": 0.0055, "num_input_tokens_seen": 27183040, "step": 40340 }, { "epoch": 0.9856350621747734, "grad_norm": 43.12258529663086, "learning_rate": 1.943083501382148e-06, "loss": 0.1628, "num_input_tokens_seen": 27186240, "step": 40345 }, { "epoch": 0.9857572130066206, "grad_norm": 1.3598332405090332, "learning_rate": 1.9430551383735455e-06, "loss": 0.0698, "num_input_tokens_seen": 27189632, "step": 40350 }, { "epoch": 0.9858793638384677, "grad_norm": 56.717376708984375, "learning_rate": 1.943026768506792e-06, "loss": 0.1483, "num_input_tokens_seen": 27192960, "step": 40355 }, { "epoch": 0.9860015146703149, "grad_norm": 32.48219299316406, "learning_rate": 1.9429983917820944e-06, "loss": 0.1704, "num_input_tokens_seen": 27196352, "step": 40360 }, { "epoch": 0.986123665502162, "grad_norm": 9.722297668457031, "learning_rate": 1.9429700081996587e-06, "loss": 0.1303, "num_input_tokens_seen": 27199680, "step": 40365 }, { "epoch": 0.9862458163340092, "grad_norm": 9.689001083374023, "learning_rate": 1.9429416177596917e-06, "loss": 0.0881, "num_input_tokens_seen": 27203392, "step": 40370 }, { "epoch": 0.9863679671658564, "grad_norm": 11.218616485595703, "learning_rate": 1.9429132204623993e-06, "loss": 0.1059, "num_input_tokens_seen": 27206848, "step": 40375 }, { "epoch": 0.9864901179977036, "grad_norm": 4.2435688972473145, "learning_rate": 1.9428848163079884e-06, "loss": 0.1589, "num_input_tokens_seen": 27210688, "step": 40380 }, { "epoch": 0.9866122688295508, "grad_norm": 0.222267284989357, "learning_rate": 1.942856405296666e-06, "loss": 0.08, "num_input_tokens_seen": 27214016, "step": 40385 }, { "epoch": 0.9867344196613979, "grad_norm": 12.685956001281738, "learning_rate": 1.942827987428638e-06, "loss": 0.1141, "num_input_tokens_seen": 27217408, "step": 40390 }, { "epoch": 0.986856570493245, "grad_norm": 22.542917251586914, "learning_rate": 1.9427995627041107e-06, "loss": 0.2218, "num_input_tokens_seen": 27220672, "step": 40395 }, { "epoch": 0.9869787213250922, "grad_norm": 10.268771171569824, "learning_rate": 1.942771131123292e-06, "loss": 0.1277, "num_input_tokens_seen": 27224640, "step": 40400 }, { "epoch": 0.9871008721569394, "grad_norm": 1.3927884101867676, "learning_rate": 1.9427426926863876e-06, "loss": 0.0716, "num_input_tokens_seen": 27228608, "step": 40405 }, { "epoch": 0.9872230229887865, "grad_norm": 10.72795295715332, "learning_rate": 1.942714247393605e-06, "loss": 0.1121, "num_input_tokens_seen": 27231744, "step": 40410 }, { "epoch": 0.9873451738206337, "grad_norm": 5.103229522705078, "learning_rate": 1.942685795245151e-06, "loss": 0.0523, "num_input_tokens_seen": 27235712, "step": 40415 }, { "epoch": 0.9874673246524809, "grad_norm": 10.616697311401367, "learning_rate": 1.9426573362412323e-06, "loss": 0.086, "num_input_tokens_seen": 27238976, "step": 40420 }, { "epoch": 0.9875894754843281, "grad_norm": 14.616541862487793, "learning_rate": 1.942628870382056e-06, "loss": 0.2343, "num_input_tokens_seen": 27241856, "step": 40425 }, { "epoch": 0.9877116263161753, "grad_norm": 1.1679258346557617, "learning_rate": 1.942600397667829e-06, "loss": 0.1004, "num_input_tokens_seen": 27245440, "step": 40430 }, { "epoch": 0.9878337771480223, "grad_norm": 24.777894973754883, "learning_rate": 1.942571918098758e-06, "loss": 0.0552, "num_input_tokens_seen": 27248896, "step": 40435 }, { "epoch": 0.9879559279798695, "grad_norm": 25.75714683532715, "learning_rate": 1.9425434316750507e-06, "loss": 0.2256, "num_input_tokens_seen": 27252416, "step": 40440 }, { "epoch": 0.9880780788117167, "grad_norm": 11.695103645324707, "learning_rate": 1.9425149383969144e-06, "loss": 0.039, "num_input_tokens_seen": 27255808, "step": 40445 }, { "epoch": 0.9882002296435639, "grad_norm": 34.632381439208984, "learning_rate": 1.9424864382645553e-06, "loss": 0.0572, "num_input_tokens_seen": 27259072, "step": 40450 }, { "epoch": 0.988322380475411, "grad_norm": 1.0183019638061523, "learning_rate": 1.9424579312781817e-06, "loss": 0.0936, "num_input_tokens_seen": 27262528, "step": 40455 }, { "epoch": 0.9884445313072582, "grad_norm": 25.458181381225586, "learning_rate": 1.942429417438001e-06, "loss": 0.0895, "num_input_tokens_seen": 27265984, "step": 40460 }, { "epoch": 0.9885666821391054, "grad_norm": 0.9326586127281189, "learning_rate": 1.9424008967442193e-06, "loss": 0.0736, "num_input_tokens_seen": 27269632, "step": 40465 }, { "epoch": 0.9886888329709526, "grad_norm": 26.359806060791016, "learning_rate": 1.942372369197045e-06, "loss": 0.073, "num_input_tokens_seen": 27272640, "step": 40470 }, { "epoch": 0.9888109838027997, "grad_norm": 5.261368274688721, "learning_rate": 1.9423438347966857e-06, "loss": 0.0215, "num_input_tokens_seen": 27275776, "step": 40475 }, { "epoch": 0.9889331346346468, "grad_norm": 16.494407653808594, "learning_rate": 1.942315293543348e-06, "loss": 0.0869, "num_input_tokens_seen": 27278912, "step": 40480 }, { "epoch": 0.989055285466494, "grad_norm": 28.753313064575195, "learning_rate": 1.9422867454372406e-06, "loss": 0.0808, "num_input_tokens_seen": 27282624, "step": 40485 }, { "epoch": 0.9891774362983412, "grad_norm": 12.6876859664917, "learning_rate": 1.9422581904785704e-06, "loss": 0.1726, "num_input_tokens_seen": 27286016, "step": 40490 }, { "epoch": 0.9892995871301884, "grad_norm": 0.036324337124824524, "learning_rate": 1.9422296286675447e-06, "loss": 0.1635, "num_input_tokens_seen": 27289472, "step": 40495 }, { "epoch": 0.9894217379620355, "grad_norm": 0.38467368483543396, "learning_rate": 1.9422010600043722e-06, "loss": 0.0267, "num_input_tokens_seen": 27292800, "step": 40500 }, { "epoch": 0.9895438887938827, "grad_norm": 22.768394470214844, "learning_rate": 1.9421724844892606e-06, "loss": 0.222, "num_input_tokens_seen": 27295936, "step": 40505 }, { "epoch": 0.9896660396257299, "grad_norm": 27.51454734802246, "learning_rate": 1.9421439021224164e-06, "loss": 0.0899, "num_input_tokens_seen": 27299712, "step": 40510 }, { "epoch": 0.989788190457577, "grad_norm": 0.10687658190727234, "learning_rate": 1.942115312904049e-06, "loss": 0.0432, "num_input_tokens_seen": 27302912, "step": 40515 }, { "epoch": 0.9899103412894242, "grad_norm": 47.35722351074219, "learning_rate": 1.9420867168343652e-06, "loss": 0.0669, "num_input_tokens_seen": 27306688, "step": 40520 }, { "epoch": 0.9900324921212713, "grad_norm": 1.9231311082839966, "learning_rate": 1.9420581139135733e-06, "loss": 0.0034, "num_input_tokens_seen": 27310144, "step": 40525 }, { "epoch": 0.9901546429531185, "grad_norm": 2.8595128059387207, "learning_rate": 1.942029504141882e-06, "loss": 0.0698, "num_input_tokens_seen": 27313536, "step": 40530 }, { "epoch": 0.9902767937849657, "grad_norm": 8.048189163208008, "learning_rate": 1.9420008875194986e-06, "loss": 0.1137, "num_input_tokens_seen": 27317184, "step": 40535 }, { "epoch": 0.9903989446168129, "grad_norm": 20.83180046081543, "learning_rate": 1.941972264046631e-06, "loss": 0.0686, "num_input_tokens_seen": 27320704, "step": 40540 }, { "epoch": 0.99052109544866, "grad_norm": 56.8033332824707, "learning_rate": 1.941943633723488e-06, "loss": 0.3562, "num_input_tokens_seen": 27323968, "step": 40545 }, { "epoch": 0.9906432462805072, "grad_norm": 0.15781305730342865, "learning_rate": 1.9419149965502773e-06, "loss": 0.0403, "num_input_tokens_seen": 27327232, "step": 40550 }, { "epoch": 0.9907653971123543, "grad_norm": 8.692741394042969, "learning_rate": 1.9418863525272077e-06, "loss": 0.1088, "num_input_tokens_seen": 27330176, "step": 40555 }, { "epoch": 0.9908875479442015, "grad_norm": 18.1630916595459, "learning_rate": 1.941857701654487e-06, "loss": 0.0694, "num_input_tokens_seen": 27335936, "step": 40560 }, { "epoch": 0.9910096987760487, "grad_norm": 0.15101860463619232, "learning_rate": 1.9418290439323243e-06, "loss": 0.0502, "num_input_tokens_seen": 27339392, "step": 40565 }, { "epoch": 0.9911318496078958, "grad_norm": 0.8477771878242493, "learning_rate": 1.9418003793609267e-06, "loss": 0.1334, "num_input_tokens_seen": 27342976, "step": 40570 }, { "epoch": 0.991254000439743, "grad_norm": 11.633031845092773, "learning_rate": 1.941771707940504e-06, "loss": 0.0908, "num_input_tokens_seen": 27346112, "step": 40575 }, { "epoch": 0.9913761512715902, "grad_norm": 8.685921669006348, "learning_rate": 1.941743029671264e-06, "loss": 0.0786, "num_input_tokens_seen": 27349312, "step": 40580 }, { "epoch": 0.9914983021034374, "grad_norm": 13.447248458862305, "learning_rate": 1.9417143445534152e-06, "loss": 0.1647, "num_input_tokens_seen": 27352192, "step": 40585 }, { "epoch": 0.9916204529352844, "grad_norm": 33.43043899536133, "learning_rate": 1.9416856525871666e-06, "loss": 0.1494, "num_input_tokens_seen": 27355520, "step": 40590 }, { "epoch": 0.9917426037671316, "grad_norm": 0.32424071431159973, "learning_rate": 1.941656953772726e-06, "loss": 0.1283, "num_input_tokens_seen": 27359040, "step": 40595 }, { "epoch": 0.9918647545989788, "grad_norm": 22.274261474609375, "learning_rate": 1.9416282481103038e-06, "loss": 0.1341, "num_input_tokens_seen": 27361920, "step": 40600 }, { "epoch": 0.991986905430826, "grad_norm": 4.439818859100342, "learning_rate": 1.941599535600107e-06, "loss": 0.0558, "num_input_tokens_seen": 27365248, "step": 40605 }, { "epoch": 0.9921090562626731, "grad_norm": 37.596778869628906, "learning_rate": 1.9415708162423452e-06, "loss": 0.1306, "num_input_tokens_seen": 27368512, "step": 40610 }, { "epoch": 0.9922312070945203, "grad_norm": 20.270496368408203, "learning_rate": 1.9415420900372275e-06, "loss": 0.1231, "num_input_tokens_seen": 27371840, "step": 40615 }, { "epoch": 0.9923533579263675, "grad_norm": 9.090349197387695, "learning_rate": 1.9415133569849622e-06, "loss": 0.205, "num_input_tokens_seen": 27375040, "step": 40620 }, { "epoch": 0.9924755087582147, "grad_norm": 0.16481897234916687, "learning_rate": 1.9414846170857587e-06, "loss": 0.0851, "num_input_tokens_seen": 27378688, "step": 40625 }, { "epoch": 0.9925976595900619, "grad_norm": 0.8626278638839722, "learning_rate": 1.941455870339826e-06, "loss": 0.0645, "num_input_tokens_seen": 27381952, "step": 40630 }, { "epoch": 0.9927198104219089, "grad_norm": 25.085214614868164, "learning_rate": 1.9414271167473726e-06, "loss": 0.1897, "num_input_tokens_seen": 27385152, "step": 40635 }, { "epoch": 0.9928419612537561, "grad_norm": 0.8803386092185974, "learning_rate": 1.941398356308608e-06, "loss": 0.0993, "num_input_tokens_seen": 27388800, "step": 40640 }, { "epoch": 0.9929641120856033, "grad_norm": 1.2641054391860962, "learning_rate": 1.9413695890237418e-06, "loss": 0.0957, "num_input_tokens_seen": 27392512, "step": 40645 }, { "epoch": 0.9930862629174505, "grad_norm": 1.7712982892990112, "learning_rate": 1.9413408148929823e-06, "loss": 0.1266, "num_input_tokens_seen": 27397824, "step": 40650 }, { "epoch": 0.9932084137492976, "grad_norm": 30.253278732299805, "learning_rate": 1.941312033916539e-06, "loss": 0.17, "num_input_tokens_seen": 27400896, "step": 40655 }, { "epoch": 0.9933305645811448, "grad_norm": 8.815448760986328, "learning_rate": 1.941283246094622e-06, "loss": 0.1171, "num_input_tokens_seen": 27404352, "step": 40660 }, { "epoch": 0.993452715412992, "grad_norm": 7.9292683601379395, "learning_rate": 1.9412544514274395e-06, "loss": 0.0449, "num_input_tokens_seen": 27407616, "step": 40665 }, { "epoch": 0.9935748662448391, "grad_norm": 0.6572582125663757, "learning_rate": 1.941225649915202e-06, "loss": 0.1234, "num_input_tokens_seen": 27410880, "step": 40670 }, { "epoch": 0.9936970170766863, "grad_norm": 13.578548431396484, "learning_rate": 1.941196841558118e-06, "loss": 0.1513, "num_input_tokens_seen": 27414272, "step": 40675 }, { "epoch": 0.9938191679085334, "grad_norm": 1.2818429470062256, "learning_rate": 1.9411680263563976e-06, "loss": 0.154, "num_input_tokens_seen": 27417536, "step": 40680 }, { "epoch": 0.9939413187403806, "grad_norm": 2.6503143310546875, "learning_rate": 1.9411392043102502e-06, "loss": 0.1063, "num_input_tokens_seen": 27421120, "step": 40685 }, { "epoch": 0.9940634695722278, "grad_norm": 21.41456413269043, "learning_rate": 1.9411103754198852e-06, "loss": 0.0305, "num_input_tokens_seen": 27424640, "step": 40690 }, { "epoch": 0.994185620404075, "grad_norm": 11.669831275939941, "learning_rate": 1.9410815396855126e-06, "loss": 0.1007, "num_input_tokens_seen": 27427776, "step": 40695 }, { "epoch": 0.9943077712359221, "grad_norm": 10.491043090820312, "learning_rate": 1.941052697107342e-06, "loss": 0.1402, "num_input_tokens_seen": 27430976, "step": 40700 }, { "epoch": 0.9944299220677693, "grad_norm": 0.19844773411750793, "learning_rate": 1.941023847685583e-06, "loss": 0.0659, "num_input_tokens_seen": 27434368, "step": 40705 }, { "epoch": 0.9945520728996164, "grad_norm": 6.634429931640625, "learning_rate": 1.9409949914204454e-06, "loss": 0.1137, "num_input_tokens_seen": 27437696, "step": 40710 }, { "epoch": 0.9946742237314636, "grad_norm": 4.638437271118164, "learning_rate": 1.9409661283121393e-06, "loss": 0.1326, "num_input_tokens_seen": 27441088, "step": 40715 }, { "epoch": 0.9947963745633108, "grad_norm": 8.209115028381348, "learning_rate": 1.9409372583608743e-06, "loss": 0.1082, "num_input_tokens_seen": 27444672, "step": 40720 }, { "epoch": 0.9949185253951579, "grad_norm": 20.867942810058594, "learning_rate": 1.9409083815668604e-06, "loss": 0.0943, "num_input_tokens_seen": 27448256, "step": 40725 }, { "epoch": 0.9950406762270051, "grad_norm": 41.91875076293945, "learning_rate": 1.9408794979303077e-06, "loss": 0.1296, "num_input_tokens_seen": 27451904, "step": 40730 }, { "epoch": 0.9951628270588523, "grad_norm": 20.121931076049805, "learning_rate": 1.940850607451426e-06, "loss": 0.0584, "num_input_tokens_seen": 27455424, "step": 40735 }, { "epoch": 0.9952849778906995, "grad_norm": 2.706618070602417, "learning_rate": 1.940821710130426e-06, "loss": 0.2389, "num_input_tokens_seen": 27458944, "step": 40740 }, { "epoch": 0.9954071287225466, "grad_norm": 17.6815128326416, "learning_rate": 1.9407928059675176e-06, "loss": 0.1535, "num_input_tokens_seen": 27462272, "step": 40745 }, { "epoch": 0.9955292795543937, "grad_norm": 7.666195869445801, "learning_rate": 1.9407638949629102e-06, "loss": 0.1028, "num_input_tokens_seen": 27465792, "step": 40750 }, { "epoch": 0.9956514303862409, "grad_norm": 10.247875213623047, "learning_rate": 1.940734977116815e-06, "loss": 0.0626, "num_input_tokens_seen": 27469504, "step": 40755 }, { "epoch": 0.9957735812180881, "grad_norm": 5.036080360412598, "learning_rate": 1.9407060524294426e-06, "loss": 0.1577, "num_input_tokens_seen": 27472896, "step": 40760 }, { "epoch": 0.9958957320499353, "grad_norm": 14.078655242919922, "learning_rate": 1.9406771209010024e-06, "loss": 0.1161, "num_input_tokens_seen": 27476608, "step": 40765 }, { "epoch": 0.9960178828817824, "grad_norm": 20.622859954833984, "learning_rate": 1.9406481825317052e-06, "loss": 0.1274, "num_input_tokens_seen": 27480128, "step": 40770 }, { "epoch": 0.9961400337136296, "grad_norm": 0.6095002889633179, "learning_rate": 1.940619237321761e-06, "loss": 0.0476, "num_input_tokens_seen": 27483264, "step": 40775 }, { "epoch": 0.9962621845454768, "grad_norm": 15.534045219421387, "learning_rate": 1.9405902852713812e-06, "loss": 0.0756, "num_input_tokens_seen": 27486912, "step": 40780 }, { "epoch": 0.996384335377324, "grad_norm": 26.785686492919922, "learning_rate": 1.940561326380776e-06, "loss": 0.1296, "num_input_tokens_seen": 27490176, "step": 40785 }, { "epoch": 0.996506486209171, "grad_norm": 10.619406700134277, "learning_rate": 1.940532360650155e-06, "loss": 0.0765, "num_input_tokens_seen": 27493248, "step": 40790 }, { "epoch": 0.9966286370410182, "grad_norm": 10.111905097961426, "learning_rate": 1.9405033880797303e-06, "loss": 0.0651, "num_input_tokens_seen": 27496512, "step": 40795 }, { "epoch": 0.9967507878728654, "grad_norm": 1.098646640777588, "learning_rate": 1.940474408669712e-06, "loss": 0.0417, "num_input_tokens_seen": 27499776, "step": 40800 }, { "epoch": 0.9968729387047126, "grad_norm": 0.4705561697483063, "learning_rate": 1.9404454224203108e-06, "loss": 0.016, "num_input_tokens_seen": 27503168, "step": 40805 }, { "epoch": 0.9969950895365598, "grad_norm": 22.709070205688477, "learning_rate": 1.9404164293317374e-06, "loss": 0.1012, "num_input_tokens_seen": 27506304, "step": 40810 }, { "epoch": 0.9971172403684069, "grad_norm": 8.215293884277344, "learning_rate": 1.940387429404203e-06, "loss": 0.2259, "num_input_tokens_seen": 27509632, "step": 40815 }, { "epoch": 0.9972393912002541, "grad_norm": 18.94011116027832, "learning_rate": 1.940358422637918e-06, "loss": 0.0741, "num_input_tokens_seen": 27513152, "step": 40820 }, { "epoch": 0.9973615420321013, "grad_norm": 29.055315017700195, "learning_rate": 1.940329409033094e-06, "loss": 0.1664, "num_input_tokens_seen": 27516352, "step": 40825 }, { "epoch": 0.9974836928639484, "grad_norm": 22.35759735107422, "learning_rate": 1.9403003885899415e-06, "loss": 0.1112, "num_input_tokens_seen": 27519616, "step": 40830 }, { "epoch": 0.9976058436957955, "grad_norm": 1.5217119455337524, "learning_rate": 1.9402713613086716e-06, "loss": 0.0942, "num_input_tokens_seen": 27522880, "step": 40835 }, { "epoch": 0.9977279945276427, "grad_norm": 19.438976287841797, "learning_rate": 1.9402423271894952e-06, "loss": 0.2131, "num_input_tokens_seen": 27525824, "step": 40840 }, { "epoch": 0.9978501453594899, "grad_norm": 2.120493173599243, "learning_rate": 1.9402132862326242e-06, "loss": 0.0197, "num_input_tokens_seen": 27529216, "step": 40845 }, { "epoch": 0.9979722961913371, "grad_norm": 0.46346285939216614, "learning_rate": 1.940184238438269e-06, "loss": 0.0347, "num_input_tokens_seen": 27532800, "step": 40850 }, { "epoch": 0.9980944470231842, "grad_norm": 11.514608383178711, "learning_rate": 1.940155183806641e-06, "loss": 0.0903, "num_input_tokens_seen": 27536064, "step": 40855 }, { "epoch": 0.9982165978550314, "grad_norm": 22.47312355041504, "learning_rate": 1.940126122337952e-06, "loss": 0.1097, "num_input_tokens_seen": 27538880, "step": 40860 }, { "epoch": 0.9983387486868786, "grad_norm": 0.6225314140319824, "learning_rate": 1.9400970540324125e-06, "loss": 0.032, "num_input_tokens_seen": 27542144, "step": 40865 }, { "epoch": 0.9984608995187257, "grad_norm": 9.540884971618652, "learning_rate": 1.940067978890235e-06, "loss": 0.09, "num_input_tokens_seen": 27545472, "step": 40870 }, { "epoch": 0.9985830503505729, "grad_norm": 1.1357994079589844, "learning_rate": 1.9400388969116295e-06, "loss": 0.1085, "num_input_tokens_seen": 27548800, "step": 40875 }, { "epoch": 0.99870520118242, "grad_norm": 27.24406623840332, "learning_rate": 1.9400098080968087e-06, "loss": 0.1362, "num_input_tokens_seen": 27552448, "step": 40880 }, { "epoch": 0.9988273520142672, "grad_norm": 14.14842414855957, "learning_rate": 1.939980712445984e-06, "loss": 0.0632, "num_input_tokens_seen": 27556096, "step": 40885 }, { "epoch": 0.9989495028461144, "grad_norm": 10.4360933303833, "learning_rate": 1.9399516099593666e-06, "loss": 0.0939, "num_input_tokens_seen": 27559616, "step": 40890 }, { "epoch": 0.9990716536779616, "grad_norm": 21.335147857666016, "learning_rate": 1.9399225006371684e-06, "loss": 0.1567, "num_input_tokens_seen": 27562880, "step": 40895 }, { "epoch": 0.9991938045098087, "grad_norm": 2.0332093238830566, "learning_rate": 1.9398933844796006e-06, "loss": 0.2327, "num_input_tokens_seen": 27566336, "step": 40900 }, { "epoch": 0.9993159553416558, "grad_norm": 1.2199807167053223, "learning_rate": 1.9398642614868755e-06, "loss": 0.091, "num_input_tokens_seen": 27569920, "step": 40905 }, { "epoch": 0.999438106173503, "grad_norm": 7.498171806335449, "learning_rate": 1.9398351316592048e-06, "loss": 0.0299, "num_input_tokens_seen": 27573312, "step": 40910 }, { "epoch": 0.9995602570053502, "grad_norm": 16.187875747680664, "learning_rate": 1.9398059949967998e-06, "loss": 0.0406, "num_input_tokens_seen": 27577536, "step": 40915 }, { "epoch": 0.9996824078371974, "grad_norm": 0.1693330556154251, "learning_rate": 1.9397768514998736e-06, "loss": 0.0677, "num_input_tokens_seen": 27581056, "step": 40920 }, { "epoch": 0.9998045586690445, "grad_norm": 9.364876747131348, "learning_rate": 1.9397477011686366e-06, "loss": 0.1449, "num_input_tokens_seen": 27584320, "step": 40925 }, { "epoch": 0.9999267095008917, "grad_norm": 1.984630823135376, "learning_rate": 1.939718544003302e-06, "loss": 0.0949, "num_input_tokens_seen": 27587584, "step": 40930 }, { "epoch": 1.0000488603327389, "grad_norm": 0.3797398507595062, "learning_rate": 1.9396893800040813e-06, "loss": 0.1214, "num_input_tokens_seen": 27591136, "step": 40935 }, { "epoch": 1.0000732904991083, "eval_loss": 0.12240181118249893, "eval_runtime": 47.7677, "eval_samples_per_second": 761.707, "eval_steps_per_second": 95.232, "num_input_tokens_seen": 27591776, "step": 40936 }, { "epoch": 1.000171011164586, "grad_norm": 49.70656204223633, "learning_rate": 1.9396602091711864e-06, "loss": 0.0258, "num_input_tokens_seen": 27594592, "step": 40940 }, { "epoch": 1.0002931619964333, "grad_norm": 33.254581451416016, "learning_rate": 1.93963103150483e-06, "loss": 0.0086, "num_input_tokens_seen": 27598112, "step": 40945 }, { "epoch": 1.0004153128282804, "grad_norm": 15.725682258605957, "learning_rate": 1.939601847005224e-06, "loss": 0.0999, "num_input_tokens_seen": 27601696, "step": 40950 }, { "epoch": 1.0005374636601276, "grad_norm": 26.894102096557617, "learning_rate": 1.9395726556725806e-06, "loss": 0.0866, "num_input_tokens_seen": 27605536, "step": 40955 }, { "epoch": 1.0006596144919746, "grad_norm": 4.941879749298096, "learning_rate": 1.939543457507112e-06, "loss": 0.0141, "num_input_tokens_seen": 27608608, "step": 40960 }, { "epoch": 1.0007817653238218, "grad_norm": 0.27386680245399475, "learning_rate": 1.939514252509031e-06, "loss": 0.0677, "num_input_tokens_seen": 27611872, "step": 40965 }, { "epoch": 1.000903916155669, "grad_norm": 23.498291015625, "learning_rate": 1.93948504067855e-06, "loss": 0.0574, "num_input_tokens_seen": 27615264, "step": 40970 }, { "epoch": 1.0010260669875162, "grad_norm": 0.18982845544815063, "learning_rate": 1.93945582201588e-06, "loss": 0.0023, "num_input_tokens_seen": 27618464, "step": 40975 }, { "epoch": 1.0011482178193634, "grad_norm": 0.3378293514251709, "learning_rate": 1.939426596521235e-06, "loss": 0.0609, "num_input_tokens_seen": 27622112, "step": 40980 }, { "epoch": 1.0012703686512106, "grad_norm": 0.12434734404087067, "learning_rate": 1.9393973641948275e-06, "loss": 0.0448, "num_input_tokens_seen": 27625312, "step": 40985 }, { "epoch": 1.0013925194830577, "grad_norm": 28.14246368408203, "learning_rate": 1.9393681250368696e-06, "loss": 0.1127, "num_input_tokens_seen": 27628320, "step": 40990 }, { "epoch": 1.001514670314905, "grad_norm": 0.08595714718103409, "learning_rate": 1.939338879047574e-06, "loss": 0.0845, "num_input_tokens_seen": 27631712, "step": 40995 }, { "epoch": 1.001636821146752, "grad_norm": 60.2764892578125, "learning_rate": 1.9393096262271533e-06, "loss": 0.118, "num_input_tokens_seen": 27634848, "step": 41000 }, { "epoch": 1.001758971978599, "grad_norm": 0.04198668897151947, "learning_rate": 1.9392803665758206e-06, "loss": 0.0737, "num_input_tokens_seen": 27638048, "step": 41005 }, { "epoch": 1.0018811228104463, "grad_norm": 9.843551635742188, "learning_rate": 1.939251100093788e-06, "loss": 0.1221, "num_input_tokens_seen": 27641184, "step": 41010 }, { "epoch": 1.0020032736422935, "grad_norm": 0.172799751162529, "learning_rate": 1.9392218267812687e-06, "loss": 0.0264, "num_input_tokens_seen": 27644704, "step": 41015 }, { "epoch": 1.0021254244741407, "grad_norm": 1.0534111261367798, "learning_rate": 1.939192546638476e-06, "loss": 0.0347, "num_input_tokens_seen": 27648160, "step": 41020 }, { "epoch": 1.0022475753059878, "grad_norm": 0.2743847370147705, "learning_rate": 1.9391632596656224e-06, "loss": 0.0593, "num_input_tokens_seen": 27651296, "step": 41025 }, { "epoch": 1.002369726137835, "grad_norm": 0.5838879942893982, "learning_rate": 1.9391339658629212e-06, "loss": 0.0681, "num_input_tokens_seen": 27654304, "step": 41030 }, { "epoch": 1.0024918769696822, "grad_norm": 10.347956657409668, "learning_rate": 1.939104665230585e-06, "loss": 0.083, "num_input_tokens_seen": 27657248, "step": 41035 }, { "epoch": 1.0026140278015294, "grad_norm": 11.926654815673828, "learning_rate": 1.939075357768827e-06, "loss": 0.1933, "num_input_tokens_seen": 27660576, "step": 41040 }, { "epoch": 1.0027361786333764, "grad_norm": 0.05128807947039604, "learning_rate": 1.9390460434778607e-06, "loss": 0.0077, "num_input_tokens_seen": 27664032, "step": 41045 }, { "epoch": 1.0028583294652236, "grad_norm": 1.0889049768447876, "learning_rate": 1.9390167223578984e-06, "loss": 0.0751, "num_input_tokens_seen": 27667360, "step": 41050 }, { "epoch": 1.0029804802970708, "grad_norm": 9.116876602172852, "learning_rate": 1.9389873944091544e-06, "loss": 0.0029, "num_input_tokens_seen": 27671136, "step": 41055 }, { "epoch": 1.003102631128918, "grad_norm": 21.027864456176758, "learning_rate": 1.9389580596318417e-06, "loss": 0.0609, "num_input_tokens_seen": 27674976, "step": 41060 }, { "epoch": 1.0032247819607651, "grad_norm": 0.7300817370414734, "learning_rate": 1.9389287180261733e-06, "loss": 0.0066, "num_input_tokens_seen": 27678624, "step": 41065 }, { "epoch": 1.0033469327926123, "grad_norm": 23.967458724975586, "learning_rate": 1.9388993695923627e-06, "loss": 0.0896, "num_input_tokens_seen": 27681888, "step": 41070 }, { "epoch": 1.0034690836244595, "grad_norm": 4.293575763702393, "learning_rate": 1.938870014330623e-06, "loss": 0.0598, "num_input_tokens_seen": 27685728, "step": 41075 }, { "epoch": 1.0035912344563067, "grad_norm": 0.03171401470899582, "learning_rate": 1.938840652241168e-06, "loss": 0.1375, "num_input_tokens_seen": 27689376, "step": 41080 }, { "epoch": 1.003713385288154, "grad_norm": 22.17302703857422, "learning_rate": 1.938811283324212e-06, "loss": 0.1238, "num_input_tokens_seen": 27692704, "step": 41085 }, { "epoch": 1.0038355361200009, "grad_norm": 0.015287423506379128, "learning_rate": 1.9387819075799674e-06, "loss": 0.0372, "num_input_tokens_seen": 27696224, "step": 41090 }, { "epoch": 1.003957686951848, "grad_norm": 0.6027705669403076, "learning_rate": 1.9387525250086482e-06, "loss": 0.0507, "num_input_tokens_seen": 27699424, "step": 41095 }, { "epoch": 1.0040798377836953, "grad_norm": 15.068196296691895, "learning_rate": 1.938723135610468e-06, "loss": 0.0971, "num_input_tokens_seen": 27702560, "step": 41100 }, { "epoch": 1.0042019886155424, "grad_norm": 18.538557052612305, "learning_rate": 1.938693739385641e-06, "loss": 0.1414, "num_input_tokens_seen": 27706400, "step": 41105 }, { "epoch": 1.0043241394473896, "grad_norm": 0.23932264745235443, "learning_rate": 1.9386643363343806e-06, "loss": 0.0009, "num_input_tokens_seen": 27709792, "step": 41110 }, { "epoch": 1.0044462902792368, "grad_norm": 20.438920974731445, "learning_rate": 1.9386349264569004e-06, "loss": 0.0653, "num_input_tokens_seen": 27712992, "step": 41115 }, { "epoch": 1.004568441111084, "grad_norm": 0.7962263226509094, "learning_rate": 1.938605509753415e-06, "loss": 0.0738, "num_input_tokens_seen": 27716448, "step": 41120 }, { "epoch": 1.0046905919429312, "grad_norm": 17.484432220458984, "learning_rate": 1.9385760862241374e-06, "loss": 0.0532, "num_input_tokens_seen": 27719584, "step": 41125 }, { "epoch": 1.0048127427747784, "grad_norm": 0.027080107480287552, "learning_rate": 1.9385466558692825e-06, "loss": 0.0434, "num_input_tokens_seen": 27723552, "step": 41130 }, { "epoch": 1.0049348936066254, "grad_norm": 2.654306173324585, "learning_rate": 1.9385172186890636e-06, "loss": 0.002, "num_input_tokens_seen": 27726752, "step": 41135 }, { "epoch": 1.0050570444384725, "grad_norm": 48.38421630859375, "learning_rate": 1.938487774683695e-06, "loss": 0.1055, "num_input_tokens_seen": 27730336, "step": 41140 }, { "epoch": 1.0051791952703197, "grad_norm": 19.56807518005371, "learning_rate": 1.938458323853391e-06, "loss": 0.0844, "num_input_tokens_seen": 27733472, "step": 41145 }, { "epoch": 1.005301346102167, "grad_norm": 0.3958321213722229, "learning_rate": 1.9384288661983656e-06, "loss": 0.0021, "num_input_tokens_seen": 27736928, "step": 41150 }, { "epoch": 1.0054234969340141, "grad_norm": 38.7618408203125, "learning_rate": 1.938399401718833e-06, "loss": 0.1801, "num_input_tokens_seen": 27740000, "step": 41155 }, { "epoch": 1.0055456477658613, "grad_norm": 0.25790756940841675, "learning_rate": 1.938369930415008e-06, "loss": 0.0933, "num_input_tokens_seen": 27743008, "step": 41160 }, { "epoch": 1.0056677985977085, "grad_norm": 88.54906463623047, "learning_rate": 1.938340452287104e-06, "loss": 0.1244, "num_input_tokens_seen": 27746464, "step": 41165 }, { "epoch": 1.0057899494295557, "grad_norm": 0.13079579174518585, "learning_rate": 1.938310967335336e-06, "loss": 0.0529, "num_input_tokens_seen": 27749920, "step": 41170 }, { "epoch": 1.0059121002614029, "grad_norm": 1.1786655187606812, "learning_rate": 1.9382814755599184e-06, "loss": 0.04, "num_input_tokens_seen": 27753248, "step": 41175 }, { "epoch": 1.0060342510932498, "grad_norm": 1.638745903968811, "learning_rate": 1.938251976961065e-06, "loss": 0.0037, "num_input_tokens_seen": 27756512, "step": 41180 }, { "epoch": 1.006156401925097, "grad_norm": 23.87853240966797, "learning_rate": 1.9382224715389914e-06, "loss": 0.1446, "num_input_tokens_seen": 27759712, "step": 41185 }, { "epoch": 1.0062785527569442, "grad_norm": 3.2461514472961426, "learning_rate": 1.938192959293912e-06, "loss": 0.0467, "num_input_tokens_seen": 27763360, "step": 41190 }, { "epoch": 1.0064007035887914, "grad_norm": 30.97145652770996, "learning_rate": 1.9381634402260403e-06, "loss": 0.0773, "num_input_tokens_seen": 27767008, "step": 41195 }, { "epoch": 1.0065228544206386, "grad_norm": 0.04162455350160599, "learning_rate": 1.938133914335592e-06, "loss": 0.0522, "num_input_tokens_seen": 27770528, "step": 41200 }, { "epoch": 1.0066450052524858, "grad_norm": 31.05025291442871, "learning_rate": 1.9381043816227812e-06, "loss": 0.081, "num_input_tokens_seen": 27773600, "step": 41205 }, { "epoch": 1.006767156084333, "grad_norm": 2.75881290435791, "learning_rate": 1.9380748420878235e-06, "loss": 0.0333, "num_input_tokens_seen": 27776800, "step": 41210 }, { "epoch": 1.0068893069161802, "grad_norm": 0.2996499240398407, "learning_rate": 1.938045295730933e-06, "loss": 0.003, "num_input_tokens_seen": 27779872, "step": 41215 }, { "epoch": 1.0070114577480274, "grad_norm": 0.05380595102906227, "learning_rate": 1.9380157425523252e-06, "loss": 0.07, "num_input_tokens_seen": 27783264, "step": 41220 }, { "epoch": 1.0071336085798743, "grad_norm": 11.7682523727417, "learning_rate": 1.937986182552214e-06, "loss": 0.1728, "num_input_tokens_seen": 27785952, "step": 41225 }, { "epoch": 1.0072557594117215, "grad_norm": 0.028988046571612358, "learning_rate": 1.9379566157308156e-06, "loss": 0.0718, "num_input_tokens_seen": 27788832, "step": 41230 }, { "epoch": 1.0073779102435687, "grad_norm": 7.975637435913086, "learning_rate": 1.937927042088344e-06, "loss": 0.0436, "num_input_tokens_seen": 27792672, "step": 41235 }, { "epoch": 1.007500061075416, "grad_norm": 0.07105622440576553, "learning_rate": 1.937897461625015e-06, "loss": 0.0749, "num_input_tokens_seen": 27796000, "step": 41240 }, { "epoch": 1.007622211907263, "grad_norm": 0.10940805077552795, "learning_rate": 1.9378678743410432e-06, "loss": 0.0926, "num_input_tokens_seen": 27799520, "step": 41245 }, { "epoch": 1.0077443627391103, "grad_norm": 8.517210960388184, "learning_rate": 1.937838280236644e-06, "loss": 0.1212, "num_input_tokens_seen": 27802720, "step": 41250 }, { "epoch": 1.0078665135709575, "grad_norm": 0.07815233618021011, "learning_rate": 1.9378086793120323e-06, "loss": 0.0305, "num_input_tokens_seen": 27806304, "step": 41255 }, { "epoch": 1.0079886644028047, "grad_norm": 0.14662419259548187, "learning_rate": 1.937779071567424e-06, "loss": 0.0022, "num_input_tokens_seen": 27810272, "step": 41260 }, { "epoch": 1.0081108152346518, "grad_norm": 81.27674102783203, "learning_rate": 1.937749457003034e-06, "loss": 0.0504, "num_input_tokens_seen": 27813792, "step": 41265 }, { "epoch": 1.0082329660664988, "grad_norm": 0.04992228373885155, "learning_rate": 1.9377198356190775e-06, "loss": 0.0024, "num_input_tokens_seen": 27817056, "step": 41270 }, { "epoch": 1.008355116898346, "grad_norm": 0.1316540688276291, "learning_rate": 1.93769020741577e-06, "loss": 0.1432, "num_input_tokens_seen": 27820384, "step": 41275 }, { "epoch": 1.0084772677301932, "grad_norm": 26.181007385253906, "learning_rate": 1.937660572393328e-06, "loss": 0.2132, "num_input_tokens_seen": 27823904, "step": 41280 }, { "epoch": 1.0085994185620404, "grad_norm": 16.70596694946289, "learning_rate": 1.9376309305519653e-06, "loss": 0.1263, "num_input_tokens_seen": 27827104, "step": 41285 }, { "epoch": 1.0087215693938876, "grad_norm": 0.11371473222970963, "learning_rate": 1.9376012818918984e-06, "loss": 0.0712, "num_input_tokens_seen": 27830624, "step": 41290 }, { "epoch": 1.0088437202257348, "grad_norm": 0.513312816619873, "learning_rate": 1.937571626413343e-06, "loss": 0.037, "num_input_tokens_seen": 27833696, "step": 41295 }, { "epoch": 1.008965871057582, "grad_norm": 2.094923257827759, "learning_rate": 1.9375419641165143e-06, "loss": 0.0138, "num_input_tokens_seen": 27837536, "step": 41300 }, { "epoch": 1.0090880218894291, "grad_norm": 0.1514255851507187, "learning_rate": 1.9375122950016287e-06, "loss": 0.1085, "num_input_tokens_seen": 27841056, "step": 41305 }, { "epoch": 1.0092101727212763, "grad_norm": 0.4462115466594696, "learning_rate": 1.9374826190689013e-06, "loss": 0.0351, "num_input_tokens_seen": 27844448, "step": 41310 }, { "epoch": 1.0093323235531233, "grad_norm": 1.3606981039047241, "learning_rate": 1.937452936318548e-06, "loss": 0.0918, "num_input_tokens_seen": 27847520, "step": 41315 }, { "epoch": 1.0094544743849705, "grad_norm": 26.182024002075195, "learning_rate": 1.937423246750785e-06, "loss": 0.0846, "num_input_tokens_seen": 27850976, "step": 41320 }, { "epoch": 1.0095766252168177, "grad_norm": 11.671547889709473, "learning_rate": 1.937393550365828e-06, "loss": 0.0609, "num_input_tokens_seen": 27853856, "step": 41325 }, { "epoch": 1.0096987760486649, "grad_norm": 14.877876281738281, "learning_rate": 1.9373638471638925e-06, "loss": 0.0724, "num_input_tokens_seen": 27857376, "step": 41330 }, { "epoch": 1.009820926880512, "grad_norm": 4.672638893127441, "learning_rate": 1.9373341371451956e-06, "loss": 0.0478, "num_input_tokens_seen": 27860704, "step": 41335 }, { "epoch": 1.0099430777123592, "grad_norm": 31.55280876159668, "learning_rate": 1.9373044203099527e-06, "loss": 0.279, "num_input_tokens_seen": 27864096, "step": 41340 }, { "epoch": 1.0100652285442064, "grad_norm": 0.4305602014064789, "learning_rate": 1.93727469665838e-06, "loss": 0.0546, "num_input_tokens_seen": 27867616, "step": 41345 }, { "epoch": 1.0101873793760536, "grad_norm": 0.4437588155269623, "learning_rate": 1.937244966190693e-06, "loss": 0.2221, "num_input_tokens_seen": 27871520, "step": 41350 }, { "epoch": 1.0103095302079008, "grad_norm": 0.3621104955673218, "learning_rate": 1.937215228907109e-06, "loss": 0.0076, "num_input_tokens_seen": 27874720, "step": 41355 }, { "epoch": 1.0104316810397478, "grad_norm": 1.216752290725708, "learning_rate": 1.9371854848078434e-06, "loss": 0.0873, "num_input_tokens_seen": 27877600, "step": 41360 }, { "epoch": 1.010553831871595, "grad_norm": 7.180300235748291, "learning_rate": 1.9371557338931133e-06, "loss": 0.1743, "num_input_tokens_seen": 27881120, "step": 41365 }, { "epoch": 1.0106759827034422, "grad_norm": 0.08157268166542053, "learning_rate": 1.9371259761631346e-06, "loss": 0.1012, "num_input_tokens_seen": 27884832, "step": 41370 }, { "epoch": 1.0107981335352894, "grad_norm": 141.767333984375, "learning_rate": 1.9370962116181235e-06, "loss": 0.0264, "num_input_tokens_seen": 27888096, "step": 41375 }, { "epoch": 1.0109202843671365, "grad_norm": 36.21950149536133, "learning_rate": 1.9370664402582966e-06, "loss": 0.2076, "num_input_tokens_seen": 27890912, "step": 41380 }, { "epoch": 1.0110424351989837, "grad_norm": 0.8839470148086548, "learning_rate": 1.937036662083871e-06, "loss": 0.0556, "num_input_tokens_seen": 27894176, "step": 41385 }, { "epoch": 1.011164586030831, "grad_norm": 2.825273275375366, "learning_rate": 1.9370068770950626e-06, "loss": 0.1091, "num_input_tokens_seen": 27897376, "step": 41390 }, { "epoch": 1.0112867368626781, "grad_norm": 0.04474136605858803, "learning_rate": 1.936977085292088e-06, "loss": 0.0532, "num_input_tokens_seen": 27900832, "step": 41395 }, { "epoch": 1.0114088876945253, "grad_norm": 0.22467297315597534, "learning_rate": 1.936947286675164e-06, "loss": 0.0966, "num_input_tokens_seen": 27904288, "step": 41400 }, { "epoch": 1.0115310385263723, "grad_norm": 0.5134358406066895, "learning_rate": 1.9369174812445073e-06, "loss": 0.0164, "num_input_tokens_seen": 27907744, "step": 41405 }, { "epoch": 1.0116531893582195, "grad_norm": 0.31668102741241455, "learning_rate": 1.9368876690003347e-06, "loss": 0.0632, "num_input_tokens_seen": 27911456, "step": 41410 }, { "epoch": 1.0117753401900667, "grad_norm": 0.5316899418830872, "learning_rate": 1.936857849942863e-06, "loss": 0.044, "num_input_tokens_seen": 27914912, "step": 41415 }, { "epoch": 1.0118974910219138, "grad_norm": 0.7631782293319702, "learning_rate": 1.9368280240723093e-06, "loss": 0.1014, "num_input_tokens_seen": 27917856, "step": 41420 }, { "epoch": 1.012019641853761, "grad_norm": 0.3482151925563812, "learning_rate": 1.93679819138889e-06, "loss": 0.0542, "num_input_tokens_seen": 27921760, "step": 41425 }, { "epoch": 1.0121417926856082, "grad_norm": 65.02391052246094, "learning_rate": 1.9367683518928226e-06, "loss": 0.0742, "num_input_tokens_seen": 27925344, "step": 41430 }, { "epoch": 1.0122639435174554, "grad_norm": 0.10657556354999542, "learning_rate": 1.9367385055843234e-06, "loss": 0.1166, "num_input_tokens_seen": 27928672, "step": 41435 }, { "epoch": 1.0123860943493026, "grad_norm": 102.85113525390625, "learning_rate": 1.93670865246361e-06, "loss": 0.0719, "num_input_tokens_seen": 27931872, "step": 41440 }, { "epoch": 1.0125082451811498, "grad_norm": 17.976972579956055, "learning_rate": 1.9366787925308992e-06, "loss": 0.0874, "num_input_tokens_seen": 27935264, "step": 41445 }, { "epoch": 1.0126303960129968, "grad_norm": 0.10465455800294876, "learning_rate": 1.9366489257864084e-06, "loss": 0.0223, "num_input_tokens_seen": 27938528, "step": 41450 }, { "epoch": 1.012752546844844, "grad_norm": 0.28243395686149597, "learning_rate": 1.9366190522303543e-06, "loss": 0.0771, "num_input_tokens_seen": 27942112, "step": 41455 }, { "epoch": 1.0128746976766911, "grad_norm": 28.175445556640625, "learning_rate": 1.936589171862955e-06, "loss": 0.1288, "num_input_tokens_seen": 27945376, "step": 41460 }, { "epoch": 1.0129968485085383, "grad_norm": 0.10519753396511078, "learning_rate": 1.936559284684427e-06, "loss": 0.0402, "num_input_tokens_seen": 27949152, "step": 41465 }, { "epoch": 1.0131189993403855, "grad_norm": 0.23309500515460968, "learning_rate": 1.9365293906949885e-06, "loss": 0.0106, "num_input_tokens_seen": 27952416, "step": 41470 }, { "epoch": 1.0132411501722327, "grad_norm": 0.46023571491241455, "learning_rate": 1.9364994898948557e-06, "loss": 0.1945, "num_input_tokens_seen": 27955936, "step": 41475 }, { "epoch": 1.01336330100408, "grad_norm": 0.044198326766490936, "learning_rate": 1.9364695822842473e-06, "loss": 0.1181, "num_input_tokens_seen": 27959328, "step": 41480 }, { "epoch": 1.013485451835927, "grad_norm": 0.14169353246688843, "learning_rate": 1.93643966786338e-06, "loss": 0.0289, "num_input_tokens_seen": 27962912, "step": 41485 }, { "epoch": 1.0136076026677743, "grad_norm": 14.789438247680664, "learning_rate": 1.9364097466324717e-06, "loss": 0.1157, "num_input_tokens_seen": 27966176, "step": 41490 }, { "epoch": 1.0137297534996212, "grad_norm": 42.44548797607422, "learning_rate": 1.9363798185917394e-06, "loss": 0.0885, "num_input_tokens_seen": 27969440, "step": 41495 }, { "epoch": 1.0138519043314684, "grad_norm": 0.3688395321369171, "learning_rate": 1.936349883741402e-06, "loss": 0.1333, "num_input_tokens_seen": 27972704, "step": 41500 }, { "epoch": 1.0139740551633156, "grad_norm": 0.2943406403064728, "learning_rate": 1.9363199420816753e-06, "loss": 0.0017, "num_input_tokens_seen": 27975904, "step": 41505 }, { "epoch": 1.0140962059951628, "grad_norm": 0.2473595291376114, "learning_rate": 1.936289993612779e-06, "loss": 0.0493, "num_input_tokens_seen": 27979424, "step": 41510 }, { "epoch": 1.01421835682701, "grad_norm": 47.705326080322266, "learning_rate": 1.9362600383349296e-06, "loss": 0.0614, "num_input_tokens_seen": 27982752, "step": 41515 }, { "epoch": 1.0143405076588572, "grad_norm": 0.3024010956287384, "learning_rate": 1.936230076248346e-06, "loss": 0.0367, "num_input_tokens_seen": 27986080, "step": 41520 }, { "epoch": 1.0144626584907044, "grad_norm": 0.04337543249130249, "learning_rate": 1.9362001073532448e-06, "loss": 0.1619, "num_input_tokens_seen": 27989344, "step": 41525 }, { "epoch": 1.0145848093225516, "grad_norm": 28.89052963256836, "learning_rate": 1.936170131649845e-06, "loss": 0.1091, "num_input_tokens_seen": 27993184, "step": 41530 }, { "epoch": 1.0147069601543985, "grad_norm": 12.181092262268066, "learning_rate": 1.936140149138364e-06, "loss": 0.009, "num_input_tokens_seen": 27996512, "step": 41535 }, { "epoch": 1.0148291109862457, "grad_norm": 0.18677693605422974, "learning_rate": 1.93611015981902e-06, "loss": 0.0661, "num_input_tokens_seen": 27999840, "step": 41540 }, { "epoch": 1.014951261818093, "grad_norm": 1.2122114896774292, "learning_rate": 1.936080163692031e-06, "loss": 0.0219, "num_input_tokens_seen": 28003296, "step": 41545 }, { "epoch": 1.01507341264994, "grad_norm": 0.7211357355117798, "learning_rate": 1.9360501607576155e-06, "loss": 0.0188, "num_input_tokens_seen": 28006816, "step": 41550 }, { "epoch": 1.0151955634817873, "grad_norm": 0.09014793485403061, "learning_rate": 1.9360201510159917e-06, "loss": 0.0162, "num_input_tokens_seen": 28010144, "step": 41555 }, { "epoch": 1.0153177143136345, "grad_norm": 23.518695831298828, "learning_rate": 1.9359901344673773e-06, "loss": 0.0835, "num_input_tokens_seen": 28013344, "step": 41560 }, { "epoch": 1.0154398651454817, "grad_norm": 0.16022305190563202, "learning_rate": 1.935960111111991e-06, "loss": 0.08, "num_input_tokens_seen": 28016864, "step": 41565 }, { "epoch": 1.0155620159773289, "grad_norm": 87.44234466552734, "learning_rate": 1.935930080950051e-06, "loss": 0.0109, "num_input_tokens_seen": 28020128, "step": 41570 }, { "epoch": 1.015684166809176, "grad_norm": 1.3785067796707153, "learning_rate": 1.9359000439817758e-06, "loss": 0.0855, "num_input_tokens_seen": 28023520, "step": 41575 }, { "epoch": 1.015806317641023, "grad_norm": 9.503249168395996, "learning_rate": 1.9358700002073833e-06, "loss": 0.2021, "num_input_tokens_seen": 28026592, "step": 41580 }, { "epoch": 1.0159284684728702, "grad_norm": 0.3659454882144928, "learning_rate": 1.935839949627093e-06, "loss": 0.0022, "num_input_tokens_seen": 28029920, "step": 41585 }, { "epoch": 1.0160506193047174, "grad_norm": 3.5858285427093506, "learning_rate": 1.9358098922411224e-06, "loss": 0.0185, "num_input_tokens_seen": 28033312, "step": 41590 }, { "epoch": 1.0161727701365646, "grad_norm": 29.0914363861084, "learning_rate": 1.935779828049691e-06, "loss": 0.1054, "num_input_tokens_seen": 28036704, "step": 41595 }, { "epoch": 1.0162949209684118, "grad_norm": 0.0005874556954950094, "learning_rate": 1.935749757053017e-06, "loss": 0.1035, "num_input_tokens_seen": 28040544, "step": 41600 }, { "epoch": 1.016417071800259, "grad_norm": 25.869298934936523, "learning_rate": 1.9357196792513188e-06, "loss": 0.1913, "num_input_tokens_seen": 28044064, "step": 41605 }, { "epoch": 1.0165392226321062, "grad_norm": 4.090465068817139, "learning_rate": 1.9356895946448154e-06, "loss": 0.0951, "num_input_tokens_seen": 28047776, "step": 41610 }, { "epoch": 1.0166613734639534, "grad_norm": 0.31763580441474915, "learning_rate": 1.9356595032337257e-06, "loss": 0.1271, "num_input_tokens_seen": 28050912, "step": 41615 }, { "epoch": 1.0167835242958005, "grad_norm": 18.087068557739258, "learning_rate": 1.935629405018269e-06, "loss": 0.1427, "num_input_tokens_seen": 28054304, "step": 41620 }, { "epoch": 1.0169056751276475, "grad_norm": 45.21240997314453, "learning_rate": 1.9355992999986627e-06, "loss": 0.1011, "num_input_tokens_seen": 28058144, "step": 41625 }, { "epoch": 1.0170278259594947, "grad_norm": 81.54298400878906, "learning_rate": 1.9355691881751272e-06, "loss": 0.1637, "num_input_tokens_seen": 28061728, "step": 41630 }, { "epoch": 1.017149976791342, "grad_norm": 0.40756484866142273, "learning_rate": 1.9355390695478805e-06, "loss": 0.0612, "num_input_tokens_seen": 28065376, "step": 41635 }, { "epoch": 1.017272127623189, "grad_norm": 11.844685554504395, "learning_rate": 1.9355089441171423e-06, "loss": 0.1635, "num_input_tokens_seen": 28069216, "step": 41640 }, { "epoch": 1.0173942784550363, "grad_norm": 0.27210983633995056, "learning_rate": 1.935478811883131e-06, "loss": 0.0512, "num_input_tokens_seen": 28072416, "step": 41645 }, { "epoch": 1.0175164292868835, "grad_norm": 36.61519241333008, "learning_rate": 1.935448672846067e-06, "loss": 0.1672, "num_input_tokens_seen": 28075488, "step": 41650 }, { "epoch": 1.0176385801187307, "grad_norm": 13.378524780273438, "learning_rate": 1.935418527006168e-06, "loss": 0.1745, "num_input_tokens_seen": 28078688, "step": 41655 }, { "epoch": 1.0177607309505778, "grad_norm": 4.558134078979492, "learning_rate": 1.9353883743636542e-06, "loss": 0.0607, "num_input_tokens_seen": 28081824, "step": 41660 }, { "epoch": 1.017882881782425, "grad_norm": 31.944599151611328, "learning_rate": 1.9353582149187444e-06, "loss": 0.2085, "num_input_tokens_seen": 28085408, "step": 41665 }, { "epoch": 1.018005032614272, "grad_norm": 0.09376510232686996, "learning_rate": 1.935328048671658e-06, "loss": 0.0043, "num_input_tokens_seen": 28089184, "step": 41670 }, { "epoch": 1.0181271834461192, "grad_norm": 1.3944560289382935, "learning_rate": 1.935297875622615e-06, "loss": 0.0094, "num_input_tokens_seen": 28092384, "step": 41675 }, { "epoch": 1.0182493342779664, "grad_norm": 10.583409309387207, "learning_rate": 1.9352676957718335e-06, "loss": 0.201, "num_input_tokens_seen": 28095520, "step": 41680 }, { "epoch": 1.0183714851098136, "grad_norm": 0.2167307585477829, "learning_rate": 1.9352375091195343e-06, "loss": 0.0463, "num_input_tokens_seen": 28098656, "step": 41685 }, { "epoch": 1.0184936359416608, "grad_norm": 7.407865524291992, "learning_rate": 1.935207315665936e-06, "loss": 0.0568, "num_input_tokens_seen": 28102752, "step": 41690 }, { "epoch": 1.018615786773508, "grad_norm": 0.38910236954689026, "learning_rate": 1.935177115411259e-06, "loss": 0.048, "num_input_tokens_seen": 28106592, "step": 41695 }, { "epoch": 1.0187379376053551, "grad_norm": 10.388458251953125, "learning_rate": 1.9351469083557223e-06, "loss": 0.0385, "num_input_tokens_seen": 28109856, "step": 41700 }, { "epoch": 1.0188600884372023, "grad_norm": 73.03234100341797, "learning_rate": 1.935116694499546e-06, "loss": 0.0921, "num_input_tokens_seen": 28113120, "step": 41705 }, { "epoch": 1.0189822392690495, "grad_norm": 13.084054946899414, "learning_rate": 1.9350864738429493e-06, "loss": 0.0033, "num_input_tokens_seen": 28116448, "step": 41710 }, { "epoch": 1.0191043901008965, "grad_norm": 0.3512897491455078, "learning_rate": 1.9350562463861524e-06, "loss": 0.0259, "num_input_tokens_seen": 28120096, "step": 41715 }, { "epoch": 1.0192265409327437, "grad_norm": 0.3952520489692688, "learning_rate": 1.9350260121293746e-06, "loss": 0.0799, "num_input_tokens_seen": 28123616, "step": 41720 }, { "epoch": 1.0193486917645909, "grad_norm": 17.376245498657227, "learning_rate": 1.9349957710728365e-06, "loss": 0.0821, "num_input_tokens_seen": 28126880, "step": 41725 }, { "epoch": 1.019470842596438, "grad_norm": 52.214195251464844, "learning_rate": 1.9349655232167575e-06, "loss": 0.0694, "num_input_tokens_seen": 28130528, "step": 41730 }, { "epoch": 1.0195929934282852, "grad_norm": 2.797252893447876, "learning_rate": 1.934935268561358e-06, "loss": 0.1287, "num_input_tokens_seen": 28133984, "step": 41735 }, { "epoch": 1.0197151442601324, "grad_norm": 4.020275592803955, "learning_rate": 1.9349050071068574e-06, "loss": 0.0028, "num_input_tokens_seen": 28137632, "step": 41740 }, { "epoch": 1.0198372950919796, "grad_norm": 27.068878173828125, "learning_rate": 1.9348747388534763e-06, "loss": 0.1427, "num_input_tokens_seen": 28141408, "step": 41745 }, { "epoch": 1.0199594459238268, "grad_norm": 0.11553628742694855, "learning_rate": 1.9348444638014343e-06, "loss": 0.0793, "num_input_tokens_seen": 28145184, "step": 41750 }, { "epoch": 1.020081596755674, "grad_norm": 2.73980712890625, "learning_rate": 1.9348141819509522e-06, "loss": 0.0018, "num_input_tokens_seen": 28148384, "step": 41755 }, { "epoch": 1.020203747587521, "grad_norm": 41.98025131225586, "learning_rate": 1.93478389330225e-06, "loss": 0.2052, "num_input_tokens_seen": 28151840, "step": 41760 }, { "epoch": 1.0203258984193682, "grad_norm": 0.1199553906917572, "learning_rate": 1.934753597855548e-06, "loss": 0.0889, "num_input_tokens_seen": 28155104, "step": 41765 }, { "epoch": 1.0204480492512153, "grad_norm": 0.15397368371486664, "learning_rate": 1.9347232956110663e-06, "loss": 0.0625, "num_input_tokens_seen": 28158048, "step": 41770 }, { "epoch": 1.0205702000830625, "grad_norm": 25.988941192626953, "learning_rate": 1.9346929865690258e-06, "loss": 0.1216, "num_input_tokens_seen": 28161440, "step": 41775 }, { "epoch": 1.0206923509149097, "grad_norm": 0.09182299673557281, "learning_rate": 1.934662670729646e-06, "loss": 0.0483, "num_input_tokens_seen": 28165216, "step": 41780 }, { "epoch": 1.020814501746757, "grad_norm": 56.44713592529297, "learning_rate": 1.9346323480931475e-06, "loss": 0.0602, "num_input_tokens_seen": 28168672, "step": 41785 }, { "epoch": 1.020936652578604, "grad_norm": 0.13070183992385864, "learning_rate": 1.934602018659752e-06, "loss": 0.1541, "num_input_tokens_seen": 28172128, "step": 41790 }, { "epoch": 1.0210588034104513, "grad_norm": 2.3567676544189453, "learning_rate": 1.9345716824296787e-06, "loss": 0.1585, "num_input_tokens_seen": 28177696, "step": 41795 }, { "epoch": 1.0211809542422985, "grad_norm": 0.1473759561777115, "learning_rate": 1.9345413394031487e-06, "loss": 0.0411, "num_input_tokens_seen": 28181216, "step": 41800 }, { "epoch": 1.0213031050741455, "grad_norm": 0.11678832769393921, "learning_rate": 1.9345109895803834e-06, "loss": 0.0372, "num_input_tokens_seen": 28184352, "step": 41805 }, { "epoch": 1.0214252559059926, "grad_norm": 89.31144714355469, "learning_rate": 1.934480632961602e-06, "loss": 0.0806, "num_input_tokens_seen": 28187616, "step": 41810 }, { "epoch": 1.0215474067378398, "grad_norm": 0.1610347330570221, "learning_rate": 1.9344502695470268e-06, "loss": 0.1387, "num_input_tokens_seen": 28191136, "step": 41815 }, { "epoch": 1.021669557569687, "grad_norm": 9.442001342773438, "learning_rate": 1.9344198993368776e-06, "loss": 0.0032, "num_input_tokens_seen": 28194336, "step": 41820 }, { "epoch": 1.0217917084015342, "grad_norm": 0.17783841490745544, "learning_rate": 1.9343895223313753e-06, "loss": 0.094, "num_input_tokens_seen": 28198240, "step": 41825 }, { "epoch": 1.0219138592333814, "grad_norm": 0.03595606982707977, "learning_rate": 1.9343591385307414e-06, "loss": 0.105, "num_input_tokens_seen": 28201248, "step": 41830 }, { "epoch": 1.0220360100652286, "grad_norm": 11.369623184204102, "learning_rate": 1.9343287479351964e-06, "loss": 0.1136, "num_input_tokens_seen": 28204448, "step": 41835 }, { "epoch": 1.0221581608970758, "grad_norm": 44.781982421875, "learning_rate": 1.9342983505449615e-06, "loss": 0.1168, "num_input_tokens_seen": 28207520, "step": 41840 }, { "epoch": 1.022280311728923, "grad_norm": 0.11527753621339798, "learning_rate": 1.9342679463602578e-06, "loss": 0.1676, "num_input_tokens_seen": 28210848, "step": 41845 }, { "epoch": 1.02240246256077, "grad_norm": 8.457847595214844, "learning_rate": 1.9342375353813062e-06, "loss": 0.1082, "num_input_tokens_seen": 28214496, "step": 41850 }, { "epoch": 1.0225246133926171, "grad_norm": 3.7740113735198975, "learning_rate": 1.9342071176083276e-06, "loss": 0.0846, "num_input_tokens_seen": 28217824, "step": 41855 }, { "epoch": 1.0226467642244643, "grad_norm": 9.33716869354248, "learning_rate": 1.934176693041544e-06, "loss": 0.0428, "num_input_tokens_seen": 28220896, "step": 41860 }, { "epoch": 1.0227689150563115, "grad_norm": 8.659668922424316, "learning_rate": 1.9341462616811765e-06, "loss": 0.1289, "num_input_tokens_seen": 28224544, "step": 41865 }, { "epoch": 1.0228910658881587, "grad_norm": 36.60452651977539, "learning_rate": 1.9341158235274455e-06, "loss": 0.0307, "num_input_tokens_seen": 28227360, "step": 41870 }, { "epoch": 1.023013216720006, "grad_norm": 2.6350722312927246, "learning_rate": 1.9340853785805733e-06, "loss": 0.0926, "num_input_tokens_seen": 28230752, "step": 41875 }, { "epoch": 1.023135367551853, "grad_norm": 33.65208053588867, "learning_rate": 1.934054926840781e-06, "loss": 0.0647, "num_input_tokens_seen": 28233888, "step": 41880 }, { "epoch": 1.0232575183837003, "grad_norm": 5.014811992645264, "learning_rate": 1.9340244683082898e-06, "loss": 0.0762, "num_input_tokens_seen": 28236960, "step": 41885 }, { "epoch": 1.0233796692155475, "grad_norm": 0.15645256638526917, "learning_rate": 1.933994002983322e-06, "loss": 0.0394, "num_input_tokens_seen": 28240480, "step": 41890 }, { "epoch": 1.0235018200473944, "grad_norm": 41.017433166503906, "learning_rate": 1.933963530866098e-06, "loss": 0.0261, "num_input_tokens_seen": 28243680, "step": 41895 }, { "epoch": 1.0236239708792416, "grad_norm": 0.6035808324813843, "learning_rate": 1.93393305195684e-06, "loss": 0.095, "num_input_tokens_seen": 28247456, "step": 41900 }, { "epoch": 1.0237461217110888, "grad_norm": 0.4977245628833771, "learning_rate": 1.93390256625577e-06, "loss": 0.0239, "num_input_tokens_seen": 28250720, "step": 41905 }, { "epoch": 1.023868272542936, "grad_norm": 12.085318565368652, "learning_rate": 1.9338720737631094e-06, "loss": 0.0914, "num_input_tokens_seen": 28253792, "step": 41910 }, { "epoch": 1.0239904233747832, "grad_norm": 52.246482849121094, "learning_rate": 1.9338415744790796e-06, "loss": 0.0252, "num_input_tokens_seen": 28257248, "step": 41915 }, { "epoch": 1.0241125742066304, "grad_norm": 0.08823520690202713, "learning_rate": 1.933811068403903e-06, "loss": 0.1487, "num_input_tokens_seen": 28260832, "step": 41920 }, { "epoch": 1.0242347250384776, "grad_norm": 2.159844160079956, "learning_rate": 1.933780555537801e-06, "loss": 0.033, "num_input_tokens_seen": 28264160, "step": 41925 }, { "epoch": 1.0243568758703248, "grad_norm": 67.18186950683594, "learning_rate": 1.9337500358809953e-06, "loss": 0.1593, "num_input_tokens_seen": 28267616, "step": 41930 }, { "epoch": 1.024479026702172, "grad_norm": 0.1259065419435501, "learning_rate": 1.9337195094337086e-06, "loss": 0.0448, "num_input_tokens_seen": 28270624, "step": 41935 }, { "epoch": 1.024601177534019, "grad_norm": 0.07436679303646088, "learning_rate": 1.9336889761961627e-06, "loss": 0.0556, "num_input_tokens_seen": 28273568, "step": 41940 }, { "epoch": 1.024723328365866, "grad_norm": 11.188752174377441, "learning_rate": 1.933658436168579e-06, "loss": 0.1861, "num_input_tokens_seen": 28276704, "step": 41945 }, { "epoch": 1.0248454791977133, "grad_norm": 69.03654479980469, "learning_rate": 1.93362788935118e-06, "loss": 0.057, "num_input_tokens_seen": 28279904, "step": 41950 }, { "epoch": 1.0249676300295605, "grad_norm": 2.1229348182678223, "learning_rate": 1.933597335744188e-06, "loss": 0.1571, "num_input_tokens_seen": 28283744, "step": 41955 }, { "epoch": 1.0250897808614077, "grad_norm": 37.386741638183594, "learning_rate": 1.933566775347825e-06, "loss": 0.0467, "num_input_tokens_seen": 28286944, "step": 41960 }, { "epoch": 1.0252119316932549, "grad_norm": 0.11623575538396835, "learning_rate": 1.9335362081623134e-06, "loss": 0.0018, "num_input_tokens_seen": 28290464, "step": 41965 }, { "epoch": 1.025334082525102, "grad_norm": 67.46884155273438, "learning_rate": 1.9335056341878754e-06, "loss": 0.1227, "num_input_tokens_seen": 28294816, "step": 41970 }, { "epoch": 1.0254562333569492, "grad_norm": 0.10298215597867966, "learning_rate": 1.9334750534247335e-06, "loss": 0.0017, "num_input_tokens_seen": 28299808, "step": 41975 }, { "epoch": 1.0255783841887964, "grad_norm": 0.2926267981529236, "learning_rate": 1.9334444658731095e-06, "loss": 0.1451, "num_input_tokens_seen": 28303136, "step": 41980 }, { "epoch": 1.0257005350206434, "grad_norm": 0.856563150882721, "learning_rate": 1.9334138715332267e-06, "loss": 0.1345, "num_input_tokens_seen": 28306272, "step": 41985 }, { "epoch": 1.0258226858524906, "grad_norm": 0.43083885312080383, "learning_rate": 1.933383270405307e-06, "loss": 0.0926, "num_input_tokens_seen": 28309472, "step": 41990 }, { "epoch": 1.0259448366843378, "grad_norm": 2.0180540084838867, "learning_rate": 1.933352662489573e-06, "loss": 0.0502, "num_input_tokens_seen": 28312480, "step": 41995 }, { "epoch": 1.026066987516185, "grad_norm": 1.1164207458496094, "learning_rate": 1.9333220477862476e-06, "loss": 0.0655, "num_input_tokens_seen": 28316192, "step": 42000 }, { "epoch": 1.0261891383480322, "grad_norm": 0.37522655725479126, "learning_rate": 1.9332914262955533e-06, "loss": 0.0018, "num_input_tokens_seen": 28320032, "step": 42005 }, { "epoch": 1.0263112891798793, "grad_norm": 0.19359171390533447, "learning_rate": 1.9332607980177124e-06, "loss": 0.0015, "num_input_tokens_seen": 28323936, "step": 42010 }, { "epoch": 1.0264334400117265, "grad_norm": 47.79861068725586, "learning_rate": 1.9332301629529484e-06, "loss": 0.1486, "num_input_tokens_seen": 28327072, "step": 42015 }, { "epoch": 1.0265555908435737, "grad_norm": 9.178513526916504, "learning_rate": 1.9331995211014833e-06, "loss": 0.1867, "num_input_tokens_seen": 28330272, "step": 42020 }, { "epoch": 1.026677741675421, "grad_norm": 61.96234893798828, "learning_rate": 1.9331688724635406e-06, "loss": 0.0235, "num_input_tokens_seen": 28333792, "step": 42025 }, { "epoch": 1.0267998925072679, "grad_norm": 32.31514358520508, "learning_rate": 1.9331382170393424e-06, "loss": 0.0892, "num_input_tokens_seen": 28337056, "step": 42030 }, { "epoch": 1.026922043339115, "grad_norm": 7.391241073608398, "learning_rate": 1.9331075548291125e-06, "loss": 0.0346, "num_input_tokens_seen": 28340576, "step": 42035 }, { "epoch": 1.0270441941709623, "grad_norm": 23.004222869873047, "learning_rate": 1.933076885833073e-06, "loss": 0.0051, "num_input_tokens_seen": 28343776, "step": 42040 }, { "epoch": 1.0271663450028095, "grad_norm": 30.365352630615234, "learning_rate": 1.933046210051448e-06, "loss": 0.0485, "num_input_tokens_seen": 28347232, "step": 42045 }, { "epoch": 1.0272884958346566, "grad_norm": 0.05469789728522301, "learning_rate": 1.9330155274844597e-06, "loss": 0.1192, "num_input_tokens_seen": 28350752, "step": 42050 }, { "epoch": 1.0274106466665038, "grad_norm": 0.6367778182029724, "learning_rate": 1.9329848381323318e-06, "loss": 0.1301, "num_input_tokens_seen": 28353952, "step": 42055 }, { "epoch": 1.027532797498351, "grad_norm": 0.13280487060546875, "learning_rate": 1.932954141995287e-06, "loss": 0.0456, "num_input_tokens_seen": 28358304, "step": 42060 }, { "epoch": 1.0276549483301982, "grad_norm": 7.7528486251831055, "learning_rate": 1.932923439073549e-06, "loss": 0.1302, "num_input_tokens_seen": 28361056, "step": 42065 }, { "epoch": 1.0277770991620452, "grad_norm": 0.5485358834266663, "learning_rate": 1.93289272936734e-06, "loss": 0.091, "num_input_tokens_seen": 28364256, "step": 42070 }, { "epoch": 1.0278992499938924, "grad_norm": 27.97885513305664, "learning_rate": 1.932862012876885e-06, "loss": 0.1344, "num_input_tokens_seen": 28367584, "step": 42075 }, { "epoch": 1.0280214008257396, "grad_norm": 1.5125770568847656, "learning_rate": 1.9328312896024063e-06, "loss": 0.0863, "num_input_tokens_seen": 28370976, "step": 42080 }, { "epoch": 1.0281435516575868, "grad_norm": 77.2655029296875, "learning_rate": 1.932800559544127e-06, "loss": 0.0371, "num_input_tokens_seen": 28374496, "step": 42085 }, { "epoch": 1.028265702489434, "grad_norm": 1.292657494544983, "learning_rate": 1.932769822702272e-06, "loss": 0.0506, "num_input_tokens_seen": 28377696, "step": 42090 }, { "epoch": 1.0283878533212811, "grad_norm": 0.18110032379627228, "learning_rate": 1.9327390790770636e-06, "loss": 0.1733, "num_input_tokens_seen": 28381216, "step": 42095 }, { "epoch": 1.0285100041531283, "grad_norm": 19.233768463134766, "learning_rate": 1.9327083286687256e-06, "loss": 0.0751, "num_input_tokens_seen": 28384544, "step": 42100 }, { "epoch": 1.0286321549849755, "grad_norm": 23.182018280029297, "learning_rate": 1.932677571477482e-06, "loss": 0.2169, "num_input_tokens_seen": 28388064, "step": 42105 }, { "epoch": 1.0287543058168227, "grad_norm": 50.74353790283203, "learning_rate": 1.9326468075035564e-06, "loss": 0.1519, "num_input_tokens_seen": 28391200, "step": 42110 }, { "epoch": 1.0288764566486697, "grad_norm": 0.3187883794307709, "learning_rate": 1.932616036747172e-06, "loss": 0.0666, "num_input_tokens_seen": 28394528, "step": 42115 }, { "epoch": 1.0289986074805169, "grad_norm": 9.614219665527344, "learning_rate": 1.932585259208553e-06, "loss": 0.0394, "num_input_tokens_seen": 28398048, "step": 42120 }, { "epoch": 1.029120758312364, "grad_norm": 0.8373148441314697, "learning_rate": 1.932554474887923e-06, "loss": 0.0299, "num_input_tokens_seen": 28401440, "step": 42125 }, { "epoch": 1.0292429091442112, "grad_norm": 51.213287353515625, "learning_rate": 1.9325236837855068e-06, "loss": 0.0323, "num_input_tokens_seen": 28404576, "step": 42130 }, { "epoch": 1.0293650599760584, "grad_norm": 0.054223284125328064, "learning_rate": 1.932492885901527e-06, "loss": 0.1606, "num_input_tokens_seen": 28407904, "step": 42135 }, { "epoch": 1.0294872108079056, "grad_norm": 0.13917988538742065, "learning_rate": 1.932462081236208e-06, "loss": 0.0915, "num_input_tokens_seen": 28410912, "step": 42140 }, { "epoch": 1.0296093616397528, "grad_norm": 91.04314422607422, "learning_rate": 1.932431269789774e-06, "loss": 0.0902, "num_input_tokens_seen": 28414560, "step": 42145 }, { "epoch": 1.0297315124716, "grad_norm": 0.33241578936576843, "learning_rate": 1.932400451562449e-06, "loss": 0.0204, "num_input_tokens_seen": 28418080, "step": 42150 }, { "epoch": 1.0298536633034472, "grad_norm": 0.13492560386657715, "learning_rate": 1.9323696265544572e-06, "loss": 0.1347, "num_input_tokens_seen": 28421152, "step": 42155 }, { "epoch": 1.0299758141352942, "grad_norm": 14.48997688293457, "learning_rate": 1.9323387947660227e-06, "loss": 0.0876, "num_input_tokens_seen": 28424480, "step": 42160 }, { "epoch": 1.0300979649671413, "grad_norm": 0.041152097284793854, "learning_rate": 1.93230795619737e-06, "loss": 0.069, "num_input_tokens_seen": 28428000, "step": 42165 }, { "epoch": 1.0302201157989885, "grad_norm": 15.964864730834961, "learning_rate": 1.9322771108487227e-06, "loss": 0.0107, "num_input_tokens_seen": 28431776, "step": 42170 }, { "epoch": 1.0303422666308357, "grad_norm": 28.510271072387695, "learning_rate": 1.9322462587203056e-06, "loss": 0.1202, "num_input_tokens_seen": 28434912, "step": 42175 }, { "epoch": 1.030464417462683, "grad_norm": 0.6475540995597839, "learning_rate": 1.932215399812343e-06, "loss": 0.1102, "num_input_tokens_seen": 28438432, "step": 42180 }, { "epoch": 1.03058656829453, "grad_norm": 0.31698504090309143, "learning_rate": 1.9321845341250592e-06, "loss": 0.0436, "num_input_tokens_seen": 28441568, "step": 42185 }, { "epoch": 1.0307087191263773, "grad_norm": 11.586368560791016, "learning_rate": 1.932153661658679e-06, "loss": 0.1289, "num_input_tokens_seen": 28444704, "step": 42190 }, { "epoch": 1.0308308699582245, "grad_norm": 0.12284345924854279, "learning_rate": 1.932122782413426e-06, "loss": 0.0261, "num_input_tokens_seen": 28447840, "step": 42195 }, { "epoch": 1.0309530207900717, "grad_norm": 0.2812725007534027, "learning_rate": 1.9320918963895262e-06, "loss": 0.0126, "num_input_tokens_seen": 28450784, "step": 42200 }, { "epoch": 1.0310751716219186, "grad_norm": 26.277828216552734, "learning_rate": 1.932061003587203e-06, "loss": 0.0412, "num_input_tokens_seen": 28454432, "step": 42205 }, { "epoch": 1.0311973224537658, "grad_norm": 0.1749674528837204, "learning_rate": 1.9320301040066816e-06, "loss": 0.0308, "num_input_tokens_seen": 28457568, "step": 42210 }, { "epoch": 1.031319473285613, "grad_norm": 0.3987172544002533, "learning_rate": 1.9319991976481863e-06, "loss": 0.1595, "num_input_tokens_seen": 28460896, "step": 42215 }, { "epoch": 1.0314416241174602, "grad_norm": 29.454450607299805, "learning_rate": 1.9319682845119425e-06, "loss": 0.1558, "num_input_tokens_seen": 28464032, "step": 42220 }, { "epoch": 1.0315637749493074, "grad_norm": 73.37692260742188, "learning_rate": 1.9319373645981748e-06, "loss": 0.0608, "num_input_tokens_seen": 28467104, "step": 42225 }, { "epoch": 1.0316859257811546, "grad_norm": 4.168240547180176, "learning_rate": 1.9319064379071075e-06, "loss": 0.0129, "num_input_tokens_seen": 28470432, "step": 42230 }, { "epoch": 1.0318080766130018, "grad_norm": 5.503375053405762, "learning_rate": 1.931875504438966e-06, "loss": 0.0663, "num_input_tokens_seen": 28476000, "step": 42235 }, { "epoch": 1.031930227444849, "grad_norm": 0.5394633412361145, "learning_rate": 1.931844564193976e-06, "loss": 0.1376, "num_input_tokens_seen": 28481120, "step": 42240 }, { "epoch": 1.0320523782766962, "grad_norm": 141.67755126953125, "learning_rate": 1.9318136171723606e-06, "loss": 0.2042, "num_input_tokens_seen": 28484896, "step": 42245 }, { "epoch": 1.0321745291085431, "grad_norm": 0.05775103718042374, "learning_rate": 1.9317826633743464e-06, "loss": 0.0014, "num_input_tokens_seen": 28488416, "step": 42250 }, { "epoch": 1.0322966799403903, "grad_norm": 0.0917782261967659, "learning_rate": 1.9317517028001584e-06, "loss": 0.0013, "num_input_tokens_seen": 28491936, "step": 42255 }, { "epoch": 1.0324188307722375, "grad_norm": 8.065529823303223, "learning_rate": 1.9317207354500206e-06, "loss": 0.2312, "num_input_tokens_seen": 28495392, "step": 42260 }, { "epoch": 1.0325409816040847, "grad_norm": 25.31867027282715, "learning_rate": 1.9316897613241596e-06, "loss": 0.1211, "num_input_tokens_seen": 28498592, "step": 42265 }, { "epoch": 1.0326631324359319, "grad_norm": 0.08368490636348724, "learning_rate": 1.9316587804228e-06, "loss": 0.1664, "num_input_tokens_seen": 28501728, "step": 42270 }, { "epoch": 1.032785283267779, "grad_norm": 0.0791000947356224, "learning_rate": 1.931627792746167e-06, "loss": 0.0567, "num_input_tokens_seen": 28504864, "step": 42275 }, { "epoch": 1.0329074340996263, "grad_norm": 1.1830270290374756, "learning_rate": 1.931596798294486e-06, "loss": 0.0706, "num_input_tokens_seen": 28508000, "step": 42280 }, { "epoch": 1.0330295849314735, "grad_norm": 25.891298294067383, "learning_rate": 1.9315657970679826e-06, "loss": 0.1044, "num_input_tokens_seen": 28511072, "step": 42285 }, { "epoch": 1.0331517357633206, "grad_norm": 37.095394134521484, "learning_rate": 1.9315347890668825e-06, "loss": 0.2329, "num_input_tokens_seen": 28514016, "step": 42290 }, { "epoch": 1.0332738865951676, "grad_norm": 16.82368278503418, "learning_rate": 1.9315037742914107e-06, "loss": 0.1215, "num_input_tokens_seen": 28517664, "step": 42295 }, { "epoch": 1.0333960374270148, "grad_norm": 0.26243355870246887, "learning_rate": 1.931472752741793e-06, "loss": 0.0044, "num_input_tokens_seen": 28520992, "step": 42300 }, { "epoch": 1.033518188258862, "grad_norm": 40.032291412353516, "learning_rate": 1.9314417244182547e-06, "loss": 0.1998, "num_input_tokens_seen": 28524512, "step": 42305 }, { "epoch": 1.0336403390907092, "grad_norm": 0.18673402070999146, "learning_rate": 1.9314106893210216e-06, "loss": 0.0614, "num_input_tokens_seen": 28528032, "step": 42310 }, { "epoch": 1.0337624899225564, "grad_norm": 0.09959449619054794, "learning_rate": 1.9313796474503194e-06, "loss": 0.0382, "num_input_tokens_seen": 28530848, "step": 42315 }, { "epoch": 1.0338846407544036, "grad_norm": 0.6416058540344238, "learning_rate": 1.931348598806374e-06, "loss": 0.0943, "num_input_tokens_seen": 28534304, "step": 42320 }, { "epoch": 1.0340067915862508, "grad_norm": 27.448389053344727, "learning_rate": 1.931317543389411e-06, "loss": 0.1328, "num_input_tokens_seen": 28537376, "step": 42325 }, { "epoch": 1.034128942418098, "grad_norm": 29.5507755279541, "learning_rate": 1.9312864811996567e-06, "loss": 0.1759, "num_input_tokens_seen": 28540128, "step": 42330 }, { "epoch": 1.0342510932499451, "grad_norm": 3.888038396835327, "learning_rate": 1.931255412237336e-06, "loss": 0.0172, "num_input_tokens_seen": 28543456, "step": 42335 }, { "epoch": 1.034373244081792, "grad_norm": 0.9766087532043457, "learning_rate": 1.931224336502676e-06, "loss": 0.1055, "num_input_tokens_seen": 28546912, "step": 42340 }, { "epoch": 1.0344953949136393, "grad_norm": 23.447490692138672, "learning_rate": 1.931193253995902e-06, "loss": 0.0127, "num_input_tokens_seen": 28550240, "step": 42345 }, { "epoch": 1.0346175457454865, "grad_norm": 1.529118299484253, "learning_rate": 1.93116216471724e-06, "loss": 0.0732, "num_input_tokens_seen": 28553696, "step": 42350 }, { "epoch": 1.0347396965773337, "grad_norm": 0.10727009922266006, "learning_rate": 1.9311310686669165e-06, "loss": 0.07, "num_input_tokens_seen": 28557408, "step": 42355 }, { "epoch": 1.0348618474091809, "grad_norm": 40.918243408203125, "learning_rate": 1.931099965845158e-06, "loss": 0.136, "num_input_tokens_seen": 28560672, "step": 42360 }, { "epoch": 1.034983998241028, "grad_norm": 0.0714857280254364, "learning_rate": 1.9310688562521894e-06, "loss": 0.1912, "num_input_tokens_seen": 28563744, "step": 42365 }, { "epoch": 1.0351061490728752, "grad_norm": 0.17165929079055786, "learning_rate": 1.9310377398882377e-06, "loss": 0.0607, "num_input_tokens_seen": 28567072, "step": 42370 }, { "epoch": 1.0352282999047224, "grad_norm": 0.09585746377706528, "learning_rate": 1.931006616753529e-06, "loss": 0.1144, "num_input_tokens_seen": 28570720, "step": 42375 }, { "epoch": 1.0353504507365696, "grad_norm": 35.07247543334961, "learning_rate": 1.93097548684829e-06, "loss": 0.2082, "num_input_tokens_seen": 28574176, "step": 42380 }, { "epoch": 1.0354726015684166, "grad_norm": 1.625064730644226, "learning_rate": 1.930944350172747e-06, "loss": 0.0518, "num_input_tokens_seen": 28577248, "step": 42385 }, { "epoch": 1.0355947524002638, "grad_norm": 0.36407986283302307, "learning_rate": 1.930913206727126e-06, "loss": 0.0541, "num_input_tokens_seen": 28580576, "step": 42390 }, { "epoch": 1.035716903232111, "grad_norm": 0.07209677994251251, "learning_rate": 1.9308820565116538e-06, "loss": 0.002, "num_input_tokens_seen": 28584032, "step": 42395 }, { "epoch": 1.0358390540639582, "grad_norm": 0.2674114406108856, "learning_rate": 1.930850899526557e-06, "loss": 0.0064, "num_input_tokens_seen": 28587552, "step": 42400 }, { "epoch": 1.0359612048958053, "grad_norm": 0.15182150900363922, "learning_rate": 1.930819735772062e-06, "loss": 0.065, "num_input_tokens_seen": 28591136, "step": 42405 }, { "epoch": 1.0360833557276525, "grad_norm": 0.5361425876617432, "learning_rate": 1.930788565248396e-06, "loss": 0.1083, "num_input_tokens_seen": 28594464, "step": 42410 }, { "epoch": 1.0362055065594997, "grad_norm": 14.758516311645508, "learning_rate": 1.9307573879557847e-06, "loss": 0.095, "num_input_tokens_seen": 28597856, "step": 42415 }, { "epoch": 1.036327657391347, "grad_norm": 0.43900468945503235, "learning_rate": 1.9307262038944552e-06, "loss": 0.1319, "num_input_tokens_seen": 28600928, "step": 42420 }, { "epoch": 1.036449808223194, "grad_norm": 59.04403305053711, "learning_rate": 1.9306950130646346e-06, "loss": 0.2187, "num_input_tokens_seen": 28603936, "step": 42425 }, { "epoch": 1.036571959055041, "grad_norm": 0.13893799483776093, "learning_rate": 1.9306638154665497e-06, "loss": 0.0097, "num_input_tokens_seen": 28606944, "step": 42430 }, { "epoch": 1.0366941098868883, "grad_norm": 37.18202209472656, "learning_rate": 1.930632611100427e-06, "loss": 0.0701, "num_input_tokens_seen": 28610208, "step": 42435 }, { "epoch": 1.0368162607187354, "grad_norm": 0.16810859739780426, "learning_rate": 1.9306013999664937e-06, "loss": 0.0021, "num_input_tokens_seen": 28613536, "step": 42440 }, { "epoch": 1.0369384115505826, "grad_norm": 43.39802932739258, "learning_rate": 1.930570182064977e-06, "loss": 0.0961, "num_input_tokens_seen": 28616864, "step": 42445 }, { "epoch": 1.0370605623824298, "grad_norm": 0.3364641070365906, "learning_rate": 1.9305389573961033e-06, "loss": 0.132, "num_input_tokens_seen": 28620128, "step": 42450 }, { "epoch": 1.037182713214277, "grad_norm": 33.43479537963867, "learning_rate": 1.9305077259601e-06, "loss": 0.0375, "num_input_tokens_seen": 28623520, "step": 42455 }, { "epoch": 1.0373048640461242, "grad_norm": 56.155067443847656, "learning_rate": 1.9304764877571944e-06, "loss": 0.2168, "num_input_tokens_seen": 28627040, "step": 42460 }, { "epoch": 1.0374270148779714, "grad_norm": 21.936141967773438, "learning_rate": 1.9304452427876138e-06, "loss": 0.0926, "num_input_tokens_seen": 28630368, "step": 42465 }, { "epoch": 1.0375491657098186, "grad_norm": 0.8858507871627808, "learning_rate": 1.9304139910515845e-06, "loss": 0.0037, "num_input_tokens_seen": 28633632, "step": 42470 }, { "epoch": 1.0376713165416656, "grad_norm": 0.4281214773654938, "learning_rate": 1.9303827325493346e-06, "loss": 0.0369, "num_input_tokens_seen": 28636704, "step": 42475 }, { "epoch": 1.0377934673735127, "grad_norm": 0.19972863793373108, "learning_rate": 1.9303514672810913e-06, "loss": 0.0419, "num_input_tokens_seen": 28640480, "step": 42480 }, { "epoch": 1.03791561820536, "grad_norm": 0.09961410611867905, "learning_rate": 1.930320195247082e-06, "loss": 0.119, "num_input_tokens_seen": 28643808, "step": 42485 }, { "epoch": 1.0380377690372071, "grad_norm": 0.15113762021064758, "learning_rate": 1.930288916447534e-06, "loss": 0.0031, "num_input_tokens_seen": 28647456, "step": 42490 }, { "epoch": 1.0381599198690543, "grad_norm": 102.24231719970703, "learning_rate": 1.930257630882675e-06, "loss": 0.2149, "num_input_tokens_seen": 28650976, "step": 42495 }, { "epoch": 1.0382820707009015, "grad_norm": 0.0898851677775383, "learning_rate": 1.930226338552732e-06, "loss": 0.0508, "num_input_tokens_seen": 28654304, "step": 42500 }, { "epoch": 1.0384042215327487, "grad_norm": 73.28435516357422, "learning_rate": 1.9301950394579328e-06, "loss": 0.1642, "num_input_tokens_seen": 28657440, "step": 42505 }, { "epoch": 1.0385263723645959, "grad_norm": 12.236141204833984, "learning_rate": 1.9301637335985052e-06, "loss": 0.09, "num_input_tokens_seen": 28661024, "step": 42510 }, { "epoch": 1.038648523196443, "grad_norm": 14.757637023925781, "learning_rate": 1.930132420974677e-06, "loss": 0.0374, "num_input_tokens_seen": 28664160, "step": 42515 }, { "epoch": 1.03877067402829, "grad_norm": 137.90829467773438, "learning_rate": 1.930101101586675e-06, "loss": 0.1329, "num_input_tokens_seen": 28667552, "step": 42520 }, { "epoch": 1.0388928248601372, "grad_norm": 101.94804382324219, "learning_rate": 1.9300697754347283e-06, "loss": 0.0922, "num_input_tokens_seen": 28670944, "step": 42525 }, { "epoch": 1.0390149756919844, "grad_norm": 175.0195770263672, "learning_rate": 1.9300384425190635e-06, "loss": 0.1599, "num_input_tokens_seen": 28673888, "step": 42530 }, { "epoch": 1.0391371265238316, "grad_norm": 9.110740661621094, "learning_rate": 1.9300071028399093e-06, "loss": 0.114, "num_input_tokens_seen": 28677088, "step": 42535 }, { "epoch": 1.0392592773556788, "grad_norm": 8.481295585632324, "learning_rate": 1.9299757563974934e-06, "loss": 0.2603, "num_input_tokens_seen": 28680224, "step": 42540 }, { "epoch": 1.039381428187526, "grad_norm": 0.07923160493373871, "learning_rate": 1.9299444031920437e-06, "loss": 0.0781, "num_input_tokens_seen": 28683680, "step": 42545 }, { "epoch": 1.0395035790193732, "grad_norm": 0.047697048634290695, "learning_rate": 1.9299130432237877e-06, "loss": 0.0054, "num_input_tokens_seen": 28687008, "step": 42550 }, { "epoch": 1.0396257298512204, "grad_norm": 70.10228729248047, "learning_rate": 1.929881676492954e-06, "loss": 0.044, "num_input_tokens_seen": 28690080, "step": 42555 }, { "epoch": 1.0397478806830676, "grad_norm": 0.24712218344211578, "learning_rate": 1.929850302999771e-06, "loss": 0.1201, "num_input_tokens_seen": 28693600, "step": 42560 }, { "epoch": 1.0398700315149145, "grad_norm": 7.952376842498779, "learning_rate": 1.9298189227444665e-06, "loss": 0.1339, "num_input_tokens_seen": 28697056, "step": 42565 }, { "epoch": 1.0399921823467617, "grad_norm": 19.63153648376465, "learning_rate": 1.9297875357272683e-06, "loss": 0.0307, "num_input_tokens_seen": 28700512, "step": 42570 }, { "epoch": 1.040114333178609, "grad_norm": 7.9623894691467285, "learning_rate": 1.9297561419484056e-06, "loss": 0.0128, "num_input_tokens_seen": 28703200, "step": 42575 }, { "epoch": 1.040236484010456, "grad_norm": 1.8297635316848755, "learning_rate": 1.9297247414081058e-06, "loss": 0.0397, "num_input_tokens_seen": 28706592, "step": 42580 }, { "epoch": 1.0403586348423033, "grad_norm": 0.15404774248600006, "learning_rate": 1.929693334106598e-06, "loss": 0.0027, "num_input_tokens_seen": 28709728, "step": 42585 }, { "epoch": 1.0404807856741505, "grad_norm": 0.034117963165044785, "learning_rate": 1.9296619200441095e-06, "loss": 0.0827, "num_input_tokens_seen": 28713376, "step": 42590 }, { "epoch": 1.0406029365059977, "grad_norm": 0.0477776899933815, "learning_rate": 1.9296304992208697e-06, "loss": 0.1676, "num_input_tokens_seen": 28716384, "step": 42595 }, { "epoch": 1.0407250873378449, "grad_norm": 33.39387130737305, "learning_rate": 1.9295990716371075e-06, "loss": 0.0847, "num_input_tokens_seen": 28719456, "step": 42600 }, { "epoch": 1.0408472381696918, "grad_norm": 0.19198709726333618, "learning_rate": 1.9295676372930505e-06, "loss": 0.05, "num_input_tokens_seen": 28723104, "step": 42605 }, { "epoch": 1.040969389001539, "grad_norm": 7.402000427246094, "learning_rate": 1.9295361961889272e-06, "loss": 0.1016, "num_input_tokens_seen": 28726368, "step": 42610 }, { "epoch": 1.0410915398333862, "grad_norm": 2.166149139404297, "learning_rate": 1.929504748324967e-06, "loss": 0.1046, "num_input_tokens_seen": 28729760, "step": 42615 }, { "epoch": 1.0412136906652334, "grad_norm": 29.45623016357422, "learning_rate": 1.929473293701398e-06, "loss": 0.1763, "num_input_tokens_seen": 28733024, "step": 42620 }, { "epoch": 1.0413358414970806, "grad_norm": 15.659850120544434, "learning_rate": 1.9294418323184495e-06, "loss": 0.101, "num_input_tokens_seen": 28736416, "step": 42625 }, { "epoch": 1.0414579923289278, "grad_norm": 19.953767776489258, "learning_rate": 1.92941036417635e-06, "loss": 0.1033, "num_input_tokens_seen": 28739744, "step": 42630 }, { "epoch": 1.041580143160775, "grad_norm": 14.26479434967041, "learning_rate": 1.929378889275328e-06, "loss": 0.0464, "num_input_tokens_seen": 28742560, "step": 42635 }, { "epoch": 1.0417022939926222, "grad_norm": 0.19203825294971466, "learning_rate": 1.929347407615613e-06, "loss": 0.005, "num_input_tokens_seen": 28745952, "step": 42640 }, { "epoch": 1.0418244448244693, "grad_norm": 75.64021301269531, "learning_rate": 1.9293159191974338e-06, "loss": 0.0939, "num_input_tokens_seen": 28749280, "step": 42645 }, { "epoch": 1.0419465956563165, "grad_norm": 0.12982279062271118, "learning_rate": 1.9292844240210193e-06, "loss": 0.0669, "num_input_tokens_seen": 28752416, "step": 42650 }, { "epoch": 1.0420687464881635, "grad_norm": 0.053554948419332504, "learning_rate": 1.9292529220865985e-06, "loss": 0.0011, "num_input_tokens_seen": 28756192, "step": 42655 }, { "epoch": 1.0421908973200107, "grad_norm": 0.06555648893117905, "learning_rate": 1.9292214133944003e-06, "loss": 0.0752, "num_input_tokens_seen": 28759328, "step": 42660 }, { "epoch": 1.0423130481518579, "grad_norm": 0.14871549606323242, "learning_rate": 1.929189897944654e-06, "loss": 0.0056, "num_input_tokens_seen": 28762656, "step": 42665 }, { "epoch": 1.042435198983705, "grad_norm": 0.1613975614309311, "learning_rate": 1.929158375737589e-06, "loss": 0.0822, "num_input_tokens_seen": 28766112, "step": 42670 }, { "epoch": 1.0425573498155523, "grad_norm": 13.389020919799805, "learning_rate": 1.9291268467734343e-06, "loss": 0.1787, "num_input_tokens_seen": 28769568, "step": 42675 }, { "epoch": 1.0426795006473994, "grad_norm": 12.727326393127441, "learning_rate": 1.92909531105242e-06, "loss": 0.0615, "num_input_tokens_seen": 28772896, "step": 42680 }, { "epoch": 1.0428016514792466, "grad_norm": 0.05295789614319801, "learning_rate": 1.929063768574774e-06, "loss": 0.0421, "num_input_tokens_seen": 28776224, "step": 42685 }, { "epoch": 1.0429238023110938, "grad_norm": 22.07121467590332, "learning_rate": 1.9290322193407264e-06, "loss": 0.0861, "num_input_tokens_seen": 28779488, "step": 42690 }, { "epoch": 1.0430459531429408, "grad_norm": 98.79450988769531, "learning_rate": 1.9290006633505065e-06, "loss": 0.101, "num_input_tokens_seen": 28782752, "step": 42695 }, { "epoch": 1.043168103974788, "grad_norm": 0.07113608717918396, "learning_rate": 1.928969100604344e-06, "loss": 0.0381, "num_input_tokens_seen": 28786080, "step": 42700 }, { "epoch": 1.0432902548066352, "grad_norm": 8.405142784118652, "learning_rate": 1.9289375311024683e-06, "loss": 0.0541, "num_input_tokens_seen": 28789664, "step": 42705 }, { "epoch": 1.0434124056384824, "grad_norm": 0.5017864108085632, "learning_rate": 1.9289059548451094e-06, "loss": 0.0356, "num_input_tokens_seen": 28792992, "step": 42710 }, { "epoch": 1.0435345564703296, "grad_norm": 4.686799049377441, "learning_rate": 1.9288743718324963e-06, "loss": 0.1014, "num_input_tokens_seen": 28796768, "step": 42715 }, { "epoch": 1.0436567073021767, "grad_norm": 0.3890798091888428, "learning_rate": 1.9288427820648586e-06, "loss": 0.093, "num_input_tokens_seen": 28800224, "step": 42720 }, { "epoch": 1.043778858134024, "grad_norm": 0.38483116030693054, "learning_rate": 1.9288111855424266e-06, "loss": 0.0736, "num_input_tokens_seen": 28803936, "step": 42725 }, { "epoch": 1.0439010089658711, "grad_norm": 0.13553577661514282, "learning_rate": 1.92877958226543e-06, "loss": 0.0022, "num_input_tokens_seen": 28807264, "step": 42730 }, { "epoch": 1.0440231597977183, "grad_norm": 0.2688484489917755, "learning_rate": 1.9287479722340985e-06, "loss": 0.1002, "num_input_tokens_seen": 28810592, "step": 42735 }, { "epoch": 1.0441453106295653, "grad_norm": 89.76862335205078, "learning_rate": 1.928716355448662e-06, "loss": 0.1476, "num_input_tokens_seen": 28813792, "step": 42740 }, { "epoch": 1.0442674614614125, "grad_norm": 44.90713119506836, "learning_rate": 1.92868473190935e-06, "loss": 0.0419, "num_input_tokens_seen": 28817568, "step": 42745 }, { "epoch": 1.0443896122932597, "grad_norm": 50.53390121459961, "learning_rate": 1.9286531016163934e-06, "loss": 0.0304, "num_input_tokens_seen": 28821216, "step": 42750 }, { "epoch": 1.0445117631251069, "grad_norm": 1.4952304363250732, "learning_rate": 1.928621464570021e-06, "loss": 0.1421, "num_input_tokens_seen": 28824352, "step": 42755 }, { "epoch": 1.044633913956954, "grad_norm": 0.21666060388088226, "learning_rate": 1.9285898207704637e-06, "loss": 0.0645, "num_input_tokens_seen": 28827616, "step": 42760 }, { "epoch": 1.0447560647888012, "grad_norm": 83.15283203125, "learning_rate": 1.928558170217952e-06, "loss": 0.113, "num_input_tokens_seen": 28830944, "step": 42765 }, { "epoch": 1.0448782156206484, "grad_norm": 0.0197481419891119, "learning_rate": 1.9285265129127147e-06, "loss": 0.1068, "num_input_tokens_seen": 28834336, "step": 42770 }, { "epoch": 1.0450003664524956, "grad_norm": 1.7862377166748047, "learning_rate": 1.9284948488549834e-06, "loss": 0.2002, "num_input_tokens_seen": 28837856, "step": 42775 }, { "epoch": 1.0451225172843428, "grad_norm": 0.22275428473949432, "learning_rate": 1.928463178044988e-06, "loss": 0.0059, "num_input_tokens_seen": 28840672, "step": 42780 }, { "epoch": 1.0452446681161898, "grad_norm": 1.0340585708618164, "learning_rate": 1.9284315004829582e-06, "loss": 0.1131, "num_input_tokens_seen": 28844128, "step": 42785 }, { "epoch": 1.045366818948037, "grad_norm": 0.07115308195352554, "learning_rate": 1.9283998161691247e-06, "loss": 0.1306, "num_input_tokens_seen": 28848032, "step": 42790 }, { "epoch": 1.0454889697798841, "grad_norm": 0.12588505446910858, "learning_rate": 1.9283681251037187e-06, "loss": 0.1494, "num_input_tokens_seen": 28851424, "step": 42795 }, { "epoch": 1.0456111206117313, "grad_norm": 0.15897129476070404, "learning_rate": 1.928336427286969e-06, "loss": 0.0593, "num_input_tokens_seen": 28854496, "step": 42800 }, { "epoch": 1.0457332714435785, "grad_norm": 15.23776626586914, "learning_rate": 1.928304722719108e-06, "loss": 0.1006, "num_input_tokens_seen": 28857888, "step": 42805 }, { "epoch": 1.0458554222754257, "grad_norm": 29.71601676940918, "learning_rate": 1.9282730114003652e-06, "loss": 0.0862, "num_input_tokens_seen": 28861280, "step": 42810 }, { "epoch": 1.045977573107273, "grad_norm": 0.10771415382623672, "learning_rate": 1.928241293330971e-06, "loss": 0.0299, "num_input_tokens_seen": 28864608, "step": 42815 }, { "epoch": 1.04609972393912, "grad_norm": 0.10723359137773514, "learning_rate": 1.928209568511157e-06, "loss": 0.0559, "num_input_tokens_seen": 28868320, "step": 42820 }, { "epoch": 1.0462218747709673, "grad_norm": 0.06635193526744843, "learning_rate": 1.928177836941153e-06, "loss": 0.0955, "num_input_tokens_seen": 28871456, "step": 42825 }, { "epoch": 1.0463440256028143, "grad_norm": 0.2644246220588684, "learning_rate": 1.92814609862119e-06, "loss": 0.0599, "num_input_tokens_seen": 28874592, "step": 42830 }, { "epoch": 1.0464661764346614, "grad_norm": 44.2717170715332, "learning_rate": 1.928114353551499e-06, "loss": 0.0608, "num_input_tokens_seen": 28878176, "step": 42835 }, { "epoch": 1.0465883272665086, "grad_norm": 0.18394502997398376, "learning_rate": 1.928082601732311e-06, "loss": 0.0408, "num_input_tokens_seen": 28881504, "step": 42840 }, { "epoch": 1.0467104780983558, "grad_norm": 0.11878114193677902, "learning_rate": 1.9280508431638567e-06, "loss": 0.0309, "num_input_tokens_seen": 28885216, "step": 42845 }, { "epoch": 1.046832628930203, "grad_norm": 30.702194213867188, "learning_rate": 1.928019077846367e-06, "loss": 0.1235, "num_input_tokens_seen": 28888544, "step": 42850 }, { "epoch": 1.0469547797620502, "grad_norm": 0.15721218287944794, "learning_rate": 1.927987305780073e-06, "loss": 0.1238, "num_input_tokens_seen": 28891744, "step": 42855 }, { "epoch": 1.0470769305938974, "grad_norm": 0.7722503542900085, "learning_rate": 1.9279555269652053e-06, "loss": 0.1013, "num_input_tokens_seen": 28894944, "step": 42860 }, { "epoch": 1.0471990814257446, "grad_norm": 1.5233492851257324, "learning_rate": 1.927923741401996e-06, "loss": 0.0614, "num_input_tokens_seen": 28897952, "step": 42865 }, { "epoch": 1.0473212322575918, "grad_norm": 6.74429178237915, "learning_rate": 1.9278919490906754e-06, "loss": 0.0939, "num_input_tokens_seen": 28901344, "step": 42870 }, { "epoch": 1.0474433830894387, "grad_norm": 44.9547119140625, "learning_rate": 1.9278601500314748e-06, "loss": 0.0972, "num_input_tokens_seen": 28904416, "step": 42875 }, { "epoch": 1.047565533921286, "grad_norm": 3.7963287830352783, "learning_rate": 1.927828344224626e-06, "loss": 0.0583, "num_input_tokens_seen": 28907872, "step": 42880 }, { "epoch": 1.0476876847531331, "grad_norm": 7.7191009521484375, "learning_rate": 1.9277965316703595e-06, "loss": 0.0871, "num_input_tokens_seen": 28910880, "step": 42885 }, { "epoch": 1.0478098355849803, "grad_norm": 165.5267791748047, "learning_rate": 1.9277647123689077e-06, "loss": 0.0423, "num_input_tokens_seen": 28914592, "step": 42890 }, { "epoch": 1.0479319864168275, "grad_norm": 30.80902862548828, "learning_rate": 1.9277328863205006e-06, "loss": 0.0861, "num_input_tokens_seen": 28918368, "step": 42895 }, { "epoch": 1.0480541372486747, "grad_norm": 0.3250162899494171, "learning_rate": 1.927701053525371e-06, "loss": 0.0989, "num_input_tokens_seen": 28922592, "step": 42900 }, { "epoch": 1.0481762880805219, "grad_norm": 0.34292522072792053, "learning_rate": 1.92766921398375e-06, "loss": 0.0402, "num_input_tokens_seen": 28926048, "step": 42905 }, { "epoch": 1.048298438912369, "grad_norm": 34.1561164855957, "learning_rate": 1.927637367695868e-06, "loss": 0.0911, "num_input_tokens_seen": 28929632, "step": 42910 }, { "epoch": 1.0484205897442163, "grad_norm": 0.0897122249007225, "learning_rate": 1.9276055146619582e-06, "loss": 0.0014, "num_input_tokens_seen": 28933088, "step": 42915 }, { "epoch": 1.0485427405760632, "grad_norm": 0.08465170115232468, "learning_rate": 1.9275736548822516e-06, "loss": 0.0398, "num_input_tokens_seen": 28936352, "step": 42920 }, { "epoch": 1.0486648914079104, "grad_norm": 12.695818901062012, "learning_rate": 1.9275417883569796e-06, "loss": 0.1345, "num_input_tokens_seen": 28939680, "step": 42925 }, { "epoch": 1.0487870422397576, "grad_norm": 0.030476637184619904, "learning_rate": 1.9275099150863747e-06, "loss": 0.1114, "num_input_tokens_seen": 28943648, "step": 42930 }, { "epoch": 1.0489091930716048, "grad_norm": 0.08667637407779694, "learning_rate": 1.9274780350706678e-06, "loss": 0.0751, "num_input_tokens_seen": 28946912, "step": 42935 }, { "epoch": 1.049031343903452, "grad_norm": 0.3473820090293884, "learning_rate": 1.9274461483100916e-06, "loss": 0.0015, "num_input_tokens_seen": 28950048, "step": 42940 }, { "epoch": 1.0491534947352992, "grad_norm": 78.53522491455078, "learning_rate": 1.927414254804877e-06, "loss": 0.0286, "num_input_tokens_seen": 28953504, "step": 42945 }, { "epoch": 1.0492756455671464, "grad_norm": 80.54452514648438, "learning_rate": 1.9273823545552573e-06, "loss": 0.1131, "num_input_tokens_seen": 28956704, "step": 42950 }, { "epoch": 1.0493977963989936, "grad_norm": 0.013884730637073517, "learning_rate": 1.927350447561463e-06, "loss": 0.0012, "num_input_tokens_seen": 28959968, "step": 42955 }, { "epoch": 1.0495199472308407, "grad_norm": 0.05496250092983246, "learning_rate": 1.927318533823727e-06, "loss": 0.0587, "num_input_tokens_seen": 28963744, "step": 42960 }, { "epoch": 1.0496420980626877, "grad_norm": 41.326560974121094, "learning_rate": 1.927286613342281e-06, "loss": 0.0692, "num_input_tokens_seen": 28967200, "step": 42965 }, { "epoch": 1.049764248894535, "grad_norm": 0.02888176217675209, "learning_rate": 1.9272546861173576e-06, "loss": 0.0739, "num_input_tokens_seen": 28971040, "step": 42970 }, { "epoch": 1.049886399726382, "grad_norm": 12.679499626159668, "learning_rate": 1.9272227521491887e-06, "loss": 0.1976, "num_input_tokens_seen": 28974240, "step": 42975 }, { "epoch": 1.0500085505582293, "grad_norm": 15.679131507873535, "learning_rate": 1.927190811438007e-06, "loss": 0.1454, "num_input_tokens_seen": 28977568, "step": 42980 }, { "epoch": 1.0501307013900765, "grad_norm": 0.0508650504052639, "learning_rate": 1.9271588639840434e-06, "loss": 0.061, "num_input_tokens_seen": 28981216, "step": 42985 }, { "epoch": 1.0502528522219237, "grad_norm": 1.0628223419189453, "learning_rate": 1.9271269097875317e-06, "loss": 0.0465, "num_input_tokens_seen": 28984672, "step": 42990 }, { "epoch": 1.0503750030537709, "grad_norm": 0.9339186549186707, "learning_rate": 1.9270949488487038e-06, "loss": 0.0031, "num_input_tokens_seen": 28988384, "step": 42995 }, { "epoch": 1.050497153885618, "grad_norm": 59.791831970214844, "learning_rate": 1.9270629811677917e-06, "loss": 0.0765, "num_input_tokens_seen": 28991648, "step": 43000 }, { "epoch": 1.0506193047174652, "grad_norm": 0.22662672400474548, "learning_rate": 1.927031006745029e-06, "loss": 0.0022, "num_input_tokens_seen": 28995104, "step": 43005 }, { "epoch": 1.0507414555493122, "grad_norm": 0.057953156530857086, "learning_rate": 1.9269990255806467e-06, "loss": 0.0963, "num_input_tokens_seen": 28998496, "step": 43010 }, { "epoch": 1.0508636063811594, "grad_norm": 0.03372470661997795, "learning_rate": 1.9269670376748783e-06, "loss": 0.0566, "num_input_tokens_seen": 29002272, "step": 43015 }, { "epoch": 1.0509857572130066, "grad_norm": 0.13802820444107056, "learning_rate": 1.9269350430279566e-06, "loss": 0.0036, "num_input_tokens_seen": 29005792, "step": 43020 }, { "epoch": 1.0511079080448538, "grad_norm": 21.07268524169922, "learning_rate": 1.926903041640114e-06, "loss": 0.0948, "num_input_tokens_seen": 29009056, "step": 43025 }, { "epoch": 1.051230058876701, "grad_norm": 65.3128890991211, "learning_rate": 1.9268710335115825e-06, "loss": 0.0552, "num_input_tokens_seen": 29012640, "step": 43030 }, { "epoch": 1.0513522097085481, "grad_norm": 14.12056827545166, "learning_rate": 1.926839018642596e-06, "loss": 0.1364, "num_input_tokens_seen": 29016352, "step": 43035 }, { "epoch": 1.0514743605403953, "grad_norm": 0.07477191835641861, "learning_rate": 1.926806997033387e-06, "loss": 0.1925, "num_input_tokens_seen": 29020512, "step": 43040 }, { "epoch": 1.0515965113722425, "grad_norm": 0.4883725047111511, "learning_rate": 1.926774968684188e-06, "loss": 0.0892, "num_input_tokens_seen": 29023904, "step": 43045 }, { "epoch": 1.0517186622040897, "grad_norm": 0.05889091640710831, "learning_rate": 1.926742933595232e-06, "loss": 0.1294, "num_input_tokens_seen": 29027040, "step": 43050 }, { "epoch": 1.0518408130359367, "grad_norm": 16.98215675354004, "learning_rate": 1.9267108917667528e-06, "loss": 0.0738, "num_input_tokens_seen": 29030048, "step": 43055 }, { "epoch": 1.0519629638677839, "grad_norm": 0.4212515950202942, "learning_rate": 1.926678843198982e-06, "loss": 0.0333, "num_input_tokens_seen": 29033568, "step": 43060 }, { "epoch": 1.052085114699631, "grad_norm": 23.378690719604492, "learning_rate": 1.926646787892154e-06, "loss": 0.1935, "num_input_tokens_seen": 29036832, "step": 43065 }, { "epoch": 1.0522072655314783, "grad_norm": 0.46372488141059875, "learning_rate": 1.926614725846501e-06, "loss": 0.0139, "num_input_tokens_seen": 29040608, "step": 43070 }, { "epoch": 1.0523294163633254, "grad_norm": 60.715450286865234, "learning_rate": 1.9265826570622565e-06, "loss": 0.074, "num_input_tokens_seen": 29043872, "step": 43075 }, { "epoch": 1.0524515671951726, "grad_norm": 17.99675941467285, "learning_rate": 1.9265505815396533e-06, "loss": 0.1008, "num_input_tokens_seen": 29047008, "step": 43080 }, { "epoch": 1.0525737180270198, "grad_norm": 1.093004822731018, "learning_rate": 1.926518499278926e-06, "loss": 0.1009, "num_input_tokens_seen": 29049888, "step": 43085 }, { "epoch": 1.052695868858867, "grad_norm": 49.34836196899414, "learning_rate": 1.9264864102803062e-06, "loss": 0.0497, "num_input_tokens_seen": 29053088, "step": 43090 }, { "epoch": 1.0528180196907142, "grad_norm": 35.51371383666992, "learning_rate": 1.9264543145440283e-06, "loss": 0.0636, "num_input_tokens_seen": 29056608, "step": 43095 }, { "epoch": 1.0529401705225612, "grad_norm": 0.07104449719190598, "learning_rate": 1.9264222120703253e-06, "loss": 0.1812, "num_input_tokens_seen": 29059744, "step": 43100 }, { "epoch": 1.0530623213544084, "grad_norm": 28.080835342407227, "learning_rate": 1.9263901028594307e-06, "loss": 0.1036, "num_input_tokens_seen": 29063648, "step": 43105 }, { "epoch": 1.0531844721862555, "grad_norm": 0.15192584693431854, "learning_rate": 1.9263579869115783e-06, "loss": 0.1083, "num_input_tokens_seen": 29066720, "step": 43110 }, { "epoch": 1.0533066230181027, "grad_norm": 32.0471305847168, "learning_rate": 1.9263258642270018e-06, "loss": 0.0401, "num_input_tokens_seen": 29070048, "step": 43115 }, { "epoch": 1.05342877384995, "grad_norm": 39.45811080932617, "learning_rate": 1.926293734805934e-06, "loss": 0.1869, "num_input_tokens_seen": 29072992, "step": 43120 }, { "epoch": 1.0535509246817971, "grad_norm": 0.7179755568504333, "learning_rate": 1.926261598648609e-06, "loss": 0.0173, "num_input_tokens_seen": 29076704, "step": 43125 }, { "epoch": 1.0536730755136443, "grad_norm": 0.21337614953517914, "learning_rate": 1.926229455755261e-06, "loss": 0.047, "num_input_tokens_seen": 29080288, "step": 43130 }, { "epoch": 1.0537952263454915, "grad_norm": 1.0191816091537476, "learning_rate": 1.926197306126123e-06, "loss": 0.1349, "num_input_tokens_seen": 29083168, "step": 43135 }, { "epoch": 1.0539173771773385, "grad_norm": 87.6432876586914, "learning_rate": 1.926165149761429e-06, "loss": 0.0556, "num_input_tokens_seen": 29086496, "step": 43140 }, { "epoch": 1.0540395280091857, "grad_norm": 25.76075553894043, "learning_rate": 1.9261329866614125e-06, "loss": 0.1828, "num_input_tokens_seen": 29089568, "step": 43145 }, { "epoch": 1.0541616788410328, "grad_norm": 47.1466178894043, "learning_rate": 1.9261008168263082e-06, "loss": 0.0622, "num_input_tokens_seen": 29093152, "step": 43150 }, { "epoch": 1.05428382967288, "grad_norm": 0.02713550254702568, "learning_rate": 1.92606864025635e-06, "loss": 0.0211, "num_input_tokens_seen": 29097312, "step": 43155 }, { "epoch": 1.0544059805047272, "grad_norm": 11.254847526550293, "learning_rate": 1.9260364569517715e-06, "loss": 0.1547, "num_input_tokens_seen": 29100576, "step": 43160 }, { "epoch": 1.0545281313365744, "grad_norm": 0.06463029235601425, "learning_rate": 1.926004266912806e-06, "loss": 0.1126, "num_input_tokens_seen": 29104032, "step": 43165 }, { "epoch": 1.0546502821684216, "grad_norm": 0.1764514297246933, "learning_rate": 1.9259720701396893e-06, "loss": 0.0023, "num_input_tokens_seen": 29107680, "step": 43170 }, { "epoch": 1.0547724330002688, "grad_norm": 7.495051860809326, "learning_rate": 1.9259398666326545e-06, "loss": 0.0037, "num_input_tokens_seen": 29110944, "step": 43175 }, { "epoch": 1.054894583832116, "grad_norm": 0.2713419198989868, "learning_rate": 1.9259076563919356e-06, "loss": 0.0061, "num_input_tokens_seen": 29113952, "step": 43180 }, { "epoch": 1.0550167346639632, "grad_norm": 23.387500762939453, "learning_rate": 1.9258754394177672e-06, "loss": 0.0589, "num_input_tokens_seen": 29117216, "step": 43185 }, { "epoch": 1.0551388854958101, "grad_norm": 0.5097941160202026, "learning_rate": 1.925843215710384e-06, "loss": 0.0329, "num_input_tokens_seen": 29120864, "step": 43190 }, { "epoch": 1.0552610363276573, "grad_norm": 5.5870280265808105, "learning_rate": 1.92581098527002e-06, "loss": 0.0941, "num_input_tokens_seen": 29124192, "step": 43195 }, { "epoch": 1.0553831871595045, "grad_norm": 0.08644071966409683, "learning_rate": 1.925778748096909e-06, "loss": 0.0453, "num_input_tokens_seen": 29127136, "step": 43200 }, { "epoch": 1.0555053379913517, "grad_norm": 90.56088256835938, "learning_rate": 1.925746504191286e-06, "loss": 0.0303, "num_input_tokens_seen": 29130272, "step": 43205 }, { "epoch": 1.055627488823199, "grad_norm": 0.05298614501953125, "learning_rate": 1.9257142535533857e-06, "loss": 0.0251, "num_input_tokens_seen": 29133728, "step": 43210 }, { "epoch": 1.055749639655046, "grad_norm": 75.3792953491211, "learning_rate": 1.925681996183442e-06, "loss": 0.1662, "num_input_tokens_seen": 29136992, "step": 43215 }, { "epoch": 1.0558717904868933, "grad_norm": 11.935559272766113, "learning_rate": 1.92564973208169e-06, "loss": 0.2896, "num_input_tokens_seen": 29140000, "step": 43220 }, { "epoch": 1.0559939413187405, "grad_norm": 0.2589423954486847, "learning_rate": 1.9256174612483644e-06, "loss": 0.0322, "num_input_tokens_seen": 29143328, "step": 43225 }, { "epoch": 1.0561160921505874, "grad_norm": 110.4568099975586, "learning_rate": 1.925585183683699e-06, "loss": 0.1653, "num_input_tokens_seen": 29146336, "step": 43230 }, { "epoch": 1.0562382429824346, "grad_norm": 11.954909324645996, "learning_rate": 1.92555289938793e-06, "loss": 0.093, "num_input_tokens_seen": 29149664, "step": 43235 }, { "epoch": 1.0563603938142818, "grad_norm": 0.0636618435382843, "learning_rate": 1.925520608361291e-06, "loss": 0.0861, "num_input_tokens_seen": 29152864, "step": 43240 }, { "epoch": 1.056482544646129, "grad_norm": 27.24610710144043, "learning_rate": 1.9254883106040173e-06, "loss": 0.1102, "num_input_tokens_seen": 29156128, "step": 43245 }, { "epoch": 1.0566046954779762, "grad_norm": 43.97340774536133, "learning_rate": 1.9254560061163437e-06, "loss": 0.1114, "num_input_tokens_seen": 29159200, "step": 43250 }, { "epoch": 1.0567268463098234, "grad_norm": 0.12428581714630127, "learning_rate": 1.9254236948985046e-06, "loss": 0.0591, "num_input_tokens_seen": 29162336, "step": 43255 }, { "epoch": 1.0568489971416706, "grad_norm": 31.620731353759766, "learning_rate": 1.9253913769507363e-06, "loss": 0.1124, "num_input_tokens_seen": 29165216, "step": 43260 }, { "epoch": 1.0569711479735178, "grad_norm": 0.5041882395744324, "learning_rate": 1.9253590522732727e-06, "loss": 0.1745, "num_input_tokens_seen": 29168352, "step": 43265 }, { "epoch": 1.057093298805365, "grad_norm": 17.660442352294922, "learning_rate": 1.9253267208663486e-06, "loss": 0.1335, "num_input_tokens_seen": 29171744, "step": 43270 }, { "epoch": 1.057215449637212, "grad_norm": 0.2939334809780121, "learning_rate": 1.9252943827302006e-06, "loss": 0.0037, "num_input_tokens_seen": 29174752, "step": 43275 }, { "epoch": 1.0573376004690591, "grad_norm": 0.38700637221336365, "learning_rate": 1.9252620378650627e-06, "loss": 0.1023, "num_input_tokens_seen": 29177952, "step": 43280 }, { "epoch": 1.0574597513009063, "grad_norm": 0.7174057960510254, "learning_rate": 1.92522968627117e-06, "loss": 0.0288, "num_input_tokens_seen": 29181024, "step": 43285 }, { "epoch": 1.0575819021327535, "grad_norm": 0.19030173122882843, "learning_rate": 1.9251973279487586e-06, "loss": 0.1486, "num_input_tokens_seen": 29184096, "step": 43290 }, { "epoch": 1.0577040529646007, "grad_norm": 0.8026861548423767, "learning_rate": 1.9251649628980633e-06, "loss": 0.094, "num_input_tokens_seen": 29187808, "step": 43295 }, { "epoch": 1.0578262037964479, "grad_norm": 2.309544086456299, "learning_rate": 1.925132591119319e-06, "loss": 0.0648, "num_input_tokens_seen": 29191072, "step": 43300 }, { "epoch": 1.057948354628295, "grad_norm": 2.089252471923828, "learning_rate": 1.9251002126127626e-06, "loss": 0.1138, "num_input_tokens_seen": 29194912, "step": 43305 }, { "epoch": 1.0580705054601423, "grad_norm": 10.03940200805664, "learning_rate": 1.9250678273786283e-06, "loss": 0.0849, "num_input_tokens_seen": 29198048, "step": 43310 }, { "epoch": 1.0581926562919894, "grad_norm": 19.415321350097656, "learning_rate": 1.9250354354171515e-06, "loss": 0.0362, "num_input_tokens_seen": 29201120, "step": 43315 }, { "epoch": 1.0583148071238364, "grad_norm": 134.2286834716797, "learning_rate": 1.9250030367285684e-06, "loss": 0.018, "num_input_tokens_seen": 29204192, "step": 43320 }, { "epoch": 1.0584369579556836, "grad_norm": 0.22014103829860687, "learning_rate": 1.9249706313131147e-06, "loss": 0.0911, "num_input_tokens_seen": 29207392, "step": 43325 }, { "epoch": 1.0585591087875308, "grad_norm": 0.7637476325035095, "learning_rate": 1.924938219171026e-06, "loss": 0.0367, "num_input_tokens_seen": 29210912, "step": 43330 }, { "epoch": 1.058681259619378, "grad_norm": 0.233280748128891, "learning_rate": 1.9249058003025367e-06, "loss": 0.0325, "num_input_tokens_seen": 29213536, "step": 43335 }, { "epoch": 1.0588034104512252, "grad_norm": 27.464370727539062, "learning_rate": 1.9248733747078847e-06, "loss": 0.0867, "num_input_tokens_seen": 29217056, "step": 43340 }, { "epoch": 1.0589255612830724, "grad_norm": 85.70044708251953, "learning_rate": 1.9248409423873044e-06, "loss": 0.1057, "num_input_tokens_seen": 29220384, "step": 43345 }, { "epoch": 1.0590477121149195, "grad_norm": 7.4150004386901855, "learning_rate": 1.9248085033410318e-06, "loss": 0.0897, "num_input_tokens_seen": 29223648, "step": 43350 }, { "epoch": 1.0591698629467667, "grad_norm": 0.10273587703704834, "learning_rate": 1.9247760575693036e-06, "loss": 0.0721, "num_input_tokens_seen": 29227360, "step": 43355 }, { "epoch": 1.059292013778614, "grad_norm": 0.9129404425621033, "learning_rate": 1.9247436050723545e-06, "loss": 0.0285, "num_input_tokens_seen": 29230944, "step": 43360 }, { "epoch": 1.059414164610461, "grad_norm": 21.414705276489258, "learning_rate": 1.9247111458504216e-06, "loss": 0.1744, "num_input_tokens_seen": 29234080, "step": 43365 }, { "epoch": 1.059536315442308, "grad_norm": 0.11530768871307373, "learning_rate": 1.9246786799037403e-06, "loss": 0.0446, "num_input_tokens_seen": 29237024, "step": 43370 }, { "epoch": 1.0596584662741553, "grad_norm": 0.08778601139783859, "learning_rate": 1.924646207232547e-06, "loss": 0.0152, "num_input_tokens_seen": 29240352, "step": 43375 }, { "epoch": 1.0597806171060025, "grad_norm": 2.5082666873931885, "learning_rate": 1.9246137278370783e-06, "loss": 0.0404, "num_input_tokens_seen": 29243616, "step": 43380 }, { "epoch": 1.0599027679378497, "grad_norm": 7.4902167320251465, "learning_rate": 1.9245812417175692e-06, "loss": 0.1136, "num_input_tokens_seen": 29247328, "step": 43385 }, { "epoch": 1.0600249187696968, "grad_norm": 0.378722608089447, "learning_rate": 1.9245487488742568e-06, "loss": 0.0794, "num_input_tokens_seen": 29250208, "step": 43390 }, { "epoch": 1.060147069601544, "grad_norm": 0.16203396022319794, "learning_rate": 1.9245162493073776e-06, "loss": 0.0574, "num_input_tokens_seen": 29253344, "step": 43395 }, { "epoch": 1.0602692204333912, "grad_norm": 54.568172454833984, "learning_rate": 1.924483743017167e-06, "loss": 0.0906, "num_input_tokens_seen": 29256864, "step": 43400 }, { "epoch": 1.0603913712652384, "grad_norm": 0.26273974776268005, "learning_rate": 1.9244512300038623e-06, "loss": 0.0496, "num_input_tokens_seen": 29260128, "step": 43405 }, { "epoch": 1.0605135220970854, "grad_norm": 0.2209581881761551, "learning_rate": 1.9244187102676993e-06, "loss": 0.052, "num_input_tokens_seen": 29263968, "step": 43410 }, { "epoch": 1.0606356729289326, "grad_norm": 0.10516363382339478, "learning_rate": 1.9243861838089153e-06, "loss": 0.0027, "num_input_tokens_seen": 29267232, "step": 43415 }, { "epoch": 1.0607578237607798, "grad_norm": 1.000648021697998, "learning_rate": 1.924353650627746e-06, "loss": 0.0472, "num_input_tokens_seen": 29270496, "step": 43420 }, { "epoch": 1.060879974592627, "grad_norm": 0.04793846979737282, "learning_rate": 1.9243211107244284e-06, "loss": 0.0737, "num_input_tokens_seen": 29273952, "step": 43425 }, { "epoch": 1.0610021254244741, "grad_norm": 20.53110122680664, "learning_rate": 1.924288564099199e-06, "loss": 0.2243, "num_input_tokens_seen": 29277280, "step": 43430 }, { "epoch": 1.0611242762563213, "grad_norm": 2.2913577556610107, "learning_rate": 1.9242560107522947e-06, "loss": 0.1931, "num_input_tokens_seen": 29280288, "step": 43435 }, { "epoch": 1.0612464270881685, "grad_norm": 1.403280258178711, "learning_rate": 1.9242234506839523e-06, "loss": 0.0778, "num_input_tokens_seen": 29283488, "step": 43440 }, { "epoch": 1.0613685779200157, "grad_norm": 54.15962600708008, "learning_rate": 1.9241908838944077e-06, "loss": 0.0773, "num_input_tokens_seen": 29287200, "step": 43445 }, { "epoch": 1.061490728751863, "grad_norm": 34.25768280029297, "learning_rate": 1.924158310383899e-06, "loss": 0.0145, "num_input_tokens_seen": 29290336, "step": 43450 }, { "epoch": 1.0616128795837099, "grad_norm": 37.82624816894531, "learning_rate": 1.9241257301526623e-06, "loss": 0.0795, "num_input_tokens_seen": 29293408, "step": 43455 }, { "epoch": 1.061735030415557, "grad_norm": 0.4427298605442047, "learning_rate": 1.9240931432009346e-06, "loss": 0.065, "num_input_tokens_seen": 29296992, "step": 43460 }, { "epoch": 1.0618571812474042, "grad_norm": 0.4335486590862274, "learning_rate": 1.9240605495289533e-06, "loss": 0.1063, "num_input_tokens_seen": 29300320, "step": 43465 }, { "epoch": 1.0619793320792514, "grad_norm": 21.76557731628418, "learning_rate": 1.924027949136955e-06, "loss": 0.105, "num_input_tokens_seen": 29303520, "step": 43470 }, { "epoch": 1.0621014829110986, "grad_norm": 6.954076766967773, "learning_rate": 1.9239953420251767e-06, "loss": 0.024, "num_input_tokens_seen": 29306592, "step": 43475 }, { "epoch": 1.0622236337429458, "grad_norm": 2.9126462936401367, "learning_rate": 1.9239627281938562e-06, "loss": 0.1404, "num_input_tokens_seen": 29310048, "step": 43480 }, { "epoch": 1.062345784574793, "grad_norm": 0.0804481953382492, "learning_rate": 1.92393010764323e-06, "loss": 0.0554, "num_input_tokens_seen": 29313440, "step": 43485 }, { "epoch": 1.0624679354066402, "grad_norm": 57.70439910888672, "learning_rate": 1.9238974803735357e-06, "loss": 0.0256, "num_input_tokens_seen": 29316896, "step": 43490 }, { "epoch": 1.0625900862384874, "grad_norm": 20.333383560180664, "learning_rate": 1.92386484638501e-06, "loss": 0.102, "num_input_tokens_seen": 29320288, "step": 43495 }, { "epoch": 1.0627122370703344, "grad_norm": 10.79039192199707, "learning_rate": 1.923832205677891e-06, "loss": 0.0439, "num_input_tokens_seen": 29323296, "step": 43500 }, { "epoch": 1.0628343879021815, "grad_norm": 0.3620006740093231, "learning_rate": 1.9237995582524154e-06, "loss": 0.0404, "num_input_tokens_seen": 29326304, "step": 43505 }, { "epoch": 1.0629565387340287, "grad_norm": 18.919797897338867, "learning_rate": 1.923766904108821e-06, "loss": 0.2442, "num_input_tokens_seen": 29330016, "step": 43510 }, { "epoch": 1.063078689565876, "grad_norm": 0.08378148823976517, "learning_rate": 1.9237342432473456e-06, "loss": 0.1451, "num_input_tokens_seen": 29333536, "step": 43515 }, { "epoch": 1.0632008403977231, "grad_norm": 26.00210952758789, "learning_rate": 1.923701575668226e-06, "loss": 0.1617, "num_input_tokens_seen": 29337120, "step": 43520 }, { "epoch": 1.0633229912295703, "grad_norm": 0.38692906498908997, "learning_rate": 1.9236689013717006e-06, "loss": 0.0353, "num_input_tokens_seen": 29340512, "step": 43525 }, { "epoch": 1.0634451420614175, "grad_norm": 30.114299774169922, "learning_rate": 1.9236362203580063e-06, "loss": 0.0301, "num_input_tokens_seen": 29343584, "step": 43530 }, { "epoch": 1.0635672928932647, "grad_norm": 0.2355722337961197, "learning_rate": 1.9236035326273806e-06, "loss": 0.0267, "num_input_tokens_seen": 29346848, "step": 43535 }, { "epoch": 1.0636894437251119, "grad_norm": 0.0823436751961708, "learning_rate": 1.923570838180062e-06, "loss": 0.1429, "num_input_tokens_seen": 29350432, "step": 43540 }, { "epoch": 1.0638115945569588, "grad_norm": 0.10285349190235138, "learning_rate": 1.9235381370162872e-06, "loss": 0.1278, "num_input_tokens_seen": 29353696, "step": 43545 }, { "epoch": 1.063933745388806, "grad_norm": 11.621041297912598, "learning_rate": 1.923505429136295e-06, "loss": 0.3489, "num_input_tokens_seen": 29357024, "step": 43550 }, { "epoch": 1.0640558962206532, "grad_norm": 0.04566968232393265, "learning_rate": 1.923472714540323e-06, "loss": 0.1869, "num_input_tokens_seen": 29360480, "step": 43555 }, { "epoch": 1.0641780470525004, "grad_norm": 0.7290300130844116, "learning_rate": 1.9234399932286093e-06, "loss": 0.1381, "num_input_tokens_seen": 29364384, "step": 43560 }, { "epoch": 1.0643001978843476, "grad_norm": 0.6765745878219604, "learning_rate": 1.9234072652013915e-06, "loss": 0.1341, "num_input_tokens_seen": 29368544, "step": 43565 }, { "epoch": 1.0644223487161948, "grad_norm": 0.37754663825035095, "learning_rate": 1.9233745304589074e-06, "loss": 0.1317, "num_input_tokens_seen": 29371552, "step": 43570 }, { "epoch": 1.064544499548042, "grad_norm": 0.2278277426958084, "learning_rate": 1.9233417890013956e-06, "loss": 0.0815, "num_input_tokens_seen": 29375008, "step": 43575 }, { "epoch": 1.0646666503798892, "grad_norm": 58.44916534423828, "learning_rate": 1.923309040829094e-06, "loss": 0.0384, "num_input_tokens_seen": 29378400, "step": 43580 }, { "epoch": 1.0647888012117361, "grad_norm": 0.45989304780960083, "learning_rate": 1.9232762859422404e-06, "loss": 0.0058, "num_input_tokens_seen": 29381600, "step": 43585 }, { "epoch": 1.0649109520435833, "grad_norm": 13.566542625427246, "learning_rate": 1.9232435243410735e-06, "loss": 0.0437, "num_input_tokens_seen": 29385184, "step": 43590 }, { "epoch": 1.0650331028754305, "grad_norm": 114.31439971923828, "learning_rate": 1.9232107560258317e-06, "loss": 0.0138, "num_input_tokens_seen": 29388128, "step": 43595 }, { "epoch": 1.0651552537072777, "grad_norm": 1.0016684532165527, "learning_rate": 1.9231779809967526e-06, "loss": 0.1534, "num_input_tokens_seen": 29391136, "step": 43600 }, { "epoch": 1.065277404539125, "grad_norm": 4.255090236663818, "learning_rate": 1.9231451992540747e-06, "loss": 0.0552, "num_input_tokens_seen": 29394272, "step": 43605 }, { "epoch": 1.065399555370972, "grad_norm": 34.4926872253418, "learning_rate": 1.923112410798037e-06, "loss": 0.1975, "num_input_tokens_seen": 29398176, "step": 43610 }, { "epoch": 1.0655217062028193, "grad_norm": 10.679594993591309, "learning_rate": 1.923079615628877e-06, "loss": 0.0785, "num_input_tokens_seen": 29401504, "step": 43615 }, { "epoch": 1.0656438570346665, "grad_norm": 0.48413512110710144, "learning_rate": 1.9230468137468344e-06, "loss": 0.0434, "num_input_tokens_seen": 29404512, "step": 43620 }, { "epoch": 1.0657660078665137, "grad_norm": 21.062744140625, "learning_rate": 1.923014005152147e-06, "loss": 0.1786, "num_input_tokens_seen": 29407584, "step": 43625 }, { "epoch": 1.0658881586983608, "grad_norm": 0.29427990317344666, "learning_rate": 1.9229811898450533e-06, "loss": 0.0271, "num_input_tokens_seen": 29410976, "step": 43630 }, { "epoch": 1.0660103095302078, "grad_norm": 19.07900047302246, "learning_rate": 1.9229483678257924e-06, "loss": 0.1013, "num_input_tokens_seen": 29414432, "step": 43635 }, { "epoch": 1.066132460362055, "grad_norm": 0.23669935762882233, "learning_rate": 1.9229155390946025e-06, "loss": 0.0404, "num_input_tokens_seen": 29418400, "step": 43640 }, { "epoch": 1.0662546111939022, "grad_norm": 0.9076651930809021, "learning_rate": 1.9228827036517227e-06, "loss": 0.1124, "num_input_tokens_seen": 29421600, "step": 43645 }, { "epoch": 1.0663767620257494, "grad_norm": 41.32793426513672, "learning_rate": 1.9228498614973917e-06, "loss": 0.1655, "num_input_tokens_seen": 29425120, "step": 43650 }, { "epoch": 1.0664989128575966, "grad_norm": 0.12387000024318695, "learning_rate": 1.922817012631848e-06, "loss": 0.0103, "num_input_tokens_seen": 29428640, "step": 43655 }, { "epoch": 1.0666210636894438, "grad_norm": 62.622108459472656, "learning_rate": 1.922784157055331e-06, "loss": 0.102, "num_input_tokens_seen": 29431968, "step": 43660 }, { "epoch": 1.066743214521291, "grad_norm": 0.23542404174804688, "learning_rate": 1.9227512947680795e-06, "loss": 0.0826, "num_input_tokens_seen": 29435040, "step": 43665 }, { "epoch": 1.0668653653531381, "grad_norm": 18.37114715576172, "learning_rate": 1.922718425770332e-06, "loss": 0.147, "num_input_tokens_seen": 29438368, "step": 43670 }, { "epoch": 1.066987516184985, "grad_norm": 0.10454131662845612, "learning_rate": 1.922685550062328e-06, "loss": 0.019, "num_input_tokens_seen": 29441632, "step": 43675 }, { "epoch": 1.0671096670168323, "grad_norm": 25.471887588500977, "learning_rate": 1.922652667644307e-06, "loss": 0.0625, "num_input_tokens_seen": 29444576, "step": 43680 }, { "epoch": 1.0672318178486795, "grad_norm": 2.033235788345337, "learning_rate": 1.922619778516507e-06, "loss": 0.0015, "num_input_tokens_seen": 29448032, "step": 43685 }, { "epoch": 1.0673539686805267, "grad_norm": 0.3267309367656708, "learning_rate": 1.922586882679168e-06, "loss": 0.0739, "num_input_tokens_seen": 29451424, "step": 43690 }, { "epoch": 1.0674761195123739, "grad_norm": 133.7386016845703, "learning_rate": 1.9225539801325293e-06, "loss": 0.0986, "num_input_tokens_seen": 29454624, "step": 43695 }, { "epoch": 1.067598270344221, "grad_norm": 5.09596586227417, "learning_rate": 1.92252107087683e-06, "loss": 0.0972, "num_input_tokens_seen": 29458016, "step": 43700 }, { "epoch": 1.0677204211760682, "grad_norm": 0.07073884457349777, "learning_rate": 1.922488154912309e-06, "loss": 0.0258, "num_input_tokens_seen": 29461664, "step": 43705 }, { "epoch": 1.0678425720079154, "grad_norm": 0.4683375656604767, "learning_rate": 1.9224552322392064e-06, "loss": 0.0009, "num_input_tokens_seen": 29464992, "step": 43710 }, { "epoch": 1.0679647228397626, "grad_norm": 0.09089743345975876, "learning_rate": 1.9224223028577613e-06, "loss": 0.0303, "num_input_tokens_seen": 29468192, "step": 43715 }, { "epoch": 1.0680868736716098, "grad_norm": 45.598567962646484, "learning_rate": 1.9223893667682125e-06, "loss": 0.2321, "num_input_tokens_seen": 29472160, "step": 43720 }, { "epoch": 1.0682090245034568, "grad_norm": 0.31310445070266724, "learning_rate": 1.9223564239708e-06, "loss": 0.1665, "num_input_tokens_seen": 29475808, "step": 43725 }, { "epoch": 1.068331175335304, "grad_norm": 0.5182093977928162, "learning_rate": 1.9223234744657644e-06, "loss": 0.0035, "num_input_tokens_seen": 29479904, "step": 43730 }, { "epoch": 1.0684533261671512, "grad_norm": 0.7424346804618835, "learning_rate": 1.922290518253344e-06, "loss": 0.0922, "num_input_tokens_seen": 29483104, "step": 43735 }, { "epoch": 1.0685754769989984, "grad_norm": 17.64950180053711, "learning_rate": 1.922257555333779e-06, "loss": 0.0933, "num_input_tokens_seen": 29486240, "step": 43740 }, { "epoch": 1.0686976278308455, "grad_norm": 1.901727318763733, "learning_rate": 1.9222245857073086e-06, "loss": 0.0801, "num_input_tokens_seen": 29489632, "step": 43745 }, { "epoch": 1.0688197786626927, "grad_norm": 0.5220657587051392, "learning_rate": 1.922191609374173e-06, "loss": 0.033, "num_input_tokens_seen": 29493600, "step": 43750 }, { "epoch": 1.06894192949454, "grad_norm": 12.645177841186523, "learning_rate": 1.9221586263346124e-06, "loss": 0.0382, "num_input_tokens_seen": 29496928, "step": 43755 }, { "epoch": 1.0690640803263871, "grad_norm": 67.977783203125, "learning_rate": 1.922125636588866e-06, "loss": 0.2569, "num_input_tokens_seen": 29500384, "step": 43760 }, { "epoch": 1.069186231158234, "grad_norm": 0.14421771466732025, "learning_rate": 1.9220926401371738e-06, "loss": 0.001, "num_input_tokens_seen": 29503712, "step": 43765 }, { "epoch": 1.0693083819900813, "grad_norm": 21.54201316833496, "learning_rate": 1.9220596369797765e-06, "loss": 0.2355, "num_input_tokens_seen": 29507488, "step": 43770 }, { "epoch": 1.0694305328219285, "grad_norm": 11.607332229614258, "learning_rate": 1.9220266271169127e-06, "loss": 0.0427, "num_input_tokens_seen": 29510816, "step": 43775 }, { "epoch": 1.0695526836537756, "grad_norm": 2.3675055503845215, "learning_rate": 1.921993610548824e-06, "loss": 0.0012, "num_input_tokens_seen": 29514016, "step": 43780 }, { "epoch": 1.0696748344856228, "grad_norm": 0.29298821091651917, "learning_rate": 1.9219605872757493e-06, "loss": 0.0944, "num_input_tokens_seen": 29517216, "step": 43785 }, { "epoch": 1.06979698531747, "grad_norm": 21.916318893432617, "learning_rate": 1.921927557297929e-06, "loss": 0.0709, "num_input_tokens_seen": 29520480, "step": 43790 }, { "epoch": 1.0699191361493172, "grad_norm": 0.1172017902135849, "learning_rate": 1.9218945206156043e-06, "loss": 0.0458, "num_input_tokens_seen": 29523552, "step": 43795 }, { "epoch": 1.0700412869811644, "grad_norm": 0.7703777551651001, "learning_rate": 1.921861477229014e-06, "loss": 0.0711, "num_input_tokens_seen": 29526752, "step": 43800 }, { "epoch": 1.0701634378130116, "grad_norm": 0.26279208064079285, "learning_rate": 1.9218284271384e-06, "loss": 0.0032, "num_input_tokens_seen": 29529760, "step": 43805 }, { "epoch": 1.0702855886448588, "grad_norm": 11.817235946655273, "learning_rate": 1.9217953703440007e-06, "loss": 0.0755, "num_input_tokens_seen": 29533088, "step": 43810 }, { "epoch": 1.0704077394767058, "grad_norm": 17.30320167541504, "learning_rate": 1.921762306846058e-06, "loss": 0.1385, "num_input_tokens_seen": 29536544, "step": 43815 }, { "epoch": 1.070529890308553, "grad_norm": 0.43149492144584656, "learning_rate": 1.921729236644812e-06, "loss": 0.0231, "num_input_tokens_seen": 29539680, "step": 43820 }, { "epoch": 1.0706520411404001, "grad_norm": 0.49309828877449036, "learning_rate": 1.9216961597405028e-06, "loss": 0.0352, "num_input_tokens_seen": 29543328, "step": 43825 }, { "epoch": 1.0707741919722473, "grad_norm": 33.81893539428711, "learning_rate": 1.9216630761333713e-06, "loss": 0.1591, "num_input_tokens_seen": 29546848, "step": 43830 }, { "epoch": 1.0708963428040945, "grad_norm": 0.11472368985414505, "learning_rate": 1.921629985823658e-06, "loss": 0.0081, "num_input_tokens_seen": 29550048, "step": 43835 }, { "epoch": 1.0710184936359417, "grad_norm": 1.5263179540634155, "learning_rate": 1.9215968888116038e-06, "loss": 0.0762, "num_input_tokens_seen": 29553312, "step": 43840 }, { "epoch": 1.071140644467789, "grad_norm": 12.363460540771484, "learning_rate": 1.9215637850974488e-06, "loss": 0.1954, "num_input_tokens_seen": 29556256, "step": 43845 }, { "epoch": 1.071262795299636, "grad_norm": 23.935073852539062, "learning_rate": 1.921530674681434e-06, "loss": 0.1681, "num_input_tokens_seen": 29559712, "step": 43850 }, { "epoch": 1.071384946131483, "grad_norm": 2.8645405769348145, "learning_rate": 1.921497557563801e-06, "loss": 0.0359, "num_input_tokens_seen": 29562976, "step": 43855 }, { "epoch": 1.0715070969633302, "grad_norm": 0.6907225251197815, "learning_rate": 1.921464433744789e-06, "loss": 0.0823, "num_input_tokens_seen": 29566496, "step": 43860 }, { "epoch": 1.0716292477951774, "grad_norm": 0.04008014500141144, "learning_rate": 1.9214313032246404e-06, "loss": 0.0272, "num_input_tokens_seen": 29569440, "step": 43865 }, { "epoch": 1.0717513986270246, "grad_norm": 0.18733011186122894, "learning_rate": 1.921398166003595e-06, "loss": 0.0815, "num_input_tokens_seen": 29572960, "step": 43870 }, { "epoch": 1.0718735494588718, "grad_norm": 12.819915771484375, "learning_rate": 1.921365022081895e-06, "loss": 0.0707, "num_input_tokens_seen": 29576096, "step": 43875 }, { "epoch": 1.071995700290719, "grad_norm": 22.413814544677734, "learning_rate": 1.9213318714597803e-06, "loss": 0.1535, "num_input_tokens_seen": 29579488, "step": 43880 }, { "epoch": 1.0721178511225662, "grad_norm": 0.17186643183231354, "learning_rate": 1.9212987141374924e-06, "loss": 0.0779, "num_input_tokens_seen": 29582880, "step": 43885 }, { "epoch": 1.0722400019544134, "grad_norm": 0.30322572588920593, "learning_rate": 1.9212655501152726e-06, "loss": 0.1816, "num_input_tokens_seen": 29586272, "step": 43890 }, { "epoch": 1.0723621527862606, "grad_norm": 0.1031859815120697, "learning_rate": 1.921232379393362e-06, "loss": 0.0386, "num_input_tokens_seen": 29589856, "step": 43895 }, { "epoch": 1.0724843036181075, "grad_norm": 1.8539291620254517, "learning_rate": 1.9211992019720015e-06, "loss": 0.0355, "num_input_tokens_seen": 29593824, "step": 43900 }, { "epoch": 1.0726064544499547, "grad_norm": 71.8545150756836, "learning_rate": 1.9211660178514326e-06, "loss": 0.0126, "num_input_tokens_seen": 29596960, "step": 43905 }, { "epoch": 1.072728605281802, "grad_norm": 20.38343620300293, "learning_rate": 1.921132827031897e-06, "loss": 0.1277, "num_input_tokens_seen": 29600416, "step": 43910 }, { "epoch": 1.072850756113649, "grad_norm": 0.04561031609773636, "learning_rate": 1.9210996295136356e-06, "loss": 0.0773, "num_input_tokens_seen": 29603616, "step": 43915 }, { "epoch": 1.0729729069454963, "grad_norm": 0.04822149500250816, "learning_rate": 1.92106642529689e-06, "loss": 0.0019, "num_input_tokens_seen": 29607328, "step": 43920 }, { "epoch": 1.0730950577773435, "grad_norm": 0.28394651412963867, "learning_rate": 1.9210332143819016e-06, "loss": 0.0476, "num_input_tokens_seen": 29610848, "step": 43925 }, { "epoch": 1.0732172086091907, "grad_norm": 1.6123360395431519, "learning_rate": 1.920999996768912e-06, "loss": 0.0321, "num_input_tokens_seen": 29613984, "step": 43930 }, { "epoch": 1.0733393594410379, "grad_norm": 0.0532742440700531, "learning_rate": 1.9209667724581623e-06, "loss": 0.047, "num_input_tokens_seen": 29617184, "step": 43935 }, { "epoch": 1.073461510272885, "grad_norm": 1.2695121765136719, "learning_rate": 1.9209335414498945e-06, "loss": 0.1575, "num_input_tokens_seen": 29620640, "step": 43940 }, { "epoch": 1.073583661104732, "grad_norm": 0.005534649360924959, "learning_rate": 1.9209003037443506e-06, "loss": 0.0857, "num_input_tokens_seen": 29623648, "step": 43945 }, { "epoch": 1.0737058119365792, "grad_norm": 16.748472213745117, "learning_rate": 1.920867059341772e-06, "loss": 0.1343, "num_input_tokens_seen": 29626784, "step": 43950 }, { "epoch": 1.0738279627684264, "grad_norm": 8.2855224609375, "learning_rate": 1.9208338082424006e-06, "loss": 0.0476, "num_input_tokens_seen": 29630176, "step": 43955 }, { "epoch": 1.0739501136002736, "grad_norm": 0.3709075152873993, "learning_rate": 1.920800550446478e-06, "loss": 0.0022, "num_input_tokens_seen": 29633312, "step": 43960 }, { "epoch": 1.0740722644321208, "grad_norm": 72.69107055664062, "learning_rate": 1.920767285954246e-06, "loss": 0.0819, "num_input_tokens_seen": 29636832, "step": 43965 }, { "epoch": 1.074194415263968, "grad_norm": 10.143033981323242, "learning_rate": 1.9207340147659465e-06, "loss": 0.0907, "num_input_tokens_seen": 29640416, "step": 43970 }, { "epoch": 1.0743165660958152, "grad_norm": 0.12058486044406891, "learning_rate": 1.9207007368818217e-06, "loss": 0.08, "num_input_tokens_seen": 29643680, "step": 43975 }, { "epoch": 1.0744387169276624, "grad_norm": 56.001800537109375, "learning_rate": 1.9206674523021135e-06, "loss": 0.1525, "num_input_tokens_seen": 29647328, "step": 43980 }, { "epoch": 1.0745608677595095, "grad_norm": 0.35678938031196594, "learning_rate": 1.9206341610270644e-06, "loss": 0.0012, "num_input_tokens_seen": 29651104, "step": 43985 }, { "epoch": 1.0746830185913565, "grad_norm": 0.42853277921676636, "learning_rate": 1.9206008630569157e-06, "loss": 0.059, "num_input_tokens_seen": 29654496, "step": 43990 }, { "epoch": 1.0748051694232037, "grad_norm": 0.038223832845687866, "learning_rate": 1.9205675583919096e-06, "loss": 0.1075, "num_input_tokens_seen": 29657760, "step": 43995 }, { "epoch": 1.0749273202550509, "grad_norm": 12.201236724853516, "learning_rate": 1.920534247032289e-06, "loss": 0.284, "num_input_tokens_seen": 29661024, "step": 44000 }, { "epoch": 1.075049471086898, "grad_norm": 17.112329483032227, "learning_rate": 1.9205009289782956e-06, "loss": 0.1003, "num_input_tokens_seen": 29663840, "step": 44005 }, { "epoch": 1.0751716219187453, "grad_norm": 8.583478927612305, "learning_rate": 1.9204676042301718e-06, "loss": 0.112, "num_input_tokens_seen": 29667488, "step": 44010 }, { "epoch": 1.0752937727505925, "grad_norm": 0.381719708442688, "learning_rate": 1.92043427278816e-06, "loss": 0.2006, "num_input_tokens_seen": 29671072, "step": 44015 }, { "epoch": 1.0754159235824396, "grad_norm": 0.16106979548931122, "learning_rate": 1.920400934652503e-06, "loss": 0.0107, "num_input_tokens_seen": 29674656, "step": 44020 }, { "epoch": 1.0755380744142868, "grad_norm": 102.35822296142578, "learning_rate": 1.9203675898234426e-06, "loss": 0.1733, "num_input_tokens_seen": 29678176, "step": 44025 }, { "epoch": 1.075660225246134, "grad_norm": 0.7088550925254822, "learning_rate": 1.9203342383012214e-06, "loss": 0.0602, "num_input_tokens_seen": 29681696, "step": 44030 }, { "epoch": 1.075782376077981, "grad_norm": 16.169422149658203, "learning_rate": 1.920300880086082e-06, "loss": 0.0779, "num_input_tokens_seen": 29684960, "step": 44035 }, { "epoch": 1.0759045269098282, "grad_norm": 51.24101638793945, "learning_rate": 1.9202675151782675e-06, "loss": 0.1817, "num_input_tokens_seen": 29688224, "step": 44040 }, { "epoch": 1.0760266777416754, "grad_norm": 0.2482772320508957, "learning_rate": 1.9202341435780197e-06, "loss": 0.0289, "num_input_tokens_seen": 29691616, "step": 44045 }, { "epoch": 1.0761488285735226, "grad_norm": 10.518967628479004, "learning_rate": 1.9202007652855822e-06, "loss": 0.0737, "num_input_tokens_seen": 29694688, "step": 44050 }, { "epoch": 1.0762709794053698, "grad_norm": 12.700284957885742, "learning_rate": 1.920167380301197e-06, "loss": 0.0577, "num_input_tokens_seen": 29697952, "step": 44055 }, { "epoch": 1.076393130237217, "grad_norm": 0.1491347998380661, "learning_rate": 1.920133988625107e-06, "loss": 0.0635, "num_input_tokens_seen": 29701152, "step": 44060 }, { "epoch": 1.0765152810690641, "grad_norm": 80.09822082519531, "learning_rate": 1.920100590257555e-06, "loss": 0.0122, "num_input_tokens_seen": 29704672, "step": 44065 }, { "epoch": 1.0766374319009113, "grad_norm": 0.10389073938131332, "learning_rate": 1.920067185198784e-06, "loss": 0.065, "num_input_tokens_seen": 29708192, "step": 44070 }, { "epoch": 1.0767595827327585, "grad_norm": 28.060272216796875, "learning_rate": 1.9200337734490374e-06, "loss": 0.0481, "num_input_tokens_seen": 29711264, "step": 44075 }, { "epoch": 1.0768817335646055, "grad_norm": 19.16193962097168, "learning_rate": 1.9200003550085575e-06, "loss": 0.1977, "num_input_tokens_seen": 29714400, "step": 44080 }, { "epoch": 1.0770038843964527, "grad_norm": 0.10936179757118225, "learning_rate": 1.919966929877587e-06, "loss": 0.0458, "num_input_tokens_seen": 29717920, "step": 44085 }, { "epoch": 1.0771260352282999, "grad_norm": 0.5304973125457764, "learning_rate": 1.9199334980563707e-06, "loss": 0.0403, "num_input_tokens_seen": 29721184, "step": 44090 }, { "epoch": 1.077248186060147, "grad_norm": 0.2360510379076004, "learning_rate": 1.91990005954515e-06, "loss": 0.0826, "num_input_tokens_seen": 29724832, "step": 44095 }, { "epoch": 1.0773703368919942, "grad_norm": 0.24685044586658478, "learning_rate": 1.919866614344169e-06, "loss": 0.0805, "num_input_tokens_seen": 29728416, "step": 44100 }, { "epoch": 1.0774924877238414, "grad_norm": 0.09603308141231537, "learning_rate": 1.9198331624536696e-06, "loss": 0.146, "num_input_tokens_seen": 29732064, "step": 44105 }, { "epoch": 1.0776146385556886, "grad_norm": 18.899932861328125, "learning_rate": 1.9197997038738967e-06, "loss": 0.1583, "num_input_tokens_seen": 29735840, "step": 44110 }, { "epoch": 1.0777367893875358, "grad_norm": 1.469382405281067, "learning_rate": 1.919766238605093e-06, "loss": 0.0657, "num_input_tokens_seen": 29739104, "step": 44115 }, { "epoch": 1.0778589402193828, "grad_norm": 73.46088409423828, "learning_rate": 1.9197327666475017e-06, "loss": 0.0568, "num_input_tokens_seen": 29742432, "step": 44120 }, { "epoch": 1.07798109105123, "grad_norm": 0.0979587659239769, "learning_rate": 1.9196992880013662e-06, "loss": 0.1157, "num_input_tokens_seen": 29745440, "step": 44125 }, { "epoch": 1.0781032418830772, "grad_norm": 0.204317107796669, "learning_rate": 1.9196658026669303e-06, "loss": 0.0797, "num_input_tokens_seen": 29748512, "step": 44130 }, { "epoch": 1.0782253927149243, "grad_norm": 29.698213577270508, "learning_rate": 1.9196323106444374e-06, "loss": 0.2125, "num_input_tokens_seen": 29751648, "step": 44135 }, { "epoch": 1.0783475435467715, "grad_norm": 0.14883090555667877, "learning_rate": 1.9195988119341306e-06, "loss": 0.0615, "num_input_tokens_seen": 29754592, "step": 44140 }, { "epoch": 1.0784696943786187, "grad_norm": 1.3855115175247192, "learning_rate": 1.9195653065362544e-06, "loss": 0.0466, "num_input_tokens_seen": 29757600, "step": 44145 }, { "epoch": 1.078591845210466, "grad_norm": 24.51190185546875, "learning_rate": 1.9195317944510517e-06, "loss": 0.036, "num_input_tokens_seen": 29761184, "step": 44150 }, { "epoch": 1.078713996042313, "grad_norm": 22.161216735839844, "learning_rate": 1.9194982756787662e-06, "loss": 0.1806, "num_input_tokens_seen": 29764768, "step": 44155 }, { "epoch": 1.0788361468741603, "grad_norm": 0.10817687213420868, "learning_rate": 1.9194647502196422e-06, "loss": 0.0684, "num_input_tokens_seen": 29768416, "step": 44160 }, { "epoch": 1.0789582977060075, "grad_norm": 29.709800720214844, "learning_rate": 1.9194312180739237e-06, "loss": 0.0755, "num_input_tokens_seen": 29771552, "step": 44165 }, { "epoch": 1.0790804485378545, "grad_norm": 11.85411262512207, "learning_rate": 1.9193976792418533e-06, "loss": 0.0485, "num_input_tokens_seen": 29775584, "step": 44170 }, { "epoch": 1.0792025993697016, "grad_norm": 0.3583597242832184, "learning_rate": 1.919364133723676e-06, "loss": 0.1085, "num_input_tokens_seen": 29778400, "step": 44175 }, { "epoch": 1.0793247502015488, "grad_norm": 0.35112762451171875, "learning_rate": 1.9193305815196355e-06, "loss": 0.0418, "num_input_tokens_seen": 29781728, "step": 44180 }, { "epoch": 1.079446901033396, "grad_norm": 14.021635055541992, "learning_rate": 1.9192970226299757e-06, "loss": 0.1246, "num_input_tokens_seen": 29785056, "step": 44185 }, { "epoch": 1.0795690518652432, "grad_norm": 0.19013722240924835, "learning_rate": 1.919263457054941e-06, "loss": 0.0421, "num_input_tokens_seen": 29788256, "step": 44190 }, { "epoch": 1.0796912026970904, "grad_norm": 2.3619987964630127, "learning_rate": 1.9192298847947746e-06, "loss": 0.0286, "num_input_tokens_seen": 29791392, "step": 44195 }, { "epoch": 1.0798133535289376, "grad_norm": 2.7994532585144043, "learning_rate": 1.9191963058497212e-06, "loss": 0.004, "num_input_tokens_seen": 29794336, "step": 44200 }, { "epoch": 1.0799355043607848, "grad_norm": 0.09319982677698135, "learning_rate": 1.9191627202200258e-06, "loss": 0.0518, "num_input_tokens_seen": 29797728, "step": 44205 }, { "epoch": 1.0800576551926317, "grad_norm": 1.3856775760650635, "learning_rate": 1.9191291279059312e-06, "loss": 0.0466, "num_input_tokens_seen": 29801056, "step": 44210 }, { "epoch": 1.080179806024479, "grad_norm": 0.10315916687250137, "learning_rate": 1.9190955289076825e-06, "loss": 0.1837, "num_input_tokens_seen": 29804512, "step": 44215 }, { "epoch": 1.0803019568563261, "grad_norm": 0.18177196383476257, "learning_rate": 1.9190619232255242e-06, "loss": 0.1528, "num_input_tokens_seen": 29808032, "step": 44220 }, { "epoch": 1.0804241076881733, "grad_norm": 16.280742645263672, "learning_rate": 1.9190283108597e-06, "loss": 0.1278, "num_input_tokens_seen": 29811296, "step": 44225 }, { "epoch": 1.0805462585200205, "grad_norm": 0.3395654559135437, "learning_rate": 1.918994691810455e-06, "loss": 0.0481, "num_input_tokens_seen": 29815904, "step": 44230 }, { "epoch": 1.0806684093518677, "grad_norm": 28.75006103515625, "learning_rate": 1.9189610660780335e-06, "loss": 0.0786, "num_input_tokens_seen": 29819040, "step": 44235 }, { "epoch": 1.0807905601837149, "grad_norm": 0.04071475565433502, "learning_rate": 1.9189274336626795e-06, "loss": 0.0959, "num_input_tokens_seen": 29822496, "step": 44240 }, { "epoch": 1.080912711015562, "grad_norm": 37.24461364746094, "learning_rate": 1.9188937945646386e-06, "loss": 0.095, "num_input_tokens_seen": 29825696, "step": 44245 }, { "epoch": 1.0810348618474093, "grad_norm": 13.556794166564941, "learning_rate": 1.9188601487841545e-06, "loss": 0.076, "num_input_tokens_seen": 29829024, "step": 44250 }, { "epoch": 1.0811570126792565, "grad_norm": 0.16886696219444275, "learning_rate": 1.9188264963214724e-06, "loss": 0.0274, "num_input_tokens_seen": 29832480, "step": 44255 }, { "epoch": 1.0812791635111034, "grad_norm": 1.2870328426361084, "learning_rate": 1.918792837176837e-06, "loss": 0.0585, "num_input_tokens_seen": 29835744, "step": 44260 }, { "epoch": 1.0814013143429506, "grad_norm": 1.8540476560592651, "learning_rate": 1.9187591713504925e-06, "loss": 0.0317, "num_input_tokens_seen": 29838752, "step": 44265 }, { "epoch": 1.0815234651747978, "grad_norm": 20.007299423217773, "learning_rate": 1.9187254988426846e-06, "loss": 0.0282, "num_input_tokens_seen": 29842080, "step": 44270 }, { "epoch": 1.081645616006645, "grad_norm": 1.7691351175308228, "learning_rate": 1.918691819653658e-06, "loss": 0.102, "num_input_tokens_seen": 29844832, "step": 44275 }, { "epoch": 1.0817677668384922, "grad_norm": 27.982473373413086, "learning_rate": 1.9186581337836567e-06, "loss": 0.1044, "num_input_tokens_seen": 29848096, "step": 44280 }, { "epoch": 1.0818899176703394, "grad_norm": 0.024598833173513412, "learning_rate": 1.918624441232927e-06, "loss": 0.0015, "num_input_tokens_seen": 29851552, "step": 44285 }, { "epoch": 1.0820120685021866, "grad_norm": 89.78707885742188, "learning_rate": 1.918590742001713e-06, "loss": 0.1048, "num_input_tokens_seen": 29855264, "step": 44290 }, { "epoch": 1.0821342193340338, "grad_norm": 0.07963894307613373, "learning_rate": 1.91855703609026e-06, "loss": 0.1001, "num_input_tokens_seen": 29858720, "step": 44295 }, { "epoch": 1.0822563701658807, "grad_norm": 0.34227511286735535, "learning_rate": 1.918523323498813e-06, "loss": 0.0587, "num_input_tokens_seen": 29862112, "step": 44300 }, { "epoch": 1.082378520997728, "grad_norm": 0.024144131690263748, "learning_rate": 1.9184896042276176e-06, "loss": 0.0833, "num_input_tokens_seen": 29865376, "step": 44305 }, { "epoch": 1.082500671829575, "grad_norm": 0.03241460397839546, "learning_rate": 1.9184558782769185e-06, "loss": 0.0984, "num_input_tokens_seen": 29868832, "step": 44310 }, { "epoch": 1.0826228226614223, "grad_norm": 0.9405149221420288, "learning_rate": 1.9184221456469615e-06, "loss": 0.0488, "num_input_tokens_seen": 29872096, "step": 44315 }, { "epoch": 1.0827449734932695, "grad_norm": 0.9719510078430176, "learning_rate": 1.9183884063379918e-06, "loss": 0.0499, "num_input_tokens_seen": 29875232, "step": 44320 }, { "epoch": 1.0828671243251167, "grad_norm": 25.23493766784668, "learning_rate": 1.9183546603502545e-06, "loss": 0.0383, "num_input_tokens_seen": 29878880, "step": 44325 }, { "epoch": 1.0829892751569639, "grad_norm": 0.5960846543312073, "learning_rate": 1.9183209076839944e-06, "loss": 0.0354, "num_input_tokens_seen": 29882400, "step": 44330 }, { "epoch": 1.083111425988811, "grad_norm": 0.018635839223861694, "learning_rate": 1.9182871483394585e-06, "loss": 0.0176, "num_input_tokens_seen": 29885920, "step": 44335 }, { "epoch": 1.0832335768206582, "grad_norm": 24.572978973388672, "learning_rate": 1.918253382316891e-06, "loss": 0.0967, "num_input_tokens_seen": 29889504, "step": 44340 }, { "epoch": 1.0833557276525054, "grad_norm": 0.4108743965625763, "learning_rate": 1.9182196096165383e-06, "loss": 0.0563, "num_input_tokens_seen": 29892896, "step": 44345 }, { "epoch": 1.0834778784843524, "grad_norm": 0.16322478652000427, "learning_rate": 1.9181858302386454e-06, "loss": 0.1665, "num_input_tokens_seen": 29896224, "step": 44350 }, { "epoch": 1.0836000293161996, "grad_norm": 28.877195358276367, "learning_rate": 1.9181520441834582e-06, "loss": 0.1475, "num_input_tokens_seen": 29899296, "step": 44355 }, { "epoch": 1.0837221801480468, "grad_norm": 13.3076171875, "learning_rate": 1.9181182514512222e-06, "loss": 0.1069, "num_input_tokens_seen": 29902368, "step": 44360 }, { "epoch": 1.083844330979894, "grad_norm": 0.08094339072704315, "learning_rate": 1.9180844520421838e-06, "loss": 0.0788, "num_input_tokens_seen": 29905888, "step": 44365 }, { "epoch": 1.0839664818117412, "grad_norm": 82.42140197753906, "learning_rate": 1.918050645956588e-06, "loss": 0.0217, "num_input_tokens_seen": 29909344, "step": 44370 }, { "epoch": 1.0840886326435883, "grad_norm": 8.881879806518555, "learning_rate": 1.918016833194681e-06, "loss": 0.0903, "num_input_tokens_seen": 29912480, "step": 44375 }, { "epoch": 1.0842107834754355, "grad_norm": 40.43014144897461, "learning_rate": 1.917983013756709e-06, "loss": 0.2413, "num_input_tokens_seen": 29916256, "step": 44380 }, { "epoch": 1.0843329343072827, "grad_norm": 1.9137089252471924, "learning_rate": 1.917949187642917e-06, "loss": 0.0461, "num_input_tokens_seen": 29919456, "step": 44385 }, { "epoch": 1.0844550851391297, "grad_norm": 21.525697708129883, "learning_rate": 1.917915354853552e-06, "loss": 0.2388, "num_input_tokens_seen": 29923232, "step": 44390 }, { "epoch": 1.0845772359709769, "grad_norm": 0.4614849090576172, "learning_rate": 1.9178815153888595e-06, "loss": 0.1065, "num_input_tokens_seen": 29926240, "step": 44395 }, { "epoch": 1.084699386802824, "grad_norm": 0.8693162798881531, "learning_rate": 1.917847669249086e-06, "loss": 0.0584, "num_input_tokens_seen": 29929696, "step": 44400 }, { "epoch": 1.0848215376346713, "grad_norm": 0.8169465661048889, "learning_rate": 1.917813816434477e-06, "loss": 0.088, "num_input_tokens_seen": 29933280, "step": 44405 }, { "epoch": 1.0849436884665185, "grad_norm": 0.28742265701293945, "learning_rate": 1.9177799569452793e-06, "loss": 0.1019, "num_input_tokens_seen": 29936608, "step": 44410 }, { "epoch": 1.0850658392983656, "grad_norm": 0.9252832531929016, "learning_rate": 1.917746090781739e-06, "loss": 0.068, "num_input_tokens_seen": 29940384, "step": 44415 }, { "epoch": 1.0851879901302128, "grad_norm": 25.838499069213867, "learning_rate": 1.917712217944102e-06, "loss": 0.0874, "num_input_tokens_seen": 29943840, "step": 44420 }, { "epoch": 1.08531014096206, "grad_norm": 0.21891480684280396, "learning_rate": 1.917678338432615e-06, "loss": 0.0362, "num_input_tokens_seen": 29947168, "step": 44425 }, { "epoch": 1.0854322917939072, "grad_norm": 39.00025177001953, "learning_rate": 1.917644452247524e-06, "loss": 0.1176, "num_input_tokens_seen": 29950432, "step": 44430 }, { "epoch": 1.0855544426257542, "grad_norm": 0.486000120639801, "learning_rate": 1.9176105593890765e-06, "loss": 0.0287, "num_input_tokens_seen": 29954208, "step": 44435 }, { "epoch": 1.0856765934576014, "grad_norm": 25.918472290039062, "learning_rate": 1.9175766598575177e-06, "loss": 0.1624, "num_input_tokens_seen": 29957280, "step": 44440 }, { "epoch": 1.0857987442894486, "grad_norm": 16.659223556518555, "learning_rate": 1.917542753653095e-06, "loss": 0.128, "num_input_tokens_seen": 29960416, "step": 44445 }, { "epoch": 1.0859208951212957, "grad_norm": 18.979354858398438, "learning_rate": 1.9175088407760543e-06, "loss": 0.0584, "num_input_tokens_seen": 29963808, "step": 44450 }, { "epoch": 1.086043045953143, "grad_norm": 51.75289535522461, "learning_rate": 1.917474921226642e-06, "loss": 0.1099, "num_input_tokens_seen": 29967072, "step": 44455 }, { "epoch": 1.0861651967849901, "grad_norm": 1.5100349187850952, "learning_rate": 1.917440995005106e-06, "loss": 0.0261, "num_input_tokens_seen": 29970464, "step": 44460 }, { "epoch": 1.0862873476168373, "grad_norm": 4.650681018829346, "learning_rate": 1.9174070621116924e-06, "loss": 0.1018, "num_input_tokens_seen": 29974560, "step": 44465 }, { "epoch": 1.0864094984486845, "grad_norm": 0.549470067024231, "learning_rate": 1.9173731225466477e-06, "loss": 0.0687, "num_input_tokens_seen": 29977504, "step": 44470 }, { "epoch": 1.0865316492805317, "grad_norm": 15.055164337158203, "learning_rate": 1.917339176310219e-06, "loss": 0.1373, "num_input_tokens_seen": 29980896, "step": 44475 }, { "epoch": 1.0866538001123787, "grad_norm": 0.01459528598934412, "learning_rate": 1.917305223402653e-06, "loss": 0.0765, "num_input_tokens_seen": 29984160, "step": 44480 }, { "epoch": 1.0867759509442259, "grad_norm": 0.21965134143829346, "learning_rate": 1.9172712638241964e-06, "loss": 0.0376, "num_input_tokens_seen": 29987424, "step": 44485 }, { "epoch": 1.086898101776073, "grad_norm": 0.47681552171707153, "learning_rate": 1.917237297575097e-06, "loss": 0.0685, "num_input_tokens_seen": 29990496, "step": 44490 }, { "epoch": 1.0870202526079202, "grad_norm": 0.1720905601978302, "learning_rate": 1.9172033246556008e-06, "loss": 0.0267, "num_input_tokens_seen": 29993760, "step": 44495 }, { "epoch": 1.0871424034397674, "grad_norm": 8.774606704711914, "learning_rate": 1.9171693450659556e-06, "loss": 0.0543, "num_input_tokens_seen": 29997280, "step": 44500 }, { "epoch": 1.0872645542716146, "grad_norm": 0.053868431597948074, "learning_rate": 1.917135358806408e-06, "loss": 0.0768, "num_input_tokens_seen": 30000480, "step": 44505 }, { "epoch": 1.0873867051034618, "grad_norm": 16.72242546081543, "learning_rate": 1.9171013658772055e-06, "loss": 0.1496, "num_input_tokens_seen": 30003872, "step": 44510 }, { "epoch": 1.087508855935309, "grad_norm": 0.11925757676362991, "learning_rate": 1.9170673662785953e-06, "loss": 0.0012, "num_input_tokens_seen": 30007264, "step": 44515 }, { "epoch": 1.0876310067671562, "grad_norm": 0.10318820923566818, "learning_rate": 1.9170333600108246e-06, "loss": 0.1859, "num_input_tokens_seen": 30011104, "step": 44520 }, { "epoch": 1.0877531575990032, "grad_norm": 0.32234182953834534, "learning_rate": 1.9169993470741407e-06, "loss": 0.0786, "num_input_tokens_seen": 30014368, "step": 44525 }, { "epoch": 1.0878753084308503, "grad_norm": 37.324275970458984, "learning_rate": 1.9169653274687905e-06, "loss": 0.1248, "num_input_tokens_seen": 30017952, "step": 44530 }, { "epoch": 1.0879974592626975, "grad_norm": 5.065712928771973, "learning_rate": 1.9169313011950223e-06, "loss": 0.0366, "num_input_tokens_seen": 30021408, "step": 44535 }, { "epoch": 1.0881196100945447, "grad_norm": 0.061776500195264816, "learning_rate": 1.9168972682530825e-06, "loss": 0.0697, "num_input_tokens_seen": 30025056, "step": 44540 }, { "epoch": 1.088241760926392, "grad_norm": 0.0976782888174057, "learning_rate": 1.9168632286432193e-06, "loss": 0.0544, "num_input_tokens_seen": 30028192, "step": 44545 }, { "epoch": 1.088363911758239, "grad_norm": 27.349529266357422, "learning_rate": 1.9168291823656804e-06, "loss": 0.1594, "num_input_tokens_seen": 30031200, "step": 44550 }, { "epoch": 1.0884860625900863, "grad_norm": 1.1122671365737915, "learning_rate": 1.916795129420713e-06, "loss": 0.0597, "num_input_tokens_seen": 30034208, "step": 44555 }, { "epoch": 1.0886082134219335, "grad_norm": 0.43841981887817383, "learning_rate": 1.9167610698085647e-06, "loss": 0.0384, "num_input_tokens_seen": 30037792, "step": 44560 }, { "epoch": 1.0887303642537807, "grad_norm": 38.47297286987305, "learning_rate": 1.9167270035294833e-06, "loss": 0.1703, "num_input_tokens_seen": 30041184, "step": 44565 }, { "epoch": 1.0888525150856276, "grad_norm": 107.69474029541016, "learning_rate": 1.9166929305837164e-06, "loss": 0.0229, "num_input_tokens_seen": 30044448, "step": 44570 }, { "epoch": 1.0889746659174748, "grad_norm": 60.87364959716797, "learning_rate": 1.9166588509715123e-06, "loss": 0.1048, "num_input_tokens_seen": 30047904, "step": 44575 }, { "epoch": 1.089096816749322, "grad_norm": 1.1863831281661987, "learning_rate": 1.916624764693118e-06, "loss": 0.062, "num_input_tokens_seen": 30051040, "step": 44580 }, { "epoch": 1.0892189675811692, "grad_norm": 0.0778331458568573, "learning_rate": 1.9165906717487824e-06, "loss": 0.001, "num_input_tokens_seen": 30054176, "step": 44585 }, { "epoch": 1.0893411184130164, "grad_norm": 14.185705184936523, "learning_rate": 1.916556572138753e-06, "loss": 0.1293, "num_input_tokens_seen": 30057888, "step": 44590 }, { "epoch": 1.0894632692448636, "grad_norm": 38.385772705078125, "learning_rate": 1.916522465863277e-06, "loss": 0.1436, "num_input_tokens_seen": 30061088, "step": 44595 }, { "epoch": 1.0895854200767108, "grad_norm": 19.040313720703125, "learning_rate": 1.916488352922604e-06, "loss": 0.1403, "num_input_tokens_seen": 30064480, "step": 44600 }, { "epoch": 1.089707570908558, "grad_norm": 44.07444763183594, "learning_rate": 1.9164542333169806e-06, "loss": 0.1873, "num_input_tokens_seen": 30067552, "step": 44605 }, { "epoch": 1.0898297217404052, "grad_norm": 0.06985550373792648, "learning_rate": 1.9164201070466556e-06, "loss": 0.0671, "num_input_tokens_seen": 30070816, "step": 44610 }, { "epoch": 1.0899518725722521, "grad_norm": 0.9508460760116577, "learning_rate": 1.916385974111877e-06, "loss": 0.0633, "num_input_tokens_seen": 30074400, "step": 44615 }, { "epoch": 1.0900740234040993, "grad_norm": 0.3236066699028015, "learning_rate": 1.9163518345128937e-06, "loss": 0.0559, "num_input_tokens_seen": 30077728, "step": 44620 }, { "epoch": 1.0901961742359465, "grad_norm": 0.3575074076652527, "learning_rate": 1.9163176882499526e-06, "loss": 0.0022, "num_input_tokens_seen": 30081376, "step": 44625 }, { "epoch": 1.0903183250677937, "grad_norm": 4.681931972503662, "learning_rate": 1.9162835353233034e-06, "loss": 0.0444, "num_input_tokens_seen": 30084448, "step": 44630 }, { "epoch": 1.0904404758996409, "grad_norm": 100.06119537353516, "learning_rate": 1.9162493757331934e-06, "loss": 0.1226, "num_input_tokens_seen": 30088096, "step": 44635 }, { "epoch": 1.090562626731488, "grad_norm": 18.056100845336914, "learning_rate": 1.9162152094798717e-06, "loss": 0.0867, "num_input_tokens_seen": 30091616, "step": 44640 }, { "epoch": 1.0906847775633353, "grad_norm": 42.22257614135742, "learning_rate": 1.9161810365635867e-06, "loss": 0.1927, "num_input_tokens_seen": 30094816, "step": 44645 }, { "epoch": 1.0908069283951825, "grad_norm": 0.09566375613212585, "learning_rate": 1.9161468569845867e-06, "loss": 0.1116, "num_input_tokens_seen": 30097952, "step": 44650 }, { "epoch": 1.0909290792270294, "grad_norm": 0.2680375576019287, "learning_rate": 1.91611267074312e-06, "loss": 0.0607, "num_input_tokens_seen": 30101728, "step": 44655 }, { "epoch": 1.0910512300588766, "grad_norm": 8.587791442871094, "learning_rate": 1.9160784778394362e-06, "loss": 0.1164, "num_input_tokens_seen": 30105248, "step": 44660 }, { "epoch": 1.0911733808907238, "grad_norm": 48.21713638305664, "learning_rate": 1.916044278273783e-06, "loss": 0.012, "num_input_tokens_seen": 30108640, "step": 44665 }, { "epoch": 1.091295531722571, "grad_norm": 10.47347354888916, "learning_rate": 1.916010072046409e-06, "loss": 0.0913, "num_input_tokens_seen": 30112160, "step": 44670 }, { "epoch": 1.0914176825544182, "grad_norm": 7.425508499145508, "learning_rate": 1.915975859157564e-06, "loss": 0.1595, "num_input_tokens_seen": 30115552, "step": 44675 }, { "epoch": 1.0915398333862654, "grad_norm": 22.443424224853516, "learning_rate": 1.915941639607496e-06, "loss": 0.0373, "num_input_tokens_seen": 30118688, "step": 44680 }, { "epoch": 1.0916619842181126, "grad_norm": 0.346167653799057, "learning_rate": 1.915907413396454e-06, "loss": 0.0661, "num_input_tokens_seen": 30121888, "step": 44685 }, { "epoch": 1.0917841350499597, "grad_norm": 2.4050328731536865, "learning_rate": 1.915873180524687e-06, "loss": 0.0781, "num_input_tokens_seen": 30125280, "step": 44690 }, { "epoch": 1.091906285881807, "grad_norm": 0.14322443306446075, "learning_rate": 1.9158389409924437e-06, "loss": 0.0918, "num_input_tokens_seen": 30128992, "step": 44695 }, { "epoch": 1.0920284367136541, "grad_norm": 0.1610415279865265, "learning_rate": 1.9158046947999737e-06, "loss": 0.0144, "num_input_tokens_seen": 30132448, "step": 44700 }, { "epoch": 1.092150587545501, "grad_norm": 0.2604481279850006, "learning_rate": 1.9157704419475255e-06, "loss": 0.0917, "num_input_tokens_seen": 30135712, "step": 44705 }, { "epoch": 1.0922727383773483, "grad_norm": 1.2483242750167847, "learning_rate": 1.915736182435348e-06, "loss": 0.0669, "num_input_tokens_seen": 30139296, "step": 44710 }, { "epoch": 1.0923948892091955, "grad_norm": 24.37545394897461, "learning_rate": 1.9157019162636906e-06, "loss": 0.0049, "num_input_tokens_seen": 30142688, "step": 44715 }, { "epoch": 1.0925170400410427, "grad_norm": 1.0107009410858154, "learning_rate": 1.915667643432803e-06, "loss": 0.1349, "num_input_tokens_seen": 30146592, "step": 44720 }, { "epoch": 1.0926391908728899, "grad_norm": 12.480439186096191, "learning_rate": 1.915633363942934e-06, "loss": 0.2515, "num_input_tokens_seen": 30149984, "step": 44725 }, { "epoch": 1.092761341704737, "grad_norm": 2.2978947162628174, "learning_rate": 1.9155990777943325e-06, "loss": 0.0047, "num_input_tokens_seen": 30154144, "step": 44730 }, { "epoch": 1.0928834925365842, "grad_norm": 0.31146731972694397, "learning_rate": 1.9155647849872487e-06, "loss": 0.1377, "num_input_tokens_seen": 30157984, "step": 44735 }, { "epoch": 1.0930056433684314, "grad_norm": 66.38398742675781, "learning_rate": 1.9155304855219316e-06, "loss": 0.2204, "num_input_tokens_seen": 30161056, "step": 44740 }, { "epoch": 1.0931277942002784, "grad_norm": 1.1195906400680542, "learning_rate": 1.91549617939863e-06, "loss": 0.0376, "num_input_tokens_seen": 30164704, "step": 44745 }, { "epoch": 1.0932499450321256, "grad_norm": 2.009263753890991, "learning_rate": 1.9154618666175942e-06, "loss": 0.0082, "num_input_tokens_seen": 30168352, "step": 44750 }, { "epoch": 1.0933720958639728, "grad_norm": 0.39526981115341187, "learning_rate": 1.9154275471790733e-06, "loss": 0.0475, "num_input_tokens_seen": 30171808, "step": 44755 }, { "epoch": 1.09349424669582, "grad_norm": 0.5147704482078552, "learning_rate": 1.9153932210833173e-06, "loss": 0.1045, "num_input_tokens_seen": 30175392, "step": 44760 }, { "epoch": 1.0936163975276672, "grad_norm": 2.1972246170043945, "learning_rate": 1.9153588883305756e-06, "loss": 0.1195, "num_input_tokens_seen": 30178720, "step": 44765 }, { "epoch": 1.0937385483595143, "grad_norm": 0.09867444634437561, "learning_rate": 1.9153245489210977e-06, "loss": 0.0416, "num_input_tokens_seen": 30181856, "step": 44770 }, { "epoch": 1.0938606991913615, "grad_norm": 0.3307472765445709, "learning_rate": 1.9152902028551335e-06, "loss": 0.0724, "num_input_tokens_seen": 30184992, "step": 44775 }, { "epoch": 1.0939828500232087, "grad_norm": 51.22042465209961, "learning_rate": 1.915255850132933e-06, "loss": 0.0851, "num_input_tokens_seen": 30188512, "step": 44780 }, { "epoch": 1.094105000855056, "grad_norm": 41.89115524291992, "learning_rate": 1.915221490754746e-06, "loss": 0.059, "num_input_tokens_seen": 30191584, "step": 44785 }, { "epoch": 1.094227151686903, "grad_norm": 7.868800640106201, "learning_rate": 1.9151871247208214e-06, "loss": 0.1301, "num_input_tokens_seen": 30194592, "step": 44790 }, { "epoch": 1.09434930251875, "grad_norm": 9.530349731445312, "learning_rate": 1.9151527520314105e-06, "loss": 0.2542, "num_input_tokens_seen": 30198560, "step": 44795 }, { "epoch": 1.0944714533505973, "grad_norm": 1.1136250495910645, "learning_rate": 1.9151183726867623e-06, "loss": 0.0945, "num_input_tokens_seen": 30201824, "step": 44800 }, { "epoch": 1.0945936041824444, "grad_norm": 10.920148849487305, "learning_rate": 1.9150839866871273e-06, "loss": 0.0295, "num_input_tokens_seen": 30205408, "step": 44805 }, { "epoch": 1.0947157550142916, "grad_norm": 2.137712240219116, "learning_rate": 1.9150495940327556e-06, "loss": 0.0894, "num_input_tokens_seen": 30208736, "step": 44810 }, { "epoch": 1.0948379058461388, "grad_norm": 41.97331619262695, "learning_rate": 1.915015194723897e-06, "loss": 0.1374, "num_input_tokens_seen": 30211872, "step": 44815 }, { "epoch": 1.094960056677986, "grad_norm": 17.44566535949707, "learning_rate": 1.9149807887608012e-06, "loss": 0.0371, "num_input_tokens_seen": 30215200, "step": 44820 }, { "epoch": 1.0950822075098332, "grad_norm": 0.21711544692516327, "learning_rate": 1.9149463761437196e-06, "loss": 0.1393, "num_input_tokens_seen": 30218272, "step": 44825 }, { "epoch": 1.0952043583416804, "grad_norm": 0.7705076336860657, "learning_rate": 1.914911956872902e-06, "loss": 0.0718, "num_input_tokens_seen": 30221600, "step": 44830 }, { "epoch": 1.0953265091735274, "grad_norm": 0.15149593353271484, "learning_rate": 1.9148775309485982e-06, "loss": 0.0227, "num_input_tokens_seen": 30225120, "step": 44835 }, { "epoch": 1.0954486600053746, "grad_norm": 0.34287703037261963, "learning_rate": 1.914843098371059e-06, "loss": 0.0023, "num_input_tokens_seen": 30228576, "step": 44840 }, { "epoch": 1.0955708108372217, "grad_norm": 0.2694163918495178, "learning_rate": 1.914808659140535e-06, "loss": 0.0347, "num_input_tokens_seen": 30232224, "step": 44845 }, { "epoch": 1.095692961669069, "grad_norm": 25.289270401000977, "learning_rate": 1.9147742132572763e-06, "loss": 0.0849, "num_input_tokens_seen": 30235296, "step": 44850 }, { "epoch": 1.0958151125009161, "grad_norm": 38.71376037597656, "learning_rate": 1.914739760721533e-06, "loss": 0.0941, "num_input_tokens_seen": 30238432, "step": 44855 }, { "epoch": 1.0959372633327633, "grad_norm": 19.535490036010742, "learning_rate": 1.9147053015335568e-06, "loss": 0.031, "num_input_tokens_seen": 30242592, "step": 44860 }, { "epoch": 1.0960594141646105, "grad_norm": 29.093135833740234, "learning_rate": 1.9146708356935974e-06, "loss": 0.1462, "num_input_tokens_seen": 30245728, "step": 44865 }, { "epoch": 1.0961815649964577, "grad_norm": 0.1613461971282959, "learning_rate": 1.9146363632019053e-06, "loss": 0.1299, "num_input_tokens_seen": 30249376, "step": 44870 }, { "epoch": 1.0963037158283049, "grad_norm": 0.3770635426044464, "learning_rate": 1.914601884058732e-06, "loss": 0.1636, "num_input_tokens_seen": 30252640, "step": 44875 }, { "epoch": 1.096425866660152, "grad_norm": 0.2593759298324585, "learning_rate": 1.9145673982643276e-06, "loss": 0.003, "num_input_tokens_seen": 30256288, "step": 44880 }, { "epoch": 1.096548017491999, "grad_norm": 0.16447781026363373, "learning_rate": 1.914532905818943e-06, "loss": 0.0457, "num_input_tokens_seen": 30259296, "step": 44885 }, { "epoch": 1.0966701683238462, "grad_norm": 0.06459282338619232, "learning_rate": 1.914498406722829e-06, "loss": 0.1412, "num_input_tokens_seen": 30262624, "step": 44890 }, { "epoch": 1.0967923191556934, "grad_norm": 35.29350280761719, "learning_rate": 1.914463900976237e-06, "loss": 0.1249, "num_input_tokens_seen": 30265760, "step": 44895 }, { "epoch": 1.0969144699875406, "grad_norm": 52.1586799621582, "learning_rate": 1.9144293885794177e-06, "loss": 0.164, "num_input_tokens_seen": 30268960, "step": 44900 }, { "epoch": 1.0970366208193878, "grad_norm": 0.08863640576601028, "learning_rate": 1.9143948695326217e-06, "loss": 0.0011, "num_input_tokens_seen": 30272480, "step": 44905 }, { "epoch": 1.097158771651235, "grad_norm": 1.0056798458099365, "learning_rate": 1.9143603438361e-06, "loss": 0.0457, "num_input_tokens_seen": 30275488, "step": 44910 }, { "epoch": 1.0972809224830822, "grad_norm": 0.2724807560443878, "learning_rate": 1.914325811490104e-06, "loss": 0.0966, "num_input_tokens_seen": 30278816, "step": 44915 }, { "epoch": 1.0974030733149294, "grad_norm": 13.868060111999512, "learning_rate": 1.914291272494885e-06, "loss": 0.1837, "num_input_tokens_seen": 30282016, "step": 44920 }, { "epoch": 1.0975252241467763, "grad_norm": 0.04919269308447838, "learning_rate": 1.914256726850694e-06, "loss": 0.0419, "num_input_tokens_seen": 30285344, "step": 44925 }, { "epoch": 1.0976473749786235, "grad_norm": 0.14486579596996307, "learning_rate": 1.914222174557782e-06, "loss": 0.1165, "num_input_tokens_seen": 30288608, "step": 44930 }, { "epoch": 1.0977695258104707, "grad_norm": 16.598464965820312, "learning_rate": 1.9141876156164006e-06, "loss": 0.1182, "num_input_tokens_seen": 30292000, "step": 44935 }, { "epoch": 1.097891676642318, "grad_norm": 0.25447985529899597, "learning_rate": 1.914153050026801e-06, "loss": 0.0429, "num_input_tokens_seen": 30295712, "step": 44940 }, { "epoch": 1.098013827474165, "grad_norm": 1.241197109222412, "learning_rate": 1.914118477789234e-06, "loss": 0.085, "num_input_tokens_seen": 30299296, "step": 44945 }, { "epoch": 1.0981359783060123, "grad_norm": 15.943071365356445, "learning_rate": 1.914083898903952e-06, "loss": 0.0929, "num_input_tokens_seen": 30303136, "step": 44950 }, { "epoch": 1.0982581291378595, "grad_norm": 0.248790442943573, "learning_rate": 1.914049313371206e-06, "loss": 0.0196, "num_input_tokens_seen": 30306464, "step": 44955 }, { "epoch": 1.0983802799697067, "grad_norm": 0.08668647706508636, "learning_rate": 1.914014721191248e-06, "loss": 0.0349, "num_input_tokens_seen": 30309728, "step": 44960 }, { "epoch": 1.0985024308015539, "grad_norm": 10.536722183227539, "learning_rate": 1.9139801223643283e-06, "loss": 0.1206, "num_input_tokens_seen": 30313056, "step": 44965 }, { "epoch": 1.0986245816334008, "grad_norm": 0.3854491114616394, "learning_rate": 1.9139455168907e-06, "loss": 0.0054, "num_input_tokens_seen": 30316384, "step": 44970 }, { "epoch": 1.098746732465248, "grad_norm": 101.83487701416016, "learning_rate": 1.9139109047706134e-06, "loss": 0.1733, "num_input_tokens_seen": 30319776, "step": 44975 }, { "epoch": 1.0988688832970952, "grad_norm": 0.9956525564193726, "learning_rate": 1.9138762860043213e-06, "loss": 0.1076, "num_input_tokens_seen": 30322912, "step": 44980 }, { "epoch": 1.0989910341289424, "grad_norm": 19.65517234802246, "learning_rate": 1.913841660592075e-06, "loss": 0.0892, "num_input_tokens_seen": 30325984, "step": 44985 }, { "epoch": 1.0991131849607896, "grad_norm": 9.605228424072266, "learning_rate": 1.913807028534126e-06, "loss": 0.2093, "num_input_tokens_seen": 30329120, "step": 44990 }, { "epoch": 1.0992353357926368, "grad_norm": 0.535914957523346, "learning_rate": 1.9137723898307275e-06, "loss": 0.0831, "num_input_tokens_seen": 30332256, "step": 44995 }, { "epoch": 1.099357486624484, "grad_norm": 8.649102210998535, "learning_rate": 1.9137377444821296e-06, "loss": 0.0393, "num_input_tokens_seen": 30335200, "step": 45000 }, { "epoch": 1.0994796374563311, "grad_norm": 0.646960437297821, "learning_rate": 1.913703092488585e-06, "loss": 0.0218, "num_input_tokens_seen": 30338656, "step": 45005 }, { "epoch": 1.0996017882881783, "grad_norm": 0.20934827625751495, "learning_rate": 1.9136684338503463e-06, "loss": 0.0289, "num_input_tokens_seen": 30341920, "step": 45010 }, { "epoch": 1.0997239391200253, "grad_norm": 0.4389401972293854, "learning_rate": 1.9136337685676644e-06, "loss": 0.0406, "num_input_tokens_seen": 30345248, "step": 45015 }, { "epoch": 1.0998460899518725, "grad_norm": 0.13102486729621887, "learning_rate": 1.9135990966407926e-06, "loss": 0.0512, "num_input_tokens_seen": 30350624, "step": 45020 }, { "epoch": 1.0999682407837197, "grad_norm": 27.58184814453125, "learning_rate": 1.913564418069982e-06, "loss": 0.0966, "num_input_tokens_seen": 30354336, "step": 45025 }, { "epoch": 1.1000903916155669, "grad_norm": 0.514782726764679, "learning_rate": 1.9135297328554853e-06, "loss": 0.0896, "num_input_tokens_seen": 30357344, "step": 45030 }, { "epoch": 1.100212542447414, "grad_norm": 16.277013778686523, "learning_rate": 1.9134950409975547e-06, "loss": 0.0729, "num_input_tokens_seen": 30360608, "step": 45035 }, { "epoch": 1.1003346932792613, "grad_norm": 0.12448076903820038, "learning_rate": 1.9134603424964425e-06, "loss": 0.0882, "num_input_tokens_seen": 30364000, "step": 45040 }, { "epoch": 1.1004568441111084, "grad_norm": 15.496138572692871, "learning_rate": 1.9134256373524008e-06, "loss": 0.2351, "num_input_tokens_seen": 30367648, "step": 45045 }, { "epoch": 1.1005789949429556, "grad_norm": 0.2786150872707367, "learning_rate": 1.9133909255656822e-06, "loss": 0.03, "num_input_tokens_seen": 30371040, "step": 45050 }, { "epoch": 1.1007011457748028, "grad_norm": 0.04457584023475647, "learning_rate": 1.91335620713654e-06, "loss": 0.0026, "num_input_tokens_seen": 30375072, "step": 45055 }, { "epoch": 1.1008232966066498, "grad_norm": 12.544975280761719, "learning_rate": 1.9133214820652247e-06, "loss": 0.1178, "num_input_tokens_seen": 30378400, "step": 45060 }, { "epoch": 1.100945447438497, "grad_norm": 0.055558640509843826, "learning_rate": 1.91328675035199e-06, "loss": 0.101, "num_input_tokens_seen": 30382496, "step": 45065 }, { "epoch": 1.1010675982703442, "grad_norm": 28.11234474182129, "learning_rate": 1.913252011997089e-06, "loss": 0.1579, "num_input_tokens_seen": 30385952, "step": 45070 }, { "epoch": 1.1011897491021914, "grad_norm": 0.06535232812166214, "learning_rate": 1.913217267000773e-06, "loss": 0.0704, "num_input_tokens_seen": 30389024, "step": 45075 }, { "epoch": 1.1013118999340386, "grad_norm": 51.46257400512695, "learning_rate": 1.913182515363296e-06, "loss": 0.0606, "num_input_tokens_seen": 30392288, "step": 45080 }, { "epoch": 1.1014340507658857, "grad_norm": 0.3011631965637207, "learning_rate": 1.9131477570849103e-06, "loss": 0.1839, "num_input_tokens_seen": 30395616, "step": 45085 }, { "epoch": 1.101556201597733, "grad_norm": 15.269543647766113, "learning_rate": 1.913112992165868e-06, "loss": 0.0551, "num_input_tokens_seen": 30398944, "step": 45090 }, { "epoch": 1.1016783524295801, "grad_norm": 0.13697107136249542, "learning_rate": 1.9130782206064228e-06, "loss": 0.1362, "num_input_tokens_seen": 30402464, "step": 45095 }, { "epoch": 1.1018005032614273, "grad_norm": 54.185874938964844, "learning_rate": 1.9130434424068265e-06, "loss": 0.0087, "num_input_tokens_seen": 30405984, "step": 45100 }, { "epoch": 1.1019226540932743, "grad_norm": 9.86892318725586, "learning_rate": 1.9130086575673335e-06, "loss": 0.1299, "num_input_tokens_seen": 30409568, "step": 45105 }, { "epoch": 1.1020448049251215, "grad_norm": 5.100939750671387, "learning_rate": 1.9129738660881956e-06, "loss": 0.0292, "num_input_tokens_seen": 30413152, "step": 45110 }, { "epoch": 1.1021669557569687, "grad_norm": 1.465461254119873, "learning_rate": 1.9129390679696663e-06, "loss": 0.0517, "num_input_tokens_seen": 30416800, "step": 45115 }, { "epoch": 1.1022891065888158, "grad_norm": 0.06135892868041992, "learning_rate": 1.9129042632119986e-06, "loss": 0.0245, "num_input_tokens_seen": 30420384, "step": 45120 }, { "epoch": 1.102411257420663, "grad_norm": 0.11938874423503876, "learning_rate": 1.9128694518154456e-06, "loss": 0.0089, "num_input_tokens_seen": 30423776, "step": 45125 }, { "epoch": 1.1025334082525102, "grad_norm": 4.974461555480957, "learning_rate": 1.91283463378026e-06, "loss": 0.084, "num_input_tokens_seen": 30427168, "step": 45130 }, { "epoch": 1.1026555590843574, "grad_norm": 0.2367285192012787, "learning_rate": 1.912799809106696e-06, "loss": 0.0005, "num_input_tokens_seen": 30430048, "step": 45135 }, { "epoch": 1.1027777099162046, "grad_norm": 135.47918701171875, "learning_rate": 1.912764977795006e-06, "loss": 0.0656, "num_input_tokens_seen": 30433056, "step": 45140 }, { "epoch": 1.1028998607480518, "grad_norm": 36.52012634277344, "learning_rate": 1.9127301398454436e-06, "loss": 0.1529, "num_input_tokens_seen": 30436128, "step": 45145 }, { "epoch": 1.1030220115798988, "grad_norm": 0.22205692529678345, "learning_rate": 1.912695295258262e-06, "loss": 0.1235, "num_input_tokens_seen": 30439328, "step": 45150 }, { "epoch": 1.103144162411746, "grad_norm": 0.08349934220314026, "learning_rate": 1.9126604440337145e-06, "loss": 0.0019, "num_input_tokens_seen": 30443040, "step": 45155 }, { "epoch": 1.1032663132435931, "grad_norm": 58.70408630371094, "learning_rate": 1.9126255861720552e-06, "loss": 0.1519, "num_input_tokens_seen": 30445920, "step": 45160 }, { "epoch": 1.1033884640754403, "grad_norm": 0.7588253617286682, "learning_rate": 1.912590721673537e-06, "loss": 0.0032, "num_input_tokens_seen": 30449120, "step": 45165 }, { "epoch": 1.1035106149072875, "grad_norm": 14.000785827636719, "learning_rate": 1.912555850538414e-06, "loss": 0.0689, "num_input_tokens_seen": 30452512, "step": 45170 }, { "epoch": 1.1036327657391347, "grad_norm": 0.2592606544494629, "learning_rate": 1.9125209727669385e-06, "loss": 0.0416, "num_input_tokens_seen": 30456224, "step": 45175 }, { "epoch": 1.103754916570982, "grad_norm": 66.11405944824219, "learning_rate": 1.912486088359366e-06, "loss": 0.0878, "num_input_tokens_seen": 30459424, "step": 45180 }, { "epoch": 1.103877067402829, "grad_norm": 6.067470073699951, "learning_rate": 1.9124511973159486e-06, "loss": 0.0887, "num_input_tokens_seen": 30462688, "step": 45185 }, { "epoch": 1.103999218234676, "grad_norm": 0.31076791882514954, "learning_rate": 1.912416299636941e-06, "loss": 0.0715, "num_input_tokens_seen": 30466272, "step": 45190 }, { "epoch": 1.1041213690665233, "grad_norm": 0.39867663383483887, "learning_rate": 1.912381395322597e-06, "loss": 0.1073, "num_input_tokens_seen": 30469536, "step": 45195 }, { "epoch": 1.1042435198983704, "grad_norm": 0.05573923513293266, "learning_rate": 1.912346484373169e-06, "loss": 0.0988, "num_input_tokens_seen": 30472608, "step": 45200 }, { "epoch": 1.1043656707302176, "grad_norm": 14.342602729797363, "learning_rate": 1.912311566788913e-06, "loss": 0.0773, "num_input_tokens_seen": 30475808, "step": 45205 }, { "epoch": 1.1044878215620648, "grad_norm": 58.24125289916992, "learning_rate": 1.9122766425700816e-06, "loss": 0.0827, "num_input_tokens_seen": 30479200, "step": 45210 }, { "epoch": 1.104609972393912, "grad_norm": 0.0803033709526062, "learning_rate": 1.912241711716929e-06, "loss": 0.1808, "num_input_tokens_seen": 30482912, "step": 45215 }, { "epoch": 1.1047321232257592, "grad_norm": 0.3461464047431946, "learning_rate": 1.9122067742297093e-06, "loss": 0.0685, "num_input_tokens_seen": 30486560, "step": 45220 }, { "epoch": 1.1048542740576064, "grad_norm": 1.3804208040237427, "learning_rate": 1.9121718301086766e-06, "loss": 0.1057, "num_input_tokens_seen": 30490016, "step": 45225 }, { "epoch": 1.1049764248894536, "grad_norm": 32.63319778442383, "learning_rate": 1.912136879354085e-06, "loss": 0.1027, "num_input_tokens_seen": 30493408, "step": 45230 }, { "epoch": 1.1050985757213008, "grad_norm": 25.16427993774414, "learning_rate": 1.912101921966189e-06, "loss": 0.1223, "num_input_tokens_seen": 30497312, "step": 45235 }, { "epoch": 1.1052207265531477, "grad_norm": 6.807790756225586, "learning_rate": 1.912066957945242e-06, "loss": 0.0408, "num_input_tokens_seen": 30500704, "step": 45240 }, { "epoch": 1.105342877384995, "grad_norm": 47.124122619628906, "learning_rate": 1.912031987291499e-06, "loss": 0.1212, "num_input_tokens_seen": 30503904, "step": 45245 }, { "epoch": 1.1054650282168421, "grad_norm": 16.168676376342773, "learning_rate": 1.911997010005214e-06, "loss": 0.1472, "num_input_tokens_seen": 30507296, "step": 45250 }, { "epoch": 1.1055871790486893, "grad_norm": 10.315686225891113, "learning_rate": 1.9119620260866415e-06, "loss": 0.0507, "num_input_tokens_seen": 30511264, "step": 45255 }, { "epoch": 1.1057093298805365, "grad_norm": 15.87485408782959, "learning_rate": 1.911927035536036e-06, "loss": 0.132, "num_input_tokens_seen": 30514272, "step": 45260 }, { "epoch": 1.1058314807123837, "grad_norm": 0.41578197479248047, "learning_rate": 1.9118920383536515e-06, "loss": 0.0812, "num_input_tokens_seen": 30517472, "step": 45265 }, { "epoch": 1.1059536315442309, "grad_norm": 27.88473129272461, "learning_rate": 1.911857034539743e-06, "loss": 0.056, "num_input_tokens_seen": 30520800, "step": 45270 }, { "epoch": 1.106075782376078, "grad_norm": 14.108925819396973, "learning_rate": 1.911822024094565e-06, "loss": 0.0698, "num_input_tokens_seen": 30524832, "step": 45275 }, { "epoch": 1.106197933207925, "grad_norm": 44.32224655151367, "learning_rate": 1.9117870070183718e-06, "loss": 0.0387, "num_input_tokens_seen": 30528160, "step": 45280 }, { "epoch": 1.1063200840397722, "grad_norm": 5.657730579376221, "learning_rate": 1.9117519833114185e-06, "loss": 0.0378, "num_input_tokens_seen": 30531040, "step": 45285 }, { "epoch": 1.1064422348716194, "grad_norm": 83.99029541015625, "learning_rate": 1.9117169529739595e-06, "loss": 0.1398, "num_input_tokens_seen": 30534304, "step": 45290 }, { "epoch": 1.1065643857034666, "grad_norm": 0.23789186775684357, "learning_rate": 1.9116819160062493e-06, "loss": 0.0015, "num_input_tokens_seen": 30537440, "step": 45295 }, { "epoch": 1.1066865365353138, "grad_norm": 8.420495986938477, "learning_rate": 1.9116468724085433e-06, "loss": 0.0918, "num_input_tokens_seen": 30541536, "step": 45300 }, { "epoch": 1.106808687367161, "grad_norm": 107.93529510498047, "learning_rate": 1.9116118221810956e-06, "loss": 0.1732, "num_input_tokens_seen": 30544992, "step": 45305 }, { "epoch": 1.1069308381990082, "grad_norm": 0.3019404411315918, "learning_rate": 1.911576765324162e-06, "loss": 0.2395, "num_input_tokens_seen": 30548128, "step": 45310 }, { "epoch": 1.1070529890308554, "grad_norm": 1.3898652791976929, "learning_rate": 1.911541701837997e-06, "loss": 0.1195, "num_input_tokens_seen": 30551008, "step": 45315 }, { "epoch": 1.1071751398627026, "grad_norm": 2.8064727783203125, "learning_rate": 1.9115066317228552e-06, "loss": 0.0957, "num_input_tokens_seen": 30554592, "step": 45320 }, { "epoch": 1.1072972906945497, "grad_norm": 18.04161834716797, "learning_rate": 1.911471554978992e-06, "loss": 0.1277, "num_input_tokens_seen": 30558304, "step": 45325 }, { "epoch": 1.1074194415263967, "grad_norm": 0.3619232773780823, "learning_rate": 1.911436471606663e-06, "loss": 0.0031, "num_input_tokens_seen": 30562080, "step": 45330 }, { "epoch": 1.107541592358244, "grad_norm": 27.725862503051758, "learning_rate": 1.9114013816061222e-06, "loss": 0.1244, "num_input_tokens_seen": 30565344, "step": 45335 }, { "epoch": 1.107663743190091, "grad_norm": 48.7161979675293, "learning_rate": 1.911366284977626e-06, "loss": 0.0315, "num_input_tokens_seen": 30569184, "step": 45340 }, { "epoch": 1.1077858940219383, "grad_norm": 0.17331643402576447, "learning_rate": 1.9113311817214287e-06, "loss": 0.0021, "num_input_tokens_seen": 30572256, "step": 45345 }, { "epoch": 1.1079080448537855, "grad_norm": 10.249677658081055, "learning_rate": 1.911296071837786e-06, "loss": 0.1518, "num_input_tokens_seen": 30575712, "step": 45350 }, { "epoch": 1.1080301956856327, "grad_norm": 0.9612787961959839, "learning_rate": 1.911260955326953e-06, "loss": 0.0362, "num_input_tokens_seen": 30579232, "step": 45355 }, { "epoch": 1.1081523465174798, "grad_norm": 28.470977783203125, "learning_rate": 1.9112258321891858e-06, "loss": 0.1136, "num_input_tokens_seen": 30583072, "step": 45360 }, { "epoch": 1.108274497349327, "grad_norm": 0.3659687042236328, "learning_rate": 1.9111907024247387e-06, "loss": 0.0033, "num_input_tokens_seen": 30586336, "step": 45365 }, { "epoch": 1.108396648181174, "grad_norm": 0.03629086911678314, "learning_rate": 1.9111555660338677e-06, "loss": 0.0824, "num_input_tokens_seen": 30589600, "step": 45370 }, { "epoch": 1.1085187990130212, "grad_norm": 5.431779384613037, "learning_rate": 1.9111204230168287e-06, "loss": 0.1192, "num_input_tokens_seen": 30593248, "step": 45375 }, { "epoch": 1.1086409498448684, "grad_norm": 0.3708115816116333, "learning_rate": 1.9110852733738766e-06, "loss": 0.0516, "num_input_tokens_seen": 30596384, "step": 45380 }, { "epoch": 1.1087631006767156, "grad_norm": 4.54123592376709, "learning_rate": 1.9110501171052676e-06, "loss": 0.0061, "num_input_tokens_seen": 30599840, "step": 45385 }, { "epoch": 1.1088852515085628, "grad_norm": 11.170312881469727, "learning_rate": 1.911014954211257e-06, "loss": 0.0958, "num_input_tokens_seen": 30603104, "step": 45390 }, { "epoch": 1.10900740234041, "grad_norm": 0.07969980686903, "learning_rate": 1.910979784692101e-06, "loss": 0.0008, "num_input_tokens_seen": 30606624, "step": 45395 }, { "epoch": 1.1091295531722571, "grad_norm": 22.515426635742188, "learning_rate": 1.9109446085480543e-06, "loss": 0.1607, "num_input_tokens_seen": 30609952, "step": 45400 }, { "epoch": 1.1092517040041043, "grad_norm": 38.219364166259766, "learning_rate": 1.9109094257793736e-06, "loss": 0.0909, "num_input_tokens_seen": 30613024, "step": 45405 }, { "epoch": 1.1093738548359515, "grad_norm": 0.10983511060476303, "learning_rate": 1.9108742363863147e-06, "loss": 0.0454, "num_input_tokens_seen": 30616736, "step": 45410 }, { "epoch": 1.1094960056677987, "grad_norm": 24.83835220336914, "learning_rate": 1.9108390403691333e-06, "loss": 0.0845, "num_input_tokens_seen": 30619808, "step": 45415 }, { "epoch": 1.1096181564996457, "grad_norm": 51.3784065246582, "learning_rate": 1.9108038377280856e-06, "loss": 0.0218, "num_input_tokens_seen": 30623200, "step": 45420 }, { "epoch": 1.1097403073314929, "grad_norm": 34.15353775024414, "learning_rate": 1.910768628463427e-06, "loss": 0.1623, "num_input_tokens_seen": 30626400, "step": 45425 }, { "epoch": 1.10986245816334, "grad_norm": 0.046371277421712875, "learning_rate": 1.9107334125754143e-06, "loss": 0.0092, "num_input_tokens_seen": 30629920, "step": 45430 }, { "epoch": 1.1099846089951872, "grad_norm": 59.40081787109375, "learning_rate": 1.910698190064303e-06, "loss": 0.2519, "num_input_tokens_seen": 30632928, "step": 45435 }, { "epoch": 1.1101067598270344, "grad_norm": 37.21415328979492, "learning_rate": 1.91066296093035e-06, "loss": 0.3251, "num_input_tokens_seen": 30636576, "step": 45440 }, { "epoch": 1.1102289106588816, "grad_norm": 20.704212188720703, "learning_rate": 1.9106277251738104e-06, "loss": 0.1401, "num_input_tokens_seen": 30639712, "step": 45445 }, { "epoch": 1.1103510614907288, "grad_norm": 3.910080671310425, "learning_rate": 1.9105924827949417e-06, "loss": 0.0518, "num_input_tokens_seen": 30643104, "step": 45450 }, { "epoch": 1.110473212322576, "grad_norm": 58.14051055908203, "learning_rate": 1.910557233793999e-06, "loss": 0.134, "num_input_tokens_seen": 30646304, "step": 45455 }, { "epoch": 1.110595363154423, "grad_norm": 0.6852989196777344, "learning_rate": 1.9105219781712396e-06, "loss": 0.002, "num_input_tokens_seen": 30649376, "step": 45460 }, { "epoch": 1.1107175139862702, "grad_norm": 0.48834651708602905, "learning_rate": 1.910486715926919e-06, "loss": 0.077, "num_input_tokens_seen": 30652512, "step": 45465 }, { "epoch": 1.1108396648181174, "grad_norm": 0.4228566288948059, "learning_rate": 1.9104514470612946e-06, "loss": 0.1008, "num_input_tokens_seen": 30656096, "step": 45470 }, { "epoch": 1.1109618156499645, "grad_norm": 1.1528277397155762, "learning_rate": 1.910416171574622e-06, "loss": 0.0885, "num_input_tokens_seen": 30659744, "step": 45475 }, { "epoch": 1.1110839664818117, "grad_norm": 20.612672805786133, "learning_rate": 1.9103808894671586e-06, "loss": 0.062, "num_input_tokens_seen": 30663008, "step": 45480 }, { "epoch": 1.111206117313659, "grad_norm": 21.08039665222168, "learning_rate": 1.91034560073916e-06, "loss": 0.039, "num_input_tokens_seen": 30666528, "step": 45485 }, { "epoch": 1.1113282681455061, "grad_norm": 58.673824310302734, "learning_rate": 1.9103103053908834e-06, "loss": 0.0721, "num_input_tokens_seen": 30669920, "step": 45490 }, { "epoch": 1.1114504189773533, "grad_norm": 0.6854089498519897, "learning_rate": 1.910275003422586e-06, "loss": 0.0012, "num_input_tokens_seen": 30673184, "step": 45495 }, { "epoch": 1.1115725698092005, "grad_norm": 71.15158081054688, "learning_rate": 1.910239694834523e-06, "loss": 0.0413, "num_input_tokens_seen": 30676384, "step": 45500 }, { "epoch": 1.1116947206410475, "grad_norm": 0.11159415543079376, "learning_rate": 1.910204379626953e-06, "loss": 0.0017, "num_input_tokens_seen": 30679840, "step": 45505 }, { "epoch": 1.1118168714728947, "grad_norm": 0.036455605179071426, "learning_rate": 1.9101690578001313e-06, "loss": 0.0628, "num_input_tokens_seen": 30683232, "step": 45510 }, { "epoch": 1.1119390223047418, "grad_norm": 0.4710502028465271, "learning_rate": 1.9101337293543156e-06, "loss": 0.0996, "num_input_tokens_seen": 30686560, "step": 45515 }, { "epoch": 1.112061173136589, "grad_norm": 30.097923278808594, "learning_rate": 1.910098394289763e-06, "loss": 0.0881, "num_input_tokens_seen": 30689632, "step": 45520 }, { "epoch": 1.1121833239684362, "grad_norm": 93.26819610595703, "learning_rate": 1.9100630526067292e-06, "loss": 0.0683, "num_input_tokens_seen": 30693024, "step": 45525 }, { "epoch": 1.1123054748002834, "grad_norm": 67.23088836669922, "learning_rate": 1.9100277043054727e-06, "loss": 0.0671, "num_input_tokens_seen": 30696160, "step": 45530 }, { "epoch": 1.1124276256321306, "grad_norm": 97.0174560546875, "learning_rate": 1.90999234938625e-06, "loss": 0.0694, "num_input_tokens_seen": 30699040, "step": 45535 }, { "epoch": 1.1125497764639778, "grad_norm": 0.051360975950956345, "learning_rate": 1.909956987849318e-06, "loss": 0.0007, "num_input_tokens_seen": 30702304, "step": 45540 }, { "epoch": 1.112671927295825, "grad_norm": 73.1390151977539, "learning_rate": 1.909921619694934e-06, "loss": 0.0649, "num_input_tokens_seen": 30705632, "step": 45545 }, { "epoch": 1.112794078127672, "grad_norm": 0.31845179200172424, "learning_rate": 1.909886244923356e-06, "loss": 0.0619, "num_input_tokens_seen": 30709088, "step": 45550 }, { "epoch": 1.1129162289595191, "grad_norm": 10.11508846282959, "learning_rate": 1.9098508635348398e-06, "loss": 0.1361, "num_input_tokens_seen": 30712480, "step": 45555 }, { "epoch": 1.1130383797913663, "grad_norm": 0.17602814733982086, "learning_rate": 1.909815475529643e-06, "loss": 0.0475, "num_input_tokens_seen": 30715680, "step": 45560 }, { "epoch": 1.1131605306232135, "grad_norm": 1.3473693132400513, "learning_rate": 1.909780080908024e-06, "loss": 0.0958, "num_input_tokens_seen": 30718880, "step": 45565 }, { "epoch": 1.1132826814550607, "grad_norm": 73.33531188964844, "learning_rate": 1.9097446796702395e-06, "loss": 0.0781, "num_input_tokens_seen": 30722144, "step": 45570 }, { "epoch": 1.113404832286908, "grad_norm": 12.397170066833496, "learning_rate": 1.909709271816547e-06, "loss": 0.1832, "num_input_tokens_seen": 30725280, "step": 45575 }, { "epoch": 1.113526983118755, "grad_norm": 0.47446826100349426, "learning_rate": 1.9096738573472035e-06, "loss": 0.0437, "num_input_tokens_seen": 30728928, "step": 45580 }, { "epoch": 1.1136491339506023, "grad_norm": 0.17872124910354614, "learning_rate": 1.9096384362624675e-06, "loss": 0.2124, "num_input_tokens_seen": 30732192, "step": 45585 }, { "epoch": 1.1137712847824495, "grad_norm": 51.15095138549805, "learning_rate": 1.909603008562596e-06, "loss": 0.1358, "num_input_tokens_seen": 30735264, "step": 45590 }, { "epoch": 1.1138934356142964, "grad_norm": 63.00979232788086, "learning_rate": 1.909567574247847e-06, "loss": 0.0688, "num_input_tokens_seen": 30738720, "step": 45595 }, { "epoch": 1.1140155864461436, "grad_norm": 0.1376773715019226, "learning_rate": 1.9095321333184777e-06, "loss": 0.0368, "num_input_tokens_seen": 30741984, "step": 45600 }, { "epoch": 1.1141377372779908, "grad_norm": 8.297344207763672, "learning_rate": 1.909496685774746e-06, "loss": 0.06, "num_input_tokens_seen": 30745248, "step": 45605 }, { "epoch": 1.114259888109838, "grad_norm": 1.1917724609375, "learning_rate": 1.90946123161691e-06, "loss": 0.0493, "num_input_tokens_seen": 30748832, "step": 45610 }, { "epoch": 1.1143820389416852, "grad_norm": 45.727996826171875, "learning_rate": 1.9094257708452275e-06, "loss": 0.1171, "num_input_tokens_seen": 30752352, "step": 45615 }, { "epoch": 1.1145041897735324, "grad_norm": 0.2739153206348419, "learning_rate": 1.909390303459956e-06, "loss": 0.1067, "num_input_tokens_seen": 30755296, "step": 45620 }, { "epoch": 1.1146263406053796, "grad_norm": 106.9163818359375, "learning_rate": 1.9093548294613533e-06, "loss": 0.2377, "num_input_tokens_seen": 30758432, "step": 45625 }, { "epoch": 1.1147484914372268, "grad_norm": 1.4975440502166748, "learning_rate": 1.9093193488496778e-06, "loss": 0.0355, "num_input_tokens_seen": 30761824, "step": 45630 }, { "epoch": 1.114870642269074, "grad_norm": 1.395308494567871, "learning_rate": 1.9092838616251877e-06, "loss": 0.0493, "num_input_tokens_seen": 30765728, "step": 45635 }, { "epoch": 1.114992793100921, "grad_norm": 21.039077758789062, "learning_rate": 1.9092483677881405e-06, "loss": 0.1208, "num_input_tokens_seen": 30769120, "step": 45640 }, { "epoch": 1.115114943932768, "grad_norm": 0.11878981441259384, "learning_rate": 1.909212867338795e-06, "loss": 0.001, "num_input_tokens_seen": 30772384, "step": 45645 }, { "epoch": 1.1152370947646153, "grad_norm": 8.132143020629883, "learning_rate": 1.9091773602774087e-06, "loss": 0.0624, "num_input_tokens_seen": 30775584, "step": 45650 }, { "epoch": 1.1153592455964625, "grad_norm": 27.349586486816406, "learning_rate": 1.90914184660424e-06, "loss": 0.1317, "num_input_tokens_seen": 30778464, "step": 45655 }, { "epoch": 1.1154813964283097, "grad_norm": 44.20734786987305, "learning_rate": 1.9091063263195473e-06, "loss": 0.1104, "num_input_tokens_seen": 30781984, "step": 45660 }, { "epoch": 1.1156035472601569, "grad_norm": 10.095579147338867, "learning_rate": 1.909070799423589e-06, "loss": 0.1635, "num_input_tokens_seen": 30785248, "step": 45665 }, { "epoch": 1.115725698092004, "grad_norm": 8.791191101074219, "learning_rate": 1.9090352659166232e-06, "loss": 0.1662, "num_input_tokens_seen": 30788896, "step": 45670 }, { "epoch": 1.1158478489238512, "grad_norm": 11.370107650756836, "learning_rate": 1.9089997257989084e-06, "loss": 0.0861, "num_input_tokens_seen": 30792416, "step": 45675 }, { "epoch": 1.1159699997556984, "grad_norm": 0.14778165519237518, "learning_rate": 1.9089641790707036e-06, "loss": 0.0014, "num_input_tokens_seen": 30796128, "step": 45680 }, { "epoch": 1.1160921505875454, "grad_norm": 0.42808398604393005, "learning_rate": 1.9089286257322664e-06, "loss": 0.0489, "num_input_tokens_seen": 30799328, "step": 45685 }, { "epoch": 1.1162143014193926, "grad_norm": 13.010444641113281, "learning_rate": 1.908893065783856e-06, "loss": 0.2035, "num_input_tokens_seen": 30803296, "step": 45690 }, { "epoch": 1.1163364522512398, "grad_norm": 3.7101378440856934, "learning_rate": 1.90885749922573e-06, "loss": 0.0675, "num_input_tokens_seen": 30806432, "step": 45695 }, { "epoch": 1.116458603083087, "grad_norm": 12.728604316711426, "learning_rate": 1.9088219260581488e-06, "loss": 0.1025, "num_input_tokens_seen": 30809824, "step": 45700 }, { "epoch": 1.1165807539149342, "grad_norm": 25.37266731262207, "learning_rate": 1.90878634628137e-06, "loss": 0.1207, "num_input_tokens_seen": 30813088, "step": 45705 }, { "epoch": 1.1167029047467814, "grad_norm": 91.44532012939453, "learning_rate": 1.908750759895652e-06, "loss": 0.0226, "num_input_tokens_seen": 30816800, "step": 45710 }, { "epoch": 1.1168250555786285, "grad_norm": 0.25230729579925537, "learning_rate": 1.908715166901254e-06, "loss": 0.0618, "num_input_tokens_seen": 30819680, "step": 45715 }, { "epoch": 1.1169472064104757, "grad_norm": 0.22132541239261627, "learning_rate": 1.908679567298435e-06, "loss": 0.0726, "num_input_tokens_seen": 30822816, "step": 45720 }, { "epoch": 1.1170693572423227, "grad_norm": 0.8834930658340454, "learning_rate": 1.908643961087454e-06, "loss": 0.007, "num_input_tokens_seen": 30825888, "step": 45725 }, { "epoch": 1.11719150807417, "grad_norm": 23.24863052368164, "learning_rate": 1.9086083482685696e-06, "loss": 0.0397, "num_input_tokens_seen": 30829408, "step": 45730 }, { "epoch": 1.117313658906017, "grad_norm": 0.04711702838540077, "learning_rate": 1.908572728842041e-06, "loss": 0.0484, "num_input_tokens_seen": 30832800, "step": 45735 }, { "epoch": 1.1174358097378643, "grad_norm": 1.2767448425292969, "learning_rate": 1.908537102808127e-06, "loss": 0.0684, "num_input_tokens_seen": 30836192, "step": 45740 }, { "epoch": 1.1175579605697115, "grad_norm": 0.08316470682621002, "learning_rate": 1.9085014701670866e-06, "loss": 0.0011, "num_input_tokens_seen": 30839904, "step": 45745 }, { "epoch": 1.1176801114015587, "grad_norm": 7.2097015380859375, "learning_rate": 1.9084658309191798e-06, "loss": 0.0022, "num_input_tokens_seen": 30842912, "step": 45750 }, { "epoch": 1.1178022622334058, "grad_norm": 0.1650509536266327, "learning_rate": 1.9084301850646645e-06, "loss": 0.0953, "num_input_tokens_seen": 30846560, "step": 45755 }, { "epoch": 1.117924413065253, "grad_norm": 116.87432861328125, "learning_rate": 1.908394532603801e-06, "loss": 0.0214, "num_input_tokens_seen": 30850016, "step": 45760 }, { "epoch": 1.1180465638971002, "grad_norm": 0.04541773349046707, "learning_rate": 1.908358873536848e-06, "loss": 0.0006, "num_input_tokens_seen": 30853152, "step": 45765 }, { "epoch": 1.1181687147289474, "grad_norm": 0.08230971544981003, "learning_rate": 1.9083232078640647e-06, "loss": 0.0494, "num_input_tokens_seen": 30856864, "step": 45770 }, { "epoch": 1.1182908655607944, "grad_norm": 13.339716911315918, "learning_rate": 1.908287535585711e-06, "loss": 0.1459, "num_input_tokens_seen": 30860128, "step": 45775 }, { "epoch": 1.1184130163926416, "grad_norm": 0.05430752784013748, "learning_rate": 1.9082518567020457e-06, "loss": 0.1142, "num_input_tokens_seen": 30863648, "step": 45780 }, { "epoch": 1.1185351672244888, "grad_norm": 0.3133643567562103, "learning_rate": 1.908216171213329e-06, "loss": 0.0439, "num_input_tokens_seen": 30866976, "step": 45785 }, { "epoch": 1.118657318056336, "grad_norm": 5.240131378173828, "learning_rate": 1.90818047911982e-06, "loss": 0.1414, "num_input_tokens_seen": 30870688, "step": 45790 }, { "epoch": 1.1187794688881831, "grad_norm": 0.13645559549331665, "learning_rate": 1.908144780421778e-06, "loss": 0.0685, "num_input_tokens_seen": 30873760, "step": 45795 }, { "epoch": 1.1189016197200303, "grad_norm": 3.473702907562256, "learning_rate": 1.908109075119463e-06, "loss": 0.0432, "num_input_tokens_seen": 30877152, "step": 45800 }, { "epoch": 1.1190237705518775, "grad_norm": 0.1498257964849472, "learning_rate": 1.9080733632131347e-06, "loss": 0.0734, "num_input_tokens_seen": 30881312, "step": 45805 }, { "epoch": 1.1191459213837247, "grad_norm": 16.656753540039062, "learning_rate": 1.9080376447030525e-06, "loss": 0.1117, "num_input_tokens_seen": 30884512, "step": 45810 }, { "epoch": 1.1192680722155717, "grad_norm": 0.9916263818740845, "learning_rate": 1.9080019195894766e-06, "loss": 0.1565, "num_input_tokens_seen": 30887648, "step": 45815 }, { "epoch": 1.1193902230474189, "grad_norm": 0.260072261095047, "learning_rate": 1.9079661878726663e-06, "loss": 0.0634, "num_input_tokens_seen": 30891296, "step": 45820 }, { "epoch": 1.119512373879266, "grad_norm": 0.287136435508728, "learning_rate": 1.9079304495528815e-06, "loss": 0.0536, "num_input_tokens_seen": 30894624, "step": 45825 }, { "epoch": 1.1196345247111132, "grad_norm": 48.25025177001953, "learning_rate": 1.9078947046303825e-06, "loss": 0.0291, "num_input_tokens_seen": 30897952, "step": 45830 }, { "epoch": 1.1197566755429604, "grad_norm": 71.15177917480469, "learning_rate": 1.907858953105429e-06, "loss": 0.1195, "num_input_tokens_seen": 30901152, "step": 45835 }, { "epoch": 1.1198788263748076, "grad_norm": 42.36701202392578, "learning_rate": 1.907823194978281e-06, "loss": 0.2208, "num_input_tokens_seen": 30904672, "step": 45840 }, { "epoch": 1.1200009772066548, "grad_norm": 0.25921159982681274, "learning_rate": 1.9077874302491985e-06, "loss": 0.0397, "num_input_tokens_seen": 30907808, "step": 45845 }, { "epoch": 1.120123128038502, "grad_norm": 0.9302563071250916, "learning_rate": 1.9077516589184416e-06, "loss": 0.0989, "num_input_tokens_seen": 30910880, "step": 45850 }, { "epoch": 1.1202452788703492, "grad_norm": 29.44671058654785, "learning_rate": 1.9077158809862707e-06, "loss": 0.1054, "num_input_tokens_seen": 30914336, "step": 45855 }, { "epoch": 1.1203674297021964, "grad_norm": 0.4291466772556305, "learning_rate": 1.9076800964529455e-06, "loss": 0.0362, "num_input_tokens_seen": 30917536, "step": 45860 }, { "epoch": 1.1204895805340434, "grad_norm": 0.14217062294483185, "learning_rate": 1.9076443053187265e-06, "loss": 0.0847, "num_input_tokens_seen": 30920736, "step": 45865 }, { "epoch": 1.1206117313658905, "grad_norm": 0.11226175725460052, "learning_rate": 1.907608507583874e-06, "loss": 0.0048, "num_input_tokens_seen": 30925216, "step": 45870 }, { "epoch": 1.1207338821977377, "grad_norm": 0.032658837735652924, "learning_rate": 1.9075727032486486e-06, "loss": 0.0274, "num_input_tokens_seen": 30928672, "step": 45875 }, { "epoch": 1.120856033029585, "grad_norm": 11.843403816223145, "learning_rate": 1.9075368923133102e-06, "loss": 0.0966, "num_input_tokens_seen": 30933024, "step": 45880 }, { "epoch": 1.120978183861432, "grad_norm": 0.8221790790557861, "learning_rate": 1.9075010747781194e-06, "loss": 0.0922, "num_input_tokens_seen": 30936032, "step": 45885 }, { "epoch": 1.1211003346932793, "grad_norm": 18.793682098388672, "learning_rate": 1.9074652506433367e-06, "loss": 0.0999, "num_input_tokens_seen": 30939424, "step": 45890 }, { "epoch": 1.1212224855251265, "grad_norm": 0.1459447145462036, "learning_rate": 1.9074294199092224e-06, "loss": 0.1094, "num_input_tokens_seen": 30942752, "step": 45895 }, { "epoch": 1.1213446363569737, "grad_norm": 0.7048338651657104, "learning_rate": 1.907393582576038e-06, "loss": 0.132, "num_input_tokens_seen": 30946208, "step": 45900 }, { "epoch": 1.1214667871888206, "grad_norm": 0.9069101214408875, "learning_rate": 1.9073577386440423e-06, "loss": 0.0294, "num_input_tokens_seen": 30949088, "step": 45905 }, { "epoch": 1.1215889380206678, "grad_norm": 10.497543334960938, "learning_rate": 1.9073218881134979e-06, "loss": 0.1609, "num_input_tokens_seen": 30952352, "step": 45910 }, { "epoch": 1.121711088852515, "grad_norm": 63.032737731933594, "learning_rate": 1.9072860309846647e-06, "loss": 0.0919, "num_input_tokens_seen": 30955296, "step": 45915 }, { "epoch": 1.1218332396843622, "grad_norm": 43.05061340332031, "learning_rate": 1.907250167257803e-06, "loss": 0.0923, "num_input_tokens_seen": 30958496, "step": 45920 }, { "epoch": 1.1219553905162094, "grad_norm": 1.525328278541565, "learning_rate": 1.9072142969331746e-06, "loss": 0.1738, "num_input_tokens_seen": 30961632, "step": 45925 }, { "epoch": 1.1220775413480566, "grad_norm": 0.12426701933145523, "learning_rate": 1.9071784200110392e-06, "loss": 0.0542, "num_input_tokens_seen": 30965024, "step": 45930 }, { "epoch": 1.1221996921799038, "grad_norm": 27.84307098388672, "learning_rate": 1.9071425364916588e-06, "loss": 0.0523, "num_input_tokens_seen": 30968352, "step": 45935 }, { "epoch": 1.122321843011751, "grad_norm": 0.10840397328138351, "learning_rate": 1.907106646375294e-06, "loss": 0.133, "num_input_tokens_seen": 30971808, "step": 45940 }, { "epoch": 1.1224439938435982, "grad_norm": 30.482160568237305, "learning_rate": 1.907070749662205e-06, "loss": 0.0629, "num_input_tokens_seen": 30975968, "step": 45945 }, { "epoch": 1.1225661446754454, "grad_norm": 11.64674186706543, "learning_rate": 1.907034846352654e-06, "loss": 0.1863, "num_input_tokens_seen": 30978976, "step": 45950 }, { "epoch": 1.1226882955072923, "grad_norm": 16.049978256225586, "learning_rate": 1.9069989364469016e-06, "loss": 0.1616, "num_input_tokens_seen": 30982368, "step": 45955 }, { "epoch": 1.1228104463391395, "grad_norm": 30.9919376373291, "learning_rate": 1.906963019945209e-06, "loss": 0.1339, "num_input_tokens_seen": 30985952, "step": 45960 }, { "epoch": 1.1229325971709867, "grad_norm": 0.8775382041931152, "learning_rate": 1.9069270968478376e-06, "loss": 0.0615, "num_input_tokens_seen": 30989536, "step": 45965 }, { "epoch": 1.123054748002834, "grad_norm": 0.7894930839538574, "learning_rate": 1.906891167155048e-06, "loss": 0.0056, "num_input_tokens_seen": 30992928, "step": 45970 }, { "epoch": 1.123176898834681, "grad_norm": 0.265011727809906, "learning_rate": 1.906855230867102e-06, "loss": 0.0017, "num_input_tokens_seen": 30996256, "step": 45975 }, { "epoch": 1.1232990496665283, "grad_norm": 22.024242401123047, "learning_rate": 1.906819287984261e-06, "loss": 0.1181, "num_input_tokens_seen": 30999648, "step": 45980 }, { "epoch": 1.1234212004983755, "grad_norm": 101.3171157836914, "learning_rate": 1.9067833385067862e-06, "loss": 0.1356, "num_input_tokens_seen": 31002528, "step": 45985 }, { "epoch": 1.1235433513302227, "grad_norm": 10.967216491699219, "learning_rate": 1.906747382434939e-06, "loss": 0.1763, "num_input_tokens_seen": 31005472, "step": 45990 }, { "epoch": 1.1236655021620696, "grad_norm": 0.22470149397850037, "learning_rate": 1.9067114197689809e-06, "loss": 0.0701, "num_input_tokens_seen": 31008864, "step": 45995 }, { "epoch": 1.1237876529939168, "grad_norm": 37.859989166259766, "learning_rate": 1.9066754505091735e-06, "loss": 0.0851, "num_input_tokens_seen": 31012192, "step": 46000 }, { "epoch": 1.123909803825764, "grad_norm": 9.107433319091797, "learning_rate": 1.9066394746557783e-06, "loss": 0.0815, "num_input_tokens_seen": 31015648, "step": 46005 }, { "epoch": 1.1240319546576112, "grad_norm": 37.07771301269531, "learning_rate": 1.9066034922090573e-06, "loss": 0.1494, "num_input_tokens_seen": 31018784, "step": 46010 }, { "epoch": 1.1241541054894584, "grad_norm": 0.12652719020843506, "learning_rate": 1.9065675031692718e-06, "loss": 0.0123, "num_input_tokens_seen": 31022240, "step": 46015 }, { "epoch": 1.1242762563213056, "grad_norm": 18.91283416748047, "learning_rate": 1.9065315075366834e-06, "loss": 0.1601, "num_input_tokens_seen": 31025504, "step": 46020 }, { "epoch": 1.1243984071531528, "grad_norm": 11.62320327758789, "learning_rate": 1.906495505311554e-06, "loss": 0.0768, "num_input_tokens_seen": 31028832, "step": 46025 }, { "epoch": 1.124520557985, "grad_norm": 33.32771682739258, "learning_rate": 1.9064594964941456e-06, "loss": 0.1274, "num_input_tokens_seen": 31032096, "step": 46030 }, { "epoch": 1.1246427088168471, "grad_norm": 0.2060808539390564, "learning_rate": 1.9064234810847198e-06, "loss": 0.0304, "num_input_tokens_seen": 31035488, "step": 46035 }, { "epoch": 1.124764859648694, "grad_norm": 3.325789213180542, "learning_rate": 1.9063874590835386e-06, "loss": 0.0452, "num_input_tokens_seen": 31038944, "step": 46040 }, { "epoch": 1.1248870104805413, "grad_norm": 3.3198955059051514, "learning_rate": 1.9063514304908641e-06, "loss": 0.2326, "num_input_tokens_seen": 31042528, "step": 46045 }, { "epoch": 1.1250091613123885, "grad_norm": 20.069190979003906, "learning_rate": 1.9063153953069583e-06, "loss": 0.1244, "num_input_tokens_seen": 31045664, "step": 46050 }, { "epoch": 1.1251313121442357, "grad_norm": 0.09302736073732376, "learning_rate": 1.906279353532083e-06, "loss": 0.0525, "num_input_tokens_seen": 31049248, "step": 46055 }, { "epoch": 1.1252534629760829, "grad_norm": 0.030136937275528908, "learning_rate": 1.9062433051665008e-06, "loss": 0.0353, "num_input_tokens_seen": 31052576, "step": 46060 }, { "epoch": 1.12537561380793, "grad_norm": 0.379341185092926, "learning_rate": 1.9062072502104734e-06, "loss": 0.013, "num_input_tokens_seen": 31055776, "step": 46065 }, { "epoch": 1.1254977646397772, "grad_norm": 117.80951690673828, "learning_rate": 1.906171188664263e-06, "loss": 0.179, "num_input_tokens_seen": 31059040, "step": 46070 }, { "epoch": 1.1256199154716244, "grad_norm": 0.32162290811538696, "learning_rate": 1.9061351205281322e-06, "loss": 0.0592, "num_input_tokens_seen": 31062432, "step": 46075 }, { "epoch": 1.1257420663034716, "grad_norm": 6.650218486785889, "learning_rate": 1.906099045802343e-06, "loss": 0.0514, "num_input_tokens_seen": 31065760, "step": 46080 }, { "epoch": 1.1258642171353186, "grad_norm": 0.14563031494617462, "learning_rate": 1.9060629644871576e-06, "loss": 0.0684, "num_input_tokens_seen": 31068896, "step": 46085 }, { "epoch": 1.1259863679671658, "grad_norm": 7.294920444488525, "learning_rate": 1.9060268765828388e-06, "loss": 0.1949, "num_input_tokens_seen": 31072288, "step": 46090 }, { "epoch": 1.126108518799013, "grad_norm": 0.2821553945541382, "learning_rate": 1.905990782089649e-06, "loss": 0.0292, "num_input_tokens_seen": 31075744, "step": 46095 }, { "epoch": 1.1262306696308602, "grad_norm": 0.4164447486400604, "learning_rate": 1.9059546810078504e-06, "loss": 0.0117, "num_input_tokens_seen": 31079456, "step": 46100 }, { "epoch": 1.1263528204627073, "grad_norm": 0.48666372895240784, "learning_rate": 1.9059185733377057e-06, "loss": 0.0041, "num_input_tokens_seen": 31082912, "step": 46105 }, { "epoch": 1.1264749712945545, "grad_norm": 48.44977569580078, "learning_rate": 1.9058824590794776e-06, "loss": 0.0755, "num_input_tokens_seen": 31086432, "step": 46110 }, { "epoch": 1.1265971221264017, "grad_norm": 22.645042419433594, "learning_rate": 1.9058463382334283e-06, "loss": 0.0963, "num_input_tokens_seen": 31090336, "step": 46115 }, { "epoch": 1.126719272958249, "grad_norm": 0.06348301470279694, "learning_rate": 1.9058102107998208e-06, "loss": 0.0509, "num_input_tokens_seen": 31093856, "step": 46120 }, { "epoch": 1.126841423790096, "grad_norm": 96.59019470214844, "learning_rate": 1.9057740767789182e-06, "loss": 0.0786, "num_input_tokens_seen": 31096928, "step": 46125 }, { "epoch": 1.1269635746219433, "grad_norm": 0.15443776547908783, "learning_rate": 1.9057379361709827e-06, "loss": 0.1037, "num_input_tokens_seen": 31100384, "step": 46130 }, { "epoch": 1.1270857254537903, "grad_norm": 1.693228006362915, "learning_rate": 1.9057017889762772e-06, "loss": 0.0674, "num_input_tokens_seen": 31103392, "step": 46135 }, { "epoch": 1.1272078762856375, "grad_norm": 0.08480946719646454, "learning_rate": 1.905665635195065e-06, "loss": 0.0368, "num_input_tokens_seen": 31107040, "step": 46140 }, { "epoch": 1.1273300271174846, "grad_norm": 0.12390197813510895, "learning_rate": 1.9056294748276081e-06, "loss": 0.0386, "num_input_tokens_seen": 31110112, "step": 46145 }, { "epoch": 1.1274521779493318, "grad_norm": 0.7042220234870911, "learning_rate": 1.9055933078741706e-06, "loss": 0.0635, "num_input_tokens_seen": 31113312, "step": 46150 }, { "epoch": 1.127574328781179, "grad_norm": 48.234859466552734, "learning_rate": 1.9055571343350148e-06, "loss": 0.0946, "num_input_tokens_seen": 31116768, "step": 46155 }, { "epoch": 1.1276964796130262, "grad_norm": 0.1741168200969696, "learning_rate": 1.905520954210404e-06, "loss": 0.0064, "num_input_tokens_seen": 31120544, "step": 46160 }, { "epoch": 1.1278186304448734, "grad_norm": 0.08128256350755692, "learning_rate": 1.9054847675006013e-06, "loss": 0.0645, "num_input_tokens_seen": 31123808, "step": 46165 }, { "epoch": 1.1279407812767204, "grad_norm": 0.11791330575942993, "learning_rate": 1.9054485742058697e-06, "loss": 0.0966, "num_input_tokens_seen": 31127264, "step": 46170 }, { "epoch": 1.1280629321085676, "grad_norm": 26.097557067871094, "learning_rate": 1.9054123743264725e-06, "loss": 0.0776, "num_input_tokens_seen": 31130976, "step": 46175 }, { "epoch": 1.1281850829404148, "grad_norm": 0.1617422103881836, "learning_rate": 1.9053761678626733e-06, "loss": 0.0974, "num_input_tokens_seen": 31133984, "step": 46180 }, { "epoch": 1.128307233772262, "grad_norm": 0.15047034621238708, "learning_rate": 1.9053399548147348e-06, "loss": 0.0061, "num_input_tokens_seen": 31137184, "step": 46185 }, { "epoch": 1.1284293846041091, "grad_norm": 0.13080565631389618, "learning_rate": 1.9053037351829207e-06, "loss": 0.0486, "num_input_tokens_seen": 31140448, "step": 46190 }, { "epoch": 1.1285515354359563, "grad_norm": 4.27202844619751, "learning_rate": 1.9052675089674942e-06, "loss": 0.1009, "num_input_tokens_seen": 31143712, "step": 46195 }, { "epoch": 1.1286736862678035, "grad_norm": 0.387268602848053, "learning_rate": 1.905231276168719e-06, "loss": 0.0236, "num_input_tokens_seen": 31146784, "step": 46200 }, { "epoch": 1.1287958370996507, "grad_norm": 4.517419338226318, "learning_rate": 1.9051950367868589e-06, "loss": 0.1454, "num_input_tokens_seen": 31150496, "step": 46205 }, { "epoch": 1.128917987931498, "grad_norm": 7.835855960845947, "learning_rate": 1.9051587908221766e-06, "loss": 0.1363, "num_input_tokens_seen": 31154208, "step": 46210 }, { "epoch": 1.129040138763345, "grad_norm": 119.49268341064453, "learning_rate": 1.905122538274936e-06, "loss": 0.0925, "num_input_tokens_seen": 31157600, "step": 46215 }, { "epoch": 1.129162289595192, "grad_norm": 0.29845014214515686, "learning_rate": 1.9050862791454011e-06, "loss": 0.0013, "num_input_tokens_seen": 31160800, "step": 46220 }, { "epoch": 1.1292844404270392, "grad_norm": 116.61756896972656, "learning_rate": 1.9050500134338353e-06, "loss": 0.199, "num_input_tokens_seen": 31164064, "step": 46225 }, { "epoch": 1.1294065912588864, "grad_norm": 0.298088014125824, "learning_rate": 1.9050137411405024e-06, "loss": 0.0435, "num_input_tokens_seen": 31167136, "step": 46230 }, { "epoch": 1.1295287420907336, "grad_norm": 0.2157052755355835, "learning_rate": 1.9049774622656661e-06, "loss": 0.0609, "num_input_tokens_seen": 31170144, "step": 46235 }, { "epoch": 1.1296508929225808, "grad_norm": 0.09936691075563431, "learning_rate": 1.90494117680959e-06, "loss": 0.0446, "num_input_tokens_seen": 31173280, "step": 46240 }, { "epoch": 1.129773043754428, "grad_norm": 36.17173385620117, "learning_rate": 1.9049048847725388e-06, "loss": 0.0281, "num_input_tokens_seen": 31177120, "step": 46245 }, { "epoch": 1.1298951945862752, "grad_norm": 32.515846252441406, "learning_rate": 1.9048685861547755e-06, "loss": 0.1817, "num_input_tokens_seen": 31180000, "step": 46250 }, { "epoch": 1.1300173454181224, "grad_norm": 0.07410810887813568, "learning_rate": 1.9048322809565644e-06, "loss": 0.0721, "num_input_tokens_seen": 31183200, "step": 46255 }, { "epoch": 1.1301394962499693, "grad_norm": 0.28702202439308167, "learning_rate": 1.90479596917817e-06, "loss": 0.0749, "num_input_tokens_seen": 31186720, "step": 46260 }, { "epoch": 1.1302616470818165, "grad_norm": 0.07194984704256058, "learning_rate": 1.9047596508198556e-06, "loss": 0.0618, "num_input_tokens_seen": 31189984, "step": 46265 }, { "epoch": 1.1303837979136637, "grad_norm": 1.1287305355072021, "learning_rate": 1.904723325881886e-06, "loss": 0.0574, "num_input_tokens_seen": 31193696, "step": 46270 }, { "epoch": 1.130505948745511, "grad_norm": 0.10475568473339081, "learning_rate": 1.9046869943645246e-06, "loss": 0.0477, "num_input_tokens_seen": 31197792, "step": 46275 }, { "epoch": 1.130628099577358, "grad_norm": 10.711601257324219, "learning_rate": 1.9046506562680365e-06, "loss": 0.0344, "num_input_tokens_seen": 31200992, "step": 46280 }, { "epoch": 1.1307502504092053, "grad_norm": 42.35076904296875, "learning_rate": 1.9046143115926851e-06, "loss": 0.0564, "num_input_tokens_seen": 31204128, "step": 46285 }, { "epoch": 1.1308724012410525, "grad_norm": 0.016980910673737526, "learning_rate": 1.9045779603387353e-06, "loss": 0.0887, "num_input_tokens_seen": 31207392, "step": 46290 }, { "epoch": 1.1309945520728997, "grad_norm": 106.5681381225586, "learning_rate": 1.9045416025064514e-06, "loss": 0.0545, "num_input_tokens_seen": 31210656, "step": 46295 }, { "epoch": 1.1311167029047469, "grad_norm": 0.17704324424266815, "learning_rate": 1.9045052380960972e-06, "loss": 0.0667, "num_input_tokens_seen": 31213792, "step": 46300 }, { "epoch": 1.131238853736594, "grad_norm": 2.8407654762268066, "learning_rate": 1.9044688671079382e-06, "loss": 0.1294, "num_input_tokens_seen": 31216672, "step": 46305 }, { "epoch": 1.131361004568441, "grad_norm": 0.3381950557231903, "learning_rate": 1.904432489542238e-06, "loss": 0.0577, "num_input_tokens_seen": 31220256, "step": 46310 }, { "epoch": 1.1314831554002882, "grad_norm": 0.33855950832366943, "learning_rate": 1.9043961053992616e-06, "loss": 0.0775, "num_input_tokens_seen": 31223776, "step": 46315 }, { "epoch": 1.1316053062321354, "grad_norm": 1.119606614112854, "learning_rate": 1.9043597146792733e-06, "loss": 0.0025, "num_input_tokens_seen": 31227168, "step": 46320 }, { "epoch": 1.1317274570639826, "grad_norm": 48.65851974487305, "learning_rate": 1.9043233173825382e-06, "loss": 0.2185, "num_input_tokens_seen": 31230368, "step": 46325 }, { "epoch": 1.1318496078958298, "grad_norm": 0.20404629409313202, "learning_rate": 1.9042869135093205e-06, "loss": 0.0088, "num_input_tokens_seen": 31233760, "step": 46330 }, { "epoch": 1.131971758727677, "grad_norm": 4.307328224182129, "learning_rate": 1.9042505030598853e-06, "loss": 0.0758, "num_input_tokens_seen": 31236896, "step": 46335 }, { "epoch": 1.1320939095595242, "grad_norm": 0.06919888406991959, "learning_rate": 1.904214086034497e-06, "loss": 0.0013, "num_input_tokens_seen": 31240928, "step": 46340 }, { "epoch": 1.1322160603913713, "grad_norm": 62.974281311035156, "learning_rate": 1.9041776624334206e-06, "loss": 0.0048, "num_input_tokens_seen": 31244448, "step": 46345 }, { "epoch": 1.1323382112232183, "grad_norm": 0.3358139991760254, "learning_rate": 1.9041412322569212e-06, "loss": 0.1378, "num_input_tokens_seen": 31247328, "step": 46350 }, { "epoch": 1.1324603620550655, "grad_norm": 0.21907946467399597, "learning_rate": 1.9041047955052639e-06, "loss": 0.1222, "num_input_tokens_seen": 31250464, "step": 46355 }, { "epoch": 1.1325825128869127, "grad_norm": 0.13157333433628082, "learning_rate": 1.9040683521787128e-06, "loss": 0.1001, "num_input_tokens_seen": 31253856, "step": 46360 }, { "epoch": 1.1327046637187599, "grad_norm": 12.132460594177246, "learning_rate": 1.9040319022775337e-06, "loss": 0.2089, "num_input_tokens_seen": 31256864, "step": 46365 }, { "epoch": 1.132826814550607, "grad_norm": 0.45818689465522766, "learning_rate": 1.9039954458019918e-06, "loss": 0.156, "num_input_tokens_seen": 31260064, "step": 46370 }, { "epoch": 1.1329489653824543, "grad_norm": 10.195818901062012, "learning_rate": 1.9039589827523512e-06, "loss": 0.1985, "num_input_tokens_seen": 31263328, "step": 46375 }, { "epoch": 1.1330711162143015, "grad_norm": 0.5322380661964417, "learning_rate": 1.903922513128878e-06, "loss": 0.0926, "num_input_tokens_seen": 31266592, "step": 46380 }, { "epoch": 1.1331932670461486, "grad_norm": 0.13091324269771576, "learning_rate": 1.9038860369318375e-06, "loss": 0.035, "num_input_tokens_seen": 31270560, "step": 46385 }, { "epoch": 1.1333154178779958, "grad_norm": 15.070964813232422, "learning_rate": 1.9038495541614945e-06, "loss": 0.0376, "num_input_tokens_seen": 31273760, "step": 46390 }, { "epoch": 1.133437568709843, "grad_norm": 0.10301220417022705, "learning_rate": 1.903813064818114e-06, "loss": 0.0388, "num_input_tokens_seen": 31277024, "step": 46395 }, { "epoch": 1.13355971954169, "grad_norm": 0.348623126745224, "learning_rate": 1.9037765689019622e-06, "loss": 0.0996, "num_input_tokens_seen": 31280160, "step": 46400 }, { "epoch": 1.1336818703735372, "grad_norm": 0.0504717081785202, "learning_rate": 1.9037400664133042e-06, "loss": 0.1017, "num_input_tokens_seen": 31283616, "step": 46405 }, { "epoch": 1.1338040212053844, "grad_norm": 0.7688374519348145, "learning_rate": 1.903703557352405e-06, "loss": 0.0062, "num_input_tokens_seen": 31287072, "step": 46410 }, { "epoch": 1.1339261720372316, "grad_norm": 8.846529960632324, "learning_rate": 1.9036670417195306e-06, "loss": 0.1631, "num_input_tokens_seen": 31290016, "step": 46415 }, { "epoch": 1.1340483228690788, "grad_norm": 105.6480712890625, "learning_rate": 1.9036305195149464e-06, "loss": 0.1306, "num_input_tokens_seen": 31293728, "step": 46420 }, { "epoch": 1.134170473700926, "grad_norm": 0.15042132139205933, "learning_rate": 1.9035939907389182e-06, "loss": 0.061, "num_input_tokens_seen": 31296672, "step": 46425 }, { "epoch": 1.1342926245327731, "grad_norm": 0.11271711438894272, "learning_rate": 1.9035574553917112e-06, "loss": 0.1522, "num_input_tokens_seen": 31300640, "step": 46430 }, { "epoch": 1.1344147753646203, "grad_norm": 1.1958802938461304, "learning_rate": 1.9035209134735916e-06, "loss": 0.0435, "num_input_tokens_seen": 31303968, "step": 46435 }, { "epoch": 1.1345369261964673, "grad_norm": 45.54319381713867, "learning_rate": 1.9034843649848248e-06, "loss": 0.1184, "num_input_tokens_seen": 31307040, "step": 46440 }, { "epoch": 1.1346590770283145, "grad_norm": 1.4599428176879883, "learning_rate": 1.9034478099256765e-06, "loss": 0.1014, "num_input_tokens_seen": 31310368, "step": 46445 }, { "epoch": 1.1347812278601617, "grad_norm": 1.1676909923553467, "learning_rate": 1.9034112482964128e-06, "loss": 0.0286, "num_input_tokens_seen": 31313376, "step": 46450 }, { "epoch": 1.1349033786920089, "grad_norm": 0.18391495943069458, "learning_rate": 1.9033746800972999e-06, "loss": 0.0489, "num_input_tokens_seen": 31316960, "step": 46455 }, { "epoch": 1.135025529523856, "grad_norm": 0.20511262118816376, "learning_rate": 1.903338105328603e-06, "loss": 0.0743, "num_input_tokens_seen": 31320096, "step": 46460 }, { "epoch": 1.1351476803557032, "grad_norm": 0.14756199717521667, "learning_rate": 1.9033015239905885e-06, "loss": 0.0019, "num_input_tokens_seen": 31323744, "step": 46465 }, { "epoch": 1.1352698311875504, "grad_norm": 1.0843437910079956, "learning_rate": 1.9032649360835222e-06, "loss": 0.0281, "num_input_tokens_seen": 31326944, "step": 46470 }, { "epoch": 1.1353919820193976, "grad_norm": 0.18381133675575256, "learning_rate": 1.9032283416076704e-06, "loss": 0.0533, "num_input_tokens_seen": 31330080, "step": 46475 }, { "epoch": 1.1355141328512448, "grad_norm": 0.07421161234378815, "learning_rate": 1.9031917405632993e-06, "loss": 0.1661, "num_input_tokens_seen": 31333856, "step": 46480 }, { "epoch": 1.135636283683092, "grad_norm": 0.14352209866046906, "learning_rate": 1.903155132950675e-06, "loss": 0.0009, "num_input_tokens_seen": 31336928, "step": 46485 }, { "epoch": 1.135758434514939, "grad_norm": 124.78722381591797, "learning_rate": 1.9031185187700634e-06, "loss": 0.1065, "num_input_tokens_seen": 31340320, "step": 46490 }, { "epoch": 1.1358805853467862, "grad_norm": 1.1804207563400269, "learning_rate": 1.9030818980217313e-06, "loss": 0.0969, "num_input_tokens_seen": 31343392, "step": 46495 }, { "epoch": 1.1360027361786333, "grad_norm": 26.019861221313477, "learning_rate": 1.9030452707059445e-06, "loss": 0.1355, "num_input_tokens_seen": 31346848, "step": 46500 }, { "epoch": 1.1361248870104805, "grad_norm": 0.1707308143377304, "learning_rate": 1.9030086368229696e-06, "loss": 0.0748, "num_input_tokens_seen": 31350432, "step": 46505 }, { "epoch": 1.1362470378423277, "grad_norm": 8.230561256408691, "learning_rate": 1.9029719963730732e-06, "loss": 0.0875, "num_input_tokens_seen": 31354656, "step": 46510 }, { "epoch": 1.136369188674175, "grad_norm": 0.444359689950943, "learning_rate": 1.9029353493565215e-06, "loss": 0.0497, "num_input_tokens_seen": 31358176, "step": 46515 }, { "epoch": 1.136491339506022, "grad_norm": 0.6157476305961609, "learning_rate": 1.9028986957735808e-06, "loss": 0.0028, "num_input_tokens_seen": 31361056, "step": 46520 }, { "epoch": 1.1366134903378693, "grad_norm": 46.505859375, "learning_rate": 1.9028620356245185e-06, "loss": 0.0854, "num_input_tokens_seen": 31363808, "step": 46525 }, { "epoch": 1.1367356411697163, "grad_norm": 10.289785385131836, "learning_rate": 1.9028253689096e-06, "loss": 0.114, "num_input_tokens_seen": 31367392, "step": 46530 }, { "epoch": 1.1368577920015634, "grad_norm": 102.41519165039062, "learning_rate": 1.902788695629093e-06, "loss": 0.0493, "num_input_tokens_seen": 31370912, "step": 46535 }, { "epoch": 1.1369799428334106, "grad_norm": 0.1399715542793274, "learning_rate": 1.902752015783264e-06, "loss": 0.0007, "num_input_tokens_seen": 31374240, "step": 46540 }, { "epoch": 1.1371020936652578, "grad_norm": 0.10617326200008392, "learning_rate": 1.902715329372379e-06, "loss": 0.2546, "num_input_tokens_seen": 31377632, "step": 46545 }, { "epoch": 1.137224244497105, "grad_norm": 0.24258571863174438, "learning_rate": 1.9026786363967056e-06, "loss": 0.0117, "num_input_tokens_seen": 31381088, "step": 46550 }, { "epoch": 1.1373463953289522, "grad_norm": 6.267822265625, "learning_rate": 1.9026419368565103e-06, "loss": 0.0839, "num_input_tokens_seen": 31384416, "step": 46555 }, { "epoch": 1.1374685461607994, "grad_norm": 0.2888457179069519, "learning_rate": 1.90260523075206e-06, "loss": 0.1168, "num_input_tokens_seen": 31387488, "step": 46560 }, { "epoch": 1.1375906969926466, "grad_norm": 23.509075164794922, "learning_rate": 1.9025685180836218e-06, "loss": 0.2021, "num_input_tokens_seen": 31391008, "step": 46565 }, { "epoch": 1.1377128478244938, "grad_norm": 0.20501813292503357, "learning_rate": 1.9025317988514624e-06, "loss": 0.0107, "num_input_tokens_seen": 31394208, "step": 46570 }, { "epoch": 1.137834998656341, "grad_norm": 0.46571019291877747, "learning_rate": 1.9024950730558493e-06, "loss": 0.139, "num_input_tokens_seen": 31397408, "step": 46575 }, { "epoch": 1.137957149488188, "grad_norm": 0.25909340381622314, "learning_rate": 1.902458340697049e-06, "loss": 0.0533, "num_input_tokens_seen": 31400992, "step": 46580 }, { "epoch": 1.1380793003200351, "grad_norm": 0.20560549199581146, "learning_rate": 1.902421601775329e-06, "loss": 0.0471, "num_input_tokens_seen": 31404448, "step": 46585 }, { "epoch": 1.1382014511518823, "grad_norm": 0.46063530445098877, "learning_rate": 1.9023848562909566e-06, "loss": 0.1839, "num_input_tokens_seen": 31407904, "step": 46590 }, { "epoch": 1.1383236019837295, "grad_norm": 0.26652881503105164, "learning_rate": 1.9023481042441985e-06, "loss": 0.1041, "num_input_tokens_seen": 31410976, "step": 46595 }, { "epoch": 1.1384457528155767, "grad_norm": 0.34124669432640076, "learning_rate": 1.9023113456353225e-06, "loss": 0.0976, "num_input_tokens_seen": 31414304, "step": 46600 }, { "epoch": 1.1385679036474239, "grad_norm": 77.86670684814453, "learning_rate": 1.9022745804645958e-06, "loss": 0.1017, "num_input_tokens_seen": 31417696, "step": 46605 }, { "epoch": 1.138690054479271, "grad_norm": 0.6383596062660217, "learning_rate": 1.9022378087322855e-06, "loss": 0.0565, "num_input_tokens_seen": 31420832, "step": 46610 }, { "epoch": 1.1388122053111183, "grad_norm": 0.22451035678386688, "learning_rate": 1.9022010304386588e-06, "loss": 0.0414, "num_input_tokens_seen": 31424288, "step": 46615 }, { "epoch": 1.1389343561429652, "grad_norm": 0.03640579804778099, "learning_rate": 1.902164245583984e-06, "loss": 0.0571, "num_input_tokens_seen": 31427680, "step": 46620 }, { "epoch": 1.1390565069748124, "grad_norm": 0.8211766481399536, "learning_rate": 1.9021274541685282e-06, "loss": 0.0807, "num_input_tokens_seen": 31430880, "step": 46625 }, { "epoch": 1.1391786578066596, "grad_norm": 0.2223214954137802, "learning_rate": 1.9020906561925587e-06, "loss": 0.1928, "num_input_tokens_seen": 31434144, "step": 46630 }, { "epoch": 1.1393008086385068, "grad_norm": 0.1394394040107727, "learning_rate": 1.9020538516563436e-06, "loss": 0.0031, "num_input_tokens_seen": 31437472, "step": 46635 }, { "epoch": 1.139422959470354, "grad_norm": 8.12044906616211, "learning_rate": 1.9020170405601498e-06, "loss": 0.1425, "num_input_tokens_seen": 31440672, "step": 46640 }, { "epoch": 1.1395451103022012, "grad_norm": 0.6035500764846802, "learning_rate": 1.9019802229042458e-06, "loss": 0.0012, "num_input_tokens_seen": 31443872, "step": 46645 }, { "epoch": 1.1396672611340484, "grad_norm": 0.03213443607091904, "learning_rate": 1.901943398688899e-06, "loss": 0.0506, "num_input_tokens_seen": 31447136, "step": 46650 }, { "epoch": 1.1397894119658956, "grad_norm": 45.71347427368164, "learning_rate": 1.901906567914377e-06, "loss": 0.2119, "num_input_tokens_seen": 31450656, "step": 46655 }, { "epoch": 1.1399115627977428, "grad_norm": 0.019582994282245636, "learning_rate": 1.9018697305809482e-06, "loss": 0.0027, "num_input_tokens_seen": 31453536, "step": 46660 }, { "epoch": 1.14003371362959, "grad_norm": 73.76444244384766, "learning_rate": 1.9018328866888798e-06, "loss": 0.1224, "num_input_tokens_seen": 31456672, "step": 46665 }, { "epoch": 1.140155864461437, "grad_norm": 0.5211069583892822, "learning_rate": 1.9017960362384402e-06, "loss": 0.1004, "num_input_tokens_seen": 31460192, "step": 46670 }, { "epoch": 1.140278015293284, "grad_norm": 0.09799375385046005, "learning_rate": 1.9017591792298974e-06, "loss": 0.1509, "num_input_tokens_seen": 31463328, "step": 46675 }, { "epoch": 1.1404001661251313, "grad_norm": 0.06055364012718201, "learning_rate": 1.9017223156635191e-06, "loss": 0.0268, "num_input_tokens_seen": 31466656, "step": 46680 }, { "epoch": 1.1405223169569785, "grad_norm": 0.13429653644561768, "learning_rate": 1.901685445539574e-06, "loss": 0.1924, "num_input_tokens_seen": 31469856, "step": 46685 }, { "epoch": 1.1406444677888257, "grad_norm": 0.21500791609287262, "learning_rate": 1.9016485688583295e-06, "loss": 0.0013, "num_input_tokens_seen": 31473184, "step": 46690 }, { "epoch": 1.1407666186206729, "grad_norm": 151.6306610107422, "learning_rate": 1.901611685620054e-06, "loss": 0.0728, "num_input_tokens_seen": 31476448, "step": 46695 }, { "epoch": 1.14088876945252, "grad_norm": 116.54022216796875, "learning_rate": 1.901574795825016e-06, "loss": 0.0809, "num_input_tokens_seen": 31479776, "step": 46700 }, { "epoch": 1.141010920284367, "grad_norm": 0.4956272840499878, "learning_rate": 1.9015378994734834e-06, "loss": 0.2304, "num_input_tokens_seen": 31483104, "step": 46705 }, { "epoch": 1.1411330711162142, "grad_norm": 32.742679595947266, "learning_rate": 1.901500996565725e-06, "loss": 0.0266, "num_input_tokens_seen": 31486432, "step": 46710 }, { "epoch": 1.1412552219480614, "grad_norm": 10.730284690856934, "learning_rate": 1.9014640871020084e-06, "loss": 0.1238, "num_input_tokens_seen": 31489952, "step": 46715 }, { "epoch": 1.1413773727799086, "grad_norm": 169.406005859375, "learning_rate": 1.9014271710826032e-06, "loss": 0.0747, "num_input_tokens_seen": 31493280, "step": 46720 }, { "epoch": 1.1414995236117558, "grad_norm": 0.1027897372841835, "learning_rate": 1.9013902485077767e-06, "loss": 0.0029, "num_input_tokens_seen": 31496608, "step": 46725 }, { "epoch": 1.141621674443603, "grad_norm": 0.20650623738765717, "learning_rate": 1.9013533193777977e-06, "loss": 0.0033, "num_input_tokens_seen": 31499616, "step": 46730 }, { "epoch": 1.1417438252754502, "grad_norm": 0.04875501990318298, "learning_rate": 1.9013163836929351e-06, "loss": 0.1099, "num_input_tokens_seen": 31502944, "step": 46735 }, { "epoch": 1.1418659761072973, "grad_norm": 0.8934175968170166, "learning_rate": 1.9012794414534574e-06, "loss": 0.0803, "num_input_tokens_seen": 31506592, "step": 46740 }, { "epoch": 1.1419881269391445, "grad_norm": 0.6407132148742676, "learning_rate": 1.9012424926596333e-06, "loss": 0.0397, "num_input_tokens_seen": 31509792, "step": 46745 }, { "epoch": 1.1421102777709917, "grad_norm": 0.04105055704712868, "learning_rate": 1.9012055373117312e-06, "loss": 0.0027, "num_input_tokens_seen": 31513056, "step": 46750 }, { "epoch": 1.1422324286028387, "grad_norm": 35.02225875854492, "learning_rate": 1.9011685754100202e-06, "loss": 0.1939, "num_input_tokens_seen": 31516896, "step": 46755 }, { "epoch": 1.1423545794346859, "grad_norm": 31.574607849121094, "learning_rate": 1.901131606954769e-06, "loss": 0.1571, "num_input_tokens_seen": 31520032, "step": 46760 }, { "epoch": 1.142476730266533, "grad_norm": 0.2570955157279968, "learning_rate": 1.901094631946246e-06, "loss": 0.058, "num_input_tokens_seen": 31523296, "step": 46765 }, { "epoch": 1.1425988810983803, "grad_norm": 3.1613028049468994, "learning_rate": 1.9010576503847207e-06, "loss": 0.2385, "num_input_tokens_seen": 31526368, "step": 46770 }, { "epoch": 1.1427210319302274, "grad_norm": 0.1640394628047943, "learning_rate": 1.901020662270462e-06, "loss": 0.1712, "num_input_tokens_seen": 31530080, "step": 46775 }, { "epoch": 1.1428431827620746, "grad_norm": 1.545048713684082, "learning_rate": 1.9009836676037382e-06, "loss": 0.1039, "num_input_tokens_seen": 31533856, "step": 46780 }, { "epoch": 1.1429653335939218, "grad_norm": 21.144142150878906, "learning_rate": 1.900946666384819e-06, "loss": 0.0891, "num_input_tokens_seen": 31537504, "step": 46785 }, { "epoch": 1.143087484425769, "grad_norm": 0.20183810591697693, "learning_rate": 1.9009096586139737e-06, "loss": 0.1749, "num_input_tokens_seen": 31540576, "step": 46790 }, { "epoch": 1.143209635257616, "grad_norm": 9.08875846862793, "learning_rate": 1.9008726442914708e-06, "loss": 0.099, "num_input_tokens_seen": 31543392, "step": 46795 }, { "epoch": 1.1433317860894632, "grad_norm": 0.41724827885627747, "learning_rate": 1.9008356234175794e-06, "loss": 0.0932, "num_input_tokens_seen": 31546848, "step": 46800 }, { "epoch": 1.1434539369213104, "grad_norm": 0.13978880643844604, "learning_rate": 1.9007985959925697e-06, "loss": 0.0097, "num_input_tokens_seen": 31550176, "step": 46805 }, { "epoch": 1.1435760877531576, "grad_norm": 30.22905158996582, "learning_rate": 1.9007615620167098e-06, "loss": 0.073, "num_input_tokens_seen": 31554016, "step": 46810 }, { "epoch": 1.1436982385850047, "grad_norm": 0.1616167426109314, "learning_rate": 1.9007245214902698e-06, "loss": 0.0379, "num_input_tokens_seen": 31557152, "step": 46815 }, { "epoch": 1.143820389416852, "grad_norm": 2.9157519340515137, "learning_rate": 1.900687474413519e-06, "loss": 0.0507, "num_input_tokens_seen": 31560480, "step": 46820 }, { "epoch": 1.1439425402486991, "grad_norm": 0.09915733337402344, "learning_rate": 1.9006504207867261e-06, "loss": 0.0025, "num_input_tokens_seen": 31563552, "step": 46825 }, { "epoch": 1.1440646910805463, "grad_norm": 0.6402256488800049, "learning_rate": 1.9006133606101615e-06, "loss": 0.0561, "num_input_tokens_seen": 31566752, "step": 46830 }, { "epoch": 1.1441868419123935, "grad_norm": 10.9564790725708, "learning_rate": 1.9005762938840942e-06, "loss": 0.1262, "num_input_tokens_seen": 31570080, "step": 46835 }, { "epoch": 1.1443089927442407, "grad_norm": 1.3392218351364136, "learning_rate": 1.9005392206087937e-06, "loss": 0.0728, "num_input_tokens_seen": 31573856, "step": 46840 }, { "epoch": 1.1444311435760877, "grad_norm": 10.963790893554688, "learning_rate": 1.9005021407845302e-06, "loss": 0.082, "num_input_tokens_seen": 31577376, "step": 46845 }, { "epoch": 1.1445532944079349, "grad_norm": 86.67719268798828, "learning_rate": 1.9004650544115726e-06, "loss": 0.1102, "num_input_tokens_seen": 31580704, "step": 46850 }, { "epoch": 1.144675445239782, "grad_norm": 0.4809873104095459, "learning_rate": 1.9004279614901908e-06, "loss": 0.0784, "num_input_tokens_seen": 31583648, "step": 46855 }, { "epoch": 1.1447975960716292, "grad_norm": 17.987932205200195, "learning_rate": 1.9003908620206548e-06, "loss": 0.0909, "num_input_tokens_seen": 31587296, "step": 46860 }, { "epoch": 1.1449197469034764, "grad_norm": 44.341793060302734, "learning_rate": 1.9003537560032344e-06, "loss": 0.1251, "num_input_tokens_seen": 31590496, "step": 46865 }, { "epoch": 1.1450418977353236, "grad_norm": 0.029724106192588806, "learning_rate": 1.9003166434381991e-06, "loss": 0.0921, "num_input_tokens_seen": 31594144, "step": 46870 }, { "epoch": 1.1451640485671708, "grad_norm": 0.1720057874917984, "learning_rate": 1.9002795243258194e-06, "loss": 0.0178, "num_input_tokens_seen": 31598240, "step": 46875 }, { "epoch": 1.145286199399018, "grad_norm": 20.10298728942871, "learning_rate": 1.9002423986663645e-06, "loss": 0.0763, "num_input_tokens_seen": 31601632, "step": 46880 }, { "epoch": 1.145408350230865, "grad_norm": 0.6785484552383423, "learning_rate": 1.9002052664601048e-06, "loss": 0.0697, "num_input_tokens_seen": 31605344, "step": 46885 }, { "epoch": 1.1455305010627121, "grad_norm": 1.321736454963684, "learning_rate": 1.9001681277073103e-06, "loss": 0.0721, "num_input_tokens_seen": 31608608, "step": 46890 }, { "epoch": 1.1456526518945593, "grad_norm": 8.711379051208496, "learning_rate": 1.9001309824082512e-06, "loss": 0.1639, "num_input_tokens_seen": 31612000, "step": 46895 }, { "epoch": 1.1457748027264065, "grad_norm": 13.112317085266113, "learning_rate": 1.9000938305631974e-06, "loss": 0.2635, "num_input_tokens_seen": 31615072, "step": 46900 }, { "epoch": 1.1458969535582537, "grad_norm": 1.1117407083511353, "learning_rate": 1.9000566721724193e-06, "loss": 0.1127, "num_input_tokens_seen": 31618272, "step": 46905 }, { "epoch": 1.146019104390101, "grad_norm": 0.35874027013778687, "learning_rate": 1.9000195072361866e-06, "loss": 0.0044, "num_input_tokens_seen": 31621536, "step": 46910 }, { "epoch": 1.146141255221948, "grad_norm": 9.392590522766113, "learning_rate": 1.89998233575477e-06, "loss": 0.074, "num_input_tokens_seen": 31624992, "step": 46915 }, { "epoch": 1.1462634060537953, "grad_norm": 0.24224503338336945, "learning_rate": 1.8999451577284403e-06, "loss": 0.0017, "num_input_tokens_seen": 31628064, "step": 46920 }, { "epoch": 1.1463855568856425, "grad_norm": 85.96072387695312, "learning_rate": 1.899907973157467e-06, "loss": 0.1145, "num_input_tokens_seen": 31631904, "step": 46925 }, { "epoch": 1.1465077077174897, "grad_norm": 78.0488510131836, "learning_rate": 1.899870782042121e-06, "loss": 0.0367, "num_input_tokens_seen": 31635680, "step": 46930 }, { "epoch": 1.1466298585493366, "grad_norm": 37.96966552734375, "learning_rate": 1.8998335843826724e-06, "loss": 0.1674, "num_input_tokens_seen": 31639008, "step": 46935 }, { "epoch": 1.1467520093811838, "grad_norm": 0.22917182743549347, "learning_rate": 1.899796380179392e-06, "loss": 0.0481, "num_input_tokens_seen": 31642208, "step": 46940 }, { "epoch": 1.146874160213031, "grad_norm": 0.07978054881095886, "learning_rate": 1.8997591694325505e-06, "loss": 0.0481, "num_input_tokens_seen": 31645856, "step": 46945 }, { "epoch": 1.1469963110448782, "grad_norm": 0.15004077553749084, "learning_rate": 1.8997219521424184e-06, "loss": 0.0102, "num_input_tokens_seen": 31649120, "step": 46950 }, { "epoch": 1.1471184618767254, "grad_norm": 0.09087875485420227, "learning_rate": 1.8996847283092658e-06, "loss": 0.1831, "num_input_tokens_seen": 31652128, "step": 46955 }, { "epoch": 1.1472406127085726, "grad_norm": 113.01051330566406, "learning_rate": 1.8996474979333645e-06, "loss": 0.0683, "num_input_tokens_seen": 31655392, "step": 46960 }, { "epoch": 1.1473627635404198, "grad_norm": 43.2165412902832, "learning_rate": 1.8996102610149843e-06, "loss": 0.1242, "num_input_tokens_seen": 31659552, "step": 46965 }, { "epoch": 1.147484914372267, "grad_norm": 8.446593284606934, "learning_rate": 1.8995730175543962e-06, "loss": 0.0492, "num_input_tokens_seen": 31662496, "step": 46970 }, { "epoch": 1.147607065204114, "grad_norm": 0.12521806359291077, "learning_rate": 1.899535767551871e-06, "loss": 0.0566, "num_input_tokens_seen": 31665696, "step": 46975 }, { "epoch": 1.1477292160359611, "grad_norm": 49.258235931396484, "learning_rate": 1.8994985110076802e-06, "loss": 0.0329, "num_input_tokens_seen": 31669856, "step": 46980 }, { "epoch": 1.1478513668678083, "grad_norm": 0.5356413722038269, "learning_rate": 1.8994612479220942e-06, "loss": 0.0303, "num_input_tokens_seen": 31672736, "step": 46985 }, { "epoch": 1.1479735176996555, "grad_norm": 0.22916997969150543, "learning_rate": 1.8994239782953838e-06, "loss": 0.0517, "num_input_tokens_seen": 31676128, "step": 46990 }, { "epoch": 1.1480956685315027, "grad_norm": 2.0166921615600586, "learning_rate": 1.8993867021278205e-06, "loss": 0.0416, "num_input_tokens_seen": 31679264, "step": 46995 }, { "epoch": 1.1482178193633499, "grad_norm": 0.0886596217751503, "learning_rate": 1.8993494194196754e-06, "loss": 0.0559, "num_input_tokens_seen": 31682144, "step": 47000 }, { "epoch": 1.148339970195197, "grad_norm": 0.19293953478336334, "learning_rate": 1.8993121301712192e-06, "loss": 0.1114, "num_input_tokens_seen": 31685024, "step": 47005 }, { "epoch": 1.1484621210270443, "grad_norm": 0.08507690578699112, "learning_rate": 1.8992748343827233e-06, "loss": 0.0433, "num_input_tokens_seen": 31688288, "step": 47010 }, { "epoch": 1.1485842718588914, "grad_norm": 18.874502182006836, "learning_rate": 1.8992375320544589e-06, "loss": 0.2048, "num_input_tokens_seen": 31691296, "step": 47015 }, { "epoch": 1.1487064226907386, "grad_norm": 8.155969619750977, "learning_rate": 1.8992002231866975e-06, "loss": 0.134, "num_input_tokens_seen": 31694688, "step": 47020 }, { "epoch": 1.1488285735225856, "grad_norm": 5.913618564605713, "learning_rate": 1.89916290777971e-06, "loss": 0.0332, "num_input_tokens_seen": 31698208, "step": 47025 }, { "epoch": 1.1489507243544328, "grad_norm": 0.2968266010284424, "learning_rate": 1.899125585833768e-06, "loss": 0.124, "num_input_tokens_seen": 31701600, "step": 47030 }, { "epoch": 1.14907287518628, "grad_norm": 0.13521993160247803, "learning_rate": 1.8990882573491432e-06, "loss": 0.1013, "num_input_tokens_seen": 31704928, "step": 47035 }, { "epoch": 1.1491950260181272, "grad_norm": 17.11520767211914, "learning_rate": 1.8990509223261064e-06, "loss": 0.1103, "num_input_tokens_seen": 31708640, "step": 47040 }, { "epoch": 1.1493171768499744, "grad_norm": 69.05915069580078, "learning_rate": 1.8990135807649295e-06, "loss": 0.1072, "num_input_tokens_seen": 31711968, "step": 47045 }, { "epoch": 1.1494393276818216, "grad_norm": 0.7382350564002991, "learning_rate": 1.898976232665884e-06, "loss": 0.0083, "num_input_tokens_seen": 31715552, "step": 47050 }, { "epoch": 1.1495614785136687, "grad_norm": 0.20631414651870728, "learning_rate": 1.8989388780292418e-06, "loss": 0.0794, "num_input_tokens_seen": 31718624, "step": 47055 }, { "epoch": 1.149683629345516, "grad_norm": 0.3034517765045166, "learning_rate": 1.8989015168552743e-06, "loss": 0.0679, "num_input_tokens_seen": 31722080, "step": 47060 }, { "epoch": 1.149805780177363, "grad_norm": 0.11916504055261612, "learning_rate": 1.898864149144253e-06, "loss": 0.0021, "num_input_tokens_seen": 31725536, "step": 47065 }, { "epoch": 1.14992793100921, "grad_norm": 0.3561840355396271, "learning_rate": 1.89882677489645e-06, "loss": 0.126, "num_input_tokens_seen": 31728480, "step": 47070 }, { "epoch": 1.1500500818410573, "grad_norm": 0.2590155303478241, "learning_rate": 1.898789394112137e-06, "loss": 0.0025, "num_input_tokens_seen": 31731552, "step": 47075 }, { "epoch": 1.1501722326729045, "grad_norm": 0.07878569513559341, "learning_rate": 1.8987520067915854e-06, "loss": 0.0754, "num_input_tokens_seen": 31735008, "step": 47080 }, { "epoch": 1.1502943835047517, "grad_norm": 60.1794319152832, "learning_rate": 1.8987146129350678e-06, "loss": 0.0624, "num_input_tokens_seen": 31737952, "step": 47085 }, { "epoch": 1.1504165343365989, "grad_norm": 0.09747473150491714, "learning_rate": 1.8986772125428558e-06, "loss": 0.0826, "num_input_tokens_seen": 31741408, "step": 47090 }, { "epoch": 1.150538685168446, "grad_norm": 215.47508239746094, "learning_rate": 1.8986398056152212e-06, "loss": 0.1433, "num_input_tokens_seen": 31744928, "step": 47095 }, { "epoch": 1.1506608360002932, "grad_norm": 0.09226039052009583, "learning_rate": 1.8986023921524364e-06, "loss": 0.1718, "num_input_tokens_seen": 31748640, "step": 47100 }, { "epoch": 1.1507829868321404, "grad_norm": 0.1691044718027115, "learning_rate": 1.8985649721547732e-06, "loss": 0.1613, "num_input_tokens_seen": 31752416, "step": 47105 }, { "epoch": 1.1509051376639876, "grad_norm": 0.20411577820777893, "learning_rate": 1.8985275456225038e-06, "loss": 0.1354, "num_input_tokens_seen": 31755744, "step": 47110 }, { "epoch": 1.1510272884958346, "grad_norm": 0.38191160559654236, "learning_rate": 1.8984901125559006e-06, "loss": 0.0489, "num_input_tokens_seen": 31759072, "step": 47115 }, { "epoch": 1.1511494393276818, "grad_norm": 0.07923942804336548, "learning_rate": 1.8984526729552354e-06, "loss": 0.051, "num_input_tokens_seen": 31762208, "step": 47120 }, { "epoch": 1.151271590159529, "grad_norm": 0.2053736001253128, "learning_rate": 1.898415226820781e-06, "loss": 0.1468, "num_input_tokens_seen": 31765472, "step": 47125 }, { "epoch": 1.1513937409913761, "grad_norm": 84.87515258789062, "learning_rate": 1.8983777741528094e-06, "loss": 0.1513, "num_input_tokens_seen": 31768352, "step": 47130 }, { "epoch": 1.1515158918232233, "grad_norm": 0.20891521871089935, "learning_rate": 1.8983403149515928e-06, "loss": 0.0462, "num_input_tokens_seen": 31772576, "step": 47135 }, { "epoch": 1.1516380426550705, "grad_norm": 10.00893783569336, "learning_rate": 1.8983028492174037e-06, "loss": 0.0464, "num_input_tokens_seen": 31775776, "step": 47140 }, { "epoch": 1.1517601934869177, "grad_norm": 0.5158972144126892, "learning_rate": 1.898265376950515e-06, "loss": 0.0162, "num_input_tokens_seen": 31779168, "step": 47145 }, { "epoch": 1.151882344318765, "grad_norm": 10.742677688598633, "learning_rate": 1.8982278981511986e-06, "loss": 0.1282, "num_input_tokens_seen": 31782304, "step": 47150 }, { "epoch": 1.1520044951506119, "grad_norm": 0.17852243781089783, "learning_rate": 1.8981904128197274e-06, "loss": 0.1716, "num_input_tokens_seen": 31785824, "step": 47155 }, { "epoch": 1.152126645982459, "grad_norm": 0.6007935404777527, "learning_rate": 1.898152920956374e-06, "loss": 0.0282, "num_input_tokens_seen": 31789088, "step": 47160 }, { "epoch": 1.1522487968143063, "grad_norm": 1.4212846755981445, "learning_rate": 1.8981154225614108e-06, "loss": 0.0165, "num_input_tokens_seen": 31792352, "step": 47165 }, { "epoch": 1.1523709476461534, "grad_norm": 0.3089074194431305, "learning_rate": 1.8980779176351112e-06, "loss": 0.04, "num_input_tokens_seen": 31795680, "step": 47170 }, { "epoch": 1.1524930984780006, "grad_norm": 22.73480987548828, "learning_rate": 1.8980404061777468e-06, "loss": 0.1514, "num_input_tokens_seen": 31798816, "step": 47175 }, { "epoch": 1.1526152493098478, "grad_norm": 0.07772679626941681, "learning_rate": 1.8980028881895916e-06, "loss": 0.0017, "num_input_tokens_seen": 31801888, "step": 47180 }, { "epoch": 1.152737400141695, "grad_norm": 0.057387080043554306, "learning_rate": 1.8979653636709173e-06, "loss": 0.0878, "num_input_tokens_seen": 31805408, "step": 47185 }, { "epoch": 1.1528595509735422, "grad_norm": 0.061906807124614716, "learning_rate": 1.8979278326219977e-06, "loss": 0.0473, "num_input_tokens_seen": 31808672, "step": 47190 }, { "epoch": 1.1529817018053894, "grad_norm": 34.71351623535156, "learning_rate": 1.8978902950431052e-06, "loss": 0.0428, "num_input_tokens_seen": 31812256, "step": 47195 }, { "epoch": 1.1531038526372366, "grad_norm": 25.56401824951172, "learning_rate": 1.897852750934513e-06, "loss": 0.0926, "num_input_tokens_seen": 31815840, "step": 47200 }, { "epoch": 1.1532260034690835, "grad_norm": 0.13676181435585022, "learning_rate": 1.8978152002964943e-06, "loss": 0.0025, "num_input_tokens_seen": 31819488, "step": 47205 }, { "epoch": 1.1533481543009307, "grad_norm": 0.024842863902449608, "learning_rate": 1.8977776431293218e-06, "loss": 0.2126, "num_input_tokens_seen": 31822880, "step": 47210 }, { "epoch": 1.153470305132778, "grad_norm": 0.13829733431339264, "learning_rate": 1.897740079433269e-06, "loss": 0.2088, "num_input_tokens_seen": 31825952, "step": 47215 }, { "epoch": 1.1535924559646251, "grad_norm": 0.07277069985866547, "learning_rate": 1.8977025092086087e-06, "loss": 0.0432, "num_input_tokens_seen": 31829024, "step": 47220 }, { "epoch": 1.1537146067964723, "grad_norm": 8.050619125366211, "learning_rate": 1.8976649324556143e-06, "loss": 0.1844, "num_input_tokens_seen": 31832672, "step": 47225 }, { "epoch": 1.1538367576283195, "grad_norm": 0.4239726960659027, "learning_rate": 1.897627349174559e-06, "loss": 0.0456, "num_input_tokens_seen": 31835680, "step": 47230 }, { "epoch": 1.1539589084601667, "grad_norm": 5.3124918937683105, "learning_rate": 1.8975897593657165e-06, "loss": 0.0201, "num_input_tokens_seen": 31839008, "step": 47235 }, { "epoch": 1.1540810592920137, "grad_norm": 0.22926749289035797, "learning_rate": 1.8975521630293595e-06, "loss": 0.0469, "num_input_tokens_seen": 31842976, "step": 47240 }, { "epoch": 1.1542032101238608, "grad_norm": 0.3021707236766815, "learning_rate": 1.897514560165762e-06, "loss": 0.0565, "num_input_tokens_seen": 31846944, "step": 47245 }, { "epoch": 1.154325360955708, "grad_norm": 0.4421197175979614, "learning_rate": 1.8974769507751968e-06, "loss": 0.2177, "num_input_tokens_seen": 31850080, "step": 47250 }, { "epoch": 1.1544475117875552, "grad_norm": 50.06604766845703, "learning_rate": 1.8974393348579383e-06, "loss": 0.0344, "num_input_tokens_seen": 31853408, "step": 47255 }, { "epoch": 1.1545696626194024, "grad_norm": 0.12287864089012146, "learning_rate": 1.8974017124142594e-06, "loss": 0.0009, "num_input_tokens_seen": 31857248, "step": 47260 }, { "epoch": 1.1546918134512496, "grad_norm": 168.93858337402344, "learning_rate": 1.897364083444434e-06, "loss": 0.115, "num_input_tokens_seen": 31860128, "step": 47265 }, { "epoch": 1.1548139642830968, "grad_norm": 74.55368041992188, "learning_rate": 1.8973264479487355e-06, "loss": 0.01, "num_input_tokens_seen": 31863712, "step": 47270 }, { "epoch": 1.154936115114944, "grad_norm": 0.07216308265924454, "learning_rate": 1.8972888059274377e-06, "loss": 0.0607, "num_input_tokens_seen": 31867232, "step": 47275 }, { "epoch": 1.1550582659467912, "grad_norm": 90.53994750976562, "learning_rate": 1.8972511573808144e-06, "loss": 0.108, "num_input_tokens_seen": 31871136, "step": 47280 }, { "epoch": 1.1551804167786384, "grad_norm": 0.903574526309967, "learning_rate": 1.8972135023091394e-06, "loss": 0.0396, "num_input_tokens_seen": 31873824, "step": 47285 }, { "epoch": 1.1553025676104853, "grad_norm": 38.021968841552734, "learning_rate": 1.8971758407126864e-06, "loss": 0.2427, "num_input_tokens_seen": 31877344, "step": 47290 }, { "epoch": 1.1554247184423325, "grad_norm": 19.66106414794922, "learning_rate": 1.8971381725917292e-06, "loss": 0.1947, "num_input_tokens_seen": 31880544, "step": 47295 }, { "epoch": 1.1555468692741797, "grad_norm": 0.1368977427482605, "learning_rate": 1.8971004979465422e-06, "loss": 0.0869, "num_input_tokens_seen": 31883872, "step": 47300 }, { "epoch": 1.155669020106027, "grad_norm": 0.7672892808914185, "learning_rate": 1.897062816777399e-06, "loss": 0.0875, "num_input_tokens_seen": 31887264, "step": 47305 }, { "epoch": 1.155791170937874, "grad_norm": 0.3361578583717346, "learning_rate": 1.8970251290845737e-06, "loss": 0.0316, "num_input_tokens_seen": 31890656, "step": 47310 }, { "epoch": 1.1559133217697213, "grad_norm": 0.45389366149902344, "learning_rate": 1.8969874348683404e-06, "loss": 0.0475, "num_input_tokens_seen": 31893984, "step": 47315 }, { "epoch": 1.1560354726015685, "grad_norm": 0.08049551397562027, "learning_rate": 1.8969497341289733e-06, "loss": 0.0016, "num_input_tokens_seen": 31897376, "step": 47320 }, { "epoch": 1.1561576234334157, "grad_norm": 11.899868965148926, "learning_rate": 1.8969120268667462e-06, "loss": 0.1015, "num_input_tokens_seen": 31900576, "step": 47325 }, { "epoch": 1.1562797742652626, "grad_norm": 0.06697845458984375, "learning_rate": 1.8968743130819338e-06, "loss": 0.0299, "num_input_tokens_seen": 31903904, "step": 47330 }, { "epoch": 1.1564019250971098, "grad_norm": 18.530529022216797, "learning_rate": 1.8968365927748102e-06, "loss": 0.1661, "num_input_tokens_seen": 31907040, "step": 47335 }, { "epoch": 1.156524075928957, "grad_norm": 17.1173095703125, "learning_rate": 1.8967988659456498e-06, "loss": 0.0757, "num_input_tokens_seen": 31910560, "step": 47340 }, { "epoch": 1.1566462267608042, "grad_norm": 0.10376951843500137, "learning_rate": 1.8967611325947266e-06, "loss": 0.1961, "num_input_tokens_seen": 31913696, "step": 47345 }, { "epoch": 1.1567683775926514, "grad_norm": 0.13244301080703735, "learning_rate": 1.896723392722315e-06, "loss": 0.0376, "num_input_tokens_seen": 31917152, "step": 47350 }, { "epoch": 1.1568905284244986, "grad_norm": 0.53319251537323, "learning_rate": 1.8966856463286903e-06, "loss": 0.0034, "num_input_tokens_seen": 31920480, "step": 47355 }, { "epoch": 1.1570126792563458, "grad_norm": 26.181171417236328, "learning_rate": 1.8966478934141262e-06, "loss": 0.2035, "num_input_tokens_seen": 31923872, "step": 47360 }, { "epoch": 1.157134830088193, "grad_norm": 10.004261016845703, "learning_rate": 1.8966101339788971e-06, "loss": 0.0391, "num_input_tokens_seen": 31927584, "step": 47365 }, { "epoch": 1.1572569809200401, "grad_norm": 162.45281982421875, "learning_rate": 1.8965723680232783e-06, "loss": 0.0416, "num_input_tokens_seen": 31931168, "step": 47370 }, { "epoch": 1.1573791317518873, "grad_norm": 0.23322609066963196, "learning_rate": 1.8965345955475441e-06, "loss": 0.0387, "num_input_tokens_seen": 31934304, "step": 47375 }, { "epoch": 1.1575012825837343, "grad_norm": 0.39940202236175537, "learning_rate": 1.896496816551969e-06, "loss": 0.0539, "num_input_tokens_seen": 31937760, "step": 47380 }, { "epoch": 1.1576234334155815, "grad_norm": 0.027425643056631088, "learning_rate": 1.8964590310368283e-06, "loss": 0.0016, "num_input_tokens_seen": 31941216, "step": 47385 }, { "epoch": 1.1577455842474287, "grad_norm": 0.03762891888618469, "learning_rate": 1.8964212390023959e-06, "loss": 0.0559, "num_input_tokens_seen": 31944736, "step": 47390 }, { "epoch": 1.1578677350792759, "grad_norm": 12.185388565063477, "learning_rate": 1.8963834404489474e-06, "loss": 0.1978, "num_input_tokens_seen": 31948128, "step": 47395 }, { "epoch": 1.157989885911123, "grad_norm": 0.20006321370601654, "learning_rate": 1.8963456353767575e-06, "loss": 0.0536, "num_input_tokens_seen": 31951392, "step": 47400 }, { "epoch": 1.1581120367429703, "grad_norm": 0.40255168080329895, "learning_rate": 1.8963078237861008e-06, "loss": 0.0924, "num_input_tokens_seen": 31955040, "step": 47405 }, { "epoch": 1.1582341875748174, "grad_norm": 21.12218475341797, "learning_rate": 1.8962700056772527e-06, "loss": 0.0955, "num_input_tokens_seen": 31957984, "step": 47410 }, { "epoch": 1.1583563384066646, "grad_norm": 0.21121670305728912, "learning_rate": 1.896232181050488e-06, "loss": 0.1105, "num_input_tokens_seen": 31961440, "step": 47415 }, { "epoch": 1.1584784892385116, "grad_norm": 0.07470469921827316, "learning_rate": 1.8961943499060818e-06, "loss": 0.0438, "num_input_tokens_seen": 31964832, "step": 47420 }, { "epoch": 1.1586006400703588, "grad_norm": 0.14237113296985626, "learning_rate": 1.8961565122443092e-06, "loss": 0.1184, "num_input_tokens_seen": 31968544, "step": 47425 }, { "epoch": 1.158722790902206, "grad_norm": 0.10641762614250183, "learning_rate": 1.8961186680654455e-06, "loss": 0.0383, "num_input_tokens_seen": 31971872, "step": 47430 }, { "epoch": 1.1588449417340532, "grad_norm": 0.497207909822464, "learning_rate": 1.896080817369766e-06, "loss": 0.0883, "num_input_tokens_seen": 31975072, "step": 47435 }, { "epoch": 1.1589670925659004, "grad_norm": 0.15804211795330048, "learning_rate": 1.8960429601575453e-06, "loss": 0.1452, "num_input_tokens_seen": 31978336, "step": 47440 }, { "epoch": 1.1590892433977475, "grad_norm": 3.5672736167907715, "learning_rate": 1.8960050964290595e-06, "loss": 0.0778, "num_input_tokens_seen": 31981664, "step": 47445 }, { "epoch": 1.1592113942295947, "grad_norm": 62.97404479980469, "learning_rate": 1.8959672261845836e-06, "loss": 0.0991, "num_input_tokens_seen": 31984928, "step": 47450 }, { "epoch": 1.159333545061442, "grad_norm": 0.9200723171234131, "learning_rate": 1.8959293494243931e-06, "loss": 0.0803, "num_input_tokens_seen": 31988448, "step": 47455 }, { "epoch": 1.1594556958932891, "grad_norm": 4.727555274963379, "learning_rate": 1.8958914661487632e-06, "loss": 0.086, "num_input_tokens_seen": 31991968, "step": 47460 }, { "epoch": 1.1595778467251363, "grad_norm": 1.4091517925262451, "learning_rate": 1.89585357635797e-06, "loss": 0.0076, "num_input_tokens_seen": 31995296, "step": 47465 }, { "epoch": 1.1596999975569833, "grad_norm": 0.4299665093421936, "learning_rate": 1.8958156800522884e-06, "loss": 0.0947, "num_input_tokens_seen": 31998688, "step": 47470 }, { "epoch": 1.1598221483888305, "grad_norm": 0.6416081190109253, "learning_rate": 1.8957777772319942e-06, "loss": 0.0526, "num_input_tokens_seen": 32001888, "step": 47475 }, { "epoch": 1.1599442992206777, "grad_norm": 1.7206716537475586, "learning_rate": 1.895739867897363e-06, "loss": 0.0028, "num_input_tokens_seen": 32005152, "step": 47480 }, { "epoch": 1.1600664500525248, "grad_norm": 28.344221115112305, "learning_rate": 1.8957019520486705e-06, "loss": 0.243, "num_input_tokens_seen": 32008480, "step": 47485 }, { "epoch": 1.160188600884372, "grad_norm": 16.08212661743164, "learning_rate": 1.8956640296861928e-06, "loss": 0.0791, "num_input_tokens_seen": 32011808, "step": 47490 }, { "epoch": 1.1603107517162192, "grad_norm": 0.15828566253185272, "learning_rate": 1.895626100810205e-06, "loss": 0.0302, "num_input_tokens_seen": 32014880, "step": 47495 }, { "epoch": 1.1604329025480664, "grad_norm": 108.21931457519531, "learning_rate": 1.8955881654209835e-06, "loss": 0.1028, "num_input_tokens_seen": 32018272, "step": 47500 }, { "epoch": 1.1605550533799136, "grad_norm": 1.4304190874099731, "learning_rate": 1.8955502235188042e-06, "loss": 0.0414, "num_input_tokens_seen": 32021728, "step": 47505 }, { "epoch": 1.1606772042117606, "grad_norm": 0.02215048484504223, "learning_rate": 1.8955122751039424e-06, "loss": 0.0638, "num_input_tokens_seen": 32024672, "step": 47510 }, { "epoch": 1.1607993550436078, "grad_norm": 2.74535870552063, "learning_rate": 1.8954743201766747e-06, "loss": 0.0016, "num_input_tokens_seen": 32027936, "step": 47515 }, { "epoch": 1.160921505875455, "grad_norm": 2.150292158126831, "learning_rate": 1.8954363587372768e-06, "loss": 0.0406, "num_input_tokens_seen": 32030816, "step": 47520 }, { "epoch": 1.1610436567073021, "grad_norm": 143.7350616455078, "learning_rate": 1.895398390786025e-06, "loss": 0.0563, "num_input_tokens_seen": 32034080, "step": 47525 }, { "epoch": 1.1611658075391493, "grad_norm": 83.70098876953125, "learning_rate": 1.895360416323195e-06, "loss": 0.0666, "num_input_tokens_seen": 32037152, "step": 47530 }, { "epoch": 1.1612879583709965, "grad_norm": 35.97445297241211, "learning_rate": 1.8953224353490636e-06, "loss": 0.0549, "num_input_tokens_seen": 32040608, "step": 47535 }, { "epoch": 1.1614101092028437, "grad_norm": 11.100581169128418, "learning_rate": 1.8952844478639064e-06, "loss": 0.0822, "num_input_tokens_seen": 32043808, "step": 47540 }, { "epoch": 1.161532260034691, "grad_norm": 0.08772708475589752, "learning_rate": 1.8952464538679997e-06, "loss": 0.0543, "num_input_tokens_seen": 32047072, "step": 47545 }, { "epoch": 1.161654410866538, "grad_norm": 0.9782987833023071, "learning_rate": 1.8952084533616203e-06, "loss": 0.095, "num_input_tokens_seen": 32050528, "step": 47550 }, { "epoch": 1.1617765616983853, "grad_norm": 14.846198081970215, "learning_rate": 1.8951704463450442e-06, "loss": 0.1886, "num_input_tokens_seen": 32054176, "step": 47555 }, { "epoch": 1.1618987125302322, "grad_norm": 25.585895538330078, "learning_rate": 1.8951324328185478e-06, "loss": 0.0999, "num_input_tokens_seen": 32057824, "step": 47560 }, { "epoch": 1.1620208633620794, "grad_norm": 0.7053946256637573, "learning_rate": 1.8950944127824076e-06, "loss": 0.0235, "num_input_tokens_seen": 32061664, "step": 47565 }, { "epoch": 1.1621430141939266, "grad_norm": 0.5136181116104126, "learning_rate": 1.8950563862369e-06, "loss": 0.0019, "num_input_tokens_seen": 32065376, "step": 47570 }, { "epoch": 1.1622651650257738, "grad_norm": 0.23183301091194153, "learning_rate": 1.8950183531823019e-06, "loss": 0.1614, "num_input_tokens_seen": 32068704, "step": 47575 }, { "epoch": 1.162387315857621, "grad_norm": 0.27271634340286255, "learning_rate": 1.8949803136188894e-06, "loss": 0.0803, "num_input_tokens_seen": 32072096, "step": 47580 }, { "epoch": 1.1625094666894682, "grad_norm": 0.2536061406135559, "learning_rate": 1.894942267546939e-06, "loss": 0.0427, "num_input_tokens_seen": 32076320, "step": 47585 }, { "epoch": 1.1626316175213154, "grad_norm": 148.37466430664062, "learning_rate": 1.8949042149667283e-06, "loss": 0.1575, "num_input_tokens_seen": 32079328, "step": 47590 }, { "epoch": 1.1627537683531626, "grad_norm": 1.8467384576797485, "learning_rate": 1.894866155878533e-06, "loss": 0.0022, "num_input_tokens_seen": 32083360, "step": 47595 }, { "epoch": 1.1628759191850095, "grad_norm": 8.60280990600586, "learning_rate": 1.8948280902826306e-06, "loss": 0.061, "num_input_tokens_seen": 32086752, "step": 47600 }, { "epoch": 1.1629980700168567, "grad_norm": 0.04383387789130211, "learning_rate": 1.8947900181792974e-06, "loss": 0.006, "num_input_tokens_seen": 32089568, "step": 47605 }, { "epoch": 1.163120220848704, "grad_norm": 1.2437485456466675, "learning_rate": 1.8947519395688109e-06, "loss": 0.0014, "num_input_tokens_seen": 32093280, "step": 47610 }, { "epoch": 1.1632423716805511, "grad_norm": 12.266228675842285, "learning_rate": 1.8947138544514473e-06, "loss": 0.1059, "num_input_tokens_seen": 32096480, "step": 47615 }, { "epoch": 1.1633645225123983, "grad_norm": 26.271141052246094, "learning_rate": 1.894675762827484e-06, "loss": 0.0932, "num_input_tokens_seen": 32099744, "step": 47620 }, { "epoch": 1.1634866733442455, "grad_norm": 8.145895004272461, "learning_rate": 1.894637664697198e-06, "loss": 0.2224, "num_input_tokens_seen": 32103200, "step": 47625 }, { "epoch": 1.1636088241760927, "grad_norm": 26.563020706176758, "learning_rate": 1.8945995600608662e-06, "loss": 0.0968, "num_input_tokens_seen": 32106592, "step": 47630 }, { "epoch": 1.1637309750079399, "grad_norm": 58.50276184082031, "learning_rate": 1.8945614489187658e-06, "loss": 0.1202, "num_input_tokens_seen": 32109920, "step": 47635 }, { "epoch": 1.163853125839787, "grad_norm": 4.910950660705566, "learning_rate": 1.8945233312711739e-06, "loss": 0.079, "num_input_tokens_seen": 32113376, "step": 47640 }, { "epoch": 1.1639752766716343, "grad_norm": 75.29106140136719, "learning_rate": 1.8944852071183676e-06, "loss": 0.0071, "num_input_tokens_seen": 32117024, "step": 47645 }, { "epoch": 1.1640974275034812, "grad_norm": 0.8739251494407654, "learning_rate": 1.8944470764606247e-06, "loss": 0.0531, "num_input_tokens_seen": 32120224, "step": 47650 }, { "epoch": 1.1642195783353284, "grad_norm": 0.44417768716812134, "learning_rate": 1.8944089392982216e-06, "loss": 0.0014, "num_input_tokens_seen": 32123744, "step": 47655 }, { "epoch": 1.1643417291671756, "grad_norm": 8.347487449645996, "learning_rate": 1.8943707956314364e-06, "loss": 0.1023, "num_input_tokens_seen": 32126880, "step": 47660 }, { "epoch": 1.1644638799990228, "grad_norm": 0.23629111051559448, "learning_rate": 1.8943326454605462e-06, "loss": 0.0043, "num_input_tokens_seen": 32130336, "step": 47665 }, { "epoch": 1.16458603083087, "grad_norm": 0.08674504607915878, "learning_rate": 1.8942944887858286e-06, "loss": 0.0538, "num_input_tokens_seen": 32133728, "step": 47670 }, { "epoch": 1.1647081816627172, "grad_norm": 4.8898186683654785, "learning_rate": 1.8942563256075607e-06, "loss": 0.0024, "num_input_tokens_seen": 32136864, "step": 47675 }, { "epoch": 1.1648303324945644, "grad_norm": 1.8654694557189941, "learning_rate": 1.8942181559260204e-06, "loss": 0.0203, "num_input_tokens_seen": 32140448, "step": 47680 }, { "epoch": 1.1649524833264115, "grad_norm": 0.09039192646741867, "learning_rate": 1.894179979741485e-06, "loss": 0.1419, "num_input_tokens_seen": 32143712, "step": 47685 }, { "epoch": 1.1650746341582585, "grad_norm": 12.765069961547852, "learning_rate": 1.8941417970542324e-06, "loss": 0.1193, "num_input_tokens_seen": 32147360, "step": 47690 }, { "epoch": 1.1651967849901057, "grad_norm": 0.030316416174173355, "learning_rate": 1.8941036078645403e-06, "loss": 0.036, "num_input_tokens_seen": 32151264, "step": 47695 }, { "epoch": 1.165318935821953, "grad_norm": 0.04319749027490616, "learning_rate": 1.894065412172686e-06, "loss": 0.0627, "num_input_tokens_seen": 32154784, "step": 47700 }, { "epoch": 1.1654410866538, "grad_norm": 0.11999724805355072, "learning_rate": 1.8940272099789476e-06, "loss": 0.0013, "num_input_tokens_seen": 32158048, "step": 47705 }, { "epoch": 1.1655632374856473, "grad_norm": 8.9257173538208, "learning_rate": 1.8939890012836032e-06, "loss": 0.1186, "num_input_tokens_seen": 32161376, "step": 47710 }, { "epoch": 1.1656853883174945, "grad_norm": 0.18495075404644012, "learning_rate": 1.89395078608693e-06, "loss": 0.0024, "num_input_tokens_seen": 32164448, "step": 47715 }, { "epoch": 1.1658075391493417, "grad_norm": 8.209742546081543, "learning_rate": 1.8939125643892062e-06, "loss": 0.1112, "num_input_tokens_seen": 32167648, "step": 47720 }, { "epoch": 1.1659296899811888, "grad_norm": 22.166501998901367, "learning_rate": 1.89387433619071e-06, "loss": 0.0691, "num_input_tokens_seen": 32170656, "step": 47725 }, { "epoch": 1.166051840813036, "grad_norm": 0.02899726666510105, "learning_rate": 1.893836101491719e-06, "loss": 0.0222, "num_input_tokens_seen": 32174048, "step": 47730 }, { "epoch": 1.1661739916448832, "grad_norm": 32.36577224731445, "learning_rate": 1.8937978602925114e-06, "loss": 0.1264, "num_input_tokens_seen": 32177632, "step": 47735 }, { "epoch": 1.1662961424767302, "grad_norm": 13.967851638793945, "learning_rate": 1.8937596125933654e-06, "loss": 0.0959, "num_input_tokens_seen": 32180704, "step": 47740 }, { "epoch": 1.1664182933085774, "grad_norm": 0.299034982919693, "learning_rate": 1.8937213583945595e-06, "loss": 0.1216, "num_input_tokens_seen": 32183904, "step": 47745 }, { "epoch": 1.1665404441404246, "grad_norm": 0.8588279485702515, "learning_rate": 1.8936830976963712e-06, "loss": 0.1122, "num_input_tokens_seen": 32186784, "step": 47750 }, { "epoch": 1.1666625949722718, "grad_norm": 0.05717243626713753, "learning_rate": 1.893644830499079e-06, "loss": 0.0621, "num_input_tokens_seen": 32190304, "step": 47755 }, { "epoch": 1.166784745804119, "grad_norm": 2.23378849029541, "learning_rate": 1.8936065568029614e-06, "loss": 0.0988, "num_input_tokens_seen": 32193504, "step": 47760 }, { "epoch": 1.1669068966359661, "grad_norm": 11.925138473510742, "learning_rate": 1.8935682766082964e-06, "loss": 0.0803, "num_input_tokens_seen": 32197216, "step": 47765 }, { "epoch": 1.1670290474678133, "grad_norm": 382.5774841308594, "learning_rate": 1.8935299899153625e-06, "loss": 0.1822, "num_input_tokens_seen": 32200672, "step": 47770 }, { "epoch": 1.1671511982996603, "grad_norm": 0.1967589259147644, "learning_rate": 1.8934916967244386e-06, "loss": 0.0734, "num_input_tokens_seen": 32203872, "step": 47775 }, { "epoch": 1.1672733491315075, "grad_norm": 0.02376456931233406, "learning_rate": 1.8934533970358022e-06, "loss": 0.1033, "num_input_tokens_seen": 32207776, "step": 47780 }, { "epoch": 1.1673954999633547, "grad_norm": 14.840989112854004, "learning_rate": 1.8934150908497327e-06, "loss": 0.2013, "num_input_tokens_seen": 32211168, "step": 47785 }, { "epoch": 1.1675176507952019, "grad_norm": 0.14429239928722382, "learning_rate": 1.8933767781665085e-06, "loss": 0.0334, "num_input_tokens_seen": 32214432, "step": 47790 }, { "epoch": 1.167639801627049, "grad_norm": 8.509486198425293, "learning_rate": 1.8933384589864077e-06, "loss": 0.154, "num_input_tokens_seen": 32217568, "step": 47795 }, { "epoch": 1.1677619524588962, "grad_norm": 0.18389767408370972, "learning_rate": 1.8933001333097094e-06, "loss": 0.1517, "num_input_tokens_seen": 32220832, "step": 47800 }, { "epoch": 1.1678841032907434, "grad_norm": 178.11093139648438, "learning_rate": 1.8932618011366922e-06, "loss": 0.155, "num_input_tokens_seen": 32224480, "step": 47805 }, { "epoch": 1.1680062541225906, "grad_norm": 26.663612365722656, "learning_rate": 1.8932234624676356e-06, "loss": 0.167, "num_input_tokens_seen": 32227680, "step": 47810 }, { "epoch": 1.1681284049544378, "grad_norm": 0.439523309469223, "learning_rate": 1.893185117302817e-06, "loss": 0.0336, "num_input_tokens_seen": 32231136, "step": 47815 }, { "epoch": 1.168250555786285, "grad_norm": 10.76060962677002, "learning_rate": 1.8931467656425163e-06, "loss": 0.0647, "num_input_tokens_seen": 32234336, "step": 47820 }, { "epoch": 1.168372706618132, "grad_norm": 0.914696991443634, "learning_rate": 1.8931084074870118e-06, "loss": 0.0296, "num_input_tokens_seen": 32237856, "step": 47825 }, { "epoch": 1.1684948574499792, "grad_norm": 0.30933263897895813, "learning_rate": 1.8930700428365832e-06, "loss": 0.0017, "num_input_tokens_seen": 32241440, "step": 47830 }, { "epoch": 1.1686170082818264, "grad_norm": 14.791959762573242, "learning_rate": 1.8930316716915087e-06, "loss": 0.0681, "num_input_tokens_seen": 32244512, "step": 47835 }, { "epoch": 1.1687391591136735, "grad_norm": 8.420495986938477, "learning_rate": 1.892993294052068e-06, "loss": 0.0032, "num_input_tokens_seen": 32247968, "step": 47840 }, { "epoch": 1.1688613099455207, "grad_norm": 0.24798475205898285, "learning_rate": 1.8929549099185396e-06, "loss": 0.0858, "num_input_tokens_seen": 32251104, "step": 47845 }, { "epoch": 1.168983460777368, "grad_norm": 36.27674865722656, "learning_rate": 1.892916519291203e-06, "loss": 0.0914, "num_input_tokens_seen": 32254432, "step": 47850 }, { "epoch": 1.1691056116092151, "grad_norm": 0.011220994405448437, "learning_rate": 1.8928781221703374e-06, "loss": 0.0972, "num_input_tokens_seen": 32258080, "step": 47855 }, { "epoch": 1.1692277624410623, "grad_norm": 0.1429062783718109, "learning_rate": 1.8928397185562217e-06, "loss": 0.0008, "num_input_tokens_seen": 32261152, "step": 47860 }, { "epoch": 1.1693499132729093, "grad_norm": 25.986040115356445, "learning_rate": 1.8928013084491354e-06, "loss": 0.0328, "num_input_tokens_seen": 32264288, "step": 47865 }, { "epoch": 1.1694720641047565, "grad_norm": 0.3018045127391815, "learning_rate": 1.8927628918493581e-06, "loss": 0.228, "num_input_tokens_seen": 32268256, "step": 47870 }, { "epoch": 1.1695942149366036, "grad_norm": 0.09300398826599121, "learning_rate": 1.8927244687571688e-06, "loss": 0.0219, "num_input_tokens_seen": 32271776, "step": 47875 }, { "epoch": 1.1697163657684508, "grad_norm": 11.6989164352417, "learning_rate": 1.8926860391728472e-06, "loss": 0.1846, "num_input_tokens_seen": 32275040, "step": 47880 }, { "epoch": 1.169838516600298, "grad_norm": 134.87442016601562, "learning_rate": 1.8926476030966724e-06, "loss": 0.1796, "num_input_tokens_seen": 32278368, "step": 47885 }, { "epoch": 1.1699606674321452, "grad_norm": 1.2735921144485474, "learning_rate": 1.8926091605289245e-06, "loss": 0.0011, "num_input_tokens_seen": 32281696, "step": 47890 }, { "epoch": 1.1700828182639924, "grad_norm": 0.2746407389640808, "learning_rate": 1.8925707114698823e-06, "loss": 0.1402, "num_input_tokens_seen": 32284960, "step": 47895 }, { "epoch": 1.1702049690958396, "grad_norm": 0.2347487211227417, "learning_rate": 1.892532255919826e-06, "loss": 0.0995, "num_input_tokens_seen": 32288352, "step": 47900 }, { "epoch": 1.1703271199276868, "grad_norm": 242.01580810546875, "learning_rate": 1.8924937938790348e-06, "loss": 0.1636, "num_input_tokens_seen": 32291744, "step": 47905 }, { "epoch": 1.170449270759534, "grad_norm": 0.5199395418167114, "learning_rate": 1.8924553253477891e-06, "loss": 0.11, "num_input_tokens_seen": 32295136, "step": 47910 }, { "epoch": 1.170571421591381, "grad_norm": 36.90918731689453, "learning_rate": 1.8924168503263682e-06, "loss": 0.1014, "num_input_tokens_seen": 32298528, "step": 47915 }, { "epoch": 1.1706935724232281, "grad_norm": 9.800524711608887, "learning_rate": 1.8923783688150517e-06, "loss": 0.0434, "num_input_tokens_seen": 32301600, "step": 47920 }, { "epoch": 1.1708157232550753, "grad_norm": 61.35971450805664, "learning_rate": 1.8923398808141195e-06, "loss": 0.0966, "num_input_tokens_seen": 32304544, "step": 47925 }, { "epoch": 1.1709378740869225, "grad_norm": 1.0777499675750732, "learning_rate": 1.8923013863238523e-06, "loss": 0.1141, "num_input_tokens_seen": 32308384, "step": 47930 }, { "epoch": 1.1710600249187697, "grad_norm": 0.5199394822120667, "learning_rate": 1.8922628853445288e-06, "loss": 0.0021, "num_input_tokens_seen": 32312160, "step": 47935 }, { "epoch": 1.171182175750617, "grad_norm": 0.17201387882232666, "learning_rate": 1.89222437787643e-06, "loss": 0.2046, "num_input_tokens_seen": 32315296, "step": 47940 }, { "epoch": 1.171304326582464, "grad_norm": 9.953340530395508, "learning_rate": 1.8921858639198354e-06, "loss": 0.1025, "num_input_tokens_seen": 32318304, "step": 47945 }, { "epoch": 1.1714264774143113, "grad_norm": 6.807833194732666, "learning_rate": 1.8921473434750254e-06, "loss": 0.0589, "num_input_tokens_seen": 32321632, "step": 47950 }, { "epoch": 1.1715486282461582, "grad_norm": 0.25497400760650635, "learning_rate": 1.8921088165422797e-06, "loss": 0.1001, "num_input_tokens_seen": 32325472, "step": 47955 }, { "epoch": 1.1716707790780054, "grad_norm": 2.2793524265289307, "learning_rate": 1.8920702831218787e-06, "loss": 0.055, "num_input_tokens_seen": 32328736, "step": 47960 }, { "epoch": 1.1717929299098526, "grad_norm": 0.5788264274597168, "learning_rate": 1.892031743214103e-06, "loss": 0.1018, "num_input_tokens_seen": 32331744, "step": 47965 }, { "epoch": 1.1719150807416998, "grad_norm": 0.24290089309215546, "learning_rate": 1.8919931968192322e-06, "loss": 0.0292, "num_input_tokens_seen": 32335264, "step": 47970 }, { "epoch": 1.172037231573547, "grad_norm": 0.3527112305164337, "learning_rate": 1.8919546439375468e-06, "loss": 0.0015, "num_input_tokens_seen": 32338784, "step": 47975 }, { "epoch": 1.1721593824053942, "grad_norm": 0.06581027805805206, "learning_rate": 1.8919160845693278e-06, "loss": 0.0441, "num_input_tokens_seen": 32341728, "step": 47980 }, { "epoch": 1.1722815332372414, "grad_norm": 148.33258056640625, "learning_rate": 1.891877518714855e-06, "loss": 0.0108, "num_input_tokens_seen": 32344992, "step": 47985 }, { "epoch": 1.1724036840690886, "grad_norm": 0.21731750667095184, "learning_rate": 1.891838946374409e-06, "loss": 0.0336, "num_input_tokens_seen": 32348192, "step": 47990 }, { "epoch": 1.1725258349009358, "grad_norm": 122.77140808105469, "learning_rate": 1.8918003675482702e-06, "loss": 0.1033, "num_input_tokens_seen": 32351136, "step": 47995 }, { "epoch": 1.172647985732783, "grad_norm": 0.18697240948677063, "learning_rate": 1.8917617822367193e-06, "loss": 0.0012, "num_input_tokens_seen": 32354464, "step": 48000 }, { "epoch": 1.17277013656463, "grad_norm": 4.480820655822754, "learning_rate": 1.8917231904400369e-06, "loss": 0.1176, "num_input_tokens_seen": 32357664, "step": 48005 }, { "epoch": 1.172892287396477, "grad_norm": 0.833780825138092, "learning_rate": 1.8916845921585036e-06, "loss": 0.0013, "num_input_tokens_seen": 32360992, "step": 48010 }, { "epoch": 1.1730144382283243, "grad_norm": 24.22505760192871, "learning_rate": 1.8916459873924e-06, "loss": 0.1414, "num_input_tokens_seen": 32364256, "step": 48015 }, { "epoch": 1.1731365890601715, "grad_norm": 108.00692749023438, "learning_rate": 1.8916073761420073e-06, "loss": 0.0688, "num_input_tokens_seen": 32367904, "step": 48020 }, { "epoch": 1.1732587398920187, "grad_norm": 0.07697835564613342, "learning_rate": 1.8915687584076054e-06, "loss": 0.0011, "num_input_tokens_seen": 32371232, "step": 48025 }, { "epoch": 1.1733808907238659, "grad_norm": 0.14206215739250183, "learning_rate": 1.8915301341894762e-06, "loss": 0.0009, "num_input_tokens_seen": 32374496, "step": 48030 }, { "epoch": 1.173503041555713, "grad_norm": 8.598139762878418, "learning_rate": 1.8914915034878997e-06, "loss": 0.1076, "num_input_tokens_seen": 32377696, "step": 48035 }, { "epoch": 1.1736251923875602, "grad_norm": 0.032318364828825, "learning_rate": 1.8914528663031575e-06, "loss": 0.048, "num_input_tokens_seen": 32381280, "step": 48040 }, { "epoch": 1.1737473432194072, "grad_norm": 6.112025737762451, "learning_rate": 1.89141422263553e-06, "loss": 0.1592, "num_input_tokens_seen": 32384992, "step": 48045 }, { "epoch": 1.1738694940512544, "grad_norm": 13.649267196655273, "learning_rate": 1.8913755724852988e-06, "loss": 0.0489, "num_input_tokens_seen": 32388576, "step": 48050 }, { "epoch": 1.1739916448831016, "grad_norm": 0.0673687532544136, "learning_rate": 1.8913369158527447e-06, "loss": 0.0907, "num_input_tokens_seen": 32391584, "step": 48055 }, { "epoch": 1.1741137957149488, "grad_norm": 0.7464065551757812, "learning_rate": 1.8912982527381486e-06, "loss": 0.0203, "num_input_tokens_seen": 32394848, "step": 48060 }, { "epoch": 1.174235946546796, "grad_norm": 0.08196679502725601, "learning_rate": 1.8912595831417919e-06, "loss": 0.1048, "num_input_tokens_seen": 32397920, "step": 48065 }, { "epoch": 1.1743580973786432, "grad_norm": 9.292841911315918, "learning_rate": 1.8912209070639558e-06, "loss": 0.0444, "num_input_tokens_seen": 32401184, "step": 48070 }, { "epoch": 1.1744802482104904, "grad_norm": 0.3272796869277954, "learning_rate": 1.8911822245049213e-06, "loss": 0.1023, "num_input_tokens_seen": 32404448, "step": 48075 }, { "epoch": 1.1746023990423375, "grad_norm": 0.04792598634958267, "learning_rate": 1.8911435354649705e-06, "loss": 0.0377, "num_input_tokens_seen": 32407584, "step": 48080 }, { "epoch": 1.1747245498741847, "grad_norm": 0.18216323852539062, "learning_rate": 1.8911048399443838e-06, "loss": 0.0557, "num_input_tokens_seen": 32411360, "step": 48085 }, { "epoch": 1.174846700706032, "grad_norm": 37.21038055419922, "learning_rate": 1.8910661379434432e-06, "loss": 0.1424, "num_input_tokens_seen": 32414816, "step": 48090 }, { "epoch": 1.1749688515378789, "grad_norm": 0.2778770923614502, "learning_rate": 1.89102742946243e-06, "loss": 0.0771, "num_input_tokens_seen": 32417888, "step": 48095 }, { "epoch": 1.175091002369726, "grad_norm": 0.4837040603160858, "learning_rate": 1.8909887145016257e-06, "loss": 0.1183, "num_input_tokens_seen": 32421472, "step": 48100 }, { "epoch": 1.1752131532015733, "grad_norm": 0.7100688219070435, "learning_rate": 1.8909499930613118e-06, "loss": 0.1321, "num_input_tokens_seen": 32424544, "step": 48105 }, { "epoch": 1.1753353040334205, "grad_norm": 0.995244562625885, "learning_rate": 1.8909112651417699e-06, "loss": 0.0028, "num_input_tokens_seen": 32427616, "step": 48110 }, { "epoch": 1.1754574548652676, "grad_norm": 9.889370918273926, "learning_rate": 1.8908725307432816e-06, "loss": 0.1127, "num_input_tokens_seen": 32430624, "step": 48115 }, { "epoch": 1.1755796056971148, "grad_norm": 1.033982515335083, "learning_rate": 1.8908337898661287e-06, "loss": 0.0027, "num_input_tokens_seen": 32434208, "step": 48120 }, { "epoch": 1.175701756528962, "grad_norm": 0.1141548901796341, "learning_rate": 1.8907950425105927e-06, "loss": 0.0397, "num_input_tokens_seen": 32437664, "step": 48125 }, { "epoch": 1.1758239073608092, "grad_norm": 23.38401222229004, "learning_rate": 1.8907562886769557e-06, "loss": 0.1019, "num_input_tokens_seen": 32441248, "step": 48130 }, { "epoch": 1.1759460581926562, "grad_norm": 0.01714020036160946, "learning_rate": 1.8907175283654992e-06, "loss": 0.0376, "num_input_tokens_seen": 32444320, "step": 48135 }, { "epoch": 1.1760682090245034, "grad_norm": 0.18078669905662537, "learning_rate": 1.8906787615765055e-06, "loss": 0.0544, "num_input_tokens_seen": 32447648, "step": 48140 }, { "epoch": 1.1761903598563506, "grad_norm": 12.950847625732422, "learning_rate": 1.8906399883102565e-06, "loss": 0.205, "num_input_tokens_seen": 32450848, "step": 48145 }, { "epoch": 1.1763125106881978, "grad_norm": 14.540743827819824, "learning_rate": 1.8906012085670336e-06, "loss": 0.076, "num_input_tokens_seen": 32454240, "step": 48150 }, { "epoch": 1.176434661520045, "grad_norm": 0.9453197717666626, "learning_rate": 1.890562422347119e-06, "loss": 0.1783, "num_input_tokens_seen": 32457440, "step": 48155 }, { "epoch": 1.1765568123518921, "grad_norm": 7.787158489227295, "learning_rate": 1.8905236296507953e-06, "loss": 0.2149, "num_input_tokens_seen": 32460704, "step": 48160 }, { "epoch": 1.1766789631837393, "grad_norm": 0.22460059821605682, "learning_rate": 1.890484830478344e-06, "loss": 0.1459, "num_input_tokens_seen": 32464800, "step": 48165 }, { "epoch": 1.1768011140155865, "grad_norm": 64.1084976196289, "learning_rate": 1.8904460248300478e-06, "loss": 0.0858, "num_input_tokens_seen": 32468192, "step": 48170 }, { "epoch": 1.1769232648474337, "grad_norm": 0.4196472764015198, "learning_rate": 1.8904072127061884e-06, "loss": 0.0026, "num_input_tokens_seen": 32471072, "step": 48175 }, { "epoch": 1.177045415679281, "grad_norm": 0.14943639934062958, "learning_rate": 1.8903683941070483e-06, "loss": 0.0297, "num_input_tokens_seen": 32473952, "step": 48180 }, { "epoch": 1.1771675665111279, "grad_norm": 2.338526487350464, "learning_rate": 1.8903295690329097e-06, "loss": 0.0313, "num_input_tokens_seen": 32476896, "step": 48185 }, { "epoch": 1.177289717342975, "grad_norm": 0.20392240583896637, "learning_rate": 1.890290737484055e-06, "loss": 0.0741, "num_input_tokens_seen": 32480224, "step": 48190 }, { "epoch": 1.1774118681748222, "grad_norm": 9.235472679138184, "learning_rate": 1.890251899460767e-06, "loss": 0.0477, "num_input_tokens_seen": 32483872, "step": 48195 }, { "epoch": 1.1775340190066694, "grad_norm": 27.74861717224121, "learning_rate": 1.8902130549633272e-06, "loss": 0.1153, "num_input_tokens_seen": 32487072, "step": 48200 }, { "epoch": 1.1776561698385166, "grad_norm": 0.10655547678470612, "learning_rate": 1.8901742039920188e-06, "loss": 0.0391, "num_input_tokens_seen": 32490272, "step": 48205 }, { "epoch": 1.1777783206703638, "grad_norm": 104.34963989257812, "learning_rate": 1.8901353465471242e-06, "loss": 0.1538, "num_input_tokens_seen": 32493536, "step": 48210 }, { "epoch": 1.177900471502211, "grad_norm": 0.06965136528015137, "learning_rate": 1.8900964826289258e-06, "loss": 0.0322, "num_input_tokens_seen": 32497248, "step": 48215 }, { "epoch": 1.1780226223340582, "grad_norm": 0.0271898340433836, "learning_rate": 1.8900576122377066e-06, "loss": 0.1566, "num_input_tokens_seen": 32500448, "step": 48220 }, { "epoch": 1.1781447731659052, "grad_norm": 30.9471435546875, "learning_rate": 1.8900187353737488e-06, "loss": 0.0295, "num_input_tokens_seen": 32503648, "step": 48225 }, { "epoch": 1.1782669239977523, "grad_norm": 0.1662287563085556, "learning_rate": 1.8899798520373356e-06, "loss": 0.001, "num_input_tokens_seen": 32507104, "step": 48230 }, { "epoch": 1.1783890748295995, "grad_norm": 111.50906372070312, "learning_rate": 1.8899409622287491e-06, "loss": 0.2231, "num_input_tokens_seen": 32510560, "step": 48235 }, { "epoch": 1.1785112256614467, "grad_norm": 11.14577579498291, "learning_rate": 1.8899020659482732e-06, "loss": 0.1259, "num_input_tokens_seen": 32514144, "step": 48240 }, { "epoch": 1.178633376493294, "grad_norm": 17.156038284301758, "learning_rate": 1.88986316319619e-06, "loss": 0.1139, "num_input_tokens_seen": 32517472, "step": 48245 }, { "epoch": 1.178755527325141, "grad_norm": 0.5925399661064148, "learning_rate": 1.8898242539727823e-06, "loss": 0.0018, "num_input_tokens_seen": 32520992, "step": 48250 }, { "epoch": 1.1788776781569883, "grad_norm": 17.069896697998047, "learning_rate": 1.8897853382783332e-06, "loss": 0.1008, "num_input_tokens_seen": 32523936, "step": 48255 }, { "epoch": 1.1789998289888355, "grad_norm": 0.23743368685245514, "learning_rate": 1.8897464161131258e-06, "loss": 0.0019, "num_input_tokens_seen": 32527328, "step": 48260 }, { "epoch": 1.1791219798206827, "grad_norm": 0.3088131546974182, "learning_rate": 1.8897074874774435e-06, "loss": 0.0286, "num_input_tokens_seen": 32530720, "step": 48265 }, { "epoch": 1.1792441306525299, "grad_norm": 0.4604056775569916, "learning_rate": 1.8896685523715687e-06, "loss": 0.187, "num_input_tokens_seen": 32534176, "step": 48270 }, { "epoch": 1.1793662814843768, "grad_norm": 0.2785622179508209, "learning_rate": 1.8896296107957853e-06, "loss": 0.0253, "num_input_tokens_seen": 32537696, "step": 48275 }, { "epoch": 1.179488432316224, "grad_norm": 22.758668899536133, "learning_rate": 1.8895906627503756e-06, "loss": 0.1362, "num_input_tokens_seen": 32541024, "step": 48280 }, { "epoch": 1.1796105831480712, "grad_norm": 0.6522172093391418, "learning_rate": 1.8895517082356236e-06, "loss": 0.1035, "num_input_tokens_seen": 32544416, "step": 48285 }, { "epoch": 1.1797327339799184, "grad_norm": 23.65367889404297, "learning_rate": 1.8895127472518121e-06, "loss": 0.1123, "num_input_tokens_seen": 32547936, "step": 48290 }, { "epoch": 1.1798548848117656, "grad_norm": 4.1731672286987305, "learning_rate": 1.8894737797992249e-06, "loss": 0.038, "num_input_tokens_seen": 32551072, "step": 48295 }, { "epoch": 1.1799770356436128, "grad_norm": 0.1166360080242157, "learning_rate": 1.8894348058781451e-06, "loss": 0.0423, "num_input_tokens_seen": 32554400, "step": 48300 }, { "epoch": 1.18009918647546, "grad_norm": 177.94476318359375, "learning_rate": 1.8893958254888562e-06, "loss": 0.0501, "num_input_tokens_seen": 32557536, "step": 48305 }, { "epoch": 1.180221337307307, "grad_norm": 13.788195610046387, "learning_rate": 1.8893568386316414e-06, "loss": 0.0472, "num_input_tokens_seen": 32560928, "step": 48310 }, { "epoch": 1.1803434881391541, "grad_norm": 0.22638006508350372, "learning_rate": 1.8893178453067846e-06, "loss": 0.0604, "num_input_tokens_seen": 32565280, "step": 48315 }, { "epoch": 1.1804656389710013, "grad_norm": 57.36815643310547, "learning_rate": 1.8892788455145694e-06, "loss": 0.0701, "num_input_tokens_seen": 32568736, "step": 48320 }, { "epoch": 1.1805877898028485, "grad_norm": 0.09134076535701752, "learning_rate": 1.8892398392552788e-06, "loss": 0.0004, "num_input_tokens_seen": 32572512, "step": 48325 }, { "epoch": 1.1807099406346957, "grad_norm": 18.480314254760742, "learning_rate": 1.8892008265291975e-06, "loss": 0.1615, "num_input_tokens_seen": 32575904, "step": 48330 }, { "epoch": 1.1808320914665429, "grad_norm": 0.061544980853796005, "learning_rate": 1.8891618073366082e-06, "loss": 0.0992, "num_input_tokens_seen": 32579104, "step": 48335 }, { "epoch": 1.18095424229839, "grad_norm": 0.22377759218215942, "learning_rate": 1.8891227816777953e-06, "loss": 0.0066, "num_input_tokens_seen": 32582688, "step": 48340 }, { "epoch": 1.1810763931302373, "grad_norm": 51.224544525146484, "learning_rate": 1.8890837495530423e-06, "loss": 0.1201, "num_input_tokens_seen": 32585952, "step": 48345 }, { "epoch": 1.1811985439620845, "grad_norm": 14.230652809143066, "learning_rate": 1.889044710962633e-06, "loss": 0.1483, "num_input_tokens_seen": 32588896, "step": 48350 }, { "epoch": 1.1813206947939316, "grad_norm": 0.10260359942913055, "learning_rate": 1.8890056659068516e-06, "loss": 0.1178, "num_input_tokens_seen": 32592096, "step": 48355 }, { "epoch": 1.1814428456257786, "grad_norm": 222.6650848388672, "learning_rate": 1.888966614385982e-06, "loss": 0.1299, "num_input_tokens_seen": 32595552, "step": 48360 }, { "epoch": 1.1815649964576258, "grad_norm": 0.18788084387779236, "learning_rate": 1.8889275564003078e-06, "loss": 0.0912, "num_input_tokens_seen": 32599520, "step": 48365 }, { "epoch": 1.181687147289473, "grad_norm": 0.09443093091249466, "learning_rate": 1.8888884919501136e-06, "loss": 0.0017, "num_input_tokens_seen": 32602656, "step": 48370 }, { "epoch": 1.1818092981213202, "grad_norm": 0.46310916543006897, "learning_rate": 1.888849421035683e-06, "loss": 0.0739, "num_input_tokens_seen": 32605984, "step": 48375 }, { "epoch": 1.1819314489531674, "grad_norm": 35.772621154785156, "learning_rate": 1.8888103436573003e-06, "loss": 0.1791, "num_input_tokens_seen": 32609056, "step": 48380 }, { "epoch": 1.1820535997850146, "grad_norm": 0.2312948852777481, "learning_rate": 1.8887712598152498e-06, "loss": 0.1844, "num_input_tokens_seen": 32612576, "step": 48385 }, { "epoch": 1.1821757506168618, "grad_norm": 0.7724999189376831, "learning_rate": 1.8887321695098157e-06, "loss": 0.0289, "num_input_tokens_seen": 32615776, "step": 48390 }, { "epoch": 1.182297901448709, "grad_norm": 0.4551694989204407, "learning_rate": 1.8886930727412822e-06, "loss": 0.0107, "num_input_tokens_seen": 32618720, "step": 48395 }, { "epoch": 1.182420052280556, "grad_norm": 55.0472412109375, "learning_rate": 1.8886539695099338e-06, "loss": 0.2006, "num_input_tokens_seen": 32622368, "step": 48400 }, { "epoch": 1.182542203112403, "grad_norm": 0.09480097144842148, "learning_rate": 1.8886148598160542e-06, "loss": 0.0016, "num_input_tokens_seen": 32625952, "step": 48405 }, { "epoch": 1.1826643539442503, "grad_norm": 13.491443634033203, "learning_rate": 1.888575743659929e-06, "loss": 0.0473, "num_input_tokens_seen": 32629600, "step": 48410 }, { "epoch": 1.1827865047760975, "grad_norm": 0.2457229048013687, "learning_rate": 1.8885366210418415e-06, "loss": 0.0073, "num_input_tokens_seen": 32632608, "step": 48415 }, { "epoch": 1.1829086556079447, "grad_norm": 0.18898482620716095, "learning_rate": 1.8884974919620769e-06, "loss": 0.0715, "num_input_tokens_seen": 32635936, "step": 48420 }, { "epoch": 1.1830308064397919, "grad_norm": 19.41193389892578, "learning_rate": 1.8884583564209196e-06, "loss": 0.1009, "num_input_tokens_seen": 32639456, "step": 48425 }, { "epoch": 1.183152957271639, "grad_norm": 106.54630279541016, "learning_rate": 1.8884192144186541e-06, "loss": 0.2983, "num_input_tokens_seen": 32642848, "step": 48430 }, { "epoch": 1.1832751081034862, "grad_norm": 12.161974906921387, "learning_rate": 1.8883800659555652e-06, "loss": 0.0824, "num_input_tokens_seen": 32646176, "step": 48435 }, { "epoch": 1.1833972589353334, "grad_norm": 26.162527084350586, "learning_rate": 1.8883409110319372e-06, "loss": 0.0827, "num_input_tokens_seen": 32649632, "step": 48440 }, { "epoch": 1.1835194097671806, "grad_norm": 33.377681732177734, "learning_rate": 1.8883017496480553e-06, "loss": 0.1353, "num_input_tokens_seen": 32652832, "step": 48445 }, { "epoch": 1.1836415605990276, "grad_norm": 53.93454360961914, "learning_rate": 1.8882625818042043e-06, "loss": 0.0216, "num_input_tokens_seen": 32656096, "step": 48450 }, { "epoch": 1.1837637114308748, "grad_norm": 0.2480727583169937, "learning_rate": 1.888223407500669e-06, "loss": 0.0541, "num_input_tokens_seen": 32659488, "step": 48455 }, { "epoch": 1.183885862262722, "grad_norm": 29.658166885375977, "learning_rate": 1.8881842267377339e-06, "loss": 0.1097, "num_input_tokens_seen": 32662688, "step": 48460 }, { "epoch": 1.1840080130945692, "grad_norm": 54.706947326660156, "learning_rate": 1.8881450395156844e-06, "loss": 0.1229, "num_input_tokens_seen": 32665760, "step": 48465 }, { "epoch": 1.1841301639264163, "grad_norm": 60.36311721801758, "learning_rate": 1.888105845834805e-06, "loss": 0.0777, "num_input_tokens_seen": 32669024, "step": 48470 }, { "epoch": 1.1842523147582635, "grad_norm": 0.33893075585365295, "learning_rate": 1.8880666456953812e-06, "loss": 0.0822, "num_input_tokens_seen": 32672224, "step": 48475 }, { "epoch": 1.1843744655901107, "grad_norm": 16.1857967376709, "learning_rate": 1.8880274390976983e-06, "loss": 0.0383, "num_input_tokens_seen": 32675488, "step": 48480 }, { "epoch": 1.184496616421958, "grad_norm": 0.5863775014877319, "learning_rate": 1.8879882260420406e-06, "loss": 0.0351, "num_input_tokens_seen": 32678688, "step": 48485 }, { "epoch": 1.1846187672538049, "grad_norm": 50.410255432128906, "learning_rate": 1.8879490065286937e-06, "loss": 0.1213, "num_input_tokens_seen": 32682784, "step": 48490 }, { "epoch": 1.184740918085652, "grad_norm": 1.918444037437439, "learning_rate": 1.8879097805579428e-06, "loss": 0.1368, "num_input_tokens_seen": 32685856, "step": 48495 }, { "epoch": 1.1848630689174993, "grad_norm": 0.1359431892633438, "learning_rate": 1.8878705481300732e-06, "loss": 0.0819, "num_input_tokens_seen": 32689056, "step": 48500 }, { "epoch": 1.1849852197493465, "grad_norm": 13.425424575805664, "learning_rate": 1.88783130924537e-06, "loss": 0.1993, "num_input_tokens_seen": 32692384, "step": 48505 }, { "epoch": 1.1851073705811936, "grad_norm": 118.53060150146484, "learning_rate": 1.887792063904119e-06, "loss": 0.0644, "num_input_tokens_seen": 32695712, "step": 48510 }, { "epoch": 1.1852295214130408, "grad_norm": 2.1681196689605713, "learning_rate": 1.887752812106605e-06, "loss": 0.1064, "num_input_tokens_seen": 32698976, "step": 48515 }, { "epoch": 1.185351672244888, "grad_norm": 16.27510643005371, "learning_rate": 1.8877135538531139e-06, "loss": 0.0749, "num_input_tokens_seen": 32702880, "step": 48520 }, { "epoch": 1.1854738230767352, "grad_norm": 1.0383045673370361, "learning_rate": 1.887674289143931e-06, "loss": 0.0552, "num_input_tokens_seen": 32706464, "step": 48525 }, { "epoch": 1.1855959739085824, "grad_norm": 0.9187676906585693, "learning_rate": 1.8876350179793423e-06, "loss": 0.1513, "num_input_tokens_seen": 32709664, "step": 48530 }, { "epoch": 1.1857181247404296, "grad_norm": 1.7384788990020752, "learning_rate": 1.8875957403596328e-06, "loss": 0.0842, "num_input_tokens_seen": 32713184, "step": 48535 }, { "epoch": 1.1858402755722766, "grad_norm": 0.10159554332494736, "learning_rate": 1.8875564562850882e-06, "loss": 0.039, "num_input_tokens_seen": 32716448, "step": 48540 }, { "epoch": 1.1859624264041237, "grad_norm": 0.4575258791446686, "learning_rate": 1.8875171657559943e-06, "loss": 0.0392, "num_input_tokens_seen": 32720096, "step": 48545 }, { "epoch": 1.186084577235971, "grad_norm": 105.02020263671875, "learning_rate": 1.8874778687726369e-06, "loss": 0.0228, "num_input_tokens_seen": 32723488, "step": 48550 }, { "epoch": 1.1862067280678181, "grad_norm": 38.00514221191406, "learning_rate": 1.8874385653353018e-06, "loss": 0.0652, "num_input_tokens_seen": 32726752, "step": 48555 }, { "epoch": 1.1863288788996653, "grad_norm": 18.732927322387695, "learning_rate": 1.8873992554442748e-06, "loss": 0.0803, "num_input_tokens_seen": 32730720, "step": 48560 }, { "epoch": 1.1864510297315125, "grad_norm": 0.14288055896759033, "learning_rate": 1.8873599390998419e-06, "loss": 0.0438, "num_input_tokens_seen": 32733984, "step": 48565 }, { "epoch": 1.1865731805633597, "grad_norm": 1.4553184509277344, "learning_rate": 1.8873206163022886e-06, "loss": 0.0943, "num_input_tokens_seen": 32737120, "step": 48570 }, { "epoch": 1.1866953313952069, "grad_norm": 30.531328201293945, "learning_rate": 1.887281287051901e-06, "loss": 0.0423, "num_input_tokens_seen": 32740512, "step": 48575 }, { "epoch": 1.1868174822270539, "grad_norm": 0.10857908427715302, "learning_rate": 1.8872419513489652e-06, "loss": 0.0007, "num_input_tokens_seen": 32743392, "step": 48580 }, { "epoch": 1.186939633058901, "grad_norm": 4.534030437469482, "learning_rate": 1.8872026091937676e-06, "loss": 0.0018, "num_input_tokens_seen": 32746400, "step": 48585 }, { "epoch": 1.1870617838907482, "grad_norm": 0.03740508109331131, "learning_rate": 1.8871632605865939e-06, "loss": 0.0158, "num_input_tokens_seen": 32749536, "step": 48590 }, { "epoch": 1.1871839347225954, "grad_norm": 0.11498308926820755, "learning_rate": 1.8871239055277304e-06, "loss": 0.0004, "num_input_tokens_seen": 32752992, "step": 48595 }, { "epoch": 1.1873060855544426, "grad_norm": 0.03428078442811966, "learning_rate": 1.8870845440174632e-06, "loss": 0.1389, "num_input_tokens_seen": 32756384, "step": 48600 }, { "epoch": 1.1874282363862898, "grad_norm": 12.763643264770508, "learning_rate": 1.8870451760560785e-06, "loss": 0.0459, "num_input_tokens_seen": 32759904, "step": 48605 }, { "epoch": 1.187550387218137, "grad_norm": 166.57456970214844, "learning_rate": 1.8870058016438629e-06, "loss": 0.0503, "num_input_tokens_seen": 32763232, "step": 48610 }, { "epoch": 1.1876725380499842, "grad_norm": 28.10552978515625, "learning_rate": 1.8869664207811025e-06, "loss": 0.1476, "num_input_tokens_seen": 32766752, "step": 48615 }, { "epoch": 1.1877946888818314, "grad_norm": 0.1488976925611496, "learning_rate": 1.8869270334680833e-06, "loss": 0.05, "num_input_tokens_seen": 32770336, "step": 48620 }, { "epoch": 1.1879168397136786, "grad_norm": 0.13166551291942596, "learning_rate": 1.8868876397050925e-06, "loss": 0.1422, "num_input_tokens_seen": 32773600, "step": 48625 }, { "epoch": 1.1880389905455255, "grad_norm": 0.9259403347969055, "learning_rate": 1.8868482394924163e-06, "loss": 0.1041, "num_input_tokens_seen": 32776800, "step": 48630 }, { "epoch": 1.1881611413773727, "grad_norm": 0.08954665809869766, "learning_rate": 1.886808832830341e-06, "loss": 0.0052, "num_input_tokens_seen": 32780064, "step": 48635 }, { "epoch": 1.18828329220922, "grad_norm": 12.26470947265625, "learning_rate": 1.8867694197191536e-06, "loss": 0.0493, "num_input_tokens_seen": 32783648, "step": 48640 }, { "epoch": 1.188405443041067, "grad_norm": 24.66190528869629, "learning_rate": 1.8867300001591402e-06, "loss": 0.0694, "num_input_tokens_seen": 32787296, "step": 48645 }, { "epoch": 1.1885275938729143, "grad_norm": 0.8406387567520142, "learning_rate": 1.8866905741505878e-06, "loss": 0.0524, "num_input_tokens_seen": 32790688, "step": 48650 }, { "epoch": 1.1886497447047615, "grad_norm": 13.112702369689941, "learning_rate": 1.8866511416937833e-06, "loss": 0.1508, "num_input_tokens_seen": 32793824, "step": 48655 }, { "epoch": 1.1887718955366087, "grad_norm": 1.6888105869293213, "learning_rate": 1.8866117027890128e-06, "loss": 0.1519, "num_input_tokens_seen": 32797664, "step": 48660 }, { "epoch": 1.1888940463684559, "grad_norm": 9.892433166503906, "learning_rate": 1.8865722574365639e-06, "loss": 0.1448, "num_input_tokens_seen": 32801376, "step": 48665 }, { "epoch": 1.1890161972003028, "grad_norm": 90.39240264892578, "learning_rate": 1.8865328056367229e-06, "loss": 0.1891, "num_input_tokens_seen": 32805280, "step": 48670 }, { "epoch": 1.18913834803215, "grad_norm": 51.569114685058594, "learning_rate": 1.886493347389777e-06, "loss": 0.0652, "num_input_tokens_seen": 32808608, "step": 48675 }, { "epoch": 1.1892604988639972, "grad_norm": 22.322158813476562, "learning_rate": 1.886453882696013e-06, "loss": 0.098, "num_input_tokens_seen": 32811808, "step": 48680 }, { "epoch": 1.1893826496958444, "grad_norm": 0.790939450263977, "learning_rate": 1.8864144115557177e-06, "loss": 0.0018, "num_input_tokens_seen": 32815456, "step": 48685 }, { "epoch": 1.1895048005276916, "grad_norm": 0.37682557106018066, "learning_rate": 1.8863749339691788e-06, "loss": 0.0915, "num_input_tokens_seen": 32818720, "step": 48690 }, { "epoch": 1.1896269513595388, "grad_norm": 166.25892639160156, "learning_rate": 1.8863354499366825e-06, "loss": 0.0542, "num_input_tokens_seen": 32821984, "step": 48695 }, { "epoch": 1.189749102191386, "grad_norm": 0.7915534973144531, "learning_rate": 1.8862959594585166e-06, "loss": 0.0797, "num_input_tokens_seen": 32825440, "step": 48700 }, { "epoch": 1.1898712530232332, "grad_norm": 0.34774404764175415, "learning_rate": 1.8862564625349683e-06, "loss": 0.102, "num_input_tokens_seen": 32828576, "step": 48705 }, { "epoch": 1.1899934038550803, "grad_norm": 0.022061169147491455, "learning_rate": 1.8862169591663247e-06, "loss": 0.0545, "num_input_tokens_seen": 32832608, "step": 48710 }, { "epoch": 1.1901155546869275, "grad_norm": 0.333593487739563, "learning_rate": 1.8861774493528725e-06, "loss": 0.1036, "num_input_tokens_seen": 32835936, "step": 48715 }, { "epoch": 1.1902377055187745, "grad_norm": 13.503067016601562, "learning_rate": 1.8861379330949002e-06, "loss": 0.1614, "num_input_tokens_seen": 32839264, "step": 48720 }, { "epoch": 1.1903598563506217, "grad_norm": 0.42545977234840393, "learning_rate": 1.886098410392694e-06, "loss": 0.0346, "num_input_tokens_seen": 32842656, "step": 48725 }, { "epoch": 1.1904820071824689, "grad_norm": 10.114810943603516, "learning_rate": 1.886058881246542e-06, "loss": 0.0614, "num_input_tokens_seen": 32846176, "step": 48730 }, { "epoch": 1.190604158014316, "grad_norm": 0.23883959650993347, "learning_rate": 1.8860193456567313e-06, "loss": 0.0386, "num_input_tokens_seen": 32849568, "step": 48735 }, { "epoch": 1.1907263088461633, "grad_norm": 15.07218074798584, "learning_rate": 1.8859798036235498e-06, "loss": 0.1608, "num_input_tokens_seen": 32853088, "step": 48740 }, { "epoch": 1.1908484596780105, "grad_norm": 0.1844908893108368, "learning_rate": 1.8859402551472847e-06, "loss": 0.0523, "num_input_tokens_seen": 32856544, "step": 48745 }, { "epoch": 1.1909706105098576, "grad_norm": 14.992534637451172, "learning_rate": 1.8859007002282242e-06, "loss": 0.0688, "num_input_tokens_seen": 32860064, "step": 48750 }, { "epoch": 1.1910927613417048, "grad_norm": 16.308456420898438, "learning_rate": 1.8858611388666552e-06, "loss": 0.2705, "num_input_tokens_seen": 32863136, "step": 48755 }, { "epoch": 1.1912149121735518, "grad_norm": 0.43662139773368835, "learning_rate": 1.8858215710628657e-06, "loss": 0.0421, "num_input_tokens_seen": 32866464, "step": 48760 }, { "epoch": 1.191337063005399, "grad_norm": 76.70539855957031, "learning_rate": 1.8857819968171436e-06, "loss": 0.1451, "num_input_tokens_seen": 32870176, "step": 48765 }, { "epoch": 1.1914592138372462, "grad_norm": 8.730032920837402, "learning_rate": 1.8857424161297764e-06, "loss": 0.0495, "num_input_tokens_seen": 32873760, "step": 48770 }, { "epoch": 1.1915813646690934, "grad_norm": 20.521678924560547, "learning_rate": 1.8857028290010524e-06, "loss": 0.172, "num_input_tokens_seen": 32876896, "step": 48775 }, { "epoch": 1.1917035155009406, "grad_norm": 1.9542988538742065, "learning_rate": 1.885663235431259e-06, "loss": 0.0676, "num_input_tokens_seen": 32880608, "step": 48780 }, { "epoch": 1.1918256663327877, "grad_norm": 0.7331753373146057, "learning_rate": 1.8856236354206843e-06, "loss": 0.0525, "num_input_tokens_seen": 32883680, "step": 48785 }, { "epoch": 1.191947817164635, "grad_norm": 1.1589040756225586, "learning_rate": 1.8855840289696165e-06, "loss": 0.0794, "num_input_tokens_seen": 32886816, "step": 48790 }, { "epoch": 1.1920699679964821, "grad_norm": 12.341038703918457, "learning_rate": 1.885544416078343e-06, "loss": 0.0449, "num_input_tokens_seen": 32890144, "step": 48795 }, { "epoch": 1.1921921188283293, "grad_norm": 9.298717498779297, "learning_rate": 1.885504796747153e-06, "loss": 0.3148, "num_input_tokens_seen": 32893024, "step": 48800 }, { "epoch": 1.1923142696601765, "grad_norm": 31.702577590942383, "learning_rate": 1.8854651709763334e-06, "loss": 0.0766, "num_input_tokens_seen": 32897056, "step": 48805 }, { "epoch": 1.1924364204920235, "grad_norm": 17.743892669677734, "learning_rate": 1.8854255387661734e-06, "loss": 0.0641, "num_input_tokens_seen": 32900064, "step": 48810 }, { "epoch": 1.1925585713238707, "grad_norm": 0.3661781847476959, "learning_rate": 1.8853859001169603e-06, "loss": 0.1346, "num_input_tokens_seen": 32903456, "step": 48815 }, { "epoch": 1.1926807221557179, "grad_norm": 9.447279930114746, "learning_rate": 1.8853462550289829e-06, "loss": 0.1509, "num_input_tokens_seen": 32906528, "step": 48820 }, { "epoch": 1.192802872987565, "grad_norm": 0.5084399580955505, "learning_rate": 1.8853066035025295e-06, "loss": 0.1062, "num_input_tokens_seen": 32909920, "step": 48825 }, { "epoch": 1.1929250238194122, "grad_norm": 1.1484479904174805, "learning_rate": 1.8852669455378884e-06, "loss": 0.0804, "num_input_tokens_seen": 32913824, "step": 48830 }, { "epoch": 1.1930471746512594, "grad_norm": 26.59931182861328, "learning_rate": 1.8852272811353477e-06, "loss": 0.0714, "num_input_tokens_seen": 32917216, "step": 48835 }, { "epoch": 1.1931693254831066, "grad_norm": 21.390541076660156, "learning_rate": 1.8851876102951964e-06, "loss": 0.0757, "num_input_tokens_seen": 32920544, "step": 48840 }, { "epoch": 1.1932914763149536, "grad_norm": 0.13798239827156067, "learning_rate": 1.8851479330177228e-06, "loss": 0.0366, "num_input_tokens_seen": 32924000, "step": 48845 }, { "epoch": 1.1934136271468008, "grad_norm": 1.2368896007537842, "learning_rate": 1.885108249303215e-06, "loss": 0.0422, "num_input_tokens_seen": 32927456, "step": 48850 }, { "epoch": 1.193535777978648, "grad_norm": 0.20851171016693115, "learning_rate": 1.885068559151962e-06, "loss": 0.0846, "num_input_tokens_seen": 32930912, "step": 48855 }, { "epoch": 1.1936579288104952, "grad_norm": 0.3231092393398285, "learning_rate": 1.8850288625642525e-06, "loss": 0.0491, "num_input_tokens_seen": 32934176, "step": 48860 }, { "epoch": 1.1937800796423423, "grad_norm": 11.667762756347656, "learning_rate": 1.8849891595403752e-06, "loss": 0.0707, "num_input_tokens_seen": 32937952, "step": 48865 }, { "epoch": 1.1939022304741895, "grad_norm": 0.18369610607624054, "learning_rate": 1.8849494500806187e-06, "loss": 0.0488, "num_input_tokens_seen": 32941088, "step": 48870 }, { "epoch": 1.1940243813060367, "grad_norm": 9.249509811401367, "learning_rate": 1.8849097341852716e-06, "loss": 0.1003, "num_input_tokens_seen": 32944032, "step": 48875 }, { "epoch": 1.194146532137884, "grad_norm": 0.4781881868839264, "learning_rate": 1.884870011854623e-06, "loss": 0.0657, "num_input_tokens_seen": 32947488, "step": 48880 }, { "epoch": 1.194268682969731, "grad_norm": 0.17906363308429718, "learning_rate": 1.8848302830889615e-06, "loss": 0.0476, "num_input_tokens_seen": 32950688, "step": 48885 }, { "epoch": 1.1943908338015783, "grad_norm": 0.6338130235671997, "learning_rate": 1.8847905478885764e-06, "loss": 0.1174, "num_input_tokens_seen": 32953952, "step": 48890 }, { "epoch": 1.1945129846334253, "grad_norm": 18.3079891204834, "learning_rate": 1.884750806253756e-06, "loss": 0.0362, "num_input_tokens_seen": 32957344, "step": 48895 }, { "epoch": 1.1946351354652724, "grad_norm": 12.767355918884277, "learning_rate": 1.8847110581847902e-06, "loss": 0.0785, "num_input_tokens_seen": 32960672, "step": 48900 }, { "epoch": 1.1947572862971196, "grad_norm": 0.2379637509584427, "learning_rate": 1.8846713036819677e-06, "loss": 0.0789, "num_input_tokens_seen": 32963872, "step": 48905 }, { "epoch": 1.1948794371289668, "grad_norm": 0.39095956087112427, "learning_rate": 1.8846315427455774e-06, "loss": 0.099, "num_input_tokens_seen": 32967328, "step": 48910 }, { "epoch": 1.195001587960814, "grad_norm": 0.7817822694778442, "learning_rate": 1.8845917753759086e-06, "loss": 0.0887, "num_input_tokens_seen": 32970848, "step": 48915 }, { "epoch": 1.1951237387926612, "grad_norm": 8.41550350189209, "learning_rate": 1.8845520015732503e-06, "loss": 0.1821, "num_input_tokens_seen": 32974176, "step": 48920 }, { "epoch": 1.1952458896245084, "grad_norm": 0.13997584581375122, "learning_rate": 1.8845122213378921e-06, "loss": 0.0193, "num_input_tokens_seen": 32977376, "step": 48925 }, { "epoch": 1.1953680404563556, "grad_norm": 1.8125817775726318, "learning_rate": 1.884472434670123e-06, "loss": 0.1289, "num_input_tokens_seen": 32980576, "step": 48930 }, { "epoch": 1.1954901912882026, "grad_norm": 25.555908203125, "learning_rate": 1.8844326415702328e-06, "loss": 0.0802, "num_input_tokens_seen": 32983584, "step": 48935 }, { "epoch": 1.1956123421200497, "grad_norm": 12.59317684173584, "learning_rate": 1.8843928420385101e-06, "loss": 0.109, "num_input_tokens_seen": 32986720, "step": 48940 }, { "epoch": 1.195734492951897, "grad_norm": 18.369953155517578, "learning_rate": 1.884353036075245e-06, "loss": 0.0378, "num_input_tokens_seen": 32990496, "step": 48945 }, { "epoch": 1.1958566437837441, "grad_norm": 0.26828473806381226, "learning_rate": 1.8843132236807268e-06, "loss": 0.1408, "num_input_tokens_seen": 32993760, "step": 48950 }, { "epoch": 1.1959787946155913, "grad_norm": 0.13218581676483154, "learning_rate": 1.8842734048552451e-06, "loss": 0.0941, "num_input_tokens_seen": 32997024, "step": 48955 }, { "epoch": 1.1961009454474385, "grad_norm": 2.13213849067688, "learning_rate": 1.884233579599089e-06, "loss": 0.0022, "num_input_tokens_seen": 33000992, "step": 48960 }, { "epoch": 1.1962230962792857, "grad_norm": 0.08893398940563202, "learning_rate": 1.8841937479125488e-06, "loss": 0.048, "num_input_tokens_seen": 33004256, "step": 48965 }, { "epoch": 1.1963452471111329, "grad_norm": 33.54923629760742, "learning_rate": 1.8841539097959135e-06, "loss": 0.175, "num_input_tokens_seen": 33008032, "step": 48970 }, { "epoch": 1.19646739794298, "grad_norm": 11.464241981506348, "learning_rate": 1.8841140652494736e-06, "loss": 0.1101, "num_input_tokens_seen": 33011680, "step": 48975 }, { "epoch": 1.1965895487748273, "grad_norm": 18.33887481689453, "learning_rate": 1.8840742142735179e-06, "loss": 0.1273, "num_input_tokens_seen": 33014880, "step": 48980 }, { "epoch": 1.1967116996066742, "grad_norm": 21.554765701293945, "learning_rate": 1.8840343568683373e-06, "loss": 0.1129, "num_input_tokens_seen": 33018272, "step": 48985 }, { "epoch": 1.1968338504385214, "grad_norm": 21.3913631439209, "learning_rate": 1.8839944930342207e-06, "loss": 0.0782, "num_input_tokens_seen": 33021664, "step": 48990 }, { "epoch": 1.1969560012703686, "grad_norm": 14.357202529907227, "learning_rate": 1.8839546227714584e-06, "loss": 0.122, "num_input_tokens_seen": 33024608, "step": 48995 }, { "epoch": 1.1970781521022158, "grad_norm": 67.93797302246094, "learning_rate": 1.8839147460803404e-06, "loss": 0.0397, "num_input_tokens_seen": 33027872, "step": 49000 }, { "epoch": 1.197200302934063, "grad_norm": 22.977998733520508, "learning_rate": 1.8838748629611568e-06, "loss": 0.1236, "num_input_tokens_seen": 33031136, "step": 49005 }, { "epoch": 1.1973224537659102, "grad_norm": 0.3773641586303711, "learning_rate": 1.8838349734141972e-06, "loss": 0.0368, "num_input_tokens_seen": 33034144, "step": 49010 }, { "epoch": 1.1974446045977574, "grad_norm": 0.11255284398794174, "learning_rate": 1.8837950774397519e-06, "loss": 0.0434, "num_input_tokens_seen": 33037344, "step": 49015 }, { "epoch": 1.1975667554296046, "grad_norm": 0.18880581855773926, "learning_rate": 1.8837551750381114e-06, "loss": 0.0687, "num_input_tokens_seen": 33040416, "step": 49020 }, { "epoch": 1.1976889062614515, "grad_norm": 162.01095581054688, "learning_rate": 1.8837152662095654e-06, "loss": 0.0608, "num_input_tokens_seen": 33043552, "step": 49025 }, { "epoch": 1.1978110570932987, "grad_norm": 0.333495557308197, "learning_rate": 1.8836753509544043e-06, "loss": 0.0286, "num_input_tokens_seen": 33046752, "step": 49030 }, { "epoch": 1.197933207925146, "grad_norm": 10.324767112731934, "learning_rate": 1.8836354292729184e-06, "loss": 0.0565, "num_input_tokens_seen": 33050080, "step": 49035 }, { "epoch": 1.198055358756993, "grad_norm": 16.853994369506836, "learning_rate": 1.8835955011653977e-06, "loss": 0.2166, "num_input_tokens_seen": 33053344, "step": 49040 }, { "epoch": 1.1981775095888403, "grad_norm": 0.9257891774177551, "learning_rate": 1.8835555666321333e-06, "loss": 0.0579, "num_input_tokens_seen": 33056544, "step": 49045 }, { "epoch": 1.1982996604206875, "grad_norm": 9.326736450195312, "learning_rate": 1.8835156256734148e-06, "loss": 0.0819, "num_input_tokens_seen": 33059616, "step": 49050 }, { "epoch": 1.1984218112525347, "grad_norm": 17.777437210083008, "learning_rate": 1.8834756782895331e-06, "loss": 0.0732, "num_input_tokens_seen": 33062944, "step": 49055 }, { "epoch": 1.1985439620843819, "grad_norm": 1.6575427055358887, "learning_rate": 1.883435724480779e-06, "loss": 0.0728, "num_input_tokens_seen": 33066464, "step": 49060 }, { "epoch": 1.198666112916229, "grad_norm": 0.591008722782135, "learning_rate": 1.8833957642474424e-06, "loss": 0.0304, "num_input_tokens_seen": 33070752, "step": 49065 }, { "epoch": 1.1987882637480762, "grad_norm": 0.5434102416038513, "learning_rate": 1.8833557975898141e-06, "loss": 0.0928, "num_input_tokens_seen": 33073952, "step": 49070 }, { "epoch": 1.1989104145799232, "grad_norm": 0.11578580737113953, "learning_rate": 1.883315824508185e-06, "loss": 0.081, "num_input_tokens_seen": 33077152, "step": 49075 }, { "epoch": 1.1990325654117704, "grad_norm": 0.2963810861110687, "learning_rate": 1.8832758450028456e-06, "loss": 0.0835, "num_input_tokens_seen": 33080544, "step": 49080 }, { "epoch": 1.1991547162436176, "grad_norm": 0.2604496479034424, "learning_rate": 1.883235859074087e-06, "loss": 0.0907, "num_input_tokens_seen": 33083872, "step": 49085 }, { "epoch": 1.1992768670754648, "grad_norm": 8.561380386352539, "learning_rate": 1.8831958667221992e-06, "loss": 0.1061, "num_input_tokens_seen": 33087520, "step": 49090 }, { "epoch": 1.199399017907312, "grad_norm": 125.36334228515625, "learning_rate": 1.8831558679474738e-06, "loss": 0.1422, "num_input_tokens_seen": 33090912, "step": 49095 }, { "epoch": 1.1995211687391591, "grad_norm": 21.69564437866211, "learning_rate": 1.8831158627502012e-06, "loss": 0.1067, "num_input_tokens_seen": 33094304, "step": 49100 }, { "epoch": 1.1996433195710063, "grad_norm": 0.19535818696022034, "learning_rate": 1.8830758511306726e-06, "loss": 0.0296, "num_input_tokens_seen": 33097504, "step": 49105 }, { "epoch": 1.1997654704028535, "grad_norm": 0.19155330955982208, "learning_rate": 1.8830358330891789e-06, "loss": 0.1116, "num_input_tokens_seen": 33100704, "step": 49110 }, { "epoch": 1.1998876212347005, "grad_norm": 15.071712493896484, "learning_rate": 1.882995808626011e-06, "loss": 0.0305, "num_input_tokens_seen": 33103968, "step": 49115 }, { "epoch": 1.2000097720665477, "grad_norm": 0.22924207150936127, "learning_rate": 1.8829557777414602e-06, "loss": 0.0876, "num_input_tokens_seen": 33107680, "step": 49120 }, { "epoch": 1.2001319228983949, "grad_norm": 29.139514923095703, "learning_rate": 1.8829157404358176e-06, "loss": 0.2117, "num_input_tokens_seen": 33110944, "step": 49125 }, { "epoch": 1.200254073730242, "grad_norm": 1.1408504247665405, "learning_rate": 1.882875696709374e-06, "loss": 0.1363, "num_input_tokens_seen": 33114208, "step": 49130 }, { "epoch": 1.2003762245620893, "grad_norm": 1.3661071062088013, "learning_rate": 1.882835646562421e-06, "loss": 0.0076, "num_input_tokens_seen": 33117600, "step": 49135 }, { "epoch": 1.2004983753939364, "grad_norm": 34.19418716430664, "learning_rate": 1.8827955899952497e-06, "loss": 0.1247, "num_input_tokens_seen": 33121248, "step": 49140 }, { "epoch": 1.2006205262257836, "grad_norm": 0.4470491409301758, "learning_rate": 1.8827555270081513e-06, "loss": 0.0067, "num_input_tokens_seen": 33124512, "step": 49145 }, { "epoch": 1.2007426770576308, "grad_norm": 0.7858008146286011, "learning_rate": 1.8827154576014178e-06, "loss": 0.196, "num_input_tokens_seen": 33127840, "step": 49150 }, { "epoch": 1.200864827889478, "grad_norm": 54.617897033691406, "learning_rate": 1.8826753817753396e-06, "loss": 0.0318, "num_input_tokens_seen": 33130784, "step": 49155 }, { "epoch": 1.2009869787213252, "grad_norm": 50.12831115722656, "learning_rate": 1.8826352995302086e-06, "loss": 0.1143, "num_input_tokens_seen": 33134176, "step": 49160 }, { "epoch": 1.2011091295531722, "grad_norm": 1.1908376216888428, "learning_rate": 1.8825952108663163e-06, "loss": 0.0035, "num_input_tokens_seen": 33137248, "step": 49165 }, { "epoch": 1.2012312803850194, "grad_norm": 24.506649017333984, "learning_rate": 1.8825551157839543e-06, "loss": 0.1935, "num_input_tokens_seen": 33140512, "step": 49170 }, { "epoch": 1.2013534312168666, "grad_norm": 0.5442198514938354, "learning_rate": 1.8825150142834143e-06, "loss": 0.1047, "num_input_tokens_seen": 33143968, "step": 49175 }, { "epoch": 1.2014755820487137, "grad_norm": 0.2753124535083771, "learning_rate": 1.8824749063649876e-06, "loss": 0.0758, "num_input_tokens_seen": 33147616, "step": 49180 }, { "epoch": 1.201597732880561, "grad_norm": 27.034799575805664, "learning_rate": 1.882434792028966e-06, "loss": 0.2307, "num_input_tokens_seen": 33151008, "step": 49185 }, { "epoch": 1.2017198837124081, "grad_norm": 31.514392852783203, "learning_rate": 1.8823946712756413e-06, "loss": 0.0385, "num_input_tokens_seen": 33154976, "step": 49190 }, { "epoch": 1.2018420345442553, "grad_norm": 17.733539581298828, "learning_rate": 1.8823545441053053e-06, "loss": 0.1566, "num_input_tokens_seen": 33158240, "step": 49195 }, { "epoch": 1.2019641853761025, "grad_norm": 21.296964645385742, "learning_rate": 1.8823144105182496e-06, "loss": 0.1751, "num_input_tokens_seen": 33162016, "step": 49200 }, { "epoch": 1.2020863362079495, "grad_norm": 0.2633887827396393, "learning_rate": 1.8822742705147663e-06, "loss": 0.126, "num_input_tokens_seen": 33165536, "step": 49205 }, { "epoch": 1.2022084870397967, "grad_norm": 87.6427230834961, "learning_rate": 1.8822341240951469e-06, "loss": 0.1073, "num_input_tokens_seen": 33168864, "step": 49210 }, { "epoch": 1.2023306378716438, "grad_norm": 26.27444839477539, "learning_rate": 1.882193971259684e-06, "loss": 0.1445, "num_input_tokens_seen": 33172192, "step": 49215 }, { "epoch": 1.202452788703491, "grad_norm": 8.962058067321777, "learning_rate": 1.8821538120086693e-06, "loss": 0.0778, "num_input_tokens_seen": 33175840, "step": 49220 }, { "epoch": 1.2025749395353382, "grad_norm": 0.818984866142273, "learning_rate": 1.8821136463423945e-06, "loss": 0.1007, "num_input_tokens_seen": 33179232, "step": 49225 }, { "epoch": 1.2026970903671854, "grad_norm": 0.5337079763412476, "learning_rate": 1.8820734742611522e-06, "loss": 0.0891, "num_input_tokens_seen": 33182496, "step": 49230 }, { "epoch": 1.2028192411990326, "grad_norm": 1.8495513200759888, "learning_rate": 1.8820332957652342e-06, "loss": 0.0656, "num_input_tokens_seen": 33185888, "step": 49235 }, { "epoch": 1.2029413920308798, "grad_norm": 0.3298915922641754, "learning_rate": 1.881993110854933e-06, "loss": 0.0315, "num_input_tokens_seen": 33189664, "step": 49240 }, { "epoch": 1.203063542862727, "grad_norm": 81.36953735351562, "learning_rate": 1.8819529195305405e-06, "loss": 0.0093, "num_input_tokens_seen": 33192736, "step": 49245 }, { "epoch": 1.2031856936945742, "grad_norm": 0.44343122839927673, "learning_rate": 1.8819127217923492e-06, "loss": 0.0607, "num_input_tokens_seen": 33196576, "step": 49250 }, { "epoch": 1.2033078445264211, "grad_norm": 0.6930961012840271, "learning_rate": 1.8818725176406515e-06, "loss": 0.0382, "num_input_tokens_seen": 33199776, "step": 49255 }, { "epoch": 1.2034299953582683, "grad_norm": 3.0237956047058105, "learning_rate": 1.8818323070757397e-06, "loss": 0.106, "num_input_tokens_seen": 33203040, "step": 49260 }, { "epoch": 1.2035521461901155, "grad_norm": 0.3650870621204376, "learning_rate": 1.881792090097906e-06, "loss": 0.093, "num_input_tokens_seen": 33206240, "step": 49265 }, { "epoch": 1.2036742970219627, "grad_norm": 31.69427490234375, "learning_rate": 1.881751866707443e-06, "loss": 0.1329, "num_input_tokens_seen": 33209248, "step": 49270 }, { "epoch": 1.20379644785381, "grad_norm": 0.1792955845594406, "learning_rate": 1.8817116369046435e-06, "loss": 0.0011, "num_input_tokens_seen": 33212384, "step": 49275 }, { "epoch": 1.203918598685657, "grad_norm": 0.19762201607227325, "learning_rate": 1.8816714006897998e-06, "loss": 0.0017, "num_input_tokens_seen": 33215840, "step": 49280 }, { "epoch": 1.2040407495175043, "grad_norm": 154.009765625, "learning_rate": 1.8816311580632042e-06, "loss": 0.114, "num_input_tokens_seen": 33219232, "step": 49285 }, { "epoch": 1.2041629003493515, "grad_norm": 40.82319259643555, "learning_rate": 1.88159090902515e-06, "loss": 0.0705, "num_input_tokens_seen": 33222752, "step": 49290 }, { "epoch": 1.2042850511811984, "grad_norm": 0.04902821406722069, "learning_rate": 1.8815506535759296e-06, "loss": 0.1219, "num_input_tokens_seen": 33225952, "step": 49295 }, { "epoch": 1.2044072020130456, "grad_norm": 20.65225601196289, "learning_rate": 1.8815103917158356e-06, "loss": 0.1627, "num_input_tokens_seen": 33229664, "step": 49300 }, { "epoch": 1.2045293528448928, "grad_norm": 0.23126031458377838, "learning_rate": 1.881470123445161e-06, "loss": 0.0472, "num_input_tokens_seen": 33233504, "step": 49305 }, { "epoch": 1.20465150367674, "grad_norm": 0.09219006448984146, "learning_rate": 1.8814298487641986e-06, "loss": 0.0009, "num_input_tokens_seen": 33236512, "step": 49310 }, { "epoch": 1.2047736545085872, "grad_norm": 0.14880770444869995, "learning_rate": 1.8813895676732411e-06, "loss": 0.0599, "num_input_tokens_seen": 33239968, "step": 49315 }, { "epoch": 1.2048958053404344, "grad_norm": 110.7625503540039, "learning_rate": 1.8813492801725818e-06, "loss": 0.0182, "num_input_tokens_seen": 33243104, "step": 49320 }, { "epoch": 1.2050179561722816, "grad_norm": 0.23787957429885864, "learning_rate": 1.8813089862625136e-06, "loss": 0.1218, "num_input_tokens_seen": 33246112, "step": 49325 }, { "epoch": 1.2051401070041288, "grad_norm": 0.011504840105772018, "learning_rate": 1.881268685943329e-06, "loss": 0.0602, "num_input_tokens_seen": 33249568, "step": 49330 }, { "epoch": 1.205262257835976, "grad_norm": 0.07925833016633987, "learning_rate": 1.881228379215322e-06, "loss": 0.0861, "num_input_tokens_seen": 33253472, "step": 49335 }, { "epoch": 1.2053844086678231, "grad_norm": 0.21478639543056488, "learning_rate": 1.8811880660787846e-06, "loss": 0.1753, "num_input_tokens_seen": 33257120, "step": 49340 }, { "epoch": 1.2055065594996701, "grad_norm": 0.07961128652095795, "learning_rate": 1.881147746534011e-06, "loss": 0.0815, "num_input_tokens_seen": 33260384, "step": 49345 }, { "epoch": 1.2056287103315173, "grad_norm": 22.093706130981445, "learning_rate": 1.8811074205812938e-06, "loss": 0.0344, "num_input_tokens_seen": 33263712, "step": 49350 }, { "epoch": 1.2057508611633645, "grad_norm": 0.31056874990463257, "learning_rate": 1.8810670882209264e-06, "loss": 0.0385, "num_input_tokens_seen": 33266656, "step": 49355 }, { "epoch": 1.2058730119952117, "grad_norm": 21.597169876098633, "learning_rate": 1.8810267494532025e-06, "loss": 0.1348, "num_input_tokens_seen": 33269792, "step": 49360 }, { "epoch": 1.2059951628270589, "grad_norm": 0.34313809871673584, "learning_rate": 1.8809864042784147e-06, "loss": 0.059, "num_input_tokens_seen": 33272992, "step": 49365 }, { "epoch": 1.206117313658906, "grad_norm": 22.627899169921875, "learning_rate": 1.880946052696857e-06, "loss": 0.0039, "num_input_tokens_seen": 33276320, "step": 49370 }, { "epoch": 1.2062394644907533, "grad_norm": 0.4183606207370758, "learning_rate": 1.8809056947088226e-06, "loss": 0.1022, "num_input_tokens_seen": 33279712, "step": 49375 }, { "epoch": 1.2063616153226002, "grad_norm": 11.300501823425293, "learning_rate": 1.880865330314605e-06, "loss": 0.0726, "num_input_tokens_seen": 33282912, "step": 49380 }, { "epoch": 1.2064837661544474, "grad_norm": 0.0277280081063509, "learning_rate": 1.880824959514498e-06, "loss": 0.1336, "num_input_tokens_seen": 33286048, "step": 49385 }, { "epoch": 1.2066059169862946, "grad_norm": 0.16218620538711548, "learning_rate": 1.8807845823087952e-06, "loss": 0.0506, "num_input_tokens_seen": 33289120, "step": 49390 }, { "epoch": 1.2067280678181418, "grad_norm": 0.07739616930484772, "learning_rate": 1.8807441986977894e-06, "loss": 0.0213, "num_input_tokens_seen": 33292640, "step": 49395 }, { "epoch": 1.206850218649989, "grad_norm": 16.28183937072754, "learning_rate": 1.8807038086817752e-06, "loss": 0.1098, "num_input_tokens_seen": 33296224, "step": 49400 }, { "epoch": 1.2069723694818362, "grad_norm": 0.10570183396339417, "learning_rate": 1.8806634122610461e-06, "loss": 0.1692, "num_input_tokens_seen": 33299488, "step": 49405 }, { "epoch": 1.2070945203136834, "grad_norm": 0.07286083698272705, "learning_rate": 1.8806230094358954e-06, "loss": 0.1198, "num_input_tokens_seen": 33302752, "step": 49410 }, { "epoch": 1.2072166711455306, "grad_norm": 95.79071807861328, "learning_rate": 1.8805826002066178e-06, "loss": 0.0987, "num_input_tokens_seen": 33305696, "step": 49415 }, { "epoch": 1.2073388219773777, "grad_norm": 21.291419982910156, "learning_rate": 1.8805421845735065e-06, "loss": 0.1584, "num_input_tokens_seen": 33308768, "step": 49420 }, { "epoch": 1.207460972809225, "grad_norm": 0.056146129965782166, "learning_rate": 1.8805017625368555e-06, "loss": 0.0575, "num_input_tokens_seen": 33312288, "step": 49425 }, { "epoch": 1.2075831236410721, "grad_norm": 0.043821725994348526, "learning_rate": 1.8804613340969592e-06, "loss": 0.1136, "num_input_tokens_seen": 33315616, "step": 49430 }, { "epoch": 1.207705274472919, "grad_norm": 0.25115376710891724, "learning_rate": 1.880420899254111e-06, "loss": 0.0248, "num_input_tokens_seen": 33318880, "step": 49435 }, { "epoch": 1.2078274253047663, "grad_norm": 0.3757718503475189, "learning_rate": 1.8803804580086053e-06, "loss": 0.0634, "num_input_tokens_seen": 33322336, "step": 49440 }, { "epoch": 1.2079495761366135, "grad_norm": 34.809120178222656, "learning_rate": 1.8803400103607362e-06, "loss": 0.1465, "num_input_tokens_seen": 33325536, "step": 49445 }, { "epoch": 1.2080717269684607, "grad_norm": 0.46399661898612976, "learning_rate": 1.8802995563107972e-06, "loss": 0.0011, "num_input_tokens_seen": 33329056, "step": 49450 }, { "epoch": 1.2081938778003078, "grad_norm": 39.4713020324707, "learning_rate": 1.8802590958590837e-06, "loss": 0.1953, "num_input_tokens_seen": 33332512, "step": 49455 }, { "epoch": 1.208316028632155, "grad_norm": 126.51252746582031, "learning_rate": 1.8802186290058887e-06, "loss": 0.0797, "num_input_tokens_seen": 33335840, "step": 49460 }, { "epoch": 1.2084381794640022, "grad_norm": 6.298394680023193, "learning_rate": 1.8801781557515078e-06, "loss": 0.0014, "num_input_tokens_seen": 33339168, "step": 49465 }, { "epoch": 1.2085603302958492, "grad_norm": 0.023851916193962097, "learning_rate": 1.8801376760962343e-06, "loss": 0.0374, "num_input_tokens_seen": 33342240, "step": 49470 }, { "epoch": 1.2086824811276964, "grad_norm": 0.04978490248322487, "learning_rate": 1.8800971900403626e-06, "loss": 0.0638, "num_input_tokens_seen": 33345888, "step": 49475 }, { "epoch": 1.2088046319595436, "grad_norm": 0.058799147605895996, "learning_rate": 1.8800566975841878e-06, "loss": 0.023, "num_input_tokens_seen": 33349536, "step": 49480 }, { "epoch": 1.2089267827913908, "grad_norm": 0.6056734919548035, "learning_rate": 1.8800161987280037e-06, "loss": 0.0884, "num_input_tokens_seen": 33352480, "step": 49485 }, { "epoch": 1.209048933623238, "grad_norm": 0.07104521989822388, "learning_rate": 1.8799756934721055e-06, "loss": 0.174, "num_input_tokens_seen": 33355808, "step": 49490 }, { "epoch": 1.2091710844550851, "grad_norm": 0.1675105094909668, "learning_rate": 1.879935181816787e-06, "loss": 0.143, "num_input_tokens_seen": 33359328, "step": 49495 }, { "epoch": 1.2092932352869323, "grad_norm": 0.27164918184280396, "learning_rate": 1.8798946637623434e-06, "loss": 0.0526, "num_input_tokens_seen": 33362592, "step": 49500 }, { "epoch": 1.2094153861187795, "grad_norm": 15.787830352783203, "learning_rate": 1.879854139309069e-06, "loss": 0.2042, "num_input_tokens_seen": 33365728, "step": 49505 }, { "epoch": 1.2095375369506267, "grad_norm": 0.5783581733703613, "learning_rate": 1.8798136084572587e-06, "loss": 0.1629, "num_input_tokens_seen": 33368672, "step": 49510 }, { "epoch": 1.209659687782474, "grad_norm": 0.5477581024169922, "learning_rate": 1.8797730712072072e-06, "loss": 0.0338, "num_input_tokens_seen": 33371744, "step": 49515 }, { "epoch": 1.2097818386143209, "grad_norm": 30.166261672973633, "learning_rate": 1.8797325275592094e-06, "loss": 0.0987, "num_input_tokens_seen": 33375072, "step": 49520 }, { "epoch": 1.209903989446168, "grad_norm": 0.4482729732990265, "learning_rate": 1.8796919775135597e-06, "loss": 0.1201, "num_input_tokens_seen": 33377952, "step": 49525 }, { "epoch": 1.2100261402780152, "grad_norm": 0.22780552506446838, "learning_rate": 1.8796514210705537e-06, "loss": 0.0782, "num_input_tokens_seen": 33381472, "step": 49530 }, { "epoch": 1.2101482911098624, "grad_norm": 0.6763140559196472, "learning_rate": 1.8796108582304857e-06, "loss": 0.0364, "num_input_tokens_seen": 33384800, "step": 49535 }, { "epoch": 1.2102704419417096, "grad_norm": 0.12094619870185852, "learning_rate": 1.8795702889936511e-06, "loss": 0.1381, "num_input_tokens_seen": 33388192, "step": 49540 }, { "epoch": 1.2103925927735568, "grad_norm": 5.095489501953125, "learning_rate": 1.8795297133603446e-06, "loss": 0.1393, "num_input_tokens_seen": 33391648, "step": 49545 }, { "epoch": 1.210514743605404, "grad_norm": 0.1459597945213318, "learning_rate": 1.8794891313308617e-06, "loss": 0.1072, "num_input_tokens_seen": 33394848, "step": 49550 }, { "epoch": 1.2106368944372512, "grad_norm": 16.98642921447754, "learning_rate": 1.8794485429054973e-06, "loss": 0.0801, "num_input_tokens_seen": 33397920, "step": 49555 }, { "epoch": 1.2107590452690982, "grad_norm": 0.34635409712791443, "learning_rate": 1.8794079480845464e-06, "loss": 0.051, "num_input_tokens_seen": 33401760, "step": 49560 }, { "epoch": 1.2108811961009454, "grad_norm": 0.36914634704589844, "learning_rate": 1.8793673468683044e-06, "loss": 0.0797, "num_input_tokens_seen": 33405216, "step": 49565 }, { "epoch": 1.2110033469327925, "grad_norm": 0.21115657687187195, "learning_rate": 1.8793267392570667e-06, "loss": 0.0243, "num_input_tokens_seen": 33408992, "step": 49570 }, { "epoch": 1.2111254977646397, "grad_norm": 0.3436622619628906, "learning_rate": 1.8792861252511282e-06, "loss": 0.0737, "num_input_tokens_seen": 33412512, "step": 49575 }, { "epoch": 1.211247648596487, "grad_norm": 0.27743715047836304, "learning_rate": 1.8792455048507847e-06, "loss": 0.108, "num_input_tokens_seen": 33415904, "step": 49580 }, { "epoch": 1.2113697994283341, "grad_norm": 0.7445909380912781, "learning_rate": 1.8792048780563311e-06, "loss": 0.0952, "num_input_tokens_seen": 33419744, "step": 49585 }, { "epoch": 1.2114919502601813, "grad_norm": 0.9027830362319946, "learning_rate": 1.8791642448680633e-06, "loss": 0.0513, "num_input_tokens_seen": 33422688, "step": 49590 }, { "epoch": 1.2116141010920285, "grad_norm": 8.377222061157227, "learning_rate": 1.879123605286277e-06, "loss": 0.0624, "num_input_tokens_seen": 33425888, "step": 49595 }, { "epoch": 1.2117362519238757, "grad_norm": 174.65496826171875, "learning_rate": 1.8790829593112669e-06, "loss": 0.0268, "num_input_tokens_seen": 33428832, "step": 49600 }, { "epoch": 1.2118584027557229, "grad_norm": 1.2196311950683594, "learning_rate": 1.8790423069433294e-06, "loss": 0.0843, "num_input_tokens_seen": 33432672, "step": 49605 }, { "epoch": 1.2119805535875698, "grad_norm": 0.6116811633110046, "learning_rate": 1.8790016481827596e-06, "loss": 0.1263, "num_input_tokens_seen": 33435744, "step": 49610 }, { "epoch": 1.212102704419417, "grad_norm": 1.9640523195266724, "learning_rate": 1.8789609830298534e-06, "loss": 0.0523, "num_input_tokens_seen": 33439328, "step": 49615 }, { "epoch": 1.2122248552512642, "grad_norm": 1.1018770933151245, "learning_rate": 1.8789203114849067e-06, "loss": 0.0036, "num_input_tokens_seen": 33442912, "step": 49620 }, { "epoch": 1.2123470060831114, "grad_norm": 0.18859164416790009, "learning_rate": 1.8788796335482148e-06, "loss": 0.0131, "num_input_tokens_seen": 33446112, "step": 49625 }, { "epoch": 1.2124691569149586, "grad_norm": 23.194774627685547, "learning_rate": 1.878838949220074e-06, "loss": 0.0871, "num_input_tokens_seen": 33449568, "step": 49630 }, { "epoch": 1.2125913077468058, "grad_norm": 15.930150985717773, "learning_rate": 1.87879825850078e-06, "loss": 0.1442, "num_input_tokens_seen": 33452768, "step": 49635 }, { "epoch": 1.212713458578653, "grad_norm": 0.10830620676279068, "learning_rate": 1.8787575613906287e-06, "loss": 0.0307, "num_input_tokens_seen": 33455904, "step": 49640 }, { "epoch": 1.2128356094105002, "grad_norm": 0.17682579159736633, "learning_rate": 1.878716857889916e-06, "loss": 0.0752, "num_input_tokens_seen": 33459296, "step": 49645 }, { "epoch": 1.2129577602423471, "grad_norm": 86.1319351196289, "learning_rate": 1.878676147998938e-06, "loss": 0.1513, "num_input_tokens_seen": 33462752, "step": 49650 }, { "epoch": 1.2130799110741943, "grad_norm": 44.84431457519531, "learning_rate": 1.8786354317179906e-06, "loss": 0.1706, "num_input_tokens_seen": 33466528, "step": 49655 }, { "epoch": 1.2132020619060415, "grad_norm": 79.0781021118164, "learning_rate": 1.8785947090473702e-06, "loss": 0.0216, "num_input_tokens_seen": 33469856, "step": 49660 }, { "epoch": 1.2133242127378887, "grad_norm": 1.6315191984176636, "learning_rate": 1.8785539799873727e-06, "loss": 0.1069, "num_input_tokens_seen": 33473120, "step": 49665 }, { "epoch": 1.213446363569736, "grad_norm": 0.07979770004749298, "learning_rate": 1.8785132445382944e-06, "loss": 0.1485, "num_input_tokens_seen": 33476384, "step": 49670 }, { "epoch": 1.213568514401583, "grad_norm": 0.18451182544231415, "learning_rate": 1.8784725027004313e-06, "loss": 0.0381, "num_input_tokens_seen": 33480096, "step": 49675 }, { "epoch": 1.2136906652334303, "grad_norm": 23.493776321411133, "learning_rate": 1.87843175447408e-06, "loss": 0.0876, "num_input_tokens_seen": 33483232, "step": 49680 }, { "epoch": 1.2138128160652775, "grad_norm": 0.05314216390252113, "learning_rate": 1.8783909998595368e-06, "loss": 0.0839, "num_input_tokens_seen": 33486560, "step": 49685 }, { "epoch": 1.2139349668971247, "grad_norm": 30.70671272277832, "learning_rate": 1.8783502388570978e-06, "loss": 0.1646, "num_input_tokens_seen": 33490208, "step": 49690 }, { "epoch": 1.2140571177289718, "grad_norm": 0.7101901173591614, "learning_rate": 1.8783094714670597e-06, "loss": 0.0009, "num_input_tokens_seen": 33493856, "step": 49695 }, { "epoch": 1.2141792685608188, "grad_norm": 0.2500157356262207, "learning_rate": 1.8782686976897192e-06, "loss": 0.0665, "num_input_tokens_seen": 33497056, "step": 49700 }, { "epoch": 1.214301419392666, "grad_norm": 44.201995849609375, "learning_rate": 1.878227917525372e-06, "loss": 0.0379, "num_input_tokens_seen": 33500896, "step": 49705 }, { "epoch": 1.2144235702245132, "grad_norm": 15.220165252685547, "learning_rate": 1.8781871309743153e-06, "loss": 0.1151, "num_input_tokens_seen": 33504800, "step": 49710 }, { "epoch": 1.2145457210563604, "grad_norm": 154.23687744140625, "learning_rate": 1.8781463380368455e-06, "loss": 0.1571, "num_input_tokens_seen": 33508000, "step": 49715 }, { "epoch": 1.2146678718882076, "grad_norm": 3.0431413650512695, "learning_rate": 1.8781055387132598e-06, "loss": 0.1452, "num_input_tokens_seen": 33511200, "step": 49720 }, { "epoch": 1.2147900227200548, "grad_norm": 55.84829330444336, "learning_rate": 1.8780647330038541e-06, "loss": 0.2403, "num_input_tokens_seen": 33514464, "step": 49725 }, { "epoch": 1.214912173551902, "grad_norm": 45.058982849121094, "learning_rate": 1.8780239209089254e-06, "loss": 0.1515, "num_input_tokens_seen": 33517472, "step": 49730 }, { "epoch": 1.2150343243837491, "grad_norm": 0.10691691935062408, "learning_rate": 1.8779831024287706e-06, "loss": 0.0479, "num_input_tokens_seen": 33521312, "step": 49735 }, { "epoch": 1.215156475215596, "grad_norm": 0.10126947611570358, "learning_rate": 1.8779422775636869e-06, "loss": 0.0816, "num_input_tokens_seen": 33524896, "step": 49740 }, { "epoch": 1.2152786260474433, "grad_norm": 130.59033203125, "learning_rate": 1.8779014463139706e-06, "loss": 0.0805, "num_input_tokens_seen": 33528096, "step": 49745 }, { "epoch": 1.2154007768792905, "grad_norm": 9.010315895080566, "learning_rate": 1.877860608679919e-06, "loss": 0.094, "num_input_tokens_seen": 33531104, "step": 49750 }, { "epoch": 1.2155229277111377, "grad_norm": 0.2826998233795166, "learning_rate": 1.8778197646618285e-06, "loss": 0.0978, "num_input_tokens_seen": 33534496, "step": 49755 }, { "epoch": 1.2156450785429849, "grad_norm": 0.1441763937473297, "learning_rate": 1.8777789142599968e-06, "loss": 0.096, "num_input_tokens_seen": 33537696, "step": 49760 }, { "epoch": 1.215767229374832, "grad_norm": 28.714641571044922, "learning_rate": 1.8777380574747208e-06, "loss": 0.2209, "num_input_tokens_seen": 33541024, "step": 49765 }, { "epoch": 1.2158893802066792, "grad_norm": 119.21171569824219, "learning_rate": 1.8776971943062975e-06, "loss": 0.1062, "num_input_tokens_seen": 33544288, "step": 49770 }, { "epoch": 1.2160115310385264, "grad_norm": 0.9597406983375549, "learning_rate": 1.8776563247550242e-06, "loss": 0.0532, "num_input_tokens_seen": 33547360, "step": 49775 }, { "epoch": 1.2161336818703736, "grad_norm": 0.5183383822441101, "learning_rate": 1.877615448821198e-06, "loss": 0.0779, "num_input_tokens_seen": 33551520, "step": 49780 }, { "epoch": 1.2162558327022208, "grad_norm": 72.81488037109375, "learning_rate": 1.8775745665051161e-06, "loss": 0.0497, "num_input_tokens_seen": 33554528, "step": 49785 }, { "epoch": 1.2163779835340678, "grad_norm": 0.13180294632911682, "learning_rate": 1.8775336778070762e-06, "loss": 0.0029, "num_input_tokens_seen": 33557984, "step": 49790 }, { "epoch": 1.216500134365915, "grad_norm": 0.5422255992889404, "learning_rate": 1.877492782727375e-06, "loss": 0.0614, "num_input_tokens_seen": 33560928, "step": 49795 }, { "epoch": 1.2166222851977622, "grad_norm": 0.05892868712544441, "learning_rate": 1.8774518812663104e-06, "loss": 0.1512, "num_input_tokens_seen": 33564256, "step": 49800 }, { "epoch": 1.2167444360296094, "grad_norm": 0.9378458261489868, "learning_rate": 1.8774109734241798e-06, "loss": 0.0401, "num_input_tokens_seen": 33567584, "step": 49805 }, { "epoch": 1.2168665868614565, "grad_norm": 140.070068359375, "learning_rate": 1.8773700592012806e-06, "loss": 0.0408, "num_input_tokens_seen": 33570976, "step": 49810 }, { "epoch": 1.2169887376933037, "grad_norm": 0.2291070520877838, "learning_rate": 1.8773291385979104e-06, "loss": 0.024, "num_input_tokens_seen": 33573984, "step": 49815 }, { "epoch": 1.217110888525151, "grad_norm": 0.6731608510017395, "learning_rate": 1.8772882116143667e-06, "loss": 0.1288, "num_input_tokens_seen": 33577504, "step": 49820 }, { "epoch": 1.2172330393569981, "grad_norm": 13.080459594726562, "learning_rate": 1.8772472782509473e-06, "loss": 0.1204, "num_input_tokens_seen": 33580960, "step": 49825 }, { "epoch": 1.217355190188845, "grad_norm": 0.11996249854564667, "learning_rate": 1.8772063385079493e-06, "loss": 0.1673, "num_input_tokens_seen": 33584032, "step": 49830 }, { "epoch": 1.2174773410206923, "grad_norm": 0.5631680488586426, "learning_rate": 1.877165392385671e-06, "loss": 0.0721, "num_input_tokens_seen": 33587296, "step": 49835 }, { "epoch": 1.2175994918525395, "grad_norm": 9.505610466003418, "learning_rate": 1.8771244398844104e-06, "loss": 0.0028, "num_input_tokens_seen": 33590368, "step": 49840 }, { "epoch": 1.2177216426843867, "grad_norm": 0.05609140172600746, "learning_rate": 1.8770834810044646e-06, "loss": 0.0344, "num_input_tokens_seen": 33593696, "step": 49845 }, { "epoch": 1.2178437935162338, "grad_norm": 0.06473961472511292, "learning_rate": 1.8770425157461318e-06, "loss": 0.0828, "num_input_tokens_seen": 33597024, "step": 49850 }, { "epoch": 1.217965944348081, "grad_norm": 0.9672479629516602, "learning_rate": 1.8770015441097103e-06, "loss": 0.0105, "num_input_tokens_seen": 33600288, "step": 49855 }, { "epoch": 1.2180880951799282, "grad_norm": 109.47171783447266, "learning_rate": 1.8769605660954975e-06, "loss": 0.2466, "num_input_tokens_seen": 33603232, "step": 49860 }, { "epoch": 1.2182102460117754, "grad_norm": 0.003171207383275032, "learning_rate": 1.8769195817037916e-06, "loss": 0.0755, "num_input_tokens_seen": 33606688, "step": 49865 }, { "epoch": 1.2183323968436226, "grad_norm": 11.40949821472168, "learning_rate": 1.8768785909348904e-06, "loss": 0.0893, "num_input_tokens_seen": 33610080, "step": 49870 }, { "epoch": 1.2184545476754698, "grad_norm": 0.20446684956550598, "learning_rate": 1.8768375937890926e-06, "loss": 0.0612, "num_input_tokens_seen": 33613280, "step": 49875 }, { "epoch": 1.2185766985073168, "grad_norm": 0.7266244888305664, "learning_rate": 1.8767965902666956e-06, "loss": 0.0436, "num_input_tokens_seen": 33616224, "step": 49880 }, { "epoch": 1.218698849339164, "grad_norm": 0.1490468531847, "learning_rate": 1.8767555803679981e-06, "loss": 0.1213, "num_input_tokens_seen": 33619680, "step": 49885 }, { "epoch": 1.2188210001710111, "grad_norm": 14.057254791259766, "learning_rate": 1.8767145640932984e-06, "loss": 0.1133, "num_input_tokens_seen": 33622880, "step": 49890 }, { "epoch": 1.2189431510028583, "grad_norm": 56.923553466796875, "learning_rate": 1.8766735414428943e-06, "loss": 0.0274, "num_input_tokens_seen": 33626080, "step": 49895 }, { "epoch": 1.2190653018347055, "grad_norm": 2.8939082622528076, "learning_rate": 1.8766325124170845e-06, "loss": 0.1154, "num_input_tokens_seen": 33630048, "step": 49900 }, { "epoch": 1.2191874526665527, "grad_norm": 58.96417999267578, "learning_rate": 1.8765914770161676e-06, "loss": 0.1555, "num_input_tokens_seen": 33633376, "step": 49905 }, { "epoch": 1.2193096034984, "grad_norm": 0.5027133226394653, "learning_rate": 1.8765504352404414e-06, "loss": 0.1089, "num_input_tokens_seen": 33638944, "step": 49910 }, { "epoch": 1.2194317543302469, "grad_norm": 10.49686050415039, "learning_rate": 1.8765093870902046e-06, "loss": 0.0729, "num_input_tokens_seen": 33642208, "step": 49915 }, { "epoch": 1.219553905162094, "grad_norm": 12.41897964477539, "learning_rate": 1.8764683325657558e-06, "loss": 0.0803, "num_input_tokens_seen": 33646240, "step": 49920 }, { "epoch": 1.2196760559939412, "grad_norm": 1.074639081954956, "learning_rate": 1.8764272716673936e-06, "loss": 0.0596, "num_input_tokens_seen": 33649888, "step": 49925 }, { "epoch": 1.2197982068257884, "grad_norm": 0.3886198401451111, "learning_rate": 1.8763862043954167e-06, "loss": 0.1279, "num_input_tokens_seen": 33652768, "step": 49930 }, { "epoch": 1.2199203576576356, "grad_norm": 0.15774321556091309, "learning_rate": 1.8763451307501234e-06, "loss": 0.03, "num_input_tokens_seen": 33656288, "step": 49935 }, { "epoch": 1.2200425084894828, "grad_norm": 0.2555408477783203, "learning_rate": 1.8763040507318126e-06, "loss": 0.014, "num_input_tokens_seen": 33659616, "step": 49940 }, { "epoch": 1.22016465932133, "grad_norm": 2.39106822013855, "learning_rate": 1.8762629643407832e-06, "loss": 0.1982, "num_input_tokens_seen": 33662688, "step": 49945 }, { "epoch": 1.2202868101531772, "grad_norm": 0.30551204085350037, "learning_rate": 1.876221871577334e-06, "loss": 0.0014, "num_input_tokens_seen": 33666016, "step": 49950 }, { "epoch": 1.2204089609850244, "grad_norm": 0.18218585848808289, "learning_rate": 1.8761807724417633e-06, "loss": 0.073, "num_input_tokens_seen": 33669536, "step": 49955 }, { "epoch": 1.2205311118168716, "grad_norm": 16.104793548583984, "learning_rate": 1.8761396669343705e-06, "loss": 0.0847, "num_input_tokens_seen": 33672672, "step": 49960 }, { "epoch": 1.2206532626487188, "grad_norm": 32.72917938232422, "learning_rate": 1.8760985550554545e-06, "loss": 0.0568, "num_input_tokens_seen": 33675808, "step": 49965 }, { "epoch": 1.2207754134805657, "grad_norm": 0.12076198309659958, "learning_rate": 1.876057436805314e-06, "loss": 0.0013, "num_input_tokens_seen": 33678816, "step": 49970 }, { "epoch": 1.220897564312413, "grad_norm": 1.4278993606567383, "learning_rate": 1.8760163121842483e-06, "loss": 0.0405, "num_input_tokens_seen": 33682272, "step": 49975 }, { "epoch": 1.22101971514426, "grad_norm": 0.31052538752555847, "learning_rate": 1.8759751811925564e-06, "loss": 0.2167, "num_input_tokens_seen": 33685792, "step": 49980 }, { "epoch": 1.2211418659761073, "grad_norm": 0.14995524287223816, "learning_rate": 1.875934043830537e-06, "loss": 0.1503, "num_input_tokens_seen": 33689056, "step": 49985 }, { "epoch": 1.2212640168079545, "grad_norm": 0.036245085299015045, "learning_rate": 1.87589290009849e-06, "loss": 0.1675, "num_input_tokens_seen": 33693280, "step": 49990 }, { "epoch": 1.2213861676398017, "grad_norm": 0.24092210829257965, "learning_rate": 1.8758517499967144e-06, "loss": 0.0654, "num_input_tokens_seen": 33696800, "step": 49995 }, { "epoch": 1.2215083184716489, "grad_norm": 17.42069435119629, "learning_rate": 1.8758105935255089e-06, "loss": 0.1313, "num_input_tokens_seen": 33700320, "step": 50000 }, { "epoch": 1.2216304693034958, "grad_norm": 179.08494567871094, "learning_rate": 1.8757694306851732e-06, "loss": 0.1726, "num_input_tokens_seen": 33703776, "step": 50005 }, { "epoch": 1.221752620135343, "grad_norm": 9.46633243560791, "learning_rate": 1.8757282614760071e-06, "loss": 0.0512, "num_input_tokens_seen": 33707488, "step": 50010 }, { "epoch": 1.2218747709671902, "grad_norm": 1.2339563369750977, "learning_rate": 1.8756870858983089e-06, "loss": 0.0313, "num_input_tokens_seen": 33710496, "step": 50015 }, { "epoch": 1.2219969217990374, "grad_norm": 0.23303188383579254, "learning_rate": 1.8756459039523791e-06, "loss": 0.079, "num_input_tokens_seen": 33714720, "step": 50020 }, { "epoch": 1.2221190726308846, "grad_norm": 0.16474978625774384, "learning_rate": 1.8756047156385169e-06, "loss": 0.002, "num_input_tokens_seen": 33717984, "step": 50025 }, { "epoch": 1.2222412234627318, "grad_norm": 34.933135986328125, "learning_rate": 1.8755635209570213e-06, "loss": 0.1796, "num_input_tokens_seen": 33720928, "step": 50030 }, { "epoch": 1.222363374294579, "grad_norm": 0.29329726099967957, "learning_rate": 1.8755223199081924e-06, "loss": 0.1181, "num_input_tokens_seen": 33724576, "step": 50035 }, { "epoch": 1.2224855251264262, "grad_norm": 0.19066888093948364, "learning_rate": 1.8754811124923298e-06, "loss": 0.0019, "num_input_tokens_seen": 33728480, "step": 50040 }, { "epoch": 1.2226076759582734, "grad_norm": 0.22548232972621918, "learning_rate": 1.8754398987097331e-06, "loss": 0.0013, "num_input_tokens_seen": 33731744, "step": 50045 }, { "epoch": 1.2227298267901205, "grad_norm": 12.361126899719238, "learning_rate": 1.8753986785607019e-06, "loss": 0.0598, "num_input_tokens_seen": 33735584, "step": 50050 }, { "epoch": 1.2228519776219675, "grad_norm": 17.38907241821289, "learning_rate": 1.8753574520455362e-06, "loss": 0.1249, "num_input_tokens_seen": 33739552, "step": 50055 }, { "epoch": 1.2229741284538147, "grad_norm": 0.934760570526123, "learning_rate": 1.8753162191645354e-06, "loss": 0.09, "num_input_tokens_seen": 33742688, "step": 50060 }, { "epoch": 1.223096279285662, "grad_norm": 0.1813424676656723, "learning_rate": 1.8752749799179997e-06, "loss": 0.0686, "num_input_tokens_seen": 33746080, "step": 50065 }, { "epoch": 1.223218430117509, "grad_norm": 0.2869721055030823, "learning_rate": 1.8752337343062291e-06, "loss": 0.1781, "num_input_tokens_seen": 33749216, "step": 50070 }, { "epoch": 1.2233405809493563, "grad_norm": 1.4320014715194702, "learning_rate": 1.8751924823295232e-06, "loss": 0.0901, "num_input_tokens_seen": 33752864, "step": 50075 }, { "epoch": 1.2234627317812035, "grad_norm": 0.3322947323322296, "learning_rate": 1.8751512239881824e-06, "loss": 0.0491, "num_input_tokens_seen": 33756192, "step": 50080 }, { "epoch": 1.2235848826130507, "grad_norm": 0.4504927694797516, "learning_rate": 1.8751099592825063e-06, "loss": 0.0625, "num_input_tokens_seen": 33759456, "step": 50085 }, { "epoch": 1.2237070334448978, "grad_norm": 39.807037353515625, "learning_rate": 1.8750686882127952e-06, "loss": 0.0798, "num_input_tokens_seen": 33762912, "step": 50090 }, { "epoch": 1.2238291842767448, "grad_norm": 0.16581711173057556, "learning_rate": 1.8750274107793492e-06, "loss": 0.0012, "num_input_tokens_seen": 33766304, "step": 50095 }, { "epoch": 1.223951335108592, "grad_norm": 15.672653198242188, "learning_rate": 1.8749861269824688e-06, "loss": 0.1302, "num_input_tokens_seen": 33769696, "step": 50100 }, { "epoch": 1.2240734859404392, "grad_norm": 0.11208673566579819, "learning_rate": 1.8749448368224536e-06, "loss": 0.0476, "num_input_tokens_seen": 33773088, "step": 50105 }, { "epoch": 1.2241956367722864, "grad_norm": 17.239974975585938, "learning_rate": 1.8749035402996042e-06, "loss": 0.0521, "num_input_tokens_seen": 33776864, "step": 50110 }, { "epoch": 1.2243177876041336, "grad_norm": 23.186294555664062, "learning_rate": 1.8748622374142213e-06, "loss": 0.1457, "num_input_tokens_seen": 33780192, "step": 50115 }, { "epoch": 1.2244399384359808, "grad_norm": 12.526515007019043, "learning_rate": 1.8748209281666047e-06, "loss": 0.108, "num_input_tokens_seen": 33783712, "step": 50120 }, { "epoch": 1.224562089267828, "grad_norm": 0.32619011402130127, "learning_rate": 1.874779612557055e-06, "loss": 0.0042, "num_input_tokens_seen": 33787296, "step": 50125 }, { "epoch": 1.2246842400996751, "grad_norm": 50.9199104309082, "learning_rate": 1.8747382905858728e-06, "loss": 0.1485, "num_input_tokens_seen": 33790752, "step": 50130 }, { "epoch": 1.2248063909315223, "grad_norm": 0.3081582188606262, "learning_rate": 1.8746969622533584e-06, "loss": 0.0507, "num_input_tokens_seen": 33794272, "step": 50135 }, { "epoch": 1.2249285417633695, "grad_norm": 0.18558557331562042, "learning_rate": 1.8746556275598122e-06, "loss": 0.0425, "num_input_tokens_seen": 33797664, "step": 50140 }, { "epoch": 1.2250506925952165, "grad_norm": 0.13867150247097015, "learning_rate": 1.8746142865055353e-06, "loss": 0.1256, "num_input_tokens_seen": 33801312, "step": 50145 }, { "epoch": 1.2251728434270637, "grad_norm": 0.007238840684294701, "learning_rate": 1.8745729390908278e-06, "loss": 0.0746, "num_input_tokens_seen": 33804896, "step": 50150 }, { "epoch": 1.2252949942589109, "grad_norm": 0.06682219356298447, "learning_rate": 1.8745315853159909e-06, "loss": 0.1385, "num_input_tokens_seen": 33808928, "step": 50155 }, { "epoch": 1.225417145090758, "grad_norm": 0.026519853621721268, "learning_rate": 1.874490225181325e-06, "loss": 0.0433, "num_input_tokens_seen": 33811808, "step": 50160 }, { "epoch": 1.2255392959226052, "grad_norm": 22.701797485351562, "learning_rate": 1.874448858687131e-06, "loss": 0.0743, "num_input_tokens_seen": 33815072, "step": 50165 }, { "epoch": 1.2256614467544524, "grad_norm": 0.12054687738418579, "learning_rate": 1.8744074858337097e-06, "loss": 0.0852, "num_input_tokens_seen": 33818656, "step": 50170 }, { "epoch": 1.2257835975862996, "grad_norm": 0.3371358811855316, "learning_rate": 1.874366106621362e-06, "loss": 0.0143, "num_input_tokens_seen": 33821920, "step": 50175 }, { "epoch": 1.2259057484181468, "grad_norm": 0.2913208603858948, "learning_rate": 1.8743247210503887e-06, "loss": 0.0373, "num_input_tokens_seen": 33825248, "step": 50180 }, { "epoch": 1.2260278992499938, "grad_norm": 0.15739411115646362, "learning_rate": 1.874283329121091e-06, "loss": 0.1028, "num_input_tokens_seen": 33828768, "step": 50185 }, { "epoch": 1.226150050081841, "grad_norm": 0.07163192331790924, "learning_rate": 1.8742419308337695e-06, "loss": 0.1173, "num_input_tokens_seen": 33832288, "step": 50190 }, { "epoch": 1.2262722009136882, "grad_norm": 8.579046249389648, "learning_rate": 1.874200526188726e-06, "loss": 0.0366, "num_input_tokens_seen": 33835616, "step": 50195 }, { "epoch": 1.2263943517455353, "grad_norm": 1.7747700214385986, "learning_rate": 1.8741591151862607e-06, "loss": 0.027, "num_input_tokens_seen": 33839520, "step": 50200 }, { "epoch": 1.2265165025773825, "grad_norm": 64.0686264038086, "learning_rate": 1.8741176978266755e-06, "loss": 0.0038, "num_input_tokens_seen": 33842720, "step": 50205 }, { "epoch": 1.2266386534092297, "grad_norm": 0.3116965889930725, "learning_rate": 1.8740762741102709e-06, "loss": 0.0015, "num_input_tokens_seen": 33846240, "step": 50210 }, { "epoch": 1.226760804241077, "grad_norm": 18.133525848388672, "learning_rate": 1.874034844037349e-06, "loss": 0.1725, "num_input_tokens_seen": 33849824, "step": 50215 }, { "epoch": 1.226882955072924, "grad_norm": 0.09997110813856125, "learning_rate": 1.8739934076082102e-06, "loss": 0.1997, "num_input_tokens_seen": 33853024, "step": 50220 }, { "epoch": 1.2270051059047713, "grad_norm": 0.17489533126354218, "learning_rate": 1.8739519648231568e-06, "loss": 0.0583, "num_input_tokens_seen": 33856672, "step": 50225 }, { "epoch": 1.2271272567366185, "grad_norm": 10.11457347869873, "learning_rate": 1.8739105156824893e-06, "loss": 0.0894, "num_input_tokens_seen": 33859744, "step": 50230 }, { "epoch": 1.2272494075684655, "grad_norm": 0.14010979235172272, "learning_rate": 1.8738690601865094e-06, "loss": 0.0098, "num_input_tokens_seen": 33863200, "step": 50235 }, { "epoch": 1.2273715584003126, "grad_norm": 0.6131026148796082, "learning_rate": 1.8738275983355188e-06, "loss": 0.1171, "num_input_tokens_seen": 33866336, "step": 50240 }, { "epoch": 1.2274937092321598, "grad_norm": 0.23327378928661346, "learning_rate": 1.8737861301298189e-06, "loss": 0.1202, "num_input_tokens_seen": 33869728, "step": 50245 }, { "epoch": 1.227615860064007, "grad_norm": 14.709436416625977, "learning_rate": 1.8737446555697112e-06, "loss": 0.0908, "num_input_tokens_seen": 33873568, "step": 50250 }, { "epoch": 1.2277380108958542, "grad_norm": 18.70151138305664, "learning_rate": 1.8737031746554972e-06, "loss": 0.1399, "num_input_tokens_seen": 33877344, "step": 50255 }, { "epoch": 1.2278601617277014, "grad_norm": 0.35863348841667175, "learning_rate": 1.8736616873874788e-06, "loss": 0.0458, "num_input_tokens_seen": 33880672, "step": 50260 }, { "epoch": 1.2279823125595486, "grad_norm": 0.9880019426345825, "learning_rate": 1.8736201937659577e-06, "loss": 0.0731, "num_input_tokens_seen": 33884512, "step": 50265 }, { "epoch": 1.2281044633913958, "grad_norm": 24.686311721801758, "learning_rate": 1.8735786937912358e-06, "loss": 0.0707, "num_input_tokens_seen": 33888544, "step": 50270 }, { "epoch": 1.2282266142232428, "grad_norm": 45.313541412353516, "learning_rate": 1.8735371874636142e-06, "loss": 0.0657, "num_input_tokens_seen": 33891680, "step": 50275 }, { "epoch": 1.22834876505509, "grad_norm": 30.227420806884766, "learning_rate": 1.8734956747833955e-06, "loss": 0.0042, "num_input_tokens_seen": 33895584, "step": 50280 }, { "epoch": 1.2284709158869371, "grad_norm": 0.12649253010749817, "learning_rate": 1.8734541557508811e-06, "loss": 0.044, "num_input_tokens_seen": 33898784, "step": 50285 }, { "epoch": 1.2285930667187843, "grad_norm": 0.6267743706703186, "learning_rate": 1.8734126303663733e-06, "loss": 0.1212, "num_input_tokens_seen": 33902112, "step": 50290 }, { "epoch": 1.2287152175506315, "grad_norm": 17.582807540893555, "learning_rate": 1.873371098630174e-06, "loss": 0.0733, "num_input_tokens_seen": 33905312, "step": 50295 }, { "epoch": 1.2288373683824787, "grad_norm": 27.228029251098633, "learning_rate": 1.8733295605425852e-06, "loss": 0.1261, "num_input_tokens_seen": 33908832, "step": 50300 }, { "epoch": 1.228959519214326, "grad_norm": 145.7957763671875, "learning_rate": 1.8732880161039088e-06, "loss": 0.1365, "num_input_tokens_seen": 33912352, "step": 50305 }, { "epoch": 1.229081670046173, "grad_norm": 4.750298500061035, "learning_rate": 1.873246465314447e-06, "loss": 0.0527, "num_input_tokens_seen": 33915360, "step": 50310 }, { "epoch": 1.2292038208780203, "grad_norm": 0.14604468643665314, "learning_rate": 1.873204908174502e-06, "loss": 0.1051, "num_input_tokens_seen": 33919456, "step": 50315 }, { "epoch": 1.2293259717098675, "grad_norm": 41.39289855957031, "learning_rate": 1.8731633446843765e-06, "loss": 0.2591, "num_input_tokens_seen": 33922464, "step": 50320 }, { "epoch": 1.2294481225417144, "grad_norm": 21.35294532775879, "learning_rate": 1.873121774844372e-06, "loss": 0.1767, "num_input_tokens_seen": 33925408, "step": 50325 }, { "epoch": 1.2295702733735616, "grad_norm": 0.12231894582509995, "learning_rate": 1.873080198654791e-06, "loss": 0.0073, "num_input_tokens_seen": 33928992, "step": 50330 }, { "epoch": 1.2296924242054088, "grad_norm": 0.5598852038383484, "learning_rate": 1.873038616115936e-06, "loss": 0.0729, "num_input_tokens_seen": 33932448, "step": 50335 }, { "epoch": 1.229814575037256, "grad_norm": 103.16210174560547, "learning_rate": 1.8729970272281092e-06, "loss": 0.0924, "num_input_tokens_seen": 33935776, "step": 50340 }, { "epoch": 1.2299367258691032, "grad_norm": 0.37978076934814453, "learning_rate": 1.8729554319916137e-06, "loss": 0.0866, "num_input_tokens_seen": 33939168, "step": 50345 }, { "epoch": 1.2300588767009504, "grad_norm": 27.279321670532227, "learning_rate": 1.872913830406751e-06, "loss": 0.0694, "num_input_tokens_seen": 33942432, "step": 50350 }, { "epoch": 1.2301810275327976, "grad_norm": 0.4356905221939087, "learning_rate": 1.8728722224738244e-06, "loss": 0.1368, "num_input_tokens_seen": 33945376, "step": 50355 }, { "epoch": 1.2303031783646448, "grad_norm": 0.22323796153068542, "learning_rate": 1.8728306081931362e-06, "loss": 0.048, "num_input_tokens_seen": 33948384, "step": 50360 }, { "epoch": 1.2304253291964917, "grad_norm": 0.5146710276603699, "learning_rate": 1.8727889875649892e-06, "loss": 0.0403, "num_input_tokens_seen": 33951520, "step": 50365 }, { "epoch": 1.230547480028339, "grad_norm": 0.6455696821212769, "learning_rate": 1.8727473605896856e-06, "loss": 0.1404, "num_input_tokens_seen": 33955040, "step": 50370 }, { "epoch": 1.230669630860186, "grad_norm": 0.19283178448677063, "learning_rate": 1.8727057272675286e-06, "loss": 0.0039, "num_input_tokens_seen": 33958304, "step": 50375 }, { "epoch": 1.2307917816920333, "grad_norm": 14.63194465637207, "learning_rate": 1.8726640875988209e-06, "loss": 0.1388, "num_input_tokens_seen": 33961888, "step": 50380 }, { "epoch": 1.2309139325238805, "grad_norm": 0.23423582315444946, "learning_rate": 1.8726224415838652e-06, "loss": 0.1215, "num_input_tokens_seen": 33965152, "step": 50385 }, { "epoch": 1.2310360833557277, "grad_norm": 0.20280465483665466, "learning_rate": 1.8725807892229644e-06, "loss": 0.0899, "num_input_tokens_seen": 33969184, "step": 50390 }, { "epoch": 1.2311582341875749, "grad_norm": 0.10060182958841324, "learning_rate": 1.8725391305164213e-06, "loss": 0.0357, "num_input_tokens_seen": 33972320, "step": 50395 }, { "epoch": 1.231280385019422, "grad_norm": 0.036975398659706116, "learning_rate": 1.8724974654645392e-06, "loss": 0.0211, "num_input_tokens_seen": 33975776, "step": 50400 }, { "epoch": 1.2314025358512692, "grad_norm": 0.09926356375217438, "learning_rate": 1.8724557940676206e-06, "loss": 0.0521, "num_input_tokens_seen": 33978720, "step": 50405 }, { "epoch": 1.2315246866831164, "grad_norm": 0.05969943851232529, "learning_rate": 1.872414116325969e-06, "loss": 0.0016, "num_input_tokens_seen": 33982048, "step": 50410 }, { "epoch": 1.2316468375149634, "grad_norm": 57.690975189208984, "learning_rate": 1.8723724322398874e-06, "loss": 0.1, "num_input_tokens_seen": 33984928, "step": 50415 }, { "epoch": 1.2317689883468106, "grad_norm": 0.20247139036655426, "learning_rate": 1.8723307418096782e-06, "loss": 0.0651, "num_input_tokens_seen": 33988128, "step": 50420 }, { "epoch": 1.2318911391786578, "grad_norm": 0.06691229343414307, "learning_rate": 1.8722890450356457e-06, "loss": 0.1015, "num_input_tokens_seen": 33991456, "step": 50425 }, { "epoch": 1.232013290010505, "grad_norm": 0.37158066034317017, "learning_rate": 1.8722473419180926e-06, "loss": 0.0219, "num_input_tokens_seen": 33994848, "step": 50430 }, { "epoch": 1.2321354408423522, "grad_norm": 40.26687240600586, "learning_rate": 1.8722056324573226e-06, "loss": 0.11, "num_input_tokens_seen": 33997984, "step": 50435 }, { "epoch": 1.2322575916741993, "grad_norm": 22.51052474975586, "learning_rate": 1.872163916653638e-06, "loss": 0.0329, "num_input_tokens_seen": 34001248, "step": 50440 }, { "epoch": 1.2323797425060465, "grad_norm": 9.003214836120605, "learning_rate": 1.8721221945073432e-06, "loss": 0.0817, "num_input_tokens_seen": 34005088, "step": 50445 }, { "epoch": 1.2325018933378937, "grad_norm": 20.297090530395508, "learning_rate": 1.872080466018741e-06, "loss": 0.1316, "num_input_tokens_seen": 34008544, "step": 50450 }, { "epoch": 1.2326240441697407, "grad_norm": 38.89817810058594, "learning_rate": 1.8720387311881352e-06, "loss": 0.1039, "num_input_tokens_seen": 34012192, "step": 50455 }, { "epoch": 1.2327461950015879, "grad_norm": 1.270237684249878, "learning_rate": 1.8719969900158293e-06, "loss": 0.115, "num_input_tokens_seen": 34015584, "step": 50460 }, { "epoch": 1.232868345833435, "grad_norm": 0.2633684277534485, "learning_rate": 1.8719552425021265e-06, "loss": 0.0018, "num_input_tokens_seen": 34018720, "step": 50465 }, { "epoch": 1.2329904966652823, "grad_norm": 22.630613327026367, "learning_rate": 1.8719134886473308e-06, "loss": 0.0622, "num_input_tokens_seen": 34022176, "step": 50470 }, { "epoch": 1.2331126474971295, "grad_norm": 60.35623550415039, "learning_rate": 1.8718717284517455e-06, "loss": 0.029, "num_input_tokens_seen": 34025248, "step": 50475 }, { "epoch": 1.2332347983289766, "grad_norm": 0.048686183989048004, "learning_rate": 1.871829961915675e-06, "loss": 0.1482, "num_input_tokens_seen": 34028704, "step": 50480 }, { "epoch": 1.2333569491608238, "grad_norm": 0.037194494158029556, "learning_rate": 1.871788189039422e-06, "loss": 0.046, "num_input_tokens_seen": 34032416, "step": 50485 }, { "epoch": 1.233479099992671, "grad_norm": 60.3780632019043, "learning_rate": 1.8717464098232912e-06, "loss": 0.055, "num_input_tokens_seen": 34036704, "step": 50490 }, { "epoch": 1.2336012508245182, "grad_norm": 8.005125999450684, "learning_rate": 1.8717046242675858e-06, "loss": 0.1558, "num_input_tokens_seen": 34039776, "step": 50495 }, { "epoch": 1.2337234016563654, "grad_norm": 0.16520294547080994, "learning_rate": 1.8716628323726099e-06, "loss": 0.0559, "num_input_tokens_seen": 34043040, "step": 50500 }, { "epoch": 1.2338455524882124, "grad_norm": 22.122344970703125, "learning_rate": 1.8716210341386676e-06, "loss": 0.0497, "num_input_tokens_seen": 34046048, "step": 50505 }, { "epoch": 1.2339677033200596, "grad_norm": 0.7964180707931519, "learning_rate": 1.8715792295660623e-06, "loss": 0.1009, "num_input_tokens_seen": 34049696, "step": 50510 }, { "epoch": 1.2340898541519068, "grad_norm": 14.673161506652832, "learning_rate": 1.8715374186550989e-06, "loss": 0.1194, "num_input_tokens_seen": 34052768, "step": 50515 }, { "epoch": 1.234212004983754, "grad_norm": 0.02644650824368, "learning_rate": 1.8714956014060808e-06, "loss": 0.1266, "num_input_tokens_seen": 34056096, "step": 50520 }, { "epoch": 1.2343341558156011, "grad_norm": 0.41322630643844604, "learning_rate": 1.8714537778193122e-06, "loss": 0.1359, "num_input_tokens_seen": 34059552, "step": 50525 }, { "epoch": 1.2344563066474483, "grad_norm": 0.3122689723968506, "learning_rate": 1.8714119478950974e-06, "loss": 0.0233, "num_input_tokens_seen": 34063136, "step": 50530 }, { "epoch": 1.2345784574792955, "grad_norm": 0.4729823172092438, "learning_rate": 1.8713701116337406e-06, "loss": 0.1906, "num_input_tokens_seen": 34066464, "step": 50535 }, { "epoch": 1.2347006083111425, "grad_norm": 0.9967532753944397, "learning_rate": 1.8713282690355459e-06, "loss": 0.0603, "num_input_tokens_seen": 34070112, "step": 50540 }, { "epoch": 1.2348227591429897, "grad_norm": 0.2099456489086151, "learning_rate": 1.8712864201008175e-06, "loss": 0.0033, "num_input_tokens_seen": 34073568, "step": 50545 }, { "epoch": 1.2349449099748369, "grad_norm": 0.5129920840263367, "learning_rate": 1.87124456482986e-06, "loss": 0.1586, "num_input_tokens_seen": 34076896, "step": 50550 }, { "epoch": 1.235067060806684, "grad_norm": 17.669719696044922, "learning_rate": 1.8712027032229778e-06, "loss": 0.0601, "num_input_tokens_seen": 34080160, "step": 50555 }, { "epoch": 1.2351892116385312, "grad_norm": 0.3886941969394684, "learning_rate": 1.8711608352804754e-06, "loss": 0.1422, "num_input_tokens_seen": 34083808, "step": 50560 }, { "epoch": 1.2353113624703784, "grad_norm": 28.27507781982422, "learning_rate": 1.8711189610026568e-06, "loss": 0.1348, "num_input_tokens_seen": 34087584, "step": 50565 }, { "epoch": 1.2354335133022256, "grad_norm": 0.2292449027299881, "learning_rate": 1.8710770803898268e-06, "loss": 0.0303, "num_input_tokens_seen": 34091040, "step": 50570 }, { "epoch": 1.2355556641340728, "grad_norm": 0.25094738602638245, "learning_rate": 1.8710351934422901e-06, "loss": 0.0039, "num_input_tokens_seen": 34095008, "step": 50575 }, { "epoch": 1.23567781496592, "grad_norm": 124.43775939941406, "learning_rate": 1.870993300160351e-06, "loss": 0.0217, "num_input_tokens_seen": 34098208, "step": 50580 }, { "epoch": 1.2357999657977672, "grad_norm": 78.6186752319336, "learning_rate": 1.8709514005443149e-06, "loss": 0.0245, "num_input_tokens_seen": 34101344, "step": 50585 }, { "epoch": 1.2359221166296142, "grad_norm": 155.6714324951172, "learning_rate": 1.8709094945944855e-06, "loss": 0.0373, "num_input_tokens_seen": 34104480, "step": 50590 }, { "epoch": 1.2360442674614613, "grad_norm": 0.1335565149784088, "learning_rate": 1.870867582311168e-06, "loss": 0.0477, "num_input_tokens_seen": 34107744, "step": 50595 }, { "epoch": 1.2361664182933085, "grad_norm": 0.07351504266262054, "learning_rate": 1.8708256636946671e-06, "loss": 0.0901, "num_input_tokens_seen": 34111072, "step": 50600 }, { "epoch": 1.2362885691251557, "grad_norm": 0.7340015769004822, "learning_rate": 1.870783738745288e-06, "loss": 0.0008, "num_input_tokens_seen": 34114912, "step": 50605 }, { "epoch": 1.236410719957003, "grad_norm": 8.105185508728027, "learning_rate": 1.8707418074633354e-06, "loss": 0.1406, "num_input_tokens_seen": 34118496, "step": 50610 }, { "epoch": 1.23653287078885, "grad_norm": 0.1442503035068512, "learning_rate": 1.870699869849114e-06, "loss": 0.0007, "num_input_tokens_seen": 34121696, "step": 50615 }, { "epoch": 1.2366550216206973, "grad_norm": 0.697321355342865, "learning_rate": 1.870657925902929e-06, "loss": 0.0469, "num_input_tokens_seen": 34124704, "step": 50620 }, { "epoch": 1.2367771724525445, "grad_norm": 0.26186203956604004, "learning_rate": 1.8706159756250855e-06, "loss": 0.076, "num_input_tokens_seen": 34128032, "step": 50625 }, { "epoch": 1.2368993232843914, "grad_norm": 34.05559539794922, "learning_rate": 1.8705740190158882e-06, "loss": 0.1679, "num_input_tokens_seen": 34131680, "step": 50630 }, { "epoch": 1.2370214741162386, "grad_norm": 15.350863456726074, "learning_rate": 1.8705320560756425e-06, "loss": 0.1274, "num_input_tokens_seen": 34134816, "step": 50635 }, { "epoch": 1.2371436249480858, "grad_norm": 0.6870012283325195, "learning_rate": 1.8704900868046537e-06, "loss": 0.1526, "num_input_tokens_seen": 34138272, "step": 50640 }, { "epoch": 1.237265775779933, "grad_norm": 20.827638626098633, "learning_rate": 1.8704481112032272e-06, "loss": 0.2716, "num_input_tokens_seen": 34141792, "step": 50645 }, { "epoch": 1.2373879266117802, "grad_norm": 11.061613082885742, "learning_rate": 1.8704061292716672e-06, "loss": 0.0997, "num_input_tokens_seen": 34145312, "step": 50650 }, { "epoch": 1.2375100774436274, "grad_norm": 0.1646193563938141, "learning_rate": 1.8703641410102802e-06, "loss": 0.0765, "num_input_tokens_seen": 34148704, "step": 50655 }, { "epoch": 1.2376322282754746, "grad_norm": 0.5354453921318054, "learning_rate": 1.8703221464193709e-06, "loss": 0.0349, "num_input_tokens_seen": 34151840, "step": 50660 }, { "epoch": 1.2377543791073218, "grad_norm": 0.34802761673927307, "learning_rate": 1.8702801454992448e-06, "loss": 0.0264, "num_input_tokens_seen": 34154848, "step": 50665 }, { "epoch": 1.237876529939169, "grad_norm": 0.7380416393280029, "learning_rate": 1.8702381382502076e-06, "loss": 0.0021, "num_input_tokens_seen": 34158752, "step": 50670 }, { "epoch": 1.2379986807710162, "grad_norm": 0.05595047399401665, "learning_rate": 1.8701961246725643e-06, "loss": 0.0724, "num_input_tokens_seen": 34161888, "step": 50675 }, { "epoch": 1.2381208316028631, "grad_norm": 4.639143943786621, "learning_rate": 1.870154104766621e-06, "loss": 0.0506, "num_input_tokens_seen": 34165280, "step": 50680 }, { "epoch": 1.2382429824347103, "grad_norm": 14.277225494384766, "learning_rate": 1.870112078532683e-06, "loss": 0.1808, "num_input_tokens_seen": 34168608, "step": 50685 }, { "epoch": 1.2383651332665575, "grad_norm": 0.036859460175037384, "learning_rate": 1.870070045971056e-06, "loss": 0.1062, "num_input_tokens_seen": 34171872, "step": 50690 }, { "epoch": 1.2384872840984047, "grad_norm": 0.103855200111866, "learning_rate": 1.870028007082045e-06, "loss": 0.1096, "num_input_tokens_seen": 34175520, "step": 50695 }, { "epoch": 1.2386094349302519, "grad_norm": 0.5549172163009644, "learning_rate": 1.869985961865957e-06, "loss": 0.1077, "num_input_tokens_seen": 34178592, "step": 50700 }, { "epoch": 1.238731585762099, "grad_norm": 0.07171110808849335, "learning_rate": 1.869943910323097e-06, "loss": 0.1536, "num_input_tokens_seen": 34182048, "step": 50705 }, { "epoch": 1.2388537365939463, "grad_norm": 14.18834114074707, "learning_rate": 1.8699018524537706e-06, "loss": 0.201, "num_input_tokens_seen": 34186016, "step": 50710 }, { "epoch": 1.2389758874257935, "grad_norm": 0.6210215091705322, "learning_rate": 1.8698597882582842e-06, "loss": 0.079, "num_input_tokens_seen": 34189792, "step": 50715 }, { "epoch": 1.2390980382576404, "grad_norm": 0.7056021690368652, "learning_rate": 1.8698177177369433e-06, "loss": 0.1121, "num_input_tokens_seen": 34193312, "step": 50720 }, { "epoch": 1.2392201890894876, "grad_norm": 0.4129893183708191, "learning_rate": 1.869775640890054e-06, "loss": 0.0983, "num_input_tokens_seen": 34196960, "step": 50725 }, { "epoch": 1.2393423399213348, "grad_norm": 4.039682388305664, "learning_rate": 1.8697335577179226e-06, "loss": 0.0083, "num_input_tokens_seen": 34200288, "step": 50730 }, { "epoch": 1.239464490753182, "grad_norm": 28.408931732177734, "learning_rate": 1.8696914682208544e-06, "loss": 0.0796, "num_input_tokens_seen": 34203296, "step": 50735 }, { "epoch": 1.2395866415850292, "grad_norm": 0.11473660171031952, "learning_rate": 1.8696493723991562e-06, "loss": 0.0923, "num_input_tokens_seen": 34206688, "step": 50740 }, { "epoch": 1.2397087924168764, "grad_norm": 0.5018324851989746, "learning_rate": 1.8696072702531339e-06, "loss": 0.0365, "num_input_tokens_seen": 34209760, "step": 50745 }, { "epoch": 1.2398309432487236, "grad_norm": 30.176328659057617, "learning_rate": 1.8695651617830934e-06, "loss": 0.0933, "num_input_tokens_seen": 34213152, "step": 50750 }, { "epoch": 1.2399530940805708, "grad_norm": 0.46327489614486694, "learning_rate": 1.8695230469893413e-06, "loss": 0.1318, "num_input_tokens_seen": 34216224, "step": 50755 }, { "epoch": 1.240075244912418, "grad_norm": 0.4507075846195221, "learning_rate": 1.8694809258721835e-06, "loss": 0.0015, "num_input_tokens_seen": 34219552, "step": 50760 }, { "epoch": 1.2401973957442651, "grad_norm": 0.1781848669052124, "learning_rate": 1.8694387984319268e-06, "loss": 0.0057, "num_input_tokens_seen": 34222624, "step": 50765 }, { "epoch": 1.240319546576112, "grad_norm": 0.054551366716623306, "learning_rate": 1.8693966646688774e-06, "loss": 0.0006, "num_input_tokens_seen": 34225504, "step": 50770 }, { "epoch": 1.2404416974079593, "grad_norm": 0.09252890199422836, "learning_rate": 1.8693545245833415e-06, "loss": 0.0929, "num_input_tokens_seen": 34228576, "step": 50775 }, { "epoch": 1.2405638482398065, "grad_norm": 24.160945892333984, "learning_rate": 1.8693123781756258e-06, "loss": 0.1348, "num_input_tokens_seen": 34231904, "step": 50780 }, { "epoch": 1.2406859990716537, "grad_norm": 0.16220663487911224, "learning_rate": 1.8692702254460363e-06, "loss": 0.0009, "num_input_tokens_seen": 34235040, "step": 50785 }, { "epoch": 1.2408081499035009, "grad_norm": 0.23448047041893005, "learning_rate": 1.8692280663948802e-06, "loss": 0.0466, "num_input_tokens_seen": 34238816, "step": 50790 }, { "epoch": 1.240930300735348, "grad_norm": 46.69455337524414, "learning_rate": 1.8691859010224636e-06, "loss": 0.2587, "num_input_tokens_seen": 34242208, "step": 50795 }, { "epoch": 1.2410524515671952, "grad_norm": 0.009700641967356205, "learning_rate": 1.8691437293290936e-06, "loss": 0.0009, "num_input_tokens_seen": 34245344, "step": 50800 }, { "epoch": 1.2411746023990424, "grad_norm": 71.18877410888672, "learning_rate": 1.8691015513150766e-06, "loss": 0.0756, "num_input_tokens_seen": 34248544, "step": 50805 }, { "epoch": 1.2412967532308894, "grad_norm": 0.861115574836731, "learning_rate": 1.8690593669807191e-06, "loss": 0.1529, "num_input_tokens_seen": 34251808, "step": 50810 }, { "epoch": 1.2414189040627366, "grad_norm": 31.997013092041016, "learning_rate": 1.8690171763263284e-06, "loss": 0.093, "num_input_tokens_seen": 34255328, "step": 50815 }, { "epoch": 1.2415410548945838, "grad_norm": 43.22612762451172, "learning_rate": 1.868974979352211e-06, "loss": 0.1919, "num_input_tokens_seen": 34258528, "step": 50820 }, { "epoch": 1.241663205726431, "grad_norm": 26.389102935791016, "learning_rate": 1.8689327760586737e-06, "loss": 0.0842, "num_input_tokens_seen": 34261920, "step": 50825 }, { "epoch": 1.2417853565582782, "grad_norm": 5.921286106109619, "learning_rate": 1.8688905664460237e-06, "loss": 0.0521, "num_input_tokens_seen": 34264992, "step": 50830 }, { "epoch": 1.2419075073901253, "grad_norm": 0.04653450474143028, "learning_rate": 1.8688483505145677e-06, "loss": 0.0758, "num_input_tokens_seen": 34268128, "step": 50835 }, { "epoch": 1.2420296582219725, "grad_norm": 0.05636782944202423, "learning_rate": 1.8688061282646129e-06, "loss": 0.0313, "num_input_tokens_seen": 34271200, "step": 50840 }, { "epoch": 1.2421518090538197, "grad_norm": 0.5799795985221863, "learning_rate": 1.868763899696466e-06, "loss": 0.104, "num_input_tokens_seen": 34274400, "step": 50845 }, { "epoch": 1.242273959885667, "grad_norm": 14.093242645263672, "learning_rate": 1.8687216648104344e-06, "loss": 0.1647, "num_input_tokens_seen": 34278240, "step": 50850 }, { "epoch": 1.242396110717514, "grad_norm": 36.90916442871094, "learning_rate": 1.8686794236068254e-06, "loss": 0.1633, "num_input_tokens_seen": 34281440, "step": 50855 }, { "epoch": 1.242518261549361, "grad_norm": 15.721162796020508, "learning_rate": 1.8686371760859458e-06, "loss": 0.0966, "num_input_tokens_seen": 34284576, "step": 50860 }, { "epoch": 1.2426404123812083, "grad_norm": 87.52764129638672, "learning_rate": 1.8685949222481034e-06, "loss": 0.1385, "num_input_tokens_seen": 34288032, "step": 50865 }, { "epoch": 1.2427625632130554, "grad_norm": 0.8054718971252441, "learning_rate": 1.8685526620936048e-06, "loss": 0.0023, "num_input_tokens_seen": 34291296, "step": 50870 }, { "epoch": 1.2428847140449026, "grad_norm": 1.4423446655273438, "learning_rate": 1.8685103956227578e-06, "loss": 0.0586, "num_input_tokens_seen": 34294496, "step": 50875 }, { "epoch": 1.2430068648767498, "grad_norm": 49.275943756103516, "learning_rate": 1.8684681228358694e-06, "loss": 0.1438, "num_input_tokens_seen": 34297632, "step": 50880 }, { "epoch": 1.243129015708597, "grad_norm": 1.6575486660003662, "learning_rate": 1.8684258437332472e-06, "loss": 0.2076, "num_input_tokens_seen": 34301280, "step": 50885 }, { "epoch": 1.2432511665404442, "grad_norm": 0.05155455693602562, "learning_rate": 1.8683835583151986e-06, "loss": 0.0412, "num_input_tokens_seen": 34304992, "step": 50890 }, { "epoch": 1.2433733173722914, "grad_norm": 0.618511438369751, "learning_rate": 1.8683412665820314e-06, "loss": 0.0577, "num_input_tokens_seen": 34308448, "step": 50895 }, { "epoch": 1.2434954682041384, "grad_norm": 66.2889175415039, "learning_rate": 1.868298968534053e-06, "loss": 0.2242, "num_input_tokens_seen": 34312160, "step": 50900 }, { "epoch": 1.2436176190359856, "grad_norm": 54.50128936767578, "learning_rate": 1.8682566641715709e-06, "loss": 0.1221, "num_input_tokens_seen": 34315680, "step": 50905 }, { "epoch": 1.2437397698678327, "grad_norm": 1.51670503616333, "learning_rate": 1.8682143534948928e-06, "loss": 0.0311, "num_input_tokens_seen": 34319008, "step": 50910 }, { "epoch": 1.24386192069968, "grad_norm": 137.47225952148438, "learning_rate": 1.8681720365043263e-06, "loss": 0.0355, "num_input_tokens_seen": 34322400, "step": 50915 }, { "epoch": 1.2439840715315271, "grad_norm": 0.07625175267457962, "learning_rate": 1.8681297132001794e-06, "loss": 0.0827, "num_input_tokens_seen": 34325984, "step": 50920 }, { "epoch": 1.2441062223633743, "grad_norm": 0.4132697880268097, "learning_rate": 1.8680873835827598e-06, "loss": 0.0321, "num_input_tokens_seen": 34329440, "step": 50925 }, { "epoch": 1.2442283731952215, "grad_norm": 0.17457693815231323, "learning_rate": 1.8680450476523748e-06, "loss": 0.05, "num_input_tokens_seen": 34332704, "step": 50930 }, { "epoch": 1.2443505240270687, "grad_norm": 0.5495626330375671, "learning_rate": 1.8680027054093332e-06, "loss": 0.0462, "num_input_tokens_seen": 34336032, "step": 50935 }, { "epoch": 1.2444726748589159, "grad_norm": 8.839376449584961, "learning_rate": 1.8679603568539423e-06, "loss": 0.0424, "num_input_tokens_seen": 34339040, "step": 50940 }, { "epoch": 1.244594825690763, "grad_norm": 14.500417709350586, "learning_rate": 1.8679180019865102e-06, "loss": 0.2089, "num_input_tokens_seen": 34342240, "step": 50945 }, { "epoch": 1.24471697652261, "grad_norm": 0.2148173600435257, "learning_rate": 1.867875640807345e-06, "loss": 0.0468, "num_input_tokens_seen": 34345696, "step": 50950 }, { "epoch": 1.2448391273544572, "grad_norm": 0.01791485585272312, "learning_rate": 1.8678332733167546e-06, "loss": 0.0874, "num_input_tokens_seen": 34349216, "step": 50955 }, { "epoch": 1.2449612781863044, "grad_norm": 26.81791114807129, "learning_rate": 1.8677908995150475e-06, "loss": 0.1386, "num_input_tokens_seen": 34352416, "step": 50960 }, { "epoch": 1.2450834290181516, "grad_norm": 0.6094515323638916, "learning_rate": 1.8677485194025313e-06, "loss": 0.0869, "num_input_tokens_seen": 34356000, "step": 50965 }, { "epoch": 1.2452055798499988, "grad_norm": 168.68463134765625, "learning_rate": 1.8677061329795145e-06, "loss": 0.0388, "num_input_tokens_seen": 34359712, "step": 50970 }, { "epoch": 1.245327730681846, "grad_norm": 19.0019588470459, "learning_rate": 1.8676637402463054e-06, "loss": 0.1767, "num_input_tokens_seen": 34363040, "step": 50975 }, { "epoch": 1.2454498815136932, "grad_norm": 21.750730514526367, "learning_rate": 1.867621341203212e-06, "loss": 0.0821, "num_input_tokens_seen": 34366176, "step": 50980 }, { "epoch": 1.2455720323455404, "grad_norm": 21.11234474182129, "learning_rate": 1.867578935850543e-06, "loss": 0.0614, "num_input_tokens_seen": 34369312, "step": 50985 }, { "epoch": 1.2456941831773873, "grad_norm": 0.19964002072811127, "learning_rate": 1.867536524188607e-06, "loss": 0.003, "num_input_tokens_seen": 34372576, "step": 50990 }, { "epoch": 1.2458163340092345, "grad_norm": 0.3555033206939697, "learning_rate": 1.8674941062177117e-06, "loss": 0.1066, "num_input_tokens_seen": 34376096, "step": 50995 }, { "epoch": 1.2459384848410817, "grad_norm": 0.11165308207273483, "learning_rate": 1.8674516819381657e-06, "loss": 0.0379, "num_input_tokens_seen": 34379616, "step": 51000 }, { "epoch": 1.246060635672929, "grad_norm": 1.0125317573547363, "learning_rate": 1.867409251350278e-06, "loss": 0.0926, "num_input_tokens_seen": 34382816, "step": 51005 }, { "epoch": 1.246182786504776, "grad_norm": 36.29445266723633, "learning_rate": 1.8673668144543567e-06, "loss": 0.116, "num_input_tokens_seen": 34386208, "step": 51010 }, { "epoch": 1.2463049373366233, "grad_norm": 9.032966613769531, "learning_rate": 1.867324371250711e-06, "loss": 0.1146, "num_input_tokens_seen": 34389344, "step": 51015 }, { "epoch": 1.2464270881684705, "grad_norm": 3.523752450942993, "learning_rate": 1.8672819217396491e-06, "loss": 0.0864, "num_input_tokens_seen": 34392800, "step": 51020 }, { "epoch": 1.2465492390003177, "grad_norm": 0.07758628576993942, "learning_rate": 1.8672394659214797e-06, "loss": 0.056, "num_input_tokens_seen": 34396064, "step": 51025 }, { "epoch": 1.2466713898321649, "grad_norm": 17.795822143554688, "learning_rate": 1.8671970037965116e-06, "loss": 0.1655, "num_input_tokens_seen": 34398880, "step": 51030 }, { "epoch": 1.246793540664012, "grad_norm": 133.96359252929688, "learning_rate": 1.8671545353650537e-06, "loss": 0.1223, "num_input_tokens_seen": 34402400, "step": 51035 }, { "epoch": 1.246915691495859, "grad_norm": 0.14668966829776764, "learning_rate": 1.8671120606274149e-06, "loss": 0.0025, "num_input_tokens_seen": 34405984, "step": 51040 }, { "epoch": 1.2470378423277062, "grad_norm": 30.48691749572754, "learning_rate": 1.8670695795839038e-06, "loss": 0.0859, "num_input_tokens_seen": 34409440, "step": 51045 }, { "epoch": 1.2471599931595534, "grad_norm": 0.7731919288635254, "learning_rate": 1.8670270922348296e-06, "loss": 0.0481, "num_input_tokens_seen": 34412576, "step": 51050 }, { "epoch": 1.2472821439914006, "grad_norm": 0.43882834911346436, "learning_rate": 1.866984598580501e-06, "loss": 0.1475, "num_input_tokens_seen": 34415776, "step": 51055 }, { "epoch": 1.2474042948232478, "grad_norm": 0.40969663858413696, "learning_rate": 1.8669420986212274e-06, "loss": 0.0036, "num_input_tokens_seen": 34419296, "step": 51060 }, { "epoch": 1.247526445655095, "grad_norm": 3.498124361038208, "learning_rate": 1.866899592357318e-06, "loss": 0.1043, "num_input_tokens_seen": 34422432, "step": 51065 }, { "epoch": 1.2476485964869422, "grad_norm": 62.7524299621582, "learning_rate": 1.866857079789081e-06, "loss": 0.0421, "num_input_tokens_seen": 34425568, "step": 51070 }, { "epoch": 1.2477707473187891, "grad_norm": 38.86537170410156, "learning_rate": 1.8668145609168265e-06, "loss": 0.1972, "num_input_tokens_seen": 34428960, "step": 51075 }, { "epoch": 1.2478928981506363, "grad_norm": 42.30865478515625, "learning_rate": 1.8667720357408632e-06, "loss": 0.0566, "num_input_tokens_seen": 34432352, "step": 51080 }, { "epoch": 1.2480150489824835, "grad_norm": 18.48854637145996, "learning_rate": 1.8667295042615006e-06, "loss": 0.0437, "num_input_tokens_seen": 34435680, "step": 51085 }, { "epoch": 1.2481371998143307, "grad_norm": 0.07613479346036911, "learning_rate": 1.866686966479048e-06, "loss": 0.0038, "num_input_tokens_seen": 34439904, "step": 51090 }, { "epoch": 1.2482593506461779, "grad_norm": 199.28736877441406, "learning_rate": 1.8666444223938145e-06, "loss": 0.2072, "num_input_tokens_seen": 34443104, "step": 51095 }, { "epoch": 1.248381501478025, "grad_norm": 3.169609546661377, "learning_rate": 1.8666018720061097e-06, "loss": 0.2199, "num_input_tokens_seen": 34446240, "step": 51100 }, { "epoch": 1.2485036523098723, "grad_norm": 0.19156071543693542, "learning_rate": 1.8665593153162429e-06, "loss": 0.0821, "num_input_tokens_seen": 34449888, "step": 51105 }, { "epoch": 1.2486258031417194, "grad_norm": 0.11386224627494812, "learning_rate": 1.8665167523245238e-06, "loss": 0.0439, "num_input_tokens_seen": 34453536, "step": 51110 }, { "epoch": 1.2487479539735666, "grad_norm": 0.17927947640419006, "learning_rate": 1.8664741830312618e-06, "loss": 0.0081, "num_input_tokens_seen": 34456928, "step": 51115 }, { "epoch": 1.2488701048054138, "grad_norm": 0.2570532262325287, "learning_rate": 1.8664316074367666e-06, "loss": 0.0491, "num_input_tokens_seen": 34459936, "step": 51120 }, { "epoch": 1.2489922556372608, "grad_norm": 7.341938018798828, "learning_rate": 1.8663890255413474e-06, "loss": 0.0583, "num_input_tokens_seen": 34463328, "step": 51125 }, { "epoch": 1.249114406469108, "grad_norm": 0.4755401313304901, "learning_rate": 1.8663464373453146e-06, "loss": 0.1613, "num_input_tokens_seen": 34466528, "step": 51130 }, { "epoch": 1.2492365573009552, "grad_norm": 49.70253372192383, "learning_rate": 1.8663038428489775e-06, "loss": 0.1676, "num_input_tokens_seen": 34470048, "step": 51135 }, { "epoch": 1.2493587081328024, "grad_norm": 10.13922119140625, "learning_rate": 1.8662612420526455e-06, "loss": 0.0033, "num_input_tokens_seen": 34473632, "step": 51140 }, { "epoch": 1.2494808589646496, "grad_norm": 1.5358637571334839, "learning_rate": 1.866218634956629e-06, "loss": 0.0473, "num_input_tokens_seen": 34476640, "step": 51145 }, { "epoch": 1.2496030097964967, "grad_norm": 0.0326588898897171, "learning_rate": 1.8661760215612374e-06, "loss": 0.0546, "num_input_tokens_seen": 34479904, "step": 51150 }, { "epoch": 1.249725160628344, "grad_norm": 0.05521900951862335, "learning_rate": 1.8661334018667806e-06, "loss": 0.0025, "num_input_tokens_seen": 34483360, "step": 51155 }, { "epoch": 1.2498473114601911, "grad_norm": 188.50030517578125, "learning_rate": 1.8660907758735693e-06, "loss": 0.1366, "num_input_tokens_seen": 34486240, "step": 51160 }, { "epoch": 1.249969462292038, "grad_norm": 18.786170959472656, "learning_rate": 1.8660481435819127e-06, "loss": 0.1577, "num_input_tokens_seen": 34489248, "step": 51165 }, { "epoch": 1.2500916131238853, "grad_norm": 89.58645629882812, "learning_rate": 1.8660055049921209e-06, "loss": 0.1395, "num_input_tokens_seen": 34492320, "step": 51170 }, { "epoch": 1.2500916131238853, "eval_loss": 0.1528467833995819, "eval_runtime": 47.3828, "eval_samples_per_second": 767.894, "eval_steps_per_second": 96.005, "num_input_tokens_seen": 34492320, "step": 51170 }, { "epoch": 1.2502137639557325, "grad_norm": 0.44812142848968506, "learning_rate": 1.8659628601045043e-06, "loss": 0.0012, "num_input_tokens_seen": 34495200, "step": 51175 }, { "epoch": 1.2503359147875797, "grad_norm": 0.7981209754943848, "learning_rate": 1.8659202089193728e-06, "loss": 0.1282, "num_input_tokens_seen": 34498272, "step": 51180 }, { "epoch": 1.2504580656194269, "grad_norm": 0.22657358646392822, "learning_rate": 1.8658775514370366e-06, "loss": 0.0379, "num_input_tokens_seen": 34501344, "step": 51185 }, { "epoch": 1.250580216451274, "grad_norm": 44.42075729370117, "learning_rate": 1.865834887657806e-06, "loss": 0.0774, "num_input_tokens_seen": 34504608, "step": 51190 }, { "epoch": 1.2507023672831212, "grad_norm": 31.185182571411133, "learning_rate": 1.8657922175819913e-06, "loss": 0.0997, "num_input_tokens_seen": 34508256, "step": 51195 }, { "epoch": 1.2508245181149684, "grad_norm": 0.07159898430109024, "learning_rate": 1.8657495412099026e-06, "loss": 0.0893, "num_input_tokens_seen": 34511968, "step": 51200 }, { "epoch": 1.2509466689468156, "grad_norm": 0.10588917136192322, "learning_rate": 1.8657068585418502e-06, "loss": 0.1052, "num_input_tokens_seen": 34515232, "step": 51205 }, { "epoch": 1.2510688197786628, "grad_norm": 81.37196350097656, "learning_rate": 1.865664169578145e-06, "loss": 0.1418, "num_input_tokens_seen": 34518240, "step": 51210 }, { "epoch": 1.25119097061051, "grad_norm": 0.0456620417535305, "learning_rate": 1.8656214743190972e-06, "loss": 0.0726, "num_input_tokens_seen": 34521696, "step": 51215 }, { "epoch": 1.251313121442357, "grad_norm": 74.22692108154297, "learning_rate": 1.865578772765017e-06, "loss": 0.2161, "num_input_tokens_seen": 34524768, "step": 51220 }, { "epoch": 1.2514352722742041, "grad_norm": 1.6251906156539917, "learning_rate": 1.8655360649162151e-06, "loss": 0.0474, "num_input_tokens_seen": 34527776, "step": 51225 }, { "epoch": 1.2515574231060513, "grad_norm": 0.29848966002464294, "learning_rate": 1.8654933507730025e-06, "loss": 0.065, "num_input_tokens_seen": 34531040, "step": 51230 }, { "epoch": 1.2516795739378985, "grad_norm": 9.70097827911377, "learning_rate": 1.865450630335689e-06, "loss": 0.0676, "num_input_tokens_seen": 34534560, "step": 51235 }, { "epoch": 1.2518017247697457, "grad_norm": 13.933403015136719, "learning_rate": 1.865407903604586e-06, "loss": 0.1243, "num_input_tokens_seen": 34537568, "step": 51240 }, { "epoch": 1.251923875601593, "grad_norm": 42.59235382080078, "learning_rate": 1.865365170580004e-06, "loss": 0.1417, "num_input_tokens_seen": 34540960, "step": 51245 }, { "epoch": 1.2520460264334399, "grad_norm": 2.9506120681762695, "learning_rate": 1.8653224312622534e-06, "loss": 0.1178, "num_input_tokens_seen": 34544416, "step": 51250 }, { "epoch": 1.252168177265287, "grad_norm": 0.5883849859237671, "learning_rate": 1.8652796856516458e-06, "loss": 0.0324, "num_input_tokens_seen": 34547232, "step": 51255 }, { "epoch": 1.2522903280971343, "grad_norm": 0.2781030833721161, "learning_rate": 1.8652369337484912e-06, "loss": 0.002, "num_input_tokens_seen": 34550240, "step": 51260 }, { "epoch": 1.2524124789289814, "grad_norm": 0.769974946975708, "learning_rate": 1.8651941755531012e-06, "loss": 0.1198, "num_input_tokens_seen": 34553248, "step": 51265 }, { "epoch": 1.2525346297608286, "grad_norm": 0.09712295234203339, "learning_rate": 1.8651514110657863e-06, "loss": 0.0548, "num_input_tokens_seen": 34556384, "step": 51270 }, { "epoch": 1.2526567805926758, "grad_norm": 0.042555950582027435, "learning_rate": 1.8651086402868574e-06, "loss": 0.1616, "num_input_tokens_seen": 34560224, "step": 51275 }, { "epoch": 1.252778931424523, "grad_norm": 0.05184526368975639, "learning_rate": 1.865065863216626e-06, "loss": 0.1452, "num_input_tokens_seen": 34563296, "step": 51280 }, { "epoch": 1.2529010822563702, "grad_norm": 7.814172267913818, "learning_rate": 1.865023079855403e-06, "loss": 0.0967, "num_input_tokens_seen": 34566752, "step": 51285 }, { "epoch": 1.2530232330882174, "grad_norm": 1.3137484788894653, "learning_rate": 1.8649802902034995e-06, "loss": 0.0183, "num_input_tokens_seen": 34570528, "step": 51290 }, { "epoch": 1.2531453839200646, "grad_norm": 1.5455907583236694, "learning_rate": 1.8649374942612266e-06, "loss": 0.0374, "num_input_tokens_seen": 34573856, "step": 51295 }, { "epoch": 1.2532675347519118, "grad_norm": 15.697102546691895, "learning_rate": 1.8648946920288956e-06, "loss": 0.1031, "num_input_tokens_seen": 34577376, "step": 51300 }, { "epoch": 1.253389685583759, "grad_norm": 0.39699557423591614, "learning_rate": 1.864851883506818e-06, "loss": 0.0986, "num_input_tokens_seen": 34580640, "step": 51305 }, { "epoch": 1.253511836415606, "grad_norm": 74.94749450683594, "learning_rate": 1.8648090686953046e-06, "loss": 0.0321, "num_input_tokens_seen": 34584160, "step": 51310 }, { "epoch": 1.2536339872474531, "grad_norm": 0.4404575228691101, "learning_rate": 1.8647662475946673e-06, "loss": 0.1095, "num_input_tokens_seen": 34587488, "step": 51315 }, { "epoch": 1.2537561380793003, "grad_norm": 1.1401361227035522, "learning_rate": 1.864723420205217e-06, "loss": 0.0014, "num_input_tokens_seen": 34591264, "step": 51320 }, { "epoch": 1.2538782889111475, "grad_norm": 9.906758308410645, "learning_rate": 1.8646805865272655e-06, "loss": 0.0824, "num_input_tokens_seen": 34594848, "step": 51325 }, { "epoch": 1.2540004397429947, "grad_norm": 10.644179344177246, "learning_rate": 1.864637746561124e-06, "loss": 0.0936, "num_input_tokens_seen": 34598304, "step": 51330 }, { "epoch": 1.2541225905748419, "grad_norm": 7.708846092224121, "learning_rate": 1.8645949003071047e-06, "loss": 0.1095, "num_input_tokens_seen": 34602144, "step": 51335 }, { "epoch": 1.2542447414066888, "grad_norm": 0.32949671149253845, "learning_rate": 1.8645520477655184e-06, "loss": 0.0402, "num_input_tokens_seen": 34605280, "step": 51340 }, { "epoch": 1.254366892238536, "grad_norm": 10.342193603515625, "learning_rate": 1.8645091889366774e-06, "loss": 0.198, "num_input_tokens_seen": 34608224, "step": 51345 }, { "epoch": 1.2544890430703832, "grad_norm": 1.9661191701889038, "learning_rate": 1.8644663238208927e-06, "loss": 0.093, "num_input_tokens_seen": 34612000, "step": 51350 }, { "epoch": 1.2546111939022304, "grad_norm": 5.437589168548584, "learning_rate": 1.8644234524184762e-06, "loss": 0.0053, "num_input_tokens_seen": 34615072, "step": 51355 }, { "epoch": 1.2547333447340776, "grad_norm": 5.895714282989502, "learning_rate": 1.8643805747297402e-06, "loss": 0.0437, "num_input_tokens_seen": 34618272, "step": 51360 }, { "epoch": 1.2548554955659248, "grad_norm": 9.616518020629883, "learning_rate": 1.8643376907549963e-06, "loss": 0.2466, "num_input_tokens_seen": 34621792, "step": 51365 }, { "epoch": 1.254977646397772, "grad_norm": 0.07669886201620102, "learning_rate": 1.864294800494556e-06, "loss": 0.0016, "num_input_tokens_seen": 34626208, "step": 51370 }, { "epoch": 1.2550997972296192, "grad_norm": 12.242938041687012, "learning_rate": 1.8642519039487317e-06, "loss": 0.0824, "num_input_tokens_seen": 34629600, "step": 51375 }, { "epoch": 1.2552219480614664, "grad_norm": 10.630461692810059, "learning_rate": 1.8642090011178348e-06, "loss": 0.1086, "num_input_tokens_seen": 34633056, "step": 51380 }, { "epoch": 1.2553440988933136, "grad_norm": 0.5824533700942993, "learning_rate": 1.8641660920021778e-06, "loss": 0.0059, "num_input_tokens_seen": 34636896, "step": 51385 }, { "epoch": 1.2554662497251607, "grad_norm": 19.69352149963379, "learning_rate": 1.8641231766020724e-06, "loss": 0.1896, "num_input_tokens_seen": 34640160, "step": 51390 }, { "epoch": 1.2555884005570077, "grad_norm": 4.431107521057129, "learning_rate": 1.864080254917831e-06, "loss": 0.1344, "num_input_tokens_seen": 34643488, "step": 51395 }, { "epoch": 1.255710551388855, "grad_norm": 8.024478912353516, "learning_rate": 1.8640373269497653e-06, "loss": 0.0929, "num_input_tokens_seen": 34647072, "step": 51400 }, { "epoch": 1.255832702220702, "grad_norm": 16.997468948364258, "learning_rate": 1.8639943926981881e-06, "loss": 0.105, "num_input_tokens_seen": 34650656, "step": 51405 }, { "epoch": 1.2559548530525493, "grad_norm": 7.817405700683594, "learning_rate": 1.863951452163411e-06, "loss": 0.0657, "num_input_tokens_seen": 34653792, "step": 51410 }, { "epoch": 1.2560770038843965, "grad_norm": 0.0734686404466629, "learning_rate": 1.8639085053457469e-06, "loss": 0.0307, "num_input_tokens_seen": 34656864, "step": 51415 }, { "epoch": 1.2561991547162437, "grad_norm": 0.5896303057670593, "learning_rate": 1.8638655522455072e-06, "loss": 0.0582, "num_input_tokens_seen": 34660256, "step": 51420 }, { "epoch": 1.2563213055480908, "grad_norm": 1.4299993515014648, "learning_rate": 1.8638225928630053e-06, "loss": 0.1335, "num_input_tokens_seen": 34663328, "step": 51425 }, { "epoch": 1.2564434563799378, "grad_norm": 0.17183057963848114, "learning_rate": 1.8637796271985532e-06, "loss": 0.0228, "num_input_tokens_seen": 34666720, "step": 51430 }, { "epoch": 1.256565607211785, "grad_norm": 44.34275817871094, "learning_rate": 1.8637366552524632e-06, "loss": 0.1212, "num_input_tokens_seen": 34670176, "step": 51435 }, { "epoch": 1.2566877580436322, "grad_norm": 0.44418561458587646, "learning_rate": 1.863693677025048e-06, "loss": 0.046, "num_input_tokens_seen": 34673312, "step": 51440 }, { "epoch": 1.2568099088754794, "grad_norm": 10.671781539916992, "learning_rate": 1.86365069251662e-06, "loss": 0.1023, "num_input_tokens_seen": 34676448, "step": 51445 }, { "epoch": 1.2569320597073266, "grad_norm": 0.2875211238861084, "learning_rate": 1.8636077017274917e-06, "loss": 0.0908, "num_input_tokens_seen": 34679456, "step": 51450 }, { "epoch": 1.2570542105391738, "grad_norm": 5.269903659820557, "learning_rate": 1.8635647046579762e-06, "loss": 0.0017, "num_input_tokens_seen": 34682656, "step": 51455 }, { "epoch": 1.257176361371021, "grad_norm": 0.19323447346687317, "learning_rate": 1.863521701308386e-06, "loss": 0.1742, "num_input_tokens_seen": 34686240, "step": 51460 }, { "epoch": 1.2572985122028681, "grad_norm": 0.09027817845344543, "learning_rate": 1.8634786916790332e-06, "loss": 0.1492, "num_input_tokens_seen": 34689568, "step": 51465 }, { "epoch": 1.2574206630347153, "grad_norm": 0.43901634216308594, "learning_rate": 1.8634356757702316e-06, "loss": 0.0579, "num_input_tokens_seen": 34692640, "step": 51470 }, { "epoch": 1.2575428138665625, "grad_norm": 0.5162341594696045, "learning_rate": 1.8633926535822932e-06, "loss": 0.0578, "num_input_tokens_seen": 34695840, "step": 51475 }, { "epoch": 1.2576649646984097, "grad_norm": 47.497135162353516, "learning_rate": 1.8633496251155314e-06, "loss": 0.0701, "num_input_tokens_seen": 34701216, "step": 51480 }, { "epoch": 1.2577871155302567, "grad_norm": 0.3074395954608917, "learning_rate": 1.8633065903702588e-06, "loss": 0.1401, "num_input_tokens_seen": 34704480, "step": 51485 }, { "epoch": 1.2579092663621039, "grad_norm": 0.1671035885810852, "learning_rate": 1.8632635493467887e-06, "loss": 0.0023, "num_input_tokens_seen": 34708000, "step": 51490 }, { "epoch": 1.258031417193951, "grad_norm": 0.5071985125541687, "learning_rate": 1.8632205020454336e-06, "loss": 0.0715, "num_input_tokens_seen": 34711136, "step": 51495 }, { "epoch": 1.2581535680257983, "grad_norm": 25.011734008789062, "learning_rate": 1.8631774484665067e-06, "loss": 0.0838, "num_input_tokens_seen": 34714592, "step": 51500 }, { "epoch": 1.2582757188576454, "grad_norm": 0.7971920967102051, "learning_rate": 1.8631343886103218e-06, "loss": 0.1563, "num_input_tokens_seen": 34717984, "step": 51505 }, { "epoch": 1.2583978696894926, "grad_norm": 19.045324325561523, "learning_rate": 1.863091322477191e-06, "loss": 0.1384, "num_input_tokens_seen": 34721184, "step": 51510 }, { "epoch": 1.2585200205213398, "grad_norm": 0.11327563226222992, "learning_rate": 1.863048250067428e-06, "loss": 0.1524, "num_input_tokens_seen": 34724192, "step": 51515 }, { "epoch": 1.2586421713531868, "grad_norm": 0.3514478802680969, "learning_rate": 1.863005171381346e-06, "loss": 0.0591, "num_input_tokens_seen": 34727392, "step": 51520 }, { "epoch": 1.258764322185034, "grad_norm": 14.835415840148926, "learning_rate": 1.8629620864192588e-06, "loss": 0.1017, "num_input_tokens_seen": 34731808, "step": 51525 }, { "epoch": 1.2588864730168812, "grad_norm": 5.161327838897705, "learning_rate": 1.8629189951814785e-06, "loss": 0.0119, "num_input_tokens_seen": 34735072, "step": 51530 }, { "epoch": 1.2590086238487284, "grad_norm": 2.658921003341675, "learning_rate": 1.8628758976683195e-06, "loss": 0.046, "num_input_tokens_seen": 34738784, "step": 51535 }, { "epoch": 1.2591307746805755, "grad_norm": 0.17709505558013916, "learning_rate": 1.862832793880095e-06, "loss": 0.0241, "num_input_tokens_seen": 34742176, "step": 51540 }, { "epoch": 1.2592529255124227, "grad_norm": 0.27550020813941956, "learning_rate": 1.8627896838171182e-06, "loss": 0.1454, "num_input_tokens_seen": 34745696, "step": 51545 }, { "epoch": 1.25937507634427, "grad_norm": 0.6853047013282776, "learning_rate": 1.8627465674797027e-06, "loss": 0.0427, "num_input_tokens_seen": 34748704, "step": 51550 }, { "epoch": 1.2594972271761171, "grad_norm": 0.29485732316970825, "learning_rate": 1.862703444868162e-06, "loss": 0.0989, "num_input_tokens_seen": 34752352, "step": 51555 }, { "epoch": 1.2596193780079643, "grad_norm": 2.1250858306884766, "learning_rate": 1.8626603159828101e-06, "loss": 0.0028, "num_input_tokens_seen": 34755808, "step": 51560 }, { "epoch": 1.2597415288398115, "grad_norm": 0.1356990784406662, "learning_rate": 1.86261718082396e-06, "loss": 0.1658, "num_input_tokens_seen": 34759520, "step": 51565 }, { "epoch": 1.2598636796716587, "grad_norm": 38.832767486572266, "learning_rate": 1.862574039391926e-06, "loss": 0.003, "num_input_tokens_seen": 34762976, "step": 51570 }, { "epoch": 1.2599858305035057, "grad_norm": 0.2098708599805832, "learning_rate": 1.8625308916870215e-06, "loss": 0.0021, "num_input_tokens_seen": 34766176, "step": 51575 }, { "epoch": 1.2601079813353528, "grad_norm": 34.0190544128418, "learning_rate": 1.8624877377095604e-06, "loss": 0.157, "num_input_tokens_seen": 34769440, "step": 51580 }, { "epoch": 1.2602301321672, "grad_norm": 103.00405883789062, "learning_rate": 1.8624445774598565e-06, "loss": 0.0354, "num_input_tokens_seen": 34772576, "step": 51585 }, { "epoch": 1.2603522829990472, "grad_norm": 196.07667541503906, "learning_rate": 1.8624014109382236e-06, "loss": 0.0539, "num_input_tokens_seen": 34776352, "step": 51590 }, { "epoch": 1.2604744338308944, "grad_norm": 3.750955820083618, "learning_rate": 1.8623582381449757e-06, "loss": 0.1369, "num_input_tokens_seen": 34779616, "step": 51595 }, { "epoch": 1.2605965846627416, "grad_norm": 123.82447814941406, "learning_rate": 1.8623150590804269e-06, "loss": 0.1751, "num_input_tokens_seen": 34783136, "step": 51600 }, { "epoch": 1.2607187354945888, "grad_norm": 0.05092499777674675, "learning_rate": 1.8622718737448908e-06, "loss": 0.0823, "num_input_tokens_seen": 34786400, "step": 51605 }, { "epoch": 1.2608408863264358, "grad_norm": 45.56595993041992, "learning_rate": 1.862228682138682e-06, "loss": 0.0585, "num_input_tokens_seen": 34789408, "step": 51610 }, { "epoch": 1.260963037158283, "grad_norm": 0.2559904456138611, "learning_rate": 1.8621854842621141e-06, "loss": 0.0389, "num_input_tokens_seen": 34792864, "step": 51615 }, { "epoch": 1.2610851879901301, "grad_norm": 12.372807502746582, "learning_rate": 1.8621422801155014e-06, "loss": 0.1365, "num_input_tokens_seen": 34796000, "step": 51620 }, { "epoch": 1.2612073388219773, "grad_norm": 0.3244743049144745, "learning_rate": 1.8620990696991586e-06, "loss": 0.1355, "num_input_tokens_seen": 34799264, "step": 51625 }, { "epoch": 1.2613294896538245, "grad_norm": 0.3361222445964813, "learning_rate": 1.862055853013399e-06, "loss": 0.1614, "num_input_tokens_seen": 34802528, "step": 51630 }, { "epoch": 1.2614516404856717, "grad_norm": 41.69643020629883, "learning_rate": 1.8620126300585372e-06, "loss": 0.1865, "num_input_tokens_seen": 34805984, "step": 51635 }, { "epoch": 1.261573791317519, "grad_norm": 22.042510986328125, "learning_rate": 1.861969400834888e-06, "loss": 0.0727, "num_input_tokens_seen": 34809568, "step": 51640 }, { "epoch": 1.261695942149366, "grad_norm": 0.43014460802078247, "learning_rate": 1.8619261653427655e-06, "loss": 0.0692, "num_input_tokens_seen": 34813216, "step": 51645 }, { "epoch": 1.2618180929812133, "grad_norm": 1.0715209245681763, "learning_rate": 1.8618829235824841e-06, "loss": 0.0024, "num_input_tokens_seen": 34816480, "step": 51650 }, { "epoch": 1.2619402438130605, "grad_norm": 0.30852288007736206, "learning_rate": 1.8618396755543584e-06, "loss": 0.0105, "num_input_tokens_seen": 34819872, "step": 51655 }, { "epoch": 1.2620623946449077, "grad_norm": 160.23773193359375, "learning_rate": 1.8617964212587027e-06, "loss": 0.0565, "num_input_tokens_seen": 34823200, "step": 51660 }, { "epoch": 1.2621845454767546, "grad_norm": 0.37034016847610474, "learning_rate": 1.8617531606958315e-06, "loss": 0.0384, "num_input_tokens_seen": 34826528, "step": 51665 }, { "epoch": 1.2623066963086018, "grad_norm": 0.008391081355512142, "learning_rate": 1.8617098938660595e-06, "loss": 0.0596, "num_input_tokens_seen": 34829728, "step": 51670 }, { "epoch": 1.262428847140449, "grad_norm": 13.188200950622559, "learning_rate": 1.8616666207697015e-06, "loss": 0.0799, "num_input_tokens_seen": 34833184, "step": 51675 }, { "epoch": 1.2625509979722962, "grad_norm": 3.760244846343994, "learning_rate": 1.8616233414070721e-06, "loss": 0.0877, "num_input_tokens_seen": 34836576, "step": 51680 }, { "epoch": 1.2626731488041434, "grad_norm": 0.5174466967582703, "learning_rate": 1.861580055778486e-06, "loss": 0.098, "num_input_tokens_seen": 34840352, "step": 51685 }, { "epoch": 1.2627952996359906, "grad_norm": 0.07368353754281998, "learning_rate": 1.861536763884258e-06, "loss": 0.0007, "num_input_tokens_seen": 34843680, "step": 51690 }, { "epoch": 1.2629174504678378, "grad_norm": 12.683340072631836, "learning_rate": 1.8614934657247028e-06, "loss": 0.1131, "num_input_tokens_seen": 34847008, "step": 51695 }, { "epoch": 1.2630396012996847, "grad_norm": 0.0876552015542984, "learning_rate": 1.8614501613001354e-06, "loss": 0.0387, "num_input_tokens_seen": 34850720, "step": 51700 }, { "epoch": 1.263161752131532, "grad_norm": 287.899169921875, "learning_rate": 1.8614068506108708e-06, "loss": 0.162, "num_input_tokens_seen": 34854432, "step": 51705 }, { "epoch": 1.2632839029633791, "grad_norm": 51.08266830444336, "learning_rate": 1.861363533657224e-06, "loss": 0.0343, "num_input_tokens_seen": 34857632, "step": 51710 }, { "epoch": 1.2634060537952263, "grad_norm": 63.332008361816406, "learning_rate": 1.8613202104395098e-06, "loss": 0.0383, "num_input_tokens_seen": 34861024, "step": 51715 }, { "epoch": 1.2635282046270735, "grad_norm": 1.2061738967895508, "learning_rate": 1.8612768809580435e-06, "loss": 0.0816, "num_input_tokens_seen": 34864160, "step": 51720 }, { "epoch": 1.2636503554589207, "grad_norm": 186.6913604736328, "learning_rate": 1.8612335452131398e-06, "loss": 0.1188, "num_input_tokens_seen": 34867424, "step": 51725 }, { "epoch": 1.2637725062907679, "grad_norm": 254.61875915527344, "learning_rate": 1.8611902032051141e-06, "loss": 0.1256, "num_input_tokens_seen": 34870880, "step": 51730 }, { "epoch": 1.263894657122615, "grad_norm": 2.1379923820495605, "learning_rate": 1.861146854934282e-06, "loss": 0.0437, "num_input_tokens_seen": 34874464, "step": 51735 }, { "epoch": 1.2640168079544623, "grad_norm": 0.7487949132919312, "learning_rate": 1.861103500400958e-06, "loss": 0.0037, "num_input_tokens_seen": 34877664, "step": 51740 }, { "epoch": 1.2641389587863094, "grad_norm": 0.02050768956542015, "learning_rate": 1.8610601396054579e-06, "loss": 0.1053, "num_input_tokens_seen": 34881312, "step": 51745 }, { "epoch": 1.2642611096181566, "grad_norm": 0.02770630270242691, "learning_rate": 1.8610167725480967e-06, "loss": 0.0473, "num_input_tokens_seen": 34884384, "step": 51750 }, { "epoch": 1.2643832604500036, "grad_norm": 1.7987079620361328, "learning_rate": 1.86097339922919e-06, "loss": 0.0283, "num_input_tokens_seen": 34887712, "step": 51755 }, { "epoch": 1.2645054112818508, "grad_norm": 1.5875380039215088, "learning_rate": 1.8609300196490532e-06, "loss": 0.0492, "num_input_tokens_seen": 34891168, "step": 51760 }, { "epoch": 1.264627562113698, "grad_norm": 2.108224868774414, "learning_rate": 1.8608866338080018e-06, "loss": 0.1076, "num_input_tokens_seen": 34894688, "step": 51765 }, { "epoch": 1.2647497129455452, "grad_norm": 0.34903234243392944, "learning_rate": 1.8608432417063512e-06, "loss": 0.1494, "num_input_tokens_seen": 34897760, "step": 51770 }, { "epoch": 1.2648718637773924, "grad_norm": 0.11632052809000015, "learning_rate": 1.860799843344417e-06, "loss": 0.0673, "num_input_tokens_seen": 34901216, "step": 51775 }, { "epoch": 1.2649940146092395, "grad_norm": 0.1959635615348816, "learning_rate": 1.860756438722515e-06, "loss": 0.0012, "num_input_tokens_seen": 34904544, "step": 51780 }, { "epoch": 1.2651161654410865, "grad_norm": 148.51904296875, "learning_rate": 1.8607130278409603e-06, "loss": 0.0723, "num_input_tokens_seen": 34907552, "step": 51785 }, { "epoch": 1.2652383162729337, "grad_norm": 0.23096473515033722, "learning_rate": 1.8606696107000692e-06, "loss": 0.1587, "num_input_tokens_seen": 34911200, "step": 51790 }, { "epoch": 1.265360467104781, "grad_norm": 0.1959235966205597, "learning_rate": 1.860626187300157e-06, "loss": 0.0537, "num_input_tokens_seen": 34915232, "step": 51795 }, { "epoch": 1.265482617936628, "grad_norm": 0.17887307703495026, "learning_rate": 1.86058275764154e-06, "loss": 0.1449, "num_input_tokens_seen": 34918304, "step": 51800 }, { "epoch": 1.2656047687684753, "grad_norm": 0.13273105025291443, "learning_rate": 1.8605393217245336e-06, "loss": 0.1132, "num_input_tokens_seen": 34922144, "step": 51805 }, { "epoch": 1.2657269196003225, "grad_norm": 0.9345943927764893, "learning_rate": 1.8604958795494535e-06, "loss": 0.0423, "num_input_tokens_seen": 34925152, "step": 51810 }, { "epoch": 1.2658490704321697, "grad_norm": 14.5985689163208, "learning_rate": 1.8604524311166163e-06, "loss": 0.1276, "num_input_tokens_seen": 34928608, "step": 51815 }, { "epoch": 1.2659712212640168, "grad_norm": 0.25379669666290283, "learning_rate": 1.8604089764263375e-06, "loss": 0.0426, "num_input_tokens_seen": 34931744, "step": 51820 }, { "epoch": 1.266093372095864, "grad_norm": 9.009862899780273, "learning_rate": 1.8603655154789331e-06, "loss": 0.0938, "num_input_tokens_seen": 34935392, "step": 51825 }, { "epoch": 1.2662155229277112, "grad_norm": 191.35870361328125, "learning_rate": 1.8603220482747192e-06, "loss": 0.2035, "num_input_tokens_seen": 34938400, "step": 51830 }, { "epoch": 1.2663376737595584, "grad_norm": 0.27894648909568787, "learning_rate": 1.8602785748140122e-06, "loss": 0.227, "num_input_tokens_seen": 34942240, "step": 51835 }, { "epoch": 1.2664598245914056, "grad_norm": 211.6515350341797, "learning_rate": 1.8602350950971277e-06, "loss": 0.0861, "num_input_tokens_seen": 34945504, "step": 51840 }, { "epoch": 1.2665819754232526, "grad_norm": 0.154354065656662, "learning_rate": 1.8601916091243825e-06, "loss": 0.1287, "num_input_tokens_seen": 34948768, "step": 51845 }, { "epoch": 1.2667041262550998, "grad_norm": 1.5715100765228271, "learning_rate": 1.8601481168960925e-06, "loss": 0.0453, "num_input_tokens_seen": 34952224, "step": 51850 }, { "epoch": 1.266826277086947, "grad_norm": 11.307893753051758, "learning_rate": 1.860104618412574e-06, "loss": 0.0872, "num_input_tokens_seen": 34956128, "step": 51855 }, { "epoch": 1.2669484279187941, "grad_norm": 0.08968228101730347, "learning_rate": 1.8600611136741432e-06, "loss": 0.0027, "num_input_tokens_seen": 34959840, "step": 51860 }, { "epoch": 1.2670705787506413, "grad_norm": 61.685367584228516, "learning_rate": 1.8600176026811169e-06, "loss": 0.1355, "num_input_tokens_seen": 34962912, "step": 51865 }, { "epoch": 1.2671927295824885, "grad_norm": 0.35035207867622375, "learning_rate": 1.8599740854338112e-06, "loss": 0.1379, "num_input_tokens_seen": 34966560, "step": 51870 }, { "epoch": 1.2673148804143355, "grad_norm": 0.2698616683483124, "learning_rate": 1.8599305619325428e-06, "loss": 0.009, "num_input_tokens_seen": 34969952, "step": 51875 }, { "epoch": 1.2674370312461827, "grad_norm": 9.4758882522583, "learning_rate": 1.8598870321776278e-06, "loss": 0.1323, "num_input_tokens_seen": 34973344, "step": 51880 }, { "epoch": 1.2675591820780299, "grad_norm": 18.896320343017578, "learning_rate": 1.8598434961693833e-06, "loss": 0.0956, "num_input_tokens_seen": 34976800, "step": 51885 }, { "epoch": 1.267681332909877, "grad_norm": 0.1476629227399826, "learning_rate": 1.8597999539081255e-06, "loss": 0.0017, "num_input_tokens_seen": 34980384, "step": 51890 }, { "epoch": 1.2678034837417242, "grad_norm": 8.873211860656738, "learning_rate": 1.859756405394171e-06, "loss": 0.0439, "num_input_tokens_seen": 34983904, "step": 51895 }, { "epoch": 1.2679256345735714, "grad_norm": 0.32919931411743164, "learning_rate": 1.8597128506278365e-06, "loss": 0.1104, "num_input_tokens_seen": 34987232, "step": 51900 }, { "epoch": 1.2680477854054186, "grad_norm": 9.547355651855469, "learning_rate": 1.8596692896094394e-06, "loss": 0.2373, "num_input_tokens_seen": 34990624, "step": 51905 }, { "epoch": 1.2681699362372658, "grad_norm": 0.020996596664190292, "learning_rate": 1.8596257223392959e-06, "loss": 0.0839, "num_input_tokens_seen": 34993568, "step": 51910 }, { "epoch": 1.268292087069113, "grad_norm": 0.9302226305007935, "learning_rate": 1.8595821488177228e-06, "loss": 0.0244, "num_input_tokens_seen": 34996960, "step": 51915 }, { "epoch": 1.2684142379009602, "grad_norm": 2.4010121822357178, "learning_rate": 1.8595385690450374e-06, "loss": 0.1089, "num_input_tokens_seen": 34999840, "step": 51920 }, { "epoch": 1.2685363887328074, "grad_norm": 0.9553894400596619, "learning_rate": 1.8594949830215558e-06, "loss": 0.0253, "num_input_tokens_seen": 35003296, "step": 51925 }, { "epoch": 1.2686585395646544, "grad_norm": 0.4104401767253876, "learning_rate": 1.859451390747596e-06, "loss": 0.0893, "num_input_tokens_seen": 35006496, "step": 51930 }, { "epoch": 1.2687806903965015, "grad_norm": 25.074024200439453, "learning_rate": 1.8594077922234742e-06, "loss": 0.1692, "num_input_tokens_seen": 35009568, "step": 51935 }, { "epoch": 1.2689028412283487, "grad_norm": 0.19158746302127838, "learning_rate": 1.859364187449508e-06, "loss": 0.1384, "num_input_tokens_seen": 35012832, "step": 51940 }, { "epoch": 1.269024992060196, "grad_norm": 34.13637924194336, "learning_rate": 1.8593205764260142e-06, "loss": 0.1384, "num_input_tokens_seen": 35016224, "step": 51945 }, { "epoch": 1.2691471428920431, "grad_norm": 0.3690553903579712, "learning_rate": 1.8592769591533099e-06, "loss": 0.0345, "num_input_tokens_seen": 35020128, "step": 51950 }, { "epoch": 1.2692692937238903, "grad_norm": 7.655487060546875, "learning_rate": 1.8592333356317128e-06, "loss": 0.1079, "num_input_tokens_seen": 35023392, "step": 51955 }, { "epoch": 1.2693914445557375, "grad_norm": 15.581256866455078, "learning_rate": 1.8591897058615396e-06, "loss": 0.1429, "num_input_tokens_seen": 35026656, "step": 51960 }, { "epoch": 1.2695135953875845, "grad_norm": 0.9874152541160583, "learning_rate": 1.8591460698431076e-06, "loss": 0.0967, "num_input_tokens_seen": 35029728, "step": 51965 }, { "epoch": 1.2696357462194316, "grad_norm": 0.16612599790096283, "learning_rate": 1.8591024275767345e-06, "loss": 0.0196, "num_input_tokens_seen": 35033568, "step": 51970 }, { "epoch": 1.2697578970512788, "grad_norm": 8.764939308166504, "learning_rate": 1.8590587790627372e-06, "loss": 0.1069, "num_input_tokens_seen": 35037024, "step": 51975 }, { "epoch": 1.269880047883126, "grad_norm": 0.25551822781562805, "learning_rate": 1.8590151243014337e-06, "loss": 0.0019, "num_input_tokens_seen": 35040608, "step": 51980 }, { "epoch": 1.2700021987149732, "grad_norm": 0.7786625623703003, "learning_rate": 1.858971463293141e-06, "loss": 0.0299, "num_input_tokens_seen": 35044128, "step": 51985 }, { "epoch": 1.2701243495468204, "grad_norm": 0.11077973991632462, "learning_rate": 1.858927796038177e-06, "loss": 0.0489, "num_input_tokens_seen": 35047520, "step": 51990 }, { "epoch": 1.2702465003786676, "grad_norm": 0.24186283349990845, "learning_rate": 1.8588841225368587e-06, "loss": 0.1065, "num_input_tokens_seen": 35050528, "step": 51995 }, { "epoch": 1.2703686512105148, "grad_norm": 0.4123366177082062, "learning_rate": 1.8588404427895044e-06, "loss": 0.032, "num_input_tokens_seen": 35053728, "step": 52000 }, { "epoch": 1.270490802042362, "grad_norm": 0.3522588312625885, "learning_rate": 1.8587967567964312e-06, "loss": 0.161, "num_input_tokens_seen": 35056992, "step": 52005 }, { "epoch": 1.2706129528742092, "grad_norm": 8.013328552246094, "learning_rate": 1.858753064557957e-06, "loss": 0.1344, "num_input_tokens_seen": 35060448, "step": 52010 }, { "epoch": 1.2707351037060564, "grad_norm": 36.01093673706055, "learning_rate": 1.8587093660743997e-06, "loss": 0.1726, "num_input_tokens_seen": 35064032, "step": 52015 }, { "epoch": 1.2708572545379033, "grad_norm": 0.21894040703773499, "learning_rate": 1.8586656613460766e-06, "loss": 0.0458, "num_input_tokens_seen": 35067296, "step": 52020 }, { "epoch": 1.2709794053697505, "grad_norm": 10.701966285705566, "learning_rate": 1.8586219503733061e-06, "loss": 0.0534, "num_input_tokens_seen": 35070240, "step": 52025 }, { "epoch": 1.2711015562015977, "grad_norm": 0.2591902017593384, "learning_rate": 1.8585782331564057e-06, "loss": 0.0728, "num_input_tokens_seen": 35073632, "step": 52030 }, { "epoch": 1.271223707033445, "grad_norm": 10.046820640563965, "learning_rate": 1.8585345096956938e-06, "loss": 0.0394, "num_input_tokens_seen": 35076832, "step": 52035 }, { "epoch": 1.271345857865292, "grad_norm": 21.24547576904297, "learning_rate": 1.8584907799914874e-06, "loss": 0.0483, "num_input_tokens_seen": 35080480, "step": 52040 }, { "epoch": 1.2714680086971393, "grad_norm": 11.880767822265625, "learning_rate": 1.858447044044106e-06, "loss": 0.1918, "num_input_tokens_seen": 35083744, "step": 52045 }, { "epoch": 1.2715901595289865, "grad_norm": 0.13276037573814392, "learning_rate": 1.858403301853866e-06, "loss": 0.1091, "num_input_tokens_seen": 35087264, "step": 52050 }, { "epoch": 1.2717123103608334, "grad_norm": 0.15990546345710754, "learning_rate": 1.8583595534210868e-06, "loss": 0.1953, "num_input_tokens_seen": 35090336, "step": 52055 }, { "epoch": 1.2718344611926806, "grad_norm": 0.265207439661026, "learning_rate": 1.8583157987460859e-06, "loss": 0.0675, "num_input_tokens_seen": 35093856, "step": 52060 }, { "epoch": 1.2719566120245278, "grad_norm": 180.78955078125, "learning_rate": 1.8582720378291817e-06, "loss": 0.0433, "num_input_tokens_seen": 35098208, "step": 52065 }, { "epoch": 1.272078762856375, "grad_norm": 0.7898123264312744, "learning_rate": 1.8582282706706922e-06, "loss": 0.002, "num_input_tokens_seen": 35101280, "step": 52070 }, { "epoch": 1.2722009136882222, "grad_norm": 12.134377479553223, "learning_rate": 1.858184497270936e-06, "loss": 0.1375, "num_input_tokens_seen": 35104480, "step": 52075 }, { "epoch": 1.2723230645200694, "grad_norm": 1.3107682466506958, "learning_rate": 1.8581407176302313e-06, "loss": 0.0917, "num_input_tokens_seen": 35108064, "step": 52080 }, { "epoch": 1.2724452153519166, "grad_norm": 0.24347443878650665, "learning_rate": 1.8580969317488964e-06, "loss": 0.0431, "num_input_tokens_seen": 35111072, "step": 52085 }, { "epoch": 1.2725673661837638, "grad_norm": 2.7623085975646973, "learning_rate": 1.8580531396272501e-06, "loss": 0.0015, "num_input_tokens_seen": 35114464, "step": 52090 }, { "epoch": 1.272689517015611, "grad_norm": 0.18461468815803528, "learning_rate": 1.8580093412656104e-06, "loss": 0.083, "num_input_tokens_seen": 35118112, "step": 52095 }, { "epoch": 1.2728116678474581, "grad_norm": 11.591814041137695, "learning_rate": 1.857965536664296e-06, "loss": 0.0866, "num_input_tokens_seen": 35121440, "step": 52100 }, { "epoch": 1.2729338186793053, "grad_norm": 16.923437118530273, "learning_rate": 1.8579217258236254e-06, "loss": 0.1703, "num_input_tokens_seen": 35124320, "step": 52105 }, { "epoch": 1.2730559695111523, "grad_norm": 3.304119825363159, "learning_rate": 1.8578779087439172e-06, "loss": 0.0437, "num_input_tokens_seen": 35127328, "step": 52110 }, { "epoch": 1.2731781203429995, "grad_norm": 0.3508507013320923, "learning_rate": 1.8578340854254902e-06, "loss": 0.0514, "num_input_tokens_seen": 35130528, "step": 52115 }, { "epoch": 1.2733002711748467, "grad_norm": 57.29536437988281, "learning_rate": 1.8577902558686631e-06, "loss": 0.0906, "num_input_tokens_seen": 35133920, "step": 52120 }, { "epoch": 1.2734224220066939, "grad_norm": 0.3323357105255127, "learning_rate": 1.8577464200737544e-06, "loss": 0.1203, "num_input_tokens_seen": 35137184, "step": 52125 }, { "epoch": 1.273544572838541, "grad_norm": 1.0517792701721191, "learning_rate": 1.857702578041083e-06, "loss": 0.0019, "num_input_tokens_seen": 35140704, "step": 52130 }, { "epoch": 1.2736667236703882, "grad_norm": 0.2565861642360687, "learning_rate": 1.8576587297709678e-06, "loss": 0.1035, "num_input_tokens_seen": 35143968, "step": 52135 }, { "epoch": 1.2737888745022354, "grad_norm": 0.4675154983997345, "learning_rate": 1.857614875263728e-06, "loss": 0.1608, "num_input_tokens_seen": 35147424, "step": 52140 }, { "epoch": 1.2739110253340824, "grad_norm": 0.13272573053836823, "learning_rate": 1.8575710145196817e-06, "loss": 0.0427, "num_input_tokens_seen": 35150496, "step": 52145 }, { "epoch": 1.2740331761659296, "grad_norm": 35.89418029785156, "learning_rate": 1.8575271475391484e-06, "loss": 0.2302, "num_input_tokens_seen": 35153952, "step": 52150 }, { "epoch": 1.2741553269977768, "grad_norm": 0.04896676540374756, "learning_rate": 1.8574832743224471e-06, "loss": 0.0811, "num_input_tokens_seen": 35157536, "step": 52155 }, { "epoch": 1.274277477829624, "grad_norm": 0.17201733589172363, "learning_rate": 1.8574393948698967e-06, "loss": 0.0011, "num_input_tokens_seen": 35160928, "step": 52160 }, { "epoch": 1.2743996286614712, "grad_norm": 28.33481788635254, "learning_rate": 1.8573955091818166e-06, "loss": 0.1059, "num_input_tokens_seen": 35164128, "step": 52165 }, { "epoch": 1.2745217794933184, "grad_norm": 0.08746245503425598, "learning_rate": 1.8573516172585256e-06, "loss": 0.0718, "num_input_tokens_seen": 35168224, "step": 52170 }, { "epoch": 1.2746439303251655, "grad_norm": 67.42756652832031, "learning_rate": 1.8573077191003433e-06, "loss": 0.0203, "num_input_tokens_seen": 35171616, "step": 52175 }, { "epoch": 1.2747660811570127, "grad_norm": 0.4297308325767517, "learning_rate": 1.857263814707588e-06, "loss": 0.0592, "num_input_tokens_seen": 35175008, "step": 52180 }, { "epoch": 1.27488823198886, "grad_norm": 0.16938745975494385, "learning_rate": 1.8572199040805803e-06, "loss": 0.1161, "num_input_tokens_seen": 35177952, "step": 52185 }, { "epoch": 1.2750103828207071, "grad_norm": 131.6497039794922, "learning_rate": 1.8571759872196386e-06, "loss": 0.1294, "num_input_tokens_seen": 35181728, "step": 52190 }, { "epoch": 1.2751325336525543, "grad_norm": 35.01044464111328, "learning_rate": 1.8571320641250829e-06, "loss": 0.1537, "num_input_tokens_seen": 35185376, "step": 52195 }, { "epoch": 1.2752546844844013, "grad_norm": 0.05216865614056587, "learning_rate": 1.857088134797232e-06, "loss": 0.0996, "num_input_tokens_seen": 35189024, "step": 52200 }, { "epoch": 1.2753768353162485, "grad_norm": 7.486613750457764, "learning_rate": 1.8570441992364057e-06, "loss": 0.0464, "num_input_tokens_seen": 35192928, "step": 52205 }, { "epoch": 1.2754989861480956, "grad_norm": 12.407038688659668, "learning_rate": 1.8570002574429236e-06, "loss": 0.0576, "num_input_tokens_seen": 35196064, "step": 52210 }, { "epoch": 1.2756211369799428, "grad_norm": 0.3270169496536255, "learning_rate": 1.8569563094171048e-06, "loss": 0.1012, "num_input_tokens_seen": 35199584, "step": 52215 }, { "epoch": 1.27574328781179, "grad_norm": 18.02934455871582, "learning_rate": 1.8569123551592693e-06, "loss": 0.0752, "num_input_tokens_seen": 35202720, "step": 52220 }, { "epoch": 1.2758654386436372, "grad_norm": 3.006917715072632, "learning_rate": 1.8568683946697368e-06, "loss": 0.2353, "num_input_tokens_seen": 35206112, "step": 52225 }, { "epoch": 1.2759875894754844, "grad_norm": 170.55621337890625, "learning_rate": 1.856824427948827e-06, "loss": 0.1563, "num_input_tokens_seen": 35209824, "step": 52230 }, { "epoch": 1.2761097403073314, "grad_norm": 0.22915074229240417, "learning_rate": 1.8567804549968593e-06, "loss": 0.0032, "num_input_tokens_seen": 35213600, "step": 52235 }, { "epoch": 1.2762318911391786, "grad_norm": 0.11022822558879852, "learning_rate": 1.8567364758141539e-06, "loss": 0.0018, "num_input_tokens_seen": 35216800, "step": 52240 }, { "epoch": 1.2763540419710258, "grad_norm": 18.70414924621582, "learning_rate": 1.85669249040103e-06, "loss": 0.1648, "num_input_tokens_seen": 35220320, "step": 52245 }, { "epoch": 1.276476192802873, "grad_norm": 14.830822944641113, "learning_rate": 1.8566484987578083e-06, "loss": 0.1565, "num_input_tokens_seen": 35223264, "step": 52250 }, { "epoch": 1.2765983436347201, "grad_norm": 84.62750244140625, "learning_rate": 1.856604500884808e-06, "loss": 0.1069, "num_input_tokens_seen": 35226784, "step": 52255 }, { "epoch": 1.2767204944665673, "grad_norm": 0.302837610244751, "learning_rate": 1.85656049678235e-06, "loss": 0.0816, "num_input_tokens_seen": 35230432, "step": 52260 }, { "epoch": 1.2768426452984145, "grad_norm": 0.25845223665237427, "learning_rate": 1.856516486450753e-06, "loss": 0.0797, "num_input_tokens_seen": 35233504, "step": 52265 }, { "epoch": 1.2769647961302617, "grad_norm": 35.83456802368164, "learning_rate": 1.8564724698903378e-06, "loss": 0.0503, "num_input_tokens_seen": 35236640, "step": 52270 }, { "epoch": 1.277086946962109, "grad_norm": 2.169076919555664, "learning_rate": 1.8564284471014247e-06, "loss": 0.0032, "num_input_tokens_seen": 35239840, "step": 52275 }, { "epoch": 1.277209097793956, "grad_norm": 1.791534662246704, "learning_rate": 1.8563844180843335e-06, "loss": 0.0338, "num_input_tokens_seen": 35243168, "step": 52280 }, { "epoch": 1.2773312486258033, "grad_norm": 18.362018585205078, "learning_rate": 1.8563403828393845e-06, "loss": 0.1058, "num_input_tokens_seen": 35246624, "step": 52285 }, { "epoch": 1.2774533994576502, "grad_norm": 0.2080661505460739, "learning_rate": 1.8562963413668977e-06, "loss": 0.001, "num_input_tokens_seen": 35249696, "step": 52290 }, { "epoch": 1.2775755502894974, "grad_norm": 10.347769737243652, "learning_rate": 1.8562522936671936e-06, "loss": 0.0666, "num_input_tokens_seen": 35252896, "step": 52295 }, { "epoch": 1.2776977011213446, "grad_norm": 0.09222155809402466, "learning_rate": 1.8562082397405927e-06, "loss": 0.001, "num_input_tokens_seen": 35256352, "step": 52300 }, { "epoch": 1.2778198519531918, "grad_norm": 0.015030997805297375, "learning_rate": 1.8561641795874153e-06, "loss": 0.0386, "num_input_tokens_seen": 35260064, "step": 52305 }, { "epoch": 1.277942002785039, "grad_norm": 19.347267150878906, "learning_rate": 1.8561201132079814e-06, "loss": 0.1596, "num_input_tokens_seen": 35263584, "step": 52310 }, { "epoch": 1.2780641536168862, "grad_norm": 0.30989161133766174, "learning_rate": 1.8560760406026119e-06, "loss": 0.0415, "num_input_tokens_seen": 35266784, "step": 52315 }, { "epoch": 1.2781863044487332, "grad_norm": 0.3502536714076996, "learning_rate": 1.8560319617716272e-06, "loss": 0.0582, "num_input_tokens_seen": 35270240, "step": 52320 }, { "epoch": 1.2783084552805803, "grad_norm": 35.52680587768555, "learning_rate": 1.8559878767153479e-06, "loss": 0.1099, "num_input_tokens_seen": 35273312, "step": 52325 }, { "epoch": 1.2784306061124275, "grad_norm": 0.20671890676021576, "learning_rate": 1.8559437854340944e-06, "loss": 0.0055, "num_input_tokens_seen": 35277024, "step": 52330 }, { "epoch": 1.2785527569442747, "grad_norm": 0.17156971991062164, "learning_rate": 1.8558996879281875e-06, "loss": 0.0301, "num_input_tokens_seen": 35280480, "step": 52335 }, { "epoch": 1.278674907776122, "grad_norm": 15.1109037399292, "learning_rate": 1.8558555841979477e-06, "loss": 0.1198, "num_input_tokens_seen": 35284640, "step": 52340 }, { "epoch": 1.278797058607969, "grad_norm": 0.019165517762303352, "learning_rate": 1.855811474243696e-06, "loss": 0.0244, "num_input_tokens_seen": 35287904, "step": 52345 }, { "epoch": 1.2789192094398163, "grad_norm": 78.49673461914062, "learning_rate": 1.855767358065753e-06, "loss": 0.1092, "num_input_tokens_seen": 35291168, "step": 52350 }, { "epoch": 1.2790413602716635, "grad_norm": 0.13271349668502808, "learning_rate": 1.8557232356644402e-06, "loss": 0.0594, "num_input_tokens_seen": 35294880, "step": 52355 }, { "epoch": 1.2791635111035107, "grad_norm": 0.1819307953119278, "learning_rate": 1.8556791070400771e-06, "loss": 0.0507, "num_input_tokens_seen": 35298272, "step": 52360 }, { "epoch": 1.2792856619353579, "grad_norm": 0.3739522397518158, "learning_rate": 1.8556349721929857e-06, "loss": 0.0006, "num_input_tokens_seen": 35301600, "step": 52365 }, { "epoch": 1.279407812767205, "grad_norm": 13.011190414428711, "learning_rate": 1.8555908311234868e-06, "loss": 0.1232, "num_input_tokens_seen": 35304736, "step": 52370 }, { "epoch": 1.2795299635990522, "grad_norm": 1.9926000833511353, "learning_rate": 1.8555466838319012e-06, "loss": 0.0618, "num_input_tokens_seen": 35308256, "step": 52375 }, { "epoch": 1.2796521144308992, "grad_norm": 7.410982608795166, "learning_rate": 1.8555025303185497e-06, "loss": 0.0386, "num_input_tokens_seen": 35311008, "step": 52380 }, { "epoch": 1.2797742652627464, "grad_norm": 11.874845504760742, "learning_rate": 1.855458370583754e-06, "loss": 0.0499, "num_input_tokens_seen": 35314272, "step": 52385 }, { "epoch": 1.2798964160945936, "grad_norm": 0.05146576091647148, "learning_rate": 1.8554142046278347e-06, "loss": 0.187, "num_input_tokens_seen": 35317600, "step": 52390 }, { "epoch": 1.2800185669264408, "grad_norm": 39.22667694091797, "learning_rate": 1.8553700324511132e-06, "loss": 0.0216, "num_input_tokens_seen": 35320864, "step": 52395 }, { "epoch": 1.280140717758288, "grad_norm": 0.18702971935272217, "learning_rate": 1.8553258540539111e-06, "loss": 0.0869, "num_input_tokens_seen": 35323936, "step": 52400 }, { "epoch": 1.2802628685901352, "grad_norm": 0.07560386508703232, "learning_rate": 1.8552816694365489e-06, "loss": 0.1137, "num_input_tokens_seen": 35327136, "step": 52405 }, { "epoch": 1.2803850194219821, "grad_norm": 0.2207210808992386, "learning_rate": 1.8552374785993487e-06, "loss": 0.0907, "num_input_tokens_seen": 35330720, "step": 52410 }, { "epoch": 1.2805071702538293, "grad_norm": 0.09498733282089233, "learning_rate": 1.8551932815426315e-06, "loss": 0.0615, "num_input_tokens_seen": 35333920, "step": 52415 }, { "epoch": 1.2806293210856765, "grad_norm": 0.7675701975822449, "learning_rate": 1.8551490782667188e-06, "loss": 0.0016, "num_input_tokens_seen": 35337504, "step": 52420 }, { "epoch": 1.2807514719175237, "grad_norm": 0.10342734307050705, "learning_rate": 1.8551048687719315e-06, "loss": 0.0842, "num_input_tokens_seen": 35340512, "step": 52425 }, { "epoch": 1.2808736227493709, "grad_norm": 0.13581562042236328, "learning_rate": 1.8550606530585922e-06, "loss": 0.0873, "num_input_tokens_seen": 35343392, "step": 52430 }, { "epoch": 1.280995773581218, "grad_norm": 0.09667894244194031, "learning_rate": 1.8550164311270215e-06, "loss": 0.0441, "num_input_tokens_seen": 35346784, "step": 52435 }, { "epoch": 1.2811179244130653, "grad_norm": 172.4896240234375, "learning_rate": 1.8549722029775414e-06, "loss": 0.0698, "num_input_tokens_seen": 35350048, "step": 52440 }, { "epoch": 1.2812400752449125, "grad_norm": 0.813492476940155, "learning_rate": 1.8549279686104734e-06, "loss": 0.0022, "num_input_tokens_seen": 35353568, "step": 52445 }, { "epoch": 1.2813622260767596, "grad_norm": 15.103371620178223, "learning_rate": 1.8548837280261393e-06, "loss": 0.0998, "num_input_tokens_seen": 35356896, "step": 52450 }, { "epoch": 1.2814843769086068, "grad_norm": 0.39661160111427307, "learning_rate": 1.8548394812248612e-06, "loss": 0.095, "num_input_tokens_seen": 35360288, "step": 52455 }, { "epoch": 1.281606527740454, "grad_norm": 14.238640785217285, "learning_rate": 1.85479522820696e-06, "loss": 0.1571, "num_input_tokens_seen": 35363296, "step": 52460 }, { "epoch": 1.281728678572301, "grad_norm": 1.1522719860076904, "learning_rate": 1.854750968972758e-06, "loss": 0.1396, "num_input_tokens_seen": 35366880, "step": 52465 }, { "epoch": 1.2818508294041482, "grad_norm": 19.622875213623047, "learning_rate": 1.8547067035225775e-06, "loss": 0.1012, "num_input_tokens_seen": 35370400, "step": 52470 }, { "epoch": 1.2819729802359954, "grad_norm": 0.05643227696418762, "learning_rate": 1.8546624318567395e-06, "loss": 0.0727, "num_input_tokens_seen": 35373792, "step": 52475 }, { "epoch": 1.2820951310678426, "grad_norm": 22.620677947998047, "learning_rate": 1.8546181539755665e-06, "loss": 0.0043, "num_input_tokens_seen": 35377056, "step": 52480 }, { "epoch": 1.2822172818996898, "grad_norm": 17.451501846313477, "learning_rate": 1.8545738698793807e-06, "loss": 0.074, "num_input_tokens_seen": 35380576, "step": 52485 }, { "epoch": 1.282339432731537, "grad_norm": 16.301881790161133, "learning_rate": 1.8545295795685033e-06, "loss": 0.0824, "num_input_tokens_seen": 35384352, "step": 52490 }, { "epoch": 1.2824615835633841, "grad_norm": 0.7614904046058655, "learning_rate": 1.8544852830432576e-06, "loss": 0.0362, "num_input_tokens_seen": 35387616, "step": 52495 }, { "epoch": 1.282583734395231, "grad_norm": 0.8684007525444031, "learning_rate": 1.8544409803039647e-06, "loss": 0.0025, "num_input_tokens_seen": 35391072, "step": 52500 }, { "epoch": 1.2827058852270783, "grad_norm": 59.57621383666992, "learning_rate": 1.8543966713509472e-06, "loss": 0.2443, "num_input_tokens_seen": 35394848, "step": 52505 }, { "epoch": 1.2828280360589255, "grad_norm": 41.55668640136719, "learning_rate": 1.8543523561845276e-06, "loss": 0.0889, "num_input_tokens_seen": 35398240, "step": 52510 }, { "epoch": 1.2829501868907727, "grad_norm": 5.202472686767578, "learning_rate": 1.8543080348050274e-06, "loss": 0.0338, "num_input_tokens_seen": 35401632, "step": 52515 }, { "epoch": 1.2830723377226199, "grad_norm": 88.77193450927734, "learning_rate": 1.8542637072127695e-06, "loss": 0.0603, "num_input_tokens_seen": 35404640, "step": 52520 }, { "epoch": 1.283194488554467, "grad_norm": 0.09404394775629044, "learning_rate": 1.8542193734080764e-06, "loss": 0.0614, "num_input_tokens_seen": 35407456, "step": 52525 }, { "epoch": 1.2833166393863142, "grad_norm": 0.25622621178627014, "learning_rate": 1.8541750333912703e-06, "loss": 0.1687, "num_input_tokens_seen": 35410784, "step": 52530 }, { "epoch": 1.2834387902181614, "grad_norm": 44.45455551147461, "learning_rate": 1.8541306871626733e-06, "loss": 0.0623, "num_input_tokens_seen": 35413856, "step": 52535 }, { "epoch": 1.2835609410500086, "grad_norm": 41.468326568603516, "learning_rate": 1.8540863347226084e-06, "loss": 0.1718, "num_input_tokens_seen": 35417056, "step": 52540 }, { "epoch": 1.2836830918818558, "grad_norm": 194.89041137695312, "learning_rate": 1.8540419760713979e-06, "loss": 0.1321, "num_input_tokens_seen": 35420192, "step": 52545 }, { "epoch": 1.283805242713703, "grad_norm": 0.43013298511505127, "learning_rate": 1.8539976112093644e-06, "loss": 0.0411, "num_input_tokens_seen": 35423712, "step": 52550 }, { "epoch": 1.28392739354555, "grad_norm": 30.879812240600586, "learning_rate": 1.853953240136831e-06, "loss": 0.1122, "num_input_tokens_seen": 35426848, "step": 52555 }, { "epoch": 1.2840495443773972, "grad_norm": 0.05143484100699425, "learning_rate": 1.8539088628541193e-06, "loss": 0.0471, "num_input_tokens_seen": 35429984, "step": 52560 }, { "epoch": 1.2841716952092443, "grad_norm": 21.9586238861084, "learning_rate": 1.8538644793615532e-06, "loss": 0.0956, "num_input_tokens_seen": 35433184, "step": 52565 }, { "epoch": 1.2842938460410915, "grad_norm": 2.9223499298095703, "learning_rate": 1.8538200896594546e-06, "loss": 0.0697, "num_input_tokens_seen": 35436576, "step": 52570 }, { "epoch": 1.2844159968729387, "grad_norm": 86.12430572509766, "learning_rate": 1.8537756937481465e-06, "loss": 0.0124, "num_input_tokens_seen": 35440288, "step": 52575 }, { "epoch": 1.284538147704786, "grad_norm": 20.265743255615234, "learning_rate": 1.8537312916279524e-06, "loss": 0.1329, "num_input_tokens_seen": 35443488, "step": 52580 }, { "epoch": 1.284660298536633, "grad_norm": 0.10588684678077698, "learning_rate": 1.8536868832991946e-06, "loss": 0.0242, "num_input_tokens_seen": 35447072, "step": 52585 }, { "epoch": 1.28478244936848, "grad_norm": 0.07364339381456375, "learning_rate": 1.8536424687621958e-06, "loss": 0.0297, "num_input_tokens_seen": 35450592, "step": 52590 }, { "epoch": 1.2849046002003273, "grad_norm": 0.6387683153152466, "learning_rate": 1.8535980480172797e-06, "loss": 0.0983, "num_input_tokens_seen": 35453920, "step": 52595 }, { "epoch": 1.2850267510321745, "grad_norm": 0.08949076384305954, "learning_rate": 1.8535536210647691e-06, "loss": 0.046, "num_input_tokens_seen": 35456928, "step": 52600 }, { "epoch": 1.2851489018640216, "grad_norm": 51.28281784057617, "learning_rate": 1.8535091879049868e-06, "loss": 0.1515, "num_input_tokens_seen": 35460192, "step": 52605 }, { "epoch": 1.2852710526958688, "grad_norm": 180.63563537597656, "learning_rate": 1.8534647485382561e-06, "loss": 0.1838, "num_input_tokens_seen": 35463712, "step": 52610 }, { "epoch": 1.285393203527716, "grad_norm": 0.5813500285148621, "learning_rate": 1.8534203029649002e-06, "loss": 0.0022, "num_input_tokens_seen": 35466720, "step": 52615 }, { "epoch": 1.2855153543595632, "grad_norm": 68.77971649169922, "learning_rate": 1.8533758511852424e-06, "loss": 0.0768, "num_input_tokens_seen": 35470176, "step": 52620 }, { "epoch": 1.2856375051914104, "grad_norm": 31.985763549804688, "learning_rate": 1.853331393199606e-06, "loss": 0.0344, "num_input_tokens_seen": 35473568, "step": 52625 }, { "epoch": 1.2857596560232576, "grad_norm": 0.49823251366615295, "learning_rate": 1.8532869290083139e-06, "loss": 0.0547, "num_input_tokens_seen": 35477536, "step": 52630 }, { "epoch": 1.2858818068551048, "grad_norm": 20.316926956176758, "learning_rate": 1.8532424586116899e-06, "loss": 0.1291, "num_input_tokens_seen": 35480480, "step": 52635 }, { "epoch": 1.286003957686952, "grad_norm": 80.80620574951172, "learning_rate": 1.8531979820100574e-06, "loss": 0.1439, "num_input_tokens_seen": 35484000, "step": 52640 }, { "epoch": 1.286126108518799, "grad_norm": 29.717788696289062, "learning_rate": 1.8531534992037395e-06, "loss": 0.0509, "num_input_tokens_seen": 35487008, "step": 52645 }, { "epoch": 1.2862482593506461, "grad_norm": 0.11400293558835983, "learning_rate": 1.8531090101930595e-06, "loss": 0.0082, "num_input_tokens_seen": 35490528, "step": 52650 }, { "epoch": 1.2863704101824933, "grad_norm": 0.2857948839664459, "learning_rate": 1.853064514978342e-06, "loss": 0.1251, "num_input_tokens_seen": 35493728, "step": 52655 }, { "epoch": 1.2864925610143405, "grad_norm": 9.247663497924805, "learning_rate": 1.8530200135599095e-06, "loss": 0.0998, "num_input_tokens_seen": 35496736, "step": 52660 }, { "epoch": 1.2866147118461877, "grad_norm": 0.12227342277765274, "learning_rate": 1.8529755059380863e-06, "loss": 0.0935, "num_input_tokens_seen": 35500000, "step": 52665 }, { "epoch": 1.2867368626780349, "grad_norm": 8.381556510925293, "learning_rate": 1.8529309921131954e-06, "loss": 0.0905, "num_input_tokens_seen": 35503008, "step": 52670 }, { "epoch": 1.286859013509882, "grad_norm": 0.7957706451416016, "learning_rate": 1.8528864720855613e-06, "loss": 0.0017, "num_input_tokens_seen": 35506336, "step": 52675 }, { "epoch": 1.286981164341729, "grad_norm": 0.2724474370479584, "learning_rate": 1.8528419458555072e-06, "loss": 0.0016, "num_input_tokens_seen": 35510112, "step": 52680 }, { "epoch": 1.2871033151735762, "grad_norm": 12.336623191833496, "learning_rate": 1.8527974134233571e-06, "loss": 0.0415, "num_input_tokens_seen": 35513056, "step": 52685 }, { "epoch": 1.2872254660054234, "grad_norm": 94.06697082519531, "learning_rate": 1.8527528747894347e-06, "loss": 0.0571, "num_input_tokens_seen": 35516832, "step": 52690 }, { "epoch": 1.2873476168372706, "grad_norm": 0.11085094511508942, "learning_rate": 1.8527083299540641e-06, "loss": 0.0006, "num_input_tokens_seen": 35520096, "step": 52695 }, { "epoch": 1.2874697676691178, "grad_norm": 21.436336517333984, "learning_rate": 1.8526637789175696e-06, "loss": 0.1375, "num_input_tokens_seen": 35523872, "step": 52700 }, { "epoch": 1.287591918500965, "grad_norm": 0.08883228898048401, "learning_rate": 1.8526192216802742e-06, "loss": 0.173, "num_input_tokens_seen": 35527840, "step": 52705 }, { "epoch": 1.2877140693328122, "grad_norm": 8.513446807861328, "learning_rate": 1.8525746582425028e-06, "loss": 0.1101, "num_input_tokens_seen": 35530976, "step": 52710 }, { "epoch": 1.2878362201646594, "grad_norm": 27.75189781188965, "learning_rate": 1.8525300886045792e-06, "loss": 0.2526, "num_input_tokens_seen": 35534432, "step": 52715 }, { "epoch": 1.2879583709965066, "grad_norm": 0.025965625420212746, "learning_rate": 1.8524855127668272e-06, "loss": 0.1337, "num_input_tokens_seen": 35537312, "step": 52720 }, { "epoch": 1.2880805218283538, "grad_norm": 4.335248947143555, "learning_rate": 1.8524409307295716e-06, "loss": 0.0739, "num_input_tokens_seen": 35540640, "step": 52725 }, { "epoch": 1.288202672660201, "grad_norm": 0.2505491077899933, "learning_rate": 1.8523963424931361e-06, "loss": 0.0419, "num_input_tokens_seen": 35543776, "step": 52730 }, { "epoch": 1.288324823492048, "grad_norm": 10.368141174316406, "learning_rate": 1.852351748057845e-06, "loss": 0.0466, "num_input_tokens_seen": 35547104, "step": 52735 }, { "epoch": 1.288446974323895, "grad_norm": 14.576863288879395, "learning_rate": 1.8523071474240228e-06, "loss": 0.0577, "num_input_tokens_seen": 35550368, "step": 52740 }, { "epoch": 1.2885691251557423, "grad_norm": 20.16292953491211, "learning_rate": 1.8522625405919938e-06, "loss": 0.1469, "num_input_tokens_seen": 35553248, "step": 52745 }, { "epoch": 1.2886912759875895, "grad_norm": 0.06736789643764496, "learning_rate": 1.8522179275620825e-06, "loss": 0.0592, "num_input_tokens_seen": 35556256, "step": 52750 }, { "epoch": 1.2888134268194367, "grad_norm": 0.46141862869262695, "learning_rate": 1.8521733083346131e-06, "loss": 0.1206, "num_input_tokens_seen": 35559712, "step": 52755 }, { "epoch": 1.2889355776512839, "grad_norm": 13.816710472106934, "learning_rate": 1.8521286829099104e-06, "loss": 0.0855, "num_input_tokens_seen": 35562656, "step": 52760 }, { "epoch": 1.289057728483131, "grad_norm": 0.1693304032087326, "learning_rate": 1.8520840512882985e-06, "loss": 0.1328, "num_input_tokens_seen": 35566432, "step": 52765 }, { "epoch": 1.289179879314978, "grad_norm": 21.824827194213867, "learning_rate": 1.8520394134701022e-06, "loss": 0.0984, "num_input_tokens_seen": 35569952, "step": 52770 }, { "epoch": 1.2893020301468252, "grad_norm": 15.071465492248535, "learning_rate": 1.8519947694556461e-06, "loss": 0.0838, "num_input_tokens_seen": 35573024, "step": 52775 }, { "epoch": 1.2894241809786724, "grad_norm": 0.7028993368148804, "learning_rate": 1.8519501192452548e-06, "loss": 0.1193, "num_input_tokens_seen": 35576352, "step": 52780 }, { "epoch": 1.2895463318105196, "grad_norm": 28.04998016357422, "learning_rate": 1.8519054628392535e-06, "loss": 0.0345, "num_input_tokens_seen": 35579552, "step": 52785 }, { "epoch": 1.2896684826423668, "grad_norm": 0.2715553641319275, "learning_rate": 1.8518608002379664e-06, "loss": 0.1339, "num_input_tokens_seen": 35583264, "step": 52790 }, { "epoch": 1.289790633474214, "grad_norm": 0.2599318325519562, "learning_rate": 1.8518161314417181e-06, "loss": 0.0316, "num_input_tokens_seen": 35586592, "step": 52795 }, { "epoch": 1.2899127843060612, "grad_norm": 1.9973633289337158, "learning_rate": 1.851771456450834e-06, "loss": 0.0029, "num_input_tokens_seen": 35589792, "step": 52800 }, { "epoch": 1.2900349351379083, "grad_norm": 0.6662908792495728, "learning_rate": 1.8517267752656387e-06, "loss": 0.0027, "num_input_tokens_seen": 35592992, "step": 52805 }, { "epoch": 1.2901570859697555, "grad_norm": 0.2350725382566452, "learning_rate": 1.8516820878864574e-06, "loss": 0.0313, "num_input_tokens_seen": 35596704, "step": 52810 }, { "epoch": 1.2902792368016027, "grad_norm": 12.004608154296875, "learning_rate": 1.8516373943136147e-06, "loss": 0.1372, "num_input_tokens_seen": 35600160, "step": 52815 }, { "epoch": 1.29040138763345, "grad_norm": 0.09558333456516266, "learning_rate": 1.8515926945474357e-06, "loss": 0.0029, "num_input_tokens_seen": 35603104, "step": 52820 }, { "epoch": 1.2905235384652969, "grad_norm": 27.230884552001953, "learning_rate": 1.851547988588246e-06, "loss": 0.1577, "num_input_tokens_seen": 35606432, "step": 52825 }, { "epoch": 1.290645689297144, "grad_norm": 98.923828125, "learning_rate": 1.8515032764363698e-06, "loss": 0.2623, "num_input_tokens_seen": 35610080, "step": 52830 }, { "epoch": 1.2907678401289913, "grad_norm": 7.860264778137207, "learning_rate": 1.8514585580921328e-06, "loss": 0.1614, "num_input_tokens_seen": 35613344, "step": 52835 }, { "epoch": 1.2908899909608385, "grad_norm": 24.293676376342773, "learning_rate": 1.8514138335558604e-06, "loss": 0.1081, "num_input_tokens_seen": 35616608, "step": 52840 }, { "epoch": 1.2910121417926856, "grad_norm": 0.30976778268814087, "learning_rate": 1.8513691028278776e-06, "loss": 0.001, "num_input_tokens_seen": 35619744, "step": 52845 }, { "epoch": 1.2911342926245328, "grad_norm": 0.3577079176902771, "learning_rate": 1.8513243659085097e-06, "loss": 0.0517, "num_input_tokens_seen": 35623264, "step": 52850 }, { "epoch": 1.29125644345638, "grad_norm": 0.2105468511581421, "learning_rate": 1.8512796227980818e-06, "loss": 0.0561, "num_input_tokens_seen": 35626720, "step": 52855 }, { "epoch": 1.291378594288227, "grad_norm": 0.14541052281856537, "learning_rate": 1.8512348734969196e-06, "loss": 0.0614, "num_input_tokens_seen": 35629792, "step": 52860 }, { "epoch": 1.2915007451200742, "grad_norm": 99.4928970336914, "learning_rate": 1.8511901180053485e-06, "loss": 0.1245, "num_input_tokens_seen": 35633184, "step": 52865 }, { "epoch": 1.2916228959519214, "grad_norm": 1.1235225200653076, "learning_rate": 1.8511453563236938e-06, "loss": 0.04, "num_input_tokens_seen": 35636704, "step": 52870 }, { "epoch": 1.2917450467837686, "grad_norm": 0.2470174878835678, "learning_rate": 1.8511005884522813e-06, "loss": 0.0387, "num_input_tokens_seen": 35640352, "step": 52875 }, { "epoch": 1.2918671976156157, "grad_norm": 0.2535180151462555, "learning_rate": 1.8510558143914363e-06, "loss": 0.0359, "num_input_tokens_seen": 35644064, "step": 52880 }, { "epoch": 1.291989348447463, "grad_norm": 0.05397181957960129, "learning_rate": 1.8510110341414847e-06, "loss": 0.0565, "num_input_tokens_seen": 35647200, "step": 52885 }, { "epoch": 1.2921114992793101, "grad_norm": 34.268272399902344, "learning_rate": 1.8509662477027517e-06, "loss": 0.1693, "num_input_tokens_seen": 35650400, "step": 52890 }, { "epoch": 1.2922336501111573, "grad_norm": 113.81163024902344, "learning_rate": 1.8509214550755633e-06, "loss": 0.1691, "num_input_tokens_seen": 35653600, "step": 52895 }, { "epoch": 1.2923558009430045, "grad_norm": 9.641225814819336, "learning_rate": 1.8508766562602455e-06, "loss": 0.1415, "num_input_tokens_seen": 35657440, "step": 52900 }, { "epoch": 1.2924779517748517, "grad_norm": 0.2733023464679718, "learning_rate": 1.8508318512571238e-06, "loss": 0.0871, "num_input_tokens_seen": 35661152, "step": 52905 }, { "epoch": 1.2926001026066989, "grad_norm": 115.42435455322266, "learning_rate": 1.8507870400665236e-06, "loss": 0.0049, "num_input_tokens_seen": 35664928, "step": 52910 }, { "epoch": 1.2927222534385459, "grad_norm": 9.324633598327637, "learning_rate": 1.8507422226887712e-06, "loss": 0.0415, "num_input_tokens_seen": 35668384, "step": 52915 }, { "epoch": 1.292844404270393, "grad_norm": 13.958195686340332, "learning_rate": 1.850697399124193e-06, "loss": 0.1226, "num_input_tokens_seen": 35671648, "step": 52920 }, { "epoch": 1.2929665551022402, "grad_norm": 59.904056549072266, "learning_rate": 1.8506525693731141e-06, "loss": 0.2342, "num_input_tokens_seen": 35674912, "step": 52925 }, { "epoch": 1.2930887059340874, "grad_norm": 0.11946405470371246, "learning_rate": 1.8506077334358615e-06, "loss": 0.0255, "num_input_tokens_seen": 35678688, "step": 52930 }, { "epoch": 1.2932108567659346, "grad_norm": 0.045888565480709076, "learning_rate": 1.85056289131276e-06, "loss": 0.0016, "num_input_tokens_seen": 35682400, "step": 52935 }, { "epoch": 1.2933330075977818, "grad_norm": 36.49285888671875, "learning_rate": 1.8505180430041367e-06, "loss": 0.0977, "num_input_tokens_seen": 35685792, "step": 52940 }, { "epoch": 1.2934551584296288, "grad_norm": 0.20501862466335297, "learning_rate": 1.8504731885103175e-06, "loss": 0.0011, "num_input_tokens_seen": 35689120, "step": 52945 }, { "epoch": 1.293577309261476, "grad_norm": 0.058447353541851044, "learning_rate": 1.8504283278316284e-06, "loss": 0.1232, "num_input_tokens_seen": 35692896, "step": 52950 }, { "epoch": 1.2936994600933232, "grad_norm": 0.02482379972934723, "learning_rate": 1.8503834609683957e-06, "loss": 0.1634, "num_input_tokens_seen": 35695840, "step": 52955 }, { "epoch": 1.2938216109251703, "grad_norm": 0.09336850047111511, "learning_rate": 1.8503385879209457e-06, "loss": 0.0694, "num_input_tokens_seen": 35699296, "step": 52960 }, { "epoch": 1.2939437617570175, "grad_norm": 0.09030751138925552, "learning_rate": 1.8502937086896048e-06, "loss": 0.1277, "num_input_tokens_seen": 35702368, "step": 52965 }, { "epoch": 1.2940659125888647, "grad_norm": 1.2011581659317017, "learning_rate": 1.8502488232746996e-06, "loss": 0.0476, "num_input_tokens_seen": 35705696, "step": 52970 }, { "epoch": 1.294188063420712, "grad_norm": 0.05959802493453026, "learning_rate": 1.8502039316765562e-06, "loss": 0.0302, "num_input_tokens_seen": 35708832, "step": 52975 }, { "epoch": 1.294310214252559, "grad_norm": 0.40464505553245544, "learning_rate": 1.8501590338955008e-06, "loss": 0.0026, "num_input_tokens_seen": 35711968, "step": 52980 }, { "epoch": 1.2944323650844063, "grad_norm": 0.19674935936927795, "learning_rate": 1.8501141299318605e-06, "loss": 0.0961, "num_input_tokens_seen": 35715296, "step": 52985 }, { "epoch": 1.2945545159162535, "grad_norm": 0.3937717080116272, "learning_rate": 1.8500692197859616e-06, "loss": 0.0009, "num_input_tokens_seen": 35718368, "step": 52990 }, { "epoch": 1.2946766667481007, "grad_norm": 0.4105014204978943, "learning_rate": 1.850024303458131e-06, "loss": 0.0488, "num_input_tokens_seen": 35721824, "step": 52995 }, { "epoch": 1.2947988175799476, "grad_norm": 0.04927727207541466, "learning_rate": 1.8499793809486945e-06, "loss": 0.1413, "num_input_tokens_seen": 35725024, "step": 53000 }, { "epoch": 1.2949209684117948, "grad_norm": 0.9458499550819397, "learning_rate": 1.8499344522579794e-06, "loss": 0.0305, "num_input_tokens_seen": 35728416, "step": 53005 }, { "epoch": 1.295043119243642, "grad_norm": 22.578027725219727, "learning_rate": 1.8498895173863125e-06, "loss": 0.0607, "num_input_tokens_seen": 35731808, "step": 53010 }, { "epoch": 1.2951652700754892, "grad_norm": 12.686452865600586, "learning_rate": 1.8498445763340204e-06, "loss": 0.1866, "num_input_tokens_seen": 35734880, "step": 53015 }, { "epoch": 1.2952874209073364, "grad_norm": 0.2515212595462799, "learning_rate": 1.84979962910143e-06, "loss": 0.0943, "num_input_tokens_seen": 35738464, "step": 53020 }, { "epoch": 1.2954095717391836, "grad_norm": 0.03380141779780388, "learning_rate": 1.8497546756888683e-06, "loss": 0.1724, "num_input_tokens_seen": 35742176, "step": 53025 }, { "epoch": 1.2955317225710308, "grad_norm": 31.458084106445312, "learning_rate": 1.8497097160966616e-06, "loss": 0.1477, "num_input_tokens_seen": 35745696, "step": 53030 }, { "epoch": 1.2956538734028777, "grad_norm": 21.03895378112793, "learning_rate": 1.8496647503251377e-06, "loss": 0.1135, "num_input_tokens_seen": 35749472, "step": 53035 }, { "epoch": 1.295776024234725, "grad_norm": 0.5178502798080444, "learning_rate": 1.849619778374623e-06, "loss": 0.1194, "num_input_tokens_seen": 35752736, "step": 53040 }, { "epoch": 1.2958981750665721, "grad_norm": 0.3661326766014099, "learning_rate": 1.8495748002454446e-06, "loss": 0.0285, "num_input_tokens_seen": 35756128, "step": 53045 }, { "epoch": 1.2960203258984193, "grad_norm": 7.51620626449585, "learning_rate": 1.84952981593793e-06, "loss": 0.1519, "num_input_tokens_seen": 35759136, "step": 53050 }, { "epoch": 1.2961424767302665, "grad_norm": 0.9327648878097534, "learning_rate": 1.8494848254524062e-06, "loss": 0.0572, "num_input_tokens_seen": 35762336, "step": 53055 }, { "epoch": 1.2962646275621137, "grad_norm": 0.03591260313987732, "learning_rate": 1.8494398287892002e-06, "loss": 0.0026, "num_input_tokens_seen": 35765600, "step": 53060 }, { "epoch": 1.2963867783939609, "grad_norm": 0.6520580649375916, "learning_rate": 1.849394825948639e-06, "loss": 0.0017, "num_input_tokens_seen": 35769056, "step": 53065 }, { "epoch": 1.296508929225808, "grad_norm": 0.5664069056510925, "learning_rate": 1.8493498169310505e-06, "loss": 0.0598, "num_input_tokens_seen": 35772384, "step": 53070 }, { "epoch": 1.2966310800576553, "grad_norm": 66.2616958618164, "learning_rate": 1.8493048017367613e-06, "loss": 0.1647, "num_input_tokens_seen": 35775584, "step": 53075 }, { "epoch": 1.2967532308895025, "grad_norm": 0.5477205514907837, "learning_rate": 1.8492597803660995e-06, "loss": 0.1373, "num_input_tokens_seen": 35779232, "step": 53080 }, { "epoch": 1.2968753817213496, "grad_norm": 9.841727256774902, "learning_rate": 1.8492147528193919e-06, "loss": 0.0436, "num_input_tokens_seen": 35782816, "step": 53085 }, { "epoch": 1.2969975325531966, "grad_norm": 166.41403198242188, "learning_rate": 1.8491697190969664e-06, "loss": 0.0809, "num_input_tokens_seen": 35785952, "step": 53090 }, { "epoch": 1.2971196833850438, "grad_norm": 0.22653362154960632, "learning_rate": 1.8491246791991502e-06, "loss": 0.0008, "num_input_tokens_seen": 35789792, "step": 53095 }, { "epoch": 1.297241834216891, "grad_norm": 26.30208969116211, "learning_rate": 1.849079633126271e-06, "loss": 0.0711, "num_input_tokens_seen": 35793184, "step": 53100 }, { "epoch": 1.2973639850487382, "grad_norm": 30.160884857177734, "learning_rate": 1.8490345808786564e-06, "loss": 0.2188, "num_input_tokens_seen": 35796832, "step": 53105 }, { "epoch": 1.2974861358805854, "grad_norm": 37.4919319152832, "learning_rate": 1.8489895224566339e-06, "loss": 0.0716, "num_input_tokens_seen": 35800160, "step": 53110 }, { "epoch": 1.2976082867124326, "grad_norm": 132.77760314941406, "learning_rate": 1.848944457860531e-06, "loss": 0.1335, "num_input_tokens_seen": 35803616, "step": 53115 }, { "epoch": 1.2977304375442797, "grad_norm": 8.152387619018555, "learning_rate": 1.8488993870906761e-06, "loss": 0.1742, "num_input_tokens_seen": 35807584, "step": 53120 }, { "epoch": 1.2978525883761267, "grad_norm": 0.08918684720993042, "learning_rate": 1.8488543101473963e-06, "loss": 0.1373, "num_input_tokens_seen": 35811488, "step": 53125 }, { "epoch": 1.297974739207974, "grad_norm": 0.4842546582221985, "learning_rate": 1.8488092270310197e-06, "loss": 0.2268, "num_input_tokens_seen": 35814880, "step": 53130 }, { "epoch": 1.298096890039821, "grad_norm": 0.5850440263748169, "learning_rate": 1.848764137741874e-06, "loss": 0.1117, "num_input_tokens_seen": 35818016, "step": 53135 }, { "epoch": 1.2982190408716683, "grad_norm": 0.5294996500015259, "learning_rate": 1.8487190422802872e-06, "loss": 0.0394, "num_input_tokens_seen": 35821408, "step": 53140 }, { "epoch": 1.2983411917035155, "grad_norm": 0.23933136463165283, "learning_rate": 1.8486739406465874e-06, "loss": 0.0022, "num_input_tokens_seen": 35824672, "step": 53145 }, { "epoch": 1.2984633425353627, "grad_norm": 11.323979377746582, "learning_rate": 1.8486288328411024e-06, "loss": 0.1339, "num_input_tokens_seen": 35827936, "step": 53150 }, { "epoch": 1.2985854933672099, "grad_norm": 49.09061050415039, "learning_rate": 1.8485837188641602e-06, "loss": 0.0335, "num_input_tokens_seen": 35831136, "step": 53155 }, { "epoch": 1.298707644199057, "grad_norm": 41.00819396972656, "learning_rate": 1.848538598716089e-06, "loss": 0.1079, "num_input_tokens_seen": 35834784, "step": 53160 }, { "epoch": 1.2988297950309042, "grad_norm": 0.033825136721134186, "learning_rate": 1.8484934723972167e-06, "loss": 0.029, "num_input_tokens_seen": 35838176, "step": 53165 }, { "epoch": 1.2989519458627514, "grad_norm": 17.26095962524414, "learning_rate": 1.8484483399078718e-06, "loss": 0.1262, "num_input_tokens_seen": 35841120, "step": 53170 }, { "epoch": 1.2990740966945986, "grad_norm": 0.11489014327526093, "learning_rate": 1.8484032012483825e-06, "loss": 0.0012, "num_input_tokens_seen": 35844256, "step": 53175 }, { "epoch": 1.2991962475264456, "grad_norm": 5.575078964233398, "learning_rate": 1.8483580564190768e-06, "loss": 0.0595, "num_input_tokens_seen": 35848224, "step": 53180 }, { "epoch": 1.2993183983582928, "grad_norm": 22.667858123779297, "learning_rate": 1.848312905420283e-06, "loss": 0.1228, "num_input_tokens_seen": 35851808, "step": 53185 }, { "epoch": 1.29944054919014, "grad_norm": 12.718514442443848, "learning_rate": 1.84826774825233e-06, "loss": 0.0527, "num_input_tokens_seen": 35855200, "step": 53190 }, { "epoch": 1.2995627000219871, "grad_norm": 13.314168930053711, "learning_rate": 1.8482225849155455e-06, "loss": 0.0597, "num_input_tokens_seen": 35858720, "step": 53195 }, { "epoch": 1.2996848508538343, "grad_norm": 0.3152402639389038, "learning_rate": 1.8481774154102584e-06, "loss": 0.0569, "num_input_tokens_seen": 35862304, "step": 53200 }, { "epoch": 1.2998070016856815, "grad_norm": 39.78969955444336, "learning_rate": 1.8481322397367966e-06, "loss": 0.1173, "num_input_tokens_seen": 35865696, "step": 53205 }, { "epoch": 1.2999291525175287, "grad_norm": 0.2929113209247589, "learning_rate": 1.8480870578954893e-06, "loss": 0.0653, "num_input_tokens_seen": 35869216, "step": 53210 }, { "epoch": 1.3000513033493757, "grad_norm": 2.2540366649627686, "learning_rate": 1.8480418698866646e-06, "loss": 0.0502, "num_input_tokens_seen": 35872480, "step": 53215 }, { "epoch": 1.3001734541812229, "grad_norm": 0.15386171638965607, "learning_rate": 1.8479966757106516e-06, "loss": 0.0023, "num_input_tokens_seen": 35875872, "step": 53220 }, { "epoch": 1.30029560501307, "grad_norm": 0.24568603932857513, "learning_rate": 1.8479514753677785e-06, "loss": 0.1281, "num_input_tokens_seen": 35878688, "step": 53225 }, { "epoch": 1.3004177558449173, "grad_norm": 0.803600013256073, "learning_rate": 1.8479062688583743e-06, "loss": 0.1363, "num_input_tokens_seen": 35881760, "step": 53230 }, { "epoch": 1.3005399066767644, "grad_norm": 4.497910499572754, "learning_rate": 1.8478610561827676e-06, "loss": 0.0084, "num_input_tokens_seen": 35885024, "step": 53235 }, { "epoch": 1.3006620575086116, "grad_norm": 12.895638465881348, "learning_rate": 1.8478158373412872e-06, "loss": 0.2536, "num_input_tokens_seen": 35888288, "step": 53240 }, { "epoch": 1.3007842083404588, "grad_norm": 1.133604884147644, "learning_rate": 1.8477706123342623e-06, "loss": 0.0507, "num_input_tokens_seen": 35891424, "step": 53245 }, { "epoch": 1.300906359172306, "grad_norm": 0.3395247459411621, "learning_rate": 1.847725381162021e-06, "loss": 0.0091, "num_input_tokens_seen": 35894624, "step": 53250 }, { "epoch": 1.3010285100041532, "grad_norm": 13.708138465881348, "learning_rate": 1.8476801438248932e-06, "loss": 0.1495, "num_input_tokens_seen": 35897888, "step": 53255 }, { "epoch": 1.3011506608360004, "grad_norm": 1.0327856540679932, "learning_rate": 1.8476349003232073e-06, "loss": 0.1332, "num_input_tokens_seen": 35901472, "step": 53260 }, { "epoch": 1.3012728116678476, "grad_norm": 0.15163828432559967, "learning_rate": 1.847589650657292e-06, "loss": 0.004, "num_input_tokens_seen": 35904480, "step": 53265 }, { "epoch": 1.3013949624996946, "grad_norm": 0.1811094433069229, "learning_rate": 1.847544394827477e-06, "loss": 0.0764, "num_input_tokens_seen": 35907808, "step": 53270 }, { "epoch": 1.3015171133315417, "grad_norm": 0.6422911286354065, "learning_rate": 1.8474991328340915e-06, "loss": 0.2005, "num_input_tokens_seen": 35910752, "step": 53275 }, { "epoch": 1.301639264163389, "grad_norm": 91.4090576171875, "learning_rate": 1.847453864677464e-06, "loss": 0.0881, "num_input_tokens_seen": 35913952, "step": 53280 }, { "epoch": 1.3017614149952361, "grad_norm": 0.15008161962032318, "learning_rate": 1.8474085903579245e-06, "loss": 0.1268, "num_input_tokens_seen": 35917408, "step": 53285 }, { "epoch": 1.3018835658270833, "grad_norm": 28.611242294311523, "learning_rate": 1.8473633098758014e-06, "loss": 0.0852, "num_input_tokens_seen": 35920544, "step": 53290 }, { "epoch": 1.3020057166589305, "grad_norm": 0.25884881615638733, "learning_rate": 1.8473180232314244e-06, "loss": 0.0741, "num_input_tokens_seen": 35923552, "step": 53295 }, { "epoch": 1.3021278674907777, "grad_norm": 0.07952199131250381, "learning_rate": 1.8472727304251227e-06, "loss": 0.0429, "num_input_tokens_seen": 35927200, "step": 53300 }, { "epoch": 1.3022500183226247, "grad_norm": 150.61093139648438, "learning_rate": 1.8472274314572262e-06, "loss": 0.0497, "num_input_tokens_seen": 35930784, "step": 53305 }, { "epoch": 1.3023721691544718, "grad_norm": 0.13954846560955048, "learning_rate": 1.847182126328064e-06, "loss": 0.0659, "num_input_tokens_seen": 35933920, "step": 53310 }, { "epoch": 1.302494319986319, "grad_norm": 0.19231536984443665, "learning_rate": 1.8471368150379652e-06, "loss": 0.0328, "num_input_tokens_seen": 35937504, "step": 53315 }, { "epoch": 1.3026164708181662, "grad_norm": 13.778584480285645, "learning_rate": 1.8470914975872596e-06, "loss": 0.0522, "num_input_tokens_seen": 35941024, "step": 53320 }, { "epoch": 1.3027386216500134, "grad_norm": 79.91692352294922, "learning_rate": 1.847046173976277e-06, "loss": 0.0972, "num_input_tokens_seen": 35945120, "step": 53325 }, { "epoch": 1.3028607724818606, "grad_norm": 9.358743667602539, "learning_rate": 1.8470008442053468e-06, "loss": 0.0486, "num_input_tokens_seen": 35948448, "step": 53330 }, { "epoch": 1.3029829233137078, "grad_norm": 0.1844514161348343, "learning_rate": 1.8469555082747985e-06, "loss": 0.1491, "num_input_tokens_seen": 35951584, "step": 53335 }, { "epoch": 1.303105074145555, "grad_norm": 121.97779083251953, "learning_rate": 1.846910166184962e-06, "loss": 0.1864, "num_input_tokens_seen": 35954912, "step": 53340 }, { "epoch": 1.3032272249774022, "grad_norm": 100.90186309814453, "learning_rate": 1.846864817936167e-06, "loss": 0.1499, "num_input_tokens_seen": 35958368, "step": 53345 }, { "epoch": 1.3033493758092494, "grad_norm": 11.588038444519043, "learning_rate": 1.8468194635287432e-06, "loss": 0.0402, "num_input_tokens_seen": 35961632, "step": 53350 }, { "epoch": 1.3034715266410966, "grad_norm": 0.1409424990415573, "learning_rate": 1.8467741029630207e-06, "loss": 0.0991, "num_input_tokens_seen": 35965344, "step": 53355 }, { "epoch": 1.3035936774729435, "grad_norm": 0.23103132843971252, "learning_rate": 1.8467287362393288e-06, "loss": 0.0474, "num_input_tokens_seen": 35968736, "step": 53360 }, { "epoch": 1.3037158283047907, "grad_norm": 7.011451721191406, "learning_rate": 1.846683363357998e-06, "loss": 0.0616, "num_input_tokens_seen": 35971808, "step": 53365 }, { "epoch": 1.303837979136638, "grad_norm": 0.4242778420448303, "learning_rate": 1.8466379843193583e-06, "loss": 0.0982, "num_input_tokens_seen": 35975328, "step": 53370 }, { "epoch": 1.303960129968485, "grad_norm": 23.884788513183594, "learning_rate": 1.846592599123739e-06, "loss": 0.1391, "num_input_tokens_seen": 35978336, "step": 53375 }, { "epoch": 1.3040822808003323, "grad_norm": 98.13916778564453, "learning_rate": 1.8465472077714707e-06, "loss": 0.1133, "num_input_tokens_seen": 35981728, "step": 53380 }, { "epoch": 1.3042044316321795, "grad_norm": 146.73292541503906, "learning_rate": 1.8465018102628837e-06, "loss": 0.0989, "num_input_tokens_seen": 35984992, "step": 53385 }, { "epoch": 1.3043265824640267, "grad_norm": 0.2913326621055603, "learning_rate": 1.8464564065983077e-06, "loss": 0.0663, "num_input_tokens_seen": 35988192, "step": 53390 }, { "epoch": 1.3044487332958736, "grad_norm": 0.06666086614131927, "learning_rate": 1.846410996778073e-06, "loss": 0.1115, "num_input_tokens_seen": 35991264, "step": 53395 }, { "epoch": 1.3045708841277208, "grad_norm": 0.7160685062408447, "learning_rate": 1.8463655808025098e-06, "loss": 0.1054, "num_input_tokens_seen": 35994912, "step": 53400 }, { "epoch": 1.304693034959568, "grad_norm": 0.5617475509643555, "learning_rate": 1.8463201586719486e-06, "loss": 0.1084, "num_input_tokens_seen": 35998112, "step": 53405 }, { "epoch": 1.3048151857914152, "grad_norm": 0.6565627455711365, "learning_rate": 1.8462747303867197e-06, "loss": 0.002, "num_input_tokens_seen": 36001248, "step": 53410 }, { "epoch": 1.3049373366232624, "grad_norm": 0.11988291144371033, "learning_rate": 1.846229295947153e-06, "loss": 0.0767, "num_input_tokens_seen": 36004384, "step": 53415 }, { "epoch": 1.3050594874551096, "grad_norm": 15.187780380249023, "learning_rate": 1.8461838553535793e-06, "loss": 0.0331, "num_input_tokens_seen": 36008480, "step": 53420 }, { "epoch": 1.3051816382869568, "grad_norm": 15.98518180847168, "learning_rate": 1.8461384086063292e-06, "loss": 0.0761, "num_input_tokens_seen": 36012384, "step": 53425 }, { "epoch": 1.305303789118804, "grad_norm": 78.05189514160156, "learning_rate": 1.846092955705733e-06, "loss": 0.0968, "num_input_tokens_seen": 36015648, "step": 53430 }, { "epoch": 1.3054259399506511, "grad_norm": 25.664134979248047, "learning_rate": 1.846047496652121e-06, "loss": 0.0418, "num_input_tokens_seen": 36018912, "step": 53435 }, { "epoch": 1.3055480907824983, "grad_norm": 14.023910522460938, "learning_rate": 1.8460020314458244e-06, "loss": 0.0844, "num_input_tokens_seen": 36022560, "step": 53440 }, { "epoch": 1.3056702416143455, "grad_norm": 6.5276312828063965, "learning_rate": 1.8459565600871732e-06, "loss": 0.0424, "num_input_tokens_seen": 36025760, "step": 53445 }, { "epoch": 1.3057923924461925, "grad_norm": 0.2286670058965683, "learning_rate": 1.8459110825764986e-06, "loss": 0.002, "num_input_tokens_seen": 36029088, "step": 53450 }, { "epoch": 1.3059145432780397, "grad_norm": 0.03177253156900406, "learning_rate": 1.845865598914131e-06, "loss": 0.0551, "num_input_tokens_seen": 36032160, "step": 53455 }, { "epoch": 1.3060366941098869, "grad_norm": 0.27544674277305603, "learning_rate": 1.8458201091004011e-06, "loss": 0.0005, "num_input_tokens_seen": 36035808, "step": 53460 }, { "epoch": 1.306158844941734, "grad_norm": 0.2392241656780243, "learning_rate": 1.84577461313564e-06, "loss": 0.0771, "num_input_tokens_seen": 36039328, "step": 53465 }, { "epoch": 1.3062809957735813, "grad_norm": 33.41484832763672, "learning_rate": 1.8457291110201782e-06, "loss": 0.1988, "num_input_tokens_seen": 36042592, "step": 53470 }, { "epoch": 1.3064031466054284, "grad_norm": 7.893235206604004, "learning_rate": 1.8456836027543472e-06, "loss": 0.1351, "num_input_tokens_seen": 36045856, "step": 53475 }, { "epoch": 1.3065252974372754, "grad_norm": 27.955108642578125, "learning_rate": 1.8456380883384774e-06, "loss": 0.0866, "num_input_tokens_seen": 36049504, "step": 53480 }, { "epoch": 1.3066474482691226, "grad_norm": 0.7363135814666748, "learning_rate": 1.8455925677729e-06, "loss": 0.092, "num_input_tokens_seen": 36052576, "step": 53485 }, { "epoch": 1.3067695991009698, "grad_norm": 0.09255994111299515, "learning_rate": 1.8455470410579462e-06, "loss": 0.1593, "num_input_tokens_seen": 36055968, "step": 53490 }, { "epoch": 1.306891749932817, "grad_norm": 1.4908761978149414, "learning_rate": 1.8455015081939465e-06, "loss": 0.108, "num_input_tokens_seen": 36059872, "step": 53495 }, { "epoch": 1.3070139007646642, "grad_norm": 0.4646112024784088, "learning_rate": 1.8454559691812326e-06, "loss": 0.1167, "num_input_tokens_seen": 36063520, "step": 53500 }, { "epoch": 1.3071360515965114, "grad_norm": 103.47087097167969, "learning_rate": 1.8454104240201355e-06, "loss": 0.0282, "num_input_tokens_seen": 36066848, "step": 53505 }, { "epoch": 1.3072582024283586, "grad_norm": 16.62038803100586, "learning_rate": 1.8453648727109865e-06, "loss": 0.1318, "num_input_tokens_seen": 36070432, "step": 53510 }, { "epoch": 1.3073803532602057, "grad_norm": 0.8345311284065247, "learning_rate": 1.8453193152541167e-06, "loss": 0.1191, "num_input_tokens_seen": 36073632, "step": 53515 }, { "epoch": 1.307502504092053, "grad_norm": 33.0948600769043, "learning_rate": 1.8452737516498576e-06, "loss": 0.0371, "num_input_tokens_seen": 36076704, "step": 53520 }, { "epoch": 1.3076246549239001, "grad_norm": 47.80204391479492, "learning_rate": 1.8452281818985402e-06, "loss": 0.074, "num_input_tokens_seen": 36079712, "step": 53525 }, { "epoch": 1.3077468057557473, "grad_norm": 0.07585328072309494, "learning_rate": 1.845182606000496e-06, "loss": 0.0565, "num_input_tokens_seen": 36082656, "step": 53530 }, { "epoch": 1.3078689565875945, "grad_norm": 24.113906860351562, "learning_rate": 1.845137023956057e-06, "loss": 0.0552, "num_input_tokens_seen": 36085728, "step": 53535 }, { "epoch": 1.3079911074194415, "grad_norm": 214.59718322753906, "learning_rate": 1.8450914357655538e-06, "loss": 0.02, "num_input_tokens_seen": 36088928, "step": 53540 }, { "epoch": 1.3081132582512887, "grad_norm": 0.5538395643234253, "learning_rate": 1.8450458414293187e-06, "loss": 0.1131, "num_input_tokens_seen": 36091936, "step": 53545 }, { "epoch": 1.3082354090831358, "grad_norm": 0.42593154311180115, "learning_rate": 1.8450002409476828e-06, "loss": 0.0659, "num_input_tokens_seen": 36095328, "step": 53550 }, { "epoch": 1.308357559914983, "grad_norm": 0.06701342761516571, "learning_rate": 1.844954634320978e-06, "loss": 0.0435, "num_input_tokens_seen": 36098784, "step": 53555 }, { "epoch": 1.3084797107468302, "grad_norm": 0.13026872277259827, "learning_rate": 1.8449090215495358e-06, "loss": 0.0249, "num_input_tokens_seen": 36102112, "step": 53560 }, { "epoch": 1.3086018615786774, "grad_norm": 42.17403793334961, "learning_rate": 1.8448634026336877e-06, "loss": 0.1132, "num_input_tokens_seen": 36105376, "step": 53565 }, { "epoch": 1.3087240124105244, "grad_norm": 0.14000383019447327, "learning_rate": 1.844817777573766e-06, "loss": 0.0413, "num_input_tokens_seen": 36108896, "step": 53570 }, { "epoch": 1.3088461632423716, "grad_norm": 0.750639021396637, "learning_rate": 1.844772146370102e-06, "loss": 0.0455, "num_input_tokens_seen": 36112416, "step": 53575 }, { "epoch": 1.3089683140742188, "grad_norm": 0.13125310838222504, "learning_rate": 1.8447265090230277e-06, "loss": 0.0558, "num_input_tokens_seen": 36115552, "step": 53580 }, { "epoch": 1.309090464906066, "grad_norm": 10.72737979888916, "learning_rate": 1.8446808655328755e-06, "loss": 0.0438, "num_input_tokens_seen": 36119264, "step": 53585 }, { "epoch": 1.3092126157379131, "grad_norm": 0.692608654499054, "learning_rate": 1.8446352158999764e-06, "loss": 0.1691, "num_input_tokens_seen": 36122848, "step": 53590 }, { "epoch": 1.3093347665697603, "grad_norm": 0.09286288172006607, "learning_rate": 1.8445895601246628e-06, "loss": 0.0925, "num_input_tokens_seen": 36126752, "step": 53595 }, { "epoch": 1.3094569174016075, "grad_norm": 0.0832144096493721, "learning_rate": 1.844543898207267e-06, "loss": 0.1044, "num_input_tokens_seen": 36129696, "step": 53600 }, { "epoch": 1.3095790682334547, "grad_norm": 0.653047502040863, "learning_rate": 1.8444982301481207e-06, "loss": 0.0014, "num_input_tokens_seen": 36133216, "step": 53605 }, { "epoch": 1.309701219065302, "grad_norm": 295.8388366699219, "learning_rate": 1.8444525559475559e-06, "loss": 0.0781, "num_input_tokens_seen": 36136416, "step": 53610 }, { "epoch": 1.309823369897149, "grad_norm": 0.0643298551440239, "learning_rate": 1.8444068756059052e-06, "loss": 0.0889, "num_input_tokens_seen": 36140128, "step": 53615 }, { "epoch": 1.3099455207289963, "grad_norm": 8.68905258178711, "learning_rate": 1.8443611891235008e-06, "loss": 0.1508, "num_input_tokens_seen": 36143968, "step": 53620 }, { "epoch": 1.3100676715608432, "grad_norm": 28.05567169189453, "learning_rate": 1.8443154965006741e-06, "loss": 0.1148, "num_input_tokens_seen": 36147104, "step": 53625 }, { "epoch": 1.3101898223926904, "grad_norm": 0.059366848319768906, "learning_rate": 1.8442697977377586e-06, "loss": 0.0471, "num_input_tokens_seen": 36150560, "step": 53630 }, { "epoch": 1.3103119732245376, "grad_norm": 0.25347110629081726, "learning_rate": 1.8442240928350858e-06, "loss": 0.063, "num_input_tokens_seen": 36153760, "step": 53635 }, { "epoch": 1.3104341240563848, "grad_norm": 20.07191276550293, "learning_rate": 1.8441783817929885e-06, "loss": 0.1102, "num_input_tokens_seen": 36156896, "step": 53640 }, { "epoch": 1.310556274888232, "grad_norm": 0.0963999480009079, "learning_rate": 1.844132664611799e-06, "loss": 0.0036, "num_input_tokens_seen": 36160480, "step": 53645 }, { "epoch": 1.3106784257200792, "grad_norm": 44.44879150390625, "learning_rate": 1.8440869412918497e-06, "loss": 0.126, "num_input_tokens_seen": 36164192, "step": 53650 }, { "epoch": 1.3108005765519264, "grad_norm": 0.14370276033878326, "learning_rate": 1.8440412118334727e-06, "loss": 0.0899, "num_input_tokens_seen": 36167968, "step": 53655 }, { "epoch": 1.3109227273837734, "grad_norm": 0.36140382289886475, "learning_rate": 1.8439954762370015e-06, "loss": 0.0861, "num_input_tokens_seen": 36171296, "step": 53660 }, { "epoch": 1.3110448782156205, "grad_norm": 0.23507973551750183, "learning_rate": 1.8439497345027677e-06, "loss": 0.0879, "num_input_tokens_seen": 36174304, "step": 53665 }, { "epoch": 1.3111670290474677, "grad_norm": 0.28569769859313965, "learning_rate": 1.8439039866311049e-06, "loss": 0.0018, "num_input_tokens_seen": 36177248, "step": 53670 }, { "epoch": 1.311289179879315, "grad_norm": 0.07938076555728912, "learning_rate": 1.8438582326223451e-06, "loss": 0.1782, "num_input_tokens_seen": 36180512, "step": 53675 }, { "epoch": 1.3114113307111621, "grad_norm": 8.001997947692871, "learning_rate": 1.8438124724768213e-06, "loss": 0.1314, "num_input_tokens_seen": 36183648, "step": 53680 }, { "epoch": 1.3115334815430093, "grad_norm": 7.661314487457275, "learning_rate": 1.843766706194866e-06, "loss": 0.0956, "num_input_tokens_seen": 36186656, "step": 53685 }, { "epoch": 1.3116556323748565, "grad_norm": 11.160171508789062, "learning_rate": 1.8437209337768127e-06, "loss": 0.0805, "num_input_tokens_seen": 36189920, "step": 53690 }, { "epoch": 1.3117777832067037, "grad_norm": 0.42267847061157227, "learning_rate": 1.8436751552229937e-06, "loss": 0.0928, "num_input_tokens_seen": 36193120, "step": 53695 }, { "epoch": 1.3118999340385509, "grad_norm": 133.1011199951172, "learning_rate": 1.843629370533742e-06, "loss": 0.0605, "num_input_tokens_seen": 36196960, "step": 53700 }, { "epoch": 1.312022084870398, "grad_norm": 30.683536529541016, "learning_rate": 1.8435835797093906e-06, "loss": 0.2534, "num_input_tokens_seen": 36200224, "step": 53705 }, { "epoch": 1.3121442357022453, "grad_norm": 27.335384368896484, "learning_rate": 1.8435377827502724e-06, "loss": 0.0969, "num_input_tokens_seen": 36204192, "step": 53710 }, { "epoch": 1.3122663865340922, "grad_norm": 14.113719940185547, "learning_rate": 1.8434919796567208e-06, "loss": 0.0439, "num_input_tokens_seen": 36209568, "step": 53715 }, { "epoch": 1.3123885373659394, "grad_norm": 8.532432556152344, "learning_rate": 1.8434461704290685e-06, "loss": 0.0879, "num_input_tokens_seen": 36212640, "step": 53720 }, { "epoch": 1.3125106881977866, "grad_norm": 0.20278561115264893, "learning_rate": 1.8434003550676488e-06, "loss": 0.0398, "num_input_tokens_seen": 36216544, "step": 53725 }, { "epoch": 1.3126328390296338, "grad_norm": 0.1209520697593689, "learning_rate": 1.843354533572795e-06, "loss": 0.1991, "num_input_tokens_seen": 36219808, "step": 53730 }, { "epoch": 1.312754989861481, "grad_norm": 0.5357292294502258, "learning_rate": 1.84330870594484e-06, "loss": 0.0356, "num_input_tokens_seen": 36222944, "step": 53735 }, { "epoch": 1.3128771406933282, "grad_norm": 1.1882165670394897, "learning_rate": 1.8432628721841174e-06, "loss": 0.0385, "num_input_tokens_seen": 36226080, "step": 53740 }, { "epoch": 1.3129992915251754, "grad_norm": 0.13929365575313568, "learning_rate": 1.8432170322909602e-06, "loss": 0.0696, "num_input_tokens_seen": 36229472, "step": 53745 }, { "epoch": 1.3131214423570223, "grad_norm": 0.17472250759601593, "learning_rate": 1.8431711862657022e-06, "loss": 0.1067, "num_input_tokens_seen": 36232800, "step": 53750 }, { "epoch": 1.3132435931888695, "grad_norm": 7.303803443908691, "learning_rate": 1.8431253341086764e-06, "loss": 0.1298, "num_input_tokens_seen": 36236704, "step": 53755 }, { "epoch": 1.3133657440207167, "grad_norm": 75.92008972167969, "learning_rate": 1.8430794758202165e-06, "loss": 0.1319, "num_input_tokens_seen": 36239712, "step": 53760 }, { "epoch": 1.313487894852564, "grad_norm": 0.2109651118516922, "learning_rate": 1.8430336114006555e-06, "loss": 0.0364, "num_input_tokens_seen": 36243168, "step": 53765 }, { "epoch": 1.313610045684411, "grad_norm": 0.046810101717710495, "learning_rate": 1.8429877408503279e-06, "loss": 0.0531, "num_input_tokens_seen": 36246816, "step": 53770 }, { "epoch": 1.3137321965162583, "grad_norm": 0.08798840641975403, "learning_rate": 1.8429418641695665e-06, "loss": 0.0306, "num_input_tokens_seen": 36250144, "step": 53775 }, { "epoch": 1.3138543473481055, "grad_norm": 0.4276348352432251, "learning_rate": 1.8428959813587048e-06, "loss": 0.0013, "num_input_tokens_seen": 36253280, "step": 53780 }, { "epoch": 1.3139764981799527, "grad_norm": 19.770280838012695, "learning_rate": 1.8428500924180774e-06, "loss": 0.2003, "num_input_tokens_seen": 36256864, "step": 53785 }, { "epoch": 1.3140986490117998, "grad_norm": 0.14534105360507965, "learning_rate": 1.842804197348017e-06, "loss": 0.0865, "num_input_tokens_seen": 36260064, "step": 53790 }, { "epoch": 1.314220799843647, "grad_norm": 15.357917785644531, "learning_rate": 1.8427582961488579e-06, "loss": 0.1247, "num_input_tokens_seen": 36263584, "step": 53795 }, { "epoch": 1.3143429506754942, "grad_norm": 30.096118927001953, "learning_rate": 1.8427123888209337e-06, "loss": 0.1986, "num_input_tokens_seen": 36267424, "step": 53800 }, { "epoch": 1.3144651015073412, "grad_norm": 191.9586181640625, "learning_rate": 1.8426664753645786e-06, "loss": 0.1484, "num_input_tokens_seen": 36271584, "step": 53805 }, { "epoch": 1.3145872523391884, "grad_norm": 11.76651382446289, "learning_rate": 1.8426205557801259e-06, "loss": 0.0482, "num_input_tokens_seen": 36274912, "step": 53810 }, { "epoch": 1.3147094031710356, "grad_norm": 66.1670913696289, "learning_rate": 1.84257463006791e-06, "loss": 0.0452, "num_input_tokens_seen": 36279392, "step": 53815 }, { "epoch": 1.3148315540028828, "grad_norm": 1.6978089809417725, "learning_rate": 1.842528698228265e-06, "loss": 0.0494, "num_input_tokens_seen": 36283040, "step": 53820 }, { "epoch": 1.31495370483473, "grad_norm": 0.1684218943119049, "learning_rate": 1.8424827602615247e-06, "loss": 0.0014, "num_input_tokens_seen": 36286304, "step": 53825 }, { "epoch": 1.3150758556665771, "grad_norm": 25.381460189819336, "learning_rate": 1.842436816168023e-06, "loss": 0.1553, "num_input_tokens_seen": 36289632, "step": 53830 }, { "epoch": 1.3151980064984243, "grad_norm": 0.4152490198612213, "learning_rate": 1.8423908659480943e-06, "loss": 0.0241, "num_input_tokens_seen": 36292960, "step": 53835 }, { "epoch": 1.3153201573302713, "grad_norm": 106.53073120117188, "learning_rate": 1.8423449096020724e-06, "loss": 0.113, "num_input_tokens_seen": 36295840, "step": 53840 }, { "epoch": 1.3154423081621185, "grad_norm": 8.367879867553711, "learning_rate": 1.842298947130292e-06, "loss": 0.1004, "num_input_tokens_seen": 36299360, "step": 53845 }, { "epoch": 1.3155644589939657, "grad_norm": 3.1824214458465576, "learning_rate": 1.8422529785330872e-06, "loss": 0.1731, "num_input_tokens_seen": 36302624, "step": 53850 }, { "epoch": 1.3156866098258129, "grad_norm": 0.230075404047966, "learning_rate": 1.8422070038107918e-06, "loss": 0.1034, "num_input_tokens_seen": 36305760, "step": 53855 }, { "epoch": 1.31580876065766, "grad_norm": 0.2761656939983368, "learning_rate": 1.8421610229637405e-06, "loss": 0.1065, "num_input_tokens_seen": 36309280, "step": 53860 }, { "epoch": 1.3159309114895072, "grad_norm": 32.37919616699219, "learning_rate": 1.842115035992268e-06, "loss": 0.1159, "num_input_tokens_seen": 36312672, "step": 53865 }, { "epoch": 1.3160530623213544, "grad_norm": 15.567728996276855, "learning_rate": 1.8420690428967087e-06, "loss": 0.0485, "num_input_tokens_seen": 36315808, "step": 53870 }, { "epoch": 1.3161752131532016, "grad_norm": 0.15238645672798157, "learning_rate": 1.8420230436773965e-06, "loss": 0.0246, "num_input_tokens_seen": 36319008, "step": 53875 }, { "epoch": 1.3162973639850488, "grad_norm": 0.3245013952255249, "learning_rate": 1.8419770383346664e-06, "loss": 0.0456, "num_input_tokens_seen": 36322080, "step": 53880 }, { "epoch": 1.316419514816896, "grad_norm": 0.5807885527610779, "learning_rate": 1.8419310268688525e-06, "loss": 0.1091, "num_input_tokens_seen": 36325600, "step": 53885 }, { "epoch": 1.3165416656487432, "grad_norm": 0.1590554565191269, "learning_rate": 1.84188500928029e-06, "loss": 0.091, "num_input_tokens_seen": 36328800, "step": 53890 }, { "epoch": 1.3166638164805902, "grad_norm": 0.3508022129535675, "learning_rate": 1.8418389855693132e-06, "loss": 0.1389, "num_input_tokens_seen": 36331744, "step": 53895 }, { "epoch": 1.3167859673124374, "grad_norm": 124.73145294189453, "learning_rate": 1.841792955736257e-06, "loss": 0.1134, "num_input_tokens_seen": 36334944, "step": 53900 }, { "epoch": 1.3169081181442845, "grad_norm": 66.41177368164062, "learning_rate": 1.841746919781456e-06, "loss": 0.1849, "num_input_tokens_seen": 36338272, "step": 53905 }, { "epoch": 1.3170302689761317, "grad_norm": 0.18778330087661743, "learning_rate": 1.8417008777052447e-06, "loss": 0.0561, "num_input_tokens_seen": 36341664, "step": 53910 }, { "epoch": 1.317152419807979, "grad_norm": 0.9680259823799133, "learning_rate": 1.8416548295079583e-06, "loss": 0.012, "num_input_tokens_seen": 36344992, "step": 53915 }, { "epoch": 1.3172745706398261, "grad_norm": 0.6024382710456848, "learning_rate": 1.841608775189932e-06, "loss": 0.0688, "num_input_tokens_seen": 36348320, "step": 53920 }, { "epoch": 1.3173967214716733, "grad_norm": 1.7419893741607666, "learning_rate": 1.8415627147514998e-06, "loss": 0.0525, "num_input_tokens_seen": 36352032, "step": 53925 }, { "epoch": 1.3175188723035203, "grad_norm": 5.147568225860596, "learning_rate": 1.8415166481929976e-06, "loss": 0.0305, "num_input_tokens_seen": 36355744, "step": 53930 }, { "epoch": 1.3176410231353675, "grad_norm": 0.10421989113092422, "learning_rate": 1.8414705755147597e-06, "loss": 0.0499, "num_input_tokens_seen": 36359584, "step": 53935 }, { "epoch": 1.3177631739672147, "grad_norm": 33.054359436035156, "learning_rate": 1.8414244967171216e-06, "loss": 0.0467, "num_input_tokens_seen": 36363104, "step": 53940 }, { "epoch": 1.3178853247990618, "grad_norm": 105.69279479980469, "learning_rate": 1.8413784118004184e-06, "loss": 0.1175, "num_input_tokens_seen": 36366432, "step": 53945 }, { "epoch": 1.318007475630909, "grad_norm": 0.07659657299518585, "learning_rate": 1.8413323207649847e-06, "loss": 0.0013, "num_input_tokens_seen": 36370208, "step": 53950 }, { "epoch": 1.3181296264627562, "grad_norm": 31.84999656677246, "learning_rate": 1.8412862236111565e-06, "loss": 0.1004, "num_input_tokens_seen": 36373536, "step": 53955 }, { "epoch": 1.3182517772946034, "grad_norm": 0.1787625253200531, "learning_rate": 1.8412401203392681e-06, "loss": 0.0384, "num_input_tokens_seen": 36377184, "step": 53960 }, { "epoch": 1.3183739281264506, "grad_norm": 18.62636947631836, "learning_rate": 1.8411940109496556e-06, "loss": 0.0789, "num_input_tokens_seen": 36380576, "step": 53965 }, { "epoch": 1.3184960789582978, "grad_norm": 1.6066945791244507, "learning_rate": 1.841147895442654e-06, "loss": 0.0648, "num_input_tokens_seen": 36383840, "step": 53970 }, { "epoch": 1.318618229790145, "grad_norm": 0.07495055347681046, "learning_rate": 1.8411017738185985e-06, "loss": 0.0579, "num_input_tokens_seen": 36387296, "step": 53975 }, { "epoch": 1.3187403806219922, "grad_norm": 8.100799560546875, "learning_rate": 1.8410556460778248e-06, "loss": 0.0875, "num_input_tokens_seen": 36390624, "step": 53980 }, { "epoch": 1.3188625314538391, "grad_norm": 10.08414077758789, "learning_rate": 1.8410095122206682e-06, "loss": 0.0979, "num_input_tokens_seen": 36393760, "step": 53985 }, { "epoch": 1.3189846822856863, "grad_norm": 49.63814163208008, "learning_rate": 1.8409633722474642e-06, "loss": 0.0856, "num_input_tokens_seen": 36396704, "step": 53990 }, { "epoch": 1.3191068331175335, "grad_norm": 0.1970123052597046, "learning_rate": 1.8409172261585483e-06, "loss": 0.0566, "num_input_tokens_seen": 36399968, "step": 53995 }, { "epoch": 1.3192289839493807, "grad_norm": 26.510272979736328, "learning_rate": 1.8408710739542563e-06, "loss": 0.213, "num_input_tokens_seen": 36403808, "step": 54000 }, { "epoch": 1.319351134781228, "grad_norm": 0.24183687567710876, "learning_rate": 1.840824915634924e-06, "loss": 0.0025, "num_input_tokens_seen": 36406688, "step": 54005 }, { "epoch": 1.319473285613075, "grad_norm": 28.09096336364746, "learning_rate": 1.840778751200886e-06, "loss": 0.2569, "num_input_tokens_seen": 36409952, "step": 54010 }, { "epoch": 1.319595436444922, "grad_norm": 101.31450653076172, "learning_rate": 1.8407325806524795e-06, "loss": 0.1203, "num_input_tokens_seen": 36412960, "step": 54015 }, { "epoch": 1.3197175872767692, "grad_norm": 0.6510666608810425, "learning_rate": 1.840686403990039e-06, "loss": 0.0017, "num_input_tokens_seen": 36416736, "step": 54020 }, { "epoch": 1.3198397381086164, "grad_norm": 0.13488955795764923, "learning_rate": 1.8406402212139011e-06, "loss": 0.0513, "num_input_tokens_seen": 36419872, "step": 54025 }, { "epoch": 1.3199618889404636, "grad_norm": 6.511417388916016, "learning_rate": 1.8405940323244013e-06, "loss": 0.0788, "num_input_tokens_seen": 36423200, "step": 54030 }, { "epoch": 1.3200840397723108, "grad_norm": 0.5640556216239929, "learning_rate": 1.8405478373218757e-06, "loss": 0.1123, "num_input_tokens_seen": 36426720, "step": 54035 }, { "epoch": 1.320206190604158, "grad_norm": 7.7084269523620605, "learning_rate": 1.8405016362066604e-06, "loss": 0.1202, "num_input_tokens_seen": 36429920, "step": 54040 }, { "epoch": 1.3203283414360052, "grad_norm": 1.3634628057479858, "learning_rate": 1.8404554289790906e-06, "loss": 0.0792, "num_input_tokens_seen": 36433760, "step": 54045 }, { "epoch": 1.3204504922678524, "grad_norm": 127.5936508178711, "learning_rate": 1.8404092156395032e-06, "loss": 0.109, "num_input_tokens_seen": 36437024, "step": 54050 }, { "epoch": 1.3205726430996996, "grad_norm": 31.527488708496094, "learning_rate": 1.8403629961882338e-06, "loss": 0.0546, "num_input_tokens_seen": 36440736, "step": 54055 }, { "epoch": 1.3206947939315468, "grad_norm": 45.58065414428711, "learning_rate": 1.8403167706256188e-06, "loss": 0.1726, "num_input_tokens_seen": 36444640, "step": 54060 }, { "epoch": 1.320816944763394, "grad_norm": 0.5619506239891052, "learning_rate": 1.8402705389519941e-06, "loss": 0.0416, "num_input_tokens_seen": 36448160, "step": 54065 }, { "epoch": 1.3209390955952411, "grad_norm": 0.17187543213367462, "learning_rate": 1.8402243011676961e-06, "loss": 0.07, "num_input_tokens_seen": 36451296, "step": 54070 }, { "epoch": 1.321061246427088, "grad_norm": 6.773105621337891, "learning_rate": 1.8401780572730609e-06, "loss": 0.0016, "num_input_tokens_seen": 36455008, "step": 54075 }, { "epoch": 1.3211833972589353, "grad_norm": 0.2591158151626587, "learning_rate": 1.8401318072684248e-06, "loss": 0.1247, "num_input_tokens_seen": 36458080, "step": 54080 }, { "epoch": 1.3213055480907825, "grad_norm": 0.9889698028564453, "learning_rate": 1.8400855511541246e-06, "loss": 0.0699, "num_input_tokens_seen": 36461792, "step": 54085 }, { "epoch": 1.3214276989226297, "grad_norm": 2.2531867027282715, "learning_rate": 1.8400392889304961e-06, "loss": 0.044, "num_input_tokens_seen": 36464800, "step": 54090 }, { "epoch": 1.3215498497544769, "grad_norm": 0.2619583010673523, "learning_rate": 1.839993020597876e-06, "loss": 0.0501, "num_input_tokens_seen": 36469088, "step": 54095 }, { "epoch": 1.321672000586324, "grad_norm": 0.28559795022010803, "learning_rate": 1.8399467461566006e-06, "loss": 0.042, "num_input_tokens_seen": 36472288, "step": 54100 }, { "epoch": 1.321794151418171, "grad_norm": 0.024589255452156067, "learning_rate": 1.8399004656070067e-06, "loss": 0.046, "num_input_tokens_seen": 36475552, "step": 54105 }, { "epoch": 1.3219163022500182, "grad_norm": 0.2193070650100708, "learning_rate": 1.8398541789494307e-06, "loss": 0.042, "num_input_tokens_seen": 36478944, "step": 54110 }, { "epoch": 1.3220384530818654, "grad_norm": 17.28835678100586, "learning_rate": 1.839807886184209e-06, "loss": 0.0808, "num_input_tokens_seen": 36482208, "step": 54115 }, { "epoch": 1.3221606039137126, "grad_norm": 41.440818786621094, "learning_rate": 1.8397615873116785e-06, "loss": 0.1379, "num_input_tokens_seen": 36485152, "step": 54120 }, { "epoch": 1.3222827547455598, "grad_norm": 0.27941083908081055, "learning_rate": 1.8397152823321761e-06, "loss": 0.0595, "num_input_tokens_seen": 36488160, "step": 54125 }, { "epoch": 1.322404905577407, "grad_norm": 0.16612698137760162, "learning_rate": 1.8396689712460382e-06, "loss": 0.0372, "num_input_tokens_seen": 36491616, "step": 54130 }, { "epoch": 1.3225270564092542, "grad_norm": 0.026570206508040428, "learning_rate": 1.8396226540536017e-06, "loss": 0.0994, "num_input_tokens_seen": 36494880, "step": 54135 }, { "epoch": 1.3226492072411014, "grad_norm": 96.77842712402344, "learning_rate": 1.8395763307552034e-06, "loss": 0.1139, "num_input_tokens_seen": 36498144, "step": 54140 }, { "epoch": 1.3227713580729485, "grad_norm": 76.5399169921875, "learning_rate": 1.8395300013511803e-06, "loss": 0.067, "num_input_tokens_seen": 36501344, "step": 54145 }, { "epoch": 1.3228935089047957, "grad_norm": 0.3292557895183563, "learning_rate": 1.839483665841869e-06, "loss": 0.0782, "num_input_tokens_seen": 36504480, "step": 54150 }, { "epoch": 1.323015659736643, "grad_norm": 10.224272727966309, "learning_rate": 1.8394373242276069e-06, "loss": 0.0399, "num_input_tokens_seen": 36507936, "step": 54155 }, { "epoch": 1.32313781056849, "grad_norm": 14.834850311279297, "learning_rate": 1.8393909765087307e-06, "loss": 0.0857, "num_input_tokens_seen": 36511008, "step": 54160 }, { "epoch": 1.323259961400337, "grad_norm": 40.261287689208984, "learning_rate": 1.8393446226855779e-06, "loss": 0.078, "num_input_tokens_seen": 36513952, "step": 54165 }, { "epoch": 1.3233821122321843, "grad_norm": 1.2118006944656372, "learning_rate": 1.8392982627584845e-06, "loss": 0.0536, "num_input_tokens_seen": 36517088, "step": 54170 }, { "epoch": 1.3235042630640315, "grad_norm": 0.18435494601726532, "learning_rate": 1.839251896727789e-06, "loss": 0.0804, "num_input_tokens_seen": 36520544, "step": 54175 }, { "epoch": 1.3236264138958787, "grad_norm": 0.12301947921514511, "learning_rate": 1.8392055245938277e-06, "loss": 0.0014, "num_input_tokens_seen": 36523808, "step": 54180 }, { "epoch": 1.3237485647277258, "grad_norm": 0.2818373739719391, "learning_rate": 1.8391591463569383e-06, "loss": 0.0521, "num_input_tokens_seen": 36527072, "step": 54185 }, { "epoch": 1.323870715559573, "grad_norm": 18.952112197875977, "learning_rate": 1.8391127620174578e-06, "loss": 0.091, "num_input_tokens_seen": 36530336, "step": 54190 }, { "epoch": 1.32399286639142, "grad_norm": 12.950271606445312, "learning_rate": 1.8390663715757236e-06, "loss": 0.151, "num_input_tokens_seen": 36533344, "step": 54195 }, { "epoch": 1.3241150172232672, "grad_norm": 15.284626007080078, "learning_rate": 1.839019975032073e-06, "loss": 0.2122, "num_input_tokens_seen": 36536992, "step": 54200 }, { "epoch": 1.3242371680551144, "grad_norm": 3.371776819229126, "learning_rate": 1.8389735723868433e-06, "loss": 0.0159, "num_input_tokens_seen": 36540448, "step": 54205 }, { "epoch": 1.3243593188869616, "grad_norm": 8.688360214233398, "learning_rate": 1.8389271636403726e-06, "loss": 0.0922, "num_input_tokens_seen": 36544032, "step": 54210 }, { "epoch": 1.3244814697188088, "grad_norm": 11.385930061340332, "learning_rate": 1.8388807487929977e-06, "loss": 0.1983, "num_input_tokens_seen": 36547488, "step": 54215 }, { "epoch": 1.324603620550656, "grad_norm": 0.7899482846260071, "learning_rate": 1.8388343278450562e-06, "loss": 0.0692, "num_input_tokens_seen": 36550496, "step": 54220 }, { "epoch": 1.3247257713825031, "grad_norm": 0.2948339283466339, "learning_rate": 1.838787900796886e-06, "loss": 0.1026, "num_input_tokens_seen": 36554528, "step": 54225 }, { "epoch": 1.3248479222143503, "grad_norm": 15.418424606323242, "learning_rate": 1.8387414676488247e-06, "loss": 0.0733, "num_input_tokens_seen": 36557792, "step": 54230 }, { "epoch": 1.3249700730461975, "grad_norm": 0.3756644129753113, "learning_rate": 1.8386950284012097e-06, "loss": 0.043, "num_input_tokens_seen": 36560928, "step": 54235 }, { "epoch": 1.3250922238780447, "grad_norm": 61.26029586791992, "learning_rate": 1.8386485830543787e-06, "loss": 0.029, "num_input_tokens_seen": 36563616, "step": 54240 }, { "epoch": 1.325214374709892, "grad_norm": 46.892730712890625, "learning_rate": 1.83860213160867e-06, "loss": 0.0231, "num_input_tokens_seen": 36567072, "step": 54245 }, { "epoch": 1.3253365255417389, "grad_norm": 18.61212921142578, "learning_rate": 1.8385556740644207e-06, "loss": 0.0864, "num_input_tokens_seen": 36570208, "step": 54250 }, { "epoch": 1.325458676373586, "grad_norm": 0.2406308799982071, "learning_rate": 1.8385092104219692e-06, "loss": 0.0398, "num_input_tokens_seen": 36573856, "step": 54255 }, { "epoch": 1.3255808272054332, "grad_norm": 14.590946197509766, "learning_rate": 1.8384627406816532e-06, "loss": 0.1231, "num_input_tokens_seen": 36577248, "step": 54260 }, { "epoch": 1.3257029780372804, "grad_norm": 10.663087844848633, "learning_rate": 1.8384162648438104e-06, "loss": 0.0889, "num_input_tokens_seen": 36580768, "step": 54265 }, { "epoch": 1.3258251288691276, "grad_norm": 29.073938369750977, "learning_rate": 1.8383697829087792e-06, "loss": 0.1318, "num_input_tokens_seen": 36584416, "step": 54270 }, { "epoch": 1.3259472797009748, "grad_norm": 31.9458065032959, "learning_rate": 1.8383232948768975e-06, "loss": 0.0856, "num_input_tokens_seen": 36588000, "step": 54275 }, { "epoch": 1.326069430532822, "grad_norm": 13.791265487670898, "learning_rate": 1.8382768007485033e-06, "loss": 0.1556, "num_input_tokens_seen": 36591072, "step": 54280 }, { "epoch": 1.326191581364669, "grad_norm": 22.632469177246094, "learning_rate": 1.8382303005239346e-06, "loss": 0.1886, "num_input_tokens_seen": 36594464, "step": 54285 }, { "epoch": 1.3263137321965162, "grad_norm": 7.899172306060791, "learning_rate": 1.8381837942035299e-06, "loss": 0.0695, "num_input_tokens_seen": 36597920, "step": 54290 }, { "epoch": 1.3264358830283633, "grad_norm": 0.49219897389411926, "learning_rate": 1.838137281787627e-06, "loss": 0.1333, "num_input_tokens_seen": 36601056, "step": 54295 }, { "epoch": 1.3265580338602105, "grad_norm": 0.22854328155517578, "learning_rate": 1.8380907632765644e-06, "loss": 0.0037, "num_input_tokens_seen": 36604256, "step": 54300 }, { "epoch": 1.3266801846920577, "grad_norm": 58.15485763549805, "learning_rate": 1.8380442386706805e-06, "loss": 0.1421, "num_input_tokens_seen": 36607584, "step": 54305 }, { "epoch": 1.326802335523905, "grad_norm": 0.4951092600822449, "learning_rate": 1.8379977079703134e-06, "loss": 0.0126, "num_input_tokens_seen": 36611360, "step": 54310 }, { "epoch": 1.326924486355752, "grad_norm": 0.1765560656785965, "learning_rate": 1.8379511711758013e-06, "loss": 0.1882, "num_input_tokens_seen": 36614432, "step": 54315 }, { "epoch": 1.3270466371875993, "grad_norm": 14.93560791015625, "learning_rate": 1.8379046282874833e-06, "loss": 0.1443, "num_input_tokens_seen": 36617504, "step": 54320 }, { "epoch": 1.3271687880194465, "grad_norm": 0.6865839958190918, "learning_rate": 1.8378580793056972e-06, "loss": 0.1574, "num_input_tokens_seen": 36620512, "step": 54325 }, { "epoch": 1.3272909388512937, "grad_norm": 0.17002366483211517, "learning_rate": 1.837811524230782e-06, "loss": 0.0449, "num_input_tokens_seen": 36624032, "step": 54330 }, { "epoch": 1.3274130896831409, "grad_norm": 68.06137084960938, "learning_rate": 1.837764963063076e-06, "loss": 0.0106, "num_input_tokens_seen": 36627552, "step": 54335 }, { "epoch": 1.3275352405149878, "grad_norm": 12.449389457702637, "learning_rate": 1.837718395802918e-06, "loss": 0.0677, "num_input_tokens_seen": 36630880, "step": 54340 }, { "epoch": 1.327657391346835, "grad_norm": 35.67185592651367, "learning_rate": 1.8376718224506462e-06, "loss": 0.0778, "num_input_tokens_seen": 36634336, "step": 54345 }, { "epoch": 1.3277795421786822, "grad_norm": 0.34582069516181946, "learning_rate": 1.8376252430065996e-06, "loss": 0.0222, "num_input_tokens_seen": 36637472, "step": 54350 }, { "epoch": 1.3279016930105294, "grad_norm": 25.928327560424805, "learning_rate": 1.8375786574711172e-06, "loss": 0.0163, "num_input_tokens_seen": 36640864, "step": 54355 }, { "epoch": 1.3280238438423766, "grad_norm": 44.198951721191406, "learning_rate": 1.8375320658445373e-06, "loss": 0.1653, "num_input_tokens_seen": 36644256, "step": 54360 }, { "epoch": 1.3281459946742238, "grad_norm": 22.284814834594727, "learning_rate": 1.8374854681271991e-06, "loss": 0.0613, "num_input_tokens_seen": 36647584, "step": 54365 }, { "epoch": 1.328268145506071, "grad_norm": 0.10727919638156891, "learning_rate": 1.8374388643194415e-06, "loss": 0.1438, "num_input_tokens_seen": 36651040, "step": 54370 }, { "epoch": 1.328390296337918, "grad_norm": 6.798345565795898, "learning_rate": 1.8373922544216026e-06, "loss": 0.003, "num_input_tokens_seen": 36654304, "step": 54375 }, { "epoch": 1.3285124471697651, "grad_norm": 24.0903263092041, "learning_rate": 1.8373456384340224e-06, "loss": 0.1305, "num_input_tokens_seen": 36657504, "step": 54380 }, { "epoch": 1.3286345980016123, "grad_norm": 0.21511498093605042, "learning_rate": 1.8372990163570396e-06, "loss": 0.033, "num_input_tokens_seen": 36660704, "step": 54385 }, { "epoch": 1.3287567488334595, "grad_norm": 0.841050922870636, "learning_rate": 1.8372523881909929e-06, "loss": 0.0023, "num_input_tokens_seen": 36664288, "step": 54390 }, { "epoch": 1.3288788996653067, "grad_norm": 37.736202239990234, "learning_rate": 1.837205753936222e-06, "loss": 0.1235, "num_input_tokens_seen": 36667872, "step": 54395 }, { "epoch": 1.329001050497154, "grad_norm": 12.41240406036377, "learning_rate": 1.8371591135930653e-06, "loss": 0.1215, "num_input_tokens_seen": 36671328, "step": 54400 }, { "epoch": 1.329123201329001, "grad_norm": 0.09699174016714096, "learning_rate": 1.8371124671618627e-06, "loss": 0.0267, "num_input_tokens_seen": 36674400, "step": 54405 }, { "epoch": 1.3292453521608483, "grad_norm": 0.32535770535469055, "learning_rate": 1.8370658146429529e-06, "loss": 0.0422, "num_input_tokens_seen": 36678560, "step": 54410 }, { "epoch": 1.3293675029926955, "grad_norm": 15.304163932800293, "learning_rate": 1.8370191560366752e-06, "loss": 0.0718, "num_input_tokens_seen": 36681568, "step": 54415 }, { "epoch": 1.3294896538245426, "grad_norm": 12.182839393615723, "learning_rate": 1.8369724913433694e-06, "loss": 0.1318, "num_input_tokens_seen": 36684832, "step": 54420 }, { "epoch": 1.3296118046563898, "grad_norm": 0.05617258697748184, "learning_rate": 1.8369258205633741e-06, "loss": 0.08, "num_input_tokens_seen": 36688608, "step": 54425 }, { "epoch": 1.3297339554882368, "grad_norm": 30.782344818115234, "learning_rate": 1.8368791436970295e-06, "loss": 0.26, "num_input_tokens_seen": 36692000, "step": 54430 }, { "epoch": 1.329856106320084, "grad_norm": 0.36918166279792786, "learning_rate": 1.8368324607446747e-06, "loss": 0.0638, "num_input_tokens_seen": 36695904, "step": 54435 }, { "epoch": 1.3299782571519312, "grad_norm": 0.805653989315033, "learning_rate": 1.8367857717066485e-06, "loss": 0.015, "num_input_tokens_seen": 36699360, "step": 54440 }, { "epoch": 1.3301004079837784, "grad_norm": 0.3441406786441803, "learning_rate": 1.8367390765832917e-06, "loss": 0.0694, "num_input_tokens_seen": 36702624, "step": 54445 }, { "epoch": 1.3302225588156256, "grad_norm": 28.947162628173828, "learning_rate": 1.8366923753749433e-06, "loss": 0.1328, "num_input_tokens_seen": 36706272, "step": 54450 }, { "epoch": 1.3303447096474728, "grad_norm": 0.245017409324646, "learning_rate": 1.8366456680819428e-06, "loss": 0.068, "num_input_tokens_seen": 36709344, "step": 54455 }, { "epoch": 1.33046686047932, "grad_norm": 97.548828125, "learning_rate": 1.83659895470463e-06, "loss": 0.1217, "num_input_tokens_seen": 36712672, "step": 54460 }, { "epoch": 1.330589011311167, "grad_norm": 18.679906845092773, "learning_rate": 1.8365522352433445e-06, "loss": 0.1103, "num_input_tokens_seen": 36716000, "step": 54465 }, { "epoch": 1.330711162143014, "grad_norm": 0.12989388406276703, "learning_rate": 1.8365055096984264e-06, "loss": 0.036, "num_input_tokens_seen": 36719968, "step": 54470 }, { "epoch": 1.3308333129748613, "grad_norm": 0.12728965282440186, "learning_rate": 1.8364587780702147e-06, "loss": 0.0812, "num_input_tokens_seen": 36723360, "step": 54475 }, { "epoch": 1.3309554638067085, "grad_norm": 0.06374026089906693, "learning_rate": 1.8364120403590502e-06, "loss": 0.1162, "num_input_tokens_seen": 36726432, "step": 54480 }, { "epoch": 1.3310776146385557, "grad_norm": 3.7236487865448, "learning_rate": 1.8363652965652723e-06, "loss": 0.1438, "num_input_tokens_seen": 36729888, "step": 54485 }, { "epoch": 1.3311997654704029, "grad_norm": 0.206837460398674, "learning_rate": 1.836318546689221e-06, "loss": 0.0286, "num_input_tokens_seen": 36732896, "step": 54490 }, { "epoch": 1.33132191630225, "grad_norm": 0.04891075938940048, "learning_rate": 1.8362717907312364e-06, "loss": 0.0012, "num_input_tokens_seen": 36736352, "step": 54495 }, { "epoch": 1.3314440671340972, "grad_norm": 0.18277254700660706, "learning_rate": 1.8362250286916581e-06, "loss": 0.0319, "num_input_tokens_seen": 36740064, "step": 54500 }, { "epoch": 1.3315662179659444, "grad_norm": 0.12982165813446045, "learning_rate": 1.8361782605708267e-06, "loss": 0.0768, "num_input_tokens_seen": 36743328, "step": 54505 }, { "epoch": 1.3316883687977916, "grad_norm": 0.2659108638763428, "learning_rate": 1.836131486369082e-06, "loss": 0.0441, "num_input_tokens_seen": 36746464, "step": 54510 }, { "epoch": 1.3318105196296388, "grad_norm": 0.06435118615627289, "learning_rate": 1.8360847060867642e-06, "loss": 0.0483, "num_input_tokens_seen": 36750240, "step": 54515 }, { "epoch": 1.3319326704614858, "grad_norm": 15.892475128173828, "learning_rate": 1.8360379197242137e-06, "loss": 0.0849, "num_input_tokens_seen": 36753760, "step": 54520 }, { "epoch": 1.332054821293333, "grad_norm": 28.929258346557617, "learning_rate": 1.8359911272817706e-06, "loss": 0.0359, "num_input_tokens_seen": 36757216, "step": 54525 }, { "epoch": 1.3321769721251802, "grad_norm": 59.461448669433594, "learning_rate": 1.835944328759775e-06, "loss": 0.0463, "num_input_tokens_seen": 36760864, "step": 54530 }, { "epoch": 1.3322991229570273, "grad_norm": 11.817412376403809, "learning_rate": 1.8358975241585675e-06, "loss": 0.1228, "num_input_tokens_seen": 36764064, "step": 54535 }, { "epoch": 1.3324212737888745, "grad_norm": 0.08971010148525238, "learning_rate": 1.8358507134784882e-06, "loss": 0.0567, "num_input_tokens_seen": 36767520, "step": 54540 }, { "epoch": 1.3325434246207217, "grad_norm": 0.039157934486866, "learning_rate": 1.8358038967198776e-06, "loss": 0.1073, "num_input_tokens_seen": 36771296, "step": 54545 }, { "epoch": 1.3326655754525687, "grad_norm": 0.17689518630504608, "learning_rate": 1.8357570738830768e-06, "loss": 0.0462, "num_input_tokens_seen": 36774880, "step": 54550 }, { "epoch": 1.3327877262844159, "grad_norm": 0.06715977936983109, "learning_rate": 1.8357102449684254e-06, "loss": 0.053, "num_input_tokens_seen": 36778464, "step": 54555 }, { "epoch": 1.332909877116263, "grad_norm": 11.433544158935547, "learning_rate": 1.8356634099762643e-06, "loss": 0.0801, "num_input_tokens_seen": 36781792, "step": 54560 }, { "epoch": 1.3330320279481103, "grad_norm": 0.11322541534900665, "learning_rate": 1.8356165689069343e-06, "loss": 0.0478, "num_input_tokens_seen": 36784992, "step": 54565 }, { "epoch": 1.3331541787799575, "grad_norm": 0.3144833445549011, "learning_rate": 1.8355697217607758e-06, "loss": 0.0293, "num_input_tokens_seen": 36788320, "step": 54570 }, { "epoch": 1.3332763296118046, "grad_norm": 1.0199109315872192, "learning_rate": 1.8355228685381293e-06, "loss": 0.0018, "num_input_tokens_seen": 36791776, "step": 54575 }, { "epoch": 1.3333984804436518, "grad_norm": 0.3016822636127472, "learning_rate": 1.8354760092393363e-06, "loss": 0.1309, "num_input_tokens_seen": 36795040, "step": 54580 }, { "epoch": 1.333520631275499, "grad_norm": 20.83384895324707, "learning_rate": 1.8354291438647366e-06, "loss": 0.0681, "num_input_tokens_seen": 36798304, "step": 54585 }, { "epoch": 1.3336427821073462, "grad_norm": 0.282366007566452, "learning_rate": 1.8353822724146714e-06, "loss": 0.1073, "num_input_tokens_seen": 36802144, "step": 54590 }, { "epoch": 1.3337649329391934, "grad_norm": 0.6827936172485352, "learning_rate": 1.8353353948894819e-06, "loss": 0.1152, "num_input_tokens_seen": 36805408, "step": 54595 }, { "epoch": 1.3338870837710406, "grad_norm": 0.4102949798107147, "learning_rate": 1.8352885112895086e-06, "loss": 0.0011, "num_input_tokens_seen": 36809056, "step": 54600 }, { "epoch": 1.3340092346028878, "grad_norm": 0.2885624170303345, "learning_rate": 1.8352416216150926e-06, "loss": 0.1124, "num_input_tokens_seen": 36812128, "step": 54605 }, { "epoch": 1.3341313854347348, "grad_norm": 0.47842562198638916, "learning_rate": 1.8351947258665747e-06, "loss": 0.1089, "num_input_tokens_seen": 36816224, "step": 54610 }, { "epoch": 1.334253536266582, "grad_norm": 220.07904052734375, "learning_rate": 1.8351478240442963e-06, "loss": 0.0894, "num_input_tokens_seen": 36819424, "step": 54615 }, { "epoch": 1.3343756870984291, "grad_norm": 56.795082092285156, "learning_rate": 1.8351009161485983e-06, "loss": 0.0718, "num_input_tokens_seen": 36823136, "step": 54620 }, { "epoch": 1.3344978379302763, "grad_norm": 0.40516746044158936, "learning_rate": 1.835054002179822e-06, "loss": 0.032, "num_input_tokens_seen": 36826464, "step": 54625 }, { "epoch": 1.3346199887621235, "grad_norm": 0.2434043288230896, "learning_rate": 1.835007082138308e-06, "loss": 0.0474, "num_input_tokens_seen": 36829920, "step": 54630 }, { "epoch": 1.3347421395939707, "grad_norm": 2.3370065689086914, "learning_rate": 1.8349601560243983e-06, "loss": 0.0734, "num_input_tokens_seen": 36832928, "step": 54635 }, { "epoch": 1.3348642904258177, "grad_norm": 0.26671692728996277, "learning_rate": 1.8349132238384334e-06, "loss": 0.0744, "num_input_tokens_seen": 36836128, "step": 54640 }, { "epoch": 1.3349864412576649, "grad_norm": 0.07411301136016846, "learning_rate": 1.8348662855807552e-06, "loss": 0.0449, "num_input_tokens_seen": 36839776, "step": 54645 }, { "epoch": 1.335108592089512, "grad_norm": 0.05784473940730095, "learning_rate": 1.8348193412517051e-06, "loss": 0.236, "num_input_tokens_seen": 36842720, "step": 54650 }, { "epoch": 1.3352307429213592, "grad_norm": 17.83575439453125, "learning_rate": 1.8347723908516234e-06, "loss": 0.0411, "num_input_tokens_seen": 36846496, "step": 54655 }, { "epoch": 1.3353528937532064, "grad_norm": 0.06487853825092316, "learning_rate": 1.834725434380853e-06, "loss": 0.1155, "num_input_tokens_seen": 36849632, "step": 54660 }, { "epoch": 1.3354750445850536, "grad_norm": 18.144908905029297, "learning_rate": 1.8346784718397346e-06, "loss": 0.0413, "num_input_tokens_seen": 36852640, "step": 54665 }, { "epoch": 1.3355971954169008, "grad_norm": 3.455583095550537, "learning_rate": 1.8346315032286098e-06, "loss": 0.0515, "num_input_tokens_seen": 36856096, "step": 54670 }, { "epoch": 1.335719346248748, "grad_norm": 13.571123123168945, "learning_rate": 1.83458452854782e-06, "loss": 0.1245, "num_input_tokens_seen": 36859360, "step": 54675 }, { "epoch": 1.3358414970805952, "grad_norm": 31.01885414123535, "learning_rate": 1.8345375477977076e-06, "loss": 0.1605, "num_input_tokens_seen": 36862560, "step": 54680 }, { "epoch": 1.3359636479124424, "grad_norm": 0.04093638062477112, "learning_rate": 1.8344905609786132e-06, "loss": 0.1335, "num_input_tokens_seen": 36865696, "step": 54685 }, { "epoch": 1.3360857987442896, "grad_norm": 0.0845719650387764, "learning_rate": 1.8344435680908793e-06, "loss": 0.0543, "num_input_tokens_seen": 36868896, "step": 54690 }, { "epoch": 1.3362079495761365, "grad_norm": 17.767742156982422, "learning_rate": 1.8343965691348471e-06, "loss": 0.1462, "num_input_tokens_seen": 36872096, "step": 54695 }, { "epoch": 1.3363301004079837, "grad_norm": 34.696651458740234, "learning_rate": 1.8343495641108586e-06, "loss": 0.103, "num_input_tokens_seen": 36875616, "step": 54700 }, { "epoch": 1.336452251239831, "grad_norm": 2.183518886566162, "learning_rate": 1.8343025530192558e-06, "loss": 0.1377, "num_input_tokens_seen": 36878944, "step": 54705 }, { "epoch": 1.336574402071678, "grad_norm": 0.4736291766166687, "learning_rate": 1.8342555358603804e-06, "loss": 0.018, "num_input_tokens_seen": 36882336, "step": 54710 }, { "epoch": 1.3366965529035253, "grad_norm": 13.378650665283203, "learning_rate": 1.8342085126345743e-06, "loss": 0.1011, "num_input_tokens_seen": 36885792, "step": 54715 }, { "epoch": 1.3368187037353725, "grad_norm": 0.12041685730218887, "learning_rate": 1.8341614833421794e-06, "loss": 0.0022, "num_input_tokens_seen": 36889120, "step": 54720 }, { "epoch": 1.3369408545672197, "grad_norm": 0.9578997492790222, "learning_rate": 1.8341144479835382e-06, "loss": 0.054, "num_input_tokens_seen": 36892512, "step": 54725 }, { "epoch": 1.3370630053990666, "grad_norm": 0.6917126774787903, "learning_rate": 1.8340674065589923e-06, "loss": 0.0393, "num_input_tokens_seen": 36895904, "step": 54730 }, { "epoch": 1.3371851562309138, "grad_norm": 0.4527418911457062, "learning_rate": 1.8340203590688837e-06, "loss": 0.0437, "num_input_tokens_seen": 36899360, "step": 54735 }, { "epoch": 1.337307307062761, "grad_norm": 4.271320343017578, "learning_rate": 1.8339733055135546e-06, "loss": 0.046, "num_input_tokens_seen": 36902368, "step": 54740 }, { "epoch": 1.3374294578946082, "grad_norm": 23.710651397705078, "learning_rate": 1.8339262458933476e-06, "loss": 0.097, "num_input_tokens_seen": 36905696, "step": 54745 }, { "epoch": 1.3375516087264554, "grad_norm": 31.41887664794922, "learning_rate": 1.8338791802086045e-06, "loss": 0.1131, "num_input_tokens_seen": 36910176, "step": 54750 }, { "epoch": 1.3376737595583026, "grad_norm": 33.71608352661133, "learning_rate": 1.8338321084596678e-06, "loss": 0.068, "num_input_tokens_seen": 36913312, "step": 54755 }, { "epoch": 1.3377959103901498, "grad_norm": 16.4295711517334, "learning_rate": 1.8337850306468795e-06, "loss": 0.068, "num_input_tokens_seen": 36916576, "step": 54760 }, { "epoch": 1.337918061221997, "grad_norm": 0.2625795304775238, "learning_rate": 1.8337379467705824e-06, "loss": 0.0019, "num_input_tokens_seen": 36920352, "step": 54765 }, { "epoch": 1.3380402120538442, "grad_norm": 0.9406046271324158, "learning_rate": 1.8336908568311187e-06, "loss": 0.1602, "num_input_tokens_seen": 36923296, "step": 54770 }, { "epoch": 1.3381623628856913, "grad_norm": 28.13994026184082, "learning_rate": 1.8336437608288309e-06, "loss": 0.0468, "num_input_tokens_seen": 36926496, "step": 54775 }, { "epoch": 1.3382845137175385, "grad_norm": 8.35828971862793, "learning_rate": 1.8335966587640615e-06, "loss": 0.2092, "num_input_tokens_seen": 36929952, "step": 54780 }, { "epoch": 1.3384066645493855, "grad_norm": 0.12866359949111938, "learning_rate": 1.8335495506371529e-06, "loss": 0.002, "num_input_tokens_seen": 36933024, "step": 54785 }, { "epoch": 1.3385288153812327, "grad_norm": 0.12256859987974167, "learning_rate": 1.8335024364484477e-06, "loss": 0.0973, "num_input_tokens_seen": 36936608, "step": 54790 }, { "epoch": 1.3386509662130799, "grad_norm": 0.11025379598140717, "learning_rate": 1.8334553161982887e-06, "loss": 0.1195, "num_input_tokens_seen": 36940128, "step": 54795 }, { "epoch": 1.338773117044927, "grad_norm": 0.2998100817203522, "learning_rate": 1.8334081898870185e-06, "loss": 0.0434, "num_input_tokens_seen": 36944672, "step": 54800 }, { "epoch": 1.3388952678767743, "grad_norm": 0.1619083136320114, "learning_rate": 1.8333610575149795e-06, "loss": 0.053, "num_input_tokens_seen": 36948192, "step": 54805 }, { "epoch": 1.3390174187086215, "grad_norm": 0.2661537230014801, "learning_rate": 1.8333139190825149e-06, "loss": 0.0582, "num_input_tokens_seen": 36951456, "step": 54810 }, { "epoch": 1.3391395695404686, "grad_norm": 7.476252555847168, "learning_rate": 1.8332667745899672e-06, "loss": 0.0817, "num_input_tokens_seen": 36954400, "step": 54815 }, { "epoch": 1.3392617203723156, "grad_norm": 0.09815490990877151, "learning_rate": 1.8332196240376797e-06, "loss": 0.1396, "num_input_tokens_seen": 36957792, "step": 54820 }, { "epoch": 1.3393838712041628, "grad_norm": 0.5720617771148682, "learning_rate": 1.833172467425995e-06, "loss": 0.0484, "num_input_tokens_seen": 36960864, "step": 54825 }, { "epoch": 1.33950602203601, "grad_norm": 0.2281419336795807, "learning_rate": 1.8331253047552558e-06, "loss": 0.0858, "num_input_tokens_seen": 36964192, "step": 54830 }, { "epoch": 1.3396281728678572, "grad_norm": 16.90079116821289, "learning_rate": 1.8330781360258052e-06, "loss": 0.1635, "num_input_tokens_seen": 36968736, "step": 54835 }, { "epoch": 1.3397503236997044, "grad_norm": 0.24762359261512756, "learning_rate": 1.8330309612379867e-06, "loss": 0.0901, "num_input_tokens_seen": 36972192, "step": 54840 }, { "epoch": 1.3398724745315516, "grad_norm": 3.342427968978882, "learning_rate": 1.832983780392143e-06, "loss": 0.0333, "num_input_tokens_seen": 36975520, "step": 54845 }, { "epoch": 1.3399946253633988, "grad_norm": 27.706703186035156, "learning_rate": 1.8329365934886168e-06, "loss": 0.0752, "num_input_tokens_seen": 36979232, "step": 54850 }, { "epoch": 1.340116776195246, "grad_norm": 0.2744644284248352, "learning_rate": 1.8328894005277519e-06, "loss": 0.0017, "num_input_tokens_seen": 36982880, "step": 54855 }, { "epoch": 1.3402389270270931, "grad_norm": 5.694372177124023, "learning_rate": 1.8328422015098913e-06, "loss": 0.0017, "num_input_tokens_seen": 36986080, "step": 54860 }, { "epoch": 1.3403610778589403, "grad_norm": 27.293333053588867, "learning_rate": 1.832794996435378e-06, "loss": 0.2505, "num_input_tokens_seen": 36989216, "step": 54865 }, { "epoch": 1.3404832286907875, "grad_norm": 0.42047351598739624, "learning_rate": 1.8327477853045554e-06, "loss": 0.0796, "num_input_tokens_seen": 36992352, "step": 54870 }, { "epoch": 1.3406053795226345, "grad_norm": 0.1544327735900879, "learning_rate": 1.8327005681177674e-06, "loss": 0.0009, "num_input_tokens_seen": 36996128, "step": 54875 }, { "epoch": 1.3407275303544817, "grad_norm": 10.551417350769043, "learning_rate": 1.8326533448753565e-06, "loss": 0.0478, "num_input_tokens_seen": 36999136, "step": 54880 }, { "epoch": 1.3408496811863289, "grad_norm": 33.77448654174805, "learning_rate": 1.8326061155776666e-06, "loss": 0.1097, "num_input_tokens_seen": 37002656, "step": 54885 }, { "epoch": 1.340971832018176, "grad_norm": 0.24251246452331543, "learning_rate": 1.8325588802250411e-06, "loss": 0.1133, "num_input_tokens_seen": 37005728, "step": 54890 }, { "epoch": 1.3410939828500232, "grad_norm": 0.3213561177253723, "learning_rate": 1.8325116388178238e-06, "loss": 0.0501, "num_input_tokens_seen": 37008864, "step": 54895 }, { "epoch": 1.3412161336818704, "grad_norm": 15.927510261535645, "learning_rate": 1.8324643913563573e-06, "loss": 0.1672, "num_input_tokens_seen": 37012384, "step": 54900 }, { "epoch": 1.3413382845137176, "grad_norm": 11.647205352783203, "learning_rate": 1.8324171378409862e-06, "loss": 0.2182, "num_input_tokens_seen": 37015648, "step": 54905 }, { "epoch": 1.3414604353455646, "grad_norm": 0.29703488945961, "learning_rate": 1.832369878272054e-06, "loss": 0.0635, "num_input_tokens_seen": 37018976, "step": 54910 }, { "epoch": 1.3415825861774118, "grad_norm": 2.9432787895202637, "learning_rate": 1.832322612649904e-06, "loss": 0.0387, "num_input_tokens_seen": 37022112, "step": 54915 }, { "epoch": 1.341704737009259, "grad_norm": 0.3091279864311218, "learning_rate": 1.83227534097488e-06, "loss": 0.1057, "num_input_tokens_seen": 37025184, "step": 54920 }, { "epoch": 1.3418268878411062, "grad_norm": 282.4009704589844, "learning_rate": 1.8322280632473256e-06, "loss": 0.0704, "num_input_tokens_seen": 37028192, "step": 54925 }, { "epoch": 1.3419490386729533, "grad_norm": 10.560799598693848, "learning_rate": 1.8321807794675853e-06, "loss": 0.0933, "num_input_tokens_seen": 37031008, "step": 54930 }, { "epoch": 1.3420711895048005, "grad_norm": 48.856502532958984, "learning_rate": 1.8321334896360026e-06, "loss": 0.1536, "num_input_tokens_seen": 37034528, "step": 54935 }, { "epoch": 1.3421933403366477, "grad_norm": 0.8152018785476685, "learning_rate": 1.832086193752921e-06, "loss": 0.0302, "num_input_tokens_seen": 37037728, "step": 54940 }, { "epoch": 1.342315491168495, "grad_norm": 0.5506560206413269, "learning_rate": 1.832038891818685e-06, "loss": 0.0906, "num_input_tokens_seen": 37041312, "step": 54945 }, { "epoch": 1.342437642000342, "grad_norm": 0.3046576678752899, "learning_rate": 1.8319915838336387e-06, "loss": 0.072, "num_input_tokens_seen": 37044384, "step": 54950 }, { "epoch": 1.3425597928321893, "grad_norm": 0.04380049556493759, "learning_rate": 1.831944269798125e-06, "loss": 0.0005, "num_input_tokens_seen": 37048032, "step": 54955 }, { "epoch": 1.3426819436640365, "grad_norm": 15.407459259033203, "learning_rate": 1.8318969497124894e-06, "loss": 0.1263, "num_input_tokens_seen": 37051424, "step": 54960 }, { "epoch": 1.3428040944958834, "grad_norm": 0.5923014879226685, "learning_rate": 1.8318496235770756e-06, "loss": 0.1187, "num_input_tokens_seen": 37054816, "step": 54965 }, { "epoch": 1.3429262453277306, "grad_norm": 0.29871129989624023, "learning_rate": 1.8318022913922272e-06, "loss": 0.1211, "num_input_tokens_seen": 37058528, "step": 54970 }, { "epoch": 1.3430483961595778, "grad_norm": 0.05903381481766701, "learning_rate": 1.8317549531582888e-06, "loss": 0.0724, "num_input_tokens_seen": 37062112, "step": 54975 }, { "epoch": 1.343170546991425, "grad_norm": 0.6294770836830139, "learning_rate": 1.8317076088756047e-06, "loss": 0.0456, "num_input_tokens_seen": 37065184, "step": 54980 }, { "epoch": 1.3432926978232722, "grad_norm": 0.3868195712566376, "learning_rate": 1.8316602585445194e-06, "loss": 0.0368, "num_input_tokens_seen": 37068576, "step": 54985 }, { "epoch": 1.3434148486551194, "grad_norm": 0.2004663050174713, "learning_rate": 1.831612902165377e-06, "loss": 0.0752, "num_input_tokens_seen": 37071968, "step": 54990 }, { "epoch": 1.3435369994869666, "grad_norm": 0.35088229179382324, "learning_rate": 1.8315655397385217e-06, "loss": 0.0529, "num_input_tokens_seen": 37075104, "step": 54995 }, { "epoch": 1.3436591503188136, "grad_norm": 0.14613264799118042, "learning_rate": 1.8315181712642981e-06, "loss": 0.0326, "num_input_tokens_seen": 37078304, "step": 55000 }, { "epoch": 1.3437813011506607, "grad_norm": 17.115861892700195, "learning_rate": 1.8314707967430509e-06, "loss": 0.2934, "num_input_tokens_seen": 37081440, "step": 55005 }, { "epoch": 1.343903451982508, "grad_norm": 224.58563232421875, "learning_rate": 1.8314234161751242e-06, "loss": 0.1698, "num_input_tokens_seen": 37084128, "step": 55010 }, { "epoch": 1.3440256028143551, "grad_norm": 0.42700228095054626, "learning_rate": 1.8313760295608632e-06, "loss": 0.1783, "num_input_tokens_seen": 37088032, "step": 55015 }, { "epoch": 1.3441477536462023, "grad_norm": 0.14409606158733368, "learning_rate": 1.8313286369006119e-06, "loss": 0.0167, "num_input_tokens_seen": 37091104, "step": 55020 }, { "epoch": 1.3442699044780495, "grad_norm": 0.18862088024616241, "learning_rate": 1.8312812381947147e-06, "loss": 0.0025, "num_input_tokens_seen": 37094624, "step": 55025 }, { "epoch": 1.3443920553098967, "grad_norm": 0.29231026768684387, "learning_rate": 1.8312338334435174e-06, "loss": 0.0877, "num_input_tokens_seen": 37098272, "step": 55030 }, { "epoch": 1.3445142061417439, "grad_norm": 0.12736621499061584, "learning_rate": 1.8311864226473636e-06, "loss": 0.1341, "num_input_tokens_seen": 37101344, "step": 55035 }, { "epoch": 1.344636356973591, "grad_norm": 0.0764683336019516, "learning_rate": 1.831139005806599e-06, "loss": 0.0253, "num_input_tokens_seen": 37104800, "step": 55040 }, { "epoch": 1.3447585078054383, "grad_norm": 6.596743106842041, "learning_rate": 1.8310915829215677e-06, "loss": 0.0942, "num_input_tokens_seen": 37108384, "step": 55045 }, { "epoch": 1.3448806586372855, "grad_norm": 0.1305336207151413, "learning_rate": 1.831044153992615e-06, "loss": 0.2179, "num_input_tokens_seen": 37111776, "step": 55050 }, { "epoch": 1.3450028094691324, "grad_norm": 35.683441162109375, "learning_rate": 1.8309967190200855e-06, "loss": 0.1414, "num_input_tokens_seen": 37114976, "step": 55055 }, { "epoch": 1.3451249603009796, "grad_norm": 12.118048667907715, "learning_rate": 1.8309492780043243e-06, "loss": 0.1777, "num_input_tokens_seen": 37118304, "step": 55060 }, { "epoch": 1.3452471111328268, "grad_norm": 93.1062240600586, "learning_rate": 1.8309018309456767e-06, "loss": 0.0744, "num_input_tokens_seen": 37121568, "step": 55065 }, { "epoch": 1.345369261964674, "grad_norm": 0.6298710703849792, "learning_rate": 1.8308543778444875e-06, "loss": 0.0465, "num_input_tokens_seen": 37124704, "step": 55070 }, { "epoch": 1.3454914127965212, "grad_norm": 21.240869522094727, "learning_rate": 1.8308069187011017e-06, "loss": 0.1673, "num_input_tokens_seen": 37128288, "step": 55075 }, { "epoch": 1.3456135636283684, "grad_norm": 0.5027111172676086, "learning_rate": 1.8307594535158645e-06, "loss": 0.0441, "num_input_tokens_seen": 37131296, "step": 55080 }, { "epoch": 1.3457357144602153, "grad_norm": 0.4280967116355896, "learning_rate": 1.8307119822891213e-06, "loss": 0.1084, "num_input_tokens_seen": 37134432, "step": 55085 }, { "epoch": 1.3458578652920625, "grad_norm": 40.367759704589844, "learning_rate": 1.830664505021217e-06, "loss": 0.0891, "num_input_tokens_seen": 37137696, "step": 55090 }, { "epoch": 1.3459800161239097, "grad_norm": 0.12468226999044418, "learning_rate": 1.830617021712497e-06, "loss": 0.0556, "num_input_tokens_seen": 37141024, "step": 55095 }, { "epoch": 1.346102166955757, "grad_norm": 10.89808177947998, "learning_rate": 1.8305695323633065e-06, "loss": 0.0951, "num_input_tokens_seen": 37144224, "step": 55100 }, { "epoch": 1.346224317787604, "grad_norm": 4.6593337059021, "learning_rate": 1.830522036973991e-06, "loss": 0.0283, "num_input_tokens_seen": 37147744, "step": 55105 }, { "epoch": 1.3463464686194513, "grad_norm": 10.192591667175293, "learning_rate": 1.830474535544896e-06, "loss": 0.1267, "num_input_tokens_seen": 37150816, "step": 55110 }, { "epoch": 1.3464686194512985, "grad_norm": 1.1582913398742676, "learning_rate": 1.8304270280763667e-06, "loss": 0.1288, "num_input_tokens_seen": 37154016, "step": 55115 }, { "epoch": 1.3465907702831457, "grad_norm": 0.21199598908424377, "learning_rate": 1.8303795145687488e-06, "loss": 0.0154, "num_input_tokens_seen": 37157216, "step": 55120 }, { "epoch": 1.3467129211149929, "grad_norm": 103.62749481201172, "learning_rate": 1.8303319950223877e-06, "loss": 0.0831, "num_input_tokens_seen": 37160608, "step": 55125 }, { "epoch": 1.34683507194684, "grad_norm": 0.7188388705253601, "learning_rate": 1.8302844694376289e-06, "loss": 0.0366, "num_input_tokens_seen": 37164192, "step": 55130 }, { "epoch": 1.3469572227786872, "grad_norm": 1.1744076013565063, "learning_rate": 1.830236937814818e-06, "loss": 0.0646, "num_input_tokens_seen": 37167328, "step": 55135 }, { "epoch": 1.3470793736105344, "grad_norm": 0.013013658113777637, "learning_rate": 1.830189400154301e-06, "loss": 0.0703, "num_input_tokens_seen": 37170784, "step": 55140 }, { "epoch": 1.3472015244423814, "grad_norm": 0.06146685406565666, "learning_rate": 1.8301418564564238e-06, "loss": 0.0018, "num_input_tokens_seen": 37174560, "step": 55145 }, { "epoch": 1.3473236752742286, "grad_norm": 0.30197107791900635, "learning_rate": 1.830094306721531e-06, "loss": 0.0507, "num_input_tokens_seen": 37178016, "step": 55150 }, { "epoch": 1.3474458261060758, "grad_norm": 8.145421028137207, "learning_rate": 1.8300467509499695e-06, "loss": 0.0356, "num_input_tokens_seen": 37181600, "step": 55155 }, { "epoch": 1.347567976937923, "grad_norm": 59.542667388916016, "learning_rate": 1.8299991891420845e-06, "loss": 0.0041, "num_input_tokens_seen": 37184608, "step": 55160 }, { "epoch": 1.3476901277697702, "grad_norm": 2.546189069747925, "learning_rate": 1.8299516212982225e-06, "loss": 0.0263, "num_input_tokens_seen": 37187616, "step": 55165 }, { "epoch": 1.3478122786016173, "grad_norm": 0.07865026593208313, "learning_rate": 1.8299040474187288e-06, "loss": 0.0432, "num_input_tokens_seen": 37190880, "step": 55170 }, { "epoch": 1.3479344294334643, "grad_norm": 0.05466358736157417, "learning_rate": 1.8298564675039499e-06, "loss": 0.1084, "num_input_tokens_seen": 37193696, "step": 55175 }, { "epoch": 1.3480565802653115, "grad_norm": 33.85342788696289, "learning_rate": 1.8298088815542312e-06, "loss": 0.1264, "num_input_tokens_seen": 37196768, "step": 55180 }, { "epoch": 1.3481787310971587, "grad_norm": 14.359149932861328, "learning_rate": 1.8297612895699195e-06, "loss": 0.2703, "num_input_tokens_seen": 37200032, "step": 55185 }, { "epoch": 1.3483008819290059, "grad_norm": 14.786344528198242, "learning_rate": 1.8297136915513605e-06, "loss": 0.2105, "num_input_tokens_seen": 37203488, "step": 55190 }, { "epoch": 1.348423032760853, "grad_norm": 0.13315944373607635, "learning_rate": 1.8296660874989e-06, "loss": 0.0619, "num_input_tokens_seen": 37206880, "step": 55195 }, { "epoch": 1.3485451835927003, "grad_norm": 0.07801298052072525, "learning_rate": 1.829618477412885e-06, "loss": 0.0022, "num_input_tokens_seen": 37210208, "step": 55200 }, { "epoch": 1.3486673344245474, "grad_norm": 0.10913047939538956, "learning_rate": 1.8295708612936611e-06, "loss": 0.0777, "num_input_tokens_seen": 37213408, "step": 55205 }, { "epoch": 1.3487894852563946, "grad_norm": 0.04249902069568634, "learning_rate": 1.8295232391415747e-06, "loss": 0.067, "num_input_tokens_seen": 37216800, "step": 55210 }, { "epoch": 1.3489116360882418, "grad_norm": 9.33828353881836, "learning_rate": 1.8294756109569722e-06, "loss": 0.1029, "num_input_tokens_seen": 37220448, "step": 55215 }, { "epoch": 1.349033786920089, "grad_norm": 0.07323089241981506, "learning_rate": 1.8294279767402001e-06, "loss": 0.0033, "num_input_tokens_seen": 37223584, "step": 55220 }, { "epoch": 1.3491559377519362, "grad_norm": 255.05075073242188, "learning_rate": 1.8293803364916044e-06, "loss": 0.1111, "num_input_tokens_seen": 37226464, "step": 55225 }, { "epoch": 1.3492780885837832, "grad_norm": 0.13870365917682648, "learning_rate": 1.8293326902115323e-06, "loss": 0.2226, "num_input_tokens_seen": 37229728, "step": 55230 }, { "epoch": 1.3494002394156304, "grad_norm": 196.775634765625, "learning_rate": 1.8292850379003294e-06, "loss": 0.1753, "num_input_tokens_seen": 37232864, "step": 55235 }, { "epoch": 1.3495223902474776, "grad_norm": 0.12823425233364105, "learning_rate": 1.8292373795583425e-06, "loss": 0.0801, "num_input_tokens_seen": 37236320, "step": 55240 }, { "epoch": 1.3496445410793247, "grad_norm": 0.6417508125305176, "learning_rate": 1.8291897151859187e-06, "loss": 0.0025, "num_input_tokens_seen": 37239840, "step": 55245 }, { "epoch": 1.349766691911172, "grad_norm": 39.938053131103516, "learning_rate": 1.8291420447834043e-06, "loss": 0.2231, "num_input_tokens_seen": 37243040, "step": 55250 }, { "epoch": 1.3498888427430191, "grad_norm": 0.058599360287189484, "learning_rate": 1.8290943683511457e-06, "loss": 0.0016, "num_input_tokens_seen": 37246112, "step": 55255 }, { "epoch": 1.3500109935748663, "grad_norm": 0.35725781321525574, "learning_rate": 1.8290466858894899e-06, "loss": 0.0456, "num_input_tokens_seen": 37249952, "step": 55260 }, { "epoch": 1.3501331444067133, "grad_norm": 0.5817795395851135, "learning_rate": 1.8289989973987838e-06, "loss": 0.0938, "num_input_tokens_seen": 37253152, "step": 55265 }, { "epoch": 1.3502552952385605, "grad_norm": 0.3643583655357361, "learning_rate": 1.8289513028793739e-06, "loss": 0.1001, "num_input_tokens_seen": 37256352, "step": 55270 }, { "epoch": 1.3503774460704077, "grad_norm": 0.15800811350345612, "learning_rate": 1.8289036023316072e-06, "loss": 0.0461, "num_input_tokens_seen": 37259488, "step": 55275 }, { "epoch": 1.3504995969022549, "grad_norm": 0.039382465183734894, "learning_rate": 1.8288558957558301e-06, "loss": 0.0754, "num_input_tokens_seen": 37262752, "step": 55280 }, { "epoch": 1.350621747734102, "grad_norm": 0.5504450798034668, "learning_rate": 1.8288081831523907e-06, "loss": 0.1319, "num_input_tokens_seen": 37266080, "step": 55285 }, { "epoch": 1.3507438985659492, "grad_norm": 0.5390429496765137, "learning_rate": 1.8287604645216348e-06, "loss": 0.0835, "num_input_tokens_seen": 37269472, "step": 55290 }, { "epoch": 1.3508660493977964, "grad_norm": 0.1276429444551468, "learning_rate": 1.8287127398639102e-06, "loss": 0.0482, "num_input_tokens_seen": 37272544, "step": 55295 }, { "epoch": 1.3509882002296436, "grad_norm": 16.06958770751953, "learning_rate": 1.8286650091795638e-06, "loss": 0.1202, "num_input_tokens_seen": 37276448, "step": 55300 }, { "epoch": 1.3511103510614908, "grad_norm": 0.48938968777656555, "learning_rate": 1.828617272468942e-06, "loss": 0.0391, "num_input_tokens_seen": 37279904, "step": 55305 }, { "epoch": 1.351232501893338, "grad_norm": 0.21368904411792755, "learning_rate": 1.8285695297323928e-06, "loss": 0.0494, "num_input_tokens_seen": 37282784, "step": 55310 }, { "epoch": 1.3513546527251852, "grad_norm": 0.7595393657684326, "learning_rate": 1.828521780970263e-06, "loss": 0.0919, "num_input_tokens_seen": 37285856, "step": 55315 }, { "epoch": 1.3514768035570321, "grad_norm": 10.359819412231445, "learning_rate": 1.8284740261829002e-06, "loss": 0.144, "num_input_tokens_seen": 37288992, "step": 55320 }, { "epoch": 1.3515989543888793, "grad_norm": 0.3968583941459656, "learning_rate": 1.8284262653706515e-06, "loss": 0.0438, "num_input_tokens_seen": 37292320, "step": 55325 }, { "epoch": 1.3517211052207265, "grad_norm": 0.9557807445526123, "learning_rate": 1.8283784985338638e-06, "loss": 0.0744, "num_input_tokens_seen": 37295712, "step": 55330 }, { "epoch": 1.3518432560525737, "grad_norm": 24.249900817871094, "learning_rate": 1.828330725672885e-06, "loss": 0.1685, "num_input_tokens_seen": 37299040, "step": 55335 }, { "epoch": 1.351965406884421, "grad_norm": 1.363347053527832, "learning_rate": 1.8282829467880624e-06, "loss": 0.0024, "num_input_tokens_seen": 37302176, "step": 55340 }, { "epoch": 1.352087557716268, "grad_norm": 0.12959076464176178, "learning_rate": 1.8282351618797435e-06, "loss": 0.0802, "num_input_tokens_seen": 37305312, "step": 55345 }, { "epoch": 1.3522097085481153, "grad_norm": 28.02058219909668, "learning_rate": 1.8281873709482759e-06, "loss": 0.0617, "num_input_tokens_seen": 37308512, "step": 55350 }, { "epoch": 1.3523318593799623, "grad_norm": 6.930314540863037, "learning_rate": 1.8281395739940067e-06, "loss": 0.0833, "num_input_tokens_seen": 37311712, "step": 55355 }, { "epoch": 1.3524540102118094, "grad_norm": 0.5654293298721313, "learning_rate": 1.828091771017284e-06, "loss": 0.0447, "num_input_tokens_seen": 37315424, "step": 55360 }, { "epoch": 1.3525761610436566, "grad_norm": 7.034704685211182, "learning_rate": 1.8280439620184549e-06, "loss": 0.0523, "num_input_tokens_seen": 37318880, "step": 55365 }, { "epoch": 1.3526983118755038, "grad_norm": 0.38624125719070435, "learning_rate": 1.8279961469978676e-06, "loss": 0.0028, "num_input_tokens_seen": 37322208, "step": 55370 }, { "epoch": 1.352820462707351, "grad_norm": 0.046895623207092285, "learning_rate": 1.8279483259558694e-06, "loss": 0.1267, "num_input_tokens_seen": 37326048, "step": 55375 }, { "epoch": 1.3529426135391982, "grad_norm": 0.30466148257255554, "learning_rate": 1.8279004988928085e-06, "loss": 0.0956, "num_input_tokens_seen": 37329504, "step": 55380 }, { "epoch": 1.3530647643710454, "grad_norm": 0.2624111473560333, "learning_rate": 1.8278526658090325e-06, "loss": 0.0289, "num_input_tokens_seen": 37332896, "step": 55385 }, { "epoch": 1.3531869152028926, "grad_norm": 0.18605853617191315, "learning_rate": 1.8278048267048894e-06, "loss": 0.0558, "num_input_tokens_seen": 37336416, "step": 55390 }, { "epoch": 1.3533090660347398, "grad_norm": 0.3882926106452942, "learning_rate": 1.8277569815807266e-06, "loss": 0.044, "num_input_tokens_seen": 37339488, "step": 55395 }, { "epoch": 1.353431216866587, "grad_norm": 16.696517944335938, "learning_rate": 1.8277091304368926e-06, "loss": 0.1294, "num_input_tokens_seen": 37342816, "step": 55400 }, { "epoch": 1.3535533676984342, "grad_norm": 0.25264209508895874, "learning_rate": 1.8276612732737351e-06, "loss": 0.1302, "num_input_tokens_seen": 37346016, "step": 55405 }, { "epoch": 1.3536755185302811, "grad_norm": 21.005630493164062, "learning_rate": 1.8276134100916024e-06, "loss": 0.1187, "num_input_tokens_seen": 37349600, "step": 55410 }, { "epoch": 1.3537976693621283, "grad_norm": 8.199868202209473, "learning_rate": 1.8275655408908421e-06, "loss": 0.1065, "num_input_tokens_seen": 37352608, "step": 55415 }, { "epoch": 1.3539198201939755, "grad_norm": 0.061909351497888565, "learning_rate": 1.8275176656718025e-06, "loss": 0.0017, "num_input_tokens_seen": 37355808, "step": 55420 }, { "epoch": 1.3540419710258227, "grad_norm": 0.8016281723976135, "learning_rate": 1.8274697844348321e-06, "loss": 0.0723, "num_input_tokens_seen": 37359392, "step": 55425 }, { "epoch": 1.3541641218576699, "grad_norm": 0.0208908561617136, "learning_rate": 1.827421897180279e-06, "loss": 0.0567, "num_input_tokens_seen": 37362720, "step": 55430 }, { "epoch": 1.354286272689517, "grad_norm": 0.1459772139787674, "learning_rate": 1.827374003908491e-06, "loss": 0.074, "num_input_tokens_seen": 37366112, "step": 55435 }, { "epoch": 1.3544084235213643, "grad_norm": 101.6792984008789, "learning_rate": 1.8273261046198169e-06, "loss": 0.1579, "num_input_tokens_seen": 37369056, "step": 55440 }, { "epoch": 1.3545305743532112, "grad_norm": 0.41175195574760437, "learning_rate": 1.8272781993146046e-06, "loss": 0.0733, "num_input_tokens_seen": 37372192, "step": 55445 }, { "epoch": 1.3546527251850584, "grad_norm": 0.5852706432342529, "learning_rate": 1.827230287993203e-06, "loss": 0.0467, "num_input_tokens_seen": 37375712, "step": 55450 }, { "epoch": 1.3547748760169056, "grad_norm": 2.3407347202301025, "learning_rate": 1.8271823706559602e-06, "loss": 0.1188, "num_input_tokens_seen": 37379104, "step": 55455 }, { "epoch": 1.3548970268487528, "grad_norm": 72.70645141601562, "learning_rate": 1.8271344473032246e-06, "loss": 0.0039, "num_input_tokens_seen": 37382496, "step": 55460 }, { "epoch": 1.3550191776806, "grad_norm": 16.15511131286621, "learning_rate": 1.827086517935345e-06, "loss": 0.1075, "num_input_tokens_seen": 37385888, "step": 55465 }, { "epoch": 1.3551413285124472, "grad_norm": 0.5082676410675049, "learning_rate": 1.8270385825526698e-06, "loss": 0.0014, "num_input_tokens_seen": 37389408, "step": 55470 }, { "epoch": 1.3552634793442944, "grad_norm": 0.10025462508201599, "learning_rate": 1.8269906411555473e-06, "loss": 0.1457, "num_input_tokens_seen": 37393376, "step": 55475 }, { "epoch": 1.3553856301761416, "grad_norm": 0.05350198969244957, "learning_rate": 1.8269426937443266e-06, "loss": 0.0955, "num_input_tokens_seen": 37396832, "step": 55480 }, { "epoch": 1.3555077810079887, "grad_norm": 0.14534598588943481, "learning_rate": 1.8268947403193562e-06, "loss": 0.0859, "num_input_tokens_seen": 37400288, "step": 55485 }, { "epoch": 1.355629931839836, "grad_norm": 6.821413993835449, "learning_rate": 1.8268467808809849e-06, "loss": 0.0015, "num_input_tokens_seen": 37403872, "step": 55490 }, { "epoch": 1.3557520826716831, "grad_norm": 5.831641674041748, "learning_rate": 1.8267988154295612e-06, "loss": 0.1744, "num_input_tokens_seen": 37407136, "step": 55495 }, { "epoch": 1.35587423350353, "grad_norm": 37.35738754272461, "learning_rate": 1.8267508439654345e-06, "loss": 0.1595, "num_input_tokens_seen": 37410976, "step": 55500 }, { "epoch": 1.3559963843353773, "grad_norm": 12.805477142333984, "learning_rate": 1.826702866488953e-06, "loss": 0.0964, "num_input_tokens_seen": 37414816, "step": 55505 }, { "epoch": 1.3561185351672245, "grad_norm": 30.72103500366211, "learning_rate": 1.826654883000466e-06, "loss": 0.1445, "num_input_tokens_seen": 37418272, "step": 55510 }, { "epoch": 1.3562406859990717, "grad_norm": 16.386653900146484, "learning_rate": 1.8266068935003226e-06, "loss": 0.0936, "num_input_tokens_seen": 37421792, "step": 55515 }, { "epoch": 1.3563628368309188, "grad_norm": 0.3761710822582245, "learning_rate": 1.826558897988871e-06, "loss": 0.0016, "num_input_tokens_seen": 37425056, "step": 55520 }, { "epoch": 1.356484987662766, "grad_norm": 7.814873695373535, "learning_rate": 1.8265108964664608e-06, "loss": 0.0576, "num_input_tokens_seen": 37428256, "step": 55525 }, { "epoch": 1.3566071384946132, "grad_norm": 0.6091344356536865, "learning_rate": 1.8264628889334414e-06, "loss": 0.0034, "num_input_tokens_seen": 37431968, "step": 55530 }, { "epoch": 1.3567292893264602, "grad_norm": 0.08361712843179703, "learning_rate": 1.8264148753901616e-06, "loss": 0.0277, "num_input_tokens_seen": 37436192, "step": 55535 }, { "epoch": 1.3568514401583074, "grad_norm": 0.1606522798538208, "learning_rate": 1.8263668558369703e-06, "loss": 0.0666, "num_input_tokens_seen": 37439456, "step": 55540 }, { "epoch": 1.3569735909901546, "grad_norm": 36.44951629638672, "learning_rate": 1.8263188302742173e-06, "loss": 0.0879, "num_input_tokens_seen": 37442912, "step": 55545 }, { "epoch": 1.3570957418220018, "grad_norm": 16.47828483581543, "learning_rate": 1.8262707987022512e-06, "loss": 0.105, "num_input_tokens_seen": 37445856, "step": 55550 }, { "epoch": 1.357217892653849, "grad_norm": 0.06848545372486115, "learning_rate": 1.8262227611214218e-06, "loss": 0.1269, "num_input_tokens_seen": 37449184, "step": 55555 }, { "epoch": 1.3573400434856961, "grad_norm": 10.186595916748047, "learning_rate": 1.826174717532078e-06, "loss": 0.061, "num_input_tokens_seen": 37452192, "step": 55560 }, { "epoch": 1.3574621943175433, "grad_norm": 1.475244164466858, "learning_rate": 1.8261266679345696e-06, "loss": 0.1319, "num_input_tokens_seen": 37455392, "step": 55565 }, { "epoch": 1.3575843451493905, "grad_norm": 0.04712071269750595, "learning_rate": 1.8260786123292458e-06, "loss": 0.0028, "num_input_tokens_seen": 37458784, "step": 55570 }, { "epoch": 1.3577064959812377, "grad_norm": 0.3326287567615509, "learning_rate": 1.8260305507164565e-06, "loss": 0.0717, "num_input_tokens_seen": 37461984, "step": 55575 }, { "epoch": 1.357828646813085, "grad_norm": 100.86023712158203, "learning_rate": 1.8259824830965504e-06, "loss": 0.2017, "num_input_tokens_seen": 37465888, "step": 55580 }, { "epoch": 1.357950797644932, "grad_norm": 21.718538284301758, "learning_rate": 1.8259344094698777e-06, "loss": 0.0853, "num_input_tokens_seen": 37469664, "step": 55585 }, { "epoch": 1.358072948476779, "grad_norm": 0.4292604327201843, "learning_rate": 1.8258863298367877e-06, "loss": 0.0399, "num_input_tokens_seen": 37473056, "step": 55590 }, { "epoch": 1.3581950993086263, "grad_norm": 0.042389508336782455, "learning_rate": 1.8258382441976306e-06, "loss": 0.0738, "num_input_tokens_seen": 37476192, "step": 55595 }, { "epoch": 1.3583172501404734, "grad_norm": 0.13057366013526917, "learning_rate": 1.8257901525527553e-06, "loss": 0.0733, "num_input_tokens_seen": 37479520, "step": 55600 }, { "epoch": 1.3584394009723206, "grad_norm": 0.26481014490127563, "learning_rate": 1.8257420549025117e-06, "loss": 0.0521, "num_input_tokens_seen": 37482528, "step": 55605 }, { "epoch": 1.3585615518041678, "grad_norm": 0.13100023567676544, "learning_rate": 1.82569395124725e-06, "loss": 0.002, "num_input_tokens_seen": 37486048, "step": 55610 }, { "epoch": 1.358683702636015, "grad_norm": 0.2691006660461426, "learning_rate": 1.82564584158732e-06, "loss": 0.1526, "num_input_tokens_seen": 37489120, "step": 55615 }, { "epoch": 1.358805853467862, "grad_norm": 0.04777399078011513, "learning_rate": 1.8255977259230714e-06, "loss": 0.0569, "num_input_tokens_seen": 37492256, "step": 55620 }, { "epoch": 1.3589280042997092, "grad_norm": 5.601587772369385, "learning_rate": 1.8255496042548537e-06, "loss": 0.0938, "num_input_tokens_seen": 37495392, "step": 55625 }, { "epoch": 1.3590501551315564, "grad_norm": 0.3111015856266022, "learning_rate": 1.8255014765830174e-06, "loss": 0.0346, "num_input_tokens_seen": 37500512, "step": 55630 }, { "epoch": 1.3591723059634035, "grad_norm": 20.101476669311523, "learning_rate": 1.8254533429079125e-06, "loss": 0.12, "num_input_tokens_seen": 37504288, "step": 55635 }, { "epoch": 1.3592944567952507, "grad_norm": 0.36264368891716003, "learning_rate": 1.8254052032298886e-06, "loss": 0.0534, "num_input_tokens_seen": 37507744, "step": 55640 }, { "epoch": 1.359416607627098, "grad_norm": 0.7125762104988098, "learning_rate": 1.8253570575492963e-06, "loss": 0.133, "num_input_tokens_seen": 37511136, "step": 55645 }, { "epoch": 1.3595387584589451, "grad_norm": 3.620149612426758, "learning_rate": 1.8253089058664852e-06, "loss": 0.0398, "num_input_tokens_seen": 37514272, "step": 55650 }, { "epoch": 1.3596609092907923, "grad_norm": 0.6286777257919312, "learning_rate": 1.825260748181806e-06, "loss": 0.0322, "num_input_tokens_seen": 37517152, "step": 55655 }, { "epoch": 1.3597830601226395, "grad_norm": 0.07149165123701096, "learning_rate": 1.8252125844956083e-06, "loss": 0.1159, "num_input_tokens_seen": 37520480, "step": 55660 }, { "epoch": 1.3599052109544867, "grad_norm": 0.33753979206085205, "learning_rate": 1.8251644148082433e-06, "loss": 0.0401, "num_input_tokens_seen": 37523552, "step": 55665 }, { "epoch": 1.3600273617863339, "grad_norm": 1.23895263671875, "learning_rate": 1.8251162391200604e-06, "loss": 0.0273, "num_input_tokens_seen": 37527200, "step": 55670 }, { "epoch": 1.360149512618181, "grad_norm": 0.11427648365497589, "learning_rate": 1.8250680574314101e-06, "loss": 0.0507, "num_input_tokens_seen": 37530528, "step": 55675 }, { "epoch": 1.360271663450028, "grad_norm": 39.01353073120117, "learning_rate": 1.8250198697426434e-06, "loss": 0.1472, "num_input_tokens_seen": 37533664, "step": 55680 }, { "epoch": 1.3603938142818752, "grad_norm": 0.10856301337480545, "learning_rate": 1.82497167605411e-06, "loss": 0.0013, "num_input_tokens_seen": 37537120, "step": 55685 }, { "epoch": 1.3605159651137224, "grad_norm": 21.2423038482666, "learning_rate": 1.8249234763661608e-06, "loss": 0.2504, "num_input_tokens_seen": 37540256, "step": 55690 }, { "epoch": 1.3606381159455696, "grad_norm": 0.3467349708080292, "learning_rate": 1.8248752706791461e-06, "loss": 0.096, "num_input_tokens_seen": 37544096, "step": 55695 }, { "epoch": 1.3607602667774168, "grad_norm": 0.029728004708886147, "learning_rate": 1.8248270589934167e-06, "loss": 0.1089, "num_input_tokens_seen": 37547424, "step": 55700 }, { "epoch": 1.360882417609264, "grad_norm": 0.06764955818653107, "learning_rate": 1.824778841309323e-06, "loss": 0.0008, "num_input_tokens_seen": 37551264, "step": 55705 }, { "epoch": 1.361004568441111, "grad_norm": 0.18158482015132904, "learning_rate": 1.8247306176272157e-06, "loss": 0.0011, "num_input_tokens_seen": 37554400, "step": 55710 }, { "epoch": 1.3611267192729581, "grad_norm": 161.26670837402344, "learning_rate": 1.8246823879474458e-06, "loss": 0.1135, "num_input_tokens_seen": 37558048, "step": 55715 }, { "epoch": 1.3612488701048053, "grad_norm": 0.35041800141334534, "learning_rate": 1.8246341522703635e-06, "loss": 0.0316, "num_input_tokens_seen": 37561376, "step": 55720 }, { "epoch": 1.3613710209366525, "grad_norm": 0.19455307722091675, "learning_rate": 1.8245859105963197e-06, "loss": 0.0492, "num_input_tokens_seen": 37564768, "step": 55725 }, { "epoch": 1.3614931717684997, "grad_norm": 18.347190856933594, "learning_rate": 1.8245376629256657e-06, "loss": 0.1975, "num_input_tokens_seen": 37567968, "step": 55730 }, { "epoch": 1.361615322600347, "grad_norm": 0.7373723387718201, "learning_rate": 1.8244894092587517e-06, "loss": 0.041, "num_input_tokens_seen": 37571360, "step": 55735 }, { "epoch": 1.361737473432194, "grad_norm": 0.1448211967945099, "learning_rate": 1.8244411495959291e-06, "loss": 0.0009, "num_input_tokens_seen": 37574752, "step": 55740 }, { "epoch": 1.3618596242640413, "grad_norm": 0.23177365958690643, "learning_rate": 1.8243928839375488e-06, "loss": 0.142, "num_input_tokens_seen": 37578016, "step": 55745 }, { "epoch": 1.3619817750958885, "grad_norm": 0.027521274983882904, "learning_rate": 1.8243446122839615e-06, "loss": 0.0089, "num_input_tokens_seen": 37581408, "step": 55750 }, { "epoch": 1.3621039259277357, "grad_norm": 0.07896178960800171, "learning_rate": 1.8242963346355187e-06, "loss": 0.076, "num_input_tokens_seen": 37584864, "step": 55755 }, { "epoch": 1.3622260767595828, "grad_norm": 0.21244436502456665, "learning_rate": 1.8242480509925713e-06, "loss": 0.0738, "num_input_tokens_seen": 37588192, "step": 55760 }, { "epoch": 1.3623482275914298, "grad_norm": 8.31706428527832, "learning_rate": 1.8241997613554702e-06, "loss": 0.1124, "num_input_tokens_seen": 37591584, "step": 55765 }, { "epoch": 1.362470378423277, "grad_norm": 165.4110107421875, "learning_rate": 1.8241514657245669e-06, "loss": 0.1641, "num_input_tokens_seen": 37594912, "step": 55770 }, { "epoch": 1.3625925292551242, "grad_norm": 0.0612252838909626, "learning_rate": 1.8241031641002125e-06, "loss": 0.0662, "num_input_tokens_seen": 37598240, "step": 55775 }, { "epoch": 1.3627146800869714, "grad_norm": 1.860999584197998, "learning_rate": 1.8240548564827577e-06, "loss": 0.0031, "num_input_tokens_seen": 37601312, "step": 55780 }, { "epoch": 1.3628368309188186, "grad_norm": 0.02734232135117054, "learning_rate": 1.8240065428725552e-06, "loss": 0.0013, "num_input_tokens_seen": 37604640, "step": 55785 }, { "epoch": 1.3629589817506658, "grad_norm": 6.640567779541016, "learning_rate": 1.823958223269955e-06, "loss": 0.0404, "num_input_tokens_seen": 37608032, "step": 55790 }, { "epoch": 1.363081132582513, "grad_norm": 0.0057974387891590595, "learning_rate": 1.823909897675309e-06, "loss": 0.0571, "num_input_tokens_seen": 37611296, "step": 55795 }, { "epoch": 1.36320328341436, "grad_norm": 1.323617696762085, "learning_rate": 1.8238615660889685e-06, "loss": 0.0555, "num_input_tokens_seen": 37614432, "step": 55800 }, { "epoch": 1.3633254342462071, "grad_norm": 0.33827343583106995, "learning_rate": 1.8238132285112853e-06, "loss": 0.0917, "num_input_tokens_seen": 37618080, "step": 55805 }, { "epoch": 1.3634475850780543, "grad_norm": 93.0377197265625, "learning_rate": 1.8237648849426103e-06, "loss": 0.0064, "num_input_tokens_seen": 37621152, "step": 55810 }, { "epoch": 1.3635697359099015, "grad_norm": 0.7247235178947449, "learning_rate": 1.823716535383296e-06, "loss": 0.3132, "num_input_tokens_seen": 37624352, "step": 55815 }, { "epoch": 1.3636918867417487, "grad_norm": 77.18084716796875, "learning_rate": 1.8236681798336935e-06, "loss": 0.1327, "num_input_tokens_seen": 37627680, "step": 55820 }, { "epoch": 1.3638140375735959, "grad_norm": 0.5121102929115295, "learning_rate": 1.8236198182941543e-06, "loss": 0.0672, "num_input_tokens_seen": 37631008, "step": 55825 }, { "epoch": 1.363936188405443, "grad_norm": 0.45688682794570923, "learning_rate": 1.8235714507650302e-06, "loss": 0.0468, "num_input_tokens_seen": 37634784, "step": 55830 }, { "epoch": 1.3640583392372903, "grad_norm": 19.641523361206055, "learning_rate": 1.823523077246673e-06, "loss": 0.0416, "num_input_tokens_seen": 37637856, "step": 55835 }, { "epoch": 1.3641804900691374, "grad_norm": 5.420958042144775, "learning_rate": 1.8234746977394346e-06, "loss": 0.0454, "num_input_tokens_seen": 37641312, "step": 55840 }, { "epoch": 1.3643026409009846, "grad_norm": 103.85394287109375, "learning_rate": 1.8234263122436667e-06, "loss": 0.1019, "num_input_tokens_seen": 37644960, "step": 55845 }, { "epoch": 1.3644247917328318, "grad_norm": 0.18337062001228333, "learning_rate": 1.8233779207597211e-06, "loss": 0.0462, "num_input_tokens_seen": 37648096, "step": 55850 }, { "epoch": 1.3645469425646788, "grad_norm": 0.46553394198417664, "learning_rate": 1.8233295232879497e-06, "loss": 0.0809, "num_input_tokens_seen": 37651488, "step": 55855 }, { "epoch": 1.364669093396526, "grad_norm": 112.44773864746094, "learning_rate": 1.8232811198287048e-06, "loss": 0.1134, "num_input_tokens_seen": 37654368, "step": 55860 }, { "epoch": 1.3647912442283732, "grad_norm": 10.225543022155762, "learning_rate": 1.823232710382338e-06, "loss": 0.1791, "num_input_tokens_seen": 37657696, "step": 55865 }, { "epoch": 1.3649133950602204, "grad_norm": 0.09325478971004486, "learning_rate": 1.8231842949492016e-06, "loss": 0.1053, "num_input_tokens_seen": 37660896, "step": 55870 }, { "epoch": 1.3650355458920675, "grad_norm": 0.08168121427297592, "learning_rate": 1.8231358735296475e-06, "loss": 0.0018, "num_input_tokens_seen": 37664224, "step": 55875 }, { "epoch": 1.3651576967239147, "grad_norm": 9.982057571411133, "learning_rate": 1.823087446124028e-06, "loss": 0.0786, "num_input_tokens_seen": 37667424, "step": 55880 }, { "epoch": 1.365279847555762, "grad_norm": 9.831002235412598, "learning_rate": 1.8230390127326954e-06, "loss": 0.1927, "num_input_tokens_seen": 37670560, "step": 55885 }, { "epoch": 1.365401998387609, "grad_norm": 9.530682563781738, "learning_rate": 1.8229905733560011e-06, "loss": 0.0488, "num_input_tokens_seen": 37673696, "step": 55890 }, { "epoch": 1.365524149219456, "grad_norm": 0.12205583602190018, "learning_rate": 1.8229421279942985e-06, "loss": 0.0618, "num_input_tokens_seen": 37676640, "step": 55895 }, { "epoch": 1.3656463000513033, "grad_norm": 165.70962524414062, "learning_rate": 1.8228936766479394e-06, "loss": 0.1043, "num_input_tokens_seen": 37679776, "step": 55900 }, { "epoch": 1.3657684508831505, "grad_norm": 0.35768038034439087, "learning_rate": 1.822845219317276e-06, "loss": 0.0794, "num_input_tokens_seen": 37683104, "step": 55905 }, { "epoch": 1.3658906017149977, "grad_norm": 8.801888465881348, "learning_rate": 1.822796756002661e-06, "loss": 0.1115, "num_input_tokens_seen": 37686496, "step": 55910 }, { "epoch": 1.3660127525468448, "grad_norm": 0.4240803122520447, "learning_rate": 1.8227482867044466e-06, "loss": 0.0018, "num_input_tokens_seen": 37689824, "step": 55915 }, { "epoch": 1.366134903378692, "grad_norm": 21.410913467407227, "learning_rate": 1.8226998114229852e-06, "loss": 0.1187, "num_input_tokens_seen": 37693088, "step": 55920 }, { "epoch": 1.3662570542105392, "grad_norm": 0.18422946333885193, "learning_rate": 1.8226513301586298e-06, "loss": 0.001, "num_input_tokens_seen": 37696544, "step": 55925 }, { "epoch": 1.3663792050423864, "grad_norm": 13.039327621459961, "learning_rate": 1.8226028429117326e-06, "loss": 0.1704, "num_input_tokens_seen": 37700192, "step": 55930 }, { "epoch": 1.3665013558742336, "grad_norm": 2.682344436645508, "learning_rate": 1.8225543496826461e-06, "loss": 0.107, "num_input_tokens_seen": 37703456, "step": 55935 }, { "epoch": 1.3666235067060808, "grad_norm": 0.18516357243061066, "learning_rate": 1.8225058504717232e-06, "loss": 0.0997, "num_input_tokens_seen": 37706912, "step": 55940 }, { "epoch": 1.3667456575379278, "grad_norm": 43.21474075317383, "learning_rate": 1.8224573452793166e-06, "loss": 0.1256, "num_input_tokens_seen": 37710240, "step": 55945 }, { "epoch": 1.366867808369775, "grad_norm": 0.15679652988910675, "learning_rate": 1.822408834105779e-06, "loss": 0.0811, "num_input_tokens_seen": 37714144, "step": 55950 }, { "epoch": 1.3669899592016221, "grad_norm": 0.360243022441864, "learning_rate": 1.822360316951463e-06, "loss": 0.0367, "num_input_tokens_seen": 37717472, "step": 55955 }, { "epoch": 1.3671121100334693, "grad_norm": 0.2765692174434662, "learning_rate": 1.8223117938167217e-06, "loss": 0.055, "num_input_tokens_seen": 37720864, "step": 55960 }, { "epoch": 1.3672342608653165, "grad_norm": 0.22037814557552338, "learning_rate": 1.8222632647019079e-06, "loss": 0.0378, "num_input_tokens_seen": 37724064, "step": 55965 }, { "epoch": 1.3673564116971637, "grad_norm": 1.3900530338287354, "learning_rate": 1.8222147296073741e-06, "loss": 0.0034, "num_input_tokens_seen": 37727008, "step": 55970 }, { "epoch": 1.367478562529011, "grad_norm": 14.512275695800781, "learning_rate": 1.8221661885334741e-06, "loss": 0.1093, "num_input_tokens_seen": 37730080, "step": 55975 }, { "epoch": 1.3676007133608579, "grad_norm": 0.17701953649520874, "learning_rate": 1.8221176414805602e-06, "loss": 0.0725, "num_input_tokens_seen": 37733344, "step": 55980 }, { "epoch": 1.367722864192705, "grad_norm": 0.06762678176164627, "learning_rate": 1.8220690884489857e-06, "loss": 0.0012, "num_input_tokens_seen": 37736480, "step": 55985 }, { "epoch": 1.3678450150245522, "grad_norm": 0.10125814378261566, "learning_rate": 1.8220205294391037e-06, "loss": 0.1238, "num_input_tokens_seen": 37739936, "step": 55990 }, { "epoch": 1.3679671658563994, "grad_norm": 13.808995246887207, "learning_rate": 1.8219719644512672e-06, "loss": 0.0506, "num_input_tokens_seen": 37743008, "step": 55995 }, { "epoch": 1.3680893166882466, "grad_norm": 19.54206085205078, "learning_rate": 1.82192339348583e-06, "loss": 0.0047, "num_input_tokens_seen": 37746400, "step": 56000 }, { "epoch": 1.3682114675200938, "grad_norm": 95.54154205322266, "learning_rate": 1.8218748165431444e-06, "loss": 0.0508, "num_input_tokens_seen": 37750432, "step": 56005 }, { "epoch": 1.368333618351941, "grad_norm": 20.368661880493164, "learning_rate": 1.821826233623564e-06, "loss": 0.2266, "num_input_tokens_seen": 37754208, "step": 56010 }, { "epoch": 1.3684557691837882, "grad_norm": 26.827817916870117, "learning_rate": 1.8217776447274424e-06, "loss": 0.0979, "num_input_tokens_seen": 37757408, "step": 56015 }, { "epoch": 1.3685779200156354, "grad_norm": 14.997940063476562, "learning_rate": 1.8217290498551326e-06, "loss": 0.2237, "num_input_tokens_seen": 37760736, "step": 56020 }, { "epoch": 1.3687000708474826, "grad_norm": 9.052905082702637, "learning_rate": 1.8216804490069882e-06, "loss": 0.0844, "num_input_tokens_seen": 37763744, "step": 56025 }, { "epoch": 1.3688222216793298, "grad_norm": 0.05417915806174278, "learning_rate": 1.8216318421833625e-06, "loss": 0.1457, "num_input_tokens_seen": 37766880, "step": 56030 }, { "epoch": 1.3689443725111767, "grad_norm": 18.598569869995117, "learning_rate": 1.821583229384609e-06, "loss": 0.13, "num_input_tokens_seen": 37770016, "step": 56035 }, { "epoch": 1.369066523343024, "grad_norm": 17.506576538085938, "learning_rate": 1.8215346106110814e-06, "loss": 0.2275, "num_input_tokens_seen": 37773024, "step": 56040 }, { "epoch": 1.3691886741748711, "grad_norm": 33.39712905883789, "learning_rate": 1.8214859858631333e-06, "loss": 0.0569, "num_input_tokens_seen": 37775968, "step": 56045 }, { "epoch": 1.3693108250067183, "grad_norm": 19.74469757080078, "learning_rate": 1.8214373551411177e-06, "loss": 0.1, "num_input_tokens_seen": 37779296, "step": 56050 }, { "epoch": 1.3694329758385655, "grad_norm": 49.746131896972656, "learning_rate": 1.8213887184453892e-06, "loss": 0.19, "num_input_tokens_seen": 37782944, "step": 56055 }, { "epoch": 1.3695551266704127, "grad_norm": 37.75736999511719, "learning_rate": 1.8213400757763009e-06, "loss": 0.153, "num_input_tokens_seen": 37786912, "step": 56060 }, { "epoch": 1.3696772775022599, "grad_norm": 0.30505508184432983, "learning_rate": 1.8212914271342064e-06, "loss": 0.0899, "num_input_tokens_seen": 37790368, "step": 56065 }, { "epoch": 1.3697994283341068, "grad_norm": 0.30792438983917236, "learning_rate": 1.8212427725194599e-06, "loss": 0.0811, "num_input_tokens_seen": 37793696, "step": 56070 }, { "epoch": 1.369921579165954, "grad_norm": 20.89712905883789, "learning_rate": 1.821194111932415e-06, "loss": 0.0796, "num_input_tokens_seen": 37797088, "step": 56075 }, { "epoch": 1.3700437299978012, "grad_norm": 61.594722747802734, "learning_rate": 1.821145445373426e-06, "loss": 0.0422, "num_input_tokens_seen": 37800480, "step": 56080 }, { "epoch": 1.3701658808296484, "grad_norm": 8.598461151123047, "learning_rate": 1.8210967728428458e-06, "loss": 0.1484, "num_input_tokens_seen": 37803488, "step": 56085 }, { "epoch": 1.3702880316614956, "grad_norm": 0.8642435073852539, "learning_rate": 1.8210480943410296e-06, "loss": 0.0048, "num_input_tokens_seen": 37806496, "step": 56090 }, { "epoch": 1.3704101824933428, "grad_norm": 9.462559700012207, "learning_rate": 1.8209994098683306e-06, "loss": 0.1011, "num_input_tokens_seen": 37809888, "step": 56095 }, { "epoch": 1.37053233332519, "grad_norm": 10.7000732421875, "learning_rate": 1.8209507194251033e-06, "loss": 0.1184, "num_input_tokens_seen": 37813344, "step": 56100 }, { "epoch": 1.3706544841570372, "grad_norm": 25.909149169921875, "learning_rate": 1.8209020230117012e-06, "loss": 0.1454, "num_input_tokens_seen": 37816480, "step": 56105 }, { "epoch": 1.3707766349888844, "grad_norm": 0.681384265422821, "learning_rate": 1.8208533206284788e-06, "loss": 0.0406, "num_input_tokens_seen": 37820192, "step": 56110 }, { "epoch": 1.3708987858207315, "grad_norm": 0.4287295937538147, "learning_rate": 1.8208046122757903e-06, "loss": 0.0023, "num_input_tokens_seen": 37823904, "step": 56115 }, { "epoch": 1.3710209366525787, "grad_norm": 0.18749327957630157, "learning_rate": 1.8207558979539903e-06, "loss": 0.0481, "num_input_tokens_seen": 37827488, "step": 56120 }, { "epoch": 1.3711430874844257, "grad_norm": 32.42850875854492, "learning_rate": 1.820707177663432e-06, "loss": 0.1668, "num_input_tokens_seen": 37830816, "step": 56125 }, { "epoch": 1.371265238316273, "grad_norm": 0.08906259387731552, "learning_rate": 1.8206584514044709e-06, "loss": 0.1436, "num_input_tokens_seen": 37834016, "step": 56130 }, { "epoch": 1.37138738914812, "grad_norm": 51.5129508972168, "learning_rate": 1.8206097191774608e-06, "loss": 0.0594, "num_input_tokens_seen": 37837024, "step": 56135 }, { "epoch": 1.3715095399799673, "grad_norm": 0.3601904511451721, "learning_rate": 1.820560980982756e-06, "loss": 0.0926, "num_input_tokens_seen": 37840288, "step": 56140 }, { "epoch": 1.3716316908118145, "grad_norm": 5.214352607727051, "learning_rate": 1.8205122368207107e-06, "loss": 0.0034, "num_input_tokens_seen": 37843680, "step": 56145 }, { "epoch": 1.3717538416436617, "grad_norm": 0.2316035032272339, "learning_rate": 1.82046348669168e-06, "loss": 0.0444, "num_input_tokens_seen": 37846752, "step": 56150 }, { "epoch": 1.3718759924755086, "grad_norm": 0.059439271688461304, "learning_rate": 1.8204147305960182e-06, "loss": 0.0316, "num_input_tokens_seen": 37849824, "step": 56155 }, { "epoch": 1.3719981433073558, "grad_norm": 0.20835110545158386, "learning_rate": 1.8203659685340797e-06, "loss": 0.0528, "num_input_tokens_seen": 37853088, "step": 56160 }, { "epoch": 1.372120294139203, "grad_norm": 0.10169114917516708, "learning_rate": 1.8203172005062194e-06, "loss": 0.0712, "num_input_tokens_seen": 37856416, "step": 56165 }, { "epoch": 1.3722424449710502, "grad_norm": 34.08015441894531, "learning_rate": 1.8202684265127916e-06, "loss": 0.068, "num_input_tokens_seen": 37859488, "step": 56170 }, { "epoch": 1.3723645958028974, "grad_norm": 29.192033767700195, "learning_rate": 1.8202196465541513e-06, "loss": 0.0865, "num_input_tokens_seen": 37862624, "step": 56175 }, { "epoch": 1.3724867466347446, "grad_norm": 0.05454200878739357, "learning_rate": 1.820170860630653e-06, "loss": 0.1067, "num_input_tokens_seen": 37865760, "step": 56180 }, { "epoch": 1.3726088974665918, "grad_norm": 22.10439682006836, "learning_rate": 1.8201220687426515e-06, "loss": 0.1904, "num_input_tokens_seen": 37869024, "step": 56185 }, { "epoch": 1.372731048298439, "grad_norm": 56.317832946777344, "learning_rate": 1.8200732708905018e-06, "loss": 0.0204, "num_input_tokens_seen": 37872672, "step": 56190 }, { "epoch": 1.3728531991302861, "grad_norm": 0.6248363852500916, "learning_rate": 1.820024467074559e-06, "loss": 0.0023, "num_input_tokens_seen": 37875936, "step": 56195 }, { "epoch": 1.3729753499621333, "grad_norm": 0.26846882700920105, "learning_rate": 1.8199756572951775e-06, "loss": 0.0177, "num_input_tokens_seen": 37879008, "step": 56200 }, { "epoch": 1.3730975007939805, "grad_norm": 0.04101533815264702, "learning_rate": 1.8199268415527125e-06, "loss": 0.029, "num_input_tokens_seen": 37881952, "step": 56205 }, { "epoch": 1.3732196516258277, "grad_norm": 1.919983983039856, "learning_rate": 1.8198780198475189e-06, "loss": 0.0261, "num_input_tokens_seen": 37885088, "step": 56210 }, { "epoch": 1.3733418024576747, "grad_norm": 0.13918881118297577, "learning_rate": 1.8198291921799519e-06, "loss": 0.0503, "num_input_tokens_seen": 37888096, "step": 56215 }, { "epoch": 1.3734639532895219, "grad_norm": 0.4293416738510132, "learning_rate": 1.8197803585503665e-06, "loss": 0.0466, "num_input_tokens_seen": 37891616, "step": 56220 }, { "epoch": 1.373586104121369, "grad_norm": 0.013383172452449799, "learning_rate": 1.8197315189591175e-06, "loss": 0.1277, "num_input_tokens_seen": 37895136, "step": 56225 }, { "epoch": 1.3737082549532162, "grad_norm": 90.03067779541016, "learning_rate": 1.8196826734065608e-06, "loss": 0.1435, "num_input_tokens_seen": 37898336, "step": 56230 }, { "epoch": 1.3738304057850634, "grad_norm": 29.73512077331543, "learning_rate": 1.8196338218930513e-06, "loss": 0.1152, "num_input_tokens_seen": 37901856, "step": 56235 }, { "epoch": 1.3739525566169106, "grad_norm": 0.12980686128139496, "learning_rate": 1.819584964418944e-06, "loss": 0.0549, "num_input_tokens_seen": 37904928, "step": 56240 }, { "epoch": 1.3740747074487576, "grad_norm": 22.57590675354004, "learning_rate": 1.8195361009845945e-06, "loss": 0.1053, "num_input_tokens_seen": 37908448, "step": 56245 }, { "epoch": 1.3741968582806048, "grad_norm": 0.1411312073469162, "learning_rate": 1.819487231590358e-06, "loss": 0.0571, "num_input_tokens_seen": 37912352, "step": 56250 }, { "epoch": 1.374319009112452, "grad_norm": 36.76695251464844, "learning_rate": 1.8194383562365898e-06, "loss": 0.1285, "num_input_tokens_seen": 37916128, "step": 56255 }, { "epoch": 1.3744411599442992, "grad_norm": 32.090065002441406, "learning_rate": 1.8193894749236458e-06, "loss": 0.0941, "num_input_tokens_seen": 37919904, "step": 56260 }, { "epoch": 1.3745633107761464, "grad_norm": 0.1330694556236267, "learning_rate": 1.8193405876518808e-06, "loss": 0.0929, "num_input_tokens_seen": 37923232, "step": 56265 }, { "epoch": 1.3746854616079935, "grad_norm": 1.796851396560669, "learning_rate": 1.8192916944216507e-06, "loss": 0.085, "num_input_tokens_seen": 37926688, "step": 56270 }, { "epoch": 1.3748076124398407, "grad_norm": 0.1381426304578781, "learning_rate": 1.8192427952333112e-06, "loss": 0.0989, "num_input_tokens_seen": 37929888, "step": 56275 }, { "epoch": 1.374929763271688, "grad_norm": 0.1486775428056717, "learning_rate": 1.8191938900872177e-06, "loss": 0.0382, "num_input_tokens_seen": 37932896, "step": 56280 }, { "epoch": 1.3750519141035351, "grad_norm": 0.06270882487297058, "learning_rate": 1.8191449789837258e-06, "loss": 0.002, "num_input_tokens_seen": 37936736, "step": 56285 }, { "epoch": 1.3751740649353823, "grad_norm": 9.44651985168457, "learning_rate": 1.8190960619231915e-06, "loss": 0.0336, "num_input_tokens_seen": 37940576, "step": 56290 }, { "epoch": 1.3752962157672295, "grad_norm": 65.55650329589844, "learning_rate": 1.81904713890597e-06, "loss": 0.0731, "num_input_tokens_seen": 37943840, "step": 56295 }, { "epoch": 1.3754183665990765, "grad_norm": 2.183594226837158, "learning_rate": 1.8189982099324177e-06, "loss": 0.081, "num_input_tokens_seen": 37947040, "step": 56300 }, { "epoch": 1.3755405174309236, "grad_norm": 47.141258239746094, "learning_rate": 1.81894927500289e-06, "loss": 0.1332, "num_input_tokens_seen": 37950368, "step": 56305 }, { "epoch": 1.3756626682627708, "grad_norm": 0.08392799645662308, "learning_rate": 1.818900334117743e-06, "loss": 0.1287, "num_input_tokens_seen": 37953696, "step": 56310 }, { "epoch": 1.375784819094618, "grad_norm": 0.17838051915168762, "learning_rate": 1.8188513872773326e-06, "loss": 0.0293, "num_input_tokens_seen": 37956768, "step": 56315 }, { "epoch": 1.3759069699264652, "grad_norm": 34.301727294921875, "learning_rate": 1.8188024344820145e-06, "loss": 0.2067, "num_input_tokens_seen": 37960032, "step": 56320 }, { "epoch": 1.3760291207583124, "grad_norm": 13.057069778442383, "learning_rate": 1.8187534757321447e-06, "loss": 0.0548, "num_input_tokens_seen": 37964640, "step": 56325 }, { "epoch": 1.3761512715901596, "grad_norm": 0.19041498005390167, "learning_rate": 1.8187045110280796e-06, "loss": 0.0159, "num_input_tokens_seen": 37968288, "step": 56330 }, { "epoch": 1.3762734224220066, "grad_norm": 66.84744262695312, "learning_rate": 1.8186555403701753e-06, "loss": 0.1024, "num_input_tokens_seen": 37971744, "step": 56335 }, { "epoch": 1.3763955732538538, "grad_norm": 12.135207176208496, "learning_rate": 1.8186065637587876e-06, "loss": 0.0608, "num_input_tokens_seen": 37975456, "step": 56340 }, { "epoch": 1.376517724085701, "grad_norm": 21.856000900268555, "learning_rate": 1.8185575811942723e-06, "loss": 0.162, "num_input_tokens_seen": 37979168, "step": 56345 }, { "epoch": 1.3766398749175481, "grad_norm": 19.666250228881836, "learning_rate": 1.8185085926769867e-06, "loss": 0.0232, "num_input_tokens_seen": 37982560, "step": 56350 }, { "epoch": 1.3767620257493953, "grad_norm": 0.049027394503355026, "learning_rate": 1.8184595982072863e-06, "loss": 0.2073, "num_input_tokens_seen": 37985888, "step": 56355 }, { "epoch": 1.3768841765812425, "grad_norm": 0.3508736193180084, "learning_rate": 1.8184105977855276e-06, "loss": 0.1076, "num_input_tokens_seen": 37988960, "step": 56360 }, { "epoch": 1.3770063274130897, "grad_norm": 0.4071526825428009, "learning_rate": 1.8183615914120666e-06, "loss": 0.0922, "num_input_tokens_seen": 37992288, "step": 56365 }, { "epoch": 1.377128478244937, "grad_norm": 1.8135322332382202, "learning_rate": 1.8183125790872605e-06, "loss": 0.0049, "num_input_tokens_seen": 37995488, "step": 56370 }, { "epoch": 1.377250629076784, "grad_norm": 0.19190309941768646, "learning_rate": 1.8182635608114647e-06, "loss": 0.0121, "num_input_tokens_seen": 37998560, "step": 56375 }, { "epoch": 1.3773727799086313, "grad_norm": 12.494829177856445, "learning_rate": 1.8182145365850366e-06, "loss": 0.1404, "num_input_tokens_seen": 38001952, "step": 56380 }, { "epoch": 1.3774949307404785, "grad_norm": 15.6360445022583, "learning_rate": 1.8181655064083322e-06, "loss": 0.0923, "num_input_tokens_seen": 38005920, "step": 56385 }, { "epoch": 1.3776170815723254, "grad_norm": 3.3667964935302734, "learning_rate": 1.818116470281708e-06, "loss": 0.0437, "num_input_tokens_seen": 38009120, "step": 56390 }, { "epoch": 1.3777392324041726, "grad_norm": 114.14451599121094, "learning_rate": 1.818067428205521e-06, "loss": 0.063, "num_input_tokens_seen": 38012384, "step": 56395 }, { "epoch": 1.3778613832360198, "grad_norm": 8.4616117477417, "learning_rate": 1.8180183801801277e-06, "loss": 0.0565, "num_input_tokens_seen": 38015776, "step": 56400 }, { "epoch": 1.377983534067867, "grad_norm": 0.40798187255859375, "learning_rate": 1.8179693262058844e-06, "loss": 0.0022, "num_input_tokens_seen": 38019808, "step": 56405 }, { "epoch": 1.3781056848997142, "grad_norm": 0.16288278996944427, "learning_rate": 1.8179202662831483e-06, "loss": 0.112, "num_input_tokens_seen": 38022944, "step": 56410 }, { "epoch": 1.3782278357315614, "grad_norm": 0.03791259601712227, "learning_rate": 1.8178712004122763e-06, "loss": 0.0337, "num_input_tokens_seen": 38026336, "step": 56415 }, { "epoch": 1.3783499865634086, "grad_norm": 53.5873908996582, "learning_rate": 1.8178221285936246e-06, "loss": 0.081, "num_input_tokens_seen": 38030048, "step": 56420 }, { "epoch": 1.3784721373952555, "grad_norm": 0.43458986282348633, "learning_rate": 1.8177730508275504e-06, "loss": 0.0586, "num_input_tokens_seen": 38033632, "step": 56425 }, { "epoch": 1.3785942882271027, "grad_norm": 0.22739242017269135, "learning_rate": 1.8177239671144106e-06, "loss": 0.0415, "num_input_tokens_seen": 38036896, "step": 56430 }, { "epoch": 1.37871643905895, "grad_norm": 0.29765599966049194, "learning_rate": 1.8176748774545626e-06, "loss": 0.1464, "num_input_tokens_seen": 38040416, "step": 56435 }, { "epoch": 1.378838589890797, "grad_norm": 0.030600015074014664, "learning_rate": 1.8176257818483624e-06, "loss": 0.0029, "num_input_tokens_seen": 38044000, "step": 56440 }, { "epoch": 1.3789607407226443, "grad_norm": 1.0469118356704712, "learning_rate": 1.8175766802961681e-06, "loss": 0.0743, "num_input_tokens_seen": 38047392, "step": 56445 }, { "epoch": 1.3790828915544915, "grad_norm": 50.24403381347656, "learning_rate": 1.817527572798336e-06, "loss": 0.0954, "num_input_tokens_seen": 38050592, "step": 56450 }, { "epoch": 1.3792050423863387, "grad_norm": 42.963157653808594, "learning_rate": 1.8174784593552235e-06, "loss": 0.0879, "num_input_tokens_seen": 38053536, "step": 56455 }, { "epoch": 1.3793271932181859, "grad_norm": 0.0433623380959034, "learning_rate": 1.817429339967188e-06, "loss": 0.1678, "num_input_tokens_seen": 38056864, "step": 56460 }, { "epoch": 1.379449344050033, "grad_norm": 16.934593200683594, "learning_rate": 1.817380214634586e-06, "loss": 0.1658, "num_input_tokens_seen": 38060256, "step": 56465 }, { "epoch": 1.3795714948818802, "grad_norm": 63.58816146850586, "learning_rate": 1.8173310833577754e-06, "loss": 0.1639, "num_input_tokens_seen": 38063392, "step": 56470 }, { "epoch": 1.3796936457137274, "grad_norm": 1.4449207782745361, "learning_rate": 1.8172819461371138e-06, "loss": 0.0794, "num_input_tokens_seen": 38066912, "step": 56475 }, { "epoch": 1.3798157965455744, "grad_norm": 225.5787811279297, "learning_rate": 1.8172328029729577e-06, "loss": 0.1965, "num_input_tokens_seen": 38070240, "step": 56480 }, { "epoch": 1.3799379473774216, "grad_norm": 0.6155655980110168, "learning_rate": 1.8171836538656645e-06, "loss": 0.1164, "num_input_tokens_seen": 38073760, "step": 56485 }, { "epoch": 1.3800600982092688, "grad_norm": 118.65497589111328, "learning_rate": 1.8171344988155925e-06, "loss": 0.0277, "num_input_tokens_seen": 38076832, "step": 56490 }, { "epoch": 1.380182249041116, "grad_norm": 24.945920944213867, "learning_rate": 1.8170853378230985e-06, "loss": 0.1765, "num_input_tokens_seen": 38079840, "step": 56495 }, { "epoch": 1.3803043998729632, "grad_norm": 1.0863746404647827, "learning_rate": 1.8170361708885402e-06, "loss": 0.143, "num_input_tokens_seen": 38082848, "step": 56500 }, { "epoch": 1.3804265507048104, "grad_norm": 17.139713287353516, "learning_rate": 1.816986998012275e-06, "loss": 0.098, "num_input_tokens_seen": 38086304, "step": 56505 }, { "epoch": 1.3805487015366575, "grad_norm": 1.9433549642562866, "learning_rate": 1.8169378191946607e-06, "loss": 0.0048, "num_input_tokens_seen": 38090016, "step": 56510 }, { "epoch": 1.3806708523685045, "grad_norm": 0.7481113076210022, "learning_rate": 1.8168886344360549e-06, "loss": 0.0051, "num_input_tokens_seen": 38093152, "step": 56515 }, { "epoch": 1.3807930032003517, "grad_norm": 7.925774097442627, "learning_rate": 1.816839443736815e-06, "loss": 0.1642, "num_input_tokens_seen": 38096864, "step": 56520 }, { "epoch": 1.3809151540321989, "grad_norm": 0.4268783926963806, "learning_rate": 1.816790247097299e-06, "loss": 0.0691, "num_input_tokens_seen": 38099936, "step": 56525 }, { "epoch": 1.381037304864046, "grad_norm": 0.5027146935462952, "learning_rate": 1.8167410445178649e-06, "loss": 0.1108, "num_input_tokens_seen": 38103264, "step": 56530 }, { "epoch": 1.3811594556958933, "grad_norm": 0.23187850415706635, "learning_rate": 1.8166918359988702e-06, "loss": 0.1469, "num_input_tokens_seen": 38106592, "step": 56535 }, { "epoch": 1.3812816065277405, "grad_norm": 1.227955937385559, "learning_rate": 1.8166426215406726e-06, "loss": 0.0857, "num_input_tokens_seen": 38109600, "step": 56540 }, { "epoch": 1.3814037573595876, "grad_norm": 7.709028244018555, "learning_rate": 1.8165934011436303e-06, "loss": 0.1107, "num_input_tokens_seen": 38112672, "step": 56545 }, { "epoch": 1.3815259081914348, "grad_norm": 0.14321696758270264, "learning_rate": 1.8165441748081012e-06, "loss": 0.0608, "num_input_tokens_seen": 38115872, "step": 56550 }, { "epoch": 1.381648059023282, "grad_norm": 0.7300055623054504, "learning_rate": 1.8164949425344428e-06, "loss": 0.0424, "num_input_tokens_seen": 38120480, "step": 56555 }, { "epoch": 1.3817702098551292, "grad_norm": 3.9435174465179443, "learning_rate": 1.8164457043230144e-06, "loss": 0.025, "num_input_tokens_seen": 38123616, "step": 56560 }, { "epoch": 1.3818923606869764, "grad_norm": 0.3503316342830658, "learning_rate": 1.8163964601741726e-06, "loss": 0.1618, "num_input_tokens_seen": 38127712, "step": 56565 }, { "epoch": 1.3820145115188234, "grad_norm": 0.2841152846813202, "learning_rate": 1.8163472100882763e-06, "loss": 0.063, "num_input_tokens_seen": 38131488, "step": 56570 }, { "epoch": 1.3821366623506706, "grad_norm": 0.612076461315155, "learning_rate": 1.8162979540656837e-06, "loss": 0.1056, "num_input_tokens_seen": 38134496, "step": 56575 }, { "epoch": 1.3822588131825178, "grad_norm": 0.1776486039161682, "learning_rate": 1.8162486921067525e-06, "loss": 0.1367, "num_input_tokens_seen": 38137888, "step": 56580 }, { "epoch": 1.382380964014365, "grad_norm": 0.20384326577186584, "learning_rate": 1.8161994242118416e-06, "loss": 0.0409, "num_input_tokens_seen": 38141472, "step": 56585 }, { "epoch": 1.3825031148462121, "grad_norm": 0.05330972000956535, "learning_rate": 1.8161501503813085e-06, "loss": 0.1291, "num_input_tokens_seen": 38144992, "step": 56590 }, { "epoch": 1.3826252656780593, "grad_norm": 1.6077696084976196, "learning_rate": 1.8161008706155126e-06, "loss": 0.0197, "num_input_tokens_seen": 38148256, "step": 56595 }, { "epoch": 1.3827474165099065, "grad_norm": 0.05597035586833954, "learning_rate": 1.8160515849148112e-06, "loss": 0.1117, "num_input_tokens_seen": 38151392, "step": 56600 }, { "epoch": 1.3828695673417535, "grad_norm": 2.726749897003174, "learning_rate": 1.8160022932795632e-06, "loss": 0.0846, "num_input_tokens_seen": 38154528, "step": 56605 }, { "epoch": 1.3829917181736007, "grad_norm": 110.53439331054688, "learning_rate": 1.8159529957101273e-06, "loss": 0.0342, "num_input_tokens_seen": 38157920, "step": 56610 }, { "epoch": 1.3831138690054479, "grad_norm": 0.10677741467952728, "learning_rate": 1.8159036922068616e-06, "loss": 0.0348, "num_input_tokens_seen": 38161568, "step": 56615 }, { "epoch": 1.383236019837295, "grad_norm": 43.46922302246094, "learning_rate": 1.8158543827701249e-06, "loss": 0.1753, "num_input_tokens_seen": 38165024, "step": 56620 }, { "epoch": 1.3833581706691422, "grad_norm": 0.23257319629192352, "learning_rate": 1.8158050674002757e-06, "loss": 0.0462, "num_input_tokens_seen": 38168288, "step": 56625 }, { "epoch": 1.3834803215009894, "grad_norm": 39.41869354248047, "learning_rate": 1.8157557460976725e-06, "loss": 0.2128, "num_input_tokens_seen": 38171488, "step": 56630 }, { "epoch": 1.3836024723328366, "grad_norm": 0.14419890940189362, "learning_rate": 1.815706418862674e-06, "loss": 0.0025, "num_input_tokens_seen": 38174624, "step": 56635 }, { "epoch": 1.3837246231646838, "grad_norm": 70.12617492675781, "learning_rate": 1.8156570856956393e-06, "loss": 0.0736, "num_input_tokens_seen": 38177888, "step": 56640 }, { "epoch": 1.383846773996531, "grad_norm": 0.39914312958717346, "learning_rate": 1.8156077465969267e-06, "loss": 0.0017, "num_input_tokens_seen": 38180960, "step": 56645 }, { "epoch": 1.3839689248283782, "grad_norm": 12.098282814025879, "learning_rate": 1.8155584015668954e-06, "loss": 0.0432, "num_input_tokens_seen": 38184224, "step": 56650 }, { "epoch": 1.3840910756602254, "grad_norm": 0.2898300290107727, "learning_rate": 1.8155090506059039e-06, "loss": 0.0316, "num_input_tokens_seen": 38187552, "step": 56655 }, { "epoch": 1.3842132264920723, "grad_norm": 0.002333325333893299, "learning_rate": 1.815459693714311e-06, "loss": 0.1797, "num_input_tokens_seen": 38190752, "step": 56660 }, { "epoch": 1.3843353773239195, "grad_norm": 15.202603340148926, "learning_rate": 1.8154103308924763e-06, "loss": 0.2378, "num_input_tokens_seen": 38194720, "step": 56665 }, { "epoch": 1.3844575281557667, "grad_norm": 93.10588836669922, "learning_rate": 1.815360962140758e-06, "loss": 0.0175, "num_input_tokens_seen": 38198624, "step": 56670 }, { "epoch": 1.384579678987614, "grad_norm": 1.7387146949768066, "learning_rate": 1.8153115874595158e-06, "loss": 0.1147, "num_input_tokens_seen": 38201760, "step": 56675 }, { "epoch": 1.384701829819461, "grad_norm": 7.717111587524414, "learning_rate": 1.815262206849108e-06, "loss": 0.0694, "num_input_tokens_seen": 38205088, "step": 56680 }, { "epoch": 1.3848239806513083, "grad_norm": 0.2825879752635956, "learning_rate": 1.8152128203098943e-06, "loss": 0.0993, "num_input_tokens_seen": 38208736, "step": 56685 }, { "epoch": 1.3849461314831553, "grad_norm": 109.01095581054688, "learning_rate": 1.815163427842234e-06, "loss": 0.0757, "num_input_tokens_seen": 38212128, "step": 56690 }, { "epoch": 1.3850682823150025, "grad_norm": 22.071168899536133, "learning_rate": 1.8151140294464858e-06, "loss": 0.114, "num_input_tokens_seen": 38215392, "step": 56695 }, { "epoch": 1.3851904331468496, "grad_norm": 0.2266807109117508, "learning_rate": 1.8150646251230092e-06, "loss": 0.0642, "num_input_tokens_seen": 38218464, "step": 56700 }, { "epoch": 1.3853125839786968, "grad_norm": 28.931777954101562, "learning_rate": 1.8150152148721637e-06, "loss": 0.1127, "num_input_tokens_seen": 38221728, "step": 56705 }, { "epoch": 1.385434734810544, "grad_norm": 3.359818458557129, "learning_rate": 1.8149657986943078e-06, "loss": 0.0424, "num_input_tokens_seen": 38225120, "step": 56710 }, { "epoch": 1.3855568856423912, "grad_norm": 38.96266174316406, "learning_rate": 1.8149163765898016e-06, "loss": 0.0771, "num_input_tokens_seen": 38228256, "step": 56715 }, { "epoch": 1.3856790364742384, "grad_norm": 0.7429246306419373, "learning_rate": 1.8148669485590044e-06, "loss": 0.0552, "num_input_tokens_seen": 38232224, "step": 56720 }, { "epoch": 1.3858011873060856, "grad_norm": 0.12821519374847412, "learning_rate": 1.8148175146022758e-06, "loss": 0.0013, "num_input_tokens_seen": 38235424, "step": 56725 }, { "epoch": 1.3859233381379328, "grad_norm": 0.5723388195037842, "learning_rate": 1.8147680747199748e-06, "loss": 0.0982, "num_input_tokens_seen": 38238816, "step": 56730 }, { "epoch": 1.38604548896978, "grad_norm": 0.63655686378479, "learning_rate": 1.8147186289124611e-06, "loss": 0.0017, "num_input_tokens_seen": 38243168, "step": 56735 }, { "epoch": 1.3861676398016272, "grad_norm": 0.07612661272287369, "learning_rate": 1.8146691771800945e-06, "loss": 0.045, "num_input_tokens_seen": 38246368, "step": 56740 }, { "epoch": 1.3862897906334744, "grad_norm": 0.20316410064697266, "learning_rate": 1.8146197195232347e-06, "loss": 0.0683, "num_input_tokens_seen": 38249568, "step": 56745 }, { "epoch": 1.3864119414653213, "grad_norm": 9.185523986816406, "learning_rate": 1.814570255942241e-06, "loss": 0.0041, "num_input_tokens_seen": 38252960, "step": 56750 }, { "epoch": 1.3865340922971685, "grad_norm": 0.09728414565324783, "learning_rate": 1.8145207864374734e-06, "loss": 0.0906, "num_input_tokens_seen": 38256480, "step": 56755 }, { "epoch": 1.3866562431290157, "grad_norm": 0.043544042855501175, "learning_rate": 1.8144713110092915e-06, "loss": 0.0711, "num_input_tokens_seen": 38260064, "step": 56760 }, { "epoch": 1.3867783939608629, "grad_norm": 18.682769775390625, "learning_rate": 1.8144218296580553e-06, "loss": 0.0695, "num_input_tokens_seen": 38263328, "step": 56765 }, { "epoch": 1.38690054479271, "grad_norm": 0.15236736834049225, "learning_rate": 1.8143723423841241e-06, "loss": 0.023, "num_input_tokens_seen": 38267040, "step": 56770 }, { "epoch": 1.3870226956245573, "grad_norm": 0.08610755205154419, "learning_rate": 1.814322849187859e-06, "loss": 0.0352, "num_input_tokens_seen": 38270368, "step": 56775 }, { "epoch": 1.3871448464564042, "grad_norm": 10.737128257751465, "learning_rate": 1.814273350069618e-06, "loss": 0.0862, "num_input_tokens_seen": 38273632, "step": 56780 }, { "epoch": 1.3872669972882514, "grad_norm": 0.05491538718342781, "learning_rate": 1.8142238450297632e-06, "loss": 0.1411, "num_input_tokens_seen": 38276768, "step": 56785 }, { "epoch": 1.3873891481200986, "grad_norm": 0.25832599401474, "learning_rate": 1.814174334068653e-06, "loss": 0.0709, "num_input_tokens_seen": 38279968, "step": 56790 }, { "epoch": 1.3875112989519458, "grad_norm": 24.963336944580078, "learning_rate": 1.8141248171866482e-06, "loss": 0.0428, "num_input_tokens_seen": 38283424, "step": 56795 }, { "epoch": 1.387633449783793, "grad_norm": 23.700716018676758, "learning_rate": 1.814075294384109e-06, "loss": 0.181, "num_input_tokens_seen": 38286880, "step": 56800 }, { "epoch": 1.3877556006156402, "grad_norm": 0.20313750207424164, "learning_rate": 1.8140257656613952e-06, "loss": 0.1029, "num_input_tokens_seen": 38290208, "step": 56805 }, { "epoch": 1.3878777514474874, "grad_norm": 0.13214872777462006, "learning_rate": 1.8139762310188666e-06, "loss": 0.0015, "num_input_tokens_seen": 38293600, "step": 56810 }, { "epoch": 1.3879999022793346, "grad_norm": 104.4972915649414, "learning_rate": 1.8139266904568844e-06, "loss": 0.0907, "num_input_tokens_seen": 38297184, "step": 56815 }, { "epoch": 1.3881220531111818, "grad_norm": 3.2046422958374023, "learning_rate": 1.8138771439758083e-06, "loss": 0.0459, "num_input_tokens_seen": 38300448, "step": 56820 }, { "epoch": 1.388244203943029, "grad_norm": 0.06722808629274368, "learning_rate": 1.8138275915759986e-06, "loss": 0.0013, "num_input_tokens_seen": 38303712, "step": 56825 }, { "epoch": 1.3883663547748761, "grad_norm": 54.982337951660156, "learning_rate": 1.8137780332578158e-06, "loss": 0.1807, "num_input_tokens_seen": 38307296, "step": 56830 }, { "epoch": 1.388488505606723, "grad_norm": 20.035268783569336, "learning_rate": 1.8137284690216204e-06, "loss": 0.2124, "num_input_tokens_seen": 38310496, "step": 56835 }, { "epoch": 1.3886106564385703, "grad_norm": 1.0956863164901733, "learning_rate": 1.8136788988677725e-06, "loss": 0.099, "num_input_tokens_seen": 38313888, "step": 56840 }, { "epoch": 1.3887328072704175, "grad_norm": 0.11295932531356812, "learning_rate": 1.813629322796633e-06, "loss": 0.0729, "num_input_tokens_seen": 38317536, "step": 56845 }, { "epoch": 1.3888549581022647, "grad_norm": 0.755732536315918, "learning_rate": 1.8135797408085623e-06, "loss": 0.1225, "num_input_tokens_seen": 38321120, "step": 56850 }, { "epoch": 1.3889771089341119, "grad_norm": 0.18139754235744476, "learning_rate": 1.8135301529039207e-06, "loss": 0.0755, "num_input_tokens_seen": 38324320, "step": 56855 }, { "epoch": 1.389099259765959, "grad_norm": 0.15731483697891235, "learning_rate": 1.813480559083069e-06, "loss": 0.0337, "num_input_tokens_seen": 38327840, "step": 56860 }, { "epoch": 1.3892214105978062, "grad_norm": 91.60194396972656, "learning_rate": 1.813430959346368e-06, "loss": 0.1575, "num_input_tokens_seen": 38331104, "step": 56865 }, { "epoch": 1.3893435614296532, "grad_norm": 0.30752700567245483, "learning_rate": 1.813381353694178e-06, "loss": 0.1537, "num_input_tokens_seen": 38334432, "step": 56870 }, { "epoch": 1.3894657122615004, "grad_norm": 0.6529026627540588, "learning_rate": 1.8133317421268601e-06, "loss": 0.0716, "num_input_tokens_seen": 38338016, "step": 56875 }, { "epoch": 1.3895878630933476, "grad_norm": 0.7751749157905579, "learning_rate": 1.8132821246447753e-06, "loss": 0.0679, "num_input_tokens_seen": 38341344, "step": 56880 }, { "epoch": 1.3897100139251948, "grad_norm": 170.2229766845703, "learning_rate": 1.813232501248284e-06, "loss": 0.1301, "num_input_tokens_seen": 38344928, "step": 56885 }, { "epoch": 1.389832164757042, "grad_norm": 0.23280911147594452, "learning_rate": 1.813182871937747e-06, "loss": 0.0464, "num_input_tokens_seen": 38348128, "step": 56890 }, { "epoch": 1.3899543155888892, "grad_norm": 38.519474029541016, "learning_rate": 1.8131332367135256e-06, "loss": 0.1033, "num_input_tokens_seen": 38351392, "step": 56895 }, { "epoch": 1.3900764664207363, "grad_norm": 0.19992072880268097, "learning_rate": 1.8130835955759807e-06, "loss": 0.0761, "num_input_tokens_seen": 38354208, "step": 56900 }, { "epoch": 1.3901986172525835, "grad_norm": 0.10394836962223053, "learning_rate": 1.8130339485254731e-06, "loss": 0.0616, "num_input_tokens_seen": 38357728, "step": 56905 }, { "epoch": 1.3903207680844307, "grad_norm": 0.1428837925195694, "learning_rate": 1.812984295562364e-06, "loss": 0.0903, "num_input_tokens_seen": 38361056, "step": 56910 }, { "epoch": 1.390442918916278, "grad_norm": 15.613577842712402, "learning_rate": 1.8129346366870143e-06, "loss": 0.0795, "num_input_tokens_seen": 38364256, "step": 56915 }, { "epoch": 1.390565069748125, "grad_norm": 0.44246914982795715, "learning_rate": 1.8128849718997854e-06, "loss": 0.0247, "num_input_tokens_seen": 38367840, "step": 56920 }, { "epoch": 1.390687220579972, "grad_norm": 0.33950066566467285, "learning_rate": 1.8128353012010385e-06, "loss": 0.035, "num_input_tokens_seen": 38371680, "step": 56925 }, { "epoch": 1.3908093714118193, "grad_norm": 15.934062004089355, "learning_rate": 1.8127856245911343e-06, "loss": 0.1329, "num_input_tokens_seen": 38374944, "step": 56930 }, { "epoch": 1.3909315222436665, "grad_norm": 0.11535021662712097, "learning_rate": 1.8127359420704344e-06, "loss": 0.0395, "num_input_tokens_seen": 38378336, "step": 56935 }, { "epoch": 1.3910536730755136, "grad_norm": 0.03485637903213501, "learning_rate": 1.8126862536393005e-06, "loss": 0.0486, "num_input_tokens_seen": 38381664, "step": 56940 }, { "epoch": 1.3911758239073608, "grad_norm": 16.718181610107422, "learning_rate": 1.8126365592980935e-06, "loss": 0.0424, "num_input_tokens_seen": 38385056, "step": 56945 }, { "epoch": 1.391297974739208, "grad_norm": 32.83033752441406, "learning_rate": 1.8125868590471748e-06, "loss": 0.1111, "num_input_tokens_seen": 38388512, "step": 56950 }, { "epoch": 1.3914201255710552, "grad_norm": 0.171127587556839, "learning_rate": 1.8125371528869059e-06, "loss": 0.003, "num_input_tokens_seen": 38391712, "step": 56955 }, { "epoch": 1.3915422764029022, "grad_norm": 0.08284179121255875, "learning_rate": 1.812487440817648e-06, "loss": 0.154, "num_input_tokens_seen": 38395040, "step": 56960 }, { "epoch": 1.3916644272347494, "grad_norm": 86.25468444824219, "learning_rate": 1.8124377228397631e-06, "loss": 0.0939, "num_input_tokens_seen": 38398048, "step": 56965 }, { "epoch": 1.3917865780665966, "grad_norm": 37.10374450683594, "learning_rate": 1.8123879989536129e-06, "loss": 0.1104, "num_input_tokens_seen": 38401248, "step": 56970 }, { "epoch": 1.3919087288984437, "grad_norm": 20.548166275024414, "learning_rate": 1.8123382691595581e-06, "loss": 0.0908, "num_input_tokens_seen": 38404832, "step": 56975 }, { "epoch": 1.392030879730291, "grad_norm": 0.08781076967716217, "learning_rate": 1.8122885334579615e-06, "loss": 0.0896, "num_input_tokens_seen": 38408096, "step": 56980 }, { "epoch": 1.3921530305621381, "grad_norm": 78.29131317138672, "learning_rate": 1.8122387918491838e-06, "loss": 0.1319, "num_input_tokens_seen": 38412128, "step": 56985 }, { "epoch": 1.3922751813939853, "grad_norm": 37.14921188354492, "learning_rate": 1.8121890443335873e-06, "loss": 0.1685, "num_input_tokens_seen": 38415520, "step": 56990 }, { "epoch": 1.3923973322258325, "grad_norm": 0.22196723520755768, "learning_rate": 1.8121392909115334e-06, "loss": 0.0021, "num_input_tokens_seen": 38419424, "step": 56995 }, { "epoch": 1.3925194830576797, "grad_norm": 47.17499923706055, "learning_rate": 1.8120895315833842e-06, "loss": 0.1008, "num_input_tokens_seen": 38423392, "step": 57000 }, { "epoch": 1.3926416338895269, "grad_norm": 109.18597412109375, "learning_rate": 1.8120397663495015e-06, "loss": 0.151, "num_input_tokens_seen": 38427104, "step": 57005 }, { "epoch": 1.392763784721374, "grad_norm": 0.2220344990491867, "learning_rate": 1.8119899952102476e-06, "loss": 0.0376, "num_input_tokens_seen": 38430688, "step": 57010 }, { "epoch": 1.392885935553221, "grad_norm": 86.03609466552734, "learning_rate": 1.8119402181659837e-06, "loss": 0.0085, "num_input_tokens_seen": 38434400, "step": 57015 }, { "epoch": 1.3930080863850682, "grad_norm": 0.1834421306848526, "learning_rate": 1.811890435217072e-06, "loss": 0.0804, "num_input_tokens_seen": 38437920, "step": 57020 }, { "epoch": 1.3931302372169154, "grad_norm": 19.444744110107422, "learning_rate": 1.811840646363875e-06, "loss": 0.0872, "num_input_tokens_seen": 38441248, "step": 57025 }, { "epoch": 1.3932523880487626, "grad_norm": 2.8895413875579834, "learning_rate": 1.8117908516067542e-06, "loss": 0.221, "num_input_tokens_seen": 38444896, "step": 57030 }, { "epoch": 1.3933745388806098, "grad_norm": 0.1735847443342209, "learning_rate": 1.8117410509460723e-06, "loss": 0.0527, "num_input_tokens_seen": 38448096, "step": 57035 }, { "epoch": 1.393496689712457, "grad_norm": 66.44129943847656, "learning_rate": 1.811691244382191e-06, "loss": 0.059, "num_input_tokens_seen": 38451616, "step": 57040 }, { "epoch": 1.3936188405443042, "grad_norm": 7.641853332519531, "learning_rate": 1.8116414319154726e-06, "loss": 0.0346, "num_input_tokens_seen": 38455392, "step": 57045 }, { "epoch": 1.3937409913761512, "grad_norm": 0.3213024139404297, "learning_rate": 1.8115916135462794e-06, "loss": 0.0558, "num_input_tokens_seen": 38458720, "step": 57050 }, { "epoch": 1.3938631422079983, "grad_norm": 4.296091079711914, "learning_rate": 1.8115417892749738e-06, "loss": 0.1203, "num_input_tokens_seen": 38461856, "step": 57055 }, { "epoch": 1.3939852930398455, "grad_norm": 63.78632736206055, "learning_rate": 1.811491959101918e-06, "loss": 0.137, "num_input_tokens_seen": 38465056, "step": 57060 }, { "epoch": 1.3941074438716927, "grad_norm": 0.44734886288642883, "learning_rate": 1.8114421230274743e-06, "loss": 0.0427, "num_input_tokens_seen": 38468320, "step": 57065 }, { "epoch": 1.39422959470354, "grad_norm": 0.3252753019332886, "learning_rate": 1.8113922810520053e-06, "loss": 0.047, "num_input_tokens_seen": 38471520, "step": 57070 }, { "epoch": 1.394351745535387, "grad_norm": 0.053497254848480225, "learning_rate": 1.811342433175873e-06, "loss": 0.085, "num_input_tokens_seen": 38474720, "step": 57075 }, { "epoch": 1.3944738963672343, "grad_norm": 1.370205283164978, "learning_rate": 1.8112925793994408e-06, "loss": 0.0732, "num_input_tokens_seen": 38477856, "step": 57080 }, { "epoch": 1.3945960471990815, "grad_norm": 72.65982818603516, "learning_rate": 1.811242719723071e-06, "loss": 0.1524, "num_input_tokens_seen": 38481056, "step": 57085 }, { "epoch": 1.3947181980309287, "grad_norm": 0.2532006800174713, "learning_rate": 1.8111928541471254e-06, "loss": 0.0897, "num_input_tokens_seen": 38484256, "step": 57090 }, { "epoch": 1.3948403488627759, "grad_norm": 0.07710134983062744, "learning_rate": 1.8111429826719673e-06, "loss": 0.0368, "num_input_tokens_seen": 38487520, "step": 57095 }, { "epoch": 1.394962499694623, "grad_norm": 32.57456588745117, "learning_rate": 1.8110931052979593e-06, "loss": 0.1924, "num_input_tokens_seen": 38490720, "step": 57100 }, { "epoch": 1.39508465052647, "grad_norm": 9.302146911621094, "learning_rate": 1.8110432220254641e-06, "loss": 0.2088, "num_input_tokens_seen": 38494432, "step": 57105 }, { "epoch": 1.3952068013583172, "grad_norm": 0.8289868831634521, "learning_rate": 1.8109933328548443e-06, "loss": 0.0304, "num_input_tokens_seen": 38497824, "step": 57110 }, { "epoch": 1.3953289521901644, "grad_norm": 0.10691874474287033, "learning_rate": 1.8109434377864631e-06, "loss": 0.0008, "num_input_tokens_seen": 38500896, "step": 57115 }, { "epoch": 1.3954511030220116, "grad_norm": 13.294049263000488, "learning_rate": 1.810893536820683e-06, "loss": 0.1938, "num_input_tokens_seen": 38503776, "step": 57120 }, { "epoch": 1.3955732538538588, "grad_norm": 0.11053433269262314, "learning_rate": 1.8108436299578669e-06, "loss": 0.0298, "num_input_tokens_seen": 38507232, "step": 57125 }, { "epoch": 1.395695404685706, "grad_norm": 0.09031997621059418, "learning_rate": 1.810793717198378e-06, "loss": 0.0875, "num_input_tokens_seen": 38510944, "step": 57130 }, { "epoch": 1.3958175555175532, "grad_norm": 86.59408569335938, "learning_rate": 1.8107437985425792e-06, "loss": 0.0342, "num_input_tokens_seen": 38514208, "step": 57135 }, { "epoch": 1.3959397063494001, "grad_norm": 19.59816551208496, "learning_rate": 1.810693873990833e-06, "loss": 0.1204, "num_input_tokens_seen": 38517728, "step": 57140 }, { "epoch": 1.3960618571812473, "grad_norm": 1.211097240447998, "learning_rate": 1.8106439435435035e-06, "loss": 0.1372, "num_input_tokens_seen": 38521120, "step": 57145 }, { "epoch": 1.3961840080130945, "grad_norm": 0.18598124384880066, "learning_rate": 1.8105940072009527e-06, "loss": 0.002, "num_input_tokens_seen": 38524320, "step": 57150 }, { "epoch": 1.3963061588449417, "grad_norm": 11.330933570861816, "learning_rate": 1.8105440649635445e-06, "loss": 0.1249, "num_input_tokens_seen": 38528352, "step": 57155 }, { "epoch": 1.3964283096767889, "grad_norm": 0.22256216406822205, "learning_rate": 1.8104941168316416e-06, "loss": 0.1006, "num_input_tokens_seen": 38531296, "step": 57160 }, { "epoch": 1.396550460508636, "grad_norm": 1.1994155645370483, "learning_rate": 1.810444162805608e-06, "loss": 0.0017, "num_input_tokens_seen": 38534944, "step": 57165 }, { "epoch": 1.3966726113404833, "grad_norm": 8.909801483154297, "learning_rate": 1.8103942028858059e-06, "loss": 0.1503, "num_input_tokens_seen": 38538720, "step": 57170 }, { "epoch": 1.3967947621723305, "grad_norm": 0.5872882008552551, "learning_rate": 1.8103442370725995e-06, "loss": 0.0017, "num_input_tokens_seen": 38541984, "step": 57175 }, { "epoch": 1.3969169130041776, "grad_norm": 0.8501310348510742, "learning_rate": 1.8102942653663518e-06, "loss": 0.0414, "num_input_tokens_seen": 38546144, "step": 57180 }, { "epoch": 1.3970390638360248, "grad_norm": 0.03388998657464981, "learning_rate": 1.8102442877674261e-06, "loss": 0.1137, "num_input_tokens_seen": 38549600, "step": 57185 }, { "epoch": 1.397161214667872, "grad_norm": 0.6265448927879333, "learning_rate": 1.810194304276186e-06, "loss": 0.0352, "num_input_tokens_seen": 38552928, "step": 57190 }, { "epoch": 1.397283365499719, "grad_norm": 0.2135477215051651, "learning_rate": 1.8101443148929954e-06, "loss": 0.0021, "num_input_tokens_seen": 38556256, "step": 57195 }, { "epoch": 1.3974055163315662, "grad_norm": 0.6835227012634277, "learning_rate": 1.810094319618217e-06, "loss": 0.0396, "num_input_tokens_seen": 38559648, "step": 57200 }, { "epoch": 1.3975276671634134, "grad_norm": 24.1704044342041, "learning_rate": 1.810044318452215e-06, "loss": 0.0853, "num_input_tokens_seen": 38562784, "step": 57205 }, { "epoch": 1.3976498179952606, "grad_norm": 134.97132873535156, "learning_rate": 1.8099943113953529e-06, "loss": 0.049, "num_input_tokens_seen": 38566368, "step": 57210 }, { "epoch": 1.3977719688271077, "grad_norm": 19.390241622924805, "learning_rate": 1.8099442984479942e-06, "loss": 0.1038, "num_input_tokens_seen": 38569824, "step": 57215 }, { "epoch": 1.397894119658955, "grad_norm": 0.13285577297210693, "learning_rate": 1.8098942796105027e-06, "loss": 0.0255, "num_input_tokens_seen": 38573408, "step": 57220 }, { "epoch": 1.398016270490802, "grad_norm": 0.08401376008987427, "learning_rate": 1.8098442548832424e-06, "loss": 0.1684, "num_input_tokens_seen": 38576864, "step": 57225 }, { "epoch": 1.398138421322649, "grad_norm": 0.12962745130062103, "learning_rate": 1.8097942242665765e-06, "loss": 0.2394, "num_input_tokens_seen": 38580384, "step": 57230 }, { "epoch": 1.3982605721544963, "grad_norm": 0.4125172793865204, "learning_rate": 1.8097441877608695e-06, "loss": 0.0485, "num_input_tokens_seen": 38583648, "step": 57235 }, { "epoch": 1.3983827229863435, "grad_norm": 211.63461303710938, "learning_rate": 1.809694145366485e-06, "loss": 0.0928, "num_input_tokens_seen": 38587744, "step": 57240 }, { "epoch": 1.3985048738181907, "grad_norm": 0.08615150302648544, "learning_rate": 1.8096440970837866e-06, "loss": 0.0088, "num_input_tokens_seen": 38591520, "step": 57245 }, { "epoch": 1.3986270246500379, "grad_norm": 20.34548568725586, "learning_rate": 1.8095940429131386e-06, "loss": 0.1195, "num_input_tokens_seen": 38594784, "step": 57250 }, { "epoch": 1.398749175481885, "grad_norm": 0.2246903032064438, "learning_rate": 1.8095439828549051e-06, "loss": 0.137, "num_input_tokens_seen": 38598368, "step": 57255 }, { "epoch": 1.3988713263137322, "grad_norm": 251.88229370117188, "learning_rate": 1.80949391690945e-06, "loss": 0.1184, "num_input_tokens_seen": 38601888, "step": 57260 }, { "epoch": 1.3989934771455794, "grad_norm": 0.17487865686416626, "learning_rate": 1.8094438450771375e-06, "loss": 0.0366, "num_input_tokens_seen": 38605408, "step": 57265 }, { "epoch": 1.3991156279774266, "grad_norm": 0.1770249605178833, "learning_rate": 1.8093937673583315e-06, "loss": 0.1262, "num_input_tokens_seen": 38608800, "step": 57270 }, { "epoch": 1.3992377788092738, "grad_norm": 27.551986694335938, "learning_rate": 1.8093436837533961e-06, "loss": 0.1281, "num_input_tokens_seen": 38612192, "step": 57275 }, { "epoch": 1.399359929641121, "grad_norm": 13.532742500305176, "learning_rate": 1.809293594262696e-06, "loss": 0.0841, "num_input_tokens_seen": 38615200, "step": 57280 }, { "epoch": 1.399482080472968, "grad_norm": 0.6043211221694946, "learning_rate": 1.8092434988865953e-06, "loss": 0.1067, "num_input_tokens_seen": 38618208, "step": 57285 }, { "epoch": 1.3996042313048151, "grad_norm": 0.6223563551902771, "learning_rate": 1.809193397625458e-06, "loss": 0.0774, "num_input_tokens_seen": 38621856, "step": 57290 }, { "epoch": 1.3997263821366623, "grad_norm": 0.7312188744544983, "learning_rate": 1.8091432904796488e-06, "loss": 0.0382, "num_input_tokens_seen": 38624864, "step": 57295 }, { "epoch": 1.3998485329685095, "grad_norm": 31.77884292602539, "learning_rate": 1.8090931774495321e-06, "loss": 0.1126, "num_input_tokens_seen": 38628832, "step": 57300 }, { "epoch": 1.3999706838003567, "grad_norm": 1.9252561330795288, "learning_rate": 1.8090430585354719e-06, "loss": 0.0725, "num_input_tokens_seen": 38632160, "step": 57305 }, { "epoch": 1.400092834632204, "grad_norm": 11.0371732711792, "learning_rate": 1.808992933737833e-06, "loss": 0.112, "num_input_tokens_seen": 38635488, "step": 57310 }, { "epoch": 1.4002149854640509, "grad_norm": 12.347792625427246, "learning_rate": 1.80894280305698e-06, "loss": 0.1026, "num_input_tokens_seen": 38638880, "step": 57315 }, { "epoch": 1.400337136295898, "grad_norm": 1.6804922819137573, "learning_rate": 1.8088926664932775e-06, "loss": 0.0514, "num_input_tokens_seen": 38642336, "step": 57320 }, { "epoch": 1.4004592871277453, "grad_norm": 12.734495162963867, "learning_rate": 1.80884252404709e-06, "loss": 0.0423, "num_input_tokens_seen": 38645984, "step": 57325 }, { "epoch": 1.4005814379595924, "grad_norm": 3.755786180496216, "learning_rate": 1.8087923757187817e-06, "loss": 0.0354, "num_input_tokens_seen": 38649056, "step": 57330 }, { "epoch": 1.4007035887914396, "grad_norm": 18.840024948120117, "learning_rate": 1.808742221508718e-06, "loss": 0.1126, "num_input_tokens_seen": 38652640, "step": 57335 }, { "epoch": 1.4008257396232868, "grad_norm": 30.551414489746094, "learning_rate": 1.8086920614172633e-06, "loss": 0.119, "num_input_tokens_seen": 38655904, "step": 57340 }, { "epoch": 1.400947890455134, "grad_norm": 0.1037088930606842, "learning_rate": 1.8086418954447825e-06, "loss": 0.0705, "num_input_tokens_seen": 38659232, "step": 57345 }, { "epoch": 1.4010700412869812, "grad_norm": 0.2562164068222046, "learning_rate": 1.80859172359164e-06, "loss": 0.0021, "num_input_tokens_seen": 38662688, "step": 57350 }, { "epoch": 1.4011921921188284, "grad_norm": 0.8684639930725098, "learning_rate": 1.8085415458582012e-06, "loss": 0.0354, "num_input_tokens_seen": 38666208, "step": 57355 }, { "epoch": 1.4013143429506756, "grad_norm": 0.16640126705169678, "learning_rate": 1.808491362244831e-06, "loss": 0.0081, "num_input_tokens_seen": 38669664, "step": 57360 }, { "epoch": 1.4014364937825228, "grad_norm": 229.04251098632812, "learning_rate": 1.8084411727518938e-06, "loss": 0.2416, "num_input_tokens_seen": 38672672, "step": 57365 }, { "epoch": 1.4015586446143697, "grad_norm": 45.303890228271484, "learning_rate": 1.8083909773797555e-06, "loss": 0.157, "num_input_tokens_seen": 38676000, "step": 57370 }, { "epoch": 1.401680795446217, "grad_norm": 0.21114136278629303, "learning_rate": 1.8083407761287802e-06, "loss": 0.0918, "num_input_tokens_seen": 38679904, "step": 57375 }, { "epoch": 1.4018029462780641, "grad_norm": 0.13060590624809265, "learning_rate": 1.8082905689993333e-06, "loss": 0.0186, "num_input_tokens_seen": 38683360, "step": 57380 }, { "epoch": 1.4019250971099113, "grad_norm": 0.2954758107662201, "learning_rate": 1.8082403559917801e-06, "loss": 0.0348, "num_input_tokens_seen": 38686752, "step": 57385 }, { "epoch": 1.4020472479417585, "grad_norm": 10.650467872619629, "learning_rate": 1.8081901371064854e-06, "loss": 0.0465, "num_input_tokens_seen": 38690976, "step": 57390 }, { "epoch": 1.4021693987736057, "grad_norm": 111.42206573486328, "learning_rate": 1.8081399123438147e-06, "loss": 0.2443, "num_input_tokens_seen": 38693984, "step": 57395 }, { "epoch": 1.4022915496054529, "grad_norm": 20.380661010742188, "learning_rate": 1.8080896817041337e-06, "loss": 0.0703, "num_input_tokens_seen": 38697376, "step": 57400 }, { "epoch": 1.4024137004372998, "grad_norm": 0.3858691155910492, "learning_rate": 1.8080394451878066e-06, "loss": 0.003, "num_input_tokens_seen": 38700896, "step": 57405 }, { "epoch": 1.402535851269147, "grad_norm": 144.43235778808594, "learning_rate": 1.8079892027951997e-06, "loss": 0.1784, "num_input_tokens_seen": 38704480, "step": 57410 }, { "epoch": 1.4026580021009942, "grad_norm": 85.30264282226562, "learning_rate": 1.8079389545266776e-06, "loss": 0.0667, "num_input_tokens_seen": 38707680, "step": 57415 }, { "epoch": 1.4027801529328414, "grad_norm": 3.402688503265381, "learning_rate": 1.8078887003826067e-06, "loss": 0.1021, "num_input_tokens_seen": 38710752, "step": 57420 }, { "epoch": 1.4029023037646886, "grad_norm": 0.0745268315076828, "learning_rate": 1.8078384403633513e-06, "loss": 0.0199, "num_input_tokens_seen": 38713760, "step": 57425 }, { "epoch": 1.4030244545965358, "grad_norm": 0.1893487423658371, "learning_rate": 1.8077881744692778e-06, "loss": 0.0928, "num_input_tokens_seen": 38717280, "step": 57430 }, { "epoch": 1.403146605428383, "grad_norm": 0.1066637858748436, "learning_rate": 1.8077379027007513e-06, "loss": 0.085, "num_input_tokens_seen": 38720928, "step": 57435 }, { "epoch": 1.4032687562602302, "grad_norm": 0.33574655652046204, "learning_rate": 1.8076876250581376e-06, "loss": 0.0015, "num_input_tokens_seen": 38724320, "step": 57440 }, { "epoch": 1.4033909070920774, "grad_norm": 0.13137076795101166, "learning_rate": 1.807637341541802e-06, "loss": 0.0119, "num_input_tokens_seen": 38727328, "step": 57445 }, { "epoch": 1.4035130579239246, "grad_norm": 3.9827892780303955, "learning_rate": 1.807587052152111e-06, "loss": 0.0021, "num_input_tokens_seen": 38730208, "step": 57450 }, { "epoch": 1.4036352087557717, "grad_norm": 19.064966201782227, "learning_rate": 1.807536756889429e-06, "loss": 0.1549, "num_input_tokens_seen": 38733920, "step": 57455 }, { "epoch": 1.4037573595876187, "grad_norm": 13.327079772949219, "learning_rate": 1.807486455754123e-06, "loss": 0.0534, "num_input_tokens_seen": 38737120, "step": 57460 }, { "epoch": 1.403879510419466, "grad_norm": 0.09143029898405075, "learning_rate": 1.8074361487465582e-06, "loss": 0.0009, "num_input_tokens_seen": 38740576, "step": 57465 }, { "epoch": 1.404001661251313, "grad_norm": 8.409533500671387, "learning_rate": 1.8073858358671004e-06, "loss": 0.0514, "num_input_tokens_seen": 38743712, "step": 57470 }, { "epoch": 1.4041238120831603, "grad_norm": 0.055469900369644165, "learning_rate": 1.8073355171161157e-06, "loss": 0.1495, "num_input_tokens_seen": 38747104, "step": 57475 }, { "epoch": 1.4042459629150075, "grad_norm": 17.54491424560547, "learning_rate": 1.8072851924939702e-06, "loss": 0.116, "num_input_tokens_seen": 38750624, "step": 57480 }, { "epoch": 1.4043681137468547, "grad_norm": 0.06926427781581879, "learning_rate": 1.8072348620010294e-06, "loss": 0.1466, "num_input_tokens_seen": 38753696, "step": 57485 }, { "epoch": 1.4044902645787019, "grad_norm": 0.17716003954410553, "learning_rate": 1.8071845256376597e-06, "loss": 0.0367, "num_input_tokens_seen": 38757280, "step": 57490 }, { "epoch": 1.4046124154105488, "grad_norm": 0.09693924337625504, "learning_rate": 1.8071341834042268e-06, "loss": 0.1259, "num_input_tokens_seen": 38760160, "step": 57495 }, { "epoch": 1.404734566242396, "grad_norm": 0.2468055784702301, "learning_rate": 1.8070838353010973e-06, "loss": 0.0536, "num_input_tokens_seen": 38763104, "step": 57500 }, { "epoch": 1.4048567170742432, "grad_norm": 195.17642211914062, "learning_rate": 1.807033481328637e-06, "loss": 0.0231, "num_input_tokens_seen": 38766432, "step": 57505 }, { "epoch": 1.4049788679060904, "grad_norm": 27.664806365966797, "learning_rate": 1.806983121487212e-06, "loss": 0.0559, "num_input_tokens_seen": 38770144, "step": 57510 }, { "epoch": 1.4051010187379376, "grad_norm": 0.06676580756902695, "learning_rate": 1.8069327557771889e-06, "loss": 0.0498, "num_input_tokens_seen": 38773280, "step": 57515 }, { "epoch": 1.4052231695697848, "grad_norm": 0.34984180331230164, "learning_rate": 1.8068823841989338e-06, "loss": 0.0375, "num_input_tokens_seen": 38776608, "step": 57520 }, { "epoch": 1.405345320401632, "grad_norm": 19.778776168823242, "learning_rate": 1.8068320067528129e-06, "loss": 0.0967, "num_input_tokens_seen": 38779808, "step": 57525 }, { "epoch": 1.4054674712334791, "grad_norm": 0.1447640359401703, "learning_rate": 1.8067816234391925e-06, "loss": 0.1562, "num_input_tokens_seen": 38783200, "step": 57530 }, { "epoch": 1.4055896220653263, "grad_norm": 0.1753024160861969, "learning_rate": 1.8067312342584393e-06, "loss": 0.0736, "num_input_tokens_seen": 38786144, "step": 57535 }, { "epoch": 1.4057117728971735, "grad_norm": 0.07908373326063156, "learning_rate": 1.8066808392109193e-06, "loss": 0.1177, "num_input_tokens_seen": 38789344, "step": 57540 }, { "epoch": 1.4058339237290207, "grad_norm": 0.6226028203964233, "learning_rate": 1.8066304382969995e-06, "loss": 0.0773, "num_input_tokens_seen": 38792224, "step": 57545 }, { "epoch": 1.4059560745608677, "grad_norm": 81.13831329345703, "learning_rate": 1.8065800315170461e-06, "loss": 0.0862, "num_input_tokens_seen": 38796384, "step": 57550 }, { "epoch": 1.4060782253927149, "grad_norm": 0.05820373818278313, "learning_rate": 1.8065296188714259e-06, "loss": 0.0964, "num_input_tokens_seen": 38799712, "step": 57555 }, { "epoch": 1.406200376224562, "grad_norm": 15.190373420715332, "learning_rate": 1.8064792003605054e-06, "loss": 0.1511, "num_input_tokens_seen": 38803360, "step": 57560 }, { "epoch": 1.4063225270564093, "grad_norm": 19.76511001586914, "learning_rate": 1.806428775984651e-06, "loss": 0.073, "num_input_tokens_seen": 38806752, "step": 57565 }, { "epoch": 1.4064446778882564, "grad_norm": 102.18384552001953, "learning_rate": 1.80637834574423e-06, "loss": 0.0458, "num_input_tokens_seen": 38809760, "step": 57570 }, { "epoch": 1.4065668287201036, "grad_norm": 0.060823310166597366, "learning_rate": 1.8063279096396084e-06, "loss": 0.1079, "num_input_tokens_seen": 38813408, "step": 57575 }, { "epoch": 1.4066889795519508, "grad_norm": 193.80770874023438, "learning_rate": 1.8062774676711534e-06, "loss": 0.0348, "num_input_tokens_seen": 38816608, "step": 57580 }, { "epoch": 1.4068111303837978, "grad_norm": 2.4402763843536377, "learning_rate": 1.8062270198392322e-06, "loss": 0.0337, "num_input_tokens_seen": 38819552, "step": 57585 }, { "epoch": 1.406933281215645, "grad_norm": 11.247885704040527, "learning_rate": 1.8061765661442108e-06, "loss": 0.0309, "num_input_tokens_seen": 38822688, "step": 57590 }, { "epoch": 1.4070554320474922, "grad_norm": 0.40253564715385437, "learning_rate": 1.8061261065864568e-06, "loss": 0.1429, "num_input_tokens_seen": 38826208, "step": 57595 }, { "epoch": 1.4071775828793394, "grad_norm": 8.919163703918457, "learning_rate": 1.806075641166337e-06, "loss": 0.2824, "num_input_tokens_seen": 38829664, "step": 57600 }, { "epoch": 1.4072997337111866, "grad_norm": 30.795196533203125, "learning_rate": 1.8060251698842182e-06, "loss": 0.0616, "num_input_tokens_seen": 38833184, "step": 57605 }, { "epoch": 1.4074218845430337, "grad_norm": 0.13794507086277008, "learning_rate": 1.8059746927404676e-06, "loss": 0.1233, "num_input_tokens_seen": 38836384, "step": 57610 }, { "epoch": 1.407544035374881, "grad_norm": 33.86235809326172, "learning_rate": 1.8059242097354522e-06, "loss": 0.0892, "num_input_tokens_seen": 38840032, "step": 57615 }, { "epoch": 1.4076661862067281, "grad_norm": 0.26369521021842957, "learning_rate": 1.8058737208695391e-06, "loss": 0.0226, "num_input_tokens_seen": 38843360, "step": 57620 }, { "epoch": 1.4077883370385753, "grad_norm": 0.03817284107208252, "learning_rate": 1.8058232261430957e-06, "loss": 0.2279, "num_input_tokens_seen": 38846560, "step": 57625 }, { "epoch": 1.4079104878704225, "grad_norm": 0.21635249257087708, "learning_rate": 1.8057727255564892e-06, "loss": 0.0297, "num_input_tokens_seen": 38849888, "step": 57630 }, { "epoch": 1.4080326387022697, "grad_norm": 0.21980540454387665, "learning_rate": 1.8057222191100863e-06, "loss": 0.1256, "num_input_tokens_seen": 38853472, "step": 57635 }, { "epoch": 1.4081547895341167, "grad_norm": 89.9726333618164, "learning_rate": 1.805671706804255e-06, "loss": 0.114, "num_input_tokens_seen": 38857120, "step": 57640 }, { "epoch": 1.4082769403659638, "grad_norm": 18.657129287719727, "learning_rate": 1.8056211886393622e-06, "loss": 0.1426, "num_input_tokens_seen": 38860512, "step": 57645 }, { "epoch": 1.408399091197811, "grad_norm": 93.33203125, "learning_rate": 1.8055706646157756e-06, "loss": 0.1036, "num_input_tokens_seen": 38864160, "step": 57650 }, { "epoch": 1.4085212420296582, "grad_norm": 0.14268730580806732, "learning_rate": 1.8055201347338625e-06, "loss": 0.0018, "num_input_tokens_seen": 38867424, "step": 57655 }, { "epoch": 1.4086433928615054, "grad_norm": 1.1824917793273926, "learning_rate": 1.8054695989939904e-06, "loss": 0.063, "num_input_tokens_seen": 38870688, "step": 57660 }, { "epoch": 1.4087655436933526, "grad_norm": 0.21655486524105072, "learning_rate": 1.8054190573965263e-06, "loss": 0.0545, "num_input_tokens_seen": 38874080, "step": 57665 }, { "epoch": 1.4088876945251998, "grad_norm": 181.86268615722656, "learning_rate": 1.8053685099418385e-06, "loss": 0.0517, "num_input_tokens_seen": 38877088, "step": 57670 }, { "epoch": 1.4090098453570468, "grad_norm": 2.238178253173828, "learning_rate": 1.8053179566302942e-06, "loss": 0.0338, "num_input_tokens_seen": 38880224, "step": 57675 }, { "epoch": 1.409131996188894, "grad_norm": 0.26335620880126953, "learning_rate": 1.805267397462261e-06, "loss": 0.1188, "num_input_tokens_seen": 38883424, "step": 57680 }, { "epoch": 1.4092541470207411, "grad_norm": 0.3708566725254059, "learning_rate": 1.805216832438107e-06, "loss": 0.0563, "num_input_tokens_seen": 38886752, "step": 57685 }, { "epoch": 1.4093762978525883, "grad_norm": 0.582304060459137, "learning_rate": 1.8051662615581994e-06, "loss": 0.0523, "num_input_tokens_seen": 38890336, "step": 57690 }, { "epoch": 1.4094984486844355, "grad_norm": 0.12347531318664551, "learning_rate": 1.805115684822906e-06, "loss": 0.0014, "num_input_tokens_seen": 38893792, "step": 57695 }, { "epoch": 1.4096205995162827, "grad_norm": 0.10339561849832535, "learning_rate": 1.8050651022325952e-06, "loss": 0.001, "num_input_tokens_seen": 38897120, "step": 57700 }, { "epoch": 1.40974275034813, "grad_norm": 3.116520643234253, "learning_rate": 1.805014513787634e-06, "loss": 0.0374, "num_input_tokens_seen": 38900576, "step": 57705 }, { "epoch": 1.409864901179977, "grad_norm": 28.094335556030273, "learning_rate": 1.804963919488391e-06, "loss": 0.142, "num_input_tokens_seen": 38904544, "step": 57710 }, { "epoch": 1.4099870520118243, "grad_norm": 0.1864064782857895, "learning_rate": 1.804913319335234e-06, "loss": 0.0544, "num_input_tokens_seen": 38907616, "step": 57715 }, { "epoch": 1.4101092028436715, "grad_norm": 10.022531509399414, "learning_rate": 1.8048627133285306e-06, "loss": 0.198, "num_input_tokens_seen": 38910752, "step": 57720 }, { "epoch": 1.4102313536755187, "grad_norm": 16.5054931640625, "learning_rate": 1.804812101468649e-06, "loss": 0.0391, "num_input_tokens_seen": 38913760, "step": 57725 }, { "epoch": 1.4103535045073656, "grad_norm": 17.392940521240234, "learning_rate": 1.8047614837559574e-06, "loss": 0.059, "num_input_tokens_seen": 38916640, "step": 57730 }, { "epoch": 1.4104756553392128, "grad_norm": 22.643449783325195, "learning_rate": 1.8047108601908243e-06, "loss": 0.0686, "num_input_tokens_seen": 38919520, "step": 57735 }, { "epoch": 1.41059780617106, "grad_norm": 0.13620342314243317, "learning_rate": 1.8046602307736168e-06, "loss": 0.0847, "num_input_tokens_seen": 38922784, "step": 57740 }, { "epoch": 1.4107199570029072, "grad_norm": 0.21663911640644073, "learning_rate": 1.8046095955047038e-06, "loss": 0.0265, "num_input_tokens_seen": 38926688, "step": 57745 }, { "epoch": 1.4108421078347544, "grad_norm": 1.4268443584442139, "learning_rate": 1.8045589543844537e-06, "loss": 0.1017, "num_input_tokens_seen": 38929824, "step": 57750 }, { "epoch": 1.4109642586666016, "grad_norm": 1.3341022729873657, "learning_rate": 1.8045083074132341e-06, "loss": 0.0632, "num_input_tokens_seen": 38933408, "step": 57755 }, { "epoch": 1.4110864094984485, "grad_norm": 0.027623578906059265, "learning_rate": 1.804457654591414e-06, "loss": 0.102, "num_input_tokens_seen": 38937184, "step": 57760 }, { "epoch": 1.4112085603302957, "grad_norm": 0.09516473114490509, "learning_rate": 1.8044069959193612e-06, "loss": 0.0575, "num_input_tokens_seen": 38940512, "step": 57765 }, { "epoch": 1.411330711162143, "grad_norm": 0.0526285395026207, "learning_rate": 1.8043563313974445e-06, "loss": 0.0731, "num_input_tokens_seen": 38943968, "step": 57770 }, { "epoch": 1.4114528619939901, "grad_norm": 0.2618235945701599, "learning_rate": 1.8043056610260324e-06, "loss": 0.0789, "num_input_tokens_seen": 38947680, "step": 57775 }, { "epoch": 1.4115750128258373, "grad_norm": 0.09851998090744019, "learning_rate": 1.804254984805493e-06, "loss": 0.0939, "num_input_tokens_seen": 38951008, "step": 57780 }, { "epoch": 1.4116971636576845, "grad_norm": 0.20549650490283966, "learning_rate": 1.804204302736195e-06, "loss": 0.1177, "num_input_tokens_seen": 38954208, "step": 57785 }, { "epoch": 1.4118193144895317, "grad_norm": 1.1573891639709473, "learning_rate": 1.804153614818507e-06, "loss": 0.0814, "num_input_tokens_seen": 38957344, "step": 57790 }, { "epoch": 1.4119414653213789, "grad_norm": 0.22255335748195648, "learning_rate": 1.8041029210527976e-06, "loss": 0.0297, "num_input_tokens_seen": 38960928, "step": 57795 }, { "epoch": 1.412063616153226, "grad_norm": 0.2250811904668808, "learning_rate": 1.8040522214394356e-06, "loss": 0.0021, "num_input_tokens_seen": 38963936, "step": 57800 }, { "epoch": 1.4121857669850733, "grad_norm": 0.12040767073631287, "learning_rate": 1.8040015159787894e-06, "loss": 0.0705, "num_input_tokens_seen": 38967584, "step": 57805 }, { "epoch": 1.4123079178169204, "grad_norm": 0.05583275854587555, "learning_rate": 1.8039508046712281e-06, "loss": 0.0421, "num_input_tokens_seen": 38971104, "step": 57810 }, { "epoch": 1.4124300686487676, "grad_norm": 68.72763061523438, "learning_rate": 1.8039000875171202e-06, "loss": 0.1292, "num_input_tokens_seen": 38974112, "step": 57815 }, { "epoch": 1.4125522194806146, "grad_norm": 0.1513207107782364, "learning_rate": 1.8038493645168349e-06, "loss": 0.0562, "num_input_tokens_seen": 38977440, "step": 57820 }, { "epoch": 1.4126743703124618, "grad_norm": 0.06547381728887558, "learning_rate": 1.8037986356707404e-06, "loss": 0.0516, "num_input_tokens_seen": 38980896, "step": 57825 }, { "epoch": 1.412796521144309, "grad_norm": 0.052669234573841095, "learning_rate": 1.8037479009792062e-06, "loss": 0.0619, "num_input_tokens_seen": 38984032, "step": 57830 }, { "epoch": 1.4129186719761562, "grad_norm": 49.396663665771484, "learning_rate": 1.8036971604426015e-06, "loss": 0.1209, "num_input_tokens_seen": 38987168, "step": 57835 }, { "epoch": 1.4130408228080034, "grad_norm": 0.13010339438915253, "learning_rate": 1.8036464140612943e-06, "loss": 0.0015, "num_input_tokens_seen": 38990560, "step": 57840 }, { "epoch": 1.4131629736398506, "grad_norm": 0.014206153340637684, "learning_rate": 1.8035956618356546e-06, "loss": 0.0598, "num_input_tokens_seen": 38994208, "step": 57845 }, { "epoch": 1.4132851244716975, "grad_norm": 327.2905578613281, "learning_rate": 1.8035449037660508e-06, "loss": 0.124, "num_input_tokens_seen": 38997600, "step": 57850 }, { "epoch": 1.4134072753035447, "grad_norm": 0.030538057908415794, "learning_rate": 1.8034941398528525e-06, "loss": 0.0878, "num_input_tokens_seen": 39000992, "step": 57855 }, { "epoch": 1.413529426135392, "grad_norm": 0.0992370992898941, "learning_rate": 1.8034433700964287e-06, "loss": 0.0991, "num_input_tokens_seen": 39004192, "step": 57860 }, { "epoch": 1.413651576967239, "grad_norm": 1.4549504518508911, "learning_rate": 1.8033925944971484e-06, "loss": 0.0457, "num_input_tokens_seen": 39007648, "step": 57865 }, { "epoch": 1.4137737277990863, "grad_norm": 0.08177211880683899, "learning_rate": 1.8033418130553812e-06, "loss": 0.1027, "num_input_tokens_seen": 39011104, "step": 57870 }, { "epoch": 1.4138958786309335, "grad_norm": 0.17393673956394196, "learning_rate": 1.8032910257714966e-06, "loss": 0.0397, "num_input_tokens_seen": 39014624, "step": 57875 }, { "epoch": 1.4140180294627807, "grad_norm": 0.10555342584848404, "learning_rate": 1.803240232645863e-06, "loss": 0.1179, "num_input_tokens_seen": 39017696, "step": 57880 }, { "epoch": 1.4141401802946278, "grad_norm": 0.06716419011354446, "learning_rate": 1.803189433678851e-06, "loss": 0.0424, "num_input_tokens_seen": 39020896, "step": 57885 }, { "epoch": 1.414262331126475, "grad_norm": 17.76363182067871, "learning_rate": 1.803138628870829e-06, "loss": 0.0841, "num_input_tokens_seen": 39024864, "step": 57890 }, { "epoch": 1.4143844819583222, "grad_norm": 0.2943059802055359, "learning_rate": 1.803087818222167e-06, "loss": 0.1301, "num_input_tokens_seen": 39028896, "step": 57895 }, { "epoch": 1.4145066327901694, "grad_norm": 14.014280319213867, "learning_rate": 1.803037001733234e-06, "loss": 0.11, "num_input_tokens_seen": 39032224, "step": 57900 }, { "epoch": 1.4146287836220164, "grad_norm": 0.12080468982458115, "learning_rate": 1.8029861794044005e-06, "loss": 0.0814, "num_input_tokens_seen": 39035936, "step": 57905 }, { "epoch": 1.4147509344538636, "grad_norm": 0.2700631022453308, "learning_rate": 1.8029353512360354e-06, "loss": 0.0946, "num_input_tokens_seen": 39039200, "step": 57910 }, { "epoch": 1.4148730852857108, "grad_norm": 9.89631462097168, "learning_rate": 1.8028845172285083e-06, "loss": 0.1305, "num_input_tokens_seen": 39042464, "step": 57915 }, { "epoch": 1.414995236117558, "grad_norm": 0.3272474706172943, "learning_rate": 1.802833677382189e-06, "loss": 0.0931, "num_input_tokens_seen": 39045472, "step": 57920 }, { "epoch": 1.4151173869494051, "grad_norm": 18.658485412597656, "learning_rate": 1.8027828316974476e-06, "loss": 0.1595, "num_input_tokens_seen": 39048544, "step": 57925 }, { "epoch": 1.4152395377812523, "grad_norm": 21.56070327758789, "learning_rate": 1.8027319801746532e-06, "loss": 0.21, "num_input_tokens_seen": 39051936, "step": 57930 }, { "epoch": 1.4153616886130995, "grad_norm": 0.30690234899520874, "learning_rate": 1.8026811228141762e-06, "loss": 0.13, "num_input_tokens_seen": 39055648, "step": 57935 }, { "epoch": 1.4154838394449465, "grad_norm": 0.8390116095542908, "learning_rate": 1.8026302596163857e-06, "loss": 0.0517, "num_input_tokens_seen": 39058656, "step": 57940 }, { "epoch": 1.4156059902767937, "grad_norm": 98.68684387207031, "learning_rate": 1.8025793905816523e-06, "loss": 0.045, "num_input_tokens_seen": 39061920, "step": 57945 }, { "epoch": 1.4157281411086409, "grad_norm": 0.09113955497741699, "learning_rate": 1.802528515710346e-06, "loss": 0.1386, "num_input_tokens_seen": 39064864, "step": 57950 }, { "epoch": 1.415850291940488, "grad_norm": 8.854911804199219, "learning_rate": 1.8024776350028363e-06, "loss": 0.1239, "num_input_tokens_seen": 39068448, "step": 57955 }, { "epoch": 1.4159724427723352, "grad_norm": 24.10318374633789, "learning_rate": 1.8024267484594933e-06, "loss": 0.1009, "num_input_tokens_seen": 39072480, "step": 57960 }, { "epoch": 1.4160945936041824, "grad_norm": 0.13761503994464874, "learning_rate": 1.8023758560806873e-06, "loss": 0.0409, "num_input_tokens_seen": 39075744, "step": 57965 }, { "epoch": 1.4162167444360296, "grad_norm": 0.2878974676132202, "learning_rate": 1.802324957866788e-06, "loss": 0.0018, "num_input_tokens_seen": 39079328, "step": 57970 }, { "epoch": 1.4163388952678768, "grad_norm": 0.13723404705524445, "learning_rate": 1.8022740538181662e-06, "loss": 0.016, "num_input_tokens_seen": 39082592, "step": 57975 }, { "epoch": 1.416461046099724, "grad_norm": 57.850948333740234, "learning_rate": 1.8022231439351914e-06, "loss": 0.2262, "num_input_tokens_seen": 39085920, "step": 57980 }, { "epoch": 1.4165831969315712, "grad_norm": 0.20545895397663116, "learning_rate": 1.8021722282182342e-06, "loss": 0.0397, "num_input_tokens_seen": 39088928, "step": 57985 }, { "epoch": 1.4167053477634184, "grad_norm": 1.8310734033584595, "learning_rate": 1.802121306667665e-06, "loss": 0.1332, "num_input_tokens_seen": 39092064, "step": 57990 }, { "epoch": 1.4168274985952654, "grad_norm": 0.4682214856147766, "learning_rate": 1.8020703792838535e-06, "loss": 0.1581, "num_input_tokens_seen": 39095456, "step": 57995 }, { "epoch": 1.4169496494271125, "grad_norm": 0.3079153001308441, "learning_rate": 1.8020194460671707e-06, "loss": 0.0014, "num_input_tokens_seen": 39098336, "step": 58000 }, { "epoch": 1.4170718002589597, "grad_norm": 0.013312513008713722, "learning_rate": 1.8019685070179868e-06, "loss": 0.0703, "num_input_tokens_seen": 39101856, "step": 58005 }, { "epoch": 1.417193951090807, "grad_norm": 15.343265533447266, "learning_rate": 1.8019175621366722e-06, "loss": 0.078, "num_input_tokens_seen": 39104800, "step": 58010 }, { "epoch": 1.4173161019226541, "grad_norm": 0.2046901136636734, "learning_rate": 1.8018666114235973e-06, "loss": 0.0502, "num_input_tokens_seen": 39108192, "step": 58015 }, { "epoch": 1.4174382527545013, "grad_norm": 22.222043991088867, "learning_rate": 1.801815654879133e-06, "loss": 0.0477, "num_input_tokens_seen": 39111648, "step": 58020 }, { "epoch": 1.4175604035863485, "grad_norm": 9.82966423034668, "learning_rate": 1.8017646925036495e-06, "loss": 0.0822, "num_input_tokens_seen": 39114976, "step": 58025 }, { "epoch": 1.4176825544181955, "grad_norm": 0.5586780905723572, "learning_rate": 1.8017137242975174e-06, "loss": 0.1607, "num_input_tokens_seen": 39118688, "step": 58030 }, { "epoch": 1.4178047052500427, "grad_norm": 0.5261523127555847, "learning_rate": 1.8016627502611072e-06, "loss": 0.1481, "num_input_tokens_seen": 39121824, "step": 58035 }, { "epoch": 1.4179268560818898, "grad_norm": 37.271671295166016, "learning_rate": 1.8016117703947902e-06, "loss": 0.1372, "num_input_tokens_seen": 39125280, "step": 58040 }, { "epoch": 1.418049006913737, "grad_norm": 0.5471317768096924, "learning_rate": 1.8015607846989367e-06, "loss": 0.0425, "num_input_tokens_seen": 39128416, "step": 58045 }, { "epoch": 1.4181711577455842, "grad_norm": 20.493061065673828, "learning_rate": 1.8015097931739175e-06, "loss": 0.067, "num_input_tokens_seen": 39131296, "step": 58050 }, { "epoch": 1.4182933085774314, "grad_norm": 0.3601451814174652, "learning_rate": 1.8014587958201038e-06, "loss": 0.118, "num_input_tokens_seen": 39134304, "step": 58055 }, { "epoch": 1.4184154594092786, "grad_norm": 31.987966537475586, "learning_rate": 1.801407792637866e-06, "loss": 0.0955, "num_input_tokens_seen": 39138208, "step": 58060 }, { "epoch": 1.4185376102411258, "grad_norm": 0.3367749750614166, "learning_rate": 1.801356783627575e-06, "loss": 0.0852, "num_input_tokens_seen": 39141344, "step": 58065 }, { "epoch": 1.418659761072973, "grad_norm": 108.01628112792969, "learning_rate": 1.8013057687896022e-06, "loss": 0.131, "num_input_tokens_seen": 39144672, "step": 58070 }, { "epoch": 1.4187819119048202, "grad_norm": 0.4019928574562073, "learning_rate": 1.8012547481243182e-06, "loss": 0.1219, "num_input_tokens_seen": 39147744, "step": 58075 }, { "epoch": 1.4189040627366674, "grad_norm": 0.054344676434993744, "learning_rate": 1.8012037216320942e-06, "loss": 0.0932, "num_input_tokens_seen": 39151392, "step": 58080 }, { "epoch": 1.4190262135685143, "grad_norm": 0.14216575026512146, "learning_rate": 1.8011526893133012e-06, "loss": 0.1037, "num_input_tokens_seen": 39154464, "step": 58085 }, { "epoch": 1.4191483644003615, "grad_norm": 10.013678550720215, "learning_rate": 1.8011016511683103e-06, "loss": 0.0417, "num_input_tokens_seen": 39157920, "step": 58090 }, { "epoch": 1.4192705152322087, "grad_norm": 21.88975715637207, "learning_rate": 1.8010506071974926e-06, "loss": 0.2024, "num_input_tokens_seen": 39161312, "step": 58095 }, { "epoch": 1.419392666064056, "grad_norm": 25.340179443359375, "learning_rate": 1.8009995574012198e-06, "loss": 0.1039, "num_input_tokens_seen": 39164448, "step": 58100 }, { "epoch": 1.419514816895903, "grad_norm": 21.190763473510742, "learning_rate": 1.8009485017798624e-06, "loss": 0.1066, "num_input_tokens_seen": 39167776, "step": 58105 }, { "epoch": 1.4196369677277503, "grad_norm": 0.21470917761325836, "learning_rate": 1.8008974403337924e-06, "loss": 0.1594, "num_input_tokens_seen": 39171104, "step": 58110 }, { "epoch": 1.4197591185595975, "grad_norm": 1.8665906190872192, "learning_rate": 1.8008463730633807e-06, "loss": 0.0865, "num_input_tokens_seen": 39174560, "step": 58115 }, { "epoch": 1.4198812693914444, "grad_norm": 0.051145486533641815, "learning_rate": 1.8007952999689989e-06, "loss": 0.0939, "num_input_tokens_seen": 39177760, "step": 58120 }, { "epoch": 1.4200034202232916, "grad_norm": 1.2939541339874268, "learning_rate": 1.800744221051018e-06, "loss": 0.049, "num_input_tokens_seen": 39181152, "step": 58125 }, { "epoch": 1.4201255710551388, "grad_norm": 11.657868385314941, "learning_rate": 1.80069313630981e-06, "loss": 0.1063, "num_input_tokens_seen": 39184544, "step": 58130 }, { "epoch": 1.420247721886986, "grad_norm": 0.22248221933841705, "learning_rate": 1.8006420457457457e-06, "loss": 0.0017, "num_input_tokens_seen": 39188000, "step": 58135 }, { "epoch": 1.4203698727188332, "grad_norm": 16.86621856689453, "learning_rate": 1.8005909493591975e-06, "loss": 0.1047, "num_input_tokens_seen": 39190816, "step": 58140 }, { "epoch": 1.4204920235506804, "grad_norm": 0.03387094661593437, "learning_rate": 1.8005398471505364e-06, "loss": 0.0524, "num_input_tokens_seen": 39194208, "step": 58145 }, { "epoch": 1.4206141743825276, "grad_norm": 0.009047556668519974, "learning_rate": 1.8004887391201343e-06, "loss": 0.2904, "num_input_tokens_seen": 39198048, "step": 58150 }, { "epoch": 1.4207363252143748, "grad_norm": 0.5070739388465881, "learning_rate": 1.8004376252683629e-06, "loss": 0.0442, "num_input_tokens_seen": 39201376, "step": 58155 }, { "epoch": 1.420858476046222, "grad_norm": 0.357709676027298, "learning_rate": 1.8003865055955938e-06, "loss": 0.061, "num_input_tokens_seen": 39204960, "step": 58160 }, { "epoch": 1.4209806268780691, "grad_norm": 2.56449556350708, "learning_rate": 1.8003353801021985e-06, "loss": 0.0012, "num_input_tokens_seen": 39208544, "step": 58165 }, { "epoch": 1.4211027777099163, "grad_norm": 16.43491554260254, "learning_rate": 1.8002842487885493e-06, "loss": 0.1808, "num_input_tokens_seen": 39211808, "step": 58170 }, { "epoch": 1.4212249285417633, "grad_norm": 0.1726624220609665, "learning_rate": 1.8002331116550176e-06, "loss": 0.0492, "num_input_tokens_seen": 39215264, "step": 58175 }, { "epoch": 1.4213470793736105, "grad_norm": 0.0881342962384224, "learning_rate": 1.8001819687019758e-06, "loss": 0.0165, "num_input_tokens_seen": 39218400, "step": 58180 }, { "epoch": 1.4214692302054577, "grad_norm": 0.7044225931167603, "learning_rate": 1.800130819929795e-06, "loss": 0.11, "num_input_tokens_seen": 39221856, "step": 58185 }, { "epoch": 1.4215913810373049, "grad_norm": 0.49497997760772705, "learning_rate": 1.800079665338848e-06, "loss": 0.0348, "num_input_tokens_seen": 39225184, "step": 58190 }, { "epoch": 1.421713531869152, "grad_norm": 12.24195671081543, "learning_rate": 1.8000285049295066e-06, "loss": 0.1837, "num_input_tokens_seen": 39228128, "step": 58195 }, { "epoch": 1.4218356827009992, "grad_norm": 4.202094078063965, "learning_rate": 1.7999773387021423e-06, "loss": 0.0519, "num_input_tokens_seen": 39231584, "step": 58200 }, { "epoch": 1.4219578335328464, "grad_norm": 0.598829448223114, "learning_rate": 1.7999261666571281e-06, "loss": 0.2783, "num_input_tokens_seen": 39234720, "step": 58205 }, { "epoch": 1.4220799843646934, "grad_norm": 0.19293074309825897, "learning_rate": 1.7998749887948352e-06, "loss": 0.1263, "num_input_tokens_seen": 39238560, "step": 58210 }, { "epoch": 1.4222021351965406, "grad_norm": 0.10075836628675461, "learning_rate": 1.7998238051156367e-06, "loss": 0.0268, "num_input_tokens_seen": 39242272, "step": 58215 }, { "epoch": 1.4223242860283878, "grad_norm": 0.21634642779827118, "learning_rate": 1.799772615619904e-06, "loss": 0.0969, "num_input_tokens_seen": 39245600, "step": 58220 }, { "epoch": 1.422446436860235, "grad_norm": 0.28032878041267395, "learning_rate": 1.79972142030801e-06, "loss": 0.0824, "num_input_tokens_seen": 39249248, "step": 58225 }, { "epoch": 1.4225685876920822, "grad_norm": 0.16427673399448395, "learning_rate": 1.7996702191803265e-06, "loss": 0.0967, "num_input_tokens_seen": 39252512, "step": 58230 }, { "epoch": 1.4226907385239294, "grad_norm": 0.6388681530952454, "learning_rate": 1.7996190122372262e-06, "loss": 0.0322, "num_input_tokens_seen": 39255904, "step": 58235 }, { "epoch": 1.4228128893557765, "grad_norm": 0.6887816190719604, "learning_rate": 1.7995677994790813e-06, "loss": 0.0035, "num_input_tokens_seen": 39259104, "step": 58240 }, { "epoch": 1.4229350401876237, "grad_norm": 0.16875456273555756, "learning_rate": 1.7995165809062644e-06, "loss": 0.1021, "num_input_tokens_seen": 39262368, "step": 58245 }, { "epoch": 1.423057191019471, "grad_norm": 16.76205825805664, "learning_rate": 1.7994653565191478e-06, "loss": 0.1678, "num_input_tokens_seen": 39266016, "step": 58250 }, { "epoch": 1.4231793418513181, "grad_norm": 22.382341384887695, "learning_rate": 1.799414126318104e-06, "loss": 0.2146, "num_input_tokens_seen": 39269472, "step": 58255 }, { "epoch": 1.4233014926831653, "grad_norm": 0.4394358694553375, "learning_rate": 1.7993628903035058e-06, "loss": 0.0499, "num_input_tokens_seen": 39272800, "step": 58260 }, { "epoch": 1.4234236435150123, "grad_norm": 1.7339340448379517, "learning_rate": 1.7993116484757259e-06, "loss": 0.1058, "num_input_tokens_seen": 39276320, "step": 58265 }, { "epoch": 1.4235457943468595, "grad_norm": 0.19091808795928955, "learning_rate": 1.7992604008351364e-06, "loss": 0.0026, "num_input_tokens_seen": 39279776, "step": 58270 }, { "epoch": 1.4236679451787067, "grad_norm": 33.9887809753418, "learning_rate": 1.7992091473821102e-06, "loss": 0.0919, "num_input_tokens_seen": 39283360, "step": 58275 }, { "epoch": 1.4237900960105538, "grad_norm": 107.04004669189453, "learning_rate": 1.7991578881170203e-06, "loss": 0.2179, "num_input_tokens_seen": 39286880, "step": 58280 }, { "epoch": 1.423912246842401, "grad_norm": 19.60547637939453, "learning_rate": 1.7991066230402392e-06, "loss": 0.1943, "num_input_tokens_seen": 39290016, "step": 58285 }, { "epoch": 1.4240343976742482, "grad_norm": 0.8691688776016235, "learning_rate": 1.79905535215214e-06, "loss": 0.0399, "num_input_tokens_seen": 39293408, "step": 58290 }, { "epoch": 1.4241565485060952, "grad_norm": 0.0896698608994484, "learning_rate": 1.799004075453095e-06, "loss": 0.1184, "num_input_tokens_seen": 39297056, "step": 58295 }, { "epoch": 1.4242786993379424, "grad_norm": 0.5838584303855896, "learning_rate": 1.7989527929434777e-06, "loss": 0.0444, "num_input_tokens_seen": 39300128, "step": 58300 }, { "epoch": 1.4244008501697896, "grad_norm": 11.378144264221191, "learning_rate": 1.7989015046236608e-06, "loss": 0.1591, "num_input_tokens_seen": 39303456, "step": 58305 }, { "epoch": 1.4245230010016368, "grad_norm": 20.538105010986328, "learning_rate": 1.798850210494017e-06, "loss": 0.101, "num_input_tokens_seen": 39306848, "step": 58310 }, { "epoch": 1.424645151833484, "grad_norm": 13.059782981872559, "learning_rate": 1.79879891055492e-06, "loss": 0.0354, "num_input_tokens_seen": 39309984, "step": 58315 }, { "epoch": 1.4247673026653311, "grad_norm": 41.89487075805664, "learning_rate": 1.7987476048067425e-06, "loss": 0.0058, "num_input_tokens_seen": 39313120, "step": 58320 }, { "epoch": 1.4248894534971783, "grad_norm": 38.37577438354492, "learning_rate": 1.7986962932498572e-06, "loss": 0.113, "num_input_tokens_seen": 39316512, "step": 58325 }, { "epoch": 1.4250116043290255, "grad_norm": 13.37543773651123, "learning_rate": 1.7986449758846378e-06, "loss": 0.1018, "num_input_tokens_seen": 39319712, "step": 58330 }, { "epoch": 1.4251337551608727, "grad_norm": 47.16141891479492, "learning_rate": 1.7985936527114576e-06, "loss": 0.1428, "num_input_tokens_seen": 39322912, "step": 58335 }, { "epoch": 1.42525590599272, "grad_norm": 0.9388689398765564, "learning_rate": 1.798542323730689e-06, "loss": 0.0386, "num_input_tokens_seen": 39326304, "step": 58340 }, { "epoch": 1.425378056824567, "grad_norm": 54.705787658691406, "learning_rate": 1.7984909889427065e-06, "loss": 0.1189, "num_input_tokens_seen": 39329696, "step": 58345 }, { "epoch": 1.4255002076564143, "grad_norm": 0.6181700825691223, "learning_rate": 1.798439648347882e-06, "loss": 0.0391, "num_input_tokens_seen": 39332896, "step": 58350 }, { "epoch": 1.4256223584882612, "grad_norm": 0.873293936252594, "learning_rate": 1.7983883019465905e-06, "loss": 0.0033, "num_input_tokens_seen": 39336416, "step": 58355 }, { "epoch": 1.4257445093201084, "grad_norm": 78.40597534179688, "learning_rate": 1.7983369497392038e-06, "loss": 0.1611, "num_input_tokens_seen": 39339552, "step": 58360 }, { "epoch": 1.4258666601519556, "grad_norm": 0.04144344478845596, "learning_rate": 1.7982855917260965e-06, "loss": 0.0557, "num_input_tokens_seen": 39343008, "step": 58365 }, { "epoch": 1.4259888109838028, "grad_norm": 23.530664443969727, "learning_rate": 1.7982342279076415e-06, "loss": 0.0352, "num_input_tokens_seen": 39346848, "step": 58370 }, { "epoch": 1.42611096181565, "grad_norm": 0.06019030511379242, "learning_rate": 1.7981828582842122e-06, "loss": 0.0015, "num_input_tokens_seen": 39351584, "step": 58375 }, { "epoch": 1.4262331126474972, "grad_norm": 99.20355987548828, "learning_rate": 1.7981314828561829e-06, "loss": 0.1958, "num_input_tokens_seen": 39354976, "step": 58380 }, { "epoch": 1.4263552634793442, "grad_norm": 0.35224005579948425, "learning_rate": 1.7980801016239267e-06, "loss": 0.0402, "num_input_tokens_seen": 39357984, "step": 58385 }, { "epoch": 1.4264774143111913, "grad_norm": 102.94215393066406, "learning_rate": 1.7980287145878173e-06, "loss": 0.1442, "num_input_tokens_seen": 39361440, "step": 58390 }, { "epoch": 1.4265995651430385, "grad_norm": 16.342992782592773, "learning_rate": 1.7979773217482284e-06, "loss": 0.1693, "num_input_tokens_seen": 39364256, "step": 58395 }, { "epoch": 1.4267217159748857, "grad_norm": 0.09097766876220703, "learning_rate": 1.7979259231055338e-06, "loss": 0.0666, "num_input_tokens_seen": 39367712, "step": 58400 }, { "epoch": 1.426843866806733, "grad_norm": 0.2545751929283142, "learning_rate": 1.7978745186601075e-06, "loss": 0.1043, "num_input_tokens_seen": 39370656, "step": 58405 }, { "epoch": 1.42696601763858, "grad_norm": 151.2428436279297, "learning_rate": 1.7978231084123229e-06, "loss": 0.0842, "num_input_tokens_seen": 39374048, "step": 58410 }, { "epoch": 1.4270881684704273, "grad_norm": 0.01164193358272314, "learning_rate": 1.7977716923625538e-06, "loss": 0.0402, "num_input_tokens_seen": 39377568, "step": 58415 }, { "epoch": 1.4272103193022745, "grad_norm": 0.5238284468650818, "learning_rate": 1.7977202705111746e-06, "loss": 0.0249, "num_input_tokens_seen": 39381280, "step": 58420 }, { "epoch": 1.4273324701341217, "grad_norm": 36.011661529541016, "learning_rate": 1.7976688428585592e-06, "loss": 0.2239, "num_input_tokens_seen": 39384608, "step": 58425 }, { "epoch": 1.4274546209659689, "grad_norm": 18.976518630981445, "learning_rate": 1.7976174094050813e-06, "loss": 0.0534, "num_input_tokens_seen": 39388192, "step": 58430 }, { "epoch": 1.427576771797816, "grad_norm": 14.480353355407715, "learning_rate": 1.797565970151115e-06, "loss": 0.103, "num_input_tokens_seen": 39391520, "step": 58435 }, { "epoch": 1.427698922629663, "grad_norm": 0.8790879845619202, "learning_rate": 1.7975145250970346e-06, "loss": 0.1048, "num_input_tokens_seen": 39394784, "step": 58440 }, { "epoch": 1.4278210734615102, "grad_norm": 0.1189805343747139, "learning_rate": 1.797463074243214e-06, "loss": 0.1401, "num_input_tokens_seen": 39398048, "step": 58445 }, { "epoch": 1.4279432242933574, "grad_norm": 59.93393325805664, "learning_rate": 1.7974116175900273e-06, "loss": 0.0479, "num_input_tokens_seen": 39401568, "step": 58450 }, { "epoch": 1.4280653751252046, "grad_norm": 22.313800811767578, "learning_rate": 1.797360155137849e-06, "loss": 0.0029, "num_input_tokens_seen": 39404576, "step": 58455 }, { "epoch": 1.4281875259570518, "grad_norm": 0.21892331540584564, "learning_rate": 1.797308686887053e-06, "loss": 0.1259, "num_input_tokens_seen": 39407712, "step": 58460 }, { "epoch": 1.428309676788899, "grad_norm": 0.3004763722419739, "learning_rate": 1.797257212838014e-06, "loss": 0.0671, "num_input_tokens_seen": 39410976, "step": 58465 }, { "epoch": 1.4284318276207462, "grad_norm": 0.31758275628089905, "learning_rate": 1.797205732991106e-06, "loss": 0.0049, "num_input_tokens_seen": 39414432, "step": 58470 }, { "epoch": 1.4285539784525931, "grad_norm": 16.61806297302246, "learning_rate": 1.7971542473467036e-06, "loss": 0.1524, "num_input_tokens_seen": 39417760, "step": 58475 }, { "epoch": 1.4286761292844403, "grad_norm": 9.997937202453613, "learning_rate": 1.797102755905181e-06, "loss": 0.097, "num_input_tokens_seen": 39420960, "step": 58480 }, { "epoch": 1.4287982801162875, "grad_norm": 12.6223726272583, "learning_rate": 1.7970512586669128e-06, "loss": 0.0374, "num_input_tokens_seen": 39424224, "step": 58485 }, { "epoch": 1.4289204309481347, "grad_norm": 3.157579183578491, "learning_rate": 1.7969997556322736e-06, "loss": 0.0026, "num_input_tokens_seen": 39427872, "step": 58490 }, { "epoch": 1.429042581779982, "grad_norm": 2.2822046279907227, "learning_rate": 1.7969482468016377e-06, "loss": 0.132, "num_input_tokens_seen": 39431520, "step": 58495 }, { "epoch": 1.429164732611829, "grad_norm": 0.06637066602706909, "learning_rate": 1.7968967321753796e-06, "loss": 0.0397, "num_input_tokens_seen": 39435552, "step": 58500 }, { "epoch": 1.4292868834436763, "grad_norm": 0.06641530990600586, "learning_rate": 1.7968452117538742e-06, "loss": 0.0854, "num_input_tokens_seen": 39439200, "step": 58505 }, { "epoch": 1.4294090342755235, "grad_norm": 1.0716360807418823, "learning_rate": 1.7967936855374964e-06, "loss": 0.0433, "num_input_tokens_seen": 39442400, "step": 58510 }, { "epoch": 1.4295311851073706, "grad_norm": 24.701417922973633, "learning_rate": 1.7967421535266203e-06, "loss": 0.1447, "num_input_tokens_seen": 39445600, "step": 58515 }, { "epoch": 1.4296533359392178, "grad_norm": 8.375645637512207, "learning_rate": 1.796690615721621e-06, "loss": 0.0557, "num_input_tokens_seen": 39449056, "step": 58520 }, { "epoch": 1.429775486771065, "grad_norm": 0.015680750831961632, "learning_rate": 1.7966390721228733e-06, "loss": 0.0828, "num_input_tokens_seen": 39452768, "step": 58525 }, { "epoch": 1.429897637602912, "grad_norm": 0.13656115531921387, "learning_rate": 1.7965875227307522e-06, "loss": 0.1749, "num_input_tokens_seen": 39456352, "step": 58530 }, { "epoch": 1.4300197884347592, "grad_norm": 39.53352737426758, "learning_rate": 1.796535967545632e-06, "loss": 0.1555, "num_input_tokens_seen": 39459360, "step": 58535 }, { "epoch": 1.4301419392666064, "grad_norm": 0.15889528393745422, "learning_rate": 1.7964844065678882e-06, "loss": 0.0359, "num_input_tokens_seen": 39462944, "step": 58540 }, { "epoch": 1.4302640900984536, "grad_norm": 13.821887969970703, "learning_rate": 1.7964328397978954e-06, "loss": 0.0913, "num_input_tokens_seen": 39466336, "step": 58545 }, { "epoch": 1.4303862409303008, "grad_norm": 30.154619216918945, "learning_rate": 1.796381267236029e-06, "loss": 0.0912, "num_input_tokens_seen": 39469664, "step": 58550 }, { "epoch": 1.430508391762148, "grad_norm": 13.006333351135254, "learning_rate": 1.7963296888826638e-06, "loss": 0.1042, "num_input_tokens_seen": 39473056, "step": 58555 }, { "epoch": 1.4306305425939951, "grad_norm": 3.447408676147461, "learning_rate": 1.796278104738175e-06, "loss": 0.0864, "num_input_tokens_seen": 39476384, "step": 58560 }, { "epoch": 1.430752693425842, "grad_norm": 0.3481005132198334, "learning_rate": 1.7962265148029374e-06, "loss": 0.0593, "num_input_tokens_seen": 39479520, "step": 58565 }, { "epoch": 1.4308748442576893, "grad_norm": 0.144733265042305, "learning_rate": 1.7961749190773263e-06, "loss": 0.1574, "num_input_tokens_seen": 39483040, "step": 58570 }, { "epoch": 1.4309969950895365, "grad_norm": 52.843868255615234, "learning_rate": 1.7961233175617173e-06, "loss": 0.1276, "num_input_tokens_seen": 39486560, "step": 58575 }, { "epoch": 1.4311191459213837, "grad_norm": 0.256605327129364, "learning_rate": 1.7960717102564855e-06, "loss": 0.0025, "num_input_tokens_seen": 39489888, "step": 58580 }, { "epoch": 1.4312412967532309, "grad_norm": 0.4557490646839142, "learning_rate": 1.796020097162006e-06, "loss": 0.1429, "num_input_tokens_seen": 39493344, "step": 58585 }, { "epoch": 1.431363447585078, "grad_norm": 89.59080505371094, "learning_rate": 1.7959684782786542e-06, "loss": 0.0477, "num_input_tokens_seen": 39496608, "step": 58590 }, { "epoch": 1.4314855984169252, "grad_norm": 0.42434826493263245, "learning_rate": 1.7959168536068056e-06, "loss": 0.0153, "num_input_tokens_seen": 39499552, "step": 58595 }, { "epoch": 1.4316077492487724, "grad_norm": 36.179927825927734, "learning_rate": 1.7958652231468357e-06, "loss": 0.1024, "num_input_tokens_seen": 39503008, "step": 58600 }, { "epoch": 1.4317299000806196, "grad_norm": 0.0644034817814827, "learning_rate": 1.7958135868991195e-06, "loss": 0.0562, "num_input_tokens_seen": 39506400, "step": 58605 }, { "epoch": 1.4318520509124668, "grad_norm": 0.04763718321919441, "learning_rate": 1.7957619448640332e-06, "loss": 0.1308, "num_input_tokens_seen": 39510240, "step": 58610 }, { "epoch": 1.431974201744314, "grad_norm": 18.100360870361328, "learning_rate": 1.7957102970419516e-06, "loss": 0.0285, "num_input_tokens_seen": 39513504, "step": 58615 }, { "epoch": 1.432096352576161, "grad_norm": 32.79466247558594, "learning_rate": 1.795658643433251e-06, "loss": 0.0449, "num_input_tokens_seen": 39516960, "step": 58620 }, { "epoch": 1.4322185034080082, "grad_norm": 0.055818889290094376, "learning_rate": 1.7956069840383066e-06, "loss": 0.0842, "num_input_tokens_seen": 39520544, "step": 58625 }, { "epoch": 1.4323406542398553, "grad_norm": 0.12097417563199997, "learning_rate": 1.7955553188574944e-06, "loss": 0.0956, "num_input_tokens_seen": 39524256, "step": 58630 }, { "epoch": 1.4324628050717025, "grad_norm": 0.05628423020243645, "learning_rate": 1.7955036478911896e-06, "loss": 0.1319, "num_input_tokens_seen": 39528032, "step": 58635 }, { "epoch": 1.4325849559035497, "grad_norm": 21.71570587158203, "learning_rate": 1.7954519711397689e-06, "loss": 0.0841, "num_input_tokens_seen": 39531424, "step": 58640 }, { "epoch": 1.432707106735397, "grad_norm": 0.11519166082143784, "learning_rate": 1.795400288603607e-06, "loss": 0.0688, "num_input_tokens_seen": 39535200, "step": 58645 }, { "epoch": 1.432829257567244, "grad_norm": 0.2426871657371521, "learning_rate": 1.7953486002830802e-06, "loss": 0.0019, "num_input_tokens_seen": 39538720, "step": 58650 }, { "epoch": 1.432951408399091, "grad_norm": 33.8471794128418, "learning_rate": 1.7952969061785647e-06, "loss": 0.008, "num_input_tokens_seen": 39541920, "step": 58655 }, { "epoch": 1.4330735592309383, "grad_norm": 63.138633728027344, "learning_rate": 1.7952452062904362e-06, "loss": 0.0869, "num_input_tokens_seen": 39545312, "step": 58660 }, { "epoch": 1.4331957100627855, "grad_norm": 0.07256802171468735, "learning_rate": 1.7951935006190709e-06, "loss": 0.0162, "num_input_tokens_seen": 39548384, "step": 58665 }, { "epoch": 1.4333178608946326, "grad_norm": 0.044713884592056274, "learning_rate": 1.795141789164844e-06, "loss": 0.0794, "num_input_tokens_seen": 39552096, "step": 58670 }, { "epoch": 1.4334400117264798, "grad_norm": 145.95172119140625, "learning_rate": 1.7950900719281326e-06, "loss": 0.0993, "num_input_tokens_seen": 39555744, "step": 58675 }, { "epoch": 1.433562162558327, "grad_norm": 25.724279403686523, "learning_rate": 1.7950383489093118e-06, "loss": 0.1404, "num_input_tokens_seen": 39559200, "step": 58680 }, { "epoch": 1.4336843133901742, "grad_norm": 0.8542588353157043, "learning_rate": 1.7949866201087592e-06, "loss": 0.0497, "num_input_tokens_seen": 39562592, "step": 58685 }, { "epoch": 1.4338064642220214, "grad_norm": 20.692792892456055, "learning_rate": 1.7949348855268494e-06, "loss": 0.0657, "num_input_tokens_seen": 39565728, "step": 58690 }, { "epoch": 1.4339286150538686, "grad_norm": 9.70878791809082, "learning_rate": 1.7948831451639594e-06, "loss": 0.0124, "num_input_tokens_seen": 39569056, "step": 58695 }, { "epoch": 1.4340507658857158, "grad_norm": 20.545381546020508, "learning_rate": 1.7948313990204654e-06, "loss": 0.0726, "num_input_tokens_seen": 39572000, "step": 58700 }, { "epoch": 1.434172916717563, "grad_norm": 0.12542176246643066, "learning_rate": 1.7947796470967438e-06, "loss": 0.0827, "num_input_tokens_seen": 39575328, "step": 58705 }, { "epoch": 1.43429506754941, "grad_norm": 0.3020426332950592, "learning_rate": 1.7947278893931705e-06, "loss": 0.085, "num_input_tokens_seen": 39578656, "step": 58710 }, { "epoch": 1.4344172183812571, "grad_norm": 0.0259727593511343, "learning_rate": 1.7946761259101226e-06, "loss": 0.0675, "num_input_tokens_seen": 39582176, "step": 58715 }, { "epoch": 1.4345393692131043, "grad_norm": 0.03346165642142296, "learning_rate": 1.7946243566479762e-06, "loss": 0.0015, "num_input_tokens_seen": 39585824, "step": 58720 }, { "epoch": 1.4346615200449515, "grad_norm": 0.2484491467475891, "learning_rate": 1.7945725816071074e-06, "loss": 0.0597, "num_input_tokens_seen": 39588640, "step": 58725 }, { "epoch": 1.4347836708767987, "grad_norm": 179.8692626953125, "learning_rate": 1.794520800787893e-06, "loss": 0.1699, "num_input_tokens_seen": 39591840, "step": 58730 }, { "epoch": 1.434905821708646, "grad_norm": 0.08298604935407639, "learning_rate": 1.79446901419071e-06, "loss": 0.0763, "num_input_tokens_seen": 39595232, "step": 58735 }, { "epoch": 1.435027972540493, "grad_norm": 1.8074345588684082, "learning_rate": 1.7944172218159348e-06, "loss": 0.0246, "num_input_tokens_seen": 39598560, "step": 58740 }, { "epoch": 1.43515012337234, "grad_norm": 33.335609436035156, "learning_rate": 1.7943654236639436e-06, "loss": 0.1222, "num_input_tokens_seen": 39601632, "step": 58745 }, { "epoch": 1.4352722742041872, "grad_norm": 0.4874430000782013, "learning_rate": 1.7943136197351135e-06, "loss": 0.0024, "num_input_tokens_seen": 39605472, "step": 58750 }, { "epoch": 1.4353944250360344, "grad_norm": 37.11099624633789, "learning_rate": 1.794261810029821e-06, "loss": 0.1361, "num_input_tokens_seen": 39609056, "step": 58755 }, { "epoch": 1.4355165758678816, "grad_norm": 0.23305939137935638, "learning_rate": 1.794209994548443e-06, "loss": 0.0786, "num_input_tokens_seen": 39611936, "step": 58760 }, { "epoch": 1.4356387266997288, "grad_norm": 10.859920501708984, "learning_rate": 1.7941581732913562e-06, "loss": 0.1309, "num_input_tokens_seen": 39615520, "step": 58765 }, { "epoch": 1.435760877531576, "grad_norm": 0.14673854410648346, "learning_rate": 1.7941063462589376e-06, "loss": 0.0376, "num_input_tokens_seen": 39619296, "step": 58770 }, { "epoch": 1.4358830283634232, "grad_norm": 0.037890564650297165, "learning_rate": 1.7940545134515642e-06, "loss": 0.1143, "num_input_tokens_seen": 39622560, "step": 58775 }, { "epoch": 1.4360051791952704, "grad_norm": 0.8310354948043823, "learning_rate": 1.7940026748696128e-06, "loss": 0.0009, "num_input_tokens_seen": 39625888, "step": 58780 }, { "epoch": 1.4361273300271176, "grad_norm": 2.9553492069244385, "learning_rate": 1.7939508305134604e-06, "loss": 0.0466, "num_input_tokens_seen": 39629984, "step": 58785 }, { "epoch": 1.4362494808589648, "grad_norm": 0.17864832282066345, "learning_rate": 1.7938989803834838e-06, "loss": 0.1889, "num_input_tokens_seen": 39633312, "step": 58790 }, { "epoch": 1.436371631690812, "grad_norm": 2.7538199424743652, "learning_rate": 1.7938471244800603e-06, "loss": 0.118, "num_input_tokens_seen": 39636640, "step": 58795 }, { "epoch": 1.436493782522659, "grad_norm": 0.39362823963165283, "learning_rate": 1.7937952628035673e-06, "loss": 0.0903, "num_input_tokens_seen": 39639968, "step": 58800 }, { "epoch": 1.436615933354506, "grad_norm": 0.21269036829471588, "learning_rate": 1.7937433953543815e-06, "loss": 0.0207, "num_input_tokens_seen": 39643296, "step": 58805 }, { "epoch": 1.4367380841863533, "grad_norm": 0.2682448923587799, "learning_rate": 1.79369152213288e-06, "loss": 0.1242, "num_input_tokens_seen": 39647008, "step": 58810 }, { "epoch": 1.4368602350182005, "grad_norm": 22.306591033935547, "learning_rate": 1.7936396431394405e-06, "loss": 0.0988, "num_input_tokens_seen": 39650464, "step": 58815 }, { "epoch": 1.4369823858500477, "grad_norm": 0.32385772466659546, "learning_rate": 1.7935877583744402e-06, "loss": 0.064, "num_input_tokens_seen": 39653856, "step": 58820 }, { "epoch": 1.4371045366818949, "grad_norm": 0.8138442039489746, "learning_rate": 1.7935358678382563e-06, "loss": 0.0035, "num_input_tokens_seen": 39657184, "step": 58825 }, { "epoch": 1.437226687513742, "grad_norm": 0.2751285135746002, "learning_rate": 1.7934839715312657e-06, "loss": 0.0999, "num_input_tokens_seen": 39660384, "step": 58830 }, { "epoch": 1.437348838345589, "grad_norm": 2.161569118499756, "learning_rate": 1.7934320694538462e-06, "loss": 0.1024, "num_input_tokens_seen": 39663840, "step": 58835 }, { "epoch": 1.4374709891774362, "grad_norm": 8.404568672180176, "learning_rate": 1.7933801616063756e-06, "loss": 0.0798, "num_input_tokens_seen": 39666912, "step": 58840 }, { "epoch": 1.4375931400092834, "grad_norm": 0.01838994212448597, "learning_rate": 1.793328247989231e-06, "loss": 0.0659, "num_input_tokens_seen": 39670176, "step": 58845 }, { "epoch": 1.4377152908411306, "grad_norm": 2.829005241394043, "learning_rate": 1.7932763286027903e-06, "loss": 0.0839, "num_input_tokens_seen": 39673248, "step": 58850 }, { "epoch": 1.4378374416729778, "grad_norm": 4.462026596069336, "learning_rate": 1.7932244034474305e-06, "loss": 0.0521, "num_input_tokens_seen": 39677024, "step": 58855 }, { "epoch": 1.437959592504825, "grad_norm": 14.132698059082031, "learning_rate": 1.7931724725235294e-06, "loss": 0.0915, "num_input_tokens_seen": 39680032, "step": 58860 }, { "epoch": 1.4380817433366722, "grad_norm": 0.03028332069516182, "learning_rate": 1.7931205358314648e-06, "loss": 0.0043, "num_input_tokens_seen": 39683424, "step": 58865 }, { "epoch": 1.4382038941685193, "grad_norm": 0.847586989402771, "learning_rate": 1.7930685933716142e-06, "loss": 0.0966, "num_input_tokens_seen": 39686816, "step": 58870 }, { "epoch": 1.4383260450003665, "grad_norm": 43.78037643432617, "learning_rate": 1.7930166451443558e-06, "loss": 0.1053, "num_input_tokens_seen": 39690016, "step": 58875 }, { "epoch": 1.4384481958322137, "grad_norm": 61.2498664855957, "learning_rate": 1.7929646911500669e-06, "loss": 0.1942, "num_input_tokens_seen": 39693792, "step": 58880 }, { "epoch": 1.438570346664061, "grad_norm": 0.8849436044692993, "learning_rate": 1.7929127313891254e-06, "loss": 0.069, "num_input_tokens_seen": 39696992, "step": 58885 }, { "epoch": 1.4386924974959079, "grad_norm": 15.736291885375977, "learning_rate": 1.7928607658619095e-06, "loss": 0.1596, "num_input_tokens_seen": 39700192, "step": 58890 }, { "epoch": 1.438814648327755, "grad_norm": 0.7857063412666321, "learning_rate": 1.7928087945687963e-06, "loss": 0.0381, "num_input_tokens_seen": 39703264, "step": 58895 }, { "epoch": 1.4389367991596023, "grad_norm": 29.244619369506836, "learning_rate": 1.7927568175101652e-06, "loss": 0.0723, "num_input_tokens_seen": 39706208, "step": 58900 }, { "epoch": 1.4390589499914495, "grad_norm": 10.40878677368164, "learning_rate": 1.7927048346863925e-06, "loss": 0.1089, "num_input_tokens_seen": 39709280, "step": 58905 }, { "epoch": 1.4391811008232966, "grad_norm": 11.405935287475586, "learning_rate": 1.7926528460978573e-06, "loss": 0.0524, "num_input_tokens_seen": 39712800, "step": 58910 }, { "epoch": 1.4393032516551438, "grad_norm": 2.9893431663513184, "learning_rate": 1.7926008517449373e-06, "loss": 0.0369, "num_input_tokens_seen": 39715872, "step": 58915 }, { "epoch": 1.4394254024869908, "grad_norm": 2.8281633853912354, "learning_rate": 1.7925488516280113e-06, "loss": 0.0941, "num_input_tokens_seen": 39718944, "step": 58920 }, { "epoch": 1.439547553318838, "grad_norm": 0.636328399181366, "learning_rate": 1.7924968457474563e-06, "loss": 0.0318, "num_input_tokens_seen": 39722144, "step": 58925 }, { "epoch": 1.4396697041506852, "grad_norm": 8.355908393859863, "learning_rate": 1.7924448341036512e-06, "loss": 0.0685, "num_input_tokens_seen": 39725280, "step": 58930 }, { "epoch": 1.4397918549825324, "grad_norm": 0.20460966229438782, "learning_rate": 1.792392816696974e-06, "loss": 0.0387, "num_input_tokens_seen": 39729440, "step": 58935 }, { "epoch": 1.4399140058143796, "grad_norm": 0.26626622676849365, "learning_rate": 1.7923407935278032e-06, "loss": 0.0736, "num_input_tokens_seen": 39732896, "step": 58940 }, { "epoch": 1.4400361566462268, "grad_norm": 10.729291915893555, "learning_rate": 1.7922887645965173e-06, "loss": 0.0424, "num_input_tokens_seen": 39736032, "step": 58945 }, { "epoch": 1.440158307478074, "grad_norm": 0.5907509326934814, "learning_rate": 1.792236729903494e-06, "loss": 0.1568, "num_input_tokens_seen": 39739040, "step": 58950 }, { "epoch": 1.4402804583099211, "grad_norm": 0.06551672518253326, "learning_rate": 1.7921846894491121e-06, "loss": 0.0775, "num_input_tokens_seen": 39742496, "step": 58955 }, { "epoch": 1.4404026091417683, "grad_norm": 0.35346364974975586, "learning_rate": 1.7921326432337505e-06, "loss": 0.1183, "num_input_tokens_seen": 39745568, "step": 58960 }, { "epoch": 1.4405247599736155, "grad_norm": 0.16065485775470734, "learning_rate": 1.792080591257787e-06, "loss": 0.0617, "num_input_tokens_seen": 39748960, "step": 58965 }, { "epoch": 1.4406469108054627, "grad_norm": 0.12054590880870819, "learning_rate": 1.7920285335216004e-06, "loss": 0.0014, "num_input_tokens_seen": 39752224, "step": 58970 }, { "epoch": 1.4407690616373097, "grad_norm": 0.05816735327243805, "learning_rate": 1.7919764700255693e-06, "loss": 0.02, "num_input_tokens_seen": 39755616, "step": 58975 }, { "epoch": 1.4408912124691569, "grad_norm": 26.84071922302246, "learning_rate": 1.7919244007700725e-06, "loss": 0.1716, "num_input_tokens_seen": 39758816, "step": 58980 }, { "epoch": 1.441013363301004, "grad_norm": 0.4252214729785919, "learning_rate": 1.791872325755488e-06, "loss": 0.0015, "num_input_tokens_seen": 39761952, "step": 58985 }, { "epoch": 1.4411355141328512, "grad_norm": 15.643749237060547, "learning_rate": 1.7918202449821954e-06, "loss": 0.3353, "num_input_tokens_seen": 39765024, "step": 58990 }, { "epoch": 1.4412576649646984, "grad_norm": 1.339182734489441, "learning_rate": 1.7917681584505727e-06, "loss": 0.1574, "num_input_tokens_seen": 39768352, "step": 58995 }, { "epoch": 1.4413798157965456, "grad_norm": 0.2828620374202728, "learning_rate": 1.791716066160999e-06, "loss": 0.0455, "num_input_tokens_seen": 39772576, "step": 59000 }, { "epoch": 1.4415019666283928, "grad_norm": 1.1742899417877197, "learning_rate": 1.7916639681138532e-06, "loss": 0.0619, "num_input_tokens_seen": 39775456, "step": 59005 }, { "epoch": 1.4416241174602398, "grad_norm": 10.849560737609863, "learning_rate": 1.791611864309514e-06, "loss": 0.0899, "num_input_tokens_seen": 39779104, "step": 59010 }, { "epoch": 1.441746268292087, "grad_norm": 12.845623016357422, "learning_rate": 1.7915597547483606e-06, "loss": 0.058, "num_input_tokens_seen": 39782432, "step": 59015 }, { "epoch": 1.4418684191239342, "grad_norm": 10.224414825439453, "learning_rate": 1.7915076394307717e-06, "loss": 0.1444, "num_input_tokens_seen": 39785696, "step": 59020 }, { "epoch": 1.4419905699557813, "grad_norm": 12.651299476623535, "learning_rate": 1.7914555183571266e-06, "loss": 0.0745, "num_input_tokens_seen": 39788832, "step": 59025 }, { "epoch": 1.4421127207876285, "grad_norm": 0.08147825300693512, "learning_rate": 1.7914033915278036e-06, "loss": 0.0778, "num_input_tokens_seen": 39792032, "step": 59030 }, { "epoch": 1.4422348716194757, "grad_norm": 11.500874519348145, "learning_rate": 1.7913512589431825e-06, "loss": 0.0806, "num_input_tokens_seen": 39795232, "step": 59035 }, { "epoch": 1.442357022451323, "grad_norm": 10.94676399230957, "learning_rate": 1.7912991206036421e-06, "loss": 0.0474, "num_input_tokens_seen": 39798880, "step": 59040 }, { "epoch": 1.44247917328317, "grad_norm": 0.040244702249765396, "learning_rate": 1.791246976509562e-06, "loss": 0.0484, "num_input_tokens_seen": 39802144, "step": 59045 }, { "epoch": 1.4426013241150173, "grad_norm": 43.799583435058594, "learning_rate": 1.7911948266613205e-06, "loss": 0.2378, "num_input_tokens_seen": 39805472, "step": 59050 }, { "epoch": 1.4427234749468645, "grad_norm": 150.8211669921875, "learning_rate": 1.791142671059298e-06, "loss": 0.1259, "num_input_tokens_seen": 39808864, "step": 59055 }, { "epoch": 1.4428456257787117, "grad_norm": 0.9879307746887207, "learning_rate": 1.7910905097038728e-06, "loss": 0.0342, "num_input_tokens_seen": 39812320, "step": 59060 }, { "epoch": 1.4429677766105586, "grad_norm": 31.04475212097168, "learning_rate": 1.7910383425954248e-06, "loss": 0.1464, "num_input_tokens_seen": 39815904, "step": 59065 }, { "epoch": 1.4430899274424058, "grad_norm": 32.14227294921875, "learning_rate": 1.7909861697343333e-06, "loss": 0.0687, "num_input_tokens_seen": 39818976, "step": 59070 }, { "epoch": 1.443212078274253, "grad_norm": 0.20150291919708252, "learning_rate": 1.7909339911209775e-06, "loss": 0.0014, "num_input_tokens_seen": 39822624, "step": 59075 }, { "epoch": 1.4433342291061002, "grad_norm": 140.92352294921875, "learning_rate": 1.790881806755737e-06, "loss": 0.1436, "num_input_tokens_seen": 39825824, "step": 59080 }, { "epoch": 1.4434563799379474, "grad_norm": 0.2697312533855438, "learning_rate": 1.7908296166389914e-06, "loss": 0.0311, "num_input_tokens_seen": 39829472, "step": 59085 }, { "epoch": 1.4435785307697946, "grad_norm": 15.324634552001953, "learning_rate": 1.7907774207711199e-06, "loss": 0.0496, "num_input_tokens_seen": 39832800, "step": 59090 }, { "epoch": 1.4437006816016418, "grad_norm": 14.72012996673584, "learning_rate": 1.7907252191525023e-06, "loss": 0.1983, "num_input_tokens_seen": 39836640, "step": 59095 }, { "epoch": 1.4438228324334887, "grad_norm": 0.4023982584476471, "learning_rate": 1.7906730117835185e-06, "loss": 0.0758, "num_input_tokens_seen": 39840224, "step": 59100 }, { "epoch": 1.443944983265336, "grad_norm": 11.866134643554688, "learning_rate": 1.7906207986645477e-06, "loss": 0.0342, "num_input_tokens_seen": 39843872, "step": 59105 }, { "epoch": 1.4440671340971831, "grad_norm": 19.953330993652344, "learning_rate": 1.7905685797959697e-06, "loss": 0.103, "num_input_tokens_seen": 39846944, "step": 59110 }, { "epoch": 1.4441892849290303, "grad_norm": 0.373773992061615, "learning_rate": 1.7905163551781643e-06, "loss": 0.1281, "num_input_tokens_seen": 39851296, "step": 59115 }, { "epoch": 1.4443114357608775, "grad_norm": 0.054920095950365067, "learning_rate": 1.7904641248115117e-06, "loss": 0.0785, "num_input_tokens_seen": 39854880, "step": 59120 }, { "epoch": 1.4444335865927247, "grad_norm": 24.973628997802734, "learning_rate": 1.7904118886963913e-06, "loss": 0.0942, "num_input_tokens_seen": 39858016, "step": 59125 }, { "epoch": 1.4445557374245719, "grad_norm": 0.299342542886734, "learning_rate": 1.7903596468331829e-06, "loss": 0.0749, "num_input_tokens_seen": 39861408, "step": 59130 }, { "epoch": 1.444677888256419, "grad_norm": 124.8744125366211, "learning_rate": 1.7903073992222666e-06, "loss": 0.0521, "num_input_tokens_seen": 39864608, "step": 59135 }, { "epoch": 1.4448000390882663, "grad_norm": 0.42814555764198303, "learning_rate": 1.7902551458640222e-06, "loss": 0.2063, "num_input_tokens_seen": 39868000, "step": 59140 }, { "epoch": 1.4449221899201135, "grad_norm": 19.024675369262695, "learning_rate": 1.79020288675883e-06, "loss": 0.1936, "num_input_tokens_seen": 39871520, "step": 59145 }, { "epoch": 1.4450443407519606, "grad_norm": 0.2149042934179306, "learning_rate": 1.79015062190707e-06, "loss": 0.1798, "num_input_tokens_seen": 39874656, "step": 59150 }, { "epoch": 1.4451664915838076, "grad_norm": 0.3116861581802368, "learning_rate": 1.7900983513091218e-06, "loss": 0.0061, "num_input_tokens_seen": 39877728, "step": 59155 }, { "epoch": 1.4452886424156548, "grad_norm": 13.638267517089844, "learning_rate": 1.790046074965366e-06, "loss": 0.0904, "num_input_tokens_seen": 39880992, "step": 59160 }, { "epoch": 1.445410793247502, "grad_norm": 0.12967146933078766, "learning_rate": 1.7899937928761829e-06, "loss": 0.1306, "num_input_tokens_seen": 39884192, "step": 59165 }, { "epoch": 1.4455329440793492, "grad_norm": 8.077123641967773, "learning_rate": 1.789941505041952e-06, "loss": 0.161, "num_input_tokens_seen": 39887136, "step": 59170 }, { "epoch": 1.4456550949111964, "grad_norm": 0.0998186245560646, "learning_rate": 1.7898892114630542e-06, "loss": 0.0333, "num_input_tokens_seen": 39890720, "step": 59175 }, { "epoch": 1.4457772457430436, "grad_norm": 0.12306854128837585, "learning_rate": 1.78983691213987e-06, "loss": 0.1145, "num_input_tokens_seen": 39894176, "step": 59180 }, { "epoch": 1.4458993965748907, "grad_norm": 67.00183868408203, "learning_rate": 1.789784607072779e-06, "loss": 0.0209, "num_input_tokens_seen": 39897376, "step": 59185 }, { "epoch": 1.4460215474067377, "grad_norm": 0.15235014259815216, "learning_rate": 1.7897322962621616e-06, "loss": 0.044, "num_input_tokens_seen": 39900320, "step": 59190 }, { "epoch": 1.446143698238585, "grad_norm": 69.80928802490234, "learning_rate": 1.789679979708399e-06, "loss": 0.0499, "num_input_tokens_seen": 39903520, "step": 59195 }, { "epoch": 1.446265849070432, "grad_norm": 0.33987465500831604, "learning_rate": 1.7896276574118709e-06, "loss": 0.2074, "num_input_tokens_seen": 39907552, "step": 59200 }, { "epoch": 1.4463879999022793, "grad_norm": 59.96290969848633, "learning_rate": 1.7895753293729583e-06, "loss": 0.104, "num_input_tokens_seen": 39911392, "step": 59205 }, { "epoch": 1.4465101507341265, "grad_norm": 40.25039291381836, "learning_rate": 1.7895229955920414e-06, "loss": 0.1042, "num_input_tokens_seen": 39914912, "step": 59210 }, { "epoch": 1.4466323015659737, "grad_norm": 0.11362236738204956, "learning_rate": 1.789470656069501e-06, "loss": 0.0912, "num_input_tokens_seen": 39918176, "step": 59215 }, { "epoch": 1.4467544523978209, "grad_norm": 0.18816527724266052, "learning_rate": 1.7894183108057175e-06, "loss": 0.0032, "num_input_tokens_seen": 39921504, "step": 59220 }, { "epoch": 1.446876603229668, "grad_norm": 16.205690383911133, "learning_rate": 1.789365959801072e-06, "loss": 0.0899, "num_input_tokens_seen": 39924576, "step": 59225 }, { "epoch": 1.4469987540615152, "grad_norm": 0.13214744627475739, "learning_rate": 1.7893136030559445e-06, "loss": 0.0011, "num_input_tokens_seen": 39927968, "step": 59230 }, { "epoch": 1.4471209048933624, "grad_norm": 52.55016326904297, "learning_rate": 1.7892612405707168e-06, "loss": 0.1049, "num_input_tokens_seen": 39931424, "step": 59235 }, { "epoch": 1.4472430557252096, "grad_norm": 0.0789654478430748, "learning_rate": 1.7892088723457685e-06, "loss": 0.0006, "num_input_tokens_seen": 39934944, "step": 59240 }, { "epoch": 1.4473652065570566, "grad_norm": 0.0595073476433754, "learning_rate": 1.7891564983814813e-06, "loss": 0.0215, "num_input_tokens_seen": 39938272, "step": 59245 }, { "epoch": 1.4474873573889038, "grad_norm": 0.2962166368961334, "learning_rate": 1.7891041186782356e-06, "loss": 0.1159, "num_input_tokens_seen": 39941280, "step": 59250 }, { "epoch": 1.447609508220751, "grad_norm": 0.09220468997955322, "learning_rate": 1.7890517332364125e-06, "loss": 0.0907, "num_input_tokens_seen": 39944672, "step": 59255 }, { "epoch": 1.4477316590525982, "grad_norm": 0.18392859399318695, "learning_rate": 1.7889993420563934e-06, "loss": 0.0762, "num_input_tokens_seen": 39948064, "step": 59260 }, { "epoch": 1.4478538098844453, "grad_norm": 0.08621831238269806, "learning_rate": 1.7889469451385586e-06, "loss": 0.0961, "num_input_tokens_seen": 39951008, "step": 59265 }, { "epoch": 1.4479759607162925, "grad_norm": 19.898418426513672, "learning_rate": 1.7888945424832893e-06, "loss": 0.1437, "num_input_tokens_seen": 39954912, "step": 59270 }, { "epoch": 1.4480981115481397, "grad_norm": 0.03776702284812927, "learning_rate": 1.7888421340909666e-06, "loss": 0.1528, "num_input_tokens_seen": 39957984, "step": 59275 }, { "epoch": 1.4482202623799867, "grad_norm": 46.947853088378906, "learning_rate": 1.788789719961972e-06, "loss": 0.1918, "num_input_tokens_seen": 39960928, "step": 59280 }, { "epoch": 1.4483424132118339, "grad_norm": 0.1397467702627182, "learning_rate": 1.7887373000966864e-06, "loss": 0.0038, "num_input_tokens_seen": 39964448, "step": 59285 }, { "epoch": 1.448464564043681, "grad_norm": 0.8697220087051392, "learning_rate": 1.7886848744954909e-06, "loss": 0.0225, "num_input_tokens_seen": 39967840, "step": 59290 }, { "epoch": 1.4485867148755283, "grad_norm": 12.394600868225098, "learning_rate": 1.7886324431587668e-06, "loss": 0.1449, "num_input_tokens_seen": 39970848, "step": 59295 }, { "epoch": 1.4487088657073754, "grad_norm": 0.15624871850013733, "learning_rate": 1.7885800060868954e-06, "loss": 0.1448, "num_input_tokens_seen": 39974048, "step": 59300 }, { "epoch": 1.4488310165392226, "grad_norm": 0.13877400755882263, "learning_rate": 1.788527563280258e-06, "loss": 0.0019, "num_input_tokens_seen": 39977056, "step": 59305 }, { "epoch": 1.4489531673710698, "grad_norm": 0.24705491960048676, "learning_rate": 1.7884751147392364e-06, "loss": 0.0009, "num_input_tokens_seen": 39980384, "step": 59310 }, { "epoch": 1.449075318202917, "grad_norm": 0.1556907594203949, "learning_rate": 1.7884226604642117e-06, "loss": 0.0523, "num_input_tokens_seen": 39983712, "step": 59315 }, { "epoch": 1.4491974690347642, "grad_norm": 0.7603535652160645, "learning_rate": 1.7883702004555652e-06, "loss": 0.1469, "num_input_tokens_seen": 39986976, "step": 59320 }, { "epoch": 1.4493196198666114, "grad_norm": 2.096450090408325, "learning_rate": 1.7883177347136785e-06, "loss": 0.0092, "num_input_tokens_seen": 39990240, "step": 59325 }, { "epoch": 1.4494417706984586, "grad_norm": 0.38189834356307983, "learning_rate": 1.7882652632389332e-06, "loss": 0.0009, "num_input_tokens_seen": 39993440, "step": 59330 }, { "epoch": 1.4495639215303056, "grad_norm": 0.08693571388721466, "learning_rate": 1.788212786031711e-06, "loss": 0.1739, "num_input_tokens_seen": 39996768, "step": 59335 }, { "epoch": 1.4496860723621527, "grad_norm": 11.836777687072754, "learning_rate": 1.7881603030923935e-06, "loss": 0.1538, "num_input_tokens_seen": 40000608, "step": 59340 }, { "epoch": 1.449808223194, "grad_norm": 0.20686087012290955, "learning_rate": 1.788107814421362e-06, "loss": 0.0392, "num_input_tokens_seen": 40003872, "step": 59345 }, { "epoch": 1.4499303740258471, "grad_norm": 0.1668231338262558, "learning_rate": 1.7880553200189987e-06, "loss": 0.085, "num_input_tokens_seen": 40007328, "step": 59350 }, { "epoch": 1.4500525248576943, "grad_norm": 0.15712271630764008, "learning_rate": 1.7880028198856852e-06, "loss": 0.0583, "num_input_tokens_seen": 40011168, "step": 59355 }, { "epoch": 1.4501746756895415, "grad_norm": 0.5437976121902466, "learning_rate": 1.787950314021803e-06, "loss": 0.0058, "num_input_tokens_seen": 40014432, "step": 59360 }, { "epoch": 1.4502968265213887, "grad_norm": 0.09233890473842621, "learning_rate": 1.7878978024277344e-06, "loss": 0.0448, "num_input_tokens_seen": 40017888, "step": 59365 }, { "epoch": 1.4504189773532357, "grad_norm": 0.13593538105487823, "learning_rate": 1.7878452851038612e-06, "loss": 0.0014, "num_input_tokens_seen": 40021472, "step": 59370 }, { "epoch": 1.4505411281850829, "grad_norm": 22.736013412475586, "learning_rate": 1.7877927620505648e-06, "loss": 0.0793, "num_input_tokens_seen": 40024480, "step": 59375 }, { "epoch": 1.45066327901693, "grad_norm": 2.9554624557495117, "learning_rate": 1.7877402332682278e-06, "loss": 0.0604, "num_input_tokens_seen": 40028320, "step": 59380 }, { "epoch": 1.4507854298487772, "grad_norm": 0.04954606294631958, "learning_rate": 1.787687698757232e-06, "loss": 0.0002, "num_input_tokens_seen": 40031584, "step": 59385 }, { "epoch": 1.4509075806806244, "grad_norm": 0.10807470232248306, "learning_rate": 1.7876351585179593e-06, "loss": 0.0494, "num_input_tokens_seen": 40034976, "step": 59390 }, { "epoch": 1.4510297315124716, "grad_norm": 8.378605842590332, "learning_rate": 1.7875826125507917e-06, "loss": 0.1674, "num_input_tokens_seen": 40038176, "step": 59395 }, { "epoch": 1.4511518823443188, "grad_norm": 38.70036697387695, "learning_rate": 1.787530060856112e-06, "loss": 0.1429, "num_input_tokens_seen": 40041312, "step": 59400 }, { "epoch": 1.451274033176166, "grad_norm": 0.11730583012104034, "learning_rate": 1.7874775034343012e-06, "loss": 0.005, "num_input_tokens_seen": 40044576, "step": 59405 }, { "epoch": 1.4513961840080132, "grad_norm": 12.826897621154785, "learning_rate": 1.7874249402857426e-06, "loss": 0.0218, "num_input_tokens_seen": 40047584, "step": 59410 }, { "epoch": 1.4515183348398604, "grad_norm": 0.23351694643497467, "learning_rate": 1.787372371410818e-06, "loss": 0.016, "num_input_tokens_seen": 40051168, "step": 59415 }, { "epoch": 1.4516404856717076, "grad_norm": 0.27510517835617065, "learning_rate": 1.7873197968099097e-06, "loss": 0.056, "num_input_tokens_seen": 40054816, "step": 59420 }, { "epoch": 1.4517626365035545, "grad_norm": 0.12724097073078156, "learning_rate": 1.7872672164834e-06, "loss": 0.0509, "num_input_tokens_seen": 40058016, "step": 59425 }, { "epoch": 1.4518847873354017, "grad_norm": 186.72894287109375, "learning_rate": 1.7872146304316714e-06, "loss": 0.3248, "num_input_tokens_seen": 40061792, "step": 59430 }, { "epoch": 1.452006938167249, "grad_norm": 0.6554990410804749, "learning_rate": 1.7871620386551065e-06, "loss": 0.1667, "num_input_tokens_seen": 40064928, "step": 59435 }, { "epoch": 1.452129088999096, "grad_norm": 0.29013246297836304, "learning_rate": 1.7871094411540872e-06, "loss": 0.0771, "num_input_tokens_seen": 40068384, "step": 59440 }, { "epoch": 1.4522512398309433, "grad_norm": 0.043106433004140854, "learning_rate": 1.7870568379289965e-06, "loss": 0.0294, "num_input_tokens_seen": 40071648, "step": 59445 }, { "epoch": 1.4523733906627905, "grad_norm": 0.246318057179451, "learning_rate": 1.787004228980217e-06, "loss": 0.0534, "num_input_tokens_seen": 40074976, "step": 59450 }, { "epoch": 1.4524955414946374, "grad_norm": 0.21862973272800446, "learning_rate": 1.7869516143081307e-06, "loss": 0.1416, "num_input_tokens_seen": 40078688, "step": 59455 }, { "epoch": 1.4526176923264846, "grad_norm": 1.7106530666351318, "learning_rate": 1.7868989939131204e-06, "loss": 0.0022, "num_input_tokens_seen": 40081888, "step": 59460 }, { "epoch": 1.4527398431583318, "grad_norm": 0.24451476335525513, "learning_rate": 1.7868463677955697e-06, "loss": 0.0479, "num_input_tokens_seen": 40085088, "step": 59465 }, { "epoch": 1.452861993990179, "grad_norm": 20.032474517822266, "learning_rate": 1.78679373595586e-06, "loss": 0.2927, "num_input_tokens_seen": 40088352, "step": 59470 }, { "epoch": 1.4529841448220262, "grad_norm": 42.15612030029297, "learning_rate": 1.786741098394375e-06, "loss": 0.1888, "num_input_tokens_seen": 40091744, "step": 59475 }, { "epoch": 1.4531062956538734, "grad_norm": 0.5244346857070923, "learning_rate": 1.7866884551114968e-06, "loss": 0.1454, "num_input_tokens_seen": 40095072, "step": 59480 }, { "epoch": 1.4532284464857206, "grad_norm": 0.15843136608600616, "learning_rate": 1.7866358061076086e-06, "loss": 0.0011, "num_input_tokens_seen": 40098464, "step": 59485 }, { "epoch": 1.4533505973175678, "grad_norm": 57.28195571899414, "learning_rate": 1.7865831513830933e-06, "loss": 0.1509, "num_input_tokens_seen": 40101728, "step": 59490 }, { "epoch": 1.453472748149415, "grad_norm": 15.683917999267578, "learning_rate": 1.7865304909383338e-06, "loss": 0.1103, "num_input_tokens_seen": 40104736, "step": 59495 }, { "epoch": 1.4535948989812622, "grad_norm": 64.17626190185547, "learning_rate": 1.786477824773713e-06, "loss": 0.0494, "num_input_tokens_seen": 40108576, "step": 59500 }, { "epoch": 1.4537170498131093, "grad_norm": 0.3834023177623749, "learning_rate": 1.7864251528896139e-06, "loss": 0.0026, "num_input_tokens_seen": 40111584, "step": 59505 }, { "epoch": 1.4538392006449565, "grad_norm": 0.7233591079711914, "learning_rate": 1.7863724752864195e-06, "loss": 0.0802, "num_input_tokens_seen": 40115232, "step": 59510 }, { "epoch": 1.4539613514768035, "grad_norm": 0.16249462962150574, "learning_rate": 1.7863197919645133e-06, "loss": 0.0333, "num_input_tokens_seen": 40118432, "step": 59515 }, { "epoch": 1.4540835023086507, "grad_norm": 31.28944206237793, "learning_rate": 1.7862671029242775e-06, "loss": 0.0861, "num_input_tokens_seen": 40121760, "step": 59520 }, { "epoch": 1.4542056531404979, "grad_norm": 0.1898259073495865, "learning_rate": 1.7862144081660963e-06, "loss": 0.0021, "num_input_tokens_seen": 40125600, "step": 59525 }, { "epoch": 1.454327803972345, "grad_norm": 0.05062809959053993, "learning_rate": 1.7861617076903524e-06, "loss": 0.0012, "num_input_tokens_seen": 40128736, "step": 59530 }, { "epoch": 1.4544499548041923, "grad_norm": 0.07426691800355911, "learning_rate": 1.7861090014974289e-06, "loss": 0.001, "num_input_tokens_seen": 40132320, "step": 59535 }, { "epoch": 1.4545721056360394, "grad_norm": 49.9919319152832, "learning_rate": 1.7860562895877097e-06, "loss": 0.0529, "num_input_tokens_seen": 40135840, "step": 59540 }, { "epoch": 1.4546942564678864, "grad_norm": 0.09730231761932373, "learning_rate": 1.786003571961577e-06, "loss": 0.0624, "num_input_tokens_seen": 40139232, "step": 59545 }, { "epoch": 1.4548164072997336, "grad_norm": 2.2113256454467773, "learning_rate": 1.7859508486194156e-06, "loss": 0.0007, "num_input_tokens_seen": 40142048, "step": 59550 }, { "epoch": 1.4549385581315808, "grad_norm": 15.290037155151367, "learning_rate": 1.785898119561608e-06, "loss": 0.049, "num_input_tokens_seen": 40145440, "step": 59555 }, { "epoch": 1.455060708963428, "grad_norm": 0.3437047302722931, "learning_rate": 1.785845384788538e-06, "loss": 0.0465, "num_input_tokens_seen": 40148576, "step": 59560 }, { "epoch": 1.4551828597952752, "grad_norm": 0.5709901452064514, "learning_rate": 1.7857926443005888e-06, "loss": 0.08, "num_input_tokens_seen": 40151712, "step": 59565 }, { "epoch": 1.4553050106271224, "grad_norm": 0.43648040294647217, "learning_rate": 1.7857398980981442e-06, "loss": 0.0449, "num_input_tokens_seen": 40154784, "step": 59570 }, { "epoch": 1.4554271614589696, "grad_norm": 16.983592987060547, "learning_rate": 1.7856871461815878e-06, "loss": 0.0793, "num_input_tokens_seen": 40157856, "step": 59575 }, { "epoch": 1.4555493122908167, "grad_norm": 0.21682091057300568, "learning_rate": 1.785634388551303e-06, "loss": 0.2216, "num_input_tokens_seen": 40161184, "step": 59580 }, { "epoch": 1.455671463122664, "grad_norm": 5.58907413482666, "learning_rate": 1.7855816252076739e-06, "loss": 0.0674, "num_input_tokens_seen": 40164384, "step": 59585 }, { "epoch": 1.4557936139545111, "grad_norm": 0.136368989944458, "learning_rate": 1.7855288561510837e-06, "loss": 0.0298, "num_input_tokens_seen": 40167648, "step": 59590 }, { "epoch": 1.4559157647863583, "grad_norm": 0.09996183216571808, "learning_rate": 1.7854760813819166e-06, "loss": 0.0039, "num_input_tokens_seen": 40170912, "step": 59595 }, { "epoch": 1.4560379156182053, "grad_norm": 0.14230936765670776, "learning_rate": 1.785423300900556e-06, "loss": 0.092, "num_input_tokens_seen": 40174368, "step": 59600 }, { "epoch": 1.4561600664500525, "grad_norm": 0.06128491833806038, "learning_rate": 1.7853705147073859e-06, "loss": 0.147, "num_input_tokens_seen": 40177824, "step": 59605 }, { "epoch": 1.4562822172818997, "grad_norm": 208.32565307617188, "learning_rate": 1.78531772280279e-06, "loss": 0.1516, "num_input_tokens_seen": 40180896, "step": 59610 }, { "epoch": 1.4564043681137468, "grad_norm": 0.5086117386817932, "learning_rate": 1.7852649251871528e-06, "loss": 0.0996, "num_input_tokens_seen": 40183968, "step": 59615 }, { "epoch": 1.456526518945594, "grad_norm": 24.724302291870117, "learning_rate": 1.7852121218608573e-06, "loss": 0.0661, "num_input_tokens_seen": 40187168, "step": 59620 }, { "epoch": 1.4566486697774412, "grad_norm": 0.044920340180397034, "learning_rate": 1.7851593128242885e-06, "loss": 0.1033, "num_input_tokens_seen": 40190560, "step": 59625 }, { "epoch": 1.4567708206092884, "grad_norm": 83.80802917480469, "learning_rate": 1.78510649807783e-06, "loss": 0.1114, "num_input_tokens_seen": 40193952, "step": 59630 }, { "epoch": 1.4568929714411354, "grad_norm": 28.874740600585938, "learning_rate": 1.7850536776218656e-06, "loss": 0.1104, "num_input_tokens_seen": 40196896, "step": 59635 }, { "epoch": 1.4570151222729826, "grad_norm": 0.782297670841217, "learning_rate": 1.7850008514567797e-06, "loss": 0.0943, "num_input_tokens_seen": 40199648, "step": 59640 }, { "epoch": 1.4571372731048298, "grad_norm": 0.2725517749786377, "learning_rate": 1.784948019582957e-06, "loss": 0.0485, "num_input_tokens_seen": 40202656, "step": 59645 }, { "epoch": 1.457259423936677, "grad_norm": 0.3396630883216858, "learning_rate": 1.7848951820007807e-06, "loss": 0.001, "num_input_tokens_seen": 40205792, "step": 59650 }, { "epoch": 1.4573815747685241, "grad_norm": 47.51850509643555, "learning_rate": 1.7848423387106355e-06, "loss": 0.0638, "num_input_tokens_seen": 40208800, "step": 59655 }, { "epoch": 1.4575037256003713, "grad_norm": 21.536033630371094, "learning_rate": 1.7847894897129058e-06, "loss": 0.0392, "num_input_tokens_seen": 40212640, "step": 59660 }, { "epoch": 1.4576258764322185, "grad_norm": 0.07097387313842773, "learning_rate": 1.784736635007976e-06, "loss": 0.0841, "num_input_tokens_seen": 40216032, "step": 59665 }, { "epoch": 1.4577480272640657, "grad_norm": 21.68965721130371, "learning_rate": 1.7846837745962301e-06, "loss": 0.1628, "num_input_tokens_seen": 40219552, "step": 59670 }, { "epoch": 1.457870178095913, "grad_norm": 1.3079677820205688, "learning_rate": 1.784630908478053e-06, "loss": 0.0269, "num_input_tokens_seen": 40222752, "step": 59675 }, { "epoch": 1.45799232892776, "grad_norm": 0.4254034161567688, "learning_rate": 1.7845780366538285e-06, "loss": 0.0403, "num_input_tokens_seen": 40226080, "step": 59680 }, { "epoch": 1.4581144797596073, "grad_norm": 67.33355712890625, "learning_rate": 1.7845251591239418e-06, "loss": 0.2522, "num_input_tokens_seen": 40229408, "step": 59685 }, { "epoch": 1.4582366305914543, "grad_norm": 0.17583763599395752, "learning_rate": 1.7844722758887772e-06, "loss": 0.0425, "num_input_tokens_seen": 40232928, "step": 59690 }, { "epoch": 1.4583587814233014, "grad_norm": 0.09350127726793289, "learning_rate": 1.7844193869487189e-06, "loss": 0.0396, "num_input_tokens_seen": 40236384, "step": 59695 }, { "epoch": 1.4584809322551486, "grad_norm": 0.47614389657974243, "learning_rate": 1.7843664923041522e-06, "loss": 0.0483, "num_input_tokens_seen": 40239648, "step": 59700 }, { "epoch": 1.4586030830869958, "grad_norm": 65.01331329345703, "learning_rate": 1.784313591955461e-06, "loss": 0.1454, "num_input_tokens_seen": 40242784, "step": 59705 }, { "epoch": 1.458725233918843, "grad_norm": 9.80173110961914, "learning_rate": 1.784260685903031e-06, "loss": 0.0302, "num_input_tokens_seen": 40246304, "step": 59710 }, { "epoch": 1.4588473847506902, "grad_norm": 25.71861457824707, "learning_rate": 1.7842077741472457e-06, "loss": 0.1307, "num_input_tokens_seen": 40249760, "step": 59715 }, { "epoch": 1.4589695355825374, "grad_norm": 0.6096875071525574, "learning_rate": 1.7841548566884908e-06, "loss": 0.0339, "num_input_tokens_seen": 40252832, "step": 59720 }, { "epoch": 1.4590916864143844, "grad_norm": 185.60562133789062, "learning_rate": 1.784101933527151e-06, "loss": 0.1555, "num_input_tokens_seen": 40256736, "step": 59725 }, { "epoch": 1.4592138372462315, "grad_norm": 65.35801696777344, "learning_rate": 1.7840490046636108e-06, "loss": 0.1282, "num_input_tokens_seen": 40259936, "step": 59730 }, { "epoch": 1.4593359880780787, "grad_norm": 31.8502140045166, "learning_rate": 1.7839960700982555e-06, "loss": 0.2229, "num_input_tokens_seen": 40263264, "step": 59735 }, { "epoch": 1.459458138909926, "grad_norm": 0.36327114701271057, "learning_rate": 1.7839431298314698e-06, "loss": 0.1385, "num_input_tokens_seen": 40266208, "step": 59740 }, { "epoch": 1.4595802897417731, "grad_norm": 51.076438903808594, "learning_rate": 1.7838901838636389e-06, "loss": 0.0454, "num_input_tokens_seen": 40269152, "step": 59745 }, { "epoch": 1.4597024405736203, "grad_norm": 21.2176513671875, "learning_rate": 1.7838372321951478e-06, "loss": 0.0643, "num_input_tokens_seen": 40272416, "step": 59750 }, { "epoch": 1.4598245914054675, "grad_norm": 0.8600711226463318, "learning_rate": 1.7837842748263813e-06, "loss": 0.1077, "num_input_tokens_seen": 40275488, "step": 59755 }, { "epoch": 1.4599467422373147, "grad_norm": 0.1487630307674408, "learning_rate": 1.7837313117577251e-06, "loss": 0.0021, "num_input_tokens_seen": 40279136, "step": 59760 }, { "epoch": 1.4600688930691619, "grad_norm": 18.513385772705078, "learning_rate": 1.7836783429895636e-06, "loss": 0.0646, "num_input_tokens_seen": 40282592, "step": 59765 }, { "epoch": 1.460191043901009, "grad_norm": 0.26967713236808777, "learning_rate": 1.7836253685222827e-06, "loss": 0.0802, "num_input_tokens_seen": 40285984, "step": 59770 }, { "epoch": 1.4603131947328563, "grad_norm": 26.209548950195312, "learning_rate": 1.7835723883562673e-06, "loss": 0.0581, "num_input_tokens_seen": 40289504, "step": 59775 }, { "epoch": 1.4604353455647032, "grad_norm": 18.547000885009766, "learning_rate": 1.7835194024919026e-06, "loss": 0.0379, "num_input_tokens_seen": 40293152, "step": 59780 }, { "epoch": 1.4605574963965504, "grad_norm": 0.2680453658103943, "learning_rate": 1.783466410929574e-06, "loss": 0.0462, "num_input_tokens_seen": 40296928, "step": 59785 }, { "epoch": 1.4606796472283976, "grad_norm": 0.46763402223587036, "learning_rate": 1.7834134136696672e-06, "loss": 0.0274, "num_input_tokens_seen": 40300512, "step": 59790 }, { "epoch": 1.4608017980602448, "grad_norm": 19.22868537902832, "learning_rate": 1.783360410712567e-06, "loss": 0.0352, "num_input_tokens_seen": 40303968, "step": 59795 }, { "epoch": 1.460923948892092, "grad_norm": 0.03059125877916813, "learning_rate": 1.7833074020586597e-06, "loss": 0.0287, "num_input_tokens_seen": 40307296, "step": 59800 }, { "epoch": 1.4610460997239392, "grad_norm": 14.651070594787598, "learning_rate": 1.7832543877083302e-06, "loss": 0.1239, "num_input_tokens_seen": 40310496, "step": 59805 }, { "epoch": 1.4611682505557864, "grad_norm": 0.4671648442745209, "learning_rate": 1.7832013676619636e-06, "loss": 0.0801, "num_input_tokens_seen": 40313568, "step": 59810 }, { "epoch": 1.4612904013876333, "grad_norm": 24.84051513671875, "learning_rate": 1.7831483419199462e-06, "loss": 0.0944, "num_input_tokens_seen": 40316832, "step": 59815 }, { "epoch": 1.4614125522194805, "grad_norm": 0.6817691922187805, "learning_rate": 1.7830953104826638e-06, "loss": 0.002, "num_input_tokens_seen": 40319904, "step": 59820 }, { "epoch": 1.4615347030513277, "grad_norm": 20.075729370117188, "learning_rate": 1.7830422733505012e-06, "loss": 0.0462, "num_input_tokens_seen": 40323936, "step": 59825 }, { "epoch": 1.461656853883175, "grad_norm": 0.13368147611618042, "learning_rate": 1.782989230523845e-06, "loss": 0.1381, "num_input_tokens_seen": 40328032, "step": 59830 }, { "epoch": 1.461779004715022, "grad_norm": 0.01761273667216301, "learning_rate": 1.7829361820030803e-06, "loss": 0.0352, "num_input_tokens_seen": 40331424, "step": 59835 }, { "epoch": 1.4619011555468693, "grad_norm": 91.9080810546875, "learning_rate": 1.782883127788593e-06, "loss": 0.0854, "num_input_tokens_seen": 40334624, "step": 59840 }, { "epoch": 1.4620233063787165, "grad_norm": 0.8963506817817688, "learning_rate": 1.782830067880769e-06, "loss": 0.0596, "num_input_tokens_seen": 40338208, "step": 59845 }, { "epoch": 1.4621454572105637, "grad_norm": 0.13214941322803497, "learning_rate": 1.7827770022799947e-06, "loss": 0.2008, "num_input_tokens_seen": 40342112, "step": 59850 }, { "epoch": 1.4622676080424108, "grad_norm": 0.053667087107896805, "learning_rate": 1.7827239309866548e-06, "loss": 0.0595, "num_input_tokens_seen": 40345184, "step": 59855 }, { "epoch": 1.462389758874258, "grad_norm": 16.44167137145996, "learning_rate": 1.7826708540011363e-06, "loss": 0.1361, "num_input_tokens_seen": 40348576, "step": 59860 }, { "epoch": 1.4625119097061052, "grad_norm": 0.0644519031047821, "learning_rate": 1.7826177713238248e-06, "loss": 0.0892, "num_input_tokens_seen": 40352224, "step": 59865 }, { "epoch": 1.4626340605379522, "grad_norm": 0.4683791995048523, "learning_rate": 1.7825646829551064e-06, "loss": 0.002, "num_input_tokens_seen": 40355744, "step": 59870 }, { "epoch": 1.4627562113697994, "grad_norm": 0.19856318831443787, "learning_rate": 1.782511588895367e-06, "loss": 0.0888, "num_input_tokens_seen": 40359008, "step": 59875 }, { "epoch": 1.4628783622016466, "grad_norm": 34.53307342529297, "learning_rate": 1.782458489144993e-06, "loss": 0.1662, "num_input_tokens_seen": 40362208, "step": 59880 }, { "epoch": 1.4630005130334938, "grad_norm": 0.33793848752975464, "learning_rate": 1.7824053837043706e-06, "loss": 0.1133, "num_input_tokens_seen": 40365472, "step": 59885 }, { "epoch": 1.463122663865341, "grad_norm": 0.9318715929985046, "learning_rate": 1.7823522725738855e-06, "loss": 0.0614, "num_input_tokens_seen": 40368736, "step": 59890 }, { "epoch": 1.4632448146971881, "grad_norm": 5.252325534820557, "learning_rate": 1.7822991557539244e-06, "loss": 0.0025, "num_input_tokens_seen": 40372128, "step": 59895 }, { "epoch": 1.4633669655290353, "grad_norm": 11.749712944030762, "learning_rate": 1.7822460332448733e-06, "loss": 0.1934, "num_input_tokens_seen": 40375392, "step": 59900 }, { "epoch": 1.4634891163608823, "grad_norm": 15.675751686096191, "learning_rate": 1.7821929050471188e-06, "loss": 0.0993, "num_input_tokens_seen": 40378464, "step": 59905 }, { "epoch": 1.4636112671927295, "grad_norm": 19.527395248413086, "learning_rate": 1.7821397711610468e-06, "loss": 0.0423, "num_input_tokens_seen": 40382304, "step": 59910 }, { "epoch": 1.4637334180245767, "grad_norm": 0.1722240298986435, "learning_rate": 1.7820866315870444e-06, "loss": 0.0396, "num_input_tokens_seen": 40386272, "step": 59915 }, { "epoch": 1.4638555688564239, "grad_norm": 90.19779968261719, "learning_rate": 1.7820334863254974e-06, "loss": 0.0489, "num_input_tokens_seen": 40390112, "step": 59920 }, { "epoch": 1.463977719688271, "grad_norm": 0.4945489168167114, "learning_rate": 1.7819803353767926e-06, "loss": 0.1565, "num_input_tokens_seen": 40393888, "step": 59925 }, { "epoch": 1.4640998705201183, "grad_norm": 15.186417579650879, "learning_rate": 1.7819271787413164e-06, "loss": 0.1662, "num_input_tokens_seen": 40397088, "step": 59930 }, { "epoch": 1.4642220213519654, "grad_norm": 0.28768110275268555, "learning_rate": 1.7818740164194556e-06, "loss": 0.1424, "num_input_tokens_seen": 40400736, "step": 59935 }, { "epoch": 1.4643441721838126, "grad_norm": 0.15257716178894043, "learning_rate": 1.7818208484115967e-06, "loss": 0.0331, "num_input_tokens_seen": 40404768, "step": 59940 }, { "epoch": 1.4644663230156598, "grad_norm": 0.33717256784439087, "learning_rate": 1.781767674718126e-06, "loss": 0.1021, "num_input_tokens_seen": 40407968, "step": 59945 }, { "epoch": 1.464588473847507, "grad_norm": 0.1763121485710144, "learning_rate": 1.7817144953394307e-06, "loss": 0.0847, "num_input_tokens_seen": 40411104, "step": 59950 }, { "epoch": 1.4647106246793542, "grad_norm": 64.01554870605469, "learning_rate": 1.7816613102758976e-06, "loss": 0.1078, "num_input_tokens_seen": 40414368, "step": 59955 }, { "epoch": 1.4648327755112012, "grad_norm": 2.6066277027130127, "learning_rate": 1.781608119527913e-06, "loss": 0.0459, "num_input_tokens_seen": 40417568, "step": 59960 }, { "epoch": 1.4649549263430484, "grad_norm": 0.572694718837738, "learning_rate": 1.7815549230958637e-06, "loss": 0.1725, "num_input_tokens_seen": 40420640, "step": 59965 }, { "epoch": 1.4650770771748955, "grad_norm": 44.864845275878906, "learning_rate": 1.7815017209801369e-06, "loss": 0.1199, "num_input_tokens_seen": 40423648, "step": 59970 }, { "epoch": 1.4651992280067427, "grad_norm": 0.10838565975427628, "learning_rate": 1.7814485131811195e-06, "loss": 0.0028, "num_input_tokens_seen": 40427232, "step": 59975 }, { "epoch": 1.46532137883859, "grad_norm": 12.047739028930664, "learning_rate": 1.7813952996991984e-06, "loss": 0.2917, "num_input_tokens_seen": 40430176, "step": 59980 }, { "epoch": 1.4654435296704371, "grad_norm": 0.120354562997818, "learning_rate": 1.7813420805347602e-06, "loss": 0.0253, "num_input_tokens_seen": 40433696, "step": 59985 }, { "epoch": 1.465565680502284, "grad_norm": 34.92924880981445, "learning_rate": 1.7812888556881926e-06, "loss": 0.1689, "num_input_tokens_seen": 40436896, "step": 59990 }, { "epoch": 1.4656878313341313, "grad_norm": 10.803354263305664, "learning_rate": 1.781235625159882e-06, "loss": 0.0497, "num_input_tokens_seen": 40440544, "step": 59995 }, { "epoch": 1.4658099821659785, "grad_norm": 19.33785629272461, "learning_rate": 1.781182388950216e-06, "loss": 0.048, "num_input_tokens_seen": 40443936, "step": 60000 }, { "epoch": 1.4659321329978257, "grad_norm": 0.15235191583633423, "learning_rate": 1.7811291470595815e-06, "loss": 0.1678, "num_input_tokens_seen": 40447968, "step": 60005 }, { "epoch": 1.4660542838296728, "grad_norm": 0.34952807426452637, "learning_rate": 1.7810758994883656e-06, "loss": 0.0026, "num_input_tokens_seen": 40451744, "step": 60010 }, { "epoch": 1.46617643466152, "grad_norm": 0.13237085938453674, "learning_rate": 1.781022646236956e-06, "loss": 0.0326, "num_input_tokens_seen": 40454880, "step": 60015 }, { "epoch": 1.4662985854933672, "grad_norm": 10.759251594543457, "learning_rate": 1.7809693873057393e-06, "loss": 0.1341, "num_input_tokens_seen": 40458080, "step": 60020 }, { "epoch": 1.4664207363252144, "grad_norm": 0.4018060863018036, "learning_rate": 1.7809161226951032e-06, "loss": 0.0014, "num_input_tokens_seen": 40461856, "step": 60025 }, { "epoch": 1.4665428871570616, "grad_norm": 35.23345184326172, "learning_rate": 1.7808628524054352e-06, "loss": 0.1523, "num_input_tokens_seen": 40465056, "step": 60030 }, { "epoch": 1.4666650379889088, "grad_norm": 9.999808311462402, "learning_rate": 1.7808095764371225e-06, "loss": 0.0853, "num_input_tokens_seen": 40468256, "step": 60035 }, { "epoch": 1.466787188820756, "grad_norm": 4.482977390289307, "learning_rate": 1.7807562947905526e-06, "loss": 0.1113, "num_input_tokens_seen": 40471712, "step": 60040 }, { "epoch": 1.4669093396526032, "grad_norm": 109.09564208984375, "learning_rate": 1.7807030074661127e-06, "loss": 0.0828, "num_input_tokens_seen": 40475360, "step": 60045 }, { "epoch": 1.4670314904844501, "grad_norm": 24.180208206176758, "learning_rate": 1.7806497144641909e-06, "loss": 0.0282, "num_input_tokens_seen": 40478752, "step": 60050 }, { "epoch": 1.4671536413162973, "grad_norm": 0.015254548750817776, "learning_rate": 1.7805964157851739e-06, "loss": 0.0005, "num_input_tokens_seen": 40482016, "step": 60055 }, { "epoch": 1.4672757921481445, "grad_norm": 22.02342987060547, "learning_rate": 1.7805431114294503e-06, "loss": 0.0425, "num_input_tokens_seen": 40485280, "step": 60060 }, { "epoch": 1.4673979429799917, "grad_norm": 0.04338202252984047, "learning_rate": 1.7804898013974068e-06, "loss": 0.1388, "num_input_tokens_seen": 40489056, "step": 60065 }, { "epoch": 1.467520093811839, "grad_norm": 0.06497588753700256, "learning_rate": 1.780436485689432e-06, "loss": 0.0005, "num_input_tokens_seen": 40492768, "step": 60070 }, { "epoch": 1.467642244643686, "grad_norm": 1.3534506559371948, "learning_rate": 1.7803831643059128e-06, "loss": 0.0839, "num_input_tokens_seen": 40496160, "step": 60075 }, { "epoch": 1.467764395475533, "grad_norm": 0.034730665385723114, "learning_rate": 1.7803298372472373e-06, "loss": 0.1656, "num_input_tokens_seen": 40499552, "step": 60080 }, { "epoch": 1.4678865463073802, "grad_norm": 57.326541900634766, "learning_rate": 1.7802765045137935e-06, "loss": 0.0605, "num_input_tokens_seen": 40502432, "step": 60085 }, { "epoch": 1.4680086971392274, "grad_norm": 10.931227684020996, "learning_rate": 1.7802231661059692e-06, "loss": 0.0969, "num_input_tokens_seen": 40505824, "step": 60090 }, { "epoch": 1.4681308479710746, "grad_norm": 0.2942550778388977, "learning_rate": 1.780169822024152e-06, "loss": 0.0764, "num_input_tokens_seen": 40509920, "step": 60095 }, { "epoch": 1.4682529988029218, "grad_norm": 2.650104522705078, "learning_rate": 1.78011647226873e-06, "loss": 0.2528, "num_input_tokens_seen": 40513120, "step": 60100 }, { "epoch": 1.468375149634769, "grad_norm": 0.17480523884296417, "learning_rate": 1.7800631168400915e-06, "loss": 0.0569, "num_input_tokens_seen": 40516512, "step": 60105 }, { "epoch": 1.4684973004666162, "grad_norm": 131.7166290283203, "learning_rate": 1.7800097557386238e-06, "loss": 0.2744, "num_input_tokens_seen": 40519776, "step": 60110 }, { "epoch": 1.4686194512984634, "grad_norm": 0.33879879117012024, "learning_rate": 1.7799563889647156e-06, "loss": 0.1327, "num_input_tokens_seen": 40522976, "step": 60115 }, { "epoch": 1.4687416021303106, "grad_norm": 0.4208439588546753, "learning_rate": 1.7799030165187548e-06, "loss": 0.0038, "num_input_tokens_seen": 40526496, "step": 60120 }, { "epoch": 1.4688637529621578, "grad_norm": 8.661870002746582, "learning_rate": 1.7798496384011291e-06, "loss": 0.0437, "num_input_tokens_seen": 40529888, "step": 60125 }, { "epoch": 1.468985903794005, "grad_norm": 2.0140016078948975, "learning_rate": 1.7797962546122274e-06, "loss": 0.0683, "num_input_tokens_seen": 40533664, "step": 60130 }, { "epoch": 1.469108054625852, "grad_norm": 14.874700546264648, "learning_rate": 1.7797428651524378e-06, "loss": 0.0433, "num_input_tokens_seen": 40538016, "step": 60135 }, { "epoch": 1.4692302054576991, "grad_norm": 25.21275520324707, "learning_rate": 1.779689470022148e-06, "loss": 0.0481, "num_input_tokens_seen": 40541728, "step": 60140 }, { "epoch": 1.4693523562895463, "grad_norm": 11.36154556274414, "learning_rate": 1.7796360692217468e-06, "loss": 0.0712, "num_input_tokens_seen": 40544736, "step": 60145 }, { "epoch": 1.4694745071213935, "grad_norm": 0.37468650937080383, "learning_rate": 1.7795826627516224e-06, "loss": 0.0833, "num_input_tokens_seen": 40547808, "step": 60150 }, { "epoch": 1.4695966579532407, "grad_norm": 0.36312851309776306, "learning_rate": 1.779529250612163e-06, "loss": 0.0974, "num_input_tokens_seen": 40551712, "step": 60155 }, { "epoch": 1.4697188087850879, "grad_norm": 10.961259841918945, "learning_rate": 1.7794758328037575e-06, "loss": 0.0931, "num_input_tokens_seen": 40555424, "step": 60160 }, { "epoch": 1.469840959616935, "grad_norm": 13.064085960388184, "learning_rate": 1.779422409326794e-06, "loss": 0.1147, "num_input_tokens_seen": 40559072, "step": 60165 }, { "epoch": 1.469963110448782, "grad_norm": 12.516934394836426, "learning_rate": 1.779368980181661e-06, "loss": 0.1045, "num_input_tokens_seen": 40562208, "step": 60170 }, { "epoch": 1.4700852612806292, "grad_norm": 30.780580520629883, "learning_rate": 1.7793155453687473e-06, "loss": 0.1029, "num_input_tokens_seen": 40565664, "step": 60175 }, { "epoch": 1.4702074121124764, "grad_norm": 40.40333938598633, "learning_rate": 1.7792621048884412e-06, "loss": 0.1246, "num_input_tokens_seen": 40569056, "step": 60180 }, { "epoch": 1.4703295629443236, "grad_norm": 9.920197486877441, "learning_rate": 1.7792086587411315e-06, "loss": 0.2031, "num_input_tokens_seen": 40571936, "step": 60185 }, { "epoch": 1.4704517137761708, "grad_norm": 18.63920783996582, "learning_rate": 1.7791552069272071e-06, "loss": 0.1175, "num_input_tokens_seen": 40575520, "step": 60190 }, { "epoch": 1.470573864608018, "grad_norm": 1.8878223896026611, "learning_rate": 1.779101749447056e-06, "loss": 0.0451, "num_input_tokens_seen": 40579168, "step": 60195 }, { "epoch": 1.4706960154398652, "grad_norm": 13.10407543182373, "learning_rate": 1.779048286301068e-06, "loss": 0.1231, "num_input_tokens_seen": 40582240, "step": 60200 }, { "epoch": 1.4708181662717124, "grad_norm": 53.87441635131836, "learning_rate": 1.778994817489631e-06, "loss": 0.0335, "num_input_tokens_seen": 40585568, "step": 60205 }, { "epoch": 1.4709403171035595, "grad_norm": 0.6808045506477356, "learning_rate": 1.778941343013134e-06, "loss": 0.0749, "num_input_tokens_seen": 40588768, "step": 60210 }, { "epoch": 1.4710624679354067, "grad_norm": 0.517785906791687, "learning_rate": 1.7788878628719663e-06, "loss": 0.1077, "num_input_tokens_seen": 40591968, "step": 60215 }, { "epoch": 1.471184618767254, "grad_norm": 0.20099209249019623, "learning_rate": 1.7788343770665165e-06, "loss": 0.043, "num_input_tokens_seen": 40595424, "step": 60220 }, { "epoch": 1.471306769599101, "grad_norm": 0.23198401927947998, "learning_rate": 1.7787808855971737e-06, "loss": 0.0018, "num_input_tokens_seen": 40598560, "step": 60225 }, { "epoch": 1.471428920430948, "grad_norm": 32.09675216674805, "learning_rate": 1.7787273884643268e-06, "loss": 0.1062, "num_input_tokens_seen": 40601888, "step": 60230 }, { "epoch": 1.4715510712627953, "grad_norm": 0.19732925295829773, "learning_rate": 1.7786738856683647e-06, "loss": 0.039, "num_input_tokens_seen": 40605600, "step": 60235 }, { "epoch": 1.4716732220946425, "grad_norm": 0.18621958792209625, "learning_rate": 1.7786203772096768e-06, "loss": 0.0543, "num_input_tokens_seen": 40608928, "step": 60240 }, { "epoch": 1.4717953729264897, "grad_norm": 24.08376693725586, "learning_rate": 1.7785668630886521e-06, "loss": 0.2321, "num_input_tokens_seen": 40612192, "step": 60245 }, { "epoch": 1.4719175237583368, "grad_norm": 1.6449941396713257, "learning_rate": 1.77851334330568e-06, "loss": 0.039, "num_input_tokens_seen": 40615136, "step": 60250 }, { "epoch": 1.472039674590184, "grad_norm": 0.0721750482916832, "learning_rate": 1.7784598178611492e-06, "loss": 0.0719, "num_input_tokens_seen": 40618720, "step": 60255 }, { "epoch": 1.472161825422031, "grad_norm": 9.434758186340332, "learning_rate": 1.7784062867554493e-06, "loss": 0.2505, "num_input_tokens_seen": 40621984, "step": 60260 }, { "epoch": 1.4722839762538782, "grad_norm": 0.3179467022418976, "learning_rate": 1.7783527499889694e-06, "loss": 0.1049, "num_input_tokens_seen": 40625440, "step": 60265 }, { "epoch": 1.4724061270857254, "grad_norm": 34.49399948120117, "learning_rate": 1.778299207562099e-06, "loss": 0.0771, "num_input_tokens_seen": 40628192, "step": 60270 }, { "epoch": 1.4725282779175726, "grad_norm": 0.4097166657447815, "learning_rate": 1.7782456594752275e-06, "loss": 0.0427, "num_input_tokens_seen": 40631968, "step": 60275 }, { "epoch": 1.4726504287494198, "grad_norm": 0.10307589173316956, "learning_rate": 1.7781921057287442e-06, "loss": 0.0634, "num_input_tokens_seen": 40636128, "step": 60280 }, { "epoch": 1.472772579581267, "grad_norm": 0.3781689405441284, "learning_rate": 1.7781385463230385e-06, "loss": 0.0008, "num_input_tokens_seen": 40639136, "step": 60285 }, { "epoch": 1.4728947304131141, "grad_norm": 9.588024139404297, "learning_rate": 1.7780849812585e-06, "loss": 0.1044, "num_input_tokens_seen": 40641888, "step": 60290 }, { "epoch": 1.4730168812449613, "grad_norm": 0.3940590023994446, "learning_rate": 1.7780314105355183e-06, "loss": 0.1011, "num_input_tokens_seen": 40644832, "step": 60295 }, { "epoch": 1.4731390320768085, "grad_norm": 0.2877940833568573, "learning_rate": 1.7779778341544832e-06, "loss": 0.1502, "num_input_tokens_seen": 40648032, "step": 60300 }, { "epoch": 1.4732611829086557, "grad_norm": 22.556068420410156, "learning_rate": 1.7779242521157837e-06, "loss": 0.1569, "num_input_tokens_seen": 40651296, "step": 60305 }, { "epoch": 1.473383333740503, "grad_norm": 1.5956579446792603, "learning_rate": 1.77787066441981e-06, "loss": 0.1258, "num_input_tokens_seen": 40654624, "step": 60310 }, { "epoch": 1.4735054845723499, "grad_norm": 0.47433945536613464, "learning_rate": 1.7778170710669513e-06, "loss": 0.0015, "num_input_tokens_seen": 40658144, "step": 60315 }, { "epoch": 1.473627635404197, "grad_norm": 0.32676297426223755, "learning_rate": 1.7777634720575978e-06, "loss": 0.0685, "num_input_tokens_seen": 40661408, "step": 60320 }, { "epoch": 1.4737497862360442, "grad_norm": 0.8634158372879028, "learning_rate": 1.777709867392139e-06, "loss": 0.1081, "num_input_tokens_seen": 40664672, "step": 60325 }, { "epoch": 1.4738719370678914, "grad_norm": 0.12412005662918091, "learning_rate": 1.7776562570709652e-06, "loss": 0.0795, "num_input_tokens_seen": 40668128, "step": 60330 }, { "epoch": 1.4739940878997386, "grad_norm": 1.616155743598938, "learning_rate": 1.7776026410944659e-06, "loss": 0.1111, "num_input_tokens_seen": 40671200, "step": 60335 }, { "epoch": 1.4741162387315858, "grad_norm": 0.20798847079277039, "learning_rate": 1.7775490194630307e-06, "loss": 0.1285, "num_input_tokens_seen": 40674272, "step": 60340 }, { "epoch": 1.474238389563433, "grad_norm": 1.0060458183288574, "learning_rate": 1.7774953921770504e-06, "loss": 0.0018, "num_input_tokens_seen": 40677728, "step": 60345 }, { "epoch": 1.47436054039528, "grad_norm": 0.6992678642272949, "learning_rate": 1.7774417592369142e-06, "loss": 0.1822, "num_input_tokens_seen": 40681120, "step": 60350 }, { "epoch": 1.4744826912271272, "grad_norm": 0.23237474262714386, "learning_rate": 1.7773881206430122e-06, "loss": 0.0315, "num_input_tokens_seen": 40684512, "step": 60355 }, { "epoch": 1.4746048420589744, "grad_norm": 17.29538917541504, "learning_rate": 1.7773344763957349e-06, "loss": 0.0442, "num_input_tokens_seen": 40688480, "step": 60360 }, { "epoch": 1.4747269928908215, "grad_norm": 8.166539192199707, "learning_rate": 1.7772808264954724e-06, "loss": 0.1618, "num_input_tokens_seen": 40691872, "step": 60365 }, { "epoch": 1.4748491437226687, "grad_norm": 71.33097076416016, "learning_rate": 1.7772271709426145e-06, "loss": 0.0412, "num_input_tokens_seen": 40695264, "step": 60370 }, { "epoch": 1.474971294554516, "grad_norm": 0.055804669857025146, "learning_rate": 1.7771735097375514e-06, "loss": 0.0019, "num_input_tokens_seen": 40698976, "step": 60375 }, { "epoch": 1.4750934453863631, "grad_norm": 0.2121654748916626, "learning_rate": 1.777119842880674e-06, "loss": 0.0681, "num_input_tokens_seen": 40702432, "step": 60380 }, { "epoch": 1.4752155962182103, "grad_norm": 0.056822698563337326, "learning_rate": 1.7770661703723716e-06, "loss": 0.1681, "num_input_tokens_seen": 40705824, "step": 60385 }, { "epoch": 1.4753377470500575, "grad_norm": 20.086894989013672, "learning_rate": 1.7770124922130352e-06, "loss": 0.2188, "num_input_tokens_seen": 40709088, "step": 60390 }, { "epoch": 1.4754598978819047, "grad_norm": 60.431453704833984, "learning_rate": 1.7769588084030547e-06, "loss": 0.1257, "num_input_tokens_seen": 40712352, "step": 60395 }, { "epoch": 1.4755820487137519, "grad_norm": 21.10099983215332, "learning_rate": 1.776905118942821e-06, "loss": 0.1547, "num_input_tokens_seen": 40716192, "step": 60400 }, { "epoch": 1.4757041995455988, "grad_norm": 0.06091950461268425, "learning_rate": 1.7768514238327244e-06, "loss": 0.0813, "num_input_tokens_seen": 40719520, "step": 60405 }, { "epoch": 1.475826350377446, "grad_norm": 12.754447937011719, "learning_rate": 1.7767977230731552e-06, "loss": 0.0564, "num_input_tokens_seen": 40722528, "step": 60410 }, { "epoch": 1.4759485012092932, "grad_norm": 85.40321350097656, "learning_rate": 1.776744016664504e-06, "loss": 0.2374, "num_input_tokens_seen": 40725984, "step": 60415 }, { "epoch": 1.4760706520411404, "grad_norm": 0.15571066737174988, "learning_rate": 1.7766903046071613e-06, "loss": 0.0318, "num_input_tokens_seen": 40729248, "step": 60420 }, { "epoch": 1.4761928028729876, "grad_norm": 0.17578119039535522, "learning_rate": 1.776636586901518e-06, "loss": 0.0308, "num_input_tokens_seen": 40732576, "step": 60425 }, { "epoch": 1.4763149537048348, "grad_norm": 95.00015258789062, "learning_rate": 1.7765828635479645e-06, "loss": 0.0675, "num_input_tokens_seen": 40735840, "step": 60430 }, { "epoch": 1.476437104536682, "grad_norm": 9.245221138000488, "learning_rate": 1.7765291345468913e-06, "loss": 0.1742, "num_input_tokens_seen": 40739104, "step": 60435 }, { "epoch": 1.476559255368529, "grad_norm": 2.94415545463562, "learning_rate": 1.7764753998986898e-06, "loss": 0.0036, "num_input_tokens_seen": 40742944, "step": 60440 }, { "epoch": 1.4766814062003761, "grad_norm": 57.272377014160156, "learning_rate": 1.77642165960375e-06, "loss": 0.1498, "num_input_tokens_seen": 40746144, "step": 60445 }, { "epoch": 1.4768035570322233, "grad_norm": 0.19033876061439514, "learning_rate": 1.7763679136624632e-06, "loss": 0.042, "num_input_tokens_seen": 40749280, "step": 60450 }, { "epoch": 1.4769257078640705, "grad_norm": 0.36067670583724976, "learning_rate": 1.77631416207522e-06, "loss": 0.1468, "num_input_tokens_seen": 40752608, "step": 60455 }, { "epoch": 1.4770478586959177, "grad_norm": 17.70503807067871, "learning_rate": 1.7762604048424117e-06, "loss": 0.1179, "num_input_tokens_seen": 40756192, "step": 60460 }, { "epoch": 1.477170009527765, "grad_norm": 0.3026616871356964, "learning_rate": 1.7762066419644286e-06, "loss": 0.0883, "num_input_tokens_seen": 40759648, "step": 60465 }, { "epoch": 1.477292160359612, "grad_norm": 0.1730181723833084, "learning_rate": 1.7761528734416621e-06, "loss": 0.0364, "num_input_tokens_seen": 40763232, "step": 60470 }, { "epoch": 1.4774143111914593, "grad_norm": 19.374736785888672, "learning_rate": 1.7760990992745033e-06, "loss": 0.0974, "num_input_tokens_seen": 40766496, "step": 60475 }, { "epoch": 1.4775364620233065, "grad_norm": 16.91132926940918, "learning_rate": 1.776045319463343e-06, "loss": 0.1396, "num_input_tokens_seen": 40769760, "step": 60480 }, { "epoch": 1.4776586128551537, "grad_norm": 10.6881742477417, "learning_rate": 1.7759915340085724e-06, "loss": 0.1247, "num_input_tokens_seen": 40773088, "step": 60485 }, { "epoch": 1.4777807636870008, "grad_norm": 0.2255319356918335, "learning_rate": 1.7759377429105826e-06, "loss": 0.2395, "num_input_tokens_seen": 40776736, "step": 60490 }, { "epoch": 1.4779029145188478, "grad_norm": 0.22949527204036713, "learning_rate": 1.775883946169765e-06, "loss": 0.0034, "num_input_tokens_seen": 40779936, "step": 60495 }, { "epoch": 1.478025065350695, "grad_norm": 31.79056167602539, "learning_rate": 1.7758301437865107e-06, "loss": 0.0059, "num_input_tokens_seen": 40783584, "step": 60500 }, { "epoch": 1.4781472161825422, "grad_norm": 0.13012264668941498, "learning_rate": 1.7757763357612108e-06, "loss": 0.084, "num_input_tokens_seen": 40786720, "step": 60505 }, { "epoch": 1.4782693670143894, "grad_norm": 82.19654083251953, "learning_rate": 1.7757225220942567e-06, "loss": 0.0243, "num_input_tokens_seen": 40790496, "step": 60510 }, { "epoch": 1.4783915178462366, "grad_norm": 0.14999359846115112, "learning_rate": 1.7756687027860396e-06, "loss": 0.0969, "num_input_tokens_seen": 40793888, "step": 60515 }, { "epoch": 1.4785136686780838, "grad_norm": 0.06497511267662048, "learning_rate": 1.7756148778369512e-06, "loss": 0.0723, "num_input_tokens_seen": 40797024, "step": 60520 }, { "epoch": 1.4786358195099307, "grad_norm": 12.169816970825195, "learning_rate": 1.775561047247383e-06, "loss": 0.1222, "num_input_tokens_seen": 40800288, "step": 60525 }, { "epoch": 1.478757970341778, "grad_norm": 0.07355663180351257, "learning_rate": 1.775507211017726e-06, "loss": 0.219, "num_input_tokens_seen": 40803424, "step": 60530 }, { "epoch": 1.478880121173625, "grad_norm": 0.49550914764404297, "learning_rate": 1.7754533691483721e-06, "loss": 0.0009, "num_input_tokens_seen": 40806880, "step": 60535 }, { "epoch": 1.4790022720054723, "grad_norm": 2.6531922817230225, "learning_rate": 1.7753995216397128e-06, "loss": 0.0022, "num_input_tokens_seen": 40810080, "step": 60540 }, { "epoch": 1.4791244228373195, "grad_norm": 14.86800479888916, "learning_rate": 1.7753456684921395e-06, "loss": 0.1097, "num_input_tokens_seen": 40813472, "step": 60545 }, { "epoch": 1.4792465736691667, "grad_norm": 2.2587924003601074, "learning_rate": 1.775291809706044e-06, "loss": 0.0336, "num_input_tokens_seen": 40817120, "step": 60550 }, { "epoch": 1.4793687245010139, "grad_norm": 20.083477020263672, "learning_rate": 1.7752379452818179e-06, "loss": 0.0927, "num_input_tokens_seen": 40820320, "step": 60555 }, { "epoch": 1.479490875332861, "grad_norm": 0.12122409045696259, "learning_rate": 1.7751840752198528e-06, "loss": 0.0943, "num_input_tokens_seen": 40823904, "step": 60560 }, { "epoch": 1.4796130261647082, "grad_norm": 0.036234304308891296, "learning_rate": 1.7751301995205408e-06, "loss": 0.0115, "num_input_tokens_seen": 40827552, "step": 60565 }, { "epoch": 1.4797351769965554, "grad_norm": 0.3813953101634979, "learning_rate": 1.7750763181842735e-06, "loss": 0.0008, "num_input_tokens_seen": 40831136, "step": 60570 }, { "epoch": 1.4798573278284026, "grad_norm": 0.12160413712263107, "learning_rate": 1.7750224312114428e-06, "loss": 0.0518, "num_input_tokens_seen": 40834592, "step": 60575 }, { "epoch": 1.4799794786602498, "grad_norm": 0.6403422355651855, "learning_rate": 1.7749685386024405e-06, "loss": 0.0595, "num_input_tokens_seen": 40837408, "step": 60580 }, { "epoch": 1.4801016294920968, "grad_norm": 14.806707382202148, "learning_rate": 1.7749146403576585e-06, "loss": 0.1728, "num_input_tokens_seen": 40840736, "step": 60585 }, { "epoch": 1.480223780323944, "grad_norm": 0.1330418586730957, "learning_rate": 1.7748607364774886e-06, "loss": 0.135, "num_input_tokens_seen": 40844384, "step": 60590 }, { "epoch": 1.4803459311557912, "grad_norm": 0.299897164106369, "learning_rate": 1.7748068269623234e-06, "loss": 0.0297, "num_input_tokens_seen": 40847712, "step": 60595 }, { "epoch": 1.4804680819876384, "grad_norm": 12.508942604064941, "learning_rate": 1.7747529118125542e-06, "loss": 0.3403, "num_input_tokens_seen": 40851040, "step": 60600 }, { "epoch": 1.4805902328194855, "grad_norm": 0.423649400472641, "learning_rate": 1.7746989910285738e-06, "loss": 0.0287, "num_input_tokens_seen": 40854560, "step": 60605 }, { "epoch": 1.4807123836513327, "grad_norm": 18.635122299194336, "learning_rate": 1.7746450646107736e-06, "loss": 0.0652, "num_input_tokens_seen": 40857760, "step": 60610 }, { "epoch": 1.4808345344831797, "grad_norm": 0.6192677021026611, "learning_rate": 1.7745911325595463e-06, "loss": 0.0025, "num_input_tokens_seen": 40861280, "step": 60615 }, { "epoch": 1.4809566853150269, "grad_norm": 58.895484924316406, "learning_rate": 1.7745371948752838e-06, "loss": 0.0533, "num_input_tokens_seen": 40864672, "step": 60620 }, { "epoch": 1.481078836146874, "grad_norm": 47.88134765625, "learning_rate": 1.774483251558379e-06, "loss": 0.1157, "num_input_tokens_seen": 40868192, "step": 60625 }, { "epoch": 1.4812009869787213, "grad_norm": 0.38593101501464844, "learning_rate": 1.7744293026092233e-06, "loss": 0.1239, "num_input_tokens_seen": 40871648, "step": 60630 }, { "epoch": 1.4813231378105685, "grad_norm": 8.302868843078613, "learning_rate": 1.7743753480282094e-06, "loss": 0.123, "num_input_tokens_seen": 40874976, "step": 60635 }, { "epoch": 1.4814452886424156, "grad_norm": 0.47456851601600647, "learning_rate": 1.7743213878157297e-06, "loss": 0.0655, "num_input_tokens_seen": 40878752, "step": 60640 }, { "epoch": 1.4815674394742628, "grad_norm": 0.27059075236320496, "learning_rate": 1.7742674219721768e-06, "loss": 0.0573, "num_input_tokens_seen": 40882400, "step": 60645 }, { "epoch": 1.48168959030611, "grad_norm": 0.7263038754463196, "learning_rate": 1.7742134504979425e-06, "loss": 0.0266, "num_input_tokens_seen": 40885984, "step": 60650 }, { "epoch": 1.4818117411379572, "grad_norm": 0.49012497067451477, "learning_rate": 1.77415947339342e-06, "loss": 0.0397, "num_input_tokens_seen": 40889376, "step": 60655 }, { "epoch": 1.4819338919698044, "grad_norm": 18.042882919311523, "learning_rate": 1.774105490659002e-06, "loss": 0.0456, "num_input_tokens_seen": 40892512, "step": 60660 }, { "epoch": 1.4820560428016516, "grad_norm": 0.13812461495399475, "learning_rate": 1.77405150229508e-06, "loss": 0.1718, "num_input_tokens_seen": 40895520, "step": 60665 }, { "epoch": 1.4821781936334986, "grad_norm": 0.17089881002902985, "learning_rate": 1.7739975083020474e-06, "loss": 0.0013, "num_input_tokens_seen": 40898592, "step": 60670 }, { "epoch": 1.4823003444653458, "grad_norm": 0.5600114464759827, "learning_rate": 1.773943508680297e-06, "loss": 0.1473, "num_input_tokens_seen": 40901728, "step": 60675 }, { "epoch": 1.482422495297193, "grad_norm": 413.35546875, "learning_rate": 1.7738895034302212e-06, "loss": 0.0816, "num_input_tokens_seen": 40905184, "step": 60680 }, { "epoch": 1.4825446461290401, "grad_norm": 13.743282318115234, "learning_rate": 1.7738354925522128e-06, "loss": 0.1157, "num_input_tokens_seen": 40908960, "step": 60685 }, { "epoch": 1.4826667969608873, "grad_norm": 0.12234126776456833, "learning_rate": 1.7737814760466643e-06, "loss": 0.0656, "num_input_tokens_seen": 40912096, "step": 60690 }, { "epoch": 1.4827889477927345, "grad_norm": 32.00730895996094, "learning_rate": 1.773727453913969e-06, "loss": 0.1437, "num_input_tokens_seen": 40915424, "step": 60695 }, { "epoch": 1.4829110986245817, "grad_norm": 23.14132308959961, "learning_rate": 1.7736734261545196e-06, "loss": 0.0724, "num_input_tokens_seen": 40919008, "step": 60700 }, { "epoch": 1.4830332494564287, "grad_norm": 123.06578826904297, "learning_rate": 1.773619392768709e-06, "loss": 0.1297, "num_input_tokens_seen": 40922336, "step": 60705 }, { "epoch": 1.4831554002882759, "grad_norm": 19.842958450317383, "learning_rate": 1.7735653537569299e-06, "loss": 0.128, "num_input_tokens_seen": 40925792, "step": 60710 }, { "epoch": 1.483277551120123, "grad_norm": 96.30461883544922, "learning_rate": 1.7735113091195755e-06, "loss": 0.1701, "num_input_tokens_seen": 40928672, "step": 60715 }, { "epoch": 1.4833997019519702, "grad_norm": 27.956899642944336, "learning_rate": 1.773457258857039e-06, "loss": 0.1278, "num_input_tokens_seen": 40932256, "step": 60720 }, { "epoch": 1.4835218527838174, "grad_norm": 17.01154327392578, "learning_rate": 1.773403202969713e-06, "loss": 0.0683, "num_input_tokens_seen": 40935584, "step": 60725 }, { "epoch": 1.4836440036156646, "grad_norm": 12.74868106842041, "learning_rate": 1.773349141457991e-06, "loss": 0.2014, "num_input_tokens_seen": 40938784, "step": 60730 }, { "epoch": 1.4837661544475118, "grad_norm": 2.696553945541382, "learning_rate": 1.7732950743222661e-06, "loss": 0.0533, "num_input_tokens_seen": 40942368, "step": 60735 }, { "epoch": 1.483888305279359, "grad_norm": 17.248987197875977, "learning_rate": 1.7732410015629315e-06, "loss": 0.1076, "num_input_tokens_seen": 40945440, "step": 60740 }, { "epoch": 1.4840104561112062, "grad_norm": 2.0968856811523438, "learning_rate": 1.77318692318038e-06, "loss": 0.0818, "num_input_tokens_seen": 40948512, "step": 60745 }, { "epoch": 1.4841326069430534, "grad_norm": 0.5884655117988586, "learning_rate": 1.7731328391750055e-06, "loss": 0.04, "num_input_tokens_seen": 40951840, "step": 60750 }, { "epoch": 1.4842547577749006, "grad_norm": 15.215673446655273, "learning_rate": 1.773078749547201e-06, "loss": 0.0523, "num_input_tokens_seen": 40955168, "step": 60755 }, { "epoch": 1.4843769086067475, "grad_norm": 190.5538330078125, "learning_rate": 1.77302465429736e-06, "loss": 0.051, "num_input_tokens_seen": 40958944, "step": 60760 }, { "epoch": 1.4844990594385947, "grad_norm": 10.600131034851074, "learning_rate": 1.7729705534258757e-06, "loss": 0.0605, "num_input_tokens_seen": 40962336, "step": 60765 }, { "epoch": 1.484621210270442, "grad_norm": 0.3233378827571869, "learning_rate": 1.7729164469331418e-06, "loss": 0.0227, "num_input_tokens_seen": 40966112, "step": 60770 }, { "epoch": 1.484743361102289, "grad_norm": 0.6192806959152222, "learning_rate": 1.7728623348195515e-06, "loss": 0.0049, "num_input_tokens_seen": 40969184, "step": 60775 }, { "epoch": 1.4848655119341363, "grad_norm": 10.349754333496094, "learning_rate": 1.7728082170854983e-06, "loss": 0.1831, "num_input_tokens_seen": 40972768, "step": 60780 }, { "epoch": 1.4849876627659835, "grad_norm": 17.2828426361084, "learning_rate": 1.772754093731376e-06, "loss": 0.073, "num_input_tokens_seen": 40976672, "step": 60785 }, { "epoch": 1.4851098135978307, "grad_norm": 0.6874222755432129, "learning_rate": 1.772699964757578e-06, "loss": 0.0274, "num_input_tokens_seen": 40980192, "step": 60790 }, { "epoch": 1.4852319644296776, "grad_norm": 0.15286195278167725, "learning_rate": 1.7726458301644982e-06, "loss": 0.0567, "num_input_tokens_seen": 40983712, "step": 60795 }, { "epoch": 1.4853541152615248, "grad_norm": 14.385998725891113, "learning_rate": 1.7725916899525298e-06, "loss": 0.0805, "num_input_tokens_seen": 40987104, "step": 60800 }, { "epoch": 1.485476266093372, "grad_norm": 0.2787397503852844, "learning_rate": 1.7725375441220672e-06, "loss": 0.0451, "num_input_tokens_seen": 40990816, "step": 60805 }, { "epoch": 1.4855984169252192, "grad_norm": 0.04248502850532532, "learning_rate": 1.7724833926735037e-06, "loss": 0.1048, "num_input_tokens_seen": 40994336, "step": 60810 }, { "epoch": 1.4857205677570664, "grad_norm": 0.4895806610584259, "learning_rate": 1.772429235607233e-06, "loss": 0.0065, "num_input_tokens_seen": 40998240, "step": 60815 }, { "epoch": 1.4858427185889136, "grad_norm": 18.34246063232422, "learning_rate": 1.7723750729236492e-06, "loss": 0.08, "num_input_tokens_seen": 41001632, "step": 60820 }, { "epoch": 1.4859648694207608, "grad_norm": 0.18905378878116608, "learning_rate": 1.7723209046231462e-06, "loss": 0.0097, "num_input_tokens_seen": 41005344, "step": 60825 }, { "epoch": 1.486087020252608, "grad_norm": 0.3081001341342926, "learning_rate": 1.772266730706118e-06, "loss": 0.1109, "num_input_tokens_seen": 41008736, "step": 60830 }, { "epoch": 1.4862091710844552, "grad_norm": 70.08404541015625, "learning_rate": 1.772212551172958e-06, "loss": 0.0916, "num_input_tokens_seen": 41012064, "step": 60835 }, { "epoch": 1.4863313219163024, "grad_norm": 0.1584898680448532, "learning_rate": 1.772158366024061e-06, "loss": 0.1368, "num_input_tokens_seen": 41015264, "step": 60840 }, { "epoch": 1.4864534727481495, "grad_norm": 0.09998618066310883, "learning_rate": 1.7721041752598205e-06, "loss": 0.1338, "num_input_tokens_seen": 41018272, "step": 60845 }, { "epoch": 1.4865756235799965, "grad_norm": 0.2783723771572113, "learning_rate": 1.7720499788806307e-06, "loss": 0.0726, "num_input_tokens_seen": 41021472, "step": 60850 }, { "epoch": 1.4866977744118437, "grad_norm": 0.23562151193618774, "learning_rate": 1.771995776886886e-06, "loss": 0.1353, "num_input_tokens_seen": 41024800, "step": 60855 }, { "epoch": 1.4868199252436909, "grad_norm": 0.4206966459751129, "learning_rate": 1.7719415692789803e-06, "loss": 0.0378, "num_input_tokens_seen": 41028256, "step": 60860 }, { "epoch": 1.486942076075538, "grad_norm": 1.2113614082336426, "learning_rate": 1.771887356057308e-06, "loss": 0.0713, "num_input_tokens_seen": 41031840, "step": 60865 }, { "epoch": 1.4870642269073853, "grad_norm": 15.251075744628906, "learning_rate": 1.7718331372222629e-06, "loss": 0.1079, "num_input_tokens_seen": 41035936, "step": 60870 }, { "epoch": 1.4871863777392325, "grad_norm": 0.31838202476501465, "learning_rate": 1.7717789127742399e-06, "loss": 0.1966, "num_input_tokens_seen": 41039520, "step": 60875 }, { "epoch": 1.4873085285710796, "grad_norm": 0.06436757743358612, "learning_rate": 1.771724682713633e-06, "loss": 0.0857, "num_input_tokens_seen": 41042912, "step": 60880 }, { "epoch": 1.4874306794029266, "grad_norm": 0.29811179637908936, "learning_rate": 1.7716704470408365e-06, "loss": 0.1083, "num_input_tokens_seen": 41046112, "step": 60885 }, { "epoch": 1.4875528302347738, "grad_norm": 16.720657348632812, "learning_rate": 1.7716162057562451e-06, "loss": 0.0346, "num_input_tokens_seen": 41049824, "step": 60890 }, { "epoch": 1.487674981066621, "grad_norm": 2.4709200859069824, "learning_rate": 1.771561958860253e-06, "loss": 0.0555, "num_input_tokens_seen": 41053664, "step": 60895 }, { "epoch": 1.4877971318984682, "grad_norm": 0.22559817135334015, "learning_rate": 1.771507706353255e-06, "loss": 0.0507, "num_input_tokens_seen": 41057184, "step": 60900 }, { "epoch": 1.4879192827303154, "grad_norm": 1.0803998708724976, "learning_rate": 1.7714534482356454e-06, "loss": 0.002, "num_input_tokens_seen": 41060256, "step": 60905 }, { "epoch": 1.4880414335621626, "grad_norm": 10.669313430786133, "learning_rate": 1.7713991845078186e-06, "loss": 0.1203, "num_input_tokens_seen": 41063520, "step": 60910 }, { "epoch": 1.4881635843940098, "grad_norm": 0.5857841968536377, "learning_rate": 1.7713449151701698e-06, "loss": 0.0725, "num_input_tokens_seen": 41066720, "step": 60915 }, { "epoch": 1.488285735225857, "grad_norm": 0.047439418733119965, "learning_rate": 1.7712906402230933e-06, "loss": 0.0618, "num_input_tokens_seen": 41069920, "step": 60920 }, { "epoch": 1.4884078860577041, "grad_norm": 0.353522926568985, "learning_rate": 1.7712363596669835e-06, "loss": 0.0585, "num_input_tokens_seen": 41073120, "step": 60925 }, { "epoch": 1.4885300368895513, "grad_norm": 0.3170434236526489, "learning_rate": 1.7711820735022354e-06, "loss": 0.0559, "num_input_tokens_seen": 41076256, "step": 60930 }, { "epoch": 1.4886521877213985, "grad_norm": 0.6093167066574097, "learning_rate": 1.7711277817292443e-06, "loss": 0.0344, "num_input_tokens_seen": 41079392, "step": 60935 }, { "epoch": 1.4887743385532455, "grad_norm": 0.14489717781543732, "learning_rate": 1.7710734843484044e-06, "loss": 0.0011, "num_input_tokens_seen": 41083232, "step": 60940 }, { "epoch": 1.4888964893850927, "grad_norm": 0.2810435891151428, "learning_rate": 1.7710191813601102e-06, "loss": 0.0877, "num_input_tokens_seen": 41086560, "step": 60945 }, { "epoch": 1.4890186402169399, "grad_norm": 0.062317151576280594, "learning_rate": 1.770964872764758e-06, "loss": 0.0294, "num_input_tokens_seen": 41089888, "step": 60950 }, { "epoch": 1.489140791048787, "grad_norm": 0.14095567166805267, "learning_rate": 1.770910558562741e-06, "loss": 0.0886, "num_input_tokens_seen": 41093280, "step": 60955 }, { "epoch": 1.4892629418806342, "grad_norm": 31.579681396484375, "learning_rate": 1.7708562387544558e-06, "loss": 0.0311, "num_input_tokens_seen": 41096736, "step": 60960 }, { "epoch": 1.4893850927124814, "grad_norm": 19.332107543945312, "learning_rate": 1.7708019133402962e-06, "loss": 0.0838, "num_input_tokens_seen": 41100000, "step": 60965 }, { "epoch": 1.4895072435443286, "grad_norm": 0.7163593769073486, "learning_rate": 1.7707475823206582e-06, "loss": 0.0983, "num_input_tokens_seen": 41103456, "step": 60970 }, { "epoch": 1.4896293943761756, "grad_norm": 17.53666877746582, "learning_rate": 1.7706932456959362e-06, "loss": 0.0961, "num_input_tokens_seen": 41106720, "step": 60975 }, { "epoch": 1.4897515452080228, "grad_norm": 0.1998463273048401, "learning_rate": 1.7706389034665257e-06, "loss": 0.0471, "num_input_tokens_seen": 41110560, "step": 60980 }, { "epoch": 1.48987369603987, "grad_norm": 0.6920112371444702, "learning_rate": 1.7705845556328217e-06, "loss": 0.0978, "num_input_tokens_seen": 41114016, "step": 60985 }, { "epoch": 1.4899958468717172, "grad_norm": 16.81520652770996, "learning_rate": 1.7705302021952198e-06, "loss": 0.2061, "num_input_tokens_seen": 41117152, "step": 60990 }, { "epoch": 1.4901179977035643, "grad_norm": 90.10795593261719, "learning_rate": 1.7704758431541146e-06, "loss": 0.0534, "num_input_tokens_seen": 41120416, "step": 60995 }, { "epoch": 1.4902401485354115, "grad_norm": 0.15558181703090668, "learning_rate": 1.7704214785099024e-06, "loss": 0.0008, "num_input_tokens_seen": 41123872, "step": 61000 }, { "epoch": 1.4903622993672587, "grad_norm": 25.905107498168945, "learning_rate": 1.7703671082629776e-06, "loss": 0.2285, "num_input_tokens_seen": 41127520, "step": 61005 }, { "epoch": 1.490484450199106, "grad_norm": 0.2514127492904663, "learning_rate": 1.7703127324137358e-06, "loss": 0.0022, "num_input_tokens_seen": 41131296, "step": 61010 }, { "epoch": 1.490606601030953, "grad_norm": 13.766162872314453, "learning_rate": 1.7702583509625732e-06, "loss": 0.2344, "num_input_tokens_seen": 41134304, "step": 61015 }, { "epoch": 1.4907287518628003, "grad_norm": 0.7958281636238098, "learning_rate": 1.7702039639098842e-06, "loss": 0.0826, "num_input_tokens_seen": 41137568, "step": 61020 }, { "epoch": 1.4908509026946475, "grad_norm": 8.671177864074707, "learning_rate": 1.770149571256065e-06, "loss": 0.1647, "num_input_tokens_seen": 41140960, "step": 61025 }, { "epoch": 1.4909730535264945, "grad_norm": 21.409822463989258, "learning_rate": 1.7700951730015113e-06, "loss": 0.1514, "num_input_tokens_seen": 41144288, "step": 61030 }, { "epoch": 1.4910952043583416, "grad_norm": 11.56128215789795, "learning_rate": 1.770040769146618e-06, "loss": 0.068, "num_input_tokens_seen": 41148000, "step": 61035 }, { "epoch": 1.4912173551901888, "grad_norm": 0.8195192813873291, "learning_rate": 1.769986359691781e-06, "loss": 0.0308, "num_input_tokens_seen": 41151584, "step": 61040 }, { "epoch": 1.491339506022036, "grad_norm": 0.4487052857875824, "learning_rate": 1.7699319446373963e-06, "loss": 0.16, "num_input_tokens_seen": 41154848, "step": 61045 }, { "epoch": 1.4914616568538832, "grad_norm": 43.25626754760742, "learning_rate": 1.7698775239838596e-06, "loss": 0.1018, "num_input_tokens_seen": 41158304, "step": 61050 }, { "epoch": 1.4915838076857304, "grad_norm": 0.4304538071155548, "learning_rate": 1.769823097731566e-06, "loss": 0.1217, "num_input_tokens_seen": 41161696, "step": 61055 }, { "epoch": 1.4917059585175774, "grad_norm": 2.894843578338623, "learning_rate": 1.769768665880912e-06, "loss": 0.1014, "num_input_tokens_seen": 41165792, "step": 61060 }, { "epoch": 1.4918281093494246, "grad_norm": 135.68406677246094, "learning_rate": 1.7697142284322931e-06, "loss": 0.1832, "num_input_tokens_seen": 41169120, "step": 61065 }, { "epoch": 1.4919502601812717, "grad_norm": 72.74583435058594, "learning_rate": 1.7696597853861057e-06, "loss": 0.1397, "num_input_tokens_seen": 41172192, "step": 61070 }, { "epoch": 1.492072411013119, "grad_norm": 0.8611149787902832, "learning_rate": 1.769605336742745e-06, "loss": 0.0759, "num_input_tokens_seen": 41175584, "step": 61075 }, { "epoch": 1.4921945618449661, "grad_norm": 0.2745153605937958, "learning_rate": 1.7695508825026074e-06, "loss": 0.0608, "num_input_tokens_seen": 41178656, "step": 61080 }, { "epoch": 1.4923167126768133, "grad_norm": 24.806541442871094, "learning_rate": 1.7694964226660884e-06, "loss": 0.0982, "num_input_tokens_seen": 41181792, "step": 61085 }, { "epoch": 1.4924388635086605, "grad_norm": 13.456315040588379, "learning_rate": 1.769441957233585e-06, "loss": 0.0757, "num_input_tokens_seen": 41184864, "step": 61090 }, { "epoch": 1.4925610143405077, "grad_norm": 29.400386810302734, "learning_rate": 1.7693874862054928e-06, "loss": 0.0755, "num_input_tokens_seen": 41188640, "step": 61095 }, { "epoch": 1.4926831651723549, "grad_norm": 0.4075230062007904, "learning_rate": 1.7693330095822074e-06, "loss": 0.0749, "num_input_tokens_seen": 41192096, "step": 61100 }, { "epoch": 1.492805316004202, "grad_norm": 0.12924809753894806, "learning_rate": 1.7692785273641256e-06, "loss": 0.0377, "num_input_tokens_seen": 41195104, "step": 61105 }, { "epoch": 1.4929274668360493, "grad_norm": 0.38088515400886536, "learning_rate": 1.7692240395516435e-06, "loss": 0.0257, "num_input_tokens_seen": 41198816, "step": 61110 }, { "epoch": 1.4930496176678965, "grad_norm": 0.2689894735813141, "learning_rate": 1.7691695461451573e-06, "loss": 0.0178, "num_input_tokens_seen": 41202144, "step": 61115 }, { "epoch": 1.4931717684997434, "grad_norm": 23.341785430908203, "learning_rate": 1.769115047145063e-06, "loss": 0.1223, "num_input_tokens_seen": 41205344, "step": 61120 }, { "epoch": 1.4932939193315906, "grad_norm": 0.17626403272151947, "learning_rate": 1.7690605425517578e-06, "loss": 0.1508, "num_input_tokens_seen": 41208544, "step": 61125 }, { "epoch": 1.4934160701634378, "grad_norm": 0.6624254584312439, "learning_rate": 1.7690060323656368e-06, "loss": 0.026, "num_input_tokens_seen": 41211552, "step": 61130 }, { "epoch": 1.493538220995285, "grad_norm": 53.69667053222656, "learning_rate": 1.7689515165870974e-06, "loss": 0.0044, "num_input_tokens_seen": 41214944, "step": 61135 }, { "epoch": 1.4936603718271322, "grad_norm": 25.466035842895508, "learning_rate": 1.7688969952165358e-06, "loss": 0.0931, "num_input_tokens_seen": 41218400, "step": 61140 }, { "epoch": 1.4937825226589794, "grad_norm": 71.1301040649414, "learning_rate": 1.7688424682543483e-06, "loss": 0.0658, "num_input_tokens_seen": 41221472, "step": 61145 }, { "epoch": 1.4939046734908263, "grad_norm": 76.20751190185547, "learning_rate": 1.768787935700932e-06, "loss": 0.0199, "num_input_tokens_seen": 41225184, "step": 61150 }, { "epoch": 1.4940268243226735, "grad_norm": 1.1861448287963867, "learning_rate": 1.7687333975566828e-06, "loss": 0.0259, "num_input_tokens_seen": 41228448, "step": 61155 }, { "epoch": 1.4941489751545207, "grad_norm": 0.051383551210165024, "learning_rate": 1.7686788538219971e-06, "loss": 0.091, "num_input_tokens_seen": 41231904, "step": 61160 }, { "epoch": 1.494271125986368, "grad_norm": 0.0947948694229126, "learning_rate": 1.7686243044972727e-06, "loss": 0.0311, "num_input_tokens_seen": 41234912, "step": 61165 }, { "epoch": 1.494393276818215, "grad_norm": 7.97659158706665, "learning_rate": 1.7685697495829054e-06, "loss": 0.2079, "num_input_tokens_seen": 41237856, "step": 61170 }, { "epoch": 1.4945154276500623, "grad_norm": 28.53188705444336, "learning_rate": 1.768515189079292e-06, "loss": 0.1181, "num_input_tokens_seen": 41241056, "step": 61175 }, { "epoch": 1.4946375784819095, "grad_norm": 17.43120574951172, "learning_rate": 1.7684606229868294e-06, "loss": 0.2076, "num_input_tokens_seen": 41244128, "step": 61180 }, { "epoch": 1.4947597293137567, "grad_norm": 0.34168973565101624, "learning_rate": 1.7684060513059147e-06, "loss": 0.0468, "num_input_tokens_seen": 41247328, "step": 61185 }, { "epoch": 1.4948818801456039, "grad_norm": 0.08489463478326797, "learning_rate": 1.7683514740369442e-06, "loss": 0.0097, "num_input_tokens_seen": 41251040, "step": 61190 }, { "epoch": 1.495004030977451, "grad_norm": 0.22557464241981506, "learning_rate": 1.7682968911803157e-06, "loss": 0.0571, "num_input_tokens_seen": 41254240, "step": 61195 }, { "epoch": 1.4951261818092982, "grad_norm": 10.98664665222168, "learning_rate": 1.768242302736425e-06, "loss": 0.0489, "num_input_tokens_seen": 41257376, "step": 61200 }, { "epoch": 1.4952483326411452, "grad_norm": 0.31744417548179626, "learning_rate": 1.7681877087056699e-06, "loss": 0.1054, "num_input_tokens_seen": 41260832, "step": 61205 }, { "epoch": 1.4953704834729924, "grad_norm": 18.24903106689453, "learning_rate": 1.768133109088447e-06, "loss": 0.1017, "num_input_tokens_seen": 41263904, "step": 61210 }, { "epoch": 1.4954926343048396, "grad_norm": 1.512769341468811, "learning_rate": 1.7680785038851536e-06, "loss": 0.1413, "num_input_tokens_seen": 41267168, "step": 61215 }, { "epoch": 1.4956147851366868, "grad_norm": 12.158363342285156, "learning_rate": 1.768023893096187e-06, "loss": 0.073, "num_input_tokens_seen": 41270496, "step": 61220 }, { "epoch": 1.495736935968534, "grad_norm": 108.42285919189453, "learning_rate": 1.7679692767219437e-06, "loss": 0.1188, "num_input_tokens_seen": 41273824, "step": 61225 }, { "epoch": 1.4958590868003812, "grad_norm": 0.15408147871494293, "learning_rate": 1.7679146547628214e-06, "loss": 0.0501, "num_input_tokens_seen": 41277408, "step": 61230 }, { "epoch": 1.4959812376322283, "grad_norm": 2.2216410636901855, "learning_rate": 1.7678600272192172e-06, "loss": 0.0318, "num_input_tokens_seen": 41280864, "step": 61235 }, { "epoch": 1.4961033884640753, "grad_norm": 0.5559858083724976, "learning_rate": 1.7678053940915284e-06, "loss": 0.0297, "num_input_tokens_seen": 41284256, "step": 61240 }, { "epoch": 1.4962255392959225, "grad_norm": 0.15119442343711853, "learning_rate": 1.767750755380152e-06, "loss": 0.0066, "num_input_tokens_seen": 41287264, "step": 61245 }, { "epoch": 1.4963476901277697, "grad_norm": 0.7975942492485046, "learning_rate": 1.767696111085486e-06, "loss": 0.0994, "num_input_tokens_seen": 41290400, "step": 61250 }, { "epoch": 1.4964698409596169, "grad_norm": 15.413094520568848, "learning_rate": 1.767641461207927e-06, "loss": 0.1207, "num_input_tokens_seen": 41293664, "step": 61255 }, { "epoch": 1.496591991791464, "grad_norm": 1.0722957849502563, "learning_rate": 1.7675868057478733e-06, "loss": 0.0529, "num_input_tokens_seen": 41297184, "step": 61260 }, { "epoch": 1.4967141426233113, "grad_norm": 0.5817152857780457, "learning_rate": 1.7675321447057217e-06, "loss": 0.0674, "num_input_tokens_seen": 41300640, "step": 61265 }, { "epoch": 1.4968362934551585, "grad_norm": 15.53848934173584, "learning_rate": 1.7674774780818698e-06, "loss": 0.0671, "num_input_tokens_seen": 41303648, "step": 61270 }, { "epoch": 1.4969584442870056, "grad_norm": 0.3177522122859955, "learning_rate": 1.7674228058767151e-06, "loss": 0.0012, "num_input_tokens_seen": 41307232, "step": 61275 }, { "epoch": 1.4970805951188528, "grad_norm": 44.76844024658203, "learning_rate": 1.7673681280906556e-06, "loss": 0.1176, "num_input_tokens_seen": 41310432, "step": 61280 }, { "epoch": 1.4972027459507, "grad_norm": 0.697921633720398, "learning_rate": 1.7673134447240887e-06, "loss": 0.0818, "num_input_tokens_seen": 41314016, "step": 61285 }, { "epoch": 1.4973248967825472, "grad_norm": 0.05985622480511665, "learning_rate": 1.7672587557774117e-06, "loss": 0.0908, "num_input_tokens_seen": 41317152, "step": 61290 }, { "epoch": 1.4974470476143942, "grad_norm": 27.428375244140625, "learning_rate": 1.767204061251023e-06, "loss": 0.163, "num_input_tokens_seen": 41320288, "step": 61295 }, { "epoch": 1.4975691984462414, "grad_norm": 108.94695281982422, "learning_rate": 1.7671493611453202e-06, "loss": 0.0073, "num_input_tokens_seen": 41323488, "step": 61300 }, { "epoch": 1.4976913492780886, "grad_norm": 47.66750717163086, "learning_rate": 1.7670946554607006e-06, "loss": 0.006, "num_input_tokens_seen": 41327136, "step": 61305 }, { "epoch": 1.4978135001099357, "grad_norm": 0.47193843126296997, "learning_rate": 1.7670399441975622e-06, "loss": 0.1182, "num_input_tokens_seen": 41330784, "step": 61310 }, { "epoch": 1.497935650941783, "grad_norm": 69.85272979736328, "learning_rate": 1.766985227356303e-06, "loss": 0.1859, "num_input_tokens_seen": 41334112, "step": 61315 }, { "epoch": 1.4980578017736301, "grad_norm": 1.0827215909957886, "learning_rate": 1.766930504937321e-06, "loss": 0.0557, "num_input_tokens_seen": 41337696, "step": 61320 }, { "epoch": 1.4981799526054773, "grad_norm": 0.1586698740720749, "learning_rate": 1.7668757769410144e-06, "loss": 0.178, "num_input_tokens_seen": 41341472, "step": 61325 }, { "epoch": 1.4983021034373243, "grad_norm": 2.0001184940338135, "learning_rate": 1.7668210433677808e-06, "loss": 0.0019, "num_input_tokens_seen": 41345248, "step": 61330 }, { "epoch": 1.4984242542691715, "grad_norm": 53.12262725830078, "learning_rate": 1.7667663042180182e-06, "loss": 0.0774, "num_input_tokens_seen": 41348320, "step": 61335 }, { "epoch": 1.4985464051010187, "grad_norm": 0.2784538269042969, "learning_rate": 1.766711559492125e-06, "loss": 0.0294, "num_input_tokens_seen": 41351776, "step": 61340 }, { "epoch": 1.4986685559328659, "grad_norm": 0.18959470093250275, "learning_rate": 1.7666568091904989e-06, "loss": 0.1105, "num_input_tokens_seen": 41355104, "step": 61345 }, { "epoch": 1.498790706764713, "grad_norm": 0.5925389528274536, "learning_rate": 1.7666020533135382e-06, "loss": 0.0619, "num_input_tokens_seen": 41358240, "step": 61350 }, { "epoch": 1.4989128575965602, "grad_norm": 12.532886505126953, "learning_rate": 1.7665472918616412e-06, "loss": 0.1805, "num_input_tokens_seen": 41361504, "step": 61355 }, { "epoch": 1.4990350084284074, "grad_norm": 0.5867086052894592, "learning_rate": 1.7664925248352062e-06, "loss": 0.0039, "num_input_tokens_seen": 41364960, "step": 61360 }, { "epoch": 1.4991571592602546, "grad_norm": 37.33353805541992, "learning_rate": 1.7664377522346312e-06, "loss": 0.1081, "num_input_tokens_seen": 41368224, "step": 61365 }, { "epoch": 1.4992793100921018, "grad_norm": 0.01031525433063507, "learning_rate": 1.766382974060315e-06, "loss": 0.0004, "num_input_tokens_seen": 41371424, "step": 61370 }, { "epoch": 1.499401460923949, "grad_norm": 1.8833508491516113, "learning_rate": 1.7663281903126557e-06, "loss": 0.036, "num_input_tokens_seen": 41374496, "step": 61375 }, { "epoch": 1.4995236117557962, "grad_norm": 0.07677330821752548, "learning_rate": 1.7662734009920516e-06, "loss": 0.12, "num_input_tokens_seen": 41377888, "step": 61380 }, { "epoch": 1.4996457625876431, "grad_norm": 12.478357315063477, "learning_rate": 1.7662186060989011e-06, "loss": 0.1095, "num_input_tokens_seen": 41381152, "step": 61385 }, { "epoch": 1.4997679134194903, "grad_norm": 0.14680707454681396, "learning_rate": 1.7661638056336031e-06, "loss": 0.1075, "num_input_tokens_seen": 41384544, "step": 61390 }, { "epoch": 1.4998900642513375, "grad_norm": 0.17693321406841278, "learning_rate": 1.7661089995965556e-06, "loss": 0.0568, "num_input_tokens_seen": 41387872, "step": 61395 }, { "epoch": 1.5000122150831847, "grad_norm": 55.276275634765625, "learning_rate": 1.7660541879881574e-06, "loss": 0.0425, "num_input_tokens_seen": 41391200, "step": 61400 }, { "epoch": 1.5001099357486625, "eval_loss": 0.15240158140659332, "eval_runtime": 47.7435, "eval_samples_per_second": 762.093, "eval_steps_per_second": 95.28, "num_input_tokens_seen": 41393504, "step": 61404 }, { "epoch": 1.500134365915032, "grad_norm": 25.473384857177734, "learning_rate": 1.765999370808807e-06, "loss": 0.1068, "num_input_tokens_seen": 41394336, "step": 61405 }, { "epoch": 1.500256516746879, "grad_norm": 0.4577730596065521, "learning_rate": 1.7659445480589034e-06, "loss": 0.0767, "num_input_tokens_seen": 41397856, "step": 61410 }, { "epoch": 1.500378667578726, "grad_norm": 1.0948963165283203, "learning_rate": 1.765889719738845e-06, "loss": 0.0235, "num_input_tokens_seen": 41401120, "step": 61415 }, { "epoch": 1.5005008184105733, "grad_norm": 1.4936326742172241, "learning_rate": 1.7658348858490304e-06, "loss": 0.0499, "num_input_tokens_seen": 41404512, "step": 61420 }, { "epoch": 1.5006229692424204, "grad_norm": 128.0543670654297, "learning_rate": 1.7657800463898587e-06, "loss": 0.0994, "num_input_tokens_seen": 41407968, "step": 61425 }, { "epoch": 1.5007451200742676, "grad_norm": 0.02252628654241562, "learning_rate": 1.7657252013617283e-06, "loss": 0.048, "num_input_tokens_seen": 41411424, "step": 61430 }, { "epoch": 1.5008672709061148, "grad_norm": 1.9966330528259277, "learning_rate": 1.7656703507650386e-06, "loss": 0.1001, "num_input_tokens_seen": 41414624, "step": 61435 }, { "epoch": 1.500989421737962, "grad_norm": 12.924593925476074, "learning_rate": 1.765615494600188e-06, "loss": 0.0904, "num_input_tokens_seen": 41418272, "step": 61440 }, { "epoch": 1.5011115725698092, "grad_norm": 0.442061185836792, "learning_rate": 1.7655606328675754e-06, "loss": 0.1318, "num_input_tokens_seen": 41421600, "step": 61445 }, { "epoch": 1.5012337234016564, "grad_norm": 0.0899852067232132, "learning_rate": 1.7655057655676003e-06, "loss": 0.0189, "num_input_tokens_seen": 41425184, "step": 61450 }, { "epoch": 1.5013558742335036, "grad_norm": 0.10864616930484772, "learning_rate": 1.7654508927006612e-06, "loss": 0.0023, "num_input_tokens_seen": 41428448, "step": 61455 }, { "epoch": 1.5014780250653508, "grad_norm": 17.705018997192383, "learning_rate": 1.7653960142671574e-06, "loss": 0.137, "num_input_tokens_seen": 41431840, "step": 61460 }, { "epoch": 1.501600175897198, "grad_norm": 0.43008753657341003, "learning_rate": 1.7653411302674877e-06, "loss": 0.1144, "num_input_tokens_seen": 41434848, "step": 61465 }, { "epoch": 1.5017223267290452, "grad_norm": 12.536665916442871, "learning_rate": 1.7652862407020517e-06, "loss": 0.1333, "num_input_tokens_seen": 41437856, "step": 61470 }, { "epoch": 1.5018444775608923, "grad_norm": 0.07049085944890976, "learning_rate": 1.7652313455712483e-06, "loss": 0.0135, "num_input_tokens_seen": 41443104, "step": 61475 }, { "epoch": 1.5019666283927393, "grad_norm": 0.454694002866745, "learning_rate": 1.7651764448754767e-06, "loss": 0.085, "num_input_tokens_seen": 41446240, "step": 61480 }, { "epoch": 1.5020887792245865, "grad_norm": 0.4432663917541504, "learning_rate": 1.7651215386151361e-06, "loss": 0.1033, "num_input_tokens_seen": 41449888, "step": 61485 }, { "epoch": 1.5022109300564337, "grad_norm": 0.5971336960792542, "learning_rate": 1.765066626790626e-06, "loss": 0.0637, "num_input_tokens_seen": 41453728, "step": 61490 }, { "epoch": 1.5023330808882809, "grad_norm": 0.9203071594238281, "learning_rate": 1.7650117094023456e-06, "loss": 0.1901, "num_input_tokens_seen": 41457184, "step": 61495 }, { "epoch": 1.5024552317201278, "grad_norm": 20.82863426208496, "learning_rate": 1.764956786450694e-06, "loss": 0.0401, "num_input_tokens_seen": 41461024, "step": 61500 }, { "epoch": 1.502577382551975, "grad_norm": 0.18386538326740265, "learning_rate": 1.7649018579360712e-06, "loss": 0.147, "num_input_tokens_seen": 41464544, "step": 61505 }, { "epoch": 1.5026995333838222, "grad_norm": 0.1778634488582611, "learning_rate": 1.7648469238588763e-06, "loss": 0.0493, "num_input_tokens_seen": 41467616, "step": 61510 }, { "epoch": 1.5028216842156694, "grad_norm": 0.11675328761339188, "learning_rate": 1.764791984219509e-06, "loss": 0.0026, "num_input_tokens_seen": 41471456, "step": 61515 }, { "epoch": 1.5029438350475166, "grad_norm": 0.2592644691467285, "learning_rate": 1.7647370390183686e-06, "loss": 0.1136, "num_input_tokens_seen": 41475168, "step": 61520 }, { "epoch": 1.5030659858793638, "grad_norm": 0.22310876846313477, "learning_rate": 1.7646820882558546e-06, "loss": 0.0544, "num_input_tokens_seen": 41478880, "step": 61525 }, { "epoch": 1.503188136711211, "grad_norm": 17.4942569732666, "learning_rate": 1.7646271319323667e-06, "loss": 0.0734, "num_input_tokens_seen": 41482016, "step": 61530 }, { "epoch": 1.5033102875430582, "grad_norm": 22.144229888916016, "learning_rate": 1.7645721700483049e-06, "loss": 0.1048, "num_input_tokens_seen": 41485536, "step": 61535 }, { "epoch": 1.5034324383749054, "grad_norm": 0.22404468059539795, "learning_rate": 1.7645172026040687e-06, "loss": 0.0583, "num_input_tokens_seen": 41488800, "step": 61540 }, { "epoch": 1.5035545892067526, "grad_norm": 54.67439270019531, "learning_rate": 1.7644622296000575e-06, "loss": 0.1607, "num_input_tokens_seen": 41491936, "step": 61545 }, { "epoch": 1.5036767400385997, "grad_norm": 42.04545974731445, "learning_rate": 1.7644072510366714e-06, "loss": 0.1701, "num_input_tokens_seen": 41495520, "step": 61550 }, { "epoch": 1.503798890870447, "grad_norm": 12.226096153259277, "learning_rate": 1.7643522669143103e-06, "loss": 0.0329, "num_input_tokens_seen": 41498848, "step": 61555 }, { "epoch": 1.5039210417022941, "grad_norm": 0.7583820819854736, "learning_rate": 1.764297277233374e-06, "loss": 0.0945, "num_input_tokens_seen": 41502304, "step": 61560 }, { "epoch": 1.5040431925341413, "grad_norm": 1.9888880252838135, "learning_rate": 1.764242281994262e-06, "loss": 0.0717, "num_input_tokens_seen": 41505504, "step": 61565 }, { "epoch": 1.5041653433659883, "grad_norm": 0.3815326392650604, "learning_rate": 1.7641872811973749e-06, "loss": 0.0826, "num_input_tokens_seen": 41508768, "step": 61570 }, { "epoch": 1.5042874941978355, "grad_norm": 40.32561111450195, "learning_rate": 1.7641322748431122e-06, "loss": 0.1313, "num_input_tokens_seen": 41511904, "step": 61575 }, { "epoch": 1.5044096450296827, "grad_norm": 0.571736752986908, "learning_rate": 1.764077262931874e-06, "loss": 0.0715, "num_input_tokens_seen": 41515424, "step": 61580 }, { "epoch": 1.5045317958615299, "grad_norm": 27.67289161682129, "learning_rate": 1.7640222454640602e-06, "loss": 0.0735, "num_input_tokens_seen": 41519264, "step": 61585 }, { "epoch": 1.5046539466933768, "grad_norm": 0.2176530808210373, "learning_rate": 1.7639672224400716e-06, "loss": 0.0869, "num_input_tokens_seen": 41522400, "step": 61590 }, { "epoch": 1.504776097525224, "grad_norm": 43.00497055053711, "learning_rate": 1.763912193860308e-06, "loss": 0.0572, "num_input_tokens_seen": 41525600, "step": 61595 }, { "epoch": 1.5048982483570712, "grad_norm": 0.44729945063591003, "learning_rate": 1.763857159725169e-06, "loss": 0.0358, "num_input_tokens_seen": 41528992, "step": 61600 }, { "epoch": 1.5050203991889184, "grad_norm": 0.49896079301834106, "learning_rate": 1.7638021200350555e-06, "loss": 0.0787, "num_input_tokens_seen": 41532320, "step": 61605 }, { "epoch": 1.5051425500207656, "grad_norm": 128.6964874267578, "learning_rate": 1.7637470747903675e-06, "loss": 0.0351, "num_input_tokens_seen": 41535776, "step": 61610 }, { "epoch": 1.5052647008526128, "grad_norm": 0.9645973443984985, "learning_rate": 1.7636920239915053e-06, "loss": 0.0176, "num_input_tokens_seen": 41539040, "step": 61615 }, { "epoch": 1.50538685168446, "grad_norm": 0.09933934360742569, "learning_rate": 1.7636369676388694e-06, "loss": 0.0372, "num_input_tokens_seen": 41541984, "step": 61620 }, { "epoch": 1.5055090025163071, "grad_norm": 38.70368957519531, "learning_rate": 1.76358190573286e-06, "loss": 0.2492, "num_input_tokens_seen": 41545312, "step": 61625 }, { "epoch": 1.5056311533481543, "grad_norm": 0.6128526329994202, "learning_rate": 1.7635268382738774e-06, "loss": 0.0623, "num_input_tokens_seen": 41548448, "step": 61630 }, { "epoch": 1.5057533041800015, "grad_norm": 0.14343132078647614, "learning_rate": 1.7634717652623228e-06, "loss": 0.0943, "num_input_tokens_seen": 41551904, "step": 61635 }, { "epoch": 1.5058754550118487, "grad_norm": 32.26298522949219, "learning_rate": 1.7634166866985958e-06, "loss": 0.1596, "num_input_tokens_seen": 41555232, "step": 61640 }, { "epoch": 1.505997605843696, "grad_norm": 0.15098640322685242, "learning_rate": 1.7633616025830972e-06, "loss": 0.0694, "num_input_tokens_seen": 41558624, "step": 61645 }, { "epoch": 1.506119756675543, "grad_norm": 0.2517228424549103, "learning_rate": 1.7633065129162282e-06, "loss": 0.1143, "num_input_tokens_seen": 41562208, "step": 61650 }, { "epoch": 1.5062419075073903, "grad_norm": 0.30164381861686707, "learning_rate": 1.7632514176983886e-06, "loss": 0.0379, "num_input_tokens_seen": 41565664, "step": 61655 }, { "epoch": 1.5063640583392373, "grad_norm": 1.0749255418777466, "learning_rate": 1.7631963169299794e-06, "loss": 0.0141, "num_input_tokens_seen": 41568672, "step": 61660 }, { "epoch": 1.5064862091710844, "grad_norm": 9.241209030151367, "learning_rate": 1.7631412106114014e-06, "loss": 0.209, "num_input_tokens_seen": 41572000, "step": 61665 }, { "epoch": 1.5066083600029316, "grad_norm": 0.10479751229286194, "learning_rate": 1.763086098743055e-06, "loss": 0.0995, "num_input_tokens_seen": 41575200, "step": 61670 }, { "epoch": 1.5067305108347788, "grad_norm": 8.743705749511719, "learning_rate": 1.7630309813253417e-06, "loss": 0.3212, "num_input_tokens_seen": 41578208, "step": 61675 }, { "epoch": 1.5068526616666258, "grad_norm": 0.21430103480815887, "learning_rate": 1.7629758583586613e-06, "loss": 0.07, "num_input_tokens_seen": 41581600, "step": 61680 }, { "epoch": 1.506974812498473, "grad_norm": 0.16763003170490265, "learning_rate": 1.7629207298434157e-06, "loss": 0.0053, "num_input_tokens_seen": 41584928, "step": 61685 }, { "epoch": 1.5070969633303202, "grad_norm": 0.0599144846200943, "learning_rate": 1.7628655957800054e-06, "loss": 0.0005, "num_input_tokens_seen": 41588448, "step": 61690 }, { "epoch": 1.5072191141621674, "grad_norm": 0.06027615815401077, "learning_rate": 1.7628104561688311e-06, "loss": 0.0389, "num_input_tokens_seen": 41592096, "step": 61695 }, { "epoch": 1.5073412649940146, "grad_norm": 0.7883102893829346, "learning_rate": 1.7627553110102936e-06, "loss": 0.0391, "num_input_tokens_seen": 41595168, "step": 61700 }, { "epoch": 1.5074634158258617, "grad_norm": 0.34857890009880066, "learning_rate": 1.762700160304795e-06, "loss": 0.0008, "num_input_tokens_seen": 41598560, "step": 61705 }, { "epoch": 1.507585566657709, "grad_norm": 40.58872604370117, "learning_rate": 1.7626450040527355e-06, "loss": 0.0621, "num_input_tokens_seen": 41601632, "step": 61710 }, { "epoch": 1.5077077174895561, "grad_norm": 29.350317001342773, "learning_rate": 1.7625898422545163e-06, "loss": 0.1054, "num_input_tokens_seen": 41604704, "step": 61715 }, { "epoch": 1.5078298683214033, "grad_norm": 22.25473976135254, "learning_rate": 1.7625346749105385e-06, "loss": 0.1185, "num_input_tokens_seen": 41608352, "step": 61720 }, { "epoch": 1.5079520191532505, "grad_norm": 13.916818618774414, "learning_rate": 1.7624795020212036e-06, "loss": 0.1244, "num_input_tokens_seen": 41611808, "step": 61725 }, { "epoch": 1.5080741699850977, "grad_norm": 0.13384878635406494, "learning_rate": 1.762424323586913e-06, "loss": 0.0329, "num_input_tokens_seen": 41615072, "step": 61730 }, { "epoch": 1.5081963208169449, "grad_norm": 30.317041397094727, "learning_rate": 1.7623691396080674e-06, "loss": 0.0861, "num_input_tokens_seen": 41618272, "step": 61735 }, { "epoch": 1.508318471648792, "grad_norm": 1.0107468366622925, "learning_rate": 1.7623139500850682e-06, "loss": 0.0107, "num_input_tokens_seen": 41621216, "step": 61740 }, { "epoch": 1.5084406224806393, "grad_norm": 9.42230224609375, "learning_rate": 1.762258755018317e-06, "loss": 0.0473, "num_input_tokens_seen": 41624544, "step": 61745 }, { "epoch": 1.5085627733124862, "grad_norm": 0.12761496007442474, "learning_rate": 1.7622035544082153e-06, "loss": 0.0818, "num_input_tokens_seen": 41627872, "step": 61750 }, { "epoch": 1.5086849241443334, "grad_norm": 14.09807014465332, "learning_rate": 1.762148348255164e-06, "loss": 0.0524, "num_input_tokens_seen": 41631200, "step": 61755 }, { "epoch": 1.5088070749761806, "grad_norm": 120.56591796875, "learning_rate": 1.7620931365595651e-06, "loss": 0.1926, "num_input_tokens_seen": 41634016, "step": 61760 }, { "epoch": 1.5089292258080278, "grad_norm": 0.5521315336227417, "learning_rate": 1.7620379193218198e-06, "loss": 0.0595, "num_input_tokens_seen": 41637408, "step": 61765 }, { "epoch": 1.5090513766398748, "grad_norm": 89.52035522460938, "learning_rate": 1.7619826965423301e-06, "loss": 0.1163, "num_input_tokens_seen": 41640928, "step": 61770 }, { "epoch": 1.509173527471722, "grad_norm": 35.262062072753906, "learning_rate": 1.7619274682214971e-06, "loss": 0.1921, "num_input_tokens_seen": 41644320, "step": 61775 }, { "epoch": 1.5092956783035691, "grad_norm": 0.3857623040676117, "learning_rate": 1.7618722343597225e-06, "loss": 0.0986, "num_input_tokens_seen": 41647328, "step": 61780 }, { "epoch": 1.5094178291354163, "grad_norm": 0.6144522428512573, "learning_rate": 1.7618169949574082e-06, "loss": 0.1232, "num_input_tokens_seen": 41650400, "step": 61785 }, { "epoch": 1.5095399799672635, "grad_norm": 0.09201917052268982, "learning_rate": 1.7617617500149558e-06, "loss": 0.0418, "num_input_tokens_seen": 41653536, "step": 61790 }, { "epoch": 1.5096621307991107, "grad_norm": 0.29864656925201416, "learning_rate": 1.7617064995327674e-06, "loss": 0.0038, "num_input_tokens_seen": 41656736, "step": 61795 }, { "epoch": 1.509784281630958, "grad_norm": 0.046912241727113724, "learning_rate": 1.761651243511244e-06, "loss": 0.0024, "num_input_tokens_seen": 41659616, "step": 61800 }, { "epoch": 1.509906432462805, "grad_norm": 22.605022430419922, "learning_rate": 1.761595981950788e-06, "loss": 0.0471, "num_input_tokens_seen": 41662816, "step": 61805 }, { "epoch": 1.5100285832946523, "grad_norm": 17.55577278137207, "learning_rate": 1.7615407148518014e-06, "loss": 0.0431, "num_input_tokens_seen": 41666016, "step": 61810 }, { "epoch": 1.5101507341264995, "grad_norm": 12.887489318847656, "learning_rate": 1.7614854422146855e-06, "loss": 0.0486, "num_input_tokens_seen": 41669280, "step": 61815 }, { "epoch": 1.5102728849583467, "grad_norm": 0.6606060266494751, "learning_rate": 1.7614301640398429e-06, "loss": 0.1103, "num_input_tokens_seen": 41672672, "step": 61820 }, { "epoch": 1.5103950357901939, "grad_norm": 0.03383230045437813, "learning_rate": 1.7613748803276752e-06, "loss": 0.0483, "num_input_tokens_seen": 41676320, "step": 61825 }, { "epoch": 1.510517186622041, "grad_norm": 0.3259274959564209, "learning_rate": 1.761319591078585e-06, "loss": 0.0515, "num_input_tokens_seen": 41679648, "step": 61830 }, { "epoch": 1.510639337453888, "grad_norm": 0.23923155665397644, "learning_rate": 1.7612642962929733e-06, "loss": 0.1682, "num_input_tokens_seen": 41683040, "step": 61835 }, { "epoch": 1.5107614882857352, "grad_norm": 16.628183364868164, "learning_rate": 1.7612089959712434e-06, "loss": 0.1369, "num_input_tokens_seen": 41686368, "step": 61840 }, { "epoch": 1.5108836391175824, "grad_norm": 0.04525705799460411, "learning_rate": 1.7611536901137969e-06, "loss": 0.1149, "num_input_tokens_seen": 41689568, "step": 61845 }, { "epoch": 1.5110057899494296, "grad_norm": 0.886325478553772, "learning_rate": 1.7610983787210357e-06, "loss": 0.0051, "num_input_tokens_seen": 41692832, "step": 61850 }, { "epoch": 1.5111279407812768, "grad_norm": 0.14912337064743042, "learning_rate": 1.7610430617933628e-06, "loss": 0.087, "num_input_tokens_seen": 41696480, "step": 61855 }, { "epoch": 1.5112500916131237, "grad_norm": 0.42184045910835266, "learning_rate": 1.7609877393311798e-06, "loss": 0.002, "num_input_tokens_seen": 41699616, "step": 61860 }, { "epoch": 1.511372242444971, "grad_norm": 0.41546207666397095, "learning_rate": 1.7609324113348892e-06, "loss": 0.1238, "num_input_tokens_seen": 41703072, "step": 61865 }, { "epoch": 1.5114943932768181, "grad_norm": 0.3638473451137543, "learning_rate": 1.7608770778048936e-06, "loss": 0.1283, "num_input_tokens_seen": 41706464, "step": 61870 }, { "epoch": 1.5116165441086653, "grad_norm": 37.72365188598633, "learning_rate": 1.7608217387415954e-06, "loss": 0.1226, "num_input_tokens_seen": 41709664, "step": 61875 }, { "epoch": 1.5117386949405125, "grad_norm": 84.65232849121094, "learning_rate": 1.7607663941453966e-06, "loss": 0.2157, "num_input_tokens_seen": 41713504, "step": 61880 }, { "epoch": 1.5118608457723597, "grad_norm": 8.479555130004883, "learning_rate": 1.7607110440167e-06, "loss": 0.1551, "num_input_tokens_seen": 41717216, "step": 61885 }, { "epoch": 1.5119829966042069, "grad_norm": 0.47316303849220276, "learning_rate": 1.7606556883559081e-06, "loss": 0.0153, "num_input_tokens_seen": 41720544, "step": 61890 }, { "epoch": 1.512105147436054, "grad_norm": 14.604045867919922, "learning_rate": 1.7606003271634235e-06, "loss": 0.156, "num_input_tokens_seen": 41724128, "step": 61895 }, { "epoch": 1.5122272982679013, "grad_norm": 136.4307861328125, "learning_rate": 1.760544960439649e-06, "loss": 0.1273, "num_input_tokens_seen": 41727840, "step": 61900 }, { "epoch": 1.5123494490997484, "grad_norm": 1.5975284576416016, "learning_rate": 1.7604895881849865e-06, "loss": 0.0023, "num_input_tokens_seen": 41731424, "step": 61905 }, { "epoch": 1.5124715999315956, "grad_norm": 25.729183197021484, "learning_rate": 1.7604342103998393e-06, "loss": 0.0992, "num_input_tokens_seen": 41734752, "step": 61910 }, { "epoch": 1.5125937507634428, "grad_norm": 0.16049420833587646, "learning_rate": 1.76037882708461e-06, "loss": 0.0758, "num_input_tokens_seen": 41738080, "step": 61915 }, { "epoch": 1.51271590159529, "grad_norm": 0.9564996361732483, "learning_rate": 1.7603234382397014e-06, "loss": 0.0601, "num_input_tokens_seen": 41741216, "step": 61920 }, { "epoch": 1.512838052427137, "grad_norm": 23.576576232910156, "learning_rate": 1.7602680438655164e-06, "loss": 0.1144, "num_input_tokens_seen": 41744992, "step": 61925 }, { "epoch": 1.5129602032589842, "grad_norm": 18.271236419677734, "learning_rate": 1.7602126439624576e-06, "loss": 0.048, "num_input_tokens_seen": 41748448, "step": 61930 }, { "epoch": 1.5130823540908314, "grad_norm": 20.256929397583008, "learning_rate": 1.7601572385309279e-06, "loss": 0.072, "num_input_tokens_seen": 41751648, "step": 61935 }, { "epoch": 1.5132045049226786, "grad_norm": 0.0887921005487442, "learning_rate": 1.7601018275713301e-06, "loss": 0.0386, "num_input_tokens_seen": 41755040, "step": 61940 }, { "epoch": 1.5133266557545257, "grad_norm": 21.042631149291992, "learning_rate": 1.760046411084068e-06, "loss": 0.0827, "num_input_tokens_seen": 41758240, "step": 61945 }, { "epoch": 1.5134488065863727, "grad_norm": 34.03895950317383, "learning_rate": 1.7599909890695434e-06, "loss": 0.1892, "num_input_tokens_seen": 41761504, "step": 61950 }, { "epoch": 1.51357095741822, "grad_norm": 0.2431178241968155, "learning_rate": 1.7599355615281602e-06, "loss": 0.0432, "num_input_tokens_seen": 41764768, "step": 61955 }, { "epoch": 1.513693108250067, "grad_norm": 0.03246299922466278, "learning_rate": 1.7598801284603211e-06, "loss": 0.0385, "num_input_tokens_seen": 41768800, "step": 61960 }, { "epoch": 1.5138152590819143, "grad_norm": 0.05149323120713234, "learning_rate": 1.7598246898664293e-06, "loss": 0.0589, "num_input_tokens_seen": 41771808, "step": 61965 }, { "epoch": 1.5139374099137615, "grad_norm": 0.07109751552343369, "learning_rate": 1.759769245746888e-06, "loss": 0.0018, "num_input_tokens_seen": 41775072, "step": 61970 }, { "epoch": 1.5140595607456087, "grad_norm": 123.77120971679688, "learning_rate": 1.7597137961021004e-06, "loss": 0.2485, "num_input_tokens_seen": 41778784, "step": 61975 }, { "epoch": 1.5141817115774558, "grad_norm": 0.3029272258281708, "learning_rate": 1.7596583409324697e-06, "loss": 0.0541, "num_input_tokens_seen": 41782816, "step": 61980 }, { "epoch": 1.514303862409303, "grad_norm": 18.087604522705078, "learning_rate": 1.7596028802383995e-06, "loss": 0.1162, "num_input_tokens_seen": 41786848, "step": 61985 }, { "epoch": 1.5144260132411502, "grad_norm": 19.319604873657227, "learning_rate": 1.7595474140202927e-06, "loss": 0.0587, "num_input_tokens_seen": 41790304, "step": 61990 }, { "epoch": 1.5145481640729974, "grad_norm": 0.5831694006919861, "learning_rate": 1.7594919422785525e-06, "loss": 0.1085, "num_input_tokens_seen": 41793440, "step": 61995 }, { "epoch": 1.5146703149048446, "grad_norm": 0.0200160201638937, "learning_rate": 1.7594364650135827e-06, "loss": 0.0604, "num_input_tokens_seen": 41797088, "step": 62000 }, { "epoch": 1.5147924657366918, "grad_norm": 0.11137901246547699, "learning_rate": 1.759380982225787e-06, "loss": 0.0746, "num_input_tokens_seen": 41800224, "step": 62005 }, { "epoch": 1.514914616568539, "grad_norm": 0.20222550630569458, "learning_rate": 1.7593254939155684e-06, "loss": 0.0012, "num_input_tokens_seen": 41803488, "step": 62010 }, { "epoch": 1.515036767400386, "grad_norm": 0.36145535111427307, "learning_rate": 1.7592700000833305e-06, "loss": 0.1087, "num_input_tokens_seen": 41806944, "step": 62015 }, { "epoch": 1.5151589182322331, "grad_norm": 0.12234114110469818, "learning_rate": 1.759214500729477e-06, "loss": 0.0663, "num_input_tokens_seen": 41810016, "step": 62020 }, { "epoch": 1.5152810690640803, "grad_norm": 0.007487828843295574, "learning_rate": 1.7591589958544113e-06, "loss": 0.1765, "num_input_tokens_seen": 41813664, "step": 62025 }, { "epoch": 1.5154032198959275, "grad_norm": 0.17571020126342773, "learning_rate": 1.7591034854585373e-06, "loss": 0.0018, "num_input_tokens_seen": 41817120, "step": 62030 }, { "epoch": 1.5155253707277745, "grad_norm": 9.199395179748535, "learning_rate": 1.7590479695422587e-06, "loss": 0.086, "num_input_tokens_seen": 41820512, "step": 62035 }, { "epoch": 1.5156475215596217, "grad_norm": 11.8097505569458, "learning_rate": 1.758992448105979e-06, "loss": 0.1341, "num_input_tokens_seen": 41824160, "step": 62040 }, { "epoch": 1.5157696723914689, "grad_norm": 38.68931198120117, "learning_rate": 1.7589369211501019e-06, "loss": 0.0295, "num_input_tokens_seen": 41827168, "step": 62045 }, { "epoch": 1.515891823223316, "grad_norm": 0.1641501635313034, "learning_rate": 1.7588813886750315e-06, "loss": 0.0363, "num_input_tokens_seen": 41830560, "step": 62050 }, { "epoch": 1.5160139740551632, "grad_norm": 1.2902582883834839, "learning_rate": 1.7588258506811716e-06, "loss": 0.0621, "num_input_tokens_seen": 41833568, "step": 62055 }, { "epoch": 1.5161361248870104, "grad_norm": 50.38115310668945, "learning_rate": 1.7587703071689259e-06, "loss": 0.1542, "num_input_tokens_seen": 41837024, "step": 62060 }, { "epoch": 1.5162582757188576, "grad_norm": 0.12362977862358093, "learning_rate": 1.7587147581386988e-06, "loss": 0.0697, "num_input_tokens_seen": 41840864, "step": 62065 }, { "epoch": 1.5163804265507048, "grad_norm": 0.03375665470957756, "learning_rate": 1.7586592035908935e-06, "loss": 0.0945, "num_input_tokens_seen": 41844896, "step": 62070 }, { "epoch": 1.516502577382552, "grad_norm": 0.1380762904882431, "learning_rate": 1.7586036435259147e-06, "loss": 0.14, "num_input_tokens_seen": 41848544, "step": 62075 }, { "epoch": 1.5166247282143992, "grad_norm": 10.012866973876953, "learning_rate": 1.758548077944166e-06, "loss": 0.0913, "num_input_tokens_seen": 41851680, "step": 62080 }, { "epoch": 1.5167468790462464, "grad_norm": 0.35353362560272217, "learning_rate": 1.7584925068460516e-06, "loss": 0.0311, "num_input_tokens_seen": 41854880, "step": 62085 }, { "epoch": 1.5168690298780936, "grad_norm": 0.4082539677619934, "learning_rate": 1.7584369302319757e-06, "loss": 0.0375, "num_input_tokens_seen": 41857760, "step": 62090 }, { "epoch": 1.5169911807099408, "grad_norm": 0.18866710364818573, "learning_rate": 1.7583813481023424e-06, "loss": 0.0032, "num_input_tokens_seen": 41860960, "step": 62095 }, { "epoch": 1.517113331541788, "grad_norm": 0.3199007511138916, "learning_rate": 1.758325760457556e-06, "loss": 0.0452, "num_input_tokens_seen": 41864736, "step": 62100 }, { "epoch": 1.517235482373635, "grad_norm": 0.3004762828350067, "learning_rate": 1.7582701672980208e-06, "loss": 0.0375, "num_input_tokens_seen": 41868128, "step": 62105 }, { "epoch": 1.5173576332054821, "grad_norm": 0.13247863948345184, "learning_rate": 1.7582145686241412e-06, "loss": 0.1857, "num_input_tokens_seen": 41871328, "step": 62110 }, { "epoch": 1.5174797840373293, "grad_norm": 0.05241371691226959, "learning_rate": 1.7581589644363208e-06, "loss": 0.0529, "num_input_tokens_seen": 41874400, "step": 62115 }, { "epoch": 1.5176019348691765, "grad_norm": 0.058131348341703415, "learning_rate": 1.7581033547349648e-06, "loss": 0.0553, "num_input_tokens_seen": 41877728, "step": 62120 }, { "epoch": 1.5177240857010235, "grad_norm": 0.27324989438056946, "learning_rate": 1.7580477395204774e-06, "loss": 0.0991, "num_input_tokens_seen": 41880736, "step": 62125 }, { "epoch": 1.5178462365328707, "grad_norm": 0.12253384292125702, "learning_rate": 1.7579921187932628e-06, "loss": 0.1013, "num_input_tokens_seen": 41884896, "step": 62130 }, { "epoch": 1.5179683873647178, "grad_norm": 13.14607048034668, "learning_rate": 1.7579364925537257e-06, "loss": 0.2171, "num_input_tokens_seen": 41888160, "step": 62135 }, { "epoch": 1.518090538196565, "grad_norm": 34.0280876159668, "learning_rate": 1.7578808608022704e-06, "loss": 0.2605, "num_input_tokens_seen": 41891168, "step": 62140 }, { "epoch": 1.5182126890284122, "grad_norm": 0.339428573846817, "learning_rate": 1.7578252235393017e-06, "loss": 0.125, "num_input_tokens_seen": 41894368, "step": 62145 }, { "epoch": 1.5183348398602594, "grad_norm": 1.1708426475524902, "learning_rate": 1.7577695807652243e-06, "loss": 0.0043, "num_input_tokens_seen": 41897312, "step": 62150 }, { "epoch": 1.5184569906921066, "grad_norm": 184.12338256835938, "learning_rate": 1.7577139324804424e-06, "loss": 0.0465, "num_input_tokens_seen": 41900704, "step": 62155 }, { "epoch": 1.5185791415239538, "grad_norm": 9.056930541992188, "learning_rate": 1.757658278685361e-06, "loss": 0.1147, "num_input_tokens_seen": 41903904, "step": 62160 }, { "epoch": 1.518701292355801, "grad_norm": 0.4163637161254883, "learning_rate": 1.7576026193803853e-06, "loss": 0.109, "num_input_tokens_seen": 41907104, "step": 62165 }, { "epoch": 1.5188234431876482, "grad_norm": 86.08924102783203, "learning_rate": 1.7575469545659192e-06, "loss": 0.2651, "num_input_tokens_seen": 41909984, "step": 62170 }, { "epoch": 1.5189455940194954, "grad_norm": 65.60690307617188, "learning_rate": 1.757491284242368e-06, "loss": 0.0926, "num_input_tokens_seen": 41912992, "step": 62175 }, { "epoch": 1.5190677448513425, "grad_norm": 0.5181287527084351, "learning_rate": 1.7574356084101362e-06, "loss": 0.0986, "num_input_tokens_seen": 41916064, "step": 62180 }, { "epoch": 1.5191898956831897, "grad_norm": 0.423454225063324, "learning_rate": 1.7573799270696293e-06, "loss": 0.0296, "num_input_tokens_seen": 41919392, "step": 62185 }, { "epoch": 1.519312046515037, "grad_norm": 1.2272237539291382, "learning_rate": 1.7573242402212515e-06, "loss": 0.1071, "num_input_tokens_seen": 41922720, "step": 62190 }, { "epoch": 1.519434197346884, "grad_norm": 3.1436374187469482, "learning_rate": 1.7572685478654083e-06, "loss": 0.1806, "num_input_tokens_seen": 41926624, "step": 62195 }, { "epoch": 1.519556348178731, "grad_norm": 3.7778868675231934, "learning_rate": 1.7572128500025048e-06, "loss": 0.0039, "num_input_tokens_seen": 41929760, "step": 62200 }, { "epoch": 1.5196784990105783, "grad_norm": 9.93069076538086, "learning_rate": 1.7571571466329454e-06, "loss": 0.1229, "num_input_tokens_seen": 41932960, "step": 62205 }, { "epoch": 1.5198006498424255, "grad_norm": 147.24993896484375, "learning_rate": 1.7571014377571358e-06, "loss": 0.1609, "num_input_tokens_seen": 41936416, "step": 62210 }, { "epoch": 1.5199228006742724, "grad_norm": 0.2786606550216675, "learning_rate": 1.757045723375481e-06, "loss": 0.0593, "num_input_tokens_seen": 41939488, "step": 62215 }, { "epoch": 1.5200449515061196, "grad_norm": 0.8053382635116577, "learning_rate": 1.7569900034883856e-06, "loss": 0.1467, "num_input_tokens_seen": 41942816, "step": 62220 }, { "epoch": 1.5201671023379668, "grad_norm": 0.6396535038948059, "learning_rate": 1.7569342780962555e-06, "loss": 0.0073, "num_input_tokens_seen": 41946528, "step": 62225 }, { "epoch": 1.520289253169814, "grad_norm": 0.1954188495874405, "learning_rate": 1.756878547199496e-06, "loss": 0.0223, "num_input_tokens_seen": 41950688, "step": 62230 }, { "epoch": 1.5204114040016612, "grad_norm": 0.14850103855133057, "learning_rate": 1.756822810798512e-06, "loss": 0.1032, "num_input_tokens_seen": 41953952, "step": 62235 }, { "epoch": 1.5205335548335084, "grad_norm": 0.09374181181192398, "learning_rate": 1.756767068893709e-06, "loss": 0.0415, "num_input_tokens_seen": 41957216, "step": 62240 }, { "epoch": 1.5206557056653556, "grad_norm": 10.11430835723877, "learning_rate": 1.7567113214854921e-06, "loss": 0.086, "num_input_tokens_seen": 41959904, "step": 62245 }, { "epoch": 1.5207778564972028, "grad_norm": 1.3693947792053223, "learning_rate": 1.756655568574267e-06, "loss": 0.032, "num_input_tokens_seen": 41963488, "step": 62250 }, { "epoch": 1.52090000732905, "grad_norm": 0.07667896896600723, "learning_rate": 1.756599810160439e-06, "loss": 0.0492, "num_input_tokens_seen": 41966304, "step": 62255 }, { "epoch": 1.5210221581608971, "grad_norm": 11.006379127502441, "learning_rate": 1.756544046244414e-06, "loss": 0.159, "num_input_tokens_seen": 41969888, "step": 62260 }, { "epoch": 1.5211443089927443, "grad_norm": 0.1665768325328827, "learning_rate": 1.756488276826597e-06, "loss": 0.0011, "num_input_tokens_seen": 41972832, "step": 62265 }, { "epoch": 1.5212664598245915, "grad_norm": 0.045914895832538605, "learning_rate": 1.756432501907394e-06, "loss": 0.0602, "num_input_tokens_seen": 41975840, "step": 62270 }, { "epoch": 1.5213886106564387, "grad_norm": 0.14401188492774963, "learning_rate": 1.7563767214872104e-06, "loss": 0.0315, "num_input_tokens_seen": 41979168, "step": 62275 }, { "epoch": 1.521510761488286, "grad_norm": 0.042610686272382736, "learning_rate": 1.7563209355664514e-06, "loss": 0.228, "num_input_tokens_seen": 41982304, "step": 62280 }, { "epoch": 1.5216329123201329, "grad_norm": 0.048549313098192215, "learning_rate": 1.7562651441455237e-06, "loss": 0.0013, "num_input_tokens_seen": 41985888, "step": 62285 }, { "epoch": 1.52175506315198, "grad_norm": 1.941735029220581, "learning_rate": 1.7562093472248321e-06, "loss": 0.174, "num_input_tokens_seen": 41989152, "step": 62290 }, { "epoch": 1.5218772139838272, "grad_norm": 0.39207449555397034, "learning_rate": 1.7561535448047828e-06, "loss": 0.0824, "num_input_tokens_seen": 41992224, "step": 62295 }, { "epoch": 1.5219993648156744, "grad_norm": 0.11600027978420258, "learning_rate": 1.7560977368857814e-06, "loss": 0.0686, "num_input_tokens_seen": 41995488, "step": 62300 }, { "epoch": 1.5221215156475214, "grad_norm": 0.003041791031137109, "learning_rate": 1.756041923468234e-06, "loss": 0.1924, "num_input_tokens_seen": 41999008, "step": 62305 }, { "epoch": 1.5222436664793686, "grad_norm": 0.44268959760665894, "learning_rate": 1.7559861045525467e-06, "loss": 0.0175, "num_input_tokens_seen": 42002592, "step": 62310 }, { "epoch": 1.5223658173112158, "grad_norm": 0.3703887164592743, "learning_rate": 1.7559302801391247e-06, "loss": 0.0774, "num_input_tokens_seen": 42006176, "step": 62315 }, { "epoch": 1.522487968143063, "grad_norm": 0.3298993706703186, "learning_rate": 1.7558744502283745e-06, "loss": 0.039, "num_input_tokens_seen": 42009760, "step": 62320 }, { "epoch": 1.5226101189749102, "grad_norm": 94.69880676269531, "learning_rate": 1.7558186148207018e-06, "loss": 0.079, "num_input_tokens_seen": 42013088, "step": 62325 }, { "epoch": 1.5227322698067574, "grad_norm": 0.14409717917442322, "learning_rate": 1.7557627739165133e-06, "loss": 0.0347, "num_input_tokens_seen": 42016864, "step": 62330 }, { "epoch": 1.5228544206386045, "grad_norm": 134.7539825439453, "learning_rate": 1.7557069275162145e-06, "loss": 0.119, "num_input_tokens_seen": 42020192, "step": 62335 }, { "epoch": 1.5229765714704517, "grad_norm": 0.2854374051094055, "learning_rate": 1.7556510756202114e-06, "loss": 0.038, "num_input_tokens_seen": 42023712, "step": 62340 }, { "epoch": 1.523098722302299, "grad_norm": 2.959113836288452, "learning_rate": 1.7555952182289104e-06, "loss": 0.0948, "num_input_tokens_seen": 42026784, "step": 62345 }, { "epoch": 1.5232208731341461, "grad_norm": 24.652820587158203, "learning_rate": 1.755539355342718e-06, "loss": 0.1393, "num_input_tokens_seen": 42029792, "step": 62350 }, { "epoch": 1.5233430239659933, "grad_norm": 1.0689424276351929, "learning_rate": 1.75548348696204e-06, "loss": 0.0318, "num_input_tokens_seen": 42032800, "step": 62355 }, { "epoch": 1.5234651747978405, "grad_norm": 14.94307804107666, "learning_rate": 1.7554276130872832e-06, "loss": 0.1744, "num_input_tokens_seen": 42036000, "step": 62360 }, { "epoch": 1.5235873256296877, "grad_norm": 0.11315063387155533, "learning_rate": 1.7553717337188534e-06, "loss": 0.1032, "num_input_tokens_seen": 42040032, "step": 62365 }, { "epoch": 1.5237094764615347, "grad_norm": 0.4759387671947479, "learning_rate": 1.7553158488571572e-06, "loss": 0.0543, "num_input_tokens_seen": 42043296, "step": 62370 }, { "epoch": 1.5238316272933818, "grad_norm": 29.490541458129883, "learning_rate": 1.755259958502601e-06, "loss": 0.1868, "num_input_tokens_seen": 42046880, "step": 62375 }, { "epoch": 1.523953778125229, "grad_norm": 0.1935776025056839, "learning_rate": 1.755204062655591e-06, "loss": 0.0293, "num_input_tokens_seen": 42050080, "step": 62380 }, { "epoch": 1.5240759289570762, "grad_norm": 0.09843038767576218, "learning_rate": 1.7551481613165341e-06, "loss": 0.0824, "num_input_tokens_seen": 42053600, "step": 62385 }, { "epoch": 1.5241980797889234, "grad_norm": 0.8339051008224487, "learning_rate": 1.755092254485837e-06, "loss": 0.0032, "num_input_tokens_seen": 42056736, "step": 62390 }, { "epoch": 1.5243202306207704, "grad_norm": 0.03356233239173889, "learning_rate": 1.7550363421639056e-06, "loss": 0.1097, "num_input_tokens_seen": 42060768, "step": 62395 }, { "epoch": 1.5244423814526176, "grad_norm": 0.30012017488479614, "learning_rate": 1.7549804243511469e-06, "loss": 0.1254, "num_input_tokens_seen": 42064288, "step": 62400 }, { "epoch": 1.5245645322844648, "grad_norm": 15.219573020935059, "learning_rate": 1.7549245010479674e-06, "loss": 0.1479, "num_input_tokens_seen": 42067616, "step": 62405 }, { "epoch": 1.524686683116312, "grad_norm": 1.6977224349975586, "learning_rate": 1.7548685722547738e-06, "loss": 0.0592, "num_input_tokens_seen": 42071136, "step": 62410 }, { "epoch": 1.5248088339481591, "grad_norm": 0.7301633358001709, "learning_rate": 1.7548126379719732e-06, "loss": 0.0017, "num_input_tokens_seen": 42074464, "step": 62415 }, { "epoch": 1.5249309847800063, "grad_norm": 0.2103739231824875, "learning_rate": 1.754756698199972e-06, "loss": 0.1663, "num_input_tokens_seen": 42077792, "step": 62420 }, { "epoch": 1.5250531356118535, "grad_norm": 20.333240509033203, "learning_rate": 1.7547007529391769e-06, "loss": 0.1711, "num_input_tokens_seen": 42080992, "step": 62425 }, { "epoch": 1.5251752864437007, "grad_norm": 1.7456847429275513, "learning_rate": 1.7546448021899952e-06, "loss": 0.0016, "num_input_tokens_seen": 42084256, "step": 62430 }, { "epoch": 1.525297437275548, "grad_norm": 0.07221655547618866, "learning_rate": 1.754588845952833e-06, "loss": 0.0055, "num_input_tokens_seen": 42087456, "step": 62435 }, { "epoch": 1.525419588107395, "grad_norm": 0.10700752586126328, "learning_rate": 1.7545328842280985e-06, "loss": 0.0075, "num_input_tokens_seen": 42090528, "step": 62440 }, { "epoch": 1.5255417389392423, "grad_norm": 31.133052825927734, "learning_rate": 1.7544769170161973e-06, "loss": 0.0679, "num_input_tokens_seen": 42094240, "step": 62445 }, { "epoch": 1.5256638897710895, "grad_norm": 14.079894065856934, "learning_rate": 1.7544209443175372e-06, "loss": 0.0625, "num_input_tokens_seen": 42097568, "step": 62450 }, { "epoch": 1.5257860406029367, "grad_norm": 0.0958448052406311, "learning_rate": 1.7543649661325254e-06, "loss": 0.1018, "num_input_tokens_seen": 42100832, "step": 62455 }, { "epoch": 1.5259081914347836, "grad_norm": 0.026826851069927216, "learning_rate": 1.7543089824615682e-06, "loss": 0.0524, "num_input_tokens_seen": 42104032, "step": 62460 }, { "epoch": 1.5260303422666308, "grad_norm": 1.1849817037582397, "learning_rate": 1.7542529933050735e-06, "loss": 0.0736, "num_input_tokens_seen": 42107744, "step": 62465 }, { "epoch": 1.526152493098478, "grad_norm": 0.04849791154265404, "learning_rate": 1.754196998663448e-06, "loss": 0.0697, "num_input_tokens_seen": 42111392, "step": 62470 }, { "epoch": 1.5262746439303252, "grad_norm": 0.4404328763484955, "learning_rate": 1.7541409985370993e-06, "loss": 0.0861, "num_input_tokens_seen": 42114528, "step": 62475 }, { "epoch": 1.5263967947621724, "grad_norm": 14.965144157409668, "learning_rate": 1.754084992926434e-06, "loss": 0.2115, "num_input_tokens_seen": 42117920, "step": 62480 }, { "epoch": 1.5265189455940193, "grad_norm": 34.34897994995117, "learning_rate": 1.75402898183186e-06, "loss": 0.0692, "num_input_tokens_seen": 42121376, "step": 62485 }, { "epoch": 1.5266410964258665, "grad_norm": 0.1501975655555725, "learning_rate": 1.7539729652537848e-06, "loss": 0.1354, "num_input_tokens_seen": 42124512, "step": 62490 }, { "epoch": 1.5267632472577137, "grad_norm": 6.973504543304443, "learning_rate": 1.753916943192615e-06, "loss": 0.063, "num_input_tokens_seen": 42127840, "step": 62495 }, { "epoch": 1.526885398089561, "grad_norm": 0.12668637931346893, "learning_rate": 1.7538609156487585e-06, "loss": 0.0009, "num_input_tokens_seen": 42131360, "step": 62500 }, { "epoch": 1.527007548921408, "grad_norm": 0.4277445375919342, "learning_rate": 1.7538048826226225e-06, "loss": 0.1256, "num_input_tokens_seen": 42134816, "step": 62505 }, { "epoch": 1.5271296997532553, "grad_norm": 0.07875073701143265, "learning_rate": 1.753748844114615e-06, "loss": 0.0323, "num_input_tokens_seen": 42137824, "step": 62510 }, { "epoch": 1.5272518505851025, "grad_norm": 0.3303001821041107, "learning_rate": 1.753692800125143e-06, "loss": 0.1573, "num_input_tokens_seen": 42141280, "step": 62515 }, { "epoch": 1.5273740014169497, "grad_norm": 118.35843658447266, "learning_rate": 1.753636750654614e-06, "loss": 0.0461, "num_input_tokens_seen": 42144736, "step": 62520 }, { "epoch": 1.5274961522487969, "grad_norm": 1.3716565370559692, "learning_rate": 1.7535806957034365e-06, "loss": 0.0953, "num_input_tokens_seen": 42148384, "step": 62525 }, { "epoch": 1.527618303080644, "grad_norm": 0.14611275494098663, "learning_rate": 1.7535246352720167e-06, "loss": 0.0707, "num_input_tokens_seen": 42151904, "step": 62530 }, { "epoch": 1.5277404539124912, "grad_norm": 0.5512341260910034, "learning_rate": 1.7534685693607637e-06, "loss": 0.0573, "num_input_tokens_seen": 42155360, "step": 62535 }, { "epoch": 1.5278626047443384, "grad_norm": 0.0678791031241417, "learning_rate": 1.753412497970084e-06, "loss": 0.1038, "num_input_tokens_seen": 42158496, "step": 62540 }, { "epoch": 1.5279847555761856, "grad_norm": 14.49067211151123, "learning_rate": 1.7533564211003865e-06, "loss": 0.1688, "num_input_tokens_seen": 42161888, "step": 62545 }, { "epoch": 1.5281069064080326, "grad_norm": 12.960949897766113, "learning_rate": 1.7533003387520784e-06, "loss": 0.1191, "num_input_tokens_seen": 42165600, "step": 62550 }, { "epoch": 1.5282290572398798, "grad_norm": 0.16139183938503265, "learning_rate": 1.7532442509255673e-06, "loss": 0.0018, "num_input_tokens_seen": 42168928, "step": 62555 }, { "epoch": 1.528351208071727, "grad_norm": 0.18819627165794373, "learning_rate": 1.753188157621262e-06, "loss": 0.0421, "num_input_tokens_seen": 42172192, "step": 62560 }, { "epoch": 1.5284733589035742, "grad_norm": 68.75452423095703, "learning_rate": 1.7531320588395693e-06, "loss": 0.1245, "num_input_tokens_seen": 42177504, "step": 62565 }, { "epoch": 1.5285955097354211, "grad_norm": 22.286762237548828, "learning_rate": 1.7530759545808977e-06, "loss": 0.0582, "num_input_tokens_seen": 42180640, "step": 62570 }, { "epoch": 1.5287176605672683, "grad_norm": 24.4525089263916, "learning_rate": 1.7530198448456556e-06, "loss": 0.0592, "num_input_tokens_seen": 42183968, "step": 62575 }, { "epoch": 1.5288398113991155, "grad_norm": 0.3057885766029358, "learning_rate": 1.7529637296342502e-06, "loss": 0.001, "num_input_tokens_seen": 42187744, "step": 62580 }, { "epoch": 1.5289619622309627, "grad_norm": 0.1315106451511383, "learning_rate": 1.7529076089470905e-06, "loss": 0.0325, "num_input_tokens_seen": 42191264, "step": 62585 }, { "epoch": 1.52908411306281, "grad_norm": 0.6707960963249207, "learning_rate": 1.752851482784584e-06, "loss": 0.0993, "num_input_tokens_seen": 42195040, "step": 62590 }, { "epoch": 1.529206263894657, "grad_norm": 0.04225427657365799, "learning_rate": 1.7527953511471387e-06, "loss": 0.0747, "num_input_tokens_seen": 42198368, "step": 62595 }, { "epoch": 1.5293284147265043, "grad_norm": 0.10846621543169022, "learning_rate": 1.7527392140351634e-06, "loss": 0.0597, "num_input_tokens_seen": 42201696, "step": 62600 }, { "epoch": 1.5294505655583515, "grad_norm": 34.9310417175293, "learning_rate": 1.7526830714490662e-06, "loss": 0.0322, "num_input_tokens_seen": 42204960, "step": 62605 }, { "epoch": 1.5295727163901987, "grad_norm": 0.08704297244548798, "learning_rate": 1.752626923389255e-06, "loss": 0.1846, "num_input_tokens_seen": 42208288, "step": 62610 }, { "epoch": 1.5296948672220458, "grad_norm": 7.316606044769287, "learning_rate": 1.7525707698561382e-06, "loss": 0.0035, "num_input_tokens_seen": 42211936, "step": 62615 }, { "epoch": 1.529817018053893, "grad_norm": 0.4856109917163849, "learning_rate": 1.7525146108501248e-06, "loss": 0.0012, "num_input_tokens_seen": 42215328, "step": 62620 }, { "epoch": 1.5299391688857402, "grad_norm": 16.65413475036621, "learning_rate": 1.7524584463716226e-06, "loss": 0.0523, "num_input_tokens_seen": 42218528, "step": 62625 }, { "epoch": 1.5300613197175874, "grad_norm": 0.06861788034439087, "learning_rate": 1.7524022764210401e-06, "loss": 0.1023, "num_input_tokens_seen": 42221536, "step": 62630 }, { "epoch": 1.5301834705494346, "grad_norm": 0.09857258200645447, "learning_rate": 1.7523461009987862e-06, "loss": 0.0021, "num_input_tokens_seen": 42225120, "step": 62635 }, { "epoch": 1.5303056213812816, "grad_norm": 0.22179517149925232, "learning_rate": 1.7522899201052686e-06, "loss": 0.2342, "num_input_tokens_seen": 42228704, "step": 62640 }, { "epoch": 1.5304277722131288, "grad_norm": 20.480735778808594, "learning_rate": 1.7522337337408968e-06, "loss": 0.1148, "num_input_tokens_seen": 42231904, "step": 62645 }, { "epoch": 1.530549923044976, "grad_norm": 0.1441388875246048, "learning_rate": 1.7521775419060786e-06, "loss": 0.0011, "num_input_tokens_seen": 42235040, "step": 62650 }, { "epoch": 1.5306720738768231, "grad_norm": 0.2403523474931717, "learning_rate": 1.7521213446012232e-06, "loss": 0.0391, "num_input_tokens_seen": 42238624, "step": 62655 }, { "epoch": 1.53079422470867, "grad_norm": 0.18997099995613098, "learning_rate": 1.752065141826739e-06, "loss": 0.0433, "num_input_tokens_seen": 42242016, "step": 62660 }, { "epoch": 1.5309163755405173, "grad_norm": 31.88978385925293, "learning_rate": 1.7520089335830348e-06, "loss": 0.1549, "num_input_tokens_seen": 42245984, "step": 62665 }, { "epoch": 1.5310385263723645, "grad_norm": 17.45441246032715, "learning_rate": 1.7519527198705193e-06, "loss": 0.0822, "num_input_tokens_seen": 42249312, "step": 62670 }, { "epoch": 1.5311606772042117, "grad_norm": 0.6886013150215149, "learning_rate": 1.7518965006896016e-06, "loss": 0.0554, "num_input_tokens_seen": 42252384, "step": 62675 }, { "epoch": 1.5312828280360589, "grad_norm": 0.599663257598877, "learning_rate": 1.7518402760406903e-06, "loss": 0.1014, "num_input_tokens_seen": 42255712, "step": 62680 }, { "epoch": 1.531404978867906, "grad_norm": 6.776517391204834, "learning_rate": 1.7517840459241944e-06, "loss": 0.0966, "num_input_tokens_seen": 42258912, "step": 62685 }, { "epoch": 1.5315271296997532, "grad_norm": 0.3810465931892395, "learning_rate": 1.7517278103405225e-06, "loss": 0.0772, "num_input_tokens_seen": 42262048, "step": 62690 }, { "epoch": 1.5316492805316004, "grad_norm": 1.3760406970977783, "learning_rate": 1.7516715692900834e-06, "loss": 0.1186, "num_input_tokens_seen": 42265632, "step": 62695 }, { "epoch": 1.5317714313634476, "grad_norm": 0.25284773111343384, "learning_rate": 1.751615322773287e-06, "loss": 0.0012, "num_input_tokens_seen": 42268832, "step": 62700 }, { "epoch": 1.5318935821952948, "grad_norm": 6.275792598724365, "learning_rate": 1.7515590707905416e-06, "loss": 0.0519, "num_input_tokens_seen": 42271968, "step": 62705 }, { "epoch": 1.532015733027142, "grad_norm": 0.36964312195777893, "learning_rate": 1.7515028133422566e-06, "loss": 0.0301, "num_input_tokens_seen": 42275360, "step": 62710 }, { "epoch": 1.5321378838589892, "grad_norm": 0.2160339504480362, "learning_rate": 1.751446550428841e-06, "loss": 0.0958, "num_input_tokens_seen": 42278560, "step": 62715 }, { "epoch": 1.5322600346908364, "grad_norm": 0.16960953176021576, "learning_rate": 1.7513902820507038e-06, "loss": 0.0023, "num_input_tokens_seen": 42282272, "step": 62720 }, { "epoch": 1.5323821855226836, "grad_norm": 3.283630609512329, "learning_rate": 1.7513340082082547e-06, "loss": 0.1115, "num_input_tokens_seen": 42286112, "step": 62725 }, { "epoch": 1.5325043363545305, "grad_norm": 15.866968154907227, "learning_rate": 1.7512777289019022e-06, "loss": 0.0755, "num_input_tokens_seen": 42289376, "step": 62730 }, { "epoch": 1.5326264871863777, "grad_norm": 3.386458158493042, "learning_rate": 1.7512214441320564e-06, "loss": 0.0407, "num_input_tokens_seen": 42292768, "step": 62735 }, { "epoch": 1.532748638018225, "grad_norm": 0.09874772280454636, "learning_rate": 1.751165153899126e-06, "loss": 0.0592, "num_input_tokens_seen": 42295776, "step": 62740 }, { "epoch": 1.532870788850072, "grad_norm": 1.7841368913650513, "learning_rate": 1.7511088582035204e-06, "loss": 0.0503, "num_input_tokens_seen": 42298784, "step": 62745 }, { "epoch": 1.532992939681919, "grad_norm": 0.28716370463371277, "learning_rate": 1.7510525570456496e-06, "loss": 0.0556, "num_input_tokens_seen": 42302432, "step": 62750 }, { "epoch": 1.5331150905137663, "grad_norm": 0.11394919455051422, "learning_rate": 1.7509962504259223e-06, "loss": 0.0758, "num_input_tokens_seen": 42305952, "step": 62755 }, { "epoch": 1.5332372413456135, "grad_norm": 0.3958131670951843, "learning_rate": 1.7509399383447482e-06, "loss": 0.1926, "num_input_tokens_seen": 42308960, "step": 62760 }, { "epoch": 1.5333593921774606, "grad_norm": 0.1787228137254715, "learning_rate": 1.7508836208025367e-06, "loss": 0.1107, "num_input_tokens_seen": 42312544, "step": 62765 }, { "epoch": 1.5334815430093078, "grad_norm": 0.3476884961128235, "learning_rate": 1.750827297799698e-06, "loss": 0.0015, "num_input_tokens_seen": 42316064, "step": 62770 }, { "epoch": 1.533603693841155, "grad_norm": 0.13672898709774017, "learning_rate": 1.7507709693366412e-06, "loss": 0.0993, "num_input_tokens_seen": 42319328, "step": 62775 }, { "epoch": 1.5337258446730022, "grad_norm": 24.389223098754883, "learning_rate": 1.7507146354137759e-06, "loss": 0.0441, "num_input_tokens_seen": 42322656, "step": 62780 }, { "epoch": 1.5338479955048494, "grad_norm": 2.3359413146972656, "learning_rate": 1.7506582960315117e-06, "loss": 0.1276, "num_input_tokens_seen": 42326304, "step": 62785 }, { "epoch": 1.5339701463366966, "grad_norm": 15.77255630493164, "learning_rate": 1.7506019511902586e-06, "loss": 0.1918, "num_input_tokens_seen": 42329504, "step": 62790 }, { "epoch": 1.5340922971685438, "grad_norm": 22.603723526000977, "learning_rate": 1.750545600890426e-06, "loss": 0.1111, "num_input_tokens_seen": 42332960, "step": 62795 }, { "epoch": 1.534214448000391, "grad_norm": 0.08420402556657791, "learning_rate": 1.7504892451324241e-06, "loss": 0.1264, "num_input_tokens_seen": 42336352, "step": 62800 }, { "epoch": 1.5343365988322382, "grad_norm": 0.03107322007417679, "learning_rate": 1.7504328839166628e-06, "loss": 0.0363, "num_input_tokens_seen": 42339296, "step": 62805 }, { "epoch": 1.5344587496640854, "grad_norm": 0.05710528418421745, "learning_rate": 1.7503765172435515e-06, "loss": 0.1388, "num_input_tokens_seen": 42342304, "step": 62810 }, { "epoch": 1.5345809004959325, "grad_norm": 0.04239802435040474, "learning_rate": 1.7503201451135002e-06, "loss": 0.0102, "num_input_tokens_seen": 42346016, "step": 62815 }, { "epoch": 1.5347030513277795, "grad_norm": 71.819580078125, "learning_rate": 1.7502637675269192e-06, "loss": 0.1099, "num_input_tokens_seen": 42349344, "step": 62820 }, { "epoch": 1.5348252021596267, "grad_norm": 0.10941146314144135, "learning_rate": 1.7502073844842183e-06, "loss": 0.0517, "num_input_tokens_seen": 42352480, "step": 62825 }, { "epoch": 1.534947352991474, "grad_norm": 0.17934951186180115, "learning_rate": 1.7501509959858074e-06, "loss": 0.0806, "num_input_tokens_seen": 42355680, "step": 62830 }, { "epoch": 1.535069503823321, "grad_norm": 0.6699011325836182, "learning_rate": 1.7500946020320967e-06, "loss": 0.112, "num_input_tokens_seen": 42358752, "step": 62835 }, { "epoch": 1.535191654655168, "grad_norm": 9.080674171447754, "learning_rate": 1.7500382026234964e-06, "loss": 0.2533, "num_input_tokens_seen": 42362336, "step": 62840 }, { "epoch": 1.5353138054870152, "grad_norm": 7.4552154541015625, "learning_rate": 1.7499817977604163e-06, "loss": 0.0778, "num_input_tokens_seen": 42365728, "step": 62845 }, { "epoch": 1.5354359563188624, "grad_norm": 0.44541171193122864, "learning_rate": 1.7499253874432672e-06, "loss": 0.0658, "num_input_tokens_seen": 42369888, "step": 62850 }, { "epoch": 1.5355581071507096, "grad_norm": 0.41716626286506653, "learning_rate": 1.7498689716724586e-06, "loss": 0.1032, "num_input_tokens_seen": 42373472, "step": 62855 }, { "epoch": 1.5356802579825568, "grad_norm": 0.11591717600822449, "learning_rate": 1.7498125504484014e-06, "loss": 0.0364, "num_input_tokens_seen": 42377056, "step": 62860 }, { "epoch": 1.535802408814404, "grad_norm": 27.05655860900879, "learning_rate": 1.7497561237715055e-06, "loss": 0.1149, "num_input_tokens_seen": 42380384, "step": 62865 }, { "epoch": 1.5359245596462512, "grad_norm": 0.4578407406806946, "learning_rate": 1.7496996916421818e-06, "loss": 0.0337, "num_input_tokens_seen": 42383776, "step": 62870 }, { "epoch": 1.5360467104780984, "grad_norm": 18.179445266723633, "learning_rate": 1.7496432540608398e-06, "loss": 0.2404, "num_input_tokens_seen": 42387232, "step": 62875 }, { "epoch": 1.5361688613099456, "grad_norm": 49.866397857666016, "learning_rate": 1.7495868110278905e-06, "loss": 0.0842, "num_input_tokens_seen": 42390624, "step": 62880 }, { "epoch": 1.5362910121417928, "grad_norm": 8.857733726501465, "learning_rate": 1.7495303625437447e-06, "loss": 0.2111, "num_input_tokens_seen": 42394080, "step": 62885 }, { "epoch": 1.53641316297364, "grad_norm": 1.7023998498916626, "learning_rate": 1.749473908608812e-06, "loss": 0.04, "num_input_tokens_seen": 42397024, "step": 62890 }, { "epoch": 1.5365353138054871, "grad_norm": 10.899248123168945, "learning_rate": 1.7494174492235038e-06, "loss": 0.0856, "num_input_tokens_seen": 42400480, "step": 62895 }, { "epoch": 1.5366574646373343, "grad_norm": 23.474842071533203, "learning_rate": 1.7493609843882302e-06, "loss": 0.0542, "num_input_tokens_seen": 42404128, "step": 62900 }, { "epoch": 1.5367796154691813, "grad_norm": 8.95337200164795, "learning_rate": 1.749304514103402e-06, "loss": 0.0997, "num_input_tokens_seen": 42407392, "step": 62905 }, { "epoch": 1.5369017663010285, "grad_norm": 20.889585494995117, "learning_rate": 1.74924803836943e-06, "loss": 0.1064, "num_input_tokens_seen": 42410784, "step": 62910 }, { "epoch": 1.5370239171328757, "grad_norm": 0.41612470149993896, "learning_rate": 1.7491915571867245e-06, "loss": 0.0706, "num_input_tokens_seen": 42414880, "step": 62915 }, { "epoch": 1.5371460679647229, "grad_norm": 10.44105339050293, "learning_rate": 1.7491350705556967e-06, "loss": 0.0611, "num_input_tokens_seen": 42417888, "step": 62920 }, { "epoch": 1.53726821879657, "grad_norm": 1.0386707782745361, "learning_rate": 1.749078578476757e-06, "loss": 0.0141, "num_input_tokens_seen": 42421344, "step": 62925 }, { "epoch": 1.537390369628417, "grad_norm": 9.058958053588867, "learning_rate": 1.7490220809503163e-06, "loss": 0.0567, "num_input_tokens_seen": 42425312, "step": 62930 }, { "epoch": 1.5375125204602642, "grad_norm": 28.129384994506836, "learning_rate": 1.7489655779767856e-06, "loss": 0.1468, "num_input_tokens_seen": 42428192, "step": 62935 }, { "epoch": 1.5376346712921114, "grad_norm": 1.2702558040618896, "learning_rate": 1.748909069556576e-06, "loss": 0.1312, "num_input_tokens_seen": 42431840, "step": 62940 }, { "epoch": 1.5377568221239586, "grad_norm": 3.0834596157073975, "learning_rate": 1.7488525556900981e-06, "loss": 0.073, "num_input_tokens_seen": 42435616, "step": 62945 }, { "epoch": 1.5378789729558058, "grad_norm": 10.420439720153809, "learning_rate": 1.748796036377763e-06, "loss": 0.0668, "num_input_tokens_seen": 42440672, "step": 62950 }, { "epoch": 1.538001123787653, "grad_norm": 9.713109016418457, "learning_rate": 1.7487395116199815e-06, "loss": 0.0608, "num_input_tokens_seen": 42444192, "step": 62955 }, { "epoch": 1.5381232746195002, "grad_norm": 64.52426147460938, "learning_rate": 1.7486829814171653e-06, "loss": 0.1059, "num_input_tokens_seen": 42447200, "step": 62960 }, { "epoch": 1.5382454254513473, "grad_norm": 1.0289015769958496, "learning_rate": 1.7486264457697249e-06, "loss": 0.0915, "num_input_tokens_seen": 42450528, "step": 62965 }, { "epoch": 1.5383675762831945, "grad_norm": 143.9002685546875, "learning_rate": 1.7485699046780714e-06, "loss": 0.1535, "num_input_tokens_seen": 42453920, "step": 62970 }, { "epoch": 1.5384897271150417, "grad_norm": 0.10126427561044693, "learning_rate": 1.7485133581426165e-06, "loss": 0.1188, "num_input_tokens_seen": 42456800, "step": 62975 }, { "epoch": 1.538611877946889, "grad_norm": 0.7094244360923767, "learning_rate": 1.7484568061637712e-06, "loss": 0.0455, "num_input_tokens_seen": 42460512, "step": 62980 }, { "epoch": 1.538734028778736, "grad_norm": 2.6824791431427, "learning_rate": 1.7484002487419466e-06, "loss": 0.1353, "num_input_tokens_seen": 42463840, "step": 62985 }, { "epoch": 1.5388561796105833, "grad_norm": 19.794893264770508, "learning_rate": 1.748343685877554e-06, "loss": 0.0933, "num_input_tokens_seen": 42467552, "step": 62990 }, { "epoch": 1.5389783304424303, "grad_norm": 0.8362667560577393, "learning_rate": 1.7482871175710048e-06, "loss": 0.0359, "num_input_tokens_seen": 42470880, "step": 62995 }, { "epoch": 1.5391004812742775, "grad_norm": 0.17811979353427887, "learning_rate": 1.7482305438227104e-06, "loss": 0.0564, "num_input_tokens_seen": 42474016, "step": 63000 }, { "epoch": 1.5392226321061246, "grad_norm": 0.18364813923835754, "learning_rate": 1.7481739646330822e-06, "loss": 0.0762, "num_input_tokens_seen": 42477408, "step": 63005 }, { "epoch": 1.5393447829379718, "grad_norm": 16.569730758666992, "learning_rate": 1.748117380002532e-06, "loss": 0.0955, "num_input_tokens_seen": 42480672, "step": 63010 }, { "epoch": 1.539466933769819, "grad_norm": 0.3204478919506073, "learning_rate": 1.7480607899314707e-06, "loss": 0.0247, "num_input_tokens_seen": 42484192, "step": 63015 }, { "epoch": 1.539589084601666, "grad_norm": 0.3662739396095276, "learning_rate": 1.7480041944203102e-06, "loss": 0.1316, "num_input_tokens_seen": 42487392, "step": 63020 }, { "epoch": 1.5397112354335132, "grad_norm": 0.3306255042552948, "learning_rate": 1.7479475934694623e-06, "loss": 0.1054, "num_input_tokens_seen": 42490656, "step": 63025 }, { "epoch": 1.5398333862653604, "grad_norm": 9.560601234436035, "learning_rate": 1.7478909870793378e-06, "loss": 0.1108, "num_input_tokens_seen": 42493792, "step": 63030 }, { "epoch": 1.5399555370972076, "grad_norm": 0.8780872225761414, "learning_rate": 1.7478343752503494e-06, "loss": 0.0703, "num_input_tokens_seen": 42497504, "step": 63035 }, { "epoch": 1.5400776879290548, "grad_norm": 0.5312938690185547, "learning_rate": 1.747777757982908e-06, "loss": 0.0612, "num_input_tokens_seen": 42500832, "step": 63040 }, { "epoch": 1.540199838760902, "grad_norm": 0.25888314843177795, "learning_rate": 1.7477211352774254e-06, "loss": 0.0314, "num_input_tokens_seen": 42504352, "step": 63045 }, { "epoch": 1.5403219895927491, "grad_norm": 0.16010764241218567, "learning_rate": 1.7476645071343141e-06, "loss": 0.0689, "num_input_tokens_seen": 42507744, "step": 63050 }, { "epoch": 1.5404441404245963, "grad_norm": 0.1844155341386795, "learning_rate": 1.7476078735539853e-06, "loss": 0.1989, "num_input_tokens_seen": 42510816, "step": 63055 }, { "epoch": 1.5405662912564435, "grad_norm": 2.511029005050659, "learning_rate": 1.7475512345368509e-06, "loss": 0.1072, "num_input_tokens_seen": 42514016, "step": 63060 }, { "epoch": 1.5406884420882907, "grad_norm": 9.338394165039062, "learning_rate": 1.7474945900833227e-06, "loss": 0.0739, "num_input_tokens_seen": 42518112, "step": 63065 }, { "epoch": 1.540810592920138, "grad_norm": 0.3963111937046051, "learning_rate": 1.7474379401938125e-06, "loss": 0.0852, "num_input_tokens_seen": 42521312, "step": 63070 }, { "epoch": 1.540932743751985, "grad_norm": 0.3336862027645111, "learning_rate": 1.7473812848687334e-06, "loss": 0.0435, "num_input_tokens_seen": 42524448, "step": 63075 }, { "epoch": 1.5410548945838323, "grad_norm": 19.525541305541992, "learning_rate": 1.7473246241084958e-06, "loss": 0.1233, "num_input_tokens_seen": 42527712, "step": 63080 }, { "epoch": 1.5411770454156792, "grad_norm": 0.3767625689506531, "learning_rate": 1.7472679579135129e-06, "loss": 0.0974, "num_input_tokens_seen": 42531424, "step": 63085 }, { "epoch": 1.5412991962475264, "grad_norm": 112.72207641601562, "learning_rate": 1.7472112862841963e-06, "loss": 0.1039, "num_input_tokens_seen": 42534560, "step": 63090 }, { "epoch": 1.5414213470793736, "grad_norm": 1.469272255897522, "learning_rate": 1.7471546092209585e-06, "loss": 0.101, "num_input_tokens_seen": 42537632, "step": 63095 }, { "epoch": 1.5415434979112208, "grad_norm": 8.400092124938965, "learning_rate": 1.7470979267242111e-06, "loss": 0.1874, "num_input_tokens_seen": 42540640, "step": 63100 }, { "epoch": 1.541665648743068, "grad_norm": 14.375085830688477, "learning_rate": 1.7470412387943668e-06, "loss": 0.127, "num_input_tokens_seen": 42543648, "step": 63105 }, { "epoch": 1.541787799574915, "grad_norm": 0.7379078269004822, "learning_rate": 1.7469845454318374e-06, "loss": 0.0244, "num_input_tokens_seen": 42546784, "step": 63110 }, { "epoch": 1.5419099504067622, "grad_norm": 0.09788034111261368, "learning_rate": 1.7469278466370359e-06, "loss": 0.0402, "num_input_tokens_seen": 42550112, "step": 63115 }, { "epoch": 1.5420321012386093, "grad_norm": 0.040544167160987854, "learning_rate": 1.7468711424103742e-06, "loss": 0.0152, "num_input_tokens_seen": 42553120, "step": 63120 }, { "epoch": 1.5421542520704565, "grad_norm": 64.45174407958984, "learning_rate": 1.7468144327522644e-06, "loss": 0.164, "num_input_tokens_seen": 42556512, "step": 63125 }, { "epoch": 1.5422764029023037, "grad_norm": 1.8336378335952759, "learning_rate": 1.7467577176631192e-06, "loss": 0.0329, "num_input_tokens_seen": 42559904, "step": 63130 }, { "epoch": 1.542398553734151, "grad_norm": 16.188873291015625, "learning_rate": 1.746700997143351e-06, "loss": 0.0034, "num_input_tokens_seen": 42563168, "step": 63135 }, { "epoch": 1.542520704565998, "grad_norm": 0.4830506443977356, "learning_rate": 1.7466442711933724e-06, "loss": 0.042, "num_input_tokens_seen": 42566240, "step": 63140 }, { "epoch": 1.5426428553978453, "grad_norm": 0.06538606435060501, "learning_rate": 1.7465875398135958e-06, "loss": 0.0024, "num_input_tokens_seen": 42569376, "step": 63145 }, { "epoch": 1.5427650062296925, "grad_norm": 265.35064697265625, "learning_rate": 1.746530803004434e-06, "loss": 0.0882, "num_input_tokens_seen": 42573024, "step": 63150 }, { "epoch": 1.5428871570615397, "grad_norm": 10.878697395324707, "learning_rate": 1.7464740607662991e-06, "loss": 0.0602, "num_input_tokens_seen": 42576480, "step": 63155 }, { "epoch": 1.5430093078933869, "grad_norm": 0.4172748327255249, "learning_rate": 1.746417313099604e-06, "loss": 0.0833, "num_input_tokens_seen": 42579360, "step": 63160 }, { "epoch": 1.543131458725234, "grad_norm": 31.220434188842773, "learning_rate": 1.7463605600047618e-06, "loss": 0.2162, "num_input_tokens_seen": 42582624, "step": 63165 }, { "epoch": 1.5432536095570812, "grad_norm": 0.008305324241518974, "learning_rate": 1.7463038014821848e-06, "loss": 0.0718, "num_input_tokens_seen": 42586016, "step": 63170 }, { "epoch": 1.5433757603889282, "grad_norm": 0.0460757315158844, "learning_rate": 1.7462470375322856e-06, "loss": 0.0469, "num_input_tokens_seen": 42589088, "step": 63175 }, { "epoch": 1.5434979112207754, "grad_norm": 0.18197746574878693, "learning_rate": 1.7461902681554773e-06, "loss": 0.0732, "num_input_tokens_seen": 42592800, "step": 63180 }, { "epoch": 1.5436200620526226, "grad_norm": 87.91574096679688, "learning_rate": 1.7461334933521725e-06, "loss": 0.1156, "num_input_tokens_seen": 42596320, "step": 63185 }, { "epoch": 1.5437422128844698, "grad_norm": 0.7851141095161438, "learning_rate": 1.7460767131227844e-06, "loss": 0.0074, "num_input_tokens_seen": 42599456, "step": 63190 }, { "epoch": 1.5438643637163167, "grad_norm": 2.0627529621124268, "learning_rate": 1.7460199274677262e-06, "loss": 0.0416, "num_input_tokens_seen": 42602656, "step": 63195 }, { "epoch": 1.543986514548164, "grad_norm": 0.045423515141010284, "learning_rate": 1.7459631363874098e-06, "loss": 0.1269, "num_input_tokens_seen": 42605856, "step": 63200 }, { "epoch": 1.5441086653800111, "grad_norm": 37.60093688964844, "learning_rate": 1.745906339882249e-06, "loss": 0.0604, "num_input_tokens_seen": 42609248, "step": 63205 }, { "epoch": 1.5442308162118583, "grad_norm": 8.284845352172852, "learning_rate": 1.7458495379526568e-06, "loss": 0.1087, "num_input_tokens_seen": 42612512, "step": 63210 }, { "epoch": 1.5443529670437055, "grad_norm": 0.04278009012341499, "learning_rate": 1.745792730599046e-06, "loss": 0.0718, "num_input_tokens_seen": 42615584, "step": 63215 }, { "epoch": 1.5444751178755527, "grad_norm": 153.6990203857422, "learning_rate": 1.7457359178218304e-06, "loss": 0.2104, "num_input_tokens_seen": 42618784, "step": 63220 }, { "epoch": 1.5445972687073999, "grad_norm": 0.11112756282091141, "learning_rate": 1.745679099621422e-06, "loss": 0.0914, "num_input_tokens_seen": 42621920, "step": 63225 }, { "epoch": 1.544719419539247, "grad_norm": 0.2005205750465393, "learning_rate": 1.7456222759982348e-06, "loss": 0.0784, "num_input_tokens_seen": 42625568, "step": 63230 }, { "epoch": 1.5448415703710943, "grad_norm": 0.27125903964042664, "learning_rate": 1.745565446952682e-06, "loss": 0.0009, "num_input_tokens_seen": 42628896, "step": 63235 }, { "epoch": 1.5449637212029415, "grad_norm": 115.29658508300781, "learning_rate": 1.7455086124851764e-06, "loss": 0.0905, "num_input_tokens_seen": 42631776, "step": 63240 }, { "epoch": 1.5450858720347886, "grad_norm": 13.909765243530273, "learning_rate": 1.7454517725961319e-06, "loss": 0.0461, "num_input_tokens_seen": 42635232, "step": 63245 }, { "epoch": 1.5452080228666358, "grad_norm": 58.41990280151367, "learning_rate": 1.7453949272859619e-06, "loss": 0.0771, "num_input_tokens_seen": 42638624, "step": 63250 }, { "epoch": 1.545330173698483, "grad_norm": 0.1262422800064087, "learning_rate": 1.745338076555079e-06, "loss": 0.0015, "num_input_tokens_seen": 42641824, "step": 63255 }, { "epoch": 1.5454523245303302, "grad_norm": 12.13207721710205, "learning_rate": 1.7452812204038972e-06, "loss": 0.0622, "num_input_tokens_seen": 42645408, "step": 63260 }, { "epoch": 1.5455744753621772, "grad_norm": 38.522159576416016, "learning_rate": 1.74522435883283e-06, "loss": 0.1353, "num_input_tokens_seen": 42649056, "step": 63265 }, { "epoch": 1.5456966261940244, "grad_norm": 14.3047513961792, "learning_rate": 1.745167491842291e-06, "loss": 0.1149, "num_input_tokens_seen": 42652320, "step": 63270 }, { "epoch": 1.5458187770258716, "grad_norm": 0.16538095474243164, "learning_rate": 1.7451106194326933e-06, "loss": 0.0959, "num_input_tokens_seen": 42655776, "step": 63275 }, { "epoch": 1.5459409278577187, "grad_norm": 26.421430587768555, "learning_rate": 1.745053741604451e-06, "loss": 0.0981, "num_input_tokens_seen": 42658912, "step": 63280 }, { "epoch": 1.5460630786895657, "grad_norm": 22.397008895874023, "learning_rate": 1.7449968583579776e-06, "loss": 0.1071, "num_input_tokens_seen": 42662496, "step": 63285 }, { "epoch": 1.546185229521413, "grad_norm": 0.22859802842140198, "learning_rate": 1.7449399696936862e-06, "loss": 0.1257, "num_input_tokens_seen": 42666592, "step": 63290 }, { "epoch": 1.54630738035326, "grad_norm": 22.230684280395508, "learning_rate": 1.7448830756119912e-06, "loss": 0.0495, "num_input_tokens_seen": 42669856, "step": 63295 }, { "epoch": 1.5464295311851073, "grad_norm": 30.695005416870117, "learning_rate": 1.7448261761133062e-06, "loss": 0.0459, "num_input_tokens_seen": 42673120, "step": 63300 }, { "epoch": 1.5465516820169545, "grad_norm": 0.09240459650754929, "learning_rate": 1.7447692711980448e-06, "loss": 0.0671, "num_input_tokens_seen": 42676512, "step": 63305 }, { "epoch": 1.5466738328488017, "grad_norm": 0.0510527528822422, "learning_rate": 1.744712360866621e-06, "loss": 0.0339, "num_input_tokens_seen": 42680096, "step": 63310 }, { "epoch": 1.5467959836806489, "grad_norm": 0.5635117888450623, "learning_rate": 1.7446554451194486e-06, "loss": 0.1685, "num_input_tokens_seen": 42683232, "step": 63315 }, { "epoch": 1.546918134512496, "grad_norm": 36.4852409362793, "learning_rate": 1.7445985239569416e-06, "loss": 0.1446, "num_input_tokens_seen": 42686560, "step": 63320 }, { "epoch": 1.5470402853443432, "grad_norm": 9.156826972961426, "learning_rate": 1.7445415973795137e-06, "loss": 0.0934, "num_input_tokens_seen": 42689888, "step": 63325 }, { "epoch": 1.5471624361761904, "grad_norm": 17.986148834228516, "learning_rate": 1.7444846653875791e-06, "loss": 0.1435, "num_input_tokens_seen": 42693216, "step": 63330 }, { "epoch": 1.5472845870080376, "grad_norm": 0.1627344936132431, "learning_rate": 1.7444277279815518e-06, "loss": 0.1261, "num_input_tokens_seen": 42696800, "step": 63335 }, { "epoch": 1.5474067378398848, "grad_norm": 26.671512603759766, "learning_rate": 1.744370785161846e-06, "loss": 0.1507, "num_input_tokens_seen": 42700640, "step": 63340 }, { "epoch": 1.547528888671732, "grad_norm": 3.354576587677002, "learning_rate": 1.7443138369288754e-06, "loss": 0.0884, "num_input_tokens_seen": 42704096, "step": 63345 }, { "epoch": 1.5476510395035792, "grad_norm": 0.390531986951828, "learning_rate": 1.7442568832830546e-06, "loss": 0.0721, "num_input_tokens_seen": 42707616, "step": 63350 }, { "epoch": 1.5477731903354262, "grad_norm": 10.482186317443848, "learning_rate": 1.7441999242247974e-06, "loss": 0.1018, "num_input_tokens_seen": 42710688, "step": 63355 }, { "epoch": 1.5478953411672733, "grad_norm": 0.33957600593566895, "learning_rate": 1.7441429597545181e-06, "loss": 0.0021, "num_input_tokens_seen": 42714080, "step": 63360 }, { "epoch": 1.5480174919991205, "grad_norm": 0.203145369887352, "learning_rate": 1.7440859898726312e-06, "loss": 0.032, "num_input_tokens_seen": 42717472, "step": 63365 }, { "epoch": 1.5481396428309677, "grad_norm": 0.8855333924293518, "learning_rate": 1.7440290145795507e-06, "loss": 0.1066, "num_input_tokens_seen": 42720672, "step": 63370 }, { "epoch": 1.5482617936628147, "grad_norm": 0.3422999680042267, "learning_rate": 1.7439720338756913e-06, "loss": 0.129, "num_input_tokens_seen": 42723808, "step": 63375 }, { "epoch": 1.5483839444946619, "grad_norm": 0.5624834895133972, "learning_rate": 1.743915047761467e-06, "loss": 0.046, "num_input_tokens_seen": 42726944, "step": 63380 }, { "epoch": 1.548506095326509, "grad_norm": 8.767424583435059, "learning_rate": 1.7438580562372925e-06, "loss": 0.1086, "num_input_tokens_seen": 42730336, "step": 63385 }, { "epoch": 1.5486282461583563, "grad_norm": 1.0835167169570923, "learning_rate": 1.7438010593035822e-06, "loss": 0.1548, "num_input_tokens_seen": 42733472, "step": 63390 }, { "epoch": 1.5487503969902034, "grad_norm": 0.2543872594833374, "learning_rate": 1.7437440569607502e-06, "loss": 0.1006, "num_input_tokens_seen": 42737248, "step": 63395 }, { "epoch": 1.5488725478220506, "grad_norm": 2.4009156227111816, "learning_rate": 1.7436870492092117e-06, "loss": 0.0539, "num_input_tokens_seen": 42740448, "step": 63400 }, { "epoch": 1.5489946986538978, "grad_norm": 12.928868293762207, "learning_rate": 1.7436300360493808e-06, "loss": 0.074, "num_input_tokens_seen": 42743840, "step": 63405 }, { "epoch": 1.549116849485745, "grad_norm": 18.121219635009766, "learning_rate": 1.7435730174816725e-06, "loss": 0.31, "num_input_tokens_seen": 42747104, "step": 63410 }, { "epoch": 1.5492390003175922, "grad_norm": 6.432738780975342, "learning_rate": 1.743515993506501e-06, "loss": 0.0478, "num_input_tokens_seen": 42750880, "step": 63415 }, { "epoch": 1.5493611511494394, "grad_norm": 0.18056033551692963, "learning_rate": 1.7434589641242812e-06, "loss": 0.0756, "num_input_tokens_seen": 42755296, "step": 63420 }, { "epoch": 1.5494833019812866, "grad_norm": 64.7015380859375, "learning_rate": 1.7434019293354278e-06, "loss": 0.1218, "num_input_tokens_seen": 42758624, "step": 63425 }, { "epoch": 1.5496054528131338, "grad_norm": 12.332176208496094, "learning_rate": 1.7433448891403559e-06, "loss": 0.1732, "num_input_tokens_seen": 42761760, "step": 63430 }, { "epoch": 1.549727603644981, "grad_norm": 0.3980131149291992, "learning_rate": 1.7432878435394795e-06, "loss": 0.0417, "num_input_tokens_seen": 42765088, "step": 63435 }, { "epoch": 1.549849754476828, "grad_norm": 0.6029239892959595, "learning_rate": 1.7432307925332146e-06, "loss": 0.0317, "num_input_tokens_seen": 42768160, "step": 63440 }, { "epoch": 1.5499719053086751, "grad_norm": 0.5722103714942932, "learning_rate": 1.743173736121975e-06, "loss": 0.0716, "num_input_tokens_seen": 42772000, "step": 63445 }, { "epoch": 1.5500940561405223, "grad_norm": 0.04331221058964729, "learning_rate": 1.7431166743061762e-06, "loss": 0.0256, "num_input_tokens_seen": 42775072, "step": 63450 }, { "epoch": 1.5502162069723695, "grad_norm": 0.3946976959705353, "learning_rate": 1.7430596070862332e-06, "loss": 0.0631, "num_input_tokens_seen": 42778272, "step": 63455 }, { "epoch": 1.5503383578042167, "grad_norm": 14.206043243408203, "learning_rate": 1.743002534462561e-06, "loss": 0.0389, "num_input_tokens_seen": 42781984, "step": 63460 }, { "epoch": 1.5504605086360637, "grad_norm": 0.05685468763113022, "learning_rate": 1.7429454564355744e-06, "loss": 0.0364, "num_input_tokens_seen": 42785056, "step": 63465 }, { "epoch": 1.5505826594679109, "grad_norm": 13.921191215515137, "learning_rate": 1.7428883730056884e-06, "loss": 0.1069, "num_input_tokens_seen": 42788192, "step": 63470 }, { "epoch": 1.550704810299758, "grad_norm": 11.072580337524414, "learning_rate": 1.7428312841733187e-06, "loss": 0.0492, "num_input_tokens_seen": 42791712, "step": 63475 }, { "epoch": 1.5508269611316052, "grad_norm": 0.28202781081199646, "learning_rate": 1.7427741899388798e-06, "loss": 0.1006, "num_input_tokens_seen": 42795360, "step": 63480 }, { "epoch": 1.5509491119634524, "grad_norm": 11.548276901245117, "learning_rate": 1.7427170903027874e-06, "loss": 0.1221, "num_input_tokens_seen": 42798496, "step": 63485 }, { "epoch": 1.5510712627952996, "grad_norm": 0.10354668647050858, "learning_rate": 1.7426599852654564e-06, "loss": 0.1871, "num_input_tokens_seen": 42802080, "step": 63490 }, { "epoch": 1.5511934136271468, "grad_norm": 0.5099055171012878, "learning_rate": 1.7426028748273023e-06, "loss": 0.0024, "num_input_tokens_seen": 42805536, "step": 63495 }, { "epoch": 1.551315564458994, "grad_norm": 0.38024604320526123, "learning_rate": 1.7425457589887405e-06, "loss": 0.0444, "num_input_tokens_seen": 42808736, "step": 63500 }, { "epoch": 1.5514377152908412, "grad_norm": 25.034109115600586, "learning_rate": 1.7424886377501862e-06, "loss": 0.0526, "num_input_tokens_seen": 42812192, "step": 63505 }, { "epoch": 1.5515598661226884, "grad_norm": 0.5092397332191467, "learning_rate": 1.7424315111120547e-06, "loss": 0.0499, "num_input_tokens_seen": 42815712, "step": 63510 }, { "epoch": 1.5516820169545356, "grad_norm": 0.198876291513443, "learning_rate": 1.7423743790747616e-06, "loss": 0.0718, "num_input_tokens_seen": 42819168, "step": 63515 }, { "epoch": 1.5518041677863827, "grad_norm": 0.14343546330928802, "learning_rate": 1.7423172416387221e-06, "loss": 0.0942, "num_input_tokens_seen": 42822496, "step": 63520 }, { "epoch": 1.55192631861823, "grad_norm": 0.5389144420623779, "learning_rate": 1.7422600988043521e-06, "loss": 0.0979, "num_input_tokens_seen": 42825504, "step": 63525 }, { "epoch": 1.552048469450077, "grad_norm": 0.06327793747186661, "learning_rate": 1.7422029505720671e-06, "loss": 0.0011, "num_input_tokens_seen": 42828448, "step": 63530 }, { "epoch": 1.552170620281924, "grad_norm": 0.1345394402742386, "learning_rate": 1.7421457969422828e-06, "loss": 0.0825, "num_input_tokens_seen": 42832224, "step": 63535 }, { "epoch": 1.5522927711137713, "grad_norm": 22.174745559692383, "learning_rate": 1.7420886379154145e-06, "loss": 0.044, "num_input_tokens_seen": 42835808, "step": 63540 }, { "epoch": 1.5524149219456185, "grad_norm": 71.50115966796875, "learning_rate": 1.742031473491878e-06, "loss": 0.23, "num_input_tokens_seen": 42839072, "step": 63545 }, { "epoch": 1.5525370727774657, "grad_norm": 36.79838943481445, "learning_rate": 1.7419743036720892e-06, "loss": 0.0474, "num_input_tokens_seen": 42842144, "step": 63550 }, { "epoch": 1.5526592236093126, "grad_norm": 0.07557157427072525, "learning_rate": 1.7419171284564634e-06, "loss": 0.0748, "num_input_tokens_seen": 42845920, "step": 63555 }, { "epoch": 1.5527813744411598, "grad_norm": 0.1075301319360733, "learning_rate": 1.7418599478454165e-06, "loss": 0.1036, "num_input_tokens_seen": 42849184, "step": 63560 }, { "epoch": 1.552903525273007, "grad_norm": 7.113918304443359, "learning_rate": 1.7418027618393651e-06, "loss": 0.1284, "num_input_tokens_seen": 42852192, "step": 63565 }, { "epoch": 1.5530256761048542, "grad_norm": 0.17635847628116608, "learning_rate": 1.741745570438724e-06, "loss": 0.0016, "num_input_tokens_seen": 42855840, "step": 63570 }, { "epoch": 1.5531478269367014, "grad_norm": 0.34732112288475037, "learning_rate": 1.7416883736439098e-06, "loss": 0.1145, "num_input_tokens_seen": 42858912, "step": 63575 }, { "epoch": 1.5532699777685486, "grad_norm": 0.2674591839313507, "learning_rate": 1.7416311714553385e-06, "loss": 0.0483, "num_input_tokens_seen": 42862112, "step": 63580 }, { "epoch": 1.5533921286003958, "grad_norm": 21.65047836303711, "learning_rate": 1.7415739638734257e-06, "loss": 0.0595, "num_input_tokens_seen": 42865312, "step": 63585 }, { "epoch": 1.553514279432243, "grad_norm": 0.06598281860351562, "learning_rate": 1.7415167508985876e-06, "loss": 0.0916, "num_input_tokens_seen": 42868640, "step": 63590 }, { "epoch": 1.5536364302640902, "grad_norm": 12.143324851989746, "learning_rate": 1.74145953253124e-06, "loss": 0.1145, "num_input_tokens_seen": 42871712, "step": 63595 }, { "epoch": 1.5537585810959373, "grad_norm": 8.922744750976562, "learning_rate": 1.7414023087717996e-06, "loss": 0.0532, "num_input_tokens_seen": 42875232, "step": 63600 }, { "epoch": 1.5538807319277845, "grad_norm": 13.498004913330078, "learning_rate": 1.741345079620682e-06, "loss": 0.0717, "num_input_tokens_seen": 42879072, "step": 63605 }, { "epoch": 1.5540028827596317, "grad_norm": 0.186791330575943, "learning_rate": 1.7412878450783036e-06, "loss": 0.0011, "num_input_tokens_seen": 42882272, "step": 63610 }, { "epoch": 1.554125033591479, "grad_norm": 0.030450141057372093, "learning_rate": 1.7412306051450806e-06, "loss": 0.0703, "num_input_tokens_seen": 42885536, "step": 63615 }, { "epoch": 1.5542471844233259, "grad_norm": 28.711727142333984, "learning_rate": 1.741173359821429e-06, "loss": 0.077, "num_input_tokens_seen": 42888864, "step": 63620 }, { "epoch": 1.554369335255173, "grad_norm": 0.3964475989341736, "learning_rate": 1.7411161091077657e-06, "loss": 0.1488, "num_input_tokens_seen": 42892128, "step": 63625 }, { "epoch": 1.5544914860870203, "grad_norm": 0.155262753367424, "learning_rate": 1.7410588530045067e-06, "loss": 0.1056, "num_input_tokens_seen": 42895328, "step": 63630 }, { "epoch": 1.5546136369188674, "grad_norm": 16.852354049682617, "learning_rate": 1.7410015915120684e-06, "loss": 0.1505, "num_input_tokens_seen": 42898656, "step": 63635 }, { "epoch": 1.5547357877507146, "grad_norm": 39.86412048339844, "learning_rate": 1.7409443246308674e-06, "loss": 0.1335, "num_input_tokens_seen": 42901664, "step": 63640 }, { "epoch": 1.5548579385825616, "grad_norm": 0.5770293474197388, "learning_rate": 1.7408870523613194e-06, "loss": 0.0452, "num_input_tokens_seen": 42905184, "step": 63645 }, { "epoch": 1.5549800894144088, "grad_norm": 0.6164991855621338, "learning_rate": 1.7408297747038422e-06, "loss": 0.0443, "num_input_tokens_seen": 42908256, "step": 63650 }, { "epoch": 1.555102240246256, "grad_norm": 49.42633819580078, "learning_rate": 1.740772491658851e-06, "loss": 0.0963, "num_input_tokens_seen": 42911904, "step": 63655 }, { "epoch": 1.5552243910781032, "grad_norm": 0.07455496490001678, "learning_rate": 1.7407152032267635e-06, "loss": 0.0008, "num_input_tokens_seen": 42915040, "step": 63660 }, { "epoch": 1.5553465419099504, "grad_norm": 0.21186937391757965, "learning_rate": 1.7406579094079957e-06, "loss": 0.0613, "num_input_tokens_seen": 42918112, "step": 63665 }, { "epoch": 1.5554686927417976, "grad_norm": 3.4493956565856934, "learning_rate": 1.740600610202964e-06, "loss": 0.1516, "num_input_tokens_seen": 42921632, "step": 63670 }, { "epoch": 1.5555908435736447, "grad_norm": 56.91830062866211, "learning_rate": 1.7405433056120857e-06, "loss": 0.1109, "num_input_tokens_seen": 42924832, "step": 63675 }, { "epoch": 1.555712994405492, "grad_norm": 0.1591949760913849, "learning_rate": 1.7404859956357774e-06, "loss": 0.078, "num_input_tokens_seen": 42928352, "step": 63680 }, { "epoch": 1.5558351452373391, "grad_norm": 0.17558668553829193, "learning_rate": 1.7404286802744556e-06, "loss": 0.0986, "num_input_tokens_seen": 42931616, "step": 63685 }, { "epoch": 1.5559572960691863, "grad_norm": 0.4763665795326233, "learning_rate": 1.7403713595285374e-06, "loss": 0.074, "num_input_tokens_seen": 42934880, "step": 63690 }, { "epoch": 1.5560794469010335, "grad_norm": 0.38771793246269226, "learning_rate": 1.7403140333984397e-06, "loss": 0.0027, "num_input_tokens_seen": 42938336, "step": 63695 }, { "epoch": 1.5562015977328807, "grad_norm": 9.273704528808594, "learning_rate": 1.7402567018845788e-06, "loss": 0.0601, "num_input_tokens_seen": 42941600, "step": 63700 }, { "epoch": 1.5563237485647279, "grad_norm": 0.10114264488220215, "learning_rate": 1.7401993649873722e-06, "loss": 0.0707, "num_input_tokens_seen": 42945184, "step": 63705 }, { "epoch": 1.5564458993965749, "grad_norm": 1.2880104780197144, "learning_rate": 1.740142022707237e-06, "loss": 0.0307, "num_input_tokens_seen": 42948640, "step": 63710 }, { "epoch": 1.556568050228422, "grad_norm": 95.30414581298828, "learning_rate": 1.7400846750445898e-06, "loss": 0.0839, "num_input_tokens_seen": 42952352, "step": 63715 }, { "epoch": 1.5566902010602692, "grad_norm": 28.39178466796875, "learning_rate": 1.7400273219998476e-06, "loss": 0.0931, "num_input_tokens_seen": 42955872, "step": 63720 }, { "epoch": 1.5568123518921164, "grad_norm": 0.11044997721910477, "learning_rate": 1.739969963573428e-06, "loss": 0.0014, "num_input_tokens_seen": 42958944, "step": 63725 }, { "epoch": 1.5569345027239634, "grad_norm": 109.26212310791016, "learning_rate": 1.7399125997657476e-06, "loss": 0.0107, "num_input_tokens_seen": 42962208, "step": 63730 }, { "epoch": 1.5570566535558106, "grad_norm": 0.16350312530994415, "learning_rate": 1.7398552305772238e-06, "loss": 0.0131, "num_input_tokens_seen": 42965408, "step": 63735 }, { "epoch": 1.5571788043876578, "grad_norm": 0.10667629539966583, "learning_rate": 1.7397978560082737e-06, "loss": 0.0463, "num_input_tokens_seen": 42968928, "step": 63740 }, { "epoch": 1.557300955219505, "grad_norm": 37.49443435668945, "learning_rate": 1.7397404760593147e-06, "loss": 0.2915, "num_input_tokens_seen": 42972064, "step": 63745 }, { "epoch": 1.5574231060513521, "grad_norm": 0.09544025361537933, "learning_rate": 1.739683090730764e-06, "loss": 0.0011, "num_input_tokens_seen": 42975328, "step": 63750 }, { "epoch": 1.5575452568831993, "grad_norm": 0.026848675683140755, "learning_rate": 1.7396257000230388e-06, "loss": 0.0585, "num_input_tokens_seen": 42978848, "step": 63755 }, { "epoch": 1.5576674077150465, "grad_norm": 12.65404987335205, "learning_rate": 1.7395683039365564e-06, "loss": 0.1041, "num_input_tokens_seen": 42982176, "step": 63760 }, { "epoch": 1.5577895585468937, "grad_norm": 11.774423599243164, "learning_rate": 1.7395109024717347e-06, "loss": 0.1248, "num_input_tokens_seen": 42985824, "step": 63765 }, { "epoch": 1.557911709378741, "grad_norm": 0.6977723836898804, "learning_rate": 1.7394534956289908e-06, "loss": 0.0651, "num_input_tokens_seen": 42989472, "step": 63770 }, { "epoch": 1.558033860210588, "grad_norm": 17.038700103759766, "learning_rate": 1.7393960834087422e-06, "loss": 0.1488, "num_input_tokens_seen": 42992480, "step": 63775 }, { "epoch": 1.5581560110424353, "grad_norm": 57.71630859375, "learning_rate": 1.7393386658114063e-06, "loss": 0.0823, "num_input_tokens_seen": 42995872, "step": 63780 }, { "epoch": 1.5582781618742825, "grad_norm": 38.44292449951172, "learning_rate": 1.7392812428374009e-06, "loss": 0.0981, "num_input_tokens_seen": 42999200, "step": 63785 }, { "epoch": 1.5584003127061297, "grad_norm": 15.2906494140625, "learning_rate": 1.7392238144871433e-06, "loss": 0.0597, "num_input_tokens_seen": 43003040, "step": 63790 }, { "epoch": 1.5585224635379769, "grad_norm": 1.041969656944275, "learning_rate": 1.7391663807610513e-06, "loss": 0.2189, "num_input_tokens_seen": 43006368, "step": 63795 }, { "epoch": 1.5586446143698238, "grad_norm": 0.5180288553237915, "learning_rate": 1.7391089416595426e-06, "loss": 0.0449, "num_input_tokens_seen": 43010528, "step": 63800 }, { "epoch": 1.558766765201671, "grad_norm": 21.01852035522461, "learning_rate": 1.7390514971830348e-06, "loss": 0.1947, "num_input_tokens_seen": 43013600, "step": 63805 }, { "epoch": 1.5588889160335182, "grad_norm": 13.4508638381958, "learning_rate": 1.7389940473319458e-06, "loss": 0.1185, "num_input_tokens_seen": 43017056, "step": 63810 }, { "epoch": 1.5590110668653654, "grad_norm": 23.188987731933594, "learning_rate": 1.7389365921066935e-06, "loss": 0.1306, "num_input_tokens_seen": 43020384, "step": 63815 }, { "epoch": 1.5591332176972124, "grad_norm": 8.736359596252441, "learning_rate": 1.7388791315076952e-06, "loss": 0.0493, "num_input_tokens_seen": 43023776, "step": 63820 }, { "epoch": 1.5592553685290595, "grad_norm": 0.2003953605890274, "learning_rate": 1.7388216655353694e-06, "loss": 0.0326, "num_input_tokens_seen": 43027232, "step": 63825 }, { "epoch": 1.5593775193609067, "grad_norm": 14.04684066772461, "learning_rate": 1.7387641941901334e-06, "loss": 0.0376, "num_input_tokens_seen": 43030624, "step": 63830 }, { "epoch": 1.559499670192754, "grad_norm": 0.3257925808429718, "learning_rate": 1.738706717472406e-06, "loss": 0.032, "num_input_tokens_seen": 43034272, "step": 63835 }, { "epoch": 1.5596218210246011, "grad_norm": 0.9033854603767395, "learning_rate": 1.7386492353826043e-06, "loss": 0.0396, "num_input_tokens_seen": 43037472, "step": 63840 }, { "epoch": 1.5597439718564483, "grad_norm": 34.160926818847656, "learning_rate": 1.7385917479211466e-06, "loss": 0.0999, "num_input_tokens_seen": 43040800, "step": 63845 }, { "epoch": 1.5598661226882955, "grad_norm": 0.5950321555137634, "learning_rate": 1.7385342550884514e-06, "loss": 0.1058, "num_input_tokens_seen": 43044000, "step": 63850 }, { "epoch": 1.5599882735201427, "grad_norm": 53.574180603027344, "learning_rate": 1.7384767568849363e-06, "loss": 0.1929, "num_input_tokens_seen": 43047392, "step": 63855 }, { "epoch": 1.5601104243519899, "grad_norm": 0.0721253752708435, "learning_rate": 1.7384192533110195e-06, "loss": 0.0006, "num_input_tokens_seen": 43050656, "step": 63860 }, { "epoch": 1.560232575183837, "grad_norm": 0.22721441090106964, "learning_rate": 1.7383617443671192e-06, "loss": 0.0021, "num_input_tokens_seen": 43054176, "step": 63865 }, { "epoch": 1.5603547260156843, "grad_norm": 0.8160129189491272, "learning_rate": 1.738304230053654e-06, "loss": 0.2202, "num_input_tokens_seen": 43057312, "step": 63870 }, { "epoch": 1.5604768768475314, "grad_norm": 0.3547270894050598, "learning_rate": 1.7382467103710417e-06, "loss": 0.0036, "num_input_tokens_seen": 43060768, "step": 63875 }, { "epoch": 1.5605990276793786, "grad_norm": 0.08533972501754761, "learning_rate": 1.738189185319701e-06, "loss": 0.1038, "num_input_tokens_seen": 43064224, "step": 63880 }, { "epoch": 1.5607211785112258, "grad_norm": 0.0986095517873764, "learning_rate": 1.7381316549000496e-06, "loss": 0.0009, "num_input_tokens_seen": 43067680, "step": 63885 }, { "epoch": 1.5608433293430728, "grad_norm": 8.909045219421387, "learning_rate": 1.7380741191125063e-06, "loss": 0.1021, "num_input_tokens_seen": 43070880, "step": 63890 }, { "epoch": 1.56096548017492, "grad_norm": 0.10495179146528244, "learning_rate": 1.7380165779574899e-06, "loss": 0.0362, "num_input_tokens_seen": 43074272, "step": 63895 }, { "epoch": 1.5610876310067672, "grad_norm": 71.92479705810547, "learning_rate": 1.7379590314354178e-06, "loss": 0.1742, "num_input_tokens_seen": 43077472, "step": 63900 }, { "epoch": 1.5612097818386144, "grad_norm": 0.01743456721305847, "learning_rate": 1.7379014795467097e-06, "loss": 0.0004, "num_input_tokens_seen": 43080672, "step": 63905 }, { "epoch": 1.5613319326704613, "grad_norm": 0.5658043622970581, "learning_rate": 1.7378439222917834e-06, "loss": 0.0006, "num_input_tokens_seen": 43083872, "step": 63910 }, { "epoch": 1.5614540835023085, "grad_norm": 0.8468763828277588, "learning_rate": 1.7377863596710575e-06, "loss": 0.0017, "num_input_tokens_seen": 43087264, "step": 63915 }, { "epoch": 1.5615762343341557, "grad_norm": 0.7965541481971741, "learning_rate": 1.737728791684951e-06, "loss": 0.0293, "num_input_tokens_seen": 43090464, "step": 63920 }, { "epoch": 1.561698385166003, "grad_norm": 176.9981231689453, "learning_rate": 1.7376712183338823e-06, "loss": 0.1468, "num_input_tokens_seen": 43093664, "step": 63925 }, { "epoch": 1.56182053599785, "grad_norm": 20.599937438964844, "learning_rate": 1.7376136396182696e-06, "loss": 0.232, "num_input_tokens_seen": 43096672, "step": 63930 }, { "epoch": 1.5619426868296973, "grad_norm": 72.59481048583984, "learning_rate": 1.7375560555385324e-06, "loss": 0.1323, "num_input_tokens_seen": 43100064, "step": 63935 }, { "epoch": 1.5620648376615445, "grad_norm": 0.28954315185546875, "learning_rate": 1.7374984660950896e-06, "loss": 0.0674, "num_input_tokens_seen": 43103264, "step": 63940 }, { "epoch": 1.5621869884933917, "grad_norm": 0.06325811892747879, "learning_rate": 1.737440871288359e-06, "loss": 0.0028, "num_input_tokens_seen": 43106208, "step": 63945 }, { "epoch": 1.5623091393252388, "grad_norm": 74.24653625488281, "learning_rate": 1.7373832711187604e-06, "loss": 0.1143, "num_input_tokens_seen": 43109792, "step": 63950 }, { "epoch": 1.562431290157086, "grad_norm": 38.554996490478516, "learning_rate": 1.737325665586712e-06, "loss": 0.0738, "num_input_tokens_seen": 43112992, "step": 63955 }, { "epoch": 1.5625534409889332, "grad_norm": 0.032150134444236755, "learning_rate": 1.7372680546926333e-06, "loss": 0.0492, "num_input_tokens_seen": 43116064, "step": 63960 }, { "epoch": 1.5626755918207804, "grad_norm": 106.1802978515625, "learning_rate": 1.737210438436943e-06, "loss": 0.0771, "num_input_tokens_seen": 43119072, "step": 63965 }, { "epoch": 1.5627977426526276, "grad_norm": 11.328579902648926, "learning_rate": 1.7371528168200603e-06, "loss": 0.0857, "num_input_tokens_seen": 43122528, "step": 63970 }, { "epoch": 1.5629198934844746, "grad_norm": 0.25265300273895264, "learning_rate": 1.7370951898424036e-06, "loss": 0.0913, "num_input_tokens_seen": 43126048, "step": 63975 }, { "epoch": 1.5630420443163218, "grad_norm": 0.15176236629486084, "learning_rate": 1.7370375575043927e-06, "loss": 0.0444, "num_input_tokens_seen": 43129376, "step": 63980 }, { "epoch": 1.563164195148169, "grad_norm": 0.054856766015291214, "learning_rate": 1.7369799198064463e-06, "loss": 0.0455, "num_input_tokens_seen": 43132704, "step": 63985 }, { "epoch": 1.5632863459800161, "grad_norm": 36.12868118286133, "learning_rate": 1.736922276748984e-06, "loss": 0.1218, "num_input_tokens_seen": 43135968, "step": 63990 }, { "epoch": 1.5634084968118633, "grad_norm": 0.5585662722587585, "learning_rate": 1.7368646283324245e-06, "loss": 0.0014, "num_input_tokens_seen": 43139104, "step": 63995 }, { "epoch": 1.5635306476437103, "grad_norm": 0.11328171193599701, "learning_rate": 1.7368069745571869e-06, "loss": 0.1232, "num_input_tokens_seen": 43142304, "step": 64000 }, { "epoch": 1.5636527984755575, "grad_norm": 39.63161849975586, "learning_rate": 1.7367493154236913e-06, "loss": 0.1279, "num_input_tokens_seen": 43145312, "step": 64005 }, { "epoch": 1.5637749493074047, "grad_norm": 107.07310485839844, "learning_rate": 1.736691650932356e-06, "loss": 0.1902, "num_input_tokens_seen": 43148384, "step": 64010 }, { "epoch": 1.5638971001392519, "grad_norm": 0.07622405141592026, "learning_rate": 1.7366339810836012e-06, "loss": 0.0904, "num_input_tokens_seen": 43151392, "step": 64015 }, { "epoch": 1.564019250971099, "grad_norm": 0.05828621983528137, "learning_rate": 1.736576305877846e-06, "loss": 0.0997, "num_input_tokens_seen": 43154592, "step": 64020 }, { "epoch": 1.5641414018029463, "grad_norm": 32.318519592285156, "learning_rate": 1.7365186253155097e-06, "loss": 0.2109, "num_input_tokens_seen": 43157920, "step": 64025 }, { "epoch": 1.5642635526347934, "grad_norm": 17.8074893951416, "learning_rate": 1.736460939397012e-06, "loss": 0.0488, "num_input_tokens_seen": 43161568, "step": 64030 }, { "epoch": 1.5643857034666406, "grad_norm": 18.10813331604004, "learning_rate": 1.736403248122772e-06, "loss": 0.1417, "num_input_tokens_seen": 43164832, "step": 64035 }, { "epoch": 1.5645078542984878, "grad_norm": 8.896696090698242, "learning_rate": 1.7363455514932097e-06, "loss": 0.1029, "num_input_tokens_seen": 43168160, "step": 64040 }, { "epoch": 1.564630005130335, "grad_norm": 34.16524887084961, "learning_rate": 1.7362878495087446e-06, "loss": 0.1021, "num_input_tokens_seen": 43171424, "step": 64045 }, { "epoch": 1.5647521559621822, "grad_norm": 36.849449157714844, "learning_rate": 1.7362301421697963e-06, "loss": 0.0037, "num_input_tokens_seen": 43174304, "step": 64050 }, { "epoch": 1.5648743067940294, "grad_norm": 30.30545425415039, "learning_rate": 1.7361724294767839e-06, "loss": 0.0322, "num_input_tokens_seen": 43177952, "step": 64055 }, { "epoch": 1.5649964576258766, "grad_norm": 0.8645069003105164, "learning_rate": 1.7361147114301279e-06, "loss": 0.1224, "num_input_tokens_seen": 43181152, "step": 64060 }, { "epoch": 1.5651186084577235, "grad_norm": 0.04241948202252388, "learning_rate": 1.7360569880302478e-06, "loss": 0.038, "num_input_tokens_seen": 43184416, "step": 64065 }, { "epoch": 1.5652407592895707, "grad_norm": 0.1801944077014923, "learning_rate": 1.735999259277563e-06, "loss": 0.0331, "num_input_tokens_seen": 43187680, "step": 64070 }, { "epoch": 1.565362910121418, "grad_norm": 0.2641149163246155, "learning_rate": 1.7359415251724938e-06, "loss": 0.107, "num_input_tokens_seen": 43191264, "step": 64075 }, { "epoch": 1.5654850609532651, "grad_norm": 0.15654908120632172, "learning_rate": 1.73588378571546e-06, "loss": 0.1043, "num_input_tokens_seen": 43194336, "step": 64080 }, { "epoch": 1.5656072117851123, "grad_norm": 0.35645878314971924, "learning_rate": 1.7358260409068813e-06, "loss": 0.0851, "num_input_tokens_seen": 43197088, "step": 64085 }, { "epoch": 1.5657293626169593, "grad_norm": 0.330046147108078, "learning_rate": 1.7357682907471776e-06, "loss": 0.0457, "num_input_tokens_seen": 43200672, "step": 64090 }, { "epoch": 1.5658515134488065, "grad_norm": 0.18050967156887054, "learning_rate": 1.7357105352367692e-06, "loss": 0.0474, "num_input_tokens_seen": 43203872, "step": 64095 }, { "epoch": 1.5659736642806537, "grad_norm": 0.2091437578201294, "learning_rate": 1.7356527743760756e-06, "loss": 0.0015, "num_input_tokens_seen": 43207136, "step": 64100 }, { "epoch": 1.5660958151125008, "grad_norm": 464.6640930175781, "learning_rate": 1.7355950081655175e-06, "loss": 0.0337, "num_input_tokens_seen": 43210208, "step": 64105 }, { "epoch": 1.566217965944348, "grad_norm": 73.33601379394531, "learning_rate": 1.7355372366055145e-06, "loss": 0.1464, "num_input_tokens_seen": 43213792, "step": 64110 }, { "epoch": 1.5663401167761952, "grad_norm": 0.3620937466621399, "learning_rate": 1.7354794596964869e-06, "loss": 0.1498, "num_input_tokens_seen": 43216928, "step": 64115 }, { "epoch": 1.5664622676080424, "grad_norm": 1.3183008432388306, "learning_rate": 1.7354216774388549e-06, "loss": 0.0867, "num_input_tokens_seen": 43219872, "step": 64120 }, { "epoch": 1.5665844184398896, "grad_norm": 21.02619743347168, "learning_rate": 1.7353638898330384e-06, "loss": 0.1274, "num_input_tokens_seen": 43223264, "step": 64125 }, { "epoch": 1.5667065692717368, "grad_norm": 1.7390692234039307, "learning_rate": 1.7353060968794582e-06, "loss": 0.0383, "num_input_tokens_seen": 43226656, "step": 64130 }, { "epoch": 1.566828720103584, "grad_norm": 0.12568193674087524, "learning_rate": 1.735248298578534e-06, "loss": 0.0377, "num_input_tokens_seen": 43229728, "step": 64135 }, { "epoch": 1.5669508709354312, "grad_norm": 27.419641494750977, "learning_rate": 1.7351904949306867e-06, "loss": 0.0776, "num_input_tokens_seen": 43233056, "step": 64140 }, { "epoch": 1.5670730217672784, "grad_norm": 1.1645451784133911, "learning_rate": 1.7351326859363363e-06, "loss": 0.0346, "num_input_tokens_seen": 43236832, "step": 64145 }, { "epoch": 1.5671951725991256, "grad_norm": 13.680094718933105, "learning_rate": 1.7350748715959035e-06, "loss": 0.1614, "num_input_tokens_seen": 43240224, "step": 64150 }, { "epoch": 1.5673173234309725, "grad_norm": 0.7623006105422974, "learning_rate": 1.7350170519098079e-06, "loss": 0.0474, "num_input_tokens_seen": 43242848, "step": 64155 }, { "epoch": 1.5674394742628197, "grad_norm": 42.0316276550293, "learning_rate": 1.7349592268784712e-06, "loss": 0.0847, "num_input_tokens_seen": 43246240, "step": 64160 }, { "epoch": 1.567561625094667, "grad_norm": 0.2583397328853607, "learning_rate": 1.7349013965023129e-06, "loss": 0.0447, "num_input_tokens_seen": 43249824, "step": 64165 }, { "epoch": 1.567683775926514, "grad_norm": 46.822105407714844, "learning_rate": 1.7348435607817544e-06, "loss": 0.1189, "num_input_tokens_seen": 43253536, "step": 64170 }, { "epoch": 1.5678059267583613, "grad_norm": 0.4105156660079956, "learning_rate": 1.7347857197172155e-06, "loss": 0.0759, "num_input_tokens_seen": 43256992, "step": 64175 }, { "epoch": 1.5679280775902082, "grad_norm": 32.56477355957031, "learning_rate": 1.7347278733091174e-06, "loss": 0.1785, "num_input_tokens_seen": 43260256, "step": 64180 }, { "epoch": 1.5680502284220554, "grad_norm": 31.76134490966797, "learning_rate": 1.7346700215578808e-06, "loss": 0.0056, "num_input_tokens_seen": 43263456, "step": 64185 }, { "epoch": 1.5681723792539026, "grad_norm": 27.205013275146484, "learning_rate": 1.7346121644639258e-06, "loss": 0.1561, "num_input_tokens_seen": 43267168, "step": 64190 }, { "epoch": 1.5682945300857498, "grad_norm": 57.560577392578125, "learning_rate": 1.7345543020276735e-06, "loss": 0.1017, "num_input_tokens_seen": 43270560, "step": 64195 }, { "epoch": 1.568416680917597, "grad_norm": 0.020825877785682678, "learning_rate": 1.734496434249545e-06, "loss": 0.168, "num_input_tokens_seen": 43273504, "step": 64200 }, { "epoch": 1.5685388317494442, "grad_norm": 0.14802296459674835, "learning_rate": 1.734438561129961e-06, "loss": 0.021, "num_input_tokens_seen": 43276896, "step": 64205 }, { "epoch": 1.5686609825812914, "grad_norm": 0.1316024214029312, "learning_rate": 1.734380682669342e-06, "loss": 0.0446, "num_input_tokens_seen": 43280160, "step": 64210 }, { "epoch": 1.5687831334131386, "grad_norm": 7.745893478393555, "learning_rate": 1.734322798868109e-06, "loss": 0.1164, "num_input_tokens_seen": 43283232, "step": 64215 }, { "epoch": 1.5689052842449858, "grad_norm": 73.67494201660156, "learning_rate": 1.7342649097266837e-06, "loss": 0.0437, "num_input_tokens_seen": 43286688, "step": 64220 }, { "epoch": 1.569027435076833, "grad_norm": 21.934650421142578, "learning_rate": 1.734207015245486e-06, "loss": 0.1511, "num_input_tokens_seen": 43290080, "step": 64225 }, { "epoch": 1.5691495859086801, "grad_norm": 0.7329850792884827, "learning_rate": 1.7341491154249374e-06, "loss": 0.0503, "num_input_tokens_seen": 43293728, "step": 64230 }, { "epoch": 1.5692717367405273, "grad_norm": 0.7669029235839844, "learning_rate": 1.734091210265459e-06, "loss": 0.0409, "num_input_tokens_seen": 43297120, "step": 64235 }, { "epoch": 1.5693938875723745, "grad_norm": 0.008779598399996758, "learning_rate": 1.7340332997674722e-06, "loss": 0.0017, "num_input_tokens_seen": 43300064, "step": 64240 }, { "epoch": 1.5695160384042215, "grad_norm": 0.07726084440946579, "learning_rate": 1.7339753839313972e-06, "loss": 0.1356, "num_input_tokens_seen": 43303328, "step": 64245 }, { "epoch": 1.5696381892360687, "grad_norm": 26.37903594970703, "learning_rate": 1.7339174627576564e-06, "loss": 0.0647, "num_input_tokens_seen": 43306272, "step": 64250 }, { "epoch": 1.5697603400679159, "grad_norm": 0.27737849950790405, "learning_rate": 1.7338595362466702e-06, "loss": 0.1772, "num_input_tokens_seen": 43309472, "step": 64255 }, { "epoch": 1.569882490899763, "grad_norm": 0.08733111619949341, "learning_rate": 1.73380160439886e-06, "loss": 0.0525, "num_input_tokens_seen": 43312608, "step": 64260 }, { "epoch": 1.57000464173161, "grad_norm": 0.07386315613985062, "learning_rate": 1.7337436672146472e-06, "loss": 0.0366, "num_input_tokens_seen": 43316000, "step": 64265 }, { "epoch": 1.5701267925634572, "grad_norm": 0.4418039321899414, "learning_rate": 1.7336857246944532e-06, "loss": 0.074, "num_input_tokens_seen": 43318752, "step": 64270 }, { "epoch": 1.5702489433953044, "grad_norm": 0.2571321725845337, "learning_rate": 1.7336277768386992e-06, "loss": 0.0944, "num_input_tokens_seen": 43321760, "step": 64275 }, { "epoch": 1.5703710942271516, "grad_norm": 0.031792718917131424, "learning_rate": 1.7335698236478065e-06, "loss": 0.1143, "num_input_tokens_seen": 43324896, "step": 64280 }, { "epoch": 1.5704932450589988, "grad_norm": 17.855899810791016, "learning_rate": 1.733511865122197e-06, "loss": 0.0621, "num_input_tokens_seen": 43328480, "step": 64285 }, { "epoch": 1.570615395890846, "grad_norm": 0.48485782742500305, "learning_rate": 1.7334539012622918e-06, "loss": 0.0431, "num_input_tokens_seen": 43331680, "step": 64290 }, { "epoch": 1.5707375467226932, "grad_norm": 0.012995784170925617, "learning_rate": 1.7333959320685125e-06, "loss": 0.2625, "num_input_tokens_seen": 43334880, "step": 64295 }, { "epoch": 1.5708596975545404, "grad_norm": 125.5101547241211, "learning_rate": 1.7333379575412809e-06, "loss": 0.0224, "num_input_tokens_seen": 43338208, "step": 64300 }, { "epoch": 1.5709818483863875, "grad_norm": 1.2389947175979614, "learning_rate": 1.7332799776810184e-06, "loss": 0.0158, "num_input_tokens_seen": 43341280, "step": 64305 }, { "epoch": 1.5711039992182347, "grad_norm": 20.650035858154297, "learning_rate": 1.7332219924881465e-06, "loss": 0.1509, "num_input_tokens_seen": 43344864, "step": 64310 }, { "epoch": 1.571226150050082, "grad_norm": 0.08857346326112747, "learning_rate": 1.7331640019630874e-06, "loss": 0.0489, "num_input_tokens_seen": 43348192, "step": 64315 }, { "epoch": 1.5713483008819291, "grad_norm": 0.0540444515645504, "learning_rate": 1.733106006106262e-06, "loss": 0.1044, "num_input_tokens_seen": 43351776, "step": 64320 }, { "epoch": 1.5714704517137763, "grad_norm": 160.40611267089844, "learning_rate": 1.7330480049180927e-06, "loss": 0.1271, "num_input_tokens_seen": 43355744, "step": 64325 }, { "epoch": 1.5715926025456235, "grad_norm": 0.03458704426884651, "learning_rate": 1.7329899983990013e-06, "loss": 0.0984, "num_input_tokens_seen": 43359392, "step": 64330 }, { "epoch": 1.5717147533774705, "grad_norm": 34.95131301879883, "learning_rate": 1.7329319865494094e-06, "loss": 0.1279, "num_input_tokens_seen": 43362912, "step": 64335 }, { "epoch": 1.5718369042093177, "grad_norm": 0.5489761829376221, "learning_rate": 1.7328739693697389e-06, "loss": 0.0015, "num_input_tokens_seen": 43366048, "step": 64340 }, { "epoch": 1.5719590550411648, "grad_norm": 0.08397287130355835, "learning_rate": 1.7328159468604118e-06, "loss": 0.062, "num_input_tokens_seen": 43368992, "step": 64345 }, { "epoch": 1.572081205873012, "grad_norm": 38.11424255371094, "learning_rate": 1.73275791902185e-06, "loss": 0.0844, "num_input_tokens_seen": 43372576, "step": 64350 }, { "epoch": 1.572203356704859, "grad_norm": 0.03804247826337814, "learning_rate": 1.7326998858544757e-06, "loss": 0.0953, "num_input_tokens_seen": 43375968, "step": 64355 }, { "epoch": 1.5723255075367062, "grad_norm": 9.633445739746094, "learning_rate": 1.7326418473587108e-06, "loss": 0.0172, "num_input_tokens_seen": 43379040, "step": 64360 }, { "epoch": 1.5724476583685534, "grad_norm": 22.06827735900879, "learning_rate": 1.732583803534977e-06, "loss": 0.128, "num_input_tokens_seen": 43382496, "step": 64365 }, { "epoch": 1.5725698092004006, "grad_norm": 88.87641143798828, "learning_rate": 1.732525754383697e-06, "loss": 0.0183, "num_input_tokens_seen": 43385696, "step": 64370 }, { "epoch": 1.5726919600322478, "grad_norm": 18.26456642150879, "learning_rate": 1.7324676999052925e-06, "loss": 0.0768, "num_input_tokens_seen": 43389472, "step": 64375 }, { "epoch": 1.572814110864095, "grad_norm": 0.2304680347442627, "learning_rate": 1.7324096401001862e-06, "loss": 0.0202, "num_input_tokens_seen": 43393120, "step": 64380 }, { "epoch": 1.5729362616959421, "grad_norm": 0.14339575171470642, "learning_rate": 1.7323515749687997e-06, "loss": 0.0321, "num_input_tokens_seen": 43396128, "step": 64385 }, { "epoch": 1.5730584125277893, "grad_norm": 0.18540513515472412, "learning_rate": 1.7322935045115557e-06, "loss": 0.0721, "num_input_tokens_seen": 43399840, "step": 64390 }, { "epoch": 1.5731805633596365, "grad_norm": 9.878387451171875, "learning_rate": 1.732235428728876e-06, "loss": 0.0916, "num_input_tokens_seen": 43402912, "step": 64395 }, { "epoch": 1.5733027141914837, "grad_norm": 1.50212824344635, "learning_rate": 1.732177347621184e-06, "loss": 0.18, "num_input_tokens_seen": 43405856, "step": 64400 }, { "epoch": 1.573424865023331, "grad_norm": 1.5309032201766968, "learning_rate": 1.7321192611889008e-06, "loss": 0.0016, "num_input_tokens_seen": 43409248, "step": 64405 }, { "epoch": 1.573547015855178, "grad_norm": 26.32704734802246, "learning_rate": 1.7320611694324497e-06, "loss": 0.0976, "num_input_tokens_seen": 43413216, "step": 64410 }, { "epoch": 1.5736691666870253, "grad_norm": 2.3686509132385254, "learning_rate": 1.7320030723522527e-06, "loss": 0.0812, "num_input_tokens_seen": 43416352, "step": 64415 }, { "epoch": 1.5737913175188725, "grad_norm": 0.5000460147857666, "learning_rate": 1.7319449699487327e-06, "loss": 0.0454, "num_input_tokens_seen": 43419488, "step": 64420 }, { "epoch": 1.5739134683507194, "grad_norm": 25.742910385131836, "learning_rate": 1.731886862222312e-06, "loss": 0.1577, "num_input_tokens_seen": 43423456, "step": 64425 }, { "epoch": 1.5740356191825666, "grad_norm": 0.22029350697994232, "learning_rate": 1.7318287491734131e-06, "loss": 0.0902, "num_input_tokens_seen": 43427040, "step": 64430 }, { "epoch": 1.5741577700144138, "grad_norm": 0.23451527953147888, "learning_rate": 1.7317706308024587e-06, "loss": 0.2328, "num_input_tokens_seen": 43430432, "step": 64435 }, { "epoch": 1.574279920846261, "grad_norm": 1.194338321685791, "learning_rate": 1.7317125071098712e-06, "loss": 0.006, "num_input_tokens_seen": 43433952, "step": 64440 }, { "epoch": 1.574402071678108, "grad_norm": 0.13677367568016052, "learning_rate": 1.731654378096074e-06, "loss": 0.1223, "num_input_tokens_seen": 43436896, "step": 64445 }, { "epoch": 1.5745242225099552, "grad_norm": 1.720357894897461, "learning_rate": 1.731596243761489e-06, "loss": 0.0534, "num_input_tokens_seen": 43440096, "step": 64450 }, { "epoch": 1.5746463733418024, "grad_norm": 0.05810659006237984, "learning_rate": 1.7315381041065396e-06, "loss": 0.0694, "num_input_tokens_seen": 43443424, "step": 64455 }, { "epoch": 1.5747685241736495, "grad_norm": 18.547910690307617, "learning_rate": 1.7314799591316483e-06, "loss": 0.0658, "num_input_tokens_seen": 43446560, "step": 64460 }, { "epoch": 1.5748906750054967, "grad_norm": 24.63653564453125, "learning_rate": 1.7314218088372378e-06, "loss": 0.1655, "num_input_tokens_seen": 43450336, "step": 64465 }, { "epoch": 1.575012825837344, "grad_norm": 0.2034972906112671, "learning_rate": 1.7313636532237315e-06, "loss": 0.1515, "num_input_tokens_seen": 43453984, "step": 64470 }, { "epoch": 1.5751349766691911, "grad_norm": 12.121557235717773, "learning_rate": 1.7313054922915518e-06, "loss": 0.0834, "num_input_tokens_seen": 43457248, "step": 64475 }, { "epoch": 1.5752571275010383, "grad_norm": 0.231455460190773, "learning_rate": 1.7312473260411217e-06, "loss": 0.0865, "num_input_tokens_seen": 43460384, "step": 64480 }, { "epoch": 1.5753792783328855, "grad_norm": 26.974008560180664, "learning_rate": 1.7311891544728645e-06, "loss": 0.146, "num_input_tokens_seen": 43463392, "step": 64485 }, { "epoch": 1.5755014291647327, "grad_norm": 9.78650188446045, "learning_rate": 1.7311309775872031e-06, "loss": 0.0538, "num_input_tokens_seen": 43466528, "step": 64490 }, { "epoch": 1.5756235799965799, "grad_norm": 0.1319877803325653, "learning_rate": 1.7310727953845607e-06, "loss": 0.1977, "num_input_tokens_seen": 43469984, "step": 64495 }, { "epoch": 1.575745730828427, "grad_norm": 91.83895874023438, "learning_rate": 1.7310146078653602e-06, "loss": 0.0642, "num_input_tokens_seen": 43473440, "step": 64500 }, { "epoch": 1.5758678816602743, "grad_norm": 0.08182687312364578, "learning_rate": 1.7309564150300248e-06, "loss": 0.0194, "num_input_tokens_seen": 43477024, "step": 64505 }, { "epoch": 1.5759900324921212, "grad_norm": 14.08334732055664, "learning_rate": 1.7308982168789779e-06, "loss": 0.0588, "num_input_tokens_seen": 43480672, "step": 64510 }, { "epoch": 1.5761121833239684, "grad_norm": 11.436392784118652, "learning_rate": 1.7308400134126427e-06, "loss": 0.1166, "num_input_tokens_seen": 43484128, "step": 64515 }, { "epoch": 1.5762343341558156, "grad_norm": 17.357013702392578, "learning_rate": 1.730781804631442e-06, "loss": 0.0807, "num_input_tokens_seen": 43487520, "step": 64520 }, { "epoch": 1.5763564849876628, "grad_norm": 15.593206405639648, "learning_rate": 1.7307235905357996e-06, "loss": 0.1008, "num_input_tokens_seen": 43491232, "step": 64525 }, { "epoch": 1.57647863581951, "grad_norm": 3.3694682121276855, "learning_rate": 1.7306653711261387e-06, "loss": 0.1098, "num_input_tokens_seen": 43494240, "step": 64530 }, { "epoch": 1.576600786651357, "grad_norm": 0.666233479976654, "learning_rate": 1.7306071464028826e-06, "loss": 0.0432, "num_input_tokens_seen": 43497632, "step": 64535 }, { "epoch": 1.5767229374832041, "grad_norm": 18.11580467224121, "learning_rate": 1.730548916366455e-06, "loss": 0.0777, "num_input_tokens_seen": 43500896, "step": 64540 }, { "epoch": 1.5768450883150513, "grad_norm": 0.9629843235015869, "learning_rate": 1.730490681017279e-06, "loss": 0.0021, "num_input_tokens_seen": 43504288, "step": 64545 }, { "epoch": 1.5769672391468985, "grad_norm": 15.17105484008789, "learning_rate": 1.7304324403557783e-06, "loss": 0.0614, "num_input_tokens_seen": 43508192, "step": 64550 }, { "epoch": 1.5770893899787457, "grad_norm": 65.02957916259766, "learning_rate": 1.7303741943823767e-06, "loss": 0.1901, "num_input_tokens_seen": 43511776, "step": 64555 }, { "epoch": 1.577211540810593, "grad_norm": 0.9925330877304077, "learning_rate": 1.7303159430974974e-06, "loss": 0.0285, "num_input_tokens_seen": 43514976, "step": 64560 }, { "epoch": 1.57733369164244, "grad_norm": 0.08024363964796066, "learning_rate": 1.7302576865015642e-06, "loss": 0.1113, "num_input_tokens_seen": 43518432, "step": 64565 }, { "epoch": 1.5774558424742873, "grad_norm": 0.06430874764919281, "learning_rate": 1.7301994245950004e-06, "loss": 0.0049, "num_input_tokens_seen": 43522080, "step": 64570 }, { "epoch": 1.5775779933061345, "grad_norm": 28.217370986938477, "learning_rate": 1.7301411573782301e-06, "loss": 0.0845, "num_input_tokens_seen": 43525344, "step": 64575 }, { "epoch": 1.5777001441379817, "grad_norm": 0.19830197095870972, "learning_rate": 1.7300828848516771e-06, "loss": 0.0425, "num_input_tokens_seen": 43529248, "step": 64580 }, { "epoch": 1.5778222949698288, "grad_norm": 0.25371474027633667, "learning_rate": 1.730024607015765e-06, "loss": 0.1888, "num_input_tokens_seen": 43532384, "step": 64585 }, { "epoch": 1.577944445801676, "grad_norm": 21.84136962890625, "learning_rate": 1.7299663238709172e-06, "loss": 0.0429, "num_input_tokens_seen": 43535776, "step": 64590 }, { "epoch": 1.5780665966335232, "grad_norm": 0.4660642147064209, "learning_rate": 1.7299080354175584e-06, "loss": 0.0589, "num_input_tokens_seen": 43539296, "step": 64595 }, { "epoch": 1.5781887474653702, "grad_norm": 12.314225196838379, "learning_rate": 1.7298497416561118e-06, "loss": 0.0614, "num_input_tokens_seen": 43542496, "step": 64600 }, { "epoch": 1.5783108982972174, "grad_norm": 16.878976821899414, "learning_rate": 1.7297914425870017e-06, "loss": 0.0759, "num_input_tokens_seen": 43545952, "step": 64605 }, { "epoch": 1.5784330491290646, "grad_norm": 0.6693881750106812, "learning_rate": 1.7297331382106517e-06, "loss": 0.057, "num_input_tokens_seen": 43549088, "step": 64610 }, { "epoch": 1.5785551999609118, "grad_norm": 0.040294088423252106, "learning_rate": 1.7296748285274863e-06, "loss": 0.0591, "num_input_tokens_seen": 43552288, "step": 64615 }, { "epoch": 1.578677350792759, "grad_norm": 16.48003578186035, "learning_rate": 1.7296165135379292e-06, "loss": 0.131, "num_input_tokens_seen": 43555680, "step": 64620 }, { "epoch": 1.578799501624606, "grad_norm": 1.05326509475708, "learning_rate": 1.7295581932424045e-06, "loss": 0.0414, "num_input_tokens_seen": 43559968, "step": 64625 }, { "epoch": 1.578921652456453, "grad_norm": 0.1204957589507103, "learning_rate": 1.7294998676413367e-06, "loss": 0.0777, "num_input_tokens_seen": 43563360, "step": 64630 }, { "epoch": 1.5790438032883003, "grad_norm": 17.75843048095703, "learning_rate": 1.7294415367351492e-06, "loss": 0.0833, "num_input_tokens_seen": 43566880, "step": 64635 }, { "epoch": 1.5791659541201475, "grad_norm": 0.19816263020038605, "learning_rate": 1.7293832005242668e-06, "loss": 0.0345, "num_input_tokens_seen": 43570272, "step": 64640 }, { "epoch": 1.5792881049519947, "grad_norm": 0.08913075178861618, "learning_rate": 1.7293248590091138e-06, "loss": 0.0457, "num_input_tokens_seen": 43573536, "step": 64645 }, { "epoch": 1.5794102557838419, "grad_norm": 12.546802520751953, "learning_rate": 1.7292665121901142e-06, "loss": 0.1409, "num_input_tokens_seen": 43576672, "step": 64650 }, { "epoch": 1.579532406615689, "grad_norm": 34.74605941772461, "learning_rate": 1.7292081600676922e-06, "loss": 0.0689, "num_input_tokens_seen": 43580192, "step": 64655 }, { "epoch": 1.5796545574475362, "grad_norm": 0.2504430413246155, "learning_rate": 1.7291498026422724e-06, "loss": 0.0539, "num_input_tokens_seen": 43583584, "step": 64660 }, { "epoch": 1.5797767082793834, "grad_norm": 12.61136245727539, "learning_rate": 1.7290914399142792e-06, "loss": 0.238, "num_input_tokens_seen": 43587488, "step": 64665 }, { "epoch": 1.5798988591112306, "grad_norm": 18.86175537109375, "learning_rate": 1.729033071884137e-06, "loss": 0.0774, "num_input_tokens_seen": 43590432, "step": 64670 }, { "epoch": 1.5800210099430778, "grad_norm": 39.11001968383789, "learning_rate": 1.72897469855227e-06, "loss": 0.0615, "num_input_tokens_seen": 43593440, "step": 64675 }, { "epoch": 1.580143160774925, "grad_norm": 0.5635188221931458, "learning_rate": 1.728916319919103e-06, "loss": 0.0023, "num_input_tokens_seen": 43596768, "step": 64680 }, { "epoch": 1.5802653116067722, "grad_norm": 0.09311782568693161, "learning_rate": 1.7288579359850606e-06, "loss": 0.0012, "num_input_tokens_seen": 43600288, "step": 64685 }, { "epoch": 1.5803874624386192, "grad_norm": 0.1712763011455536, "learning_rate": 1.728799546750567e-06, "loss": 0.0733, "num_input_tokens_seen": 43603936, "step": 64690 }, { "epoch": 1.5805096132704664, "grad_norm": 0.16141662001609802, "learning_rate": 1.728741152216047e-06, "loss": 0.0009, "num_input_tokens_seen": 43608096, "step": 64695 }, { "epoch": 1.5806317641023135, "grad_norm": 11.50343132019043, "learning_rate": 1.7286827523819256e-06, "loss": 0.1137, "num_input_tokens_seen": 43612000, "step": 64700 }, { "epoch": 1.5807539149341607, "grad_norm": 17.6826229095459, "learning_rate": 1.7286243472486274e-06, "loss": 0.2981, "num_input_tokens_seen": 43615840, "step": 64705 }, { "epoch": 1.580876065766008, "grad_norm": 0.3164464831352234, "learning_rate": 1.7285659368165766e-06, "loss": 0.0367, "num_input_tokens_seen": 43619296, "step": 64710 }, { "epoch": 1.5809982165978549, "grad_norm": 25.950359344482422, "learning_rate": 1.7285075210861986e-06, "loss": 0.1975, "num_input_tokens_seen": 43622496, "step": 64715 }, { "epoch": 1.581120367429702, "grad_norm": 0.5272526144981384, "learning_rate": 1.7284491000579178e-06, "loss": 0.0018, "num_input_tokens_seen": 43625952, "step": 64720 }, { "epoch": 1.5812425182615493, "grad_norm": 0.36506155133247375, "learning_rate": 1.7283906737321592e-06, "loss": 0.0131, "num_input_tokens_seen": 43629344, "step": 64725 }, { "epoch": 1.5813646690933965, "grad_norm": 1.3668677806854248, "learning_rate": 1.7283322421093478e-06, "loss": 0.1193, "num_input_tokens_seen": 43632736, "step": 64730 }, { "epoch": 1.5814868199252436, "grad_norm": 0.4250573217868805, "learning_rate": 1.7282738051899084e-06, "loss": 0.0439, "num_input_tokens_seen": 43635872, "step": 64735 }, { "epoch": 1.5816089707570908, "grad_norm": 13.346234321594238, "learning_rate": 1.728215362974266e-06, "loss": 0.0818, "num_input_tokens_seen": 43639136, "step": 64740 }, { "epoch": 1.581731121588938, "grad_norm": 0.7282446622848511, "learning_rate": 1.7281569154628456e-06, "loss": 0.068, "num_input_tokens_seen": 43642208, "step": 64745 }, { "epoch": 1.5818532724207852, "grad_norm": 0.3492739796638489, "learning_rate": 1.7280984626560725e-06, "loss": 0.0308, "num_input_tokens_seen": 43645920, "step": 64750 }, { "epoch": 1.5819754232526324, "grad_norm": 0.5976834297180176, "learning_rate": 1.728040004554371e-06, "loss": 0.106, "num_input_tokens_seen": 43649696, "step": 64755 }, { "epoch": 1.5820975740844796, "grad_norm": 0.11738604307174683, "learning_rate": 1.7279815411581674e-06, "loss": 0.0527, "num_input_tokens_seen": 43653408, "step": 64760 }, { "epoch": 1.5822197249163268, "grad_norm": 10.756890296936035, "learning_rate": 1.727923072467886e-06, "loss": 0.143, "num_input_tokens_seen": 43656480, "step": 64765 }, { "epoch": 1.582341875748174, "grad_norm": 23.856487274169922, "learning_rate": 1.727864598483952e-06, "loss": 0.1607, "num_input_tokens_seen": 43659936, "step": 64770 }, { "epoch": 1.5824640265800212, "grad_norm": 0.2920784056186676, "learning_rate": 1.7278061192067913e-06, "loss": 0.0015, "num_input_tokens_seen": 43663072, "step": 64775 }, { "epoch": 1.5825861774118681, "grad_norm": 17.303253173828125, "learning_rate": 1.7277476346368284e-06, "loss": 0.1688, "num_input_tokens_seen": 43666272, "step": 64780 }, { "epoch": 1.5827083282437153, "grad_norm": 0.15339502692222595, "learning_rate": 1.7276891447744888e-06, "loss": 0.0072, "num_input_tokens_seen": 43669408, "step": 64785 }, { "epoch": 1.5828304790755625, "grad_norm": 0.4489547312259674, "learning_rate": 1.7276306496201983e-06, "loss": 0.0011, "num_input_tokens_seen": 43672672, "step": 64790 }, { "epoch": 1.5829526299074097, "grad_norm": 0.1969112902879715, "learning_rate": 1.727572149174382e-06, "loss": 0.1677, "num_input_tokens_seen": 43675680, "step": 64795 }, { "epoch": 1.5830747807392567, "grad_norm": 0.2483512908220291, "learning_rate": 1.727513643437465e-06, "loss": 0.0625, "num_input_tokens_seen": 43678816, "step": 64800 }, { "epoch": 1.5831969315711039, "grad_norm": 14.387808799743652, "learning_rate": 1.7274551324098736e-06, "loss": 0.0381, "num_input_tokens_seen": 43682208, "step": 64805 }, { "epoch": 1.583319082402951, "grad_norm": 0.1599772721529007, "learning_rate": 1.7273966160920326e-06, "loss": 0.1027, "num_input_tokens_seen": 43685536, "step": 64810 }, { "epoch": 1.5834412332347982, "grad_norm": 0.1974867284297943, "learning_rate": 1.7273380944843678e-06, "loss": 0.0851, "num_input_tokens_seen": 43688800, "step": 64815 }, { "epoch": 1.5835633840666454, "grad_norm": 13.60886001586914, "learning_rate": 1.727279567587305e-06, "loss": 0.048, "num_input_tokens_seen": 43692000, "step": 64820 }, { "epoch": 1.5836855348984926, "grad_norm": 1.444703459739685, "learning_rate": 1.727221035401269e-06, "loss": 0.0625, "num_input_tokens_seen": 43695648, "step": 64825 }, { "epoch": 1.5838076857303398, "grad_norm": 18.226131439208984, "learning_rate": 1.7271624979266864e-06, "loss": 0.0375, "num_input_tokens_seen": 43698784, "step": 64830 }, { "epoch": 1.583929836562187, "grad_norm": 51.74169158935547, "learning_rate": 1.7271039551639826e-06, "loss": 0.0834, "num_input_tokens_seen": 43702112, "step": 64835 }, { "epoch": 1.5840519873940342, "grad_norm": 0.7884621024131775, "learning_rate": 1.727045407113583e-06, "loss": 0.0592, "num_input_tokens_seen": 43705696, "step": 64840 }, { "epoch": 1.5841741382258814, "grad_norm": 0.3161929249763489, "learning_rate": 1.7269868537759137e-06, "loss": 0.0441, "num_input_tokens_seen": 43709216, "step": 64845 }, { "epoch": 1.5842962890577286, "grad_norm": 0.4939757287502289, "learning_rate": 1.7269282951514006e-06, "loss": 0.0017, "num_input_tokens_seen": 43712544, "step": 64850 }, { "epoch": 1.5844184398895758, "grad_norm": 14.162518501281738, "learning_rate": 1.7268697312404694e-06, "loss": 0.1, "num_input_tokens_seen": 43716128, "step": 64855 }, { "epoch": 1.584540590721423, "grad_norm": 0.12434427440166473, "learning_rate": 1.726811162043546e-06, "loss": 0.096, "num_input_tokens_seen": 43719200, "step": 64860 }, { "epoch": 1.5846627415532701, "grad_norm": 0.20762816071510315, "learning_rate": 1.7267525875610562e-06, "loss": 0.0464, "num_input_tokens_seen": 43722912, "step": 64865 }, { "epoch": 1.584784892385117, "grad_norm": 0.6017253994941711, "learning_rate": 1.7266940077934262e-06, "loss": 0.0985, "num_input_tokens_seen": 43726240, "step": 64870 }, { "epoch": 1.5849070432169643, "grad_norm": 3.5484423637390137, "learning_rate": 1.726635422741082e-06, "loss": 0.0261, "num_input_tokens_seen": 43729504, "step": 64875 }, { "epoch": 1.5850291940488115, "grad_norm": 10.419028282165527, "learning_rate": 1.7265768324044495e-06, "loss": 0.0808, "num_input_tokens_seen": 43732960, "step": 64880 }, { "epoch": 1.5851513448806587, "grad_norm": 2.4890594482421875, "learning_rate": 1.7265182367839548e-06, "loss": 0.0526, "num_input_tokens_seen": 43736096, "step": 64885 }, { "epoch": 1.5852734957125056, "grad_norm": 17.033557891845703, "learning_rate": 1.7264596358800244e-06, "loss": 0.1818, "num_input_tokens_seen": 43739680, "step": 64890 }, { "epoch": 1.5853956465443528, "grad_norm": 10.960592269897461, "learning_rate": 1.7264010296930836e-06, "loss": 0.1195, "num_input_tokens_seen": 43743136, "step": 64895 }, { "epoch": 1.5855177973762, "grad_norm": 0.5055883526802063, "learning_rate": 1.7263424182235595e-06, "loss": 0.1465, "num_input_tokens_seen": 43746272, "step": 64900 }, { "epoch": 1.5856399482080472, "grad_norm": 22.364749908447266, "learning_rate": 1.7262838014718777e-06, "loss": 0.1043, "num_input_tokens_seen": 43749664, "step": 64905 }, { "epoch": 1.5857620990398944, "grad_norm": 0.5056003928184509, "learning_rate": 1.7262251794384648e-06, "loss": 0.0007, "num_input_tokens_seen": 43752800, "step": 64910 }, { "epoch": 1.5858842498717416, "grad_norm": 0.07914532721042633, "learning_rate": 1.7261665521237472e-06, "loss": 0.0954, "num_input_tokens_seen": 43756320, "step": 64915 }, { "epoch": 1.5860064007035888, "grad_norm": 0.12225379794836044, "learning_rate": 1.7261079195281512e-06, "loss": 0.1966, "num_input_tokens_seen": 43759968, "step": 64920 }, { "epoch": 1.586128551535436, "grad_norm": 52.27254867553711, "learning_rate": 1.7260492816521032e-06, "loss": 0.0482, "num_input_tokens_seen": 43763104, "step": 64925 }, { "epoch": 1.5862507023672832, "grad_norm": 0.3266526758670807, "learning_rate": 1.7259906384960293e-06, "loss": 0.1171, "num_input_tokens_seen": 43766368, "step": 64930 }, { "epoch": 1.5863728531991304, "grad_norm": 0.229824960231781, "learning_rate": 1.7259319900603562e-06, "loss": 0.0357, "num_input_tokens_seen": 43769376, "step": 64935 }, { "epoch": 1.5864950040309775, "grad_norm": 0.10840941965579987, "learning_rate": 1.7258733363455104e-06, "loss": 0.0418, "num_input_tokens_seen": 43772768, "step": 64940 }, { "epoch": 1.5866171548628247, "grad_norm": 19.677263259887695, "learning_rate": 1.7258146773519187e-06, "loss": 0.1548, "num_input_tokens_seen": 43775904, "step": 64945 }, { "epoch": 1.586739305694672, "grad_norm": 0.18879550695419312, "learning_rate": 1.725756013080007e-06, "loss": 0.1765, "num_input_tokens_seen": 43779040, "step": 64950 }, { "epoch": 1.586861456526519, "grad_norm": 0.5741985440254211, "learning_rate": 1.7256973435302027e-06, "loss": 0.003, "num_input_tokens_seen": 43782368, "step": 64955 }, { "epoch": 1.586983607358366, "grad_norm": 0.23880359530448914, "learning_rate": 1.725638668702932e-06, "loss": 0.0608, "num_input_tokens_seen": 43785504, "step": 64960 }, { "epoch": 1.5871057581902133, "grad_norm": 0.7572619318962097, "learning_rate": 1.725579988598622e-06, "loss": 0.0023, "num_input_tokens_seen": 43788640, "step": 64965 }, { "epoch": 1.5872279090220605, "grad_norm": 21.19196319580078, "learning_rate": 1.725521303217699e-06, "loss": 0.1631, "num_input_tokens_seen": 43792032, "step": 64970 }, { "epoch": 1.5873500598539076, "grad_norm": 0.24510979652404785, "learning_rate": 1.7254626125605898e-06, "loss": 0.0757, "num_input_tokens_seen": 43795552, "step": 64975 }, { "epoch": 1.5874722106857546, "grad_norm": 10.422627449035645, "learning_rate": 1.7254039166277213e-06, "loss": 0.0851, "num_input_tokens_seen": 43799840, "step": 64980 }, { "epoch": 1.5875943615176018, "grad_norm": 21.904415130615234, "learning_rate": 1.7253452154195206e-06, "loss": 0.0863, "num_input_tokens_seen": 43802720, "step": 64985 }, { "epoch": 1.587716512349449, "grad_norm": 0.7586684823036194, "learning_rate": 1.7252865089364144e-06, "loss": 0.001, "num_input_tokens_seen": 43807456, "step": 64990 }, { "epoch": 1.5878386631812962, "grad_norm": 0.05671500787138939, "learning_rate": 1.7252277971788298e-06, "loss": 0.1012, "num_input_tokens_seen": 43810656, "step": 64995 }, { "epoch": 1.5879608140131434, "grad_norm": 0.18258820474147797, "learning_rate": 1.7251690801471934e-06, "loss": 0.0888, "num_input_tokens_seen": 43813984, "step": 65000 }, { "epoch": 1.5880829648449906, "grad_norm": 0.19848677515983582, "learning_rate": 1.7251103578419323e-06, "loss": 0.0607, "num_input_tokens_seen": 43817504, "step": 65005 }, { "epoch": 1.5882051156768378, "grad_norm": 0.6028345227241516, "learning_rate": 1.725051630263474e-06, "loss": 0.0944, "num_input_tokens_seen": 43820896, "step": 65010 }, { "epoch": 1.588327266508685, "grad_norm": 14.807291984558105, "learning_rate": 1.7249928974122448e-06, "loss": 0.1087, "num_input_tokens_seen": 43823968, "step": 65015 }, { "epoch": 1.5884494173405321, "grad_norm": 0.07424309104681015, "learning_rate": 1.7249341592886721e-06, "loss": 0.0211, "num_input_tokens_seen": 43827552, "step": 65020 }, { "epoch": 1.5885715681723793, "grad_norm": 45.41783142089844, "learning_rate": 1.7248754158931838e-06, "loss": 0.1244, "num_input_tokens_seen": 43830816, "step": 65025 }, { "epoch": 1.5886937190042265, "grad_norm": 70.42790985107422, "learning_rate": 1.724816667226206e-06, "loss": 0.1491, "num_input_tokens_seen": 43834272, "step": 65030 }, { "epoch": 1.5888158698360737, "grad_norm": 0.2781122922897339, "learning_rate": 1.7247579132881668e-06, "loss": 0.0019, "num_input_tokens_seen": 43837664, "step": 65035 }, { "epoch": 1.588938020667921, "grad_norm": 0.11927325278520584, "learning_rate": 1.724699154079493e-06, "loss": 0.1295, "num_input_tokens_seen": 43840800, "step": 65040 }, { "epoch": 1.5890601714997679, "grad_norm": 0.09100142866373062, "learning_rate": 1.724640389600612e-06, "loss": 0.1518, "num_input_tokens_seen": 43844768, "step": 65045 }, { "epoch": 1.589182322331615, "grad_norm": 0.07561935484409332, "learning_rate": 1.7245816198519511e-06, "loss": 0.0872, "num_input_tokens_seen": 43848032, "step": 65050 }, { "epoch": 1.5893044731634622, "grad_norm": 10.376869201660156, "learning_rate": 1.7245228448339383e-06, "loss": 0.0401, "num_input_tokens_seen": 43851168, "step": 65055 }, { "epoch": 1.5894266239953094, "grad_norm": 97.4819564819336, "learning_rate": 1.724464064547e-06, "loss": 0.1346, "num_input_tokens_seen": 43854112, "step": 65060 }, { "epoch": 1.5895487748271566, "grad_norm": 0.20881225168704987, "learning_rate": 1.724405278991564e-06, "loss": 0.0352, "num_input_tokens_seen": 43857440, "step": 65065 }, { "epoch": 1.5896709256590036, "grad_norm": 0.13099144399166107, "learning_rate": 1.7243464881680583e-06, "loss": 0.0423, "num_input_tokens_seen": 43860768, "step": 65070 }, { "epoch": 1.5897930764908508, "grad_norm": 21.404760360717773, "learning_rate": 1.7242876920769102e-06, "loss": 0.0743, "num_input_tokens_seen": 43863776, "step": 65075 }, { "epoch": 1.589915227322698, "grad_norm": 0.13627037405967712, "learning_rate": 1.7242288907185469e-06, "loss": 0.0569, "num_input_tokens_seen": 43867424, "step": 65080 }, { "epoch": 1.5900373781545452, "grad_norm": 10.283689498901367, "learning_rate": 1.7241700840933964e-06, "loss": 0.1019, "num_input_tokens_seen": 43870688, "step": 65085 }, { "epoch": 1.5901595289863923, "grad_norm": 0.56661057472229, "learning_rate": 1.7241112722018864e-06, "loss": 0.084, "num_input_tokens_seen": 43873696, "step": 65090 }, { "epoch": 1.5902816798182395, "grad_norm": 0.2931710481643677, "learning_rate": 1.7240524550444442e-06, "loss": 0.0284, "num_input_tokens_seen": 43876704, "step": 65095 }, { "epoch": 1.5904038306500867, "grad_norm": 79.61620330810547, "learning_rate": 1.7239936326214978e-06, "loss": 0.01, "num_input_tokens_seen": 43879904, "step": 65100 }, { "epoch": 1.590525981481934, "grad_norm": 1.0331748723983765, "learning_rate": 1.7239348049334754e-06, "loss": 0.0773, "num_input_tokens_seen": 43883488, "step": 65105 }, { "epoch": 1.590648132313781, "grad_norm": 5.423376083374023, "learning_rate": 1.7238759719808043e-06, "loss": 0.0556, "num_input_tokens_seen": 43886560, "step": 65110 }, { "epoch": 1.5907702831456283, "grad_norm": 8.856134414672852, "learning_rate": 1.7238171337639122e-06, "loss": 0.089, "num_input_tokens_seen": 43889568, "step": 65115 }, { "epoch": 1.5908924339774755, "grad_norm": 58.78102493286133, "learning_rate": 1.7237582902832273e-06, "loss": 0.1761, "num_input_tokens_seen": 43893344, "step": 65120 }, { "epoch": 1.5910145848093227, "grad_norm": 0.2834632992744446, "learning_rate": 1.7236994415391774e-06, "loss": 0.1145, "num_input_tokens_seen": 43896544, "step": 65125 }, { "epoch": 1.5911367356411699, "grad_norm": 33.866458892822266, "learning_rate": 1.7236405875321904e-06, "loss": 0.1264, "num_input_tokens_seen": 43900064, "step": 65130 }, { "epoch": 1.5912588864730168, "grad_norm": 0.4411328136920929, "learning_rate": 1.7235817282626947e-06, "loss": 0.0535, "num_input_tokens_seen": 43903264, "step": 65135 }, { "epoch": 1.591381037304864, "grad_norm": 0.19796854257583618, "learning_rate": 1.7235228637311179e-06, "loss": 0.0065, "num_input_tokens_seen": 43906336, "step": 65140 }, { "epoch": 1.5915031881367112, "grad_norm": 0.765709638595581, "learning_rate": 1.723463993937888e-06, "loss": 0.0023, "num_input_tokens_seen": 43909664, "step": 65145 }, { "epoch": 1.5916253389685584, "grad_norm": 4.744302749633789, "learning_rate": 1.7234051188834338e-06, "loss": 0.0792, "num_input_tokens_seen": 43912608, "step": 65150 }, { "epoch": 1.5917474898004056, "grad_norm": 0.09163492918014526, "learning_rate": 1.7233462385681828e-06, "loss": 0.0474, "num_input_tokens_seen": 43915872, "step": 65155 }, { "epoch": 1.5918696406322526, "grad_norm": 35.5812873840332, "learning_rate": 1.723287352992563e-06, "loss": 0.2247, "num_input_tokens_seen": 43919072, "step": 65160 }, { "epoch": 1.5919917914640997, "grad_norm": 12.814735412597656, "learning_rate": 1.7232284621570037e-06, "loss": 0.0595, "num_input_tokens_seen": 43922272, "step": 65165 }, { "epoch": 1.592113942295947, "grad_norm": 20.20494842529297, "learning_rate": 1.7231695660619323e-06, "loss": 0.0626, "num_input_tokens_seen": 43925344, "step": 65170 }, { "epoch": 1.5922360931277941, "grad_norm": 0.18822571635246277, "learning_rate": 1.723110664707777e-06, "loss": 0.0018, "num_input_tokens_seen": 43928608, "step": 65175 }, { "epoch": 1.5923582439596413, "grad_norm": 56.161495208740234, "learning_rate": 1.7230517580949666e-06, "loss": 0.1544, "num_input_tokens_seen": 43931936, "step": 65180 }, { "epoch": 1.5924803947914885, "grad_norm": 0.06047762185335159, "learning_rate": 1.7229928462239296e-06, "loss": 0.0791, "num_input_tokens_seen": 43935456, "step": 65185 }, { "epoch": 1.5926025456233357, "grad_norm": 0.25422996282577515, "learning_rate": 1.7229339290950938e-06, "loss": 0.0047, "num_input_tokens_seen": 43938848, "step": 65190 }, { "epoch": 1.5927246964551829, "grad_norm": 0.334971159696579, "learning_rate": 1.7228750067088882e-06, "loss": 0.0549, "num_input_tokens_seen": 43942432, "step": 65195 }, { "epoch": 1.59284684728703, "grad_norm": 0.35727155208587646, "learning_rate": 1.7228160790657414e-06, "loss": 0.1257, "num_input_tokens_seen": 43945824, "step": 65200 }, { "epoch": 1.5929689981188773, "grad_norm": 0.2726553976535797, "learning_rate": 1.722757146166081e-06, "loss": 0.008, "num_input_tokens_seen": 43949344, "step": 65205 }, { "epoch": 1.5930911489507245, "grad_norm": 20.234010696411133, "learning_rate": 1.7226982080103367e-06, "loss": 0.1465, "num_input_tokens_seen": 43952992, "step": 65210 }, { "epoch": 1.5932132997825716, "grad_norm": 1.7432730197906494, "learning_rate": 1.7226392645989365e-06, "loss": 0.1148, "num_input_tokens_seen": 43956320, "step": 65215 }, { "epoch": 1.5933354506144188, "grad_norm": 0.2908968925476074, "learning_rate": 1.7225803159323094e-06, "loss": 0.0726, "num_input_tokens_seen": 43959328, "step": 65220 }, { "epoch": 1.5934576014462658, "grad_norm": 0.14125913381576538, "learning_rate": 1.7225213620108835e-06, "loss": 0.0435, "num_input_tokens_seen": 43962528, "step": 65225 }, { "epoch": 1.593579752278113, "grad_norm": 121.56201934814453, "learning_rate": 1.7224624028350885e-06, "loss": 0.0547, "num_input_tokens_seen": 43965600, "step": 65230 }, { "epoch": 1.5937019031099602, "grad_norm": 0.18387913703918457, "learning_rate": 1.722403438405352e-06, "loss": 0.0571, "num_input_tokens_seen": 43968992, "step": 65235 }, { "epoch": 1.5938240539418074, "grad_norm": 0.11335892230272293, "learning_rate": 1.7223444687221038e-06, "loss": 0.0006, "num_input_tokens_seen": 43972704, "step": 65240 }, { "epoch": 1.5939462047736546, "grad_norm": 17.0925350189209, "learning_rate": 1.722285493785772e-06, "loss": 0.1366, "num_input_tokens_seen": 43975904, "step": 65245 }, { "epoch": 1.5940683556055015, "grad_norm": 14.52631664276123, "learning_rate": 1.722226513596786e-06, "loss": 0.1517, "num_input_tokens_seen": 43979040, "step": 65250 }, { "epoch": 1.5941905064373487, "grad_norm": 237.27642822265625, "learning_rate": 1.7221675281555745e-06, "loss": 0.0374, "num_input_tokens_seen": 43982624, "step": 65255 }, { "epoch": 1.594312657269196, "grad_norm": 1.3043992519378662, "learning_rate": 1.7221085374625665e-06, "loss": 0.0302, "num_input_tokens_seen": 43985888, "step": 65260 }, { "epoch": 1.594434808101043, "grad_norm": 33.467750549316406, "learning_rate": 1.7220495415181913e-06, "loss": 0.212, "num_input_tokens_seen": 43989344, "step": 65265 }, { "epoch": 1.5945569589328903, "grad_norm": 142.94471740722656, "learning_rate": 1.721990540322877e-06, "loss": 0.0446, "num_input_tokens_seen": 43992608, "step": 65270 }, { "epoch": 1.5946791097647375, "grad_norm": 0.17905016243457794, "learning_rate": 1.7219315338770536e-06, "loss": 0.0013, "num_input_tokens_seen": 43995680, "step": 65275 }, { "epoch": 1.5948012605965847, "grad_norm": 32.415428161621094, "learning_rate": 1.7218725221811501e-06, "loss": 0.086, "num_input_tokens_seen": 43999264, "step": 65280 }, { "epoch": 1.5949234114284319, "grad_norm": 23.203704833984375, "learning_rate": 1.7218135052355954e-06, "loss": 0.1713, "num_input_tokens_seen": 44003552, "step": 65285 }, { "epoch": 1.595045562260279, "grad_norm": 2.7013700008392334, "learning_rate": 1.7217544830408187e-06, "loss": 0.0031, "num_input_tokens_seen": 44006688, "step": 65290 }, { "epoch": 1.5951677130921262, "grad_norm": 0.369367390871048, "learning_rate": 1.7216954555972492e-06, "loss": 0.0326, "num_input_tokens_seen": 44010336, "step": 65295 }, { "epoch": 1.5952898639239734, "grad_norm": 0.4860530197620392, "learning_rate": 1.7216364229053162e-06, "loss": 0.0907, "num_input_tokens_seen": 44013472, "step": 65300 }, { "epoch": 1.5954120147558206, "grad_norm": 11.849100112915039, "learning_rate": 1.721577384965449e-06, "loss": 0.1265, "num_input_tokens_seen": 44016928, "step": 65305 }, { "epoch": 1.5955341655876678, "grad_norm": 20.93488121032715, "learning_rate": 1.7215183417780771e-06, "loss": 0.0761, "num_input_tokens_seen": 44020512, "step": 65310 }, { "epoch": 1.5956563164195148, "grad_norm": 23.495223999023438, "learning_rate": 1.7214592933436298e-06, "loss": 0.0912, "num_input_tokens_seen": 44023840, "step": 65315 }, { "epoch": 1.595778467251362, "grad_norm": 0.09012192487716675, "learning_rate": 1.7214002396625365e-06, "loss": 0.2207, "num_input_tokens_seen": 44027296, "step": 65320 }, { "epoch": 1.5959006180832092, "grad_norm": 14.036751747131348, "learning_rate": 1.7213411807352265e-06, "loss": 0.0872, "num_input_tokens_seen": 44030496, "step": 65325 }, { "epoch": 1.5960227689150563, "grad_norm": 1.248896837234497, "learning_rate": 1.7212821165621295e-06, "loss": 0.0273, "num_input_tokens_seen": 44033952, "step": 65330 }, { "epoch": 1.5961449197469033, "grad_norm": 2.6545727252960205, "learning_rate": 1.7212230471436748e-06, "loss": 0.0352, "num_input_tokens_seen": 44037344, "step": 65335 }, { "epoch": 1.5962670705787505, "grad_norm": 13.713151931762695, "learning_rate": 1.7211639724802921e-06, "loss": 0.0852, "num_input_tokens_seen": 44040800, "step": 65340 }, { "epoch": 1.5963892214105977, "grad_norm": 256.8730773925781, "learning_rate": 1.7211048925724112e-06, "loss": 0.084, "num_input_tokens_seen": 44043744, "step": 65345 }, { "epoch": 1.5965113722424449, "grad_norm": 0.1393524557352066, "learning_rate": 1.7210458074204614e-06, "loss": 0.0648, "num_input_tokens_seen": 44047008, "step": 65350 }, { "epoch": 1.596633523074292, "grad_norm": 80.3912124633789, "learning_rate": 1.7209867170248726e-06, "loss": 0.2229, "num_input_tokens_seen": 44050208, "step": 65355 }, { "epoch": 1.5967556739061393, "grad_norm": 0.04061346873641014, "learning_rate": 1.7209276213860747e-06, "loss": 0.0578, "num_input_tokens_seen": 44053408, "step": 65360 }, { "epoch": 1.5968778247379865, "grad_norm": 20.790264129638672, "learning_rate": 1.7208685205044971e-06, "loss": 0.0661, "num_input_tokens_seen": 44056672, "step": 65365 }, { "epoch": 1.5969999755698336, "grad_norm": 0.7041231989860535, "learning_rate": 1.7208094143805695e-06, "loss": 0.1116, "num_input_tokens_seen": 44060256, "step": 65370 }, { "epoch": 1.5971221264016808, "grad_norm": 1.3291122913360596, "learning_rate": 1.7207503030147222e-06, "loss": 0.1478, "num_input_tokens_seen": 44063456, "step": 65375 }, { "epoch": 1.597244277233528, "grad_norm": 2.3246588706970215, "learning_rate": 1.7206911864073848e-06, "loss": 0.0438, "num_input_tokens_seen": 44066912, "step": 65380 }, { "epoch": 1.5973664280653752, "grad_norm": 0.02847222425043583, "learning_rate": 1.720632064558987e-06, "loss": 0.0284, "num_input_tokens_seen": 44070432, "step": 65385 }, { "epoch": 1.5974885788972224, "grad_norm": 32.99223327636719, "learning_rate": 1.7205729374699594e-06, "loss": 0.0805, "num_input_tokens_seen": 44073888, "step": 65390 }, { "epoch": 1.5976107297290696, "grad_norm": 0.5882880091667175, "learning_rate": 1.7205138051407312e-06, "loss": 0.1458, "num_input_tokens_seen": 44076896, "step": 65395 }, { "epoch": 1.5977328805609168, "grad_norm": 20.05073356628418, "learning_rate": 1.7204546675717333e-06, "loss": 0.0646, "num_input_tokens_seen": 44080224, "step": 65400 }, { "epoch": 1.5978550313927637, "grad_norm": 0.05178692564368248, "learning_rate": 1.720395524763395e-06, "loss": 0.1099, "num_input_tokens_seen": 44083808, "step": 65405 }, { "epoch": 1.597977182224611, "grad_norm": 0.40675821900367737, "learning_rate": 1.7203363767161468e-06, "loss": 0.084, "num_input_tokens_seen": 44086944, "step": 65410 }, { "epoch": 1.5980993330564581, "grad_norm": 16.000343322753906, "learning_rate": 1.7202772234304184e-06, "loss": 0.0325, "num_input_tokens_seen": 44089888, "step": 65415 }, { "epoch": 1.5982214838883053, "grad_norm": 20.65907859802246, "learning_rate": 1.7202180649066405e-06, "loss": 0.1516, "num_input_tokens_seen": 44093216, "step": 65420 }, { "epoch": 1.5983436347201523, "grad_norm": 0.039159201085567474, "learning_rate": 1.720158901145243e-06, "loss": 0.0362, "num_input_tokens_seen": 44097120, "step": 65425 }, { "epoch": 1.5984657855519995, "grad_norm": 15.833292961120605, "learning_rate": 1.7200997321466563e-06, "loss": 0.1402, "num_input_tokens_seen": 44100320, "step": 65430 }, { "epoch": 1.5985879363838467, "grad_norm": 1.0264534950256348, "learning_rate": 1.7200405579113108e-06, "loss": 0.0018, "num_input_tokens_seen": 44103456, "step": 65435 }, { "epoch": 1.5987100872156939, "grad_norm": 0.41825219988822937, "learning_rate": 1.7199813784396366e-06, "loss": 0.0501, "num_input_tokens_seen": 44106592, "step": 65440 }, { "epoch": 1.598832238047541, "grad_norm": 0.1690363883972168, "learning_rate": 1.7199221937320645e-06, "loss": 0.0764, "num_input_tokens_seen": 44109792, "step": 65445 }, { "epoch": 1.5989543888793882, "grad_norm": 0.024017304182052612, "learning_rate": 1.7198630037890243e-06, "loss": 0.1109, "num_input_tokens_seen": 44113248, "step": 65450 }, { "epoch": 1.5990765397112354, "grad_norm": 26.482099533081055, "learning_rate": 1.7198038086109467e-06, "loss": 0.2167, "num_input_tokens_seen": 44116640, "step": 65455 }, { "epoch": 1.5991986905430826, "grad_norm": 0.3642503023147583, "learning_rate": 1.7197446081982623e-06, "loss": 0.0713, "num_input_tokens_seen": 44120032, "step": 65460 }, { "epoch": 1.5993208413749298, "grad_norm": 17.329883575439453, "learning_rate": 1.719685402551401e-06, "loss": 0.07, "num_input_tokens_seen": 44123872, "step": 65465 }, { "epoch": 1.599442992206777, "grad_norm": 473.49810791015625, "learning_rate": 1.7196261916707947e-06, "loss": 0.0373, "num_input_tokens_seen": 44127136, "step": 65470 }, { "epoch": 1.5995651430386242, "grad_norm": 0.20757238566875458, "learning_rate": 1.7195669755568727e-06, "loss": 0.0014, "num_input_tokens_seen": 44130144, "step": 65475 }, { "epoch": 1.5996872938704714, "grad_norm": 0.28598150610923767, "learning_rate": 1.7195077542100663e-06, "loss": 0.0558, "num_input_tokens_seen": 44133024, "step": 65480 }, { "epoch": 1.5998094447023186, "grad_norm": 0.23164817690849304, "learning_rate": 1.7194485276308057e-06, "loss": 0.1076, "num_input_tokens_seen": 44136224, "step": 65485 }, { "epoch": 1.5999315955341658, "grad_norm": 47.41874694824219, "learning_rate": 1.7193892958195222e-06, "loss": 0.1035, "num_input_tokens_seen": 44139552, "step": 65490 }, { "epoch": 1.6000537463660127, "grad_norm": 186.92962646484375, "learning_rate": 1.719330058776646e-06, "loss": 0.0076, "num_input_tokens_seen": 44142816, "step": 65495 }, { "epoch": 1.60017589719786, "grad_norm": 0.5157283544540405, "learning_rate": 1.7192708165026084e-06, "loss": 0.0379, "num_input_tokens_seen": 44146080, "step": 65500 }, { "epoch": 1.600298048029707, "grad_norm": 14.778525352478027, "learning_rate": 1.7192115689978398e-06, "loss": 0.1149, "num_input_tokens_seen": 44149728, "step": 65505 }, { "epoch": 1.6004201988615543, "grad_norm": 17.854949951171875, "learning_rate": 1.7191523162627712e-06, "loss": 0.1016, "num_input_tokens_seen": 44153312, "step": 65510 }, { "epoch": 1.6005423496934013, "grad_norm": 0.2640315592288971, "learning_rate": 1.7190930582978335e-06, "loss": 0.001, "num_input_tokens_seen": 44156960, "step": 65515 }, { "epoch": 1.6006645005252484, "grad_norm": 13.823732376098633, "learning_rate": 1.7190337951034577e-06, "loss": 0.1725, "num_input_tokens_seen": 44160416, "step": 65520 }, { "epoch": 1.6007866513570956, "grad_norm": 0.19923101365566254, "learning_rate": 1.7189745266800748e-06, "loss": 0.098, "num_input_tokens_seen": 44163488, "step": 65525 }, { "epoch": 1.6009088021889428, "grad_norm": 0.05441410467028618, "learning_rate": 1.718915253028116e-06, "loss": 0.1148, "num_input_tokens_seen": 44166816, "step": 65530 }, { "epoch": 1.60103095302079, "grad_norm": 0.20565907657146454, "learning_rate": 1.7188559741480117e-06, "loss": 0.0477, "num_input_tokens_seen": 44170720, "step": 65535 }, { "epoch": 1.6011531038526372, "grad_norm": 0.03282782807946205, "learning_rate": 1.7187966900401936e-06, "loss": 0.0029, "num_input_tokens_seen": 44174112, "step": 65540 }, { "epoch": 1.6012752546844844, "grad_norm": 0.2828992009162903, "learning_rate": 1.7187374007050926e-06, "loss": 0.0929, "num_input_tokens_seen": 44177376, "step": 65545 }, { "epoch": 1.6013974055163316, "grad_norm": 0.222542405128479, "learning_rate": 1.7186781061431398e-06, "loss": 0.0586, "num_input_tokens_seen": 44180512, "step": 65550 }, { "epoch": 1.6015195563481788, "grad_norm": 8.429993629455566, "learning_rate": 1.7186188063547666e-06, "loss": 0.1894, "num_input_tokens_seen": 44184032, "step": 65555 }, { "epoch": 1.601641707180026, "grad_norm": 13.292488098144531, "learning_rate": 1.7185595013404044e-06, "loss": 0.0665, "num_input_tokens_seen": 44187168, "step": 65560 }, { "epoch": 1.6017638580118732, "grad_norm": 1.3764312267303467, "learning_rate": 1.718500191100484e-06, "loss": 0.0818, "num_input_tokens_seen": 44190304, "step": 65565 }, { "epoch": 1.6018860088437203, "grad_norm": 8.841894149780273, "learning_rate": 1.718440875635437e-06, "loss": 0.1852, "num_input_tokens_seen": 44193632, "step": 65570 }, { "epoch": 1.6020081596755675, "grad_norm": 0.5281084775924683, "learning_rate": 1.7183815549456946e-06, "loss": 0.0989, "num_input_tokens_seen": 44196960, "step": 65575 }, { "epoch": 1.6021303105074145, "grad_norm": 18.164323806762695, "learning_rate": 1.7183222290316883e-06, "loss": 0.0721, "num_input_tokens_seen": 44200288, "step": 65580 }, { "epoch": 1.6022524613392617, "grad_norm": 24.348045349121094, "learning_rate": 1.7182628978938498e-06, "loss": 0.1025, "num_input_tokens_seen": 44203616, "step": 65585 }, { "epoch": 1.6023746121711089, "grad_norm": 50.18940734863281, "learning_rate": 1.71820356153261e-06, "loss": 0.0754, "num_input_tokens_seen": 44206624, "step": 65590 }, { "epoch": 1.602496763002956, "grad_norm": 36.11235809326172, "learning_rate": 1.7181442199484009e-06, "loss": 0.0047, "num_input_tokens_seen": 44209888, "step": 65595 }, { "epoch": 1.6026189138348033, "grad_norm": 20.994815826416016, "learning_rate": 1.7180848731416542e-06, "loss": 0.114, "num_input_tokens_seen": 44214176, "step": 65600 }, { "epoch": 1.6027410646666502, "grad_norm": 13.22746753692627, "learning_rate": 1.7180255211128007e-06, "loss": 0.0821, "num_input_tokens_seen": 44217376, "step": 65605 }, { "epoch": 1.6028632154984974, "grad_norm": 6.7888994216918945, "learning_rate": 1.7179661638622726e-06, "loss": 0.0539, "num_input_tokens_seen": 44220512, "step": 65610 }, { "epoch": 1.6029853663303446, "grad_norm": 11.641776084899902, "learning_rate": 1.7179068013905014e-06, "loss": 0.1417, "num_input_tokens_seen": 44223840, "step": 65615 }, { "epoch": 1.6031075171621918, "grad_norm": 19.492708206176758, "learning_rate": 1.717847433697919e-06, "loss": 0.0381, "num_input_tokens_seen": 44227168, "step": 65620 }, { "epoch": 1.603229667994039, "grad_norm": 0.5217481255531311, "learning_rate": 1.7177880607849568e-06, "loss": 0.1578, "num_input_tokens_seen": 44230752, "step": 65625 }, { "epoch": 1.6033518188258862, "grad_norm": 0.8416579365730286, "learning_rate": 1.717728682652047e-06, "loss": 0.09, "num_input_tokens_seen": 44233888, "step": 65630 }, { "epoch": 1.6034739696577334, "grad_norm": 8.32114315032959, "learning_rate": 1.717669299299621e-06, "loss": 0.004, "num_input_tokens_seen": 44237280, "step": 65635 }, { "epoch": 1.6035961204895806, "grad_norm": 0.1598159223794937, "learning_rate": 1.7176099107281106e-06, "loss": 0.1614, "num_input_tokens_seen": 44240288, "step": 65640 }, { "epoch": 1.6037182713214277, "grad_norm": 1.3760980367660522, "learning_rate": 1.7175505169379483e-06, "loss": 0.0475, "num_input_tokens_seen": 44243936, "step": 65645 }, { "epoch": 1.603840422153275, "grad_norm": 1.223433494567871, "learning_rate": 1.7174911179295654e-06, "loss": 0.0748, "num_input_tokens_seen": 44247200, "step": 65650 }, { "epoch": 1.6039625729851221, "grad_norm": 0.18195028603076935, "learning_rate": 1.7174317137033944e-06, "loss": 0.0416, "num_input_tokens_seen": 44250720, "step": 65655 }, { "epoch": 1.6040847238169693, "grad_norm": 0.07075879722833633, "learning_rate": 1.7173723042598667e-06, "loss": 0.0533, "num_input_tokens_seen": 44253792, "step": 65660 }, { "epoch": 1.6042068746488165, "grad_norm": 0.3297138214111328, "learning_rate": 1.7173128895994148e-06, "loss": 0.0099, "num_input_tokens_seen": 44257184, "step": 65665 }, { "epoch": 1.6043290254806635, "grad_norm": 1.093284010887146, "learning_rate": 1.7172534697224708e-06, "loss": 0.2448, "num_input_tokens_seen": 44260640, "step": 65670 }, { "epoch": 1.6044511763125107, "grad_norm": 0.09150218218564987, "learning_rate": 1.7171940446294664e-06, "loss": 0.0923, "num_input_tokens_seen": 44264032, "step": 65675 }, { "epoch": 1.6045733271443579, "grad_norm": 0.9866107106208801, "learning_rate": 1.717134614320834e-06, "loss": 0.0069, "num_input_tokens_seen": 44267360, "step": 65680 }, { "epoch": 1.604695477976205, "grad_norm": 0.14237858355045319, "learning_rate": 1.717075178797006e-06, "loss": 0.1055, "num_input_tokens_seen": 44270368, "step": 65685 }, { "epoch": 1.6048176288080522, "grad_norm": 0.20932254195213318, "learning_rate": 1.7170157380584143e-06, "loss": 0.0554, "num_input_tokens_seen": 44273504, "step": 65690 }, { "epoch": 1.6049397796398992, "grad_norm": 0.7047435641288757, "learning_rate": 1.7169562921054913e-06, "loss": 0.0332, "num_input_tokens_seen": 44277024, "step": 65695 }, { "epoch": 1.6050619304717464, "grad_norm": 46.69514465332031, "learning_rate": 1.716896840938669e-06, "loss": 0.0248, "num_input_tokens_seen": 44280224, "step": 65700 }, { "epoch": 1.6051840813035936, "grad_norm": 0.255687415599823, "learning_rate": 1.7168373845583805e-06, "loss": 0.0531, "num_input_tokens_seen": 44283680, "step": 65705 }, { "epoch": 1.6053062321354408, "grad_norm": 34.751155853271484, "learning_rate": 1.7167779229650576e-06, "loss": 0.0277, "num_input_tokens_seen": 44287456, "step": 65710 }, { "epoch": 1.605428382967288, "grad_norm": 0.03389814496040344, "learning_rate": 1.7167184561591328e-06, "loss": 0.0948, "num_input_tokens_seen": 44291360, "step": 65715 }, { "epoch": 1.6055505337991351, "grad_norm": 9.623665809631348, "learning_rate": 1.7166589841410387e-06, "loss": 0.1253, "num_input_tokens_seen": 44294816, "step": 65720 }, { "epoch": 1.6056726846309823, "grad_norm": 0.047234319150447845, "learning_rate": 1.7165995069112077e-06, "loss": 0.093, "num_input_tokens_seen": 44298016, "step": 65725 }, { "epoch": 1.6057948354628295, "grad_norm": 34.660728454589844, "learning_rate": 1.7165400244700723e-06, "loss": 0.0925, "num_input_tokens_seen": 44301344, "step": 65730 }, { "epoch": 1.6059169862946767, "grad_norm": 0.3465474843978882, "learning_rate": 1.7164805368180652e-06, "loss": 0.0401, "num_input_tokens_seen": 44304992, "step": 65735 }, { "epoch": 1.606039137126524, "grad_norm": 0.5020141005516052, "learning_rate": 1.7164210439556187e-06, "loss": 0.0368, "num_input_tokens_seen": 44308192, "step": 65740 }, { "epoch": 1.606161287958371, "grad_norm": 25.945310592651367, "learning_rate": 1.716361545883166e-06, "loss": 0.1424, "num_input_tokens_seen": 44311712, "step": 65745 }, { "epoch": 1.6062834387902183, "grad_norm": 0.5995503067970276, "learning_rate": 1.7163020426011393e-06, "loss": 0.1523, "num_input_tokens_seen": 44315232, "step": 65750 }, { "epoch": 1.6064055896220655, "grad_norm": 0.045718465000391006, "learning_rate": 1.7162425341099715e-06, "loss": 0.1506, "num_input_tokens_seen": 44318560, "step": 65755 }, { "epoch": 1.6065277404539124, "grad_norm": 0.055583883076906204, "learning_rate": 1.7161830204100952e-06, "loss": 0.0675, "num_input_tokens_seen": 44322144, "step": 65760 }, { "epoch": 1.6066498912857596, "grad_norm": 80.08256530761719, "learning_rate": 1.7161235015019435e-06, "loss": 0.1056, "num_input_tokens_seen": 44325472, "step": 65765 }, { "epoch": 1.6067720421176068, "grad_norm": 16.74420166015625, "learning_rate": 1.7160639773859491e-06, "loss": 0.1892, "num_input_tokens_seen": 44328672, "step": 65770 }, { "epoch": 1.606894192949454, "grad_norm": 38.66650390625, "learning_rate": 1.7160044480625447e-06, "loss": 0.1966, "num_input_tokens_seen": 44332192, "step": 65775 }, { "epoch": 1.6070163437813012, "grad_norm": 0.3365372121334076, "learning_rate": 1.7159449135321636e-06, "loss": 0.0466, "num_input_tokens_seen": 44336352, "step": 65780 }, { "epoch": 1.6071384946131482, "grad_norm": 0.8096862435340881, "learning_rate": 1.7158853737952383e-06, "loss": 0.092, "num_input_tokens_seen": 44339616, "step": 65785 }, { "epoch": 1.6072606454449954, "grad_norm": 25.179733276367188, "learning_rate": 1.715825828852202e-06, "loss": 0.1559, "num_input_tokens_seen": 44342624, "step": 65790 }, { "epoch": 1.6073827962768426, "grad_norm": 0.2534705102443695, "learning_rate": 1.715766278703488e-06, "loss": 0.0751, "num_input_tokens_seen": 44345376, "step": 65795 }, { "epoch": 1.6075049471086897, "grad_norm": 0.39350154995918274, "learning_rate": 1.7157067233495289e-06, "loss": 0.0654, "num_input_tokens_seen": 44348448, "step": 65800 }, { "epoch": 1.607627097940537, "grad_norm": 117.36756896972656, "learning_rate": 1.715647162790758e-06, "loss": 0.0446, "num_input_tokens_seen": 44352032, "step": 65805 }, { "epoch": 1.6077492487723841, "grad_norm": 1.1069661378860474, "learning_rate": 1.7155875970276086e-06, "loss": 0.0896, "num_input_tokens_seen": 44355104, "step": 65810 }, { "epoch": 1.6078713996042313, "grad_norm": 17.73491096496582, "learning_rate": 1.7155280260605137e-06, "loss": 0.0153, "num_input_tokens_seen": 44358240, "step": 65815 }, { "epoch": 1.6079935504360785, "grad_norm": 0.22711671888828278, "learning_rate": 1.7154684498899063e-06, "loss": 0.044, "num_input_tokens_seen": 44361824, "step": 65820 }, { "epoch": 1.6081157012679257, "grad_norm": 0.28996729850769043, "learning_rate": 1.7154088685162203e-06, "loss": 0.0039, "num_input_tokens_seen": 44365536, "step": 65825 }, { "epoch": 1.6082378520997729, "grad_norm": 1.2250621318817139, "learning_rate": 1.7153492819398881e-06, "loss": 0.0479, "num_input_tokens_seen": 44368864, "step": 65830 }, { "epoch": 1.60836000293162, "grad_norm": 0.9239709973335266, "learning_rate": 1.7152896901613439e-06, "loss": 0.0011, "num_input_tokens_seen": 44372448, "step": 65835 }, { "epoch": 1.6084821537634673, "grad_norm": 1.7483092546463013, "learning_rate": 1.7152300931810206e-06, "loss": 0.0603, "num_input_tokens_seen": 44375392, "step": 65840 }, { "epoch": 1.6086043045953144, "grad_norm": 0.5381038784980774, "learning_rate": 1.7151704909993515e-06, "loss": 0.0437, "num_input_tokens_seen": 44378464, "step": 65845 }, { "epoch": 1.6087264554271614, "grad_norm": 0.05454336851835251, "learning_rate": 1.7151108836167705e-06, "loss": 0.1134, "num_input_tokens_seen": 44381856, "step": 65850 }, { "epoch": 1.6088486062590086, "grad_norm": 0.3915286362171173, "learning_rate": 1.7150512710337105e-06, "loss": 0.0011, "num_input_tokens_seen": 44385312, "step": 65855 }, { "epoch": 1.6089707570908558, "grad_norm": 26.920024871826172, "learning_rate": 1.7149916532506055e-06, "loss": 0.0016, "num_input_tokens_seen": 44388320, "step": 65860 }, { "epoch": 1.609092907922703, "grad_norm": 3.6959805488586426, "learning_rate": 1.7149320302678892e-06, "loss": 0.173, "num_input_tokens_seen": 44391968, "step": 65865 }, { "epoch": 1.60921505875455, "grad_norm": 0.21909748017787933, "learning_rate": 1.7148724020859943e-06, "loss": 0.0882, "num_input_tokens_seen": 44395296, "step": 65870 }, { "epoch": 1.6093372095863971, "grad_norm": 23.31797218322754, "learning_rate": 1.7148127687053553e-06, "loss": 0.1272, "num_input_tokens_seen": 44398624, "step": 65875 }, { "epoch": 1.6094593604182443, "grad_norm": 37.92839813232422, "learning_rate": 1.7147531301264056e-06, "loss": 0.1108, "num_input_tokens_seen": 44402016, "step": 65880 }, { "epoch": 1.6095815112500915, "grad_norm": 0.0698784664273262, "learning_rate": 1.7146934863495787e-06, "loss": 0.0632, "num_input_tokens_seen": 44405664, "step": 65885 }, { "epoch": 1.6097036620819387, "grad_norm": 0.12959900498390198, "learning_rate": 1.714633837375309e-06, "loss": 0.1824, "num_input_tokens_seen": 44409056, "step": 65890 }, { "epoch": 1.609825812913786, "grad_norm": 84.17263793945312, "learning_rate": 1.7145741832040294e-06, "loss": 0.0606, "num_input_tokens_seen": 44412384, "step": 65895 }, { "epoch": 1.609947963745633, "grad_norm": 10.694718360900879, "learning_rate": 1.7145145238361743e-06, "loss": 0.2215, "num_input_tokens_seen": 44415648, "step": 65900 }, { "epoch": 1.6100701145774803, "grad_norm": 0.30155470967292786, "learning_rate": 1.7144548592721772e-06, "loss": 0.0994, "num_input_tokens_seen": 44418848, "step": 65905 }, { "epoch": 1.6101922654093275, "grad_norm": 0.2619548439979553, "learning_rate": 1.7143951895124724e-06, "loss": 0.091, "num_input_tokens_seen": 44421664, "step": 65910 }, { "epoch": 1.6103144162411747, "grad_norm": 0.08416949957609177, "learning_rate": 1.714335514557494e-06, "loss": 0.0934, "num_input_tokens_seen": 44425440, "step": 65915 }, { "epoch": 1.6104365670730219, "grad_norm": 0.12342800199985504, "learning_rate": 1.714275834407675e-06, "loss": 0.0027, "num_input_tokens_seen": 44429152, "step": 65920 }, { "epoch": 1.610558717904869, "grad_norm": 38.95140075683594, "learning_rate": 1.71421614906345e-06, "loss": 0.0914, "num_input_tokens_seen": 44432416, "step": 65925 }, { "epoch": 1.6106808687367162, "grad_norm": 0.13258185982704163, "learning_rate": 1.7141564585252534e-06, "loss": 0.0421, "num_input_tokens_seen": 44435616, "step": 65930 }, { "epoch": 1.6108030195685634, "grad_norm": 11.2254638671875, "learning_rate": 1.714096762793519e-06, "loss": 0.1913, "num_input_tokens_seen": 44438752, "step": 65935 }, { "epoch": 1.6109251704004104, "grad_norm": 252.81646728515625, "learning_rate": 1.7140370618686807e-06, "loss": 0.0242, "num_input_tokens_seen": 44442208, "step": 65940 }, { "epoch": 1.6110473212322576, "grad_norm": 25.23392677307129, "learning_rate": 1.7139773557511727e-06, "loss": 0.1016, "num_input_tokens_seen": 44445664, "step": 65945 }, { "epoch": 1.6111694720641048, "grad_norm": 8.368292808532715, "learning_rate": 1.7139176444414296e-06, "loss": 0.05, "num_input_tokens_seen": 44449184, "step": 65950 }, { "epoch": 1.611291622895952, "grad_norm": 59.63330078125, "learning_rate": 1.7138579279398853e-06, "loss": 0.1257, "num_input_tokens_seen": 44453088, "step": 65955 }, { "epoch": 1.611413773727799, "grad_norm": 0.263844758272171, "learning_rate": 1.7137982062469737e-06, "loss": 0.0661, "num_input_tokens_seen": 44456544, "step": 65960 }, { "epoch": 1.6115359245596461, "grad_norm": 0.11820437759160995, "learning_rate": 1.7137384793631302e-06, "loss": 0.1086, "num_input_tokens_seen": 44460000, "step": 65965 }, { "epoch": 1.6116580753914933, "grad_norm": 0.3315877914428711, "learning_rate": 1.7136787472887884e-06, "loss": 0.0199, "num_input_tokens_seen": 44463904, "step": 65970 }, { "epoch": 1.6117802262233405, "grad_norm": 10.633949279785156, "learning_rate": 1.7136190100243826e-06, "loss": 0.154, "num_input_tokens_seen": 44467040, "step": 65975 }, { "epoch": 1.6119023770551877, "grad_norm": 27.93235969543457, "learning_rate": 1.7135592675703475e-06, "loss": 0.0465, "num_input_tokens_seen": 44469984, "step": 65980 }, { "epoch": 1.6120245278870349, "grad_norm": 7.1234235763549805, "learning_rate": 1.7134995199271174e-06, "loss": 0.1811, "num_input_tokens_seen": 44473248, "step": 65985 }, { "epoch": 1.612146678718882, "grad_norm": 0.2649827301502228, "learning_rate": 1.7134397670951268e-06, "loss": 0.0015, "num_input_tokens_seen": 44476640, "step": 65990 }, { "epoch": 1.6122688295507293, "grad_norm": 49.69743347167969, "learning_rate": 1.7133800090748106e-06, "loss": 0.1421, "num_input_tokens_seen": 44479520, "step": 65995 }, { "epoch": 1.6123909803825764, "grad_norm": 0.11934787780046463, "learning_rate": 1.713320245866603e-06, "loss": 0.1589, "num_input_tokens_seen": 44482592, "step": 66000 }, { "epoch": 1.6125131312144236, "grad_norm": 81.90103912353516, "learning_rate": 1.7132604774709385e-06, "loss": 0.2063, "num_input_tokens_seen": 44485984, "step": 66005 }, { "epoch": 1.6126352820462708, "grad_norm": 1.1120253801345825, "learning_rate": 1.7132007038882522e-06, "loss": 0.0966, "num_input_tokens_seen": 44489184, "step": 66010 }, { "epoch": 1.612757432878118, "grad_norm": 18.164979934692383, "learning_rate": 1.7131409251189783e-06, "loss": 0.0427, "num_input_tokens_seen": 44492704, "step": 66015 }, { "epoch": 1.6128795837099652, "grad_norm": 0.24019227921962738, "learning_rate": 1.7130811411635522e-06, "loss": 0.0581, "num_input_tokens_seen": 44495904, "step": 66020 }, { "epoch": 1.6130017345418124, "grad_norm": 0.2978065013885498, "learning_rate": 1.713021352022408e-06, "loss": 0.004, "num_input_tokens_seen": 44499616, "step": 66025 }, { "epoch": 1.6131238853736594, "grad_norm": 19.480119705200195, "learning_rate": 1.7129615576959804e-06, "loss": 0.1521, "num_input_tokens_seen": 44503008, "step": 66030 }, { "epoch": 1.6132460362055066, "grad_norm": 13.071788787841797, "learning_rate": 1.7129017581847052e-06, "loss": 0.1051, "num_input_tokens_seen": 44506208, "step": 66035 }, { "epoch": 1.6133681870373537, "grad_norm": 0.3018631637096405, "learning_rate": 1.7128419534890162e-06, "loss": 0.0537, "num_input_tokens_seen": 44509216, "step": 66040 }, { "epoch": 1.613490337869201, "grad_norm": 1.6079626083374023, "learning_rate": 1.712782143609349e-06, "loss": 0.0643, "num_input_tokens_seen": 44512800, "step": 66045 }, { "epoch": 1.613612488701048, "grad_norm": 36.73057174682617, "learning_rate": 1.7127223285461385e-06, "loss": 0.1507, "num_input_tokens_seen": 44516640, "step": 66050 }, { "epoch": 1.613734639532895, "grad_norm": 1.0381956100463867, "learning_rate": 1.7126625082998195e-06, "loss": 0.0514, "num_input_tokens_seen": 44520160, "step": 66055 }, { "epoch": 1.6138567903647423, "grad_norm": 18.15398597717285, "learning_rate": 1.7126026828708266e-06, "loss": 0.1484, "num_input_tokens_seen": 44523360, "step": 66060 }, { "epoch": 1.6139789411965895, "grad_norm": 0.37995773553848267, "learning_rate": 1.7125428522595956e-06, "loss": 0.0504, "num_input_tokens_seen": 44527264, "step": 66065 }, { "epoch": 1.6141010920284367, "grad_norm": 13.00422191619873, "learning_rate": 1.7124830164665616e-06, "loss": 0.0785, "num_input_tokens_seen": 44530528, "step": 66070 }, { "epoch": 1.6142232428602838, "grad_norm": 1.3020102977752686, "learning_rate": 1.7124231754921592e-06, "loss": 0.1119, "num_input_tokens_seen": 44533344, "step": 66075 }, { "epoch": 1.614345393692131, "grad_norm": 0.14015237987041473, "learning_rate": 1.7123633293368239e-06, "loss": 0.0351, "num_input_tokens_seen": 44536416, "step": 66080 }, { "epoch": 1.6144675445239782, "grad_norm": 0.17543506622314453, "learning_rate": 1.7123034780009906e-06, "loss": 0.0275, "num_input_tokens_seen": 44539552, "step": 66085 }, { "epoch": 1.6145896953558254, "grad_norm": 13.367767333984375, "learning_rate": 1.7122436214850952e-06, "loss": 0.0778, "num_input_tokens_seen": 44542880, "step": 66090 }, { "epoch": 1.6147118461876726, "grad_norm": 20.807472229003906, "learning_rate": 1.7121837597895725e-06, "loss": 0.0436, "num_input_tokens_seen": 44545760, "step": 66095 }, { "epoch": 1.6148339970195198, "grad_norm": 17.611282348632812, "learning_rate": 1.712123892914858e-06, "loss": 0.1088, "num_input_tokens_seen": 44548832, "step": 66100 }, { "epoch": 1.614956147851367, "grad_norm": 0.49443256855010986, "learning_rate": 1.712064020861387e-06, "loss": 0.0025, "num_input_tokens_seen": 44552224, "step": 66105 }, { "epoch": 1.6150782986832142, "grad_norm": 13.236289978027344, "learning_rate": 1.7120041436295947e-06, "loss": 0.1381, "num_input_tokens_seen": 44555424, "step": 66110 }, { "epoch": 1.6152004495150611, "grad_norm": 11.525460243225098, "learning_rate": 1.7119442612199169e-06, "loss": 0.066, "num_input_tokens_seen": 44558496, "step": 66115 }, { "epoch": 1.6153226003469083, "grad_norm": 20.939476013183594, "learning_rate": 1.7118843736327891e-06, "loss": 0.1894, "num_input_tokens_seen": 44561888, "step": 66120 }, { "epoch": 1.6154447511787555, "grad_norm": 0.06147155165672302, "learning_rate": 1.7118244808686464e-06, "loss": 0.2398, "num_input_tokens_seen": 44565216, "step": 66125 }, { "epoch": 1.6155669020106027, "grad_norm": 0.08817419409751892, "learning_rate": 1.7117645829279245e-06, "loss": 0.0939, "num_input_tokens_seen": 44568416, "step": 66130 }, { "epoch": 1.61568905284245, "grad_norm": 33.70576095581055, "learning_rate": 1.7117046798110594e-06, "loss": 0.0843, "num_input_tokens_seen": 44571360, "step": 66135 }, { "epoch": 1.6158112036742969, "grad_norm": 0.171207457780838, "learning_rate": 1.7116447715184866e-06, "loss": 0.0584, "num_input_tokens_seen": 44575008, "step": 66140 }, { "epoch": 1.615933354506144, "grad_norm": 0.44628486037254333, "learning_rate": 1.7115848580506413e-06, "loss": 0.0668, "num_input_tokens_seen": 44578848, "step": 66145 }, { "epoch": 1.6160555053379912, "grad_norm": 0.09966251999139786, "learning_rate": 1.7115249394079596e-06, "loss": 0.0433, "num_input_tokens_seen": 44582880, "step": 66150 }, { "epoch": 1.6161776561698384, "grad_norm": 0.0741414725780487, "learning_rate": 1.7114650155908771e-06, "loss": 0.0462, "num_input_tokens_seen": 44586144, "step": 66155 }, { "epoch": 1.6162998070016856, "grad_norm": 0.5622798800468445, "learning_rate": 1.71140508659983e-06, "loss": 0.0029, "num_input_tokens_seen": 44589792, "step": 66160 }, { "epoch": 1.6164219578335328, "grad_norm": 0.09299297630786896, "learning_rate": 1.7113451524352533e-06, "loss": 0.0599, "num_input_tokens_seen": 44593056, "step": 66165 }, { "epoch": 1.61654410866538, "grad_norm": 0.10339483618736267, "learning_rate": 1.7112852130975838e-06, "loss": 0.0373, "num_input_tokens_seen": 44596320, "step": 66170 }, { "epoch": 1.6166662594972272, "grad_norm": 102.27527618408203, "learning_rate": 1.7112252685872566e-06, "loss": 0.1563, "num_input_tokens_seen": 44599520, "step": 66175 }, { "epoch": 1.6167884103290744, "grad_norm": 0.0032880015205591917, "learning_rate": 1.7111653189047076e-06, "loss": 0.0564, "num_input_tokens_seen": 44603040, "step": 66180 }, { "epoch": 1.6169105611609216, "grad_norm": 0.06474665552377701, "learning_rate": 1.7111053640503737e-06, "loss": 0.0227, "num_input_tokens_seen": 44606880, "step": 66185 }, { "epoch": 1.6170327119927688, "grad_norm": 0.08754179626703262, "learning_rate": 1.71104540402469e-06, "loss": 0.0364, "num_input_tokens_seen": 44610464, "step": 66190 }, { "epoch": 1.617154862824616, "grad_norm": 35.25015640258789, "learning_rate": 1.7109854388280932e-06, "loss": 0.1687, "num_input_tokens_seen": 44613728, "step": 66195 }, { "epoch": 1.6172770136564631, "grad_norm": 0.07697348296642303, "learning_rate": 1.710925468461019e-06, "loss": 0.2382, "num_input_tokens_seen": 44617696, "step": 66200 }, { "epoch": 1.6173991644883101, "grad_norm": 10.768204689025879, "learning_rate": 1.7108654929239033e-06, "loss": 0.1017, "num_input_tokens_seen": 44621024, "step": 66205 }, { "epoch": 1.6175213153201573, "grad_norm": 0.8216606378555298, "learning_rate": 1.7108055122171825e-06, "loss": 0.1305, "num_input_tokens_seen": 44623968, "step": 66210 }, { "epoch": 1.6176434661520045, "grad_norm": 64.47852325439453, "learning_rate": 1.710745526341293e-06, "loss": 0.073, "num_input_tokens_seen": 44627232, "step": 66215 }, { "epoch": 1.6177656169838517, "grad_norm": 0.25741147994995117, "learning_rate": 1.710685535296671e-06, "loss": 0.0392, "num_input_tokens_seen": 44630432, "step": 66220 }, { "epoch": 1.6178877678156989, "grad_norm": 0.2642552852630615, "learning_rate": 1.7106255390837525e-06, "loss": 0.0633, "num_input_tokens_seen": 44634080, "step": 66225 }, { "epoch": 1.6180099186475458, "grad_norm": 103.3814697265625, "learning_rate": 1.710565537702974e-06, "loss": 0.1125, "num_input_tokens_seen": 44637472, "step": 66230 }, { "epoch": 1.618132069479393, "grad_norm": 18.73349952697754, "learning_rate": 1.7105055311547716e-06, "loss": 0.0341, "num_input_tokens_seen": 44640992, "step": 66235 }, { "epoch": 1.6182542203112402, "grad_norm": 16.76979637145996, "learning_rate": 1.7104455194395822e-06, "loss": 0.1151, "num_input_tokens_seen": 44644000, "step": 66240 }, { "epoch": 1.6183763711430874, "grad_norm": 24.473278045654297, "learning_rate": 1.7103855025578416e-06, "loss": 0.1159, "num_input_tokens_seen": 44648224, "step": 66245 }, { "epoch": 1.6184985219749346, "grad_norm": 24.458818435668945, "learning_rate": 1.7103254805099867e-06, "loss": 0.1734, "num_input_tokens_seen": 44651616, "step": 66250 }, { "epoch": 1.6186206728067818, "grad_norm": 89.10078430175781, "learning_rate": 1.7102654532964538e-06, "loss": 0.0259, "num_input_tokens_seen": 44655072, "step": 66255 }, { "epoch": 1.618742823638629, "grad_norm": 0.7390759587287903, "learning_rate": 1.7102054209176794e-06, "loss": 0.0601, "num_input_tokens_seen": 44658656, "step": 66260 }, { "epoch": 1.6188649744704762, "grad_norm": 0.2607508599758148, "learning_rate": 1.7101453833741005e-06, "loss": 0.0548, "num_input_tokens_seen": 44662304, "step": 66265 }, { "epoch": 1.6189871253023234, "grad_norm": 1.9803550243377686, "learning_rate": 1.710085340666153e-06, "loss": 0.0321, "num_input_tokens_seen": 44665312, "step": 66270 }, { "epoch": 1.6191092761341705, "grad_norm": 9.75775146484375, "learning_rate": 1.710025292794274e-06, "loss": 0.0417, "num_input_tokens_seen": 44669024, "step": 66275 }, { "epoch": 1.6192314269660177, "grad_norm": 0.05687793344259262, "learning_rate": 1.7099652397589002e-06, "loss": 0.0017, "num_input_tokens_seen": 44672160, "step": 66280 }, { "epoch": 1.619353577797865, "grad_norm": 10.78559684753418, "learning_rate": 1.7099051815604681e-06, "loss": 0.1311, "num_input_tokens_seen": 44675872, "step": 66285 }, { "epoch": 1.6194757286297121, "grad_norm": 0.29572921991348267, "learning_rate": 1.7098451181994147e-06, "loss": 0.1039, "num_input_tokens_seen": 44679840, "step": 66290 }, { "epoch": 1.619597879461559, "grad_norm": 0.10324651747941971, "learning_rate": 1.7097850496761764e-06, "loss": 0.2029, "num_input_tokens_seen": 44683232, "step": 66295 }, { "epoch": 1.6197200302934063, "grad_norm": 9.582807540893555, "learning_rate": 1.709724975991191e-06, "loss": 0.2378, "num_input_tokens_seen": 44686240, "step": 66300 }, { "epoch": 1.6198421811252535, "grad_norm": 0.16958528757095337, "learning_rate": 1.7096648971448938e-06, "loss": 0.0358, "num_input_tokens_seen": 44690272, "step": 66305 }, { "epoch": 1.6199643319571007, "grad_norm": 57.915733337402344, "learning_rate": 1.709604813137723e-06, "loss": 0.0322, "num_input_tokens_seen": 44693344, "step": 66310 }, { "epoch": 1.6200864827889478, "grad_norm": 1.0083634853363037, "learning_rate": 1.7095447239701153e-06, "loss": 0.0937, "num_input_tokens_seen": 44696928, "step": 66315 }, { "epoch": 1.6202086336207948, "grad_norm": 0.18606296181678772, "learning_rate": 1.7094846296425072e-06, "loss": 0.0822, "num_input_tokens_seen": 44700512, "step": 66320 }, { "epoch": 1.620330784452642, "grad_norm": 0.2539637088775635, "learning_rate": 1.7094245301553362e-06, "loss": 0.045, "num_input_tokens_seen": 44703392, "step": 66325 }, { "epoch": 1.6204529352844892, "grad_norm": 11.006232261657715, "learning_rate": 1.7093644255090394e-06, "loss": 0.0862, "num_input_tokens_seen": 44706720, "step": 66330 }, { "epoch": 1.6205750861163364, "grad_norm": 0.09770510345697403, "learning_rate": 1.7093043157040533e-06, "loss": 0.0469, "num_input_tokens_seen": 44709856, "step": 66335 }, { "epoch": 1.6206972369481836, "grad_norm": 0.7676712870597839, "learning_rate": 1.709244200740816e-06, "loss": 0.0538, "num_input_tokens_seen": 44713184, "step": 66340 }, { "epoch": 1.6208193877800308, "grad_norm": 1.2458648681640625, "learning_rate": 1.7091840806197636e-06, "loss": 0.0821, "num_input_tokens_seen": 44716256, "step": 66345 }, { "epoch": 1.620941538611878, "grad_norm": 0.07486743479967117, "learning_rate": 1.709123955341334e-06, "loss": 0.0016, "num_input_tokens_seen": 44719904, "step": 66350 }, { "epoch": 1.6210636894437251, "grad_norm": 0.9548220634460449, "learning_rate": 1.7090638249059641e-06, "loss": 0.0014, "num_input_tokens_seen": 44723232, "step": 66355 }, { "epoch": 1.6211858402755723, "grad_norm": 1.062659502029419, "learning_rate": 1.7090036893140915e-06, "loss": 0.0441, "num_input_tokens_seen": 44727072, "step": 66360 }, { "epoch": 1.6213079911074195, "grad_norm": 0.13680623471736908, "learning_rate": 1.7089435485661535e-06, "loss": 0.0897, "num_input_tokens_seen": 44730720, "step": 66365 }, { "epoch": 1.6214301419392667, "grad_norm": 6.874294281005859, "learning_rate": 1.7088834026625869e-06, "loss": 0.0022, "num_input_tokens_seen": 44733920, "step": 66370 }, { "epoch": 1.621552292771114, "grad_norm": 253.76951599121094, "learning_rate": 1.70882325160383e-06, "loss": 0.1949, "num_input_tokens_seen": 44737312, "step": 66375 }, { "epoch": 1.621674443602961, "grad_norm": 0.1610388159751892, "learning_rate": 1.7087630953903197e-06, "loss": 0.1604, "num_input_tokens_seen": 44740704, "step": 66380 }, { "epoch": 1.621796594434808, "grad_norm": 11.13759708404541, "learning_rate": 1.7087029340224933e-06, "loss": 0.2189, "num_input_tokens_seen": 44744288, "step": 66385 }, { "epoch": 1.6219187452666552, "grad_norm": 29.627588272094727, "learning_rate": 1.7086427675007886e-06, "loss": 0.0373, "num_input_tokens_seen": 44747808, "step": 66390 }, { "epoch": 1.6220408960985024, "grad_norm": 0.06932839751243591, "learning_rate": 1.7085825958256431e-06, "loss": 0.04, "num_input_tokens_seen": 44751136, "step": 66395 }, { "epoch": 1.6221630469303496, "grad_norm": 0.04329407215118408, "learning_rate": 1.7085224189974944e-06, "loss": 0.0008, "num_input_tokens_seen": 44754464, "step": 66400 }, { "epoch": 1.6222851977621966, "grad_norm": 0.03631928190588951, "learning_rate": 1.7084622370167803e-06, "loss": 0.0907, "num_input_tokens_seen": 44757920, "step": 66405 }, { "epoch": 1.6224073485940438, "grad_norm": 0.1588095873594284, "learning_rate": 1.708402049883938e-06, "loss": 0.1325, "num_input_tokens_seen": 44761504, "step": 66410 }, { "epoch": 1.622529499425891, "grad_norm": 0.6479094624519348, "learning_rate": 1.7083418575994055e-06, "loss": 0.0319, "num_input_tokens_seen": 44764960, "step": 66415 }, { "epoch": 1.6226516502577382, "grad_norm": 0.24373003840446472, "learning_rate": 1.7082816601636205e-06, "loss": 0.0828, "num_input_tokens_seen": 44768224, "step": 66420 }, { "epoch": 1.6227738010895854, "grad_norm": 0.11970167607069016, "learning_rate": 1.7082214575770209e-06, "loss": 0.0197, "num_input_tokens_seen": 44771488, "step": 66425 }, { "epoch": 1.6228959519214325, "grad_norm": 0.31107163429260254, "learning_rate": 1.7081612498400442e-06, "loss": 0.0376, "num_input_tokens_seen": 44774432, "step": 66430 }, { "epoch": 1.6230181027532797, "grad_norm": 0.07137488573789597, "learning_rate": 1.7081010369531286e-06, "loss": 0.1097, "num_input_tokens_seen": 44777824, "step": 66435 }, { "epoch": 1.623140253585127, "grad_norm": 0.04188014194369316, "learning_rate": 1.7080408189167116e-06, "loss": 0.0008, "num_input_tokens_seen": 44781152, "step": 66440 }, { "epoch": 1.6232624044169741, "grad_norm": 0.25597256422042847, "learning_rate": 1.7079805957312315e-06, "loss": 0.0708, "num_input_tokens_seen": 44784224, "step": 66445 }, { "epoch": 1.6233845552488213, "grad_norm": 11.878859519958496, "learning_rate": 1.707920367397126e-06, "loss": 0.0751, "num_input_tokens_seen": 44787808, "step": 66450 }, { "epoch": 1.6235067060806685, "grad_norm": 25.821226119995117, "learning_rate": 1.7078601339148332e-06, "loss": 0.0875, "num_input_tokens_seen": 44791072, "step": 66455 }, { "epoch": 1.6236288569125157, "grad_norm": 1.8043272495269775, "learning_rate": 1.7077998952847912e-06, "loss": 0.0368, "num_input_tokens_seen": 44794080, "step": 66460 }, { "epoch": 1.6237510077443629, "grad_norm": 137.92518615722656, "learning_rate": 1.7077396515074379e-06, "loss": 0.124, "num_input_tokens_seen": 44796960, "step": 66465 }, { "epoch": 1.62387315857621, "grad_norm": 15.676214218139648, "learning_rate": 1.7076794025832112e-06, "loss": 0.121, "num_input_tokens_seen": 44800160, "step": 66470 }, { "epoch": 1.623995309408057, "grad_norm": 0.04584408178925514, "learning_rate": 1.70761914851255e-06, "loss": 0.1473, "num_input_tokens_seen": 44803488, "step": 66475 }, { "epoch": 1.6241174602399042, "grad_norm": 0.16386158764362335, "learning_rate": 1.7075588892958917e-06, "loss": 0.0533, "num_input_tokens_seen": 44807008, "step": 66480 }, { "epoch": 1.6242396110717514, "grad_norm": 41.72587966918945, "learning_rate": 1.7074986249336751e-06, "loss": 0.0263, "num_input_tokens_seen": 44810336, "step": 66485 }, { "epoch": 1.6243617619035986, "grad_norm": 33.555728912353516, "learning_rate": 1.707438355426338e-06, "loss": 0.1167, "num_input_tokens_seen": 44813152, "step": 66490 }, { "epoch": 1.6244839127354456, "grad_norm": 0.2910143733024597, "learning_rate": 1.707378080774319e-06, "loss": 0.115, "num_input_tokens_seen": 44816992, "step": 66495 }, { "epoch": 1.6246060635672928, "grad_norm": 12.734646797180176, "learning_rate": 1.7073178009780564e-06, "loss": 0.156, "num_input_tokens_seen": 44820064, "step": 66500 }, { "epoch": 1.62472821439914, "grad_norm": 0.8806232810020447, "learning_rate": 1.7072575160379886e-06, "loss": 0.0938, "num_input_tokens_seen": 44823776, "step": 66505 }, { "epoch": 1.6248503652309871, "grad_norm": 0.18991605937480927, "learning_rate": 1.7071972259545535e-06, "loss": 0.0759, "num_input_tokens_seen": 44827296, "step": 66510 }, { "epoch": 1.6249725160628343, "grad_norm": 1.1119245290756226, "learning_rate": 1.7071369307281903e-06, "loss": 0.0549, "num_input_tokens_seen": 44830304, "step": 66515 }, { "epoch": 1.6250946668946815, "grad_norm": 39.90916061401367, "learning_rate": 1.7070766303593369e-06, "loss": 0.0036, "num_input_tokens_seen": 44834144, "step": 66520 }, { "epoch": 1.6252168177265287, "grad_norm": 10.871047973632812, "learning_rate": 1.7070163248484323e-06, "loss": 0.296, "num_input_tokens_seen": 44837344, "step": 66525 }, { "epoch": 1.625338968558376, "grad_norm": 0.11879879981279373, "learning_rate": 1.706956014195915e-06, "loss": 0.0014, "num_input_tokens_seen": 44840800, "step": 66530 }, { "epoch": 1.625461119390223, "grad_norm": 0.1101454347372055, "learning_rate": 1.7068956984022229e-06, "loss": 0.1023, "num_input_tokens_seen": 44843616, "step": 66535 }, { "epoch": 1.6255832702220703, "grad_norm": 0.17406058311462402, "learning_rate": 1.7068353774677956e-06, "loss": 0.0053, "num_input_tokens_seen": 44846560, "step": 66540 }, { "epoch": 1.6257054210539175, "grad_norm": 63.51289749145508, "learning_rate": 1.706775051393071e-06, "loss": 0.1193, "num_input_tokens_seen": 44850016, "step": 66545 }, { "epoch": 1.6258275718857647, "grad_norm": 163.7349395751953, "learning_rate": 1.7067147201784882e-06, "loss": 0.0211, "num_input_tokens_seen": 44853536, "step": 66550 }, { "epoch": 1.6259497227176118, "grad_norm": 60.688262939453125, "learning_rate": 1.7066543838244857e-06, "loss": 0.0221, "num_input_tokens_seen": 44856928, "step": 66555 }, { "epoch": 1.626071873549459, "grad_norm": 0.15369991958141327, "learning_rate": 1.7065940423315032e-06, "loss": 0.0409, "num_input_tokens_seen": 44860384, "step": 66560 }, { "epoch": 1.626194024381306, "grad_norm": 10.165790557861328, "learning_rate": 1.706533695699978e-06, "loss": 0.0624, "num_input_tokens_seen": 44863328, "step": 66565 }, { "epoch": 1.6263161752131532, "grad_norm": 0.4016942083835602, "learning_rate": 1.7064733439303497e-06, "loss": 0.0517, "num_input_tokens_seen": 44866656, "step": 66570 }, { "epoch": 1.6264383260450004, "grad_norm": 49.472816467285156, "learning_rate": 1.7064129870230576e-06, "loss": 0.0994, "num_input_tokens_seen": 44870368, "step": 66575 }, { "epoch": 1.6265604768768476, "grad_norm": 0.06103126332163811, "learning_rate": 1.7063526249785403e-06, "loss": 0.0459, "num_input_tokens_seen": 44873376, "step": 66580 }, { "epoch": 1.6266826277086945, "grad_norm": 0.10283923149108887, "learning_rate": 1.7062922577972366e-06, "loss": 0.0376, "num_input_tokens_seen": 44876704, "step": 66585 }, { "epoch": 1.6268047785405417, "grad_norm": 0.15717634558677673, "learning_rate": 1.7062318854795854e-06, "loss": 0.3196, "num_input_tokens_seen": 44880096, "step": 66590 }, { "epoch": 1.626926929372389, "grad_norm": 0.3138972222805023, "learning_rate": 1.7061715080260264e-06, "loss": 0.0331, "num_input_tokens_seen": 44883552, "step": 66595 }, { "epoch": 1.627049080204236, "grad_norm": 246.54254150390625, "learning_rate": 1.706111125436998e-06, "loss": 0.099, "num_input_tokens_seen": 44886816, "step": 66600 }, { "epoch": 1.6271712310360833, "grad_norm": 0.12112493067979813, "learning_rate": 1.7060507377129396e-06, "loss": 0.0374, "num_input_tokens_seen": 44890464, "step": 66605 }, { "epoch": 1.6272933818679305, "grad_norm": 0.10736341774463654, "learning_rate": 1.7059903448542903e-06, "loss": 0.0543, "num_input_tokens_seen": 44893728, "step": 66610 }, { "epoch": 1.6274155326997777, "grad_norm": 0.24470113217830658, "learning_rate": 1.7059299468614893e-06, "loss": 0.0874, "num_input_tokens_seen": 44897440, "step": 66615 }, { "epoch": 1.6275376835316249, "grad_norm": 0.12989413738250732, "learning_rate": 1.705869543734976e-06, "loss": 0.049, "num_input_tokens_seen": 44901344, "step": 66620 }, { "epoch": 1.627659834363472, "grad_norm": 0.07117544859647751, "learning_rate": 1.7058091354751895e-06, "loss": 0.0819, "num_input_tokens_seen": 44904608, "step": 66625 }, { "epoch": 1.6277819851953192, "grad_norm": 0.21030397713184357, "learning_rate": 1.705748722082569e-06, "loss": 0.0016, "num_input_tokens_seen": 44907936, "step": 66630 }, { "epoch": 1.6279041360271664, "grad_norm": 0.3971431255340576, "learning_rate": 1.7056883035575542e-06, "loss": 0.0496, "num_input_tokens_seen": 44911264, "step": 66635 }, { "epoch": 1.6280262868590136, "grad_norm": 0.25932618975639343, "learning_rate": 1.7056278799005841e-06, "loss": 0.0025, "num_input_tokens_seen": 44915104, "step": 66640 }, { "epoch": 1.6281484376908608, "grad_norm": 105.23665618896484, "learning_rate": 1.705567451112098e-06, "loss": 0.1403, "num_input_tokens_seen": 44918624, "step": 66645 }, { "epoch": 1.6282705885227078, "grad_norm": 0.11826230585575104, "learning_rate": 1.705507017192536e-06, "loss": 0.0961, "num_input_tokens_seen": 44922336, "step": 66650 }, { "epoch": 1.628392739354555, "grad_norm": 0.07961612194776535, "learning_rate": 1.7054465781423373e-06, "loss": 0.0438, "num_input_tokens_seen": 44925536, "step": 66655 }, { "epoch": 1.6285148901864022, "grad_norm": 8.724510192871094, "learning_rate": 1.7053861339619408e-06, "loss": 0.0817, "num_input_tokens_seen": 44928864, "step": 66660 }, { "epoch": 1.6286370410182494, "grad_norm": 0.1295159012079239, "learning_rate": 1.7053256846517874e-06, "loss": 0.0014, "num_input_tokens_seen": 44932000, "step": 66665 }, { "epoch": 1.6287591918500965, "grad_norm": 46.75648880004883, "learning_rate": 1.7052652302123152e-06, "loss": 0.1552, "num_input_tokens_seen": 44935456, "step": 66670 }, { "epoch": 1.6288813426819435, "grad_norm": 0.3630995750427246, "learning_rate": 1.7052047706439648e-06, "loss": 0.0643, "num_input_tokens_seen": 44938720, "step": 66675 }, { "epoch": 1.6290034935137907, "grad_norm": 0.10988431423902512, "learning_rate": 1.7051443059471758e-06, "loss": 0.1296, "num_input_tokens_seen": 44941792, "step": 66680 }, { "epoch": 1.629125644345638, "grad_norm": 28.34921646118164, "learning_rate": 1.7050838361223874e-06, "loss": 0.1599, "num_input_tokens_seen": 44945248, "step": 66685 }, { "epoch": 1.629247795177485, "grad_norm": 12.040345191955566, "learning_rate": 1.7050233611700399e-06, "loss": 0.1362, "num_input_tokens_seen": 44948256, "step": 66690 }, { "epoch": 1.6293699460093323, "grad_norm": 19.319721221923828, "learning_rate": 1.704962881090573e-06, "loss": 0.0916, "num_input_tokens_seen": 44951392, "step": 66695 }, { "epoch": 1.6294920968411795, "grad_norm": 80.89275360107422, "learning_rate": 1.7049023958844261e-06, "loss": 0.1602, "num_input_tokens_seen": 44954592, "step": 66700 }, { "epoch": 1.6296142476730267, "grad_norm": 20.608259201049805, "learning_rate": 1.7048419055520396e-06, "loss": 0.0779, "num_input_tokens_seen": 44957856, "step": 66705 }, { "epoch": 1.6297363985048738, "grad_norm": 133.53970336914062, "learning_rate": 1.704781410093853e-06, "loss": 0.0228, "num_input_tokens_seen": 44961312, "step": 66710 }, { "epoch": 1.629858549336721, "grad_norm": 0.08650583773851395, "learning_rate": 1.704720909510307e-06, "loss": 0.099, "num_input_tokens_seen": 44964512, "step": 66715 }, { "epoch": 1.6299807001685682, "grad_norm": 28.098222732543945, "learning_rate": 1.7046604038018404e-06, "loss": 0.0452, "num_input_tokens_seen": 44967968, "step": 66720 }, { "epoch": 1.6301028510004154, "grad_norm": 0.07196227461099625, "learning_rate": 1.704599892968894e-06, "loss": 0.0016, "num_input_tokens_seen": 44970912, "step": 66725 }, { "epoch": 1.6302250018322626, "grad_norm": 25.876453399658203, "learning_rate": 1.7045393770119075e-06, "loss": 0.1082, "num_input_tokens_seen": 44974240, "step": 66730 }, { "epoch": 1.6303471526641098, "grad_norm": 12.297942161560059, "learning_rate": 1.7044788559313214e-06, "loss": 0.1062, "num_input_tokens_seen": 44977504, "step": 66735 }, { "epoch": 1.6304693034959568, "grad_norm": 23.527793884277344, "learning_rate": 1.7044183297275753e-06, "loss": 0.1591, "num_input_tokens_seen": 44980704, "step": 66740 }, { "epoch": 1.630591454327804, "grad_norm": 17.326770782470703, "learning_rate": 1.7043577984011099e-06, "loss": 0.1346, "num_input_tokens_seen": 44983904, "step": 66745 }, { "epoch": 1.6307136051596511, "grad_norm": 0.08420751988887787, "learning_rate": 1.7042972619523651e-06, "loss": 0.1032, "num_input_tokens_seen": 44987296, "step": 66750 }, { "epoch": 1.6308357559914983, "grad_norm": 20.060701370239258, "learning_rate": 1.7042367203817812e-06, "loss": 0.1449, "num_input_tokens_seen": 44990688, "step": 66755 }, { "epoch": 1.6309579068233455, "grad_norm": 0.18126173317432404, "learning_rate": 1.7041761736897984e-06, "loss": 0.0012, "num_input_tokens_seen": 44994080, "step": 66760 }, { "epoch": 1.6310800576551925, "grad_norm": 16.160730361938477, "learning_rate": 1.7041156218768571e-06, "loss": 0.0847, "num_input_tokens_seen": 44997216, "step": 66765 }, { "epoch": 1.6312022084870397, "grad_norm": 11.004593849182129, "learning_rate": 1.7040550649433975e-06, "loss": 0.0459, "num_input_tokens_seen": 45000672, "step": 66770 }, { "epoch": 1.6313243593188869, "grad_norm": 38.77311325073242, "learning_rate": 1.70399450288986e-06, "loss": 0.0595, "num_input_tokens_seen": 45004000, "step": 66775 }, { "epoch": 1.631446510150734, "grad_norm": 11.385759353637695, "learning_rate": 1.7039339357166854e-06, "loss": 0.1191, "num_input_tokens_seen": 45008224, "step": 66780 }, { "epoch": 1.6315686609825812, "grad_norm": 29.47661590576172, "learning_rate": 1.703873363424314e-06, "loss": 0.1004, "num_input_tokens_seen": 45011232, "step": 66785 }, { "epoch": 1.6316908118144284, "grad_norm": 37.67548370361328, "learning_rate": 1.7038127860131859e-06, "loss": 0.2868, "num_input_tokens_seen": 45014880, "step": 66790 }, { "epoch": 1.6318129626462756, "grad_norm": 0.49903541803359985, "learning_rate": 1.7037522034837418e-06, "loss": 0.0023, "num_input_tokens_seen": 45018016, "step": 66795 }, { "epoch": 1.6319351134781228, "grad_norm": 0.042273178696632385, "learning_rate": 1.7036916158364227e-06, "loss": 0.0195, "num_input_tokens_seen": 45020960, "step": 66800 }, { "epoch": 1.63205726430997, "grad_norm": 0.08780739456415176, "learning_rate": 1.7036310230716686e-06, "loss": 0.0265, "num_input_tokens_seen": 45024032, "step": 66805 }, { "epoch": 1.6321794151418172, "grad_norm": 0.3247887194156647, "learning_rate": 1.7035704251899207e-06, "loss": 0.0013, "num_input_tokens_seen": 45027488, "step": 66810 }, { "epoch": 1.6323015659736644, "grad_norm": 25.774038314819336, "learning_rate": 1.7035098221916195e-06, "loss": 0.0521, "num_input_tokens_seen": 45030944, "step": 66815 }, { "epoch": 1.6324237168055116, "grad_norm": 0.09247536212205887, "learning_rate": 1.7034492140772057e-06, "loss": 0.0854, "num_input_tokens_seen": 45034208, "step": 66820 }, { "epoch": 1.6325458676373588, "grad_norm": 0.2177923172712326, "learning_rate": 1.7033886008471196e-06, "loss": 0.1261, "num_input_tokens_seen": 45037792, "step": 66825 }, { "epoch": 1.6326680184692057, "grad_norm": 0.06111787632107735, "learning_rate": 1.7033279825018026e-06, "loss": 0.0007, "num_input_tokens_seen": 45041184, "step": 66830 }, { "epoch": 1.632790169301053, "grad_norm": 18.811080932617188, "learning_rate": 1.7032673590416953e-06, "loss": 0.1032, "num_input_tokens_seen": 45044384, "step": 66835 }, { "epoch": 1.6329123201329, "grad_norm": 0.5662484169006348, "learning_rate": 1.7032067304672387e-06, "loss": 0.0113, "num_input_tokens_seen": 45047776, "step": 66840 }, { "epoch": 1.6330344709647473, "grad_norm": 0.045659150928258896, "learning_rate": 1.7031460967788735e-06, "loss": 0.0011, "num_input_tokens_seen": 45050976, "step": 66845 }, { "epoch": 1.6331566217965945, "grad_norm": 0.10969070345163345, "learning_rate": 1.7030854579770408e-06, "loss": 0.1239, "num_input_tokens_seen": 45053984, "step": 66850 }, { "epoch": 1.6332787726284415, "grad_norm": 1.7151886224746704, "learning_rate": 1.7030248140621816e-06, "loss": 0.0526, "num_input_tokens_seen": 45057312, "step": 66855 }, { "epoch": 1.6334009234602886, "grad_norm": 0.0635419562458992, "learning_rate": 1.7029641650347368e-06, "loss": 0.0331, "num_input_tokens_seen": 45060576, "step": 66860 }, { "epoch": 1.6335230742921358, "grad_norm": 27.04867172241211, "learning_rate": 1.7029035108951474e-06, "loss": 0.1633, "num_input_tokens_seen": 45063776, "step": 66865 }, { "epoch": 1.633645225123983, "grad_norm": 125.61565399169922, "learning_rate": 1.7028428516438549e-06, "loss": 0.0636, "num_input_tokens_seen": 45067104, "step": 66870 }, { "epoch": 1.6337673759558302, "grad_norm": 211.0738525390625, "learning_rate": 1.7027821872813002e-06, "loss": 0.1001, "num_input_tokens_seen": 45070112, "step": 66875 }, { "epoch": 1.6338895267876774, "grad_norm": 1.0476380586624146, "learning_rate": 1.7027215178079242e-06, "loss": 0.0053, "num_input_tokens_seen": 45073952, "step": 66880 }, { "epoch": 1.6340116776195246, "grad_norm": 3.8595800399780273, "learning_rate": 1.7026608432241683e-06, "loss": 0.111, "num_input_tokens_seen": 45077280, "step": 66885 }, { "epoch": 1.6341338284513718, "grad_norm": 0.18746982514858246, "learning_rate": 1.702600163530474e-06, "loss": 0.0501, "num_input_tokens_seen": 45080672, "step": 66890 }, { "epoch": 1.634255979283219, "grad_norm": 14.622883796691895, "learning_rate": 1.702539478727282e-06, "loss": 0.2267, "num_input_tokens_seen": 45083744, "step": 66895 }, { "epoch": 1.6343781301150662, "grad_norm": 14.971632957458496, "learning_rate": 1.7024787888150339e-06, "loss": 0.1211, "num_input_tokens_seen": 45086880, "step": 66900 }, { "epoch": 1.6345002809469134, "grad_norm": 13.314606666564941, "learning_rate": 1.7024180937941712e-06, "loss": 0.0809, "num_input_tokens_seen": 45090336, "step": 66905 }, { "epoch": 1.6346224317787605, "grad_norm": 0.21442019939422607, "learning_rate": 1.7023573936651355e-06, "loss": 0.0365, "num_input_tokens_seen": 45093792, "step": 66910 }, { "epoch": 1.6347445826106077, "grad_norm": 0.02547086589038372, "learning_rate": 1.7022966884283677e-06, "loss": 0.0009, "num_input_tokens_seen": 45096928, "step": 66915 }, { "epoch": 1.6348667334424547, "grad_norm": 0.12754671275615692, "learning_rate": 1.7022359780843095e-06, "loss": 0.0279, "num_input_tokens_seen": 45100000, "step": 66920 }, { "epoch": 1.634988884274302, "grad_norm": 0.4442298710346222, "learning_rate": 1.702175262633402e-06, "loss": 0.0012, "num_input_tokens_seen": 45103264, "step": 66925 }, { "epoch": 1.635111035106149, "grad_norm": 0.027086948975920677, "learning_rate": 1.7021145420760877e-06, "loss": 0.096, "num_input_tokens_seen": 45106720, "step": 66930 }, { "epoch": 1.6352331859379963, "grad_norm": 26.187314987182617, "learning_rate": 1.7020538164128074e-06, "loss": 0.2329, "num_input_tokens_seen": 45109920, "step": 66935 }, { "epoch": 1.6353553367698432, "grad_norm": 138.03111267089844, "learning_rate": 1.7019930856440027e-06, "loss": 0.1619, "num_input_tokens_seen": 45113568, "step": 66940 }, { "epoch": 1.6354774876016904, "grad_norm": 1.1136021614074707, "learning_rate": 1.7019323497701159e-06, "loss": 0.0393, "num_input_tokens_seen": 45116704, "step": 66945 }, { "epoch": 1.6355996384335376, "grad_norm": 94.63319396972656, "learning_rate": 1.7018716087915882e-06, "loss": 0.0604, "num_input_tokens_seen": 45119776, "step": 66950 }, { "epoch": 1.6357217892653848, "grad_norm": 0.4391125440597534, "learning_rate": 1.701810862708861e-06, "loss": 0.0023, "num_input_tokens_seen": 45122976, "step": 66955 }, { "epoch": 1.635843940097232, "grad_norm": 0.6100500226020813, "learning_rate": 1.7017501115223766e-06, "loss": 0.0294, "num_input_tokens_seen": 45126048, "step": 66960 }, { "epoch": 1.6359660909290792, "grad_norm": 0.3072821795940399, "learning_rate": 1.7016893552325766e-06, "loss": 0.0043, "num_input_tokens_seen": 45129248, "step": 66965 }, { "epoch": 1.6360882417609264, "grad_norm": 0.6370717883110046, "learning_rate": 1.701628593839903e-06, "loss": 0.0391, "num_input_tokens_seen": 45132768, "step": 66970 }, { "epoch": 1.6362103925927736, "grad_norm": 0.20756691694259644, "learning_rate": 1.7015678273447977e-06, "loss": 0.1749, "num_input_tokens_seen": 45136352, "step": 66975 }, { "epoch": 1.6363325434246208, "grad_norm": 0.08585977554321289, "learning_rate": 1.7015070557477022e-06, "loss": 0.0005, "num_input_tokens_seen": 45139552, "step": 66980 }, { "epoch": 1.636454694256468, "grad_norm": 0.10425865650177002, "learning_rate": 1.7014462790490586e-06, "loss": 0.01, "num_input_tokens_seen": 45143328, "step": 66985 }, { "epoch": 1.6365768450883151, "grad_norm": 14.16010570526123, "learning_rate": 1.7013854972493093e-06, "loss": 0.0569, "num_input_tokens_seen": 45146272, "step": 66990 }, { "epoch": 1.6366989959201623, "grad_norm": 0.08292701095342636, "learning_rate": 1.7013247103488962e-06, "loss": 0.0698, "num_input_tokens_seen": 45149600, "step": 66995 }, { "epoch": 1.6368211467520095, "grad_norm": 0.12289398908615112, "learning_rate": 1.7012639183482609e-06, "loss": 0.0003, "num_input_tokens_seen": 45153248, "step": 67000 }, { "epoch": 1.6369432975838567, "grad_norm": 0.17947377264499664, "learning_rate": 1.7012031212478456e-06, "loss": 0.0504, "num_input_tokens_seen": 45156192, "step": 67005 }, { "epoch": 1.6370654484157037, "grad_norm": 12.637063980102539, "learning_rate": 1.7011423190480926e-06, "loss": 0.1407, "num_input_tokens_seen": 45159264, "step": 67010 }, { "epoch": 1.6371875992475509, "grad_norm": 0.3538811504840851, "learning_rate": 1.7010815117494444e-06, "loss": 0.0457, "num_input_tokens_seen": 45162528, "step": 67015 }, { "epoch": 1.637309750079398, "grad_norm": 0.1899852603673935, "learning_rate": 1.7010206993523425e-06, "loss": 0.084, "num_input_tokens_seen": 45165792, "step": 67020 }, { "epoch": 1.6374319009112452, "grad_norm": 0.1661899983882904, "learning_rate": 1.70095988185723e-06, "loss": 0.0945, "num_input_tokens_seen": 45169056, "step": 67025 }, { "epoch": 1.6375540517430922, "grad_norm": 0.22782708704471588, "learning_rate": 1.7008990592645483e-06, "loss": 0.2022, "num_input_tokens_seen": 45172384, "step": 67030 }, { "epoch": 1.6376762025749394, "grad_norm": 17.975170135498047, "learning_rate": 1.7008382315747402e-06, "loss": 0.1716, "num_input_tokens_seen": 45175712, "step": 67035 }, { "epoch": 1.6377983534067866, "grad_norm": 8.688823699951172, "learning_rate": 1.700777398788248e-06, "loss": 0.1536, "num_input_tokens_seen": 45178656, "step": 67040 }, { "epoch": 1.6379205042386338, "grad_norm": 21.929100036621094, "learning_rate": 1.700716560905514e-06, "loss": 0.0731, "num_input_tokens_seen": 45182368, "step": 67045 }, { "epoch": 1.638042655070481, "grad_norm": 0.026093896478414536, "learning_rate": 1.7006557179269806e-06, "loss": 0.0579, "num_input_tokens_seen": 45185504, "step": 67050 }, { "epoch": 1.6381648059023282, "grad_norm": 0.4132900834083557, "learning_rate": 1.7005948698530907e-06, "loss": 0.0516, "num_input_tokens_seen": 45188640, "step": 67055 }, { "epoch": 1.6382869567341753, "grad_norm": 0.28247717022895813, "learning_rate": 1.7005340166842866e-06, "loss": 0.0013, "num_input_tokens_seen": 45191904, "step": 67060 }, { "epoch": 1.6384091075660225, "grad_norm": 8.85312557220459, "learning_rate": 1.7004731584210102e-06, "loss": 0.0958, "num_input_tokens_seen": 45195040, "step": 67065 }, { "epoch": 1.6385312583978697, "grad_norm": 1.7231601476669312, "learning_rate": 1.700412295063705e-06, "loss": 0.0385, "num_input_tokens_seen": 45198432, "step": 67070 }, { "epoch": 1.638653409229717, "grad_norm": 0.08209192752838135, "learning_rate": 1.700351426612813e-06, "loss": 0.066, "num_input_tokens_seen": 45201504, "step": 67075 }, { "epoch": 1.638775560061564, "grad_norm": 22.106382369995117, "learning_rate": 1.7002905530687767e-06, "loss": 0.1887, "num_input_tokens_seen": 45204704, "step": 67080 }, { "epoch": 1.6388977108934113, "grad_norm": 0.6420429348945618, "learning_rate": 1.7002296744320396e-06, "loss": 0.109, "num_input_tokens_seen": 45207840, "step": 67085 }, { "epoch": 1.6390198617252585, "grad_norm": 0.14940212666988373, "learning_rate": 1.700168790703044e-06, "loss": 0.0866, "num_input_tokens_seen": 45211168, "step": 67090 }, { "epoch": 1.6391420125571057, "grad_norm": 0.1484726071357727, "learning_rate": 1.7001079018822325e-06, "loss": 0.0852, "num_input_tokens_seen": 45214752, "step": 67095 }, { "epoch": 1.6392641633889526, "grad_norm": 23.14564323425293, "learning_rate": 1.7000470079700482e-06, "loss": 0.067, "num_input_tokens_seen": 45217824, "step": 67100 }, { "epoch": 1.6393863142207998, "grad_norm": 0.2891077697277069, "learning_rate": 1.6999861089669337e-06, "loss": 0.0357, "num_input_tokens_seen": 45221408, "step": 67105 }, { "epoch": 1.639508465052647, "grad_norm": 0.6765737533569336, "learning_rate": 1.6999252048733314e-06, "loss": 0.0021, "num_input_tokens_seen": 45224672, "step": 67110 }, { "epoch": 1.6396306158844942, "grad_norm": 10.747055053710938, "learning_rate": 1.6998642956896853e-06, "loss": 0.1089, "num_input_tokens_seen": 45228128, "step": 67115 }, { "epoch": 1.6397527667163412, "grad_norm": 0.43889570236206055, "learning_rate": 1.699803381416438e-06, "loss": 0.0386, "num_input_tokens_seen": 45231840, "step": 67120 }, { "epoch": 1.6398749175481884, "grad_norm": 18.788455963134766, "learning_rate": 1.699742462054032e-06, "loss": 0.0364, "num_input_tokens_seen": 45235424, "step": 67125 }, { "epoch": 1.6399970683800356, "grad_norm": 0.07699179649353027, "learning_rate": 1.6996815376029105e-06, "loss": 0.0412, "num_input_tokens_seen": 45238752, "step": 67130 }, { "epoch": 1.6401192192118828, "grad_norm": 0.29230740666389465, "learning_rate": 1.6996206080635167e-06, "loss": 0.0014, "num_input_tokens_seen": 45242272, "step": 67135 }, { "epoch": 1.64024137004373, "grad_norm": 42.41120910644531, "learning_rate": 1.6995596734362937e-06, "loss": 0.1312, "num_input_tokens_seen": 45245600, "step": 67140 }, { "epoch": 1.6403635208755771, "grad_norm": 0.2593778073787689, "learning_rate": 1.6994987337216845e-06, "loss": 0.1014, "num_input_tokens_seen": 45249184, "step": 67145 }, { "epoch": 1.6404856717074243, "grad_norm": 0.2428825944662094, "learning_rate": 1.6994377889201328e-06, "loss": 0.1057, "num_input_tokens_seen": 45252704, "step": 67150 }, { "epoch": 1.6406078225392715, "grad_norm": 22.88860321044922, "learning_rate": 1.699376839032081e-06, "loss": 0.1044, "num_input_tokens_seen": 45256224, "step": 67155 }, { "epoch": 1.6407299733711187, "grad_norm": 0.028404124081134796, "learning_rate": 1.6993158840579728e-06, "loss": 0.0037, "num_input_tokens_seen": 45260064, "step": 67160 }, { "epoch": 1.640852124202966, "grad_norm": 0.25550132989883423, "learning_rate": 1.6992549239982515e-06, "loss": 0.0722, "num_input_tokens_seen": 45263392, "step": 67165 }, { "epoch": 1.640974275034813, "grad_norm": 21.50788116455078, "learning_rate": 1.6991939588533601e-06, "loss": 0.159, "num_input_tokens_seen": 45266208, "step": 67170 }, { "epoch": 1.6410964258666603, "grad_norm": 0.05193869397044182, "learning_rate": 1.6991329886237421e-06, "loss": 0.1023, "num_input_tokens_seen": 45269984, "step": 67175 }, { "epoch": 1.6412185766985075, "grad_norm": 0.48035043478012085, "learning_rate": 1.6990720133098412e-06, "loss": 0.0088, "num_input_tokens_seen": 45272992, "step": 67180 }, { "epoch": 1.6413407275303544, "grad_norm": 0.20229879021644592, "learning_rate": 1.6990110329121005e-06, "loss": 0.002, "num_input_tokens_seen": 45276320, "step": 67185 }, { "epoch": 1.6414628783622016, "grad_norm": 0.06353387981653214, "learning_rate": 1.6989500474309637e-06, "loss": 0.0477, "num_input_tokens_seen": 45279520, "step": 67190 }, { "epoch": 1.6415850291940488, "grad_norm": 0.2843955159187317, "learning_rate": 1.6988890568668741e-06, "loss": 0.1012, "num_input_tokens_seen": 45282848, "step": 67195 }, { "epoch": 1.641707180025896, "grad_norm": 0.19131289422512054, "learning_rate": 1.6988280612202751e-06, "loss": 0.0607, "num_input_tokens_seen": 45285856, "step": 67200 }, { "epoch": 1.6418293308577432, "grad_norm": 13.557336807250977, "learning_rate": 1.6987670604916106e-06, "loss": 0.302, "num_input_tokens_seen": 45288864, "step": 67205 }, { "epoch": 1.6419514816895902, "grad_norm": 3.8761701583862305, "learning_rate": 1.6987060546813242e-06, "loss": 0.1141, "num_input_tokens_seen": 45292448, "step": 67210 }, { "epoch": 1.6420736325214373, "grad_norm": 131.6263885498047, "learning_rate": 1.6986450437898592e-06, "loss": 0.1661, "num_input_tokens_seen": 45295456, "step": 67215 }, { "epoch": 1.6421957833532845, "grad_norm": 72.59944915771484, "learning_rate": 1.6985840278176596e-06, "loss": 0.0886, "num_input_tokens_seen": 45298592, "step": 67220 }, { "epoch": 1.6423179341851317, "grad_norm": 8.932476997375488, "learning_rate": 1.6985230067651695e-06, "loss": 0.2015, "num_input_tokens_seen": 45301792, "step": 67225 }, { "epoch": 1.642440085016979, "grad_norm": 11.90882396697998, "learning_rate": 1.6984619806328317e-06, "loss": 0.159, "num_input_tokens_seen": 45306208, "step": 67230 }, { "epoch": 1.642562235848826, "grad_norm": 0.7757220268249512, "learning_rate": 1.6984009494210904e-06, "loss": 0.0794, "num_input_tokens_seen": 45309344, "step": 67235 }, { "epoch": 1.6426843866806733, "grad_norm": 0.2992716133594513, "learning_rate": 1.69833991313039e-06, "loss": 0.0459, "num_input_tokens_seen": 45312800, "step": 67240 }, { "epoch": 1.6428065375125205, "grad_norm": 97.35224151611328, "learning_rate": 1.6982788717611735e-06, "loss": 0.1316, "num_input_tokens_seen": 45316000, "step": 67245 }, { "epoch": 1.6429286883443677, "grad_norm": 0.13269208371639252, "learning_rate": 1.6982178253138857e-06, "loss": 0.0026, "num_input_tokens_seen": 45319392, "step": 67250 }, { "epoch": 1.6430508391762149, "grad_norm": 0.40496477484703064, "learning_rate": 1.6981567737889698e-06, "loss": 0.0425, "num_input_tokens_seen": 45323488, "step": 67255 }, { "epoch": 1.643172990008062, "grad_norm": 0.16997823119163513, "learning_rate": 1.6980957171868702e-06, "loss": 0.0061, "num_input_tokens_seen": 45326688, "step": 67260 }, { "epoch": 1.6432951408399092, "grad_norm": 1.7047306299209595, "learning_rate": 1.6980346555080306e-06, "loss": 0.1156, "num_input_tokens_seen": 45330208, "step": 67265 }, { "epoch": 1.6434172916717564, "grad_norm": 0.08190785348415375, "learning_rate": 1.6979735887528954e-06, "loss": 0.0174, "num_input_tokens_seen": 45333536, "step": 67270 }, { "epoch": 1.6435394425036034, "grad_norm": 34.33685302734375, "learning_rate": 1.6979125169219085e-06, "loss": 0.1974, "num_input_tokens_seen": 45336928, "step": 67275 }, { "epoch": 1.6436615933354506, "grad_norm": 0.03678745776414871, "learning_rate": 1.6978514400155137e-06, "loss": 0.0702, "num_input_tokens_seen": 45340192, "step": 67280 }, { "epoch": 1.6437837441672978, "grad_norm": 0.08641359955072403, "learning_rate": 1.697790358034156e-06, "loss": 0.0019, "num_input_tokens_seen": 45343520, "step": 67285 }, { "epoch": 1.643905894999145, "grad_norm": 0.09858337789773941, "learning_rate": 1.6977292709782792e-06, "loss": 0.0007, "num_input_tokens_seen": 45346656, "step": 67290 }, { "epoch": 1.6440280458309922, "grad_norm": 0.03240935131907463, "learning_rate": 1.6976681788483268e-06, "loss": 0.1003, "num_input_tokens_seen": 45349984, "step": 67295 }, { "epoch": 1.6441501966628391, "grad_norm": 0.8038926720619202, "learning_rate": 1.6976070816447443e-06, "loss": 0.0783, "num_input_tokens_seen": 45353248, "step": 67300 }, { "epoch": 1.6442723474946863, "grad_norm": 0.22960323095321655, "learning_rate": 1.6975459793679753e-06, "loss": 0.0014, "num_input_tokens_seen": 45356448, "step": 67305 }, { "epoch": 1.6443944983265335, "grad_norm": 0.05000200867652893, "learning_rate": 1.6974848720184647e-06, "loss": 0.0959, "num_input_tokens_seen": 45359712, "step": 67310 }, { "epoch": 1.6445166491583807, "grad_norm": 177.49916076660156, "learning_rate": 1.697423759596656e-06, "loss": 0.15, "num_input_tokens_seen": 45362912, "step": 67315 }, { "epoch": 1.6446387999902279, "grad_norm": 0.19165050983428955, "learning_rate": 1.6973626421029944e-06, "loss": 0.0369, "num_input_tokens_seen": 45366432, "step": 67320 }, { "epoch": 1.644760950822075, "grad_norm": 0.19500640034675598, "learning_rate": 1.697301519537924e-06, "loss": 0.149, "num_input_tokens_seen": 45369760, "step": 67325 }, { "epoch": 1.6448831016539223, "grad_norm": 25.245399475097656, "learning_rate": 1.6972403919018895e-06, "loss": 0.0926, "num_input_tokens_seen": 45373408, "step": 67330 }, { "epoch": 1.6450052524857695, "grad_norm": 0.12566934525966644, "learning_rate": 1.6971792591953352e-06, "loss": 0.0958, "num_input_tokens_seen": 45376864, "step": 67335 }, { "epoch": 1.6451274033176166, "grad_norm": 0.1085042655467987, "learning_rate": 1.6971181214187058e-06, "loss": 0.0568, "num_input_tokens_seen": 45380192, "step": 67340 }, { "epoch": 1.6452495541494638, "grad_norm": 9.45516586303711, "learning_rate": 1.697056978572446e-06, "loss": 0.0958, "num_input_tokens_seen": 45383072, "step": 67345 }, { "epoch": 1.645371704981311, "grad_norm": 19.096742630004883, "learning_rate": 1.6969958306570002e-06, "loss": 0.1171, "num_input_tokens_seen": 45386208, "step": 67350 }, { "epoch": 1.6454938558131582, "grad_norm": 27.534833908081055, "learning_rate": 1.6969346776728134e-06, "loss": 0.1764, "num_input_tokens_seen": 45390048, "step": 67355 }, { "epoch": 1.6456160066450054, "grad_norm": 12.015131950378418, "learning_rate": 1.6968735196203303e-06, "loss": 0.1151, "num_input_tokens_seen": 45393824, "step": 67360 }, { "epoch": 1.6457381574768524, "grad_norm": 3.8008766174316406, "learning_rate": 1.6968123564999952e-06, "loss": 0.1273, "num_input_tokens_seen": 45397088, "step": 67365 }, { "epoch": 1.6458603083086996, "grad_norm": 0.44486188888549805, "learning_rate": 1.6967511883122536e-06, "loss": 0.1678, "num_input_tokens_seen": 45400480, "step": 67370 }, { "epoch": 1.6459824591405467, "grad_norm": 0.8876047134399414, "learning_rate": 1.6966900150575498e-06, "loss": 0.1328, "num_input_tokens_seen": 45403936, "step": 67375 }, { "epoch": 1.646104609972394, "grad_norm": 0.7775227427482605, "learning_rate": 1.696628836736329e-06, "loss": 0.0541, "num_input_tokens_seen": 45407392, "step": 67380 }, { "epoch": 1.6462267608042411, "grad_norm": 70.5943374633789, "learning_rate": 1.6965676533490357e-06, "loss": 0.0406, "num_input_tokens_seen": 45411040, "step": 67385 }, { "epoch": 1.646348911636088, "grad_norm": 45.5142936706543, "learning_rate": 1.6965064648961146e-06, "loss": 0.0364, "num_input_tokens_seen": 45414240, "step": 67390 }, { "epoch": 1.6464710624679353, "grad_norm": 44.99553680419922, "learning_rate": 1.696445271378012e-06, "loss": 0.0662, "num_input_tokens_seen": 45417760, "step": 67395 }, { "epoch": 1.6465932132997825, "grad_norm": 0.03843267634510994, "learning_rate": 1.6963840727951717e-06, "loss": 0.1032, "num_input_tokens_seen": 45421024, "step": 67400 }, { "epoch": 1.6467153641316297, "grad_norm": 0.28163111209869385, "learning_rate": 1.6963228691480391e-06, "loss": 0.0662, "num_input_tokens_seen": 45424672, "step": 67405 }, { "epoch": 1.6468375149634769, "grad_norm": 223.5466766357422, "learning_rate": 1.6962616604370595e-06, "loss": 0.2042, "num_input_tokens_seen": 45427936, "step": 67410 }, { "epoch": 1.646959665795324, "grad_norm": 9.289713859558105, "learning_rate": 1.6962004466626776e-06, "loss": 0.1869, "num_input_tokens_seen": 45430816, "step": 67415 }, { "epoch": 1.6470818166271712, "grad_norm": 11.627837181091309, "learning_rate": 1.6961392278253386e-06, "loss": 0.1323, "num_input_tokens_seen": 45434336, "step": 67420 }, { "epoch": 1.6472039674590184, "grad_norm": 0.8815947771072388, "learning_rate": 1.6960780039254882e-06, "loss": 0.084, "num_input_tokens_seen": 45437664, "step": 67425 }, { "epoch": 1.6473261182908656, "grad_norm": 10.567502975463867, "learning_rate": 1.6960167749635714e-06, "loss": 0.1477, "num_input_tokens_seen": 45441056, "step": 67430 }, { "epoch": 1.6474482691227128, "grad_norm": 0.7220805883407593, "learning_rate": 1.6959555409400332e-06, "loss": 0.0439, "num_input_tokens_seen": 45444384, "step": 67435 }, { "epoch": 1.64757041995456, "grad_norm": 39.08369827270508, "learning_rate": 1.6958943018553194e-06, "loss": 0.0756, "num_input_tokens_seen": 45447712, "step": 67440 }, { "epoch": 1.6476925707864072, "grad_norm": 0.0401715449988842, "learning_rate": 1.695833057709875e-06, "loss": 0.0039, "num_input_tokens_seen": 45451232, "step": 67445 }, { "epoch": 1.6478147216182544, "grad_norm": 0.2429829090833664, "learning_rate": 1.6957718085041453e-06, "loss": 0.0736, "num_input_tokens_seen": 45454304, "step": 67450 }, { "epoch": 1.6479368724501013, "grad_norm": 0.35649529099464417, "learning_rate": 1.6957105542385758e-06, "loss": 0.1157, "num_input_tokens_seen": 45457376, "step": 67455 }, { "epoch": 1.6480590232819485, "grad_norm": 180.90599060058594, "learning_rate": 1.695649294913612e-06, "loss": 0.0474, "num_input_tokens_seen": 45460448, "step": 67460 }, { "epoch": 1.6481811741137957, "grad_norm": 58.405635833740234, "learning_rate": 1.6955880305296996e-06, "loss": 0.2013, "num_input_tokens_seen": 45463904, "step": 67465 }, { "epoch": 1.648303324945643, "grad_norm": 124.4507064819336, "learning_rate": 1.695526761087284e-06, "loss": 0.0781, "num_input_tokens_seen": 45467424, "step": 67470 }, { "epoch": 1.6484254757774899, "grad_norm": 0.10375184565782547, "learning_rate": 1.6954654865868107e-06, "loss": 0.0341, "num_input_tokens_seen": 45470816, "step": 67475 }, { "epoch": 1.648547626609337, "grad_norm": 20.57008934020996, "learning_rate": 1.695404207028725e-06, "loss": 0.0845, "num_input_tokens_seen": 45474400, "step": 67480 }, { "epoch": 1.6486697774411843, "grad_norm": 36.42217254638672, "learning_rate": 1.6953429224134731e-06, "loss": 0.2447, "num_input_tokens_seen": 45478112, "step": 67485 }, { "epoch": 1.6487919282730314, "grad_norm": 0.3584747910499573, "learning_rate": 1.6952816327415004e-06, "loss": 0.0339, "num_input_tokens_seen": 45481184, "step": 67490 }, { "epoch": 1.6489140791048786, "grad_norm": 0.2505834400653839, "learning_rate": 1.6952203380132529e-06, "loss": 0.1602, "num_input_tokens_seen": 45484704, "step": 67495 }, { "epoch": 1.6490362299367258, "grad_norm": 0.3703761100769043, "learning_rate": 1.6951590382291761e-06, "loss": 0.0029, "num_input_tokens_seen": 45488608, "step": 67500 }, { "epoch": 1.649158380768573, "grad_norm": 0.5771975517272949, "learning_rate": 1.6950977333897156e-06, "loss": 0.1515, "num_input_tokens_seen": 45491680, "step": 67505 }, { "epoch": 1.6492805316004202, "grad_norm": 0.5081115961074829, "learning_rate": 1.6950364234953173e-06, "loss": 0.0019, "num_input_tokens_seen": 45495200, "step": 67510 }, { "epoch": 1.6494026824322674, "grad_norm": 0.19448351860046387, "learning_rate": 1.6949751085464273e-06, "loss": 0.0996, "num_input_tokens_seen": 45498336, "step": 67515 }, { "epoch": 1.6495248332641146, "grad_norm": 0.21856345236301422, "learning_rate": 1.6949137885434914e-06, "loss": 0.0285, "num_input_tokens_seen": 45501536, "step": 67520 }, { "epoch": 1.6496469840959618, "grad_norm": 28.951377868652344, "learning_rate": 1.6948524634869555e-06, "loss": 0.1134, "num_input_tokens_seen": 45505056, "step": 67525 }, { "epoch": 1.649769134927809, "grad_norm": 0.367519736289978, "learning_rate": 1.6947911333772657e-06, "loss": 0.0424, "num_input_tokens_seen": 45508256, "step": 67530 }, { "epoch": 1.6498912857596562, "grad_norm": 15.366878509521484, "learning_rate": 1.6947297982148678e-06, "loss": 0.2835, "num_input_tokens_seen": 45511456, "step": 67535 }, { "epoch": 1.6500134365915033, "grad_norm": 27.542980194091797, "learning_rate": 1.694668458000208e-06, "loss": 0.1688, "num_input_tokens_seen": 45514976, "step": 67540 }, { "epoch": 1.6501355874233503, "grad_norm": 0.08444590866565704, "learning_rate": 1.6946071127337323e-06, "loss": 0.019, "num_input_tokens_seen": 45518432, "step": 67545 }, { "epoch": 1.6502577382551975, "grad_norm": 0.12355171144008636, "learning_rate": 1.694545762415887e-06, "loss": 0.0685, "num_input_tokens_seen": 45522272, "step": 67550 }, { "epoch": 1.6503798890870447, "grad_norm": 0.13620570302009583, "learning_rate": 1.6944844070471178e-06, "loss": 0.1469, "num_input_tokens_seen": 45525792, "step": 67555 }, { "epoch": 1.6505020399188919, "grad_norm": 0.13408538699150085, "learning_rate": 1.6944230466278712e-06, "loss": 0.1414, "num_input_tokens_seen": 45529568, "step": 67560 }, { "epoch": 1.6506241907507389, "grad_norm": 0.328299343585968, "learning_rate": 1.6943616811585936e-06, "loss": 0.0635, "num_input_tokens_seen": 45532640, "step": 67565 }, { "epoch": 1.650746341582586, "grad_norm": 3.6790201663970947, "learning_rate": 1.6943003106397313e-06, "loss": 0.0022, "num_input_tokens_seen": 45536096, "step": 67570 }, { "epoch": 1.6508684924144332, "grad_norm": 3.0552306175231934, "learning_rate": 1.69423893507173e-06, "loss": 0.0309, "num_input_tokens_seen": 45539424, "step": 67575 }, { "epoch": 1.6509906432462804, "grad_norm": 30.636735916137695, "learning_rate": 1.6941775544550368e-06, "loss": 0.1266, "num_input_tokens_seen": 45542944, "step": 67580 }, { "epoch": 1.6511127940781276, "grad_norm": 21.842836380004883, "learning_rate": 1.6941161687900975e-06, "loss": 0.0458, "num_input_tokens_seen": 45546336, "step": 67585 }, { "epoch": 1.6512349449099748, "grad_norm": 18.858291625976562, "learning_rate": 1.694054778077359e-06, "loss": 0.21, "num_input_tokens_seen": 45549664, "step": 67590 }, { "epoch": 1.651357095741822, "grad_norm": 22.735301971435547, "learning_rate": 1.693993382317267e-06, "loss": 0.0357, "num_input_tokens_seen": 45552608, "step": 67595 }, { "epoch": 1.6514792465736692, "grad_norm": 13.76203441619873, "learning_rate": 1.6939319815102686e-06, "loss": 0.0687, "num_input_tokens_seen": 45555616, "step": 67600 }, { "epoch": 1.6516013974055164, "grad_norm": 0.1456945240497589, "learning_rate": 1.6938705756568106e-06, "loss": 0.0286, "num_input_tokens_seen": 45559136, "step": 67605 }, { "epoch": 1.6517235482373636, "grad_norm": 13.13014030456543, "learning_rate": 1.6938091647573385e-06, "loss": 0.1458, "num_input_tokens_seen": 45562400, "step": 67610 }, { "epoch": 1.6518456990692107, "grad_norm": 7.869939804077148, "learning_rate": 1.6937477488122997e-06, "loss": 0.0453, "num_input_tokens_seen": 45565728, "step": 67615 }, { "epoch": 1.651967849901058, "grad_norm": 0.9261747598648071, "learning_rate": 1.693686327822141e-06, "loss": 0.1273, "num_input_tokens_seen": 45569184, "step": 67620 }, { "epoch": 1.6520900007329051, "grad_norm": 20.965072631835938, "learning_rate": 1.6936249017873086e-06, "loss": 0.0947, "num_input_tokens_seen": 45572384, "step": 67625 }, { "epoch": 1.6522121515647523, "grad_norm": 153.28591918945312, "learning_rate": 1.6935634707082494e-06, "loss": 0.0361, "num_input_tokens_seen": 45575776, "step": 67630 }, { "epoch": 1.6523343023965993, "grad_norm": 0.2594250440597534, "learning_rate": 1.69350203458541e-06, "loss": 0.1171, "num_input_tokens_seen": 45579232, "step": 67635 }, { "epoch": 1.6524564532284465, "grad_norm": 0.5927287340164185, "learning_rate": 1.6934405934192372e-06, "loss": 0.0348, "num_input_tokens_seen": 45582496, "step": 67640 }, { "epoch": 1.6525786040602937, "grad_norm": 2.2029905319213867, "learning_rate": 1.693379147210178e-06, "loss": 0.0035, "num_input_tokens_seen": 45586080, "step": 67645 }, { "epoch": 1.6527007548921409, "grad_norm": 0.10701204091310501, "learning_rate": 1.6933176959586792e-06, "loss": 0.2543, "num_input_tokens_seen": 45589216, "step": 67650 }, { "epoch": 1.6528229057239878, "grad_norm": 0.11217296123504639, "learning_rate": 1.6932562396651874e-06, "loss": 0.0465, "num_input_tokens_seen": 45592288, "step": 67655 }, { "epoch": 1.652945056555835, "grad_norm": 0.21580864489078522, "learning_rate": 1.6931947783301502e-06, "loss": 0.049, "num_input_tokens_seen": 45595296, "step": 67660 }, { "epoch": 1.6530672073876822, "grad_norm": 0.8896117806434631, "learning_rate": 1.6931333119540138e-06, "loss": 0.1545, "num_input_tokens_seen": 45598688, "step": 67665 }, { "epoch": 1.6531893582195294, "grad_norm": 0.05038753151893616, "learning_rate": 1.6930718405372254e-06, "loss": 0.1369, "num_input_tokens_seen": 45601952, "step": 67670 }, { "epoch": 1.6533115090513766, "grad_norm": 30.97816276550293, "learning_rate": 1.6930103640802327e-06, "loss": 0.1289, "num_input_tokens_seen": 45605472, "step": 67675 }, { "epoch": 1.6534336598832238, "grad_norm": 17.580766677856445, "learning_rate": 1.6929488825834816e-06, "loss": 0.194, "num_input_tokens_seen": 45608480, "step": 67680 }, { "epoch": 1.653555810715071, "grad_norm": 0.264511376619339, "learning_rate": 1.6928873960474204e-06, "loss": 0.0867, "num_input_tokens_seen": 45611872, "step": 67685 }, { "epoch": 1.6536779615469182, "grad_norm": 0.17803645133972168, "learning_rate": 1.6928259044724954e-06, "loss": 0.0908, "num_input_tokens_seen": 45615136, "step": 67690 }, { "epoch": 1.6538001123787653, "grad_norm": 74.42533111572266, "learning_rate": 1.6927644078591539e-06, "loss": 0.0776, "num_input_tokens_seen": 45618848, "step": 67695 }, { "epoch": 1.6539222632106125, "grad_norm": 0.20543590188026428, "learning_rate": 1.6927029062078435e-06, "loss": 0.0745, "num_input_tokens_seen": 45622560, "step": 67700 }, { "epoch": 1.6540444140424597, "grad_norm": 0.21554772555828094, "learning_rate": 1.6926413995190112e-06, "loss": 0.0492, "num_input_tokens_seen": 45626336, "step": 67705 }, { "epoch": 1.654166564874307, "grad_norm": 0.43790730834007263, "learning_rate": 1.6925798877931046e-06, "loss": 0.0027, "num_input_tokens_seen": 45629600, "step": 67710 }, { "epoch": 1.654288715706154, "grad_norm": 9.921793937683105, "learning_rate": 1.6925183710305704e-06, "loss": 0.1117, "num_input_tokens_seen": 45632736, "step": 67715 }, { "epoch": 1.6544108665380013, "grad_norm": 0.18384996056556702, "learning_rate": 1.6924568492318566e-06, "loss": 0.0551, "num_input_tokens_seen": 45636384, "step": 67720 }, { "epoch": 1.6545330173698483, "grad_norm": 16.222341537475586, "learning_rate": 1.6923953223974103e-06, "loss": 0.0821, "num_input_tokens_seen": 45639712, "step": 67725 }, { "epoch": 1.6546551682016954, "grad_norm": 0.13005872070789337, "learning_rate": 1.692333790527679e-06, "loss": 0.0315, "num_input_tokens_seen": 45642720, "step": 67730 }, { "epoch": 1.6547773190335426, "grad_norm": 0.10399110615253448, "learning_rate": 1.69227225362311e-06, "loss": 0.0022, "num_input_tokens_seen": 45646304, "step": 67735 }, { "epoch": 1.6548994698653898, "grad_norm": 11.264019012451172, "learning_rate": 1.692210711684151e-06, "loss": 0.1646, "num_input_tokens_seen": 45649888, "step": 67740 }, { "epoch": 1.6550216206972368, "grad_norm": 0.09520275890827179, "learning_rate": 1.6921491647112497e-06, "loss": 0.0147, "num_input_tokens_seen": 45653216, "step": 67745 }, { "epoch": 1.655143771529084, "grad_norm": 0.1157083511352539, "learning_rate": 1.6920876127048534e-06, "loss": 0.0367, "num_input_tokens_seen": 45656416, "step": 67750 }, { "epoch": 1.6552659223609312, "grad_norm": 0.22962799668312073, "learning_rate": 1.6920260556654098e-06, "loss": 0.0492, "num_input_tokens_seen": 45659360, "step": 67755 }, { "epoch": 1.6553880731927784, "grad_norm": 0.038359832018613815, "learning_rate": 1.6919644935933666e-06, "loss": 0.0991, "num_input_tokens_seen": 45662752, "step": 67760 }, { "epoch": 1.6555102240246256, "grad_norm": 0.17788651585578918, "learning_rate": 1.6919029264891713e-06, "loss": 0.062, "num_input_tokens_seen": 45666144, "step": 67765 }, { "epoch": 1.6556323748564727, "grad_norm": 0.0891755148768425, "learning_rate": 1.6918413543532722e-06, "loss": 0.0816, "num_input_tokens_seen": 45669088, "step": 67770 }, { "epoch": 1.65575452568832, "grad_norm": 11.52059268951416, "learning_rate": 1.6917797771861165e-06, "loss": 0.0582, "num_input_tokens_seen": 45672032, "step": 67775 }, { "epoch": 1.6558766765201671, "grad_norm": 0.11397583037614822, "learning_rate": 1.691718194988152e-06, "loss": 0.0879, "num_input_tokens_seen": 45675104, "step": 67780 }, { "epoch": 1.6559988273520143, "grad_norm": 0.13635925948619843, "learning_rate": 1.6916566077598272e-06, "loss": 0.1022, "num_input_tokens_seen": 45677984, "step": 67785 }, { "epoch": 1.6561209781838615, "grad_norm": 0.19257278740406036, "learning_rate": 1.6915950155015892e-06, "loss": 0.0397, "num_input_tokens_seen": 45681376, "step": 67790 }, { "epoch": 1.6562431290157087, "grad_norm": 0.24093586206436157, "learning_rate": 1.6915334182138863e-06, "loss": 0.001, "num_input_tokens_seen": 45684512, "step": 67795 }, { "epoch": 1.6563652798475559, "grad_norm": 9.37103271484375, "learning_rate": 1.6914718158971662e-06, "loss": 0.1561, "num_input_tokens_seen": 45688096, "step": 67800 }, { "epoch": 1.656487430679403, "grad_norm": 0.19032365083694458, "learning_rate": 1.6914102085518773e-06, "loss": 0.0419, "num_input_tokens_seen": 45691040, "step": 67805 }, { "epoch": 1.65660958151125, "grad_norm": 229.4463653564453, "learning_rate": 1.6913485961784672e-06, "loss": 0.0523, "num_input_tokens_seen": 45694432, "step": 67810 }, { "epoch": 1.6567317323430972, "grad_norm": 8.384881973266602, "learning_rate": 1.6912869787773842e-06, "loss": 0.0485, "num_input_tokens_seen": 45697696, "step": 67815 }, { "epoch": 1.6568538831749444, "grad_norm": 0.07127055525779724, "learning_rate": 1.6912253563490765e-06, "loss": 0.044, "num_input_tokens_seen": 45701024, "step": 67820 }, { "epoch": 1.6569760340067916, "grad_norm": 15.135320663452148, "learning_rate": 1.6911637288939922e-06, "loss": 0.1131, "num_input_tokens_seen": 45704480, "step": 67825 }, { "epoch": 1.6570981848386388, "grad_norm": 26.051050186157227, "learning_rate": 1.6911020964125791e-06, "loss": 0.0481, "num_input_tokens_seen": 45707552, "step": 67830 }, { "epoch": 1.6572203356704858, "grad_norm": 0.23497499525547028, "learning_rate": 1.6910404589052857e-06, "loss": 0.0008, "num_input_tokens_seen": 45711072, "step": 67835 }, { "epoch": 1.657342486502333, "grad_norm": 0.049059245735406876, "learning_rate": 1.6909788163725605e-06, "loss": 0.0647, "num_input_tokens_seen": 45714400, "step": 67840 }, { "epoch": 1.6574646373341801, "grad_norm": 16.081838607788086, "learning_rate": 1.6909171688148512e-06, "loss": 0.1721, "num_input_tokens_seen": 45717728, "step": 67845 }, { "epoch": 1.6575867881660273, "grad_norm": 0.1584680676460266, "learning_rate": 1.6908555162326064e-06, "loss": 0.0284, "num_input_tokens_seen": 45721248, "step": 67850 }, { "epoch": 1.6577089389978745, "grad_norm": 10.100417137145996, "learning_rate": 1.6907938586262747e-06, "loss": 0.0729, "num_input_tokens_seen": 45724640, "step": 67855 }, { "epoch": 1.6578310898297217, "grad_norm": 86.2887954711914, "learning_rate": 1.690732195996304e-06, "loss": 0.0603, "num_input_tokens_seen": 45727840, "step": 67860 }, { "epoch": 1.657953240661569, "grad_norm": 0.12551839649677277, "learning_rate": 1.6906705283431432e-06, "loss": 0.24, "num_input_tokens_seen": 45731168, "step": 67865 }, { "epoch": 1.658075391493416, "grad_norm": 9.954278945922852, "learning_rate": 1.6906088556672405e-06, "loss": 0.0306, "num_input_tokens_seen": 45734432, "step": 67870 }, { "epoch": 1.6581975423252633, "grad_norm": 0.18424488604068756, "learning_rate": 1.6905471779690443e-06, "loss": 0.0692, "num_input_tokens_seen": 45737952, "step": 67875 }, { "epoch": 1.6583196931571105, "grad_norm": 1.1578551530838013, "learning_rate": 1.6904854952490035e-06, "loss": 0.1682, "num_input_tokens_seen": 45741216, "step": 67880 }, { "epoch": 1.6584418439889577, "grad_norm": 0.32225939631462097, "learning_rate": 1.6904238075075665e-06, "loss": 0.075, "num_input_tokens_seen": 45744864, "step": 67885 }, { "epoch": 1.6585639948208049, "grad_norm": 0.1321701854467392, "learning_rate": 1.6903621147451816e-06, "loss": 0.0613, "num_input_tokens_seen": 45747936, "step": 67890 }, { "epoch": 1.658686145652652, "grad_norm": 0.5468298196792603, "learning_rate": 1.6903004169622976e-06, "loss": 0.0063, "num_input_tokens_seen": 45751456, "step": 67895 }, { "epoch": 1.658808296484499, "grad_norm": 46.02381134033203, "learning_rate": 1.6902387141593637e-06, "loss": 0.1953, "num_input_tokens_seen": 45755104, "step": 67900 }, { "epoch": 1.6589304473163462, "grad_norm": 0.20815016329288483, "learning_rate": 1.6901770063368281e-06, "loss": 0.089, "num_input_tokens_seen": 45758816, "step": 67905 }, { "epoch": 1.6590525981481934, "grad_norm": 8.63784408569336, "learning_rate": 1.6901152934951397e-06, "loss": 0.1403, "num_input_tokens_seen": 45762272, "step": 67910 }, { "epoch": 1.6591747489800406, "grad_norm": 0.018525369465351105, "learning_rate": 1.6900535756347472e-06, "loss": 0.1207, "num_input_tokens_seen": 45765216, "step": 67915 }, { "epoch": 1.6592968998118878, "grad_norm": 323.701904296875, "learning_rate": 1.6899918527560995e-06, "loss": 0.1671, "num_input_tokens_seen": 45768672, "step": 67920 }, { "epoch": 1.6594190506437347, "grad_norm": 0.3565976619720459, "learning_rate": 1.6899301248596454e-06, "loss": 0.0019, "num_input_tokens_seen": 45772000, "step": 67925 }, { "epoch": 1.659541201475582, "grad_norm": 0.3739701807498932, "learning_rate": 1.6898683919458342e-06, "loss": 0.1282, "num_input_tokens_seen": 45775648, "step": 67930 }, { "epoch": 1.6596633523074291, "grad_norm": 10.951455116271973, "learning_rate": 1.689806654015114e-06, "loss": 0.1585, "num_input_tokens_seen": 45778912, "step": 67935 }, { "epoch": 1.6597855031392763, "grad_norm": 531.7941284179688, "learning_rate": 1.6897449110679344e-06, "loss": 0.0189, "num_input_tokens_seen": 45782304, "step": 67940 }, { "epoch": 1.6599076539711235, "grad_norm": 31.341873168945312, "learning_rate": 1.6896831631047444e-06, "loss": 0.0686, "num_input_tokens_seen": 45785632, "step": 67945 }, { "epoch": 1.6600298048029707, "grad_norm": 38.764163970947266, "learning_rate": 1.6896214101259928e-06, "loss": 0.2004, "num_input_tokens_seen": 45788640, "step": 67950 }, { "epoch": 1.6601519556348179, "grad_norm": 19.04844093322754, "learning_rate": 1.6895596521321292e-06, "loss": 0.0817, "num_input_tokens_seen": 45792160, "step": 67955 }, { "epoch": 1.660274106466665, "grad_norm": 43.606266021728516, "learning_rate": 1.689497889123602e-06, "loss": 0.2003, "num_input_tokens_seen": 45795680, "step": 67960 }, { "epoch": 1.6603962572985123, "grad_norm": 28.905261993408203, "learning_rate": 1.6894361211008608e-06, "loss": 0.0544, "num_input_tokens_seen": 45798624, "step": 67965 }, { "epoch": 1.6605184081303594, "grad_norm": 0.7634138464927673, "learning_rate": 1.6893743480643546e-06, "loss": 0.0039, "num_input_tokens_seen": 45801632, "step": 67970 }, { "epoch": 1.6606405589622066, "grad_norm": 0.33794939517974854, "learning_rate": 1.689312570014533e-06, "loss": 0.0492, "num_input_tokens_seen": 45805280, "step": 67975 }, { "epoch": 1.6607627097940538, "grad_norm": 17.452777862548828, "learning_rate": 1.6892507869518447e-06, "loss": 0.1252, "num_input_tokens_seen": 45808480, "step": 67980 }, { "epoch": 1.660884860625901, "grad_norm": 0.7841615080833435, "learning_rate": 1.6891889988767392e-06, "loss": 0.0444, "num_input_tokens_seen": 45811680, "step": 67985 }, { "epoch": 1.661007011457748, "grad_norm": 203.92428588867188, "learning_rate": 1.6891272057896661e-06, "loss": 0.0777, "num_input_tokens_seen": 45814944, "step": 67990 }, { "epoch": 1.6611291622895952, "grad_norm": 20.686752319335938, "learning_rate": 1.689065407691075e-06, "loss": 0.0692, "num_input_tokens_seen": 45818272, "step": 67995 }, { "epoch": 1.6612513131214424, "grad_norm": 103.55084991455078, "learning_rate": 1.6890036045814142e-06, "loss": 0.2293, "num_input_tokens_seen": 45821472, "step": 68000 }, { "epoch": 1.6613734639532896, "grad_norm": 0.2858763039112091, "learning_rate": 1.6889417964611343e-06, "loss": 0.0691, "num_input_tokens_seen": 45824736, "step": 68005 }, { "epoch": 1.6614956147851365, "grad_norm": 1.4139119386672974, "learning_rate": 1.6888799833306842e-06, "loss": 0.0022, "num_input_tokens_seen": 45827872, "step": 68010 }, { "epoch": 1.6616177656169837, "grad_norm": 0.013525686226785183, "learning_rate": 1.6888181651905136e-06, "loss": 0.0388, "num_input_tokens_seen": 45831456, "step": 68015 }, { "epoch": 1.661739916448831, "grad_norm": 44.468807220458984, "learning_rate": 1.688756342041072e-06, "loss": 0.0825, "num_input_tokens_seen": 45835616, "step": 68020 }, { "epoch": 1.661862067280678, "grad_norm": 0.33210596442222595, "learning_rate": 1.688694513882809e-06, "loss": 0.0442, "num_input_tokens_seen": 45839776, "step": 68025 }, { "epoch": 1.6619842181125253, "grad_norm": 0.23905229568481445, "learning_rate": 1.6886326807161746e-06, "loss": 0.0012, "num_input_tokens_seen": 45843680, "step": 68030 }, { "epoch": 1.6621063689443725, "grad_norm": 0.04572295397520065, "learning_rate": 1.6885708425416178e-06, "loss": 0.1546, "num_input_tokens_seen": 45846688, "step": 68035 }, { "epoch": 1.6622285197762197, "grad_norm": 20.764741897583008, "learning_rate": 1.688508999359589e-06, "loss": 0.0749, "num_input_tokens_seen": 45850272, "step": 68040 }, { "epoch": 1.6623506706080668, "grad_norm": 17.399734497070312, "learning_rate": 1.688447151170537e-06, "loss": 0.1484, "num_input_tokens_seen": 45853728, "step": 68045 }, { "epoch": 1.662472821439914, "grad_norm": 0.7581236958503723, "learning_rate": 1.6883852979749124e-06, "loss": 0.0673, "num_input_tokens_seen": 45856864, "step": 68050 }, { "epoch": 1.6625949722717612, "grad_norm": 1.076304316520691, "learning_rate": 1.6883234397731647e-06, "loss": 0.0156, "num_input_tokens_seen": 45860192, "step": 68055 }, { "epoch": 1.6627171231036084, "grad_norm": 2.357579231262207, "learning_rate": 1.688261576565744e-06, "loss": 0.0551, "num_input_tokens_seen": 45863584, "step": 68060 }, { "epoch": 1.6628392739354556, "grad_norm": 0.12220649421215057, "learning_rate": 1.6881997083530999e-06, "loss": 0.0017, "num_input_tokens_seen": 45866976, "step": 68065 }, { "epoch": 1.6629614247673028, "grad_norm": 0.12620511651039124, "learning_rate": 1.6881378351356825e-06, "loss": 0.0667, "num_input_tokens_seen": 45870240, "step": 68070 }, { "epoch": 1.66308357559915, "grad_norm": 0.01762893982231617, "learning_rate": 1.6880759569139414e-06, "loss": 0.0887, "num_input_tokens_seen": 45873696, "step": 68075 }, { "epoch": 1.663205726430997, "grad_norm": 0.046484898775815964, "learning_rate": 1.688014073688327e-06, "loss": 0.1637, "num_input_tokens_seen": 45877536, "step": 68080 }, { "epoch": 1.6633278772628441, "grad_norm": 0.13141871988773346, "learning_rate": 1.6879521854592893e-06, "loss": 0.0667, "num_input_tokens_seen": 45881120, "step": 68085 }, { "epoch": 1.6634500280946913, "grad_norm": 14.069496154785156, "learning_rate": 1.6878902922272781e-06, "loss": 0.0385, "num_input_tokens_seen": 45884576, "step": 68090 }, { "epoch": 1.6635721789265385, "grad_norm": 0.33704790472984314, "learning_rate": 1.687828393992744e-06, "loss": 0.0879, "num_input_tokens_seen": 45887776, "step": 68095 }, { "epoch": 1.6636943297583855, "grad_norm": 17.2185115814209, "learning_rate": 1.6877664907561367e-06, "loss": 0.18, "num_input_tokens_seen": 45891104, "step": 68100 }, { "epoch": 1.6638164805902327, "grad_norm": 0.07757803797721863, "learning_rate": 1.6877045825179063e-06, "loss": 0.0007, "num_input_tokens_seen": 45894880, "step": 68105 }, { "epoch": 1.6639386314220799, "grad_norm": 191.504150390625, "learning_rate": 1.6876426692785032e-06, "loss": 0.0989, "num_input_tokens_seen": 45898592, "step": 68110 }, { "epoch": 1.664060782253927, "grad_norm": 0.2756825089454651, "learning_rate": 1.6875807510383777e-06, "loss": 0.0914, "num_input_tokens_seen": 45901920, "step": 68115 }, { "epoch": 1.6641829330857743, "grad_norm": 0.012194006703794003, "learning_rate": 1.6875188277979802e-06, "loss": 0.0416, "num_input_tokens_seen": 45905312, "step": 68120 }, { "epoch": 1.6643050839176214, "grad_norm": 0.0791403129696846, "learning_rate": 1.6874568995577608e-06, "loss": 0.1037, "num_input_tokens_seen": 45908448, "step": 68125 }, { "epoch": 1.6644272347494686, "grad_norm": 8.660093307495117, "learning_rate": 1.6873949663181698e-06, "loss": 0.1828, "num_input_tokens_seen": 45911392, "step": 68130 }, { "epoch": 1.6645493855813158, "grad_norm": 35.485111236572266, "learning_rate": 1.6873330280796578e-06, "loss": 0.1889, "num_input_tokens_seen": 45914656, "step": 68135 }, { "epoch": 1.664671536413163, "grad_norm": 0.08751770853996277, "learning_rate": 1.6872710848426752e-06, "loss": 0.1231, "num_input_tokens_seen": 45918240, "step": 68140 }, { "epoch": 1.6647936872450102, "grad_norm": 0.339227557182312, "learning_rate": 1.6872091366076725e-06, "loss": 0.0487, "num_input_tokens_seen": 45922016, "step": 68145 }, { "epoch": 1.6649158380768574, "grad_norm": 0.7971973419189453, "learning_rate": 1.6871471833751e-06, "loss": 0.1104, "num_input_tokens_seen": 45924960, "step": 68150 }, { "epoch": 1.6650379889087046, "grad_norm": 12.896892547607422, "learning_rate": 1.6870852251454082e-06, "loss": 0.1008, "num_input_tokens_seen": 45928608, "step": 68155 }, { "epoch": 1.6651601397405518, "grad_norm": 0.27142223715782166, "learning_rate": 1.687023261919048e-06, "loss": 0.0479, "num_input_tokens_seen": 45932448, "step": 68160 }, { "epoch": 1.665282290572399, "grad_norm": 0.1016705185174942, "learning_rate": 1.6869612936964699e-06, "loss": 0.0546, "num_input_tokens_seen": 45935712, "step": 68165 }, { "epoch": 1.665404441404246, "grad_norm": 3.339890480041504, "learning_rate": 1.6868993204781242e-06, "loss": 0.0772, "num_input_tokens_seen": 45938912, "step": 68170 }, { "epoch": 1.6655265922360931, "grad_norm": 8.35922622680664, "learning_rate": 1.6868373422644623e-06, "loss": 0.095, "num_input_tokens_seen": 45942560, "step": 68175 }, { "epoch": 1.6656487430679403, "grad_norm": 22.813560485839844, "learning_rate": 1.6867753590559346e-06, "loss": 0.092, "num_input_tokens_seen": 45945632, "step": 68180 }, { "epoch": 1.6657708938997875, "grad_norm": 0.4119601249694824, "learning_rate": 1.6867133708529915e-06, "loss": 0.1687, "num_input_tokens_seen": 45949152, "step": 68185 }, { "epoch": 1.6658930447316345, "grad_norm": 7.913878440856934, "learning_rate": 1.686651377656084e-06, "loss": 0.1097, "num_input_tokens_seen": 45952800, "step": 68190 }, { "epoch": 1.6660151955634817, "grad_norm": 0.6191280484199524, "learning_rate": 1.6865893794656631e-06, "loss": 0.1661, "num_input_tokens_seen": 45955808, "step": 68195 }, { "epoch": 1.6661373463953288, "grad_norm": 1.6392009258270264, "learning_rate": 1.6865273762821794e-06, "loss": 0.0467, "num_input_tokens_seen": 45959648, "step": 68200 }, { "epoch": 1.666259497227176, "grad_norm": 0.5763128995895386, "learning_rate": 1.6864653681060841e-06, "loss": 0.0033, "num_input_tokens_seen": 45963040, "step": 68205 }, { "epoch": 1.6663816480590232, "grad_norm": 0.14941108226776123, "learning_rate": 1.686403354937828e-06, "loss": 0.1121, "num_input_tokens_seen": 45966432, "step": 68210 }, { "epoch": 1.6665037988908704, "grad_norm": 0.26819905638694763, "learning_rate": 1.6863413367778622e-06, "loss": 0.0397, "num_input_tokens_seen": 45969568, "step": 68215 }, { "epoch": 1.6666259497227176, "grad_norm": 0.2349829375743866, "learning_rate": 1.6862793136266376e-06, "loss": 0.1143, "num_input_tokens_seen": 45972640, "step": 68220 }, { "epoch": 1.6667481005545648, "grad_norm": 0.09882470965385437, "learning_rate": 1.686217285484605e-06, "loss": 0.0426, "num_input_tokens_seen": 45976352, "step": 68225 }, { "epoch": 1.666870251386412, "grad_norm": 18.198345184326172, "learning_rate": 1.6861552523522157e-06, "loss": 0.0423, "num_input_tokens_seen": 45979680, "step": 68230 }, { "epoch": 1.6669924022182592, "grad_norm": 101.0886459350586, "learning_rate": 1.6860932142299212e-06, "loss": 0.0519, "num_input_tokens_seen": 45983072, "step": 68235 }, { "epoch": 1.6671145530501064, "grad_norm": 18.422719955444336, "learning_rate": 1.6860311711181722e-06, "loss": 0.1198, "num_input_tokens_seen": 45986400, "step": 68240 }, { "epoch": 1.6672367038819536, "grad_norm": 0.1812196522951126, "learning_rate": 1.6859691230174198e-06, "loss": 0.0444, "num_input_tokens_seen": 45989408, "step": 68245 }, { "epoch": 1.6673588547138007, "grad_norm": 0.20358893275260925, "learning_rate": 1.6859070699281155e-06, "loss": 0.0011, "num_input_tokens_seen": 45993184, "step": 68250 }, { "epoch": 1.667481005545648, "grad_norm": 20.3399658203125, "learning_rate": 1.6858450118507107e-06, "loss": 0.0943, "num_input_tokens_seen": 45996832, "step": 68255 }, { "epoch": 1.667603156377495, "grad_norm": 0.04567122459411621, "learning_rate": 1.6857829487856563e-06, "loss": 0.0301, "num_input_tokens_seen": 46000352, "step": 68260 }, { "epoch": 1.667725307209342, "grad_norm": 63.65852737426758, "learning_rate": 1.6857208807334038e-06, "loss": 0.0598, "num_input_tokens_seen": 46003424, "step": 68265 }, { "epoch": 1.6678474580411893, "grad_norm": 0.3260882794857025, "learning_rate": 1.6856588076944048e-06, "loss": 0.0956, "num_input_tokens_seen": 46006816, "step": 68270 }, { "epoch": 1.6679696088730365, "grad_norm": 26.87339210510254, "learning_rate": 1.6855967296691104e-06, "loss": 0.0745, "num_input_tokens_seen": 46009888, "step": 68275 }, { "epoch": 1.6680917597048834, "grad_norm": 0.2553943395614624, "learning_rate": 1.6855346466579725e-06, "loss": 0.0364, "num_input_tokens_seen": 46013792, "step": 68280 }, { "epoch": 1.6682139105367306, "grad_norm": 0.247576043009758, "learning_rate": 1.6854725586614419e-06, "loss": 0.0034, "num_input_tokens_seen": 46017120, "step": 68285 }, { "epoch": 1.6683360613685778, "grad_norm": 0.5242090225219727, "learning_rate": 1.6854104656799707e-06, "loss": 0.0019, "num_input_tokens_seen": 46020256, "step": 68290 }, { "epoch": 1.668458212200425, "grad_norm": 2.042462110519409, "learning_rate": 1.6853483677140098e-06, "loss": 0.0043, "num_input_tokens_seen": 46023904, "step": 68295 }, { "epoch": 1.6685803630322722, "grad_norm": 0.25644490122795105, "learning_rate": 1.6852862647640116e-06, "loss": 0.1332, "num_input_tokens_seen": 46027232, "step": 68300 }, { "epoch": 1.6687025138641194, "grad_norm": 0.1754622459411621, "learning_rate": 1.6852241568304274e-06, "loss": 0.046, "num_input_tokens_seen": 46030560, "step": 68305 }, { "epoch": 1.6688246646959666, "grad_norm": 16.908388137817383, "learning_rate": 1.6851620439137087e-06, "loss": 0.0841, "num_input_tokens_seen": 46034208, "step": 68310 }, { "epoch": 1.6689468155278138, "grad_norm": 0.1532597690820694, "learning_rate": 1.6850999260143076e-06, "loss": 0.0816, "num_input_tokens_seen": 46037536, "step": 68315 }, { "epoch": 1.669068966359661, "grad_norm": 0.12587259709835052, "learning_rate": 1.6850378031326752e-06, "loss": 0.0013, "num_input_tokens_seen": 46040992, "step": 68320 }, { "epoch": 1.6691911171915081, "grad_norm": 10.260220527648926, "learning_rate": 1.6849756752692636e-06, "loss": 0.0786, "num_input_tokens_seen": 46044384, "step": 68325 }, { "epoch": 1.6693132680233553, "grad_norm": 0.05229521170258522, "learning_rate": 1.684913542424525e-06, "loss": 0.067, "num_input_tokens_seen": 46047904, "step": 68330 }, { "epoch": 1.6694354188552025, "grad_norm": 0.24571794271469116, "learning_rate": 1.6848514045989108e-06, "loss": 0.0374, "num_input_tokens_seen": 46051040, "step": 68335 }, { "epoch": 1.6695575696870497, "grad_norm": 12.776507377624512, "learning_rate": 1.6847892617928729e-06, "loss": 0.1782, "num_input_tokens_seen": 46054432, "step": 68340 }, { "epoch": 1.6696797205188967, "grad_norm": 57.233673095703125, "learning_rate": 1.6847271140068633e-06, "loss": 0.175, "num_input_tokens_seen": 46058080, "step": 68345 }, { "epoch": 1.6698018713507439, "grad_norm": 123.28487396240234, "learning_rate": 1.684664961241334e-06, "loss": 0.0545, "num_input_tokens_seen": 46061280, "step": 68350 }, { "epoch": 1.669924022182591, "grad_norm": 0.09082359075546265, "learning_rate": 1.684602803496737e-06, "loss": 0.2261, "num_input_tokens_seen": 46064864, "step": 68355 }, { "epoch": 1.6700461730144383, "grad_norm": 0.5304872393608093, "learning_rate": 1.684540640773524e-06, "loss": 0.0986, "num_input_tokens_seen": 46068000, "step": 68360 }, { "epoch": 1.6701683238462854, "grad_norm": 0.37393248081207275, "learning_rate": 1.6844784730721476e-06, "loss": 0.0021, "num_input_tokens_seen": 46071712, "step": 68365 }, { "epoch": 1.6702904746781324, "grad_norm": 268.9071350097656, "learning_rate": 1.6844163003930599e-06, "loss": 0.0439, "num_input_tokens_seen": 46075296, "step": 68370 }, { "epoch": 1.6704126255099796, "grad_norm": 0.05923445522785187, "learning_rate": 1.6843541227367121e-06, "loss": 0.0014, "num_input_tokens_seen": 46078624, "step": 68375 }, { "epoch": 1.6705347763418268, "grad_norm": 0.42169424891471863, "learning_rate": 1.6842919401035575e-06, "loss": 0.1804, "num_input_tokens_seen": 46081696, "step": 68380 }, { "epoch": 1.670656927173674, "grad_norm": 6.835901260375977, "learning_rate": 1.6842297524940477e-06, "loss": 0.112, "num_input_tokens_seen": 46085216, "step": 68385 }, { "epoch": 1.6707790780055212, "grad_norm": 0.5587440133094788, "learning_rate": 1.6841675599086354e-06, "loss": 0.0388, "num_input_tokens_seen": 46088608, "step": 68390 }, { "epoch": 1.6709012288373684, "grad_norm": 0.23931415379047394, "learning_rate": 1.6841053623477723e-06, "loss": 0.1337, "num_input_tokens_seen": 46091616, "step": 68395 }, { "epoch": 1.6710233796692155, "grad_norm": 0.36907368898391724, "learning_rate": 1.6840431598119112e-06, "loss": 0.0462, "num_input_tokens_seen": 46096864, "step": 68400 }, { "epoch": 1.6711455305010627, "grad_norm": 0.272148460149765, "learning_rate": 1.683980952301504e-06, "loss": 0.0923, "num_input_tokens_seen": 46100448, "step": 68405 }, { "epoch": 1.67126768133291, "grad_norm": 0.0677834302186966, "learning_rate": 1.6839187398170033e-06, "loss": 0.0824, "num_input_tokens_seen": 46103584, "step": 68410 }, { "epoch": 1.6713898321647571, "grad_norm": 0.13646988570690155, "learning_rate": 1.683856522358862e-06, "loss": 0.0901, "num_input_tokens_seen": 46106592, "step": 68415 }, { "epoch": 1.6715119829966043, "grad_norm": 0.26839596033096313, "learning_rate": 1.6837942999275318e-06, "loss": 0.0017, "num_input_tokens_seen": 46109920, "step": 68420 }, { "epoch": 1.6716341338284515, "grad_norm": 0.13362592458724976, "learning_rate": 1.6837320725234657e-06, "loss": 0.0331, "num_input_tokens_seen": 46113248, "step": 68425 }, { "epoch": 1.6717562846602987, "grad_norm": 38.74449157714844, "learning_rate": 1.6836698401471158e-06, "loss": 0.0887, "num_input_tokens_seen": 46116896, "step": 68430 }, { "epoch": 1.6718784354921457, "grad_norm": 293.8094482421875, "learning_rate": 1.6836076027989351e-06, "loss": 0.0547, "num_input_tokens_seen": 46120224, "step": 68435 }, { "epoch": 1.6720005863239928, "grad_norm": 10.379058837890625, "learning_rate": 1.683545360479376e-06, "loss": 0.0575, "num_input_tokens_seen": 46123616, "step": 68440 }, { "epoch": 1.67212273715584, "grad_norm": 0.09068046510219574, "learning_rate": 1.6834831131888914e-06, "loss": 0.0708, "num_input_tokens_seen": 46126560, "step": 68445 }, { "epoch": 1.6722448879876872, "grad_norm": 1.8615421056747437, "learning_rate": 1.6834208609279336e-06, "loss": 0.026, "num_input_tokens_seen": 46129888, "step": 68450 }, { "epoch": 1.6723670388195344, "grad_norm": 16.04012107849121, "learning_rate": 1.6833586036969556e-06, "loss": 0.1095, "num_input_tokens_seen": 46133664, "step": 68455 }, { "epoch": 1.6724891896513814, "grad_norm": 0.149423286318779, "learning_rate": 1.6832963414964098e-06, "loss": 0.0005, "num_input_tokens_seen": 46136608, "step": 68460 }, { "epoch": 1.6726113404832286, "grad_norm": 0.07547707855701447, "learning_rate": 1.6832340743267493e-06, "loss": 0.0005, "num_input_tokens_seen": 46139872, "step": 68465 }, { "epoch": 1.6727334913150758, "grad_norm": 33.1000862121582, "learning_rate": 1.683171802188427e-06, "loss": 0.1449, "num_input_tokens_seen": 46143200, "step": 68470 }, { "epoch": 1.672855642146923, "grad_norm": 0.10011862218379974, "learning_rate": 1.6831095250818956e-06, "loss": 0.0376, "num_input_tokens_seen": 46146720, "step": 68475 }, { "epoch": 1.6729777929787701, "grad_norm": 25.451860427856445, "learning_rate": 1.6830472430076076e-06, "loss": 0.1879, "num_input_tokens_seen": 46149984, "step": 68480 }, { "epoch": 1.6730999438106173, "grad_norm": 0.10208665579557419, "learning_rate": 1.6829849559660167e-06, "loss": 0.0003, "num_input_tokens_seen": 46153632, "step": 68485 }, { "epoch": 1.6732220946424645, "grad_norm": 0.20292365550994873, "learning_rate": 1.6829226639575756e-06, "loss": 0.0528, "num_input_tokens_seen": 46156768, "step": 68490 }, { "epoch": 1.6733442454743117, "grad_norm": 0.09113001823425293, "learning_rate": 1.6828603669827368e-06, "loss": 0.0007, "num_input_tokens_seen": 46159968, "step": 68495 }, { "epoch": 1.673466396306159, "grad_norm": 20.79863739013672, "learning_rate": 1.682798065041954e-06, "loss": 0.114, "num_input_tokens_seen": 46164320, "step": 68500 }, { "epoch": 1.673588547138006, "grad_norm": 0.12156138569116592, "learning_rate": 1.68273575813568e-06, "loss": 0.1651, "num_input_tokens_seen": 46167968, "step": 68505 }, { "epoch": 1.6737106979698533, "grad_norm": 0.05434705317020416, "learning_rate": 1.682673446264368e-06, "loss": 0.1471, "num_input_tokens_seen": 46170976, "step": 68510 }, { "epoch": 1.6738328488017005, "grad_norm": 71.84229278564453, "learning_rate": 1.682611129428471e-06, "loss": 0.0332, "num_input_tokens_seen": 46174432, "step": 68515 }, { "epoch": 1.6739549996335477, "grad_norm": 53.085418701171875, "learning_rate": 1.6825488076284424e-06, "loss": 0.0901, "num_input_tokens_seen": 46177888, "step": 68520 }, { "epoch": 1.6740771504653946, "grad_norm": 0.19759757816791534, "learning_rate": 1.682486480864735e-06, "loss": 0.0046, "num_input_tokens_seen": 46181408, "step": 68525 }, { "epoch": 1.6741993012972418, "grad_norm": 14.114509582519531, "learning_rate": 1.6824241491378025e-06, "loss": 0.1017, "num_input_tokens_seen": 46184480, "step": 68530 }, { "epoch": 1.674321452129089, "grad_norm": 0.03609864413738251, "learning_rate": 1.6823618124480984e-06, "loss": 0.0012, "num_input_tokens_seen": 46187616, "step": 68535 }, { "epoch": 1.6744436029609362, "grad_norm": 15.648370742797852, "learning_rate": 1.682299470796075e-06, "loss": 0.0433, "num_input_tokens_seen": 46191008, "step": 68540 }, { "epoch": 1.6745657537927832, "grad_norm": 2.687039613723755, "learning_rate": 1.6822371241821864e-06, "loss": 0.0571, "num_input_tokens_seen": 46194592, "step": 68545 }, { "epoch": 1.6746879046246304, "grad_norm": 0.2621817886829376, "learning_rate": 1.6821747726068865e-06, "loss": 0.1668, "num_input_tokens_seen": 46197856, "step": 68550 }, { "epoch": 1.6748100554564775, "grad_norm": 0.1592407524585724, "learning_rate": 1.6821124160706276e-06, "loss": 0.0703, "num_input_tokens_seen": 46202080, "step": 68555 }, { "epoch": 1.6749322062883247, "grad_norm": 0.02570989541709423, "learning_rate": 1.6820500545738642e-06, "loss": 0.0012, "num_input_tokens_seen": 46205600, "step": 68560 }, { "epoch": 1.675054357120172, "grad_norm": 0.15985862910747528, "learning_rate": 1.6819876881170491e-06, "loss": 0.0521, "num_input_tokens_seen": 46209248, "step": 68565 }, { "epoch": 1.6751765079520191, "grad_norm": 0.19796176254749298, "learning_rate": 1.6819253167006359e-06, "loss": 0.0331, "num_input_tokens_seen": 46212448, "step": 68570 }, { "epoch": 1.6752986587838663, "grad_norm": 0.055551838129758835, "learning_rate": 1.6818629403250787e-06, "loss": 0.0933, "num_input_tokens_seen": 46215392, "step": 68575 }, { "epoch": 1.6754208096157135, "grad_norm": 0.15789180994033813, "learning_rate": 1.6818005589908308e-06, "loss": 0.0253, "num_input_tokens_seen": 46218464, "step": 68580 }, { "epoch": 1.6755429604475607, "grad_norm": 8.036543846130371, "learning_rate": 1.681738172698346e-06, "loss": 0.1125, "num_input_tokens_seen": 46221984, "step": 68585 }, { "epoch": 1.6756651112794079, "grad_norm": 0.08947142213582993, "learning_rate": 1.6816757814480775e-06, "loss": 0.0936, "num_input_tokens_seen": 46224928, "step": 68590 }, { "epoch": 1.675787262111255, "grad_norm": 6.953040599822998, "learning_rate": 1.6816133852404795e-06, "loss": 0.0543, "num_input_tokens_seen": 46228192, "step": 68595 }, { "epoch": 1.6759094129431023, "grad_norm": 0.3942214548587799, "learning_rate": 1.6815509840760055e-06, "loss": 0.0585, "num_input_tokens_seen": 46231712, "step": 68600 }, { "epoch": 1.6760315637749494, "grad_norm": 19.978500366210938, "learning_rate": 1.6814885779551096e-06, "loss": 0.1653, "num_input_tokens_seen": 46234848, "step": 68605 }, { "epoch": 1.6761537146067966, "grad_norm": 0.3641318380832672, "learning_rate": 1.6814261668782454e-06, "loss": 0.111, "num_input_tokens_seen": 46238752, "step": 68610 }, { "epoch": 1.6762758654386436, "grad_norm": 0.5056087970733643, "learning_rate": 1.681363750845867e-06, "loss": 0.115, "num_input_tokens_seen": 46241952, "step": 68615 }, { "epoch": 1.6763980162704908, "grad_norm": 205.91683959960938, "learning_rate": 1.681301329858428e-06, "loss": 0.0271, "num_input_tokens_seen": 46245408, "step": 68620 }, { "epoch": 1.676520167102338, "grad_norm": 21.606521606445312, "learning_rate": 1.6812389039163824e-06, "loss": 0.1538, "num_input_tokens_seen": 46249184, "step": 68625 }, { "epoch": 1.6766423179341852, "grad_norm": 0.33925703167915344, "learning_rate": 1.6811764730201844e-06, "loss": 0.121, "num_input_tokens_seen": 46252512, "step": 68630 }, { "epoch": 1.6767644687660321, "grad_norm": 8.783360481262207, "learning_rate": 1.6811140371702876e-06, "loss": 0.2316, "num_input_tokens_seen": 46255648, "step": 68635 }, { "epoch": 1.6768866195978793, "grad_norm": 34.9462890625, "learning_rate": 1.6810515963671465e-06, "loss": 0.1531, "num_input_tokens_seen": 46259104, "step": 68640 }, { "epoch": 1.6770087704297265, "grad_norm": 18.75974464416504, "learning_rate": 1.680989150611215e-06, "loss": 0.1858, "num_input_tokens_seen": 46262368, "step": 68645 }, { "epoch": 1.6771309212615737, "grad_norm": 26.14114761352539, "learning_rate": 1.6809266999029475e-06, "loss": 0.0424, "num_input_tokens_seen": 46265184, "step": 68650 }, { "epoch": 1.677253072093421, "grad_norm": 4.71269416809082, "learning_rate": 1.6808642442427975e-06, "loss": 0.072, "num_input_tokens_seen": 46268576, "step": 68655 }, { "epoch": 1.677375222925268, "grad_norm": 0.7655762434005737, "learning_rate": 1.6808017836312198e-06, "loss": 0.0042, "num_input_tokens_seen": 46271840, "step": 68660 }, { "epoch": 1.6774973737571153, "grad_norm": 0.3685450553894043, "learning_rate": 1.6807393180686683e-06, "loss": 0.1212, "num_input_tokens_seen": 46274976, "step": 68665 }, { "epoch": 1.6776195245889625, "grad_norm": 0.5734411478042603, "learning_rate": 1.6806768475555973e-06, "loss": 0.1753, "num_input_tokens_seen": 46277856, "step": 68670 }, { "epoch": 1.6777416754208097, "grad_norm": 8.355131149291992, "learning_rate": 1.6806143720924616e-06, "loss": 0.2848, "num_input_tokens_seen": 46281696, "step": 68675 }, { "epoch": 1.6778638262526568, "grad_norm": 0.27669835090637207, "learning_rate": 1.6805518916797149e-06, "loss": 0.1093, "num_input_tokens_seen": 46284960, "step": 68680 }, { "epoch": 1.677985977084504, "grad_norm": 0.5218971967697144, "learning_rate": 1.6804894063178114e-06, "loss": 0.0304, "num_input_tokens_seen": 46288288, "step": 68685 }, { "epoch": 1.6781081279163512, "grad_norm": 24.64486312866211, "learning_rate": 1.6804269160072064e-06, "loss": 0.048, "num_input_tokens_seen": 46291616, "step": 68690 }, { "epoch": 1.6782302787481984, "grad_norm": 18.304548263549805, "learning_rate": 1.6803644207483535e-06, "loss": 0.143, "num_input_tokens_seen": 46294624, "step": 68695 }, { "epoch": 1.6783524295800456, "grad_norm": 0.279045432806015, "learning_rate": 1.6803019205417076e-06, "loss": 0.0056, "num_input_tokens_seen": 46298080, "step": 68700 }, { "epoch": 1.6784745804118926, "grad_norm": 0.15253521502017975, "learning_rate": 1.6802394153877236e-06, "loss": 0.1018, "num_input_tokens_seen": 46301472, "step": 68705 }, { "epoch": 1.6785967312437398, "grad_norm": 71.8766098022461, "learning_rate": 1.6801769052868553e-06, "loss": 0.1373, "num_input_tokens_seen": 46304544, "step": 68710 }, { "epoch": 1.678718882075587, "grad_norm": 7.808777332305908, "learning_rate": 1.6801143902395576e-06, "loss": 0.0873, "num_input_tokens_seen": 46307616, "step": 68715 }, { "epoch": 1.6788410329074341, "grad_norm": 0.20652924478054047, "learning_rate": 1.6800518702462851e-06, "loss": 0.038, "num_input_tokens_seen": 46311264, "step": 68720 }, { "epoch": 1.678963183739281, "grad_norm": 0.9998967051506042, "learning_rate": 1.6799893453074924e-06, "loss": 0.0029, "num_input_tokens_seen": 46314656, "step": 68725 }, { "epoch": 1.6790853345711283, "grad_norm": 1.2813893556594849, "learning_rate": 1.6799268154236346e-06, "loss": 0.0564, "num_input_tokens_seen": 46317856, "step": 68730 }, { "epoch": 1.6792074854029755, "grad_norm": 0.016885561868548393, "learning_rate": 1.679864280595166e-06, "loss": 0.0444, "num_input_tokens_seen": 46321248, "step": 68735 }, { "epoch": 1.6793296362348227, "grad_norm": 38.7974739074707, "learning_rate": 1.6798017408225414e-06, "loss": 0.1281, "num_input_tokens_seen": 46324256, "step": 68740 }, { "epoch": 1.6794517870666699, "grad_norm": 0.31258314847946167, "learning_rate": 1.6797391961062157e-06, "loss": 0.0977, "num_input_tokens_seen": 46327712, "step": 68745 }, { "epoch": 1.679573937898517, "grad_norm": 0.10686086118221283, "learning_rate": 1.6796766464466436e-06, "loss": 0.1268, "num_input_tokens_seen": 46330976, "step": 68750 }, { "epoch": 1.6796960887303642, "grad_norm": 1.2706555128097534, "learning_rate": 1.6796140918442803e-06, "loss": 0.0773, "num_input_tokens_seen": 46334624, "step": 68755 }, { "epoch": 1.6798182395622114, "grad_norm": 0.030615871772170067, "learning_rate": 1.6795515322995804e-06, "loss": 0.0809, "num_input_tokens_seen": 46338144, "step": 68760 }, { "epoch": 1.6799403903940586, "grad_norm": 30.333724975585938, "learning_rate": 1.679488967812999e-06, "loss": 0.1333, "num_input_tokens_seen": 46341216, "step": 68765 }, { "epoch": 1.6800625412259058, "grad_norm": 5.713530540466309, "learning_rate": 1.6794263983849913e-06, "loss": 0.1215, "num_input_tokens_seen": 46344800, "step": 68770 }, { "epoch": 1.680184692057753, "grad_norm": 47.80161666870117, "learning_rate": 1.6793638240160117e-06, "loss": 0.0794, "num_input_tokens_seen": 46347872, "step": 68775 }, { "epoch": 1.6803068428896002, "grad_norm": 9.830896377563477, "learning_rate": 1.679301244706516e-06, "loss": 0.1, "num_input_tokens_seen": 46351264, "step": 68780 }, { "epoch": 1.6804289937214474, "grad_norm": 9.594369888305664, "learning_rate": 1.6792386604569588e-06, "loss": 0.1632, "num_input_tokens_seen": 46354592, "step": 68785 }, { "epoch": 1.6805511445532946, "grad_norm": 0.4086272716522217, "learning_rate": 1.6791760712677955e-06, "loss": 0.0026, "num_input_tokens_seen": 46357856, "step": 68790 }, { "epoch": 1.6806732953851415, "grad_norm": 0.39797380566596985, "learning_rate": 1.6791134771394807e-06, "loss": 0.0588, "num_input_tokens_seen": 46360928, "step": 68795 }, { "epoch": 1.6807954462169887, "grad_norm": 8.402929306030273, "learning_rate": 1.6790508780724705e-06, "loss": 0.105, "num_input_tokens_seen": 46364192, "step": 68800 }, { "epoch": 1.680917597048836, "grad_norm": 212.35462951660156, "learning_rate": 1.6789882740672194e-06, "loss": 0.0054, "num_input_tokens_seen": 46367072, "step": 68805 }, { "epoch": 1.681039747880683, "grad_norm": 128.37506103515625, "learning_rate": 1.6789256651241832e-06, "loss": 0.056, "num_input_tokens_seen": 46370208, "step": 68810 }, { "epoch": 1.68116189871253, "grad_norm": 0.21154679358005524, "learning_rate": 1.6788630512438168e-06, "loss": 0.0143, "num_input_tokens_seen": 46374112, "step": 68815 }, { "epoch": 1.6812840495443773, "grad_norm": 27.879682540893555, "learning_rate": 1.6788004324265757e-06, "loss": 0.0289, "num_input_tokens_seen": 46377120, "step": 68820 }, { "epoch": 1.6814062003762245, "grad_norm": 0.17257338762283325, "learning_rate": 1.6787378086729152e-06, "loss": 0.1079, "num_input_tokens_seen": 46380256, "step": 68825 }, { "epoch": 1.6815283512080716, "grad_norm": 0.2479773759841919, "learning_rate": 1.678675179983291e-06, "loss": 0.0743, "num_input_tokens_seen": 46383392, "step": 68830 }, { "epoch": 1.6816505020399188, "grad_norm": 191.01602172851562, "learning_rate": 1.6786125463581585e-06, "loss": 0.1769, "num_input_tokens_seen": 46386592, "step": 68835 }, { "epoch": 1.681772652871766, "grad_norm": 18.10806655883789, "learning_rate": 1.6785499077979726e-06, "loss": 0.0417, "num_input_tokens_seen": 46389856, "step": 68840 }, { "epoch": 1.6818948037036132, "grad_norm": 0.41562801599502563, "learning_rate": 1.6784872643031896e-06, "loss": 0.1182, "num_input_tokens_seen": 46393376, "step": 68845 }, { "epoch": 1.6820169545354604, "grad_norm": 0.07078684866428375, "learning_rate": 1.6784246158742643e-06, "loss": 0.0869, "num_input_tokens_seen": 46396512, "step": 68850 }, { "epoch": 1.6821391053673076, "grad_norm": 0.12547147274017334, "learning_rate": 1.678361962511653e-06, "loss": 0.0886, "num_input_tokens_seen": 46399776, "step": 68855 }, { "epoch": 1.6822612561991548, "grad_norm": 0.07189838588237762, "learning_rate": 1.6782993042158112e-06, "loss": 0.0896, "num_input_tokens_seen": 46403104, "step": 68860 }, { "epoch": 1.682383407031002, "grad_norm": 10.442744255065918, "learning_rate": 1.678236640987194e-06, "loss": 0.1263, "num_input_tokens_seen": 46406368, "step": 68865 }, { "epoch": 1.6825055578628492, "grad_norm": 9.556235313415527, "learning_rate": 1.6781739728262579e-06, "loss": 0.1121, "num_input_tokens_seen": 46409632, "step": 68870 }, { "epoch": 1.6826277086946964, "grad_norm": 0.1713194102048874, "learning_rate": 1.6781112997334582e-06, "loss": 0.0018, "num_input_tokens_seen": 46413344, "step": 68875 }, { "epoch": 1.6827498595265433, "grad_norm": 14.221380233764648, "learning_rate": 1.6780486217092507e-06, "loss": 0.1148, "num_input_tokens_seen": 46416672, "step": 68880 }, { "epoch": 1.6828720103583905, "grad_norm": 0.2144031971693039, "learning_rate": 1.677985938754091e-06, "loss": 0.0273, "num_input_tokens_seen": 46420000, "step": 68885 }, { "epoch": 1.6829941611902377, "grad_norm": 0.7320120334625244, "learning_rate": 1.6779232508684355e-06, "loss": 0.0751, "num_input_tokens_seen": 46423840, "step": 68890 }, { "epoch": 1.683116312022085, "grad_norm": 19.1189022064209, "learning_rate": 1.6778605580527398e-06, "loss": 0.1406, "num_input_tokens_seen": 46427552, "step": 68895 }, { "epoch": 1.683238462853932, "grad_norm": 0.19422592222690582, "learning_rate": 1.6777978603074595e-06, "loss": 0.1186, "num_input_tokens_seen": 46431008, "step": 68900 }, { "epoch": 1.683360613685779, "grad_norm": 22.207813262939453, "learning_rate": 1.6777351576330512e-06, "loss": 0.1768, "num_input_tokens_seen": 46434400, "step": 68905 }, { "epoch": 1.6834827645176262, "grad_norm": 8.345748901367188, "learning_rate": 1.6776724500299704e-06, "loss": 0.0808, "num_input_tokens_seen": 46437536, "step": 68910 }, { "epoch": 1.6836049153494734, "grad_norm": 1.5099958181381226, "learning_rate": 1.6776097374986732e-06, "loss": 0.0464, "num_input_tokens_seen": 46440608, "step": 68915 }, { "epoch": 1.6837270661813206, "grad_norm": 0.21554777026176453, "learning_rate": 1.6775470200396159e-06, "loss": 0.1128, "num_input_tokens_seen": 46443872, "step": 68920 }, { "epoch": 1.6838492170131678, "grad_norm": 1.5570076704025269, "learning_rate": 1.6774842976532542e-06, "loss": 0.1164, "num_input_tokens_seen": 46447072, "step": 68925 }, { "epoch": 1.683971367845015, "grad_norm": 0.06811830401420593, "learning_rate": 1.6774215703400447e-06, "loss": 0.0688, "num_input_tokens_seen": 46450656, "step": 68930 }, { "epoch": 1.6840935186768622, "grad_norm": 24.41603660583496, "learning_rate": 1.677358838100443e-06, "loss": 0.1232, "num_input_tokens_seen": 46454048, "step": 68935 }, { "epoch": 1.6842156695087094, "grad_norm": 10.065936088562012, "learning_rate": 1.6772961009349063e-06, "loss": 0.1119, "num_input_tokens_seen": 46457440, "step": 68940 }, { "epoch": 1.6843378203405566, "grad_norm": 1.9582791328430176, "learning_rate": 1.6772333588438893e-06, "loss": 0.0454, "num_input_tokens_seen": 46461024, "step": 68945 }, { "epoch": 1.6844599711724038, "grad_norm": 22.67094612121582, "learning_rate": 1.67717061182785e-06, "loss": 0.1713, "num_input_tokens_seen": 46464096, "step": 68950 }, { "epoch": 1.684582122004251, "grad_norm": 20.608978271484375, "learning_rate": 1.6771078598872435e-06, "loss": 0.0426, "num_input_tokens_seen": 46467488, "step": 68955 }, { "epoch": 1.6847042728360981, "grad_norm": 0.4787451922893524, "learning_rate": 1.6770451030225267e-06, "loss": 0.0021, "num_input_tokens_seen": 46471200, "step": 68960 }, { "epoch": 1.6848264236679453, "grad_norm": 0.26517850160598755, "learning_rate": 1.6769823412341553e-06, "loss": 0.0053, "num_input_tokens_seen": 46474592, "step": 68965 }, { "epoch": 1.6849485744997923, "grad_norm": 58.144775390625, "learning_rate": 1.6769195745225866e-06, "loss": 0.0448, "num_input_tokens_seen": 46478048, "step": 68970 }, { "epoch": 1.6850707253316395, "grad_norm": 4.901430606842041, "learning_rate": 1.6768568028882767e-06, "loss": 0.0328, "num_input_tokens_seen": 46482080, "step": 68975 }, { "epoch": 1.6851928761634867, "grad_norm": 0.4926772117614746, "learning_rate": 1.6767940263316817e-06, "loss": 0.0858, "num_input_tokens_seen": 46485472, "step": 68980 }, { "epoch": 1.6853150269953339, "grad_norm": 0.352978378534317, "learning_rate": 1.676731244853259e-06, "loss": 0.0376, "num_input_tokens_seen": 46488736, "step": 68985 }, { "epoch": 1.685437177827181, "grad_norm": 12.455333709716797, "learning_rate": 1.6766684584534647e-06, "loss": 0.0457, "num_input_tokens_seen": 46492320, "step": 68990 }, { "epoch": 1.685559328659028, "grad_norm": 25.381471633911133, "learning_rate": 1.6766056671327551e-06, "loss": 0.0196, "num_input_tokens_seen": 46495520, "step": 68995 }, { "epoch": 1.6856814794908752, "grad_norm": 19.42300033569336, "learning_rate": 1.6765428708915871e-06, "loss": 0.1296, "num_input_tokens_seen": 46498848, "step": 69000 }, { "epoch": 1.6858036303227224, "grad_norm": 0.03760528564453125, "learning_rate": 1.6764800697304172e-06, "loss": 0.0889, "num_input_tokens_seen": 46502496, "step": 69005 }, { "epoch": 1.6859257811545696, "grad_norm": 14.339239120483398, "learning_rate": 1.6764172636497026e-06, "loss": 0.1051, "num_input_tokens_seen": 46506336, "step": 69010 }, { "epoch": 1.6860479319864168, "grad_norm": 0.08472023904323578, "learning_rate": 1.6763544526499e-06, "loss": 0.0354, "num_input_tokens_seen": 46509856, "step": 69015 }, { "epoch": 1.686170082818264, "grad_norm": 12.50338363647461, "learning_rate": 1.6762916367314651e-06, "loss": 0.1313, "num_input_tokens_seen": 46513120, "step": 69020 }, { "epoch": 1.6862922336501112, "grad_norm": 0.1388133466243744, "learning_rate": 1.6762288158948562e-06, "loss": 0.0722, "num_input_tokens_seen": 46516704, "step": 69025 }, { "epoch": 1.6864143844819584, "grad_norm": 0.10646776854991913, "learning_rate": 1.6761659901405291e-06, "loss": 0.0932, "num_input_tokens_seen": 46520288, "step": 69030 }, { "epoch": 1.6865365353138055, "grad_norm": 35.494781494140625, "learning_rate": 1.6761031594689414e-06, "loss": 0.1082, "num_input_tokens_seen": 46523552, "step": 69035 }, { "epoch": 1.6866586861456527, "grad_norm": 1.7615808248519897, "learning_rate": 1.6760403238805494e-06, "loss": 0.0745, "num_input_tokens_seen": 46527200, "step": 69040 }, { "epoch": 1.6867808369775, "grad_norm": 0.13375285267829895, "learning_rate": 1.6759774833758104e-06, "loss": 0.0354, "num_input_tokens_seen": 46530528, "step": 69045 }, { "epoch": 1.686902987809347, "grad_norm": 0.13198687136173248, "learning_rate": 1.6759146379551812e-06, "loss": 0.0381, "num_input_tokens_seen": 46533856, "step": 69050 }, { "epoch": 1.6870251386411943, "grad_norm": 0.4579293727874756, "learning_rate": 1.675851787619119e-06, "loss": 0.0202, "num_input_tokens_seen": 46537184, "step": 69055 }, { "epoch": 1.6871472894730413, "grad_norm": 16.71668815612793, "learning_rate": 1.6757889323680811e-06, "loss": 0.0344, "num_input_tokens_seen": 46540640, "step": 69060 }, { "epoch": 1.6872694403048885, "grad_norm": 11.475341796875, "learning_rate": 1.675726072202524e-06, "loss": 0.1578, "num_input_tokens_seen": 46544096, "step": 69065 }, { "epoch": 1.6873915911367356, "grad_norm": 25.129030227661133, "learning_rate": 1.6756632071229053e-06, "loss": 0.1053, "num_input_tokens_seen": 46547168, "step": 69070 }, { "epoch": 1.6875137419685828, "grad_norm": 0.5593408942222595, "learning_rate": 1.6756003371296822e-06, "loss": 0.0037, "num_input_tokens_seen": 46550624, "step": 69075 }, { "epoch": 1.68763589280043, "grad_norm": 0.16857105493545532, "learning_rate": 1.6755374622233114e-06, "loss": 0.1352, "num_input_tokens_seen": 46554848, "step": 69080 }, { "epoch": 1.687758043632277, "grad_norm": 0.42032769322395325, "learning_rate": 1.6754745824042505e-06, "loss": 0.078, "num_input_tokens_seen": 46558112, "step": 69085 }, { "epoch": 1.6878801944641242, "grad_norm": 43.47393798828125, "learning_rate": 1.675411697672957e-06, "loss": 0.0805, "num_input_tokens_seen": 46561056, "step": 69090 }, { "epoch": 1.6880023452959714, "grad_norm": 0.16293494403362274, "learning_rate": 1.6753488080298877e-06, "loss": 0.0913, "num_input_tokens_seen": 46564128, "step": 69095 }, { "epoch": 1.6881244961278186, "grad_norm": 0.2841865122318268, "learning_rate": 1.6752859134755003e-06, "loss": 0.1935, "num_input_tokens_seen": 46567584, "step": 69100 }, { "epoch": 1.6882466469596658, "grad_norm": 0.5506531596183777, "learning_rate": 1.6752230140102522e-06, "loss": 0.0271, "num_input_tokens_seen": 46571104, "step": 69105 }, { "epoch": 1.688368797791513, "grad_norm": 0.11144746840000153, "learning_rate": 1.6751601096346006e-06, "loss": 0.0652, "num_input_tokens_seen": 46574624, "step": 69110 }, { "epoch": 1.6884909486233601, "grad_norm": 0.2003175914287567, "learning_rate": 1.675097200349003e-06, "loss": 0.0684, "num_input_tokens_seen": 46578144, "step": 69115 }, { "epoch": 1.6886130994552073, "grad_norm": 0.19120670855045319, "learning_rate": 1.6750342861539174e-06, "loss": 0.0776, "num_input_tokens_seen": 46581600, "step": 69120 }, { "epoch": 1.6887352502870545, "grad_norm": 0.9610084295272827, "learning_rate": 1.6749713670498007e-06, "loss": 0.0025, "num_input_tokens_seen": 46584480, "step": 69125 }, { "epoch": 1.6888574011189017, "grad_norm": 10.52871322631836, "learning_rate": 1.6749084430371103e-06, "loss": 0.0582, "num_input_tokens_seen": 46588192, "step": 69130 }, { "epoch": 1.688979551950749, "grad_norm": 69.1805648803711, "learning_rate": 1.6748455141163048e-06, "loss": 0.0433, "num_input_tokens_seen": 46591712, "step": 69135 }, { "epoch": 1.689101702782596, "grad_norm": 0.05372300744056702, "learning_rate": 1.6747825802878408e-06, "loss": 0.0615, "num_input_tokens_seen": 46594912, "step": 69140 }, { "epoch": 1.6892238536144433, "grad_norm": 0.12427949160337448, "learning_rate": 1.6747196415521768e-06, "loss": 0.0818, "num_input_tokens_seen": 46598432, "step": 69145 }, { "epoch": 1.6893460044462902, "grad_norm": 0.06056433171033859, "learning_rate": 1.6746566979097697e-06, "loss": 0.1306, "num_input_tokens_seen": 46601504, "step": 69150 }, { "epoch": 1.6894681552781374, "grad_norm": 23.436250686645508, "learning_rate": 1.6745937493610776e-06, "loss": 0.0447, "num_input_tokens_seen": 46605024, "step": 69155 }, { "epoch": 1.6895903061099846, "grad_norm": 1.130778193473816, "learning_rate": 1.6745307959065584e-06, "loss": 0.002, "num_input_tokens_seen": 46608416, "step": 69160 }, { "epoch": 1.6897124569418318, "grad_norm": 15.070996284484863, "learning_rate": 1.6744678375466697e-06, "loss": 0.1776, "num_input_tokens_seen": 46612256, "step": 69165 }, { "epoch": 1.6898346077736788, "grad_norm": 0.3131832480430603, "learning_rate": 1.6744048742818698e-06, "loss": 0.0016, "num_input_tokens_seen": 46615776, "step": 69170 }, { "epoch": 1.689956758605526, "grad_norm": 0.5031534433364868, "learning_rate": 1.674341906112616e-06, "loss": 0.1415, "num_input_tokens_seen": 46618912, "step": 69175 }, { "epoch": 1.6900789094373732, "grad_norm": 0.9567481279373169, "learning_rate": 1.6742789330393668e-06, "loss": 0.0341, "num_input_tokens_seen": 46622496, "step": 69180 }, { "epoch": 1.6902010602692203, "grad_norm": 19.70442008972168, "learning_rate": 1.6742159550625794e-06, "loss": 0.0437, "num_input_tokens_seen": 46625696, "step": 69185 }, { "epoch": 1.6903232111010675, "grad_norm": 27.411558151245117, "learning_rate": 1.6741529721827123e-06, "loss": 0.1078, "num_input_tokens_seen": 46629088, "step": 69190 }, { "epoch": 1.6904453619329147, "grad_norm": 51.215877532958984, "learning_rate": 1.6740899844002238e-06, "loss": 0.3351, "num_input_tokens_seen": 46632416, "step": 69195 }, { "epoch": 1.690567512764762, "grad_norm": 0.056414637714624405, "learning_rate": 1.6740269917155715e-06, "loss": 0.1243, "num_input_tokens_seen": 46635872, "step": 69200 }, { "epoch": 1.690689663596609, "grad_norm": 43.76165771484375, "learning_rate": 1.6739639941292134e-06, "loss": 0.1398, "num_input_tokens_seen": 46639264, "step": 69205 }, { "epoch": 1.6908118144284563, "grad_norm": 0.24457961320877075, "learning_rate": 1.673900991641608e-06, "loss": 0.1354, "num_input_tokens_seen": 46642720, "step": 69210 }, { "epoch": 1.6909339652603035, "grad_norm": 0.11181172728538513, "learning_rate": 1.6738379842532134e-06, "loss": 0.1438, "num_input_tokens_seen": 46646368, "step": 69215 }, { "epoch": 1.6910561160921507, "grad_norm": 0.517985999584198, "learning_rate": 1.6737749719644877e-06, "loss": 0.0384, "num_input_tokens_seen": 46650208, "step": 69220 }, { "epoch": 1.6911782669239979, "grad_norm": 64.62832641601562, "learning_rate": 1.673711954775889e-06, "loss": 0.1072, "num_input_tokens_seen": 46653216, "step": 69225 }, { "epoch": 1.691300417755845, "grad_norm": 0.5788198113441467, "learning_rate": 1.673648932687876e-06, "loss": 0.1493, "num_input_tokens_seen": 46656544, "step": 69230 }, { "epoch": 1.6914225685876922, "grad_norm": 0.27993226051330566, "learning_rate": 1.6735859057009068e-06, "loss": 0.0677, "num_input_tokens_seen": 46659808, "step": 69235 }, { "epoch": 1.6915447194195392, "grad_norm": 48.72805404663086, "learning_rate": 1.6735228738154397e-06, "loss": 0.1851, "num_input_tokens_seen": 46663008, "step": 69240 }, { "epoch": 1.6916668702513864, "grad_norm": 104.7187728881836, "learning_rate": 1.6734598370319328e-06, "loss": 0.0098, "num_input_tokens_seen": 46666144, "step": 69245 }, { "epoch": 1.6917890210832336, "grad_norm": 0.23837001621723175, "learning_rate": 1.673396795350845e-06, "loss": 0.1005, "num_input_tokens_seen": 46669984, "step": 69250 }, { "epoch": 1.6919111719150808, "grad_norm": 78.72986602783203, "learning_rate": 1.6733337487726346e-06, "loss": 0.0059, "num_input_tokens_seen": 46673248, "step": 69255 }, { "epoch": 1.6920333227469277, "grad_norm": 0.32147687673568726, "learning_rate": 1.67327069729776e-06, "loss": 0.0398, "num_input_tokens_seen": 46676640, "step": 69260 }, { "epoch": 1.692155473578775, "grad_norm": 0.1801147311925888, "learning_rate": 1.6732076409266802e-06, "loss": 0.0572, "num_input_tokens_seen": 46680352, "step": 69265 }, { "epoch": 1.6922776244106221, "grad_norm": 40.23493576049805, "learning_rate": 1.673144579659853e-06, "loss": 0.095, "num_input_tokens_seen": 46684192, "step": 69270 }, { "epoch": 1.6923997752424693, "grad_norm": 0.26882249116897583, "learning_rate": 1.6730815134977374e-06, "loss": 0.0931, "num_input_tokens_seen": 46687200, "step": 69275 }, { "epoch": 1.6925219260743165, "grad_norm": 25.242671966552734, "learning_rate": 1.6730184424407922e-06, "loss": 0.0697, "num_input_tokens_seen": 46690912, "step": 69280 }, { "epoch": 1.6926440769061637, "grad_norm": 34.11570739746094, "learning_rate": 1.6729553664894756e-06, "loss": 0.0688, "num_input_tokens_seen": 46694304, "step": 69285 }, { "epoch": 1.6927662277380109, "grad_norm": 15.972042083740234, "learning_rate": 1.6728922856442465e-06, "loss": 0.0371, "num_input_tokens_seen": 46697632, "step": 69290 }, { "epoch": 1.692888378569858, "grad_norm": 0.31541693210601807, "learning_rate": 1.672829199905564e-06, "loss": 0.0344, "num_input_tokens_seen": 46700960, "step": 69295 }, { "epoch": 1.6930105294017053, "grad_norm": 22.75095558166504, "learning_rate": 1.6727661092738865e-06, "loss": 0.1221, "num_input_tokens_seen": 46704032, "step": 69300 }, { "epoch": 1.6931326802335525, "grad_norm": 21.048616409301758, "learning_rate": 1.6727030137496728e-06, "loss": 0.0472, "num_input_tokens_seen": 46707424, "step": 69305 }, { "epoch": 1.6932548310653996, "grad_norm": 74.95085144042969, "learning_rate": 1.672639913333382e-06, "loss": 0.1567, "num_input_tokens_seen": 46710624, "step": 69310 }, { "epoch": 1.6933769818972468, "grad_norm": 0.23408760130405426, "learning_rate": 1.6725768080254726e-06, "loss": 0.0425, "num_input_tokens_seen": 46714208, "step": 69315 }, { "epoch": 1.693499132729094, "grad_norm": 0.7480437755584717, "learning_rate": 1.6725136978264038e-06, "loss": 0.0015, "num_input_tokens_seen": 46717536, "step": 69320 }, { "epoch": 1.6936212835609412, "grad_norm": 0.23599670827388763, "learning_rate": 1.6724505827366349e-06, "loss": 0.0483, "num_input_tokens_seen": 46720736, "step": 69325 }, { "epoch": 1.6937434343927882, "grad_norm": 0.039702676236629486, "learning_rate": 1.6723874627566242e-06, "loss": 0.0668, "num_input_tokens_seen": 46724192, "step": 69330 }, { "epoch": 1.6938655852246354, "grad_norm": 1.4262231588363647, "learning_rate": 1.672324337886831e-06, "loss": 0.0038, "num_input_tokens_seen": 46727904, "step": 69335 }, { "epoch": 1.6939877360564826, "grad_norm": 0.09134913980960846, "learning_rate": 1.6722612081277143e-06, "loss": 0.1054, "num_input_tokens_seen": 46730912, "step": 69340 }, { "epoch": 1.6941098868883298, "grad_norm": 0.017633700743317604, "learning_rate": 1.6721980734797334e-06, "loss": 0.0426, "num_input_tokens_seen": 46734048, "step": 69345 }, { "epoch": 1.6942320377201767, "grad_norm": 0.31877022981643677, "learning_rate": 1.6721349339433472e-06, "loss": 0.0033, "num_input_tokens_seen": 46737184, "step": 69350 }, { "epoch": 1.694354188552024, "grad_norm": 18.597759246826172, "learning_rate": 1.672071789519015e-06, "loss": 0.0835, "num_input_tokens_seen": 46740384, "step": 69355 }, { "epoch": 1.694476339383871, "grad_norm": 71.34232330322266, "learning_rate": 1.672008640207196e-06, "loss": 0.1602, "num_input_tokens_seen": 46743328, "step": 69360 }, { "epoch": 1.6945984902157183, "grad_norm": 0.0754992663860321, "learning_rate": 1.6719454860083495e-06, "loss": 0.0006, "num_input_tokens_seen": 46746464, "step": 69365 }, { "epoch": 1.6947206410475655, "grad_norm": 20.865156173706055, "learning_rate": 1.6718823269229348e-06, "loss": 0.1186, "num_input_tokens_seen": 46749536, "step": 69370 }, { "epoch": 1.6948427918794127, "grad_norm": 0.1701429784297943, "learning_rate": 1.6718191629514112e-06, "loss": 0.0287, "num_input_tokens_seen": 46752800, "step": 69375 }, { "epoch": 1.6949649427112599, "grad_norm": 0.1040308028459549, "learning_rate": 1.6717559940942373e-06, "loss": 0.0619, "num_input_tokens_seen": 46756256, "step": 69380 }, { "epoch": 1.695087093543107, "grad_norm": 1.0910602807998657, "learning_rate": 1.6716928203518736e-06, "loss": 0.1257, "num_input_tokens_seen": 46760096, "step": 69385 }, { "epoch": 1.6952092443749542, "grad_norm": 0.39105257391929626, "learning_rate": 1.671629641724779e-06, "loss": 0.1073, "num_input_tokens_seen": 46763616, "step": 69390 }, { "epoch": 1.6953313952068014, "grad_norm": 0.020331593230366707, "learning_rate": 1.671566458213413e-06, "loss": 0.0021, "num_input_tokens_seen": 46767328, "step": 69395 }, { "epoch": 1.6954535460386486, "grad_norm": 1.1094465255737305, "learning_rate": 1.6715032698182352e-06, "loss": 0.0018, "num_input_tokens_seen": 46770912, "step": 69400 }, { "epoch": 1.6955756968704958, "grad_norm": 17.54354476928711, "learning_rate": 1.6714400765397047e-06, "loss": 0.2174, "num_input_tokens_seen": 46774816, "step": 69405 }, { "epoch": 1.695697847702343, "grad_norm": 180.63446044921875, "learning_rate": 1.6713768783782815e-06, "loss": 0.0606, "num_input_tokens_seen": 46777888, "step": 69410 }, { "epoch": 1.69581999853419, "grad_norm": 72.77506256103516, "learning_rate": 1.6713136753344253e-06, "loss": 0.1072, "num_input_tokens_seen": 46781280, "step": 69415 }, { "epoch": 1.6959421493660372, "grad_norm": 0.4094438850879669, "learning_rate": 1.6712504674085951e-06, "loss": 0.0504, "num_input_tokens_seen": 46784800, "step": 69420 }, { "epoch": 1.6960643001978843, "grad_norm": 0.544030487537384, "learning_rate": 1.6711872546012512e-06, "loss": 0.0243, "num_input_tokens_seen": 46788064, "step": 69425 }, { "epoch": 1.6961864510297315, "grad_norm": 24.769922256469727, "learning_rate": 1.671124036912853e-06, "loss": 0.2521, "num_input_tokens_seen": 46791904, "step": 69430 }, { "epoch": 1.6963086018615787, "grad_norm": 0.060142744332551956, "learning_rate": 1.6710608143438606e-06, "loss": 0.0017, "num_input_tokens_seen": 46795168, "step": 69435 }, { "epoch": 1.6964307526934257, "grad_norm": 0.5809279084205627, "learning_rate": 1.670997586894733e-06, "loss": 0.1179, "num_input_tokens_seen": 46798944, "step": 69440 }, { "epoch": 1.6965529035252729, "grad_norm": 1.001853346824646, "learning_rate": 1.6709343545659307e-06, "loss": 0.0248, "num_input_tokens_seen": 46801952, "step": 69445 }, { "epoch": 1.69667505435712, "grad_norm": 0.9293713569641113, "learning_rate": 1.670871117357913e-06, "loss": 0.0537, "num_input_tokens_seen": 46805216, "step": 69450 }, { "epoch": 1.6967972051889673, "grad_norm": 0.30881839990615845, "learning_rate": 1.6708078752711408e-06, "loss": 0.0364, "num_input_tokens_seen": 46809120, "step": 69455 }, { "epoch": 1.6969193560208145, "grad_norm": 0.004009113647043705, "learning_rate": 1.6707446283060727e-06, "loss": 0.0009, "num_input_tokens_seen": 46812384, "step": 69460 }, { "epoch": 1.6970415068526616, "grad_norm": 0.1301659345626831, "learning_rate": 1.6706813764631696e-06, "loss": 0.0731, "num_input_tokens_seen": 46815456, "step": 69465 }, { "epoch": 1.6971636576845088, "grad_norm": 20.536996841430664, "learning_rate": 1.6706181197428908e-06, "loss": 0.0398, "num_input_tokens_seen": 46818592, "step": 69470 }, { "epoch": 1.697285808516356, "grad_norm": 11.145803451538086, "learning_rate": 1.6705548581456967e-06, "loss": 0.1567, "num_input_tokens_seen": 46822048, "step": 69475 }, { "epoch": 1.6974079593482032, "grad_norm": 0.12410733848810196, "learning_rate": 1.6704915916720474e-06, "loss": 0.0009, "num_input_tokens_seen": 46825568, "step": 69480 }, { "epoch": 1.6975301101800504, "grad_norm": 1.088709831237793, "learning_rate": 1.670428320322403e-06, "loss": 0.0305, "num_input_tokens_seen": 46828768, "step": 69485 }, { "epoch": 1.6976522610118976, "grad_norm": 0.5592111349105835, "learning_rate": 1.6703650440972235e-06, "loss": 0.031, "num_input_tokens_seen": 46831840, "step": 69490 }, { "epoch": 1.6977744118437448, "grad_norm": 18.33035659790039, "learning_rate": 1.670301762996969e-06, "loss": 0.0762, "num_input_tokens_seen": 46835104, "step": 69495 }, { "epoch": 1.697896562675592, "grad_norm": 9.992659568786621, "learning_rate": 1.6702384770220998e-06, "loss": 0.1448, "num_input_tokens_seen": 46838432, "step": 69500 }, { "epoch": 1.698018713507439, "grad_norm": 0.07260085642337799, "learning_rate": 1.6701751861730763e-06, "loss": 0.1469, "num_input_tokens_seen": 46841632, "step": 69505 }, { "epoch": 1.6981408643392861, "grad_norm": 50.94322204589844, "learning_rate": 1.6701118904503581e-06, "loss": 0.1473, "num_input_tokens_seen": 46845088, "step": 69510 }, { "epoch": 1.6982630151711333, "grad_norm": 0.3196463882923126, "learning_rate": 1.6700485898544067e-06, "loss": 0.1444, "num_input_tokens_seen": 46848288, "step": 69515 }, { "epoch": 1.6983851660029805, "grad_norm": 12.756587028503418, "learning_rate": 1.6699852843856813e-06, "loss": 0.0402, "num_input_tokens_seen": 46851552, "step": 69520 }, { "epoch": 1.6985073168348277, "grad_norm": 0.03349412605166435, "learning_rate": 1.6699219740446426e-06, "loss": 0.1171, "num_input_tokens_seen": 46854880, "step": 69525 }, { "epoch": 1.6986294676666747, "grad_norm": 9.584224700927734, "learning_rate": 1.6698586588317515e-06, "loss": 0.0676, "num_input_tokens_seen": 46858208, "step": 69530 }, { "epoch": 1.6987516184985219, "grad_norm": 0.9124466776847839, "learning_rate": 1.669795338747468e-06, "loss": 0.0055, "num_input_tokens_seen": 46861792, "step": 69535 }, { "epoch": 1.698873769330369, "grad_norm": 40.700469970703125, "learning_rate": 1.6697320137922524e-06, "loss": 0.0251, "num_input_tokens_seen": 46865184, "step": 69540 }, { "epoch": 1.6989959201622162, "grad_norm": 30.935789108276367, "learning_rate": 1.6696686839665655e-06, "loss": 0.1008, "num_input_tokens_seen": 46868512, "step": 69545 }, { "epoch": 1.6991180709940634, "grad_norm": 0.03304780647158623, "learning_rate": 1.669605349270868e-06, "loss": 0.0005, "num_input_tokens_seen": 46871712, "step": 69550 }, { "epoch": 1.6992402218259106, "grad_norm": 11.477385520935059, "learning_rate": 1.66954200970562e-06, "loss": 0.0801, "num_input_tokens_seen": 46875040, "step": 69555 }, { "epoch": 1.6993623726577578, "grad_norm": 0.036940060555934906, "learning_rate": 1.6694786652712827e-06, "loss": 0.0201, "num_input_tokens_seen": 46878432, "step": 69560 }, { "epoch": 1.699484523489605, "grad_norm": 0.2850690484046936, "learning_rate": 1.6694153159683162e-06, "loss": 0.1146, "num_input_tokens_seen": 46881248, "step": 69565 }, { "epoch": 1.6996066743214522, "grad_norm": 0.2382366955280304, "learning_rate": 1.6693519617971816e-06, "loss": 0.1317, "num_input_tokens_seen": 46885408, "step": 69570 }, { "epoch": 1.6997288251532994, "grad_norm": 0.17419885098934174, "learning_rate": 1.6692886027583397e-06, "loss": 0.0317, "num_input_tokens_seen": 46888928, "step": 69575 }, { "epoch": 1.6998509759851466, "grad_norm": 0.36933374404907227, "learning_rate": 1.669225238852251e-06, "loss": 0.1065, "num_input_tokens_seen": 46892320, "step": 69580 }, { "epoch": 1.6999731268169938, "grad_norm": 3.722100019454956, "learning_rate": 1.6691618700793763e-06, "loss": 0.2065, "num_input_tokens_seen": 46895648, "step": 69585 }, { "epoch": 1.700095277648841, "grad_norm": 20.817859649658203, "learning_rate": 1.6690984964401764e-06, "loss": 0.1437, "num_input_tokens_seen": 46898784, "step": 69590 }, { "epoch": 1.700217428480688, "grad_norm": 23.722640991210938, "learning_rate": 1.6690351179351123e-06, "loss": 0.1236, "num_input_tokens_seen": 46901728, "step": 69595 }, { "epoch": 1.700339579312535, "grad_norm": 0.49860385060310364, "learning_rate": 1.668971734564645e-06, "loss": 0.161, "num_input_tokens_seen": 46905248, "step": 69600 }, { "epoch": 1.7004617301443823, "grad_norm": 13.744087219238281, "learning_rate": 1.668908346329235e-06, "loss": 0.0781, "num_input_tokens_seen": 46908384, "step": 69605 }, { "epoch": 1.7005838809762295, "grad_norm": 37.584285736083984, "learning_rate": 1.668844953229344e-06, "loss": 0.2057, "num_input_tokens_seen": 46911648, "step": 69610 }, { "epoch": 1.7007060318080767, "grad_norm": 0.019656836986541748, "learning_rate": 1.6687815552654325e-06, "loss": 0.0927, "num_input_tokens_seen": 46914976, "step": 69615 }, { "epoch": 1.7008281826399236, "grad_norm": 0.4337562024593353, "learning_rate": 1.6687181524379613e-06, "loss": 0.0305, "num_input_tokens_seen": 46918752, "step": 69620 }, { "epoch": 1.7009503334717708, "grad_norm": 0.05799011513590813, "learning_rate": 1.6686547447473924e-06, "loss": 0.0596, "num_input_tokens_seen": 46922336, "step": 69625 }, { "epoch": 1.701072484303618, "grad_norm": 0.18245580792427063, "learning_rate": 1.668591332194186e-06, "loss": 0.0261, "num_input_tokens_seen": 46925792, "step": 69630 }, { "epoch": 1.7011946351354652, "grad_norm": 0.5550912022590637, "learning_rate": 1.6685279147788036e-06, "loss": 0.0987, "num_input_tokens_seen": 46930272, "step": 69635 }, { "epoch": 1.7013167859673124, "grad_norm": 21.118289947509766, "learning_rate": 1.6684644925017067e-06, "loss": 0.1937, "num_input_tokens_seen": 46933536, "step": 69640 }, { "epoch": 1.7014389367991596, "grad_norm": 71.64469146728516, "learning_rate": 1.6684010653633559e-06, "loss": 0.149, "num_input_tokens_seen": 46936864, "step": 69645 }, { "epoch": 1.7015610876310068, "grad_norm": 3.9675886631011963, "learning_rate": 1.6683376333642127e-06, "loss": 0.0415, "num_input_tokens_seen": 46940128, "step": 69650 }, { "epoch": 1.701683238462854, "grad_norm": 1.2720179557800293, "learning_rate": 1.6682741965047386e-06, "loss": 0.0025, "num_input_tokens_seen": 46943264, "step": 69655 }, { "epoch": 1.7018053892947012, "grad_norm": 19.108028411865234, "learning_rate": 1.6682107547853948e-06, "loss": 0.1639, "num_input_tokens_seen": 46946656, "step": 69660 }, { "epoch": 1.7019275401265483, "grad_norm": 0.23258452117443085, "learning_rate": 1.6681473082066426e-06, "loss": 0.0259, "num_input_tokens_seen": 46949920, "step": 69665 }, { "epoch": 1.7020496909583955, "grad_norm": 35.97227096557617, "learning_rate": 1.6680838567689436e-06, "loss": 0.0954, "num_input_tokens_seen": 46954144, "step": 69670 }, { "epoch": 1.7021718417902427, "grad_norm": 12.28347396850586, "learning_rate": 1.6680204004727592e-06, "loss": 0.1621, "num_input_tokens_seen": 46957472, "step": 69675 }, { "epoch": 1.70229399262209, "grad_norm": 52.67514419555664, "learning_rate": 1.6679569393185506e-06, "loss": 0.0807, "num_input_tokens_seen": 46961376, "step": 69680 }, { "epoch": 1.7024161434539369, "grad_norm": 13.373403549194336, "learning_rate": 1.6678934733067793e-06, "loss": 0.0841, "num_input_tokens_seen": 46964640, "step": 69685 }, { "epoch": 1.702538294285784, "grad_norm": 0.8186175227165222, "learning_rate": 1.6678300024379073e-06, "loss": 0.0441, "num_input_tokens_seen": 46968480, "step": 69690 }, { "epoch": 1.7026604451176313, "grad_norm": 1.1019536256790161, "learning_rate": 1.6677665267123956e-06, "loss": 0.0706, "num_input_tokens_seen": 46971744, "step": 69695 }, { "epoch": 1.7027825959494785, "grad_norm": 28.882869720458984, "learning_rate": 1.6677030461307065e-06, "loss": 0.0741, "num_input_tokens_seen": 46975200, "step": 69700 }, { "epoch": 1.7029047467813254, "grad_norm": 65.26024627685547, "learning_rate": 1.667639560693301e-06, "loss": 0.1458, "num_input_tokens_seen": 46979232, "step": 69705 }, { "epoch": 1.7030268976131726, "grad_norm": 0.39398840069770813, "learning_rate": 1.6675760704006412e-06, "loss": 0.0283, "num_input_tokens_seen": 46983520, "step": 69710 }, { "epoch": 1.7031490484450198, "grad_norm": 0.35874584317207336, "learning_rate": 1.6675125752531884e-06, "loss": 0.1395, "num_input_tokens_seen": 46986720, "step": 69715 }, { "epoch": 1.703271199276867, "grad_norm": 28.241769790649414, "learning_rate": 1.667449075251405e-06, "loss": 0.1001, "num_input_tokens_seen": 46989920, "step": 69720 }, { "epoch": 1.7033933501087142, "grad_norm": 9.246172904968262, "learning_rate": 1.6673855703957523e-06, "loss": 0.0372, "num_input_tokens_seen": 46993184, "step": 69725 }, { "epoch": 1.7035155009405614, "grad_norm": 15.649399757385254, "learning_rate": 1.667322060686692e-06, "loss": 0.1044, "num_input_tokens_seen": 46996640, "step": 69730 }, { "epoch": 1.7036376517724086, "grad_norm": 18.9222412109375, "learning_rate": 1.667258546124686e-06, "loss": 0.0357, "num_input_tokens_seen": 46999968, "step": 69735 }, { "epoch": 1.7037598026042557, "grad_norm": 0.4368104338645935, "learning_rate": 1.6671950267101972e-06, "loss": 0.0337, "num_input_tokens_seen": 47003488, "step": 69740 }, { "epoch": 1.703881953436103, "grad_norm": 0.14030565321445465, "learning_rate": 1.667131502443686e-06, "loss": 0.0329, "num_input_tokens_seen": 47007136, "step": 69745 }, { "epoch": 1.7040041042679501, "grad_norm": 0.21858637034893036, "learning_rate": 1.6670679733256154e-06, "loss": 0.1352, "num_input_tokens_seen": 47010528, "step": 69750 }, { "epoch": 1.7041262550997973, "grad_norm": 0.08449557423591614, "learning_rate": 1.6670044393564467e-06, "loss": 0.0414, "num_input_tokens_seen": 47013792, "step": 69755 }, { "epoch": 1.7042484059316445, "grad_norm": 0.11067601293325424, "learning_rate": 1.6669409005366426e-06, "loss": 0.0764, "num_input_tokens_seen": 47016928, "step": 69760 }, { "epoch": 1.7043705567634917, "grad_norm": 0.1474686861038208, "learning_rate": 1.666877356866665e-06, "loss": 0.1261, "num_input_tokens_seen": 47020320, "step": 69765 }, { "epoch": 1.7044927075953389, "grad_norm": 0.42805859446525574, "learning_rate": 1.6668138083469756e-06, "loss": 0.03, "num_input_tokens_seen": 47023648, "step": 69770 }, { "epoch": 1.7046148584271859, "grad_norm": 28.6177978515625, "learning_rate": 1.666750254978037e-06, "loss": 0.2081, "num_input_tokens_seen": 47027360, "step": 69775 }, { "epoch": 1.704737009259033, "grad_norm": 0.14059709012508392, "learning_rate": 1.6666866967603113e-06, "loss": 0.004, "num_input_tokens_seen": 47031200, "step": 69780 }, { "epoch": 1.7048591600908802, "grad_norm": 0.14516912400722504, "learning_rate": 1.6666231336942604e-06, "loss": 0.0831, "num_input_tokens_seen": 47034848, "step": 69785 }, { "epoch": 1.7049813109227274, "grad_norm": 0.589957594871521, "learning_rate": 1.666559565780347e-06, "loss": 0.0354, "num_input_tokens_seen": 47038496, "step": 69790 }, { "epoch": 1.7051034617545744, "grad_norm": 0.5143163204193115, "learning_rate": 1.666495993019033e-06, "loss": 0.0632, "num_input_tokens_seen": 47041824, "step": 69795 }, { "epoch": 1.7052256125864216, "grad_norm": 0.0627521201968193, "learning_rate": 1.6664324154107807e-06, "loss": 0.0357, "num_input_tokens_seen": 47045088, "step": 69800 }, { "epoch": 1.7053477634182688, "grad_norm": 28.07826805114746, "learning_rate": 1.666368832956053e-06, "loss": 0.1022, "num_input_tokens_seen": 47048416, "step": 69805 }, { "epoch": 1.705469914250116, "grad_norm": 0.01447698101401329, "learning_rate": 1.666305245655312e-06, "loss": 0.0911, "num_input_tokens_seen": 47051488, "step": 69810 }, { "epoch": 1.7055920650819631, "grad_norm": 0.7165692448616028, "learning_rate": 1.6662416535090196e-06, "loss": 0.2176, "num_input_tokens_seen": 47054944, "step": 69815 }, { "epoch": 1.7057142159138103, "grad_norm": 22.16487693786621, "learning_rate": 1.6661780565176388e-06, "loss": 0.1212, "num_input_tokens_seen": 47058208, "step": 69820 }, { "epoch": 1.7058363667456575, "grad_norm": 0.23730194568634033, "learning_rate": 1.6661144546816321e-06, "loss": 0.0909, "num_input_tokens_seen": 47061728, "step": 69825 }, { "epoch": 1.7059585175775047, "grad_norm": 1.4801957607269287, "learning_rate": 1.6660508480014618e-06, "loss": 0.0594, "num_input_tokens_seen": 47064864, "step": 69830 }, { "epoch": 1.706080668409352, "grad_norm": 1.1917213201522827, "learning_rate": 1.665987236477591e-06, "loss": 0.0653, "num_input_tokens_seen": 47067936, "step": 69835 }, { "epoch": 1.706202819241199, "grad_norm": 3.69415545463562, "learning_rate": 1.6659236201104814e-06, "loss": 0.1373, "num_input_tokens_seen": 47071200, "step": 69840 }, { "epoch": 1.7063249700730463, "grad_norm": 58.802913665771484, "learning_rate": 1.665859998900596e-06, "loss": 0.1086, "num_input_tokens_seen": 47075040, "step": 69845 }, { "epoch": 1.7064471209048935, "grad_norm": 1.404065728187561, "learning_rate": 1.6657963728483981e-06, "loss": 0.1358, "num_input_tokens_seen": 47078944, "step": 69850 }, { "epoch": 1.7065692717367407, "grad_norm": 0.05396494269371033, "learning_rate": 1.6657327419543496e-06, "loss": 0.1373, "num_input_tokens_seen": 47081952, "step": 69855 }, { "epoch": 1.7066914225685879, "grad_norm": 0.3325052857398987, "learning_rate": 1.665669106218914e-06, "loss": 0.0671, "num_input_tokens_seen": 47085408, "step": 69860 }, { "epoch": 1.7068135734004348, "grad_norm": 11.508901596069336, "learning_rate": 1.665605465642553e-06, "loss": 0.0399, "num_input_tokens_seen": 47089568, "step": 69865 }, { "epoch": 1.706935724232282, "grad_norm": 14.941136360168457, "learning_rate": 1.6655418202257305e-06, "loss": 0.033, "num_input_tokens_seen": 47093024, "step": 69870 }, { "epoch": 1.7070578750641292, "grad_norm": 12.48923397064209, "learning_rate": 1.6654781699689086e-06, "loss": 0.0416, "num_input_tokens_seen": 47096544, "step": 69875 }, { "epoch": 1.7071800258959764, "grad_norm": 0.9678665995597839, "learning_rate": 1.6654145148725506e-06, "loss": 0.0335, "num_input_tokens_seen": 47100128, "step": 69880 }, { "epoch": 1.7073021767278234, "grad_norm": 0.45745396614074707, "learning_rate": 1.665350854937119e-06, "loss": 0.0633, "num_input_tokens_seen": 47103840, "step": 69885 }, { "epoch": 1.7074243275596706, "grad_norm": 29.906553268432617, "learning_rate": 1.6652871901630772e-06, "loss": 0.2042, "num_input_tokens_seen": 47107040, "step": 69890 }, { "epoch": 1.7075464783915177, "grad_norm": 2.247661590576172, "learning_rate": 1.665223520550888e-06, "loss": 0.111, "num_input_tokens_seen": 47110560, "step": 69895 }, { "epoch": 1.707668629223365, "grad_norm": 0.3283466398715973, "learning_rate": 1.6651598461010146e-06, "loss": 0.0022, "num_input_tokens_seen": 47114016, "step": 69900 }, { "epoch": 1.7077907800552121, "grad_norm": 0.268053263425827, "learning_rate": 1.6650961668139197e-06, "loss": 0.0567, "num_input_tokens_seen": 47116704, "step": 69905 }, { "epoch": 1.7079129308870593, "grad_norm": 0.3636005222797394, "learning_rate": 1.6650324826900666e-06, "loss": 0.0021, "num_input_tokens_seen": 47120288, "step": 69910 }, { "epoch": 1.7080350817189065, "grad_norm": 0.027881575748324394, "learning_rate": 1.6649687937299183e-06, "loss": 0.0011, "num_input_tokens_seen": 47123168, "step": 69915 }, { "epoch": 1.7081572325507537, "grad_norm": 1.2808572053909302, "learning_rate": 1.6649050999339382e-06, "loss": 0.0612, "num_input_tokens_seen": 47126304, "step": 69920 }, { "epoch": 1.7082793833826009, "grad_norm": 21.61374282836914, "learning_rate": 1.6648414013025895e-06, "loss": 0.1222, "num_input_tokens_seen": 47130016, "step": 69925 }, { "epoch": 1.708401534214448, "grad_norm": 0.17723463475704193, "learning_rate": 1.6647776978363354e-06, "loss": 0.0545, "num_input_tokens_seen": 47133728, "step": 69930 }, { "epoch": 1.7085236850462953, "grad_norm": 0.22193682193756104, "learning_rate": 1.6647139895356388e-06, "loss": 0.0524, "num_input_tokens_seen": 47137056, "step": 69935 }, { "epoch": 1.7086458358781424, "grad_norm": 106.50350952148438, "learning_rate": 1.6646502764009633e-06, "loss": 0.0917, "num_input_tokens_seen": 47140512, "step": 69940 }, { "epoch": 1.7087679867099896, "grad_norm": 1.4736918210983276, "learning_rate": 1.6645865584327723e-06, "loss": 0.0343, "num_input_tokens_seen": 47143904, "step": 69945 }, { "epoch": 1.7088901375418366, "grad_norm": 133.668701171875, "learning_rate": 1.664522835631529e-06, "loss": 0.1482, "num_input_tokens_seen": 47147296, "step": 69950 }, { "epoch": 1.7090122883736838, "grad_norm": 15.187579154968262, "learning_rate": 1.6644591079976971e-06, "loss": 0.1285, "num_input_tokens_seen": 47150304, "step": 69955 }, { "epoch": 1.709134439205531, "grad_norm": 0.03582199290394783, "learning_rate": 1.6643953755317397e-06, "loss": 0.0495, "num_input_tokens_seen": 47153888, "step": 69960 }, { "epoch": 1.7092565900373782, "grad_norm": 10.01606559753418, "learning_rate": 1.6643316382341204e-06, "loss": 0.0723, "num_input_tokens_seen": 47157280, "step": 69965 }, { "epoch": 1.7093787408692254, "grad_norm": 0.12903539836406708, "learning_rate": 1.664267896105303e-06, "loss": 0.1263, "num_input_tokens_seen": 47160608, "step": 69970 }, { "epoch": 1.7095008917010723, "grad_norm": 21.337604522705078, "learning_rate": 1.6642041491457507e-06, "loss": 0.1161, "num_input_tokens_seen": 47163808, "step": 69975 }, { "epoch": 1.7096230425329195, "grad_norm": 0.10576992481946945, "learning_rate": 1.6641403973559268e-06, "loss": 0.1276, "num_input_tokens_seen": 47167072, "step": 69980 }, { "epoch": 1.7097451933647667, "grad_norm": 0.13519780337810516, "learning_rate": 1.6640766407362955e-06, "loss": 0.0363, "num_input_tokens_seen": 47170592, "step": 69985 }, { "epoch": 1.709867344196614, "grad_norm": 0.6011918783187866, "learning_rate": 1.6640128792873205e-06, "loss": 0.0511, "num_input_tokens_seen": 47174304, "step": 69990 }, { "epoch": 1.709989495028461, "grad_norm": 0.9644982814788818, "learning_rate": 1.663949113009465e-06, "loss": 0.1105, "num_input_tokens_seen": 47177632, "step": 69995 }, { "epoch": 1.7101116458603083, "grad_norm": 0.28336676955223083, "learning_rate": 1.663885341903193e-06, "loss": 0.0849, "num_input_tokens_seen": 47181280, "step": 70000 }, { "epoch": 1.7102337966921555, "grad_norm": 0.11140467971563339, "learning_rate": 1.6638215659689683e-06, "loss": 0.0892, "num_input_tokens_seen": 47184288, "step": 70005 }, { "epoch": 1.7103559475240027, "grad_norm": 0.16174258291721344, "learning_rate": 1.6637577852072547e-06, "loss": 0.0731, "num_input_tokens_seen": 47187872, "step": 70010 }, { "epoch": 1.7104780983558499, "grad_norm": 41.12967300415039, "learning_rate": 1.6636939996185157e-06, "loss": 0.0885, "num_input_tokens_seen": 47191200, "step": 70015 }, { "epoch": 1.710600249187697, "grad_norm": 0.4021644592285156, "learning_rate": 1.6636302092032155e-06, "loss": 0.256, "num_input_tokens_seen": 47194592, "step": 70020 }, { "epoch": 1.7107224000195442, "grad_norm": 7.773596286773682, "learning_rate": 1.6635664139618183e-06, "loss": 0.0816, "num_input_tokens_seen": 47197792, "step": 70025 }, { "epoch": 1.7108445508513914, "grad_norm": 0.6293780207633972, "learning_rate": 1.6635026138947873e-06, "loss": 0.0045, "num_input_tokens_seen": 47201120, "step": 70030 }, { "epoch": 1.7109667016832386, "grad_norm": 0.10260359942913055, "learning_rate": 1.6634388090025867e-06, "loss": 0.001, "num_input_tokens_seen": 47204768, "step": 70035 }, { "epoch": 1.7110888525150856, "grad_norm": 11.693562507629395, "learning_rate": 1.663374999285681e-06, "loss": 0.0746, "num_input_tokens_seen": 47208160, "step": 70040 }, { "epoch": 1.7112110033469328, "grad_norm": 0.05745501071214676, "learning_rate": 1.6633111847445336e-06, "loss": 0.0009, "num_input_tokens_seen": 47211552, "step": 70045 }, { "epoch": 1.71133315417878, "grad_norm": 15.584721565246582, "learning_rate": 1.6632473653796088e-06, "loss": 0.0466, "num_input_tokens_seen": 47214752, "step": 70050 }, { "epoch": 1.7114553050106271, "grad_norm": 0.18859045207500458, "learning_rate": 1.6631835411913713e-06, "loss": 0.1216, "num_input_tokens_seen": 47218336, "step": 70055 }, { "epoch": 1.7115774558424743, "grad_norm": 14.733503341674805, "learning_rate": 1.6631197121802843e-06, "loss": 0.084, "num_input_tokens_seen": 47221920, "step": 70060 }, { "epoch": 1.7116996066743213, "grad_norm": 156.82640075683594, "learning_rate": 1.6630558783468122e-06, "loss": 0.0601, "num_input_tokens_seen": 47225056, "step": 70065 }, { "epoch": 1.7118217575061685, "grad_norm": 126.34097290039062, "learning_rate": 1.66299203969142e-06, "loss": 0.0522, "num_input_tokens_seen": 47228768, "step": 70070 }, { "epoch": 1.7119439083380157, "grad_norm": 0.05393688380718231, "learning_rate": 1.6629281962145706e-06, "loss": 0.0836, "num_input_tokens_seen": 47231904, "step": 70075 }, { "epoch": 1.7120660591698629, "grad_norm": 0.07792874425649643, "learning_rate": 1.6628643479167297e-06, "loss": 0.0383, "num_input_tokens_seen": 47235808, "step": 70080 }, { "epoch": 1.71218821000171, "grad_norm": 13.525362014770508, "learning_rate": 1.6628004947983606e-06, "loss": 0.0637, "num_input_tokens_seen": 47239008, "step": 70085 }, { "epoch": 1.7123103608335573, "grad_norm": 0.12766079604625702, "learning_rate": 1.6627366368599285e-06, "loss": 0.0009, "num_input_tokens_seen": 47242336, "step": 70090 }, { "epoch": 1.7124325116654044, "grad_norm": 0.15544456243515015, "learning_rate": 1.6626727741018967e-06, "loss": 0.071, "num_input_tokens_seen": 47245472, "step": 70095 }, { "epoch": 1.7125546624972516, "grad_norm": 0.012700174935162067, "learning_rate": 1.6626089065247306e-06, "loss": 0.0495, "num_input_tokens_seen": 47248608, "step": 70100 }, { "epoch": 1.7126768133290988, "grad_norm": 0.15384520590305328, "learning_rate": 1.6625450341288943e-06, "loss": 0.157, "num_input_tokens_seen": 47252384, "step": 70105 }, { "epoch": 1.712798964160946, "grad_norm": 0.03365683928132057, "learning_rate": 1.6624811569148523e-06, "loss": 0.0948, "num_input_tokens_seen": 47255456, "step": 70110 }, { "epoch": 1.7129211149927932, "grad_norm": 0.5229410529136658, "learning_rate": 1.662417274883069e-06, "loss": 0.0011, "num_input_tokens_seen": 47258656, "step": 70115 }, { "epoch": 1.7130432658246404, "grad_norm": 0.28059008717536926, "learning_rate": 1.6623533880340093e-06, "loss": 0.0389, "num_input_tokens_seen": 47261984, "step": 70120 }, { "epoch": 1.7131654166564876, "grad_norm": 0.13389243185520172, "learning_rate": 1.6622894963681376e-06, "loss": 0.1023, "num_input_tokens_seen": 47265440, "step": 70125 }, { "epoch": 1.7132875674883346, "grad_norm": 27.324216842651367, "learning_rate": 1.6622255998859183e-06, "loss": 0.1753, "num_input_tokens_seen": 47268512, "step": 70130 }, { "epoch": 1.7134097183201817, "grad_norm": 32.270782470703125, "learning_rate": 1.6621616985878166e-06, "loss": 0.1303, "num_input_tokens_seen": 47272096, "step": 70135 }, { "epoch": 1.713531869152029, "grad_norm": 20.028017044067383, "learning_rate": 1.6620977924742967e-06, "loss": 0.1832, "num_input_tokens_seen": 47275168, "step": 70140 }, { "epoch": 1.7136540199838761, "grad_norm": 0.21445448696613312, "learning_rate": 1.6620338815458237e-06, "loss": 0.0486, "num_input_tokens_seen": 47278624, "step": 70145 }, { "epoch": 1.7137761708157233, "grad_norm": 0.2595180869102478, "learning_rate": 1.661969965802862e-06, "loss": 0.0282, "num_input_tokens_seen": 47281760, "step": 70150 }, { "epoch": 1.7138983216475703, "grad_norm": 0.051422055810689926, "learning_rate": 1.6619060452458773e-06, "loss": 0.0466, "num_input_tokens_seen": 47285408, "step": 70155 }, { "epoch": 1.7140204724794175, "grad_norm": 21.490816116333008, "learning_rate": 1.661842119875333e-06, "loss": 0.1555, "num_input_tokens_seen": 47289504, "step": 70160 }, { "epoch": 1.7141426233112647, "grad_norm": 13.273415565490723, "learning_rate": 1.6617781896916955e-06, "loss": 0.038, "num_input_tokens_seen": 47292832, "step": 70165 }, { "epoch": 1.7142647741431118, "grad_norm": 0.24678562581539154, "learning_rate": 1.6617142546954286e-06, "loss": 0.0519, "num_input_tokens_seen": 47296096, "step": 70170 }, { "epoch": 1.714386924974959, "grad_norm": 89.38595581054688, "learning_rate": 1.6616503148869977e-06, "loss": 0.1749, "num_input_tokens_seen": 47299424, "step": 70175 }, { "epoch": 1.7145090758068062, "grad_norm": 0.040162667632102966, "learning_rate": 1.661586370266868e-06, "loss": 0.0288, "num_input_tokens_seen": 47302752, "step": 70180 }, { "epoch": 1.7146312266386534, "grad_norm": 13.061795234680176, "learning_rate": 1.661522420835504e-06, "loss": 0.0801, "num_input_tokens_seen": 47306080, "step": 70185 }, { "epoch": 1.7147533774705006, "grad_norm": 0.04859983175992966, "learning_rate": 1.6614584665933711e-06, "loss": 0.1098, "num_input_tokens_seen": 47309536, "step": 70190 }, { "epoch": 1.7148755283023478, "grad_norm": 1.0230239629745483, "learning_rate": 1.661394507540934e-06, "loss": 0.007, "num_input_tokens_seen": 47312864, "step": 70195 }, { "epoch": 1.714997679134195, "grad_norm": 1.9080485105514526, "learning_rate": 1.661330543678659e-06, "loss": 0.173, "num_input_tokens_seen": 47316128, "step": 70200 }, { "epoch": 1.7151198299660422, "grad_norm": 0.044832922518253326, "learning_rate": 1.6612665750070097e-06, "loss": 0.1017, "num_input_tokens_seen": 47319456, "step": 70205 }, { "epoch": 1.7152419807978894, "grad_norm": 0.2528229057788849, "learning_rate": 1.6612026015264522e-06, "loss": 0.0431, "num_input_tokens_seen": 47322720, "step": 70210 }, { "epoch": 1.7153641316297366, "grad_norm": 30.157514572143555, "learning_rate": 1.6611386232374516e-06, "loss": 0.0533, "num_input_tokens_seen": 47325792, "step": 70215 }, { "epoch": 1.7154862824615835, "grad_norm": 23.45171546936035, "learning_rate": 1.6610746401404728e-06, "loss": 0.0897, "num_input_tokens_seen": 47329056, "step": 70220 }, { "epoch": 1.7156084332934307, "grad_norm": 33.5923957824707, "learning_rate": 1.6610106522359816e-06, "loss": 0.0649, "num_input_tokens_seen": 47331936, "step": 70225 }, { "epoch": 1.715730584125278, "grad_norm": 1.0114668607711792, "learning_rate": 1.6609466595244432e-06, "loss": 0.0611, "num_input_tokens_seen": 47334944, "step": 70230 }, { "epoch": 1.715852734957125, "grad_norm": 0.21693406999111176, "learning_rate": 1.660882662006323e-06, "loss": 0.034, "num_input_tokens_seen": 47338208, "step": 70235 }, { "epoch": 1.715974885788972, "grad_norm": 0.12061869353055954, "learning_rate": 1.6608186596820863e-06, "loss": 0.0561, "num_input_tokens_seen": 47341472, "step": 70240 }, { "epoch": 1.7160970366208192, "grad_norm": 227.574951171875, "learning_rate": 1.6607546525521984e-06, "loss": 0.0466, "num_input_tokens_seen": 47344480, "step": 70245 }, { "epoch": 1.7162191874526664, "grad_norm": 12.42898941040039, "learning_rate": 1.660690640617125e-06, "loss": 0.0434, "num_input_tokens_seen": 47347872, "step": 70250 }, { "epoch": 1.7163413382845136, "grad_norm": 9.49234676361084, "learning_rate": 1.6606266238773317e-06, "loss": 0.1255, "num_input_tokens_seen": 47351328, "step": 70255 }, { "epoch": 1.7164634891163608, "grad_norm": 0.019718781113624573, "learning_rate": 1.6605626023332836e-06, "loss": 0.0415, "num_input_tokens_seen": 47354592, "step": 70260 }, { "epoch": 1.716585639948208, "grad_norm": 6.5894455909729, "learning_rate": 1.660498575985447e-06, "loss": 0.1191, "num_input_tokens_seen": 47358944, "step": 70265 }, { "epoch": 1.7167077907800552, "grad_norm": 0.34928131103515625, "learning_rate": 1.660434544834287e-06, "loss": 0.139, "num_input_tokens_seen": 47362080, "step": 70270 }, { "epoch": 1.7168299416119024, "grad_norm": 0.05047084018588066, "learning_rate": 1.6603705088802692e-06, "loss": 0.0686, "num_input_tokens_seen": 47365280, "step": 70275 }, { "epoch": 1.7169520924437496, "grad_norm": 0.19307588040828705, "learning_rate": 1.6603064681238595e-06, "loss": 0.1732, "num_input_tokens_seen": 47368544, "step": 70280 }, { "epoch": 1.7170742432755968, "grad_norm": 3.607551097869873, "learning_rate": 1.6602424225655236e-06, "loss": 0.096, "num_input_tokens_seen": 47371616, "step": 70285 }, { "epoch": 1.717196394107444, "grad_norm": 168.88800048828125, "learning_rate": 1.6601783722057273e-06, "loss": 0.0898, "num_input_tokens_seen": 47375328, "step": 70290 }, { "epoch": 1.7173185449392911, "grad_norm": 17.703678131103516, "learning_rate": 1.660114317044936e-06, "loss": 0.0325, "num_input_tokens_seen": 47378592, "step": 70295 }, { "epoch": 1.7174406957711383, "grad_norm": 25.313291549682617, "learning_rate": 1.6600502570836162e-06, "loss": 0.1035, "num_input_tokens_seen": 47381664, "step": 70300 }, { "epoch": 1.7175628466029855, "grad_norm": 0.20463314652442932, "learning_rate": 1.6599861923222332e-06, "loss": 0.0312, "num_input_tokens_seen": 47384864, "step": 70305 }, { "epoch": 1.7176849974348325, "grad_norm": 0.32572174072265625, "learning_rate": 1.659922122761253e-06, "loss": 0.1744, "num_input_tokens_seen": 47388320, "step": 70310 }, { "epoch": 1.7178071482666797, "grad_norm": 0.2342342883348465, "learning_rate": 1.6598580484011415e-06, "loss": 0.0006, "num_input_tokens_seen": 47392416, "step": 70315 }, { "epoch": 1.7179292990985269, "grad_norm": 0.9499707818031311, "learning_rate": 1.659793969242365e-06, "loss": 0.1155, "num_input_tokens_seen": 47395872, "step": 70320 }, { "epoch": 1.718051449930374, "grad_norm": 0.0458591990172863, "learning_rate": 1.6597298852853894e-06, "loss": 0.0345, "num_input_tokens_seen": 47399264, "step": 70325 }, { "epoch": 1.718173600762221, "grad_norm": 0.1385032832622528, "learning_rate": 1.6596657965306807e-06, "loss": 0.1316, "num_input_tokens_seen": 47402400, "step": 70330 }, { "epoch": 1.7182957515940682, "grad_norm": 2.4102401733398438, "learning_rate": 1.6596017029787048e-06, "loss": 0.0012, "num_input_tokens_seen": 47405600, "step": 70335 }, { "epoch": 1.7184179024259154, "grad_norm": 23.362293243408203, "learning_rate": 1.6595376046299276e-06, "loss": 0.151, "num_input_tokens_seen": 47408928, "step": 70340 }, { "epoch": 1.7185400532577626, "grad_norm": 0.18636535108089447, "learning_rate": 1.6594735014848161e-06, "loss": 0.1138, "num_input_tokens_seen": 47412576, "step": 70345 }, { "epoch": 1.7186622040896098, "grad_norm": 11.467135429382324, "learning_rate": 1.6594093935438354e-06, "loss": 0.0559, "num_input_tokens_seen": 47416032, "step": 70350 }, { "epoch": 1.718784354921457, "grad_norm": 0.03606804087758064, "learning_rate": 1.6593452808074524e-06, "loss": 0.0006, "num_input_tokens_seen": 47419040, "step": 70355 }, { "epoch": 1.7189065057533042, "grad_norm": 77.13521575927734, "learning_rate": 1.6592811632761335e-06, "loss": 0.0244, "num_input_tokens_seen": 47422432, "step": 70360 }, { "epoch": 1.7190286565851514, "grad_norm": 0.0497257299721241, "learning_rate": 1.6592170409503444e-06, "loss": 0.0547, "num_input_tokens_seen": 47425888, "step": 70365 }, { "epoch": 1.7191508074169985, "grad_norm": 12.063864707946777, "learning_rate": 1.6591529138305515e-06, "loss": 0.0812, "num_input_tokens_seen": 47429216, "step": 70370 }, { "epoch": 1.7192729582488457, "grad_norm": 0.09922768175601959, "learning_rate": 1.6590887819172215e-06, "loss": 0.1318, "num_input_tokens_seen": 47432800, "step": 70375 }, { "epoch": 1.719395109080693, "grad_norm": 0.570949137210846, "learning_rate": 1.6590246452108206e-06, "loss": 0.156, "num_input_tokens_seen": 47436512, "step": 70380 }, { "epoch": 1.7195172599125401, "grad_norm": 27.616113662719727, "learning_rate": 1.6589605037118153e-06, "loss": 0.2372, "num_input_tokens_seen": 47440160, "step": 70385 }, { "epoch": 1.7196394107443873, "grad_norm": 0.17594002187252045, "learning_rate": 1.6588963574206719e-06, "loss": 0.0403, "num_input_tokens_seen": 47443424, "step": 70390 }, { "epoch": 1.7197615615762345, "grad_norm": 35.87648010253906, "learning_rate": 1.6588322063378567e-06, "loss": 0.0508, "num_input_tokens_seen": 47446880, "step": 70395 }, { "epoch": 1.7198837124080815, "grad_norm": 21.05569076538086, "learning_rate": 1.6587680504638368e-06, "loss": 0.1788, "num_input_tokens_seen": 47450272, "step": 70400 }, { "epoch": 1.7200058632399287, "grad_norm": 17.221059799194336, "learning_rate": 1.6587038897990783e-06, "loss": 0.051, "num_input_tokens_seen": 47453728, "step": 70405 }, { "epoch": 1.7201280140717758, "grad_norm": 16.515247344970703, "learning_rate": 1.6586397243440483e-06, "loss": 0.0631, "num_input_tokens_seen": 47456672, "step": 70410 }, { "epoch": 1.720250164903623, "grad_norm": 0.07961339503526688, "learning_rate": 1.6585755540992125e-06, "loss": 0.1466, "num_input_tokens_seen": 47459936, "step": 70415 }, { "epoch": 1.72037231573547, "grad_norm": 34.02830505371094, "learning_rate": 1.6585113790650388e-06, "loss": 0.1025, "num_input_tokens_seen": 47463328, "step": 70420 }, { "epoch": 1.7204944665673172, "grad_norm": 0.5048316121101379, "learning_rate": 1.6584471992419927e-06, "loss": 0.0013, "num_input_tokens_seen": 47467232, "step": 70425 }, { "epoch": 1.7206166173991644, "grad_norm": 0.17081189155578613, "learning_rate": 1.6583830146305418e-06, "loss": 0.0816, "num_input_tokens_seen": 47470496, "step": 70430 }, { "epoch": 1.7207387682310116, "grad_norm": 0.06982258707284927, "learning_rate": 1.6583188252311522e-06, "loss": 0.0989, "num_input_tokens_seen": 47474080, "step": 70435 }, { "epoch": 1.7208609190628588, "grad_norm": 0.29233846068382263, "learning_rate": 1.6582546310442913e-06, "loss": 0.1682, "num_input_tokens_seen": 47477664, "step": 70440 }, { "epoch": 1.720983069894706, "grad_norm": 0.3050183951854706, "learning_rate": 1.6581904320704254e-06, "loss": 0.0531, "num_input_tokens_seen": 47481376, "step": 70445 }, { "epoch": 1.7211052207265531, "grad_norm": 23.65858268737793, "learning_rate": 1.658126228310022e-06, "loss": 0.0985, "num_input_tokens_seen": 47484384, "step": 70450 }, { "epoch": 1.7212273715584003, "grad_norm": 13.939019203186035, "learning_rate": 1.6580620197635473e-06, "loss": 0.0026, "num_input_tokens_seen": 47487840, "step": 70455 }, { "epoch": 1.7213495223902475, "grad_norm": 0.7158112525939941, "learning_rate": 1.6579978064314688e-06, "loss": 0.0616, "num_input_tokens_seen": 47491168, "step": 70460 }, { "epoch": 1.7214716732220947, "grad_norm": 0.05182803422212601, "learning_rate": 1.6579335883142534e-06, "loss": 0.0845, "num_input_tokens_seen": 47494688, "step": 70465 }, { "epoch": 1.721593824053942, "grad_norm": 3.6295721530914307, "learning_rate": 1.6578693654123676e-06, "loss": 0.0646, "num_input_tokens_seen": 47498080, "step": 70470 }, { "epoch": 1.721715974885789, "grad_norm": 0.44479724764823914, "learning_rate": 1.6578051377262792e-06, "loss": 0.085, "num_input_tokens_seen": 47501088, "step": 70475 }, { "epoch": 1.7218381257176363, "grad_norm": 120.40805053710938, "learning_rate": 1.6577409052564545e-06, "loss": 0.0571, "num_input_tokens_seen": 47504480, "step": 70480 }, { "epoch": 1.7219602765494832, "grad_norm": 0.19627024233341217, "learning_rate": 1.6576766680033613e-06, "loss": 0.0518, "num_input_tokens_seen": 47508576, "step": 70485 }, { "epoch": 1.7220824273813304, "grad_norm": 10.544469833374023, "learning_rate": 1.6576124259674667e-06, "loss": 0.0763, "num_input_tokens_seen": 47511840, "step": 70490 }, { "epoch": 1.7222045782131776, "grad_norm": 0.26938116550445557, "learning_rate": 1.6575481791492374e-06, "loss": 0.008, "num_input_tokens_seen": 47515168, "step": 70495 }, { "epoch": 1.7223267290450248, "grad_norm": 0.15729977190494537, "learning_rate": 1.657483927549141e-06, "loss": 0.1826, "num_input_tokens_seen": 47518624, "step": 70500 }, { "epoch": 1.722448879876872, "grad_norm": 0.2031847983598709, "learning_rate": 1.6574196711676444e-06, "loss": 0.0811, "num_input_tokens_seen": 47521760, "step": 70505 }, { "epoch": 1.722571030708719, "grad_norm": 14.030067443847656, "learning_rate": 1.6573554100052154e-06, "loss": 0.1212, "num_input_tokens_seen": 47524896, "step": 70510 }, { "epoch": 1.7226931815405662, "grad_norm": 0.7857367992401123, "learning_rate": 1.657291144062321e-06, "loss": 0.0577, "num_input_tokens_seen": 47528736, "step": 70515 }, { "epoch": 1.7228153323724134, "grad_norm": 0.08645419031381607, "learning_rate": 1.6572268733394283e-06, "loss": 0.0864, "num_input_tokens_seen": 47532192, "step": 70520 }, { "epoch": 1.7229374832042605, "grad_norm": 23.457870483398438, "learning_rate": 1.6571625978370055e-06, "loss": 0.1231, "num_input_tokens_seen": 47535328, "step": 70525 }, { "epoch": 1.7230596340361077, "grad_norm": 10.771952629089355, "learning_rate": 1.657098317555519e-06, "loss": 0.2692, "num_input_tokens_seen": 47538400, "step": 70530 }, { "epoch": 1.723181784867955, "grad_norm": 2.8880395889282227, "learning_rate": 1.6570340324954374e-06, "loss": 0.0742, "num_input_tokens_seen": 47541792, "step": 70535 }, { "epoch": 1.7233039356998021, "grad_norm": 0.0832524523139, "learning_rate": 1.656969742657227e-06, "loss": 0.0373, "num_input_tokens_seen": 47545184, "step": 70540 }, { "epoch": 1.7234260865316493, "grad_norm": 180.890869140625, "learning_rate": 1.6569054480413564e-06, "loss": 0.1764, "num_input_tokens_seen": 47548192, "step": 70545 }, { "epoch": 1.7235482373634965, "grad_norm": 0.5971752405166626, "learning_rate": 1.6568411486482923e-06, "loss": 0.051, "num_input_tokens_seen": 47551456, "step": 70550 }, { "epoch": 1.7236703881953437, "grad_norm": 0.18406574428081512, "learning_rate": 1.656776844478503e-06, "loss": 0.1628, "num_input_tokens_seen": 47555296, "step": 70555 }, { "epoch": 1.7237925390271909, "grad_norm": 0.5610669255256653, "learning_rate": 1.6567125355324555e-06, "loss": 0.0636, "num_input_tokens_seen": 47558688, "step": 70560 }, { "epoch": 1.723914689859038, "grad_norm": 48.64958190917969, "learning_rate": 1.6566482218106184e-06, "loss": 0.1408, "num_input_tokens_seen": 47562080, "step": 70565 }, { "epoch": 1.7240368406908853, "grad_norm": 81.54186248779297, "learning_rate": 1.6565839033134584e-06, "loss": 0.0796, "num_input_tokens_seen": 47565216, "step": 70570 }, { "epoch": 1.7241589915227322, "grad_norm": 18.318082809448242, "learning_rate": 1.6565195800414434e-06, "loss": 0.0569, "num_input_tokens_seen": 47568544, "step": 70575 }, { "epoch": 1.7242811423545794, "grad_norm": 0.33882206678390503, "learning_rate": 1.656455251995042e-06, "loss": 0.0396, "num_input_tokens_seen": 47571808, "step": 70580 }, { "epoch": 1.7244032931864266, "grad_norm": 0.1423991471529007, "learning_rate": 1.6563909191747212e-06, "loss": 0.0013, "num_input_tokens_seen": 47575072, "step": 70585 }, { "epoch": 1.7245254440182738, "grad_norm": 32.475624084472656, "learning_rate": 1.656326581580949e-06, "loss": 0.1305, "num_input_tokens_seen": 47578592, "step": 70590 }, { "epoch": 1.724647594850121, "grad_norm": 0.16308899223804474, "learning_rate": 1.656262239214193e-06, "loss": 0.0725, "num_input_tokens_seen": 47581728, "step": 70595 }, { "epoch": 1.724769745681968, "grad_norm": 0.12075145542621613, "learning_rate": 1.6561978920749223e-06, "loss": 0.0006, "num_input_tokens_seen": 47585184, "step": 70600 }, { "epoch": 1.7248918965138151, "grad_norm": 0.24487103521823883, "learning_rate": 1.6561335401636036e-06, "loss": 0.0488, "num_input_tokens_seen": 47588384, "step": 70605 }, { "epoch": 1.7250140473456623, "grad_norm": 0.39849162101745605, "learning_rate": 1.6560691834807052e-06, "loss": 0.032, "num_input_tokens_seen": 47591840, "step": 70610 }, { "epoch": 1.7251361981775095, "grad_norm": 0.7076511979103088, "learning_rate": 1.6560048220266955e-06, "loss": 0.0013, "num_input_tokens_seen": 47595424, "step": 70615 }, { "epoch": 1.7252583490093567, "grad_norm": 0.3278559446334839, "learning_rate": 1.6559404558020424e-06, "loss": 0.1268, "num_input_tokens_seen": 47599264, "step": 70620 }, { "epoch": 1.725380499841204, "grad_norm": 0.050880298018455505, "learning_rate": 1.6558760848072135e-06, "loss": 0.1171, "num_input_tokens_seen": 47602144, "step": 70625 }, { "epoch": 1.725502650673051, "grad_norm": 40.476478576660156, "learning_rate": 1.6558117090426772e-06, "loss": 0.2168, "num_input_tokens_seen": 47605536, "step": 70630 }, { "epoch": 1.7256248015048983, "grad_norm": 12.389641761779785, "learning_rate": 1.6557473285089023e-06, "loss": 0.0798, "num_input_tokens_seen": 47608416, "step": 70635 }, { "epoch": 1.7257469523367455, "grad_norm": 0.38528257608413696, "learning_rate": 1.6556829432063562e-06, "loss": 0.1222, "num_input_tokens_seen": 47611936, "step": 70640 }, { "epoch": 1.7258691031685927, "grad_norm": 0.23907269537448883, "learning_rate": 1.6556185531355074e-06, "loss": 0.064, "num_input_tokens_seen": 47615328, "step": 70645 }, { "epoch": 1.7259912540004398, "grad_norm": 0.34818893671035767, "learning_rate": 1.655554158296824e-06, "loss": 0.0015, "num_input_tokens_seen": 47619040, "step": 70650 }, { "epoch": 1.726113404832287, "grad_norm": 14.175068855285645, "learning_rate": 1.6554897586907746e-06, "loss": 0.1151, "num_input_tokens_seen": 47622368, "step": 70655 }, { "epoch": 1.7262355556641342, "grad_norm": 0.10468627512454987, "learning_rate": 1.6554253543178272e-06, "loss": 0.029, "num_input_tokens_seen": 47625952, "step": 70660 }, { "epoch": 1.7263577064959812, "grad_norm": 24.79138946533203, "learning_rate": 1.6553609451784505e-06, "loss": 0.1943, "num_input_tokens_seen": 47629088, "step": 70665 }, { "epoch": 1.7264798573278284, "grad_norm": 113.18524169921875, "learning_rate": 1.655296531273113e-06, "loss": 0.0506, "num_input_tokens_seen": 47632416, "step": 70670 }, { "epoch": 1.7266020081596756, "grad_norm": 41.06086730957031, "learning_rate": 1.6552321126022824e-06, "loss": 0.0964, "num_input_tokens_seen": 47635936, "step": 70675 }, { "epoch": 1.7267241589915228, "grad_norm": 0.1683439463376999, "learning_rate": 1.6551676891664278e-06, "loss": 0.1082, "num_input_tokens_seen": 47639264, "step": 70680 }, { "epoch": 1.72684630982337, "grad_norm": 0.3442506492137909, "learning_rate": 1.6551032609660174e-06, "loss": 0.1175, "num_input_tokens_seen": 47642208, "step": 70685 }, { "epoch": 1.726968460655217, "grad_norm": 19.647884368896484, "learning_rate": 1.6550388280015199e-06, "loss": 0.1192, "num_input_tokens_seen": 47645920, "step": 70690 }, { "epoch": 1.727090611487064, "grad_norm": 20.30986976623535, "learning_rate": 1.654974390273404e-06, "loss": 0.0551, "num_input_tokens_seen": 47649312, "step": 70695 }, { "epoch": 1.7272127623189113, "grad_norm": 50.00175476074219, "learning_rate": 1.6549099477821384e-06, "loss": 0.0847, "num_input_tokens_seen": 47652448, "step": 70700 }, { "epoch": 1.7273349131507585, "grad_norm": 0.24548085033893585, "learning_rate": 1.6548455005281912e-06, "loss": 0.1452, "num_input_tokens_seen": 47655840, "step": 70705 }, { "epoch": 1.7274570639826057, "grad_norm": 0.5623430013656616, "learning_rate": 1.6547810485120315e-06, "loss": 0.0222, "num_input_tokens_seen": 47658848, "step": 70710 }, { "epoch": 1.7275792148144529, "grad_norm": 0.13892212510108948, "learning_rate": 1.6547165917341274e-06, "loss": 0.0349, "num_input_tokens_seen": 47662048, "step": 70715 }, { "epoch": 1.7277013656463, "grad_norm": 0.058080315589904785, "learning_rate": 1.6546521301949489e-06, "loss": 0.1226, "num_input_tokens_seen": 47665120, "step": 70720 }, { "epoch": 1.7278235164781472, "grad_norm": 12.511260986328125, "learning_rate": 1.6545876638949636e-06, "loss": 0.134, "num_input_tokens_seen": 47668576, "step": 70725 }, { "epoch": 1.7279456673099944, "grad_norm": 10.45933723449707, "learning_rate": 1.6545231928346411e-06, "loss": 0.1153, "num_input_tokens_seen": 47672224, "step": 70730 }, { "epoch": 1.7280678181418416, "grad_norm": 0.14672933518886566, "learning_rate": 1.6544587170144496e-06, "loss": 0.0368, "num_input_tokens_seen": 47675552, "step": 70735 }, { "epoch": 1.7281899689736888, "grad_norm": 0.03236455097794533, "learning_rate": 1.6543942364348583e-06, "loss": 0.0445, "num_input_tokens_seen": 47679520, "step": 70740 }, { "epoch": 1.728312119805536, "grad_norm": 0.19849258661270142, "learning_rate": 1.6543297510963362e-06, "loss": 0.0499, "num_input_tokens_seen": 47682592, "step": 70745 }, { "epoch": 1.7284342706373832, "grad_norm": 0.2941298186779022, "learning_rate": 1.6542652609993519e-06, "loss": 0.0015, "num_input_tokens_seen": 47685600, "step": 70750 }, { "epoch": 1.7285564214692302, "grad_norm": 0.624882698059082, "learning_rate": 1.6542007661443749e-06, "loss": 0.1076, "num_input_tokens_seen": 47688800, "step": 70755 }, { "epoch": 1.7286785723010774, "grad_norm": 66.73082733154297, "learning_rate": 1.654136266531874e-06, "loss": 0.1117, "num_input_tokens_seen": 47692448, "step": 70760 }, { "epoch": 1.7288007231329245, "grad_norm": 19.269960403442383, "learning_rate": 1.6540717621623182e-06, "loss": 0.2228, "num_input_tokens_seen": 47695712, "step": 70765 }, { "epoch": 1.7289228739647717, "grad_norm": 0.282610684633255, "learning_rate": 1.6540072530361767e-06, "loss": 0.0448, "num_input_tokens_seen": 47698784, "step": 70770 }, { "epoch": 1.7290450247966187, "grad_norm": 0.4029996991157532, "learning_rate": 1.6539427391539183e-06, "loss": 0.0365, "num_input_tokens_seen": 47701600, "step": 70775 }, { "epoch": 1.729167175628466, "grad_norm": 144.61106872558594, "learning_rate": 1.6538782205160124e-06, "loss": 0.0411, "num_input_tokens_seen": 47704928, "step": 70780 }, { "epoch": 1.729289326460313, "grad_norm": 0.9398413300514221, "learning_rate": 1.6538136971229284e-06, "loss": 0.0804, "num_input_tokens_seen": 47708128, "step": 70785 }, { "epoch": 1.7294114772921603, "grad_norm": 0.23816479742527008, "learning_rate": 1.6537491689751352e-06, "loss": 0.0767, "num_input_tokens_seen": 47711456, "step": 70790 }, { "epoch": 1.7295336281240075, "grad_norm": 0.2797488868236542, "learning_rate": 1.6536846360731022e-06, "loss": 0.0489, "num_input_tokens_seen": 47714784, "step": 70795 }, { "epoch": 1.7296557789558547, "grad_norm": 14.246989250183105, "learning_rate": 1.653620098417299e-06, "loss": 0.1632, "num_input_tokens_seen": 47717856, "step": 70800 }, { "epoch": 1.7297779297877018, "grad_norm": 20.030927658081055, "learning_rate": 1.6535555560081945e-06, "loss": 0.0536, "num_input_tokens_seen": 47721440, "step": 70805 }, { "epoch": 1.729900080619549, "grad_norm": 0.24360552430152893, "learning_rate": 1.653491008846258e-06, "loss": 0.0379, "num_input_tokens_seen": 47724832, "step": 70810 }, { "epoch": 1.7300222314513962, "grad_norm": 0.24199669063091278, "learning_rate": 1.6534264569319594e-06, "loss": 0.1593, "num_input_tokens_seen": 47727968, "step": 70815 }, { "epoch": 1.7301443822832434, "grad_norm": 0.16798633337020874, "learning_rate": 1.6533619002657676e-06, "loss": 0.121, "num_input_tokens_seen": 47731360, "step": 70820 }, { "epoch": 1.7302665331150906, "grad_norm": 143.9950714111328, "learning_rate": 1.6532973388481523e-06, "loss": 0.0291, "num_input_tokens_seen": 47735200, "step": 70825 }, { "epoch": 1.7303886839469378, "grad_norm": 1.209952712059021, "learning_rate": 1.6532327726795834e-06, "loss": 0.088, "num_input_tokens_seen": 47738464, "step": 70830 }, { "epoch": 1.730510834778785, "grad_norm": 63.46141052246094, "learning_rate": 1.65316820176053e-06, "loss": 0.2023, "num_input_tokens_seen": 47741664, "step": 70835 }, { "epoch": 1.7306329856106322, "grad_norm": 16.589221954345703, "learning_rate": 1.6531036260914615e-06, "loss": 0.0257, "num_input_tokens_seen": 47745632, "step": 70840 }, { "epoch": 1.7307551364424791, "grad_norm": 0.10137242823839188, "learning_rate": 1.6530390456728478e-06, "loss": 0.0009, "num_input_tokens_seen": 47748832, "step": 70845 }, { "epoch": 1.7308772872743263, "grad_norm": 2.2535622119903564, "learning_rate": 1.6529744605051586e-06, "loss": 0.1395, "num_input_tokens_seen": 47752608, "step": 70850 }, { "epoch": 1.7309994381061735, "grad_norm": 0.13834618031978607, "learning_rate": 1.6529098705888636e-06, "loss": 0.0413, "num_input_tokens_seen": 47755936, "step": 70855 }, { "epoch": 1.7311215889380207, "grad_norm": 194.70693969726562, "learning_rate": 1.6528452759244322e-06, "loss": 0.1253, "num_input_tokens_seen": 47759200, "step": 70860 }, { "epoch": 1.7312437397698677, "grad_norm": 0.24020245671272278, "learning_rate": 1.6527806765123345e-06, "loss": 0.0245, "num_input_tokens_seen": 47762848, "step": 70865 }, { "epoch": 1.7313658906017149, "grad_norm": 0.7342332601547241, "learning_rate": 1.6527160723530403e-06, "loss": 0.0019, "num_input_tokens_seen": 47766048, "step": 70870 }, { "epoch": 1.731488041433562, "grad_norm": 0.24523884057998657, "learning_rate": 1.6526514634470188e-06, "loss": 0.0652, "num_input_tokens_seen": 47769696, "step": 70875 }, { "epoch": 1.7316101922654092, "grad_norm": 0.05468997359275818, "learning_rate": 1.6525868497947406e-06, "loss": 0.0015, "num_input_tokens_seen": 47772896, "step": 70880 }, { "epoch": 1.7317323430972564, "grad_norm": 163.22579956054688, "learning_rate": 1.6525222313966754e-06, "loss": 0.0574, "num_input_tokens_seen": 47776096, "step": 70885 }, { "epoch": 1.7318544939291036, "grad_norm": 227.71888732910156, "learning_rate": 1.6524576082532927e-06, "loss": 0.1191, "num_input_tokens_seen": 47779360, "step": 70890 }, { "epoch": 1.7319766447609508, "grad_norm": 39.0536003112793, "learning_rate": 1.6523929803650632e-06, "loss": 0.1471, "num_input_tokens_seen": 47783008, "step": 70895 }, { "epoch": 1.732098795592798, "grad_norm": 2.195404291152954, "learning_rate": 1.6523283477324561e-06, "loss": 0.0375, "num_input_tokens_seen": 47786080, "step": 70900 }, { "epoch": 1.7322209464246452, "grad_norm": 44.34257507324219, "learning_rate": 1.652263710355942e-06, "loss": 0.1278, "num_input_tokens_seen": 47789088, "step": 70905 }, { "epoch": 1.7323430972564924, "grad_norm": 0.3584393560886383, "learning_rate": 1.6521990682359906e-06, "loss": 0.1328, "num_input_tokens_seen": 47792992, "step": 70910 }, { "epoch": 1.7324652480883396, "grad_norm": 0.08577506244182587, "learning_rate": 1.6521344213730723e-06, "loss": 0.089, "num_input_tokens_seen": 47796512, "step": 70915 }, { "epoch": 1.7325873989201868, "grad_norm": 29.132362365722656, "learning_rate": 1.652069769767657e-06, "loss": 0.091, "num_input_tokens_seen": 47800032, "step": 70920 }, { "epoch": 1.732709549752034, "grad_norm": 36.29575729370117, "learning_rate": 1.6520051134202154e-06, "loss": 0.0469, "num_input_tokens_seen": 47803872, "step": 70925 }, { "epoch": 1.7328317005838811, "grad_norm": 0.9603968262672424, "learning_rate": 1.6519404523312166e-06, "loss": 0.1223, "num_input_tokens_seen": 47807584, "step": 70930 }, { "epoch": 1.732953851415728, "grad_norm": 0.1118437722325325, "learning_rate": 1.6518757865011316e-06, "loss": 0.0338, "num_input_tokens_seen": 47811040, "step": 70935 }, { "epoch": 1.7330760022475753, "grad_norm": 103.07813262939453, "learning_rate": 1.651811115930431e-06, "loss": 0.0911, "num_input_tokens_seen": 47814048, "step": 70940 }, { "epoch": 1.7331981530794225, "grad_norm": 10.315584182739258, "learning_rate": 1.651746440619584e-06, "loss": 0.1733, "num_input_tokens_seen": 47817440, "step": 70945 }, { "epoch": 1.7333203039112697, "grad_norm": 12.482510566711426, "learning_rate": 1.651681760569062e-06, "loss": 0.0495, "num_input_tokens_seen": 47820512, "step": 70950 }, { "epoch": 1.7334424547431166, "grad_norm": 11.532674789428711, "learning_rate": 1.651617075779335e-06, "loss": 0.0826, "num_input_tokens_seen": 47823648, "step": 70955 }, { "epoch": 1.7335646055749638, "grad_norm": 0.13834688067436218, "learning_rate": 1.651552386250873e-06, "loss": 0.0421, "num_input_tokens_seen": 47827168, "step": 70960 }, { "epoch": 1.733686756406811, "grad_norm": 0.4201122522354126, "learning_rate": 1.6514876919841472e-06, "loss": 0.0399, "num_input_tokens_seen": 47830560, "step": 70965 }, { "epoch": 1.7338089072386582, "grad_norm": 0.19618572294712067, "learning_rate": 1.6514229929796274e-06, "loss": 0.1524, "num_input_tokens_seen": 47833696, "step": 70970 }, { "epoch": 1.7339310580705054, "grad_norm": 0.6358382701873779, "learning_rate": 1.6513582892377846e-06, "loss": 0.0509, "num_input_tokens_seen": 47836960, "step": 70975 }, { "epoch": 1.7340532089023526, "grad_norm": 0.05346240848302841, "learning_rate": 1.651293580759089e-06, "loss": 0.0013, "num_input_tokens_seen": 47840032, "step": 70980 }, { "epoch": 1.7341753597341998, "grad_norm": 8.918015480041504, "learning_rate": 1.6512288675440113e-06, "loss": 0.0849, "num_input_tokens_seen": 47842912, "step": 70985 }, { "epoch": 1.734297510566047, "grad_norm": 1.396060824394226, "learning_rate": 1.6511641495930224e-06, "loss": 0.0021, "num_input_tokens_seen": 47846624, "step": 70990 }, { "epoch": 1.7344196613978942, "grad_norm": 13.514362335205078, "learning_rate": 1.651099426906592e-06, "loss": 0.1011, "num_input_tokens_seen": 47849632, "step": 70995 }, { "epoch": 1.7345418122297414, "grad_norm": 10.2088041305542, "learning_rate": 1.651034699485192e-06, "loss": 0.2572, "num_input_tokens_seen": 47853792, "step": 71000 }, { "epoch": 1.7346639630615885, "grad_norm": 30.044353485107422, "learning_rate": 1.6509699673292925e-06, "loss": 0.1634, "num_input_tokens_seen": 47857248, "step": 71005 }, { "epoch": 1.7347861138934357, "grad_norm": 19.223125457763672, "learning_rate": 1.6509052304393643e-06, "loss": 0.2716, "num_input_tokens_seen": 47860448, "step": 71010 }, { "epoch": 1.734908264725283, "grad_norm": 0.053986601531505585, "learning_rate": 1.650840488815878e-06, "loss": 0.0248, "num_input_tokens_seen": 47864672, "step": 71015 }, { "epoch": 1.73503041555713, "grad_norm": 72.20088195800781, "learning_rate": 1.6507757424593047e-06, "loss": 0.0304, "num_input_tokens_seen": 47868448, "step": 71020 }, { "epoch": 1.735152566388977, "grad_norm": 0.3802885413169861, "learning_rate": 1.6507109913701154e-06, "loss": 0.187, "num_input_tokens_seen": 47872288, "step": 71025 }, { "epoch": 1.7352747172208243, "grad_norm": 3.7849714756011963, "learning_rate": 1.6506462355487804e-06, "loss": 0.038, "num_input_tokens_seen": 47875744, "step": 71030 }, { "epoch": 1.7353968680526715, "grad_norm": 1.352379322052002, "learning_rate": 1.650581474995771e-06, "loss": 0.0689, "num_input_tokens_seen": 47879392, "step": 71035 }, { "epoch": 1.7355190188845186, "grad_norm": 18.819459915161133, "learning_rate": 1.6505167097115581e-06, "loss": 0.0723, "num_input_tokens_seen": 47882528, "step": 71040 }, { "epoch": 1.7356411697163656, "grad_norm": 15.091285705566406, "learning_rate": 1.650451939696613e-06, "loss": 0.0519, "num_input_tokens_seen": 47886432, "step": 71045 }, { "epoch": 1.7357633205482128, "grad_norm": 0.20803719758987427, "learning_rate": 1.6503871649514064e-06, "loss": 0.0505, "num_input_tokens_seen": 47889504, "step": 71050 }, { "epoch": 1.73588547138006, "grad_norm": 153.0974578857422, "learning_rate": 1.6503223854764093e-06, "loss": 0.0962, "num_input_tokens_seen": 47892768, "step": 71055 }, { "epoch": 1.7360076222119072, "grad_norm": 68.89737701416016, "learning_rate": 1.6502576012720928e-06, "loss": 0.057, "num_input_tokens_seen": 47896480, "step": 71060 }, { "epoch": 1.7361297730437544, "grad_norm": 112.9688949584961, "learning_rate": 1.6501928123389282e-06, "loss": 0.0938, "num_input_tokens_seen": 47900256, "step": 71065 }, { "epoch": 1.7362519238756016, "grad_norm": 0.1029193177819252, "learning_rate": 1.6501280186773867e-06, "loss": 0.079, "num_input_tokens_seen": 47903648, "step": 71070 }, { "epoch": 1.7363740747074488, "grad_norm": 35.86996841430664, "learning_rate": 1.6500632202879392e-06, "loss": 0.1451, "num_input_tokens_seen": 47906912, "step": 71075 }, { "epoch": 1.736496225539296, "grad_norm": 93.33722686767578, "learning_rate": 1.6499984171710572e-06, "loss": 0.0872, "num_input_tokens_seen": 47910560, "step": 71080 }, { "epoch": 1.7366183763711431, "grad_norm": 0.13678595423698425, "learning_rate": 1.6499336093272121e-06, "loss": 0.0756, "num_input_tokens_seen": 47913632, "step": 71085 }, { "epoch": 1.7367405272029903, "grad_norm": 0.1121816635131836, "learning_rate": 1.6498687967568745e-06, "loss": 0.0782, "num_input_tokens_seen": 47917152, "step": 71090 }, { "epoch": 1.7368626780348375, "grad_norm": 0.06260692328214645, "learning_rate": 1.6498039794605166e-06, "loss": 0.0485, "num_input_tokens_seen": 47920288, "step": 71095 }, { "epoch": 1.7369848288666847, "grad_norm": 11.609285354614258, "learning_rate": 1.649739157438609e-06, "loss": 0.0867, "num_input_tokens_seen": 47923872, "step": 71100 }, { "epoch": 1.737106979698532, "grad_norm": 14.361102104187012, "learning_rate": 1.649674330691624e-06, "loss": 0.036, "num_input_tokens_seen": 47927200, "step": 71105 }, { "epoch": 1.7372291305303789, "grad_norm": 19.945722579956055, "learning_rate": 1.6496094992200322e-06, "loss": 0.0798, "num_input_tokens_seen": 47930464, "step": 71110 }, { "epoch": 1.737351281362226, "grad_norm": 0.09397384524345398, "learning_rate": 1.6495446630243056e-06, "loss": 0.0412, "num_input_tokens_seen": 47933984, "step": 71115 }, { "epoch": 1.7374734321940732, "grad_norm": 3.3554627895355225, "learning_rate": 1.649479822104915e-06, "loss": 0.059, "num_input_tokens_seen": 47937184, "step": 71120 }, { "epoch": 1.7375955830259204, "grad_norm": 14.1300630569458, "learning_rate": 1.649414976462333e-06, "loss": 0.0557, "num_input_tokens_seen": 47940512, "step": 71125 }, { "epoch": 1.7377177338577676, "grad_norm": 0.28302502632141113, "learning_rate": 1.6493501260970306e-06, "loss": 0.0509, "num_input_tokens_seen": 47944096, "step": 71130 }, { "epoch": 1.7378398846896146, "grad_norm": 0.15091900527477264, "learning_rate": 1.6492852710094792e-06, "loss": 0.0978, "num_input_tokens_seen": 47947872, "step": 71135 }, { "epoch": 1.7379620355214618, "grad_norm": 123.89974975585938, "learning_rate": 1.649220411200151e-06, "loss": 0.0629, "num_input_tokens_seen": 47951200, "step": 71140 }, { "epoch": 1.738084186353309, "grad_norm": 0.14565247297286987, "learning_rate": 1.649155546669517e-06, "loss": 0.0905, "num_input_tokens_seen": 47954400, "step": 71145 }, { "epoch": 1.7382063371851562, "grad_norm": 26.535411834716797, "learning_rate": 1.6490906774180493e-06, "loss": 0.12, "num_input_tokens_seen": 47957664, "step": 71150 }, { "epoch": 1.7383284880170033, "grad_norm": 64.44357299804688, "learning_rate": 1.6490258034462196e-06, "loss": 0.0773, "num_input_tokens_seen": 47960928, "step": 71155 }, { "epoch": 1.7384506388488505, "grad_norm": 1.9539140462875366, "learning_rate": 1.6489609247544998e-06, "loss": 0.0025, "num_input_tokens_seen": 47964512, "step": 71160 }, { "epoch": 1.7385727896806977, "grad_norm": 0.23583951592445374, "learning_rate": 1.6488960413433617e-06, "loss": 0.1051, "num_input_tokens_seen": 47967456, "step": 71165 }, { "epoch": 1.738694940512545, "grad_norm": 53.65757751464844, "learning_rate": 1.6488311532132768e-06, "loss": 0.0377, "num_input_tokens_seen": 47970336, "step": 71170 }, { "epoch": 1.738817091344392, "grad_norm": 88.5074691772461, "learning_rate": 1.6487662603647174e-06, "loss": 0.1525, "num_input_tokens_seen": 47973984, "step": 71175 }, { "epoch": 1.7389392421762393, "grad_norm": 27.1817569732666, "learning_rate": 1.6487013627981554e-06, "loss": 0.045, "num_input_tokens_seen": 47977248, "step": 71180 }, { "epoch": 1.7390613930080865, "grad_norm": 26.244455337524414, "learning_rate": 1.648636460514062e-06, "loss": 0.1077, "num_input_tokens_seen": 47980384, "step": 71185 }, { "epoch": 1.7391835438399337, "grad_norm": 139.1673583984375, "learning_rate": 1.6485715535129107e-06, "loss": 0.0564, "num_input_tokens_seen": 47983584, "step": 71190 }, { "epoch": 1.7393056946717809, "grad_norm": 0.4898732006549835, "learning_rate": 1.648506641795172e-06, "loss": 0.0383, "num_input_tokens_seen": 47986656, "step": 71195 }, { "epoch": 1.7394278455036278, "grad_norm": 0.0008758667972870171, "learning_rate": 1.6484417253613184e-06, "loss": 0.0527, "num_input_tokens_seen": 47990240, "step": 71200 }, { "epoch": 1.739549996335475, "grad_norm": 3.5209832191467285, "learning_rate": 1.6483768042118227e-06, "loss": 0.1255, "num_input_tokens_seen": 47993568, "step": 71205 }, { "epoch": 1.7396721471673222, "grad_norm": 12.4790678024292, "learning_rate": 1.6483118783471563e-06, "loss": 0.1047, "num_input_tokens_seen": 47996832, "step": 71210 }, { "epoch": 1.7397942979991694, "grad_norm": 80.48369598388672, "learning_rate": 1.6482469477677916e-06, "loss": 0.0045, "num_input_tokens_seen": 48000224, "step": 71215 }, { "epoch": 1.7399164488310166, "grad_norm": 33.51426315307617, "learning_rate": 1.6481820124742005e-06, "loss": 0.1856, "num_input_tokens_seen": 48003744, "step": 71220 }, { "epoch": 1.7400385996628636, "grad_norm": 29.704303741455078, "learning_rate": 1.6481170724668556e-06, "loss": 0.0031, "num_input_tokens_seen": 48006880, "step": 71225 }, { "epoch": 1.7401607504947108, "grad_norm": 10.239999771118164, "learning_rate": 1.648052127746229e-06, "loss": 0.0432, "num_input_tokens_seen": 48010144, "step": 71230 }, { "epoch": 1.740282901326558, "grad_norm": 0.9402587413787842, "learning_rate": 1.6479871783127932e-06, "loss": 0.0296, "num_input_tokens_seen": 48013216, "step": 71235 }, { "epoch": 1.7404050521584051, "grad_norm": 55.33900833129883, "learning_rate": 1.6479222241670204e-06, "loss": 0.1239, "num_input_tokens_seen": 48016544, "step": 71240 }, { "epoch": 1.7405272029902523, "grad_norm": 0.043433524668216705, "learning_rate": 1.6478572653093826e-06, "loss": 0.0498, "num_input_tokens_seen": 48019744, "step": 71245 }, { "epoch": 1.7406493538220995, "grad_norm": 0.17277486622333527, "learning_rate": 1.6477923017403526e-06, "loss": 0.0319, "num_input_tokens_seen": 48023200, "step": 71250 }, { "epoch": 1.7407715046539467, "grad_norm": 37.4304313659668, "learning_rate": 1.647727333460403e-06, "loss": 0.1408, "num_input_tokens_seen": 48027104, "step": 71255 }, { "epoch": 1.740893655485794, "grad_norm": 0.046346426010131836, "learning_rate": 1.6476623604700058e-06, "loss": 0.0516, "num_input_tokens_seen": 48031072, "step": 71260 }, { "epoch": 1.741015806317641, "grad_norm": 0.047822318971157074, "learning_rate": 1.6475973827696336e-06, "loss": 0.0052, "num_input_tokens_seen": 48034144, "step": 71265 }, { "epoch": 1.7411379571494883, "grad_norm": 0.5275576114654541, "learning_rate": 1.6475324003597591e-06, "loss": 0.0012, "num_input_tokens_seen": 48037216, "step": 71270 }, { "epoch": 1.7412601079813355, "grad_norm": 34.40401840209961, "learning_rate": 1.6474674132408548e-06, "loss": 0.1353, "num_input_tokens_seen": 48040416, "step": 71275 }, { "epoch": 1.7413822588131826, "grad_norm": 22.45237159729004, "learning_rate": 1.6474024214133935e-06, "loss": 0.0746, "num_input_tokens_seen": 48044064, "step": 71280 }, { "epoch": 1.7415044096450298, "grad_norm": 28.317441940307617, "learning_rate": 1.6473374248778475e-06, "loss": 0.145, "num_input_tokens_seen": 48047520, "step": 71285 }, { "epoch": 1.7416265604768768, "grad_norm": 0.4682207703590393, "learning_rate": 1.6472724236346897e-06, "loss": 0.1019, "num_input_tokens_seen": 48050848, "step": 71290 }, { "epoch": 1.741748711308724, "grad_norm": 0.2123502492904663, "learning_rate": 1.647207417684393e-06, "loss": 0.2008, "num_input_tokens_seen": 48054560, "step": 71295 }, { "epoch": 1.7418708621405712, "grad_norm": 0.3460741937160492, "learning_rate": 1.6471424070274295e-06, "loss": 0.0145, "num_input_tokens_seen": 48058144, "step": 71300 }, { "epoch": 1.7419930129724184, "grad_norm": 0.04934366047382355, "learning_rate": 1.6470773916642726e-06, "loss": 0.0253, "num_input_tokens_seen": 48061088, "step": 71305 }, { "epoch": 1.7421151638042653, "grad_norm": 54.11192321777344, "learning_rate": 1.6470123715953944e-06, "loss": 0.1064, "num_input_tokens_seen": 48064224, "step": 71310 }, { "epoch": 1.7422373146361125, "grad_norm": 0.3336058259010315, "learning_rate": 1.6469473468212688e-06, "loss": 0.0435, "num_input_tokens_seen": 48067296, "step": 71315 }, { "epoch": 1.7423594654679597, "grad_norm": 0.8524634838104248, "learning_rate": 1.646882317342368e-06, "loss": 0.0313, "num_input_tokens_seen": 48071712, "step": 71320 }, { "epoch": 1.742481616299807, "grad_norm": 0.20961619913578033, "learning_rate": 1.6468172831591647e-06, "loss": 0.0538, "num_input_tokens_seen": 48075104, "step": 71325 }, { "epoch": 1.742603767131654, "grad_norm": 0.06928694993257523, "learning_rate": 1.6467522442721325e-06, "loss": 0.0011, "num_input_tokens_seen": 48078688, "step": 71330 }, { "epoch": 1.7427259179635013, "grad_norm": 30.175479888916016, "learning_rate": 1.6466872006817436e-06, "loss": 0.152, "num_input_tokens_seen": 48082272, "step": 71335 }, { "epoch": 1.7428480687953485, "grad_norm": 0.060041431337594986, "learning_rate": 1.6466221523884715e-06, "loss": 0.0009, "num_input_tokens_seen": 48085856, "step": 71340 }, { "epoch": 1.7429702196271957, "grad_norm": 0.18987978994846344, "learning_rate": 1.6465570993927895e-06, "loss": 0.1235, "num_input_tokens_seen": 48089312, "step": 71345 }, { "epoch": 1.7430923704590429, "grad_norm": 12.799039840698242, "learning_rate": 1.6464920416951702e-06, "loss": 0.0428, "num_input_tokens_seen": 48092704, "step": 71350 }, { "epoch": 1.74321452129089, "grad_norm": 0.3737916648387909, "learning_rate": 1.6464269792960867e-06, "loss": 0.0574, "num_input_tokens_seen": 48096224, "step": 71355 }, { "epoch": 1.7433366721227372, "grad_norm": 0.11154882609844208, "learning_rate": 1.6463619121960127e-06, "loss": 0.0495, "num_input_tokens_seen": 48099424, "step": 71360 }, { "epoch": 1.7434588229545844, "grad_norm": 0.8025643229484558, "learning_rate": 1.646296840395421e-06, "loss": 0.0956, "num_input_tokens_seen": 48102560, "step": 71365 }, { "epoch": 1.7435809737864316, "grad_norm": 0.3329102098941803, "learning_rate": 1.6462317638947846e-06, "loss": 0.1285, "num_input_tokens_seen": 48105824, "step": 71370 }, { "epoch": 1.7437031246182788, "grad_norm": 0.03304976969957352, "learning_rate": 1.646166682694577e-06, "loss": 0.0709, "num_input_tokens_seen": 48109472, "step": 71375 }, { "epoch": 1.7438252754501258, "grad_norm": 0.25978147983551025, "learning_rate": 1.6461015967952717e-06, "loss": 0.08, "num_input_tokens_seen": 48112608, "step": 71380 }, { "epoch": 1.743947426281973, "grad_norm": 0.16290909051895142, "learning_rate": 1.6460365061973418e-06, "loss": 0.0549, "num_input_tokens_seen": 48115936, "step": 71385 }, { "epoch": 1.7440695771138202, "grad_norm": 0.23594853281974792, "learning_rate": 1.6459714109012603e-06, "loss": 0.0378, "num_input_tokens_seen": 48119968, "step": 71390 }, { "epoch": 1.7441917279456673, "grad_norm": 24.003435134887695, "learning_rate": 1.6459063109075014e-06, "loss": 0.1648, "num_input_tokens_seen": 48123232, "step": 71395 }, { "epoch": 1.7443138787775143, "grad_norm": 9.70009708404541, "learning_rate": 1.6458412062165378e-06, "loss": 0.1554, "num_input_tokens_seen": 48126816, "step": 71400 }, { "epoch": 1.7444360296093615, "grad_norm": 35.708526611328125, "learning_rate": 1.6457760968288432e-06, "loss": 0.1034, "num_input_tokens_seen": 48129888, "step": 71405 }, { "epoch": 1.7445581804412087, "grad_norm": 9.53857135772705, "learning_rate": 1.6457109827448914e-06, "loss": 0.1703, "num_input_tokens_seen": 48133152, "step": 71410 }, { "epoch": 1.7446803312730559, "grad_norm": 25.09573745727539, "learning_rate": 1.6456458639651553e-06, "loss": 0.1, "num_input_tokens_seen": 48136480, "step": 71415 }, { "epoch": 1.744802482104903, "grad_norm": 0.6554991602897644, "learning_rate": 1.6455807404901093e-06, "loss": 0.0925, "num_input_tokens_seen": 48139808, "step": 71420 }, { "epoch": 1.7449246329367503, "grad_norm": 0.8519078493118286, "learning_rate": 1.6455156123202264e-06, "loss": 0.0316, "num_input_tokens_seen": 48142688, "step": 71425 }, { "epoch": 1.7450467837685975, "grad_norm": 0.08944659680128098, "learning_rate": 1.64545047945598e-06, "loss": 0.0916, "num_input_tokens_seen": 48146144, "step": 71430 }, { "epoch": 1.7451689346004446, "grad_norm": 0.468730628490448, "learning_rate": 1.6453853418978444e-06, "loss": 0.0066, "num_input_tokens_seen": 48149472, "step": 71435 }, { "epoch": 1.7452910854322918, "grad_norm": 0.04151911288499832, "learning_rate": 1.6453201996462928e-06, "loss": 0.1033, "num_input_tokens_seen": 48152608, "step": 71440 }, { "epoch": 1.745413236264139, "grad_norm": 0.18563562631607056, "learning_rate": 1.6452550527017994e-06, "loss": 0.0029, "num_input_tokens_seen": 48155552, "step": 71445 }, { "epoch": 1.7455353870959862, "grad_norm": 9.495387077331543, "learning_rate": 1.6451899010648377e-06, "loss": 0.0464, "num_input_tokens_seen": 48158752, "step": 71450 }, { "epoch": 1.7456575379278334, "grad_norm": 48.27766036987305, "learning_rate": 1.6451247447358812e-06, "loss": 0.1674, "num_input_tokens_seen": 48162144, "step": 71455 }, { "epoch": 1.7457796887596806, "grad_norm": 0.08520246297121048, "learning_rate": 1.6450595837154042e-06, "loss": 0.0626, "num_input_tokens_seen": 48165792, "step": 71460 }, { "epoch": 1.7459018395915278, "grad_norm": 29.665040969848633, "learning_rate": 1.6449944180038805e-06, "loss": 0.0679, "num_input_tokens_seen": 48168864, "step": 71465 }, { "epoch": 1.7460239904233747, "grad_norm": 0.4952899217605591, "learning_rate": 1.6449292476017835e-06, "loss": 0.0022, "num_input_tokens_seen": 48172128, "step": 71470 }, { "epoch": 1.746146141255222, "grad_norm": 0.07937739044427872, "learning_rate": 1.6448640725095882e-06, "loss": 0.1354, "num_input_tokens_seen": 48175328, "step": 71475 }, { "epoch": 1.7462682920870691, "grad_norm": 10.36311149597168, "learning_rate": 1.6447988927277674e-06, "loss": 0.0472, "num_input_tokens_seen": 48178848, "step": 71480 }, { "epoch": 1.7463904429189163, "grad_norm": 1.0270441770553589, "learning_rate": 1.6447337082567958e-06, "loss": 0.1125, "num_input_tokens_seen": 48182432, "step": 71485 }, { "epoch": 1.7465125937507633, "grad_norm": 0.06908803433179855, "learning_rate": 1.6446685190971472e-06, "loss": 0.042, "num_input_tokens_seen": 48186208, "step": 71490 }, { "epoch": 1.7466347445826105, "grad_norm": 164.5870361328125, "learning_rate": 1.6446033252492958e-06, "loss": 0.1288, "num_input_tokens_seen": 48189792, "step": 71495 }, { "epoch": 1.7467568954144577, "grad_norm": 1.4709440469741821, "learning_rate": 1.6445381267137158e-06, "loss": 0.0895, "num_input_tokens_seen": 48193312, "step": 71500 }, { "epoch": 1.7468790462463049, "grad_norm": 24.193714141845703, "learning_rate": 1.644472923490881e-06, "loss": 0.1459, "num_input_tokens_seen": 48196384, "step": 71505 }, { "epoch": 1.747001197078152, "grad_norm": 0.11595480889081955, "learning_rate": 1.6444077155812656e-06, "loss": 0.0364, "num_input_tokens_seen": 48199840, "step": 71510 }, { "epoch": 1.7471233479099992, "grad_norm": 2.7352027893066406, "learning_rate": 1.6443425029853442e-06, "loss": 0.003, "num_input_tokens_seen": 48203616, "step": 71515 }, { "epoch": 1.7472454987418464, "grad_norm": 0.22880008816719055, "learning_rate": 1.6442772857035906e-06, "loss": 0.0822, "num_input_tokens_seen": 48206816, "step": 71520 }, { "epoch": 1.7473676495736936, "grad_norm": 9.08481502532959, "learning_rate": 1.6442120637364796e-06, "loss": 0.0369, "num_input_tokens_seen": 48210464, "step": 71525 }, { "epoch": 1.7474898004055408, "grad_norm": 23.009002685546875, "learning_rate": 1.6441468370844848e-06, "loss": 0.0411, "num_input_tokens_seen": 48214112, "step": 71530 }, { "epoch": 1.747611951237388, "grad_norm": 0.7533755898475647, "learning_rate": 1.6440816057480812e-06, "loss": 0.2124, "num_input_tokens_seen": 48217824, "step": 71535 }, { "epoch": 1.7477341020692352, "grad_norm": 0.3032033443450928, "learning_rate": 1.6440163697277432e-06, "loss": 0.0437, "num_input_tokens_seen": 48221216, "step": 71540 }, { "epoch": 1.7478562529010824, "grad_norm": 12.668747901916504, "learning_rate": 1.6439511290239447e-06, "loss": 0.1492, "num_input_tokens_seen": 48224992, "step": 71545 }, { "epoch": 1.7479784037329296, "grad_norm": 23.0964412689209, "learning_rate": 1.6438858836371604e-06, "loss": 0.0295, "num_input_tokens_seen": 48228384, "step": 71550 }, { "epoch": 1.7481005545647765, "grad_norm": 0.36234384775161743, "learning_rate": 1.6438206335678647e-06, "loss": 0.0376, "num_input_tokens_seen": 48231584, "step": 71555 }, { "epoch": 1.7482227053966237, "grad_norm": 0.10900892317295074, "learning_rate": 1.6437553788165319e-06, "loss": 0.0459, "num_input_tokens_seen": 48234720, "step": 71560 }, { "epoch": 1.748344856228471, "grad_norm": 0.11470355093479156, "learning_rate": 1.6436901193836372e-06, "loss": 0.0754, "num_input_tokens_seen": 48238368, "step": 71565 }, { "epoch": 1.748467007060318, "grad_norm": 0.06794214248657227, "learning_rate": 1.6436248552696547e-06, "loss": 0.06, "num_input_tokens_seen": 48241760, "step": 71570 }, { "epoch": 1.7485891578921653, "grad_norm": 12.417028427124023, "learning_rate": 1.6435595864750592e-06, "loss": 0.2301, "num_input_tokens_seen": 48245344, "step": 71575 }, { "epoch": 1.7487113087240123, "grad_norm": 20.654052734375, "learning_rate": 1.6434943130003253e-06, "loss": 0.0589, "num_input_tokens_seen": 48248992, "step": 71580 }, { "epoch": 1.7488334595558594, "grad_norm": 0.12384731322526932, "learning_rate": 1.6434290348459279e-06, "loss": 0.0801, "num_input_tokens_seen": 48252128, "step": 71585 }, { "epoch": 1.7489556103877066, "grad_norm": 35.29301452636719, "learning_rate": 1.643363752012341e-06, "loss": 0.0718, "num_input_tokens_seen": 48255456, "step": 71590 }, { "epoch": 1.7490777612195538, "grad_norm": 13.87523078918457, "learning_rate": 1.6432984645000403e-06, "loss": 0.2395, "num_input_tokens_seen": 48258464, "step": 71595 }, { "epoch": 1.749199912051401, "grad_norm": 0.09350656718015671, "learning_rate": 1.6432331723095e-06, "loss": 0.0602, "num_input_tokens_seen": 48261728, "step": 71600 }, { "epoch": 1.7493220628832482, "grad_norm": 0.042109668254852295, "learning_rate": 1.6431678754411951e-06, "loss": 0.1416, "num_input_tokens_seen": 48265248, "step": 71605 }, { "epoch": 1.7494442137150954, "grad_norm": 11.492691040039062, "learning_rate": 1.6431025738956002e-06, "loss": 0.1015, "num_input_tokens_seen": 48268768, "step": 71610 }, { "epoch": 1.7495663645469426, "grad_norm": 19.275936126708984, "learning_rate": 1.6430372676731904e-06, "loss": 0.15, "num_input_tokens_seen": 48272096, "step": 71615 }, { "epoch": 1.7496885153787898, "grad_norm": 15.951652526855469, "learning_rate": 1.6429719567744406e-06, "loss": 0.0779, "num_input_tokens_seen": 48275552, "step": 71620 }, { "epoch": 1.749810666210637, "grad_norm": 0.5586774349212646, "learning_rate": 1.6429066411998261e-06, "loss": 0.1512, "num_input_tokens_seen": 48278944, "step": 71625 }, { "epoch": 1.7499328170424842, "grad_norm": 0.03820788115262985, "learning_rate": 1.6428413209498216e-06, "loss": 0.1058, "num_input_tokens_seen": 48282080, "step": 71630 }, { "epoch": 1.7500549678743313, "grad_norm": 168.43922424316406, "learning_rate": 1.6427759960249018e-06, "loss": 0.1097, "num_input_tokens_seen": 48285280, "step": 71635 }, { "epoch": 1.7501282583734397, "eval_loss": 0.12312835454940796, "eval_runtime": 47.5799, "eval_samples_per_second": 764.714, "eval_steps_per_second": 95.608, "num_input_tokens_seen": 48287456, "step": 71638 }, { "epoch": 1.7501771187061785, "grad_norm": 88.0875015258789, "learning_rate": 1.6427106664255423e-06, "loss": 0.0614, "num_input_tokens_seen": 48288800, "step": 71640 }, { "epoch": 1.7502992695380255, "grad_norm": 0.25800928473472595, "learning_rate": 1.642645332152218e-06, "loss": 0.0532, "num_input_tokens_seen": 48291744, "step": 71645 }, { "epoch": 1.7504214203698727, "grad_norm": 1.028090000152588, "learning_rate": 1.6425799932054037e-06, "loss": 0.0864, "num_input_tokens_seen": 48295264, "step": 71650 }, { "epoch": 1.7505435712017199, "grad_norm": 0.6299525499343872, "learning_rate": 1.642514649585575e-06, "loss": 0.1333, "num_input_tokens_seen": 48298400, "step": 71655 }, { "epoch": 1.750665722033567, "grad_norm": 94.1126708984375, "learning_rate": 1.6424493012932072e-06, "loss": 0.1662, "num_input_tokens_seen": 48301600, "step": 71660 }, { "epoch": 1.7507878728654143, "grad_norm": 0.396215558052063, "learning_rate": 1.6423839483287751e-06, "loss": 0.0255, "num_input_tokens_seen": 48305312, "step": 71665 }, { "epoch": 1.7509100236972612, "grad_norm": 0.31599822640419006, "learning_rate": 1.6423185906927542e-06, "loss": 0.0415, "num_input_tokens_seen": 48308448, "step": 71670 }, { "epoch": 1.7510321745291084, "grad_norm": 0.15240201354026794, "learning_rate": 1.6422532283856195e-06, "loss": 0.0536, "num_input_tokens_seen": 48311968, "step": 71675 }, { "epoch": 1.7511543253609556, "grad_norm": 201.6836395263672, "learning_rate": 1.6421878614078466e-06, "loss": 0.0186, "num_input_tokens_seen": 48315424, "step": 71680 }, { "epoch": 1.7512764761928028, "grad_norm": 0.1291874200105667, "learning_rate": 1.642122489759911e-06, "loss": 0.0714, "num_input_tokens_seen": 48319008, "step": 71685 }, { "epoch": 1.75139862702465, "grad_norm": 0.5408225059509277, "learning_rate": 1.642057113442288e-06, "loss": 0.1259, "num_input_tokens_seen": 48323168, "step": 71690 }, { "epoch": 1.7515207778564972, "grad_norm": 0.1968991905450821, "learning_rate": 1.641991732455453e-06, "loss": 0.0668, "num_input_tokens_seen": 48326752, "step": 71695 }, { "epoch": 1.7516429286883444, "grad_norm": 107.3658218383789, "learning_rate": 1.6419263467998813e-06, "loss": 0.0899, "num_input_tokens_seen": 48330144, "step": 71700 }, { "epoch": 1.7517650795201916, "grad_norm": 22.380523681640625, "learning_rate": 1.6418609564760485e-06, "loss": 0.1817, "num_input_tokens_seen": 48333280, "step": 71705 }, { "epoch": 1.7518872303520387, "grad_norm": 0.2987278699874878, "learning_rate": 1.6417955614844304e-06, "loss": 0.1945, "num_input_tokens_seen": 48336544, "step": 71710 }, { "epoch": 1.752009381183886, "grad_norm": 0.23216158151626587, "learning_rate": 1.6417301618255021e-06, "loss": 0.0362, "num_input_tokens_seen": 48339744, "step": 71715 }, { "epoch": 1.7521315320157331, "grad_norm": 0.3558136522769928, "learning_rate": 1.6416647574997397e-06, "loss": 0.0275, "num_input_tokens_seen": 48343200, "step": 71720 }, { "epoch": 1.7522536828475803, "grad_norm": 55.75593948364258, "learning_rate": 1.6415993485076184e-06, "loss": 0.0569, "num_input_tokens_seen": 48346848, "step": 71725 }, { "epoch": 1.7523758336794275, "grad_norm": 0.14112365245819092, "learning_rate": 1.6415339348496144e-06, "loss": 0.002, "num_input_tokens_seen": 48350560, "step": 71730 }, { "epoch": 1.7524979845112745, "grad_norm": 7.328372955322266, "learning_rate": 1.6414685165262027e-06, "loss": 0.1212, "num_input_tokens_seen": 48354016, "step": 71735 }, { "epoch": 1.7526201353431217, "grad_norm": 11.065155029296875, "learning_rate": 1.6414030935378597e-06, "loss": 0.1804, "num_input_tokens_seen": 48357728, "step": 71740 }, { "epoch": 1.7527422861749689, "grad_norm": 0.29226332902908325, "learning_rate": 1.6413376658850607e-06, "loss": 0.0572, "num_input_tokens_seen": 48361248, "step": 71745 }, { "epoch": 1.752864437006816, "grad_norm": 0.9687400460243225, "learning_rate": 1.6412722335682818e-06, "loss": 0.0051, "num_input_tokens_seen": 48364704, "step": 71750 }, { "epoch": 1.7529865878386632, "grad_norm": 144.53562927246094, "learning_rate": 1.6412067965879986e-06, "loss": 0.096, "num_input_tokens_seen": 48367840, "step": 71755 }, { "epoch": 1.7531087386705102, "grad_norm": 0.12270642817020416, "learning_rate": 1.6411413549446873e-06, "loss": 0.0214, "num_input_tokens_seen": 48371552, "step": 71760 }, { "epoch": 1.7532308895023574, "grad_norm": 0.2248445302248001, "learning_rate": 1.6410759086388235e-06, "loss": 0.0951, "num_input_tokens_seen": 48374624, "step": 71765 }, { "epoch": 1.7533530403342046, "grad_norm": 22.86701774597168, "learning_rate": 1.6410104576708835e-06, "loss": 0.0808, "num_input_tokens_seen": 48378208, "step": 71770 }, { "epoch": 1.7534751911660518, "grad_norm": 19.745939254760742, "learning_rate": 1.6409450020413424e-06, "loss": 0.1468, "num_input_tokens_seen": 48381536, "step": 71775 }, { "epoch": 1.753597341997899, "grad_norm": 20.355432510375977, "learning_rate": 1.6408795417506773e-06, "loss": 0.1083, "num_input_tokens_seen": 48385440, "step": 71780 }, { "epoch": 1.7537194928297462, "grad_norm": 0.256071001291275, "learning_rate": 1.6408140767993639e-06, "loss": 0.0812, "num_input_tokens_seen": 48388640, "step": 71785 }, { "epoch": 1.7538416436615933, "grad_norm": 0.07664134353399277, "learning_rate": 1.640748607187878e-06, "loss": 0.155, "num_input_tokens_seen": 48392096, "step": 71790 }, { "epoch": 1.7539637944934405, "grad_norm": 7.759392261505127, "learning_rate": 1.640683132916696e-06, "loss": 0.0755, "num_input_tokens_seen": 48395552, "step": 71795 }, { "epoch": 1.7540859453252877, "grad_norm": 1.141500473022461, "learning_rate": 1.6406176539862936e-06, "loss": 0.1561, "num_input_tokens_seen": 48398816, "step": 71800 }, { "epoch": 1.754208096157135, "grad_norm": 18.568471908569336, "learning_rate": 1.6405521703971476e-06, "loss": 0.1175, "num_input_tokens_seen": 48402400, "step": 71805 }, { "epoch": 1.754330246988982, "grad_norm": 90.30744171142578, "learning_rate": 1.640486682149734e-06, "loss": 0.0259, "num_input_tokens_seen": 48405728, "step": 71810 }, { "epoch": 1.7544523978208293, "grad_norm": 11.252006530761719, "learning_rate": 1.6404211892445288e-06, "loss": 0.1611, "num_input_tokens_seen": 48408800, "step": 71815 }, { "epoch": 1.7545745486526765, "grad_norm": 0.3409070670604706, "learning_rate": 1.6403556916820088e-06, "loss": 0.0316, "num_input_tokens_seen": 48412000, "step": 71820 }, { "epoch": 1.7546966994845234, "grad_norm": 0.1269320547580719, "learning_rate": 1.6402901894626497e-06, "loss": 0.0616, "num_input_tokens_seen": 48415584, "step": 71825 }, { "epoch": 1.7548188503163706, "grad_norm": 0.20163556933403015, "learning_rate": 1.6402246825869281e-06, "loss": 0.0016, "num_input_tokens_seen": 48418976, "step": 71830 }, { "epoch": 1.7549410011482178, "grad_norm": 0.2925082743167877, "learning_rate": 1.6401591710553201e-06, "loss": 0.0401, "num_input_tokens_seen": 48422240, "step": 71835 }, { "epoch": 1.755063151980065, "grad_norm": 0.11735465377569199, "learning_rate": 1.6400936548683028e-06, "loss": 0.0016, "num_input_tokens_seen": 48425632, "step": 71840 }, { "epoch": 1.755185302811912, "grad_norm": 91.04149627685547, "learning_rate": 1.6400281340263524e-06, "loss": 0.0936, "num_input_tokens_seen": 48428448, "step": 71845 }, { "epoch": 1.7553074536437592, "grad_norm": 0.017427001148462296, "learning_rate": 1.6399626085299452e-06, "loss": 0.0006, "num_input_tokens_seen": 48432416, "step": 71850 }, { "epoch": 1.7554296044756064, "grad_norm": 35.123146057128906, "learning_rate": 1.6398970783795577e-06, "loss": 0.1226, "num_input_tokens_seen": 48436128, "step": 71855 }, { "epoch": 1.7555517553074536, "grad_norm": 13.244025230407715, "learning_rate": 1.6398315435756666e-06, "loss": 0.1049, "num_input_tokens_seen": 48439520, "step": 71860 }, { "epoch": 1.7556739061393007, "grad_norm": 61.519649505615234, "learning_rate": 1.6397660041187482e-06, "loss": 0.0826, "num_input_tokens_seen": 48442272, "step": 71865 }, { "epoch": 1.755796056971148, "grad_norm": 0.528657078742981, "learning_rate": 1.6397004600092794e-06, "loss": 0.0228, "num_input_tokens_seen": 48445472, "step": 71870 }, { "epoch": 1.7559182078029951, "grad_norm": 0.6462512612342834, "learning_rate": 1.639634911247737e-06, "loss": 0.0483, "num_input_tokens_seen": 48448928, "step": 71875 }, { "epoch": 1.7560403586348423, "grad_norm": 0.5328592658042908, "learning_rate": 1.6395693578345973e-06, "loss": 0.1246, "num_input_tokens_seen": 48452576, "step": 71880 }, { "epoch": 1.7561625094666895, "grad_norm": 1.7375816106796265, "learning_rate": 1.6395037997703373e-06, "loss": 0.0114, "num_input_tokens_seen": 48455840, "step": 71885 }, { "epoch": 1.7562846602985367, "grad_norm": 0.04445521533489227, "learning_rate": 1.6394382370554337e-06, "loss": 0.0011, "num_input_tokens_seen": 48458848, "step": 71890 }, { "epoch": 1.7564068111303839, "grad_norm": 0.13666929304599762, "learning_rate": 1.6393726696903634e-06, "loss": 0.002, "num_input_tokens_seen": 48462304, "step": 71895 }, { "epoch": 1.756528961962231, "grad_norm": 52.5565185546875, "learning_rate": 1.6393070976756027e-06, "loss": 0.1411, "num_input_tokens_seen": 48465888, "step": 71900 }, { "epoch": 1.7566511127940783, "grad_norm": 1.7487032413482666, "learning_rate": 1.639241521011629e-06, "loss": 0.0123, "num_input_tokens_seen": 48469152, "step": 71905 }, { "epoch": 1.7567732636259255, "grad_norm": 5.133216857910156, "learning_rate": 1.6391759396989188e-06, "loss": 0.0587, "num_input_tokens_seen": 48472480, "step": 71910 }, { "epoch": 1.7568954144577724, "grad_norm": 0.04294794425368309, "learning_rate": 1.6391103537379496e-06, "loss": 0.0357, "num_input_tokens_seen": 48476128, "step": 71915 }, { "epoch": 1.7570175652896196, "grad_norm": 45.32661056518555, "learning_rate": 1.639044763129198e-06, "loss": 0.3049, "num_input_tokens_seen": 48479392, "step": 71920 }, { "epoch": 1.7571397161214668, "grad_norm": 0.3233184516429901, "learning_rate": 1.638979167873141e-06, "loss": 0.0465, "num_input_tokens_seen": 48482976, "step": 71925 }, { "epoch": 1.757261866953314, "grad_norm": 0.39100873470306396, "learning_rate": 1.6389135679702554e-06, "loss": 0.0277, "num_input_tokens_seen": 48486048, "step": 71930 }, { "epoch": 1.757384017785161, "grad_norm": 12.223175048828125, "learning_rate": 1.6388479634210187e-06, "loss": 0.0618, "num_input_tokens_seen": 48489120, "step": 71935 }, { "epoch": 1.7575061686170081, "grad_norm": 0.32122930884361267, "learning_rate": 1.6387823542259075e-06, "loss": 0.0703, "num_input_tokens_seen": 48492448, "step": 71940 }, { "epoch": 1.7576283194488553, "grad_norm": 0.18866363167762756, "learning_rate": 1.6387167403853994e-06, "loss": 0.0019, "num_input_tokens_seen": 48496096, "step": 71945 }, { "epoch": 1.7577504702807025, "grad_norm": 0.5957804918289185, "learning_rate": 1.6386511218999714e-06, "loss": 0.1197, "num_input_tokens_seen": 48499680, "step": 71950 }, { "epoch": 1.7578726211125497, "grad_norm": 11.999557495117188, "learning_rate": 1.6385854987701007e-06, "loss": 0.2407, "num_input_tokens_seen": 48503584, "step": 71955 }, { "epoch": 1.757994771944397, "grad_norm": 0.06358401477336884, "learning_rate": 1.6385198709962642e-06, "loss": 0.0942, "num_input_tokens_seen": 48506912, "step": 71960 }, { "epoch": 1.758116922776244, "grad_norm": 45.37395095825195, "learning_rate": 1.6384542385789397e-06, "loss": 0.2849, "num_input_tokens_seen": 48509856, "step": 71965 }, { "epoch": 1.7582390736080913, "grad_norm": 0.21417827904224396, "learning_rate": 1.638388601518604e-06, "loss": 0.0458, "num_input_tokens_seen": 48513120, "step": 71970 }, { "epoch": 1.7583612244399385, "grad_norm": 0.17939996719360352, "learning_rate": 1.6383229598157353e-06, "loss": 0.0368, "num_input_tokens_seen": 48516320, "step": 71975 }, { "epoch": 1.7584833752717857, "grad_norm": 0.11681331694126129, "learning_rate": 1.63825731347081e-06, "loss": 0.0925, "num_input_tokens_seen": 48520032, "step": 71980 }, { "epoch": 1.7586055261036329, "grad_norm": 17.2757511138916, "learning_rate": 1.6381916624843058e-06, "loss": 0.1507, "num_input_tokens_seen": 48523168, "step": 71985 }, { "epoch": 1.75872767693548, "grad_norm": 0.13226760923862457, "learning_rate": 1.6381260068567e-06, "loss": 0.0392, "num_input_tokens_seen": 48526560, "step": 71990 }, { "epoch": 1.7588498277673272, "grad_norm": 0.06085921451449394, "learning_rate": 1.6380603465884706e-06, "loss": 0.0382, "num_input_tokens_seen": 48529632, "step": 71995 }, { "epoch": 1.7589719785991744, "grad_norm": 0.6363232135772705, "learning_rate": 1.6379946816800945e-06, "loss": 0.1274, "num_input_tokens_seen": 48533216, "step": 72000 }, { "epoch": 1.7590941294310214, "grad_norm": 15.696700096130371, "learning_rate": 1.6379290121320495e-06, "loss": 0.1549, "num_input_tokens_seen": 48536224, "step": 72005 }, { "epoch": 1.7592162802628686, "grad_norm": 0.08707629889249802, "learning_rate": 1.6378633379448133e-06, "loss": 0.1014, "num_input_tokens_seen": 48539552, "step": 72010 }, { "epoch": 1.7593384310947158, "grad_norm": 0.519372284412384, "learning_rate": 1.637797659118863e-06, "loss": 0.1593, "num_input_tokens_seen": 48542880, "step": 72015 }, { "epoch": 1.759460581926563, "grad_norm": 5.493022918701172, "learning_rate": 1.6377319756546771e-06, "loss": 0.0546, "num_input_tokens_seen": 48546016, "step": 72020 }, { "epoch": 1.75958273275841, "grad_norm": 0.174288809299469, "learning_rate": 1.637666287552732e-06, "loss": 0.0447, "num_input_tokens_seen": 48549408, "step": 72025 }, { "epoch": 1.7597048835902571, "grad_norm": 0.8021009564399719, "learning_rate": 1.6376005948135068e-06, "loss": 0.0023, "num_input_tokens_seen": 48552672, "step": 72030 }, { "epoch": 1.7598270344221043, "grad_norm": 0.18382374942302704, "learning_rate": 1.6375348974374784e-06, "loss": 0.04, "num_input_tokens_seen": 48556384, "step": 72035 }, { "epoch": 1.7599491852539515, "grad_norm": 36.59886932373047, "learning_rate": 1.6374691954251247e-06, "loss": 0.1235, "num_input_tokens_seen": 48559392, "step": 72040 }, { "epoch": 1.7600713360857987, "grad_norm": 0.23298950493335724, "learning_rate": 1.6374034887769238e-06, "loss": 0.0937, "num_input_tokens_seen": 48562592, "step": 72045 }, { "epoch": 1.7601934869176459, "grad_norm": 0.3184853196144104, "learning_rate": 1.6373377774933528e-06, "loss": 0.0362, "num_input_tokens_seen": 48565536, "step": 72050 }, { "epoch": 1.760315637749493, "grad_norm": 0.3271631896495819, "learning_rate": 1.6372720615748903e-06, "loss": 0.089, "num_input_tokens_seen": 48569120, "step": 72055 }, { "epoch": 1.7604377885813403, "grad_norm": 0.6756600141525269, "learning_rate": 1.637206341022014e-06, "loss": 0.0921, "num_input_tokens_seen": 48572384, "step": 72060 }, { "epoch": 1.7605599394131874, "grad_norm": 0.039529088884592056, "learning_rate": 1.6371406158352016e-06, "loss": 0.0427, "num_input_tokens_seen": 48576416, "step": 72065 }, { "epoch": 1.7606820902450346, "grad_norm": 40.706382751464844, "learning_rate": 1.6370748860149316e-06, "loss": 0.1243, "num_input_tokens_seen": 48579232, "step": 72070 }, { "epoch": 1.7608042410768818, "grad_norm": 0.6725810170173645, "learning_rate": 1.6370091515616817e-06, "loss": 0.0553, "num_input_tokens_seen": 48582432, "step": 72075 }, { "epoch": 1.760926391908729, "grad_norm": 13.003739356994629, "learning_rate": 1.63694341247593e-06, "loss": 0.1632, "num_input_tokens_seen": 48586016, "step": 72080 }, { "epoch": 1.7610485427405762, "grad_norm": 12.829425811767578, "learning_rate": 1.6368776687581538e-06, "loss": 0.1723, "num_input_tokens_seen": 48589344, "step": 72085 }, { "epoch": 1.7611706935724232, "grad_norm": 1.4721359014511108, "learning_rate": 1.6368119204088323e-06, "loss": 0.1579, "num_input_tokens_seen": 48592672, "step": 72090 }, { "epoch": 1.7612928444042704, "grad_norm": 0.20425674319267273, "learning_rate": 1.6367461674284432e-06, "loss": 0.0904, "num_input_tokens_seen": 48595552, "step": 72095 }, { "epoch": 1.7614149952361176, "grad_norm": 8.486249923706055, "learning_rate": 1.6366804098174648e-06, "loss": 0.1016, "num_input_tokens_seen": 48598688, "step": 72100 }, { "epoch": 1.7615371460679647, "grad_norm": 0.8847968578338623, "learning_rate": 1.6366146475763754e-06, "loss": 0.0079, "num_input_tokens_seen": 48601824, "step": 72105 }, { "epoch": 1.761659296899812, "grad_norm": 10.223726272583008, "learning_rate": 1.6365488807056528e-06, "loss": 0.0842, "num_input_tokens_seen": 48604896, "step": 72110 }, { "epoch": 1.761781447731659, "grad_norm": 11.736681938171387, "learning_rate": 1.6364831092057752e-06, "loss": 0.0733, "num_input_tokens_seen": 48609248, "step": 72115 }, { "epoch": 1.761903598563506, "grad_norm": 0.1761174350976944, "learning_rate": 1.6364173330772217e-06, "loss": 0.0018, "num_input_tokens_seen": 48612256, "step": 72120 }, { "epoch": 1.7620257493953533, "grad_norm": 0.1778940111398697, "learning_rate": 1.63635155232047e-06, "loss": 0.0627, "num_input_tokens_seen": 48615584, "step": 72125 }, { "epoch": 1.7621479002272005, "grad_norm": 0.12312234938144684, "learning_rate": 1.636285766935999e-06, "loss": 0.0468, "num_input_tokens_seen": 48618976, "step": 72130 }, { "epoch": 1.7622700510590477, "grad_norm": 267.4752197265625, "learning_rate": 1.6362199769242863e-06, "loss": 0.0157, "num_input_tokens_seen": 48622816, "step": 72135 }, { "epoch": 1.7623922018908948, "grad_norm": 0.9525810480117798, "learning_rate": 1.636154182285811e-06, "loss": 0.0765, "num_input_tokens_seen": 48625760, "step": 72140 }, { "epoch": 1.762514352722742, "grad_norm": 14.541299819946289, "learning_rate": 1.6360883830210515e-06, "loss": 0.1066, "num_input_tokens_seen": 48628832, "step": 72145 }, { "epoch": 1.7626365035545892, "grad_norm": 11.16724967956543, "learning_rate": 1.636022579130486e-06, "loss": 0.1817, "num_input_tokens_seen": 48631968, "step": 72150 }, { "epoch": 1.7627586543864364, "grad_norm": 19.138643264770508, "learning_rate": 1.6359567706145931e-06, "loss": 0.1552, "num_input_tokens_seen": 48635424, "step": 72155 }, { "epoch": 1.7628808052182836, "grad_norm": 0.2547426223754883, "learning_rate": 1.635890957473852e-06, "loss": 0.0411, "num_input_tokens_seen": 48638560, "step": 72160 }, { "epoch": 1.7630029560501308, "grad_norm": 0.5138275027275085, "learning_rate": 1.6358251397087405e-06, "loss": 0.0705, "num_input_tokens_seen": 48642208, "step": 72165 }, { "epoch": 1.763125106881978, "grad_norm": 11.29153060913086, "learning_rate": 1.6357593173197378e-06, "loss": 0.0825, "num_input_tokens_seen": 48645664, "step": 72170 }, { "epoch": 1.7632472577138252, "grad_norm": 11.48965835571289, "learning_rate": 1.6356934903073221e-06, "loss": 0.1601, "num_input_tokens_seen": 48649056, "step": 72175 }, { "epoch": 1.7633694085456721, "grad_norm": 22.086517333984375, "learning_rate": 1.6356276586719722e-06, "loss": 0.1951, "num_input_tokens_seen": 48652320, "step": 72180 }, { "epoch": 1.7634915593775193, "grad_norm": 0.4593576490879059, "learning_rate": 1.6355618224141672e-06, "loss": 0.0517, "num_input_tokens_seen": 48655712, "step": 72185 }, { "epoch": 1.7636137102093665, "grad_norm": 0.20573964715003967, "learning_rate": 1.6354959815343859e-06, "loss": 0.1116, "num_input_tokens_seen": 48659808, "step": 72190 }, { "epoch": 1.7637358610412137, "grad_norm": 16.481679916381836, "learning_rate": 1.6354301360331064e-06, "loss": 0.0629, "num_input_tokens_seen": 48663456, "step": 72195 }, { "epoch": 1.763858011873061, "grad_norm": 0.2852122485637665, "learning_rate": 1.6353642859108084e-06, "loss": 0.058, "num_input_tokens_seen": 48666272, "step": 72200 }, { "epoch": 1.7639801627049079, "grad_norm": 22.044973373413086, "learning_rate": 1.6352984311679704e-06, "loss": 0.161, "num_input_tokens_seen": 48669536, "step": 72205 }, { "epoch": 1.764102313536755, "grad_norm": 0.2694436311721802, "learning_rate": 1.6352325718050713e-06, "loss": 0.0984, "num_input_tokens_seen": 48672992, "step": 72210 }, { "epoch": 1.7642244643686023, "grad_norm": 1.510804533958435, "learning_rate": 1.6351667078225902e-06, "loss": 0.0499, "num_input_tokens_seen": 48676448, "step": 72215 }, { "epoch": 1.7643466152004494, "grad_norm": 0.21066798269748688, "learning_rate": 1.6351008392210055e-06, "loss": 0.0657, "num_input_tokens_seen": 48679776, "step": 72220 }, { "epoch": 1.7644687660322966, "grad_norm": 0.34505897760391235, "learning_rate": 1.635034966000797e-06, "loss": 0.0016, "num_input_tokens_seen": 48683360, "step": 72225 }, { "epoch": 1.7645909168641438, "grad_norm": 0.53311687707901, "learning_rate": 1.6349690881624437e-06, "loss": 0.0697, "num_input_tokens_seen": 48686496, "step": 72230 }, { "epoch": 1.764713067695991, "grad_norm": 0.930202305316925, "learning_rate": 1.634903205706424e-06, "loss": 0.0867, "num_input_tokens_seen": 48689952, "step": 72235 }, { "epoch": 1.7648352185278382, "grad_norm": 0.12176904827356339, "learning_rate": 1.6348373186332175e-06, "loss": 0.053, "num_input_tokens_seen": 48693472, "step": 72240 }, { "epoch": 1.7649573693596854, "grad_norm": 0.03570260852575302, "learning_rate": 1.6347714269433032e-06, "loss": 0.0437, "num_input_tokens_seen": 48697120, "step": 72245 }, { "epoch": 1.7650795201915326, "grad_norm": 19.815536499023438, "learning_rate": 1.6347055306371606e-06, "loss": 0.0846, "num_input_tokens_seen": 48700256, "step": 72250 }, { "epoch": 1.7652016710233798, "grad_norm": 73.4245834350586, "learning_rate": 1.6346396297152688e-06, "loss": 0.1, "num_input_tokens_seen": 48704160, "step": 72255 }, { "epoch": 1.765323821855227, "grad_norm": 0.6247529983520508, "learning_rate": 1.6345737241781064e-06, "loss": 0.0857, "num_input_tokens_seen": 48707936, "step": 72260 }, { "epoch": 1.7654459726870741, "grad_norm": 0.030399378389120102, "learning_rate": 1.6345078140261536e-06, "loss": 0.0014, "num_input_tokens_seen": 48711392, "step": 72265 }, { "epoch": 1.7655681235189211, "grad_norm": 40.35014724731445, "learning_rate": 1.634441899259889e-06, "loss": 0.1794, "num_input_tokens_seen": 48714848, "step": 72270 }, { "epoch": 1.7656902743507683, "grad_norm": 9.394624710083008, "learning_rate": 1.6343759798797926e-06, "loss": 0.0416, "num_input_tokens_seen": 48718048, "step": 72275 }, { "epoch": 1.7658124251826155, "grad_norm": 0.10819534957408905, "learning_rate": 1.6343100558863432e-06, "loss": 0.0587, "num_input_tokens_seen": 48721312, "step": 72280 }, { "epoch": 1.7659345760144627, "grad_norm": 37.31603240966797, "learning_rate": 1.6342441272800205e-06, "loss": 0.164, "num_input_tokens_seen": 48724448, "step": 72285 }, { "epoch": 1.7660567268463099, "grad_norm": 69.5995101928711, "learning_rate": 1.634178194061304e-06, "loss": 0.0512, "num_input_tokens_seen": 48727712, "step": 72290 }, { "epoch": 1.7661788776781568, "grad_norm": 12.813841819763184, "learning_rate": 1.634112256230673e-06, "loss": 0.0474, "num_input_tokens_seen": 48730912, "step": 72295 }, { "epoch": 1.766301028510004, "grad_norm": 26.569725036621094, "learning_rate": 1.634046313788607e-06, "loss": 0.0715, "num_input_tokens_seen": 48734112, "step": 72300 }, { "epoch": 1.7664231793418512, "grad_norm": 11.442526817321777, "learning_rate": 1.633980366735586e-06, "loss": 0.0877, "num_input_tokens_seen": 48737760, "step": 72305 }, { "epoch": 1.7665453301736984, "grad_norm": 0.3668510615825653, "learning_rate": 1.6339144150720889e-06, "loss": 0.1014, "num_input_tokens_seen": 48740896, "step": 72310 }, { "epoch": 1.7666674810055456, "grad_norm": 0.25583159923553467, "learning_rate": 1.633848458798596e-06, "loss": 0.0031, "num_input_tokens_seen": 48744480, "step": 72315 }, { "epoch": 1.7667896318373928, "grad_norm": 9.13083553314209, "learning_rate": 1.6337824979155866e-06, "loss": 0.1545, "num_input_tokens_seen": 48747680, "step": 72320 }, { "epoch": 1.76691178266924, "grad_norm": 15.840726852416992, "learning_rate": 1.6337165324235402e-06, "loss": 0.1265, "num_input_tokens_seen": 48751200, "step": 72325 }, { "epoch": 1.7670339335010872, "grad_norm": 0.15436357259750366, "learning_rate": 1.6336505623229368e-06, "loss": 0.1546, "num_input_tokens_seen": 48754656, "step": 72330 }, { "epoch": 1.7671560843329344, "grad_norm": 0.25369733572006226, "learning_rate": 1.633584587614256e-06, "loss": 0.0522, "num_input_tokens_seen": 48758560, "step": 72335 }, { "epoch": 1.7672782351647816, "grad_norm": 77.1099853515625, "learning_rate": 1.6335186082979778e-06, "loss": 0.1567, "num_input_tokens_seen": 48761760, "step": 72340 }, { "epoch": 1.7674003859966287, "grad_norm": 11.019214630126953, "learning_rate": 1.6334526243745819e-06, "loss": 0.0445, "num_input_tokens_seen": 48765280, "step": 72345 }, { "epoch": 1.767522536828476, "grad_norm": 0.6401473879814148, "learning_rate": 1.633386635844548e-06, "loss": 0.0527, "num_input_tokens_seen": 48768544, "step": 72350 }, { "epoch": 1.7676446876603231, "grad_norm": 29.522113800048828, "learning_rate": 1.633320642708356e-06, "loss": 0.0989, "num_input_tokens_seen": 48772384, "step": 72355 }, { "epoch": 1.76776683849217, "grad_norm": 57.17478561401367, "learning_rate": 1.6332546449664865e-06, "loss": 0.1298, "num_input_tokens_seen": 48775520, "step": 72360 }, { "epoch": 1.7678889893240173, "grad_norm": 0.05358585715293884, "learning_rate": 1.6331886426194184e-06, "loss": 0.0787, "num_input_tokens_seen": 48779808, "step": 72365 }, { "epoch": 1.7680111401558645, "grad_norm": 0.36916717886924744, "learning_rate": 1.6331226356676324e-06, "loss": 0.081, "num_input_tokens_seen": 48783392, "step": 72370 }, { "epoch": 1.7681332909877117, "grad_norm": 8.119627952575684, "learning_rate": 1.633056624111608e-06, "loss": 0.135, "num_input_tokens_seen": 48786336, "step": 72375 }, { "epoch": 1.7682554418195586, "grad_norm": 0.031201016157865524, "learning_rate": 1.6329906079518262e-06, "loss": 0.0253, "num_input_tokens_seen": 48789600, "step": 72380 }, { "epoch": 1.7683775926514058, "grad_norm": 36.73933029174805, "learning_rate": 1.632924587188766e-06, "loss": 0.0988, "num_input_tokens_seen": 48792672, "step": 72385 }, { "epoch": 1.768499743483253, "grad_norm": 0.22203010320663452, "learning_rate": 1.6328585618229077e-06, "loss": 0.0365, "num_input_tokens_seen": 48796128, "step": 72390 }, { "epoch": 1.7686218943151002, "grad_norm": 0.7779269814491272, "learning_rate": 1.632792531854732e-06, "loss": 0.045, "num_input_tokens_seen": 48799520, "step": 72395 }, { "epoch": 1.7687440451469474, "grad_norm": 0.07156257331371307, "learning_rate": 1.632726497284719e-06, "loss": 0.0014, "num_input_tokens_seen": 48802464, "step": 72400 }, { "epoch": 1.7688661959787946, "grad_norm": 0.24317099153995514, "learning_rate": 1.6326604581133484e-06, "loss": 0.1229, "num_input_tokens_seen": 48805728, "step": 72405 }, { "epoch": 1.7689883468106418, "grad_norm": 0.3279658555984497, "learning_rate": 1.632594414341101e-06, "loss": 0.0568, "num_input_tokens_seen": 48808928, "step": 72410 }, { "epoch": 1.769110497642489, "grad_norm": 202.01376342773438, "learning_rate": 1.632528365968457e-06, "loss": 0.0917, "num_input_tokens_seen": 48812448, "step": 72415 }, { "epoch": 1.7692326484743361, "grad_norm": 13.737601280212402, "learning_rate": 1.6324623129958966e-06, "loss": 0.25, "num_input_tokens_seen": 48815968, "step": 72420 }, { "epoch": 1.7693547993061833, "grad_norm": 10.069300651550293, "learning_rate": 1.6323962554238997e-06, "loss": 0.1087, "num_input_tokens_seen": 48819168, "step": 72425 }, { "epoch": 1.7694769501380305, "grad_norm": 2.0507044792175293, "learning_rate": 1.6323301932529475e-06, "loss": 0.0829, "num_input_tokens_seen": 48823264, "step": 72430 }, { "epoch": 1.7695991009698777, "grad_norm": 0.16294556856155396, "learning_rate": 1.6322641264835198e-06, "loss": 0.058, "num_input_tokens_seen": 48826464, "step": 72435 }, { "epoch": 1.769721251801725, "grad_norm": 0.5161846280097961, "learning_rate": 1.6321980551160976e-06, "loss": 0.0264, "num_input_tokens_seen": 48830176, "step": 72440 }, { "epoch": 1.769843402633572, "grad_norm": 0.369511216878891, "learning_rate": 1.6321319791511607e-06, "loss": 0.0425, "num_input_tokens_seen": 48833760, "step": 72445 }, { "epoch": 1.769965553465419, "grad_norm": 19.229990005493164, "learning_rate": 1.6320658985891904e-06, "loss": 0.1903, "num_input_tokens_seen": 48837088, "step": 72450 }, { "epoch": 1.7700877042972663, "grad_norm": 0.26264458894729614, "learning_rate": 1.6319998134306668e-06, "loss": 0.0487, "num_input_tokens_seen": 48840544, "step": 72455 }, { "epoch": 1.7702098551291134, "grad_norm": 0.5501359105110168, "learning_rate": 1.6319337236760706e-06, "loss": 0.0658, "num_input_tokens_seen": 48844192, "step": 72460 }, { "epoch": 1.7703320059609606, "grad_norm": 107.52610778808594, "learning_rate": 1.6318676293258822e-06, "loss": 0.2244, "num_input_tokens_seen": 48847712, "step": 72465 }, { "epoch": 1.7704541567928076, "grad_norm": 0.16678033769130707, "learning_rate": 1.6318015303805827e-06, "loss": 0.1484, "num_input_tokens_seen": 48850912, "step": 72470 }, { "epoch": 1.7705763076246548, "grad_norm": 15.765122413635254, "learning_rate": 1.6317354268406524e-06, "loss": 0.1223, "num_input_tokens_seen": 48854112, "step": 72475 }, { "epoch": 1.770698458456502, "grad_norm": 0.19218267500400543, "learning_rate": 1.6316693187065723e-06, "loss": 0.0031, "num_input_tokens_seen": 48857440, "step": 72480 }, { "epoch": 1.7708206092883492, "grad_norm": 0.17282667756080627, "learning_rate": 1.6316032059788229e-06, "loss": 0.0671, "num_input_tokens_seen": 48861600, "step": 72485 }, { "epoch": 1.7709427601201964, "grad_norm": 0.11995959281921387, "learning_rate": 1.6315370886578848e-06, "loss": 0.036, "num_input_tokens_seen": 48864736, "step": 72490 }, { "epoch": 1.7710649109520435, "grad_norm": 10.746925354003906, "learning_rate": 1.6314709667442395e-06, "loss": 0.0519, "num_input_tokens_seen": 48867936, "step": 72495 }, { "epoch": 1.7711870617838907, "grad_norm": 0.24249105155467987, "learning_rate": 1.6314048402383675e-06, "loss": 0.002, "num_input_tokens_seen": 48871200, "step": 72500 }, { "epoch": 1.771309212615738, "grad_norm": 88.96400451660156, "learning_rate": 1.6313387091407496e-06, "loss": 0.0519, "num_input_tokens_seen": 48874272, "step": 72505 }, { "epoch": 1.7714313634475851, "grad_norm": 21.104406356811523, "learning_rate": 1.6312725734518668e-06, "loss": 0.0645, "num_input_tokens_seen": 48878176, "step": 72510 }, { "epoch": 1.7715535142794323, "grad_norm": 0.1095280647277832, "learning_rate": 1.6312064331722e-06, "loss": 0.1158, "num_input_tokens_seen": 48881760, "step": 72515 }, { "epoch": 1.7716756651112795, "grad_norm": 20.461257934570312, "learning_rate": 1.6311402883022302e-06, "loss": 0.0697, "num_input_tokens_seen": 48885472, "step": 72520 }, { "epoch": 1.7717978159431267, "grad_norm": 46.38597106933594, "learning_rate": 1.6310741388424388e-06, "loss": 0.0456, "num_input_tokens_seen": 48888800, "step": 72525 }, { "epoch": 1.7719199667749739, "grad_norm": 99.10078430175781, "learning_rate": 1.631007984793306e-06, "loss": 0.2052, "num_input_tokens_seen": 48892064, "step": 72530 }, { "epoch": 1.772042117606821, "grad_norm": 13.505167961120605, "learning_rate": 1.6309418261553139e-06, "loss": 0.0733, "num_input_tokens_seen": 48895904, "step": 72535 }, { "epoch": 1.772164268438668, "grad_norm": 0.2780132293701172, "learning_rate": 1.6308756629289429e-06, "loss": 0.1274, "num_input_tokens_seen": 48899296, "step": 72540 }, { "epoch": 1.7722864192705152, "grad_norm": 14.029288291931152, "learning_rate": 1.6308094951146742e-06, "loss": 0.1346, "num_input_tokens_seen": 48902816, "step": 72545 }, { "epoch": 1.7724085701023624, "grad_norm": 22.366783142089844, "learning_rate": 1.6307433227129895e-06, "loss": 0.0313, "num_input_tokens_seen": 48906400, "step": 72550 }, { "epoch": 1.7725307209342096, "grad_norm": 1.1991052627563477, "learning_rate": 1.6306771457243696e-06, "loss": 0.0037, "num_input_tokens_seen": 48909600, "step": 72555 }, { "epoch": 1.7726528717660566, "grad_norm": 0.1703871339559555, "learning_rate": 1.6306109641492958e-06, "loss": 0.1203, "num_input_tokens_seen": 48912672, "step": 72560 }, { "epoch": 1.7727750225979038, "grad_norm": 142.6514892578125, "learning_rate": 1.6305447779882497e-06, "loss": 0.0387, "num_input_tokens_seen": 48916000, "step": 72565 }, { "epoch": 1.772897173429751, "grad_norm": 15.24199104309082, "learning_rate": 1.6304785872417121e-06, "loss": 0.1363, "num_input_tokens_seen": 48918880, "step": 72570 }, { "epoch": 1.7730193242615981, "grad_norm": 1.303954839706421, "learning_rate": 1.630412391910165e-06, "loss": 0.0632, "num_input_tokens_seen": 48922272, "step": 72575 }, { "epoch": 1.7731414750934453, "grad_norm": 10.182337760925293, "learning_rate": 1.630346191994089e-06, "loss": 0.0572, "num_input_tokens_seen": 48925408, "step": 72580 }, { "epoch": 1.7732636259252925, "grad_norm": 15.994193077087402, "learning_rate": 1.630279987493966e-06, "loss": 0.167, "num_input_tokens_seen": 48928608, "step": 72585 }, { "epoch": 1.7733857767571397, "grad_norm": 11.406634330749512, "learning_rate": 1.6302137784102774e-06, "loss": 0.0463, "num_input_tokens_seen": 48934048, "step": 72590 }, { "epoch": 1.773507927588987, "grad_norm": 0.16068458557128906, "learning_rate": 1.630147564743505e-06, "loss": 0.0021, "num_input_tokens_seen": 48936800, "step": 72595 }, { "epoch": 1.773630078420834, "grad_norm": 0.2641652524471283, "learning_rate": 1.63008134649413e-06, "loss": 0.0102, "num_input_tokens_seen": 48940128, "step": 72600 }, { "epoch": 1.7737522292526813, "grad_norm": 42.87362289428711, "learning_rate": 1.6300151236626336e-06, "loss": 0.1173, "num_input_tokens_seen": 48943584, "step": 72605 }, { "epoch": 1.7738743800845285, "grad_norm": 24.8110408782959, "learning_rate": 1.629948896249498e-06, "loss": 0.1184, "num_input_tokens_seen": 48947040, "step": 72610 }, { "epoch": 1.7739965309163757, "grad_norm": 0.45504873991012573, "learning_rate": 1.6298826642552043e-06, "loss": 0.0622, "num_input_tokens_seen": 48950496, "step": 72615 }, { "epoch": 1.7741186817482228, "grad_norm": 16.71000862121582, "learning_rate": 1.629816427680235e-06, "loss": 0.1365, "num_input_tokens_seen": 48953824, "step": 72620 }, { "epoch": 1.7742408325800698, "grad_norm": 0.1281004250049591, "learning_rate": 1.6297501865250708e-06, "loss": 0.0999, "num_input_tokens_seen": 48957152, "step": 72625 }, { "epoch": 1.774362983411917, "grad_norm": 14.322222709655762, "learning_rate": 1.629683940790194e-06, "loss": 0.0882, "num_input_tokens_seen": 48960224, "step": 72630 }, { "epoch": 1.7744851342437642, "grad_norm": 0.14506135880947113, "learning_rate": 1.6296176904760866e-06, "loss": 0.0013, "num_input_tokens_seen": 48963232, "step": 72635 }, { "epoch": 1.7746072850756114, "grad_norm": 66.70890808105469, "learning_rate": 1.6295514355832296e-06, "loss": 0.1249, "num_input_tokens_seen": 48966496, "step": 72640 }, { "epoch": 1.7747294359074586, "grad_norm": 0.7553220391273499, "learning_rate": 1.629485176112105e-06, "loss": 0.1268, "num_input_tokens_seen": 48969760, "step": 72645 }, { "epoch": 1.7748515867393055, "grad_norm": 1.3736454248428345, "learning_rate": 1.6294189120631954e-06, "loss": 0.1543, "num_input_tokens_seen": 48972896, "step": 72650 }, { "epoch": 1.7749737375711527, "grad_norm": 0.14934030175209045, "learning_rate": 1.6293526434369818e-06, "loss": 0.0008, "num_input_tokens_seen": 48976288, "step": 72655 }, { "epoch": 1.775095888403, "grad_norm": 9.05644702911377, "learning_rate": 1.6292863702339466e-06, "loss": 0.1141, "num_input_tokens_seen": 48979680, "step": 72660 }, { "epoch": 1.7752180392348471, "grad_norm": 1.1264457702636719, "learning_rate": 1.6292200924545715e-06, "loss": 0.003, "num_input_tokens_seen": 48983136, "step": 72665 }, { "epoch": 1.7753401900666943, "grad_norm": 9.034127235412598, "learning_rate": 1.6291538100993391e-06, "loss": 0.1314, "num_input_tokens_seen": 48986144, "step": 72670 }, { "epoch": 1.7754623408985415, "grad_norm": 0.23916614055633545, "learning_rate": 1.6290875231687306e-06, "loss": 0.1202, "num_input_tokens_seen": 48990048, "step": 72675 }, { "epoch": 1.7755844917303887, "grad_norm": 1.9375858306884766, "learning_rate": 1.6290212316632285e-06, "loss": 0.038, "num_input_tokens_seen": 48993696, "step": 72680 }, { "epoch": 1.7757066425622359, "grad_norm": 0.254934161901474, "learning_rate": 1.628954935583315e-06, "loss": 0.0612, "num_input_tokens_seen": 48997088, "step": 72685 }, { "epoch": 1.775828793394083, "grad_norm": 0.21548773348331451, "learning_rate": 1.628888634929472e-06, "loss": 0.0965, "num_input_tokens_seen": 49000608, "step": 72690 }, { "epoch": 1.7759509442259303, "grad_norm": 30.685073852539062, "learning_rate": 1.6288223297021814e-06, "loss": 0.0807, "num_input_tokens_seen": 49004192, "step": 72695 }, { "epoch": 1.7760730950577774, "grad_norm": 11.643782615661621, "learning_rate": 1.628756019901926e-06, "loss": 0.0388, "num_input_tokens_seen": 49007200, "step": 72700 }, { "epoch": 1.7761952458896246, "grad_norm": 0.5212379693984985, "learning_rate": 1.6286897055291874e-06, "loss": 0.1639, "num_input_tokens_seen": 49010144, "step": 72705 }, { "epoch": 1.7763173967214718, "grad_norm": 17.569440841674805, "learning_rate": 1.6286233865844486e-06, "loss": 0.1141, "num_input_tokens_seen": 49013920, "step": 72710 }, { "epoch": 1.7764395475533188, "grad_norm": 0.24917493760585785, "learning_rate": 1.6285570630681914e-06, "loss": 0.0585, "num_input_tokens_seen": 49017120, "step": 72715 }, { "epoch": 1.776561698385166, "grad_norm": 61.48064041137695, "learning_rate": 1.6284907349808976e-06, "loss": 0.1279, "num_input_tokens_seen": 49020576, "step": 72720 }, { "epoch": 1.7766838492170132, "grad_norm": 0.10875872522592545, "learning_rate": 1.6284244023230507e-06, "loss": 0.032, "num_input_tokens_seen": 49023840, "step": 72725 }, { "epoch": 1.7768060000488604, "grad_norm": 0.7517804503440857, "learning_rate": 1.6283580650951324e-06, "loss": 0.0013, "num_input_tokens_seen": 49027424, "step": 72730 }, { "epoch": 1.7769281508807075, "grad_norm": 0.1774255931377411, "learning_rate": 1.6282917232976252e-06, "loss": 0.0744, "num_input_tokens_seen": 49030432, "step": 72735 }, { "epoch": 1.7770503017125545, "grad_norm": 0.1688397228717804, "learning_rate": 1.6282253769310115e-06, "loss": 0.0605, "num_input_tokens_seen": 49034080, "step": 72740 }, { "epoch": 1.7771724525444017, "grad_norm": 0.04853454604744911, "learning_rate": 1.628159025995774e-06, "loss": 0.0023, "num_input_tokens_seen": 49037600, "step": 72745 }, { "epoch": 1.777294603376249, "grad_norm": 0.39164552092552185, "learning_rate": 1.6280926704923949e-06, "loss": 0.014, "num_input_tokens_seen": 49040800, "step": 72750 }, { "epoch": 1.777416754208096, "grad_norm": 13.104290008544922, "learning_rate": 1.6280263104213572e-06, "loss": 0.1248, "num_input_tokens_seen": 49044064, "step": 72755 }, { "epoch": 1.7775389050399433, "grad_norm": 0.2612842321395874, "learning_rate": 1.6279599457831431e-06, "loss": 0.0014, "num_input_tokens_seen": 49047200, "step": 72760 }, { "epoch": 1.7776610558717905, "grad_norm": 24.87212562561035, "learning_rate": 1.6278935765782356e-06, "loss": 0.1021, "num_input_tokens_seen": 49050528, "step": 72765 }, { "epoch": 1.7777832067036377, "grad_norm": 0.3829386830329895, "learning_rate": 1.6278272028071168e-06, "loss": 0.0831, "num_input_tokens_seen": 49054112, "step": 72770 }, { "epoch": 1.7779053575354848, "grad_norm": 57.15519332885742, "learning_rate": 1.62776082447027e-06, "loss": 0.1695, "num_input_tokens_seen": 49057184, "step": 72775 }, { "epoch": 1.778027508367332, "grad_norm": 0.10678430646657944, "learning_rate": 1.6276944415681776e-06, "loss": 0.0281, "num_input_tokens_seen": 49060896, "step": 72780 }, { "epoch": 1.7781496591991792, "grad_norm": 20.0386962890625, "learning_rate": 1.6276280541013223e-06, "loss": 0.1584, "num_input_tokens_seen": 49064544, "step": 72785 }, { "epoch": 1.7782718100310264, "grad_norm": 0.7888308763504028, "learning_rate": 1.627561662070187e-06, "loss": 0.1369, "num_input_tokens_seen": 49068320, "step": 72790 }, { "epoch": 1.7783939608628736, "grad_norm": 0.6277143955230713, "learning_rate": 1.6274952654752547e-06, "loss": 0.0878, "num_input_tokens_seen": 49071712, "step": 72795 }, { "epoch": 1.7785161116947208, "grad_norm": 0.11778520047664642, "learning_rate": 1.6274288643170078e-06, "loss": 0.0296, "num_input_tokens_seen": 49075168, "step": 72800 }, { "epoch": 1.7786382625265678, "grad_norm": 29.370920181274414, "learning_rate": 1.6273624585959295e-06, "loss": 0.0897, "num_input_tokens_seen": 49078816, "step": 72805 }, { "epoch": 1.778760413358415, "grad_norm": 225.03904724121094, "learning_rate": 1.6272960483125026e-06, "loss": 0.11, "num_input_tokens_seen": 49082272, "step": 72810 }, { "epoch": 1.7788825641902621, "grad_norm": 10.878161430358887, "learning_rate": 1.6272296334672101e-06, "loss": 0.1619, "num_input_tokens_seen": 49085664, "step": 72815 }, { "epoch": 1.7790047150221093, "grad_norm": 0.03372422978281975, "learning_rate": 1.6271632140605351e-06, "loss": 0.1086, "num_input_tokens_seen": 49089376, "step": 72820 }, { "epoch": 1.7791268658539565, "grad_norm": 0.4881143569946289, "learning_rate": 1.6270967900929607e-06, "loss": 0.1185, "num_input_tokens_seen": 49092640, "step": 72825 }, { "epoch": 1.7792490166858035, "grad_norm": 10.149094581604004, "learning_rate": 1.6270303615649695e-06, "loss": 0.1189, "num_input_tokens_seen": 49095968, "step": 72830 }, { "epoch": 1.7793711675176507, "grad_norm": 6.329721450805664, "learning_rate": 1.6269639284770448e-06, "loss": 0.0279, "num_input_tokens_seen": 49099360, "step": 72835 }, { "epoch": 1.7794933183494979, "grad_norm": 0.6420906186103821, "learning_rate": 1.62689749082967e-06, "loss": 0.0224, "num_input_tokens_seen": 49102560, "step": 72840 }, { "epoch": 1.779615469181345, "grad_norm": 18.043487548828125, "learning_rate": 1.6268310486233282e-06, "loss": 0.2352, "num_input_tokens_seen": 49105824, "step": 72845 }, { "epoch": 1.7797376200131922, "grad_norm": 23.664506912231445, "learning_rate": 1.626764601858502e-06, "loss": 0.0645, "num_input_tokens_seen": 49108896, "step": 72850 }, { "epoch": 1.7798597708450394, "grad_norm": 0.12446639686822891, "learning_rate": 1.6266981505356752e-06, "loss": 0.0024, "num_input_tokens_seen": 49112352, "step": 72855 }, { "epoch": 1.7799819216768866, "grad_norm": 19.845417022705078, "learning_rate": 1.626631694655331e-06, "loss": 0.128, "num_input_tokens_seen": 49115744, "step": 72860 }, { "epoch": 1.7801040725087338, "grad_norm": 0.07817646861076355, "learning_rate": 1.6265652342179523e-06, "loss": 0.0014, "num_input_tokens_seen": 49119264, "step": 72865 }, { "epoch": 1.780226223340581, "grad_norm": 0.2219124287366867, "learning_rate": 1.626498769224023e-06, "loss": 0.0843, "num_input_tokens_seen": 49122592, "step": 72870 }, { "epoch": 1.7803483741724282, "grad_norm": 0.1201687902212143, "learning_rate": 1.6264322996740258e-06, "loss": 0.0023, "num_input_tokens_seen": 49126176, "step": 72875 }, { "epoch": 1.7804705250042754, "grad_norm": 0.20573893189430237, "learning_rate": 1.6263658255684447e-06, "loss": 0.0575, "num_input_tokens_seen": 49129504, "step": 72880 }, { "epoch": 1.7805926758361226, "grad_norm": 0.45061349868774414, "learning_rate": 1.6262993469077628e-06, "loss": 0.001, "num_input_tokens_seen": 49132576, "step": 72885 }, { "epoch": 1.7807148266679698, "grad_norm": 0.11227881163358688, "learning_rate": 1.6262328636924635e-06, "loss": 0.143, "num_input_tokens_seen": 49136032, "step": 72890 }, { "epoch": 1.7808369774998167, "grad_norm": 0.24952024221420288, "learning_rate": 1.6261663759230303e-06, "loss": 0.0331, "num_input_tokens_seen": 49139168, "step": 72895 }, { "epoch": 1.780959128331664, "grad_norm": 0.6618596911430359, "learning_rate": 1.6260998835999472e-06, "loss": 0.0545, "num_input_tokens_seen": 49142176, "step": 72900 }, { "epoch": 1.781081279163511, "grad_norm": 10.293169975280762, "learning_rate": 1.626033386723697e-06, "loss": 0.1411, "num_input_tokens_seen": 49145504, "step": 72905 }, { "epoch": 1.7812034299953583, "grad_norm": 0.8537778258323669, "learning_rate": 1.6259668852947637e-06, "loss": 0.0998, "num_input_tokens_seen": 49149024, "step": 72910 }, { "epoch": 1.7813255808272053, "grad_norm": 19.98094367980957, "learning_rate": 1.6259003793136309e-06, "loss": 0.1419, "num_input_tokens_seen": 49152416, "step": 72915 }, { "epoch": 1.7814477316590525, "grad_norm": 0.12676870822906494, "learning_rate": 1.625833868780782e-06, "loss": 0.0397, "num_input_tokens_seen": 49155744, "step": 72920 }, { "epoch": 1.7815698824908996, "grad_norm": 198.677490234375, "learning_rate": 1.625767353696701e-06, "loss": 0.0564, "num_input_tokens_seen": 49159776, "step": 72925 }, { "epoch": 1.7816920333227468, "grad_norm": 0.27948421239852905, "learning_rate": 1.6257008340618715e-06, "loss": 0.001, "num_input_tokens_seen": 49163104, "step": 72930 }, { "epoch": 1.781814184154594, "grad_norm": 0.081379234790802, "learning_rate": 1.6256343098767773e-06, "loss": 0.1116, "num_input_tokens_seen": 49166304, "step": 72935 }, { "epoch": 1.7819363349864412, "grad_norm": 0.48774203658103943, "learning_rate": 1.6255677811419022e-06, "loss": 0.0013, "num_input_tokens_seen": 49169312, "step": 72940 }, { "epoch": 1.7820584858182884, "grad_norm": 1.4804660081863403, "learning_rate": 1.6255012478577296e-06, "loss": 0.1343, "num_input_tokens_seen": 49173408, "step": 72945 }, { "epoch": 1.7821806366501356, "grad_norm": 161.1129608154297, "learning_rate": 1.625434710024744e-06, "loss": 0.1763, "num_input_tokens_seen": 49177120, "step": 72950 }, { "epoch": 1.7823027874819828, "grad_norm": 14.122322082519531, "learning_rate": 1.6253681676434289e-06, "loss": 0.0555, "num_input_tokens_seen": 49181024, "step": 72955 }, { "epoch": 1.78242493831383, "grad_norm": 15.058162689208984, "learning_rate": 1.6253016207142682e-06, "loss": 0.1329, "num_input_tokens_seen": 49184096, "step": 72960 }, { "epoch": 1.7825470891456772, "grad_norm": 10.185879707336426, "learning_rate": 1.625235069237746e-06, "loss": 0.2357, "num_input_tokens_seen": 49187488, "step": 72965 }, { "epoch": 1.7826692399775244, "grad_norm": 0.4589506685733795, "learning_rate": 1.6251685132143463e-06, "loss": 0.0376, "num_input_tokens_seen": 49190624, "step": 72970 }, { "epoch": 1.7827913908093715, "grad_norm": 3.827714204788208, "learning_rate": 1.625101952644553e-06, "loss": 0.0964, "num_input_tokens_seen": 49193568, "step": 72975 }, { "epoch": 1.7829135416412187, "grad_norm": 0.05556550249457359, "learning_rate": 1.6250353875288501e-06, "loss": 0.0367, "num_input_tokens_seen": 49196832, "step": 72980 }, { "epoch": 1.7830356924730657, "grad_norm": 0.19241148233413696, "learning_rate": 1.6249688178677215e-06, "loss": 0.1076, "num_input_tokens_seen": 49199840, "step": 72985 }, { "epoch": 1.783157843304913, "grad_norm": 105.45845031738281, "learning_rate": 1.6249022436616518e-06, "loss": 0.1231, "num_input_tokens_seen": 49203808, "step": 72990 }, { "epoch": 1.78327999413676, "grad_norm": 0.3520190119743347, "learning_rate": 1.624835664911125e-06, "loss": 0.1205, "num_input_tokens_seen": 49207136, "step": 72995 }, { "epoch": 1.7834021449686073, "grad_norm": 0.20569193363189697, "learning_rate": 1.624769081616625e-06, "loss": 0.0732, "num_input_tokens_seen": 49210784, "step": 73000 }, { "epoch": 1.7835242958004542, "grad_norm": 9.263427734375, "learning_rate": 1.6247024937786364e-06, "loss": 0.0901, "num_input_tokens_seen": 49214368, "step": 73005 }, { "epoch": 1.7836464466323014, "grad_norm": 7.391837120056152, "learning_rate": 1.6246359013976432e-06, "loss": 0.108, "num_input_tokens_seen": 49218144, "step": 73010 }, { "epoch": 1.7837685974641486, "grad_norm": 0.1080571860074997, "learning_rate": 1.6245693044741296e-06, "loss": 0.0013, "num_input_tokens_seen": 49221600, "step": 73015 }, { "epoch": 1.7838907482959958, "grad_norm": 8.889108657836914, "learning_rate": 1.6245027030085798e-06, "loss": 0.0907, "num_input_tokens_seen": 49225120, "step": 73020 }, { "epoch": 1.784012899127843, "grad_norm": 0.280179500579834, "learning_rate": 1.624436097001479e-06, "loss": 0.0778, "num_input_tokens_seen": 49228512, "step": 73025 }, { "epoch": 1.7841350499596902, "grad_norm": 0.154941126704216, "learning_rate": 1.6243694864533103e-06, "loss": 0.106, "num_input_tokens_seen": 49231904, "step": 73030 }, { "epoch": 1.7842572007915374, "grad_norm": 12.580093383789062, "learning_rate": 1.6243028713645592e-06, "loss": 0.0837, "num_input_tokens_seen": 49235360, "step": 73035 }, { "epoch": 1.7843793516233846, "grad_norm": 16.738759994506836, "learning_rate": 1.6242362517357095e-06, "loss": 0.0717, "num_input_tokens_seen": 49238304, "step": 73040 }, { "epoch": 1.7845015024552318, "grad_norm": 0.8823060393333435, "learning_rate": 1.6241696275672458e-06, "loss": 0.0952, "num_input_tokens_seen": 49241312, "step": 73045 }, { "epoch": 1.784623653287079, "grad_norm": 0.06604723632335663, "learning_rate": 1.6241029988596528e-06, "loss": 0.0681, "num_input_tokens_seen": 49244384, "step": 73050 }, { "epoch": 1.7847458041189261, "grad_norm": 16.02294921875, "learning_rate": 1.624036365613415e-06, "loss": 0.0327, "num_input_tokens_seen": 49248160, "step": 73055 }, { "epoch": 1.7848679549507733, "grad_norm": 0.2083832174539566, "learning_rate": 1.623969727829017e-06, "loss": 0.1776, "num_input_tokens_seen": 49251360, "step": 73060 }, { "epoch": 1.7849901057826205, "grad_norm": 1.2275177240371704, "learning_rate": 1.6239030855069432e-06, "loss": 0.0851, "num_input_tokens_seen": 49254816, "step": 73065 }, { "epoch": 1.7851122566144677, "grad_norm": 11.574962615966797, "learning_rate": 1.6238364386476783e-06, "loss": 0.0367, "num_input_tokens_seen": 49258144, "step": 73070 }, { "epoch": 1.7852344074463147, "grad_norm": 21.744110107421875, "learning_rate": 1.623769787251707e-06, "loss": 0.1875, "num_input_tokens_seen": 49261408, "step": 73075 }, { "epoch": 1.7853565582781619, "grad_norm": 57.97651290893555, "learning_rate": 1.623703131319514e-06, "loss": 0.1415, "num_input_tokens_seen": 49264416, "step": 73080 }, { "epoch": 1.785478709110009, "grad_norm": 0.6353474855422974, "learning_rate": 1.6236364708515842e-06, "loss": 0.0578, "num_input_tokens_seen": 49267616, "step": 73085 }, { "epoch": 1.7856008599418562, "grad_norm": 13.432452201843262, "learning_rate": 1.623569805848402e-06, "loss": 0.0612, "num_input_tokens_seen": 49271008, "step": 73090 }, { "epoch": 1.7857230107737032, "grad_norm": 16.59269905090332, "learning_rate": 1.6235031363104528e-06, "loss": 0.1369, "num_input_tokens_seen": 49274272, "step": 73095 }, { "epoch": 1.7858451616055504, "grad_norm": 0.1436922550201416, "learning_rate": 1.623436462238221e-06, "loss": 0.0156, "num_input_tokens_seen": 49277536, "step": 73100 }, { "epoch": 1.7859673124373976, "grad_norm": 0.5524347424507141, "learning_rate": 1.6233697836321913e-06, "loss": 0.1298, "num_input_tokens_seen": 49280992, "step": 73105 }, { "epoch": 1.7860894632692448, "grad_norm": 1.849530816078186, "learning_rate": 1.623303100492849e-06, "loss": 0.071, "num_input_tokens_seen": 49284192, "step": 73110 }, { "epoch": 1.786211614101092, "grad_norm": 11.363874435424805, "learning_rate": 1.623236412820679e-06, "loss": 0.1023, "num_input_tokens_seen": 49287264, "step": 73115 }, { "epoch": 1.7863337649329392, "grad_norm": 0.2736768424510956, "learning_rate": 1.6231697206161661e-06, "loss": 0.11, "num_input_tokens_seen": 49290912, "step": 73120 }, { "epoch": 1.7864559157647864, "grad_norm": 0.707730770111084, "learning_rate": 1.6231030238797956e-06, "loss": 0.1004, "num_input_tokens_seen": 49294048, "step": 73125 }, { "epoch": 1.7865780665966335, "grad_norm": 0.5541930794715881, "learning_rate": 1.623036322612052e-06, "loss": 0.0276, "num_input_tokens_seen": 49297120, "step": 73130 }, { "epoch": 1.7867002174284807, "grad_norm": 0.1723814308643341, "learning_rate": 1.622969616813421e-06, "loss": 0.1215, "num_input_tokens_seen": 49300512, "step": 73135 }, { "epoch": 1.786822368260328, "grad_norm": 0.07303165644407272, "learning_rate": 1.6229029064843871e-06, "loss": 0.2217, "num_input_tokens_seen": 49303392, "step": 73140 }, { "epoch": 1.786944519092175, "grad_norm": 8.014287948608398, "learning_rate": 1.6228361916254358e-06, "loss": 0.0755, "num_input_tokens_seen": 49307232, "step": 73145 }, { "epoch": 1.7870666699240223, "grad_norm": 13.333250999450684, "learning_rate": 1.6227694722370525e-06, "loss": 0.003, "num_input_tokens_seen": 49310560, "step": 73150 }, { "epoch": 1.7871888207558695, "grad_norm": 18.574920654296875, "learning_rate": 1.6227027483197214e-06, "loss": 0.025, "num_input_tokens_seen": 49314272, "step": 73155 }, { "epoch": 1.7873109715877165, "grad_norm": 12.966170310974121, "learning_rate": 1.622636019873929e-06, "loss": 0.0363, "num_input_tokens_seen": 49317472, "step": 73160 }, { "epoch": 1.7874331224195636, "grad_norm": 11.653533935546875, "learning_rate": 1.62256928690016e-06, "loss": 0.1551, "num_input_tokens_seen": 49320544, "step": 73165 }, { "epoch": 1.7875552732514108, "grad_norm": 15.924564361572266, "learning_rate": 1.6225025493988995e-06, "loss": 0.0316, "num_input_tokens_seen": 49323744, "step": 73170 }, { "epoch": 1.787677424083258, "grad_norm": 224.7387237548828, "learning_rate": 1.6224358073706327e-06, "loss": 0.1097, "num_input_tokens_seen": 49327456, "step": 73175 }, { "epoch": 1.7877995749151052, "grad_norm": 35.19367599487305, "learning_rate": 1.622369060815846e-06, "loss": 0.124, "num_input_tokens_seen": 49331040, "step": 73180 }, { "epoch": 1.7879217257469522, "grad_norm": 0.002536064712330699, "learning_rate": 1.6223023097350238e-06, "loss": 0.0979, "num_input_tokens_seen": 49334816, "step": 73185 }, { "epoch": 1.7880438765787994, "grad_norm": 23.19019889831543, "learning_rate": 1.6222355541286517e-06, "loss": 0.1201, "num_input_tokens_seen": 49338016, "step": 73190 }, { "epoch": 1.7881660274106466, "grad_norm": 22.717191696166992, "learning_rate": 1.6221687939972154e-06, "loss": 0.1417, "num_input_tokens_seen": 49341152, "step": 73195 }, { "epoch": 1.7882881782424938, "grad_norm": 0.4923929274082184, "learning_rate": 1.6221020293412003e-06, "loss": 0.0307, "num_input_tokens_seen": 49344608, "step": 73200 }, { "epoch": 1.788410329074341, "grad_norm": 27.623798370361328, "learning_rate": 1.6220352601610916e-06, "loss": 0.0766, "num_input_tokens_seen": 49347936, "step": 73205 }, { "epoch": 1.7885324799061881, "grad_norm": 66.23554229736328, "learning_rate": 1.6219684864573755e-06, "loss": 0.0262, "num_input_tokens_seen": 49351264, "step": 73210 }, { "epoch": 1.7886546307380353, "grad_norm": 0.4153870940208435, "learning_rate": 1.6219017082305373e-06, "loss": 0.0033, "num_input_tokens_seen": 49354528, "step": 73215 }, { "epoch": 1.7887767815698825, "grad_norm": 92.46559143066406, "learning_rate": 1.6218349254810627e-06, "loss": 0.0614, "num_input_tokens_seen": 49357728, "step": 73220 }, { "epoch": 1.7888989324017297, "grad_norm": 8.169533729553223, "learning_rate": 1.621768138209437e-06, "loss": 0.0965, "num_input_tokens_seen": 49361312, "step": 73225 }, { "epoch": 1.789021083233577, "grad_norm": 0.24197158217430115, "learning_rate": 1.621701346416146e-06, "loss": 0.0449, "num_input_tokens_seen": 49365024, "step": 73230 }, { "epoch": 1.789143234065424, "grad_norm": 0.07424899935722351, "learning_rate": 1.621634550101676e-06, "loss": 0.054, "num_input_tokens_seen": 49368416, "step": 73235 }, { "epoch": 1.7892653848972713, "grad_norm": 0.09136585891246796, "learning_rate": 1.621567749266512e-06, "loss": 0.0255, "num_input_tokens_seen": 49371936, "step": 73240 }, { "epoch": 1.7893875357291185, "grad_norm": 0.4230661988258362, "learning_rate": 1.6215009439111404e-06, "loss": 0.0013, "num_input_tokens_seen": 49375392, "step": 73245 }, { "epoch": 1.7895096865609654, "grad_norm": 2.4775891304016113, "learning_rate": 1.621434134036047e-06, "loss": 0.0731, "num_input_tokens_seen": 49378592, "step": 73250 }, { "epoch": 1.7896318373928126, "grad_norm": 32.23170852661133, "learning_rate": 1.621367319641717e-06, "loss": 0.2222, "num_input_tokens_seen": 49381856, "step": 73255 }, { "epoch": 1.7897539882246598, "grad_norm": 0.35263219475746155, "learning_rate": 1.621300500728637e-06, "loss": 0.0916, "num_input_tokens_seen": 49385568, "step": 73260 }, { "epoch": 1.789876139056507, "grad_norm": 0.02084103785455227, "learning_rate": 1.6212336772972926e-06, "loss": 0.0386, "num_input_tokens_seen": 49389152, "step": 73265 }, { "epoch": 1.7899982898883542, "grad_norm": 0.027315644547343254, "learning_rate": 1.6211668493481697e-06, "loss": 0.0428, "num_input_tokens_seen": 49392096, "step": 73270 }, { "epoch": 1.7901204407202012, "grad_norm": 6.037578582763672, "learning_rate": 1.6211000168817544e-06, "loss": 0.0436, "num_input_tokens_seen": 49395680, "step": 73275 }, { "epoch": 1.7902425915520483, "grad_norm": 0.10098898410797119, "learning_rate": 1.6210331798985325e-06, "loss": 0.1954, "num_input_tokens_seen": 49398752, "step": 73280 }, { "epoch": 1.7903647423838955, "grad_norm": 0.04568028822541237, "learning_rate": 1.6209663383989907e-06, "loss": 0.0974, "num_input_tokens_seen": 49401952, "step": 73285 }, { "epoch": 1.7904868932157427, "grad_norm": 0.14879471063613892, "learning_rate": 1.6208994923836145e-06, "loss": 0.0597, "num_input_tokens_seen": 49405088, "step": 73290 }, { "epoch": 1.79060904404759, "grad_norm": 340.20709228515625, "learning_rate": 1.6208326418528903e-06, "loss": 0.0971, "num_input_tokens_seen": 49408352, "step": 73295 }, { "epoch": 1.790731194879437, "grad_norm": 32.714664459228516, "learning_rate": 1.6207657868073037e-06, "loss": 0.1478, "num_input_tokens_seen": 49411680, "step": 73300 }, { "epoch": 1.7908533457112843, "grad_norm": 0.08181426674127579, "learning_rate": 1.620698927247342e-06, "loss": 0.067, "num_input_tokens_seen": 49415072, "step": 73305 }, { "epoch": 1.7909754965431315, "grad_norm": 0.3073747158050537, "learning_rate": 1.6206320631734903e-06, "loss": 0.0026, "num_input_tokens_seen": 49418208, "step": 73310 }, { "epoch": 1.7910976473749787, "grad_norm": 20.518089294433594, "learning_rate": 1.6205651945862355e-06, "loss": 0.1575, "num_input_tokens_seen": 49421856, "step": 73315 }, { "epoch": 1.7912197982068259, "grad_norm": 0.11983784288167953, "learning_rate": 1.6204983214860634e-06, "loss": 0.1557, "num_input_tokens_seen": 49425248, "step": 73320 }, { "epoch": 1.791341949038673, "grad_norm": 44.16547393798828, "learning_rate": 1.620431443873461e-06, "loss": 0.0876, "num_input_tokens_seen": 49428256, "step": 73325 }, { "epoch": 1.7914640998705202, "grad_norm": 3.9383909702301025, "learning_rate": 1.620364561748914e-06, "loss": 0.0589, "num_input_tokens_seen": 49431200, "step": 73330 }, { "epoch": 1.7915862507023674, "grad_norm": 0.316801518201828, "learning_rate": 1.6202976751129092e-06, "loss": 0.0266, "num_input_tokens_seen": 49434528, "step": 73335 }, { "epoch": 1.7917084015342144, "grad_norm": 22.89853286743164, "learning_rate": 1.6202307839659328e-06, "loss": 0.1441, "num_input_tokens_seen": 49437728, "step": 73340 }, { "epoch": 1.7918305523660616, "grad_norm": 0.22337937355041504, "learning_rate": 1.6201638883084714e-06, "loss": 0.1626, "num_input_tokens_seen": 49440928, "step": 73345 }, { "epoch": 1.7919527031979088, "grad_norm": 10.1043119430542, "learning_rate": 1.6200969881410113e-06, "loss": 0.1022, "num_input_tokens_seen": 49443872, "step": 73350 }, { "epoch": 1.792074854029756, "grad_norm": 0.4906000792980194, "learning_rate": 1.620030083464039e-06, "loss": 0.0422, "num_input_tokens_seen": 49447136, "step": 73355 }, { "epoch": 1.7921970048616032, "grad_norm": 98.32612609863281, "learning_rate": 1.6199631742780415e-06, "loss": 0.1179, "num_input_tokens_seen": 49450464, "step": 73360 }, { "epoch": 1.7923191556934501, "grad_norm": 0.3492704927921295, "learning_rate": 1.6198962605835046e-06, "loss": 0.0342, "num_input_tokens_seen": 49454368, "step": 73365 }, { "epoch": 1.7924413065252973, "grad_norm": 0.2637254595756531, "learning_rate": 1.6198293423809157e-06, "loss": 0.1297, "num_input_tokens_seen": 49457440, "step": 73370 }, { "epoch": 1.7925634573571445, "grad_norm": 0.23060853779315948, "learning_rate": 1.619762419670761e-06, "loss": 0.0668, "num_input_tokens_seen": 49460896, "step": 73375 }, { "epoch": 1.7926856081889917, "grad_norm": 13.903261184692383, "learning_rate": 1.6196954924535274e-06, "loss": 0.0754, "num_input_tokens_seen": 49464032, "step": 73380 }, { "epoch": 1.7928077590208389, "grad_norm": 9.217611312866211, "learning_rate": 1.6196285607297013e-06, "loss": 0.0775, "num_input_tokens_seen": 49467104, "step": 73385 }, { "epoch": 1.792929909852686, "grad_norm": 0.06661586463451385, "learning_rate": 1.6195616244997698e-06, "loss": 0.1124, "num_input_tokens_seen": 49470432, "step": 73390 }, { "epoch": 1.7930520606845333, "grad_norm": 0.15379488468170166, "learning_rate": 1.6194946837642194e-06, "loss": 0.0208, "num_input_tokens_seen": 49473824, "step": 73395 }, { "epoch": 1.7931742115163805, "grad_norm": 329.20697021484375, "learning_rate": 1.6194277385235372e-06, "loss": 0.0871, "num_input_tokens_seen": 49477536, "step": 73400 }, { "epoch": 1.7932963623482276, "grad_norm": 0.23400792479515076, "learning_rate": 1.6193607887782098e-06, "loss": 0.001, "num_input_tokens_seen": 49481120, "step": 73405 }, { "epoch": 1.7934185131800748, "grad_norm": 0.6813299059867859, "learning_rate": 1.619293834528724e-06, "loss": 0.0447, "num_input_tokens_seen": 49484512, "step": 73410 }, { "epoch": 1.793540664011922, "grad_norm": 22.358814239501953, "learning_rate": 1.6192268757755674e-06, "loss": 0.1016, "num_input_tokens_seen": 49487648, "step": 73415 }, { "epoch": 1.7936628148437692, "grad_norm": 0.401123970746994, "learning_rate": 1.6191599125192256e-06, "loss": 0.0026, "num_input_tokens_seen": 49491104, "step": 73420 }, { "epoch": 1.7937849656756164, "grad_norm": 0.22458702325820923, "learning_rate": 1.6190929447601872e-06, "loss": 0.094, "num_input_tokens_seen": 49494496, "step": 73425 }, { "epoch": 1.7939071165074634, "grad_norm": 12.835310935974121, "learning_rate": 1.6190259724989378e-06, "loss": 0.0112, "num_input_tokens_seen": 49498016, "step": 73430 }, { "epoch": 1.7940292673393106, "grad_norm": 24.49115562438965, "learning_rate": 1.6189589957359652e-06, "loss": 0.1724, "num_input_tokens_seen": 49501280, "step": 73435 }, { "epoch": 1.7941514181711578, "grad_norm": 19.634645462036133, "learning_rate": 1.6188920144717564e-06, "loss": 0.1396, "num_input_tokens_seen": 49504224, "step": 73440 }, { "epoch": 1.794273569003005, "grad_norm": 0.5965172052383423, "learning_rate": 1.6188250287067984e-06, "loss": 0.0372, "num_input_tokens_seen": 49507808, "step": 73445 }, { "epoch": 1.794395719834852, "grad_norm": 0.1391860544681549, "learning_rate": 1.6187580384415785e-06, "loss": 0.0516, "num_input_tokens_seen": 49511264, "step": 73450 }, { "epoch": 1.794517870666699, "grad_norm": 27.543590545654297, "learning_rate": 1.6186910436765833e-06, "loss": 0.0438, "num_input_tokens_seen": 49514656, "step": 73455 }, { "epoch": 1.7946400214985463, "grad_norm": 0.5109656453132629, "learning_rate": 1.6186240444123005e-06, "loss": 0.0227, "num_input_tokens_seen": 49517728, "step": 73460 }, { "epoch": 1.7947621723303935, "grad_norm": 0.40214264392852783, "learning_rate": 1.6185570406492174e-06, "loss": 0.1171, "num_input_tokens_seen": 49520864, "step": 73465 }, { "epoch": 1.7948843231622407, "grad_norm": 0.30075138807296753, "learning_rate": 1.6184900323878211e-06, "loss": 0.0544, "num_input_tokens_seen": 49523872, "step": 73470 }, { "epoch": 1.7950064739940879, "grad_norm": 1.6809254884719849, "learning_rate": 1.618423019628599e-06, "loss": 0.1353, "num_input_tokens_seen": 49527584, "step": 73475 }, { "epoch": 1.795128624825935, "grad_norm": 0.1620369851589203, "learning_rate": 1.6183560023720384e-06, "loss": 0.001, "num_input_tokens_seen": 49530976, "step": 73480 }, { "epoch": 1.7952507756577822, "grad_norm": 3.483069658279419, "learning_rate": 1.6182889806186264e-06, "loss": 0.0426, "num_input_tokens_seen": 49533792, "step": 73485 }, { "epoch": 1.7953729264896294, "grad_norm": 33.021671295166016, "learning_rate": 1.6182219543688507e-06, "loss": 0.1981, "num_input_tokens_seen": 49537056, "step": 73490 }, { "epoch": 1.7954950773214766, "grad_norm": 0.7459858655929565, "learning_rate": 1.6181549236231989e-06, "loss": 0.0319, "num_input_tokens_seen": 49540000, "step": 73495 }, { "epoch": 1.7956172281533238, "grad_norm": 20.98183822631836, "learning_rate": 1.618087888382158e-06, "loss": 0.042, "num_input_tokens_seen": 49543904, "step": 73500 }, { "epoch": 1.795739378985171, "grad_norm": 0.34212106466293335, "learning_rate": 1.6180208486462159e-06, "loss": 0.0453, "num_input_tokens_seen": 49547360, "step": 73505 }, { "epoch": 1.7958615298170182, "grad_norm": 0.07674697786569595, "learning_rate": 1.61795380441586e-06, "loss": 0.116, "num_input_tokens_seen": 49550624, "step": 73510 }, { "epoch": 1.7959836806488654, "grad_norm": 0.45899268984794617, "learning_rate": 1.6178867556915775e-06, "loss": 0.0884, "num_input_tokens_seen": 49553696, "step": 73515 }, { "epoch": 1.7961058314807123, "grad_norm": 1.0446715354919434, "learning_rate": 1.6178197024738566e-06, "loss": 0.0022, "num_input_tokens_seen": 49557152, "step": 73520 }, { "epoch": 1.7962279823125595, "grad_norm": 0.03729706257581711, "learning_rate": 1.6177526447631845e-06, "loss": 0.0508, "num_input_tokens_seen": 49560224, "step": 73525 }, { "epoch": 1.7963501331444067, "grad_norm": 0.20750153064727783, "learning_rate": 1.617685582560049e-06, "loss": 0.07, "num_input_tokens_seen": 49563552, "step": 73530 }, { "epoch": 1.796472283976254, "grad_norm": 16.814863204956055, "learning_rate": 1.617618515864938e-06, "loss": 0.0935, "num_input_tokens_seen": 49567200, "step": 73535 }, { "epoch": 1.7965944348081009, "grad_norm": 0.13789507746696472, "learning_rate": 1.617551444678339e-06, "loss": 0.0344, "num_input_tokens_seen": 49570720, "step": 73540 }, { "epoch": 1.796716585639948, "grad_norm": 0.23944073915481567, "learning_rate": 1.6174843690007396e-06, "loss": 0.001, "num_input_tokens_seen": 49573728, "step": 73545 }, { "epoch": 1.7968387364717953, "grad_norm": 0.19656451046466827, "learning_rate": 1.6174172888326279e-06, "loss": 0.0986, "num_input_tokens_seen": 49577120, "step": 73550 }, { "epoch": 1.7969608873036425, "grad_norm": 22.739328384399414, "learning_rate": 1.6173502041744915e-06, "loss": 0.0424, "num_input_tokens_seen": 49580448, "step": 73555 }, { "epoch": 1.7970830381354896, "grad_norm": 0.4079742431640625, "learning_rate": 1.6172831150268188e-06, "loss": 0.1051, "num_input_tokens_seen": 49584544, "step": 73560 }, { "epoch": 1.7972051889673368, "grad_norm": 0.04261103272438049, "learning_rate": 1.6172160213900967e-06, "loss": 0.0408, "num_input_tokens_seen": 49587616, "step": 73565 }, { "epoch": 1.797327339799184, "grad_norm": 0.020871929824352264, "learning_rate": 1.617148923264814e-06, "loss": 0.11, "num_input_tokens_seen": 49591072, "step": 73570 }, { "epoch": 1.7974494906310312, "grad_norm": 9.203412055969238, "learning_rate": 1.617081820651458e-06, "loss": 0.184, "num_input_tokens_seen": 49594336, "step": 73575 }, { "epoch": 1.7975716414628784, "grad_norm": 0.29414746165275574, "learning_rate": 1.6170147135505175e-06, "loss": 0.0354, "num_input_tokens_seen": 49598176, "step": 73580 }, { "epoch": 1.7976937922947256, "grad_norm": 342.13507080078125, "learning_rate": 1.6169476019624796e-06, "loss": 0.0947, "num_input_tokens_seen": 49601120, "step": 73585 }, { "epoch": 1.7978159431265728, "grad_norm": 13.11091423034668, "learning_rate": 1.616880485887833e-06, "loss": 0.1298, "num_input_tokens_seen": 49604704, "step": 73590 }, { "epoch": 1.79793809395842, "grad_norm": 25.429052352905273, "learning_rate": 1.6168133653270657e-06, "loss": 0.0295, "num_input_tokens_seen": 49607968, "step": 73595 }, { "epoch": 1.7980602447902672, "grad_norm": 0.16069746017456055, "learning_rate": 1.6167462402806658e-06, "loss": 0.1576, "num_input_tokens_seen": 49611360, "step": 73600 }, { "epoch": 1.7981823956221143, "grad_norm": 0.32684841752052307, "learning_rate": 1.6166791107491212e-06, "loss": 0.1108, "num_input_tokens_seen": 49614944, "step": 73605 }, { "epoch": 1.7983045464539613, "grad_norm": 48.709228515625, "learning_rate": 1.61661197673292e-06, "loss": 0.1571, "num_input_tokens_seen": 49618528, "step": 73610 }, { "epoch": 1.7984266972858085, "grad_norm": 0.20360717177391052, "learning_rate": 1.616544838232551e-06, "loss": 0.0018, "num_input_tokens_seen": 49622304, "step": 73615 }, { "epoch": 1.7985488481176557, "grad_norm": 0.22650174796581268, "learning_rate": 1.6164776952485017e-06, "loss": 0.0008, "num_input_tokens_seen": 49625376, "step": 73620 }, { "epoch": 1.7986709989495029, "grad_norm": 0.02403288520872593, "learning_rate": 1.6164105477812612e-06, "loss": 0.0985, "num_input_tokens_seen": 49628576, "step": 73625 }, { "epoch": 1.7987931497813499, "grad_norm": 8.939688682556152, "learning_rate": 1.6163433958313174e-06, "loss": 0.0531, "num_input_tokens_seen": 49632352, "step": 73630 }, { "epoch": 1.798915300613197, "grad_norm": 0.2442442625761032, "learning_rate": 1.6162762393991585e-06, "loss": 0.1582, "num_input_tokens_seen": 49636000, "step": 73635 }, { "epoch": 1.7990374514450442, "grad_norm": 0.18538600206375122, "learning_rate": 1.6162090784852728e-06, "loss": 0.1563, "num_input_tokens_seen": 49639136, "step": 73640 }, { "epoch": 1.7991596022768914, "grad_norm": 0.099936343729496, "learning_rate": 1.616141913090149e-06, "loss": 0.0032, "num_input_tokens_seen": 49642848, "step": 73645 }, { "epoch": 1.7992817531087386, "grad_norm": 0.20571479201316833, "learning_rate": 1.616074743214276e-06, "loss": 0.1117, "num_input_tokens_seen": 49645856, "step": 73650 }, { "epoch": 1.7994039039405858, "grad_norm": 0.04828282445669174, "learning_rate": 1.6160075688581414e-06, "loss": 0.0952, "num_input_tokens_seen": 49649120, "step": 73655 }, { "epoch": 1.799526054772433, "grad_norm": 0.1694461703300476, "learning_rate": 1.6159403900222342e-06, "loss": 0.0019, "num_input_tokens_seen": 49653280, "step": 73660 }, { "epoch": 1.7996482056042802, "grad_norm": 8.066333770751953, "learning_rate": 1.6158732067070426e-06, "loss": 0.0895, "num_input_tokens_seen": 49656480, "step": 73665 }, { "epoch": 1.7997703564361274, "grad_norm": 2.2907662391662598, "learning_rate": 1.6158060189130556e-06, "loss": 0.0036, "num_input_tokens_seen": 49659744, "step": 73670 }, { "epoch": 1.7998925072679746, "grad_norm": 0.21309258043766022, "learning_rate": 1.6157388266407614e-06, "loss": 0.1183, "num_input_tokens_seen": 49663136, "step": 73675 }, { "epoch": 1.8000146580998218, "grad_norm": 0.2838139235973358, "learning_rate": 1.6156716298906487e-06, "loss": 0.1008, "num_input_tokens_seen": 49666336, "step": 73680 }, { "epoch": 1.800136808931669, "grad_norm": 13.1321439743042, "learning_rate": 1.615604428663207e-06, "loss": 0.0426, "num_input_tokens_seen": 49669408, "step": 73685 }, { "epoch": 1.8002589597635161, "grad_norm": 0.43830278515815735, "learning_rate": 1.6155372229589234e-06, "loss": 0.0701, "num_input_tokens_seen": 49672800, "step": 73690 }, { "epoch": 1.8003811105953633, "grad_norm": 12.03991413116455, "learning_rate": 1.6154700127782883e-06, "loss": 0.0654, "num_input_tokens_seen": 49676000, "step": 73695 }, { "epoch": 1.8005032614272103, "grad_norm": 0.11567030102014542, "learning_rate": 1.6154027981217894e-06, "loss": 0.1242, "num_input_tokens_seen": 49678752, "step": 73700 }, { "epoch": 1.8006254122590575, "grad_norm": 17.359439849853516, "learning_rate": 1.6153355789899159e-06, "loss": 0.1069, "num_input_tokens_seen": 49681952, "step": 73705 }, { "epoch": 1.8007475630909047, "grad_norm": 0.09139929711818695, "learning_rate": 1.6152683553831565e-06, "loss": 0.0015, "num_input_tokens_seen": 49685536, "step": 73710 }, { "epoch": 1.8008697139227519, "grad_norm": 15.379899024963379, "learning_rate": 1.6152011273020002e-06, "loss": 0.1863, "num_input_tokens_seen": 49689056, "step": 73715 }, { "epoch": 1.8009918647545988, "grad_norm": 20.884328842163086, "learning_rate": 1.6151338947469358e-06, "loss": 0.0266, "num_input_tokens_seen": 49692128, "step": 73720 }, { "epoch": 1.801114015586446, "grad_norm": 0.5158682465553284, "learning_rate": 1.6150666577184521e-06, "loss": 0.0509, "num_input_tokens_seen": 49695072, "step": 73725 }, { "epoch": 1.8012361664182932, "grad_norm": 0.5207684636116028, "learning_rate": 1.6149994162170386e-06, "loss": 0.0198, "num_input_tokens_seen": 49698720, "step": 73730 }, { "epoch": 1.8013583172501404, "grad_norm": 137.9433135986328, "learning_rate": 1.6149321702431836e-06, "loss": 0.1982, "num_input_tokens_seen": 49702176, "step": 73735 }, { "epoch": 1.8014804680819876, "grad_norm": 0.596860408782959, "learning_rate": 1.6148649197973768e-06, "loss": 0.0983, "num_input_tokens_seen": 49705824, "step": 73740 }, { "epoch": 1.8016026189138348, "grad_norm": 26.139259338378906, "learning_rate": 1.6147976648801068e-06, "loss": 0.1295, "num_input_tokens_seen": 49708960, "step": 73745 }, { "epoch": 1.801724769745682, "grad_norm": 0.34380877017974854, "learning_rate": 1.6147304054918626e-06, "loss": 0.0654, "num_input_tokens_seen": 49712544, "step": 73750 }, { "epoch": 1.8018469205775292, "grad_norm": 0.10611290484666824, "learning_rate": 1.6146631416331338e-06, "loss": 0.0359, "num_input_tokens_seen": 49716128, "step": 73755 }, { "epoch": 1.8019690714093763, "grad_norm": 0.09845145046710968, "learning_rate": 1.6145958733044092e-06, "loss": 0.1364, "num_input_tokens_seen": 49719456, "step": 73760 }, { "epoch": 1.8020912222412235, "grad_norm": 17.626033782958984, "learning_rate": 1.614528600506178e-06, "loss": 0.1124, "num_input_tokens_seen": 49722400, "step": 73765 }, { "epoch": 1.8022133730730707, "grad_norm": 0.046197183430194855, "learning_rate": 1.6144613232389295e-06, "loss": 0.0819, "num_input_tokens_seen": 49726112, "step": 73770 }, { "epoch": 1.802335523904918, "grad_norm": 0.08827891200780869, "learning_rate": 1.614394041503153e-06, "loss": 0.075, "num_input_tokens_seen": 49729248, "step": 73775 }, { "epoch": 1.802457674736765, "grad_norm": 8.221805572509766, "learning_rate": 1.6143267552993382e-06, "loss": 0.182, "num_input_tokens_seen": 49732640, "step": 73780 }, { "epoch": 1.802579825568612, "grad_norm": 0.11934688687324524, "learning_rate": 1.6142594646279738e-06, "loss": 0.165, "num_input_tokens_seen": 49736480, "step": 73785 }, { "epoch": 1.8027019764004593, "grad_norm": 0.350424200296402, "learning_rate": 1.614192169489549e-06, "loss": 0.1129, "num_input_tokens_seen": 49740064, "step": 73790 }, { "epoch": 1.8028241272323065, "grad_norm": 0.7355371117591858, "learning_rate": 1.6141248698845538e-06, "loss": 0.0624, "num_input_tokens_seen": 49743264, "step": 73795 }, { "epoch": 1.8029462780641536, "grad_norm": 7.172179222106934, "learning_rate": 1.6140575658134772e-06, "loss": 0.101, "num_input_tokens_seen": 49747488, "step": 73800 }, { "epoch": 1.8030684288960008, "grad_norm": 0.3119280934333801, "learning_rate": 1.6139902572768094e-06, "loss": 0.1085, "num_input_tokens_seen": 49750752, "step": 73805 }, { "epoch": 1.8031905797278478, "grad_norm": 0.2874053120613098, "learning_rate": 1.6139229442750385e-06, "loss": 0.0637, "num_input_tokens_seen": 49753824, "step": 73810 }, { "epoch": 1.803312730559695, "grad_norm": 0.2166033685207367, "learning_rate": 1.6138556268086557e-06, "loss": 0.0009, "num_input_tokens_seen": 49757216, "step": 73815 }, { "epoch": 1.8034348813915422, "grad_norm": 0.30320924520492554, "learning_rate": 1.613788304878149e-06, "loss": 0.0381, "num_input_tokens_seen": 49760736, "step": 73820 }, { "epoch": 1.8035570322233894, "grad_norm": 0.18933552503585815, "learning_rate": 1.6137209784840086e-06, "loss": 0.0885, "num_input_tokens_seen": 49764576, "step": 73825 }, { "epoch": 1.8036791830552366, "grad_norm": 121.48233795166016, "learning_rate": 1.6136536476267243e-06, "loss": 0.0437, "num_input_tokens_seen": 49767712, "step": 73830 }, { "epoch": 1.8038013338870837, "grad_norm": 0.2928767204284668, "learning_rate": 1.6135863123067858e-06, "loss": 0.0381, "num_input_tokens_seen": 49771296, "step": 73835 }, { "epoch": 1.803923484718931, "grad_norm": 109.94903564453125, "learning_rate": 1.6135189725246828e-06, "loss": 0.0836, "num_input_tokens_seen": 49774624, "step": 73840 }, { "epoch": 1.8040456355507781, "grad_norm": 12.922600746154785, "learning_rate": 1.6134516282809045e-06, "loss": 0.1184, "num_input_tokens_seen": 49778784, "step": 73845 }, { "epoch": 1.8041677863826253, "grad_norm": 14.157644271850586, "learning_rate": 1.6133842795759408e-06, "loss": 0.082, "num_input_tokens_seen": 49782240, "step": 73850 }, { "epoch": 1.8042899372144725, "grad_norm": 0.6490665078163147, "learning_rate": 1.613316926410282e-06, "loss": 0.0021, "num_input_tokens_seen": 49785184, "step": 73855 }, { "epoch": 1.8044120880463197, "grad_norm": 0.5274012684822083, "learning_rate": 1.6132495687844174e-06, "loss": 0.0884, "num_input_tokens_seen": 49788384, "step": 73860 }, { "epoch": 1.8045342388781669, "grad_norm": 30.976333618164062, "learning_rate": 1.6131822066988372e-06, "loss": 0.1359, "num_input_tokens_seen": 49791776, "step": 73865 }, { "epoch": 1.804656389710014, "grad_norm": 51.13031768798828, "learning_rate": 1.6131148401540307e-06, "loss": 0.1146, "num_input_tokens_seen": 49795680, "step": 73870 }, { "epoch": 1.804778540541861, "grad_norm": 0.002726664301007986, "learning_rate": 1.6130474691504885e-06, "loss": 0.0008, "num_input_tokens_seen": 49799264, "step": 73875 }, { "epoch": 1.8049006913737082, "grad_norm": 16.781251907348633, "learning_rate": 1.6129800936887002e-06, "loss": 0.0959, "num_input_tokens_seen": 49802720, "step": 73880 }, { "epoch": 1.8050228422055554, "grad_norm": 0.9871700406074524, "learning_rate": 1.6129127137691554e-06, "loss": 0.0705, "num_input_tokens_seen": 49806560, "step": 73885 }, { "epoch": 1.8051449930374026, "grad_norm": 31.406795501708984, "learning_rate": 1.6128453293923446e-06, "loss": 0.1185, "num_input_tokens_seen": 49809760, "step": 73890 }, { "epoch": 1.8052671438692498, "grad_norm": 0.21563898026943207, "learning_rate": 1.6127779405587578e-06, "loss": 0.038, "num_input_tokens_seen": 49813344, "step": 73895 }, { "epoch": 1.8053892947010968, "grad_norm": 0.48208603262901306, "learning_rate": 1.6127105472688852e-06, "loss": 0.0756, "num_input_tokens_seen": 49816928, "step": 73900 }, { "epoch": 1.805511445532944, "grad_norm": 1.9552193880081177, "learning_rate": 1.6126431495232167e-06, "loss": 0.0234, "num_input_tokens_seen": 49820320, "step": 73905 }, { "epoch": 1.8056335963647911, "grad_norm": 137.20310974121094, "learning_rate": 1.6125757473222423e-06, "loss": 0.1326, "num_input_tokens_seen": 49823712, "step": 73910 }, { "epoch": 1.8057557471966383, "grad_norm": 0.2588064670562744, "learning_rate": 1.6125083406664523e-06, "loss": 0.1187, "num_input_tokens_seen": 49826848, "step": 73915 }, { "epoch": 1.8058778980284855, "grad_norm": 0.5283862352371216, "learning_rate": 1.6124409295563369e-06, "loss": 0.1636, "num_input_tokens_seen": 49829856, "step": 73920 }, { "epoch": 1.8060000488603327, "grad_norm": 1.8500924110412598, "learning_rate": 1.612373513992386e-06, "loss": 0.0391, "num_input_tokens_seen": 49833056, "step": 73925 }, { "epoch": 1.80612219969218, "grad_norm": 0.8077684044837952, "learning_rate": 1.6123060939750908e-06, "loss": 0.0296, "num_input_tokens_seen": 49836832, "step": 73930 }, { "epoch": 1.806244350524027, "grad_norm": 0.27247530221939087, "learning_rate": 1.6122386695049409e-06, "loss": 0.0689, "num_input_tokens_seen": 49840160, "step": 73935 }, { "epoch": 1.8063665013558743, "grad_norm": 0.2970154285430908, "learning_rate": 1.6121712405824263e-06, "loss": 0.0879, "num_input_tokens_seen": 49843808, "step": 73940 }, { "epoch": 1.8064886521877215, "grad_norm": 0.5835685133934021, "learning_rate": 1.6121038072080382e-06, "loss": 0.0351, "num_input_tokens_seen": 49847008, "step": 73945 }, { "epoch": 1.8066108030195687, "grad_norm": 31.69612693786621, "learning_rate": 1.6120363693822663e-06, "loss": 0.1674, "num_input_tokens_seen": 49850528, "step": 73950 }, { "epoch": 1.8067329538514159, "grad_norm": 11.472599029541016, "learning_rate": 1.6119689271056013e-06, "loss": 0.1243, "num_input_tokens_seen": 49853792, "step": 73955 }, { "epoch": 1.806855104683263, "grad_norm": 2.412147283554077, "learning_rate": 1.6119014803785338e-06, "loss": 0.0015, "num_input_tokens_seen": 49856928, "step": 73960 }, { "epoch": 1.80697725551511, "grad_norm": 0.26464715600013733, "learning_rate": 1.6118340292015545e-06, "loss": 0.0041, "num_input_tokens_seen": 49860512, "step": 73965 }, { "epoch": 1.8070994063469572, "grad_norm": 0.1113317459821701, "learning_rate": 1.6117665735751529e-06, "loss": 0.0398, "num_input_tokens_seen": 49864416, "step": 73970 }, { "epoch": 1.8072215571788044, "grad_norm": 1.2402263879776, "learning_rate": 1.6116991134998208e-06, "loss": 0.0755, "num_input_tokens_seen": 49867552, "step": 73975 }, { "epoch": 1.8073437080106516, "grad_norm": 0.6913524866104126, "learning_rate": 1.6116316489760477e-06, "loss": 0.1772, "num_input_tokens_seen": 49870752, "step": 73980 }, { "epoch": 1.8074658588424986, "grad_norm": 0.17152690887451172, "learning_rate": 1.6115641800043252e-06, "loss": 0.1199, "num_input_tokens_seen": 49873952, "step": 73985 }, { "epoch": 1.8075880096743457, "grad_norm": 0.07484672963619232, "learning_rate": 1.6114967065851431e-06, "loss": 0.0553, "num_input_tokens_seen": 49877152, "step": 73990 }, { "epoch": 1.807710160506193, "grad_norm": 0.34017398953437805, "learning_rate": 1.6114292287189928e-06, "loss": 0.0014, "num_input_tokens_seen": 49880672, "step": 73995 }, { "epoch": 1.8078323113380401, "grad_norm": 0.23076596856117249, "learning_rate": 1.6113617464063646e-06, "loss": 0.1862, "num_input_tokens_seen": 49884128, "step": 74000 }, { "epoch": 1.8079544621698873, "grad_norm": 19.423137664794922, "learning_rate": 1.6112942596477491e-06, "loss": 0.1815, "num_input_tokens_seen": 49887264, "step": 74005 }, { "epoch": 1.8080766130017345, "grad_norm": 0.2476540058851242, "learning_rate": 1.6112267684436378e-06, "loss": 0.0047, "num_input_tokens_seen": 49890400, "step": 74010 }, { "epoch": 1.8081987638335817, "grad_norm": 0.1816900074481964, "learning_rate": 1.6111592727945205e-06, "loss": 0.0378, "num_input_tokens_seen": 49893792, "step": 74015 }, { "epoch": 1.8083209146654289, "grad_norm": 0.04045751318335533, "learning_rate": 1.611091772700889e-06, "loss": 0.0564, "num_input_tokens_seen": 49897056, "step": 74020 }, { "epoch": 1.808443065497276, "grad_norm": 18.38286781311035, "learning_rate": 1.6110242681632335e-06, "loss": 0.1873, "num_input_tokens_seen": 49900128, "step": 74025 }, { "epoch": 1.8085652163291233, "grad_norm": 13.320504188537598, "learning_rate": 1.6109567591820454e-06, "loss": 0.1264, "num_input_tokens_seen": 49903584, "step": 74030 }, { "epoch": 1.8086873671609704, "grad_norm": 0.5575301647186279, "learning_rate": 1.6108892457578151e-06, "loss": 0.0015, "num_input_tokens_seen": 49906848, "step": 74035 }, { "epoch": 1.8088095179928176, "grad_norm": 1.4660645723342896, "learning_rate": 1.6108217278910342e-06, "loss": 0.0855, "num_input_tokens_seen": 49910688, "step": 74040 }, { "epoch": 1.8089316688246648, "grad_norm": 13.347358703613281, "learning_rate": 1.6107542055821934e-06, "loss": 0.0939, "num_input_tokens_seen": 49913824, "step": 74045 }, { "epoch": 1.809053819656512, "grad_norm": 0.15625794231891632, "learning_rate": 1.6106866788317837e-06, "loss": 0.0009, "num_input_tokens_seen": 49917216, "step": 74050 }, { "epoch": 1.809175970488359, "grad_norm": 0.2677502930164337, "learning_rate": 1.6106191476402961e-06, "loss": 0.0365, "num_input_tokens_seen": 49920800, "step": 74055 }, { "epoch": 1.8092981213202062, "grad_norm": 34.20901107788086, "learning_rate": 1.6105516120082218e-06, "loss": 0.2032, "num_input_tokens_seen": 49924192, "step": 74060 }, { "epoch": 1.8094202721520534, "grad_norm": 30.46437644958496, "learning_rate": 1.610484071936052e-06, "loss": 0.0806, "num_input_tokens_seen": 49927456, "step": 74065 }, { "epoch": 1.8095424229839006, "grad_norm": 0.4633569121360779, "learning_rate": 1.6104165274242782e-06, "loss": 0.0486, "num_input_tokens_seen": 49930528, "step": 74070 }, { "epoch": 1.8096645738157475, "grad_norm": 0.09732931852340698, "learning_rate": 1.610348978473391e-06, "loss": 0.0669, "num_input_tokens_seen": 49933920, "step": 74075 }, { "epoch": 1.8097867246475947, "grad_norm": 0.17056681215763092, "learning_rate": 1.6102814250838814e-06, "loss": 0.0025, "num_input_tokens_seen": 49937120, "step": 74080 }, { "epoch": 1.809908875479442, "grad_norm": 268.8205261230469, "learning_rate": 1.6102138672562417e-06, "loss": 0.1609, "num_input_tokens_seen": 49940448, "step": 74085 }, { "epoch": 1.810031026311289, "grad_norm": 0.40876808762550354, "learning_rate": 1.6101463049909626e-06, "loss": 0.0982, "num_input_tokens_seen": 49943712, "step": 74090 }, { "epoch": 1.8101531771431363, "grad_norm": 0.37469613552093506, "learning_rate": 1.6100787382885352e-06, "loss": 0.1006, "num_input_tokens_seen": 49946784, "step": 74095 }, { "epoch": 1.8102753279749835, "grad_norm": 2.2056779861450195, "learning_rate": 1.6100111671494511e-06, "loss": 0.1143, "num_input_tokens_seen": 49949920, "step": 74100 }, { "epoch": 1.8103974788068307, "grad_norm": 0.019310932606458664, "learning_rate": 1.6099435915742018e-06, "loss": 0.0009, "num_input_tokens_seen": 49953568, "step": 74105 }, { "epoch": 1.8105196296386779, "grad_norm": 0.16825686395168304, "learning_rate": 1.6098760115632785e-06, "loss": 0.0568, "num_input_tokens_seen": 49957152, "step": 74110 }, { "epoch": 1.810641780470525, "grad_norm": 15.703826904296875, "learning_rate": 1.6098084271171732e-06, "loss": 0.0408, "num_input_tokens_seen": 49960736, "step": 74115 }, { "epoch": 1.8107639313023722, "grad_norm": 0.25060200691223145, "learning_rate": 1.6097408382363768e-06, "loss": 0.0006, "num_input_tokens_seen": 49964256, "step": 74120 }, { "epoch": 1.8108860821342194, "grad_norm": 0.20446979999542236, "learning_rate": 1.6096732449213812e-06, "loss": 0.166, "num_input_tokens_seen": 49967456, "step": 74125 }, { "epoch": 1.8110082329660666, "grad_norm": 33.712005615234375, "learning_rate": 1.6096056471726775e-06, "loss": 0.0413, "num_input_tokens_seen": 49970912, "step": 74130 }, { "epoch": 1.8111303837979138, "grad_norm": 0.1862323135137558, "learning_rate": 1.6095380449907577e-06, "loss": 0.0854, "num_input_tokens_seen": 49973984, "step": 74135 }, { "epoch": 1.811252534629761, "grad_norm": 0.0933285802602768, "learning_rate": 1.609470438376113e-06, "loss": 0.0806, "num_input_tokens_seen": 49977248, "step": 74140 }, { "epoch": 1.811374685461608, "grad_norm": 13.727092742919922, "learning_rate": 1.609402827329236e-06, "loss": 0.1203, "num_input_tokens_seen": 49980384, "step": 74145 }, { "epoch": 1.8114968362934551, "grad_norm": 0.2599273920059204, "learning_rate": 1.609335211850617e-06, "loss": 0.0476, "num_input_tokens_seen": 49983584, "step": 74150 }, { "epoch": 1.8116189871253023, "grad_norm": 0.024421192705631256, "learning_rate": 1.6092675919407487e-06, "loss": 0.1712, "num_input_tokens_seen": 49987104, "step": 74155 }, { "epoch": 1.8117411379571495, "grad_norm": 0.3774043023586273, "learning_rate": 1.6091999676001228e-06, "loss": 0.0908, "num_input_tokens_seen": 49990752, "step": 74160 }, { "epoch": 1.8118632887889965, "grad_norm": 0.11246316134929657, "learning_rate": 1.609132338829231e-06, "loss": 0.0425, "num_input_tokens_seen": 49994208, "step": 74165 }, { "epoch": 1.8119854396208437, "grad_norm": 12.335349082946777, "learning_rate": 1.6090647056285645e-06, "loss": 0.0904, "num_input_tokens_seen": 49997536, "step": 74170 }, { "epoch": 1.8121075904526909, "grad_norm": 0.5349389910697937, "learning_rate": 1.608997067998616e-06, "loss": 0.06, "num_input_tokens_seen": 50000544, "step": 74175 }, { "epoch": 1.812229741284538, "grad_norm": 0.22816990315914154, "learning_rate": 1.608929425939877e-06, "loss": 0.096, "num_input_tokens_seen": 50003680, "step": 74180 }, { "epoch": 1.8123518921163853, "grad_norm": 0.25701552629470825, "learning_rate": 1.6088617794528392e-06, "loss": 0.1466, "num_input_tokens_seen": 50006880, "step": 74185 }, { "epoch": 1.8124740429482324, "grad_norm": 21.37340545654297, "learning_rate": 1.608794128537995e-06, "loss": 0.1236, "num_input_tokens_seen": 50010528, "step": 74190 }, { "epoch": 1.8125961937800796, "grad_norm": 0.05798621103167534, "learning_rate": 1.608726473195836e-06, "loss": 0.0713, "num_input_tokens_seen": 50014176, "step": 74195 }, { "epoch": 1.8127183446119268, "grad_norm": 0.3602031469345093, "learning_rate": 1.6086588134268544e-06, "loss": 0.1177, "num_input_tokens_seen": 50017312, "step": 74200 }, { "epoch": 1.812840495443774, "grad_norm": 0.13574683666229248, "learning_rate": 1.6085911492315423e-06, "loss": 0.0225, "num_input_tokens_seen": 50021024, "step": 74205 }, { "epoch": 1.8129626462756212, "grad_norm": 38.42686462402344, "learning_rate": 1.6085234806103918e-06, "loss": 0.1781, "num_input_tokens_seen": 50024416, "step": 74210 }, { "epoch": 1.8130847971074684, "grad_norm": 0.257497102022171, "learning_rate": 1.6084558075638946e-06, "loss": 0.0969, "num_input_tokens_seen": 50027680, "step": 74215 }, { "epoch": 1.8132069479393156, "grad_norm": 23.47085189819336, "learning_rate": 1.608388130092543e-06, "loss": 0.2283, "num_input_tokens_seen": 50030752, "step": 74220 }, { "epoch": 1.8133290987711628, "grad_norm": 1.341367483139038, "learning_rate": 1.6083204481968297e-06, "loss": 0.004, "num_input_tokens_seen": 50033824, "step": 74225 }, { "epoch": 1.81345124960301, "grad_norm": 0.4435931444168091, "learning_rate": 1.6082527618772462e-06, "loss": 0.0022, "num_input_tokens_seen": 50036896, "step": 74230 }, { "epoch": 1.813573400434857, "grad_norm": 0.06710055470466614, "learning_rate": 1.608185071134285e-06, "loss": 0.1087, "num_input_tokens_seen": 50040544, "step": 74235 }, { "epoch": 1.8136955512667041, "grad_norm": 0.07674826681613922, "learning_rate": 1.6081173759684385e-06, "loss": 0.0011, "num_input_tokens_seen": 50044128, "step": 74240 }, { "epoch": 1.8138177020985513, "grad_norm": 0.14639919996261597, "learning_rate": 1.6080496763801989e-06, "loss": 0.0458, "num_input_tokens_seen": 50047520, "step": 74245 }, { "epoch": 1.8139398529303985, "grad_norm": 0.4655853807926178, "learning_rate": 1.6079819723700585e-06, "loss": 0.1124, "num_input_tokens_seen": 50050528, "step": 74250 }, { "epoch": 1.8140620037622455, "grad_norm": 0.3685029447078705, "learning_rate": 1.6079142639385096e-06, "loss": 0.0276, "num_input_tokens_seen": 50054176, "step": 74255 }, { "epoch": 1.8141841545940927, "grad_norm": 93.46351623535156, "learning_rate": 1.6078465510860446e-06, "loss": 0.1728, "num_input_tokens_seen": 50057440, "step": 74260 }, { "epoch": 1.8143063054259398, "grad_norm": 21.421630859375, "learning_rate": 1.607778833813156e-06, "loss": 0.0582, "num_input_tokens_seen": 50060832, "step": 74265 }, { "epoch": 1.814428456257787, "grad_norm": 37.1475830078125, "learning_rate": 1.6077111121203364e-06, "loss": 0.1701, "num_input_tokens_seen": 50064480, "step": 74270 }, { "epoch": 1.8145506070896342, "grad_norm": 17.186304092407227, "learning_rate": 1.607643386008078e-06, "loss": 0.0996, "num_input_tokens_seen": 50067936, "step": 74275 }, { "epoch": 1.8146727579214814, "grad_norm": 0.339376300573349, "learning_rate": 1.6075756554768736e-06, "loss": 0.0023, "num_input_tokens_seen": 50071328, "step": 74280 }, { "epoch": 1.8147949087533286, "grad_norm": 113.74297332763672, "learning_rate": 1.6075079205272155e-06, "loss": 0.0529, "num_input_tokens_seen": 50074592, "step": 74285 }, { "epoch": 1.8149170595851758, "grad_norm": 125.77494812011719, "learning_rate": 1.6074401811595965e-06, "loss": 0.2788, "num_input_tokens_seen": 50078304, "step": 74290 }, { "epoch": 1.815039210417023, "grad_norm": 0.16352109611034393, "learning_rate": 1.6073724373745088e-06, "loss": 0.038, "num_input_tokens_seen": 50081952, "step": 74295 }, { "epoch": 1.8151613612488702, "grad_norm": 9.420287132263184, "learning_rate": 1.6073046891724458e-06, "loss": 0.045, "num_input_tokens_seen": 50085472, "step": 74300 }, { "epoch": 1.8152835120807174, "grad_norm": 110.37330627441406, "learning_rate": 1.6072369365538996e-06, "loss": 0.1823, "num_input_tokens_seen": 50088416, "step": 74305 }, { "epoch": 1.8154056629125646, "grad_norm": 12.090911865234375, "learning_rate": 1.607169179519363e-06, "loss": 0.0917, "num_input_tokens_seen": 50092128, "step": 74310 }, { "epoch": 1.8155278137444117, "grad_norm": 0.09399466961622238, "learning_rate": 1.607101418069329e-06, "loss": 0.1018, "num_input_tokens_seen": 50094880, "step": 74315 }, { "epoch": 1.8156499645762587, "grad_norm": 8.643115043640137, "learning_rate": 1.60703365220429e-06, "loss": 0.1279, "num_input_tokens_seen": 50098592, "step": 74320 }, { "epoch": 1.815772115408106, "grad_norm": 8.991259574890137, "learning_rate": 1.606965881924739e-06, "loss": 0.0388, "num_input_tokens_seen": 50102112, "step": 74325 }, { "epoch": 1.815894266239953, "grad_norm": 23.121740341186523, "learning_rate": 1.6068981072311689e-06, "loss": 0.0739, "num_input_tokens_seen": 50105376, "step": 74330 }, { "epoch": 1.8160164170718003, "grad_norm": 0.6355023384094238, "learning_rate": 1.6068303281240725e-06, "loss": 0.0028, "num_input_tokens_seen": 50109024, "step": 74335 }, { "epoch": 1.8161385679036475, "grad_norm": 0.07773970812559128, "learning_rate": 1.6067625446039428e-06, "loss": 0.034, "num_input_tokens_seen": 50112608, "step": 74340 }, { "epoch": 1.8162607187354944, "grad_norm": 18.94000816345215, "learning_rate": 1.6066947566712728e-06, "loss": 0.0482, "num_input_tokens_seen": 50115808, "step": 74345 }, { "epoch": 1.8163828695673416, "grad_norm": 0.08719933778047562, "learning_rate": 1.6066269643265551e-06, "loss": 0.0401, "num_input_tokens_seen": 50119264, "step": 74350 }, { "epoch": 1.8165050203991888, "grad_norm": 0.2026486098766327, "learning_rate": 1.606559167570283e-06, "loss": 0.1096, "num_input_tokens_seen": 50122848, "step": 74355 }, { "epoch": 1.816627171231036, "grad_norm": 0.22614862024784088, "learning_rate": 1.6064913664029497e-06, "loss": 0.094, "num_input_tokens_seen": 50125984, "step": 74360 }, { "epoch": 1.8167493220628832, "grad_norm": 25.180818557739258, "learning_rate": 1.6064235608250479e-06, "loss": 0.1342, "num_input_tokens_seen": 50129248, "step": 74365 }, { "epoch": 1.8168714728947304, "grad_norm": 0.04529504477977753, "learning_rate": 1.6063557508370708e-06, "loss": 0.0858, "num_input_tokens_seen": 50132384, "step": 74370 }, { "epoch": 1.8169936237265776, "grad_norm": 4.388415336608887, "learning_rate": 1.6062879364395117e-06, "loss": 0.0504, "num_input_tokens_seen": 50135712, "step": 74375 }, { "epoch": 1.8171157745584248, "grad_norm": 60.08488845825195, "learning_rate": 1.6062201176328636e-06, "loss": 0.1098, "num_input_tokens_seen": 50138912, "step": 74380 }, { "epoch": 1.817237925390272, "grad_norm": 0.33900535106658936, "learning_rate": 1.6061522944176198e-06, "loss": 0.0416, "num_input_tokens_seen": 50142176, "step": 74385 }, { "epoch": 1.8173600762221191, "grad_norm": 17.24686050415039, "learning_rate": 1.6060844667942733e-06, "loss": 0.12, "num_input_tokens_seen": 50145504, "step": 74390 }, { "epoch": 1.8174822270539663, "grad_norm": 12.73847484588623, "learning_rate": 1.6060166347633177e-06, "loss": 0.078, "num_input_tokens_seen": 50148512, "step": 74395 }, { "epoch": 1.8176043778858135, "grad_norm": 0.10699830204248428, "learning_rate": 1.6059487983252462e-06, "loss": 0.0009, "num_input_tokens_seen": 50151904, "step": 74400 }, { "epoch": 1.8177265287176607, "grad_norm": 0.09634699672460556, "learning_rate": 1.605880957480552e-06, "loss": 0.0013, "num_input_tokens_seen": 50154912, "step": 74405 }, { "epoch": 1.8178486795495077, "grad_norm": 0.02464139088988304, "learning_rate": 1.6058131122297285e-06, "loss": 0.0562, "num_input_tokens_seen": 50158304, "step": 74410 }, { "epoch": 1.8179708303813549, "grad_norm": 98.05704498291016, "learning_rate": 1.605745262573269e-06, "loss": 0.1851, "num_input_tokens_seen": 50161248, "step": 74415 }, { "epoch": 1.818092981213202, "grad_norm": 1.038590908050537, "learning_rate": 1.6056774085116671e-06, "loss": 0.0014, "num_input_tokens_seen": 50164320, "step": 74420 }, { "epoch": 1.8182151320450493, "grad_norm": 0.07926052808761597, "learning_rate": 1.605609550045416e-06, "loss": 0.0429, "num_input_tokens_seen": 50167712, "step": 74425 }, { "epoch": 1.8183372828768964, "grad_norm": 0.05045356974005699, "learning_rate": 1.6055416871750098e-06, "loss": 0.0007, "num_input_tokens_seen": 50171680, "step": 74430 }, { "epoch": 1.8184594337087434, "grad_norm": 0.08700229227542877, "learning_rate": 1.6054738199009412e-06, "loss": 0.1016, "num_input_tokens_seen": 50175392, "step": 74435 }, { "epoch": 1.8185815845405906, "grad_norm": 0.2734203040599823, "learning_rate": 1.6054059482237043e-06, "loss": 0.0608, "num_input_tokens_seen": 50178720, "step": 74440 }, { "epoch": 1.8187037353724378, "grad_norm": 0.25550442934036255, "learning_rate": 1.6053380721437927e-06, "loss": 0.0404, "num_input_tokens_seen": 50181536, "step": 74445 }, { "epoch": 1.818825886204285, "grad_norm": 33.2105827331543, "learning_rate": 1.6052701916616993e-06, "loss": 0.1494, "num_input_tokens_seen": 50184800, "step": 74450 }, { "epoch": 1.8189480370361322, "grad_norm": 0.3790890872478485, "learning_rate": 1.6052023067779189e-06, "loss": 0.0619, "num_input_tokens_seen": 50187936, "step": 74455 }, { "epoch": 1.8190701878679794, "grad_norm": 0.04852888360619545, "learning_rate": 1.605134417492944e-06, "loss": 0.0656, "num_input_tokens_seen": 50191008, "step": 74460 }, { "epoch": 1.8191923386998265, "grad_norm": 0.017970800399780273, "learning_rate": 1.6050665238072689e-06, "loss": 0.0818, "num_input_tokens_seen": 50194400, "step": 74465 }, { "epoch": 1.8193144895316737, "grad_norm": 0.09376344829797745, "learning_rate": 1.6049986257213878e-06, "loss": 0.053, "num_input_tokens_seen": 50197280, "step": 74470 }, { "epoch": 1.819436640363521, "grad_norm": 14.67425537109375, "learning_rate": 1.6049307232357935e-06, "loss": 0.0762, "num_input_tokens_seen": 50200480, "step": 74475 }, { "epoch": 1.8195587911953681, "grad_norm": 0.39991647005081177, "learning_rate": 1.6048628163509803e-06, "loss": 0.0584, "num_input_tokens_seen": 50203936, "step": 74480 }, { "epoch": 1.8196809420272153, "grad_norm": 0.068689726293087, "learning_rate": 1.6047949050674422e-06, "loss": 0.042, "num_input_tokens_seen": 50207136, "step": 74485 }, { "epoch": 1.8198030928590625, "grad_norm": 38.983455657958984, "learning_rate": 1.6047269893856728e-06, "loss": 0.1156, "num_input_tokens_seen": 50210144, "step": 74490 }, { "epoch": 1.8199252436909097, "grad_norm": 79.4344482421875, "learning_rate": 1.604659069306166e-06, "loss": 0.1282, "num_input_tokens_seen": 50213792, "step": 74495 }, { "epoch": 1.8200473945227567, "grad_norm": 13.209699630737305, "learning_rate": 1.604591144829416e-06, "loss": 0.1632, "num_input_tokens_seen": 50217312, "step": 74500 }, { "epoch": 1.8201695453546038, "grad_norm": 8.681307792663574, "learning_rate": 1.6045232159559166e-06, "loss": 0.1738, "num_input_tokens_seen": 50220384, "step": 74505 }, { "epoch": 1.820291696186451, "grad_norm": 2.21926212310791, "learning_rate": 1.6044552826861613e-06, "loss": 0.0321, "num_input_tokens_seen": 50223840, "step": 74510 }, { "epoch": 1.8204138470182982, "grad_norm": 0.26441994309425354, "learning_rate": 1.604387345020645e-06, "loss": 0.0891, "num_input_tokens_seen": 50227232, "step": 74515 }, { "epoch": 1.8205359978501452, "grad_norm": 0.08761029690504074, "learning_rate": 1.6043194029598612e-06, "loss": 0.0439, "num_input_tokens_seen": 50230752, "step": 74520 }, { "epoch": 1.8206581486819924, "grad_norm": 12.762765884399414, "learning_rate": 1.6042514565043047e-06, "loss": 0.1376, "num_input_tokens_seen": 50234016, "step": 74525 }, { "epoch": 1.8207802995138396, "grad_norm": 2.54213285446167, "learning_rate": 1.6041835056544683e-06, "loss": 0.0032, "num_input_tokens_seen": 50237216, "step": 74530 }, { "epoch": 1.8209024503456868, "grad_norm": 0.26932165026664734, "learning_rate": 1.6041155504108477e-06, "loss": 0.0703, "num_input_tokens_seen": 50240992, "step": 74535 }, { "epoch": 1.821024601177534, "grad_norm": 0.16381070017814636, "learning_rate": 1.6040475907739356e-06, "loss": 0.104, "num_input_tokens_seen": 50244320, "step": 74540 }, { "epoch": 1.8211467520093811, "grad_norm": 10.372252464294434, "learning_rate": 1.6039796267442273e-06, "loss": 0.0845, "num_input_tokens_seen": 50247776, "step": 74545 }, { "epoch": 1.8212689028412283, "grad_norm": 2.9311392307281494, "learning_rate": 1.6039116583222168e-06, "loss": 0.1112, "num_input_tokens_seen": 50251616, "step": 74550 }, { "epoch": 1.8213910536730755, "grad_norm": 0.2916661500930786, "learning_rate": 1.603843685508398e-06, "loss": 0.0036, "num_input_tokens_seen": 50255264, "step": 74555 }, { "epoch": 1.8215132045049227, "grad_norm": 0.24182464182376862, "learning_rate": 1.603775708303266e-06, "loss": 0.0749, "num_input_tokens_seen": 50258592, "step": 74560 }, { "epoch": 1.82163535533677, "grad_norm": 0.12705932557582855, "learning_rate": 1.6037077267073143e-06, "loss": 0.0439, "num_input_tokens_seen": 50261472, "step": 74565 }, { "epoch": 1.821757506168617, "grad_norm": 0.017557255923748016, "learning_rate": 1.6036397407210376e-06, "loss": 0.0977, "num_input_tokens_seen": 50264992, "step": 74570 }, { "epoch": 1.8218796570004643, "grad_norm": 0.5035400390625, "learning_rate": 1.6035717503449302e-06, "loss": 0.0846, "num_input_tokens_seen": 50268576, "step": 74575 }, { "epoch": 1.8220018078323115, "grad_norm": 1.3602540493011475, "learning_rate": 1.603503755579487e-06, "loss": 0.0368, "num_input_tokens_seen": 50271904, "step": 74580 }, { "epoch": 1.8221239586641587, "grad_norm": 192.67697143554688, "learning_rate": 1.6034357564252021e-06, "loss": 0.303, "num_input_tokens_seen": 50275296, "step": 74585 }, { "epoch": 1.8222461094960056, "grad_norm": 3.1577250957489014, "learning_rate": 1.6033677528825699e-06, "loss": 0.0453, "num_input_tokens_seen": 50278624, "step": 74590 }, { "epoch": 1.8223682603278528, "grad_norm": 12.087418556213379, "learning_rate": 1.6032997449520855e-06, "loss": 0.0841, "num_input_tokens_seen": 50282016, "step": 74595 }, { "epoch": 1.8224904111597, "grad_norm": 119.16487121582031, "learning_rate": 1.6032317326342427e-06, "loss": 0.0473, "num_input_tokens_seen": 50285152, "step": 74600 }, { "epoch": 1.8226125619915472, "grad_norm": 0.6227423548698425, "learning_rate": 1.6031637159295366e-06, "loss": 0.0353, "num_input_tokens_seen": 50288672, "step": 74605 }, { "epoch": 1.8227347128233942, "grad_norm": 0.19224794209003448, "learning_rate": 1.6030956948384618e-06, "loss": 0.0296, "num_input_tokens_seen": 50291744, "step": 74610 }, { "epoch": 1.8228568636552414, "grad_norm": 12.381990432739258, "learning_rate": 1.6030276693615129e-06, "loss": 0.221, "num_input_tokens_seen": 50295136, "step": 74615 }, { "epoch": 1.8229790144870885, "grad_norm": 14.696539878845215, "learning_rate": 1.6029596394991844e-06, "loss": 0.1465, "num_input_tokens_seen": 50298656, "step": 74620 }, { "epoch": 1.8231011653189357, "grad_norm": 27.2199764251709, "learning_rate": 1.6028916052519714e-06, "loss": 0.1616, "num_input_tokens_seen": 50302112, "step": 74625 }, { "epoch": 1.823223316150783, "grad_norm": 292.4683837890625, "learning_rate": 1.6028235666203687e-06, "loss": 0.1432, "num_input_tokens_seen": 50305696, "step": 74630 }, { "epoch": 1.8233454669826301, "grad_norm": 0.1818641871213913, "learning_rate": 1.6027555236048705e-06, "loss": 0.0013, "num_input_tokens_seen": 50309472, "step": 74635 }, { "epoch": 1.8234676178144773, "grad_norm": 2.184302806854248, "learning_rate": 1.6026874762059722e-06, "loss": 0.0759, "num_input_tokens_seen": 50312672, "step": 74640 }, { "epoch": 1.8235897686463245, "grad_norm": 11.973766326904297, "learning_rate": 1.6026194244241683e-06, "loss": 0.1408, "num_input_tokens_seen": 50315744, "step": 74645 }, { "epoch": 1.8237119194781717, "grad_norm": 6.665470600128174, "learning_rate": 1.602551368259954e-06, "loss": 0.1048, "num_input_tokens_seen": 50319392, "step": 74650 }, { "epoch": 1.8238340703100189, "grad_norm": 0.46375200152397156, "learning_rate": 1.602483307713824e-06, "loss": 0.1153, "num_input_tokens_seen": 50322528, "step": 74655 }, { "epoch": 1.823956221141866, "grad_norm": 7.4403533935546875, "learning_rate": 1.6024152427862733e-06, "loss": 0.1064, "num_input_tokens_seen": 50325920, "step": 74660 }, { "epoch": 1.8240783719737133, "grad_norm": 0.5627632141113281, "learning_rate": 1.6023471734777971e-06, "loss": 0.0453, "num_input_tokens_seen": 50328992, "step": 74665 }, { "epoch": 1.8242005228055604, "grad_norm": 1.347942590713501, "learning_rate": 1.6022790997888903e-06, "loss": 0.0076, "num_input_tokens_seen": 50332320, "step": 74670 }, { "epoch": 1.8243226736374076, "grad_norm": 0.6452302932739258, "learning_rate": 1.6022110217200478e-06, "loss": 0.0306, "num_input_tokens_seen": 50336096, "step": 74675 }, { "epoch": 1.8244448244692546, "grad_norm": 8.13100814819336, "learning_rate": 1.6021429392717645e-06, "loss": 0.1109, "num_input_tokens_seen": 50339104, "step": 74680 }, { "epoch": 1.8245669753011018, "grad_norm": 0.6359624266624451, "learning_rate": 1.6020748524445361e-06, "loss": 0.0014, "num_input_tokens_seen": 50342432, "step": 74685 }, { "epoch": 1.824689126132949, "grad_norm": 0.28832563757896423, "learning_rate": 1.6020067612388575e-06, "loss": 0.0579, "num_input_tokens_seen": 50345952, "step": 74690 }, { "epoch": 1.8248112769647962, "grad_norm": 0.3576095402240753, "learning_rate": 1.6019386656552234e-06, "loss": 0.1848, "num_input_tokens_seen": 50349088, "step": 74695 }, { "epoch": 1.8249334277966431, "grad_norm": 0.17821946740150452, "learning_rate": 1.6018705656941299e-06, "loss": 0.0851, "num_input_tokens_seen": 50352352, "step": 74700 }, { "epoch": 1.8250555786284903, "grad_norm": 1.7348898649215698, "learning_rate": 1.6018024613560717e-06, "loss": 0.0545, "num_input_tokens_seen": 50356128, "step": 74705 }, { "epoch": 1.8251777294603375, "grad_norm": 2.7360644340515137, "learning_rate": 1.601734352641544e-06, "loss": 0.0043, "num_input_tokens_seen": 50359904, "step": 74710 }, { "epoch": 1.8252998802921847, "grad_norm": 0.2862713634967804, "learning_rate": 1.6016662395510422e-06, "loss": 0.1019, "num_input_tokens_seen": 50363872, "step": 74715 }, { "epoch": 1.825422031124032, "grad_norm": 9.152181625366211, "learning_rate": 1.6015981220850616e-06, "loss": 0.0392, "num_input_tokens_seen": 50367200, "step": 74720 }, { "epoch": 1.825544181955879, "grad_norm": 40.95872116088867, "learning_rate": 1.601530000244098e-06, "loss": 0.166, "num_input_tokens_seen": 50370912, "step": 74725 }, { "epoch": 1.8256663327877263, "grad_norm": 4.845951557159424, "learning_rate": 1.6014618740286458e-06, "loss": 0.0225, "num_input_tokens_seen": 50374240, "step": 74730 }, { "epoch": 1.8257884836195735, "grad_norm": 0.17217379808425903, "learning_rate": 1.6013937434392015e-06, "loss": 0.1129, "num_input_tokens_seen": 50377440, "step": 74735 }, { "epoch": 1.8259106344514207, "grad_norm": 3.039346218109131, "learning_rate": 1.6013256084762603e-06, "loss": 0.0816, "num_input_tokens_seen": 50380704, "step": 74740 }, { "epoch": 1.8260327852832678, "grad_norm": 0.3498546779155731, "learning_rate": 1.6012574691403174e-06, "loss": 0.0287, "num_input_tokens_seen": 50383776, "step": 74745 }, { "epoch": 1.826154936115115, "grad_norm": 0.5262460708618164, "learning_rate": 1.6011893254318682e-06, "loss": 0.0916, "num_input_tokens_seen": 50386848, "step": 74750 }, { "epoch": 1.8262770869469622, "grad_norm": 0.03679165989160538, "learning_rate": 1.601121177351409e-06, "loss": 0.0883, "num_input_tokens_seen": 50390112, "step": 74755 }, { "epoch": 1.8263992377788094, "grad_norm": 123.02760314941406, "learning_rate": 1.6010530248994345e-06, "loss": 0.0841, "num_input_tokens_seen": 50393504, "step": 74760 }, { "epoch": 1.8265213886106566, "grad_norm": 0.201580211520195, "learning_rate": 1.6009848680764409e-06, "loss": 0.0606, "num_input_tokens_seen": 50396576, "step": 74765 }, { "epoch": 1.8266435394425036, "grad_norm": 230.34939575195312, "learning_rate": 1.6009167068829239e-06, "loss": 0.0721, "num_input_tokens_seen": 50400032, "step": 74770 }, { "epoch": 1.8267656902743508, "grad_norm": 11.82022762298584, "learning_rate": 1.6008485413193786e-06, "loss": 0.1413, "num_input_tokens_seen": 50403360, "step": 74775 }, { "epoch": 1.826887841106198, "grad_norm": 0.478629469871521, "learning_rate": 1.6007803713863015e-06, "loss": 0.0292, "num_input_tokens_seen": 50406560, "step": 74780 }, { "epoch": 1.8270099919380451, "grad_norm": 0.9284724593162537, "learning_rate": 1.6007121970841877e-06, "loss": 0.0033, "num_input_tokens_seen": 50410464, "step": 74785 }, { "epoch": 1.827132142769892, "grad_norm": 20.248388290405273, "learning_rate": 1.6006440184135333e-06, "loss": 0.1076, "num_input_tokens_seen": 50413472, "step": 74790 }, { "epoch": 1.8272542936017393, "grad_norm": 1.305576205253601, "learning_rate": 1.6005758353748338e-06, "loss": 0.0026, "num_input_tokens_seen": 50416544, "step": 74795 }, { "epoch": 1.8273764444335865, "grad_norm": 9.549858093261719, "learning_rate": 1.6005076479685854e-06, "loss": 0.0919, "num_input_tokens_seen": 50419680, "step": 74800 }, { "epoch": 1.8274985952654337, "grad_norm": 0.11238489300012589, "learning_rate": 1.600439456195284e-06, "loss": 0.0169, "num_input_tokens_seen": 50423072, "step": 74805 }, { "epoch": 1.8276207460972809, "grad_norm": 22.02189064025879, "learning_rate": 1.6003712600554255e-06, "loss": 0.1338, "num_input_tokens_seen": 50426016, "step": 74810 }, { "epoch": 1.827742896929128, "grad_norm": 0.10497987270355225, "learning_rate": 1.6003030595495056e-06, "loss": 0.0809, "num_input_tokens_seen": 50429472, "step": 74815 }, { "epoch": 1.8278650477609752, "grad_norm": 21.62155532836914, "learning_rate": 1.6002348546780202e-06, "loss": 0.1086, "num_input_tokens_seen": 50432736, "step": 74820 }, { "epoch": 1.8279871985928224, "grad_norm": 53.237098693847656, "learning_rate": 1.6001666454414657e-06, "loss": 0.3054, "num_input_tokens_seen": 50436320, "step": 74825 }, { "epoch": 1.8281093494246696, "grad_norm": 0.15742164850234985, "learning_rate": 1.6000984318403376e-06, "loss": 0.0274, "num_input_tokens_seen": 50439776, "step": 74830 }, { "epoch": 1.8282315002565168, "grad_norm": 18.078733444213867, "learning_rate": 1.6000302138751328e-06, "loss": 0.1356, "num_input_tokens_seen": 50443168, "step": 74835 }, { "epoch": 1.828353651088364, "grad_norm": 4.11641263961792, "learning_rate": 1.5999619915463466e-06, "loss": 0.0999, "num_input_tokens_seen": 50446752, "step": 74840 }, { "epoch": 1.8284758019202112, "grad_norm": 38.77667999267578, "learning_rate": 1.5998937648544756e-06, "loss": 0.0559, "num_input_tokens_seen": 50450144, "step": 74845 }, { "epoch": 1.8285979527520584, "grad_norm": 0.16692113876342773, "learning_rate": 1.5998255338000157e-06, "loss": 0.0009, "num_input_tokens_seen": 50453280, "step": 74850 }, { "epoch": 1.8287201035839054, "grad_norm": 0.014067198149859905, "learning_rate": 1.599757298383463e-06, "loss": 0.053, "num_input_tokens_seen": 50456416, "step": 74855 }, { "epoch": 1.8288422544157525, "grad_norm": 0.4518554210662842, "learning_rate": 1.599689058605314e-06, "loss": 0.0388, "num_input_tokens_seen": 50459488, "step": 74860 }, { "epoch": 1.8289644052475997, "grad_norm": 32.26401138305664, "learning_rate": 1.599620814466065e-06, "loss": 0.075, "num_input_tokens_seen": 50463776, "step": 74865 }, { "epoch": 1.829086556079447, "grad_norm": 0.04703347757458687, "learning_rate": 1.599552565966212e-06, "loss": 0.071, "num_input_tokens_seen": 50467040, "step": 74870 }, { "epoch": 1.8292087069112941, "grad_norm": 0.25102993845939636, "learning_rate": 1.5994843131062519e-06, "loss": 0.0455, "num_input_tokens_seen": 50471200, "step": 74875 }, { "epoch": 1.829330857743141, "grad_norm": 11.341893196105957, "learning_rate": 1.5994160558866802e-06, "loss": 0.0644, "num_input_tokens_seen": 50474272, "step": 74880 }, { "epoch": 1.8294530085749883, "grad_norm": 19.64974594116211, "learning_rate": 1.5993477943079937e-06, "loss": 0.1855, "num_input_tokens_seen": 50477792, "step": 74885 }, { "epoch": 1.8295751594068355, "grad_norm": 0.3654881715774536, "learning_rate": 1.599279528370689e-06, "loss": 0.0278, "num_input_tokens_seen": 50481248, "step": 74890 }, { "epoch": 1.8296973102386827, "grad_norm": 0.16216430068016052, "learning_rate": 1.5992112580752623e-06, "loss": 0.0006, "num_input_tokens_seen": 50484896, "step": 74895 }, { "epoch": 1.8298194610705298, "grad_norm": 19.21926498413086, "learning_rate": 1.5991429834222104e-06, "loss": 0.1846, "num_input_tokens_seen": 50488160, "step": 74900 }, { "epoch": 1.829941611902377, "grad_norm": 10.870716094970703, "learning_rate": 1.5990747044120294e-06, "loss": 0.005, "num_input_tokens_seen": 50491872, "step": 74905 }, { "epoch": 1.8300637627342242, "grad_norm": 0.14991120994091034, "learning_rate": 1.5990064210452158e-06, "loss": 0.1366, "num_input_tokens_seen": 50495392, "step": 74910 }, { "epoch": 1.8301859135660714, "grad_norm": 0.1929914802312851, "learning_rate": 1.5989381333222664e-06, "loss": 0.0999, "num_input_tokens_seen": 50498848, "step": 74915 }, { "epoch": 1.8303080643979186, "grad_norm": 18.372793197631836, "learning_rate": 1.5988698412436783e-06, "loss": 0.034, "num_input_tokens_seen": 50502368, "step": 74920 }, { "epoch": 1.8304302152297658, "grad_norm": 13.448803901672363, "learning_rate": 1.5988015448099472e-06, "loss": 0.0404, "num_input_tokens_seen": 50505760, "step": 74925 }, { "epoch": 1.830552366061613, "grad_norm": 0.15691198408603668, "learning_rate": 1.5987332440215705e-06, "loss": 0.0499, "num_input_tokens_seen": 50509280, "step": 74930 }, { "epoch": 1.8306745168934602, "grad_norm": 21.244211196899414, "learning_rate": 1.5986649388790443e-06, "loss": 0.1397, "num_input_tokens_seen": 50512416, "step": 74935 }, { "epoch": 1.8307966677253074, "grad_norm": 329.8246765136719, "learning_rate": 1.5985966293828659e-06, "loss": 0.0294, "num_input_tokens_seen": 50516064, "step": 74940 }, { "epoch": 1.8309188185571543, "grad_norm": 0.13184209167957306, "learning_rate": 1.5985283155335316e-06, "loss": 0.0787, "num_input_tokens_seen": 50519456, "step": 74945 }, { "epoch": 1.8310409693890015, "grad_norm": 13.898524284362793, "learning_rate": 1.5984599973315385e-06, "loss": 0.1047, "num_input_tokens_seen": 50522592, "step": 74950 }, { "epoch": 1.8311631202208487, "grad_norm": 0.10374942421913147, "learning_rate": 1.5983916747773834e-06, "loss": 0.0326, "num_input_tokens_seen": 50526368, "step": 74955 }, { "epoch": 1.831285271052696, "grad_norm": 0.08185227960348129, "learning_rate": 1.598323347871563e-06, "loss": 0.0006, "num_input_tokens_seen": 50529696, "step": 74960 }, { "epoch": 1.831407421884543, "grad_norm": 13.482980728149414, "learning_rate": 1.5982550166145744e-06, "loss": 0.0736, "num_input_tokens_seen": 50532960, "step": 74965 }, { "epoch": 1.83152957271639, "grad_norm": 0.312678724527359, "learning_rate": 1.5981866810069142e-06, "loss": 0.082, "num_input_tokens_seen": 50536416, "step": 74970 }, { "epoch": 1.8316517235482372, "grad_norm": 0.11006156355142593, "learning_rate": 1.5981183410490796e-06, "loss": 0.0615, "num_input_tokens_seen": 50540000, "step": 74975 }, { "epoch": 1.8317738743800844, "grad_norm": 0.10871271789073944, "learning_rate": 1.5980499967415677e-06, "loss": 0.1137, "num_input_tokens_seen": 50543776, "step": 74980 }, { "epoch": 1.8318960252119316, "grad_norm": 0.22636280953884125, "learning_rate": 1.5979816480848754e-06, "loss": 0.0009, "num_input_tokens_seen": 50546976, "step": 74985 }, { "epoch": 1.8320181760437788, "grad_norm": 12.59542465209961, "learning_rate": 1.5979132950794996e-06, "loss": 0.1536, "num_input_tokens_seen": 50551264, "step": 74990 }, { "epoch": 1.832140326875626, "grad_norm": 17.66059684753418, "learning_rate": 1.5978449377259376e-06, "loss": 0.1114, "num_input_tokens_seen": 50555104, "step": 74995 }, { "epoch": 1.8322624777074732, "grad_norm": 4.7083964347839355, "learning_rate": 1.5977765760246863e-06, "loss": 0.1184, "num_input_tokens_seen": 50558560, "step": 75000 }, { "epoch": 1.8323846285393204, "grad_norm": 0.0550217367708683, "learning_rate": 1.597708209976243e-06, "loss": 0.0198, "num_input_tokens_seen": 50561760, "step": 75005 }, { "epoch": 1.8325067793711676, "grad_norm": 165.42160034179688, "learning_rate": 1.5976398395811046e-06, "loss": 0.0595, "num_input_tokens_seen": 50564960, "step": 75010 }, { "epoch": 1.8326289302030148, "grad_norm": 0.008861426264047623, "learning_rate": 1.5975714648397686e-06, "loss": 0.0956, "num_input_tokens_seen": 50568160, "step": 75015 }, { "epoch": 1.832751081034862, "grad_norm": 0.027347374707460403, "learning_rate": 1.5975030857527326e-06, "loss": 0.0812, "num_input_tokens_seen": 50571040, "step": 75020 }, { "epoch": 1.8328732318667091, "grad_norm": 16.737857818603516, "learning_rate": 1.5974347023204932e-06, "loss": 0.1012, "num_input_tokens_seen": 50574176, "step": 75025 }, { "epoch": 1.8329953826985563, "grad_norm": 0.5741835832595825, "learning_rate": 1.5973663145435482e-06, "loss": 0.0047, "num_input_tokens_seen": 50577120, "step": 75030 }, { "epoch": 1.8331175335304033, "grad_norm": 1.5658107995986938, "learning_rate": 1.5972979224223942e-06, "loss": 0.1798, "num_input_tokens_seen": 50580640, "step": 75035 }, { "epoch": 1.8332396843622505, "grad_norm": 0.02867000922560692, "learning_rate": 1.597229525957529e-06, "loss": 0.0024, "num_input_tokens_seen": 50584352, "step": 75040 }, { "epoch": 1.8333618351940977, "grad_norm": 0.545559287071228, "learning_rate": 1.5971611251494505e-06, "loss": 0.0988, "num_input_tokens_seen": 50587360, "step": 75045 }, { "epoch": 1.8334839860259449, "grad_norm": 0.14214016497135162, "learning_rate": 1.5970927199986557e-06, "loss": 0.1133, "num_input_tokens_seen": 50590688, "step": 75050 }, { "epoch": 1.8336061368577918, "grad_norm": 19.883060455322266, "learning_rate": 1.5970243105056418e-06, "loss": 0.1702, "num_input_tokens_seen": 50593632, "step": 75055 }, { "epoch": 1.833728287689639, "grad_norm": 36.7739372253418, "learning_rate": 1.5969558966709066e-06, "loss": 0.0866, "num_input_tokens_seen": 50596896, "step": 75060 }, { "epoch": 1.8338504385214862, "grad_norm": 16.683443069458008, "learning_rate": 1.5968874784949476e-06, "loss": 0.1055, "num_input_tokens_seen": 50599968, "step": 75065 }, { "epoch": 1.8339725893533334, "grad_norm": 15.718001365661621, "learning_rate": 1.5968190559782622e-06, "loss": 0.0348, "num_input_tokens_seen": 50603104, "step": 75070 }, { "epoch": 1.8340947401851806, "grad_norm": 10.17602252960205, "learning_rate": 1.5967506291213481e-06, "loss": 0.1494, "num_input_tokens_seen": 50606496, "step": 75075 }, { "epoch": 1.8342168910170278, "grad_norm": 0.4043470621109009, "learning_rate": 1.5966821979247031e-06, "loss": 0.0829, "num_input_tokens_seen": 50610016, "step": 75080 }, { "epoch": 1.834339041848875, "grad_norm": 0.36872056126594543, "learning_rate": 1.5966137623888246e-06, "loss": 0.072, "num_input_tokens_seen": 50612960, "step": 75085 }, { "epoch": 1.8344611926807222, "grad_norm": 0.2587651014328003, "learning_rate": 1.5965453225142102e-06, "loss": 0.0437, "num_input_tokens_seen": 50616672, "step": 75090 }, { "epoch": 1.8345833435125694, "grad_norm": 0.0520302951335907, "learning_rate": 1.5964768783013579e-06, "loss": 0.0035, "num_input_tokens_seen": 50619872, "step": 75095 }, { "epoch": 1.8347054943444165, "grad_norm": 31.207691192626953, "learning_rate": 1.5964084297507652e-06, "loss": 0.1989, "num_input_tokens_seen": 50623584, "step": 75100 }, { "epoch": 1.8348276451762637, "grad_norm": 16.3104305267334, "learning_rate": 1.5963399768629299e-06, "loss": 0.0274, "num_input_tokens_seen": 50626592, "step": 75105 }, { "epoch": 1.834949796008111, "grad_norm": 0.9252538681030273, "learning_rate": 1.5962715196383503e-06, "loss": 0.0848, "num_input_tokens_seen": 50630304, "step": 75110 }, { "epoch": 1.8350719468399581, "grad_norm": 0.3111483156681061, "learning_rate": 1.5962030580775236e-06, "loss": 0.0054, "num_input_tokens_seen": 50633696, "step": 75115 }, { "epoch": 1.8351940976718053, "grad_norm": 0.1261308789253235, "learning_rate": 1.596134592180948e-06, "loss": 0.1591, "num_input_tokens_seen": 50637024, "step": 75120 }, { "epoch": 1.8353162485036523, "grad_norm": 0.15849518775939941, "learning_rate": 1.5960661219491208e-06, "loss": 0.0389, "num_input_tokens_seen": 50640416, "step": 75125 }, { "epoch": 1.8354383993354995, "grad_norm": 0.5551608800888062, "learning_rate": 1.595997647382541e-06, "loss": 0.1209, "num_input_tokens_seen": 50643616, "step": 75130 }, { "epoch": 1.8355605501673466, "grad_norm": 0.15090814232826233, "learning_rate": 1.5959291684817057e-06, "loss": 0.1069, "num_input_tokens_seen": 50646752, "step": 75135 }, { "epoch": 1.8356827009991938, "grad_norm": 0.0902179405093193, "learning_rate": 1.5958606852471132e-06, "loss": 0.0722, "num_input_tokens_seen": 50650272, "step": 75140 }, { "epoch": 1.8358048518310408, "grad_norm": 259.31622314453125, "learning_rate": 1.595792197679262e-06, "loss": 0.0127, "num_input_tokens_seen": 50654176, "step": 75145 }, { "epoch": 1.835927002662888, "grad_norm": 0.12596093118190765, "learning_rate": 1.5957237057786492e-06, "loss": 0.0315, "num_input_tokens_seen": 50657440, "step": 75150 }, { "epoch": 1.8360491534947352, "grad_norm": 0.2953812777996063, "learning_rate": 1.595655209545774e-06, "loss": 0.0672, "num_input_tokens_seen": 50660640, "step": 75155 }, { "epoch": 1.8361713043265824, "grad_norm": 10.196359634399414, "learning_rate": 1.5955867089811332e-06, "loss": 0.089, "num_input_tokens_seen": 50664288, "step": 75160 }, { "epoch": 1.8362934551584296, "grad_norm": 0.03514920175075531, "learning_rate": 1.5955182040852257e-06, "loss": 0.0342, "num_input_tokens_seen": 50668000, "step": 75165 }, { "epoch": 1.8364156059902768, "grad_norm": 71.40430450439453, "learning_rate": 1.59544969485855e-06, "loss": 0.0254, "num_input_tokens_seen": 50671328, "step": 75170 }, { "epoch": 1.836537756822124, "grad_norm": 0.9081661105155945, "learning_rate": 1.5953811813016037e-06, "loss": 0.003, "num_input_tokens_seen": 50674720, "step": 75175 }, { "epoch": 1.8366599076539711, "grad_norm": 0.2207932323217392, "learning_rate": 1.5953126634148855e-06, "loss": 0.1863, "num_input_tokens_seen": 50678112, "step": 75180 }, { "epoch": 1.8367820584858183, "grad_norm": 41.23016357421875, "learning_rate": 1.5952441411988934e-06, "loss": 0.1438, "num_input_tokens_seen": 50681440, "step": 75185 }, { "epoch": 1.8369042093176655, "grad_norm": 21.08063316345215, "learning_rate": 1.5951756146541257e-06, "loss": 0.1104, "num_input_tokens_seen": 50684704, "step": 75190 }, { "epoch": 1.8370263601495127, "grad_norm": 162.88192749023438, "learning_rate": 1.5951070837810808e-06, "loss": 0.089, "num_input_tokens_seen": 50688736, "step": 75195 }, { "epoch": 1.83714851098136, "grad_norm": 0.09662999957799911, "learning_rate": 1.5950385485802574e-06, "loss": 0.0431, "num_input_tokens_seen": 50691808, "step": 75200 }, { "epoch": 1.837270661813207, "grad_norm": 0.15366408228874207, "learning_rate": 1.5949700090521536e-06, "loss": 0.0347, "num_input_tokens_seen": 50695136, "step": 75205 }, { "epoch": 1.8373928126450543, "grad_norm": 16.026094436645508, "learning_rate": 1.594901465197268e-06, "loss": 0.1796, "num_input_tokens_seen": 50698848, "step": 75210 }, { "epoch": 1.8375149634769012, "grad_norm": 33.631309509277344, "learning_rate": 1.5948329170160983e-06, "loss": 0.0567, "num_input_tokens_seen": 50701984, "step": 75215 }, { "epoch": 1.8376371143087484, "grad_norm": 0.02875753492116928, "learning_rate": 1.5947643645091442e-06, "loss": 0.2215, "num_input_tokens_seen": 50705440, "step": 75220 }, { "epoch": 1.8377592651405956, "grad_norm": 42.001739501953125, "learning_rate": 1.5946958076769035e-06, "loss": 0.1647, "num_input_tokens_seen": 50709472, "step": 75225 }, { "epoch": 1.8378814159724428, "grad_norm": 0.2549021244049072, "learning_rate": 1.5946272465198748e-06, "loss": 0.0869, "num_input_tokens_seen": 50712608, "step": 75230 }, { "epoch": 1.8380035668042898, "grad_norm": 0.1781689077615738, "learning_rate": 1.5945586810385572e-06, "loss": 0.1903, "num_input_tokens_seen": 50716000, "step": 75235 }, { "epoch": 1.838125717636137, "grad_norm": 21.01819610595703, "learning_rate": 1.5944901112334486e-06, "loss": 0.2179, "num_input_tokens_seen": 50719392, "step": 75240 }, { "epoch": 1.8382478684679842, "grad_norm": 0.1993514448404312, "learning_rate": 1.5944215371050482e-06, "loss": 0.0641, "num_input_tokens_seen": 50723232, "step": 75245 }, { "epoch": 1.8383700192998313, "grad_norm": 91.43272399902344, "learning_rate": 1.5943529586538543e-06, "loss": 0.1439, "num_input_tokens_seen": 50726560, "step": 75250 }, { "epoch": 1.8384921701316785, "grad_norm": 30.449052810668945, "learning_rate": 1.594284375880366e-06, "loss": 0.0482, "num_input_tokens_seen": 50729824, "step": 75255 }, { "epoch": 1.8386143209635257, "grad_norm": 1.8023955821990967, "learning_rate": 1.5942157887850818e-06, "loss": 0.0023, "num_input_tokens_seen": 50733024, "step": 75260 }, { "epoch": 1.838736471795373, "grad_norm": 10.874223709106445, "learning_rate": 1.5941471973685007e-06, "loss": 0.0956, "num_input_tokens_seen": 50736544, "step": 75265 }, { "epoch": 1.83885862262722, "grad_norm": 20.578174591064453, "learning_rate": 1.5940786016311214e-06, "loss": 0.0363, "num_input_tokens_seen": 50740064, "step": 75270 }, { "epoch": 1.8389807734590673, "grad_norm": 14.255890846252441, "learning_rate": 1.5940100015734426e-06, "loss": 0.0902, "num_input_tokens_seen": 50743136, "step": 75275 }, { "epoch": 1.8391029242909145, "grad_norm": 22.841188430786133, "learning_rate": 1.5939413971959632e-06, "loss": 0.0455, "num_input_tokens_seen": 50746400, "step": 75280 }, { "epoch": 1.8392250751227617, "grad_norm": 21.676034927368164, "learning_rate": 1.5938727884991824e-06, "loss": 0.1632, "num_input_tokens_seen": 50750048, "step": 75285 }, { "epoch": 1.8393472259546089, "grad_norm": 31.792327880859375, "learning_rate": 1.5938041754835987e-06, "loss": 0.1416, "num_input_tokens_seen": 50753440, "step": 75290 }, { "epoch": 1.839469376786456, "grad_norm": 1.4194421768188477, "learning_rate": 1.5937355581497115e-06, "loss": 0.0531, "num_input_tokens_seen": 50756704, "step": 75295 }, { "epoch": 1.8395915276183032, "grad_norm": 41.14981460571289, "learning_rate": 1.5936669364980198e-06, "loss": 0.1068, "num_input_tokens_seen": 50759904, "step": 75300 }, { "epoch": 1.8397136784501502, "grad_norm": 20.266094207763672, "learning_rate": 1.5935983105290221e-06, "loss": 0.1172, "num_input_tokens_seen": 50763040, "step": 75305 }, { "epoch": 1.8398358292819974, "grad_norm": 0.4391668140888214, "learning_rate": 1.593529680243218e-06, "loss": 0.0771, "num_input_tokens_seen": 50766432, "step": 75310 }, { "epoch": 1.8399579801138446, "grad_norm": 45.64616775512695, "learning_rate": 1.5934610456411064e-06, "loss": 0.0495, "num_input_tokens_seen": 50770016, "step": 75315 }, { "epoch": 1.8400801309456918, "grad_norm": 14.22166633605957, "learning_rate": 1.5933924067231864e-06, "loss": 0.1261, "num_input_tokens_seen": 50773664, "step": 75320 }, { "epoch": 1.8402022817775388, "grad_norm": 26.305788040161133, "learning_rate": 1.5933237634899573e-06, "loss": 0.1299, "num_input_tokens_seen": 50777120, "step": 75325 }, { "epoch": 1.840324432609386, "grad_norm": 0.10604507476091385, "learning_rate": 1.5932551159419184e-06, "loss": 0.0197, "num_input_tokens_seen": 50780384, "step": 75330 }, { "epoch": 1.8404465834412331, "grad_norm": 0.2902368903160095, "learning_rate": 1.5931864640795684e-06, "loss": 0.0031, "num_input_tokens_seen": 50783328, "step": 75335 }, { "epoch": 1.8405687342730803, "grad_norm": 0.4008534550666809, "learning_rate": 1.5931178079034072e-06, "loss": 0.0049, "num_input_tokens_seen": 50786336, "step": 75340 }, { "epoch": 1.8406908851049275, "grad_norm": 1.7231696844100952, "learning_rate": 1.5930491474139337e-06, "loss": 0.0367, "num_input_tokens_seen": 50789472, "step": 75345 }, { "epoch": 1.8408130359367747, "grad_norm": 0.048120759427547455, "learning_rate": 1.592980482611647e-06, "loss": 0.0835, "num_input_tokens_seen": 50792672, "step": 75350 }, { "epoch": 1.840935186768622, "grad_norm": 0.7223793268203735, "learning_rate": 1.5929118134970468e-06, "loss": 0.002, "num_input_tokens_seen": 50795808, "step": 75355 }, { "epoch": 1.841057337600469, "grad_norm": 5.6044602394104, "learning_rate": 1.5928431400706326e-06, "loss": 0.1351, "num_input_tokens_seen": 50799392, "step": 75360 }, { "epoch": 1.8411794884323163, "grad_norm": 11.443306922912598, "learning_rate": 1.5927744623329034e-06, "loss": 0.0389, "num_input_tokens_seen": 50802592, "step": 75365 }, { "epoch": 1.8413016392641635, "grad_norm": 74.66730499267578, "learning_rate": 1.5927057802843591e-06, "loss": 0.0911, "num_input_tokens_seen": 50806432, "step": 75370 }, { "epoch": 1.8414237900960106, "grad_norm": 0.2384769469499588, "learning_rate": 1.5926370939254987e-06, "loss": 0.236, "num_input_tokens_seen": 50809824, "step": 75375 }, { "epoch": 1.8415459409278578, "grad_norm": 1.9501762390136719, "learning_rate": 1.5925684032568221e-06, "loss": 0.002, "num_input_tokens_seen": 50813408, "step": 75380 }, { "epoch": 1.841668091759705, "grad_norm": 199.66368103027344, "learning_rate": 1.592499708278829e-06, "loss": 0.1323, "num_input_tokens_seen": 50816544, "step": 75385 }, { "epoch": 1.841790242591552, "grad_norm": 27.194421768188477, "learning_rate": 1.5924310089920181e-06, "loss": 0.0376, "num_input_tokens_seen": 50819808, "step": 75390 }, { "epoch": 1.8419123934233992, "grad_norm": 2.1573281288146973, "learning_rate": 1.59236230539689e-06, "loss": 0.1633, "num_input_tokens_seen": 50823264, "step": 75395 }, { "epoch": 1.8420345442552464, "grad_norm": 0.855754017829895, "learning_rate": 1.5922935974939438e-06, "loss": 0.0342, "num_input_tokens_seen": 50826592, "step": 75400 }, { "epoch": 1.8421566950870936, "grad_norm": 143.57122802734375, "learning_rate": 1.592224885283679e-06, "loss": 0.0632, "num_input_tokens_seen": 50829856, "step": 75405 }, { "epoch": 1.8422788459189408, "grad_norm": 14.12177848815918, "learning_rate": 1.592156168766596e-06, "loss": 0.1241, "num_input_tokens_seen": 50833056, "step": 75410 }, { "epoch": 1.8424009967507877, "grad_norm": 0.26234906911849976, "learning_rate": 1.5920874479431935e-06, "loss": 0.03, "num_input_tokens_seen": 50836256, "step": 75415 }, { "epoch": 1.842523147582635, "grad_norm": 17.115942001342773, "learning_rate": 1.592018722813972e-06, "loss": 0.2169, "num_input_tokens_seen": 50840096, "step": 75420 }, { "epoch": 1.842645298414482, "grad_norm": 10.910876274108887, "learning_rate": 1.5919499933794313e-06, "loss": 0.1498, "num_input_tokens_seen": 50843232, "step": 75425 }, { "epoch": 1.8427674492463293, "grad_norm": 20.35202980041504, "learning_rate": 1.591881259640071e-06, "loss": 0.0988, "num_input_tokens_seen": 50846752, "step": 75430 }, { "epoch": 1.8428896000781765, "grad_norm": 0.1045135036110878, "learning_rate": 1.591812521596391e-06, "loss": 0.0021, "num_input_tokens_seen": 50850208, "step": 75435 }, { "epoch": 1.8430117509100237, "grad_norm": 0.29922038316726685, "learning_rate": 1.5917437792488913e-06, "loss": 0.0425, "num_input_tokens_seen": 50853664, "step": 75440 }, { "epoch": 1.8431339017418709, "grad_norm": 0.24386277794837952, "learning_rate": 1.5916750325980713e-06, "loss": 0.1005, "num_input_tokens_seen": 50857120, "step": 75445 }, { "epoch": 1.843256052573718, "grad_norm": 0.11934429407119751, "learning_rate": 1.5916062816444313e-06, "loss": 0.0014, "num_input_tokens_seen": 50860256, "step": 75450 }, { "epoch": 1.8433782034055652, "grad_norm": 47.07133865356445, "learning_rate": 1.5915375263884716e-06, "loss": 0.0911, "num_input_tokens_seen": 50863712, "step": 75455 }, { "epoch": 1.8435003542374124, "grad_norm": 46.206600189208984, "learning_rate": 1.591468766830692e-06, "loss": 0.0649, "num_input_tokens_seen": 50867488, "step": 75460 }, { "epoch": 1.8436225050692596, "grad_norm": 0.06218447536230087, "learning_rate": 1.5914000029715922e-06, "loss": 0.0338, "num_input_tokens_seen": 50870816, "step": 75465 }, { "epoch": 1.8437446559011068, "grad_norm": 0.04517967998981476, "learning_rate": 1.5913312348116726e-06, "loss": 0.0766, "num_input_tokens_seen": 50874144, "step": 75470 }, { "epoch": 1.843866806732954, "grad_norm": 0.13802210986614227, "learning_rate": 1.591262462351433e-06, "loss": 0.0731, "num_input_tokens_seen": 50877664, "step": 75475 }, { "epoch": 1.843988957564801, "grad_norm": 0.2226584255695343, "learning_rate": 1.5911936855913738e-06, "loss": 0.0005, "num_input_tokens_seen": 50880992, "step": 75480 }, { "epoch": 1.8441111083966482, "grad_norm": 38.50403594970703, "learning_rate": 1.5911249045319954e-06, "loss": 0.1547, "num_input_tokens_seen": 50883936, "step": 75485 }, { "epoch": 1.8442332592284953, "grad_norm": 0.5120588541030884, "learning_rate": 1.5910561191737975e-06, "loss": 0.001, "num_input_tokens_seen": 50887776, "step": 75490 }, { "epoch": 1.8443554100603425, "grad_norm": 6.489198207855225, "learning_rate": 1.5909873295172807e-06, "loss": 0.0598, "num_input_tokens_seen": 50891424, "step": 75495 }, { "epoch": 1.8444775608921897, "grad_norm": 0.11778406798839569, "learning_rate": 1.590918535562945e-06, "loss": 0.0868, "num_input_tokens_seen": 50895008, "step": 75500 }, { "epoch": 1.8445997117240367, "grad_norm": 26.87613868713379, "learning_rate": 1.5908497373112903e-06, "loss": 0.1014, "num_input_tokens_seen": 50898272, "step": 75505 }, { "epoch": 1.8447218625558839, "grad_norm": 0.16408833861351013, "learning_rate": 1.590780934762818e-06, "loss": 0.0017, "num_input_tokens_seen": 50901344, "step": 75510 }, { "epoch": 1.844844013387731, "grad_norm": 0.22428208589553833, "learning_rate": 1.5907121279180276e-06, "loss": 0.0018, "num_input_tokens_seen": 50904800, "step": 75515 }, { "epoch": 1.8449661642195783, "grad_norm": 0.16060960292816162, "learning_rate": 1.5906433167774198e-06, "loss": 0.084, "num_input_tokens_seen": 50908192, "step": 75520 }, { "epoch": 1.8450883150514255, "grad_norm": 0.32337161898612976, "learning_rate": 1.5905745013414949e-06, "loss": 0.0587, "num_input_tokens_seen": 50911264, "step": 75525 }, { "epoch": 1.8452104658832726, "grad_norm": 34.4917106628418, "learning_rate": 1.5905056816107533e-06, "loss": 0.1477, "num_input_tokens_seen": 50914528, "step": 75530 }, { "epoch": 1.8453326167151198, "grad_norm": 0.016017381101846695, "learning_rate": 1.5904368575856958e-06, "loss": 0.085, "num_input_tokens_seen": 50917984, "step": 75535 }, { "epoch": 1.845454767546967, "grad_norm": 0.15001077950000763, "learning_rate": 1.5903680292668224e-06, "loss": 0.0291, "num_input_tokens_seen": 50920928, "step": 75540 }, { "epoch": 1.8455769183788142, "grad_norm": 41.64552307128906, "learning_rate": 1.590299196654634e-06, "loss": 0.2256, "num_input_tokens_seen": 50924384, "step": 75545 }, { "epoch": 1.8456990692106614, "grad_norm": 0.10622581094503403, "learning_rate": 1.5902303597496309e-06, "loss": 0.0007, "num_input_tokens_seen": 50928352, "step": 75550 }, { "epoch": 1.8458212200425086, "grad_norm": 0.18985126912593842, "learning_rate": 1.590161518552314e-06, "loss": 0.1029, "num_input_tokens_seen": 50931616, "step": 75555 }, { "epoch": 1.8459433708743558, "grad_norm": 0.15927618741989136, "learning_rate": 1.590092673063184e-06, "loss": 0.0211, "num_input_tokens_seen": 50935200, "step": 75560 }, { "epoch": 1.846065521706203, "grad_norm": 0.08424060046672821, "learning_rate": 1.5900238232827412e-06, "loss": 0.1201, "num_input_tokens_seen": 50938656, "step": 75565 }, { "epoch": 1.84618767253805, "grad_norm": 0.03481528162956238, "learning_rate": 1.5899549692114864e-06, "loss": 0.1135, "num_input_tokens_seen": 50942048, "step": 75570 }, { "epoch": 1.8463098233698971, "grad_norm": 0.04872078076004982, "learning_rate": 1.5898861108499205e-06, "loss": 0.1495, "num_input_tokens_seen": 50945312, "step": 75575 }, { "epoch": 1.8464319742017443, "grad_norm": 0.22296403348445892, "learning_rate": 1.5898172481985442e-06, "loss": 0.0314, "num_input_tokens_seen": 50948448, "step": 75580 }, { "epoch": 1.8465541250335915, "grad_norm": 22.515535354614258, "learning_rate": 1.589748381257858e-06, "loss": 0.0422, "num_input_tokens_seen": 50951904, "step": 75585 }, { "epoch": 1.8466762758654387, "grad_norm": 0.6905881762504578, "learning_rate": 1.5896795100283631e-06, "loss": 0.0283, "num_input_tokens_seen": 50955360, "step": 75590 }, { "epoch": 1.8467984266972857, "grad_norm": 7.03764009475708, "learning_rate": 1.5896106345105601e-06, "loss": 0.0983, "num_input_tokens_seen": 50958688, "step": 75595 }, { "epoch": 1.8469205775291329, "grad_norm": 0.14742566645145416, "learning_rate": 1.5895417547049502e-06, "loss": 0.0621, "num_input_tokens_seen": 50962848, "step": 75600 }, { "epoch": 1.84704272836098, "grad_norm": 0.4235324263572693, "learning_rate": 1.5894728706120336e-06, "loss": 0.0487, "num_input_tokens_seen": 50966048, "step": 75605 }, { "epoch": 1.8471648791928272, "grad_norm": 2.398937940597534, "learning_rate": 1.5894039822323121e-06, "loss": 0.0017, "num_input_tokens_seen": 50969440, "step": 75610 }, { "epoch": 1.8472870300246744, "grad_norm": 0.14467774331569672, "learning_rate": 1.5893350895662865e-06, "loss": 0.0604, "num_input_tokens_seen": 50972448, "step": 75615 }, { "epoch": 1.8474091808565216, "grad_norm": 308.7352600097656, "learning_rate": 1.5892661926144575e-06, "loss": 0.0731, "num_input_tokens_seen": 50975904, "step": 75620 }, { "epoch": 1.8475313316883688, "grad_norm": 0.09449799358844757, "learning_rate": 1.5891972913773263e-06, "loss": 0.0026, "num_input_tokens_seen": 50978912, "step": 75625 }, { "epoch": 1.847653482520216, "grad_norm": 281.8724670410156, "learning_rate": 1.5891283858553935e-06, "loss": 0.1608, "num_input_tokens_seen": 50982048, "step": 75630 }, { "epoch": 1.8477756333520632, "grad_norm": 9.985726356506348, "learning_rate": 1.5890594760491606e-06, "loss": 0.1126, "num_input_tokens_seen": 50985440, "step": 75635 }, { "epoch": 1.8478977841839104, "grad_norm": 0.03078383021056652, "learning_rate": 1.5889905619591292e-06, "loss": 0.0486, "num_input_tokens_seen": 50988576, "step": 75640 }, { "epoch": 1.8480199350157576, "grad_norm": 70.28709411621094, "learning_rate": 1.5889216435858001e-06, "loss": 0.1633, "num_input_tokens_seen": 50991648, "step": 75645 }, { "epoch": 1.8481420858476048, "grad_norm": 0.9800973534584045, "learning_rate": 1.5888527209296743e-06, "loss": 0.1638, "num_input_tokens_seen": 50995424, "step": 75650 }, { "epoch": 1.848264236679452, "grad_norm": 0.2955226004123688, "learning_rate": 1.588783793991253e-06, "loss": 0.0348, "num_input_tokens_seen": 50998816, "step": 75655 }, { "epoch": 1.848386387511299, "grad_norm": 35.41645812988281, "learning_rate": 1.5887148627710372e-06, "loss": 0.0945, "num_input_tokens_seen": 51002208, "step": 75660 }, { "epoch": 1.848508538343146, "grad_norm": 0.2809849679470062, "learning_rate": 1.5886459272695292e-06, "loss": 0.0817, "num_input_tokens_seen": 51005472, "step": 75665 }, { "epoch": 1.8486306891749933, "grad_norm": 53.2969856262207, "learning_rate": 1.5885769874872294e-06, "loss": 0.2804, "num_input_tokens_seen": 51009120, "step": 75670 }, { "epoch": 1.8487528400068405, "grad_norm": 27.16531753540039, "learning_rate": 1.5885080434246394e-06, "loss": 0.1359, "num_input_tokens_seen": 51012128, "step": 75675 }, { "epoch": 1.8488749908386874, "grad_norm": 0.28885719180107117, "learning_rate": 1.5884390950822608e-06, "loss": 0.0364, "num_input_tokens_seen": 51015712, "step": 75680 }, { "epoch": 1.8489971416705346, "grad_norm": 5.167070388793945, "learning_rate": 1.5883701424605947e-06, "loss": 0.067, "num_input_tokens_seen": 51018848, "step": 75685 }, { "epoch": 1.8491192925023818, "grad_norm": 23.454187393188477, "learning_rate": 1.5883011855601427e-06, "loss": 0.1145, "num_input_tokens_seen": 51022240, "step": 75690 }, { "epoch": 1.849241443334229, "grad_norm": 0.4352051019668579, "learning_rate": 1.5882322243814063e-06, "loss": 0.002, "num_input_tokens_seen": 51025504, "step": 75695 }, { "epoch": 1.8493635941660762, "grad_norm": 59.894744873046875, "learning_rate": 1.588163258924887e-06, "loss": 0.0279, "num_input_tokens_seen": 51028896, "step": 75700 }, { "epoch": 1.8494857449979234, "grad_norm": 0.07904600352048874, "learning_rate": 1.588094289191086e-06, "loss": 0.0345, "num_input_tokens_seen": 51032736, "step": 75705 }, { "epoch": 1.8496078958297706, "grad_norm": 0.4518510699272156, "learning_rate": 1.5880253151805054e-06, "loss": 0.1506, "num_input_tokens_seen": 51036128, "step": 75710 }, { "epoch": 1.8497300466616178, "grad_norm": 0.1093616634607315, "learning_rate": 1.5879563368936463e-06, "loss": 0.0747, "num_input_tokens_seen": 51039392, "step": 75715 }, { "epoch": 1.849852197493465, "grad_norm": 1.2742111682891846, "learning_rate": 1.5878873543310109e-06, "loss": 0.0353, "num_input_tokens_seen": 51042464, "step": 75720 }, { "epoch": 1.8499743483253122, "grad_norm": 9.084955215454102, "learning_rate": 1.5878183674931005e-06, "loss": 0.0627, "num_input_tokens_seen": 51046112, "step": 75725 }, { "epoch": 1.8500964991571593, "grad_norm": 2.01727557182312, "learning_rate": 1.5877493763804167e-06, "loss": 0.0591, "num_input_tokens_seen": 51049504, "step": 75730 }, { "epoch": 1.8502186499890065, "grad_norm": 54.07387924194336, "learning_rate": 1.5876803809934613e-06, "loss": 0.1982, "num_input_tokens_seen": 51053088, "step": 75735 }, { "epoch": 1.8503408008208537, "grad_norm": 0.08706037700176239, "learning_rate": 1.5876113813327363e-06, "loss": 0.0013, "num_input_tokens_seen": 51056416, "step": 75740 }, { "epoch": 1.850462951652701, "grad_norm": 12.536171913146973, "learning_rate": 1.587542377398743e-06, "loss": 0.1587, "num_input_tokens_seen": 51059744, "step": 75745 }, { "epoch": 1.8505851024845479, "grad_norm": 35.133575439453125, "learning_rate": 1.587473369191984e-06, "loss": 0.1871, "num_input_tokens_seen": 51063072, "step": 75750 }, { "epoch": 1.850707253316395, "grad_norm": 2.3288958072662354, "learning_rate": 1.58740435671296e-06, "loss": 0.0596, "num_input_tokens_seen": 51066336, "step": 75755 }, { "epoch": 1.8508294041482423, "grad_norm": 0.4715663194656372, "learning_rate": 1.5873353399621737e-06, "loss": 0.0658, "num_input_tokens_seen": 51069792, "step": 75760 }, { "epoch": 1.8509515549800895, "grad_norm": 1.3501707315444946, "learning_rate": 1.5872663189401272e-06, "loss": 0.0011, "num_input_tokens_seen": 51072864, "step": 75765 }, { "epoch": 1.8510737058119364, "grad_norm": 0.39104995131492615, "learning_rate": 1.5871972936473217e-06, "loss": 0.1394, "num_input_tokens_seen": 51075872, "step": 75770 }, { "epoch": 1.8511958566437836, "grad_norm": 19.183887481689453, "learning_rate": 1.5871282640842601e-06, "loss": 0.037, "num_input_tokens_seen": 51079008, "step": 75775 }, { "epoch": 1.8513180074756308, "grad_norm": 13.764801979064941, "learning_rate": 1.5870592302514431e-06, "loss": 0.1345, "num_input_tokens_seen": 51082464, "step": 75780 }, { "epoch": 1.851440158307478, "grad_norm": 100.37277221679688, "learning_rate": 1.5869901921493738e-06, "loss": 0.1049, "num_input_tokens_seen": 51086304, "step": 75785 }, { "epoch": 1.8515623091393252, "grad_norm": 27.52959442138672, "learning_rate": 1.5869211497785539e-06, "loss": 0.1019, "num_input_tokens_seen": 51089696, "step": 75790 }, { "epoch": 1.8516844599711724, "grad_norm": 0.3320782780647278, "learning_rate": 1.5868521031394858e-06, "loss": 0.0703, "num_input_tokens_seen": 51093024, "step": 75795 }, { "epoch": 1.8518066108030196, "grad_norm": 0.0150426235049963, "learning_rate": 1.586783052232671e-06, "loss": 0.0664, "num_input_tokens_seen": 51096416, "step": 75800 }, { "epoch": 1.8519287616348667, "grad_norm": 0.34932833909988403, "learning_rate": 1.5867139970586124e-06, "loss": 0.1082, "num_input_tokens_seen": 51099680, "step": 75805 }, { "epoch": 1.852050912466714, "grad_norm": 0.6799086332321167, "learning_rate": 1.5866449376178115e-06, "loss": 0.0262, "num_input_tokens_seen": 51103392, "step": 75810 }, { "epoch": 1.8521730632985611, "grad_norm": 0.31468117237091064, "learning_rate": 1.5865758739107707e-06, "loss": 0.0365, "num_input_tokens_seen": 51106592, "step": 75815 }, { "epoch": 1.8522952141304083, "grad_norm": 0.13998661935329437, "learning_rate": 1.5865068059379926e-06, "loss": 0.0021, "num_input_tokens_seen": 51109792, "step": 75820 }, { "epoch": 1.8524173649622555, "grad_norm": 1.9981356859207153, "learning_rate": 1.5864377336999795e-06, "loss": 0.0016, "num_input_tokens_seen": 51113120, "step": 75825 }, { "epoch": 1.8525395157941027, "grad_norm": 0.25120308995246887, "learning_rate": 1.5863686571972332e-06, "loss": 0.0932, "num_input_tokens_seen": 51116512, "step": 75830 }, { "epoch": 1.8526616666259499, "grad_norm": 0.23657932877540588, "learning_rate": 1.5862995764302562e-06, "loss": 0.0318, "num_input_tokens_seen": 51119968, "step": 75835 }, { "epoch": 1.8527838174577969, "grad_norm": 1.5655349493026733, "learning_rate": 1.5862304913995513e-06, "loss": 0.0396, "num_input_tokens_seen": 51123232, "step": 75840 }, { "epoch": 1.852905968289644, "grad_norm": 24.702547073364258, "learning_rate": 1.58616140210562e-06, "loss": 0.1938, "num_input_tokens_seen": 51126496, "step": 75845 }, { "epoch": 1.8530281191214912, "grad_norm": 21.289745330810547, "learning_rate": 1.5860923085489656e-06, "loss": 0.0655, "num_input_tokens_seen": 51129760, "step": 75850 }, { "epoch": 1.8531502699533384, "grad_norm": 0.23596727848052979, "learning_rate": 1.5860232107300906e-06, "loss": 0.0028, "num_input_tokens_seen": 51133408, "step": 75855 }, { "epoch": 1.8532724207851854, "grad_norm": 0.6734174489974976, "learning_rate": 1.585954108649497e-06, "loss": 0.2307, "num_input_tokens_seen": 51136672, "step": 75860 }, { "epoch": 1.8533945716170326, "grad_norm": 20.81391143798828, "learning_rate": 1.5858850023076874e-06, "loss": 0.0909, "num_input_tokens_seen": 51140000, "step": 75865 }, { "epoch": 1.8535167224488798, "grad_norm": 22.20842933654785, "learning_rate": 1.585815891705165e-06, "loss": 0.0877, "num_input_tokens_seen": 51143072, "step": 75870 }, { "epoch": 1.853638873280727, "grad_norm": 0.023959221318364143, "learning_rate": 1.5857467768424312e-06, "loss": 0.001, "num_input_tokens_seen": 51146592, "step": 75875 }, { "epoch": 1.8537610241125742, "grad_norm": 32.944183349609375, "learning_rate": 1.5856776577199895e-06, "loss": 0.1578, "num_input_tokens_seen": 51150304, "step": 75880 }, { "epoch": 1.8538831749444213, "grad_norm": 0.3040035367012024, "learning_rate": 1.5856085343383426e-06, "loss": 0.0734, "num_input_tokens_seen": 51153568, "step": 75885 }, { "epoch": 1.8540053257762685, "grad_norm": 0.096220001578331, "learning_rate": 1.5855394066979925e-06, "loss": 0.0787, "num_input_tokens_seen": 51156960, "step": 75890 }, { "epoch": 1.8541274766081157, "grad_norm": 1.3828034400939941, "learning_rate": 1.5854702747994427e-06, "loss": 0.0606, "num_input_tokens_seen": 51160288, "step": 75895 }, { "epoch": 1.854249627439963, "grad_norm": 0.2657002806663513, "learning_rate": 1.5854011386431955e-06, "loss": 0.0024, "num_input_tokens_seen": 51163616, "step": 75900 }, { "epoch": 1.85437177827181, "grad_norm": 0.6370258927345276, "learning_rate": 1.5853319982297538e-06, "loss": 0.0779, "num_input_tokens_seen": 51166816, "step": 75905 }, { "epoch": 1.8544939291036573, "grad_norm": 0.1296832263469696, "learning_rate": 1.58526285355962e-06, "loss": 0.1529, "num_input_tokens_seen": 51170208, "step": 75910 }, { "epoch": 1.8546160799355045, "grad_norm": 0.043997667729854584, "learning_rate": 1.5851937046332976e-06, "loss": 0.0861, "num_input_tokens_seen": 51173856, "step": 75915 }, { "epoch": 1.8547382307673517, "grad_norm": 0.2165103554725647, "learning_rate": 1.5851245514512895e-06, "loss": 0.0991, "num_input_tokens_seen": 51176864, "step": 75920 }, { "epoch": 1.8548603815991986, "grad_norm": 1.8013496398925781, "learning_rate": 1.5850553940140979e-06, "loss": 0.0879, "num_input_tokens_seen": 51180448, "step": 75925 }, { "epoch": 1.8549825324310458, "grad_norm": 68.27543640136719, "learning_rate": 1.584986232322226e-06, "loss": 0.132, "num_input_tokens_seen": 51183776, "step": 75930 }, { "epoch": 1.855104683262893, "grad_norm": 113.5135498046875, "learning_rate": 1.5849170663761772e-06, "loss": 0.0056, "num_input_tokens_seen": 51187232, "step": 75935 }, { "epoch": 1.8552268340947402, "grad_norm": 0.03377193957567215, "learning_rate": 1.584847896176454e-06, "loss": 0.002, "num_input_tokens_seen": 51190304, "step": 75940 }, { "epoch": 1.8553489849265874, "grad_norm": 0.14906467497348785, "learning_rate": 1.5847787217235595e-06, "loss": 0.0393, "num_input_tokens_seen": 51193888, "step": 75945 }, { "epoch": 1.8554711357584344, "grad_norm": 0.892361044883728, "learning_rate": 1.5847095430179972e-06, "loss": 0.0694, "num_input_tokens_seen": 51197536, "step": 75950 }, { "epoch": 1.8555932865902816, "grad_norm": 0.5475803017616272, "learning_rate": 1.5846403600602695e-06, "loss": 0.1415, "num_input_tokens_seen": 51200992, "step": 75955 }, { "epoch": 1.8557154374221287, "grad_norm": 95.13793182373047, "learning_rate": 1.5845711728508802e-06, "loss": 0.0605, "num_input_tokens_seen": 51204448, "step": 75960 }, { "epoch": 1.855837588253976, "grad_norm": 0.030163267627358437, "learning_rate": 1.5845019813903318e-06, "loss": 0.1219, "num_input_tokens_seen": 51207648, "step": 75965 }, { "epoch": 1.8559597390858231, "grad_norm": 28.8455867767334, "learning_rate": 1.5844327856791276e-06, "loss": 0.1198, "num_input_tokens_seen": 51211040, "step": 75970 }, { "epoch": 1.8560818899176703, "grad_norm": 0.10754287987947464, "learning_rate": 1.5843635857177712e-06, "loss": 0.0704, "num_input_tokens_seen": 51214304, "step": 75975 }, { "epoch": 1.8562040407495175, "grad_norm": 41.19390106201172, "learning_rate": 1.584294381506766e-06, "loss": 0.2049, "num_input_tokens_seen": 51217248, "step": 75980 }, { "epoch": 1.8563261915813647, "grad_norm": 22.16908836364746, "learning_rate": 1.5842251730466143e-06, "loss": 0.0761, "num_input_tokens_seen": 51221088, "step": 75985 }, { "epoch": 1.8564483424132119, "grad_norm": 26.360881805419922, "learning_rate": 1.5841559603378204e-06, "loss": 0.1281, "num_input_tokens_seen": 51224224, "step": 75990 }, { "epoch": 1.856570493245059, "grad_norm": 0.0927048996090889, "learning_rate": 1.584086743380887e-06, "loss": 0.0007, "num_input_tokens_seen": 51226976, "step": 75995 }, { "epoch": 1.8566926440769063, "grad_norm": 24.040475845336914, "learning_rate": 1.584017522176318e-06, "loss": 0.1139, "num_input_tokens_seen": 51230688, "step": 76000 }, { "epoch": 1.8568147949087535, "grad_norm": 0.10228940844535828, "learning_rate": 1.5839482967246162e-06, "loss": 0.0939, "num_input_tokens_seen": 51234464, "step": 76005 }, { "epoch": 1.8569369457406006, "grad_norm": 11.87697982788086, "learning_rate": 1.5838790670262853e-06, "loss": 0.0364, "num_input_tokens_seen": 51237728, "step": 76010 }, { "epoch": 1.8570590965724476, "grad_norm": 88.89161682128906, "learning_rate": 1.583809833081829e-06, "loss": 0.0908, "num_input_tokens_seen": 51240672, "step": 76015 }, { "epoch": 1.8571812474042948, "grad_norm": 14.098811149597168, "learning_rate": 1.5837405948917506e-06, "loss": 0.0514, "num_input_tokens_seen": 51243808, "step": 76020 }, { "epoch": 1.857303398236142, "grad_norm": 0.14344164729118347, "learning_rate": 1.5836713524565535e-06, "loss": 0.0146, "num_input_tokens_seen": 51247520, "step": 76025 }, { "epoch": 1.8574255490679892, "grad_norm": 28.8133602142334, "learning_rate": 1.583602105776741e-06, "loss": 0.0418, "num_input_tokens_seen": 51250912, "step": 76030 }, { "epoch": 1.8575476998998364, "grad_norm": 1.3341981172561646, "learning_rate": 1.5835328548528173e-06, "loss": 0.1308, "num_input_tokens_seen": 51255200, "step": 76035 }, { "epoch": 1.8576698507316833, "grad_norm": 0.05615212023258209, "learning_rate": 1.5834635996852858e-06, "loss": 0.0411, "num_input_tokens_seen": 51258464, "step": 76040 }, { "epoch": 1.8577920015635305, "grad_norm": 23.91923713684082, "learning_rate": 1.58339434027465e-06, "loss": 0.0802, "num_input_tokens_seen": 51261856, "step": 76045 }, { "epoch": 1.8579141523953777, "grad_norm": 27.17865753173828, "learning_rate": 1.583325076621414e-06, "loss": 0.0297, "num_input_tokens_seen": 51264864, "step": 76050 }, { "epoch": 1.858036303227225, "grad_norm": 31.009824752807617, "learning_rate": 1.5832558087260806e-06, "loss": 0.0735, "num_input_tokens_seen": 51267872, "step": 76055 }, { "epoch": 1.858158454059072, "grad_norm": 0.057755716145038605, "learning_rate": 1.5831865365891544e-06, "loss": 0.0784, "num_input_tokens_seen": 51270624, "step": 76060 }, { "epoch": 1.8582806048909193, "grad_norm": 26.972583770751953, "learning_rate": 1.5831172602111385e-06, "loss": 0.0406, "num_input_tokens_seen": 51274272, "step": 76065 }, { "epoch": 1.8584027557227665, "grad_norm": 0.13842856884002686, "learning_rate": 1.5830479795925372e-06, "loss": 0.0465, "num_input_tokens_seen": 51277664, "step": 76070 }, { "epoch": 1.8585249065546137, "grad_norm": 0.19427655637264252, "learning_rate": 1.5829786947338544e-06, "loss": 0.2475, "num_input_tokens_seen": 51281184, "step": 76075 }, { "epoch": 1.8586470573864609, "grad_norm": 24.42550277709961, "learning_rate": 1.5829094056355934e-06, "loss": 0.1517, "num_input_tokens_seen": 51284576, "step": 76080 }, { "epoch": 1.858769208218308, "grad_norm": 0.07590050995349884, "learning_rate": 1.5828401122982589e-06, "loss": 0.127, "num_input_tokens_seen": 51287904, "step": 76085 }, { "epoch": 1.8588913590501552, "grad_norm": 55.993324279785156, "learning_rate": 1.582770814722354e-06, "loss": 0.1128, "num_input_tokens_seen": 51291360, "step": 76090 }, { "epoch": 1.8590135098820024, "grad_norm": 0.5621238350868225, "learning_rate": 1.582701512908383e-06, "loss": 0.1007, "num_input_tokens_seen": 51295008, "step": 76095 }, { "epoch": 1.8591356607138496, "grad_norm": 14.658206939697266, "learning_rate": 1.5826322068568497e-06, "loss": 0.0684, "num_input_tokens_seen": 51298592, "step": 76100 }, { "epoch": 1.8592578115456966, "grad_norm": 64.94924926757812, "learning_rate": 1.5825628965682585e-06, "loss": 0.1603, "num_input_tokens_seen": 51302048, "step": 76105 }, { "epoch": 1.8593799623775438, "grad_norm": 0.025940556079149246, "learning_rate": 1.5824935820431132e-06, "loss": 0.0359, "num_input_tokens_seen": 51305568, "step": 76110 }, { "epoch": 1.859502113209391, "grad_norm": 0.47116488218307495, "learning_rate": 1.582424263281918e-06, "loss": 0.0026, "num_input_tokens_seen": 51308768, "step": 76115 }, { "epoch": 1.8596242640412382, "grad_norm": 2.6873984336853027, "learning_rate": 1.5823549402851768e-06, "loss": 0.0414, "num_input_tokens_seen": 51312096, "step": 76120 }, { "epoch": 1.8597464148730853, "grad_norm": 0.17809846997261047, "learning_rate": 1.5822856130533937e-06, "loss": 0.0744, "num_input_tokens_seen": 51315104, "step": 76125 }, { "epoch": 1.8598685657049323, "grad_norm": 0.41667261719703674, "learning_rate": 1.5822162815870734e-06, "loss": 0.0508, "num_input_tokens_seen": 51318432, "step": 76130 }, { "epoch": 1.8599907165367795, "grad_norm": 11.715912818908691, "learning_rate": 1.5821469458867194e-06, "loss": 0.1015, "num_input_tokens_seen": 51322528, "step": 76135 }, { "epoch": 1.8601128673686267, "grad_norm": 0.06440366059541702, "learning_rate": 1.5820776059528363e-06, "loss": 0.1237, "num_input_tokens_seen": 51325728, "step": 76140 }, { "epoch": 1.8602350182004739, "grad_norm": 0.03905702009797096, "learning_rate": 1.5820082617859283e-06, "loss": 0.0658, "num_input_tokens_seen": 51329120, "step": 76145 }, { "epoch": 1.860357169032321, "grad_norm": 15.647366523742676, "learning_rate": 1.5819389133864997e-06, "loss": 0.1557, "num_input_tokens_seen": 51332320, "step": 76150 }, { "epoch": 1.8604793198641683, "grad_norm": 0.25174450874328613, "learning_rate": 1.5818695607550544e-06, "loss": 0.0885, "num_input_tokens_seen": 51335264, "step": 76155 }, { "epoch": 1.8606014706960154, "grad_norm": 0.3436072766780853, "learning_rate": 1.5818002038920977e-06, "loss": 0.0928, "num_input_tokens_seen": 51338336, "step": 76160 }, { "epoch": 1.8607236215278626, "grad_norm": 220.35096740722656, "learning_rate": 1.5817308427981332e-06, "loss": 0.1215, "num_input_tokens_seen": 51341664, "step": 76165 }, { "epoch": 1.8608457723597098, "grad_norm": 0.21717888116836548, "learning_rate": 1.5816614774736656e-06, "loss": 0.118, "num_input_tokens_seen": 51344864, "step": 76170 }, { "epoch": 1.860967923191557, "grad_norm": 0.24044664204120636, "learning_rate": 1.5815921079191994e-06, "loss": 0.0739, "num_input_tokens_seen": 51347744, "step": 76175 }, { "epoch": 1.8610900740234042, "grad_norm": 0.2545783519744873, "learning_rate": 1.5815227341352389e-06, "loss": 0.0379, "num_input_tokens_seen": 51351136, "step": 76180 }, { "epoch": 1.8612122248552514, "grad_norm": 0.3077756464481354, "learning_rate": 1.5814533561222885e-06, "loss": 0.0019, "num_input_tokens_seen": 51354592, "step": 76185 }, { "epoch": 1.8613343756870986, "grad_norm": 74.25428009033203, "learning_rate": 1.581383973880853e-06, "loss": 0.0707, "num_input_tokens_seen": 51357728, "step": 76190 }, { "epoch": 1.8614565265189456, "grad_norm": 0.5155385732650757, "learning_rate": 1.5813145874114366e-06, "loss": 0.1285, "num_input_tokens_seen": 51360800, "step": 76195 }, { "epoch": 1.8615786773507927, "grad_norm": 0.3103267252445221, "learning_rate": 1.5812451967145445e-06, "loss": 0.0436, "num_input_tokens_seen": 51363680, "step": 76200 }, { "epoch": 1.86170082818264, "grad_norm": 0.4102085530757904, "learning_rate": 1.5811758017906809e-06, "loss": 0.0765, "num_input_tokens_seen": 51366880, "step": 76205 }, { "epoch": 1.8618229790144871, "grad_norm": 0.8872631788253784, "learning_rate": 1.5811064026403507e-06, "loss": 0.0246, "num_input_tokens_seen": 51370208, "step": 76210 }, { "epoch": 1.861945129846334, "grad_norm": 14.930642127990723, "learning_rate": 1.5810369992640583e-06, "loss": 0.0803, "num_input_tokens_seen": 51373344, "step": 76215 }, { "epoch": 1.8620672806781813, "grad_norm": 13.179936408996582, "learning_rate": 1.5809675916623087e-06, "loss": 0.0507, "num_input_tokens_seen": 51376608, "step": 76220 }, { "epoch": 1.8621894315100285, "grad_norm": 0.05479566380381584, "learning_rate": 1.5808981798356063e-06, "loss": 0.057, "num_input_tokens_seen": 51380128, "step": 76225 }, { "epoch": 1.8623115823418757, "grad_norm": 0.8399147987365723, "learning_rate": 1.5808287637844559e-06, "loss": 0.002, "num_input_tokens_seen": 51383712, "step": 76230 }, { "epoch": 1.8624337331737228, "grad_norm": 36.5905647277832, "learning_rate": 1.580759343509363e-06, "loss": 0.087, "num_input_tokens_seen": 51386720, "step": 76235 }, { "epoch": 1.86255588400557, "grad_norm": 40.69601821899414, "learning_rate": 1.5806899190108318e-06, "loss": 0.1132, "num_input_tokens_seen": 51389856, "step": 76240 }, { "epoch": 1.8626780348374172, "grad_norm": 30.452613830566406, "learning_rate": 1.5806204902893674e-06, "loss": 0.093, "num_input_tokens_seen": 51393056, "step": 76245 }, { "epoch": 1.8628001856692644, "grad_norm": 22.763813018798828, "learning_rate": 1.5805510573454744e-06, "loss": 0.2589, "num_input_tokens_seen": 51396128, "step": 76250 }, { "epoch": 1.8629223365011116, "grad_norm": 10.09261417388916, "learning_rate": 1.580481620179658e-06, "loss": 0.0593, "num_input_tokens_seen": 51399456, "step": 76255 }, { "epoch": 1.8630444873329588, "grad_norm": 10.72971248626709, "learning_rate": 1.580412178792423e-06, "loss": 0.1307, "num_input_tokens_seen": 51403168, "step": 76260 }, { "epoch": 1.863166638164806, "grad_norm": 29.890047073364258, "learning_rate": 1.5803427331842748e-06, "loss": 0.1434, "num_input_tokens_seen": 51406688, "step": 76265 }, { "epoch": 1.8632887889966532, "grad_norm": 0.7407713532447815, "learning_rate": 1.5802732833557182e-06, "loss": 0.0023, "num_input_tokens_seen": 51410272, "step": 76270 }, { "epoch": 1.8634109398285004, "grad_norm": 18.422584533691406, "learning_rate": 1.580203829307258e-06, "loss": 0.111, "num_input_tokens_seen": 51413984, "step": 76275 }, { "epoch": 1.8635330906603476, "grad_norm": 0.01969938911497593, "learning_rate": 1.5801343710393997e-06, "loss": 0.0689, "num_input_tokens_seen": 51417376, "step": 76280 }, { "epoch": 1.8636552414921945, "grad_norm": 0.7834815979003906, "learning_rate": 1.5800649085526478e-06, "loss": 0.0344, "num_input_tokens_seen": 51420704, "step": 76285 }, { "epoch": 1.8637773923240417, "grad_norm": 11.597505569458008, "learning_rate": 1.5799954418475081e-06, "loss": 0.1379, "num_input_tokens_seen": 51423904, "step": 76290 }, { "epoch": 1.863899543155889, "grad_norm": 15.526674270629883, "learning_rate": 1.579925970924486e-06, "loss": 0.1141, "num_input_tokens_seen": 51427296, "step": 76295 }, { "epoch": 1.864021693987736, "grad_norm": 0.04713597893714905, "learning_rate": 1.5798564957840856e-06, "loss": 0.0843, "num_input_tokens_seen": 51431328, "step": 76300 }, { "epoch": 1.864143844819583, "grad_norm": 0.9391548037528992, "learning_rate": 1.579787016426813e-06, "loss": 0.0035, "num_input_tokens_seen": 51434912, "step": 76305 }, { "epoch": 1.8642659956514303, "grad_norm": 0.10538437217473984, "learning_rate": 1.5797175328531733e-06, "loss": 0.001, "num_input_tokens_seen": 51438304, "step": 76310 }, { "epoch": 1.8643881464832774, "grad_norm": 4.421464443206787, "learning_rate": 1.5796480450636719e-06, "loss": 0.0259, "num_input_tokens_seen": 51441696, "step": 76315 }, { "epoch": 1.8645102973151246, "grad_norm": 43.166439056396484, "learning_rate": 1.5795785530588138e-06, "loss": 0.1345, "num_input_tokens_seen": 51445024, "step": 76320 }, { "epoch": 1.8646324481469718, "grad_norm": 0.16540168225765228, "learning_rate": 1.5795090568391048e-06, "loss": 0.1112, "num_input_tokens_seen": 51448352, "step": 76325 }, { "epoch": 1.864754598978819, "grad_norm": 47.41411209106445, "learning_rate": 1.5794395564050499e-06, "loss": 0.0865, "num_input_tokens_seen": 51451808, "step": 76330 }, { "epoch": 1.8648767498106662, "grad_norm": 0.0872948095202446, "learning_rate": 1.5793700517571547e-06, "loss": 0.0862, "num_input_tokens_seen": 51454688, "step": 76335 }, { "epoch": 1.8649989006425134, "grad_norm": 10.607466697692871, "learning_rate": 1.5793005428959245e-06, "loss": 0.1357, "num_input_tokens_seen": 51458272, "step": 76340 }, { "epoch": 1.8651210514743606, "grad_norm": 3.705434560775757, "learning_rate": 1.5792310298218651e-06, "loss": 0.0021, "num_input_tokens_seen": 51461536, "step": 76345 }, { "epoch": 1.8652432023062078, "grad_norm": 0.37921032309532166, "learning_rate": 1.579161512535482e-06, "loss": 0.0451, "num_input_tokens_seen": 51464608, "step": 76350 }, { "epoch": 1.865365353138055, "grad_norm": 253.26100158691406, "learning_rate": 1.5790919910372806e-06, "loss": 0.1823, "num_input_tokens_seen": 51467744, "step": 76355 }, { "epoch": 1.8654875039699021, "grad_norm": 18.376888275146484, "learning_rate": 1.579022465327766e-06, "loss": 0.0786, "num_input_tokens_seen": 51470752, "step": 76360 }, { "epoch": 1.8656096548017493, "grad_norm": 0.1428607702255249, "learning_rate": 1.578952935407445e-06, "loss": 0.0775, "num_input_tokens_seen": 51473760, "step": 76365 }, { "epoch": 1.8657318056335965, "grad_norm": 0.2554032802581787, "learning_rate": 1.578883401276822e-06, "loss": 0.0017, "num_input_tokens_seen": 51476832, "step": 76370 }, { "epoch": 1.8658539564654435, "grad_norm": 27.811237335205078, "learning_rate": 1.5788138629364033e-06, "loss": 0.1384, "num_input_tokens_seen": 51480224, "step": 76375 }, { "epoch": 1.8659761072972907, "grad_norm": 41.41610336303711, "learning_rate": 1.5787443203866947e-06, "loss": 0.0977, "num_input_tokens_seen": 51483360, "step": 76380 }, { "epoch": 1.8660982581291379, "grad_norm": 0.9137465953826904, "learning_rate": 1.5786747736282019e-06, "loss": 0.0297, "num_input_tokens_seen": 51486624, "step": 76385 }, { "epoch": 1.866220408960985, "grad_norm": 0.2629835903644562, "learning_rate": 1.5786052226614301e-06, "loss": 0.1164, "num_input_tokens_seen": 51489952, "step": 76390 }, { "epoch": 1.866342559792832, "grad_norm": 2.6729328632354736, "learning_rate": 1.5785356674868857e-06, "loss": 0.0587, "num_input_tokens_seen": 51493536, "step": 76395 }, { "epoch": 1.8664647106246792, "grad_norm": 0.1776157021522522, "learning_rate": 1.5784661081050743e-06, "loss": 0.0442, "num_input_tokens_seen": 51496672, "step": 76400 }, { "epoch": 1.8665868614565264, "grad_norm": 0.07628049701452255, "learning_rate": 1.5783965445165018e-06, "loss": 0.1155, "num_input_tokens_seen": 51499872, "step": 76405 }, { "epoch": 1.8667090122883736, "grad_norm": 30.831989288330078, "learning_rate": 1.5783269767216738e-06, "loss": 0.1668, "num_input_tokens_seen": 51503968, "step": 76410 }, { "epoch": 1.8668311631202208, "grad_norm": 18.025569915771484, "learning_rate": 1.5782574047210968e-06, "loss": 0.1493, "num_input_tokens_seen": 51507232, "step": 76415 }, { "epoch": 1.866953313952068, "grad_norm": 0.03697647899389267, "learning_rate": 1.5781878285152765e-06, "loss": 0.0012, "num_input_tokens_seen": 51510752, "step": 76420 }, { "epoch": 1.8670754647839152, "grad_norm": 0.6512537598609924, "learning_rate": 1.5781182481047184e-06, "loss": 0.0636, "num_input_tokens_seen": 51513760, "step": 76425 }, { "epoch": 1.8671976156157624, "grad_norm": 0.23654574155807495, "learning_rate": 1.5780486634899291e-06, "loss": 0.0952, "num_input_tokens_seen": 51517408, "step": 76430 }, { "epoch": 1.8673197664476096, "grad_norm": 41.54343795776367, "learning_rate": 1.5779790746714145e-06, "loss": 0.1567, "num_input_tokens_seen": 51520736, "step": 76435 }, { "epoch": 1.8674419172794567, "grad_norm": 0.069539874792099, "learning_rate": 1.5779094816496806e-06, "loss": 0.0368, "num_input_tokens_seen": 51524192, "step": 76440 }, { "epoch": 1.867564068111304, "grad_norm": 8.968692779541016, "learning_rate": 1.5778398844252334e-06, "loss": 0.1951, "num_input_tokens_seen": 51527904, "step": 76445 }, { "epoch": 1.8676862189431511, "grad_norm": 0.03013712167739868, "learning_rate": 1.5777702829985794e-06, "loss": 0.0265, "num_input_tokens_seen": 51531296, "step": 76450 }, { "epoch": 1.8678083697749983, "grad_norm": 0.4158450961112976, "learning_rate": 1.577700677370224e-06, "loss": 0.0533, "num_input_tokens_seen": 51534496, "step": 76455 }, { "epoch": 1.8679305206068453, "grad_norm": 0.07961289584636688, "learning_rate": 1.5776310675406743e-06, "loss": 0.1515, "num_input_tokens_seen": 51538144, "step": 76460 }, { "epoch": 1.8680526714386925, "grad_norm": 0.5942521691322327, "learning_rate": 1.577561453510436e-06, "loss": 0.0012, "num_input_tokens_seen": 51541536, "step": 76465 }, { "epoch": 1.8681748222705397, "grad_norm": 114.15911865234375, "learning_rate": 1.5774918352800156e-06, "loss": 0.0686, "num_input_tokens_seen": 51545376, "step": 76470 }, { "epoch": 1.8682969731023868, "grad_norm": 0.8016293048858643, "learning_rate": 1.5774222128499188e-06, "loss": 0.0261, "num_input_tokens_seen": 51549152, "step": 76475 }, { "epoch": 1.868419123934234, "grad_norm": 0.21990229189395905, "learning_rate": 1.5773525862206528e-06, "loss": 0.0014, "num_input_tokens_seen": 51552736, "step": 76480 }, { "epoch": 1.868541274766081, "grad_norm": 32.91740798950195, "learning_rate": 1.5772829553927235e-06, "loss": 0.078, "num_input_tokens_seen": 51556576, "step": 76485 }, { "epoch": 1.8686634255979282, "grad_norm": 0.42278850078582764, "learning_rate": 1.577213320366637e-06, "loss": 0.0085, "num_input_tokens_seen": 51559776, "step": 76490 }, { "epoch": 1.8687855764297754, "grad_norm": 0.015430174767971039, "learning_rate": 1.5771436811429002e-06, "loss": 0.0239, "num_input_tokens_seen": 51562912, "step": 76495 }, { "epoch": 1.8689077272616226, "grad_norm": 34.49660110473633, "learning_rate": 1.5770740377220192e-06, "loss": 0.1157, "num_input_tokens_seen": 51566368, "step": 76500 }, { "epoch": 1.8690298780934698, "grad_norm": 0.05728043243288994, "learning_rate": 1.5770043901045007e-06, "loss": 0.0966, "num_input_tokens_seen": 51569952, "step": 76505 }, { "epoch": 1.869152028925317, "grad_norm": 7.740024566650391, "learning_rate": 1.5769347382908511e-06, "loss": 0.2293, "num_input_tokens_seen": 51573024, "step": 76510 }, { "epoch": 1.8692741797571641, "grad_norm": 93.66651153564453, "learning_rate": 1.5768650822815767e-06, "loss": 0.1675, "num_input_tokens_seen": 51576544, "step": 76515 }, { "epoch": 1.8693963305890113, "grad_norm": 20.219310760498047, "learning_rate": 1.5767954220771844e-06, "loss": 0.0407, "num_input_tokens_seen": 51579616, "step": 76520 }, { "epoch": 1.8695184814208585, "grad_norm": 0.25965866446495056, "learning_rate": 1.5767257576781808e-06, "loss": 0.1259, "num_input_tokens_seen": 51583136, "step": 76525 }, { "epoch": 1.8696406322527057, "grad_norm": 0.3182509243488312, "learning_rate": 1.576656089085072e-06, "loss": 0.0756, "num_input_tokens_seen": 51586400, "step": 76530 }, { "epoch": 1.869762783084553, "grad_norm": 6.768677711486816, "learning_rate": 1.5765864162983654e-06, "loss": 0.1691, "num_input_tokens_seen": 51589600, "step": 76535 }, { "epoch": 1.8698849339164, "grad_norm": 2.752490758895874, "learning_rate": 1.576516739318567e-06, "loss": 0.1084, "num_input_tokens_seen": 51592800, "step": 76540 }, { "epoch": 1.8700070847482473, "grad_norm": 0.23286566138267517, "learning_rate": 1.5764470581461842e-06, "loss": 0.0868, "num_input_tokens_seen": 51596128, "step": 76545 }, { "epoch": 1.8701292355800943, "grad_norm": 0.8963605165481567, "learning_rate": 1.576377372781723e-06, "loss": 0.1062, "num_input_tokens_seen": 51599392, "step": 76550 }, { "epoch": 1.8702513864119414, "grad_norm": 17.253986358642578, "learning_rate": 1.5763076832256905e-06, "loss": 0.0709, "num_input_tokens_seen": 51603104, "step": 76555 }, { "epoch": 1.8703735372437886, "grad_norm": 0.5030640959739685, "learning_rate": 1.5762379894785938e-06, "loss": 0.0825, "num_input_tokens_seen": 51606304, "step": 76560 }, { "epoch": 1.8704956880756358, "grad_norm": 0.284263014793396, "learning_rate": 1.5761682915409389e-06, "loss": 0.0386, "num_input_tokens_seen": 51609632, "step": 76565 }, { "epoch": 1.870617838907483, "grad_norm": 0.16007032990455627, "learning_rate": 1.5760985894132336e-06, "loss": 0.0324, "num_input_tokens_seen": 51612768, "step": 76570 }, { "epoch": 1.87073998973933, "grad_norm": 0.1427338570356369, "learning_rate": 1.5760288830959846e-06, "loss": 0.0845, "num_input_tokens_seen": 51615968, "step": 76575 }, { "epoch": 1.8708621405711772, "grad_norm": 0.4901255667209625, "learning_rate": 1.5759591725896986e-06, "loss": 0.0019, "num_input_tokens_seen": 51619168, "step": 76580 }, { "epoch": 1.8709842914030244, "grad_norm": 0.03649323433637619, "learning_rate": 1.5758894578948823e-06, "loss": 0.047, "num_input_tokens_seen": 51622624, "step": 76585 }, { "epoch": 1.8711064422348715, "grad_norm": 20.135515213012695, "learning_rate": 1.575819739012043e-06, "loss": 0.102, "num_input_tokens_seen": 51626016, "step": 76590 }, { "epoch": 1.8712285930667187, "grad_norm": 0.1097368523478508, "learning_rate": 1.5757500159416877e-06, "loss": 0.0222, "num_input_tokens_seen": 51629408, "step": 76595 }, { "epoch": 1.871350743898566, "grad_norm": 0.6177487969398499, "learning_rate": 1.5756802886843237e-06, "loss": 0.072, "num_input_tokens_seen": 51632864, "step": 76600 }, { "epoch": 1.8714728947304131, "grad_norm": 48.99726486206055, "learning_rate": 1.5756105572404575e-06, "loss": 0.0833, "num_input_tokens_seen": 51635936, "step": 76605 }, { "epoch": 1.8715950455622603, "grad_norm": 0.3567311763763428, "learning_rate": 1.5755408216105966e-06, "loss": 0.1141, "num_input_tokens_seen": 51639520, "step": 76610 }, { "epoch": 1.8717171963941075, "grad_norm": 0.19925457239151, "learning_rate": 1.5754710817952481e-06, "loss": 0.1466, "num_input_tokens_seen": 51642656, "step": 76615 }, { "epoch": 1.8718393472259547, "grad_norm": 0.13557025790214539, "learning_rate": 1.5754013377949189e-06, "loss": 0.1726, "num_input_tokens_seen": 51646368, "step": 76620 }, { "epoch": 1.8719614980578019, "grad_norm": 172.94630432128906, "learning_rate": 1.5753315896101165e-06, "loss": 0.0896, "num_input_tokens_seen": 51649632, "step": 76625 }, { "epoch": 1.872083648889649, "grad_norm": 14.283888816833496, "learning_rate": 1.575261837241348e-06, "loss": 0.0933, "num_input_tokens_seen": 51654624, "step": 76630 }, { "epoch": 1.8722057997214963, "grad_norm": 0.4296637177467346, "learning_rate": 1.575192080689121e-06, "loss": 0.0481, "num_input_tokens_seen": 51658208, "step": 76635 }, { "epoch": 1.8723279505533432, "grad_norm": 12.624166488647461, "learning_rate": 1.5751223199539422e-06, "loss": 0.1073, "num_input_tokens_seen": 51661344, "step": 76640 }, { "epoch": 1.8724501013851904, "grad_norm": 0.1500295251607895, "learning_rate": 1.5750525550363192e-06, "loss": 0.0008, "num_input_tokens_seen": 51664608, "step": 76645 }, { "epoch": 1.8725722522170376, "grad_norm": 0.16735275089740753, "learning_rate": 1.5749827859367594e-06, "loss": 0.0017, "num_input_tokens_seen": 51668192, "step": 76650 }, { "epoch": 1.8726944030488848, "grad_norm": 134.92787170410156, "learning_rate": 1.57491301265577e-06, "loss": 0.1194, "num_input_tokens_seen": 51672096, "step": 76655 }, { "epoch": 1.872816553880732, "grad_norm": 0.06441762298345566, "learning_rate": 1.5748432351938587e-06, "loss": 0.0222, "num_input_tokens_seen": 51675552, "step": 76660 }, { "epoch": 1.872938704712579, "grad_norm": 0.14489486813545227, "learning_rate": 1.5747734535515327e-06, "loss": 0.0675, "num_input_tokens_seen": 51679264, "step": 76665 }, { "epoch": 1.8730608555444261, "grad_norm": 38.53899002075195, "learning_rate": 1.5747036677292998e-06, "loss": 0.0698, "num_input_tokens_seen": 51682464, "step": 76670 }, { "epoch": 1.8731830063762733, "grad_norm": 0.45673030614852905, "learning_rate": 1.5746338777276668e-06, "loss": 0.1214, "num_input_tokens_seen": 51685600, "step": 76675 }, { "epoch": 1.8733051572081205, "grad_norm": 0.7562420964241028, "learning_rate": 1.5745640835471422e-06, "loss": 0.0604, "num_input_tokens_seen": 51688992, "step": 76680 }, { "epoch": 1.8734273080399677, "grad_norm": 0.14636746048927307, "learning_rate": 1.5744942851882326e-06, "loss": 0.0427, "num_input_tokens_seen": 51692128, "step": 76685 }, { "epoch": 1.873549458871815, "grad_norm": 0.37488046288490295, "learning_rate": 1.5744244826514463e-06, "loss": 0.1883, "num_input_tokens_seen": 51695520, "step": 76690 }, { "epoch": 1.873671609703662, "grad_norm": 0.9592730402946472, "learning_rate": 1.5743546759372906e-06, "loss": 0.0018, "num_input_tokens_seen": 51698976, "step": 76695 }, { "epoch": 1.8737937605355093, "grad_norm": 13.799247741699219, "learning_rate": 1.5742848650462731e-06, "loss": 0.1953, "num_input_tokens_seen": 51702368, "step": 76700 }, { "epoch": 1.8739159113673565, "grad_norm": 133.1254119873047, "learning_rate": 1.574215049978902e-06, "loss": 0.2132, "num_input_tokens_seen": 51706208, "step": 76705 }, { "epoch": 1.8740380621992037, "grad_norm": 0.19949565827846527, "learning_rate": 1.5741452307356842e-06, "loss": 0.0495, "num_input_tokens_seen": 51709408, "step": 76710 }, { "epoch": 1.8741602130310508, "grad_norm": 0.18503406643867493, "learning_rate": 1.574075407317128e-06, "loss": 0.0024, "num_input_tokens_seen": 51712864, "step": 76715 }, { "epoch": 1.874282363862898, "grad_norm": 0.4176699221134186, "learning_rate": 1.5740055797237408e-06, "loss": 0.0219, "num_input_tokens_seen": 51715936, "step": 76720 }, { "epoch": 1.8744045146947452, "grad_norm": 143.21966552734375, "learning_rate": 1.573935747956031e-06, "loss": 0.0642, "num_input_tokens_seen": 51719136, "step": 76725 }, { "epoch": 1.8745266655265922, "grad_norm": 0.2589048743247986, "learning_rate": 1.5738659120145057e-06, "loss": 0.1189, "num_input_tokens_seen": 51722592, "step": 76730 }, { "epoch": 1.8746488163584394, "grad_norm": 0.07927648723125458, "learning_rate": 1.5737960718996734e-06, "loss": 0.1064, "num_input_tokens_seen": 51725792, "step": 76735 }, { "epoch": 1.8747709671902866, "grad_norm": 13.073819160461426, "learning_rate": 1.5737262276120417e-06, "loss": 0.0906, "num_input_tokens_seen": 51729696, "step": 76740 }, { "epoch": 1.8748931180221338, "grad_norm": 0.2299795001745224, "learning_rate": 1.5736563791521188e-06, "loss": 0.0356, "num_input_tokens_seen": 51733024, "step": 76745 }, { "epoch": 1.8750152688539807, "grad_norm": 0.06773748993873596, "learning_rate": 1.5735865265204118e-06, "loss": 0.0008, "num_input_tokens_seen": 51737312, "step": 76750 }, { "epoch": 1.875137419685828, "grad_norm": 0.18835890293121338, "learning_rate": 1.5735166697174296e-06, "loss": 0.1585, "num_input_tokens_seen": 51740448, "step": 76755 }, { "epoch": 1.8752595705176751, "grad_norm": 0.07563555985689163, "learning_rate": 1.5734468087436801e-06, "loss": 0.0831, "num_input_tokens_seen": 51743712, "step": 76760 }, { "epoch": 1.8753817213495223, "grad_norm": 3.672663927078247, "learning_rate": 1.573376943599671e-06, "loss": 0.0483, "num_input_tokens_seen": 51747360, "step": 76765 }, { "epoch": 1.8755038721813695, "grad_norm": 0.13904969394207, "learning_rate": 1.5733070742859105e-06, "loss": 0.0009, "num_input_tokens_seen": 51750560, "step": 76770 }, { "epoch": 1.8756260230132167, "grad_norm": 0.7363821864128113, "learning_rate": 1.5732372008029069e-06, "loss": 0.0011, "num_input_tokens_seen": 51754144, "step": 76775 }, { "epoch": 1.8757481738450639, "grad_norm": 0.575900673866272, "learning_rate": 1.5731673231511683e-06, "loss": 0.0656, "num_input_tokens_seen": 51757728, "step": 76780 }, { "epoch": 1.875870324676911, "grad_norm": 0.02883243001997471, "learning_rate": 1.5730974413312023e-06, "loss": 0.1484, "num_input_tokens_seen": 51761440, "step": 76785 }, { "epoch": 1.8759924755087583, "grad_norm": 25.203136444091797, "learning_rate": 1.573027555343518e-06, "loss": 0.0709, "num_input_tokens_seen": 51765088, "step": 76790 }, { "epoch": 1.8761146263406054, "grad_norm": 0.15883779525756836, "learning_rate": 1.5729576651886229e-06, "loss": 0.0387, "num_input_tokens_seen": 51769056, "step": 76795 }, { "epoch": 1.8762367771724526, "grad_norm": 17.070417404174805, "learning_rate": 1.5728877708670258e-06, "loss": 0.117, "num_input_tokens_seen": 51772384, "step": 76800 }, { "epoch": 1.8763589280042998, "grad_norm": 3.9865686893463135, "learning_rate": 1.5728178723792347e-06, "loss": 0.0016, "num_input_tokens_seen": 51775520, "step": 76805 }, { "epoch": 1.876481078836147, "grad_norm": 47.375892639160156, "learning_rate": 1.5727479697257578e-06, "loss": 0.2674, "num_input_tokens_seen": 51778720, "step": 76810 }, { "epoch": 1.8766032296679942, "grad_norm": 8.242105484008789, "learning_rate": 1.5726780629071037e-06, "loss": 0.1556, "num_input_tokens_seen": 51781920, "step": 76815 }, { "epoch": 1.8767253804998412, "grad_norm": 0.23152075707912445, "learning_rate": 1.572608151923781e-06, "loss": 0.0942, "num_input_tokens_seen": 51784992, "step": 76820 }, { "epoch": 1.8768475313316884, "grad_norm": 0.5574703216552734, "learning_rate": 1.5725382367762972e-06, "loss": 0.0206, "num_input_tokens_seen": 51788576, "step": 76825 }, { "epoch": 1.8769696821635355, "grad_norm": 136.82997131347656, "learning_rate": 1.5724683174651616e-06, "loss": 0.021, "num_input_tokens_seen": 51791968, "step": 76830 }, { "epoch": 1.8770918329953827, "grad_norm": 0.15793973207473755, "learning_rate": 1.5723983939908826e-06, "loss": 0.0357, "num_input_tokens_seen": 51795616, "step": 76835 }, { "epoch": 1.8772139838272297, "grad_norm": 0.1076064333319664, "learning_rate": 1.5723284663539684e-06, "loss": 0.0541, "num_input_tokens_seen": 51798688, "step": 76840 }, { "epoch": 1.877336134659077, "grad_norm": 0.09073779731988907, "learning_rate": 1.5722585345549276e-06, "loss": 0.2528, "num_input_tokens_seen": 51801632, "step": 76845 }, { "epoch": 1.877458285490924, "grad_norm": 0.10438819974660873, "learning_rate": 1.5721885985942689e-06, "loss": 0.0017, "num_input_tokens_seen": 51805024, "step": 76850 }, { "epoch": 1.8775804363227713, "grad_norm": 0.3387993574142456, "learning_rate": 1.5721186584725007e-06, "loss": 0.0908, "num_input_tokens_seen": 51808224, "step": 76855 }, { "epoch": 1.8777025871546185, "grad_norm": 0.10911522060632706, "learning_rate": 1.572048714190132e-06, "loss": 0.0377, "num_input_tokens_seen": 51812256, "step": 76860 }, { "epoch": 1.8778247379864657, "grad_norm": 0.5376071929931641, "learning_rate": 1.571978765747671e-06, "loss": 0.1122, "num_input_tokens_seen": 51815584, "step": 76865 }, { "epoch": 1.8779468888183128, "grad_norm": 20.533918380737305, "learning_rate": 1.5719088131456264e-06, "loss": 0.1408, "num_input_tokens_seen": 51818848, "step": 76870 }, { "epoch": 1.87806903965016, "grad_norm": 14.685124397277832, "learning_rate": 1.5718388563845073e-06, "loss": 0.0538, "num_input_tokens_seen": 51822368, "step": 76875 }, { "epoch": 1.8781911904820072, "grad_norm": 0.15065699815750122, "learning_rate": 1.5717688954648223e-06, "loss": 0.0606, "num_input_tokens_seen": 51826016, "step": 76880 }, { "epoch": 1.8783133413138544, "grad_norm": 0.8592997193336487, "learning_rate": 1.5716989303870797e-06, "loss": 0.093, "num_input_tokens_seen": 51829856, "step": 76885 }, { "epoch": 1.8784354921457016, "grad_norm": 0.5073649287223816, "learning_rate": 1.5716289611517892e-06, "loss": 0.0452, "num_input_tokens_seen": 51833056, "step": 76890 }, { "epoch": 1.8785576429775488, "grad_norm": 176.64024353027344, "learning_rate": 1.571558987759459e-06, "loss": 0.0483, "num_input_tokens_seen": 51836704, "step": 76895 }, { "epoch": 1.878679793809396, "grad_norm": 18.90421485900879, "learning_rate": 1.5714890102105983e-06, "loss": 0.1351, "num_input_tokens_seen": 51839392, "step": 76900 }, { "epoch": 1.8788019446412432, "grad_norm": 0.08553940802812576, "learning_rate": 1.5714190285057152e-06, "loss": 0.0565, "num_input_tokens_seen": 51843296, "step": 76905 }, { "epoch": 1.8789240954730901, "grad_norm": 233.33682250976562, "learning_rate": 1.5713490426453198e-06, "loss": 0.009, "num_input_tokens_seen": 51846624, "step": 76910 }, { "epoch": 1.8790462463049373, "grad_norm": 130.19361877441406, "learning_rate": 1.5712790526299203e-06, "loss": 0.2575, "num_input_tokens_seen": 51849888, "step": 76915 }, { "epoch": 1.8791683971367845, "grad_norm": 1.1826436519622803, "learning_rate": 1.5712090584600256e-06, "loss": 0.1942, "num_input_tokens_seen": 51853728, "step": 76920 }, { "epoch": 1.8792905479686317, "grad_norm": 0.44003385305404663, "learning_rate": 1.5711390601361454e-06, "loss": 0.0867, "num_input_tokens_seen": 51857440, "step": 76925 }, { "epoch": 1.8794126988004787, "grad_norm": 0.6665119528770447, "learning_rate": 1.5710690576587883e-06, "loss": 0.0122, "num_input_tokens_seen": 51860960, "step": 76930 }, { "epoch": 1.8795348496323259, "grad_norm": 0.19465108215808868, "learning_rate": 1.5709990510284632e-06, "loss": 0.0533, "num_input_tokens_seen": 51864480, "step": 76935 }, { "epoch": 1.879657000464173, "grad_norm": 0.41062381863594055, "learning_rate": 1.5709290402456795e-06, "loss": 0.0375, "num_input_tokens_seen": 51867552, "step": 76940 }, { "epoch": 1.8797791512960202, "grad_norm": 0.5571926832199097, "learning_rate": 1.5708590253109462e-06, "loss": 0.041, "num_input_tokens_seen": 51870816, "step": 76945 }, { "epoch": 1.8799013021278674, "grad_norm": 2.615623712539673, "learning_rate": 1.5707890062247727e-06, "loss": 0.0976, "num_input_tokens_seen": 51874080, "step": 76950 }, { "epoch": 1.8800234529597146, "grad_norm": 0.13177724182605743, "learning_rate": 1.5707189829876678e-06, "loss": 0.0668, "num_input_tokens_seen": 51877600, "step": 76955 }, { "epoch": 1.8801456037915618, "grad_norm": 0.5526846647262573, "learning_rate": 1.5706489556001411e-06, "loss": 0.0864, "num_input_tokens_seen": 51881376, "step": 76960 }, { "epoch": 1.880267754623409, "grad_norm": 0.015390805900096893, "learning_rate": 1.5705789240627017e-06, "loss": 0.106, "num_input_tokens_seen": 51884576, "step": 76965 }, { "epoch": 1.8803899054552562, "grad_norm": 0.27080556750297546, "learning_rate": 1.570508888375859e-06, "loss": 0.0831, "num_input_tokens_seen": 51887776, "step": 76970 }, { "epoch": 1.8805120562871034, "grad_norm": 0.2317759245634079, "learning_rate": 1.5704388485401221e-06, "loss": 0.0864, "num_input_tokens_seen": 51890976, "step": 76975 }, { "epoch": 1.8806342071189506, "grad_norm": 0.05991874635219574, "learning_rate": 1.5703688045560004e-06, "loss": 0.0714, "num_input_tokens_seen": 51894368, "step": 76980 }, { "epoch": 1.8807563579507978, "grad_norm": 181.63429260253906, "learning_rate": 1.5702987564240035e-06, "loss": 0.0956, "num_input_tokens_seen": 51897824, "step": 76985 }, { "epoch": 1.880878508782645, "grad_norm": 13.823509216308594, "learning_rate": 1.5702287041446406e-06, "loss": 0.0912, "num_input_tokens_seen": 51901280, "step": 76990 }, { "epoch": 1.881000659614492, "grad_norm": 34.22384262084961, "learning_rate": 1.5701586477184212e-06, "loss": 0.1558, "num_input_tokens_seen": 51904800, "step": 76995 }, { "epoch": 1.881122810446339, "grad_norm": 15.785733222961426, "learning_rate": 1.5700885871458546e-06, "loss": 0.258, "num_input_tokens_seen": 51908192, "step": 77000 }, { "epoch": 1.8812449612781863, "grad_norm": 0.17888937890529633, "learning_rate": 1.5700185224274504e-06, "loss": 0.085, "num_input_tokens_seen": 51911712, "step": 77005 }, { "epoch": 1.8813671121100335, "grad_norm": 2.345381736755371, "learning_rate": 1.5699484535637183e-06, "loss": 0.0468, "num_input_tokens_seen": 51915104, "step": 77010 }, { "epoch": 1.8814892629418807, "grad_norm": 16.388687133789062, "learning_rate": 1.5698783805551682e-06, "loss": 0.1079, "num_input_tokens_seen": 51918176, "step": 77015 }, { "epoch": 1.8816114137737276, "grad_norm": 0.1745808720588684, "learning_rate": 1.5698083034023086e-06, "loss": 0.1028, "num_input_tokens_seen": 51921184, "step": 77020 }, { "epoch": 1.8817335646055748, "grad_norm": 2.841417074203491, "learning_rate": 1.5697382221056501e-06, "loss": 0.0429, "num_input_tokens_seen": 51925216, "step": 77025 }, { "epoch": 1.881855715437422, "grad_norm": 55.37129592895508, "learning_rate": 1.5696681366657018e-06, "loss": 0.0533, "num_input_tokens_seen": 51928224, "step": 77030 }, { "epoch": 1.8819778662692692, "grad_norm": 76.16033935546875, "learning_rate": 1.5695980470829736e-06, "loss": 0.0829, "num_input_tokens_seen": 51931872, "step": 77035 }, { "epoch": 1.8821000171011164, "grad_norm": 0.11781585216522217, "learning_rate": 1.5695279533579754e-06, "loss": 0.0499, "num_input_tokens_seen": 51935136, "step": 77040 }, { "epoch": 1.8822221679329636, "grad_norm": 4.420412063598633, "learning_rate": 1.5694578554912167e-06, "loss": 0.0416, "num_input_tokens_seen": 51938592, "step": 77045 }, { "epoch": 1.8823443187648108, "grad_norm": 10.299237251281738, "learning_rate": 1.5693877534832072e-06, "loss": 0.1148, "num_input_tokens_seen": 51942048, "step": 77050 }, { "epoch": 1.882466469596658, "grad_norm": 0.2998434901237488, "learning_rate": 1.569317647334457e-06, "loss": 0.0878, "num_input_tokens_seen": 51945376, "step": 77055 }, { "epoch": 1.8825886204285052, "grad_norm": 0.10223901271820068, "learning_rate": 1.5692475370454754e-06, "loss": 0.0298, "num_input_tokens_seen": 51948512, "step": 77060 }, { "epoch": 1.8827107712603524, "grad_norm": 0.3003741502761841, "learning_rate": 1.569177422616773e-06, "loss": 0.3041, "num_input_tokens_seen": 51951840, "step": 77065 }, { "epoch": 1.8828329220921995, "grad_norm": 3.530810832977295, "learning_rate": 1.569107304048859e-06, "loss": 0.0568, "num_input_tokens_seen": 51955168, "step": 77070 }, { "epoch": 1.8829550729240467, "grad_norm": 0.7776597738265991, "learning_rate": 1.5690371813422437e-06, "loss": 0.0458, "num_input_tokens_seen": 51958624, "step": 77075 }, { "epoch": 1.883077223755894, "grad_norm": 76.65692138671875, "learning_rate": 1.5689670544974369e-06, "loss": 0.1906, "num_input_tokens_seen": 51962080, "step": 77080 }, { "epoch": 1.883199374587741, "grad_norm": 0.09555058926343918, "learning_rate": 1.5688969235149487e-06, "loss": 0.0247, "num_input_tokens_seen": 51965472, "step": 77085 }, { "epoch": 1.883321525419588, "grad_norm": 0.4007159471511841, "learning_rate": 1.568826788395289e-06, "loss": 0.0012, "num_input_tokens_seen": 51968928, "step": 77090 }, { "epoch": 1.8834436762514353, "grad_norm": 0.09671560674905777, "learning_rate": 1.568756649138968e-06, "loss": 0.0317, "num_input_tokens_seen": 51972000, "step": 77095 }, { "epoch": 1.8835658270832825, "grad_norm": 9.398812294006348, "learning_rate": 1.5686865057464958e-06, "loss": 0.0484, "num_input_tokens_seen": 51975648, "step": 77100 }, { "epoch": 1.8836879779151297, "grad_norm": 17.455774307250977, "learning_rate": 1.568616358218382e-06, "loss": 0.1971, "num_input_tokens_seen": 51978528, "step": 77105 }, { "epoch": 1.8838101287469766, "grad_norm": 11.775774955749512, "learning_rate": 1.5685462065551373e-06, "loss": 0.183, "num_input_tokens_seen": 51981728, "step": 77110 }, { "epoch": 1.8839322795788238, "grad_norm": 0.1156628429889679, "learning_rate": 1.5684760507572716e-06, "loss": 0.0013, "num_input_tokens_seen": 51985184, "step": 77115 }, { "epoch": 1.884054430410671, "grad_norm": 66.80492401123047, "learning_rate": 1.5684058908252952e-06, "loss": 0.0107, "num_input_tokens_seen": 51989344, "step": 77120 }, { "epoch": 1.8841765812425182, "grad_norm": 33.782405853271484, "learning_rate": 1.5683357267597183e-06, "loss": 0.0973, "num_input_tokens_seen": 51992736, "step": 77125 }, { "epoch": 1.8842987320743654, "grad_norm": 0.2235114723443985, "learning_rate": 1.5682655585610514e-06, "loss": 0.0648, "num_input_tokens_seen": 51996064, "step": 77130 }, { "epoch": 1.8844208829062126, "grad_norm": 39.23573684692383, "learning_rate": 1.5681953862298043e-06, "loss": 0.1465, "num_input_tokens_seen": 51999712, "step": 77135 }, { "epoch": 1.8845430337380598, "grad_norm": 19.437856674194336, "learning_rate": 1.5681252097664875e-06, "loss": 0.036, "num_input_tokens_seen": 52003360, "step": 77140 }, { "epoch": 1.884665184569907, "grad_norm": 0.020282620564103127, "learning_rate": 1.5680550291716113e-06, "loss": 0.0753, "num_input_tokens_seen": 52006816, "step": 77145 }, { "epoch": 1.8847873354017541, "grad_norm": 1.5308507680892944, "learning_rate": 1.5679848444456862e-06, "loss": 0.0653, "num_input_tokens_seen": 52010208, "step": 77150 }, { "epoch": 1.8849094862336013, "grad_norm": 17.294179916381836, "learning_rate": 1.5679146555892223e-06, "loss": 0.1015, "num_input_tokens_seen": 52013472, "step": 77155 }, { "epoch": 1.8850316370654485, "grad_norm": 0.08894924819469452, "learning_rate": 1.5678444626027308e-06, "loss": 0.0014, "num_input_tokens_seen": 52016736, "step": 77160 }, { "epoch": 1.8851537878972957, "grad_norm": 0.4388797879219055, "learning_rate": 1.567774265486721e-06, "loss": 0.1694, "num_input_tokens_seen": 52020640, "step": 77165 }, { "epoch": 1.885275938729143, "grad_norm": 32.14431381225586, "learning_rate": 1.5677040642417048e-06, "loss": 0.0343, "num_input_tokens_seen": 52024032, "step": 77170 }, { "epoch": 1.8853980895609899, "grad_norm": 21.51976776123047, "learning_rate": 1.5676338588681914e-06, "loss": 0.1471, "num_input_tokens_seen": 52027296, "step": 77175 }, { "epoch": 1.885520240392837, "grad_norm": 0.0837312787771225, "learning_rate": 1.567563649366692e-06, "loss": 0.1674, "num_input_tokens_seen": 52030496, "step": 77180 }, { "epoch": 1.8856423912246842, "grad_norm": 63.973487854003906, "learning_rate": 1.5674934357377168e-06, "loss": 0.1197, "num_input_tokens_seen": 52033568, "step": 77185 }, { "epoch": 1.8857645420565314, "grad_norm": 3.189739227294922, "learning_rate": 1.5674232179817773e-06, "loss": 0.0036, "num_input_tokens_seen": 52037344, "step": 77190 }, { "epoch": 1.8858866928883786, "grad_norm": 62.23170852661133, "learning_rate": 1.5673529960993832e-06, "loss": 0.1402, "num_input_tokens_seen": 52041120, "step": 77195 }, { "epoch": 1.8860088437202256, "grad_norm": 0.8425273299217224, "learning_rate": 1.5672827700910456e-06, "loss": 0.0463, "num_input_tokens_seen": 52044192, "step": 77200 }, { "epoch": 1.8861309945520728, "grad_norm": 23.748029708862305, "learning_rate": 1.5672125399572748e-06, "loss": 0.071, "num_input_tokens_seen": 52047840, "step": 77205 }, { "epoch": 1.88625314538392, "grad_norm": 0.06841769814491272, "learning_rate": 1.5671423056985824e-06, "loss": 0.0378, "num_input_tokens_seen": 52051296, "step": 77210 }, { "epoch": 1.8863752962157672, "grad_norm": 0.13464903831481934, "learning_rate": 1.5670720673154783e-06, "loss": 0.0576, "num_input_tokens_seen": 52055456, "step": 77215 }, { "epoch": 1.8864974470476144, "grad_norm": 0.1433732807636261, "learning_rate": 1.5670018248084735e-06, "loss": 0.0263, "num_input_tokens_seen": 52058592, "step": 77220 }, { "epoch": 1.8866195978794615, "grad_norm": 0.3391604423522949, "learning_rate": 1.566931578178079e-06, "loss": 0.1117, "num_input_tokens_seen": 52061856, "step": 77225 }, { "epoch": 1.8867417487113087, "grad_norm": 0.9799418449401855, "learning_rate": 1.5668613274248056e-06, "loss": 0.0423, "num_input_tokens_seen": 52065504, "step": 77230 }, { "epoch": 1.886863899543156, "grad_norm": 21.551013946533203, "learning_rate": 1.5667910725491645e-06, "loss": 0.2192, "num_input_tokens_seen": 52069024, "step": 77235 }, { "epoch": 1.886986050375003, "grad_norm": 84.48568725585938, "learning_rate": 1.5667208135516658e-06, "loss": 0.0861, "num_input_tokens_seen": 52073376, "step": 77240 }, { "epoch": 1.8871082012068503, "grad_norm": 0.10802389681339264, "learning_rate": 1.566650550432821e-06, "loss": 0.0812, "num_input_tokens_seen": 52076256, "step": 77245 }, { "epoch": 1.8872303520386975, "grad_norm": 28.95943260192871, "learning_rate": 1.5665802831931412e-06, "loss": 0.1082, "num_input_tokens_seen": 52079456, "step": 77250 }, { "epoch": 1.8873525028705447, "grad_norm": 5.773714542388916, "learning_rate": 1.5665100118331371e-06, "loss": 0.0032, "num_input_tokens_seen": 52082528, "step": 77255 }, { "epoch": 1.8874746537023919, "grad_norm": 0.03499281033873558, "learning_rate": 1.5664397363533198e-06, "loss": 0.0648, "num_input_tokens_seen": 52085792, "step": 77260 }, { "epoch": 1.8875968045342388, "grad_norm": 0.3379361033439636, "learning_rate": 1.5663694567542004e-06, "loss": 0.167, "num_input_tokens_seen": 52089184, "step": 77265 }, { "epoch": 1.887718955366086, "grad_norm": 0.5242000222206116, "learning_rate": 1.5662991730362899e-06, "loss": 0.0372, "num_input_tokens_seen": 52092448, "step": 77270 }, { "epoch": 1.8878411061979332, "grad_norm": 29.624061584472656, "learning_rate": 1.5662288852000995e-06, "loss": 0.0688, "num_input_tokens_seen": 52096096, "step": 77275 }, { "epoch": 1.8879632570297804, "grad_norm": 0.6497032642364502, "learning_rate": 1.5661585932461403e-06, "loss": 0.0309, "num_input_tokens_seen": 52099488, "step": 77280 }, { "epoch": 1.8880854078616274, "grad_norm": 8.505046844482422, "learning_rate": 1.5660882971749237e-06, "loss": 0.0414, "num_input_tokens_seen": 52103008, "step": 77285 }, { "epoch": 1.8882075586934746, "grad_norm": 0.12121704965829849, "learning_rate": 1.5660179969869604e-06, "loss": 0.0011, "num_input_tokens_seen": 52106464, "step": 77290 }, { "epoch": 1.8883297095253218, "grad_norm": 0.01899469457566738, "learning_rate": 1.5659476926827625e-06, "loss": 0.0005, "num_input_tokens_seen": 52109856, "step": 77295 }, { "epoch": 1.888451860357169, "grad_norm": 0.14597168564796448, "learning_rate": 1.5658773842628405e-06, "loss": 0.1511, "num_input_tokens_seen": 52113056, "step": 77300 }, { "epoch": 1.8885740111890161, "grad_norm": 30.08360481262207, "learning_rate": 1.565807071727706e-06, "loss": 0.0981, "num_input_tokens_seen": 52116512, "step": 77305 }, { "epoch": 1.8886961620208633, "grad_norm": 79.73748016357422, "learning_rate": 1.5657367550778702e-06, "loss": 0.1299, "num_input_tokens_seen": 52119968, "step": 77310 }, { "epoch": 1.8888183128527105, "grad_norm": 26.3062801361084, "learning_rate": 1.5656664343138447e-06, "loss": 0.1116, "num_input_tokens_seen": 52123232, "step": 77315 }, { "epoch": 1.8889404636845577, "grad_norm": 0.47073596715927124, "learning_rate": 1.5655961094361403e-06, "loss": 0.0592, "num_input_tokens_seen": 52126496, "step": 77320 }, { "epoch": 1.889062614516405, "grad_norm": 13.913512229919434, "learning_rate": 1.5655257804452696e-06, "loss": 0.1957, "num_input_tokens_seen": 52129952, "step": 77325 }, { "epoch": 1.889184765348252, "grad_norm": 1.774534821510315, "learning_rate": 1.5654554473417428e-06, "loss": 0.1567, "num_input_tokens_seen": 52133344, "step": 77330 }, { "epoch": 1.8893069161800993, "grad_norm": 30.571760177612305, "learning_rate": 1.565385110126072e-06, "loss": 0.0769, "num_input_tokens_seen": 52136480, "step": 77335 }, { "epoch": 1.8894290670119465, "grad_norm": 0.2755120098590851, "learning_rate": 1.5653147687987684e-06, "loss": 0.0255, "num_input_tokens_seen": 52139616, "step": 77340 }, { "epoch": 1.8895512178437937, "grad_norm": 0.5474405884742737, "learning_rate": 1.565244423360344e-06, "loss": 0.0013, "num_input_tokens_seen": 52142816, "step": 77345 }, { "epoch": 1.8896733686756408, "grad_norm": 112.4779052734375, "learning_rate": 1.5651740738113101e-06, "loss": 0.0546, "num_input_tokens_seen": 52145760, "step": 77350 }, { "epoch": 1.8897955195074878, "grad_norm": 5.316534519195557, "learning_rate": 1.5651037201521784e-06, "loss": 0.0848, "num_input_tokens_seen": 52149024, "step": 77355 }, { "epoch": 1.889917670339335, "grad_norm": 0.46590232849121094, "learning_rate": 1.5650333623834607e-06, "loss": 0.0013, "num_input_tokens_seen": 52152992, "step": 77360 }, { "epoch": 1.8900398211711822, "grad_norm": 12.223104476928711, "learning_rate": 1.564963000505668e-06, "loss": 0.1025, "num_input_tokens_seen": 52156256, "step": 77365 }, { "epoch": 1.8901619720030294, "grad_norm": 0.6928628087043762, "learning_rate": 1.5648926345193123e-06, "loss": 0.0752, "num_input_tokens_seen": 52159712, "step": 77370 }, { "epoch": 1.8902841228348763, "grad_norm": 41.46940994262695, "learning_rate": 1.564822264424906e-06, "loss": 0.1883, "num_input_tokens_seen": 52162592, "step": 77375 }, { "epoch": 1.8904062736667235, "grad_norm": 15.121652603149414, "learning_rate": 1.5647518902229594e-06, "loss": 0.0896, "num_input_tokens_seen": 52166112, "step": 77380 }, { "epoch": 1.8905284244985707, "grad_norm": 26.036794662475586, "learning_rate": 1.564681511913986e-06, "loss": 0.0747, "num_input_tokens_seen": 52169888, "step": 77385 }, { "epoch": 1.890650575330418, "grad_norm": 0.16673624515533447, "learning_rate": 1.5646111294984963e-06, "loss": 0.1649, "num_input_tokens_seen": 52173280, "step": 77390 }, { "epoch": 1.890772726162265, "grad_norm": 0.116355299949646, "learning_rate": 1.5645407429770025e-06, "loss": 0.0928, "num_input_tokens_seen": 52176800, "step": 77395 }, { "epoch": 1.8908948769941123, "grad_norm": 0.3401149809360504, "learning_rate": 1.564470352350017e-06, "loss": 0.0702, "num_input_tokens_seen": 52180000, "step": 77400 }, { "epoch": 1.8910170278259595, "grad_norm": 0.05878855660557747, "learning_rate": 1.5643999576180509e-06, "loss": 0.0429, "num_input_tokens_seen": 52183392, "step": 77405 }, { "epoch": 1.8911391786578067, "grad_norm": 8.723549842834473, "learning_rate": 1.5643295587816167e-06, "loss": 0.143, "num_input_tokens_seen": 52186720, "step": 77410 }, { "epoch": 1.8912613294896539, "grad_norm": 123.5528564453125, "learning_rate": 1.5642591558412263e-06, "loss": 0.0799, "num_input_tokens_seen": 52189728, "step": 77415 }, { "epoch": 1.891383480321501, "grad_norm": 0.3717953562736511, "learning_rate": 1.5641887487973914e-06, "loss": 0.0017, "num_input_tokens_seen": 52193248, "step": 77420 }, { "epoch": 1.8915056311533482, "grad_norm": 0.2517523765563965, "learning_rate": 1.564118337650624e-06, "loss": 0.0022, "num_input_tokens_seen": 52196512, "step": 77425 }, { "epoch": 1.8916277819851954, "grad_norm": 0.0997331365942955, "learning_rate": 1.5640479224014364e-06, "loss": 0.0443, "num_input_tokens_seen": 52199648, "step": 77430 }, { "epoch": 1.8917499328170426, "grad_norm": 0.14800933003425598, "learning_rate": 1.5639775030503409e-06, "loss": 0.0391, "num_input_tokens_seen": 52203104, "step": 77435 }, { "epoch": 1.8918720836488898, "grad_norm": 0.1393311619758606, "learning_rate": 1.5639070795978491e-06, "loss": 0.0855, "num_input_tokens_seen": 52206688, "step": 77440 }, { "epoch": 1.8919942344807368, "grad_norm": 24.84368133544922, "learning_rate": 1.5638366520444732e-06, "loss": 0.1721, "num_input_tokens_seen": 52210976, "step": 77445 }, { "epoch": 1.892116385312584, "grad_norm": 32.104915618896484, "learning_rate": 1.5637662203907255e-06, "loss": 0.0824, "num_input_tokens_seen": 52214240, "step": 77450 }, { "epoch": 1.8922385361444312, "grad_norm": 0.10727355629205704, "learning_rate": 1.5636957846371184e-06, "loss": 0.0363, "num_input_tokens_seen": 52217184, "step": 77455 }, { "epoch": 1.8923606869762784, "grad_norm": 0.18539845943450928, "learning_rate": 1.563625344784164e-06, "loss": 0.0547, "num_input_tokens_seen": 52220384, "step": 77460 }, { "epoch": 1.8924828378081253, "grad_norm": 9.1328763961792, "learning_rate": 1.5635549008323742e-06, "loss": 0.166, "num_input_tokens_seen": 52223584, "step": 77465 }, { "epoch": 1.8926049886399725, "grad_norm": 15.195426940917969, "learning_rate": 1.5634844527822617e-06, "loss": 0.0522, "num_input_tokens_seen": 52226592, "step": 77470 }, { "epoch": 1.8927271394718197, "grad_norm": 0.5996956825256348, "learning_rate": 1.563414000634339e-06, "loss": 0.0969, "num_input_tokens_seen": 52229664, "step": 77475 }, { "epoch": 1.8928492903036669, "grad_norm": 1.2738440036773682, "learning_rate": 1.563343544389118e-06, "loss": 0.0367, "num_input_tokens_seen": 52232800, "step": 77480 }, { "epoch": 1.892971441135514, "grad_norm": 0.11556824296712875, "learning_rate": 1.563273084047111e-06, "loss": 0.0511, "num_input_tokens_seen": 52236320, "step": 77485 }, { "epoch": 1.8930935919673613, "grad_norm": 0.18463782966136932, "learning_rate": 1.5632026196088308e-06, "loss": 0.0613, "num_input_tokens_seen": 52240032, "step": 77490 }, { "epoch": 1.8932157427992085, "grad_norm": 0.044787704944610596, "learning_rate": 1.5631321510747894e-06, "loss": 0.0434, "num_input_tokens_seen": 52243424, "step": 77495 }, { "epoch": 1.8933378936310556, "grad_norm": 11.942049980163574, "learning_rate": 1.5630616784455e-06, "loss": 0.1223, "num_input_tokens_seen": 52246880, "step": 77500 }, { "epoch": 1.8934600444629028, "grad_norm": 0.16144710779190063, "learning_rate": 1.5629912017214744e-06, "loss": 0.0921, "num_input_tokens_seen": 52250336, "step": 77505 }, { "epoch": 1.89358219529475, "grad_norm": 16.349254608154297, "learning_rate": 1.5629207209032252e-06, "loss": 0.1228, "num_input_tokens_seen": 52253792, "step": 77510 }, { "epoch": 1.8937043461265972, "grad_norm": 38.79521179199219, "learning_rate": 1.5628502359912652e-06, "loss": 0.164, "num_input_tokens_seen": 52257184, "step": 77515 }, { "epoch": 1.8938264969584444, "grad_norm": 0.5846835374832153, "learning_rate": 1.562779746986107e-06, "loss": 0.1079, "num_input_tokens_seen": 52260576, "step": 77520 }, { "epoch": 1.8939486477902916, "grad_norm": 7.383497714996338, "learning_rate": 1.5627092538882632e-06, "loss": 0.0671, "num_input_tokens_seen": 52264352, "step": 77525 }, { "epoch": 1.8940707986221386, "grad_norm": 15.111196517944336, "learning_rate": 1.562638756698246e-06, "loss": 0.0792, "num_input_tokens_seen": 52267808, "step": 77530 }, { "epoch": 1.8941929494539858, "grad_norm": 0.34534600377082825, "learning_rate": 1.562568255416569e-06, "loss": 0.1215, "num_input_tokens_seen": 52271520, "step": 77535 }, { "epoch": 1.894315100285833, "grad_norm": 0.1826377511024475, "learning_rate": 1.5624977500437437e-06, "loss": 0.0386, "num_input_tokens_seen": 52274656, "step": 77540 }, { "epoch": 1.8944372511176801, "grad_norm": 0.25956910848617554, "learning_rate": 1.5624272405802838e-06, "loss": 0.0021, "num_input_tokens_seen": 52277792, "step": 77545 }, { "epoch": 1.8945594019495273, "grad_norm": 0.15048423409461975, "learning_rate": 1.5623567270267018e-06, "loss": 0.0528, "num_input_tokens_seen": 52281120, "step": 77550 }, { "epoch": 1.8946815527813743, "grad_norm": 0.8038405179977417, "learning_rate": 1.5622862093835102e-06, "loss": 0.026, "num_input_tokens_seen": 52284448, "step": 77555 }, { "epoch": 1.8948037036132215, "grad_norm": 0.24553032219409943, "learning_rate": 1.5622156876512223e-06, "loss": 0.0388, "num_input_tokens_seen": 52287456, "step": 77560 }, { "epoch": 1.8949258544450687, "grad_norm": 0.2718951106071472, "learning_rate": 1.5621451618303505e-06, "loss": 0.0014, "num_input_tokens_seen": 52290592, "step": 77565 }, { "epoch": 1.8950480052769159, "grad_norm": 75.5166244506836, "learning_rate": 1.5620746319214078e-06, "loss": 0.1395, "num_input_tokens_seen": 52294112, "step": 77570 }, { "epoch": 1.895170156108763, "grad_norm": 13.819537162780762, "learning_rate": 1.5620040979249074e-06, "loss": 0.0677, "num_input_tokens_seen": 52297184, "step": 77575 }, { "epoch": 1.8952923069406102, "grad_norm": 0.12002184242010117, "learning_rate": 1.561933559841362e-06, "loss": 0.0555, "num_input_tokens_seen": 52301152, "step": 77580 }, { "epoch": 1.8954144577724574, "grad_norm": 21.226789474487305, "learning_rate": 1.5618630176712846e-06, "loss": 0.231, "num_input_tokens_seen": 52304416, "step": 77585 }, { "epoch": 1.8955366086043046, "grad_norm": 0.043165311217308044, "learning_rate": 1.561792471415188e-06, "loss": 0.1493, "num_input_tokens_seen": 52308000, "step": 77590 }, { "epoch": 1.8956587594361518, "grad_norm": 0.0803232491016388, "learning_rate": 1.5617219210735858e-06, "loss": 0.1698, "num_input_tokens_seen": 52311200, "step": 77595 }, { "epoch": 1.895780910267999, "grad_norm": 0.17685692012310028, "learning_rate": 1.5616513666469904e-06, "loss": 0.0042, "num_input_tokens_seen": 52314976, "step": 77600 }, { "epoch": 1.8959030610998462, "grad_norm": 8.60054874420166, "learning_rate": 1.5615808081359154e-06, "loss": 0.1115, "num_input_tokens_seen": 52317856, "step": 77605 }, { "epoch": 1.8960252119316934, "grad_norm": 8.479482650756836, "learning_rate": 1.5615102455408735e-06, "loss": 0.0836, "num_input_tokens_seen": 52321056, "step": 77610 }, { "epoch": 1.8961473627635406, "grad_norm": 0.04393242672085762, "learning_rate": 1.5614396788623786e-06, "loss": 0.0015, "num_input_tokens_seen": 52324256, "step": 77615 }, { "epoch": 1.8962695135953875, "grad_norm": 0.5366582274436951, "learning_rate": 1.5613691081009428e-06, "loss": 0.0791, "num_input_tokens_seen": 52327328, "step": 77620 }, { "epoch": 1.8963916644272347, "grad_norm": 39.71879196166992, "learning_rate": 1.56129853325708e-06, "loss": 0.0862, "num_input_tokens_seen": 52330528, "step": 77625 }, { "epoch": 1.896513815259082, "grad_norm": 2.7838211059570312, "learning_rate": 1.5612279543313033e-06, "loss": 0.0818, "num_input_tokens_seen": 52334880, "step": 77630 }, { "epoch": 1.896635966090929, "grad_norm": 8.047002792358398, "learning_rate": 1.561157371324126e-06, "loss": 0.0512, "num_input_tokens_seen": 52338720, "step": 77635 }, { "epoch": 1.8967581169227763, "grad_norm": 0.09864991903305054, "learning_rate": 1.5610867842360614e-06, "loss": 0.0703, "num_input_tokens_seen": 52341984, "step": 77640 }, { "epoch": 1.8968802677546233, "grad_norm": 7.230635166168213, "learning_rate": 1.5610161930676226e-06, "loss": 0.064, "num_input_tokens_seen": 52344928, "step": 77645 }, { "epoch": 1.8970024185864705, "grad_norm": 182.32345581054688, "learning_rate": 1.5609455978193232e-06, "loss": 0.0177, "num_input_tokens_seen": 52348448, "step": 77650 }, { "epoch": 1.8971245694183176, "grad_norm": 0.32675567269325256, "learning_rate": 1.5608749984916767e-06, "loss": 0.071, "num_input_tokens_seen": 52351648, "step": 77655 }, { "epoch": 1.8972467202501648, "grad_norm": 44.629539489746094, "learning_rate": 1.5608043950851964e-06, "loss": 0.0611, "num_input_tokens_seen": 52354912, "step": 77660 }, { "epoch": 1.897368871082012, "grad_norm": 0.3594166338443756, "learning_rate": 1.5607337876003954e-06, "loss": 0.0366, "num_input_tokens_seen": 52358112, "step": 77665 }, { "epoch": 1.8974910219138592, "grad_norm": 0.12800997495651245, "learning_rate": 1.5606631760377878e-06, "loss": 0.0577, "num_input_tokens_seen": 52360928, "step": 77670 }, { "epoch": 1.8976131727457064, "grad_norm": 0.0739443376660347, "learning_rate": 1.5605925603978866e-06, "loss": 0.044, "num_input_tokens_seen": 52364256, "step": 77675 }, { "epoch": 1.8977353235775536, "grad_norm": 14.31065845489502, "learning_rate": 1.5605219406812054e-06, "loss": 0.1288, "num_input_tokens_seen": 52367392, "step": 77680 }, { "epoch": 1.8978574744094008, "grad_norm": 13.595717430114746, "learning_rate": 1.5604513168882582e-06, "loss": 0.0254, "num_input_tokens_seen": 52370976, "step": 77685 }, { "epoch": 1.897979625241248, "grad_norm": 0.24001628160476685, "learning_rate": 1.560380689019558e-06, "loss": 0.1965, "num_input_tokens_seen": 52374240, "step": 77690 }, { "epoch": 1.8981017760730952, "grad_norm": 184.75628662109375, "learning_rate": 1.5603100570756192e-06, "loss": 0.0589, "num_input_tokens_seen": 52377248, "step": 77695 }, { "epoch": 1.8982239269049423, "grad_norm": 10.1908597946167, "learning_rate": 1.5602394210569544e-06, "loss": 0.1434, "num_input_tokens_seen": 52380320, "step": 77700 }, { "epoch": 1.8983460777367895, "grad_norm": 0.0293444711714983, "learning_rate": 1.560168780964078e-06, "loss": 0.0915, "num_input_tokens_seen": 52383904, "step": 77705 }, { "epoch": 1.8984682285686365, "grad_norm": 0.5846049785614014, "learning_rate": 1.5600981367975037e-06, "loss": 0.0484, "num_input_tokens_seen": 52387168, "step": 77710 }, { "epoch": 1.8985903794004837, "grad_norm": 1.261268138885498, "learning_rate": 1.5600274885577446e-06, "loss": 0.0511, "num_input_tokens_seen": 52389984, "step": 77715 }, { "epoch": 1.8987125302323309, "grad_norm": 0.9083055853843689, "learning_rate": 1.5599568362453158e-06, "loss": 0.0516, "num_input_tokens_seen": 52393056, "step": 77720 }, { "epoch": 1.898834681064178, "grad_norm": 0.023565126582980156, "learning_rate": 1.5598861798607297e-06, "loss": 0.0975, "num_input_tokens_seen": 52396512, "step": 77725 }, { "epoch": 1.8989568318960253, "grad_norm": 47.272361755371094, "learning_rate": 1.5598155194045007e-06, "loss": 0.1994, "num_input_tokens_seen": 52399840, "step": 77730 }, { "epoch": 1.8990789827278722, "grad_norm": 0.0744083821773529, "learning_rate": 1.559744854877143e-06, "loss": 0.0337, "num_input_tokens_seen": 52403360, "step": 77735 }, { "epoch": 1.8992011335597194, "grad_norm": 20.310832977294922, "learning_rate": 1.55967418627917e-06, "loss": 0.1266, "num_input_tokens_seen": 52407136, "step": 77740 }, { "epoch": 1.8993232843915666, "grad_norm": 20.577260971069336, "learning_rate": 1.5596035136110957e-06, "loss": 0.1016, "num_input_tokens_seen": 52410464, "step": 77745 }, { "epoch": 1.8994454352234138, "grad_norm": 4.613851547241211, "learning_rate": 1.559532836873434e-06, "loss": 0.0045, "num_input_tokens_seen": 52413728, "step": 77750 }, { "epoch": 1.899567586055261, "grad_norm": 53.65876770019531, "learning_rate": 1.5594621560666994e-06, "loss": 0.004, "num_input_tokens_seen": 52416800, "step": 77755 }, { "epoch": 1.8996897368871082, "grad_norm": 0.1271817535161972, "learning_rate": 1.5593914711914054e-06, "loss": 0.09, "num_input_tokens_seen": 52420064, "step": 77760 }, { "epoch": 1.8998118877189554, "grad_norm": 0.3270379900932312, "learning_rate": 1.5593207822480661e-06, "loss": 0.0992, "num_input_tokens_seen": 52423776, "step": 77765 }, { "epoch": 1.8999340385508026, "grad_norm": 23.36074447631836, "learning_rate": 1.5592500892371958e-06, "loss": 0.0913, "num_input_tokens_seen": 52427104, "step": 77770 }, { "epoch": 1.9000561893826498, "grad_norm": 0.04623517394065857, "learning_rate": 1.5591793921593079e-06, "loss": 0.0226, "num_input_tokens_seen": 52430048, "step": 77775 }, { "epoch": 1.900178340214497, "grad_norm": 0.12228715419769287, "learning_rate": 1.5591086910149174e-06, "loss": 0.144, "num_input_tokens_seen": 52433632, "step": 77780 }, { "epoch": 1.9003004910463441, "grad_norm": 203.8829345703125, "learning_rate": 1.5590379858045384e-06, "loss": 0.0424, "num_input_tokens_seen": 52436896, "step": 77785 }, { "epoch": 1.9004226418781913, "grad_norm": 36.67129135131836, "learning_rate": 1.5589672765286846e-06, "loss": 0.1, "num_input_tokens_seen": 52440096, "step": 77790 }, { "epoch": 1.9005447927100385, "grad_norm": 0.8187667727470398, "learning_rate": 1.5588965631878704e-06, "loss": 0.0882, "num_input_tokens_seen": 52443552, "step": 77795 }, { "epoch": 1.9006669435418855, "grad_norm": 0.08393049985170364, "learning_rate": 1.5588258457826098e-06, "loss": 0.001, "num_input_tokens_seen": 52447456, "step": 77800 }, { "epoch": 1.9007890943737327, "grad_norm": 0.029797382652759552, "learning_rate": 1.5587551243134173e-06, "loss": 0.0901, "num_input_tokens_seen": 52451040, "step": 77805 }, { "epoch": 1.9009112452055799, "grad_norm": 270.3726806640625, "learning_rate": 1.5586843987808078e-06, "loss": 0.0213, "num_input_tokens_seen": 52454560, "step": 77810 }, { "epoch": 1.901033396037427, "grad_norm": 0.9959912300109863, "learning_rate": 1.558613669185295e-06, "loss": 0.0803, "num_input_tokens_seen": 52458016, "step": 77815 }, { "epoch": 1.901155546869274, "grad_norm": 0.7972824573516846, "learning_rate": 1.558542935527393e-06, "loss": 0.0008, "num_input_tokens_seen": 52461088, "step": 77820 }, { "epoch": 1.9012776977011212, "grad_norm": 23.410091400146484, "learning_rate": 1.5584721978076167e-06, "loss": 0.2709, "num_input_tokens_seen": 52465184, "step": 77825 }, { "epoch": 1.9013998485329684, "grad_norm": 0.019310766831040382, "learning_rate": 1.5584014560264803e-06, "loss": 0.0012, "num_input_tokens_seen": 52468384, "step": 77830 }, { "epoch": 1.9015219993648156, "grad_norm": 50.934814453125, "learning_rate": 1.5583307101844984e-06, "loss": 0.0785, "num_input_tokens_seen": 52471456, "step": 77835 }, { "epoch": 1.9016441501966628, "grad_norm": 11.939221382141113, "learning_rate": 1.5582599602821854e-06, "loss": 0.1133, "num_input_tokens_seen": 52474912, "step": 77840 }, { "epoch": 1.90176630102851, "grad_norm": 8.909550666809082, "learning_rate": 1.5581892063200556e-06, "loss": 0.128, "num_input_tokens_seen": 52478496, "step": 77845 }, { "epoch": 1.9018884518603572, "grad_norm": 0.1402382254600525, "learning_rate": 1.5581184482986242e-06, "loss": 0.0018, "num_input_tokens_seen": 52482144, "step": 77850 }, { "epoch": 1.9020106026922043, "grad_norm": 0.1215723529458046, "learning_rate": 1.558047686218405e-06, "loss": 0.0545, "num_input_tokens_seen": 52485664, "step": 77855 }, { "epoch": 1.9021327535240515, "grad_norm": 0.1959735006093979, "learning_rate": 1.5579769200799132e-06, "loss": 0.0012, "num_input_tokens_seen": 52489120, "step": 77860 }, { "epoch": 1.9022549043558987, "grad_norm": 0.03554701805114746, "learning_rate": 1.557906149883663e-06, "loss": 0.0627, "num_input_tokens_seen": 52492192, "step": 77865 }, { "epoch": 1.902377055187746, "grad_norm": 14.456132888793945, "learning_rate": 1.557835375630169e-06, "loss": 0.0922, "num_input_tokens_seen": 52495840, "step": 77870 }, { "epoch": 1.902499206019593, "grad_norm": 1.007307767868042, "learning_rate": 1.5577645973199465e-06, "loss": 0.0438, "num_input_tokens_seen": 52498784, "step": 77875 }, { "epoch": 1.9026213568514403, "grad_norm": 7.78939151763916, "learning_rate": 1.5576938149535096e-06, "loss": 0.1144, "num_input_tokens_seen": 52502176, "step": 77880 }, { "epoch": 1.9027435076832875, "grad_norm": 1.2881722450256348, "learning_rate": 1.5576230285313732e-06, "loss": 0.0658, "num_input_tokens_seen": 52505696, "step": 77885 }, { "epoch": 1.9028656585151345, "grad_norm": 0.14054358005523682, "learning_rate": 1.5575522380540522e-06, "loss": 0.0224, "num_input_tokens_seen": 52508832, "step": 77890 }, { "epoch": 1.9029878093469816, "grad_norm": 0.10390869528055191, "learning_rate": 1.5574814435220616e-06, "loss": 0.037, "num_input_tokens_seen": 52511840, "step": 77895 }, { "epoch": 1.9031099601788288, "grad_norm": 0.19694241881370544, "learning_rate": 1.5574106449359157e-06, "loss": 0.1035, "num_input_tokens_seen": 52515104, "step": 77900 }, { "epoch": 1.903232111010676, "grad_norm": 0.08465810120105743, "learning_rate": 1.55733984229613e-06, "loss": 0.1224, "num_input_tokens_seen": 52518752, "step": 77905 }, { "epoch": 1.903354261842523, "grad_norm": 0.5364916324615479, "learning_rate": 1.5572690356032187e-06, "loss": 0.1126, "num_input_tokens_seen": 52522208, "step": 77910 }, { "epoch": 1.9034764126743702, "grad_norm": 1.2812743186950684, "learning_rate": 1.557198224857697e-06, "loss": 0.0724, "num_input_tokens_seen": 52525472, "step": 77915 }, { "epoch": 1.9035985635062174, "grad_norm": 75.62810516357422, "learning_rate": 1.5571274100600805e-06, "loss": 0.1327, "num_input_tokens_seen": 52528928, "step": 77920 }, { "epoch": 1.9037207143380646, "grad_norm": 0.19514200091362, "learning_rate": 1.5570565912108833e-06, "loss": 0.0025, "num_input_tokens_seen": 52532832, "step": 77925 }, { "epoch": 1.9038428651699117, "grad_norm": 32.084651947021484, "learning_rate": 1.5569857683106205e-06, "loss": 0.0865, "num_input_tokens_seen": 52536224, "step": 77930 }, { "epoch": 1.903965016001759, "grad_norm": 0.9640393853187561, "learning_rate": 1.5569149413598077e-06, "loss": 0.1001, "num_input_tokens_seen": 52539680, "step": 77935 }, { "epoch": 1.9040871668336061, "grad_norm": 2.7728748321533203, "learning_rate": 1.5568441103589596e-06, "loss": 0.0035, "num_input_tokens_seen": 52542624, "step": 77940 }, { "epoch": 1.9042093176654533, "grad_norm": 0.49883055686950684, "learning_rate": 1.5567732753085915e-06, "loss": 0.0354, "num_input_tokens_seen": 52545952, "step": 77945 }, { "epoch": 1.9043314684973005, "grad_norm": 0.12617257237434387, "learning_rate": 1.556702436209218e-06, "loss": 0.0308, "num_input_tokens_seen": 52549408, "step": 77950 }, { "epoch": 1.9044536193291477, "grad_norm": 1.3749181032180786, "learning_rate": 1.556631593061355e-06, "loss": 0.0018, "num_input_tokens_seen": 52552544, "step": 77955 }, { "epoch": 1.9045757701609949, "grad_norm": 48.934146881103516, "learning_rate": 1.556560745865517e-06, "loss": 0.1904, "num_input_tokens_seen": 52556000, "step": 77960 }, { "epoch": 1.904697920992842, "grad_norm": 27.332305908203125, "learning_rate": 1.5564898946222198e-06, "loss": 0.1821, "num_input_tokens_seen": 52559712, "step": 77965 }, { "epoch": 1.9048200718246893, "grad_norm": 0.02467111311852932, "learning_rate": 1.5564190393319784e-06, "loss": 0.0012, "num_input_tokens_seen": 52563168, "step": 77970 }, { "epoch": 1.9049422226565365, "grad_norm": 0.9920415282249451, "learning_rate": 1.5563481799953082e-06, "loss": 0.0939, "num_input_tokens_seen": 52566944, "step": 77975 }, { "epoch": 1.9050643734883834, "grad_norm": 26.86354637145996, "learning_rate": 1.556277316612724e-06, "loss": 0.1157, "num_input_tokens_seen": 52570144, "step": 77980 }, { "epoch": 1.9051865243202306, "grad_norm": 0.035159774124622345, "learning_rate": 1.556206449184742e-06, "loss": 0.0557, "num_input_tokens_seen": 52573216, "step": 77985 }, { "epoch": 1.9053086751520778, "grad_norm": 14.330402374267578, "learning_rate": 1.5561355777118768e-06, "loss": 0.158, "num_input_tokens_seen": 52575904, "step": 77990 }, { "epoch": 1.905430825983925, "grad_norm": 0.6020281910896301, "learning_rate": 1.5560647021946442e-06, "loss": 0.0497, "num_input_tokens_seen": 52579232, "step": 77995 }, { "epoch": 1.905552976815772, "grad_norm": 6.074164867401123, "learning_rate": 1.5559938226335593e-06, "loss": 0.0926, "num_input_tokens_seen": 52582432, "step": 78000 }, { "epoch": 1.9056751276476191, "grad_norm": 0.13777603209018707, "learning_rate": 1.5559229390291382e-06, "loss": 0.0307, "num_input_tokens_seen": 52585696, "step": 78005 }, { "epoch": 1.9057972784794663, "grad_norm": 25.201900482177734, "learning_rate": 1.5558520513818958e-06, "loss": 0.3789, "num_input_tokens_seen": 52588448, "step": 78010 }, { "epoch": 1.9059194293113135, "grad_norm": 0.08686228096485138, "learning_rate": 1.5557811596923477e-06, "loss": 0.047, "num_input_tokens_seen": 52591712, "step": 78015 }, { "epoch": 1.9060415801431607, "grad_norm": 0.028829792514443398, "learning_rate": 1.5557102639610095e-06, "loss": 0.0008, "num_input_tokens_seen": 52594976, "step": 78020 }, { "epoch": 1.906163730975008, "grad_norm": 16.483469009399414, "learning_rate": 1.555639364188397e-06, "loss": 0.3379, "num_input_tokens_seen": 52598560, "step": 78025 }, { "epoch": 1.906285881806855, "grad_norm": 0.33806437253952026, "learning_rate": 1.5555684603750252e-06, "loss": 0.0404, "num_input_tokens_seen": 52601440, "step": 78030 }, { "epoch": 1.9064080326387023, "grad_norm": 12.354910850524902, "learning_rate": 1.5554975525214104e-06, "loss": 0.0938, "num_input_tokens_seen": 52604640, "step": 78035 }, { "epoch": 1.9065301834705495, "grad_norm": 0.7182216644287109, "learning_rate": 1.555426640628068e-06, "loss": 0.1132, "num_input_tokens_seen": 52607904, "step": 78040 }, { "epoch": 1.9066523343023967, "grad_norm": 0.13206681609153748, "learning_rate": 1.5553557246955137e-06, "loss": 0.0164, "num_input_tokens_seen": 52610848, "step": 78045 }, { "epoch": 1.9067744851342439, "grad_norm": 0.39031508564949036, "learning_rate": 1.555284804724263e-06, "loss": 0.0401, "num_input_tokens_seen": 52614304, "step": 78050 }, { "epoch": 1.906896635966091, "grad_norm": 0.11643342673778534, "learning_rate": 1.5552138807148318e-06, "loss": 0.1338, "num_input_tokens_seen": 52618208, "step": 78055 }, { "epoch": 1.9070187867979382, "grad_norm": 0.06826582551002502, "learning_rate": 1.5551429526677363e-06, "loss": 0.0733, "num_input_tokens_seen": 52621216, "step": 78060 }, { "epoch": 1.9071409376297852, "grad_norm": 0.1077682301402092, "learning_rate": 1.5550720205834917e-06, "loss": 0.0397, "num_input_tokens_seen": 52624736, "step": 78065 }, { "epoch": 1.9072630884616324, "grad_norm": 8.253615379333496, "learning_rate": 1.555001084462614e-06, "loss": 0.154, "num_input_tokens_seen": 52627936, "step": 78070 }, { "epoch": 1.9073852392934796, "grad_norm": 13.16157341003418, "learning_rate": 1.5549301443056192e-06, "loss": 0.1156, "num_input_tokens_seen": 52631200, "step": 78075 }, { "epoch": 1.9075073901253268, "grad_norm": 1.2118772268295288, "learning_rate": 1.5548592001130234e-06, "loss": 0.0882, "num_input_tokens_seen": 52635104, "step": 78080 }, { "epoch": 1.907629540957174, "grad_norm": 82.68314361572266, "learning_rate": 1.5547882518853417e-06, "loss": 0.2286, "num_input_tokens_seen": 52638048, "step": 78085 }, { "epoch": 1.907751691789021, "grad_norm": 16.325532913208008, "learning_rate": 1.554717299623091e-06, "loss": 0.203, "num_input_tokens_seen": 52641312, "step": 78090 }, { "epoch": 1.9078738426208681, "grad_norm": 13.200202941894531, "learning_rate": 1.554646343326787e-06, "loss": 0.0695, "num_input_tokens_seen": 52644704, "step": 78095 }, { "epoch": 1.9079959934527153, "grad_norm": 0.7149518728256226, "learning_rate": 1.5545753829969455e-06, "loss": 0.0021, "num_input_tokens_seen": 52647904, "step": 78100 }, { "epoch": 1.9081181442845625, "grad_norm": 38.67225646972656, "learning_rate": 1.5545044186340826e-06, "loss": 0.0808, "num_input_tokens_seen": 52651424, "step": 78105 }, { "epoch": 1.9082402951164097, "grad_norm": 0.16422489285469055, "learning_rate": 1.554433450238714e-06, "loss": 0.0022, "num_input_tokens_seen": 52654624, "step": 78110 }, { "epoch": 1.9083624459482569, "grad_norm": 0.5577579736709595, "learning_rate": 1.5543624778113568e-06, "loss": 0.0401, "num_input_tokens_seen": 52657824, "step": 78115 }, { "epoch": 1.908484596780104, "grad_norm": 0.15506233274936676, "learning_rate": 1.5542915013525265e-06, "loss": 0.0846, "num_input_tokens_seen": 52660832, "step": 78120 }, { "epoch": 1.9086067476119513, "grad_norm": 0.20442821085453033, "learning_rate": 1.5542205208627393e-06, "loss": 0.1013, "num_input_tokens_seen": 52664288, "step": 78125 }, { "epoch": 1.9087288984437984, "grad_norm": 0.047798607498407364, "learning_rate": 1.5541495363425113e-06, "loss": 0.0475, "num_input_tokens_seen": 52667552, "step": 78130 }, { "epoch": 1.9088510492756456, "grad_norm": 1.5664831399917603, "learning_rate": 1.5540785477923587e-06, "loss": 0.0094, "num_input_tokens_seen": 52671072, "step": 78135 }, { "epoch": 1.9089732001074928, "grad_norm": 7.628649711608887, "learning_rate": 1.5540075552127982e-06, "loss": 0.1499, "num_input_tokens_seen": 52674272, "step": 78140 }, { "epoch": 1.90909535093934, "grad_norm": 38.61286926269531, "learning_rate": 1.5539365586043456e-06, "loss": 0.1028, "num_input_tokens_seen": 52677600, "step": 78145 }, { "epoch": 1.9092175017711872, "grad_norm": 0.0798473060131073, "learning_rate": 1.553865557967517e-06, "loss": 0.0684, "num_input_tokens_seen": 52680928, "step": 78150 }, { "epoch": 1.9093396526030342, "grad_norm": 0.15310020744800568, "learning_rate": 1.5537945533028296e-06, "loss": 0.1008, "num_input_tokens_seen": 52684256, "step": 78155 }, { "epoch": 1.9094618034348814, "grad_norm": 0.4250839650630951, "learning_rate": 1.553723544610799e-06, "loss": 0.1565, "num_input_tokens_seen": 52687776, "step": 78160 }, { "epoch": 1.9095839542667286, "grad_norm": 1.4815961122512817, "learning_rate": 1.553652531891942e-06, "loss": 0.0459, "num_input_tokens_seen": 52691168, "step": 78165 }, { "epoch": 1.9097061050985757, "grad_norm": 0.09083965420722961, "learning_rate": 1.5535815151467747e-06, "loss": 0.1741, "num_input_tokens_seen": 52694496, "step": 78170 }, { "epoch": 1.909828255930423, "grad_norm": 131.01443481445312, "learning_rate": 1.5535104943758137e-06, "loss": 0.1741, "num_input_tokens_seen": 52697568, "step": 78175 }, { "epoch": 1.90995040676227, "grad_norm": 84.11591339111328, "learning_rate": 1.5534394695795757e-06, "loss": 0.1526, "num_input_tokens_seen": 52700896, "step": 78180 }, { "epoch": 1.910072557594117, "grad_norm": 0.23095948994159698, "learning_rate": 1.553368440758577e-06, "loss": 0.0386, "num_input_tokens_seen": 52703968, "step": 78185 }, { "epoch": 1.9101947084259643, "grad_norm": 0.20871803164482117, "learning_rate": 1.5532974079133339e-06, "loss": 0.0855, "num_input_tokens_seen": 52707296, "step": 78190 }, { "epoch": 1.9103168592578115, "grad_norm": 1.4178715944290161, "learning_rate": 1.5532263710443636e-06, "loss": 0.0046, "num_input_tokens_seen": 52710496, "step": 78195 }, { "epoch": 1.9104390100896587, "grad_norm": 13.72677230834961, "learning_rate": 1.5531553301521824e-06, "loss": 0.0676, "num_input_tokens_seen": 52713824, "step": 78200 }, { "epoch": 1.9105611609215059, "grad_norm": 0.8392394781112671, "learning_rate": 1.5530842852373063e-06, "loss": 0.0933, "num_input_tokens_seen": 52717088, "step": 78205 }, { "epoch": 1.910683311753353, "grad_norm": 0.3146764039993286, "learning_rate": 1.5530132363002528e-06, "loss": 0.0364, "num_input_tokens_seen": 52720928, "step": 78210 }, { "epoch": 1.9108054625852002, "grad_norm": 0.3037707805633545, "learning_rate": 1.5529421833415383e-06, "loss": 0.0951, "num_input_tokens_seen": 52724192, "step": 78215 }, { "epoch": 1.9109276134170474, "grad_norm": 0.6358781456947327, "learning_rate": 1.5528711263616795e-06, "loss": 0.0037, "num_input_tokens_seen": 52727904, "step": 78220 }, { "epoch": 1.9110497642488946, "grad_norm": 0.17406722903251648, "learning_rate": 1.5528000653611932e-06, "loss": 0.0388, "num_input_tokens_seen": 52730976, "step": 78225 }, { "epoch": 1.9111719150807418, "grad_norm": 0.07249428331851959, "learning_rate": 1.5527290003405961e-06, "loss": 0.1138, "num_input_tokens_seen": 52734176, "step": 78230 }, { "epoch": 1.911294065912589, "grad_norm": 0.38793230056762695, "learning_rate": 1.5526579313004053e-06, "loss": 0.0956, "num_input_tokens_seen": 52737312, "step": 78235 }, { "epoch": 1.9114162167444362, "grad_norm": 0.16493968665599823, "learning_rate": 1.552586858241137e-06, "loss": 0.1298, "num_input_tokens_seen": 52740576, "step": 78240 }, { "epoch": 1.9115383675762831, "grad_norm": 0.046525854617357254, "learning_rate": 1.5525157811633087e-06, "loss": 0.1594, "num_input_tokens_seen": 52743712, "step": 78245 }, { "epoch": 1.9116605184081303, "grad_norm": 17.563119888305664, "learning_rate": 1.552444700067437e-06, "loss": 0.1102, "num_input_tokens_seen": 52746720, "step": 78250 }, { "epoch": 1.9117826692399775, "grad_norm": 1.223860502243042, "learning_rate": 1.5523736149540388e-06, "loss": 0.0974, "num_input_tokens_seen": 52750048, "step": 78255 }, { "epoch": 1.9119048200718247, "grad_norm": 0.09920724481344223, "learning_rate": 1.5523025258236312e-06, "loss": 0.0353, "num_input_tokens_seen": 52753440, "step": 78260 }, { "epoch": 1.912026970903672, "grad_norm": 101.08455657958984, "learning_rate": 1.5522314326767309e-06, "loss": 0.0222, "num_input_tokens_seen": 52756896, "step": 78265 }, { "epoch": 1.9121491217355189, "grad_norm": 8.766607284545898, "learning_rate": 1.5521603355138552e-06, "loss": 0.0442, "num_input_tokens_seen": 52760736, "step": 78270 }, { "epoch": 1.912271272567366, "grad_norm": 0.4223634600639343, "learning_rate": 1.5520892343355208e-06, "loss": 0.0553, "num_input_tokens_seen": 52764768, "step": 78275 }, { "epoch": 1.9123934233992133, "grad_norm": 16.334505081176758, "learning_rate": 1.5520181291422454e-06, "loss": 0.0684, "num_input_tokens_seen": 52768096, "step": 78280 }, { "epoch": 1.9125155742310604, "grad_norm": 1.246199131011963, "learning_rate": 1.5519470199345455e-06, "loss": 0.0949, "num_input_tokens_seen": 52771616, "step": 78285 }, { "epoch": 1.9126377250629076, "grad_norm": 24.813953399658203, "learning_rate": 1.5518759067129383e-06, "loss": 0.261, "num_input_tokens_seen": 52774624, "step": 78290 }, { "epoch": 1.9127598758947548, "grad_norm": 0.43296635150909424, "learning_rate": 1.5518047894779413e-06, "loss": 0.0024, "num_input_tokens_seen": 52778336, "step": 78295 }, { "epoch": 1.912882026726602, "grad_norm": 1.1578298807144165, "learning_rate": 1.5517336682300711e-06, "loss": 0.0392, "num_input_tokens_seen": 52781664, "step": 78300 }, { "epoch": 1.9130041775584492, "grad_norm": 98.7223892211914, "learning_rate": 1.5516625429698455e-06, "loss": 0.2153, "num_input_tokens_seen": 52784800, "step": 78305 }, { "epoch": 1.9131263283902964, "grad_norm": 36.56499099731445, "learning_rate": 1.5515914136977815e-06, "loss": 0.0306, "num_input_tokens_seen": 52788448, "step": 78310 }, { "epoch": 1.9132484792221436, "grad_norm": 6.932656764984131, "learning_rate": 1.5515202804143964e-06, "loss": 0.0718, "num_input_tokens_seen": 52791648, "step": 78315 }, { "epoch": 1.9133706300539908, "grad_norm": 0.1303882896900177, "learning_rate": 1.5514491431202075e-06, "loss": 0.0966, "num_input_tokens_seen": 52795168, "step": 78320 }, { "epoch": 1.913492780885838, "grad_norm": 0.4207264482975006, "learning_rate": 1.5513780018157321e-06, "loss": 0.0456, "num_input_tokens_seen": 52798752, "step": 78325 }, { "epoch": 1.9136149317176852, "grad_norm": 114.55884552001953, "learning_rate": 1.5513068565014875e-06, "loss": 0.0781, "num_input_tokens_seen": 52802336, "step": 78330 }, { "epoch": 1.9137370825495321, "grad_norm": 126.00515747070312, "learning_rate": 1.5512357071779912e-06, "loss": 0.2495, "num_input_tokens_seen": 52805600, "step": 78335 }, { "epoch": 1.9138592333813793, "grad_norm": 0.026732511818408966, "learning_rate": 1.5511645538457604e-06, "loss": 0.167, "num_input_tokens_seen": 52808928, "step": 78340 }, { "epoch": 1.9139813842132265, "grad_norm": 11.265633583068848, "learning_rate": 1.551093396505313e-06, "loss": 0.0669, "num_input_tokens_seen": 52812064, "step": 78345 }, { "epoch": 1.9141035350450737, "grad_norm": 1.563814401626587, "learning_rate": 1.551022235157166e-06, "loss": 0.0021, "num_input_tokens_seen": 52815712, "step": 78350 }, { "epoch": 1.9142256858769207, "grad_norm": 12.1010160446167, "learning_rate": 1.550951069801837e-06, "loss": 0.051, "num_input_tokens_seen": 52819360, "step": 78355 }, { "epoch": 1.9143478367087678, "grad_norm": 10.78503704071045, "learning_rate": 1.550879900439844e-06, "loss": 0.0383, "num_input_tokens_seen": 52822688, "step": 78360 }, { "epoch": 1.914469987540615, "grad_norm": 0.09379678964614868, "learning_rate": 1.5508087270717041e-06, "loss": 0.0459, "num_input_tokens_seen": 52825952, "step": 78365 }, { "epoch": 1.9145921383724622, "grad_norm": 84.47173309326172, "learning_rate": 1.550737549697935e-06, "loss": 0.1156, "num_input_tokens_seen": 52829152, "step": 78370 }, { "epoch": 1.9147142892043094, "grad_norm": 30.063278198242188, "learning_rate": 1.550666368319054e-06, "loss": 0.1214, "num_input_tokens_seen": 52832480, "step": 78375 }, { "epoch": 1.9148364400361566, "grad_norm": 0.1367904096841812, "learning_rate": 1.5505951829355791e-06, "loss": 0.1197, "num_input_tokens_seen": 52835872, "step": 78380 }, { "epoch": 1.9149585908680038, "grad_norm": 63.807098388671875, "learning_rate": 1.5505239935480283e-06, "loss": 0.1606, "num_input_tokens_seen": 52839712, "step": 78385 }, { "epoch": 1.915080741699851, "grad_norm": 0.2028699666261673, "learning_rate": 1.550452800156919e-06, "loss": 0.002, "num_input_tokens_seen": 52843360, "step": 78390 }, { "epoch": 1.9152028925316982, "grad_norm": 0.36074304580688477, "learning_rate": 1.5503816027627684e-06, "loss": 0.0617, "num_input_tokens_seen": 52846304, "step": 78395 }, { "epoch": 1.9153250433635454, "grad_norm": 0.44605323672294617, "learning_rate": 1.5503104013660946e-06, "loss": 0.174, "num_input_tokens_seen": 52849696, "step": 78400 }, { "epoch": 1.9154471941953926, "grad_norm": 0.6416441202163696, "learning_rate": 1.550239195967416e-06, "loss": 0.0981, "num_input_tokens_seen": 52853024, "step": 78405 }, { "epoch": 1.9155693450272397, "grad_norm": 20.85800552368164, "learning_rate": 1.55016798656725e-06, "loss": 0.3347, "num_input_tokens_seen": 52856160, "step": 78410 }, { "epoch": 1.915691495859087, "grad_norm": 0.6915989518165588, "learning_rate": 1.5500967731661146e-06, "loss": 0.0996, "num_input_tokens_seen": 52859360, "step": 78415 }, { "epoch": 1.9158136466909341, "grad_norm": 0.4440751373767853, "learning_rate": 1.550025555764527e-06, "loss": 0.0661, "num_input_tokens_seen": 52862752, "step": 78420 }, { "epoch": 1.915935797522781, "grad_norm": 0.34711381793022156, "learning_rate": 1.5499543343630056e-06, "loss": 0.0543, "num_input_tokens_seen": 52866336, "step": 78425 }, { "epoch": 1.9160579483546283, "grad_norm": 4.468153953552246, "learning_rate": 1.5498831089620686e-06, "loss": 0.0118, "num_input_tokens_seen": 52869152, "step": 78430 }, { "epoch": 1.9161800991864755, "grad_norm": 0.1526588648557663, "learning_rate": 1.549811879562234e-06, "loss": 0.0841, "num_input_tokens_seen": 52872736, "step": 78435 }, { "epoch": 1.9163022500183227, "grad_norm": 47.45873260498047, "learning_rate": 1.549740646164019e-06, "loss": 0.0809, "num_input_tokens_seen": 52876256, "step": 78440 }, { "epoch": 1.9164244008501696, "grad_norm": 1.7582581043243408, "learning_rate": 1.5496694087679427e-06, "loss": 0.2138, "num_input_tokens_seen": 52879776, "step": 78445 }, { "epoch": 1.9165465516820168, "grad_norm": 1.4794515371322632, "learning_rate": 1.5495981673745222e-06, "loss": 0.0528, "num_input_tokens_seen": 52882848, "step": 78450 }, { "epoch": 1.916668702513864, "grad_norm": 18.945890426635742, "learning_rate": 1.549526921984276e-06, "loss": 0.1183, "num_input_tokens_seen": 52886304, "step": 78455 }, { "epoch": 1.9167908533457112, "grad_norm": 102.38660430908203, "learning_rate": 1.5494556725977224e-06, "loss": 0.0998, "num_input_tokens_seen": 52889952, "step": 78460 }, { "epoch": 1.9169130041775584, "grad_norm": 0.22804000973701477, "learning_rate": 1.5493844192153794e-06, "loss": 0.0358, "num_input_tokens_seen": 52893280, "step": 78465 }, { "epoch": 1.9170351550094056, "grad_norm": 8.567605018615723, "learning_rate": 1.549313161837765e-06, "loss": 0.0706, "num_input_tokens_seen": 52896608, "step": 78470 }, { "epoch": 1.9171573058412528, "grad_norm": 0.02231750637292862, "learning_rate": 1.5492419004653977e-06, "loss": 0.1005, "num_input_tokens_seen": 52899808, "step": 78475 }, { "epoch": 1.9172794566731, "grad_norm": 0.3702198266983032, "learning_rate": 1.5491706350987954e-06, "loss": 0.0561, "num_input_tokens_seen": 52903136, "step": 78480 }, { "epoch": 1.9174016075049471, "grad_norm": 11.615361213684082, "learning_rate": 1.5490993657384766e-06, "loss": 0.1255, "num_input_tokens_seen": 52906720, "step": 78485 }, { "epoch": 1.9175237583367943, "grad_norm": 10.031021118164062, "learning_rate": 1.5490280923849595e-06, "loss": 0.1982, "num_input_tokens_seen": 52910624, "step": 78490 }, { "epoch": 1.9176459091686415, "grad_norm": 0.313969224691391, "learning_rate": 1.5489568150387624e-06, "loss": 0.0295, "num_input_tokens_seen": 52914208, "step": 78495 }, { "epoch": 1.9177680600004887, "grad_norm": 120.33798217773438, "learning_rate": 1.5488855337004035e-06, "loss": 0.1131, "num_input_tokens_seen": 52917344, "step": 78500 }, { "epoch": 1.917890210832336, "grad_norm": 1.7783594131469727, "learning_rate": 1.548814248370402e-06, "loss": 0.0533, "num_input_tokens_seen": 52920928, "step": 78505 }, { "epoch": 1.918012361664183, "grad_norm": 1.2943834066390991, "learning_rate": 1.548742959049275e-06, "loss": 0.151, "num_input_tokens_seen": 52924704, "step": 78510 }, { "epoch": 1.91813451249603, "grad_norm": 14.105642318725586, "learning_rate": 1.548671665737542e-06, "loss": 0.0807, "num_input_tokens_seen": 52927712, "step": 78515 }, { "epoch": 1.9182566633278773, "grad_norm": 0.6679994463920593, "learning_rate": 1.5486003684357209e-06, "loss": 0.16, "num_input_tokens_seen": 52930976, "step": 78520 }, { "epoch": 1.9183788141597244, "grad_norm": 38.2706413269043, "learning_rate": 1.5485290671443306e-06, "loss": 0.1026, "num_input_tokens_seen": 52934496, "step": 78525 }, { "epoch": 1.9185009649915716, "grad_norm": 328.0785827636719, "learning_rate": 1.5484577618638892e-06, "loss": 0.1407, "num_input_tokens_seen": 52937952, "step": 78530 }, { "epoch": 1.9186231158234186, "grad_norm": 20.970483779907227, "learning_rate": 1.5483864525949156e-06, "loss": 0.1558, "num_input_tokens_seen": 52941280, "step": 78535 }, { "epoch": 1.9187452666552658, "grad_norm": 2.9742214679718018, "learning_rate": 1.5483151393379278e-06, "loss": 0.0748, "num_input_tokens_seen": 52944544, "step": 78540 }, { "epoch": 1.918867417487113, "grad_norm": 0.6080707907676697, "learning_rate": 1.5482438220934453e-06, "loss": 0.1002, "num_input_tokens_seen": 52948000, "step": 78545 }, { "epoch": 1.9189895683189602, "grad_norm": 26.098880767822266, "learning_rate": 1.5481725008619857e-06, "loss": 0.1008, "num_input_tokens_seen": 52951520, "step": 78550 }, { "epoch": 1.9191117191508074, "grad_norm": 38.22494888305664, "learning_rate": 1.5481011756440688e-06, "loss": 0.0848, "num_input_tokens_seen": 52954656, "step": 78555 }, { "epoch": 1.9192338699826546, "grad_norm": 0.06840623915195465, "learning_rate": 1.5480298464402127e-06, "loss": 0.07, "num_input_tokens_seen": 52958368, "step": 78560 }, { "epoch": 1.9193560208145017, "grad_norm": 0.19684047996997833, "learning_rate": 1.5479585132509358e-06, "loss": 0.09, "num_input_tokens_seen": 52961504, "step": 78565 }, { "epoch": 1.919478171646349, "grad_norm": 0.4583105742931366, "learning_rate": 1.5478871760767574e-06, "loss": 0.0319, "num_input_tokens_seen": 52964704, "step": 78570 }, { "epoch": 1.9196003224781961, "grad_norm": 4.073686122894287, "learning_rate": 1.5478158349181963e-06, "loss": 0.0083, "num_input_tokens_seen": 52967904, "step": 78575 }, { "epoch": 1.9197224733100433, "grad_norm": 67.97811889648438, "learning_rate": 1.5477444897757707e-06, "loss": 0.055, "num_input_tokens_seen": 52971360, "step": 78580 }, { "epoch": 1.9198446241418905, "grad_norm": 0.246597558259964, "learning_rate": 1.54767314065e-06, "loss": 0.0767, "num_input_tokens_seen": 52974624, "step": 78585 }, { "epoch": 1.9199667749737377, "grad_norm": 226.3640594482422, "learning_rate": 1.547601787541403e-06, "loss": 0.0667, "num_input_tokens_seen": 52977696, "step": 78590 }, { "epoch": 1.9200889258055849, "grad_norm": 0.27767878770828247, "learning_rate": 1.5475304304504983e-06, "loss": 0.0052, "num_input_tokens_seen": 52980960, "step": 78595 }, { "epoch": 1.9202110766374318, "grad_norm": 0.8122377991676331, "learning_rate": 1.5474590693778054e-06, "loss": 0.0763, "num_input_tokens_seen": 52984800, "step": 78600 }, { "epoch": 1.920333227469279, "grad_norm": 0.2373126596212387, "learning_rate": 1.5473877043238428e-06, "loss": 0.0017, "num_input_tokens_seen": 52987872, "step": 78605 }, { "epoch": 1.9204553783011262, "grad_norm": 0.38778069615364075, "learning_rate": 1.5473163352891295e-06, "loss": 0.0367, "num_input_tokens_seen": 52991008, "step": 78610 }, { "epoch": 1.9205775291329734, "grad_norm": 0.26472291350364685, "learning_rate": 1.5472449622741844e-06, "loss": 0.0514, "num_input_tokens_seen": 52994720, "step": 78615 }, { "epoch": 1.9206996799648206, "grad_norm": 23.649662017822266, "learning_rate": 1.547173585279527e-06, "loss": 0.2303, "num_input_tokens_seen": 52998176, "step": 78620 }, { "epoch": 1.9208218307966676, "grad_norm": 30.59381675720215, "learning_rate": 1.5471022043056761e-06, "loss": 0.1555, "num_input_tokens_seen": 53001568, "step": 78625 }, { "epoch": 1.9209439816285148, "grad_norm": 0.15853320062160492, "learning_rate": 1.5470308193531505e-06, "loss": 0.0836, "num_input_tokens_seen": 53004704, "step": 78630 }, { "epoch": 1.921066132460362, "grad_norm": 0.23013024032115936, "learning_rate": 1.54695943042247e-06, "loss": 0.057, "num_input_tokens_seen": 53007968, "step": 78635 }, { "epoch": 1.9211882832922091, "grad_norm": 17.81313705444336, "learning_rate": 1.5468880375141535e-06, "loss": 0.0948, "num_input_tokens_seen": 53011488, "step": 78640 }, { "epoch": 1.9213104341240563, "grad_norm": 0.38524720072746277, "learning_rate": 1.5468166406287197e-06, "loss": 0.0349, "num_input_tokens_seen": 53014944, "step": 78645 }, { "epoch": 1.9214325849559035, "grad_norm": 0.1310257464647293, "learning_rate": 1.5467452397666885e-06, "loss": 0.05, "num_input_tokens_seen": 53018208, "step": 78650 }, { "epoch": 1.9215547357877507, "grad_norm": 1.4099310636520386, "learning_rate": 1.5466738349285788e-06, "loss": 0.0014, "num_input_tokens_seen": 53021408, "step": 78655 }, { "epoch": 1.921676886619598, "grad_norm": 0.2426673024892807, "learning_rate": 1.54660242611491e-06, "loss": 0.0496, "num_input_tokens_seen": 53024672, "step": 78660 }, { "epoch": 1.921799037451445, "grad_norm": 89.316162109375, "learning_rate": 1.5465310133262014e-06, "loss": 0.1131, "num_input_tokens_seen": 53027936, "step": 78665 }, { "epoch": 1.9219211882832923, "grad_norm": 20.908267974853516, "learning_rate": 1.5464595965629719e-06, "loss": 0.1438, "num_input_tokens_seen": 53031648, "step": 78670 }, { "epoch": 1.9220433391151395, "grad_norm": 0.5099788308143616, "learning_rate": 1.5463881758257414e-06, "loss": 0.0672, "num_input_tokens_seen": 53034720, "step": 78675 }, { "epoch": 1.9221654899469867, "grad_norm": 0.08424662798643112, "learning_rate": 1.5463167511150292e-06, "loss": 0.0052, "num_input_tokens_seen": 53038432, "step": 78680 }, { "epoch": 1.9222876407788339, "grad_norm": 326.7347717285156, "learning_rate": 1.5462453224313547e-06, "loss": 0.1965, "num_input_tokens_seen": 53042080, "step": 78685 }, { "epoch": 1.9224097916106808, "grad_norm": 14.694061279296875, "learning_rate": 1.5461738897752371e-06, "loss": 0.0696, "num_input_tokens_seen": 53045280, "step": 78690 }, { "epoch": 1.922531942442528, "grad_norm": 0.0032516128849238157, "learning_rate": 1.5461024531471961e-06, "loss": 0.0011, "num_input_tokens_seen": 53048416, "step": 78695 }, { "epoch": 1.9226540932743752, "grad_norm": 0.2199341207742691, "learning_rate": 1.5460310125477516e-06, "loss": 0.0433, "num_input_tokens_seen": 53051104, "step": 78700 }, { "epoch": 1.9227762441062224, "grad_norm": 0.2078741043806076, "learning_rate": 1.5459595679774223e-06, "loss": 0.1933, "num_input_tokens_seen": 53054048, "step": 78705 }, { "epoch": 1.9228983949380696, "grad_norm": 0.2788570821285248, "learning_rate": 1.5458881194367282e-06, "loss": 0.0407, "num_input_tokens_seen": 53057568, "step": 78710 }, { "epoch": 1.9230205457699165, "grad_norm": 22.922252655029297, "learning_rate": 1.5458166669261888e-06, "loss": 0.2197, "num_input_tokens_seen": 53060512, "step": 78715 }, { "epoch": 1.9231426966017637, "grad_norm": 36.99504852294922, "learning_rate": 1.545745210446324e-06, "loss": 0.1353, "num_input_tokens_seen": 53063456, "step": 78720 }, { "epoch": 1.923264847433611, "grad_norm": 52.599510192871094, "learning_rate": 1.5456737499976532e-06, "loss": 0.17, "num_input_tokens_seen": 53067424, "step": 78725 }, { "epoch": 1.9233869982654581, "grad_norm": 0.23235607147216797, "learning_rate": 1.5456022855806961e-06, "loss": 0.0516, "num_input_tokens_seen": 53070944, "step": 78730 }, { "epoch": 1.9235091490973053, "grad_norm": 0.22378988564014435, "learning_rate": 1.5455308171959724e-06, "loss": 0.0501, "num_input_tokens_seen": 53074016, "step": 78735 }, { "epoch": 1.9236312999291525, "grad_norm": 39.33481216430664, "learning_rate": 1.5454593448440018e-06, "loss": 0.1306, "num_input_tokens_seen": 53077728, "step": 78740 }, { "epoch": 1.9237534507609997, "grad_norm": 45.01771926879883, "learning_rate": 1.5453878685253043e-06, "loss": 0.062, "num_input_tokens_seen": 53080928, "step": 78745 }, { "epoch": 1.9238756015928469, "grad_norm": 0.009599471464753151, "learning_rate": 1.5453163882403994e-06, "loss": 0.0918, "num_input_tokens_seen": 53084768, "step": 78750 }, { "epoch": 1.923997752424694, "grad_norm": 0.0814778283238411, "learning_rate": 1.5452449039898073e-06, "loss": 0.0855, "num_input_tokens_seen": 53088544, "step": 78755 }, { "epoch": 1.9241199032565413, "grad_norm": 0.13089479506015778, "learning_rate": 1.5451734157740471e-06, "loss": 0.0711, "num_input_tokens_seen": 53091872, "step": 78760 }, { "epoch": 1.9242420540883884, "grad_norm": 0.35896846652030945, "learning_rate": 1.5451019235936396e-06, "loss": 0.0743, "num_input_tokens_seen": 53095008, "step": 78765 }, { "epoch": 1.9243642049202356, "grad_norm": 0.2049417793750763, "learning_rate": 1.5450304274491043e-06, "loss": 0.0544, "num_input_tokens_seen": 53098592, "step": 78770 }, { "epoch": 1.9244863557520828, "grad_norm": 0.10769752413034439, "learning_rate": 1.5449589273409608e-06, "loss": 0.0351, "num_input_tokens_seen": 53101984, "step": 78775 }, { "epoch": 1.9246085065839298, "grad_norm": 6.025005340576172, "learning_rate": 1.5448874232697298e-06, "loss": 0.1972, "num_input_tokens_seen": 53105440, "step": 78780 }, { "epoch": 1.924730657415777, "grad_norm": 0.20146672427654266, "learning_rate": 1.5448159152359307e-06, "loss": 0.1779, "num_input_tokens_seen": 53108128, "step": 78785 }, { "epoch": 1.9248528082476242, "grad_norm": 0.3935956656932831, "learning_rate": 1.544744403240084e-06, "loss": 0.0853, "num_input_tokens_seen": 53111584, "step": 78790 }, { "epoch": 1.9249749590794714, "grad_norm": 0.06383128464221954, "learning_rate": 1.5446728872827091e-06, "loss": 0.0016, "num_input_tokens_seen": 53115168, "step": 78795 }, { "epoch": 1.9250971099113185, "grad_norm": 0.9724140167236328, "learning_rate": 1.5446013673643266e-06, "loss": 0.0082, "num_input_tokens_seen": 53118944, "step": 78800 }, { "epoch": 1.9252192607431655, "grad_norm": 0.08090006560087204, "learning_rate": 1.5445298434854563e-06, "loss": 0.1288, "num_input_tokens_seen": 53122144, "step": 78805 }, { "epoch": 1.9253414115750127, "grad_norm": 0.15845705568790436, "learning_rate": 1.5444583156466187e-06, "loss": 0.1123, "num_input_tokens_seen": 53125728, "step": 78810 }, { "epoch": 1.92546356240686, "grad_norm": 0.0770653635263443, "learning_rate": 1.544386783848334e-06, "loss": 0.0863, "num_input_tokens_seen": 53129696, "step": 78815 }, { "epoch": 1.925585713238707, "grad_norm": 0.22629189491271973, "learning_rate": 1.544315248091122e-06, "loss": 0.0478, "num_input_tokens_seen": 53132832, "step": 78820 }, { "epoch": 1.9257078640705543, "grad_norm": 0.29965588450431824, "learning_rate": 1.544243708375503e-06, "loss": 0.1206, "num_input_tokens_seen": 53136352, "step": 78825 }, { "epoch": 1.9258300149024015, "grad_norm": 0.3292113244533539, "learning_rate": 1.5441721647019974e-06, "loss": 0.0627, "num_input_tokens_seen": 53139936, "step": 78830 }, { "epoch": 1.9259521657342487, "grad_norm": 14.793441772460938, "learning_rate": 1.5441006170711255e-06, "loss": 0.0372, "num_input_tokens_seen": 53143328, "step": 78835 }, { "epoch": 1.9260743165660958, "grad_norm": 0.19428370893001556, "learning_rate": 1.5440290654834075e-06, "loss": 0.0459, "num_input_tokens_seen": 53146656, "step": 78840 }, { "epoch": 1.926196467397943, "grad_norm": 21.550212860107422, "learning_rate": 1.5439575099393639e-06, "loss": 0.1206, "num_input_tokens_seen": 53149920, "step": 78845 }, { "epoch": 1.9263186182297902, "grad_norm": 0.11072822660207748, "learning_rate": 1.543885950439515e-06, "loss": 0.0809, "num_input_tokens_seen": 53153312, "step": 78850 }, { "epoch": 1.9264407690616374, "grad_norm": 41.060089111328125, "learning_rate": 1.543814386984381e-06, "loss": 0.0791, "num_input_tokens_seen": 53156896, "step": 78855 }, { "epoch": 1.9265629198934846, "grad_norm": 1.5634219646453857, "learning_rate": 1.5437428195744829e-06, "loss": 0.1982, "num_input_tokens_seen": 53160288, "step": 78860 }, { "epoch": 1.9266850707253318, "grad_norm": 0.5574802160263062, "learning_rate": 1.5436712482103401e-06, "loss": 0.1215, "num_input_tokens_seen": 53163808, "step": 78865 }, { "epoch": 1.9268072215571788, "grad_norm": 0.07435950636863708, "learning_rate": 1.5435996728924744e-06, "loss": 0.0611, "num_input_tokens_seen": 53167136, "step": 78870 }, { "epoch": 1.926929372389026, "grad_norm": 0.4004596471786499, "learning_rate": 1.5435280936214055e-06, "loss": 0.1073, "num_input_tokens_seen": 53170528, "step": 78875 }, { "epoch": 1.9270515232208731, "grad_norm": 0.15348993241786957, "learning_rate": 1.543456510397654e-06, "loss": 0.0009, "num_input_tokens_seen": 53173472, "step": 78880 }, { "epoch": 1.9271736740527203, "grad_norm": 17.38145637512207, "learning_rate": 1.5433849232217407e-06, "loss": 0.103, "num_input_tokens_seen": 53177056, "step": 78885 }, { "epoch": 1.9272958248845673, "grad_norm": 0.4298340678215027, "learning_rate": 1.543313332094186e-06, "loss": 0.0019, "num_input_tokens_seen": 53180768, "step": 78890 }, { "epoch": 1.9274179757164145, "grad_norm": 18.220590591430664, "learning_rate": 1.5432417370155104e-06, "loss": 0.0486, "num_input_tokens_seen": 53183968, "step": 78895 }, { "epoch": 1.9275401265482617, "grad_norm": 0.14308039844036102, "learning_rate": 1.5431701379862353e-06, "loss": 0.0708, "num_input_tokens_seen": 53187296, "step": 78900 }, { "epoch": 1.9276622773801089, "grad_norm": 1.517258644104004, "learning_rate": 1.5430985350068804e-06, "loss": 0.0107, "num_input_tokens_seen": 53190752, "step": 78905 }, { "epoch": 1.927784428211956, "grad_norm": 1.682701587677002, "learning_rate": 1.543026928077967e-06, "loss": 0.0949, "num_input_tokens_seen": 53194400, "step": 78910 }, { "epoch": 1.9279065790438032, "grad_norm": 0.18249306082725525, "learning_rate": 1.5429553172000157e-06, "loss": 0.1189, "num_input_tokens_seen": 53197856, "step": 78915 }, { "epoch": 1.9280287298756504, "grad_norm": 0.07325104624032974, "learning_rate": 1.5428837023735475e-06, "loss": 0.0013, "num_input_tokens_seen": 53200928, "step": 78920 }, { "epoch": 1.9281508807074976, "grad_norm": 0.027356114238500595, "learning_rate": 1.5428120835990829e-06, "loss": 0.0833, "num_input_tokens_seen": 53204384, "step": 78925 }, { "epoch": 1.9282730315393448, "grad_norm": 17.057506561279297, "learning_rate": 1.5427404608771427e-06, "loss": 0.1537, "num_input_tokens_seen": 53207392, "step": 78930 }, { "epoch": 1.928395182371192, "grad_norm": 0.018845299258828163, "learning_rate": 1.542668834208248e-06, "loss": 0.058, "num_input_tokens_seen": 53210912, "step": 78935 }, { "epoch": 1.9285173332030392, "grad_norm": 26.701641082763672, "learning_rate": 1.5425972035929196e-06, "loss": 0.0759, "num_input_tokens_seen": 53214432, "step": 78940 }, { "epoch": 1.9286394840348864, "grad_norm": 0.2233164757490158, "learning_rate": 1.5425255690316783e-06, "loss": 0.0417, "num_input_tokens_seen": 53217760, "step": 78945 }, { "epoch": 1.9287616348667336, "grad_norm": 10.752738952636719, "learning_rate": 1.5424539305250452e-06, "loss": 0.1116, "num_input_tokens_seen": 53220896, "step": 78950 }, { "epoch": 1.9288837856985808, "grad_norm": 13.85923957824707, "learning_rate": 1.542382288073541e-06, "loss": 0.2375, "num_input_tokens_seen": 53224160, "step": 78955 }, { "epoch": 1.9290059365304277, "grad_norm": 0.13441646099090576, "learning_rate": 1.5423106416776873e-06, "loss": 0.1627, "num_input_tokens_seen": 53227360, "step": 78960 }, { "epoch": 1.929128087362275, "grad_norm": 16.526451110839844, "learning_rate": 1.5422389913380046e-06, "loss": 0.0888, "num_input_tokens_seen": 53230560, "step": 78965 }, { "epoch": 1.9292502381941221, "grad_norm": 21.69394874572754, "learning_rate": 1.5421673370550142e-06, "loss": 0.0429, "num_input_tokens_seen": 53233824, "step": 78970 }, { "epoch": 1.9293723890259693, "grad_norm": 0.10255489498376846, "learning_rate": 1.542095678829237e-06, "loss": 0.0709, "num_input_tokens_seen": 53237984, "step": 78975 }, { "epoch": 1.9294945398578163, "grad_norm": 9.572564125061035, "learning_rate": 1.5420240166611942e-06, "loss": 0.264, "num_input_tokens_seen": 53241312, "step": 78980 }, { "epoch": 1.9296166906896635, "grad_norm": 17.74254608154297, "learning_rate": 1.5419523505514068e-06, "loss": 0.1324, "num_input_tokens_seen": 53245024, "step": 78985 }, { "epoch": 1.9297388415215107, "grad_norm": 30.855518341064453, "learning_rate": 1.5418806805003964e-06, "loss": 0.0684, "num_input_tokens_seen": 53248480, "step": 78990 }, { "epoch": 1.9298609923533578, "grad_norm": 22.330480575561523, "learning_rate": 1.5418090065086838e-06, "loss": 0.066, "num_input_tokens_seen": 53251744, "step": 78995 }, { "epoch": 1.929983143185205, "grad_norm": 0.38473808765411377, "learning_rate": 1.5417373285767903e-06, "loss": 0.0335, "num_input_tokens_seen": 53255328, "step": 79000 }, { "epoch": 1.9301052940170522, "grad_norm": 145.94451904296875, "learning_rate": 1.5416656467052374e-06, "loss": 0.0654, "num_input_tokens_seen": 53258784, "step": 79005 }, { "epoch": 1.9302274448488994, "grad_norm": 9.839098930358887, "learning_rate": 1.5415939608945463e-06, "loss": 0.1352, "num_input_tokens_seen": 53262368, "step": 79010 }, { "epoch": 1.9303495956807466, "grad_norm": 37.14972686767578, "learning_rate": 1.5415222711452382e-06, "loss": 0.093, "num_input_tokens_seen": 53266144, "step": 79015 }, { "epoch": 1.9304717465125938, "grad_norm": 0.1843390315771103, "learning_rate": 1.5414505774578342e-06, "loss": 0.0653, "num_input_tokens_seen": 53269472, "step": 79020 }, { "epoch": 1.930593897344441, "grad_norm": 75.80435180664062, "learning_rate": 1.5413788798328563e-06, "loss": 0.0632, "num_input_tokens_seen": 53272864, "step": 79025 }, { "epoch": 1.9307160481762882, "grad_norm": 16.962350845336914, "learning_rate": 1.5413071782708254e-06, "loss": 0.1579, "num_input_tokens_seen": 53275680, "step": 79030 }, { "epoch": 1.9308381990081354, "grad_norm": 0.4690840542316437, "learning_rate": 1.5412354727722631e-06, "loss": 0.0986, "num_input_tokens_seen": 53279456, "step": 79035 }, { "epoch": 1.9309603498399825, "grad_norm": 0.4627876281738281, "learning_rate": 1.541163763337691e-06, "loss": 0.0759, "num_input_tokens_seen": 53282592, "step": 79040 }, { "epoch": 1.9310825006718297, "grad_norm": 0.4975791871547699, "learning_rate": 1.5410920499676303e-06, "loss": 0.0426, "num_input_tokens_seen": 53286240, "step": 79045 }, { "epoch": 1.9312046515036767, "grad_norm": 0.12812969088554382, "learning_rate": 1.5410203326626028e-06, "loss": 0.0322, "num_input_tokens_seen": 53289632, "step": 79050 }, { "epoch": 1.931326802335524, "grad_norm": 22.060348510742188, "learning_rate": 1.54094861142313e-06, "loss": 0.0042, "num_input_tokens_seen": 53292896, "step": 79055 }, { "epoch": 1.931448953167371, "grad_norm": 54.32301330566406, "learning_rate": 1.5408768862497332e-06, "loss": 0.1168, "num_input_tokens_seen": 53296480, "step": 79060 }, { "epoch": 1.9315711039992183, "grad_norm": 22.543865203857422, "learning_rate": 1.5408051571429344e-06, "loss": 0.11, "num_input_tokens_seen": 53299744, "step": 79065 }, { "epoch": 1.9316932548310652, "grad_norm": 60.06733322143555, "learning_rate": 1.540733424103255e-06, "loss": 0.046, "num_input_tokens_seen": 53302624, "step": 79070 }, { "epoch": 1.9318154056629124, "grad_norm": 87.2182846069336, "learning_rate": 1.5406616871312166e-06, "loss": 0.0843, "num_input_tokens_seen": 53305824, "step": 79075 }, { "epoch": 1.9319375564947596, "grad_norm": 10.24471378326416, "learning_rate": 1.540589946227341e-06, "loss": 0.1187, "num_input_tokens_seen": 53308704, "step": 79080 }, { "epoch": 1.9320597073266068, "grad_norm": 0.9021111726760864, "learning_rate": 1.5405182013921498e-06, "loss": 0.0444, "num_input_tokens_seen": 53311712, "step": 79085 }, { "epoch": 1.932181858158454, "grad_norm": 0.08363750576972961, "learning_rate": 1.5404464526261651e-06, "loss": 0.175, "num_input_tokens_seen": 53315104, "step": 79090 }, { "epoch": 1.9323040089903012, "grad_norm": 0.9060919284820557, "learning_rate": 1.5403746999299083e-06, "loss": 0.0021, "num_input_tokens_seen": 53317920, "step": 79095 }, { "epoch": 1.9324261598221484, "grad_norm": 98.47150421142578, "learning_rate": 1.540302943303901e-06, "loss": 0.103, "num_input_tokens_seen": 53321376, "step": 79100 }, { "epoch": 1.9325483106539956, "grad_norm": 0.3717132806777954, "learning_rate": 1.5402311827486663e-06, "loss": 0.1041, "num_input_tokens_seen": 53324448, "step": 79105 }, { "epoch": 1.9326704614858428, "grad_norm": 8.272316932678223, "learning_rate": 1.5401594182647241e-06, "loss": 0.1032, "num_input_tokens_seen": 53328032, "step": 79110 }, { "epoch": 1.93279261231769, "grad_norm": 0.6474543809890747, "learning_rate": 1.5400876498525978e-06, "loss": 0.0018, "num_input_tokens_seen": 53331552, "step": 79115 }, { "epoch": 1.9329147631495371, "grad_norm": 0.18280108273029327, "learning_rate": 1.540015877512809e-06, "loss": 0.0707, "num_input_tokens_seen": 53335456, "step": 79120 }, { "epoch": 1.9330369139813843, "grad_norm": 1.0236232280731201, "learning_rate": 1.5399441012458793e-06, "loss": 0.1359, "num_input_tokens_seen": 53338720, "step": 79125 }, { "epoch": 1.9331590648132315, "grad_norm": 1.5323357582092285, "learning_rate": 1.5398723210523313e-06, "loss": 0.1184, "num_input_tokens_seen": 53341920, "step": 79130 }, { "epoch": 1.9332812156450785, "grad_norm": 0.17041514813899994, "learning_rate": 1.5398005369326859e-06, "loss": 0.035, "num_input_tokens_seen": 53345568, "step": 79135 }, { "epoch": 1.9334033664769257, "grad_norm": 1.063998818397522, "learning_rate": 1.5397287488874662e-06, "loss": 0.2139, "num_input_tokens_seen": 53348768, "step": 79140 }, { "epoch": 1.9335255173087729, "grad_norm": 0.6377407908439636, "learning_rate": 1.5396569569171935e-06, "loss": 0.088, "num_input_tokens_seen": 53352224, "step": 79145 }, { "epoch": 1.93364766814062, "grad_norm": 0.3253001570701599, "learning_rate": 1.5395851610223906e-06, "loss": 0.0501, "num_input_tokens_seen": 53355104, "step": 79150 }, { "epoch": 1.9337698189724672, "grad_norm": 9.995173454284668, "learning_rate": 1.5395133612035794e-06, "loss": 0.1531, "num_input_tokens_seen": 53358304, "step": 79155 }, { "epoch": 1.9338919698043142, "grad_norm": 0.03718879073858261, "learning_rate": 1.5394415574612816e-06, "loss": 0.0222, "num_input_tokens_seen": 53362016, "step": 79160 }, { "epoch": 1.9340141206361614, "grad_norm": 1.7763270139694214, "learning_rate": 1.5393697497960196e-06, "loss": 0.0414, "num_input_tokens_seen": 53365536, "step": 79165 }, { "epoch": 1.9341362714680086, "grad_norm": 14.749547958374023, "learning_rate": 1.5392979382083163e-06, "loss": 0.2279, "num_input_tokens_seen": 53369440, "step": 79170 }, { "epoch": 1.9342584222998558, "grad_norm": 28.31888771057129, "learning_rate": 1.5392261226986926e-06, "loss": 0.0723, "num_input_tokens_seen": 53372576, "step": 79175 }, { "epoch": 1.934380573131703, "grad_norm": 0.3445660173892975, "learning_rate": 1.5391543032676721e-06, "loss": 0.2502, "num_input_tokens_seen": 53375584, "step": 79180 }, { "epoch": 1.9345027239635502, "grad_norm": 0.6460484862327576, "learning_rate": 1.5390824799157763e-06, "loss": 0.0783, "num_input_tokens_seen": 53378528, "step": 79185 }, { "epoch": 1.9346248747953974, "grad_norm": 0.07000091671943665, "learning_rate": 1.5390106526435277e-06, "loss": 0.0312, "num_input_tokens_seen": 53381984, "step": 79190 }, { "epoch": 1.9347470256272445, "grad_norm": 51.154441833496094, "learning_rate": 1.5389388214514485e-06, "loss": 0.0512, "num_input_tokens_seen": 53385440, "step": 79195 }, { "epoch": 1.9348691764590917, "grad_norm": 7.436317443847656, "learning_rate": 1.5388669863400614e-06, "loss": 0.0864, "num_input_tokens_seen": 53388896, "step": 79200 }, { "epoch": 1.934991327290939, "grad_norm": 0.16629944741725922, "learning_rate": 1.5387951473098883e-06, "loss": 0.036, "num_input_tokens_seen": 53392288, "step": 79205 }, { "epoch": 1.9351134781227861, "grad_norm": 0.2667045593261719, "learning_rate": 1.5387233043614525e-06, "loss": 0.2508, "num_input_tokens_seen": 53395936, "step": 79210 }, { "epoch": 1.9352356289546333, "grad_norm": 0.6180437207221985, "learning_rate": 1.5386514574952756e-06, "loss": 0.0236, "num_input_tokens_seen": 53399200, "step": 79215 }, { "epoch": 1.9353577797864805, "grad_norm": 22.34016990661621, "learning_rate": 1.5385796067118805e-06, "loss": 0.0322, "num_input_tokens_seen": 53402784, "step": 79220 }, { "epoch": 1.9354799306183275, "grad_norm": 17.962881088256836, "learning_rate": 1.5385077520117898e-06, "loss": 0.1729, "num_input_tokens_seen": 53406624, "step": 79225 }, { "epoch": 1.9356020814501746, "grad_norm": 14.11638069152832, "learning_rate": 1.5384358933955257e-06, "loss": 0.2673, "num_input_tokens_seen": 53409824, "step": 79230 }, { "epoch": 1.9357242322820218, "grad_norm": 0.1589454859495163, "learning_rate": 1.5383640308636108e-06, "loss": 0.0014, "num_input_tokens_seen": 53413280, "step": 79235 }, { "epoch": 1.935846383113869, "grad_norm": 113.4941177368164, "learning_rate": 1.5382921644165682e-06, "loss": 0.0711, "num_input_tokens_seen": 53417312, "step": 79240 }, { "epoch": 1.9359685339457162, "grad_norm": 9.330788612365723, "learning_rate": 1.53822029405492e-06, "loss": 0.0745, "num_input_tokens_seen": 53420832, "step": 79245 }, { "epoch": 1.9360906847775632, "grad_norm": 0.9437325596809387, "learning_rate": 1.5381484197791891e-06, "loss": 0.0018, "num_input_tokens_seen": 53423840, "step": 79250 }, { "epoch": 1.9362128356094104, "grad_norm": 0.37375345826148987, "learning_rate": 1.5380765415898984e-06, "loss": 0.1126, "num_input_tokens_seen": 53426976, "step": 79255 }, { "epoch": 1.9363349864412576, "grad_norm": 0.38319942355155945, "learning_rate": 1.53800465948757e-06, "loss": 0.0283, "num_input_tokens_seen": 53430176, "step": 79260 }, { "epoch": 1.9364571372731048, "grad_norm": 6.067543029785156, "learning_rate": 1.537932773472727e-06, "loss": 0.0509, "num_input_tokens_seen": 53433824, "step": 79265 }, { "epoch": 1.936579288104952, "grad_norm": 44.16752624511719, "learning_rate": 1.5378608835458922e-06, "loss": 0.0054, "num_input_tokens_seen": 53437024, "step": 79270 }, { "epoch": 1.9367014389367991, "grad_norm": 0.036722905933856964, "learning_rate": 1.5377889897075886e-06, "loss": 0.0606, "num_input_tokens_seen": 53440224, "step": 79275 }, { "epoch": 1.9368235897686463, "grad_norm": 0.9072578549385071, "learning_rate": 1.537717091958339e-06, "loss": 0.0886, "num_input_tokens_seen": 53443808, "step": 79280 }, { "epoch": 1.9369457406004935, "grad_norm": 0.0430944450199604, "learning_rate": 1.5376451902986659e-06, "loss": 0.1185, "num_input_tokens_seen": 53446880, "step": 79285 }, { "epoch": 1.9370678914323407, "grad_norm": 11.946255683898926, "learning_rate": 1.5375732847290923e-06, "loss": 0.0366, "num_input_tokens_seen": 53450464, "step": 79290 }, { "epoch": 1.937190042264188, "grad_norm": 0.02077353373169899, "learning_rate": 1.5375013752501412e-06, "loss": 0.0236, "num_input_tokens_seen": 53453792, "step": 79295 }, { "epoch": 1.937312193096035, "grad_norm": 13.418227195739746, "learning_rate": 1.5374294618623354e-06, "loss": 0.0492, "num_input_tokens_seen": 53457120, "step": 79300 }, { "epoch": 1.9374343439278823, "grad_norm": 0.3700249493122101, "learning_rate": 1.537357544566198e-06, "loss": 0.043, "num_input_tokens_seen": 53460448, "step": 79305 }, { "epoch": 1.9375564947597295, "grad_norm": 0.8246507048606873, "learning_rate": 1.537285623362252e-06, "loss": 0.0276, "num_input_tokens_seen": 53463776, "step": 79310 }, { "epoch": 1.9376786455915764, "grad_norm": 13.733869552612305, "learning_rate": 1.5372136982510203e-06, "loss": 0.0755, "num_input_tokens_seen": 53467232, "step": 79315 }, { "epoch": 1.9378007964234236, "grad_norm": 0.0676179900765419, "learning_rate": 1.5371417692330267e-06, "loss": 0.0909, "num_input_tokens_seen": 53470752, "step": 79320 }, { "epoch": 1.9379229472552708, "grad_norm": 226.75962829589844, "learning_rate": 1.537069836308793e-06, "loss": 0.0852, "num_input_tokens_seen": 53473824, "step": 79325 }, { "epoch": 1.938045098087118, "grad_norm": 9.021869659423828, "learning_rate": 1.5369978994788436e-06, "loss": 0.1114, "num_input_tokens_seen": 53476512, "step": 79330 }, { "epoch": 1.9381672489189652, "grad_norm": 14.622756004333496, "learning_rate": 1.5369259587437006e-06, "loss": 0.0344, "num_input_tokens_seen": 53479520, "step": 79335 }, { "epoch": 1.9382893997508122, "grad_norm": 9.884684562683105, "learning_rate": 1.5368540141038876e-06, "loss": 0.1046, "num_input_tokens_seen": 53482656, "step": 79340 }, { "epoch": 1.9384115505826593, "grad_norm": 0.8400244116783142, "learning_rate": 1.5367820655599283e-06, "loss": 0.0904, "num_input_tokens_seen": 53486240, "step": 79345 }, { "epoch": 1.9385337014145065, "grad_norm": 113.14087677001953, "learning_rate": 1.536710113112345e-06, "loss": 0.1344, "num_input_tokens_seen": 53489760, "step": 79350 }, { "epoch": 1.9386558522463537, "grad_norm": 0.09375862032175064, "learning_rate": 1.5366381567616615e-06, "loss": 0.2347, "num_input_tokens_seen": 53492960, "step": 79355 }, { "epoch": 1.938778003078201, "grad_norm": 14.247235298156738, "learning_rate": 1.5365661965084008e-06, "loss": 0.0645, "num_input_tokens_seen": 53496224, "step": 79360 }, { "epoch": 1.938900153910048, "grad_norm": 17.835500717163086, "learning_rate": 1.5364942323530868e-06, "loss": 0.1529, "num_input_tokens_seen": 53499168, "step": 79365 }, { "epoch": 1.9390223047418953, "grad_norm": 0.8982129693031311, "learning_rate": 1.536422264296242e-06, "loss": 0.1296, "num_input_tokens_seen": 53502176, "step": 79370 }, { "epoch": 1.9391444555737425, "grad_norm": 0.0543377548456192, "learning_rate": 1.5363502923383906e-06, "loss": 0.1098, "num_input_tokens_seen": 53505120, "step": 79375 }, { "epoch": 1.9392666064055897, "grad_norm": 0.25972822308540344, "learning_rate": 1.5362783164800554e-06, "loss": 0.0626, "num_input_tokens_seen": 53508256, "step": 79380 }, { "epoch": 1.9393887572374369, "grad_norm": 0.461689293384552, "learning_rate": 1.5362063367217603e-06, "loss": 0.0417, "num_input_tokens_seen": 53511648, "step": 79385 }, { "epoch": 1.939510908069284, "grad_norm": 1.3230669498443604, "learning_rate": 1.5361343530640283e-06, "loss": 0.1514, "num_input_tokens_seen": 53514656, "step": 79390 }, { "epoch": 1.9396330589011312, "grad_norm": 0.2136191427707672, "learning_rate": 1.536062365507383e-06, "loss": 0.0348, "num_input_tokens_seen": 53517664, "step": 79395 }, { "epoch": 1.9397552097329784, "grad_norm": 1.2831095457077026, "learning_rate": 1.5359903740523481e-06, "loss": 0.1085, "num_input_tokens_seen": 53520992, "step": 79400 }, { "epoch": 1.9398773605648254, "grad_norm": 0.11328689008951187, "learning_rate": 1.535918378699447e-06, "loss": 0.1096, "num_input_tokens_seen": 53524000, "step": 79405 }, { "epoch": 1.9399995113966726, "grad_norm": 0.14550715684890747, "learning_rate": 1.5358463794492034e-06, "loss": 0.1842, "num_input_tokens_seen": 53527200, "step": 79410 }, { "epoch": 1.9401216622285198, "grad_norm": 2.1023430824279785, "learning_rate": 1.5357743763021407e-06, "loss": 0.0709, "num_input_tokens_seen": 53530784, "step": 79415 }, { "epoch": 1.940243813060367, "grad_norm": 0.47813543677330017, "learning_rate": 1.5357023692587827e-06, "loss": 0.0449, "num_input_tokens_seen": 53534048, "step": 79420 }, { "epoch": 1.940365963892214, "grad_norm": 0.7654673457145691, "learning_rate": 1.5356303583196528e-06, "loss": 0.0918, "num_input_tokens_seen": 53537440, "step": 79425 }, { "epoch": 1.9404881147240611, "grad_norm": 0.6435978412628174, "learning_rate": 1.5355583434852749e-06, "loss": 0.068, "num_input_tokens_seen": 53540064, "step": 79430 }, { "epoch": 1.9406102655559083, "grad_norm": 16.76521873474121, "learning_rate": 1.535486324756173e-06, "loss": 0.0767, "num_input_tokens_seen": 53543648, "step": 79435 }, { "epoch": 1.9407324163877555, "grad_norm": 0.1286541223526001, "learning_rate": 1.5354143021328704e-06, "loss": 0.0317, "num_input_tokens_seen": 53547488, "step": 79440 }, { "epoch": 1.9408545672196027, "grad_norm": 16.738767623901367, "learning_rate": 1.5353422756158909e-06, "loss": 0.0711, "num_input_tokens_seen": 53551136, "step": 79445 }, { "epoch": 1.94097671805145, "grad_norm": 97.97506713867188, "learning_rate": 1.5352702452057584e-06, "loss": 0.0091, "num_input_tokens_seen": 53554144, "step": 79450 }, { "epoch": 1.941098868883297, "grad_norm": 13.813468933105469, "learning_rate": 1.5351982109029964e-06, "loss": 0.0815, "num_input_tokens_seen": 53557280, "step": 79455 }, { "epoch": 1.9412210197151443, "grad_norm": 0.16498295962810516, "learning_rate": 1.5351261727081295e-06, "loss": 0.0037, "num_input_tokens_seen": 53560288, "step": 79460 }, { "epoch": 1.9413431705469915, "grad_norm": 27.545860290527344, "learning_rate": 1.5350541306216809e-06, "loss": 0.1259, "num_input_tokens_seen": 53563808, "step": 79465 }, { "epoch": 1.9414653213788386, "grad_norm": 0.7710163593292236, "learning_rate": 1.5349820846441748e-06, "loss": 0.001, "num_input_tokens_seen": 53567392, "step": 79470 }, { "epoch": 1.9415874722106858, "grad_norm": 56.430171966552734, "learning_rate": 1.5349100347761353e-06, "loss": 0.1426, "num_input_tokens_seen": 53570400, "step": 79475 }, { "epoch": 1.941709623042533, "grad_norm": 43.03308868408203, "learning_rate": 1.5348379810180858e-06, "loss": 0.0804, "num_input_tokens_seen": 53573792, "step": 79480 }, { "epoch": 1.9418317738743802, "grad_norm": 0.030352571979165077, "learning_rate": 1.5347659233705507e-06, "loss": 0.0546, "num_input_tokens_seen": 53576992, "step": 79485 }, { "epoch": 1.9419539247062274, "grad_norm": 0.024639304727315903, "learning_rate": 1.534693861834054e-06, "loss": 0.1037, "num_input_tokens_seen": 53580256, "step": 79490 }, { "epoch": 1.9420760755380744, "grad_norm": 0.3017178177833557, "learning_rate": 1.5346217964091198e-06, "loss": 0.0331, "num_input_tokens_seen": 53584224, "step": 79495 }, { "epoch": 1.9421982263699216, "grad_norm": 0.5502581596374512, "learning_rate": 1.5345497270962724e-06, "loss": 0.0263, "num_input_tokens_seen": 53587744, "step": 79500 }, { "epoch": 1.9423203772017688, "grad_norm": 31.735069274902344, "learning_rate": 1.5344776538960353e-06, "loss": 0.0533, "num_input_tokens_seen": 53590880, "step": 79505 }, { "epoch": 1.942442528033616, "grad_norm": 0.3141069710254669, "learning_rate": 1.534405576808933e-06, "loss": 0.144, "num_input_tokens_seen": 53594208, "step": 79510 }, { "epoch": 1.942564678865463, "grad_norm": 0.16800308227539062, "learning_rate": 1.5343334958354893e-06, "loss": 0.0418, "num_input_tokens_seen": 53597792, "step": 79515 }, { "epoch": 1.94268682969731, "grad_norm": 0.10520976036787033, "learning_rate": 1.534261410976229e-06, "loss": 0.0889, "num_input_tokens_seen": 53600864, "step": 79520 }, { "epoch": 1.9428089805291573, "grad_norm": 0.51318359375, "learning_rate": 1.5341893222316759e-06, "loss": 0.1442, "num_input_tokens_seen": 53604320, "step": 79525 }, { "epoch": 1.9429311313610045, "grad_norm": 0.11362060904502869, "learning_rate": 1.5341172296023545e-06, "loss": 0.0782, "num_input_tokens_seen": 53607776, "step": 79530 }, { "epoch": 1.9430532821928517, "grad_norm": 13.606107711791992, "learning_rate": 1.5340451330887891e-06, "loss": 0.0684, "num_input_tokens_seen": 53611552, "step": 79535 }, { "epoch": 1.9431754330246989, "grad_norm": 0.14235706627368927, "learning_rate": 1.5339730326915038e-06, "loss": 0.1107, "num_input_tokens_seen": 53615648, "step": 79540 }, { "epoch": 1.943297583856546, "grad_norm": 0.2929399907588959, "learning_rate": 1.5339009284110228e-06, "loss": 0.1097, "num_input_tokens_seen": 53618848, "step": 79545 }, { "epoch": 1.9434197346883932, "grad_norm": 0.07972901314496994, "learning_rate": 1.5338288202478706e-06, "loss": 0.0017, "num_input_tokens_seen": 53622432, "step": 79550 }, { "epoch": 1.9435418855202404, "grad_norm": 1.1873873472213745, "learning_rate": 1.5337567082025714e-06, "loss": 0.0393, "num_input_tokens_seen": 53626016, "step": 79555 }, { "epoch": 1.9436640363520876, "grad_norm": 40.611732482910156, "learning_rate": 1.5336845922756502e-06, "loss": 0.1167, "num_input_tokens_seen": 53629792, "step": 79560 }, { "epoch": 1.9437861871839348, "grad_norm": 0.14855530858039856, "learning_rate": 1.5336124724676314e-06, "loss": 0.012, "num_input_tokens_seen": 53632736, "step": 79565 }, { "epoch": 1.943908338015782, "grad_norm": 0.19300849735736847, "learning_rate": 1.533540348779039e-06, "loss": 0.0345, "num_input_tokens_seen": 53636192, "step": 79570 }, { "epoch": 1.9440304888476292, "grad_norm": 0.4298967123031616, "learning_rate": 1.5334682212103973e-06, "loss": 0.0412, "num_input_tokens_seen": 53639968, "step": 79575 }, { "epoch": 1.9441526396794764, "grad_norm": 8.890560150146484, "learning_rate": 1.5333960897622313e-06, "loss": 0.0968, "num_input_tokens_seen": 53643104, "step": 79580 }, { "epoch": 1.9442747905113233, "grad_norm": 0.10859589278697968, "learning_rate": 1.5333239544350656e-06, "loss": 0.0289, "num_input_tokens_seen": 53645856, "step": 79585 }, { "epoch": 1.9443969413431705, "grad_norm": 0.24548016488552094, "learning_rate": 1.533251815229425e-06, "loss": 0.0682, "num_input_tokens_seen": 53649696, "step": 79590 }, { "epoch": 1.9445190921750177, "grad_norm": 12.39043140411377, "learning_rate": 1.5331796721458332e-06, "loss": 0.2259, "num_input_tokens_seen": 53653152, "step": 79595 }, { "epoch": 1.944641243006865, "grad_norm": 0.2160811573266983, "learning_rate": 1.5331075251848159e-06, "loss": 0.1408, "num_input_tokens_seen": 53656672, "step": 79600 }, { "epoch": 1.9447633938387119, "grad_norm": 32.371551513671875, "learning_rate": 1.5330353743468968e-06, "loss": 0.1304, "num_input_tokens_seen": 53659744, "step": 79605 }, { "epoch": 1.944885544670559, "grad_norm": 0.23671525716781616, "learning_rate": 1.5329632196326015e-06, "loss": 0.0999, "num_input_tokens_seen": 53663136, "step": 79610 }, { "epoch": 1.9450076955024063, "grad_norm": 0.14334982633590698, "learning_rate": 1.532891061042454e-06, "loss": 0.0075, "num_input_tokens_seen": 53666208, "step": 79615 }, { "epoch": 1.9451298463342535, "grad_norm": 12.645910263061523, "learning_rate": 1.5328188985769795e-06, "loss": 0.0496, "num_input_tokens_seen": 53669792, "step": 79620 }, { "epoch": 1.9452519971661006, "grad_norm": 80.78424835205078, "learning_rate": 1.5327467322367028e-06, "loss": 0.0075, "num_input_tokens_seen": 53673312, "step": 79625 }, { "epoch": 1.9453741479979478, "grad_norm": 0.27641037106513977, "learning_rate": 1.5326745620221484e-06, "loss": 0.0013, "num_input_tokens_seen": 53676832, "step": 79630 }, { "epoch": 1.945496298829795, "grad_norm": 0.14950011670589447, "learning_rate": 1.5326023879338411e-06, "loss": 0.1459, "num_input_tokens_seen": 53679968, "step": 79635 }, { "epoch": 1.9456184496616422, "grad_norm": 2.9096691608428955, "learning_rate": 1.5325302099723065e-06, "loss": 0.1007, "num_input_tokens_seen": 53683488, "step": 79640 }, { "epoch": 1.9457406004934894, "grad_norm": 0.5228842496871948, "learning_rate": 1.5324580281380689e-06, "loss": 0.079, "num_input_tokens_seen": 53686496, "step": 79645 }, { "epoch": 1.9458627513253366, "grad_norm": 27.07952308654785, "learning_rate": 1.5323858424316529e-06, "loss": 0.0798, "num_input_tokens_seen": 53690592, "step": 79650 }, { "epoch": 1.9459849021571838, "grad_norm": 2.3000648021698, "learning_rate": 1.5323136528535842e-06, "loss": 0.0389, "num_input_tokens_seen": 53693920, "step": 79655 }, { "epoch": 1.946107052989031, "grad_norm": 42.30258560180664, "learning_rate": 1.5322414594043874e-06, "loss": 0.2223, "num_input_tokens_seen": 53697376, "step": 79660 }, { "epoch": 1.9462292038208782, "grad_norm": 0.7183613181114197, "learning_rate": 1.5321692620845875e-06, "loss": 0.1392, "num_input_tokens_seen": 53700768, "step": 79665 }, { "epoch": 1.9463513546527251, "grad_norm": 2.183073043823242, "learning_rate": 1.5320970608947093e-06, "loss": 0.1499, "num_input_tokens_seen": 53704032, "step": 79670 }, { "epoch": 1.9464735054845723, "grad_norm": 6.9765777587890625, "learning_rate": 1.5320248558352784e-06, "loss": 0.1676, "num_input_tokens_seen": 53707360, "step": 79675 }, { "epoch": 1.9465956563164195, "grad_norm": 18.949779510498047, "learning_rate": 1.5319526469068196e-06, "loss": 0.0718, "num_input_tokens_seen": 53710560, "step": 79680 }, { "epoch": 1.9467178071482667, "grad_norm": 33.51317596435547, "learning_rate": 1.5318804341098583e-06, "loss": 0.1137, "num_input_tokens_seen": 53713504, "step": 79685 }, { "epoch": 1.9468399579801139, "grad_norm": 0.09625236690044403, "learning_rate": 1.5318082174449192e-06, "loss": 0.1201, "num_input_tokens_seen": 53716704, "step": 79690 }, { "epoch": 1.9469621088119609, "grad_norm": 1.1682844161987305, "learning_rate": 1.5317359969125279e-06, "loss": 0.0778, "num_input_tokens_seen": 53720480, "step": 79695 }, { "epoch": 1.947084259643808, "grad_norm": 21.76727294921875, "learning_rate": 1.5316637725132094e-06, "loss": 0.1615, "num_input_tokens_seen": 53723616, "step": 79700 }, { "epoch": 1.9472064104756552, "grad_norm": 0.10677886009216309, "learning_rate": 1.5315915442474887e-06, "loss": 0.0031, "num_input_tokens_seen": 53726816, "step": 79705 }, { "epoch": 1.9473285613075024, "grad_norm": 31.727231979370117, "learning_rate": 1.5315193121158915e-06, "loss": 0.1203, "num_input_tokens_seen": 53730080, "step": 79710 }, { "epoch": 1.9474507121393496, "grad_norm": 0.15762321650981903, "learning_rate": 1.5314470761189429e-06, "loss": 0.0737, "num_input_tokens_seen": 53733088, "step": 79715 }, { "epoch": 1.9475728629711968, "grad_norm": 12.666200637817383, "learning_rate": 1.5313748362571681e-06, "loss": 0.2403, "num_input_tokens_seen": 53736544, "step": 79720 }, { "epoch": 1.947695013803044, "grad_norm": 8.358912467956543, "learning_rate": 1.5313025925310928e-06, "loss": 0.0776, "num_input_tokens_seen": 53739680, "step": 79725 }, { "epoch": 1.9478171646348912, "grad_norm": 15.929183959960938, "learning_rate": 1.5312303449412419e-06, "loss": 0.0806, "num_input_tokens_seen": 53743072, "step": 79730 }, { "epoch": 1.9479393154667384, "grad_norm": 0.23921556770801544, "learning_rate": 1.531158093488141e-06, "loss": 0.0024, "num_input_tokens_seen": 53746144, "step": 79735 }, { "epoch": 1.9480614662985856, "grad_norm": 34.633052825927734, "learning_rate": 1.5310858381723154e-06, "loss": 0.0449, "num_input_tokens_seen": 53749344, "step": 79740 }, { "epoch": 1.9481836171304328, "grad_norm": 0.1212652325630188, "learning_rate": 1.5310135789942915e-06, "loss": 0.0016, "num_input_tokens_seen": 53752928, "step": 79745 }, { "epoch": 1.94830576796228, "grad_norm": 0.3784651756286621, "learning_rate": 1.5309413159545935e-06, "loss": 0.0685, "num_input_tokens_seen": 53756576, "step": 79750 }, { "epoch": 1.9484279187941271, "grad_norm": 23.24256706237793, "learning_rate": 1.5308690490537477e-06, "loss": 0.1998, "num_input_tokens_seen": 53759776, "step": 79755 }, { "epoch": 1.948550069625974, "grad_norm": 38.22763442993164, "learning_rate": 1.530796778292279e-06, "loss": 0.1284, "num_input_tokens_seen": 53763104, "step": 79760 }, { "epoch": 1.9486722204578213, "grad_norm": 0.16112475097179413, "learning_rate": 1.5307245036707136e-06, "loss": 0.0915, "num_input_tokens_seen": 53766304, "step": 79765 }, { "epoch": 1.9487943712896685, "grad_norm": 192.25909423828125, "learning_rate": 1.5306522251895766e-06, "loss": 0.2308, "num_input_tokens_seen": 53769632, "step": 79770 }, { "epoch": 1.9489165221215157, "grad_norm": 0.2769120931625366, "learning_rate": 1.5305799428493944e-06, "loss": 0.0016, "num_input_tokens_seen": 53773216, "step": 79775 }, { "epoch": 1.9490386729533629, "grad_norm": 1.8229011297225952, "learning_rate": 1.5305076566506918e-06, "loss": 0.0651, "num_input_tokens_seen": 53776544, "step": 79780 }, { "epoch": 1.9491608237852098, "grad_norm": 0.07343819737434387, "learning_rate": 1.530435366593995e-06, "loss": 0.0341, "num_input_tokens_seen": 53780192, "step": 79785 }, { "epoch": 1.949282974617057, "grad_norm": 6.312066555023193, "learning_rate": 1.5303630726798294e-06, "loss": 0.0921, "num_input_tokens_seen": 53783520, "step": 79790 }, { "epoch": 1.9494051254489042, "grad_norm": 23.09063720703125, "learning_rate": 1.5302907749087209e-06, "loss": 0.0783, "num_input_tokens_seen": 53787616, "step": 79795 }, { "epoch": 1.9495272762807514, "grad_norm": 0.6398938894271851, "learning_rate": 1.5302184732811952e-06, "loss": 0.0736, "num_input_tokens_seen": 53791072, "step": 79800 }, { "epoch": 1.9496494271125986, "grad_norm": 0.10749911516904831, "learning_rate": 1.5301461677977782e-06, "loss": 0.0424, "num_input_tokens_seen": 53794528, "step": 79805 }, { "epoch": 1.9497715779444458, "grad_norm": 0.19549265503883362, "learning_rate": 1.530073858458996e-06, "loss": 0.0995, "num_input_tokens_seen": 53798048, "step": 79810 }, { "epoch": 1.949893728776293, "grad_norm": 0.5635766983032227, "learning_rate": 1.5300015452653737e-06, "loss": 0.045, "num_input_tokens_seen": 53801568, "step": 79815 }, { "epoch": 1.9500158796081402, "grad_norm": 0.032533228397369385, "learning_rate": 1.529929228217438e-06, "loss": 0.0998, "num_input_tokens_seen": 53804768, "step": 79820 }, { "epoch": 1.9501380304399873, "grad_norm": 0.07323387265205383, "learning_rate": 1.5298569073157138e-06, "loss": 0.1129, "num_input_tokens_seen": 53807968, "step": 79825 }, { "epoch": 1.9502601812718345, "grad_norm": 140.5348358154297, "learning_rate": 1.529784582560728e-06, "loss": 0.0385, "num_input_tokens_seen": 53811680, "step": 79830 }, { "epoch": 1.9503823321036817, "grad_norm": 0.1245734691619873, "learning_rate": 1.5297122539530061e-06, "loss": 0.1016, "num_input_tokens_seen": 53815520, "step": 79835 }, { "epoch": 1.950504482935529, "grad_norm": 15.518820762634277, "learning_rate": 1.5296399214930746e-06, "loss": 0.1642, "num_input_tokens_seen": 53818784, "step": 79840 }, { "epoch": 1.950626633767376, "grad_norm": 0.4381111264228821, "learning_rate": 1.529567585181459e-06, "loss": 0.1013, "num_input_tokens_seen": 53822048, "step": 79845 }, { "epoch": 1.950748784599223, "grad_norm": 27.99272346496582, "learning_rate": 1.529495245018685e-06, "loss": 0.0947, "num_input_tokens_seen": 53825056, "step": 79850 }, { "epoch": 1.9508709354310703, "grad_norm": 0.1914575845003128, "learning_rate": 1.5294229010052799e-06, "loss": 0.0969, "num_input_tokens_seen": 53828448, "step": 79855 }, { "epoch": 1.9509930862629175, "grad_norm": 23.358551025390625, "learning_rate": 1.5293505531417686e-06, "loss": 0.0795, "num_input_tokens_seen": 53831648, "step": 79860 }, { "epoch": 1.9511152370947646, "grad_norm": 20.271875381469727, "learning_rate": 1.5292782014286778e-06, "loss": 0.1232, "num_input_tokens_seen": 53834720, "step": 79865 }, { "epoch": 1.9512373879266118, "grad_norm": 0.1967715322971344, "learning_rate": 1.5292058458665336e-06, "loss": 0.0317, "num_input_tokens_seen": 53837664, "step": 79870 }, { "epoch": 1.9513595387584588, "grad_norm": 0.1904851347208023, "learning_rate": 1.5291334864558621e-06, "loss": 0.1238, "num_input_tokens_seen": 53840928, "step": 79875 }, { "epoch": 1.951481689590306, "grad_norm": 2.9282398223876953, "learning_rate": 1.5290611231971895e-06, "loss": 0.1142, "num_input_tokens_seen": 53844576, "step": 79880 }, { "epoch": 1.9516038404221532, "grad_norm": 11.948089599609375, "learning_rate": 1.5289887560910422e-06, "loss": 0.0476, "num_input_tokens_seen": 53848032, "step": 79885 }, { "epoch": 1.9517259912540004, "grad_norm": 175.88540649414062, "learning_rate": 1.528916385137946e-06, "loss": 0.1525, "num_input_tokens_seen": 53851296, "step": 79890 }, { "epoch": 1.9518481420858476, "grad_norm": 126.39527130126953, "learning_rate": 1.528844010338428e-06, "loss": 0.1233, "num_input_tokens_seen": 53854496, "step": 79895 }, { "epoch": 1.9519702929176947, "grad_norm": 0.3504171669483185, "learning_rate": 1.5287716316930146e-06, "loss": 0.1436, "num_input_tokens_seen": 53858080, "step": 79900 }, { "epoch": 1.952092443749542, "grad_norm": 18.548446655273438, "learning_rate": 1.528699249202231e-06, "loss": 0.1101, "num_input_tokens_seen": 53861472, "step": 79905 }, { "epoch": 1.9522145945813891, "grad_norm": 0.4505668878555298, "learning_rate": 1.5286268628666044e-06, "loss": 0.1607, "num_input_tokens_seen": 53864416, "step": 79910 }, { "epoch": 1.9523367454132363, "grad_norm": 26.039945602416992, "learning_rate": 1.5285544726866611e-06, "loss": 0.0703, "num_input_tokens_seen": 53867808, "step": 79915 }, { "epoch": 1.9524588962450835, "grad_norm": 10.314916610717773, "learning_rate": 1.5284820786629274e-06, "loss": 0.1685, "num_input_tokens_seen": 53870752, "step": 79920 }, { "epoch": 1.9525810470769307, "grad_norm": 12.654891967773438, "learning_rate": 1.52840968079593e-06, "loss": 0.0704, "num_input_tokens_seen": 53873952, "step": 79925 }, { "epoch": 1.9527031979087779, "grad_norm": 0.6298151016235352, "learning_rate": 1.528337279086195e-06, "loss": 0.0026, "num_input_tokens_seen": 53877664, "step": 79930 }, { "epoch": 1.952825348740625, "grad_norm": 0.339324414730072, "learning_rate": 1.5282648735342495e-06, "loss": 0.0327, "num_input_tokens_seen": 53880800, "step": 79935 }, { "epoch": 1.952947499572472, "grad_norm": 113.13359069824219, "learning_rate": 1.5281924641406198e-06, "loss": 0.0877, "num_input_tokens_seen": 53884960, "step": 79940 }, { "epoch": 1.9530696504043192, "grad_norm": 0.9290973544120789, "learning_rate": 1.5281200509058322e-06, "loss": 0.0775, "num_input_tokens_seen": 53887968, "step": 79945 }, { "epoch": 1.9531918012361664, "grad_norm": 0.639180600643158, "learning_rate": 1.5280476338304139e-06, "loss": 0.1576, "num_input_tokens_seen": 53891168, "step": 79950 }, { "epoch": 1.9533139520680136, "grad_norm": 7.0396294593811035, "learning_rate": 1.527975212914891e-06, "loss": 0.0053, "num_input_tokens_seen": 53894560, "step": 79955 }, { "epoch": 1.9534361028998606, "grad_norm": 55.963619232177734, "learning_rate": 1.5279027881597904e-06, "loss": 0.0029, "num_input_tokens_seen": 53897824, "step": 79960 }, { "epoch": 1.9535582537317078, "grad_norm": 0.18510715663433075, "learning_rate": 1.5278303595656384e-06, "loss": 0.0022, "num_input_tokens_seen": 53901600, "step": 79965 }, { "epoch": 1.953680404563555, "grad_norm": 0.17064093053340912, "learning_rate": 1.5277579271329623e-06, "loss": 0.0011, "num_input_tokens_seen": 53904992, "step": 79970 }, { "epoch": 1.9538025553954022, "grad_norm": 0.2390449494123459, "learning_rate": 1.5276854908622887e-06, "loss": 0.0596, "num_input_tokens_seen": 53908000, "step": 79975 }, { "epoch": 1.9539247062272493, "grad_norm": 0.059372562915086746, "learning_rate": 1.527613050754144e-06, "loss": 0.0008, "num_input_tokens_seen": 53911328, "step": 79980 }, { "epoch": 1.9540468570590965, "grad_norm": 17.186986923217773, "learning_rate": 1.5275406068090555e-06, "loss": 0.2629, "num_input_tokens_seen": 53914464, "step": 79985 }, { "epoch": 1.9541690078909437, "grad_norm": 0.08534011989831924, "learning_rate": 1.5274681590275495e-06, "loss": 0.0013, "num_input_tokens_seen": 53917856, "step": 79990 }, { "epoch": 1.954291158722791, "grad_norm": 0.15575003623962402, "learning_rate": 1.5273957074101539e-06, "loss": 0.0438, "num_input_tokens_seen": 53921376, "step": 79995 }, { "epoch": 1.954413309554638, "grad_norm": 31.548913955688477, "learning_rate": 1.5273232519573943e-06, "loss": 0.074, "num_input_tokens_seen": 53924512, "step": 80000 }, { "epoch": 1.9545354603864853, "grad_norm": 87.1722183227539, "learning_rate": 1.5272507926697983e-06, "loss": 0.0981, "num_input_tokens_seen": 53927904, "step": 80005 }, { "epoch": 1.9546576112183325, "grad_norm": 10.903924942016602, "learning_rate": 1.527178329547893e-06, "loss": 0.21, "num_input_tokens_seen": 53931296, "step": 80010 }, { "epoch": 1.9547797620501797, "grad_norm": 0.21467016637325287, "learning_rate": 1.5271058625922044e-06, "loss": 0.0498, "num_input_tokens_seen": 53934688, "step": 80015 }, { "epoch": 1.9549019128820269, "grad_norm": 21.4910888671875, "learning_rate": 1.5270333918032607e-06, "loss": 0.1497, "num_input_tokens_seen": 53938016, "step": 80020 }, { "epoch": 1.955024063713874, "grad_norm": 0.37493425607681274, "learning_rate": 1.5269609171815884e-06, "loss": 0.0401, "num_input_tokens_seen": 53940960, "step": 80025 }, { "epoch": 1.955146214545721, "grad_norm": 0.16464658081531525, "learning_rate": 1.5268884387277143e-06, "loss": 0.1183, "num_input_tokens_seen": 53944608, "step": 80030 }, { "epoch": 1.9552683653775682, "grad_norm": 0.4630794823169708, "learning_rate": 1.5268159564421658e-06, "loss": 0.1696, "num_input_tokens_seen": 53947552, "step": 80035 }, { "epoch": 1.9553905162094154, "grad_norm": 0.6731862425804138, "learning_rate": 1.5267434703254701e-06, "loss": 0.0395, "num_input_tokens_seen": 53950880, "step": 80040 }, { "epoch": 1.9555126670412626, "grad_norm": 0.07116980850696564, "learning_rate": 1.5266709803781544e-06, "loss": 0.065, "num_input_tokens_seen": 53954336, "step": 80045 }, { "epoch": 1.9556348178731096, "grad_norm": 30.472640991210938, "learning_rate": 1.5265984866007453e-06, "loss": 0.0143, "num_input_tokens_seen": 53957728, "step": 80050 }, { "epoch": 1.9557569687049567, "grad_norm": 0.06493127346038818, "learning_rate": 1.5265259889937708e-06, "loss": 0.0829, "num_input_tokens_seen": 53961056, "step": 80055 }, { "epoch": 1.955879119536804, "grad_norm": 0.16695833206176758, "learning_rate": 1.5264534875577575e-06, "loss": 0.1419, "num_input_tokens_seen": 53964320, "step": 80060 }, { "epoch": 1.9560012703686511, "grad_norm": 24.053009033203125, "learning_rate": 1.526380982293233e-06, "loss": 0.1244, "num_input_tokens_seen": 53967328, "step": 80065 }, { "epoch": 1.9561234212004983, "grad_norm": 8.561441421508789, "learning_rate": 1.5263084732007242e-06, "loss": 0.0039, "num_input_tokens_seen": 53970400, "step": 80070 }, { "epoch": 1.9562455720323455, "grad_norm": 8.988033294677734, "learning_rate": 1.5262359602807583e-06, "loss": 0.0678, "num_input_tokens_seen": 53973536, "step": 80075 }, { "epoch": 1.9563677228641927, "grad_norm": 41.8480339050293, "learning_rate": 1.5261634435338632e-06, "loss": 0.0714, "num_input_tokens_seen": 53976736, "step": 80080 }, { "epoch": 1.9564898736960399, "grad_norm": 10.159090042114258, "learning_rate": 1.526090922960566e-06, "loss": 0.0319, "num_input_tokens_seen": 53980064, "step": 80085 }, { "epoch": 1.956612024527887, "grad_norm": 0.9245445132255554, "learning_rate": 1.5260183985613945e-06, "loss": 0.0608, "num_input_tokens_seen": 53983584, "step": 80090 }, { "epoch": 1.9567341753597343, "grad_norm": 0.111895851790905, "learning_rate": 1.5259458703368754e-06, "loss": 0.0483, "num_input_tokens_seen": 53986976, "step": 80095 }, { "epoch": 1.9568563261915815, "grad_norm": 20.46241569519043, "learning_rate": 1.5258733382875365e-06, "loss": 0.2075, "num_input_tokens_seen": 53990688, "step": 80100 }, { "epoch": 1.9569784770234286, "grad_norm": 256.2488098144531, "learning_rate": 1.5258008024139052e-06, "loss": 0.1296, "num_input_tokens_seen": 53993824, "step": 80105 }, { "epoch": 1.9571006278552758, "grad_norm": 0.30050423741340637, "learning_rate": 1.5257282627165093e-06, "loss": 0.1, "num_input_tokens_seen": 53998304, "step": 80110 }, { "epoch": 1.957222778687123, "grad_norm": 2.183509349822998, "learning_rate": 1.5256557191958756e-06, "loss": 0.0711, "num_input_tokens_seen": 54001952, "step": 80115 }, { "epoch": 1.95734492951897, "grad_norm": 0.1657891422510147, "learning_rate": 1.5255831718525324e-06, "loss": 0.0077, "num_input_tokens_seen": 54005216, "step": 80120 }, { "epoch": 1.9574670803508172, "grad_norm": 0.2397499829530716, "learning_rate": 1.5255106206870073e-06, "loss": 0.1188, "num_input_tokens_seen": 54010592, "step": 80125 }, { "epoch": 1.9575892311826644, "grad_norm": 0.7385396957397461, "learning_rate": 1.525438065699827e-06, "loss": 0.0665, "num_input_tokens_seen": 54013728, "step": 80130 }, { "epoch": 1.9577113820145116, "grad_norm": 0.024819033220410347, "learning_rate": 1.52536550689152e-06, "loss": 0.0343, "num_input_tokens_seen": 54016864, "step": 80135 }, { "epoch": 1.9578335328463585, "grad_norm": 97.39778137207031, "learning_rate": 1.525292944262614e-06, "loss": 0.0164, "num_input_tokens_seen": 54020384, "step": 80140 }, { "epoch": 1.9579556836782057, "grad_norm": 0.9959608912467957, "learning_rate": 1.525220377813636e-06, "loss": 0.127, "num_input_tokens_seen": 54024096, "step": 80145 }, { "epoch": 1.958077834510053, "grad_norm": 0.8587321639060974, "learning_rate": 1.5251478075451145e-06, "loss": 0.1639, "num_input_tokens_seen": 54027232, "step": 80150 }, { "epoch": 1.9581999853419, "grad_norm": 31.005212783813477, "learning_rate": 1.525075233457577e-06, "loss": 0.1081, "num_input_tokens_seen": 54030368, "step": 80155 }, { "epoch": 1.9583221361737473, "grad_norm": 42.02476501464844, "learning_rate": 1.5250026555515508e-06, "loss": 0.0384, "num_input_tokens_seen": 54033760, "step": 80160 }, { "epoch": 1.9584442870055945, "grad_norm": 0.03223037347197533, "learning_rate": 1.5249300738275642e-06, "loss": 0.1221, "num_input_tokens_seen": 54036960, "step": 80165 }, { "epoch": 1.9585664378374417, "grad_norm": 0.610651969909668, "learning_rate": 1.5248574882861448e-06, "loss": 0.1412, "num_input_tokens_seen": 54040544, "step": 80170 }, { "epoch": 1.9586885886692889, "grad_norm": 0.4873940944671631, "learning_rate": 1.5247848989278209e-06, "loss": 0.0873, "num_input_tokens_seen": 54043936, "step": 80175 }, { "epoch": 1.958810739501136, "grad_norm": 0.03432140499353409, "learning_rate": 1.5247123057531197e-06, "loss": 0.1489, "num_input_tokens_seen": 54047008, "step": 80180 }, { "epoch": 1.9589328903329832, "grad_norm": 0.025498438626527786, "learning_rate": 1.5246397087625698e-06, "loss": 0.0386, "num_input_tokens_seen": 54050464, "step": 80185 }, { "epoch": 1.9590550411648304, "grad_norm": 48.72001266479492, "learning_rate": 1.5245671079566987e-06, "loss": 0.1232, "num_input_tokens_seen": 54053856, "step": 80190 }, { "epoch": 1.9591771919966776, "grad_norm": 1.187605857849121, "learning_rate": 1.5244945033360343e-06, "loss": 0.0367, "num_input_tokens_seen": 54056992, "step": 80195 }, { "epoch": 1.9592993428285248, "grad_norm": 0.10178008675575256, "learning_rate": 1.524421894901105e-06, "loss": 0.0035, "num_input_tokens_seen": 54060256, "step": 80200 }, { "epoch": 1.959421493660372, "grad_norm": 32.345829010009766, "learning_rate": 1.5243492826524388e-06, "loss": 0.0954, "num_input_tokens_seen": 54063584, "step": 80205 }, { "epoch": 1.959543644492219, "grad_norm": 34.569976806640625, "learning_rate": 1.5242766665905635e-06, "loss": 0.0685, "num_input_tokens_seen": 54067424, "step": 80210 }, { "epoch": 1.9596657953240662, "grad_norm": 0.30393078923225403, "learning_rate": 1.5242040467160071e-06, "loss": 0.1259, "num_input_tokens_seen": 54070560, "step": 80215 }, { "epoch": 1.9597879461559133, "grad_norm": 0.013163371942937374, "learning_rate": 1.524131423029298e-06, "loss": 0.0437, "num_input_tokens_seen": 54074016, "step": 80220 }, { "epoch": 1.9599100969877605, "grad_norm": 0.25483468174934387, "learning_rate": 1.5240587955309642e-06, "loss": 0.0531, "num_input_tokens_seen": 54077344, "step": 80225 }, { "epoch": 1.9600322478196075, "grad_norm": 33.77492141723633, "learning_rate": 1.5239861642215336e-06, "loss": 0.1066, "num_input_tokens_seen": 54080864, "step": 80230 }, { "epoch": 1.9601543986514547, "grad_norm": 23.985240936279297, "learning_rate": 1.5239135291015349e-06, "loss": 0.0723, "num_input_tokens_seen": 54084704, "step": 80235 }, { "epoch": 1.9602765494833019, "grad_norm": 0.08957021683454514, "learning_rate": 1.523840890171496e-06, "loss": 0.0152, "num_input_tokens_seen": 54088224, "step": 80240 }, { "epoch": 1.960398700315149, "grad_norm": 0.14128699898719788, "learning_rate": 1.5237682474319455e-06, "loss": 0.0367, "num_input_tokens_seen": 54091360, "step": 80245 }, { "epoch": 1.9605208511469963, "grad_norm": 0.04403325542807579, "learning_rate": 1.5236956008834114e-06, "loss": 0.0373, "num_input_tokens_seen": 54094624, "step": 80250 }, { "epoch": 1.9606430019788434, "grad_norm": 0.16336075961589813, "learning_rate": 1.523622950526422e-06, "loss": 0.0512, "num_input_tokens_seen": 54097696, "step": 80255 }, { "epoch": 1.9607651528106906, "grad_norm": 27.28483772277832, "learning_rate": 1.5235502963615054e-06, "loss": 0.0995, "num_input_tokens_seen": 54101280, "step": 80260 }, { "epoch": 1.9608873036425378, "grad_norm": 0.08058968931436539, "learning_rate": 1.5234776383891906e-06, "loss": 0.1105, "num_input_tokens_seen": 54104416, "step": 80265 }, { "epoch": 1.961009454474385, "grad_norm": 0.2745673358440399, "learning_rate": 1.5234049766100055e-06, "loss": 0.0724, "num_input_tokens_seen": 54107872, "step": 80270 }, { "epoch": 1.9611316053062322, "grad_norm": 13.895827293395996, "learning_rate": 1.5233323110244785e-06, "loss": 0.1278, "num_input_tokens_seen": 54111136, "step": 80275 }, { "epoch": 1.9612537561380794, "grad_norm": 25.77663230895996, "learning_rate": 1.523259641633138e-06, "loss": 0.0964, "num_input_tokens_seen": 54114656, "step": 80280 }, { "epoch": 1.9613759069699266, "grad_norm": 0.32250508666038513, "learning_rate": 1.523186968436513e-06, "loss": 0.051, "num_input_tokens_seen": 54117856, "step": 80285 }, { "epoch": 1.9614980578017738, "grad_norm": 0.6567564606666565, "learning_rate": 1.5231142914351316e-06, "loss": 0.0274, "num_input_tokens_seen": 54121056, "step": 80290 }, { "epoch": 1.9616202086336207, "grad_norm": 0.4896738827228546, "learning_rate": 1.5230416106295221e-06, "loss": 0.0569, "num_input_tokens_seen": 54124640, "step": 80295 }, { "epoch": 1.961742359465468, "grad_norm": 131.39283752441406, "learning_rate": 1.5229689260202134e-06, "loss": 0.0749, "num_input_tokens_seen": 54127584, "step": 80300 }, { "epoch": 1.9618645102973151, "grad_norm": 0.7090680003166199, "learning_rate": 1.5228962376077344e-06, "loss": 0.0353, "num_input_tokens_seen": 54130528, "step": 80305 }, { "epoch": 1.9619866611291623, "grad_norm": 11.709858894348145, "learning_rate": 1.5228235453926131e-06, "loss": 0.1514, "num_input_tokens_seen": 54133600, "step": 80310 }, { "epoch": 1.9621088119610095, "grad_norm": 0.08688928186893463, "learning_rate": 1.5227508493753783e-06, "loss": 0.0796, "num_input_tokens_seen": 54136608, "step": 80315 }, { "epoch": 1.9622309627928565, "grad_norm": 0.44097834825515747, "learning_rate": 1.5226781495565588e-06, "loss": 0.1009, "num_input_tokens_seen": 54139936, "step": 80320 }, { "epoch": 1.9623531136247037, "grad_norm": 3.780630111694336, "learning_rate": 1.5226054459366831e-06, "loss": 0.0519, "num_input_tokens_seen": 54143136, "step": 80325 }, { "epoch": 1.9624752644565508, "grad_norm": 1.3856637477874756, "learning_rate": 1.5225327385162801e-06, "loss": 0.0286, "num_input_tokens_seen": 54146528, "step": 80330 }, { "epoch": 1.962597415288398, "grad_norm": 0.13977724313735962, "learning_rate": 1.5224600272958785e-06, "loss": 0.2561, "num_input_tokens_seen": 54149664, "step": 80335 }, { "epoch": 1.9627195661202452, "grad_norm": 0.2395777404308319, "learning_rate": 1.522387312276007e-06, "loss": 0.0868, "num_input_tokens_seen": 54153184, "step": 80340 }, { "epoch": 1.9628417169520924, "grad_norm": 0.7118207812309265, "learning_rate": 1.5223145934571944e-06, "loss": 0.1178, "num_input_tokens_seen": 54156256, "step": 80345 }, { "epoch": 1.9629638677839396, "grad_norm": 195.8506317138672, "learning_rate": 1.5222418708399696e-06, "loss": 0.1157, "num_input_tokens_seen": 54159776, "step": 80350 }, { "epoch": 1.9630860186157868, "grad_norm": 197.17930603027344, "learning_rate": 1.5221691444248615e-06, "loss": 0.1211, "num_input_tokens_seen": 54162912, "step": 80355 }, { "epoch": 1.963208169447634, "grad_norm": 82.97769165039062, "learning_rate": 1.522096414212399e-06, "loss": 0.1338, "num_input_tokens_seen": 54166112, "step": 80360 }, { "epoch": 1.9633303202794812, "grad_norm": 0.5258122086524963, "learning_rate": 1.522023680203111e-06, "loss": 0.1737, "num_input_tokens_seen": 54169952, "step": 80365 }, { "epoch": 1.9634524711113284, "grad_norm": 0.33753493428230286, "learning_rate": 1.5219509423975262e-06, "loss": 0.0497, "num_input_tokens_seen": 54173216, "step": 80370 }, { "epoch": 1.9635746219431756, "grad_norm": 0.5717714428901672, "learning_rate": 1.5218782007961738e-06, "loss": 0.0026, "num_input_tokens_seen": 54176416, "step": 80375 }, { "epoch": 1.9636967727750227, "grad_norm": 38.53734588623047, "learning_rate": 1.5218054553995829e-06, "loss": 0.0518, "num_input_tokens_seen": 54179808, "step": 80380 }, { "epoch": 1.9638189236068697, "grad_norm": 16.9853515625, "learning_rate": 1.521732706208282e-06, "loss": 0.1372, "num_input_tokens_seen": 54183520, "step": 80385 }, { "epoch": 1.963941074438717, "grad_norm": 15.189033508300781, "learning_rate": 1.521659953222801e-06, "loss": 0.12, "num_input_tokens_seen": 54187232, "step": 80390 }, { "epoch": 1.964063225270564, "grad_norm": 0.04450573772192001, "learning_rate": 1.5215871964436683e-06, "loss": 0.021, "num_input_tokens_seen": 54190624, "step": 80395 }, { "epoch": 1.9641853761024113, "grad_norm": 0.09292330592870712, "learning_rate": 1.5215144358714134e-06, "loss": 0.0013, "num_input_tokens_seen": 54194720, "step": 80400 }, { "epoch": 1.9643075269342585, "grad_norm": 0.19563770294189453, "learning_rate": 1.521441671506565e-06, "loss": 0.0366, "num_input_tokens_seen": 54198240, "step": 80405 }, { "epoch": 1.9644296777661054, "grad_norm": 38.16703796386719, "learning_rate": 1.5213689033496526e-06, "loss": 0.1229, "num_input_tokens_seen": 54201376, "step": 80410 }, { "epoch": 1.9645518285979526, "grad_norm": 1.607812523841858, "learning_rate": 1.5212961314012054e-06, "loss": 0.0042, "num_input_tokens_seen": 54205024, "step": 80415 }, { "epoch": 1.9646739794297998, "grad_norm": 0.6975657343864441, "learning_rate": 1.5212233556617524e-06, "loss": 0.0394, "num_input_tokens_seen": 54208224, "step": 80420 }, { "epoch": 1.964796130261647, "grad_norm": 33.357261657714844, "learning_rate": 1.5211505761318231e-06, "loss": 0.0308, "num_input_tokens_seen": 54211424, "step": 80425 }, { "epoch": 1.9649182810934942, "grad_norm": 19.344261169433594, "learning_rate": 1.5210777928119466e-06, "loss": 0.0389, "num_input_tokens_seen": 54214496, "step": 80430 }, { "epoch": 1.9650404319253414, "grad_norm": 22.692310333251953, "learning_rate": 1.5210050057026521e-06, "loss": 0.0804, "num_input_tokens_seen": 54218336, "step": 80435 }, { "epoch": 1.9651625827571886, "grad_norm": 0.6270188689231873, "learning_rate": 1.520932214804469e-06, "loss": 0.1344, "num_input_tokens_seen": 54221984, "step": 80440 }, { "epoch": 1.9652847335890358, "grad_norm": 16.021167755126953, "learning_rate": 1.520859420117927e-06, "loss": 0.1511, "num_input_tokens_seen": 54225248, "step": 80445 }, { "epoch": 1.965406884420883, "grad_norm": 1.0109308958053589, "learning_rate": 1.520786621643555e-06, "loss": 0.2087, "num_input_tokens_seen": 54228512, "step": 80450 }, { "epoch": 1.9655290352527302, "grad_norm": 1.2305643558502197, "learning_rate": 1.5207138193818824e-06, "loss": 0.1028, "num_input_tokens_seen": 54231776, "step": 80455 }, { "epoch": 1.9656511860845773, "grad_norm": 0.3660091459751129, "learning_rate": 1.5206410133334393e-06, "loss": 0.0016, "num_input_tokens_seen": 54235360, "step": 80460 }, { "epoch": 1.9657733369164245, "grad_norm": 319.0777282714844, "learning_rate": 1.5205682034987547e-06, "loss": 0.0289, "num_input_tokens_seen": 54238496, "step": 80465 }, { "epoch": 1.9658954877482717, "grad_norm": 0.11898450553417206, "learning_rate": 1.520495389878358e-06, "loss": 0.0339, "num_input_tokens_seen": 54241568, "step": 80470 }, { "epoch": 1.9660176385801187, "grad_norm": 10.782960891723633, "learning_rate": 1.5204225724727789e-06, "loss": 0.1891, "num_input_tokens_seen": 54244960, "step": 80475 }, { "epoch": 1.9661397894119659, "grad_norm": 26.708084106445312, "learning_rate": 1.5203497512825465e-06, "loss": 0.0773, "num_input_tokens_seen": 54248480, "step": 80480 }, { "epoch": 1.966261940243813, "grad_norm": 2.7352306842803955, "learning_rate": 1.5202769263081908e-06, "loss": 0.0412, "num_input_tokens_seen": 54251680, "step": 80485 }, { "epoch": 1.9663840910756603, "grad_norm": 0.025932036340236664, "learning_rate": 1.5202040975502417e-06, "loss": 0.221, "num_input_tokens_seen": 54254816, "step": 80490 }, { "epoch": 1.9665062419075072, "grad_norm": 11.335835456848145, "learning_rate": 1.5201312650092283e-06, "loss": 0.1006, "num_input_tokens_seen": 54257952, "step": 80495 }, { "epoch": 1.9666283927393544, "grad_norm": 28.126676559448242, "learning_rate": 1.5200584286856808e-06, "loss": 0.1159, "num_input_tokens_seen": 54261024, "step": 80500 }, { "epoch": 1.9667505435712016, "grad_norm": 1.0174667835235596, "learning_rate": 1.519985588580128e-06, "loss": 0.0986, "num_input_tokens_seen": 54264416, "step": 80505 }, { "epoch": 1.9668726944030488, "grad_norm": 9.535443305969238, "learning_rate": 1.5199127446931e-06, "loss": 0.0933, "num_input_tokens_seen": 54267680, "step": 80510 }, { "epoch": 1.966994845234896, "grad_norm": 0.37511146068573, "learning_rate": 1.5198398970251273e-06, "loss": 0.0496, "num_input_tokens_seen": 54271392, "step": 80515 }, { "epoch": 1.9671169960667432, "grad_norm": 51.843143463134766, "learning_rate": 1.519767045576739e-06, "loss": 0.0501, "num_input_tokens_seen": 54274784, "step": 80520 }, { "epoch": 1.9672391468985904, "grad_norm": 19.047075271606445, "learning_rate": 1.5196941903484648e-06, "loss": 0.1235, "num_input_tokens_seen": 54278176, "step": 80525 }, { "epoch": 1.9673612977304376, "grad_norm": 0.09213992953300476, "learning_rate": 1.5196213313408346e-06, "loss": 0.0017, "num_input_tokens_seen": 54281568, "step": 80530 }, { "epoch": 1.9674834485622847, "grad_norm": 0.04056869074702263, "learning_rate": 1.5195484685543783e-06, "loss": 0.0486, "num_input_tokens_seen": 54284768, "step": 80535 }, { "epoch": 1.967605599394132, "grad_norm": 78.23375701904297, "learning_rate": 1.5194756019896256e-06, "loss": 0.054, "num_input_tokens_seen": 54288288, "step": 80540 }, { "epoch": 1.9677277502259791, "grad_norm": 17.906198501586914, "learning_rate": 1.5194027316471068e-06, "loss": 0.0774, "num_input_tokens_seen": 54291744, "step": 80545 }, { "epoch": 1.9678499010578263, "grad_norm": 5.588540554046631, "learning_rate": 1.5193298575273517e-06, "loss": 0.0913, "num_input_tokens_seen": 54294496, "step": 80550 }, { "epoch": 1.9679720518896735, "grad_norm": 0.35017406940460205, "learning_rate": 1.51925697963089e-06, "loss": 0.0978, "num_input_tokens_seen": 54298144, "step": 80555 }, { "epoch": 1.9680942027215207, "grad_norm": 0.2523149251937866, "learning_rate": 1.5191840979582522e-06, "loss": 0.0596, "num_input_tokens_seen": 54301024, "step": 80560 }, { "epoch": 1.9682163535533677, "grad_norm": 3.521493673324585, "learning_rate": 1.5191112125099678e-06, "loss": 0.0024, "num_input_tokens_seen": 54304288, "step": 80565 }, { "epoch": 1.9683385043852148, "grad_norm": 0.1956097036600113, "learning_rate": 1.519038323286567e-06, "loss": 0.0716, "num_input_tokens_seen": 54307488, "step": 80570 }, { "epoch": 1.968460655217062, "grad_norm": 0.2618773579597473, "learning_rate": 1.5189654302885798e-06, "loss": 0.0178, "num_input_tokens_seen": 54310752, "step": 80575 }, { "epoch": 1.9685828060489092, "grad_norm": 0.04372239485383034, "learning_rate": 1.5188925335165369e-06, "loss": 0.0965, "num_input_tokens_seen": 54314080, "step": 80580 }, { "epoch": 1.9687049568807562, "grad_norm": 0.05728540197014809, "learning_rate": 1.5188196329709675e-06, "loss": 0.0889, "num_input_tokens_seen": 54317344, "step": 80585 }, { "epoch": 1.9688271077126034, "grad_norm": 14.37979507446289, "learning_rate": 1.5187467286524022e-06, "loss": 0.0476, "num_input_tokens_seen": 54321696, "step": 80590 }, { "epoch": 1.9689492585444506, "grad_norm": 0.176511749625206, "learning_rate": 1.5186738205613714e-06, "loss": 0.0349, "num_input_tokens_seen": 54324896, "step": 80595 }, { "epoch": 1.9690714093762978, "grad_norm": 0.5647554397583008, "learning_rate": 1.5186009086984048e-06, "loss": 0.04, "num_input_tokens_seen": 54327904, "step": 80600 }, { "epoch": 1.969193560208145, "grad_norm": 0.11472149938344955, "learning_rate": 1.5185279930640329e-06, "loss": 0.1077, "num_input_tokens_seen": 54331616, "step": 80605 }, { "epoch": 1.9693157110399921, "grad_norm": 0.41294869780540466, "learning_rate": 1.518455073658786e-06, "loss": 0.0114, "num_input_tokens_seen": 54334624, "step": 80610 }, { "epoch": 1.9694378618718393, "grad_norm": 0.6599180102348328, "learning_rate": 1.5183821504831946e-06, "loss": 0.0017, "num_input_tokens_seen": 54338080, "step": 80615 }, { "epoch": 1.9695600127036865, "grad_norm": 91.60684967041016, "learning_rate": 1.5183092235377887e-06, "loss": 0.1975, "num_input_tokens_seen": 54341152, "step": 80620 }, { "epoch": 1.9696821635355337, "grad_norm": 13.102996826171875, "learning_rate": 1.5182362928230984e-06, "loss": 0.034, "num_input_tokens_seen": 54344608, "step": 80625 }, { "epoch": 1.969804314367381, "grad_norm": 0.4793389141559601, "learning_rate": 1.518163358339655e-06, "loss": 0.0687, "num_input_tokens_seen": 54347936, "step": 80630 }, { "epoch": 1.969926465199228, "grad_norm": 0.47380492091178894, "learning_rate": 1.5180904200879876e-06, "loss": 0.0011, "num_input_tokens_seen": 54351264, "step": 80635 }, { "epoch": 1.9700486160310753, "grad_norm": 19.74294090270996, "learning_rate": 1.5180174780686277e-06, "loss": 0.1065, "num_input_tokens_seen": 54354592, "step": 80640 }, { "epoch": 1.9701707668629225, "grad_norm": 8.016862869262695, "learning_rate": 1.5179445322821055e-06, "loss": 0.0454, "num_input_tokens_seen": 54357856, "step": 80645 }, { "epoch": 1.9702929176947697, "grad_norm": 0.6117857098579407, "learning_rate": 1.5178715827289508e-06, "loss": 0.0426, "num_input_tokens_seen": 54360928, "step": 80650 }, { "epoch": 1.9704150685266166, "grad_norm": 0.5771762728691101, "learning_rate": 1.5177986294096951e-06, "loss": 0.154, "num_input_tokens_seen": 54364128, "step": 80655 }, { "epoch": 1.9705372193584638, "grad_norm": 0.1816076785326004, "learning_rate": 1.5177256723248685e-06, "loss": 0.0668, "num_input_tokens_seen": 54367200, "step": 80660 }, { "epoch": 1.970659370190311, "grad_norm": 0.4547644853591919, "learning_rate": 1.5176527114750012e-06, "loss": 0.0838, "num_input_tokens_seen": 54370784, "step": 80665 }, { "epoch": 1.9707815210221582, "grad_norm": 0.1515176147222519, "learning_rate": 1.5175797468606243e-06, "loss": 0.1594, "num_input_tokens_seen": 54373920, "step": 80670 }, { "epoch": 1.9709036718540052, "grad_norm": 22.52135467529297, "learning_rate": 1.517506778482269e-06, "loss": 0.1475, "num_input_tokens_seen": 54377376, "step": 80675 }, { "epoch": 1.9710258226858524, "grad_norm": 14.372892379760742, "learning_rate": 1.5174338063404644e-06, "loss": 0.11, "num_input_tokens_seen": 54380768, "step": 80680 }, { "epoch": 1.9711479735176995, "grad_norm": 1.638512134552002, "learning_rate": 1.5173608304357422e-06, "loss": 0.0018, "num_input_tokens_seen": 54384160, "step": 80685 }, { "epoch": 1.9712701243495467, "grad_norm": 0.21261566877365112, "learning_rate": 1.5172878507686329e-06, "loss": 0.0318, "num_input_tokens_seen": 54387744, "step": 80690 }, { "epoch": 1.971392275181394, "grad_norm": 0.11088140308856964, "learning_rate": 1.5172148673396673e-06, "loss": 0.0724, "num_input_tokens_seen": 54390880, "step": 80695 }, { "epoch": 1.9715144260132411, "grad_norm": 0.2497844099998474, "learning_rate": 1.5171418801493757e-06, "loss": 0.1306, "num_input_tokens_seen": 54394272, "step": 80700 }, { "epoch": 1.9716365768450883, "grad_norm": 0.1071532815694809, "learning_rate": 1.5170688891982895e-06, "loss": 0.1369, "num_input_tokens_seen": 54397408, "step": 80705 }, { "epoch": 1.9717587276769355, "grad_norm": 0.11594808101654053, "learning_rate": 1.5169958944869393e-06, "loss": 0.0628, "num_input_tokens_seen": 54400800, "step": 80710 }, { "epoch": 1.9718808785087827, "grad_norm": 0.5123701691627502, "learning_rate": 1.5169228960158557e-06, "loss": 0.0422, "num_input_tokens_seen": 54403872, "step": 80715 }, { "epoch": 1.9720030293406299, "grad_norm": 0.08987320214509964, "learning_rate": 1.51684989378557e-06, "loss": 0.0017, "num_input_tokens_seen": 54407584, "step": 80720 }, { "epoch": 1.972125180172477, "grad_norm": 0.7225486040115356, "learning_rate": 1.5167768877966128e-06, "loss": 0.0369, "num_input_tokens_seen": 54410848, "step": 80725 }, { "epoch": 1.9722473310043243, "grad_norm": 0.14365102350711823, "learning_rate": 1.5167038780495151e-06, "loss": 0.0298, "num_input_tokens_seen": 54413984, "step": 80730 }, { "epoch": 1.9723694818361714, "grad_norm": 27.856651306152344, "learning_rate": 1.5166308645448077e-06, "loss": 0.0452, "num_input_tokens_seen": 54417504, "step": 80735 }, { "epoch": 1.9724916326680186, "grad_norm": 0.09453947842121124, "learning_rate": 1.516557847283022e-06, "loss": 0.0985, "num_input_tokens_seen": 54421024, "step": 80740 }, { "epoch": 1.9726137834998656, "grad_norm": 10.844173431396484, "learning_rate": 1.5164848262646883e-06, "loss": 0.0931, "num_input_tokens_seen": 54424480, "step": 80745 }, { "epoch": 1.9727359343317128, "grad_norm": 0.4086884558200836, "learning_rate": 1.5164118014903382e-06, "loss": 0.0469, "num_input_tokens_seen": 54427296, "step": 80750 }, { "epoch": 1.97285808516356, "grad_norm": 0.2042410671710968, "learning_rate": 1.5163387729605028e-06, "loss": 0.0497, "num_input_tokens_seen": 54430624, "step": 80755 }, { "epoch": 1.9729802359954072, "grad_norm": 0.1424875557422638, "learning_rate": 1.5162657406757125e-06, "loss": 0.0958, "num_input_tokens_seen": 54434080, "step": 80760 }, { "epoch": 1.9731023868272541, "grad_norm": 0.2018500119447708, "learning_rate": 1.516192704636499e-06, "loss": 0.1314, "num_input_tokens_seen": 54437024, "step": 80765 }, { "epoch": 1.9732245376591013, "grad_norm": 0.5857983231544495, "learning_rate": 1.5161196648433936e-06, "loss": 0.102, "num_input_tokens_seen": 54439840, "step": 80770 }, { "epoch": 1.9733466884909485, "grad_norm": 67.6739501953125, "learning_rate": 1.516046621296927e-06, "loss": 0.1239, "num_input_tokens_seen": 54442912, "step": 80775 }, { "epoch": 1.9734688393227957, "grad_norm": 26.2843074798584, "learning_rate": 1.5159735739976307e-06, "loss": 0.0033, "num_input_tokens_seen": 54445920, "step": 80780 }, { "epoch": 1.973590990154643, "grad_norm": 0.5349447131156921, "learning_rate": 1.515900522946036e-06, "loss": 0.0027, "num_input_tokens_seen": 54449184, "step": 80785 }, { "epoch": 1.97371314098649, "grad_norm": 0.2931225597858429, "learning_rate": 1.5158274681426732e-06, "loss": 0.078, "num_input_tokens_seen": 54452704, "step": 80790 }, { "epoch": 1.9738352918183373, "grad_norm": 0.07107269763946533, "learning_rate": 1.5157544095880747e-06, "loss": 0.0011, "num_input_tokens_seen": 54455904, "step": 80795 }, { "epoch": 1.9739574426501845, "grad_norm": 0.1544140875339508, "learning_rate": 1.5156813472827717e-06, "loss": 0.0354, "num_input_tokens_seen": 54459360, "step": 80800 }, { "epoch": 1.9740795934820317, "grad_norm": 0.10960984230041504, "learning_rate": 1.515608281227295e-06, "loss": 0.1044, "num_input_tokens_seen": 54462432, "step": 80805 }, { "epoch": 1.9742017443138788, "grad_norm": 202.6284942626953, "learning_rate": 1.515535211422176e-06, "loss": 0.0775, "num_input_tokens_seen": 54465568, "step": 80810 }, { "epoch": 1.974323895145726, "grad_norm": 18.3772029876709, "learning_rate": 1.5154621378679467e-06, "loss": 0.0375, "num_input_tokens_seen": 54469216, "step": 80815 }, { "epoch": 1.9744460459775732, "grad_norm": 0.4663357734680176, "learning_rate": 1.5153890605651377e-06, "loss": 0.0969, "num_input_tokens_seen": 54472416, "step": 80820 }, { "epoch": 1.9745681968094204, "grad_norm": 0.13823716342449188, "learning_rate": 1.5153159795142809e-06, "loss": 0.0172, "num_input_tokens_seen": 54476128, "step": 80825 }, { "epoch": 1.9746903476412674, "grad_norm": 0.3354507386684418, "learning_rate": 1.5152428947159077e-06, "loss": 0.1739, "num_input_tokens_seen": 54479328, "step": 80830 }, { "epoch": 1.9748124984731146, "grad_norm": 0.10311849415302277, "learning_rate": 1.5151698061705497e-06, "loss": 0.0405, "num_input_tokens_seen": 54482912, "step": 80835 }, { "epoch": 1.9749346493049618, "grad_norm": 0.3807520270347595, "learning_rate": 1.5150967138787384e-06, "loss": 0.045, "num_input_tokens_seen": 54486304, "step": 80840 }, { "epoch": 1.975056800136809, "grad_norm": 21.44227409362793, "learning_rate": 1.5150236178410052e-06, "loss": 0.0367, "num_input_tokens_seen": 54489760, "step": 80845 }, { "epoch": 1.9751789509686561, "grad_norm": 55.89031219482422, "learning_rate": 1.5149505180578818e-06, "loss": 0.1388, "num_input_tokens_seen": 54493024, "step": 80850 }, { "epoch": 1.9753011018005031, "grad_norm": 34.56232452392578, "learning_rate": 1.514877414529899e-06, "loss": 0.2005, "num_input_tokens_seen": 54496096, "step": 80855 }, { "epoch": 1.9754232526323503, "grad_norm": 21.254436492919922, "learning_rate": 1.5148043072575899e-06, "loss": 0.0668, "num_input_tokens_seen": 54499616, "step": 80860 }, { "epoch": 1.9755454034641975, "grad_norm": 0.16856719553470612, "learning_rate": 1.5147311962414852e-06, "loss": 0.0071, "num_input_tokens_seen": 54503264, "step": 80865 }, { "epoch": 1.9756675542960447, "grad_norm": 0.2091529816389084, "learning_rate": 1.514658081482117e-06, "loss": 0.0546, "num_input_tokens_seen": 54507232, "step": 80870 }, { "epoch": 1.9757897051278919, "grad_norm": 0.3104109764099121, "learning_rate": 1.5145849629800166e-06, "loss": 0.0021, "num_input_tokens_seen": 54510240, "step": 80875 }, { "epoch": 1.975911855959739, "grad_norm": 0.293990820646286, "learning_rate": 1.514511840735716e-06, "loss": 0.0716, "num_input_tokens_seen": 54513760, "step": 80880 }, { "epoch": 1.9760340067915863, "grad_norm": 0.12436817586421967, "learning_rate": 1.5144387147497469e-06, "loss": 0.0341, "num_input_tokens_seen": 54517280, "step": 80885 }, { "epoch": 1.9761561576234334, "grad_norm": 0.32219186425209045, "learning_rate": 1.514365585022641e-06, "loss": 0.1111, "num_input_tokens_seen": 54520736, "step": 80890 }, { "epoch": 1.9762783084552806, "grad_norm": 0.2317805290222168, "learning_rate": 1.5142924515549306e-06, "loss": 0.1001, "num_input_tokens_seen": 54524128, "step": 80895 }, { "epoch": 1.9764004592871278, "grad_norm": 11.194290161132812, "learning_rate": 1.5142193143471467e-06, "loss": 0.1185, "num_input_tokens_seen": 54527584, "step": 80900 }, { "epoch": 1.976522610118975, "grad_norm": 26.01723289489746, "learning_rate": 1.5141461733998217e-06, "loss": 0.1664, "num_input_tokens_seen": 54530464, "step": 80905 }, { "epoch": 1.9766447609508222, "grad_norm": 0.5856109857559204, "learning_rate": 1.5140730287134876e-06, "loss": 0.0421, "num_input_tokens_seen": 54533856, "step": 80910 }, { "epoch": 1.9767669117826694, "grad_norm": 0.4464196562767029, "learning_rate": 1.513999880288676e-06, "loss": 0.1165, "num_input_tokens_seen": 54537376, "step": 80915 }, { "epoch": 1.9768890626145164, "grad_norm": 0.8154142498970032, "learning_rate": 1.513926728125919e-06, "loss": 0.102, "num_input_tokens_seen": 54541088, "step": 80920 }, { "epoch": 1.9770112134463635, "grad_norm": 39.605228424072266, "learning_rate": 1.5138535722257488e-06, "loss": 0.1535, "num_input_tokens_seen": 54544544, "step": 80925 }, { "epoch": 1.9771333642782107, "grad_norm": 43.534088134765625, "learning_rate": 1.5137804125886973e-06, "loss": 0.1311, "num_input_tokens_seen": 54547872, "step": 80930 }, { "epoch": 1.977255515110058, "grad_norm": 0.26322290301322937, "learning_rate": 1.5137072492152962e-06, "loss": 0.2092, "num_input_tokens_seen": 54551200, "step": 80935 }, { "epoch": 1.9773776659419051, "grad_norm": 10.208868980407715, "learning_rate": 1.513634082106078e-06, "loss": 0.2748, "num_input_tokens_seen": 54554528, "step": 80940 }, { "epoch": 1.977499816773752, "grad_norm": 0.43643590807914734, "learning_rate": 1.5135609112615746e-06, "loss": 0.0636, "num_input_tokens_seen": 54557856, "step": 80945 }, { "epoch": 1.9776219676055993, "grad_norm": 20.197067260742188, "learning_rate": 1.5134877366823178e-06, "loss": 0.1343, "num_input_tokens_seen": 54561184, "step": 80950 }, { "epoch": 1.9777441184374465, "grad_norm": 6.695237159729004, "learning_rate": 1.5134145583688406e-06, "loss": 0.058, "num_input_tokens_seen": 54564320, "step": 80955 }, { "epoch": 1.9778662692692937, "grad_norm": 0.9582432508468628, "learning_rate": 1.5133413763216742e-06, "loss": 0.0711, "num_input_tokens_seen": 54567648, "step": 80960 }, { "epoch": 1.9779884201011408, "grad_norm": 105.59188842773438, "learning_rate": 1.5132681905413515e-06, "loss": 0.0481, "num_input_tokens_seen": 54570912, "step": 80965 }, { "epoch": 1.978110570932988, "grad_norm": 24.445819854736328, "learning_rate": 1.5131950010284043e-06, "loss": 0.1138, "num_input_tokens_seen": 54574112, "step": 80970 }, { "epoch": 1.9782327217648352, "grad_norm": 0.8526203632354736, "learning_rate": 1.513121807783365e-06, "loss": 0.0475, "num_input_tokens_seen": 54577888, "step": 80975 }, { "epoch": 1.9783548725966824, "grad_norm": 0.6153159141540527, "learning_rate": 1.513048610806766e-06, "loss": 0.0688, "num_input_tokens_seen": 54581600, "step": 80980 }, { "epoch": 1.9784770234285296, "grad_norm": 0.9376423954963684, "learning_rate": 1.5129754100991394e-06, "loss": 0.0358, "num_input_tokens_seen": 54584608, "step": 80985 }, { "epoch": 1.9785991742603768, "grad_norm": 0.151042178273201, "learning_rate": 1.512902205661018e-06, "loss": 0.0421, "num_input_tokens_seen": 54588000, "step": 80990 }, { "epoch": 1.978721325092224, "grad_norm": 19.37543487548828, "learning_rate": 1.5128289974929334e-06, "loss": 0.1467, "num_input_tokens_seen": 54591136, "step": 80995 }, { "epoch": 1.9788434759240712, "grad_norm": 0.05743106082081795, "learning_rate": 1.5127557855954186e-06, "loss": 0.0667, "num_input_tokens_seen": 54594400, "step": 81000 }, { "epoch": 1.9789656267559184, "grad_norm": 2.3568992614746094, "learning_rate": 1.5126825699690056e-06, "loss": 0.0786, "num_input_tokens_seen": 54598432, "step": 81005 }, { "epoch": 1.9790877775877653, "grad_norm": 122.1474380493164, "learning_rate": 1.512609350614227e-06, "loss": 0.1151, "num_input_tokens_seen": 54601696, "step": 81010 }, { "epoch": 1.9792099284196125, "grad_norm": 13.598340034484863, "learning_rate": 1.5125361275316157e-06, "loss": 0.1662, "num_input_tokens_seen": 54604960, "step": 81015 }, { "epoch": 1.9793320792514597, "grad_norm": 16.693645477294922, "learning_rate": 1.5124629007217036e-06, "loss": 0.069, "num_input_tokens_seen": 54608480, "step": 81020 }, { "epoch": 1.979454230083307, "grad_norm": 0.4208316206932068, "learning_rate": 1.5123896701850237e-06, "loss": 0.0018, "num_input_tokens_seen": 54611936, "step": 81025 }, { "epoch": 1.9795763809151539, "grad_norm": 55.367950439453125, "learning_rate": 1.512316435922108e-06, "loss": 0.145, "num_input_tokens_seen": 54614816, "step": 81030 }, { "epoch": 1.979698531747001, "grad_norm": 10.672937393188477, "learning_rate": 1.5122431979334894e-06, "loss": 0.0485, "num_input_tokens_seen": 54618016, "step": 81035 }, { "epoch": 1.9798206825788482, "grad_norm": 0.3374331593513489, "learning_rate": 1.5121699562197006e-06, "loss": 0.1454, "num_input_tokens_seen": 54621408, "step": 81040 }, { "epoch": 1.9799428334106954, "grad_norm": 0.8995051383972168, "learning_rate": 1.5120967107812738e-06, "loss": 0.0675, "num_input_tokens_seen": 54624800, "step": 81045 }, { "epoch": 1.9800649842425426, "grad_norm": 52.55927276611328, "learning_rate": 1.5120234616187423e-06, "loss": 0.0325, "num_input_tokens_seen": 54628128, "step": 81050 }, { "epoch": 1.9801871350743898, "grad_norm": 104.41256713867188, "learning_rate": 1.5119502087326387e-06, "loss": 0.0427, "num_input_tokens_seen": 54631456, "step": 81055 }, { "epoch": 1.980309285906237, "grad_norm": 0.05695815756917, "learning_rate": 1.511876952123495e-06, "loss": 0.1117, "num_input_tokens_seen": 54634592, "step": 81060 }, { "epoch": 1.9804314367380842, "grad_norm": 0.5008662939071655, "learning_rate": 1.511803691791845e-06, "loss": 0.0639, "num_input_tokens_seen": 54637792, "step": 81065 }, { "epoch": 1.9805535875699314, "grad_norm": 0.3222808241844177, "learning_rate": 1.5117304277382204e-06, "loss": 0.0815, "num_input_tokens_seen": 54641184, "step": 81070 }, { "epoch": 1.9806757384017786, "grad_norm": 0.11780696362257004, "learning_rate": 1.5116571599631544e-06, "loss": 0.0257, "num_input_tokens_seen": 54644640, "step": 81075 }, { "epoch": 1.9807978892336258, "grad_norm": 0.8247701525688171, "learning_rate": 1.51158388846718e-06, "loss": 0.0029, "num_input_tokens_seen": 54647648, "step": 81080 }, { "epoch": 1.980920040065473, "grad_norm": 21.25235366821289, "learning_rate": 1.5115106132508305e-06, "loss": 0.0702, "num_input_tokens_seen": 54650720, "step": 81085 }, { "epoch": 1.9810421908973201, "grad_norm": 0.10468591004610062, "learning_rate": 1.5114373343146375e-06, "loss": 0.0007, "num_input_tokens_seen": 54653984, "step": 81090 }, { "epoch": 1.9811643417291673, "grad_norm": 0.09131321310997009, "learning_rate": 1.5113640516591354e-06, "loss": 0.1593, "num_input_tokens_seen": 54657568, "step": 81095 }, { "epoch": 1.9812864925610143, "grad_norm": 1.060448169708252, "learning_rate": 1.5112907652848556e-06, "loss": 0.1267, "num_input_tokens_seen": 54660640, "step": 81100 }, { "epoch": 1.9814086433928615, "grad_norm": 0.17581862211227417, "learning_rate": 1.5112174751923324e-06, "loss": 0.1972, "num_input_tokens_seen": 54663648, "step": 81105 }, { "epoch": 1.9815307942247087, "grad_norm": 23.455965042114258, "learning_rate": 1.511144181382098e-06, "loss": 0.1394, "num_input_tokens_seen": 54666912, "step": 81110 }, { "epoch": 1.9816529450565559, "grad_norm": 0.5338903665542603, "learning_rate": 1.5110708838546856e-06, "loss": 0.0018, "num_input_tokens_seen": 54670048, "step": 81115 }, { "epoch": 1.9817750958884028, "grad_norm": 0.14137008786201477, "learning_rate": 1.5109975826106285e-06, "loss": 0.0385, "num_input_tokens_seen": 54673312, "step": 81120 }, { "epoch": 1.98189724672025, "grad_norm": 17.056808471679688, "learning_rate": 1.5109242776504591e-06, "loss": 0.012, "num_input_tokens_seen": 54676640, "step": 81125 }, { "epoch": 1.9820193975520972, "grad_norm": 0.04783693701028824, "learning_rate": 1.5108509689747115e-06, "loss": 0.0411, "num_input_tokens_seen": 54679968, "step": 81130 }, { "epoch": 1.9821415483839444, "grad_norm": 0.04254491254687309, "learning_rate": 1.5107776565839177e-06, "loss": 0.001, "num_input_tokens_seen": 54684064, "step": 81135 }, { "epoch": 1.9822636992157916, "grad_norm": 13.71863842010498, "learning_rate": 1.510704340478612e-06, "loss": 0.1699, "num_input_tokens_seen": 54687392, "step": 81140 }, { "epoch": 1.9823858500476388, "grad_norm": 9.231290817260742, "learning_rate": 1.5106310206593265e-06, "loss": 0.1538, "num_input_tokens_seen": 54690464, "step": 81145 }, { "epoch": 1.982508000879486, "grad_norm": 0.3563421964645386, "learning_rate": 1.510557697126595e-06, "loss": 0.2718, "num_input_tokens_seen": 54693984, "step": 81150 }, { "epoch": 1.9826301517113332, "grad_norm": 546.980712890625, "learning_rate": 1.5104843698809506e-06, "loss": 0.1438, "num_input_tokens_seen": 54697632, "step": 81155 }, { "epoch": 1.9827523025431804, "grad_norm": 0.5848055481910706, "learning_rate": 1.5104110389229265e-06, "loss": 0.0019, "num_input_tokens_seen": 54701344, "step": 81160 }, { "epoch": 1.9828744533750275, "grad_norm": 11.758065223693848, "learning_rate": 1.5103377042530561e-06, "loss": 0.1028, "num_input_tokens_seen": 54704544, "step": 81165 }, { "epoch": 1.9829966042068747, "grad_norm": 27.06719207763672, "learning_rate": 1.5102643658718726e-06, "loss": 0.0757, "num_input_tokens_seen": 54708704, "step": 81170 }, { "epoch": 1.983118755038722, "grad_norm": 0.3434150815010071, "learning_rate": 1.5101910237799093e-06, "loss": 0.0797, "num_input_tokens_seen": 54711776, "step": 81175 }, { "epoch": 1.9832409058705691, "grad_norm": 0.2370101511478424, "learning_rate": 1.5101176779776999e-06, "loss": 0.044, "num_input_tokens_seen": 54714784, "step": 81180 }, { "epoch": 1.9833630567024163, "grad_norm": 0.17519745230674744, "learning_rate": 1.5100443284657773e-06, "loss": 0.0014, "num_input_tokens_seen": 54718048, "step": 81185 }, { "epoch": 1.9834852075342633, "grad_norm": 0.06736115366220474, "learning_rate": 1.5099709752446754e-06, "loss": 0.1347, "num_input_tokens_seen": 54721184, "step": 81190 }, { "epoch": 1.9836073583661105, "grad_norm": 0.07850868999958038, "learning_rate": 1.5098976183149272e-06, "loss": 0.1472, "num_input_tokens_seen": 54724576, "step": 81195 }, { "epoch": 1.9837295091979577, "grad_norm": 0.20919404923915863, "learning_rate": 1.5098242576770666e-06, "loss": 0.1159, "num_input_tokens_seen": 54727648, "step": 81200 }, { "epoch": 1.9838516600298048, "grad_norm": 0.24704480171203613, "learning_rate": 1.5097508933316267e-06, "loss": 0.0484, "num_input_tokens_seen": 54731232, "step": 81205 }, { "epoch": 1.9839738108616518, "grad_norm": 8.519333839416504, "learning_rate": 1.5096775252791414e-06, "loss": 0.1048, "num_input_tokens_seen": 54735136, "step": 81210 }, { "epoch": 1.984095961693499, "grad_norm": 0.39622700214385986, "learning_rate": 1.5096041535201435e-06, "loss": 0.0012, "num_input_tokens_seen": 54738656, "step": 81215 }, { "epoch": 1.9842181125253462, "grad_norm": 3.677612066268921, "learning_rate": 1.5095307780551676e-06, "loss": 0.0957, "num_input_tokens_seen": 54741728, "step": 81220 }, { "epoch": 1.9843402633571934, "grad_norm": 0.1543489396572113, "learning_rate": 1.5094573988847468e-06, "loss": 0.0014, "num_input_tokens_seen": 54744864, "step": 81225 }, { "epoch": 1.9844624141890406, "grad_norm": 0.38194113969802856, "learning_rate": 1.5093840160094145e-06, "loss": 0.0008, "num_input_tokens_seen": 54748128, "step": 81230 }, { "epoch": 1.9845845650208878, "grad_norm": 103.0581283569336, "learning_rate": 1.509310629429705e-06, "loss": 0.0874, "num_input_tokens_seen": 54751712, "step": 81235 }, { "epoch": 1.984706715852735, "grad_norm": 74.20256042480469, "learning_rate": 1.5092372391461515e-06, "loss": 0.0342, "num_input_tokens_seen": 54754976, "step": 81240 }, { "epoch": 1.9848288666845821, "grad_norm": 0.07519308477640152, "learning_rate": 1.5091638451592878e-06, "loss": 0.0006, "num_input_tokens_seen": 54758176, "step": 81245 }, { "epoch": 1.9849510175164293, "grad_norm": 0.26383519172668457, "learning_rate": 1.5090904474696478e-06, "loss": 0.0261, "num_input_tokens_seen": 54761888, "step": 81250 }, { "epoch": 1.9850731683482765, "grad_norm": 0.06626437604427338, "learning_rate": 1.5090170460777647e-06, "loss": 0.0012, "num_input_tokens_seen": 54765408, "step": 81255 }, { "epoch": 1.9851953191801237, "grad_norm": 0.7158876061439514, "learning_rate": 1.508943640984173e-06, "loss": 0.0779, "num_input_tokens_seen": 54768928, "step": 81260 }, { "epoch": 1.985317470011971, "grad_norm": 30.467967987060547, "learning_rate": 1.5088702321894062e-06, "loss": 0.0576, "num_input_tokens_seen": 54772448, "step": 81265 }, { "epoch": 1.985439620843818, "grad_norm": 22.207685470581055, "learning_rate": 1.5087968196939985e-06, "loss": 0.1292, "num_input_tokens_seen": 54775648, "step": 81270 }, { "epoch": 1.9855617716756653, "grad_norm": 0.11838986724615097, "learning_rate": 1.5087234034984833e-06, "loss": 0.0034, "num_input_tokens_seen": 54778976, "step": 81275 }, { "epoch": 1.9856839225075122, "grad_norm": 14.396102905273438, "learning_rate": 1.5086499836033945e-06, "loss": 0.1485, "num_input_tokens_seen": 54782048, "step": 81280 }, { "epoch": 1.9858060733393594, "grad_norm": 0.539726972579956, "learning_rate": 1.5085765600092663e-06, "loss": 0.0958, "num_input_tokens_seen": 54785568, "step": 81285 }, { "epoch": 1.9859282241712066, "grad_norm": 18.099950790405273, "learning_rate": 1.5085031327166324e-06, "loss": 0.0731, "num_input_tokens_seen": 54788896, "step": 81290 }, { "epoch": 1.9860503750030538, "grad_norm": 0.7998467683792114, "learning_rate": 1.5084297017260274e-06, "loss": 0.092, "num_input_tokens_seen": 54792224, "step": 81295 }, { "epoch": 1.9861725258349008, "grad_norm": 20.516544342041016, "learning_rate": 1.5083562670379847e-06, "loss": 0.1245, "num_input_tokens_seen": 54795552, "step": 81300 }, { "epoch": 1.986294676666748, "grad_norm": 0.3030303716659546, "learning_rate": 1.5082828286530385e-06, "loss": 0.18, "num_input_tokens_seen": 54799392, "step": 81305 }, { "epoch": 1.9864168274985952, "grad_norm": 10.065673828125, "learning_rate": 1.5082093865717226e-06, "loss": 0.0018, "num_input_tokens_seen": 54802720, "step": 81310 }, { "epoch": 1.9865389783304424, "grad_norm": 16.148216247558594, "learning_rate": 1.5081359407945717e-06, "loss": 0.0463, "num_input_tokens_seen": 54805984, "step": 81315 }, { "epoch": 1.9866611291622895, "grad_norm": 37.96180725097656, "learning_rate": 1.5080624913221192e-06, "loss": 0.1515, "num_input_tokens_seen": 54809376, "step": 81320 }, { "epoch": 1.9867832799941367, "grad_norm": 0.017173312604427338, "learning_rate": 1.5079890381549e-06, "loss": 0.0629, "num_input_tokens_seen": 54812256, "step": 81325 }, { "epoch": 1.986905430825984, "grad_norm": 21.360572814941406, "learning_rate": 1.5079155812934474e-06, "loss": 0.1602, "num_input_tokens_seen": 54815776, "step": 81330 }, { "epoch": 1.987027581657831, "grad_norm": 129.48104858398438, "learning_rate": 1.5078421207382963e-06, "loss": 0.0121, "num_input_tokens_seen": 54819552, "step": 81335 }, { "epoch": 1.9871497324896783, "grad_norm": 0.5234513878822327, "learning_rate": 1.5077686564899808e-06, "loss": 0.003, "num_input_tokens_seen": 54822624, "step": 81340 }, { "epoch": 1.9872718833215255, "grad_norm": 5.080391883850098, "learning_rate": 1.507695188549035e-06, "loss": 0.0501, "num_input_tokens_seen": 54825888, "step": 81345 }, { "epoch": 1.9873940341533727, "grad_norm": 22.208263397216797, "learning_rate": 1.5076217169159933e-06, "loss": 0.1933, "num_input_tokens_seen": 54829280, "step": 81350 }, { "epoch": 1.9875161849852199, "grad_norm": 0.19975244998931885, "learning_rate": 1.5075482415913899e-06, "loss": 0.1009, "num_input_tokens_seen": 54832416, "step": 81355 }, { "epoch": 1.987638335817067, "grad_norm": 0.25327712297439575, "learning_rate": 1.5074747625757591e-06, "loss": 0.0637, "num_input_tokens_seen": 54836000, "step": 81360 }, { "epoch": 1.987760486648914, "grad_norm": 27.042043685913086, "learning_rate": 1.5074012798696356e-06, "loss": 0.1048, "num_input_tokens_seen": 54839328, "step": 81365 }, { "epoch": 1.9878826374807612, "grad_norm": 23.07643699645996, "learning_rate": 1.5073277934735531e-06, "loss": 0.1378, "num_input_tokens_seen": 54842656, "step": 81370 }, { "epoch": 1.9880047883126084, "grad_norm": 0.5719969272613525, "learning_rate": 1.5072543033880466e-06, "loss": 0.1105, "num_input_tokens_seen": 54846176, "step": 81375 }, { "epoch": 1.9881269391444556, "grad_norm": 52.8463249206543, "learning_rate": 1.5071808096136503e-06, "loss": 0.0333, "num_input_tokens_seen": 54849952, "step": 81380 }, { "epoch": 1.9882490899763028, "grad_norm": 17.789306640625, "learning_rate": 1.507107312150899e-06, "loss": 0.084, "num_input_tokens_seen": 54852768, "step": 81385 }, { "epoch": 1.9883712408081498, "grad_norm": 0.5003214478492737, "learning_rate": 1.5070338110003266e-06, "loss": 0.0249, "num_input_tokens_seen": 54855968, "step": 81390 }, { "epoch": 1.988493391639997, "grad_norm": 0.13848499953746796, "learning_rate": 1.5069603061624683e-06, "loss": 0.0502, "num_input_tokens_seen": 54859296, "step": 81395 }, { "epoch": 1.9886155424718441, "grad_norm": 0.16113311052322388, "learning_rate": 1.5068867976378582e-06, "loss": 0.0339, "num_input_tokens_seen": 54863136, "step": 81400 }, { "epoch": 1.9887376933036913, "grad_norm": 0.14104273915290833, "learning_rate": 1.506813285427031e-06, "loss": 0.0677, "num_input_tokens_seen": 54866400, "step": 81405 }, { "epoch": 1.9888598441355385, "grad_norm": 60.22726058959961, "learning_rate": 1.5067397695305212e-06, "loss": 0.1462, "num_input_tokens_seen": 54869920, "step": 81410 }, { "epoch": 1.9889819949673857, "grad_norm": 1.4266036748886108, "learning_rate": 1.5066662499488634e-06, "loss": 0.0628, "num_input_tokens_seen": 54873248, "step": 81415 }, { "epoch": 1.989104145799233, "grad_norm": 19.480825424194336, "learning_rate": 1.5065927266825923e-06, "loss": 0.0587, "num_input_tokens_seen": 54877280, "step": 81420 }, { "epoch": 1.98922629663108, "grad_norm": 10.97437572479248, "learning_rate": 1.5065191997322426e-06, "loss": 0.0877, "num_input_tokens_seen": 54879968, "step": 81425 }, { "epoch": 1.9893484474629273, "grad_norm": 17.77425765991211, "learning_rate": 1.5064456690983493e-06, "loss": 0.041, "num_input_tokens_seen": 54883104, "step": 81430 }, { "epoch": 1.9894705982947745, "grad_norm": 0.09516488760709763, "learning_rate": 1.5063721347814468e-06, "loss": 0.0558, "num_input_tokens_seen": 54886624, "step": 81435 }, { "epoch": 1.9895927491266217, "grad_norm": 0.2274305522441864, "learning_rate": 1.50629859678207e-06, "loss": 0.0027, "num_input_tokens_seen": 54889952, "step": 81440 }, { "epoch": 1.9897148999584688, "grad_norm": 0.13979065418243408, "learning_rate": 1.5062250551007533e-06, "loss": 0.0344, "num_input_tokens_seen": 54893792, "step": 81445 }, { "epoch": 1.989837050790316, "grad_norm": 0.2326391339302063, "learning_rate": 1.5061515097380323e-06, "loss": 0.0756, "num_input_tokens_seen": 54897504, "step": 81450 }, { "epoch": 1.989959201622163, "grad_norm": 0.6787064671516418, "learning_rate": 1.5060779606944412e-06, "loss": 0.0261, "num_input_tokens_seen": 54900704, "step": 81455 }, { "epoch": 1.9900813524540102, "grad_norm": 3.1736743450164795, "learning_rate": 1.506004407970515e-06, "loss": 0.0433, "num_input_tokens_seen": 54904224, "step": 81460 }, { "epoch": 1.9902035032858574, "grad_norm": 0.01049636211246252, "learning_rate": 1.5059308515667888e-06, "loss": 0.1088, "num_input_tokens_seen": 54907360, "step": 81465 }, { "epoch": 1.9903256541177046, "grad_norm": 0.04513026773929596, "learning_rate": 1.5058572914837973e-06, "loss": 0.0366, "num_input_tokens_seen": 54911776, "step": 81470 }, { "epoch": 1.9904478049495518, "grad_norm": 33.00575637817383, "learning_rate": 1.505783727722075e-06, "loss": 0.0852, "num_input_tokens_seen": 54915168, "step": 81475 }, { "epoch": 1.9905699557813987, "grad_norm": 0.5446120500564575, "learning_rate": 1.505710160282158e-06, "loss": 0.0808, "num_input_tokens_seen": 54918240, "step": 81480 }, { "epoch": 1.990692106613246, "grad_norm": 9.04538631439209, "learning_rate": 1.5056365891645805e-06, "loss": 0.1603, "num_input_tokens_seen": 54921568, "step": 81485 }, { "epoch": 1.990814257445093, "grad_norm": 1.6430739164352417, "learning_rate": 1.5055630143698778e-06, "loss": 0.0425, "num_input_tokens_seen": 54924704, "step": 81490 }, { "epoch": 1.9909364082769403, "grad_norm": 114.92130279541016, "learning_rate": 1.505489435898585e-06, "loss": 0.0634, "num_input_tokens_seen": 54928096, "step": 81495 }, { "epoch": 1.9910585591087875, "grad_norm": 19.503250122070312, "learning_rate": 1.505415853751237e-06, "loss": 0.0872, "num_input_tokens_seen": 54931424, "step": 81500 }, { "epoch": 1.9911807099406347, "grad_norm": 67.42121887207031, "learning_rate": 1.5053422679283688e-06, "loss": 0.1399, "num_input_tokens_seen": 54934752, "step": 81505 }, { "epoch": 1.9913028607724819, "grad_norm": 21.0851993560791, "learning_rate": 1.5052686784305158e-06, "loss": 0.0519, "num_input_tokens_seen": 54938080, "step": 81510 }, { "epoch": 1.991425011604329, "grad_norm": 0.4135420024394989, "learning_rate": 1.505195085258213e-06, "loss": 0.039, "num_input_tokens_seen": 54941856, "step": 81515 }, { "epoch": 1.9915471624361762, "grad_norm": 27.66628074645996, "learning_rate": 1.5051214884119956e-06, "loss": 0.0728, "num_input_tokens_seen": 54945056, "step": 81520 }, { "epoch": 1.9916693132680234, "grad_norm": 0.38148805499076843, "learning_rate": 1.505047887892399e-06, "loss": 0.1029, "num_input_tokens_seen": 54948448, "step": 81525 }, { "epoch": 1.9917914640998706, "grad_norm": 0.16693571209907532, "learning_rate": 1.5049742836999584e-06, "loss": 0.0036, "num_input_tokens_seen": 54951840, "step": 81530 }, { "epoch": 1.9919136149317178, "grad_norm": 23.07744598388672, "learning_rate": 1.5049006758352088e-06, "loss": 0.2146, "num_input_tokens_seen": 54955040, "step": 81535 }, { "epoch": 1.992035765763565, "grad_norm": 61.181007385253906, "learning_rate": 1.5048270642986855e-06, "loss": 0.0869, "num_input_tokens_seen": 54958496, "step": 81540 }, { "epoch": 1.992157916595412, "grad_norm": 297.8876037597656, "learning_rate": 1.5047534490909243e-06, "loss": 0.0977, "num_input_tokens_seen": 54961760, "step": 81545 }, { "epoch": 1.9922800674272592, "grad_norm": 0.23931384086608887, "learning_rate": 1.5046798302124603e-06, "loss": 0.1243, "num_input_tokens_seen": 54965152, "step": 81550 }, { "epoch": 1.9924022182591064, "grad_norm": 0.10794756561517715, "learning_rate": 1.5046062076638288e-06, "loss": 0.0381, "num_input_tokens_seen": 54968736, "step": 81555 }, { "epoch": 1.9925243690909535, "grad_norm": 16.027257919311523, "learning_rate": 1.5045325814455657e-06, "loss": 0.1014, "num_input_tokens_seen": 54972256, "step": 81560 }, { "epoch": 1.9926465199228007, "grad_norm": 0.3013782799243927, "learning_rate": 1.5044589515582051e-06, "loss": 0.0302, "num_input_tokens_seen": 54975776, "step": 81565 }, { "epoch": 1.9927686707546477, "grad_norm": 0.038673609495162964, "learning_rate": 1.5043853180022837e-06, "loss": 0.0014, "num_input_tokens_seen": 54978656, "step": 81570 }, { "epoch": 1.9928908215864949, "grad_norm": 0.4944474995136261, "learning_rate": 1.5043116807783364e-06, "loss": 0.2286, "num_input_tokens_seen": 54982048, "step": 81575 }, { "epoch": 1.993012972418342, "grad_norm": 0.6024330854415894, "learning_rate": 1.5042380398868991e-06, "loss": 0.0634, "num_input_tokens_seen": 54985504, "step": 81580 }, { "epoch": 1.9931351232501893, "grad_norm": 0.741905152797699, "learning_rate": 1.5041643953285074e-06, "loss": 0.001, "num_input_tokens_seen": 54989152, "step": 81585 }, { "epoch": 1.9932572740820365, "grad_norm": 452.8699951171875, "learning_rate": 1.5040907471036962e-06, "loss": 0.0087, "num_input_tokens_seen": 54992672, "step": 81590 }, { "epoch": 1.9933794249138836, "grad_norm": 0.4961146116256714, "learning_rate": 1.5040170952130019e-06, "loss": 0.133, "num_input_tokens_seen": 54995936, "step": 81595 }, { "epoch": 1.9935015757457308, "grad_norm": 0.024277739226818085, "learning_rate": 1.5039434396569592e-06, "loss": 0.2358, "num_input_tokens_seen": 54999008, "step": 81600 }, { "epoch": 1.993623726577578, "grad_norm": 0.5896772742271423, "learning_rate": 1.5038697804361046e-06, "loss": 0.1468, "num_input_tokens_seen": 55002784, "step": 81605 }, { "epoch": 1.9937458774094252, "grad_norm": 10.540372848510742, "learning_rate": 1.5037961175509737e-06, "loss": 0.1263, "num_input_tokens_seen": 55005920, "step": 81610 }, { "epoch": 1.9938680282412724, "grad_norm": 0.5984711647033691, "learning_rate": 1.5037224510021016e-06, "loss": 0.0316, "num_input_tokens_seen": 55009504, "step": 81615 }, { "epoch": 1.9939901790731196, "grad_norm": 0.5763449668884277, "learning_rate": 1.5036487807900243e-06, "loss": 0.0791, "num_input_tokens_seen": 55012768, "step": 81620 }, { "epoch": 1.9941123299049668, "grad_norm": 23.172338485717773, "learning_rate": 1.5035751069152775e-06, "loss": 0.0762, "num_input_tokens_seen": 55016288, "step": 81625 }, { "epoch": 1.994234480736814, "grad_norm": 24.80535888671875, "learning_rate": 1.5035014293783972e-06, "loss": 0.1994, "num_input_tokens_seen": 55019296, "step": 81630 }, { "epoch": 1.994356631568661, "grad_norm": 0.14634768664836884, "learning_rate": 1.503427748179919e-06, "loss": 0.045, "num_input_tokens_seen": 55022176, "step": 81635 }, { "epoch": 1.9944787824005081, "grad_norm": 30.956127166748047, "learning_rate": 1.503354063320379e-06, "loss": 0.1222, "num_input_tokens_seen": 55025184, "step": 81640 }, { "epoch": 1.9946009332323553, "grad_norm": 151.15733337402344, "learning_rate": 1.503280374800313e-06, "loss": 0.1907, "num_input_tokens_seen": 55028768, "step": 81645 }, { "epoch": 1.9947230840642025, "grad_norm": 0.30794233083724976, "learning_rate": 1.5032066826202563e-06, "loss": 0.0371, "num_input_tokens_seen": 55032608, "step": 81650 }, { "epoch": 1.9948452348960495, "grad_norm": 0.3560013473033905, "learning_rate": 1.5031329867807457e-06, "loss": 0.0012, "num_input_tokens_seen": 55036064, "step": 81655 }, { "epoch": 1.9949673857278967, "grad_norm": 41.600990295410156, "learning_rate": 1.5030592872823164e-06, "loss": 0.1023, "num_input_tokens_seen": 55039072, "step": 81660 }, { "epoch": 1.9950895365597439, "grad_norm": 2.1408729553222656, "learning_rate": 1.5029855841255047e-06, "loss": 0.092, "num_input_tokens_seen": 55042272, "step": 81665 }, { "epoch": 1.995211687391591, "grad_norm": 54.596038818359375, "learning_rate": 1.5029118773108467e-06, "loss": 0.1157, "num_input_tokens_seen": 55045216, "step": 81670 }, { "epoch": 1.9953338382234382, "grad_norm": 16.272363662719727, "learning_rate": 1.5028381668388783e-06, "loss": 0.0362, "num_input_tokens_seen": 55048608, "step": 81675 }, { "epoch": 1.9954559890552854, "grad_norm": 0.11136430501937866, "learning_rate": 1.5027644527101353e-06, "loss": 0.1336, "num_input_tokens_seen": 55051872, "step": 81680 }, { "epoch": 1.9955781398871326, "grad_norm": 17.665876388549805, "learning_rate": 1.5026907349251538e-06, "loss": 0.1057, "num_input_tokens_seen": 55055264, "step": 81685 }, { "epoch": 1.9957002907189798, "grad_norm": 0.5513327121734619, "learning_rate": 1.5026170134844705e-06, "loss": 0.1327, "num_input_tokens_seen": 55058336, "step": 81690 }, { "epoch": 1.995822441550827, "grad_norm": 139.2957000732422, "learning_rate": 1.5025432883886208e-06, "loss": 0.0102, "num_input_tokens_seen": 55061216, "step": 81695 }, { "epoch": 1.9959445923826742, "grad_norm": 13.870861053466797, "learning_rate": 1.502469559638141e-06, "loss": 0.1804, "num_input_tokens_seen": 55064800, "step": 81700 }, { "epoch": 1.9960667432145214, "grad_norm": 108.46243286132812, "learning_rate": 1.5023958272335677e-06, "loss": 0.0232, "num_input_tokens_seen": 55068064, "step": 81705 }, { "epoch": 1.9961888940463686, "grad_norm": 32.1049919128418, "learning_rate": 1.5023220911754368e-06, "loss": 0.1199, "num_input_tokens_seen": 55071840, "step": 81710 }, { "epoch": 1.9963110448782158, "grad_norm": 1.4670863151550293, "learning_rate": 1.502248351464285e-06, "loss": 0.102, "num_input_tokens_seen": 55075232, "step": 81715 }, { "epoch": 1.996433195710063, "grad_norm": 0.029370633885264397, "learning_rate": 1.5021746081006474e-06, "loss": 0.0461, "num_input_tokens_seen": 55078368, "step": 81720 }, { "epoch": 1.99655534654191, "grad_norm": 0.2155640423297882, "learning_rate": 1.502100861085061e-06, "loss": 0.0729, "num_input_tokens_seen": 55082080, "step": 81725 }, { "epoch": 1.996677497373757, "grad_norm": 12.511549949645996, "learning_rate": 1.5020271104180623e-06, "loss": 0.1241, "num_input_tokens_seen": 55085024, "step": 81730 }, { "epoch": 1.9967996482056043, "grad_norm": 8.582071304321289, "learning_rate": 1.5019533561001875e-06, "loss": 0.1156, "num_input_tokens_seen": 55088544, "step": 81735 }, { "epoch": 1.9969217990374515, "grad_norm": 57.26712417602539, "learning_rate": 1.5018795981319727e-06, "loss": 0.1221, "num_input_tokens_seen": 55091936, "step": 81740 }, { "epoch": 1.9970439498692985, "grad_norm": 1.2923898696899414, "learning_rate": 1.5018058365139546e-06, "loss": 0.0466, "num_input_tokens_seen": 55095328, "step": 81745 }, { "epoch": 1.9971661007011456, "grad_norm": 12.235097885131836, "learning_rate": 1.5017320712466695e-06, "loss": 0.1147, "num_input_tokens_seen": 55098656, "step": 81750 }, { "epoch": 1.9972882515329928, "grad_norm": 32.15383529663086, "learning_rate": 1.5016583023306538e-06, "loss": 0.0567, "num_input_tokens_seen": 55102048, "step": 81755 }, { "epoch": 1.99741040236484, "grad_norm": 0.5230231285095215, "learning_rate": 1.5015845297664437e-06, "loss": 0.0016, "num_input_tokens_seen": 55105120, "step": 81760 }, { "epoch": 1.9975325531966872, "grad_norm": 16.578676223754883, "learning_rate": 1.5015107535545765e-06, "loss": 0.0671, "num_input_tokens_seen": 55108192, "step": 81765 }, { "epoch": 1.9976547040285344, "grad_norm": 14.295267105102539, "learning_rate": 1.501436973695588e-06, "loss": 0.2056, "num_input_tokens_seen": 55111200, "step": 81770 }, { "epoch": 1.9977768548603816, "grad_norm": 10.518472671508789, "learning_rate": 1.5013631901900147e-06, "loss": 0.188, "num_input_tokens_seen": 55114400, "step": 81775 }, { "epoch": 1.9978990056922288, "grad_norm": 12.591024398803711, "learning_rate": 1.501289403038394e-06, "loss": 0.0903, "num_input_tokens_seen": 55117728, "step": 81780 }, { "epoch": 1.998021156524076, "grad_norm": 0.04502028971910477, "learning_rate": 1.5012156122412615e-06, "loss": 0.0297, "num_input_tokens_seen": 55121568, "step": 81785 }, { "epoch": 1.9981433073559232, "grad_norm": 1.9730950593948364, "learning_rate": 1.501141817799154e-06, "loss": 0.0873, "num_input_tokens_seen": 55124896, "step": 81790 }, { "epoch": 1.9982654581877703, "grad_norm": 292.1108093261719, "learning_rate": 1.5010680197126089e-06, "loss": 0.0424, "num_input_tokens_seen": 55127904, "step": 81795 }, { "epoch": 1.9983876090196175, "grad_norm": 10.42750358581543, "learning_rate": 1.5009942179821624e-06, "loss": 0.1556, "num_input_tokens_seen": 55130784, "step": 81800 }, { "epoch": 1.9985097598514647, "grad_norm": 15.448634147644043, "learning_rate": 1.5009204126083507e-06, "loss": 0.0675, "num_input_tokens_seen": 55133984, "step": 81805 }, { "epoch": 1.998631910683312, "grad_norm": 12.578239440917969, "learning_rate": 1.5008466035917117e-06, "loss": 0.0825, "num_input_tokens_seen": 55137376, "step": 81810 }, { "epoch": 1.9987540615151589, "grad_norm": 286.89434814453125, "learning_rate": 1.500772790932781e-06, "loss": 0.0692, "num_input_tokens_seen": 55140640, "step": 81815 }, { "epoch": 1.998876212347006, "grad_norm": 20.852294921875, "learning_rate": 1.5006989746320962e-06, "loss": 0.1433, "num_input_tokens_seen": 55144032, "step": 81820 }, { "epoch": 1.9989983631788533, "grad_norm": 0.14329631626605988, "learning_rate": 1.5006251546901936e-06, "loss": 0.0021, "num_input_tokens_seen": 55147680, "step": 81825 }, { "epoch": 1.9991205140107005, "grad_norm": 0.04360923171043396, "learning_rate": 1.5005513311076103e-06, "loss": 0.0312, "num_input_tokens_seen": 55151136, "step": 81830 }, { "epoch": 1.9992426648425474, "grad_norm": 7.934567451477051, "learning_rate": 1.500477503884883e-06, "loss": 0.0798, "num_input_tokens_seen": 55154080, "step": 81835 }, { "epoch": 1.9993648156743946, "grad_norm": 17.159452438354492, "learning_rate": 1.5004036730225486e-06, "loss": 0.0722, "num_input_tokens_seen": 55157152, "step": 81840 }, { "epoch": 1.9994869665062418, "grad_norm": 103.14313507080078, "learning_rate": 1.5003298385211443e-06, "loss": 0.0506, "num_input_tokens_seen": 55160864, "step": 81845 }, { "epoch": 1.999609117338089, "grad_norm": 1.8348734378814697, "learning_rate": 1.5002560003812064e-06, "loss": 0.0443, "num_input_tokens_seen": 55164448, "step": 81850 }, { "epoch": 1.9997312681699362, "grad_norm": 13.934343338012695, "learning_rate": 1.5001821586032729e-06, "loss": 0.1503, "num_input_tokens_seen": 55167904, "step": 81855 }, { "epoch": 1.9998534190017834, "grad_norm": 0.1302555799484253, "learning_rate": 1.50010831318788e-06, "loss": 0.0473, "num_input_tokens_seen": 55171040, "step": 81860 }, { "epoch": 1.9999755698336306, "grad_norm": 12.425285339355469, "learning_rate": 1.500034464135565e-06, "loss": 0.2202, "num_input_tokens_seen": 55174048, "step": 81865 }, { "epoch": 2.0000977206654778, "grad_norm": 0.4271923303604126, "learning_rate": 1.4999606114468647e-06, "loss": 0.0019, "num_input_tokens_seen": 55177384, "step": 81870 }, { "epoch": 2.0001465809982166, "eval_loss": 0.12374971061944962, "eval_runtime": 47.6455, "eval_samples_per_second": 763.661, "eval_steps_per_second": 95.476, "num_input_tokens_seen": 55178600, "step": 81872 }, { "epoch": 2.000219871497325, "grad_norm": 0.012282461859285831, "learning_rate": 1.4998867551223164e-06, "loss": 0.0235, "num_input_tokens_seen": 55180584, "step": 81875 }, { "epoch": 2.000342022329172, "grad_norm": 0.15548351407051086, "learning_rate": 1.4998128951624572e-06, "loss": 0.0016, "num_input_tokens_seen": 55183592, "step": 81880 }, { "epoch": 2.0004641731610193, "grad_norm": 0.16244475543498993, "learning_rate": 1.4997390315678242e-06, "loss": 0.0287, "num_input_tokens_seen": 55187368, "step": 81885 }, { "epoch": 2.0005863239928665, "grad_norm": 0.03867373988032341, "learning_rate": 1.4996651643389545e-06, "loss": 0.0262, "num_input_tokens_seen": 55191016, "step": 81890 }, { "epoch": 2.0007084748247137, "grad_norm": 1.0324246883392334, "learning_rate": 1.4995912934763854e-06, "loss": 0.0651, "num_input_tokens_seen": 55194600, "step": 81895 }, { "epoch": 2.000830625656561, "grad_norm": 143.933837890625, "learning_rate": 1.4995174189806542e-06, "loss": 0.0267, "num_input_tokens_seen": 55197736, "step": 81900 }, { "epoch": 2.000952776488408, "grad_norm": 0.10085337609052658, "learning_rate": 1.4994435408522976e-06, "loss": 0.047, "num_input_tokens_seen": 55201320, "step": 81905 }, { "epoch": 2.0010749273202553, "grad_norm": 0.27604278922080994, "learning_rate": 1.4993696590918533e-06, "loss": 0.0019, "num_input_tokens_seen": 55204520, "step": 81910 }, { "epoch": 2.001197078152102, "grad_norm": 14.148911476135254, "learning_rate": 1.4992957736998589e-06, "loss": 0.0854, "num_input_tokens_seen": 55207656, "step": 81915 }, { "epoch": 2.001319228983949, "grad_norm": 0.190697580575943, "learning_rate": 1.4992218846768509e-06, "loss": 0.0359, "num_input_tokens_seen": 55210856, "step": 81920 }, { "epoch": 2.0014413798157964, "grad_norm": 0.6571767926216125, "learning_rate": 1.4991479920233673e-06, "loss": 0.0026, "num_input_tokens_seen": 55214312, "step": 81925 }, { "epoch": 2.0015635306476436, "grad_norm": 0.1770526021718979, "learning_rate": 1.4990740957399452e-06, "loss": 0.0017, "num_input_tokens_seen": 55217256, "step": 81930 }, { "epoch": 2.0016856814794908, "grad_norm": 0.2347087413072586, "learning_rate": 1.499000195827122e-06, "loss": 0.0011, "num_input_tokens_seen": 55220648, "step": 81935 }, { "epoch": 2.001807832311338, "grad_norm": 18.45768928527832, "learning_rate": 1.4989262922854353e-06, "loss": 0.0394, "num_input_tokens_seen": 55223848, "step": 81940 }, { "epoch": 2.001929983143185, "grad_norm": 0.006956683937460184, "learning_rate": 1.4988523851154221e-06, "loss": 0.0271, "num_input_tokens_seen": 55227048, "step": 81945 }, { "epoch": 2.0020521339750323, "grad_norm": 0.07157830893993378, "learning_rate": 1.4987784743176206e-06, "loss": 0.0005, "num_input_tokens_seen": 55230568, "step": 81950 }, { "epoch": 2.0021742848068795, "grad_norm": 0.054001644253730774, "learning_rate": 1.4987045598925678e-06, "loss": 0.1124, "num_input_tokens_seen": 55233448, "step": 81955 }, { "epoch": 2.0022964356387267, "grad_norm": 0.013696554116904736, "learning_rate": 1.4986306418408011e-06, "loss": 0.0502, "num_input_tokens_seen": 55236392, "step": 81960 }, { "epoch": 2.002418586470574, "grad_norm": 0.04159333184361458, "learning_rate": 1.4985567201628584e-06, "loss": 0.0002, "num_input_tokens_seen": 55239592, "step": 81965 }, { "epoch": 2.002540737302421, "grad_norm": 0.21539679169654846, "learning_rate": 1.498482794859277e-06, "loss": 0.0293, "num_input_tokens_seen": 55242920, "step": 81970 }, { "epoch": 2.0026628881342683, "grad_norm": 0.0375695638358593, "learning_rate": 1.4984088659305949e-06, "loss": 0.0001, "num_input_tokens_seen": 55246312, "step": 81975 }, { "epoch": 2.0027850389661155, "grad_norm": 0.021181389689445496, "learning_rate": 1.4983349333773493e-06, "loss": 0.0007, "num_input_tokens_seen": 55249320, "step": 81980 }, { "epoch": 2.0029071897979627, "grad_norm": 0.10235611349344254, "learning_rate": 1.4982609972000779e-06, "loss": 0.0017, "num_input_tokens_seen": 55252584, "step": 81985 }, { "epoch": 2.00302934062981, "grad_norm": 0.08041008561849594, "learning_rate": 1.4981870573993187e-06, "loss": 0.0005, "num_input_tokens_seen": 55256424, "step": 81990 }, { "epoch": 2.003151491461657, "grad_norm": 16.012470245361328, "learning_rate": 1.498113113975609e-06, "loss": 0.0694, "num_input_tokens_seen": 55260136, "step": 81995 }, { "epoch": 2.003273642293504, "grad_norm": 33.74158477783203, "learning_rate": 1.4980391669294872e-06, "loss": 0.0646, "num_input_tokens_seen": 55263464, "step": 82000 }, { "epoch": 2.003395793125351, "grad_norm": 0.030892247334122658, "learning_rate": 1.4979652162614902e-06, "loss": 0.0001, "num_input_tokens_seen": 55267048, "step": 82005 }, { "epoch": 2.003517943957198, "grad_norm": 26.975217819213867, "learning_rate": 1.4978912619721563e-06, "loss": 0.0811, "num_input_tokens_seen": 55270056, "step": 82010 }, { "epoch": 2.0036400947890454, "grad_norm": 0.011399905197322369, "learning_rate": 1.4978173040620233e-06, "loss": 0.0692, "num_input_tokens_seen": 55273384, "step": 82015 }, { "epoch": 2.0037622456208926, "grad_norm": 0.23399126529693604, "learning_rate": 1.497743342531629e-06, "loss": 0.0016, "num_input_tokens_seen": 55276136, "step": 82020 }, { "epoch": 2.0038843964527397, "grad_norm": 0.0349041149020195, "learning_rate": 1.4976693773815113e-06, "loss": 0.0005, "num_input_tokens_seen": 55280040, "step": 82025 }, { "epoch": 2.004006547284587, "grad_norm": 0.43788591027259827, "learning_rate": 1.497595408612208e-06, "loss": 0.0006, "num_input_tokens_seen": 55283240, "step": 82030 }, { "epoch": 2.004128698116434, "grad_norm": 0.2341459095478058, "learning_rate": 1.4975214362242567e-06, "loss": 0.0575, "num_input_tokens_seen": 55286760, "step": 82035 }, { "epoch": 2.0042508489482813, "grad_norm": 0.0617409273982048, "learning_rate": 1.4974474602181962e-06, "loss": 0.0749, "num_input_tokens_seen": 55290344, "step": 82040 }, { "epoch": 2.0043729997801285, "grad_norm": 15.370429992675781, "learning_rate": 1.4973734805945635e-06, "loss": 0.1332, "num_input_tokens_seen": 55293928, "step": 82045 }, { "epoch": 2.0044951506119757, "grad_norm": 0.027430595830082893, "learning_rate": 1.4972994973538976e-06, "loss": 0.0002, "num_input_tokens_seen": 55296936, "step": 82050 }, { "epoch": 2.004617301443823, "grad_norm": 0.2160617858171463, "learning_rate": 1.4972255104967355e-06, "loss": 0.0304, "num_input_tokens_seen": 55300456, "step": 82055 }, { "epoch": 2.00473945227567, "grad_norm": 0.0136873172596097, "learning_rate": 1.497151520023616e-06, "loss": 0.0007, "num_input_tokens_seen": 55303528, "step": 82060 }, { "epoch": 2.0048616031075173, "grad_norm": 0.04217163100838661, "learning_rate": 1.4970775259350767e-06, "loss": 0.0003, "num_input_tokens_seen": 55306600, "step": 82065 }, { "epoch": 2.0049837539393645, "grad_norm": 0.10878748446702957, "learning_rate": 1.4970035282316562e-06, "loss": 0.0004, "num_input_tokens_seen": 55309992, "step": 82070 }, { "epoch": 2.0051059047712116, "grad_norm": 0.9541506171226501, "learning_rate": 1.4969295269138924e-06, "loss": 0.0445, "num_input_tokens_seen": 55313576, "step": 82075 }, { "epoch": 2.005228055603059, "grad_norm": 0.03260614722967148, "learning_rate": 1.4968555219823233e-06, "loss": 0.0393, "num_input_tokens_seen": 55317096, "step": 82080 }, { "epoch": 2.005350206434906, "grad_norm": 0.04927986115217209, "learning_rate": 1.4967815134374872e-06, "loss": 0.0003, "num_input_tokens_seen": 55320808, "step": 82085 }, { "epoch": 2.0054723572667528, "grad_norm": 0.3475458323955536, "learning_rate": 1.4967075012799224e-06, "loss": 0.0002, "num_input_tokens_seen": 55323944, "step": 82090 }, { "epoch": 2.0055945080986, "grad_norm": 0.06237736716866493, "learning_rate": 1.4966334855101667e-06, "loss": 0.0001, "num_input_tokens_seen": 55327080, "step": 82095 }, { "epoch": 2.005716658930447, "grad_norm": 0.0665455311536789, "learning_rate": 1.496559466128759e-06, "loss": 0.0004, "num_input_tokens_seen": 55330664, "step": 82100 }, { "epoch": 2.0058388097622943, "grad_norm": 0.006148200482130051, "learning_rate": 1.4964854431362372e-06, "loss": 0.0006, "num_input_tokens_seen": 55333864, "step": 82105 }, { "epoch": 2.0059609605941415, "grad_norm": 0.07519830018281937, "learning_rate": 1.49641141653314e-06, "loss": 0.0004, "num_input_tokens_seen": 55336808, "step": 82110 }, { "epoch": 2.0060831114259887, "grad_norm": 17.028867721557617, "learning_rate": 1.4963373863200053e-06, "loss": 0.0759, "num_input_tokens_seen": 55340328, "step": 82115 }, { "epoch": 2.006205262257836, "grad_norm": 0.02648085355758667, "learning_rate": 1.4962633524973716e-06, "loss": 0.0007, "num_input_tokens_seen": 55343656, "step": 82120 }, { "epoch": 2.006327413089683, "grad_norm": 2.6933555603027344, "learning_rate": 1.4961893150657775e-06, "loss": 0.0006, "num_input_tokens_seen": 55347048, "step": 82125 }, { "epoch": 2.0064495639215303, "grad_norm": 21.312503814697266, "learning_rate": 1.496115274025761e-06, "loss": 0.0942, "num_input_tokens_seen": 55350248, "step": 82130 }, { "epoch": 2.0065717147533775, "grad_norm": 0.0015857619000598788, "learning_rate": 1.4960412293778609e-06, "loss": 0.0362, "num_input_tokens_seen": 55354024, "step": 82135 }, { "epoch": 2.0066938655852247, "grad_norm": 0.007982099428772926, "learning_rate": 1.4959671811226152e-06, "loss": 0.0268, "num_input_tokens_seen": 55357480, "step": 82140 }, { "epoch": 2.006816016417072, "grad_norm": 0.003060021670535207, "learning_rate": 1.4958931292605631e-06, "loss": 0.0377, "num_input_tokens_seen": 55361256, "step": 82145 }, { "epoch": 2.006938167248919, "grad_norm": 0.052214279770851135, "learning_rate": 1.495819073792243e-06, "loss": 0.049, "num_input_tokens_seen": 55364200, "step": 82150 }, { "epoch": 2.0070603180807662, "grad_norm": 0.572651207447052, "learning_rate": 1.4957450147181928e-06, "loss": 0.0635, "num_input_tokens_seen": 55368168, "step": 82155 }, { "epoch": 2.0071824689126134, "grad_norm": 0.002212380524724722, "learning_rate": 1.4956709520389517e-06, "loss": 0.0004, "num_input_tokens_seen": 55371688, "step": 82160 }, { "epoch": 2.0073046197444606, "grad_norm": 21.469194412231445, "learning_rate": 1.495596885755058e-06, "loss": 0.0643, "num_input_tokens_seen": 55375016, "step": 82165 }, { "epoch": 2.007426770576308, "grad_norm": 0.4995213449001312, "learning_rate": 1.4955228158670509e-06, "loss": 0.0005, "num_input_tokens_seen": 55380456, "step": 82170 }, { "epoch": 2.007548921408155, "grad_norm": 0.023713387548923492, "learning_rate": 1.4954487423754682e-06, "loss": 0.0001, "num_input_tokens_seen": 55383784, "step": 82175 }, { "epoch": 2.0076710722400017, "grad_norm": 0.020805804058909416, "learning_rate": 1.4953746652808492e-06, "loss": 0.0392, "num_input_tokens_seen": 55386856, "step": 82180 }, { "epoch": 2.007793223071849, "grad_norm": 0.3754969835281372, "learning_rate": 1.4953005845837322e-06, "loss": 0.0635, "num_input_tokens_seen": 55390568, "step": 82185 }, { "epoch": 2.007915373903696, "grad_norm": 0.010111669078469276, "learning_rate": 1.495226500284656e-06, "loss": 0.0001, "num_input_tokens_seen": 55393576, "step": 82190 }, { "epoch": 2.0080375247355433, "grad_norm": 0.008723114617168903, "learning_rate": 1.4951524123841598e-06, "loss": 0.0582, "num_input_tokens_seen": 55396968, "step": 82195 }, { "epoch": 2.0081596755673905, "grad_norm": 0.0035427026450634003, "learning_rate": 1.495078320882782e-06, "loss": 0.0018, "num_input_tokens_seen": 55400040, "step": 82200 }, { "epoch": 2.0082818263992377, "grad_norm": 0.23637795448303223, "learning_rate": 1.4950042257810616e-06, "loss": 0.0003, "num_input_tokens_seen": 55403432, "step": 82205 }, { "epoch": 2.008403977231085, "grad_norm": 0.1661052256822586, "learning_rate": 1.4949301270795372e-06, "loss": 0.0285, "num_input_tokens_seen": 55406632, "step": 82210 }, { "epoch": 2.008526128062932, "grad_norm": 0.16679638624191284, "learning_rate": 1.4948560247787477e-06, "loss": 0.0309, "num_input_tokens_seen": 55409832, "step": 82215 }, { "epoch": 2.0086482788947793, "grad_norm": 5.093140125274658, "learning_rate": 1.494781918879232e-06, "loss": 0.002, "num_input_tokens_seen": 55413224, "step": 82220 }, { "epoch": 2.0087704297266264, "grad_norm": 0.006286388263106346, "learning_rate": 1.4947078093815294e-06, "loss": 0.0609, "num_input_tokens_seen": 55416936, "step": 82225 }, { "epoch": 2.0088925805584736, "grad_norm": 7.307034969329834, "learning_rate": 1.4946336962861782e-06, "loss": 0.0009, "num_input_tokens_seen": 55419880, "step": 82230 }, { "epoch": 2.009014731390321, "grad_norm": 16.302734375, "learning_rate": 1.494559579593718e-06, "loss": 0.0786, "num_input_tokens_seen": 55423080, "step": 82235 }, { "epoch": 2.009136882222168, "grad_norm": 0.00019041105406358838, "learning_rate": 1.4944854593046876e-06, "loss": 0.0002, "num_input_tokens_seen": 55426280, "step": 82240 }, { "epoch": 2.009259033054015, "grad_norm": 12.382462501525879, "learning_rate": 1.4944113354196258e-06, "loss": 0.0316, "num_input_tokens_seen": 55429800, "step": 82245 }, { "epoch": 2.0093811838858624, "grad_norm": 0.1528623104095459, "learning_rate": 1.4943372079390718e-06, "loss": 0.0985, "num_input_tokens_seen": 55433320, "step": 82250 }, { "epoch": 2.0095033347177096, "grad_norm": 0.5868679285049438, "learning_rate": 1.4942630768635644e-06, "loss": 0.0003, "num_input_tokens_seen": 55437160, "step": 82255 }, { "epoch": 2.009625485549557, "grad_norm": 0.033141035586595535, "learning_rate": 1.4941889421936433e-06, "loss": 0.0002, "num_input_tokens_seen": 55441000, "step": 82260 }, { "epoch": 2.009747636381404, "grad_norm": 0.05835259333252907, "learning_rate": 1.4941148039298472e-06, "loss": 0.0276, "num_input_tokens_seen": 55443944, "step": 82265 }, { "epoch": 2.0098697872132507, "grad_norm": 0.14883288741111755, "learning_rate": 1.4940406620727154e-06, "loss": 0.0445, "num_input_tokens_seen": 55447272, "step": 82270 }, { "epoch": 2.009991938045098, "grad_norm": 28.439109802246094, "learning_rate": 1.493966516622787e-06, "loss": 0.1243, "num_input_tokens_seen": 55450920, "step": 82275 }, { "epoch": 2.010114088876945, "grad_norm": 0.02109772339463234, "learning_rate": 1.4938923675806012e-06, "loss": 0.0006, "num_input_tokens_seen": 55454440, "step": 82280 }, { "epoch": 2.0102362397087923, "grad_norm": 100.99060821533203, "learning_rate": 1.4938182149466974e-06, "loss": 0.0021, "num_input_tokens_seen": 55458152, "step": 82285 }, { "epoch": 2.0103583905406395, "grad_norm": 0.028115229681134224, "learning_rate": 1.4937440587216144e-06, "loss": 0.0004, "num_input_tokens_seen": 55461544, "step": 82290 }, { "epoch": 2.0104805413724867, "grad_norm": 0.009398140013217926, "learning_rate": 1.493669898905892e-06, "loss": 0.0002, "num_input_tokens_seen": 55465256, "step": 82295 }, { "epoch": 2.010602692204334, "grad_norm": 0.023891514167189598, "learning_rate": 1.4935957355000693e-06, "loss": 0.0696, "num_input_tokens_seen": 55469032, "step": 82300 }, { "epoch": 2.010724843036181, "grad_norm": 0.007383616175502539, "learning_rate": 1.4935215685046858e-06, "loss": 0.103, "num_input_tokens_seen": 55472424, "step": 82305 }, { "epoch": 2.0108469938680282, "grad_norm": 0.05158611014485359, "learning_rate": 1.4934473979202804e-06, "loss": 0.0419, "num_input_tokens_seen": 55475752, "step": 82310 }, { "epoch": 2.0109691446998754, "grad_norm": 0.03322657570242882, "learning_rate": 1.4933732237473928e-06, "loss": 0.0002, "num_input_tokens_seen": 55479912, "step": 82315 }, { "epoch": 2.0110912955317226, "grad_norm": 0.027250945568084717, "learning_rate": 1.4932990459865626e-06, "loss": 0.0835, "num_input_tokens_seen": 55483496, "step": 82320 }, { "epoch": 2.01121344636357, "grad_norm": 0.0014734260039404035, "learning_rate": 1.493224864638329e-06, "loss": 0.0489, "num_input_tokens_seen": 55486952, "step": 82325 }, { "epoch": 2.011335597195417, "grad_norm": 0.007549791131168604, "learning_rate": 1.4931506797032316e-06, "loss": 0.0007, "num_input_tokens_seen": 55490216, "step": 82330 }, { "epoch": 2.011457748027264, "grad_norm": 30.353553771972656, "learning_rate": 1.49307649118181e-06, "loss": 0.0442, "num_input_tokens_seen": 55493352, "step": 82335 }, { "epoch": 2.0115798988591114, "grad_norm": 21.21133804321289, "learning_rate": 1.4930022990746034e-06, "loss": 0.1068, "num_input_tokens_seen": 55496424, "step": 82340 }, { "epoch": 2.0117020496909586, "grad_norm": 0.3378548324108124, "learning_rate": 1.4929281033821513e-06, "loss": 0.0003, "num_input_tokens_seen": 55499816, "step": 82345 }, { "epoch": 2.0118242005228058, "grad_norm": 0.03759992495179176, "learning_rate": 1.4928539041049935e-06, "loss": 0.0016, "num_input_tokens_seen": 55503464, "step": 82350 }, { "epoch": 2.011946351354653, "grad_norm": 0.14137744903564453, "learning_rate": 1.4927797012436694e-06, "loss": 0.0003, "num_input_tokens_seen": 55507048, "step": 82355 }, { "epoch": 2.0120685021864997, "grad_norm": 0.012416354380548, "learning_rate": 1.492705494798719e-06, "loss": 0.0003, "num_input_tokens_seen": 55510440, "step": 82360 }, { "epoch": 2.012190653018347, "grad_norm": 0.033463358879089355, "learning_rate": 1.4926312847706817e-06, "loss": 0.0236, "num_input_tokens_seen": 55514024, "step": 82365 }, { "epoch": 2.012312803850194, "grad_norm": 0.016904111951589584, "learning_rate": 1.4925570711600972e-06, "loss": 0.0004, "num_input_tokens_seen": 55517224, "step": 82370 }, { "epoch": 2.0124349546820413, "grad_norm": 0.18639439344406128, "learning_rate": 1.492482853967505e-06, "loss": 0.0002, "num_input_tokens_seen": 55520808, "step": 82375 }, { "epoch": 2.0125571055138884, "grad_norm": 0.2579617202281952, "learning_rate": 1.4924086331934454e-06, "loss": 0.0009, "num_input_tokens_seen": 55523816, "step": 82380 }, { "epoch": 2.0126792563457356, "grad_norm": 1.6947616338729858, "learning_rate": 1.4923344088384576e-06, "loss": 0.0028, "num_input_tokens_seen": 55527208, "step": 82385 }, { "epoch": 2.012801407177583, "grad_norm": 0.005877126008272171, "learning_rate": 1.4922601809030814e-06, "loss": 0.0118, "num_input_tokens_seen": 55530408, "step": 82390 }, { "epoch": 2.01292355800943, "grad_norm": 0.07566092163324356, "learning_rate": 1.492185949387857e-06, "loss": 0.0002, "num_input_tokens_seen": 55533608, "step": 82395 }, { "epoch": 2.013045708841277, "grad_norm": 0.016146007925271988, "learning_rate": 1.492111714293324e-06, "loss": 0.0002, "num_input_tokens_seen": 55536744, "step": 82400 }, { "epoch": 2.0131678596731244, "grad_norm": 0.009761742316186428, "learning_rate": 1.492037475620022e-06, "loss": 0.0285, "num_input_tokens_seen": 55540392, "step": 82405 }, { "epoch": 2.0132900105049716, "grad_norm": 0.1334611028432846, "learning_rate": 1.4919632333684913e-06, "loss": 0.0002, "num_input_tokens_seen": 55543464, "step": 82410 }, { "epoch": 2.0134121613368188, "grad_norm": 0.005181447137147188, "learning_rate": 1.4918889875392716e-06, "loss": 0.041, "num_input_tokens_seen": 55546600, "step": 82415 }, { "epoch": 2.013534312168666, "grad_norm": 32.899227142333984, "learning_rate": 1.4918147381329028e-06, "loss": 0.092, "num_input_tokens_seen": 55549608, "step": 82420 }, { "epoch": 2.013656463000513, "grad_norm": 0.08666107058525085, "learning_rate": 1.491740485149925e-06, "loss": 0.0001, "num_input_tokens_seen": 55553128, "step": 82425 }, { "epoch": 2.0137786138323603, "grad_norm": 0.022960102185606956, "learning_rate": 1.491666228590878e-06, "loss": 0.0594, "num_input_tokens_seen": 55556328, "step": 82430 }, { "epoch": 2.0139007646642075, "grad_norm": 0.004995742812752724, "learning_rate": 1.4915919684563023e-06, "loss": 0.1018, "num_input_tokens_seen": 55559336, "step": 82435 }, { "epoch": 2.0140229154960547, "grad_norm": 0.036234088242053986, "learning_rate": 1.4915177047467374e-06, "loss": 0.0006, "num_input_tokens_seen": 55562792, "step": 82440 }, { "epoch": 2.014145066327902, "grad_norm": 0.0267754215747118, "learning_rate": 1.4914434374627237e-06, "loss": 0.0002, "num_input_tokens_seen": 55565736, "step": 82445 }, { "epoch": 2.0142672171597487, "grad_norm": 0.008292196318507195, "learning_rate": 1.491369166604801e-06, "loss": 0.0002, "num_input_tokens_seen": 55569704, "step": 82450 }, { "epoch": 2.014389367991596, "grad_norm": 0.29497432708740234, "learning_rate": 1.4912948921735093e-06, "loss": 0.0369, "num_input_tokens_seen": 55572712, "step": 82455 }, { "epoch": 2.014511518823443, "grad_norm": 0.017537608742713928, "learning_rate": 1.4912206141693893e-06, "loss": 0.0076, "num_input_tokens_seen": 55575976, "step": 82460 }, { "epoch": 2.0146336696552902, "grad_norm": 0.05061454698443413, "learning_rate": 1.491146332592981e-06, "loss": 0.0001, "num_input_tokens_seen": 55579624, "step": 82465 }, { "epoch": 2.0147558204871374, "grad_norm": 0.0060595618560910225, "learning_rate": 1.491072047444824e-06, "loss": 0.0561, "num_input_tokens_seen": 55582824, "step": 82470 }, { "epoch": 2.0148779713189846, "grad_norm": 0.016623610630631447, "learning_rate": 1.4909977587254595e-06, "loss": 0.0205, "num_input_tokens_seen": 55586472, "step": 82475 }, { "epoch": 2.015000122150832, "grad_norm": 0.011867456138134003, "learning_rate": 1.4909234664354266e-06, "loss": 0.0476, "num_input_tokens_seen": 55589288, "step": 82480 }, { "epoch": 2.015122272982679, "grad_norm": 0.5147221088409424, "learning_rate": 1.490849170575267e-06, "loss": 0.0005, "num_input_tokens_seen": 55592488, "step": 82485 }, { "epoch": 2.015244423814526, "grad_norm": 0.07372906059026718, "learning_rate": 1.4907748711455198e-06, "loss": 0.0001, "num_input_tokens_seen": 55596392, "step": 82490 }, { "epoch": 2.0153665746463734, "grad_norm": 17.638710021972656, "learning_rate": 1.4907005681467257e-06, "loss": 0.0414, "num_input_tokens_seen": 55599720, "step": 82495 }, { "epoch": 2.0154887254782206, "grad_norm": 0.024421432986855507, "learning_rate": 1.490626261579425e-06, "loss": 0.0001, "num_input_tokens_seen": 55602920, "step": 82500 }, { "epoch": 2.0156108763100677, "grad_norm": 0.9297134280204773, "learning_rate": 1.4905519514441585e-06, "loss": 0.0006, "num_input_tokens_seen": 55606248, "step": 82505 }, { "epoch": 2.015733027141915, "grad_norm": 0.0029619180131703615, "learning_rate": 1.490477637741466e-06, "loss": 0.0062, "num_input_tokens_seen": 55609384, "step": 82510 }, { "epoch": 2.015855177973762, "grad_norm": 0.04868128523230553, "learning_rate": 1.4904033204718881e-06, "loss": 0.087, "num_input_tokens_seen": 55612584, "step": 82515 }, { "epoch": 2.0159773288056093, "grad_norm": 0.0035741401370614767, "learning_rate": 1.4903289996359659e-06, "loss": 0.0778, "num_input_tokens_seen": 55615976, "step": 82520 }, { "epoch": 2.0160994796374565, "grad_norm": 0.02756413072347641, "learning_rate": 1.4902546752342389e-06, "loss": 0.0, "num_input_tokens_seen": 55618920, "step": 82525 }, { "epoch": 2.0162216304693037, "grad_norm": 0.002254326129332185, "learning_rate": 1.490180347267248e-06, "loss": 0.0633, "num_input_tokens_seen": 55621992, "step": 82530 }, { "epoch": 2.016343781301151, "grad_norm": 0.015056909061968327, "learning_rate": 1.4901060157355338e-06, "loss": 0.0003, "num_input_tokens_seen": 55625192, "step": 82535 }, { "epoch": 2.0164659321329976, "grad_norm": 0.0272979699075222, "learning_rate": 1.490031680639637e-06, "loss": 0.0003, "num_input_tokens_seen": 55628264, "step": 82540 }, { "epoch": 2.016588082964845, "grad_norm": 0.007410817313939333, "learning_rate": 1.4899573419800979e-06, "loss": 0.0002, "num_input_tokens_seen": 55631784, "step": 82545 }, { "epoch": 2.016710233796692, "grad_norm": 0.0029097087681293488, "learning_rate": 1.489882999757457e-06, "loss": 0.1244, "num_input_tokens_seen": 55635368, "step": 82550 }, { "epoch": 2.016832384628539, "grad_norm": 0.08003532141447067, "learning_rate": 1.4898086539722556e-06, "loss": 0.0248, "num_input_tokens_seen": 55638440, "step": 82555 }, { "epoch": 2.0169545354603864, "grad_norm": 0.37841618061065674, "learning_rate": 1.4897343046250337e-06, "loss": 0.0003, "num_input_tokens_seen": 55641704, "step": 82560 }, { "epoch": 2.0170766862922336, "grad_norm": 0.002581874141469598, "learning_rate": 1.489659951716332e-06, "loss": 0.0002, "num_input_tokens_seen": 55644648, "step": 82565 }, { "epoch": 2.0171988371240808, "grad_norm": 0.05569832772016525, "learning_rate": 1.4895855952466918e-06, "loss": 0.0004, "num_input_tokens_seen": 55647976, "step": 82570 }, { "epoch": 2.017320987955928, "grad_norm": 0.011651813052594662, "learning_rate": 1.4895112352166533e-06, "loss": 0.0796, "num_input_tokens_seen": 55651176, "step": 82575 }, { "epoch": 2.017443138787775, "grad_norm": 0.09822391718626022, "learning_rate": 1.4894368716267573e-06, "loss": 0.0703, "num_input_tokens_seen": 55654952, "step": 82580 }, { "epoch": 2.0175652896196223, "grad_norm": 0.003922508098185062, "learning_rate": 1.4893625044775451e-06, "loss": 0.0001, "num_input_tokens_seen": 55658344, "step": 82585 }, { "epoch": 2.0176874404514695, "grad_norm": 0.21082554757595062, "learning_rate": 1.4892881337695569e-06, "loss": 0.1, "num_input_tokens_seen": 55661864, "step": 82590 }, { "epoch": 2.0178095912833167, "grad_norm": 0.008030406199395657, "learning_rate": 1.4892137595033338e-06, "loss": 0.0391, "num_input_tokens_seen": 55665576, "step": 82595 }, { "epoch": 2.017931742115164, "grad_norm": 0.06815065443515778, "learning_rate": 1.4891393816794167e-06, "loss": 0.0122, "num_input_tokens_seen": 55668904, "step": 82600 }, { "epoch": 2.018053892947011, "grad_norm": 0.04512683302164078, "learning_rate": 1.4890650002983466e-06, "loss": 0.0012, "num_input_tokens_seen": 55672168, "step": 82605 }, { "epoch": 2.0181760437788583, "grad_norm": 0.002467342419549823, "learning_rate": 1.4889906153606639e-06, "loss": 0.0278, "num_input_tokens_seen": 55675624, "step": 82610 }, { "epoch": 2.0182981946107055, "grad_norm": 0.011767430230975151, "learning_rate": 1.4889162268669103e-06, "loss": 0.0216, "num_input_tokens_seen": 55678696, "step": 82615 }, { "epoch": 2.0184203454425527, "grad_norm": 16.49197769165039, "learning_rate": 1.4888418348176265e-06, "loss": 0.0693, "num_input_tokens_seen": 55681832, "step": 82620 }, { "epoch": 2.0185424962743994, "grad_norm": 0.0088810408487916, "learning_rate": 1.4887674392133528e-06, "loss": 0.0889, "num_input_tokens_seen": 55684648, "step": 82625 }, { "epoch": 2.0186646471062466, "grad_norm": 0.23101457953453064, "learning_rate": 1.488693040054631e-06, "loss": 0.0317, "num_input_tokens_seen": 55688552, "step": 82630 }, { "epoch": 2.018786797938094, "grad_norm": 0.008358001708984375, "learning_rate": 1.4886186373420022e-06, "loss": 0.0002, "num_input_tokens_seen": 55691688, "step": 82635 }, { "epoch": 2.018908948769941, "grad_norm": 16.869508743286133, "learning_rate": 1.4885442310760073e-06, "loss": 0.0431, "num_input_tokens_seen": 55695016, "step": 82640 }, { "epoch": 2.019031099601788, "grad_norm": 0.4266534745693207, "learning_rate": 1.4884698212571873e-06, "loss": 0.0524, "num_input_tokens_seen": 55698984, "step": 82645 }, { "epoch": 2.0191532504336354, "grad_norm": 76.40272521972656, "learning_rate": 1.4883954078860833e-06, "loss": 0.0188, "num_input_tokens_seen": 55702056, "step": 82650 }, { "epoch": 2.0192754012654826, "grad_norm": 0.3047426640987396, "learning_rate": 1.4883209909632365e-06, "loss": 0.0008, "num_input_tokens_seen": 55705320, "step": 82655 }, { "epoch": 2.0193975520973297, "grad_norm": 0.015569807961583138, "learning_rate": 1.488246570489188e-06, "loss": 0.0615, "num_input_tokens_seen": 55708776, "step": 82660 }, { "epoch": 2.019519702929177, "grad_norm": 0.3473021388053894, "learning_rate": 1.4881721464644792e-06, "loss": 0.0442, "num_input_tokens_seen": 55711976, "step": 82665 }, { "epoch": 2.019641853761024, "grad_norm": 207.76756286621094, "learning_rate": 1.4880977188896514e-06, "loss": 0.0361, "num_input_tokens_seen": 55715240, "step": 82670 }, { "epoch": 2.0197640045928713, "grad_norm": 0.05208823084831238, "learning_rate": 1.4880232877652454e-06, "loss": 0.0426, "num_input_tokens_seen": 55718696, "step": 82675 }, { "epoch": 2.0198861554247185, "grad_norm": 14.826360702514648, "learning_rate": 1.4879488530918032e-06, "loss": 0.0392, "num_input_tokens_seen": 55722408, "step": 82680 }, { "epoch": 2.0200083062565657, "grad_norm": 65.82763671875, "learning_rate": 1.4878744148698655e-06, "loss": 0.0549, "num_input_tokens_seen": 55725352, "step": 82685 }, { "epoch": 2.020130457088413, "grad_norm": 15.8949556350708, "learning_rate": 1.4877999730999738e-06, "loss": 0.1252, "num_input_tokens_seen": 55729320, "step": 82690 }, { "epoch": 2.02025260792026, "grad_norm": 31.089956283569336, "learning_rate": 1.4877255277826694e-06, "loss": 0.0441, "num_input_tokens_seen": 55732776, "step": 82695 }, { "epoch": 2.0203747587521073, "grad_norm": 0.20987153053283691, "learning_rate": 1.4876510789184939e-06, "loss": 0.1352, "num_input_tokens_seen": 55736104, "step": 82700 }, { "epoch": 2.0204969095839544, "grad_norm": 0.026835085824131966, "learning_rate": 1.4875766265079888e-06, "loss": 0.0021, "num_input_tokens_seen": 55739432, "step": 82705 }, { "epoch": 2.0206190604158016, "grad_norm": 0.049756210297346115, "learning_rate": 1.487502170551695e-06, "loss": 0.028, "num_input_tokens_seen": 55742568, "step": 82710 }, { "epoch": 2.0207412112476484, "grad_norm": 0.08421991765499115, "learning_rate": 1.4874277110501545e-06, "loss": 0.06, "num_input_tokens_seen": 55746344, "step": 82715 }, { "epoch": 2.0208633620794956, "grad_norm": 0.017324473708868027, "learning_rate": 1.4873532480039084e-06, "loss": 0.0002, "num_input_tokens_seen": 55749096, "step": 82720 }, { "epoch": 2.0209855129113428, "grad_norm": 0.3781276345252991, "learning_rate": 1.4872787814134983e-06, "loss": 0.0545, "num_input_tokens_seen": 55752680, "step": 82725 }, { "epoch": 2.02110766374319, "grad_norm": 0.02490142732858658, "learning_rate": 1.487204311279466e-06, "loss": 0.001, "num_input_tokens_seen": 55756712, "step": 82730 }, { "epoch": 2.021229814575037, "grad_norm": 0.017400478944182396, "learning_rate": 1.4871298376023531e-06, "loss": 0.1147, "num_input_tokens_seen": 55759976, "step": 82735 }, { "epoch": 2.0213519654068843, "grad_norm": 0.02002478949725628, "learning_rate": 1.4870553603827007e-06, "loss": 0.0001, "num_input_tokens_seen": 55763432, "step": 82740 }, { "epoch": 2.0214741162387315, "grad_norm": 0.09595032036304474, "learning_rate": 1.486980879621051e-06, "loss": 0.0474, "num_input_tokens_seen": 55766888, "step": 82745 }, { "epoch": 2.0215962670705787, "grad_norm": 0.18604975938796997, "learning_rate": 1.4869063953179452e-06, "loss": 0.0007, "num_input_tokens_seen": 55770280, "step": 82750 }, { "epoch": 2.021718417902426, "grad_norm": 0.14785557985305786, "learning_rate": 1.4868319074739252e-06, "loss": 0.0002, "num_input_tokens_seen": 55774056, "step": 82755 }, { "epoch": 2.021840568734273, "grad_norm": 0.016452603042125702, "learning_rate": 1.4867574160895327e-06, "loss": 0.0274, "num_input_tokens_seen": 55777320, "step": 82760 }, { "epoch": 2.0219627195661203, "grad_norm": 42.86761474609375, "learning_rate": 1.4866829211653092e-06, "loss": 0.0312, "num_input_tokens_seen": 55780904, "step": 82765 }, { "epoch": 2.0220848703979675, "grad_norm": 0.06126724183559418, "learning_rate": 1.4866084227017966e-06, "loss": 0.0282, "num_input_tokens_seen": 55783784, "step": 82770 }, { "epoch": 2.0222070212298147, "grad_norm": 45.65149688720703, "learning_rate": 1.4865339206995367e-06, "loss": 0.0368, "num_input_tokens_seen": 55787560, "step": 82775 }, { "epoch": 2.022329172061662, "grad_norm": 0.022739626467227936, "learning_rate": 1.486459415159071e-06, "loss": 0.0002, "num_input_tokens_seen": 55791336, "step": 82780 }, { "epoch": 2.022451322893509, "grad_norm": 0.04010211303830147, "learning_rate": 1.486384906080942e-06, "loss": 0.0192, "num_input_tokens_seen": 55796456, "step": 82785 }, { "epoch": 2.0225734737253562, "grad_norm": 0.02531035989522934, "learning_rate": 1.4863103934656908e-06, "loss": 0.0002, "num_input_tokens_seen": 55800744, "step": 82790 }, { "epoch": 2.0226956245572034, "grad_norm": 0.09000199288129807, "learning_rate": 1.4862358773138599e-06, "loss": 0.0401, "num_input_tokens_seen": 55804136, "step": 82795 }, { "epoch": 2.0228177753890506, "grad_norm": 0.01600159890949726, "learning_rate": 1.486161357625991e-06, "loss": 0.0001, "num_input_tokens_seen": 55807528, "step": 82800 }, { "epoch": 2.0229399262208974, "grad_norm": 0.04003525152802467, "learning_rate": 1.4860868344026258e-06, "loss": 0.0002, "num_input_tokens_seen": 55810920, "step": 82805 }, { "epoch": 2.0230620770527445, "grad_norm": 44.31636428833008, "learning_rate": 1.486012307644306e-06, "loss": 0.0823, "num_input_tokens_seen": 55814120, "step": 82810 }, { "epoch": 2.0231842278845917, "grad_norm": 0.14130207896232605, "learning_rate": 1.4859377773515745e-06, "loss": 0.0001, "num_input_tokens_seen": 55817448, "step": 82815 }, { "epoch": 2.023306378716439, "grad_norm": 0.048643555492162704, "learning_rate": 1.4858632435249728e-06, "loss": 0.0418, "num_input_tokens_seen": 55820712, "step": 82820 }, { "epoch": 2.023428529548286, "grad_norm": 0.0007211269694380462, "learning_rate": 1.4857887061650426e-06, "loss": 0.0001, "num_input_tokens_seen": 55824296, "step": 82825 }, { "epoch": 2.0235506803801333, "grad_norm": 0.0032669936772435904, "learning_rate": 1.4857141652723264e-06, "loss": 0.1306, "num_input_tokens_seen": 55827688, "step": 82830 }, { "epoch": 2.0236728312119805, "grad_norm": 0.05028359591960907, "learning_rate": 1.4856396208473662e-06, "loss": 0.0003, "num_input_tokens_seen": 55832168, "step": 82835 }, { "epoch": 2.0237949820438277, "grad_norm": 23.42269515991211, "learning_rate": 1.4855650728907038e-06, "loss": 0.0759, "num_input_tokens_seen": 55835624, "step": 82840 }, { "epoch": 2.023917132875675, "grad_norm": 0.004075738601386547, "learning_rate": 1.4854905214028817e-06, "loss": 0.0002, "num_input_tokens_seen": 55839080, "step": 82845 }, { "epoch": 2.024039283707522, "grad_norm": 0.13513143360614777, "learning_rate": 1.4854159663844423e-06, "loss": 0.0642, "num_input_tokens_seen": 55842216, "step": 82850 }, { "epoch": 2.0241614345393693, "grad_norm": 0.07616754621267319, "learning_rate": 1.4853414078359272e-06, "loss": 0.0002, "num_input_tokens_seen": 55845672, "step": 82855 }, { "epoch": 2.0242835853712164, "grad_norm": 0.0007707093027420342, "learning_rate": 1.485266845757879e-06, "loss": 0.0749, "num_input_tokens_seen": 55849704, "step": 82860 }, { "epoch": 2.0244057362030636, "grad_norm": 0.025444896891713142, "learning_rate": 1.4851922801508393e-06, "loss": 0.007, "num_input_tokens_seen": 55853288, "step": 82865 }, { "epoch": 2.024527887034911, "grad_norm": 19.272645950317383, "learning_rate": 1.4851177110153512e-06, "loss": 0.0792, "num_input_tokens_seen": 55856616, "step": 82870 }, { "epoch": 2.024650037866758, "grad_norm": 0.09565642476081848, "learning_rate": 1.4850431383519563e-06, "loss": 0.0004, "num_input_tokens_seen": 55860200, "step": 82875 }, { "epoch": 2.024772188698605, "grad_norm": 0.13231025636196136, "learning_rate": 1.4849685621611976e-06, "loss": 0.0001, "num_input_tokens_seen": 55863784, "step": 82880 }, { "epoch": 2.0248943395304524, "grad_norm": 0.023808278143405914, "learning_rate": 1.4848939824436171e-06, "loss": 0.0114, "num_input_tokens_seen": 55867048, "step": 82885 }, { "epoch": 2.0250164903622996, "grad_norm": 0.07147572934627533, "learning_rate": 1.4848193991997572e-06, "loss": 0.0002, "num_input_tokens_seen": 55870376, "step": 82890 }, { "epoch": 2.0251386411941463, "grad_norm": 0.002442893572151661, "learning_rate": 1.4847448124301598e-06, "loss": 0.0001, "num_input_tokens_seen": 55874088, "step": 82895 }, { "epoch": 2.0252607920259935, "grad_norm": 1.2758766412734985, "learning_rate": 1.484670222135368e-06, "loss": 0.0276, "num_input_tokens_seen": 55877224, "step": 82900 }, { "epoch": 2.0253829428578407, "grad_norm": 0.016900723800063133, "learning_rate": 1.484595628315924e-06, "loss": 0.0953, "num_input_tokens_seen": 55880808, "step": 82905 }, { "epoch": 2.025505093689688, "grad_norm": 0.021285761147737503, "learning_rate": 1.48452103097237e-06, "loss": 0.0002, "num_input_tokens_seen": 55884200, "step": 82910 }, { "epoch": 2.025627244521535, "grad_norm": 0.003971973434090614, "learning_rate": 1.4844464301052494e-06, "loss": 0.0919, "num_input_tokens_seen": 55887784, "step": 82915 }, { "epoch": 2.0257493953533823, "grad_norm": 0.038204390555620193, "learning_rate": 1.4843718257151034e-06, "loss": 0.0002, "num_input_tokens_seen": 55890920, "step": 82920 }, { "epoch": 2.0258715461852295, "grad_norm": 0.005248145200312138, "learning_rate": 1.4842972178024753e-06, "loss": 0.0001, "num_input_tokens_seen": 55894248, "step": 82925 }, { "epoch": 2.0259936970170767, "grad_norm": 0.012961753644049168, "learning_rate": 1.4842226063679077e-06, "loss": 0.0377, "num_input_tokens_seen": 55897192, "step": 82930 }, { "epoch": 2.026115847848924, "grad_norm": 0.08081217110157013, "learning_rate": 1.484147991411943e-06, "loss": 0.0003, "num_input_tokens_seen": 55900136, "step": 82935 }, { "epoch": 2.026237998680771, "grad_norm": 0.13495701551437378, "learning_rate": 1.484073372935124e-06, "loss": 0.0004, "num_input_tokens_seen": 55903912, "step": 82940 }, { "epoch": 2.0263601495126182, "grad_norm": 0.006603249814361334, "learning_rate": 1.4839987509379933e-06, "loss": 0.0002, "num_input_tokens_seen": 55906920, "step": 82945 }, { "epoch": 2.0264823003444654, "grad_norm": 33.194488525390625, "learning_rate": 1.4839241254210932e-06, "loss": 0.0399, "num_input_tokens_seen": 55910440, "step": 82950 }, { "epoch": 2.0266044511763126, "grad_norm": 2.8557236194610596, "learning_rate": 1.483849496384967e-06, "loss": 0.0004, "num_input_tokens_seen": 55913768, "step": 82955 }, { "epoch": 2.02672660200816, "grad_norm": 0.006654535885900259, "learning_rate": 1.483774863830157e-06, "loss": 0.0001, "num_input_tokens_seen": 55916712, "step": 82960 }, { "epoch": 2.026848752840007, "grad_norm": 2.445951223373413, "learning_rate": 1.483700227757206e-06, "loss": 0.0005, "num_input_tokens_seen": 55919848, "step": 82965 }, { "epoch": 2.026970903671854, "grad_norm": 0.012774002738296986, "learning_rate": 1.4836255881666568e-06, "loss": 0.159, "num_input_tokens_seen": 55923368, "step": 82970 }, { "epoch": 2.0270930545037014, "grad_norm": 0.0121014304459095, "learning_rate": 1.4835509450590525e-06, "loss": 0.0001, "num_input_tokens_seen": 55926376, "step": 82975 }, { "epoch": 2.0272152053355486, "grad_norm": 0.02520882338285446, "learning_rate": 1.4834762984349354e-06, "loss": 0.0001, "num_input_tokens_seen": 55930152, "step": 82980 }, { "epoch": 2.0273373561673953, "grad_norm": 0.008401707746088505, "learning_rate": 1.4834016482948489e-06, "loss": 0.0002, "num_input_tokens_seen": 55933288, "step": 82985 }, { "epoch": 2.0274595069992425, "grad_norm": 0.006767922546714544, "learning_rate": 1.4833269946393353e-06, "loss": 0.0002, "num_input_tokens_seen": 55937704, "step": 82990 }, { "epoch": 2.0275816578310897, "grad_norm": 0.025144563987851143, "learning_rate": 1.483252337468938e-06, "loss": 0.0213, "num_input_tokens_seen": 55940776, "step": 82995 }, { "epoch": 2.027703808662937, "grad_norm": 0.03302784636616707, "learning_rate": 1.4831776767841996e-06, "loss": 0.0004, "num_input_tokens_seen": 55943912, "step": 83000 }, { "epoch": 2.027825959494784, "grad_norm": 0.19085726141929626, "learning_rate": 1.4831030125856633e-06, "loss": 0.048, "num_input_tokens_seen": 55947368, "step": 83005 }, { "epoch": 2.0279481103266312, "grad_norm": 0.0025873398408293724, "learning_rate": 1.4830283448738718e-06, "loss": 0.0642, "num_input_tokens_seen": 55950824, "step": 83010 }, { "epoch": 2.0280702611584784, "grad_norm": 0.28894203901290894, "learning_rate": 1.4829536736493685e-06, "loss": 0.1304, "num_input_tokens_seen": 55954600, "step": 83015 }, { "epoch": 2.0281924119903256, "grad_norm": 0.006737336050719023, "learning_rate": 1.482878998912696e-06, "loss": 0.0001, "num_input_tokens_seen": 55958376, "step": 83020 }, { "epoch": 2.028314562822173, "grad_norm": 0.04847032576799393, "learning_rate": 1.4828043206643976e-06, "loss": 0.0407, "num_input_tokens_seen": 55961320, "step": 83025 }, { "epoch": 2.02843671365402, "grad_norm": 0.0007056460017338395, "learning_rate": 1.4827296389050161e-06, "loss": 0.0004, "num_input_tokens_seen": 55964648, "step": 83030 }, { "epoch": 2.028558864485867, "grad_norm": 0.14868846535682678, "learning_rate": 1.482654953635095e-06, "loss": 0.0479, "num_input_tokens_seen": 55967976, "step": 83035 }, { "epoch": 2.0286810153177144, "grad_norm": 0.004213292151689529, "learning_rate": 1.4825802648551774e-06, "loss": 0.0001, "num_input_tokens_seen": 55970920, "step": 83040 }, { "epoch": 2.0288031661495616, "grad_norm": 0.019025111570954323, "learning_rate": 1.482505572565806e-06, "loss": 0.0491, "num_input_tokens_seen": 55974184, "step": 83045 }, { "epoch": 2.0289253169814088, "grad_norm": 56.368988037109375, "learning_rate": 1.4824308767675247e-06, "loss": 0.1344, "num_input_tokens_seen": 55977640, "step": 83050 }, { "epoch": 2.029047467813256, "grad_norm": 0.04584551602602005, "learning_rate": 1.4823561774608759e-06, "loss": 0.0002, "num_input_tokens_seen": 55980904, "step": 83055 }, { "epoch": 2.029169618645103, "grad_norm": 0.1676466315984726, "learning_rate": 1.4822814746464034e-06, "loss": 0.034, "num_input_tokens_seen": 55984552, "step": 83060 }, { "epoch": 2.0292917694769503, "grad_norm": 0.10654141008853912, "learning_rate": 1.4822067683246503e-06, "loss": 0.0361, "num_input_tokens_seen": 55987816, "step": 83065 }, { "epoch": 2.029413920308797, "grad_norm": 0.03309716284275055, "learning_rate": 1.4821320584961601e-06, "loss": 0.0002, "num_input_tokens_seen": 55991592, "step": 83070 }, { "epoch": 2.0295360711406443, "grad_norm": 0.06819537281990051, "learning_rate": 1.4820573451614757e-06, "loss": 0.0004, "num_input_tokens_seen": 55994920, "step": 83075 }, { "epoch": 2.0296582219724915, "grad_norm": 0.08601061999797821, "learning_rate": 1.4819826283211407e-06, "loss": 0.0004, "num_input_tokens_seen": 55998184, "step": 83080 }, { "epoch": 2.0297803728043387, "grad_norm": 0.19331924617290497, "learning_rate": 1.4819079079756982e-06, "loss": 0.0003, "num_input_tokens_seen": 56001704, "step": 83085 }, { "epoch": 2.029902523636186, "grad_norm": 0.0026453707832843065, "learning_rate": 1.4818331841256919e-06, "loss": 0.1135, "num_input_tokens_seen": 56004840, "step": 83090 }, { "epoch": 2.030024674468033, "grad_norm": 0.051979582756757736, "learning_rate": 1.481758456771665e-06, "loss": 0.0003, "num_input_tokens_seen": 56007784, "step": 83095 }, { "epoch": 2.03014682529988, "grad_norm": 0.013150978833436966, "learning_rate": 1.481683725914161e-06, "loss": 0.0512, "num_input_tokens_seen": 56011496, "step": 83100 }, { "epoch": 2.0302689761317274, "grad_norm": 0.11980671435594559, "learning_rate": 1.4816089915537235e-06, "loss": 0.0002, "num_input_tokens_seen": 56015016, "step": 83105 }, { "epoch": 2.0303911269635746, "grad_norm": 0.020589416846632957, "learning_rate": 1.4815342536908962e-06, "loss": 0.0489, "num_input_tokens_seen": 56018088, "step": 83110 }, { "epoch": 2.030513277795422, "grad_norm": 0.0686459019780159, "learning_rate": 1.4814595123262218e-06, "loss": 0.0001, "num_input_tokens_seen": 56021544, "step": 83115 }, { "epoch": 2.030635428627269, "grad_norm": 930.4236450195312, "learning_rate": 1.4813847674602447e-06, "loss": 0.1347, "num_input_tokens_seen": 56025192, "step": 83120 }, { "epoch": 2.030757579459116, "grad_norm": 0.17426523566246033, "learning_rate": 1.4813100190935077e-06, "loss": 0.124, "num_input_tokens_seen": 56028392, "step": 83125 }, { "epoch": 2.0308797302909634, "grad_norm": 0.061261072754859924, "learning_rate": 1.4812352672265549e-06, "loss": 0.0594, "num_input_tokens_seen": 56031912, "step": 83130 }, { "epoch": 2.0310018811228105, "grad_norm": 0.05861698463559151, "learning_rate": 1.48116051185993e-06, "loss": 0.0001, "num_input_tokens_seen": 56035304, "step": 83135 }, { "epoch": 2.0311240319546577, "grad_norm": 0.025122350081801414, "learning_rate": 1.4810857529941762e-06, "loss": 0.0616, "num_input_tokens_seen": 56038696, "step": 83140 }, { "epoch": 2.031246182786505, "grad_norm": 0.013969638384878635, "learning_rate": 1.4810109906298375e-06, "loss": 0.076, "num_input_tokens_seen": 56041896, "step": 83145 }, { "epoch": 2.031368333618352, "grad_norm": 0.015171969309449196, "learning_rate": 1.4809362247674578e-06, "loss": 0.1202, "num_input_tokens_seen": 56045416, "step": 83150 }, { "epoch": 2.0314904844501993, "grad_norm": 0.002995851216837764, "learning_rate": 1.48086145540758e-06, "loss": 0.0196, "num_input_tokens_seen": 56048936, "step": 83155 }, { "epoch": 2.031612635282046, "grad_norm": 0.024294119328260422, "learning_rate": 1.4807866825507487e-06, "loss": 0.0005, "num_input_tokens_seen": 56052264, "step": 83160 }, { "epoch": 2.0317347861138932, "grad_norm": 0.28206366300582886, "learning_rate": 1.4807119061975074e-06, "loss": 0.0346, "num_input_tokens_seen": 56055464, "step": 83165 }, { "epoch": 2.0318569369457404, "grad_norm": 16.086278915405273, "learning_rate": 1.4806371263483995e-06, "loss": 0.0758, "num_input_tokens_seen": 56058792, "step": 83170 }, { "epoch": 2.0319790877775876, "grad_norm": 0.005598350428044796, "learning_rate": 1.4805623430039693e-06, "loss": 0.025, "num_input_tokens_seen": 56062248, "step": 83175 }, { "epoch": 2.032101238609435, "grad_norm": 17.215137481689453, "learning_rate": 1.4804875561647604e-06, "loss": 0.0016, "num_input_tokens_seen": 56065768, "step": 83180 }, { "epoch": 2.032223389441282, "grad_norm": 11.782296180725098, "learning_rate": 1.4804127658313168e-06, "loss": 0.0915, "num_input_tokens_seen": 56069032, "step": 83185 }, { "epoch": 2.032345540273129, "grad_norm": 29.0299129486084, "learning_rate": 1.4803379720041824e-06, "loss": 0.0537, "num_input_tokens_seen": 56072360, "step": 83190 }, { "epoch": 2.0324676911049764, "grad_norm": 0.00915348157286644, "learning_rate": 1.480263174683901e-06, "loss": 0.0415, "num_input_tokens_seen": 56075816, "step": 83195 }, { "epoch": 2.0325898419368236, "grad_norm": 0.12145712971687317, "learning_rate": 1.4801883738710168e-06, "loss": 0.0252, "num_input_tokens_seen": 56079080, "step": 83200 }, { "epoch": 2.0327119927686708, "grad_norm": 266.5506591796875, "learning_rate": 1.4801135695660734e-06, "loss": 0.01, "num_input_tokens_seen": 56082344, "step": 83205 }, { "epoch": 2.032834143600518, "grad_norm": 0.19172163307666779, "learning_rate": 1.480038761769615e-06, "loss": 0.0004, "num_input_tokens_seen": 56085672, "step": 83210 }, { "epoch": 2.032956294432365, "grad_norm": 15.215696334838867, "learning_rate": 1.4799639504821857e-06, "loss": 0.0973, "num_input_tokens_seen": 56088872, "step": 83215 }, { "epoch": 2.0330784452642123, "grad_norm": 0.0854988619685173, "learning_rate": 1.4798891357043296e-06, "loss": 0.0001, "num_input_tokens_seen": 56092200, "step": 83220 }, { "epoch": 2.0332005960960595, "grad_norm": 1.005537986755371, "learning_rate": 1.4798143174365902e-06, "loss": 0.0547, "num_input_tokens_seen": 56095720, "step": 83225 }, { "epoch": 2.0333227469279067, "grad_norm": 0.24813531339168549, "learning_rate": 1.4797394956795125e-06, "loss": 0.0004, "num_input_tokens_seen": 56098856, "step": 83230 }, { "epoch": 2.033444897759754, "grad_norm": 0.06855268031358719, "learning_rate": 1.4796646704336397e-06, "loss": 0.0008, "num_input_tokens_seen": 56102248, "step": 83235 }, { "epoch": 2.033567048591601, "grad_norm": 0.032730553299188614, "learning_rate": 1.4795898416995167e-06, "loss": 0.1074, "num_input_tokens_seen": 56105640, "step": 83240 }, { "epoch": 2.0336891994234483, "grad_norm": 0.02327229455113411, "learning_rate": 1.479515009477687e-06, "loss": 0.0002, "num_input_tokens_seen": 56108840, "step": 83245 }, { "epoch": 2.033811350255295, "grad_norm": 0.020187703892588615, "learning_rate": 1.4794401737686956e-06, "loss": 0.0893, "num_input_tokens_seen": 56112040, "step": 83250 }, { "epoch": 2.033933501087142, "grad_norm": 0.05681902542710304, "learning_rate": 1.4793653345730864e-06, "loss": 0.0386, "num_input_tokens_seen": 56115688, "step": 83255 }, { "epoch": 2.0340556519189894, "grad_norm": 33.550838470458984, "learning_rate": 1.4792904918914034e-06, "loss": 0.0768, "num_input_tokens_seen": 56118696, "step": 83260 }, { "epoch": 2.0341778027508366, "grad_norm": 0.5451377630233765, "learning_rate": 1.4792156457241912e-06, "loss": 0.001, "num_input_tokens_seen": 56122472, "step": 83265 }, { "epoch": 2.034299953582684, "grad_norm": 1.9952231645584106, "learning_rate": 1.4791407960719935e-06, "loss": 0.0035, "num_input_tokens_seen": 56125608, "step": 83270 }, { "epoch": 2.034422104414531, "grad_norm": 0.2995298206806183, "learning_rate": 1.4790659429353553e-06, "loss": 0.0427, "num_input_tokens_seen": 56128680, "step": 83275 }, { "epoch": 2.034544255246378, "grad_norm": 0.5154494047164917, "learning_rate": 1.4789910863148206e-06, "loss": 0.0005, "num_input_tokens_seen": 56132968, "step": 83280 }, { "epoch": 2.0346664060782254, "grad_norm": 18.13103675842285, "learning_rate": 1.4789162262109338e-06, "loss": 0.1166, "num_input_tokens_seen": 56136168, "step": 83285 }, { "epoch": 2.0347885569100725, "grad_norm": 0.30100521445274353, "learning_rate": 1.4788413626242396e-06, "loss": 0.0022, "num_input_tokens_seen": 56139304, "step": 83290 }, { "epoch": 2.0349107077419197, "grad_norm": 0.09149324893951416, "learning_rate": 1.4787664955552822e-06, "loss": 0.0004, "num_input_tokens_seen": 56142376, "step": 83295 }, { "epoch": 2.035032858573767, "grad_norm": 0.21753954887390137, "learning_rate": 1.4786916250046063e-06, "loss": 0.0003, "num_input_tokens_seen": 56145832, "step": 83300 }, { "epoch": 2.035155009405614, "grad_norm": 37.52893829345703, "learning_rate": 1.4786167509727556e-06, "loss": 0.002, "num_input_tokens_seen": 56149544, "step": 83305 }, { "epoch": 2.0352771602374613, "grad_norm": 0.015320549719035625, "learning_rate": 1.4785418734602752e-06, "loss": 0.0006, "num_input_tokens_seen": 56153512, "step": 83310 }, { "epoch": 2.0353993110693085, "grad_norm": 54.01702117919922, "learning_rate": 1.4784669924677102e-06, "loss": 0.0235, "num_input_tokens_seen": 56156904, "step": 83315 }, { "epoch": 2.0355214619011557, "grad_norm": 0.06559678912162781, "learning_rate": 1.4783921079956042e-06, "loss": 0.0004, "num_input_tokens_seen": 56160360, "step": 83320 }, { "epoch": 2.035643612733003, "grad_norm": 10.981175422668457, "learning_rate": 1.478317220044502e-06, "loss": 0.0439, "num_input_tokens_seen": 56163112, "step": 83325 }, { "epoch": 2.03576576356485, "grad_norm": 0.0281961802393198, "learning_rate": 1.4782423286149484e-06, "loss": 0.0, "num_input_tokens_seen": 56166120, "step": 83330 }, { "epoch": 2.0358879143966973, "grad_norm": 0.09851742535829544, "learning_rate": 1.478167433707488e-06, "loss": 0.0421, "num_input_tokens_seen": 56169448, "step": 83335 }, { "epoch": 2.036010065228544, "grad_norm": 168.35960388183594, "learning_rate": 1.4780925353226651e-06, "loss": 0.0176, "num_input_tokens_seen": 56172648, "step": 83340 }, { "epoch": 2.036132216060391, "grad_norm": 0.10178277641534805, "learning_rate": 1.478017633461025e-06, "loss": 0.0003, "num_input_tokens_seen": 56175976, "step": 83345 }, { "epoch": 2.0362543668922384, "grad_norm": 0.021149465814232826, "learning_rate": 1.477942728123112e-06, "loss": 0.0514, "num_input_tokens_seen": 56179496, "step": 83350 }, { "epoch": 2.0363765177240856, "grad_norm": 0.0038066126871854067, "learning_rate": 1.4778678193094712e-06, "loss": 0.0001, "num_input_tokens_seen": 56183144, "step": 83355 }, { "epoch": 2.0364986685559328, "grad_norm": 0.02866493910551071, "learning_rate": 1.477792907020647e-06, "loss": 0.0002, "num_input_tokens_seen": 56186600, "step": 83360 }, { "epoch": 2.03662081938778, "grad_norm": 0.0018447516486048698, "learning_rate": 1.477717991257184e-06, "loss": 0.0156, "num_input_tokens_seen": 56189928, "step": 83365 }, { "epoch": 2.036742970219627, "grad_norm": 0.04497027024626732, "learning_rate": 1.4776430720196275e-06, "loss": 0.0004, "num_input_tokens_seen": 56193000, "step": 83370 }, { "epoch": 2.0368651210514743, "grad_norm": 0.0963190421462059, "learning_rate": 1.4775681493085218e-06, "loss": 0.001, "num_input_tokens_seen": 56196456, "step": 83375 }, { "epoch": 2.0369872718833215, "grad_norm": 0.007222073618322611, "learning_rate": 1.4774932231244125e-06, "loss": 0.0004, "num_input_tokens_seen": 56200360, "step": 83380 }, { "epoch": 2.0371094227151687, "grad_norm": 0.10533749312162399, "learning_rate": 1.4774182934678438e-06, "loss": 0.0095, "num_input_tokens_seen": 56203944, "step": 83385 }, { "epoch": 2.037231573547016, "grad_norm": 39.62832260131836, "learning_rate": 1.477343360339361e-06, "loss": 0.0367, "num_input_tokens_seen": 56207848, "step": 83390 }, { "epoch": 2.037353724378863, "grad_norm": 0.0017830976285040379, "learning_rate": 1.4772684237395088e-06, "loss": 0.0001, "num_input_tokens_seen": 56211880, "step": 83395 }, { "epoch": 2.0374758752107103, "grad_norm": 0.002838310319930315, "learning_rate": 1.4771934836688322e-06, "loss": 0.0001, "num_input_tokens_seen": 56215144, "step": 83400 }, { "epoch": 2.0375980260425575, "grad_norm": 0.017945390194654465, "learning_rate": 1.477118540127876e-06, "loss": 0.0001, "num_input_tokens_seen": 56218664, "step": 83405 }, { "epoch": 2.0377201768744047, "grad_norm": 0.005239225924015045, "learning_rate": 1.477043593117186e-06, "loss": 0.0437, "num_input_tokens_seen": 56222568, "step": 83410 }, { "epoch": 2.037842327706252, "grad_norm": 0.061735525727272034, "learning_rate": 1.4769686426373065e-06, "loss": 0.0001, "num_input_tokens_seen": 56226536, "step": 83415 }, { "epoch": 2.037964478538099, "grad_norm": 0.0004880604974459857, "learning_rate": 1.4768936886887826e-06, "loss": 0.059, "num_input_tokens_seen": 56230120, "step": 83420 }, { "epoch": 2.0380866293699462, "grad_norm": 0.0011679143644869328, "learning_rate": 1.4768187312721598e-06, "loss": 0.0001, "num_input_tokens_seen": 56233384, "step": 83425 }, { "epoch": 2.038208780201793, "grad_norm": 0.0034079423639923334, "learning_rate": 1.4767437703879825e-06, "loss": 0.0003, "num_input_tokens_seen": 56236904, "step": 83430 }, { "epoch": 2.03833093103364, "grad_norm": 0.08557354658842087, "learning_rate": 1.4766688060367965e-06, "loss": 0.0001, "num_input_tokens_seen": 56239848, "step": 83435 }, { "epoch": 2.0384530818654873, "grad_norm": 0.8352116346359253, "learning_rate": 1.4765938382191468e-06, "loss": 0.0003, "num_input_tokens_seen": 56243176, "step": 83440 }, { "epoch": 2.0385752326973345, "grad_norm": 0.005509098991751671, "learning_rate": 1.4765188669355784e-06, "loss": 0.0522, "num_input_tokens_seen": 56246760, "step": 83445 }, { "epoch": 2.0386973835291817, "grad_norm": 0.07742209732532501, "learning_rate": 1.4764438921866367e-06, "loss": 0.0001, "num_input_tokens_seen": 56249640, "step": 83450 }, { "epoch": 2.038819534361029, "grad_norm": 0.008906389586627483, "learning_rate": 1.476368913972867e-06, "loss": 0.0604, "num_input_tokens_seen": 56252648, "step": 83455 }, { "epoch": 2.038941685192876, "grad_norm": 0.009594157338142395, "learning_rate": 1.4762939322948142e-06, "loss": 0.0002, "num_input_tokens_seen": 56255720, "step": 83460 }, { "epoch": 2.0390638360247233, "grad_norm": 0.20015442371368408, "learning_rate": 1.4762189471530237e-06, "loss": 0.1104, "num_input_tokens_seen": 56258536, "step": 83465 }, { "epoch": 2.0391859868565705, "grad_norm": 18.550580978393555, "learning_rate": 1.4761439585480413e-06, "loss": 0.1059, "num_input_tokens_seen": 56262248, "step": 83470 }, { "epoch": 2.0393081376884177, "grad_norm": 0.43347489833831787, "learning_rate": 1.4760689664804117e-06, "loss": 0.0003, "num_input_tokens_seen": 56266408, "step": 83475 }, { "epoch": 2.039430288520265, "grad_norm": 0.0659785196185112, "learning_rate": 1.4759939709506808e-06, "loss": 0.0001, "num_input_tokens_seen": 56269928, "step": 83480 }, { "epoch": 2.039552439352112, "grad_norm": 0.0010263713775202632, "learning_rate": 1.4759189719593936e-06, "loss": 0.0291, "num_input_tokens_seen": 56273704, "step": 83485 }, { "epoch": 2.0396745901839592, "grad_norm": 0.37888064980506897, "learning_rate": 1.4758439695070956e-06, "loss": 0.0007, "num_input_tokens_seen": 56277288, "step": 83490 }, { "epoch": 2.0397967410158064, "grad_norm": 501.5647888183594, "learning_rate": 1.475768963594332e-06, "loss": 0.0345, "num_input_tokens_seen": 56280744, "step": 83495 }, { "epoch": 2.0399188918476536, "grad_norm": 0.025557933375239372, "learning_rate": 1.4756939542216488e-06, "loss": 0.0004, "num_input_tokens_seen": 56284008, "step": 83500 }, { "epoch": 2.040041042679501, "grad_norm": 0.005212422460317612, "learning_rate": 1.4756189413895912e-06, "loss": 0.0383, "num_input_tokens_seen": 56287400, "step": 83505 }, { "epoch": 2.040163193511348, "grad_norm": 0.025140509009361267, "learning_rate": 1.4755439250987046e-06, "loss": 0.0001, "num_input_tokens_seen": 56290536, "step": 83510 }, { "epoch": 2.040285344343195, "grad_norm": 0.016255099326372147, "learning_rate": 1.475468905349535e-06, "loss": 0.0001, "num_input_tokens_seen": 56293864, "step": 83515 }, { "epoch": 2.040407495175042, "grad_norm": 1.7284395694732666, "learning_rate": 1.4753938821426274e-06, "loss": 0.0621, "num_input_tokens_seen": 56296872, "step": 83520 }, { "epoch": 2.040529646006889, "grad_norm": 613.140625, "learning_rate": 1.4753188554785276e-06, "loss": 0.0216, "num_input_tokens_seen": 56300008, "step": 83525 }, { "epoch": 2.0406517968387363, "grad_norm": 0.010204845108091831, "learning_rate": 1.4752438253577816e-06, "loss": 0.0002, "num_input_tokens_seen": 56303272, "step": 83530 }, { "epoch": 2.0407739476705835, "grad_norm": 0.018574582412838936, "learning_rate": 1.4751687917809342e-06, "loss": 0.0003, "num_input_tokens_seen": 56306664, "step": 83535 }, { "epoch": 2.0408960985024307, "grad_norm": 31.733015060424805, "learning_rate": 1.4750937547485316e-06, "loss": 0.0503, "num_input_tokens_seen": 56309608, "step": 83540 }, { "epoch": 2.041018249334278, "grad_norm": 0.0498012974858284, "learning_rate": 1.4750187142611195e-06, "loss": 0.083, "num_input_tokens_seen": 56313384, "step": 83545 }, { "epoch": 2.041140400166125, "grad_norm": 0.008056207560002804, "learning_rate": 1.4749436703192436e-06, "loss": 0.0003, "num_input_tokens_seen": 56316392, "step": 83550 }, { "epoch": 2.0412625509979723, "grad_norm": 0.011606397107243538, "learning_rate": 1.4748686229234497e-06, "loss": 0.0001, "num_input_tokens_seen": 56319784, "step": 83555 }, { "epoch": 2.0413847018298195, "grad_norm": 29.21710968017578, "learning_rate": 1.474793572074283e-06, "loss": 0.1114, "num_input_tokens_seen": 56322856, "step": 83560 }, { "epoch": 2.0415068526616666, "grad_norm": 0.05728091299533844, "learning_rate": 1.47471851777229e-06, "loss": 0.0002, "num_input_tokens_seen": 56325928, "step": 83565 }, { "epoch": 2.041629003493514, "grad_norm": 0.02401016652584076, "learning_rate": 1.4746434600180165e-06, "loss": 0.0001, "num_input_tokens_seen": 56329000, "step": 83570 }, { "epoch": 2.041751154325361, "grad_norm": 33.782012939453125, "learning_rate": 1.4745683988120079e-06, "loss": 0.0626, "num_input_tokens_seen": 56332392, "step": 83575 }, { "epoch": 2.041873305157208, "grad_norm": 0.0074986666440963745, "learning_rate": 1.4744933341548105e-06, "loss": 0.0003, "num_input_tokens_seen": 56335720, "step": 83580 }, { "epoch": 2.0419954559890554, "grad_norm": 0.004022131208330393, "learning_rate": 1.4744182660469697e-06, "loss": 0.0433, "num_input_tokens_seen": 56338984, "step": 83585 }, { "epoch": 2.0421176068209026, "grad_norm": 0.6381987929344177, "learning_rate": 1.4743431944890315e-06, "loss": 0.0005, "num_input_tokens_seen": 56342504, "step": 83590 }, { "epoch": 2.04223975765275, "grad_norm": 33.13918685913086, "learning_rate": 1.4742681194815423e-06, "loss": 0.107, "num_input_tokens_seen": 56346088, "step": 83595 }, { "epoch": 2.042361908484597, "grad_norm": 0.03665100038051605, "learning_rate": 1.4741930410250477e-06, "loss": 0.0558, "num_input_tokens_seen": 56349672, "step": 83600 }, { "epoch": 2.042484059316444, "grad_norm": 19.979780197143555, "learning_rate": 1.4741179591200936e-06, "loss": 0.0592, "num_input_tokens_seen": 56353064, "step": 83605 }, { "epoch": 2.042606210148291, "grad_norm": 0.08430862426757812, "learning_rate": 1.4740428737672263e-06, "loss": 0.0792, "num_input_tokens_seen": 56356072, "step": 83610 }, { "epoch": 2.042728360980138, "grad_norm": 50.88820266723633, "learning_rate": 1.4739677849669919e-06, "loss": 0.0275, "num_input_tokens_seen": 56359656, "step": 83615 }, { "epoch": 2.0428505118119853, "grad_norm": 0.003031768836081028, "learning_rate": 1.4738926927199358e-06, "loss": 0.0653, "num_input_tokens_seen": 56363048, "step": 83620 }, { "epoch": 2.0429726626438325, "grad_norm": 12.282257080078125, "learning_rate": 1.473817597026605e-06, "loss": 0.0515, "num_input_tokens_seen": 56366632, "step": 83625 }, { "epoch": 2.0430948134756797, "grad_norm": 0.33622220158576965, "learning_rate": 1.4737424978875453e-06, "loss": 0.0004, "num_input_tokens_seen": 56370152, "step": 83630 }, { "epoch": 2.043216964307527, "grad_norm": 0.0007160211098380387, "learning_rate": 1.4736673953033023e-06, "loss": 0.0693, "num_input_tokens_seen": 56373416, "step": 83635 }, { "epoch": 2.043339115139374, "grad_norm": 0.8499990105628967, "learning_rate": 1.473592289274423e-06, "loss": 0.0005, "num_input_tokens_seen": 56376488, "step": 83640 }, { "epoch": 2.0434612659712212, "grad_norm": 0.2281034141778946, "learning_rate": 1.473517179801453e-06, "loss": 0.0482, "num_input_tokens_seen": 56379624, "step": 83645 }, { "epoch": 2.0435834168030684, "grad_norm": 0.23095224797725677, "learning_rate": 1.4734420668849384e-06, "loss": 0.0008, "num_input_tokens_seen": 56382760, "step": 83650 }, { "epoch": 2.0437055676349156, "grad_norm": 0.08366420865058899, "learning_rate": 1.4733669505254263e-06, "loss": 0.0539, "num_input_tokens_seen": 56386856, "step": 83655 }, { "epoch": 2.043827718466763, "grad_norm": 0.39161205291748047, "learning_rate": 1.473291830723462e-06, "loss": 0.0019, "num_input_tokens_seen": 56390504, "step": 83660 }, { "epoch": 2.04394986929861, "grad_norm": 0.48570919036865234, "learning_rate": 1.4732167074795925e-06, "loss": 0.0344, "num_input_tokens_seen": 56393448, "step": 83665 }, { "epoch": 2.044072020130457, "grad_norm": 0.008141516707837582, "learning_rate": 1.4731415807943638e-06, "loss": 0.0002, "num_input_tokens_seen": 56397288, "step": 83670 }, { "epoch": 2.0441941709623044, "grad_norm": 0.05445622652769089, "learning_rate": 1.4730664506683219e-06, "loss": 0.0003, "num_input_tokens_seen": 56400744, "step": 83675 }, { "epoch": 2.0443163217941516, "grad_norm": 0.005790103226900101, "learning_rate": 1.4729913171020138e-06, "loss": 0.0002, "num_input_tokens_seen": 56404200, "step": 83680 }, { "epoch": 2.0444384726259988, "grad_norm": 0.17963790893554688, "learning_rate": 1.4729161800959857e-06, "loss": 0.0814, "num_input_tokens_seen": 56407272, "step": 83685 }, { "epoch": 2.044560623457846, "grad_norm": 0.009207534603774548, "learning_rate": 1.4728410396507839e-06, "loss": 0.0003, "num_input_tokens_seen": 56410344, "step": 83690 }, { "epoch": 2.0446827742896927, "grad_norm": 0.03656099736690521, "learning_rate": 1.4727658957669548e-06, "loss": 0.0005, "num_input_tokens_seen": 56413480, "step": 83695 }, { "epoch": 2.04480492512154, "grad_norm": 0.007025151047855616, "learning_rate": 1.472690748445045e-06, "loss": 0.0, "num_input_tokens_seen": 56416680, "step": 83700 }, { "epoch": 2.044927075953387, "grad_norm": 0.008257608860731125, "learning_rate": 1.4726155976856012e-06, "loss": 0.0007, "num_input_tokens_seen": 56420072, "step": 83705 }, { "epoch": 2.0450492267852343, "grad_norm": 0.005688189063221216, "learning_rate": 1.4725404434891693e-06, "loss": 0.0592, "num_input_tokens_seen": 56423784, "step": 83710 }, { "epoch": 2.0451713776170815, "grad_norm": 0.0242256298661232, "learning_rate": 1.472465285856296e-06, "loss": 0.0205, "num_input_tokens_seen": 56427688, "step": 83715 }, { "epoch": 2.0452935284489286, "grad_norm": 0.026224689558148384, "learning_rate": 1.4723901247875283e-06, "loss": 0.0001, "num_input_tokens_seen": 56430888, "step": 83720 }, { "epoch": 2.045415679280776, "grad_norm": 0.013745622709393501, "learning_rate": 1.4723149602834127e-06, "loss": 0.0001, "num_input_tokens_seen": 56434152, "step": 83725 }, { "epoch": 2.045537830112623, "grad_norm": 0.00482986168935895, "learning_rate": 1.4722397923444955e-06, "loss": 0.0, "num_input_tokens_seen": 56437416, "step": 83730 }, { "epoch": 2.04565998094447, "grad_norm": 0.018875837326049805, "learning_rate": 1.4721646209713239e-06, "loss": 0.0, "num_input_tokens_seen": 56440744, "step": 83735 }, { "epoch": 2.0457821317763174, "grad_norm": 0.013076196424663067, "learning_rate": 1.472089446164444e-06, "loss": 0.0001, "num_input_tokens_seen": 56443752, "step": 83740 }, { "epoch": 2.0459042826081646, "grad_norm": 0.03415194898843765, "learning_rate": 1.4720142679244022e-06, "loss": 0.0003, "num_input_tokens_seen": 56447016, "step": 83745 }, { "epoch": 2.046026433440012, "grad_norm": 0.008563662879168987, "learning_rate": 1.471939086251746e-06, "loss": 0.049, "num_input_tokens_seen": 56450536, "step": 83750 }, { "epoch": 2.046148584271859, "grad_norm": 0.009224042296409607, "learning_rate": 1.471863901147022e-06, "loss": 0.0, "num_input_tokens_seen": 56453864, "step": 83755 }, { "epoch": 2.046270735103706, "grad_norm": 20.467226028442383, "learning_rate": 1.4717887126107766e-06, "loss": 0.062, "num_input_tokens_seen": 56457640, "step": 83760 }, { "epoch": 2.0463928859355534, "grad_norm": 0.025449080392718315, "learning_rate": 1.471713520643557e-06, "loss": 0.0396, "num_input_tokens_seen": 56461224, "step": 83765 }, { "epoch": 2.0465150367674005, "grad_norm": 0.004977928474545479, "learning_rate": 1.4716383252459096e-06, "loss": 0.0001, "num_input_tokens_seen": 56464680, "step": 83770 }, { "epoch": 2.0466371875992477, "grad_norm": 95.45176696777344, "learning_rate": 1.4715631264183812e-06, "loss": 0.0774, "num_input_tokens_seen": 56468648, "step": 83775 }, { "epoch": 2.046759338431095, "grad_norm": 0.08478084206581116, "learning_rate": 1.4714879241615195e-06, "loss": 0.0593, "num_input_tokens_seen": 56472040, "step": 83780 }, { "epoch": 2.0468814892629417, "grad_norm": 0.03638599067926407, "learning_rate": 1.4714127184758703e-06, "loss": 0.0001, "num_input_tokens_seen": 56475240, "step": 83785 }, { "epoch": 2.047003640094789, "grad_norm": 0.023271914571523666, "learning_rate": 1.4713375093619812e-06, "loss": 0.0007, "num_input_tokens_seen": 56478760, "step": 83790 }, { "epoch": 2.047125790926636, "grad_norm": 0.013252345845103264, "learning_rate": 1.471262296820399e-06, "loss": 0.0231, "num_input_tokens_seen": 56482408, "step": 83795 }, { "epoch": 2.0472479417584832, "grad_norm": 0.011788555420935154, "learning_rate": 1.4711870808516706e-06, "loss": 0.0009, "num_input_tokens_seen": 56486056, "step": 83800 }, { "epoch": 2.0473700925903304, "grad_norm": 0.037248142063617706, "learning_rate": 1.4711118614563427e-06, "loss": 0.0644, "num_input_tokens_seen": 56489768, "step": 83805 }, { "epoch": 2.0474922434221776, "grad_norm": 0.0038712075911462307, "learning_rate": 1.4710366386349631e-06, "loss": 0.0003, "num_input_tokens_seen": 56492904, "step": 83810 }, { "epoch": 2.047614394254025, "grad_norm": 0.07566796988248825, "learning_rate": 1.4709614123880783e-06, "loss": 0.0652, "num_input_tokens_seen": 56496168, "step": 83815 }, { "epoch": 2.047736545085872, "grad_norm": 47.76158905029297, "learning_rate": 1.470886182716235e-06, "loss": 0.043, "num_input_tokens_seen": 56499624, "step": 83820 }, { "epoch": 2.047858695917719, "grad_norm": 13.864922523498535, "learning_rate": 1.4708109496199815e-06, "loss": 0.044, "num_input_tokens_seen": 56502888, "step": 83825 }, { "epoch": 2.0479808467495664, "grad_norm": 0.012407737784087658, "learning_rate": 1.4707357130998635e-06, "loss": 0.0002, "num_input_tokens_seen": 56506152, "step": 83830 }, { "epoch": 2.0481029975814136, "grad_norm": 0.026929527521133423, "learning_rate": 1.470660473156429e-06, "loss": 0.1097, "num_input_tokens_seen": 56509416, "step": 83835 }, { "epoch": 2.0482251484132608, "grad_norm": 0.03948600962758064, "learning_rate": 1.4705852297902248e-06, "loss": 0.1012, "num_input_tokens_seen": 56513320, "step": 83840 }, { "epoch": 2.048347299245108, "grad_norm": 0.06307268887758255, "learning_rate": 1.4705099830017983e-06, "loss": 0.1403, "num_input_tokens_seen": 56516776, "step": 83845 }, { "epoch": 2.048469450076955, "grad_norm": 0.013860165141522884, "learning_rate": 1.470434732791697e-06, "loss": 0.1199, "num_input_tokens_seen": 56520232, "step": 83850 }, { "epoch": 2.0485916009088023, "grad_norm": 0.21431736648082733, "learning_rate": 1.4703594791604674e-06, "loss": 0.0007, "num_input_tokens_seen": 56523688, "step": 83855 }, { "epoch": 2.0487137517406495, "grad_norm": 0.3006341755390167, "learning_rate": 1.4702842221086573e-06, "loss": 0.0463, "num_input_tokens_seen": 56526888, "step": 83860 }, { "epoch": 2.0488359025724967, "grad_norm": 0.27715471386909485, "learning_rate": 1.470208961636814e-06, "loss": 0.0004, "num_input_tokens_seen": 56530152, "step": 83865 }, { "epoch": 2.048958053404344, "grad_norm": 0.04112695902585983, "learning_rate": 1.4701336977454841e-06, "loss": 0.0006, "num_input_tokens_seen": 56533800, "step": 83870 }, { "epoch": 2.0490802042361906, "grad_norm": 0.04875229671597481, "learning_rate": 1.470058430435216e-06, "loss": 0.0003, "num_input_tokens_seen": 56537512, "step": 83875 }, { "epoch": 2.049202355068038, "grad_norm": 0.16153334081172943, "learning_rate": 1.4699831597065565e-06, "loss": 0.0003, "num_input_tokens_seen": 56540904, "step": 83880 }, { "epoch": 2.049324505899885, "grad_norm": 0.021278617903590202, "learning_rate": 1.469907885560053e-06, "loss": 0.0006, "num_input_tokens_seen": 56544040, "step": 83885 }, { "epoch": 2.049446656731732, "grad_norm": 338.5051574707031, "learning_rate": 1.4698326079962532e-06, "loss": 0.0098, "num_input_tokens_seen": 56547368, "step": 83890 }, { "epoch": 2.0495688075635794, "grad_norm": 0.8432843685150146, "learning_rate": 1.4697573270157038e-06, "loss": 0.0327, "num_input_tokens_seen": 56550696, "step": 83895 }, { "epoch": 2.0496909583954266, "grad_norm": 0.017997203394770622, "learning_rate": 1.469682042618953e-06, "loss": 0.0614, "num_input_tokens_seen": 56553960, "step": 83900 }, { "epoch": 2.0498131092272738, "grad_norm": 0.022712845355272293, "learning_rate": 1.469606754806548e-06, "loss": 0.0002, "num_input_tokens_seen": 56557096, "step": 83905 }, { "epoch": 2.049935260059121, "grad_norm": 0.005200402345508337, "learning_rate": 1.4695314635790366e-06, "loss": 0.0003, "num_input_tokens_seen": 56560232, "step": 83910 }, { "epoch": 2.050057410890968, "grad_norm": 0.027068452909588814, "learning_rate": 1.4694561689369657e-06, "loss": 0.0001, "num_input_tokens_seen": 56563240, "step": 83915 }, { "epoch": 2.0501795617228153, "grad_norm": 0.0025052025448530912, "learning_rate": 1.4693808708808837e-06, "loss": 0.0111, "num_input_tokens_seen": 56567016, "step": 83920 }, { "epoch": 2.0503017125546625, "grad_norm": 0.07576905190944672, "learning_rate": 1.4693055694113377e-06, "loss": 0.0002, "num_input_tokens_seen": 56570408, "step": 83925 }, { "epoch": 2.0504238633865097, "grad_norm": 0.06434017419815063, "learning_rate": 1.469230264528875e-06, "loss": 0.0001, "num_input_tokens_seen": 56573608, "step": 83930 }, { "epoch": 2.050546014218357, "grad_norm": 0.15855221450328827, "learning_rate": 1.469154956234044e-06, "loss": 0.0001, "num_input_tokens_seen": 56576872, "step": 83935 }, { "epoch": 2.050668165050204, "grad_norm": 0.2547149360179901, "learning_rate": 1.4690796445273918e-06, "loss": 0.0714, "num_input_tokens_seen": 56580200, "step": 83940 }, { "epoch": 2.0507903158820513, "grad_norm": 0.0009536282741464674, "learning_rate": 1.4690043294094665e-06, "loss": 0.0373, "num_input_tokens_seen": 56583784, "step": 83945 }, { "epoch": 2.0509124667138985, "grad_norm": 0.00821103435009718, "learning_rate": 1.4689290108808152e-06, "loss": 0.0004, "num_input_tokens_seen": 56586920, "step": 83950 }, { "epoch": 2.0510346175457457, "grad_norm": 0.00451961625367403, "learning_rate": 1.4688536889419861e-06, "loss": 0.1476, "num_input_tokens_seen": 56589928, "step": 83955 }, { "epoch": 2.051156768377593, "grad_norm": 0.020787635818123817, "learning_rate": 1.468778363593527e-06, "loss": 0.0411, "num_input_tokens_seen": 56593384, "step": 83960 }, { "epoch": 2.0512789192094396, "grad_norm": 0.005360802169889212, "learning_rate": 1.4687030348359855e-06, "loss": 0.0005, "num_input_tokens_seen": 56596648, "step": 83965 }, { "epoch": 2.051401070041287, "grad_norm": 255.1057891845703, "learning_rate": 1.4686277026699094e-06, "loss": 0.0247, "num_input_tokens_seen": 56600488, "step": 83970 }, { "epoch": 2.051523220873134, "grad_norm": 29.247344970703125, "learning_rate": 1.4685523670958466e-06, "loss": 0.0652, "num_input_tokens_seen": 56603752, "step": 83975 }, { "epoch": 2.051645371704981, "grad_norm": 0.009687669575214386, "learning_rate": 1.468477028114345e-06, "loss": 0.0007, "num_input_tokens_seen": 56606760, "step": 83980 }, { "epoch": 2.0517675225368284, "grad_norm": 0.019696874544024467, "learning_rate": 1.4684016857259524e-06, "loss": 0.0728, "num_input_tokens_seen": 56610280, "step": 83985 }, { "epoch": 2.0518896733686756, "grad_norm": 0.7689847350120544, "learning_rate": 1.4683263399312171e-06, "loss": 0.0004, "num_input_tokens_seen": 56613480, "step": 83990 }, { "epoch": 2.0520118242005227, "grad_norm": 0.022250818088650703, "learning_rate": 1.4682509907306863e-06, "loss": 0.0446, "num_input_tokens_seen": 56616744, "step": 83995 }, { "epoch": 2.05213397503237, "grad_norm": 0.06082061305642128, "learning_rate": 1.4681756381249085e-06, "loss": 0.0003, "num_input_tokens_seen": 56620584, "step": 84000 }, { "epoch": 2.052256125864217, "grad_norm": 0.33939114212989807, "learning_rate": 1.4681002821144315e-06, "loss": 0.0361, "num_input_tokens_seen": 56624104, "step": 84005 }, { "epoch": 2.0523782766960643, "grad_norm": 0.008764307014644146, "learning_rate": 1.4680249226998033e-06, "loss": 0.0764, "num_input_tokens_seen": 56627560, "step": 84010 }, { "epoch": 2.0525004275279115, "grad_norm": 9.919015884399414, "learning_rate": 1.467949559881572e-06, "loss": 0.0414, "num_input_tokens_seen": 56630888, "step": 84015 }, { "epoch": 2.0526225783597587, "grad_norm": 1.171688437461853, "learning_rate": 1.467874193660286e-06, "loss": 0.0008, "num_input_tokens_seen": 56634280, "step": 84020 }, { "epoch": 2.052744729191606, "grad_norm": 0.02687976323068142, "learning_rate": 1.4677988240364922e-06, "loss": 0.0001, "num_input_tokens_seen": 56637288, "step": 84025 }, { "epoch": 2.052866880023453, "grad_norm": 104.11189270019531, "learning_rate": 1.4677234510107402e-06, "loss": 0.0377, "num_input_tokens_seen": 56640552, "step": 84030 }, { "epoch": 2.0529890308553003, "grad_norm": 0.005070993211120367, "learning_rate": 1.4676480745835774e-06, "loss": 0.0002, "num_input_tokens_seen": 56644008, "step": 84035 }, { "epoch": 2.0531111816871475, "grad_norm": 0.042603764683008194, "learning_rate": 1.4675726947555519e-06, "loss": 0.0301, "num_input_tokens_seen": 56647208, "step": 84040 }, { "epoch": 2.0532333325189946, "grad_norm": 0.03279830515384674, "learning_rate": 1.467497311527212e-06, "loss": 0.0034, "num_input_tokens_seen": 56651304, "step": 84045 }, { "epoch": 2.053355483350842, "grad_norm": 0.023159446194767952, "learning_rate": 1.467421924899106e-06, "loss": 0.0001, "num_input_tokens_seen": 56654504, "step": 84050 }, { "epoch": 2.0534776341826886, "grad_norm": 0.006754196248948574, "learning_rate": 1.4673465348717817e-06, "loss": 0.0613, "num_input_tokens_seen": 56657832, "step": 84055 }, { "epoch": 2.0535997850145358, "grad_norm": 0.013502690009772778, "learning_rate": 1.4672711414457879e-06, "loss": 0.025, "num_input_tokens_seen": 56661736, "step": 84060 }, { "epoch": 2.053721935846383, "grad_norm": 0.037468913942575455, "learning_rate": 1.4671957446216728e-06, "loss": 0.0004, "num_input_tokens_seen": 56664744, "step": 84065 }, { "epoch": 2.05384408667823, "grad_norm": 0.05100453272461891, "learning_rate": 1.4671203443999844e-06, "loss": 0.0001, "num_input_tokens_seen": 56668328, "step": 84070 }, { "epoch": 2.0539662375100773, "grad_norm": 0.013385049998760223, "learning_rate": 1.4670449407812715e-06, "loss": 0.0857, "num_input_tokens_seen": 56671400, "step": 84075 }, { "epoch": 2.0540883883419245, "grad_norm": 0.022386562079191208, "learning_rate": 1.4669695337660818e-06, "loss": 0.0003, "num_input_tokens_seen": 56674728, "step": 84080 }, { "epoch": 2.0542105391737717, "grad_norm": 0.11775214225053787, "learning_rate": 1.4668941233549642e-06, "loss": 0.151, "num_input_tokens_seen": 56678120, "step": 84085 }, { "epoch": 2.054332690005619, "grad_norm": 0.08483375608921051, "learning_rate": 1.4668187095484673e-06, "loss": 0.1097, "num_input_tokens_seen": 56681448, "step": 84090 }, { "epoch": 2.054454840837466, "grad_norm": 0.05424126237630844, "learning_rate": 1.4667432923471389e-06, "loss": 0.0005, "num_input_tokens_seen": 56685160, "step": 84095 }, { "epoch": 2.0545769916693133, "grad_norm": 0.3192615509033203, "learning_rate": 1.4666678717515275e-06, "loss": 0.0368, "num_input_tokens_seen": 56688424, "step": 84100 }, { "epoch": 2.0546991425011605, "grad_norm": 0.17142631113529205, "learning_rate": 1.4665924477621824e-06, "loss": 0.0007, "num_input_tokens_seen": 56691880, "step": 84105 }, { "epoch": 2.0548212933330077, "grad_norm": 67.49655151367188, "learning_rate": 1.466517020379651e-06, "loss": 0.0392, "num_input_tokens_seen": 56695464, "step": 84110 }, { "epoch": 2.054943444164855, "grad_norm": 0.03887330740690231, "learning_rate": 1.4664415896044826e-06, "loss": 0.0001, "num_input_tokens_seen": 56699176, "step": 84115 }, { "epoch": 2.055065594996702, "grad_norm": 19.39141845703125, "learning_rate": 1.466366155437225e-06, "loss": 0.0491, "num_input_tokens_seen": 56702248, "step": 84120 }, { "epoch": 2.0551877458285492, "grad_norm": 0.12292356789112091, "learning_rate": 1.4662907178784277e-06, "loss": 0.0004, "num_input_tokens_seen": 56705832, "step": 84125 }, { "epoch": 2.0553098966603964, "grad_norm": 0.062030475586652756, "learning_rate": 1.466215276928639e-06, "loss": 0.0002, "num_input_tokens_seen": 56709160, "step": 84130 }, { "epoch": 2.0554320474922436, "grad_norm": 598.9610595703125, "learning_rate": 1.4661398325884074e-06, "loss": 0.0376, "num_input_tokens_seen": 56712936, "step": 84135 }, { "epoch": 2.0555541983240904, "grad_norm": 20.79981231689453, "learning_rate": 1.4660643848582813e-06, "loss": 0.1215, "num_input_tokens_seen": 56716456, "step": 84140 }, { "epoch": 2.0556763491559376, "grad_norm": 0.03508620709180832, "learning_rate": 1.4659889337388099e-06, "loss": 0.057, "num_input_tokens_seen": 56719656, "step": 84145 }, { "epoch": 2.0557984999877847, "grad_norm": 0.03408919274806976, "learning_rate": 1.4659134792305415e-06, "loss": 0.0001, "num_input_tokens_seen": 56723112, "step": 84150 }, { "epoch": 2.055920650819632, "grad_norm": 0.2693452835083008, "learning_rate": 1.4658380213340249e-06, "loss": 0.0401, "num_input_tokens_seen": 56726248, "step": 84155 }, { "epoch": 2.056042801651479, "grad_norm": 0.03239583224058151, "learning_rate": 1.465762560049809e-06, "loss": 0.0643, "num_input_tokens_seen": 56729384, "step": 84160 }, { "epoch": 2.0561649524833263, "grad_norm": 0.010898438282310963, "learning_rate": 1.4656870953784426e-06, "loss": 0.0002, "num_input_tokens_seen": 56733288, "step": 84165 }, { "epoch": 2.0562871033151735, "grad_norm": 0.011616937816143036, "learning_rate": 1.4656116273204742e-06, "loss": 0.0004, "num_input_tokens_seen": 56736424, "step": 84170 }, { "epoch": 2.0564092541470207, "grad_norm": 0.018615955486893654, "learning_rate": 1.465536155876453e-06, "loss": 0.0329, "num_input_tokens_seen": 56739816, "step": 84175 }, { "epoch": 2.056531404978868, "grad_norm": 0.007698644418269396, "learning_rate": 1.4654606810469275e-06, "loss": 0.1285, "num_input_tokens_seen": 56743208, "step": 84180 }, { "epoch": 2.056653555810715, "grad_norm": 0.06628992408514023, "learning_rate": 1.4653852028324467e-06, "loss": 0.0529, "num_input_tokens_seen": 56746984, "step": 84185 }, { "epoch": 2.0567757066425623, "grad_norm": 0.07687515765428543, "learning_rate": 1.4653097212335594e-06, "loss": 0.0003, "num_input_tokens_seen": 56750824, "step": 84190 }, { "epoch": 2.0568978574744095, "grad_norm": 0.02513556368649006, "learning_rate": 1.465234236250815e-06, "loss": 0.0904, "num_input_tokens_seen": 56754472, "step": 84195 }, { "epoch": 2.0570200083062566, "grad_norm": 433.2476501464844, "learning_rate": 1.4651587478847623e-06, "loss": 0.0169, "num_input_tokens_seen": 56757736, "step": 84200 }, { "epoch": 2.057142159138104, "grad_norm": 17.649721145629883, "learning_rate": 1.4650832561359496e-06, "loss": 0.0745, "num_input_tokens_seen": 56761128, "step": 84205 }, { "epoch": 2.057264309969951, "grad_norm": 0.15260326862335205, "learning_rate": 1.4650077610049264e-06, "loss": 0.0949, "num_input_tokens_seen": 56764584, "step": 84210 }, { "epoch": 2.057386460801798, "grad_norm": 0.0442718043923378, "learning_rate": 1.4649322624922418e-06, "loss": 0.0992, "num_input_tokens_seen": 56768488, "step": 84215 }, { "epoch": 2.0575086116336454, "grad_norm": 0.018568266183137894, "learning_rate": 1.4648567605984447e-06, "loss": 0.0001, "num_input_tokens_seen": 56771880, "step": 84220 }, { "epoch": 2.0576307624654926, "grad_norm": 0.47161176800727844, "learning_rate": 1.4647812553240844e-06, "loss": 0.0006, "num_input_tokens_seen": 56775464, "step": 84225 }, { "epoch": 2.0577529132973393, "grad_norm": 0.22203360497951508, "learning_rate": 1.4647057466697094e-06, "loss": 0.0006, "num_input_tokens_seen": 56778984, "step": 84230 }, { "epoch": 2.0578750641291865, "grad_norm": 0.007652414962649345, "learning_rate": 1.4646302346358697e-06, "loss": 0.0472, "num_input_tokens_seen": 56782504, "step": 84235 }, { "epoch": 2.0579972149610337, "grad_norm": 0.05677541717886925, "learning_rate": 1.4645547192231134e-06, "loss": 0.0006, "num_input_tokens_seen": 56785576, "step": 84240 }, { "epoch": 2.058119365792881, "grad_norm": 0.020264064893126488, "learning_rate": 1.4644792004319909e-06, "loss": 0.0006, "num_input_tokens_seen": 56788968, "step": 84245 }, { "epoch": 2.058241516624728, "grad_norm": 0.036386922001838684, "learning_rate": 1.4644036782630502e-06, "loss": 0.0503, "num_input_tokens_seen": 56792040, "step": 84250 }, { "epoch": 2.0583636674565753, "grad_norm": 0.1485571414232254, "learning_rate": 1.4643281527168414e-06, "loss": 0.11, "num_input_tokens_seen": 56795688, "step": 84255 }, { "epoch": 2.0584858182884225, "grad_norm": 0.04273013770580292, "learning_rate": 1.464252623793913e-06, "loss": 0.0372, "num_input_tokens_seen": 56799016, "step": 84260 }, { "epoch": 2.0586079691202697, "grad_norm": 0.012745685875415802, "learning_rate": 1.464177091494815e-06, "loss": 0.0007, "num_input_tokens_seen": 56801960, "step": 84265 }, { "epoch": 2.058730119952117, "grad_norm": 0.023471122607588768, "learning_rate": 1.4641015558200962e-06, "loss": 0.0004, "num_input_tokens_seen": 56805352, "step": 84270 }, { "epoch": 2.058852270783964, "grad_norm": 0.024831267073750496, "learning_rate": 1.4640260167703058e-06, "loss": 0.0008, "num_input_tokens_seen": 56809576, "step": 84275 }, { "epoch": 2.0589744216158112, "grad_norm": 0.020891066640615463, "learning_rate": 1.4639504743459937e-06, "loss": 0.0138, "num_input_tokens_seen": 56812904, "step": 84280 }, { "epoch": 2.0590965724476584, "grad_norm": 0.05312500149011612, "learning_rate": 1.463874928547709e-06, "loss": 0.0002, "num_input_tokens_seen": 56815784, "step": 84285 }, { "epoch": 2.0592187232795056, "grad_norm": 0.2950860261917114, "learning_rate": 1.463799379376001e-06, "loss": 0.0387, "num_input_tokens_seen": 56818792, "step": 84290 }, { "epoch": 2.059340874111353, "grad_norm": 0.0019791696686297655, "learning_rate": 1.463723826831419e-06, "loss": 0.0357, "num_input_tokens_seen": 56822568, "step": 84295 }, { "epoch": 2.0594630249432, "grad_norm": 0.0492100790143013, "learning_rate": 1.463648270914513e-06, "loss": 0.0306, "num_input_tokens_seen": 56825704, "step": 84300 }, { "epoch": 2.059585175775047, "grad_norm": 52.4030876159668, "learning_rate": 1.4635727116258316e-06, "loss": 0.0431, "num_input_tokens_seen": 56829032, "step": 84305 }, { "epoch": 2.0597073266068944, "grad_norm": 0.014774279668927193, "learning_rate": 1.4634971489659251e-06, "loss": 0.0383, "num_input_tokens_seen": 56832424, "step": 84310 }, { "epoch": 2.0598294774387416, "grad_norm": 0.026415273547172546, "learning_rate": 1.4634215829353425e-06, "loss": 0.0003, "num_input_tokens_seen": 56835816, "step": 84315 }, { "epoch": 2.0599516282705883, "grad_norm": 0.0518280453979969, "learning_rate": 1.4633460135346334e-06, "loss": 0.082, "num_input_tokens_seen": 56839272, "step": 84320 }, { "epoch": 2.0600737791024355, "grad_norm": 0.10253674536943436, "learning_rate": 1.4632704407643477e-06, "loss": 0.0002, "num_input_tokens_seen": 56842280, "step": 84325 }, { "epoch": 2.0601959299342827, "grad_norm": 29.64972496032715, "learning_rate": 1.4631948646250347e-06, "loss": 0.0491, "num_input_tokens_seen": 56845544, "step": 84330 }, { "epoch": 2.06031808076613, "grad_norm": 0.05544205382466316, "learning_rate": 1.4631192851172437e-06, "loss": 0.0378, "num_input_tokens_seen": 56849000, "step": 84335 }, { "epoch": 2.060440231597977, "grad_norm": 0.2877916991710663, "learning_rate": 1.4630437022415252e-06, "loss": 0.1917, "num_input_tokens_seen": 56852584, "step": 84340 }, { "epoch": 2.0605623824298243, "grad_norm": 0.04226279631257057, "learning_rate": 1.462968115998428e-06, "loss": 0.0425, "num_input_tokens_seen": 56856040, "step": 84345 }, { "epoch": 2.0606845332616714, "grad_norm": 0.08056000620126724, "learning_rate": 1.4628925263885025e-06, "loss": 0.0002, "num_input_tokens_seen": 56859624, "step": 84350 }, { "epoch": 2.0608066840935186, "grad_norm": 0.6615628600120544, "learning_rate": 1.4628169334122979e-06, "loss": 0.0103, "num_input_tokens_seen": 56862696, "step": 84355 }, { "epoch": 2.060928834925366, "grad_norm": 0.0882110670208931, "learning_rate": 1.462741337070364e-06, "loss": 0.0621, "num_input_tokens_seen": 56866088, "step": 84360 }, { "epoch": 2.061050985757213, "grad_norm": 0.029368186369538307, "learning_rate": 1.4626657373632504e-06, "loss": 0.0002, "num_input_tokens_seen": 56869416, "step": 84365 }, { "epoch": 2.06117313658906, "grad_norm": 0.06895339488983154, "learning_rate": 1.4625901342915074e-06, "loss": 0.0006, "num_input_tokens_seen": 56873768, "step": 84370 }, { "epoch": 2.0612952874209074, "grad_norm": 0.004868913907557726, "learning_rate": 1.4625145278556846e-06, "loss": 0.0001, "num_input_tokens_seen": 56876904, "step": 84375 }, { "epoch": 2.0614174382527546, "grad_norm": 0.0036217966116964817, "learning_rate": 1.4624389180563314e-06, "loss": 0.0656, "num_input_tokens_seen": 56880936, "step": 84380 }, { "epoch": 2.0615395890846018, "grad_norm": 0.03403620421886444, "learning_rate": 1.4623633048939984e-06, "loss": 0.0701, "num_input_tokens_seen": 56884392, "step": 84385 }, { "epoch": 2.061661739916449, "grad_norm": 1.1728190183639526, "learning_rate": 1.462287688369235e-06, "loss": 0.0007, "num_input_tokens_seen": 56887784, "step": 84390 }, { "epoch": 2.061783890748296, "grad_norm": 0.02199065126478672, "learning_rate": 1.4622120684825912e-06, "loss": 0.0458, "num_input_tokens_seen": 56891368, "step": 84395 }, { "epoch": 2.0619060415801433, "grad_norm": 0.600640594959259, "learning_rate": 1.4621364452346168e-06, "loss": 0.0004, "num_input_tokens_seen": 56894888, "step": 84400 }, { "epoch": 2.0620281924119905, "grad_norm": 412.3470153808594, "learning_rate": 1.4620608186258617e-06, "loss": 0.1089, "num_input_tokens_seen": 56898216, "step": 84405 }, { "epoch": 2.0621503432438373, "grad_norm": 0.09077963978052139, "learning_rate": 1.4619851886568764e-06, "loss": 0.0679, "num_input_tokens_seen": 56901672, "step": 84410 }, { "epoch": 2.0622724940756845, "grad_norm": 11.91611385345459, "learning_rate": 1.4619095553282104e-06, "loss": 0.1514, "num_input_tokens_seen": 56905192, "step": 84415 }, { "epoch": 2.0623946449075317, "grad_norm": 0.22250288724899292, "learning_rate": 1.4618339186404138e-06, "loss": 0.0006, "num_input_tokens_seen": 56908584, "step": 84420 }, { "epoch": 2.062516795739379, "grad_norm": 1.0068647861480713, "learning_rate": 1.4617582785940369e-06, "loss": 0.0012, "num_input_tokens_seen": 56912232, "step": 84425 }, { "epoch": 2.062638946571226, "grad_norm": 0.07770141959190369, "learning_rate": 1.4616826351896294e-06, "loss": 0.034, "num_input_tokens_seen": 56915688, "step": 84430 }, { "epoch": 2.0627610974030732, "grad_norm": 0.06902618706226349, "learning_rate": 1.4616069884277417e-06, "loss": 0.039, "num_input_tokens_seen": 56919208, "step": 84435 }, { "epoch": 2.0628832482349204, "grad_norm": 0.06890308111906052, "learning_rate": 1.4615313383089238e-06, "loss": 0.0011, "num_input_tokens_seen": 56922600, "step": 84440 }, { "epoch": 2.0630053990667676, "grad_norm": 0.021308647468686104, "learning_rate": 1.4614556848337261e-06, "loss": 0.0004, "num_input_tokens_seen": 56926120, "step": 84445 }, { "epoch": 2.063127549898615, "grad_norm": 280.44140625, "learning_rate": 1.4613800280026983e-06, "loss": 0.0896, "num_input_tokens_seen": 56929448, "step": 84450 }, { "epoch": 2.063249700730462, "grad_norm": 0.09632374346256256, "learning_rate": 1.4613043678163908e-06, "loss": 0.0335, "num_input_tokens_seen": 56933032, "step": 84455 }, { "epoch": 2.063371851562309, "grad_norm": 24.449237823486328, "learning_rate": 1.461228704275354e-06, "loss": 0.1152, "num_input_tokens_seen": 56936104, "step": 84460 }, { "epoch": 2.0634940023941564, "grad_norm": 0.04614495486021042, "learning_rate": 1.4611530373801379e-06, "loss": 0.0003, "num_input_tokens_seen": 56939240, "step": 84465 }, { "epoch": 2.0636161532260036, "grad_norm": 5.583603382110596, "learning_rate": 1.461077367131293e-06, "loss": 0.001, "num_input_tokens_seen": 56942824, "step": 84470 }, { "epoch": 2.0637383040578507, "grad_norm": 0.7003459334373474, "learning_rate": 1.4610016935293695e-06, "loss": 0.0352, "num_input_tokens_seen": 56946408, "step": 84475 }, { "epoch": 2.063860454889698, "grad_norm": 0.06893998384475708, "learning_rate": 1.4609260165749175e-06, "loss": 0.038, "num_input_tokens_seen": 56949736, "step": 84480 }, { "epoch": 2.063982605721545, "grad_norm": 0.03078891895711422, "learning_rate": 1.4608503362684875e-06, "loss": 0.0259, "num_input_tokens_seen": 56952936, "step": 84485 }, { "epoch": 2.0641047565533923, "grad_norm": 0.4498542249202728, "learning_rate": 1.4607746526106299e-06, "loss": 0.0005, "num_input_tokens_seen": 56956328, "step": 84490 }, { "epoch": 2.0642269073852395, "grad_norm": 0.4549460709095001, "learning_rate": 1.4606989656018953e-06, "loss": 0.0674, "num_input_tokens_seen": 56959400, "step": 84495 }, { "epoch": 2.0643490582170863, "grad_norm": 0.02741813100874424, "learning_rate": 1.4606232752428338e-06, "loss": 0.0004, "num_input_tokens_seen": 56963048, "step": 84500 }, { "epoch": 2.0644712090489334, "grad_norm": 0.032297734171152115, "learning_rate": 1.460547581533996e-06, "loss": 0.035, "num_input_tokens_seen": 56966120, "step": 84505 }, { "epoch": 2.0645933598807806, "grad_norm": 0.060209326446056366, "learning_rate": 1.4604718844759325e-06, "loss": 0.03, "num_input_tokens_seen": 56969768, "step": 84510 }, { "epoch": 2.064715510712628, "grad_norm": 0.008528118953108788, "learning_rate": 1.4603961840691934e-06, "loss": 0.0011, "num_input_tokens_seen": 56972968, "step": 84515 }, { "epoch": 2.064837661544475, "grad_norm": 73.88001251220703, "learning_rate": 1.4603204803143293e-06, "loss": 0.0861, "num_input_tokens_seen": 56975912, "step": 84520 }, { "epoch": 2.064959812376322, "grad_norm": 0.005481546279042959, "learning_rate": 1.4602447732118907e-06, "loss": 0.0002, "num_input_tokens_seen": 56979752, "step": 84525 }, { "epoch": 2.0650819632081694, "grad_norm": 0.0511334091424942, "learning_rate": 1.4601690627624288e-06, "loss": 0.0003, "num_input_tokens_seen": 56983080, "step": 84530 }, { "epoch": 2.0652041140400166, "grad_norm": 0.09094161540269852, "learning_rate": 1.4600933489664934e-06, "loss": 0.0565, "num_input_tokens_seen": 56986600, "step": 84535 }, { "epoch": 2.0653262648718638, "grad_norm": 0.052177079021930695, "learning_rate": 1.4600176318246356e-06, "loss": 0.0001, "num_input_tokens_seen": 56989928, "step": 84540 }, { "epoch": 2.065448415703711, "grad_norm": 54.41684341430664, "learning_rate": 1.4599419113374057e-06, "loss": 0.1238, "num_input_tokens_seen": 56993384, "step": 84545 }, { "epoch": 2.065570566535558, "grad_norm": 0.010534948669373989, "learning_rate": 1.4598661875053545e-06, "loss": 0.0001, "num_input_tokens_seen": 56996840, "step": 84550 }, { "epoch": 2.0656927173674053, "grad_norm": 99.29583740234375, "learning_rate": 1.459790460329033e-06, "loss": 0.0555, "num_input_tokens_seen": 57000104, "step": 84555 }, { "epoch": 2.0658148681992525, "grad_norm": 1.3880772590637207, "learning_rate": 1.4597147298089914e-06, "loss": 0.0003, "num_input_tokens_seen": 57003560, "step": 84560 }, { "epoch": 2.0659370190310997, "grad_norm": 0.003567177802324295, "learning_rate": 1.4596389959457803e-06, "loss": 0.0001, "num_input_tokens_seen": 57007080, "step": 84565 }, { "epoch": 2.066059169862947, "grad_norm": 0.10701990872621536, "learning_rate": 1.4595632587399513e-06, "loss": 0.0001, "num_input_tokens_seen": 57010472, "step": 84570 }, { "epoch": 2.066181320694794, "grad_norm": 0.3246977925300598, "learning_rate": 1.4594875181920546e-06, "loss": 0.0467, "num_input_tokens_seen": 57013864, "step": 84575 }, { "epoch": 2.0663034715266413, "grad_norm": 0.0017067211447283626, "learning_rate": 1.4594117743026407e-06, "loss": 0.0653, "num_input_tokens_seen": 57017320, "step": 84580 }, { "epoch": 2.066425622358488, "grad_norm": 37.886878967285156, "learning_rate": 1.459336027072261e-06, "loss": 0.0534, "num_input_tokens_seen": 57021096, "step": 84585 }, { "epoch": 2.0665477731903352, "grad_norm": 0.05324367433786392, "learning_rate": 1.459260276501466e-06, "loss": 0.0085, "num_input_tokens_seen": 57024744, "step": 84590 }, { "epoch": 2.0666699240221824, "grad_norm": 0.031217625364661217, "learning_rate": 1.4591845225908073e-06, "loss": 0.0676, "num_input_tokens_seen": 57028008, "step": 84595 }, { "epoch": 2.0667920748540296, "grad_norm": 0.09894023090600967, "learning_rate": 1.4591087653408347e-06, "loss": 0.0002, "num_input_tokens_seen": 57030952, "step": 84600 }, { "epoch": 2.066914225685877, "grad_norm": 39.6179313659668, "learning_rate": 1.4590330047521e-06, "loss": 0.0689, "num_input_tokens_seen": 57034216, "step": 84605 }, { "epoch": 2.067036376517724, "grad_norm": 0.9363937973976135, "learning_rate": 1.458957240825154e-06, "loss": 0.0004, "num_input_tokens_seen": 57037672, "step": 84610 }, { "epoch": 2.067158527349571, "grad_norm": 0.242109477519989, "learning_rate": 1.458881473560547e-06, "loss": 0.0001, "num_input_tokens_seen": 57040936, "step": 84615 }, { "epoch": 2.0672806781814184, "grad_norm": 0.004208797123283148, "learning_rate": 1.4588057029588308e-06, "loss": 0.0725, "num_input_tokens_seen": 57044264, "step": 84620 }, { "epoch": 2.0674028290132656, "grad_norm": 33.177547454833984, "learning_rate": 1.458729929020556e-06, "loss": 0.0572, "num_input_tokens_seen": 57047976, "step": 84625 }, { "epoch": 2.0675249798451127, "grad_norm": 0.010071083903312683, "learning_rate": 1.4586541517462739e-06, "loss": 0.0851, "num_input_tokens_seen": 57051752, "step": 84630 }, { "epoch": 2.06764713067696, "grad_norm": 0.19519588351249695, "learning_rate": 1.4585783711365355e-06, "loss": 0.0002, "num_input_tokens_seen": 57055208, "step": 84635 }, { "epoch": 2.067769281508807, "grad_norm": 0.014187058433890343, "learning_rate": 1.4585025871918913e-06, "loss": 0.0001, "num_input_tokens_seen": 57058792, "step": 84640 }, { "epoch": 2.0678914323406543, "grad_norm": 0.22282736003398895, "learning_rate": 1.4584267999128934e-06, "loss": 0.0003, "num_input_tokens_seen": 57062760, "step": 84645 }, { "epoch": 2.0680135831725015, "grad_norm": 14.674356460571289, "learning_rate": 1.4583510093000923e-06, "loss": 0.0536, "num_input_tokens_seen": 57066216, "step": 84650 }, { "epoch": 2.0681357340043487, "grad_norm": 0.17305071651935577, "learning_rate": 1.4582752153540397e-06, "loss": 0.0005, "num_input_tokens_seen": 57069416, "step": 84655 }, { "epoch": 2.068257884836196, "grad_norm": 0.8630813360214233, "learning_rate": 1.4581994180752863e-06, "loss": 0.0008, "num_input_tokens_seen": 57073192, "step": 84660 }, { "epoch": 2.068380035668043, "grad_norm": 0.07105296105146408, "learning_rate": 1.4581236174643836e-06, "loss": 0.0502, "num_input_tokens_seen": 57076456, "step": 84665 }, { "epoch": 2.0685021864998903, "grad_norm": 0.009249145165085793, "learning_rate": 1.4580478135218828e-06, "loss": 0.0004, "num_input_tokens_seen": 57080104, "step": 84670 }, { "epoch": 2.0686243373317375, "grad_norm": 14.103489875793457, "learning_rate": 1.4579720062483348e-06, "loss": 0.0383, "num_input_tokens_seen": 57083176, "step": 84675 }, { "epoch": 2.068746488163584, "grad_norm": 0.008026017807424068, "learning_rate": 1.4578961956442913e-06, "loss": 0.0003, "num_input_tokens_seen": 57086440, "step": 84680 }, { "epoch": 2.0688686389954314, "grad_norm": 0.046018462628126144, "learning_rate": 1.4578203817103036e-06, "loss": 0.0003, "num_input_tokens_seen": 57089512, "step": 84685 }, { "epoch": 2.0689907898272786, "grad_norm": 0.26206257939338684, "learning_rate": 1.4577445644469229e-06, "loss": 0.0002, "num_input_tokens_seen": 57092584, "step": 84690 }, { "epoch": 2.0691129406591258, "grad_norm": 0.18885137140750885, "learning_rate": 1.4576687438547003e-06, "loss": 0.064, "num_input_tokens_seen": 57095656, "step": 84695 }, { "epoch": 2.069235091490973, "grad_norm": 0.15367726981639862, "learning_rate": 1.457592919934188e-06, "loss": 0.0564, "num_input_tokens_seen": 57099560, "step": 84700 }, { "epoch": 2.06935724232282, "grad_norm": 0.02957310900092125, "learning_rate": 1.4575170926859368e-06, "loss": 0.0001, "num_input_tokens_seen": 57102952, "step": 84705 }, { "epoch": 2.0694793931546673, "grad_norm": 0.05982162058353424, "learning_rate": 1.4574412621104982e-06, "loss": 0.0062, "num_input_tokens_seen": 57106088, "step": 84710 }, { "epoch": 2.0696015439865145, "grad_norm": 0.007052605506032705, "learning_rate": 1.4573654282084236e-06, "loss": 0.0901, "num_input_tokens_seen": 57109352, "step": 84715 }, { "epoch": 2.0697236948183617, "grad_norm": 0.0015864939196035266, "learning_rate": 1.4572895909802644e-06, "loss": 0.0002, "num_input_tokens_seen": 57112616, "step": 84720 }, { "epoch": 2.069845845650209, "grad_norm": 0.12181193381547928, "learning_rate": 1.4572137504265727e-06, "loss": 0.0393, "num_input_tokens_seen": 57116072, "step": 84725 }, { "epoch": 2.069967996482056, "grad_norm": 0.0038857213221490383, "learning_rate": 1.4571379065478995e-06, "loss": 0.0001, "num_input_tokens_seen": 57119208, "step": 84730 }, { "epoch": 2.0700901473139033, "grad_norm": 0.016026942059397697, "learning_rate": 1.4570620593447967e-06, "loss": 0.0017, "num_input_tokens_seen": 57122344, "step": 84735 }, { "epoch": 2.0702122981457505, "grad_norm": 0.017551347613334656, "learning_rate": 1.4569862088178151e-06, "loss": 0.0432, "num_input_tokens_seen": 57125480, "step": 84740 }, { "epoch": 2.0703344489775977, "grad_norm": 21.246379852294922, "learning_rate": 1.4569103549675073e-06, "loss": 0.0536, "num_input_tokens_seen": 57128936, "step": 84745 }, { "epoch": 2.070456599809445, "grad_norm": 0.07237745076417923, "learning_rate": 1.4568344977944242e-06, "loss": 0.0001, "num_input_tokens_seen": 57132328, "step": 84750 }, { "epoch": 2.070578750641292, "grad_norm": 0.014093225821852684, "learning_rate": 1.456758637299118e-06, "loss": 0.0, "num_input_tokens_seen": 57135592, "step": 84755 }, { "epoch": 2.0707009014731392, "grad_norm": 0.0032465895637869835, "learning_rate": 1.4566827734821403e-06, "loss": 0.0598, "num_input_tokens_seen": 57138728, "step": 84760 }, { "epoch": 2.070823052304986, "grad_norm": 75.4856185913086, "learning_rate": 1.4566069063440424e-06, "loss": 0.0354, "num_input_tokens_seen": 57141608, "step": 84765 }, { "epoch": 2.070945203136833, "grad_norm": 0.03004450909793377, "learning_rate": 1.4565310358853762e-06, "loss": 0.0017, "num_input_tokens_seen": 57144744, "step": 84770 }, { "epoch": 2.0710673539686804, "grad_norm": 93.50947570800781, "learning_rate": 1.4564551621066937e-06, "loss": 0.0326, "num_input_tokens_seen": 57148136, "step": 84775 }, { "epoch": 2.0711895048005275, "grad_norm": 0.0619245208799839, "learning_rate": 1.4563792850085464e-06, "loss": 0.0411, "num_input_tokens_seen": 57151400, "step": 84780 }, { "epoch": 2.0713116556323747, "grad_norm": 0.23388482630252838, "learning_rate": 1.456303404591486e-06, "loss": 0.12, "num_input_tokens_seen": 57154920, "step": 84785 }, { "epoch": 2.071433806464222, "grad_norm": 0.08751174062490463, "learning_rate": 1.456227520856065e-06, "loss": 0.0385, "num_input_tokens_seen": 57158120, "step": 84790 }, { "epoch": 2.071555957296069, "grad_norm": 0.019446225836873055, "learning_rate": 1.4561516338028343e-06, "loss": 0.0526, "num_input_tokens_seen": 57160936, "step": 84795 }, { "epoch": 2.0716781081279163, "grad_norm": 0.013060618191957474, "learning_rate": 1.4560757434323463e-06, "loss": 0.0002, "num_input_tokens_seen": 57164776, "step": 84800 }, { "epoch": 2.0718002589597635, "grad_norm": 0.009493088349699974, "learning_rate": 1.455999849745153e-06, "loss": 0.0001, "num_input_tokens_seen": 57168872, "step": 84805 }, { "epoch": 2.0719224097916107, "grad_norm": 0.033220697194337845, "learning_rate": 1.4559239527418062e-06, "loss": 0.0001, "num_input_tokens_seen": 57172136, "step": 84810 }, { "epoch": 2.072044560623458, "grad_norm": 0.3769541382789612, "learning_rate": 1.4558480524228576e-06, "loss": 0.0007, "num_input_tokens_seen": 57176168, "step": 84815 }, { "epoch": 2.072166711455305, "grad_norm": 0.17406384646892548, "learning_rate": 1.4557721487888594e-06, "loss": 0.0006, "num_input_tokens_seen": 57179816, "step": 84820 }, { "epoch": 2.0722888622871523, "grad_norm": 0.07895692437887192, "learning_rate": 1.4556962418403637e-06, "loss": 0.0003, "num_input_tokens_seen": 57183080, "step": 84825 }, { "epoch": 2.0724110131189994, "grad_norm": 0.050496265292167664, "learning_rate": 1.4556203315779222e-06, "loss": 0.1638, "num_input_tokens_seen": 57186344, "step": 84830 }, { "epoch": 2.0725331639508466, "grad_norm": 0.20722278952598572, "learning_rate": 1.4555444180020867e-06, "loss": 0.0708, "num_input_tokens_seen": 57189480, "step": 84835 }, { "epoch": 2.072655314782694, "grad_norm": 0.4477522671222687, "learning_rate": 1.4554685011134102e-06, "loss": 0.0537, "num_input_tokens_seen": 57192360, "step": 84840 }, { "epoch": 2.072777465614541, "grad_norm": 16.252840042114258, "learning_rate": 1.4553925809124443e-06, "loss": 0.0546, "num_input_tokens_seen": 57195880, "step": 84845 }, { "epoch": 2.072899616446388, "grad_norm": 0.4574112892150879, "learning_rate": 1.455316657399741e-06, "loss": 0.0422, "num_input_tokens_seen": 57199272, "step": 84850 }, { "epoch": 2.073021767278235, "grad_norm": 0.017640331760048866, "learning_rate": 1.4552407305758524e-06, "loss": 0.0002, "num_input_tokens_seen": 57202728, "step": 84855 }, { "epoch": 2.073143918110082, "grad_norm": 0.012551628053188324, "learning_rate": 1.4551648004413307e-06, "loss": 0.0311, "num_input_tokens_seen": 57206120, "step": 84860 }, { "epoch": 2.0732660689419293, "grad_norm": 14.287816047668457, "learning_rate": 1.4550888669967281e-06, "loss": 0.0477, "num_input_tokens_seen": 57209576, "step": 84865 }, { "epoch": 2.0733882197737765, "grad_norm": 1.5717146396636963, "learning_rate": 1.4550129302425972e-06, "loss": 0.129, "num_input_tokens_seen": 57212776, "step": 84870 }, { "epoch": 2.0735103706056237, "grad_norm": 0.011195817030966282, "learning_rate": 1.4549369901794894e-06, "loss": 0.0003, "num_input_tokens_seen": 57216104, "step": 84875 }, { "epoch": 2.073632521437471, "grad_norm": 20.77536392211914, "learning_rate": 1.4548610468079578e-06, "loss": 0.1224, "num_input_tokens_seen": 57219112, "step": 84880 }, { "epoch": 2.073754672269318, "grad_norm": 10.758296966552734, "learning_rate": 1.4547851001285542e-06, "loss": 0.0338, "num_input_tokens_seen": 57222440, "step": 84885 }, { "epoch": 2.0738768231011653, "grad_norm": 1.7609142065048218, "learning_rate": 1.4547091501418312e-06, "loss": 0.0008, "num_input_tokens_seen": 57225960, "step": 84890 }, { "epoch": 2.0739989739330125, "grad_norm": 0.15660512447357178, "learning_rate": 1.4546331968483405e-06, "loss": 0.0004, "num_input_tokens_seen": 57229480, "step": 84895 }, { "epoch": 2.0741211247648597, "grad_norm": 0.09017514437437057, "learning_rate": 1.4545572402486352e-06, "loss": 0.0561, "num_input_tokens_seen": 57232680, "step": 84900 }, { "epoch": 2.074243275596707, "grad_norm": 0.6560986638069153, "learning_rate": 1.4544812803432676e-06, "loss": 0.0009, "num_input_tokens_seen": 57236200, "step": 84905 }, { "epoch": 2.074365426428554, "grad_norm": 0.05695900693535805, "learning_rate": 1.4544053171327897e-06, "loss": 0.0004, "num_input_tokens_seen": 57239144, "step": 84910 }, { "epoch": 2.0744875772604012, "grad_norm": 0.09295591711997986, "learning_rate": 1.4543293506177538e-06, "loss": 0.0445, "num_input_tokens_seen": 57242216, "step": 84915 }, { "epoch": 2.0746097280922484, "grad_norm": 0.1317606270313263, "learning_rate": 1.4542533807987132e-06, "loss": 0.0581, "num_input_tokens_seen": 57245800, "step": 84920 }, { "epoch": 2.0747318789240956, "grad_norm": 32.33819580078125, "learning_rate": 1.4541774076762197e-06, "loss": 0.0395, "num_input_tokens_seen": 57249064, "step": 84925 }, { "epoch": 2.074854029755943, "grad_norm": 0.05708523467183113, "learning_rate": 1.4541014312508257e-06, "loss": 0.0599, "num_input_tokens_seen": 57252200, "step": 84930 }, { "epoch": 2.07497618058779, "grad_norm": 0.10730971395969391, "learning_rate": 1.454025451523084e-06, "loss": 0.0001, "num_input_tokens_seen": 57255272, "step": 84935 }, { "epoch": 2.075098331419637, "grad_norm": 28.495298385620117, "learning_rate": 1.4539494684935473e-06, "loss": 0.0823, "num_input_tokens_seen": 57259048, "step": 84940 }, { "epoch": 2.075220482251484, "grad_norm": 0.28393909335136414, "learning_rate": 1.4538734821627679e-06, "loss": 0.0443, "num_input_tokens_seen": 57262568, "step": 84945 }, { "epoch": 2.075342633083331, "grad_norm": 0.0327572263777256, "learning_rate": 1.4537974925312986e-06, "loss": 0.0003, "num_input_tokens_seen": 57265640, "step": 84950 }, { "epoch": 2.0754647839151783, "grad_norm": 0.19717465341091156, "learning_rate": 1.4537214995996914e-06, "loss": 0.1051, "num_input_tokens_seen": 57269352, "step": 84955 }, { "epoch": 2.0755869347470255, "grad_norm": 0.15424473583698273, "learning_rate": 1.4536455033684995e-06, "loss": 0.0354, "num_input_tokens_seen": 57272488, "step": 84960 }, { "epoch": 2.0757090855788727, "grad_norm": 0.6080400943756104, "learning_rate": 1.4535695038382759e-06, "loss": 0.0002, "num_input_tokens_seen": 57276520, "step": 84965 }, { "epoch": 2.07583123641072, "grad_norm": 0.002733609639108181, "learning_rate": 1.4534935010095727e-06, "loss": 0.0572, "num_input_tokens_seen": 57280104, "step": 84970 }, { "epoch": 2.075953387242567, "grad_norm": 0.15572920441627502, "learning_rate": 1.453417494882943e-06, "loss": 0.0803, "num_input_tokens_seen": 57283368, "step": 84975 }, { "epoch": 2.0760755380744143, "grad_norm": 0.08092348277568817, "learning_rate": 1.453341485458939e-06, "loss": 0.0001, "num_input_tokens_seen": 57286312, "step": 84980 }, { "epoch": 2.0761976889062614, "grad_norm": 1.1062157154083252, "learning_rate": 1.4532654727381139e-06, "loss": 0.1466, "num_input_tokens_seen": 57289256, "step": 84985 }, { "epoch": 2.0763198397381086, "grad_norm": 1.6455450057983398, "learning_rate": 1.45318945672102e-06, "loss": 0.0008, "num_input_tokens_seen": 57292584, "step": 84990 }, { "epoch": 2.076441990569956, "grad_norm": 0.004664831329137087, "learning_rate": 1.453113437408211e-06, "loss": 0.0323, "num_input_tokens_seen": 57295784, "step": 84995 }, { "epoch": 2.076564141401803, "grad_norm": 0.08192527294158936, "learning_rate": 1.4530374148002391e-06, "loss": 0.0315, "num_input_tokens_seen": 57299176, "step": 85000 }, { "epoch": 2.07668629223365, "grad_norm": 0.020025499165058136, "learning_rate": 1.4529613888976572e-06, "loss": 0.0002, "num_input_tokens_seen": 57302440, "step": 85005 }, { "epoch": 2.0768084430654974, "grad_norm": 0.009146141819655895, "learning_rate": 1.452885359701018e-06, "loss": 0.0547, "num_input_tokens_seen": 57305832, "step": 85010 }, { "epoch": 2.0769305938973446, "grad_norm": 0.024533890187740326, "learning_rate": 1.452809327210875e-06, "loss": 0.0001, "num_input_tokens_seen": 57309544, "step": 85015 }, { "epoch": 2.0770527447291918, "grad_norm": 18.826255798339844, "learning_rate": 1.4527332914277807e-06, "loss": 0.0516, "num_input_tokens_seen": 57313000, "step": 85020 }, { "epoch": 2.077174895561039, "grad_norm": 0.02399023249745369, "learning_rate": 1.4526572523522882e-06, "loss": 0.0001, "num_input_tokens_seen": 57316264, "step": 85025 }, { "epoch": 2.077297046392886, "grad_norm": 0.017879458144307137, "learning_rate": 1.4525812099849502e-06, "loss": 0.0015, "num_input_tokens_seen": 57319656, "step": 85030 }, { "epoch": 2.077419197224733, "grad_norm": 8.3052396774292, "learning_rate": 1.45250516432632e-06, "loss": 0.0015, "num_input_tokens_seen": 57322600, "step": 85035 }, { "epoch": 2.07754134805658, "grad_norm": 0.005566952284425497, "learning_rate": 1.4524291153769505e-06, "loss": 0.0003, "num_input_tokens_seen": 57325864, "step": 85040 }, { "epoch": 2.0776634988884273, "grad_norm": 0.0613483190536499, "learning_rate": 1.452353063137395e-06, "loss": 0.0458, "num_input_tokens_seen": 57328744, "step": 85045 }, { "epoch": 2.0777856497202745, "grad_norm": 0.14459988474845886, "learning_rate": 1.452277007608206e-06, "loss": 0.0301, "num_input_tokens_seen": 57332392, "step": 85050 }, { "epoch": 2.0779078005521217, "grad_norm": 24.976425170898438, "learning_rate": 1.452200948789937e-06, "loss": 0.0471, "num_input_tokens_seen": 57335784, "step": 85055 }, { "epoch": 2.078029951383969, "grad_norm": 0.11778343468904495, "learning_rate": 1.4521248866831415e-06, "loss": 0.0004, "num_input_tokens_seen": 57339048, "step": 85060 }, { "epoch": 2.078152102215816, "grad_norm": 166.90821838378906, "learning_rate": 1.452048821288372e-06, "loss": 0.1021, "num_input_tokens_seen": 57341928, "step": 85065 }, { "epoch": 2.0782742530476632, "grad_norm": 0.06748581677675247, "learning_rate": 1.4519727526061818e-06, "loss": 0.0493, "num_input_tokens_seen": 57345000, "step": 85070 }, { "epoch": 2.0783964038795104, "grad_norm": 11.902010917663574, "learning_rate": 1.451896680637124e-06, "loss": 0.0419, "num_input_tokens_seen": 57348584, "step": 85075 }, { "epoch": 2.0785185547113576, "grad_norm": 0.03496910631656647, "learning_rate": 1.4518206053817524e-06, "loss": 0.038, "num_input_tokens_seen": 57351848, "step": 85080 }, { "epoch": 2.078640705543205, "grad_norm": 0.007627247367054224, "learning_rate": 1.4517445268406196e-06, "loss": 0.0302, "num_input_tokens_seen": 57355240, "step": 85085 }, { "epoch": 2.078762856375052, "grad_norm": 0.020205846056342125, "learning_rate": 1.451668445014279e-06, "loss": 0.0365, "num_input_tokens_seen": 57358248, "step": 85090 }, { "epoch": 2.078885007206899, "grad_norm": 0.03811624273657799, "learning_rate": 1.4515923599032841e-06, "loss": 0.0004, "num_input_tokens_seen": 57361832, "step": 85095 }, { "epoch": 2.0790071580387464, "grad_norm": 0.26009073853492737, "learning_rate": 1.451516271508188e-06, "loss": 0.0381, "num_input_tokens_seen": 57365736, "step": 85100 }, { "epoch": 2.0791293088705936, "grad_norm": 0.03631829842925072, "learning_rate": 1.4514401798295444e-06, "loss": 0.0002, "num_input_tokens_seen": 57369768, "step": 85105 }, { "epoch": 2.0792514597024407, "grad_norm": 0.03883937746286392, "learning_rate": 1.4513640848679063e-06, "loss": 0.0254, "num_input_tokens_seen": 57373288, "step": 85110 }, { "epoch": 2.079373610534288, "grad_norm": 53.71021270751953, "learning_rate": 1.451287986623827e-06, "loss": 0.1445, "num_input_tokens_seen": 57376680, "step": 85115 }, { "epoch": 2.079495761366135, "grad_norm": 88.04527282714844, "learning_rate": 1.45121188509786e-06, "loss": 0.0622, "num_input_tokens_seen": 57379752, "step": 85120 }, { "epoch": 2.079617912197982, "grad_norm": 0.0611591562628746, "learning_rate": 1.4511357802905591e-06, "loss": 0.0591, "num_input_tokens_seen": 57382952, "step": 85125 }, { "epoch": 2.079740063029829, "grad_norm": 0.004782163072377443, "learning_rate": 1.4510596722024775e-06, "loss": 0.0716, "num_input_tokens_seen": 57386280, "step": 85130 }, { "epoch": 2.0798622138616762, "grad_norm": 0.09725327044725418, "learning_rate": 1.4509835608341685e-06, "loss": 0.0018, "num_input_tokens_seen": 57389160, "step": 85135 }, { "epoch": 2.0799843646935234, "grad_norm": 0.09552471339702606, "learning_rate": 1.450907446186186e-06, "loss": 0.0007, "num_input_tokens_seen": 57392360, "step": 85140 }, { "epoch": 2.0801065155253706, "grad_norm": 0.3419274389743805, "learning_rate": 1.4508313282590827e-06, "loss": 0.0005, "num_input_tokens_seen": 57395432, "step": 85145 }, { "epoch": 2.080228666357218, "grad_norm": 0.016205064952373505, "learning_rate": 1.450755207053413e-06, "loss": 0.0236, "num_input_tokens_seen": 57398952, "step": 85150 }, { "epoch": 2.080350817189065, "grad_norm": 0.010081671178340912, "learning_rate": 1.45067908256973e-06, "loss": 0.1286, "num_input_tokens_seen": 57402344, "step": 85155 }, { "epoch": 2.080472968020912, "grad_norm": 0.055685579776763916, "learning_rate": 1.450602954808588e-06, "loss": 0.0012, "num_input_tokens_seen": 57405480, "step": 85160 }, { "epoch": 2.0805951188527594, "grad_norm": 7.206017017364502, "learning_rate": 1.4505268237705396e-06, "loss": 0.1086, "num_input_tokens_seen": 57410920, "step": 85165 }, { "epoch": 2.0807172696846066, "grad_norm": 0.1437099277973175, "learning_rate": 1.4504506894561394e-06, "loss": 0.0002, "num_input_tokens_seen": 57414440, "step": 85170 }, { "epoch": 2.0808394205164538, "grad_norm": 0.009064151905477047, "learning_rate": 1.4503745518659404e-06, "loss": 0.0001, "num_input_tokens_seen": 57417512, "step": 85175 }, { "epoch": 2.080961571348301, "grad_norm": 0.015507766976952553, "learning_rate": 1.4502984110004967e-06, "loss": 0.0484, "num_input_tokens_seen": 57420584, "step": 85180 }, { "epoch": 2.081083722180148, "grad_norm": 14.597432136535645, "learning_rate": 1.4502222668603616e-06, "loss": 0.0516, "num_input_tokens_seen": 57424168, "step": 85185 }, { "epoch": 2.0812058730119953, "grad_norm": 0.00974042434245348, "learning_rate": 1.450146119446089e-06, "loss": 0.0012, "num_input_tokens_seen": 57427304, "step": 85190 }, { "epoch": 2.0813280238438425, "grad_norm": 0.024810034781694412, "learning_rate": 1.4500699687582332e-06, "loss": 0.0311, "num_input_tokens_seen": 57431400, "step": 85195 }, { "epoch": 2.0814501746756897, "grad_norm": 0.0483444482088089, "learning_rate": 1.4499938147973472e-06, "loss": 0.0544, "num_input_tokens_seen": 57434728, "step": 85200 }, { "epoch": 2.081572325507537, "grad_norm": 0.1792319118976593, "learning_rate": 1.4499176575639851e-06, "loss": 0.0002, "num_input_tokens_seen": 57438248, "step": 85205 }, { "epoch": 2.0816944763393836, "grad_norm": 0.06439114362001419, "learning_rate": 1.4498414970587008e-06, "loss": 0.0341, "num_input_tokens_seen": 57441512, "step": 85210 }, { "epoch": 2.081816627171231, "grad_norm": 0.15774594247341156, "learning_rate": 1.4497653332820482e-06, "loss": 0.0003, "num_input_tokens_seen": 57444712, "step": 85215 }, { "epoch": 2.081938778003078, "grad_norm": 0.009701626375317574, "learning_rate": 1.449689166234581e-06, "loss": 0.0003, "num_input_tokens_seen": 57447976, "step": 85220 }, { "epoch": 2.082060928834925, "grad_norm": 0.06566569954156876, "learning_rate": 1.4496129959168535e-06, "loss": 0.001, "num_input_tokens_seen": 57451432, "step": 85225 }, { "epoch": 2.0821830796667724, "grad_norm": 0.7955803871154785, "learning_rate": 1.4495368223294194e-06, "loss": 0.0444, "num_input_tokens_seen": 57455208, "step": 85230 }, { "epoch": 2.0823052304986196, "grad_norm": 0.011005281470716, "learning_rate": 1.4494606454728323e-06, "loss": 0.0002, "num_input_tokens_seen": 57458920, "step": 85235 }, { "epoch": 2.082427381330467, "grad_norm": 2.7995944023132324, "learning_rate": 1.4493844653476468e-06, "loss": 0.0336, "num_input_tokens_seen": 57462056, "step": 85240 }, { "epoch": 2.082549532162314, "grad_norm": 0.12895996868610382, "learning_rate": 1.4493082819544165e-06, "loss": 0.0319, "num_input_tokens_seen": 57465704, "step": 85245 }, { "epoch": 2.082671682994161, "grad_norm": 0.134056955575943, "learning_rate": 1.4492320952936954e-06, "loss": 0.0516, "num_input_tokens_seen": 57468776, "step": 85250 }, { "epoch": 2.0827938338260084, "grad_norm": 0.019701072946190834, "learning_rate": 1.4491559053660377e-06, "loss": 0.0449, "num_input_tokens_seen": 57472360, "step": 85255 }, { "epoch": 2.0829159846578555, "grad_norm": 0.005952196195721626, "learning_rate": 1.4490797121719976e-06, "loss": 0.0, "num_input_tokens_seen": 57475496, "step": 85260 }, { "epoch": 2.0830381354897027, "grad_norm": 0.029273463413119316, "learning_rate": 1.4490035157121287e-06, "loss": 0.0001, "num_input_tokens_seen": 57479080, "step": 85265 }, { "epoch": 2.08316028632155, "grad_norm": 0.006493884138762951, "learning_rate": 1.4489273159869858e-06, "loss": 0.0001, "num_input_tokens_seen": 57482536, "step": 85270 }, { "epoch": 2.083282437153397, "grad_norm": 0.12485459446907043, "learning_rate": 1.4488511129971226e-06, "loss": 0.0003, "num_input_tokens_seen": 57486056, "step": 85275 }, { "epoch": 2.0834045879852443, "grad_norm": 0.026284409686923027, "learning_rate": 1.4487749067430931e-06, "loss": 0.062, "num_input_tokens_seen": 57489448, "step": 85280 }, { "epoch": 2.0835267388170915, "grad_norm": 0.015463829971849918, "learning_rate": 1.4486986972254525e-06, "loss": 0.0004, "num_input_tokens_seen": 57492520, "step": 85285 }, { "epoch": 2.0836488896489387, "grad_norm": 0.1737455427646637, "learning_rate": 1.448622484444754e-06, "loss": 0.0004, "num_input_tokens_seen": 57495656, "step": 85290 }, { "epoch": 2.083771040480786, "grad_norm": 65.79161071777344, "learning_rate": 1.448546268401552e-06, "loss": 0.0884, "num_input_tokens_seen": 57499112, "step": 85295 }, { "epoch": 2.083893191312633, "grad_norm": 33.39630889892578, "learning_rate": 1.4484700490964007e-06, "loss": 0.1168, "num_input_tokens_seen": 57502568, "step": 85300 }, { "epoch": 2.08401534214448, "grad_norm": 0.05079827457666397, "learning_rate": 1.4483938265298545e-06, "loss": 0.0005, "num_input_tokens_seen": 57506216, "step": 85305 }, { "epoch": 2.084137492976327, "grad_norm": 0.005445330403745174, "learning_rate": 1.448317600702468e-06, "loss": 0.0001, "num_input_tokens_seen": 57509992, "step": 85310 }, { "epoch": 2.084259643808174, "grad_norm": 0.022690260782837868, "learning_rate": 1.4482413716147954e-06, "loss": 0.0003, "num_input_tokens_seen": 57513256, "step": 85315 }, { "epoch": 2.0843817946400214, "grad_norm": 0.0060226828791201115, "learning_rate": 1.448165139267391e-06, "loss": 0.1029, "num_input_tokens_seen": 57516456, "step": 85320 }, { "epoch": 2.0845039454718686, "grad_norm": 0.20211663842201233, "learning_rate": 1.448088903660809e-06, "loss": 0.0475, "num_input_tokens_seen": 57520424, "step": 85325 }, { "epoch": 2.0846260963037158, "grad_norm": 42.50727081298828, "learning_rate": 1.4480126647956044e-06, "loss": 0.0373, "num_input_tokens_seen": 57523816, "step": 85330 }, { "epoch": 2.084748247135563, "grad_norm": 0.020627789199352264, "learning_rate": 1.4479364226723308e-06, "loss": 0.0637, "num_input_tokens_seen": 57527080, "step": 85335 }, { "epoch": 2.08487039796741, "grad_norm": 0.10394884645938873, "learning_rate": 1.447860177291543e-06, "loss": 0.0002, "num_input_tokens_seen": 57529960, "step": 85340 }, { "epoch": 2.0849925487992573, "grad_norm": 33.26313400268555, "learning_rate": 1.4477839286537958e-06, "loss": 0.1198, "num_input_tokens_seen": 57532904, "step": 85345 }, { "epoch": 2.0851146996311045, "grad_norm": 152.42120361328125, "learning_rate": 1.4477076767596433e-06, "loss": 0.0724, "num_input_tokens_seen": 57535976, "step": 85350 }, { "epoch": 2.0852368504629517, "grad_norm": 0.007731488440185785, "learning_rate": 1.4476314216096402e-06, "loss": 0.0002, "num_input_tokens_seen": 57539176, "step": 85355 }, { "epoch": 2.085359001294799, "grad_norm": 0.00451649259775877, "learning_rate": 1.4475551632043408e-06, "loss": 0.0001, "num_input_tokens_seen": 57542504, "step": 85360 }, { "epoch": 2.085481152126646, "grad_norm": 0.0020277267321944237, "learning_rate": 1.4474789015443001e-06, "loss": 0.0807, "num_input_tokens_seen": 57546088, "step": 85365 }, { "epoch": 2.0856033029584933, "grad_norm": 0.08840125054121017, "learning_rate": 1.4474026366300724e-06, "loss": 0.0412, "num_input_tokens_seen": 57549672, "step": 85370 }, { "epoch": 2.0857254537903405, "grad_norm": 0.5002579092979431, "learning_rate": 1.4473263684622124e-06, "loss": 0.093, "num_input_tokens_seen": 57553320, "step": 85375 }, { "epoch": 2.0858476046221877, "grad_norm": 0.12200009077787399, "learning_rate": 1.4472500970412747e-06, "loss": 0.0004, "num_input_tokens_seen": 57556520, "step": 85380 }, { "epoch": 2.085969755454035, "grad_norm": 0.007834597490727901, "learning_rate": 1.4471738223678141e-06, "loss": 0.0426, "num_input_tokens_seen": 57559976, "step": 85385 }, { "epoch": 2.0860919062858816, "grad_norm": 0.11974035948514938, "learning_rate": 1.4470975444423853e-06, "loss": 0.0004, "num_input_tokens_seen": 57563752, "step": 85390 }, { "epoch": 2.086214057117729, "grad_norm": 16.715085983276367, "learning_rate": 1.4470212632655425e-06, "loss": 0.0388, "num_input_tokens_seen": 57567080, "step": 85395 }, { "epoch": 2.086336207949576, "grad_norm": 0.1420247107744217, "learning_rate": 1.4469449788378411e-06, "loss": 0.0465, "num_input_tokens_seen": 57570600, "step": 85400 }, { "epoch": 2.086458358781423, "grad_norm": 0.23512078821659088, "learning_rate": 1.4468686911598356e-06, "loss": 0.0002, "num_input_tokens_seen": 57573672, "step": 85405 }, { "epoch": 2.0865805096132704, "grad_norm": 95.10938262939453, "learning_rate": 1.4467924002320807e-06, "loss": 0.0926, "num_input_tokens_seen": 57576872, "step": 85410 }, { "epoch": 2.0867026604451175, "grad_norm": 65.81708526611328, "learning_rate": 1.4467161060551313e-06, "loss": 0.0384, "num_input_tokens_seen": 57580008, "step": 85415 }, { "epoch": 2.0868248112769647, "grad_norm": 0.5347601771354675, "learning_rate": 1.4466398086295422e-06, "loss": 0.0281, "num_input_tokens_seen": 57583400, "step": 85420 }, { "epoch": 2.086946962108812, "grad_norm": 0.03770304098725319, "learning_rate": 1.4465635079558683e-06, "loss": 0.0689, "num_input_tokens_seen": 57586728, "step": 85425 }, { "epoch": 2.087069112940659, "grad_norm": 0.03212061896920204, "learning_rate": 1.4464872040346646e-06, "loss": 0.0012, "num_input_tokens_seen": 57590120, "step": 85430 }, { "epoch": 2.0871912637725063, "grad_norm": 0.1276782602071762, "learning_rate": 1.4464108968664857e-06, "loss": 0.0006, "num_input_tokens_seen": 57593896, "step": 85435 }, { "epoch": 2.0873134146043535, "grad_norm": 0.007270419038832188, "learning_rate": 1.4463345864518867e-06, "loss": 0.0002, "num_input_tokens_seen": 57597160, "step": 85440 }, { "epoch": 2.0874355654362007, "grad_norm": 0.06667440384626389, "learning_rate": 1.4462582727914228e-06, "loss": 0.048, "num_input_tokens_seen": 57600360, "step": 85445 }, { "epoch": 2.087557716268048, "grad_norm": 0.18071290850639343, "learning_rate": 1.4461819558856484e-06, "loss": 0.038, "num_input_tokens_seen": 57603560, "step": 85450 }, { "epoch": 2.087679867099895, "grad_norm": 0.4118402600288391, "learning_rate": 1.446105635735119e-06, "loss": 0.0011, "num_input_tokens_seen": 57607016, "step": 85455 }, { "epoch": 2.0878020179317422, "grad_norm": 0.07384537905454636, "learning_rate": 1.4460293123403893e-06, "loss": 0.0248, "num_input_tokens_seen": 57610344, "step": 85460 }, { "epoch": 2.0879241687635894, "grad_norm": 0.04173500835895538, "learning_rate": 1.4459529857020144e-06, "loss": 0.0407, "num_input_tokens_seen": 57613992, "step": 85465 }, { "epoch": 2.0880463195954366, "grad_norm": 0.006109563168138266, "learning_rate": 1.4458766558205495e-06, "loss": 0.0002, "num_input_tokens_seen": 57617320, "step": 85470 }, { "epoch": 2.088168470427284, "grad_norm": 0.09735125303268433, "learning_rate": 1.4458003226965496e-06, "loss": 0.0002, "num_input_tokens_seen": 57620776, "step": 85475 }, { "epoch": 2.0882906212591306, "grad_norm": 0.016522737219929695, "learning_rate": 1.4457239863305702e-06, "loss": 0.0502, "num_input_tokens_seen": 57624296, "step": 85480 }, { "epoch": 2.0884127720909778, "grad_norm": 0.02506168559193611, "learning_rate": 1.4456476467231658e-06, "loss": 0.0003, "num_input_tokens_seen": 57628072, "step": 85485 }, { "epoch": 2.088534922922825, "grad_norm": 0.011805309914052486, "learning_rate": 1.4455713038748918e-06, "loss": 0.0523, "num_input_tokens_seen": 57631208, "step": 85490 }, { "epoch": 2.088657073754672, "grad_norm": 0.010718842037022114, "learning_rate": 1.4454949577863036e-06, "loss": 0.0003, "num_input_tokens_seen": 57634344, "step": 85495 }, { "epoch": 2.0887792245865193, "grad_norm": 133.2101593017578, "learning_rate": 1.4454186084579561e-06, "loss": 0.002, "num_input_tokens_seen": 57637992, "step": 85500 }, { "epoch": 2.0889013754183665, "grad_norm": 16.67730712890625, "learning_rate": 1.4453422558904047e-06, "loss": 0.0525, "num_input_tokens_seen": 57641128, "step": 85505 }, { "epoch": 2.0890235262502137, "grad_norm": 0.01813196763396263, "learning_rate": 1.4452659000842047e-06, "loss": 0.0004, "num_input_tokens_seen": 57644520, "step": 85510 }, { "epoch": 2.089145677082061, "grad_norm": 8.29789924621582, "learning_rate": 1.4451895410399111e-06, "loss": 0.0081, "num_input_tokens_seen": 57647400, "step": 85515 }, { "epoch": 2.089267827913908, "grad_norm": 0.0024462079163640738, "learning_rate": 1.4451131787580795e-06, "loss": 0.0002, "num_input_tokens_seen": 57650728, "step": 85520 }, { "epoch": 2.0893899787457553, "grad_norm": 0.07815330475568771, "learning_rate": 1.4450368132392652e-06, "loss": 0.0126, "num_input_tokens_seen": 57654120, "step": 85525 }, { "epoch": 2.0895121295776025, "grad_norm": 0.14444968104362488, "learning_rate": 1.4449604444840236e-06, "loss": 0.0686, "num_input_tokens_seen": 57657896, "step": 85530 }, { "epoch": 2.0896342804094497, "grad_norm": 0.08869968354701996, "learning_rate": 1.4448840724929098e-06, "loss": 0.0994, "num_input_tokens_seen": 57661224, "step": 85535 }, { "epoch": 2.089756431241297, "grad_norm": 0.0426652766764164, "learning_rate": 1.4448076972664795e-06, "loss": 0.0443, "num_input_tokens_seen": 57664296, "step": 85540 }, { "epoch": 2.089878582073144, "grad_norm": 0.017624255269765854, "learning_rate": 1.4447313188052878e-06, "loss": 0.0002, "num_input_tokens_seen": 57667816, "step": 85545 }, { "epoch": 2.090000732904991, "grad_norm": 0.04679286852478981, "learning_rate": 1.4446549371098907e-06, "loss": 0.0417, "num_input_tokens_seen": 57670824, "step": 85550 }, { "epoch": 2.0901228837368384, "grad_norm": 603.0365600585938, "learning_rate": 1.4445785521808428e-06, "loss": 0.1099, "num_input_tokens_seen": 57674920, "step": 85555 }, { "epoch": 2.0902450345686856, "grad_norm": 10.674302101135254, "learning_rate": 1.4445021640187005e-06, "loss": 0.0588, "num_input_tokens_seen": 57678184, "step": 85560 }, { "epoch": 2.090367185400533, "grad_norm": 0.0036526755429804325, "learning_rate": 1.4444257726240187e-06, "loss": 0.0002, "num_input_tokens_seen": 57681256, "step": 85565 }, { "epoch": 2.0904893362323795, "grad_norm": 187.0728302001953, "learning_rate": 1.4443493779973533e-06, "loss": 0.0317, "num_input_tokens_seen": 57684456, "step": 85570 }, { "epoch": 2.0906114870642267, "grad_norm": 0.030918600037693977, "learning_rate": 1.4442729801392597e-06, "loss": 0.0002, "num_input_tokens_seen": 57687912, "step": 85575 }, { "epoch": 2.090733637896074, "grad_norm": 0.4674592912197113, "learning_rate": 1.4441965790502933e-06, "loss": 0.0002, "num_input_tokens_seen": 57691240, "step": 85580 }, { "epoch": 2.090855788727921, "grad_norm": 0.009129178710281849, "learning_rate": 1.44412017473101e-06, "loss": 0.0367, "num_input_tokens_seen": 57694632, "step": 85585 }, { "epoch": 2.0909779395597683, "grad_norm": 0.17762425541877747, "learning_rate": 1.4440437671819652e-06, "loss": 0.0003, "num_input_tokens_seen": 57697960, "step": 85590 }, { "epoch": 2.0911000903916155, "grad_norm": 0.07223685830831528, "learning_rate": 1.4439673564037152e-06, "loss": 0.0392, "num_input_tokens_seen": 57701992, "step": 85595 }, { "epoch": 2.0912222412234627, "grad_norm": 0.04923141375184059, "learning_rate": 1.4438909423968148e-06, "loss": 0.0001, "num_input_tokens_seen": 57705320, "step": 85600 }, { "epoch": 2.09134439205531, "grad_norm": 0.010022669099271297, "learning_rate": 1.4438145251618198e-06, "loss": 0.0004, "num_input_tokens_seen": 57708328, "step": 85605 }, { "epoch": 2.091466542887157, "grad_norm": 0.10577555745840073, "learning_rate": 1.4437381046992865e-06, "loss": 0.0731, "num_input_tokens_seen": 57711528, "step": 85610 }, { "epoch": 2.0915886937190042, "grad_norm": 0.014822940342128277, "learning_rate": 1.4436616810097704e-06, "loss": 0.0679, "num_input_tokens_seen": 57714856, "step": 85615 }, { "epoch": 2.0917108445508514, "grad_norm": 14.269906997680664, "learning_rate": 1.4435852540938272e-06, "loss": 0.053, "num_input_tokens_seen": 57717992, "step": 85620 }, { "epoch": 2.0918329953826986, "grad_norm": 0.09064082056283951, "learning_rate": 1.4435088239520125e-06, "loss": 0.0002, "num_input_tokens_seen": 57721896, "step": 85625 }, { "epoch": 2.091955146214546, "grad_norm": 0.03526727482676506, "learning_rate": 1.4434323905848826e-06, "loss": 0.0005, "num_input_tokens_seen": 57725352, "step": 85630 }, { "epoch": 2.092077297046393, "grad_norm": 0.02126150205731392, "learning_rate": 1.443355953992993e-06, "loss": 0.0002, "num_input_tokens_seen": 57728744, "step": 85635 }, { "epoch": 2.09219944787824, "grad_norm": 0.044596266001462936, "learning_rate": 1.4432795141768999e-06, "loss": 0.0466, "num_input_tokens_seen": 57731624, "step": 85640 }, { "epoch": 2.0923215987100874, "grad_norm": 47.65426254272461, "learning_rate": 1.4432030711371586e-06, "loss": 0.0797, "num_input_tokens_seen": 57734760, "step": 85645 }, { "epoch": 2.0924437495419346, "grad_norm": 0.016679583117365837, "learning_rate": 1.4431266248743254e-06, "loss": 0.0006, "num_input_tokens_seen": 57737832, "step": 85650 }, { "epoch": 2.0925659003737813, "grad_norm": 0.01737331971526146, "learning_rate": 1.4430501753889563e-06, "loss": 0.0002, "num_input_tokens_seen": 57741288, "step": 85655 }, { "epoch": 2.0926880512056285, "grad_norm": 0.009950646199285984, "learning_rate": 1.4429737226816072e-06, "loss": 0.0001, "num_input_tokens_seen": 57744616, "step": 85660 }, { "epoch": 2.0928102020374757, "grad_norm": 0.14951828122138977, "learning_rate": 1.4428972667528338e-06, "loss": 0.0001, "num_input_tokens_seen": 57748008, "step": 85665 }, { "epoch": 2.092932352869323, "grad_norm": 0.06490232795476913, "learning_rate": 1.4428208076031925e-06, "loss": 0.0468, "num_input_tokens_seen": 57750952, "step": 85670 }, { "epoch": 2.09305450370117, "grad_norm": 0.004932792857289314, "learning_rate": 1.4427443452332392e-06, "loss": 0.0002, "num_input_tokens_seen": 57754088, "step": 85675 }, { "epoch": 2.0931766545330173, "grad_norm": 0.020456485450267792, "learning_rate": 1.4426678796435301e-06, "loss": 0.0001, "num_input_tokens_seen": 57757480, "step": 85680 }, { "epoch": 2.0932988053648645, "grad_norm": 0.0004195286310277879, "learning_rate": 1.4425914108346209e-06, "loss": 0.0001, "num_input_tokens_seen": 57760872, "step": 85685 }, { "epoch": 2.0934209561967116, "grad_norm": 0.008397513069212437, "learning_rate": 1.442514938807068e-06, "loss": 0.0001, "num_input_tokens_seen": 57764584, "step": 85690 }, { "epoch": 2.093543107028559, "grad_norm": 0.004736917093396187, "learning_rate": 1.4424384635614274e-06, "loss": 0.0003, "num_input_tokens_seen": 57768104, "step": 85695 }, { "epoch": 2.093665257860406, "grad_norm": 0.007678162772208452, "learning_rate": 1.4423619850982554e-06, "loss": 0.0001, "num_input_tokens_seen": 57771048, "step": 85700 }, { "epoch": 2.093787408692253, "grad_norm": 0.05174483731389046, "learning_rate": 1.442285503418108e-06, "loss": 0.1421, "num_input_tokens_seen": 57774248, "step": 85705 }, { "epoch": 2.0939095595241004, "grad_norm": 0.011872244998812675, "learning_rate": 1.4422090185215413e-06, "loss": 0.0003, "num_input_tokens_seen": 57777832, "step": 85710 }, { "epoch": 2.0940317103559476, "grad_norm": 0.006190591957420111, "learning_rate": 1.4421325304091118e-06, "loss": 0.0822, "num_input_tokens_seen": 57781032, "step": 85715 }, { "epoch": 2.094153861187795, "grad_norm": 0.31522297859191895, "learning_rate": 1.4420560390813755e-06, "loss": 0.0003, "num_input_tokens_seen": 57784744, "step": 85720 }, { "epoch": 2.094276012019642, "grad_norm": 0.022517632693052292, "learning_rate": 1.4419795445388892e-06, "loss": 0.033, "num_input_tokens_seen": 57788648, "step": 85725 }, { "epoch": 2.094398162851489, "grad_norm": 0.0021469765342772007, "learning_rate": 1.4419030467822084e-06, "loss": 0.0001, "num_input_tokens_seen": 57792232, "step": 85730 }, { "epoch": 2.0945203136833364, "grad_norm": 0.06888701021671295, "learning_rate": 1.4418265458118897e-06, "loss": 0.0003, "num_input_tokens_seen": 57795624, "step": 85735 }, { "epoch": 2.0946424645151835, "grad_norm": 0.09264671057462692, "learning_rate": 1.4417500416284898e-06, "loss": 0.0614, "num_input_tokens_seen": 57798888, "step": 85740 }, { "epoch": 2.0947646153470307, "grad_norm": 0.030658259987831116, "learning_rate": 1.4416735342325646e-06, "loss": 0.0001, "num_input_tokens_seen": 57802280, "step": 85745 }, { "epoch": 2.0948867661788775, "grad_norm": 0.015449795871973038, "learning_rate": 1.441597023624671e-06, "loss": 0.0057, "num_input_tokens_seen": 57805736, "step": 85750 }, { "epoch": 2.0950089170107247, "grad_norm": 0.02338220365345478, "learning_rate": 1.4415205098053647e-06, "loss": 0.0001, "num_input_tokens_seen": 57809000, "step": 85755 }, { "epoch": 2.095131067842572, "grad_norm": 0.005242054350674152, "learning_rate": 1.4414439927752026e-06, "loss": 0.0598, "num_input_tokens_seen": 57812712, "step": 85760 }, { "epoch": 2.095253218674419, "grad_norm": 643.7587890625, "learning_rate": 1.4413674725347408e-06, "loss": 0.0097, "num_input_tokens_seen": 57815848, "step": 85765 }, { "epoch": 2.0953753695062662, "grad_norm": 254.23770141601562, "learning_rate": 1.4412909490845364e-06, "loss": 0.0621, "num_input_tokens_seen": 57819560, "step": 85770 }, { "epoch": 2.0954975203381134, "grad_norm": 0.009493929333984852, "learning_rate": 1.4412144224251454e-06, "loss": 0.0743, "num_input_tokens_seen": 57822824, "step": 85775 }, { "epoch": 2.0956196711699606, "grad_norm": 0.09335071593523026, "learning_rate": 1.4411378925571246e-06, "loss": 0.0001, "num_input_tokens_seen": 57826792, "step": 85780 }, { "epoch": 2.095741822001808, "grad_norm": 0.026658060029149055, "learning_rate": 1.4410613594810302e-06, "loss": 0.0003, "num_input_tokens_seen": 57830440, "step": 85785 }, { "epoch": 2.095863972833655, "grad_norm": 0.02904869057238102, "learning_rate": 1.440984823197419e-06, "loss": 0.0003, "num_input_tokens_seen": 57833576, "step": 85790 }, { "epoch": 2.095986123665502, "grad_norm": 0.038817401975393295, "learning_rate": 1.4409082837068476e-06, "loss": 0.0001, "num_input_tokens_seen": 57836968, "step": 85795 }, { "epoch": 2.0961082744973494, "grad_norm": 0.008263013325631618, "learning_rate": 1.4408317410098725e-06, "loss": 0.0003, "num_input_tokens_seen": 57840104, "step": 85800 }, { "epoch": 2.0962304253291966, "grad_norm": 0.0016716079553589225, "learning_rate": 1.4407551951070504e-06, "loss": 0.0468, "num_input_tokens_seen": 57843304, "step": 85805 }, { "epoch": 2.0963525761610438, "grad_norm": 0.006488029845058918, "learning_rate": 1.440678645998938e-06, "loss": 0.1659, "num_input_tokens_seen": 57846568, "step": 85810 }, { "epoch": 2.096474726992891, "grad_norm": 0.05113215744495392, "learning_rate": 1.4406020936860921e-06, "loss": 0.0002, "num_input_tokens_seen": 57849896, "step": 85815 }, { "epoch": 2.096596877824738, "grad_norm": 0.05706647038459778, "learning_rate": 1.4405255381690692e-06, "loss": 0.0798, "num_input_tokens_seen": 57852968, "step": 85820 }, { "epoch": 2.0967190286565853, "grad_norm": 0.06567330658435822, "learning_rate": 1.440448979448426e-06, "loss": 0.0424, "num_input_tokens_seen": 57856936, "step": 85825 }, { "epoch": 2.0968411794884325, "grad_norm": 0.03565848246216774, "learning_rate": 1.4403724175247191e-06, "loss": 0.0004, "num_input_tokens_seen": 57859880, "step": 85830 }, { "epoch": 2.0969633303202793, "grad_norm": 0.05197528749704361, "learning_rate": 1.4402958523985061e-06, "loss": 0.0001, "num_input_tokens_seen": 57863080, "step": 85835 }, { "epoch": 2.0970854811521265, "grad_norm": 15.99670124053955, "learning_rate": 1.440219284070343e-06, "loss": 0.0918, "num_input_tokens_seen": 57866280, "step": 85840 }, { "epoch": 2.0972076319839736, "grad_norm": 0.04562178999185562, "learning_rate": 1.4401427125407866e-06, "loss": 0.0002, "num_input_tokens_seen": 57869672, "step": 85845 }, { "epoch": 2.097329782815821, "grad_norm": 0.23651406168937683, "learning_rate": 1.4400661378103944e-06, "loss": 0.0704, "num_input_tokens_seen": 57872744, "step": 85850 }, { "epoch": 2.097451933647668, "grad_norm": 0.027074666693806648, "learning_rate": 1.4399895598797226e-06, "loss": 0.0001, "num_input_tokens_seen": 57876136, "step": 85855 }, { "epoch": 2.097574084479515, "grad_norm": 0.04177837446331978, "learning_rate": 1.4399129787493288e-06, "loss": 0.0378, "num_input_tokens_seen": 57879656, "step": 85860 }, { "epoch": 2.0976962353113624, "grad_norm": 0.009644084610044956, "learning_rate": 1.4398363944197688e-06, "loss": 0.0577, "num_input_tokens_seen": 57882984, "step": 85865 }, { "epoch": 2.0978183861432096, "grad_norm": 0.12638725340366364, "learning_rate": 1.439759806891601e-06, "loss": 0.0002, "num_input_tokens_seen": 57886440, "step": 85870 }, { "epoch": 2.097940536975057, "grad_norm": 0.6511943340301514, "learning_rate": 1.4396832161653811e-06, "loss": 0.0005, "num_input_tokens_seen": 57889448, "step": 85875 }, { "epoch": 2.098062687806904, "grad_norm": 0.23605570197105408, "learning_rate": 1.4396066222416668e-06, "loss": 0.03, "num_input_tokens_seen": 57892520, "step": 85880 }, { "epoch": 2.098184838638751, "grad_norm": 0.01215800829231739, "learning_rate": 1.4395300251210147e-06, "loss": 0.0006, "num_input_tokens_seen": 57895528, "step": 85885 }, { "epoch": 2.0983069894705983, "grad_norm": 0.005944849457591772, "learning_rate": 1.439453424803982e-06, "loss": 0.0001, "num_input_tokens_seen": 57898728, "step": 85890 }, { "epoch": 2.0984291403024455, "grad_norm": 0.010880285874009132, "learning_rate": 1.4393768212911259e-06, "loss": 0.0002, "num_input_tokens_seen": 57901992, "step": 85895 }, { "epoch": 2.0985512911342927, "grad_norm": 0.0785965770483017, "learning_rate": 1.4393002145830035e-06, "loss": 0.0879, "num_input_tokens_seen": 57905704, "step": 85900 }, { "epoch": 2.09867344196614, "grad_norm": 2.173537015914917, "learning_rate": 1.439223604680172e-06, "loss": 0.0004, "num_input_tokens_seen": 57909288, "step": 85905 }, { "epoch": 2.098795592797987, "grad_norm": 0.008875908330082893, "learning_rate": 1.439146991583188e-06, "loss": 0.0002, "num_input_tokens_seen": 57912744, "step": 85910 }, { "epoch": 2.0989177436298343, "grad_norm": 0.05505775660276413, "learning_rate": 1.439070375292609e-06, "loss": 0.0537, "num_input_tokens_seen": 57916328, "step": 85915 }, { "epoch": 2.0990398944616815, "grad_norm": 0.03384046256542206, "learning_rate": 1.4389937558089919e-06, "loss": 0.0002, "num_input_tokens_seen": 57919720, "step": 85920 }, { "epoch": 2.0991620452935287, "grad_norm": 0.01677374541759491, "learning_rate": 1.4389171331328945e-06, "loss": 0.1204, "num_input_tokens_seen": 57923240, "step": 85925 }, { "epoch": 2.0992841961253754, "grad_norm": 0.0018917974084615707, "learning_rate": 1.4388405072648735e-06, "loss": 0.0001, "num_input_tokens_seen": 57926440, "step": 85930 }, { "epoch": 2.0994063469572226, "grad_norm": 0.016427690163254738, "learning_rate": 1.4387638782054863e-06, "loss": 0.0001, "num_input_tokens_seen": 57930024, "step": 85935 }, { "epoch": 2.09952849778907, "grad_norm": 0.00689767487347126, "learning_rate": 1.4386872459552902e-06, "loss": 0.0444, "num_input_tokens_seen": 57933224, "step": 85940 }, { "epoch": 2.099650648620917, "grad_norm": 0.3828757107257843, "learning_rate": 1.4386106105148425e-06, "loss": 0.0002, "num_input_tokens_seen": 57936936, "step": 85945 }, { "epoch": 2.099772799452764, "grad_norm": 0.01371038518846035, "learning_rate": 1.4385339718847002e-06, "loss": 0.0001, "num_input_tokens_seen": 57940136, "step": 85950 }, { "epoch": 2.0998949502846114, "grad_norm": 51.16667556762695, "learning_rate": 1.4384573300654213e-06, "loss": 0.049, "num_input_tokens_seen": 57944040, "step": 85955 }, { "epoch": 2.1000171011164586, "grad_norm": 0.03594563156366348, "learning_rate": 1.4383806850575627e-06, "loss": 0.0636, "num_input_tokens_seen": 57947496, "step": 85960 }, { "epoch": 2.1001392519483058, "grad_norm": 0.0012629919219762087, "learning_rate": 1.4383040368616816e-06, "loss": 0.0, "num_input_tokens_seen": 57951336, "step": 85965 }, { "epoch": 2.100261402780153, "grad_norm": 0.13319532573223114, "learning_rate": 1.4382273854783358e-06, "loss": 0.0366, "num_input_tokens_seen": 57954920, "step": 85970 }, { "epoch": 2.100383553612, "grad_norm": 22.986907958984375, "learning_rate": 1.4381507309080827e-06, "loss": 0.1009, "num_input_tokens_seen": 57958312, "step": 85975 }, { "epoch": 2.1005057044438473, "grad_norm": 0.0110292574390769, "learning_rate": 1.4380740731514793e-06, "loss": 0.0002, "num_input_tokens_seen": 57961960, "step": 85980 }, { "epoch": 2.1006278552756945, "grad_norm": 0.008005023933947086, "learning_rate": 1.4379974122090835e-06, "loss": 0.0796, "num_input_tokens_seen": 57965352, "step": 85985 }, { "epoch": 2.1007500061075417, "grad_norm": 0.05698973312973976, "learning_rate": 1.4379207480814527e-06, "loss": 0.0002, "num_input_tokens_seen": 57968488, "step": 85990 }, { "epoch": 2.100872156939389, "grad_norm": 0.0639193058013916, "learning_rate": 1.4378440807691447e-06, "loss": 0.0513, "num_input_tokens_seen": 57972136, "step": 85995 }, { "epoch": 2.100994307771236, "grad_norm": 0.311469703912735, "learning_rate": 1.4377674102727166e-06, "loss": 0.0003, "num_input_tokens_seen": 57975464, "step": 86000 }, { "epoch": 2.1011164586030833, "grad_norm": 0.024497320875525475, "learning_rate": 1.4376907365927262e-06, "loss": 0.0002, "num_input_tokens_seen": 57979048, "step": 86005 }, { "epoch": 2.1012386094349305, "grad_norm": 0.4448446035385132, "learning_rate": 1.437614059729731e-06, "loss": 0.0003, "num_input_tokens_seen": 57982248, "step": 86010 }, { "epoch": 2.101360760266777, "grad_norm": 0.0009100750903598964, "learning_rate": 1.4375373796842887e-06, "loss": 0.0001, "num_input_tokens_seen": 57985640, "step": 86015 }, { "epoch": 2.1014829110986244, "grad_norm": 0.041483260691165924, "learning_rate": 1.4374606964569569e-06, "loss": 0.0001, "num_input_tokens_seen": 57988776, "step": 86020 }, { "epoch": 2.1016050619304716, "grad_norm": 0.017451265826821327, "learning_rate": 1.4373840100482932e-06, "loss": 0.0001, "num_input_tokens_seen": 57992296, "step": 86025 }, { "epoch": 2.1017272127623188, "grad_norm": 0.030862795189023018, "learning_rate": 1.4373073204588556e-06, "loss": 0.0388, "num_input_tokens_seen": 57995624, "step": 86030 }, { "epoch": 2.101849363594166, "grad_norm": 0.0727197602391243, "learning_rate": 1.437230627689201e-06, "loss": 0.0002, "num_input_tokens_seen": 57999016, "step": 86035 }, { "epoch": 2.101971514426013, "grad_norm": 0.06713508814573288, "learning_rate": 1.4371539317398882e-06, "loss": 0.0003, "num_input_tokens_seen": 58002280, "step": 86040 }, { "epoch": 2.1020936652578603, "grad_norm": 0.006942774634808302, "learning_rate": 1.437077232611474e-06, "loss": 0.0003, "num_input_tokens_seen": 58005416, "step": 86045 }, { "epoch": 2.1022158160897075, "grad_norm": 37.54534149169922, "learning_rate": 1.4370005303045168e-06, "loss": 0.0501, "num_input_tokens_seen": 58008680, "step": 86050 }, { "epoch": 2.1023379669215547, "grad_norm": 0.006586906500160694, "learning_rate": 1.436923824819574e-06, "loss": 0.0001, "num_input_tokens_seen": 58012200, "step": 86055 }, { "epoch": 2.102460117753402, "grad_norm": 0.03484753891825676, "learning_rate": 1.4368471161572042e-06, "loss": 0.0001, "num_input_tokens_seen": 58015784, "step": 86060 }, { "epoch": 2.102582268585249, "grad_norm": 0.003911359701305628, "learning_rate": 1.4367704043179643e-06, "loss": 0.0371, "num_input_tokens_seen": 58019048, "step": 86065 }, { "epoch": 2.1027044194170963, "grad_norm": 0.0028033980634063482, "learning_rate": 1.4366936893024124e-06, "loss": 0.0001, "num_input_tokens_seen": 58022056, "step": 86070 }, { "epoch": 2.1028265702489435, "grad_norm": 0.005028760991990566, "learning_rate": 1.4366169711111068e-06, "loss": 0.0001, "num_input_tokens_seen": 58026344, "step": 86075 }, { "epoch": 2.1029487210807907, "grad_norm": 0.2529297173023224, "learning_rate": 1.4365402497446048e-06, "loss": 0.0001, "num_input_tokens_seen": 58029416, "step": 86080 }, { "epoch": 2.103070871912638, "grad_norm": 0.010397453792393208, "learning_rate": 1.436463525203465e-06, "loss": 0.0372, "num_input_tokens_seen": 58032744, "step": 86085 }, { "epoch": 2.103193022744485, "grad_norm": 0.03357874974608421, "learning_rate": 1.4363867974882448e-06, "loss": 0.0073, "num_input_tokens_seen": 58036328, "step": 86090 }, { "epoch": 2.1033151735763322, "grad_norm": 0.31488898396492004, "learning_rate": 1.436310066599503e-06, "loss": 0.0002, "num_input_tokens_seen": 58039848, "step": 86095 }, { "epoch": 2.1034373244081794, "grad_norm": 0.003075455315411091, "learning_rate": 1.4362333325377964e-06, "loss": 0.0752, "num_input_tokens_seen": 58042728, "step": 86100 }, { "epoch": 2.103559475240026, "grad_norm": 0.002096857177093625, "learning_rate": 1.436156595303684e-06, "loss": 0.0003, "num_input_tokens_seen": 58045992, "step": 86105 }, { "epoch": 2.1036816260718734, "grad_norm": 0.09626047313213348, "learning_rate": 1.4360798548977235e-06, "loss": 0.0467, "num_input_tokens_seen": 58049448, "step": 86110 }, { "epoch": 2.1038037769037206, "grad_norm": 0.04785279184579849, "learning_rate": 1.4360031113204729e-06, "loss": 0.045, "num_input_tokens_seen": 58052776, "step": 86115 }, { "epoch": 2.1039259277355677, "grad_norm": 0.004664618521928787, "learning_rate": 1.4359263645724905e-06, "loss": 0.0454, "num_input_tokens_seen": 58056040, "step": 86120 }, { "epoch": 2.104048078567415, "grad_norm": 0.0005619783769361675, "learning_rate": 1.4358496146543343e-06, "loss": 0.0878, "num_input_tokens_seen": 58059304, "step": 86125 }, { "epoch": 2.104170229399262, "grad_norm": 0.028854873031377792, "learning_rate": 1.4357728615665626e-06, "loss": 0.0716, "num_input_tokens_seen": 58062568, "step": 86130 }, { "epoch": 2.1042923802311093, "grad_norm": 0.6892449855804443, "learning_rate": 1.435696105309733e-06, "loss": 0.0006, "num_input_tokens_seen": 58066344, "step": 86135 }, { "epoch": 2.1044145310629565, "grad_norm": 0.1289759874343872, "learning_rate": 1.4356193458844045e-06, "loss": 0.0009, "num_input_tokens_seen": 58069544, "step": 86140 }, { "epoch": 2.1045366818948037, "grad_norm": 0.12908408045768738, "learning_rate": 1.4355425832911348e-06, "loss": 0.0001, "num_input_tokens_seen": 58072936, "step": 86145 }, { "epoch": 2.104658832726651, "grad_norm": 0.13376732170581818, "learning_rate": 1.4354658175304824e-06, "loss": 0.0001, "num_input_tokens_seen": 58077032, "step": 86150 }, { "epoch": 2.104780983558498, "grad_norm": 0.0008134989766404033, "learning_rate": 1.4353890486030054e-06, "loss": 0.0608, "num_input_tokens_seen": 58080552, "step": 86155 }, { "epoch": 2.1049031343903453, "grad_norm": 36.05324935913086, "learning_rate": 1.4353122765092622e-06, "loss": 0.0652, "num_input_tokens_seen": 58084136, "step": 86160 }, { "epoch": 2.1050252852221925, "grad_norm": 0.035665228962898254, "learning_rate": 1.435235501249811e-06, "loss": 0.0525, "num_input_tokens_seen": 58087720, "step": 86165 }, { "epoch": 2.1051474360540396, "grad_norm": 0.06152858957648277, "learning_rate": 1.4351587228252102e-06, "loss": 0.0515, "num_input_tokens_seen": 58090856, "step": 86170 }, { "epoch": 2.105269586885887, "grad_norm": 18.646289825439453, "learning_rate": 1.4350819412360182e-06, "loss": 0.0427, "num_input_tokens_seen": 58094056, "step": 86175 }, { "epoch": 2.105391737717734, "grad_norm": 17.207197189331055, "learning_rate": 1.4350051564827932e-06, "loss": 0.0004, "num_input_tokens_seen": 58097832, "step": 86180 }, { "epoch": 2.105513888549581, "grad_norm": 0.006788597907871008, "learning_rate": 1.4349283685660935e-06, "loss": 0.0803, "num_input_tokens_seen": 58101032, "step": 86185 }, { "epoch": 2.1056360393814284, "grad_norm": 0.01485143881291151, "learning_rate": 1.434851577486478e-06, "loss": 0.0002, "num_input_tokens_seen": 58103976, "step": 86190 }, { "epoch": 2.105758190213275, "grad_norm": 0.016216788440942764, "learning_rate": 1.4347747832445047e-06, "loss": 0.0004, "num_input_tokens_seen": 58107496, "step": 86195 }, { "epoch": 2.1058803410451223, "grad_norm": 0.06826010346412659, "learning_rate": 1.4346979858407323e-06, "loss": 0.0007, "num_input_tokens_seen": 58110760, "step": 86200 }, { "epoch": 2.1060024918769695, "grad_norm": 0.0071051702834665775, "learning_rate": 1.434621185275719e-06, "loss": 0.0001, "num_input_tokens_seen": 58114280, "step": 86205 }, { "epoch": 2.1061246427088167, "grad_norm": 161.30799865722656, "learning_rate": 1.434544381550024e-06, "loss": 0.0036, "num_input_tokens_seen": 58117864, "step": 86210 }, { "epoch": 2.106246793540664, "grad_norm": 0.39801251888275146, "learning_rate": 1.4344675746642054e-06, "loss": 0.0799, "num_input_tokens_seen": 58121000, "step": 86215 }, { "epoch": 2.106368944372511, "grad_norm": 0.12843145430088043, "learning_rate": 1.4343907646188217e-06, "loss": 0.0002, "num_input_tokens_seen": 58124136, "step": 86220 }, { "epoch": 2.1064910952043583, "grad_norm": 0.005412922706454992, "learning_rate": 1.434313951414431e-06, "loss": 0.0001, "num_input_tokens_seen": 58127912, "step": 86225 }, { "epoch": 2.1066132460362055, "grad_norm": 0.03722435235977173, "learning_rate": 1.4342371350515927e-06, "loss": 0.0545, "num_input_tokens_seen": 58130792, "step": 86230 }, { "epoch": 2.1067353968680527, "grad_norm": 0.011225051246583462, "learning_rate": 1.4341603155308653e-06, "loss": 0.0466, "num_input_tokens_seen": 58134568, "step": 86235 }, { "epoch": 2.1068575476999, "grad_norm": 0.013920868746936321, "learning_rate": 1.4340834928528072e-06, "loss": 0.0418, "num_input_tokens_seen": 58137640, "step": 86240 }, { "epoch": 2.106979698531747, "grad_norm": 0.26774507761001587, "learning_rate": 1.434006667017977e-06, "loss": 0.0003, "num_input_tokens_seen": 58141672, "step": 86245 }, { "epoch": 2.1071018493635942, "grad_norm": 0.006552118342369795, "learning_rate": 1.433929838026934e-06, "loss": 0.0, "num_input_tokens_seen": 58145000, "step": 86250 }, { "epoch": 2.1072240001954414, "grad_norm": 0.015171009115874767, "learning_rate": 1.4338530058802363e-06, "loss": 0.0004, "num_input_tokens_seen": 58148968, "step": 86255 }, { "epoch": 2.1073461510272886, "grad_norm": 0.014841420575976372, "learning_rate": 1.4337761705784427e-06, "loss": 0.0001, "num_input_tokens_seen": 58152296, "step": 86260 }, { "epoch": 2.107468301859136, "grad_norm": 0.008177024312317371, "learning_rate": 1.4336993321221123e-06, "loss": 0.0693, "num_input_tokens_seen": 58155624, "step": 86265 }, { "epoch": 2.107590452690983, "grad_norm": 0.007435632403939962, "learning_rate": 1.4336224905118038e-06, "loss": 0.0002, "num_input_tokens_seen": 58158888, "step": 86270 }, { "epoch": 2.10771260352283, "grad_norm": 0.009212308563292027, "learning_rate": 1.4335456457480758e-06, "loss": 0.0001, "num_input_tokens_seen": 58162216, "step": 86275 }, { "epoch": 2.107834754354677, "grad_norm": 0.017764560878276825, "learning_rate": 1.4334687978314873e-06, "loss": 0.0513, "num_input_tokens_seen": 58166184, "step": 86280 }, { "epoch": 2.107956905186524, "grad_norm": 0.012132183648645878, "learning_rate": 1.433391946762597e-06, "loss": 0.0002, "num_input_tokens_seen": 58169320, "step": 86285 }, { "epoch": 2.1080790560183713, "grad_norm": 0.011081576347351074, "learning_rate": 1.4333150925419639e-06, "loss": 0.0479, "num_input_tokens_seen": 58172712, "step": 86290 }, { "epoch": 2.1082012068502185, "grad_norm": 0.015859754756093025, "learning_rate": 1.4332382351701467e-06, "loss": 0.1066, "num_input_tokens_seen": 58176040, "step": 86295 }, { "epoch": 2.1083233576820657, "grad_norm": 0.012283757328987122, "learning_rate": 1.4331613746477049e-06, "loss": 0.0539, "num_input_tokens_seen": 58179176, "step": 86300 }, { "epoch": 2.108445508513913, "grad_norm": 0.008912608027458191, "learning_rate": 1.4330845109751967e-06, "loss": 0.0003, "num_input_tokens_seen": 58183016, "step": 86305 }, { "epoch": 2.10856765934576, "grad_norm": 14.167522430419922, "learning_rate": 1.433007644153182e-06, "loss": 0.0009, "num_input_tokens_seen": 58186536, "step": 86310 }, { "epoch": 2.1086898101776073, "grad_norm": 1.1770907640457153, "learning_rate": 1.432930774182219e-06, "loss": 0.0005, "num_input_tokens_seen": 58189864, "step": 86315 }, { "epoch": 2.1088119610094544, "grad_norm": 0.015262456610798836, "learning_rate": 1.4328539010628668e-06, "loss": 0.0823, "num_input_tokens_seen": 58193000, "step": 86320 }, { "epoch": 2.1089341118413016, "grad_norm": 0.007925025187432766, "learning_rate": 1.4327770247956847e-06, "loss": 0.0003, "num_input_tokens_seen": 58196584, "step": 86325 }, { "epoch": 2.109056262673149, "grad_norm": 0.2659885883331299, "learning_rate": 1.4327001453812318e-06, "loss": 0.0003, "num_input_tokens_seen": 58199976, "step": 86330 }, { "epoch": 2.109178413504996, "grad_norm": 0.019817404448986053, "learning_rate": 1.432623262820067e-06, "loss": 0.0452, "num_input_tokens_seen": 58203304, "step": 86335 }, { "epoch": 2.109300564336843, "grad_norm": 131.40748596191406, "learning_rate": 1.4325463771127492e-06, "loss": 0.0019, "num_input_tokens_seen": 58206696, "step": 86340 }, { "epoch": 2.1094227151686904, "grad_norm": 0.05100691691040993, "learning_rate": 1.432469488259838e-06, "loss": 0.0006, "num_input_tokens_seen": 58210408, "step": 86345 }, { "epoch": 2.1095448660005376, "grad_norm": 0.06888487935066223, "learning_rate": 1.4323925962618925e-06, "loss": 0.0001, "num_input_tokens_seen": 58213928, "step": 86350 }, { "epoch": 2.109667016832385, "grad_norm": 0.02099837362766266, "learning_rate": 1.4323157011194716e-06, "loss": 0.0002, "num_input_tokens_seen": 58217064, "step": 86355 }, { "epoch": 2.109789167664232, "grad_norm": 0.08666683733463287, "learning_rate": 1.4322388028331344e-06, "loss": 0.0753, "num_input_tokens_seen": 58220648, "step": 86360 }, { "epoch": 2.109911318496079, "grad_norm": 0.05943422019481659, "learning_rate": 1.432161901403441e-06, "loss": 0.0756, "num_input_tokens_seen": 58224040, "step": 86365 }, { "epoch": 2.1100334693279263, "grad_norm": 8.62483024597168, "learning_rate": 1.4320849968309497e-06, "loss": 0.0342, "num_input_tokens_seen": 58227752, "step": 86370 }, { "epoch": 2.110155620159773, "grad_norm": 0.06837952882051468, "learning_rate": 1.4320080891162201e-06, "loss": 0.0595, "num_input_tokens_seen": 58230696, "step": 86375 }, { "epoch": 2.1102777709916203, "grad_norm": 0.02601909264922142, "learning_rate": 1.4319311782598113e-06, "loss": 0.0007, "num_input_tokens_seen": 58233896, "step": 86380 }, { "epoch": 2.1103999218234675, "grad_norm": 0.007753228303045034, "learning_rate": 1.4318542642622828e-06, "loss": 0.0008, "num_input_tokens_seen": 58237096, "step": 86385 }, { "epoch": 2.1105220726553147, "grad_norm": 0.01853867433965206, "learning_rate": 1.431777347124194e-06, "loss": 0.0002, "num_input_tokens_seen": 58240360, "step": 86390 }, { "epoch": 2.110644223487162, "grad_norm": 0.028115682303905487, "learning_rate": 1.4317004268461044e-06, "loss": 0.0005, "num_input_tokens_seen": 58244008, "step": 86395 }, { "epoch": 2.110766374319009, "grad_norm": 0.03277409449219704, "learning_rate": 1.431623503428573e-06, "loss": 0.0002, "num_input_tokens_seen": 58247400, "step": 86400 }, { "epoch": 2.1108885251508562, "grad_norm": 39.95744705200195, "learning_rate": 1.4315465768721593e-06, "loss": 0.0651, "num_input_tokens_seen": 58250856, "step": 86405 }, { "epoch": 2.1110106759827034, "grad_norm": 0.06884709000587463, "learning_rate": 1.431469647177423e-06, "loss": 0.0787, "num_input_tokens_seen": 58254440, "step": 86410 }, { "epoch": 2.1111328268145506, "grad_norm": 0.00929190032184124, "learning_rate": 1.4313927143449235e-06, "loss": 0.0002, "num_input_tokens_seen": 58258152, "step": 86415 }, { "epoch": 2.111254977646398, "grad_norm": 0.04569040238857269, "learning_rate": 1.43131577837522e-06, "loss": 0.0628, "num_input_tokens_seen": 58261480, "step": 86420 }, { "epoch": 2.111377128478245, "grad_norm": 0.020279861986637115, "learning_rate": 1.431238839268872e-06, "loss": 0.0005, "num_input_tokens_seen": 58265064, "step": 86425 }, { "epoch": 2.111499279310092, "grad_norm": 0.33997291326522827, "learning_rate": 1.4311618970264392e-06, "loss": 0.0327, "num_input_tokens_seen": 58268456, "step": 86430 }, { "epoch": 2.1116214301419394, "grad_norm": 0.03252893313765526, "learning_rate": 1.4310849516484813e-06, "loss": 0.0004, "num_input_tokens_seen": 58271912, "step": 86435 }, { "epoch": 2.1117435809737866, "grad_norm": 0.036082323640584946, "learning_rate": 1.4310080031355575e-06, "loss": 0.0008, "num_input_tokens_seen": 58274984, "step": 86440 }, { "epoch": 2.1118657318056338, "grad_norm": 0.09711766242980957, "learning_rate": 1.4309310514882277e-06, "loss": 0.0764, "num_input_tokens_seen": 58278312, "step": 86445 }, { "epoch": 2.111987882637481, "grad_norm": 0.011256006546318531, "learning_rate": 1.4308540967070513e-06, "loss": 0.0497, "num_input_tokens_seen": 58281832, "step": 86450 }, { "epoch": 2.112110033469328, "grad_norm": 0.09498558193445206, "learning_rate": 1.430777138792588e-06, "loss": 0.0046, "num_input_tokens_seen": 58285288, "step": 86455 }, { "epoch": 2.112232184301175, "grad_norm": 0.029148347675800323, "learning_rate": 1.4307001777453977e-06, "loss": 0.0002, "num_input_tokens_seen": 58289128, "step": 86460 }, { "epoch": 2.112354335133022, "grad_norm": 0.12875746190547943, "learning_rate": 1.4306232135660397e-06, "loss": 0.0495, "num_input_tokens_seen": 58292776, "step": 86465 }, { "epoch": 2.1124764859648693, "grad_norm": 0.014981545507907867, "learning_rate": 1.430546246255074e-06, "loss": 0.0523, "num_input_tokens_seen": 58296872, "step": 86470 }, { "epoch": 2.1125986367967164, "grad_norm": 0.016068417578935623, "learning_rate": 1.4304692758130599e-06, "loss": 0.0002, "num_input_tokens_seen": 58300456, "step": 86475 }, { "epoch": 2.1127207876285636, "grad_norm": 0.2005162090063095, "learning_rate": 1.4303923022405577e-06, "loss": 0.0002, "num_input_tokens_seen": 58303656, "step": 86480 }, { "epoch": 2.112842938460411, "grad_norm": 0.017666978761553764, "learning_rate": 1.430315325538127e-06, "loss": 0.0004, "num_input_tokens_seen": 58306920, "step": 86485 }, { "epoch": 2.112965089292258, "grad_norm": 0.13356828689575195, "learning_rate": 1.4302383457063272e-06, "loss": 0.0003, "num_input_tokens_seen": 58310184, "step": 86490 }, { "epoch": 2.113087240124105, "grad_norm": 0.006724830716848373, "learning_rate": 1.4301613627457186e-06, "loss": 0.0001, "num_input_tokens_seen": 58313704, "step": 86495 }, { "epoch": 2.1132093909559524, "grad_norm": 0.03474617004394531, "learning_rate": 1.4300843766568609e-06, "loss": 0.0513, "num_input_tokens_seen": 58316904, "step": 86500 }, { "epoch": 2.1133315417877996, "grad_norm": 356.2970275878906, "learning_rate": 1.4300073874403139e-06, "loss": 0.005, "num_input_tokens_seen": 58320424, "step": 86505 }, { "epoch": 2.1134536926196468, "grad_norm": 0.009362326003611088, "learning_rate": 1.4299303950966372e-06, "loss": 0.0002, "num_input_tokens_seen": 58323880, "step": 86510 }, { "epoch": 2.113575843451494, "grad_norm": 15.823705673217773, "learning_rate": 1.4298533996263916e-06, "loss": 0.0696, "num_input_tokens_seen": 58327336, "step": 86515 }, { "epoch": 2.113697994283341, "grad_norm": 0.013039246201515198, "learning_rate": 1.429776401030136e-06, "loss": 0.0517, "num_input_tokens_seen": 58330536, "step": 86520 }, { "epoch": 2.1138201451151883, "grad_norm": 0.12527424097061157, "learning_rate": 1.4296993993084313e-06, "loss": 0.0001, "num_input_tokens_seen": 58333928, "step": 86525 }, { "epoch": 2.1139422959470355, "grad_norm": 0.6121593713760376, "learning_rate": 1.4296223944618366e-06, "loss": 0.0405, "num_input_tokens_seen": 58337384, "step": 86530 }, { "epoch": 2.1140644467788827, "grad_norm": 0.014215141534805298, "learning_rate": 1.4295453864909125e-06, "loss": 0.0003, "num_input_tokens_seen": 58341032, "step": 86535 }, { "epoch": 2.11418659761073, "grad_norm": 0.04024295508861542, "learning_rate": 1.4294683753962187e-06, "loss": 0.0001, "num_input_tokens_seen": 58344232, "step": 86540 }, { "epoch": 2.114308748442577, "grad_norm": 0.017275016754865646, "learning_rate": 1.429391361178315e-06, "loss": 0.1218, "num_input_tokens_seen": 58347816, "step": 86545 }, { "epoch": 2.114430899274424, "grad_norm": 0.02299557812511921, "learning_rate": 1.4293143438377624e-06, "loss": 0.0559, "num_input_tokens_seen": 58351144, "step": 86550 }, { "epoch": 2.114553050106271, "grad_norm": 0.07241678982973099, "learning_rate": 1.4292373233751202e-06, "loss": 0.0001, "num_input_tokens_seen": 58354472, "step": 86555 }, { "epoch": 2.1146752009381182, "grad_norm": 0.3192169964313507, "learning_rate": 1.4291602997909489e-06, "loss": 0.0856, "num_input_tokens_seen": 58357928, "step": 86560 }, { "epoch": 2.1147973517699654, "grad_norm": 0.19804099202156067, "learning_rate": 1.4290832730858082e-06, "loss": 0.0006, "num_input_tokens_seen": 58361320, "step": 86565 }, { "epoch": 2.1149195026018126, "grad_norm": 0.1991894245147705, "learning_rate": 1.4290062432602588e-06, "loss": 0.0375, "num_input_tokens_seen": 58364904, "step": 86570 }, { "epoch": 2.11504165343366, "grad_norm": 0.08356073498725891, "learning_rate": 1.4289292103148604e-06, "loss": 0.0238, "num_input_tokens_seen": 58368168, "step": 86575 }, { "epoch": 2.115163804265507, "grad_norm": 8.472793579101562, "learning_rate": 1.4288521742501734e-06, "loss": 0.1127, "num_input_tokens_seen": 58371112, "step": 86580 }, { "epoch": 2.115285955097354, "grad_norm": 18.845136642456055, "learning_rate": 1.4287751350667584e-06, "loss": 0.0591, "num_input_tokens_seen": 58374376, "step": 86585 }, { "epoch": 2.1154081059292014, "grad_norm": 0.05907197669148445, "learning_rate": 1.4286980927651749e-06, "loss": 0.0003, "num_input_tokens_seen": 58377512, "step": 86590 }, { "epoch": 2.1155302567610486, "grad_norm": 66.45479583740234, "learning_rate": 1.4286210473459837e-06, "loss": 0.0732, "num_input_tokens_seen": 58380840, "step": 86595 }, { "epoch": 2.1156524075928957, "grad_norm": 0.5579411387443542, "learning_rate": 1.428543998809745e-06, "loss": 0.0357, "num_input_tokens_seen": 58384232, "step": 86600 }, { "epoch": 2.115774558424743, "grad_norm": 0.16705289483070374, "learning_rate": 1.4284669471570188e-06, "loss": 0.0002, "num_input_tokens_seen": 58387624, "step": 86605 }, { "epoch": 2.11589670925659, "grad_norm": 0.11491364985704422, "learning_rate": 1.428389892388366e-06, "loss": 0.0002, "num_input_tokens_seen": 58391144, "step": 86610 }, { "epoch": 2.1160188600884373, "grad_norm": 0.024216242134571075, "learning_rate": 1.4283128345043464e-06, "loss": 0.0005, "num_input_tokens_seen": 58394728, "step": 86615 }, { "epoch": 2.1161410109202845, "grad_norm": 100.37772369384766, "learning_rate": 1.428235773505521e-06, "loss": 0.007, "num_input_tokens_seen": 58398248, "step": 86620 }, { "epoch": 2.1162631617521317, "grad_norm": 0.06938160955905914, "learning_rate": 1.4281587093924496e-06, "loss": 0.0003, "num_input_tokens_seen": 58401768, "step": 86625 }, { "epoch": 2.116385312583979, "grad_norm": 0.04211665689945221, "learning_rate": 1.4280816421656932e-06, "loss": 0.0226, "num_input_tokens_seen": 58405352, "step": 86630 }, { "epoch": 2.116507463415826, "grad_norm": 0.024318603798747063, "learning_rate": 1.428004571825812e-06, "loss": 0.0361, "num_input_tokens_seen": 58408808, "step": 86635 }, { "epoch": 2.116629614247673, "grad_norm": 9.751173973083496, "learning_rate": 1.427927498373366e-06, "loss": 0.057, "num_input_tokens_seen": 58412456, "step": 86640 }, { "epoch": 2.11675176507952, "grad_norm": 0.14358443021774292, "learning_rate": 1.4278504218089164e-06, "loss": 0.0002, "num_input_tokens_seen": 58416616, "step": 86645 }, { "epoch": 2.116873915911367, "grad_norm": 0.046868305653333664, "learning_rate": 1.4277733421330233e-06, "loss": 0.0239, "num_input_tokens_seen": 58420456, "step": 86650 }, { "epoch": 2.1169960667432144, "grad_norm": 0.0075447880662977695, "learning_rate": 1.4276962593462476e-06, "loss": 0.0007, "num_input_tokens_seen": 58423912, "step": 86655 }, { "epoch": 2.1171182175750616, "grad_norm": 0.2818554937839508, "learning_rate": 1.4276191734491497e-06, "loss": 0.047, "num_input_tokens_seen": 58427560, "step": 86660 }, { "epoch": 2.1172403684069088, "grad_norm": 0.009356479160487652, "learning_rate": 1.4275420844422898e-06, "loss": 0.0261, "num_input_tokens_seen": 58431016, "step": 86665 }, { "epoch": 2.117362519238756, "grad_norm": 33.352073669433594, "learning_rate": 1.4274649923262292e-06, "loss": 0.0992, "num_input_tokens_seen": 58434152, "step": 86670 }, { "epoch": 2.117484670070603, "grad_norm": 0.08762264996767044, "learning_rate": 1.427387897101528e-06, "loss": 0.0008, "num_input_tokens_seen": 58437096, "step": 86675 }, { "epoch": 2.1176068209024503, "grad_norm": 0.22447797656059265, "learning_rate": 1.4273107987687477e-06, "loss": 0.0003, "num_input_tokens_seen": 58440488, "step": 86680 }, { "epoch": 2.1177289717342975, "grad_norm": 123.31275939941406, "learning_rate": 1.4272336973284476e-06, "loss": 0.0415, "num_input_tokens_seen": 58444136, "step": 86685 }, { "epoch": 2.1178511225661447, "grad_norm": 0.04530341550707817, "learning_rate": 1.4271565927811894e-06, "loss": 0.0001, "num_input_tokens_seen": 58447784, "step": 86690 }, { "epoch": 2.117973273397992, "grad_norm": 0.023438459262251854, "learning_rate": 1.4270794851275336e-06, "loss": 0.0001, "num_input_tokens_seen": 58451112, "step": 86695 }, { "epoch": 2.118095424229839, "grad_norm": 0.04608852416276932, "learning_rate": 1.4270023743680407e-06, "loss": 0.0001, "num_input_tokens_seen": 58454248, "step": 86700 }, { "epoch": 2.1182175750616863, "grad_norm": 0.01683771423995495, "learning_rate": 1.4269252605032718e-06, "loss": 0.0002, "num_input_tokens_seen": 58457704, "step": 86705 }, { "epoch": 2.1183397258935335, "grad_norm": 0.07870247960090637, "learning_rate": 1.4268481435337875e-06, "loss": 0.0748, "num_input_tokens_seen": 58461160, "step": 86710 }, { "epoch": 2.1184618767253807, "grad_norm": 0.00973932072520256, "learning_rate": 1.4267710234601488e-06, "loss": 0.1228, "num_input_tokens_seen": 58464552, "step": 86715 }, { "epoch": 2.118584027557228, "grad_norm": 2.5605545043945312, "learning_rate": 1.4266939002829163e-06, "loss": 0.0006, "num_input_tokens_seen": 58468136, "step": 86720 }, { "epoch": 2.1187061783890746, "grad_norm": 0.007359154988080263, "learning_rate": 1.4266167740026513e-06, "loss": 0.0001, "num_input_tokens_seen": 58471336, "step": 86725 }, { "epoch": 2.118828329220922, "grad_norm": 0.039782486855983734, "learning_rate": 1.4265396446199142e-06, "loss": 0.0001, "num_input_tokens_seen": 58474600, "step": 86730 }, { "epoch": 2.118950480052769, "grad_norm": 41.74934768676758, "learning_rate": 1.426462512135266e-06, "loss": 0.1686, "num_input_tokens_seen": 58477672, "step": 86735 }, { "epoch": 2.119072630884616, "grad_norm": 34.31242370605469, "learning_rate": 1.426385376549268e-06, "loss": 0.0006, "num_input_tokens_seen": 58481000, "step": 86740 }, { "epoch": 2.1191947817164634, "grad_norm": 0.22472701966762543, "learning_rate": 1.4263082378624804e-06, "loss": 0.0455, "num_input_tokens_seen": 58484328, "step": 86745 }, { "epoch": 2.1193169325483106, "grad_norm": 22.060367584228516, "learning_rate": 1.4262310960754649e-06, "loss": 0.0405, "num_input_tokens_seen": 58487656, "step": 86750 }, { "epoch": 2.1194390833801577, "grad_norm": 0.008212032727897167, "learning_rate": 1.4261539511887822e-06, "loss": 0.0003, "num_input_tokens_seen": 58490728, "step": 86755 }, { "epoch": 2.119561234212005, "grad_norm": 0.662553608417511, "learning_rate": 1.4260768032029932e-06, "loss": 0.0004, "num_input_tokens_seen": 58494184, "step": 86760 }, { "epoch": 2.119683385043852, "grad_norm": 0.18204031884670258, "learning_rate": 1.4259996521186591e-06, "loss": 0.0002, "num_input_tokens_seen": 58497576, "step": 86765 }, { "epoch": 2.1198055358756993, "grad_norm": 0.005318759009242058, "learning_rate": 1.4259224979363413e-06, "loss": 0.0115, "num_input_tokens_seen": 58500648, "step": 86770 }, { "epoch": 2.1199276867075465, "grad_norm": 0.008218302391469479, "learning_rate": 1.4258453406566002e-06, "loss": 0.0001, "num_input_tokens_seen": 58504232, "step": 86775 }, { "epoch": 2.1200498375393937, "grad_norm": 0.00433523952960968, "learning_rate": 1.4257681802799973e-06, "loss": 0.0001, "num_input_tokens_seen": 58507560, "step": 86780 }, { "epoch": 2.120171988371241, "grad_norm": 0.038819458335638046, "learning_rate": 1.4256910168070938e-06, "loss": 0.0001, "num_input_tokens_seen": 58511144, "step": 86785 }, { "epoch": 2.120294139203088, "grad_norm": 0.15920616686344147, "learning_rate": 1.4256138502384508e-06, "loss": 0.0501, "num_input_tokens_seen": 58514024, "step": 86790 }, { "epoch": 2.1204162900349353, "grad_norm": 0.36001694202423096, "learning_rate": 1.425536680574629e-06, "loss": 0.0004, "num_input_tokens_seen": 58517544, "step": 86795 }, { "epoch": 2.1205384408667824, "grad_norm": 0.013765096664428711, "learning_rate": 1.4254595078161905e-06, "loss": 0.0001, "num_input_tokens_seen": 58521256, "step": 86800 }, { "epoch": 2.1206605916986296, "grad_norm": 0.06415829807519913, "learning_rate": 1.4253823319636958e-06, "loss": 0.0389, "num_input_tokens_seen": 58524840, "step": 86805 }, { "epoch": 2.120782742530477, "grad_norm": 0.0557989776134491, "learning_rate": 1.4253051530177063e-06, "loss": 0.0001, "num_input_tokens_seen": 58528040, "step": 86810 }, { "epoch": 2.120904893362324, "grad_norm": 0.6082044839859009, "learning_rate": 1.4252279709787834e-06, "loss": 0.0864, "num_input_tokens_seen": 58531944, "step": 86815 }, { "epoch": 2.1210270441941708, "grad_norm": 0.0982634648680687, "learning_rate": 1.4251507858474882e-06, "loss": 0.0879, "num_input_tokens_seen": 58535080, "step": 86820 }, { "epoch": 2.121149195026018, "grad_norm": 0.3374102711677551, "learning_rate": 1.4250735976243823e-06, "loss": 0.0402, "num_input_tokens_seen": 58538216, "step": 86825 }, { "epoch": 2.121271345857865, "grad_norm": 0.05706077441573143, "learning_rate": 1.4249964063100266e-06, "loss": 0.0003, "num_input_tokens_seen": 58541672, "step": 86830 }, { "epoch": 2.1213934966897123, "grad_norm": 0.00961530301719904, "learning_rate": 1.4249192119049832e-06, "loss": 0.0004, "num_input_tokens_seen": 58544872, "step": 86835 }, { "epoch": 2.1215156475215595, "grad_norm": 0.2719575762748718, "learning_rate": 1.4248420144098128e-06, "loss": 0.054, "num_input_tokens_seen": 58548264, "step": 86840 }, { "epoch": 2.1216377983534067, "grad_norm": 0.04637189581990242, "learning_rate": 1.4247648138250768e-06, "loss": 0.0455, "num_input_tokens_seen": 58551656, "step": 86845 }, { "epoch": 2.121759949185254, "grad_norm": 0.02738938108086586, "learning_rate": 1.4246876101513369e-06, "loss": 0.0001, "num_input_tokens_seen": 58555240, "step": 86850 }, { "epoch": 2.121882100017101, "grad_norm": 19.862667083740234, "learning_rate": 1.4246104033891545e-06, "loss": 0.0448, "num_input_tokens_seen": 58559080, "step": 86855 }, { "epoch": 2.1220042508489483, "grad_norm": 0.0276873130351305, "learning_rate": 1.4245331935390913e-06, "loss": 0.0001, "num_input_tokens_seen": 58562728, "step": 86860 }, { "epoch": 2.1221264016807955, "grad_norm": 0.032264113426208496, "learning_rate": 1.424455980601708e-06, "loss": 0.0001, "num_input_tokens_seen": 58566120, "step": 86865 }, { "epoch": 2.1222485525126427, "grad_norm": 0.8636866807937622, "learning_rate": 1.424378764577567e-06, "loss": 0.0003, "num_input_tokens_seen": 58569768, "step": 86870 }, { "epoch": 2.12237070334449, "grad_norm": 0.05388767644762993, "learning_rate": 1.4243015454672294e-06, "loss": 0.0004, "num_input_tokens_seen": 58573096, "step": 86875 }, { "epoch": 2.122492854176337, "grad_norm": 0.08161472529172897, "learning_rate": 1.4242243232712569e-06, "loss": 0.0001, "num_input_tokens_seen": 58577128, "step": 86880 }, { "epoch": 2.1226150050081842, "grad_norm": 0.09701462835073471, "learning_rate": 1.424147097990211e-06, "loss": 0.0515, "num_input_tokens_seen": 58580328, "step": 86885 }, { "epoch": 2.1227371558400314, "grad_norm": 22.87986183166504, "learning_rate": 1.4240698696246535e-06, "loss": 0.048, "num_input_tokens_seen": 58583976, "step": 86890 }, { "epoch": 2.1228593066718786, "grad_norm": 1.0535171031951904, "learning_rate": 1.4239926381751455e-06, "loss": 0.0004, "num_input_tokens_seen": 58587304, "step": 86895 }, { "epoch": 2.122981457503726, "grad_norm": 0.009154610335826874, "learning_rate": 1.423915403642249e-06, "loss": 0.0003, "num_input_tokens_seen": 58590504, "step": 86900 }, { "epoch": 2.1231036083355725, "grad_norm": 0.23902444541454315, "learning_rate": 1.4238381660265259e-06, "loss": 0.0417, "num_input_tokens_seen": 58593768, "step": 86905 }, { "epoch": 2.1232257591674197, "grad_norm": 432.938232421875, "learning_rate": 1.4237609253285377e-06, "loss": 0.0994, "num_input_tokens_seen": 58597352, "step": 86910 }, { "epoch": 2.123347909999267, "grad_norm": 12.413274765014648, "learning_rate": 1.4236836815488458e-06, "loss": 0.1099, "num_input_tokens_seen": 58600616, "step": 86915 }, { "epoch": 2.123470060831114, "grad_norm": 31.247962951660156, "learning_rate": 1.4236064346880123e-06, "loss": 0.0416, "num_input_tokens_seen": 58604264, "step": 86920 }, { "epoch": 2.1235922116629613, "grad_norm": 0.13186196982860565, "learning_rate": 1.423529184746599e-06, "loss": 0.032, "num_input_tokens_seen": 58607848, "step": 86925 }, { "epoch": 2.1237143624948085, "grad_norm": 9.545666694641113, "learning_rate": 1.4234519317251674e-06, "loss": 0.0918, "num_input_tokens_seen": 58610920, "step": 86930 }, { "epoch": 2.1238365133266557, "grad_norm": 0.056094422936439514, "learning_rate": 1.4233746756242795e-06, "loss": 0.0003, "num_input_tokens_seen": 58614440, "step": 86935 }, { "epoch": 2.123958664158503, "grad_norm": 0.004741915967315435, "learning_rate": 1.4232974164444972e-06, "loss": 0.0003, "num_input_tokens_seen": 58617704, "step": 86940 }, { "epoch": 2.12408081499035, "grad_norm": 0.052919551730155945, "learning_rate": 1.4232201541863822e-06, "loss": 0.0412, "num_input_tokens_seen": 58621032, "step": 86945 }, { "epoch": 2.1242029658221973, "grad_norm": 0.21680483222007751, "learning_rate": 1.4231428888504964e-06, "loss": 0.1003, "num_input_tokens_seen": 58625128, "step": 86950 }, { "epoch": 2.1243251166540444, "grad_norm": 19.620275497436523, "learning_rate": 1.4230656204374017e-06, "loss": 0.0425, "num_input_tokens_seen": 58628584, "step": 86955 }, { "epoch": 2.1244472674858916, "grad_norm": 0.05666939914226532, "learning_rate": 1.4229883489476599e-06, "loss": 0.0337, "num_input_tokens_seen": 58631976, "step": 86960 }, { "epoch": 2.124569418317739, "grad_norm": 0.74814373254776, "learning_rate": 1.422911074381833e-06, "loss": 0.0371, "num_input_tokens_seen": 58635304, "step": 86965 }, { "epoch": 2.124691569149586, "grad_norm": 0.12086189538240433, "learning_rate": 1.4228337967404833e-06, "loss": 0.0003, "num_input_tokens_seen": 58638376, "step": 86970 }, { "epoch": 2.124813719981433, "grad_norm": 0.035326484590768814, "learning_rate": 1.4227565160241724e-06, "loss": 0.0988, "num_input_tokens_seen": 58641512, "step": 86975 }, { "epoch": 2.1249358708132804, "grad_norm": 0.014093923382461071, "learning_rate": 1.4226792322334622e-06, "loss": 0.0436, "num_input_tokens_seen": 58644968, "step": 86980 }, { "epoch": 2.1250580216451276, "grad_norm": 88.9645767211914, "learning_rate": 1.4226019453689151e-06, "loss": 0.1034, "num_input_tokens_seen": 58648296, "step": 86985 }, { "epoch": 2.1251801724769748, "grad_norm": 0.2062487155199051, "learning_rate": 1.422524655431093e-06, "loss": 0.0004, "num_input_tokens_seen": 58651880, "step": 86990 }, { "epoch": 2.125302323308822, "grad_norm": 0.2350393831729889, "learning_rate": 1.422447362420558e-06, "loss": 0.0016, "num_input_tokens_seen": 58655848, "step": 86995 }, { "epoch": 2.1254244741406687, "grad_norm": 0.06757447868585587, "learning_rate": 1.422370066337872e-06, "loss": 0.0008, "num_input_tokens_seen": 58659688, "step": 87000 }, { "epoch": 2.125546624972516, "grad_norm": 0.2243819236755371, "learning_rate": 1.4222927671835976e-06, "loss": 0.0005, "num_input_tokens_seen": 58663144, "step": 87005 }, { "epoch": 2.125668775804363, "grad_norm": 0.18053650856018066, "learning_rate": 1.4222154649582963e-06, "loss": 0.0488, "num_input_tokens_seen": 58666472, "step": 87010 }, { "epoch": 2.1257909266362103, "grad_norm": 0.00022531415743287653, "learning_rate": 1.4221381596625307e-06, "loss": 0.005, "num_input_tokens_seen": 58670184, "step": 87015 }, { "epoch": 2.1259130774680575, "grad_norm": 0.03191062808036804, "learning_rate": 1.4220608512968627e-06, "loss": 0.0002, "num_input_tokens_seen": 58673384, "step": 87020 }, { "epoch": 2.1260352282999047, "grad_norm": 0.20486636459827423, "learning_rate": 1.4219835398618548e-06, "loss": 0.0002, "num_input_tokens_seen": 58676520, "step": 87025 }, { "epoch": 2.126157379131752, "grad_norm": 0.008991490118205547, "learning_rate": 1.4219062253580691e-06, "loss": 0.0001, "num_input_tokens_seen": 58680104, "step": 87030 }, { "epoch": 2.126279529963599, "grad_norm": 0.015981163829565048, "learning_rate": 1.421828907786068e-06, "loss": 0.0004, "num_input_tokens_seen": 58683240, "step": 87035 }, { "epoch": 2.1264016807954462, "grad_norm": 0.20896996557712555, "learning_rate": 1.4217515871464132e-06, "loss": 0.062, "num_input_tokens_seen": 58686888, "step": 87040 }, { "epoch": 2.1265238316272934, "grad_norm": 0.138705775141716, "learning_rate": 1.4216742634396677e-06, "loss": 0.0225, "num_input_tokens_seen": 58690088, "step": 87045 }, { "epoch": 2.1266459824591406, "grad_norm": 0.12179794907569885, "learning_rate": 1.4215969366663936e-06, "loss": 0.0001, "num_input_tokens_seen": 58693288, "step": 87050 }, { "epoch": 2.126768133290988, "grad_norm": 0.041572365909814835, "learning_rate": 1.4215196068271531e-06, "loss": 0.0004, "num_input_tokens_seen": 58696616, "step": 87055 }, { "epoch": 2.126890284122835, "grad_norm": 0.3251390755176544, "learning_rate": 1.4214422739225087e-06, "loss": 0.0502, "num_input_tokens_seen": 58699752, "step": 87060 }, { "epoch": 2.127012434954682, "grad_norm": 18.905942916870117, "learning_rate": 1.4213649379530228e-06, "loss": 0.1728, "num_input_tokens_seen": 58703272, "step": 87065 }, { "epoch": 2.1271345857865294, "grad_norm": 30.930639266967773, "learning_rate": 1.4212875989192573e-06, "loss": 0.0559, "num_input_tokens_seen": 58706536, "step": 87070 }, { "epoch": 2.1272567366183766, "grad_norm": 34.58064651489258, "learning_rate": 1.4212102568217755e-06, "loss": 0.1699, "num_input_tokens_seen": 58710120, "step": 87075 }, { "epoch": 2.1273788874502237, "grad_norm": 41.41680145263672, "learning_rate": 1.4211329116611392e-06, "loss": 0.1176, "num_input_tokens_seen": 58713128, "step": 87080 }, { "epoch": 2.1275010382820705, "grad_norm": 0.3319126069545746, "learning_rate": 1.4210555634379113e-06, "loss": 0.0312, "num_input_tokens_seen": 58715752, "step": 87085 }, { "epoch": 2.1276231891139177, "grad_norm": 2.9074134826660156, "learning_rate": 1.420978212152654e-06, "loss": 0.0013, "num_input_tokens_seen": 58719016, "step": 87090 }, { "epoch": 2.127745339945765, "grad_norm": 124.58538055419922, "learning_rate": 1.4209008578059299e-06, "loss": 0.0369, "num_input_tokens_seen": 58722216, "step": 87095 }, { "epoch": 2.127867490777612, "grad_norm": 0.031058380380272865, "learning_rate": 1.4208235003983017e-06, "loss": 0.0001, "num_input_tokens_seen": 58725800, "step": 87100 }, { "epoch": 2.1279896416094592, "grad_norm": 0.14676395058631897, "learning_rate": 1.4207461399303316e-06, "loss": 0.0003, "num_input_tokens_seen": 58728744, "step": 87105 }, { "epoch": 2.1281117924413064, "grad_norm": 0.11780012398958206, "learning_rate": 1.4206687764025825e-06, "loss": 0.0396, "num_input_tokens_seen": 58732392, "step": 87110 }, { "epoch": 2.1282339432731536, "grad_norm": 0.019878914579749107, "learning_rate": 1.4205914098156168e-06, "loss": 0.0004, "num_input_tokens_seen": 58735592, "step": 87115 }, { "epoch": 2.128356094105001, "grad_norm": 0.05015110224485397, "learning_rate": 1.4205140401699973e-06, "loss": 0.0325, "num_input_tokens_seen": 58739048, "step": 87120 }, { "epoch": 2.128478244936848, "grad_norm": 0.05444740504026413, "learning_rate": 1.4204366674662867e-06, "loss": 0.0001, "num_input_tokens_seen": 58742696, "step": 87125 }, { "epoch": 2.128600395768695, "grad_norm": 16.788597106933594, "learning_rate": 1.4203592917050476e-06, "loss": 0.0418, "num_input_tokens_seen": 58746216, "step": 87130 }, { "epoch": 2.1287225466005424, "grad_norm": 0.09274864196777344, "learning_rate": 1.4202819128868422e-06, "loss": 0.0001, "num_input_tokens_seen": 58749608, "step": 87135 }, { "epoch": 2.1288446974323896, "grad_norm": 0.013930370099842548, "learning_rate": 1.4202045310122341e-06, "loss": 0.0001, "num_input_tokens_seen": 58752680, "step": 87140 }, { "epoch": 2.1289668482642368, "grad_norm": 0.3440989851951599, "learning_rate": 1.4201271460817859e-06, "loss": 0.0069, "num_input_tokens_seen": 58756200, "step": 87145 }, { "epoch": 2.129088999096084, "grad_norm": 220.0663604736328, "learning_rate": 1.4200497580960597e-06, "loss": 0.0305, "num_input_tokens_seen": 58759208, "step": 87150 }, { "epoch": 2.129211149927931, "grad_norm": 34.489627838134766, "learning_rate": 1.4199723670556187e-06, "loss": 0.1128, "num_input_tokens_seen": 58762600, "step": 87155 }, { "epoch": 2.1293333007597783, "grad_norm": 0.28581130504608154, "learning_rate": 1.419894972961026e-06, "loss": 0.0003, "num_input_tokens_seen": 58765928, "step": 87160 }, { "epoch": 2.1294554515916255, "grad_norm": 159.50994873046875, "learning_rate": 1.4198175758128436e-06, "loss": 0.1243, "num_input_tokens_seen": 58769192, "step": 87165 }, { "epoch": 2.1295776024234723, "grad_norm": 0.1996682584285736, "learning_rate": 1.4197401756116352e-06, "loss": 0.0542, "num_input_tokens_seen": 58772072, "step": 87170 }, { "epoch": 2.1296997532553195, "grad_norm": 0.0032133408822119236, "learning_rate": 1.419662772357963e-06, "loss": 0.0302, "num_input_tokens_seen": 58775336, "step": 87175 }, { "epoch": 2.1298219040871667, "grad_norm": 0.0018217507749795914, "learning_rate": 1.4195853660523907e-06, "loss": 0.0539, "num_input_tokens_seen": 58779112, "step": 87180 }, { "epoch": 2.129944054919014, "grad_norm": 0.006405272521078587, "learning_rate": 1.4195079566954805e-06, "loss": 0.0675, "num_input_tokens_seen": 58782760, "step": 87185 }, { "epoch": 2.130066205750861, "grad_norm": 0.30736804008483887, "learning_rate": 1.419430544287796e-06, "loss": 0.0288, "num_input_tokens_seen": 58786664, "step": 87190 }, { "epoch": 2.130188356582708, "grad_norm": 36.743751525878906, "learning_rate": 1.4193531288298993e-06, "loss": 0.0721, "num_input_tokens_seen": 58789928, "step": 87195 }, { "epoch": 2.1303105074145554, "grad_norm": 0.030499042943120003, "learning_rate": 1.419275710322354e-06, "loss": 0.0002, "num_input_tokens_seen": 58793512, "step": 87200 }, { "epoch": 2.1304326582464026, "grad_norm": 0.0741528794169426, "learning_rate": 1.419198288765723e-06, "loss": 0.0003, "num_input_tokens_seen": 58796456, "step": 87205 }, { "epoch": 2.13055480907825, "grad_norm": 0.9557086229324341, "learning_rate": 1.4191208641605693e-06, "loss": 0.0005, "num_input_tokens_seen": 58799528, "step": 87210 }, { "epoch": 2.130676959910097, "grad_norm": 191.2623291015625, "learning_rate": 1.4190434365074559e-06, "loss": 0.0044, "num_input_tokens_seen": 58803048, "step": 87215 }, { "epoch": 2.130799110741944, "grad_norm": 0.003294636495411396, "learning_rate": 1.418966005806946e-06, "loss": 0.0366, "num_input_tokens_seen": 58806248, "step": 87220 }, { "epoch": 2.1309212615737914, "grad_norm": 0.1290232241153717, "learning_rate": 1.4188885720596022e-06, "loss": 0.1019, "num_input_tokens_seen": 58809448, "step": 87225 }, { "epoch": 2.1310434124056385, "grad_norm": 0.012942397966980934, "learning_rate": 1.4188111352659884e-06, "loss": 0.0567, "num_input_tokens_seen": 58812648, "step": 87230 }, { "epoch": 2.1311655632374857, "grad_norm": 0.004310682415962219, "learning_rate": 1.4187336954266674e-06, "loss": 0.0002, "num_input_tokens_seen": 58815784, "step": 87235 }, { "epoch": 2.131287714069333, "grad_norm": 11.728301048278809, "learning_rate": 1.4186562525422025e-06, "loss": 0.1964, "num_input_tokens_seen": 58819240, "step": 87240 }, { "epoch": 2.13140986490118, "grad_norm": 1.623407006263733, "learning_rate": 1.4185788066131566e-06, "loss": 0.0005, "num_input_tokens_seen": 58822760, "step": 87245 }, { "epoch": 2.1315320157330273, "grad_norm": 0.38209009170532227, "learning_rate": 1.4185013576400928e-06, "loss": 0.0262, "num_input_tokens_seen": 58825704, "step": 87250 }, { "epoch": 2.1316541665648745, "grad_norm": 0.17902544140815735, "learning_rate": 1.418423905623575e-06, "loss": 0.0006, "num_input_tokens_seen": 58829416, "step": 87255 }, { "epoch": 2.1317763173967217, "grad_norm": 559.6621704101562, "learning_rate": 1.4183464505641656e-06, "loss": 0.1206, "num_input_tokens_seen": 58832744, "step": 87260 }, { "epoch": 2.1318984682285684, "grad_norm": 0.02113385498523712, "learning_rate": 1.4182689924624285e-06, "loss": 0.0003, "num_input_tokens_seen": 58835752, "step": 87265 }, { "epoch": 2.1320206190604156, "grad_norm": 0.0700068548321724, "learning_rate": 1.4181915313189269e-06, "loss": 0.0479, "num_input_tokens_seen": 58838952, "step": 87270 }, { "epoch": 2.132142769892263, "grad_norm": 0.02595849707722664, "learning_rate": 1.4181140671342235e-06, "loss": 0.044, "num_input_tokens_seen": 58842024, "step": 87275 }, { "epoch": 2.13226492072411, "grad_norm": 22.93673324584961, "learning_rate": 1.4180365999088826e-06, "loss": 0.1059, "num_input_tokens_seen": 58845800, "step": 87280 }, { "epoch": 2.132387071555957, "grad_norm": 0.012322576716542244, "learning_rate": 1.4179591296434669e-06, "loss": 0.0003, "num_input_tokens_seen": 58849896, "step": 87285 }, { "epoch": 2.1325092223878044, "grad_norm": 0.16245467960834503, "learning_rate": 1.4178816563385398e-06, "loss": 0.0013, "num_input_tokens_seen": 58853032, "step": 87290 }, { "epoch": 2.1326313732196516, "grad_norm": 20.192350387573242, "learning_rate": 1.4178041799946653e-06, "loss": 0.0812, "num_input_tokens_seen": 58856296, "step": 87295 }, { "epoch": 2.1327535240514988, "grad_norm": 130.31716918945312, "learning_rate": 1.4177267006124064e-06, "loss": 0.0668, "num_input_tokens_seen": 58859688, "step": 87300 }, { "epoch": 2.132875674883346, "grad_norm": 73.06332397460938, "learning_rate": 1.4176492181923267e-06, "loss": 0.064, "num_input_tokens_seen": 58863656, "step": 87305 }, { "epoch": 2.132997825715193, "grad_norm": 0.1604689657688141, "learning_rate": 1.4175717327349893e-06, "loss": 0.0002, "num_input_tokens_seen": 58867304, "step": 87310 }, { "epoch": 2.1331199765470403, "grad_norm": 0.010926165618002415, "learning_rate": 1.417494244240958e-06, "loss": 0.0009, "num_input_tokens_seen": 58870952, "step": 87315 }, { "epoch": 2.1332421273788875, "grad_norm": 0.008155797608196735, "learning_rate": 1.4174167527107961e-06, "loss": 0.042, "num_input_tokens_seen": 58874600, "step": 87320 }, { "epoch": 2.1333642782107347, "grad_norm": 23.913652420043945, "learning_rate": 1.4173392581450674e-06, "loss": 0.0888, "num_input_tokens_seen": 58877864, "step": 87325 }, { "epoch": 2.133486429042582, "grad_norm": 0.018106626346707344, "learning_rate": 1.4172617605443353e-06, "loss": 0.0483, "num_input_tokens_seen": 58881000, "step": 87330 }, { "epoch": 2.133608579874429, "grad_norm": 0.0024407675955444574, "learning_rate": 1.4171842599091636e-06, "loss": 0.0001, "num_input_tokens_seen": 58884328, "step": 87335 }, { "epoch": 2.1337307307062763, "grad_norm": 0.4352200925350189, "learning_rate": 1.4171067562401157e-06, "loss": 0.0466, "num_input_tokens_seen": 58887400, "step": 87340 }, { "epoch": 2.1338528815381235, "grad_norm": 0.054799050092697144, "learning_rate": 1.4170292495377554e-06, "loss": 0.0215, "num_input_tokens_seen": 58891048, "step": 87345 }, { "epoch": 2.13397503236997, "grad_norm": 1.654451847076416, "learning_rate": 1.416951739802646e-06, "loss": 0.0014, "num_input_tokens_seen": 58894504, "step": 87350 }, { "epoch": 2.1340971832018174, "grad_norm": 0.7009220123291016, "learning_rate": 1.4168742270353515e-06, "loss": 0.0004, "num_input_tokens_seen": 58897704, "step": 87355 }, { "epoch": 2.1342193340336646, "grad_norm": 0.09969975799322128, "learning_rate": 1.4167967112364357e-06, "loss": 0.001, "num_input_tokens_seen": 58900776, "step": 87360 }, { "epoch": 2.134341484865512, "grad_norm": 0.19023500382900238, "learning_rate": 1.416719192406462e-06, "loss": 0.0003, "num_input_tokens_seen": 58904168, "step": 87365 }, { "epoch": 2.134463635697359, "grad_norm": 0.1158929094672203, "learning_rate": 1.4166416705459941e-06, "loss": 0.1239, "num_input_tokens_seen": 58907240, "step": 87370 }, { "epoch": 2.134585786529206, "grad_norm": 0.24258743226528168, "learning_rate": 1.4165641456555959e-06, "loss": 0.0004, "num_input_tokens_seen": 58910504, "step": 87375 }, { "epoch": 2.1347079373610534, "grad_norm": 0.011560074985027313, "learning_rate": 1.4164866177358312e-06, "loss": 0.0003, "num_input_tokens_seen": 58913896, "step": 87380 }, { "epoch": 2.1348300881929005, "grad_norm": 0.0013848430244252086, "learning_rate": 1.4164090867872638e-06, "loss": 0.0005, "num_input_tokens_seen": 58917480, "step": 87385 }, { "epoch": 2.1349522390247477, "grad_norm": 0.1996019333600998, "learning_rate": 1.4163315528104576e-06, "loss": 0.0001, "num_input_tokens_seen": 58921000, "step": 87390 }, { "epoch": 2.135074389856595, "grad_norm": 23.325389862060547, "learning_rate": 1.4162540158059765e-06, "loss": 0.0933, "num_input_tokens_seen": 58924008, "step": 87395 }, { "epoch": 2.135196540688442, "grad_norm": 0.017512032762169838, "learning_rate": 1.416176475774384e-06, "loss": 0.0377, "num_input_tokens_seen": 58927784, "step": 87400 }, { "epoch": 2.1353186915202893, "grad_norm": 0.0444110669195652, "learning_rate": 1.4160989327162443e-06, "loss": 0.0003, "num_input_tokens_seen": 58930920, "step": 87405 }, { "epoch": 2.1354408423521365, "grad_norm": 0.00017387409752700478, "learning_rate": 1.4160213866321216e-06, "loss": 0.0001, "num_input_tokens_seen": 58934248, "step": 87410 }, { "epoch": 2.1355629931839837, "grad_norm": 0.006010678131133318, "learning_rate": 1.4159438375225788e-06, "loss": 0.0001, "num_input_tokens_seen": 58937576, "step": 87415 }, { "epoch": 2.135685144015831, "grad_norm": 0.02587669901549816, "learning_rate": 1.4158662853881809e-06, "loss": 0.0002, "num_input_tokens_seen": 58940840, "step": 87420 }, { "epoch": 2.135807294847678, "grad_norm": 0.43707767128944397, "learning_rate": 1.4157887302294916e-06, "loss": 0.0003, "num_input_tokens_seen": 58943912, "step": 87425 }, { "epoch": 2.1359294456795253, "grad_norm": 0.0007447099778801203, "learning_rate": 1.4157111720470746e-06, "loss": 0.0692, "num_input_tokens_seen": 58947624, "step": 87430 }, { "epoch": 2.1360515965113724, "grad_norm": 0.004575371742248535, "learning_rate": 1.4156336108414944e-06, "loss": 0.0002, "num_input_tokens_seen": 58951080, "step": 87435 }, { "epoch": 2.1361737473432196, "grad_norm": 26.027252197265625, "learning_rate": 1.4155560466133146e-06, "loss": 0.0458, "num_input_tokens_seen": 58954920, "step": 87440 }, { "epoch": 2.1362958981750664, "grad_norm": 0.03521507978439331, "learning_rate": 1.4154784793630993e-06, "loss": 0.0405, "num_input_tokens_seen": 58958376, "step": 87445 }, { "epoch": 2.1364180490069136, "grad_norm": 38.19405746459961, "learning_rate": 1.415400909091413e-06, "loss": 0.068, "num_input_tokens_seen": 58961512, "step": 87450 }, { "epoch": 2.1365401998387608, "grad_norm": 0.009486167691648006, "learning_rate": 1.4153233357988197e-06, "loss": 0.0, "num_input_tokens_seen": 58964968, "step": 87455 }, { "epoch": 2.136662350670608, "grad_norm": 0.016952499747276306, "learning_rate": 1.4152457594858834e-06, "loss": 0.0001, "num_input_tokens_seen": 58968232, "step": 87460 }, { "epoch": 2.136784501502455, "grad_norm": 0.0029374232981354, "learning_rate": 1.415168180153168e-06, "loss": 0.0001, "num_input_tokens_seen": 58971688, "step": 87465 }, { "epoch": 2.1369066523343023, "grad_norm": 0.18648330867290497, "learning_rate": 1.415090597801238e-06, "loss": 0.0002, "num_input_tokens_seen": 58975016, "step": 87470 }, { "epoch": 2.1370288031661495, "grad_norm": 0.0014150391798466444, "learning_rate": 1.4150130124306574e-06, "loss": 0.0005, "num_input_tokens_seen": 58978408, "step": 87475 }, { "epoch": 2.1371509539979967, "grad_norm": 33.36257553100586, "learning_rate": 1.4149354240419906e-06, "loss": 0.3102, "num_input_tokens_seen": 58981416, "step": 87480 }, { "epoch": 2.137273104829844, "grad_norm": 0.2093086987733841, "learning_rate": 1.414857832635802e-06, "loss": 0.0778, "num_input_tokens_seen": 58984488, "step": 87485 }, { "epoch": 2.137395255661691, "grad_norm": 0.030884187668561935, "learning_rate": 1.4147802382126553e-06, "loss": 0.0527, "num_input_tokens_seen": 58988520, "step": 87490 }, { "epoch": 2.1375174064935383, "grad_norm": 0.04900249466300011, "learning_rate": 1.4147026407731156e-06, "loss": 0.0001, "num_input_tokens_seen": 58992040, "step": 87495 }, { "epoch": 2.1376395573253855, "grad_norm": 0.12208317965269089, "learning_rate": 1.4146250403177464e-06, "loss": 0.0316, "num_input_tokens_seen": 58995432, "step": 87500 }, { "epoch": 2.1377617081572327, "grad_norm": 6.88958740234375, "learning_rate": 1.4145474368471124e-06, "loss": 0.0014, "num_input_tokens_seen": 58999464, "step": 87505 }, { "epoch": 2.13788385898908, "grad_norm": 0.01789010316133499, "learning_rate": 1.4144698303617782e-06, "loss": 0.0005, "num_input_tokens_seen": 59002472, "step": 87510 }, { "epoch": 2.138006009820927, "grad_norm": 0.26290953159332275, "learning_rate": 1.4143922208623078e-06, "loss": 0.0044, "num_input_tokens_seen": 59005352, "step": 87515 }, { "epoch": 2.1381281606527742, "grad_norm": 0.011401143856346607, "learning_rate": 1.4143146083492656e-06, "loss": 0.0865, "num_input_tokens_seen": 59008808, "step": 87520 }, { "epoch": 2.1382503114846214, "grad_norm": 0.038983121514320374, "learning_rate": 1.4142369928232164e-06, "loss": 0.0002, "num_input_tokens_seen": 59012136, "step": 87525 }, { "epoch": 2.138372462316468, "grad_norm": 0.15900737047195435, "learning_rate": 1.414159374284724e-06, "loss": 0.0601, "num_input_tokens_seen": 59015656, "step": 87530 }, { "epoch": 2.1384946131483153, "grad_norm": 0.14519399404525757, "learning_rate": 1.4140817527343534e-06, "loss": 0.0923, "num_input_tokens_seen": 59019304, "step": 87535 }, { "epoch": 2.1386167639801625, "grad_norm": 0.28216826915740967, "learning_rate": 1.4140041281726686e-06, "loss": 0.0002, "num_input_tokens_seen": 59022696, "step": 87540 }, { "epoch": 2.1387389148120097, "grad_norm": 19.394559860229492, "learning_rate": 1.413926500600235e-06, "loss": 0.0358, "num_input_tokens_seen": 59026280, "step": 87545 }, { "epoch": 2.138861065643857, "grad_norm": 0.026544447988271713, "learning_rate": 1.4138488700176163e-06, "loss": 0.0001, "num_input_tokens_seen": 59029736, "step": 87550 }, { "epoch": 2.138983216475704, "grad_norm": 0.17702056467533112, "learning_rate": 1.4137712364253774e-06, "loss": 0.0029, "num_input_tokens_seen": 59033192, "step": 87555 }, { "epoch": 2.1391053673075513, "grad_norm": 0.13731959462165833, "learning_rate": 1.4136935998240827e-06, "loss": 0.0051, "num_input_tokens_seen": 59036072, "step": 87560 }, { "epoch": 2.1392275181393985, "grad_norm": 0.007630325388163328, "learning_rate": 1.413615960214297e-06, "loss": 0.0003, "num_input_tokens_seen": 59039144, "step": 87565 }, { "epoch": 2.1393496689712457, "grad_norm": 28.514596939086914, "learning_rate": 1.4135383175965844e-06, "loss": 0.0824, "num_input_tokens_seen": 59042792, "step": 87570 }, { "epoch": 2.139471819803093, "grad_norm": 0.005110455676913261, "learning_rate": 1.41346067197151e-06, "loss": 0.0279, "num_input_tokens_seen": 59045800, "step": 87575 }, { "epoch": 2.13959397063494, "grad_norm": 41.230953216552734, "learning_rate": 1.4133830233396386e-06, "loss": 0.0739, "num_input_tokens_seen": 59049000, "step": 87580 }, { "epoch": 2.1397161214667872, "grad_norm": 0.053071048110723495, "learning_rate": 1.413305371701535e-06, "loss": 0.0615, "num_input_tokens_seen": 59052136, "step": 87585 }, { "epoch": 2.1398382722986344, "grad_norm": 0.1270586997270584, "learning_rate": 1.413227717057763e-06, "loss": 0.0559, "num_input_tokens_seen": 59055400, "step": 87590 }, { "epoch": 2.1399604231304816, "grad_norm": 0.13777196407318115, "learning_rate": 1.413150059408888e-06, "loss": 0.0003, "num_input_tokens_seen": 59058536, "step": 87595 }, { "epoch": 2.140082573962329, "grad_norm": 0.22946496307849884, "learning_rate": 1.4130723987554747e-06, "loss": 0.0394, "num_input_tokens_seen": 59061416, "step": 87600 }, { "epoch": 2.140204724794176, "grad_norm": 164.1780242919922, "learning_rate": 1.4129947350980878e-06, "loss": 0.121, "num_input_tokens_seen": 59064488, "step": 87605 }, { "epoch": 2.140326875626023, "grad_norm": 0.3987724184989929, "learning_rate": 1.4129170684372921e-06, "loss": 0.076, "num_input_tokens_seen": 59067880, "step": 87610 }, { "epoch": 2.1404490264578704, "grad_norm": 0.3457280993461609, "learning_rate": 1.4128393987736526e-06, "loss": 0.0013, "num_input_tokens_seen": 59070888, "step": 87615 }, { "epoch": 2.1405711772897176, "grad_norm": 0.03645173832774162, "learning_rate": 1.412761726107734e-06, "loss": 0.0106, "num_input_tokens_seen": 59073960, "step": 87620 }, { "epoch": 2.1406933281215643, "grad_norm": 0.3854512870311737, "learning_rate": 1.412684050440101e-06, "loss": 0.0318, "num_input_tokens_seen": 59077416, "step": 87625 }, { "epoch": 2.1408154789534115, "grad_norm": 0.014370037242770195, "learning_rate": 1.4126063717713183e-06, "loss": 0.0445, "num_input_tokens_seen": 59080680, "step": 87630 }, { "epoch": 2.1409376297852587, "grad_norm": 0.035824961960315704, "learning_rate": 1.4125286901019513e-06, "loss": 0.0002, "num_input_tokens_seen": 59084008, "step": 87635 }, { "epoch": 2.141059780617106, "grad_norm": 0.15429726243019104, "learning_rate": 1.4124510054325648e-06, "loss": 0.0002, "num_input_tokens_seen": 59087272, "step": 87640 }, { "epoch": 2.141181931448953, "grad_norm": 0.05572713911533356, "learning_rate": 1.4123733177637236e-06, "loss": 0.0414, "num_input_tokens_seen": 59090920, "step": 87645 }, { "epoch": 2.1413040822808003, "grad_norm": 0.38019487261772156, "learning_rate": 1.4122956270959927e-06, "loss": 0.0004, "num_input_tokens_seen": 59094440, "step": 87650 }, { "epoch": 2.1414262331126475, "grad_norm": 0.1866544634103775, "learning_rate": 1.412217933429937e-06, "loss": 0.0589, "num_input_tokens_seen": 59097704, "step": 87655 }, { "epoch": 2.1415483839444946, "grad_norm": 0.01762351393699646, "learning_rate": 1.4121402367661217e-06, "loss": 0.0003, "num_input_tokens_seen": 59100712, "step": 87660 }, { "epoch": 2.141670534776342, "grad_norm": 0.14558862149715424, "learning_rate": 1.4120625371051119e-06, "loss": 0.0344, "num_input_tokens_seen": 59104040, "step": 87665 }, { "epoch": 2.141792685608189, "grad_norm": 0.004253696650266647, "learning_rate": 1.4119848344474723e-06, "loss": 0.0003, "num_input_tokens_seen": 59107432, "step": 87670 }, { "epoch": 2.141914836440036, "grad_norm": 0.03200583904981613, "learning_rate": 1.4119071287937683e-06, "loss": 0.0582, "num_input_tokens_seen": 59110888, "step": 87675 }, { "epoch": 2.1420369872718834, "grad_norm": 0.024416061118245125, "learning_rate": 1.4118294201445648e-06, "loss": 0.0766, "num_input_tokens_seen": 59114024, "step": 87680 }, { "epoch": 2.1421591381037306, "grad_norm": 0.23416262865066528, "learning_rate": 1.411751708500427e-06, "loss": 0.0003, "num_input_tokens_seen": 59117160, "step": 87685 }, { "epoch": 2.142281288935578, "grad_norm": 0.008143766783177853, "learning_rate": 1.41167399386192e-06, "loss": 0.0003, "num_input_tokens_seen": 59120360, "step": 87690 }, { "epoch": 2.142403439767425, "grad_norm": 0.030633976683020592, "learning_rate": 1.4115962762296088e-06, "loss": 0.0321, "num_input_tokens_seen": 59123496, "step": 87695 }, { "epoch": 2.142525590599272, "grad_norm": 0.056163497269153595, "learning_rate": 1.411518555604059e-06, "loss": 0.001, "num_input_tokens_seen": 59126952, "step": 87700 }, { "epoch": 2.1426477414311194, "grad_norm": 0.0033839961979538202, "learning_rate": 1.4114408319858355e-06, "loss": 0.0225, "num_input_tokens_seen": 59130920, "step": 87705 }, { "epoch": 2.142769892262966, "grad_norm": 633.6400756835938, "learning_rate": 1.4113631053755037e-06, "loss": 0.0648, "num_input_tokens_seen": 59133992, "step": 87710 }, { "epoch": 2.1428920430948133, "grad_norm": 14.898810386657715, "learning_rate": 1.4112853757736288e-06, "loss": 0.0513, "num_input_tokens_seen": 59137448, "step": 87715 }, { "epoch": 2.1430141939266605, "grad_norm": 1.018584132194519, "learning_rate": 1.411207643180776e-06, "loss": 0.0005, "num_input_tokens_seen": 59140712, "step": 87720 }, { "epoch": 2.1431363447585077, "grad_norm": 0.012547546997666359, "learning_rate": 1.4111299075975103e-06, "loss": 0.0001, "num_input_tokens_seen": 59143976, "step": 87725 }, { "epoch": 2.143258495590355, "grad_norm": 0.04866180196404457, "learning_rate": 1.4110521690243977e-06, "loss": 0.0377, "num_input_tokens_seen": 59147944, "step": 87730 }, { "epoch": 2.143380646422202, "grad_norm": 0.005499622318893671, "learning_rate": 1.4109744274620031e-06, "loss": 0.0271, "num_input_tokens_seen": 59151272, "step": 87735 }, { "epoch": 2.1435027972540492, "grad_norm": 0.0666104257106781, "learning_rate": 1.410896682910892e-06, "loss": 0.0003, "num_input_tokens_seen": 59154792, "step": 87740 }, { "epoch": 2.1436249480858964, "grad_norm": 0.014532984234392643, "learning_rate": 1.4108189353716292e-06, "loss": 0.0006, "num_input_tokens_seen": 59158184, "step": 87745 }, { "epoch": 2.1437470989177436, "grad_norm": 0.028684677556157112, "learning_rate": 1.4107411848447813e-06, "loss": 0.0001, "num_input_tokens_seen": 59161512, "step": 87750 }, { "epoch": 2.143869249749591, "grad_norm": 26.549978256225586, "learning_rate": 1.4106634313309124e-06, "loss": 0.0964, "num_input_tokens_seen": 59164456, "step": 87755 }, { "epoch": 2.143991400581438, "grad_norm": 67.54285430908203, "learning_rate": 1.4105856748305889e-06, "loss": 0.1318, "num_input_tokens_seen": 59167784, "step": 87760 }, { "epoch": 2.144113551413285, "grad_norm": 0.00807912740856409, "learning_rate": 1.410507915344376e-06, "loss": 0.0, "num_input_tokens_seen": 59170792, "step": 87765 }, { "epoch": 2.1442357022451324, "grad_norm": 0.05754079297184944, "learning_rate": 1.4104301528728393e-06, "loss": 0.0351, "num_input_tokens_seen": 59174440, "step": 87770 }, { "epoch": 2.1443578530769796, "grad_norm": 0.09071355313062668, "learning_rate": 1.410352387416544e-06, "loss": 0.049, "num_input_tokens_seen": 59178216, "step": 87775 }, { "epoch": 2.1444800039088268, "grad_norm": 0.04613567888736725, "learning_rate": 1.4102746189760555e-06, "loss": 0.0053, "num_input_tokens_seen": 59181736, "step": 87780 }, { "epoch": 2.144602154740674, "grad_norm": 0.14142712950706482, "learning_rate": 1.4101968475519398e-06, "loss": 0.0006, "num_input_tokens_seen": 59185512, "step": 87785 }, { "epoch": 2.144724305572521, "grad_norm": 0.06886684149503708, "learning_rate": 1.410119073144762e-06, "loss": 0.0002, "num_input_tokens_seen": 59189096, "step": 87790 }, { "epoch": 2.144846456404368, "grad_norm": 0.0034576309844851494, "learning_rate": 1.4100412957550884e-06, "loss": 0.0262, "num_input_tokens_seen": 59192296, "step": 87795 }, { "epoch": 2.144968607236215, "grad_norm": 0.015819577500224113, "learning_rate": 1.4099635153834842e-06, "loss": 0.0, "num_input_tokens_seen": 59195560, "step": 87800 }, { "epoch": 2.1450907580680623, "grad_norm": 0.0029255012050271034, "learning_rate": 1.409885732030515e-06, "loss": 0.0383, "num_input_tokens_seen": 59198504, "step": 87805 }, { "epoch": 2.1452129088999095, "grad_norm": 0.021271195262670517, "learning_rate": 1.4098079456967462e-06, "loss": 0.0001, "num_input_tokens_seen": 59202152, "step": 87810 }, { "epoch": 2.1453350597317566, "grad_norm": 0.294455349445343, "learning_rate": 1.4097301563827443e-06, "loss": 0.0004, "num_input_tokens_seen": 59205480, "step": 87815 }, { "epoch": 2.145457210563604, "grad_norm": 0.002129147993400693, "learning_rate": 1.409652364089074e-06, "loss": 0.0001, "num_input_tokens_seen": 59208488, "step": 87820 }, { "epoch": 2.145579361395451, "grad_norm": 0.0035055570770055056, "learning_rate": 1.4095745688163016e-06, "loss": 0.073, "num_input_tokens_seen": 59211752, "step": 87825 }, { "epoch": 2.145701512227298, "grad_norm": 0.0019378452561795712, "learning_rate": 1.4094967705649932e-06, "loss": 0.0001, "num_input_tokens_seen": 59214888, "step": 87830 }, { "epoch": 2.1458236630591454, "grad_norm": 0.006830559112131596, "learning_rate": 1.4094189693357138e-06, "loss": 0.0001, "num_input_tokens_seen": 59218152, "step": 87835 }, { "epoch": 2.1459458138909926, "grad_norm": 0.024698445573449135, "learning_rate": 1.4093411651290295e-06, "loss": 0.0577, "num_input_tokens_seen": 59221736, "step": 87840 }, { "epoch": 2.14606796472284, "grad_norm": 0.009774798527359962, "learning_rate": 1.4092633579455062e-06, "loss": 0.0001, "num_input_tokens_seen": 59224936, "step": 87845 }, { "epoch": 2.146190115554687, "grad_norm": 0.0053397067822515965, "learning_rate": 1.4091855477857099e-06, "loss": 0.1333, "num_input_tokens_seen": 59228200, "step": 87850 }, { "epoch": 2.146312266386534, "grad_norm": 0.003685411997139454, "learning_rate": 1.4091077346502059e-06, "loss": 0.0534, "num_input_tokens_seen": 59231592, "step": 87855 }, { "epoch": 2.1464344172183814, "grad_norm": 0.5126121640205383, "learning_rate": 1.4090299185395607e-06, "loss": 0.0003, "num_input_tokens_seen": 59235112, "step": 87860 }, { "epoch": 2.1465565680502285, "grad_norm": 0.0015000292332842946, "learning_rate": 1.4089520994543395e-06, "loss": 0.0397, "num_input_tokens_seen": 59239016, "step": 87865 }, { "epoch": 2.1466787188820757, "grad_norm": 0.06686501950025558, "learning_rate": 1.408874277395109e-06, "loss": 0.0015, "num_input_tokens_seen": 59242344, "step": 87870 }, { "epoch": 2.146800869713923, "grad_norm": 0.029950261116027832, "learning_rate": 1.4087964523624352e-06, "loss": 0.0439, "num_input_tokens_seen": 59246312, "step": 87875 }, { "epoch": 2.14692302054577, "grad_norm": 178.24267578125, "learning_rate": 1.408718624356883e-06, "loss": 0.1187, "num_input_tokens_seen": 59249896, "step": 87880 }, { "epoch": 2.1470451713776173, "grad_norm": 0.7557882070541382, "learning_rate": 1.4086407933790189e-06, "loss": 0.0004, "num_input_tokens_seen": 59255080, "step": 87885 }, { "epoch": 2.147167322209464, "grad_norm": 0.2542136609554291, "learning_rate": 1.4085629594294094e-06, "loss": 0.0009, "num_input_tokens_seen": 59258216, "step": 87890 }, { "epoch": 2.1472894730413112, "grad_norm": 95.09031677246094, "learning_rate": 1.4084851225086204e-06, "loss": 0.0389, "num_input_tokens_seen": 59261352, "step": 87895 }, { "epoch": 2.1474116238731584, "grad_norm": 0.16496166586875916, "learning_rate": 1.4084072826172171e-06, "loss": 0.0003, "num_input_tokens_seen": 59264296, "step": 87900 }, { "epoch": 2.1475337747050056, "grad_norm": 0.011495031416416168, "learning_rate": 1.4083294397557665e-06, "loss": 0.0001, "num_input_tokens_seen": 59268072, "step": 87905 }, { "epoch": 2.147655925536853, "grad_norm": 0.06308521330356598, "learning_rate": 1.4082515939248342e-06, "loss": 0.0001, "num_input_tokens_seen": 59271208, "step": 87910 }, { "epoch": 2.1477780763687, "grad_norm": 0.0014856046764180064, "learning_rate": 1.4081737451249868e-06, "loss": 0.0001, "num_input_tokens_seen": 59274216, "step": 87915 }, { "epoch": 2.147900227200547, "grad_norm": 0.0024500424042344093, "learning_rate": 1.4080958933567901e-06, "loss": 0.0, "num_input_tokens_seen": 59277864, "step": 87920 }, { "epoch": 2.1480223780323944, "grad_norm": 0.024505984038114548, "learning_rate": 1.4080180386208105e-06, "loss": 0.0003, "num_input_tokens_seen": 59281384, "step": 87925 }, { "epoch": 2.1481445288642416, "grad_norm": 0.012265844270586967, "learning_rate": 1.4079401809176136e-06, "loss": 0.0001, "num_input_tokens_seen": 59285096, "step": 87930 }, { "epoch": 2.1482666796960888, "grad_norm": 0.006017817184329033, "learning_rate": 1.4078623202477662e-06, "loss": 0.0002, "num_input_tokens_seen": 59288552, "step": 87935 }, { "epoch": 2.148388830527936, "grad_norm": 0.29101741313934326, "learning_rate": 1.407784456611834e-06, "loss": 0.0002, "num_input_tokens_seen": 59291752, "step": 87940 }, { "epoch": 2.148510981359783, "grad_norm": 0.03301127254962921, "learning_rate": 1.4077065900103836e-06, "loss": 0.0, "num_input_tokens_seen": 59295400, "step": 87945 }, { "epoch": 2.1486331321916303, "grad_norm": 0.001643263385631144, "learning_rate": 1.4076287204439817e-06, "loss": 0.0437, "num_input_tokens_seen": 59298728, "step": 87950 }, { "epoch": 2.1487552830234775, "grad_norm": 0.039305005222558975, "learning_rate": 1.4075508479131936e-06, "loss": 0.0001, "num_input_tokens_seen": 59302376, "step": 87955 }, { "epoch": 2.1488774338553247, "grad_norm": 56.951690673828125, "learning_rate": 1.4074729724185864e-06, "loss": 0.0992, "num_input_tokens_seen": 59305384, "step": 87960 }, { "epoch": 2.148999584687172, "grad_norm": 0.00010884566290769726, "learning_rate": 1.407395093960726e-06, "loss": 0.0002, "num_input_tokens_seen": 59309032, "step": 87965 }, { "epoch": 2.149121735519019, "grad_norm": 44.920066833496094, "learning_rate": 1.4073172125401792e-06, "loss": 0.1488, "num_input_tokens_seen": 59312552, "step": 87970 }, { "epoch": 2.149243886350866, "grad_norm": 0.010293465107679367, "learning_rate": 1.4072393281575117e-06, "loss": 0.0, "num_input_tokens_seen": 59316264, "step": 87975 }, { "epoch": 2.149366037182713, "grad_norm": 27.4898624420166, "learning_rate": 1.4071614408132903e-06, "loss": 0.1237, "num_input_tokens_seen": 59319592, "step": 87980 }, { "epoch": 2.14948818801456, "grad_norm": 0.035651687532663345, "learning_rate": 1.4070835505080816e-06, "loss": 0.0001, "num_input_tokens_seen": 59322920, "step": 87985 }, { "epoch": 2.1496103388464074, "grad_norm": 699.4857788085938, "learning_rate": 1.4070056572424519e-06, "loss": 0.0455, "num_input_tokens_seen": 59326056, "step": 87990 }, { "epoch": 2.1497324896782546, "grad_norm": 0.002766907447949052, "learning_rate": 1.4069277610169672e-06, "loss": 0.0002, "num_input_tokens_seen": 59330792, "step": 87995 }, { "epoch": 2.1498546405101018, "grad_norm": 0.7083733677864075, "learning_rate": 1.4068498618321946e-06, "loss": 0.0004, "num_input_tokens_seen": 59333736, "step": 88000 }, { "epoch": 2.149976791341949, "grad_norm": 0.010112437419593334, "learning_rate": 1.4067719596887003e-06, "loss": 0.0963, "num_input_tokens_seen": 59337128, "step": 88005 }, { "epoch": 2.150098942173796, "grad_norm": 0.00013781416055280715, "learning_rate": 1.4066940545870506e-06, "loss": 0.0003, "num_input_tokens_seen": 59340584, "step": 88010 }, { "epoch": 2.1502210930056433, "grad_norm": 0.02945570833981037, "learning_rate": 1.406616146527813e-06, "loss": 0.0193, "num_input_tokens_seen": 59343848, "step": 88015 }, { "epoch": 2.1503432438374905, "grad_norm": 0.03614374250173569, "learning_rate": 1.4065382355115532e-06, "loss": 0.026, "num_input_tokens_seen": 59347240, "step": 88020 }, { "epoch": 2.1504653946693377, "grad_norm": 0.17033955454826355, "learning_rate": 1.4064603215388378e-06, "loss": 0.0296, "num_input_tokens_seen": 59351016, "step": 88025 }, { "epoch": 2.150587545501185, "grad_norm": 0.009592707268893719, "learning_rate": 1.4063824046102338e-06, "loss": 0.0935, "num_input_tokens_seen": 59354536, "step": 88030 }, { "epoch": 2.150709696333032, "grad_norm": 0.008734858594834805, "learning_rate": 1.4063044847263074e-06, "loss": 0.0001, "num_input_tokens_seen": 59357544, "step": 88035 }, { "epoch": 2.1508318471648793, "grad_norm": 0.04149794578552246, "learning_rate": 1.4062265618876258e-06, "loss": 0.1847, "num_input_tokens_seen": 59360808, "step": 88040 }, { "epoch": 2.1509539979967265, "grad_norm": 0.012698042206466198, "learning_rate": 1.4061486360947555e-06, "loss": 0.0882, "num_input_tokens_seen": 59363880, "step": 88045 }, { "epoch": 2.1510761488285737, "grad_norm": 0.13717693090438843, "learning_rate": 1.4060707073482628e-06, "loss": 0.0865, "num_input_tokens_seen": 59367144, "step": 88050 }, { "epoch": 2.151198299660421, "grad_norm": 0.03378063067793846, "learning_rate": 1.4059927756487147e-06, "loss": 0.0001, "num_input_tokens_seen": 59370664, "step": 88055 }, { "epoch": 2.151320450492268, "grad_norm": 0.011890546418726444, "learning_rate": 1.4059148409966778e-06, "loss": 0.0487, "num_input_tokens_seen": 59373928, "step": 88060 }, { "epoch": 2.1514426013241152, "grad_norm": 0.07420341670513153, "learning_rate": 1.405836903392719e-06, "loss": 0.0006, "num_input_tokens_seen": 59377064, "step": 88065 }, { "epoch": 2.151564752155962, "grad_norm": 0.07378300279378891, "learning_rate": 1.4057589628374053e-06, "loss": 0.0003, "num_input_tokens_seen": 59380776, "step": 88070 }, { "epoch": 2.151686902987809, "grad_norm": 0.03141964226961136, "learning_rate": 1.4056810193313034e-06, "loss": 0.0338, "num_input_tokens_seen": 59383720, "step": 88075 }, { "epoch": 2.1518090538196564, "grad_norm": 0.052101925015449524, "learning_rate": 1.40560307287498e-06, "loss": 0.0002, "num_input_tokens_seen": 59386792, "step": 88080 }, { "epoch": 2.1519312046515036, "grad_norm": 0.031124841421842575, "learning_rate": 1.405525123469002e-06, "loss": 0.0005, "num_input_tokens_seen": 59389800, "step": 88085 }, { "epoch": 2.1520533554833507, "grad_norm": 0.329045832157135, "learning_rate": 1.405447171113936e-06, "loss": 0.0629, "num_input_tokens_seen": 59392936, "step": 88090 }, { "epoch": 2.152175506315198, "grad_norm": 0.007800894323736429, "learning_rate": 1.405369215810349e-06, "loss": 0.0463, "num_input_tokens_seen": 59396008, "step": 88095 }, { "epoch": 2.152297657147045, "grad_norm": 0.04379410296678543, "learning_rate": 1.405291257558808e-06, "loss": 0.0004, "num_input_tokens_seen": 59399208, "step": 88100 }, { "epoch": 2.1524198079788923, "grad_norm": 0.1042783185839653, "learning_rate": 1.4052132963598804e-06, "loss": 0.0001, "num_input_tokens_seen": 59403048, "step": 88105 }, { "epoch": 2.1525419588107395, "grad_norm": 57.833778381347656, "learning_rate": 1.4051353322141324e-06, "loss": 0.0848, "num_input_tokens_seen": 59406312, "step": 88110 }, { "epoch": 2.1526641096425867, "grad_norm": 0.002779137110337615, "learning_rate": 1.4050573651221313e-06, "loss": 0.0654, "num_input_tokens_seen": 59409512, "step": 88115 }, { "epoch": 2.152786260474434, "grad_norm": 0.007450256962329149, "learning_rate": 1.404979395084444e-06, "loss": 0.0005, "num_input_tokens_seen": 59412712, "step": 88120 }, { "epoch": 2.152908411306281, "grad_norm": 0.028415225446224213, "learning_rate": 1.404901422101638e-06, "loss": 0.0003, "num_input_tokens_seen": 59415976, "step": 88125 }, { "epoch": 2.1530305621381283, "grad_norm": 20.51732063293457, "learning_rate": 1.4048234461742798e-06, "loss": 0.0564, "num_input_tokens_seen": 59418984, "step": 88130 }, { "epoch": 2.1531527129699755, "grad_norm": 3.080681324005127, "learning_rate": 1.4047454673029366e-06, "loss": 0.0007, "num_input_tokens_seen": 59422440, "step": 88135 }, { "epoch": 2.1532748638018226, "grad_norm": 0.10545322299003601, "learning_rate": 1.4046674854881756e-06, "loss": 0.1477, "num_input_tokens_seen": 59425704, "step": 88140 }, { "epoch": 2.15339701463367, "grad_norm": 0.01797093264758587, "learning_rate": 1.4045895007305635e-06, "loss": 0.0002, "num_input_tokens_seen": 59428712, "step": 88145 }, { "epoch": 2.153519165465517, "grad_norm": 0.0754646360874176, "learning_rate": 1.4045115130306679e-06, "loss": 0.0002, "num_input_tokens_seen": 59431720, "step": 88150 }, { "epoch": 2.1536413162973638, "grad_norm": 0.01984359510242939, "learning_rate": 1.4044335223890557e-06, "loss": 0.0198, "num_input_tokens_seen": 59435048, "step": 88155 }, { "epoch": 2.153763467129211, "grad_norm": 0.004418569151312113, "learning_rate": 1.4043555288062941e-06, "loss": 0.0001, "num_input_tokens_seen": 59438760, "step": 88160 }, { "epoch": 2.153885617961058, "grad_norm": 11.380366325378418, "learning_rate": 1.4042775322829506e-06, "loss": 0.0004, "num_input_tokens_seen": 59441896, "step": 88165 }, { "epoch": 2.1540077687929053, "grad_norm": 0.07178980112075806, "learning_rate": 1.4041995328195919e-06, "loss": 0.0868, "num_input_tokens_seen": 59445224, "step": 88170 }, { "epoch": 2.1541299196247525, "grad_norm": 0.0017138965195044875, "learning_rate": 1.4041215304167855e-06, "loss": 0.0001, "num_input_tokens_seen": 59448104, "step": 88175 }, { "epoch": 2.1542520704565997, "grad_norm": 0.012803579680621624, "learning_rate": 1.4040435250750988e-06, "loss": 0.0001, "num_input_tokens_seen": 59451688, "step": 88180 }, { "epoch": 2.154374221288447, "grad_norm": 0.008812888525426388, "learning_rate": 1.4039655167950987e-06, "loss": 0.0001, "num_input_tokens_seen": 59454760, "step": 88185 }, { "epoch": 2.154496372120294, "grad_norm": 5.979305267333984, "learning_rate": 1.403887505577353e-06, "loss": 0.0007, "num_input_tokens_seen": 59458024, "step": 88190 }, { "epoch": 2.1546185229521413, "grad_norm": 16.921903610229492, "learning_rate": 1.4038094914224285e-06, "loss": 0.0656, "num_input_tokens_seen": 59461288, "step": 88195 }, { "epoch": 2.1547406737839885, "grad_norm": 0.007237650454044342, "learning_rate": 1.4037314743308928e-06, "loss": 0.0229, "num_input_tokens_seen": 59465000, "step": 88200 }, { "epoch": 2.1548628246158357, "grad_norm": 0.024005282670259476, "learning_rate": 1.4036534543033133e-06, "loss": 0.0001, "num_input_tokens_seen": 59468264, "step": 88205 }, { "epoch": 2.154984975447683, "grad_norm": 0.01715189404785633, "learning_rate": 1.4035754313402573e-06, "loss": 0.0668, "num_input_tokens_seen": 59471848, "step": 88210 }, { "epoch": 2.15510712627953, "grad_norm": 0.004532475955784321, "learning_rate": 1.403497405442292e-06, "loss": 0.2547, "num_input_tokens_seen": 59475304, "step": 88215 }, { "epoch": 2.1552292771113772, "grad_norm": 1.9837899208068848, "learning_rate": 1.403419376609985e-06, "loss": 0.0004, "num_input_tokens_seen": 59478696, "step": 88220 }, { "epoch": 2.1553514279432244, "grad_norm": 0.04568152502179146, "learning_rate": 1.4033413448439042e-06, "loss": 0.0002, "num_input_tokens_seen": 59482216, "step": 88225 }, { "epoch": 2.1554735787750716, "grad_norm": 0.12637126445770264, "learning_rate": 1.4032633101446166e-06, "loss": 0.0004, "num_input_tokens_seen": 59485672, "step": 88230 }, { "epoch": 2.155595729606919, "grad_norm": 0.05594087392091751, "learning_rate": 1.4031852725126897e-06, "loss": 0.0825, "num_input_tokens_seen": 59488808, "step": 88235 }, { "epoch": 2.1557178804387656, "grad_norm": 0.010974193923175335, "learning_rate": 1.4031072319486907e-06, "loss": 0.1709, "num_input_tokens_seen": 59492072, "step": 88240 }, { "epoch": 2.155840031270613, "grad_norm": 0.011812552809715271, "learning_rate": 1.403029188453188e-06, "loss": 0.0001, "num_input_tokens_seen": 59495976, "step": 88245 }, { "epoch": 2.15596218210246, "grad_norm": 0.008403713814914227, "learning_rate": 1.4029511420267484e-06, "loss": 0.0004, "num_input_tokens_seen": 59499240, "step": 88250 }, { "epoch": 2.156084332934307, "grad_norm": 0.17212870717048645, "learning_rate": 1.4028730926699395e-06, "loss": 0.0003, "num_input_tokens_seen": 59502184, "step": 88255 }, { "epoch": 2.1562064837661543, "grad_norm": 0.020884985104203224, "learning_rate": 1.4027950403833294e-06, "loss": 0.0001, "num_input_tokens_seen": 59506024, "step": 88260 }, { "epoch": 2.1563286345980015, "grad_norm": 0.03501040115952492, "learning_rate": 1.4027169851674851e-06, "loss": 0.0002, "num_input_tokens_seen": 59509032, "step": 88265 }, { "epoch": 2.1564507854298487, "grad_norm": 0.7422922849655151, "learning_rate": 1.402638927022975e-06, "loss": 0.0009, "num_input_tokens_seen": 59512488, "step": 88270 }, { "epoch": 2.156572936261696, "grad_norm": 0.024366816505789757, "learning_rate": 1.402560865950366e-06, "loss": 0.0002, "num_input_tokens_seen": 59515816, "step": 88275 }, { "epoch": 2.156695087093543, "grad_norm": 0.0642450600862503, "learning_rate": 1.4024828019502262e-06, "loss": 0.0007, "num_input_tokens_seen": 59519016, "step": 88280 }, { "epoch": 2.1568172379253903, "grad_norm": 75.12712097167969, "learning_rate": 1.4024047350231234e-06, "loss": 0.0403, "num_input_tokens_seen": 59522024, "step": 88285 }, { "epoch": 2.1569393887572375, "grad_norm": 0.1974213570356369, "learning_rate": 1.4023266651696249e-06, "loss": 0.0002, "num_input_tokens_seen": 59525224, "step": 88290 }, { "epoch": 2.1570615395890846, "grad_norm": 0.040102358907461166, "learning_rate": 1.4022485923902988e-06, "loss": 0.0436, "num_input_tokens_seen": 59528424, "step": 88295 }, { "epoch": 2.157183690420932, "grad_norm": 0.014993073418736458, "learning_rate": 1.4021705166857126e-06, "loss": 0.0001, "num_input_tokens_seen": 59532072, "step": 88300 }, { "epoch": 2.157305841252779, "grad_norm": 0.008291895501315594, "learning_rate": 1.4020924380564342e-06, "loss": 0.0001, "num_input_tokens_seen": 59535208, "step": 88305 }, { "epoch": 2.157427992084626, "grad_norm": 303.73114013671875, "learning_rate": 1.4020143565030318e-06, "loss": 0.0055, "num_input_tokens_seen": 59538536, "step": 88310 }, { "epoch": 2.1575501429164734, "grad_norm": 103.22540283203125, "learning_rate": 1.4019362720260723e-06, "loss": 0.0424, "num_input_tokens_seen": 59541544, "step": 88315 }, { "epoch": 2.1576722937483206, "grad_norm": 0.002675070893019438, "learning_rate": 1.4018581846261246e-06, "loss": 0.0404, "num_input_tokens_seen": 59545128, "step": 88320 }, { "epoch": 2.157794444580168, "grad_norm": 0.10408946126699448, "learning_rate": 1.4017800943037558e-06, "loss": 0.0001, "num_input_tokens_seen": 59548712, "step": 88325 }, { "epoch": 2.157916595412015, "grad_norm": 0.07306545227766037, "learning_rate": 1.4017020010595344e-06, "loss": 0.0377, "num_input_tokens_seen": 59551912, "step": 88330 }, { "epoch": 2.1580387462438617, "grad_norm": 0.04442272335290909, "learning_rate": 1.401623904894028e-06, "loss": 0.0001, "num_input_tokens_seen": 59554984, "step": 88335 }, { "epoch": 2.158160897075709, "grad_norm": 48.30341720581055, "learning_rate": 1.4015458058078042e-06, "loss": 0.049, "num_input_tokens_seen": 59558312, "step": 88340 }, { "epoch": 2.158283047907556, "grad_norm": 0.021329158917069435, "learning_rate": 1.4014677038014315e-06, "loss": 0.0583, "num_input_tokens_seen": 59561448, "step": 88345 }, { "epoch": 2.1584051987394033, "grad_norm": 0.03582082316279411, "learning_rate": 1.4013895988754776e-06, "loss": 0.0027, "num_input_tokens_seen": 59564264, "step": 88350 }, { "epoch": 2.1585273495712505, "grad_norm": 0.03441407531499863, "learning_rate": 1.4013114910305107e-06, "loss": 0.0367, "num_input_tokens_seen": 59567848, "step": 88355 }, { "epoch": 2.1586495004030977, "grad_norm": 0.009749101474881172, "learning_rate": 1.4012333802670985e-06, "loss": 0.0436, "num_input_tokens_seen": 59572136, "step": 88360 }, { "epoch": 2.158771651234945, "grad_norm": 0.00150212156586349, "learning_rate": 1.4011552665858094e-06, "loss": 0.0001, "num_input_tokens_seen": 59575784, "step": 88365 }, { "epoch": 2.158893802066792, "grad_norm": 0.0041437577456235886, "learning_rate": 1.4010771499872114e-06, "loss": 0.0019, "num_input_tokens_seen": 59579112, "step": 88370 }, { "epoch": 2.1590159528986392, "grad_norm": 0.01603412814438343, "learning_rate": 1.4009990304718722e-06, "loss": 0.056, "num_input_tokens_seen": 59582184, "step": 88375 }, { "epoch": 2.1591381037304864, "grad_norm": 0.05813143774867058, "learning_rate": 1.4009209080403603e-06, "loss": 0.0347, "num_input_tokens_seen": 59584936, "step": 88380 }, { "epoch": 2.1592602545623336, "grad_norm": 0.04797271639108658, "learning_rate": 1.400842782693244e-06, "loss": 0.0294, "num_input_tokens_seen": 59588200, "step": 88385 }, { "epoch": 2.159382405394181, "grad_norm": 14.291007041931152, "learning_rate": 1.4007646544310912e-06, "loss": 0.0552, "num_input_tokens_seen": 59591528, "step": 88390 }, { "epoch": 2.159504556226028, "grad_norm": 0.10813934355974197, "learning_rate": 1.4006865232544696e-06, "loss": 0.0003, "num_input_tokens_seen": 59595112, "step": 88395 }, { "epoch": 2.159626707057875, "grad_norm": 0.24442404508590698, "learning_rate": 1.4006083891639481e-06, "loss": 0.1146, "num_input_tokens_seen": 59598632, "step": 88400 }, { "epoch": 2.1597488578897224, "grad_norm": 0.008931396529078484, "learning_rate": 1.4005302521600945e-06, "loss": 0.0001, "num_input_tokens_seen": 59602024, "step": 88405 }, { "epoch": 2.1598710087215696, "grad_norm": 0.06182999163866043, "learning_rate": 1.4004521122434772e-06, "loss": 0.0182, "num_input_tokens_seen": 59605352, "step": 88410 }, { "epoch": 2.1599931595534168, "grad_norm": 0.007702971808612347, "learning_rate": 1.4003739694146644e-06, "loss": 0.0001, "num_input_tokens_seen": 59608296, "step": 88415 }, { "epoch": 2.1601153103852635, "grad_norm": 0.16781401634216309, "learning_rate": 1.4002958236742246e-06, "loss": 0.0517, "num_input_tokens_seen": 59611368, "step": 88420 }, { "epoch": 2.1602374612171107, "grad_norm": 0.08598300069570541, "learning_rate": 1.4002176750227257e-06, "loss": 0.0537, "num_input_tokens_seen": 59614440, "step": 88425 }, { "epoch": 2.160359612048958, "grad_norm": 0.02408665232360363, "learning_rate": 1.4001395234607362e-06, "loss": 0.0573, "num_input_tokens_seen": 59617832, "step": 88430 }, { "epoch": 2.160481762880805, "grad_norm": 0.01334523968398571, "learning_rate": 1.4000613689888248e-06, "loss": 0.0303, "num_input_tokens_seen": 59621224, "step": 88435 }, { "epoch": 2.1606039137126523, "grad_norm": 0.004405375570058823, "learning_rate": 1.399983211607559e-06, "loss": 0.0767, "num_input_tokens_seen": 59624360, "step": 88440 }, { "epoch": 2.1607260645444994, "grad_norm": 0.02678143046796322, "learning_rate": 1.3999050513175081e-06, "loss": 0.0001, "num_input_tokens_seen": 59627496, "step": 88445 }, { "epoch": 2.1608482153763466, "grad_norm": 0.0012523168697953224, "learning_rate": 1.39982688811924e-06, "loss": 0.0005, "num_input_tokens_seen": 59631208, "step": 88450 }, { "epoch": 2.160970366208194, "grad_norm": 0.0031171184964478016, "learning_rate": 1.3997487220133232e-06, "loss": 0.0441, "num_input_tokens_seen": 59634280, "step": 88455 }, { "epoch": 2.161092517040041, "grad_norm": 0.02987041138112545, "learning_rate": 1.3996705530003262e-06, "loss": 0.0314, "num_input_tokens_seen": 59637672, "step": 88460 }, { "epoch": 2.161214667871888, "grad_norm": 0.003604191355407238, "learning_rate": 1.3995923810808176e-06, "loss": 0.0001, "num_input_tokens_seen": 59641256, "step": 88465 }, { "epoch": 2.1613368187037354, "grad_norm": 0.8536860942840576, "learning_rate": 1.3995142062553654e-06, "loss": 0.0001, "num_input_tokens_seen": 59644584, "step": 88470 }, { "epoch": 2.1614589695355826, "grad_norm": 0.1137804314494133, "learning_rate": 1.3994360285245386e-06, "loss": 0.0431, "num_input_tokens_seen": 59647912, "step": 88475 }, { "epoch": 2.1615811203674298, "grad_norm": 0.14593037962913513, "learning_rate": 1.3993578478889054e-06, "loss": 0.0001, "num_input_tokens_seen": 59651304, "step": 88480 }, { "epoch": 2.161703271199277, "grad_norm": 13.66635513305664, "learning_rate": 1.3992796643490348e-06, "loss": 0.0991, "num_input_tokens_seen": 59654568, "step": 88485 }, { "epoch": 2.161825422031124, "grad_norm": 0.046653177589178085, "learning_rate": 1.399201477905495e-06, "loss": 0.0002, "num_input_tokens_seen": 59659112, "step": 88490 }, { "epoch": 2.1619475728629713, "grad_norm": 0.016694676131010056, "learning_rate": 1.3991232885588546e-06, "loss": 0.0597, "num_input_tokens_seen": 59662568, "step": 88495 }, { "epoch": 2.1620697236948185, "grad_norm": 0.09434916824102402, "learning_rate": 1.3990450963096824e-06, "loss": 0.0002, "num_input_tokens_seen": 59665832, "step": 88500 }, { "epoch": 2.1621918745266657, "grad_norm": 3.1131694316864014, "learning_rate": 1.398966901158547e-06, "loss": 0.0536, "num_input_tokens_seen": 59669544, "step": 88505 }, { "epoch": 2.162314025358513, "grad_norm": 0.0042829355224967, "learning_rate": 1.3988887031060168e-06, "loss": 0.0708, "num_input_tokens_seen": 59672872, "step": 88510 }, { "epoch": 2.1624361761903597, "grad_norm": 21.86163902282715, "learning_rate": 1.3988105021526608e-06, "loss": 0.1252, "num_input_tokens_seen": 59676200, "step": 88515 }, { "epoch": 2.162558327022207, "grad_norm": 0.009639190509915352, "learning_rate": 1.3987322982990474e-06, "loss": 0.0001, "num_input_tokens_seen": 59679144, "step": 88520 }, { "epoch": 2.162680477854054, "grad_norm": 0.0002484617580194026, "learning_rate": 1.3986540915457457e-06, "loss": 0.0002, "num_input_tokens_seen": 59682600, "step": 88525 }, { "epoch": 2.1628026286859012, "grad_norm": 0.04339480400085449, "learning_rate": 1.398575881893324e-06, "loss": 0.1044, "num_input_tokens_seen": 59685928, "step": 88530 }, { "epoch": 2.1629247795177484, "grad_norm": 0.05362813547253609, "learning_rate": 1.3984976693423512e-06, "loss": 0.0002, "num_input_tokens_seen": 59689320, "step": 88535 }, { "epoch": 2.1630469303495956, "grad_norm": 0.27106356620788574, "learning_rate": 1.3984194538933961e-06, "loss": 0.0003, "num_input_tokens_seen": 59692904, "step": 88540 }, { "epoch": 2.163169081181443, "grad_norm": 0.2880617678165436, "learning_rate": 1.3983412355470283e-06, "loss": 0.0003, "num_input_tokens_seen": 59696040, "step": 88545 }, { "epoch": 2.16329123201329, "grad_norm": 25.8460636138916, "learning_rate": 1.3982630143038154e-06, "loss": 0.1038, "num_input_tokens_seen": 59699368, "step": 88550 }, { "epoch": 2.163413382845137, "grad_norm": 0.028124157339334488, "learning_rate": 1.3981847901643266e-06, "loss": 0.0558, "num_input_tokens_seen": 59702248, "step": 88555 }, { "epoch": 2.1635355336769844, "grad_norm": 0.03156181052327156, "learning_rate": 1.398106563129131e-06, "loss": 0.0054, "num_input_tokens_seen": 59705512, "step": 88560 }, { "epoch": 2.1636576845088316, "grad_norm": 0.01643490605056286, "learning_rate": 1.3980283331987973e-06, "loss": 0.0636, "num_input_tokens_seen": 59708712, "step": 88565 }, { "epoch": 2.1637798353406787, "grad_norm": 0.14175744354724884, "learning_rate": 1.3979501003738948e-06, "loss": 0.0157, "num_input_tokens_seen": 59712168, "step": 88570 }, { "epoch": 2.163901986172526, "grad_norm": 0.3935987651348114, "learning_rate": 1.397871864654992e-06, "loss": 0.0006, "num_input_tokens_seen": 59715496, "step": 88575 }, { "epoch": 2.164024137004373, "grad_norm": 0.011017389595508575, "learning_rate": 1.397793626042658e-06, "loss": 0.0006, "num_input_tokens_seen": 59718888, "step": 88580 }, { "epoch": 2.1641462878362203, "grad_norm": 0.04418168216943741, "learning_rate": 1.3977153845374616e-06, "loss": 0.1219, "num_input_tokens_seen": 59722280, "step": 88585 }, { "epoch": 2.1642684386680675, "grad_norm": 0.007941069081425667, "learning_rate": 1.397637140139972e-06, "loss": 0.0005, "num_input_tokens_seen": 59725608, "step": 88590 }, { "epoch": 2.1643905894999147, "grad_norm": 0.0148239741101861, "learning_rate": 1.3975588928507583e-06, "loss": 0.0559, "num_input_tokens_seen": 59729000, "step": 88595 }, { "epoch": 2.1645127403317614, "grad_norm": 49.897762298583984, "learning_rate": 1.3974806426703894e-06, "loss": 0.0894, "num_input_tokens_seen": 59732520, "step": 88600 }, { "epoch": 2.1646348911636086, "grad_norm": 0.09293018281459808, "learning_rate": 1.3974023895994342e-06, "loss": 0.0502, "num_input_tokens_seen": 59736232, "step": 88605 }, { "epoch": 2.164757041995456, "grad_norm": 20.600238800048828, "learning_rate": 1.3973241336384622e-06, "loss": 0.0663, "num_input_tokens_seen": 59739048, "step": 88610 }, { "epoch": 2.164879192827303, "grad_norm": 0.13299432396888733, "learning_rate": 1.397245874788042e-06, "loss": 0.0003, "num_input_tokens_seen": 59742760, "step": 88615 }, { "epoch": 2.16500134365915, "grad_norm": 0.0673794373869896, "learning_rate": 1.397167613048743e-06, "loss": 0.0001, "num_input_tokens_seen": 59746216, "step": 88620 }, { "epoch": 2.1651234944909974, "grad_norm": 0.001309892744757235, "learning_rate": 1.397089348421134e-06, "loss": 0.0612, "num_input_tokens_seen": 59749544, "step": 88625 }, { "epoch": 2.1652456453228446, "grad_norm": 0.04341740533709526, "learning_rate": 1.397011080905785e-06, "loss": 0.0002, "num_input_tokens_seen": 59753128, "step": 88630 }, { "epoch": 2.1653677961546918, "grad_norm": 0.05454143509268761, "learning_rate": 1.3969328105032643e-06, "loss": 0.0003, "num_input_tokens_seen": 59756392, "step": 88635 }, { "epoch": 2.165489946986539, "grad_norm": 0.22605502605438232, "learning_rate": 1.3968545372141416e-06, "loss": 0.0002, "num_input_tokens_seen": 59759528, "step": 88640 }, { "epoch": 2.165612097818386, "grad_norm": 0.08796440064907074, "learning_rate": 1.3967762610389858e-06, "loss": 0.0256, "num_input_tokens_seen": 59763240, "step": 88645 }, { "epoch": 2.1657342486502333, "grad_norm": 0.006114218384027481, "learning_rate": 1.3966979819783666e-06, "loss": 0.0002, "num_input_tokens_seen": 59766504, "step": 88650 }, { "epoch": 2.1658563994820805, "grad_norm": 496.6544494628906, "learning_rate": 1.3966197000328528e-06, "loss": 0.0406, "num_input_tokens_seen": 59769768, "step": 88655 }, { "epoch": 2.1659785503139277, "grad_norm": 0.05280788987874985, "learning_rate": 1.3965414152030138e-06, "loss": 0.0002, "num_input_tokens_seen": 59772968, "step": 88660 }, { "epoch": 2.166100701145775, "grad_norm": 0.9186227321624756, "learning_rate": 1.3964631274894189e-06, "loss": 0.0001, "num_input_tokens_seen": 59776232, "step": 88665 }, { "epoch": 2.166222851977622, "grad_norm": 0.027731165289878845, "learning_rate": 1.3963848368926376e-06, "loss": 0.0396, "num_input_tokens_seen": 59779880, "step": 88670 }, { "epoch": 2.1663450028094693, "grad_norm": 0.006741840858012438, "learning_rate": 1.3963065434132392e-06, "loss": 0.0005, "num_input_tokens_seen": 59783592, "step": 88675 }, { "epoch": 2.1664671536413165, "grad_norm": 0.2137477844953537, "learning_rate": 1.3962282470517933e-06, "loss": 0.0002, "num_input_tokens_seen": 59786792, "step": 88680 }, { "epoch": 2.1665893044731637, "grad_norm": 0.24367313086986542, "learning_rate": 1.3961499478088685e-06, "loss": 0.0003, "num_input_tokens_seen": 59789928, "step": 88685 }, { "epoch": 2.166711455305011, "grad_norm": 0.004783410578966141, "learning_rate": 1.3960716456850347e-06, "loss": 0.0839, "num_input_tokens_seen": 59793512, "step": 88690 }, { "epoch": 2.1668336061368576, "grad_norm": 0.010234296321868896, "learning_rate": 1.3959933406808616e-06, "loss": 0.0556, "num_input_tokens_seen": 59796584, "step": 88695 }, { "epoch": 2.166955756968705, "grad_norm": 0.026050550863146782, "learning_rate": 1.3959150327969188e-06, "loss": 0.1148, "num_input_tokens_seen": 59799976, "step": 88700 }, { "epoch": 2.167077907800552, "grad_norm": 0.5169499516487122, "learning_rate": 1.395836722033775e-06, "loss": 0.0005, "num_input_tokens_seen": 59803368, "step": 88705 }, { "epoch": 2.167200058632399, "grad_norm": 0.01806020550429821, "learning_rate": 1.395758408392e-06, "loss": 0.0003, "num_input_tokens_seen": 59806760, "step": 88710 }, { "epoch": 2.1673222094642464, "grad_norm": 0.049956656992435455, "learning_rate": 1.3956800918721637e-06, "loss": 0.0002, "num_input_tokens_seen": 59809896, "step": 88715 }, { "epoch": 2.1674443602960936, "grad_norm": 0.01550681795924902, "learning_rate": 1.3956017724748347e-06, "loss": 0.0, "num_input_tokens_seen": 59813352, "step": 88720 }, { "epoch": 2.1675665111279407, "grad_norm": 0.06467894464731216, "learning_rate": 1.395523450200584e-06, "loss": 0.0001, "num_input_tokens_seen": 59816488, "step": 88725 }, { "epoch": 2.167688661959788, "grad_norm": 0.13828709721565247, "learning_rate": 1.39544512504998e-06, "loss": 0.0002, "num_input_tokens_seen": 59819624, "step": 88730 }, { "epoch": 2.167810812791635, "grad_norm": 0.388836532831192, "learning_rate": 1.3953667970235928e-06, "loss": 0.0002, "num_input_tokens_seen": 59823144, "step": 88735 }, { "epoch": 2.1679329636234823, "grad_norm": 0.1045452430844307, "learning_rate": 1.3952884661219917e-06, "loss": 0.1275, "num_input_tokens_seen": 59826664, "step": 88740 }, { "epoch": 2.1680551144553295, "grad_norm": 0.026057695969939232, "learning_rate": 1.395210132345747e-06, "loss": 0.0001, "num_input_tokens_seen": 59830184, "step": 88745 }, { "epoch": 2.1681772652871767, "grad_norm": 0.022811226546764374, "learning_rate": 1.3951317956954274e-06, "loss": 0.0001, "num_input_tokens_seen": 59833256, "step": 88750 }, { "epoch": 2.168299416119024, "grad_norm": 0.05536481365561485, "learning_rate": 1.3950534561716035e-06, "loss": 0.0739, "num_input_tokens_seen": 59836648, "step": 88755 }, { "epoch": 2.168421566950871, "grad_norm": 0.003870234126225114, "learning_rate": 1.3949751137748442e-06, "loss": 0.0001, "num_input_tokens_seen": 59840616, "step": 88760 }, { "epoch": 2.1685437177827183, "grad_norm": 0.008767131716012955, "learning_rate": 1.39489676850572e-06, "loss": 0.0628, "num_input_tokens_seen": 59843688, "step": 88765 }, { "epoch": 2.1686658686145655, "grad_norm": 0.036568425595760345, "learning_rate": 1.3948184203648002e-06, "loss": 0.0001, "num_input_tokens_seen": 59847144, "step": 88770 }, { "epoch": 2.1687880194464126, "grad_norm": 0.06900046020746231, "learning_rate": 1.3947400693526545e-06, "loss": 0.0427, "num_input_tokens_seen": 59850216, "step": 88775 }, { "epoch": 2.1689101702782594, "grad_norm": 24.118446350097656, "learning_rate": 1.3946617154698529e-06, "loss": 0.0568, "num_input_tokens_seen": 59853480, "step": 88780 }, { "epoch": 2.1690323211101066, "grad_norm": 0.021634750068187714, "learning_rate": 1.3945833587169653e-06, "loss": 0.0002, "num_input_tokens_seen": 59857320, "step": 88785 }, { "epoch": 2.1691544719419538, "grad_norm": 0.050916168838739395, "learning_rate": 1.3945049990945613e-06, "loss": 0.0002, "num_input_tokens_seen": 59860392, "step": 88790 }, { "epoch": 2.169276622773801, "grad_norm": 0.20405644178390503, "learning_rate": 1.3944266366032107e-06, "loss": 0.0002, "num_input_tokens_seen": 59863976, "step": 88795 }, { "epoch": 2.169398773605648, "grad_norm": 0.0013021642807871103, "learning_rate": 1.3943482712434837e-06, "loss": 0.0, "num_input_tokens_seen": 59866920, "step": 88800 }, { "epoch": 2.1695209244374953, "grad_norm": 0.7083492875099182, "learning_rate": 1.39426990301595e-06, "loss": 0.0003, "num_input_tokens_seen": 59870504, "step": 88805 }, { "epoch": 2.1696430752693425, "grad_norm": 0.008465035818517208, "learning_rate": 1.3941915319211797e-06, "loss": 0.0001, "num_input_tokens_seen": 59874088, "step": 88810 }, { "epoch": 2.1697652261011897, "grad_norm": 107.73428344726562, "learning_rate": 1.394113157959742e-06, "loss": 0.0558, "num_input_tokens_seen": 59877544, "step": 88815 }, { "epoch": 2.169887376933037, "grad_norm": 0.7204272747039795, "learning_rate": 1.3940347811322078e-06, "loss": 0.0003, "num_input_tokens_seen": 59881448, "step": 88820 }, { "epoch": 2.170009527764884, "grad_norm": 20.292095184326172, "learning_rate": 1.3939564014391468e-06, "loss": 0.0664, "num_input_tokens_seen": 59885416, "step": 88825 }, { "epoch": 2.1701316785967313, "grad_norm": 0.005525513086467981, "learning_rate": 1.3938780188811286e-06, "loss": 0.0, "num_input_tokens_seen": 59889000, "step": 88830 }, { "epoch": 2.1702538294285785, "grad_norm": 0.5506719350814819, "learning_rate": 1.3937996334587235e-06, "loss": 0.0475, "num_input_tokens_seen": 59892328, "step": 88835 }, { "epoch": 2.1703759802604257, "grad_norm": 0.016354860737919807, "learning_rate": 1.3937212451725018e-06, "loss": 0.0813, "num_input_tokens_seen": 59895720, "step": 88840 }, { "epoch": 2.170498131092273, "grad_norm": 0.017355183139443398, "learning_rate": 1.3936428540230328e-06, "loss": 0.0001, "num_input_tokens_seen": 59899496, "step": 88845 }, { "epoch": 2.17062028192412, "grad_norm": 0.013150669634342194, "learning_rate": 1.3935644600108875e-06, "loss": 0.0, "num_input_tokens_seen": 59902696, "step": 88850 }, { "epoch": 2.1707424327559672, "grad_norm": 26.938447952270508, "learning_rate": 1.3934860631366358e-06, "loss": 0.1509, "num_input_tokens_seen": 59906024, "step": 88855 }, { "epoch": 2.1708645835878144, "grad_norm": 0.05356710031628609, "learning_rate": 1.3934076634008474e-06, "loss": 0.0001, "num_input_tokens_seen": 59909672, "step": 88860 }, { "epoch": 2.170986734419661, "grad_norm": 0.07025501132011414, "learning_rate": 1.3933292608040927e-06, "loss": 0.0836, "num_input_tokens_seen": 59912872, "step": 88865 }, { "epoch": 2.1711088852515084, "grad_norm": 18.241281509399414, "learning_rate": 1.3932508553469417e-06, "loss": 0.1332, "num_input_tokens_seen": 59916200, "step": 88870 }, { "epoch": 2.1712310360833555, "grad_norm": 0.16294845938682556, "learning_rate": 1.3931724470299646e-06, "loss": 0.0531, "num_input_tokens_seen": 59919784, "step": 88875 }, { "epoch": 2.1713531869152027, "grad_norm": 0.03206993266940117, "learning_rate": 1.393094035853732e-06, "loss": 0.0003, "num_input_tokens_seen": 59923368, "step": 88880 }, { "epoch": 2.17147533774705, "grad_norm": 0.07751631736755371, "learning_rate": 1.3930156218188137e-06, "loss": 0.0502, "num_input_tokens_seen": 59926632, "step": 88885 }, { "epoch": 2.171597488578897, "grad_norm": 16.496200561523438, "learning_rate": 1.3929372049257802e-06, "loss": 0.0377, "num_input_tokens_seen": 59929832, "step": 88890 }, { "epoch": 2.1717196394107443, "grad_norm": 0.1615595519542694, "learning_rate": 1.3928587851752015e-06, "loss": 0.0008, "num_input_tokens_seen": 59933480, "step": 88895 }, { "epoch": 2.1718417902425915, "grad_norm": 19.633365631103516, "learning_rate": 1.392780362567648e-06, "loss": 0.048, "num_input_tokens_seen": 59937256, "step": 88900 }, { "epoch": 2.1719639410744387, "grad_norm": 0.04594242572784424, "learning_rate": 1.3927019371036903e-06, "loss": 0.074, "num_input_tokens_seen": 59940328, "step": 88905 }, { "epoch": 2.172086091906286, "grad_norm": 0.3396862745285034, "learning_rate": 1.3926235087838982e-06, "loss": 0.0447, "num_input_tokens_seen": 59943528, "step": 88910 }, { "epoch": 2.172208242738133, "grad_norm": 27.388818740844727, "learning_rate": 1.3925450776088426e-06, "loss": 0.0433, "num_input_tokens_seen": 59946856, "step": 88915 }, { "epoch": 2.1723303935699803, "grad_norm": 0.012639293447136879, "learning_rate": 1.3924666435790936e-06, "loss": 0.0003, "num_input_tokens_seen": 59949928, "step": 88920 }, { "epoch": 2.1724525444018274, "grad_norm": 0.05224275588989258, "learning_rate": 1.3923882066952216e-06, "loss": 0.0431, "num_input_tokens_seen": 59953704, "step": 88925 }, { "epoch": 2.1725746952336746, "grad_norm": 0.11708616465330124, "learning_rate": 1.3923097669577967e-06, "loss": 0.0004, "num_input_tokens_seen": 59957032, "step": 88930 }, { "epoch": 2.172696846065522, "grad_norm": 0.07801163196563721, "learning_rate": 1.3922313243673899e-06, "loss": 0.0538, "num_input_tokens_seen": 59960104, "step": 88935 }, { "epoch": 2.172818996897369, "grad_norm": 0.0019404669292271137, "learning_rate": 1.3921528789245713e-06, "loss": 0.0267, "num_input_tokens_seen": 59963368, "step": 88940 }, { "epoch": 2.172941147729216, "grad_norm": 0.010919238440692425, "learning_rate": 1.3920744306299117e-06, "loss": 0.0488, "num_input_tokens_seen": 59966248, "step": 88945 }, { "epoch": 2.1730632985610634, "grad_norm": 0.022607387974858284, "learning_rate": 1.391995979483981e-06, "loss": 0.0003, "num_input_tokens_seen": 59970344, "step": 88950 }, { "epoch": 2.1731854493929106, "grad_norm": 0.10743965208530426, "learning_rate": 1.3919175254873505e-06, "loss": 0.0002, "num_input_tokens_seen": 59973352, "step": 88955 }, { "epoch": 2.1733076002247573, "grad_norm": 0.06325899064540863, "learning_rate": 1.3918390686405903e-06, "loss": 0.0513, "num_input_tokens_seen": 59976488, "step": 88960 }, { "epoch": 2.1734297510566045, "grad_norm": 41.05293655395508, "learning_rate": 1.391760608944271e-06, "loss": 0.0526, "num_input_tokens_seen": 59980136, "step": 88965 }, { "epoch": 2.1735519018884517, "grad_norm": 0.0012355023063719273, "learning_rate": 1.3916821463989629e-06, "loss": 0.0488, "num_input_tokens_seen": 59984040, "step": 88970 }, { "epoch": 2.173674052720299, "grad_norm": 0.010925422422587872, "learning_rate": 1.3916036810052373e-06, "loss": 0.1476, "num_input_tokens_seen": 59987112, "step": 88975 }, { "epoch": 2.173796203552146, "grad_norm": 19.451017379760742, "learning_rate": 1.391525212763664e-06, "loss": 0.0713, "num_input_tokens_seen": 59990376, "step": 88980 }, { "epoch": 2.1739183543839933, "grad_norm": 0.040949687361717224, "learning_rate": 1.3914467416748144e-06, "loss": 0.0001, "num_input_tokens_seen": 59993896, "step": 88985 }, { "epoch": 2.1740405052158405, "grad_norm": 0.4370739161968231, "learning_rate": 1.3913682677392587e-06, "loss": 0.0669, "num_input_tokens_seen": 59996840, "step": 88990 }, { "epoch": 2.1741626560476877, "grad_norm": 0.016194364055991173, "learning_rate": 1.3912897909575675e-06, "loss": 0.0655, "num_input_tokens_seen": 60000232, "step": 88995 }, { "epoch": 2.174284806879535, "grad_norm": 1.7703710794448853, "learning_rate": 1.3912113113303117e-06, "loss": 0.0543, "num_input_tokens_seen": 60003944, "step": 89000 }, { "epoch": 2.174406957711382, "grad_norm": 0.024700865149497986, "learning_rate": 1.3911328288580621e-06, "loss": 0.0481, "num_input_tokens_seen": 60007272, "step": 89005 }, { "epoch": 2.1745291085432292, "grad_norm": 0.38946980237960815, "learning_rate": 1.3910543435413898e-06, "loss": 0.0006, "num_input_tokens_seen": 60010536, "step": 89010 }, { "epoch": 2.1746512593750764, "grad_norm": 0.041124243289232254, "learning_rate": 1.3909758553808646e-06, "loss": 0.0003, "num_input_tokens_seen": 60013992, "step": 89015 }, { "epoch": 2.1747734102069236, "grad_norm": 0.11730754375457764, "learning_rate": 1.390897364377058e-06, "loss": 0.0004, "num_input_tokens_seen": 60017384, "step": 89020 }, { "epoch": 2.174895561038771, "grad_norm": 0.04993576556444168, "learning_rate": 1.3908188705305405e-06, "loss": 0.002, "num_input_tokens_seen": 60020584, "step": 89025 }, { "epoch": 2.175017711870618, "grad_norm": 0.03723525628447533, "learning_rate": 1.390740373841883e-06, "loss": 0.0003, "num_input_tokens_seen": 60024616, "step": 89030 }, { "epoch": 2.175139862702465, "grad_norm": 0.03435581177473068, "learning_rate": 1.3906618743116567e-06, "loss": 0.0017, "num_input_tokens_seen": 60028264, "step": 89035 }, { "epoch": 2.1752620135343124, "grad_norm": 0.015532208606600761, "learning_rate": 1.390583371940432e-06, "loss": 0.0001, "num_input_tokens_seen": 60031464, "step": 89040 }, { "epoch": 2.175384164366159, "grad_norm": 11.067805290222168, "learning_rate": 1.3905048667287799e-06, "loss": 0.0005, "num_input_tokens_seen": 60034856, "step": 89045 }, { "epoch": 2.1755063151980063, "grad_norm": 0.007327179424464703, "learning_rate": 1.3904263586772716e-06, "loss": 0.0001, "num_input_tokens_seen": 60038248, "step": 89050 }, { "epoch": 2.1756284660298535, "grad_norm": 0.013096985407173634, "learning_rate": 1.3903478477864776e-06, "loss": 0.0004, "num_input_tokens_seen": 60041384, "step": 89055 }, { "epoch": 2.1757506168617007, "grad_norm": 18.17698860168457, "learning_rate": 1.390269334056969e-06, "loss": 0.0701, "num_input_tokens_seen": 60044840, "step": 89060 }, { "epoch": 2.175872767693548, "grad_norm": 0.020957941189408302, "learning_rate": 1.390190817489317e-06, "loss": 0.0752, "num_input_tokens_seen": 60048232, "step": 89065 }, { "epoch": 2.175994918525395, "grad_norm": 0.006288980599492788, "learning_rate": 1.3901122980840928e-06, "loss": 0.0856, "num_input_tokens_seen": 60051944, "step": 89070 }, { "epoch": 2.1761170693572423, "grad_norm": 0.010946640744805336, "learning_rate": 1.3900337758418665e-06, "loss": 0.0001, "num_input_tokens_seen": 60055208, "step": 89075 }, { "epoch": 2.1762392201890894, "grad_norm": 14.283775329589844, "learning_rate": 1.3899552507632098e-06, "loss": 0.0582, "num_input_tokens_seen": 60058920, "step": 89080 }, { "epoch": 2.1763613710209366, "grad_norm": 0.020361153408885002, "learning_rate": 1.3898767228486936e-06, "loss": 0.0328, "num_input_tokens_seen": 60061864, "step": 89085 }, { "epoch": 2.176483521852784, "grad_norm": 0.007297951728105545, "learning_rate": 1.389798192098889e-06, "loss": 0.0001, "num_input_tokens_seen": 60065192, "step": 89090 }, { "epoch": 2.176605672684631, "grad_norm": 0.02128020115196705, "learning_rate": 1.389719658514367e-06, "loss": 0.0613, "num_input_tokens_seen": 60068456, "step": 89095 }, { "epoch": 2.176727823516478, "grad_norm": 0.008222533389925957, "learning_rate": 1.3896411220956991e-06, "loss": 0.07, "num_input_tokens_seen": 60072296, "step": 89100 }, { "epoch": 2.1768499743483254, "grad_norm": 4.085846900939941, "learning_rate": 1.3895625828434561e-06, "loss": 0.0383, "num_input_tokens_seen": 60075560, "step": 89105 }, { "epoch": 2.1769721251801726, "grad_norm": 0.0071534449234604836, "learning_rate": 1.3894840407582092e-06, "loss": 0.0001, "num_input_tokens_seen": 60078760, "step": 89110 }, { "epoch": 2.1770942760120198, "grad_norm": 0.048654671758413315, "learning_rate": 1.3894054958405295e-06, "loss": 0.0005, "num_input_tokens_seen": 60082152, "step": 89115 }, { "epoch": 2.177216426843867, "grad_norm": 4.770420074462891, "learning_rate": 1.3893269480909886e-06, "loss": 0.0011, "num_input_tokens_seen": 60085800, "step": 89120 }, { "epoch": 2.177338577675714, "grad_norm": 0.008286378346383572, "learning_rate": 1.389248397510157e-06, "loss": 0.0004, "num_input_tokens_seen": 60089128, "step": 89125 }, { "epoch": 2.1774607285075613, "grad_norm": 317.94000244140625, "learning_rate": 1.3891698440986063e-06, "loss": 0.0704, "num_input_tokens_seen": 60092264, "step": 89130 }, { "epoch": 2.1775828793394085, "grad_norm": 0.0218020211905241, "learning_rate": 1.389091287856908e-06, "loss": 0.0661, "num_input_tokens_seen": 60095208, "step": 89135 }, { "epoch": 2.1777050301712553, "grad_norm": 0.042609941214323044, "learning_rate": 1.3890127287856334e-06, "loss": 0.0504, "num_input_tokens_seen": 60098920, "step": 89140 }, { "epoch": 2.1778271810031025, "grad_norm": 0.5689958333969116, "learning_rate": 1.3889341668853536e-06, "loss": 0.0005, "num_input_tokens_seen": 60102184, "step": 89145 }, { "epoch": 2.1779493318349497, "grad_norm": 0.026712533086538315, "learning_rate": 1.3888556021566397e-06, "loss": 0.0372, "num_input_tokens_seen": 60105960, "step": 89150 }, { "epoch": 2.178071482666797, "grad_norm": 0.05201387032866478, "learning_rate": 1.3887770346000632e-06, "loss": 0.1155, "num_input_tokens_seen": 60109352, "step": 89155 }, { "epoch": 2.178193633498644, "grad_norm": 0.13787080347537994, "learning_rate": 1.3886984642161957e-06, "loss": 0.011, "num_input_tokens_seen": 60112360, "step": 89160 }, { "epoch": 2.1783157843304912, "grad_norm": 0.010372105054557323, "learning_rate": 1.3886198910056086e-06, "loss": 0.0003, "num_input_tokens_seen": 60116136, "step": 89165 }, { "epoch": 2.1784379351623384, "grad_norm": 0.0984877273440361, "learning_rate": 1.388541314968873e-06, "loss": 0.0004, "num_input_tokens_seen": 60119336, "step": 89170 }, { "epoch": 2.1785600859941856, "grad_norm": 0.010339641943573952, "learning_rate": 1.3884627361065604e-06, "loss": 0.0002, "num_input_tokens_seen": 60122792, "step": 89175 }, { "epoch": 2.178682236826033, "grad_norm": 0.00042667731759138405, "learning_rate": 1.3883841544192424e-06, "loss": 0.0071, "num_input_tokens_seen": 60126184, "step": 89180 }, { "epoch": 2.17880438765788, "grad_norm": 0.23866403102874756, "learning_rate": 1.38830556990749e-06, "loss": 0.0002, "num_input_tokens_seen": 60129320, "step": 89185 }, { "epoch": 2.178926538489727, "grad_norm": 14.645671844482422, "learning_rate": 1.3882269825718753e-06, "loss": 0.1062, "num_input_tokens_seen": 60133096, "step": 89190 }, { "epoch": 2.1790486893215744, "grad_norm": 0.08448100090026855, "learning_rate": 1.3881483924129693e-06, "loss": 0.0008, "num_input_tokens_seen": 60136104, "step": 89195 }, { "epoch": 2.1791708401534216, "grad_norm": 0.007324859034270048, "learning_rate": 1.3880697994313442e-06, "loss": 0.0559, "num_input_tokens_seen": 60139368, "step": 89200 }, { "epoch": 2.1792929909852687, "grad_norm": 0.2774519920349121, "learning_rate": 1.3879912036275712e-06, "loss": 0.1086, "num_input_tokens_seen": 60142568, "step": 89205 }, { "epoch": 2.179415141817116, "grad_norm": 0.007932466454803944, "learning_rate": 1.3879126050022213e-06, "loss": 0.0001, "num_input_tokens_seen": 60146088, "step": 89210 }, { "epoch": 2.179537292648963, "grad_norm": 0.2126796841621399, "learning_rate": 1.3878340035558671e-06, "loss": 0.001, "num_input_tokens_seen": 60149352, "step": 89215 }, { "epoch": 2.1796594434808103, "grad_norm": 0.05298115313053131, "learning_rate": 1.3877553992890796e-06, "loss": 0.0847, "num_input_tokens_seen": 60152680, "step": 89220 }, { "epoch": 2.179781594312657, "grad_norm": 0.04341858625411987, "learning_rate": 1.3876767922024305e-06, "loss": 0.0003, "num_input_tokens_seen": 60156392, "step": 89225 }, { "epoch": 2.1799037451445042, "grad_norm": 0.020297808572649956, "learning_rate": 1.3875981822964912e-06, "loss": 0.0506, "num_input_tokens_seen": 60159656, "step": 89230 }, { "epoch": 2.1800258959763514, "grad_norm": 23.527563095092773, "learning_rate": 1.387519569571834e-06, "loss": 0.1247, "num_input_tokens_seen": 60163048, "step": 89235 }, { "epoch": 2.1801480468081986, "grad_norm": 0.08387105166912079, "learning_rate": 1.38744095402903e-06, "loss": 0.1293, "num_input_tokens_seen": 60166056, "step": 89240 }, { "epoch": 2.180270197640046, "grad_norm": 0.036509111523628235, "learning_rate": 1.3873623356686517e-06, "loss": 0.0432, "num_input_tokens_seen": 60169896, "step": 89245 }, { "epoch": 2.180392348471893, "grad_norm": 0.03160225227475166, "learning_rate": 1.3872837144912696e-06, "loss": 0.0004, "num_input_tokens_seen": 60173608, "step": 89250 }, { "epoch": 2.18051449930374, "grad_norm": 15.129281044006348, "learning_rate": 1.3872050904974566e-06, "loss": 0.0363, "num_input_tokens_seen": 60176936, "step": 89255 }, { "epoch": 2.1806366501355874, "grad_norm": 0.3375871181488037, "learning_rate": 1.387126463687784e-06, "loss": 0.0466, "num_input_tokens_seen": 60180008, "step": 89260 }, { "epoch": 2.1807588009674346, "grad_norm": 0.048338957130908966, "learning_rate": 1.3870478340628235e-06, "loss": 0.0394, "num_input_tokens_seen": 60182888, "step": 89265 }, { "epoch": 2.1808809517992818, "grad_norm": 0.030270587652921677, "learning_rate": 1.3869692016231473e-06, "loss": 0.0002, "num_input_tokens_seen": 60185832, "step": 89270 }, { "epoch": 2.181003102631129, "grad_norm": 0.05345119908452034, "learning_rate": 1.3868905663693272e-06, "loss": 0.0005, "num_input_tokens_seen": 60188840, "step": 89275 }, { "epoch": 2.181125253462976, "grad_norm": 0.006547401659190655, "learning_rate": 1.386811928301934e-06, "loss": 0.1913, "num_input_tokens_seen": 60192104, "step": 89280 }, { "epoch": 2.1812474042948233, "grad_norm": 0.11802975833415985, "learning_rate": 1.386733287421541e-06, "loss": 0.068, "num_input_tokens_seen": 60195368, "step": 89285 }, { "epoch": 2.1813695551266705, "grad_norm": 0.07596275210380554, "learning_rate": 1.3866546437287195e-06, "loss": 0.0343, "num_input_tokens_seen": 60198824, "step": 89290 }, { "epoch": 2.1814917059585177, "grad_norm": 0.08031544834375381, "learning_rate": 1.3865759972240411e-06, "loss": 0.0516, "num_input_tokens_seen": 60202024, "step": 89295 }, { "epoch": 2.181613856790365, "grad_norm": 0.038348667323589325, "learning_rate": 1.3864973479080786e-06, "loss": 0.0002, "num_input_tokens_seen": 60205224, "step": 89300 }, { "epoch": 2.181736007622212, "grad_norm": 0.05317453667521477, "learning_rate": 1.386418695781403e-06, "loss": 0.0002, "num_input_tokens_seen": 60208808, "step": 89305 }, { "epoch": 2.181858158454059, "grad_norm": 0.013864437118172646, "learning_rate": 1.3863400408445867e-06, "loss": 0.0017, "num_input_tokens_seen": 60212200, "step": 89310 }, { "epoch": 2.1819803092859065, "grad_norm": 0.031996458768844604, "learning_rate": 1.3862613830982018e-06, "loss": 0.0004, "num_input_tokens_seen": 60215464, "step": 89315 }, { "epoch": 2.182102460117753, "grad_norm": 43.89045715332031, "learning_rate": 1.3861827225428204e-06, "loss": 0.0729, "num_input_tokens_seen": 60218600, "step": 89320 }, { "epoch": 2.1822246109496004, "grad_norm": 0.05054045841097832, "learning_rate": 1.3861040591790144e-06, "loss": 0.0524, "num_input_tokens_seen": 60221992, "step": 89325 }, { "epoch": 2.1823467617814476, "grad_norm": 0.053709473460912704, "learning_rate": 1.3860253930073555e-06, "loss": 0.1776, "num_input_tokens_seen": 60225256, "step": 89330 }, { "epoch": 2.182468912613295, "grad_norm": 0.014191855676472187, "learning_rate": 1.3859467240284165e-06, "loss": 0.0133, "num_input_tokens_seen": 60228840, "step": 89335 }, { "epoch": 2.182591063445142, "grad_norm": 0.028932316228747368, "learning_rate": 1.3858680522427686e-06, "loss": 0.0005, "num_input_tokens_seen": 60232424, "step": 89340 }, { "epoch": 2.182713214276989, "grad_norm": 0.0772460401058197, "learning_rate": 1.3857893776509849e-06, "loss": 0.0009, "num_input_tokens_seen": 60235624, "step": 89345 }, { "epoch": 2.1828353651088364, "grad_norm": 0.15768887102603912, "learning_rate": 1.385710700253637e-06, "loss": 0.0492, "num_input_tokens_seen": 60238888, "step": 89350 }, { "epoch": 2.1829575159406835, "grad_norm": 0.21225495636463165, "learning_rate": 1.385632020051297e-06, "loss": 0.0399, "num_input_tokens_seen": 60242408, "step": 89355 }, { "epoch": 2.1830796667725307, "grad_norm": 0.05385947600007057, "learning_rate": 1.3855533370445374e-06, "loss": 0.0133, "num_input_tokens_seen": 60246376, "step": 89360 }, { "epoch": 2.183201817604378, "grad_norm": 0.05395977944135666, "learning_rate": 1.3854746512339301e-06, "loss": 0.0004, "num_input_tokens_seen": 60249576, "step": 89365 }, { "epoch": 2.183323968436225, "grad_norm": 0.01952124759554863, "learning_rate": 1.3853959626200475e-06, "loss": 0.0538, "num_input_tokens_seen": 60252776, "step": 89370 }, { "epoch": 2.1834461192680723, "grad_norm": 0.03978893533349037, "learning_rate": 1.3853172712034617e-06, "loss": 0.1073, "num_input_tokens_seen": 60255784, "step": 89375 }, { "epoch": 2.1835682700999195, "grad_norm": 0.03560865297913551, "learning_rate": 1.3852385769847453e-06, "loss": 0.0619, "num_input_tokens_seen": 60259240, "step": 89380 }, { "epoch": 2.1836904209317667, "grad_norm": 0.03531051427125931, "learning_rate": 1.3851598799644702e-06, "loss": 0.0001, "num_input_tokens_seen": 60262760, "step": 89385 }, { "epoch": 2.183812571763614, "grad_norm": 0.022893404588103294, "learning_rate": 1.3850811801432087e-06, "loss": 0.0006, "num_input_tokens_seen": 60266408, "step": 89390 }, { "epoch": 2.183934722595461, "grad_norm": 0.14207857847213745, "learning_rate": 1.3850024775215337e-06, "loss": 0.0434, "num_input_tokens_seen": 60269608, "step": 89395 }, { "epoch": 2.1840568734273083, "grad_norm": 41.99595260620117, "learning_rate": 1.384923772100017e-06, "loss": 0.0347, "num_input_tokens_seen": 60272616, "step": 89400 }, { "epoch": 2.184179024259155, "grad_norm": 0.13540434837341309, "learning_rate": 1.3848450638792305e-06, "loss": 0.0004, "num_input_tokens_seen": 60276392, "step": 89405 }, { "epoch": 2.184301175091002, "grad_norm": 17.306758880615234, "learning_rate": 1.3847663528597477e-06, "loss": 0.0013, "num_input_tokens_seen": 60280104, "step": 89410 }, { "epoch": 2.1844233259228494, "grad_norm": 0.04647582396864891, "learning_rate": 1.3846876390421405e-06, "loss": 0.0448, "num_input_tokens_seen": 60283368, "step": 89415 }, { "epoch": 2.1845454767546966, "grad_norm": 0.07750110328197479, "learning_rate": 1.3846089224269815e-06, "loss": 0.0003, "num_input_tokens_seen": 60286888, "step": 89420 }, { "epoch": 2.1846676275865438, "grad_norm": 0.10336387157440186, "learning_rate": 1.3845302030148428e-06, "loss": 0.0002, "num_input_tokens_seen": 60290472, "step": 89425 }, { "epoch": 2.184789778418391, "grad_norm": 0.36430805921554565, "learning_rate": 1.384451480806297e-06, "loss": 0.0003, "num_input_tokens_seen": 60293736, "step": 89430 }, { "epoch": 2.184911929250238, "grad_norm": 0.011753383092582226, "learning_rate": 1.3843727558019166e-06, "loss": 0.0455, "num_input_tokens_seen": 60297128, "step": 89435 }, { "epoch": 2.1850340800820853, "grad_norm": 0.021449893712997437, "learning_rate": 1.3842940280022738e-06, "loss": 0.0537, "num_input_tokens_seen": 60300520, "step": 89440 }, { "epoch": 2.1851562309139325, "grad_norm": 0.01641305908560753, "learning_rate": 1.384215297407942e-06, "loss": 0.0003, "num_input_tokens_seen": 60304040, "step": 89445 }, { "epoch": 2.1852783817457797, "grad_norm": 0.03722817450761795, "learning_rate": 1.384136564019493e-06, "loss": 0.0528, "num_input_tokens_seen": 60307304, "step": 89450 }, { "epoch": 2.185400532577627, "grad_norm": 0.034665465354919434, "learning_rate": 1.3840578278374996e-06, "loss": 0.0003, "num_input_tokens_seen": 60310440, "step": 89455 }, { "epoch": 2.185522683409474, "grad_norm": 0.02144056372344494, "learning_rate": 1.3839790888625345e-06, "loss": 0.0004, "num_input_tokens_seen": 60313576, "step": 89460 }, { "epoch": 2.1856448342413213, "grad_norm": 0.04060309752821922, "learning_rate": 1.38390034709517e-06, "loss": 0.0003, "num_input_tokens_seen": 60316776, "step": 89465 }, { "epoch": 2.1857669850731685, "grad_norm": 0.013973901979625225, "learning_rate": 1.383821602535979e-06, "loss": 0.0006, "num_input_tokens_seen": 60320424, "step": 89470 }, { "epoch": 2.1858891359050157, "grad_norm": 0.0008165006875060499, "learning_rate": 1.3837428551855342e-06, "loss": 0.0001, "num_input_tokens_seen": 60323816, "step": 89475 }, { "epoch": 2.186011286736863, "grad_norm": 0.06573706865310669, "learning_rate": 1.383664105044408e-06, "loss": 0.0001, "num_input_tokens_seen": 60327016, "step": 89480 }, { "epoch": 2.18613343756871, "grad_norm": 0.12895028293132782, "learning_rate": 1.3835853521131733e-06, "loss": 0.0106, "num_input_tokens_seen": 60329960, "step": 89485 }, { "epoch": 2.186255588400557, "grad_norm": 0.0590890534222126, "learning_rate": 1.3835065963924026e-06, "loss": 0.0446, "num_input_tokens_seen": 60333288, "step": 89490 }, { "epoch": 2.186377739232404, "grad_norm": 0.09835655242204666, "learning_rate": 1.3834278378826687e-06, "loss": 0.0002, "num_input_tokens_seen": 60336680, "step": 89495 }, { "epoch": 2.186499890064251, "grad_norm": 0.01371688675135374, "learning_rate": 1.3833490765845445e-06, "loss": 0.0001, "num_input_tokens_seen": 60340136, "step": 89500 }, { "epoch": 2.1866220408960984, "grad_norm": 0.035382047295570374, "learning_rate": 1.383270312498603e-06, "loss": 0.0001, "num_input_tokens_seen": 60343592, "step": 89505 }, { "epoch": 2.1867441917279455, "grad_norm": 0.00039148752694018185, "learning_rate": 1.3831915456254164e-06, "loss": 0.062, "num_input_tokens_seen": 60346792, "step": 89510 }, { "epoch": 2.1868663425597927, "grad_norm": 0.034438714385032654, "learning_rate": 1.383112775965558e-06, "loss": 0.0006, "num_input_tokens_seen": 60350632, "step": 89515 }, { "epoch": 2.18698849339164, "grad_norm": 0.001694877864792943, "learning_rate": 1.3830340035196004e-06, "loss": 0.0999, "num_input_tokens_seen": 60353768, "step": 89520 }, { "epoch": 2.187110644223487, "grad_norm": 12.802435874938965, "learning_rate": 1.3829552282881165e-06, "loss": 0.0445, "num_input_tokens_seen": 60356968, "step": 89525 }, { "epoch": 2.1872327950553343, "grad_norm": 0.022081870585680008, "learning_rate": 1.3828764502716793e-06, "loss": 0.0009, "num_input_tokens_seen": 60360232, "step": 89530 }, { "epoch": 2.1873549458871815, "grad_norm": 0.021757181733846664, "learning_rate": 1.3827976694708614e-06, "loss": 0.0298, "num_input_tokens_seen": 60363048, "step": 89535 }, { "epoch": 2.1874770967190287, "grad_norm": 0.12817370891571045, "learning_rate": 1.3827188858862359e-06, "loss": 0.0005, "num_input_tokens_seen": 60366504, "step": 89540 }, { "epoch": 2.187599247550876, "grad_norm": 0.120881088078022, "learning_rate": 1.3826400995183755e-06, "loss": 0.0493, "num_input_tokens_seen": 60369832, "step": 89545 }, { "epoch": 2.187721398382723, "grad_norm": 0.5409322381019592, "learning_rate": 1.3825613103678539e-06, "loss": 0.0621, "num_input_tokens_seen": 60373032, "step": 89550 }, { "epoch": 2.1878435492145702, "grad_norm": 14.173566818237305, "learning_rate": 1.382482518435243e-06, "loss": 0.1244, "num_input_tokens_seen": 60376616, "step": 89555 }, { "epoch": 2.1879657000464174, "grad_norm": 0.16862067580223083, "learning_rate": 1.3824037237211166e-06, "loss": 0.0003, "num_input_tokens_seen": 60379752, "step": 89560 }, { "epoch": 2.1880878508782646, "grad_norm": 0.1347108781337738, "learning_rate": 1.3823249262260476e-06, "loss": 0.0001, "num_input_tokens_seen": 60383208, "step": 89565 }, { "epoch": 2.188210001710112, "grad_norm": 0.014512077905237675, "learning_rate": 1.3822461259506088e-06, "loss": 0.0004, "num_input_tokens_seen": 60386344, "step": 89570 }, { "epoch": 2.188332152541959, "grad_norm": 1.0698745250701904, "learning_rate": 1.3821673228953735e-06, "loss": 0.0005, "num_input_tokens_seen": 60389608, "step": 89575 }, { "epoch": 2.188454303373806, "grad_norm": 0.024578392505645752, "learning_rate": 1.3820885170609142e-06, "loss": 0.0001, "num_input_tokens_seen": 60393064, "step": 89580 }, { "epoch": 2.188576454205653, "grad_norm": 0.015470233745872974, "learning_rate": 1.382009708447805e-06, "loss": 0.0001, "num_input_tokens_seen": 60396712, "step": 89585 }, { "epoch": 2.1886986050375, "grad_norm": 0.07507811486721039, "learning_rate": 1.3819308970566178e-06, "loss": 0.0706, "num_input_tokens_seen": 60400360, "step": 89590 }, { "epoch": 2.1888207558693473, "grad_norm": 0.027581792324781418, "learning_rate": 1.381852082887927e-06, "loss": 0.0002, "num_input_tokens_seen": 60403752, "step": 89595 }, { "epoch": 2.1889429067011945, "grad_norm": 0.02173568122088909, "learning_rate": 1.3817732659423048e-06, "loss": 0.0001, "num_input_tokens_seen": 60407208, "step": 89600 }, { "epoch": 2.1890650575330417, "grad_norm": 0.20994439721107483, "learning_rate": 1.3816944462203251e-06, "loss": 0.0672, "num_input_tokens_seen": 60410728, "step": 89605 }, { "epoch": 2.189187208364889, "grad_norm": 0.05724925175309181, "learning_rate": 1.3816156237225602e-06, "loss": 0.1354, "num_input_tokens_seen": 60414184, "step": 89610 }, { "epoch": 2.189309359196736, "grad_norm": 0.202612042427063, "learning_rate": 1.3815367984495842e-06, "loss": 0.0521, "num_input_tokens_seen": 60417640, "step": 89615 }, { "epoch": 2.1894315100285833, "grad_norm": 0.0011157530825585127, "learning_rate": 1.3814579704019697e-06, "loss": 0.0002, "num_input_tokens_seen": 60421160, "step": 89620 }, { "epoch": 2.1895536608604305, "grad_norm": 10.986202239990234, "learning_rate": 1.3813791395802905e-06, "loss": 0.0438, "num_input_tokens_seen": 60424488, "step": 89625 }, { "epoch": 2.1896758116922777, "grad_norm": 0.03978127986192703, "learning_rate": 1.3813003059851198e-06, "loss": 0.0001, "num_input_tokens_seen": 60427944, "step": 89630 }, { "epoch": 2.189797962524125, "grad_norm": 0.07209401577711105, "learning_rate": 1.3812214696170303e-06, "loss": 0.0456, "num_input_tokens_seen": 60431336, "step": 89635 }, { "epoch": 2.189920113355972, "grad_norm": 0.007610888220369816, "learning_rate": 1.381142630476596e-06, "loss": 0.044, "num_input_tokens_seen": 60434536, "step": 89640 }, { "epoch": 2.190042264187819, "grad_norm": 0.00974766630679369, "learning_rate": 1.3810637885643898e-06, "loss": 0.0, "num_input_tokens_seen": 60437672, "step": 89645 }, { "epoch": 2.1901644150196664, "grad_norm": 251.47662353515625, "learning_rate": 1.3809849438809853e-06, "loss": 0.1284, "num_input_tokens_seen": 60440616, "step": 89650 }, { "epoch": 2.1902865658515136, "grad_norm": 53.546661376953125, "learning_rate": 1.3809060964269557e-06, "loss": 0.1287, "num_input_tokens_seen": 60444008, "step": 89655 }, { "epoch": 2.190408716683361, "grad_norm": 61.50639343261719, "learning_rate": 1.3808272462028747e-06, "loss": 0.0911, "num_input_tokens_seen": 60447656, "step": 89660 }, { "epoch": 2.190530867515208, "grad_norm": 0.013333199545741081, "learning_rate": 1.3807483932093157e-06, "loss": 0.0001, "num_input_tokens_seen": 60451304, "step": 89665 }, { "epoch": 2.1906530183470547, "grad_norm": 0.07766670733690262, "learning_rate": 1.3806695374468515e-06, "loss": 0.0006, "num_input_tokens_seen": 60454248, "step": 89670 }, { "epoch": 2.190775169178902, "grad_norm": 0.0028038809541612864, "learning_rate": 1.3805906789160564e-06, "loss": 0.0001, "num_input_tokens_seen": 60457320, "step": 89675 }, { "epoch": 2.190897320010749, "grad_norm": 0.08032268285751343, "learning_rate": 1.3805118176175033e-06, "loss": 0.001, "num_input_tokens_seen": 60460392, "step": 89680 }, { "epoch": 2.1910194708425963, "grad_norm": 0.0032088463194668293, "learning_rate": 1.380432953551766e-06, "loss": 0.0002, "num_input_tokens_seen": 60464104, "step": 89685 }, { "epoch": 2.1911416216744435, "grad_norm": 0.011546256020665169, "learning_rate": 1.3803540867194182e-06, "loss": 0.0002, "num_input_tokens_seen": 60467560, "step": 89690 }, { "epoch": 2.1912637725062907, "grad_norm": 1.7818785905838013, "learning_rate": 1.3802752171210329e-06, "loss": 0.0008, "num_input_tokens_seen": 60470696, "step": 89695 }, { "epoch": 2.191385923338138, "grad_norm": 0.06002940610051155, "learning_rate": 1.3801963447571837e-06, "loss": 0.0003, "num_input_tokens_seen": 60474216, "step": 89700 }, { "epoch": 2.191508074169985, "grad_norm": 0.0799216628074646, "learning_rate": 1.380117469628445e-06, "loss": 0.0018, "num_input_tokens_seen": 60477608, "step": 89705 }, { "epoch": 2.1916302250018322, "grad_norm": 0.004125783685594797, "learning_rate": 1.3800385917353894e-06, "loss": 0.0002, "num_input_tokens_seen": 60480616, "step": 89710 }, { "epoch": 2.1917523758336794, "grad_norm": 0.03527636453509331, "learning_rate": 1.379959711078591e-06, "loss": 0.0001, "num_input_tokens_seen": 60484392, "step": 89715 }, { "epoch": 2.1918745266655266, "grad_norm": 0.025349846109747887, "learning_rate": 1.3798808276586233e-06, "loss": 0.0001, "num_input_tokens_seen": 60487592, "step": 89720 }, { "epoch": 2.191996677497374, "grad_norm": 0.028911394998431206, "learning_rate": 1.3798019414760603e-06, "loss": 0.1269, "num_input_tokens_seen": 60490600, "step": 89725 }, { "epoch": 2.192118828329221, "grad_norm": 0.015381722711026669, "learning_rate": 1.3797230525314754e-06, "loss": 0.0008, "num_input_tokens_seen": 60493672, "step": 89730 }, { "epoch": 2.192240979161068, "grad_norm": 0.014268257655203342, "learning_rate": 1.379644160825442e-06, "loss": 0.1136, "num_input_tokens_seen": 60496936, "step": 89735 }, { "epoch": 2.1923631299929154, "grad_norm": 0.08037779480218887, "learning_rate": 1.3795652663585347e-06, "loss": 0.0825, "num_input_tokens_seen": 60500072, "step": 89740 }, { "epoch": 2.1924852808247626, "grad_norm": 0.07356099039316177, "learning_rate": 1.3794863691313264e-06, "loss": 0.0013, "num_input_tokens_seen": 60503400, "step": 89745 }, { "epoch": 2.1926074316566098, "grad_norm": 0.00598715478554368, "learning_rate": 1.379407469144391e-06, "loss": 0.0067, "num_input_tokens_seen": 60506792, "step": 89750 }, { "epoch": 2.192729582488457, "grad_norm": 0.006357715930789709, "learning_rate": 1.3793285663983026e-06, "loss": 0.0003, "num_input_tokens_seen": 60509928, "step": 89755 }, { "epoch": 2.192851733320304, "grad_norm": 67.47779846191406, "learning_rate": 1.3792496608936348e-06, "loss": 0.0691, "num_input_tokens_seen": 60513576, "step": 89760 }, { "epoch": 2.192973884152151, "grad_norm": 0.016489354893565178, "learning_rate": 1.3791707526309615e-06, "loss": 0.0001, "num_input_tokens_seen": 60517096, "step": 89765 }, { "epoch": 2.193096034983998, "grad_norm": 0.08872917294502258, "learning_rate": 1.3790918416108567e-06, "loss": 0.065, "num_input_tokens_seen": 60520872, "step": 89770 }, { "epoch": 2.1932181858158453, "grad_norm": 0.05728995054960251, "learning_rate": 1.3790129278338936e-06, "loss": 0.0002, "num_input_tokens_seen": 60524776, "step": 89775 }, { "epoch": 2.1933403366476925, "grad_norm": 0.017248833552002907, "learning_rate": 1.3789340113006466e-06, "loss": 0.0001, "num_input_tokens_seen": 60528552, "step": 89780 }, { "epoch": 2.1934624874795396, "grad_norm": 28.960683822631836, "learning_rate": 1.3788550920116899e-06, "loss": 0.2405, "num_input_tokens_seen": 60531560, "step": 89785 }, { "epoch": 2.193584638311387, "grad_norm": 0.0018641628557816148, "learning_rate": 1.378776169967597e-06, "loss": 0.1328, "num_input_tokens_seen": 60534760, "step": 89790 }, { "epoch": 2.193706789143234, "grad_norm": 0.1414841264486313, "learning_rate": 1.3786972451689419e-06, "loss": 0.0013, "num_input_tokens_seen": 60538344, "step": 89795 }, { "epoch": 2.193828939975081, "grad_norm": 0.04694800451397896, "learning_rate": 1.3786183176162985e-06, "loss": 0.0002, "num_input_tokens_seen": 60541288, "step": 89800 }, { "epoch": 2.1939510908069284, "grad_norm": 0.1992703676223755, "learning_rate": 1.3785393873102407e-06, "loss": 0.0005, "num_input_tokens_seen": 60545512, "step": 89805 }, { "epoch": 2.1940732416387756, "grad_norm": 0.01089775562286377, "learning_rate": 1.3784604542513428e-06, "loss": 0.0011, "num_input_tokens_seen": 60549032, "step": 89810 }, { "epoch": 2.194195392470623, "grad_norm": 0.03481999412178993, "learning_rate": 1.3783815184401788e-06, "loss": 0.089, "num_input_tokens_seen": 60552424, "step": 89815 }, { "epoch": 2.19431754330247, "grad_norm": 0.004920847248286009, "learning_rate": 1.3783025798773224e-06, "loss": 0.0489, "num_input_tokens_seen": 60555496, "step": 89820 }, { "epoch": 2.194439694134317, "grad_norm": 0.018685011193156242, "learning_rate": 1.378223638563348e-06, "loss": 0.0003, "num_input_tokens_seen": 60559528, "step": 89825 }, { "epoch": 2.1945618449661644, "grad_norm": 0.13076385855674744, "learning_rate": 1.3781446944988297e-06, "loss": 0.0002, "num_input_tokens_seen": 60563176, "step": 89830 }, { "epoch": 2.1946839957980115, "grad_norm": 0.05780500918626785, "learning_rate": 1.3780657476843414e-06, "loss": 0.0481, "num_input_tokens_seen": 60566376, "step": 89835 }, { "epoch": 2.1948061466298587, "grad_norm": 1.6194716691970825, "learning_rate": 1.3779867981204571e-06, "loss": 0.0743, "num_input_tokens_seen": 60569832, "step": 89840 }, { "epoch": 2.194928297461706, "grad_norm": 0.0012161084450781345, "learning_rate": 1.3779078458077513e-06, "loss": 0.0873, "num_input_tokens_seen": 60573032, "step": 89845 }, { "epoch": 2.1950504482935527, "grad_norm": 0.01662173680961132, "learning_rate": 1.3778288907467982e-06, "loss": 0.0001, "num_input_tokens_seen": 60576296, "step": 89850 }, { "epoch": 2.1951725991254, "grad_norm": 0.003397295018658042, "learning_rate": 1.3777499329381714e-06, "loss": 0.0002, "num_input_tokens_seen": 60579496, "step": 89855 }, { "epoch": 2.195294749957247, "grad_norm": 0.1815873384475708, "learning_rate": 1.3776709723824459e-06, "loss": 0.046, "num_input_tokens_seen": 60583208, "step": 89860 }, { "epoch": 2.1954169007890942, "grad_norm": 0.025489671155810356, "learning_rate": 1.377592009080195e-06, "loss": 0.1189, "num_input_tokens_seen": 60586280, "step": 89865 }, { "epoch": 2.1955390516209414, "grad_norm": 0.17918871343135834, "learning_rate": 1.3775130430319936e-06, "loss": 0.0671, "num_input_tokens_seen": 60589544, "step": 89870 }, { "epoch": 2.1956612024527886, "grad_norm": 0.028614962473511696, "learning_rate": 1.377434074238416e-06, "loss": 0.0458, "num_input_tokens_seen": 60593064, "step": 89875 }, { "epoch": 2.195783353284636, "grad_norm": 0.03311797231435776, "learning_rate": 1.377355102700036e-06, "loss": 0.0002, "num_input_tokens_seen": 60596008, "step": 89880 }, { "epoch": 2.195905504116483, "grad_norm": 49.83849334716797, "learning_rate": 1.3772761284174286e-06, "loss": 0.087, "num_input_tokens_seen": 60599528, "step": 89885 }, { "epoch": 2.19602765494833, "grad_norm": 0.08110342174768448, "learning_rate": 1.3771971513911675e-06, "loss": 0.0002, "num_input_tokens_seen": 60603304, "step": 89890 }, { "epoch": 2.1961498057801774, "grad_norm": 15.620611190795898, "learning_rate": 1.3771181716218277e-06, "loss": 0.0338, "num_input_tokens_seen": 60606504, "step": 89895 }, { "epoch": 2.1962719566120246, "grad_norm": 0.09303645044565201, "learning_rate": 1.3770391891099824e-06, "loss": 0.0557, "num_input_tokens_seen": 60609960, "step": 89900 }, { "epoch": 2.1963941074438718, "grad_norm": 0.007297168485820293, "learning_rate": 1.376960203856207e-06, "loss": 0.0003, "num_input_tokens_seen": 60613416, "step": 89905 }, { "epoch": 2.196516258275719, "grad_norm": 0.0076292455196380615, "learning_rate": 1.3768812158610757e-06, "loss": 0.0001, "num_input_tokens_seen": 60616616, "step": 89910 }, { "epoch": 2.196638409107566, "grad_norm": 0.003555135801434517, "learning_rate": 1.3768022251251627e-06, "loss": 0.0, "num_input_tokens_seen": 60620072, "step": 89915 }, { "epoch": 2.1967605599394133, "grad_norm": 13.932856559753418, "learning_rate": 1.3767232316490428e-06, "loss": 0.0513, "num_input_tokens_seen": 60623400, "step": 89920 }, { "epoch": 2.1968827107712605, "grad_norm": 46.25975036621094, "learning_rate": 1.3766442354332899e-06, "loss": 0.0836, "num_input_tokens_seen": 60626600, "step": 89925 }, { "epoch": 2.1970048616031077, "grad_norm": 0.12795943021774292, "learning_rate": 1.3765652364784787e-06, "loss": 0.0261, "num_input_tokens_seen": 60629864, "step": 89930 }, { "epoch": 2.1971270124349545, "grad_norm": 0.22673307359218597, "learning_rate": 1.3764862347851844e-06, "loss": 0.0002, "num_input_tokens_seen": 60633640, "step": 89935 }, { "epoch": 2.1972491632668016, "grad_norm": 0.02614295296370983, "learning_rate": 1.3764072303539806e-06, "loss": 0.0729, "num_input_tokens_seen": 60636904, "step": 89940 }, { "epoch": 2.197371314098649, "grad_norm": 37.92133331298828, "learning_rate": 1.3763282231854425e-06, "loss": 0.0451, "num_input_tokens_seen": 60640680, "step": 89945 }, { "epoch": 2.197493464930496, "grad_norm": 0.6765152812004089, "learning_rate": 1.376249213280144e-06, "loss": 0.0003, "num_input_tokens_seen": 60643880, "step": 89950 }, { "epoch": 2.197615615762343, "grad_norm": 0.031104207038879395, "learning_rate": 1.37617020063866e-06, "loss": 0.0311, "num_input_tokens_seen": 60647016, "step": 89955 }, { "epoch": 2.1977377665941904, "grad_norm": 0.03291548416018486, "learning_rate": 1.3760911852615654e-06, "loss": 0.0001, "num_input_tokens_seen": 60650216, "step": 89960 }, { "epoch": 2.1978599174260376, "grad_norm": 0.13013535737991333, "learning_rate": 1.376012167149434e-06, "loss": 0.0002, "num_input_tokens_seen": 60653224, "step": 89965 }, { "epoch": 2.197982068257885, "grad_norm": 0.0047679804265499115, "learning_rate": 1.3759331463028414e-06, "loss": 0.0841, "num_input_tokens_seen": 60656552, "step": 89970 }, { "epoch": 2.198104219089732, "grad_norm": 0.024526601657271385, "learning_rate": 1.3758541227223618e-06, "loss": 0.097, "num_input_tokens_seen": 60660776, "step": 89975 }, { "epoch": 2.198226369921579, "grad_norm": 241.2824249267578, "learning_rate": 1.3757750964085698e-06, "loss": 0.0182, "num_input_tokens_seen": 60664168, "step": 89980 }, { "epoch": 2.1983485207534263, "grad_norm": 0.0037704810965806246, "learning_rate": 1.3756960673620403e-06, "loss": 0.0001, "num_input_tokens_seen": 60667688, "step": 89985 }, { "epoch": 2.1984706715852735, "grad_norm": 0.014645399525761604, "learning_rate": 1.375617035583348e-06, "loss": 0.0001, "num_input_tokens_seen": 60670952, "step": 89990 }, { "epoch": 2.1985928224171207, "grad_norm": 0.301076203584671, "learning_rate": 1.3755380010730677e-06, "loss": 0.0002, "num_input_tokens_seen": 60674600, "step": 89995 }, { "epoch": 2.198714973248968, "grad_norm": 0.014310811646282673, "learning_rate": 1.375458963831774e-06, "loss": 0.0679, "num_input_tokens_seen": 60677800, "step": 90000 }, { "epoch": 2.198837124080815, "grad_norm": 0.003124415874481201, "learning_rate": 1.3753799238600416e-06, "loss": 0.0548, "num_input_tokens_seen": 60681192, "step": 90005 }, { "epoch": 2.1989592749126623, "grad_norm": 1.3749229907989502, "learning_rate": 1.3753008811584455e-06, "loss": 0.0006, "num_input_tokens_seen": 60684328, "step": 90010 }, { "epoch": 2.1990814257445095, "grad_norm": 0.003533211536705494, "learning_rate": 1.3752218357275605e-06, "loss": 0.0643, "num_input_tokens_seen": 60687528, "step": 90015 }, { "epoch": 2.1992035765763567, "grad_norm": 1.0003328323364258, "learning_rate": 1.3751427875679613e-06, "loss": 0.0002, "num_input_tokens_seen": 60691304, "step": 90020 }, { "epoch": 2.199325727408204, "grad_norm": 14.11498737335205, "learning_rate": 1.3750637366802227e-06, "loss": 0.1813, "num_input_tokens_seen": 60694376, "step": 90025 }, { "epoch": 2.1994478782400506, "grad_norm": 0.04383881017565727, "learning_rate": 1.37498468306492e-06, "loss": 0.0651, "num_input_tokens_seen": 60697320, "step": 90030 }, { "epoch": 2.199570029071898, "grad_norm": 0.08588389307260513, "learning_rate": 1.3749056267226276e-06, "loss": 0.0005, "num_input_tokens_seen": 60700648, "step": 90035 }, { "epoch": 2.199692179903745, "grad_norm": 0.12183842808008194, "learning_rate": 1.3748265676539207e-06, "loss": 0.0003, "num_input_tokens_seen": 60703656, "step": 90040 }, { "epoch": 2.199814330735592, "grad_norm": 248.8981170654297, "learning_rate": 1.3747475058593742e-06, "loss": 0.0727, "num_input_tokens_seen": 60706856, "step": 90045 }, { "epoch": 2.1999364815674394, "grad_norm": 22.585060119628906, "learning_rate": 1.3746684413395634e-06, "loss": 0.0516, "num_input_tokens_seen": 60710632, "step": 90050 }, { "epoch": 2.2000586323992866, "grad_norm": 0.40043529868125916, "learning_rate": 1.3745893740950622e-06, "loss": 0.0003, "num_input_tokens_seen": 60714344, "step": 90055 }, { "epoch": 2.2001807832311338, "grad_norm": 0.006907534785568714, "learning_rate": 1.374510304126447e-06, "loss": 0.0949, "num_input_tokens_seen": 60717480, "step": 90060 }, { "epoch": 2.200302934062981, "grad_norm": 36.79753494262695, "learning_rate": 1.3744312314342918e-06, "loss": 0.1318, "num_input_tokens_seen": 60721576, "step": 90065 }, { "epoch": 2.200425084894828, "grad_norm": 3.2954399585723877, "learning_rate": 1.374352156019172e-06, "loss": 0.0243, "num_input_tokens_seen": 60724712, "step": 90070 }, { "epoch": 2.2005472357266753, "grad_norm": 0.00985129177570343, "learning_rate": 1.3742730778816626e-06, "loss": 0.0779, "num_input_tokens_seen": 60728168, "step": 90075 }, { "epoch": 2.2006693865585225, "grad_norm": 0.004111927468329668, "learning_rate": 1.3741939970223388e-06, "loss": 0.0005, "num_input_tokens_seen": 60732072, "step": 90080 }, { "epoch": 2.2007915373903697, "grad_norm": 2.2431719303131104, "learning_rate": 1.3741149134417756e-06, "loss": 0.0424, "num_input_tokens_seen": 60735464, "step": 90085 }, { "epoch": 2.200913688222217, "grad_norm": 0.14430655539035797, "learning_rate": 1.3740358271405481e-06, "loss": 0.0006, "num_input_tokens_seen": 60738664, "step": 90090 }, { "epoch": 2.201035839054064, "grad_norm": 18.08392333984375, "learning_rate": 1.3739567381192316e-06, "loss": 0.0478, "num_input_tokens_seen": 60742056, "step": 90095 }, { "epoch": 2.2011579898859113, "grad_norm": 0.01610242761671543, "learning_rate": 1.373877646378401e-06, "loss": 0.0002, "num_input_tokens_seen": 60745000, "step": 90100 }, { "epoch": 2.2012801407177585, "grad_norm": 0.08473029732704163, "learning_rate": 1.3737985519186316e-06, "loss": 0.0397, "num_input_tokens_seen": 60748328, "step": 90105 }, { "epoch": 2.2014022915496056, "grad_norm": 18.636194229125977, "learning_rate": 1.3737194547404986e-06, "loss": 0.0622, "num_input_tokens_seen": 60751528, "step": 90110 }, { "epoch": 2.2015244423814524, "grad_norm": 0.43971750140190125, "learning_rate": 1.373640354844577e-06, "loss": 0.0006, "num_input_tokens_seen": 60754472, "step": 90115 }, { "epoch": 2.2016465932132996, "grad_norm": 0.31794047355651855, "learning_rate": 1.3735612522314423e-06, "loss": 0.0004, "num_input_tokens_seen": 60757672, "step": 90120 }, { "epoch": 2.2017687440451468, "grad_norm": 0.01444853376597166, "learning_rate": 1.37348214690167e-06, "loss": 0.0477, "num_input_tokens_seen": 60760808, "step": 90125 }, { "epoch": 2.201890894876994, "grad_norm": 0.0342915803194046, "learning_rate": 1.373403038855835e-06, "loss": 0.0005, "num_input_tokens_seen": 60764520, "step": 90130 }, { "epoch": 2.202013045708841, "grad_norm": 0.07567956298589706, "learning_rate": 1.3733239280945124e-06, "loss": 0.0005, "num_input_tokens_seen": 60768168, "step": 90135 }, { "epoch": 2.2021351965406883, "grad_norm": 0.0931648463010788, "learning_rate": 1.373244814618278e-06, "loss": 0.0753, "num_input_tokens_seen": 60771304, "step": 90140 }, { "epoch": 2.2022573473725355, "grad_norm": 268.7383728027344, "learning_rate": 1.3731656984277069e-06, "loss": 0.0331, "num_input_tokens_seen": 60774632, "step": 90145 }, { "epoch": 2.2023794982043827, "grad_norm": 3.1963915824890137, "learning_rate": 1.3730865795233744e-06, "loss": 0.0361, "num_input_tokens_seen": 60777896, "step": 90150 }, { "epoch": 2.20250164903623, "grad_norm": 0.13845311105251312, "learning_rate": 1.373007457905856e-06, "loss": 0.0006, "num_input_tokens_seen": 60781096, "step": 90155 }, { "epoch": 2.202623799868077, "grad_norm": 0.012970576994121075, "learning_rate": 1.3729283335757272e-06, "loss": 0.0003, "num_input_tokens_seen": 60784616, "step": 90160 }, { "epoch": 2.2027459506999243, "grad_norm": 0.005554935894906521, "learning_rate": 1.372849206533563e-06, "loss": 0.0003, "num_input_tokens_seen": 60787816, "step": 90165 }, { "epoch": 2.2028681015317715, "grad_norm": 0.021228192374110222, "learning_rate": 1.3727700767799393e-06, "loss": 0.0515, "num_input_tokens_seen": 60790760, "step": 90170 }, { "epoch": 2.2029902523636187, "grad_norm": 0.03763711079955101, "learning_rate": 1.372690944315431e-06, "loss": 0.0002, "num_input_tokens_seen": 60793768, "step": 90175 }, { "epoch": 2.203112403195466, "grad_norm": 23.194503784179688, "learning_rate": 1.372611809140614e-06, "loss": 0.0848, "num_input_tokens_seen": 60797032, "step": 90180 }, { "epoch": 2.203234554027313, "grad_norm": 0.2247171700000763, "learning_rate": 1.3725326712560638e-06, "loss": 0.0307, "num_input_tokens_seen": 60800232, "step": 90185 }, { "epoch": 2.2033567048591602, "grad_norm": 0.0721370056271553, "learning_rate": 1.3724535306623558e-06, "loss": 0.0562, "num_input_tokens_seen": 60803880, "step": 90190 }, { "epoch": 2.2034788556910074, "grad_norm": 0.08112762123346329, "learning_rate": 1.3723743873600658e-06, "loss": 0.0002, "num_input_tokens_seen": 60807400, "step": 90195 }, { "epoch": 2.2036010065228546, "grad_norm": 15.60988712310791, "learning_rate": 1.3722952413497689e-06, "loss": 0.0491, "num_input_tokens_seen": 60810472, "step": 90200 }, { "epoch": 2.203723157354702, "grad_norm": 0.002482697134837508, "learning_rate": 1.372216092632041e-06, "loss": 0.0502, "num_input_tokens_seen": 60814120, "step": 90205 }, { "epoch": 2.2038453081865486, "grad_norm": 0.2348630726337433, "learning_rate": 1.372136941207457e-06, "loss": 0.0286, "num_input_tokens_seen": 60817512, "step": 90210 }, { "epoch": 2.2039674590183957, "grad_norm": 0.005988630000501871, "learning_rate": 1.3720577870765934e-06, "loss": 0.0015, "num_input_tokens_seen": 60821160, "step": 90215 }, { "epoch": 2.204089609850243, "grad_norm": 0.21893872320652008, "learning_rate": 1.3719786302400258e-06, "loss": 0.0001, "num_input_tokens_seen": 60824808, "step": 90220 }, { "epoch": 2.20421176068209, "grad_norm": 194.6992950439453, "learning_rate": 1.3718994706983293e-06, "loss": 0.0117, "num_input_tokens_seen": 60828136, "step": 90225 }, { "epoch": 2.2043339115139373, "grad_norm": 0.3046874403953552, "learning_rate": 1.3718203084520798e-06, "loss": 0.0005, "num_input_tokens_seen": 60831208, "step": 90230 }, { "epoch": 2.2044560623457845, "grad_norm": 53.637351989746094, "learning_rate": 1.371741143501853e-06, "loss": 0.0699, "num_input_tokens_seen": 60834344, "step": 90235 }, { "epoch": 2.2045782131776317, "grad_norm": 0.007826529443264008, "learning_rate": 1.3716619758482249e-06, "loss": 0.0004, "num_input_tokens_seen": 60837672, "step": 90240 }, { "epoch": 2.204700364009479, "grad_norm": 0.0040454016998410225, "learning_rate": 1.3715828054917705e-06, "loss": 0.0489, "num_input_tokens_seen": 60841576, "step": 90245 }, { "epoch": 2.204822514841326, "grad_norm": 0.01919216848909855, "learning_rate": 1.3715036324330665e-06, "loss": 0.044, "num_input_tokens_seen": 60845032, "step": 90250 }, { "epoch": 2.2049446656731733, "grad_norm": 6.789739563828334e-05, "learning_rate": 1.3714244566726878e-06, "loss": 0.0004, "num_input_tokens_seen": 60848936, "step": 90255 }, { "epoch": 2.2050668165050205, "grad_norm": 30.994749069213867, "learning_rate": 1.3713452782112107e-06, "loss": 0.0502, "num_input_tokens_seen": 60852520, "step": 90260 }, { "epoch": 2.2051889673368676, "grad_norm": 0.2674175202846527, "learning_rate": 1.3712660970492108e-06, "loss": 0.0002, "num_input_tokens_seen": 60855976, "step": 90265 }, { "epoch": 2.205311118168715, "grad_norm": 0.35465195775032043, "learning_rate": 1.371186913187264e-06, "loss": 0.0004, "num_input_tokens_seen": 60859240, "step": 90270 }, { "epoch": 2.205433269000562, "grad_norm": 0.05108984559774399, "learning_rate": 1.3711077266259459e-06, "loss": 0.0915, "num_input_tokens_seen": 60862312, "step": 90275 }, { "epoch": 2.205555419832409, "grad_norm": 13.732041358947754, "learning_rate": 1.3710285373658328e-06, "loss": 0.1077, "num_input_tokens_seen": 60865640, "step": 90280 }, { "epoch": 2.2056775706642564, "grad_norm": 0.12964510917663574, "learning_rate": 1.3709493454075004e-06, "loss": 0.0458, "num_input_tokens_seen": 60869160, "step": 90285 }, { "epoch": 2.2057997214961036, "grad_norm": 0.15524911880493164, "learning_rate": 1.3708701507515245e-06, "loss": 0.0004, "num_input_tokens_seen": 60872488, "step": 90290 }, { "epoch": 2.2059218723279503, "grad_norm": 0.061491161584854126, "learning_rate": 1.3707909533984811e-06, "loss": 0.0002, "num_input_tokens_seen": 60875496, "step": 90295 }, { "epoch": 2.2060440231597975, "grad_norm": 0.01920522190630436, "learning_rate": 1.3707117533489463e-06, "loss": 0.0952, "num_input_tokens_seen": 60878632, "step": 90300 }, { "epoch": 2.2061661739916447, "grad_norm": 0.08350993692874908, "learning_rate": 1.370632550603496e-06, "loss": 0.0002, "num_input_tokens_seen": 60881704, "step": 90305 }, { "epoch": 2.206288324823492, "grad_norm": 0.5707146525382996, "learning_rate": 1.3705533451627058e-06, "loss": 0.0509, "num_input_tokens_seen": 60884968, "step": 90310 }, { "epoch": 2.206410475655339, "grad_norm": 0.0054165455512702465, "learning_rate": 1.3704741370271522e-06, "loss": 0.0392, "num_input_tokens_seen": 60888488, "step": 90315 }, { "epoch": 2.2065326264871863, "grad_norm": 0.04952197149395943, "learning_rate": 1.370394926197411e-06, "loss": 0.0003, "num_input_tokens_seen": 60891816, "step": 90320 }, { "epoch": 2.2066547773190335, "grad_norm": 0.012744433246552944, "learning_rate": 1.3703157126740583e-06, "loss": 0.0002, "num_input_tokens_seen": 60895272, "step": 90325 }, { "epoch": 2.2067769281508807, "grad_norm": 0.2308608442544937, "learning_rate": 1.37023649645767e-06, "loss": 0.0567, "num_input_tokens_seen": 60898728, "step": 90330 }, { "epoch": 2.206899078982728, "grad_norm": 0.019291376695036888, "learning_rate": 1.3701572775488225e-06, "loss": 0.0215, "num_input_tokens_seen": 60902120, "step": 90335 }, { "epoch": 2.207021229814575, "grad_norm": 0.010389735922217369, "learning_rate": 1.3700780559480913e-06, "loss": 0.1538, "num_input_tokens_seen": 60905640, "step": 90340 }, { "epoch": 2.2071433806464222, "grad_norm": 0.0885319635272026, "learning_rate": 1.3699988316560536e-06, "loss": 0.0001, "num_input_tokens_seen": 60909288, "step": 90345 }, { "epoch": 2.2072655314782694, "grad_norm": 16.90826988220215, "learning_rate": 1.3699196046732844e-06, "loss": 0.0392, "num_input_tokens_seen": 60912680, "step": 90350 }, { "epoch": 2.2073876823101166, "grad_norm": 36.88505554199219, "learning_rate": 1.3698403750003604e-06, "loss": 0.0567, "num_input_tokens_seen": 60916264, "step": 90355 }, { "epoch": 2.207509833141964, "grad_norm": 0.008276794105768204, "learning_rate": 1.3697611426378582e-06, "loss": 0.064, "num_input_tokens_seen": 60919400, "step": 90360 }, { "epoch": 2.207631983973811, "grad_norm": 0.14579278230667114, "learning_rate": 1.3696819075863527e-06, "loss": 0.0507, "num_input_tokens_seen": 60922408, "step": 90365 }, { "epoch": 2.207754134805658, "grad_norm": 1.9445993900299072, "learning_rate": 1.3696026698464216e-06, "loss": 0.0576, "num_input_tokens_seen": 60925608, "step": 90370 }, { "epoch": 2.2078762856375054, "grad_norm": 0.6416146755218506, "learning_rate": 1.3695234294186403e-06, "loss": 0.0005, "num_input_tokens_seen": 60928616, "step": 90375 }, { "epoch": 2.207998436469352, "grad_norm": 0.6925631165504456, "learning_rate": 1.369444186303585e-06, "loss": 0.039, "num_input_tokens_seen": 60932136, "step": 90380 }, { "epoch": 2.2081205873011998, "grad_norm": 2.5282368659973145, "learning_rate": 1.3693649405018323e-06, "loss": 0.0706, "num_input_tokens_seen": 60935656, "step": 90385 }, { "epoch": 2.2082427381330465, "grad_norm": 0.6798292398452759, "learning_rate": 1.3692856920139586e-06, "loss": 0.0013, "num_input_tokens_seen": 60938664, "step": 90390 }, { "epoch": 2.2083648889648937, "grad_norm": 0.047019269317388535, "learning_rate": 1.36920644084054e-06, "loss": 0.1045, "num_input_tokens_seen": 60942120, "step": 90395 }, { "epoch": 2.208487039796741, "grad_norm": 0.004635084420442581, "learning_rate": 1.3691271869821526e-06, "loss": 0.0011, "num_input_tokens_seen": 60945640, "step": 90400 }, { "epoch": 2.208609190628588, "grad_norm": 0.024701261892914772, "learning_rate": 1.369047930439373e-06, "loss": 0.0944, "num_input_tokens_seen": 60949096, "step": 90405 }, { "epoch": 2.2087313414604353, "grad_norm": 0.15579397976398468, "learning_rate": 1.368968671212778e-06, "loss": 0.0002, "num_input_tokens_seen": 60952808, "step": 90410 }, { "epoch": 2.2088534922922824, "grad_norm": 0.4236942529678345, "learning_rate": 1.3688894093029432e-06, "loss": 0.0506, "num_input_tokens_seen": 60955880, "step": 90415 }, { "epoch": 2.2089756431241296, "grad_norm": 0.33078476786613464, "learning_rate": 1.3688101447104456e-06, "loss": 0.0004, "num_input_tokens_seen": 60959208, "step": 90420 }, { "epoch": 2.209097793955977, "grad_norm": 0.03016742318868637, "learning_rate": 1.3687308774358616e-06, "loss": 0.0005, "num_input_tokens_seen": 60962600, "step": 90425 }, { "epoch": 2.209219944787824, "grad_norm": 0.010646332986652851, "learning_rate": 1.368651607479767e-06, "loss": 0.0002, "num_input_tokens_seen": 60966248, "step": 90430 }, { "epoch": 2.209342095619671, "grad_norm": 0.028630100190639496, "learning_rate": 1.3685723348427388e-06, "loss": 0.0005, "num_input_tokens_seen": 60969448, "step": 90435 }, { "epoch": 2.2094642464515184, "grad_norm": 0.015928685665130615, "learning_rate": 1.3684930595253538e-06, "loss": 0.0094, "num_input_tokens_seen": 60972776, "step": 90440 }, { "epoch": 2.2095863972833656, "grad_norm": 0.03674129769206047, "learning_rate": 1.3684137815281882e-06, "loss": 0.0001, "num_input_tokens_seen": 60976040, "step": 90445 }, { "epoch": 2.209708548115213, "grad_norm": 0.01847653090953827, "learning_rate": 1.3683345008518181e-06, "loss": 0.0887, "num_input_tokens_seen": 60979304, "step": 90450 }, { "epoch": 2.20983069894706, "grad_norm": 0.0479811355471611, "learning_rate": 1.3682552174968208e-06, "loss": 0.074, "num_input_tokens_seen": 60982632, "step": 90455 }, { "epoch": 2.209952849778907, "grad_norm": 0.058019086718559265, "learning_rate": 1.3681759314637723e-06, "loss": 0.0298, "num_input_tokens_seen": 60986408, "step": 90460 }, { "epoch": 2.2100750006107543, "grad_norm": 0.02313101850450039, "learning_rate": 1.3680966427532494e-06, "loss": 0.0002, "num_input_tokens_seen": 60989992, "step": 90465 }, { "epoch": 2.2101971514426015, "grad_norm": 0.3875404894351959, "learning_rate": 1.3680173513658289e-06, "loss": 0.0217, "num_input_tokens_seen": 60993320, "step": 90470 }, { "epoch": 2.2103193022744483, "grad_norm": 23.808727264404297, "learning_rate": 1.367938057302087e-06, "loss": 0.0378, "num_input_tokens_seen": 60996712, "step": 90475 }, { "epoch": 2.2104414531062955, "grad_norm": 0.011256799101829529, "learning_rate": 1.3678587605626007e-06, "loss": 0.0004, "num_input_tokens_seen": 61000168, "step": 90480 }, { "epoch": 2.2105636039381427, "grad_norm": 0.009701249189674854, "learning_rate": 1.3677794611479466e-06, "loss": 0.0011, "num_input_tokens_seen": 61003304, "step": 90485 }, { "epoch": 2.21068575476999, "grad_norm": 0.31478777527809143, "learning_rate": 1.367700159058701e-06, "loss": 0.0577, "num_input_tokens_seen": 61006824, "step": 90490 }, { "epoch": 2.210807905601837, "grad_norm": 0.023821784183382988, "learning_rate": 1.3676208542954414e-06, "loss": 0.0446, "num_input_tokens_seen": 61010152, "step": 90495 }, { "epoch": 2.2109300564336842, "grad_norm": 0.018663931638002396, "learning_rate": 1.3675415468587436e-06, "loss": 0.1367, "num_input_tokens_seen": 61013608, "step": 90500 }, { "epoch": 2.2110522072655314, "grad_norm": 0.009511927142739296, "learning_rate": 1.3674622367491852e-06, "loss": 0.0008, "num_input_tokens_seen": 61016872, "step": 90505 }, { "epoch": 2.2111743580973786, "grad_norm": 0.05456589534878731, "learning_rate": 1.3673829239673424e-06, "loss": 0.0004, "num_input_tokens_seen": 61020200, "step": 90510 }, { "epoch": 2.211296508929226, "grad_norm": 6.82743501663208, "learning_rate": 1.3673036085137926e-06, "loss": 0.0008, "num_input_tokens_seen": 61023464, "step": 90515 }, { "epoch": 2.211418659761073, "grad_norm": 0.06412504613399506, "learning_rate": 1.3672242903891117e-06, "loss": 0.0002, "num_input_tokens_seen": 61027176, "step": 90520 }, { "epoch": 2.21154081059292, "grad_norm": 0.0497899204492569, "learning_rate": 1.3671449695938768e-06, "loss": 0.0002, "num_input_tokens_seen": 61030440, "step": 90525 }, { "epoch": 2.2116629614247674, "grad_norm": 0.008111564442515373, "learning_rate": 1.3670656461286655e-06, "loss": 0.0004, "num_input_tokens_seen": 61033896, "step": 90530 }, { "epoch": 2.2117851122566146, "grad_norm": 0.06232449412345886, "learning_rate": 1.3669863199940538e-06, "loss": 0.011, "num_input_tokens_seen": 61037096, "step": 90535 }, { "epoch": 2.2119072630884618, "grad_norm": 0.052241094410419464, "learning_rate": 1.3669069911906189e-06, "loss": 0.0002, "num_input_tokens_seen": 61040680, "step": 90540 }, { "epoch": 2.212029413920309, "grad_norm": 0.2057826668024063, "learning_rate": 1.3668276597189375e-06, "loss": 0.1162, "num_input_tokens_seen": 61044072, "step": 90545 }, { "epoch": 2.212151564752156, "grad_norm": 0.09444887936115265, "learning_rate": 1.3667483255795868e-06, "loss": 0.0005, "num_input_tokens_seen": 61047208, "step": 90550 }, { "epoch": 2.2122737155840033, "grad_norm": 0.004477665759623051, "learning_rate": 1.3666689887731434e-06, "loss": 0.0625, "num_input_tokens_seen": 61050536, "step": 90555 }, { "epoch": 2.21239586641585, "grad_norm": 0.010698872618377209, "learning_rate": 1.366589649300185e-06, "loss": 0.0355, "num_input_tokens_seen": 61053992, "step": 90560 }, { "epoch": 2.2125180172476973, "grad_norm": 0.14260634779930115, "learning_rate": 1.366510307161288e-06, "loss": 0.1054, "num_input_tokens_seen": 61057448, "step": 90565 }, { "epoch": 2.2126401680795444, "grad_norm": 0.05694550275802612, "learning_rate": 1.3664309623570293e-06, "loss": 0.0379, "num_input_tokens_seen": 61060456, "step": 90570 }, { "epoch": 2.2127623189113916, "grad_norm": 0.012118209153413773, "learning_rate": 1.3663516148879861e-06, "loss": 0.1337, "num_input_tokens_seen": 61064168, "step": 90575 }, { "epoch": 2.212884469743239, "grad_norm": 73.87699890136719, "learning_rate": 1.3662722647547355e-06, "loss": 0.1482, "num_input_tokens_seen": 61067688, "step": 90580 }, { "epoch": 2.213006620575086, "grad_norm": 0.34617283940315247, "learning_rate": 1.366192911957854e-06, "loss": 0.0408, "num_input_tokens_seen": 61071016, "step": 90585 }, { "epoch": 2.213128771406933, "grad_norm": 1.8424748182296753, "learning_rate": 1.3661135564979198e-06, "loss": 0.0242, "num_input_tokens_seen": 61074024, "step": 90590 }, { "epoch": 2.2132509222387804, "grad_norm": 0.06380114704370499, "learning_rate": 1.366034198375509e-06, "loss": 0.0005, "num_input_tokens_seen": 61077160, "step": 90595 }, { "epoch": 2.2133730730706276, "grad_norm": 0.5055896043777466, "learning_rate": 1.3659548375911992e-06, "loss": 0.0005, "num_input_tokens_seen": 61080232, "step": 90600 }, { "epoch": 2.2134952239024748, "grad_norm": 0.08805550634860992, "learning_rate": 1.3658754741455674e-06, "loss": 0.1485, "num_input_tokens_seen": 61084584, "step": 90605 }, { "epoch": 2.213617374734322, "grad_norm": 0.40562599897384644, "learning_rate": 1.3657961080391907e-06, "loss": 0.0731, "num_input_tokens_seen": 61088040, "step": 90610 }, { "epoch": 2.213739525566169, "grad_norm": 0.041347797960042953, "learning_rate": 1.3657167392726463e-06, "loss": 0.0745, "num_input_tokens_seen": 61091304, "step": 90615 }, { "epoch": 2.2138616763980163, "grad_norm": 0.05579262226819992, "learning_rate": 1.3656373678465114e-06, "loss": 0.0939, "num_input_tokens_seen": 61095016, "step": 90620 }, { "epoch": 2.2139838272298635, "grad_norm": 0.01780310645699501, "learning_rate": 1.3655579937613633e-06, "loss": 0.0014, "num_input_tokens_seen": 61098280, "step": 90625 }, { "epoch": 2.2141059780617107, "grad_norm": 0.34165212512016296, "learning_rate": 1.365478617017779e-06, "loss": 0.0004, "num_input_tokens_seen": 61101608, "step": 90630 }, { "epoch": 2.214228128893558, "grad_norm": 0.11793646216392517, "learning_rate": 1.3653992376163359e-06, "loss": 0.0006, "num_input_tokens_seen": 61104872, "step": 90635 }, { "epoch": 2.214350279725405, "grad_norm": 225.5457763671875, "learning_rate": 1.3653198555576113e-06, "loss": 0.1049, "num_input_tokens_seen": 61108328, "step": 90640 }, { "epoch": 2.2144724305572523, "grad_norm": 0.031289275735616684, "learning_rate": 1.3652404708421823e-06, "loss": 0.0003, "num_input_tokens_seen": 61111336, "step": 90645 }, { "epoch": 2.2145945813890995, "grad_norm": 0.016135146841406822, "learning_rate": 1.3651610834706266e-06, "loss": 0.0296, "num_input_tokens_seen": 61114408, "step": 90650 }, { "epoch": 2.2147167322209462, "grad_norm": 0.06828448176383972, "learning_rate": 1.3650816934435211e-06, "loss": 0.0348, "num_input_tokens_seen": 61118120, "step": 90655 }, { "epoch": 2.2148388830527934, "grad_norm": 14.590534210205078, "learning_rate": 1.3650023007614436e-06, "loss": 0.0594, "num_input_tokens_seen": 61121448, "step": 90660 }, { "epoch": 2.2149610338846406, "grad_norm": 0.016529662534594536, "learning_rate": 1.3649229054249709e-06, "loss": 0.0411, "num_input_tokens_seen": 61125160, "step": 90665 }, { "epoch": 2.215083184716488, "grad_norm": 0.03518630936741829, "learning_rate": 1.3648435074346812e-06, "loss": 0.0007, "num_input_tokens_seen": 61128296, "step": 90670 }, { "epoch": 2.215205335548335, "grad_norm": 25.680883407592773, "learning_rate": 1.364764106791151e-06, "loss": 0.04, "num_input_tokens_seen": 61131560, "step": 90675 }, { "epoch": 2.215327486380182, "grad_norm": 0.016405398026108742, "learning_rate": 1.3646847034949577e-06, "loss": 0.0004, "num_input_tokens_seen": 61134632, "step": 90680 }, { "epoch": 2.2154496372120294, "grad_norm": 0.10307609289884567, "learning_rate": 1.3646052975466798e-06, "loss": 0.0002, "num_input_tokens_seen": 61138152, "step": 90685 }, { "epoch": 2.2155717880438766, "grad_norm": 0.07581989467144012, "learning_rate": 1.3645258889468938e-06, "loss": 0.0352, "num_input_tokens_seen": 61141480, "step": 90690 }, { "epoch": 2.2156939388757237, "grad_norm": 0.03339642286300659, "learning_rate": 1.3644464776961778e-06, "loss": 0.0001, "num_input_tokens_seen": 61145320, "step": 90695 }, { "epoch": 2.215816089707571, "grad_norm": 0.17576716840267181, "learning_rate": 1.3643670637951086e-06, "loss": 0.0874, "num_input_tokens_seen": 61148520, "step": 90700 }, { "epoch": 2.215938240539418, "grad_norm": 0.08091838657855988, "learning_rate": 1.3642876472442642e-06, "loss": 0.1244, "num_input_tokens_seen": 61151528, "step": 90705 }, { "epoch": 2.2160603913712653, "grad_norm": 0.018327750265598297, "learning_rate": 1.3642082280442219e-06, "loss": 0.0002, "num_input_tokens_seen": 61155112, "step": 90710 }, { "epoch": 2.2161825422031125, "grad_norm": 0.01857393980026245, "learning_rate": 1.3641288061955599e-06, "loss": 0.0001, "num_input_tokens_seen": 61158568, "step": 90715 }, { "epoch": 2.2163046930349597, "grad_norm": 0.012956013903021812, "learning_rate": 1.364049381698855e-06, "loss": 0.0565, "num_input_tokens_seen": 61162024, "step": 90720 }, { "epoch": 2.216426843866807, "grad_norm": 0.11538074910640717, "learning_rate": 1.363969954554685e-06, "loss": 0.0587, "num_input_tokens_seen": 61165544, "step": 90725 }, { "epoch": 2.216548994698654, "grad_norm": 210.1100311279297, "learning_rate": 1.3638905247636276e-06, "loss": 0.1062, "num_input_tokens_seen": 61168936, "step": 90730 }, { "epoch": 2.2166711455305013, "grad_norm": 0.04966253042221069, "learning_rate": 1.3638110923262608e-06, "loss": 0.0007, "num_input_tokens_seen": 61172072, "step": 90735 }, { "epoch": 2.216793296362348, "grad_norm": 0.04618317633867264, "learning_rate": 1.3637316572431613e-06, "loss": 0.0465, "num_input_tokens_seen": 61175080, "step": 90740 }, { "epoch": 2.216915447194195, "grad_norm": 0.05596147105097771, "learning_rate": 1.3636522195149077e-06, "loss": 0.0372, "num_input_tokens_seen": 61178536, "step": 90745 }, { "epoch": 2.2170375980260424, "grad_norm": 35.95188903808594, "learning_rate": 1.363572779142077e-06, "loss": 0.0348, "num_input_tokens_seen": 61181800, "step": 90750 }, { "epoch": 2.2171597488578896, "grad_norm": 0.37679508328437805, "learning_rate": 1.3634933361252477e-06, "loss": 0.0004, "num_input_tokens_seen": 61185192, "step": 90755 }, { "epoch": 2.2172818996897368, "grad_norm": 0.2642463743686676, "learning_rate": 1.3634138904649969e-06, "loss": 0.0004, "num_input_tokens_seen": 61189672, "step": 90760 }, { "epoch": 2.217404050521584, "grad_norm": 374.71319580078125, "learning_rate": 1.3633344421619027e-06, "loss": 0.0411, "num_input_tokens_seen": 61192744, "step": 90765 }, { "epoch": 2.217526201353431, "grad_norm": 0.31421008706092834, "learning_rate": 1.3632549912165425e-06, "loss": 0.0002, "num_input_tokens_seen": 61196072, "step": 90770 }, { "epoch": 2.2176483521852783, "grad_norm": 0.08052527159452438, "learning_rate": 1.3631755376294944e-06, "loss": 0.0159, "num_input_tokens_seen": 61199336, "step": 90775 }, { "epoch": 2.2177705030171255, "grad_norm": 0.09668438136577606, "learning_rate": 1.363096081401336e-06, "loss": 0.0001, "num_input_tokens_seen": 61202664, "step": 90780 }, { "epoch": 2.2178926538489727, "grad_norm": 0.08282022178173065, "learning_rate": 1.3630166225326453e-06, "loss": 0.0003, "num_input_tokens_seen": 61206056, "step": 90785 }, { "epoch": 2.21801480468082, "grad_norm": 0.07777266949415207, "learning_rate": 1.3629371610240004e-06, "loss": 0.0214, "num_input_tokens_seen": 61209384, "step": 90790 }, { "epoch": 2.218136955512667, "grad_norm": 0.006129283923655748, "learning_rate": 1.3628576968759784e-06, "loss": 0.0001, "num_input_tokens_seen": 61212648, "step": 90795 }, { "epoch": 2.2182591063445143, "grad_norm": 0.017785819247364998, "learning_rate": 1.3627782300891575e-06, "loss": 0.0001, "num_input_tokens_seen": 61215656, "step": 90800 }, { "epoch": 2.2183812571763615, "grad_norm": 0.0023185538593679667, "learning_rate": 1.362698760664116e-06, "loss": 0.0282, "num_input_tokens_seen": 61219048, "step": 90805 }, { "epoch": 2.2185034080082087, "grad_norm": 0.024504944682121277, "learning_rate": 1.3626192886014317e-06, "loss": 0.0379, "num_input_tokens_seen": 61222312, "step": 90810 }, { "epoch": 2.218625558840056, "grad_norm": 0.050144314765930176, "learning_rate": 1.3625398139016824e-06, "loss": 0.0011, "num_input_tokens_seen": 61226152, "step": 90815 }, { "epoch": 2.218747709671903, "grad_norm": 0.03911041468381882, "learning_rate": 1.362460336565446e-06, "loss": 0.0003, "num_input_tokens_seen": 61229608, "step": 90820 }, { "epoch": 2.2188698605037502, "grad_norm": 0.23010863363742828, "learning_rate": 1.3623808565933005e-06, "loss": 0.0404, "num_input_tokens_seen": 61233000, "step": 90825 }, { "epoch": 2.2189920113355974, "grad_norm": 0.043779559433460236, "learning_rate": 1.362301373985824e-06, "loss": 0.0087, "num_input_tokens_seen": 61236200, "step": 90830 }, { "epoch": 2.219114162167444, "grad_norm": 0.378178209066391, "learning_rate": 1.3622218887435942e-06, "loss": 0.0058, "num_input_tokens_seen": 61239848, "step": 90835 }, { "epoch": 2.2192363129992914, "grad_norm": 0.07556562125682831, "learning_rate": 1.3621424008671895e-06, "loss": 0.0002, "num_input_tokens_seen": 61243176, "step": 90840 }, { "epoch": 2.2193584638311386, "grad_norm": 0.7005949020385742, "learning_rate": 1.362062910357188e-06, "loss": 0.0003, "num_input_tokens_seen": 61246824, "step": 90845 }, { "epoch": 2.2194806146629857, "grad_norm": 0.023375436663627625, "learning_rate": 1.3619834172141675e-06, "loss": 0.0817, "num_input_tokens_seen": 61250152, "step": 90850 }, { "epoch": 2.219602765494833, "grad_norm": 0.0043090349063277245, "learning_rate": 1.3619039214387065e-06, "loss": 0.0001, "num_input_tokens_seen": 61253864, "step": 90855 }, { "epoch": 2.21972491632668, "grad_norm": 0.030831608921289444, "learning_rate": 1.3618244230313826e-06, "loss": 0.0001, "num_input_tokens_seen": 61257064, "step": 90860 }, { "epoch": 2.2198470671585273, "grad_norm": 0.4308274984359741, "learning_rate": 1.361744921992774e-06, "loss": 0.0004, "num_input_tokens_seen": 61260200, "step": 90865 }, { "epoch": 2.2199692179903745, "grad_norm": 0.033835988491773605, "learning_rate": 1.3616654183234596e-06, "loss": 0.0524, "num_input_tokens_seen": 61263464, "step": 90870 }, { "epoch": 2.2200913688222217, "grad_norm": 0.011145480908453465, "learning_rate": 1.3615859120240165e-06, "loss": 0.0334, "num_input_tokens_seen": 61266792, "step": 90875 }, { "epoch": 2.220213519654069, "grad_norm": 0.9979506731033325, "learning_rate": 1.3615064030950236e-06, "loss": 0.0002, "num_input_tokens_seen": 61270184, "step": 90880 }, { "epoch": 2.220335670485916, "grad_norm": 0.016083527356386185, "learning_rate": 1.361426891537059e-06, "loss": 0.049, "num_input_tokens_seen": 61273640, "step": 90885 }, { "epoch": 2.2204578213177633, "grad_norm": 0.0046060411259531975, "learning_rate": 1.3613473773507007e-06, "loss": 0.0811, "num_input_tokens_seen": 61276968, "step": 90890 }, { "epoch": 2.2205799721496104, "grad_norm": 0.010451358743011951, "learning_rate": 1.3612678605365268e-06, "loss": 0.0, "num_input_tokens_seen": 61279848, "step": 90895 }, { "epoch": 2.2207021229814576, "grad_norm": 0.0008068532915785909, "learning_rate": 1.3611883410951162e-06, "loss": 0.0002, "num_input_tokens_seen": 61283432, "step": 90900 }, { "epoch": 2.220824273813305, "grad_norm": 1.482542872428894, "learning_rate": 1.3611088190270467e-06, "loss": 0.0475, "num_input_tokens_seen": 61287080, "step": 90905 }, { "epoch": 2.220946424645152, "grad_norm": 0.016690026968717575, "learning_rate": 1.361029294332897e-06, "loss": 0.0, "num_input_tokens_seen": 61290216, "step": 90910 }, { "epoch": 2.221068575476999, "grad_norm": 0.0038016163744032383, "learning_rate": 1.3609497670132448e-06, "loss": 0.087, "num_input_tokens_seen": 61293416, "step": 90915 }, { "epoch": 2.221190726308846, "grad_norm": 0.000619497848674655, "learning_rate": 1.3608702370686689e-06, "loss": 0.0002, "num_input_tokens_seen": 61296424, "step": 90920 }, { "epoch": 2.221312877140693, "grad_norm": 23.480012893676758, "learning_rate": 1.3607907044997476e-06, "loss": 0.1818, "num_input_tokens_seen": 61299880, "step": 90925 }, { "epoch": 2.2214350279725403, "grad_norm": 0.06933058053255081, "learning_rate": 1.3607111693070595e-06, "loss": 0.0001, "num_input_tokens_seen": 61303720, "step": 90930 }, { "epoch": 2.2215571788043875, "grad_norm": 0.037259768694639206, "learning_rate": 1.3606316314911826e-06, "loss": 0.0741, "num_input_tokens_seen": 61306920, "step": 90935 }, { "epoch": 2.2216793296362347, "grad_norm": 0.015051459893584251, "learning_rate": 1.3605520910526953e-06, "loss": 0.0001, "num_input_tokens_seen": 61310120, "step": 90940 }, { "epoch": 2.221801480468082, "grad_norm": 0.06870535016059875, "learning_rate": 1.3604725479921765e-06, "loss": 0.0002, "num_input_tokens_seen": 61313512, "step": 90945 }, { "epoch": 2.221923631299929, "grad_norm": 970.2088623046875, "learning_rate": 1.3603930023102042e-06, "loss": 0.0292, "num_input_tokens_seen": 61316584, "step": 90950 }, { "epoch": 2.2220457821317763, "grad_norm": 0.002735344460234046, "learning_rate": 1.3603134540073571e-06, "loss": 0.062, "num_input_tokens_seen": 61320232, "step": 90955 }, { "epoch": 2.2221679329636235, "grad_norm": 0.005101367831230164, "learning_rate": 1.3602339030842135e-06, "loss": 0.0001, "num_input_tokens_seen": 61323432, "step": 90960 }, { "epoch": 2.2222900837954707, "grad_norm": 0.21339182555675507, "learning_rate": 1.3601543495413521e-06, "loss": 0.0421, "num_input_tokens_seen": 61326760, "step": 90965 }, { "epoch": 2.222412234627318, "grad_norm": 14.178060531616211, "learning_rate": 1.3600747933793516e-06, "loss": 0.0841, "num_input_tokens_seen": 61330152, "step": 90970 }, { "epoch": 2.222534385459165, "grad_norm": 0.11821889132261276, "learning_rate": 1.3599952345987902e-06, "loss": 0.0005, "num_input_tokens_seen": 61333480, "step": 90975 }, { "epoch": 2.2226565362910122, "grad_norm": 13.821526527404785, "learning_rate": 1.3599156732002467e-06, "loss": 0.0673, "num_input_tokens_seen": 61336936, "step": 90980 }, { "epoch": 2.2227786871228594, "grad_norm": 0.028573287650942802, "learning_rate": 1.3598361091842999e-06, "loss": 0.0565, "num_input_tokens_seen": 61340392, "step": 90985 }, { "epoch": 2.2229008379547066, "grad_norm": 47.76668930053711, "learning_rate": 1.3597565425515273e-06, "loss": 0.0946, "num_input_tokens_seen": 61343400, "step": 90990 }, { "epoch": 2.223022988786554, "grad_norm": 0.028161996975541115, "learning_rate": 1.359676973302509e-06, "loss": 0.0033, "num_input_tokens_seen": 61346536, "step": 90995 }, { "epoch": 2.223145139618401, "grad_norm": 0.05201875790953636, "learning_rate": 1.359597401437823e-06, "loss": 0.0479, "num_input_tokens_seen": 61350056, "step": 91000 }, { "epoch": 2.2232672904502477, "grad_norm": 0.01158086210489273, "learning_rate": 1.3595178269580478e-06, "loss": 0.0855, "num_input_tokens_seen": 61353640, "step": 91005 }, { "epoch": 2.223389441282095, "grad_norm": 147.93606567382812, "learning_rate": 1.3594382498637625e-06, "loss": 0.0065, "num_input_tokens_seen": 61356712, "step": 91010 }, { "epoch": 2.223511592113942, "grad_norm": 0.26321476697921753, "learning_rate": 1.3593586701555454e-06, "loss": 0.0441, "num_input_tokens_seen": 61359912, "step": 91015 }, { "epoch": 2.2236337429457893, "grad_norm": 26.365808486938477, "learning_rate": 1.359279087833975e-06, "loss": 0.0261, "num_input_tokens_seen": 61363624, "step": 91020 }, { "epoch": 2.2237558937776365, "grad_norm": 0.005364961456507444, "learning_rate": 1.359199502899631e-06, "loss": 0.0001, "num_input_tokens_seen": 61366952, "step": 91025 }, { "epoch": 2.2238780446094837, "grad_norm": 3.7186083793640137, "learning_rate": 1.3591199153530916e-06, "loss": 0.0011, "num_input_tokens_seen": 61371048, "step": 91030 }, { "epoch": 2.224000195441331, "grad_norm": 0.005052721593528986, "learning_rate": 1.3590403251949354e-06, "loss": 0.0367, "num_input_tokens_seen": 61373992, "step": 91035 }, { "epoch": 2.224122346273178, "grad_norm": 0.14445973932743073, "learning_rate": 1.3589607324257415e-06, "loss": 0.0014, "num_input_tokens_seen": 61377192, "step": 91040 }, { "epoch": 2.2242444971050253, "grad_norm": 89.84341430664062, "learning_rate": 1.3588811370460884e-06, "loss": 0.1273, "num_input_tokens_seen": 61380648, "step": 91045 }, { "epoch": 2.2243666479368724, "grad_norm": 7.445562839508057, "learning_rate": 1.3588015390565551e-06, "loss": 0.0011, "num_input_tokens_seen": 61384104, "step": 91050 }, { "epoch": 2.2244887987687196, "grad_norm": 0.06055779755115509, "learning_rate": 1.3587219384577207e-06, "loss": 0.035, "num_input_tokens_seen": 61387880, "step": 91055 }, { "epoch": 2.224610949600567, "grad_norm": 0.003716902807354927, "learning_rate": 1.3586423352501637e-06, "loss": 0.0001, "num_input_tokens_seen": 61390952, "step": 91060 }, { "epoch": 2.224733100432414, "grad_norm": 18.314638137817383, "learning_rate": 1.3585627294344635e-06, "loss": 0.0524, "num_input_tokens_seen": 61394152, "step": 91065 }, { "epoch": 2.224855251264261, "grad_norm": 0.034229401499032974, "learning_rate": 1.3584831210111985e-06, "loss": 0.0001, "num_input_tokens_seen": 61397416, "step": 91070 }, { "epoch": 2.2249774020961084, "grad_norm": 0.04336778447031975, "learning_rate": 1.3584035099809477e-06, "loss": 0.0377, "num_input_tokens_seen": 61400872, "step": 91075 }, { "epoch": 2.2250995529279556, "grad_norm": 0.009136239066720009, "learning_rate": 1.3583238963442904e-06, "loss": 0.0003, "num_input_tokens_seen": 61404584, "step": 91080 }, { "epoch": 2.2252217037598028, "grad_norm": 0.1326877623796463, "learning_rate": 1.3582442801018052e-06, "loss": 0.0001, "num_input_tokens_seen": 61407912, "step": 91085 }, { "epoch": 2.22534385459165, "grad_norm": 0.32096967101097107, "learning_rate": 1.3581646612540713e-06, "loss": 0.0009, "num_input_tokens_seen": 61412136, "step": 91090 }, { "epoch": 2.225466005423497, "grad_norm": 0.017594700679183006, "learning_rate": 1.3580850398016676e-06, "loss": 0.0003, "num_input_tokens_seen": 61415656, "step": 91095 }, { "epoch": 2.225588156255344, "grad_norm": 21.886730194091797, "learning_rate": 1.3580054157451732e-06, "loss": 0.0772, "num_input_tokens_seen": 61418856, "step": 91100 }, { "epoch": 2.225710307087191, "grad_norm": 29.80620765686035, "learning_rate": 1.3579257890851673e-06, "loss": 0.0622, "num_input_tokens_seen": 61422248, "step": 91105 }, { "epoch": 2.2258324579190383, "grad_norm": 0.08096987754106522, "learning_rate": 1.3578461598222286e-06, "loss": 0.0008, "num_input_tokens_seen": 61425320, "step": 91110 }, { "epoch": 2.2259546087508855, "grad_norm": 0.020485376939177513, "learning_rate": 1.357766527956936e-06, "loss": 0.0423, "num_input_tokens_seen": 61429032, "step": 91115 }, { "epoch": 2.2260767595827327, "grad_norm": 0.1648709774017334, "learning_rate": 1.3576868934898696e-06, "loss": 0.0304, "num_input_tokens_seen": 61432680, "step": 91120 }, { "epoch": 2.22619891041458, "grad_norm": 0.0049276407808065414, "learning_rate": 1.3576072564216077e-06, "loss": 0.0004, "num_input_tokens_seen": 61435944, "step": 91125 }, { "epoch": 2.226321061246427, "grad_norm": 0.36935797333717346, "learning_rate": 1.3575276167527297e-06, "loss": 0.0435, "num_input_tokens_seen": 61439272, "step": 91130 }, { "epoch": 2.2264432120782742, "grad_norm": 0.015589174814522266, "learning_rate": 1.3574479744838147e-06, "loss": 0.0525, "num_input_tokens_seen": 61442792, "step": 91135 }, { "epoch": 2.2265653629101214, "grad_norm": 41.798988342285156, "learning_rate": 1.357368329615442e-06, "loss": 0.2178, "num_input_tokens_seen": 61446120, "step": 91140 }, { "epoch": 2.2266875137419686, "grad_norm": 0.14397107064723969, "learning_rate": 1.3572886821481905e-06, "loss": 0.0001, "num_input_tokens_seen": 61449448, "step": 91145 }, { "epoch": 2.226809664573816, "grad_norm": 0.08464895933866501, "learning_rate": 1.3572090320826395e-06, "loss": 0.0003, "num_input_tokens_seen": 61452584, "step": 91150 }, { "epoch": 2.226931815405663, "grad_norm": 7.609369277954102, "learning_rate": 1.3571293794193684e-06, "loss": 0.0008, "num_input_tokens_seen": 61455720, "step": 91155 }, { "epoch": 2.22705396623751, "grad_norm": 6.147663593292236, "learning_rate": 1.3570497241589564e-06, "loss": 0.0008, "num_input_tokens_seen": 61459048, "step": 91160 }, { "epoch": 2.2271761170693574, "grad_norm": 0.4475948214530945, "learning_rate": 1.356970066301983e-06, "loss": 0.0007, "num_input_tokens_seen": 61462376, "step": 91165 }, { "epoch": 2.2272982679012046, "grad_norm": 250.3100128173828, "learning_rate": 1.3568904058490272e-06, "loss": 0.0588, "num_input_tokens_seen": 61466408, "step": 91170 }, { "epoch": 2.2274204187330517, "grad_norm": 0.02227071113884449, "learning_rate": 1.356810742800668e-06, "loss": 0.0003, "num_input_tokens_seen": 61470056, "step": 91175 }, { "epoch": 2.227542569564899, "grad_norm": 0.08421640843153, "learning_rate": 1.3567310771574853e-06, "loss": 0.0002, "num_input_tokens_seen": 61472872, "step": 91180 }, { "epoch": 2.2276647203967457, "grad_norm": 0.001982541289180517, "learning_rate": 1.3566514089200584e-06, "loss": 0.182, "num_input_tokens_seen": 61476136, "step": 91185 }, { "epoch": 2.227786871228593, "grad_norm": 0.01702370122075081, "learning_rate": 1.3565717380889664e-06, "loss": 0.0003, "num_input_tokens_seen": 61479848, "step": 91190 }, { "epoch": 2.22790902206044, "grad_norm": 0.023320624604821205, "learning_rate": 1.356492064664789e-06, "loss": 0.0004, "num_input_tokens_seen": 61483112, "step": 91195 }, { "epoch": 2.2280311728922872, "grad_norm": 17.018516540527344, "learning_rate": 1.3564123886481054e-06, "loss": 0.0375, "num_input_tokens_seen": 61486760, "step": 91200 }, { "epoch": 2.2281533237241344, "grad_norm": 0.004173085559159517, "learning_rate": 1.3563327100394947e-06, "loss": 0.0001, "num_input_tokens_seen": 61490152, "step": 91205 }, { "epoch": 2.2282754745559816, "grad_norm": 0.04542386531829834, "learning_rate": 1.356253028839537e-06, "loss": 0.0001, "num_input_tokens_seen": 61493352, "step": 91210 }, { "epoch": 2.228397625387829, "grad_norm": 0.009219934232532978, "learning_rate": 1.3561733450488113e-06, "loss": 0.0001, "num_input_tokens_seen": 61496744, "step": 91215 }, { "epoch": 2.228519776219676, "grad_norm": 56.5626335144043, "learning_rate": 1.3560936586678974e-06, "loss": 0.131, "num_input_tokens_seen": 61499752, "step": 91220 }, { "epoch": 2.228641927051523, "grad_norm": 0.10610431432723999, "learning_rate": 1.3560139696973747e-06, "loss": 0.0001, "num_input_tokens_seen": 61503080, "step": 91225 }, { "epoch": 2.2287640778833704, "grad_norm": 0.029894093051552773, "learning_rate": 1.3559342781378225e-06, "loss": 0.0002, "num_input_tokens_seen": 61506792, "step": 91230 }, { "epoch": 2.2288862287152176, "grad_norm": 0.03196987137198448, "learning_rate": 1.3558545839898206e-06, "loss": 0.0003, "num_input_tokens_seen": 61510312, "step": 91235 }, { "epoch": 2.2290083795470648, "grad_norm": 1.6127263307571411, "learning_rate": 1.3557748872539484e-06, "loss": 0.0984, "num_input_tokens_seen": 61513320, "step": 91240 }, { "epoch": 2.229130530378912, "grad_norm": 0.008849059231579304, "learning_rate": 1.3556951879307855e-06, "loss": 0.0001, "num_input_tokens_seen": 61516520, "step": 91245 }, { "epoch": 2.229252681210759, "grad_norm": 0.4685281813144684, "learning_rate": 1.3556154860209114e-06, "loss": 0.0013, "num_input_tokens_seen": 61519784, "step": 91250 }, { "epoch": 2.2293748320426063, "grad_norm": 0.024512961506843567, "learning_rate": 1.355535781524906e-06, "loss": 0.0002, "num_input_tokens_seen": 61522792, "step": 91255 }, { "epoch": 2.2294969828744535, "grad_norm": 37.255821228027344, "learning_rate": 1.3554560744433488e-06, "loss": 0.1264, "num_input_tokens_seen": 61526824, "step": 91260 }, { "epoch": 2.2296191337063007, "grad_norm": 397.60003662109375, "learning_rate": 1.3553763647768192e-06, "loss": 0.0239, "num_input_tokens_seen": 61529896, "step": 91265 }, { "epoch": 2.229741284538148, "grad_norm": 0.05338110774755478, "learning_rate": 1.355296652525897e-06, "loss": 0.0419, "num_input_tokens_seen": 61533288, "step": 91270 }, { "epoch": 2.229863435369995, "grad_norm": 0.0043336073867976665, "learning_rate": 1.3552169376911625e-06, "loss": 0.0001, "num_input_tokens_seen": 61536936, "step": 91275 }, { "epoch": 2.229985586201842, "grad_norm": 0.0046080537140369415, "learning_rate": 1.3551372202731945e-06, "loss": 0.0001, "num_input_tokens_seen": 61540456, "step": 91280 }, { "epoch": 2.230107737033689, "grad_norm": 0.011748994700610638, "learning_rate": 1.3550575002725732e-06, "loss": 0.0275, "num_input_tokens_seen": 61543592, "step": 91285 }, { "epoch": 2.230229887865536, "grad_norm": 1.6699762344360352, "learning_rate": 1.3549777776898786e-06, "loss": 0.0034, "num_input_tokens_seen": 61546792, "step": 91290 }, { "epoch": 2.2303520386973834, "grad_norm": 0.01124012004584074, "learning_rate": 1.3548980525256897e-06, "loss": 0.0002, "num_input_tokens_seen": 61550184, "step": 91295 }, { "epoch": 2.2304741895292306, "grad_norm": 0.0027650375850498676, "learning_rate": 1.3548183247805867e-06, "loss": 0.0004, "num_input_tokens_seen": 61553768, "step": 91300 }, { "epoch": 2.230596340361078, "grad_norm": 0.05154336616396904, "learning_rate": 1.3547385944551495e-06, "loss": 0.0225, "num_input_tokens_seen": 61556968, "step": 91305 }, { "epoch": 2.230718491192925, "grad_norm": 0.00040175282629206777, "learning_rate": 1.3546588615499576e-06, "loss": 0.0303, "num_input_tokens_seen": 61560424, "step": 91310 }, { "epoch": 2.230840642024772, "grad_norm": 0.08982338011264801, "learning_rate": 1.3545791260655915e-06, "loss": 0.0561, "num_input_tokens_seen": 61563368, "step": 91315 }, { "epoch": 2.2309627928566194, "grad_norm": 0.0134674571454525, "learning_rate": 1.3544993880026305e-06, "loss": 0.0004, "num_input_tokens_seen": 61566696, "step": 91320 }, { "epoch": 2.2310849436884665, "grad_norm": 0.01658649742603302, "learning_rate": 1.3544196473616544e-06, "loss": 0.0397, "num_input_tokens_seen": 61569896, "step": 91325 }, { "epoch": 2.2312070945203137, "grad_norm": 0.08493436127901077, "learning_rate": 1.3543399041432432e-06, "loss": 0.0002, "num_input_tokens_seen": 61572968, "step": 91330 }, { "epoch": 2.231329245352161, "grad_norm": 0.02740195393562317, "learning_rate": 1.3542601583479774e-06, "loss": 0.0932, "num_input_tokens_seen": 61576232, "step": 91335 }, { "epoch": 2.231451396184008, "grad_norm": 0.20961058139801025, "learning_rate": 1.3541804099764362e-06, "loss": 0.0369, "num_input_tokens_seen": 61579240, "step": 91340 }, { "epoch": 2.2315735470158553, "grad_norm": 0.08280511945486069, "learning_rate": 1.3541006590291998e-06, "loss": 0.0008, "num_input_tokens_seen": 61582696, "step": 91345 }, { "epoch": 2.2316956978477025, "grad_norm": 0.005895423702895641, "learning_rate": 1.3540209055068484e-06, "loss": 0.0001, "num_input_tokens_seen": 61586088, "step": 91350 }, { "epoch": 2.2318178486795497, "grad_norm": 0.02408262901008129, "learning_rate": 1.3539411494099614e-06, "loss": 0.0001, "num_input_tokens_seen": 61589224, "step": 91355 }, { "epoch": 2.231939999511397, "grad_norm": 0.005107124801725149, "learning_rate": 1.353861390739119e-06, "loss": 0.0002, "num_input_tokens_seen": 61592808, "step": 91360 }, { "epoch": 2.2320621503432436, "grad_norm": 0.010785761289298534, "learning_rate": 1.3537816294949017e-06, "loss": 0.0002, "num_input_tokens_seen": 61596008, "step": 91365 }, { "epoch": 2.232184301175091, "grad_norm": 47.25584411621094, "learning_rate": 1.353701865677889e-06, "loss": 0.2142, "num_input_tokens_seen": 61599272, "step": 91370 }, { "epoch": 2.232306452006938, "grad_norm": 0.009135080501437187, "learning_rate": 1.3536220992886615e-06, "loss": 0.0002, "num_input_tokens_seen": 61602920, "step": 91375 }, { "epoch": 2.232428602838785, "grad_norm": 0.0020226880442351103, "learning_rate": 1.3535423303277989e-06, "loss": 0.0007, "num_input_tokens_seen": 61605992, "step": 91380 }, { "epoch": 2.2325507536706324, "grad_norm": 32.131187438964844, "learning_rate": 1.3534625587958814e-06, "loss": 0.0978, "num_input_tokens_seen": 61609320, "step": 91385 }, { "epoch": 2.2326729045024796, "grad_norm": 0.05036914721131325, "learning_rate": 1.353382784693489e-06, "loss": 0.0008, "num_input_tokens_seen": 61612712, "step": 91390 }, { "epoch": 2.2327950553343268, "grad_norm": 0.2048310786485672, "learning_rate": 1.353303008021202e-06, "loss": 0.0578, "num_input_tokens_seen": 61616168, "step": 91395 }, { "epoch": 2.232917206166174, "grad_norm": 0.8726524114608765, "learning_rate": 1.3532232287796007e-06, "loss": 0.0456, "num_input_tokens_seen": 61619560, "step": 91400 }, { "epoch": 2.233039356998021, "grad_norm": 0.06254065781831741, "learning_rate": 1.353143446969265e-06, "loss": 0.0002, "num_input_tokens_seen": 61623144, "step": 91405 }, { "epoch": 2.2331615078298683, "grad_norm": 0.007272529415786266, "learning_rate": 1.3530636625907747e-06, "loss": 0.0538, "num_input_tokens_seen": 61626600, "step": 91410 }, { "epoch": 2.2332836586617155, "grad_norm": 0.24082855880260468, "learning_rate": 1.352983875644711e-06, "loss": 0.0005, "num_input_tokens_seen": 61629480, "step": 91415 }, { "epoch": 2.2334058094935627, "grad_norm": 0.12771330773830414, "learning_rate": 1.3529040861316535e-06, "loss": 0.0611, "num_input_tokens_seen": 61632872, "step": 91420 }, { "epoch": 2.23352796032541, "grad_norm": 3.524974822998047, "learning_rate": 1.3528242940521821e-06, "loss": 0.0003, "num_input_tokens_seen": 61636200, "step": 91425 }, { "epoch": 2.233650111157257, "grad_norm": 0.03709175065159798, "learning_rate": 1.352744499406878e-06, "loss": 0.0005, "num_input_tokens_seen": 61639272, "step": 91430 }, { "epoch": 2.2337722619891043, "grad_norm": 0.04541729390621185, "learning_rate": 1.352664702196321e-06, "loss": 0.0002, "num_input_tokens_seen": 61642536, "step": 91435 }, { "epoch": 2.2338944128209515, "grad_norm": 0.008953643962740898, "learning_rate": 1.3525849024210913e-06, "loss": 0.0003, "num_input_tokens_seen": 61645736, "step": 91440 }, { "epoch": 2.2340165636527987, "grad_norm": 23.101348876953125, "learning_rate": 1.3525051000817699e-06, "loss": 0.1131, "num_input_tokens_seen": 61649640, "step": 91445 }, { "epoch": 2.2341387144846454, "grad_norm": 10.648353576660156, "learning_rate": 1.352425295178936e-06, "loss": 0.0431, "num_input_tokens_seen": 61652904, "step": 91450 }, { "epoch": 2.234260865316493, "grad_norm": 0.06287828832864761, "learning_rate": 1.3523454877131703e-06, "loss": 0.0002, "num_input_tokens_seen": 61656296, "step": 91455 }, { "epoch": 2.23438301614834, "grad_norm": 0.0150414127856493, "learning_rate": 1.352265677685054e-06, "loss": 0.0075, "num_input_tokens_seen": 61659688, "step": 91460 }, { "epoch": 2.234505166980187, "grad_norm": 0.065169557929039, "learning_rate": 1.352185865095167e-06, "loss": 0.0005, "num_input_tokens_seen": 61662696, "step": 91465 }, { "epoch": 2.234627317812034, "grad_norm": 2.2550110816955566, "learning_rate": 1.3521060499440893e-06, "loss": 0.0008, "num_input_tokens_seen": 61666152, "step": 91470 }, { "epoch": 2.2347494686438814, "grad_norm": 0.026500001549720764, "learning_rate": 1.352026232232402e-06, "loss": 0.0003, "num_input_tokens_seen": 61669288, "step": 91475 }, { "epoch": 2.2348716194757285, "grad_norm": 0.2615768015384674, "learning_rate": 1.351946411960685e-06, "loss": 0.0586, "num_input_tokens_seen": 61672808, "step": 91480 }, { "epoch": 2.2349937703075757, "grad_norm": 12.543548583984375, "learning_rate": 1.351866589129519e-06, "loss": 0.0469, "num_input_tokens_seen": 61675752, "step": 91485 }, { "epoch": 2.235115921139423, "grad_norm": 0.040481340140104294, "learning_rate": 1.3517867637394846e-06, "loss": 0.0579, "num_input_tokens_seen": 61679144, "step": 91490 }, { "epoch": 2.23523807197127, "grad_norm": 0.01803850382566452, "learning_rate": 1.3517069357911626e-06, "loss": 0.0002, "num_input_tokens_seen": 61683304, "step": 91495 }, { "epoch": 2.2353602228031173, "grad_norm": 38.861106872558594, "learning_rate": 1.351627105285133e-06, "loss": 0.1406, "num_input_tokens_seen": 61686248, "step": 91500 }, { "epoch": 2.2354823736349645, "grad_norm": 0.0414762981235981, "learning_rate": 1.3515472722219763e-06, "loss": 0.0408, "num_input_tokens_seen": 61689960, "step": 91505 }, { "epoch": 2.2356045244668117, "grad_norm": 0.12040600925683975, "learning_rate": 1.3514674366022734e-06, "loss": 0.1092, "num_input_tokens_seen": 61693672, "step": 91510 }, { "epoch": 2.235726675298659, "grad_norm": 0.214358851313591, "learning_rate": 1.3513875984266045e-06, "loss": 0.0002, "num_input_tokens_seen": 61697256, "step": 91515 }, { "epoch": 2.235848826130506, "grad_norm": 0.328814834356308, "learning_rate": 1.3513077576955506e-06, "loss": 0.1026, "num_input_tokens_seen": 61700456, "step": 91520 }, { "epoch": 2.2359709769623533, "grad_norm": 0.03226625919342041, "learning_rate": 1.3512279144096924e-06, "loss": 0.0003, "num_input_tokens_seen": 61703464, "step": 91525 }, { "epoch": 2.2360931277942004, "grad_norm": 0.03546365350484848, "learning_rate": 1.3511480685696101e-06, "loss": 0.0644, "num_input_tokens_seen": 61706728, "step": 91530 }, { "epoch": 2.2362152786260476, "grad_norm": 0.2883140444755554, "learning_rate": 1.3510682201758847e-06, "loss": 0.0004, "num_input_tokens_seen": 61710248, "step": 91535 }, { "epoch": 2.236337429457895, "grad_norm": 0.028474008664488792, "learning_rate": 1.350988369229097e-06, "loss": 0.0503, "num_input_tokens_seen": 61713192, "step": 91540 }, { "epoch": 2.2364595802897416, "grad_norm": 0.12916983664035797, "learning_rate": 1.3509085157298272e-06, "loss": 0.0582, "num_input_tokens_seen": 61716136, "step": 91545 }, { "epoch": 2.2365817311215888, "grad_norm": 0.20014813542366028, "learning_rate": 1.3508286596786565e-06, "loss": 0.0005, "num_input_tokens_seen": 61719400, "step": 91550 }, { "epoch": 2.236703881953436, "grad_norm": 11.671493530273438, "learning_rate": 1.3507488010761651e-06, "loss": 0.1024, "num_input_tokens_seen": 61722472, "step": 91555 }, { "epoch": 2.236826032785283, "grad_norm": 0.2516132593154907, "learning_rate": 1.3506689399229342e-06, "loss": 0.0002, "num_input_tokens_seen": 61726248, "step": 91560 }, { "epoch": 2.2369481836171303, "grad_norm": 0.03510146215558052, "learning_rate": 1.3505890762195446e-06, "loss": 0.0502, "num_input_tokens_seen": 61729768, "step": 91565 }, { "epoch": 2.2370703344489775, "grad_norm": 0.05277731269598007, "learning_rate": 1.3505092099665771e-06, "loss": 0.0005, "num_input_tokens_seen": 61733032, "step": 91570 }, { "epoch": 2.2371924852808247, "grad_norm": 0.01292071770876646, "learning_rate": 1.3504293411646122e-06, "loss": 0.0007, "num_input_tokens_seen": 61736680, "step": 91575 }, { "epoch": 2.237314636112672, "grad_norm": 43.903099060058594, "learning_rate": 1.3503494698142305e-06, "loss": 0.0538, "num_input_tokens_seen": 61740584, "step": 91580 }, { "epoch": 2.237436786944519, "grad_norm": 0.020390598103404045, "learning_rate": 1.3502695959160136e-06, "loss": 0.0003, "num_input_tokens_seen": 61744040, "step": 91585 }, { "epoch": 2.2375589377763663, "grad_norm": 0.07818000018596649, "learning_rate": 1.350189719470542e-06, "loss": 0.029, "num_input_tokens_seen": 61747624, "step": 91590 }, { "epoch": 2.2376810886082135, "grad_norm": 0.07038668543100357, "learning_rate": 1.3501098404783963e-06, "loss": 0.0006, "num_input_tokens_seen": 61751016, "step": 91595 }, { "epoch": 2.2378032394400607, "grad_norm": 0.04207471385598183, "learning_rate": 1.3500299589401581e-06, "loss": 0.0001, "num_input_tokens_seen": 61754344, "step": 91600 }, { "epoch": 2.237925390271908, "grad_norm": 0.023803038522601128, "learning_rate": 1.3499500748564076e-06, "loss": 0.0421, "num_input_tokens_seen": 61757416, "step": 91605 }, { "epoch": 2.238047541103755, "grad_norm": 18.444482803344727, "learning_rate": 1.349870188227726e-06, "loss": 0.0563, "num_input_tokens_seen": 61760744, "step": 91610 }, { "epoch": 2.2381696919356022, "grad_norm": 0.013019073754549026, "learning_rate": 1.3497902990546942e-06, "loss": 0.0428, "num_input_tokens_seen": 61764008, "step": 91615 }, { "epoch": 2.2382918427674494, "grad_norm": 17.95228385925293, "learning_rate": 1.3497104073378936e-06, "loss": 0.0697, "num_input_tokens_seen": 61767528, "step": 91620 }, { "epoch": 2.2384139935992966, "grad_norm": 0.06719338148832321, "learning_rate": 1.3496305130779044e-06, "loss": 0.0008, "num_input_tokens_seen": 61770984, "step": 91625 }, { "epoch": 2.2385361444311433, "grad_norm": 0.1391330361366272, "learning_rate": 1.3495506162753085e-06, "loss": 0.0846, "num_input_tokens_seen": 61774568, "step": 91630 }, { "epoch": 2.2386582952629905, "grad_norm": 110.32960510253906, "learning_rate": 1.3494707169306866e-06, "loss": 0.0028, "num_input_tokens_seen": 61777576, "step": 91635 }, { "epoch": 2.2387804460948377, "grad_norm": 0.09409749507904053, "learning_rate": 1.349390815044619e-06, "loss": 0.0422, "num_input_tokens_seen": 61780904, "step": 91640 }, { "epoch": 2.238902596926685, "grad_norm": 0.8571845889091492, "learning_rate": 1.3493109106176879e-06, "loss": 0.1066, "num_input_tokens_seen": 61784104, "step": 91645 }, { "epoch": 2.239024747758532, "grad_norm": 0.03418490290641785, "learning_rate": 1.349231003650474e-06, "loss": 0.0829, "num_input_tokens_seen": 61787240, "step": 91650 }, { "epoch": 2.2391468985903793, "grad_norm": 0.5480493903160095, "learning_rate": 1.349151094143558e-06, "loss": 0.0011, "num_input_tokens_seen": 61790632, "step": 91655 }, { "epoch": 2.2392690494222265, "grad_norm": 0.00613664323464036, "learning_rate": 1.3490711820975217e-06, "loss": 0.0003, "num_input_tokens_seen": 61793640, "step": 91660 }, { "epoch": 2.2393912002540737, "grad_norm": 0.047891564667224884, "learning_rate": 1.3489912675129455e-06, "loss": 0.0489, "num_input_tokens_seen": 61797160, "step": 91665 }, { "epoch": 2.239513351085921, "grad_norm": 0.014169635251164436, "learning_rate": 1.348911350390411e-06, "loss": 0.0002, "num_input_tokens_seen": 61801064, "step": 91670 }, { "epoch": 2.239635501917768, "grad_norm": 0.06814318895339966, "learning_rate": 1.3488314307304994e-06, "loss": 0.0253, "num_input_tokens_seen": 61804264, "step": 91675 }, { "epoch": 2.2397576527496152, "grad_norm": 19.467390060424805, "learning_rate": 1.3487515085337917e-06, "loss": 0.0518, "num_input_tokens_seen": 61807336, "step": 91680 }, { "epoch": 2.2398798035814624, "grad_norm": 0.11614019423723221, "learning_rate": 1.3486715838008693e-06, "loss": 0.0004, "num_input_tokens_seen": 61810920, "step": 91685 }, { "epoch": 2.2400019544133096, "grad_norm": 0.04380062595009804, "learning_rate": 1.3485916565323135e-06, "loss": 0.0458, "num_input_tokens_seen": 61814184, "step": 91690 }, { "epoch": 2.240124105245157, "grad_norm": 0.2181423455476761, "learning_rate": 1.3485117267287053e-06, "loss": 0.0025, "num_input_tokens_seen": 61817576, "step": 91695 }, { "epoch": 2.240246256077004, "grad_norm": 0.1650727093219757, "learning_rate": 1.348431794390626e-06, "loss": 0.0373, "num_input_tokens_seen": 61820840, "step": 91700 }, { "epoch": 2.240368406908851, "grad_norm": 0.1175679862499237, "learning_rate": 1.3483518595186572e-06, "loss": 0.0001, "num_input_tokens_seen": 61824424, "step": 91705 }, { "epoch": 2.2404905577406984, "grad_norm": 0.04276014491915703, "learning_rate": 1.3482719221133799e-06, "loss": 0.0001, "num_input_tokens_seen": 61827624, "step": 91710 }, { "epoch": 2.2406127085725456, "grad_norm": 725.5232543945312, "learning_rate": 1.3481919821753754e-06, "loss": 0.0285, "num_input_tokens_seen": 61830760, "step": 91715 }, { "epoch": 2.2407348594043928, "grad_norm": 0.016906613484025, "learning_rate": 1.348112039705225e-06, "loss": 0.0002, "num_input_tokens_seen": 61834088, "step": 91720 }, { "epoch": 2.2408570102362395, "grad_norm": 0.0037014957051724195, "learning_rate": 1.3480320947035106e-06, "loss": 0.0002, "num_input_tokens_seen": 61837672, "step": 91725 }, { "epoch": 2.2409791610680867, "grad_norm": 0.13906598091125488, "learning_rate": 1.347952147170813e-06, "loss": 0.0367, "num_input_tokens_seen": 61840680, "step": 91730 }, { "epoch": 2.241101311899934, "grad_norm": 0.21275968849658966, "learning_rate": 1.3478721971077137e-06, "loss": 0.0004, "num_input_tokens_seen": 61843880, "step": 91735 }, { "epoch": 2.241223462731781, "grad_norm": 0.06388302892446518, "learning_rate": 1.3477922445147943e-06, "loss": 0.0173, "num_input_tokens_seen": 61847336, "step": 91740 }, { "epoch": 2.2413456135636283, "grad_norm": 0.003205791814252734, "learning_rate": 1.347712289392636e-06, "loss": 0.0384, "num_input_tokens_seen": 61850600, "step": 91745 }, { "epoch": 2.2414677643954755, "grad_norm": 0.15189264714717865, "learning_rate": 1.3476323317418208e-06, "loss": 0.0002, "num_input_tokens_seen": 61853864, "step": 91750 }, { "epoch": 2.2415899152273226, "grad_norm": 25.96412467956543, "learning_rate": 1.3475523715629296e-06, "loss": 0.1162, "num_input_tokens_seen": 61857064, "step": 91755 }, { "epoch": 2.24171206605917, "grad_norm": 0.05080273002386093, "learning_rate": 1.3474724088565442e-06, "loss": 0.0001, "num_input_tokens_seen": 61860328, "step": 91760 }, { "epoch": 2.241834216891017, "grad_norm": 0.10395577549934387, "learning_rate": 1.3473924436232456e-06, "loss": 0.0004, "num_input_tokens_seen": 61863464, "step": 91765 }, { "epoch": 2.241956367722864, "grad_norm": 0.040744926780462265, "learning_rate": 1.347312475863616e-06, "loss": 0.0389, "num_input_tokens_seen": 61866920, "step": 91770 }, { "epoch": 2.2420785185547114, "grad_norm": 75.73230743408203, "learning_rate": 1.3472325055782366e-06, "loss": 0.0905, "num_input_tokens_seen": 61870056, "step": 91775 }, { "epoch": 2.2422006693865586, "grad_norm": 3.3570287227630615, "learning_rate": 1.347152532767689e-06, "loss": 0.0055, "num_input_tokens_seen": 61873512, "step": 91780 }, { "epoch": 2.242322820218406, "grad_norm": 0.03612152487039566, "learning_rate": 1.347072557432555e-06, "loss": 0.0001, "num_input_tokens_seen": 61877736, "step": 91785 }, { "epoch": 2.242444971050253, "grad_norm": 34.28316116333008, "learning_rate": 1.3469925795734155e-06, "loss": 0.1082, "num_input_tokens_seen": 61880808, "step": 91790 }, { "epoch": 2.2425671218821, "grad_norm": 0.10308131575584412, "learning_rate": 1.346912599190853e-06, "loss": 0.0716, "num_input_tokens_seen": 61884648, "step": 91795 }, { "epoch": 2.2426892727139474, "grad_norm": 0.3189535439014435, "learning_rate": 1.346832616285449e-06, "loss": 0.0777, "num_input_tokens_seen": 61887784, "step": 91800 }, { "epoch": 2.2428114235457945, "grad_norm": 0.056606896221637726, "learning_rate": 1.3467526308577846e-06, "loss": 0.0003, "num_input_tokens_seen": 61891240, "step": 91805 }, { "epoch": 2.2429335743776413, "grad_norm": 0.0325242318212986, "learning_rate": 1.3466726429084418e-06, "loss": 0.0001, "num_input_tokens_seen": 61894568, "step": 91810 }, { "epoch": 2.2430557252094885, "grad_norm": 17.941125869750977, "learning_rate": 1.3465926524380024e-06, "loss": 0.1248, "num_input_tokens_seen": 61897896, "step": 91815 }, { "epoch": 2.2431778760413357, "grad_norm": 0.6458048224449158, "learning_rate": 1.3465126594470481e-06, "loss": 0.0005, "num_input_tokens_seen": 61901160, "step": 91820 }, { "epoch": 2.243300026873183, "grad_norm": 0.11034848541021347, "learning_rate": 1.3464326639361604e-06, "loss": 0.0002, "num_input_tokens_seen": 61904424, "step": 91825 }, { "epoch": 2.24342217770503, "grad_norm": 0.27809590101242065, "learning_rate": 1.346352665905921e-06, "loss": 0.0004, "num_input_tokens_seen": 61907752, "step": 91830 }, { "epoch": 2.2435443285368772, "grad_norm": 0.01850729063153267, "learning_rate": 1.3462726653569121e-06, "loss": 0.0002, "num_input_tokens_seen": 61910760, "step": 91835 }, { "epoch": 2.2436664793687244, "grad_norm": 0.6994618773460388, "learning_rate": 1.3461926622897153e-06, "loss": 0.0691, "num_input_tokens_seen": 61914216, "step": 91840 }, { "epoch": 2.2437886302005716, "grad_norm": 465.07183837890625, "learning_rate": 1.3461126567049123e-06, "loss": 0.1318, "num_input_tokens_seen": 61917416, "step": 91845 }, { "epoch": 2.243910781032419, "grad_norm": 0.026234636083245277, "learning_rate": 1.3460326486030849e-06, "loss": 0.1174, "num_input_tokens_seen": 61920680, "step": 91850 }, { "epoch": 2.244032931864266, "grad_norm": 0.005727563053369522, "learning_rate": 1.345952637984815e-06, "loss": 0.0346, "num_input_tokens_seen": 61923880, "step": 91855 }, { "epoch": 2.244155082696113, "grad_norm": 0.09239700436592102, "learning_rate": 1.3458726248506844e-06, "loss": 0.0004, "num_input_tokens_seen": 61926888, "step": 91860 }, { "epoch": 2.2442772335279604, "grad_norm": 3.4369235038757324, "learning_rate": 1.3457926092012752e-06, "loss": 0.0443, "num_input_tokens_seen": 61930344, "step": 91865 }, { "epoch": 2.2443993843598076, "grad_norm": 81.35676574707031, "learning_rate": 1.3457125910371692e-06, "loss": 0.0434, "num_input_tokens_seen": 61933736, "step": 91870 }, { "epoch": 2.2445215351916548, "grad_norm": 0.09962138533592224, "learning_rate": 1.345632570358948e-06, "loss": 0.0007, "num_input_tokens_seen": 61937064, "step": 91875 }, { "epoch": 2.244643686023502, "grad_norm": 0.06848546862602234, "learning_rate": 1.345552547167194e-06, "loss": 0.0007, "num_input_tokens_seen": 61940712, "step": 91880 }, { "epoch": 2.244765836855349, "grad_norm": 0.01280384510755539, "learning_rate": 1.345472521462489e-06, "loss": 0.0627, "num_input_tokens_seen": 61944040, "step": 91885 }, { "epoch": 2.2448879876871963, "grad_norm": 0.19952426850795746, "learning_rate": 1.3453924932454145e-06, "loss": 0.0617, "num_input_tokens_seen": 61947112, "step": 91890 }, { "epoch": 2.2450101385190435, "grad_norm": 0.06788770854473114, "learning_rate": 1.3453124625165533e-06, "loss": 0.0385, "num_input_tokens_seen": 61950376, "step": 91895 }, { "epoch": 2.2451322893508907, "grad_norm": 0.01820383220911026, "learning_rate": 1.3452324292764866e-06, "loss": 0.0001, "num_input_tokens_seen": 61953832, "step": 91900 }, { "epoch": 2.2452544401827375, "grad_norm": 0.05058905854821205, "learning_rate": 1.345152393525797e-06, "loss": 0.053, "num_input_tokens_seen": 61956840, "step": 91905 }, { "epoch": 2.2453765910145846, "grad_norm": 0.037442367523908615, "learning_rate": 1.3450723552650667e-06, "loss": 0.0002, "num_input_tokens_seen": 61960488, "step": 91910 }, { "epoch": 2.245498741846432, "grad_norm": 0.12428835779428482, "learning_rate": 1.3449923144948772e-06, "loss": 0.0002, "num_input_tokens_seen": 61963688, "step": 91915 }, { "epoch": 2.245620892678279, "grad_norm": 0.008461522869765759, "learning_rate": 1.3449122712158106e-06, "loss": 0.0002, "num_input_tokens_seen": 61967336, "step": 91920 }, { "epoch": 2.245743043510126, "grad_norm": 0.03980327025055885, "learning_rate": 1.3448322254284495e-06, "loss": 0.0567, "num_input_tokens_seen": 61970792, "step": 91925 }, { "epoch": 2.2458651943419734, "grad_norm": 70.60750579833984, "learning_rate": 1.3447521771333754e-06, "loss": 0.0668, "num_input_tokens_seen": 61974056, "step": 91930 }, { "epoch": 2.2459873451738206, "grad_norm": 0.00029991817427799106, "learning_rate": 1.344672126331171e-06, "loss": 0.001, "num_input_tokens_seen": 61977256, "step": 91935 }, { "epoch": 2.246109496005668, "grad_norm": 59.51887893676758, "learning_rate": 1.3445920730224177e-06, "loss": 0.0492, "num_input_tokens_seen": 61980456, "step": 91940 }, { "epoch": 2.246231646837515, "grad_norm": 74.84333801269531, "learning_rate": 1.3445120172076987e-06, "loss": 0.0787, "num_input_tokens_seen": 61983848, "step": 91945 }, { "epoch": 2.246353797669362, "grad_norm": 0.0036761623341590166, "learning_rate": 1.3444319588875955e-06, "loss": 0.0002, "num_input_tokens_seen": 61987112, "step": 91950 }, { "epoch": 2.2464759485012094, "grad_norm": 0.02540251985192299, "learning_rate": 1.3443518980626904e-06, "loss": 0.0004, "num_input_tokens_seen": 61990568, "step": 91955 }, { "epoch": 2.2465980993330565, "grad_norm": 0.748134195804596, "learning_rate": 1.3442718347335658e-06, "loss": 0.0004, "num_input_tokens_seen": 61994152, "step": 91960 }, { "epoch": 2.2467202501649037, "grad_norm": 0.026896782219409943, "learning_rate": 1.3441917689008038e-06, "loss": 0.039, "num_input_tokens_seen": 61997736, "step": 91965 }, { "epoch": 2.246842400996751, "grad_norm": 0.016510486602783203, "learning_rate": 1.3441117005649867e-06, "loss": 0.0001, "num_input_tokens_seen": 62001640, "step": 91970 }, { "epoch": 2.246964551828598, "grad_norm": 0.005256003234535456, "learning_rate": 1.3440316297266967e-06, "loss": 0.0002, "num_input_tokens_seen": 62004776, "step": 91975 }, { "epoch": 2.2470867026604453, "grad_norm": 0.004536962136626244, "learning_rate": 1.343951556386516e-06, "loss": 0.0322, "num_input_tokens_seen": 62008232, "step": 91980 }, { "epoch": 2.2472088534922925, "grad_norm": 0.2646792531013489, "learning_rate": 1.343871480545027e-06, "loss": 0.0467, "num_input_tokens_seen": 62011496, "step": 91985 }, { "epoch": 2.2473310043241392, "grad_norm": 0.029792839661240578, "learning_rate": 1.3437914022028122e-06, "loss": 0.0005, "num_input_tokens_seen": 62014888, "step": 91990 }, { "epoch": 2.2474531551559864, "grad_norm": 0.21611711382865906, "learning_rate": 1.343711321360454e-06, "loss": 0.0544, "num_input_tokens_seen": 62018280, "step": 91995 }, { "epoch": 2.2475753059878336, "grad_norm": 0.004224136937409639, "learning_rate": 1.3436312380185345e-06, "loss": 0.0003, "num_input_tokens_seen": 62021736, "step": 92000 }, { "epoch": 2.247697456819681, "grad_norm": 1.352657437324524, "learning_rate": 1.3435511521776363e-06, "loss": 0.0888, "num_input_tokens_seen": 62025192, "step": 92005 }, { "epoch": 2.247819607651528, "grad_norm": 0.04550763592123985, "learning_rate": 1.343471063838342e-06, "loss": 0.0008, "num_input_tokens_seen": 62028904, "step": 92010 }, { "epoch": 2.247941758483375, "grad_norm": 28.128278732299805, "learning_rate": 1.3433909730012334e-06, "loss": 0.0931, "num_input_tokens_seen": 62032552, "step": 92015 }, { "epoch": 2.2480639093152224, "grad_norm": 49.448265075683594, "learning_rate": 1.3433108796668933e-06, "loss": 0.0638, "num_input_tokens_seen": 62035944, "step": 92020 }, { "epoch": 2.2481860601470696, "grad_norm": 0.18086501955986023, "learning_rate": 1.3432307838359043e-06, "loss": 0.0744, "num_input_tokens_seen": 62039336, "step": 92025 }, { "epoch": 2.2483082109789168, "grad_norm": 295.6267395019531, "learning_rate": 1.3431506855088483e-06, "loss": 0.1497, "num_input_tokens_seen": 62042408, "step": 92030 }, { "epoch": 2.248430361810764, "grad_norm": 28.082304000854492, "learning_rate": 1.3430705846863086e-06, "loss": 0.0363, "num_input_tokens_seen": 62045992, "step": 92035 }, { "epoch": 2.248552512642611, "grad_norm": 0.6816192865371704, "learning_rate": 1.3429904813688674e-06, "loss": 0.0425, "num_input_tokens_seen": 62049576, "step": 92040 }, { "epoch": 2.2486746634744583, "grad_norm": 83.5420913696289, "learning_rate": 1.3429103755571066e-06, "loss": 0.08, "num_input_tokens_seen": 62052840, "step": 92045 }, { "epoch": 2.2487968143063055, "grad_norm": 0.04537109658122063, "learning_rate": 1.34283026725161e-06, "loss": 0.0002, "num_input_tokens_seen": 62055912, "step": 92050 }, { "epoch": 2.2489189651381527, "grad_norm": 0.021133245900273323, "learning_rate": 1.342750156452959e-06, "loss": 0.0343, "num_input_tokens_seen": 62059304, "step": 92055 }, { "epoch": 2.24904111597, "grad_norm": 0.06331618875265121, "learning_rate": 1.342670043161737e-06, "loss": 0.0355, "num_input_tokens_seen": 62064808, "step": 92060 }, { "epoch": 2.249163266801847, "grad_norm": 0.020840996876358986, "learning_rate": 1.3425899273785262e-06, "loss": 0.0001, "num_input_tokens_seen": 62068136, "step": 92065 }, { "epoch": 2.2492854176336943, "grad_norm": 0.39848390221595764, "learning_rate": 1.3425098091039095e-06, "loss": 0.0363, "num_input_tokens_seen": 62070888, "step": 92070 }, { "epoch": 2.249407568465541, "grad_norm": 0.003665744327008724, "learning_rate": 1.3424296883384688e-06, "loss": 0.0171, "num_input_tokens_seen": 62073832, "step": 92075 }, { "epoch": 2.249529719297388, "grad_norm": 0.005300711374729872, "learning_rate": 1.3423495650827877e-06, "loss": 0.0, "num_input_tokens_seen": 62077288, "step": 92080 }, { "epoch": 2.2496518701292354, "grad_norm": 0.0019977991469204426, "learning_rate": 1.3422694393374484e-06, "loss": 0.0009, "num_input_tokens_seen": 62080360, "step": 92085 }, { "epoch": 2.2497740209610826, "grad_norm": 0.1273031085729599, "learning_rate": 1.3421893111030338e-06, "loss": 0.0001, "num_input_tokens_seen": 62083496, "step": 92090 }, { "epoch": 2.2498961717929298, "grad_norm": 0.059631090611219406, "learning_rate": 1.3421091803801262e-06, "loss": 0.1333, "num_input_tokens_seen": 62086760, "step": 92095 }, { "epoch": 2.250018322624777, "grad_norm": 0.0072194174863398075, "learning_rate": 1.342029047169309e-06, "loss": 0.0001, "num_input_tokens_seen": 62090024, "step": 92100 }, { "epoch": 2.250140473456624, "grad_norm": 0.014334439300000668, "learning_rate": 1.341948911471164e-06, "loss": 0.0001, "num_input_tokens_seen": 62093160, "step": 92105 }, { "epoch": 2.2501649036229936, "eval_loss": 0.2040521502494812, "eval_runtime": 47.5159, "eval_samples_per_second": 765.743, "eval_steps_per_second": 95.736, "num_input_tokens_seen": 62093992, "step": 92106 }, { "epoch": 2.2502626242884713, "grad_norm": 0.042339157313108444, "learning_rate": 1.341868773286275e-06, "loss": 0.0001, "num_input_tokens_seen": 62097000, "step": 92110 }, { "epoch": 2.2503847751203185, "grad_norm": 0.02461964450776577, "learning_rate": 1.3417886326152247e-06, "loss": 0.0002, "num_input_tokens_seen": 62100072, "step": 92115 }, { "epoch": 2.2505069259521657, "grad_norm": 0.015833193436264992, "learning_rate": 1.3417084894585948e-06, "loss": 0.043, "num_input_tokens_seen": 62103272, "step": 92120 }, { "epoch": 2.250629076784013, "grad_norm": 0.13749870657920837, "learning_rate": 1.341628343816969e-06, "loss": 0.0002, "num_input_tokens_seen": 62106344, "step": 92125 }, { "epoch": 2.25075122761586, "grad_norm": 0.00958644412457943, "learning_rate": 1.3415481956909305e-06, "loss": 0.0598, "num_input_tokens_seen": 62109480, "step": 92130 }, { "epoch": 2.2508733784477073, "grad_norm": 0.1764431893825531, "learning_rate": 1.341468045081061e-06, "loss": 0.1393, "num_input_tokens_seen": 62113064, "step": 92135 }, { "epoch": 2.2509955292795545, "grad_norm": 40.462764739990234, "learning_rate": 1.3413878919879443e-06, "loss": 0.1327, "num_input_tokens_seen": 62116776, "step": 92140 }, { "epoch": 2.2511176801114017, "grad_norm": 0.11698302626609802, "learning_rate": 1.341307736412163e-06, "loss": 0.0002, "num_input_tokens_seen": 62120168, "step": 92145 }, { "epoch": 2.251239830943249, "grad_norm": 0.05886290222406387, "learning_rate": 1.3412275783543002e-06, "loss": 0.0004, "num_input_tokens_seen": 62123304, "step": 92150 }, { "epoch": 2.251361981775096, "grad_norm": 0.08000266551971436, "learning_rate": 1.3411474178149384e-06, "loss": 0.0531, "num_input_tokens_seen": 62126824, "step": 92155 }, { "epoch": 2.2514841326069432, "grad_norm": 0.04475337266921997, "learning_rate": 1.341067254794661e-06, "loss": 0.0002, "num_input_tokens_seen": 62130664, "step": 92160 }, { "epoch": 2.2516062834387904, "grad_norm": 0.02364620752632618, "learning_rate": 1.340987089294051e-06, "loss": 0.0002, "num_input_tokens_seen": 62134184, "step": 92165 }, { "epoch": 2.251728434270637, "grad_norm": 0.017477652058005333, "learning_rate": 1.3409069213136908e-06, "loss": 0.0002, "num_input_tokens_seen": 62137640, "step": 92170 }, { "epoch": 2.2518505851024844, "grad_norm": 0.1361808478832245, "learning_rate": 1.3408267508541639e-06, "loss": 0.0003, "num_input_tokens_seen": 62140904, "step": 92175 }, { "epoch": 2.2519727359343316, "grad_norm": 892.4404296875, "learning_rate": 1.3407465779160532e-06, "loss": 0.0098, "num_input_tokens_seen": 62144424, "step": 92180 }, { "epoch": 2.2520948867661787, "grad_norm": 11.566713333129883, "learning_rate": 1.3406664024999417e-06, "loss": 0.0006, "num_input_tokens_seen": 62147432, "step": 92185 }, { "epoch": 2.252217037598026, "grad_norm": 0.12640665471553802, "learning_rate": 1.3405862246064126e-06, "loss": 0.038, "num_input_tokens_seen": 62150312, "step": 92190 }, { "epoch": 2.252339188429873, "grad_norm": 350.6678771972656, "learning_rate": 1.3405060442360488e-06, "loss": 0.1214, "num_input_tokens_seen": 62153768, "step": 92195 }, { "epoch": 2.2524613392617203, "grad_norm": 0.06017898768186569, "learning_rate": 1.3404258613894332e-06, "loss": 0.0414, "num_input_tokens_seen": 62157160, "step": 92200 }, { "epoch": 2.2525834900935675, "grad_norm": 0.04330041632056236, "learning_rate": 1.3403456760671494e-06, "loss": 0.0925, "num_input_tokens_seen": 62160744, "step": 92205 }, { "epoch": 2.2527056409254147, "grad_norm": 0.09254544973373413, "learning_rate": 1.3402654882697803e-06, "loss": 0.0628, "num_input_tokens_seen": 62163816, "step": 92210 }, { "epoch": 2.252827791757262, "grad_norm": 0.040174610912799835, "learning_rate": 1.3401852979979094e-06, "loss": 0.0385, "num_input_tokens_seen": 62167144, "step": 92215 }, { "epoch": 2.252949942589109, "grad_norm": 0.08553251624107361, "learning_rate": 1.340105105252119e-06, "loss": 0.0001, "num_input_tokens_seen": 62170792, "step": 92220 }, { "epoch": 2.2530720934209563, "grad_norm": 0.037372469902038574, "learning_rate": 1.3400249100329932e-06, "loss": 0.0983, "num_input_tokens_seen": 62174248, "step": 92225 }, { "epoch": 2.2531942442528035, "grad_norm": 0.02428109012544155, "learning_rate": 1.3399447123411146e-06, "loss": 0.0312, "num_input_tokens_seen": 62177384, "step": 92230 }, { "epoch": 2.2533163950846506, "grad_norm": 0.20832835137844086, "learning_rate": 1.3398645121770664e-06, "loss": 0.0005, "num_input_tokens_seen": 62180712, "step": 92235 }, { "epoch": 2.253438545916498, "grad_norm": 0.023863671347498894, "learning_rate": 1.3397843095414324e-06, "loss": 0.0003, "num_input_tokens_seen": 62184104, "step": 92240 }, { "epoch": 2.253560696748345, "grad_norm": 731.2637939453125, "learning_rate": 1.3397041044347953e-06, "loss": 0.0966, "num_input_tokens_seen": 62187752, "step": 92245 }, { "epoch": 2.253682847580192, "grad_norm": 188.05191040039062, "learning_rate": 1.3396238968577387e-06, "loss": 0.0053, "num_input_tokens_seen": 62191272, "step": 92250 }, { "epoch": 2.253804998412039, "grad_norm": 0.0365290492773056, "learning_rate": 1.339543686810846e-06, "loss": 0.0002, "num_input_tokens_seen": 62194536, "step": 92255 }, { "epoch": 2.2539271492438866, "grad_norm": 0.060201194137334824, "learning_rate": 1.3394634742946998e-06, "loss": 0.0527, "num_input_tokens_seen": 62197608, "step": 92260 }, { "epoch": 2.2540493000757333, "grad_norm": 0.01749134622514248, "learning_rate": 1.339383259309884e-06, "loss": 0.0004, "num_input_tokens_seen": 62200744, "step": 92265 }, { "epoch": 2.2541714509075805, "grad_norm": 46.34138107299805, "learning_rate": 1.3393030418569821e-06, "loss": 0.0025, "num_input_tokens_seen": 62204200, "step": 92270 }, { "epoch": 2.2542936017394277, "grad_norm": 0.11681878566741943, "learning_rate": 1.3392228219365772e-06, "loss": 0.1194, "num_input_tokens_seen": 62207656, "step": 92275 }, { "epoch": 2.254415752571275, "grad_norm": 0.004872969351708889, "learning_rate": 1.3391425995492524e-06, "loss": 0.0002, "num_input_tokens_seen": 62211432, "step": 92280 }, { "epoch": 2.254537903403122, "grad_norm": 0.02966301515698433, "learning_rate": 1.3390623746955918e-06, "loss": 0.0002, "num_input_tokens_seen": 62214952, "step": 92285 }, { "epoch": 2.2546600542349693, "grad_norm": 63.925384521484375, "learning_rate": 1.3389821473761783e-06, "loss": 0.0381, "num_input_tokens_seen": 62218344, "step": 92290 }, { "epoch": 2.2547822050668165, "grad_norm": 0.01595384255051613, "learning_rate": 1.338901917591595e-06, "loss": 0.0961, "num_input_tokens_seen": 62221992, "step": 92295 }, { "epoch": 2.2549043558986637, "grad_norm": 0.4929356276988983, "learning_rate": 1.3388216853424262e-06, "loss": 0.0986, "num_input_tokens_seen": 62225320, "step": 92300 }, { "epoch": 2.255026506730511, "grad_norm": 0.05259265378117561, "learning_rate": 1.3387414506292548e-06, "loss": 0.0002, "num_input_tokens_seen": 62228264, "step": 92305 }, { "epoch": 2.255148657562358, "grad_norm": 42.33659362792969, "learning_rate": 1.3386612134526648e-06, "loss": 0.0362, "num_input_tokens_seen": 62231528, "step": 92310 }, { "epoch": 2.2552708083942052, "grad_norm": 0.002288345480337739, "learning_rate": 1.3385809738132392e-06, "loss": 0.0891, "num_input_tokens_seen": 62235048, "step": 92315 }, { "epoch": 2.2553929592260524, "grad_norm": 0.0041145095601677895, "learning_rate": 1.3385007317115614e-06, "loss": 0.0002, "num_input_tokens_seen": 62238120, "step": 92320 }, { "epoch": 2.2555151100578996, "grad_norm": 0.019636070355772972, "learning_rate": 1.3384204871482156e-06, "loss": 0.0315, "num_input_tokens_seen": 62241576, "step": 92325 }, { "epoch": 2.255637260889747, "grad_norm": 0.023572169244289398, "learning_rate": 1.3383402401237848e-06, "loss": 0.0003, "num_input_tokens_seen": 62244840, "step": 92330 }, { "epoch": 2.255759411721594, "grad_norm": 0.05980467051267624, "learning_rate": 1.3382599906388529e-06, "loss": 0.0001, "num_input_tokens_seen": 62247592, "step": 92335 }, { "epoch": 2.2558815625534407, "grad_norm": 0.0034921050537377596, "learning_rate": 1.3381797386940035e-06, "loss": 0.0001, "num_input_tokens_seen": 62251112, "step": 92340 }, { "epoch": 2.2560037133852884, "grad_norm": 0.009633969515562057, "learning_rate": 1.33809948428982e-06, "loss": 0.0969, "num_input_tokens_seen": 62254184, "step": 92345 }, { "epoch": 2.256125864217135, "grad_norm": 0.24233314394950867, "learning_rate": 1.3380192274268859e-06, "loss": 0.0001, "num_input_tokens_seen": 62258216, "step": 92350 }, { "epoch": 2.2562480150489823, "grad_norm": 0.006902449764311314, "learning_rate": 1.337938968105785e-06, "loss": 0.0489, "num_input_tokens_seen": 62261736, "step": 92355 }, { "epoch": 2.2563701658808295, "grad_norm": 21.458951950073242, "learning_rate": 1.337858706327101e-06, "loss": 0.0563, "num_input_tokens_seen": 62265064, "step": 92360 }, { "epoch": 2.2564923167126767, "grad_norm": 0.04538079351186752, "learning_rate": 1.337778442091418e-06, "loss": 0.0648, "num_input_tokens_seen": 62268072, "step": 92365 }, { "epoch": 2.256614467544524, "grad_norm": 0.004993412643671036, "learning_rate": 1.337698175399319e-06, "loss": 0.0, "num_input_tokens_seen": 62271592, "step": 92370 }, { "epoch": 2.256736618376371, "grad_norm": 1.0822269916534424, "learning_rate": 1.3376179062513884e-06, "loss": 0.0006, "num_input_tokens_seen": 62274600, "step": 92375 }, { "epoch": 2.2568587692082183, "grad_norm": 0.021430518478155136, "learning_rate": 1.3375376346482094e-06, "loss": 0.0789, "num_input_tokens_seen": 62277992, "step": 92380 }, { "epoch": 2.2569809200400655, "grad_norm": 0.038618966937065125, "learning_rate": 1.337457360590366e-06, "loss": 0.0431, "num_input_tokens_seen": 62281896, "step": 92385 }, { "epoch": 2.2571030708719126, "grad_norm": 0.08098925650119781, "learning_rate": 1.3373770840784416e-06, "loss": 0.0203, "num_input_tokens_seen": 62285544, "step": 92390 }, { "epoch": 2.25722522170376, "grad_norm": 0.04787323251366615, "learning_rate": 1.3372968051130205e-06, "loss": 0.1419, "num_input_tokens_seen": 62289192, "step": 92395 }, { "epoch": 2.257347372535607, "grad_norm": 36.374874114990234, "learning_rate": 1.3372165236946864e-06, "loss": 0.0342, "num_input_tokens_seen": 62292392, "step": 92400 }, { "epoch": 2.257469523367454, "grad_norm": 0.0471741184592247, "learning_rate": 1.3371362398240228e-06, "loss": 0.0007, "num_input_tokens_seen": 62295272, "step": 92405 }, { "epoch": 2.2575916741993014, "grad_norm": 0.07099335640668869, "learning_rate": 1.3370559535016138e-06, "loss": 0.0346, "num_input_tokens_seen": 62298920, "step": 92410 }, { "epoch": 2.2577138250311486, "grad_norm": 0.0030002526473253965, "learning_rate": 1.3369756647280436e-06, "loss": 0.1336, "num_input_tokens_seen": 62302120, "step": 92415 }, { "epoch": 2.257835975862996, "grad_norm": 0.037754494696855545, "learning_rate": 1.3368953735038951e-06, "loss": 0.0003, "num_input_tokens_seen": 62305192, "step": 92420 }, { "epoch": 2.257958126694843, "grad_norm": 0.003917124588042498, "learning_rate": 1.3368150798297535e-06, "loss": 0.0837, "num_input_tokens_seen": 62308776, "step": 92425 }, { "epoch": 2.25808027752669, "grad_norm": 0.027190549299120903, "learning_rate": 1.336734783706202e-06, "loss": 0.0004, "num_input_tokens_seen": 62311848, "step": 92430 }, { "epoch": 2.258202428358537, "grad_norm": 0.0012887478806078434, "learning_rate": 1.3366544851338244e-06, "loss": 0.0527, "num_input_tokens_seen": 62315944, "step": 92435 }, { "epoch": 2.258324579190384, "grad_norm": 0.010424080304801464, "learning_rate": 1.3365741841132047e-06, "loss": 0.0002, "num_input_tokens_seen": 62319656, "step": 92440 }, { "epoch": 2.2584467300222313, "grad_norm": 0.14947333931922913, "learning_rate": 1.3364938806449271e-06, "loss": 0.1209, "num_input_tokens_seen": 62323368, "step": 92445 }, { "epoch": 2.2585688808540785, "grad_norm": 0.005441546440124512, "learning_rate": 1.3364135747295752e-06, "loss": 0.0005, "num_input_tokens_seen": 62326632, "step": 92450 }, { "epoch": 2.2586910316859257, "grad_norm": 0.0159373190253973, "learning_rate": 1.3363332663677338e-06, "loss": 0.0502, "num_input_tokens_seen": 62330664, "step": 92455 }, { "epoch": 2.258813182517773, "grad_norm": 0.2237975001335144, "learning_rate": 1.3362529555599861e-06, "loss": 0.0453, "num_input_tokens_seen": 62334248, "step": 92460 }, { "epoch": 2.25893533334962, "grad_norm": 0.01939545013010502, "learning_rate": 1.3361726423069165e-06, "loss": 0.0003, "num_input_tokens_seen": 62337768, "step": 92465 }, { "epoch": 2.2590574841814672, "grad_norm": 158.32150268554688, "learning_rate": 1.3360923266091093e-06, "loss": 0.0844, "num_input_tokens_seen": 62341096, "step": 92470 }, { "epoch": 2.2591796350133144, "grad_norm": 0.02396237477660179, "learning_rate": 1.3360120084671481e-06, "loss": 0.0006, "num_input_tokens_seen": 62344616, "step": 92475 }, { "epoch": 2.2593017858451616, "grad_norm": 0.05617120489478111, "learning_rate": 1.3359316878816174e-06, "loss": 0.001, "num_input_tokens_seen": 62347944, "step": 92480 }, { "epoch": 2.259423936677009, "grad_norm": 0.031269315630197525, "learning_rate": 1.3358513648531008e-06, "loss": 0.0912, "num_input_tokens_seen": 62351144, "step": 92485 }, { "epoch": 2.259546087508856, "grad_norm": 0.038197267800569534, "learning_rate": 1.3357710393821828e-06, "loss": 0.0002, "num_input_tokens_seen": 62354216, "step": 92490 }, { "epoch": 2.259668238340703, "grad_norm": 0.006710516754537821, "learning_rate": 1.3356907114694477e-06, "loss": 0.1095, "num_input_tokens_seen": 62357416, "step": 92495 }, { "epoch": 2.2597903891725504, "grad_norm": 13.797673225402832, "learning_rate": 1.3356103811154792e-06, "loss": 0.1062, "num_input_tokens_seen": 62361448, "step": 92500 }, { "epoch": 2.2599125400043976, "grad_norm": 0.003856425639241934, "learning_rate": 1.3355300483208621e-06, "loss": 0.0005, "num_input_tokens_seen": 62365032, "step": 92505 }, { "epoch": 2.2600346908362448, "grad_norm": 0.1337459236383438, "learning_rate": 1.3354497130861796e-06, "loss": 0.0007, "num_input_tokens_seen": 62368168, "step": 92510 }, { "epoch": 2.260156841668092, "grad_norm": 0.06924041360616684, "learning_rate": 1.335369375412017e-06, "loss": 0.0373, "num_input_tokens_seen": 62371368, "step": 92515 }, { "epoch": 2.2602789924999387, "grad_norm": 0.06534288823604584, "learning_rate": 1.335289035298958e-06, "loss": 0.0004, "num_input_tokens_seen": 62374760, "step": 92520 }, { "epoch": 2.2604011433317863, "grad_norm": 8.957152366638184, "learning_rate": 1.3352086927475872e-06, "loss": 0.1364, "num_input_tokens_seen": 62377704, "step": 92525 }, { "epoch": 2.260523294163633, "grad_norm": 0.004728742875158787, "learning_rate": 1.3351283477584883e-06, "loss": 0.0001, "num_input_tokens_seen": 62380904, "step": 92530 }, { "epoch": 2.2606454449954803, "grad_norm": 0.2021309733390808, "learning_rate": 1.3350480003322463e-06, "loss": 0.0002, "num_input_tokens_seen": 62384424, "step": 92535 }, { "epoch": 2.2607675958273274, "grad_norm": 0.004681295249611139, "learning_rate": 1.334967650469445e-06, "loss": 0.0008, "num_input_tokens_seen": 62387688, "step": 92540 }, { "epoch": 2.2608897466591746, "grad_norm": 0.2683848440647125, "learning_rate": 1.3348872981706685e-06, "loss": 0.0004, "num_input_tokens_seen": 62391272, "step": 92545 }, { "epoch": 2.261011897491022, "grad_norm": 0.4058259427547455, "learning_rate": 1.3348069434365017e-06, "loss": 0.0563, "num_input_tokens_seen": 62394536, "step": 92550 }, { "epoch": 2.261134048322869, "grad_norm": 0.12689943611621857, "learning_rate": 1.3347265862675288e-06, "loss": 0.0943, "num_input_tokens_seen": 62397864, "step": 92555 }, { "epoch": 2.261256199154716, "grad_norm": 0.039587125182151794, "learning_rate": 1.3346462266643342e-06, "loss": 0.0003, "num_input_tokens_seen": 62401256, "step": 92560 }, { "epoch": 2.2613783499865634, "grad_norm": 0.006709498818963766, "learning_rate": 1.334565864627502e-06, "loss": 0.0395, "num_input_tokens_seen": 62404648, "step": 92565 }, { "epoch": 2.2615005008184106, "grad_norm": 0.02049305848777294, "learning_rate": 1.334485500157617e-06, "loss": 0.0004, "num_input_tokens_seen": 62407976, "step": 92570 }, { "epoch": 2.2616226516502578, "grad_norm": 0.08842063695192337, "learning_rate": 1.3344051332552635e-06, "loss": 0.0003, "num_input_tokens_seen": 62411240, "step": 92575 }, { "epoch": 2.261744802482105, "grad_norm": 0.0080694779753685, "learning_rate": 1.334324763921026e-06, "loss": 0.0004, "num_input_tokens_seen": 62414952, "step": 92580 }, { "epoch": 2.261866953313952, "grad_norm": 23.28043556213379, "learning_rate": 1.334244392155489e-06, "loss": 0.0822, "num_input_tokens_seen": 62417960, "step": 92585 }, { "epoch": 2.2619891041457993, "grad_norm": 0.1557777374982834, "learning_rate": 1.3341640179592363e-06, "loss": 0.004, "num_input_tokens_seen": 62422120, "step": 92590 }, { "epoch": 2.2621112549776465, "grad_norm": 11.396211624145508, "learning_rate": 1.3340836413328536e-06, "loss": 0.1738, "num_input_tokens_seen": 62425384, "step": 92595 }, { "epoch": 2.2622334058094937, "grad_norm": 0.22009818255901337, "learning_rate": 1.3340032622769245e-06, "loss": 0.0003, "num_input_tokens_seen": 62428584, "step": 92600 }, { "epoch": 2.262355556641341, "grad_norm": 0.5843344330787659, "learning_rate": 1.3339228807920337e-06, "loss": 0.0002, "num_input_tokens_seen": 62432296, "step": 92605 }, { "epoch": 2.262477707473188, "grad_norm": 0.17534910142421722, "learning_rate": 1.333842496878766e-06, "loss": 0.0002, "num_input_tokens_seen": 62435880, "step": 92610 }, { "epoch": 2.262599858305035, "grad_norm": 10.780598640441895, "learning_rate": 1.333762110537706e-06, "loss": 0.0238, "num_input_tokens_seen": 62439272, "step": 92615 }, { "epoch": 2.262722009136882, "grad_norm": 0.1346311718225479, "learning_rate": 1.3336817217694383e-06, "loss": 0.0002, "num_input_tokens_seen": 62442664, "step": 92620 }, { "epoch": 2.2628441599687292, "grad_norm": 571.826416015625, "learning_rate": 1.333601330574547e-06, "loss": 0.0877, "num_input_tokens_seen": 62446120, "step": 92625 }, { "epoch": 2.2629663108005764, "grad_norm": 0.005242965184152126, "learning_rate": 1.3335209369536174e-06, "loss": 0.0961, "num_input_tokens_seen": 62449640, "step": 92630 }, { "epoch": 2.2630884616324236, "grad_norm": 0.14449675381183624, "learning_rate": 1.3334405409072336e-06, "loss": 0.0885, "num_input_tokens_seen": 62453160, "step": 92635 }, { "epoch": 2.263210612464271, "grad_norm": 32.600486755371094, "learning_rate": 1.3333601424359806e-06, "loss": 0.046, "num_input_tokens_seen": 62456552, "step": 92640 }, { "epoch": 2.263332763296118, "grad_norm": 0.040783826261758804, "learning_rate": 1.3332797415404431e-06, "loss": 0.0005, "num_input_tokens_seen": 62460008, "step": 92645 }, { "epoch": 2.263454914127965, "grad_norm": 0.06487621366977692, "learning_rate": 1.3331993382212058e-06, "loss": 0.0002, "num_input_tokens_seen": 62463400, "step": 92650 }, { "epoch": 2.2635770649598124, "grad_norm": 0.0489821583032608, "learning_rate": 1.333118932478853e-06, "loss": 0.0002, "num_input_tokens_seen": 62466856, "step": 92655 }, { "epoch": 2.2636992157916596, "grad_norm": 0.2700420320034027, "learning_rate": 1.3330385243139697e-06, "loss": 0.0005, "num_input_tokens_seen": 62469928, "step": 92660 }, { "epoch": 2.2638213666235067, "grad_norm": 8.69739818572998, "learning_rate": 1.332958113727141e-06, "loss": 0.0006, "num_input_tokens_seen": 62472936, "step": 92665 }, { "epoch": 2.263943517455354, "grad_norm": 0.021106649190187454, "learning_rate": 1.3328777007189507e-06, "loss": 0.009, "num_input_tokens_seen": 62476200, "step": 92670 }, { "epoch": 2.264065668287201, "grad_norm": 0.10658904165029526, "learning_rate": 1.3327972852899847e-06, "loss": 0.0429, "num_input_tokens_seen": 62479400, "step": 92675 }, { "epoch": 2.2641878191190483, "grad_norm": 0.09654999524354935, "learning_rate": 1.3327168674408273e-06, "loss": 0.0715, "num_input_tokens_seen": 62483432, "step": 92680 }, { "epoch": 2.2643099699508955, "grad_norm": 0.13167276978492737, "learning_rate": 1.3326364471720632e-06, "loss": 0.0844, "num_input_tokens_seen": 62487080, "step": 92685 }, { "epoch": 2.2644321207827427, "grad_norm": 0.17308863997459412, "learning_rate": 1.332556024484278e-06, "loss": 0.0573, "num_input_tokens_seen": 62489896, "step": 92690 }, { "epoch": 2.26455427161459, "grad_norm": 0.03751340135931969, "learning_rate": 1.3324755993780554e-06, "loss": 0.039, "num_input_tokens_seen": 62493736, "step": 92695 }, { "epoch": 2.2646764224464366, "grad_norm": 0.036548275500535965, "learning_rate": 1.3323951718539808e-06, "loss": 0.0007, "num_input_tokens_seen": 62497128, "step": 92700 }, { "epoch": 2.2647985732782843, "grad_norm": 0.049087874591350555, "learning_rate": 1.332314741912639e-06, "loss": 0.0034, "num_input_tokens_seen": 62500520, "step": 92705 }, { "epoch": 2.264920724110131, "grad_norm": 0.05646089091897011, "learning_rate": 1.3322343095546153e-06, "loss": 0.0006, "num_input_tokens_seen": 62503528, "step": 92710 }, { "epoch": 2.265042874941978, "grad_norm": 15.631532669067383, "learning_rate": 1.3321538747804942e-06, "loss": 0.083, "num_input_tokens_seen": 62506408, "step": 92715 }, { "epoch": 2.2651650257738254, "grad_norm": 50.563568115234375, "learning_rate": 1.3320734375908608e-06, "loss": 0.0493, "num_input_tokens_seen": 62509608, "step": 92720 }, { "epoch": 2.2652871766056726, "grad_norm": 0.07339024543762207, "learning_rate": 1.3319929979863e-06, "loss": 0.0335, "num_input_tokens_seen": 62512744, "step": 92725 }, { "epoch": 2.2654093274375198, "grad_norm": 0.013352487236261368, "learning_rate": 1.3319125559673968e-06, "loss": 0.0001, "num_input_tokens_seen": 62516136, "step": 92730 }, { "epoch": 2.265531478269367, "grad_norm": 80.81047058105469, "learning_rate": 1.3318321115347364e-06, "loss": 0.0446, "num_input_tokens_seen": 62519592, "step": 92735 }, { "epoch": 2.265653629101214, "grad_norm": 0.11078570783138275, "learning_rate": 1.3317516646889036e-06, "loss": 0.0003, "num_input_tokens_seen": 62522728, "step": 92740 }, { "epoch": 2.2657757799330613, "grad_norm": 0.06942203640937805, "learning_rate": 1.3316712154304835e-06, "loss": 0.0002, "num_input_tokens_seen": 62526312, "step": 92745 }, { "epoch": 2.2658979307649085, "grad_norm": 0.015332935377955437, "learning_rate": 1.331590763760061e-06, "loss": 0.0003, "num_input_tokens_seen": 62529640, "step": 92750 }, { "epoch": 2.2660200815967557, "grad_norm": 0.1101103127002716, "learning_rate": 1.3315103096782215e-06, "loss": 0.0001, "num_input_tokens_seen": 62532904, "step": 92755 }, { "epoch": 2.266142232428603, "grad_norm": 0.006774018984287977, "learning_rate": 1.3314298531855492e-06, "loss": 0.0367, "num_input_tokens_seen": 62536040, "step": 92760 }, { "epoch": 2.26626438326045, "grad_norm": 44.68410873413086, "learning_rate": 1.3313493942826304e-06, "loss": 0.0282, "num_input_tokens_seen": 62539304, "step": 92765 }, { "epoch": 2.2663865340922973, "grad_norm": 0.03351452574133873, "learning_rate": 1.3312689329700497e-06, "loss": 0.0003, "num_input_tokens_seen": 62542696, "step": 92770 }, { "epoch": 2.2665086849241445, "grad_norm": 0.010534519329667091, "learning_rate": 1.331188469248392e-06, "loss": 0.0318, "num_input_tokens_seen": 62546024, "step": 92775 }, { "epoch": 2.2666308357559917, "grad_norm": 0.11503587663173676, "learning_rate": 1.3311080031182428e-06, "loss": 0.0005, "num_input_tokens_seen": 62549032, "step": 92780 }, { "epoch": 2.2667529865878384, "grad_norm": 0.00525831151753664, "learning_rate": 1.331027534580187e-06, "loss": 0.0931, "num_input_tokens_seen": 62552360, "step": 92785 }, { "epoch": 2.266875137419686, "grad_norm": 0.010182957164943218, "learning_rate": 1.3309470636348103e-06, "loss": 0.0513, "num_input_tokens_seen": 62555752, "step": 92790 }, { "epoch": 2.266997288251533, "grad_norm": 44.56449508666992, "learning_rate": 1.3308665902826972e-06, "loss": 0.0737, "num_input_tokens_seen": 62559208, "step": 92795 }, { "epoch": 2.26711943908338, "grad_norm": 0.00906476378440857, "learning_rate": 1.3307861145244335e-06, "loss": 0.0002, "num_input_tokens_seen": 62562472, "step": 92800 }, { "epoch": 2.267241589915227, "grad_norm": 0.006960950791835785, "learning_rate": 1.330705636360604e-06, "loss": 0.081, "num_input_tokens_seen": 62565480, "step": 92805 }, { "epoch": 2.2673637407470744, "grad_norm": 0.6850554943084717, "learning_rate": 1.3306251557917942e-06, "loss": 0.0004, "num_input_tokens_seen": 62568936, "step": 92810 }, { "epoch": 2.2674858915789216, "grad_norm": 0.059107035398483276, "learning_rate": 1.3305446728185894e-06, "loss": 0.0497, "num_input_tokens_seen": 62572264, "step": 92815 }, { "epoch": 2.2676080424107687, "grad_norm": 0.08122166246175766, "learning_rate": 1.330464187441575e-06, "loss": 0.0001, "num_input_tokens_seen": 62576360, "step": 92820 }, { "epoch": 2.267730193242616, "grad_norm": 0.02651078999042511, "learning_rate": 1.3303836996613359e-06, "loss": 0.0281, "num_input_tokens_seen": 62579688, "step": 92825 }, { "epoch": 2.267852344074463, "grad_norm": 0.04520462080836296, "learning_rate": 1.3303032094784575e-06, "loss": 0.048, "num_input_tokens_seen": 62582632, "step": 92830 }, { "epoch": 2.2679744949063103, "grad_norm": 0.026764435693621635, "learning_rate": 1.3302227168935255e-06, "loss": 0.0001, "num_input_tokens_seen": 62585896, "step": 92835 }, { "epoch": 2.2680966457381575, "grad_norm": 0.014396784827113152, "learning_rate": 1.3301422219071252e-06, "loss": 0.0001, "num_input_tokens_seen": 62589800, "step": 92840 }, { "epoch": 2.2682187965700047, "grad_norm": 0.07177358120679855, "learning_rate": 1.330061724519842e-06, "loss": 0.0001, "num_input_tokens_seen": 62593704, "step": 92845 }, { "epoch": 2.268340947401852, "grad_norm": 0.008762385696172714, "learning_rate": 1.329981224732261e-06, "loss": 0.0553, "num_input_tokens_seen": 62596520, "step": 92850 }, { "epoch": 2.268463098233699, "grad_norm": 0.05073574557900429, "learning_rate": 1.3299007225449677e-06, "loss": 0.0001, "num_input_tokens_seen": 62599784, "step": 92855 }, { "epoch": 2.2685852490655463, "grad_norm": 0.043613236397504807, "learning_rate": 1.3298202179585475e-06, "loss": 0.0002, "num_input_tokens_seen": 62603176, "step": 92860 }, { "epoch": 2.2687073998973935, "grad_norm": 0.2333085983991623, "learning_rate": 1.3297397109735862e-06, "loss": 0.002, "num_input_tokens_seen": 62606888, "step": 92865 }, { "epoch": 2.2688295507292406, "grad_norm": 0.4680311679840088, "learning_rate": 1.329659201590669e-06, "loss": 0.0004, "num_input_tokens_seen": 62609960, "step": 92870 }, { "epoch": 2.268951701561088, "grad_norm": 0.013720767572522163, "learning_rate": 1.3295786898103814e-06, "loss": 0.0005, "num_input_tokens_seen": 62613032, "step": 92875 }, { "epoch": 2.2690738523929346, "grad_norm": 0.00633205333724618, "learning_rate": 1.3294981756333087e-06, "loss": 0.0001, "num_input_tokens_seen": 62616104, "step": 92880 }, { "epoch": 2.269196003224782, "grad_norm": 0.015389472246170044, "learning_rate": 1.3294176590600368e-06, "loss": 0.0644, "num_input_tokens_seen": 62619240, "step": 92885 }, { "epoch": 2.269318154056629, "grad_norm": 0.04347032308578491, "learning_rate": 1.3293371400911513e-06, "loss": 0.0001, "num_input_tokens_seen": 62622568, "step": 92890 }, { "epoch": 2.269440304888476, "grad_norm": 0.0029214448295533657, "learning_rate": 1.3292566187272374e-06, "loss": 0.0679, "num_input_tokens_seen": 62626152, "step": 92895 }, { "epoch": 2.2695624557203233, "grad_norm": 42.35622787475586, "learning_rate": 1.3291760949688806e-06, "loss": 0.0608, "num_input_tokens_seen": 62629224, "step": 92900 }, { "epoch": 2.2696846065521705, "grad_norm": 0.01074572466313839, "learning_rate": 1.329095568816667e-06, "loss": 0.0001, "num_input_tokens_seen": 62632872, "step": 92905 }, { "epoch": 2.2698067573840177, "grad_norm": 0.012575157918035984, "learning_rate": 1.3290150402711817e-06, "loss": 0.0001, "num_input_tokens_seen": 62636136, "step": 92910 }, { "epoch": 2.269928908215865, "grad_norm": 0.08424925804138184, "learning_rate": 1.3289345093330104e-06, "loss": 0.1128, "num_input_tokens_seen": 62639272, "step": 92915 }, { "epoch": 2.270051059047712, "grad_norm": 2.9216606616973877, "learning_rate": 1.3288539760027391e-06, "loss": 0.0308, "num_input_tokens_seen": 62642472, "step": 92920 }, { "epoch": 2.2701732098795593, "grad_norm": 241.16482543945312, "learning_rate": 1.3287734402809533e-06, "loss": 0.0691, "num_input_tokens_seen": 62645992, "step": 92925 }, { "epoch": 2.2702953607114065, "grad_norm": 0.0212919432669878, "learning_rate": 1.3286929021682385e-06, "loss": 0.0029, "num_input_tokens_seen": 62649704, "step": 92930 }, { "epoch": 2.2704175115432537, "grad_norm": 0.13828763365745544, "learning_rate": 1.3286123616651806e-06, "loss": 0.0002, "num_input_tokens_seen": 62652968, "step": 92935 }, { "epoch": 2.270539662375101, "grad_norm": 0.008991804905235767, "learning_rate": 1.3285318187723652e-06, "loss": 0.0502, "num_input_tokens_seen": 62656424, "step": 92940 }, { "epoch": 2.270661813206948, "grad_norm": 0.016623621806502342, "learning_rate": 1.3284512734903779e-06, "loss": 0.0548, "num_input_tokens_seen": 62659304, "step": 92945 }, { "epoch": 2.2707839640387952, "grad_norm": 0.018210897222161293, "learning_rate": 1.3283707258198047e-06, "loss": 0.0646, "num_input_tokens_seen": 62662888, "step": 92950 }, { "epoch": 2.2709061148706424, "grad_norm": 0.004802016541361809, "learning_rate": 1.3282901757612314e-06, "loss": 0.0716, "num_input_tokens_seen": 62666280, "step": 92955 }, { "epoch": 2.2710282657024896, "grad_norm": 0.012645594775676727, "learning_rate": 1.3282096233152435e-06, "loss": 0.0642, "num_input_tokens_seen": 62669992, "step": 92960 }, { "epoch": 2.2711504165343364, "grad_norm": 0.010564680211246014, "learning_rate": 1.3281290684824268e-06, "loss": 0.013, "num_input_tokens_seen": 62673128, "step": 92965 }, { "epoch": 2.271272567366184, "grad_norm": 0.008313731290400028, "learning_rate": 1.3280485112633675e-06, "loss": 0.1342, "num_input_tokens_seen": 62676328, "step": 92970 }, { "epoch": 2.2713947181980307, "grad_norm": 0.023614229634404182, "learning_rate": 1.327967951658651e-06, "loss": 0.0004, "num_input_tokens_seen": 62680232, "step": 92975 }, { "epoch": 2.271516869029878, "grad_norm": 0.0680750384926796, "learning_rate": 1.3278873896688633e-06, "loss": 0.0007, "num_input_tokens_seen": 62683624, "step": 92980 }, { "epoch": 2.271639019861725, "grad_norm": 0.023569390177726746, "learning_rate": 1.3278068252945908e-06, "loss": 0.0001, "num_input_tokens_seen": 62686952, "step": 92985 }, { "epoch": 2.2717611706935723, "grad_norm": 104.64521026611328, "learning_rate": 1.327726258536418e-06, "loss": 0.1115, "num_input_tokens_seen": 62690600, "step": 92990 }, { "epoch": 2.2718833215254195, "grad_norm": 0.01823197677731514, "learning_rate": 1.3276456893949325e-06, "loss": 0.0355, "num_input_tokens_seen": 62693928, "step": 92995 }, { "epoch": 2.2720054723572667, "grad_norm": 0.02083517052233219, "learning_rate": 1.3275651178707194e-06, "loss": 0.0633, "num_input_tokens_seen": 62697320, "step": 93000 }, { "epoch": 2.272127623189114, "grad_norm": 0.01520773395895958, "learning_rate": 1.3274845439643645e-06, "loss": 0.0125, "num_input_tokens_seen": 62700712, "step": 93005 }, { "epoch": 2.272249774020961, "grad_norm": 0.005817287135869265, "learning_rate": 1.3274039676764535e-06, "loss": 0.0001, "num_input_tokens_seen": 62703656, "step": 93010 }, { "epoch": 2.2723719248528083, "grad_norm": 0.012480477802455425, "learning_rate": 1.3273233890075733e-06, "loss": 0.0003, "num_input_tokens_seen": 62707240, "step": 93015 }, { "epoch": 2.2724940756846554, "grad_norm": 0.010067179799079895, "learning_rate": 1.327242807958309e-06, "loss": 0.0003, "num_input_tokens_seen": 62710376, "step": 93020 }, { "epoch": 2.2726162265165026, "grad_norm": 0.05216876044869423, "learning_rate": 1.3271622245292473e-06, "loss": 0.0431, "num_input_tokens_seen": 62713512, "step": 93025 }, { "epoch": 2.27273837734835, "grad_norm": 0.0039581493474543095, "learning_rate": 1.3270816387209738e-06, "loss": 0.0336, "num_input_tokens_seen": 62716904, "step": 93030 }, { "epoch": 2.272860528180197, "grad_norm": 0.012547915801405907, "learning_rate": 1.3270010505340748e-06, "loss": 0.0526, "num_input_tokens_seen": 62720296, "step": 93035 }, { "epoch": 2.272982679012044, "grad_norm": 0.6097759008407593, "learning_rate": 1.3269204599691357e-06, "loss": 0.0002, "num_input_tokens_seen": 62723432, "step": 93040 }, { "epoch": 2.2731048298438914, "grad_norm": 0.04886188358068466, "learning_rate": 1.3268398670267438e-06, "loss": 0.0525, "num_input_tokens_seen": 62726824, "step": 93045 }, { "epoch": 2.2732269806757386, "grad_norm": 0.056709174066782, "learning_rate": 1.326759271707484e-06, "loss": 0.0008, "num_input_tokens_seen": 62730024, "step": 93050 }, { "epoch": 2.2733491315075858, "grad_norm": 0.023183194920420647, "learning_rate": 1.3266786740119428e-06, "loss": 0.0525, "num_input_tokens_seen": 62733096, "step": 93055 }, { "epoch": 2.2734712823394325, "grad_norm": 0.004460521508008242, "learning_rate": 1.3265980739407068e-06, "loss": 0.0002, "num_input_tokens_seen": 62737320, "step": 93060 }, { "epoch": 2.2735934331712797, "grad_norm": 0.025767408311367035, "learning_rate": 1.3265174714943618e-06, "loss": 0.0004, "num_input_tokens_seen": 62740904, "step": 93065 }, { "epoch": 2.273715584003127, "grad_norm": 0.015531780198216438, "learning_rate": 1.3264368666734933e-06, "loss": 0.0359, "num_input_tokens_seen": 62744296, "step": 93070 }, { "epoch": 2.273837734834974, "grad_norm": 0.019820986315608025, "learning_rate": 1.3263562594786886e-06, "loss": 0.0389, "num_input_tokens_seen": 62747304, "step": 93075 }, { "epoch": 2.2739598856668213, "grad_norm": 21.325904846191406, "learning_rate": 1.3262756499105333e-06, "loss": 0.0492, "num_input_tokens_seen": 62750632, "step": 93080 }, { "epoch": 2.2740820364986685, "grad_norm": 0.04072573408484459, "learning_rate": 1.3261950379696136e-06, "loss": 0.1146, "num_input_tokens_seen": 62753832, "step": 93085 }, { "epoch": 2.2742041873305157, "grad_norm": 0.11378677934408188, "learning_rate": 1.326114423656516e-06, "loss": 0.0013, "num_input_tokens_seen": 62757416, "step": 93090 }, { "epoch": 2.274326338162363, "grad_norm": 86.40130615234375, "learning_rate": 1.3260338069718266e-06, "loss": 0.037, "num_input_tokens_seen": 62760424, "step": 93095 }, { "epoch": 2.27444848899421, "grad_norm": 0.27668333053588867, "learning_rate": 1.3259531879161316e-06, "loss": 0.0509, "num_input_tokens_seen": 62764072, "step": 93100 }, { "epoch": 2.2745706398260572, "grad_norm": 0.05763066187500954, "learning_rate": 1.3258725664900173e-06, "loss": 0.0347, "num_input_tokens_seen": 62767720, "step": 93105 }, { "epoch": 2.2746927906579044, "grad_norm": 0.0003041908785235137, "learning_rate": 1.3257919426940703e-06, "loss": 0.0002, "num_input_tokens_seen": 62771112, "step": 93110 }, { "epoch": 2.2748149414897516, "grad_norm": 0.09503401070833206, "learning_rate": 1.3257113165288764e-06, "loss": 0.0004, "num_input_tokens_seen": 62774440, "step": 93115 }, { "epoch": 2.274937092321599, "grad_norm": 0.01079515554010868, "learning_rate": 1.3256306879950224e-06, "loss": 0.0004, "num_input_tokens_seen": 62777960, "step": 93120 }, { "epoch": 2.275059243153446, "grad_norm": 25.221628189086914, "learning_rate": 1.3255500570930945e-06, "loss": 0.0501, "num_input_tokens_seen": 62781160, "step": 93125 }, { "epoch": 2.275181393985293, "grad_norm": 0.0537249855697155, "learning_rate": 1.3254694238236788e-06, "loss": 0.0002, "num_input_tokens_seen": 62784168, "step": 93130 }, { "epoch": 2.2753035448171404, "grad_norm": 0.0006929787923581898, "learning_rate": 1.3253887881873618e-06, "loss": 0.0001, "num_input_tokens_seen": 62787112, "step": 93135 }, { "epoch": 2.2754256956489876, "grad_norm": 0.08567481487989426, "learning_rate": 1.32530815018473e-06, "loss": 0.0002, "num_input_tokens_seen": 62790760, "step": 93140 }, { "epoch": 2.2755478464808343, "grad_norm": 0.028937041759490967, "learning_rate": 1.3252275098163701e-06, "loss": 0.0524, "num_input_tokens_seen": 62794344, "step": 93145 }, { "epoch": 2.275669997312682, "grad_norm": 41.68051528930664, "learning_rate": 1.3251468670828683e-06, "loss": 0.1383, "num_input_tokens_seen": 62797672, "step": 93150 }, { "epoch": 2.2757921481445287, "grad_norm": 0.09774192422628403, "learning_rate": 1.325066221984811e-06, "loss": 0.0647, "num_input_tokens_seen": 62801320, "step": 93155 }, { "epoch": 2.275914298976376, "grad_norm": 0.010497506707906723, "learning_rate": 1.3249855745227847e-06, "loss": 0.1175, "num_input_tokens_seen": 62804392, "step": 93160 }, { "epoch": 2.276036449808223, "grad_norm": 0.33297374844551086, "learning_rate": 1.3249049246973757e-06, "loss": 0.0003, "num_input_tokens_seen": 62807528, "step": 93165 }, { "epoch": 2.2761586006400703, "grad_norm": 0.11804129928350449, "learning_rate": 1.3248242725091707e-06, "loss": 0.0396, "num_input_tokens_seen": 62810792, "step": 93170 }, { "epoch": 2.2762807514719174, "grad_norm": 0.05837703496217728, "learning_rate": 1.3247436179587563e-06, "loss": 0.0005, "num_input_tokens_seen": 62814120, "step": 93175 }, { "epoch": 2.2764029023037646, "grad_norm": 0.28625261783599854, "learning_rate": 1.324662961046719e-06, "loss": 0.0692, "num_input_tokens_seen": 62817576, "step": 93180 }, { "epoch": 2.276525053135612, "grad_norm": 0.04373976215720177, "learning_rate": 1.3245823017736454e-06, "loss": 0.0486, "num_input_tokens_seen": 62821224, "step": 93185 }, { "epoch": 2.276647203967459, "grad_norm": 0.07923418283462524, "learning_rate": 1.324501640140122e-06, "loss": 0.0341, "num_input_tokens_seen": 62824552, "step": 93190 }, { "epoch": 2.276769354799306, "grad_norm": 25.96709442138672, "learning_rate": 1.3244209761467352e-06, "loss": 0.0629, "num_input_tokens_seen": 62827752, "step": 93195 }, { "epoch": 2.2768915056311534, "grad_norm": 0.0062352376990020275, "learning_rate": 1.324340309794072e-06, "loss": 0.0069, "num_input_tokens_seen": 62830696, "step": 93200 }, { "epoch": 2.2770136564630006, "grad_norm": 14.839695930480957, "learning_rate": 1.3242596410827187e-06, "loss": 0.0998, "num_input_tokens_seen": 62834088, "step": 93205 }, { "epoch": 2.2771358072948478, "grad_norm": 0.013603686355054379, "learning_rate": 1.3241789700132621e-06, "loss": 0.0689, "num_input_tokens_seen": 62837288, "step": 93210 }, { "epoch": 2.277257958126695, "grad_norm": 0.03261496499180794, "learning_rate": 1.324098296586289e-06, "loss": 0.1573, "num_input_tokens_seen": 62840488, "step": 93215 }, { "epoch": 2.277380108958542, "grad_norm": 0.0007041191565804183, "learning_rate": 1.324017620802386e-06, "loss": 0.0571, "num_input_tokens_seen": 62843880, "step": 93220 }, { "epoch": 2.2775022597903893, "grad_norm": 13.467567443847656, "learning_rate": 1.3239369426621391e-06, "loss": 0.1587, "num_input_tokens_seen": 62847144, "step": 93225 }, { "epoch": 2.2776244106222365, "grad_norm": 0.07765517383813858, "learning_rate": 1.323856262166136e-06, "loss": 0.0686, "num_input_tokens_seen": 62850600, "step": 93230 }, { "epoch": 2.2777465614540837, "grad_norm": 0.0022951867431402206, "learning_rate": 1.323775579314963e-06, "loss": 0.0006, "num_input_tokens_seen": 62854248, "step": 93235 }, { "epoch": 2.2778687122859305, "grad_norm": 0.04524902254343033, "learning_rate": 1.323694894109207e-06, "loss": 0.0004, "num_input_tokens_seen": 62857512, "step": 93240 }, { "epoch": 2.2779908631177777, "grad_norm": 0.06177098676562309, "learning_rate": 1.3236142065494546e-06, "loss": 0.0003, "num_input_tokens_seen": 62861096, "step": 93245 }, { "epoch": 2.278113013949625, "grad_norm": 39.986083984375, "learning_rate": 1.3235335166362926e-06, "loss": 0.1099, "num_input_tokens_seen": 62864424, "step": 93250 }, { "epoch": 2.278235164781472, "grad_norm": 0.056117504835128784, "learning_rate": 1.323452824370308e-06, "loss": 0.0002, "num_input_tokens_seen": 62867624, "step": 93255 }, { "epoch": 2.2783573156133192, "grad_norm": 0.06814853101968765, "learning_rate": 1.3233721297520875e-06, "loss": 0.0283, "num_input_tokens_seen": 62871080, "step": 93260 }, { "epoch": 2.2784794664451664, "grad_norm": 0.08036066591739655, "learning_rate": 1.3232914327822177e-06, "loss": 0.0589, "num_input_tokens_seen": 62874024, "step": 93265 }, { "epoch": 2.2786016172770136, "grad_norm": 0.01860756240785122, "learning_rate": 1.3232107334612858e-06, "loss": 0.0002, "num_input_tokens_seen": 62877160, "step": 93270 }, { "epoch": 2.278723768108861, "grad_norm": 0.06197257712483406, "learning_rate": 1.3231300317898786e-06, "loss": 0.0421, "num_input_tokens_seen": 62880296, "step": 93275 }, { "epoch": 2.278845918940708, "grad_norm": 27.19537925720215, "learning_rate": 1.3230493277685826e-06, "loss": 0.0779, "num_input_tokens_seen": 62883560, "step": 93280 }, { "epoch": 2.278968069772555, "grad_norm": 0.1904158592224121, "learning_rate": 1.322968621397985e-06, "loss": 0.0413, "num_input_tokens_seen": 62886952, "step": 93285 }, { "epoch": 2.2790902206044024, "grad_norm": 0.03217314928770065, "learning_rate": 1.322887912678673e-06, "loss": 0.0428, "num_input_tokens_seen": 62890344, "step": 93290 }, { "epoch": 2.2792123714362496, "grad_norm": 0.2809275984764099, "learning_rate": 1.322807201611233e-06, "loss": 0.0002, "num_input_tokens_seen": 62894248, "step": 93295 }, { "epoch": 2.2793345222680967, "grad_norm": 0.024791276082396507, "learning_rate": 1.3227264881962522e-06, "loss": 0.0004, "num_input_tokens_seen": 62898216, "step": 93300 }, { "epoch": 2.279456673099944, "grad_norm": 0.37984398007392883, "learning_rate": 1.322645772434318e-06, "loss": 0.0003, "num_input_tokens_seen": 62902184, "step": 93305 }, { "epoch": 2.279578823931791, "grad_norm": 0.07157126814126968, "learning_rate": 1.3225650543260168e-06, "loss": 0.1207, "num_input_tokens_seen": 62905640, "step": 93310 }, { "epoch": 2.2797009747636383, "grad_norm": 0.19514942169189453, "learning_rate": 1.3224843338719356e-06, "loss": 0.0002, "num_input_tokens_seen": 62908712, "step": 93315 }, { "epoch": 2.2798231255954855, "grad_norm": 0.960155725479126, "learning_rate": 1.3224036110726614e-06, "loss": 0.0415, "num_input_tokens_seen": 62912104, "step": 93320 }, { "epoch": 2.2799452764273322, "grad_norm": 0.04093187674880028, "learning_rate": 1.3223228859287815e-06, "loss": 0.0006, "num_input_tokens_seen": 62915688, "step": 93325 }, { "epoch": 2.28006742725918, "grad_norm": 0.006360192783176899, "learning_rate": 1.3222421584408832e-06, "loss": 0.0001, "num_input_tokens_seen": 62919144, "step": 93330 }, { "epoch": 2.2801895780910266, "grad_norm": 390.7582092285156, "learning_rate": 1.3221614286095531e-06, "loss": 0.0564, "num_input_tokens_seen": 62922728, "step": 93335 }, { "epoch": 2.280311728922874, "grad_norm": 0.014665957540273666, "learning_rate": 1.3220806964353784e-06, "loss": 0.0039, "num_input_tokens_seen": 62926120, "step": 93340 }, { "epoch": 2.280433879754721, "grad_norm": 0.0038363176863640547, "learning_rate": 1.3219999619189462e-06, "loss": 0.036, "num_input_tokens_seen": 62929384, "step": 93345 }, { "epoch": 2.280556030586568, "grad_norm": 0.015091368928551674, "learning_rate": 1.3219192250608436e-06, "loss": 0.0003, "num_input_tokens_seen": 62932584, "step": 93350 }, { "epoch": 2.2806781814184154, "grad_norm": 0.01974570006132126, "learning_rate": 1.321838485861658e-06, "loss": 0.0001, "num_input_tokens_seen": 62935976, "step": 93355 }, { "epoch": 2.2808003322502626, "grad_norm": 0.009472187608480453, "learning_rate": 1.3217577443219763e-06, "loss": 0.0001, "num_input_tokens_seen": 62939176, "step": 93360 }, { "epoch": 2.2809224830821098, "grad_norm": 0.018192993476986885, "learning_rate": 1.3216770004423858e-06, "loss": 0.0003, "num_input_tokens_seen": 62942312, "step": 93365 }, { "epoch": 2.281044633913957, "grad_norm": 0.002576603088527918, "learning_rate": 1.3215962542234735e-06, "loss": 0.0, "num_input_tokens_seen": 62945448, "step": 93370 }, { "epoch": 2.281166784745804, "grad_norm": 0.1858045905828476, "learning_rate": 1.321515505665827e-06, "loss": 0.0002, "num_input_tokens_seen": 62948584, "step": 93375 }, { "epoch": 2.2812889355776513, "grad_norm": 0.042001720517873764, "learning_rate": 1.321434754770033e-06, "loss": 0.0001, "num_input_tokens_seen": 62951528, "step": 93380 }, { "epoch": 2.2814110864094985, "grad_norm": 0.17240160703659058, "learning_rate": 1.3213540015366789e-06, "loss": 0.0001, "num_input_tokens_seen": 62954664, "step": 93385 }, { "epoch": 2.2815332372413457, "grad_norm": 0.08950382471084595, "learning_rate": 1.3212732459663524e-06, "loss": 0.0003, "num_input_tokens_seen": 62958504, "step": 93390 }, { "epoch": 2.281655388073193, "grad_norm": 0.003479085164144635, "learning_rate": 1.32119248805964e-06, "loss": 0.0001, "num_input_tokens_seen": 62961384, "step": 93395 }, { "epoch": 2.28177753890504, "grad_norm": 0.0005125111783854663, "learning_rate": 1.3211117278171297e-06, "loss": 0.0355, "num_input_tokens_seen": 62965224, "step": 93400 }, { "epoch": 2.2818996897368873, "grad_norm": 0.018022790551185608, "learning_rate": 1.3210309652394087e-06, "loss": 0.0006, "num_input_tokens_seen": 62968488, "step": 93405 }, { "epoch": 2.282021840568734, "grad_norm": 133.34310913085938, "learning_rate": 1.3209502003270641e-06, "loss": 0.0325, "num_input_tokens_seen": 62972264, "step": 93410 }, { "epoch": 2.2821439914005817, "grad_norm": 0.027756785973906517, "learning_rate": 1.3208694330806834e-06, "loss": 0.1406, "num_input_tokens_seen": 62975464, "step": 93415 }, { "epoch": 2.2822661422324284, "grad_norm": 0.0014253148110583425, "learning_rate": 1.3207886635008535e-06, "loss": 0.0667, "num_input_tokens_seen": 62978792, "step": 93420 }, { "epoch": 2.2823882930642756, "grad_norm": 0.0035388257820159197, "learning_rate": 1.3207078915881624e-06, "loss": 0.0002, "num_input_tokens_seen": 62982312, "step": 93425 }, { "epoch": 2.282510443896123, "grad_norm": 0.05845514312386513, "learning_rate": 1.3206271173431973e-06, "loss": 0.0, "num_input_tokens_seen": 62985640, "step": 93430 }, { "epoch": 2.28263259472797, "grad_norm": 0.18427708745002747, "learning_rate": 1.3205463407665456e-06, "loss": 0.0002, "num_input_tokens_seen": 62988840, "step": 93435 }, { "epoch": 2.282754745559817, "grad_norm": 0.00430617481470108, "learning_rate": 1.3204655618587946e-06, "loss": 0.1984, "num_input_tokens_seen": 62992168, "step": 93440 }, { "epoch": 2.2828768963916644, "grad_norm": 20.258394241333008, "learning_rate": 1.3203847806205316e-06, "loss": 0.0423, "num_input_tokens_seen": 62995432, "step": 93445 }, { "epoch": 2.2829990472235115, "grad_norm": 0.0059243980795145035, "learning_rate": 1.3203039970523446e-06, "loss": 0.0516, "num_input_tokens_seen": 62998696, "step": 93450 }, { "epoch": 2.2831211980553587, "grad_norm": 19.95654296875, "learning_rate": 1.3202232111548208e-06, "loss": 0.0456, "num_input_tokens_seen": 63002024, "step": 93455 }, { "epoch": 2.283243348887206, "grad_norm": 0.07648692280054092, "learning_rate": 1.3201424229285476e-06, "loss": 0.0001, "num_input_tokens_seen": 63005480, "step": 93460 }, { "epoch": 2.283365499719053, "grad_norm": 0.11834115535020828, "learning_rate": 1.3200616323741129e-06, "loss": 0.0004, "num_input_tokens_seen": 63008872, "step": 93465 }, { "epoch": 2.2834876505509003, "grad_norm": 0.12660598754882812, "learning_rate": 1.3199808394921034e-06, "loss": 0.0004, "num_input_tokens_seen": 63012328, "step": 93470 }, { "epoch": 2.2836098013827475, "grad_norm": 0.01059301383793354, "learning_rate": 1.3199000442831074e-06, "loss": 0.062, "num_input_tokens_seen": 63015912, "step": 93475 }, { "epoch": 2.2837319522145947, "grad_norm": 0.0022777384147047997, "learning_rate": 1.3198192467477122e-06, "loss": 0.0001, "num_input_tokens_seen": 63019304, "step": 93480 }, { "epoch": 2.283854103046442, "grad_norm": 0.13538259267807007, "learning_rate": 1.3197384468865057e-06, "loss": 0.0002, "num_input_tokens_seen": 63022632, "step": 93485 }, { "epoch": 2.283976253878289, "grad_norm": 34.09814453125, "learning_rate": 1.3196576447000748e-06, "loss": 0.1495, "num_input_tokens_seen": 63025704, "step": 93490 }, { "epoch": 2.2840984047101363, "grad_norm": 21.362201690673828, "learning_rate": 1.3195768401890077e-06, "loss": 0.128, "num_input_tokens_seen": 63028968, "step": 93495 }, { "epoch": 2.2842205555419834, "grad_norm": 0.09496381878852844, "learning_rate": 1.3194960333538918e-06, "loss": 0.0016, "num_input_tokens_seen": 63032488, "step": 93500 }, { "epoch": 2.28434270637383, "grad_norm": 5.9453537687659264e-05, "learning_rate": 1.3194152241953148e-06, "loss": 0.0014, "num_input_tokens_seen": 63035816, "step": 93505 }, { "epoch": 2.2844648572056774, "grad_norm": 0.012698279693722725, "learning_rate": 1.3193344127138647e-06, "loss": 0.0005, "num_input_tokens_seen": 63039336, "step": 93510 }, { "epoch": 2.2845870080375246, "grad_norm": 0.024744637310504913, "learning_rate": 1.3192535989101285e-06, "loss": 0.0943, "num_input_tokens_seen": 63043304, "step": 93515 }, { "epoch": 2.2847091588693718, "grad_norm": 0.05719945207238197, "learning_rate": 1.3191727827846945e-06, "loss": 0.0004, "num_input_tokens_seen": 63046888, "step": 93520 }, { "epoch": 2.284831309701219, "grad_norm": 0.016214219853281975, "learning_rate": 1.31909196433815e-06, "loss": 0.0266, "num_input_tokens_seen": 63050344, "step": 93525 }, { "epoch": 2.284953460533066, "grad_norm": 0.10411170870065689, "learning_rate": 1.3190111435710828e-06, "loss": 0.0393, "num_input_tokens_seen": 63053416, "step": 93530 }, { "epoch": 2.2850756113649133, "grad_norm": 0.6027932167053223, "learning_rate": 1.3189303204840809e-06, "loss": 0.0315, "num_input_tokens_seen": 63056680, "step": 93535 }, { "epoch": 2.2851977621967605, "grad_norm": 0.018875345587730408, "learning_rate": 1.3188494950777318e-06, "loss": 0.0002, "num_input_tokens_seen": 63060200, "step": 93540 }, { "epoch": 2.2853199130286077, "grad_norm": 0.019894888624548912, "learning_rate": 1.3187686673526238e-06, "loss": 0.0001, "num_input_tokens_seen": 63063592, "step": 93545 }, { "epoch": 2.285442063860455, "grad_norm": 0.08889276534318924, "learning_rate": 1.3186878373093438e-06, "loss": 0.0578, "num_input_tokens_seen": 63067432, "step": 93550 }, { "epoch": 2.285564214692302, "grad_norm": 0.030163150280714035, "learning_rate": 1.3186070049484806e-06, "loss": 0.0002, "num_input_tokens_seen": 63070632, "step": 93555 }, { "epoch": 2.2856863655241493, "grad_norm": 0.026965975761413574, "learning_rate": 1.3185261702706211e-06, "loss": 0.0592, "num_input_tokens_seen": 63074472, "step": 93560 }, { "epoch": 2.2858085163559965, "grad_norm": 0.012265127152204514, "learning_rate": 1.3184453332763542e-06, "loss": 0.1175, "num_input_tokens_seen": 63077480, "step": 93565 }, { "epoch": 2.2859306671878437, "grad_norm": 70.07109832763672, "learning_rate": 1.3183644939662668e-06, "loss": 0.0708, "num_input_tokens_seen": 63082664, "step": 93570 }, { "epoch": 2.286052818019691, "grad_norm": 0.15027621388435364, "learning_rate": 1.318283652340947e-06, "loss": 0.0003, "num_input_tokens_seen": 63086248, "step": 93575 }, { "epoch": 2.286174968851538, "grad_norm": 390.86004638671875, "learning_rate": 1.3182028084009832e-06, "loss": 0.0194, "num_input_tokens_seen": 63089704, "step": 93580 }, { "epoch": 2.2862971196833852, "grad_norm": 135.47552490234375, "learning_rate": 1.318121962146963e-06, "loss": 0.1173, "num_input_tokens_seen": 63092840, "step": 93585 }, { "epoch": 2.286419270515232, "grad_norm": 0.01921449974179268, "learning_rate": 1.3180411135794742e-06, "loss": 0.0005, "num_input_tokens_seen": 63096104, "step": 93590 }, { "epoch": 2.2865414213470796, "grad_norm": 0.05642332881689072, "learning_rate": 1.317960262699105e-06, "loss": 0.0478, "num_input_tokens_seen": 63099560, "step": 93595 }, { "epoch": 2.2866635721789264, "grad_norm": 36.93173599243164, "learning_rate": 1.317879409506443e-06, "loss": 0.0823, "num_input_tokens_seen": 63102952, "step": 93600 }, { "epoch": 2.2867857230107735, "grad_norm": 0.023783983662724495, "learning_rate": 1.3177985540020765e-06, "loss": 0.1719, "num_input_tokens_seen": 63106728, "step": 93605 }, { "epoch": 2.2869078738426207, "grad_norm": 0.04622891917824745, "learning_rate": 1.3177176961865934e-06, "loss": 0.0003, "num_input_tokens_seen": 63110440, "step": 93610 }, { "epoch": 2.287030024674468, "grad_norm": 0.06299111992120743, "learning_rate": 1.3176368360605818e-06, "loss": 0.0495, "num_input_tokens_seen": 63113960, "step": 93615 }, { "epoch": 2.287152175506315, "grad_norm": 0.5457538962364197, "learning_rate": 1.3175559736246302e-06, "loss": 0.0435, "num_input_tokens_seen": 63117032, "step": 93620 }, { "epoch": 2.2872743263381623, "grad_norm": 0.15435582399368286, "learning_rate": 1.3174751088793257e-06, "loss": 0.0382, "num_input_tokens_seen": 63120360, "step": 93625 }, { "epoch": 2.2873964771700095, "grad_norm": 0.006368239410221577, "learning_rate": 1.3173942418252566e-06, "loss": 0.0003, "num_input_tokens_seen": 63123368, "step": 93630 }, { "epoch": 2.2875186280018567, "grad_norm": 0.010606862604618073, "learning_rate": 1.3173133724630114e-06, "loss": 0.0003, "num_input_tokens_seen": 63127144, "step": 93635 }, { "epoch": 2.287640778833704, "grad_norm": 19.65880012512207, "learning_rate": 1.3172325007931782e-06, "loss": 0.0272, "num_input_tokens_seen": 63130664, "step": 93640 }, { "epoch": 2.287762929665551, "grad_norm": 0.015704812481999397, "learning_rate": 1.3171516268163447e-06, "loss": 0.0012, "num_input_tokens_seen": 63133672, "step": 93645 }, { "epoch": 2.2878850804973982, "grad_norm": 0.0535929799079895, "learning_rate": 1.3170707505330993e-06, "loss": 0.0635, "num_input_tokens_seen": 63137512, "step": 93650 }, { "epoch": 2.2880072313292454, "grad_norm": 0.030435847118496895, "learning_rate": 1.3169898719440301e-06, "loss": 0.1236, "num_input_tokens_seen": 63140776, "step": 93655 }, { "epoch": 2.2881293821610926, "grad_norm": 0.016993314027786255, "learning_rate": 1.3169089910497254e-06, "loss": 0.0006, "num_input_tokens_seen": 63144232, "step": 93660 }, { "epoch": 2.28825153299294, "grad_norm": 0.0037211801391094923, "learning_rate": 1.3168281078507735e-06, "loss": 0.0117, "num_input_tokens_seen": 63147624, "step": 93665 }, { "epoch": 2.288373683824787, "grad_norm": 0.058646202087402344, "learning_rate": 1.316747222347762e-06, "loss": 0.0002, "num_input_tokens_seen": 63150824, "step": 93670 }, { "epoch": 2.288495834656634, "grad_norm": 0.15246830880641937, "learning_rate": 1.3166663345412796e-06, "loss": 0.0422, "num_input_tokens_seen": 63154088, "step": 93675 }, { "epoch": 2.2886179854884814, "grad_norm": 0.34976163506507874, "learning_rate": 1.3165854444319148e-06, "loss": 0.0003, "num_input_tokens_seen": 63158632, "step": 93680 }, { "epoch": 2.288740136320328, "grad_norm": 0.10934649407863617, "learning_rate": 1.316504552020255e-06, "loss": 0.0002, "num_input_tokens_seen": 63161704, "step": 93685 }, { "epoch": 2.2888622871521753, "grad_norm": 0.0155659019947052, "learning_rate": 1.316423657306889e-06, "loss": 0.0002, "num_input_tokens_seen": 63164712, "step": 93690 }, { "epoch": 2.2889844379840225, "grad_norm": 0.018553772941231728, "learning_rate": 1.3163427602924052e-06, "loss": 0.0003, "num_input_tokens_seen": 63168040, "step": 93695 }, { "epoch": 2.2891065888158697, "grad_norm": 0.09233409911394119, "learning_rate": 1.3162618609773917e-06, "loss": 0.0003, "num_input_tokens_seen": 63170984, "step": 93700 }, { "epoch": 2.289228739647717, "grad_norm": 0.9830764532089233, "learning_rate": 1.316180959362437e-06, "loss": 0.0006, "num_input_tokens_seen": 63174312, "step": 93705 }, { "epoch": 2.289350890479564, "grad_norm": 0.07397404313087463, "learning_rate": 1.3161000554481292e-06, "loss": 0.0002, "num_input_tokens_seen": 63177512, "step": 93710 }, { "epoch": 2.2894730413114113, "grad_norm": 0.06915701925754547, "learning_rate": 1.3160191492350568e-06, "loss": 0.0007, "num_input_tokens_seen": 63180776, "step": 93715 }, { "epoch": 2.2895951921432585, "grad_norm": 0.0010076145408675075, "learning_rate": 1.3159382407238083e-06, "loss": 0.0002, "num_input_tokens_seen": 63183976, "step": 93720 }, { "epoch": 2.2897173429751057, "grad_norm": 0.018281618133187294, "learning_rate": 1.3158573299149716e-06, "loss": 0.0764, "num_input_tokens_seen": 63186856, "step": 93725 }, { "epoch": 2.289839493806953, "grad_norm": 0.0028682551346719265, "learning_rate": 1.3157764168091356e-06, "loss": 0.0001, "num_input_tokens_seen": 63190376, "step": 93730 }, { "epoch": 2.2899616446388, "grad_norm": 0.02593367174267769, "learning_rate": 1.3156955014068886e-06, "loss": 0.0002, "num_input_tokens_seen": 63193512, "step": 93735 }, { "epoch": 2.290083795470647, "grad_norm": 0.05124660208821297, "learning_rate": 1.3156145837088192e-06, "loss": 0.0001, "num_input_tokens_seen": 63196840, "step": 93740 }, { "epoch": 2.2902059463024944, "grad_norm": 26.817052841186523, "learning_rate": 1.3155336637155154e-06, "loss": 0.0659, "num_input_tokens_seen": 63200040, "step": 93745 }, { "epoch": 2.2903280971343416, "grad_norm": 0.020981481298804283, "learning_rate": 1.315452741427566e-06, "loss": 0.1073, "num_input_tokens_seen": 63203752, "step": 93750 }, { "epoch": 2.290450247966189, "grad_norm": 0.016080111265182495, "learning_rate": 1.3153718168455595e-06, "loss": 0.1151, "num_input_tokens_seen": 63207144, "step": 93755 }, { "epoch": 2.290572398798036, "grad_norm": 0.04538425803184509, "learning_rate": 1.315290889970084e-06, "loss": 0.0002, "num_input_tokens_seen": 63210536, "step": 93760 }, { "epoch": 2.290694549629883, "grad_norm": 0.043526507914066315, "learning_rate": 1.3152099608017286e-06, "loss": 0.0005, "num_input_tokens_seen": 63213800, "step": 93765 }, { "epoch": 2.29081670046173, "grad_norm": 797.885498046875, "learning_rate": 1.3151290293410818e-06, "loss": 0.0102, "num_input_tokens_seen": 63217320, "step": 93770 }, { "epoch": 2.2909388512935775, "grad_norm": 0.13713596761226654, "learning_rate": 1.315048095588732e-06, "loss": 0.0002, "num_input_tokens_seen": 63220712, "step": 93775 }, { "epoch": 2.2910610021254243, "grad_norm": 0.03999638929963112, "learning_rate": 1.3149671595452674e-06, "loss": 0.0384, "num_input_tokens_seen": 63224104, "step": 93780 }, { "epoch": 2.2911831529572715, "grad_norm": 0.03355813026428223, "learning_rate": 1.3148862212112765e-06, "loss": 0.0002, "num_input_tokens_seen": 63227304, "step": 93785 }, { "epoch": 2.2913053037891187, "grad_norm": 0.0039594462141394615, "learning_rate": 1.314805280587349e-06, "loss": 0.0001, "num_input_tokens_seen": 63230376, "step": 93790 }, { "epoch": 2.291427454620966, "grad_norm": 0.012313015758991241, "learning_rate": 1.3147243376740724e-06, "loss": 0.002, "num_input_tokens_seen": 63233320, "step": 93795 }, { "epoch": 2.291549605452813, "grad_norm": 0.011920818127691746, "learning_rate": 1.314643392472036e-06, "loss": 0.0005, "num_input_tokens_seen": 63236648, "step": 93800 }, { "epoch": 2.2916717562846602, "grad_norm": 0.3594484329223633, "learning_rate": 1.3145624449818283e-06, "loss": 0.0002, "num_input_tokens_seen": 63239848, "step": 93805 }, { "epoch": 2.2917939071165074, "grad_norm": 0.04723503440618515, "learning_rate": 1.3144814952040375e-06, "loss": 0.0002, "num_input_tokens_seen": 63243368, "step": 93810 }, { "epoch": 2.2919160579483546, "grad_norm": 0.05795615538954735, "learning_rate": 1.314400543139253e-06, "loss": 0.084, "num_input_tokens_seen": 63246568, "step": 93815 }, { "epoch": 2.292038208780202, "grad_norm": 0.10335791110992432, "learning_rate": 1.3143195887880631e-06, "loss": 0.0701, "num_input_tokens_seen": 63250024, "step": 93820 }, { "epoch": 2.292160359612049, "grad_norm": 0.006964336149394512, "learning_rate": 1.3142386321510565e-06, "loss": 0.0002, "num_input_tokens_seen": 63253608, "step": 93825 }, { "epoch": 2.292282510443896, "grad_norm": 0.008066564798355103, "learning_rate": 1.3141576732288223e-06, "loss": 0.0008, "num_input_tokens_seen": 63256616, "step": 93830 }, { "epoch": 2.2924046612757434, "grad_norm": 0.005343073047697544, "learning_rate": 1.314076712021949e-06, "loss": 0.099, "num_input_tokens_seen": 63259752, "step": 93835 }, { "epoch": 2.2925268121075906, "grad_norm": 0.1738848239183426, "learning_rate": 1.3139957485310251e-06, "loss": 0.0002, "num_input_tokens_seen": 63263528, "step": 93840 }, { "epoch": 2.2926489629394378, "grad_norm": 0.01025724783539772, "learning_rate": 1.31391478275664e-06, "loss": 0.0001, "num_input_tokens_seen": 63267112, "step": 93845 }, { "epoch": 2.292771113771285, "grad_norm": 19.27631378173828, "learning_rate": 1.3138338146993814e-06, "loss": 0.054, "num_input_tokens_seen": 63270568, "step": 93850 }, { "epoch": 2.2928932646031317, "grad_norm": 0.3640555739402771, "learning_rate": 1.3137528443598398e-06, "loss": 0.0007, "num_input_tokens_seen": 63274088, "step": 93855 }, { "epoch": 2.2930154154349793, "grad_norm": 0.019612310454249382, "learning_rate": 1.3136718717386025e-06, "loss": 0.1433, "num_input_tokens_seen": 63277288, "step": 93860 }, { "epoch": 2.293137566266826, "grad_norm": 0.043557919561862946, "learning_rate": 1.3135908968362596e-06, "loss": 0.0002, "num_input_tokens_seen": 63280360, "step": 93865 }, { "epoch": 2.2932597170986733, "grad_norm": 0.08848714083433151, "learning_rate": 1.313509919653399e-06, "loss": 0.0003, "num_input_tokens_seen": 63283560, "step": 93870 }, { "epoch": 2.2933818679305205, "grad_norm": 338.09820556640625, "learning_rate": 1.3134289401906099e-06, "loss": 0.0597, "num_input_tokens_seen": 63287528, "step": 93875 }, { "epoch": 2.2935040187623676, "grad_norm": 0.005891845561563969, "learning_rate": 1.3133479584484812e-06, "loss": 0.0002, "num_input_tokens_seen": 63291112, "step": 93880 }, { "epoch": 2.293626169594215, "grad_norm": 0.007043534889817238, "learning_rate": 1.3132669744276022e-06, "loss": 0.0593, "num_input_tokens_seen": 63294568, "step": 93885 }, { "epoch": 2.293748320426062, "grad_norm": 0.030527032911777496, "learning_rate": 1.3131859881285612e-06, "loss": 0.0417, "num_input_tokens_seen": 63298664, "step": 93890 }, { "epoch": 2.293870471257909, "grad_norm": 0.02705656923353672, "learning_rate": 1.3131049995519474e-06, "loss": 0.0002, "num_input_tokens_seen": 63301800, "step": 93895 }, { "epoch": 2.2939926220897564, "grad_norm": 0.013978756964206696, "learning_rate": 1.3130240086983499e-06, "loss": 0.0001, "num_input_tokens_seen": 63305448, "step": 93900 }, { "epoch": 2.2941147729216036, "grad_norm": 0.18781529366970062, "learning_rate": 1.3129430155683579e-06, "loss": 0.0466, "num_input_tokens_seen": 63308968, "step": 93905 }, { "epoch": 2.294236923753451, "grad_norm": 14.817819595336914, "learning_rate": 1.3128620201625596e-06, "loss": 0.1974, "num_input_tokens_seen": 63312808, "step": 93910 }, { "epoch": 2.294359074585298, "grad_norm": 0.0723084881901741, "learning_rate": 1.3127810224815447e-06, "loss": 0.0494, "num_input_tokens_seen": 63315880, "step": 93915 }, { "epoch": 2.294481225417145, "grad_norm": 0.10592442750930786, "learning_rate": 1.3127000225259025e-06, "loss": 0.0008, "num_input_tokens_seen": 63318952, "step": 93920 }, { "epoch": 2.2946033762489924, "grad_norm": 0.20150475203990936, "learning_rate": 1.3126190202962213e-06, "loss": 0.0001, "num_input_tokens_seen": 63322024, "step": 93925 }, { "epoch": 2.2947255270808395, "grad_norm": 0.14693403244018555, "learning_rate": 1.3125380157930908e-06, "loss": 0.0004, "num_input_tokens_seen": 63325736, "step": 93930 }, { "epoch": 2.2948476779126867, "grad_norm": 0.008689050562679768, "learning_rate": 1.3124570090170994e-06, "loss": 0.1105, "num_input_tokens_seen": 63329448, "step": 93935 }, { "epoch": 2.294969828744534, "grad_norm": 18.56661033630371, "learning_rate": 1.3123759999688367e-06, "loss": 0.1233, "num_input_tokens_seen": 63333672, "step": 93940 }, { "epoch": 2.295091979576381, "grad_norm": 18.206514358520508, "learning_rate": 1.3122949886488913e-06, "loss": 0.0527, "num_input_tokens_seen": 63337320, "step": 93945 }, { "epoch": 2.295214130408228, "grad_norm": 0.25258105993270874, "learning_rate": 1.3122139750578533e-06, "loss": 0.0094, "num_input_tokens_seen": 63340328, "step": 93950 }, { "epoch": 2.2953362812400755, "grad_norm": 0.03619527071714401, "learning_rate": 1.3121329591963112e-06, "loss": 0.0312, "num_input_tokens_seen": 63344104, "step": 93955 }, { "epoch": 2.2954584320719222, "grad_norm": 0.03644130378961563, "learning_rate": 1.3120519410648543e-06, "loss": 0.0003, "num_input_tokens_seen": 63347496, "step": 93960 }, { "epoch": 2.2955805829037694, "grad_norm": 0.023112885653972626, "learning_rate": 1.3119709206640716e-06, "loss": 0.0006, "num_input_tokens_seen": 63350376, "step": 93965 }, { "epoch": 2.2957027337356166, "grad_norm": 0.00579156493768096, "learning_rate": 1.3118898979945528e-06, "loss": 0.1245, "num_input_tokens_seen": 63354280, "step": 93970 }, { "epoch": 2.295824884567464, "grad_norm": 0.004455236252397299, "learning_rate": 1.3118088730568863e-06, "loss": 0.0408, "num_input_tokens_seen": 63357864, "step": 93975 }, { "epoch": 2.295947035399311, "grad_norm": 0.006268753204494715, "learning_rate": 1.3117278458516622e-06, "loss": 0.0003, "num_input_tokens_seen": 63361320, "step": 93980 }, { "epoch": 2.296069186231158, "grad_norm": 0.007929849438369274, "learning_rate": 1.3116468163794691e-06, "loss": 0.0355, "num_input_tokens_seen": 63364392, "step": 93985 }, { "epoch": 2.2961913370630054, "grad_norm": 0.26138797402381897, "learning_rate": 1.3115657846408965e-06, "loss": 0.0518, "num_input_tokens_seen": 63367400, "step": 93990 }, { "epoch": 2.2963134878948526, "grad_norm": 0.0377698689699173, "learning_rate": 1.3114847506365338e-06, "loss": 0.0567, "num_input_tokens_seen": 63371048, "step": 93995 }, { "epoch": 2.2964356387266998, "grad_norm": 7.9693803787231445, "learning_rate": 1.3114037143669702e-06, "loss": 0.0216, "num_input_tokens_seen": 63374056, "step": 94000 }, { "epoch": 2.296557789558547, "grad_norm": 25.192068099975586, "learning_rate": 1.3113226758327952e-06, "loss": 0.0643, "num_input_tokens_seen": 63377512, "step": 94005 }, { "epoch": 2.296679940390394, "grad_norm": 0.246130108833313, "learning_rate": 1.3112416350345977e-06, "loss": 0.0006, "num_input_tokens_seen": 63380712, "step": 94010 }, { "epoch": 2.2968020912222413, "grad_norm": 0.03563880920410156, "learning_rate": 1.3111605919729676e-06, "loss": 0.038, "num_input_tokens_seen": 63384040, "step": 94015 }, { "epoch": 2.2969242420540885, "grad_norm": 15.70835018157959, "learning_rate": 1.3110795466484939e-06, "loss": 0.0514, "num_input_tokens_seen": 63387432, "step": 94020 }, { "epoch": 2.2970463928859357, "grad_norm": 0.05262516438961029, "learning_rate": 1.3109984990617658e-06, "loss": 0.0426, "num_input_tokens_seen": 63391656, "step": 94025 }, { "epoch": 2.297168543717783, "grad_norm": 0.1655740588903427, "learning_rate": 1.3109174492133732e-06, "loss": 0.0015, "num_input_tokens_seen": 63394792, "step": 94030 }, { "epoch": 2.2972906945496296, "grad_norm": 13.976034164428711, "learning_rate": 1.3108363971039053e-06, "loss": 0.0007, "num_input_tokens_seen": 63397992, "step": 94035 }, { "epoch": 2.2974128453814773, "grad_norm": 0.5051965117454529, "learning_rate": 1.3107553427339515e-06, "loss": 0.0002, "num_input_tokens_seen": 63401512, "step": 94040 }, { "epoch": 2.297534996213324, "grad_norm": 17.933237075805664, "learning_rate": 1.310674286104101e-06, "loss": 0.0331, "num_input_tokens_seen": 63404776, "step": 94045 }, { "epoch": 2.297657147045171, "grad_norm": 0.023094022646546364, "learning_rate": 1.310593227214944e-06, "loss": 0.0002, "num_input_tokens_seen": 63408936, "step": 94050 }, { "epoch": 2.2977792978770184, "grad_norm": 0.041221149265766144, "learning_rate": 1.3105121660670692e-06, "loss": 0.0001, "num_input_tokens_seen": 63412712, "step": 94055 }, { "epoch": 2.2979014487088656, "grad_norm": 0.01685006357729435, "learning_rate": 1.3104311026610666e-06, "loss": 0.0001, "num_input_tokens_seen": 63416296, "step": 94060 }, { "epoch": 2.298023599540713, "grad_norm": 8.67899227142334, "learning_rate": 1.310350036997525e-06, "loss": 0.0934, "num_input_tokens_seen": 63419112, "step": 94065 }, { "epoch": 2.29814575037256, "grad_norm": 0.00395004590973258, "learning_rate": 1.310268969077035e-06, "loss": 0.0353, "num_input_tokens_seen": 63422888, "step": 94070 }, { "epoch": 2.298267901204407, "grad_norm": 32.90062713623047, "learning_rate": 1.3101878989001856e-06, "loss": 0.1447, "num_input_tokens_seen": 63426344, "step": 94075 }, { "epoch": 2.2983900520362543, "grad_norm": 0.5018079876899719, "learning_rate": 1.3101068264675662e-06, "loss": 0.001, "num_input_tokens_seen": 63429992, "step": 94080 }, { "epoch": 2.2985122028681015, "grad_norm": 0.061042070388793945, "learning_rate": 1.3100257517797668e-06, "loss": 0.0001, "num_input_tokens_seen": 63433512, "step": 94085 }, { "epoch": 2.2986343536999487, "grad_norm": 88.58975982666016, "learning_rate": 1.3099446748373764e-06, "loss": 0.1512, "num_input_tokens_seen": 63436520, "step": 94090 }, { "epoch": 2.298756504531796, "grad_norm": 0.05687016621232033, "learning_rate": 1.3098635956409851e-06, "loss": 0.0002, "num_input_tokens_seen": 63439720, "step": 94095 }, { "epoch": 2.298878655363643, "grad_norm": 0.039876531809568405, "learning_rate": 1.3097825141911821e-06, "loss": 0.0005, "num_input_tokens_seen": 63443176, "step": 94100 }, { "epoch": 2.2990008061954903, "grad_norm": 0.13145720958709717, "learning_rate": 1.3097014304885578e-06, "loss": 0.1533, "num_input_tokens_seen": 63446376, "step": 94105 }, { "epoch": 2.2991229570273375, "grad_norm": 10.550003051757812, "learning_rate": 1.3096203445337013e-06, "loss": 0.113, "num_input_tokens_seen": 63449832, "step": 94110 }, { "epoch": 2.2992451078591847, "grad_norm": 0.14364692568778992, "learning_rate": 1.309539256327202e-06, "loss": 0.1354, "num_input_tokens_seen": 63452776, "step": 94115 }, { "epoch": 2.299367258691032, "grad_norm": 0.009568443521857262, "learning_rate": 1.3094581658696505e-06, "loss": 0.0712, "num_input_tokens_seen": 63456104, "step": 94120 }, { "epoch": 2.299489409522879, "grad_norm": 0.9077567458152771, "learning_rate": 1.3093770731616358e-06, "loss": 0.0532, "num_input_tokens_seen": 63458856, "step": 94125 }, { "epoch": 2.299611560354726, "grad_norm": 0.018772780895233154, "learning_rate": 1.3092959782037478e-06, "loss": 0.0367, "num_input_tokens_seen": 63461800, "step": 94130 }, { "epoch": 2.299733711186573, "grad_norm": 27.450637817382812, "learning_rate": 1.3092148809965763e-06, "loss": 0.0827, "num_input_tokens_seen": 63465640, "step": 94135 }, { "epoch": 2.29985586201842, "grad_norm": 0.020906388759613037, "learning_rate": 1.3091337815407108e-06, "loss": 0.0007, "num_input_tokens_seen": 63468584, "step": 94140 }, { "epoch": 2.2999780128502674, "grad_norm": 15.979035377502441, "learning_rate": 1.3090526798367414e-06, "loss": 0.033, "num_input_tokens_seen": 63472488, "step": 94145 }, { "epoch": 2.3001001636821146, "grad_norm": 0.022671345621347427, "learning_rate": 1.3089715758852578e-06, "loss": 0.053, "num_input_tokens_seen": 63475752, "step": 94150 }, { "epoch": 2.3002223145139618, "grad_norm": 0.09151072800159454, "learning_rate": 1.3088904696868498e-06, "loss": 0.001, "num_input_tokens_seen": 63479208, "step": 94155 }, { "epoch": 2.300344465345809, "grad_norm": 0.7091579437255859, "learning_rate": 1.308809361242107e-06, "loss": 0.0005, "num_input_tokens_seen": 63482856, "step": 94160 }, { "epoch": 2.300466616177656, "grad_norm": 0.010377227328717709, "learning_rate": 1.3087282505516197e-06, "loss": 0.0007, "num_input_tokens_seen": 63486248, "step": 94165 }, { "epoch": 2.3005887670095033, "grad_norm": 0.10020996630191803, "learning_rate": 1.3086471376159777e-06, "loss": 0.0003, "num_input_tokens_seen": 63489512, "step": 94170 }, { "epoch": 2.3007109178413505, "grad_norm": 0.018191087990999222, "learning_rate": 1.3085660224357703e-06, "loss": 0.0005, "num_input_tokens_seen": 63492456, "step": 94175 }, { "epoch": 2.3008330686731977, "grad_norm": 0.7195940613746643, "learning_rate": 1.3084849050115883e-06, "loss": 0.0003, "num_input_tokens_seen": 63495976, "step": 94180 }, { "epoch": 2.300955219505045, "grad_norm": 0.06771797686815262, "learning_rate": 1.3084037853440206e-06, "loss": 0.0457, "num_input_tokens_seen": 63499176, "step": 94185 }, { "epoch": 2.301077370336892, "grad_norm": 0.08082715421915054, "learning_rate": 1.308322663433658e-06, "loss": 0.0295, "num_input_tokens_seen": 63502440, "step": 94190 }, { "epoch": 2.3011995211687393, "grad_norm": 32.88578796386719, "learning_rate": 1.3082415392810896e-06, "loss": 0.0837, "num_input_tokens_seen": 63505640, "step": 94195 }, { "epoch": 2.3013216720005865, "grad_norm": 0.005474249366670847, "learning_rate": 1.3081604128869064e-06, "loss": 0.0005, "num_input_tokens_seen": 63509224, "step": 94200 }, { "epoch": 2.3014438228324336, "grad_norm": 0.17572320997714996, "learning_rate": 1.3080792842516974e-06, "loss": 0.038, "num_input_tokens_seen": 63512488, "step": 94205 }, { "epoch": 2.301565973664281, "grad_norm": 0.04181716963648796, "learning_rate": 1.3079981533760532e-06, "loss": 0.0001, "num_input_tokens_seen": 63516008, "step": 94210 }, { "epoch": 2.3016881244961276, "grad_norm": 0.004344448447227478, "learning_rate": 1.3079170202605633e-06, "loss": 0.0405, "num_input_tokens_seen": 63519208, "step": 94215 }, { "epoch": 2.301810275327975, "grad_norm": 0.004446935374289751, "learning_rate": 1.3078358849058182e-06, "loss": 0.0309, "num_input_tokens_seen": 63522216, "step": 94220 }, { "epoch": 2.301932426159822, "grad_norm": 0.041591398417949677, "learning_rate": 1.3077547473124076e-06, "loss": 0.0003, "num_input_tokens_seen": 63525928, "step": 94225 }, { "epoch": 2.302054576991669, "grad_norm": 0.023177804425358772, "learning_rate": 1.3076736074809219e-06, "loss": 0.0373, "num_input_tokens_seen": 63529064, "step": 94230 }, { "epoch": 2.3021767278235163, "grad_norm": 0.021942226216197014, "learning_rate": 1.3075924654119507e-06, "loss": 0.0007, "num_input_tokens_seen": 63532328, "step": 94235 }, { "epoch": 2.3022988786553635, "grad_norm": 0.04694988951086998, "learning_rate": 1.307511321106085e-06, "loss": 0.0002, "num_input_tokens_seen": 63535976, "step": 94240 }, { "epoch": 2.3024210294872107, "grad_norm": 0.058997299522161484, "learning_rate": 1.3074301745639138e-06, "loss": 0.0002, "num_input_tokens_seen": 63539368, "step": 94245 }, { "epoch": 2.302543180319058, "grad_norm": 0.013562050648033619, "learning_rate": 1.3073490257860278e-06, "loss": 0.001, "num_input_tokens_seen": 63542632, "step": 94250 }, { "epoch": 2.302665331150905, "grad_norm": 0.3930176794528961, "learning_rate": 1.3072678747730166e-06, "loss": 0.0003, "num_input_tokens_seen": 63546408, "step": 94255 }, { "epoch": 2.3027874819827523, "grad_norm": 0.0023700748570263386, "learning_rate": 1.307186721525471e-06, "loss": 0.0001, "num_input_tokens_seen": 63549800, "step": 94260 }, { "epoch": 2.3029096328145995, "grad_norm": 0.025645380839705467, "learning_rate": 1.3071055660439811e-06, "loss": 0.0697, "num_input_tokens_seen": 63553768, "step": 94265 }, { "epoch": 2.3030317836464467, "grad_norm": 0.21440570056438446, "learning_rate": 1.3070244083291368e-06, "loss": 0.0515, "num_input_tokens_seen": 63556840, "step": 94270 }, { "epoch": 2.303153934478294, "grad_norm": 0.11980664730072021, "learning_rate": 1.3069432483815285e-06, "loss": 0.14, "num_input_tokens_seen": 63560616, "step": 94275 }, { "epoch": 2.303276085310141, "grad_norm": 2.025825023651123, "learning_rate": 1.3068620862017466e-06, "loss": 0.0005, "num_input_tokens_seen": 63564200, "step": 94280 }, { "epoch": 2.3033982361419882, "grad_norm": 0.009357582777738571, "learning_rate": 1.3067809217903807e-06, "loss": 0.0003, "num_input_tokens_seen": 63567400, "step": 94285 }, { "epoch": 2.3035203869738354, "grad_norm": 69.97030639648438, "learning_rate": 1.3066997551480215e-06, "loss": 0.0739, "num_input_tokens_seen": 63571624, "step": 94290 }, { "epoch": 2.3036425378056826, "grad_norm": 0.189670592546463, "learning_rate": 1.3066185862752592e-06, "loss": 0.0889, "num_input_tokens_seen": 63575272, "step": 94295 }, { "epoch": 2.30376468863753, "grad_norm": 0.019184645265340805, "learning_rate": 1.3065374151726842e-06, "loss": 0.0001, "num_input_tokens_seen": 63578664, "step": 94300 }, { "epoch": 2.303886839469377, "grad_norm": 35.09974670410156, "learning_rate": 1.3064562418408863e-06, "loss": 0.1545, "num_input_tokens_seen": 63581992, "step": 94305 }, { "epoch": 2.3040089903012237, "grad_norm": 0.017247380688786507, "learning_rate": 1.3063750662804567e-06, "loss": 0.0004, "num_input_tokens_seen": 63585512, "step": 94310 }, { "epoch": 2.304131141133071, "grad_norm": 0.01316153910011053, "learning_rate": 1.3062938884919844e-06, "loss": 0.0003, "num_input_tokens_seen": 63588712, "step": 94315 }, { "epoch": 2.304253291964918, "grad_norm": 0.0003070076054427773, "learning_rate": 1.3062127084760613e-06, "loss": 0.0758, "num_input_tokens_seen": 63592232, "step": 94320 }, { "epoch": 2.3043754427967653, "grad_norm": 0.03631015121936798, "learning_rate": 1.3061315262332768e-06, "loss": 0.0004, "num_input_tokens_seen": 63595944, "step": 94325 }, { "epoch": 2.3044975936286125, "grad_norm": 0.006778942886739969, "learning_rate": 1.3060503417642218e-06, "loss": 0.0002, "num_input_tokens_seen": 63599400, "step": 94330 }, { "epoch": 2.3046197444604597, "grad_norm": 0.02677575685083866, "learning_rate": 1.3059691550694858e-06, "loss": 0.0429, "num_input_tokens_seen": 63603112, "step": 94335 }, { "epoch": 2.304741895292307, "grad_norm": 0.07894811779260635, "learning_rate": 1.3058879661496602e-06, "loss": 0.0378, "num_input_tokens_seen": 63606888, "step": 94340 }, { "epoch": 2.304864046124154, "grad_norm": 0.45543399453163147, "learning_rate": 1.305806775005335e-06, "loss": 0.0002, "num_input_tokens_seen": 63609832, "step": 94345 }, { "epoch": 2.3049861969560013, "grad_norm": 0.05380082130432129, "learning_rate": 1.3057255816371007e-06, "loss": 0.0001, "num_input_tokens_seen": 63613096, "step": 94350 }, { "epoch": 2.3051083477878485, "grad_norm": 0.11246181279420853, "learning_rate": 1.3056443860455476e-06, "loss": 0.0699, "num_input_tokens_seen": 63616680, "step": 94355 }, { "epoch": 2.3052304986196956, "grad_norm": 0.012971539981663227, "learning_rate": 1.3055631882312664e-06, "loss": 0.0706, "num_input_tokens_seen": 63619752, "step": 94360 }, { "epoch": 2.305352649451543, "grad_norm": 0.010960533283650875, "learning_rate": 1.3054819881948476e-06, "loss": 0.0004, "num_input_tokens_seen": 63622952, "step": 94365 }, { "epoch": 2.30547480028339, "grad_norm": 122.7162857055664, "learning_rate": 1.3054007859368813e-06, "loss": 0.0342, "num_input_tokens_seen": 63626792, "step": 94370 }, { "epoch": 2.305596951115237, "grad_norm": 0.2455373853445053, "learning_rate": 1.3053195814579587e-06, "loss": 0.0002, "num_input_tokens_seen": 63629992, "step": 94375 }, { "epoch": 2.3057191019470844, "grad_norm": 22.135835647583008, "learning_rate": 1.3052383747586697e-06, "loss": 0.0437, "num_input_tokens_seen": 63633128, "step": 94380 }, { "epoch": 2.3058412527789316, "grad_norm": 0.011918697506189346, "learning_rate": 1.3051571658396053e-06, "loss": 0.0002, "num_input_tokens_seen": 63636392, "step": 94385 }, { "epoch": 2.305963403610779, "grad_norm": 0.01846807263791561, "learning_rate": 1.3050759547013558e-06, "loss": 0.0, "num_input_tokens_seen": 63639720, "step": 94390 }, { "epoch": 2.3060855544426255, "grad_norm": 0.04838526248931885, "learning_rate": 1.3049947413445123e-06, "loss": 0.0002, "num_input_tokens_seen": 63642728, "step": 94395 }, { "epoch": 2.306207705274473, "grad_norm": 0.020267801359295845, "learning_rate": 1.3049135257696646e-06, "loss": 0.0001, "num_input_tokens_seen": 63645800, "step": 94400 }, { "epoch": 2.30632985610632, "grad_norm": 0.009767679497599602, "learning_rate": 1.304832307977404e-06, "loss": 0.0002, "num_input_tokens_seen": 63649000, "step": 94405 }, { "epoch": 2.306452006938167, "grad_norm": 0.09315386414527893, "learning_rate": 1.3047510879683206e-06, "loss": 0.0448, "num_input_tokens_seen": 63652200, "step": 94410 }, { "epoch": 2.3065741577700143, "grad_norm": 0.010149150155484676, "learning_rate": 1.3046698657430053e-06, "loss": 0.0001, "num_input_tokens_seen": 63655976, "step": 94415 }, { "epoch": 2.3066963086018615, "grad_norm": 45.44666290283203, "learning_rate": 1.3045886413020491e-06, "loss": 0.1456, "num_input_tokens_seen": 63659048, "step": 94420 }, { "epoch": 2.3068184594337087, "grad_norm": 0.021444687619805336, "learning_rate": 1.304507414646042e-06, "loss": 0.0004, "num_input_tokens_seen": 63662248, "step": 94425 }, { "epoch": 2.306940610265556, "grad_norm": 0.0023106755688786507, "learning_rate": 1.3044261857755753e-06, "loss": 0.0001, "num_input_tokens_seen": 63665320, "step": 94430 }, { "epoch": 2.307062761097403, "grad_norm": 0.004815931431949139, "learning_rate": 1.3043449546912394e-06, "loss": 0.0, "num_input_tokens_seen": 63668968, "step": 94435 }, { "epoch": 2.3071849119292502, "grad_norm": 0.04161781445145607, "learning_rate": 1.3042637213936255e-06, "loss": 0.1804, "num_input_tokens_seen": 63672296, "step": 94440 }, { "epoch": 2.3073070627610974, "grad_norm": 1.1624021530151367, "learning_rate": 1.3041824858833235e-06, "loss": 0.0012, "num_input_tokens_seen": 63675304, "step": 94445 }, { "epoch": 2.3074292135929446, "grad_norm": 0.016916943714022636, "learning_rate": 1.3041012481609248e-06, "loss": 0.0335, "num_input_tokens_seen": 63678568, "step": 94450 }, { "epoch": 2.307551364424792, "grad_norm": 53.67993927001953, "learning_rate": 1.3040200082270202e-06, "loss": 0.1145, "num_input_tokens_seen": 63682088, "step": 94455 }, { "epoch": 2.307673515256639, "grad_norm": 0.032480210065841675, "learning_rate": 1.3039387660822e-06, "loss": 0.0367, "num_input_tokens_seen": 63685672, "step": 94460 }, { "epoch": 2.307795666088486, "grad_norm": 1.5269211530685425, "learning_rate": 1.3038575217270555e-06, "loss": 0.0003, "num_input_tokens_seen": 63689128, "step": 94465 }, { "epoch": 2.3079178169203334, "grad_norm": 0.0025496436282992363, "learning_rate": 1.3037762751621773e-06, "loss": 0.0002, "num_input_tokens_seen": 63692520, "step": 94470 }, { "epoch": 2.3080399677521806, "grad_norm": 0.010646814480423927, "learning_rate": 1.3036950263881563e-06, "loss": 0.0533, "num_input_tokens_seen": 63695784, "step": 94475 }, { "epoch": 2.3081621185840273, "grad_norm": 0.0811719000339508, "learning_rate": 1.3036137754055835e-06, "loss": 0.0001, "num_input_tokens_seen": 63698984, "step": 94480 }, { "epoch": 2.308284269415875, "grad_norm": 0.004129193257540464, "learning_rate": 1.30353252221505e-06, "loss": 0.0002, "num_input_tokens_seen": 63702376, "step": 94485 }, { "epoch": 2.3084064202477217, "grad_norm": 0.18701672554016113, "learning_rate": 1.3034512668171457e-06, "loss": 0.0008, "num_input_tokens_seen": 63705704, "step": 94490 }, { "epoch": 2.308528571079569, "grad_norm": 195.37823486328125, "learning_rate": 1.3033700092124626e-06, "loss": 0.0522, "num_input_tokens_seen": 63708904, "step": 94495 }, { "epoch": 2.308650721911416, "grad_norm": 0.030486810952425003, "learning_rate": 1.3032887494015913e-06, "loss": 0.047, "num_input_tokens_seen": 63712040, "step": 94500 }, { "epoch": 2.3087728727432633, "grad_norm": 0.02747334912419319, "learning_rate": 1.3032074873851224e-06, "loss": 0.0001, "num_input_tokens_seen": 63715368, "step": 94505 }, { "epoch": 2.3088950235751105, "grad_norm": 0.035794202238321304, "learning_rate": 1.303126223163647e-06, "loss": 0.0001, "num_input_tokens_seen": 63719016, "step": 94510 }, { "epoch": 2.3090171744069576, "grad_norm": 19.372329711914062, "learning_rate": 1.3030449567377565e-06, "loss": 0.0573, "num_input_tokens_seen": 63722344, "step": 94515 }, { "epoch": 2.309139325238805, "grad_norm": 0.3616223633289337, "learning_rate": 1.3029636881080412e-06, "loss": 0.0007, "num_input_tokens_seen": 63725480, "step": 94520 }, { "epoch": 2.309261476070652, "grad_norm": 0.006535464432090521, "learning_rate": 1.3028824172750927e-06, "loss": 0.0003, "num_input_tokens_seen": 63728488, "step": 94525 }, { "epoch": 2.309383626902499, "grad_norm": 35.66829299926758, "learning_rate": 1.3028011442395017e-06, "loss": 0.0389, "num_input_tokens_seen": 63732136, "step": 94530 }, { "epoch": 2.3095057777343464, "grad_norm": 0.06885640323162079, "learning_rate": 1.3027198690018592e-06, "loss": 0.0534, "num_input_tokens_seen": 63735528, "step": 94535 }, { "epoch": 2.3096279285661936, "grad_norm": 0.009454714134335518, "learning_rate": 1.3026385915627566e-06, "loss": 0.0, "num_input_tokens_seen": 63738984, "step": 94540 }, { "epoch": 2.309750079398041, "grad_norm": 7.843992710113525, "learning_rate": 1.3025573119227847e-06, "loss": 0.0932, "num_input_tokens_seen": 63742248, "step": 94545 }, { "epoch": 2.309872230229888, "grad_norm": 163.3396453857422, "learning_rate": 1.3024760300825348e-06, "loss": 0.0247, "num_input_tokens_seen": 63746024, "step": 94550 }, { "epoch": 2.309994381061735, "grad_norm": 5.96230411529541, "learning_rate": 1.302394746042598e-06, "loss": 0.0825, "num_input_tokens_seen": 63749096, "step": 94555 }, { "epoch": 2.3101165318935823, "grad_norm": 0.0034930650144815445, "learning_rate": 1.3023134598035647e-06, "loss": 0.0284, "num_input_tokens_seen": 63752616, "step": 94560 }, { "epoch": 2.3102386827254295, "grad_norm": 0.0070127821527421474, "learning_rate": 1.3022321713660268e-06, "loss": 0.0539, "num_input_tokens_seen": 63756136, "step": 94565 }, { "epoch": 2.3103608335572767, "grad_norm": 0.0699462965130806, "learning_rate": 1.3021508807305754e-06, "loss": 0.0002, "num_input_tokens_seen": 63759720, "step": 94570 }, { "epoch": 2.3104829843891235, "grad_norm": 0.01093141920864582, "learning_rate": 1.3020695878978015e-06, "loss": 0.0007, "num_input_tokens_seen": 63763496, "step": 94575 }, { "epoch": 2.3106051352209707, "grad_norm": 16.72857093811035, "learning_rate": 1.3019882928682963e-06, "loss": 0.0664, "num_input_tokens_seen": 63766760, "step": 94580 }, { "epoch": 2.310727286052818, "grad_norm": 0.03342173248529434, "learning_rate": 1.3019069956426511e-06, "loss": 0.0004, "num_input_tokens_seen": 63770024, "step": 94585 }, { "epoch": 2.310849436884665, "grad_norm": 0.12329500913619995, "learning_rate": 1.301825696221457e-06, "loss": 0.0002, "num_input_tokens_seen": 63773608, "step": 94590 }, { "epoch": 2.3109715877165122, "grad_norm": 0.01329412218183279, "learning_rate": 1.301744394605305e-06, "loss": 0.0001, "num_input_tokens_seen": 63776936, "step": 94595 }, { "epoch": 2.3110937385483594, "grad_norm": 0.04466555267572403, "learning_rate": 1.3016630907947868e-06, "loss": 0.0002, "num_input_tokens_seen": 63780072, "step": 94600 }, { "epoch": 2.3112158893802066, "grad_norm": 101.69499969482422, "learning_rate": 1.3015817847904934e-06, "loss": 0.2443, "num_input_tokens_seen": 63783208, "step": 94605 }, { "epoch": 2.311338040212054, "grad_norm": 0.30019229650497437, "learning_rate": 1.3015004765930164e-06, "loss": 0.1325, "num_input_tokens_seen": 63786536, "step": 94610 }, { "epoch": 2.311460191043901, "grad_norm": 0.07922214269638062, "learning_rate": 1.3014191662029466e-06, "loss": 0.0448, "num_input_tokens_seen": 63790184, "step": 94615 }, { "epoch": 2.311582341875748, "grad_norm": 0.13556693494319916, "learning_rate": 1.3013378536208757e-06, "loss": 0.0543, "num_input_tokens_seen": 63793512, "step": 94620 }, { "epoch": 2.3117044927075954, "grad_norm": 0.1541840136051178, "learning_rate": 1.3012565388473947e-06, "loss": 0.0007, "num_input_tokens_seen": 63796712, "step": 94625 }, { "epoch": 2.3118266435394426, "grad_norm": 0.04244079068303108, "learning_rate": 1.301175221883095e-06, "loss": 0.0001, "num_input_tokens_seen": 63799592, "step": 94630 }, { "epoch": 2.3119487943712898, "grad_norm": 0.11242552846670151, "learning_rate": 1.3010939027285684e-06, "loss": 0.0326, "num_input_tokens_seen": 63803304, "step": 94635 }, { "epoch": 2.312070945203137, "grad_norm": 0.00623678881675005, "learning_rate": 1.3010125813844057e-06, "loss": 0.0001, "num_input_tokens_seen": 63806568, "step": 94640 }, { "epoch": 2.312193096034984, "grad_norm": 0.01181286945939064, "learning_rate": 1.3009312578511988e-06, "loss": 0.0019, "num_input_tokens_seen": 63810024, "step": 94645 }, { "epoch": 2.3123152468668313, "grad_norm": 589.23291015625, "learning_rate": 1.3008499321295388e-06, "loss": 0.2024, "num_input_tokens_seen": 63813736, "step": 94650 }, { "epoch": 2.3124373976986785, "grad_norm": 0.0026681246235966682, "learning_rate": 1.300768604220017e-06, "loss": 0.0003, "num_input_tokens_seen": 63817320, "step": 94655 }, { "epoch": 2.3125595485305253, "grad_norm": 0.02497980371117592, "learning_rate": 1.3006872741232252e-06, "loss": 0.0001, "num_input_tokens_seen": 63821096, "step": 94660 }, { "epoch": 2.312681699362373, "grad_norm": 0.028597678989171982, "learning_rate": 1.3006059418397545e-06, "loss": 0.0955, "num_input_tokens_seen": 63825000, "step": 94665 }, { "epoch": 2.3128038501942196, "grad_norm": 42.822933197021484, "learning_rate": 1.3005246073701965e-06, "loss": 0.0467, "num_input_tokens_seen": 63828712, "step": 94670 }, { "epoch": 2.312926001026067, "grad_norm": 0.1000511422753334, "learning_rate": 1.3004432707151428e-06, "loss": 0.0992, "num_input_tokens_seen": 63832360, "step": 94675 }, { "epoch": 2.313048151857914, "grad_norm": 1.0598971843719482, "learning_rate": 1.300361931875185e-06, "loss": 0.0536, "num_input_tokens_seen": 63835944, "step": 94680 }, { "epoch": 2.313170302689761, "grad_norm": 0.026618486270308495, "learning_rate": 1.3002805908509144e-06, "loss": 0.0001, "num_input_tokens_seen": 63839592, "step": 94685 }, { "epoch": 2.3132924535216084, "grad_norm": 0.10733477026224136, "learning_rate": 1.3001992476429221e-06, "loss": 0.0001, "num_input_tokens_seen": 63842792, "step": 94690 }, { "epoch": 2.3134146043534556, "grad_norm": 0.0665910467505455, "learning_rate": 1.3001179022518006e-06, "loss": 0.0005, "num_input_tokens_seen": 63845736, "step": 94695 }, { "epoch": 2.3135367551853028, "grad_norm": 0.20579425990581512, "learning_rate": 1.300036554678141e-06, "loss": 0.0839, "num_input_tokens_seen": 63849128, "step": 94700 }, { "epoch": 2.31365890601715, "grad_norm": 0.03927788883447647, "learning_rate": 1.299955204922535e-06, "loss": 0.0553, "num_input_tokens_seen": 63852200, "step": 94705 }, { "epoch": 2.313781056848997, "grad_norm": 0.08418112993240356, "learning_rate": 1.299873852985574e-06, "loss": 0.0305, "num_input_tokens_seen": 63855784, "step": 94710 }, { "epoch": 2.3139032076808443, "grad_norm": 0.014477049000561237, "learning_rate": 1.2997924988678496e-06, "loss": 0.0003, "num_input_tokens_seen": 63858920, "step": 94715 }, { "epoch": 2.3140253585126915, "grad_norm": 0.3408757150173187, "learning_rate": 1.2997111425699532e-06, "loss": 0.0399, "num_input_tokens_seen": 63862056, "step": 94720 }, { "epoch": 2.3141475093445387, "grad_norm": 0.4299645721912384, "learning_rate": 1.299629784092477e-06, "loss": 0.0012, "num_input_tokens_seen": 63865576, "step": 94725 }, { "epoch": 2.314269660176386, "grad_norm": 7.6998701095581055, "learning_rate": 1.2995484234360123e-06, "loss": 0.0367, "num_input_tokens_seen": 63868968, "step": 94730 }, { "epoch": 2.314391811008233, "grad_norm": 45.46992111206055, "learning_rate": 1.299467060601151e-06, "loss": 0.047, "num_input_tokens_seen": 63872616, "step": 94735 }, { "epoch": 2.3145139618400803, "grad_norm": 0.43784013390541077, "learning_rate": 1.299385695588485e-06, "loss": 0.0569, "num_input_tokens_seen": 63876200, "step": 94740 }, { "epoch": 2.3146361126719275, "grad_norm": 0.0034390855580568314, "learning_rate": 1.2993043283986055e-06, "loss": 0.0926, "num_input_tokens_seen": 63879784, "step": 94745 }, { "epoch": 2.3147582635037747, "grad_norm": 0.01996810920536518, "learning_rate": 1.2992229590321042e-06, "loss": 0.0635, "num_input_tokens_seen": 63882984, "step": 94750 }, { "epoch": 2.3148804143356214, "grad_norm": 277.53094482421875, "learning_rate": 1.299141587489573e-06, "loss": 0.0324, "num_input_tokens_seen": 63886312, "step": 94755 }, { "epoch": 2.3150025651674686, "grad_norm": 0.7463566064834595, "learning_rate": 1.299060213771604e-06, "loss": 0.0005, "num_input_tokens_seen": 63889256, "step": 94760 }, { "epoch": 2.315124715999316, "grad_norm": 0.27979183197021484, "learning_rate": 1.2989788378787886e-06, "loss": 0.0006, "num_input_tokens_seen": 63892264, "step": 94765 }, { "epoch": 2.315246866831163, "grad_norm": 0.1391063630580902, "learning_rate": 1.2988974598117188e-06, "loss": 0.0002, "num_input_tokens_seen": 63895528, "step": 94770 }, { "epoch": 2.31536901766301, "grad_norm": 0.08584363013505936, "learning_rate": 1.2988160795709861e-06, "loss": 0.0003, "num_input_tokens_seen": 63898984, "step": 94775 }, { "epoch": 2.3154911684948574, "grad_norm": 0.18024323880672455, "learning_rate": 1.2987346971571823e-06, "loss": 0.0001, "num_input_tokens_seen": 63902376, "step": 94780 }, { "epoch": 2.3156133193267046, "grad_norm": 0.012486801482737064, "learning_rate": 1.2986533125708998e-06, "loss": 0.0693, "num_input_tokens_seen": 63905704, "step": 94785 }, { "epoch": 2.3157354701585517, "grad_norm": 0.04421261325478554, "learning_rate": 1.2985719258127299e-06, "loss": 0.0716, "num_input_tokens_seen": 63908648, "step": 94790 }, { "epoch": 2.315857620990399, "grad_norm": 0.010590645484626293, "learning_rate": 1.298490536883265e-06, "loss": 0.0001, "num_input_tokens_seen": 63911912, "step": 94795 }, { "epoch": 2.315979771822246, "grad_norm": 14.425555229187012, "learning_rate": 1.2984091457830961e-06, "loss": 0.0525, "num_input_tokens_seen": 63916008, "step": 94800 }, { "epoch": 2.3161019226540933, "grad_norm": 0.01041900459676981, "learning_rate": 1.298327752512816e-06, "loss": 0.0595, "num_input_tokens_seen": 63919208, "step": 94805 }, { "epoch": 2.3162240734859405, "grad_norm": 0.006424942519515753, "learning_rate": 1.298246357073016e-06, "loss": 0.0679, "num_input_tokens_seen": 63922920, "step": 94810 }, { "epoch": 2.3163462243177877, "grad_norm": 72.95399475097656, "learning_rate": 1.2981649594642886e-06, "loss": 0.0776, "num_input_tokens_seen": 63926248, "step": 94815 }, { "epoch": 2.316468375149635, "grad_norm": 0.033855266869068146, "learning_rate": 1.298083559687225e-06, "loss": 0.0362, "num_input_tokens_seen": 63929384, "step": 94820 }, { "epoch": 2.316590525981482, "grad_norm": 0.008695218712091446, "learning_rate": 1.2980021577424178e-06, "loss": 0.0431, "num_input_tokens_seen": 63932520, "step": 94825 }, { "epoch": 2.3167126768133293, "grad_norm": 0.09830284863710403, "learning_rate": 1.2979207536304588e-06, "loss": 0.0002, "num_input_tokens_seen": 63935720, "step": 94830 }, { "epoch": 2.3168348276451765, "grad_norm": 17.72136688232422, "learning_rate": 1.29783934735194e-06, "loss": 0.1531, "num_input_tokens_seen": 63939368, "step": 94835 }, { "epoch": 2.316956978477023, "grad_norm": 0.014057500287890434, "learning_rate": 1.2977579389074533e-06, "loss": 0.0001, "num_input_tokens_seen": 63943016, "step": 94840 }, { "epoch": 2.317079129308871, "grad_norm": 81.38212585449219, "learning_rate": 1.2976765282975905e-06, "loss": 0.1494, "num_input_tokens_seen": 63946408, "step": 94845 }, { "epoch": 2.3172012801407176, "grad_norm": 0.003075582906603813, "learning_rate": 1.297595115522944e-06, "loss": 0.001, "num_input_tokens_seen": 63949608, "step": 94850 }, { "epoch": 2.3173234309725648, "grad_norm": 0.08976204693317413, "learning_rate": 1.297513700584106e-06, "loss": 0.0007, "num_input_tokens_seen": 63953128, "step": 94855 }, { "epoch": 2.317445581804412, "grad_norm": 0.11242163181304932, "learning_rate": 1.2974322834816684e-06, "loss": 0.001, "num_input_tokens_seen": 63956136, "step": 94860 }, { "epoch": 2.317567732636259, "grad_norm": 0.008920049294829369, "learning_rate": 1.2973508642162233e-06, "loss": 0.0416, "num_input_tokens_seen": 63960040, "step": 94865 }, { "epoch": 2.3176898834681063, "grad_norm": 0.14441661536693573, "learning_rate": 1.2972694427883625e-06, "loss": 0.0003, "num_input_tokens_seen": 63963304, "step": 94870 }, { "epoch": 2.3178120342999535, "grad_norm": 0.06034664064645767, "learning_rate": 1.2971880191986785e-06, "loss": 0.0003, "num_input_tokens_seen": 63966632, "step": 94875 }, { "epoch": 2.3179341851318007, "grad_norm": 0.16961410641670227, "learning_rate": 1.297106593447763e-06, "loss": 0.0303, "num_input_tokens_seen": 63970088, "step": 94880 }, { "epoch": 2.318056335963648, "grad_norm": 0.46061891317367554, "learning_rate": 1.2970251655362086e-06, "loss": 0.0005, "num_input_tokens_seen": 63973352, "step": 94885 }, { "epoch": 2.318178486795495, "grad_norm": 0.13485822081565857, "learning_rate": 1.2969437354646073e-06, "loss": 0.0743, "num_input_tokens_seen": 63976552, "step": 94890 }, { "epoch": 2.3183006376273423, "grad_norm": 0.03163136541843414, "learning_rate": 1.2968623032335515e-06, "loss": 0.0535, "num_input_tokens_seen": 63979752, "step": 94895 }, { "epoch": 2.3184227884591895, "grad_norm": 0.06518939882516861, "learning_rate": 1.296780868843633e-06, "loss": 0.0007, "num_input_tokens_seen": 63983720, "step": 94900 }, { "epoch": 2.3185449392910367, "grad_norm": 1.7363409996032715, "learning_rate": 1.2966994322954439e-06, "loss": 0.0007, "num_input_tokens_seen": 63987368, "step": 94905 }, { "epoch": 2.318667090122884, "grad_norm": 0.023380771279335022, "learning_rate": 1.2966179935895774e-06, "loss": 0.0444, "num_input_tokens_seen": 63990568, "step": 94910 }, { "epoch": 2.318789240954731, "grad_norm": 0.09954951703548431, "learning_rate": 1.2965365527266245e-06, "loss": 0.1065, "num_input_tokens_seen": 63993704, "step": 94915 }, { "epoch": 2.3189113917865782, "grad_norm": 0.03531714528799057, "learning_rate": 1.2964551097071784e-06, "loss": 0.0003, "num_input_tokens_seen": 63997160, "step": 94920 }, { "epoch": 2.3190335426184254, "grad_norm": 0.3534320294857025, "learning_rate": 1.2963736645318308e-06, "loss": 0.0519, "num_input_tokens_seen": 64000488, "step": 94925 }, { "epoch": 2.3191556934502726, "grad_norm": 0.08531579375267029, "learning_rate": 1.2962922172011744e-06, "loss": 0.0711, "num_input_tokens_seen": 64004008, "step": 94930 }, { "epoch": 2.3192778442821194, "grad_norm": 0.012248575687408447, "learning_rate": 1.296210767715801e-06, "loss": 0.0287, "num_input_tokens_seen": 64007528, "step": 94935 }, { "epoch": 2.3193999951139666, "grad_norm": 0.025879565626382828, "learning_rate": 1.296129316076303e-06, "loss": 0.049, "num_input_tokens_seen": 64011112, "step": 94940 }, { "epoch": 2.3195221459458137, "grad_norm": 0.04887163266539574, "learning_rate": 1.2960478622832736e-06, "loss": 0.0318, "num_input_tokens_seen": 64014440, "step": 94945 }, { "epoch": 2.319644296777661, "grad_norm": 0.005958162248134613, "learning_rate": 1.295966406337304e-06, "loss": 0.0549, "num_input_tokens_seen": 64018024, "step": 94950 }, { "epoch": 2.319766447609508, "grad_norm": 0.026194145902991295, "learning_rate": 1.2958849482389876e-06, "loss": 0.0363, "num_input_tokens_seen": 64021352, "step": 94955 }, { "epoch": 2.3198885984413553, "grad_norm": 0.0223369337618351, "learning_rate": 1.2958034879889158e-06, "loss": 0.0001, "num_input_tokens_seen": 64024552, "step": 94960 }, { "epoch": 2.3200107492732025, "grad_norm": 0.022756468504667282, "learning_rate": 1.295722025587682e-06, "loss": 0.0935, "num_input_tokens_seen": 64027944, "step": 94965 }, { "epoch": 2.3201329001050497, "grad_norm": 0.5314485430717468, "learning_rate": 1.2956405610358776e-06, "loss": 0.0005, "num_input_tokens_seen": 64031528, "step": 94970 }, { "epoch": 2.320255050936897, "grad_norm": 0.2667846977710724, "learning_rate": 1.2955590943340956e-06, "loss": 0.0001, "num_input_tokens_seen": 64034728, "step": 94975 }, { "epoch": 2.320377201768744, "grad_norm": 0.019529344514012337, "learning_rate": 1.2954776254829288e-06, "loss": 0.0003, "num_input_tokens_seen": 64037864, "step": 94980 }, { "epoch": 2.3204993526005913, "grad_norm": 0.005352795589715242, "learning_rate": 1.2953961544829688e-06, "loss": 0.141, "num_input_tokens_seen": 64041576, "step": 94985 }, { "epoch": 2.3206215034324384, "grad_norm": 0.08801307529211044, "learning_rate": 1.2953146813348085e-06, "loss": 0.0004, "num_input_tokens_seen": 64044584, "step": 94990 }, { "epoch": 2.3207436542642856, "grad_norm": 0.14798977971076965, "learning_rate": 1.2952332060390405e-06, "loss": 0.0626, "num_input_tokens_seen": 64047848, "step": 94995 }, { "epoch": 2.320865805096133, "grad_norm": 20.214397430419922, "learning_rate": 1.2951517285962571e-06, "loss": 0.0465, "num_input_tokens_seen": 64050856, "step": 95000 }, { "epoch": 2.32098795592798, "grad_norm": 0.031365349888801575, "learning_rate": 1.2950702490070514e-06, "loss": 0.0296, "num_input_tokens_seen": 64054312, "step": 95005 }, { "epoch": 2.321110106759827, "grad_norm": 0.16067151725292206, "learning_rate": 1.294988767272015e-06, "loss": 0.0689, "num_input_tokens_seen": 64057640, "step": 95010 }, { "epoch": 2.3212322575916744, "grad_norm": 0.003375637112185359, "learning_rate": 1.2949072833917414e-06, "loss": 0.0004, "num_input_tokens_seen": 64061160, "step": 95015 }, { "epoch": 2.321354408423521, "grad_norm": 0.04902317747473717, "learning_rate": 1.2948257973668224e-06, "loss": 0.0004, "num_input_tokens_seen": 64064296, "step": 95020 }, { "epoch": 2.3214765592553688, "grad_norm": 0.186828151345253, "learning_rate": 1.294744309197851e-06, "loss": 0.0002, "num_input_tokens_seen": 64067688, "step": 95025 }, { "epoch": 2.3215987100872155, "grad_norm": 0.013217689469456673, "learning_rate": 1.294662818885419e-06, "loss": 0.0288, "num_input_tokens_seen": 64070696, "step": 95030 }, { "epoch": 2.3217208609190627, "grad_norm": 0.012650329619646072, "learning_rate": 1.2945813264301207e-06, "loss": 0.0002, "num_input_tokens_seen": 64073704, "step": 95035 }, { "epoch": 2.32184301175091, "grad_norm": 0.08450979739427567, "learning_rate": 1.2944998318325474e-06, "loss": 0.0003, "num_input_tokens_seen": 64077160, "step": 95040 }, { "epoch": 2.321965162582757, "grad_norm": 0.042767226696014404, "learning_rate": 1.2944183350932922e-06, "loss": 0.0002, "num_input_tokens_seen": 64080552, "step": 95045 }, { "epoch": 2.3220873134146043, "grad_norm": 0.0025647578295320272, "learning_rate": 1.2943368362129477e-06, "loss": 0.0017, "num_input_tokens_seen": 64084648, "step": 95050 }, { "epoch": 2.3222094642464515, "grad_norm": 19.454233169555664, "learning_rate": 1.2942553351921063e-06, "loss": 0.047, "num_input_tokens_seen": 64088040, "step": 95055 }, { "epoch": 2.3223316150782987, "grad_norm": 0.5485376119613647, "learning_rate": 1.294173832031361e-06, "loss": 0.0577, "num_input_tokens_seen": 64091752, "step": 95060 }, { "epoch": 2.322453765910146, "grad_norm": 0.5301234126091003, "learning_rate": 1.2940923267313049e-06, "loss": 0.0907, "num_input_tokens_seen": 64094824, "step": 95065 }, { "epoch": 2.322575916741993, "grad_norm": 0.02889396995306015, "learning_rate": 1.2940108192925298e-06, "loss": 0.0004, "num_input_tokens_seen": 64098088, "step": 95070 }, { "epoch": 2.3226980675738402, "grad_norm": 0.06749209761619568, "learning_rate": 1.2939293097156295e-06, "loss": 0.0001, "num_input_tokens_seen": 64101800, "step": 95075 }, { "epoch": 2.3228202184056874, "grad_norm": 34.30602264404297, "learning_rate": 1.2938477980011958e-06, "loss": 0.0591, "num_input_tokens_seen": 64105320, "step": 95080 }, { "epoch": 2.3229423692375346, "grad_norm": 0.2598949670791626, "learning_rate": 1.2937662841498218e-06, "loss": 0.1721, "num_input_tokens_seen": 64108776, "step": 95085 }, { "epoch": 2.323064520069382, "grad_norm": 0.3084917366504669, "learning_rate": 1.2936847681621003e-06, "loss": 0.0525, "num_input_tokens_seen": 64112232, "step": 95090 }, { "epoch": 2.323186670901229, "grad_norm": 0.2027728110551834, "learning_rate": 1.2936032500386242e-06, "loss": 0.06, "num_input_tokens_seen": 64115368, "step": 95095 }, { "epoch": 2.323308821733076, "grad_norm": 1.021079659461975, "learning_rate": 1.2935217297799866e-06, "loss": 0.0291, "num_input_tokens_seen": 64119144, "step": 95100 }, { "epoch": 2.323430972564923, "grad_norm": 0.01855880580842495, "learning_rate": 1.2934402073867798e-06, "loss": 0.0002, "num_input_tokens_seen": 64123048, "step": 95105 }, { "epoch": 2.3235531233967706, "grad_norm": 0.014141903258860111, "learning_rate": 1.293358682859597e-06, "loss": 0.0822, "num_input_tokens_seen": 64126568, "step": 95110 }, { "epoch": 2.3236752742286173, "grad_norm": 249.31118774414062, "learning_rate": 1.293277156199031e-06, "loss": 0.0745, "num_input_tokens_seen": 64130216, "step": 95115 }, { "epoch": 2.3237974250604645, "grad_norm": 0.007649078033864498, "learning_rate": 1.2931956274056747e-06, "loss": 0.055, "num_input_tokens_seen": 64134120, "step": 95120 }, { "epoch": 2.3239195758923117, "grad_norm": 0.0565241314470768, "learning_rate": 1.2931140964801208e-06, "loss": 0.1924, "num_input_tokens_seen": 64137128, "step": 95125 }, { "epoch": 2.324041726724159, "grad_norm": 0.04068436101078987, "learning_rate": 1.293032563422962e-06, "loss": 0.0003, "num_input_tokens_seen": 64140648, "step": 95130 }, { "epoch": 2.324163877556006, "grad_norm": 0.06296813488006592, "learning_rate": 1.2929510282347922e-06, "loss": 0.0003, "num_input_tokens_seen": 64144104, "step": 95135 }, { "epoch": 2.3242860283878533, "grad_norm": 677.3106079101562, "learning_rate": 1.2928694909162036e-06, "loss": 0.082, "num_input_tokens_seen": 64147432, "step": 95140 }, { "epoch": 2.3244081792197004, "grad_norm": 0.04508993402123451, "learning_rate": 1.2927879514677891e-06, "loss": 0.089, "num_input_tokens_seen": 64150696, "step": 95145 }, { "epoch": 2.3245303300515476, "grad_norm": 100.00861358642578, "learning_rate": 1.2927064098901418e-06, "loss": 0.0979, "num_input_tokens_seen": 64154088, "step": 95150 }, { "epoch": 2.324652480883395, "grad_norm": 0.8293306231498718, "learning_rate": 1.2926248661838548e-06, "loss": 0.0014, "num_input_tokens_seen": 64157416, "step": 95155 }, { "epoch": 2.324774631715242, "grad_norm": 0.0203672107309103, "learning_rate": 1.2925433203495213e-06, "loss": 0.0361, "num_input_tokens_seen": 64160872, "step": 95160 }, { "epoch": 2.324896782547089, "grad_norm": 0.14864440262317657, "learning_rate": 1.2924617723877337e-06, "loss": 0.0002, "num_input_tokens_seen": 64164200, "step": 95165 }, { "epoch": 2.3250189333789364, "grad_norm": 32.7774543762207, "learning_rate": 1.292380222299086e-06, "loss": 0.0714, "num_input_tokens_seen": 64167784, "step": 95170 }, { "epoch": 2.3251410842107836, "grad_norm": 0.17909160256385803, "learning_rate": 1.2922986700841704e-06, "loss": 0.0007, "num_input_tokens_seen": 64171368, "step": 95175 }, { "epoch": 2.3252632350426308, "grad_norm": 0.01783503033220768, "learning_rate": 1.2922171157435802e-06, "loss": 0.0005, "num_input_tokens_seen": 64174696, "step": 95180 }, { "epoch": 2.325385385874478, "grad_norm": 0.2182997614145279, "learning_rate": 1.2921355592779083e-06, "loss": 0.0627, "num_input_tokens_seen": 64178280, "step": 95185 }, { "epoch": 2.325507536706325, "grad_norm": 0.0018611556151881814, "learning_rate": 1.2920540006877483e-06, "loss": 0.0005, "num_input_tokens_seen": 64182120, "step": 95190 }, { "epoch": 2.3256296875381723, "grad_norm": 5.40140962600708, "learning_rate": 1.2919724399736931e-06, "loss": 0.0009, "num_input_tokens_seen": 64185064, "step": 95195 }, { "epoch": 2.325751838370019, "grad_norm": 0.4457661807537079, "learning_rate": 1.2918908771363358e-06, "loss": 0.0502, "num_input_tokens_seen": 64188456, "step": 95200 }, { "epoch": 2.3258739892018663, "grad_norm": 0.22518938779830933, "learning_rate": 1.2918093121762694e-06, "loss": 0.0002, "num_input_tokens_seen": 64191720, "step": 95205 }, { "epoch": 2.3259961400337135, "grad_norm": 0.08013353496789932, "learning_rate": 1.2917277450940874e-06, "loss": 0.0004, "num_input_tokens_seen": 64195048, "step": 95210 }, { "epoch": 2.3261182908655607, "grad_norm": 0.12415852397680283, "learning_rate": 1.2916461758903823e-06, "loss": 0.0004, "num_input_tokens_seen": 64198440, "step": 95215 }, { "epoch": 2.326240441697408, "grad_norm": 0.011967546306550503, "learning_rate": 1.2915646045657483e-06, "loss": 0.049, "num_input_tokens_seen": 64202088, "step": 95220 }, { "epoch": 2.326362592529255, "grad_norm": 0.09109295904636383, "learning_rate": 1.2914830311207778e-06, "loss": 0.0608, "num_input_tokens_seen": 64205096, "step": 95225 }, { "epoch": 2.3264847433611022, "grad_norm": 0.07817357033491135, "learning_rate": 1.2914014555560643e-06, "loss": 0.0003, "num_input_tokens_seen": 64208232, "step": 95230 }, { "epoch": 2.3266068941929494, "grad_norm": 44.14269256591797, "learning_rate": 1.2913198778722013e-06, "loss": 0.0907, "num_input_tokens_seen": 64211624, "step": 95235 }, { "epoch": 2.3267290450247966, "grad_norm": 0.034214459359645844, "learning_rate": 1.2912382980697816e-06, "loss": 0.0001, "num_input_tokens_seen": 64214760, "step": 95240 }, { "epoch": 2.326851195856644, "grad_norm": 19.30805206298828, "learning_rate": 1.2911567161493985e-06, "loss": 0.0725, "num_input_tokens_seen": 64217960, "step": 95245 }, { "epoch": 2.326973346688491, "grad_norm": 0.19850492477416992, "learning_rate": 1.2910751321116455e-06, "loss": 0.0575, "num_input_tokens_seen": 64221288, "step": 95250 }, { "epoch": 2.327095497520338, "grad_norm": 0.016696002334356308, "learning_rate": 1.2909935459571159e-06, "loss": 0.0007, "num_input_tokens_seen": 64225000, "step": 95255 }, { "epoch": 2.3272176483521854, "grad_norm": 17.366100311279297, "learning_rate": 1.290911957686403e-06, "loss": 0.0886, "num_input_tokens_seen": 64228328, "step": 95260 }, { "epoch": 2.3273397991840326, "grad_norm": 0.14225991070270538, "learning_rate": 1.2908303673001e-06, "loss": 0.1204, "num_input_tokens_seen": 64231912, "step": 95265 }, { "epoch": 2.3274619500158797, "grad_norm": 0.05388676002621651, "learning_rate": 1.2907487747988007e-06, "loss": 0.0006, "num_input_tokens_seen": 64235112, "step": 95270 }, { "epoch": 2.327584100847727, "grad_norm": 0.008519626222550869, "learning_rate": 1.2906671801830978e-06, "loss": 0.0002, "num_input_tokens_seen": 64238824, "step": 95275 }, { "epoch": 2.327706251679574, "grad_norm": 23.24995231628418, "learning_rate": 1.290585583453585e-06, "loss": 0.0632, "num_input_tokens_seen": 64241896, "step": 95280 }, { "epoch": 2.327828402511421, "grad_norm": 0.05381803587079048, "learning_rate": 1.2905039846108558e-06, "loss": 0.044, "num_input_tokens_seen": 64245352, "step": 95285 }, { "epoch": 2.3279505533432685, "grad_norm": 39.409934997558594, "learning_rate": 1.2904223836555035e-06, "loss": 0.044, "num_input_tokens_seen": 64248872, "step": 95290 }, { "epoch": 2.3280727041751152, "grad_norm": 0.16154932975769043, "learning_rate": 1.2903407805881215e-06, "loss": 0.0008, "num_input_tokens_seen": 64252584, "step": 95295 }, { "epoch": 2.3281948550069624, "grad_norm": 0.07096408307552338, "learning_rate": 1.290259175409303e-06, "loss": 0.0401, "num_input_tokens_seen": 64255656, "step": 95300 }, { "epoch": 2.3283170058388096, "grad_norm": 0.02677714265882969, "learning_rate": 1.290177568119642e-06, "loss": 0.0003, "num_input_tokens_seen": 64259240, "step": 95305 }, { "epoch": 2.328439156670657, "grad_norm": 0.06973837316036224, "learning_rate": 1.2900959587197314e-06, "loss": 0.0001, "num_input_tokens_seen": 64262952, "step": 95310 }, { "epoch": 2.328561307502504, "grad_norm": 0.02655634470283985, "learning_rate": 1.2900143472101652e-06, "loss": 0.0008, "num_input_tokens_seen": 64266024, "step": 95315 }, { "epoch": 2.328683458334351, "grad_norm": 0.021072372794151306, "learning_rate": 1.2899327335915364e-06, "loss": 0.0002, "num_input_tokens_seen": 64269288, "step": 95320 }, { "epoch": 2.3288056091661984, "grad_norm": 0.011516228318214417, "learning_rate": 1.2898511178644394e-06, "loss": 0.0527, "num_input_tokens_seen": 64272488, "step": 95325 }, { "epoch": 2.3289277599980456, "grad_norm": 0.009769659489393234, "learning_rate": 1.2897695000294666e-06, "loss": 0.0001, "num_input_tokens_seen": 64276008, "step": 95330 }, { "epoch": 2.3290499108298928, "grad_norm": 0.003023561555892229, "learning_rate": 1.2896878800872122e-06, "loss": 0.0, "num_input_tokens_seen": 64279208, "step": 95335 }, { "epoch": 2.32917206166174, "grad_norm": 0.006055025849491358, "learning_rate": 1.2896062580382693e-06, "loss": 0.0003, "num_input_tokens_seen": 64282088, "step": 95340 }, { "epoch": 2.329294212493587, "grad_norm": 0.8509352803230286, "learning_rate": 1.289524633883232e-06, "loss": 0.0003, "num_input_tokens_seen": 64285864, "step": 95345 }, { "epoch": 2.3294163633254343, "grad_norm": 12.91662883758545, "learning_rate": 1.2894430076226939e-06, "loss": 0.0603, "num_input_tokens_seen": 64289256, "step": 95350 }, { "epoch": 2.3295385141572815, "grad_norm": 0.005020641256123781, "learning_rate": 1.2893613792572482e-06, "loss": 0.0, "num_input_tokens_seen": 64292776, "step": 95355 }, { "epoch": 2.3296606649891287, "grad_norm": 0.010004975832998753, "learning_rate": 1.2892797487874888e-06, "loss": 0.0003, "num_input_tokens_seen": 64295656, "step": 95360 }, { "epoch": 2.329782815820976, "grad_norm": 0.0014859403017908335, "learning_rate": 1.289198116214009e-06, "loss": 0.0526, "num_input_tokens_seen": 64298600, "step": 95365 }, { "epoch": 2.329904966652823, "grad_norm": 0.16149680316448212, "learning_rate": 1.289116481537403e-06, "loss": 0.0571, "num_input_tokens_seen": 64301864, "step": 95370 }, { "epoch": 2.3300271174846703, "grad_norm": 212.5308837890625, "learning_rate": 1.2890348447582642e-06, "loss": 0.1041, "num_input_tokens_seen": 64304552, "step": 95375 }, { "epoch": 2.330149268316517, "grad_norm": 0.029706710949540138, "learning_rate": 1.288953205877186e-06, "loss": 0.0001, "num_input_tokens_seen": 64307624, "step": 95380 }, { "epoch": 2.330271419148364, "grad_norm": 32.41347122192383, "learning_rate": 1.2888715648947629e-06, "loss": 0.1376, "num_input_tokens_seen": 64311016, "step": 95385 }, { "epoch": 2.3303935699802114, "grad_norm": 0.09344623237848282, "learning_rate": 1.2887899218115876e-06, "loss": 0.0712, "num_input_tokens_seen": 64314280, "step": 95390 }, { "epoch": 2.3305157208120586, "grad_norm": 0.004723446909338236, "learning_rate": 1.2887082766282545e-06, "loss": 0.0002, "num_input_tokens_seen": 64317416, "step": 95395 }, { "epoch": 2.330637871643906, "grad_norm": 0.014271215535700321, "learning_rate": 1.288626629345357e-06, "loss": 0.0003, "num_input_tokens_seen": 64320936, "step": 95400 }, { "epoch": 2.330760022475753, "grad_norm": 0.02553671970963478, "learning_rate": 1.2885449799634888e-06, "loss": 0.0454, "num_input_tokens_seen": 64324200, "step": 95405 }, { "epoch": 2.3308821733076, "grad_norm": 0.14257365465164185, "learning_rate": 1.2884633284832446e-06, "loss": 0.1252, "num_input_tokens_seen": 64327464, "step": 95410 }, { "epoch": 2.3310043241394474, "grad_norm": 0.06047545745968819, "learning_rate": 1.288381674905217e-06, "loss": 0.1476, "num_input_tokens_seen": 64330728, "step": 95415 }, { "epoch": 2.3311264749712945, "grad_norm": 0.005019609350711107, "learning_rate": 1.2883000192300003e-06, "loss": 0.0612, "num_input_tokens_seen": 64334120, "step": 95420 }, { "epoch": 2.3312486258031417, "grad_norm": 0.009447050280869007, "learning_rate": 1.2882183614581885e-06, "loss": 0.0001, "num_input_tokens_seen": 64337192, "step": 95425 }, { "epoch": 2.331370776634989, "grad_norm": 0.034353192895650864, "learning_rate": 1.2881367015903752e-06, "loss": 0.001, "num_input_tokens_seen": 64340328, "step": 95430 }, { "epoch": 2.331492927466836, "grad_norm": 0.10627281665802002, "learning_rate": 1.2880550396271543e-06, "loss": 0.0007, "num_input_tokens_seen": 64343400, "step": 95435 }, { "epoch": 2.3316150782986833, "grad_norm": 0.005389807280153036, "learning_rate": 1.2879733755691196e-06, "loss": 0.0337, "num_input_tokens_seen": 64346472, "step": 95440 }, { "epoch": 2.3317372291305305, "grad_norm": 0.010320378467440605, "learning_rate": 1.287891709416865e-06, "loss": 0.0002, "num_input_tokens_seen": 64350056, "step": 95445 }, { "epoch": 2.3318593799623777, "grad_norm": 0.26166412234306335, "learning_rate": 1.2878100411709847e-06, "loss": 0.0443, "num_input_tokens_seen": 64353128, "step": 95450 }, { "epoch": 2.331981530794225, "grad_norm": 0.2837740480899811, "learning_rate": 1.2877283708320724e-06, "loss": 0.0003, "num_input_tokens_seen": 64356392, "step": 95455 }, { "epoch": 2.332103681626072, "grad_norm": 0.01635722443461418, "learning_rate": 1.2876466984007217e-06, "loss": 0.0379, "num_input_tokens_seen": 64359592, "step": 95460 }, { "epoch": 2.332225832457919, "grad_norm": 0.0667223259806633, "learning_rate": 1.2875650238775268e-06, "loss": 0.0756, "num_input_tokens_seen": 64363240, "step": 95465 }, { "epoch": 2.3323479832897664, "grad_norm": 0.02749939262866974, "learning_rate": 1.2874833472630819e-06, "loss": 0.0002, "num_input_tokens_seen": 64367016, "step": 95470 }, { "epoch": 2.332470134121613, "grad_norm": 0.019013844430446625, "learning_rate": 1.2874016685579807e-06, "loss": 0.0408, "num_input_tokens_seen": 64370152, "step": 95475 }, { "epoch": 2.3325922849534604, "grad_norm": 0.08791220188140869, "learning_rate": 1.2873199877628177e-06, "loss": 0.0529, "num_input_tokens_seen": 64373672, "step": 95480 }, { "epoch": 2.3327144357853076, "grad_norm": 0.15154674649238586, "learning_rate": 1.2872383048781862e-06, "loss": 0.0004, "num_input_tokens_seen": 64376488, "step": 95485 }, { "epoch": 2.3328365866171548, "grad_norm": 0.005082068033516407, "learning_rate": 1.2871566199046801e-06, "loss": 0.0001, "num_input_tokens_seen": 64379944, "step": 95490 }, { "epoch": 2.332958737449002, "grad_norm": 0.07538677006959915, "learning_rate": 1.287074932842894e-06, "loss": 0.0003, "num_input_tokens_seen": 64382760, "step": 95495 }, { "epoch": 2.333080888280849, "grad_norm": 0.051306627690792084, "learning_rate": 1.286993243693422e-06, "loss": 0.0001, "num_input_tokens_seen": 64386024, "step": 95500 }, { "epoch": 2.3332030391126963, "grad_norm": 0.0033834660425782204, "learning_rate": 1.2869115524568577e-06, "loss": 0.0001, "num_input_tokens_seen": 64389224, "step": 95505 }, { "epoch": 2.3333251899445435, "grad_norm": 0.002595925237983465, "learning_rate": 1.2868298591337955e-06, "loss": 0.1076, "num_input_tokens_seen": 64392296, "step": 95510 }, { "epoch": 2.3334473407763907, "grad_norm": 0.07373785972595215, "learning_rate": 1.2867481637248294e-06, "loss": 0.113, "num_input_tokens_seen": 64395880, "step": 95515 }, { "epoch": 2.333569491608238, "grad_norm": 0.0118758799508214, "learning_rate": 1.2866664662305537e-06, "loss": 0.0001, "num_input_tokens_seen": 64399208, "step": 95520 }, { "epoch": 2.333691642440085, "grad_norm": 1.4112813472747803, "learning_rate": 1.2865847666515622e-06, "loss": 0.0492, "num_input_tokens_seen": 64402024, "step": 95525 }, { "epoch": 2.3338137932719323, "grad_norm": 76.51634216308594, "learning_rate": 1.2865030649884493e-06, "loss": 0.0528, "num_input_tokens_seen": 64405608, "step": 95530 }, { "epoch": 2.3339359441037795, "grad_norm": 0.04707219451665878, "learning_rate": 1.2864213612418088e-06, "loss": 0.0001, "num_input_tokens_seen": 64409128, "step": 95535 }, { "epoch": 2.3340580949356267, "grad_norm": 73.23444366455078, "learning_rate": 1.2863396554122355e-06, "loss": 0.0845, "num_input_tokens_seen": 64412776, "step": 95540 }, { "epoch": 2.334180245767474, "grad_norm": 0.012980327010154724, "learning_rate": 1.286257947500323e-06, "loss": 0.1, "num_input_tokens_seen": 64416104, "step": 95545 }, { "epoch": 2.3343023965993206, "grad_norm": 0.0025070449337363243, "learning_rate": 1.2861762375066658e-06, "loss": 0.0377, "num_input_tokens_seen": 64419240, "step": 95550 }, { "epoch": 2.3344245474311682, "grad_norm": 0.1651676595211029, "learning_rate": 1.2860945254318578e-06, "loss": 0.0003, "num_input_tokens_seen": 64422696, "step": 95555 }, { "epoch": 2.334546698263015, "grad_norm": 0.9793582558631897, "learning_rate": 1.2860128112764934e-06, "loss": 0.0908, "num_input_tokens_seen": 64425768, "step": 95560 }, { "epoch": 2.334668849094862, "grad_norm": 0.0149913365021348, "learning_rate": 1.2859310950411672e-06, "loss": 0.0001, "num_input_tokens_seen": 64429096, "step": 95565 }, { "epoch": 2.3347909999267094, "grad_norm": 0.0045698219910264015, "learning_rate": 1.285849376726473e-06, "loss": 0.0002, "num_input_tokens_seen": 64433064, "step": 95570 }, { "epoch": 2.3349131507585565, "grad_norm": 0.029431438073515892, "learning_rate": 1.2857676563330055e-06, "loss": 0.0002, "num_input_tokens_seen": 64436200, "step": 95575 }, { "epoch": 2.3350353015904037, "grad_norm": 0.42357417941093445, "learning_rate": 1.2856859338613585e-06, "loss": 0.0737, "num_input_tokens_seen": 64439656, "step": 95580 }, { "epoch": 2.335157452422251, "grad_norm": 0.01732746884226799, "learning_rate": 1.2856042093121267e-06, "loss": 0.1409, "num_input_tokens_seen": 64442792, "step": 95585 }, { "epoch": 2.335279603254098, "grad_norm": 0.03916094824671745, "learning_rate": 1.2855224826859045e-06, "loss": 0.0001, "num_input_tokens_seen": 64445992, "step": 95590 }, { "epoch": 2.3354017540859453, "grad_norm": 0.006325630936771631, "learning_rate": 1.2854407539832855e-06, "loss": 0.0564, "num_input_tokens_seen": 64449640, "step": 95595 }, { "epoch": 2.3355239049177925, "grad_norm": 0.034851573407649994, "learning_rate": 1.2853590232048648e-06, "loss": 0.0468, "num_input_tokens_seen": 64452648, "step": 95600 }, { "epoch": 2.3356460557496397, "grad_norm": 47.87200164794922, "learning_rate": 1.2852772903512366e-06, "loss": 0.0495, "num_input_tokens_seen": 64455784, "step": 95605 }, { "epoch": 2.335768206581487, "grad_norm": 0.03328532353043556, "learning_rate": 1.285195555422995e-06, "loss": 0.0705, "num_input_tokens_seen": 64458664, "step": 95610 }, { "epoch": 2.335890357413334, "grad_norm": 0.1640070378780365, "learning_rate": 1.2851138184207345e-06, "loss": 0.0775, "num_input_tokens_seen": 64462184, "step": 95615 }, { "epoch": 2.3360125082451813, "grad_norm": 0.1048487052321434, "learning_rate": 1.2850320793450497e-06, "loss": 0.035, "num_input_tokens_seen": 64465768, "step": 95620 }, { "epoch": 2.3361346590770284, "grad_norm": 1.1335134506225586, "learning_rate": 1.2849503381965354e-06, "loss": 0.0754, "num_input_tokens_seen": 64469096, "step": 95625 }, { "epoch": 2.3362568099088756, "grad_norm": 0.07938817888498306, "learning_rate": 1.2848685949757853e-06, "loss": 0.0429, "num_input_tokens_seen": 64472296, "step": 95630 }, { "epoch": 2.336378960740723, "grad_norm": 0.04152943566441536, "learning_rate": 1.2847868496833942e-06, "loss": 0.0127, "num_input_tokens_seen": 64475112, "step": 95635 }, { "epoch": 2.33650111157257, "grad_norm": 0.021501827985048294, "learning_rate": 1.2847051023199566e-06, "loss": 0.0723, "num_input_tokens_seen": 64478568, "step": 95640 }, { "epoch": 2.3366232624044168, "grad_norm": 0.050651174038648605, "learning_rate": 1.2846233528860667e-06, "loss": 0.0002, "num_input_tokens_seen": 64482088, "step": 95645 }, { "epoch": 2.336745413236264, "grad_norm": 0.13994047045707703, "learning_rate": 1.2845416013823195e-06, "loss": 0.0469, "num_input_tokens_seen": 64485288, "step": 95650 }, { "epoch": 2.336867564068111, "grad_norm": 0.02746342495083809, "learning_rate": 1.284459847809309e-06, "loss": 0.0005, "num_input_tokens_seen": 64488936, "step": 95655 }, { "epoch": 2.3369897148999583, "grad_norm": 0.2462010383605957, "learning_rate": 1.28437809216763e-06, "loss": 0.0004, "num_input_tokens_seen": 64492840, "step": 95660 }, { "epoch": 2.3371118657318055, "grad_norm": 0.20011571049690247, "learning_rate": 1.284296334457877e-06, "loss": 0.0005, "num_input_tokens_seen": 64496616, "step": 95665 }, { "epoch": 2.3372340165636527, "grad_norm": 0.021411439403891563, "learning_rate": 1.2842145746806448e-06, "loss": 0.0029, "num_input_tokens_seen": 64500008, "step": 95670 }, { "epoch": 2.3373561673955, "grad_norm": 0.018261928111314774, "learning_rate": 1.2841328128365275e-06, "loss": 0.0001, "num_input_tokens_seen": 64503720, "step": 95675 }, { "epoch": 2.337478318227347, "grad_norm": 0.01709025539457798, "learning_rate": 1.2840510489261202e-06, "loss": 0.0002, "num_input_tokens_seen": 64506792, "step": 95680 }, { "epoch": 2.3376004690591943, "grad_norm": 0.031307484954595566, "learning_rate": 1.2839692829500172e-06, "loss": 0.0001, "num_input_tokens_seen": 64509864, "step": 95685 }, { "epoch": 2.3377226198910415, "grad_norm": 108.65628814697266, "learning_rate": 1.2838875149088133e-06, "loss": 0.034, "num_input_tokens_seen": 64513000, "step": 95690 }, { "epoch": 2.3378447707228887, "grad_norm": 0.01993979513645172, "learning_rate": 1.2838057448031028e-06, "loss": 0.0001, "num_input_tokens_seen": 64516520, "step": 95695 }, { "epoch": 2.337966921554736, "grad_norm": 0.001427598879672587, "learning_rate": 1.2837239726334807e-06, "loss": 0.0477, "num_input_tokens_seen": 64520552, "step": 95700 }, { "epoch": 2.338089072386583, "grad_norm": 0.017566991969943047, "learning_rate": 1.2836421984005416e-06, "loss": 0.0413, "num_input_tokens_seen": 64523624, "step": 95705 }, { "epoch": 2.3382112232184302, "grad_norm": 0.02006099559366703, "learning_rate": 1.2835604221048801e-06, "loss": 0.0001, "num_input_tokens_seen": 64526568, "step": 95710 }, { "epoch": 2.3383333740502774, "grad_norm": 0.008515398018062115, "learning_rate": 1.283478643747091e-06, "loss": 0.0001, "num_input_tokens_seen": 64530472, "step": 95715 }, { "epoch": 2.3384555248821246, "grad_norm": 0.01195614319294691, "learning_rate": 1.2833968633277685e-06, "loss": 0.0059, "num_input_tokens_seen": 64533928, "step": 95720 }, { "epoch": 2.338577675713972, "grad_norm": 0.06796823441982269, "learning_rate": 1.2833150808475085e-06, "loss": 0.0002, "num_input_tokens_seen": 64537256, "step": 95725 }, { "epoch": 2.3386998265458185, "grad_norm": 0.013403164222836494, "learning_rate": 1.2832332963069045e-06, "loss": 0.0001, "num_input_tokens_seen": 64540520, "step": 95730 }, { "epoch": 2.338821977377666, "grad_norm": 0.001977914245799184, "learning_rate": 1.2831515097065521e-06, "loss": 0.0003, "num_input_tokens_seen": 64544168, "step": 95735 }, { "epoch": 2.338944128209513, "grad_norm": 0.006951137911528349, "learning_rate": 1.2830697210470455e-06, "loss": 0.0001, "num_input_tokens_seen": 64547624, "step": 95740 }, { "epoch": 2.33906627904136, "grad_norm": 0.0032081448007375, "learning_rate": 1.28298793032898e-06, "loss": 0.056, "num_input_tokens_seen": 64551144, "step": 95745 }, { "epoch": 2.3391884298732073, "grad_norm": 0.08706554770469666, "learning_rate": 1.2829061375529503e-06, "loss": 0.0003, "num_input_tokens_seen": 64554344, "step": 95750 }, { "epoch": 2.3393105807050545, "grad_norm": 0.002248481148853898, "learning_rate": 1.2828243427195507e-06, "loss": 0.0604, "num_input_tokens_seen": 64557736, "step": 95755 }, { "epoch": 2.3394327315369017, "grad_norm": 0.016986599192023277, "learning_rate": 1.2827425458293766e-06, "loss": 0.0002, "num_input_tokens_seen": 64560808, "step": 95760 }, { "epoch": 2.339554882368749, "grad_norm": 0.0013638599775731564, "learning_rate": 1.2826607468830227e-06, "loss": 0.0002, "num_input_tokens_seen": 64564712, "step": 95765 }, { "epoch": 2.339677033200596, "grad_norm": 0.022970302030444145, "learning_rate": 1.2825789458810836e-06, "loss": 0.0019, "num_input_tokens_seen": 64567656, "step": 95770 }, { "epoch": 2.3397991840324432, "grad_norm": 0.4640812873840332, "learning_rate": 1.2824971428241544e-06, "loss": 0.0279, "num_input_tokens_seen": 64570792, "step": 95775 }, { "epoch": 2.3399213348642904, "grad_norm": 0.05848347395658493, "learning_rate": 1.2824153377128301e-06, "loss": 0.0007, "num_input_tokens_seen": 64574440, "step": 95780 }, { "epoch": 2.3400434856961376, "grad_norm": 0.0471373051404953, "learning_rate": 1.2823335305477058e-06, "loss": 0.0764, "num_input_tokens_seen": 64577896, "step": 95785 }, { "epoch": 2.340165636527985, "grad_norm": 0.19193512201309204, "learning_rate": 1.2822517213293756e-06, "loss": 0.0002, "num_input_tokens_seen": 64580904, "step": 95790 }, { "epoch": 2.340287787359832, "grad_norm": 0.17396211624145508, "learning_rate": 1.2821699100584354e-06, "loss": 0.1267, "num_input_tokens_seen": 64584616, "step": 95795 }, { "epoch": 2.340409938191679, "grad_norm": 0.05379107594490051, "learning_rate": 1.2820880967354798e-06, "loss": 0.0413, "num_input_tokens_seen": 64587752, "step": 95800 }, { "epoch": 2.3405320890235264, "grad_norm": 0.06677083671092987, "learning_rate": 1.2820062813611033e-06, "loss": 0.0001, "num_input_tokens_seen": 64591208, "step": 95805 }, { "epoch": 2.3406542398553736, "grad_norm": 0.035529084503650665, "learning_rate": 1.2819244639359012e-06, "loss": 0.0549, "num_input_tokens_seen": 64594856, "step": 95810 }, { "epoch": 2.3407763906872208, "grad_norm": 0.007604526821523905, "learning_rate": 1.2818426444604686e-06, "loss": 0.0697, "num_input_tokens_seen": 64598120, "step": 95815 }, { "epoch": 2.340898541519068, "grad_norm": 0.015153356827795506, "learning_rate": 1.2817608229354009e-06, "loss": 0.0001, "num_input_tokens_seen": 64601448, "step": 95820 }, { "epoch": 2.3410206923509147, "grad_norm": 23.192989349365234, "learning_rate": 1.2816789993612925e-06, "loss": 0.1384, "num_input_tokens_seen": 64604648, "step": 95825 }, { "epoch": 2.341142843182762, "grad_norm": 0.0845850259065628, "learning_rate": 1.2815971737387385e-06, "loss": 0.0585, "num_input_tokens_seen": 64607912, "step": 95830 }, { "epoch": 2.341264994014609, "grad_norm": 94.90465545654297, "learning_rate": 1.2815153460683343e-06, "loss": 0.0027, "num_input_tokens_seen": 64611432, "step": 95835 }, { "epoch": 2.3413871448464563, "grad_norm": 0.023676864802837372, "learning_rate": 1.2814335163506746e-06, "loss": 0.0001, "num_input_tokens_seen": 64614632, "step": 95840 }, { "epoch": 2.3415092956783035, "grad_norm": 0.013862925581634045, "learning_rate": 1.281351684586355e-06, "loss": 0.0002, "num_input_tokens_seen": 64617640, "step": 95845 }, { "epoch": 2.3416314465101506, "grad_norm": 0.08961334079504013, "learning_rate": 1.28126985077597e-06, "loss": 0.0746, "num_input_tokens_seen": 64620712, "step": 95850 }, { "epoch": 2.341753597341998, "grad_norm": 0.023721953853964806, "learning_rate": 1.281188014920115e-06, "loss": 0.0003, "num_input_tokens_seen": 64623912, "step": 95855 }, { "epoch": 2.341875748173845, "grad_norm": 0.040392301976680756, "learning_rate": 1.2811061770193852e-06, "loss": 0.0004, "num_input_tokens_seen": 64627560, "step": 95860 }, { "epoch": 2.341997899005692, "grad_norm": 264.14202880859375, "learning_rate": 1.2810243370743758e-06, "loss": 0.0355, "num_input_tokens_seen": 64631272, "step": 95865 }, { "epoch": 2.3421200498375394, "grad_norm": 0.0721978172659874, "learning_rate": 1.2809424950856814e-06, "loss": 0.0002, "num_input_tokens_seen": 64634792, "step": 95870 }, { "epoch": 2.3422422006693866, "grad_norm": 0.15529726445674896, "learning_rate": 1.280860651053898e-06, "loss": 0.0259, "num_input_tokens_seen": 64638632, "step": 95875 }, { "epoch": 2.342364351501234, "grad_norm": 0.18332402408123016, "learning_rate": 1.2807788049796201e-06, "loss": 0.0002, "num_input_tokens_seen": 64642024, "step": 95880 }, { "epoch": 2.342486502333081, "grad_norm": 0.049804773181676865, "learning_rate": 1.2806969568634436e-06, "loss": 0.1175, "num_input_tokens_seen": 64645416, "step": 95885 }, { "epoch": 2.342608653164928, "grad_norm": 59.60102844238281, "learning_rate": 1.2806151067059632e-06, "loss": 0.0397, "num_input_tokens_seen": 64648680, "step": 95890 }, { "epoch": 2.3427308039967754, "grad_norm": 0.15184377133846283, "learning_rate": 1.280533254507774e-06, "loss": 0.0945, "num_input_tokens_seen": 64651944, "step": 95895 }, { "epoch": 2.3428529548286225, "grad_norm": 0.13011978566646576, "learning_rate": 1.280451400269472e-06, "loss": 0.0003, "num_input_tokens_seen": 64655400, "step": 95900 }, { "epoch": 2.3429751056604697, "grad_norm": 0.016899550333619118, "learning_rate": 1.2803695439916515e-06, "loss": 0.0002, "num_input_tokens_seen": 64659112, "step": 95905 }, { "epoch": 2.3430972564923165, "grad_norm": 0.003829409135505557, "learning_rate": 1.2802876856749088e-06, "loss": 0.0001, "num_input_tokens_seen": 64662312, "step": 95910 }, { "epoch": 2.343219407324164, "grad_norm": 0.017823584377765656, "learning_rate": 1.2802058253198383e-06, "loss": 0.0, "num_input_tokens_seen": 64665832, "step": 95915 }, { "epoch": 2.343341558156011, "grad_norm": 0.08013685047626495, "learning_rate": 1.2801239629270356e-06, "loss": 0.0914, "num_input_tokens_seen": 64669160, "step": 95920 }, { "epoch": 2.343463708987858, "grad_norm": 109.04890441894531, "learning_rate": 1.2800420984970962e-06, "loss": 0.0702, "num_input_tokens_seen": 64672552, "step": 95925 }, { "epoch": 2.3435858598197052, "grad_norm": 0.05170980468392372, "learning_rate": 1.2799602320306151e-06, "loss": 0.067, "num_input_tokens_seen": 64675752, "step": 95930 }, { "epoch": 2.3437080106515524, "grad_norm": 0.0404609739780426, "learning_rate": 1.2798783635281881e-06, "loss": 0.0002, "num_input_tokens_seen": 64679272, "step": 95935 }, { "epoch": 2.3438301614833996, "grad_norm": 0.017396869137883186, "learning_rate": 1.2797964929904106e-06, "loss": 0.0002, "num_input_tokens_seen": 64682792, "step": 95940 }, { "epoch": 2.343952312315247, "grad_norm": 0.023043381050229073, "learning_rate": 1.2797146204178775e-06, "loss": 0.0002, "num_input_tokens_seen": 64686184, "step": 95945 }, { "epoch": 2.344074463147094, "grad_norm": 135.3673553466797, "learning_rate": 1.2796327458111847e-06, "loss": 0.029, "num_input_tokens_seen": 64689256, "step": 95950 }, { "epoch": 2.344196613978941, "grad_norm": 0.03290106728672981, "learning_rate": 1.2795508691709272e-06, "loss": 0.0989, "num_input_tokens_seen": 64692648, "step": 95955 }, { "epoch": 2.3443187648107884, "grad_norm": 0.007633588742464781, "learning_rate": 1.2794689904977006e-06, "loss": 0.0515, "num_input_tokens_seen": 64695912, "step": 95960 }, { "epoch": 2.3444409156426356, "grad_norm": 0.28993934392929077, "learning_rate": 1.2793871097921e-06, "loss": 0.0006, "num_input_tokens_seen": 64698856, "step": 95965 }, { "epoch": 2.3445630664744828, "grad_norm": 0.01689867116510868, "learning_rate": 1.2793052270547215e-06, "loss": 0.0502, "num_input_tokens_seen": 64701928, "step": 95970 }, { "epoch": 2.34468521730633, "grad_norm": 31.987777709960938, "learning_rate": 1.2792233422861603e-06, "loss": 0.1532, "num_input_tokens_seen": 64705128, "step": 95975 }, { "epoch": 2.344807368138177, "grad_norm": 0.19975918531417847, "learning_rate": 1.2791414554870116e-06, "loss": 0.0481, "num_input_tokens_seen": 64708072, "step": 95980 }, { "epoch": 2.3449295189700243, "grad_norm": 0.05912720412015915, "learning_rate": 1.2790595666578717e-06, "loss": 0.037, "num_input_tokens_seen": 64711144, "step": 95985 }, { "epoch": 2.3450516698018715, "grad_norm": 0.5126543641090393, "learning_rate": 1.2789776757993352e-06, "loss": 0.0006, "num_input_tokens_seen": 64714536, "step": 95990 }, { "epoch": 2.3451738206337187, "grad_norm": 0.07353532314300537, "learning_rate": 1.278895782911998e-06, "loss": 0.0004, "num_input_tokens_seen": 64717480, "step": 95995 }, { "epoch": 2.345295971465566, "grad_norm": 0.0022288633044809103, "learning_rate": 1.2788138879964557e-06, "loss": 0.0002, "num_input_tokens_seen": 64721000, "step": 96000 }, { "epoch": 2.3454181222974126, "grad_norm": 10.458391189575195, "learning_rate": 1.2787319910533036e-06, "loss": 0.041, "num_input_tokens_seen": 64724456, "step": 96005 }, { "epoch": 2.34554027312926, "grad_norm": 0.010931176133453846, "learning_rate": 1.2786500920831377e-06, "loss": 0.049, "num_input_tokens_seen": 64728104, "step": 96010 }, { "epoch": 2.345662423961107, "grad_norm": 0.0860881358385086, "learning_rate": 1.2785681910865535e-06, "loss": 0.0442, "num_input_tokens_seen": 64731112, "step": 96015 }, { "epoch": 2.345784574792954, "grad_norm": 0.17197354137897491, "learning_rate": 1.2784862880641462e-06, "loss": 0.0736, "num_input_tokens_seen": 64734376, "step": 96020 }, { "epoch": 2.3459067256248014, "grad_norm": 0.14629042148590088, "learning_rate": 1.2784043830165119e-06, "loss": 0.0002, "num_input_tokens_seen": 64737768, "step": 96025 }, { "epoch": 2.3460288764566486, "grad_norm": 0.0716601237654686, "learning_rate": 1.2783224759442459e-06, "loss": 0.0594, "num_input_tokens_seen": 64740776, "step": 96030 }, { "epoch": 2.346151027288496, "grad_norm": 0.009555949829518795, "learning_rate": 1.2782405668479442e-06, "loss": 0.0458, "num_input_tokens_seen": 64743976, "step": 96035 }, { "epoch": 2.346273178120343, "grad_norm": 0.02965528704226017, "learning_rate": 1.2781586557282022e-06, "loss": 0.0798, "num_input_tokens_seen": 64747176, "step": 96040 }, { "epoch": 2.34639532895219, "grad_norm": 1.9350935220718384, "learning_rate": 1.2780767425856155e-06, "loss": 0.0426, "num_input_tokens_seen": 64750248, "step": 96045 }, { "epoch": 2.3465174797840374, "grad_norm": 0.12773877382278442, "learning_rate": 1.2779948274207802e-06, "loss": 0.0006, "num_input_tokens_seen": 64753192, "step": 96050 }, { "epoch": 2.3466396306158845, "grad_norm": 29.7601261138916, "learning_rate": 1.2779129102342915e-06, "loss": 0.0522, "num_input_tokens_seen": 64756584, "step": 96055 }, { "epoch": 2.3467617814477317, "grad_norm": 0.022628452628850937, "learning_rate": 1.2778309910267454e-06, "loss": 0.0005, "num_input_tokens_seen": 64760168, "step": 96060 }, { "epoch": 2.346883932279579, "grad_norm": 0.03361308202147484, "learning_rate": 1.2777490697987378e-06, "loss": 0.0003, "num_input_tokens_seen": 64763432, "step": 96065 }, { "epoch": 2.347006083111426, "grad_norm": 0.05504537373781204, "learning_rate": 1.277667146550864e-06, "loss": 0.0005, "num_input_tokens_seen": 64766568, "step": 96070 }, { "epoch": 2.3471282339432733, "grad_norm": 0.10005243122577667, "learning_rate": 1.2775852212837202e-06, "loss": 0.0002, "num_input_tokens_seen": 64769768, "step": 96075 }, { "epoch": 2.3472503847751205, "grad_norm": 0.3429010808467865, "learning_rate": 1.277503293997902e-06, "loss": 0.0669, "num_input_tokens_seen": 64772648, "step": 96080 }, { "epoch": 2.3473725356069677, "grad_norm": 0.10272646695375443, "learning_rate": 1.277421364694005e-06, "loss": 0.0002, "num_input_tokens_seen": 64775912, "step": 96085 }, { "epoch": 2.3474946864388144, "grad_norm": 0.3535402715206146, "learning_rate": 1.2773394333726253e-06, "loss": 0.0228, "num_input_tokens_seen": 64779176, "step": 96090 }, { "epoch": 2.347616837270662, "grad_norm": 0.01437195111066103, "learning_rate": 1.2772575000343589e-06, "loss": 0.0003, "num_input_tokens_seen": 64782696, "step": 96095 }, { "epoch": 2.347738988102509, "grad_norm": 72.5531234741211, "learning_rate": 1.277175564679801e-06, "loss": 0.0837, "num_input_tokens_seen": 64785960, "step": 96100 }, { "epoch": 2.347861138934356, "grad_norm": 0.07231893390417099, "learning_rate": 1.2770936273095483e-06, "loss": 0.0741, "num_input_tokens_seen": 64789224, "step": 96105 }, { "epoch": 2.347983289766203, "grad_norm": 0.04119507223367691, "learning_rate": 1.2770116879241961e-06, "loss": 0.0006, "num_input_tokens_seen": 64792360, "step": 96110 }, { "epoch": 2.3481054405980504, "grad_norm": 19.756370544433594, "learning_rate": 1.27692974652434e-06, "loss": 0.0465, "num_input_tokens_seen": 64796072, "step": 96115 }, { "epoch": 2.3482275914298976, "grad_norm": 0.024013273417949677, "learning_rate": 1.2768478031105764e-06, "loss": 0.0628, "num_input_tokens_seen": 64799016, "step": 96120 }, { "epoch": 2.3483497422617448, "grad_norm": 0.012543603777885437, "learning_rate": 1.2767658576835015e-06, "loss": 0.0003, "num_input_tokens_seen": 64802600, "step": 96125 }, { "epoch": 2.348471893093592, "grad_norm": 0.00011490545875858516, "learning_rate": 1.2766839102437105e-06, "loss": 0.0532, "num_input_tokens_seen": 64805928, "step": 96130 }, { "epoch": 2.348594043925439, "grad_norm": 0.8810346722602844, "learning_rate": 1.2766019607917997e-06, "loss": 0.0017, "num_input_tokens_seen": 64809320, "step": 96135 }, { "epoch": 2.3487161947572863, "grad_norm": 0.004579383414238691, "learning_rate": 1.2765200093283653e-06, "loss": 0.063, "num_input_tokens_seen": 64812712, "step": 96140 }, { "epoch": 2.3488383455891335, "grad_norm": 0.06198953837156296, "learning_rate": 1.2764380558540028e-06, "loss": 0.0786, "num_input_tokens_seen": 64815912, "step": 96145 }, { "epoch": 2.3489604964209807, "grad_norm": 0.029107533395290375, "learning_rate": 1.2763561003693087e-06, "loss": 0.0001, "num_input_tokens_seen": 64819560, "step": 96150 }, { "epoch": 2.349082647252828, "grad_norm": 0.009671103209257126, "learning_rate": 1.2762741428748785e-06, "loss": 0.0348, "num_input_tokens_seen": 64823144, "step": 96155 }, { "epoch": 2.349204798084675, "grad_norm": 0.005305442493408918, "learning_rate": 1.2761921833713082e-06, "loss": 0.1014, "num_input_tokens_seen": 64826920, "step": 96160 }, { "epoch": 2.3493269489165223, "grad_norm": 0.0013102086959406734, "learning_rate": 1.2761102218591943e-06, "loss": 0.0003, "num_input_tokens_seen": 64830248, "step": 96165 }, { "epoch": 2.3494490997483695, "grad_norm": 0.24055279791355133, "learning_rate": 1.2760282583391327e-06, "loss": 0.0028, "num_input_tokens_seen": 64833320, "step": 96170 }, { "epoch": 2.349571250580216, "grad_norm": 94.44710540771484, "learning_rate": 1.275946292811719e-06, "loss": 0.1198, "num_input_tokens_seen": 64836648, "step": 96175 }, { "epoch": 2.349693401412064, "grad_norm": 1.151880145072937, "learning_rate": 1.2758643252775498e-06, "loss": 0.1153, "num_input_tokens_seen": 64839784, "step": 96180 }, { "epoch": 2.3498155522439106, "grad_norm": 0.24090705811977386, "learning_rate": 1.275782355737221e-06, "loss": 0.0293, "num_input_tokens_seen": 64842984, "step": 96185 }, { "epoch": 2.3499377030757578, "grad_norm": 0.03649113327264786, "learning_rate": 1.275700384191329e-06, "loss": 0.0003, "num_input_tokens_seen": 64846184, "step": 96190 }, { "epoch": 2.350059853907605, "grad_norm": 0.009620449505746365, "learning_rate": 1.2756184106404693e-06, "loss": 0.0002, "num_input_tokens_seen": 64849192, "step": 96195 }, { "epoch": 2.350182004739452, "grad_norm": 0.021057967096567154, "learning_rate": 1.2755364350852387e-06, "loss": 0.0001, "num_input_tokens_seen": 64852584, "step": 96200 }, { "epoch": 2.3503041555712993, "grad_norm": 0.005199748557060957, "learning_rate": 1.2754544575262327e-06, "loss": 0.0001, "num_input_tokens_seen": 64857768, "step": 96205 }, { "epoch": 2.3504263064031465, "grad_norm": 0.0033945294562727213, "learning_rate": 1.2753724779640483e-06, "loss": 0.0503, "num_input_tokens_seen": 64861224, "step": 96210 }, { "epoch": 2.3505484572349937, "grad_norm": 0.008970481343567371, "learning_rate": 1.2752904963992807e-06, "loss": 0.0816, "num_input_tokens_seen": 64864552, "step": 96215 }, { "epoch": 2.350670608066841, "grad_norm": 0.019377127289772034, "learning_rate": 1.2752085128325267e-06, "loss": 0.0001, "num_input_tokens_seen": 64868072, "step": 96220 }, { "epoch": 2.350792758898688, "grad_norm": 0.05443967506289482, "learning_rate": 1.2751265272643826e-06, "loss": 0.0004, "num_input_tokens_seen": 64871144, "step": 96225 }, { "epoch": 2.3509149097305353, "grad_norm": 38.987884521484375, "learning_rate": 1.275044539695444e-06, "loss": 0.0466, "num_input_tokens_seen": 64874600, "step": 96230 }, { "epoch": 2.3510370605623825, "grad_norm": 0.016544029116630554, "learning_rate": 1.2749625501263076e-06, "loss": 0.0395, "num_input_tokens_seen": 64877992, "step": 96235 }, { "epoch": 2.3511592113942297, "grad_norm": 0.031460560858249664, "learning_rate": 1.2748805585575699e-06, "loss": 0.0002, "num_input_tokens_seen": 64881128, "step": 96240 }, { "epoch": 2.351281362226077, "grad_norm": 0.03763119876384735, "learning_rate": 1.2747985649898265e-06, "loss": 0.0001, "num_input_tokens_seen": 64884136, "step": 96245 }, { "epoch": 2.351403513057924, "grad_norm": 0.11334902048110962, "learning_rate": 1.2747165694236741e-06, "loss": 0.1235, "num_input_tokens_seen": 64887016, "step": 96250 }, { "epoch": 2.3515256638897712, "grad_norm": 0.01264060940593481, "learning_rate": 1.274634571859709e-06, "loss": 0.0001, "num_input_tokens_seen": 64890472, "step": 96255 }, { "epoch": 2.3516478147216184, "grad_norm": 0.5173587203025818, "learning_rate": 1.2745525722985276e-06, "loss": 0.0003, "num_input_tokens_seen": 64893352, "step": 96260 }, { "epoch": 2.3517699655534656, "grad_norm": 0.02787947840988636, "learning_rate": 1.2744705707407259e-06, "loss": 0.0322, "num_input_tokens_seen": 64896360, "step": 96265 }, { "epoch": 2.3518921163853124, "grad_norm": 29.06874656677246, "learning_rate": 1.2743885671869003e-06, "loss": 0.1283, "num_input_tokens_seen": 64899752, "step": 96270 }, { "epoch": 2.3520142672171596, "grad_norm": 203.33737182617188, "learning_rate": 1.2743065616376472e-06, "loss": 0.0737, "num_input_tokens_seen": 64903208, "step": 96275 }, { "epoch": 2.3521364180490067, "grad_norm": 0.17877565324306488, "learning_rate": 1.274224554093563e-06, "loss": 0.0397, "num_input_tokens_seen": 64906152, "step": 96280 }, { "epoch": 2.352258568880854, "grad_norm": 0.0367661714553833, "learning_rate": 1.2741425445552442e-06, "loss": 0.0003, "num_input_tokens_seen": 64909480, "step": 96285 }, { "epoch": 2.352380719712701, "grad_norm": 680.6268920898438, "learning_rate": 1.274060533023287e-06, "loss": 0.015, "num_input_tokens_seen": 64912360, "step": 96290 }, { "epoch": 2.3525028705445483, "grad_norm": 30.654756546020508, "learning_rate": 1.273978519498288e-06, "loss": 0.1568, "num_input_tokens_seen": 64915688, "step": 96295 }, { "epoch": 2.3526250213763955, "grad_norm": 0.08082496374845505, "learning_rate": 1.2738965039808433e-06, "loss": 0.0002, "num_input_tokens_seen": 64918696, "step": 96300 }, { "epoch": 2.3527471722082427, "grad_norm": 22.63221549987793, "learning_rate": 1.2738144864715498e-06, "loss": 0.2386, "num_input_tokens_seen": 64921896, "step": 96305 }, { "epoch": 2.35286932304009, "grad_norm": 0.04736287146806717, "learning_rate": 1.2737324669710036e-06, "loss": 0.0751, "num_input_tokens_seen": 64925992, "step": 96310 }, { "epoch": 2.352991473871937, "grad_norm": 0.09722006320953369, "learning_rate": 1.2736504454798013e-06, "loss": 0.0601, "num_input_tokens_seen": 64929064, "step": 96315 }, { "epoch": 2.3531136247037843, "grad_norm": 0.23957017064094543, "learning_rate": 1.2735684219985395e-06, "loss": 0.0003, "num_input_tokens_seen": 64932648, "step": 96320 }, { "epoch": 2.3532357755356315, "grad_norm": 0.1643843650817871, "learning_rate": 1.2734863965278143e-06, "loss": 0.001, "num_input_tokens_seen": 64936104, "step": 96325 }, { "epoch": 2.3533579263674786, "grad_norm": 0.5928179025650024, "learning_rate": 1.2734043690682228e-06, "loss": 0.0004, "num_input_tokens_seen": 64939176, "step": 96330 }, { "epoch": 2.353480077199326, "grad_norm": 0.006342485547065735, "learning_rate": 1.2733223396203606e-06, "loss": 0.0729, "num_input_tokens_seen": 64942696, "step": 96335 }, { "epoch": 2.353602228031173, "grad_norm": 0.018328078091144562, "learning_rate": 1.2732403081848254e-06, "loss": 0.1607, "num_input_tokens_seen": 64946088, "step": 96340 }, { "epoch": 2.35372437886302, "grad_norm": 0.010952993296086788, "learning_rate": 1.273158274762213e-06, "loss": 0.0001, "num_input_tokens_seen": 64949736, "step": 96345 }, { "epoch": 2.3538465296948674, "grad_norm": 3.82192325592041, "learning_rate": 1.27307623935312e-06, "loss": 0.001, "num_input_tokens_seen": 64952872, "step": 96350 }, { "epoch": 2.353968680526714, "grad_norm": 0.029450977221131325, "learning_rate": 1.2729942019581433e-06, "loss": 0.0491, "num_input_tokens_seen": 64957096, "step": 96355 }, { "epoch": 2.354090831358562, "grad_norm": 3.583500623703003, "learning_rate": 1.2729121625778793e-06, "loss": 0.0009, "num_input_tokens_seen": 64960744, "step": 96360 }, { "epoch": 2.3542129821904085, "grad_norm": 0.03824234753847122, "learning_rate": 1.2728301212129246e-06, "loss": 0.0507, "num_input_tokens_seen": 64964584, "step": 96365 }, { "epoch": 2.3543351330222557, "grad_norm": 0.051010388880968094, "learning_rate": 1.272748077863876e-06, "loss": 0.0248, "num_input_tokens_seen": 64968232, "step": 96370 }, { "epoch": 2.354457283854103, "grad_norm": 0.03927096351981163, "learning_rate": 1.27266603253133e-06, "loss": 0.0002, "num_input_tokens_seen": 64971496, "step": 96375 }, { "epoch": 2.35457943468595, "grad_norm": 0.09921469539403915, "learning_rate": 1.272583985215883e-06, "loss": 0.0775, "num_input_tokens_seen": 64974824, "step": 96380 }, { "epoch": 2.3547015855177973, "grad_norm": 1501.3050537109375, "learning_rate": 1.2725019359181323e-06, "loss": 0.0183, "num_input_tokens_seen": 64978088, "step": 96385 }, { "epoch": 2.3548237363496445, "grad_norm": 38.874263763427734, "learning_rate": 1.2724198846386743e-06, "loss": 0.1264, "num_input_tokens_seen": 64981608, "step": 96390 }, { "epoch": 2.3549458871814917, "grad_norm": 0.051000069826841354, "learning_rate": 1.2723378313781053e-06, "loss": 0.0001, "num_input_tokens_seen": 64984744, "step": 96395 }, { "epoch": 2.355068038013339, "grad_norm": 0.05670246481895447, "learning_rate": 1.2722557761370224e-06, "loss": 0.0253, "num_input_tokens_seen": 64988456, "step": 96400 }, { "epoch": 2.355190188845186, "grad_norm": 0.020489536225795746, "learning_rate": 1.2721737189160221e-06, "loss": 0.0001, "num_input_tokens_seen": 64991784, "step": 96405 }, { "epoch": 2.3553123396770332, "grad_norm": 0.017961619421839714, "learning_rate": 1.2720916597157017e-06, "loss": 0.0002, "num_input_tokens_seen": 64994920, "step": 96410 }, { "epoch": 2.3554344905088804, "grad_norm": 0.4348694086074829, "learning_rate": 1.2720095985366578e-06, "loss": 0.0005, "num_input_tokens_seen": 64998376, "step": 96415 }, { "epoch": 2.3555566413407276, "grad_norm": 0.15724071860313416, "learning_rate": 1.2719275353794863e-06, "loss": 0.0002, "num_input_tokens_seen": 65001448, "step": 96420 }, { "epoch": 2.355678792172575, "grad_norm": 0.0027769957669079304, "learning_rate": 1.271845470244785e-06, "loss": 0.0178, "num_input_tokens_seen": 65006824, "step": 96425 }, { "epoch": 2.355800943004422, "grad_norm": 294.8739013671875, "learning_rate": 1.27176340313315e-06, "loss": 0.0458, "num_input_tokens_seen": 65010024, "step": 96430 }, { "epoch": 2.355923093836269, "grad_norm": 1.417127013206482, "learning_rate": 1.2716813340451787e-06, "loss": 0.0004, "num_input_tokens_seen": 65013288, "step": 96435 }, { "epoch": 2.3560452446681164, "grad_norm": 0.038331493735313416, "learning_rate": 1.2715992629814673e-06, "loss": 0.0004, "num_input_tokens_seen": 65016680, "step": 96440 }, { "epoch": 2.3561673954999636, "grad_norm": 0.003267770865932107, "learning_rate": 1.2715171899426134e-06, "loss": 0.0003, "num_input_tokens_seen": 65019560, "step": 96445 }, { "epoch": 2.3562895463318103, "grad_norm": 0.0478217639029026, "learning_rate": 1.2714351149292135e-06, "loss": 0.0581, "num_input_tokens_seen": 65022888, "step": 96450 }, { "epoch": 2.3564116971636575, "grad_norm": 0.014052304439246655, "learning_rate": 1.2713530379418642e-06, "loss": 0.104, "num_input_tokens_seen": 65025768, "step": 96455 }, { "epoch": 2.3565338479955047, "grad_norm": 0.023245450109243393, "learning_rate": 1.2712709589811628e-06, "loss": 0.0005, "num_input_tokens_seen": 65029672, "step": 96460 }, { "epoch": 2.356655998827352, "grad_norm": 0.01749522238969803, "learning_rate": 1.271188878047706e-06, "loss": 0.0004, "num_input_tokens_seen": 65032936, "step": 96465 }, { "epoch": 2.356778149659199, "grad_norm": 0.03706004470586777, "learning_rate": 1.2711067951420906e-06, "loss": 0.0393, "num_input_tokens_seen": 65035944, "step": 96470 }, { "epoch": 2.3569003004910463, "grad_norm": 0.17029958963394165, "learning_rate": 1.2710247102649138e-06, "loss": 0.0338, "num_input_tokens_seen": 65039080, "step": 96475 }, { "epoch": 2.3570224513228935, "grad_norm": 0.4401090443134308, "learning_rate": 1.2709426234167723e-06, "loss": 0.0011, "num_input_tokens_seen": 65042536, "step": 96480 }, { "epoch": 2.3571446021547406, "grad_norm": 41.670413970947266, "learning_rate": 1.2708605345982634e-06, "loss": 0.129, "num_input_tokens_seen": 65046056, "step": 96485 }, { "epoch": 2.357266752986588, "grad_norm": 9.80988597869873, "learning_rate": 1.2707784438099833e-06, "loss": 0.0008, "num_input_tokens_seen": 65049512, "step": 96490 }, { "epoch": 2.357388903818435, "grad_norm": 0.06891076266765594, "learning_rate": 1.27069635105253e-06, "loss": 0.0002, "num_input_tokens_seen": 65052968, "step": 96495 }, { "epoch": 2.357511054650282, "grad_norm": 0.016416283324360847, "learning_rate": 1.2706142563264999e-06, "loss": 0.0453, "num_input_tokens_seen": 65055976, "step": 96500 }, { "epoch": 2.3576332054821294, "grad_norm": 85.0042724609375, "learning_rate": 1.2705321596324901e-06, "loss": 0.0924, "num_input_tokens_seen": 65059432, "step": 96505 }, { "epoch": 2.3577553563139766, "grad_norm": 0.14299145340919495, "learning_rate": 1.2704500609710977e-06, "loss": 0.0001, "num_input_tokens_seen": 65062760, "step": 96510 }, { "epoch": 2.357877507145824, "grad_norm": 0.006079481448978186, "learning_rate": 1.2703679603429198e-06, "loss": 0.145, "num_input_tokens_seen": 65066088, "step": 96515 }, { "epoch": 2.357999657977671, "grad_norm": 37.76004409790039, "learning_rate": 1.2702858577485533e-06, "loss": 0.0889, "num_input_tokens_seen": 65069352, "step": 96520 }, { "epoch": 2.358121808809518, "grad_norm": 0.027957482263445854, "learning_rate": 1.2702037531885954e-06, "loss": 0.0003, "num_input_tokens_seen": 65072936, "step": 96525 }, { "epoch": 2.3582439596413654, "grad_norm": 0.3856599032878876, "learning_rate": 1.270121646663643e-06, "loss": 0.0004, "num_input_tokens_seen": 65076200, "step": 96530 }, { "epoch": 2.358366110473212, "grad_norm": 0.018702376633882523, "learning_rate": 1.2700395381742937e-06, "loss": 0.0002, "num_input_tokens_seen": 65079400, "step": 96535 }, { "epoch": 2.3584882613050597, "grad_norm": 51.89600372314453, "learning_rate": 1.269957427721144e-06, "loss": 0.0814, "num_input_tokens_seen": 65082536, "step": 96540 }, { "epoch": 2.3586104121369065, "grad_norm": 28.521587371826172, "learning_rate": 1.2698753153047913e-06, "loss": 0.1191, "num_input_tokens_seen": 65085928, "step": 96545 }, { "epoch": 2.3587325629687537, "grad_norm": 0.2891164720058441, "learning_rate": 1.2697932009258324e-06, "loss": 0.0002, "num_input_tokens_seen": 65089320, "step": 96550 }, { "epoch": 2.358854713800601, "grad_norm": 0.14744800329208374, "learning_rate": 1.269711084584865e-06, "loss": 0.0001, "num_input_tokens_seen": 65092136, "step": 96555 }, { "epoch": 2.358976864632448, "grad_norm": 0.021144427359104156, "learning_rate": 1.2696289662824863e-06, "loss": 0.0002, "num_input_tokens_seen": 65095528, "step": 96560 }, { "epoch": 2.3590990154642952, "grad_norm": 0.054040584713220596, "learning_rate": 1.2695468460192928e-06, "loss": 0.0002, "num_input_tokens_seen": 65098472, "step": 96565 }, { "epoch": 2.3592211662961424, "grad_norm": 0.10927646607160568, "learning_rate": 1.2694647237958827e-06, "loss": 0.0438, "num_input_tokens_seen": 65101992, "step": 96570 }, { "epoch": 2.3593433171279896, "grad_norm": 2.334500551223755, "learning_rate": 1.2693825996128524e-06, "loss": 0.0005, "num_input_tokens_seen": 65105576, "step": 96575 }, { "epoch": 2.359465467959837, "grad_norm": 0.20562472939491272, "learning_rate": 1.2693004734707993e-06, "loss": 0.0029, "num_input_tokens_seen": 65108968, "step": 96580 }, { "epoch": 2.359587618791684, "grad_norm": 0.20102748274803162, "learning_rate": 1.2692183453703205e-06, "loss": 0.0005, "num_input_tokens_seen": 65111976, "step": 96585 }, { "epoch": 2.359709769623531, "grad_norm": 0.008200347423553467, "learning_rate": 1.2691362153120135e-06, "loss": 0.0001, "num_input_tokens_seen": 65115560, "step": 96590 }, { "epoch": 2.3598319204553784, "grad_norm": 0.03276849538087845, "learning_rate": 1.269054083296476e-06, "loss": 0.0001, "num_input_tokens_seen": 65118888, "step": 96595 }, { "epoch": 2.3599540712872256, "grad_norm": 0.19144579768180847, "learning_rate": 1.2689719493243046e-06, "loss": 0.0454, "num_input_tokens_seen": 65122600, "step": 96600 }, { "epoch": 2.3600762221190728, "grad_norm": 0.05740584433078766, "learning_rate": 1.2688898133960968e-06, "loss": 0.0003, "num_input_tokens_seen": 65126248, "step": 96605 }, { "epoch": 2.36019837295092, "grad_norm": 0.009974388405680656, "learning_rate": 1.2688076755124499e-06, "loss": 0.1508, "num_input_tokens_seen": 65129640, "step": 96610 }, { "epoch": 2.360320523782767, "grad_norm": 0.0036886536981910467, "learning_rate": 1.2687255356739615e-06, "loss": 0.0001, "num_input_tokens_seen": 65132712, "step": 96615 }, { "epoch": 2.360442674614614, "grad_norm": 0.005350802559405565, "learning_rate": 1.2686433938812287e-06, "loss": 0.0591, "num_input_tokens_seen": 65136104, "step": 96620 }, { "epoch": 2.3605648254464615, "grad_norm": 0.13386641442775726, "learning_rate": 1.2685612501348486e-06, "loss": 0.0002, "num_input_tokens_seen": 65139624, "step": 96625 }, { "epoch": 2.3606869762783083, "grad_norm": 0.006131905596703291, "learning_rate": 1.268479104435419e-06, "loss": 0.0002, "num_input_tokens_seen": 65143208, "step": 96630 }, { "epoch": 2.3608091271101554, "grad_norm": 0.014093791134655476, "learning_rate": 1.2683969567835372e-06, "loss": 0.0001, "num_input_tokens_seen": 65146536, "step": 96635 }, { "epoch": 2.3609312779420026, "grad_norm": 0.11882232129573822, "learning_rate": 1.2683148071798006e-06, "loss": 0.0007, "num_input_tokens_seen": 65149928, "step": 96640 }, { "epoch": 2.36105342877385, "grad_norm": 0.006873989012092352, "learning_rate": 1.2682326556248066e-06, "loss": 0.0001, "num_input_tokens_seen": 65153256, "step": 96645 }, { "epoch": 2.361175579605697, "grad_norm": 0.007074476219713688, "learning_rate": 1.2681505021191523e-06, "loss": 0.0002, "num_input_tokens_seen": 65156328, "step": 96650 }, { "epoch": 2.361297730437544, "grad_norm": 0.005505382549017668, "learning_rate": 1.2680683466634355e-06, "loss": 0.0003, "num_input_tokens_seen": 65159528, "step": 96655 }, { "epoch": 2.3614198812693914, "grad_norm": 22.611209869384766, "learning_rate": 1.2679861892582535e-06, "loss": 0.1168, "num_input_tokens_seen": 65163176, "step": 96660 }, { "epoch": 2.3615420321012386, "grad_norm": 0.21755099296569824, "learning_rate": 1.2679040299042041e-06, "loss": 0.0502, "num_input_tokens_seen": 65166504, "step": 96665 }, { "epoch": 2.3616641829330858, "grad_norm": 0.017916874960064888, "learning_rate": 1.2678218686018848e-06, "loss": 0.0626, "num_input_tokens_seen": 65169704, "step": 96670 }, { "epoch": 2.361786333764933, "grad_norm": 744.1478271484375, "learning_rate": 1.267739705351892e-06, "loss": 0.0961, "num_input_tokens_seen": 65173608, "step": 96675 }, { "epoch": 2.36190848459678, "grad_norm": 0.18370923399925232, "learning_rate": 1.2676575401548248e-06, "loss": 0.0542, "num_input_tokens_seen": 65176680, "step": 96680 }, { "epoch": 2.3620306354286273, "grad_norm": 0.1509036123752594, "learning_rate": 1.2675753730112798e-06, "loss": 0.0672, "num_input_tokens_seen": 65179688, "step": 96685 }, { "epoch": 2.3621527862604745, "grad_norm": 0.9493951797485352, "learning_rate": 1.2674932039218545e-06, "loss": 0.0005, "num_input_tokens_seen": 65183208, "step": 96690 }, { "epoch": 2.3622749370923217, "grad_norm": 0.038394249975681305, "learning_rate": 1.2674110328871469e-06, "loss": 0.0004, "num_input_tokens_seen": 65186536, "step": 96695 }, { "epoch": 2.362397087924169, "grad_norm": 18.89492416381836, "learning_rate": 1.2673288599077543e-06, "loss": 0.1285, "num_input_tokens_seen": 65189992, "step": 96700 }, { "epoch": 2.362519238756016, "grad_norm": 43.87201690673828, "learning_rate": 1.2672466849842742e-06, "loss": 0.1172, "num_input_tokens_seen": 65193832, "step": 96705 }, { "epoch": 2.3626413895878633, "grad_norm": 54.42050552368164, "learning_rate": 1.2671645081173044e-06, "loss": 0.0933, "num_input_tokens_seen": 65197224, "step": 96710 }, { "epoch": 2.36276354041971, "grad_norm": 0.047918014228343964, "learning_rate": 1.2670823293074423e-06, "loss": 0.0002, "num_input_tokens_seen": 65200872, "step": 96715 }, { "epoch": 2.3628856912515572, "grad_norm": 0.08623618632555008, "learning_rate": 1.2670001485552858e-06, "loss": 0.004, "num_input_tokens_seen": 65204008, "step": 96720 }, { "epoch": 2.3630078420834044, "grad_norm": 0.017091054469347, "learning_rate": 1.2669179658614327e-06, "loss": 0.0002, "num_input_tokens_seen": 65207464, "step": 96725 }, { "epoch": 2.3631299929152516, "grad_norm": 0.004988627042621374, "learning_rate": 1.26683578122648e-06, "loss": 0.0003, "num_input_tokens_seen": 65210344, "step": 96730 }, { "epoch": 2.363252143747099, "grad_norm": 0.0018072007223963737, "learning_rate": 1.2667535946510258e-06, "loss": 0.0606, "num_input_tokens_seen": 65214056, "step": 96735 }, { "epoch": 2.363374294578946, "grad_norm": 0.009117374196648598, "learning_rate": 1.2666714061356675e-06, "loss": 0.0378, "num_input_tokens_seen": 65216936, "step": 96740 }, { "epoch": 2.363496445410793, "grad_norm": 0.5432358384132385, "learning_rate": 1.2665892156810035e-06, "loss": 0.0013, "num_input_tokens_seen": 65219816, "step": 96745 }, { "epoch": 2.3636185962426404, "grad_norm": 0.029280368238687515, "learning_rate": 1.2665070232876304e-06, "loss": 0.0414, "num_input_tokens_seen": 65223080, "step": 96750 }, { "epoch": 2.3637407470744876, "grad_norm": 0.0141510721296072, "learning_rate": 1.266424828956147e-06, "loss": 0.0002, "num_input_tokens_seen": 65226472, "step": 96755 }, { "epoch": 2.3638628979063347, "grad_norm": 0.06533598899841309, "learning_rate": 1.2663426326871505e-06, "loss": 0.0013, "num_input_tokens_seen": 65229544, "step": 96760 }, { "epoch": 2.363985048738182, "grad_norm": 25.035097122192383, "learning_rate": 1.2662604344812387e-06, "loss": 0.0692, "num_input_tokens_seen": 65233128, "step": 96765 }, { "epoch": 2.364107199570029, "grad_norm": 0.02839895896613598, "learning_rate": 1.2661782343390096e-06, "loss": 0.0002, "num_input_tokens_seen": 65236456, "step": 96770 }, { "epoch": 2.3642293504018763, "grad_norm": 0.1953248828649521, "learning_rate": 1.2660960322610605e-06, "loss": 0.0439, "num_input_tokens_seen": 65239848, "step": 96775 }, { "epoch": 2.3643515012337235, "grad_norm": 1421.1453857421875, "learning_rate": 1.2660138282479894e-06, "loss": 0.0145, "num_input_tokens_seen": 65243176, "step": 96780 }, { "epoch": 2.3644736520655707, "grad_norm": 0.013044201768934727, "learning_rate": 1.2659316223003945e-06, "loss": 0.0001, "num_input_tokens_seen": 65247336, "step": 96785 }, { "epoch": 2.364595802897418, "grad_norm": 0.005308858584612608, "learning_rate": 1.2658494144188732e-06, "loss": 0.0001, "num_input_tokens_seen": 65250408, "step": 96790 }, { "epoch": 2.364717953729265, "grad_norm": 0.006218060851097107, "learning_rate": 1.2657672046040235e-06, "loss": 0.0001, "num_input_tokens_seen": 65253608, "step": 96795 }, { "epoch": 2.364840104561112, "grad_norm": 23.112071990966797, "learning_rate": 1.265684992856443e-06, "loss": 0.1215, "num_input_tokens_seen": 65256936, "step": 96800 }, { "epoch": 2.3649622553929595, "grad_norm": 0.15702813863754272, "learning_rate": 1.2656027791767299e-06, "loss": 0.038, "num_input_tokens_seen": 65260520, "step": 96805 }, { "epoch": 2.365084406224806, "grad_norm": 0.011202758178114891, "learning_rate": 1.2655205635654819e-06, "loss": 0.0, "num_input_tokens_seen": 65263976, "step": 96810 }, { "epoch": 2.3652065570566534, "grad_norm": 8.87405776977539, "learning_rate": 1.2654383460232972e-06, "loss": 0.0403, "num_input_tokens_seen": 65267176, "step": 96815 }, { "epoch": 2.3653287078885006, "grad_norm": 17.453929901123047, "learning_rate": 1.265356126550773e-06, "loss": 0.0907, "num_input_tokens_seen": 65270504, "step": 96820 }, { "epoch": 2.3654508587203478, "grad_norm": 0.015709292143583298, "learning_rate": 1.2652739051485083e-06, "loss": 0.0002, "num_input_tokens_seen": 65273640, "step": 96825 }, { "epoch": 2.365573009552195, "grad_norm": 0.05382462218403816, "learning_rate": 1.2651916818170998e-06, "loss": 0.0002, "num_input_tokens_seen": 65276840, "step": 96830 }, { "epoch": 2.365695160384042, "grad_norm": 0.015843816101551056, "learning_rate": 1.2651094565571465e-06, "loss": 0.0513, "num_input_tokens_seen": 65280040, "step": 96835 }, { "epoch": 2.3658173112158893, "grad_norm": 0.004987873136997223, "learning_rate": 1.2650272293692457e-06, "loss": 0.049, "num_input_tokens_seen": 65283304, "step": 96840 }, { "epoch": 2.3659394620477365, "grad_norm": 0.30553194880485535, "learning_rate": 1.2649450002539957e-06, "loss": 0.044, "num_input_tokens_seen": 65286824, "step": 96845 }, { "epoch": 2.3660616128795837, "grad_norm": 0.05274348706007004, "learning_rate": 1.2648627692119942e-06, "loss": 0.0003, "num_input_tokens_seen": 65290024, "step": 96850 }, { "epoch": 2.366183763711431, "grad_norm": 0.018941234797239304, "learning_rate": 1.2647805362438395e-06, "loss": 0.1146, "num_input_tokens_seen": 65293096, "step": 96855 }, { "epoch": 2.366305914543278, "grad_norm": 0.05804905667901039, "learning_rate": 1.2646983013501298e-06, "loss": 0.0003, "num_input_tokens_seen": 65296744, "step": 96860 }, { "epoch": 2.3664280653751253, "grad_norm": 59.49602508544922, "learning_rate": 1.2646160645314623e-06, "loss": 0.0361, "num_input_tokens_seen": 65299816, "step": 96865 }, { "epoch": 2.3665502162069725, "grad_norm": 0.015892786905169487, "learning_rate": 1.264533825788436e-06, "loss": 0.0456, "num_input_tokens_seen": 65302952, "step": 96870 }, { "epoch": 2.3666723670388197, "grad_norm": 0.01757962629199028, "learning_rate": 1.2644515851216487e-06, "loss": 0.08, "num_input_tokens_seen": 65306664, "step": 96875 }, { "epoch": 2.366794517870667, "grad_norm": 0.08031915873289108, "learning_rate": 1.2643693425316981e-06, "loss": 0.0327, "num_input_tokens_seen": 65309736, "step": 96880 }, { "epoch": 2.366916668702514, "grad_norm": 0.025443589314818382, "learning_rate": 1.2642870980191827e-06, "loss": 0.0004, "num_input_tokens_seen": 65312744, "step": 96885 }, { "epoch": 2.3670388195343612, "grad_norm": 0.0021331454627215862, "learning_rate": 1.2642048515847003e-06, "loss": 0.0504, "num_input_tokens_seen": 65315752, "step": 96890 }, { "epoch": 2.367160970366208, "grad_norm": 0.1670653522014618, "learning_rate": 1.264122603228849e-06, "loss": 0.0005, "num_input_tokens_seen": 65319528, "step": 96895 }, { "epoch": 2.367283121198055, "grad_norm": 0.010267873294651508, "learning_rate": 1.2640403529522272e-06, "loss": 0.0331, "num_input_tokens_seen": 65322536, "step": 96900 }, { "epoch": 2.3674052720299024, "grad_norm": 0.3293243944644928, "learning_rate": 1.263958100755433e-06, "loss": 0.0002, "num_input_tokens_seen": 65325352, "step": 96905 }, { "epoch": 2.3675274228617496, "grad_norm": 0.022090457379817963, "learning_rate": 1.2638758466390647e-06, "loss": 0.0001, "num_input_tokens_seen": 65328808, "step": 96910 }, { "epoch": 2.3676495736935967, "grad_norm": 0.003714184043928981, "learning_rate": 1.2637935906037199e-06, "loss": 0.0001, "num_input_tokens_seen": 65332264, "step": 96915 }, { "epoch": 2.367771724525444, "grad_norm": 0.03657980635762215, "learning_rate": 1.2637113326499973e-06, "loss": 0.0002, "num_input_tokens_seen": 65335656, "step": 96920 }, { "epoch": 2.367893875357291, "grad_norm": 0.023362930864095688, "learning_rate": 1.2636290727784951e-06, "loss": 0.0001, "num_input_tokens_seen": 65338792, "step": 96925 }, { "epoch": 2.3680160261891383, "grad_norm": 0.018264321610331535, "learning_rate": 1.2635468109898112e-06, "loss": 0.0439, "num_input_tokens_seen": 65342184, "step": 96930 }, { "epoch": 2.3681381770209855, "grad_norm": 0.5096086263656616, "learning_rate": 1.263464547284544e-06, "loss": 0.0003, "num_input_tokens_seen": 65345064, "step": 96935 }, { "epoch": 2.3682603278528327, "grad_norm": 0.009335456416010857, "learning_rate": 1.263382281663292e-06, "loss": 0.0641, "num_input_tokens_seen": 65348136, "step": 96940 }, { "epoch": 2.36838247868468, "grad_norm": 0.058331821113824844, "learning_rate": 1.263300014126653e-06, "loss": 0.0008, "num_input_tokens_seen": 65351464, "step": 96945 }, { "epoch": 2.368504629516527, "grad_norm": 0.010250688530504704, "learning_rate": 1.2632177446752255e-06, "loss": 0.1049, "num_input_tokens_seen": 65355176, "step": 96950 }, { "epoch": 2.3686267803483743, "grad_norm": 0.014257263392210007, "learning_rate": 1.2631354733096075e-06, "loss": 0.0001, "num_input_tokens_seen": 65359080, "step": 96955 }, { "epoch": 2.3687489311802215, "grad_norm": 0.0008305838564410806, "learning_rate": 1.2630532000303978e-06, "loss": 0.0001, "num_input_tokens_seen": 65362536, "step": 96960 }, { "epoch": 2.3688710820120686, "grad_norm": 102.84041595458984, "learning_rate": 1.2629709248381946e-06, "loss": 0.1348, "num_input_tokens_seen": 65366312, "step": 96965 }, { "epoch": 2.368993232843916, "grad_norm": 0.0019097891636192799, "learning_rate": 1.2628886477335958e-06, "loss": 0.0415, "num_input_tokens_seen": 65369128, "step": 96970 }, { "epoch": 2.369115383675763, "grad_norm": 0.024086622521281242, "learning_rate": 1.2628063687172004e-06, "loss": 0.0003, "num_input_tokens_seen": 65372584, "step": 96975 }, { "epoch": 2.3692375345076098, "grad_norm": 3.4141321182250977, "learning_rate": 1.2627240877896063e-06, "loss": 0.0008, "num_input_tokens_seen": 65375272, "step": 96980 }, { "epoch": 2.3693596853394574, "grad_norm": 20.49992561340332, "learning_rate": 1.2626418049514118e-06, "loss": 0.0454, "num_input_tokens_seen": 65378536, "step": 96985 }, { "epoch": 2.369481836171304, "grad_norm": 0.021800873801112175, "learning_rate": 1.2625595202032156e-06, "loss": 0.0002, "num_input_tokens_seen": 65381736, "step": 96990 }, { "epoch": 2.3696039870031513, "grad_norm": 0.024180782958865166, "learning_rate": 1.262477233545616e-06, "loss": 0.0004, "num_input_tokens_seen": 65384872, "step": 96995 }, { "epoch": 2.3697261378349985, "grad_norm": 0.10953272134065628, "learning_rate": 1.2623949449792112e-06, "loss": 0.0601, "num_input_tokens_seen": 65388008, "step": 97000 }, { "epoch": 2.3698482886668457, "grad_norm": 0.11298985034227371, "learning_rate": 1.2623126545045999e-06, "loss": 0.0001, "num_input_tokens_seen": 65391528, "step": 97005 }, { "epoch": 2.369970439498693, "grad_norm": 0.08407048881053925, "learning_rate": 1.2622303621223804e-06, "loss": 0.0024, "num_input_tokens_seen": 65394664, "step": 97010 }, { "epoch": 2.37009259033054, "grad_norm": 0.018757490441203117, "learning_rate": 1.2621480678331513e-06, "loss": 0.0382, "num_input_tokens_seen": 65397928, "step": 97015 }, { "epoch": 2.3702147411623873, "grad_norm": 0.12693895399570465, "learning_rate": 1.2620657716375104e-06, "loss": 0.0002, "num_input_tokens_seen": 65401192, "step": 97020 }, { "epoch": 2.3703368919942345, "grad_norm": 30.074405670166016, "learning_rate": 1.2619834735360573e-06, "loss": 0.1228, "num_input_tokens_seen": 65404136, "step": 97025 }, { "epoch": 2.3704590428260817, "grad_norm": 0.007254903670400381, "learning_rate": 1.2619011735293897e-06, "loss": 0.068, "num_input_tokens_seen": 65407720, "step": 97030 }, { "epoch": 2.370581193657929, "grad_norm": 0.017863744869828224, "learning_rate": 1.2618188716181065e-06, "loss": 0.0719, "num_input_tokens_seen": 65411048, "step": 97035 }, { "epoch": 2.370703344489776, "grad_norm": 0.032780423760414124, "learning_rate": 1.261736567802806e-06, "loss": 0.0726, "num_input_tokens_seen": 65414056, "step": 97040 }, { "epoch": 2.3708254953216232, "grad_norm": 0.1686081439256668, "learning_rate": 1.2616542620840867e-06, "loss": 0.049, "num_input_tokens_seen": 65417512, "step": 97045 }, { "epoch": 2.3709476461534704, "grad_norm": 0.36529478430747986, "learning_rate": 1.261571954462547e-06, "loss": 0.0768, "num_input_tokens_seen": 65420840, "step": 97050 }, { "epoch": 2.3710697969853176, "grad_norm": 0.007805853616446257, "learning_rate": 1.261489644938786e-06, "loss": 0.0577, "num_input_tokens_seen": 65423720, "step": 97055 }, { "epoch": 2.371191947817165, "grad_norm": 0.06817847490310669, "learning_rate": 1.2614073335134018e-06, "loss": 0.0003, "num_input_tokens_seen": 65426792, "step": 97060 }, { "epoch": 2.371314098649012, "grad_norm": 0.009680322371423244, "learning_rate": 1.2613250201869931e-06, "loss": 0.0704, "num_input_tokens_seen": 65430312, "step": 97065 }, { "epoch": 2.371436249480859, "grad_norm": 0.02266172133386135, "learning_rate": 1.2612427049601589e-06, "loss": 0.0835, "num_input_tokens_seen": 65433704, "step": 97070 }, { "epoch": 2.371558400312706, "grad_norm": 0.04810142144560814, "learning_rate": 1.261160387833497e-06, "loss": 0.0434, "num_input_tokens_seen": 65436840, "step": 97075 }, { "epoch": 2.371680551144553, "grad_norm": 0.018566487357020378, "learning_rate": 1.261078068807607e-06, "loss": 0.0316, "num_input_tokens_seen": 65439656, "step": 97080 }, { "epoch": 2.3718027019764003, "grad_norm": 0.09787236154079437, "learning_rate": 1.260995747883087e-06, "loss": 0.0409, "num_input_tokens_seen": 65442600, "step": 97085 }, { "epoch": 2.3719248528082475, "grad_norm": 0.02488056570291519, "learning_rate": 1.2609134250605355e-06, "loss": 0.0391, "num_input_tokens_seen": 65445928, "step": 97090 }, { "epoch": 2.3720470036400947, "grad_norm": 0.17376767098903656, "learning_rate": 1.2608311003405513e-06, "loss": 0.0358, "num_input_tokens_seen": 65449192, "step": 97095 }, { "epoch": 2.372169154471942, "grad_norm": 0.14807315170764923, "learning_rate": 1.2607487737237334e-06, "loss": 0.0344, "num_input_tokens_seen": 65452520, "step": 97100 }, { "epoch": 2.372291305303789, "grad_norm": 0.0272610392421484, "learning_rate": 1.2606664452106804e-06, "loss": 0.0459, "num_input_tokens_seen": 65455848, "step": 97105 }, { "epoch": 2.3724134561356363, "grad_norm": 0.15240778028964996, "learning_rate": 1.2605841148019907e-06, "loss": 0.0682, "num_input_tokens_seen": 65459240, "step": 97110 }, { "epoch": 2.3725356069674834, "grad_norm": 26.267230987548828, "learning_rate": 1.260501782498263e-06, "loss": 0.0521, "num_input_tokens_seen": 65462888, "step": 97115 }, { "epoch": 2.3726577577993306, "grad_norm": 0.36245620250701904, "learning_rate": 1.2604194483000966e-06, "loss": 0.038, "num_input_tokens_seen": 65465960, "step": 97120 }, { "epoch": 2.372779908631178, "grad_norm": 0.4815026521682739, "learning_rate": 1.2603371122080901e-06, "loss": 0.0486, "num_input_tokens_seen": 65469224, "step": 97125 }, { "epoch": 2.372902059463025, "grad_norm": 2.3331916332244873, "learning_rate": 1.2602547742228417e-06, "loss": 0.001, "num_input_tokens_seen": 65472296, "step": 97130 }, { "epoch": 2.373024210294872, "grad_norm": 52.48902893066406, "learning_rate": 1.260172434344951e-06, "loss": 0.0912, "num_input_tokens_seen": 65475944, "step": 97135 }, { "epoch": 2.3731463611267194, "grad_norm": 13.921010971069336, "learning_rate": 1.260090092575016e-06, "loss": 0.0719, "num_input_tokens_seen": 65479464, "step": 97140 }, { "epoch": 2.3732685119585666, "grad_norm": 0.18770751357078552, "learning_rate": 1.260007748913636e-06, "loss": 0.0739, "num_input_tokens_seen": 65482664, "step": 97145 }, { "epoch": 2.3733906627904138, "grad_norm": 0.022564081475138664, "learning_rate": 1.2599254033614098e-06, "loss": 0.0003, "num_input_tokens_seen": 65485992, "step": 97150 }, { "epoch": 2.373512813622261, "grad_norm": 43.48125457763672, "learning_rate": 1.259843055918936e-06, "loss": 0.0751, "num_input_tokens_seen": 65489512, "step": 97155 }, { "epoch": 2.3736349644541077, "grad_norm": 0.06907090544700623, "learning_rate": 1.2597607065868138e-06, "loss": 0.0396, "num_input_tokens_seen": 65492712, "step": 97160 }, { "epoch": 2.3737571152859553, "grad_norm": 13.063774108886719, "learning_rate": 1.2596783553656418e-06, "loss": 0.0246, "num_input_tokens_seen": 65495720, "step": 97165 }, { "epoch": 2.373879266117802, "grad_norm": 26.079654693603516, "learning_rate": 1.259596002256019e-06, "loss": 0.0728, "num_input_tokens_seen": 65498664, "step": 97170 }, { "epoch": 2.3740014169496493, "grad_norm": 0.0327020138502121, "learning_rate": 1.259513647258544e-06, "loss": 0.0002, "num_input_tokens_seen": 65502120, "step": 97175 }, { "epoch": 2.3741235677814965, "grad_norm": 0.2324591726064682, "learning_rate": 1.2594312903738161e-06, "loss": 0.0014, "num_input_tokens_seen": 65505512, "step": 97180 }, { "epoch": 2.3742457186133437, "grad_norm": 0.06149037554860115, "learning_rate": 1.259348931602434e-06, "loss": 0.0003, "num_input_tokens_seen": 65508840, "step": 97185 }, { "epoch": 2.374367869445191, "grad_norm": 47.23630905151367, "learning_rate": 1.2592665709449972e-06, "loss": 0.0836, "num_input_tokens_seen": 65512040, "step": 97190 }, { "epoch": 2.374490020277038, "grad_norm": 0.10036757588386536, "learning_rate": 1.2591842084021037e-06, "loss": 0.0004, "num_input_tokens_seen": 65515944, "step": 97195 }, { "epoch": 2.3746121711088852, "grad_norm": 0.12383761256933212, "learning_rate": 1.259101843974353e-06, "loss": 0.0224, "num_input_tokens_seen": 65519336, "step": 97200 }, { "epoch": 2.3747343219407324, "grad_norm": 21.063034057617188, "learning_rate": 1.259019477662344e-06, "loss": 0.1061, "num_input_tokens_seen": 65522408, "step": 97205 }, { "epoch": 2.3748564727725796, "grad_norm": 0.007072287146002054, "learning_rate": 1.2589371094666757e-06, "loss": 0.0096, "num_input_tokens_seen": 65526248, "step": 97210 }, { "epoch": 2.374978623604427, "grad_norm": 1.5783196687698364, "learning_rate": 1.2588547393879472e-06, "loss": 0.0008, "num_input_tokens_seen": 65529896, "step": 97215 }, { "epoch": 2.375100774436274, "grad_norm": 0.046201128512620926, "learning_rate": 1.2587723674267572e-06, "loss": 0.0001, "num_input_tokens_seen": 65533736, "step": 97220 }, { "epoch": 2.375222925268121, "grad_norm": 0.006796675268560648, "learning_rate": 1.258689993583705e-06, "loss": 0.0004, "num_input_tokens_seen": 65537128, "step": 97225 }, { "epoch": 2.3753450760999684, "grad_norm": 18.510995864868164, "learning_rate": 1.2586076178593896e-06, "loss": 0.0525, "num_input_tokens_seen": 65540200, "step": 97230 }, { "epoch": 2.3754672269318156, "grad_norm": 0.011776096187531948, "learning_rate": 1.2585252402544101e-06, "loss": 0.1004, "num_input_tokens_seen": 65543080, "step": 97235 }, { "epoch": 2.3755893777636627, "grad_norm": 0.2864418625831604, "learning_rate": 1.2584428607693655e-06, "loss": 0.001, "num_input_tokens_seen": 65546728, "step": 97240 }, { "epoch": 2.3757115285955095, "grad_norm": 0.002647153800353408, "learning_rate": 1.258360479404855e-06, "loss": 0.0003, "num_input_tokens_seen": 65549864, "step": 97245 }, { "epoch": 2.375833679427357, "grad_norm": 0.05344594642519951, "learning_rate": 1.2582780961614776e-06, "loss": 0.0002, "num_input_tokens_seen": 65553192, "step": 97250 }, { "epoch": 2.375955830259204, "grad_norm": 0.008940442465245724, "learning_rate": 1.2581957110398322e-06, "loss": 0.069, "num_input_tokens_seen": 65556584, "step": 97255 }, { "epoch": 2.376077981091051, "grad_norm": 0.0033488385379314423, "learning_rate": 1.2581133240405184e-06, "loss": 0.0304, "num_input_tokens_seen": 65560168, "step": 97260 }, { "epoch": 2.3762001319228983, "grad_norm": 0.02567724883556366, "learning_rate": 1.258030935164135e-06, "loss": 0.0003, "num_input_tokens_seen": 65563432, "step": 97265 }, { "epoch": 2.3763222827547454, "grad_norm": 0.06815133988857269, "learning_rate": 1.257948544411281e-06, "loss": 0.0467, "num_input_tokens_seen": 65567080, "step": 97270 }, { "epoch": 2.3764444335865926, "grad_norm": 0.03765109181404114, "learning_rate": 1.257866151782556e-06, "loss": 0.1122, "num_input_tokens_seen": 65570728, "step": 97275 }, { "epoch": 2.37656658441844, "grad_norm": 0.008491848595440388, "learning_rate": 1.257783757278559e-06, "loss": 0.0001, "num_input_tokens_seen": 65574056, "step": 97280 }, { "epoch": 2.376688735250287, "grad_norm": 0.12625138461589813, "learning_rate": 1.2577013608998892e-06, "loss": 0.0002, "num_input_tokens_seen": 65577512, "step": 97285 }, { "epoch": 2.376810886082134, "grad_norm": 0.005301265046000481, "learning_rate": 1.2576189626471459e-06, "loss": 0.0001, "num_input_tokens_seen": 65580520, "step": 97290 }, { "epoch": 2.3769330369139814, "grad_norm": 18.142616271972656, "learning_rate": 1.257536562520928e-06, "loss": 0.0573, "num_input_tokens_seen": 65583720, "step": 97295 }, { "epoch": 2.3770551877458286, "grad_norm": 0.047586724162101746, "learning_rate": 1.257454160521835e-06, "loss": 0.0002, "num_input_tokens_seen": 65586664, "step": 97300 }, { "epoch": 2.3771773385776758, "grad_norm": 0.2373700588941574, "learning_rate": 1.257371756650466e-06, "loss": 0.0003, "num_input_tokens_seen": 65589672, "step": 97305 }, { "epoch": 2.377299489409523, "grad_norm": 0.7016290426254272, "learning_rate": 1.2572893509074206e-06, "loss": 0.0648, "num_input_tokens_seen": 65592936, "step": 97310 }, { "epoch": 2.37742164024137, "grad_norm": 0.010557304136455059, "learning_rate": 1.2572069432932978e-06, "loss": 0.0001, "num_input_tokens_seen": 65596392, "step": 97315 }, { "epoch": 2.3775437910732173, "grad_norm": 0.04777985066175461, "learning_rate": 1.2571245338086966e-06, "loss": 0.1605, "num_input_tokens_seen": 65599784, "step": 97320 }, { "epoch": 2.3776659419050645, "grad_norm": 0.004487090278416872, "learning_rate": 1.2570421224542169e-06, "loss": 0.0001, "num_input_tokens_seen": 65602856, "step": 97325 }, { "epoch": 2.3777880927369117, "grad_norm": 0.06862083077430725, "learning_rate": 1.2569597092304576e-06, "loss": 0.0008, "num_input_tokens_seen": 65606504, "step": 97330 }, { "epoch": 2.377910243568759, "grad_norm": 0.05558573827147484, "learning_rate": 1.2568772941380183e-06, "loss": 0.0003, "num_input_tokens_seen": 65609768, "step": 97335 }, { "epoch": 2.3780323944006057, "grad_norm": 0.06246393918991089, "learning_rate": 1.2567948771774984e-06, "loss": 0.0022, "num_input_tokens_seen": 65613480, "step": 97340 }, { "epoch": 2.378154545232453, "grad_norm": 19.32091522216797, "learning_rate": 1.256712458349497e-06, "loss": 0.2043, "num_input_tokens_seen": 65617128, "step": 97345 }, { "epoch": 2.3782766960643, "grad_norm": 0.12600833177566528, "learning_rate": 1.2566300376546135e-06, "loss": 0.0008, "num_input_tokens_seen": 65620136, "step": 97350 }, { "epoch": 2.3783988468961472, "grad_norm": 31.323135375976562, "learning_rate": 1.2565476150934472e-06, "loss": 0.068, "num_input_tokens_seen": 65623464, "step": 97355 }, { "epoch": 2.3785209977279944, "grad_norm": 0.008119119331240654, "learning_rate": 1.2564651906665979e-06, "loss": 0.0003, "num_input_tokens_seen": 65627176, "step": 97360 }, { "epoch": 2.3786431485598416, "grad_norm": 0.08318277448415756, "learning_rate": 1.2563827643746644e-06, "loss": 0.0387, "num_input_tokens_seen": 65632552, "step": 97365 }, { "epoch": 2.378765299391689, "grad_norm": 0.00928101222962141, "learning_rate": 1.2563003362182466e-06, "loss": 0.1144, "num_input_tokens_seen": 65635880, "step": 97370 }, { "epoch": 2.378887450223536, "grad_norm": 0.005901527125388384, "learning_rate": 1.256217906197944e-06, "loss": 0.0004, "num_input_tokens_seen": 65639208, "step": 97375 }, { "epoch": 2.379009601055383, "grad_norm": 88.21249389648438, "learning_rate": 1.2561354743143558e-06, "loss": 0.1245, "num_input_tokens_seen": 65642344, "step": 97380 }, { "epoch": 2.3791317518872304, "grad_norm": 0.2763185203075409, "learning_rate": 1.2560530405680813e-06, "loss": 0.0055, "num_input_tokens_seen": 65645608, "step": 97385 }, { "epoch": 2.3792539027190776, "grad_norm": 0.38322851061820984, "learning_rate": 1.2559706049597205e-06, "loss": 0.0488, "num_input_tokens_seen": 65649064, "step": 97390 }, { "epoch": 2.3793760535509247, "grad_norm": 0.07286795973777771, "learning_rate": 1.2558881674898727e-06, "loss": 0.0002, "num_input_tokens_seen": 65651944, "step": 97395 }, { "epoch": 2.379498204382772, "grad_norm": 0.026501847431063652, "learning_rate": 1.2558057281591373e-06, "loss": 0.0002, "num_input_tokens_seen": 65655336, "step": 97400 }, { "epoch": 2.379620355214619, "grad_norm": 3.2439448833465576, "learning_rate": 1.2557232869681136e-06, "loss": 0.0006, "num_input_tokens_seen": 65658536, "step": 97405 }, { "epoch": 2.3797425060464663, "grad_norm": 0.4015377461910248, "learning_rate": 1.2556408439174016e-06, "loss": 0.0114, "num_input_tokens_seen": 65661736, "step": 97410 }, { "epoch": 2.3798646568783135, "grad_norm": 0.0072961426340043545, "learning_rate": 1.2555583990076005e-06, "loss": 0.0002, "num_input_tokens_seen": 65665064, "step": 97415 }, { "epoch": 2.3799868077101607, "grad_norm": 0.00903349183499813, "learning_rate": 1.25547595223931e-06, "loss": 0.0342, "num_input_tokens_seen": 65668648, "step": 97420 }, { "epoch": 2.3801089585420074, "grad_norm": 0.011133073829114437, "learning_rate": 1.2553935036131294e-06, "loss": 0.0004, "num_input_tokens_seen": 65671656, "step": 97425 }, { "epoch": 2.380231109373855, "grad_norm": 0.004269387573003769, "learning_rate": 1.2553110531296588e-06, "loss": 0.0378, "num_input_tokens_seen": 65674920, "step": 97430 }, { "epoch": 2.380353260205702, "grad_norm": 19.349868774414062, "learning_rate": 1.2552286007894974e-06, "loss": 0.0491, "num_input_tokens_seen": 65678632, "step": 97435 }, { "epoch": 2.380475411037549, "grad_norm": 107.51361083984375, "learning_rate": 1.2551461465932453e-06, "loss": 0.2491, "num_input_tokens_seen": 65681768, "step": 97440 }, { "epoch": 2.380597561869396, "grad_norm": 0.22212591767311096, "learning_rate": 1.2550636905415014e-06, "loss": 0.0065, "num_input_tokens_seen": 65685352, "step": 97445 }, { "epoch": 2.3807197127012434, "grad_norm": 0.5093609094619751, "learning_rate": 1.2549812326348662e-06, "loss": 0.0148, "num_input_tokens_seen": 65688488, "step": 97450 }, { "epoch": 2.3808418635330906, "grad_norm": 0.23147644102573395, "learning_rate": 1.2548987728739386e-06, "loss": 0.0004, "num_input_tokens_seen": 65692008, "step": 97455 }, { "epoch": 2.3809640143649378, "grad_norm": 24.48149871826172, "learning_rate": 1.2548163112593187e-06, "loss": 0.095, "num_input_tokens_seen": 65695464, "step": 97460 }, { "epoch": 2.381086165196785, "grad_norm": 0.06188417226076126, "learning_rate": 1.2547338477916058e-06, "loss": 0.0882, "num_input_tokens_seen": 65698728, "step": 97465 }, { "epoch": 2.381208316028632, "grad_norm": 0.04154938459396362, "learning_rate": 1.2546513824714e-06, "loss": 0.0001, "num_input_tokens_seen": 65701928, "step": 97470 }, { "epoch": 2.3813304668604793, "grad_norm": 0.083378367125988, "learning_rate": 1.2545689152993008e-06, "loss": 0.0002, "num_input_tokens_seen": 65705320, "step": 97475 }, { "epoch": 2.3814526176923265, "grad_norm": 0.017329366877675056, "learning_rate": 1.2544864462759083e-06, "loss": 0.0001, "num_input_tokens_seen": 65708904, "step": 97480 }, { "epoch": 2.3815747685241737, "grad_norm": 150.7119903564453, "learning_rate": 1.2544039754018213e-06, "loss": 0.0393, "num_input_tokens_seen": 65712296, "step": 97485 }, { "epoch": 2.381696919356021, "grad_norm": 3.2090935707092285, "learning_rate": 1.2543215026776406e-06, "loss": 0.0427, "num_input_tokens_seen": 65715880, "step": 97490 }, { "epoch": 2.381819070187868, "grad_norm": 0.0032223211601376534, "learning_rate": 1.2542390281039654e-06, "loss": 0.0008, "num_input_tokens_seen": 65719528, "step": 97495 }, { "epoch": 2.3819412210197153, "grad_norm": 64.20941162109375, "learning_rate": 1.254156551681396e-06, "loss": 0.04, "num_input_tokens_seen": 65722920, "step": 97500 }, { "epoch": 2.3820633718515625, "grad_norm": 0.18008512258529663, "learning_rate": 1.2540740734105313e-06, "loss": 0.0423, "num_input_tokens_seen": 65726376, "step": 97505 }, { "epoch": 2.3821855226834097, "grad_norm": 0.01593446172773838, "learning_rate": 1.2539915932919717e-06, "loss": 0.0005, "num_input_tokens_seen": 65729704, "step": 97510 }, { "epoch": 2.382307673515257, "grad_norm": 44.185508728027344, "learning_rate": 1.2539091113263172e-06, "loss": 0.0778, "num_input_tokens_seen": 65733160, "step": 97515 }, { "epoch": 2.3824298243471036, "grad_norm": 0.08736875653266907, "learning_rate": 1.2538266275141667e-06, "loss": 0.0717, "num_input_tokens_seen": 65736616, "step": 97520 }, { "epoch": 2.382551975178951, "grad_norm": 0.12912006676197052, "learning_rate": 1.2537441418561213e-06, "loss": 0.0003, "num_input_tokens_seen": 65739752, "step": 97525 }, { "epoch": 2.382674126010798, "grad_norm": 0.06348052620887756, "learning_rate": 1.25366165435278e-06, "loss": 0.0002, "num_input_tokens_seen": 65743144, "step": 97530 }, { "epoch": 2.382796276842645, "grad_norm": 0.16505594551563263, "learning_rate": 1.2535791650047428e-06, "loss": 0.0535, "num_input_tokens_seen": 65746344, "step": 97535 }, { "epoch": 2.3829184276744924, "grad_norm": 0.020926062017679214, "learning_rate": 1.25349667381261e-06, "loss": 0.0053, "num_input_tokens_seen": 65750568, "step": 97540 }, { "epoch": 2.3830405785063395, "grad_norm": 0.2248907834291458, "learning_rate": 1.2534141807769811e-06, "loss": 0.0004, "num_input_tokens_seen": 65753960, "step": 97545 }, { "epoch": 2.3831627293381867, "grad_norm": 0.0069709401577711105, "learning_rate": 1.253331685898456e-06, "loss": 0.0001, "num_input_tokens_seen": 65757160, "step": 97550 }, { "epoch": 2.383284880170034, "grad_norm": 0.34722745418548584, "learning_rate": 1.253249189177635e-06, "loss": 0.0002, "num_input_tokens_seen": 65760424, "step": 97555 }, { "epoch": 2.383407031001881, "grad_norm": 0.013133928179740906, "learning_rate": 1.2531666906151177e-06, "loss": 0.0003, "num_input_tokens_seen": 65763432, "step": 97560 }, { "epoch": 2.3835291818337283, "grad_norm": 0.01156538538634777, "learning_rate": 1.253084190211504e-06, "loss": 0.0001, "num_input_tokens_seen": 65766824, "step": 97565 }, { "epoch": 2.3836513326655755, "grad_norm": 0.005557337775826454, "learning_rate": 1.2530016879673942e-06, "loss": 0.0003, "num_input_tokens_seen": 65770024, "step": 97570 }, { "epoch": 2.3837734834974227, "grad_norm": 0.03268285468220711, "learning_rate": 1.252919183883388e-06, "loss": 0.0373, "num_input_tokens_seen": 65773544, "step": 97575 }, { "epoch": 2.38389563432927, "grad_norm": 37.92142105102539, "learning_rate": 1.252836677960085e-06, "loss": 0.0311, "num_input_tokens_seen": 65777256, "step": 97580 }, { "epoch": 2.384017785161117, "grad_norm": 0.02641063742339611, "learning_rate": 1.2527541701980861e-06, "loss": 0.0398, "num_input_tokens_seen": 65780392, "step": 97585 }, { "epoch": 2.3841399359929643, "grad_norm": 0.00245059747248888, "learning_rate": 1.2526716605979909e-06, "loss": 0.0601, "num_input_tokens_seen": 65783976, "step": 97590 }, { "epoch": 2.3842620868248114, "grad_norm": 0.020249001681804657, "learning_rate": 1.2525891491603995e-06, "loss": 0.0002, "num_input_tokens_seen": 65787304, "step": 97595 }, { "epoch": 2.3843842376566586, "grad_norm": 0.02965971827507019, "learning_rate": 1.2525066358859119e-06, "loss": 0.1335, "num_input_tokens_seen": 65791144, "step": 97600 }, { "epoch": 2.3845063884885054, "grad_norm": 0.004216925706714392, "learning_rate": 1.2524241207751278e-06, "loss": 0.0001, "num_input_tokens_seen": 65794792, "step": 97605 }, { "epoch": 2.384628539320353, "grad_norm": 0.0560830794274807, "learning_rate": 1.2523416038286478e-06, "loss": 0.0413, "num_input_tokens_seen": 65798184, "step": 97610 }, { "epoch": 2.3847506901521998, "grad_norm": 0.05167970433831215, "learning_rate": 1.2522590850470717e-06, "loss": 0.0003, "num_input_tokens_seen": 65801512, "step": 97615 }, { "epoch": 2.384872840984047, "grad_norm": 0.002556778723374009, "learning_rate": 1.2521765644309998e-06, "loss": 0.0397, "num_input_tokens_seen": 65804520, "step": 97620 }, { "epoch": 2.384994991815894, "grad_norm": 0.025669759139418602, "learning_rate": 1.252094041981032e-06, "loss": 0.0002, "num_input_tokens_seen": 65807848, "step": 97625 }, { "epoch": 2.3851171426477413, "grad_norm": 30.663978576660156, "learning_rate": 1.2520115176977686e-06, "loss": 0.0603, "num_input_tokens_seen": 65811304, "step": 97630 }, { "epoch": 2.3852392934795885, "grad_norm": 339.6863098144531, "learning_rate": 1.2519289915818096e-06, "loss": 0.0478, "num_input_tokens_seen": 65814632, "step": 97635 }, { "epoch": 2.3853614443114357, "grad_norm": 129.6995849609375, "learning_rate": 1.2518464636337552e-06, "loss": 0.0407, "num_input_tokens_seen": 65817896, "step": 97640 }, { "epoch": 2.385483595143283, "grad_norm": 0.0230780690908432, "learning_rate": 1.2517639338542056e-06, "loss": 0.0489, "num_input_tokens_seen": 65821288, "step": 97645 }, { "epoch": 2.38560574597513, "grad_norm": 0.1822880506515503, "learning_rate": 1.251681402243761e-06, "loss": 0.0002, "num_input_tokens_seen": 65824936, "step": 97650 }, { "epoch": 2.3857278968069773, "grad_norm": 0.02899126335978508, "learning_rate": 1.2515988688030217e-06, "loss": 0.1391, "num_input_tokens_seen": 65828072, "step": 97655 }, { "epoch": 2.3858500476388245, "grad_norm": 488.391845703125, "learning_rate": 1.2515163335325875e-06, "loss": 0.0664, "num_input_tokens_seen": 65831528, "step": 97660 }, { "epoch": 2.3859721984706717, "grad_norm": 0.3630223870277405, "learning_rate": 1.251433796433059e-06, "loss": 0.0578, "num_input_tokens_seen": 65834984, "step": 97665 }, { "epoch": 2.386094349302519, "grad_norm": 0.00430646538734436, "learning_rate": 1.2513512575050365e-06, "loss": 0.0484, "num_input_tokens_seen": 65838056, "step": 97670 }, { "epoch": 2.386216500134366, "grad_norm": 0.08162271231412888, "learning_rate": 1.2512687167491193e-06, "loss": 0.0519, "num_input_tokens_seen": 65841320, "step": 97675 }, { "epoch": 2.3863386509662132, "grad_norm": 0.004340833052992821, "learning_rate": 1.2511861741659092e-06, "loss": 0.0002, "num_input_tokens_seen": 65845288, "step": 97680 }, { "epoch": 2.3864608017980604, "grad_norm": 0.03369579836726189, "learning_rate": 1.2511036297560054e-06, "loss": 0.0007, "num_input_tokens_seen": 65848680, "step": 97685 }, { "epoch": 2.386582952629907, "grad_norm": 0.15540599822998047, "learning_rate": 1.2510210835200082e-06, "loss": 0.0004, "num_input_tokens_seen": 65851944, "step": 97690 }, { "epoch": 2.386705103461755, "grad_norm": 0.08113245666027069, "learning_rate": 1.2509385354585187e-06, "loss": 0.0586, "num_input_tokens_seen": 65854952, "step": 97695 }, { "epoch": 2.3868272542936015, "grad_norm": 0.09177146852016449, "learning_rate": 1.2508559855721363e-06, "loss": 0.0007, "num_input_tokens_seen": 65858216, "step": 97700 }, { "epoch": 2.3869494051254487, "grad_norm": 0.0013687072787433863, "learning_rate": 1.250773433861462e-06, "loss": 0.0336, "num_input_tokens_seen": 65861352, "step": 97705 }, { "epoch": 2.387071555957296, "grad_norm": 0.0910479724407196, "learning_rate": 1.2506908803270954e-06, "loss": 0.0408, "num_input_tokens_seen": 65864616, "step": 97710 }, { "epoch": 2.387193706789143, "grad_norm": 0.05656975135207176, "learning_rate": 1.2506083249696374e-06, "loss": 0.0982, "num_input_tokens_seen": 65867816, "step": 97715 }, { "epoch": 2.3873158576209903, "grad_norm": 0.009938366711139679, "learning_rate": 1.2505257677896887e-06, "loss": 0.0005, "num_input_tokens_seen": 65871272, "step": 97720 }, { "epoch": 2.3874380084528375, "grad_norm": 0.011079080402851105, "learning_rate": 1.250443208787849e-06, "loss": 0.0002, "num_input_tokens_seen": 65874728, "step": 97725 }, { "epoch": 2.3875601592846847, "grad_norm": 0.027049781754612923, "learning_rate": 1.2503606479647189e-06, "loss": 0.0001, "num_input_tokens_seen": 65877928, "step": 97730 }, { "epoch": 2.387682310116532, "grad_norm": 35.81193923950195, "learning_rate": 1.2502780853208986e-06, "loss": 0.1042, "num_input_tokens_seen": 65881768, "step": 97735 }, { "epoch": 2.387804460948379, "grad_norm": 93.66885375976562, "learning_rate": 1.2501955208569887e-06, "loss": 0.0326, "num_input_tokens_seen": 65885032, "step": 97740 }, { "epoch": 2.3879266117802262, "grad_norm": 0.7266661524772644, "learning_rate": 1.25011295457359e-06, "loss": 0.0472, "num_input_tokens_seen": 65888552, "step": 97745 }, { "epoch": 2.3880487626120734, "grad_norm": 0.011747072450816631, "learning_rate": 1.2500303864713027e-06, "loss": 0.0001, "num_input_tokens_seen": 65891560, "step": 97750 }, { "epoch": 2.3881709134439206, "grad_norm": 24.473697662353516, "learning_rate": 1.249947816550727e-06, "loss": 0.0461, "num_input_tokens_seen": 65894760, "step": 97755 }, { "epoch": 2.388293064275768, "grad_norm": 0.13327303528785706, "learning_rate": 1.2498652448124634e-06, "loss": 0.0464, "num_input_tokens_seen": 65897832, "step": 97760 }, { "epoch": 2.388415215107615, "grad_norm": 0.055127665400505066, "learning_rate": 1.2497826712571126e-06, "loss": 0.0001, "num_input_tokens_seen": 65900840, "step": 97765 }, { "epoch": 2.388537365939462, "grad_norm": 0.11277513951063156, "learning_rate": 1.2497000958852753e-06, "loss": 0.0001, "num_input_tokens_seen": 65904104, "step": 97770 }, { "epoch": 2.3886595167713094, "grad_norm": 0.03402625396847725, "learning_rate": 1.2496175186975514e-06, "loss": 0.0938, "num_input_tokens_seen": 65907560, "step": 97775 }, { "epoch": 2.3887816676031566, "grad_norm": 0.017354309558868408, "learning_rate": 1.249534939694542e-06, "loss": 0.0753, "num_input_tokens_seen": 65910760, "step": 97780 }, { "epoch": 2.3889038184350033, "grad_norm": 0.1345100700855255, "learning_rate": 1.2494523588768473e-06, "loss": 0.0403, "num_input_tokens_seen": 65914152, "step": 97785 }, { "epoch": 2.3890259692668505, "grad_norm": 0.025590751320123672, "learning_rate": 1.2493697762450681e-06, "loss": 0.0392, "num_input_tokens_seen": 65917672, "step": 97790 }, { "epoch": 2.3891481200986977, "grad_norm": 0.5286394357681274, "learning_rate": 1.2492871917998048e-06, "loss": 0.0004, "num_input_tokens_seen": 65920808, "step": 97795 }, { "epoch": 2.389270270930545, "grad_norm": 112.20993041992188, "learning_rate": 1.2492046055416576e-06, "loss": 0.0705, "num_input_tokens_seen": 65923944, "step": 97800 }, { "epoch": 2.389392421762392, "grad_norm": 0.00426692608743906, "learning_rate": 1.249122017471228e-06, "loss": 0.0001, "num_input_tokens_seen": 65927272, "step": 97805 }, { "epoch": 2.3895145725942393, "grad_norm": 0.056970562785863876, "learning_rate": 1.2490394275891159e-06, "loss": 0.0001, "num_input_tokens_seen": 65930280, "step": 97810 }, { "epoch": 2.3896367234260865, "grad_norm": 0.036483284085989, "learning_rate": 1.248956835895922e-06, "loss": 0.0004, "num_input_tokens_seen": 65933736, "step": 97815 }, { "epoch": 2.3897588742579337, "grad_norm": 54.07392120361328, "learning_rate": 1.2488742423922472e-06, "loss": 0.1203, "num_input_tokens_seen": 65937384, "step": 97820 }, { "epoch": 2.389881025089781, "grad_norm": 0.02449539117515087, "learning_rate": 1.2487916470786916e-06, "loss": 0.0294, "num_input_tokens_seen": 65941032, "step": 97825 }, { "epoch": 2.390003175921628, "grad_norm": 0.036588121205568314, "learning_rate": 1.2487090499558563e-06, "loss": 0.0692, "num_input_tokens_seen": 65945192, "step": 97830 }, { "epoch": 2.390125326753475, "grad_norm": 1.6816258430480957, "learning_rate": 1.248626451024342e-06, "loss": 0.0003, "num_input_tokens_seen": 65948008, "step": 97835 }, { "epoch": 2.3902474775853224, "grad_norm": 0.0152506772428751, "learning_rate": 1.2485438502847494e-06, "loss": 0.0314, "num_input_tokens_seen": 65951016, "step": 97840 }, { "epoch": 2.3903696284171696, "grad_norm": 0.0030578149016946554, "learning_rate": 1.248461247737679e-06, "loss": 0.0001, "num_input_tokens_seen": 65954600, "step": 97845 }, { "epoch": 2.390491779249017, "grad_norm": 0.012720318511128426, "learning_rate": 1.2483786433837319e-06, "loss": 0.0002, "num_input_tokens_seen": 65957864, "step": 97850 }, { "epoch": 2.390613930080864, "grad_norm": 0.03128203749656677, "learning_rate": 1.2482960372235082e-06, "loss": 0.1456, "num_input_tokens_seen": 65961320, "step": 97855 }, { "epoch": 2.390736080912711, "grad_norm": 0.0449058972299099, "learning_rate": 1.2482134292576088e-06, "loss": 0.0886, "num_input_tokens_seen": 65964584, "step": 97860 }, { "epoch": 2.3908582317445584, "grad_norm": 0.036220669746398926, "learning_rate": 1.2481308194866347e-06, "loss": 0.0002, "num_input_tokens_seen": 65967656, "step": 97865 }, { "epoch": 2.390980382576405, "grad_norm": 0.45427876710891724, "learning_rate": 1.2480482079111864e-06, "loss": 0.0497, "num_input_tokens_seen": 65970728, "step": 97870 }, { "epoch": 2.3911025334082527, "grad_norm": 0.7156953811645508, "learning_rate": 1.2479655945318652e-06, "loss": 0.0006, "num_input_tokens_seen": 65974312, "step": 97875 }, { "epoch": 2.3912246842400995, "grad_norm": 0.025812193751335144, "learning_rate": 1.2478829793492712e-06, "loss": 0.0443, "num_input_tokens_seen": 65977768, "step": 97880 }, { "epoch": 2.3913468350719467, "grad_norm": 0.11862697452306747, "learning_rate": 1.2478003623640056e-06, "loss": 0.0368, "num_input_tokens_seen": 65981096, "step": 97885 }, { "epoch": 2.391468985903794, "grad_norm": 0.016623547300696373, "learning_rate": 1.2477177435766687e-06, "loss": 0.0628, "num_input_tokens_seen": 65984488, "step": 97890 }, { "epoch": 2.391591136735641, "grad_norm": 0.08937716484069824, "learning_rate": 1.2476351229878624e-06, "loss": 0.0001, "num_input_tokens_seen": 65988008, "step": 97895 }, { "epoch": 2.3917132875674882, "grad_norm": 0.01355107594281435, "learning_rate": 1.2475525005981867e-06, "loss": 0.0003, "num_input_tokens_seen": 65991016, "step": 97900 }, { "epoch": 2.3918354383993354, "grad_norm": 0.031602438539266586, "learning_rate": 1.2474698764082423e-06, "loss": 0.001, "num_input_tokens_seen": 65994408, "step": 97905 }, { "epoch": 2.3919575892311826, "grad_norm": 0.09495049715042114, "learning_rate": 1.2473872504186306e-06, "loss": 0.0003, "num_input_tokens_seen": 65997800, "step": 97910 }, { "epoch": 2.39207974006303, "grad_norm": 0.06340105086565018, "learning_rate": 1.2473046226299523e-06, "loss": 0.0003, "num_input_tokens_seen": 66001512, "step": 97915 }, { "epoch": 2.392201890894877, "grad_norm": 0.6001046299934387, "learning_rate": 1.2472219930428086e-06, "loss": 0.0561, "num_input_tokens_seen": 66004584, "step": 97920 }, { "epoch": 2.392324041726724, "grad_norm": 0.11289553344249725, "learning_rate": 1.2471393616577995e-06, "loss": 0.0565, "num_input_tokens_seen": 66008104, "step": 97925 }, { "epoch": 2.3924461925585714, "grad_norm": 0.6755673289299011, "learning_rate": 1.2470567284755267e-06, "loss": 0.0005, "num_input_tokens_seen": 66011496, "step": 97930 }, { "epoch": 2.3925683433904186, "grad_norm": 0.03471721336245537, "learning_rate": 1.246974093496591e-06, "loss": 0.0514, "num_input_tokens_seen": 66014824, "step": 97935 }, { "epoch": 2.3926904942222658, "grad_norm": 0.019346168264746666, "learning_rate": 1.2468914567215933e-06, "loss": 0.0001, "num_input_tokens_seen": 66018344, "step": 97940 }, { "epoch": 2.392812645054113, "grad_norm": 0.0018133146222680807, "learning_rate": 1.2468088181511345e-06, "loss": 0.0001, "num_input_tokens_seen": 66021160, "step": 97945 }, { "epoch": 2.39293479588596, "grad_norm": 0.002835739403963089, "learning_rate": 1.2467261777858156e-06, "loss": 0.0004, "num_input_tokens_seen": 66024616, "step": 97950 }, { "epoch": 2.3930569467178073, "grad_norm": 0.01715417392551899, "learning_rate": 1.2466435356262372e-06, "loss": 0.0239, "num_input_tokens_seen": 66027944, "step": 97955 }, { "epoch": 2.3931790975496545, "grad_norm": 11.423762321472168, "learning_rate": 1.246560891673001e-06, "loss": 0.1521, "num_input_tokens_seen": 66031080, "step": 97960 }, { "epoch": 2.3933012483815013, "grad_norm": 0.007263594772666693, "learning_rate": 1.2464782459267078e-06, "loss": 0.088, "num_input_tokens_seen": 66035432, "step": 97965 }, { "epoch": 2.3934233992133485, "grad_norm": 0.10601934045553207, "learning_rate": 1.2463955983879584e-06, "loss": 0.0542, "num_input_tokens_seen": 66038440, "step": 97970 }, { "epoch": 2.3935455500451956, "grad_norm": 0.0238367710262537, "learning_rate": 1.2463129490573538e-06, "loss": 0.0005, "num_input_tokens_seen": 66041512, "step": 97975 }, { "epoch": 2.393667700877043, "grad_norm": 0.04439342021942139, "learning_rate": 1.2462302979354955e-06, "loss": 0.0001, "num_input_tokens_seen": 66044840, "step": 97980 }, { "epoch": 2.39378985170889, "grad_norm": 132.5022735595703, "learning_rate": 1.2461476450229838e-06, "loss": 0.057, "num_input_tokens_seen": 66048296, "step": 97985 }, { "epoch": 2.393912002540737, "grad_norm": 0.003174891695380211, "learning_rate": 1.2460649903204204e-06, "loss": 0.0002, "num_input_tokens_seen": 66051496, "step": 97990 }, { "epoch": 2.3940341533725844, "grad_norm": 0.10849446803331375, "learning_rate": 1.245982333828406e-06, "loss": 0.0002, "num_input_tokens_seen": 66055400, "step": 97995 }, { "epoch": 2.3941563042044316, "grad_norm": 25.214004516601562, "learning_rate": 1.2458996755475424e-06, "loss": 0.1055, "num_input_tokens_seen": 66058728, "step": 98000 }, { "epoch": 2.394278455036279, "grad_norm": 0.28973275423049927, "learning_rate": 1.24581701547843e-06, "loss": 0.0003, "num_input_tokens_seen": 66061992, "step": 98005 }, { "epoch": 2.394400605868126, "grad_norm": 0.06445897370576859, "learning_rate": 1.24573435362167e-06, "loss": 0.0459, "num_input_tokens_seen": 66065704, "step": 98010 }, { "epoch": 2.394522756699973, "grad_norm": 0.02486269921064377, "learning_rate": 1.245651689977864e-06, "loss": 0.0003, "num_input_tokens_seen": 66069032, "step": 98015 }, { "epoch": 2.3946449075318204, "grad_norm": 0.20119556784629822, "learning_rate": 1.2455690245476126e-06, "loss": 0.0468, "num_input_tokens_seen": 66071912, "step": 98020 }, { "epoch": 2.3947670583636675, "grad_norm": 0.03414197266101837, "learning_rate": 1.2454863573315174e-06, "loss": 0.0004, "num_input_tokens_seen": 66075432, "step": 98025 }, { "epoch": 2.3948892091955147, "grad_norm": 0.08597314357757568, "learning_rate": 1.245403688330179e-06, "loss": 0.0001, "num_input_tokens_seen": 66078696, "step": 98030 }, { "epoch": 2.395011360027362, "grad_norm": 0.6161647439002991, "learning_rate": 1.2453210175441993e-06, "loss": 0.1373, "num_input_tokens_seen": 66082408, "step": 98035 }, { "epoch": 2.395133510859209, "grad_norm": 0.04381314292550087, "learning_rate": 1.245238344974179e-06, "loss": 0.0438, "num_input_tokens_seen": 66085416, "step": 98040 }, { "epoch": 2.3952556616910563, "grad_norm": 0.03218008577823639, "learning_rate": 1.2451556706207194e-06, "loss": 0.0366, "num_input_tokens_seen": 66088936, "step": 98045 }, { "epoch": 2.395377812522903, "grad_norm": 0.002943948609754443, "learning_rate": 1.245072994484422e-06, "loss": 0.0444, "num_input_tokens_seen": 66092648, "step": 98050 }, { "epoch": 2.3954999633547507, "grad_norm": 0.1787538379430771, "learning_rate": 1.2449903165658879e-06, "loss": 0.0002, "num_input_tokens_seen": 66096296, "step": 98055 }, { "epoch": 2.3956221141865974, "grad_norm": 0.030882876366376877, "learning_rate": 1.2449076368657184e-06, "loss": 0.0005, "num_input_tokens_seen": 66100264, "step": 98060 }, { "epoch": 2.3957442650184446, "grad_norm": 36.35651397705078, "learning_rate": 1.2448249553845146e-06, "loss": 0.1342, "num_input_tokens_seen": 66103592, "step": 98065 }, { "epoch": 2.395866415850292, "grad_norm": 0.0072379345074296, "learning_rate": 1.2447422721228777e-06, "loss": 0.0447, "num_input_tokens_seen": 66107048, "step": 98070 }, { "epoch": 2.395988566682139, "grad_norm": 0.1984623819589615, "learning_rate": 1.2446595870814096e-06, "loss": 0.043, "num_input_tokens_seen": 66110440, "step": 98075 }, { "epoch": 2.396110717513986, "grad_norm": 0.0784531831741333, "learning_rate": 1.2445769002607108e-06, "loss": 0.0005, "num_input_tokens_seen": 66113448, "step": 98080 }, { "epoch": 2.3962328683458334, "grad_norm": 107.5459213256836, "learning_rate": 1.244494211661383e-06, "loss": 0.0386, "num_input_tokens_seen": 66116776, "step": 98085 }, { "epoch": 2.3963550191776806, "grad_norm": 0.011617016047239304, "learning_rate": 1.2444115212840276e-06, "loss": 0.0002, "num_input_tokens_seen": 66119976, "step": 98090 }, { "epoch": 2.3964771700095278, "grad_norm": 0.014332075603306293, "learning_rate": 1.244328829129246e-06, "loss": 0.1071, "num_input_tokens_seen": 66123496, "step": 98095 }, { "epoch": 2.396599320841375, "grad_norm": 0.0055480338633060455, "learning_rate": 1.2442461351976395e-06, "loss": 0.0257, "num_input_tokens_seen": 66127592, "step": 98100 }, { "epoch": 2.396721471673222, "grad_norm": 0.02290935628116131, "learning_rate": 1.244163439489809e-06, "loss": 0.0559, "num_input_tokens_seen": 66131048, "step": 98105 }, { "epoch": 2.3968436225050693, "grad_norm": 0.0215010903775692, "learning_rate": 1.2440807420063565e-06, "loss": 0.0539, "num_input_tokens_seen": 66134376, "step": 98110 }, { "epoch": 2.3969657733369165, "grad_norm": 0.0604538656771183, "learning_rate": 1.2439980427478833e-06, "loss": 0.0912, "num_input_tokens_seen": 66137448, "step": 98115 }, { "epoch": 2.3970879241687637, "grad_norm": 0.04391892999410629, "learning_rate": 1.2439153417149908e-06, "loss": 0.0001, "num_input_tokens_seen": 66141160, "step": 98120 }, { "epoch": 2.397210075000611, "grad_norm": 17.283260345458984, "learning_rate": 1.2438326389082803e-06, "loss": 0.0504, "num_input_tokens_seen": 66144424, "step": 98125 }, { "epoch": 2.397332225832458, "grad_norm": 0.017710374668240547, "learning_rate": 1.243749934328353e-06, "loss": 0.0387, "num_input_tokens_seen": 66148136, "step": 98130 }, { "epoch": 2.3974543766643053, "grad_norm": 37.92856979370117, "learning_rate": 1.2436672279758108e-06, "loss": 0.0435, "num_input_tokens_seen": 66151208, "step": 98135 }, { "epoch": 2.3975765274961525, "grad_norm": 34.837039947509766, "learning_rate": 1.2435845198512547e-06, "loss": 0.063, "num_input_tokens_seen": 66154728, "step": 98140 }, { "epoch": 2.397698678327999, "grad_norm": 0.16700446605682373, "learning_rate": 1.2435018099552867e-06, "loss": 0.0013, "num_input_tokens_seen": 66157992, "step": 98145 }, { "epoch": 2.3978208291598464, "grad_norm": 0.26901018619537354, "learning_rate": 1.2434190982885082e-06, "loss": 0.0432, "num_input_tokens_seen": 66161640, "step": 98150 }, { "epoch": 2.3979429799916936, "grad_norm": 0.2465468794107437, "learning_rate": 1.2433363848515204e-06, "loss": 0.0003, "num_input_tokens_seen": 66164840, "step": 98155 }, { "epoch": 2.398065130823541, "grad_norm": 0.008369375951588154, "learning_rate": 1.243253669644925e-06, "loss": 0.0707, "num_input_tokens_seen": 66168296, "step": 98160 }, { "epoch": 2.398187281655388, "grad_norm": 0.007154883351176977, "learning_rate": 1.2431709526693234e-06, "loss": 0.0504, "num_input_tokens_seen": 66171624, "step": 98165 }, { "epoch": 2.398309432487235, "grad_norm": 0.47677215933799744, "learning_rate": 1.2430882339253172e-06, "loss": 0.0005, "num_input_tokens_seen": 66175080, "step": 98170 }, { "epoch": 2.3984315833190823, "grad_norm": 0.022384552285075188, "learning_rate": 1.243005513413508e-06, "loss": 0.0003, "num_input_tokens_seen": 66178536, "step": 98175 }, { "epoch": 2.3985537341509295, "grad_norm": 0.0022679297253489494, "learning_rate": 1.2429227911344976e-06, "loss": 0.0017, "num_input_tokens_seen": 66181928, "step": 98180 }, { "epoch": 2.3986758849827767, "grad_norm": 0.11913125962018967, "learning_rate": 1.242840067088887e-06, "loss": 0.0002, "num_input_tokens_seen": 66185192, "step": 98185 }, { "epoch": 2.398798035814624, "grad_norm": 0.15732306241989136, "learning_rate": 1.2427573412772783e-06, "loss": 0.0319, "num_input_tokens_seen": 66188456, "step": 98190 }, { "epoch": 2.398920186646471, "grad_norm": 0.07400466501712799, "learning_rate": 1.2426746137002727e-06, "loss": 0.0581, "num_input_tokens_seen": 66191784, "step": 98195 }, { "epoch": 2.3990423374783183, "grad_norm": 0.3544338047504425, "learning_rate": 1.2425918843584721e-06, "loss": 0.0005, "num_input_tokens_seen": 66194984, "step": 98200 }, { "epoch": 2.3991644883101655, "grad_norm": 0.22692422568798065, "learning_rate": 1.2425091532524783e-06, "loss": 0.0467, "num_input_tokens_seen": 66198120, "step": 98205 }, { "epoch": 2.3992866391420127, "grad_norm": 0.007906322367489338, "learning_rate": 1.2424264203828924e-06, "loss": 0.0259, "num_input_tokens_seen": 66201832, "step": 98210 }, { "epoch": 2.39940878997386, "grad_norm": 0.015309485606849194, "learning_rate": 1.2423436857503167e-06, "loss": 0.0009, "num_input_tokens_seen": 66205480, "step": 98215 }, { "epoch": 2.399530940805707, "grad_norm": 0.14849808812141418, "learning_rate": 1.2422609493553522e-06, "loss": 0.0456, "num_input_tokens_seen": 66208616, "step": 98220 }, { "epoch": 2.3996530916375542, "grad_norm": 0.013945171609520912, "learning_rate": 1.2421782111986013e-06, "loss": 0.0009, "num_input_tokens_seen": 66212200, "step": 98225 }, { "epoch": 2.399775242469401, "grad_norm": 18.396347045898438, "learning_rate": 1.2420954712806653e-06, "loss": 0.0394, "num_input_tokens_seen": 66215400, "step": 98230 }, { "epoch": 2.3998973933012486, "grad_norm": 0.014513997361063957, "learning_rate": 1.2420127296021454e-06, "loss": 0.0604, "num_input_tokens_seen": 66218664, "step": 98235 }, { "epoch": 2.4000195441330954, "grad_norm": 0.3597651422023773, "learning_rate": 1.241929986163644e-06, "loss": 0.0012, "num_input_tokens_seen": 66221672, "step": 98240 }, { "epoch": 2.4001416949649426, "grad_norm": 0.2162315994501114, "learning_rate": 1.241847240965763e-06, "loss": 0.0004, "num_input_tokens_seen": 66225064, "step": 98245 }, { "epoch": 2.4002638457967898, "grad_norm": 0.2530384659767151, "learning_rate": 1.2417644940091036e-06, "loss": 0.0325, "num_input_tokens_seen": 66228200, "step": 98250 }, { "epoch": 2.400385996628637, "grad_norm": 0.8703799247741699, "learning_rate": 1.2416817452942678e-06, "loss": 0.0371, "num_input_tokens_seen": 66231464, "step": 98255 }, { "epoch": 2.400508147460484, "grad_norm": 0.13068102300167084, "learning_rate": 1.2415989948218575e-06, "loss": 0.0214, "num_input_tokens_seen": 66234984, "step": 98260 }, { "epoch": 2.4006302982923313, "grad_norm": 0.002861426677554846, "learning_rate": 1.2415162425924739e-06, "loss": 0.0574, "num_input_tokens_seen": 66238504, "step": 98265 }, { "epoch": 2.4007524491241785, "grad_norm": 0.16854959726333618, "learning_rate": 1.2414334886067196e-06, "loss": 0.0002, "num_input_tokens_seen": 66241640, "step": 98270 }, { "epoch": 2.4008745999560257, "grad_norm": 0.36802446842193604, "learning_rate": 1.241350732865196e-06, "loss": 0.0002, "num_input_tokens_seen": 66245096, "step": 98275 }, { "epoch": 2.400996750787873, "grad_norm": 0.13454042375087738, "learning_rate": 1.241267975368505e-06, "loss": 0.0002, "num_input_tokens_seen": 66248296, "step": 98280 }, { "epoch": 2.40111890161972, "grad_norm": 0.5747636556625366, "learning_rate": 1.2411852161172482e-06, "loss": 0.0999, "num_input_tokens_seen": 66251752, "step": 98285 }, { "epoch": 2.4012410524515673, "grad_norm": 0.11840732395648956, "learning_rate": 1.2411024551120277e-06, "loss": 0.0002, "num_input_tokens_seen": 66255144, "step": 98290 }, { "epoch": 2.4013632032834145, "grad_norm": 0.0020689095836132765, "learning_rate": 1.2410196923534454e-06, "loss": 0.0001, "num_input_tokens_seen": 66258664, "step": 98295 }, { "epoch": 2.4014853541152617, "grad_norm": 0.011177745647728443, "learning_rate": 1.2409369278421026e-06, "loss": 0.0864, "num_input_tokens_seen": 66262312, "step": 98300 }, { "epoch": 2.401607504947109, "grad_norm": 0.0040663969703018665, "learning_rate": 1.2408541615786022e-06, "loss": 0.0399, "num_input_tokens_seen": 66265832, "step": 98305 }, { "epoch": 2.401729655778956, "grad_norm": 0.03280526027083397, "learning_rate": 1.2407713935635453e-06, "loss": 0.0001, "num_input_tokens_seen": 66268968, "step": 98310 }, { "epoch": 2.4018518066108028, "grad_norm": 0.04107336327433586, "learning_rate": 1.2406886237975342e-06, "loss": 0.0456, "num_input_tokens_seen": 66272104, "step": 98315 }, { "epoch": 2.4019739574426504, "grad_norm": 0.2942955493927002, "learning_rate": 1.240605852281171e-06, "loss": 0.0015, "num_input_tokens_seen": 66275752, "step": 98320 }, { "epoch": 2.402096108274497, "grad_norm": 16.743667602539062, "learning_rate": 1.2405230790150566e-06, "loss": 0.0334, "num_input_tokens_seen": 66279144, "step": 98325 }, { "epoch": 2.4022182591063443, "grad_norm": 0.021431000903248787, "learning_rate": 1.240440303999794e-06, "loss": 0.1057, "num_input_tokens_seen": 66282408, "step": 98330 }, { "epoch": 2.4023404099381915, "grad_norm": 0.13238425552845, "learning_rate": 1.2403575272359853e-06, "loss": 0.0004, "num_input_tokens_seen": 66285864, "step": 98335 }, { "epoch": 2.4024625607700387, "grad_norm": 0.05844270810484886, "learning_rate": 1.2402747487242313e-06, "loss": 0.0415, "num_input_tokens_seen": 66288872, "step": 98340 }, { "epoch": 2.402584711601886, "grad_norm": 0.12406209856271744, "learning_rate": 1.240191968465135e-06, "loss": 0.0706, "num_input_tokens_seen": 66292264, "step": 98345 }, { "epoch": 2.402706862433733, "grad_norm": 0.0026762220077216625, "learning_rate": 1.2401091864592984e-06, "loss": 0.0002, "num_input_tokens_seen": 66295592, "step": 98350 }, { "epoch": 2.4028290132655803, "grad_norm": 0.05921659618616104, "learning_rate": 1.2400264027073227e-06, "loss": 0.0002, "num_input_tokens_seen": 66298792, "step": 98355 }, { "epoch": 2.4029511640974275, "grad_norm": 0.0028167401906102896, "learning_rate": 1.2399436172098106e-06, "loss": 0.0626, "num_input_tokens_seen": 66302312, "step": 98360 }, { "epoch": 2.4030733149292747, "grad_norm": 0.003555183531716466, "learning_rate": 1.239860829967364e-06, "loss": 0.0002, "num_input_tokens_seen": 66305704, "step": 98365 }, { "epoch": 2.403195465761122, "grad_norm": 0.003496664110571146, "learning_rate": 1.239778040980585e-06, "loss": 0.0761, "num_input_tokens_seen": 66309800, "step": 98370 }, { "epoch": 2.403317616592969, "grad_norm": 19.096784591674805, "learning_rate": 1.2396952502500756e-06, "loss": 0.0349, "num_input_tokens_seen": 66313064, "step": 98375 }, { "epoch": 2.4034397674248162, "grad_norm": 0.007529269903898239, "learning_rate": 1.2396124577764378e-06, "loss": 0.0001, "num_input_tokens_seen": 66316264, "step": 98380 }, { "epoch": 2.4035619182566634, "grad_norm": 0.011298858560621738, "learning_rate": 1.239529663560274e-06, "loss": 0.0003, "num_input_tokens_seen": 66319656, "step": 98385 }, { "epoch": 2.4036840690885106, "grad_norm": 12.470608711242676, "learning_rate": 1.2394468676021856e-06, "loss": 0.0759, "num_input_tokens_seen": 66323112, "step": 98390 }, { "epoch": 2.403806219920358, "grad_norm": 0.0077195316553115845, "learning_rate": 1.2393640699027757e-06, "loss": 0.0001, "num_input_tokens_seen": 66326376, "step": 98395 }, { "epoch": 2.403928370752205, "grad_norm": 20.43804359436035, "learning_rate": 1.2392812704626453e-06, "loss": 0.0869, "num_input_tokens_seen": 66329832, "step": 98400 }, { "epoch": 2.404050521584052, "grad_norm": 0.029997482895851135, "learning_rate": 1.2391984692823976e-06, "loss": 0.0004, "num_input_tokens_seen": 66333160, "step": 98405 }, { "epoch": 2.404172672415899, "grad_norm": 16.685693740844727, "learning_rate": 1.2391156663626343e-06, "loss": 0.0592, "num_input_tokens_seen": 66336104, "step": 98410 }, { "epoch": 2.404294823247746, "grad_norm": 0.04722894728183746, "learning_rate": 1.2390328617039574e-06, "loss": 0.0003, "num_input_tokens_seen": 66339560, "step": 98415 }, { "epoch": 2.4044169740795933, "grad_norm": 0.03532855212688446, "learning_rate": 1.238950055306969e-06, "loss": 0.0004, "num_input_tokens_seen": 66342888, "step": 98420 }, { "epoch": 2.4045391249114405, "grad_norm": 0.1769874095916748, "learning_rate": 1.2388672471722719e-06, "loss": 0.0005, "num_input_tokens_seen": 66346344, "step": 98425 }, { "epoch": 2.4046612757432877, "grad_norm": 0.003971653990447521, "learning_rate": 1.238784437300468e-06, "loss": 0.0001, "num_input_tokens_seen": 66349480, "step": 98430 }, { "epoch": 2.404783426575135, "grad_norm": 0.07422498613595963, "learning_rate": 1.2387016256921593e-06, "loss": 0.0003, "num_input_tokens_seen": 66353256, "step": 98435 }, { "epoch": 2.404905577406982, "grad_norm": 18.69518280029297, "learning_rate": 1.2386188123479482e-06, "loss": 0.0448, "num_input_tokens_seen": 66356520, "step": 98440 }, { "epoch": 2.4050277282388293, "grad_norm": 1.4093151092529297, "learning_rate": 1.238535997268437e-06, "loss": 0.03, "num_input_tokens_seen": 66359912, "step": 98445 }, { "epoch": 2.4051498790706765, "grad_norm": 0.47100839018821716, "learning_rate": 1.2384531804542272e-06, "loss": 0.0022, "num_input_tokens_seen": 66363176, "step": 98450 }, { "epoch": 2.4052720299025236, "grad_norm": 0.013687855564057827, "learning_rate": 1.2383703619059225e-06, "loss": 0.0555, "num_input_tokens_seen": 66366504, "step": 98455 }, { "epoch": 2.405394180734371, "grad_norm": 13.551589965820312, "learning_rate": 1.238287541624124e-06, "loss": 0.0763, "num_input_tokens_seen": 66370024, "step": 98460 }, { "epoch": 2.405516331566218, "grad_norm": 0.012431265786290169, "learning_rate": 1.2382047196094348e-06, "loss": 0.0281, "num_input_tokens_seen": 66373096, "step": 98465 }, { "epoch": 2.405638482398065, "grad_norm": 0.04757985472679138, "learning_rate": 1.2381218958624565e-06, "loss": 0.0863, "num_input_tokens_seen": 66376168, "step": 98470 }, { "epoch": 2.4057606332299124, "grad_norm": 0.23747007548809052, "learning_rate": 1.238039070383792e-06, "loss": 0.0351, "num_input_tokens_seen": 66379688, "step": 98475 }, { "epoch": 2.4058827840617596, "grad_norm": 0.014675687067210674, "learning_rate": 1.237956243174043e-06, "loss": 0.0002, "num_input_tokens_seen": 66383016, "step": 98480 }, { "epoch": 2.406004934893607, "grad_norm": 0.0017657901626080275, "learning_rate": 1.2378734142338126e-06, "loss": 0.0001, "num_input_tokens_seen": 66386216, "step": 98485 }, { "epoch": 2.406127085725454, "grad_norm": 0.28223177790641785, "learning_rate": 1.2377905835637024e-06, "loss": 0.0553, "num_input_tokens_seen": 66389608, "step": 98490 }, { "epoch": 2.4062492365573007, "grad_norm": 0.2891443371772766, "learning_rate": 1.2377077511643152e-06, "loss": 0.0818, "num_input_tokens_seen": 66392552, "step": 98495 }, { "epoch": 2.4063713873891484, "grad_norm": 0.014545445330440998, "learning_rate": 1.2376249170362533e-06, "loss": 0.0648, "num_input_tokens_seen": 66395624, "step": 98500 }, { "epoch": 2.406493538220995, "grad_norm": 466.7126159667969, "learning_rate": 1.237542081180119e-06, "loss": 0.0115, "num_input_tokens_seen": 66399144, "step": 98505 }, { "epoch": 2.4066156890528423, "grad_norm": 0.0069570522755384445, "learning_rate": 1.2374592435965152e-06, "loss": 0.0002, "num_input_tokens_seen": 66402472, "step": 98510 }, { "epoch": 2.4067378398846895, "grad_norm": 169.1098175048828, "learning_rate": 1.2373764042860434e-06, "loss": 0.0815, "num_input_tokens_seen": 66406056, "step": 98515 }, { "epoch": 2.4068599907165367, "grad_norm": 2.4282772541046143, "learning_rate": 1.2372935632493068e-06, "loss": 0.0011, "num_input_tokens_seen": 66409640, "step": 98520 }, { "epoch": 2.406982141548384, "grad_norm": 62.906150817871094, "learning_rate": 1.2372107204869075e-06, "loss": 0.0797, "num_input_tokens_seen": 66412776, "step": 98525 }, { "epoch": 2.407104292380231, "grad_norm": 0.0374017059803009, "learning_rate": 1.237127875999448e-06, "loss": 0.1063, "num_input_tokens_seen": 66416424, "step": 98530 }, { "epoch": 2.4072264432120782, "grad_norm": 1.4092093706130981, "learning_rate": 1.2370450297875312e-06, "loss": 0.0633, "num_input_tokens_seen": 66419880, "step": 98535 }, { "epoch": 2.4073485940439254, "grad_norm": 0.13621792197227478, "learning_rate": 1.236962181851759e-06, "loss": 0.0002, "num_input_tokens_seen": 66423080, "step": 98540 }, { "epoch": 2.4074707448757726, "grad_norm": 0.03624136745929718, "learning_rate": 1.2368793321927338e-06, "loss": 0.0003, "num_input_tokens_seen": 66426088, "step": 98545 }, { "epoch": 2.40759289570762, "grad_norm": 0.07511377334594727, "learning_rate": 1.2367964808110585e-06, "loss": 0.0004, "num_input_tokens_seen": 66429736, "step": 98550 }, { "epoch": 2.407715046539467, "grad_norm": 0.08834755420684814, "learning_rate": 1.2367136277073358e-06, "loss": 0.0002, "num_input_tokens_seen": 66433576, "step": 98555 }, { "epoch": 2.407837197371314, "grad_norm": 0.21426711976528168, "learning_rate": 1.2366307728821676e-06, "loss": 0.0003, "num_input_tokens_seen": 66436904, "step": 98560 }, { "epoch": 2.4079593482031614, "grad_norm": 2.439096450805664, "learning_rate": 1.236547916336157e-06, "loss": 0.1034, "num_input_tokens_seen": 66440616, "step": 98565 }, { "epoch": 2.4080814990350086, "grad_norm": 0.015720663592219353, "learning_rate": 1.236465058069906e-06, "loss": 0.0001, "num_input_tokens_seen": 66443816, "step": 98570 }, { "epoch": 2.4082036498668558, "grad_norm": 0.010523266158998013, "learning_rate": 1.2363821980840173e-06, "loss": 0.0004, "num_input_tokens_seen": 66447400, "step": 98575 }, { "epoch": 2.408325800698703, "grad_norm": 0.07485683262348175, "learning_rate": 1.2362993363790943e-06, "loss": 0.049, "num_input_tokens_seen": 66450984, "step": 98580 }, { "epoch": 2.40844795153055, "grad_norm": 62.42988204956055, "learning_rate": 1.236216472955739e-06, "loss": 0.0301, "num_input_tokens_seen": 66453928, "step": 98585 }, { "epoch": 2.408570102362397, "grad_norm": 0.056043241173028946, "learning_rate": 1.2361336078145536e-06, "loss": 0.0167, "num_input_tokens_seen": 66457000, "step": 98590 }, { "epoch": 2.408692253194244, "grad_norm": 0.05428202077746391, "learning_rate": 1.2360507409561413e-06, "loss": 0.0003, "num_input_tokens_seen": 66460008, "step": 98595 }, { "epoch": 2.4088144040260913, "grad_norm": 0.04799538478255272, "learning_rate": 1.2359678723811045e-06, "loss": 0.0003, "num_input_tokens_seen": 66463144, "step": 98600 }, { "epoch": 2.4089365548579385, "grad_norm": 0.003943683113902807, "learning_rate": 1.2358850020900454e-06, "loss": 0.0982, "num_input_tokens_seen": 66466024, "step": 98605 }, { "epoch": 2.4090587056897856, "grad_norm": 0.03562261164188385, "learning_rate": 1.2358021300835676e-06, "loss": 0.0003, "num_input_tokens_seen": 66469224, "step": 98610 }, { "epoch": 2.409180856521633, "grad_norm": 2.066300630569458, "learning_rate": 1.235719256362273e-06, "loss": 0.0229, "num_input_tokens_seen": 66472744, "step": 98615 }, { "epoch": 2.40930300735348, "grad_norm": 194.5032958984375, "learning_rate": 1.235636380926765e-06, "loss": 0.1211, "num_input_tokens_seen": 66475816, "step": 98620 }, { "epoch": 2.409425158185327, "grad_norm": 0.015606533735990524, "learning_rate": 1.2355535037776456e-06, "loss": 0.0007, "num_input_tokens_seen": 66479016, "step": 98625 }, { "epoch": 2.4095473090171744, "grad_norm": 0.02473052591085434, "learning_rate": 1.2354706249155177e-06, "loss": 0.0003, "num_input_tokens_seen": 66482280, "step": 98630 }, { "epoch": 2.4096694598490216, "grad_norm": 0.1488429754972458, "learning_rate": 1.2353877443409844e-06, "loss": 0.0004, "num_input_tokens_seen": 66485608, "step": 98635 }, { "epoch": 2.409791610680869, "grad_norm": 0.014244853518903255, "learning_rate": 1.2353048620546477e-06, "loss": 0.0002, "num_input_tokens_seen": 66489192, "step": 98640 }, { "epoch": 2.409913761512716, "grad_norm": 0.017912834882736206, "learning_rate": 1.235221978057111e-06, "loss": 0.0002, "num_input_tokens_seen": 66492648, "step": 98645 }, { "epoch": 2.410035912344563, "grad_norm": 156.58143615722656, "learning_rate": 1.235139092348977e-06, "loss": 0.0544, "num_input_tokens_seen": 66495848, "step": 98650 }, { "epoch": 2.4101580631764103, "grad_norm": 39.045799255371094, "learning_rate": 1.2350562049308477e-06, "loss": 0.0525, "num_input_tokens_seen": 66499432, "step": 98655 }, { "epoch": 2.4102802140082575, "grad_norm": 0.024942591786384583, "learning_rate": 1.2349733158033268e-06, "loss": 0.0456, "num_input_tokens_seen": 66502760, "step": 98660 }, { "epoch": 2.4104023648401047, "grad_norm": 0.03383241221308708, "learning_rate": 1.2348904249670169e-06, "loss": 0.0002, "num_input_tokens_seen": 66506088, "step": 98665 }, { "epoch": 2.410524515671952, "grad_norm": 0.015606016851961613, "learning_rate": 1.2348075324225202e-06, "loss": 0.0002, "num_input_tokens_seen": 66509160, "step": 98670 }, { "epoch": 2.4106466665037987, "grad_norm": 0.0175799373537302, "learning_rate": 1.2347246381704402e-06, "loss": 0.0, "num_input_tokens_seen": 66512488, "step": 98675 }, { "epoch": 2.4107688173356463, "grad_norm": 0.0061505944468081, "learning_rate": 1.2346417422113794e-06, "loss": 0.0355, "num_input_tokens_seen": 66516264, "step": 98680 }, { "epoch": 2.410890968167493, "grad_norm": 0.0010680512059479952, "learning_rate": 1.234558844545941e-06, "loss": 0.0002, "num_input_tokens_seen": 66519528, "step": 98685 }, { "epoch": 2.4110131189993402, "grad_norm": 0.0005795100587420166, "learning_rate": 1.2344759451747275e-06, "loss": 0.0001, "num_input_tokens_seen": 66523624, "step": 98690 }, { "epoch": 2.4111352698311874, "grad_norm": 0.002949797548353672, "learning_rate": 1.2343930440983422e-06, "loss": 0.0004, "num_input_tokens_seen": 66526760, "step": 98695 }, { "epoch": 2.4112574206630346, "grad_norm": 95.02862548828125, "learning_rate": 1.2343101413173869e-06, "loss": 0.0466, "num_input_tokens_seen": 66530344, "step": 98700 }, { "epoch": 2.411379571494882, "grad_norm": 0.38339921832084656, "learning_rate": 1.2342272368324658e-06, "loss": 0.0003, "num_input_tokens_seen": 66533608, "step": 98705 }, { "epoch": 2.411501722326729, "grad_norm": 0.005964651238173246, "learning_rate": 1.234144330644181e-06, "loss": 0.108, "num_input_tokens_seen": 66536936, "step": 98710 }, { "epoch": 2.411623873158576, "grad_norm": 0.007162902038544416, "learning_rate": 1.2340614227531355e-06, "loss": 0.0361, "num_input_tokens_seen": 66540648, "step": 98715 }, { "epoch": 2.4117460239904234, "grad_norm": 0.03665280342102051, "learning_rate": 1.2339785131599328e-06, "loss": 0.0001, "num_input_tokens_seen": 66544104, "step": 98720 }, { "epoch": 2.4118681748222706, "grad_norm": 0.1304071545600891, "learning_rate": 1.2338956018651749e-06, "loss": 0.0428, "num_input_tokens_seen": 66547368, "step": 98725 }, { "epoch": 2.4119903256541178, "grad_norm": 0.10976487398147583, "learning_rate": 1.2338126888694656e-06, "loss": 0.0545, "num_input_tokens_seen": 66551016, "step": 98730 }, { "epoch": 2.412112476485965, "grad_norm": 0.5006261467933655, "learning_rate": 1.2337297741734075e-06, "loss": 0.0401, "num_input_tokens_seen": 66554600, "step": 98735 }, { "epoch": 2.412234627317812, "grad_norm": 0.0057154325768351555, "learning_rate": 1.2336468577776037e-06, "loss": 0.0004, "num_input_tokens_seen": 66558568, "step": 98740 }, { "epoch": 2.4123567781496593, "grad_norm": 0.07972795516252518, "learning_rate": 1.2335639396826572e-06, "loss": 0.1005, "num_input_tokens_seen": 66562664, "step": 98745 }, { "epoch": 2.4124789289815065, "grad_norm": 0.015096195042133331, "learning_rate": 1.2334810198891705e-06, "loss": 0.0755, "num_input_tokens_seen": 66566056, "step": 98750 }, { "epoch": 2.4126010798133537, "grad_norm": 0.24497045576572418, "learning_rate": 1.2333980983977474e-06, "loss": 0.0719, "num_input_tokens_seen": 66569512, "step": 98755 }, { "epoch": 2.4127232306452004, "grad_norm": 0.020139671862125397, "learning_rate": 1.2333151752089901e-06, "loss": 0.0695, "num_input_tokens_seen": 66572776, "step": 98760 }, { "epoch": 2.412845381477048, "grad_norm": 0.1416095346212387, "learning_rate": 1.2332322503235024e-06, "loss": 0.0005, "num_input_tokens_seen": 66576936, "step": 98765 }, { "epoch": 2.412967532308895, "grad_norm": 0.16571195423603058, "learning_rate": 1.2331493237418871e-06, "loss": 0.0237, "num_input_tokens_seen": 66580264, "step": 98770 }, { "epoch": 2.413089683140742, "grad_norm": 0.3631545603275299, "learning_rate": 1.2330663954647471e-06, "loss": 0.0456, "num_input_tokens_seen": 66584232, "step": 98775 }, { "epoch": 2.413211833972589, "grad_norm": 0.07778457552194595, "learning_rate": 1.2329834654926855e-06, "loss": 0.049, "num_input_tokens_seen": 66587496, "step": 98780 }, { "epoch": 2.4133339848044364, "grad_norm": 54.81132507324219, "learning_rate": 1.2329005338263058e-06, "loss": 0.0841, "num_input_tokens_seen": 66590888, "step": 98785 }, { "epoch": 2.4134561356362836, "grad_norm": 0.08354758471250534, "learning_rate": 1.2328176004662105e-06, "loss": 0.0065, "num_input_tokens_seen": 66595048, "step": 98790 }, { "epoch": 2.4135782864681308, "grad_norm": 0.055788785219192505, "learning_rate": 1.232734665413003e-06, "loss": 0.0562, "num_input_tokens_seen": 66598440, "step": 98795 }, { "epoch": 2.413700437299978, "grad_norm": 0.04762701317667961, "learning_rate": 1.2326517286672867e-06, "loss": 0.0518, "num_input_tokens_seen": 66601832, "step": 98800 }, { "epoch": 2.413822588131825, "grad_norm": 11.54391860961914, "learning_rate": 1.2325687902296642e-06, "loss": 0.1387, "num_input_tokens_seen": 66604776, "step": 98805 }, { "epoch": 2.4139447389636723, "grad_norm": 0.04480794444680214, "learning_rate": 1.2324858501007389e-06, "loss": 0.0728, "num_input_tokens_seen": 66607784, "step": 98810 }, { "epoch": 2.4140668897955195, "grad_norm": 0.24305419623851776, "learning_rate": 1.232402908281114e-06, "loss": 0.0007, "num_input_tokens_seen": 66611432, "step": 98815 }, { "epoch": 2.4141890406273667, "grad_norm": 14.967013359069824, "learning_rate": 1.2323199647713927e-06, "loss": 0.0464, "num_input_tokens_seen": 66614568, "step": 98820 }, { "epoch": 2.414311191459214, "grad_norm": 21.968698501586914, "learning_rate": 1.232237019572178e-06, "loss": 0.0378, "num_input_tokens_seen": 66618536, "step": 98825 }, { "epoch": 2.414433342291061, "grad_norm": 0.05005514249205589, "learning_rate": 1.2321540726840734e-06, "loss": 0.0525, "num_input_tokens_seen": 66621736, "step": 98830 }, { "epoch": 2.4145554931229083, "grad_norm": 0.08329369872808456, "learning_rate": 1.2320711241076817e-06, "loss": 0.0022, "num_input_tokens_seen": 66625128, "step": 98835 }, { "epoch": 2.4146776439547555, "grad_norm": 0.05002454295754433, "learning_rate": 1.2319881738436065e-06, "loss": 0.0003, "num_input_tokens_seen": 66628392, "step": 98840 }, { "epoch": 2.4147997947866027, "grad_norm": 0.6276405453681946, "learning_rate": 1.2319052218924509e-06, "loss": 0.0008, "num_input_tokens_seen": 66631848, "step": 98845 }, { "epoch": 2.41492194561845, "grad_norm": 0.012492536567151546, "learning_rate": 1.2318222682548185e-06, "loss": 0.0005, "num_input_tokens_seen": 66634920, "step": 98850 }, { "epoch": 2.4150440964502966, "grad_norm": 0.06913726031780243, "learning_rate": 1.2317393129313115e-06, "loss": 0.0004, "num_input_tokens_seen": 66638312, "step": 98855 }, { "epoch": 2.4151662472821442, "grad_norm": 0.04986165091395378, "learning_rate": 1.2316563559225345e-06, "loss": 0.0002, "num_input_tokens_seen": 66641576, "step": 98860 }, { "epoch": 2.415288398113991, "grad_norm": 0.026278551667928696, "learning_rate": 1.2315733972290897e-06, "loss": 0.0401, "num_input_tokens_seen": 66645096, "step": 98865 }, { "epoch": 2.415410548945838, "grad_norm": 0.010731449350714684, "learning_rate": 1.2314904368515813e-06, "loss": 0.0005, "num_input_tokens_seen": 66648040, "step": 98870 }, { "epoch": 2.4155326997776854, "grad_norm": 0.016922397539019585, "learning_rate": 1.231407474790612e-06, "loss": 0.0006, "num_input_tokens_seen": 66651176, "step": 98875 }, { "epoch": 2.4156548506095326, "grad_norm": 0.006709430366754532, "learning_rate": 1.2313245110467853e-06, "loss": 0.0577, "num_input_tokens_seen": 66654568, "step": 98880 }, { "epoch": 2.4157770014413797, "grad_norm": 0.006941602099686861, "learning_rate": 1.2312415456207045e-06, "loss": 0.0002, "num_input_tokens_seen": 66657960, "step": 98885 }, { "epoch": 2.415899152273227, "grad_norm": 0.3153388500213623, "learning_rate": 1.2311585785129727e-06, "loss": 0.0001, "num_input_tokens_seen": 66661032, "step": 98890 }, { "epoch": 2.416021303105074, "grad_norm": 14.619956016540527, "learning_rate": 1.2310756097241942e-06, "loss": 0.0951, "num_input_tokens_seen": 66664424, "step": 98895 }, { "epoch": 2.4161434539369213, "grad_norm": 0.0042695761658251286, "learning_rate": 1.2309926392549713e-06, "loss": 0.0481, "num_input_tokens_seen": 66667816, "step": 98900 }, { "epoch": 2.4162656047687685, "grad_norm": 26.597707748413086, "learning_rate": 1.230909667105908e-06, "loss": 0.0678, "num_input_tokens_seen": 66671464, "step": 98905 }, { "epoch": 2.4163877556006157, "grad_norm": 0.018522268161177635, "learning_rate": 1.2308266932776073e-06, "loss": 0.0602, "num_input_tokens_seen": 66674664, "step": 98910 }, { "epoch": 2.416509906432463, "grad_norm": 0.03735198453068733, "learning_rate": 1.2307437177706727e-06, "loss": 0.0001, "num_input_tokens_seen": 66678376, "step": 98915 }, { "epoch": 2.41663205726431, "grad_norm": 0.052860476076602936, "learning_rate": 1.2306607405857078e-06, "loss": 0.0001, "num_input_tokens_seen": 66681960, "step": 98920 }, { "epoch": 2.4167542080961573, "grad_norm": 0.03746561333537102, "learning_rate": 1.2305777617233162e-06, "loss": 0.036, "num_input_tokens_seen": 66684904, "step": 98925 }, { "epoch": 2.4168763589280045, "grad_norm": 0.16554565727710724, "learning_rate": 1.2304947811841008e-06, "loss": 0.0003, "num_input_tokens_seen": 66688232, "step": 98930 }, { "epoch": 2.4169985097598516, "grad_norm": 0.17963381111621857, "learning_rate": 1.2304117989686655e-06, "loss": 0.0452, "num_input_tokens_seen": 66691432, "step": 98935 }, { "epoch": 2.4171206605916984, "grad_norm": 0.06290189176797867, "learning_rate": 1.2303288150776138e-06, "loss": 0.0003, "num_input_tokens_seen": 66695208, "step": 98940 }, { "epoch": 2.417242811423546, "grad_norm": 0.012594391591846943, "learning_rate": 1.2302458295115488e-06, "loss": 0.0, "num_input_tokens_seen": 66698472, "step": 98945 }, { "epoch": 2.4173649622553928, "grad_norm": 0.13371609151363373, "learning_rate": 1.2301628422710742e-06, "loss": 0.095, "num_input_tokens_seen": 66701608, "step": 98950 }, { "epoch": 2.41748711308724, "grad_norm": 0.004092858172953129, "learning_rate": 1.2300798533567935e-06, "loss": 0.0, "num_input_tokens_seen": 66704872, "step": 98955 }, { "epoch": 2.417609263919087, "grad_norm": 0.021656107157468796, "learning_rate": 1.2299968627693102e-06, "loss": 0.0558, "num_input_tokens_seen": 66707816, "step": 98960 }, { "epoch": 2.4177314147509343, "grad_norm": 27.706756591796875, "learning_rate": 1.229913870509228e-06, "loss": 0.1187, "num_input_tokens_seen": 66711144, "step": 98965 }, { "epoch": 2.4178535655827815, "grad_norm": 87.51176452636719, "learning_rate": 1.22983087657715e-06, "loss": 0.1665, "num_input_tokens_seen": 66714408, "step": 98970 }, { "epoch": 2.4179757164146287, "grad_norm": 0.009178809821605682, "learning_rate": 1.2297478809736804e-06, "loss": 0.0467, "num_input_tokens_seen": 66717864, "step": 98975 }, { "epoch": 2.418097867246476, "grad_norm": 0.034699052572250366, "learning_rate": 1.229664883699422e-06, "loss": 0.0003, "num_input_tokens_seen": 66721256, "step": 98980 }, { "epoch": 2.418220018078323, "grad_norm": 0.23443113267421722, "learning_rate": 1.229581884754979e-06, "loss": 0.0032, "num_input_tokens_seen": 66724456, "step": 98985 }, { "epoch": 2.4183421689101703, "grad_norm": 0.05177519470453262, "learning_rate": 1.229498884140955e-06, "loss": 0.0001, "num_input_tokens_seen": 66728232, "step": 98990 }, { "epoch": 2.4184643197420175, "grad_norm": 25.600658416748047, "learning_rate": 1.2294158818579533e-06, "loss": 0.0465, "num_input_tokens_seen": 66731624, "step": 98995 }, { "epoch": 2.4185864705738647, "grad_norm": 0.06634686142206192, "learning_rate": 1.2293328779065774e-06, "loss": 0.0002, "num_input_tokens_seen": 66735464, "step": 99000 }, { "epoch": 2.418708621405712, "grad_norm": 0.03814494609832764, "learning_rate": 1.2292498722874316e-06, "loss": 0.0344, "num_input_tokens_seen": 66739176, "step": 99005 }, { "epoch": 2.418830772237559, "grad_norm": 0.02642938122153282, "learning_rate": 1.2291668650011185e-06, "loss": 0.0004, "num_input_tokens_seen": 66742248, "step": 99010 }, { "epoch": 2.4189529230694062, "grad_norm": 0.021579941734671593, "learning_rate": 1.2290838560482427e-06, "loss": 0.0399, "num_input_tokens_seen": 66745512, "step": 99015 }, { "epoch": 2.4190750739012534, "grad_norm": 0.036266956478357315, "learning_rate": 1.2290008454294072e-06, "loss": 0.0005, "num_input_tokens_seen": 66748968, "step": 99020 }, { "epoch": 2.4191972247331006, "grad_norm": 42.17207717895508, "learning_rate": 1.228917833145216e-06, "loss": 0.128, "num_input_tokens_seen": 66752552, "step": 99025 }, { "epoch": 2.419319375564948, "grad_norm": 0.12942864000797272, "learning_rate": 1.228834819196273e-06, "loss": 0.0296, "num_input_tokens_seen": 66755688, "step": 99030 }, { "epoch": 2.4194415263967946, "grad_norm": 0.011230261996388435, "learning_rate": 1.2287518035831815e-06, "loss": 0.1457, "num_input_tokens_seen": 66758888, "step": 99035 }, { "epoch": 2.4195636772286417, "grad_norm": 0.5837439298629761, "learning_rate": 1.228668786306545e-06, "loss": 0.0006, "num_input_tokens_seen": 66762408, "step": 99040 }, { "epoch": 2.419685828060489, "grad_norm": 0.6264066100120544, "learning_rate": 1.228585767366968e-06, "loss": 0.0956, "num_input_tokens_seen": 66765352, "step": 99045 }, { "epoch": 2.419807978892336, "grad_norm": 0.04524848982691765, "learning_rate": 1.228502746765054e-06, "loss": 0.0024, "num_input_tokens_seen": 66768168, "step": 99050 }, { "epoch": 2.4199301297241833, "grad_norm": 0.04734393209218979, "learning_rate": 1.2284197245014062e-06, "loss": 0.0001, "num_input_tokens_seen": 66771240, "step": 99055 }, { "epoch": 2.4200522805560305, "grad_norm": 33.75642395019531, "learning_rate": 1.2283367005766288e-06, "loss": 0.034, "num_input_tokens_seen": 66774248, "step": 99060 }, { "epoch": 2.4201744313878777, "grad_norm": 0.0772058367729187, "learning_rate": 1.2282536749913255e-06, "loss": 0.0001, "num_input_tokens_seen": 66777576, "step": 99065 }, { "epoch": 2.420296582219725, "grad_norm": 0.39729130268096924, "learning_rate": 1.2281706477461002e-06, "loss": 0.0316, "num_input_tokens_seen": 66780712, "step": 99070 }, { "epoch": 2.420418733051572, "grad_norm": 27.436330795288086, "learning_rate": 1.2280876188415562e-06, "loss": 0.1086, "num_input_tokens_seen": 66784168, "step": 99075 }, { "epoch": 2.4205408838834193, "grad_norm": 0.021735528483986855, "learning_rate": 1.2280045882782978e-06, "loss": 0.0006, "num_input_tokens_seen": 66787432, "step": 99080 }, { "epoch": 2.4206630347152664, "grad_norm": 0.11444593220949173, "learning_rate": 1.227921556056929e-06, "loss": 0.0001, "num_input_tokens_seen": 66790504, "step": 99085 }, { "epoch": 2.4207851855471136, "grad_norm": 0.021713944151997566, "learning_rate": 1.2278385221780534e-06, "loss": 0.0944, "num_input_tokens_seen": 66793896, "step": 99090 }, { "epoch": 2.420907336378961, "grad_norm": 0.01930270716547966, "learning_rate": 1.2277554866422746e-06, "loss": 0.1299, "num_input_tokens_seen": 66797352, "step": 99095 }, { "epoch": 2.421029487210808, "grad_norm": 0.027717571705579758, "learning_rate": 1.2276724494501966e-06, "loss": 0.0002, "num_input_tokens_seen": 66800552, "step": 99100 }, { "epoch": 2.421151638042655, "grad_norm": 434.61480712890625, "learning_rate": 1.2275894106024234e-06, "loss": 0.0049, "num_input_tokens_seen": 66803880, "step": 99105 }, { "epoch": 2.4212737888745024, "grad_norm": 14.324984550476074, "learning_rate": 1.2275063700995587e-06, "loss": 0.0443, "num_input_tokens_seen": 66807336, "step": 99110 }, { "epoch": 2.4213959397063496, "grad_norm": 15.495723724365234, "learning_rate": 1.2274233279422065e-06, "loss": 0.043, "num_input_tokens_seen": 66810600, "step": 99115 }, { "epoch": 2.4215180905381963, "grad_norm": 0.16907991468906403, "learning_rate": 1.2273402841309709e-06, "loss": 0.1082, "num_input_tokens_seen": 66814376, "step": 99120 }, { "epoch": 2.421640241370044, "grad_norm": 0.03562447056174278, "learning_rate": 1.2272572386664552e-06, "loss": 0.0003, "num_input_tokens_seen": 66817832, "step": 99125 }, { "epoch": 2.4217623922018907, "grad_norm": 0.0556715689599514, "learning_rate": 1.2271741915492642e-06, "loss": 0.0002, "num_input_tokens_seen": 66821160, "step": 99130 }, { "epoch": 2.421884543033738, "grad_norm": 0.009768993593752384, "learning_rate": 1.2270911427800008e-06, "loss": 0.0349, "num_input_tokens_seen": 66824296, "step": 99135 }, { "epoch": 2.422006693865585, "grad_norm": 0.31641116738319397, "learning_rate": 1.2270080923592699e-06, "loss": 0.0006, "num_input_tokens_seen": 66828264, "step": 99140 }, { "epoch": 2.4221288446974323, "grad_norm": 0.11315816640853882, "learning_rate": 1.2269250402876749e-06, "loss": 0.116, "num_input_tokens_seen": 66831464, "step": 99145 }, { "epoch": 2.4222509955292795, "grad_norm": 0.07480554282665253, "learning_rate": 1.2268419865658204e-06, "loss": 0.0007, "num_input_tokens_seen": 66834728, "step": 99150 }, { "epoch": 2.4223731463611267, "grad_norm": 0.06518545746803284, "learning_rate": 1.2267589311943096e-06, "loss": 0.0002, "num_input_tokens_seen": 66837992, "step": 99155 }, { "epoch": 2.422495297192974, "grad_norm": 66.28703308105469, "learning_rate": 1.2266758741737472e-06, "loss": 0.1335, "num_input_tokens_seen": 66841448, "step": 99160 }, { "epoch": 2.422617448024821, "grad_norm": 0.03924314305186272, "learning_rate": 1.2265928155047365e-06, "loss": 0.1043, "num_input_tokens_seen": 66844648, "step": 99165 }, { "epoch": 2.4227395988566682, "grad_norm": 0.014301169663667679, "learning_rate": 1.226509755187882e-06, "loss": 0.0985, "num_input_tokens_seen": 66847976, "step": 99170 }, { "epoch": 2.4228617496885154, "grad_norm": 0.48280882835388184, "learning_rate": 1.2264266932237878e-06, "loss": 0.0708, "num_input_tokens_seen": 66851112, "step": 99175 }, { "epoch": 2.4229839005203626, "grad_norm": 0.22303539514541626, "learning_rate": 1.2263436296130577e-06, "loss": 0.0514, "num_input_tokens_seen": 66854184, "step": 99180 }, { "epoch": 2.42310605135221, "grad_norm": 0.048168718814849854, "learning_rate": 1.2262605643562956e-06, "loss": 0.0003, "num_input_tokens_seen": 66857896, "step": 99185 }, { "epoch": 2.423228202184057, "grad_norm": 43.30610656738281, "learning_rate": 1.2261774974541062e-06, "loss": 0.0948, "num_input_tokens_seen": 66861352, "step": 99190 }, { "epoch": 2.423350353015904, "grad_norm": 0.08721016347408295, "learning_rate": 1.2260944289070928e-06, "loss": 0.0002, "num_input_tokens_seen": 66864616, "step": 99195 }, { "epoch": 2.4234725038477514, "grad_norm": 0.059328023344278336, "learning_rate": 1.22601135871586e-06, "loss": 0.0003, "num_input_tokens_seen": 66867624, "step": 99200 }, { "epoch": 2.4235946546795986, "grad_norm": 0.020462198182940483, "learning_rate": 1.2259282868810122e-06, "loss": 0.0001, "num_input_tokens_seen": 66871336, "step": 99205 }, { "epoch": 2.4237168055114457, "grad_norm": 0.009214747697114944, "learning_rate": 1.225845213403153e-06, "loss": 0.0507, "num_input_tokens_seen": 66874536, "step": 99210 }, { "epoch": 2.4238389563432925, "grad_norm": 0.6257734894752502, "learning_rate": 1.2257621382828864e-06, "loss": 0.0006, "num_input_tokens_seen": 66878248, "step": 99215 }, { "epoch": 2.4239611071751397, "grad_norm": 0.10356702655553818, "learning_rate": 1.225679061520817e-06, "loss": 0.0824, "num_input_tokens_seen": 66881768, "step": 99220 }, { "epoch": 2.424083258006987, "grad_norm": 0.17674194276332855, "learning_rate": 1.2255959831175486e-06, "loss": 0.0933, "num_input_tokens_seen": 66885096, "step": 99225 }, { "epoch": 2.424205408838834, "grad_norm": 92.4852294921875, "learning_rate": 1.2255129030736856e-06, "loss": 0.0997, "num_input_tokens_seen": 66888360, "step": 99230 }, { "epoch": 2.4243275596706813, "grad_norm": 0.02182663045823574, "learning_rate": 1.225429821389832e-06, "loss": 0.0002, "num_input_tokens_seen": 66892008, "step": 99235 }, { "epoch": 2.4244497105025284, "grad_norm": 0.03820806369185448, "learning_rate": 1.2253467380665923e-06, "loss": 0.0515, "num_input_tokens_seen": 66895656, "step": 99240 }, { "epoch": 2.4245718613343756, "grad_norm": 0.13037490844726562, "learning_rate": 1.2252636531045704e-06, "loss": 0.0004, "num_input_tokens_seen": 66899112, "step": 99245 }, { "epoch": 2.424694012166223, "grad_norm": 0.6307566165924072, "learning_rate": 1.2251805665043708e-06, "loss": 0.0008, "num_input_tokens_seen": 66902056, "step": 99250 }, { "epoch": 2.42481616299807, "grad_norm": 1.1177254915237427, "learning_rate": 1.2250974782665976e-06, "loss": 0.0014, "num_input_tokens_seen": 66905320, "step": 99255 }, { "epoch": 2.424938313829917, "grad_norm": 0.002898323815315962, "learning_rate": 1.2250143883918546e-06, "loss": 0.0665, "num_input_tokens_seen": 66908584, "step": 99260 }, { "epoch": 2.4250604646617644, "grad_norm": 0.09665067493915558, "learning_rate": 1.224931296880747e-06, "loss": 0.0006, "num_input_tokens_seen": 66912360, "step": 99265 }, { "epoch": 2.4251826154936116, "grad_norm": 0.014954613521695137, "learning_rate": 1.2248482037338778e-06, "loss": 0.0001, "num_input_tokens_seen": 66916520, "step": 99270 }, { "epoch": 2.4253047663254588, "grad_norm": 0.0005372444284148514, "learning_rate": 1.2247651089518524e-06, "loss": 0.0003, "num_input_tokens_seen": 66920104, "step": 99275 }, { "epoch": 2.425426917157306, "grad_norm": 0.36292293667793274, "learning_rate": 1.2246820125352747e-06, "loss": 0.0004, "num_input_tokens_seen": 66923560, "step": 99280 }, { "epoch": 2.425549067989153, "grad_norm": 0.039235059171915054, "learning_rate": 1.224598914484749e-06, "loss": 0.1132, "num_input_tokens_seen": 66926760, "step": 99285 }, { "epoch": 2.4256712188210003, "grad_norm": 0.027509810402989388, "learning_rate": 1.2245158148008795e-06, "loss": 0.0003, "num_input_tokens_seen": 66930024, "step": 99290 }, { "epoch": 2.4257933696528475, "grad_norm": 240.3977508544922, "learning_rate": 1.2244327134842704e-06, "loss": 0.0972, "num_input_tokens_seen": 66933032, "step": 99295 }, { "epoch": 2.4259155204846943, "grad_norm": 0.06561959534883499, "learning_rate": 1.2243496105355265e-06, "loss": 0.0512, "num_input_tokens_seen": 66936104, "step": 99300 }, { "epoch": 2.426037671316542, "grad_norm": 0.10798732191324234, "learning_rate": 1.224266505955252e-06, "loss": 0.0399, "num_input_tokens_seen": 66939240, "step": 99305 }, { "epoch": 2.4261598221483887, "grad_norm": 0.009463850408792496, "learning_rate": 1.224183399744051e-06, "loss": 0.0002, "num_input_tokens_seen": 66942440, "step": 99310 }, { "epoch": 2.426281972980236, "grad_norm": 0.05178477242588997, "learning_rate": 1.224100291902528e-06, "loss": 0.0368, "num_input_tokens_seen": 66946280, "step": 99315 }, { "epoch": 2.426404123812083, "grad_norm": 0.022696619853377342, "learning_rate": 1.2240171824312873e-06, "loss": 0.0006, "num_input_tokens_seen": 66949928, "step": 99320 }, { "epoch": 2.4265262746439302, "grad_norm": 0.016703089699149132, "learning_rate": 1.2239340713309335e-06, "loss": 0.0002, "num_input_tokens_seen": 66953704, "step": 99325 }, { "epoch": 2.4266484254757774, "grad_norm": 22.868074417114258, "learning_rate": 1.2238509586020708e-06, "loss": 0.0491, "num_input_tokens_seen": 66957544, "step": 99330 }, { "epoch": 2.4267705763076246, "grad_norm": 0.015933876857161522, "learning_rate": 1.2237678442453042e-06, "loss": 0.0001, "num_input_tokens_seen": 66960936, "step": 99335 }, { "epoch": 2.426892727139472, "grad_norm": 26.713512420654297, "learning_rate": 1.223684728261237e-06, "loss": 0.0567, "num_input_tokens_seen": 66964200, "step": 99340 }, { "epoch": 2.427014877971319, "grad_norm": 0.10175715386867523, "learning_rate": 1.2236016106504747e-06, "loss": 0.072, "num_input_tokens_seen": 66967400, "step": 99345 }, { "epoch": 2.427137028803166, "grad_norm": 21.69265365600586, "learning_rate": 1.223518491413621e-06, "loss": 0.0574, "num_input_tokens_seen": 66970280, "step": 99350 }, { "epoch": 2.4272591796350134, "grad_norm": 17.81764030456543, "learning_rate": 1.223435370551281e-06, "loss": 0.052, "num_input_tokens_seen": 66973416, "step": 99355 }, { "epoch": 2.4273813304668606, "grad_norm": 0.100733183324337, "learning_rate": 1.223352248064059e-06, "loss": 0.0002, "num_input_tokens_seen": 66977192, "step": 99360 }, { "epoch": 2.4275034812987077, "grad_norm": 0.0022963222581893206, "learning_rate": 1.2232691239525592e-06, "loss": 0.0706, "num_input_tokens_seen": 66980456, "step": 99365 }, { "epoch": 2.427625632130555, "grad_norm": 0.03133883699774742, "learning_rate": 1.2231859982173862e-06, "loss": 0.0003, "num_input_tokens_seen": 66984296, "step": 99370 }, { "epoch": 2.427747782962402, "grad_norm": 4.77004861831665, "learning_rate": 1.2231028708591447e-06, "loss": 0.0855, "num_input_tokens_seen": 66987944, "step": 99375 }, { "epoch": 2.4278699337942493, "grad_norm": 0.007516190409660339, "learning_rate": 1.2230197418784391e-06, "loss": 0.0503, "num_input_tokens_seen": 66992296, "step": 99380 }, { "epoch": 2.427992084626096, "grad_norm": 0.1313624083995819, "learning_rate": 1.2229366112758739e-06, "loss": 0.0108, "num_input_tokens_seen": 66995944, "step": 99385 }, { "epoch": 2.4281142354579437, "grad_norm": 51.022216796875, "learning_rate": 1.2228534790520537e-06, "loss": 0.049, "num_input_tokens_seen": 66999080, "step": 99390 }, { "epoch": 2.4282363862897904, "grad_norm": 0.03542424738407135, "learning_rate": 1.222770345207583e-06, "loss": 0.0002, "num_input_tokens_seen": 67002408, "step": 99395 }, { "epoch": 2.4283585371216376, "grad_norm": 36.8632698059082, "learning_rate": 1.2226872097430665e-06, "loss": 0.1616, "num_input_tokens_seen": 67005608, "step": 99400 }, { "epoch": 2.428480687953485, "grad_norm": 137.3507080078125, "learning_rate": 1.2226040726591088e-06, "loss": 0.0432, "num_input_tokens_seen": 67008872, "step": 99405 }, { "epoch": 2.428602838785332, "grad_norm": 143.63348388671875, "learning_rate": 1.2225209339563143e-06, "loss": 0.0721, "num_input_tokens_seen": 67012840, "step": 99410 }, { "epoch": 2.428724989617179, "grad_norm": 32.90876007080078, "learning_rate": 1.222437793635288e-06, "loss": 0.0391, "num_input_tokens_seen": 67016040, "step": 99415 }, { "epoch": 2.4288471404490264, "grad_norm": 0.024623477831482887, "learning_rate": 1.2223546516966339e-06, "loss": 0.0002, "num_input_tokens_seen": 67019560, "step": 99420 }, { "epoch": 2.4289692912808736, "grad_norm": 0.026823585852980614, "learning_rate": 1.2222715081409572e-06, "loss": 0.0425, "num_input_tokens_seen": 67022760, "step": 99425 }, { "epoch": 2.4290914421127208, "grad_norm": 0.207112655043602, "learning_rate": 1.2221883629688622e-06, "loss": 0.0563, "num_input_tokens_seen": 67026344, "step": 99430 }, { "epoch": 2.429213592944568, "grad_norm": 0.018758624792099, "learning_rate": 1.2221052161809535e-06, "loss": 0.0584, "num_input_tokens_seen": 67030184, "step": 99435 }, { "epoch": 2.429335743776415, "grad_norm": 0.0312594436109066, "learning_rate": 1.222022067777836e-06, "loss": 0.0001, "num_input_tokens_seen": 67034088, "step": 99440 }, { "epoch": 2.4294578946082623, "grad_norm": 0.032845884561538696, "learning_rate": 1.2219389177601142e-06, "loss": 0.031, "num_input_tokens_seen": 67037288, "step": 99445 }, { "epoch": 2.4295800454401095, "grad_norm": 0.030439363792538643, "learning_rate": 1.2218557661283932e-06, "loss": 0.0184, "num_input_tokens_seen": 67040808, "step": 99450 }, { "epoch": 2.4297021962719567, "grad_norm": 0.06845355033874512, "learning_rate": 1.2217726128832773e-06, "loss": 0.0002, "num_input_tokens_seen": 67043944, "step": 99455 }, { "epoch": 2.429824347103804, "grad_norm": 0.02207336015999317, "learning_rate": 1.2216894580253711e-06, "loss": 0.0002, "num_input_tokens_seen": 67047336, "step": 99460 }, { "epoch": 2.429946497935651, "grad_norm": 0.04571019858121872, "learning_rate": 1.2216063015552798e-06, "loss": 0.041, "num_input_tokens_seen": 67051368, "step": 99465 }, { "epoch": 2.4300686487674983, "grad_norm": 0.037082310765981674, "learning_rate": 1.221523143473608e-06, "loss": 0.0402, "num_input_tokens_seen": 67054632, "step": 99470 }, { "epoch": 2.4301907995993455, "grad_norm": 0.025185855105519295, "learning_rate": 1.2214399837809599e-06, "loss": 0.1122, "num_input_tokens_seen": 67057640, "step": 99475 }, { "epoch": 2.430312950431192, "grad_norm": 0.07133536785840988, "learning_rate": 1.2213568224779408e-06, "loss": 0.0461, "num_input_tokens_seen": 67061352, "step": 99480 }, { "epoch": 2.4304351012630394, "grad_norm": 0.049883488565683365, "learning_rate": 1.2212736595651555e-06, "loss": 0.0007, "num_input_tokens_seen": 67064424, "step": 99485 }, { "epoch": 2.4305572520948866, "grad_norm": 0.10151875764131546, "learning_rate": 1.2211904950432086e-06, "loss": 0.0001, "num_input_tokens_seen": 67067752, "step": 99490 }, { "epoch": 2.430679402926734, "grad_norm": 40.1695442199707, "learning_rate": 1.221107328912705e-06, "loss": 0.0542, "num_input_tokens_seen": 67070696, "step": 99495 }, { "epoch": 2.430801553758581, "grad_norm": 0.01168416440486908, "learning_rate": 1.2210241611742494e-06, "loss": 0.0716, "num_input_tokens_seen": 67074408, "step": 99500 }, { "epoch": 2.430923704590428, "grad_norm": 18.22470474243164, "learning_rate": 1.2209409918284465e-06, "loss": 0.0489, "num_input_tokens_seen": 67077992, "step": 99505 }, { "epoch": 2.4310458554222754, "grad_norm": 0.09461220353841782, "learning_rate": 1.220857820875901e-06, "loss": 0.0348, "num_input_tokens_seen": 67081384, "step": 99510 }, { "epoch": 2.4311680062541225, "grad_norm": 0.003890714608132839, "learning_rate": 1.2207746483172185e-06, "loss": 0.1613, "num_input_tokens_seen": 67084456, "step": 99515 }, { "epoch": 2.4312901570859697, "grad_norm": 0.1612948328256607, "learning_rate": 1.2206914741530034e-06, "loss": 0.0002, "num_input_tokens_seen": 67087912, "step": 99520 }, { "epoch": 2.431412307917817, "grad_norm": 12.028178215026855, "learning_rate": 1.2206082983838606e-06, "loss": 0.0345, "num_input_tokens_seen": 67091176, "step": 99525 }, { "epoch": 2.431534458749664, "grad_norm": 0.07618942856788635, "learning_rate": 1.2205251210103945e-06, "loss": 0.0293, "num_input_tokens_seen": 67094952, "step": 99530 }, { "epoch": 2.4316566095815113, "grad_norm": 0.372517466545105, "learning_rate": 1.2204419420332108e-06, "loss": 0.0385, "num_input_tokens_seen": 67098088, "step": 99535 }, { "epoch": 2.4317787604133585, "grad_norm": 17.607189178466797, "learning_rate": 1.2203587614529136e-06, "loss": 0.0706, "num_input_tokens_seen": 67101160, "step": 99540 }, { "epoch": 2.4319009112452057, "grad_norm": 0.5471447706222534, "learning_rate": 1.2202755792701085e-06, "loss": 0.0004, "num_input_tokens_seen": 67104552, "step": 99545 }, { "epoch": 2.432023062077053, "grad_norm": 0.04835929721593857, "learning_rate": 1.2201923954854e-06, "loss": 0.1449, "num_input_tokens_seen": 67107816, "step": 99550 }, { "epoch": 2.4321452129089, "grad_norm": 0.1779869645833969, "learning_rate": 1.2201092100993933e-06, "loss": 0.0387, "num_input_tokens_seen": 67111144, "step": 99555 }, { "epoch": 2.4322673637407473, "grad_norm": 0.5402523279190063, "learning_rate": 1.2200260231126933e-06, "loss": 0.0329, "num_input_tokens_seen": 67114152, "step": 99560 }, { "epoch": 2.432389514572594, "grad_norm": 0.26101481914520264, "learning_rate": 1.2199428345259047e-06, "loss": 0.0382, "num_input_tokens_seen": 67117032, "step": 99565 }, { "epoch": 2.4325116654044416, "grad_norm": 25.253210067749023, "learning_rate": 1.2198596443396328e-06, "loss": 0.027, "num_input_tokens_seen": 67120040, "step": 99570 }, { "epoch": 2.4326338162362884, "grad_norm": 0.04243769869208336, "learning_rate": 1.2197764525544822e-06, "loss": 0.0008, "num_input_tokens_seen": 67123688, "step": 99575 }, { "epoch": 2.4327559670681356, "grad_norm": 0.025501498952507973, "learning_rate": 1.2196932591710583e-06, "loss": 0.0007, "num_input_tokens_seen": 67127016, "step": 99580 }, { "epoch": 2.4328781178999828, "grad_norm": 0.03055681847035885, "learning_rate": 1.219610064189966e-06, "loss": 0.0012, "num_input_tokens_seen": 67130344, "step": 99585 }, { "epoch": 2.43300026873183, "grad_norm": 0.14057831466197968, "learning_rate": 1.21952686761181e-06, "loss": 0.0003, "num_input_tokens_seen": 67134184, "step": 99590 }, { "epoch": 2.433122419563677, "grad_norm": 0.5264701247215271, "learning_rate": 1.2194436694371959e-06, "loss": 0.0229, "num_input_tokens_seen": 67137896, "step": 99595 }, { "epoch": 2.4332445703955243, "grad_norm": 0.02670673280954361, "learning_rate": 1.219360469666728e-06, "loss": 0.0008, "num_input_tokens_seen": 67141032, "step": 99600 }, { "epoch": 2.4333667212273715, "grad_norm": 0.08604729175567627, "learning_rate": 1.219277268301012e-06, "loss": 0.0664, "num_input_tokens_seen": 67143976, "step": 99605 }, { "epoch": 2.4334888720592187, "grad_norm": 0.074459008872509, "learning_rate": 1.2191940653406528e-06, "loss": 0.0192, "num_input_tokens_seen": 67147048, "step": 99610 }, { "epoch": 2.433611022891066, "grad_norm": 2.0646824836730957, "learning_rate": 1.2191108607862553e-06, "loss": 0.0024, "num_input_tokens_seen": 67150120, "step": 99615 }, { "epoch": 2.433733173722913, "grad_norm": 0.05667717754840851, "learning_rate": 1.2190276546384247e-06, "loss": 0.0001, "num_input_tokens_seen": 67153320, "step": 99620 }, { "epoch": 2.4338553245547603, "grad_norm": 16.081130981445312, "learning_rate": 1.2189444468977664e-06, "loss": 0.065, "num_input_tokens_seen": 67156328, "step": 99625 }, { "epoch": 2.4339774753866075, "grad_norm": 0.2092624306678772, "learning_rate": 1.2188612375648846e-06, "loss": 0.0002, "num_input_tokens_seen": 67159976, "step": 99630 }, { "epoch": 2.4340996262184547, "grad_norm": 0.04626304656267166, "learning_rate": 1.2187780266403853e-06, "loss": 0.0561, "num_input_tokens_seen": 67163368, "step": 99635 }, { "epoch": 2.434221777050302, "grad_norm": 0.006048401352018118, "learning_rate": 1.218694814124873e-06, "loss": 0.0002, "num_input_tokens_seen": 67166440, "step": 99640 }, { "epoch": 2.434343927882149, "grad_norm": 0.13066811859607697, "learning_rate": 1.2186116000189536e-06, "loss": 0.0002, "num_input_tokens_seen": 67169576, "step": 99645 }, { "epoch": 2.4344660787139962, "grad_norm": 0.009472579695284367, "learning_rate": 1.218528384323232e-06, "loss": 0.0346, "num_input_tokens_seen": 67172840, "step": 99650 }, { "epoch": 2.4345882295458434, "grad_norm": 20.248411178588867, "learning_rate": 1.2184451670383129e-06, "loss": 0.0457, "num_input_tokens_seen": 67176040, "step": 99655 }, { "epoch": 2.43471038037769, "grad_norm": 0.01613772287964821, "learning_rate": 1.218361948164802e-06, "loss": 0.042, "num_input_tokens_seen": 67179240, "step": 99660 }, { "epoch": 2.4348325312095374, "grad_norm": 32.162349700927734, "learning_rate": 1.218278727703304e-06, "loss": 0.0396, "num_input_tokens_seen": 67182312, "step": 99665 }, { "epoch": 2.4349546820413845, "grad_norm": 0.007790977600961924, "learning_rate": 1.2181955056544244e-06, "loss": 0.0075, "num_input_tokens_seen": 67185704, "step": 99670 }, { "epoch": 2.4350768328732317, "grad_norm": 0.21455731987953186, "learning_rate": 1.2181122820187689e-06, "loss": 0.0602, "num_input_tokens_seen": 67188456, "step": 99675 }, { "epoch": 2.435198983705079, "grad_norm": 170.60511779785156, "learning_rate": 1.2180290567969417e-06, "loss": 0.0023, "num_input_tokens_seen": 67191720, "step": 99680 }, { "epoch": 2.435321134536926, "grad_norm": 0.011751042678952217, "learning_rate": 1.217945829989549e-06, "loss": 0.0002, "num_input_tokens_seen": 67195368, "step": 99685 }, { "epoch": 2.4354432853687733, "grad_norm": 0.008488446474075317, "learning_rate": 1.217862601597195e-06, "loss": 0.0401, "num_input_tokens_seen": 67198888, "step": 99690 }, { "epoch": 2.4355654362006205, "grad_norm": 0.23472584784030914, "learning_rate": 1.2177793716204858e-06, "loss": 0.0534, "num_input_tokens_seen": 67202344, "step": 99695 }, { "epoch": 2.4356875870324677, "grad_norm": 0.1314506083726883, "learning_rate": 1.2176961400600265e-06, "loss": 0.0002, "num_input_tokens_seen": 67205928, "step": 99700 }, { "epoch": 2.435809737864315, "grad_norm": 0.00579680735245347, "learning_rate": 1.2176129069164225e-06, "loss": 0.0001, "num_input_tokens_seen": 67209000, "step": 99705 }, { "epoch": 2.435931888696162, "grad_norm": 1.2763606309890747, "learning_rate": 1.2175296721902786e-06, "loss": 0.0008, "num_input_tokens_seen": 67212328, "step": 99710 }, { "epoch": 2.4360540395280093, "grad_norm": 3.3028550148010254, "learning_rate": 1.2174464358822005e-06, "loss": 0.001, "num_input_tokens_seen": 67215592, "step": 99715 }, { "epoch": 2.4361761903598564, "grad_norm": 0.00807754322886467, "learning_rate": 1.2173631979927935e-06, "loss": 0.0495, "num_input_tokens_seen": 67219176, "step": 99720 }, { "epoch": 2.4362983411917036, "grad_norm": 0.03470252826809883, "learning_rate": 1.217279958522663e-06, "loss": 0.0502, "num_input_tokens_seen": 67222440, "step": 99725 }, { "epoch": 2.436420492023551, "grad_norm": 0.022196587175130844, "learning_rate": 1.217196717472414e-06, "loss": 0.1152, "num_input_tokens_seen": 67225960, "step": 99730 }, { "epoch": 2.436542642855398, "grad_norm": 0.018160829320549965, "learning_rate": 1.2171134748426522e-06, "loss": 0.0001, "num_input_tokens_seen": 67229480, "step": 99735 }, { "epoch": 2.436664793687245, "grad_norm": 0.04221296310424805, "learning_rate": 1.2170302306339825e-06, "loss": 0.0763, "num_input_tokens_seen": 67232744, "step": 99740 }, { "epoch": 2.436786944519092, "grad_norm": 0.13677258789539337, "learning_rate": 1.216946984847011e-06, "loss": 0.0002, "num_input_tokens_seen": 67235816, "step": 99745 }, { "epoch": 2.4369090953509396, "grad_norm": 21.423381805419922, "learning_rate": 1.2168637374823425e-06, "loss": 0.0852, "num_input_tokens_seen": 67239144, "step": 99750 }, { "epoch": 2.4370312461827863, "grad_norm": 0.33186808228492737, "learning_rate": 1.2167804885405825e-06, "loss": 0.0001, "num_input_tokens_seen": 67242280, "step": 99755 }, { "epoch": 2.4371533970146335, "grad_norm": 0.030644388869404793, "learning_rate": 1.2166972380223363e-06, "loss": 0.0001, "num_input_tokens_seen": 67245608, "step": 99760 }, { "epoch": 2.4372755478464807, "grad_norm": 0.02295338734984398, "learning_rate": 1.2166139859282098e-06, "loss": 0.0002, "num_input_tokens_seen": 67249064, "step": 99765 }, { "epoch": 2.437397698678328, "grad_norm": 0.46199899911880493, "learning_rate": 1.2165307322588082e-06, "loss": 0.0003, "num_input_tokens_seen": 67252264, "step": 99770 }, { "epoch": 2.437519849510175, "grad_norm": 0.007493645418435335, "learning_rate": 1.2164474770147365e-06, "loss": 0.0372, "num_input_tokens_seen": 67255848, "step": 99775 }, { "epoch": 2.4376420003420223, "grad_norm": 0.04148320481181145, "learning_rate": 1.2163642201966011e-06, "loss": 0.0001, "num_input_tokens_seen": 67259176, "step": 99780 }, { "epoch": 2.4377641511738695, "grad_norm": 586.2407836914062, "learning_rate": 1.2162809618050065e-06, "loss": 0.0832, "num_input_tokens_seen": 67262184, "step": 99785 }, { "epoch": 2.4378863020057167, "grad_norm": 0.13672468066215515, "learning_rate": 1.2161977018405586e-06, "loss": 0.0491, "num_input_tokens_seen": 67265320, "step": 99790 }, { "epoch": 2.438008452837564, "grad_norm": 0.00598291028290987, "learning_rate": 1.2161144403038629e-06, "loss": 0.1263, "num_input_tokens_seen": 67268776, "step": 99795 }, { "epoch": 2.438130603669411, "grad_norm": 0.046379100531339645, "learning_rate": 1.2160311771955246e-06, "loss": 0.0476, "num_input_tokens_seen": 67272104, "step": 99800 }, { "epoch": 2.4382527545012582, "grad_norm": 0.0232473723590374, "learning_rate": 1.21594791251615e-06, "loss": 0.0002, "num_input_tokens_seen": 67275304, "step": 99805 }, { "epoch": 2.4383749053331054, "grad_norm": 0.05157886818051338, "learning_rate": 1.2158646462663437e-06, "loss": 0.0281, "num_input_tokens_seen": 67278184, "step": 99810 }, { "epoch": 2.4384970561649526, "grad_norm": 0.07750022411346436, "learning_rate": 1.215781378446712e-06, "loss": 0.1022, "num_input_tokens_seen": 67281512, "step": 99815 }, { "epoch": 2.4386192069968, "grad_norm": 0.865932047367096, "learning_rate": 1.2156981090578594e-06, "loss": 0.0003, "num_input_tokens_seen": 67285352, "step": 99820 }, { "epoch": 2.438741357828647, "grad_norm": 323.0584411621094, "learning_rate": 1.2156148381003926e-06, "loss": 0.0044, "num_input_tokens_seen": 67288360, "step": 99825 }, { "epoch": 2.4388635086604937, "grad_norm": 0.030158882960677147, "learning_rate": 1.215531565574917e-06, "loss": 0.0003, "num_input_tokens_seen": 67291624, "step": 99830 }, { "epoch": 2.4389856594923414, "grad_norm": 0.0562431737780571, "learning_rate": 1.2154482914820375e-06, "loss": 0.0004, "num_input_tokens_seen": 67294824, "step": 99835 }, { "epoch": 2.439107810324188, "grad_norm": 0.21903370320796967, "learning_rate": 1.2153650158223602e-06, "loss": 0.0002, "num_input_tokens_seen": 67298088, "step": 99840 }, { "epoch": 2.4392299611560353, "grad_norm": 0.0879155844449997, "learning_rate": 1.2152817385964906e-06, "loss": 0.0006, "num_input_tokens_seen": 67301352, "step": 99845 }, { "epoch": 2.4393521119878825, "grad_norm": 0.03366367518901825, "learning_rate": 1.215198459805034e-06, "loss": 0.0, "num_input_tokens_seen": 67305000, "step": 99850 }, { "epoch": 2.4394742628197297, "grad_norm": 0.013038181699812412, "learning_rate": 1.2151151794485966e-06, "loss": 0.0001, "num_input_tokens_seen": 67308520, "step": 99855 }, { "epoch": 2.439596413651577, "grad_norm": 0.0064660487696528435, "learning_rate": 1.2150318975277835e-06, "loss": 0.0883, "num_input_tokens_seen": 67311784, "step": 99860 }, { "epoch": 2.439718564483424, "grad_norm": 0.8282192945480347, "learning_rate": 1.2149486140432008e-06, "loss": 0.0014, "num_input_tokens_seen": 67315112, "step": 99865 }, { "epoch": 2.4398407153152712, "grad_norm": 39.81499099731445, "learning_rate": 1.214865328995454e-06, "loss": 0.1583, "num_input_tokens_seen": 67318056, "step": 99870 }, { "epoch": 2.4399628661471184, "grad_norm": 0.4565242826938629, "learning_rate": 1.2147820423851487e-06, "loss": 0.0569, "num_input_tokens_seen": 67321832, "step": 99875 }, { "epoch": 2.4400850169789656, "grad_norm": 0.058376945555210114, "learning_rate": 1.2146987542128906e-06, "loss": 0.0001, "num_input_tokens_seen": 67324968, "step": 99880 }, { "epoch": 2.440207167810813, "grad_norm": 0.017881063744425774, "learning_rate": 1.2146154644792855e-06, "loss": 0.0002, "num_input_tokens_seen": 67328232, "step": 99885 }, { "epoch": 2.44032931864266, "grad_norm": 0.022865792736411095, "learning_rate": 1.214532173184939e-06, "loss": 0.0005, "num_input_tokens_seen": 67331368, "step": 99890 }, { "epoch": 2.440451469474507, "grad_norm": 0.0678458958864212, "learning_rate": 1.2144488803304566e-06, "loss": 0.0002, "num_input_tokens_seen": 67334568, "step": 99895 }, { "epoch": 2.4405736203063544, "grad_norm": 0.03220222890377045, "learning_rate": 1.2143655859164445e-06, "loss": 0.0142, "num_input_tokens_seen": 67338280, "step": 99900 }, { "epoch": 2.4406957711382016, "grad_norm": 0.016223371028900146, "learning_rate": 1.2142822899435083e-06, "loss": 0.0004, "num_input_tokens_seen": 67342056, "step": 99905 }, { "epoch": 2.4408179219700488, "grad_norm": 0.025257963687181473, "learning_rate": 1.2141989924122534e-06, "loss": 0.0276, "num_input_tokens_seen": 67345896, "step": 99910 }, { "epoch": 2.440940072801896, "grad_norm": 0.0486016608774662, "learning_rate": 1.2141156933232856e-06, "loss": 0.0574, "num_input_tokens_seen": 67348904, "step": 99915 }, { "epoch": 2.441062223633743, "grad_norm": 29.583646774291992, "learning_rate": 1.2140323926772114e-06, "loss": 0.034, "num_input_tokens_seen": 67352744, "step": 99920 }, { "epoch": 2.44118437446559, "grad_norm": 0.04151540622115135, "learning_rate": 1.2139490904746359e-06, "loss": 0.0001, "num_input_tokens_seen": 67356328, "step": 99925 }, { "epoch": 2.4413065252974375, "grad_norm": 0.13054241240024567, "learning_rate": 1.213865786716165e-06, "loss": 0.0002, "num_input_tokens_seen": 67359976, "step": 99930 }, { "epoch": 2.4414286761292843, "grad_norm": 0.4463544189929962, "learning_rate": 1.2137824814024048e-06, "loss": 0.0003, "num_input_tokens_seen": 67363496, "step": 99935 }, { "epoch": 2.4415508269611315, "grad_norm": 0.1418827623128891, "learning_rate": 1.2136991745339606e-06, "loss": 0.0665, "num_input_tokens_seen": 67366504, "step": 99940 }, { "epoch": 2.4416729777929786, "grad_norm": 0.02598101831972599, "learning_rate": 1.2136158661114387e-06, "loss": 0.0004, "num_input_tokens_seen": 67370088, "step": 99945 }, { "epoch": 2.441795128624826, "grad_norm": 958.7318115234375, "learning_rate": 1.2135325561354446e-06, "loss": 0.101, "num_input_tokens_seen": 67373480, "step": 99950 }, { "epoch": 2.441917279456673, "grad_norm": 19.178232192993164, "learning_rate": 1.2134492446065844e-06, "loss": 0.0351, "num_input_tokens_seen": 67376680, "step": 99955 }, { "epoch": 2.44203943028852, "grad_norm": 0.021837705746293068, "learning_rate": 1.2133659315254637e-06, "loss": 0.0004, "num_input_tokens_seen": 67380072, "step": 99960 }, { "epoch": 2.4421615811203674, "grad_norm": 0.0334324985742569, "learning_rate": 1.2132826168926888e-06, "loss": 0.0005, "num_input_tokens_seen": 67383144, "step": 99965 }, { "epoch": 2.4422837319522146, "grad_norm": 0.002274291357025504, "learning_rate": 1.2131993007088654e-06, "loss": 0.0002, "num_input_tokens_seen": 67386472, "step": 99970 }, { "epoch": 2.442405882784062, "grad_norm": 0.10472066700458527, "learning_rate": 1.2131159829745991e-06, "loss": 0.0466, "num_input_tokens_seen": 67389800, "step": 99975 }, { "epoch": 2.442528033615909, "grad_norm": 0.11208847165107727, "learning_rate": 1.2130326636904963e-06, "loss": 0.0412, "num_input_tokens_seen": 67393512, "step": 99980 }, { "epoch": 2.442650184447756, "grad_norm": 0.4897066354751587, "learning_rate": 1.2129493428571627e-06, "loss": 0.0408, "num_input_tokens_seen": 67396648, "step": 99985 }, { "epoch": 2.4427723352796034, "grad_norm": 0.02697628177702427, "learning_rate": 1.2128660204752042e-06, "loss": 0.0002, "num_input_tokens_seen": 67400552, "step": 99990 }, { "epoch": 2.4428944861114505, "grad_norm": 0.007688530255109072, "learning_rate": 1.2127826965452266e-06, "loss": 0.0639, "num_input_tokens_seen": 67404008, "step": 99995 }, { "epoch": 2.4430166369432977, "grad_norm": 17.038576126098633, "learning_rate": 1.212699371067836e-06, "loss": 0.1636, "num_input_tokens_seen": 67407336, "step": 100000 }, { "epoch": 2.443138787775145, "grad_norm": 0.03481506183743477, "learning_rate": 1.212616044043638e-06, "loss": 0.0291, "num_input_tokens_seen": 67410664, "step": 100005 }, { "epoch": 2.4432609386069917, "grad_norm": 0.06357225775718689, "learning_rate": 1.2125327154732394e-06, "loss": 0.0003, "num_input_tokens_seen": 67413800, "step": 100010 }, { "epoch": 2.4433830894388393, "grad_norm": 0.060711268335580826, "learning_rate": 1.2124493853572458e-06, "loss": 0.0801, "num_input_tokens_seen": 67416616, "step": 100015 }, { "epoch": 2.443505240270686, "grad_norm": 0.0068696048110723495, "learning_rate": 1.2123660536962628e-06, "loss": 0.0002, "num_input_tokens_seen": 67419560, "step": 100020 }, { "epoch": 2.4436273911025332, "grad_norm": 0.019549276679754257, "learning_rate": 1.212282720490897e-06, "loss": 0.043, "num_input_tokens_seen": 67423272, "step": 100025 }, { "epoch": 2.4437495419343804, "grad_norm": 0.01690533384680748, "learning_rate": 1.2121993857417542e-06, "loss": 0.0004, "num_input_tokens_seen": 67426920, "step": 100030 }, { "epoch": 2.4438716927662276, "grad_norm": 0.12328990548849106, "learning_rate": 1.21211604944944e-06, "loss": 0.0002, "num_input_tokens_seen": 67430248, "step": 100035 }, { "epoch": 2.443993843598075, "grad_norm": 0.009458489716053009, "learning_rate": 1.2120327116145611e-06, "loss": 0.082, "num_input_tokens_seen": 67433576, "step": 100040 }, { "epoch": 2.444115994429922, "grad_norm": 21.805879592895508, "learning_rate": 1.2119493722377233e-06, "loss": 0.0875, "num_input_tokens_seen": 67437032, "step": 100045 }, { "epoch": 2.444238145261769, "grad_norm": 0.032339856028556824, "learning_rate": 1.2118660313195327e-06, "loss": 0.0665, "num_input_tokens_seen": 67440552, "step": 100050 }, { "epoch": 2.4443602960936164, "grad_norm": 0.07892021536827087, "learning_rate": 1.2117826888605953e-06, "loss": 0.0003, "num_input_tokens_seen": 67443624, "step": 100055 }, { "epoch": 2.4444824469254636, "grad_norm": 0.06196204572916031, "learning_rate": 1.2116993448615173e-06, "loss": 0.0004, "num_input_tokens_seen": 67447080, "step": 100060 }, { "epoch": 2.4446045977573108, "grad_norm": 0.10552898794412613, "learning_rate": 1.2116159993229045e-06, "loss": 0.12, "num_input_tokens_seen": 67450728, "step": 100065 }, { "epoch": 2.444726748589158, "grad_norm": 0.17518259584903717, "learning_rate": 1.2115326522453632e-06, "loss": 0.0004, "num_input_tokens_seen": 67454376, "step": 100070 }, { "epoch": 2.444848899421005, "grad_norm": 0.05008700117468834, "learning_rate": 1.2114493036294996e-06, "loss": 0.0526, "num_input_tokens_seen": 67457768, "step": 100075 }, { "epoch": 2.4449710502528523, "grad_norm": 25.26511573791504, "learning_rate": 1.21136595347592e-06, "loss": 0.0651, "num_input_tokens_seen": 67461160, "step": 100080 }, { "epoch": 2.4450932010846995, "grad_norm": 0.05229969322681427, "learning_rate": 1.2112826017852303e-06, "loss": 0.0733, "num_input_tokens_seen": 67464040, "step": 100085 }, { "epoch": 2.4452153519165467, "grad_norm": 0.011544210836291313, "learning_rate": 1.211199248558037e-06, "loss": 0.0002, "num_input_tokens_seen": 67467496, "step": 100090 }, { "epoch": 2.445337502748394, "grad_norm": 0.011759086512029171, "learning_rate": 1.2111158937949456e-06, "loss": 0.04, "num_input_tokens_seen": 67470824, "step": 100095 }, { "epoch": 2.445459653580241, "grad_norm": 2.3236122131347656, "learning_rate": 1.2110325374965624e-06, "loss": 0.0986, "num_input_tokens_seen": 67474088, "step": 100100 }, { "epoch": 2.445581804412088, "grad_norm": 0.016584821045398712, "learning_rate": 1.2109491796634942e-06, "loss": 0.0009, "num_input_tokens_seen": 67477160, "step": 100105 }, { "epoch": 2.445703955243935, "grad_norm": 0.03389434516429901, "learning_rate": 1.2108658202963464e-06, "loss": 0.0002, "num_input_tokens_seen": 67480552, "step": 100110 }, { "epoch": 2.445826106075782, "grad_norm": 0.03657127916812897, "learning_rate": 1.210782459395726e-06, "loss": 0.0446, "num_input_tokens_seen": 67483816, "step": 100115 }, { "epoch": 2.4459482569076294, "grad_norm": 19.909534454345703, "learning_rate": 1.2106990969622388e-06, "loss": 0.0379, "num_input_tokens_seen": 67487464, "step": 100120 }, { "epoch": 2.4460704077394766, "grad_norm": 0.20571334660053253, "learning_rate": 1.2106157329964913e-06, "loss": 0.0002, "num_input_tokens_seen": 67490792, "step": 100125 }, { "epoch": 2.446192558571324, "grad_norm": 0.005827835761010647, "learning_rate": 1.210532367499089e-06, "loss": 0.0465, "num_input_tokens_seen": 67494056, "step": 100130 }, { "epoch": 2.446314709403171, "grad_norm": 0.5593209266662598, "learning_rate": 1.210449000470639e-06, "loss": 0.0004, "num_input_tokens_seen": 67497512, "step": 100135 }, { "epoch": 2.446436860235018, "grad_norm": 13.949110984802246, "learning_rate": 1.2103656319117474e-06, "loss": 0.0529, "num_input_tokens_seen": 67500840, "step": 100140 }, { "epoch": 2.4465590110668654, "grad_norm": 0.0062551493756473064, "learning_rate": 1.2102822618230204e-06, "loss": 0.0396, "num_input_tokens_seen": 67504168, "step": 100145 }, { "epoch": 2.4466811618987125, "grad_norm": 0.0040052300319075584, "learning_rate": 1.210198890205064e-06, "loss": 0.0005, "num_input_tokens_seen": 67507496, "step": 100150 }, { "epoch": 2.4468033127305597, "grad_norm": 0.14093728363513947, "learning_rate": 1.210115517058485e-06, "loss": 0.0004, "num_input_tokens_seen": 67510824, "step": 100155 }, { "epoch": 2.446925463562407, "grad_norm": 0.26041334867477417, "learning_rate": 1.2100321423838889e-06, "loss": 0.0614, "num_input_tokens_seen": 67514216, "step": 100160 }, { "epoch": 2.447047614394254, "grad_norm": 0.06177394092082977, "learning_rate": 1.2099487661818829e-06, "loss": 0.0004, "num_input_tokens_seen": 67517608, "step": 100165 }, { "epoch": 2.4471697652261013, "grad_norm": 30.27055549621582, "learning_rate": 1.209865388453073e-06, "loss": 0.0595, "num_input_tokens_seen": 67520872, "step": 100170 }, { "epoch": 2.4472919160579485, "grad_norm": 0.05899616703391075, "learning_rate": 1.2097820091980654e-06, "loss": 0.0971, "num_input_tokens_seen": 67523880, "step": 100175 }, { "epoch": 2.4474140668897957, "grad_norm": 22.625675201416016, "learning_rate": 1.209698628417467e-06, "loss": 0.166, "num_input_tokens_seen": 67526888, "step": 100180 }, { "epoch": 2.447536217721643, "grad_norm": 0.04590220749378204, "learning_rate": 1.2096152461118836e-06, "loss": 0.0003, "num_input_tokens_seen": 67530088, "step": 100185 }, { "epoch": 2.4476583685534896, "grad_norm": 0.10016443580389023, "learning_rate": 1.2095318622819216e-06, "loss": 0.0469, "num_input_tokens_seen": 67533032, "step": 100190 }, { "epoch": 2.4477805193853373, "grad_norm": 0.08563831448554993, "learning_rate": 1.2094484769281877e-06, "loss": 0.0003, "num_input_tokens_seen": 67536040, "step": 100195 }, { "epoch": 2.447902670217184, "grad_norm": 0.09388595819473267, "learning_rate": 1.2093650900512879e-06, "loss": 0.0562, "num_input_tokens_seen": 67539496, "step": 100200 }, { "epoch": 2.448024821049031, "grad_norm": 15.039061546325684, "learning_rate": 1.2092817016518291e-06, "loss": 0.0657, "num_input_tokens_seen": 67542824, "step": 100205 }, { "epoch": 2.4481469718808784, "grad_norm": 0.07398822903633118, "learning_rate": 1.2091983117304174e-06, "loss": 0.0311, "num_input_tokens_seen": 67545832, "step": 100210 }, { "epoch": 2.4482691227127256, "grad_norm": 0.012557965703308582, "learning_rate": 1.2091149202876593e-06, "loss": 0.0398, "num_input_tokens_seen": 67549288, "step": 100215 }, { "epoch": 2.4483912735445728, "grad_norm": 0.4743223488330841, "learning_rate": 1.2090315273241613e-06, "loss": 0.0006, "num_input_tokens_seen": 67552296, "step": 100220 }, { "epoch": 2.44851342437642, "grad_norm": 0.2512113153934479, "learning_rate": 1.2089481328405295e-06, "loss": 0.0542, "num_input_tokens_seen": 67555624, "step": 100225 }, { "epoch": 2.448635575208267, "grad_norm": 0.02216065302491188, "learning_rate": 1.208864736837371e-06, "loss": 0.0003, "num_input_tokens_seen": 67559144, "step": 100230 }, { "epoch": 2.4487577260401143, "grad_norm": 0.07935440540313721, "learning_rate": 1.2087813393152919e-06, "loss": 0.0007, "num_input_tokens_seen": 67562600, "step": 100235 }, { "epoch": 2.4488798768719615, "grad_norm": 18.39009666442871, "learning_rate": 1.208697940274899e-06, "loss": 0.0804, "num_input_tokens_seen": 67565736, "step": 100240 }, { "epoch": 2.4490020277038087, "grad_norm": 55.88145446777344, "learning_rate": 1.2086145397167981e-06, "loss": 0.0418, "num_input_tokens_seen": 67569000, "step": 100245 }, { "epoch": 2.449124178535656, "grad_norm": 0.10761766880750656, "learning_rate": 1.2085311376415965e-06, "loss": 0.0003, "num_input_tokens_seen": 67572136, "step": 100250 }, { "epoch": 2.449246329367503, "grad_norm": 0.07407483458518982, "learning_rate": 1.2084477340499003e-06, "loss": 0.0004, "num_input_tokens_seen": 67575272, "step": 100255 }, { "epoch": 2.4493684801993503, "grad_norm": 0.02553796023130417, "learning_rate": 1.208364328942316e-06, "loss": 0.003, "num_input_tokens_seen": 67578536, "step": 100260 }, { "epoch": 2.4494906310311975, "grad_norm": 0.02047552913427353, "learning_rate": 1.2082809223194502e-06, "loss": 0.0006, "num_input_tokens_seen": 67581864, "step": 100265 }, { "epoch": 2.4496127818630447, "grad_norm": 0.007181069348007441, "learning_rate": 1.2081975141819097e-06, "loss": 0.0401, "num_input_tokens_seen": 67585000, "step": 100270 }, { "epoch": 2.449734932694892, "grad_norm": 0.008215844631195068, "learning_rate": 1.2081141045303007e-06, "loss": 0.0002, "num_input_tokens_seen": 67588264, "step": 100275 }, { "epoch": 2.449857083526739, "grad_norm": 0.007480736821889877, "learning_rate": 1.20803069336523e-06, "loss": 0.0003, "num_input_tokens_seen": 67592104, "step": 100280 }, { "epoch": 2.4499792343585858, "grad_norm": 0.0785703957080841, "learning_rate": 1.207947280687304e-06, "loss": 0.0428, "num_input_tokens_seen": 67595560, "step": 100285 }, { "epoch": 2.450101385190433, "grad_norm": 0.008701242506504059, "learning_rate": 1.2078638664971297e-06, "loss": 0.0002, "num_input_tokens_seen": 67598760, "step": 100290 }, { "epoch": 2.45022353602228, "grad_norm": 4.186373233795166, "learning_rate": 1.2077804507953135e-06, "loss": 0.009, "num_input_tokens_seen": 67602152, "step": 100295 }, { "epoch": 2.4503456868541273, "grad_norm": 0.03363225609064102, "learning_rate": 1.2076970335824618e-06, "loss": 0.1083, "num_input_tokens_seen": 67605608, "step": 100300 }, { "epoch": 2.4504678376859745, "grad_norm": 0.016309145838022232, "learning_rate": 1.2076136148591817e-06, "loss": 0.0788, "num_input_tokens_seen": 67608488, "step": 100305 }, { "epoch": 2.4505899885178217, "grad_norm": 0.013027573004364967, "learning_rate": 1.2075301946260795e-06, "loss": 0.0001, "num_input_tokens_seen": 67611944, "step": 100310 }, { "epoch": 2.450712139349669, "grad_norm": 0.035746023058891296, "learning_rate": 1.2074467728837615e-06, "loss": 0.0004, "num_input_tokens_seen": 67615272, "step": 100315 }, { "epoch": 2.450834290181516, "grad_norm": 0.025673598051071167, "learning_rate": 1.2073633496328348e-06, "loss": 0.0182, "num_input_tokens_seen": 67618600, "step": 100320 }, { "epoch": 2.4509564410133633, "grad_norm": 0.017467927187681198, "learning_rate": 1.2072799248739062e-06, "loss": 0.097, "num_input_tokens_seen": 67621736, "step": 100325 }, { "epoch": 2.4510785918452105, "grad_norm": 0.007844069972634315, "learning_rate": 1.2071964986075825e-06, "loss": 0.0042, "num_input_tokens_seen": 67625448, "step": 100330 }, { "epoch": 2.4512007426770577, "grad_norm": 0.18830718100070953, "learning_rate": 1.2071130708344698e-06, "loss": 0.0003, "num_input_tokens_seen": 67628776, "step": 100335 }, { "epoch": 2.451322893508905, "grad_norm": 0.05894733592867851, "learning_rate": 1.2070296415551754e-06, "loss": 0.0886, "num_input_tokens_seen": 67632168, "step": 100340 }, { "epoch": 2.451445044340752, "grad_norm": 0.11676836758852005, "learning_rate": 1.2069462107703055e-06, "loss": 0.0001, "num_input_tokens_seen": 67635880, "step": 100345 }, { "epoch": 2.4515671951725992, "grad_norm": 0.05520134046673775, "learning_rate": 1.2068627784804673e-06, "loss": 0.0002, "num_input_tokens_seen": 67639272, "step": 100350 }, { "epoch": 2.4516893460044464, "grad_norm": 32.44426727294922, "learning_rate": 1.206779344686267e-06, "loss": 0.0013, "num_input_tokens_seen": 67643048, "step": 100355 }, { "epoch": 2.4518114968362936, "grad_norm": 1.8535118103027344, "learning_rate": 1.2066959093883122e-06, "loss": 0.0665, "num_input_tokens_seen": 67646504, "step": 100360 }, { "epoch": 2.451933647668141, "grad_norm": 0.03558404743671417, "learning_rate": 1.206612472587209e-06, "loss": 0.0002, "num_input_tokens_seen": 67650216, "step": 100365 }, { "epoch": 2.4520557984999876, "grad_norm": 0.021302519366145134, "learning_rate": 1.206529034283564e-06, "loss": 0.0328, "num_input_tokens_seen": 67653672, "step": 100370 }, { "epoch": 2.452177949331835, "grad_norm": 0.019249480217695236, "learning_rate": 1.2064455944779845e-06, "loss": 0.0379, "num_input_tokens_seen": 67657256, "step": 100375 }, { "epoch": 2.452300100163682, "grad_norm": 0.1599765121936798, "learning_rate": 1.206362153171077e-06, "loss": 0.0913, "num_input_tokens_seen": 67660456, "step": 100380 }, { "epoch": 2.452422250995529, "grad_norm": 0.02373223938047886, "learning_rate": 1.2062787103634486e-06, "loss": 0.0008, "num_input_tokens_seen": 67663976, "step": 100385 }, { "epoch": 2.4525444018273763, "grad_norm": 0.004647197667509317, "learning_rate": 1.206195266055706e-06, "loss": 0.0002, "num_input_tokens_seen": 67667560, "step": 100390 }, { "epoch": 2.4526665526592235, "grad_norm": 0.05331575870513916, "learning_rate": 1.2061118202484556e-06, "loss": 0.0678, "num_input_tokens_seen": 67670824, "step": 100395 }, { "epoch": 2.4527887034910707, "grad_norm": 21.77971839904785, "learning_rate": 1.206028372942305e-06, "loss": 0.0514, "num_input_tokens_seen": 67674344, "step": 100400 }, { "epoch": 2.452910854322918, "grad_norm": 21.462068557739258, "learning_rate": 1.2059449241378608e-06, "loss": 0.0522, "num_input_tokens_seen": 67678376, "step": 100405 }, { "epoch": 2.453033005154765, "grad_norm": 0.0774010568857193, "learning_rate": 1.2058614738357294e-06, "loss": 0.0158, "num_input_tokens_seen": 67681448, "step": 100410 }, { "epoch": 2.4531551559866123, "grad_norm": 0.007746968884021044, "learning_rate": 1.205778022036518e-06, "loss": 0.0392, "num_input_tokens_seen": 67684904, "step": 100415 }, { "epoch": 2.4532773068184595, "grad_norm": 0.08811292052268982, "learning_rate": 1.2056945687408334e-06, "loss": 0.0002, "num_input_tokens_seen": 67687912, "step": 100420 }, { "epoch": 2.4533994576503066, "grad_norm": 239.14468383789062, "learning_rate": 1.2056111139492827e-06, "loss": 0.0553, "num_input_tokens_seen": 67691176, "step": 100425 }, { "epoch": 2.453521608482154, "grad_norm": 0.01934845559298992, "learning_rate": 1.2055276576624727e-06, "loss": 0.0639, "num_input_tokens_seen": 67694568, "step": 100430 }, { "epoch": 2.453643759314001, "grad_norm": 0.09107211232185364, "learning_rate": 1.2054441998810103e-06, "loss": 0.1097, "num_input_tokens_seen": 67698088, "step": 100435 }, { "epoch": 2.453765910145848, "grad_norm": 0.0451941080391407, "learning_rate": 1.205360740605502e-06, "loss": 0.0523, "num_input_tokens_seen": 67701544, "step": 100440 }, { "epoch": 2.4538880609776954, "grad_norm": 0.056204136461019516, "learning_rate": 1.2052772798365556e-06, "loss": 0.0754, "num_input_tokens_seen": 67704616, "step": 100445 }, { "epoch": 2.4540102118095426, "grad_norm": 0.027863988652825356, "learning_rate": 1.2051938175747777e-06, "loss": 0.0731, "num_input_tokens_seen": 67708136, "step": 100450 }, { "epoch": 2.4541323626413893, "grad_norm": 0.09919628500938416, "learning_rate": 1.2051103538207752e-06, "loss": 0.0003, "num_input_tokens_seen": 67711784, "step": 100455 }, { "epoch": 2.454254513473237, "grad_norm": 64.31039428710938, "learning_rate": 1.2050268885751547e-06, "loss": 0.2318, "num_input_tokens_seen": 67715176, "step": 100460 }, { "epoch": 2.4543766643050837, "grad_norm": 0.015285374596714973, "learning_rate": 1.2049434218385236e-06, "loss": 0.0004, "num_input_tokens_seen": 67718632, "step": 100465 }, { "epoch": 2.454498815136931, "grad_norm": 14.737951278686523, "learning_rate": 1.2048599536114887e-06, "loss": 0.0355, "num_input_tokens_seen": 67722472, "step": 100470 }, { "epoch": 2.454620965968778, "grad_norm": 4.654276371002197, "learning_rate": 1.2047764838946574e-06, "loss": 0.0015, "num_input_tokens_seen": 67725864, "step": 100475 }, { "epoch": 2.4547431168006253, "grad_norm": 14.541358947753906, "learning_rate": 1.2046930126886362e-06, "loss": 0.0008, "num_input_tokens_seen": 67729128, "step": 100480 }, { "epoch": 2.4548652676324725, "grad_norm": 0.00767884124070406, "learning_rate": 1.2046095399940326e-06, "loss": 0.0001, "num_input_tokens_seen": 67732328, "step": 100485 }, { "epoch": 2.4549874184643197, "grad_norm": 0.023718344047665596, "learning_rate": 1.2045260658114534e-06, "loss": 0.095, "num_input_tokens_seen": 67735656, "step": 100490 }, { "epoch": 2.455109569296167, "grad_norm": 0.27190932631492615, "learning_rate": 1.2044425901415053e-06, "loss": 0.0004, "num_input_tokens_seen": 67738920, "step": 100495 }, { "epoch": 2.455231720128014, "grad_norm": 0.017982225865125656, "learning_rate": 1.204359112984796e-06, "loss": 0.0003, "num_input_tokens_seen": 67742568, "step": 100500 }, { "epoch": 2.4553538709598612, "grad_norm": 0.030523277819156647, "learning_rate": 1.2042756343419324e-06, "loss": 0.0003, "num_input_tokens_seen": 67746344, "step": 100505 }, { "epoch": 2.4554760217917084, "grad_norm": 2.859264850616455, "learning_rate": 1.2041921542135209e-06, "loss": 0.0011, "num_input_tokens_seen": 67749544, "step": 100510 }, { "epoch": 2.4555981726235556, "grad_norm": 0.15216529369354248, "learning_rate": 1.2041086726001696e-06, "loss": 0.0002, "num_input_tokens_seen": 67752744, "step": 100515 }, { "epoch": 2.455720323455403, "grad_norm": 0.024804210290312767, "learning_rate": 1.204025189502485e-06, "loss": 0.0983, "num_input_tokens_seen": 67756456, "step": 100520 }, { "epoch": 2.45584247428725, "grad_norm": 0.03291197121143341, "learning_rate": 1.2039417049210743e-06, "loss": 0.0002, "num_input_tokens_seen": 67759848, "step": 100525 }, { "epoch": 2.455964625119097, "grad_norm": 35.64726257324219, "learning_rate": 1.2038582188565448e-06, "loss": 0.0562, "num_input_tokens_seen": 67763176, "step": 100530 }, { "epoch": 2.4560867759509444, "grad_norm": 0.017125124111771584, "learning_rate": 1.2037747313095032e-06, "loss": 0.0707, "num_input_tokens_seen": 67766376, "step": 100535 }, { "epoch": 2.4562089267827916, "grad_norm": 0.009626665152609348, "learning_rate": 1.2036912422805572e-06, "loss": 0.0001, "num_input_tokens_seen": 67769704, "step": 100540 }, { "epoch": 2.4563310776146388, "grad_norm": 0.10648053884506226, "learning_rate": 1.2036077517703136e-06, "loss": 0.0005, "num_input_tokens_seen": 67772648, "step": 100545 }, { "epoch": 2.4564532284464855, "grad_norm": 4.779392242431641, "learning_rate": 1.2035242597793796e-06, "loss": 0.0008, "num_input_tokens_seen": 67776040, "step": 100550 }, { "epoch": 2.4565753792783327, "grad_norm": 39.19169998168945, "learning_rate": 1.2034407663083626e-06, "loss": 0.0917, "num_input_tokens_seen": 67779368, "step": 100555 }, { "epoch": 2.45669753011018, "grad_norm": 0.06931298226118088, "learning_rate": 1.2033572713578698e-06, "loss": 0.0638, "num_input_tokens_seen": 67782824, "step": 100560 }, { "epoch": 2.456819680942027, "grad_norm": 0.03799625486135483, "learning_rate": 1.2032737749285077e-06, "loss": 0.0003, "num_input_tokens_seen": 67786344, "step": 100565 }, { "epoch": 2.4569418317738743, "grad_norm": 16.83151626586914, "learning_rate": 1.2031902770208846e-06, "loss": 0.0584, "num_input_tokens_seen": 67789864, "step": 100570 }, { "epoch": 2.4570639826057215, "grad_norm": 0.01382389385253191, "learning_rate": 1.2031067776356068e-06, "loss": 0.0627, "num_input_tokens_seen": 67793192, "step": 100575 }, { "epoch": 2.4571861334375686, "grad_norm": 0.02206377312541008, "learning_rate": 1.203023276773282e-06, "loss": 0.1053, "num_input_tokens_seen": 67796520, "step": 100580 }, { "epoch": 2.457308284269416, "grad_norm": 33.02057647705078, "learning_rate": 1.2029397744345173e-06, "loss": 0.1509, "num_input_tokens_seen": 67799848, "step": 100585 }, { "epoch": 2.457430435101263, "grad_norm": 0.11031505465507507, "learning_rate": 1.2028562706199198e-06, "loss": 0.0352, "num_input_tokens_seen": 67803048, "step": 100590 }, { "epoch": 2.45755258593311, "grad_norm": 0.03389930725097656, "learning_rate": 1.202772765330097e-06, "loss": 0.1097, "num_input_tokens_seen": 67806312, "step": 100595 }, { "epoch": 2.4576747367649574, "grad_norm": 0.1475170999765396, "learning_rate": 1.2026892585656564e-06, "loss": 0.0007, "num_input_tokens_seen": 67810024, "step": 100600 }, { "epoch": 2.4577968875968046, "grad_norm": 0.26105520129203796, "learning_rate": 1.2026057503272048e-06, "loss": 0.0495, "num_input_tokens_seen": 67813224, "step": 100605 }, { "epoch": 2.457919038428652, "grad_norm": 3.085169792175293, "learning_rate": 1.2025222406153499e-06, "loss": 0.0053, "num_input_tokens_seen": 67816488, "step": 100610 }, { "epoch": 2.458041189260499, "grad_norm": 0.14629432559013367, "learning_rate": 1.2024387294306986e-06, "loss": 0.056, "num_input_tokens_seen": 67819752, "step": 100615 }, { "epoch": 2.458163340092346, "grad_norm": 0.026330020278692245, "learning_rate": 1.2023552167738585e-06, "loss": 0.0334, "num_input_tokens_seen": 67823144, "step": 100620 }, { "epoch": 2.4582854909241934, "grad_norm": 0.03332911804318428, "learning_rate": 1.2022717026454365e-06, "loss": 0.0162, "num_input_tokens_seen": 67826600, "step": 100625 }, { "epoch": 2.4584076417560405, "grad_norm": 1.5771613121032715, "learning_rate": 1.2021881870460404e-06, "loss": 0.0006, "num_input_tokens_seen": 67829800, "step": 100630 }, { "epoch": 2.4585297925878873, "grad_norm": 0.4669386148452759, "learning_rate": 1.2021046699762777e-06, "loss": 0.0005, "num_input_tokens_seen": 67833192, "step": 100635 }, { "epoch": 2.458651943419735, "grad_norm": 0.07576936483383179, "learning_rate": 1.2020211514367552e-06, "loss": 0.0717, "num_input_tokens_seen": 67836584, "step": 100640 }, { "epoch": 2.4587740942515817, "grad_norm": 0.06343811750411987, "learning_rate": 1.2019376314280808e-06, "loss": 0.0005, "num_input_tokens_seen": 67839784, "step": 100645 }, { "epoch": 2.458896245083429, "grad_norm": 0.4868296980857849, "learning_rate": 1.2018541099508614e-06, "loss": 0.0005, "num_input_tokens_seen": 67843112, "step": 100650 }, { "epoch": 2.459018395915276, "grad_norm": 0.2107633501291275, "learning_rate": 1.201770587005705e-06, "loss": 0.0463, "num_input_tokens_seen": 67847080, "step": 100655 }, { "epoch": 2.4591405467471232, "grad_norm": 0.014766460284590721, "learning_rate": 1.2016870625932182e-06, "loss": 0.0002, "num_input_tokens_seen": 67850536, "step": 100660 }, { "epoch": 2.4592626975789704, "grad_norm": 0.07846330851316452, "learning_rate": 1.201603536714009e-06, "loss": 0.0466, "num_input_tokens_seen": 67854248, "step": 100665 }, { "epoch": 2.4593848484108176, "grad_norm": 0.01824275404214859, "learning_rate": 1.2015200093686845e-06, "loss": 0.0002, "num_input_tokens_seen": 67857512, "step": 100670 }, { "epoch": 2.459506999242665, "grad_norm": 0.001176109304651618, "learning_rate": 1.2014364805578525e-06, "loss": 0.0477, "num_input_tokens_seen": 67860776, "step": 100675 }, { "epoch": 2.459629150074512, "grad_norm": 0.013056579045951366, "learning_rate": 1.2013529502821203e-06, "loss": 0.0445, "num_input_tokens_seen": 67863976, "step": 100680 }, { "epoch": 2.459751300906359, "grad_norm": 0.03527745604515076, "learning_rate": 1.201269418542095e-06, "loss": 0.0361, "num_input_tokens_seen": 67867112, "step": 100685 }, { "epoch": 2.4598734517382064, "grad_norm": 0.0246686153113842, "learning_rate": 1.2011858853383846e-06, "loss": 0.0516, "num_input_tokens_seen": 67870632, "step": 100690 }, { "epoch": 2.4599956025700536, "grad_norm": 0.012224989011883736, "learning_rate": 1.201102350671596e-06, "loss": 0.0327, "num_input_tokens_seen": 67873896, "step": 100695 }, { "epoch": 2.4601177534019008, "grad_norm": 0.08171892166137695, "learning_rate": 1.2010188145423373e-06, "loss": 0.0366, "num_input_tokens_seen": 67877224, "step": 100700 }, { "epoch": 2.460239904233748, "grad_norm": 36.575340270996094, "learning_rate": 1.2009352769512157e-06, "loss": 0.0995, "num_input_tokens_seen": 67881064, "step": 100705 }, { "epoch": 2.460362055065595, "grad_norm": 0.02174554392695427, "learning_rate": 1.2008517378988387e-06, "loss": 0.0004, "num_input_tokens_seen": 67884520, "step": 100710 }, { "epoch": 2.4604842058974423, "grad_norm": 0.008866322226822376, "learning_rate": 1.200768197385814e-06, "loss": 0.0346, "num_input_tokens_seen": 67888104, "step": 100715 }, { "epoch": 2.4606063567292895, "grad_norm": 0.06225938722491264, "learning_rate": 1.2006846554127485e-06, "loss": 0.0423, "num_input_tokens_seen": 67891304, "step": 100720 }, { "epoch": 2.4607285075611367, "grad_norm": 0.024569325149059296, "learning_rate": 1.2006011119802506e-06, "loss": 0.0481, "num_input_tokens_seen": 67894568, "step": 100725 }, { "epoch": 2.4608506583929834, "grad_norm": 0.6258397102355957, "learning_rate": 1.2005175670889273e-06, "loss": 0.0716, "num_input_tokens_seen": 67897960, "step": 100730 }, { "epoch": 2.4609728092248306, "grad_norm": 0.1836250126361847, "learning_rate": 1.2004340207393866e-06, "loss": 0.0449, "num_input_tokens_seen": 67901352, "step": 100735 }, { "epoch": 2.461094960056678, "grad_norm": 0.0073592402040958405, "learning_rate": 1.2003504729322355e-06, "loss": 0.0001, "num_input_tokens_seen": 67904616, "step": 100740 }, { "epoch": 2.461217110888525, "grad_norm": 0.009784862399101257, "learning_rate": 1.200266923668082e-06, "loss": 0.0502, "num_input_tokens_seen": 67908008, "step": 100745 }, { "epoch": 2.461339261720372, "grad_norm": 0.7223156690597534, "learning_rate": 1.2001833729475332e-06, "loss": 0.0611, "num_input_tokens_seen": 67911208, "step": 100750 }, { "epoch": 2.4614614125522194, "grad_norm": 0.04937265068292618, "learning_rate": 1.2000998207711974e-06, "loss": 0.0005, "num_input_tokens_seen": 67915048, "step": 100755 }, { "epoch": 2.4615835633840666, "grad_norm": 1.4051700830459595, "learning_rate": 1.200016267139682e-06, "loss": 0.033, "num_input_tokens_seen": 67918312, "step": 100760 }, { "epoch": 2.4617057142159138, "grad_norm": 1.7179447412490845, "learning_rate": 1.1999327120535945e-06, "loss": 0.0005, "num_input_tokens_seen": 67921576, "step": 100765 }, { "epoch": 2.461827865047761, "grad_norm": 0.006037765648216009, "learning_rate": 1.1998491555135424e-06, "loss": 0.0006, "num_input_tokens_seen": 67924520, "step": 100770 }, { "epoch": 2.461950015879608, "grad_norm": 0.15003329515457153, "learning_rate": 1.1997655975201335e-06, "loss": 0.0009, "num_input_tokens_seen": 67927656, "step": 100775 }, { "epoch": 2.4620721667114553, "grad_norm": 0.005590246059000492, "learning_rate": 1.1996820380739754e-06, "loss": 0.0867, "num_input_tokens_seen": 67931304, "step": 100780 }, { "epoch": 2.4621943175433025, "grad_norm": 0.004409489221870899, "learning_rate": 1.1995984771756757e-06, "loss": 0.0005, "num_input_tokens_seen": 67934760, "step": 100785 }, { "epoch": 2.4623164683751497, "grad_norm": 0.008362567983567715, "learning_rate": 1.1995149148258423e-06, "loss": 0.0, "num_input_tokens_seen": 67937960, "step": 100790 }, { "epoch": 2.462438619206997, "grad_norm": 13.45290470123291, "learning_rate": 1.1994313510250828e-06, "loss": 0.098, "num_input_tokens_seen": 67941160, "step": 100795 }, { "epoch": 2.462560770038844, "grad_norm": 0.001858392613939941, "learning_rate": 1.1993477857740049e-06, "loss": 0.0002, "num_input_tokens_seen": 67944488, "step": 100800 }, { "epoch": 2.4626829208706913, "grad_norm": 0.06624038517475128, "learning_rate": 1.1992642190732163e-06, "loss": 0.1641, "num_input_tokens_seen": 67947880, "step": 100805 }, { "epoch": 2.4628050717025385, "grad_norm": 0.02120389975607395, "learning_rate": 1.1991806509233246e-06, "loss": 0.0002, "num_input_tokens_seen": 67951144, "step": 100810 }, { "epoch": 2.4629272225343852, "grad_norm": 0.006992959883064032, "learning_rate": 1.199097081324938e-06, "loss": 0.142, "num_input_tokens_seen": 67954984, "step": 100815 }, { "epoch": 2.463049373366233, "grad_norm": 0.049475252628326416, "learning_rate": 1.1990135102786634e-06, "loss": 0.0753, "num_input_tokens_seen": 67958440, "step": 100820 }, { "epoch": 2.4631715241980796, "grad_norm": 0.046576276421546936, "learning_rate": 1.1989299377851093e-06, "loss": 0.097, "num_input_tokens_seen": 67962152, "step": 100825 }, { "epoch": 2.463293675029927, "grad_norm": 0.056836869567632675, "learning_rate": 1.1988463638448832e-06, "loss": 0.0011, "num_input_tokens_seen": 67965608, "step": 100830 }, { "epoch": 2.463415825861774, "grad_norm": 0.04384641721844673, "learning_rate": 1.1987627884585927e-06, "loss": 0.0433, "num_input_tokens_seen": 67968744, "step": 100835 }, { "epoch": 2.463537976693621, "grad_norm": 0.26525595784187317, "learning_rate": 1.1986792116268458e-06, "loss": 0.0758, "num_input_tokens_seen": 67971752, "step": 100840 }, { "epoch": 2.4636601275254684, "grad_norm": 0.03251001983880997, "learning_rate": 1.19859563335025e-06, "loss": 0.0019, "num_input_tokens_seen": 67975208, "step": 100845 }, { "epoch": 2.4637822783573156, "grad_norm": 0.03581248223781586, "learning_rate": 1.1985120536294135e-06, "loss": 0.0311, "num_input_tokens_seen": 67978152, "step": 100850 }, { "epoch": 2.4639044291891627, "grad_norm": 0.050688888877630234, "learning_rate": 1.198428472464944e-06, "loss": 0.002, "num_input_tokens_seen": 67981480, "step": 100855 }, { "epoch": 2.46402658002101, "grad_norm": 0.1690601259469986, "learning_rate": 1.1983448898574493e-06, "loss": 0.0004, "num_input_tokens_seen": 67984680, "step": 100860 }, { "epoch": 2.464148730852857, "grad_norm": 1.0517950057983398, "learning_rate": 1.1982613058075372e-06, "loss": 0.132, "num_input_tokens_seen": 67987944, "step": 100865 }, { "epoch": 2.4642708816847043, "grad_norm": 0.02967626415193081, "learning_rate": 1.198177720315816e-06, "loss": 0.0002, "num_input_tokens_seen": 67992104, "step": 100870 }, { "epoch": 2.4643930325165515, "grad_norm": 0.02798345312476158, "learning_rate": 1.1980941333828924e-06, "loss": 0.1005, "num_input_tokens_seen": 67995368, "step": 100875 }, { "epoch": 2.4645151833483987, "grad_norm": 0.02776155062019825, "learning_rate": 1.1980105450093754e-06, "loss": 0.0001, "num_input_tokens_seen": 67998952, "step": 100880 }, { "epoch": 2.464637334180246, "grad_norm": 0.13066832721233368, "learning_rate": 1.1979269551958722e-06, "loss": 0.0002, "num_input_tokens_seen": 68002408, "step": 100885 }, { "epoch": 2.464759485012093, "grad_norm": 68.17784881591797, "learning_rate": 1.197843363942991e-06, "loss": 0.0338, "num_input_tokens_seen": 68005736, "step": 100890 }, { "epoch": 2.4648816358439403, "grad_norm": 0.05373845994472504, "learning_rate": 1.19775977125134e-06, "loss": 0.0468, "num_input_tokens_seen": 68009512, "step": 100895 }, { "epoch": 2.4650037866757875, "grad_norm": 0.07720531523227692, "learning_rate": 1.1976761771215262e-06, "loss": 0.0015, "num_input_tokens_seen": 68012584, "step": 100900 }, { "epoch": 2.4651259375076346, "grad_norm": 18.308761596679688, "learning_rate": 1.1975925815541582e-06, "loss": 0.149, "num_input_tokens_seen": 68016168, "step": 100905 }, { "epoch": 2.4652480883394814, "grad_norm": 0.0015869983471930027, "learning_rate": 1.197508984549844e-06, "loss": 0.0001, "num_input_tokens_seen": 68019688, "step": 100910 }, { "epoch": 2.4653702391713286, "grad_norm": 0.05763440206646919, "learning_rate": 1.1974253861091914e-06, "loss": 0.039, "num_input_tokens_seen": 68022824, "step": 100915 }, { "epoch": 2.4654923900031758, "grad_norm": 0.07990226149559021, "learning_rate": 1.1973417862328084e-06, "loss": 0.0191, "num_input_tokens_seen": 68026344, "step": 100920 }, { "epoch": 2.465614540835023, "grad_norm": 0.38577136397361755, "learning_rate": 1.1972581849213024e-06, "loss": 0.0004, "num_input_tokens_seen": 68029480, "step": 100925 }, { "epoch": 2.46573669166687, "grad_norm": 0.041660867631435394, "learning_rate": 1.197174582175282e-06, "loss": 0.0001, "num_input_tokens_seen": 68032744, "step": 100930 }, { "epoch": 2.4658588424987173, "grad_norm": 50.8360481262207, "learning_rate": 1.1970909779953553e-06, "loss": 0.1133, "num_input_tokens_seen": 68035880, "step": 100935 }, { "epoch": 2.4659809933305645, "grad_norm": 0.009769965894520283, "learning_rate": 1.1970073723821294e-06, "loss": 0.0452, "num_input_tokens_seen": 68039208, "step": 100940 }, { "epoch": 2.4661031441624117, "grad_norm": 0.029366832226514816, "learning_rate": 1.1969237653362135e-06, "loss": 0.0796, "num_input_tokens_seen": 68042664, "step": 100945 }, { "epoch": 2.466225294994259, "grad_norm": 0.02519991248846054, "learning_rate": 1.1968401568582145e-06, "loss": 0.0028, "num_input_tokens_seen": 68046248, "step": 100950 }, { "epoch": 2.466347445826106, "grad_norm": 0.08081067353487015, "learning_rate": 1.1967565469487413e-06, "loss": 0.0006, "num_input_tokens_seen": 68049512, "step": 100955 }, { "epoch": 2.4664695966579533, "grad_norm": 0.3698766231536865, "learning_rate": 1.1966729356084016e-06, "loss": 0.0002, "num_input_tokens_seen": 68052840, "step": 100960 }, { "epoch": 2.4665917474898005, "grad_norm": 0.046203531324863434, "learning_rate": 1.1965893228378032e-06, "loss": 0.0533, "num_input_tokens_seen": 68056040, "step": 100965 }, { "epoch": 2.4667138983216477, "grad_norm": 0.027350353077054024, "learning_rate": 1.1965057086375546e-06, "loss": 0.0005, "num_input_tokens_seen": 68059880, "step": 100970 }, { "epoch": 2.466836049153495, "grad_norm": 17.96902847290039, "learning_rate": 1.1964220930082633e-06, "loss": 0.0501, "num_input_tokens_seen": 68063272, "step": 100975 }, { "epoch": 2.466958199985342, "grad_norm": 0.010453293099999428, "learning_rate": 1.1963384759505378e-06, "loss": 0.0002, "num_input_tokens_seen": 68066280, "step": 100980 }, { "epoch": 2.4670803508171892, "grad_norm": 2.16798734664917, "learning_rate": 1.1962548574649863e-06, "loss": 0.0009, "num_input_tokens_seen": 68069416, "step": 100985 }, { "epoch": 2.4672025016490364, "grad_norm": 0.06394907832145691, "learning_rate": 1.1961712375522166e-06, "loss": 0.0002, "num_input_tokens_seen": 68072744, "step": 100990 }, { "epoch": 2.467324652480883, "grad_norm": 0.20608043670654297, "learning_rate": 1.1960876162128368e-06, "loss": 0.0306, "num_input_tokens_seen": 68076392, "step": 100995 }, { "epoch": 2.467446803312731, "grad_norm": 0.017128368839621544, "learning_rate": 1.1960039934474552e-06, "loss": 0.0401, "num_input_tokens_seen": 68079720, "step": 101000 }, { "epoch": 2.4675689541445776, "grad_norm": 0.016166547313332558, "learning_rate": 1.1959203692566797e-06, "loss": 0.0006, "num_input_tokens_seen": 68083048, "step": 101005 }, { "epoch": 2.4676911049764247, "grad_norm": 1.9010814428329468, "learning_rate": 1.1958367436411189e-06, "loss": 0.0004, "num_input_tokens_seen": 68086376, "step": 101010 }, { "epoch": 2.467813255808272, "grad_norm": 0.11727231740951538, "learning_rate": 1.1957531166013803e-06, "loss": 0.0602, "num_input_tokens_seen": 68089704, "step": 101015 }, { "epoch": 2.467935406640119, "grad_norm": 0.36076483130455017, "learning_rate": 1.1956694881380724e-06, "loss": 0.059, "num_input_tokens_seen": 68092840, "step": 101020 }, { "epoch": 2.4680575574719663, "grad_norm": 0.04420606046915054, "learning_rate": 1.1955858582518036e-06, "loss": 0.0007, "num_input_tokens_seen": 68096680, "step": 101025 }, { "epoch": 2.4681797083038135, "grad_norm": 0.03951043635606766, "learning_rate": 1.1955022269431816e-06, "loss": 0.0001, "num_input_tokens_seen": 68099944, "step": 101030 }, { "epoch": 2.4683018591356607, "grad_norm": 0.09191124141216278, "learning_rate": 1.195418594212815e-06, "loss": 0.0002, "num_input_tokens_seen": 68102952, "step": 101035 }, { "epoch": 2.468424009967508, "grad_norm": 0.06286905705928802, "learning_rate": 1.1953349600613116e-06, "loss": 0.0489, "num_input_tokens_seen": 68106024, "step": 101040 }, { "epoch": 2.468546160799355, "grad_norm": 0.027167772874236107, "learning_rate": 1.1952513244892802e-06, "loss": 0.0, "num_input_tokens_seen": 68109352, "step": 101045 }, { "epoch": 2.4686683116312023, "grad_norm": 0.007415004540234804, "learning_rate": 1.1951676874973284e-06, "loss": 0.0354, "num_input_tokens_seen": 68112872, "step": 101050 }, { "epoch": 2.4687904624630495, "grad_norm": 22.62297821044922, "learning_rate": 1.1950840490860647e-06, "loss": 0.0776, "num_input_tokens_seen": 68115816, "step": 101055 }, { "epoch": 2.4689126132948966, "grad_norm": 0.00367325940169394, "learning_rate": 1.1950004092560973e-06, "loss": 0.1308, "num_input_tokens_seen": 68119080, "step": 101060 }, { "epoch": 2.469034764126744, "grad_norm": 0.018181709572672844, "learning_rate": 1.1949167680080344e-06, "loss": 0.0002, "num_input_tokens_seen": 68122984, "step": 101065 }, { "epoch": 2.469156914958591, "grad_norm": 0.11490341275930405, "learning_rate": 1.1948331253424846e-06, "loss": 0.0423, "num_input_tokens_seen": 68126184, "step": 101070 }, { "epoch": 2.469279065790438, "grad_norm": 0.007061179727315903, "learning_rate": 1.1947494812600558e-06, "loss": 0.0003, "num_input_tokens_seen": 68129576, "step": 101075 }, { "epoch": 2.469401216622285, "grad_norm": 0.03727791830897331, "learning_rate": 1.1946658357613564e-06, "loss": 0.0579, "num_input_tokens_seen": 68132776, "step": 101080 }, { "epoch": 2.4695233674541326, "grad_norm": 37.90617370605469, "learning_rate": 1.1945821888469946e-06, "loss": 0.0374, "num_input_tokens_seen": 68135784, "step": 101085 }, { "epoch": 2.4696455182859793, "grad_norm": 0.022134313359856606, "learning_rate": 1.1944985405175788e-06, "loss": 0.0795, "num_input_tokens_seen": 68138856, "step": 101090 }, { "epoch": 2.4697676691178265, "grad_norm": 0.31639614701271057, "learning_rate": 1.1944148907737171e-06, "loss": 0.0528, "num_input_tokens_seen": 68142056, "step": 101095 }, { "epoch": 2.4698898199496737, "grad_norm": 0.10037325322628021, "learning_rate": 1.1943312396160181e-06, "loss": 0.077, "num_input_tokens_seen": 68145512, "step": 101100 }, { "epoch": 2.470011970781521, "grad_norm": 0.0351799838244915, "learning_rate": 1.1942475870450904e-06, "loss": 0.0002, "num_input_tokens_seen": 68149032, "step": 101105 }, { "epoch": 2.470134121613368, "grad_norm": 0.1054639145731926, "learning_rate": 1.1941639330615419e-06, "loss": 0.0004, "num_input_tokens_seen": 68152552, "step": 101110 }, { "epoch": 2.4702562724452153, "grad_norm": 0.04764983057975769, "learning_rate": 1.1940802776659808e-06, "loss": 0.0003, "num_input_tokens_seen": 68155944, "step": 101115 }, { "epoch": 2.4703784232770625, "grad_norm": 0.046450525522232056, "learning_rate": 1.193996620859016e-06, "loss": 0.0378, "num_input_tokens_seen": 68159528, "step": 101120 }, { "epoch": 2.4705005741089097, "grad_norm": 308.5850830078125, "learning_rate": 1.1939129626412553e-06, "loss": 0.0356, "num_input_tokens_seen": 68162728, "step": 101125 }, { "epoch": 2.470622724940757, "grad_norm": 0.29687780141830444, "learning_rate": 1.1938293030133075e-06, "loss": 0.0308, "num_input_tokens_seen": 68166952, "step": 101130 }, { "epoch": 2.470744875772604, "grad_norm": 72.99173736572266, "learning_rate": 1.193745641975781e-06, "loss": 0.0009, "num_input_tokens_seen": 68170216, "step": 101135 }, { "epoch": 2.4708670266044512, "grad_norm": 0.06350903958082199, "learning_rate": 1.193661979529284e-06, "loss": 0.0002, "num_input_tokens_seen": 68173544, "step": 101140 }, { "epoch": 2.4709891774362984, "grad_norm": 0.06977628171443939, "learning_rate": 1.193578315674425e-06, "loss": 0.0003, "num_input_tokens_seen": 68177320, "step": 101145 }, { "epoch": 2.4711113282681456, "grad_norm": 0.041446931660175323, "learning_rate": 1.1934946504118123e-06, "loss": 0.0582, "num_input_tokens_seen": 68181032, "step": 101150 }, { "epoch": 2.471233479099993, "grad_norm": 0.004509053658694029, "learning_rate": 1.1934109837420544e-06, "loss": 0.0014, "num_input_tokens_seen": 68184232, "step": 101155 }, { "epoch": 2.47135562993184, "grad_norm": 164.20571899414062, "learning_rate": 1.1933273156657602e-06, "loss": 0.091, "num_input_tokens_seen": 68187688, "step": 101160 }, { "epoch": 2.471477780763687, "grad_norm": 53.392967224121094, "learning_rate": 1.1932436461835376e-06, "loss": 0.0287, "num_input_tokens_seen": 68191464, "step": 101165 }, { "epoch": 2.4715999315955344, "grad_norm": 3.405658006668091, "learning_rate": 1.193159975295995e-06, "loss": 0.0449, "num_input_tokens_seen": 68195432, "step": 101170 }, { "epoch": 2.471722082427381, "grad_norm": 72.32855224609375, "learning_rate": 1.1930763030037413e-06, "loss": 0.0272, "num_input_tokens_seen": 68198824, "step": 101175 }, { "epoch": 2.4718442332592283, "grad_norm": 0.2518201172351837, "learning_rate": 1.1929926293073852e-06, "loss": 0.0009, "num_input_tokens_seen": 68202728, "step": 101180 }, { "epoch": 2.4719663840910755, "grad_norm": 201.95413208007812, "learning_rate": 1.192908954207534e-06, "loss": 0.005, "num_input_tokens_seen": 68206312, "step": 101185 }, { "epoch": 2.4720885349229227, "grad_norm": 0.07108218967914581, "learning_rate": 1.1928252777047974e-06, "loss": 0.0002, "num_input_tokens_seen": 68210088, "step": 101190 }, { "epoch": 2.47221068575477, "grad_norm": 0.021677618846297264, "learning_rate": 1.1927415997997834e-06, "loss": 0.0007, "num_input_tokens_seen": 68213480, "step": 101195 }, { "epoch": 2.472332836586617, "grad_norm": 0.011132234707474709, "learning_rate": 1.192657920493101e-06, "loss": 0.0008, "num_input_tokens_seen": 68216744, "step": 101200 }, { "epoch": 2.4724549874184643, "grad_norm": 46.65589904785156, "learning_rate": 1.192574239785358e-06, "loss": 0.0942, "num_input_tokens_seen": 68220008, "step": 101205 }, { "epoch": 2.4725771382503114, "grad_norm": 0.051826756447553635, "learning_rate": 1.1924905576771634e-06, "loss": 0.0419, "num_input_tokens_seen": 68223592, "step": 101210 }, { "epoch": 2.4726992890821586, "grad_norm": 0.3045022487640381, "learning_rate": 1.1924068741691258e-06, "loss": 0.0008, "num_input_tokens_seen": 68227048, "step": 101215 }, { "epoch": 2.472821439914006, "grad_norm": 0.0067571792751550674, "learning_rate": 1.1923231892618532e-06, "loss": 0.0426, "num_input_tokens_seen": 68230440, "step": 101220 }, { "epoch": 2.472943590745853, "grad_norm": 0.033845242112874985, "learning_rate": 1.1922395029559554e-06, "loss": 0.0001, "num_input_tokens_seen": 68233512, "step": 101225 }, { "epoch": 2.4730657415777, "grad_norm": 41.13069152832031, "learning_rate": 1.1921558152520399e-06, "loss": 0.0836, "num_input_tokens_seen": 68237416, "step": 101230 }, { "epoch": 2.4731878924095474, "grad_norm": 0.005356302484869957, "learning_rate": 1.1920721261507156e-06, "loss": 0.0423, "num_input_tokens_seen": 68240168, "step": 101235 }, { "epoch": 2.4733100432413946, "grad_norm": 0.3754318356513977, "learning_rate": 1.191988435652591e-06, "loss": 0.0702, "num_input_tokens_seen": 68243304, "step": 101240 }, { "epoch": 2.4734321940732418, "grad_norm": 0.030581409111618996, "learning_rate": 1.191904743758275e-06, "loss": 0.0002, "num_input_tokens_seen": 68247016, "step": 101245 }, { "epoch": 2.473554344905089, "grad_norm": 13.41250991821289, "learning_rate": 1.1918210504683759e-06, "loss": 0.0774, "num_input_tokens_seen": 68250472, "step": 101250 }, { "epoch": 2.473676495736936, "grad_norm": 0.1623183786869049, "learning_rate": 1.1917373557835026e-06, "loss": 0.0002, "num_input_tokens_seen": 68254120, "step": 101255 }, { "epoch": 2.473798646568783, "grad_norm": 0.40426185727119446, "learning_rate": 1.191653659704264e-06, "loss": 0.089, "num_input_tokens_seen": 68257448, "step": 101260 }, { "epoch": 2.4739207974006305, "grad_norm": 0.0006512034451588988, "learning_rate": 1.191569962231268e-06, "loss": 0.0752, "num_input_tokens_seen": 68260584, "step": 101265 }, { "epoch": 2.4740429482324773, "grad_norm": 0.08026715368032455, "learning_rate": 1.191486263365124e-06, "loss": 0.0001, "num_input_tokens_seen": 68263784, "step": 101270 }, { "epoch": 2.4741650990643245, "grad_norm": 0.0661499872803688, "learning_rate": 1.1914025631064403e-06, "loss": 0.0138, "num_input_tokens_seen": 68266856, "step": 101275 }, { "epoch": 2.4742872498961717, "grad_norm": 0.09698405861854553, "learning_rate": 1.1913188614558255e-06, "loss": 0.0002, "num_input_tokens_seen": 68270056, "step": 101280 }, { "epoch": 2.474409400728019, "grad_norm": 0.27747029066085815, "learning_rate": 1.1912351584138889e-06, "loss": 0.0739, "num_input_tokens_seen": 68273064, "step": 101285 }, { "epoch": 2.474531551559866, "grad_norm": 0.0036097955889999866, "learning_rate": 1.1911514539812386e-06, "loss": 0.0001, "num_input_tokens_seen": 68276584, "step": 101290 }, { "epoch": 2.4746537023917132, "grad_norm": 0.22168563306331635, "learning_rate": 1.1910677481584835e-06, "loss": 0.0782, "num_input_tokens_seen": 68279848, "step": 101295 }, { "epoch": 2.4747758532235604, "grad_norm": 0.002510837512090802, "learning_rate": 1.1909840409462322e-06, "loss": 0.0576, "num_input_tokens_seen": 68283496, "step": 101300 }, { "epoch": 2.4748980040554076, "grad_norm": 9.333548545837402, "learning_rate": 1.1909003323450938e-06, "loss": 0.1167, "num_input_tokens_seen": 68287080, "step": 101305 }, { "epoch": 2.475020154887255, "grad_norm": 0.042019180953502655, "learning_rate": 1.1908166223556766e-06, "loss": 0.0003, "num_input_tokens_seen": 68290408, "step": 101310 }, { "epoch": 2.475142305719102, "grad_norm": 25.9899845123291, "learning_rate": 1.1907329109785895e-06, "loss": 0.0942, "num_input_tokens_seen": 68293544, "step": 101315 }, { "epoch": 2.475264456550949, "grad_norm": 27.839702606201172, "learning_rate": 1.1906491982144417e-06, "loss": 0.1748, "num_input_tokens_seen": 68297384, "step": 101320 }, { "epoch": 2.4753866073827964, "grad_norm": 0.1217912808060646, "learning_rate": 1.1905654840638417e-06, "loss": 0.0603, "num_input_tokens_seen": 68300520, "step": 101325 }, { "epoch": 2.4755087582146436, "grad_norm": 0.010269726626574993, "learning_rate": 1.190481768527398e-06, "loss": 0.0008, "num_input_tokens_seen": 68304040, "step": 101330 }, { "epoch": 2.4756309090464907, "grad_norm": 1.610625982284546, "learning_rate": 1.19039805160572e-06, "loss": 0.0688, "num_input_tokens_seen": 68307304, "step": 101335 }, { "epoch": 2.475753059878338, "grad_norm": 0.009682337753474712, "learning_rate": 1.1903143332994156e-06, "loss": 0.0007, "num_input_tokens_seen": 68310568, "step": 101340 }, { "epoch": 2.475875210710185, "grad_norm": 37.2939453125, "learning_rate": 1.1902306136090947e-06, "loss": 0.06, "num_input_tokens_seen": 68314792, "step": 101345 }, { "epoch": 2.4759973615420323, "grad_norm": 0.0182523000985384, "learning_rate": 1.1901468925353652e-06, "loss": 0.0807, "num_input_tokens_seen": 68317992, "step": 101350 }, { "epoch": 2.476119512373879, "grad_norm": 0.03435177356004715, "learning_rate": 1.1900631700788366e-06, "loss": 0.0612, "num_input_tokens_seen": 68321512, "step": 101355 }, { "epoch": 2.4762416632057263, "grad_norm": 0.1018286719918251, "learning_rate": 1.1899794462401176e-06, "loss": 0.1757, "num_input_tokens_seen": 68324840, "step": 101360 }, { "epoch": 2.4763638140375734, "grad_norm": 0.9780299663543701, "learning_rate": 1.1898957210198168e-06, "loss": 0.0006, "num_input_tokens_seen": 68328104, "step": 101365 }, { "epoch": 2.4764859648694206, "grad_norm": 0.014705418609082699, "learning_rate": 1.1898119944185432e-06, "loss": 0.0344, "num_input_tokens_seen": 68331880, "step": 101370 }, { "epoch": 2.476608115701268, "grad_norm": 8.638080596923828, "learning_rate": 1.1897282664369058e-06, "loss": 0.0518, "num_input_tokens_seen": 68335080, "step": 101375 }, { "epoch": 2.476730266533115, "grad_norm": 0.31917837262153625, "learning_rate": 1.1896445370755135e-06, "loss": 0.0003, "num_input_tokens_seen": 68338216, "step": 101380 }, { "epoch": 2.476852417364962, "grad_norm": 0.2948130667209625, "learning_rate": 1.189560806334975e-06, "loss": 0.0006, "num_input_tokens_seen": 68341544, "step": 101385 }, { "epoch": 2.4769745681968094, "grad_norm": 413.6029052734375, "learning_rate": 1.1894770742158992e-06, "loss": 0.1015, "num_input_tokens_seen": 68344872, "step": 101390 }, { "epoch": 2.4770967190286566, "grad_norm": 16.92646026611328, "learning_rate": 1.1893933407188957e-06, "loss": 0.0425, "num_input_tokens_seen": 68348392, "step": 101395 }, { "epoch": 2.4772188698605038, "grad_norm": 0.10089614987373352, "learning_rate": 1.1893096058445723e-06, "loss": 0.0001, "num_input_tokens_seen": 68351848, "step": 101400 }, { "epoch": 2.477341020692351, "grad_norm": 0.018931131809949875, "learning_rate": 1.1892258695935383e-06, "loss": 0.0002, "num_input_tokens_seen": 68355368, "step": 101405 }, { "epoch": 2.477463171524198, "grad_norm": 42.50852584838867, "learning_rate": 1.1891421319664034e-06, "loss": 0.0958, "num_input_tokens_seen": 68358504, "step": 101410 }, { "epoch": 2.4775853223560453, "grad_norm": 0.08458676189184189, "learning_rate": 1.1890583929637761e-06, "loss": 0.0002, "num_input_tokens_seen": 68361768, "step": 101415 }, { "epoch": 2.4777074731878925, "grad_norm": 0.01854422688484192, "learning_rate": 1.188974652586265e-06, "loss": 0.0479, "num_input_tokens_seen": 68364968, "step": 101420 }, { "epoch": 2.4778296240197397, "grad_norm": 0.05844534561038017, "learning_rate": 1.1888909108344797e-06, "loss": 0.0002, "num_input_tokens_seen": 68368296, "step": 101425 }, { "epoch": 2.477951774851587, "grad_norm": 0.013152677565813065, "learning_rate": 1.1888071677090288e-06, "loss": 0.0514, "num_input_tokens_seen": 68372328, "step": 101430 }, { "epoch": 2.478073925683434, "grad_norm": 0.27791768312454224, "learning_rate": 1.1887234232105215e-06, "loss": 0.0002, "num_input_tokens_seen": 68375656, "step": 101435 }, { "epoch": 2.478196076515281, "grad_norm": 0.007701204624027014, "learning_rate": 1.1886396773395664e-06, "loss": 0.0457, "num_input_tokens_seen": 68378920, "step": 101440 }, { "epoch": 2.4783182273471285, "grad_norm": 10.394662857055664, "learning_rate": 1.1885559300967728e-06, "loss": 0.0011, "num_input_tokens_seen": 68382184, "step": 101445 }, { "epoch": 2.4784403781789752, "grad_norm": 0.023853451013565063, "learning_rate": 1.18847218148275e-06, "loss": 0.0422, "num_input_tokens_seen": 68385448, "step": 101450 }, { "epoch": 2.4785625290108224, "grad_norm": 0.04058440774679184, "learning_rate": 1.188388431498107e-06, "loss": 0.0003, "num_input_tokens_seen": 68388840, "step": 101455 }, { "epoch": 2.4786846798426696, "grad_norm": 0.013206390663981438, "learning_rate": 1.1883046801434524e-06, "loss": 0.0001, "num_input_tokens_seen": 68392552, "step": 101460 }, { "epoch": 2.478806830674517, "grad_norm": 0.981783390045166, "learning_rate": 1.1882209274193954e-06, "loss": 0.0005, "num_input_tokens_seen": 68395880, "step": 101465 }, { "epoch": 2.478928981506364, "grad_norm": 50.186222076416016, "learning_rate": 1.1881371733265451e-06, "loss": 0.0385, "num_input_tokens_seen": 68399272, "step": 101470 }, { "epoch": 2.479051132338211, "grad_norm": 0.006814572494477034, "learning_rate": 1.188053417865511e-06, "loss": 0.031, "num_input_tokens_seen": 68402408, "step": 101475 }, { "epoch": 2.4791732831700584, "grad_norm": 0.09142502397298813, "learning_rate": 1.1879696610369017e-06, "loss": 0.0004, "num_input_tokens_seen": 68406248, "step": 101480 }, { "epoch": 2.4792954340019056, "grad_norm": 0.2830347716808319, "learning_rate": 1.1878859028413267e-06, "loss": 0.0001, "num_input_tokens_seen": 68409384, "step": 101485 }, { "epoch": 2.4794175848337527, "grad_norm": 30.16297149658203, "learning_rate": 1.1878021432793948e-06, "loss": 0.115, "num_input_tokens_seen": 68412712, "step": 101490 }, { "epoch": 2.4795397356656, "grad_norm": 0.01764572784304619, "learning_rate": 1.187718382351715e-06, "loss": 0.0002, "num_input_tokens_seen": 68416040, "step": 101495 }, { "epoch": 2.479661886497447, "grad_norm": 0.006279618013650179, "learning_rate": 1.1876346200588966e-06, "loss": 0.0624, "num_input_tokens_seen": 68419112, "step": 101500 }, { "epoch": 2.4797840373292943, "grad_norm": 0.5235111117362976, "learning_rate": 1.1875508564015488e-06, "loss": 0.0002, "num_input_tokens_seen": 68422248, "step": 101505 }, { "epoch": 2.4799061881611415, "grad_norm": 0.12128138542175293, "learning_rate": 1.187467091380281e-06, "loss": 0.0699, "num_input_tokens_seen": 68425384, "step": 101510 }, { "epoch": 2.4800283389929887, "grad_norm": 0.2382190227508545, "learning_rate": 1.187383324995702e-06, "loss": 0.0005, "num_input_tokens_seen": 68428584, "step": 101515 }, { "epoch": 2.480150489824836, "grad_norm": 0.02181437611579895, "learning_rate": 1.1872995572484208e-06, "loss": 0.0838, "num_input_tokens_seen": 68431912, "step": 101520 }, { "epoch": 2.4802726406566826, "grad_norm": 0.02534443512558937, "learning_rate": 1.187215788139047e-06, "loss": 0.0483, "num_input_tokens_seen": 68434984, "step": 101525 }, { "epoch": 2.4803947914885303, "grad_norm": 0.029036127030849457, "learning_rate": 1.1871320176681896e-06, "loss": 0.0374, "num_input_tokens_seen": 68437928, "step": 101530 }, { "epoch": 2.480516942320377, "grad_norm": 0.02246944047510624, "learning_rate": 1.1870482458364582e-06, "loss": 0.0006, "num_input_tokens_seen": 68441640, "step": 101535 }, { "epoch": 2.480639093152224, "grad_norm": 0.059610191732645035, "learning_rate": 1.186964472644461e-06, "loss": 0.0002, "num_input_tokens_seen": 68444968, "step": 101540 }, { "epoch": 2.4807612439840714, "grad_norm": 0.005097515881061554, "learning_rate": 1.1868806980928084e-06, "loss": 0.0745, "num_input_tokens_seen": 68448808, "step": 101545 }, { "epoch": 2.4808833948159186, "grad_norm": 13.624731063842773, "learning_rate": 1.186796922182109e-06, "loss": 0.0394, "num_input_tokens_seen": 68452392, "step": 101550 }, { "epoch": 2.4810055456477658, "grad_norm": 0.0009953816188499331, "learning_rate": 1.186713144912972e-06, "loss": 0.0417, "num_input_tokens_seen": 68455464, "step": 101555 }, { "epoch": 2.481127696479613, "grad_norm": 0.11448115110397339, "learning_rate": 1.1866293662860066e-06, "loss": 0.0427, "num_input_tokens_seen": 68458408, "step": 101560 }, { "epoch": 2.48124984731146, "grad_norm": 0.14324010908603668, "learning_rate": 1.1865455863018226e-06, "loss": 0.0351, "num_input_tokens_seen": 68461480, "step": 101565 }, { "epoch": 2.4813719981433073, "grad_norm": 19.067188262939453, "learning_rate": 1.1864618049610287e-06, "loss": 0.1186, "num_input_tokens_seen": 68465896, "step": 101570 }, { "epoch": 2.4814941489751545, "grad_norm": 0.02945057861506939, "learning_rate": 1.1863780222642346e-06, "loss": 0.0002, "num_input_tokens_seen": 68469672, "step": 101575 }, { "epoch": 2.4816162998070017, "grad_norm": 1.2527203559875488, "learning_rate": 1.1862942382120492e-06, "loss": 0.0668, "num_input_tokens_seen": 68472936, "step": 101580 }, { "epoch": 2.481738450638849, "grad_norm": 0.9003691077232361, "learning_rate": 1.1862104528050823e-06, "loss": 0.0545, "num_input_tokens_seen": 68476968, "step": 101585 }, { "epoch": 2.481860601470696, "grad_norm": 70.0113296508789, "learning_rate": 1.1861266660439427e-06, "loss": 0.0706, "num_input_tokens_seen": 68479976, "step": 101590 }, { "epoch": 2.4819827523025433, "grad_norm": 26.04130744934082, "learning_rate": 1.1860428779292398e-06, "loss": 0.0895, "num_input_tokens_seen": 68483240, "step": 101595 }, { "epoch": 2.4821049031343905, "grad_norm": 0.817161500453949, "learning_rate": 1.1859590884615832e-06, "loss": 0.0595, "num_input_tokens_seen": 68486696, "step": 101600 }, { "epoch": 2.4822270539662377, "grad_norm": 0.051058027893304825, "learning_rate": 1.185875297641582e-06, "loss": 0.0004, "num_input_tokens_seen": 68490536, "step": 101605 }, { "epoch": 2.482349204798085, "grad_norm": 0.024212127551436424, "learning_rate": 1.1857915054698457e-06, "loss": 0.1793, "num_input_tokens_seen": 68493928, "step": 101610 }, { "epoch": 2.482471355629932, "grad_norm": 0.006492579821497202, "learning_rate": 1.1857077119469835e-06, "loss": 0.0518, "num_input_tokens_seen": 68496936, "step": 101615 }, { "epoch": 2.482593506461779, "grad_norm": 0.0478641577064991, "learning_rate": 1.1856239170736048e-06, "loss": 0.108, "num_input_tokens_seen": 68500008, "step": 101620 }, { "epoch": 2.482715657293626, "grad_norm": 0.037426840513944626, "learning_rate": 1.1855401208503192e-06, "loss": 0.0003, "num_input_tokens_seen": 68503592, "step": 101625 }, { "epoch": 2.482837808125473, "grad_norm": 0.020461494103074074, "learning_rate": 1.1854563232777362e-06, "loss": 0.0004, "num_input_tokens_seen": 68506664, "step": 101630 }, { "epoch": 2.4829599589573204, "grad_norm": 0.11810333281755447, "learning_rate": 1.1853725243564645e-06, "loss": 0.0424, "num_input_tokens_seen": 68509928, "step": 101635 }, { "epoch": 2.4830821097891675, "grad_norm": 0.3965718150138855, "learning_rate": 1.1852887240871142e-06, "loss": 0.0455, "num_input_tokens_seen": 68513768, "step": 101640 }, { "epoch": 2.4832042606210147, "grad_norm": 0.022183619439601898, "learning_rate": 1.1852049224702947e-06, "loss": 0.0006, "num_input_tokens_seen": 68516840, "step": 101645 }, { "epoch": 2.483326411452862, "grad_norm": 0.14527912437915802, "learning_rate": 1.1851211195066148e-06, "loss": 0.0005, "num_input_tokens_seen": 68519976, "step": 101650 }, { "epoch": 2.483448562284709, "grad_norm": 0.33538344502449036, "learning_rate": 1.1850373151966845e-06, "loss": 0.001, "num_input_tokens_seen": 68523304, "step": 101655 }, { "epoch": 2.4835707131165563, "grad_norm": 0.11240077763795853, "learning_rate": 1.1849535095411127e-06, "loss": 0.0003, "num_input_tokens_seen": 68526632, "step": 101660 }, { "epoch": 2.4836928639484035, "grad_norm": 0.00868434738367796, "learning_rate": 1.1848697025405096e-06, "loss": 0.0443, "num_input_tokens_seen": 68529768, "step": 101665 }, { "epoch": 2.4838150147802507, "grad_norm": 0.1269422471523285, "learning_rate": 1.1847858941954843e-06, "loss": 0.0367, "num_input_tokens_seen": 68533032, "step": 101670 }, { "epoch": 2.483937165612098, "grad_norm": 0.02361040934920311, "learning_rate": 1.1847020845066462e-06, "loss": 0.0002, "num_input_tokens_seen": 68536488, "step": 101675 }, { "epoch": 2.484059316443945, "grad_norm": 0.019039446488022804, "learning_rate": 1.1846182734746048e-06, "loss": 0.0306, "num_input_tokens_seen": 68539752, "step": 101680 }, { "epoch": 2.4841814672757923, "grad_norm": 0.06745230406522751, "learning_rate": 1.1845344610999694e-06, "loss": 0.1347, "num_input_tokens_seen": 68543528, "step": 101685 }, { "epoch": 2.4843036181076394, "grad_norm": 0.10040964186191559, "learning_rate": 1.1844506473833504e-06, "loss": 0.0296, "num_input_tokens_seen": 68547112, "step": 101690 }, { "epoch": 2.4844257689394866, "grad_norm": 0.054083868861198425, "learning_rate": 1.1843668323253564e-06, "loss": 0.0396, "num_input_tokens_seen": 68550376, "step": 101695 }, { "epoch": 2.484547919771334, "grad_norm": 0.05380721390247345, "learning_rate": 1.184283015926597e-06, "loss": 0.0443, "num_input_tokens_seen": 68553960, "step": 101700 }, { "epoch": 2.4846700706031806, "grad_norm": 0.7454630732536316, "learning_rate": 1.1841991981876823e-06, "loss": 0.0003, "num_input_tokens_seen": 68557352, "step": 101705 }, { "epoch": 2.484792221435028, "grad_norm": 27.55868911743164, "learning_rate": 1.1841153791092213e-06, "loss": 0.1567, "num_input_tokens_seen": 68560936, "step": 101710 }, { "epoch": 2.484914372266875, "grad_norm": 35.94108581542969, "learning_rate": 1.1840315586918234e-06, "loss": 0.1355, "num_input_tokens_seen": 68563944, "step": 101715 }, { "epoch": 2.485036523098722, "grad_norm": 0.08266687393188477, "learning_rate": 1.183947736936099e-06, "loss": 0.0716, "num_input_tokens_seen": 68566760, "step": 101720 }, { "epoch": 2.4851586739305693, "grad_norm": 0.024817267432808876, "learning_rate": 1.1838639138426572e-06, "loss": 0.0006, "num_input_tokens_seen": 68570280, "step": 101725 }, { "epoch": 2.4852808247624165, "grad_norm": 0.12349018454551697, "learning_rate": 1.1837800894121072e-06, "loss": 0.0537, "num_input_tokens_seen": 68573352, "step": 101730 }, { "epoch": 2.4854029755942637, "grad_norm": 26.148344039916992, "learning_rate": 1.183696263645059e-06, "loss": 0.0771, "num_input_tokens_seen": 68576744, "step": 101735 }, { "epoch": 2.485525126426111, "grad_norm": 0.007630622014403343, "learning_rate": 1.1836124365421225e-06, "loss": 0.0349, "num_input_tokens_seen": 68580456, "step": 101740 }, { "epoch": 2.485647277257958, "grad_norm": 717.2020263671875, "learning_rate": 1.1835286081039068e-06, "loss": 0.0135, "num_input_tokens_seen": 68584296, "step": 101745 }, { "epoch": 2.4857694280898053, "grad_norm": 0.9846317768096924, "learning_rate": 1.1834447783310217e-06, "loss": 0.0005, "num_input_tokens_seen": 68587176, "step": 101750 }, { "epoch": 2.4858915789216525, "grad_norm": 0.05773583799600601, "learning_rate": 1.183360947224077e-06, "loss": 0.0003, "num_input_tokens_seen": 68590376, "step": 101755 }, { "epoch": 2.4860137297534997, "grad_norm": 0.026842717081308365, "learning_rate": 1.183277114783682e-06, "loss": 0.0009, "num_input_tokens_seen": 68593832, "step": 101760 }, { "epoch": 2.486135880585347, "grad_norm": 109.32928466796875, "learning_rate": 1.1831932810104463e-06, "loss": 0.0956, "num_input_tokens_seen": 68597736, "step": 101765 }, { "epoch": 2.486258031417194, "grad_norm": 0.016676966100931168, "learning_rate": 1.1831094459049802e-06, "loss": 0.0408, "num_input_tokens_seen": 68601320, "step": 101770 }, { "epoch": 2.4863801822490412, "grad_norm": 0.11005407571792603, "learning_rate": 1.1830256094678927e-06, "loss": 0.0697, "num_input_tokens_seen": 68604712, "step": 101775 }, { "epoch": 2.4865023330808884, "grad_norm": 0.2985239028930664, "learning_rate": 1.1829417716997936e-06, "loss": 0.0003, "num_input_tokens_seen": 68607720, "step": 101780 }, { "epoch": 2.4866244839127356, "grad_norm": 0.016494203358888626, "learning_rate": 1.1828579326012928e-06, "loss": 0.0001, "num_input_tokens_seen": 68611240, "step": 101785 }, { "epoch": 2.486746634744583, "grad_norm": 0.007882504723966122, "learning_rate": 1.1827740921730003e-06, "loss": 0.0001, "num_input_tokens_seen": 68614824, "step": 101790 }, { "epoch": 2.48686878557643, "grad_norm": 0.3112851083278656, "learning_rate": 1.1826902504155253e-06, "loss": 0.0302, "num_input_tokens_seen": 68618920, "step": 101795 }, { "epoch": 2.4869909364082767, "grad_norm": 1.156801462173462, "learning_rate": 1.1826064073294778e-06, "loss": 0.0936, "num_input_tokens_seen": 68621992, "step": 101800 }, { "epoch": 2.487113087240124, "grad_norm": 0.1341012418270111, "learning_rate": 1.182522562915467e-06, "loss": 0.0003, "num_input_tokens_seen": 68625128, "step": 101805 }, { "epoch": 2.487235238071971, "grad_norm": 0.004828187171369791, "learning_rate": 1.182438717174103e-06, "loss": 0.0002, "num_input_tokens_seen": 68629032, "step": 101810 }, { "epoch": 2.4873573889038183, "grad_norm": 485.93035888671875, "learning_rate": 1.1823548701059957e-06, "loss": 0.0124, "num_input_tokens_seen": 68632616, "step": 101815 }, { "epoch": 2.4874795397356655, "grad_norm": 0.028452308848500252, "learning_rate": 1.182271021711755e-06, "loss": 0.0003, "num_input_tokens_seen": 68635624, "step": 101820 }, { "epoch": 2.4876016905675127, "grad_norm": 0.5252352952957153, "learning_rate": 1.1821871719919902e-06, "loss": 0.0373, "num_input_tokens_seen": 68638888, "step": 101825 }, { "epoch": 2.48772384139936, "grad_norm": 0.010361897759139538, "learning_rate": 1.1821033209473113e-06, "loss": 0.0001, "num_input_tokens_seen": 68642600, "step": 101830 }, { "epoch": 2.487845992231207, "grad_norm": 0.0020655151456594467, "learning_rate": 1.182019468578328e-06, "loss": 0.0001, "num_input_tokens_seen": 68645800, "step": 101835 }, { "epoch": 2.4879681430630542, "grad_norm": 0.05361203849315643, "learning_rate": 1.1819356148856501e-06, "loss": 0.0612, "num_input_tokens_seen": 68649448, "step": 101840 }, { "epoch": 2.4880902938949014, "grad_norm": 0.11180011928081512, "learning_rate": 1.181851759869888e-06, "loss": 0.0002, "num_input_tokens_seen": 68652712, "step": 101845 }, { "epoch": 2.4882124447267486, "grad_norm": 0.006292127072811127, "learning_rate": 1.1817679035316504e-06, "loss": 0.0752, "num_input_tokens_seen": 68655848, "step": 101850 }, { "epoch": 2.488334595558596, "grad_norm": 0.020426765084266663, "learning_rate": 1.181684045871548e-06, "loss": 0.0002, "num_input_tokens_seen": 68659240, "step": 101855 }, { "epoch": 2.488456746390443, "grad_norm": 0.0005153603851795197, "learning_rate": 1.1816001868901902e-06, "loss": 0.0001, "num_input_tokens_seen": 68662952, "step": 101860 }, { "epoch": 2.48857889722229, "grad_norm": 0.3125733435153961, "learning_rate": 1.181516326588187e-06, "loss": 0.0012, "num_input_tokens_seen": 68666280, "step": 101865 }, { "epoch": 2.4887010480541374, "grad_norm": 475.8556823730469, "learning_rate": 1.1814324649661483e-06, "loss": 0.1029, "num_input_tokens_seen": 68669672, "step": 101870 }, { "epoch": 2.4888231988859846, "grad_norm": 0.10539945960044861, "learning_rate": 1.181348602024684e-06, "loss": 0.0001, "num_input_tokens_seen": 68673512, "step": 101875 }, { "epoch": 2.4889453497178318, "grad_norm": 0.06662607192993164, "learning_rate": 1.1812647377644035e-06, "loss": 0.0225, "num_input_tokens_seen": 68677608, "step": 101880 }, { "epoch": 2.4890675005496785, "grad_norm": 0.02495940588414669, "learning_rate": 1.1811808721859175e-06, "loss": 0.0001, "num_input_tokens_seen": 68681512, "step": 101885 }, { "epoch": 2.489189651381526, "grad_norm": 13.704965591430664, "learning_rate": 1.1810970052898355e-06, "loss": 0.1399, "num_input_tokens_seen": 68685160, "step": 101890 }, { "epoch": 2.489311802213373, "grad_norm": 0.12245207279920578, "learning_rate": 1.1810131370767671e-06, "loss": 0.0001, "num_input_tokens_seen": 68688488, "step": 101895 }, { "epoch": 2.48943395304522, "grad_norm": 0.06405623257160187, "learning_rate": 1.1809292675473226e-06, "loss": 0.0006, "num_input_tokens_seen": 68691560, "step": 101900 }, { "epoch": 2.4895561038770673, "grad_norm": 0.020485732704401016, "learning_rate": 1.1808453967021119e-06, "loss": 0.0558, "num_input_tokens_seen": 68694952, "step": 101905 }, { "epoch": 2.4896782547089145, "grad_norm": 955.6651000976562, "learning_rate": 1.1807615245417447e-06, "loss": 0.0548, "num_input_tokens_seen": 68698792, "step": 101910 }, { "epoch": 2.4898004055407617, "grad_norm": 32.87169647216797, "learning_rate": 1.1806776510668312e-06, "loss": 0.0802, "num_input_tokens_seen": 68701864, "step": 101915 }, { "epoch": 2.489922556372609, "grad_norm": 0.8651986122131348, "learning_rate": 1.1805937762779812e-06, "loss": 0.001, "num_input_tokens_seen": 68705000, "step": 101920 }, { "epoch": 2.490044707204456, "grad_norm": 0.009764413349330425, "learning_rate": 1.1805099001758045e-06, "loss": 0.055, "num_input_tokens_seen": 68708392, "step": 101925 }, { "epoch": 2.490166858036303, "grad_norm": 0.00625853706151247, "learning_rate": 1.1804260227609116e-06, "loss": 0.0283, "num_input_tokens_seen": 68711720, "step": 101930 }, { "epoch": 2.4902890088681504, "grad_norm": 0.28316229581832886, "learning_rate": 1.1803421440339113e-06, "loss": 0.0226, "num_input_tokens_seen": 68714920, "step": 101935 }, { "epoch": 2.4904111596999976, "grad_norm": 0.03392892703413963, "learning_rate": 1.1802582639954152e-06, "loss": 0.0003, "num_input_tokens_seen": 68718440, "step": 101940 }, { "epoch": 2.490533310531845, "grad_norm": 0.27008190751075745, "learning_rate": 1.1801743826460324e-06, "loss": 0.0002, "num_input_tokens_seen": 68721960, "step": 101945 }, { "epoch": 2.490655461363692, "grad_norm": 0.01899190992116928, "learning_rate": 1.180090499986373e-06, "loss": 0.1339, "num_input_tokens_seen": 68724904, "step": 101950 }, { "epoch": 2.490777612195539, "grad_norm": 0.038128241896629333, "learning_rate": 1.1800066160170472e-06, "loss": 0.0326, "num_input_tokens_seen": 68728040, "step": 101955 }, { "epoch": 2.4908997630273864, "grad_norm": 0.19664481282234192, "learning_rate": 1.1799227307386648e-06, "loss": 0.0002, "num_input_tokens_seen": 68731368, "step": 101960 }, { "epoch": 2.4910219138592335, "grad_norm": 0.00493689626455307, "learning_rate": 1.1798388441518357e-06, "loss": 0.0001, "num_input_tokens_seen": 68734376, "step": 101965 }, { "epoch": 2.4911440646910807, "grad_norm": 0.012185181491076946, "learning_rate": 1.1797549562571702e-06, "loss": 0.0001, "num_input_tokens_seen": 68737768, "step": 101970 }, { "epoch": 2.491266215522928, "grad_norm": 0.0166007112711668, "learning_rate": 1.1796710670552783e-06, "loss": 0.0001, "num_input_tokens_seen": 68741480, "step": 101975 }, { "epoch": 2.4913883663547747, "grad_norm": 0.005818501580506563, "learning_rate": 1.1795871765467701e-06, "loss": 0.0002, "num_input_tokens_seen": 68744808, "step": 101980 }, { "epoch": 2.491510517186622, "grad_norm": 0.0035260694567114115, "learning_rate": 1.179503284732256e-06, "loss": 0.0002, "num_input_tokens_seen": 68748456, "step": 101985 }, { "epoch": 2.491632668018469, "grad_norm": 0.009216583333909512, "learning_rate": 1.1794193916123453e-06, "loss": 0.1791, "num_input_tokens_seen": 68752104, "step": 101990 }, { "epoch": 2.4917548188503162, "grad_norm": 0.44074198603630066, "learning_rate": 1.1793354971876483e-06, "loss": 0.0003, "num_input_tokens_seen": 68756136, "step": 101995 }, { "epoch": 2.4918769696821634, "grad_norm": 0.061944544315338135, "learning_rate": 1.179251601458776e-06, "loss": 0.0004, "num_input_tokens_seen": 68759528, "step": 102000 }, { "epoch": 2.4919991205140106, "grad_norm": 0.49484783411026, "learning_rate": 1.1791677044263375e-06, "loss": 0.0004, "num_input_tokens_seen": 68762728, "step": 102005 }, { "epoch": 2.492121271345858, "grad_norm": 0.052835285663604736, "learning_rate": 1.1790838060909434e-06, "loss": 0.0001, "num_input_tokens_seen": 68766248, "step": 102010 }, { "epoch": 2.492243422177705, "grad_norm": 1.7878988981246948, "learning_rate": 1.1789999064532034e-06, "loss": 0.0407, "num_input_tokens_seen": 68769192, "step": 102015 }, { "epoch": 2.492365573009552, "grad_norm": 26.050460815429688, "learning_rate": 1.1789160055137282e-06, "loss": 0.1691, "num_input_tokens_seen": 68772328, "step": 102020 }, { "epoch": 2.4924877238413994, "grad_norm": 0.045988693833351135, "learning_rate": 1.1788321032731274e-06, "loss": 0.1607, "num_input_tokens_seen": 68776232, "step": 102025 }, { "epoch": 2.4926098746732466, "grad_norm": 0.045895230025053024, "learning_rate": 1.1787481997320117e-06, "loss": 0.0006, "num_input_tokens_seen": 68779432, "step": 102030 }, { "epoch": 2.4927320255050938, "grad_norm": 0.10627572983503342, "learning_rate": 1.178664294890991e-06, "loss": 0.0666, "num_input_tokens_seen": 68782632, "step": 102035 }, { "epoch": 2.492854176336941, "grad_norm": 0.14017941057682037, "learning_rate": 1.1785803887506756e-06, "loss": 0.0004, "num_input_tokens_seen": 68785896, "step": 102040 }, { "epoch": 2.492976327168788, "grad_norm": 0.26814335584640503, "learning_rate": 1.1784964813116754e-06, "loss": 0.001, "num_input_tokens_seen": 68788968, "step": 102045 }, { "epoch": 2.4930984780006353, "grad_norm": 0.005131382029503584, "learning_rate": 1.1784125725746008e-06, "loss": 0.0002, "num_input_tokens_seen": 68792552, "step": 102050 }, { "epoch": 2.4932206288324825, "grad_norm": 17.653770446777344, "learning_rate": 1.1783286625400619e-06, "loss": 0.0465, "num_input_tokens_seen": 68795816, "step": 102055 }, { "epoch": 2.4933427796643297, "grad_norm": 0.32243382930755615, "learning_rate": 1.1782447512086693e-06, "loss": 0.0494, "num_input_tokens_seen": 68799400, "step": 102060 }, { "epoch": 2.4934649304961765, "grad_norm": 0.04632522165775299, "learning_rate": 1.1781608385810327e-06, "loss": 0.0899, "num_input_tokens_seen": 68802920, "step": 102065 }, { "epoch": 2.493587081328024, "grad_norm": 32.26042938232422, "learning_rate": 1.1780769246577625e-06, "loss": 0.0366, "num_input_tokens_seen": 68805864, "step": 102070 }, { "epoch": 2.493709232159871, "grad_norm": 0.5655348300933838, "learning_rate": 1.1779930094394692e-06, "loss": 0.0005, "num_input_tokens_seen": 68808808, "step": 102075 }, { "epoch": 2.493831382991718, "grad_norm": 134.5360565185547, "learning_rate": 1.1779090929267628e-06, "loss": 0.0685, "num_input_tokens_seen": 68812200, "step": 102080 }, { "epoch": 2.493953533823565, "grad_norm": 0.015330174006521702, "learning_rate": 1.1778251751202534e-06, "loss": 0.0001, "num_input_tokens_seen": 68815976, "step": 102085 }, { "epoch": 2.4940756846554124, "grad_norm": 0.011960971169173717, "learning_rate": 1.1777412560205515e-06, "loss": 0.0001, "num_input_tokens_seen": 68819752, "step": 102090 }, { "epoch": 2.4941978354872596, "grad_norm": 0.007198326289653778, "learning_rate": 1.1776573356282677e-06, "loss": 0.0002, "num_input_tokens_seen": 68823272, "step": 102095 }, { "epoch": 2.494319986319107, "grad_norm": 0.028509464114904404, "learning_rate": 1.1775734139440116e-06, "loss": 0.0002, "num_input_tokens_seen": 68826536, "step": 102100 }, { "epoch": 2.494442137150954, "grad_norm": 0.04958329722285271, "learning_rate": 1.1774894909683943e-06, "loss": 0.0002, "num_input_tokens_seen": 68829992, "step": 102105 }, { "epoch": 2.494564287982801, "grad_norm": 0.005899414885789156, "learning_rate": 1.1774055667020258e-06, "loss": 0.0443, "num_input_tokens_seen": 68833384, "step": 102110 }, { "epoch": 2.4946864388146484, "grad_norm": 1.5522427558898926, "learning_rate": 1.1773216411455158e-06, "loss": 0.0002, "num_input_tokens_seen": 68836456, "step": 102115 }, { "epoch": 2.4948085896464955, "grad_norm": 0.028691526502370834, "learning_rate": 1.1772377142994752e-06, "loss": 0.07, "num_input_tokens_seen": 68839784, "step": 102120 }, { "epoch": 2.4949307404783427, "grad_norm": 48.998382568359375, "learning_rate": 1.1771537861645143e-06, "loss": 0.0823, "num_input_tokens_seen": 68842920, "step": 102125 }, { "epoch": 2.49505289131019, "grad_norm": 0.053425900638103485, "learning_rate": 1.1770698567412437e-06, "loss": 0.0001, "num_input_tokens_seen": 68846248, "step": 102130 }, { "epoch": 2.495175042142037, "grad_norm": 76.71073913574219, "learning_rate": 1.1769859260302733e-06, "loss": 0.0799, "num_input_tokens_seen": 68849448, "step": 102135 }, { "epoch": 2.4952971929738843, "grad_norm": 0.07131334394216537, "learning_rate": 1.1769019940322137e-06, "loss": 0.0002, "num_input_tokens_seen": 68853224, "step": 102140 }, { "epoch": 2.4954193438057315, "grad_norm": 3.223170757293701, "learning_rate": 1.176818060747675e-06, "loss": 0.0007, "num_input_tokens_seen": 68857256, "step": 102145 }, { "epoch": 2.4955414946375782, "grad_norm": 0.1664036363363266, "learning_rate": 1.176734126177268e-06, "loss": 0.0003, "num_input_tokens_seen": 68860584, "step": 102150 }, { "epoch": 2.495663645469426, "grad_norm": 0.010533252730965614, "learning_rate": 1.1766501903216028e-06, "loss": 0.0017, "num_input_tokens_seen": 68864296, "step": 102155 }, { "epoch": 2.4957857963012726, "grad_norm": 32.75882339477539, "learning_rate": 1.17656625318129e-06, "loss": 0.1199, "num_input_tokens_seen": 68867688, "step": 102160 }, { "epoch": 2.49590794713312, "grad_norm": 0.8329938054084778, "learning_rate": 1.1764823147569399e-06, "loss": 0.0003, "num_input_tokens_seen": 68870952, "step": 102165 }, { "epoch": 2.496030097964967, "grad_norm": 0.10802672058343887, "learning_rate": 1.1763983750491629e-06, "loss": 0.0366, "num_input_tokens_seen": 68874088, "step": 102170 }, { "epoch": 2.496152248796814, "grad_norm": 0.7539714574813843, "learning_rate": 1.1763144340585695e-06, "loss": 0.0152, "num_input_tokens_seen": 68877288, "step": 102175 }, { "epoch": 2.4962743996286614, "grad_norm": 28.428529739379883, "learning_rate": 1.17623049178577e-06, "loss": 0.1067, "num_input_tokens_seen": 68880552, "step": 102180 }, { "epoch": 2.4963965504605086, "grad_norm": 0.030778715386986732, "learning_rate": 1.176146548231375e-06, "loss": 0.0001, "num_input_tokens_seen": 68884264, "step": 102185 }, { "epoch": 2.4965187012923558, "grad_norm": 0.00747660081833601, "learning_rate": 1.176062603395995e-06, "loss": 0.0002, "num_input_tokens_seen": 68887976, "step": 102190 }, { "epoch": 2.496640852124203, "grad_norm": 0.8309156894683838, "learning_rate": 1.1759786572802405e-06, "loss": 0.1329, "num_input_tokens_seen": 68891176, "step": 102195 }, { "epoch": 2.49676300295605, "grad_norm": 0.04314163699746132, "learning_rate": 1.1758947098847217e-06, "loss": 0.1105, "num_input_tokens_seen": 68894568, "step": 102200 }, { "epoch": 2.4968851537878973, "grad_norm": 0.005910862237215042, "learning_rate": 1.1758107612100491e-06, "loss": 0.0012, "num_input_tokens_seen": 68897832, "step": 102205 }, { "epoch": 2.4970073046197445, "grad_norm": 10.59837532043457, "learning_rate": 1.1757268112568337e-06, "loss": 0.0991, "num_input_tokens_seen": 68901352, "step": 102210 }, { "epoch": 2.4971294554515917, "grad_norm": 0.04170943796634674, "learning_rate": 1.1756428600256855e-06, "loss": 0.0007, "num_input_tokens_seen": 68904488, "step": 102215 }, { "epoch": 2.497251606283439, "grad_norm": 0.0058653769083321095, "learning_rate": 1.1755589075172152e-06, "loss": 0.1971, "num_input_tokens_seen": 68907560, "step": 102220 }, { "epoch": 2.497373757115286, "grad_norm": 0.035103026777505875, "learning_rate": 1.1754749537320333e-06, "loss": 0.0004, "num_input_tokens_seen": 68910760, "step": 102225 }, { "epoch": 2.4974959079471333, "grad_norm": 0.015648502856492996, "learning_rate": 1.1753909986707505e-06, "loss": 0.0293, "num_input_tokens_seen": 68914472, "step": 102230 }, { "epoch": 2.4976180587789805, "grad_norm": 0.0711943730711937, "learning_rate": 1.1753070423339768e-06, "loss": 0.1082, "num_input_tokens_seen": 68917608, "step": 102235 }, { "epoch": 2.4977402096108277, "grad_norm": 42.90780258178711, "learning_rate": 1.1752230847223235e-06, "loss": 0.0013, "num_input_tokens_seen": 68920808, "step": 102240 }, { "epoch": 2.4978623604426744, "grad_norm": 0.019687887281179428, "learning_rate": 1.1751391258364005e-06, "loss": 0.0066, "num_input_tokens_seen": 68924264, "step": 102245 }, { "epoch": 2.4979845112745216, "grad_norm": 0.008903698064386845, "learning_rate": 1.1750551656768188e-06, "loss": 0.0508, "num_input_tokens_seen": 68927528, "step": 102250 }, { "epoch": 2.498106662106369, "grad_norm": 0.19585798680782318, "learning_rate": 1.174971204244189e-06, "loss": 0.0005, "num_input_tokens_seen": 68930344, "step": 102255 }, { "epoch": 2.498228812938216, "grad_norm": 0.023192299529910088, "learning_rate": 1.1748872415391214e-06, "loss": 0.0003, "num_input_tokens_seen": 68933544, "step": 102260 }, { "epoch": 2.498350963770063, "grad_norm": 0.05166854336857796, "learning_rate": 1.1748032775622269e-06, "loss": 0.0003, "num_input_tokens_seen": 68937384, "step": 102265 }, { "epoch": 2.4984731146019103, "grad_norm": 0.006690154317766428, "learning_rate": 1.174719312314116e-06, "loss": 0.0345, "num_input_tokens_seen": 68940648, "step": 102270 }, { "epoch": 2.4985952654337575, "grad_norm": 0.036399852484464645, "learning_rate": 1.1746353457953988e-06, "loss": 0.0002, "num_input_tokens_seen": 68944360, "step": 102275 }, { "epoch": 2.4987174162656047, "grad_norm": 137.89358520507812, "learning_rate": 1.1745513780066867e-06, "loss": 0.035, "num_input_tokens_seen": 68948072, "step": 102280 }, { "epoch": 2.498839567097452, "grad_norm": 10.838881492614746, "learning_rate": 1.17446740894859e-06, "loss": 0.1122, "num_input_tokens_seen": 68951336, "step": 102285 }, { "epoch": 2.498961717929299, "grad_norm": 35.52821350097656, "learning_rate": 1.1743834386217192e-06, "loss": 0.098, "num_input_tokens_seen": 68954600, "step": 102290 }, { "epoch": 2.4990838687611463, "grad_norm": 660.7169189453125, "learning_rate": 1.1742994670266856e-06, "loss": 0.0631, "num_input_tokens_seen": 68957800, "step": 102295 }, { "epoch": 2.4992060195929935, "grad_norm": 1.6077616214752197, "learning_rate": 1.1742154941640989e-06, "loss": 0.0015, "num_input_tokens_seen": 68961384, "step": 102300 }, { "epoch": 2.4993281704248407, "grad_norm": 0.5358433723449707, "learning_rate": 1.1741315200345703e-06, "loss": 0.0263, "num_input_tokens_seen": 68964648, "step": 102305 }, { "epoch": 2.499450321256688, "grad_norm": 0.03443678468465805, "learning_rate": 1.174047544638711e-06, "loss": 0.0331, "num_input_tokens_seen": 68968168, "step": 102310 }, { "epoch": 2.499572472088535, "grad_norm": 0.06915648281574249, "learning_rate": 1.1739635679771306e-06, "loss": 0.0007, "num_input_tokens_seen": 68971368, "step": 102315 }, { "epoch": 2.4996946229203822, "grad_norm": 0.26230019330978394, "learning_rate": 1.1738795900504406e-06, "loss": 0.0006, "num_input_tokens_seen": 68974760, "step": 102320 }, { "epoch": 2.4998167737522294, "grad_norm": 229.71182250976562, "learning_rate": 1.1737956108592512e-06, "loss": 0.0299, "num_input_tokens_seen": 68978216, "step": 102325 }, { "epoch": 2.499938924584076, "grad_norm": 0.014974378049373627, "learning_rate": 1.1737116304041736e-06, "loss": 0.0428, "num_input_tokens_seen": 68981480, "step": 102330 }, { "epoch": 2.500061075415924, "grad_norm": 0.036249011754989624, "learning_rate": 1.173627648685818e-06, "loss": 0.0745, "num_input_tokens_seen": 68984744, "step": 102335 }, { "epoch": 2.5001832262477706, "grad_norm": 0.03688354417681694, "learning_rate": 1.173543665704796e-06, "loss": 0.0236, "num_input_tokens_seen": 68988456, "step": 102340 }, { "epoch": 2.5001832262477706, "eval_loss": 0.18347086012363434, "eval_runtime": 47.7866, "eval_samples_per_second": 761.406, "eval_steps_per_second": 95.194, "num_input_tokens_seen": 68988456, "step": 102340 }, { "epoch": 2.5003053770796178, "grad_norm": 0.02084536850452423, "learning_rate": 1.1734596814617173e-06, "loss": 0.0005, "num_input_tokens_seen": 68991656, "step": 102345 }, { "epoch": 2.500427527911465, "grad_norm": 11.465497016906738, "learning_rate": 1.1733756959571933e-06, "loss": 0.063, "num_input_tokens_seen": 68994728, "step": 102350 }, { "epoch": 2.500549678743312, "grad_norm": 0.03728627413511276, "learning_rate": 1.1732917091918347e-06, "loss": 0.0003, "num_input_tokens_seen": 68998056, "step": 102355 }, { "epoch": 2.5006718295751593, "grad_norm": 0.0972631648182869, "learning_rate": 1.173207721166252e-06, "loss": 0.0003, "num_input_tokens_seen": 69001192, "step": 102360 }, { "epoch": 2.5007939804070065, "grad_norm": 0.03852716460824013, "learning_rate": 1.1731237318810562e-06, "loss": 0.0551, "num_input_tokens_seen": 69004584, "step": 102365 }, { "epoch": 2.5009161312388537, "grad_norm": 0.027538815513253212, "learning_rate": 1.1730397413368583e-06, "loss": 0.0401, "num_input_tokens_seen": 69007464, "step": 102370 }, { "epoch": 2.501038282070701, "grad_norm": 0.10168246179819107, "learning_rate": 1.1729557495342685e-06, "loss": 0.0504, "num_input_tokens_seen": 69010536, "step": 102375 }, { "epoch": 2.501160432902548, "grad_norm": 0.024673402309417725, "learning_rate": 1.1728717564738983e-06, "loss": 0.0621, "num_input_tokens_seen": 69014184, "step": 102380 }, { "epoch": 2.5012825837343953, "grad_norm": 0.021501636132597923, "learning_rate": 1.172787762156358e-06, "loss": 0.0002, "num_input_tokens_seen": 69018152, "step": 102385 }, { "epoch": 2.5014047345662425, "grad_norm": 0.022808056324720383, "learning_rate": 1.1727037665822588e-06, "loss": 0.054, "num_input_tokens_seen": 69021608, "step": 102390 }, { "epoch": 2.5015268853980897, "grad_norm": 0.4708763659000397, "learning_rate": 1.172619769752211e-06, "loss": 0.0003, "num_input_tokens_seen": 69024680, "step": 102395 }, { "epoch": 2.501649036229937, "grad_norm": 34.90665054321289, "learning_rate": 1.172535771666826e-06, "loss": 0.1057, "num_input_tokens_seen": 69027752, "step": 102400 }, { "epoch": 2.501771187061784, "grad_norm": 0.05201762914657593, "learning_rate": 1.1724517723267143e-06, "loss": 0.0002, "num_input_tokens_seen": 69031080, "step": 102405 }, { "epoch": 2.501893337893631, "grad_norm": 0.03720242902636528, "learning_rate": 1.172367771732487e-06, "loss": 0.0002, "num_input_tokens_seen": 69034408, "step": 102410 }, { "epoch": 2.502015488725478, "grad_norm": 0.011985605582594872, "learning_rate": 1.1722837698847552e-06, "loss": 0.0434, "num_input_tokens_seen": 69038568, "step": 102415 }, { "epoch": 2.5021376395573256, "grad_norm": 11.999226570129395, "learning_rate": 1.1721997667841295e-06, "loss": 0.124, "num_input_tokens_seen": 69041960, "step": 102420 }, { "epoch": 2.5022597903891723, "grad_norm": 0.3938823938369751, "learning_rate": 1.1721157624312206e-06, "loss": 0.0012, "num_input_tokens_seen": 69045544, "step": 102425 }, { "epoch": 2.50238194122102, "grad_norm": 0.01563347317278385, "learning_rate": 1.1720317568266393e-06, "loss": 0.0541, "num_input_tokens_seen": 69049064, "step": 102430 }, { "epoch": 2.5025040920528667, "grad_norm": 43.03242111206055, "learning_rate": 1.1719477499709971e-06, "loss": 0.0013, "num_input_tokens_seen": 69052456, "step": 102435 }, { "epoch": 2.502626242884714, "grad_norm": 0.052847519516944885, "learning_rate": 1.1718637418649047e-06, "loss": 0.0623, "num_input_tokens_seen": 69055720, "step": 102440 }, { "epoch": 2.502748393716561, "grad_norm": 0.025785338133573532, "learning_rate": 1.1717797325089727e-06, "loss": 0.0002, "num_input_tokens_seen": 69058856, "step": 102445 }, { "epoch": 2.5028705445484083, "grad_norm": 0.35239148139953613, "learning_rate": 1.1716957219038123e-06, "loss": 0.0003, "num_input_tokens_seen": 69062440, "step": 102450 }, { "epoch": 2.5029926953802555, "grad_norm": 0.024918891489505768, "learning_rate": 1.1716117100500347e-06, "loss": 0.0627, "num_input_tokens_seen": 69065704, "step": 102455 }, { "epoch": 2.5031148462121027, "grad_norm": 1.4856153726577759, "learning_rate": 1.1715276969482502e-06, "loss": 0.0004, "num_input_tokens_seen": 69069096, "step": 102460 }, { "epoch": 2.50323699704395, "grad_norm": 0.26246124505996704, "learning_rate": 1.1714436825990706e-06, "loss": 0.0859, "num_input_tokens_seen": 69072680, "step": 102465 }, { "epoch": 2.503359147875797, "grad_norm": 0.3725166618824005, "learning_rate": 1.1713596670031061e-06, "loss": 0.0266, "num_input_tokens_seen": 69076008, "step": 102470 }, { "epoch": 2.5034812987076442, "grad_norm": 0.059920698404312134, "learning_rate": 1.1712756501609681e-06, "loss": 0.0594, "num_input_tokens_seen": 69079656, "step": 102475 }, { "epoch": 2.5036034495394914, "grad_norm": 19.280452728271484, "learning_rate": 1.1711916320732675e-06, "loss": 0.0634, "num_input_tokens_seen": 69082664, "step": 102480 }, { "epoch": 2.5037256003713386, "grad_norm": 0.05751289427280426, "learning_rate": 1.1711076127406155e-06, "loss": 0.047, "num_input_tokens_seen": 69085864, "step": 102485 }, { "epoch": 2.503847751203186, "grad_norm": 0.044818613678216934, "learning_rate": 1.1710235921636228e-06, "loss": 0.0005, "num_input_tokens_seen": 69089512, "step": 102490 }, { "epoch": 2.503969902035033, "grad_norm": 0.032787542790174484, "learning_rate": 1.1709395703429002e-06, "loss": 0.0004, "num_input_tokens_seen": 69093032, "step": 102495 }, { "epoch": 2.5040920528668797, "grad_norm": 0.039422884583473206, "learning_rate": 1.1708555472790593e-06, "loss": 0.0003, "num_input_tokens_seen": 69096232, "step": 102500 }, { "epoch": 2.5042142036987274, "grad_norm": 0.07437865436077118, "learning_rate": 1.170771522972711e-06, "loss": 0.0004, "num_input_tokens_seen": 69099432, "step": 102505 }, { "epoch": 2.504336354530574, "grad_norm": 0.008755282498896122, "learning_rate": 1.1706874974244661e-06, "loss": 0.0921, "num_input_tokens_seen": 69102504, "step": 102510 }, { "epoch": 2.5044585053624218, "grad_norm": 0.11593923717737198, "learning_rate": 1.1706034706349358e-06, "loss": 0.0003, "num_input_tokens_seen": 69106152, "step": 102515 }, { "epoch": 2.5045806561942685, "grad_norm": 0.037901975214481354, "learning_rate": 1.1705194426047314e-06, "loss": 0.0002, "num_input_tokens_seen": 69109800, "step": 102520 }, { "epoch": 2.5047028070261157, "grad_norm": 0.011591866612434387, "learning_rate": 1.1704354133344635e-06, "loss": 0.0436, "num_input_tokens_seen": 69113064, "step": 102525 }, { "epoch": 2.504824957857963, "grad_norm": 0.015728740021586418, "learning_rate": 1.1703513828247436e-06, "loss": 0.0002, "num_input_tokens_seen": 69116328, "step": 102530 }, { "epoch": 2.50494710868981, "grad_norm": 0.12637105584144592, "learning_rate": 1.1702673510761827e-06, "loss": 0.0006, "num_input_tokens_seen": 69119592, "step": 102535 }, { "epoch": 2.5050692595216573, "grad_norm": 0.15325936675071716, "learning_rate": 1.1701833180893917e-06, "loss": 0.0006, "num_input_tokens_seen": 69123048, "step": 102540 }, { "epoch": 2.5051914103535045, "grad_norm": 1.882394552230835, "learning_rate": 1.1700992838649819e-06, "loss": 0.0006, "num_input_tokens_seen": 69126888, "step": 102545 }, { "epoch": 2.5053135611853516, "grad_norm": 0.033663101494312286, "learning_rate": 1.170015248403564e-06, "loss": 0.1027, "num_input_tokens_seen": 69130088, "step": 102550 }, { "epoch": 2.505435712017199, "grad_norm": 0.43850404024124146, "learning_rate": 1.1699312117057498e-06, "loss": 0.0005, "num_input_tokens_seen": 69133096, "step": 102555 }, { "epoch": 2.505557862849046, "grad_norm": 12.489361763000488, "learning_rate": 1.16984717377215e-06, "loss": 0.0943, "num_input_tokens_seen": 69136616, "step": 102560 }, { "epoch": 2.505680013680893, "grad_norm": 0.17170336842536926, "learning_rate": 1.169763134603376e-06, "loss": 0.0563, "num_input_tokens_seen": 69139880, "step": 102565 }, { "epoch": 2.5058021645127404, "grad_norm": 0.04106225445866585, "learning_rate": 1.1696790942000389e-06, "loss": 0.0428, "num_input_tokens_seen": 69143848, "step": 102570 }, { "epoch": 2.5059243153445876, "grad_norm": 0.2668132185935974, "learning_rate": 1.1695950525627499e-06, "loss": 0.1425, "num_input_tokens_seen": 69146984, "step": 102575 }, { "epoch": 2.506046466176435, "grad_norm": 33.77798843383789, "learning_rate": 1.16951100969212e-06, "loss": 0.0607, "num_input_tokens_seen": 69150440, "step": 102580 }, { "epoch": 2.506168617008282, "grad_norm": 0.1751384735107422, "learning_rate": 1.1694269655887602e-06, "loss": 0.0006, "num_input_tokens_seen": 69153896, "step": 102585 }, { "epoch": 2.506290767840129, "grad_norm": 0.3074788451194763, "learning_rate": 1.169342920253282e-06, "loss": 0.0629, "num_input_tokens_seen": 69157032, "step": 102590 }, { "epoch": 2.506412918671976, "grad_norm": 0.03330891206860542, "learning_rate": 1.1692588736862966e-06, "loss": 0.042, "num_input_tokens_seen": 69160168, "step": 102595 }, { "epoch": 2.5065350695038235, "grad_norm": 0.21258597075939178, "learning_rate": 1.169174825888415e-06, "loss": 0.0005, "num_input_tokens_seen": 69163560, "step": 102600 }, { "epoch": 2.5066572203356703, "grad_norm": 0.07895040512084961, "learning_rate": 1.1690907768602487e-06, "loss": 0.032, "num_input_tokens_seen": 69167080, "step": 102605 }, { "epoch": 2.506779371167518, "grad_norm": 0.14544044435024261, "learning_rate": 1.1690067266024086e-06, "loss": 0.0282, "num_input_tokens_seen": 69170792, "step": 102610 }, { "epoch": 2.5069015219993647, "grad_norm": 0.08901277184486389, "learning_rate": 1.1689226751155062e-06, "loss": 0.097, "num_input_tokens_seen": 69174056, "step": 102615 }, { "epoch": 2.507023672831212, "grad_norm": 0.16903647780418396, "learning_rate": 1.168838622400153e-06, "loss": 0.0013, "num_input_tokens_seen": 69177064, "step": 102620 }, { "epoch": 2.507145823663059, "grad_norm": 0.6867279410362244, "learning_rate": 1.1687545684569598e-06, "loss": 0.0284, "num_input_tokens_seen": 69180264, "step": 102625 }, { "epoch": 2.5072679744949062, "grad_norm": 0.04904516041278839, "learning_rate": 1.1686705132865377e-06, "loss": 0.0003, "num_input_tokens_seen": 69183784, "step": 102630 }, { "epoch": 2.5073901253267534, "grad_norm": 28.1914005279541, "learning_rate": 1.1685864568894984e-06, "loss": 0.0509, "num_input_tokens_seen": 69187496, "step": 102635 }, { "epoch": 2.5075122761586006, "grad_norm": 0.06639095395803452, "learning_rate": 1.1685023992664533e-06, "loss": 0.0002, "num_input_tokens_seen": 69190888, "step": 102640 }, { "epoch": 2.507634426990448, "grad_norm": 0.3889292776584625, "learning_rate": 1.1684183404180132e-06, "loss": 0.0588, "num_input_tokens_seen": 69194856, "step": 102645 }, { "epoch": 2.507756577822295, "grad_norm": 1.6642292737960815, "learning_rate": 1.1683342803447894e-06, "loss": 0.0007, "num_input_tokens_seen": 69198120, "step": 102650 }, { "epoch": 2.507878728654142, "grad_norm": 0.02274067886173725, "learning_rate": 1.1682502190473937e-06, "loss": 0.0005, "num_input_tokens_seen": 69201384, "step": 102655 }, { "epoch": 2.5080008794859894, "grad_norm": 0.057451456785202026, "learning_rate": 1.168166156526437e-06, "loss": 0.0002, "num_input_tokens_seen": 69204648, "step": 102660 }, { "epoch": 2.5081230303178366, "grad_norm": 0.7137342691421509, "learning_rate": 1.168082092782531e-06, "loss": 0.0459, "num_input_tokens_seen": 69207720, "step": 102665 }, { "epoch": 2.5082451811496838, "grad_norm": 0.05443684384226799, "learning_rate": 1.167998027816287e-06, "loss": 0.0002, "num_input_tokens_seen": 69211048, "step": 102670 }, { "epoch": 2.508367331981531, "grad_norm": 0.12076818197965622, "learning_rate": 1.1679139616283155e-06, "loss": 0.0464, "num_input_tokens_seen": 69214120, "step": 102675 }, { "epoch": 2.5084894828133777, "grad_norm": 0.4422742426395416, "learning_rate": 1.1678298942192292e-06, "loss": 0.0003, "num_input_tokens_seen": 69217896, "step": 102680 }, { "epoch": 2.5086116336452253, "grad_norm": 0.18747763335704803, "learning_rate": 1.1677458255896384e-06, "loss": 0.0001, "num_input_tokens_seen": 69220776, "step": 102685 }, { "epoch": 2.508733784477072, "grad_norm": 0.008999832905828953, "learning_rate": 1.1676617557401547e-06, "loss": 0.0002, "num_input_tokens_seen": 69224104, "step": 102690 }, { "epoch": 2.5088559353089197, "grad_norm": 0.024260293692350388, "learning_rate": 1.1675776846713899e-06, "loss": 0.0048, "num_input_tokens_seen": 69227112, "step": 102695 }, { "epoch": 2.5089780861407665, "grad_norm": 42.969993591308594, "learning_rate": 1.167493612383955e-06, "loss": 0.0385, "num_input_tokens_seen": 69230696, "step": 102700 }, { "epoch": 2.5091002369726136, "grad_norm": 0.03098537214100361, "learning_rate": 1.1674095388784616e-06, "loss": 0.0003, "num_input_tokens_seen": 69234152, "step": 102705 }, { "epoch": 2.509222387804461, "grad_norm": 0.0010812964756041765, "learning_rate": 1.1673254641555206e-06, "loss": 0.0002, "num_input_tokens_seen": 69237544, "step": 102710 }, { "epoch": 2.509344538636308, "grad_norm": 0.02446441911160946, "learning_rate": 1.1672413882157442e-06, "loss": 0.0215, "num_input_tokens_seen": 69240616, "step": 102715 }, { "epoch": 2.509466689468155, "grad_norm": 0.3052109479904175, "learning_rate": 1.1671573110597434e-06, "loss": 0.0633, "num_input_tokens_seen": 69243624, "step": 102720 }, { "epoch": 2.5095888403000024, "grad_norm": 102.80419158935547, "learning_rate": 1.1670732326881297e-06, "loss": 0.1228, "num_input_tokens_seen": 69246888, "step": 102725 }, { "epoch": 2.5097109911318496, "grad_norm": 0.17125174403190613, "learning_rate": 1.1669891531015145e-06, "loss": 0.0002, "num_input_tokens_seen": 69250664, "step": 102730 }, { "epoch": 2.509833141963697, "grad_norm": 0.008751153945922852, "learning_rate": 1.1669050723005095e-06, "loss": 0.0002, "num_input_tokens_seen": 69254440, "step": 102735 }, { "epoch": 2.509955292795544, "grad_norm": 0.023152705281972885, "learning_rate": 1.1668209902857253e-06, "loss": 0.0003, "num_input_tokens_seen": 69257640, "step": 102740 }, { "epoch": 2.510077443627391, "grad_norm": 0.016359543427824974, "learning_rate": 1.1667369070577744e-06, "loss": 0.0002, "num_input_tokens_seen": 69261288, "step": 102745 }, { "epoch": 2.5101995944592383, "grad_norm": 0.13440461456775665, "learning_rate": 1.1666528226172678e-06, "loss": 0.0002, "num_input_tokens_seen": 69264360, "step": 102750 }, { "epoch": 2.5103217452910855, "grad_norm": 0.004721632227301598, "learning_rate": 1.1665687369648172e-06, "loss": 0.0681, "num_input_tokens_seen": 69267624, "step": 102755 }, { "epoch": 2.5104438961229327, "grad_norm": 0.0034103114157915115, "learning_rate": 1.1664846501010336e-06, "loss": 0.0329, "num_input_tokens_seen": 69270888, "step": 102760 }, { "epoch": 2.51056604695478, "grad_norm": 0.7425034642219543, "learning_rate": 1.1664005620265292e-06, "loss": 0.092, "num_input_tokens_seen": 69274536, "step": 102765 }, { "epoch": 2.510688197786627, "grad_norm": 0.026685822755098343, "learning_rate": 1.166316472741915e-06, "loss": 0.0003, "num_input_tokens_seen": 69278120, "step": 102770 }, { "epoch": 2.510810348618474, "grad_norm": 20.21971893310547, "learning_rate": 1.1662323822478026e-06, "loss": 0.0467, "num_input_tokens_seen": 69282024, "step": 102775 }, { "epoch": 2.5109324994503215, "grad_norm": 0.1762675642967224, "learning_rate": 1.166148290544804e-06, "loss": 0.0007, "num_input_tokens_seen": 69285288, "step": 102780 }, { "epoch": 2.5110546502821682, "grad_norm": 0.01969783566892147, "learning_rate": 1.16606419763353e-06, "loss": 0.0006, "num_input_tokens_seen": 69288808, "step": 102785 }, { "epoch": 2.5111768011140154, "grad_norm": 0.013307438232004642, "learning_rate": 1.1659801035145925e-06, "loss": 0.0018, "num_input_tokens_seen": 69292072, "step": 102790 }, { "epoch": 2.5112989519458626, "grad_norm": 33.22414016723633, "learning_rate": 1.165896008188603e-06, "loss": 0.0431, "num_input_tokens_seen": 69295400, "step": 102795 }, { "epoch": 2.51142110277771, "grad_norm": 0.007968748919665813, "learning_rate": 1.1658119116561732e-06, "loss": 0.1111, "num_input_tokens_seen": 69298408, "step": 102800 }, { "epoch": 2.511543253609557, "grad_norm": 0.14272573590278625, "learning_rate": 1.1657278139179143e-06, "loss": 0.0003, "num_input_tokens_seen": 69301928, "step": 102805 }, { "epoch": 2.511665404441404, "grad_norm": 0.02590774931013584, "learning_rate": 1.1656437149744384e-06, "loss": 0.0001, "num_input_tokens_seen": 69305320, "step": 102810 }, { "epoch": 2.5117875552732514, "grad_norm": 0.013551932759582996, "learning_rate": 1.1655596148263568e-06, "loss": 0.0, "num_input_tokens_seen": 69308520, "step": 102815 }, { "epoch": 2.5119097061050986, "grad_norm": 0.19894510507583618, "learning_rate": 1.1654755134742814e-06, "loss": 0.0502, "num_input_tokens_seen": 69311720, "step": 102820 }, { "epoch": 2.5120318569369458, "grad_norm": 0.01751531846821308, "learning_rate": 1.1653914109188233e-06, "loss": 0.0321, "num_input_tokens_seen": 69315240, "step": 102825 }, { "epoch": 2.512154007768793, "grad_norm": 0.010112373158335686, "learning_rate": 1.1653073071605945e-06, "loss": 0.0001, "num_input_tokens_seen": 69318888, "step": 102830 }, { "epoch": 2.51227615860064, "grad_norm": 0.024472983554005623, "learning_rate": 1.1652232022002064e-06, "loss": 0.0001, "num_input_tokens_seen": 69322024, "step": 102835 }, { "epoch": 2.5123983094324873, "grad_norm": 0.03016134537756443, "learning_rate": 1.1651390960382707e-06, "loss": 0.0501, "num_input_tokens_seen": 69325480, "step": 102840 }, { "epoch": 2.5125204602643345, "grad_norm": 0.02304493635892868, "learning_rate": 1.1650549886753994e-06, "loss": 0.0132, "num_input_tokens_seen": 69329320, "step": 102845 }, { "epoch": 2.5126426110961817, "grad_norm": 8.705880165100098, "learning_rate": 1.1649708801122036e-06, "loss": 0.0525, "num_input_tokens_seen": 69332712, "step": 102850 }, { "epoch": 2.512764761928029, "grad_norm": 80.08638763427734, "learning_rate": 1.1648867703492951e-06, "loss": 0.0337, "num_input_tokens_seen": 69336168, "step": 102855 }, { "epoch": 2.5128869127598756, "grad_norm": 0.08520685136318207, "learning_rate": 1.1648026593872858e-06, "loss": 0.0858, "num_input_tokens_seen": 69339432, "step": 102860 }, { "epoch": 2.5130090635917233, "grad_norm": 127.86909484863281, "learning_rate": 1.1647185472267868e-06, "loss": 0.0842, "num_input_tokens_seen": 69342888, "step": 102865 }, { "epoch": 2.51313121442357, "grad_norm": 0.01910681650042534, "learning_rate": 1.1646344338684107e-06, "loss": 0.1118, "num_input_tokens_seen": 69346344, "step": 102870 }, { "epoch": 2.5132533652554176, "grad_norm": 0.018942229449748993, "learning_rate": 1.1645503193127685e-06, "loss": 0.0001, "num_input_tokens_seen": 69349416, "step": 102875 }, { "epoch": 2.5133755160872644, "grad_norm": 18.96344566345215, "learning_rate": 1.1644662035604725e-06, "loss": 0.0941, "num_input_tokens_seen": 69352552, "step": 102880 }, { "epoch": 2.5134976669191116, "grad_norm": 0.36074990034103394, "learning_rate": 1.1643820866121338e-06, "loss": 0.0003, "num_input_tokens_seen": 69355944, "step": 102885 }, { "epoch": 2.5136198177509588, "grad_norm": 0.0732511654496193, "learning_rate": 1.1642979684683642e-06, "loss": 0.0003, "num_input_tokens_seen": 69358952, "step": 102890 }, { "epoch": 2.513741968582806, "grad_norm": 0.14155447483062744, "learning_rate": 1.1642138491297756e-06, "loss": 0.0004, "num_input_tokens_seen": 69362728, "step": 102895 }, { "epoch": 2.513864119414653, "grad_norm": 0.11240936070680618, "learning_rate": 1.1641297285969798e-06, "loss": 0.0002, "num_input_tokens_seen": 69366120, "step": 102900 }, { "epoch": 2.5139862702465003, "grad_norm": 0.016462501138448715, "learning_rate": 1.1640456068705886e-06, "loss": 0.0772, "num_input_tokens_seen": 69369576, "step": 102905 }, { "epoch": 2.5141084210783475, "grad_norm": 0.06560775637626648, "learning_rate": 1.1639614839512133e-06, "loss": 0.0373, "num_input_tokens_seen": 69372776, "step": 102910 }, { "epoch": 2.5142305719101947, "grad_norm": 108.05178833007812, "learning_rate": 1.1638773598394663e-06, "loss": 0.0064, "num_input_tokens_seen": 69375976, "step": 102915 }, { "epoch": 2.514352722742042, "grad_norm": 0.010637856088578701, "learning_rate": 1.1637932345359588e-06, "loss": 0.0004, "num_input_tokens_seen": 69379048, "step": 102920 }, { "epoch": 2.514474873573889, "grad_norm": 0.133377805352211, "learning_rate": 1.1637091080413032e-06, "loss": 0.0003, "num_input_tokens_seen": 69382184, "step": 102925 }, { "epoch": 2.5145970244057363, "grad_norm": 0.04459724947810173, "learning_rate": 1.1636249803561106e-06, "loss": 0.155, "num_input_tokens_seen": 69385576, "step": 102930 }, { "epoch": 2.5147191752375835, "grad_norm": 0.09748226404190063, "learning_rate": 1.1635408514809934e-06, "loss": 0.0001, "num_input_tokens_seen": 69389032, "step": 102935 }, { "epoch": 2.5148413260694307, "grad_norm": 0.03192561864852905, "learning_rate": 1.163456721416563e-06, "loss": 0.0005, "num_input_tokens_seen": 69392360, "step": 102940 }, { "epoch": 2.514963476901278, "grad_norm": 0.036991093307733536, "learning_rate": 1.1633725901634312e-06, "loss": 0.0002, "num_input_tokens_seen": 69395688, "step": 102945 }, { "epoch": 2.515085627733125, "grad_norm": 0.6261094212532043, "learning_rate": 1.1632884577222105e-06, "loss": 0.0509, "num_input_tokens_seen": 69399208, "step": 102950 }, { "epoch": 2.515207778564972, "grad_norm": 0.004038868006318808, "learning_rate": 1.1632043240935118e-06, "loss": 0.0452, "num_input_tokens_seen": 69402856, "step": 102955 }, { "epoch": 2.5153299293968194, "grad_norm": 0.015494892373681068, "learning_rate": 1.1631201892779473e-06, "loss": 0.0503, "num_input_tokens_seen": 69405992, "step": 102960 }, { "epoch": 2.515452080228666, "grad_norm": 0.043261006474494934, "learning_rate": 1.1630360532761287e-06, "loss": 0.0001, "num_input_tokens_seen": 69409320, "step": 102965 }, { "epoch": 2.5155742310605134, "grad_norm": 0.003127552103251219, "learning_rate": 1.1629519160886685e-06, "loss": 0.0001, "num_input_tokens_seen": 69413224, "step": 102970 }, { "epoch": 2.5156963818923606, "grad_norm": 26.568111419677734, "learning_rate": 1.1628677777161782e-06, "loss": 0.0417, "num_input_tokens_seen": 69416488, "step": 102975 }, { "epoch": 2.5158185327242077, "grad_norm": 0.005540535319596529, "learning_rate": 1.1627836381592694e-06, "loss": 0.0454, "num_input_tokens_seen": 69419816, "step": 102980 }, { "epoch": 2.515940683556055, "grad_norm": 0.06050194054841995, "learning_rate": 1.1626994974185542e-06, "loss": 0.0751, "num_input_tokens_seen": 69423464, "step": 102985 }, { "epoch": 2.516062834387902, "grad_norm": 0.021542632952332497, "learning_rate": 1.1626153554946446e-06, "loss": 0.1411, "num_input_tokens_seen": 69426600, "step": 102990 }, { "epoch": 2.5161849852197493, "grad_norm": 24.883790969848633, "learning_rate": 1.1625312123881522e-06, "loss": 0.0443, "num_input_tokens_seen": 69429928, "step": 102995 }, { "epoch": 2.5163071360515965, "grad_norm": 0.012859504669904709, "learning_rate": 1.1624470680996894e-06, "loss": 0.0634, "num_input_tokens_seen": 69433128, "step": 103000 }, { "epoch": 2.5164292868834437, "grad_norm": 0.07678893953561783, "learning_rate": 1.1623629226298677e-06, "loss": 0.0001, "num_input_tokens_seen": 69437224, "step": 103005 }, { "epoch": 2.516551437715291, "grad_norm": 9.300538063049316, "learning_rate": 1.1622787759792991e-06, "loss": 0.1557, "num_input_tokens_seen": 69440488, "step": 103010 }, { "epoch": 2.516673588547138, "grad_norm": 38.39460754394531, "learning_rate": 1.1621946281485957e-06, "loss": 0.2014, "num_input_tokens_seen": 69443624, "step": 103015 }, { "epoch": 2.5167957393789853, "grad_norm": 0.023470517247915268, "learning_rate": 1.1621104791383688e-06, "loss": 0.0351, "num_input_tokens_seen": 69447720, "step": 103020 }, { "epoch": 2.5169178902108325, "grad_norm": 0.2565382122993469, "learning_rate": 1.1620263289492316e-06, "loss": 0.0007, "num_input_tokens_seen": 69450920, "step": 103025 }, { "epoch": 2.5170400410426796, "grad_norm": 0.04461226239800453, "learning_rate": 1.161942177581795e-06, "loss": 0.0003, "num_input_tokens_seen": 69454248, "step": 103030 }, { "epoch": 2.517162191874527, "grad_norm": 16.27156639099121, "learning_rate": 1.1618580250366714e-06, "loss": 0.0843, "num_input_tokens_seen": 69457512, "step": 103035 }, { "epoch": 2.5172843427063736, "grad_norm": 0.05952126532793045, "learning_rate": 1.161773871314473e-06, "loss": 0.0816, "num_input_tokens_seen": 69460648, "step": 103040 }, { "epoch": 2.517406493538221, "grad_norm": 0.14907532930374146, "learning_rate": 1.1616897164158112e-06, "loss": 0.0288, "num_input_tokens_seen": 69463912, "step": 103045 }, { "epoch": 2.517528644370068, "grad_norm": 0.07751405984163284, "learning_rate": 1.1616055603412982e-06, "loss": 0.0004, "num_input_tokens_seen": 69467560, "step": 103050 }, { "epoch": 2.5176507952019156, "grad_norm": 12.210570335388184, "learning_rate": 1.1615214030915463e-06, "loss": 0.0506, "num_input_tokens_seen": 69470952, "step": 103055 }, { "epoch": 2.5177729460337623, "grad_norm": 0.07794451713562012, "learning_rate": 1.1614372446671672e-06, "loss": 0.0483, "num_input_tokens_seen": 69474216, "step": 103060 }, { "epoch": 2.5178950968656095, "grad_norm": 0.12036775052547455, "learning_rate": 1.1613530850687731e-06, "loss": 0.0006, "num_input_tokens_seen": 69477864, "step": 103065 }, { "epoch": 2.5180172476974567, "grad_norm": 0.2737639546394348, "learning_rate": 1.161268924296976e-06, "loss": 0.0548, "num_input_tokens_seen": 69480808, "step": 103070 }, { "epoch": 2.518139398529304, "grad_norm": 0.018827343359589577, "learning_rate": 1.161184762352388e-06, "loss": 0.0295, "num_input_tokens_seen": 69484712, "step": 103075 }, { "epoch": 2.518261549361151, "grad_norm": 0.06971325725317001, "learning_rate": 1.1611005992356208e-06, "loss": 0.1276, "num_input_tokens_seen": 69488296, "step": 103080 }, { "epoch": 2.5183837001929983, "grad_norm": 0.0024040297139436007, "learning_rate": 1.1610164349472868e-06, "loss": 0.0009, "num_input_tokens_seen": 69491176, "step": 103085 }, { "epoch": 2.5185058510248455, "grad_norm": 0.05693919584155083, "learning_rate": 1.160932269487998e-06, "loss": 0.0004, "num_input_tokens_seen": 69494440, "step": 103090 }, { "epoch": 2.5186280018566927, "grad_norm": 280.9774475097656, "learning_rate": 1.1608481028583666e-06, "loss": 0.0757, "num_input_tokens_seen": 69497960, "step": 103095 }, { "epoch": 2.51875015268854, "grad_norm": 0.0662260353565216, "learning_rate": 1.1607639350590042e-06, "loss": 0.0007, "num_input_tokens_seen": 69501608, "step": 103100 }, { "epoch": 2.518872303520387, "grad_norm": 12.876449584960938, "learning_rate": 1.1606797660905235e-06, "loss": 0.1427, "num_input_tokens_seen": 69505192, "step": 103105 }, { "epoch": 2.5189944543522342, "grad_norm": 0.147857666015625, "learning_rate": 1.1605955959535363e-06, "loss": 0.0422, "num_input_tokens_seen": 69508456, "step": 103110 }, { "epoch": 2.5191166051840814, "grad_norm": 0.06809548288583755, "learning_rate": 1.1605114246486545e-06, "loss": 0.0303, "num_input_tokens_seen": 69511656, "step": 103115 }, { "epoch": 2.5192387560159286, "grad_norm": 0.08468685299158096, "learning_rate": 1.1604272521764904e-06, "loss": 0.1495, "num_input_tokens_seen": 69515112, "step": 103120 }, { "epoch": 2.5193609068477754, "grad_norm": 0.20446525514125824, "learning_rate": 1.1603430785376564e-06, "loss": 0.0458, "num_input_tokens_seen": 69518376, "step": 103125 }, { "epoch": 2.519483057679623, "grad_norm": 0.10892233997583389, "learning_rate": 1.1602589037327644e-06, "loss": 0.0005, "num_input_tokens_seen": 69522216, "step": 103130 }, { "epoch": 2.5196052085114697, "grad_norm": 0.059261370450258255, "learning_rate": 1.1601747277624265e-06, "loss": 0.0009, "num_input_tokens_seen": 69525864, "step": 103135 }, { "epoch": 2.5197273593433174, "grad_norm": 0.18825897574424744, "learning_rate": 1.1600905506272552e-06, "loss": 0.0003, "num_input_tokens_seen": 69528936, "step": 103140 }, { "epoch": 2.519849510175164, "grad_norm": 0.08533114939928055, "learning_rate": 1.1600063723278618e-06, "loss": 0.0003, "num_input_tokens_seen": 69532136, "step": 103145 }, { "epoch": 2.5199716610070113, "grad_norm": 0.1688610017299652, "learning_rate": 1.1599221928648595e-06, "loss": 0.0004, "num_input_tokens_seen": 69535272, "step": 103150 }, { "epoch": 2.5200938118388585, "grad_norm": 0.06158433482050896, "learning_rate": 1.1598380122388598e-06, "loss": 0.1149, "num_input_tokens_seen": 69538664, "step": 103155 }, { "epoch": 2.5202159626707057, "grad_norm": 0.44061222672462463, "learning_rate": 1.1597538304504751e-06, "loss": 0.0008, "num_input_tokens_seen": 69541800, "step": 103160 }, { "epoch": 2.520338113502553, "grad_norm": 0.03005329892039299, "learning_rate": 1.1596696475003176e-06, "loss": 0.0886, "num_input_tokens_seen": 69545320, "step": 103165 }, { "epoch": 2.5204602643344, "grad_norm": 0.027707532048225403, "learning_rate": 1.1595854633889994e-06, "loss": 0.0001, "num_input_tokens_seen": 69548520, "step": 103170 }, { "epoch": 2.5205824151662473, "grad_norm": 120.6915283203125, "learning_rate": 1.1595012781171326e-06, "loss": 0.0377, "num_input_tokens_seen": 69552040, "step": 103175 }, { "epoch": 2.5207045659980944, "grad_norm": 0.06590377539396286, "learning_rate": 1.1594170916853298e-06, "loss": 0.0368, "num_input_tokens_seen": 69556200, "step": 103180 }, { "epoch": 2.5208267168299416, "grad_norm": 21.437572479248047, "learning_rate": 1.1593329040942032e-06, "loss": 0.0416, "num_input_tokens_seen": 69559144, "step": 103185 }, { "epoch": 2.520948867661789, "grad_norm": 0.020117169246077538, "learning_rate": 1.159248715344365e-06, "loss": 0.0662, "num_input_tokens_seen": 69562984, "step": 103190 }, { "epoch": 2.521071018493636, "grad_norm": 0.006265338975936174, "learning_rate": 1.159164525436427e-06, "loss": 0.1218, "num_input_tokens_seen": 69566056, "step": 103195 }, { "epoch": 2.521193169325483, "grad_norm": 7.672004222869873, "learning_rate": 1.1590803343710018e-06, "loss": 0.0007, "num_input_tokens_seen": 69569256, "step": 103200 }, { "epoch": 2.5213153201573304, "grad_norm": 0.02416401356458664, "learning_rate": 1.1589961421487017e-06, "loss": 0.0293, "num_input_tokens_seen": 69572456, "step": 103205 }, { "epoch": 2.5214374709891776, "grad_norm": 1.4142011404037476, "learning_rate": 1.1589119487701386e-06, "loss": 0.0006, "num_input_tokens_seen": 69575912, "step": 103210 }, { "epoch": 2.5215596218210248, "grad_norm": 0.10865466296672821, "learning_rate": 1.1588277542359253e-06, "loss": 0.0392, "num_input_tokens_seen": 69579432, "step": 103215 }, { "epoch": 2.5216817726528715, "grad_norm": 0.03473128750920296, "learning_rate": 1.1587435585466738e-06, "loss": 0.0003, "num_input_tokens_seen": 69583016, "step": 103220 }, { "epoch": 2.521803923484719, "grad_norm": 0.35919907689094543, "learning_rate": 1.1586593617029966e-06, "loss": 0.0011, "num_input_tokens_seen": 69586216, "step": 103225 }, { "epoch": 2.521926074316566, "grad_norm": 0.10506638139486313, "learning_rate": 1.1585751637055056e-06, "loss": 0.0006, "num_input_tokens_seen": 69589352, "step": 103230 }, { "epoch": 2.5220482251484135, "grad_norm": 0.1405552625656128, "learning_rate": 1.1584909645548136e-06, "loss": 0.0373, "num_input_tokens_seen": 69592936, "step": 103235 }, { "epoch": 2.5221703759802603, "grad_norm": 0.13142479956150055, "learning_rate": 1.1584067642515325e-06, "loss": 0.0005, "num_input_tokens_seen": 69595816, "step": 103240 }, { "epoch": 2.5222925268121075, "grad_norm": 1.2676528692245483, "learning_rate": 1.158322562796275e-06, "loss": 0.001, "num_input_tokens_seen": 69598888, "step": 103245 }, { "epoch": 2.5224146776439547, "grad_norm": 0.06140557676553726, "learning_rate": 1.158238360189653e-06, "loss": 0.0006, "num_input_tokens_seen": 69602728, "step": 103250 }, { "epoch": 2.522536828475802, "grad_norm": 0.4262479543685913, "learning_rate": 1.1581541564322792e-06, "loss": 0.0002, "num_input_tokens_seen": 69605928, "step": 103255 }, { "epoch": 2.522658979307649, "grad_norm": 0.046194881200790405, "learning_rate": 1.1580699515247658e-06, "loss": 0.0002, "num_input_tokens_seen": 69609384, "step": 103260 }, { "epoch": 2.5227811301394962, "grad_norm": 0.0063371495343744755, "learning_rate": 1.1579857454677253e-06, "loss": 0.0614, "num_input_tokens_seen": 69612520, "step": 103265 }, { "epoch": 2.5229032809713434, "grad_norm": 0.04240964353084564, "learning_rate": 1.1579015382617696e-06, "loss": 0.0446, "num_input_tokens_seen": 69616424, "step": 103270 }, { "epoch": 2.5230254318031906, "grad_norm": 0.011216072365641594, "learning_rate": 1.1578173299075118e-06, "loss": 0.0894, "num_input_tokens_seen": 69619688, "step": 103275 }, { "epoch": 2.523147582635038, "grad_norm": 0.011698591522872448, "learning_rate": 1.1577331204055638e-06, "loss": 0.0003, "num_input_tokens_seen": 69623080, "step": 103280 }, { "epoch": 2.523269733466885, "grad_norm": 0.015079408884048462, "learning_rate": 1.1576489097565383e-06, "loss": 0.0001, "num_input_tokens_seen": 69626408, "step": 103285 }, { "epoch": 2.523391884298732, "grad_norm": 28.438676834106445, "learning_rate": 1.1575646979610475e-06, "loss": 0.0268, "num_input_tokens_seen": 69630120, "step": 103290 }, { "epoch": 2.5235140351305794, "grad_norm": 0.2645500898361206, "learning_rate": 1.1574804850197037e-06, "loss": 0.0504, "num_input_tokens_seen": 69633704, "step": 103295 }, { "epoch": 2.5236361859624266, "grad_norm": 0.026053164154291153, "learning_rate": 1.1573962709331196e-06, "loss": 0.0002, "num_input_tokens_seen": 69636968, "step": 103300 }, { "epoch": 2.5237583367942733, "grad_norm": 0.05643535032868385, "learning_rate": 1.1573120557019071e-06, "loss": 0.0004, "num_input_tokens_seen": 69640232, "step": 103305 }, { "epoch": 2.523880487626121, "grad_norm": 26.065277099609375, "learning_rate": 1.1572278393266794e-06, "loss": 0.0786, "num_input_tokens_seen": 69643496, "step": 103310 }, { "epoch": 2.5240026384579677, "grad_norm": 0.014052963815629482, "learning_rate": 1.1571436218080485e-06, "loss": 0.0001, "num_input_tokens_seen": 69647208, "step": 103315 }, { "epoch": 2.5241247892898153, "grad_norm": 0.11288266628980637, "learning_rate": 1.157059403146627e-06, "loss": 0.0002, "num_input_tokens_seen": 69650600, "step": 103320 }, { "epoch": 2.524246940121662, "grad_norm": 0.29468193650245667, "learning_rate": 1.156975183343027e-06, "loss": 0.0005, "num_input_tokens_seen": 69654184, "step": 103325 }, { "epoch": 2.5243690909535093, "grad_norm": 0.07013779133558273, "learning_rate": 1.1568909623978612e-06, "loss": 0.1288, "num_input_tokens_seen": 69657448, "step": 103330 }, { "epoch": 2.5244912417853564, "grad_norm": 0.12776708602905273, "learning_rate": 1.1568067403117426e-06, "loss": 0.0426, "num_input_tokens_seen": 69660712, "step": 103335 }, { "epoch": 2.5246133926172036, "grad_norm": 0.009856940247118473, "learning_rate": 1.1567225170852828e-06, "loss": 0.0002, "num_input_tokens_seen": 69664104, "step": 103340 }, { "epoch": 2.524735543449051, "grad_norm": 0.042370233684778214, "learning_rate": 1.156638292719095e-06, "loss": 0.0005, "num_input_tokens_seen": 69667496, "step": 103345 }, { "epoch": 2.524857694280898, "grad_norm": 0.05852619186043739, "learning_rate": 1.1565540672137913e-06, "loss": 0.0001, "num_input_tokens_seen": 69670504, "step": 103350 }, { "epoch": 2.524979845112745, "grad_norm": 51.45964813232422, "learning_rate": 1.1564698405699843e-06, "loss": 0.0945, "num_input_tokens_seen": 69673512, "step": 103355 }, { "epoch": 2.5251019959445924, "grad_norm": 0.035364434123039246, "learning_rate": 1.1563856127882865e-06, "loss": 0.0491, "num_input_tokens_seen": 69677032, "step": 103360 }, { "epoch": 2.5252241467764396, "grad_norm": 62.330257415771484, "learning_rate": 1.1563013838693102e-06, "loss": 0.0574, "num_input_tokens_seen": 69680680, "step": 103365 }, { "epoch": 2.5253462976082868, "grad_norm": 0.4028855860233307, "learning_rate": 1.1562171538136684e-06, "loss": 0.0349, "num_input_tokens_seen": 69684200, "step": 103370 }, { "epoch": 2.525468448440134, "grad_norm": 0.04933023825287819, "learning_rate": 1.1561329226219736e-06, "loss": 0.0358, "num_input_tokens_seen": 69687656, "step": 103375 }, { "epoch": 2.525590599271981, "grad_norm": 0.025662919506430626, "learning_rate": 1.156048690294838e-06, "loss": 0.0501, "num_input_tokens_seen": 69691112, "step": 103380 }, { "epoch": 2.5257127501038283, "grad_norm": 0.020099427551031113, "learning_rate": 1.1559644568328746e-06, "loss": 0.0477, "num_input_tokens_seen": 69694056, "step": 103385 }, { "epoch": 2.5258349009356755, "grad_norm": 0.20844100415706635, "learning_rate": 1.1558802222366954e-06, "loss": 0.0014, "num_input_tokens_seen": 69697192, "step": 103390 }, { "epoch": 2.5259570517675227, "grad_norm": 0.02002965472638607, "learning_rate": 1.1557959865069133e-06, "loss": 0.01, "num_input_tokens_seen": 69700264, "step": 103395 }, { "epoch": 2.5260792025993695, "grad_norm": 0.014823324047029018, "learning_rate": 1.1557117496441414e-06, "loss": 0.0002, "num_input_tokens_seen": 69703784, "step": 103400 }, { "epoch": 2.526201353431217, "grad_norm": 0.012353803031146526, "learning_rate": 1.1556275116489913e-06, "loss": 0.0667, "num_input_tokens_seen": 69707240, "step": 103405 }, { "epoch": 2.526323504263064, "grad_norm": 0.027004987001419067, "learning_rate": 1.1555432725220762e-06, "loss": 0.0407, "num_input_tokens_seen": 69710120, "step": 103410 }, { "epoch": 2.526445655094911, "grad_norm": 0.03901997208595276, "learning_rate": 1.1554590322640088e-06, "loss": 0.0411, "num_input_tokens_seen": 69713448, "step": 103415 }, { "epoch": 2.5265678059267582, "grad_norm": 0.020968155935406685, "learning_rate": 1.1553747908754012e-06, "loss": 0.0514, "num_input_tokens_seen": 69716776, "step": 103420 }, { "epoch": 2.5266899567586054, "grad_norm": 0.07576152682304382, "learning_rate": 1.1552905483568662e-06, "loss": 0.1284, "num_input_tokens_seen": 69720168, "step": 103425 }, { "epoch": 2.5268121075904526, "grad_norm": 13.619282722473145, "learning_rate": 1.1552063047090167e-06, "loss": 0.0299, "num_input_tokens_seen": 69723496, "step": 103430 }, { "epoch": 2.5269342584223, "grad_norm": 0.025862164795398712, "learning_rate": 1.1551220599324654e-06, "loss": 0.0372, "num_input_tokens_seen": 69727144, "step": 103435 }, { "epoch": 2.527056409254147, "grad_norm": 23.02949333190918, "learning_rate": 1.1550378140278245e-06, "loss": 0.0396, "num_input_tokens_seen": 69730536, "step": 103440 }, { "epoch": 2.527178560085994, "grad_norm": 0.7969344854354858, "learning_rate": 1.1549535669957072e-06, "loss": 0.0008, "num_input_tokens_seen": 69733928, "step": 103445 }, { "epoch": 2.5273007109178414, "grad_norm": 0.06862567365169525, "learning_rate": 1.1548693188367256e-06, "loss": 0.0341, "num_input_tokens_seen": 69737192, "step": 103450 }, { "epoch": 2.5274228617496886, "grad_norm": 0.08049945533275604, "learning_rate": 1.1547850695514929e-06, "loss": 0.0003, "num_input_tokens_seen": 69740200, "step": 103455 }, { "epoch": 2.5275450125815357, "grad_norm": 0.0023474690970033407, "learning_rate": 1.1547008191406213e-06, "loss": 0.0002, "num_input_tokens_seen": 69743720, "step": 103460 }, { "epoch": 2.527667163413383, "grad_norm": 0.11849883198738098, "learning_rate": 1.154616567604724e-06, "loss": 0.0002, "num_input_tokens_seen": 69746856, "step": 103465 }, { "epoch": 2.52778931424523, "grad_norm": 0.06941009312868118, "learning_rate": 1.1545323149444132e-06, "loss": 0.0006, "num_input_tokens_seen": 69750248, "step": 103470 }, { "epoch": 2.5279114650770773, "grad_norm": 0.09503491222858429, "learning_rate": 1.1544480611603021e-06, "loss": 0.0527, "num_input_tokens_seen": 69753576, "step": 103475 }, { "epoch": 2.5280336159089245, "grad_norm": 0.011635013855993748, "learning_rate": 1.154363806253003e-06, "loss": 0.0353, "num_input_tokens_seen": 69756712, "step": 103480 }, { "epoch": 2.5281557667407712, "grad_norm": 37.80340576171875, "learning_rate": 1.1542795502231289e-06, "loss": 0.0446, "num_input_tokens_seen": 69759720, "step": 103485 }, { "epoch": 2.528277917572619, "grad_norm": 0.006175327580422163, "learning_rate": 1.1541952930712919e-06, "loss": 0.0001, "num_input_tokens_seen": 69762920, "step": 103490 }, { "epoch": 2.5284000684044656, "grad_norm": 40.25859832763672, "learning_rate": 1.1541110347981059e-06, "loss": 0.0397, "num_input_tokens_seen": 69766376, "step": 103495 }, { "epoch": 2.5285222192363133, "grad_norm": 0.016212737187743187, "learning_rate": 1.1540267754041826e-06, "loss": 0.0452, "num_input_tokens_seen": 69770088, "step": 103500 }, { "epoch": 2.52864437006816, "grad_norm": 0.30683091282844543, "learning_rate": 1.1539425148901356e-06, "loss": 0.0005, "num_input_tokens_seen": 69773864, "step": 103505 }, { "epoch": 2.528766520900007, "grad_norm": 0.03833984583616257, "learning_rate": 1.1538582532565768e-06, "loss": 0.0002, "num_input_tokens_seen": 69777576, "step": 103510 }, { "epoch": 2.5288886717318544, "grad_norm": 0.02461802400648594, "learning_rate": 1.1537739905041197e-06, "loss": 0.0003, "num_input_tokens_seen": 69781288, "step": 103515 }, { "epoch": 2.5290108225637016, "grad_norm": 0.0017495234496891499, "learning_rate": 1.1536897266333766e-06, "loss": 0.0001, "num_input_tokens_seen": 69784488, "step": 103520 }, { "epoch": 2.5291329733955488, "grad_norm": 44.30553436279297, "learning_rate": 1.1536054616449602e-06, "loss": 0.0615, "num_input_tokens_seen": 69787752, "step": 103525 }, { "epoch": 2.529255124227396, "grad_norm": 0.06662532687187195, "learning_rate": 1.153521195539484e-06, "loss": 0.0001, "num_input_tokens_seen": 69790888, "step": 103530 }, { "epoch": 2.529377275059243, "grad_norm": 25.166748046875, "learning_rate": 1.1534369283175602e-06, "loss": 0.059, "num_input_tokens_seen": 69794408, "step": 103535 }, { "epoch": 2.5294994258910903, "grad_norm": 42.99397659301758, "learning_rate": 1.1533526599798017e-06, "loss": 0.154, "num_input_tokens_seen": 69797672, "step": 103540 }, { "epoch": 2.5296215767229375, "grad_norm": 0.0017442662501707673, "learning_rate": 1.1532683905268216e-06, "loss": 0.0, "num_input_tokens_seen": 69800744, "step": 103545 }, { "epoch": 2.5297437275547847, "grad_norm": 0.02642848715186119, "learning_rate": 1.1531841199592323e-06, "loss": 0.0537, "num_input_tokens_seen": 69804456, "step": 103550 }, { "epoch": 2.529865878386632, "grad_norm": 0.06263232976198196, "learning_rate": 1.1530998482776473e-06, "loss": 0.0003, "num_input_tokens_seen": 69807784, "step": 103555 }, { "epoch": 2.529988029218479, "grad_norm": 6.440359115600586, "learning_rate": 1.1530155754826788e-06, "loss": 0.0007, "num_input_tokens_seen": 69811496, "step": 103560 }, { "epoch": 2.5301101800503263, "grad_norm": 0.02709343284368515, "learning_rate": 1.1529313015749399e-06, "loss": 0.0411, "num_input_tokens_seen": 69814888, "step": 103565 }, { "epoch": 2.530232330882173, "grad_norm": 0.17237377166748047, "learning_rate": 1.1528470265550434e-06, "loss": 0.0004, "num_input_tokens_seen": 69817896, "step": 103570 }, { "epoch": 2.5303544817140207, "grad_norm": 0.00297606666572392, "learning_rate": 1.1527627504236022e-06, "loss": 0.0003, "num_input_tokens_seen": 69821416, "step": 103575 }, { "epoch": 2.5304766325458674, "grad_norm": 1.024841070175171, "learning_rate": 1.1526784731812292e-06, "loss": 0.0481, "num_input_tokens_seen": 69824808, "step": 103580 }, { "epoch": 2.530598783377715, "grad_norm": 0.609325110912323, "learning_rate": 1.1525941948285372e-06, "loss": 0.0322, "num_input_tokens_seen": 69827880, "step": 103585 }, { "epoch": 2.530720934209562, "grad_norm": 0.01956762745976448, "learning_rate": 1.1525099153661391e-06, "loss": 0.0001, "num_input_tokens_seen": 69831144, "step": 103590 }, { "epoch": 2.530843085041409, "grad_norm": 0.004689326509833336, "learning_rate": 1.1524256347946482e-06, "loss": 0.0001, "num_input_tokens_seen": 69834216, "step": 103595 }, { "epoch": 2.530965235873256, "grad_norm": 0.010422090999782085, "learning_rate": 1.1523413531146768e-06, "loss": 0.0003, "num_input_tokens_seen": 69837736, "step": 103600 }, { "epoch": 2.5310873867051034, "grad_norm": 9.557653427124023, "learning_rate": 1.1522570703268381e-06, "loss": 0.2075, "num_input_tokens_seen": 69841192, "step": 103605 }, { "epoch": 2.5312095375369505, "grad_norm": 0.26799166202545166, "learning_rate": 1.152172786431745e-06, "loss": 0.0003, "num_input_tokens_seen": 69844584, "step": 103610 }, { "epoch": 2.5313316883687977, "grad_norm": 0.018254579976201057, "learning_rate": 1.152088501430011e-06, "loss": 0.0927, "num_input_tokens_seen": 69847656, "step": 103615 }, { "epoch": 2.531453839200645, "grad_norm": 0.004553716164082289, "learning_rate": 1.152004215322248e-06, "loss": 0.1143, "num_input_tokens_seen": 69850920, "step": 103620 }, { "epoch": 2.531575990032492, "grad_norm": 0.36942997574806213, "learning_rate": 1.1519199281090697e-06, "loss": 0.0559, "num_input_tokens_seen": 69854248, "step": 103625 }, { "epoch": 2.5316981408643393, "grad_norm": 0.31802067160606384, "learning_rate": 1.1518356397910887e-06, "loss": 0.0527, "num_input_tokens_seen": 69857640, "step": 103630 }, { "epoch": 2.5318202916961865, "grad_norm": 0.22344498336315155, "learning_rate": 1.151751350368918e-06, "loss": 0.0001, "num_input_tokens_seen": 69861288, "step": 103635 }, { "epoch": 2.5319424425280337, "grad_norm": 893.3682861328125, "learning_rate": 1.1516670598431709e-06, "loss": 0.076, "num_input_tokens_seen": 69864744, "step": 103640 }, { "epoch": 2.532064593359881, "grad_norm": 0.15178991854190826, "learning_rate": 1.15158276821446e-06, "loss": 0.0737, "num_input_tokens_seen": 69867880, "step": 103645 }, { "epoch": 2.532186744191728, "grad_norm": 0.035966865718364716, "learning_rate": 1.1514984754833983e-06, "loss": 0.0002, "num_input_tokens_seen": 69871464, "step": 103650 }, { "epoch": 2.5323088950235753, "grad_norm": 0.05065901577472687, "learning_rate": 1.1514141816505992e-06, "loss": 0.0004, "num_input_tokens_seen": 69874600, "step": 103655 }, { "epoch": 2.5324310458554224, "grad_norm": 0.1139889732003212, "learning_rate": 1.1513298867166755e-06, "loss": 0.0003, "num_input_tokens_seen": 69877736, "step": 103660 }, { "epoch": 2.532553196687269, "grad_norm": 0.07051640748977661, "learning_rate": 1.1512455906822398e-06, "loss": 0.0001, "num_input_tokens_seen": 69881000, "step": 103665 }, { "epoch": 2.532675347519117, "grad_norm": 27.080020904541016, "learning_rate": 1.1511612935479058e-06, "loss": 0.0609, "num_input_tokens_seen": 69884456, "step": 103670 }, { "epoch": 2.5327974983509636, "grad_norm": 0.03246402367949486, "learning_rate": 1.1510769953142858e-06, "loss": 0.0002, "num_input_tokens_seen": 69887976, "step": 103675 }, { "epoch": 2.532919649182811, "grad_norm": 0.007298397831618786, "learning_rate": 1.1509926959819936e-06, "loss": 0.0003, "num_input_tokens_seen": 69891304, "step": 103680 }, { "epoch": 2.533041800014658, "grad_norm": 0.011478755623102188, "learning_rate": 1.1509083955516418e-06, "loss": 0.0623, "num_input_tokens_seen": 69894696, "step": 103685 }, { "epoch": 2.533163950846505, "grad_norm": 0.10327120125293732, "learning_rate": 1.1508240940238438e-06, "loss": 0.0379, "num_input_tokens_seen": 69897960, "step": 103690 }, { "epoch": 2.5332861016783523, "grad_norm": 0.003992895130068064, "learning_rate": 1.150739791399212e-06, "loss": 0.0005, "num_input_tokens_seen": 69901352, "step": 103695 }, { "epoch": 2.5334082525101995, "grad_norm": 0.0049509950913488865, "learning_rate": 1.1506554876783604e-06, "loss": 0.0708, "num_input_tokens_seen": 69904744, "step": 103700 }, { "epoch": 2.5335304033420467, "grad_norm": 0.002750345505774021, "learning_rate": 1.1505711828619008e-06, "loss": 0.0433, "num_input_tokens_seen": 69908328, "step": 103705 }, { "epoch": 2.533652554173894, "grad_norm": 0.014179835096001625, "learning_rate": 1.150486876950448e-06, "loss": 0.0002, "num_input_tokens_seen": 69911848, "step": 103710 }, { "epoch": 2.533774705005741, "grad_norm": 0.02460402064025402, "learning_rate": 1.1504025699446136e-06, "loss": 0.0379, "num_input_tokens_seen": 69914984, "step": 103715 }, { "epoch": 2.5338968558375883, "grad_norm": 0.08573606610298157, "learning_rate": 1.1503182618450114e-06, "loss": 0.04, "num_input_tokens_seen": 69918440, "step": 103720 }, { "epoch": 2.5340190066694355, "grad_norm": 32.343074798583984, "learning_rate": 1.1502339526522545e-06, "loss": 0.103, "num_input_tokens_seen": 69921448, "step": 103725 }, { "epoch": 2.5341411575012827, "grad_norm": 146.36965942382812, "learning_rate": 1.1501496423669557e-06, "loss": 0.1077, "num_input_tokens_seen": 69924776, "step": 103730 }, { "epoch": 2.53426330833313, "grad_norm": 0.018751777708530426, "learning_rate": 1.1500653309897282e-06, "loss": 0.0513, "num_input_tokens_seen": 69927912, "step": 103735 }, { "epoch": 2.534385459164977, "grad_norm": 10.86872673034668, "learning_rate": 1.1499810185211853e-06, "loss": 0.1221, "num_input_tokens_seen": 69931112, "step": 103740 }, { "epoch": 2.5345076099968242, "grad_norm": 0.5235853791236877, "learning_rate": 1.14989670496194e-06, "loss": 0.0003, "num_input_tokens_seen": 69934760, "step": 103745 }, { "epoch": 2.534629760828671, "grad_norm": 0.023617833852767944, "learning_rate": 1.149812390312606e-06, "loss": 0.0005, "num_input_tokens_seen": 69938472, "step": 103750 }, { "epoch": 2.5347519116605186, "grad_norm": 0.2028709203004837, "learning_rate": 1.1497280745737955e-06, "loss": 0.0005, "num_input_tokens_seen": 69941544, "step": 103755 }, { "epoch": 2.5348740624923654, "grad_norm": 0.06458128988742828, "learning_rate": 1.1496437577461227e-06, "loss": 0.0577, "num_input_tokens_seen": 69944744, "step": 103760 }, { "epoch": 2.534996213324213, "grad_norm": 0.22427748143672943, "learning_rate": 1.1495594398301998e-06, "loss": 0.0896, "num_input_tokens_seen": 69948008, "step": 103765 }, { "epoch": 2.5351183641560597, "grad_norm": 0.015495997853577137, "learning_rate": 1.1494751208266408e-06, "loss": 0.0024, "num_input_tokens_seen": 69951144, "step": 103770 }, { "epoch": 2.535240514987907, "grad_norm": 0.03996007516980171, "learning_rate": 1.1493908007360581e-06, "loss": 0.0001, "num_input_tokens_seen": 69955304, "step": 103775 }, { "epoch": 2.535362665819754, "grad_norm": 0.00529811205342412, "learning_rate": 1.1493064795590655e-06, "loss": 0.0611, "num_input_tokens_seen": 69958696, "step": 103780 }, { "epoch": 2.5354848166516013, "grad_norm": 0.026105934754014015, "learning_rate": 1.1492221572962762e-06, "loss": 0.0002, "num_input_tokens_seen": 69962024, "step": 103785 }, { "epoch": 2.5356069674834485, "grad_norm": 0.013941958546638489, "learning_rate": 1.1491378339483028e-06, "loss": 0.1421, "num_input_tokens_seen": 69965480, "step": 103790 }, { "epoch": 2.5357291183152957, "grad_norm": 0.01809925213456154, "learning_rate": 1.1490535095157594e-06, "loss": 0.0547, "num_input_tokens_seen": 69968808, "step": 103795 }, { "epoch": 2.535851269147143, "grad_norm": 0.8025308847427368, "learning_rate": 1.1489691839992584e-06, "loss": 0.1093, "num_input_tokens_seen": 69972072, "step": 103800 }, { "epoch": 2.53597341997899, "grad_norm": 0.03385000675916672, "learning_rate": 1.1488848573994137e-06, "loss": 0.0117, "num_input_tokens_seen": 69975592, "step": 103805 }, { "epoch": 2.5360955708108373, "grad_norm": 0.010065199807286263, "learning_rate": 1.148800529716838e-06, "loss": 0.0939, "num_input_tokens_seen": 69978792, "step": 103810 }, { "epoch": 2.5362177216426844, "grad_norm": 0.12215670198202133, "learning_rate": 1.1487162009521453e-06, "loss": 0.0549, "num_input_tokens_seen": 69983080, "step": 103815 }, { "epoch": 2.5363398724745316, "grad_norm": 0.012825618498027325, "learning_rate": 1.1486318711059481e-06, "loss": 0.0003, "num_input_tokens_seen": 69986408, "step": 103820 }, { "epoch": 2.536462023306379, "grad_norm": 0.26266250014305115, "learning_rate": 1.14854754017886e-06, "loss": 0.0032, "num_input_tokens_seen": 69989672, "step": 103825 }, { "epoch": 2.536584174138226, "grad_norm": 22.808828353881836, "learning_rate": 1.1484632081714941e-06, "loss": 0.0553, "num_input_tokens_seen": 69992872, "step": 103830 }, { "epoch": 2.536706324970073, "grad_norm": 0.2519030272960663, "learning_rate": 1.148378875084464e-06, "loss": 0.0003, "num_input_tokens_seen": 69996328, "step": 103835 }, { "epoch": 2.5368284758019204, "grad_norm": 0.03134218975901604, "learning_rate": 1.1482945409183825e-06, "loss": 0.0003, "num_input_tokens_seen": 69999400, "step": 103840 }, { "epoch": 2.536950626633767, "grad_norm": 583.1251220703125, "learning_rate": 1.1482102056738636e-06, "loss": 0.0552, "num_input_tokens_seen": 70002728, "step": 103845 }, { "epoch": 2.5370727774656148, "grad_norm": 72.49507141113281, "learning_rate": 1.1481258693515202e-06, "loss": 0.0368, "num_input_tokens_seen": 70006824, "step": 103850 }, { "epoch": 2.5371949282974615, "grad_norm": 0.02389582060277462, "learning_rate": 1.1480415319519653e-06, "loss": 0.0001, "num_input_tokens_seen": 70010792, "step": 103855 }, { "epoch": 2.5373170791293087, "grad_norm": 0.028493667021393776, "learning_rate": 1.1479571934758128e-06, "loss": 0.0492, "num_input_tokens_seen": 70014120, "step": 103860 }, { "epoch": 2.537439229961156, "grad_norm": 0.18289715051651, "learning_rate": 1.147872853923676e-06, "loss": 0.001, "num_input_tokens_seen": 70017192, "step": 103865 }, { "epoch": 2.537561380793003, "grad_norm": 0.030623802915215492, "learning_rate": 1.1477885132961678e-06, "loss": 0.1266, "num_input_tokens_seen": 70021288, "step": 103870 }, { "epoch": 2.5376835316248503, "grad_norm": 0.007544918451458216, "learning_rate": 1.1477041715939018e-06, "loss": 0.0002, "num_input_tokens_seen": 70024616, "step": 103875 }, { "epoch": 2.5378056824566975, "grad_norm": 0.006404994986951351, "learning_rate": 1.1476198288174912e-06, "loss": 0.0003, "num_input_tokens_seen": 70027752, "step": 103880 }, { "epoch": 2.5379278332885447, "grad_norm": 0.03089703619480133, "learning_rate": 1.1475354849675496e-06, "loss": 0.0002, "num_input_tokens_seen": 70031080, "step": 103885 }, { "epoch": 2.538049984120392, "grad_norm": 18.35439682006836, "learning_rate": 1.1474511400446903e-06, "loss": 0.1082, "num_input_tokens_seen": 70034472, "step": 103890 }, { "epoch": 2.538172134952239, "grad_norm": 0.25763773918151855, "learning_rate": 1.1473667940495265e-06, "loss": 0.0636, "num_input_tokens_seen": 70037800, "step": 103895 }, { "epoch": 2.5382942857840862, "grad_norm": 0.5698397159576416, "learning_rate": 1.1472824469826718e-06, "loss": 0.0506, "num_input_tokens_seen": 70041448, "step": 103900 }, { "epoch": 2.5384164366159334, "grad_norm": 0.06392552703619003, "learning_rate": 1.1471980988447397e-06, "loss": 0.0272, "num_input_tokens_seen": 70044712, "step": 103905 }, { "epoch": 2.5385385874477806, "grad_norm": 0.008248881436884403, "learning_rate": 1.1471137496363435e-06, "loss": 0.0003, "num_input_tokens_seen": 70047848, "step": 103910 }, { "epoch": 2.538660738279628, "grad_norm": 0.022066285833716393, "learning_rate": 1.1470293993580961e-06, "loss": 0.0003, "num_input_tokens_seen": 70050856, "step": 103915 }, { "epoch": 2.538782889111475, "grad_norm": 0.03940925747156143, "learning_rate": 1.1469450480106118e-06, "loss": 0.1717, "num_input_tokens_seen": 70054056, "step": 103920 }, { "epoch": 2.538905039943322, "grad_norm": 0.11242895573377609, "learning_rate": 1.1468606955945034e-06, "loss": 0.0005, "num_input_tokens_seen": 70057384, "step": 103925 }, { "epoch": 2.539027190775169, "grad_norm": 0.13371014595031738, "learning_rate": 1.1467763421103846e-06, "loss": 0.0002, "num_input_tokens_seen": 70060328, "step": 103930 }, { "epoch": 2.5391493416070166, "grad_norm": 0.04523512348532677, "learning_rate": 1.1466919875588688e-06, "loss": 0.0002, "num_input_tokens_seen": 70063528, "step": 103935 }, { "epoch": 2.5392714924388633, "grad_norm": 0.050599198788404465, "learning_rate": 1.1466076319405693e-06, "loss": 0.0429, "num_input_tokens_seen": 70066856, "step": 103940 }, { "epoch": 2.539393643270711, "grad_norm": 0.002950431313365698, "learning_rate": 1.1465232752560996e-06, "loss": 0.0001, "num_input_tokens_seen": 70070440, "step": 103945 }, { "epoch": 2.5395157941025577, "grad_norm": 0.1107611358165741, "learning_rate": 1.1464389175060734e-06, "loss": 0.0763, "num_input_tokens_seen": 70073640, "step": 103950 }, { "epoch": 2.539637944934405, "grad_norm": 64.87561798095703, "learning_rate": 1.1463545586911036e-06, "loss": 0.061, "num_input_tokens_seen": 70077224, "step": 103955 }, { "epoch": 2.539760095766252, "grad_norm": 157.2279815673828, "learning_rate": 1.1462701988118047e-06, "loss": 0.0204, "num_input_tokens_seen": 70080872, "step": 103960 }, { "epoch": 2.5398822465980992, "grad_norm": 0.033033862709999084, "learning_rate": 1.146185837868789e-06, "loss": 0.001, "num_input_tokens_seen": 70083816, "step": 103965 }, { "epoch": 2.5400043974299464, "grad_norm": 0.032412346452474594, "learning_rate": 1.1461014758626712e-06, "loss": 0.0478, "num_input_tokens_seen": 70087080, "step": 103970 }, { "epoch": 2.5401265482617936, "grad_norm": 1.0125641822814941, "learning_rate": 1.146017112794064e-06, "loss": 0.0004, "num_input_tokens_seen": 70090344, "step": 103975 }, { "epoch": 2.540248699093641, "grad_norm": 0.058324720710515976, "learning_rate": 1.1459327486635808e-06, "loss": 0.0001, "num_input_tokens_seen": 70093480, "step": 103980 }, { "epoch": 2.540370849925488, "grad_norm": 0.5170395970344543, "learning_rate": 1.1458483834718352e-06, "loss": 0.0005, "num_input_tokens_seen": 70096552, "step": 103985 }, { "epoch": 2.540493000757335, "grad_norm": 16.676877975463867, "learning_rate": 1.1457640172194414e-06, "loss": 0.0779, "num_input_tokens_seen": 70100072, "step": 103990 }, { "epoch": 2.5406151515891824, "grad_norm": 0.2772276997566223, "learning_rate": 1.1456796499070123e-06, "loss": 0.0007, "num_input_tokens_seen": 70103208, "step": 103995 }, { "epoch": 2.5407373024210296, "grad_norm": 0.2213776856660843, "learning_rate": 1.1455952815351616e-06, "loss": 0.0004, "num_input_tokens_seen": 70106984, "step": 104000 }, { "epoch": 2.5408594532528768, "grad_norm": 0.09152863919734955, "learning_rate": 1.1455109121045028e-06, "loss": 0.0016, "num_input_tokens_seen": 70110056, "step": 104005 }, { "epoch": 2.540981604084724, "grad_norm": 0.07251527905464172, "learning_rate": 1.1454265416156497e-06, "loss": 0.0786, "num_input_tokens_seen": 70113256, "step": 104010 }, { "epoch": 2.541103754916571, "grad_norm": 0.010876025073230267, "learning_rate": 1.1453421700692152e-06, "loss": 0.1102, "num_input_tokens_seen": 70116584, "step": 104015 }, { "epoch": 2.5412259057484183, "grad_norm": 0.6156110763549805, "learning_rate": 1.1452577974658139e-06, "loss": 0.0002, "num_input_tokens_seen": 70120296, "step": 104020 }, { "epoch": 2.541348056580265, "grad_norm": 0.0007826112559996545, "learning_rate": 1.1451734238060587e-06, "loss": 0.0001, "num_input_tokens_seen": 70124008, "step": 104025 }, { "epoch": 2.5414702074121127, "grad_norm": 51.516597747802734, "learning_rate": 1.145089049090563e-06, "loss": 0.0318, "num_input_tokens_seen": 70127336, "step": 104030 }, { "epoch": 2.5415923582439595, "grad_norm": 0.01903372071683407, "learning_rate": 1.145004673319941e-06, "loss": 0.0001, "num_input_tokens_seen": 70130216, "step": 104035 }, { "epoch": 2.5417145090758066, "grad_norm": 0.014983262866735458, "learning_rate": 1.144920296494806e-06, "loss": 0.08, "num_input_tokens_seen": 70133032, "step": 104040 }, { "epoch": 2.541836659907654, "grad_norm": 58.6263542175293, "learning_rate": 1.1448359186157714e-06, "loss": 0.1615, "num_input_tokens_seen": 70136168, "step": 104045 }, { "epoch": 2.541958810739501, "grad_norm": 0.15867555141448975, "learning_rate": 1.1447515396834513e-06, "loss": 0.0461, "num_input_tokens_seen": 70139112, "step": 104050 }, { "epoch": 2.542080961571348, "grad_norm": 38.69710922241211, "learning_rate": 1.144667159698459e-06, "loss": 0.0632, "num_input_tokens_seen": 70142056, "step": 104055 }, { "epoch": 2.5422031124031954, "grad_norm": 0.004569328855723143, "learning_rate": 1.1445827786614082e-06, "loss": 0.0895, "num_input_tokens_seen": 70145256, "step": 104060 }, { "epoch": 2.5423252632350426, "grad_norm": 0.27281489968299866, "learning_rate": 1.1444983965729125e-06, "loss": 0.0009, "num_input_tokens_seen": 70148584, "step": 104065 }, { "epoch": 2.54244741406689, "grad_norm": 28.797000885009766, "learning_rate": 1.1444140134335855e-06, "loss": 0.0608, "num_input_tokens_seen": 70151720, "step": 104070 }, { "epoch": 2.542569564898737, "grad_norm": 0.031929127871990204, "learning_rate": 1.1443296292440412e-06, "loss": 0.0352, "num_input_tokens_seen": 70155432, "step": 104075 }, { "epoch": 2.542691715730584, "grad_norm": 20.4262752532959, "learning_rate": 1.1442452440048929e-06, "loss": 0.0566, "num_input_tokens_seen": 70158440, "step": 104080 }, { "epoch": 2.5428138665624314, "grad_norm": 0.33279237151145935, "learning_rate": 1.1441608577167544e-06, "loss": 0.0275, "num_input_tokens_seen": 70161960, "step": 104085 }, { "epoch": 2.5429360173942785, "grad_norm": 0.028363827615976334, "learning_rate": 1.1440764703802394e-06, "loss": 0.0454, "num_input_tokens_seen": 70165160, "step": 104090 }, { "epoch": 2.5430581682261257, "grad_norm": 1.5578937530517578, "learning_rate": 1.1439920819959614e-06, "loss": 0.0004, "num_input_tokens_seen": 70168360, "step": 104095 }, { "epoch": 2.543180319057973, "grad_norm": 0.04033007472753525, "learning_rate": 1.1439076925645347e-06, "loss": 0.0001, "num_input_tokens_seen": 70171368, "step": 104100 }, { "epoch": 2.54330246988982, "grad_norm": 0.014733080752193928, "learning_rate": 1.143823302086572e-06, "loss": 0.0002, "num_input_tokens_seen": 70174760, "step": 104105 }, { "epoch": 2.543424620721667, "grad_norm": 0.0995849072933197, "learning_rate": 1.1437389105626877e-06, "loss": 0.1443, "num_input_tokens_seen": 70178088, "step": 104110 }, { "epoch": 2.5435467715535145, "grad_norm": 0.01298166811466217, "learning_rate": 1.1436545179934953e-06, "loss": 0.0003, "num_input_tokens_seen": 70181544, "step": 104115 }, { "epoch": 2.5436689223853612, "grad_norm": 0.003690902143716812, "learning_rate": 1.1435701243796088e-06, "loss": 0.0225, "num_input_tokens_seen": 70185320, "step": 104120 }, { "epoch": 2.543791073217209, "grad_norm": 0.007218752522021532, "learning_rate": 1.1434857297216417e-06, "loss": 0.0005, "num_input_tokens_seen": 70188712, "step": 104125 }, { "epoch": 2.5439132240490556, "grad_norm": 0.010767634958028793, "learning_rate": 1.143401334020208e-06, "loss": 0.0001, "num_input_tokens_seen": 70191784, "step": 104130 }, { "epoch": 2.544035374880903, "grad_norm": 0.02760545164346695, "learning_rate": 1.143316937275921e-06, "loss": 0.0745, "num_input_tokens_seen": 70195176, "step": 104135 }, { "epoch": 2.54415752571275, "grad_norm": 0.013613578863441944, "learning_rate": 1.1432325394893946e-06, "loss": 0.0892, "num_input_tokens_seen": 70198376, "step": 104140 }, { "epoch": 2.544279676544597, "grad_norm": 0.013177074491977692, "learning_rate": 1.1431481406612427e-06, "loss": 0.0002, "num_input_tokens_seen": 70201640, "step": 104145 }, { "epoch": 2.5444018273764444, "grad_norm": 0.01608988456428051, "learning_rate": 1.143063740792079e-06, "loss": 0.0331, "num_input_tokens_seen": 70204776, "step": 104150 }, { "epoch": 2.5445239782082916, "grad_norm": 0.15606385469436646, "learning_rate": 1.1429793398825173e-06, "loss": 0.0663, "num_input_tokens_seen": 70208040, "step": 104155 }, { "epoch": 2.5446461290401388, "grad_norm": 0.22624799609184265, "learning_rate": 1.1428949379331716e-06, "loss": 0.1334, "num_input_tokens_seen": 70211240, "step": 104160 }, { "epoch": 2.544768279871986, "grad_norm": 0.007654536981135607, "learning_rate": 1.1428105349446554e-06, "loss": 0.0003, "num_input_tokens_seen": 70214824, "step": 104165 }, { "epoch": 2.544890430703833, "grad_norm": 0.17889165878295898, "learning_rate": 1.1427261309175821e-06, "loss": 0.0404, "num_input_tokens_seen": 70217832, "step": 104170 }, { "epoch": 2.5450125815356803, "grad_norm": 1.8349424600601196, "learning_rate": 1.1426417258525667e-06, "loss": 0.001, "num_input_tokens_seen": 70221672, "step": 104175 }, { "epoch": 2.5451347323675275, "grad_norm": 133.47740173339844, "learning_rate": 1.1425573197502221e-06, "loss": 0.0196, "num_input_tokens_seen": 70224616, "step": 104180 }, { "epoch": 2.5452568831993747, "grad_norm": 0.011612365022301674, "learning_rate": 1.1424729126111623e-06, "loss": 0.0895, "num_input_tokens_seen": 70227944, "step": 104185 }, { "epoch": 2.545379034031222, "grad_norm": 0.033770378679037094, "learning_rate": 1.142388504436001e-06, "loss": 0.0384, "num_input_tokens_seen": 70231400, "step": 104190 }, { "epoch": 2.5455011848630686, "grad_norm": 20.022085189819336, "learning_rate": 1.1423040952253523e-06, "loss": 0.0854, "num_input_tokens_seen": 70235048, "step": 104195 }, { "epoch": 2.5456233356949163, "grad_norm": 0.1359310746192932, "learning_rate": 1.14221968497983e-06, "loss": 0.0004, "num_input_tokens_seen": 70238056, "step": 104200 }, { "epoch": 2.545745486526763, "grad_norm": 0.12865757942199707, "learning_rate": 1.1421352737000475e-06, "loss": 0.0048, "num_input_tokens_seen": 70241256, "step": 104205 }, { "epoch": 2.5458676373586107, "grad_norm": 0.13368584215641022, "learning_rate": 1.1420508613866193e-06, "loss": 0.0939, "num_input_tokens_seen": 70244840, "step": 104210 }, { "epoch": 2.5459897881904574, "grad_norm": 0.026051154360175133, "learning_rate": 1.1419664480401592e-06, "loss": 0.0282, "num_input_tokens_seen": 70248040, "step": 104215 }, { "epoch": 2.5461119390223046, "grad_norm": 0.04423855245113373, "learning_rate": 1.141882033661281e-06, "loss": 0.0002, "num_input_tokens_seen": 70251560, "step": 104220 }, { "epoch": 2.546234089854152, "grad_norm": 0.0026223089080303907, "learning_rate": 1.1417976182505981e-06, "loss": 0.0384, "num_input_tokens_seen": 70254696, "step": 104225 }, { "epoch": 2.546356240685999, "grad_norm": 0.009092241525650024, "learning_rate": 1.141713201808725e-06, "loss": 0.0489, "num_input_tokens_seen": 70258408, "step": 104230 }, { "epoch": 2.546478391517846, "grad_norm": 0.261366605758667, "learning_rate": 1.1416287843362753e-06, "loss": 0.0386, "num_input_tokens_seen": 70261544, "step": 104235 }, { "epoch": 2.5466005423496934, "grad_norm": 0.008134927600622177, "learning_rate": 1.1415443658338632e-06, "loss": 0.049, "num_input_tokens_seen": 70265512, "step": 104240 }, { "epoch": 2.5467226931815405, "grad_norm": 0.005791224539279938, "learning_rate": 1.141459946302102e-06, "loss": 0.0002, "num_input_tokens_seen": 70269032, "step": 104245 }, { "epoch": 2.5468448440133877, "grad_norm": 0.024592120200395584, "learning_rate": 1.1413755257416064e-06, "loss": 0.0005, "num_input_tokens_seen": 70272552, "step": 104250 }, { "epoch": 2.546966994845235, "grad_norm": 779.8514404296875, "learning_rate": 1.14129110415299e-06, "loss": 0.0149, "num_input_tokens_seen": 70276264, "step": 104255 }, { "epoch": 2.547089145677082, "grad_norm": 0.012323008850216866, "learning_rate": 1.1412066815368664e-06, "loss": 0.0001, "num_input_tokens_seen": 70280232, "step": 104260 }, { "epoch": 2.5472112965089293, "grad_norm": 10.281240463256836, "learning_rate": 1.1411222578938496e-06, "loss": 0.0857, "num_input_tokens_seen": 70283944, "step": 104265 }, { "epoch": 2.5473334473407765, "grad_norm": 0.0013304692693054676, "learning_rate": 1.1410378332245542e-06, "loss": 0.0043, "num_input_tokens_seen": 70286888, "step": 104270 }, { "epoch": 2.5474555981726237, "grad_norm": 0.023484796285629272, "learning_rate": 1.1409534075295938e-06, "loss": 0.0002, "num_input_tokens_seen": 70289960, "step": 104275 }, { "epoch": 2.547577749004471, "grad_norm": 0.007967732846736908, "learning_rate": 1.140868980809582e-06, "loss": 0.0002, "num_input_tokens_seen": 70293480, "step": 104280 }, { "epoch": 2.547699899836318, "grad_norm": 13.040853500366211, "learning_rate": 1.1407845530651336e-06, "loss": 0.0424, "num_input_tokens_seen": 70297064, "step": 104285 }, { "epoch": 2.547822050668165, "grad_norm": 0.01658696122467518, "learning_rate": 1.140700124296862e-06, "loss": 0.0003, "num_input_tokens_seen": 70300584, "step": 104290 }, { "epoch": 2.5479442015000124, "grad_norm": 0.016342423856258392, "learning_rate": 1.1406156945053806e-06, "loss": 0.0005, "num_input_tokens_seen": 70303848, "step": 104295 }, { "epoch": 2.548066352331859, "grad_norm": 226.27023315429688, "learning_rate": 1.1405312636913044e-06, "loss": 0.0167, "num_input_tokens_seen": 70307304, "step": 104300 }, { "epoch": 2.548188503163707, "grad_norm": 14.65396499633789, "learning_rate": 1.1404468318552471e-06, "loss": 0.0823, "num_input_tokens_seen": 70310888, "step": 104305 }, { "epoch": 2.5483106539955536, "grad_norm": 0.19434776902198792, "learning_rate": 1.140362398997823e-06, "loss": 0.0244, "num_input_tokens_seen": 70314664, "step": 104310 }, { "epoch": 2.5484328048274008, "grad_norm": 0.00956706702709198, "learning_rate": 1.1402779651196452e-06, "loss": 0.0003, "num_input_tokens_seen": 70318120, "step": 104315 }, { "epoch": 2.548554955659248, "grad_norm": 0.6326279044151306, "learning_rate": 1.1401935302213286e-06, "loss": 0.0411, "num_input_tokens_seen": 70321960, "step": 104320 }, { "epoch": 2.548677106491095, "grad_norm": 0.01305704191327095, "learning_rate": 1.1401090943034865e-06, "loss": 0.0766, "num_input_tokens_seen": 70325288, "step": 104325 }, { "epoch": 2.5487992573229423, "grad_norm": 9.197270393371582, "learning_rate": 1.140024657366734e-06, "loss": 0.0956, "num_input_tokens_seen": 70329000, "step": 104330 }, { "epoch": 2.5489214081547895, "grad_norm": 23.622716903686523, "learning_rate": 1.1399402194116842e-06, "loss": 0.0524, "num_input_tokens_seen": 70332904, "step": 104335 }, { "epoch": 2.5490435589866367, "grad_norm": 0.02332184463739395, "learning_rate": 1.1398557804389517e-06, "loss": 0.0008, "num_input_tokens_seen": 70335912, "step": 104340 }, { "epoch": 2.549165709818484, "grad_norm": 0.01457999274134636, "learning_rate": 1.1397713404491503e-06, "loss": 0.0751, "num_input_tokens_seen": 70339624, "step": 104345 }, { "epoch": 2.549287860650331, "grad_norm": 0.06870204210281372, "learning_rate": 1.139686899442894e-06, "loss": 0.0365, "num_input_tokens_seen": 70342568, "step": 104350 }, { "epoch": 2.5494100114821783, "grad_norm": 0.521581768989563, "learning_rate": 1.139602457420797e-06, "loss": 0.0006, "num_input_tokens_seen": 70345896, "step": 104355 }, { "epoch": 2.5495321623140255, "grad_norm": 0.026709794998168945, "learning_rate": 1.1395180143834734e-06, "loss": 0.0002, "num_input_tokens_seen": 70349096, "step": 104360 }, { "epoch": 2.5496543131458727, "grad_norm": 0.03073815070092678, "learning_rate": 1.139433570331537e-06, "loss": 0.0394, "num_input_tokens_seen": 70352616, "step": 104365 }, { "epoch": 2.54977646397772, "grad_norm": 0.01742694526910782, "learning_rate": 1.1393491252656025e-06, "loss": 0.0001, "num_input_tokens_seen": 70355944, "step": 104370 }, { "epoch": 2.5498986148095666, "grad_norm": 38.13505935668945, "learning_rate": 1.1392646791862836e-06, "loss": 0.0468, "num_input_tokens_seen": 70359848, "step": 104375 }, { "epoch": 2.5500207656414142, "grad_norm": 0.018928751349449158, "learning_rate": 1.1391802320941946e-06, "loss": 0.0005, "num_input_tokens_seen": 70362984, "step": 104380 }, { "epoch": 2.550142916473261, "grad_norm": 0.2556987404823303, "learning_rate": 1.1390957839899495e-06, "loss": 0.1049, "num_input_tokens_seen": 70366184, "step": 104385 }, { "epoch": 2.5502650673051086, "grad_norm": 0.038296084851026535, "learning_rate": 1.1390113348741624e-06, "loss": 0.0002, "num_input_tokens_seen": 70369192, "step": 104390 }, { "epoch": 2.5503872181369553, "grad_norm": 0.7614354491233826, "learning_rate": 1.1389268847474476e-06, "loss": 0.0037, "num_input_tokens_seen": 70372328, "step": 104395 }, { "epoch": 2.5505093689688025, "grad_norm": 1.100406527519226, "learning_rate": 1.138842433610419e-06, "loss": 0.0006, "num_input_tokens_seen": 70375400, "step": 104400 }, { "epoch": 2.5506315198006497, "grad_norm": 0.14379285275936127, "learning_rate": 1.1387579814636908e-06, "loss": 0.1083, "num_input_tokens_seen": 70378600, "step": 104405 }, { "epoch": 2.550753670632497, "grad_norm": 14.693830490112305, "learning_rate": 1.1386735283078775e-06, "loss": 0.0582, "num_input_tokens_seen": 70381608, "step": 104410 }, { "epoch": 2.550875821464344, "grad_norm": 0.05920464172959328, "learning_rate": 1.1385890741435926e-06, "loss": 0.0683, "num_input_tokens_seen": 70384552, "step": 104415 }, { "epoch": 2.5509979722961913, "grad_norm": 32.26267623901367, "learning_rate": 1.1385046189714509e-06, "loss": 0.0298, "num_input_tokens_seen": 70388072, "step": 104420 }, { "epoch": 2.5511201231280385, "grad_norm": 0.010261290706694126, "learning_rate": 1.1384201627920663e-06, "loss": 0.0002, "num_input_tokens_seen": 70391784, "step": 104425 }, { "epoch": 2.5512422739598857, "grad_norm": 20.50096893310547, "learning_rate": 1.1383357056060531e-06, "loss": 0.0701, "num_input_tokens_seen": 70395880, "step": 104430 }, { "epoch": 2.551364424791733, "grad_norm": 11.664816856384277, "learning_rate": 1.1382512474140255e-06, "loss": 0.1085, "num_input_tokens_seen": 70399272, "step": 104435 }, { "epoch": 2.55148657562358, "grad_norm": 78.14788055419922, "learning_rate": 1.1381667882165977e-06, "loss": 0.0453, "num_input_tokens_seen": 70402472, "step": 104440 }, { "epoch": 2.5516087264554272, "grad_norm": 0.008329739794135094, "learning_rate": 1.138082328014384e-06, "loss": 0.0013, "num_input_tokens_seen": 70405288, "step": 104445 }, { "epoch": 2.5517308772872744, "grad_norm": 0.9223102331161499, "learning_rate": 1.137997866807998e-06, "loss": 0.0005, "num_input_tokens_seen": 70408488, "step": 104450 }, { "epoch": 2.5518530281191216, "grad_norm": 0.051211223006248474, "learning_rate": 1.1379134045980545e-06, "loss": 0.0669, "num_input_tokens_seen": 70412072, "step": 104455 }, { "epoch": 2.551975178950969, "grad_norm": 0.23331056535243988, "learning_rate": 1.1378289413851677e-06, "loss": 0.0398, "num_input_tokens_seen": 70415272, "step": 104460 }, { "epoch": 2.552097329782816, "grad_norm": 27.745243072509766, "learning_rate": 1.1377444771699519e-06, "loss": 0.0236, "num_input_tokens_seen": 70418856, "step": 104465 }, { "epoch": 2.5522194806146627, "grad_norm": 4.048886775970459, "learning_rate": 1.1376600119530211e-06, "loss": 0.058, "num_input_tokens_seen": 70421992, "step": 104470 }, { "epoch": 2.5523416314465104, "grad_norm": 0.04298697039484978, "learning_rate": 1.1375755457349896e-06, "loss": 0.0004, "num_input_tokens_seen": 70425320, "step": 104475 }, { "epoch": 2.552463782278357, "grad_norm": 0.17285777628421783, "learning_rate": 1.1374910785164717e-06, "loss": 0.0008, "num_input_tokens_seen": 70428584, "step": 104480 }, { "epoch": 2.5525859331102043, "grad_norm": 0.2123318761587143, "learning_rate": 1.1374066102980819e-06, "loss": 0.0002, "num_input_tokens_seen": 70431592, "step": 104485 }, { "epoch": 2.5527080839420515, "grad_norm": 34.717105865478516, "learning_rate": 1.1373221410804343e-06, "loss": 0.0543, "num_input_tokens_seen": 70435048, "step": 104490 }, { "epoch": 2.5528302347738987, "grad_norm": 0.024648137390613556, "learning_rate": 1.1372376708641432e-06, "loss": 0.0001, "num_input_tokens_seen": 70438056, "step": 104495 }, { "epoch": 2.552952385605746, "grad_norm": 0.07736524939537048, "learning_rate": 1.1371531996498226e-06, "loss": 0.0525, "num_input_tokens_seen": 70441448, "step": 104500 }, { "epoch": 2.553074536437593, "grad_norm": 0.004976922646164894, "learning_rate": 1.137068727438087e-06, "loss": 0.0002, "num_input_tokens_seen": 70445160, "step": 104505 }, { "epoch": 2.5531966872694403, "grad_norm": 0.010621704161167145, "learning_rate": 1.136984254229551e-06, "loss": 0.1141, "num_input_tokens_seen": 70448488, "step": 104510 }, { "epoch": 2.5533188381012875, "grad_norm": 23.055747985839844, "learning_rate": 1.1368997800248284e-06, "loss": 0.0478, "num_input_tokens_seen": 70451496, "step": 104515 }, { "epoch": 2.5534409889331346, "grad_norm": 0.020686976611614227, "learning_rate": 1.1368153048245337e-06, "loss": 0.0003, "num_input_tokens_seen": 70455080, "step": 104520 }, { "epoch": 2.553563139764982, "grad_norm": 0.19006820023059845, "learning_rate": 1.1367308286292816e-06, "loss": 0.0002, "num_input_tokens_seen": 70458152, "step": 104525 }, { "epoch": 2.553685290596829, "grad_norm": 0.07241163402795792, "learning_rate": 1.136646351439686e-06, "loss": 0.0514, "num_input_tokens_seen": 70461352, "step": 104530 }, { "epoch": 2.553807441428676, "grad_norm": 0.053362783044576645, "learning_rate": 1.1365618732563616e-06, "loss": 0.0001, "num_input_tokens_seen": 70464744, "step": 104535 }, { "epoch": 2.5539295922605234, "grad_norm": 0.0271244365721941, "learning_rate": 1.1364773940799222e-06, "loss": 0.0001, "num_input_tokens_seen": 70468712, "step": 104540 }, { "epoch": 2.5540517430923706, "grad_norm": 0.0525955893099308, "learning_rate": 1.1363929139109827e-06, "loss": 0.0677, "num_input_tokens_seen": 70472040, "step": 104545 }, { "epoch": 2.554173893924218, "grad_norm": 0.015901561826467514, "learning_rate": 1.1363084327501573e-06, "loss": 0.0709, "num_input_tokens_seen": 70475560, "step": 104550 }, { "epoch": 2.5542960447560645, "grad_norm": 22.091421127319336, "learning_rate": 1.1362239505980602e-06, "loss": 0.0426, "num_input_tokens_seen": 70478696, "step": 104555 }, { "epoch": 2.554418195587912, "grad_norm": 0.14539512991905212, "learning_rate": 1.1361394674553058e-06, "loss": 0.061, "num_input_tokens_seen": 70481960, "step": 104560 }, { "epoch": 2.554540346419759, "grad_norm": 15.404756546020508, "learning_rate": 1.1360549833225087e-06, "loss": 0.0904, "num_input_tokens_seen": 70485480, "step": 104565 }, { "epoch": 2.5546624972516065, "grad_norm": 13.381711959838867, "learning_rate": 1.1359704982002832e-06, "loss": 0.0503, "num_input_tokens_seen": 70488872, "step": 104570 }, { "epoch": 2.5547846480834533, "grad_norm": 0.0017305332003161311, "learning_rate": 1.1358860120892433e-06, "loss": 0.0002, "num_input_tokens_seen": 70492264, "step": 104575 }, { "epoch": 2.5549067989153005, "grad_norm": 0.5444920063018799, "learning_rate": 1.135801524990004e-06, "loss": 0.0004, "num_input_tokens_seen": 70495912, "step": 104580 }, { "epoch": 2.5550289497471477, "grad_norm": 0.06043120473623276, "learning_rate": 1.1357170369031797e-06, "loss": 0.039, "num_input_tokens_seen": 70499240, "step": 104585 }, { "epoch": 2.555151100578995, "grad_norm": 0.03580227494239807, "learning_rate": 1.1356325478293844e-06, "loss": 0.0529, "num_input_tokens_seen": 70502376, "step": 104590 }, { "epoch": 2.555273251410842, "grad_norm": 0.015926828607916832, "learning_rate": 1.135548057769233e-06, "loss": 0.0002, "num_input_tokens_seen": 70505960, "step": 104595 }, { "epoch": 2.5553954022426892, "grad_norm": 12.819072723388672, "learning_rate": 1.1354635667233394e-06, "loss": 0.088, "num_input_tokens_seen": 70509224, "step": 104600 }, { "epoch": 2.5555175530745364, "grad_norm": 0.046196166425943375, "learning_rate": 1.1353790746923182e-06, "loss": 0.0003, "num_input_tokens_seen": 70512808, "step": 104605 }, { "epoch": 2.5556397039063836, "grad_norm": 0.1168728843331337, "learning_rate": 1.1352945816767843e-06, "loss": 0.0002, "num_input_tokens_seen": 70516072, "step": 104610 }, { "epoch": 2.555761854738231, "grad_norm": 0.05582430958747864, "learning_rate": 1.1352100876773515e-06, "loss": 0.0295, "num_input_tokens_seen": 70519784, "step": 104615 }, { "epoch": 2.555884005570078, "grad_norm": 91.32950592041016, "learning_rate": 1.1351255926946348e-06, "loss": 0.0026, "num_input_tokens_seen": 70523112, "step": 104620 }, { "epoch": 2.556006156401925, "grad_norm": 0.06987828761339188, "learning_rate": 1.1350410967292483e-06, "loss": 0.0491, "num_input_tokens_seen": 70526568, "step": 104625 }, { "epoch": 2.5561283072337724, "grad_norm": 0.049678053706884384, "learning_rate": 1.1349565997818067e-06, "loss": 0.0004, "num_input_tokens_seen": 70529832, "step": 104630 }, { "epoch": 2.5562504580656196, "grad_norm": 0.27473777532577515, "learning_rate": 1.1348721018529243e-06, "loss": 0.0526, "num_input_tokens_seen": 70532904, "step": 104635 }, { "epoch": 2.5563726088974663, "grad_norm": 0.16689088940620422, "learning_rate": 1.134787602943216e-06, "loss": 0.0003, "num_input_tokens_seen": 70535656, "step": 104640 }, { "epoch": 2.556494759729314, "grad_norm": 0.8218722939491272, "learning_rate": 1.1347031030532956e-06, "loss": 0.0007, "num_input_tokens_seen": 70538728, "step": 104645 }, { "epoch": 2.5566169105611607, "grad_norm": 0.009063877165317535, "learning_rate": 1.1346186021837782e-06, "loss": 0.0001, "num_input_tokens_seen": 70542440, "step": 104650 }, { "epoch": 2.5567390613930083, "grad_norm": 0.17698997259140015, "learning_rate": 1.1345341003352782e-06, "loss": 0.1421, "num_input_tokens_seen": 70546152, "step": 104655 }, { "epoch": 2.556861212224855, "grad_norm": 0.05905401334166527, "learning_rate": 1.1344495975084098e-06, "loss": 0.0421, "num_input_tokens_seen": 70549672, "step": 104660 }, { "epoch": 2.5569833630567023, "grad_norm": 0.00954660214483738, "learning_rate": 1.134365093703788e-06, "loss": 0.0003, "num_input_tokens_seen": 70553768, "step": 104665 }, { "epoch": 2.5571055138885495, "grad_norm": 0.21002456545829773, "learning_rate": 1.1342805889220269e-06, "loss": 0.0027, "num_input_tokens_seen": 70557160, "step": 104670 }, { "epoch": 2.5572276647203966, "grad_norm": 19.2589168548584, "learning_rate": 1.1341960831637414e-06, "loss": 0.0029, "num_input_tokens_seen": 70560360, "step": 104675 }, { "epoch": 2.557349815552244, "grad_norm": 0.11573576182126999, "learning_rate": 1.1341115764295458e-06, "loss": 0.0108, "num_input_tokens_seen": 70563880, "step": 104680 }, { "epoch": 2.557471966384091, "grad_norm": 0.0027204095385968685, "learning_rate": 1.1340270687200547e-06, "loss": 0.0001, "num_input_tokens_seen": 70567400, "step": 104685 }, { "epoch": 2.557594117215938, "grad_norm": 1.3431215286254883, "learning_rate": 1.1339425600358827e-06, "loss": 0.0127, "num_input_tokens_seen": 70570600, "step": 104690 }, { "epoch": 2.5577162680477854, "grad_norm": 0.07056227326393127, "learning_rate": 1.1338580503776445e-06, "loss": 0.0003, "num_input_tokens_seen": 70573480, "step": 104695 }, { "epoch": 2.5578384188796326, "grad_norm": 0.015122072771191597, "learning_rate": 1.1337735397459547e-06, "loss": 0.0947, "num_input_tokens_seen": 70576552, "step": 104700 }, { "epoch": 2.55796056971148, "grad_norm": 0.0022590847220271826, "learning_rate": 1.1336890281414275e-06, "loss": 0.0, "num_input_tokens_seen": 70579624, "step": 104705 }, { "epoch": 2.558082720543327, "grad_norm": 0.3419479727745056, "learning_rate": 1.133604515564678e-06, "loss": 0.0491, "num_input_tokens_seen": 70582760, "step": 104710 }, { "epoch": 2.558204871375174, "grad_norm": 0.024046342819929123, "learning_rate": 1.13352000201632e-06, "loss": 0.0449, "num_input_tokens_seen": 70585832, "step": 104715 }, { "epoch": 2.5583270222070214, "grad_norm": 0.03014945797622204, "learning_rate": 1.133435487496969e-06, "loss": 0.0007, "num_input_tokens_seen": 70589096, "step": 104720 }, { "epoch": 2.5584491730388685, "grad_norm": 0.165313258767128, "learning_rate": 1.1333509720072392e-06, "loss": 0.0004, "num_input_tokens_seen": 70592872, "step": 104725 }, { "epoch": 2.5585713238707157, "grad_norm": 0.10299237817525864, "learning_rate": 1.133266455547745e-06, "loss": 0.0002, "num_input_tokens_seen": 70596584, "step": 104730 }, { "epoch": 2.5586934747025625, "grad_norm": 0.0059926919639110565, "learning_rate": 1.1331819381191016e-06, "loss": 0.0001, "num_input_tokens_seen": 70599848, "step": 104735 }, { "epoch": 2.55881562553441, "grad_norm": 36.820457458496094, "learning_rate": 1.1330974197219233e-06, "loss": 0.1445, "num_input_tokens_seen": 70602856, "step": 104740 }, { "epoch": 2.558937776366257, "grad_norm": 0.5047180652618408, "learning_rate": 1.1330129003568247e-06, "loss": 0.0707, "num_input_tokens_seen": 70606312, "step": 104745 }, { "epoch": 2.5590599271981045, "grad_norm": 0.057552579790353775, "learning_rate": 1.1329283800244206e-06, "loss": 0.0002, "num_input_tokens_seen": 70609704, "step": 104750 }, { "epoch": 2.5591820780299512, "grad_norm": 0.008029787801206112, "learning_rate": 1.1328438587253258e-06, "loss": 0.0465, "num_input_tokens_seen": 70613032, "step": 104755 }, { "epoch": 2.5593042288617984, "grad_norm": 0.019414668902754784, "learning_rate": 1.132759336460154e-06, "loss": 0.0001, "num_input_tokens_seen": 70616232, "step": 104760 }, { "epoch": 2.5594263796936456, "grad_norm": 0.08621279895305634, "learning_rate": 1.1326748132295211e-06, "loss": 0.0001, "num_input_tokens_seen": 70619432, "step": 104765 }, { "epoch": 2.559548530525493, "grad_norm": 0.02141975425183773, "learning_rate": 1.1325902890340414e-06, "loss": 0.0002, "num_input_tokens_seen": 70622824, "step": 104770 }, { "epoch": 2.55967068135734, "grad_norm": 75.61103820800781, "learning_rate": 1.1325057638743292e-06, "loss": 0.0308, "num_input_tokens_seen": 70626024, "step": 104775 }, { "epoch": 2.559792832189187, "grad_norm": 0.11365992575883865, "learning_rate": 1.1324212377509993e-06, "loss": 0.0441, "num_input_tokens_seen": 70630120, "step": 104780 }, { "epoch": 2.5599149830210344, "grad_norm": 0.05007657781243324, "learning_rate": 1.132336710664667e-06, "loss": 0.0148, "num_input_tokens_seen": 70633640, "step": 104785 }, { "epoch": 2.5600371338528816, "grad_norm": 0.06095591560006142, "learning_rate": 1.1322521826159464e-06, "loss": 0.0002, "num_input_tokens_seen": 70637160, "step": 104790 }, { "epoch": 2.5601592846847288, "grad_norm": 0.04456912353634834, "learning_rate": 1.1321676536054522e-06, "loss": 0.2187, "num_input_tokens_seen": 70640168, "step": 104795 }, { "epoch": 2.560281435516576, "grad_norm": 41.895538330078125, "learning_rate": 1.1320831236337994e-06, "loss": 0.0387, "num_input_tokens_seen": 70643560, "step": 104800 }, { "epoch": 2.560403586348423, "grad_norm": 0.01936202123761177, "learning_rate": 1.1319985927016026e-06, "loss": 0.0059, "num_input_tokens_seen": 70647464, "step": 104805 }, { "epoch": 2.5605257371802703, "grad_norm": 0.4608546495437622, "learning_rate": 1.1319140608094762e-06, "loss": 0.0004, "num_input_tokens_seen": 70650728, "step": 104810 }, { "epoch": 2.5606478880121175, "grad_norm": 0.014622372575104237, "learning_rate": 1.1318295279580357e-06, "loss": 0.0838, "num_input_tokens_seen": 70653544, "step": 104815 }, { "epoch": 2.5607700388439643, "grad_norm": 31.645605087280273, "learning_rate": 1.1317449941478952e-06, "loss": 0.0363, "num_input_tokens_seen": 70656552, "step": 104820 }, { "epoch": 2.560892189675812, "grad_norm": 0.06011528894305229, "learning_rate": 1.1316604593796695e-06, "loss": 0.0003, "num_input_tokens_seen": 70659560, "step": 104825 }, { "epoch": 2.5610143405076586, "grad_norm": 0.036927856504917145, "learning_rate": 1.1315759236539737e-06, "loss": 0.0803, "num_input_tokens_seen": 70662824, "step": 104830 }, { "epoch": 2.5611364913395063, "grad_norm": 0.05356254428625107, "learning_rate": 1.1314913869714225e-06, "loss": 0.0002, "num_input_tokens_seen": 70666344, "step": 104835 }, { "epoch": 2.561258642171353, "grad_norm": 0.43842801451683044, "learning_rate": 1.1314068493326305e-06, "loss": 0.0007, "num_input_tokens_seen": 70669672, "step": 104840 }, { "epoch": 2.5613807930032, "grad_norm": 0.03688225895166397, "learning_rate": 1.1313223107382124e-06, "loss": 0.0153, "num_input_tokens_seen": 70673128, "step": 104845 }, { "epoch": 2.5615029438350474, "grad_norm": 0.011995815671980381, "learning_rate": 1.1312377711887835e-06, "loss": 0.013, "num_input_tokens_seen": 70676776, "step": 104850 }, { "epoch": 2.5616250946668946, "grad_norm": 23.376665115356445, "learning_rate": 1.1311532306849579e-06, "loss": 0.0668, "num_input_tokens_seen": 70679976, "step": 104855 }, { "epoch": 2.5617472454987418, "grad_norm": 0.09116607159376144, "learning_rate": 1.131068689227351e-06, "loss": 0.0443, "num_input_tokens_seen": 70683240, "step": 104860 }, { "epoch": 2.561869396330589, "grad_norm": 0.021630791947245598, "learning_rate": 1.130984146816577e-06, "loss": 0.0001, "num_input_tokens_seen": 70687208, "step": 104865 }, { "epoch": 2.561991547162436, "grad_norm": 0.012038863264024258, "learning_rate": 1.1308996034532511e-06, "loss": 0.0, "num_input_tokens_seen": 70690472, "step": 104870 }, { "epoch": 2.5621136979942833, "grad_norm": 0.007038927637040615, "learning_rate": 1.1308150591379884e-06, "loss": 0.0573, "num_input_tokens_seen": 70693864, "step": 104875 }, { "epoch": 2.5622358488261305, "grad_norm": 0.16878843307495117, "learning_rate": 1.1307305138714032e-06, "loss": 0.0002, "num_input_tokens_seen": 70697448, "step": 104880 }, { "epoch": 2.5623579996579777, "grad_norm": 0.047601111233234406, "learning_rate": 1.1306459676541104e-06, "loss": 0.0601, "num_input_tokens_seen": 70700968, "step": 104885 }, { "epoch": 2.562480150489825, "grad_norm": 0.4622974395751953, "learning_rate": 1.130561420486725e-06, "loss": 0.0004, "num_input_tokens_seen": 70704488, "step": 104890 }, { "epoch": 2.562602301321672, "grad_norm": 1.5819555521011353, "learning_rate": 1.1304768723698622e-06, "loss": 0.0356, "num_input_tokens_seen": 70708008, "step": 104895 }, { "epoch": 2.5627244521535193, "grad_norm": 0.32831981778144836, "learning_rate": 1.130392323304136e-06, "loss": 0.0384, "num_input_tokens_seen": 70711208, "step": 104900 }, { "epoch": 2.5628466029853665, "grad_norm": 0.12604233622550964, "learning_rate": 1.1303077732901625e-06, "loss": 0.0003, "num_input_tokens_seen": 70714536, "step": 104905 }, { "epoch": 2.5629687538172137, "grad_norm": 0.3454369008541107, "learning_rate": 1.1302232223285554e-06, "loss": 0.0339, "num_input_tokens_seen": 70717800, "step": 104910 }, { "epoch": 2.5630909046490604, "grad_norm": 27.095327377319336, "learning_rate": 1.1301386704199298e-06, "loss": 0.0424, "num_input_tokens_seen": 70721320, "step": 104915 }, { "epoch": 2.563213055480908, "grad_norm": 0.38351860642433167, "learning_rate": 1.1300541175649008e-06, "loss": 0.1076, "num_input_tokens_seen": 70724328, "step": 104920 }, { "epoch": 2.563335206312755, "grad_norm": 40.32852554321289, "learning_rate": 1.1299695637640834e-06, "loss": 0.1206, "num_input_tokens_seen": 70727592, "step": 104925 }, { "epoch": 2.563457357144602, "grad_norm": 0.0024320860393345356, "learning_rate": 1.1298850090180923e-06, "loss": 0.0001, "num_input_tokens_seen": 70730984, "step": 104930 }, { "epoch": 2.563579507976449, "grad_norm": 0.058492377400398254, "learning_rate": 1.1298004533275426e-06, "loss": 0.0515, "num_input_tokens_seen": 70734120, "step": 104935 }, { "epoch": 2.5637016588082964, "grad_norm": 0.0694495216012001, "learning_rate": 1.1297158966930492e-06, "loss": 0.061, "num_input_tokens_seen": 70737768, "step": 104940 }, { "epoch": 2.5638238096401436, "grad_norm": 0.07813955098390579, "learning_rate": 1.1296313391152266e-06, "loss": 0.0008, "num_input_tokens_seen": 70741288, "step": 104945 }, { "epoch": 2.5639459604719907, "grad_norm": 0.16016274690628052, "learning_rate": 1.1295467805946902e-06, "loss": 0.0874, "num_input_tokens_seen": 70744552, "step": 104950 }, { "epoch": 2.564068111303838, "grad_norm": 0.0038009141571819782, "learning_rate": 1.1294622211320548e-06, "loss": 0.0529, "num_input_tokens_seen": 70748136, "step": 104955 }, { "epoch": 2.564190262135685, "grad_norm": 0.00716787576675415, "learning_rate": 1.1293776607279352e-06, "loss": 0.0002, "num_input_tokens_seen": 70751464, "step": 104960 }, { "epoch": 2.5643124129675323, "grad_norm": 43.62774658203125, "learning_rate": 1.1292930993829465e-06, "loss": 0.1121, "num_input_tokens_seen": 70754792, "step": 104965 }, { "epoch": 2.5644345637993795, "grad_norm": 317.7919921875, "learning_rate": 1.1292085370977036e-06, "loss": 0.0725, "num_input_tokens_seen": 70757928, "step": 104970 }, { "epoch": 2.5645567146312267, "grad_norm": 26.290687561035156, "learning_rate": 1.1291239738728214e-06, "loss": 0.0597, "num_input_tokens_seen": 70760808, "step": 104975 }, { "epoch": 2.564678865463074, "grad_norm": 30.182716369628906, "learning_rate": 1.129039409708915e-06, "loss": 0.063, "num_input_tokens_seen": 70764264, "step": 104980 }, { "epoch": 2.564801016294921, "grad_norm": 0.4601612687110901, "learning_rate": 1.1289548446065993e-06, "loss": 0.125, "num_input_tokens_seen": 70767656, "step": 104985 }, { "epoch": 2.5649231671267683, "grad_norm": 0.022800132632255554, "learning_rate": 1.1288702785664894e-06, "loss": 0.0002, "num_input_tokens_seen": 70770920, "step": 104990 }, { "epoch": 2.5650453179586155, "grad_norm": 0.060902874916791916, "learning_rate": 1.1287857115892002e-06, "loss": 0.0001, "num_input_tokens_seen": 70774312, "step": 104995 }, { "epoch": 2.565167468790462, "grad_norm": 65.98351287841797, "learning_rate": 1.1287011436753465e-06, "loss": 0.0115, "num_input_tokens_seen": 70777320, "step": 105000 }, { "epoch": 2.56528961962231, "grad_norm": 0.09278260916471481, "learning_rate": 1.1286165748255434e-06, "loss": 0.0479, "num_input_tokens_seen": 70780200, "step": 105005 }, { "epoch": 2.5654117704541566, "grad_norm": 0.008261207491159439, "learning_rate": 1.128532005040406e-06, "loss": 0.0056, "num_input_tokens_seen": 70783656, "step": 105010 }, { "epoch": 2.565533921286004, "grad_norm": 0.0054551963694393635, "learning_rate": 1.1284474343205494e-06, "loss": 0.0001, "num_input_tokens_seen": 70787368, "step": 105015 }, { "epoch": 2.565656072117851, "grad_norm": 0.007644488476216793, "learning_rate": 1.1283628626665887e-06, "loss": 0.0002, "num_input_tokens_seen": 70790696, "step": 105020 }, { "epoch": 2.565778222949698, "grad_norm": 0.08200935274362564, "learning_rate": 1.1282782900791384e-06, "loss": 0.0017, "num_input_tokens_seen": 70794024, "step": 105025 }, { "epoch": 2.5659003737815453, "grad_norm": 0.15629921853542328, "learning_rate": 1.128193716558814e-06, "loss": 0.0585, "num_input_tokens_seen": 70797160, "step": 105030 }, { "epoch": 2.5660225246133925, "grad_norm": 11.52754020690918, "learning_rate": 1.1281091421062306e-06, "loss": 0.0408, "num_input_tokens_seen": 70800616, "step": 105035 }, { "epoch": 2.5661446754452397, "grad_norm": 0.008200634270906448, "learning_rate": 1.1280245667220024e-06, "loss": 0.1486, "num_input_tokens_seen": 70803880, "step": 105040 }, { "epoch": 2.566266826277087, "grad_norm": 0.660971999168396, "learning_rate": 1.1279399904067457e-06, "loss": 0.0004, "num_input_tokens_seen": 70807272, "step": 105045 }, { "epoch": 2.566388977108934, "grad_norm": 0.07039150595664978, "learning_rate": 1.127855413161075e-06, "loss": 0.0849, "num_input_tokens_seen": 70810600, "step": 105050 }, { "epoch": 2.5665111279407813, "grad_norm": 0.04144516959786415, "learning_rate": 1.1277708349856053e-06, "loss": 0.0003, "num_input_tokens_seen": 70814312, "step": 105055 }, { "epoch": 2.5666332787726285, "grad_norm": 0.12249460071325302, "learning_rate": 1.1276862558809518e-06, "loss": 0.0004, "num_input_tokens_seen": 70817256, "step": 105060 }, { "epoch": 2.5667554296044757, "grad_norm": 0.20821748673915863, "learning_rate": 1.127601675847729e-06, "loss": 0.1226, "num_input_tokens_seen": 70820328, "step": 105065 }, { "epoch": 2.566877580436323, "grad_norm": 0.019365403801202774, "learning_rate": 1.1275170948865531e-06, "loss": 0.0547, "num_input_tokens_seen": 70823528, "step": 105070 }, { "epoch": 2.56699973126817, "grad_norm": 0.003225482301786542, "learning_rate": 1.127432512998038e-06, "loss": 0.0004, "num_input_tokens_seen": 70826856, "step": 105075 }, { "epoch": 2.5671218821000172, "grad_norm": 0.009520246647298336, "learning_rate": 1.1273479301827999e-06, "loss": 0.0396, "num_input_tokens_seen": 70830952, "step": 105080 }, { "epoch": 2.5672440329318644, "grad_norm": 0.09223540872335434, "learning_rate": 1.1272633464414533e-06, "loss": 0.0001, "num_input_tokens_seen": 70834216, "step": 105085 }, { "epoch": 2.5673661837637116, "grad_norm": 0.018998224288225174, "learning_rate": 1.127178761774613e-06, "loss": 0.0003, "num_input_tokens_seen": 70837224, "step": 105090 }, { "epoch": 2.5674883345955584, "grad_norm": 0.02200205810368061, "learning_rate": 1.127094176182895e-06, "loss": 0.1041, "num_input_tokens_seen": 70840360, "step": 105095 }, { "epoch": 2.567610485427406, "grad_norm": 0.1800374537706375, "learning_rate": 1.1270095896669138e-06, "loss": 0.0004, "num_input_tokens_seen": 70844008, "step": 105100 }, { "epoch": 2.5677326362592527, "grad_norm": 0.04483243450522423, "learning_rate": 1.126925002227285e-06, "loss": 0.0003, "num_input_tokens_seen": 70847528, "step": 105105 }, { "epoch": 2.5678547870911, "grad_norm": 0.025489473715424538, "learning_rate": 1.126840413864623e-06, "loss": 0.0001, "num_input_tokens_seen": 70850984, "step": 105110 }, { "epoch": 2.567976937922947, "grad_norm": 0.08616407215595245, "learning_rate": 1.1267558245795438e-06, "loss": 0.0006, "num_input_tokens_seen": 70854824, "step": 105115 }, { "epoch": 2.5680990887547943, "grad_norm": 0.578325092792511, "learning_rate": 1.1266712343726617e-06, "loss": 0.0005, "num_input_tokens_seen": 70858216, "step": 105120 }, { "epoch": 2.5682212395866415, "grad_norm": 0.011274863965809345, "learning_rate": 1.1265866432445925e-06, "loss": 0.0001, "num_input_tokens_seen": 70861544, "step": 105125 }, { "epoch": 2.5683433904184887, "grad_norm": 0.1854897290468216, "learning_rate": 1.1265020511959514e-06, "loss": 0.0753, "num_input_tokens_seen": 70864616, "step": 105130 }, { "epoch": 2.568465541250336, "grad_norm": 0.019263846799731255, "learning_rate": 1.126417458227353e-06, "loss": 0.0005, "num_input_tokens_seen": 70867496, "step": 105135 }, { "epoch": 2.568587692082183, "grad_norm": 0.003985455725342035, "learning_rate": 1.126332864339413e-06, "loss": 0.034, "num_input_tokens_seen": 70871272, "step": 105140 }, { "epoch": 2.5687098429140303, "grad_norm": 0.02949185110628605, "learning_rate": 1.1262482695327464e-06, "loss": 0.0221, "num_input_tokens_seen": 70874600, "step": 105145 }, { "epoch": 2.5688319937458775, "grad_norm": 0.023490281775593758, "learning_rate": 1.1261636738079686e-06, "loss": 0.0683, "num_input_tokens_seen": 70878184, "step": 105150 }, { "epoch": 2.5689541445777246, "grad_norm": 0.09141631424427032, "learning_rate": 1.1260790771656944e-06, "loss": 0.1187, "num_input_tokens_seen": 70881640, "step": 105155 }, { "epoch": 2.569076295409572, "grad_norm": 0.2394016832113266, "learning_rate": 1.1259944796065392e-06, "loss": 0.0005, "num_input_tokens_seen": 70885160, "step": 105160 }, { "epoch": 2.569198446241419, "grad_norm": 0.5712342262268066, "learning_rate": 1.1259098811311184e-06, "loss": 0.0457, "num_input_tokens_seen": 70888360, "step": 105165 }, { "epoch": 2.569320597073266, "grad_norm": 0.007113562431186438, "learning_rate": 1.1258252817400472e-06, "loss": 0.0001, "num_input_tokens_seen": 70891176, "step": 105170 }, { "epoch": 2.5694427479051134, "grad_norm": 0.039674174040555954, "learning_rate": 1.1257406814339404e-06, "loss": 0.0005, "num_input_tokens_seen": 70895144, "step": 105175 }, { "epoch": 2.56956489873696, "grad_norm": 72.04270935058594, "learning_rate": 1.1256560802134138e-06, "loss": 0.0957, "num_input_tokens_seen": 70898728, "step": 105180 }, { "epoch": 2.569687049568808, "grad_norm": 0.0031505043152719736, "learning_rate": 1.1255714780790823e-06, "loss": 0.0003, "num_input_tokens_seen": 70901928, "step": 105185 }, { "epoch": 2.5698092004006545, "grad_norm": 0.030574841424822807, "learning_rate": 1.1254868750315613e-06, "loss": 0.0618, "num_input_tokens_seen": 70904872, "step": 105190 }, { "epoch": 2.569931351232502, "grad_norm": 0.0069647496566176414, "learning_rate": 1.1254022710714662e-06, "loss": 0.0002, "num_input_tokens_seen": 70908200, "step": 105195 }, { "epoch": 2.570053502064349, "grad_norm": 0.1449422985315323, "learning_rate": 1.1253176661994114e-06, "loss": 0.1128, "num_input_tokens_seen": 70910888, "step": 105200 }, { "epoch": 2.570175652896196, "grad_norm": 0.04498355835676193, "learning_rate": 1.1252330604160134e-06, "loss": 0.0522, "num_input_tokens_seen": 70913960, "step": 105205 }, { "epoch": 2.5702978037280433, "grad_norm": 0.03253661096096039, "learning_rate": 1.1251484537218867e-06, "loss": 0.0927, "num_input_tokens_seen": 70917032, "step": 105210 }, { "epoch": 2.5704199545598905, "grad_norm": 0.002849914599210024, "learning_rate": 1.125063846117647e-06, "loss": 0.0654, "num_input_tokens_seen": 70920616, "step": 105215 }, { "epoch": 2.5705421053917377, "grad_norm": 0.00351352128200233, "learning_rate": 1.124979237603909e-06, "loss": 0.0005, "num_input_tokens_seen": 70923688, "step": 105220 }, { "epoch": 2.570664256223585, "grad_norm": 27.33385467529297, "learning_rate": 1.1248946281812888e-06, "loss": 0.0007, "num_input_tokens_seen": 70926888, "step": 105225 }, { "epoch": 2.570786407055432, "grad_norm": 0.008452944457530975, "learning_rate": 1.1248100178504008e-06, "loss": 0.0345, "num_input_tokens_seen": 70930216, "step": 105230 }, { "epoch": 2.5709085578872792, "grad_norm": 0.19451171159744263, "learning_rate": 1.124725406611861e-06, "loss": 0.0003, "num_input_tokens_seen": 70933928, "step": 105235 }, { "epoch": 2.5710307087191264, "grad_norm": 0.013641073368489742, "learning_rate": 1.1246407944662846e-06, "loss": 0.0002, "num_input_tokens_seen": 70937768, "step": 105240 }, { "epoch": 2.5711528595509736, "grad_norm": 0.02136857435107231, "learning_rate": 1.1245561814142865e-06, "loss": 0.0001, "num_input_tokens_seen": 70940584, "step": 105245 }, { "epoch": 2.571275010382821, "grad_norm": 0.01668001152575016, "learning_rate": 1.1244715674564827e-06, "loss": 0.0001, "num_input_tokens_seen": 70943912, "step": 105250 }, { "epoch": 2.571397161214668, "grad_norm": 0.016919095069169998, "learning_rate": 1.1243869525934881e-06, "loss": 0.0001, "num_input_tokens_seen": 70947304, "step": 105255 }, { "epoch": 2.571519312046515, "grad_norm": 0.015760477632284164, "learning_rate": 1.124302336825918e-06, "loss": 0.0585, "num_input_tokens_seen": 70950440, "step": 105260 }, { "epoch": 2.571641462878362, "grad_norm": 0.012062730267643929, "learning_rate": 1.1242177201543881e-06, "loss": 0.0586, "num_input_tokens_seen": 70954024, "step": 105265 }, { "epoch": 2.5717636137102096, "grad_norm": 0.008802815340459347, "learning_rate": 1.1241331025795132e-06, "loss": 0.0002, "num_input_tokens_seen": 70957160, "step": 105270 }, { "epoch": 2.5718857645420563, "grad_norm": 0.14933550357818604, "learning_rate": 1.1240484841019094e-06, "loss": 0.0006, "num_input_tokens_seen": 70960680, "step": 105275 }, { "epoch": 2.572007915373904, "grad_norm": 0.2914406359195709, "learning_rate": 1.1239638647221915e-06, "loss": 0.0338, "num_input_tokens_seen": 70963944, "step": 105280 }, { "epoch": 2.5721300662057507, "grad_norm": 196.21656799316406, "learning_rate": 1.123879244440975e-06, "loss": 0.0267, "num_input_tokens_seen": 70967144, "step": 105285 }, { "epoch": 2.572252217037598, "grad_norm": 78.05303192138672, "learning_rate": 1.1237946232588749e-06, "loss": 0.0523, "num_input_tokens_seen": 70970728, "step": 105290 }, { "epoch": 2.572374367869445, "grad_norm": 0.009671011008322239, "learning_rate": 1.1237100011765074e-06, "loss": 0.097, "num_input_tokens_seen": 70974120, "step": 105295 }, { "epoch": 2.5724965187012923, "grad_norm": 0.0990491583943367, "learning_rate": 1.1236253781944873e-06, "loss": 0.0057, "num_input_tokens_seen": 70977576, "step": 105300 }, { "epoch": 2.5726186695331394, "grad_norm": 0.7363083958625793, "learning_rate": 1.1235407543134305e-06, "loss": 0.0995, "num_input_tokens_seen": 70980648, "step": 105305 }, { "epoch": 2.5727408203649866, "grad_norm": 0.08942437171936035, "learning_rate": 1.123456129533952e-06, "loss": 0.0181, "num_input_tokens_seen": 70984040, "step": 105310 }, { "epoch": 2.572862971196834, "grad_norm": 0.17924891412258148, "learning_rate": 1.1233715038566673e-06, "loss": 0.0529, "num_input_tokens_seen": 70987240, "step": 105315 }, { "epoch": 2.572985122028681, "grad_norm": 0.11793851107358932, "learning_rate": 1.123286877282192e-06, "loss": 0.0002, "num_input_tokens_seen": 70990760, "step": 105320 }, { "epoch": 2.573107272860528, "grad_norm": 13.390181541442871, "learning_rate": 1.123202249811141e-06, "loss": 0.0819, "num_input_tokens_seen": 70994024, "step": 105325 }, { "epoch": 2.5732294236923754, "grad_norm": 0.5596466064453125, "learning_rate": 1.1231176214441305e-06, "loss": 0.0822, "num_input_tokens_seen": 70997672, "step": 105330 }, { "epoch": 2.5733515745242226, "grad_norm": 0.20801320672035217, "learning_rate": 1.123032992181775e-06, "loss": 0.0558, "num_input_tokens_seen": 71001128, "step": 105335 }, { "epoch": 2.5734737253560698, "grad_norm": 0.025137195363640785, "learning_rate": 1.1229483620246912e-06, "loss": 0.035, "num_input_tokens_seen": 71004456, "step": 105340 }, { "epoch": 2.573595876187917, "grad_norm": 0.005038428120315075, "learning_rate": 1.1228637309734935e-06, "loss": 0.0001, "num_input_tokens_seen": 71008040, "step": 105345 }, { "epoch": 2.573718027019764, "grad_norm": 0.2574462592601776, "learning_rate": 1.1227790990287976e-06, "loss": 0.0002, "num_input_tokens_seen": 71011432, "step": 105350 }, { "epoch": 2.5738401778516113, "grad_norm": 0.0075719221495091915, "learning_rate": 1.122694466191219e-06, "loss": 0.0002, "num_input_tokens_seen": 71014504, "step": 105355 }, { "epoch": 2.573962328683458, "grad_norm": 0.2954552173614502, "learning_rate": 1.1226098324613733e-06, "loss": 0.0604, "num_input_tokens_seen": 71017384, "step": 105360 }, { "epoch": 2.5740844795153057, "grad_norm": 10.460375785827637, "learning_rate": 1.122525197839876e-06, "loss": 0.0862, "num_input_tokens_seen": 71021160, "step": 105365 }, { "epoch": 2.5742066303471525, "grad_norm": 0.015190809965133667, "learning_rate": 1.1224405623273427e-06, "loss": 0.0003, "num_input_tokens_seen": 71024296, "step": 105370 }, { "epoch": 2.574328781179, "grad_norm": 0.05254284292459488, "learning_rate": 1.1223559259243886e-06, "loss": 0.0003, "num_input_tokens_seen": 71027496, "step": 105375 }, { "epoch": 2.574450932010847, "grad_norm": 0.01987920142710209, "learning_rate": 1.1222712886316292e-06, "loss": 0.0031, "num_input_tokens_seen": 71030760, "step": 105380 }, { "epoch": 2.574573082842694, "grad_norm": 0.26836860179901123, "learning_rate": 1.12218665044968e-06, "loss": 0.0002, "num_input_tokens_seen": 71034472, "step": 105385 }, { "epoch": 2.5746952336745412, "grad_norm": 0.2163187563419342, "learning_rate": 1.1221020113791567e-06, "loss": 0.0005, "num_input_tokens_seen": 71037544, "step": 105390 }, { "epoch": 2.5748173845063884, "grad_norm": 0.0660480409860611, "learning_rate": 1.122017371420675e-06, "loss": 0.1522, "num_input_tokens_seen": 71041320, "step": 105395 }, { "epoch": 2.5749395353382356, "grad_norm": 0.11806187033653259, "learning_rate": 1.1219327305748499e-06, "loss": 0.0685, "num_input_tokens_seen": 71044328, "step": 105400 }, { "epoch": 2.575061686170083, "grad_norm": 0.2208234816789627, "learning_rate": 1.121848088842297e-06, "loss": 0.0003, "num_input_tokens_seen": 71047400, "step": 105405 }, { "epoch": 2.57518383700193, "grad_norm": 0.04226498678326607, "learning_rate": 1.1217634462236323e-06, "loss": 0.0553, "num_input_tokens_seen": 71050792, "step": 105410 }, { "epoch": 2.575305987833777, "grad_norm": 0.10588227212429047, "learning_rate": 1.121678802719471e-06, "loss": 0.0514, "num_input_tokens_seen": 71053928, "step": 105415 }, { "epoch": 2.5754281386656244, "grad_norm": 13.927465438842773, "learning_rate": 1.1215941583304288e-06, "loss": 0.0257, "num_input_tokens_seen": 71057384, "step": 105420 }, { "epoch": 2.5755502894974716, "grad_norm": 76.3477783203125, "learning_rate": 1.121509513057121e-06, "loss": 0.0033, "num_input_tokens_seen": 71060520, "step": 105425 }, { "epoch": 2.5756724403293187, "grad_norm": 0.009837713092565536, "learning_rate": 1.1214248669001635e-06, "loss": 0.0305, "num_input_tokens_seen": 71064104, "step": 105430 }, { "epoch": 2.575794591161166, "grad_norm": 0.00897394958883524, "learning_rate": 1.1213402198601715e-06, "loss": 0.0323, "num_input_tokens_seen": 71067112, "step": 105435 }, { "epoch": 2.575916741993013, "grad_norm": 0.0860351026058197, "learning_rate": 1.1212555719377612e-06, "loss": 0.0435, "num_input_tokens_seen": 71070376, "step": 105440 }, { "epoch": 2.57603889282486, "grad_norm": 0.22064711153507233, "learning_rate": 1.121170923133547e-06, "loss": 0.0008, "num_input_tokens_seen": 71073704, "step": 105445 }, { "epoch": 2.5761610436567075, "grad_norm": 0.2620203197002411, "learning_rate": 1.1210862734481459e-06, "loss": 0.0006, "num_input_tokens_seen": 71076840, "step": 105450 }, { "epoch": 2.5762831944885543, "grad_norm": 0.7787070870399475, "learning_rate": 1.1210016228821729e-06, "loss": 0.0011, "num_input_tokens_seen": 71080168, "step": 105455 }, { "epoch": 2.576405345320402, "grad_norm": 0.03237830102443695, "learning_rate": 1.1209169714362431e-06, "loss": 0.0003, "num_input_tokens_seen": 71083816, "step": 105460 }, { "epoch": 2.5765274961522486, "grad_norm": 0.1585754007101059, "learning_rate": 1.120832319110973e-06, "loss": 0.0002, "num_input_tokens_seen": 71087080, "step": 105465 }, { "epoch": 2.576649646984096, "grad_norm": 0.13227100670337677, "learning_rate": 1.1207476659069776e-06, "loss": 0.0426, "num_input_tokens_seen": 71090664, "step": 105470 }, { "epoch": 2.576771797815943, "grad_norm": 0.055049993097782135, "learning_rate": 1.1206630118248725e-06, "loss": 0.0008, "num_input_tokens_seen": 71094440, "step": 105475 }, { "epoch": 2.57689394864779, "grad_norm": 0.0033830543980002403, "learning_rate": 1.1205783568652738e-06, "loss": 0.0001, "num_input_tokens_seen": 71097576, "step": 105480 }, { "epoch": 2.5770160994796374, "grad_norm": 14.097530364990234, "learning_rate": 1.1204937010287968e-06, "loss": 0.0618, "num_input_tokens_seen": 71100840, "step": 105485 }, { "epoch": 2.5771382503114846, "grad_norm": 0.10420069098472595, "learning_rate": 1.1204090443160573e-06, "loss": 0.0001, "num_input_tokens_seen": 71104040, "step": 105490 }, { "epoch": 2.5772604011433318, "grad_norm": 0.02168843150138855, "learning_rate": 1.1203243867276706e-06, "loss": 0.0431, "num_input_tokens_seen": 71107624, "step": 105495 }, { "epoch": 2.577382551975179, "grad_norm": 20.862285614013672, "learning_rate": 1.1202397282642529e-06, "loss": 0.0615, "num_input_tokens_seen": 71111016, "step": 105500 }, { "epoch": 2.577504702807026, "grad_norm": 0.044624313712120056, "learning_rate": 1.1201550689264191e-06, "loss": 0.0249, "num_input_tokens_seen": 71114664, "step": 105505 }, { "epoch": 2.5776268536388733, "grad_norm": 0.06067600101232529, "learning_rate": 1.1200704087147854e-06, "loss": 0.0515, "num_input_tokens_seen": 71118248, "step": 105510 }, { "epoch": 2.5777490044707205, "grad_norm": 0.0106893889605999, "learning_rate": 1.1199857476299678e-06, "loss": 0.0328, "num_input_tokens_seen": 71121448, "step": 105515 }, { "epoch": 2.5778711553025677, "grad_norm": 0.02916247397661209, "learning_rate": 1.1199010856725813e-06, "loss": 0.038, "num_input_tokens_seen": 71125736, "step": 105520 }, { "epoch": 2.577993306134415, "grad_norm": 0.07611119747161865, "learning_rate": 1.119816422843242e-06, "loss": 0.0903, "num_input_tokens_seen": 71129064, "step": 105525 }, { "epoch": 2.578115456966262, "grad_norm": 0.0019288009498268366, "learning_rate": 1.1197317591425656e-06, "loss": 0.0002, "num_input_tokens_seen": 71132328, "step": 105530 }, { "epoch": 2.5782376077981093, "grad_norm": 0.0974397137761116, "learning_rate": 1.1196470945711674e-06, "loss": 0.0002, "num_input_tokens_seen": 71135720, "step": 105535 }, { "epoch": 2.578359758629956, "grad_norm": 0.07071943581104279, "learning_rate": 1.119562429129663e-06, "loss": 0.0004, "num_input_tokens_seen": 71138920, "step": 105540 }, { "epoch": 2.5784819094618037, "grad_norm": 0.02228919044137001, "learning_rate": 1.1194777628186686e-06, "loss": 0.0558, "num_input_tokens_seen": 71142184, "step": 105545 }, { "epoch": 2.5786040602936504, "grad_norm": 27.30817985534668, "learning_rate": 1.1193930956388004e-06, "loss": 0.1206, "num_input_tokens_seen": 71145768, "step": 105550 }, { "epoch": 2.5787262111254976, "grad_norm": 0.14151504635810852, "learning_rate": 1.1193084275906727e-06, "loss": 0.072, "num_input_tokens_seen": 71149032, "step": 105555 }, { "epoch": 2.578848361957345, "grad_norm": 0.0615425668656826, "learning_rate": 1.1192237586749025e-06, "loss": 0.0004, "num_input_tokens_seen": 71152232, "step": 105560 }, { "epoch": 2.578970512789192, "grad_norm": 0.12096342444419861, "learning_rate": 1.119139088892105e-06, "loss": 0.0338, "num_input_tokens_seen": 71155944, "step": 105565 }, { "epoch": 2.579092663621039, "grad_norm": 0.0027860745321959257, "learning_rate": 1.119054418242896e-06, "loss": 0.0276, "num_input_tokens_seen": 71159272, "step": 105570 }, { "epoch": 2.5792148144528864, "grad_norm": 0.016637323424220085, "learning_rate": 1.1189697467278911e-06, "loss": 0.0489, "num_input_tokens_seen": 71162664, "step": 105575 }, { "epoch": 2.5793369652847336, "grad_norm": 0.005513378884643316, "learning_rate": 1.1188850743477062e-06, "loss": 0.0843, "num_input_tokens_seen": 71166056, "step": 105580 }, { "epoch": 2.5794591161165807, "grad_norm": 11.760927200317383, "learning_rate": 1.1188004011029573e-06, "loss": 0.0413, "num_input_tokens_seen": 71170088, "step": 105585 }, { "epoch": 2.579581266948428, "grad_norm": 29.75730323791504, "learning_rate": 1.1187157269942598e-06, "loss": 0.0483, "num_input_tokens_seen": 71173480, "step": 105590 }, { "epoch": 2.579703417780275, "grad_norm": 0.4479432702064514, "learning_rate": 1.1186310520222297e-06, "loss": 0.1053, "num_input_tokens_seen": 71176936, "step": 105595 }, { "epoch": 2.5798255686121223, "grad_norm": 0.19967444241046906, "learning_rate": 1.1185463761874823e-06, "loss": 0.0391, "num_input_tokens_seen": 71180200, "step": 105600 }, { "epoch": 2.5799477194439695, "grad_norm": 0.19300290942192078, "learning_rate": 1.1184616994906341e-06, "loss": 0.0013, "num_input_tokens_seen": 71183400, "step": 105605 }, { "epoch": 2.5800698702758167, "grad_norm": 0.021904923021793365, "learning_rate": 1.1183770219323005e-06, "loss": 0.0414, "num_input_tokens_seen": 71187624, "step": 105610 }, { "epoch": 2.580192021107664, "grad_norm": 0.022521737962961197, "learning_rate": 1.1182923435130972e-06, "loss": 0.0002, "num_input_tokens_seen": 71191208, "step": 105615 }, { "epoch": 2.580314171939511, "grad_norm": 1.1650266647338867, "learning_rate": 1.1182076642336405e-06, "loss": 0.0011, "num_input_tokens_seen": 71194920, "step": 105620 }, { "epoch": 2.580436322771358, "grad_norm": 0.006862139794975519, "learning_rate": 1.1181229840945457e-06, "loss": 0.0513, "num_input_tokens_seen": 71197992, "step": 105625 }, { "epoch": 2.5805584736032054, "grad_norm": 0.03143366053700447, "learning_rate": 1.1180383030964288e-06, "loss": 0.06, "num_input_tokens_seen": 71200936, "step": 105630 }, { "epoch": 2.580680624435052, "grad_norm": 0.32460829615592957, "learning_rate": 1.1179536212399057e-06, "loss": 0.0589, "num_input_tokens_seen": 71204648, "step": 105635 }, { "epoch": 2.5808027752669, "grad_norm": 0.01299480814486742, "learning_rate": 1.1178689385255919e-06, "loss": 0.0012, "num_input_tokens_seen": 71208104, "step": 105640 }, { "epoch": 2.5809249260987466, "grad_norm": 0.007713967002928257, "learning_rate": 1.1177842549541036e-06, "loss": 0.0709, "num_input_tokens_seen": 71211368, "step": 105645 }, { "epoch": 2.5810470769305938, "grad_norm": 0.010506748221814632, "learning_rate": 1.1176995705260566e-06, "loss": 0.0003, "num_input_tokens_seen": 71214568, "step": 105650 }, { "epoch": 2.581169227762441, "grad_norm": 0.04141269251704216, "learning_rate": 1.1176148852420666e-06, "loss": 0.0699, "num_input_tokens_seen": 71217512, "step": 105655 }, { "epoch": 2.581291378594288, "grad_norm": 0.15972989797592163, "learning_rate": 1.1175301991027494e-06, "loss": 0.0004, "num_input_tokens_seen": 71221160, "step": 105660 }, { "epoch": 2.5814135294261353, "grad_norm": 16.147079467773438, "learning_rate": 1.117445512108721e-06, "loss": 0.1022, "num_input_tokens_seen": 71224488, "step": 105665 }, { "epoch": 2.5815356802579825, "grad_norm": 0.015271567739546299, "learning_rate": 1.1173608242605974e-06, "loss": 0.0003, "num_input_tokens_seen": 71227752, "step": 105670 }, { "epoch": 2.5816578310898297, "grad_norm": 0.5717945694923401, "learning_rate": 1.1172761355589941e-06, "loss": 0.0007, "num_input_tokens_seen": 71230952, "step": 105675 }, { "epoch": 2.581779981921677, "grad_norm": 0.03975324705243111, "learning_rate": 1.1171914460045278e-06, "loss": 0.0002, "num_input_tokens_seen": 71234280, "step": 105680 }, { "epoch": 2.581902132753524, "grad_norm": 0.23138143122196198, "learning_rate": 1.1171067555978133e-06, "loss": 0.0385, "num_input_tokens_seen": 71237608, "step": 105685 }, { "epoch": 2.5820242835853713, "grad_norm": 0.023231210187077522, "learning_rate": 1.117022064339467e-06, "loss": 0.0005, "num_input_tokens_seen": 71241192, "step": 105690 }, { "epoch": 2.5821464344172185, "grad_norm": 0.015621090307831764, "learning_rate": 1.1169373722301044e-06, "loss": 0.0006, "num_input_tokens_seen": 71245544, "step": 105695 }, { "epoch": 2.5822685852490657, "grad_norm": 0.008443130180239677, "learning_rate": 1.1168526792703422e-06, "loss": 0.0001, "num_input_tokens_seen": 71248680, "step": 105700 }, { "epoch": 2.582390736080913, "grad_norm": 66.18633270263672, "learning_rate": 1.1167679854607962e-06, "loss": 0.1098, "num_input_tokens_seen": 71251624, "step": 105705 }, { "epoch": 2.58251288691276, "grad_norm": 0.10550694167613983, "learning_rate": 1.1166832908020815e-06, "loss": 0.0002, "num_input_tokens_seen": 71255144, "step": 105710 }, { "epoch": 2.5826350377446072, "grad_norm": 0.1457262635231018, "learning_rate": 1.1165985952948146e-06, "loss": 0.0002, "num_input_tokens_seen": 71258600, "step": 105715 }, { "epoch": 2.582757188576454, "grad_norm": 0.010310126468539238, "learning_rate": 1.1165138989396114e-06, "loss": 0.1239, "num_input_tokens_seen": 71261992, "step": 105720 }, { "epoch": 2.5828793394083016, "grad_norm": 0.19131167232990265, "learning_rate": 1.116429201737088e-06, "loss": 0.0002, "num_input_tokens_seen": 71265256, "step": 105725 }, { "epoch": 2.5830014902401484, "grad_norm": 0.03742775321006775, "learning_rate": 1.11634450368786e-06, "loss": 0.0001, "num_input_tokens_seen": 71268968, "step": 105730 }, { "epoch": 2.5831236410719955, "grad_norm": 0.16259603202342987, "learning_rate": 1.1162598047925434e-06, "loss": 0.0003, "num_input_tokens_seen": 71272360, "step": 105735 }, { "epoch": 2.5832457919038427, "grad_norm": 0.008227083832025528, "learning_rate": 1.1161751050517544e-06, "loss": 0.0006, "num_input_tokens_seen": 71275432, "step": 105740 }, { "epoch": 2.58336794273569, "grad_norm": 0.025638852268457413, "learning_rate": 1.1160904044661086e-06, "loss": 0.0468, "num_input_tokens_seen": 71278632, "step": 105745 }, { "epoch": 2.583490093567537, "grad_norm": 0.11284564435482025, "learning_rate": 1.1160057030362221e-06, "loss": 0.0655, "num_input_tokens_seen": 71281960, "step": 105750 }, { "epoch": 2.5836122443993843, "grad_norm": 0.0041786786168813705, "learning_rate": 1.115921000762711e-06, "loss": 0.0002, "num_input_tokens_seen": 71285288, "step": 105755 }, { "epoch": 2.5837343952312315, "grad_norm": 0.23622022569179535, "learning_rate": 1.115836297646191e-06, "loss": 0.0257, "num_input_tokens_seen": 71288616, "step": 105760 }, { "epoch": 2.5838565460630787, "grad_norm": 17.04725456237793, "learning_rate": 1.1157515936872785e-06, "loss": 0.062, "num_input_tokens_seen": 71292264, "step": 105765 }, { "epoch": 2.583978696894926, "grad_norm": 0.0007923907251097262, "learning_rate": 1.1156668888865895e-06, "loss": 0.0001, "num_input_tokens_seen": 71295656, "step": 105770 }, { "epoch": 2.584100847726773, "grad_norm": 56.511810302734375, "learning_rate": 1.1155821832447394e-06, "loss": 0.1369, "num_input_tokens_seen": 71299240, "step": 105775 }, { "epoch": 2.5842229985586203, "grad_norm": 0.011822639964520931, "learning_rate": 1.1154974767623448e-06, "loss": 0.0002, "num_input_tokens_seen": 71302376, "step": 105780 }, { "epoch": 2.5843451493904674, "grad_norm": 0.03911390155553818, "learning_rate": 1.1154127694400215e-06, "loss": 0.0633, "num_input_tokens_seen": 71305704, "step": 105785 }, { "epoch": 2.5844673002223146, "grad_norm": 0.12544453144073486, "learning_rate": 1.1153280612783856e-06, "loss": 0.1287, "num_input_tokens_seen": 71309544, "step": 105790 }, { "epoch": 2.584589451054162, "grad_norm": 0.017842907458543777, "learning_rate": 1.1152433522780526e-06, "loss": 0.0481, "num_input_tokens_seen": 71313256, "step": 105795 }, { "epoch": 2.584711601886009, "grad_norm": 0.09327222406864166, "learning_rate": 1.1151586424396394e-06, "loss": 0.1074, "num_input_tokens_seen": 71316520, "step": 105800 }, { "epoch": 2.5848337527178558, "grad_norm": 0.22710292041301727, "learning_rate": 1.1150739317637613e-06, "loss": 0.0003, "num_input_tokens_seen": 71319912, "step": 105805 }, { "epoch": 2.5849559035497034, "grad_norm": 0.022945843636989594, "learning_rate": 1.1149892202510347e-06, "loss": 0.0219, "num_input_tokens_seen": 71323048, "step": 105810 }, { "epoch": 2.58507805438155, "grad_norm": 0.42688482999801636, "learning_rate": 1.1149045079020755e-06, "loss": 0.0004, "num_input_tokens_seen": 71326248, "step": 105815 }, { "epoch": 2.5852002052133978, "grad_norm": 0.21542878448963165, "learning_rate": 1.1148197947174997e-06, "loss": 0.049, "num_input_tokens_seen": 71329448, "step": 105820 }, { "epoch": 2.5853223560452445, "grad_norm": 0.04005388543009758, "learning_rate": 1.1147350806979237e-06, "loss": 0.0583, "num_input_tokens_seen": 71332968, "step": 105825 }, { "epoch": 2.5854445068770917, "grad_norm": 0.005568156484514475, "learning_rate": 1.1146503658439632e-06, "loss": 0.0449, "num_input_tokens_seen": 71336360, "step": 105830 }, { "epoch": 2.585566657708939, "grad_norm": 0.0002745167294051498, "learning_rate": 1.1145656501562346e-06, "loss": 0.0003, "num_input_tokens_seen": 71340392, "step": 105835 }, { "epoch": 2.585688808540786, "grad_norm": 0.038722485303878784, "learning_rate": 1.1144809336353538e-06, "loss": 0.0697, "num_input_tokens_seen": 71343592, "step": 105840 }, { "epoch": 2.5858109593726333, "grad_norm": 0.028439607471227646, "learning_rate": 1.1143962162819367e-06, "loss": 0.0399, "num_input_tokens_seen": 71346856, "step": 105845 }, { "epoch": 2.5859331102044805, "grad_norm": 0.014738436788320541, "learning_rate": 1.1143114980965993e-06, "loss": 0.0008, "num_input_tokens_seen": 71349672, "step": 105850 }, { "epoch": 2.5860552610363277, "grad_norm": 0.0705747976899147, "learning_rate": 1.1142267790799581e-06, "loss": 0.0481, "num_input_tokens_seen": 71353192, "step": 105855 }, { "epoch": 2.586177411868175, "grad_norm": 0.7925322651863098, "learning_rate": 1.1141420592326292e-06, "loss": 0.0546, "num_input_tokens_seen": 71356264, "step": 105860 }, { "epoch": 2.586299562700022, "grad_norm": 0.018346594646573067, "learning_rate": 1.1140573385552285e-06, "loss": 0.0001, "num_input_tokens_seen": 71360040, "step": 105865 }, { "epoch": 2.5864217135318692, "grad_norm": 0.09538698196411133, "learning_rate": 1.113972617048372e-06, "loss": 0.0003, "num_input_tokens_seen": 71363304, "step": 105870 }, { "epoch": 2.5865438643637164, "grad_norm": 0.22416944801807404, "learning_rate": 1.1138878947126761e-06, "loss": 0.038, "num_input_tokens_seen": 71366824, "step": 105875 }, { "epoch": 2.5866660151955636, "grad_norm": 0.07192140817642212, "learning_rate": 1.113803171548757e-06, "loss": 0.0006, "num_input_tokens_seen": 71370536, "step": 105880 }, { "epoch": 2.586788166027411, "grad_norm": 1.827846884727478, "learning_rate": 1.1137184475572305e-06, "loss": 0.0009, "num_input_tokens_seen": 71373480, "step": 105885 }, { "epoch": 2.5869103168592575, "grad_norm": 0.02320150099694729, "learning_rate": 1.1136337227387126e-06, "loss": 0.0489, "num_input_tokens_seen": 71376616, "step": 105890 }, { "epoch": 2.587032467691105, "grad_norm": 0.010031183250248432, "learning_rate": 1.11354899709382e-06, "loss": 0.0004, "num_input_tokens_seen": 71380200, "step": 105895 }, { "epoch": 2.587154618522952, "grad_norm": 0.026507128030061722, "learning_rate": 1.1134642706231685e-06, "loss": 0.0481, "num_input_tokens_seen": 71383464, "step": 105900 }, { "epoch": 2.5872767693547996, "grad_norm": 58.687049865722656, "learning_rate": 1.1133795433273742e-06, "loss": 0.0592, "num_input_tokens_seen": 71386792, "step": 105905 }, { "epoch": 2.5873989201866463, "grad_norm": 0.026232236996293068, "learning_rate": 1.1132948152070535e-06, "loss": 0.0425, "num_input_tokens_seen": 71390312, "step": 105910 }, { "epoch": 2.5875210710184935, "grad_norm": 18.692960739135742, "learning_rate": 1.1132100862628222e-06, "loss": 0.0389, "num_input_tokens_seen": 71394088, "step": 105915 }, { "epoch": 2.5876432218503407, "grad_norm": 0.4010098874568939, "learning_rate": 1.1131253564952969e-06, "loss": 0.0007, "num_input_tokens_seen": 71398184, "step": 105920 }, { "epoch": 2.587765372682188, "grad_norm": 0.20872338116168976, "learning_rate": 1.1130406259050935e-06, "loss": 0.0373, "num_input_tokens_seen": 71401448, "step": 105925 }, { "epoch": 2.587887523514035, "grad_norm": 0.05918128043413162, "learning_rate": 1.1129558944928284e-06, "loss": 0.0506, "num_input_tokens_seen": 71404776, "step": 105930 }, { "epoch": 2.5880096743458822, "grad_norm": 0.23181723058223724, "learning_rate": 1.1128711622591173e-06, "loss": 0.1251, "num_input_tokens_seen": 71408168, "step": 105935 }, { "epoch": 2.5881318251777294, "grad_norm": 19.287378311157227, "learning_rate": 1.1127864292045773e-06, "loss": 0.0879, "num_input_tokens_seen": 71411432, "step": 105940 }, { "epoch": 2.5882539760095766, "grad_norm": 0.06729073822498322, "learning_rate": 1.1127016953298237e-06, "loss": 0.0024, "num_input_tokens_seen": 71414632, "step": 105945 }, { "epoch": 2.588376126841424, "grad_norm": 0.15538254380226135, "learning_rate": 1.112616960635473e-06, "loss": 0.0003, "num_input_tokens_seen": 71418152, "step": 105950 }, { "epoch": 2.588498277673271, "grad_norm": 0.01470793318003416, "learning_rate": 1.1125322251221416e-06, "loss": 0.0001, "num_input_tokens_seen": 71421224, "step": 105955 }, { "epoch": 2.588620428505118, "grad_norm": 0.5175967216491699, "learning_rate": 1.1124474887904457e-06, "loss": 0.1095, "num_input_tokens_seen": 71424552, "step": 105960 }, { "epoch": 2.5887425793369654, "grad_norm": 0.01623247191309929, "learning_rate": 1.1123627516410013e-06, "loss": 0.0455, "num_input_tokens_seen": 71427688, "step": 105965 }, { "epoch": 2.5888647301688126, "grad_norm": 0.009030419401824474, "learning_rate": 1.1122780136744247e-06, "loss": 0.0001, "num_input_tokens_seen": 71431080, "step": 105970 }, { "epoch": 2.5889868810006598, "grad_norm": 0.1200258657336235, "learning_rate": 1.1121932748913318e-06, "loss": 0.0223, "num_input_tokens_seen": 71434536, "step": 105975 }, { "epoch": 2.589109031832507, "grad_norm": 0.030536944046616554, "learning_rate": 1.11210853529234e-06, "loss": 0.0356, "num_input_tokens_seen": 71438120, "step": 105980 }, { "epoch": 2.5892311826643537, "grad_norm": 0.0698750764131546, "learning_rate": 1.1120237948780642e-06, "loss": 0.0387, "num_input_tokens_seen": 71441384, "step": 105985 }, { "epoch": 2.5893533334962013, "grad_norm": 27.646520614624023, "learning_rate": 1.1119390536491218e-06, "loss": 0.0381, "num_input_tokens_seen": 71444392, "step": 105990 }, { "epoch": 2.589475484328048, "grad_norm": 34.76492691040039, "learning_rate": 1.1118543116061282e-06, "loss": 0.0539, "num_input_tokens_seen": 71447464, "step": 105995 }, { "epoch": 2.5895976351598953, "grad_norm": 0.014757978729903698, "learning_rate": 1.1117695687497e-06, "loss": 0.0538, "num_input_tokens_seen": 71451240, "step": 106000 }, { "epoch": 2.5897197859917425, "grad_norm": 0.20501114428043365, "learning_rate": 1.1116848250804529e-06, "loss": 0.0004, "num_input_tokens_seen": 71454376, "step": 106005 }, { "epoch": 2.5898419368235897, "grad_norm": 0.24411644041538239, "learning_rate": 1.1116000805990043e-06, "loss": 0.0004, "num_input_tokens_seen": 71457576, "step": 106010 }, { "epoch": 2.589964087655437, "grad_norm": 0.018468251451849937, "learning_rate": 1.1115153353059698e-06, "loss": 0.0003, "num_input_tokens_seen": 71461032, "step": 106015 }, { "epoch": 2.590086238487284, "grad_norm": 17.73522186279297, "learning_rate": 1.1114305892019656e-06, "loss": 0.0524, "num_input_tokens_seen": 71464744, "step": 106020 }, { "epoch": 2.590208389319131, "grad_norm": 0.05297902598977089, "learning_rate": 1.1113458422876085e-06, "loss": 0.0001, "num_input_tokens_seen": 71468840, "step": 106025 }, { "epoch": 2.5903305401509784, "grad_norm": 0.09020661562681198, "learning_rate": 1.1112610945635145e-06, "loss": 0.001, "num_input_tokens_seen": 71472552, "step": 106030 }, { "epoch": 2.5904526909828256, "grad_norm": 0.012612675316631794, "learning_rate": 1.1111763460302994e-06, "loss": 0.0379, "num_input_tokens_seen": 71475496, "step": 106035 }, { "epoch": 2.590574841814673, "grad_norm": 0.7227842211723328, "learning_rate": 1.1110915966885805e-06, "loss": 0.0496, "num_input_tokens_seen": 71478888, "step": 106040 }, { "epoch": 2.59069699264652, "grad_norm": 0.1919873207807541, "learning_rate": 1.1110068465389735e-06, "loss": 0.1119, "num_input_tokens_seen": 71482664, "step": 106045 }, { "epoch": 2.590819143478367, "grad_norm": 0.0008974491502158344, "learning_rate": 1.1109220955820948e-06, "loss": 0.0002, "num_input_tokens_seen": 71485992, "step": 106050 }, { "epoch": 2.5909412943102144, "grad_norm": 0.0659492015838623, "learning_rate": 1.1108373438185608e-06, "loss": 0.0002, "num_input_tokens_seen": 71489320, "step": 106055 }, { "epoch": 2.5910634451420615, "grad_norm": 10.360064506530762, "learning_rate": 1.1107525912489878e-06, "loss": 0.001, "num_input_tokens_seen": 71492648, "step": 106060 }, { "epoch": 2.5911855959739087, "grad_norm": 0.03574829176068306, "learning_rate": 1.1106678378739922e-06, "loss": 0.0004, "num_input_tokens_seen": 71495592, "step": 106065 }, { "epoch": 2.5913077468057555, "grad_norm": 0.020459705963730812, "learning_rate": 1.11058308369419e-06, "loss": 0.0003, "num_input_tokens_seen": 71498856, "step": 106070 }, { "epoch": 2.591429897637603, "grad_norm": 0.02584422007203102, "learning_rate": 1.1104983287101982e-06, "loss": 0.0003, "num_input_tokens_seen": 71502248, "step": 106075 }, { "epoch": 2.59155204846945, "grad_norm": 0.10443761199712753, "learning_rate": 1.1104135729226329e-06, "loss": 0.0001, "num_input_tokens_seen": 71505512, "step": 106080 }, { "epoch": 2.5916741993012975, "grad_norm": 0.01857997104525566, "learning_rate": 1.1103288163321103e-06, "loss": 0.0002, "num_input_tokens_seen": 71508584, "step": 106085 }, { "epoch": 2.5917963501331442, "grad_norm": 0.08549598604440689, "learning_rate": 1.1102440589392468e-06, "loss": 0.0564, "num_input_tokens_seen": 71511720, "step": 106090 }, { "epoch": 2.5919185009649914, "grad_norm": 0.0008379715145565569, "learning_rate": 1.110159300744659e-06, "loss": 0.0, "num_input_tokens_seen": 71515112, "step": 106095 }, { "epoch": 2.5920406517968386, "grad_norm": 63.77941131591797, "learning_rate": 1.1100745417489629e-06, "loss": 0.0367, "num_input_tokens_seen": 71518568, "step": 106100 }, { "epoch": 2.592162802628686, "grad_norm": 0.015878800302743912, "learning_rate": 1.1099897819527755e-06, "loss": 0.0002, "num_input_tokens_seen": 71521960, "step": 106105 }, { "epoch": 2.592284953460533, "grad_norm": 38.300418853759766, "learning_rate": 1.1099050213567127e-06, "loss": 0.1, "num_input_tokens_seen": 71525032, "step": 106110 }, { "epoch": 2.59240710429238, "grad_norm": 0.010526223108172417, "learning_rate": 1.109820259961391e-06, "loss": 0.0616, "num_input_tokens_seen": 71528040, "step": 106115 }, { "epoch": 2.5925292551242274, "grad_norm": 0.005474468693137169, "learning_rate": 1.1097354977674267e-06, "loss": 0.034, "num_input_tokens_seen": 71531688, "step": 106120 }, { "epoch": 2.5926514059560746, "grad_norm": 0.024080710485577583, "learning_rate": 1.1096507347754364e-06, "loss": 0.0806, "num_input_tokens_seen": 71535144, "step": 106125 }, { "epoch": 2.5927735567879218, "grad_norm": 0.011120163835585117, "learning_rate": 1.1095659709860363e-06, "loss": 0.0007, "num_input_tokens_seen": 71538152, "step": 106130 }, { "epoch": 2.592895707619769, "grad_norm": 0.10011861473321915, "learning_rate": 1.1094812063998431e-06, "loss": 0.0003, "num_input_tokens_seen": 71541800, "step": 106135 }, { "epoch": 2.593017858451616, "grad_norm": 36.78319549560547, "learning_rate": 1.1093964410174733e-06, "loss": 0.0434, "num_input_tokens_seen": 71545064, "step": 106140 }, { "epoch": 2.5931400092834633, "grad_norm": 0.029222454875707626, "learning_rate": 1.1093116748395432e-06, "loss": 0.0001, "num_input_tokens_seen": 71548392, "step": 106145 }, { "epoch": 2.5932621601153105, "grad_norm": 0.10176047682762146, "learning_rate": 1.1092269078666689e-06, "loss": 0.0001, "num_input_tokens_seen": 71552168, "step": 106150 }, { "epoch": 2.5933843109471577, "grad_norm": 0.03981450945138931, "learning_rate": 1.1091421400994674e-06, "loss": 0.0002, "num_input_tokens_seen": 71555880, "step": 106155 }, { "epoch": 2.593506461779005, "grad_norm": 0.033819641917943954, "learning_rate": 1.1090573715385547e-06, "loss": 0.0002, "num_input_tokens_seen": 71559080, "step": 106160 }, { "epoch": 2.5936286126108516, "grad_norm": 0.00025354253011755645, "learning_rate": 1.1089726021845475e-06, "loss": 0.0005, "num_input_tokens_seen": 71562664, "step": 106165 }, { "epoch": 2.5937507634426993, "grad_norm": 98.45695495605469, "learning_rate": 1.1088878320380623e-06, "loss": 0.1754, "num_input_tokens_seen": 71565736, "step": 106170 }, { "epoch": 2.593872914274546, "grad_norm": 0.010990033857524395, "learning_rate": 1.1088030610997155e-06, "loss": 0.0001, "num_input_tokens_seen": 71568872, "step": 106175 }, { "epoch": 2.593995065106393, "grad_norm": 0.08195897191762924, "learning_rate": 1.1087182893701234e-06, "loss": 0.0001, "num_input_tokens_seen": 71571944, "step": 106180 }, { "epoch": 2.5941172159382404, "grad_norm": 0.07107039541006088, "learning_rate": 1.108633516849903e-06, "loss": 0.0002, "num_input_tokens_seen": 71575016, "step": 106185 }, { "epoch": 2.5942393667700876, "grad_norm": 46.70288848876953, "learning_rate": 1.1085487435396703e-06, "loss": 0.0885, "num_input_tokens_seen": 71578600, "step": 106190 }, { "epoch": 2.594361517601935, "grad_norm": 17.831167221069336, "learning_rate": 1.108463969440042e-06, "loss": 0.1151, "num_input_tokens_seen": 71582120, "step": 106195 }, { "epoch": 2.594483668433782, "grad_norm": 0.004120448138564825, "learning_rate": 1.1083791945516344e-06, "loss": 0.0001, "num_input_tokens_seen": 71585640, "step": 106200 }, { "epoch": 2.594605819265629, "grad_norm": 0.009358244016766548, "learning_rate": 1.108294418875064e-06, "loss": 0.0824, "num_input_tokens_seen": 71588648, "step": 106205 }, { "epoch": 2.5947279700974764, "grad_norm": 11.974853515625, "learning_rate": 1.1082096424109476e-06, "loss": 0.0405, "num_input_tokens_seen": 71592232, "step": 106210 }, { "epoch": 2.5948501209293235, "grad_norm": 0.01048423070460558, "learning_rate": 1.1081248651599017e-06, "loss": 0.0002, "num_input_tokens_seen": 71595624, "step": 106215 }, { "epoch": 2.5949722717611707, "grad_norm": 0.014245947822928429, "learning_rate": 1.1080400871225429e-06, "loss": 0.0711, "num_input_tokens_seen": 71598632, "step": 106220 }, { "epoch": 2.595094422593018, "grad_norm": 0.20360706746578217, "learning_rate": 1.1079553082994868e-06, "loss": 0.0373, "num_input_tokens_seen": 71602024, "step": 106225 }, { "epoch": 2.595216573424865, "grad_norm": 0.06869331747293472, "learning_rate": 1.1078705286913513e-06, "loss": 0.0002, "num_input_tokens_seen": 71605224, "step": 106230 }, { "epoch": 2.5953387242567123, "grad_norm": 0.30428367853164673, "learning_rate": 1.107785748298752e-06, "loss": 0.001, "num_input_tokens_seen": 71608104, "step": 106235 }, { "epoch": 2.5954608750885595, "grad_norm": 0.010862280614674091, "learning_rate": 1.1077009671223059e-06, "loss": 0.0735, "num_input_tokens_seen": 71611368, "step": 106240 }, { "epoch": 2.5955830259204067, "grad_norm": 0.18098630011081696, "learning_rate": 1.1076161851626294e-06, "loss": 0.0959, "num_input_tokens_seen": 71615080, "step": 106245 }, { "epoch": 2.5957051767522534, "grad_norm": 0.3075055181980133, "learning_rate": 1.107531402420339e-06, "loss": 0.0983, "num_input_tokens_seen": 71619176, "step": 106250 }, { "epoch": 2.595827327584101, "grad_norm": 0.5143350958824158, "learning_rate": 1.1074466188960515e-06, "loss": 0.0505, "num_input_tokens_seen": 71622568, "step": 106255 }, { "epoch": 2.595949478415948, "grad_norm": 0.11659617722034454, "learning_rate": 1.107361834590383e-06, "loss": 0.0746, "num_input_tokens_seen": 71625832, "step": 106260 }, { "epoch": 2.5960716292477954, "grad_norm": 0.02491629868745804, "learning_rate": 1.1072770495039506e-06, "loss": 0.0001, "num_input_tokens_seen": 71629352, "step": 106265 }, { "epoch": 2.596193780079642, "grad_norm": 763.0545654296875, "learning_rate": 1.1071922636373702e-06, "loss": 0.1042, "num_input_tokens_seen": 71632424, "step": 106270 }, { "epoch": 2.5963159309114894, "grad_norm": 0.06559303402900696, "learning_rate": 1.1071074769912593e-06, "loss": 0.0007, "num_input_tokens_seen": 71635688, "step": 106275 }, { "epoch": 2.5964380817433366, "grad_norm": 0.2602338194847107, "learning_rate": 1.107022689566234e-06, "loss": 0.0956, "num_input_tokens_seen": 71639016, "step": 106280 }, { "epoch": 2.5965602325751838, "grad_norm": 0.012517737224698067, "learning_rate": 1.1069379013629105e-06, "loss": 0.0249, "num_input_tokens_seen": 71642856, "step": 106285 }, { "epoch": 2.596682383407031, "grad_norm": 0.03258126974105835, "learning_rate": 1.106853112381906e-06, "loss": 0.0006, "num_input_tokens_seen": 71645800, "step": 106290 }, { "epoch": 2.596804534238878, "grad_norm": 0.05590072646737099, "learning_rate": 1.1067683226238372e-06, "loss": 0.082, "num_input_tokens_seen": 71649000, "step": 106295 }, { "epoch": 2.5969266850707253, "grad_norm": 0.03783603757619858, "learning_rate": 1.1066835320893204e-06, "loss": 0.1012, "num_input_tokens_seen": 71652136, "step": 106300 }, { "epoch": 2.5970488359025725, "grad_norm": 0.008168010972440243, "learning_rate": 1.1065987407789724e-06, "loss": 0.0006, "num_input_tokens_seen": 71655272, "step": 106305 }, { "epoch": 2.5971709867344197, "grad_norm": 0.15477074682712555, "learning_rate": 1.1065139486934092e-06, "loss": 0.0003, "num_input_tokens_seen": 71658728, "step": 106310 }, { "epoch": 2.597293137566267, "grad_norm": 13.059903144836426, "learning_rate": 1.106429155833248e-06, "loss": 0.0397, "num_input_tokens_seen": 71661928, "step": 106315 }, { "epoch": 2.597415288398114, "grad_norm": 0.013284175656735897, "learning_rate": 1.1063443621991056e-06, "loss": 0.1028, "num_input_tokens_seen": 71666088, "step": 106320 }, { "epoch": 2.5975374392299613, "grad_norm": 0.03173963353037834, "learning_rate": 1.1062595677915983e-06, "loss": 0.0001, "num_input_tokens_seen": 71669416, "step": 106325 }, { "epoch": 2.5976595900618085, "grad_norm": 1.1281355619430542, "learning_rate": 1.1061747726113427e-06, "loss": 0.0008, "num_input_tokens_seen": 71672616, "step": 106330 }, { "epoch": 2.597781740893655, "grad_norm": 0.009192223660647869, "learning_rate": 1.1060899766589558e-06, "loss": 0.0001, "num_input_tokens_seen": 71675560, "step": 106335 }, { "epoch": 2.597903891725503, "grad_norm": 0.3092971444129944, "learning_rate": 1.1060051799350538e-06, "loss": 0.0002, "num_input_tokens_seen": 71678952, "step": 106340 }, { "epoch": 2.5980260425573496, "grad_norm": 0.025217989459633827, "learning_rate": 1.105920382440254e-06, "loss": 0.0003, "num_input_tokens_seen": 71682664, "step": 106345 }, { "epoch": 2.5981481933891972, "grad_norm": 0.022733714431524277, "learning_rate": 1.1058355841751723e-06, "loss": 0.0007, "num_input_tokens_seen": 71685480, "step": 106350 }, { "epoch": 2.598270344221044, "grad_norm": 0.054993562400341034, "learning_rate": 1.105750785140426e-06, "loss": 0.0004, "num_input_tokens_seen": 71689320, "step": 106355 }, { "epoch": 2.598392495052891, "grad_norm": 0.02195347473025322, "learning_rate": 1.1056659853366315e-06, "loss": 0.0632, "num_input_tokens_seen": 71692584, "step": 106360 }, { "epoch": 2.5985146458847383, "grad_norm": 0.00801927875727415, "learning_rate": 1.1055811847644057e-06, "loss": 0.0002, "num_input_tokens_seen": 71696168, "step": 106365 }, { "epoch": 2.5986367967165855, "grad_norm": 0.21472963690757751, "learning_rate": 1.105496383424365e-06, "loss": 0.0345, "num_input_tokens_seen": 71699560, "step": 106370 }, { "epoch": 2.5987589475484327, "grad_norm": 0.1342368721961975, "learning_rate": 1.1054115813171262e-06, "loss": 0.1055, "num_input_tokens_seen": 71703080, "step": 106375 }, { "epoch": 2.59888109838028, "grad_norm": 0.04510444775223732, "learning_rate": 1.1053267784433057e-06, "loss": 0.0006, "num_input_tokens_seen": 71706344, "step": 106380 }, { "epoch": 2.599003249212127, "grad_norm": 0.01901618391275406, "learning_rate": 1.105241974803521e-06, "loss": 0.0001, "num_input_tokens_seen": 71709480, "step": 106385 }, { "epoch": 2.5991254000439743, "grad_norm": 0.27060818672180176, "learning_rate": 1.105157170398388e-06, "loss": 0.1092, "num_input_tokens_seen": 71713064, "step": 106390 }, { "epoch": 2.5992475508758215, "grad_norm": 30.31696128845215, "learning_rate": 1.105072365228524e-06, "loss": 0.0829, "num_input_tokens_seen": 71716264, "step": 106395 }, { "epoch": 2.5993697017076687, "grad_norm": 0.026632215827703476, "learning_rate": 1.1049875592945454e-06, "loss": 0.0351, "num_input_tokens_seen": 71719336, "step": 106400 }, { "epoch": 2.599491852539516, "grad_norm": 0.02794702723622322, "learning_rate": 1.1049027525970691e-06, "loss": 0.0424, "num_input_tokens_seen": 71722792, "step": 106405 }, { "epoch": 2.599614003371363, "grad_norm": 0.04424641281366348, "learning_rate": 1.104817945136712e-06, "loss": 0.1048, "num_input_tokens_seen": 71725992, "step": 106410 }, { "epoch": 2.5997361542032102, "grad_norm": 0.14470571279525757, "learning_rate": 1.1047331369140901e-06, "loss": 0.0006, "num_input_tokens_seen": 71729832, "step": 106415 }, { "epoch": 2.5998583050350574, "grad_norm": 0.006976842414587736, "learning_rate": 1.1046483279298212e-06, "loss": 0.0009, "num_input_tokens_seen": 71733416, "step": 106420 }, { "epoch": 2.5999804558669046, "grad_norm": 0.07952874898910522, "learning_rate": 1.1045635181845212e-06, "loss": 0.0681, "num_input_tokens_seen": 71736872, "step": 106425 }, { "epoch": 2.6001026066987514, "grad_norm": 0.03277485817670822, "learning_rate": 1.104478707678807e-06, "loss": 0.053, "num_input_tokens_seen": 71741096, "step": 106430 }, { "epoch": 2.600224757530599, "grad_norm": 0.6419253349304199, "learning_rate": 1.104393896413296e-06, "loss": 0.0423, "num_input_tokens_seen": 71744616, "step": 106435 }, { "epoch": 2.6003469083624458, "grad_norm": 94.72898864746094, "learning_rate": 1.104309084388604e-06, "loss": 0.0314, "num_input_tokens_seen": 71748072, "step": 106440 }, { "epoch": 2.6004690591942934, "grad_norm": 0.23949620127677917, "learning_rate": 1.1042242716053486e-06, "loss": 0.0395, "num_input_tokens_seen": 71751656, "step": 106445 }, { "epoch": 2.60059121002614, "grad_norm": 0.09659047424793243, "learning_rate": 1.1041394580641464e-06, "loss": 0.0008, "num_input_tokens_seen": 71754984, "step": 106450 }, { "epoch": 2.6007133608579873, "grad_norm": 14.89390754699707, "learning_rate": 1.104054643765614e-06, "loss": 0.0907, "num_input_tokens_seen": 71759144, "step": 106455 }, { "epoch": 2.6008355116898345, "grad_norm": 0.34447380900382996, "learning_rate": 1.103969828710368e-06, "loss": 0.139, "num_input_tokens_seen": 71762216, "step": 106460 }, { "epoch": 2.6009576625216817, "grad_norm": 0.06632810831069946, "learning_rate": 1.1038850128990255e-06, "loss": 0.0731, "num_input_tokens_seen": 71765288, "step": 106465 }, { "epoch": 2.601079813353529, "grad_norm": 0.022017555311322212, "learning_rate": 1.1038001963322031e-06, "loss": 0.0346, "num_input_tokens_seen": 71768616, "step": 106470 }, { "epoch": 2.601201964185376, "grad_norm": 0.053672295063734055, "learning_rate": 1.103715379010518e-06, "loss": 0.1645, "num_input_tokens_seen": 71771944, "step": 106475 }, { "epoch": 2.6013241150172233, "grad_norm": 0.08040033280849457, "learning_rate": 1.103630560934587e-06, "loss": 0.0008, "num_input_tokens_seen": 71774824, "step": 106480 }, { "epoch": 2.6014462658490705, "grad_norm": 0.009018289856612682, "learning_rate": 1.1035457421050262e-06, "loss": 0.0003, "num_input_tokens_seen": 71777704, "step": 106485 }, { "epoch": 2.6015684166809177, "grad_norm": 38.83277130126953, "learning_rate": 1.1034609225224531e-06, "loss": 0.0582, "num_input_tokens_seen": 71780904, "step": 106490 }, { "epoch": 2.601690567512765, "grad_norm": 0.023919718340039253, "learning_rate": 1.1033761021874844e-06, "loss": 0.057, "num_input_tokens_seen": 71784488, "step": 106495 }, { "epoch": 2.601812718344612, "grad_norm": 0.6867104172706604, "learning_rate": 1.103291281100737e-06, "loss": 0.0008, "num_input_tokens_seen": 71787816, "step": 106500 }, { "epoch": 2.601934869176459, "grad_norm": 15.458674430847168, "learning_rate": 1.1032064592628275e-06, "loss": 0.0037, "num_input_tokens_seen": 71791144, "step": 106505 }, { "epoch": 2.6020570200083064, "grad_norm": 0.11199041455984116, "learning_rate": 1.1031216366743727e-06, "loss": 0.0006, "num_input_tokens_seen": 71794536, "step": 106510 }, { "epoch": 2.602179170840153, "grad_norm": 0.729247510433197, "learning_rate": 1.1030368133359897e-06, "loss": 0.0006, "num_input_tokens_seen": 71797992, "step": 106515 }, { "epoch": 2.602301321672001, "grad_norm": 0.02848580852150917, "learning_rate": 1.1029519892482953e-06, "loss": 0.0132, "num_input_tokens_seen": 71801320, "step": 106520 }, { "epoch": 2.6024234725038475, "grad_norm": 0.11427497118711472, "learning_rate": 1.1028671644119066e-06, "loss": 0.0659, "num_input_tokens_seen": 71804712, "step": 106525 }, { "epoch": 2.602545623335695, "grad_norm": 0.043485596776008606, "learning_rate": 1.1027823388274397e-06, "loss": 0.0356, "num_input_tokens_seen": 71807976, "step": 106530 }, { "epoch": 2.602667774167542, "grad_norm": 68.4632339477539, "learning_rate": 1.1026975124955123e-06, "loss": 0.0395, "num_input_tokens_seen": 71811368, "step": 106535 }, { "epoch": 2.602789924999389, "grad_norm": 0.018174318596720695, "learning_rate": 1.1026126854167408e-06, "loss": 0.0551, "num_input_tokens_seen": 71814824, "step": 106540 }, { "epoch": 2.6029120758312363, "grad_norm": 0.1868731528520584, "learning_rate": 1.1025278575917425e-06, "loss": 0.105, "num_input_tokens_seen": 71818536, "step": 106545 }, { "epoch": 2.6030342266630835, "grad_norm": 1.8846322298049927, "learning_rate": 1.102443029021134e-06, "loss": 0.1427, "num_input_tokens_seen": 71821608, "step": 106550 }, { "epoch": 2.6031563774949307, "grad_norm": 0.01644289679825306, "learning_rate": 1.102358199705532e-06, "loss": 0.0354, "num_input_tokens_seen": 71825000, "step": 106555 }, { "epoch": 2.603278528326778, "grad_norm": 1.783953070640564, "learning_rate": 1.102273369645554e-06, "loss": 0.1585, "num_input_tokens_seen": 71828072, "step": 106560 }, { "epoch": 2.603400679158625, "grad_norm": 0.03290699049830437, "learning_rate": 1.1021885388418164e-06, "loss": 0.0227, "num_input_tokens_seen": 71831400, "step": 106565 }, { "epoch": 2.6035228299904722, "grad_norm": 0.018067052587866783, "learning_rate": 1.1021037072949362e-06, "loss": 0.0262, "num_input_tokens_seen": 71834344, "step": 106570 }, { "epoch": 2.6036449808223194, "grad_norm": 0.012045308016240597, "learning_rate": 1.1020188750055304e-06, "loss": 0.0009, "num_input_tokens_seen": 71837736, "step": 106575 }, { "epoch": 2.6037671316541666, "grad_norm": 0.3654814660549164, "learning_rate": 1.1019340419742157e-06, "loss": 0.046, "num_input_tokens_seen": 71841256, "step": 106580 }, { "epoch": 2.603889282486014, "grad_norm": 0.29941239953041077, "learning_rate": 1.1018492082016095e-06, "loss": 0.0006, "num_input_tokens_seen": 71844264, "step": 106585 }, { "epoch": 2.604011433317861, "grad_norm": 0.053947120904922485, "learning_rate": 1.1017643736883284e-06, "loss": 0.0029, "num_input_tokens_seen": 71847912, "step": 106590 }, { "epoch": 2.604133584149708, "grad_norm": 0.06559449434280396, "learning_rate": 1.1016795384349892e-06, "loss": 0.0539, "num_input_tokens_seen": 71851176, "step": 106595 }, { "epoch": 2.6042557349815554, "grad_norm": 0.10444041341543198, "learning_rate": 1.1015947024422094e-06, "loss": 0.0338, "num_input_tokens_seen": 71854696, "step": 106600 }, { "epoch": 2.6043778858134026, "grad_norm": 0.014229257591068745, "learning_rate": 1.1015098657106054e-06, "loss": 0.0005, "num_input_tokens_seen": 71857896, "step": 106605 }, { "epoch": 2.6045000366452493, "grad_norm": 0.0054950471967458725, "learning_rate": 1.1014250282407946e-06, "loss": 0.0001, "num_input_tokens_seen": 71861032, "step": 106610 }, { "epoch": 2.604622187477097, "grad_norm": 0.32495608925819397, "learning_rate": 1.1013401900333937e-06, "loss": 0.0005, "num_input_tokens_seen": 71864104, "step": 106615 }, { "epoch": 2.6047443383089437, "grad_norm": 0.012543436139822006, "learning_rate": 1.1012553510890192e-06, "loss": 0.1128, "num_input_tokens_seen": 71867176, "step": 106620 }, { "epoch": 2.604866489140791, "grad_norm": 0.019681131467223167, "learning_rate": 1.101170511408289e-06, "loss": 0.0007, "num_input_tokens_seen": 71870120, "step": 106625 }, { "epoch": 2.604988639972638, "grad_norm": 0.2520747184753418, "learning_rate": 1.1010856709918193e-06, "loss": 0.2105, "num_input_tokens_seen": 71873704, "step": 106630 }, { "epoch": 2.6051107908044853, "grad_norm": 0.009804549627006054, "learning_rate": 1.1010008298402275e-06, "loss": 0.0681, "num_input_tokens_seen": 71876968, "step": 106635 }, { "epoch": 2.6052329416363325, "grad_norm": 0.005678892135620117, "learning_rate": 1.1009159879541307e-06, "loss": 0.0504, "num_input_tokens_seen": 71880104, "step": 106640 }, { "epoch": 2.6053550924681796, "grad_norm": 21.121488571166992, "learning_rate": 1.1008311453341455e-06, "loss": 0.0761, "num_input_tokens_seen": 71883816, "step": 106645 }, { "epoch": 2.605477243300027, "grad_norm": 0.023501016199588776, "learning_rate": 1.1007463019808892e-06, "loss": 0.0003, "num_input_tokens_seen": 71887208, "step": 106650 }, { "epoch": 2.605599394131874, "grad_norm": 0.01757156103849411, "learning_rate": 1.1006614578949786e-06, "loss": 0.0006, "num_input_tokens_seen": 71890280, "step": 106655 }, { "epoch": 2.605721544963721, "grad_norm": 0.06994085013866425, "learning_rate": 1.1005766130770312e-06, "loss": 0.0002, "num_input_tokens_seen": 71893992, "step": 106660 }, { "epoch": 2.6058436957955684, "grad_norm": 0.020432811230421066, "learning_rate": 1.100491767527663e-06, "loss": 0.0005, "num_input_tokens_seen": 71896936, "step": 106665 }, { "epoch": 2.6059658466274156, "grad_norm": 0.006643875502049923, "learning_rate": 1.1004069212474921e-06, "loss": 0.0567, "num_input_tokens_seen": 71900264, "step": 106670 }, { "epoch": 2.606087997459263, "grad_norm": 0.0019912130665034056, "learning_rate": 1.1003220742371348e-06, "loss": 0.0003, "num_input_tokens_seen": 71903848, "step": 106675 }, { "epoch": 2.60621014829111, "grad_norm": 0.0037333867512643337, "learning_rate": 1.1002372264972083e-06, "loss": 0.0285, "num_input_tokens_seen": 71907368, "step": 106680 }, { "epoch": 2.606332299122957, "grad_norm": 0.4669528603553772, "learning_rate": 1.1001523780283302e-06, "loss": 0.0645, "num_input_tokens_seen": 71910632, "step": 106685 }, { "epoch": 2.6064544499548044, "grad_norm": 0.01946832798421383, "learning_rate": 1.1000675288311166e-06, "loss": 0.0915, "num_input_tokens_seen": 71914152, "step": 106690 }, { "epoch": 2.606576600786651, "grad_norm": 0.3970881998538971, "learning_rate": 1.099982678906185e-06, "loss": 0.0004, "num_input_tokens_seen": 71917544, "step": 106695 }, { "epoch": 2.6066987516184987, "grad_norm": 0.4034290611743927, "learning_rate": 1.0998978282541527e-06, "loss": 0.0003, "num_input_tokens_seen": 71920616, "step": 106700 }, { "epoch": 2.6068209024503455, "grad_norm": 0.18643775582313538, "learning_rate": 1.0998129768756365e-06, "loss": 0.0006, "num_input_tokens_seen": 71923496, "step": 106705 }, { "epoch": 2.606943053282193, "grad_norm": 0.07174662500619888, "learning_rate": 1.0997281247712536e-06, "loss": 0.0002, "num_input_tokens_seen": 71926760, "step": 106710 }, { "epoch": 2.60706520411404, "grad_norm": 0.06458175182342529, "learning_rate": 1.0996432719416209e-06, "loss": 0.0627, "num_input_tokens_seen": 71930024, "step": 106715 }, { "epoch": 2.607187354945887, "grad_norm": 0.034823257476091385, "learning_rate": 1.0995584183873553e-06, "loss": 0.055, "num_input_tokens_seen": 71933096, "step": 106720 }, { "epoch": 2.6073095057777342, "grad_norm": 55.86451721191406, "learning_rate": 1.0994735641090742e-06, "loss": 0.1001, "num_input_tokens_seen": 71936552, "step": 106725 }, { "epoch": 2.6074316566095814, "grad_norm": 0.013018675148487091, "learning_rate": 1.0993887091073947e-06, "loss": 0.0001, "num_input_tokens_seen": 71939752, "step": 106730 }, { "epoch": 2.6075538074414286, "grad_norm": 11.976505279541016, "learning_rate": 1.0993038533829338e-06, "loss": 0.0597, "num_input_tokens_seen": 71943208, "step": 106735 }, { "epoch": 2.607675958273276, "grad_norm": 0.017002243548631668, "learning_rate": 1.0992189969363084e-06, "loss": 0.1481, "num_input_tokens_seen": 71946408, "step": 106740 }, { "epoch": 2.607798109105123, "grad_norm": 0.1695324033498764, "learning_rate": 1.099134139768136e-06, "loss": 0.0688, "num_input_tokens_seen": 71949864, "step": 106745 }, { "epoch": 2.60792025993697, "grad_norm": 18.079139709472656, "learning_rate": 1.0990492818790331e-06, "loss": 0.0416, "num_input_tokens_seen": 71952872, "step": 106750 }, { "epoch": 2.6080424107688174, "grad_norm": 0.03448851406574249, "learning_rate": 1.0989644232696174e-06, "loss": 0.0018, "num_input_tokens_seen": 71955944, "step": 106755 }, { "epoch": 2.6081645616006646, "grad_norm": 0.03426092118024826, "learning_rate": 1.0988795639405056e-06, "loss": 0.0482, "num_input_tokens_seen": 71959528, "step": 106760 }, { "epoch": 2.6082867124325118, "grad_norm": 33.893646240234375, "learning_rate": 1.0987947038923155e-06, "loss": 0.0607, "num_input_tokens_seen": 71962856, "step": 106765 }, { "epoch": 2.608408863264359, "grad_norm": 0.21278434991836548, "learning_rate": 1.0987098431256637e-06, "loss": 0.0006, "num_input_tokens_seen": 71966312, "step": 106770 }, { "epoch": 2.608531014096206, "grad_norm": 0.15520715713500977, "learning_rate": 1.098624981641167e-06, "loss": 0.0003, "num_input_tokens_seen": 71969384, "step": 106775 }, { "epoch": 2.6086531649280533, "grad_norm": 0.04019223898649216, "learning_rate": 1.0985401194394431e-06, "loss": 0.0861, "num_input_tokens_seen": 71973480, "step": 106780 }, { "epoch": 2.6087753157599005, "grad_norm": 0.09767397493124008, "learning_rate": 1.0984552565211087e-06, "loss": 0.0238, "num_input_tokens_seen": 71976744, "step": 106785 }, { "epoch": 2.6088974665917473, "grad_norm": 0.013539946638047695, "learning_rate": 1.0983703928867813e-06, "loss": 0.0192, "num_input_tokens_seen": 71979880, "step": 106790 }, { "epoch": 2.609019617423595, "grad_norm": 0.059728436172008514, "learning_rate": 1.098285528537078e-06, "loss": 0.0003, "num_input_tokens_seen": 71983144, "step": 106795 }, { "epoch": 2.6091417682554416, "grad_norm": 0.02229916676878929, "learning_rate": 1.098200663472616e-06, "loss": 0.0002, "num_input_tokens_seen": 71986536, "step": 106800 }, { "epoch": 2.609263919087289, "grad_norm": 0.042197611182928085, "learning_rate": 1.0981157976940124e-06, "loss": 0.0006, "num_input_tokens_seen": 71990184, "step": 106805 }, { "epoch": 2.609386069919136, "grad_norm": 0.20854344964027405, "learning_rate": 1.0980309312018841e-06, "loss": 0.0002, "num_input_tokens_seen": 71993320, "step": 106810 }, { "epoch": 2.609508220750983, "grad_norm": 18.07057762145996, "learning_rate": 1.0979460639968485e-06, "loss": 0.0633, "num_input_tokens_seen": 71996456, "step": 106815 }, { "epoch": 2.6096303715828304, "grad_norm": 0.38547414541244507, "learning_rate": 1.097861196079523e-06, "loss": 0.0005, "num_input_tokens_seen": 72000232, "step": 106820 }, { "epoch": 2.6097525224146776, "grad_norm": 0.01530816126614809, "learning_rate": 1.0977763274505244e-06, "loss": 0.0001, "num_input_tokens_seen": 72003368, "step": 106825 }, { "epoch": 2.609874673246525, "grad_norm": 0.022470971569418907, "learning_rate": 1.09769145811047e-06, "loss": 0.0448, "num_input_tokens_seen": 72006568, "step": 106830 }, { "epoch": 2.609996824078372, "grad_norm": 0.01738395355641842, "learning_rate": 1.0976065880599772e-06, "loss": 0.0073, "num_input_tokens_seen": 72009896, "step": 106835 }, { "epoch": 2.610118974910219, "grad_norm": 0.004830238409340382, "learning_rate": 1.097521717299663e-06, "loss": 0.0017, "num_input_tokens_seen": 72013352, "step": 106840 }, { "epoch": 2.6102411257420663, "grad_norm": 0.44913017749786377, "learning_rate": 1.0974368458301444e-06, "loss": 0.0055, "num_input_tokens_seen": 72017320, "step": 106845 }, { "epoch": 2.6103632765739135, "grad_norm": 0.005423199385404587, "learning_rate": 1.0973519736520392e-06, "loss": 0.0002, "num_input_tokens_seen": 72020520, "step": 106850 }, { "epoch": 2.6104854274057607, "grad_norm": 0.09631983935832977, "learning_rate": 1.0972671007659642e-06, "loss": 0.0536, "num_input_tokens_seen": 72023848, "step": 106855 }, { "epoch": 2.610607578237608, "grad_norm": 0.18404462933540344, "learning_rate": 1.0971822271725367e-06, "loss": 0.0004, "num_input_tokens_seen": 72027112, "step": 106860 }, { "epoch": 2.610729729069455, "grad_norm": 0.028065208345651627, "learning_rate": 1.0970973528723736e-06, "loss": 0.0671, "num_input_tokens_seen": 72030632, "step": 106865 }, { "epoch": 2.6108518799013023, "grad_norm": 0.21517078578472137, "learning_rate": 1.0970124778660928e-06, "loss": 0.0002, "num_input_tokens_seen": 72034280, "step": 106870 }, { "epoch": 2.610974030733149, "grad_norm": 0.14804969727993011, "learning_rate": 1.096927602154311e-06, "loss": 0.0008, "num_input_tokens_seen": 72037928, "step": 106875 }, { "epoch": 2.6110961815649967, "grad_norm": 0.0020192775409668684, "learning_rate": 1.0968427257376455e-06, "loss": 0.0001, "num_input_tokens_seen": 72041128, "step": 106880 }, { "epoch": 2.6112183323968434, "grad_norm": 0.004779881797730923, "learning_rate": 1.096757848616714e-06, "loss": 0.1136, "num_input_tokens_seen": 72045288, "step": 106885 }, { "epoch": 2.611340483228691, "grad_norm": 0.315485417842865, "learning_rate": 1.096672970792133e-06, "loss": 0.0686, "num_input_tokens_seen": 72048680, "step": 106890 }, { "epoch": 2.611462634060538, "grad_norm": 0.8199894428253174, "learning_rate": 1.0965880922645204e-06, "loss": 0.0818, "num_input_tokens_seen": 72052328, "step": 106895 }, { "epoch": 2.611584784892385, "grad_norm": 0.37913942337036133, "learning_rate": 1.0965032130344932e-06, "loss": 0.0543, "num_input_tokens_seen": 72055592, "step": 106900 }, { "epoch": 2.611706935724232, "grad_norm": 0.049211300909519196, "learning_rate": 1.0964183331026686e-06, "loss": 0.039, "num_input_tokens_seen": 72058920, "step": 106905 }, { "epoch": 2.6118290865560794, "grad_norm": 12.023048400878906, "learning_rate": 1.0963334524696637e-06, "loss": 0.038, "num_input_tokens_seen": 72062568, "step": 106910 }, { "epoch": 2.6119512373879266, "grad_norm": 0.03025723248720169, "learning_rate": 1.0962485711360963e-06, "loss": 0.0005, "num_input_tokens_seen": 72065960, "step": 106915 }, { "epoch": 2.6120733882197738, "grad_norm": 0.030587896704673767, "learning_rate": 1.0961636891025836e-06, "loss": 0.0006, "num_input_tokens_seen": 72069352, "step": 106920 }, { "epoch": 2.612195539051621, "grad_norm": 0.006601739674806595, "learning_rate": 1.0960788063697425e-06, "loss": 0.0421, "num_input_tokens_seen": 72072424, "step": 106925 }, { "epoch": 2.612317689883468, "grad_norm": 0.10948925465345383, "learning_rate": 1.0959939229381906e-06, "loss": 0.0334, "num_input_tokens_seen": 72075944, "step": 106930 }, { "epoch": 2.6124398407153153, "grad_norm": 0.01464875414967537, "learning_rate": 1.0959090388085448e-06, "loss": 0.0002, "num_input_tokens_seen": 72079464, "step": 106935 }, { "epoch": 2.6125619915471625, "grad_norm": 0.022532816976308823, "learning_rate": 1.0958241539814226e-06, "loss": 0.0002, "num_input_tokens_seen": 72082408, "step": 106940 }, { "epoch": 2.6126841423790097, "grad_norm": 0.08455182611942291, "learning_rate": 1.0957392684574415e-06, "loss": 0.0002, "num_input_tokens_seen": 72085928, "step": 106945 }, { "epoch": 2.612806293210857, "grad_norm": 0.02040986716747284, "learning_rate": 1.095654382237219e-06, "loss": 0.0555, "num_input_tokens_seen": 72089000, "step": 106950 }, { "epoch": 2.612928444042704, "grad_norm": 35.983070373535156, "learning_rate": 1.0955694953213718e-06, "loss": 0.0447, "num_input_tokens_seen": 72092456, "step": 106955 }, { "epoch": 2.613050594874551, "grad_norm": 0.011616901494562626, "learning_rate": 1.0954846077105178e-06, "loss": 0.0293, "num_input_tokens_seen": 72095720, "step": 106960 }, { "epoch": 2.6131727457063985, "grad_norm": 0.0024410607293248177, "learning_rate": 1.0953997194052738e-06, "loss": 0.0001, "num_input_tokens_seen": 72100136, "step": 106965 }, { "epoch": 2.613294896538245, "grad_norm": 0.21549752354621887, "learning_rate": 1.0953148304062575e-06, "loss": 0.0002, "num_input_tokens_seen": 72103208, "step": 106970 }, { "epoch": 2.613417047370093, "grad_norm": 27.977876663208008, "learning_rate": 1.095229940714086e-06, "loss": 0.0588, "num_input_tokens_seen": 72106856, "step": 106975 }, { "epoch": 2.6135391982019396, "grad_norm": 0.007714892737567425, "learning_rate": 1.0951450503293769e-06, "loss": 0.1241, "num_input_tokens_seen": 72110568, "step": 106980 }, { "epoch": 2.6136613490337868, "grad_norm": 0.7092371582984924, "learning_rate": 1.0950601592527473e-06, "loss": 0.0004, "num_input_tokens_seen": 72113512, "step": 106985 }, { "epoch": 2.613783499865634, "grad_norm": 0.025556981563568115, "learning_rate": 1.0949752674848144e-06, "loss": 0.0002, "num_input_tokens_seen": 72117032, "step": 106990 }, { "epoch": 2.613905650697481, "grad_norm": 0.08398312330245972, "learning_rate": 1.0948903750261962e-06, "loss": 0.0003, "num_input_tokens_seen": 72120360, "step": 106995 }, { "epoch": 2.6140278015293283, "grad_norm": 0.1594977229833603, "learning_rate": 1.0948054818775094e-06, "loss": 0.0755, "num_input_tokens_seen": 72123688, "step": 107000 }, { "epoch": 2.6141499523611755, "grad_norm": 0.054152004420757294, "learning_rate": 1.094720588039372e-06, "loss": 0.0001, "num_input_tokens_seen": 72126952, "step": 107005 }, { "epoch": 2.6142721031930227, "grad_norm": 0.09151487797498703, "learning_rate": 1.0946356935124006e-06, "loss": 0.0002, "num_input_tokens_seen": 72130472, "step": 107010 }, { "epoch": 2.61439425402487, "grad_norm": 14.935052871704102, "learning_rate": 1.0945507982972134e-06, "loss": 0.1374, "num_input_tokens_seen": 72134248, "step": 107015 }, { "epoch": 2.614516404856717, "grad_norm": 0.003952572587877512, "learning_rate": 1.0944659023944269e-06, "loss": 0.0003, "num_input_tokens_seen": 72137832, "step": 107020 }, { "epoch": 2.6146385556885643, "grad_norm": 0.05129333212971687, "learning_rate": 1.0943810058046591e-06, "loss": 0.0468, "num_input_tokens_seen": 72141096, "step": 107025 }, { "epoch": 2.6147607065204115, "grad_norm": 0.030844559893012047, "learning_rate": 1.0942961085285275e-06, "loss": 0.0003, "num_input_tokens_seen": 72144488, "step": 107030 }, { "epoch": 2.6148828573522587, "grad_norm": 0.01932377554476261, "learning_rate": 1.094211210566649e-06, "loss": 0.0328, "num_input_tokens_seen": 72147944, "step": 107035 }, { "epoch": 2.615005008184106, "grad_norm": 0.043806299567222595, "learning_rate": 1.0941263119196413e-06, "loss": 0.0216, "num_input_tokens_seen": 72151528, "step": 107040 }, { "epoch": 2.615127159015953, "grad_norm": 0.2381763905286789, "learning_rate": 1.094041412588122e-06, "loss": 0.0005, "num_input_tokens_seen": 72154984, "step": 107045 }, { "epoch": 2.6152493098478002, "grad_norm": 6.37664794921875, "learning_rate": 1.093956512572708e-06, "loss": 0.0319, "num_input_tokens_seen": 72157928, "step": 107050 }, { "epoch": 2.615371460679647, "grad_norm": 854.23046875, "learning_rate": 1.0938716118740167e-06, "loss": 0.102, "num_input_tokens_seen": 72161320, "step": 107055 }, { "epoch": 2.6154936115114946, "grad_norm": 20.003992080688477, "learning_rate": 1.0937867104926662e-06, "loss": 0.0676, "num_input_tokens_seen": 72164648, "step": 107060 }, { "epoch": 2.6156157623433414, "grad_norm": 0.01823362149298191, "learning_rate": 1.0937018084292731e-06, "loss": 0.1215, "num_input_tokens_seen": 72168040, "step": 107065 }, { "epoch": 2.615737913175189, "grad_norm": 0.014729354530572891, "learning_rate": 1.0936169056844556e-06, "loss": 0.0002, "num_input_tokens_seen": 72171432, "step": 107070 }, { "epoch": 2.6158600640070357, "grad_norm": 0.0513114258646965, "learning_rate": 1.093532002258831e-06, "loss": 0.0514, "num_input_tokens_seen": 72174824, "step": 107075 }, { "epoch": 2.615982214838883, "grad_norm": 0.05931246653199196, "learning_rate": 1.0934470981530162e-06, "loss": 0.0002, "num_input_tokens_seen": 72178280, "step": 107080 }, { "epoch": 2.61610436567073, "grad_norm": 0.026284227147698402, "learning_rate": 1.093362193367629e-06, "loss": 0.0679, "num_input_tokens_seen": 72181608, "step": 107085 }, { "epoch": 2.6162265165025773, "grad_norm": 0.32598185539245605, "learning_rate": 1.0932772879032868e-06, "loss": 0.0029, "num_input_tokens_seen": 72185320, "step": 107090 }, { "epoch": 2.6163486673344245, "grad_norm": 0.15179592370986938, "learning_rate": 1.0931923817606068e-06, "loss": 0.0907, "num_input_tokens_seen": 72188520, "step": 107095 }, { "epoch": 2.6164708181662717, "grad_norm": 0.4278791844844818, "learning_rate": 1.0931074749402072e-06, "loss": 0.0006, "num_input_tokens_seen": 72191976, "step": 107100 }, { "epoch": 2.616592968998119, "grad_norm": 178.62254333496094, "learning_rate": 1.0930225674427047e-06, "loss": 0.0831, "num_input_tokens_seen": 72195176, "step": 107105 }, { "epoch": 2.616715119829966, "grad_norm": 0.04899469390511513, "learning_rate": 1.0929376592687173e-06, "loss": 0.0003, "num_input_tokens_seen": 72198312, "step": 107110 }, { "epoch": 2.6168372706618133, "grad_norm": 0.10489187389612198, "learning_rate": 1.092852750418862e-06, "loss": 0.0003, "num_input_tokens_seen": 72201640, "step": 107115 }, { "epoch": 2.6169594214936605, "grad_norm": 0.059486132115125656, "learning_rate": 1.092767840893757e-06, "loss": 0.0884, "num_input_tokens_seen": 72205032, "step": 107120 }, { "epoch": 2.6170815723255076, "grad_norm": 0.14339995384216309, "learning_rate": 1.0926829306940186e-06, "loss": 0.0003, "num_input_tokens_seen": 72208296, "step": 107125 }, { "epoch": 2.617203723157355, "grad_norm": 21.472740173339844, "learning_rate": 1.0925980198202655e-06, "loss": 0.0576, "num_input_tokens_seen": 72211624, "step": 107130 }, { "epoch": 2.617325873989202, "grad_norm": 22.353364944458008, "learning_rate": 1.0925131082731146e-06, "loss": 0.0413, "num_input_tokens_seen": 72214760, "step": 107135 }, { "epoch": 2.6174480248210488, "grad_norm": 0.08398572355508804, "learning_rate": 1.0924281960531834e-06, "loss": 0.0006, "num_input_tokens_seen": 72218088, "step": 107140 }, { "epoch": 2.6175701756528964, "grad_norm": 0.05433683097362518, "learning_rate": 1.0923432831610897e-06, "loss": 0.125, "num_input_tokens_seen": 72221864, "step": 107145 }, { "epoch": 2.617692326484743, "grad_norm": 0.034245796501636505, "learning_rate": 1.0922583695974506e-06, "loss": 0.0002, "num_input_tokens_seen": 72225384, "step": 107150 }, { "epoch": 2.617814477316591, "grad_norm": 0.08957494050264359, "learning_rate": 1.0921734553628836e-06, "loss": 0.001, "num_input_tokens_seen": 72228712, "step": 107155 }, { "epoch": 2.6179366281484375, "grad_norm": 0.055757008492946625, "learning_rate": 1.0920885404580066e-06, "loss": 0.0004, "num_input_tokens_seen": 72232168, "step": 107160 }, { "epoch": 2.6180587789802847, "grad_norm": 22.70863914489746, "learning_rate": 1.0920036248834373e-06, "loss": 0.0667, "num_input_tokens_seen": 72235112, "step": 107165 }, { "epoch": 2.618180929812132, "grad_norm": 0.015680965036153793, "learning_rate": 1.0919187086397928e-06, "loss": 0.1277, "num_input_tokens_seen": 72238824, "step": 107170 }, { "epoch": 2.618303080643979, "grad_norm": 33.95769119262695, "learning_rate": 1.0918337917276906e-06, "loss": 0.0355, "num_input_tokens_seen": 72242024, "step": 107175 }, { "epoch": 2.6184252314758263, "grad_norm": 0.04578560218214989, "learning_rate": 1.0917488741477483e-06, "loss": 0.0037, "num_input_tokens_seen": 72245032, "step": 107180 }, { "epoch": 2.6185473823076735, "grad_norm": 25.26980209350586, "learning_rate": 1.0916639559005837e-06, "loss": 0.0374, "num_input_tokens_seen": 72248360, "step": 107185 }, { "epoch": 2.6186695331395207, "grad_norm": 14.420616149902344, "learning_rate": 1.091579036986814e-06, "loss": 0.0883, "num_input_tokens_seen": 72251368, "step": 107190 }, { "epoch": 2.618791683971368, "grad_norm": 0.02755843475461006, "learning_rate": 1.091494117407057e-06, "loss": 0.165, "num_input_tokens_seen": 72254440, "step": 107195 }, { "epoch": 2.618913834803215, "grad_norm": 0.6943149566650391, "learning_rate": 1.0914091971619299e-06, "loss": 0.0005, "num_input_tokens_seen": 72257256, "step": 107200 }, { "epoch": 2.6190359856350622, "grad_norm": 0.029792679473757744, "learning_rate": 1.091324276252051e-06, "loss": 0.065, "num_input_tokens_seen": 72260328, "step": 107205 }, { "epoch": 2.6191581364669094, "grad_norm": 0.030234433710575104, "learning_rate": 1.091239354678037e-06, "loss": 0.0003, "num_input_tokens_seen": 72263912, "step": 107210 }, { "epoch": 2.6192802872987566, "grad_norm": 0.08542269468307495, "learning_rate": 1.091154432440506e-06, "loss": 0.0407, "num_input_tokens_seen": 72267496, "step": 107215 }, { "epoch": 2.619402438130604, "grad_norm": 0.0761958509683609, "learning_rate": 1.0910695095400753e-06, "loss": 0.1219, "num_input_tokens_seen": 72270696, "step": 107220 }, { "epoch": 2.619524588962451, "grad_norm": 1.5762901306152344, "learning_rate": 1.0909845859773628e-06, "loss": 0.042, "num_input_tokens_seen": 72273768, "step": 107225 }, { "epoch": 2.619646739794298, "grad_norm": 0.00965029839426279, "learning_rate": 1.0908996617529862e-06, "loss": 0.059, "num_input_tokens_seen": 72277096, "step": 107230 }, { "epoch": 2.619768890626145, "grad_norm": 0.6805073022842407, "learning_rate": 1.0908147368675626e-06, "loss": 0.0013, "num_input_tokens_seen": 72280488, "step": 107235 }, { "epoch": 2.6198910414579926, "grad_norm": 0.017918897792696953, "learning_rate": 1.09072981132171e-06, "loss": 0.0383, "num_input_tokens_seen": 72284136, "step": 107240 }, { "epoch": 2.6200131922898393, "grad_norm": 0.12319338321685791, "learning_rate": 1.0906448851160453e-06, "loss": 0.0349, "num_input_tokens_seen": 72287720, "step": 107245 }, { "epoch": 2.6201353431216865, "grad_norm": 0.071799635887146, "learning_rate": 1.0905599582511866e-06, "loss": 0.0361, "num_input_tokens_seen": 72291176, "step": 107250 }, { "epoch": 2.6202574939535337, "grad_norm": 0.03280222788453102, "learning_rate": 1.0904750307277519e-06, "loss": 0.0229, "num_input_tokens_seen": 72294888, "step": 107255 }, { "epoch": 2.620379644785381, "grad_norm": 0.03872086480259895, "learning_rate": 1.0903901025463581e-06, "loss": 0.0008, "num_input_tokens_seen": 72298408, "step": 107260 }, { "epoch": 2.620501795617228, "grad_norm": 49.3137092590332, "learning_rate": 1.0903051737076236e-06, "loss": 0.0814, "num_input_tokens_seen": 72302824, "step": 107265 }, { "epoch": 2.6206239464490753, "grad_norm": 0.19232575595378876, "learning_rate": 1.0902202442121654e-06, "loss": 0.03, "num_input_tokens_seen": 72306344, "step": 107270 }, { "epoch": 2.6207460972809224, "grad_norm": 0.03860234469175339, "learning_rate": 1.0901353140606013e-06, "loss": 0.0003, "num_input_tokens_seen": 72309288, "step": 107275 }, { "epoch": 2.6208682481127696, "grad_norm": 0.08553210645914078, "learning_rate": 1.090050383253549e-06, "loss": 0.0002, "num_input_tokens_seen": 72312744, "step": 107280 }, { "epoch": 2.620990398944617, "grad_norm": 0.043976765125989914, "learning_rate": 1.089965451791626e-06, "loss": 0.1156, "num_input_tokens_seen": 72315944, "step": 107285 }, { "epoch": 2.621112549776464, "grad_norm": 0.017137890681624413, "learning_rate": 1.0898805196754502e-06, "loss": 0.0001, "num_input_tokens_seen": 72319336, "step": 107290 }, { "epoch": 2.621234700608311, "grad_norm": 0.22604624927043915, "learning_rate": 1.089795586905639e-06, "loss": 0.0004, "num_input_tokens_seen": 72323240, "step": 107295 }, { "epoch": 2.6213568514401584, "grad_norm": 0.04105839133262634, "learning_rate": 1.08971065348281e-06, "loss": 0.0003, "num_input_tokens_seen": 72326312, "step": 107300 }, { "epoch": 2.6214790022720056, "grad_norm": 0.1574070006608963, "learning_rate": 1.0896257194075812e-06, "loss": 0.0003, "num_input_tokens_seen": 72329512, "step": 107305 }, { "epoch": 2.6216011531038528, "grad_norm": 0.03792516887187958, "learning_rate": 1.0895407846805698e-06, "loss": 0.0728, "num_input_tokens_seen": 72332776, "step": 107310 }, { "epoch": 2.6217233039357, "grad_norm": 0.0181158147752285, "learning_rate": 1.0894558493023937e-06, "loss": 0.0272, "num_input_tokens_seen": 72335848, "step": 107315 }, { "epoch": 2.6218454547675467, "grad_norm": 108.81785583496094, "learning_rate": 1.089370913273671e-06, "loss": 0.1086, "num_input_tokens_seen": 72339432, "step": 107320 }, { "epoch": 2.6219676055993943, "grad_norm": 0.1919240802526474, "learning_rate": 1.0892859765950187e-06, "loss": 0.0533, "num_input_tokens_seen": 72342440, "step": 107325 }, { "epoch": 2.622089756431241, "grad_norm": 0.21514792740345, "learning_rate": 1.089201039267055e-06, "loss": 0.001, "num_input_tokens_seen": 72345384, "step": 107330 }, { "epoch": 2.6222119072630887, "grad_norm": 0.057491566985845566, "learning_rate": 1.0891161012903971e-06, "loss": 0.0707, "num_input_tokens_seen": 72348264, "step": 107335 }, { "epoch": 2.6223340580949355, "grad_norm": 0.008800412528216839, "learning_rate": 1.0890311626656631e-06, "loss": 0.0004, "num_input_tokens_seen": 72351400, "step": 107340 }, { "epoch": 2.6224562089267827, "grad_norm": 0.04608434811234474, "learning_rate": 1.0889462233934704e-06, "loss": 0.0444, "num_input_tokens_seen": 72354792, "step": 107345 }, { "epoch": 2.62257835975863, "grad_norm": 1.220207691192627, "learning_rate": 1.088861283474437e-06, "loss": 0.0006, "num_input_tokens_seen": 72359016, "step": 107350 }, { "epoch": 2.622700510590477, "grad_norm": 0.06609610468149185, "learning_rate": 1.0887763429091804e-06, "loss": 0.0004, "num_input_tokens_seen": 72361960, "step": 107355 }, { "epoch": 2.6228226614223242, "grad_norm": 1.9949488639831543, "learning_rate": 1.0886914016983182e-06, "loss": 0.0087, "num_input_tokens_seen": 72365864, "step": 107360 }, { "epoch": 2.6229448122541714, "grad_norm": 0.1548883318901062, "learning_rate": 1.0886064598424684e-06, "loss": 0.0458, "num_input_tokens_seen": 72369192, "step": 107365 }, { "epoch": 2.6230669630860186, "grad_norm": 33.023460388183594, "learning_rate": 1.0885215173422486e-06, "loss": 0.038, "num_input_tokens_seen": 72372264, "step": 107370 }, { "epoch": 2.623189113917866, "grad_norm": 1.717215895652771, "learning_rate": 1.0884365741982764e-06, "loss": 0.0536, "num_input_tokens_seen": 72375528, "step": 107375 }, { "epoch": 2.623311264749713, "grad_norm": 0.05810878798365593, "learning_rate": 1.0883516304111698e-06, "loss": 0.0377, "num_input_tokens_seen": 72378792, "step": 107380 }, { "epoch": 2.62343341558156, "grad_norm": 0.034050095826387405, "learning_rate": 1.0882666859815466e-06, "loss": 0.0281, "num_input_tokens_seen": 72382312, "step": 107385 }, { "epoch": 2.6235555664134074, "grad_norm": 4.691112041473389, "learning_rate": 1.088181740910024e-06, "loss": 0.0007, "num_input_tokens_seen": 72385576, "step": 107390 }, { "epoch": 2.6236777172452546, "grad_norm": 0.07725610584020615, "learning_rate": 1.0880967951972201e-06, "loss": 0.0006, "num_input_tokens_seen": 72388968, "step": 107395 }, { "epoch": 2.6237998680771017, "grad_norm": 0.024160612374544144, "learning_rate": 1.0880118488437526e-06, "loss": 0.0001, "num_input_tokens_seen": 72392232, "step": 107400 }, { "epoch": 2.6239220189089485, "grad_norm": 0.024952422827482224, "learning_rate": 1.087926901850239e-06, "loss": 0.0431, "num_input_tokens_seen": 72395304, "step": 107405 }, { "epoch": 2.624044169740796, "grad_norm": 0.22318053245544434, "learning_rate": 1.0878419542172975e-06, "loss": 0.0003, "num_input_tokens_seen": 72398824, "step": 107410 }, { "epoch": 2.624166320572643, "grad_norm": 0.014739875681698322, "learning_rate": 1.087757005945546e-06, "loss": 0.0326, "num_input_tokens_seen": 72401768, "step": 107415 }, { "epoch": 2.6242884714044905, "grad_norm": 0.028513239696621895, "learning_rate": 1.0876720570356018e-06, "loss": 0.0902, "num_input_tokens_seen": 72405096, "step": 107420 }, { "epoch": 2.6244106222363373, "grad_norm": 0.0022885762155056, "learning_rate": 1.0875871074880827e-06, "loss": 0.0002, "num_input_tokens_seen": 72408168, "step": 107425 }, { "epoch": 2.6245327730681844, "grad_norm": 0.11882738023996353, "learning_rate": 1.0875021573036067e-06, "loss": 0.0002, "num_input_tokens_seen": 72411368, "step": 107430 }, { "epoch": 2.6246549239000316, "grad_norm": 0.01739366352558136, "learning_rate": 1.0874172064827913e-06, "loss": 0.0367, "num_input_tokens_seen": 72415080, "step": 107435 }, { "epoch": 2.624777074731879, "grad_norm": 0.004528645426034927, "learning_rate": 1.0873322550262548e-06, "loss": 0.104, "num_input_tokens_seen": 72418408, "step": 107440 }, { "epoch": 2.624899225563726, "grad_norm": 0.21300160884857178, "learning_rate": 1.0872473029346143e-06, "loss": 0.0003, "num_input_tokens_seen": 72422184, "step": 107445 }, { "epoch": 2.625021376395573, "grad_norm": 0.03288082405924797, "learning_rate": 1.0871623502084882e-06, "loss": 0.0004, "num_input_tokens_seen": 72425768, "step": 107450 }, { "epoch": 2.6251435272274204, "grad_norm": 0.010147850960493088, "learning_rate": 1.087077396848494e-06, "loss": 0.0001, "num_input_tokens_seen": 72429160, "step": 107455 }, { "epoch": 2.6252656780592676, "grad_norm": 34.80997085571289, "learning_rate": 1.0869924428552492e-06, "loss": 0.1302, "num_input_tokens_seen": 72432424, "step": 107460 }, { "epoch": 2.6253878288911148, "grad_norm": 0.039994291961193085, "learning_rate": 1.0869074882293723e-06, "loss": 0.0571, "num_input_tokens_seen": 72435432, "step": 107465 }, { "epoch": 2.625509979722962, "grad_norm": 0.016550660133361816, "learning_rate": 1.0868225329714806e-06, "loss": 0.0333, "num_input_tokens_seen": 72438824, "step": 107470 }, { "epoch": 2.625632130554809, "grad_norm": 0.02417152374982834, "learning_rate": 1.0867375770821922e-06, "loss": 0.0397, "num_input_tokens_seen": 72442472, "step": 107475 }, { "epoch": 2.6257542813866563, "grad_norm": 0.0034279574174433947, "learning_rate": 1.086652620562125e-06, "loss": 0.0001, "num_input_tokens_seen": 72445864, "step": 107480 }, { "epoch": 2.6258764322185035, "grad_norm": 0.010594514198601246, "learning_rate": 1.0865676634118963e-06, "loss": 0.0005, "num_input_tokens_seen": 72449256, "step": 107485 }, { "epoch": 2.6259985830503507, "grad_norm": 76.29888916015625, "learning_rate": 1.0864827056321243e-06, "loss": 0.068, "num_input_tokens_seen": 72452328, "step": 107490 }, { "epoch": 2.626120733882198, "grad_norm": 0.001311702886596322, "learning_rate": 1.0863977472234275e-06, "loss": 0.0141, "num_input_tokens_seen": 72455720, "step": 107495 }, { "epoch": 2.6262428847140447, "grad_norm": 0.26813095808029175, "learning_rate": 1.086312788186422e-06, "loss": 0.0773, "num_input_tokens_seen": 72458856, "step": 107500 }, { "epoch": 2.6263650355458923, "grad_norm": 0.1785283088684082, "learning_rate": 1.0862278285217272e-06, "loss": 0.0408, "num_input_tokens_seen": 72462184, "step": 107505 }, { "epoch": 2.626487186377739, "grad_norm": 0.20264551043510437, "learning_rate": 1.0861428682299605e-06, "loss": 0.0002, "num_input_tokens_seen": 72465576, "step": 107510 }, { "epoch": 2.6266093372095867, "grad_norm": 0.018418628722429276, "learning_rate": 1.0860579073117394e-06, "loss": 0.0005, "num_input_tokens_seen": 72469096, "step": 107515 }, { "epoch": 2.6267314880414334, "grad_norm": 0.007999802008271217, "learning_rate": 1.0859729457676823e-06, "loss": 0.0541, "num_input_tokens_seen": 72473064, "step": 107520 }, { "epoch": 2.6268536388732806, "grad_norm": 0.008016117848455906, "learning_rate": 1.0858879835984067e-06, "loss": 0.0007, "num_input_tokens_seen": 72476392, "step": 107525 }, { "epoch": 2.626975789705128, "grad_norm": 0.2861591875553131, "learning_rate": 1.0858030208045305e-06, "loss": 0.0004, "num_input_tokens_seen": 72479784, "step": 107530 }, { "epoch": 2.627097940536975, "grad_norm": 0.6544962525367737, "learning_rate": 1.0857180573866718e-06, "loss": 0.0005, "num_input_tokens_seen": 72482984, "step": 107535 }, { "epoch": 2.627220091368822, "grad_norm": 0.2255128175020218, "learning_rate": 1.0856330933454485e-06, "loss": 0.0002, "num_input_tokens_seen": 72486056, "step": 107540 }, { "epoch": 2.6273422422006694, "grad_norm": 0.01406821422278881, "learning_rate": 1.0855481286814781e-06, "loss": 0.0002, "num_input_tokens_seen": 72489320, "step": 107545 }, { "epoch": 2.6274643930325166, "grad_norm": 0.007307102438062429, "learning_rate": 1.0854631633953788e-06, "loss": 0.0526, "num_input_tokens_seen": 72492392, "step": 107550 }, { "epoch": 2.6275865438643637, "grad_norm": 0.0460895299911499, "learning_rate": 1.0853781974877682e-06, "loss": 0.0002, "num_input_tokens_seen": 72495720, "step": 107555 }, { "epoch": 2.627708694696211, "grad_norm": 0.012345832772552967, "learning_rate": 1.0852932309592644e-06, "loss": 0.0, "num_input_tokens_seen": 72498792, "step": 107560 }, { "epoch": 2.627830845528058, "grad_norm": 0.3947714865207672, "learning_rate": 1.0852082638104854e-06, "loss": 0.0453, "num_input_tokens_seen": 72501928, "step": 107565 }, { "epoch": 2.6279529963599053, "grad_norm": 23.10430908203125, "learning_rate": 1.0851232960420488e-06, "loss": 0.0501, "num_input_tokens_seen": 72505384, "step": 107570 }, { "epoch": 2.6280751471917525, "grad_norm": 13.216294288635254, "learning_rate": 1.085038327654573e-06, "loss": 0.0009, "num_input_tokens_seen": 72508328, "step": 107575 }, { "epoch": 2.6281972980235997, "grad_norm": 0.05808434635400772, "learning_rate": 1.0849533586486754e-06, "loss": 0.0004, "num_input_tokens_seen": 72511528, "step": 107580 }, { "epoch": 2.6283194488554464, "grad_norm": 0.1069037988781929, "learning_rate": 1.0848683890249743e-06, "loss": 0.0138, "num_input_tokens_seen": 72514664, "step": 107585 }, { "epoch": 2.628441599687294, "grad_norm": 0.009551643393933773, "learning_rate": 1.0847834187840873e-06, "loss": 0.0008, "num_input_tokens_seen": 72518312, "step": 107590 }, { "epoch": 2.628563750519141, "grad_norm": 0.036050669848918915, "learning_rate": 1.0846984479266326e-06, "loss": 0.0001, "num_input_tokens_seen": 72521640, "step": 107595 }, { "epoch": 2.6286859013509885, "grad_norm": 14.879709243774414, "learning_rate": 1.084613476453228e-06, "loss": 0.0467, "num_input_tokens_seen": 72525096, "step": 107600 }, { "epoch": 2.628808052182835, "grad_norm": 0.016211412847042084, "learning_rate": 1.0845285043644914e-06, "loss": 0.0555, "num_input_tokens_seen": 72529512, "step": 107605 }, { "epoch": 2.6289302030146824, "grad_norm": 0.017843419685959816, "learning_rate": 1.0844435316610408e-06, "loss": 0.0999, "num_input_tokens_seen": 72533224, "step": 107610 }, { "epoch": 2.6290523538465296, "grad_norm": 0.689307451248169, "learning_rate": 1.084358558343494e-06, "loss": 0.0004, "num_input_tokens_seen": 72536552, "step": 107615 }, { "epoch": 2.6291745046783768, "grad_norm": 0.06051316112279892, "learning_rate": 1.084273584412469e-06, "loss": 0.0028, "num_input_tokens_seen": 72540264, "step": 107620 }, { "epoch": 2.629296655510224, "grad_norm": 134.1039581298828, "learning_rate": 1.084188609868584e-06, "loss": 0.0367, "num_input_tokens_seen": 72543464, "step": 107625 }, { "epoch": 2.629418806342071, "grad_norm": 0.1083790734410286, "learning_rate": 1.0841036347124565e-06, "loss": 0.0459, "num_input_tokens_seen": 72546408, "step": 107630 }, { "epoch": 2.6295409571739183, "grad_norm": 0.1749052256345749, "learning_rate": 1.0840186589447052e-06, "loss": 0.0002, "num_input_tokens_seen": 72550568, "step": 107635 }, { "epoch": 2.6296631080057655, "grad_norm": 0.0020598629489541054, "learning_rate": 1.0839336825659473e-06, "loss": 0.0001, "num_input_tokens_seen": 72553896, "step": 107640 }, { "epoch": 2.6297852588376127, "grad_norm": 0.5078147053718567, "learning_rate": 1.0838487055768013e-06, "loss": 0.0003, "num_input_tokens_seen": 72557480, "step": 107645 }, { "epoch": 2.62990740966946, "grad_norm": 17.110179901123047, "learning_rate": 1.083763727977885e-06, "loss": 0.1545, "num_input_tokens_seen": 72560808, "step": 107650 }, { "epoch": 2.630029560501307, "grad_norm": 38.41392517089844, "learning_rate": 1.0836787497698161e-06, "loss": 0.1782, "num_input_tokens_seen": 72564328, "step": 107655 }, { "epoch": 2.6301517113331543, "grad_norm": 0.02592952363193035, "learning_rate": 1.0835937709532131e-06, "loss": 0.0003, "num_input_tokens_seen": 72567272, "step": 107660 }, { "epoch": 2.6302738621650015, "grad_norm": 0.01741073466837406, "learning_rate": 1.0835087915286933e-06, "loss": 0.0573, "num_input_tokens_seen": 72570472, "step": 107665 }, { "epoch": 2.6303960129968487, "grad_norm": 0.011544224806129932, "learning_rate": 1.0834238114968754e-06, "loss": 0.0001, "num_input_tokens_seen": 72573672, "step": 107670 }, { "epoch": 2.630518163828696, "grad_norm": 0.14928054809570312, "learning_rate": 1.0833388308583772e-06, "loss": 0.1061, "num_input_tokens_seen": 72576616, "step": 107675 }, { "epoch": 2.6306403146605426, "grad_norm": 10.083694458007812, "learning_rate": 1.0832538496138165e-06, "loss": 0.0239, "num_input_tokens_seen": 72580392, "step": 107680 }, { "epoch": 2.6307624654923902, "grad_norm": 0.007448187563568354, "learning_rate": 1.0831688677638112e-06, "loss": 0.0002, "num_input_tokens_seen": 72583464, "step": 107685 }, { "epoch": 2.630884616324237, "grad_norm": 0.06917484849691391, "learning_rate": 1.0830838853089796e-06, "loss": 0.0003, "num_input_tokens_seen": 72586984, "step": 107690 }, { "epoch": 2.631006767156084, "grad_norm": 0.04246160760521889, "learning_rate": 1.08299890224994e-06, "loss": 0.0636, "num_input_tokens_seen": 72589672, "step": 107695 }, { "epoch": 2.6311289179879314, "grad_norm": 0.4987105131149292, "learning_rate": 1.0829139185873097e-06, "loss": 0.0014, "num_input_tokens_seen": 72593320, "step": 107700 }, { "epoch": 2.6312510688197785, "grad_norm": 1.0627728700637817, "learning_rate": 1.082828934321707e-06, "loss": 0.0544, "num_input_tokens_seen": 72596904, "step": 107705 }, { "epoch": 2.6313732196516257, "grad_norm": 0.0632128193974495, "learning_rate": 1.0827439494537506e-06, "loss": 0.107, "num_input_tokens_seen": 72600360, "step": 107710 }, { "epoch": 2.631495370483473, "grad_norm": 0.21518582105636597, "learning_rate": 1.0826589639840572e-06, "loss": 0.0938, "num_input_tokens_seen": 72603816, "step": 107715 }, { "epoch": 2.63161752131532, "grad_norm": 0.013827536255121231, "learning_rate": 1.082573977913246e-06, "loss": 0.0401, "num_input_tokens_seen": 72607400, "step": 107720 }, { "epoch": 2.6317396721471673, "grad_norm": 0.029351606965065002, "learning_rate": 1.0824889912419344e-06, "loss": 0.0001, "num_input_tokens_seen": 72610600, "step": 107725 }, { "epoch": 2.6318618229790145, "grad_norm": 0.01663423702120781, "learning_rate": 1.0824040039707409e-06, "loss": 0.007, "num_input_tokens_seen": 72614120, "step": 107730 }, { "epoch": 2.6319839738108617, "grad_norm": 0.01716860942542553, "learning_rate": 1.0823190161002834e-06, "loss": 0.1447, "num_input_tokens_seen": 72617192, "step": 107735 }, { "epoch": 2.632106124642709, "grad_norm": 0.025707658380270004, "learning_rate": 1.0822340276311794e-06, "loss": 0.0502, "num_input_tokens_seen": 72620968, "step": 107740 }, { "epoch": 2.632228275474556, "grad_norm": 0.03293826803565025, "learning_rate": 1.082149038564048e-06, "loss": 0.0001, "num_input_tokens_seen": 72624488, "step": 107745 }, { "epoch": 2.6323504263064033, "grad_norm": 0.02947530522942543, "learning_rate": 1.0820640488995062e-06, "loss": 0.0002, "num_input_tokens_seen": 72627816, "step": 107750 }, { "epoch": 2.6324725771382504, "grad_norm": 0.006064974702894688, "learning_rate": 1.0819790586381729e-06, "loss": 0.0009, "num_input_tokens_seen": 72631080, "step": 107755 }, { "epoch": 2.6325947279700976, "grad_norm": 0.02745603770017624, "learning_rate": 1.0818940677806657e-06, "loss": 0.0577, "num_input_tokens_seen": 72634664, "step": 107760 }, { "epoch": 2.6327168788019444, "grad_norm": 0.06063401699066162, "learning_rate": 1.081809076327603e-06, "loss": 0.0003, "num_input_tokens_seen": 72638376, "step": 107765 }, { "epoch": 2.632839029633792, "grad_norm": 0.10956387221813202, "learning_rate": 1.0817240842796025e-06, "loss": 0.0004, "num_input_tokens_seen": 72641832, "step": 107770 }, { "epoch": 2.6329611804656388, "grad_norm": 0.45911097526550293, "learning_rate": 1.0816390916372824e-06, "loss": 0.0004, "num_input_tokens_seen": 72645096, "step": 107775 }, { "epoch": 2.6330833312974864, "grad_norm": 0.06135956570506096, "learning_rate": 1.0815540984012608e-06, "loss": 0.0003, "num_input_tokens_seen": 72650280, "step": 107780 }, { "epoch": 2.633205482129333, "grad_norm": 0.0062957340851426125, "learning_rate": 1.081469104572156e-06, "loss": 0.0002, "num_input_tokens_seen": 72653352, "step": 107785 }, { "epoch": 2.6333276329611803, "grad_norm": 18.732004165649414, "learning_rate": 1.081384110150586e-06, "loss": 0.0716, "num_input_tokens_seen": 72656488, "step": 107790 }, { "epoch": 2.6334497837930275, "grad_norm": 0.02306182123720646, "learning_rate": 1.0812991151371691e-06, "loss": 0.0003, "num_input_tokens_seen": 72659752, "step": 107795 }, { "epoch": 2.6335719346248747, "grad_norm": 0.43138134479522705, "learning_rate": 1.0812141195325228e-06, "loss": 0.1924, "num_input_tokens_seen": 72663208, "step": 107800 }, { "epoch": 2.633694085456722, "grad_norm": 0.010642552748322487, "learning_rate": 1.0811291233372659e-06, "loss": 0.0001, "num_input_tokens_seen": 72666472, "step": 107805 }, { "epoch": 2.633816236288569, "grad_norm": 12.951112747192383, "learning_rate": 1.081044126552016e-06, "loss": 0.0924, "num_input_tokens_seen": 72669800, "step": 107810 }, { "epoch": 2.6339383871204163, "grad_norm": 18.641759872436523, "learning_rate": 1.0809591291773913e-06, "loss": 0.0457, "num_input_tokens_seen": 72673000, "step": 107815 }, { "epoch": 2.6340605379522635, "grad_norm": 0.13156826794147491, "learning_rate": 1.0808741312140103e-06, "loss": 0.0397, "num_input_tokens_seen": 72676584, "step": 107820 }, { "epoch": 2.6341826887841107, "grad_norm": 0.010222331620752811, "learning_rate": 1.0807891326624906e-06, "loss": 0.0004, "num_input_tokens_seen": 72680936, "step": 107825 }, { "epoch": 2.634304839615958, "grad_norm": 0.03131281957030296, "learning_rate": 1.0807041335234508e-06, "loss": 0.0002, "num_input_tokens_seen": 72684264, "step": 107830 }, { "epoch": 2.634426990447805, "grad_norm": 0.2757070064544678, "learning_rate": 1.0806191337975085e-06, "loss": 0.0007, "num_input_tokens_seen": 72687720, "step": 107835 }, { "epoch": 2.6345491412796522, "grad_norm": 0.707978367805481, "learning_rate": 1.0805341334852824e-06, "loss": 0.0407, "num_input_tokens_seen": 72691240, "step": 107840 }, { "epoch": 2.6346712921114994, "grad_norm": 0.02468128129839897, "learning_rate": 1.0804491325873904e-06, "loss": 0.1027, "num_input_tokens_seen": 72694440, "step": 107845 }, { "epoch": 2.6347934429433466, "grad_norm": 0.03739924356341362, "learning_rate": 1.0803641311044507e-06, "loss": 0.0001, "num_input_tokens_seen": 72698088, "step": 107850 }, { "epoch": 2.634915593775194, "grad_norm": 0.29329103231430054, "learning_rate": 1.0802791290370819e-06, "loss": 0.1029, "num_input_tokens_seen": 72701864, "step": 107855 }, { "epoch": 2.6350377446070405, "grad_norm": 0.01383211649954319, "learning_rate": 1.0801941263859011e-06, "loss": 0.0001, "num_input_tokens_seen": 72705256, "step": 107860 }, { "epoch": 2.635159895438888, "grad_norm": 0.04233228415250778, "learning_rate": 1.0801091231515272e-06, "loss": 0.0008, "num_input_tokens_seen": 72708328, "step": 107865 }, { "epoch": 2.635282046270735, "grad_norm": 19.92087173461914, "learning_rate": 1.0800241193345778e-06, "loss": 0.0444, "num_input_tokens_seen": 72712040, "step": 107870 }, { "epoch": 2.635404197102582, "grad_norm": 0.02420320361852646, "learning_rate": 1.079939114935672e-06, "loss": 0.0004, "num_input_tokens_seen": 72715944, "step": 107875 }, { "epoch": 2.6355263479344293, "grad_norm": 0.03484322503209114, "learning_rate": 1.0798541099554272e-06, "loss": 0.0002, "num_input_tokens_seen": 72719080, "step": 107880 }, { "epoch": 2.6356484987662765, "grad_norm": 0.13985350728034973, "learning_rate": 1.0797691043944622e-06, "loss": 0.0332, "num_input_tokens_seen": 72722088, "step": 107885 }, { "epoch": 2.6357706495981237, "grad_norm": 0.1291801631450653, "learning_rate": 1.0796840982533943e-06, "loss": 0.0018, "num_input_tokens_seen": 72725352, "step": 107890 }, { "epoch": 2.635892800429971, "grad_norm": 0.033996183425188065, "learning_rate": 1.0795990915328426e-06, "loss": 0.0002, "num_input_tokens_seen": 72728936, "step": 107895 }, { "epoch": 2.636014951261818, "grad_norm": 0.05727604776620865, "learning_rate": 1.0795140842334248e-06, "loss": 0.0652, "num_input_tokens_seen": 72732392, "step": 107900 }, { "epoch": 2.6361371020936653, "grad_norm": 0.021231159567832947, "learning_rate": 1.0794290763557591e-06, "loss": 0.0001, "num_input_tokens_seen": 72735656, "step": 107905 }, { "epoch": 2.6362592529255124, "grad_norm": 18.85205078125, "learning_rate": 1.0793440679004638e-06, "loss": 0.0377, "num_input_tokens_seen": 72738856, "step": 107910 }, { "epoch": 2.6363814037573596, "grad_norm": 0.13238240778446198, "learning_rate": 1.0792590588681572e-06, "loss": 0.0366, "num_input_tokens_seen": 72741992, "step": 107915 }, { "epoch": 2.636503554589207, "grad_norm": 48.16722106933594, "learning_rate": 1.0791740492594574e-06, "loss": 0.1173, "num_input_tokens_seen": 72745320, "step": 107920 }, { "epoch": 2.636625705421054, "grad_norm": 0.30411502718925476, "learning_rate": 1.0790890390749824e-06, "loss": 0.0003, "num_input_tokens_seen": 72748840, "step": 107925 }, { "epoch": 2.636747856252901, "grad_norm": 0.07888974249362946, "learning_rate": 1.079004028315351e-06, "loss": 0.0001, "num_input_tokens_seen": 72752040, "step": 107930 }, { "epoch": 2.6368700070847484, "grad_norm": 0.02969658188521862, "learning_rate": 1.0789190169811806e-06, "loss": 0.0002, "num_input_tokens_seen": 72755624, "step": 107935 }, { "epoch": 2.6369921579165956, "grad_norm": 0.010916121304035187, "learning_rate": 1.0788340050730902e-06, "loss": 0.0001, "num_input_tokens_seen": 72758952, "step": 107940 }, { "epoch": 2.6371143087484423, "grad_norm": 0.03490295261144638, "learning_rate": 1.0787489925916976e-06, "loss": 0.0457, "num_input_tokens_seen": 72762280, "step": 107945 }, { "epoch": 2.63723645958029, "grad_norm": 0.13596896827220917, "learning_rate": 1.0786639795376214e-06, "loss": 0.0002, "num_input_tokens_seen": 72765544, "step": 107950 }, { "epoch": 2.6373586104121367, "grad_norm": 0.014326742850244045, "learning_rate": 1.0785789659114792e-06, "loss": 0.0002, "num_input_tokens_seen": 72768808, "step": 107955 }, { "epoch": 2.6374807612439843, "grad_norm": 0.009243039414286613, "learning_rate": 1.07849395171389e-06, "loss": 0.0001, "num_input_tokens_seen": 72772328, "step": 107960 }, { "epoch": 2.637602912075831, "grad_norm": 0.3703426420688629, "learning_rate": 1.0784089369454714e-06, "loss": 0.0002, "num_input_tokens_seen": 72775208, "step": 107965 }, { "epoch": 2.6377250629076783, "grad_norm": 0.005901218391954899, "learning_rate": 1.0783239216068421e-06, "loss": 0.0001, "num_input_tokens_seen": 72778408, "step": 107970 }, { "epoch": 2.6378472137395255, "grad_norm": 345.4832763671875, "learning_rate": 1.07823890569862e-06, "loss": 0.021, "num_input_tokens_seen": 72781864, "step": 107975 }, { "epoch": 2.6379693645713727, "grad_norm": 0.02927909605205059, "learning_rate": 1.0781538892214235e-06, "loss": 0.0156, "num_input_tokens_seen": 72785448, "step": 107980 }, { "epoch": 2.63809151540322, "grad_norm": 0.013741977512836456, "learning_rate": 1.078068872175871e-06, "loss": 0.1625, "num_input_tokens_seen": 72788584, "step": 107985 }, { "epoch": 2.638213666235067, "grad_norm": 0.006222781725227833, "learning_rate": 1.0779838545625808e-06, "loss": 0.0002, "num_input_tokens_seen": 72791528, "step": 107990 }, { "epoch": 2.6383358170669142, "grad_norm": 0.019642196595668793, "learning_rate": 1.0778988363821706e-06, "loss": 0.0865, "num_input_tokens_seen": 72795688, "step": 107995 }, { "epoch": 2.6384579678987614, "grad_norm": 0.010990871116518974, "learning_rate": 1.0778138176352596e-06, "loss": 0.0782, "num_input_tokens_seen": 72798824, "step": 108000 }, { "epoch": 2.6385801187306086, "grad_norm": 0.030032608658075333, "learning_rate": 1.0777287983224652e-06, "loss": 0.0001, "num_input_tokens_seen": 72801960, "step": 108005 }, { "epoch": 2.638702269562456, "grad_norm": 0.013616411946713924, "learning_rate": 1.0776437784444065e-06, "loss": 0.0001, "num_input_tokens_seen": 72804968, "step": 108010 }, { "epoch": 2.638824420394303, "grad_norm": 69.19341278076172, "learning_rate": 1.0775587580017012e-06, "loss": 0.1758, "num_input_tokens_seen": 72808552, "step": 108015 }, { "epoch": 2.63894657122615, "grad_norm": 1.5476653575897217, "learning_rate": 1.0774737369949678e-06, "loss": 0.0349, "num_input_tokens_seen": 72811624, "step": 108020 }, { "epoch": 2.6390687220579974, "grad_norm": 0.01377920899540186, "learning_rate": 1.0773887154248242e-06, "loss": 0.0004, "num_input_tokens_seen": 72815720, "step": 108025 }, { "epoch": 2.639190872889844, "grad_norm": 23.38638687133789, "learning_rate": 1.0773036932918892e-06, "loss": 0.03, "num_input_tokens_seen": 72819112, "step": 108030 }, { "epoch": 2.6393130237216917, "grad_norm": 0.05645943433046341, "learning_rate": 1.077218670596781e-06, "loss": 0.0002, "num_input_tokens_seen": 72823016, "step": 108035 }, { "epoch": 2.6394351745535385, "grad_norm": 0.002317589009180665, "learning_rate": 1.077133647340118e-06, "loss": 0.0001, "num_input_tokens_seen": 72826472, "step": 108040 }, { "epoch": 2.639557325385386, "grad_norm": 29.722360610961914, "learning_rate": 1.0770486235225182e-06, "loss": 0.1396, "num_input_tokens_seen": 72829864, "step": 108045 }, { "epoch": 2.639679476217233, "grad_norm": 0.17399294674396515, "learning_rate": 1.0769635991446002e-06, "loss": 0.0001, "num_input_tokens_seen": 72833192, "step": 108050 }, { "epoch": 2.63980162704908, "grad_norm": 0.10186666995286942, "learning_rate": 1.076878574206982e-06, "loss": 0.0002, "num_input_tokens_seen": 72836392, "step": 108055 }, { "epoch": 2.6399237778809272, "grad_norm": 27.92131996154785, "learning_rate": 1.0767935487102823e-06, "loss": 0.0351, "num_input_tokens_seen": 72840168, "step": 108060 }, { "epoch": 2.6400459287127744, "grad_norm": 0.010424827225506306, "learning_rate": 1.0767085226551194e-06, "loss": 0.0398, "num_input_tokens_seen": 72843496, "step": 108065 }, { "epoch": 2.6401680795446216, "grad_norm": 0.05996089056134224, "learning_rate": 1.0766234960421112e-06, "loss": 0.0001, "num_input_tokens_seen": 72847144, "step": 108070 }, { "epoch": 2.640290230376469, "grad_norm": 0.2539304196834564, "learning_rate": 1.0765384688718766e-06, "loss": 0.0001, "num_input_tokens_seen": 72850280, "step": 108075 }, { "epoch": 2.640412381208316, "grad_norm": 0.01256766077131033, "learning_rate": 1.0764534411450334e-06, "loss": 0.0539, "num_input_tokens_seen": 72853736, "step": 108080 }, { "epoch": 2.640534532040163, "grad_norm": 0.045256491750478745, "learning_rate": 1.0763684128622003e-06, "loss": 0.0002, "num_input_tokens_seen": 72857192, "step": 108085 }, { "epoch": 2.6406566828720104, "grad_norm": 0.016109425574541092, "learning_rate": 1.0762833840239956e-06, "loss": 0.0377, "num_input_tokens_seen": 72861288, "step": 108090 }, { "epoch": 2.6407788337038576, "grad_norm": 0.0010781821329146624, "learning_rate": 1.0761983546310376e-06, "loss": 0.0013, "num_input_tokens_seen": 72865000, "step": 108095 }, { "epoch": 2.6409009845357048, "grad_norm": 0.1415102183818817, "learning_rate": 1.0761133246839446e-06, "loss": 0.0492, "num_input_tokens_seen": 72868008, "step": 108100 }, { "epoch": 2.641023135367552, "grad_norm": 0.23793765902519226, "learning_rate": 1.076028294183335e-06, "loss": 0.0002, "num_input_tokens_seen": 72871016, "step": 108105 }, { "epoch": 2.641145286199399, "grad_norm": 0.011273748241364956, "learning_rate": 1.0759432631298276e-06, "loss": 0.1439, "num_input_tokens_seen": 72874344, "step": 108110 }, { "epoch": 2.6412674370312463, "grad_norm": 0.3362903594970703, "learning_rate": 1.0758582315240402e-06, "loss": 0.0002, "num_input_tokens_seen": 72877672, "step": 108115 }, { "epoch": 2.6413895878630935, "grad_norm": 0.012885728850960732, "learning_rate": 1.075773199366591e-06, "loss": 0.0001, "num_input_tokens_seen": 72880744, "step": 108120 }, { "epoch": 2.6415117386949403, "grad_norm": 0.03369002044200897, "learning_rate": 1.075688166658099e-06, "loss": 0.0002, "num_input_tokens_seen": 72884072, "step": 108125 }, { "epoch": 2.641633889526788, "grad_norm": 0.013920611701905727, "learning_rate": 1.0756031333991822e-06, "loss": 0.0001, "num_input_tokens_seen": 72887656, "step": 108130 }, { "epoch": 2.6417560403586346, "grad_norm": 0.0669659972190857, "learning_rate": 1.075518099590459e-06, "loss": 0.001, "num_input_tokens_seen": 72891176, "step": 108135 }, { "epoch": 2.6418781911904823, "grad_norm": 0.15272586047649384, "learning_rate": 1.0754330652325481e-06, "loss": 0.0001, "num_input_tokens_seen": 72894696, "step": 108140 }, { "epoch": 2.642000342022329, "grad_norm": 0.005168919917196035, "learning_rate": 1.0753480303260675e-06, "loss": 0.0003, "num_input_tokens_seen": 72897832, "step": 108145 }, { "epoch": 2.642122492854176, "grad_norm": 0.7232739329338074, "learning_rate": 1.0752629948716356e-06, "loss": 0.1065, "num_input_tokens_seen": 72901224, "step": 108150 }, { "epoch": 2.6422446436860234, "grad_norm": 0.046349361538887024, "learning_rate": 1.075177958869871e-06, "loss": 0.0002, "num_input_tokens_seen": 72904872, "step": 108155 }, { "epoch": 2.6423667945178706, "grad_norm": 0.421809583902359, "learning_rate": 1.0750929223213923e-06, "loss": 0.0002, "num_input_tokens_seen": 72908328, "step": 108160 }, { "epoch": 2.642488945349718, "grad_norm": 2.9098937375238165e-05, "learning_rate": 1.0750078852268178e-06, "loss": 0.0373, "num_input_tokens_seen": 72911592, "step": 108165 }, { "epoch": 2.642611096181565, "grad_norm": 0.01656261831521988, "learning_rate": 1.0749228475867656e-06, "loss": 0.0021, "num_input_tokens_seen": 72914984, "step": 108170 }, { "epoch": 2.642733247013412, "grad_norm": 48.75508499145508, "learning_rate": 1.0748378094018541e-06, "loss": 0.0402, "num_input_tokens_seen": 72918120, "step": 108175 }, { "epoch": 2.6428553978452594, "grad_norm": 0.157123863697052, "learning_rate": 1.0747527706727022e-06, "loss": 0.0542, "num_input_tokens_seen": 72921896, "step": 108180 }, { "epoch": 2.6429775486771065, "grad_norm": 0.12687893211841583, "learning_rate": 1.0746677313999277e-06, "loss": 0.0001, "num_input_tokens_seen": 72926248, "step": 108185 }, { "epoch": 2.6430996995089537, "grad_norm": 0.06420924514532089, "learning_rate": 1.0745826915841495e-06, "loss": 0.0366, "num_input_tokens_seen": 72929512, "step": 108190 }, { "epoch": 2.643221850340801, "grad_norm": 0.003628705395385623, "learning_rate": 1.0744976512259862e-06, "loss": 0.0567, "num_input_tokens_seen": 72932968, "step": 108195 }, { "epoch": 2.643344001172648, "grad_norm": 0.1197664812207222, "learning_rate": 1.0744126103260558e-06, "loss": 0.0003, "num_input_tokens_seen": 72937000, "step": 108200 }, { "epoch": 2.6434661520044953, "grad_norm": 0.02228275127708912, "learning_rate": 1.0743275688849767e-06, "loss": 0.0941, "num_input_tokens_seen": 72940136, "step": 108205 }, { "epoch": 2.643588302836342, "grad_norm": 0.35382992029190063, "learning_rate": 1.0742425269033678e-06, "loss": 0.0424, "num_input_tokens_seen": 72944040, "step": 108210 }, { "epoch": 2.6437104536681897, "grad_norm": 0.0022606253623962402, "learning_rate": 1.074157484381847e-06, "loss": 0.1378, "num_input_tokens_seen": 72947240, "step": 108215 }, { "epoch": 2.6438326045000364, "grad_norm": 0.027560878545045853, "learning_rate": 1.0740724413210332e-06, "loss": 0.0001, "num_input_tokens_seen": 72950760, "step": 108220 }, { "epoch": 2.643954755331884, "grad_norm": 0.007026746869087219, "learning_rate": 1.0739873977215447e-06, "loss": 0.0452, "num_input_tokens_seen": 72954024, "step": 108225 }, { "epoch": 2.644076906163731, "grad_norm": 0.25033438205718994, "learning_rate": 1.073902353584e-06, "loss": 0.0432, "num_input_tokens_seen": 72957736, "step": 108230 }, { "epoch": 2.644199056995578, "grad_norm": 0.21594354510307312, "learning_rate": 1.0738173089090172e-06, "loss": 0.1475, "num_input_tokens_seen": 72961256, "step": 108235 }, { "epoch": 2.644321207827425, "grad_norm": 0.458386093378067, "learning_rate": 1.0737322636972155e-06, "loss": 0.021, "num_input_tokens_seen": 72964456, "step": 108240 }, { "epoch": 2.6444433586592724, "grad_norm": 0.02155539020895958, "learning_rate": 1.0736472179492125e-06, "loss": 0.0001, "num_input_tokens_seen": 72968040, "step": 108245 }, { "epoch": 2.6445655094911196, "grad_norm": 0.04842046648263931, "learning_rate": 1.0735621716656274e-06, "loss": 0.0004, "num_input_tokens_seen": 72971432, "step": 108250 }, { "epoch": 2.6446876603229668, "grad_norm": 0.009866573847830296, "learning_rate": 1.0734771248470783e-06, "loss": 0.0004, "num_input_tokens_seen": 72974632, "step": 108255 }, { "epoch": 2.644809811154814, "grad_norm": 0.07377776503562927, "learning_rate": 1.0733920774941837e-06, "loss": 0.1142, "num_input_tokens_seen": 72977704, "step": 108260 }, { "epoch": 2.644931961986661, "grad_norm": 0.035990867763757706, "learning_rate": 1.0733070296075623e-06, "loss": 0.0015, "num_input_tokens_seen": 72981288, "step": 108265 }, { "epoch": 2.6450541128185083, "grad_norm": 0.027638480067253113, "learning_rate": 1.0732219811878327e-06, "loss": 0.0004, "num_input_tokens_seen": 72984744, "step": 108270 }, { "epoch": 2.6451762636503555, "grad_norm": 0.00911200325936079, "learning_rate": 1.0731369322356127e-06, "loss": 0.0593, "num_input_tokens_seen": 72988456, "step": 108275 }, { "epoch": 2.6452984144822027, "grad_norm": 0.3369785249233246, "learning_rate": 1.0730518827515216e-06, "loss": 0.0002, "num_input_tokens_seen": 72991656, "step": 108280 }, { "epoch": 2.64542056531405, "grad_norm": 56.09309005737305, "learning_rate": 1.0729668327361774e-06, "loss": 0.082, "num_input_tokens_seen": 72995304, "step": 108285 }, { "epoch": 2.645542716145897, "grad_norm": 0.04016836732625961, "learning_rate": 1.0728817821901988e-06, "loss": 0.0756, "num_input_tokens_seen": 72998312, "step": 108290 }, { "epoch": 2.6456648669777443, "grad_norm": 0.009905444458127022, "learning_rate": 1.0727967311142044e-06, "loss": 0.0001, "num_input_tokens_seen": 73001384, "step": 108295 }, { "epoch": 2.6457870178095915, "grad_norm": 0.031229624524712563, "learning_rate": 1.0727116795088125e-06, "loss": 0.0173, "num_input_tokens_seen": 73004392, "step": 108300 }, { "epoch": 2.645909168641438, "grad_norm": 0.14251470565795898, "learning_rate": 1.0726266273746414e-06, "loss": 0.0006, "num_input_tokens_seen": 73007656, "step": 108305 }, { "epoch": 2.646031319473286, "grad_norm": 0.08483777940273285, "learning_rate": 1.0725415747123102e-06, "loss": 0.0002, "num_input_tokens_seen": 73011496, "step": 108310 }, { "epoch": 2.6461534703051326, "grad_norm": 23.98685646057129, "learning_rate": 1.0724565215224373e-06, "loss": 0.0925, "num_input_tokens_seen": 73014824, "step": 108315 }, { "epoch": 2.64627562113698, "grad_norm": 0.050838831812143326, "learning_rate": 1.072371467805641e-06, "loss": 0.0002, "num_input_tokens_seen": 73017960, "step": 108320 }, { "epoch": 2.646397771968827, "grad_norm": 0.23060138523578644, "learning_rate": 1.07228641356254e-06, "loss": 0.0005, "num_input_tokens_seen": 73021288, "step": 108325 }, { "epoch": 2.646519922800674, "grad_norm": 0.12116897851228714, "learning_rate": 1.0722013587937526e-06, "loss": 0.0516, "num_input_tokens_seen": 73024424, "step": 108330 }, { "epoch": 2.6466420736325214, "grad_norm": 0.4200071096420288, "learning_rate": 1.0721163034998975e-06, "loss": 0.0482, "num_input_tokens_seen": 73027496, "step": 108335 }, { "epoch": 2.6467642244643685, "grad_norm": 0.10476449131965637, "learning_rate": 1.0720312476815932e-06, "loss": 0.0002, "num_input_tokens_seen": 73031272, "step": 108340 }, { "epoch": 2.6468863752962157, "grad_norm": 0.302228182554245, "learning_rate": 1.0719461913394582e-06, "loss": 0.0004, "num_input_tokens_seen": 73034280, "step": 108345 }, { "epoch": 2.647008526128063, "grad_norm": 0.10496729612350464, "learning_rate": 1.0718611344741116e-06, "loss": 0.0822, "num_input_tokens_seen": 73037224, "step": 108350 }, { "epoch": 2.64713067695991, "grad_norm": 0.07415910065174103, "learning_rate": 1.071776077086171e-06, "loss": 0.0934, "num_input_tokens_seen": 73040808, "step": 108355 }, { "epoch": 2.6472528277917573, "grad_norm": 24.086450576782227, "learning_rate": 1.0716910191762559e-06, "loss": 0.0623, "num_input_tokens_seen": 73044072, "step": 108360 }, { "epoch": 2.6473749786236045, "grad_norm": 0.011858934536576271, "learning_rate": 1.0716059607449842e-06, "loss": 0.0005, "num_input_tokens_seen": 73047080, "step": 108365 }, { "epoch": 2.6474971294554517, "grad_norm": 0.20711888372898102, "learning_rate": 1.0715209017929744e-06, "loss": 0.0398, "num_input_tokens_seen": 73050152, "step": 108370 }, { "epoch": 2.647619280287299, "grad_norm": 0.13118284940719604, "learning_rate": 1.0714358423208457e-06, "loss": 0.0184, "num_input_tokens_seen": 73053672, "step": 108375 }, { "epoch": 2.647741431119146, "grad_norm": 0.014156767167150974, "learning_rate": 1.0713507823292165e-06, "loss": 0.0849, "num_input_tokens_seen": 73056936, "step": 108380 }, { "epoch": 2.6478635819509933, "grad_norm": 0.022906072437763214, "learning_rate": 1.071265721818705e-06, "loss": 0.0005, "num_input_tokens_seen": 73060136, "step": 108385 }, { "epoch": 2.64798573278284, "grad_norm": 54.88224411010742, "learning_rate": 1.0711806607899302e-06, "loss": 0.1114, "num_input_tokens_seen": 73063784, "step": 108390 }, { "epoch": 2.6481078836146876, "grad_norm": 0.03241246938705444, "learning_rate": 1.0710955992435101e-06, "loss": 0.0003, "num_input_tokens_seen": 73067304, "step": 108395 }, { "epoch": 2.6482300344465344, "grad_norm": 0.015850190073251724, "learning_rate": 1.0710105371800637e-06, "loss": 0.0002, "num_input_tokens_seen": 73070888, "step": 108400 }, { "epoch": 2.648352185278382, "grad_norm": 0.16307973861694336, "learning_rate": 1.07092547460021e-06, "loss": 0.0008, "num_input_tokens_seen": 73074088, "step": 108405 }, { "epoch": 2.6484743361102288, "grad_norm": 24.154691696166992, "learning_rate": 1.0708404115045669e-06, "loss": 0.1232, "num_input_tokens_seen": 73077608, "step": 108410 }, { "epoch": 2.648596486942076, "grad_norm": 0.012943286448717117, "learning_rate": 1.0707553478937533e-06, "loss": 0.026, "num_input_tokens_seen": 73081000, "step": 108415 }, { "epoch": 2.648718637773923, "grad_norm": 0.012203868478536606, "learning_rate": 1.070670283768388e-06, "loss": 0.0003, "num_input_tokens_seen": 73084136, "step": 108420 }, { "epoch": 2.6488407886057703, "grad_norm": 0.034377772361040115, "learning_rate": 1.0705852191290891e-06, "loss": 0.0421, "num_input_tokens_seen": 73087528, "step": 108425 }, { "epoch": 2.6489629394376175, "grad_norm": 0.2772684097290039, "learning_rate": 1.0705001539764754e-06, "loss": 0.0002, "num_input_tokens_seen": 73091048, "step": 108430 }, { "epoch": 2.6490850902694647, "grad_norm": 0.05456133186817169, "learning_rate": 1.0704150883111659e-06, "loss": 0.0005, "num_input_tokens_seen": 73094440, "step": 108435 }, { "epoch": 2.649207241101312, "grad_norm": 0.010156864300370216, "learning_rate": 1.0703300221337787e-06, "loss": 0.0002, "num_input_tokens_seen": 73098472, "step": 108440 }, { "epoch": 2.649329391933159, "grad_norm": 0.0660182535648346, "learning_rate": 1.070244955444933e-06, "loss": 0.0863, "num_input_tokens_seen": 73101928, "step": 108445 }, { "epoch": 2.6494515427650063, "grad_norm": 0.11449148505926132, "learning_rate": 1.0701598882452469e-06, "loss": 0.0002, "num_input_tokens_seen": 73105768, "step": 108450 }, { "epoch": 2.6495736935968535, "grad_norm": 0.1072482019662857, "learning_rate": 1.070074820535339e-06, "loss": 0.1254, "num_input_tokens_seen": 73109544, "step": 108455 }, { "epoch": 2.6496958444287007, "grad_norm": 0.13168303668498993, "learning_rate": 1.0699897523158283e-06, "loss": 0.0293, "num_input_tokens_seen": 73113064, "step": 108460 }, { "epoch": 2.649817995260548, "grad_norm": 159.1094512939453, "learning_rate": 1.0699046835873336e-06, "loss": 0.0029, "num_input_tokens_seen": 73116200, "step": 108465 }, { "epoch": 2.649940146092395, "grad_norm": 0.06051814183592796, "learning_rate": 1.069819614350473e-06, "loss": 0.0515, "num_input_tokens_seen": 73119592, "step": 108470 }, { "epoch": 2.650062296924242, "grad_norm": 40.68425369262695, "learning_rate": 1.0697345446058654e-06, "loss": 0.0455, "num_input_tokens_seen": 73122984, "step": 108475 }, { "epoch": 2.6501844477560894, "grad_norm": 35.27906799316406, "learning_rate": 1.0696494743541296e-06, "loss": 0.0456, "num_input_tokens_seen": 73126248, "step": 108480 }, { "epoch": 2.650306598587936, "grad_norm": 0.003338438691571355, "learning_rate": 1.069564403595884e-06, "loss": 0.0004, "num_input_tokens_seen": 73129448, "step": 108485 }, { "epoch": 2.650428749419784, "grad_norm": 3.661097764968872, "learning_rate": 1.0694793323317473e-06, "loss": 0.0008, "num_input_tokens_seen": 73132840, "step": 108490 }, { "epoch": 2.6505509002516305, "grad_norm": 0.45807284116744995, "learning_rate": 1.0693942605623378e-06, "loss": 0.0907, "num_input_tokens_seen": 73136040, "step": 108495 }, { "epoch": 2.6506730510834777, "grad_norm": 0.04252566769719124, "learning_rate": 1.069309188288275e-06, "loss": 0.1461, "num_input_tokens_seen": 73139496, "step": 108500 }, { "epoch": 2.650795201915325, "grad_norm": 0.08476745337247849, "learning_rate": 1.069224115510177e-06, "loss": 0.0003, "num_input_tokens_seen": 73143080, "step": 108505 }, { "epoch": 2.650917352747172, "grad_norm": 0.014480705372989178, "learning_rate": 1.0691390422286627e-06, "loss": 0.0458, "num_input_tokens_seen": 73147112, "step": 108510 }, { "epoch": 2.6510395035790193, "grad_norm": 0.020470378920435905, "learning_rate": 1.0690539684443507e-06, "loss": 0.0001, "num_input_tokens_seen": 73150568, "step": 108515 }, { "epoch": 2.6511616544108665, "grad_norm": 0.12641994655132294, "learning_rate": 1.06896889415786e-06, "loss": 0.0005, "num_input_tokens_seen": 73153640, "step": 108520 }, { "epoch": 2.6512838052427137, "grad_norm": 0.16786456108093262, "learning_rate": 1.0688838193698083e-06, "loss": 0.0003, "num_input_tokens_seen": 73157096, "step": 108525 }, { "epoch": 2.651405956074561, "grad_norm": 0.013153999112546444, "learning_rate": 1.0687987440808153e-06, "loss": 0.0606, "num_input_tokens_seen": 73161128, "step": 108530 }, { "epoch": 2.651528106906408, "grad_norm": 0.058872465044260025, "learning_rate": 1.0687136682914993e-06, "loss": 0.0434, "num_input_tokens_seen": 73164328, "step": 108535 }, { "epoch": 2.6516502577382552, "grad_norm": 29.729175567626953, "learning_rate": 1.068628592002479e-06, "loss": 0.0347, "num_input_tokens_seen": 73167272, "step": 108540 }, { "epoch": 2.6517724085701024, "grad_norm": 0.2370745837688446, "learning_rate": 1.068543515214373e-06, "loss": 0.0002, "num_input_tokens_seen": 73170344, "step": 108545 }, { "epoch": 2.6518945594019496, "grad_norm": 0.00412773247808218, "learning_rate": 1.0684584379278004e-06, "loss": 0.0003, "num_input_tokens_seen": 73173224, "step": 108550 }, { "epoch": 2.652016710233797, "grad_norm": 0.07895587384700775, "learning_rate": 1.0683733601433793e-06, "loss": 0.0238, "num_input_tokens_seen": 73176296, "step": 108555 }, { "epoch": 2.652138861065644, "grad_norm": 0.09276538342237473, "learning_rate": 1.068288281861729e-06, "loss": 0.0469, "num_input_tokens_seen": 73179176, "step": 108560 }, { "epoch": 2.652261011897491, "grad_norm": 0.0015131103573367, "learning_rate": 1.0682032030834678e-06, "loss": 0.0003, "num_input_tokens_seen": 73182376, "step": 108565 }, { "epoch": 2.652383162729338, "grad_norm": 0.5894980430603027, "learning_rate": 1.0681181238092146e-06, "loss": 0.0007, "num_input_tokens_seen": 73185640, "step": 108570 }, { "epoch": 2.6525053135611856, "grad_norm": 53.47251510620117, "learning_rate": 1.0680330440395882e-06, "loss": 0.0396, "num_input_tokens_seen": 73188584, "step": 108575 }, { "epoch": 2.6526274643930323, "grad_norm": 16.599782943725586, "learning_rate": 1.0679479637752069e-06, "loss": 0.0931, "num_input_tokens_seen": 73191912, "step": 108580 }, { "epoch": 2.65274961522488, "grad_norm": 0.012147694826126099, "learning_rate": 1.06786288301669e-06, "loss": 0.0326, "num_input_tokens_seen": 73195304, "step": 108585 }, { "epoch": 2.6528717660567267, "grad_norm": 0.17597021162509918, "learning_rate": 1.0677778017646558e-06, "loss": 0.0002, "num_input_tokens_seen": 73198568, "step": 108590 }, { "epoch": 2.652993916888574, "grad_norm": 0.029810475185513496, "learning_rate": 1.0676927200197234e-06, "loss": 0.0001, "num_input_tokens_seen": 73201640, "step": 108595 }, { "epoch": 2.653116067720421, "grad_norm": 0.011127985082566738, "learning_rate": 1.067607637782511e-06, "loss": 0.0535, "num_input_tokens_seen": 73204712, "step": 108600 }, { "epoch": 2.6532382185522683, "grad_norm": 0.014848160557448864, "learning_rate": 1.0675225550536383e-06, "loss": 0.0002, "num_input_tokens_seen": 73208616, "step": 108605 }, { "epoch": 2.6533603693841155, "grad_norm": 94.40913391113281, "learning_rate": 1.067437471833723e-06, "loss": 0.147, "num_input_tokens_seen": 73211944, "step": 108610 }, { "epoch": 2.6534825202159626, "grad_norm": 0.10178336501121521, "learning_rate": 1.0673523881233841e-06, "loss": 0.0615, "num_input_tokens_seen": 73215592, "step": 108615 }, { "epoch": 2.65360467104781, "grad_norm": 0.03686375916004181, "learning_rate": 1.0672673039232405e-06, "loss": 0.1221, "num_input_tokens_seen": 73218792, "step": 108620 }, { "epoch": 2.653726821879657, "grad_norm": 0.44345104694366455, "learning_rate": 1.0671822192339112e-06, "loss": 0.0805, "num_input_tokens_seen": 73222568, "step": 108625 }, { "epoch": 2.653848972711504, "grad_norm": 0.08455266058444977, "learning_rate": 1.0670971340560148e-06, "loss": 0.0002, "num_input_tokens_seen": 73225640, "step": 108630 }, { "epoch": 2.6539711235433514, "grad_norm": 0.05392299219965935, "learning_rate": 1.06701204839017e-06, "loss": 0.0003, "num_input_tokens_seen": 73229032, "step": 108635 }, { "epoch": 2.6540932743751986, "grad_norm": 13.104811668395996, "learning_rate": 1.0669269622369957e-06, "loss": 0.0764, "num_input_tokens_seen": 73232360, "step": 108640 }, { "epoch": 2.654215425207046, "grad_norm": 0.19615143537521362, "learning_rate": 1.06684187559711e-06, "loss": 0.0002, "num_input_tokens_seen": 73235688, "step": 108645 }, { "epoch": 2.654337576038893, "grad_norm": 0.08257798105478287, "learning_rate": 1.0667567884711323e-06, "loss": 0.0005, "num_input_tokens_seen": 73238888, "step": 108650 }, { "epoch": 2.6544597268707397, "grad_norm": 0.08494102954864502, "learning_rate": 1.0666717008596814e-06, "loss": 0.0003, "num_input_tokens_seen": 73242216, "step": 108655 }, { "epoch": 2.6545818777025874, "grad_norm": 193.64535522460938, "learning_rate": 1.0665866127633762e-06, "loss": 0.0155, "num_input_tokens_seen": 73245608, "step": 108660 }, { "epoch": 2.654704028534434, "grad_norm": 15.049497604370117, "learning_rate": 1.066501524182835e-06, "loss": 0.1046, "num_input_tokens_seen": 73248680, "step": 108665 }, { "epoch": 2.6548261793662817, "grad_norm": 0.3932948410511017, "learning_rate": 1.066416435118677e-06, "loss": 0.0481, "num_input_tokens_seen": 73251688, "step": 108670 }, { "epoch": 2.6549483301981285, "grad_norm": 0.17279548943042755, "learning_rate": 1.0663313455715208e-06, "loss": 0.0006, "num_input_tokens_seen": 73254824, "step": 108675 }, { "epoch": 2.6550704810299757, "grad_norm": 0.061224523931741714, "learning_rate": 1.066246255541985e-06, "loss": 0.0009, "num_input_tokens_seen": 73258088, "step": 108680 }, { "epoch": 2.655192631861823, "grad_norm": 0.034446604549884796, "learning_rate": 1.0661611650306887e-06, "loss": 0.0344, "num_input_tokens_seen": 73261800, "step": 108685 }, { "epoch": 2.65531478269367, "grad_norm": 0.016167186200618744, "learning_rate": 1.0660760740382508e-06, "loss": 0.0007, "num_input_tokens_seen": 73265256, "step": 108690 }, { "epoch": 2.6554369335255172, "grad_norm": 0.019697608426213264, "learning_rate": 1.0659909825652898e-06, "loss": 0.0007, "num_input_tokens_seen": 73268520, "step": 108695 }, { "epoch": 2.6555590843573644, "grad_norm": 0.15763618052005768, "learning_rate": 1.0659058906124245e-06, "loss": 0.1883, "num_input_tokens_seen": 73272040, "step": 108700 }, { "epoch": 2.6556812351892116, "grad_norm": 0.07214447855949402, "learning_rate": 1.0658207981802741e-06, "loss": 0.0034, "num_input_tokens_seen": 73275176, "step": 108705 }, { "epoch": 2.655803386021059, "grad_norm": 0.021427778527140617, "learning_rate": 1.0657357052694567e-06, "loss": 0.0002, "num_input_tokens_seen": 73278504, "step": 108710 }, { "epoch": 2.655925536852906, "grad_norm": 28.227033615112305, "learning_rate": 1.0656506118805919e-06, "loss": 0.1303, "num_input_tokens_seen": 73282280, "step": 108715 }, { "epoch": 2.656047687684753, "grad_norm": 0.042592257261276245, "learning_rate": 1.0655655180142982e-06, "loss": 0.0003, "num_input_tokens_seen": 73285480, "step": 108720 }, { "epoch": 2.6561698385166004, "grad_norm": 0.236333966255188, "learning_rate": 1.0654804236711942e-06, "loss": 0.0318, "num_input_tokens_seen": 73288872, "step": 108725 }, { "epoch": 2.6562919893484476, "grad_norm": 0.005104314535856247, "learning_rate": 1.0653953288518994e-06, "loss": 0.0001, "num_input_tokens_seen": 73292008, "step": 108730 }, { "epoch": 2.6564141401802948, "grad_norm": 5.778536796569824, "learning_rate": 1.0653102335570317e-06, "loss": 0.0007, "num_input_tokens_seen": 73295464, "step": 108735 }, { "epoch": 2.656536291012142, "grad_norm": 0.056464217603206635, "learning_rate": 1.0652251377872108e-06, "loss": 0.0001, "num_input_tokens_seen": 73298728, "step": 108740 }, { "epoch": 2.656658441843989, "grad_norm": 0.012570296414196491, "learning_rate": 1.065140041543055e-06, "loss": 0.0319, "num_input_tokens_seen": 73302248, "step": 108745 }, { "epoch": 2.656780592675836, "grad_norm": 30.777732849121094, "learning_rate": 1.0650549448251831e-06, "loss": 0.0538, "num_input_tokens_seen": 73305768, "step": 108750 }, { "epoch": 2.6569027435076835, "grad_norm": 0.010172931477427483, "learning_rate": 1.0649698476342144e-06, "loss": 0.0003, "num_input_tokens_seen": 73309480, "step": 108755 }, { "epoch": 2.6570248943395303, "grad_norm": 0.006178905721753836, "learning_rate": 1.0648847499707673e-06, "loss": 0.0003, "num_input_tokens_seen": 73312680, "step": 108760 }, { "epoch": 2.6571470451713775, "grad_norm": 0.02570711076259613, "learning_rate": 1.0647996518354608e-06, "loss": 0.0005, "num_input_tokens_seen": 73316136, "step": 108765 }, { "epoch": 2.6572691960032246, "grad_norm": 0.006446475628763437, "learning_rate": 1.0647145532289142e-06, "loss": 0.0002, "num_input_tokens_seen": 73318952, "step": 108770 }, { "epoch": 2.657391346835072, "grad_norm": 0.01190762035548687, "learning_rate": 1.0646294541517456e-06, "loss": 0.0639, "num_input_tokens_seen": 73321960, "step": 108775 }, { "epoch": 2.657513497666919, "grad_norm": 0.012693374417722225, "learning_rate": 1.0645443546045743e-06, "loss": 0.0747, "num_input_tokens_seen": 73325032, "step": 108780 }, { "epoch": 2.657635648498766, "grad_norm": 0.02469741739332676, "learning_rate": 1.0644592545880193e-06, "loss": 0.0002, "num_input_tokens_seen": 73328616, "step": 108785 }, { "epoch": 2.6577577993306134, "grad_norm": 0.00556557159870863, "learning_rate": 1.0643741541026991e-06, "loss": 0.0795, "num_input_tokens_seen": 73331752, "step": 108790 }, { "epoch": 2.6578799501624606, "grad_norm": 0.009257211349904537, "learning_rate": 1.0642890531492327e-06, "loss": 0.0526, "num_input_tokens_seen": 73335272, "step": 108795 }, { "epoch": 2.658002100994308, "grad_norm": 0.07994474470615387, "learning_rate": 1.064203951728239e-06, "loss": 0.0645, "num_input_tokens_seen": 73338536, "step": 108800 }, { "epoch": 2.658124251826155, "grad_norm": 0.49836266040802, "learning_rate": 1.064118849840337e-06, "loss": 0.1111, "num_input_tokens_seen": 73341992, "step": 108805 }, { "epoch": 2.658246402658002, "grad_norm": 13.673401832580566, "learning_rate": 1.0640337474861453e-06, "loss": 0.0614, "num_input_tokens_seen": 73345256, "step": 108810 }, { "epoch": 2.6583685534898494, "grad_norm": 0.16080302000045776, "learning_rate": 1.063948644666283e-06, "loss": 0.049, "num_input_tokens_seen": 73348520, "step": 108815 }, { "epoch": 2.6584907043216965, "grad_norm": 0.014671185985207558, "learning_rate": 1.063863541381369e-06, "loss": 0.0002, "num_input_tokens_seen": 73351400, "step": 108820 }, { "epoch": 2.6586128551535437, "grad_norm": 0.0028238091617822647, "learning_rate": 1.0637784376320222e-06, "loss": 0.0517, "num_input_tokens_seen": 73355176, "step": 108825 }, { "epoch": 2.658735005985391, "grad_norm": 0.004296763800084591, "learning_rate": 1.0636933334188615e-06, "loss": 0.0001, "num_input_tokens_seen": 73358568, "step": 108830 }, { "epoch": 2.6588571568172377, "grad_norm": 39.8594856262207, "learning_rate": 1.0636082287425058e-06, "loss": 0.0933, "num_input_tokens_seen": 73362088, "step": 108835 }, { "epoch": 2.6589793076490853, "grad_norm": 0.02145841158926487, "learning_rate": 1.0635231236035739e-06, "loss": 0.0006, "num_input_tokens_seen": 73365224, "step": 108840 }, { "epoch": 2.659101458480932, "grad_norm": 0.01345257367938757, "learning_rate": 1.0634380180026846e-06, "loss": 0.0004, "num_input_tokens_seen": 73368232, "step": 108845 }, { "epoch": 2.6592236093127797, "grad_norm": 0.0031906412914395332, "learning_rate": 1.063352911940457e-06, "loss": 0.0001, "num_input_tokens_seen": 73371432, "step": 108850 }, { "epoch": 2.6593457601446264, "grad_norm": 0.011594736017286777, "learning_rate": 1.0632678054175102e-06, "loss": 0.0944, "num_input_tokens_seen": 73374952, "step": 108855 }, { "epoch": 2.6594679109764736, "grad_norm": 0.009862457402050495, "learning_rate": 1.0631826984344624e-06, "loss": 0.0001, "num_input_tokens_seen": 73378600, "step": 108860 }, { "epoch": 2.659590061808321, "grad_norm": 0.02546362765133381, "learning_rate": 1.0630975909919334e-06, "loss": 0.1409, "num_input_tokens_seen": 73382184, "step": 108865 }, { "epoch": 2.659712212640168, "grad_norm": 33.12275695800781, "learning_rate": 1.0630124830905418e-06, "loss": 0.0337, "num_input_tokens_seen": 73385384, "step": 108870 }, { "epoch": 2.659834363472015, "grad_norm": 0.02543380670249462, "learning_rate": 1.0629273747309064e-06, "loss": 0.0398, "num_input_tokens_seen": 73389096, "step": 108875 }, { "epoch": 2.6599565143038624, "grad_norm": 0.002421492477878928, "learning_rate": 1.0628422659136463e-06, "loss": 0.053, "num_input_tokens_seen": 73392104, "step": 108880 }, { "epoch": 2.6600786651357096, "grad_norm": 0.017665240913629532, "learning_rate": 1.06275715663938e-06, "loss": 0.0375, "num_input_tokens_seen": 73395176, "step": 108885 }, { "epoch": 2.6602008159675568, "grad_norm": 0.06175203621387482, "learning_rate": 1.062672046908727e-06, "loss": 0.0002, "num_input_tokens_seen": 73398696, "step": 108890 }, { "epoch": 2.660322966799404, "grad_norm": 0.07373465597629547, "learning_rate": 1.0625869367223063e-06, "loss": 0.0645, "num_input_tokens_seen": 73401960, "step": 108895 }, { "epoch": 2.660445117631251, "grad_norm": 2.7904903888702393, "learning_rate": 1.062501826080736e-06, "loss": 0.0004, "num_input_tokens_seen": 73404968, "step": 108900 }, { "epoch": 2.6605672684630983, "grad_norm": 346.7339172363281, "learning_rate": 1.062416714984636e-06, "loss": 0.035, "num_input_tokens_seen": 73408168, "step": 108905 }, { "epoch": 2.6606894192949455, "grad_norm": 0.07050517201423645, "learning_rate": 1.0623316034346248e-06, "loss": 0.0526, "num_input_tokens_seen": 73411560, "step": 108910 }, { "epoch": 2.6608115701267927, "grad_norm": 21.976076126098633, "learning_rate": 1.0622464914313214e-06, "loss": 0.1244, "num_input_tokens_seen": 73414888, "step": 108915 }, { "epoch": 2.66093372095864, "grad_norm": 0.02514161542057991, "learning_rate": 1.0621613789753447e-06, "loss": 0.0005, "num_input_tokens_seen": 73418280, "step": 108920 }, { "epoch": 2.661055871790487, "grad_norm": 0.019108710810542107, "learning_rate": 1.062076266067314e-06, "loss": 0.065, "num_input_tokens_seen": 73422056, "step": 108925 }, { "epoch": 2.661178022622334, "grad_norm": 0.4653870463371277, "learning_rate": 1.0619911527078475e-06, "loss": 0.0006, "num_input_tokens_seen": 73426088, "step": 108930 }, { "epoch": 2.6613001734541815, "grad_norm": 0.012500254437327385, "learning_rate": 1.0619060388975649e-06, "loss": 0.0002, "num_input_tokens_seen": 73429864, "step": 108935 }, { "epoch": 2.661422324286028, "grad_norm": 0.27674028277397156, "learning_rate": 1.0618209246370853e-06, "loss": 0.0009, "num_input_tokens_seen": 73433256, "step": 108940 }, { "epoch": 2.6615444751178754, "grad_norm": 0.010696930810809135, "learning_rate": 1.061735809927027e-06, "loss": 0.0418, "num_input_tokens_seen": 73436392, "step": 108945 }, { "epoch": 2.6616666259497226, "grad_norm": 0.016720250248908997, "learning_rate": 1.0616506947680092e-06, "loss": 0.0004, "num_input_tokens_seen": 73439656, "step": 108950 }, { "epoch": 2.6617887767815698, "grad_norm": 37.6923828125, "learning_rate": 1.0615655791606511e-06, "loss": 0.0398, "num_input_tokens_seen": 73442792, "step": 108955 }, { "epoch": 2.661910927613417, "grad_norm": 298.3846740722656, "learning_rate": 1.0614804631055713e-06, "loss": 0.028, "num_input_tokens_seen": 73446184, "step": 108960 }, { "epoch": 2.662033078445264, "grad_norm": 0.11180854588747025, "learning_rate": 1.0613953466033892e-06, "loss": 0.0002, "num_input_tokens_seen": 73449704, "step": 108965 }, { "epoch": 2.6621552292771113, "grad_norm": 0.27024519443511963, "learning_rate": 1.0613102296547237e-06, "loss": 0.0732, "num_input_tokens_seen": 73452968, "step": 108970 }, { "epoch": 2.6622773801089585, "grad_norm": 0.012191432528197765, "learning_rate": 1.0612251122601936e-06, "loss": 0.0004, "num_input_tokens_seen": 73456424, "step": 108975 }, { "epoch": 2.6623995309408057, "grad_norm": 0.007310071494430304, "learning_rate": 1.0611399944204181e-06, "loss": 0.0002, "num_input_tokens_seen": 73459816, "step": 108980 }, { "epoch": 2.662521681772653, "grad_norm": 20.659034729003906, "learning_rate": 1.061054876136016e-06, "loss": 0.0805, "num_input_tokens_seen": 73463208, "step": 108985 }, { "epoch": 2.6626438326045, "grad_norm": 0.029268672689795494, "learning_rate": 1.0609697574076066e-06, "loss": 0.0002, "num_input_tokens_seen": 73466600, "step": 108990 }, { "epoch": 2.6627659834363473, "grad_norm": 0.11111558228731155, "learning_rate": 1.0608846382358087e-06, "loss": 0.0896, "num_input_tokens_seen": 73469480, "step": 108995 }, { "epoch": 2.6628881342681945, "grad_norm": 50.83061599731445, "learning_rate": 1.0607995186212413e-06, "loss": 0.0629, "num_input_tokens_seen": 73472872, "step": 109000 }, { "epoch": 2.6630102851000417, "grad_norm": 0.21760523319244385, "learning_rate": 1.0607143985645235e-06, "loss": 0.0002, "num_input_tokens_seen": 73476072, "step": 109005 }, { "epoch": 2.663132435931889, "grad_norm": 0.009848492220044136, "learning_rate": 1.0606292780662742e-06, "loss": 0.0007, "num_input_tokens_seen": 73479208, "step": 109010 }, { "epoch": 2.6632545867637356, "grad_norm": 31.30138397216797, "learning_rate": 1.0605441571271126e-06, "loss": 0.0978, "num_input_tokens_seen": 73482600, "step": 109015 }, { "epoch": 2.6633767375955832, "grad_norm": 48.70668411254883, "learning_rate": 1.0604590357476571e-06, "loss": 0.1106, "num_input_tokens_seen": 73486248, "step": 109020 }, { "epoch": 2.66349888842743, "grad_norm": 0.12107347697019577, "learning_rate": 1.0603739139285276e-06, "loss": 0.0001, "num_input_tokens_seen": 73489832, "step": 109025 }, { "epoch": 2.6636210392592776, "grad_norm": 0.021504517644643784, "learning_rate": 1.0602887916703429e-06, "loss": 0.0776, "num_input_tokens_seen": 73493096, "step": 109030 }, { "epoch": 2.6637431900911244, "grad_norm": 0.17749902606010437, "learning_rate": 1.0602036689737218e-06, "loss": 0.0434, "num_input_tokens_seen": 73495976, "step": 109035 }, { "epoch": 2.6638653409229716, "grad_norm": 45.952964782714844, "learning_rate": 1.0601185458392833e-06, "loss": 0.0723, "num_input_tokens_seen": 73499112, "step": 109040 }, { "epoch": 2.6639874917548187, "grad_norm": 29.334917068481445, "learning_rate": 1.0600334222676469e-06, "loss": 0.0641, "num_input_tokens_seen": 73502248, "step": 109045 }, { "epoch": 2.664109642586666, "grad_norm": 0.13320650160312653, "learning_rate": 1.059948298259431e-06, "loss": 0.0585, "num_input_tokens_seen": 73505768, "step": 109050 }, { "epoch": 2.664231793418513, "grad_norm": 0.07540590316057205, "learning_rate": 1.0598631738152547e-06, "loss": 0.0025, "num_input_tokens_seen": 73509224, "step": 109055 }, { "epoch": 2.6643539442503603, "grad_norm": 0.11329970508813858, "learning_rate": 1.0597780489357378e-06, "loss": 0.0002, "num_input_tokens_seen": 73512232, "step": 109060 }, { "epoch": 2.6644760950822075, "grad_norm": 550.1355590820312, "learning_rate": 1.0596929236214986e-06, "loss": 0.0629, "num_input_tokens_seen": 73515432, "step": 109065 }, { "epoch": 2.6645982459140547, "grad_norm": 0.047656625509262085, "learning_rate": 1.0596077978731565e-06, "loss": 0.0002, "num_input_tokens_seen": 73518632, "step": 109070 }, { "epoch": 2.664720396745902, "grad_norm": 0.09848161041736603, "learning_rate": 1.0595226716913307e-06, "loss": 0.0379, "num_input_tokens_seen": 73521768, "step": 109075 }, { "epoch": 2.664842547577749, "grad_norm": 0.10069756954908371, "learning_rate": 1.0594375450766394e-06, "loss": 0.0423, "num_input_tokens_seen": 73525288, "step": 109080 }, { "epoch": 2.6649646984095963, "grad_norm": 0.023019161075353622, "learning_rate": 1.0593524180297026e-06, "loss": 0.0643, "num_input_tokens_seen": 73528744, "step": 109085 }, { "epoch": 2.6650868492414435, "grad_norm": 0.027636736631393433, "learning_rate": 1.059267290551139e-06, "loss": 0.0247, "num_input_tokens_seen": 73532264, "step": 109090 }, { "epoch": 2.6652090000732906, "grad_norm": 19.799278259277344, "learning_rate": 1.059182162641568e-06, "loss": 0.1019, "num_input_tokens_seen": 73535528, "step": 109095 }, { "epoch": 2.6653311509051374, "grad_norm": 0.16363434493541718, "learning_rate": 1.0590970343016083e-06, "loss": 0.0003, "num_input_tokens_seen": 73539048, "step": 109100 }, { "epoch": 2.665453301736985, "grad_norm": 0.007981637492775917, "learning_rate": 1.059011905531879e-06, "loss": 0.0003, "num_input_tokens_seen": 73542632, "step": 109105 }, { "epoch": 2.6655754525688318, "grad_norm": 0.09691093862056732, "learning_rate": 1.0589267763329992e-06, "loss": 0.0005, "num_input_tokens_seen": 73546280, "step": 109110 }, { "epoch": 2.6656976034006794, "grad_norm": 18.96839141845703, "learning_rate": 1.0588416467055878e-06, "loss": 0.0432, "num_input_tokens_seen": 73549608, "step": 109115 }, { "epoch": 2.665819754232526, "grad_norm": 0.04357759281992912, "learning_rate": 1.0587565166502646e-06, "loss": 0.0004, "num_input_tokens_seen": 73552872, "step": 109120 }, { "epoch": 2.6659419050643733, "grad_norm": 0.2836996912956238, "learning_rate": 1.058671386167648e-06, "loss": 0.0005, "num_input_tokens_seen": 73556584, "step": 109125 }, { "epoch": 2.6660640558962205, "grad_norm": 0.009287668392062187, "learning_rate": 1.058586255258357e-06, "loss": 0.0417, "num_input_tokens_seen": 73560296, "step": 109130 }, { "epoch": 2.6661862067280677, "grad_norm": 0.01081774290651083, "learning_rate": 1.0585011239230114e-06, "loss": 0.0329, "num_input_tokens_seen": 73563688, "step": 109135 }, { "epoch": 2.666308357559915, "grad_norm": 0.032498445361852646, "learning_rate": 1.0584159921622298e-06, "loss": 0.001, "num_input_tokens_seen": 73566888, "step": 109140 }, { "epoch": 2.666430508391762, "grad_norm": 0.020721541717648506, "learning_rate": 1.0583308599766314e-06, "loss": 0.0001, "num_input_tokens_seen": 73569896, "step": 109145 }, { "epoch": 2.6665526592236093, "grad_norm": 0.06984131038188934, "learning_rate": 1.0582457273668352e-06, "loss": 0.0778, "num_input_tokens_seen": 73573864, "step": 109150 }, { "epoch": 2.6666748100554565, "grad_norm": 16.520017623901367, "learning_rate": 1.0581605943334606e-06, "loss": 0.0621, "num_input_tokens_seen": 73577640, "step": 109155 }, { "epoch": 2.6667969608873037, "grad_norm": 19.316129684448242, "learning_rate": 1.0580754608771263e-06, "loss": 0.0427, "num_input_tokens_seen": 73581224, "step": 109160 }, { "epoch": 2.666919111719151, "grad_norm": 35.69450759887695, "learning_rate": 1.0579903269984519e-06, "loss": 0.0369, "num_input_tokens_seen": 73584488, "step": 109165 }, { "epoch": 2.667041262550998, "grad_norm": 0.025746477767825127, "learning_rate": 1.0579051926980558e-06, "loss": 0.0395, "num_input_tokens_seen": 73588392, "step": 109170 }, { "epoch": 2.6671634133828452, "grad_norm": 1.5054798126220703, "learning_rate": 1.0578200579765577e-06, "loss": 0.0004, "num_input_tokens_seen": 73591720, "step": 109175 }, { "epoch": 2.6672855642146924, "grad_norm": 0.08736526966094971, "learning_rate": 1.0577349228345766e-06, "loss": 0.0005, "num_input_tokens_seen": 73594792, "step": 109180 }, { "epoch": 2.6674077150465396, "grad_norm": 0.06524144113063812, "learning_rate": 1.0576497872727318e-06, "loss": 0.0002, "num_input_tokens_seen": 73598120, "step": 109185 }, { "epoch": 2.667529865878387, "grad_norm": 0.5269588232040405, "learning_rate": 1.0575646512916422e-06, "loss": 0.0004, "num_input_tokens_seen": 73601576, "step": 109190 }, { "epoch": 2.6676520167102336, "grad_norm": 0.2253422737121582, "learning_rate": 1.0574795148919268e-06, "loss": 0.0617, "num_input_tokens_seen": 73605288, "step": 109195 }, { "epoch": 2.667774167542081, "grad_norm": 0.2920159101486206, "learning_rate": 1.0573943780742051e-06, "loss": 0.0706, "num_input_tokens_seen": 73608424, "step": 109200 }, { "epoch": 2.667896318373928, "grad_norm": 122.05394744873047, "learning_rate": 1.0573092408390963e-06, "loss": 0.0431, "num_input_tokens_seen": 73611752, "step": 109205 }, { "epoch": 2.6680184692057756, "grad_norm": 0.03391716629266739, "learning_rate": 1.0572241031872187e-06, "loss": 0.0003, "num_input_tokens_seen": 73614696, "step": 109210 }, { "epoch": 2.6681406200376223, "grad_norm": 0.10054274648427963, "learning_rate": 1.0571389651191924e-06, "loss": 0.0003, "num_input_tokens_seen": 73618216, "step": 109215 }, { "epoch": 2.6682627708694695, "grad_norm": 0.19624534249305725, "learning_rate": 1.057053826635636e-06, "loss": 0.0004, "num_input_tokens_seen": 73621288, "step": 109220 }, { "epoch": 2.6683849217013167, "grad_norm": 0.07142213732004166, "learning_rate": 1.0569686877371688e-06, "loss": 0.0963, "num_input_tokens_seen": 73625320, "step": 109225 }, { "epoch": 2.668507072533164, "grad_norm": 0.0393596850335598, "learning_rate": 1.0568835484244103e-06, "loss": 0.0002, "num_input_tokens_seen": 73628904, "step": 109230 }, { "epoch": 2.668629223365011, "grad_norm": 0.11763045936822891, "learning_rate": 1.056798408697979e-06, "loss": 0.0003, "num_input_tokens_seen": 73632040, "step": 109235 }, { "epoch": 2.6687513741968583, "grad_norm": 0.030843744054436684, "learning_rate": 1.0567132685584944e-06, "loss": 0.0456, "num_input_tokens_seen": 73635432, "step": 109240 }, { "epoch": 2.6688735250287055, "grad_norm": 0.0399046465754509, "learning_rate": 1.0566281280065757e-06, "loss": 0.0002, "num_input_tokens_seen": 73638760, "step": 109245 }, { "epoch": 2.6689956758605526, "grad_norm": 13.421170234680176, "learning_rate": 1.0565429870428424e-06, "loss": 0.0653, "num_input_tokens_seen": 73641832, "step": 109250 }, { "epoch": 2.6691178266924, "grad_norm": 0.16441789269447327, "learning_rate": 1.056457845667913e-06, "loss": 0.104, "num_input_tokens_seen": 73644968, "step": 109255 }, { "epoch": 2.669239977524247, "grad_norm": 0.16625621914863586, "learning_rate": 1.0563727038824069e-06, "loss": 0.0002, "num_input_tokens_seen": 73648296, "step": 109260 }, { "epoch": 2.669362128356094, "grad_norm": 0.07061154395341873, "learning_rate": 1.0562875616869433e-06, "loss": 0.0002, "num_input_tokens_seen": 73651496, "step": 109265 }, { "epoch": 2.6694842791879414, "grad_norm": 15.163482666015625, "learning_rate": 1.0562024190821412e-06, "loss": 0.0999, "num_input_tokens_seen": 73654568, "step": 109270 }, { "epoch": 2.6696064300197886, "grad_norm": 0.20702727138996124, "learning_rate": 1.0561172760686204e-06, "loss": 0.029, "num_input_tokens_seen": 73658280, "step": 109275 }, { "epoch": 2.6697285808516353, "grad_norm": 0.005529319401830435, "learning_rate": 1.0560321326469996e-06, "loss": 0.0345, "num_input_tokens_seen": 73661288, "step": 109280 }, { "epoch": 2.669850731683483, "grad_norm": 0.03878706693649292, "learning_rate": 1.055946988817898e-06, "loss": 0.0469, "num_input_tokens_seen": 73664488, "step": 109285 }, { "epoch": 2.6699728825153297, "grad_norm": 0.04471408948302269, "learning_rate": 1.055861844581935e-06, "loss": 0.0789, "num_input_tokens_seen": 73667880, "step": 109290 }, { "epoch": 2.6700950333471773, "grad_norm": 0.07671474665403366, "learning_rate": 1.0557766999397295e-06, "loss": 0.0226, "num_input_tokens_seen": 73671144, "step": 109295 }, { "epoch": 2.670217184179024, "grad_norm": 0.049207672476768494, "learning_rate": 1.0556915548919007e-06, "loss": 0.0373, "num_input_tokens_seen": 73674792, "step": 109300 }, { "epoch": 2.6703393350108713, "grad_norm": 0.03824898228049278, "learning_rate": 1.0556064094390682e-06, "loss": 0.076, "num_input_tokens_seen": 73677928, "step": 109305 }, { "epoch": 2.6704614858427185, "grad_norm": 0.12842383980751038, "learning_rate": 1.0555212635818507e-06, "loss": 0.0333, "num_input_tokens_seen": 73681256, "step": 109310 }, { "epoch": 2.6705836366745657, "grad_norm": 0.0072562661953270435, "learning_rate": 1.0554361173208679e-06, "loss": 0.0005, "num_input_tokens_seen": 73684776, "step": 109315 }, { "epoch": 2.670705787506413, "grad_norm": 0.2971610724925995, "learning_rate": 1.0553509706567384e-06, "loss": 0.0482, "num_input_tokens_seen": 73687656, "step": 109320 }, { "epoch": 2.67082793833826, "grad_norm": 0.022412337362766266, "learning_rate": 1.055265823590082e-06, "loss": 0.0003, "num_input_tokens_seen": 73690792, "step": 109325 }, { "epoch": 2.6709500891701072, "grad_norm": 0.04063393548130989, "learning_rate": 1.0551806761215175e-06, "loss": 0.0578, "num_input_tokens_seen": 73694632, "step": 109330 }, { "epoch": 2.6710722400019544, "grad_norm": 210.82992553710938, "learning_rate": 1.0550955282516644e-06, "loss": 0.0572, "num_input_tokens_seen": 73697832, "step": 109335 }, { "epoch": 2.6711943908338016, "grad_norm": 26.77365493774414, "learning_rate": 1.0550103799811419e-06, "loss": 0.0728, "num_input_tokens_seen": 73700904, "step": 109340 }, { "epoch": 2.671316541665649, "grad_norm": 0.027570553123950958, "learning_rate": 1.054925231310569e-06, "loss": 0.0323, "num_input_tokens_seen": 73704104, "step": 109345 }, { "epoch": 2.671438692497496, "grad_norm": 0.16419246792793274, "learning_rate": 1.054840082240565e-06, "loss": 0.0353, "num_input_tokens_seen": 73707432, "step": 109350 }, { "epoch": 2.671560843329343, "grad_norm": 0.055125705897808075, "learning_rate": 1.0547549327717491e-06, "loss": 0.0406, "num_input_tokens_seen": 73710632, "step": 109355 }, { "epoch": 2.6716829941611904, "grad_norm": 1.7239696979522705, "learning_rate": 1.054669782904741e-06, "loss": 0.0004, "num_input_tokens_seen": 73714216, "step": 109360 }, { "epoch": 2.6718051449930376, "grad_norm": 0.043563805520534515, "learning_rate": 1.054584632640159e-06, "loss": 0.0004, "num_input_tokens_seen": 73717288, "step": 109365 }, { "epoch": 2.6719272958248848, "grad_norm": 25.463674545288086, "learning_rate": 1.0544994819786233e-06, "loss": 0.0627, "num_input_tokens_seen": 73720680, "step": 109370 }, { "epoch": 2.6720494466567315, "grad_norm": 0.024402741342782974, "learning_rate": 1.0544143309207525e-06, "loss": 0.0008, "num_input_tokens_seen": 73723880, "step": 109375 }, { "epoch": 2.672171597488579, "grad_norm": 0.025014013051986694, "learning_rate": 1.054329179467166e-06, "loss": 0.0004, "num_input_tokens_seen": 73727016, "step": 109380 }, { "epoch": 2.672293748320426, "grad_norm": 22.745786666870117, "learning_rate": 1.0542440276184835e-06, "loss": 0.1426, "num_input_tokens_seen": 73730152, "step": 109385 }, { "epoch": 2.672415899152273, "grad_norm": 17.65288734436035, "learning_rate": 1.0541588753753235e-06, "loss": 0.0894, "num_input_tokens_seen": 73733544, "step": 109390 }, { "epoch": 2.6725380499841203, "grad_norm": 0.011913538910448551, "learning_rate": 1.0540737227383052e-06, "loss": 0.0003, "num_input_tokens_seen": 73737256, "step": 109395 }, { "epoch": 2.6726602008159674, "grad_norm": 0.08027175068855286, "learning_rate": 1.0539885697080487e-06, "loss": 0.0003, "num_input_tokens_seen": 73740584, "step": 109400 }, { "epoch": 2.6727823516478146, "grad_norm": 0.0017405960243195295, "learning_rate": 1.053903416285173e-06, "loss": 0.0398, "num_input_tokens_seen": 73744040, "step": 109405 }, { "epoch": 2.672904502479662, "grad_norm": 0.09514592587947845, "learning_rate": 1.053818262470297e-06, "loss": 0.0559, "num_input_tokens_seen": 73747176, "step": 109410 }, { "epoch": 2.673026653311509, "grad_norm": 0.014695454388856888, "learning_rate": 1.0537331082640399e-06, "loss": 0.043, "num_input_tokens_seen": 73750568, "step": 109415 }, { "epoch": 2.673148804143356, "grad_norm": 13.021860122680664, "learning_rate": 1.0536479536670215e-06, "loss": 0.0016, "num_input_tokens_seen": 73753768, "step": 109420 }, { "epoch": 2.6732709549752034, "grad_norm": 0.020546959713101387, "learning_rate": 1.0535627986798603e-06, "loss": 0.0005, "num_input_tokens_seen": 73756776, "step": 109425 }, { "epoch": 2.6733931058070506, "grad_norm": 0.09020008146762848, "learning_rate": 1.0534776433031764e-06, "loss": 0.0491, "num_input_tokens_seen": 73760232, "step": 109430 }, { "epoch": 2.6735152566388978, "grad_norm": 0.008538307622075081, "learning_rate": 1.0533924875375886e-06, "loss": 0.0004, "num_input_tokens_seen": 73763752, "step": 109435 }, { "epoch": 2.673637407470745, "grad_norm": 0.02596125937998295, "learning_rate": 1.0533073313837163e-06, "loss": 0.0002, "num_input_tokens_seen": 73766952, "step": 109440 }, { "epoch": 2.673759558302592, "grad_norm": 0.01916971616446972, "learning_rate": 1.0532221748421785e-06, "loss": 0.0642, "num_input_tokens_seen": 73770216, "step": 109445 }, { "epoch": 2.6738817091344393, "grad_norm": 22.06159210205078, "learning_rate": 1.053137017913595e-06, "loss": 0.0589, "num_input_tokens_seen": 73773608, "step": 109450 }, { "epoch": 2.6740038599662865, "grad_norm": 0.0795968696475029, "learning_rate": 1.0530518605985848e-06, "loss": 0.0693, "num_input_tokens_seen": 73777000, "step": 109455 }, { "epoch": 2.6741260107981333, "grad_norm": 0.0034378645941615105, "learning_rate": 1.052966702897767e-06, "loss": 0.0567, "num_input_tokens_seen": 73780328, "step": 109460 }, { "epoch": 2.674248161629981, "grad_norm": 53.51797103881836, "learning_rate": 1.0528815448117613e-06, "loss": 0.0032, "num_input_tokens_seen": 73783336, "step": 109465 }, { "epoch": 2.6743703124618277, "grad_norm": 0.053672004491090775, "learning_rate": 1.052796386341187e-06, "loss": 0.0556, "num_input_tokens_seen": 73786408, "step": 109470 }, { "epoch": 2.6744924632936753, "grad_norm": 0.0037759304977953434, "learning_rate": 1.0527112274866628e-06, "loss": 0.0095, "num_input_tokens_seen": 73789416, "step": 109475 }, { "epoch": 2.674614614125522, "grad_norm": 0.3404017388820648, "learning_rate": 1.0526260682488085e-06, "loss": 0.0003, "num_input_tokens_seen": 73792296, "step": 109480 }, { "epoch": 2.6747367649573692, "grad_norm": 0.01111539825797081, "learning_rate": 1.0525409086282433e-06, "loss": 0.0002, "num_input_tokens_seen": 73795880, "step": 109485 }, { "epoch": 2.6748589157892164, "grad_norm": 0.023893926292657852, "learning_rate": 1.0524557486255862e-06, "loss": 0.0256, "num_input_tokens_seen": 73799336, "step": 109490 }, { "epoch": 2.6749810666210636, "grad_norm": 0.002047064481303096, "learning_rate": 1.052370588241457e-06, "loss": 0.0001, "num_input_tokens_seen": 73802856, "step": 109495 }, { "epoch": 2.675103217452911, "grad_norm": 0.016036951914429665, "learning_rate": 1.052285427476475e-06, "loss": 0.0006, "num_input_tokens_seen": 73806312, "step": 109500 }, { "epoch": 2.675225368284758, "grad_norm": 0.0667245090007782, "learning_rate": 1.052200266331259e-06, "loss": 0.0001, "num_input_tokens_seen": 73809576, "step": 109505 }, { "epoch": 2.675347519116605, "grad_norm": 0.2333661913871765, "learning_rate": 1.0521151048064287e-06, "loss": 0.0672, "num_input_tokens_seen": 73813224, "step": 109510 }, { "epoch": 2.6754696699484524, "grad_norm": 0.006419423967599869, "learning_rate": 1.0520299429026038e-06, "loss": 0.0002, "num_input_tokens_seen": 73816872, "step": 109515 }, { "epoch": 2.6755918207802996, "grad_norm": 0.026847392320632935, "learning_rate": 1.0519447806204026e-06, "loss": 0.0004, "num_input_tokens_seen": 73820136, "step": 109520 }, { "epoch": 2.6757139716121467, "grad_norm": 0.24557249248027802, "learning_rate": 1.0518596179604451e-06, "loss": 0.0003, "num_input_tokens_seen": 73823528, "step": 109525 }, { "epoch": 2.675836122443994, "grad_norm": 0.013657464645802975, "learning_rate": 1.0517744549233505e-06, "loss": 0.075, "num_input_tokens_seen": 73826984, "step": 109530 }, { "epoch": 2.675958273275841, "grad_norm": 0.0005760699859820306, "learning_rate": 1.0516892915097381e-06, "loss": 0.0002, "num_input_tokens_seen": 73830248, "step": 109535 }, { "epoch": 2.6760804241076883, "grad_norm": 0.015993546694517136, "learning_rate": 1.0516041277202275e-06, "loss": 0.1249, "num_input_tokens_seen": 73834408, "step": 109540 }, { "epoch": 2.676202574939535, "grad_norm": 0.008551014587283134, "learning_rate": 1.0515189635554375e-06, "loss": 0.0002, "num_input_tokens_seen": 73837736, "step": 109545 }, { "epoch": 2.6763247257713827, "grad_norm": 0.11013732105493546, "learning_rate": 1.0514337990159879e-06, "loss": 0.1554, "num_input_tokens_seen": 73841512, "step": 109550 }, { "epoch": 2.6764468766032294, "grad_norm": 0.0076579442247748375, "learning_rate": 1.0513486341024978e-06, "loss": 0.0001, "num_input_tokens_seen": 73844584, "step": 109555 }, { "epoch": 2.676569027435077, "grad_norm": 0.0050875660963356495, "learning_rate": 1.051263468815587e-06, "loss": 0.0004, "num_input_tokens_seen": 73847784, "step": 109560 }, { "epoch": 2.676691178266924, "grad_norm": 0.03001215308904648, "learning_rate": 1.0511783031558741e-06, "loss": 0.0002, "num_input_tokens_seen": 73851112, "step": 109565 }, { "epoch": 2.676813329098771, "grad_norm": 0.007266190368682146, "learning_rate": 1.051093137123979e-06, "loss": 0.0271, "num_input_tokens_seen": 73854248, "step": 109570 }, { "epoch": 2.676935479930618, "grad_norm": 39.2228889465332, "learning_rate": 1.0510079707205206e-06, "loss": 0.049, "num_input_tokens_seen": 73857320, "step": 109575 }, { "epoch": 2.6770576307624654, "grad_norm": 0.05401134118437767, "learning_rate": 1.0509228039461184e-06, "loss": 0.0453, "num_input_tokens_seen": 73860328, "step": 109580 }, { "epoch": 2.6771797815943126, "grad_norm": 0.10138184577226639, "learning_rate": 1.0508376368013922e-06, "loss": 0.0002, "num_input_tokens_seen": 73863592, "step": 109585 }, { "epoch": 2.6773019324261598, "grad_norm": 0.04651380330324173, "learning_rate": 1.050752469286961e-06, "loss": 0.0542, "num_input_tokens_seen": 73867496, "step": 109590 }, { "epoch": 2.677424083258007, "grad_norm": 0.005964824929833412, "learning_rate": 1.0506673014034441e-06, "loss": 0.0002, "num_input_tokens_seen": 73871016, "step": 109595 }, { "epoch": 2.677546234089854, "grad_norm": 12.295047760009766, "learning_rate": 1.050582133151461e-06, "loss": 0.1749, "num_input_tokens_seen": 73874792, "step": 109600 }, { "epoch": 2.6776683849217013, "grad_norm": 0.09397143125534058, "learning_rate": 1.0504969645316313e-06, "loss": 0.0002, "num_input_tokens_seen": 73878568, "step": 109605 }, { "epoch": 2.6777905357535485, "grad_norm": 0.007933935150504112, "learning_rate": 1.0504117955445736e-06, "loss": 0.0592, "num_input_tokens_seen": 73881576, "step": 109610 }, { "epoch": 2.6779126865853957, "grad_norm": 0.2045755684375763, "learning_rate": 1.050326626190908e-06, "loss": 0.0768, "num_input_tokens_seen": 73885032, "step": 109615 }, { "epoch": 2.678034837417243, "grad_norm": 0.012918557040393353, "learning_rate": 1.050241456471254e-06, "loss": 0.0703, "num_input_tokens_seen": 73888232, "step": 109620 }, { "epoch": 2.67815698824909, "grad_norm": 0.01960483193397522, "learning_rate": 1.05015628638623e-06, "loss": 0.0003, "num_input_tokens_seen": 73891304, "step": 109625 }, { "epoch": 2.6782791390809373, "grad_norm": 0.026721443980932236, "learning_rate": 1.0500711159364564e-06, "loss": 0.0004, "num_input_tokens_seen": 73894376, "step": 109630 }, { "epoch": 2.6784012899127845, "grad_norm": 0.03172799199819565, "learning_rate": 1.049985945122552e-06, "loss": 0.0007, "num_input_tokens_seen": 73897832, "step": 109635 }, { "epoch": 2.6785234407446312, "grad_norm": 508.57391357421875, "learning_rate": 1.0499007739451364e-06, "loss": 0.0138, "num_input_tokens_seen": 73901352, "step": 109640 }, { "epoch": 2.678645591576479, "grad_norm": 27.824207305908203, "learning_rate": 1.0498156024048285e-06, "loss": 0.0374, "num_input_tokens_seen": 73904680, "step": 109645 }, { "epoch": 2.6787677424083256, "grad_norm": 0.019097011536359787, "learning_rate": 1.0497304305022488e-06, "loss": 0.0002, "num_input_tokens_seen": 73908264, "step": 109650 }, { "epoch": 2.6788898932401732, "grad_norm": 239.4774932861328, "learning_rate": 1.0496452582380158e-06, "loss": 0.0327, "num_input_tokens_seen": 73911592, "step": 109655 }, { "epoch": 2.67901204407202, "grad_norm": 0.0204803254455328, "learning_rate": 1.0495600856127492e-06, "loss": 0.0616, "num_input_tokens_seen": 73914984, "step": 109660 }, { "epoch": 2.679134194903867, "grad_norm": 0.013475023210048676, "learning_rate": 1.0494749126270681e-06, "loss": 0.1155, "num_input_tokens_seen": 73917992, "step": 109665 }, { "epoch": 2.6792563457357144, "grad_norm": 0.08012855798006058, "learning_rate": 1.0493897392815927e-06, "loss": 0.0319, "num_input_tokens_seen": 73921128, "step": 109670 }, { "epoch": 2.6793784965675616, "grad_norm": 0.012000217102468014, "learning_rate": 1.049304565576941e-06, "loss": 0.0491, "num_input_tokens_seen": 73924968, "step": 109675 }, { "epoch": 2.6795006473994087, "grad_norm": 0.018842976540327072, "learning_rate": 1.0492193915137337e-06, "loss": 0.0897, "num_input_tokens_seen": 73928168, "step": 109680 }, { "epoch": 2.679622798231256, "grad_norm": 15.081670761108398, "learning_rate": 1.0491342170925898e-06, "loss": 0.0729, "num_input_tokens_seen": 73931560, "step": 109685 }, { "epoch": 2.679744949063103, "grad_norm": 0.0979156643152237, "learning_rate": 1.0490490423141286e-06, "loss": 0.0372, "num_input_tokens_seen": 73934696, "step": 109690 }, { "epoch": 2.6798670998949503, "grad_norm": 0.03201375901699066, "learning_rate": 1.0489638671789695e-06, "loss": 0.0507, "num_input_tokens_seen": 73938408, "step": 109695 }, { "epoch": 2.6799892507267975, "grad_norm": 0.06937959790229797, "learning_rate": 1.0488786916877322e-06, "loss": 0.0004, "num_input_tokens_seen": 73941544, "step": 109700 }, { "epoch": 2.6801114015586447, "grad_norm": 0.010883811861276627, "learning_rate": 1.0487935158410354e-06, "loss": 0.045, "num_input_tokens_seen": 73945448, "step": 109705 }, { "epoch": 2.680233552390492, "grad_norm": 0.01242207270115614, "learning_rate": 1.0487083396394994e-06, "loss": 0.0579, "num_input_tokens_seen": 73948712, "step": 109710 }, { "epoch": 2.680355703222339, "grad_norm": 0.008689627051353455, "learning_rate": 1.0486231630837435e-06, "loss": 0.027, "num_input_tokens_seen": 73952360, "step": 109715 }, { "epoch": 2.6804778540541863, "grad_norm": 0.15042613446712494, "learning_rate": 1.0485379861743867e-06, "loss": 0.0385, "num_input_tokens_seen": 73955752, "step": 109720 }, { "epoch": 2.680600004886033, "grad_norm": 0.02071407251060009, "learning_rate": 1.0484528089120484e-06, "loss": 0.0506, "num_input_tokens_seen": 73959144, "step": 109725 }, { "epoch": 2.6807221557178806, "grad_norm": 0.9460954666137695, "learning_rate": 1.0483676312973484e-06, "loss": 0.0005, "num_input_tokens_seen": 73962344, "step": 109730 }, { "epoch": 2.6808443065497274, "grad_norm": 0.18505899608135223, "learning_rate": 1.0482824533309057e-06, "loss": 0.0003, "num_input_tokens_seen": 73965544, "step": 109735 }, { "epoch": 2.680966457381575, "grad_norm": 0.01839536800980568, "learning_rate": 1.04819727501334e-06, "loss": 0.0005, "num_input_tokens_seen": 73968936, "step": 109740 }, { "epoch": 2.6810886082134218, "grad_norm": 0.3862949013710022, "learning_rate": 1.0481120963452712e-06, "loss": 0.0503, "num_input_tokens_seen": 73972200, "step": 109745 }, { "epoch": 2.681210759045269, "grad_norm": 0.015912873670458794, "learning_rate": 1.048026917327318e-06, "loss": 0.0004, "num_input_tokens_seen": 73975080, "step": 109750 }, { "epoch": 2.681332909877116, "grad_norm": 23.779499053955078, "learning_rate": 1.0479417379601003e-06, "loss": 0.0591, "num_input_tokens_seen": 73978152, "step": 109755 }, { "epoch": 2.6814550607089633, "grad_norm": 0.2585065960884094, "learning_rate": 1.0478565582442374e-06, "loss": 0.0969, "num_input_tokens_seen": 73981416, "step": 109760 }, { "epoch": 2.6815772115408105, "grad_norm": 86.09591674804688, "learning_rate": 1.0477713781803487e-06, "loss": 0.0928, "num_input_tokens_seen": 73985128, "step": 109765 }, { "epoch": 2.6816993623726577, "grad_norm": 0.012614854611456394, "learning_rate": 1.0476861977690533e-06, "loss": 0.0412, "num_input_tokens_seen": 73988328, "step": 109770 }, { "epoch": 2.681821513204505, "grad_norm": 0.11613646149635315, "learning_rate": 1.0476010170109715e-06, "loss": 0.0452, "num_input_tokens_seen": 73991592, "step": 109775 }, { "epoch": 2.681943664036352, "grad_norm": 1.0370471477508545, "learning_rate": 1.0475158359067222e-06, "loss": 0.0005, "num_input_tokens_seen": 73994920, "step": 109780 }, { "epoch": 2.6820658148681993, "grad_norm": 0.13517054915428162, "learning_rate": 1.047430654456925e-06, "loss": 0.1469, "num_input_tokens_seen": 73997992, "step": 109785 }, { "epoch": 2.6821879657000465, "grad_norm": 123.24833679199219, "learning_rate": 1.0473454726621992e-06, "loss": 0.1009, "num_input_tokens_seen": 74001192, "step": 109790 }, { "epoch": 2.6823101165318937, "grad_norm": 43.56346130371094, "learning_rate": 1.0472602905231647e-06, "loss": 0.001, "num_input_tokens_seen": 74004584, "step": 109795 }, { "epoch": 2.682432267363741, "grad_norm": 0.03455251827836037, "learning_rate": 1.0471751080404401e-06, "loss": 0.0008, "num_input_tokens_seen": 74008040, "step": 109800 }, { "epoch": 2.682554418195588, "grad_norm": 0.12241503596305847, "learning_rate": 1.0470899252146456e-06, "loss": 0.0005, "num_input_tokens_seen": 74010984, "step": 109805 }, { "epoch": 2.6826765690274352, "grad_norm": 0.010911804623901844, "learning_rate": 1.0470047420464008e-06, "loss": 0.0008, "num_input_tokens_seen": 74014184, "step": 109810 }, { "epoch": 2.6827987198592824, "grad_norm": 1.412819504737854, "learning_rate": 1.0469195585363246e-06, "loss": 0.0006, "num_input_tokens_seen": 74017448, "step": 109815 }, { "epoch": 2.682920870691129, "grad_norm": 0.12392840534448624, "learning_rate": 1.0468343746850369e-06, "loss": 0.1279, "num_input_tokens_seen": 74020904, "step": 109820 }, { "epoch": 2.683043021522977, "grad_norm": 414.29779052734375, "learning_rate": 1.0467491904931574e-06, "loss": 0.0209, "num_input_tokens_seen": 74024424, "step": 109825 }, { "epoch": 2.6831651723548235, "grad_norm": 0.019323252141475677, "learning_rate": 1.0466640059613045e-06, "loss": 0.0351, "num_input_tokens_seen": 74027752, "step": 109830 }, { "epoch": 2.6832873231866707, "grad_norm": 0.06316502392292023, "learning_rate": 1.0465788210900987e-06, "loss": 0.0001, "num_input_tokens_seen": 74031016, "step": 109835 }, { "epoch": 2.683409474018518, "grad_norm": 0.035119447857141495, "learning_rate": 1.046493635880159e-06, "loss": 0.059, "num_input_tokens_seen": 74034600, "step": 109840 }, { "epoch": 2.683531624850365, "grad_norm": 0.0017133563524112105, "learning_rate": 1.0464084503321053e-06, "loss": 0.0526, "num_input_tokens_seen": 74037672, "step": 109845 }, { "epoch": 2.6836537756822123, "grad_norm": 0.024416368454694748, "learning_rate": 1.046323264446557e-06, "loss": 0.0802, "num_input_tokens_seen": 74040808, "step": 109850 }, { "epoch": 2.6837759265140595, "grad_norm": 0.08081745356321335, "learning_rate": 1.046238078224133e-06, "loss": 0.0002, "num_input_tokens_seen": 74043944, "step": 109855 }, { "epoch": 2.6838980773459067, "grad_norm": 0.15387476980686188, "learning_rate": 1.0461528916654536e-06, "loss": 0.0271, "num_input_tokens_seen": 74047464, "step": 109860 }, { "epoch": 2.684020228177754, "grad_norm": 0.1924990713596344, "learning_rate": 1.046067704771138e-06, "loss": 0.0003, "num_input_tokens_seen": 74050920, "step": 109865 }, { "epoch": 2.684142379009601, "grad_norm": 0.050476282835006714, "learning_rate": 1.0459825175418057e-06, "loss": 0.0001, "num_input_tokens_seen": 74054312, "step": 109870 }, { "epoch": 2.6842645298414483, "grad_norm": 0.08359876275062561, "learning_rate": 1.045897329978076e-06, "loss": 0.0002, "num_input_tokens_seen": 74057640, "step": 109875 }, { "epoch": 2.6843866806732954, "grad_norm": 0.013668022118508816, "learning_rate": 1.0458121420805685e-06, "loss": 0.0004, "num_input_tokens_seen": 74060648, "step": 109880 }, { "epoch": 2.6845088315051426, "grad_norm": 0.011722809635102749, "learning_rate": 1.045726953849903e-06, "loss": 0.0581, "num_input_tokens_seen": 74064232, "step": 109885 }, { "epoch": 2.68463098233699, "grad_norm": 0.153499573469162, "learning_rate": 1.0456417652866986e-06, "loss": 0.0007, "num_input_tokens_seen": 74067368, "step": 109890 }, { "epoch": 2.684753133168837, "grad_norm": 0.11200530081987381, "learning_rate": 1.045556576391575e-06, "loss": 0.0438, "num_input_tokens_seen": 74071016, "step": 109895 }, { "epoch": 2.684875284000684, "grad_norm": 0.01185649260878563, "learning_rate": 1.0454713871651518e-06, "loss": 0.0004, "num_input_tokens_seen": 74074408, "step": 109900 }, { "epoch": 2.684997434832531, "grad_norm": 14.90159797668457, "learning_rate": 1.0453861976080485e-06, "loss": 0.0523, "num_input_tokens_seen": 74078184, "step": 109905 }, { "epoch": 2.6851195856643786, "grad_norm": 0.015263424254953861, "learning_rate": 1.0453010077208845e-06, "loss": 0.0002, "num_input_tokens_seen": 74081576, "step": 109910 }, { "epoch": 2.6852417364962253, "grad_norm": 0.10612472891807556, "learning_rate": 1.0452158175042794e-06, "loss": 0.0134, "num_input_tokens_seen": 74084904, "step": 109915 }, { "epoch": 2.685363887328073, "grad_norm": 0.0759076327085495, "learning_rate": 1.0451306269588526e-06, "loss": 0.0866, "num_input_tokens_seen": 74088360, "step": 109920 }, { "epoch": 2.6854860381599197, "grad_norm": 0.004117521457374096, "learning_rate": 1.0450454360852238e-06, "loss": 0.0005, "num_input_tokens_seen": 74091432, "step": 109925 }, { "epoch": 2.685608188991767, "grad_norm": 0.04352575168013573, "learning_rate": 1.0449602448840127e-06, "loss": 0.0003, "num_input_tokens_seen": 74094568, "step": 109930 }, { "epoch": 2.685730339823614, "grad_norm": 42.42630386352539, "learning_rate": 1.0448750533558383e-06, "loss": 0.0733, "num_input_tokens_seen": 74097896, "step": 109935 }, { "epoch": 2.6858524906554613, "grad_norm": 157.4754180908203, "learning_rate": 1.0447898615013206e-06, "loss": 0.0455, "num_input_tokens_seen": 74101032, "step": 109940 }, { "epoch": 2.6859746414873085, "grad_norm": 0.034718506038188934, "learning_rate": 1.044704669321079e-06, "loss": 0.0701, "num_input_tokens_seen": 74104552, "step": 109945 }, { "epoch": 2.6860967923191557, "grad_norm": 0.18217360973358154, "learning_rate": 1.044619476815733e-06, "loss": 0.0004, "num_input_tokens_seen": 74108328, "step": 109950 }, { "epoch": 2.686218943151003, "grad_norm": 36.641632080078125, "learning_rate": 1.044534283985902e-06, "loss": 0.074, "num_input_tokens_seen": 74112168, "step": 109955 }, { "epoch": 2.68634109398285, "grad_norm": 0.04653691127896309, "learning_rate": 1.044449090832206e-06, "loss": 0.0443, "num_input_tokens_seen": 74115624, "step": 109960 }, { "epoch": 2.6864632448146972, "grad_norm": 0.010276403278112411, "learning_rate": 1.044363897355264e-06, "loss": 0.0447, "num_input_tokens_seen": 74118696, "step": 109965 }, { "epoch": 2.6865853956465444, "grad_norm": 22.342988967895508, "learning_rate": 1.044278703555696e-06, "loss": 0.1297, "num_input_tokens_seen": 74122088, "step": 109970 }, { "epoch": 2.6867075464783916, "grad_norm": 0.28928831219673157, "learning_rate": 1.044193509434121e-06, "loss": 0.0467, "num_input_tokens_seen": 74125032, "step": 109975 }, { "epoch": 2.686829697310239, "grad_norm": 16.242509841918945, "learning_rate": 1.0441083149911596e-06, "loss": 0.0384, "num_input_tokens_seen": 74128360, "step": 109980 }, { "epoch": 2.686951848142086, "grad_norm": 0.03907003253698349, "learning_rate": 1.04402312022743e-06, "loss": 0.0019, "num_input_tokens_seen": 74132008, "step": 109985 }, { "epoch": 2.687073998973933, "grad_norm": 0.10918454825878143, "learning_rate": 1.0439379251435527e-06, "loss": 0.0002, "num_input_tokens_seen": 74135208, "step": 109990 }, { "epoch": 2.6871961498057804, "grad_norm": 0.6462915539741516, "learning_rate": 1.043852729740147e-06, "loss": 0.0008, "num_input_tokens_seen": 74138600, "step": 109995 }, { "epoch": 2.687318300637627, "grad_norm": 0.1784975826740265, "learning_rate": 1.0437675340178322e-06, "loss": 0.0655, "num_input_tokens_seen": 74141992, "step": 110000 }, { "epoch": 2.6874404514694747, "grad_norm": 0.0959717407822609, "learning_rate": 1.0436823379772283e-06, "loss": 0.0427, "num_input_tokens_seen": 74145128, "step": 110005 }, { "epoch": 2.6875626023013215, "grad_norm": 0.012468988075852394, "learning_rate": 1.0435971416189549e-06, "loss": 0.0003, "num_input_tokens_seen": 74148776, "step": 110010 }, { "epoch": 2.6876847531331687, "grad_norm": 0.022382086142897606, "learning_rate": 1.0435119449436309e-06, "loss": 0.0481, "num_input_tokens_seen": 74151912, "step": 110015 }, { "epoch": 2.687806903965016, "grad_norm": 0.04924376308917999, "learning_rate": 1.0434267479518768e-06, "loss": 0.0373, "num_input_tokens_seen": 74154984, "step": 110020 }, { "epoch": 2.687929054796863, "grad_norm": 18.4334659576416, "learning_rate": 1.0433415506443117e-06, "loss": 0.193, "num_input_tokens_seen": 74157928, "step": 110025 }, { "epoch": 2.6880512056287102, "grad_norm": 0.030442774295806885, "learning_rate": 1.043256353021555e-06, "loss": 0.0006, "num_input_tokens_seen": 74161512, "step": 110030 }, { "epoch": 2.6881733564605574, "grad_norm": 0.06618069857358932, "learning_rate": 1.0431711550842265e-06, "loss": 0.0002, "num_input_tokens_seen": 74164968, "step": 110035 }, { "epoch": 2.6882955072924046, "grad_norm": 0.46438732743263245, "learning_rate": 1.0430859568329458e-06, "loss": 0.0004, "num_input_tokens_seen": 74168232, "step": 110040 }, { "epoch": 2.688417658124252, "grad_norm": 1.1904546022415161, "learning_rate": 1.0430007582683322e-06, "loss": 0.0494, "num_input_tokens_seen": 74171624, "step": 110045 }, { "epoch": 2.688539808956099, "grad_norm": 478.31195068359375, "learning_rate": 1.0429155593910055e-06, "loss": 0.0137, "num_input_tokens_seen": 74175272, "step": 110050 }, { "epoch": 2.688661959787946, "grad_norm": 0.02546611614525318, "learning_rate": 1.0428303602015856e-06, "loss": 0.0411, "num_input_tokens_seen": 74178792, "step": 110055 }, { "epoch": 2.6887841106197934, "grad_norm": 10.07689094543457, "learning_rate": 1.0427451607006916e-06, "loss": 0.1165, "num_input_tokens_seen": 74182248, "step": 110060 }, { "epoch": 2.6889062614516406, "grad_norm": 0.045556433498859406, "learning_rate": 1.0426599608889435e-06, "loss": 0.0005, "num_input_tokens_seen": 74185896, "step": 110065 }, { "epoch": 2.6890284122834878, "grad_norm": 0.21916463971138, "learning_rate": 1.0425747607669607e-06, "loss": 0.0047, "num_input_tokens_seen": 74189096, "step": 110070 }, { "epoch": 2.689150563115335, "grad_norm": 81.54149627685547, "learning_rate": 1.0424895603353626e-06, "loss": 0.0301, "num_input_tokens_seen": 74192168, "step": 110075 }, { "epoch": 2.689272713947182, "grad_norm": 0.053823795169591904, "learning_rate": 1.0424043595947692e-06, "loss": 0.0455, "num_input_tokens_seen": 74195368, "step": 110080 }, { "epoch": 2.689394864779029, "grad_norm": 0.05746879801154137, "learning_rate": 1.0423191585457997e-06, "loss": 0.0002, "num_input_tokens_seen": 74198568, "step": 110085 }, { "epoch": 2.6895170156108765, "grad_norm": 0.039911337196826935, "learning_rate": 1.0422339571890738e-06, "loss": 0.0326, "num_input_tokens_seen": 74202024, "step": 110090 }, { "epoch": 2.6896391664427233, "grad_norm": 0.011159949004650116, "learning_rate": 1.0421487555252115e-06, "loss": 0.0468, "num_input_tokens_seen": 74205736, "step": 110095 }, { "epoch": 2.689761317274571, "grad_norm": 0.0034481934271752834, "learning_rate": 1.042063553554832e-06, "loss": 0.1252, "num_input_tokens_seen": 74208936, "step": 110100 }, { "epoch": 2.6898834681064177, "grad_norm": 0.03940635919570923, "learning_rate": 1.041978351278555e-06, "loss": 0.0002, "num_input_tokens_seen": 74212200, "step": 110105 }, { "epoch": 2.690005618938265, "grad_norm": 0.3935776352882385, "learning_rate": 1.0418931486969998e-06, "loss": 0.0007, "num_input_tokens_seen": 74215976, "step": 110110 }, { "epoch": 2.690127769770112, "grad_norm": 0.1421252191066742, "learning_rate": 1.0418079458107868e-06, "loss": 0.0001, "num_input_tokens_seen": 74219176, "step": 110115 }, { "epoch": 2.690249920601959, "grad_norm": 1.2814970016479492, "learning_rate": 1.041722742620535e-06, "loss": 0.0011, "num_input_tokens_seen": 74222696, "step": 110120 }, { "epoch": 2.6903720714338064, "grad_norm": 0.04111748933792114, "learning_rate": 1.0416375391268642e-06, "loss": 0.17, "num_input_tokens_seen": 74225960, "step": 110125 }, { "epoch": 2.6904942222656536, "grad_norm": 385.12689208984375, "learning_rate": 1.0415523353303942e-06, "loss": 0.0067, "num_input_tokens_seen": 74229992, "step": 110130 }, { "epoch": 2.690616373097501, "grad_norm": 0.01891736313700676, "learning_rate": 1.0414671312317444e-06, "loss": 0.0004, "num_input_tokens_seen": 74233576, "step": 110135 }, { "epoch": 2.690738523929348, "grad_norm": 9.382598876953125, "learning_rate": 1.0413819268315343e-06, "loss": 0.1042, "num_input_tokens_seen": 74237224, "step": 110140 }, { "epoch": 2.690860674761195, "grad_norm": 0.06907006353139877, "learning_rate": 1.0412967221303836e-06, "loss": 0.0367, "num_input_tokens_seen": 74240808, "step": 110145 }, { "epoch": 2.6909828255930424, "grad_norm": 0.052153579890728, "learning_rate": 1.041211517128912e-06, "loss": 0.0001, "num_input_tokens_seen": 74244392, "step": 110150 }, { "epoch": 2.6911049764248895, "grad_norm": 30.580373764038086, "learning_rate": 1.0411263118277396e-06, "loss": 0.1193, "num_input_tokens_seen": 74247464, "step": 110155 }, { "epoch": 2.6912271272567367, "grad_norm": 0.918717622756958, "learning_rate": 1.0410411062274856e-06, "loss": 0.0004, "num_input_tokens_seen": 74250664, "step": 110160 }, { "epoch": 2.691349278088584, "grad_norm": 0.008485613390803337, "learning_rate": 1.0409559003287692e-06, "loss": 0.066, "num_input_tokens_seen": 74254376, "step": 110165 }, { "epoch": 2.6914714289204307, "grad_norm": 0.03416123986244202, "learning_rate": 1.0408706941322105e-06, "loss": 0.0647, "num_input_tokens_seen": 74257832, "step": 110170 }, { "epoch": 2.6915935797522783, "grad_norm": 0.05413348227739334, "learning_rate": 1.0407854876384293e-06, "loss": 0.0744, "num_input_tokens_seen": 74261096, "step": 110175 }, { "epoch": 2.691715730584125, "grad_norm": 0.02704242244362831, "learning_rate": 1.0407002808480454e-06, "loss": 0.0001, "num_input_tokens_seen": 74264680, "step": 110180 }, { "epoch": 2.6918378814159727, "grad_norm": 0.0406159944832325, "learning_rate": 1.0406150737616776e-06, "loss": 0.0811, "num_input_tokens_seen": 74268264, "step": 110185 }, { "epoch": 2.6919600322478194, "grad_norm": 0.09724722802639008, "learning_rate": 1.0405298663799463e-06, "loss": 0.0358, "num_input_tokens_seen": 74271464, "step": 110190 }, { "epoch": 2.6920821830796666, "grad_norm": 8.796319961547852, "learning_rate": 1.0404446587034708e-06, "loss": 0.0011, "num_input_tokens_seen": 74274536, "step": 110195 }, { "epoch": 2.692204333911514, "grad_norm": 0.05820569023489952, "learning_rate": 1.040359450732871e-06, "loss": 0.0002, "num_input_tokens_seen": 74277928, "step": 110200 }, { "epoch": 2.692326484743361, "grad_norm": 0.004109354689717293, "learning_rate": 1.040274242468766e-06, "loss": 0.0001, "num_input_tokens_seen": 74281256, "step": 110205 }, { "epoch": 2.692448635575208, "grad_norm": 0.1228046640753746, "learning_rate": 1.0401890339117763e-06, "loss": 0.0004, "num_input_tokens_seen": 74284968, "step": 110210 }, { "epoch": 2.6925707864070554, "grad_norm": 0.012766096740961075, "learning_rate": 1.0401038250625212e-06, "loss": 0.0004, "num_input_tokens_seen": 74288360, "step": 110215 }, { "epoch": 2.6926929372389026, "grad_norm": 0.03938378766179085, "learning_rate": 1.04001861592162e-06, "loss": 0.0001, "num_input_tokens_seen": 74291304, "step": 110220 }, { "epoch": 2.6928150880707498, "grad_norm": 0.02110552228987217, "learning_rate": 1.0399334064896927e-06, "loss": 0.0396, "num_input_tokens_seen": 74295528, "step": 110225 }, { "epoch": 2.692937238902597, "grad_norm": 0.011235724203288555, "learning_rate": 1.0398481967673592e-06, "loss": 0.0001, "num_input_tokens_seen": 74299112, "step": 110230 }, { "epoch": 2.693059389734444, "grad_norm": 0.0376700684428215, "learning_rate": 1.0397629867552387e-06, "loss": 0.0483, "num_input_tokens_seen": 74302632, "step": 110235 }, { "epoch": 2.6931815405662913, "grad_norm": 0.06534771621227264, "learning_rate": 1.039677776453951e-06, "loss": 0.0002, "num_input_tokens_seen": 74305832, "step": 110240 }, { "epoch": 2.6933036913981385, "grad_norm": 0.1882932335138321, "learning_rate": 1.0395925658641161e-06, "loss": 0.0002, "num_input_tokens_seen": 74309224, "step": 110245 }, { "epoch": 2.6934258422299857, "grad_norm": 0.04143861308693886, "learning_rate": 1.039507354986353e-06, "loss": 0.0001, "num_input_tokens_seen": 74312552, "step": 110250 }, { "epoch": 2.693547993061833, "grad_norm": 0.0019535149913281202, "learning_rate": 1.0394221438212822e-06, "loss": 0.0001, "num_input_tokens_seen": 74315880, "step": 110255 }, { "epoch": 2.69367014389368, "grad_norm": 0.4619247317314148, "learning_rate": 1.0393369323695227e-06, "loss": 0.0373, "num_input_tokens_seen": 74319144, "step": 110260 }, { "epoch": 2.693792294725527, "grad_norm": 0.12660761177539825, "learning_rate": 1.0392517206316944e-06, "loss": 0.0397, "num_input_tokens_seen": 74322408, "step": 110265 }, { "epoch": 2.6939144455573745, "grad_norm": 0.00939325150102377, "learning_rate": 1.0391665086084172e-06, "loss": 0.0002, "num_input_tokens_seen": 74325480, "step": 110270 }, { "epoch": 2.694036596389221, "grad_norm": 13.599160194396973, "learning_rate": 1.0390812963003105e-06, "loss": 0.0656, "num_input_tokens_seen": 74328488, "step": 110275 }, { "epoch": 2.694158747221069, "grad_norm": 0.04216121882200241, "learning_rate": 1.0389960837079944e-06, "loss": 0.0378, "num_input_tokens_seen": 74331880, "step": 110280 }, { "epoch": 2.6942808980529156, "grad_norm": 0.015083221718668938, "learning_rate": 1.0389108708320879e-06, "loss": 0.0332, "num_input_tokens_seen": 74335144, "step": 110285 }, { "epoch": 2.694403048884763, "grad_norm": 0.15606962144374847, "learning_rate": 1.0388256576732115e-06, "loss": 0.0656, "num_input_tokens_seen": 74338664, "step": 110290 }, { "epoch": 2.69452519971661, "grad_norm": 0.02123434469103813, "learning_rate": 1.038740444231984e-06, "loss": 0.0001, "num_input_tokens_seen": 74341992, "step": 110295 }, { "epoch": 2.694647350548457, "grad_norm": 0.007134847808629274, "learning_rate": 1.0386552305090256e-06, "loss": 0.0523, "num_input_tokens_seen": 74345384, "step": 110300 }, { "epoch": 2.6947695013803044, "grad_norm": 0.007809483911842108, "learning_rate": 1.0385700165049565e-06, "loss": 0.035, "num_input_tokens_seen": 74348584, "step": 110305 }, { "epoch": 2.6948916522121515, "grad_norm": 2.889589548110962, "learning_rate": 1.0384848022203955e-06, "loss": 0.0611, "num_input_tokens_seen": 74352360, "step": 110310 }, { "epoch": 2.6950138030439987, "grad_norm": 0.8335879445075989, "learning_rate": 1.0383995876559626e-06, "loss": 0.001, "num_input_tokens_seen": 74356008, "step": 110315 }, { "epoch": 2.695135953875846, "grad_norm": 0.08728937804698944, "learning_rate": 1.0383143728122776e-06, "loss": 0.0002, "num_input_tokens_seen": 74359272, "step": 110320 }, { "epoch": 2.695258104707693, "grad_norm": 0.11635275930166245, "learning_rate": 1.0382291576899605e-06, "loss": 0.0304, "num_input_tokens_seen": 74362536, "step": 110325 }, { "epoch": 2.6953802555395403, "grad_norm": 0.11208701133728027, "learning_rate": 1.03814394228963e-06, "loss": 0.0001, "num_input_tokens_seen": 74365544, "step": 110330 }, { "epoch": 2.6955024063713875, "grad_norm": 0.022906199097633362, "learning_rate": 1.0380587266119072e-06, "loss": 0.0003, "num_input_tokens_seen": 74368872, "step": 110335 }, { "epoch": 2.6956245572032347, "grad_norm": 0.024830637499690056, "learning_rate": 1.0379735106574107e-06, "loss": 0.0465, "num_input_tokens_seen": 74372200, "step": 110340 }, { "epoch": 2.695746708035082, "grad_norm": 0.005570548586547375, "learning_rate": 1.0378882944267608e-06, "loss": 0.0579, "num_input_tokens_seen": 74375784, "step": 110345 }, { "epoch": 2.6958688588669286, "grad_norm": 20.110021591186523, "learning_rate": 1.037803077920577e-06, "loss": 0.1055, "num_input_tokens_seen": 74379112, "step": 110350 }, { "epoch": 2.6959910096987763, "grad_norm": 16.832504272460938, "learning_rate": 1.037717861139479e-06, "loss": 0.0908, "num_input_tokens_seen": 74382248, "step": 110355 }, { "epoch": 2.696113160530623, "grad_norm": 0.008288275450468063, "learning_rate": 1.0376326440840863e-06, "loss": 0.083, "num_input_tokens_seen": 74385512, "step": 110360 }, { "epoch": 2.6962353113624706, "grad_norm": 0.15754544734954834, "learning_rate": 1.0375474267550192e-06, "loss": 0.0502, "num_input_tokens_seen": 74388904, "step": 110365 }, { "epoch": 2.6963574621943174, "grad_norm": 0.011054810136556625, "learning_rate": 1.0374622091528973e-06, "loss": 0.0367, "num_input_tokens_seen": 74392424, "step": 110370 }, { "epoch": 2.6964796130261646, "grad_norm": 16.19540786743164, "learning_rate": 1.03737699127834e-06, "loss": 0.0426, "num_input_tokens_seen": 74396392, "step": 110375 }, { "epoch": 2.6966017638580118, "grad_norm": 0.013768474571406841, "learning_rate": 1.037291773131967e-06, "loss": 0.0002, "num_input_tokens_seen": 74399656, "step": 110380 }, { "epoch": 2.696723914689859, "grad_norm": 0.22251591086387634, "learning_rate": 1.0372065547143982e-06, "loss": 0.061, "num_input_tokens_seen": 74403240, "step": 110385 }, { "epoch": 2.696846065521706, "grad_norm": 0.13074393570423126, "learning_rate": 1.0371213360262537e-06, "loss": 0.0491, "num_input_tokens_seen": 74406888, "step": 110390 }, { "epoch": 2.6969682163535533, "grad_norm": 0.020176751539111137, "learning_rate": 1.0370361170681525e-06, "loss": 0.0968, "num_input_tokens_seen": 74410024, "step": 110395 }, { "epoch": 2.6970903671854005, "grad_norm": 0.0231939610093832, "learning_rate": 1.0369508978407146e-06, "loss": 0.009, "num_input_tokens_seen": 74413480, "step": 110400 }, { "epoch": 2.6972125180172477, "grad_norm": 0.05197136104106903, "learning_rate": 1.0368656783445603e-06, "loss": 0.0004, "num_input_tokens_seen": 74416808, "step": 110405 }, { "epoch": 2.697334668849095, "grad_norm": 1.812922716140747, "learning_rate": 1.0367804585803084e-06, "loss": 0.0412, "num_input_tokens_seen": 74421224, "step": 110410 }, { "epoch": 2.697456819680942, "grad_norm": 0.00616570794954896, "learning_rate": 1.0366952385485792e-06, "loss": 0.0004, "num_input_tokens_seen": 74425000, "step": 110415 }, { "epoch": 2.6975789705127893, "grad_norm": 0.011104024946689606, "learning_rate": 1.0366100182499923e-06, "loss": 0.0001, "num_input_tokens_seen": 74428328, "step": 110420 }, { "epoch": 2.6977011213446365, "grad_norm": 0.013107498176395893, "learning_rate": 1.0365247976851677e-06, "loss": 0.015, "num_input_tokens_seen": 74432424, "step": 110425 }, { "epoch": 2.6978232721764837, "grad_norm": 0.03257214277982712, "learning_rate": 1.036439576854725e-06, "loss": 0.0002, "num_input_tokens_seen": 74435752, "step": 110430 }, { "epoch": 2.697945423008331, "grad_norm": 0.002233329229056835, "learning_rate": 1.0363543557592838e-06, "loss": 0.0002, "num_input_tokens_seen": 74439016, "step": 110435 }, { "epoch": 2.698067573840178, "grad_norm": 0.049250464886426926, "learning_rate": 1.036269134399464e-06, "loss": 0.0001, "num_input_tokens_seen": 74442600, "step": 110440 }, { "epoch": 2.698189724672025, "grad_norm": 0.10871399939060211, "learning_rate": 1.0361839127758854e-06, "loss": 0.0664, "num_input_tokens_seen": 74445928, "step": 110445 }, { "epoch": 2.6983118755038724, "grad_norm": 0.6769186854362488, "learning_rate": 1.0360986908891672e-06, "loss": 0.0002, "num_input_tokens_seen": 74449640, "step": 110450 }, { "epoch": 2.698434026335719, "grad_norm": 0.0014609667705371976, "learning_rate": 1.03601346873993e-06, "loss": 0.0002, "num_input_tokens_seen": 74453032, "step": 110455 }, { "epoch": 2.6985561771675664, "grad_norm": 0.0031799173448234797, "learning_rate": 1.035928246328793e-06, "loss": 0.0714, "num_input_tokens_seen": 74456360, "step": 110460 }, { "epoch": 2.6986783279994135, "grad_norm": 0.0017244528280571103, "learning_rate": 1.0358430236563762e-06, "loss": 0.0001, "num_input_tokens_seen": 74460008, "step": 110465 }, { "epoch": 2.6988004788312607, "grad_norm": 2.850145101547241, "learning_rate": 1.0357578007232991e-06, "loss": 0.001, "num_input_tokens_seen": 74463144, "step": 110470 }, { "epoch": 2.698922629663108, "grad_norm": 0.010116000659763813, "learning_rate": 1.035672577530182e-06, "loss": 0.0366, "num_input_tokens_seen": 74466856, "step": 110475 }, { "epoch": 2.699044780494955, "grad_norm": 0.10385355353355408, "learning_rate": 1.0355873540776443e-06, "loss": 0.0362, "num_input_tokens_seen": 74470184, "step": 110480 }, { "epoch": 2.6991669313268023, "grad_norm": 51.628379821777344, "learning_rate": 1.0355021303663053e-06, "loss": 0.0514, "num_input_tokens_seen": 74473896, "step": 110485 }, { "epoch": 2.6992890821586495, "grad_norm": 0.009503796696662903, "learning_rate": 1.035416906396786e-06, "loss": 0.0279, "num_input_tokens_seen": 74477736, "step": 110490 }, { "epoch": 2.6994112329904967, "grad_norm": 0.0070153940469026566, "learning_rate": 1.035331682169705e-06, "loss": 0.0001, "num_input_tokens_seen": 74480872, "step": 110495 }, { "epoch": 2.699533383822344, "grad_norm": 0.007911594584584236, "learning_rate": 1.0352464576856826e-06, "loss": 0.0, "num_input_tokens_seen": 74484072, "step": 110500 }, { "epoch": 2.699655534654191, "grad_norm": 0.2613266706466675, "learning_rate": 1.0351612329453384e-06, "loss": 0.0002, "num_input_tokens_seen": 74487208, "step": 110505 }, { "epoch": 2.6997776854860382, "grad_norm": 0.01738755591213703, "learning_rate": 1.0350760079492922e-06, "loss": 0.0001, "num_input_tokens_seen": 74490472, "step": 110510 }, { "epoch": 2.6998998363178854, "grad_norm": 0.1000705137848854, "learning_rate": 1.0349907826981638e-06, "loss": 0.0002, "num_input_tokens_seen": 74493928, "step": 110515 }, { "epoch": 2.7000219871497326, "grad_norm": 0.00792511273175478, "learning_rate": 1.0349055571925731e-06, "loss": 0.0002, "num_input_tokens_seen": 74497064, "step": 110520 }, { "epoch": 2.70014413798158, "grad_norm": 0.0014833626337349415, "learning_rate": 1.0348203314331398e-06, "loss": 0.0001, "num_input_tokens_seen": 74500456, "step": 110525 }, { "epoch": 2.7002662888134266, "grad_norm": 4.809901714324951, "learning_rate": 1.0347351054204839e-06, "loss": 0.063, "num_input_tokens_seen": 74503656, "step": 110530 }, { "epoch": 2.700388439645274, "grad_norm": 0.15625979006290436, "learning_rate": 1.0346498791552247e-06, "loss": 0.0384, "num_input_tokens_seen": 74507048, "step": 110535 }, { "epoch": 2.700510590477121, "grad_norm": 0.02334548532962799, "learning_rate": 1.0345646526379824e-06, "loss": 0.0447, "num_input_tokens_seen": 74510504, "step": 110540 }, { "epoch": 2.7006327413089686, "grad_norm": 0.6260141134262085, "learning_rate": 1.0344794258693766e-06, "loss": 0.0005, "num_input_tokens_seen": 74513768, "step": 110545 }, { "epoch": 2.7007548921408153, "grad_norm": 0.17521077394485474, "learning_rate": 1.0343941988500271e-06, "loss": 0.0004, "num_input_tokens_seen": 74517032, "step": 110550 }, { "epoch": 2.7008770429726625, "grad_norm": 0.017024708911776543, "learning_rate": 1.0343089715805537e-06, "loss": 0.0001, "num_input_tokens_seen": 74520360, "step": 110555 }, { "epoch": 2.7009991938045097, "grad_norm": 0.15910948812961578, "learning_rate": 1.0342237440615765e-06, "loss": 0.0001, "num_input_tokens_seen": 74524264, "step": 110560 }, { "epoch": 2.701121344636357, "grad_norm": 0.09337528795003891, "learning_rate": 1.0341385162937147e-06, "loss": 0.0879, "num_input_tokens_seen": 74527848, "step": 110565 }, { "epoch": 2.701243495468204, "grad_norm": 0.0003024581237696111, "learning_rate": 1.0340532882775887e-06, "loss": 0.0001, "num_input_tokens_seen": 74531176, "step": 110570 }, { "epoch": 2.7013656463000513, "grad_norm": 0.010254576802253723, "learning_rate": 1.0339680600138176e-06, "loss": 0.0001, "num_input_tokens_seen": 74534632, "step": 110575 }, { "epoch": 2.7014877971318985, "grad_norm": 0.058449145406484604, "learning_rate": 1.033882831503022e-06, "loss": 0.115, "num_input_tokens_seen": 74538024, "step": 110580 }, { "epoch": 2.7016099479637457, "grad_norm": 0.2035467028617859, "learning_rate": 1.0337976027458213e-06, "loss": 0.0003, "num_input_tokens_seen": 74541288, "step": 110585 }, { "epoch": 2.701732098795593, "grad_norm": 0.09681735932826996, "learning_rate": 1.0337123737428352e-06, "loss": 0.0002, "num_input_tokens_seen": 74544360, "step": 110590 }, { "epoch": 2.70185424962744, "grad_norm": 0.28981155157089233, "learning_rate": 1.033627144494684e-06, "loss": 0.0741, "num_input_tokens_seen": 74547688, "step": 110595 }, { "epoch": 2.701976400459287, "grad_norm": 0.4329851567745209, "learning_rate": 1.033541915001987e-06, "loss": 0.0002, "num_input_tokens_seen": 74550888, "step": 110600 }, { "epoch": 2.7020985512911344, "grad_norm": 0.023930076509714127, "learning_rate": 1.033456685265364e-06, "loss": 0.0412, "num_input_tokens_seen": 74554280, "step": 110605 }, { "epoch": 2.7022207021229816, "grad_norm": 0.014112595468759537, "learning_rate": 1.0333714552854349e-06, "loss": 0.0001, "num_input_tokens_seen": 74557544, "step": 110610 }, { "epoch": 2.7023428529548283, "grad_norm": 0.8872970342636108, "learning_rate": 1.0332862250628198e-06, "loss": 0.0001, "num_input_tokens_seen": 74560552, "step": 110615 }, { "epoch": 2.702465003786676, "grad_norm": 0.1556682139635086, "learning_rate": 1.0332009945981384e-06, "loss": 0.0002, "num_input_tokens_seen": 74564264, "step": 110620 }, { "epoch": 2.7025871546185227, "grad_norm": 0.0046437764540314674, "learning_rate": 1.0331157638920102e-06, "loss": 0.0114, "num_input_tokens_seen": 74567464, "step": 110625 }, { "epoch": 2.7027093054503704, "grad_norm": 0.013693672604858875, "learning_rate": 1.0330305329450555e-06, "loss": 0.0342, "num_input_tokens_seen": 74570536, "step": 110630 }, { "epoch": 2.702831456282217, "grad_norm": 0.28892767429351807, "learning_rate": 1.0329453017578937e-06, "loss": 0.0003, "num_input_tokens_seen": 74574312, "step": 110635 }, { "epoch": 2.7029536071140643, "grad_norm": 0.027796685695648193, "learning_rate": 1.0328600703311447e-06, "loss": 0.0523, "num_input_tokens_seen": 74577640, "step": 110640 }, { "epoch": 2.7030757579459115, "grad_norm": 0.004140998236835003, "learning_rate": 1.0327748386654287e-06, "loss": 0.0002, "num_input_tokens_seen": 74581160, "step": 110645 }, { "epoch": 2.7031979087777587, "grad_norm": 0.008366966620087624, "learning_rate": 1.0326896067613654e-06, "loss": 0.0001, "num_input_tokens_seen": 74584808, "step": 110650 }, { "epoch": 2.703320059609606, "grad_norm": 0.013736417517066002, "learning_rate": 1.032604374619574e-06, "loss": 0.0419, "num_input_tokens_seen": 74588520, "step": 110655 }, { "epoch": 2.703442210441453, "grad_norm": 130.45484924316406, "learning_rate": 1.0325191422406751e-06, "loss": 0.049, "num_input_tokens_seen": 74592104, "step": 110660 }, { "epoch": 2.7035643612733002, "grad_norm": 0.008666807785630226, "learning_rate": 1.0324339096252883e-06, "loss": 0.0638, "num_input_tokens_seen": 74595944, "step": 110665 }, { "epoch": 2.7036865121051474, "grad_norm": 0.015011264011263847, "learning_rate": 1.0323486767740331e-06, "loss": 0.0001, "num_input_tokens_seen": 74599528, "step": 110670 }, { "epoch": 2.7038086629369946, "grad_norm": 0.0012946202186867595, "learning_rate": 1.03226344368753e-06, "loss": 0.0002, "num_input_tokens_seen": 74602856, "step": 110675 }, { "epoch": 2.703930813768842, "grad_norm": 0.0014228617073968053, "learning_rate": 1.032178210366398e-06, "loss": 0.0468, "num_input_tokens_seen": 74606632, "step": 110680 }, { "epoch": 2.704052964600689, "grad_norm": 0.0369437038898468, "learning_rate": 1.0320929768112578e-06, "loss": 0.0465, "num_input_tokens_seen": 74609704, "step": 110685 }, { "epoch": 2.704175115432536, "grad_norm": 15.255035400390625, "learning_rate": 1.0320077430227287e-06, "loss": 0.0526, "num_input_tokens_seen": 74613544, "step": 110690 }, { "epoch": 2.7042972662643834, "grad_norm": 0.0014747011009603739, "learning_rate": 1.0319225090014307e-06, "loss": 0.0001, "num_input_tokens_seen": 74617192, "step": 110695 }, { "epoch": 2.7044194170962306, "grad_norm": 0.017089655622839928, "learning_rate": 1.0318372747479838e-06, "loss": 0.0001, "num_input_tokens_seen": 74620584, "step": 110700 }, { "epoch": 2.7045415679280778, "grad_norm": 0.01144702173769474, "learning_rate": 1.0317520402630076e-06, "loss": 0.0967, "num_input_tokens_seen": 74623912, "step": 110705 }, { "epoch": 2.7046637187599245, "grad_norm": 0.08541342616081238, "learning_rate": 1.0316668055471219e-06, "loss": 0.055, "num_input_tokens_seen": 74627304, "step": 110710 }, { "epoch": 2.704785869591772, "grad_norm": 149.50946044921875, "learning_rate": 1.0315815706009464e-06, "loss": 0.002, "num_input_tokens_seen": 74630440, "step": 110715 }, { "epoch": 2.704908020423619, "grad_norm": 0.012330389581620693, "learning_rate": 1.0314963354251018e-06, "loss": 0.0004, "num_input_tokens_seen": 74633832, "step": 110720 }, { "epoch": 2.7050301712554665, "grad_norm": 0.0038798125460743904, "learning_rate": 1.031411100020207e-06, "loss": 0.0001, "num_input_tokens_seen": 74636968, "step": 110725 }, { "epoch": 2.7051523220873133, "grad_norm": 0.02092175930738449, "learning_rate": 1.0313258643868823e-06, "loss": 0.0001, "num_input_tokens_seen": 74640424, "step": 110730 }, { "epoch": 2.7052744729191605, "grad_norm": 0.029145987704396248, "learning_rate": 1.0312406285257474e-06, "loss": 0.0516, "num_input_tokens_seen": 74643368, "step": 110735 }, { "epoch": 2.7053966237510076, "grad_norm": 0.09617263823747635, "learning_rate": 1.0311553924374224e-06, "loss": 0.0442, "num_input_tokens_seen": 74646440, "step": 110740 }, { "epoch": 2.705518774582855, "grad_norm": 0.0012000126298516989, "learning_rate": 1.031070156122527e-06, "loss": 0.0001, "num_input_tokens_seen": 74649960, "step": 110745 }, { "epoch": 2.705640925414702, "grad_norm": 0.03371034935116768, "learning_rate": 1.030984919581681e-06, "loss": 0.0001, "num_input_tokens_seen": 74653480, "step": 110750 }, { "epoch": 2.705763076246549, "grad_norm": 0.06264438480138779, "learning_rate": 1.0308996828155048e-06, "loss": 0.1049, "num_input_tokens_seen": 74656744, "step": 110755 }, { "epoch": 2.7058852270783964, "grad_norm": 0.012665455229580402, "learning_rate": 1.0308144458246172e-06, "loss": 0.0002, "num_input_tokens_seen": 74660136, "step": 110760 }, { "epoch": 2.7060073779102436, "grad_norm": 0.09554749727249146, "learning_rate": 1.0307292086096386e-06, "loss": 0.0001, "num_input_tokens_seen": 74663272, "step": 110765 }, { "epoch": 2.706129528742091, "grad_norm": 0.0033516192343086004, "learning_rate": 1.0306439711711893e-06, "loss": 0.1075, "num_input_tokens_seen": 74666792, "step": 110770 }, { "epoch": 2.706251679573938, "grad_norm": 0.00686039961874485, "learning_rate": 1.0305587335098887e-06, "loss": 0.0689, "num_input_tokens_seen": 74670056, "step": 110775 }, { "epoch": 2.706373830405785, "grad_norm": 0.3009239733219147, "learning_rate": 1.0304734956263567e-06, "loss": 0.0003, "num_input_tokens_seen": 74673256, "step": 110780 }, { "epoch": 2.7064959812376324, "grad_norm": 0.13494005799293518, "learning_rate": 1.0303882575212132e-06, "loss": 0.0788, "num_input_tokens_seen": 74676584, "step": 110785 }, { "epoch": 2.7066181320694795, "grad_norm": 0.11872848868370056, "learning_rate": 1.0303030191950784e-06, "loss": 0.0001, "num_input_tokens_seen": 74680168, "step": 110790 }, { "epoch": 2.7067402829013263, "grad_norm": 0.05219002440571785, "learning_rate": 1.0302177806485715e-06, "loss": 0.0001, "num_input_tokens_seen": 74683496, "step": 110795 }, { "epoch": 2.706862433733174, "grad_norm": 0.005950891878455877, "learning_rate": 1.030132541882313e-06, "loss": 0.0943, "num_input_tokens_seen": 74686760, "step": 110800 }, { "epoch": 2.7069845845650207, "grad_norm": 0.012123181484639645, "learning_rate": 1.0300473028969225e-06, "loss": 0.0065, "num_input_tokens_seen": 74689960, "step": 110805 }, { "epoch": 2.7071067353968683, "grad_norm": 0.42338210344314575, "learning_rate": 1.02996206369302e-06, "loss": 0.0007, "num_input_tokens_seen": 74693160, "step": 110810 }, { "epoch": 2.707228886228715, "grad_norm": 0.008268994279205799, "learning_rate": 1.0298768242712253e-06, "loss": 0.0002, "num_input_tokens_seen": 74696552, "step": 110815 }, { "epoch": 2.7073510370605622, "grad_norm": 21.602508544921875, "learning_rate": 1.0297915846321583e-06, "loss": 0.0875, "num_input_tokens_seen": 74700008, "step": 110820 }, { "epoch": 2.7074731878924094, "grad_norm": 0.0032915703486651182, "learning_rate": 1.0297063447764387e-06, "loss": 0.1667, "num_input_tokens_seen": 74703016, "step": 110825 }, { "epoch": 2.7075953387242566, "grad_norm": 0.14885084331035614, "learning_rate": 1.0296211047046865e-06, "loss": 0.0567, "num_input_tokens_seen": 74706216, "step": 110830 }, { "epoch": 2.707717489556104, "grad_norm": 16.99559211730957, "learning_rate": 1.0295358644175222e-06, "loss": 0.0626, "num_input_tokens_seen": 74710056, "step": 110835 }, { "epoch": 2.707839640387951, "grad_norm": 0.10859328508377075, "learning_rate": 1.0294506239155647e-06, "loss": 0.001, "num_input_tokens_seen": 74713448, "step": 110840 }, { "epoch": 2.707961791219798, "grad_norm": 0.178997203707695, "learning_rate": 1.0293653831994345e-06, "loss": 0.0469, "num_input_tokens_seen": 74717032, "step": 110845 }, { "epoch": 2.7080839420516454, "grad_norm": 0.0051488084718585014, "learning_rate": 1.0292801422697512e-06, "loss": 0.0831, "num_input_tokens_seen": 74720424, "step": 110850 }, { "epoch": 2.7082060928834926, "grad_norm": 0.006985927931964397, "learning_rate": 1.029194901127135e-06, "loss": 0.0002, "num_input_tokens_seen": 74723496, "step": 110855 }, { "epoch": 2.7083282437153398, "grad_norm": 26.968042373657227, "learning_rate": 1.0291096597722054e-06, "loss": 0.0571, "num_input_tokens_seen": 74726888, "step": 110860 }, { "epoch": 2.708450394547187, "grad_norm": 0.06846634298563004, "learning_rate": 1.0290244182055828e-06, "loss": 0.0003, "num_input_tokens_seen": 74730216, "step": 110865 }, { "epoch": 2.708572545379034, "grad_norm": 0.29106584191322327, "learning_rate": 1.0289391764278868e-06, "loss": 0.0573, "num_input_tokens_seen": 74733224, "step": 110870 }, { "epoch": 2.7086946962108813, "grad_norm": 0.07840101420879364, "learning_rate": 1.0288539344397371e-06, "loss": 0.0729, "num_input_tokens_seen": 74736360, "step": 110875 }, { "epoch": 2.7088168470427285, "grad_norm": 0.07999832928180695, "learning_rate": 1.028768692241754e-06, "loss": 0.0665, "num_input_tokens_seen": 74739752, "step": 110880 }, { "epoch": 2.7089389978745757, "grad_norm": 0.018879253417253494, "learning_rate": 1.028683449834557e-06, "loss": 0.0434, "num_input_tokens_seen": 74742888, "step": 110885 }, { "epoch": 2.7090611487064225, "grad_norm": 0.1324795037508011, "learning_rate": 1.0285982072187665e-06, "loss": 0.0005, "num_input_tokens_seen": 74746024, "step": 110890 }, { "epoch": 2.70918329953827, "grad_norm": 0.09683836251497269, "learning_rate": 1.028512964395002e-06, "loss": 0.0003, "num_input_tokens_seen": 74749928, "step": 110895 }, { "epoch": 2.709305450370117, "grad_norm": 0.014940385706722736, "learning_rate": 1.0284277213638837e-06, "loss": 0.049, "num_input_tokens_seen": 74753192, "step": 110900 }, { "epoch": 2.709427601201964, "grad_norm": 0.01176470797508955, "learning_rate": 1.0283424781260312e-06, "loss": 0.035, "num_input_tokens_seen": 74756264, "step": 110905 }, { "epoch": 2.709549752033811, "grad_norm": 0.03302425146102905, "learning_rate": 1.028257234682065e-06, "loss": 0.0002, "num_input_tokens_seen": 74759272, "step": 110910 }, { "epoch": 2.7096719028656584, "grad_norm": 28.31501007080078, "learning_rate": 1.028171991032604e-06, "loss": 0.072, "num_input_tokens_seen": 74762728, "step": 110915 }, { "epoch": 2.7097940536975056, "grad_norm": 0.01271035149693489, "learning_rate": 1.0280867471782691e-06, "loss": 0.0009, "num_input_tokens_seen": 74765864, "step": 110920 }, { "epoch": 2.709916204529353, "grad_norm": 0.0440848134458065, "learning_rate": 1.0280015031196796e-06, "loss": 0.0694, "num_input_tokens_seen": 74769064, "step": 110925 }, { "epoch": 2.7100383553612, "grad_norm": 23.72658920288086, "learning_rate": 1.0279162588574557e-06, "loss": 0.0984, "num_input_tokens_seen": 74772136, "step": 110930 }, { "epoch": 2.710160506193047, "grad_norm": 19.42964744567871, "learning_rate": 1.0278310143922173e-06, "loss": 0.1046, "num_input_tokens_seen": 74775784, "step": 110935 }, { "epoch": 2.7102826570248943, "grad_norm": 0.3410216271877289, "learning_rate": 1.0277457697245842e-06, "loss": 0.0369, "num_input_tokens_seen": 74779304, "step": 110940 }, { "epoch": 2.7104048078567415, "grad_norm": 0.17364412546157837, "learning_rate": 1.0276605248551764e-06, "loss": 0.0423, "num_input_tokens_seen": 74782440, "step": 110945 }, { "epoch": 2.7105269586885887, "grad_norm": 0.02989744395017624, "learning_rate": 1.0275752797846137e-06, "loss": 0.053, "num_input_tokens_seen": 74785448, "step": 110950 }, { "epoch": 2.710649109520436, "grad_norm": 0.051603052765131, "learning_rate": 1.0274900345135167e-06, "loss": 0.0615, "num_input_tokens_seen": 74788584, "step": 110955 }, { "epoch": 2.710771260352283, "grad_norm": 49.23311233520508, "learning_rate": 1.0274047890425043e-06, "loss": 0.0954, "num_input_tokens_seen": 74791656, "step": 110960 }, { "epoch": 2.7108934111841303, "grad_norm": 0.027312999591231346, "learning_rate": 1.027319543372197e-06, "loss": 0.0367, "num_input_tokens_seen": 74794792, "step": 110965 }, { "epoch": 2.7110155620159775, "grad_norm": 0.04595275968313217, "learning_rate": 1.0272342975032147e-06, "loss": 0.0134, "num_input_tokens_seen": 74797928, "step": 110970 }, { "epoch": 2.7111377128478242, "grad_norm": 0.16071614623069763, "learning_rate": 1.0271490514361771e-06, "loss": 0.0001, "num_input_tokens_seen": 74801448, "step": 110975 }, { "epoch": 2.711259863679672, "grad_norm": 1.5390660762786865, "learning_rate": 1.0270638051717041e-06, "loss": 0.0006, "num_input_tokens_seen": 74804904, "step": 110980 }, { "epoch": 2.7113820145115186, "grad_norm": 0.0822259932756424, "learning_rate": 1.0269785587104163e-06, "loss": 0.0006, "num_input_tokens_seen": 74808616, "step": 110985 }, { "epoch": 2.7115041653433662, "grad_norm": 0.5313860774040222, "learning_rate": 1.0268933120529332e-06, "loss": 0.0005, "num_input_tokens_seen": 74811752, "step": 110990 }, { "epoch": 2.711626316175213, "grad_norm": 0.19341067969799042, "learning_rate": 1.0268080651998744e-06, "loss": 0.0923, "num_input_tokens_seen": 74814952, "step": 110995 }, { "epoch": 2.71174846700706, "grad_norm": 0.0634603202342987, "learning_rate": 1.0267228181518601e-06, "loss": 0.1456, "num_input_tokens_seen": 74818280, "step": 111000 }, { "epoch": 2.7118706178389074, "grad_norm": 0.1839318722486496, "learning_rate": 1.0266375709095103e-06, "loss": 0.0662, "num_input_tokens_seen": 74821352, "step": 111005 }, { "epoch": 2.7119927686707546, "grad_norm": 16.85820960998535, "learning_rate": 1.0265523234734453e-06, "loss": 0.0913, "num_input_tokens_seen": 74824936, "step": 111010 }, { "epoch": 2.7121149195026018, "grad_norm": 0.048869337886571884, "learning_rate": 1.0264670758442843e-06, "loss": 0.0007, "num_input_tokens_seen": 74828968, "step": 111015 }, { "epoch": 2.712237070334449, "grad_norm": 31.205768585205078, "learning_rate": 1.0263818280226477e-06, "loss": 0.0701, "num_input_tokens_seen": 74832360, "step": 111020 }, { "epoch": 2.712359221166296, "grad_norm": 0.005597368814051151, "learning_rate": 1.0262965800091553e-06, "loss": 0.0004, "num_input_tokens_seen": 74836008, "step": 111025 }, { "epoch": 2.7124813719981433, "grad_norm": 0.2927609384059906, "learning_rate": 1.0262113318044271e-06, "loss": 0.0006, "num_input_tokens_seen": 74839272, "step": 111030 }, { "epoch": 2.7126035228299905, "grad_norm": 0.04279811680316925, "learning_rate": 1.0261260834090833e-06, "loss": 0.001, "num_input_tokens_seen": 74842280, "step": 111035 }, { "epoch": 2.7127256736618377, "grad_norm": 65.13777923583984, "learning_rate": 1.0260408348237432e-06, "loss": 0.0838, "num_input_tokens_seen": 74845608, "step": 111040 }, { "epoch": 2.712847824493685, "grad_norm": 51.76634979248047, "learning_rate": 1.0259555860490272e-06, "loss": 0.1082, "num_input_tokens_seen": 74848936, "step": 111045 }, { "epoch": 2.712969975325532, "grad_norm": 0.09504994004964828, "learning_rate": 1.0258703370855553e-06, "loss": 0.0002, "num_input_tokens_seen": 74852008, "step": 111050 }, { "epoch": 2.7130921261573793, "grad_norm": 0.13983154296875, "learning_rate": 1.0257850879339474e-06, "loss": 0.0004, "num_input_tokens_seen": 74855336, "step": 111055 }, { "epoch": 2.7132142769892265, "grad_norm": 0.043822336941957474, "learning_rate": 1.0256998385948234e-06, "loss": 0.0002, "num_input_tokens_seen": 74858408, "step": 111060 }, { "epoch": 2.7133364278210736, "grad_norm": 0.1019313782453537, "learning_rate": 1.0256145890688035e-06, "loss": 0.0816, "num_input_tokens_seen": 74862056, "step": 111065 }, { "epoch": 2.7134585786529204, "grad_norm": 42.160888671875, "learning_rate": 1.0255293393565073e-06, "loss": 0.0331, "num_input_tokens_seen": 74865512, "step": 111070 }, { "epoch": 2.713580729484768, "grad_norm": 0.030252449214458466, "learning_rate": 1.0254440894585543e-06, "loss": 0.0002, "num_input_tokens_seen": 74869096, "step": 111075 }, { "epoch": 2.7137028803166148, "grad_norm": 0.43322911858558655, "learning_rate": 1.0253588393755653e-06, "loss": 0.085, "num_input_tokens_seen": 74872296, "step": 111080 }, { "epoch": 2.713825031148462, "grad_norm": 0.06952600926160812, "learning_rate": 1.0252735891081604e-06, "loss": 0.0866, "num_input_tokens_seen": 74875496, "step": 111085 }, { "epoch": 2.713947181980309, "grad_norm": 0.04150708019733429, "learning_rate": 1.025188338656959e-06, "loss": 0.0001, "num_input_tokens_seen": 74878696, "step": 111090 }, { "epoch": 2.7140693328121563, "grad_norm": 0.015112698078155518, "learning_rate": 1.025103088022581e-06, "loss": 0.0315, "num_input_tokens_seen": 74881768, "step": 111095 }, { "epoch": 2.7141914836440035, "grad_norm": 0.05250068008899689, "learning_rate": 1.0250178372056468e-06, "loss": 0.0004, "num_input_tokens_seen": 74884904, "step": 111100 }, { "epoch": 2.7143136344758507, "grad_norm": 0.009632604196667671, "learning_rate": 1.024932586206776e-06, "loss": 0.0004, "num_input_tokens_seen": 74888424, "step": 111105 }, { "epoch": 2.714435785307698, "grad_norm": 25.12550163269043, "learning_rate": 1.0248473350265892e-06, "loss": 0.0539, "num_input_tokens_seen": 74891432, "step": 111110 }, { "epoch": 2.714557936139545, "grad_norm": 0.08560064435005188, "learning_rate": 1.0247620836657053e-06, "loss": 0.0001, "num_input_tokens_seen": 74894888, "step": 111115 }, { "epoch": 2.7146800869713923, "grad_norm": 1.7176668643951416, "learning_rate": 1.0246768321247452e-06, "loss": 0.0004, "num_input_tokens_seen": 74898088, "step": 111120 }, { "epoch": 2.7148022378032395, "grad_norm": 0.02066458947956562, "learning_rate": 1.0245915804043283e-06, "loss": 0.0001, "num_input_tokens_seen": 74901160, "step": 111125 }, { "epoch": 2.7149243886350867, "grad_norm": 39.92478942871094, "learning_rate": 1.0245063285050751e-06, "loss": 0.1044, "num_input_tokens_seen": 74904936, "step": 111130 }, { "epoch": 2.715046539466934, "grad_norm": 0.0022745642345398664, "learning_rate": 1.024421076427605e-06, "loss": 0.0001, "num_input_tokens_seen": 74908392, "step": 111135 }, { "epoch": 2.715168690298781, "grad_norm": 0.2585334777832031, "learning_rate": 1.0243358241725383e-06, "loss": 0.0003, "num_input_tokens_seen": 74911400, "step": 111140 }, { "epoch": 2.7152908411306282, "grad_norm": 0.008308542892336845, "learning_rate": 1.0242505717404953e-06, "loss": 0.0717, "num_input_tokens_seen": 74914792, "step": 111145 }, { "epoch": 2.7154129919624754, "grad_norm": 0.09557440131902695, "learning_rate": 1.0241653191320952e-06, "loss": 0.0001, "num_input_tokens_seen": 74918056, "step": 111150 }, { "epoch": 2.715535142794322, "grad_norm": 0.023261088877916336, "learning_rate": 1.0240800663479586e-06, "loss": 0.0002, "num_input_tokens_seen": 74921064, "step": 111155 }, { "epoch": 2.71565729362617, "grad_norm": 0.013574914075434208, "learning_rate": 1.0239948133887053e-06, "loss": 0.0001, "num_input_tokens_seen": 74924200, "step": 111160 }, { "epoch": 2.7157794444580166, "grad_norm": 0.06006774678826332, "learning_rate": 1.0239095602549552e-06, "loss": 0.0001, "num_input_tokens_seen": 74927528, "step": 111165 }, { "epoch": 2.715901595289864, "grad_norm": 0.09563129395246506, "learning_rate": 1.0238243069473283e-06, "loss": 0.0002, "num_input_tokens_seen": 74930984, "step": 111170 }, { "epoch": 2.716023746121711, "grad_norm": 0.008038848638534546, "learning_rate": 1.0237390534664447e-06, "loss": 0.1227, "num_input_tokens_seen": 74934184, "step": 111175 }, { "epoch": 2.716145896953558, "grad_norm": 0.1147889792919159, "learning_rate": 1.0236537998129245e-06, "loss": 0.0002, "num_input_tokens_seen": 74937768, "step": 111180 }, { "epoch": 2.7162680477854053, "grad_norm": 0.22480729222297668, "learning_rate": 1.0235685459873873e-06, "loss": 0.0001, "num_input_tokens_seen": 74940968, "step": 111185 }, { "epoch": 2.7163901986172525, "grad_norm": 11.220277786254883, "learning_rate": 1.0234832919904533e-06, "loss": 0.1094, "num_input_tokens_seen": 74944104, "step": 111190 }, { "epoch": 2.7165123494490997, "grad_norm": 0.28037410974502563, "learning_rate": 1.0233980378227426e-06, "loss": 0.0005, "num_input_tokens_seen": 74947112, "step": 111195 }, { "epoch": 2.716634500280947, "grad_norm": 0.01165709923952818, "learning_rate": 1.0233127834848744e-06, "loss": 0.0515, "num_input_tokens_seen": 74950568, "step": 111200 }, { "epoch": 2.716756651112794, "grad_norm": 0.07802575081586838, "learning_rate": 1.0232275289774702e-06, "loss": 0.0549, "num_input_tokens_seen": 74953704, "step": 111205 }, { "epoch": 2.7168788019446413, "grad_norm": 0.23846709728240967, "learning_rate": 1.0231422743011488e-06, "loss": 0.0341, "num_input_tokens_seen": 74957032, "step": 111210 }, { "epoch": 2.7170009527764885, "grad_norm": 0.03532462939620018, "learning_rate": 1.0230570194565307e-06, "loss": 0.0003, "num_input_tokens_seen": 74960360, "step": 111215 }, { "epoch": 2.7171231036083356, "grad_norm": 0.06964351236820221, "learning_rate": 1.022971764444236e-06, "loss": 0.0002, "num_input_tokens_seen": 74963496, "step": 111220 }, { "epoch": 2.717245254440183, "grad_norm": 15.483033180236816, "learning_rate": 1.0228865092648842e-06, "loss": 0.0649, "num_input_tokens_seen": 74966568, "step": 111225 }, { "epoch": 2.71736740527203, "grad_norm": 25.231521606445312, "learning_rate": 1.022801253919095e-06, "loss": 0.0351, "num_input_tokens_seen": 74969768, "step": 111230 }, { "epoch": 2.717489556103877, "grad_norm": 24.629430770874023, "learning_rate": 1.0227159984074895e-06, "loss": 0.0572, "num_input_tokens_seen": 74972968, "step": 111235 }, { "epoch": 2.717611706935724, "grad_norm": 0.027782179415225983, "learning_rate": 1.0226307427306873e-06, "loss": 0.001, "num_input_tokens_seen": 74976424, "step": 111240 }, { "epoch": 2.7177338577675716, "grad_norm": 0.02719302475452423, "learning_rate": 1.022545486889308e-06, "loss": 0.0011, "num_input_tokens_seen": 74979752, "step": 111245 }, { "epoch": 2.7178560085994183, "grad_norm": 0.0871863141655922, "learning_rate": 1.022460230883972e-06, "loss": 0.0011, "num_input_tokens_seen": 74983336, "step": 111250 }, { "epoch": 2.717978159431266, "grad_norm": 0.04725907742977142, "learning_rate": 1.022374974715299e-06, "loss": 0.0007, "num_input_tokens_seen": 74986920, "step": 111255 }, { "epoch": 2.7181003102631127, "grad_norm": 0.05942535400390625, "learning_rate": 1.022289718383909e-06, "loss": 0.0009, "num_input_tokens_seen": 74990120, "step": 111260 }, { "epoch": 2.71822246109496, "grad_norm": 13.261385917663574, "learning_rate": 1.0222044618904225e-06, "loss": 0.0554, "num_input_tokens_seen": 74993192, "step": 111265 }, { "epoch": 2.718344611926807, "grad_norm": 0.021577315405011177, "learning_rate": 1.0221192052354593e-06, "loss": 0.0469, "num_input_tokens_seen": 74996584, "step": 111270 }, { "epoch": 2.7184667627586543, "grad_norm": 31.583599090576172, "learning_rate": 1.0220339484196392e-06, "loss": 0.0437, "num_input_tokens_seen": 74999976, "step": 111275 }, { "epoch": 2.7185889135905015, "grad_norm": 0.07107774913311005, "learning_rate": 1.0219486914435823e-06, "loss": 0.0002, "num_input_tokens_seen": 75003240, "step": 111280 }, { "epoch": 2.7187110644223487, "grad_norm": 0.05268010124564171, "learning_rate": 1.0218634343079082e-06, "loss": 0.0002, "num_input_tokens_seen": 75006760, "step": 111285 }, { "epoch": 2.718833215254196, "grad_norm": 0.010458775795996189, "learning_rate": 1.0217781770132375e-06, "loss": 0.0001, "num_input_tokens_seen": 75010280, "step": 111290 }, { "epoch": 2.718955366086043, "grad_norm": 13.336898803710938, "learning_rate": 1.0216929195601903e-06, "loss": 0.0352, "num_input_tokens_seen": 75013864, "step": 111295 }, { "epoch": 2.7190775169178902, "grad_norm": 0.6519309282302856, "learning_rate": 1.0216076619493861e-06, "loss": 0.0565, "num_input_tokens_seen": 75016872, "step": 111300 }, { "epoch": 2.7191996677497374, "grad_norm": 0.7603404521942139, "learning_rate": 1.0215224041814455e-06, "loss": 0.0642, "num_input_tokens_seen": 75019944, "step": 111305 }, { "epoch": 2.7193218185815846, "grad_norm": 0.1283659040927887, "learning_rate": 1.0214371462569878e-06, "loss": 0.0002, "num_input_tokens_seen": 75023144, "step": 111310 }, { "epoch": 2.719443969413432, "grad_norm": 0.10316579043865204, "learning_rate": 1.0213518881766337e-06, "loss": 0.0376, "num_input_tokens_seen": 75026408, "step": 111315 }, { "epoch": 2.719566120245279, "grad_norm": 0.07398030906915665, "learning_rate": 1.0212666299410026e-06, "loss": 0.095, "num_input_tokens_seen": 75029928, "step": 111320 }, { "epoch": 2.719688271077126, "grad_norm": 0.006797238253057003, "learning_rate": 1.0211813715507151e-06, "loss": 0.0001, "num_input_tokens_seen": 75033192, "step": 111325 }, { "epoch": 2.7198104219089734, "grad_norm": 0.008504372090101242, "learning_rate": 1.0210961130063911e-06, "loss": 0.0002, "num_input_tokens_seen": 75036328, "step": 111330 }, { "epoch": 2.71993257274082, "grad_norm": 0.014152489602565765, "learning_rate": 1.0210108543086502e-06, "loss": 0.0006, "num_input_tokens_seen": 75039400, "step": 111335 }, { "epoch": 2.7200547235726678, "grad_norm": 0.16172784566879272, "learning_rate": 1.020925595458113e-06, "loss": 0.0005, "num_input_tokens_seen": 75042600, "step": 111340 }, { "epoch": 2.7201768744045145, "grad_norm": 0.00845453143119812, "learning_rate": 1.020840336455399e-06, "loss": 0.0708, "num_input_tokens_seen": 75045992, "step": 111345 }, { "epoch": 2.720299025236362, "grad_norm": 0.055560726672410965, "learning_rate": 1.0207550773011285e-06, "loss": 0.0451, "num_input_tokens_seen": 75049256, "step": 111350 }, { "epoch": 2.720421176068209, "grad_norm": 49.54551315307617, "learning_rate": 1.0206698179959213e-06, "loss": 0.0575, "num_input_tokens_seen": 75052840, "step": 111355 }, { "epoch": 2.720543326900056, "grad_norm": 0.040758680552244186, "learning_rate": 1.0205845585403978e-06, "loss": 0.0001, "num_input_tokens_seen": 75056296, "step": 111360 }, { "epoch": 2.7206654777319033, "grad_norm": 0.060084953904151917, "learning_rate": 1.020499298935178e-06, "loss": 0.0007, "num_input_tokens_seen": 75059240, "step": 111365 }, { "epoch": 2.7207876285637504, "grad_norm": 0.08063948899507523, "learning_rate": 1.0204140391808818e-06, "loss": 0.0009, "num_input_tokens_seen": 75062760, "step": 111370 }, { "epoch": 2.7209097793955976, "grad_norm": 33.327274322509766, "learning_rate": 1.0203287792781293e-06, "loss": 0.0915, "num_input_tokens_seen": 75065832, "step": 111375 }, { "epoch": 2.721031930227445, "grad_norm": 0.31778210401535034, "learning_rate": 1.0202435192275404e-06, "loss": 0.049, "num_input_tokens_seen": 75068840, "step": 111380 }, { "epoch": 2.721154081059292, "grad_norm": 25.272445678710938, "learning_rate": 1.0201582590297345e-06, "loss": 0.0844, "num_input_tokens_seen": 75071912, "step": 111385 }, { "epoch": 2.721276231891139, "grad_norm": 0.030040111392736435, "learning_rate": 1.0200729986853332e-06, "loss": 0.0358, "num_input_tokens_seen": 75075112, "step": 111390 }, { "epoch": 2.7213983827229864, "grad_norm": 0.011033943854272366, "learning_rate": 1.0199877381949552e-06, "loss": 0.1127, "num_input_tokens_seen": 75078824, "step": 111395 }, { "epoch": 2.7215205335548336, "grad_norm": 0.014756470918655396, "learning_rate": 1.019902477559221e-06, "loss": 0.0515, "num_input_tokens_seen": 75082088, "step": 111400 }, { "epoch": 2.7216426843866808, "grad_norm": 1.1041114330291748, "learning_rate": 1.0198172167787508e-06, "loss": 0.001, "num_input_tokens_seen": 75085416, "step": 111405 }, { "epoch": 2.721764835218528, "grad_norm": 0.1292664259672165, "learning_rate": 1.0197319558541645e-06, "loss": 0.0569, "num_input_tokens_seen": 75088424, "step": 111410 }, { "epoch": 2.721886986050375, "grad_norm": 0.22659969329833984, "learning_rate": 1.0196466947860819e-06, "loss": 0.0385, "num_input_tokens_seen": 75091624, "step": 111415 }, { "epoch": 2.722009136882222, "grad_norm": 0.16476193070411682, "learning_rate": 1.0195614335751234e-06, "loss": 0.0003, "num_input_tokens_seen": 75095528, "step": 111420 }, { "epoch": 2.7221312877140695, "grad_norm": 0.02349873073399067, "learning_rate": 1.019476172221909e-06, "loss": 0.0823, "num_input_tokens_seen": 75098984, "step": 111425 }, { "epoch": 2.7222534385459163, "grad_norm": 0.13670580089092255, "learning_rate": 1.0193909107270583e-06, "loss": 0.069, "num_input_tokens_seen": 75102440, "step": 111430 }, { "epoch": 2.722375589377764, "grad_norm": 0.012231250293552876, "learning_rate": 1.019305649091192e-06, "loss": 0.0002, "num_input_tokens_seen": 75105384, "step": 111435 }, { "epoch": 2.7224977402096107, "grad_norm": 0.16043393313884735, "learning_rate": 1.0192203873149299e-06, "loss": 0.0002, "num_input_tokens_seen": 75108584, "step": 111440 }, { "epoch": 2.722619891041458, "grad_norm": 0.04114136844873428, "learning_rate": 1.0191351253988915e-06, "loss": 0.0537, "num_input_tokens_seen": 75111848, "step": 111445 }, { "epoch": 2.722742041873305, "grad_norm": 0.01634533330798149, "learning_rate": 1.0190498633436976e-06, "loss": 0.0003, "num_input_tokens_seen": 75114984, "step": 111450 }, { "epoch": 2.7228641927051522, "grad_norm": 526.8831176757812, "learning_rate": 1.018964601149968e-06, "loss": 0.036, "num_input_tokens_seen": 75118376, "step": 111455 }, { "epoch": 2.7229863435369994, "grad_norm": 0.10897945612668991, "learning_rate": 1.0188793388183229e-06, "loss": 0.0004, "num_input_tokens_seen": 75121768, "step": 111460 }, { "epoch": 2.7231084943688466, "grad_norm": 0.11047311872243881, "learning_rate": 1.0187940763493818e-06, "loss": 0.0001, "num_input_tokens_seen": 75125288, "step": 111465 }, { "epoch": 2.723230645200694, "grad_norm": 0.4363507330417633, "learning_rate": 1.0187088137437652e-06, "loss": 0.0005, "num_input_tokens_seen": 75128808, "step": 111470 }, { "epoch": 2.723352796032541, "grad_norm": 0.07578103244304657, "learning_rate": 1.0186235510020933e-06, "loss": 0.0002, "num_input_tokens_seen": 75132136, "step": 111475 }, { "epoch": 2.723474946864388, "grad_norm": 0.014645537361502647, "learning_rate": 1.0185382881249857e-06, "loss": 0.0686, "num_input_tokens_seen": 75135656, "step": 111480 }, { "epoch": 2.7235970976962354, "grad_norm": 32.36290740966797, "learning_rate": 1.0184530251130628e-06, "loss": 0.0898, "num_input_tokens_seen": 75139240, "step": 111485 }, { "epoch": 2.7237192485280826, "grad_norm": 0.09573503583669662, "learning_rate": 1.0183677619669446e-06, "loss": 0.0002, "num_input_tokens_seen": 75142568, "step": 111490 }, { "epoch": 2.7238413993599297, "grad_norm": 0.11125845462083817, "learning_rate": 1.0182824986872509e-06, "loss": 0.0829, "num_input_tokens_seen": 75145960, "step": 111495 }, { "epoch": 2.723963550191777, "grad_norm": 13.607086181640625, "learning_rate": 1.0181972352746022e-06, "loss": 0.1665, "num_input_tokens_seen": 75149160, "step": 111500 }, { "epoch": 2.724085701023624, "grad_norm": 0.5846430063247681, "learning_rate": 1.018111971729618e-06, "loss": 0.0468, "num_input_tokens_seen": 75152232, "step": 111505 }, { "epoch": 2.7242078518554713, "grad_norm": 0.24971908330917358, "learning_rate": 1.0180267080529187e-06, "loss": 0.0844, "num_input_tokens_seen": 75155240, "step": 111510 }, { "epoch": 2.724330002687318, "grad_norm": 0.19749361276626587, "learning_rate": 1.0179414442451244e-06, "loss": 0.0002, "num_input_tokens_seen": 75158376, "step": 111515 }, { "epoch": 2.7244521535191657, "grad_norm": 0.09108477085828781, "learning_rate": 1.0178561803068554e-06, "loss": 0.0005, "num_input_tokens_seen": 75161512, "step": 111520 }, { "epoch": 2.7245743043510124, "grad_norm": 0.1065296083688736, "learning_rate": 1.0177709162387311e-06, "loss": 0.1018, "num_input_tokens_seen": 75165096, "step": 111525 }, { "epoch": 2.7246964551828596, "grad_norm": 0.012275001965463161, "learning_rate": 1.0176856520413723e-06, "loss": 0.1232, "num_input_tokens_seen": 75168680, "step": 111530 }, { "epoch": 2.724818606014707, "grad_norm": 1.0489786863327026, "learning_rate": 1.0176003877153986e-06, "loss": 0.0565, "num_input_tokens_seen": 75172072, "step": 111535 }, { "epoch": 2.724940756846554, "grad_norm": 0.12052971124649048, "learning_rate": 1.0175151232614296e-06, "loss": 0.0412, "num_input_tokens_seen": 75175656, "step": 111540 }, { "epoch": 2.725062907678401, "grad_norm": 0.078212209045887, "learning_rate": 1.0174298586800862e-06, "loss": 0.0868, "num_input_tokens_seen": 75179560, "step": 111545 }, { "epoch": 2.7251850585102484, "grad_norm": 13.409255981445312, "learning_rate": 1.0173445939719882e-06, "loss": 0.0638, "num_input_tokens_seen": 75183016, "step": 111550 }, { "epoch": 2.7253072093420956, "grad_norm": 0.19932065904140472, "learning_rate": 1.0172593291377559e-06, "loss": 0.0006, "num_input_tokens_seen": 75186664, "step": 111555 }, { "epoch": 2.7254293601739428, "grad_norm": 11.157979011535645, "learning_rate": 1.017174064178009e-06, "loss": 0.042, "num_input_tokens_seen": 75189736, "step": 111560 }, { "epoch": 2.72555151100579, "grad_norm": 16.223159790039062, "learning_rate": 1.0170887990933675e-06, "loss": 0.0351, "num_input_tokens_seen": 75193000, "step": 111565 }, { "epoch": 2.725673661837637, "grad_norm": 0.085118867456913, "learning_rate": 1.0170035338844514e-06, "loss": 0.0005, "num_input_tokens_seen": 75196520, "step": 111570 }, { "epoch": 2.7257958126694843, "grad_norm": 0.029599210247397423, "learning_rate": 1.0169182685518817e-06, "loss": 0.0004, "num_input_tokens_seen": 75199976, "step": 111575 }, { "epoch": 2.7259179635013315, "grad_norm": 0.7240051627159119, "learning_rate": 1.0168330030962775e-06, "loss": 0.029, "num_input_tokens_seen": 75203176, "step": 111580 }, { "epoch": 2.7260401143331787, "grad_norm": 0.00847703404724598, "learning_rate": 1.0167477375182592e-06, "loss": 0.0004, "num_input_tokens_seen": 75206632, "step": 111585 }, { "epoch": 2.726162265165026, "grad_norm": 15.220355033874512, "learning_rate": 1.0166624718184467e-06, "loss": 0.0851, "num_input_tokens_seen": 75209832, "step": 111590 }, { "epoch": 2.726284415996873, "grad_norm": 0.044437870383262634, "learning_rate": 1.0165772059974604e-06, "loss": 0.0901, "num_input_tokens_seen": 75213032, "step": 111595 }, { "epoch": 2.72640656682872, "grad_norm": 0.021976593881845474, "learning_rate": 1.0164919400559202e-06, "loss": 0.0003, "num_input_tokens_seen": 75216232, "step": 111600 }, { "epoch": 2.7265287176605675, "grad_norm": 0.003190957475453615, "learning_rate": 1.016406673994446e-06, "loss": 0.0001, "num_input_tokens_seen": 75219560, "step": 111605 }, { "epoch": 2.7266508684924142, "grad_norm": 0.08274943381547928, "learning_rate": 1.016321407813658e-06, "loss": 0.0337, "num_input_tokens_seen": 75223720, "step": 111610 }, { "epoch": 2.726773019324262, "grad_norm": 0.05247628316283226, "learning_rate": 1.0162361415141766e-06, "loss": 0.0276, "num_input_tokens_seen": 75227560, "step": 111615 }, { "epoch": 2.7268951701561086, "grad_norm": 0.09594687819480896, "learning_rate": 1.0161508750966214e-06, "loss": 0.044, "num_input_tokens_seen": 75230888, "step": 111620 }, { "epoch": 2.727017320987956, "grad_norm": 0.02336997166275978, "learning_rate": 1.0160656085616128e-06, "loss": 0.0003, "num_input_tokens_seen": 75234216, "step": 111625 }, { "epoch": 2.727139471819803, "grad_norm": 310.22930908203125, "learning_rate": 1.0159803419097708e-06, "loss": 0.0228, "num_input_tokens_seen": 75237608, "step": 111630 }, { "epoch": 2.72726162265165, "grad_norm": 0.20268277823925018, "learning_rate": 1.0158950751417155e-06, "loss": 0.0006, "num_input_tokens_seen": 75240488, "step": 111635 }, { "epoch": 2.7273837734834974, "grad_norm": 0.039220090955495834, "learning_rate": 1.0158098082580669e-06, "loss": 0.0002, "num_input_tokens_seen": 75243816, "step": 111640 }, { "epoch": 2.7275059243153446, "grad_norm": 16.948116302490234, "learning_rate": 1.015724541259445e-06, "loss": 0.0294, "num_input_tokens_seen": 75246952, "step": 111645 }, { "epoch": 2.7276280751471917, "grad_norm": 0.030434172600507736, "learning_rate": 1.01563927414647e-06, "loss": 0.0006, "num_input_tokens_seen": 75250216, "step": 111650 }, { "epoch": 2.727750225979039, "grad_norm": 55.45297622680664, "learning_rate": 1.0155540069197623e-06, "loss": 0.0515, "num_input_tokens_seen": 75253480, "step": 111655 }, { "epoch": 2.727872376810886, "grad_norm": 0.044720862060785294, "learning_rate": 1.0154687395799415e-06, "loss": 0.0007, "num_input_tokens_seen": 75256936, "step": 111660 }, { "epoch": 2.7279945276427333, "grad_norm": 0.01326954085379839, "learning_rate": 1.0153834721276276e-06, "loss": 0.0002, "num_input_tokens_seen": 75260392, "step": 111665 }, { "epoch": 2.7281166784745805, "grad_norm": 0.005129373632371426, "learning_rate": 1.0152982045634411e-06, "loss": 0.054, "num_input_tokens_seen": 75263784, "step": 111670 }, { "epoch": 2.7282388293064277, "grad_norm": 0.0035448065027594566, "learning_rate": 1.015212936888002e-06, "loss": 0.0001, "num_input_tokens_seen": 75266664, "step": 111675 }, { "epoch": 2.728360980138275, "grad_norm": 0.12941040098667145, "learning_rate": 1.0151276691019304e-06, "loss": 0.0001, "num_input_tokens_seen": 75270504, "step": 111680 }, { "epoch": 2.728483130970122, "grad_norm": 0.006792883854359388, "learning_rate": 1.0150424012058466e-06, "loss": 0.1038, "num_input_tokens_seen": 75274280, "step": 111685 }, { "epoch": 2.7286052818019693, "grad_norm": 0.019393935799598694, "learning_rate": 1.0149571332003702e-06, "loss": 0.001, "num_input_tokens_seen": 75277672, "step": 111690 }, { "epoch": 2.728727432633816, "grad_norm": 0.04781755059957504, "learning_rate": 1.014871865086121e-06, "loss": 0.068, "num_input_tokens_seen": 75281000, "step": 111695 }, { "epoch": 2.7288495834656636, "grad_norm": 0.06856974959373474, "learning_rate": 1.01478659686372e-06, "loss": 0.0001, "num_input_tokens_seen": 75284712, "step": 111700 }, { "epoch": 2.7289717342975104, "grad_norm": 0.12601056694984436, "learning_rate": 1.0147013285337868e-06, "loss": 0.0002, "num_input_tokens_seen": 75288168, "step": 111705 }, { "epoch": 2.7290938851293576, "grad_norm": 0.5254818797111511, "learning_rate": 1.0146160600969419e-06, "loss": 0.0541, "num_input_tokens_seen": 75291432, "step": 111710 }, { "epoch": 2.7292160359612048, "grad_norm": 0.018003568053245544, "learning_rate": 1.0145307915538047e-06, "loss": 0.0503, "num_input_tokens_seen": 75294952, "step": 111715 }, { "epoch": 2.729338186793052, "grad_norm": 0.015305710025131702, "learning_rate": 1.014445522904996e-06, "loss": 0.002, "num_input_tokens_seen": 75298024, "step": 111720 }, { "epoch": 2.729460337624899, "grad_norm": 0.033240336924791336, "learning_rate": 1.014360254151135e-06, "loss": 0.0644, "num_input_tokens_seen": 75301544, "step": 111725 }, { "epoch": 2.7295824884567463, "grad_norm": 0.017280491068959236, "learning_rate": 1.014274985292843e-06, "loss": 0.0001, "num_input_tokens_seen": 75305128, "step": 111730 }, { "epoch": 2.7297046392885935, "grad_norm": 0.0738111361861229, "learning_rate": 1.0141897163307394e-06, "loss": 0.0358, "num_input_tokens_seen": 75308520, "step": 111735 }, { "epoch": 2.7298267901204407, "grad_norm": 0.0017817127518355846, "learning_rate": 1.0141044472654441e-06, "loss": 0.0006, "num_input_tokens_seen": 75311656, "step": 111740 }, { "epoch": 2.729948940952288, "grad_norm": 0.43171536922454834, "learning_rate": 1.0140191780975776e-06, "loss": 0.0003, "num_input_tokens_seen": 75314920, "step": 111745 }, { "epoch": 2.730071091784135, "grad_norm": 0.5606056451797485, "learning_rate": 1.0139339088277599e-06, "loss": 0.0012, "num_input_tokens_seen": 75318504, "step": 111750 }, { "epoch": 2.7301932426159823, "grad_norm": 19.605661392211914, "learning_rate": 1.013848639456611e-06, "loss": 0.0693, "num_input_tokens_seen": 75321640, "step": 111755 }, { "epoch": 2.7303153934478295, "grad_norm": 50.48213195800781, "learning_rate": 1.0137633699847507e-06, "loss": 0.0515, "num_input_tokens_seen": 75325288, "step": 111760 }, { "epoch": 2.7304375442796767, "grad_norm": 0.007247891277074814, "learning_rate": 1.0136781004128e-06, "loss": 0.0002, "num_input_tokens_seen": 75328808, "step": 111765 }, { "epoch": 2.730559695111524, "grad_norm": 21.53213882446289, "learning_rate": 1.0135928307413785e-06, "loss": 0.0646, "num_input_tokens_seen": 75332392, "step": 111770 }, { "epoch": 2.730681845943371, "grad_norm": 0.0011175754480063915, "learning_rate": 1.013507560971106e-06, "loss": 0.0001, "num_input_tokens_seen": 75336104, "step": 111775 }, { "epoch": 2.730803996775218, "grad_norm": 0.04286136478185654, "learning_rate": 1.013422291102603e-06, "loss": 0.0465, "num_input_tokens_seen": 75339368, "step": 111780 }, { "epoch": 2.7309261476070654, "grad_norm": 0.474054753780365, "learning_rate": 1.0133370211364892e-06, "loss": 0.044, "num_input_tokens_seen": 75342568, "step": 111785 }, { "epoch": 2.731048298438912, "grad_norm": 4.169500350952148, "learning_rate": 1.0132517510733853e-06, "loss": 0.0007, "num_input_tokens_seen": 75345704, "step": 111790 }, { "epoch": 2.73117044927076, "grad_norm": 0.29191893339157104, "learning_rate": 1.0131664809139111e-06, "loss": 0.1245, "num_input_tokens_seen": 75349288, "step": 111795 }, { "epoch": 2.7312926001026065, "grad_norm": 0.093380406498909, "learning_rate": 1.0130812106586868e-06, "loss": 0.0002, "num_input_tokens_seen": 75352552, "step": 111800 }, { "epoch": 2.7314147509344537, "grad_norm": 0.030852509662508965, "learning_rate": 1.012995940308332e-06, "loss": 0.0465, "num_input_tokens_seen": 75355816, "step": 111805 }, { "epoch": 2.731536901766301, "grad_norm": 0.010006435215473175, "learning_rate": 1.0129106698634676e-06, "loss": 0.0003, "num_input_tokens_seen": 75359016, "step": 111810 }, { "epoch": 2.731659052598148, "grad_norm": 30.960412979125977, "learning_rate": 1.0128253993247132e-06, "loss": 0.0834, "num_input_tokens_seen": 75362216, "step": 111815 }, { "epoch": 2.7317812034299953, "grad_norm": 0.02463303506374359, "learning_rate": 1.012740128692689e-06, "loss": 0.0003, "num_input_tokens_seen": 75365672, "step": 111820 }, { "epoch": 2.7319033542618425, "grad_norm": 0.014886599965393543, "learning_rate": 1.0126548579680154e-06, "loss": 0.1318, "num_input_tokens_seen": 75368936, "step": 111825 }, { "epoch": 2.7320255050936897, "grad_norm": 0.09876159578561783, "learning_rate": 1.012569587151312e-06, "loss": 0.0329, "num_input_tokens_seen": 75372328, "step": 111830 }, { "epoch": 2.732147655925537, "grad_norm": 0.0017728406237438321, "learning_rate": 1.0124843162431994e-06, "loss": 0.0027, "num_input_tokens_seen": 75375528, "step": 111835 }, { "epoch": 2.732269806757384, "grad_norm": 0.006446607410907745, "learning_rate": 1.0123990452442977e-06, "loss": 0.0477, "num_input_tokens_seen": 75378472, "step": 111840 }, { "epoch": 2.7323919575892313, "grad_norm": 14.4539213180542, "learning_rate": 1.0123137741552264e-06, "loss": 0.0479, "num_input_tokens_seen": 75381736, "step": 111845 }, { "epoch": 2.7325141084210784, "grad_norm": 0.04016166925430298, "learning_rate": 1.012228502976606e-06, "loss": 0.0001, "num_input_tokens_seen": 75385128, "step": 111850 }, { "epoch": 2.7326362592529256, "grad_norm": 0.008136886171996593, "learning_rate": 1.0121432317090568e-06, "loss": 0.0001, "num_input_tokens_seen": 75388264, "step": 111855 }, { "epoch": 2.732758410084773, "grad_norm": 0.027124160900712013, "learning_rate": 1.0120579603531987e-06, "loss": 0.0334, "num_input_tokens_seen": 75391912, "step": 111860 }, { "epoch": 2.7328805609166196, "grad_norm": 0.07476188987493515, "learning_rate": 1.0119726889096518e-06, "loss": 0.0002, "num_input_tokens_seen": 75395304, "step": 111865 }, { "epoch": 2.733002711748467, "grad_norm": 0.009432375431060791, "learning_rate": 1.0118874173790364e-06, "loss": 0.0001, "num_input_tokens_seen": 75398440, "step": 111870 }, { "epoch": 2.733124862580314, "grad_norm": 0.06053454801440239, "learning_rate": 1.0118021457619725e-06, "loss": 0.0004, "num_input_tokens_seen": 75401896, "step": 111875 }, { "epoch": 2.7332470134121616, "grad_norm": 32.21339797973633, "learning_rate": 1.01171687405908e-06, "loss": 0.1077, "num_input_tokens_seen": 75405096, "step": 111880 }, { "epoch": 2.7333691642440083, "grad_norm": 0.004754332359880209, "learning_rate": 1.0116316022709794e-06, "loss": 0.0356, "num_input_tokens_seen": 75408936, "step": 111885 }, { "epoch": 2.7334913150758555, "grad_norm": 0.40759822726249695, "learning_rate": 1.0115463303982909e-06, "loss": 0.0008, "num_input_tokens_seen": 75412328, "step": 111890 }, { "epoch": 2.7336134659077027, "grad_norm": 5.361917018890381, "learning_rate": 1.0114610584416342e-06, "loss": 0.0011, "num_input_tokens_seen": 75415784, "step": 111895 }, { "epoch": 2.73373561673955, "grad_norm": 0.07207702100276947, "learning_rate": 1.0113757864016298e-06, "loss": 0.0411, "num_input_tokens_seen": 75419304, "step": 111900 }, { "epoch": 2.733857767571397, "grad_norm": 0.059938084334135056, "learning_rate": 1.0112905142788973e-06, "loss": 0.0423, "num_input_tokens_seen": 75422248, "step": 111905 }, { "epoch": 2.7339799184032443, "grad_norm": 0.2088875025510788, "learning_rate": 1.0112052420740573e-06, "loss": 0.0229, "num_input_tokens_seen": 75425384, "step": 111910 }, { "epoch": 2.7341020692350915, "grad_norm": 0.008488044142723083, "learning_rate": 1.0111199697877295e-06, "loss": 0.1241, "num_input_tokens_seen": 75428456, "step": 111915 }, { "epoch": 2.7342242200669387, "grad_norm": 0.06650572270154953, "learning_rate": 1.0110346974205344e-06, "loss": 0.0039, "num_input_tokens_seen": 75432424, "step": 111920 }, { "epoch": 2.734346370898786, "grad_norm": 0.027216732501983643, "learning_rate": 1.010949424973092e-06, "loss": 0.0002, "num_input_tokens_seen": 75435880, "step": 111925 }, { "epoch": 2.734468521730633, "grad_norm": 0.2589007318019867, "learning_rate": 1.0108641524460227e-06, "loss": 0.0335, "num_input_tokens_seen": 75439528, "step": 111930 }, { "epoch": 2.7345906725624802, "grad_norm": 0.9645323753356934, "learning_rate": 1.010778879839946e-06, "loss": 0.0004, "num_input_tokens_seen": 75442792, "step": 111935 }, { "epoch": 2.7347128233943274, "grad_norm": 0.36214014887809753, "learning_rate": 1.0106936071554828e-06, "loss": 0.0001, "num_input_tokens_seen": 75446760, "step": 111940 }, { "epoch": 2.7348349742261746, "grad_norm": 55.24829864501953, "learning_rate": 1.0106083343932527e-06, "loss": 0.0312, "num_input_tokens_seen": 75450536, "step": 111945 }, { "epoch": 2.734957125058022, "grad_norm": 3.9883968830108643, "learning_rate": 1.0105230615538757e-06, "loss": 0.0496, "num_input_tokens_seen": 75454376, "step": 111950 }, { "epoch": 2.735079275889869, "grad_norm": 0.030904987826943398, "learning_rate": 1.0104377886379725e-06, "loss": 0.0975, "num_input_tokens_seen": 75457832, "step": 111955 }, { "epoch": 2.7352014267217157, "grad_norm": 0.000735341280233115, "learning_rate": 1.0103525156461628e-06, "loss": 0.0478, "num_input_tokens_seen": 75461800, "step": 111960 }, { "epoch": 2.7353235775535634, "grad_norm": 0.178748220205307, "learning_rate": 1.0102672425790665e-06, "loss": 0.0003, "num_input_tokens_seen": 75464936, "step": 111965 }, { "epoch": 2.73544572838541, "grad_norm": 0.006740411277860403, "learning_rate": 1.0101819694373045e-06, "loss": 0.0001, "num_input_tokens_seen": 75468456, "step": 111970 }, { "epoch": 2.7355678792172573, "grad_norm": 0.032522208988666534, "learning_rate": 1.0100966962214959e-06, "loss": 0.0001, "num_input_tokens_seen": 75472040, "step": 111975 }, { "epoch": 2.7356900300491045, "grad_norm": 0.0018017988186329603, "learning_rate": 1.0100114229322618e-06, "loss": 0.176, "num_input_tokens_seen": 75475560, "step": 111980 }, { "epoch": 2.7358121808809517, "grad_norm": 1.2526220083236694, "learning_rate": 1.009926149570222e-06, "loss": 0.0004, "num_input_tokens_seen": 75478952, "step": 111985 }, { "epoch": 2.735934331712799, "grad_norm": 0.01706717163324356, "learning_rate": 1.0098408761359965e-06, "loss": 0.0003, "num_input_tokens_seen": 75482088, "step": 111990 }, { "epoch": 2.736056482544646, "grad_norm": 0.1926831156015396, "learning_rate": 1.0097556026302056e-06, "loss": 0.0492, "num_input_tokens_seen": 75485160, "step": 111995 }, { "epoch": 2.7361786333764933, "grad_norm": 0.05004437267780304, "learning_rate": 1.0096703290534693e-06, "loss": 0.0853, "num_input_tokens_seen": 75488232, "step": 112000 }, { "epoch": 2.7363007842083404, "grad_norm": 0.008143107406795025, "learning_rate": 1.0095850554064074e-06, "loss": 0.0004, "num_input_tokens_seen": 75491112, "step": 112005 }, { "epoch": 2.7364229350401876, "grad_norm": 0.4659461975097656, "learning_rate": 1.0094997816896407e-06, "loss": 0.0004, "num_input_tokens_seen": 75494120, "step": 112010 }, { "epoch": 2.736545085872035, "grad_norm": 0.030890757218003273, "learning_rate": 1.009414507903789e-06, "loss": 0.0351, "num_input_tokens_seen": 75497192, "step": 112015 }, { "epoch": 2.736667236703882, "grad_norm": 0.056731339544057846, "learning_rate": 1.0093292340494726e-06, "loss": 0.0548, "num_input_tokens_seen": 75500840, "step": 112020 }, { "epoch": 2.736789387535729, "grad_norm": 0.0058851963840425014, "learning_rate": 1.0092439601273112e-06, "loss": 0.0271, "num_input_tokens_seen": 75504168, "step": 112025 }, { "epoch": 2.7369115383675764, "grad_norm": 0.0594380646944046, "learning_rate": 1.0091586861379256e-06, "loss": 0.0604, "num_input_tokens_seen": 75507432, "step": 112030 }, { "epoch": 2.7370336891994236, "grad_norm": 0.12918642163276672, "learning_rate": 1.0090734120819353e-06, "loss": 0.0002, "num_input_tokens_seen": 75510504, "step": 112035 }, { "epoch": 2.7371558400312708, "grad_norm": 0.012315675616264343, "learning_rate": 1.0089881379599605e-06, "loss": 0.115, "num_input_tokens_seen": 75513576, "step": 112040 }, { "epoch": 2.7372779908631175, "grad_norm": 0.22571399807929993, "learning_rate": 1.0089028637726223e-06, "loss": 0.0002, "num_input_tokens_seen": 75517160, "step": 112045 }, { "epoch": 2.737400141694965, "grad_norm": 70.73702239990234, "learning_rate": 1.0088175895205396e-06, "loss": 0.1897, "num_input_tokens_seen": 75520744, "step": 112050 }, { "epoch": 2.737522292526812, "grad_norm": 0.03366973623633385, "learning_rate": 1.008732315204333e-06, "loss": 0.0008, "num_input_tokens_seen": 75523944, "step": 112055 }, { "epoch": 2.7376444433586595, "grad_norm": 0.002571349497884512, "learning_rate": 1.0086470408246225e-06, "loss": 0.0002, "num_input_tokens_seen": 75527848, "step": 112060 }, { "epoch": 2.7377665941905063, "grad_norm": 284.5186462402344, "learning_rate": 1.0085617663820288e-06, "loss": 0.0832, "num_input_tokens_seen": 75530856, "step": 112065 }, { "epoch": 2.7378887450223535, "grad_norm": 0.15322719514369965, "learning_rate": 1.0084764918771711e-06, "loss": 0.0334, "num_input_tokens_seen": 75534504, "step": 112070 }, { "epoch": 2.7380108958542007, "grad_norm": 0.09069032222032547, "learning_rate": 1.0083912173106703e-06, "loss": 0.0002, "num_input_tokens_seen": 75537704, "step": 112075 }, { "epoch": 2.738133046686048, "grad_norm": 0.2874353229999542, "learning_rate": 1.0083059426831466e-06, "loss": 0.0529, "num_input_tokens_seen": 75540712, "step": 112080 }, { "epoch": 2.738255197517895, "grad_norm": 0.02338283136487007, "learning_rate": 1.0082206679952197e-06, "loss": 0.0003, "num_input_tokens_seen": 75543912, "step": 112085 }, { "epoch": 2.7383773483497422, "grad_norm": 43.06546401977539, "learning_rate": 1.00813539324751e-06, "loss": 0.077, "num_input_tokens_seen": 75547560, "step": 112090 }, { "epoch": 2.7384994991815894, "grad_norm": 0.15701234340667725, "learning_rate": 1.0080501184406372e-06, "loss": 0.0003, "num_input_tokens_seen": 75550632, "step": 112095 }, { "epoch": 2.7386216500134366, "grad_norm": 0.10237427800893784, "learning_rate": 1.007964843575222e-06, "loss": 0.0004, "num_input_tokens_seen": 75554088, "step": 112100 }, { "epoch": 2.738743800845284, "grad_norm": 0.02234814129769802, "learning_rate": 1.007879568651884e-06, "loss": 0.0002, "num_input_tokens_seen": 75557480, "step": 112105 }, { "epoch": 2.738865951677131, "grad_norm": 56.13254165649414, "learning_rate": 1.007794293671244e-06, "loss": 0.1363, "num_input_tokens_seen": 75561320, "step": 112110 }, { "epoch": 2.738988102508978, "grad_norm": 21.933855056762695, "learning_rate": 1.0077090186339218e-06, "loss": 0.0579, "num_input_tokens_seen": 75564648, "step": 112115 }, { "epoch": 2.7391102533408254, "grad_norm": 0.01483561284840107, "learning_rate": 1.0076237435405374e-06, "loss": 0.0003, "num_input_tokens_seen": 75567912, "step": 112120 }, { "epoch": 2.7392324041726726, "grad_norm": 160.25828552246094, "learning_rate": 1.0075384683917111e-06, "loss": 0.0162, "num_input_tokens_seen": 75571240, "step": 112125 }, { "epoch": 2.7393545550045197, "grad_norm": 0.40404099225997925, "learning_rate": 1.007453193188063e-06, "loss": 0.0249, "num_input_tokens_seen": 75574696, "step": 112130 }, { "epoch": 2.739476705836367, "grad_norm": 0.020996518433094025, "learning_rate": 1.0073679179302133e-06, "loss": 0.1026, "num_input_tokens_seen": 75577832, "step": 112135 }, { "epoch": 2.7395988566682137, "grad_norm": 0.008093031123280525, "learning_rate": 1.0072826426187821e-06, "loss": 0.0002, "num_input_tokens_seen": 75581224, "step": 112140 }, { "epoch": 2.7397210075000613, "grad_norm": 0.004957599099725485, "learning_rate": 1.0071973672543898e-06, "loss": 0.0561, "num_input_tokens_seen": 75584744, "step": 112145 }, { "epoch": 2.739843158331908, "grad_norm": 0.018184345215559006, "learning_rate": 1.0071120918376563e-06, "loss": 0.0002, "num_input_tokens_seen": 75588200, "step": 112150 }, { "epoch": 2.7399653091637552, "grad_norm": 0.005898487288504839, "learning_rate": 1.0070268163692017e-06, "loss": 0.0543, "num_input_tokens_seen": 75591208, "step": 112155 }, { "epoch": 2.7400874599956024, "grad_norm": 38.10469055175781, "learning_rate": 1.0069415408496458e-06, "loss": 0.0554, "num_input_tokens_seen": 75594408, "step": 112160 }, { "epoch": 2.7402096108274496, "grad_norm": 0.009115857072174549, "learning_rate": 1.0068562652796095e-06, "loss": 0.0006, "num_input_tokens_seen": 75598120, "step": 112165 }, { "epoch": 2.740331761659297, "grad_norm": 0.20606650412082672, "learning_rate": 1.0067709896597126e-06, "loss": 0.0003, "num_input_tokens_seen": 75601192, "step": 112170 }, { "epoch": 2.740453912491144, "grad_norm": 0.28834593296051025, "learning_rate": 1.0066857139905752e-06, "loss": 0.0003, "num_input_tokens_seen": 75604328, "step": 112175 }, { "epoch": 2.740576063322991, "grad_norm": 0.008773494511842728, "learning_rate": 1.0066004382728176e-06, "loss": 0.048, "num_input_tokens_seen": 75607784, "step": 112180 }, { "epoch": 2.7406982141548384, "grad_norm": 0.037397127598524094, "learning_rate": 1.0065151625070595e-06, "loss": 0.0001, "num_input_tokens_seen": 75610984, "step": 112185 }, { "epoch": 2.7408203649866856, "grad_norm": 0.10962966829538345, "learning_rate": 1.0064298866939216e-06, "loss": 0.0001, "num_input_tokens_seen": 75614632, "step": 112190 }, { "epoch": 2.7409425158185328, "grad_norm": 358.03375244140625, "learning_rate": 1.0063446108340236e-06, "loss": 0.0069, "num_input_tokens_seen": 75617960, "step": 112195 }, { "epoch": 2.74106466665038, "grad_norm": 0.04087607190012932, "learning_rate": 1.0062593349279865e-06, "loss": 0.0001, "num_input_tokens_seen": 75621672, "step": 112200 }, { "epoch": 2.741186817482227, "grad_norm": 1.1919631958007812, "learning_rate": 1.0061740589764294e-06, "loss": 0.0379, "num_input_tokens_seen": 75625000, "step": 112205 }, { "epoch": 2.7413089683140743, "grad_norm": 0.009299659170210361, "learning_rate": 1.0060887829799728e-06, "loss": 0.0185, "num_input_tokens_seen": 75628392, "step": 112210 }, { "epoch": 2.7414311191459215, "grad_norm": 0.003497474826872349, "learning_rate": 1.0060035069392371e-06, "loss": 0.0003, "num_input_tokens_seen": 75631720, "step": 112215 }, { "epoch": 2.7415532699777687, "grad_norm": 0.0010874419240280986, "learning_rate": 1.0059182308548424e-06, "loss": 0.0847, "num_input_tokens_seen": 75635432, "step": 112220 }, { "epoch": 2.7416754208096155, "grad_norm": 0.025534670799970627, "learning_rate": 1.0058329547274083e-06, "loss": 0.0001, "num_input_tokens_seen": 75638824, "step": 112225 }, { "epoch": 2.741797571641463, "grad_norm": 0.06358666718006134, "learning_rate": 1.0057476785575555e-06, "loss": 0.0539, "num_input_tokens_seen": 75642472, "step": 112230 }, { "epoch": 2.74191972247331, "grad_norm": 0.01722525805234909, "learning_rate": 1.0056624023459045e-06, "loss": 0.0001, "num_input_tokens_seen": 75646504, "step": 112235 }, { "epoch": 2.7420418733051575, "grad_norm": 0.5232629179954529, "learning_rate": 1.0055771260930745e-06, "loss": 0.0002, "num_input_tokens_seen": 75650152, "step": 112240 }, { "epoch": 2.742164024137004, "grad_norm": 0.512911319732666, "learning_rate": 1.0054918497996865e-06, "loss": 0.1006, "num_input_tokens_seen": 75653288, "step": 112245 }, { "epoch": 2.7422861749688514, "grad_norm": 0.0002688757376745343, "learning_rate": 1.00540657346636e-06, "loss": 0.0779, "num_input_tokens_seen": 75656616, "step": 112250 }, { "epoch": 2.7424083258006986, "grad_norm": 0.016228975728154182, "learning_rate": 1.0053212970937157e-06, "loss": 0.0001, "num_input_tokens_seen": 75659944, "step": 112255 }, { "epoch": 2.742530476632546, "grad_norm": 6.871138095855713, "learning_rate": 1.0052360206823733e-06, "loss": 0.0383, "num_input_tokens_seen": 75663016, "step": 112260 }, { "epoch": 2.742652627464393, "grad_norm": 0.07361658662557602, "learning_rate": 1.0051507442329533e-06, "loss": 0.0728, "num_input_tokens_seen": 75666472, "step": 112265 }, { "epoch": 2.74277477829624, "grad_norm": 39.02785110473633, "learning_rate": 1.0050654677460754e-06, "loss": 0.1776, "num_input_tokens_seen": 75669672, "step": 112270 }, { "epoch": 2.7428969291280874, "grad_norm": 0.022098205983638763, "learning_rate": 1.0049801912223603e-06, "loss": 0.0558, "num_input_tokens_seen": 75672744, "step": 112275 }, { "epoch": 2.7430190799599345, "grad_norm": 0.07683975249528885, "learning_rate": 1.004894914662428e-06, "loss": 0.1883, "num_input_tokens_seen": 75676072, "step": 112280 }, { "epoch": 2.7431412307917817, "grad_norm": 0.02362046204507351, "learning_rate": 1.0048096380668982e-06, "loss": 0.1013, "num_input_tokens_seen": 75679208, "step": 112285 }, { "epoch": 2.743263381623629, "grad_norm": 0.04820949584245682, "learning_rate": 1.0047243614363916e-06, "loss": 0.0001, "num_input_tokens_seen": 75682984, "step": 112290 }, { "epoch": 2.743385532455476, "grad_norm": 23.516231536865234, "learning_rate": 1.0046390847715282e-06, "loss": 0.0477, "num_input_tokens_seen": 75686568, "step": 112295 }, { "epoch": 2.7435076832873233, "grad_norm": 0.06488364189863205, "learning_rate": 1.0045538080729283e-06, "loss": 0.0003, "num_input_tokens_seen": 75689896, "step": 112300 }, { "epoch": 2.7436298341191705, "grad_norm": 0.009952181950211525, "learning_rate": 1.004468531341212e-06, "loss": 0.0502, "num_input_tokens_seen": 75693416, "step": 112305 }, { "epoch": 2.7437519849510172, "grad_norm": 0.01083527971059084, "learning_rate": 1.004383254576999e-06, "loss": 0.0489, "num_input_tokens_seen": 75696488, "step": 112310 }, { "epoch": 2.743874135782865, "grad_norm": 0.007215315010398626, "learning_rate": 1.00429797778091e-06, "loss": 0.1186, "num_input_tokens_seen": 75699752, "step": 112315 }, { "epoch": 2.7439962866147116, "grad_norm": 0.023482423275709152, "learning_rate": 1.0042127009535647e-06, "loss": 0.056, "num_input_tokens_seen": 75702568, "step": 112320 }, { "epoch": 2.7441184374465593, "grad_norm": 47.78105545043945, "learning_rate": 1.0041274240955834e-06, "loss": 0.1368, "num_input_tokens_seen": 75705832, "step": 112325 }, { "epoch": 2.744240588278406, "grad_norm": 0.26664429903030396, "learning_rate": 1.0040421472075865e-06, "loss": 0.0002, "num_input_tokens_seen": 75709224, "step": 112330 }, { "epoch": 2.744362739110253, "grad_norm": 30.276893615722656, "learning_rate": 1.0039568702901942e-06, "loss": 0.1433, "num_input_tokens_seen": 75712488, "step": 112335 }, { "epoch": 2.7444848899421004, "grad_norm": 0.09096966683864594, "learning_rate": 1.0038715933440265e-06, "loss": 0.0005, "num_input_tokens_seen": 75715944, "step": 112340 }, { "epoch": 2.7446070407739476, "grad_norm": 0.09213719516992569, "learning_rate": 1.0037863163697034e-06, "loss": 0.0003, "num_input_tokens_seen": 75718952, "step": 112345 }, { "epoch": 2.7447291916057948, "grad_norm": 0.038761381059885025, "learning_rate": 1.0037010393678449e-06, "loss": 0.0397, "num_input_tokens_seen": 75722152, "step": 112350 }, { "epoch": 2.744851342437642, "grad_norm": 0.0874420702457428, "learning_rate": 1.003615762339072e-06, "loss": 0.0005, "num_input_tokens_seen": 75725032, "step": 112355 }, { "epoch": 2.744973493269489, "grad_norm": 0.2326081395149231, "learning_rate": 1.0035304852840042e-06, "loss": 0.0392, "num_input_tokens_seen": 75728168, "step": 112360 }, { "epoch": 2.7450956441013363, "grad_norm": 40.23335266113281, "learning_rate": 1.0034452082032615e-06, "loss": 0.0927, "num_input_tokens_seen": 75731112, "step": 112365 }, { "epoch": 2.7452177949331835, "grad_norm": 0.10373587161302567, "learning_rate": 1.0033599310974645e-06, "loss": 0.0002, "num_input_tokens_seen": 75734312, "step": 112370 }, { "epoch": 2.7453399457650307, "grad_norm": 10.037677764892578, "learning_rate": 1.003274653967233e-06, "loss": 0.0007, "num_input_tokens_seen": 75737704, "step": 112375 }, { "epoch": 2.745462096596878, "grad_norm": 0.325348436832428, "learning_rate": 1.0031893768131874e-06, "loss": 0.0004, "num_input_tokens_seen": 75741096, "step": 112380 }, { "epoch": 2.745584247428725, "grad_norm": 0.03348547965288162, "learning_rate": 1.0031040996359478e-06, "loss": 0.0006, "num_input_tokens_seen": 75744424, "step": 112385 }, { "epoch": 2.7457063982605723, "grad_norm": 0.0019438609015196562, "learning_rate": 1.0030188224361344e-06, "loss": 0.0002, "num_input_tokens_seen": 75748136, "step": 112390 }, { "epoch": 2.7458285490924195, "grad_norm": 0.055961500853300095, "learning_rate": 1.0029335452143673e-06, "loss": 0.0344, "num_input_tokens_seen": 75753384, "step": 112395 }, { "epoch": 2.7459506999242667, "grad_norm": 0.038423359394073486, "learning_rate": 1.0028482679712667e-06, "loss": 0.0005, "num_input_tokens_seen": 75756648, "step": 112400 }, { "epoch": 2.7460728507561134, "grad_norm": 0.03630128875374794, "learning_rate": 1.0027629907074527e-06, "loss": 0.0003, "num_input_tokens_seen": 75760104, "step": 112405 }, { "epoch": 2.746195001587961, "grad_norm": 0.2955532968044281, "learning_rate": 1.0026777134235456e-06, "loss": 0.0717, "num_input_tokens_seen": 75763624, "step": 112410 }, { "epoch": 2.746317152419808, "grad_norm": 0.00962782371789217, "learning_rate": 1.0025924361201652e-06, "loss": 0.0823, "num_input_tokens_seen": 75766632, "step": 112415 }, { "epoch": 2.7464393032516554, "grad_norm": 0.03371252119541168, "learning_rate": 1.0025071587979322e-06, "loss": 0.0001, "num_input_tokens_seen": 75769768, "step": 112420 }, { "epoch": 2.746561454083502, "grad_norm": 0.45033684372901917, "learning_rate": 1.0024218814574664e-06, "loss": 0.0004, "num_input_tokens_seen": 75773096, "step": 112425 }, { "epoch": 2.7466836049153494, "grad_norm": 0.558583676815033, "learning_rate": 1.0023366040993876e-06, "loss": 0.0716, "num_input_tokens_seen": 75776424, "step": 112430 }, { "epoch": 2.7468057557471965, "grad_norm": 0.00862213410437107, "learning_rate": 1.0022513267243169e-06, "loss": 0.0001, "num_input_tokens_seen": 75779560, "step": 112435 }, { "epoch": 2.7469279065790437, "grad_norm": 0.012112240307033062, "learning_rate": 1.0021660493328737e-06, "loss": 0.0013, "num_input_tokens_seen": 75783080, "step": 112440 }, { "epoch": 2.747050057410891, "grad_norm": 52.65711212158203, "learning_rate": 1.0020807719256784e-06, "loss": 0.046, "num_input_tokens_seen": 75786408, "step": 112445 }, { "epoch": 2.747172208242738, "grad_norm": 0.04763922840356827, "learning_rate": 1.0019954945033513e-06, "loss": 0.0004, "num_input_tokens_seen": 75789480, "step": 112450 }, { "epoch": 2.7472943590745853, "grad_norm": 0.052201878279447556, "learning_rate": 1.0019102170665124e-06, "loss": 0.0313, "num_input_tokens_seen": 75792936, "step": 112455 }, { "epoch": 2.7474165099064325, "grad_norm": 0.007947223260998726, "learning_rate": 1.0018249396157818e-06, "loss": 0.1355, "num_input_tokens_seen": 75796136, "step": 112460 }, { "epoch": 2.7475386607382797, "grad_norm": 0.02703636698424816, "learning_rate": 1.00173966215178e-06, "loss": 0.0003, "num_input_tokens_seen": 75799208, "step": 112465 }, { "epoch": 2.747660811570127, "grad_norm": 45.332603454589844, "learning_rate": 1.0016543846751265e-06, "loss": 0.0389, "num_input_tokens_seen": 75802536, "step": 112470 }, { "epoch": 2.747782962401974, "grad_norm": 0.008542269468307495, "learning_rate": 1.001569107186442e-06, "loss": 0.0, "num_input_tokens_seen": 75806312, "step": 112475 }, { "epoch": 2.7479051132338213, "grad_norm": 4.6416215896606445, "learning_rate": 1.0014838296863467e-06, "loss": 0.0008, "num_input_tokens_seen": 75810216, "step": 112480 }, { "epoch": 2.7480272640656684, "grad_norm": 0.0047716982662677765, "learning_rate": 1.0013985521754606e-06, "loss": 0.0002, "num_input_tokens_seen": 75813672, "step": 112485 }, { "epoch": 2.748149414897515, "grad_norm": 0.010327127762138844, "learning_rate": 1.0013132746544038e-06, "loss": 0.0526, "num_input_tokens_seen": 75817192, "step": 112490 }, { "epoch": 2.748271565729363, "grad_norm": 0.0022148210555315018, "learning_rate": 1.0012279971237965e-06, "loss": 0.0005, "num_input_tokens_seen": 75820264, "step": 112495 }, { "epoch": 2.7483937165612096, "grad_norm": 27.39652442932129, "learning_rate": 1.0011427195842589e-06, "loss": 0.0785, "num_input_tokens_seen": 75823272, "step": 112500 }, { "epoch": 2.748515867393057, "grad_norm": 0.18398308753967285, "learning_rate": 1.0010574420364108e-06, "loss": 0.0461, "num_input_tokens_seen": 75826344, "step": 112505 }, { "epoch": 2.748638018224904, "grad_norm": 0.2839778661727905, "learning_rate": 1.0009721644808734e-06, "loss": 0.0008, "num_input_tokens_seen": 75829672, "step": 112510 }, { "epoch": 2.748760169056751, "grad_norm": 0.0654890388250351, "learning_rate": 1.0008868869182656e-06, "loss": 0.0408, "num_input_tokens_seen": 75833192, "step": 112515 }, { "epoch": 2.7488823198885983, "grad_norm": 0.01932072453200817, "learning_rate": 1.0008016093492082e-06, "loss": 0.0003, "num_input_tokens_seen": 75836968, "step": 112520 }, { "epoch": 2.7490044707204455, "grad_norm": 0.020976558327674866, "learning_rate": 1.0007163317743214e-06, "loss": 0.0, "num_input_tokens_seen": 75840040, "step": 112525 }, { "epoch": 2.7491266215522927, "grad_norm": 0.012725918553769588, "learning_rate": 1.000631054194225e-06, "loss": 0.025, "num_input_tokens_seen": 75843432, "step": 112530 }, { "epoch": 2.74924877238414, "grad_norm": 0.01118039432913065, "learning_rate": 1.0005457766095395e-06, "loss": 0.0011, "num_input_tokens_seen": 75846824, "step": 112535 }, { "epoch": 2.749370923215987, "grad_norm": 49.7636833190918, "learning_rate": 1.000460499020885e-06, "loss": 0.0434, "num_input_tokens_seen": 75850344, "step": 112540 }, { "epoch": 2.7494930740478343, "grad_norm": 0.0003692000173032284, "learning_rate": 1.0003752214288818e-06, "loss": 0.0944, "num_input_tokens_seen": 75854120, "step": 112545 }, { "epoch": 2.7496152248796815, "grad_norm": 0.006110539194196463, "learning_rate": 1.0002899438341498e-06, "loss": 0.0541, "num_input_tokens_seen": 75857384, "step": 112550 }, { "epoch": 2.7497373757115287, "grad_norm": 0.132795512676239, "learning_rate": 1.0002046662373092e-06, "loss": 0.0001, "num_input_tokens_seen": 75860968, "step": 112555 }, { "epoch": 2.749859526543376, "grad_norm": 0.030062628909945488, "learning_rate": 1.0001193886389803e-06, "loss": 0.0002, "num_input_tokens_seen": 75864552, "step": 112560 }, { "epoch": 2.749981677375223, "grad_norm": 0.005698109045624733, "learning_rate": 1.000034111039783e-06, "loss": 0.0001, "num_input_tokens_seen": 75867752, "step": 112565 }, { "epoch": 2.7501038282070702, "grad_norm": 1.852415919303894, "learning_rate": 9.99948833440338e-07, "loss": 0.0008, "num_input_tokens_seen": 75871400, "step": 112570 }, { "epoch": 2.750201548872548, "eval_loss": 0.20390097796916962, "eval_runtime": 47.6331, "eval_samples_per_second": 763.86, "eval_steps_per_second": 95.501, "num_input_tokens_seen": 75874280, "step": 112574 }, { "epoch": 2.7502259790389174, "grad_norm": 0.024079719558358192, "learning_rate": 9.998635558412646e-07, "loss": 0.0001, "num_input_tokens_seen": 75874856, "step": 112575 }, { "epoch": 2.7503481298707646, "grad_norm": 890.7974853515625, "learning_rate": 9.997782782431837e-07, "loss": 0.0653, "num_input_tokens_seen": 75878184, "step": 112580 }, { "epoch": 2.7504702807026113, "grad_norm": 0.0008283877978101373, "learning_rate": 9.996930006467153e-07, "loss": 0.0, "num_input_tokens_seen": 75881448, "step": 112585 }, { "epoch": 2.750592431534459, "grad_norm": 0.05993705615401268, "learning_rate": 9.996077230524793e-07, "loss": 0.0685, "num_input_tokens_seen": 75884392, "step": 112590 }, { "epoch": 2.7507145823663057, "grad_norm": 0.03609907999634743, "learning_rate": 9.995224454610963e-07, "loss": 0.0523, "num_input_tokens_seen": 75887464, "step": 112595 }, { "epoch": 2.750836733198153, "grad_norm": 0.001615293207578361, "learning_rate": 9.994371678731857e-07, "loss": 0.0001, "num_input_tokens_seen": 75890472, "step": 112600 }, { "epoch": 2.75095888403, "grad_norm": 0.12746308743953705, "learning_rate": 9.993518902893688e-07, "loss": 0.0001, "num_input_tokens_seen": 75893800, "step": 112605 }, { "epoch": 2.7510810348618473, "grad_norm": 0.016232147812843323, "learning_rate": 9.992666127102648e-07, "loss": 0.1098, "num_input_tokens_seen": 75897192, "step": 112610 }, { "epoch": 2.7512031856936945, "grad_norm": 0.03641084209084511, "learning_rate": 9.991813351364941e-07, "loss": 0.0004, "num_input_tokens_seen": 75900392, "step": 112615 }, { "epoch": 2.7513253365255417, "grad_norm": 0.00999990850687027, "learning_rate": 9.990960575686773e-07, "loss": 0.0393, "num_input_tokens_seen": 75904040, "step": 112620 }, { "epoch": 2.751447487357389, "grad_norm": 0.005113823339343071, "learning_rate": 9.990107800074338e-07, "loss": 0.0002, "num_input_tokens_seen": 75907432, "step": 112625 }, { "epoch": 2.751569638189236, "grad_norm": 0.08694149553775787, "learning_rate": 9.989255024533846e-07, "loss": 0.0374, "num_input_tokens_seen": 75910824, "step": 112630 }, { "epoch": 2.7516917890210832, "grad_norm": 0.02369142509996891, "learning_rate": 9.98840224907149e-07, "loss": 0.1025, "num_input_tokens_seen": 75914088, "step": 112635 }, { "epoch": 2.7518139398529304, "grad_norm": 0.05341620370745659, "learning_rate": 9.98754947369348e-07, "loss": 0.0001, "num_input_tokens_seen": 75917224, "step": 112640 }, { "epoch": 2.7519360906847776, "grad_norm": 0.058093249797821045, "learning_rate": 9.98669669840601e-07, "loss": 0.0677, "num_input_tokens_seen": 75920808, "step": 112645 }, { "epoch": 2.752058241516625, "grad_norm": 0.009987649507820606, "learning_rate": 9.985843923215284e-07, "loss": 0.0001, "num_input_tokens_seen": 75924520, "step": 112650 }, { "epoch": 2.752180392348472, "grad_norm": 0.04662652686238289, "learning_rate": 9.98499114812751e-07, "loss": 0.0261, "num_input_tokens_seen": 75927528, "step": 112655 }, { "epoch": 2.752302543180319, "grad_norm": 0.0058932919055223465, "learning_rate": 9.98413837314888e-07, "loss": 0.0002, "num_input_tokens_seen": 75930536, "step": 112660 }, { "epoch": 2.7524246940121664, "grad_norm": 0.00041428772965446115, "learning_rate": 9.983285598285606e-07, "loss": 0.0001, "num_input_tokens_seen": 75934248, "step": 112665 }, { "epoch": 2.752546844844013, "grad_norm": 75.15556335449219, "learning_rate": 9.98243282354388e-07, "loss": 0.0701, "num_input_tokens_seen": 75937320, "step": 112670 }, { "epoch": 2.7526689956758608, "grad_norm": 38.11005783081055, "learning_rate": 9.981580048929904e-07, "loss": 0.095, "num_input_tokens_seen": 75940072, "step": 112675 }, { "epoch": 2.7527911465077075, "grad_norm": 0.19551952183246613, "learning_rate": 9.980727274449886e-07, "loss": 0.0005, "num_input_tokens_seen": 75943848, "step": 112680 }, { "epoch": 2.752913297339555, "grad_norm": 0.005637908820062876, "learning_rate": 9.979874500110023e-07, "loss": 0.0933, "num_input_tokens_seen": 75946984, "step": 112685 }, { "epoch": 2.753035448171402, "grad_norm": 0.02520628087222576, "learning_rate": 9.979021725916521e-07, "loss": 0.0004, "num_input_tokens_seen": 75950120, "step": 112690 }, { "epoch": 2.753157599003249, "grad_norm": 0.1252969652414322, "learning_rate": 9.978168951875576e-07, "loss": 0.0001, "num_input_tokens_seen": 75953512, "step": 112695 }, { "epoch": 2.7532797498350963, "grad_norm": 0.1884545087814331, "learning_rate": 9.977316177993395e-07, "loss": 0.0721, "num_input_tokens_seen": 75956584, "step": 112700 }, { "epoch": 2.7534019006669435, "grad_norm": 0.07097301632165909, "learning_rate": 9.976463404276173e-07, "loss": 0.0661, "num_input_tokens_seen": 75960168, "step": 112705 }, { "epoch": 2.7535240514987906, "grad_norm": 0.02633850835263729, "learning_rate": 9.975610630730118e-07, "loss": 0.0001, "num_input_tokens_seen": 75963816, "step": 112710 }, { "epoch": 2.753646202330638, "grad_norm": 0.03084862418472767, "learning_rate": 9.97475785736143e-07, "loss": 0.0405, "num_input_tokens_seen": 75967016, "step": 112715 }, { "epoch": 2.753768353162485, "grad_norm": 0.0017762664938345551, "learning_rate": 9.973905084176307e-07, "loss": 0.0001, "num_input_tokens_seen": 75970600, "step": 112720 }, { "epoch": 2.753890503994332, "grad_norm": 0.06449517607688904, "learning_rate": 9.973052311180956e-07, "loss": 0.1143, "num_input_tokens_seen": 75973736, "step": 112725 }, { "epoch": 2.7540126548261794, "grad_norm": 0.07654926925897598, "learning_rate": 9.972199538381573e-07, "loss": 0.0002, "num_input_tokens_seen": 75977192, "step": 112730 }, { "epoch": 2.7541348056580266, "grad_norm": 0.062180664390325546, "learning_rate": 9.97134676578436e-07, "loss": 0.0349, "num_input_tokens_seen": 75980520, "step": 112735 }, { "epoch": 2.754256956489874, "grad_norm": 62.79735565185547, "learning_rate": 9.970493993395527e-07, "loss": 0.2066, "num_input_tokens_seen": 75984104, "step": 112740 }, { "epoch": 2.754379107321721, "grad_norm": 0.04304159805178642, "learning_rate": 9.969641221221267e-07, "loss": 0.0005, "num_input_tokens_seen": 75987368, "step": 112745 }, { "epoch": 2.754501258153568, "grad_norm": 0.3604861795902252, "learning_rate": 9.968788449267786e-07, "loss": 0.0365, "num_input_tokens_seen": 75990632, "step": 112750 }, { "epoch": 2.7546234089854154, "grad_norm": 0.07671809196472168, "learning_rate": 9.96793567754128e-07, "loss": 0.0005, "num_input_tokens_seen": 75994216, "step": 112755 }, { "epoch": 2.7547455598172625, "grad_norm": 0.007398826535791159, "learning_rate": 9.967082906047958e-07, "loss": 0.0001, "num_input_tokens_seen": 75997480, "step": 112760 }, { "epoch": 2.7548677106491093, "grad_norm": 0.012353933416306973, "learning_rate": 9.966230134794017e-07, "loss": 0.0611, "num_input_tokens_seen": 76000744, "step": 112765 }, { "epoch": 2.754989861480957, "grad_norm": 0.05334820970892906, "learning_rate": 9.965377363785657e-07, "loss": 0.0002, "num_input_tokens_seen": 76004456, "step": 112770 }, { "epoch": 2.7551120123128037, "grad_norm": 0.05684984475374222, "learning_rate": 9.964524593029089e-07, "loss": 0.146, "num_input_tokens_seen": 76007656, "step": 112775 }, { "epoch": 2.755234163144651, "grad_norm": 0.02138231322169304, "learning_rate": 9.963671822530499e-07, "loss": 0.0002, "num_input_tokens_seen": 76010920, "step": 112780 }, { "epoch": 2.755356313976498, "grad_norm": 0.14898619055747986, "learning_rate": 9.962819052296105e-07, "loss": 0.0003, "num_input_tokens_seen": 76014440, "step": 112785 }, { "epoch": 2.7554784648083452, "grad_norm": 49.297935485839844, "learning_rate": 9.961966282332093e-07, "loss": 0.0998, "num_input_tokens_seen": 76017704, "step": 112790 }, { "epoch": 2.7556006156401924, "grad_norm": 0.06874702125787735, "learning_rate": 9.96111351264468e-07, "loss": 0.0668, "num_input_tokens_seen": 76020648, "step": 112795 }, { "epoch": 2.7557227664720396, "grad_norm": 0.3245699405670166, "learning_rate": 9.960260743240054e-07, "loss": 0.0536, "num_input_tokens_seen": 76023592, "step": 112800 }, { "epoch": 2.755844917303887, "grad_norm": 0.4467359781265259, "learning_rate": 9.959407974124423e-07, "loss": 0.0483, "num_input_tokens_seen": 76026536, "step": 112805 }, { "epoch": 2.755967068135734, "grad_norm": 0.13222886621952057, "learning_rate": 9.958555205303992e-07, "loss": 0.0415, "num_input_tokens_seen": 76029800, "step": 112810 }, { "epoch": 2.756089218967581, "grad_norm": 0.01306053064763546, "learning_rate": 9.957702436784956e-07, "loss": 0.0215, "num_input_tokens_seen": 76032616, "step": 112815 }, { "epoch": 2.7562113697994284, "grad_norm": 0.019551554694771767, "learning_rate": 9.95684966857352e-07, "loss": 0.1356, "num_input_tokens_seen": 76035624, "step": 112820 }, { "epoch": 2.7563335206312756, "grad_norm": 0.022561049088835716, "learning_rate": 9.955996900675888e-07, "loss": 0.001, "num_input_tokens_seen": 76038760, "step": 112825 }, { "epoch": 2.7564556714631228, "grad_norm": 0.07821331173181534, "learning_rate": 9.955144133098253e-07, "loss": 0.0445, "num_input_tokens_seen": 76042344, "step": 112830 }, { "epoch": 2.75657782229497, "grad_norm": 0.17297571897506714, "learning_rate": 9.954291365846825e-07, "loss": 0.0004, "num_input_tokens_seen": 76045608, "step": 112835 }, { "epoch": 2.756699973126817, "grad_norm": 0.15678569674491882, "learning_rate": 9.953438598927801e-07, "loss": 0.0006, "num_input_tokens_seen": 76048744, "step": 112840 }, { "epoch": 2.7568221239586643, "grad_norm": 0.07841111719608307, "learning_rate": 9.952585832347387e-07, "loss": 0.0003, "num_input_tokens_seen": 76051816, "step": 112845 }, { "epoch": 2.756944274790511, "grad_norm": 0.014237224124372005, "learning_rate": 9.951733066111776e-07, "loss": 0.0392, "num_input_tokens_seen": 76054952, "step": 112850 }, { "epoch": 2.7570664256223587, "grad_norm": 0.21275421977043152, "learning_rate": 9.950880300227183e-07, "loss": 0.1102, "num_input_tokens_seen": 76058280, "step": 112855 }, { "epoch": 2.7571885764542055, "grad_norm": 0.013343808241188526, "learning_rate": 9.950027534699793e-07, "loss": 0.0001, "num_input_tokens_seen": 76062120, "step": 112860 }, { "epoch": 2.757310727286053, "grad_norm": 0.07676204293966293, "learning_rate": 9.949174769535821e-07, "loss": 0.0002, "num_input_tokens_seen": 76065448, "step": 112865 }, { "epoch": 2.7574328781179, "grad_norm": 34.86268615722656, "learning_rate": 9.948322004741465e-07, "loss": 0.1247, "num_input_tokens_seen": 76069032, "step": 112870 }, { "epoch": 2.757555028949747, "grad_norm": 0.07208728790283203, "learning_rate": 9.947469240322922e-07, "loss": 0.0006, "num_input_tokens_seen": 76072616, "step": 112875 }, { "epoch": 2.757677179781594, "grad_norm": 0.28862571716308594, "learning_rate": 9.946616476286402e-07, "loss": 0.0514, "num_input_tokens_seen": 76075816, "step": 112880 }, { "epoch": 2.7577993306134414, "grad_norm": 0.08105853945016861, "learning_rate": 9.945763712638094e-07, "loss": 0.0008, "num_input_tokens_seen": 76079144, "step": 112885 }, { "epoch": 2.7579214814452886, "grad_norm": 0.00472519313916564, "learning_rate": 9.944910949384213e-07, "loss": 0.0408, "num_input_tokens_seen": 76082664, "step": 112890 }, { "epoch": 2.758043632277136, "grad_norm": 0.14678433537483215, "learning_rate": 9.944058186530951e-07, "loss": 0.0002, "num_input_tokens_seen": 76086248, "step": 112895 }, { "epoch": 2.758165783108983, "grad_norm": 0.09297756850719452, "learning_rate": 9.94320542408451e-07, "loss": 0.0001, "num_input_tokens_seen": 76089512, "step": 112900 }, { "epoch": 2.75828793394083, "grad_norm": 0.03199482336640358, "learning_rate": 9.9423526620511e-07, "loss": 0.1132, "num_input_tokens_seen": 76092584, "step": 112905 }, { "epoch": 2.7584100847726774, "grad_norm": 0.12539511919021606, "learning_rate": 9.941499900436915e-07, "loss": 0.0003, "num_input_tokens_seen": 76096104, "step": 112910 }, { "epoch": 2.7585322356045245, "grad_norm": 0.033748965710401535, "learning_rate": 9.94064713924816e-07, "loss": 0.0628, "num_input_tokens_seen": 76099432, "step": 112915 }, { "epoch": 2.7586543864363717, "grad_norm": 16.201793670654297, "learning_rate": 9.93979437849103e-07, "loss": 0.0576, "num_input_tokens_seen": 76103464, "step": 112920 }, { "epoch": 2.758776537268219, "grad_norm": 0.017845073714852333, "learning_rate": 9.938941618171736e-07, "loss": 0.049, "num_input_tokens_seen": 76106664, "step": 112925 }, { "epoch": 2.758898688100066, "grad_norm": 0.01976141892373562, "learning_rate": 9.938088858296477e-07, "loss": 0.0003, "num_input_tokens_seen": 76110376, "step": 112930 }, { "epoch": 2.759020838931913, "grad_norm": 16.954193115234375, "learning_rate": 9.937236098871447e-07, "loss": 0.0705, "num_input_tokens_seen": 76114152, "step": 112935 }, { "epoch": 2.7591429897637605, "grad_norm": 34.79048156738281, "learning_rate": 9.936383339902858e-07, "loss": 0.0008, "num_input_tokens_seen": 76117480, "step": 112940 }, { "epoch": 2.7592651405956072, "grad_norm": 0.17568282783031464, "learning_rate": 9.935530581396902e-07, "loss": 0.0002, "num_input_tokens_seen": 76120744, "step": 112945 }, { "epoch": 2.759387291427455, "grad_norm": 0.010089860297739506, "learning_rate": 9.93467782335979e-07, "loss": 0.0257, "num_input_tokens_seen": 76124264, "step": 112950 }, { "epoch": 2.7595094422593016, "grad_norm": 0.032470934092998505, "learning_rate": 9.933825065797711e-07, "loss": 0.0001, "num_input_tokens_seen": 76128296, "step": 112955 }, { "epoch": 2.759631593091149, "grad_norm": 0.037431854754686356, "learning_rate": 9.932972308716877e-07, "loss": 0.0007, "num_input_tokens_seen": 76132008, "step": 112960 }, { "epoch": 2.759753743922996, "grad_norm": 0.17975296080112457, "learning_rate": 9.93211955212349e-07, "loss": 0.0002, "num_input_tokens_seen": 76136104, "step": 112965 }, { "epoch": 2.759875894754843, "grad_norm": 0.013288943096995354, "learning_rate": 9.931266796023744e-07, "loss": 0.0009, "num_input_tokens_seen": 76139688, "step": 112970 }, { "epoch": 2.7599980455866904, "grad_norm": 0.09433402121067047, "learning_rate": 9.930414040423848e-07, "loss": 0.0008, "num_input_tokens_seen": 76143400, "step": 112975 }, { "epoch": 2.7601201964185376, "grad_norm": 0.19448913633823395, "learning_rate": 9.929561285329997e-07, "loss": 0.0592, "num_input_tokens_seen": 76147560, "step": 112980 }, { "epoch": 2.7602423472503848, "grad_norm": 0.04659518599510193, "learning_rate": 9.928708530748395e-07, "loss": 0.0453, "num_input_tokens_seen": 76150760, "step": 112985 }, { "epoch": 2.760364498082232, "grad_norm": 0.052931103855371475, "learning_rate": 9.927855776685247e-07, "loss": 0.0001, "num_input_tokens_seen": 76154216, "step": 112990 }, { "epoch": 2.760486648914079, "grad_norm": 0.023006392642855644, "learning_rate": 9.927003023146745e-07, "loss": 0.0004, "num_input_tokens_seen": 76157352, "step": 112995 }, { "epoch": 2.7606087997459263, "grad_norm": 0.009410569444298744, "learning_rate": 9.926150270139104e-07, "loss": 0.0006, "num_input_tokens_seen": 76160680, "step": 113000 }, { "epoch": 2.7607309505777735, "grad_norm": 0.01064909528940916, "learning_rate": 9.925297517668512e-07, "loss": 0.0001, "num_input_tokens_seen": 76163944, "step": 113005 }, { "epoch": 2.7608531014096207, "grad_norm": 0.158854141831398, "learning_rate": 9.924444765741183e-07, "loss": 0.0002, "num_input_tokens_seen": 76167016, "step": 113010 }, { "epoch": 2.760975252241468, "grad_norm": 0.17472516000270844, "learning_rate": 9.923592014363305e-07, "loss": 0.0008, "num_input_tokens_seen": 76170600, "step": 113015 }, { "epoch": 2.761097403073315, "grad_norm": 0.006018009968101978, "learning_rate": 9.92273926354109e-07, "loss": 0.0372, "num_input_tokens_seen": 76173800, "step": 113020 }, { "epoch": 2.7612195539051623, "grad_norm": 0.007808130234479904, "learning_rate": 9.921886513280735e-07, "loss": 0.0001, "num_input_tokens_seen": 76176936, "step": 113025 }, { "epoch": 2.761341704737009, "grad_norm": 0.08086532354354858, "learning_rate": 9.921033763588444e-07, "loss": 0.0004, "num_input_tokens_seen": 76179880, "step": 113030 }, { "epoch": 2.7614638555688567, "grad_norm": 0.006262186449021101, "learning_rate": 9.920181014470417e-07, "loss": 0.0002, "num_input_tokens_seen": 76183528, "step": 113035 }, { "epoch": 2.7615860064007034, "grad_norm": 0.020363593474030495, "learning_rate": 9.919328265932852e-07, "loss": 0.0705, "num_input_tokens_seen": 76186856, "step": 113040 }, { "epoch": 2.761708157232551, "grad_norm": 0.011324654333293438, "learning_rate": 9.918475517981958e-07, "loss": 0.0, "num_input_tokens_seen": 76190376, "step": 113045 }, { "epoch": 2.7618303080643978, "grad_norm": 12.743666648864746, "learning_rate": 9.917622770623925e-07, "loss": 0.049, "num_input_tokens_seen": 76194216, "step": 113050 }, { "epoch": 2.761952458896245, "grad_norm": 13.240878105163574, "learning_rate": 9.916770023864964e-07, "loss": 0.0647, "num_input_tokens_seen": 76197224, "step": 113055 }, { "epoch": 2.762074609728092, "grad_norm": 0.5368944406509399, "learning_rate": 9.915917277711277e-07, "loss": 0.0004, "num_input_tokens_seen": 76200808, "step": 113060 }, { "epoch": 2.7621967605599393, "grad_norm": 0.029167726635932922, "learning_rate": 9.915064532169058e-07, "loss": 0.0004, "num_input_tokens_seen": 76203816, "step": 113065 }, { "epoch": 2.7623189113917865, "grad_norm": 0.005114384926855564, "learning_rate": 9.91421178724452e-07, "loss": 0.0001, "num_input_tokens_seen": 76206952, "step": 113070 }, { "epoch": 2.7624410622236337, "grad_norm": 0.03877369314432144, "learning_rate": 9.913359042943848e-07, "loss": 0.0002, "num_input_tokens_seen": 76210024, "step": 113075 }, { "epoch": 2.762563213055481, "grad_norm": 672.5220947265625, "learning_rate": 9.912506299273256e-07, "loss": 0.0261, "num_input_tokens_seen": 76213480, "step": 113080 }, { "epoch": 2.762685363887328, "grad_norm": 0.17560061812400818, "learning_rate": 9.911653556238945e-07, "loss": 0.1107, "num_input_tokens_seen": 76216488, "step": 113085 }, { "epoch": 2.7628075147191753, "grad_norm": 0.04069105163216591, "learning_rate": 9.910800813847107e-07, "loss": 0.0676, "num_input_tokens_seen": 76219816, "step": 113090 }, { "epoch": 2.7629296655510225, "grad_norm": 0.014730570837855339, "learning_rate": 9.909948072103956e-07, "loss": 0.0007, "num_input_tokens_seen": 76223144, "step": 113095 }, { "epoch": 2.7630518163828697, "grad_norm": 0.13270990550518036, "learning_rate": 9.90909533101568e-07, "loss": 0.045, "num_input_tokens_seen": 76226536, "step": 113100 }, { "epoch": 2.763173967214717, "grad_norm": 0.04199433699250221, "learning_rate": 9.908242590588494e-07, "loss": 0.0005, "num_input_tokens_seen": 76230248, "step": 113105 }, { "epoch": 2.763296118046564, "grad_norm": 0.035170089453458786, "learning_rate": 9.907389850828586e-07, "loss": 0.0003, "num_input_tokens_seen": 76233576, "step": 113110 }, { "epoch": 2.763418268878411, "grad_norm": 0.08260912448167801, "learning_rate": 9.906537111742167e-07, "loss": 0.0006, "num_input_tokens_seen": 76237096, "step": 113115 }, { "epoch": 2.7635404197102584, "grad_norm": 0.03758857026696205, "learning_rate": 9.905684373335436e-07, "loss": 0.0524, "num_input_tokens_seen": 76240616, "step": 113120 }, { "epoch": 2.763662570542105, "grad_norm": 0.22792379558086395, "learning_rate": 9.90483163561459e-07, "loss": 0.0543, "num_input_tokens_seen": 76243496, "step": 113125 }, { "epoch": 2.763784721373953, "grad_norm": 10.247224807739258, "learning_rate": 9.90397889858584e-07, "loss": 0.0777, "num_input_tokens_seen": 76247144, "step": 113130 }, { "epoch": 2.7639068722057996, "grad_norm": 0.026630444452166557, "learning_rate": 9.903126162255379e-07, "loss": 0.0001, "num_input_tokens_seen": 76250280, "step": 113135 }, { "epoch": 2.7640290230376467, "grad_norm": 21.695283889770508, "learning_rate": 9.902273426629406e-07, "loss": 0.1067, "num_input_tokens_seen": 76253608, "step": 113140 }, { "epoch": 2.764151173869494, "grad_norm": 0.09523158520460129, "learning_rate": 9.901420691714135e-07, "loss": 0.0005, "num_input_tokens_seen": 76257640, "step": 113145 }, { "epoch": 2.764273324701341, "grad_norm": 0.2219829559326172, "learning_rate": 9.900567957515752e-07, "loss": 0.0003, "num_input_tokens_seen": 76261096, "step": 113150 }, { "epoch": 2.7643954755331883, "grad_norm": 0.3712293207645416, "learning_rate": 9.89971522404047e-07, "loss": 0.0451, "num_input_tokens_seen": 76264552, "step": 113155 }, { "epoch": 2.7645176263650355, "grad_norm": 0.12669548392295837, "learning_rate": 9.898862491294483e-07, "loss": 0.0003, "num_input_tokens_seen": 76267880, "step": 113160 }, { "epoch": 2.7646397771968827, "grad_norm": 0.03212711215019226, "learning_rate": 9.898009759283999e-07, "loss": 0.0569, "num_input_tokens_seen": 76271144, "step": 113165 }, { "epoch": 2.76476192802873, "grad_norm": 0.030199235305190086, "learning_rate": 9.89715702801521e-07, "loss": 0.0003, "num_input_tokens_seen": 76274600, "step": 113170 }, { "epoch": 2.764884078860577, "grad_norm": 0.034971047192811966, "learning_rate": 9.896304297494327e-07, "loss": 0.0003, "num_input_tokens_seen": 76277992, "step": 113175 }, { "epoch": 2.7650062296924243, "grad_norm": 0.0070164743810892105, "learning_rate": 9.895451567727544e-07, "loss": 0.0003, "num_input_tokens_seen": 76281128, "step": 113180 }, { "epoch": 2.7651283805242715, "grad_norm": 0.07414111495018005, "learning_rate": 9.894598838721069e-07, "loss": 0.0009, "num_input_tokens_seen": 76284456, "step": 113185 }, { "epoch": 2.7652505313561186, "grad_norm": 0.01181288342922926, "learning_rate": 9.893746110481097e-07, "loss": 0.07, "num_input_tokens_seen": 76287656, "step": 113190 }, { "epoch": 2.765372682187966, "grad_norm": 0.04185613617300987, "learning_rate": 9.892893383013833e-07, "loss": 0.0002, "num_input_tokens_seen": 76290856, "step": 113195 }, { "epoch": 2.765494833019813, "grad_norm": 0.03417033702135086, "learning_rate": 9.89204065632548e-07, "loss": 0.1193, "num_input_tokens_seen": 76293992, "step": 113200 }, { "epoch": 2.76561698385166, "grad_norm": 0.003816602984443307, "learning_rate": 9.89118793042223e-07, "loss": 0.0896, "num_input_tokens_seen": 76297128, "step": 113205 }, { "epoch": 2.765739134683507, "grad_norm": 0.16992871463298798, "learning_rate": 9.890335205310291e-07, "loss": 0.0004, "num_input_tokens_seen": 76300648, "step": 113210 }, { "epoch": 2.7658612855153546, "grad_norm": 0.0317666195333004, "learning_rate": 9.88948248099587e-07, "loss": 0.0428, "num_input_tokens_seen": 76303656, "step": 113215 }, { "epoch": 2.7659834363472013, "grad_norm": 0.01543817576020956, "learning_rate": 9.888629757485156e-07, "loss": 0.0539, "num_input_tokens_seen": 76307560, "step": 113220 }, { "epoch": 2.7661055871790485, "grad_norm": 0.03953710198402405, "learning_rate": 9.88777703478436e-07, "loss": 0.0398, "num_input_tokens_seen": 76311336, "step": 113225 }, { "epoch": 2.7662277380108957, "grad_norm": 0.045606087893247604, "learning_rate": 9.886924312899679e-07, "loss": 0.0003, "num_input_tokens_seen": 76314856, "step": 113230 }, { "epoch": 2.766349888842743, "grad_norm": 4.622429370880127, "learning_rate": 9.886071591837314e-07, "loss": 0.0458, "num_input_tokens_seen": 76318184, "step": 113235 }, { "epoch": 2.76647203967459, "grad_norm": 0.03922732546925545, "learning_rate": 9.88521887160347e-07, "loss": 0.0004, "num_input_tokens_seen": 76321320, "step": 113240 }, { "epoch": 2.7665941905064373, "grad_norm": 0.027470586821436882, "learning_rate": 9.88436615220434e-07, "loss": 0.0004, "num_input_tokens_seen": 76324648, "step": 113245 }, { "epoch": 2.7667163413382845, "grad_norm": 0.04852357134222984, "learning_rate": 9.883513433646135e-07, "loss": 0.0001, "num_input_tokens_seen": 76327848, "step": 113250 }, { "epoch": 2.7668384921701317, "grad_norm": 0.07106790691614151, "learning_rate": 9.882660715935047e-07, "loss": 0.0001, "num_input_tokens_seen": 76331496, "step": 113255 }, { "epoch": 2.766960643001979, "grad_norm": 31.647287368774414, "learning_rate": 9.881807999077288e-07, "loss": 0.0431, "num_input_tokens_seen": 76334568, "step": 113260 }, { "epoch": 2.767082793833826, "grad_norm": 0.009211227297782898, "learning_rate": 9.880955283079047e-07, "loss": 0.0002, "num_input_tokens_seen": 76337704, "step": 113265 }, { "epoch": 2.7672049446656732, "grad_norm": 0.010737738572061062, "learning_rate": 9.880102567946533e-07, "loss": 0.0003, "num_input_tokens_seen": 76341032, "step": 113270 }, { "epoch": 2.7673270954975204, "grad_norm": 0.05103771388530731, "learning_rate": 9.879249853685949e-07, "loss": 0.1252, "num_input_tokens_seen": 76344232, "step": 113275 }, { "epoch": 2.7674492463293676, "grad_norm": 0.06809987127780914, "learning_rate": 9.878397140303487e-07, "loss": 0.0001, "num_input_tokens_seen": 76348008, "step": 113280 }, { "epoch": 2.767571397161215, "grad_norm": 0.024205448105931282, "learning_rate": 9.877544427805358e-07, "loss": 0.0323, "num_input_tokens_seen": 76351336, "step": 113285 }, { "epoch": 2.767693547993062, "grad_norm": 0.06032427400350571, "learning_rate": 9.876691716197759e-07, "loss": 0.0003, "num_input_tokens_seen": 76354216, "step": 113290 }, { "epoch": 2.7678156988249087, "grad_norm": 0.10438254475593567, "learning_rate": 9.875839005486886e-07, "loss": 0.0585, "num_input_tokens_seen": 76357544, "step": 113295 }, { "epoch": 2.7679378496567564, "grad_norm": 0.061745673418045044, "learning_rate": 9.87498629567895e-07, "loss": 0.0257, "num_input_tokens_seen": 76361256, "step": 113300 }, { "epoch": 2.768060000488603, "grad_norm": 0.004507183562964201, "learning_rate": 9.874133586780145e-07, "loss": 0.0002, "num_input_tokens_seen": 76364520, "step": 113305 }, { "epoch": 2.7681821513204508, "grad_norm": 0.05256107077002525, "learning_rate": 9.873280878796676e-07, "loss": 0.0002, "num_input_tokens_seen": 76367528, "step": 113310 }, { "epoch": 2.7683043021522975, "grad_norm": 0.01807519607245922, "learning_rate": 9.87242817173474e-07, "loss": 0.0002, "num_input_tokens_seen": 76370792, "step": 113315 }, { "epoch": 2.7684264529841447, "grad_norm": 0.13760019838809967, "learning_rate": 9.871575465600546e-07, "loss": 0.0002, "num_input_tokens_seen": 76373736, "step": 113320 }, { "epoch": 2.768548603815992, "grad_norm": 0.4591255486011505, "learning_rate": 9.870722760400285e-07, "loss": 0.0006, "num_input_tokens_seen": 76377128, "step": 113325 }, { "epoch": 2.768670754647839, "grad_norm": 0.02008800208568573, "learning_rate": 9.869870056140163e-07, "loss": 0.0002, "num_input_tokens_seen": 76380968, "step": 113330 }, { "epoch": 2.7687929054796863, "grad_norm": 0.006490596570074558, "learning_rate": 9.869017352826382e-07, "loss": 0.0426, "num_input_tokens_seen": 76384168, "step": 113335 }, { "epoch": 2.7689150563115335, "grad_norm": 0.00938922818750143, "learning_rate": 9.86816465046514e-07, "loss": 0.0001, "num_input_tokens_seen": 76387240, "step": 113340 }, { "epoch": 2.7690372071433806, "grad_norm": 0.01878109946846962, "learning_rate": 9.867311949062644e-07, "loss": 0.0003, "num_input_tokens_seen": 76390504, "step": 113345 }, { "epoch": 2.769159357975228, "grad_norm": 0.10273200273513794, "learning_rate": 9.86645924862509e-07, "loss": 0.0004, "num_input_tokens_seen": 76393576, "step": 113350 }, { "epoch": 2.769281508807075, "grad_norm": 0.007617755327373743, "learning_rate": 9.865606549158681e-07, "loss": 0.0725, "num_input_tokens_seen": 76397032, "step": 113355 }, { "epoch": 2.769403659638922, "grad_norm": 0.0022685937583446503, "learning_rate": 9.864753850669613e-07, "loss": 0.0001, "num_input_tokens_seen": 76400744, "step": 113360 }, { "epoch": 2.7695258104707694, "grad_norm": 0.01068595889955759, "learning_rate": 9.863901153164094e-07, "loss": 0.0001, "num_input_tokens_seen": 76404136, "step": 113365 }, { "epoch": 2.7696479613026166, "grad_norm": 23.14859390258789, "learning_rate": 9.863048456648324e-07, "loss": 0.1539, "num_input_tokens_seen": 76407528, "step": 113370 }, { "epoch": 2.769770112134464, "grad_norm": 0.012229954823851585, "learning_rate": 9.862195761128498e-07, "loss": 0.0002, "num_input_tokens_seen": 76411176, "step": 113375 }, { "epoch": 2.7698922629663105, "grad_norm": 16.60312271118164, "learning_rate": 9.861343066610829e-07, "loss": 0.0967, "num_input_tokens_seen": 76414184, "step": 113380 }, { "epoch": 2.770014413798158, "grad_norm": 0.0237701628357172, "learning_rate": 9.860490373101503e-07, "loss": 0.0002, "num_input_tokens_seen": 76417128, "step": 113385 }, { "epoch": 2.770136564630005, "grad_norm": 0.19100302457809448, "learning_rate": 9.859637680606732e-07, "loss": 0.0002, "num_input_tokens_seen": 76420456, "step": 113390 }, { "epoch": 2.7702587154618525, "grad_norm": 0.2962363064289093, "learning_rate": 9.858784989132717e-07, "loss": 0.0728, "num_input_tokens_seen": 76423784, "step": 113395 }, { "epoch": 2.7703808662936993, "grad_norm": 0.1180248036980629, "learning_rate": 9.857932298685648e-07, "loss": 0.0002, "num_input_tokens_seen": 76427624, "step": 113400 }, { "epoch": 2.7705030171255465, "grad_norm": 0.014938372187316418, "learning_rate": 9.85707960927174e-07, "loss": 0.0504, "num_input_tokens_seen": 76431272, "step": 113405 }, { "epoch": 2.7706251679573937, "grad_norm": 16.964563369750977, "learning_rate": 9.856226920897182e-07, "loss": 0.0524, "num_input_tokens_seen": 76434856, "step": 113410 }, { "epoch": 2.770747318789241, "grad_norm": 0.04503167048096657, "learning_rate": 9.855374233568186e-07, "loss": 0.0002, "num_input_tokens_seen": 76437992, "step": 113415 }, { "epoch": 2.770869469621088, "grad_norm": 0.04523736983537674, "learning_rate": 9.854521547290942e-07, "loss": 0.0984, "num_input_tokens_seen": 76441448, "step": 113420 }, { "epoch": 2.7709916204529352, "grad_norm": 2.487971544265747, "learning_rate": 9.853668862071657e-07, "loss": 0.0019, "num_input_tokens_seen": 76444904, "step": 113425 }, { "epoch": 2.7711137712847824, "grad_norm": 0.17408542335033417, "learning_rate": 9.852816177916535e-07, "loss": 0.0006, "num_input_tokens_seen": 76448040, "step": 113430 }, { "epoch": 2.7712359221166296, "grad_norm": 0.1603502482175827, "learning_rate": 9.851963494831771e-07, "loss": 0.0007, "num_input_tokens_seen": 76451304, "step": 113435 }, { "epoch": 2.771358072948477, "grad_norm": 0.011682548560202122, "learning_rate": 9.851110812823571e-07, "loss": 0.0018, "num_input_tokens_seen": 76454632, "step": 113440 }, { "epoch": 2.771480223780324, "grad_norm": 2.488227128982544, "learning_rate": 9.850258131898133e-07, "loss": 0.0006, "num_input_tokens_seen": 76457896, "step": 113445 }, { "epoch": 2.771602374612171, "grad_norm": 0.01607205905020237, "learning_rate": 9.849405452061654e-07, "loss": 0.0169, "num_input_tokens_seen": 76461032, "step": 113450 }, { "epoch": 2.7717245254440184, "grad_norm": 0.002345475135371089, "learning_rate": 9.848552773320345e-07, "loss": 0.0567, "num_input_tokens_seen": 76464360, "step": 113455 }, { "epoch": 2.7718466762758656, "grad_norm": 0.015486088581383228, "learning_rate": 9.847700095680394e-07, "loss": 0.0002, "num_input_tokens_seen": 76467624, "step": 113460 }, { "epoch": 2.7719688271077128, "grad_norm": 0.03439730033278465, "learning_rate": 9.846847419148016e-07, "loss": 0.0483, "num_input_tokens_seen": 76470952, "step": 113465 }, { "epoch": 2.77209097793956, "grad_norm": 30.77482032775879, "learning_rate": 9.8459947437294e-07, "loss": 0.0459, "num_input_tokens_seen": 76474280, "step": 113470 }, { "epoch": 2.7722131287714067, "grad_norm": 0.06692664325237274, "learning_rate": 9.845142069430754e-07, "loss": 0.1802, "num_input_tokens_seen": 76477608, "step": 113475 }, { "epoch": 2.7723352796032543, "grad_norm": 25.74443244934082, "learning_rate": 9.844289396258272e-07, "loss": 0.0344, "num_input_tokens_seen": 76480872, "step": 113480 }, { "epoch": 2.772457430435101, "grad_norm": 0.021210841834545135, "learning_rate": 9.843436724218163e-07, "loss": 0.0002, "num_input_tokens_seen": 76484200, "step": 113485 }, { "epoch": 2.7725795812669487, "grad_norm": 0.00724679185077548, "learning_rate": 9.842584053316626e-07, "loss": 0.0005, "num_input_tokens_seen": 76487592, "step": 113490 }, { "epoch": 2.7727017320987954, "grad_norm": 0.018039369955658913, "learning_rate": 9.841731383559857e-07, "loss": 0.0005, "num_input_tokens_seen": 76490984, "step": 113495 }, { "epoch": 2.7728238829306426, "grad_norm": 0.016924476251006126, "learning_rate": 9.840878714954063e-07, "loss": 0.0002, "num_input_tokens_seen": 76494120, "step": 113500 }, { "epoch": 2.77294603376249, "grad_norm": 67.77898406982422, "learning_rate": 9.840026047505438e-07, "loss": 0.0468, "num_input_tokens_seen": 76497384, "step": 113505 }, { "epoch": 2.773068184594337, "grad_norm": 0.033945340663194656, "learning_rate": 9.839173381220191e-07, "loss": 0.0001, "num_input_tokens_seen": 76500840, "step": 113510 }, { "epoch": 2.773190335426184, "grad_norm": 0.02062019146978855, "learning_rate": 9.838320716104515e-07, "loss": 0.151, "num_input_tokens_seen": 76504744, "step": 113515 }, { "epoch": 2.7733124862580314, "grad_norm": 0.00838180910795927, "learning_rate": 9.837468052164612e-07, "loss": 0.0003, "num_input_tokens_seen": 76507624, "step": 113520 }, { "epoch": 2.7734346370898786, "grad_norm": 0.10621123760938644, "learning_rate": 9.83661538940669e-07, "loss": 0.0006, "num_input_tokens_seen": 76510952, "step": 113525 }, { "epoch": 2.7735567879217258, "grad_norm": 0.006988401524722576, "learning_rate": 9.83576272783694e-07, "loss": 0.0001, "num_input_tokens_seen": 76514216, "step": 113530 }, { "epoch": 2.773678938753573, "grad_norm": 0.00296860933303833, "learning_rate": 9.834910067461574e-07, "loss": 0.0008, "num_input_tokens_seen": 76517672, "step": 113535 }, { "epoch": 2.77380108958542, "grad_norm": 0.03899238258600235, "learning_rate": 9.834057408286782e-07, "loss": 0.053, "num_input_tokens_seen": 76521640, "step": 113540 }, { "epoch": 2.7739232404172673, "grad_norm": 0.1324678212404251, "learning_rate": 9.83320475031877e-07, "loss": 0.0001, "num_input_tokens_seen": 76525480, "step": 113545 }, { "epoch": 2.7740453912491145, "grad_norm": 0.02307271398603916, "learning_rate": 9.83235209356374e-07, "loss": 0.0002, "num_input_tokens_seen": 76529000, "step": 113550 }, { "epoch": 2.7741675420809617, "grad_norm": 0.0023544637951999903, "learning_rate": 9.831499438027888e-07, "loss": 0.141, "num_input_tokens_seen": 76532584, "step": 113555 }, { "epoch": 2.7742896929128085, "grad_norm": 0.03203720226883888, "learning_rate": 9.83064678371742e-07, "loss": 0.0664, "num_input_tokens_seen": 76535912, "step": 113560 }, { "epoch": 2.774411843744656, "grad_norm": 0.5081039071083069, "learning_rate": 9.82979413063853e-07, "loss": 0.0002, "num_input_tokens_seen": 76539560, "step": 113565 }, { "epoch": 2.774533994576503, "grad_norm": 0.0042310431599617004, "learning_rate": 9.828941478797428e-07, "loss": 0.0001, "num_input_tokens_seen": 76543144, "step": 113570 }, { "epoch": 2.7746561454083505, "grad_norm": 0.03580312058329582, "learning_rate": 9.828088828200303e-07, "loss": 0.0469, "num_input_tokens_seen": 76547368, "step": 113575 }, { "epoch": 2.7747782962401972, "grad_norm": 0.06093088537454605, "learning_rate": 9.827236178853366e-07, "loss": 0.0001, "num_input_tokens_seen": 76550632, "step": 113580 }, { "epoch": 2.7749004470720444, "grad_norm": 0.08521981537342072, "learning_rate": 9.826383530762817e-07, "loss": 0.115, "num_input_tokens_seen": 76554152, "step": 113585 }, { "epoch": 2.7750225979038916, "grad_norm": 0.011083260178565979, "learning_rate": 9.825530883934847e-07, "loss": 0.0001, "num_input_tokens_seen": 76557352, "step": 113590 }, { "epoch": 2.775144748735739, "grad_norm": 0.0052310023456811905, "learning_rate": 9.82467823837567e-07, "loss": 0.0003, "num_input_tokens_seen": 76560808, "step": 113595 }, { "epoch": 2.775266899567586, "grad_norm": 0.016578521579504013, "learning_rate": 9.823825594091477e-07, "loss": 0.0767, "num_input_tokens_seen": 76564072, "step": 113600 }, { "epoch": 2.775389050399433, "grad_norm": 0.04751870781183243, "learning_rate": 9.822972951088473e-07, "loss": 0.0003, "num_input_tokens_seen": 76567144, "step": 113605 }, { "epoch": 2.7755112012312804, "grad_norm": 0.05532370135188103, "learning_rate": 9.822120309372855e-07, "loss": 0.0002, "num_input_tokens_seen": 76570472, "step": 113610 }, { "epoch": 2.7756333520631276, "grad_norm": 0.03185422718524933, "learning_rate": 9.821267668950824e-07, "loss": 0.0002, "num_input_tokens_seen": 76573672, "step": 113615 }, { "epoch": 2.7757555028949747, "grad_norm": 0.07462109625339508, "learning_rate": 9.820415029828588e-07, "loss": 0.0003, "num_input_tokens_seen": 76577000, "step": 113620 }, { "epoch": 2.775877653726822, "grad_norm": 296.1712341308594, "learning_rate": 9.81956239201234e-07, "loss": 0.078, "num_input_tokens_seen": 76580584, "step": 113625 }, { "epoch": 2.775999804558669, "grad_norm": 0.011432225815951824, "learning_rate": 9.818709755508284e-07, "loss": 0.0664, "num_input_tokens_seen": 76583848, "step": 113630 }, { "epoch": 2.7761219553905163, "grad_norm": 0.08162835985422134, "learning_rate": 9.817857120322615e-07, "loss": 0.0001, "num_input_tokens_seen": 76587240, "step": 113635 }, { "epoch": 2.7762441062223635, "grad_norm": 0.11095693707466125, "learning_rate": 9.81700448646154e-07, "loss": 0.0002, "num_input_tokens_seen": 76590632, "step": 113640 }, { "epoch": 2.7763662570542107, "grad_norm": 155.89930725097656, "learning_rate": 9.81615185393126e-07, "loss": 0.058, "num_input_tokens_seen": 76594536, "step": 113645 }, { "epoch": 2.776488407886058, "grad_norm": 0.017484327778220177, "learning_rate": 9.815299222737972e-07, "loss": 0.0001, "num_input_tokens_seen": 76598184, "step": 113650 }, { "epoch": 2.7766105587179046, "grad_norm": 0.4224332869052887, "learning_rate": 9.814446592887878e-07, "loss": 0.0623, "num_input_tokens_seen": 76601128, "step": 113655 }, { "epoch": 2.7767327095497523, "grad_norm": 0.009731501340866089, "learning_rate": 9.813593964387177e-07, "loss": 0.0008, "num_input_tokens_seen": 76604968, "step": 113660 }, { "epoch": 2.776854860381599, "grad_norm": 0.027277182787656784, "learning_rate": 9.812741337242074e-07, "loss": 0.0001, "num_input_tokens_seen": 76608680, "step": 113665 }, { "epoch": 2.776977011213446, "grad_norm": 0.004735254216939211, "learning_rate": 9.811888711458762e-07, "loss": 0.0472, "num_input_tokens_seen": 76612072, "step": 113670 }, { "epoch": 2.7770991620452934, "grad_norm": 0.03370065987110138, "learning_rate": 9.811036087043445e-07, "loss": 0.0003, "num_input_tokens_seen": 76615592, "step": 113675 }, { "epoch": 2.7772213128771406, "grad_norm": 0.3166724145412445, "learning_rate": 9.81018346400233e-07, "loss": 0.0837, "num_input_tokens_seen": 76619176, "step": 113680 }, { "epoch": 2.7773434637089878, "grad_norm": 0.03397630155086517, "learning_rate": 9.809330842341607e-07, "loss": 0.0002, "num_input_tokens_seen": 76623080, "step": 113685 }, { "epoch": 2.777465614540835, "grad_norm": 0.015391331166028976, "learning_rate": 9.808478222067487e-07, "loss": 0.0598, "num_input_tokens_seen": 76626344, "step": 113690 }, { "epoch": 2.777587765372682, "grad_norm": 0.2570863366127014, "learning_rate": 9.807625603186158e-07, "loss": 0.0004, "num_input_tokens_seen": 76629736, "step": 113695 }, { "epoch": 2.7777099162045293, "grad_norm": 0.017918601632118225, "learning_rate": 9.80677298570383e-07, "loss": 0.0003, "num_input_tokens_seen": 76632936, "step": 113700 }, { "epoch": 2.7778320670363765, "grad_norm": 0.004007379058748484, "learning_rate": 9.805920369626706e-07, "loss": 0.0001, "num_input_tokens_seen": 76636264, "step": 113705 }, { "epoch": 2.7779542178682237, "grad_norm": 806.7548217773438, "learning_rate": 9.805067754960973e-07, "loss": 0.0168, "num_input_tokens_seen": 76639336, "step": 113710 }, { "epoch": 2.778076368700071, "grad_norm": 0.07259964942932129, "learning_rate": 9.804215141712848e-07, "loss": 0.0004, "num_input_tokens_seen": 76642664, "step": 113715 }, { "epoch": 2.778198519531918, "grad_norm": 0.005305212456732988, "learning_rate": 9.803362529888516e-07, "loss": 0.0001, "num_input_tokens_seen": 76645864, "step": 113720 }, { "epoch": 2.7783206703637653, "grad_norm": 0.1281394064426422, "learning_rate": 9.80250991949419e-07, "loss": 0.0001, "num_input_tokens_seen": 76649256, "step": 113725 }, { "epoch": 2.7784428211956125, "grad_norm": 0.01599547639489174, "learning_rate": 9.80165731053606e-07, "loss": 0.1172, "num_input_tokens_seen": 76652456, "step": 113730 }, { "epoch": 2.7785649720274597, "grad_norm": 0.38258928060531616, "learning_rate": 9.800804703020331e-07, "loss": 0.0001, "num_input_tokens_seen": 76656168, "step": 113735 }, { "epoch": 2.7786871228593064, "grad_norm": 0.048715341836214066, "learning_rate": 9.79995209695321e-07, "loss": 0.0002, "num_input_tokens_seen": 76660584, "step": 113740 }, { "epoch": 2.778809273691154, "grad_norm": 18.242271423339844, "learning_rate": 9.799099492340885e-07, "loss": 0.1553, "num_input_tokens_seen": 76663912, "step": 113745 }, { "epoch": 2.778931424523001, "grad_norm": 0.2122068852186203, "learning_rate": 9.798246889189567e-07, "loss": 0.0001, "num_input_tokens_seen": 76667048, "step": 113750 }, { "epoch": 2.7790535753548484, "grad_norm": 44.100433349609375, "learning_rate": 9.79739428750545e-07, "loss": 0.0978, "num_input_tokens_seen": 76670568, "step": 113755 }, { "epoch": 2.779175726186695, "grad_norm": 0.013350458815693855, "learning_rate": 9.796541687294738e-07, "loss": 0.0001, "num_input_tokens_seen": 76674024, "step": 113760 }, { "epoch": 2.7792978770185424, "grad_norm": 0.00019100087229162455, "learning_rate": 9.795689088563626e-07, "loss": 0.0366, "num_input_tokens_seen": 76677288, "step": 113765 }, { "epoch": 2.7794200278503896, "grad_norm": 0.0020818603225052357, "learning_rate": 9.79483649131832e-07, "loss": 0.0404, "num_input_tokens_seen": 76681000, "step": 113770 }, { "epoch": 2.7795421786822367, "grad_norm": 0.011318969540297985, "learning_rate": 9.79398389556502e-07, "loss": 0.0566, "num_input_tokens_seen": 76684200, "step": 113775 }, { "epoch": 2.779664329514084, "grad_norm": 0.10724736005067825, "learning_rate": 9.79313130130992e-07, "loss": 0.0003, "num_input_tokens_seen": 76687272, "step": 113780 }, { "epoch": 2.779786480345931, "grad_norm": 0.0018801460973918438, "learning_rate": 9.79227870855923e-07, "loss": 0.0536, "num_input_tokens_seen": 76690280, "step": 113785 }, { "epoch": 2.7799086311777783, "grad_norm": 0.11404317617416382, "learning_rate": 9.79142611731914e-07, "loss": 0.0336, "num_input_tokens_seen": 76693736, "step": 113790 }, { "epoch": 2.7800307820096255, "grad_norm": 0.45159173011779785, "learning_rate": 9.790573527595856e-07, "loss": 0.0004, "num_input_tokens_seen": 76697064, "step": 113795 }, { "epoch": 2.7801529328414727, "grad_norm": 0.07708978652954102, "learning_rate": 9.789720939395581e-07, "loss": 0.0004, "num_input_tokens_seen": 76700264, "step": 113800 }, { "epoch": 2.78027508367332, "grad_norm": 0.009074263274669647, "learning_rate": 9.78886835272451e-07, "loss": 0.0105, "num_input_tokens_seen": 76704040, "step": 113805 }, { "epoch": 2.780397234505167, "grad_norm": 0.029474390670657158, "learning_rate": 9.788015767588846e-07, "loss": 0.0003, "num_input_tokens_seen": 76707752, "step": 113810 }, { "epoch": 2.7805193853370143, "grad_norm": 0.17673642933368683, "learning_rate": 9.787163183994787e-07, "loss": 0.0331, "num_input_tokens_seen": 76711400, "step": 113815 }, { "epoch": 2.7806415361688614, "grad_norm": 0.039392951875925064, "learning_rate": 9.786310601948538e-07, "loss": 0.0001, "num_input_tokens_seen": 76715688, "step": 113820 }, { "epoch": 2.7807636870007086, "grad_norm": 0.06219044327735901, "learning_rate": 9.78545802145629e-07, "loss": 0.0429, "num_input_tokens_seen": 76719272, "step": 113825 }, { "epoch": 2.780885837832556, "grad_norm": 0.03805144503712654, "learning_rate": 9.784605442524252e-07, "loss": 0.0002, "num_input_tokens_seen": 76722728, "step": 113830 }, { "epoch": 2.7810079886644026, "grad_norm": 0.05186213180422783, "learning_rate": 9.783752865158623e-07, "loss": 0.0003, "num_input_tokens_seen": 76725480, "step": 113835 }, { "epoch": 2.78113013949625, "grad_norm": 0.006760579068213701, "learning_rate": 9.782900289365597e-07, "loss": 0.1236, "num_input_tokens_seen": 76729192, "step": 113840 }, { "epoch": 2.781252290328097, "grad_norm": 0.06068001687526703, "learning_rate": 9.782047715151384e-07, "loss": 0.0393, "num_input_tokens_seen": 76732648, "step": 113845 }, { "epoch": 2.781374441159944, "grad_norm": 30.574745178222656, "learning_rate": 9.781195142522175e-07, "loss": 0.0977, "num_input_tokens_seen": 76736104, "step": 113850 }, { "epoch": 2.7814965919917913, "grad_norm": 0.034847330302000046, "learning_rate": 9.780342571484174e-07, "loss": 0.0477, "num_input_tokens_seen": 76739816, "step": 113855 }, { "epoch": 2.7816187428236385, "grad_norm": 0.008041945286095142, "learning_rate": 9.779490002043584e-07, "loss": 0.0001, "num_input_tokens_seen": 76743784, "step": 113860 }, { "epoch": 2.7817408936554857, "grad_norm": 0.008885260671377182, "learning_rate": 9.778637434206595e-07, "loss": 0.0001, "num_input_tokens_seen": 76747496, "step": 113865 }, { "epoch": 2.781863044487333, "grad_norm": 0.2046172320842743, "learning_rate": 9.777784867979422e-07, "loss": 0.0002, "num_input_tokens_seen": 76751080, "step": 113870 }, { "epoch": 2.78198519531918, "grad_norm": 0.033650998026132584, "learning_rate": 9.776932303368252e-07, "loss": 0.0002, "num_input_tokens_seen": 76754408, "step": 113875 }, { "epoch": 2.7821073461510273, "grad_norm": 0.011351230554282665, "learning_rate": 9.776079740379294e-07, "loss": 0.0373, "num_input_tokens_seen": 76757672, "step": 113880 }, { "epoch": 2.7822294969828745, "grad_norm": 0.11378277838230133, "learning_rate": 9.77522717901874e-07, "loss": 0.0002, "num_input_tokens_seen": 76761448, "step": 113885 }, { "epoch": 2.7823516478147217, "grad_norm": 0.06339457631111145, "learning_rate": 9.774374619292796e-07, "loss": 0.0002, "num_input_tokens_seen": 76764904, "step": 113890 }, { "epoch": 2.782473798646569, "grad_norm": 0.0020954750943928957, "learning_rate": 9.773522061207664e-07, "loss": 0.0001, "num_input_tokens_seen": 76768104, "step": 113895 }, { "epoch": 2.782595949478416, "grad_norm": 0.01680098846554756, "learning_rate": 9.772669504769534e-07, "loss": 0.0002, "num_input_tokens_seen": 76771752, "step": 113900 }, { "epoch": 2.7827181003102632, "grad_norm": 0.007107491604983807, "learning_rate": 9.77181694998462e-07, "loss": 0.062, "num_input_tokens_seen": 76775080, "step": 113905 }, { "epoch": 2.7828402511421104, "grad_norm": 0.0028211900498718023, "learning_rate": 9.77096439685911e-07, "loss": 0.0001, "num_input_tokens_seen": 76778408, "step": 113910 }, { "epoch": 2.7829624019739576, "grad_norm": 0.037216681987047195, "learning_rate": 9.770111845399209e-07, "loss": 0.0615, "num_input_tokens_seen": 76781544, "step": 113915 }, { "epoch": 2.7830845528058044, "grad_norm": 0.05089016631245613, "learning_rate": 9.769259295611117e-07, "loss": 0.0005, "num_input_tokens_seen": 76784872, "step": 113920 }, { "epoch": 2.783206703637652, "grad_norm": 0.051179852336645126, "learning_rate": 9.768406747501032e-07, "loss": 0.0311, "num_input_tokens_seen": 76788392, "step": 113925 }, { "epoch": 2.7833288544694987, "grad_norm": 0.02378770150244236, "learning_rate": 9.76755420107516e-07, "loss": 0.0004, "num_input_tokens_seen": 76791464, "step": 113930 }, { "epoch": 2.7834510053013464, "grad_norm": 0.04368620738387108, "learning_rate": 9.76670165633969e-07, "loss": 0.0001, "num_input_tokens_seen": 76794664, "step": 113935 }, { "epoch": 2.783573156133193, "grad_norm": 0.03758257254958153, "learning_rate": 9.765849113300833e-07, "loss": 0.0498, "num_input_tokens_seen": 76797416, "step": 113940 }, { "epoch": 2.7836953069650403, "grad_norm": 0.10067665576934814, "learning_rate": 9.76499657196478e-07, "loss": 0.0002, "num_input_tokens_seen": 76800680, "step": 113945 }, { "epoch": 2.7838174577968875, "grad_norm": 0.0006436361582018435, "learning_rate": 9.764144032337738e-07, "loss": 0.0001, "num_input_tokens_seen": 76804008, "step": 113950 }, { "epoch": 2.7839396086287347, "grad_norm": 0.003465403337031603, "learning_rate": 9.763291494425904e-07, "loss": 0.0, "num_input_tokens_seen": 76807336, "step": 113955 }, { "epoch": 2.784061759460582, "grad_norm": 0.02004861645400524, "learning_rate": 9.762438958235479e-07, "loss": 0.0464, "num_input_tokens_seen": 76810664, "step": 113960 }, { "epoch": 2.784183910292429, "grad_norm": 0.04684687405824661, "learning_rate": 9.76158642377266e-07, "loss": 0.0434, "num_input_tokens_seen": 76813928, "step": 113965 }, { "epoch": 2.7843060611242763, "grad_norm": 0.0895547866821289, "learning_rate": 9.760733891043648e-07, "loss": 0.0327, "num_input_tokens_seen": 76816872, "step": 113970 }, { "epoch": 2.7844282119561234, "grad_norm": 0.0200139582157135, "learning_rate": 9.759881360054646e-07, "loss": 0.0003, "num_input_tokens_seen": 76819880, "step": 113975 }, { "epoch": 2.7845503627879706, "grad_norm": 20.02742576599121, "learning_rate": 9.75902883081185e-07, "loss": 0.0751, "num_input_tokens_seen": 76822888, "step": 113980 }, { "epoch": 2.784672513619818, "grad_norm": 0.012123535387217999, "learning_rate": 9.758176303321458e-07, "loss": 0.0019, "num_input_tokens_seen": 76826216, "step": 113985 }, { "epoch": 2.784794664451665, "grad_norm": 17.554426193237305, "learning_rate": 9.757323777589678e-07, "loss": 0.1272, "num_input_tokens_seen": 76829608, "step": 113990 }, { "epoch": 2.784916815283512, "grad_norm": 0.014771767891943455, "learning_rate": 9.7564712536227e-07, "loss": 0.0002, "num_input_tokens_seen": 76833256, "step": 113995 }, { "epoch": 2.7850389661153594, "grad_norm": 0.06855263561010361, "learning_rate": 9.755618731426735e-07, "loss": 0.0843, "num_input_tokens_seen": 76836840, "step": 114000 }, { "epoch": 2.785161116947206, "grad_norm": 0.05960407853126526, "learning_rate": 9.754766211007972e-07, "loss": 0.0657, "num_input_tokens_seen": 76839976, "step": 114005 }, { "epoch": 2.7852832677790538, "grad_norm": 0.024061383679509163, "learning_rate": 9.753913692372615e-07, "loss": 0.049, "num_input_tokens_seen": 76843112, "step": 114010 }, { "epoch": 2.7854054186109005, "grad_norm": 0.060720350593328476, "learning_rate": 9.753061175526867e-07, "loss": 0.0002, "num_input_tokens_seen": 76846824, "step": 114015 }, { "epoch": 2.785527569442748, "grad_norm": 0.14546024799346924, "learning_rate": 9.752208660476919e-07, "loss": 0.0001, "num_input_tokens_seen": 76850472, "step": 114020 }, { "epoch": 2.785649720274595, "grad_norm": 0.4514792859554291, "learning_rate": 9.751356147228982e-07, "loss": 0.0228, "num_input_tokens_seen": 76854568, "step": 114025 }, { "epoch": 2.785771871106442, "grad_norm": 21.797420501708984, "learning_rate": 9.750503635789246e-07, "loss": 0.0465, "num_input_tokens_seen": 76857832, "step": 114030 }, { "epoch": 2.7858940219382893, "grad_norm": 46.9689826965332, "learning_rate": 9.749651126163919e-07, "loss": 0.0738, "num_input_tokens_seen": 76861288, "step": 114035 }, { "epoch": 2.7860161727701365, "grad_norm": 0.06661438196897507, "learning_rate": 9.74879861835919e-07, "loss": 0.0677, "num_input_tokens_seen": 76864360, "step": 114040 }, { "epoch": 2.7861383236019837, "grad_norm": 0.31695249676704407, "learning_rate": 9.747946112381266e-07, "loss": 0.0312, "num_input_tokens_seen": 76867688, "step": 114045 }, { "epoch": 2.786260474433831, "grad_norm": 0.10348512977361679, "learning_rate": 9.747093608236352e-07, "loss": 0.0002, "num_input_tokens_seen": 76871272, "step": 114050 }, { "epoch": 2.786382625265678, "grad_norm": 0.31157082319259644, "learning_rate": 9.746241105930634e-07, "loss": 0.0739, "num_input_tokens_seen": 76874728, "step": 114055 }, { "epoch": 2.7865047760975252, "grad_norm": 0.11700203269720078, "learning_rate": 9.745388605470324e-07, "loss": 0.0991, "num_input_tokens_seen": 76877736, "step": 114060 }, { "epoch": 2.7866269269293724, "grad_norm": 0.0033500140998512506, "learning_rate": 9.744536106861615e-07, "loss": 0.0004, "num_input_tokens_seen": 76880936, "step": 114065 }, { "epoch": 2.7867490777612196, "grad_norm": 37.0404052734375, "learning_rate": 9.74368361011071e-07, "loss": 0.0363, "num_input_tokens_seen": 76884840, "step": 114070 }, { "epoch": 2.786871228593067, "grad_norm": 0.07285968214273453, "learning_rate": 9.742831115223802e-07, "loss": 0.0005, "num_input_tokens_seen": 76888104, "step": 114075 }, { "epoch": 2.786993379424914, "grad_norm": 0.009817846119403839, "learning_rate": 9.741978622207097e-07, "loss": 0.0001, "num_input_tokens_seen": 76891560, "step": 114080 }, { "epoch": 2.787115530256761, "grad_norm": 26.087810516357422, "learning_rate": 9.741126131066796e-07, "loss": 0.0989, "num_input_tokens_seen": 76894888, "step": 114085 }, { "epoch": 2.7872376810886084, "grad_norm": 0.24341338872909546, "learning_rate": 9.740273641809092e-07, "loss": 0.0002, "num_input_tokens_seen": 76898280, "step": 114090 }, { "epoch": 2.7873598319204556, "grad_norm": 0.0460708923637867, "learning_rate": 9.739421154440192e-07, "loss": 0.0003, "num_input_tokens_seen": 76901608, "step": 114095 }, { "epoch": 2.7874819827523023, "grad_norm": 0.03550918772816658, "learning_rate": 9.738568668966286e-07, "loss": 0.0602, "num_input_tokens_seen": 76905064, "step": 114100 }, { "epoch": 2.78760413358415, "grad_norm": 0.024629903957247734, "learning_rate": 9.737716185393582e-07, "loss": 0.0537, "num_input_tokens_seen": 76908392, "step": 114105 }, { "epoch": 2.7877262844159967, "grad_norm": 0.1366603970527649, "learning_rate": 9.736863703728275e-07, "loss": 0.0227, "num_input_tokens_seen": 76911976, "step": 114110 }, { "epoch": 2.7878484352478443, "grad_norm": 0.02549423649907112, "learning_rate": 9.736011223976567e-07, "loss": 0.0002, "num_input_tokens_seen": 76915304, "step": 114115 }, { "epoch": 2.787970586079691, "grad_norm": 0.07304327934980392, "learning_rate": 9.735158746144657e-07, "loss": 0.0002, "num_input_tokens_seen": 76919016, "step": 114120 }, { "epoch": 2.7880927369115382, "grad_norm": 0.07051575928926468, "learning_rate": 9.734306270238744e-07, "loss": 0.0003, "num_input_tokens_seen": 76921896, "step": 114125 }, { "epoch": 2.7882148877433854, "grad_norm": 0.0368647538125515, "learning_rate": 9.733453796265029e-07, "loss": 0.0425, "num_input_tokens_seen": 76925224, "step": 114130 }, { "epoch": 2.7883370385752326, "grad_norm": 0.2852783799171448, "learning_rate": 9.732601324229704e-07, "loss": 0.0003, "num_input_tokens_seen": 76928808, "step": 114135 }, { "epoch": 2.78845918940708, "grad_norm": 0.014169542118906975, "learning_rate": 9.731748854138977e-07, "loss": 0.0005, "num_input_tokens_seen": 76932200, "step": 114140 }, { "epoch": 2.788581340238927, "grad_norm": 0.03942190483212471, "learning_rate": 9.730896385999045e-07, "loss": 0.0001, "num_input_tokens_seen": 76935528, "step": 114145 }, { "epoch": 2.788703491070774, "grad_norm": 0.01825009100139141, "learning_rate": 9.730043919816104e-07, "loss": 0.0325, "num_input_tokens_seen": 76938536, "step": 114150 }, { "epoch": 2.7888256419026214, "grad_norm": 0.017786700278520584, "learning_rate": 9.72919145559636e-07, "loss": 0.0001, "num_input_tokens_seen": 76942248, "step": 114155 }, { "epoch": 2.7889477927344686, "grad_norm": 0.05002596229314804, "learning_rate": 9.728338993346007e-07, "loss": 0.0003, "num_input_tokens_seen": 76945448, "step": 114160 }, { "epoch": 2.7890699435663158, "grad_norm": 0.05178292095661163, "learning_rate": 9.727486533071248e-07, "loss": 0.0501, "num_input_tokens_seen": 76949160, "step": 114165 }, { "epoch": 2.789192094398163, "grad_norm": 0.027847200632095337, "learning_rate": 9.72663407477828e-07, "loss": 0.128, "num_input_tokens_seen": 76952232, "step": 114170 }, { "epoch": 2.78931424523001, "grad_norm": 0.3631141781806946, "learning_rate": 9.7257816184733e-07, "loss": 0.0003, "num_input_tokens_seen": 76955368, "step": 114175 }, { "epoch": 2.7894363960618573, "grad_norm": 0.014581491239368916, "learning_rate": 9.724929164162512e-07, "loss": 0.0002, "num_input_tokens_seen": 76958568, "step": 114180 }, { "epoch": 2.789558546893704, "grad_norm": 0.004223099909722805, "learning_rate": 9.72407671185211e-07, "loss": 0.0391, "num_input_tokens_seen": 76962344, "step": 114185 }, { "epoch": 2.7896806977255517, "grad_norm": 0.13153673708438873, "learning_rate": 9.7232242615483e-07, "loss": 0.0002, "num_input_tokens_seen": 76965864, "step": 114190 }, { "epoch": 2.7898028485573985, "grad_norm": 0.011691474355757236, "learning_rate": 9.722371813257274e-07, "loss": 0.1431, "num_input_tokens_seen": 76969064, "step": 114195 }, { "epoch": 2.789924999389246, "grad_norm": 0.09665603190660477, "learning_rate": 9.721519366985234e-07, "loss": 0.0559, "num_input_tokens_seen": 76972840, "step": 114200 }, { "epoch": 2.790047150221093, "grad_norm": 14.975530624389648, "learning_rate": 9.720666922738386e-07, "loss": 0.0503, "num_input_tokens_seen": 76975976, "step": 114205 }, { "epoch": 2.79016930105294, "grad_norm": 0.27885282039642334, "learning_rate": 9.719814480522918e-07, "loss": 0.0392, "num_input_tokens_seen": 76979496, "step": 114210 }, { "epoch": 2.790291451884787, "grad_norm": 0.14521782100200653, "learning_rate": 9.718962040345038e-07, "loss": 0.0003, "num_input_tokens_seen": 76982952, "step": 114215 }, { "epoch": 2.7904136027166344, "grad_norm": 0.1501213163137436, "learning_rate": 9.718109602210941e-07, "loss": 0.0005, "num_input_tokens_seen": 76986408, "step": 114220 }, { "epoch": 2.7905357535484816, "grad_norm": 0.28133001923561096, "learning_rate": 9.717257166126827e-07, "loss": 0.0251, "num_input_tokens_seen": 76989480, "step": 114225 }, { "epoch": 2.790657904380329, "grad_norm": 0.08251863718032837, "learning_rate": 9.716404732098894e-07, "loss": 0.1115, "num_input_tokens_seen": 76993064, "step": 114230 }, { "epoch": 2.790780055212176, "grad_norm": 0.23545780777931213, "learning_rate": 9.71555230013334e-07, "loss": 0.0002, "num_input_tokens_seen": 76996264, "step": 114235 }, { "epoch": 2.790902206044023, "grad_norm": 0.00977549608796835, "learning_rate": 9.71469987023637e-07, "loss": 0.0004, "num_input_tokens_seen": 77000040, "step": 114240 }, { "epoch": 2.7910243568758704, "grad_norm": 0.21447953581809998, "learning_rate": 9.713847442414174e-07, "loss": 0.0002, "num_input_tokens_seen": 77003496, "step": 114245 }, { "epoch": 2.7911465077077176, "grad_norm": 0.13338632881641388, "learning_rate": 9.712995016672963e-07, "loss": 0.106, "num_input_tokens_seen": 77006952, "step": 114250 }, { "epoch": 2.7912686585395647, "grad_norm": 0.01698482036590576, "learning_rate": 9.712142593018926e-07, "loss": 0.0004, "num_input_tokens_seen": 77010536, "step": 114255 }, { "epoch": 2.791390809371412, "grad_norm": 42.586021423339844, "learning_rate": 9.711290171458265e-07, "loss": 0.1619, "num_input_tokens_seen": 77014248, "step": 114260 }, { "epoch": 2.791512960203259, "grad_norm": 0.04516065493226051, "learning_rate": 9.71043775199718e-07, "loss": 0.0399, "num_input_tokens_seen": 77017896, "step": 114265 }, { "epoch": 2.7916351110351063, "grad_norm": 0.025962043553590775, "learning_rate": 9.70958533464187e-07, "loss": 0.0002, "num_input_tokens_seen": 77021416, "step": 114270 }, { "epoch": 2.7917572618669535, "grad_norm": 0.014167881570756435, "learning_rate": 9.708732919398534e-07, "loss": 0.0003, "num_input_tokens_seen": 77025064, "step": 114275 }, { "epoch": 2.7918794126988002, "grad_norm": 0.010760514996945858, "learning_rate": 9.707880506273369e-07, "loss": 0.0005, "num_input_tokens_seen": 77028520, "step": 114280 }, { "epoch": 2.792001563530648, "grad_norm": 0.07058648020029068, "learning_rate": 9.70702809527258e-07, "loss": 0.0312, "num_input_tokens_seen": 77031784, "step": 114285 }, { "epoch": 2.7921237143624946, "grad_norm": 0.005182285327464342, "learning_rate": 9.706175686402354e-07, "loss": 0.0003, "num_input_tokens_seen": 77034792, "step": 114290 }, { "epoch": 2.792245865194342, "grad_norm": 0.00801653228700161, "learning_rate": 9.7053232796689e-07, "loss": 0.0019, "num_input_tokens_seen": 77037992, "step": 114295 }, { "epoch": 2.792368016026189, "grad_norm": 0.027459247037768364, "learning_rate": 9.704470875078419e-07, "loss": 0.0002, "num_input_tokens_seen": 77041320, "step": 114300 }, { "epoch": 2.792490166858036, "grad_norm": 35.760154724121094, "learning_rate": 9.7036184726371e-07, "loss": 0.0492, "num_input_tokens_seen": 77044712, "step": 114305 }, { "epoch": 2.7926123176898834, "grad_norm": 0.10340413451194763, "learning_rate": 9.70276607235115e-07, "loss": 0.001, "num_input_tokens_seen": 77047720, "step": 114310 }, { "epoch": 2.7927344685217306, "grad_norm": 0.06875290721654892, "learning_rate": 9.701913674226764e-07, "loss": 0.0556, "num_input_tokens_seen": 77051048, "step": 114315 }, { "epoch": 2.7928566193535778, "grad_norm": 0.0017848052084445953, "learning_rate": 9.701061278270143e-07, "loss": 0.0003, "num_input_tokens_seen": 77054504, "step": 114320 }, { "epoch": 2.792978770185425, "grad_norm": 0.037753038108348846, "learning_rate": 9.700208884487485e-07, "loss": 0.0002, "num_input_tokens_seen": 77057704, "step": 114325 }, { "epoch": 2.793100921017272, "grad_norm": 746.1931762695312, "learning_rate": 9.699356492884986e-07, "loss": 0.005, "num_input_tokens_seen": 77061096, "step": 114330 }, { "epoch": 2.7932230718491193, "grad_norm": 0.17954334616661072, "learning_rate": 9.698504103468851e-07, "loss": 0.0504, "num_input_tokens_seen": 77064424, "step": 114335 }, { "epoch": 2.7933452226809665, "grad_norm": 16.79789924621582, "learning_rate": 9.697651716245271e-07, "loss": 0.0347, "num_input_tokens_seen": 77067688, "step": 114340 }, { "epoch": 2.7934673735128137, "grad_norm": 0.002543478272855282, "learning_rate": 9.696799331220453e-07, "loss": 0.0002, "num_input_tokens_seen": 77071016, "step": 114345 }, { "epoch": 2.793589524344661, "grad_norm": 0.00716570857912302, "learning_rate": 9.69594694840059e-07, "loss": 0.0, "num_input_tokens_seen": 77074408, "step": 114350 }, { "epoch": 2.793711675176508, "grad_norm": 0.004741067066788673, "learning_rate": 9.69509456779188e-07, "loss": 0.0513, "num_input_tokens_seen": 77077800, "step": 114355 }, { "epoch": 2.7938338260083553, "grad_norm": 0.0017748570535331964, "learning_rate": 9.694242189400528e-07, "loss": 0.0007, "num_input_tokens_seen": 77081128, "step": 114360 }, { "epoch": 2.793955976840202, "grad_norm": 0.013606770895421505, "learning_rate": 9.693389813232727e-07, "loss": 0.0006, "num_input_tokens_seen": 77084328, "step": 114365 }, { "epoch": 2.7940781276720497, "grad_norm": 0.0782063752412796, "learning_rate": 9.69253743929468e-07, "loss": 0.0002, "num_input_tokens_seen": 77087080, "step": 114370 }, { "epoch": 2.7942002785038964, "grad_norm": 0.0396018847823143, "learning_rate": 9.691685067592584e-07, "loss": 0.0002, "num_input_tokens_seen": 77090344, "step": 114375 }, { "epoch": 2.794322429335744, "grad_norm": 0.0037475209683179855, "learning_rate": 9.690832698132636e-07, "loss": 0.0001, "num_input_tokens_seen": 77093864, "step": 114380 }, { "epoch": 2.794444580167591, "grad_norm": 0.4666299819946289, "learning_rate": 9.689980330921035e-07, "loss": 0.0002, "num_input_tokens_seen": 77097384, "step": 114385 }, { "epoch": 2.794566730999438, "grad_norm": 0.09782411903142929, "learning_rate": 9.689127965963978e-07, "loss": 0.0001, "num_input_tokens_seen": 77100904, "step": 114390 }, { "epoch": 2.794688881831285, "grad_norm": 0.005147072486579418, "learning_rate": 9.68827560326767e-07, "loss": 0.1179, "num_input_tokens_seen": 77104168, "step": 114395 }, { "epoch": 2.7948110326631324, "grad_norm": 0.2189302146434784, "learning_rate": 9.687423242838303e-07, "loss": 0.0004, "num_input_tokens_seen": 77107304, "step": 114400 }, { "epoch": 2.7949331834949795, "grad_norm": 0.0006944366032257676, "learning_rate": 9.686570884682082e-07, "loss": 0.0501, "num_input_tokens_seen": 77110312, "step": 114405 }, { "epoch": 2.7950553343268267, "grad_norm": 0.012314150109887123, "learning_rate": 9.685718528805199e-07, "loss": 0.0001, "num_input_tokens_seen": 77113896, "step": 114410 }, { "epoch": 2.795177485158674, "grad_norm": 0.013810068368911743, "learning_rate": 9.684866175213856e-07, "loss": 0.0002, "num_input_tokens_seen": 77117352, "step": 114415 }, { "epoch": 2.795299635990521, "grad_norm": 0.056724026799201965, "learning_rate": 9.68401382391425e-07, "loss": 0.0001, "num_input_tokens_seen": 77120744, "step": 114420 }, { "epoch": 2.7954217868223683, "grad_norm": 0.0070975953713059425, "learning_rate": 9.68316147491258e-07, "loss": 0.0006, "num_input_tokens_seen": 77124200, "step": 114425 }, { "epoch": 2.7955439376542155, "grad_norm": 0.0037659115623682737, "learning_rate": 9.682309128215047e-07, "loss": 0.1171, "num_input_tokens_seen": 77127720, "step": 114430 }, { "epoch": 2.7956660884860627, "grad_norm": 0.018607452511787415, "learning_rate": 9.681456783827848e-07, "loss": 0.0003, "num_input_tokens_seen": 77131048, "step": 114435 }, { "epoch": 2.79578823931791, "grad_norm": 0.010145665146410465, "learning_rate": 9.68060444175718e-07, "loss": 0.0001, "num_input_tokens_seen": 77134760, "step": 114440 }, { "epoch": 2.795910390149757, "grad_norm": 0.027114300057291985, "learning_rate": 9.67975210200924e-07, "loss": 0.0002, "num_input_tokens_seen": 77138088, "step": 114445 }, { "epoch": 2.796032540981604, "grad_norm": 0.002864605514332652, "learning_rate": 9.67889976459023e-07, "loss": 0.066, "num_input_tokens_seen": 77141352, "step": 114450 }, { "epoch": 2.7961546918134514, "grad_norm": 0.23206019401550293, "learning_rate": 9.678047429506352e-07, "loss": 0.0845, "num_input_tokens_seen": 77145064, "step": 114455 }, { "epoch": 2.796276842645298, "grad_norm": 6.529763049911708e-05, "learning_rate": 9.677195096763791e-07, "loss": 0.0015, "num_input_tokens_seen": 77148072, "step": 114460 }, { "epoch": 2.796398993477146, "grad_norm": 2.753253221511841, "learning_rate": 9.676342766368763e-07, "loss": 0.0003, "num_input_tokens_seen": 77151272, "step": 114465 }, { "epoch": 2.7965211443089926, "grad_norm": 0.004751298110932112, "learning_rate": 9.67549043832745e-07, "loss": 0.0525, "num_input_tokens_seen": 77154664, "step": 114470 }, { "epoch": 2.7966432951408398, "grad_norm": 33.22373962402344, "learning_rate": 9.674638112646065e-07, "loss": 0.1032, "num_input_tokens_seen": 77157992, "step": 114475 }, { "epoch": 2.796765445972687, "grad_norm": 3.374569892883301, "learning_rate": 9.673785789330795e-07, "loss": 0.0004, "num_input_tokens_seen": 77161640, "step": 114480 }, { "epoch": 2.796887596804534, "grad_norm": 0.013949088752269745, "learning_rate": 9.67293346838784e-07, "loss": 0.0006, "num_input_tokens_seen": 77164968, "step": 114485 }, { "epoch": 2.7970097476363813, "grad_norm": 0.10943493992090225, "learning_rate": 9.672081149823406e-07, "loss": 0.0677, "num_input_tokens_seen": 77168488, "step": 114490 }, { "epoch": 2.7971318984682285, "grad_norm": 0.015567791648209095, "learning_rate": 9.671228833643683e-07, "loss": 0.0006, "num_input_tokens_seen": 77171816, "step": 114495 }, { "epoch": 2.7972540493000757, "grad_norm": 0.01357315480709076, "learning_rate": 9.670376519854874e-07, "loss": 0.0961, "num_input_tokens_seen": 77176040, "step": 114500 }, { "epoch": 2.797376200131923, "grad_norm": 0.015812013298273087, "learning_rate": 9.669524208463172e-07, "loss": 0.0001, "num_input_tokens_seen": 77179624, "step": 114505 }, { "epoch": 2.79749835096377, "grad_norm": 31.484952926635742, "learning_rate": 9.66867189947478e-07, "loss": 0.0786, "num_input_tokens_seen": 77183208, "step": 114510 }, { "epoch": 2.7976205017956173, "grad_norm": 18.537179946899414, "learning_rate": 9.667819592895899e-07, "loss": 0.1302, "num_input_tokens_seen": 77186408, "step": 114515 }, { "epoch": 2.7977426526274645, "grad_norm": 0.026167120784521103, "learning_rate": 9.666967288732719e-07, "loss": 0.0002, "num_input_tokens_seen": 77189992, "step": 114520 }, { "epoch": 2.7978648034593117, "grad_norm": 37.67267608642578, "learning_rate": 9.666114986991446e-07, "loss": 0.0535, "num_input_tokens_seen": 77193064, "step": 114525 }, { "epoch": 2.797986954291159, "grad_norm": 0.6203153729438782, "learning_rate": 9.665262687678273e-07, "loss": 0.0011, "num_input_tokens_seen": 77196200, "step": 114530 }, { "epoch": 2.798109105123006, "grad_norm": 0.01730647124350071, "learning_rate": 9.6644103907994e-07, "loss": 0.0468, "num_input_tokens_seen": 77199464, "step": 114535 }, { "epoch": 2.7982312559548532, "grad_norm": 0.03615279495716095, "learning_rate": 9.663558096361023e-07, "loss": 0.0001, "num_input_tokens_seen": 77202792, "step": 114540 }, { "epoch": 2.7983534067867, "grad_norm": 0.05028081312775612, "learning_rate": 9.662705804369343e-07, "loss": 0.1592, "num_input_tokens_seen": 77206248, "step": 114545 }, { "epoch": 2.7984755576185476, "grad_norm": 84.04562377929688, "learning_rate": 9.66185351483056e-07, "loss": 0.0375, "num_input_tokens_seen": 77209512, "step": 114550 }, { "epoch": 2.7985977084503944, "grad_norm": 24.186880111694336, "learning_rate": 9.661001227750864e-07, "loss": 0.0478, "num_input_tokens_seen": 77212648, "step": 114555 }, { "epoch": 2.798719859282242, "grad_norm": 8.545869827270508, "learning_rate": 9.660148943136465e-07, "loss": 0.1093, "num_input_tokens_seen": 77215784, "step": 114560 }, { "epoch": 2.7988420101140887, "grad_norm": 0.02828356996178627, "learning_rate": 9.659296660993548e-07, "loss": 0.0492, "num_input_tokens_seen": 77219752, "step": 114565 }, { "epoch": 2.798964160945936, "grad_norm": 0.0055162059143185616, "learning_rate": 9.65844438132832e-07, "loss": 0.0809, "num_input_tokens_seen": 77223208, "step": 114570 }, { "epoch": 2.799086311777783, "grad_norm": 0.015754438936710358, "learning_rate": 9.657592104146976e-07, "loss": 0.045, "num_input_tokens_seen": 77226216, "step": 114575 }, { "epoch": 2.7992084626096303, "grad_norm": 0.01425847876816988, "learning_rate": 9.656739829455712e-07, "loss": 0.0002, "num_input_tokens_seen": 77229928, "step": 114580 }, { "epoch": 2.7993306134414775, "grad_norm": 0.19158978760242462, "learning_rate": 9.655887557260731e-07, "loss": 0.0017, "num_input_tokens_seen": 77233384, "step": 114585 }, { "epoch": 2.7994527642733247, "grad_norm": 0.010324357077479362, "learning_rate": 9.655035287568229e-07, "loss": 0.0002, "num_input_tokens_seen": 77237288, "step": 114590 }, { "epoch": 2.799574915105172, "grad_norm": 0.6244637370109558, "learning_rate": 9.654183020384405e-07, "loss": 0.0004, "num_input_tokens_seen": 77240552, "step": 114595 }, { "epoch": 2.799697065937019, "grad_norm": 0.024087099358439445, "learning_rate": 9.65333075571545e-07, "loss": 0.0413, "num_input_tokens_seen": 77243688, "step": 114600 }, { "epoch": 2.7998192167688662, "grad_norm": 39.12160873413086, "learning_rate": 9.652478493567566e-07, "loss": 0.1785, "num_input_tokens_seen": 77246824, "step": 114605 }, { "epoch": 2.7999413676007134, "grad_norm": 24.444210052490234, "learning_rate": 9.651626233946959e-07, "loss": 0.0537, "num_input_tokens_seen": 77249896, "step": 114610 }, { "epoch": 2.8000635184325606, "grad_norm": 0.05592348426580429, "learning_rate": 9.650773976859812e-07, "loss": 0.0716, "num_input_tokens_seen": 77252968, "step": 114615 }, { "epoch": 2.800185669264408, "grad_norm": 0.1441681683063507, "learning_rate": 9.649921722312337e-07, "loss": 0.0002, "num_input_tokens_seen": 77256552, "step": 114620 }, { "epoch": 2.800307820096255, "grad_norm": 0.015336308628320694, "learning_rate": 9.64906947031072e-07, "loss": 0.0003, "num_input_tokens_seen": 77259944, "step": 114625 }, { "epoch": 2.8004299709281018, "grad_norm": 0.12260361760854721, "learning_rate": 9.64821722086117e-07, "loss": 0.0004, "num_input_tokens_seen": 77263080, "step": 114630 }, { "epoch": 2.8005521217599494, "grad_norm": 0.08802598714828491, "learning_rate": 9.647364973969876e-07, "loss": 0.0006, "num_input_tokens_seen": 77266344, "step": 114635 }, { "epoch": 2.800674272591796, "grad_norm": 0.02480044774711132, "learning_rate": 9.646512729643037e-07, "loss": 0.0404, "num_input_tokens_seen": 77269800, "step": 114640 }, { "epoch": 2.8007964234236438, "grad_norm": 0.0269177183508873, "learning_rate": 9.645660487886856e-07, "loss": 0.0002, "num_input_tokens_seen": 77273128, "step": 114645 }, { "epoch": 2.8009185742554905, "grad_norm": 0.086511991918087, "learning_rate": 9.644808248707523e-07, "loss": 0.0002, "num_input_tokens_seen": 77276328, "step": 114650 }, { "epoch": 2.8010407250873377, "grad_norm": 0.13102589547634125, "learning_rate": 9.643956012111247e-07, "loss": 0.0009, "num_input_tokens_seen": 77279976, "step": 114655 }, { "epoch": 2.801162875919185, "grad_norm": 0.38331305980682373, "learning_rate": 9.64310377810421e-07, "loss": 0.0442, "num_input_tokens_seen": 77283176, "step": 114660 }, { "epoch": 2.801285026751032, "grad_norm": 0.055746957659721375, "learning_rate": 9.642251546692621e-07, "loss": 0.0006, "num_input_tokens_seen": 77286312, "step": 114665 }, { "epoch": 2.8014071775828793, "grad_norm": 29.84088897705078, "learning_rate": 9.641399317882678e-07, "loss": 0.0478, "num_input_tokens_seen": 77289704, "step": 114670 }, { "epoch": 2.8015293284147265, "grad_norm": 0.005420892499387264, "learning_rate": 9.640547091680572e-07, "loss": 0.0001, "num_input_tokens_seen": 77293416, "step": 114675 }, { "epoch": 2.8016514792465737, "grad_norm": 0.006929678376764059, "learning_rate": 9.639694868092509e-07, "loss": 0.0008, "num_input_tokens_seen": 77297128, "step": 114680 }, { "epoch": 2.801773630078421, "grad_norm": 0.06219131126999855, "learning_rate": 9.638842647124679e-07, "loss": 0.137, "num_input_tokens_seen": 77300904, "step": 114685 }, { "epoch": 2.801895780910268, "grad_norm": 0.08001910150051117, "learning_rate": 9.637990428783282e-07, "loss": 0.0002, "num_input_tokens_seen": 77304680, "step": 114690 }, { "epoch": 2.802017931742115, "grad_norm": 0.016454750671982765, "learning_rate": 9.637138213074516e-07, "loss": 0.0002, "num_input_tokens_seen": 77307816, "step": 114695 }, { "epoch": 2.8021400825739624, "grad_norm": 0.04410364478826523, "learning_rate": 9.636286000004578e-07, "loss": 0.0464, "num_input_tokens_seen": 77311016, "step": 114700 }, { "epoch": 2.8022622334058096, "grad_norm": 0.5116713643074036, "learning_rate": 9.63543378957967e-07, "loss": 0.0951, "num_input_tokens_seen": 77314152, "step": 114705 }, { "epoch": 2.802384384237657, "grad_norm": 19.533918380737305, "learning_rate": 9.63458158180598e-07, "loss": 0.0961, "num_input_tokens_seen": 77318184, "step": 114710 }, { "epoch": 2.802506535069504, "grad_norm": 0.10283757001161575, "learning_rate": 9.633729376689715e-07, "loss": 0.0001, "num_input_tokens_seen": 77321448, "step": 114715 }, { "epoch": 2.802628685901351, "grad_norm": 0.09375852346420288, "learning_rate": 9.632877174237066e-07, "loss": 0.0527, "num_input_tokens_seen": 77324584, "step": 114720 }, { "epoch": 2.802750836733198, "grad_norm": 0.022160667926073074, "learning_rate": 9.632024974454233e-07, "loss": 0.0005, "num_input_tokens_seen": 77328680, "step": 114725 }, { "epoch": 2.8028729875650455, "grad_norm": 42.232276916503906, "learning_rate": 9.631172777347414e-07, "loss": 0.0593, "num_input_tokens_seen": 77331816, "step": 114730 }, { "epoch": 2.8029951383968923, "grad_norm": 0.039244040846824646, "learning_rate": 9.630320582922805e-07, "loss": 0.139, "num_input_tokens_seen": 77334888, "step": 114735 }, { "epoch": 2.8031172892287395, "grad_norm": 36.34895706176758, "learning_rate": 9.629468391186605e-07, "loss": 0.0428, "num_input_tokens_seen": 77340328, "step": 114740 }, { "epoch": 2.8032394400605867, "grad_norm": 0.02295946702361107, "learning_rate": 9.628616202145012e-07, "loss": 0.0409, "num_input_tokens_seen": 77343464, "step": 114745 }, { "epoch": 2.803361590892434, "grad_norm": 0.11297443509101868, "learning_rate": 9.627764015804223e-07, "loss": 0.0007, "num_input_tokens_seen": 77346856, "step": 114750 }, { "epoch": 2.803483741724281, "grad_norm": 25.889450073242188, "learning_rate": 9.62691183217043e-07, "loss": 0.0529, "num_input_tokens_seen": 77350376, "step": 114755 }, { "epoch": 2.8036058925561282, "grad_norm": 0.05398506671190262, "learning_rate": 9.626059651249834e-07, "loss": 0.0002, "num_input_tokens_seen": 77353768, "step": 114760 }, { "epoch": 2.8037280433879754, "grad_norm": 0.10572467744350433, "learning_rate": 9.625207473048638e-07, "loss": 0.0332, "num_input_tokens_seen": 77357224, "step": 114765 }, { "epoch": 2.8038501942198226, "grad_norm": 0.03468018025159836, "learning_rate": 9.624355297573028e-07, "loss": 0.0003, "num_input_tokens_seen": 77360616, "step": 114770 }, { "epoch": 2.80397234505167, "grad_norm": 0.0012982026673853397, "learning_rate": 9.623503124829213e-07, "loss": 0.0404, "num_input_tokens_seen": 77363816, "step": 114775 }, { "epoch": 2.804094495883517, "grad_norm": 0.03607716038823128, "learning_rate": 9.622650954823378e-07, "loss": 0.0002, "num_input_tokens_seen": 77367080, "step": 114780 }, { "epoch": 2.804216646715364, "grad_norm": 0.006853197701275349, "learning_rate": 9.621798787561736e-07, "loss": 0.0003, "num_input_tokens_seen": 77370280, "step": 114785 }, { "epoch": 2.8043387975472114, "grad_norm": 0.03181392699480057, "learning_rate": 9.620946623050468e-07, "loss": 0.0004, "num_input_tokens_seen": 77374632, "step": 114790 }, { "epoch": 2.8044609483790586, "grad_norm": 0.04052650183439255, "learning_rate": 9.620094461295779e-07, "loss": 0.0841, "num_input_tokens_seen": 77378216, "step": 114795 }, { "epoch": 2.8045830992109058, "grad_norm": 0.0003261482634115964, "learning_rate": 9.619242302303867e-07, "loss": 0.0001, "num_input_tokens_seen": 77382184, "step": 114800 }, { "epoch": 2.804705250042753, "grad_norm": 0.21794497966766357, "learning_rate": 9.618390146080925e-07, "loss": 0.0003, "num_input_tokens_seen": 77385256, "step": 114805 }, { "epoch": 2.8048274008745997, "grad_norm": 0.42617887258529663, "learning_rate": 9.617537992633155e-07, "loss": 0.0314, "num_input_tokens_seen": 77388456, "step": 114810 }, { "epoch": 2.8049495517064473, "grad_norm": 0.010697565041482449, "learning_rate": 9.61668584196675e-07, "loss": 0.1347, "num_input_tokens_seen": 77392360, "step": 114815 }, { "epoch": 2.805071702538294, "grad_norm": 0.01782727800309658, "learning_rate": 9.615833694087908e-07, "loss": 0.0002, "num_input_tokens_seen": 77395688, "step": 114820 }, { "epoch": 2.8051938533701417, "grad_norm": 0.013532194308936596, "learning_rate": 9.614981549002828e-07, "loss": 0.0527, "num_input_tokens_seen": 77398632, "step": 114825 }, { "epoch": 2.8053160042019885, "grad_norm": 0.047766704112291336, "learning_rate": 9.614129406717703e-07, "loss": 0.0573, "num_input_tokens_seen": 77401960, "step": 114830 }, { "epoch": 2.8054381550338356, "grad_norm": 0.08423038572072983, "learning_rate": 9.61327726723874e-07, "loss": 0.053, "num_input_tokens_seen": 77404968, "step": 114835 }, { "epoch": 2.805560305865683, "grad_norm": 0.044354844838380814, "learning_rate": 9.612425130572124e-07, "loss": 0.1165, "num_input_tokens_seen": 77408488, "step": 114840 }, { "epoch": 2.80568245669753, "grad_norm": 0.06138172373175621, "learning_rate": 9.611572996724055e-07, "loss": 0.0006, "num_input_tokens_seen": 77411688, "step": 114845 }, { "epoch": 2.805804607529377, "grad_norm": 0.002905749948695302, "learning_rate": 9.610720865700735e-07, "loss": 0.0002, "num_input_tokens_seen": 77414760, "step": 114850 }, { "epoch": 2.8059267583612244, "grad_norm": 0.08945734798908234, "learning_rate": 9.609868737508353e-07, "loss": 0.0004, "num_input_tokens_seen": 77418152, "step": 114855 }, { "epoch": 2.8060489091930716, "grad_norm": 0.003969348501414061, "learning_rate": 9.609016612153115e-07, "loss": 0.0003, "num_input_tokens_seen": 77421544, "step": 114860 }, { "epoch": 2.806171060024919, "grad_norm": 20.634653091430664, "learning_rate": 9.60816448964121e-07, "loss": 0.0805, "num_input_tokens_seen": 77424808, "step": 114865 }, { "epoch": 2.806293210856766, "grad_norm": 0.007369068451225758, "learning_rate": 9.607312369978842e-07, "loss": 0.0, "num_input_tokens_seen": 77428200, "step": 114870 }, { "epoch": 2.806415361688613, "grad_norm": 0.09972415119409561, "learning_rate": 9.606460253172201e-07, "loss": 0.0002, "num_input_tokens_seen": 77431656, "step": 114875 }, { "epoch": 2.8065375125204604, "grad_norm": 0.01619172841310501, "learning_rate": 9.60560813922749e-07, "loss": 0.0337, "num_input_tokens_seen": 77435112, "step": 114880 }, { "epoch": 2.8066596633523075, "grad_norm": 0.8120678067207336, "learning_rate": 9.604756028150898e-07, "loss": 0.0006, "num_input_tokens_seen": 77438312, "step": 114885 }, { "epoch": 2.8067818141841547, "grad_norm": 15.800888061523438, "learning_rate": 9.60390391994863e-07, "loss": 0.0456, "num_input_tokens_seen": 77441320, "step": 114890 }, { "epoch": 2.806903965016002, "grad_norm": 0.03910161927342415, "learning_rate": 9.603051814626877e-07, "loss": 0.0, "num_input_tokens_seen": 77444968, "step": 114895 }, { "epoch": 2.807026115847849, "grad_norm": 0.2639329731464386, "learning_rate": 9.60219971219184e-07, "loss": 0.0489, "num_input_tokens_seen": 77448232, "step": 114900 }, { "epoch": 2.807148266679696, "grad_norm": 14.962903022766113, "learning_rate": 9.601347612649715e-07, "loss": 0.1049, "num_input_tokens_seen": 77452072, "step": 114905 }, { "epoch": 2.8072704175115435, "grad_norm": 0.07089398801326752, "learning_rate": 9.600495516006694e-07, "loss": 0.0002, "num_input_tokens_seen": 77455912, "step": 114910 }, { "epoch": 2.8073925683433902, "grad_norm": 0.042340733110904694, "learning_rate": 9.599643422268976e-07, "loss": 0.0718, "num_input_tokens_seen": 77459304, "step": 114915 }, { "epoch": 2.8075147191752374, "grad_norm": 0.6589097380638123, "learning_rate": 9.598791331442765e-07, "loss": 0.0013, "num_input_tokens_seen": 77462696, "step": 114920 }, { "epoch": 2.8076368700070846, "grad_norm": 0.4322875440120697, "learning_rate": 9.597939243534244e-07, "loss": 0.0004, "num_input_tokens_seen": 77466472, "step": 114925 }, { "epoch": 2.807759020838932, "grad_norm": 44.9621467590332, "learning_rate": 9.597087158549623e-07, "loss": 0.0032, "num_input_tokens_seen": 77469608, "step": 114930 }, { "epoch": 2.807881171670779, "grad_norm": 0.038575429469347, "learning_rate": 9.596235076495088e-07, "loss": 0.1067, "num_input_tokens_seen": 77473064, "step": 114935 }, { "epoch": 2.808003322502626, "grad_norm": 0.28912949562072754, "learning_rate": 9.595382997376846e-07, "loss": 0.0006, "num_input_tokens_seen": 77476456, "step": 114940 }, { "epoch": 2.8081254733344734, "grad_norm": 0.33396586775779724, "learning_rate": 9.594530921201082e-07, "loss": 0.0002, "num_input_tokens_seen": 77479976, "step": 114945 }, { "epoch": 2.8082476241663206, "grad_norm": 0.015432288870215416, "learning_rate": 9.593678847974e-07, "loss": 0.0001, "num_input_tokens_seen": 77483240, "step": 114950 }, { "epoch": 2.8083697749981678, "grad_norm": 0.8247775435447693, "learning_rate": 9.592826777701796e-07, "loss": 0.0007, "num_input_tokens_seen": 77486568, "step": 114955 }, { "epoch": 2.808491925830015, "grad_norm": 0.018848801031708717, "learning_rate": 9.591974710390663e-07, "loss": 0.0416, "num_input_tokens_seen": 77489640, "step": 114960 }, { "epoch": 2.808614076661862, "grad_norm": 0.03549596667289734, "learning_rate": 9.591122646046802e-07, "loss": 0.0001, "num_input_tokens_seen": 77493224, "step": 114965 }, { "epoch": 2.8087362274937093, "grad_norm": 0.018727822229266167, "learning_rate": 9.590270584676403e-07, "loss": 0.0001, "num_input_tokens_seen": 77497128, "step": 114970 }, { "epoch": 2.8088583783255565, "grad_norm": 0.13936327397823334, "learning_rate": 9.589418526285667e-07, "loss": 0.1205, "num_input_tokens_seen": 77500392, "step": 114975 }, { "epoch": 2.8089805291574037, "grad_norm": 0.004517362453043461, "learning_rate": 9.588566470880794e-07, "loss": 0.116, "num_input_tokens_seen": 77503528, "step": 114980 }, { "epoch": 2.809102679989251, "grad_norm": 16.47195816040039, "learning_rate": 9.587714418467974e-07, "loss": 0.1401, "num_input_tokens_seen": 77506728, "step": 114985 }, { "epoch": 2.8092248308210976, "grad_norm": 21.894033432006836, "learning_rate": 9.586862369053409e-07, "loss": 0.0006, "num_input_tokens_seen": 77510248, "step": 114990 }, { "epoch": 2.8093469816529453, "grad_norm": 0.08726862072944641, "learning_rate": 9.586010322643287e-07, "loss": 0.0479, "num_input_tokens_seen": 77513704, "step": 114995 }, { "epoch": 2.809469132484792, "grad_norm": 0.05452345311641693, "learning_rate": 9.585158279243812e-07, "loss": 0.0371, "num_input_tokens_seen": 77517096, "step": 115000 }, { "epoch": 2.8095912833166397, "grad_norm": 0.16248691082000732, "learning_rate": 9.584306238861178e-07, "loss": 0.1121, "num_input_tokens_seen": 77520168, "step": 115005 }, { "epoch": 2.8097134341484864, "grad_norm": 0.0710228681564331, "learning_rate": 9.583454201501576e-07, "loss": 0.0471, "num_input_tokens_seen": 77523944, "step": 115010 }, { "epoch": 2.8098355849803336, "grad_norm": 0.029321448877453804, "learning_rate": 9.582602167171215e-07, "loss": 0.0003, "num_input_tokens_seen": 77527208, "step": 115015 }, { "epoch": 2.809957735812181, "grad_norm": 0.0011495605576783419, "learning_rate": 9.581750135876275e-07, "loss": 0.0006, "num_input_tokens_seen": 77530024, "step": 115020 }, { "epoch": 2.810079886644028, "grad_norm": 0.07034885138273239, "learning_rate": 9.580898107622967e-07, "loss": 0.0001, "num_input_tokens_seen": 77533096, "step": 115025 }, { "epoch": 2.810202037475875, "grad_norm": 0.002323192311450839, "learning_rate": 9.580046082417476e-07, "loss": 0.0003, "num_input_tokens_seen": 77536360, "step": 115030 }, { "epoch": 2.8103241883077223, "grad_norm": 75.02461242675781, "learning_rate": 9.57919406026601e-07, "loss": 0.0416, "num_input_tokens_seen": 77539560, "step": 115035 }, { "epoch": 2.8104463391395695, "grad_norm": 0.014927959069609642, "learning_rate": 9.57834204117475e-07, "loss": 0.0003, "num_input_tokens_seen": 77542632, "step": 115040 }, { "epoch": 2.8105684899714167, "grad_norm": 178.06178283691406, "learning_rate": 9.577490025149901e-07, "loss": 0.0707, "num_input_tokens_seen": 77546152, "step": 115045 }, { "epoch": 2.810690640803264, "grad_norm": 0.056914959102869034, "learning_rate": 9.576638012197661e-07, "loss": 0.0003, "num_input_tokens_seen": 77549544, "step": 115050 }, { "epoch": 2.810812791635111, "grad_norm": 0.021990373730659485, "learning_rate": 9.575786002324225e-07, "loss": 0.0002, "num_input_tokens_seen": 77552680, "step": 115055 }, { "epoch": 2.8109349424669583, "grad_norm": 0.0051172287203371525, "learning_rate": 9.574933995535786e-07, "loss": 0.0019, "num_input_tokens_seen": 77555880, "step": 115060 }, { "epoch": 2.8110570932988055, "grad_norm": 0.018687183037400246, "learning_rate": 9.57408199183854e-07, "loss": 0.0742, "num_input_tokens_seen": 77558952, "step": 115065 }, { "epoch": 2.8111792441306527, "grad_norm": 0.04799675941467285, "learning_rate": 9.57322999123868e-07, "loss": 0.0004, "num_input_tokens_seen": 77562600, "step": 115070 }, { "epoch": 2.8113013949624994, "grad_norm": 0.05752696096897125, "learning_rate": 9.572377993742413e-07, "loss": 0.0002, "num_input_tokens_seen": 77566312, "step": 115075 }, { "epoch": 2.811423545794347, "grad_norm": 1.0827497243881226, "learning_rate": 9.571525999355926e-07, "loss": 0.0333, "num_input_tokens_seen": 77569384, "step": 115080 }, { "epoch": 2.811545696626194, "grad_norm": 0.03544053062796593, "learning_rate": 9.570674008085419e-07, "loss": 0.0003, "num_input_tokens_seen": 77572776, "step": 115085 }, { "epoch": 2.8116678474580414, "grad_norm": 0.009640112519264221, "learning_rate": 9.569822019937082e-07, "loss": 0.0554, "num_input_tokens_seen": 77576232, "step": 115090 }, { "epoch": 2.811789998289888, "grad_norm": 0.0009792305063456297, "learning_rate": 9.568970034917119e-07, "loss": 0.0236, "num_input_tokens_seen": 77579624, "step": 115095 }, { "epoch": 2.8119121491217354, "grad_norm": 0.0004921176587231457, "learning_rate": 9.56811805303172e-07, "loss": 0.0628, "num_input_tokens_seen": 77583464, "step": 115100 }, { "epoch": 2.8120342999535826, "grad_norm": 0.004929780960083008, "learning_rate": 9.56726607428708e-07, "loss": 0.0399, "num_input_tokens_seen": 77586856, "step": 115105 }, { "epoch": 2.8121564507854298, "grad_norm": 0.003183274297043681, "learning_rate": 9.566414098689404e-07, "loss": 0.0001, "num_input_tokens_seen": 77589864, "step": 115110 }, { "epoch": 2.812278601617277, "grad_norm": 0.02194182761013508, "learning_rate": 9.565562126244876e-07, "loss": 0.0761, "num_input_tokens_seen": 77592808, "step": 115115 }, { "epoch": 2.812400752449124, "grad_norm": 0.2706195116043091, "learning_rate": 9.5647101569597e-07, "loss": 0.0002, "num_input_tokens_seen": 77596200, "step": 115120 }, { "epoch": 2.8125229032809713, "grad_norm": 1.5332212448120117, "learning_rate": 9.563858190840066e-07, "loss": 0.0723, "num_input_tokens_seen": 77599528, "step": 115125 }, { "epoch": 2.8126450541128185, "grad_norm": 0.036401137709617615, "learning_rate": 9.563006227892172e-07, "loss": 0.0001, "num_input_tokens_seen": 77602920, "step": 115130 }, { "epoch": 2.8127672049446657, "grad_norm": 0.013892637565732002, "learning_rate": 9.562154268122217e-07, "loss": 0.0001, "num_input_tokens_seen": 77606632, "step": 115135 }, { "epoch": 2.812889355776513, "grad_norm": 291.0292053222656, "learning_rate": 9.561302311536392e-07, "loss": 0.0291, "num_input_tokens_seen": 77610216, "step": 115140 }, { "epoch": 2.81301150660836, "grad_norm": 0.0005044717690907419, "learning_rate": 9.5604503581409e-07, "loss": 0.0193, "num_input_tokens_seen": 77613800, "step": 115145 }, { "epoch": 2.8131336574402073, "grad_norm": 21.074951171875, "learning_rate": 9.559598407941925e-07, "loss": 0.0568, "num_input_tokens_seen": 77617000, "step": 115150 }, { "epoch": 2.8132558082720545, "grad_norm": 0.0009566976805217564, "learning_rate": 9.558746460945672e-07, "loss": 0.0002, "num_input_tokens_seen": 77620072, "step": 115155 }, { "epoch": 2.8133779591039016, "grad_norm": 0.14798258244991302, "learning_rate": 9.557894517158332e-07, "loss": 0.0001, "num_input_tokens_seen": 77623208, "step": 115160 }, { "epoch": 2.813500109935749, "grad_norm": 0.13334304094314575, "learning_rate": 9.557042576586101e-07, "loss": 0.0002, "num_input_tokens_seen": 77626088, "step": 115165 }, { "epoch": 2.8136222607675956, "grad_norm": 0.1850810945034027, "learning_rate": 9.55619063923518e-07, "loss": 0.0003, "num_input_tokens_seen": 77629160, "step": 115170 }, { "epoch": 2.813744411599443, "grad_norm": 0.055887024849653244, "learning_rate": 9.555338705111753e-07, "loss": 0.0001, "num_input_tokens_seen": 77632808, "step": 115175 }, { "epoch": 2.81386656243129, "grad_norm": 0.2823597192764282, "learning_rate": 9.55448677422203e-07, "loss": 0.0002, "num_input_tokens_seen": 77636712, "step": 115180 }, { "epoch": 2.8139887132631376, "grad_norm": 0.00106719764880836, "learning_rate": 9.55363484657219e-07, "loss": 0.0386, "num_input_tokens_seen": 77640104, "step": 115185 }, { "epoch": 2.8141108640949843, "grad_norm": 0.01963244192302227, "learning_rate": 9.552782922168447e-07, "loss": 0.0113, "num_input_tokens_seen": 77643304, "step": 115190 }, { "epoch": 2.8142330149268315, "grad_norm": 0.006312564015388489, "learning_rate": 9.55193100101698e-07, "loss": 0.0599, "num_input_tokens_seen": 77646632, "step": 115195 }, { "epoch": 2.8143551657586787, "grad_norm": 0.20666086673736572, "learning_rate": 9.551079083123996e-07, "loss": 0.0457, "num_input_tokens_seen": 77649768, "step": 115200 }, { "epoch": 2.814477316590526, "grad_norm": 0.005621414165943861, "learning_rate": 9.550227168495683e-07, "loss": 0.0008, "num_input_tokens_seen": 77653032, "step": 115205 }, { "epoch": 2.814599467422373, "grad_norm": 25.110536575317383, "learning_rate": 9.54937525713824e-07, "loss": 0.0429, "num_input_tokens_seen": 77656296, "step": 115210 }, { "epoch": 2.8147216182542203, "grad_norm": 0.28763338923454285, "learning_rate": 9.548523349057864e-07, "loss": 0.0993, "num_input_tokens_seen": 77659496, "step": 115215 }, { "epoch": 2.8148437690860675, "grad_norm": 0.013426266610622406, "learning_rate": 9.54767144426074e-07, "loss": 0.0003, "num_input_tokens_seen": 77663016, "step": 115220 }, { "epoch": 2.8149659199179147, "grad_norm": 0.06359026581048965, "learning_rate": 9.546819542753074e-07, "loss": 0.0411, "num_input_tokens_seen": 77666920, "step": 115225 }, { "epoch": 2.815088070749762, "grad_norm": 0.012988533824682236, "learning_rate": 9.545967644541063e-07, "loss": 0.007, "num_input_tokens_seen": 77670312, "step": 115230 }, { "epoch": 2.815210221581609, "grad_norm": 0.5349770188331604, "learning_rate": 9.545115749630891e-07, "loss": 0.0466, "num_input_tokens_seen": 77673832, "step": 115235 }, { "epoch": 2.8153323724134562, "grad_norm": 0.37651684880256653, "learning_rate": 9.544263858028765e-07, "loss": 0.0004, "num_input_tokens_seen": 77677032, "step": 115240 }, { "epoch": 2.8154545232453034, "grad_norm": 0.04008885845541954, "learning_rate": 9.54341196974087e-07, "loss": 0.0046, "num_input_tokens_seen": 77680296, "step": 115245 }, { "epoch": 2.8155766740771506, "grad_norm": 0.0235599298030138, "learning_rate": 9.542560084773412e-07, "loss": 0.0458, "num_input_tokens_seen": 77683880, "step": 115250 }, { "epoch": 2.8156988249089974, "grad_norm": 0.04918990284204483, "learning_rate": 9.541708203132577e-07, "loss": 0.0001, "num_input_tokens_seen": 77687080, "step": 115255 }, { "epoch": 2.815820975740845, "grad_norm": 0.017839496955275536, "learning_rate": 9.54085632482456e-07, "loss": 0.0001, "num_input_tokens_seen": 77690472, "step": 115260 }, { "epoch": 2.8159431265726917, "grad_norm": 0.002251403173431754, "learning_rate": 9.540004449855565e-07, "loss": 0.0751, "num_input_tokens_seen": 77693864, "step": 115265 }, { "epoch": 2.8160652774045394, "grad_norm": 0.0022762392181903124, "learning_rate": 9.539152578231776e-07, "loss": 0.0454, "num_input_tokens_seen": 77697192, "step": 115270 }, { "epoch": 2.816187428236386, "grad_norm": 0.009227758273482323, "learning_rate": 9.538300709959398e-07, "loss": 0.069, "num_input_tokens_seen": 77700520, "step": 115275 }, { "epoch": 2.8163095790682333, "grad_norm": 0.01429454330354929, "learning_rate": 9.537448845044617e-07, "loss": 0.0, "num_input_tokens_seen": 77704040, "step": 115280 }, { "epoch": 2.8164317299000805, "grad_norm": 0.027496200054883957, "learning_rate": 9.536596983493633e-07, "loss": 0.0002, "num_input_tokens_seen": 77707880, "step": 115285 }, { "epoch": 2.8165538807319277, "grad_norm": 0.0004138645890634507, "learning_rate": 9.535745125312644e-07, "loss": 0.035, "num_input_tokens_seen": 77711208, "step": 115290 }, { "epoch": 2.816676031563775, "grad_norm": 0.008841968141496181, "learning_rate": 9.534893270507837e-07, "loss": 0.0001, "num_input_tokens_seen": 77714472, "step": 115295 }, { "epoch": 2.816798182395622, "grad_norm": 0.1025681346654892, "learning_rate": 9.534041419085417e-07, "loss": 0.0003, "num_input_tokens_seen": 77717864, "step": 115300 }, { "epoch": 2.8169203332274693, "grad_norm": 0.04022670164704323, "learning_rate": 9.53318957105157e-07, "loss": 0.0001, "num_input_tokens_seen": 77721256, "step": 115305 }, { "epoch": 2.8170424840593165, "grad_norm": 0.008396613411605358, "learning_rate": 9.532337726412494e-07, "loss": 0.0654, "num_input_tokens_seen": 77724456, "step": 115310 }, { "epoch": 2.8171646348911636, "grad_norm": 30.426029205322266, "learning_rate": 9.531485885174384e-07, "loss": 0.0492, "num_input_tokens_seen": 77727912, "step": 115315 }, { "epoch": 2.817286785723011, "grad_norm": 0.03145899623632431, "learning_rate": 9.530634047343432e-07, "loss": 0.0479, "num_input_tokens_seen": 77731432, "step": 115320 }, { "epoch": 2.817408936554858, "grad_norm": 0.01578490436077118, "learning_rate": 9.52978221292584e-07, "loss": 0.0404, "num_input_tokens_seen": 77734760, "step": 115325 }, { "epoch": 2.817531087386705, "grad_norm": 0.021990390494465828, "learning_rate": 9.528930381927794e-07, "loss": 0.0004, "num_input_tokens_seen": 77738216, "step": 115330 }, { "epoch": 2.8176532382185524, "grad_norm": 0.0014852866297587752, "learning_rate": 9.528078554355497e-07, "loss": 0.1156, "num_input_tokens_seen": 77742504, "step": 115335 }, { "epoch": 2.8177753890503996, "grad_norm": 0.17477023601531982, "learning_rate": 9.527226730215136e-07, "loss": 0.0003, "num_input_tokens_seen": 77746024, "step": 115340 }, { "epoch": 2.817897539882247, "grad_norm": 0.00901720579713583, "learning_rate": 9.526374909512913e-07, "loss": 0.0, "num_input_tokens_seen": 77749800, "step": 115345 }, { "epoch": 2.8180196907140935, "grad_norm": 0.013932290486991405, "learning_rate": 9.525523092255015e-07, "loss": 0.0001, "num_input_tokens_seen": 77753192, "step": 115350 }, { "epoch": 2.818141841545941, "grad_norm": 0.016308395192027092, "learning_rate": 9.524671278447642e-07, "loss": 0.0001, "num_input_tokens_seen": 77757160, "step": 115355 }, { "epoch": 2.818263992377788, "grad_norm": 0.06437277048826218, "learning_rate": 9.523819468096988e-07, "loss": 0.0004, "num_input_tokens_seen": 77760744, "step": 115360 }, { "epoch": 2.818386143209635, "grad_norm": 0.012472563423216343, "learning_rate": 9.522967661209249e-07, "loss": 0.0002, "num_input_tokens_seen": 77764200, "step": 115365 }, { "epoch": 2.8185082940414823, "grad_norm": 0.009374906308948994, "learning_rate": 9.522115857790616e-07, "loss": 0.036, "num_input_tokens_seen": 77767656, "step": 115370 }, { "epoch": 2.8186304448733295, "grad_norm": 0.048512861132621765, "learning_rate": 9.521264057847283e-07, "loss": 0.0542, "num_input_tokens_seen": 77770600, "step": 115375 }, { "epoch": 2.8187525957051767, "grad_norm": 23.211782455444336, "learning_rate": 9.520412261385445e-07, "loss": 0.0633, "num_input_tokens_seen": 77773736, "step": 115380 }, { "epoch": 2.818874746537024, "grad_norm": 0.025888273492455482, "learning_rate": 9.519560468411304e-07, "loss": 0.0526, "num_input_tokens_seen": 77776872, "step": 115385 }, { "epoch": 2.818996897368871, "grad_norm": 0.1917659342288971, "learning_rate": 9.518708678931044e-07, "loss": 0.0002, "num_input_tokens_seen": 77779944, "step": 115390 }, { "epoch": 2.8191190482007182, "grad_norm": 15.961345672607422, "learning_rate": 9.517856892950866e-07, "loss": 0.0981, "num_input_tokens_seen": 77783208, "step": 115395 }, { "epoch": 2.8192411990325654, "grad_norm": 0.020637147128582, "learning_rate": 9.51700511047696e-07, "loss": 0.0001, "num_input_tokens_seen": 77786728, "step": 115400 }, { "epoch": 2.8193633498644126, "grad_norm": 0.5006750822067261, "learning_rate": 9.516153331515528e-07, "loss": 0.0721, "num_input_tokens_seen": 77790120, "step": 115405 }, { "epoch": 2.81948550069626, "grad_norm": 0.04073977470397949, "learning_rate": 9.515301556072754e-07, "loss": 0.1149, "num_input_tokens_seen": 77793512, "step": 115410 }, { "epoch": 2.819607651528107, "grad_norm": 0.176833376288414, "learning_rate": 9.514449784154837e-07, "loss": 0.0486, "num_input_tokens_seen": 77797608, "step": 115415 }, { "epoch": 2.819729802359954, "grad_norm": 0.03162245824933052, "learning_rate": 9.513598015767978e-07, "loss": 0.0898, "num_input_tokens_seen": 77801384, "step": 115420 }, { "epoch": 2.8198519531918014, "grad_norm": 0.18794050812721252, "learning_rate": 9.512746250918358e-07, "loss": 0.0009, "num_input_tokens_seen": 77805096, "step": 115425 }, { "epoch": 2.8199741040236486, "grad_norm": 0.058585792779922485, "learning_rate": 9.511894489612183e-07, "loss": 0.001, "num_input_tokens_seen": 77808168, "step": 115430 }, { "epoch": 2.8200962548554953, "grad_norm": 0.03166075795888901, "learning_rate": 9.51104273185564e-07, "loss": 0.0834, "num_input_tokens_seen": 77811816, "step": 115435 }, { "epoch": 2.820218405687343, "grad_norm": 0.12651073932647705, "learning_rate": 9.510190977654924e-07, "loss": 0.0453, "num_input_tokens_seen": 77814952, "step": 115440 }, { "epoch": 2.8203405565191897, "grad_norm": 0.200551837682724, "learning_rate": 9.509339227016235e-07, "loss": 0.0005, "num_input_tokens_seen": 77818472, "step": 115445 }, { "epoch": 2.8204627073510373, "grad_norm": 0.04132993146777153, "learning_rate": 9.50848747994576e-07, "loss": 0.0503, "num_input_tokens_seen": 77821992, "step": 115450 }, { "epoch": 2.820584858182884, "grad_norm": 0.02948671020567417, "learning_rate": 9.5076357364497e-07, "loss": 0.1241, "num_input_tokens_seen": 77824936, "step": 115455 }, { "epoch": 2.8207070090147313, "grad_norm": 0.0442376546561718, "learning_rate": 9.506783996534244e-07, "loss": 0.0472, "num_input_tokens_seen": 77828264, "step": 115460 }, { "epoch": 2.8208291598465784, "grad_norm": 0.0049124970100820065, "learning_rate": 9.505932260205586e-07, "loss": 0.0284, "num_input_tokens_seen": 77831400, "step": 115465 }, { "epoch": 2.8209513106784256, "grad_norm": 39.6253776550293, "learning_rate": 9.505080527469924e-07, "loss": 0.0399, "num_input_tokens_seen": 77834536, "step": 115470 }, { "epoch": 2.821073461510273, "grad_norm": 0.0008584270253777504, "learning_rate": 9.504228798333445e-07, "loss": 0.0344, "num_input_tokens_seen": 77837928, "step": 115475 }, { "epoch": 2.82119561234212, "grad_norm": 0.014727511443197727, "learning_rate": 9.503377072802353e-07, "loss": 0.0446, "num_input_tokens_seen": 77841576, "step": 115480 }, { "epoch": 2.821317763173967, "grad_norm": 0.05348619073629379, "learning_rate": 9.502525350882831e-07, "loss": 0.0524, "num_input_tokens_seen": 77844520, "step": 115485 }, { "epoch": 2.8214399140058144, "grad_norm": 1.4710263013839722, "learning_rate": 9.501673632581083e-07, "loss": 0.0459, "num_input_tokens_seen": 77848424, "step": 115490 }, { "epoch": 2.8215620648376616, "grad_norm": 0.015803487971425056, "learning_rate": 9.500821917903295e-07, "loss": 0.0005, "num_input_tokens_seen": 77851944, "step": 115495 }, { "epoch": 2.8216842156695088, "grad_norm": 0.1972040981054306, "learning_rate": 9.499970206855668e-07, "loss": 0.0003, "num_input_tokens_seen": 77855400, "step": 115500 }, { "epoch": 2.821806366501356, "grad_norm": 0.051901161670684814, "learning_rate": 9.499118499444388e-07, "loss": 0.0003, "num_input_tokens_seen": 77858920, "step": 115505 }, { "epoch": 2.821928517333203, "grad_norm": 0.01611727848649025, "learning_rate": 9.498266795675654e-07, "loss": 0.0011, "num_input_tokens_seen": 77862376, "step": 115510 }, { "epoch": 2.8220506681650503, "grad_norm": 0.05335599184036255, "learning_rate": 9.497415095555659e-07, "loss": 0.0787, "num_input_tokens_seen": 77865896, "step": 115515 }, { "epoch": 2.822172818996897, "grad_norm": 0.031127309426665306, "learning_rate": 9.496563399090598e-07, "loss": 0.0487, "num_input_tokens_seen": 77869864, "step": 115520 }, { "epoch": 2.8222949698287447, "grad_norm": 0.09673364460468292, "learning_rate": 9.495711706286666e-07, "loss": 0.0255, "num_input_tokens_seen": 77873064, "step": 115525 }, { "epoch": 2.8224171206605915, "grad_norm": 0.5968812704086304, "learning_rate": 9.494860017150048e-07, "loss": 0.0015, "num_input_tokens_seen": 77876520, "step": 115530 }, { "epoch": 2.822539271492439, "grad_norm": 0.07545629888772964, "learning_rate": 9.494008331686945e-07, "loss": 0.0005, "num_input_tokens_seen": 77879976, "step": 115535 }, { "epoch": 2.822661422324286, "grad_norm": 0.09752354770898819, "learning_rate": 9.493156649903553e-07, "loss": 0.0006, "num_input_tokens_seen": 77883624, "step": 115540 }, { "epoch": 2.822783573156133, "grad_norm": 0.03341040015220642, "learning_rate": 9.492304971806059e-07, "loss": 0.0001, "num_input_tokens_seen": 77887400, "step": 115545 }, { "epoch": 2.8229057239879802, "grad_norm": 0.0756709948182106, "learning_rate": 9.491453297400663e-07, "loss": 0.0258, "num_input_tokens_seen": 77891048, "step": 115550 }, { "epoch": 2.8230278748198274, "grad_norm": 0.028244758024811745, "learning_rate": 9.490601626693551e-07, "loss": 0.0001, "num_input_tokens_seen": 77894184, "step": 115555 }, { "epoch": 2.8231500256516746, "grad_norm": 0.12282195687294006, "learning_rate": 9.489749959690926e-07, "loss": 0.0004, "num_input_tokens_seen": 77897384, "step": 115560 }, { "epoch": 2.823272176483522, "grad_norm": 0.011518844403326511, "learning_rate": 9.488898296398975e-07, "loss": 0.0003, "num_input_tokens_seen": 77900840, "step": 115565 }, { "epoch": 2.823394327315369, "grad_norm": 17.510047912597656, "learning_rate": 9.48804663682389e-07, "loss": 0.0249, "num_input_tokens_seen": 77904040, "step": 115570 }, { "epoch": 2.823516478147216, "grad_norm": 0.035026874393224716, "learning_rate": 9.487194980971871e-07, "loss": 0.0003, "num_input_tokens_seen": 77907624, "step": 115575 }, { "epoch": 2.8236386289790634, "grad_norm": 0.013446671888232231, "learning_rate": 9.486343328849105e-07, "loss": 0.0279, "num_input_tokens_seen": 77911080, "step": 115580 }, { "epoch": 2.8237607798109106, "grad_norm": 0.004692543763667345, "learning_rate": 9.485491680461792e-07, "loss": 0.0, "num_input_tokens_seen": 77914536, "step": 115585 }, { "epoch": 2.8238829306427577, "grad_norm": 0.03838277608156204, "learning_rate": 9.484640035816119e-07, "loss": 0.0459, "num_input_tokens_seen": 77918312, "step": 115590 }, { "epoch": 2.824005081474605, "grad_norm": 0.028557293117046356, "learning_rate": 9.483788394918285e-07, "loss": 0.073, "num_input_tokens_seen": 77922216, "step": 115595 }, { "epoch": 2.824127232306452, "grad_norm": 0.17949016392230988, "learning_rate": 9.482936757774477e-07, "loss": 0.0002, "num_input_tokens_seen": 77925736, "step": 115600 }, { "epoch": 2.8242493831382993, "grad_norm": 0.00452646566554904, "learning_rate": 9.482085124390892e-07, "loss": 0.0001, "num_input_tokens_seen": 77929064, "step": 115605 }, { "epoch": 2.8243715339701465, "grad_norm": 0.08502378314733505, "learning_rate": 9.481233494773727e-07, "loss": 0.0004, "num_input_tokens_seen": 77932136, "step": 115610 }, { "epoch": 2.8244936848019933, "grad_norm": 0.015604489482939243, "learning_rate": 9.48038186892917e-07, "loss": 0.063, "num_input_tokens_seen": 77934888, "step": 115615 }, { "epoch": 2.824615835633841, "grad_norm": 0.00344777200371027, "learning_rate": 9.479530246863416e-07, "loss": 0.0003, "num_input_tokens_seen": 77938344, "step": 115620 }, { "epoch": 2.8247379864656876, "grad_norm": 0.02435332164168358, "learning_rate": 9.478678628582657e-07, "loss": 0.0007, "num_input_tokens_seen": 77941672, "step": 115625 }, { "epoch": 2.8248601372975353, "grad_norm": 0.008998130448162556, "learning_rate": 9.477827014093086e-07, "loss": 0.0834, "num_input_tokens_seen": 77945000, "step": 115630 }, { "epoch": 2.824982288129382, "grad_norm": 0.023994240909814835, "learning_rate": 9.476975403400902e-07, "loss": 0.0504, "num_input_tokens_seen": 77948072, "step": 115635 }, { "epoch": 2.825104438961229, "grad_norm": 14.398282051086426, "learning_rate": 9.476123796512288e-07, "loss": 0.0314, "num_input_tokens_seen": 77951208, "step": 115640 }, { "epoch": 2.8252265897930764, "grad_norm": 0.08678370714187622, "learning_rate": 9.475272193433448e-07, "loss": 0.0604, "num_input_tokens_seen": 77954792, "step": 115645 }, { "epoch": 2.8253487406249236, "grad_norm": 86.93380737304688, "learning_rate": 9.474420594170566e-07, "loss": 0.0883, "num_input_tokens_seen": 77958632, "step": 115650 }, { "epoch": 2.8254708914567708, "grad_norm": 0.08339137583971024, "learning_rate": 9.473568998729842e-07, "loss": 0.034, "num_input_tokens_seen": 77962408, "step": 115655 }, { "epoch": 2.825593042288618, "grad_norm": 0.004629269242286682, "learning_rate": 9.472717407117461e-07, "loss": 0.0001, "num_input_tokens_seen": 77965736, "step": 115660 }, { "epoch": 2.825715193120465, "grad_norm": 0.06785409152507782, "learning_rate": 9.471865819339624e-07, "loss": 0.0514, "num_input_tokens_seen": 77969512, "step": 115665 }, { "epoch": 2.8258373439523123, "grad_norm": 14.922898292541504, "learning_rate": 9.471014235402521e-07, "loss": 0.1223, "num_input_tokens_seen": 77972968, "step": 115670 }, { "epoch": 2.8259594947841595, "grad_norm": 0.011822236701846123, "learning_rate": 9.470162655312344e-07, "loss": 0.0635, "num_input_tokens_seen": 77975912, "step": 115675 }, { "epoch": 2.8260816456160067, "grad_norm": 0.11484556645154953, "learning_rate": 9.46931107907529e-07, "loss": 0.0108, "num_input_tokens_seen": 77979176, "step": 115680 }, { "epoch": 2.826203796447854, "grad_norm": 0.01218679640442133, "learning_rate": 9.468459506697543e-07, "loss": 0.0002, "num_input_tokens_seen": 77982760, "step": 115685 }, { "epoch": 2.826325947279701, "grad_norm": 0.013855358585715294, "learning_rate": 9.467607938185301e-07, "loss": 0.0003, "num_input_tokens_seen": 77986216, "step": 115690 }, { "epoch": 2.8264480981115483, "grad_norm": 0.09669753909111023, "learning_rate": 9.466756373544763e-07, "loss": 0.0501, "num_input_tokens_seen": 77989672, "step": 115695 }, { "epoch": 2.826570248943395, "grad_norm": 0.03389380872249603, "learning_rate": 9.465904812782112e-07, "loss": 0.1267, "num_input_tokens_seen": 77992872, "step": 115700 }, { "epoch": 2.8266923997752427, "grad_norm": 0.027956049889326096, "learning_rate": 9.465053255903548e-07, "loss": 0.0841, "num_input_tokens_seen": 77995880, "step": 115705 }, { "epoch": 2.8268145506070894, "grad_norm": 0.006097717210650444, "learning_rate": 9.464201702915256e-07, "loss": 0.0007, "num_input_tokens_seen": 77999400, "step": 115710 }, { "epoch": 2.826936701438937, "grad_norm": 0.02535603940486908, "learning_rate": 9.463350153823438e-07, "loss": 0.1111, "num_input_tokens_seen": 78003112, "step": 115715 }, { "epoch": 2.827058852270784, "grad_norm": 0.7845985889434814, "learning_rate": 9.462498608634281e-07, "loss": 0.0557, "num_input_tokens_seen": 78006568, "step": 115720 }, { "epoch": 2.827181003102631, "grad_norm": 0.061375007033348083, "learning_rate": 9.461647067353975e-07, "loss": 0.0315, "num_input_tokens_seen": 78009512, "step": 115725 }, { "epoch": 2.827303153934478, "grad_norm": 0.10138930380344391, "learning_rate": 9.460795529988723e-07, "loss": 0.0488, "num_input_tokens_seen": 78013160, "step": 115730 }, { "epoch": 2.8274253047663254, "grad_norm": 0.11310622096061707, "learning_rate": 9.459943996544703e-07, "loss": 0.041, "num_input_tokens_seen": 78016232, "step": 115735 }, { "epoch": 2.8275474555981726, "grad_norm": 0.015895165503025055, "learning_rate": 9.459092467028122e-07, "loss": 0.0001, "num_input_tokens_seen": 78019688, "step": 115740 }, { "epoch": 2.8276696064300197, "grad_norm": 0.004023950546979904, "learning_rate": 9.458240941445163e-07, "loss": 0.0002, "num_input_tokens_seen": 78023016, "step": 115745 }, { "epoch": 2.827791757261867, "grad_norm": 0.7409040331840515, "learning_rate": 9.457389419802024e-07, "loss": 0.0003, "num_input_tokens_seen": 78026344, "step": 115750 }, { "epoch": 2.827913908093714, "grad_norm": 0.03319403901696205, "learning_rate": 9.45653790210489e-07, "loss": 0.0001, "num_input_tokens_seen": 78029928, "step": 115755 }, { "epoch": 2.8280360589255613, "grad_norm": 0.024063147604465485, "learning_rate": 9.455686388359961e-07, "loss": 0.0024, "num_input_tokens_seen": 78033320, "step": 115760 }, { "epoch": 2.8281582097574085, "grad_norm": 0.011868052184581757, "learning_rate": 9.45483487857343e-07, "loss": 0.0001, "num_input_tokens_seen": 78036264, "step": 115765 }, { "epoch": 2.8282803605892557, "grad_norm": 0.018821561709046364, "learning_rate": 9.453983372751484e-07, "loss": 0.0918, "num_input_tokens_seen": 78039656, "step": 115770 }, { "epoch": 2.828402511421103, "grad_norm": 0.20139813423156738, "learning_rate": 9.453131870900318e-07, "loss": 0.1312, "num_input_tokens_seen": 78042600, "step": 115775 }, { "epoch": 2.82852466225295, "grad_norm": 0.21862444281578064, "learning_rate": 9.452280373026125e-07, "loss": 0.032, "num_input_tokens_seen": 78045544, "step": 115780 }, { "epoch": 2.8286468130847973, "grad_norm": 0.128239706158638, "learning_rate": 9.451428879135093e-07, "loss": 0.0007, "num_input_tokens_seen": 78048424, "step": 115785 }, { "epoch": 2.8287689639166445, "grad_norm": 42.088844299316406, "learning_rate": 9.450577389233423e-07, "loss": 0.0537, "num_input_tokens_seen": 78051688, "step": 115790 }, { "epoch": 2.828891114748491, "grad_norm": 16.15775489807129, "learning_rate": 9.449725903327297e-07, "loss": 0.0318, "num_input_tokens_seen": 78055272, "step": 115795 }, { "epoch": 2.829013265580339, "grad_norm": 0.013786409981548786, "learning_rate": 9.448874421422916e-07, "loss": 0.0493, "num_input_tokens_seen": 78059240, "step": 115800 }, { "epoch": 2.8291354164121856, "grad_norm": 0.011192580685019493, "learning_rate": 9.448022943526466e-07, "loss": 0.0586, "num_input_tokens_seen": 78062568, "step": 115805 }, { "epoch": 2.8292575672440328, "grad_norm": 0.739030122756958, "learning_rate": 9.447171469644144e-07, "loss": 0.0004, "num_input_tokens_seen": 78066152, "step": 115810 }, { "epoch": 2.82937971807588, "grad_norm": 0.02116185426712036, "learning_rate": 9.446319999782136e-07, "loss": 0.0, "num_input_tokens_seen": 78069288, "step": 115815 }, { "epoch": 2.829501868907727, "grad_norm": 0.9456649422645569, "learning_rate": 9.445468533946641e-07, "loss": 0.0536, "num_input_tokens_seen": 78072872, "step": 115820 }, { "epoch": 2.8296240197395743, "grad_norm": 0.019342761486768723, "learning_rate": 9.444617072143848e-07, "loss": 0.0002, "num_input_tokens_seen": 78076200, "step": 115825 }, { "epoch": 2.8297461705714215, "grad_norm": 0.44813454151153564, "learning_rate": 9.443765614379948e-07, "loss": 0.0005, "num_input_tokens_seen": 78079336, "step": 115830 }, { "epoch": 2.8298683214032687, "grad_norm": 0.042555276304483414, "learning_rate": 9.442914160661137e-07, "loss": 0.0002, "num_input_tokens_seen": 78082792, "step": 115835 }, { "epoch": 2.829990472235116, "grad_norm": 0.04323071613907814, "learning_rate": 9.442062710993599e-07, "loss": 0.0349, "num_input_tokens_seen": 78086824, "step": 115840 }, { "epoch": 2.830112623066963, "grad_norm": 26.882911682128906, "learning_rate": 9.44121126538353e-07, "loss": 0.1367, "num_input_tokens_seen": 78090664, "step": 115845 }, { "epoch": 2.8302347738988103, "grad_norm": 0.062530517578125, "learning_rate": 9.44035982383713e-07, "loss": 0.0001, "num_input_tokens_seen": 78093800, "step": 115850 }, { "epoch": 2.8303569247306575, "grad_norm": 0.19051715731620789, "learning_rate": 9.439508386360577e-07, "loss": 0.2, "num_input_tokens_seen": 78097192, "step": 115855 }, { "epoch": 2.8304790755625047, "grad_norm": 0.08016974478960037, "learning_rate": 9.438656952960076e-07, "loss": 0.0004, "num_input_tokens_seen": 78100584, "step": 115860 }, { "epoch": 2.830601226394352, "grad_norm": 0.0509033165872097, "learning_rate": 9.437805523641808e-07, "loss": 0.0002, "num_input_tokens_seen": 78103656, "step": 115865 }, { "epoch": 2.830723377226199, "grad_norm": 0.002763264812529087, "learning_rate": 9.436954098411973e-07, "loss": 0.0003, "num_input_tokens_seen": 78106792, "step": 115870 }, { "epoch": 2.8308455280580462, "grad_norm": 0.08348390460014343, "learning_rate": 9.436102677276757e-07, "loss": 0.0003, "num_input_tokens_seen": 78109864, "step": 115875 }, { "epoch": 2.830967678889893, "grad_norm": 0.009231437928974628, "learning_rate": 9.435251260242352e-07, "loss": 0.1042, "num_input_tokens_seen": 78113000, "step": 115880 }, { "epoch": 2.8310898297217406, "grad_norm": 0.2583863139152527, "learning_rate": 9.434399847314958e-07, "loss": 0.0387, "num_input_tokens_seen": 78116072, "step": 115885 }, { "epoch": 2.8312119805535874, "grad_norm": 0.4468061625957489, "learning_rate": 9.433548438500753e-07, "loss": 0.0591, "num_input_tokens_seen": 78119400, "step": 115890 }, { "epoch": 2.831334131385435, "grad_norm": 0.37474071979522705, "learning_rate": 9.432697033805943e-07, "loss": 0.0005, "num_input_tokens_seen": 78122856, "step": 115895 }, { "epoch": 2.8314562822172817, "grad_norm": 0.8062929511070251, "learning_rate": 9.431845633236707e-07, "loss": 0.0006, "num_input_tokens_seen": 78126440, "step": 115900 }, { "epoch": 2.831578433049129, "grad_norm": 0.009114105254411697, "learning_rate": 9.430994236799247e-07, "loss": 0.0405, "num_input_tokens_seen": 78130024, "step": 115905 }, { "epoch": 2.831700583880976, "grad_norm": 0.45530620217323303, "learning_rate": 9.430142844499746e-07, "loss": 0.0005, "num_input_tokens_seen": 78133288, "step": 115910 }, { "epoch": 2.8318227347128233, "grad_norm": 0.06309596449136734, "learning_rate": 9.429291456344398e-07, "loss": 0.0001, "num_input_tokens_seen": 78136296, "step": 115915 }, { "epoch": 2.8319448855446705, "grad_norm": 11.176220893859863, "learning_rate": 9.428440072339402e-07, "loss": 0.0636, "num_input_tokens_seen": 78139624, "step": 115920 }, { "epoch": 2.8320670363765177, "grad_norm": 0.014629398472607136, "learning_rate": 9.42758869249094e-07, "loss": 0.0001, "num_input_tokens_seen": 78143080, "step": 115925 }, { "epoch": 2.832189187208365, "grad_norm": 0.048689037561416626, "learning_rate": 9.426737316805209e-07, "loss": 0.0874, "num_input_tokens_seen": 78146088, "step": 115930 }, { "epoch": 2.832311338040212, "grad_norm": 0.03567717596888542, "learning_rate": 9.425885945288397e-07, "loss": 0.0012, "num_input_tokens_seen": 78149800, "step": 115935 }, { "epoch": 2.8324334888720593, "grad_norm": 0.03357863798737526, "learning_rate": 9.425034577946696e-07, "loss": 0.0344, "num_input_tokens_seen": 78152936, "step": 115940 }, { "epoch": 2.8325556397039064, "grad_norm": 31.494722366333008, "learning_rate": 9.424183214786301e-07, "loss": 0.0427, "num_input_tokens_seen": 78156008, "step": 115945 }, { "epoch": 2.8326777905357536, "grad_norm": 0.0009557157754898071, "learning_rate": 9.423331855813396e-07, "loss": 0.0006, "num_input_tokens_seen": 78159336, "step": 115950 }, { "epoch": 2.832799941367601, "grad_norm": 0.3145740330219269, "learning_rate": 9.422480501034183e-07, "loss": 0.0478, "num_input_tokens_seen": 78162344, "step": 115955 }, { "epoch": 2.832922092199448, "grad_norm": 0.019206028431653976, "learning_rate": 9.421629150454841e-07, "loss": 0.0656, "num_input_tokens_seen": 78165480, "step": 115960 }, { "epoch": 2.833044243031295, "grad_norm": 0.011119597591459751, "learning_rate": 9.420777804081572e-07, "loss": 0.1134, "num_input_tokens_seen": 78168616, "step": 115965 }, { "epoch": 2.8331663938631424, "grad_norm": 0.07654442638158798, "learning_rate": 9.419926461920559e-07, "loss": 0.0411, "num_input_tokens_seen": 78172392, "step": 115970 }, { "epoch": 2.833288544694989, "grad_norm": 11.022329330444336, "learning_rate": 9.419075123977999e-07, "loss": 0.03, "num_input_tokens_seen": 78175528, "step": 115975 }, { "epoch": 2.8334106955268368, "grad_norm": 0.04601052403450012, "learning_rate": 9.41822379026008e-07, "loss": 0.0001, "num_input_tokens_seen": 78179176, "step": 115980 }, { "epoch": 2.8335328463586835, "grad_norm": 0.12010195106267929, "learning_rate": 9.417372460772994e-07, "loss": 0.0004, "num_input_tokens_seen": 78182504, "step": 115985 }, { "epoch": 2.8336549971905307, "grad_norm": 0.002848004223778844, "learning_rate": 9.416521135522936e-07, "loss": 0.0001, "num_input_tokens_seen": 78185960, "step": 115990 }, { "epoch": 2.833777148022378, "grad_norm": 0.06952404975891113, "learning_rate": 9.415669814516087e-07, "loss": 0.0012, "num_input_tokens_seen": 78189416, "step": 115995 }, { "epoch": 2.833899298854225, "grad_norm": 0.2463352084159851, "learning_rate": 9.414818497758645e-07, "loss": 0.0218, "num_input_tokens_seen": 78192936, "step": 116000 }, { "epoch": 2.8340214496860723, "grad_norm": 0.028511550277471542, "learning_rate": 9.413967185256806e-07, "loss": 0.0002, "num_input_tokens_seen": 78196200, "step": 116005 }, { "epoch": 2.8341436005179195, "grad_norm": 0.09941234439611435, "learning_rate": 9.413115877016749e-07, "loss": 0.0003, "num_input_tokens_seen": 78200104, "step": 116010 }, { "epoch": 2.8342657513497667, "grad_norm": 0.009271027520298958, "learning_rate": 9.412264573044676e-07, "loss": 0.092, "num_input_tokens_seen": 78203432, "step": 116015 }, { "epoch": 2.834387902181614, "grad_norm": 0.14161081612110138, "learning_rate": 9.411413273346769e-07, "loss": 0.0005, "num_input_tokens_seen": 78206760, "step": 116020 }, { "epoch": 2.834510053013461, "grad_norm": 0.05342858284711838, "learning_rate": 9.410561977929228e-07, "loss": 0.0717, "num_input_tokens_seen": 78210536, "step": 116025 }, { "epoch": 2.8346322038453082, "grad_norm": 89.6737289428711, "learning_rate": 9.409710686798236e-07, "loss": 0.1138, "num_input_tokens_seen": 78213864, "step": 116030 }, { "epoch": 2.8347543546771554, "grad_norm": 464.81121826171875, "learning_rate": 9.408859399959984e-07, "loss": 0.0712, "num_input_tokens_seen": 78216936, "step": 116035 }, { "epoch": 2.8348765055090026, "grad_norm": 0.04185141995549202, "learning_rate": 9.408008117420671e-07, "loss": 0.0003, "num_input_tokens_seen": 78220200, "step": 116040 }, { "epoch": 2.83499865634085, "grad_norm": 0.03626517951488495, "learning_rate": 9.407156839186477e-07, "loss": 0.0403, "num_input_tokens_seen": 78223464, "step": 116045 }, { "epoch": 2.835120807172697, "grad_norm": 0.018436536192893982, "learning_rate": 9.406305565263602e-07, "loss": 0.0548, "num_input_tokens_seen": 78227176, "step": 116050 }, { "epoch": 2.835242958004544, "grad_norm": 0.4739210605621338, "learning_rate": 9.405454295658229e-07, "loss": 0.0007, "num_input_tokens_seen": 78230696, "step": 116055 }, { "epoch": 2.835365108836391, "grad_norm": 0.08458838611841202, "learning_rate": 9.404603030376555e-07, "loss": 0.0455, "num_input_tokens_seen": 78233640, "step": 116060 }, { "epoch": 2.8354872596682386, "grad_norm": 0.004867865238338709, "learning_rate": 9.403751769424765e-07, "loss": 0.0569, "num_input_tokens_seen": 78237480, "step": 116065 }, { "epoch": 2.8356094105000853, "grad_norm": 14.933873176574707, "learning_rate": 9.402900512809052e-07, "loss": 0.0715, "num_input_tokens_seen": 78240680, "step": 116070 }, { "epoch": 2.835731561331933, "grad_norm": 25.75342559814453, "learning_rate": 9.402049260535613e-07, "loss": 0.0405, "num_input_tokens_seen": 78244136, "step": 116075 }, { "epoch": 2.8358537121637797, "grad_norm": 0.1192922443151474, "learning_rate": 9.401198012610628e-07, "loss": 0.0513, "num_input_tokens_seen": 78247912, "step": 116080 }, { "epoch": 2.835975862995627, "grad_norm": 0.06797201931476593, "learning_rate": 9.400346769040294e-07, "loss": 0.0001, "num_input_tokens_seen": 78251176, "step": 116085 }, { "epoch": 2.836098013827474, "grad_norm": 10.027449607849121, "learning_rate": 9.399495529830798e-07, "loss": 0.0773, "num_input_tokens_seen": 78254120, "step": 116090 }, { "epoch": 2.8362201646593213, "grad_norm": 21.70563316345215, "learning_rate": 9.398644294988332e-07, "loss": 0.1131, "num_input_tokens_seen": 78257256, "step": 116095 }, { "epoch": 2.8363423154911684, "grad_norm": 1.4771491289138794, "learning_rate": 9.397793064519088e-07, "loss": 0.0006, "num_input_tokens_seen": 78261160, "step": 116100 }, { "epoch": 2.8364644663230156, "grad_norm": 0.07461465150117874, "learning_rate": 9.396941838429253e-07, "loss": 0.0003, "num_input_tokens_seen": 78264488, "step": 116105 }, { "epoch": 2.836586617154863, "grad_norm": 0.023504182696342468, "learning_rate": 9.396090616725022e-07, "loss": 0.0073, "num_input_tokens_seen": 78267624, "step": 116110 }, { "epoch": 2.83670876798671, "grad_norm": 0.16718228161334991, "learning_rate": 9.395239399412579e-07, "loss": 0.0315, "num_input_tokens_seen": 78271016, "step": 116115 }, { "epoch": 2.836830918818557, "grad_norm": 24.0216121673584, "learning_rate": 9.394388186498121e-07, "loss": 0.0016, "num_input_tokens_seen": 78274536, "step": 116120 }, { "epoch": 2.8369530696504044, "grad_norm": 0.4365783929824829, "learning_rate": 9.393536977987831e-07, "loss": 0.0001, "num_input_tokens_seen": 78278504, "step": 116125 }, { "epoch": 2.8370752204822516, "grad_norm": 0.1103735938668251, "learning_rate": 9.392685773887907e-07, "loss": 0.0003, "num_input_tokens_seen": 78281256, "step": 116130 }, { "epoch": 2.8371973713140988, "grad_norm": 27.963003158569336, "learning_rate": 9.391834574204534e-07, "loss": 0.103, "num_input_tokens_seen": 78284392, "step": 116135 }, { "epoch": 2.837319522145946, "grad_norm": 0.08092053979635239, "learning_rate": 9.390983378943903e-07, "loss": 0.0001, "num_input_tokens_seen": 78288104, "step": 116140 }, { "epoch": 2.8374416729777927, "grad_norm": 0.039317984133958817, "learning_rate": 9.390132188112207e-07, "loss": 0.0001, "num_input_tokens_seen": 78291944, "step": 116145 }, { "epoch": 2.8375638238096403, "grad_norm": 0.015090661123394966, "learning_rate": 9.389281001715631e-07, "loss": 0.0004, "num_input_tokens_seen": 78295464, "step": 116150 }, { "epoch": 2.837685974641487, "grad_norm": 0.241033136844635, "learning_rate": 9.388429819760367e-07, "loss": 0.0004, "num_input_tokens_seen": 78298536, "step": 116155 }, { "epoch": 2.8378081254733347, "grad_norm": 0.003959252033382654, "learning_rate": 9.38757864225261e-07, "loss": 0.0002, "num_input_tokens_seen": 78302184, "step": 116160 }, { "epoch": 2.8379302763051815, "grad_norm": 0.12065454572439194, "learning_rate": 9.386727469198541e-07, "loss": 0.0452, "num_input_tokens_seen": 78305704, "step": 116165 }, { "epoch": 2.8380524271370287, "grad_norm": 0.0025195570196956396, "learning_rate": 9.385876300604359e-07, "loss": 0.036, "num_input_tokens_seen": 78309160, "step": 116170 }, { "epoch": 2.838174577968876, "grad_norm": 0.0028503055218607187, "learning_rate": 9.385025136476246e-07, "loss": 0.0001, "num_input_tokens_seen": 78312744, "step": 116175 }, { "epoch": 2.838296728800723, "grad_norm": 13.748456954956055, "learning_rate": 9.3841739768204e-07, "loss": 0.0361, "num_input_tokens_seen": 78316136, "step": 116180 }, { "epoch": 2.8384188796325702, "grad_norm": 0.03185724467039108, "learning_rate": 9.383322821643003e-07, "loss": 0.0304, "num_input_tokens_seen": 78319784, "step": 116185 }, { "epoch": 2.8385410304644174, "grad_norm": 0.03878801688551903, "learning_rate": 9.382471670950248e-07, "loss": 0.0021, "num_input_tokens_seen": 78322728, "step": 116190 }, { "epoch": 2.8386631812962646, "grad_norm": 0.11397980153560638, "learning_rate": 9.38162052474833e-07, "loss": 0.0001, "num_input_tokens_seen": 78326312, "step": 116195 }, { "epoch": 2.838785332128112, "grad_norm": 0.062247421592473984, "learning_rate": 9.380769383043428e-07, "loss": 0.0001, "num_input_tokens_seen": 78329832, "step": 116200 }, { "epoch": 2.838907482959959, "grad_norm": 0.0130871357396245, "learning_rate": 9.379918245841741e-07, "loss": 0.066, "num_input_tokens_seen": 78333096, "step": 116205 }, { "epoch": 2.839029633791806, "grad_norm": 0.017938796430826187, "learning_rate": 9.379067113149452e-07, "loss": 0.0003, "num_input_tokens_seen": 78336552, "step": 116210 }, { "epoch": 2.8391517846236534, "grad_norm": 26.627717971801758, "learning_rate": 9.378215984972759e-07, "loss": 0.1872, "num_input_tokens_seen": 78340584, "step": 116215 }, { "epoch": 2.8392739354555006, "grad_norm": 0.03087383322417736, "learning_rate": 9.377364861317843e-07, "loss": 0.0005, "num_input_tokens_seen": 78344744, "step": 116220 }, { "epoch": 2.8393960862873477, "grad_norm": 35.27699279785156, "learning_rate": 9.376513742190896e-07, "loss": 0.0476, "num_input_tokens_seen": 78348136, "step": 116225 }, { "epoch": 2.839518237119195, "grad_norm": 0.001459734863601625, "learning_rate": 9.375662627598113e-07, "loss": 0.0004, "num_input_tokens_seen": 78351400, "step": 116230 }, { "epoch": 2.839640387951042, "grad_norm": 15.316701889038086, "learning_rate": 9.374811517545677e-07, "loss": 0.0415, "num_input_tokens_seen": 78354536, "step": 116235 }, { "epoch": 2.839762538782889, "grad_norm": 24.034547805786133, "learning_rate": 9.37396041203978e-07, "loss": 0.1079, "num_input_tokens_seen": 78357864, "step": 116240 }, { "epoch": 2.8398846896147365, "grad_norm": 0.1375059336423874, "learning_rate": 9.373109311086612e-07, "loss": 0.0373, "num_input_tokens_seen": 78361576, "step": 116245 }, { "epoch": 2.8400068404465832, "grad_norm": 0.002783549018204212, "learning_rate": 9.372258214692358e-07, "loss": 0.1102, "num_input_tokens_seen": 78365032, "step": 116250 }, { "epoch": 2.840128991278431, "grad_norm": 0.8223499059677124, "learning_rate": 9.371407122863217e-07, "loss": 0.0763, "num_input_tokens_seen": 78368232, "step": 116255 }, { "epoch": 2.8402511421102776, "grad_norm": 0.0012122441548854113, "learning_rate": 9.370556035605366e-07, "loss": 0.1492, "num_input_tokens_seen": 78371432, "step": 116260 }, { "epoch": 2.840373292942125, "grad_norm": 0.007098636124283075, "learning_rate": 9.369704952925007e-07, "loss": 0.048, "num_input_tokens_seen": 78374824, "step": 116265 }, { "epoch": 2.840495443773972, "grad_norm": 0.11667418479919434, "learning_rate": 9.368853874828318e-07, "loss": 0.0225, "num_input_tokens_seen": 78378088, "step": 116270 }, { "epoch": 2.840617594605819, "grad_norm": 0.003962985239923, "learning_rate": 9.368002801321499e-07, "loss": 0.0001, "num_input_tokens_seen": 78381608, "step": 116275 }, { "epoch": 2.8407397454376664, "grad_norm": 74.27780151367188, "learning_rate": 9.367151732410727e-07, "loss": 0.0282, "num_input_tokens_seen": 78384936, "step": 116280 }, { "epoch": 2.8408618962695136, "grad_norm": 0.3086697459220886, "learning_rate": 9.366300668102201e-07, "loss": 0.0012, "num_input_tokens_seen": 78388136, "step": 116285 }, { "epoch": 2.8409840471013608, "grad_norm": 40.96001052856445, "learning_rate": 9.365449608402107e-07, "loss": 0.0479, "num_input_tokens_seen": 78391784, "step": 116290 }, { "epoch": 2.841106197933208, "grad_norm": 0.12763236463069916, "learning_rate": 9.364598553316635e-07, "loss": 0.001, "num_input_tokens_seen": 78394792, "step": 116295 }, { "epoch": 2.841228348765055, "grad_norm": 0.08105745911598206, "learning_rate": 9.363747502851975e-07, "loss": 0.0002, "num_input_tokens_seen": 78398440, "step": 116300 }, { "epoch": 2.8413504995969023, "grad_norm": 0.009671597741544247, "learning_rate": 9.36289645701431e-07, "loss": 0.0001, "num_input_tokens_seen": 78402088, "step": 116305 }, { "epoch": 2.8414726504287495, "grad_norm": 0.015310993418097496, "learning_rate": 9.362045415809837e-07, "loss": 0.0002, "num_input_tokens_seen": 78405480, "step": 116310 }, { "epoch": 2.8415948012605967, "grad_norm": 0.32537516951560974, "learning_rate": 9.361194379244738e-07, "loss": 0.026, "num_input_tokens_seen": 78408808, "step": 116315 }, { "epoch": 2.841716952092444, "grad_norm": 0.1695280373096466, "learning_rate": 9.360343347325204e-07, "loss": 0.0271, "num_input_tokens_seen": 78412200, "step": 116320 }, { "epoch": 2.8418391029242906, "grad_norm": 0.31750592589378357, "learning_rate": 9.359492320057431e-07, "loss": 0.0005, "num_input_tokens_seen": 78415656, "step": 116325 }, { "epoch": 2.8419612537561383, "grad_norm": 0.2636977732181549, "learning_rate": 9.358641297447596e-07, "loss": 0.0516, "num_input_tokens_seen": 78419752, "step": 116330 }, { "epoch": 2.842083404587985, "grad_norm": 0.0896163284778595, "learning_rate": 9.357790279501901e-07, "loss": 0.0686, "num_input_tokens_seen": 78422760, "step": 116335 }, { "epoch": 2.8422055554198327, "grad_norm": 0.2173192948102951, "learning_rate": 9.35693926622652e-07, "loss": 0.0587, "num_input_tokens_seen": 78425960, "step": 116340 }, { "epoch": 2.8423277062516794, "grad_norm": 0.984163224697113, "learning_rate": 9.356088257627655e-07, "loss": 0.0005, "num_input_tokens_seen": 78429160, "step": 116345 }, { "epoch": 2.8424498570835266, "grad_norm": 0.02859634719789028, "learning_rate": 9.355237253711489e-07, "loss": 0.0003, "num_input_tokens_seen": 78432680, "step": 116350 }, { "epoch": 2.842572007915374, "grad_norm": 36.05793762207031, "learning_rate": 9.354386254484207e-07, "loss": 0.08, "num_input_tokens_seen": 78435688, "step": 116355 }, { "epoch": 2.842694158747221, "grad_norm": 0.11167803406715393, "learning_rate": 9.353535259952009e-07, "loss": 0.0001, "num_input_tokens_seen": 78439144, "step": 116360 }, { "epoch": 2.842816309579068, "grad_norm": 0.005631339270621538, "learning_rate": 9.35268427012107e-07, "loss": 0.0002, "num_input_tokens_seen": 78442408, "step": 116365 }, { "epoch": 2.8429384604109154, "grad_norm": 0.03177811950445175, "learning_rate": 9.351833284997589e-07, "loss": 0.0001, "num_input_tokens_seen": 78445608, "step": 116370 }, { "epoch": 2.8430606112427625, "grad_norm": 0.01641424559056759, "learning_rate": 9.350982304587746e-07, "loss": 0.0001, "num_input_tokens_seen": 78449064, "step": 116375 }, { "epoch": 2.8431827620746097, "grad_norm": 0.0047453767620027065, "learning_rate": 9.350131328897736e-07, "loss": 0.0079, "num_input_tokens_seen": 78452328, "step": 116380 }, { "epoch": 2.843304912906457, "grad_norm": 0.25099989771842957, "learning_rate": 9.34928035793375e-07, "loss": 0.0005, "num_input_tokens_seen": 78455784, "step": 116385 }, { "epoch": 2.843427063738304, "grad_norm": 0.008549386635422707, "learning_rate": 9.348429391701969e-07, "loss": 0.0002, "num_input_tokens_seen": 78458920, "step": 116390 }, { "epoch": 2.8435492145701513, "grad_norm": 0.2677416205406189, "learning_rate": 9.347578430208586e-07, "loss": 0.0007, "num_input_tokens_seen": 78462248, "step": 116395 }, { "epoch": 2.8436713654019985, "grad_norm": 0.44831427931785583, "learning_rate": 9.346727473459787e-07, "loss": 0.0003, "num_input_tokens_seen": 78465768, "step": 116400 }, { "epoch": 2.8437935162338457, "grad_norm": 0.01087011955678463, "learning_rate": 9.34587652146176e-07, "loss": 0.0006, "num_input_tokens_seen": 78469672, "step": 116405 }, { "epoch": 2.843915667065693, "grad_norm": 0.0809653252363205, "learning_rate": 9.345025574220698e-07, "loss": 0.0397, "num_input_tokens_seen": 78472744, "step": 116410 }, { "epoch": 2.84403781789754, "grad_norm": 0.02179126814007759, "learning_rate": 9.344174631742782e-07, "loss": 0.0002, "num_input_tokens_seen": 78476136, "step": 116415 }, { "epoch": 2.844159968729387, "grad_norm": 0.005103960167616606, "learning_rate": 9.34332369403421e-07, "loss": 0.0002, "num_input_tokens_seen": 78479656, "step": 116420 }, { "epoch": 2.8442821195612344, "grad_norm": 472.42657470703125, "learning_rate": 9.342472761101161e-07, "loss": 0.0707, "num_input_tokens_seen": 78482856, "step": 116425 }, { "epoch": 2.844404270393081, "grad_norm": 0.06024815887212753, "learning_rate": 9.34162183294983e-07, "loss": 0.0003, "num_input_tokens_seen": 78485928, "step": 116430 }, { "epoch": 2.8445264212249284, "grad_norm": 0.05335991457104683, "learning_rate": 9.340770909586397e-07, "loss": 0.0332, "num_input_tokens_seen": 78489512, "step": 116435 }, { "epoch": 2.8446485720567756, "grad_norm": 0.05633782967925072, "learning_rate": 9.339919991017059e-07, "loss": 0.0, "num_input_tokens_seen": 78492648, "step": 116440 }, { "epoch": 2.8447707228886228, "grad_norm": 0.015017188154160976, "learning_rate": 9.339069077248e-07, "loss": 0.112, "num_input_tokens_seen": 78495976, "step": 116445 }, { "epoch": 2.84489287372047, "grad_norm": 0.13255448639392853, "learning_rate": 9.338218168285407e-07, "loss": 0.0001, "num_input_tokens_seen": 78499432, "step": 116450 }, { "epoch": 2.845015024552317, "grad_norm": 0.06532268226146698, "learning_rate": 9.337367264135474e-07, "loss": 0.0442, "num_input_tokens_seen": 78502952, "step": 116455 }, { "epoch": 2.8451371753841643, "grad_norm": 0.009263267740607262, "learning_rate": 9.336516364804379e-07, "loss": 0.0002, "num_input_tokens_seen": 78506088, "step": 116460 }, { "epoch": 2.8452593262160115, "grad_norm": 0.03530608117580414, "learning_rate": 9.335665470298319e-07, "loss": 0.0, "num_input_tokens_seen": 78509992, "step": 116465 }, { "epoch": 2.8453814770478587, "grad_norm": 0.05856453999876976, "learning_rate": 9.334814580623476e-07, "loss": 0.0432, "num_input_tokens_seen": 78513832, "step": 116470 }, { "epoch": 2.845503627879706, "grad_norm": 0.36436575651168823, "learning_rate": 9.333963695786038e-07, "loss": 0.0003, "num_input_tokens_seen": 78517032, "step": 116475 }, { "epoch": 2.845625778711553, "grad_norm": 0.054122958332300186, "learning_rate": 9.333112815792202e-07, "loss": 0.0007, "num_input_tokens_seen": 78520232, "step": 116480 }, { "epoch": 2.8457479295434003, "grad_norm": 0.011439117603003979, "learning_rate": 9.332261940648143e-07, "loss": 0.0563, "num_input_tokens_seen": 78523560, "step": 116485 }, { "epoch": 2.8458700803752475, "grad_norm": 0.12531723082065582, "learning_rate": 9.331411070360059e-07, "loss": 0.0002, "num_input_tokens_seen": 78527208, "step": 116490 }, { "epoch": 2.8459922312070947, "grad_norm": 0.012637167237699032, "learning_rate": 9.330560204934129e-07, "loss": 0.0016, "num_input_tokens_seen": 78530408, "step": 116495 }, { "epoch": 2.846114382038942, "grad_norm": 0.013321335427463055, "learning_rate": 9.329709344376549e-07, "loss": 0.0004, "num_input_tokens_seen": 78533544, "step": 116500 }, { "epoch": 2.8462365328707886, "grad_norm": 76.95355224609375, "learning_rate": 9.328858488693503e-07, "loss": 0.0606, "num_input_tokens_seen": 78536680, "step": 116505 }, { "epoch": 2.8463586837026362, "grad_norm": 0.23179636895656586, "learning_rate": 9.328007637891175e-07, "loss": 0.0001, "num_input_tokens_seen": 78539752, "step": 116510 }, { "epoch": 2.846480834534483, "grad_norm": 0.005101948510855436, "learning_rate": 9.327156791975762e-07, "loss": 0.0001, "num_input_tokens_seen": 78543400, "step": 116515 }, { "epoch": 2.8466029853663306, "grad_norm": 0.07535602897405624, "learning_rate": 9.326305950953439e-07, "loss": 0.0796, "num_input_tokens_seen": 78546792, "step": 116520 }, { "epoch": 2.8467251361981774, "grad_norm": 42.198482513427734, "learning_rate": 9.325455114830406e-07, "loss": 0.0829, "num_input_tokens_seen": 78550120, "step": 116525 }, { "epoch": 2.8468472870300245, "grad_norm": 0.0038453794550150633, "learning_rate": 9.32460428361284e-07, "loss": 0.1552, "num_input_tokens_seen": 78553128, "step": 116530 }, { "epoch": 2.8469694378618717, "grad_norm": 0.15347400307655334, "learning_rate": 9.323753457306934e-07, "loss": 0.0618, "num_input_tokens_seen": 78556584, "step": 116535 }, { "epoch": 2.847091588693719, "grad_norm": 0.0010158936493098736, "learning_rate": 9.322902635918879e-07, "loss": 0.0, "num_input_tokens_seen": 78560040, "step": 116540 }, { "epoch": 2.847213739525566, "grad_norm": 0.02456558868288994, "learning_rate": 9.322051819454856e-07, "loss": 0.0001, "num_input_tokens_seen": 78563432, "step": 116545 }, { "epoch": 2.8473358903574133, "grad_norm": 0.06507387012243271, "learning_rate": 9.321201007921054e-07, "loss": 0.0003, "num_input_tokens_seen": 78566376, "step": 116550 }, { "epoch": 2.8474580411892605, "grad_norm": 0.006255817599594593, "learning_rate": 9.320350201323662e-07, "loss": 0.0002, "num_input_tokens_seen": 78569576, "step": 116555 }, { "epoch": 2.8475801920211077, "grad_norm": 0.03314283862709999, "learning_rate": 9.319499399668863e-07, "loss": 0.0002, "num_input_tokens_seen": 78572712, "step": 116560 }, { "epoch": 2.847702342852955, "grad_norm": 1.2695236206054688, "learning_rate": 9.318648602962852e-07, "loss": 0.0002, "num_input_tokens_seen": 78575784, "step": 116565 }, { "epoch": 2.847824493684802, "grad_norm": 0.06694022566080093, "learning_rate": 9.317797811211807e-07, "loss": 0.0001, "num_input_tokens_seen": 78579304, "step": 116570 }, { "epoch": 2.8479466445166493, "grad_norm": 38.54062271118164, "learning_rate": 9.316947024421923e-07, "loss": 0.122, "num_input_tokens_seen": 78582440, "step": 116575 }, { "epoch": 2.8480687953484964, "grad_norm": 0.06952695548534393, "learning_rate": 9.316096242599382e-07, "loss": 0.0641, "num_input_tokens_seen": 78585960, "step": 116580 }, { "epoch": 2.8481909461803436, "grad_norm": 0.0023296386934816837, "learning_rate": 9.315245465750376e-07, "loss": 0.0, "num_input_tokens_seen": 78589672, "step": 116585 }, { "epoch": 2.8483130970121904, "grad_norm": 0.4964067041873932, "learning_rate": 9.314394693881086e-07, "loss": 0.0003, "num_input_tokens_seen": 78593256, "step": 116590 }, { "epoch": 2.848435247844038, "grad_norm": 0.01752789504826069, "learning_rate": 9.313543926997703e-07, "loss": 0.0001, "num_input_tokens_seen": 78596968, "step": 116595 }, { "epoch": 2.8485573986758848, "grad_norm": 0.008099086582660675, "learning_rate": 9.312693165106413e-07, "loss": 0.0002, "num_input_tokens_seen": 78600424, "step": 116600 }, { "epoch": 2.8486795495077324, "grad_norm": 0.004455342888832092, "learning_rate": 9.311842408213404e-07, "loss": 0.0411, "num_input_tokens_seen": 78603688, "step": 116605 }, { "epoch": 2.848801700339579, "grad_norm": 0.0616503469645977, "learning_rate": 9.310991656324865e-07, "loss": 0.0001, "num_input_tokens_seen": 78606568, "step": 116610 }, { "epoch": 2.8489238511714263, "grad_norm": 36.6910285949707, "learning_rate": 9.310140909446974e-07, "loss": 0.0405, "num_input_tokens_seen": 78609896, "step": 116615 }, { "epoch": 2.8490460020032735, "grad_norm": 0.0014246907085180283, "learning_rate": 9.309290167585929e-07, "loss": 0.0679, "num_input_tokens_seen": 78613416, "step": 116620 }, { "epoch": 2.8491681528351207, "grad_norm": 0.001358588458970189, "learning_rate": 9.308439430747908e-07, "loss": 0.0002, "num_input_tokens_seen": 78617320, "step": 116625 }, { "epoch": 2.849290303666968, "grad_norm": 0.003647299250587821, "learning_rate": 9.307588698939101e-07, "loss": 0.0717, "num_input_tokens_seen": 78620584, "step": 116630 }, { "epoch": 2.849412454498815, "grad_norm": 0.1672152727842331, "learning_rate": 9.306737972165699e-07, "loss": 0.0423, "num_input_tokens_seen": 78623976, "step": 116635 }, { "epoch": 2.8495346053306623, "grad_norm": 4.308013103582198e-06, "learning_rate": 9.30588725043388e-07, "loss": 0.0002, "num_input_tokens_seen": 78627304, "step": 116640 }, { "epoch": 2.8496567561625095, "grad_norm": 0.013248121365904808, "learning_rate": 9.305036533749842e-07, "loss": 0.0814, "num_input_tokens_seen": 78630184, "step": 116645 }, { "epoch": 2.8497789069943567, "grad_norm": 0.6838406920433044, "learning_rate": 9.304185822119759e-07, "loss": 0.0217, "num_input_tokens_seen": 78633832, "step": 116650 }, { "epoch": 2.849901057826204, "grad_norm": 0.005772408097982407, "learning_rate": 9.303335115549828e-07, "loss": 0.0672, "num_input_tokens_seen": 78637288, "step": 116655 }, { "epoch": 2.850023208658051, "grad_norm": 0.0022411204408854246, "learning_rate": 9.302484414046233e-07, "loss": 0.0001, "num_input_tokens_seen": 78641256, "step": 116660 }, { "epoch": 2.8501453594898982, "grad_norm": 50.66267395019531, "learning_rate": 9.301633717615152e-07, "loss": 0.1293, "num_input_tokens_seen": 78644840, "step": 116665 }, { "epoch": 2.8502675103217454, "grad_norm": 0.007837265729904175, "learning_rate": 9.300783026262785e-07, "loss": 0.0001, "num_input_tokens_seen": 78648104, "step": 116670 }, { "epoch": 2.8503896611535926, "grad_norm": 21.46866798400879, "learning_rate": 9.299932339995308e-07, "loss": 0.0458, "num_input_tokens_seen": 78651880, "step": 116675 }, { "epoch": 2.85051181198544, "grad_norm": 175.33380126953125, "learning_rate": 9.299081658818915e-07, "loss": 0.029, "num_input_tokens_seen": 78655208, "step": 116680 }, { "epoch": 2.8506339628172865, "grad_norm": 0.016408970579504967, "learning_rate": 9.298230982739784e-07, "loss": 0.1522, "num_input_tokens_seen": 78658216, "step": 116685 }, { "epoch": 2.850756113649134, "grad_norm": 0.013738897629082203, "learning_rate": 9.297380311764107e-07, "loss": 0.0002, "num_input_tokens_seen": 78661160, "step": 116690 }, { "epoch": 2.850878264480981, "grad_norm": 0.2336987853050232, "learning_rate": 9.296529645898073e-07, "loss": 0.0005, "num_input_tokens_seen": 78664232, "step": 116695 }, { "epoch": 2.8510004153128286, "grad_norm": 0.06240047514438629, "learning_rate": 9.295678985147863e-07, "loss": 0.0003, "num_input_tokens_seen": 78667496, "step": 116700 }, { "epoch": 2.8511225661446753, "grad_norm": 0.2012481391429901, "learning_rate": 9.294828329519664e-07, "loss": 0.0002, "num_input_tokens_seen": 78670632, "step": 116705 }, { "epoch": 2.8512447169765225, "grad_norm": 0.07818334549665451, "learning_rate": 9.293977679019663e-07, "loss": 0.0565, "num_input_tokens_seen": 78673768, "step": 116710 }, { "epoch": 2.8513668678083697, "grad_norm": 0.010745156556367874, "learning_rate": 9.293127033654045e-07, "loss": 0.0215, "num_input_tokens_seen": 78677352, "step": 116715 }, { "epoch": 2.851489018640217, "grad_norm": 0.21190738677978516, "learning_rate": 9.292276393429001e-07, "loss": 0.0767, "num_input_tokens_seen": 78680744, "step": 116720 }, { "epoch": 2.851611169472064, "grad_norm": 0.0015108853112906218, "learning_rate": 9.291425758350709e-07, "loss": 0.0001, "num_input_tokens_seen": 78684008, "step": 116725 }, { "epoch": 2.8517333203039112, "grad_norm": 0.1414852887392044, "learning_rate": 9.290575128425364e-07, "loss": 0.0004, "num_input_tokens_seen": 78687336, "step": 116730 }, { "epoch": 2.8518554711357584, "grad_norm": 0.18726703524589539, "learning_rate": 9.289724503659145e-07, "loss": 0.0002, "num_input_tokens_seen": 78690856, "step": 116735 }, { "epoch": 2.8519776219676056, "grad_norm": 0.2173684686422348, "learning_rate": 9.288873884058242e-07, "loss": 0.0536, "num_input_tokens_seen": 78694056, "step": 116740 }, { "epoch": 2.852099772799453, "grad_norm": 0.004171188920736313, "learning_rate": 9.288023269628836e-07, "loss": 0.1149, "num_input_tokens_seen": 78697512, "step": 116745 }, { "epoch": 2.8522219236313, "grad_norm": 44.7703971862793, "learning_rate": 9.287172660377119e-07, "loss": 0.0704, "num_input_tokens_seen": 78700712, "step": 116750 }, { "epoch": 2.852344074463147, "grad_norm": 0.31654784083366394, "learning_rate": 9.286322056309272e-07, "loss": 0.0002, "num_input_tokens_seen": 78704040, "step": 116755 }, { "epoch": 2.8524662252949944, "grad_norm": 0.007102549076080322, "learning_rate": 9.285471457431486e-07, "loss": 0.0325, "num_input_tokens_seen": 78707688, "step": 116760 }, { "epoch": 2.8525883761268416, "grad_norm": 36.19954299926758, "learning_rate": 9.284620863749945e-07, "loss": 0.0999, "num_input_tokens_seen": 78711208, "step": 116765 }, { "epoch": 2.8527105269586883, "grad_norm": 0.3918383717536926, "learning_rate": 9.283770275270828e-07, "loss": 0.1455, "num_input_tokens_seen": 78714472, "step": 116770 }, { "epoch": 2.852832677790536, "grad_norm": 0.020883552730083466, "learning_rate": 9.282919692000331e-07, "loss": 0.0001, "num_input_tokens_seen": 78717928, "step": 116775 }, { "epoch": 2.8529548286223827, "grad_norm": 0.19970282912254333, "learning_rate": 9.282069113944631e-07, "loss": 0.0002, "num_input_tokens_seen": 78721256, "step": 116780 }, { "epoch": 2.8530769794542303, "grad_norm": 0.02931102365255356, "learning_rate": 9.281218541109917e-07, "loss": 0.0378, "num_input_tokens_seen": 78724904, "step": 116785 }, { "epoch": 2.853199130286077, "grad_norm": 0.0009442739537917078, "learning_rate": 9.28036797350238e-07, "loss": 0.0001, "num_input_tokens_seen": 78728616, "step": 116790 }, { "epoch": 2.8533212811179243, "grad_norm": 0.5051276683807373, "learning_rate": 9.279517411128196e-07, "loss": 0.0005, "num_input_tokens_seen": 78732072, "step": 116795 }, { "epoch": 2.8534434319497715, "grad_norm": 35.278289794921875, "learning_rate": 9.27866685399356e-07, "loss": 0.054, "num_input_tokens_seen": 78735208, "step": 116800 }, { "epoch": 2.8535655827816186, "grad_norm": 32.29579162597656, "learning_rate": 9.277816302104647e-07, "loss": 0.0594, "num_input_tokens_seen": 78737960, "step": 116805 }, { "epoch": 2.853687733613466, "grad_norm": 0.0075948829762637615, "learning_rate": 9.276965755467652e-07, "loss": 0.0001, "num_input_tokens_seen": 78741224, "step": 116810 }, { "epoch": 2.853809884445313, "grad_norm": 0.040782373398542404, "learning_rate": 9.276115214088758e-07, "loss": 0.0185, "num_input_tokens_seen": 78744360, "step": 116815 }, { "epoch": 2.85393203527716, "grad_norm": 0.07822287082672119, "learning_rate": 9.275264677974144e-07, "loss": 0.0538, "num_input_tokens_seen": 78748072, "step": 116820 }, { "epoch": 2.8540541861090074, "grad_norm": 0.00317594176158309, "learning_rate": 9.274414147130006e-07, "loss": 0.0005, "num_input_tokens_seen": 78751464, "step": 116825 }, { "epoch": 2.8541763369408546, "grad_norm": 41.26247787475586, "learning_rate": 9.273563621562516e-07, "loss": 0.0495, "num_input_tokens_seen": 78755112, "step": 116830 }, { "epoch": 2.854298487772702, "grad_norm": 37.74317932128906, "learning_rate": 9.272713101277873e-07, "loss": 0.0566, "num_input_tokens_seen": 78758632, "step": 116835 }, { "epoch": 2.854420638604549, "grad_norm": 0.15638141334056854, "learning_rate": 9.271862586282252e-07, "loss": 0.0003, "num_input_tokens_seen": 78761896, "step": 116840 }, { "epoch": 2.854542789436396, "grad_norm": 0.17552080750465393, "learning_rate": 9.271012076581842e-07, "loss": 0.0005, "num_input_tokens_seen": 78765032, "step": 116845 }, { "epoch": 2.8546649402682434, "grad_norm": 0.009987038560211658, "learning_rate": 9.270161572182833e-07, "loss": 0.0002, "num_input_tokens_seen": 78768168, "step": 116850 }, { "epoch": 2.8547870911000905, "grad_norm": 0.06905113905668259, "learning_rate": 9.269311073091403e-07, "loss": 0.0751, "num_input_tokens_seen": 78771432, "step": 116855 }, { "epoch": 2.8549092419319377, "grad_norm": 0.03222937509417534, "learning_rate": 9.268460579313738e-07, "loss": 0.0001, "num_input_tokens_seen": 78774888, "step": 116860 }, { "epoch": 2.8550313927637845, "grad_norm": 0.17098113894462585, "learning_rate": 9.267610090856025e-07, "loss": 0.0604, "num_input_tokens_seen": 78778024, "step": 116865 }, { "epoch": 2.855153543595632, "grad_norm": 12.133179664611816, "learning_rate": 9.266759607724451e-07, "loss": 0.1097, "num_input_tokens_seen": 78781160, "step": 116870 }, { "epoch": 2.855275694427479, "grad_norm": 0.021802183240652084, "learning_rate": 9.265909129925194e-07, "loss": 0.0001, "num_input_tokens_seen": 78784552, "step": 116875 }, { "epoch": 2.855397845259326, "grad_norm": 0.005931623745709658, "learning_rate": 9.265058657464444e-07, "loss": 0.0936, "num_input_tokens_seen": 78787560, "step": 116880 }, { "epoch": 2.8555199960911732, "grad_norm": 51.705326080322266, "learning_rate": 9.264208190348388e-07, "loss": 0.1411, "num_input_tokens_seen": 78791080, "step": 116885 }, { "epoch": 2.8556421469230204, "grad_norm": 0.04667201265692711, "learning_rate": 9.263357728583204e-07, "loss": 0.0001, "num_input_tokens_seen": 78794728, "step": 116890 }, { "epoch": 2.8557642977548676, "grad_norm": 0.01778298430144787, "learning_rate": 9.262507272175087e-07, "loss": 0.0003, "num_input_tokens_seen": 78798376, "step": 116895 }, { "epoch": 2.855886448586715, "grad_norm": 21.58896827697754, "learning_rate": 9.261656821130208e-07, "loss": 0.0517, "num_input_tokens_seen": 78801576, "step": 116900 }, { "epoch": 2.856008599418562, "grad_norm": 0.033148009330034256, "learning_rate": 9.260806375454764e-07, "loss": 0.0004, "num_input_tokens_seen": 78805160, "step": 116905 }, { "epoch": 2.856130750250409, "grad_norm": 0.6077508926391602, "learning_rate": 9.259955935154932e-07, "loss": 0.0003, "num_input_tokens_seen": 78808552, "step": 116910 }, { "epoch": 2.8562529010822564, "grad_norm": 0.025105271488428116, "learning_rate": 9.259105500236902e-07, "loss": 0.0674, "num_input_tokens_seen": 78811816, "step": 116915 }, { "epoch": 2.8563750519141036, "grad_norm": 0.041017137467861176, "learning_rate": 9.258255070706857e-07, "loss": 0.0481, "num_input_tokens_seen": 78814952, "step": 116920 }, { "epoch": 2.8564972027459508, "grad_norm": 0.029608365148305893, "learning_rate": 9.257404646570978e-07, "loss": 0.0651, "num_input_tokens_seen": 78818152, "step": 116925 }, { "epoch": 2.856619353577798, "grad_norm": 20.657743453979492, "learning_rate": 9.256554227835455e-07, "loss": 0.0628, "num_input_tokens_seen": 78821288, "step": 116930 }, { "epoch": 2.856741504409645, "grad_norm": 0.018212083727121353, "learning_rate": 9.255703814506466e-07, "loss": 0.0003, "num_input_tokens_seen": 78824552, "step": 116935 }, { "epoch": 2.8568636552414923, "grad_norm": 0.008494891226291656, "learning_rate": 9.254853406590197e-07, "loss": 0.0002, "num_input_tokens_seen": 78827816, "step": 116940 }, { "epoch": 2.8569858060733395, "grad_norm": 39.98295974731445, "learning_rate": 9.254003004092841e-07, "loss": 0.0845, "num_input_tokens_seen": 78830952, "step": 116945 }, { "epoch": 2.8571079569051863, "grad_norm": 0.2915056049823761, "learning_rate": 9.253152607020572e-07, "loss": 0.0004, "num_input_tokens_seen": 78833832, "step": 116950 }, { "epoch": 2.857230107737034, "grad_norm": 0.01098598912358284, "learning_rate": 9.25230221537958e-07, "loss": 0.0001, "num_input_tokens_seen": 78836968, "step": 116955 }, { "epoch": 2.8573522585688806, "grad_norm": 0.01814945414662361, "learning_rate": 9.251451829176045e-07, "loss": 0.0715, "num_input_tokens_seen": 78840168, "step": 116960 }, { "epoch": 2.8574744094007283, "grad_norm": 0.19818727672100067, "learning_rate": 9.250601448416155e-07, "loss": 0.0757, "num_input_tokens_seen": 78843560, "step": 116965 }, { "epoch": 2.857596560232575, "grad_norm": 0.0399983674287796, "learning_rate": 9.249751073106095e-07, "loss": 0.0006, "num_input_tokens_seen": 78847080, "step": 116970 }, { "epoch": 2.857718711064422, "grad_norm": 0.17228193581104279, "learning_rate": 9.248900703252042e-07, "loss": 0.0008, "num_input_tokens_seen": 78850472, "step": 116975 }, { "epoch": 2.8578408618962694, "grad_norm": 0.0858980044722557, "learning_rate": 9.248050338860192e-07, "loss": 0.0529, "num_input_tokens_seen": 78853992, "step": 116980 }, { "epoch": 2.8579630127281166, "grad_norm": 33.91243362426758, "learning_rate": 9.247199979936715e-07, "loss": 0.1234, "num_input_tokens_seen": 78857384, "step": 116985 }, { "epoch": 2.858085163559964, "grad_norm": 0.14944037795066833, "learning_rate": 9.246349626487809e-07, "loss": 0.0639, "num_input_tokens_seen": 78860456, "step": 116990 }, { "epoch": 2.858207314391811, "grad_norm": 0.2674129605293274, "learning_rate": 9.245499278519644e-07, "loss": 0.0381, "num_input_tokens_seen": 78863720, "step": 116995 }, { "epoch": 2.858329465223658, "grad_norm": 0.07492050528526306, "learning_rate": 9.244648936038412e-07, "loss": 0.0433, "num_input_tokens_seen": 78866792, "step": 117000 }, { "epoch": 2.8584516160555054, "grad_norm": 0.13231611251831055, "learning_rate": 9.243798599050302e-07, "loss": 0.0666, "num_input_tokens_seen": 78870120, "step": 117005 }, { "epoch": 2.8585737668873525, "grad_norm": 126.9564437866211, "learning_rate": 9.242948267561489e-07, "loss": 0.0207, "num_input_tokens_seen": 78873320, "step": 117010 }, { "epoch": 2.8586959177191997, "grad_norm": 168.95916748046875, "learning_rate": 9.242097941578159e-07, "loss": 0.031, "num_input_tokens_seen": 78876520, "step": 117015 }, { "epoch": 2.858818068551047, "grad_norm": 0.014719391241669655, "learning_rate": 9.241247621106498e-07, "loss": 0.0434, "num_input_tokens_seen": 78879720, "step": 117020 }, { "epoch": 2.858940219382894, "grad_norm": 0.04394163191318512, "learning_rate": 9.24039730615269e-07, "loss": 0.067, "num_input_tokens_seen": 78882920, "step": 117025 }, { "epoch": 2.8590623702147413, "grad_norm": 326.14288330078125, "learning_rate": 9.239546996722914e-07, "loss": 0.0288, "num_input_tokens_seen": 78886248, "step": 117030 }, { "epoch": 2.8591845210465885, "grad_norm": 0.01146597508341074, "learning_rate": 9.238696692823355e-07, "loss": 0.064, "num_input_tokens_seen": 78889448, "step": 117035 }, { "epoch": 2.8593066718784357, "grad_norm": 41.29705810546875, "learning_rate": 9.237846394460203e-07, "loss": 0.0519, "num_input_tokens_seen": 78892456, "step": 117040 }, { "epoch": 2.8594288227102824, "grad_norm": 0.007990594953298569, "learning_rate": 9.236996101639632e-07, "loss": 0.0001, "num_input_tokens_seen": 78895976, "step": 117045 }, { "epoch": 2.85955097354213, "grad_norm": 0.8427734375, "learning_rate": 9.236145814367836e-07, "loss": 0.0754, "num_input_tokens_seen": 78899368, "step": 117050 }, { "epoch": 2.859673124373977, "grad_norm": 0.02752232365310192, "learning_rate": 9.23529553265099e-07, "loss": 0.0563, "num_input_tokens_seen": 78902440, "step": 117055 }, { "epoch": 2.859795275205824, "grad_norm": 0.1377357393503189, "learning_rate": 9.23444525649528e-07, "loss": 0.0582, "num_input_tokens_seen": 78905640, "step": 117060 }, { "epoch": 2.859917426037671, "grad_norm": 0.5623184442520142, "learning_rate": 9.233594985906892e-07, "loss": 0.0004, "num_input_tokens_seen": 78909224, "step": 117065 }, { "epoch": 2.8600395768695184, "grad_norm": 0.017609449103474617, "learning_rate": 9.232744720892006e-07, "loss": 0.0002, "num_input_tokens_seen": 78912360, "step": 117070 }, { "epoch": 2.8601617277013656, "grad_norm": 73.11463165283203, "learning_rate": 9.23189446145681e-07, "loss": 0.0495, "num_input_tokens_seen": 78915688, "step": 117075 }, { "epoch": 2.8602838785332128, "grad_norm": 0.369545578956604, "learning_rate": 9.231044207607479e-07, "loss": 0.0003, "num_input_tokens_seen": 78918888, "step": 117080 }, { "epoch": 2.86040602936506, "grad_norm": 0.04580839350819588, "learning_rate": 9.230193959350207e-07, "loss": 0.0357, "num_input_tokens_seen": 78922408, "step": 117085 }, { "epoch": 2.860528180196907, "grad_norm": 0.0036676761228591204, "learning_rate": 9.229343716691166e-07, "loss": 0.1141, "num_input_tokens_seen": 78926056, "step": 117090 }, { "epoch": 2.8606503310287543, "grad_norm": 0.13639883697032928, "learning_rate": 9.228493479636545e-07, "loss": 0.0002, "num_input_tokens_seen": 78929192, "step": 117095 }, { "epoch": 2.8607724818606015, "grad_norm": 0.010075625032186508, "learning_rate": 9.227643248192532e-07, "loss": 0.0, "num_input_tokens_seen": 78932648, "step": 117100 }, { "epoch": 2.8608946326924487, "grad_norm": 0.841788113117218, "learning_rate": 9.226793022365299e-07, "loss": 0.0474, "num_input_tokens_seen": 78936104, "step": 117105 }, { "epoch": 2.861016783524296, "grad_norm": 23.549686431884766, "learning_rate": 9.225942802161041e-07, "loss": 0.0352, "num_input_tokens_seen": 78939176, "step": 117110 }, { "epoch": 2.861138934356143, "grad_norm": 0.00849588867276907, "learning_rate": 9.225092587585929e-07, "loss": 0.0001, "num_input_tokens_seen": 78942248, "step": 117115 }, { "epoch": 2.8612610851879903, "grad_norm": 0.36857789754867554, "learning_rate": 9.224242378646156e-07, "loss": 0.0004, "num_input_tokens_seen": 78945384, "step": 117120 }, { "epoch": 2.8613832360198375, "grad_norm": 0.483141154050827, "learning_rate": 9.223392175347903e-07, "loss": 0.0003, "num_input_tokens_seen": 78949224, "step": 117125 }, { "epoch": 2.861505386851684, "grad_norm": 0.04797397181391716, "learning_rate": 9.222541977697346e-07, "loss": 0.0003, "num_input_tokens_seen": 78952424, "step": 117130 }, { "epoch": 2.861627537683532, "grad_norm": 19.711732864379883, "learning_rate": 9.221691785700679e-07, "loss": 0.062, "num_input_tokens_seen": 78955816, "step": 117135 }, { "epoch": 2.8617496885153786, "grad_norm": 2.041687488555908, "learning_rate": 9.220841599364073e-07, "loss": 0.0006, "num_input_tokens_seen": 78959144, "step": 117140 }, { "epoch": 2.861871839347226, "grad_norm": 0.006740696262568235, "learning_rate": 9.219991418693721e-07, "loss": 0.0002, "num_input_tokens_seen": 78962344, "step": 117145 }, { "epoch": 2.861993990179073, "grad_norm": 0.08522861450910568, "learning_rate": 9.219141243695796e-07, "loss": 0.0002, "num_input_tokens_seen": 78965608, "step": 117150 }, { "epoch": 2.86211614101092, "grad_norm": 0.02093188278377056, "learning_rate": 9.218291074376487e-07, "loss": 0.0878, "num_input_tokens_seen": 78968680, "step": 117155 }, { "epoch": 2.8622382918427673, "grad_norm": 0.0043796938844025135, "learning_rate": 9.217440910741979e-07, "loss": 0.0415, "num_input_tokens_seen": 78972136, "step": 117160 }, { "epoch": 2.8623604426746145, "grad_norm": 0.1477183699607849, "learning_rate": 9.21659075279845e-07, "loss": 0.076, "num_input_tokens_seen": 78975144, "step": 117165 }, { "epoch": 2.8624825935064617, "grad_norm": 0.3976381719112396, "learning_rate": 9.215740600552084e-07, "loss": 0.0004, "num_input_tokens_seen": 78978472, "step": 117170 }, { "epoch": 2.862604744338309, "grad_norm": 0.030067600309848785, "learning_rate": 9.214890454009062e-07, "loss": 0.0381, "num_input_tokens_seen": 78981928, "step": 117175 }, { "epoch": 2.862726895170156, "grad_norm": 0.4479190707206726, "learning_rate": 9.214040313175571e-07, "loss": 0.0068, "num_input_tokens_seen": 78985256, "step": 117180 }, { "epoch": 2.8628490460020033, "grad_norm": 26.292327880859375, "learning_rate": 9.213190178057784e-07, "loss": 0.0431, "num_input_tokens_seen": 78988520, "step": 117185 }, { "epoch": 2.8629711968338505, "grad_norm": 0.019016873091459274, "learning_rate": 9.212340048661892e-07, "loss": 0.0001, "num_input_tokens_seen": 78991976, "step": 117190 }, { "epoch": 2.8630933476656977, "grad_norm": 0.09147219359874725, "learning_rate": 9.211489924994078e-07, "loss": 0.0006, "num_input_tokens_seen": 78994984, "step": 117195 }, { "epoch": 2.863215498497545, "grad_norm": 0.020165953785181046, "learning_rate": 9.210639807060518e-07, "loss": 0.0001, "num_input_tokens_seen": 78998568, "step": 117200 }, { "epoch": 2.863337649329392, "grad_norm": 65.74693298339844, "learning_rate": 9.209789694867401e-07, "loss": 0.0717, "num_input_tokens_seen": 79002344, "step": 117205 }, { "epoch": 2.8634598001612392, "grad_norm": 1.2636717557907104, "learning_rate": 9.208939588420902e-07, "loss": 0.0002, "num_input_tokens_seen": 79005864, "step": 117210 }, { "epoch": 2.863581950993086, "grad_norm": 0.0033105944748967886, "learning_rate": 9.208089487727208e-07, "loss": 0.1023, "num_input_tokens_seen": 79009064, "step": 117215 }, { "epoch": 2.8637041018249336, "grad_norm": 0.016109691932797432, "learning_rate": 9.207239392792503e-07, "loss": 0.0004, "num_input_tokens_seen": 79012200, "step": 117220 }, { "epoch": 2.8638262526567804, "grad_norm": 0.7885407209396362, "learning_rate": 9.206389303622964e-07, "loss": 0.0346, "num_input_tokens_seen": 79015464, "step": 117225 }, { "epoch": 2.863948403488628, "grad_norm": 0.01864694058895111, "learning_rate": 9.205539220224779e-07, "loss": 0.0002, "num_input_tokens_seen": 79018920, "step": 117230 }, { "epoch": 2.8640705543204747, "grad_norm": 1.6205692291259766, "learning_rate": 9.20468914260412e-07, "loss": 0.0442, "num_input_tokens_seen": 79022504, "step": 117235 }, { "epoch": 2.864192705152322, "grad_norm": 0.27415525913238525, "learning_rate": 9.203839070767182e-07, "loss": 0.0002, "num_input_tokens_seen": 79025704, "step": 117240 }, { "epoch": 2.864314855984169, "grad_norm": 370.6185607910156, "learning_rate": 9.202989004720136e-07, "loss": 0.0865, "num_input_tokens_seen": 79028712, "step": 117245 }, { "epoch": 2.8644370068160163, "grad_norm": 0.01124818529933691, "learning_rate": 9.202138944469168e-07, "loss": 0.0, "num_input_tokens_seen": 79031656, "step": 117250 }, { "epoch": 2.8645591576478635, "grad_norm": 0.0069727166555821896, "learning_rate": 9.201288890020464e-07, "loss": 0.0002, "num_input_tokens_seen": 79035944, "step": 117255 }, { "epoch": 2.8646813084797107, "grad_norm": 0.016167595982551575, "learning_rate": 9.200438841380198e-07, "loss": 0.0001, "num_input_tokens_seen": 79039080, "step": 117260 }, { "epoch": 2.864803459311558, "grad_norm": 0.022563988342881203, "learning_rate": 9.199588798554559e-07, "loss": 0.0225, "num_input_tokens_seen": 79043112, "step": 117265 }, { "epoch": 2.864925610143405, "grad_norm": 0.009637587703764439, "learning_rate": 9.198738761549724e-07, "loss": 0.0026, "num_input_tokens_seen": 79046568, "step": 117270 }, { "epoch": 2.8650477609752523, "grad_norm": 0.02311187982559204, "learning_rate": 9.197888730371875e-07, "loss": 0.0501, "num_input_tokens_seen": 79050152, "step": 117275 }, { "epoch": 2.8651699118070995, "grad_norm": 0.1332298219203949, "learning_rate": 9.1970387050272e-07, "loss": 0.0002, "num_input_tokens_seen": 79053800, "step": 117280 }, { "epoch": 2.8652920626389466, "grad_norm": 0.31413617730140686, "learning_rate": 9.196188685521869e-07, "loss": 0.0236, "num_input_tokens_seen": 79057128, "step": 117285 }, { "epoch": 2.865414213470794, "grad_norm": 0.05050954595208168, "learning_rate": 9.195338671862077e-07, "loss": 0.1096, "num_input_tokens_seen": 79060776, "step": 117290 }, { "epoch": 2.865536364302641, "grad_norm": 0.01581372506916523, "learning_rate": 9.194488664053992e-07, "loss": 0.0395, "num_input_tokens_seen": 79063848, "step": 117295 }, { "epoch": 2.865658515134488, "grad_norm": 0.056255050003528595, "learning_rate": 9.193638662103807e-07, "loss": 0.0001, "num_input_tokens_seen": 79066984, "step": 117300 }, { "epoch": 2.8657806659663354, "grad_norm": 0.10820584744215012, "learning_rate": 9.192788666017695e-07, "loss": 0.0002, "num_input_tokens_seen": 79070184, "step": 117305 }, { "epoch": 2.865902816798182, "grad_norm": 0.016701946035027504, "learning_rate": 9.191938675801839e-07, "loss": 0.0002, "num_input_tokens_seen": 79073512, "step": 117310 }, { "epoch": 2.86602496763003, "grad_norm": 0.007976418361067772, "learning_rate": 9.191088691462428e-07, "loss": 0.0003, "num_input_tokens_seen": 79077160, "step": 117315 }, { "epoch": 2.8661471184618765, "grad_norm": 0.008298970758914948, "learning_rate": 9.190238713005636e-07, "loss": 0.0005, "num_input_tokens_seen": 79080552, "step": 117320 }, { "epoch": 2.866269269293724, "grad_norm": 0.07891908288002014, "learning_rate": 9.189388740437645e-07, "loss": 0.0422, "num_input_tokens_seen": 79084328, "step": 117325 }, { "epoch": 2.866391420125571, "grad_norm": 0.06407475471496582, "learning_rate": 9.188538773764637e-07, "loss": 0.1515, "num_input_tokens_seen": 79088104, "step": 117330 }, { "epoch": 2.866513570957418, "grad_norm": 0.08465859293937683, "learning_rate": 9.187688812992796e-07, "loss": 0.0001, "num_input_tokens_seen": 79091432, "step": 117335 }, { "epoch": 2.8666357217892653, "grad_norm": 0.09898823499679565, "learning_rate": 9.186838858128295e-07, "loss": 0.0315, "num_input_tokens_seen": 79095272, "step": 117340 }, { "epoch": 2.8667578726211125, "grad_norm": 0.1045418381690979, "learning_rate": 9.185988909177321e-07, "loss": 0.0003, "num_input_tokens_seen": 79098408, "step": 117345 }, { "epoch": 2.8668800234529597, "grad_norm": 40.67689895629883, "learning_rate": 9.18513896614606e-07, "loss": 0.1138, "num_input_tokens_seen": 79101352, "step": 117350 }, { "epoch": 2.867002174284807, "grad_norm": 0.007143858820199966, "learning_rate": 9.184289029040683e-07, "loss": 0.0741, "num_input_tokens_seen": 79104424, "step": 117355 }, { "epoch": 2.867124325116654, "grad_norm": 0.09244782477617264, "learning_rate": 9.18343909786738e-07, "loss": 0.0324, "num_input_tokens_seen": 79107944, "step": 117360 }, { "epoch": 2.8672464759485012, "grad_norm": 0.13260585069656372, "learning_rate": 9.182589172632321e-07, "loss": 0.0407, "num_input_tokens_seen": 79111528, "step": 117365 }, { "epoch": 2.8673686267803484, "grad_norm": 0.026284320279955864, "learning_rate": 9.181739253341699e-07, "loss": 0.0002, "num_input_tokens_seen": 79114984, "step": 117370 }, { "epoch": 2.8674907776121956, "grad_norm": 1.0271484851837158, "learning_rate": 9.180889340001686e-07, "loss": 0.0382, "num_input_tokens_seen": 79118632, "step": 117375 }, { "epoch": 2.867612928444043, "grad_norm": 0.022432560101151466, "learning_rate": 9.180039432618467e-07, "loss": 0.0347, "num_input_tokens_seen": 79121320, "step": 117380 }, { "epoch": 2.86773507927589, "grad_norm": 0.019050292670726776, "learning_rate": 9.179189531198225e-07, "loss": 0.0292, "num_input_tokens_seen": 79124456, "step": 117385 }, { "epoch": 2.867857230107737, "grad_norm": 0.021058736369013786, "learning_rate": 9.178339635747132e-07, "loss": 0.1183, "num_input_tokens_seen": 79127720, "step": 117390 }, { "epoch": 2.867979380939584, "grad_norm": 0.026515983045101166, "learning_rate": 9.17748974627138e-07, "loss": 0.0002, "num_input_tokens_seen": 79131048, "step": 117395 }, { "epoch": 2.8681015317714316, "grad_norm": 0.041551969945430756, "learning_rate": 9.176639862777138e-07, "loss": 0.0332, "num_input_tokens_seen": 79134376, "step": 117400 }, { "epoch": 2.8682236826032783, "grad_norm": 0.0533473864197731, "learning_rate": 9.175789985270593e-07, "loss": 0.0001, "num_input_tokens_seen": 79138024, "step": 117405 }, { "epoch": 2.868345833435126, "grad_norm": 0.007953424006700516, "learning_rate": 9.174940113757929e-07, "loss": 0.0003, "num_input_tokens_seen": 79141736, "step": 117410 }, { "epoch": 2.8684679842669727, "grad_norm": 0.015473191626369953, "learning_rate": 9.174090248245318e-07, "loss": 0.0004, "num_input_tokens_seen": 79145192, "step": 117415 }, { "epoch": 2.86859013509882, "grad_norm": 0.059506792575120926, "learning_rate": 9.17324038873895e-07, "loss": 0.0003, "num_input_tokens_seen": 79148712, "step": 117420 }, { "epoch": 2.868712285930667, "grad_norm": 0.01240159384906292, "learning_rate": 9.172390535244996e-07, "loss": 0.0569, "num_input_tokens_seen": 79152424, "step": 117425 }, { "epoch": 2.8688344367625143, "grad_norm": 0.003421264234930277, "learning_rate": 9.171540687769641e-07, "loss": 0.0572, "num_input_tokens_seen": 79155688, "step": 117430 }, { "epoch": 2.8689565875943615, "grad_norm": 0.014913155697286129, "learning_rate": 9.170690846319069e-07, "loss": 0.0388, "num_input_tokens_seen": 79159080, "step": 117435 }, { "epoch": 2.8690787384262086, "grad_norm": 0.07318806648254395, "learning_rate": 9.16984101089945e-07, "loss": 0.0622, "num_input_tokens_seen": 79162280, "step": 117440 }, { "epoch": 2.869200889258056, "grad_norm": 0.10887158662080765, "learning_rate": 9.168991181516977e-07, "loss": 0.0002, "num_input_tokens_seen": 79165416, "step": 117445 }, { "epoch": 2.869323040089903, "grad_norm": 0.048394229263067245, "learning_rate": 9.168141358177819e-07, "loss": 0.0002, "num_input_tokens_seen": 79169000, "step": 117450 }, { "epoch": 2.86944519092175, "grad_norm": 1.3721727132797241, "learning_rate": 9.167291540888163e-07, "loss": 0.0004, "num_input_tokens_seen": 79172392, "step": 117455 }, { "epoch": 2.8695673417535974, "grad_norm": 0.49199050664901733, "learning_rate": 9.166441729654184e-07, "loss": 0.0382, "num_input_tokens_seen": 79175592, "step": 117460 }, { "epoch": 2.8696894925854446, "grad_norm": 0.014259264804422855, "learning_rate": 9.165591924482066e-07, "loss": 0.0005, "num_input_tokens_seen": 79178856, "step": 117465 }, { "epoch": 2.869811643417292, "grad_norm": 0.004566266667097807, "learning_rate": 9.164742125377991e-07, "loss": 0.0001, "num_input_tokens_seen": 79182184, "step": 117470 }, { "epoch": 2.869933794249139, "grad_norm": 0.147256538271904, "learning_rate": 9.163892332348133e-07, "loss": 0.0441, "num_input_tokens_seen": 79185256, "step": 117475 }, { "epoch": 2.870055945080986, "grad_norm": 0.05114344507455826, "learning_rate": 9.163042545398676e-07, "loss": 0.0002, "num_input_tokens_seen": 79188456, "step": 117480 }, { "epoch": 2.8701780959128333, "grad_norm": 30.305294036865234, "learning_rate": 9.162192764535798e-07, "loss": 0.0422, "num_input_tokens_seen": 79191592, "step": 117485 }, { "epoch": 2.87030024674468, "grad_norm": 0.003184929024428129, "learning_rate": 9.161342989765683e-07, "loss": 0.0001, "num_input_tokens_seen": 79195240, "step": 117490 }, { "epoch": 2.8704223975765277, "grad_norm": 0.006119478493928909, "learning_rate": 9.160493221094502e-07, "loss": 0.0, "num_input_tokens_seen": 79198504, "step": 117495 }, { "epoch": 2.8705445484083745, "grad_norm": 23.057008743286133, "learning_rate": 9.159643458528441e-07, "loss": 0.0476, "num_input_tokens_seen": 79202152, "step": 117500 }, { "epoch": 2.8706666992402217, "grad_norm": 0.008205844089388847, "learning_rate": 9.158793702073682e-07, "loss": 0.1373, "num_input_tokens_seen": 79206056, "step": 117505 }, { "epoch": 2.870788850072069, "grad_norm": 0.006589832715690136, "learning_rate": 9.157943951736397e-07, "loss": 0.0454, "num_input_tokens_seen": 79209512, "step": 117510 }, { "epoch": 2.870911000903916, "grad_norm": 0.15100379288196564, "learning_rate": 9.157094207522775e-07, "loss": 0.0001, "num_input_tokens_seen": 79212968, "step": 117515 }, { "epoch": 2.8710331517357632, "grad_norm": 0.0029677124693989754, "learning_rate": 9.156244469438987e-07, "loss": 0.0, "num_input_tokens_seen": 79216360, "step": 117520 }, { "epoch": 2.8711553025676104, "grad_norm": 0.0012235000031068921, "learning_rate": 9.155394737491218e-07, "loss": 0.1394, "num_input_tokens_seen": 79220008, "step": 117525 }, { "epoch": 2.8712774533994576, "grad_norm": 0.00305823958478868, "learning_rate": 9.154545011685645e-07, "loss": 0.0001, "num_input_tokens_seen": 79223080, "step": 117530 }, { "epoch": 2.871399604231305, "grad_norm": 0.002814261708408594, "learning_rate": 9.153695292028449e-07, "loss": 0.0538, "num_input_tokens_seen": 79226408, "step": 117535 }, { "epoch": 2.871521755063152, "grad_norm": 0.047743260860443115, "learning_rate": 9.15284557852581e-07, "loss": 0.0013, "num_input_tokens_seen": 79229480, "step": 117540 }, { "epoch": 2.871643905894999, "grad_norm": 0.0454338937997818, "learning_rate": 9.1519958711839e-07, "loss": 0.0001, "num_input_tokens_seen": 79233064, "step": 117545 }, { "epoch": 2.8717660567268464, "grad_norm": 0.022779377177357674, "learning_rate": 9.151146170008911e-07, "loss": 0.0001, "num_input_tokens_seen": 79236328, "step": 117550 }, { "epoch": 2.8718882075586936, "grad_norm": 0.0035186749882996082, "learning_rate": 9.150296475007009e-07, "loss": 0.0425, "num_input_tokens_seen": 79240040, "step": 117555 }, { "epoch": 2.8720103583905408, "grad_norm": 0.012661670334637165, "learning_rate": 9.14944678618438e-07, "loss": 0.0569, "num_input_tokens_seen": 79242920, "step": 117560 }, { "epoch": 2.872132509222388, "grad_norm": 0.06139669194817543, "learning_rate": 9.148597103547209e-07, "loss": 0.0001, "num_input_tokens_seen": 79246056, "step": 117565 }, { "epoch": 2.872254660054235, "grad_norm": 0.054819829761981964, "learning_rate": 9.147747427101663e-07, "loss": 0.0001, "num_input_tokens_seen": 79249256, "step": 117570 }, { "epoch": 2.872376810886082, "grad_norm": 21.28368377685547, "learning_rate": 9.146897756853931e-07, "loss": 0.0872, "num_input_tokens_seen": 79252456, "step": 117575 }, { "epoch": 2.8724989617179295, "grad_norm": 0.008261069655418396, "learning_rate": 9.146048092810184e-07, "loss": 0.0549, "num_input_tokens_seen": 79255720, "step": 117580 }, { "epoch": 2.8726211125497763, "grad_norm": 0.02081366255879402, "learning_rate": 9.145198434976609e-07, "loss": 0.0717, "num_input_tokens_seen": 79258856, "step": 117585 }, { "epoch": 2.872743263381624, "grad_norm": 28.763362884521484, "learning_rate": 9.144348783359379e-07, "loss": 0.1152, "num_input_tokens_seen": 79262184, "step": 117590 }, { "epoch": 2.8728654142134706, "grad_norm": 0.757977306842804, "learning_rate": 9.143499137964673e-07, "loss": 0.0368, "num_input_tokens_seen": 79265320, "step": 117595 }, { "epoch": 2.872987565045318, "grad_norm": 0.06304121017456055, "learning_rate": 9.142649498798675e-07, "loss": 0.0002, "num_input_tokens_seen": 79268456, "step": 117600 }, { "epoch": 2.873109715877165, "grad_norm": 0.00835332740098238, "learning_rate": 9.141799865867558e-07, "loss": 0.0458, "num_input_tokens_seen": 79271656, "step": 117605 }, { "epoch": 2.873231866709012, "grad_norm": 0.022202685475349426, "learning_rate": 9.140950239177505e-07, "loss": 0.0651, "num_input_tokens_seen": 79275304, "step": 117610 }, { "epoch": 2.8733540175408594, "grad_norm": 0.315326988697052, "learning_rate": 9.140100618734691e-07, "loss": 0.0012, "num_input_tokens_seen": 79278632, "step": 117615 }, { "epoch": 2.8734761683727066, "grad_norm": 0.1667528599500656, "learning_rate": 9.139251004545296e-07, "loss": 0.0002, "num_input_tokens_seen": 79282728, "step": 117620 }, { "epoch": 2.8735983192045538, "grad_norm": 0.04228285327553749, "learning_rate": 9.138401396615502e-07, "loss": 0.0611, "num_input_tokens_seen": 79285928, "step": 117625 }, { "epoch": 2.873720470036401, "grad_norm": 59.36479187011719, "learning_rate": 9.137551794951483e-07, "loss": 0.0664, "num_input_tokens_seen": 79289128, "step": 117630 }, { "epoch": 2.873842620868248, "grad_norm": 0.06058398261666298, "learning_rate": 9.136702199559421e-07, "loss": 0.0003, "num_input_tokens_seen": 79292520, "step": 117635 }, { "epoch": 2.8739647717000953, "grad_norm": 0.024536775425076485, "learning_rate": 9.135852610445491e-07, "loss": 0.0246, "num_input_tokens_seen": 79295656, "step": 117640 }, { "epoch": 2.8740869225319425, "grad_norm": 0.6847139000892639, "learning_rate": 9.135003027615876e-07, "loss": 0.0599, "num_input_tokens_seen": 79299112, "step": 117645 }, { "epoch": 2.8742090733637897, "grad_norm": 0.026592666283249855, "learning_rate": 9.134153451076748e-07, "loss": 0.0503, "num_input_tokens_seen": 79302504, "step": 117650 }, { "epoch": 2.874331224195637, "grad_norm": 0.01792846992611885, "learning_rate": 9.133303880834287e-07, "loss": 0.0781, "num_input_tokens_seen": 79305896, "step": 117655 }, { "epoch": 2.874453375027484, "grad_norm": 14.320727348327637, "learning_rate": 9.13245431689468e-07, "loss": 0.1096, "num_input_tokens_seen": 79309288, "step": 117660 }, { "epoch": 2.8745755258593313, "grad_norm": 0.00515905162319541, "learning_rate": 9.131604759264093e-07, "loss": 0.0001, "num_input_tokens_seen": 79312616, "step": 117665 }, { "epoch": 2.874697676691178, "grad_norm": 50.880348205566406, "learning_rate": 9.130755207948715e-07, "loss": 0.027, "num_input_tokens_seen": 79315816, "step": 117670 }, { "epoch": 2.8748198275230257, "grad_norm": 0.03606473654508591, "learning_rate": 9.129905662954713e-07, "loss": 0.0957, "num_input_tokens_seen": 79319144, "step": 117675 }, { "epoch": 2.8749419783548724, "grad_norm": 27.191478729248047, "learning_rate": 9.129056124288275e-07, "loss": 0.062, "num_input_tokens_seen": 79322600, "step": 117680 }, { "epoch": 2.8750641291867196, "grad_norm": 0.27269473671913147, "learning_rate": 9.128206591955574e-07, "loss": 0.0008, "num_input_tokens_seen": 79326248, "step": 117685 }, { "epoch": 2.875186280018567, "grad_norm": 0.03510739281773567, "learning_rate": 9.12735706596279e-07, "loss": 0.0772, "num_input_tokens_seen": 79329576, "step": 117690 }, { "epoch": 2.875308430850414, "grad_norm": 0.006244110409170389, "learning_rate": 9.126507546316102e-07, "loss": 0.0556, "num_input_tokens_seen": 79332840, "step": 117695 }, { "epoch": 2.875430581682261, "grad_norm": 0.2085716277360916, "learning_rate": 9.125658033021682e-07, "loss": 0.0282, "num_input_tokens_seen": 79335976, "step": 117700 }, { "epoch": 2.8755527325141084, "grad_norm": 51.214942932128906, "learning_rate": 9.124808526085714e-07, "loss": 0.146, "num_input_tokens_seen": 79341288, "step": 117705 }, { "epoch": 2.8756748833459556, "grad_norm": 1.7222996950149536, "learning_rate": 9.123959025514372e-07, "loss": 0.0616, "num_input_tokens_seen": 79344424, "step": 117710 }, { "epoch": 2.8757970341778027, "grad_norm": 0.03357335925102234, "learning_rate": 9.123109531313836e-07, "loss": 0.0348, "num_input_tokens_seen": 79347816, "step": 117715 }, { "epoch": 2.87591918500965, "grad_norm": 0.01639949530363083, "learning_rate": 9.122260043490285e-07, "loss": 0.026, "num_input_tokens_seen": 79351528, "step": 117720 }, { "epoch": 2.876041335841497, "grad_norm": 0.012518463656306267, "learning_rate": 9.121410562049893e-07, "loss": 0.0003, "num_input_tokens_seen": 79354984, "step": 117725 }, { "epoch": 2.8761634866733443, "grad_norm": 0.1400122493505478, "learning_rate": 9.120561086998842e-07, "loss": 0.0001, "num_input_tokens_seen": 79358312, "step": 117730 }, { "epoch": 2.8762856375051915, "grad_norm": 0.03491630777716637, "learning_rate": 9.119711618343305e-07, "loss": 0.0341, "num_input_tokens_seen": 79361448, "step": 117735 }, { "epoch": 2.8764077883370387, "grad_norm": 0.016988487914204597, "learning_rate": 9.118862156089465e-07, "loss": 0.0311, "num_input_tokens_seen": 79364712, "step": 117740 }, { "epoch": 2.876529939168886, "grad_norm": 0.16878642141819, "learning_rate": 9.118012700243495e-07, "loss": 0.0303, "num_input_tokens_seen": 79368424, "step": 117745 }, { "epoch": 2.876652090000733, "grad_norm": 0.16372260451316833, "learning_rate": 9.117163250811571e-07, "loss": 0.0004, "num_input_tokens_seen": 79371880, "step": 117750 }, { "epoch": 2.87677424083258, "grad_norm": 0.026938119903206825, "learning_rate": 9.116313807799878e-07, "loss": 0.0165, "num_input_tokens_seen": 79375144, "step": 117755 }, { "epoch": 2.8768963916644275, "grad_norm": 36.589786529541016, "learning_rate": 9.115464371214585e-07, "loss": 0.0751, "num_input_tokens_seen": 79378472, "step": 117760 }, { "epoch": 2.877018542496274, "grad_norm": 0.034229863435029984, "learning_rate": 9.114614941061877e-07, "loss": 0.1053, "num_input_tokens_seen": 79382184, "step": 117765 }, { "epoch": 2.877140693328122, "grad_norm": 0.3387535512447357, "learning_rate": 9.113765517347922e-07, "loss": 0.0002, "num_input_tokens_seen": 79385896, "step": 117770 }, { "epoch": 2.8772628441599686, "grad_norm": 0.028109047561883926, "learning_rate": 9.112916100078903e-07, "loss": 0.0003, "num_input_tokens_seen": 79389480, "step": 117775 }, { "epoch": 2.8773849949918158, "grad_norm": 0.019770437851548195, "learning_rate": 9.112066689261001e-07, "loss": 0.141, "num_input_tokens_seen": 79392808, "step": 117780 }, { "epoch": 2.877507145823663, "grad_norm": 0.12967844307422638, "learning_rate": 9.111217284900387e-07, "loss": 0.051, "num_input_tokens_seen": 79396328, "step": 117785 }, { "epoch": 2.87762929665551, "grad_norm": 0.14182066917419434, "learning_rate": 9.110367887003241e-07, "loss": 0.0002, "num_input_tokens_seen": 79399848, "step": 117790 }, { "epoch": 2.8777514474873573, "grad_norm": 0.10105834156274796, "learning_rate": 9.109518495575736e-07, "loss": 0.0005, "num_input_tokens_seen": 79403304, "step": 117795 }, { "epoch": 2.8778735983192045, "grad_norm": 0.44391387701034546, "learning_rate": 9.108669110624055e-07, "loss": 0.0006, "num_input_tokens_seen": 79406312, "step": 117800 }, { "epoch": 2.8779957491510517, "grad_norm": 0.04133082181215286, "learning_rate": 9.107819732154371e-07, "loss": 0.0482, "num_input_tokens_seen": 79409576, "step": 117805 }, { "epoch": 2.878117899982899, "grad_norm": 0.005388464778661728, "learning_rate": 9.10697036017286e-07, "loss": 0.0002, "num_input_tokens_seen": 79413032, "step": 117810 }, { "epoch": 2.878240050814746, "grad_norm": 0.03301483392715454, "learning_rate": 9.106120994685704e-07, "loss": 0.0005, "num_input_tokens_seen": 79416360, "step": 117815 }, { "epoch": 2.8783622016465933, "grad_norm": 0.08911029249429703, "learning_rate": 9.105271635699072e-07, "loss": 0.0006, "num_input_tokens_seen": 79419304, "step": 117820 }, { "epoch": 2.8784843524784405, "grad_norm": 0.12972067296504974, "learning_rate": 9.104422283219151e-07, "loss": 0.0025, "num_input_tokens_seen": 79422824, "step": 117825 }, { "epoch": 2.8786065033102877, "grad_norm": 57.04539108276367, "learning_rate": 9.103572937252107e-07, "loss": 0.0386, "num_input_tokens_seen": 79426152, "step": 117830 }, { "epoch": 2.878728654142135, "grad_norm": 0.23031321167945862, "learning_rate": 9.102723597804125e-07, "loss": 0.0001, "num_input_tokens_seen": 79429544, "step": 117835 }, { "epoch": 2.8788508049739816, "grad_norm": 0.05489039421081543, "learning_rate": 9.101874264881377e-07, "loss": 0.0002, "num_input_tokens_seen": 79432872, "step": 117840 }, { "epoch": 2.8789729558058292, "grad_norm": 0.05241875350475311, "learning_rate": 9.101024938490041e-07, "loss": 0.04, "num_input_tokens_seen": 79436264, "step": 117845 }, { "epoch": 2.879095106637676, "grad_norm": 0.030297545716166496, "learning_rate": 9.100175618636296e-07, "loss": 0.0291, "num_input_tokens_seen": 79440360, "step": 117850 }, { "epoch": 2.8792172574695236, "grad_norm": 0.07322395592927933, "learning_rate": 9.099326305326311e-07, "loss": 0.0001, "num_input_tokens_seen": 79443880, "step": 117855 }, { "epoch": 2.8793394083013704, "grad_norm": 0.1015520840883255, "learning_rate": 9.098476998566274e-07, "loss": 0.0001, "num_input_tokens_seen": 79447080, "step": 117860 }, { "epoch": 2.8794615591332176, "grad_norm": 358.073974609375, "learning_rate": 9.097627698362348e-07, "loss": 0.0258, "num_input_tokens_seen": 79450600, "step": 117865 }, { "epoch": 2.8795837099650647, "grad_norm": 0.09080827236175537, "learning_rate": 9.096778404720716e-07, "loss": 0.0003, "num_input_tokens_seen": 79453864, "step": 117870 }, { "epoch": 2.879705860796912, "grad_norm": 0.06684160232543945, "learning_rate": 9.095929117647559e-07, "loss": 0.0002, "num_input_tokens_seen": 79457768, "step": 117875 }, { "epoch": 2.879828011628759, "grad_norm": 1036.3775634765625, "learning_rate": 9.095079837149046e-07, "loss": 0.1143, "num_input_tokens_seen": 79461352, "step": 117880 }, { "epoch": 2.8799501624606063, "grad_norm": 0.011043991893529892, "learning_rate": 9.094230563231359e-07, "loss": 0.0, "num_input_tokens_seen": 79464744, "step": 117885 }, { "epoch": 2.8800723132924535, "grad_norm": 54.542789459228516, "learning_rate": 9.093381295900666e-07, "loss": 0.0752, "num_input_tokens_seen": 79468072, "step": 117890 }, { "epoch": 2.8801944641243007, "grad_norm": 0.03255674988031387, "learning_rate": 9.092532035163153e-07, "loss": 0.0001, "num_input_tokens_seen": 79471976, "step": 117895 }, { "epoch": 2.880316614956148, "grad_norm": 0.1379069983959198, "learning_rate": 9.091682781024989e-07, "loss": 0.061, "num_input_tokens_seen": 79475176, "step": 117900 }, { "epoch": 2.880438765787995, "grad_norm": 0.8194472193717957, "learning_rate": 9.090833533492349e-07, "loss": 0.0468, "num_input_tokens_seen": 79478888, "step": 117905 }, { "epoch": 2.8805609166198423, "grad_norm": 0.4301382005214691, "learning_rate": 9.089984292571418e-07, "loss": 0.0002, "num_input_tokens_seen": 79482792, "step": 117910 }, { "epoch": 2.8806830674516894, "grad_norm": 0.04192114621400833, "learning_rate": 9.08913505826836e-07, "loss": 0.1394, "num_input_tokens_seen": 79486120, "step": 117915 }, { "epoch": 2.8808052182835366, "grad_norm": 0.9268858432769775, "learning_rate": 9.088285830589362e-07, "loss": 0.031, "num_input_tokens_seen": 79489064, "step": 117920 }, { "epoch": 2.880927369115384, "grad_norm": 0.06339632719755173, "learning_rate": 9.087436609540591e-07, "loss": 0.1359, "num_input_tokens_seen": 79492456, "step": 117925 }, { "epoch": 2.881049519947231, "grad_norm": 0.1005081757903099, "learning_rate": 9.086587395128226e-07, "loss": 0.0421, "num_input_tokens_seen": 79495848, "step": 117930 }, { "epoch": 2.8811716707790778, "grad_norm": 0.023637497797608376, "learning_rate": 9.085738187358447e-07, "loss": 0.0386, "num_input_tokens_seen": 79498856, "step": 117935 }, { "epoch": 2.8812938216109254, "grad_norm": 0.43018922209739685, "learning_rate": 9.084888986237425e-07, "loss": 0.0005, "num_input_tokens_seen": 79502184, "step": 117940 }, { "epoch": 2.881415972442772, "grad_norm": 0.006060573272407055, "learning_rate": 9.084039791771334e-07, "loss": 0.163, "num_input_tokens_seen": 79505128, "step": 117945 }, { "epoch": 2.8815381232746193, "grad_norm": 0.023106878623366356, "learning_rate": 9.083190603966354e-07, "loss": 0.0004, "num_input_tokens_seen": 79508520, "step": 117950 }, { "epoch": 2.8816602741064665, "grad_norm": 0.2827344834804535, "learning_rate": 9.082341422828657e-07, "loss": 0.0002, "num_input_tokens_seen": 79511656, "step": 117955 }, { "epoch": 2.8817824249383137, "grad_norm": 0.024535616859793663, "learning_rate": 9.081492248364422e-07, "loss": 0.0002, "num_input_tokens_seen": 79515816, "step": 117960 }, { "epoch": 2.881904575770161, "grad_norm": 0.018632639199495316, "learning_rate": 9.080643080579818e-07, "loss": 0.0003, "num_input_tokens_seen": 79519016, "step": 117965 }, { "epoch": 2.882026726602008, "grad_norm": 0.2095278650522232, "learning_rate": 9.079793919481032e-07, "loss": 0.0845, "num_input_tokens_seen": 79522280, "step": 117970 }, { "epoch": 2.8821488774338553, "grad_norm": 25.330570220947266, "learning_rate": 9.078944765074225e-07, "loss": 0.0587, "num_input_tokens_seen": 79525992, "step": 117975 }, { "epoch": 2.8822710282657025, "grad_norm": 0.003723717760294676, "learning_rate": 9.078095617365584e-07, "loss": 0.0467, "num_input_tokens_seen": 79529128, "step": 117980 }, { "epoch": 2.8823931790975497, "grad_norm": 0.011801057495176792, "learning_rate": 9.077246476361276e-07, "loss": 0.0002, "num_input_tokens_seen": 79532648, "step": 117985 }, { "epoch": 2.882515329929397, "grad_norm": 17.591293334960938, "learning_rate": 9.076397342067483e-07, "loss": 0.1119, "num_input_tokens_seen": 79535720, "step": 117990 }, { "epoch": 2.882637480761244, "grad_norm": 1.953616976737976, "learning_rate": 9.075548214490376e-07, "loss": 0.0794, "num_input_tokens_seen": 79539496, "step": 117995 }, { "epoch": 2.8827596315930912, "grad_norm": 0.05649435520172119, "learning_rate": 9.074699093636131e-07, "loss": 0.0648, "num_input_tokens_seen": 79542568, "step": 118000 }, { "epoch": 2.8828817824249384, "grad_norm": 1.586169719696045, "learning_rate": 9.073849979510926e-07, "loss": 0.1219, "num_input_tokens_seen": 79545832, "step": 118005 }, { "epoch": 2.8830039332567856, "grad_norm": 0.477380633354187, "learning_rate": 9.073000872120927e-07, "loss": 0.0005, "num_input_tokens_seen": 79548968, "step": 118010 }, { "epoch": 2.883126084088633, "grad_norm": 46.87863540649414, "learning_rate": 9.072151771472321e-07, "loss": 0.0312, "num_input_tokens_seen": 79552552, "step": 118015 }, { "epoch": 2.8832482349204795, "grad_norm": 0.1196833923459053, "learning_rate": 9.071302677571272e-07, "loss": 0.0009, "num_input_tokens_seen": 79555752, "step": 118020 }, { "epoch": 2.883370385752327, "grad_norm": 1.0895923376083374, "learning_rate": 9.070453590423959e-07, "loss": 0.022, "num_input_tokens_seen": 79558888, "step": 118025 }, { "epoch": 2.883492536584174, "grad_norm": 39.65727615356445, "learning_rate": 9.069604510036563e-07, "loss": 0.0734, "num_input_tokens_seen": 79562472, "step": 118030 }, { "epoch": 2.8836146874160216, "grad_norm": 69.60074615478516, "learning_rate": 9.068755436415247e-07, "loss": 0.0395, "num_input_tokens_seen": 79565480, "step": 118035 }, { "epoch": 2.8837368382478683, "grad_norm": 0.02860451489686966, "learning_rate": 9.067906369566198e-07, "loss": 0.0004, "num_input_tokens_seen": 79568616, "step": 118040 }, { "epoch": 2.8838589890797155, "grad_norm": 0.06782763451337814, "learning_rate": 9.06705730949558e-07, "loss": 0.0279, "num_input_tokens_seen": 79571688, "step": 118045 }, { "epoch": 2.8839811399115627, "grad_norm": 0.3268910348415375, "learning_rate": 9.066208256209576e-07, "loss": 0.0527, "num_input_tokens_seen": 79574952, "step": 118050 }, { "epoch": 2.88410329074341, "grad_norm": 15.585838317871094, "learning_rate": 9.065359209714356e-07, "loss": 0.1484, "num_input_tokens_seen": 79578088, "step": 118055 }, { "epoch": 2.884225441575257, "grad_norm": 0.0012059342116117477, "learning_rate": 9.064510170016092e-07, "loss": 0.0526, "num_input_tokens_seen": 79581544, "step": 118060 }, { "epoch": 2.8843475924071043, "grad_norm": 0.18633608520030975, "learning_rate": 9.063661137120966e-07, "loss": 0.0003, "num_input_tokens_seen": 79584616, "step": 118065 }, { "epoch": 2.8844697432389514, "grad_norm": 0.010403754189610481, "learning_rate": 9.062812111035143e-07, "loss": 0.0003, "num_input_tokens_seen": 79588456, "step": 118070 }, { "epoch": 2.8845918940707986, "grad_norm": 0.013725988566875458, "learning_rate": 9.061963091764809e-07, "loss": 0.048, "num_input_tokens_seen": 79591400, "step": 118075 }, { "epoch": 2.884714044902646, "grad_norm": 0.07384747266769409, "learning_rate": 9.061114079316124e-07, "loss": 0.0317, "num_input_tokens_seen": 79595112, "step": 118080 }, { "epoch": 2.884836195734493, "grad_norm": 0.027986222878098488, "learning_rate": 9.060265073695272e-07, "loss": 0.0711, "num_input_tokens_seen": 79598120, "step": 118085 }, { "epoch": 2.88495834656634, "grad_norm": 0.24027803540229797, "learning_rate": 9.059416074908429e-07, "loss": 0.0002, "num_input_tokens_seen": 79601256, "step": 118090 }, { "epoch": 2.8850804973981874, "grad_norm": 0.13268686830997467, "learning_rate": 9.058567082961764e-07, "loss": 0.0311, "num_input_tokens_seen": 79604328, "step": 118095 }, { "epoch": 2.8852026482300346, "grad_norm": 0.018090954050421715, "learning_rate": 9.057718097861452e-07, "loss": 0.0378, "num_input_tokens_seen": 79607720, "step": 118100 }, { "epoch": 2.8853247990618818, "grad_norm": 0.011310219764709473, "learning_rate": 9.056869119613667e-07, "loss": 0.0387, "num_input_tokens_seen": 79611048, "step": 118105 }, { "epoch": 2.885446949893729, "grad_norm": 0.02010614424943924, "learning_rate": 9.056020148224584e-07, "loss": 0.0, "num_input_tokens_seen": 79614632, "step": 118110 }, { "epoch": 2.8855691007255757, "grad_norm": 0.027373293414711952, "learning_rate": 9.055171183700376e-07, "loss": 0.0003, "num_input_tokens_seen": 79617960, "step": 118115 }, { "epoch": 2.8856912515574233, "grad_norm": 20.399688720703125, "learning_rate": 9.054322226047214e-07, "loss": 0.0904, "num_input_tokens_seen": 79621480, "step": 118120 }, { "epoch": 2.88581340238927, "grad_norm": 0.01832777075469494, "learning_rate": 9.053473275271281e-07, "loss": 0.0001, "num_input_tokens_seen": 79624872, "step": 118125 }, { "epoch": 2.8859355532211173, "grad_norm": 0.04512257128953934, "learning_rate": 9.05262433137874e-07, "loss": 0.0001, "num_input_tokens_seen": 79628200, "step": 118130 }, { "epoch": 2.8860577040529645, "grad_norm": 0.10394764691591263, "learning_rate": 9.051775394375775e-07, "loss": 0.0235, "num_input_tokens_seen": 79631656, "step": 118135 }, { "epoch": 2.8861798548848117, "grad_norm": 0.45173031091690063, "learning_rate": 9.050926464268549e-07, "loss": 0.0429, "num_input_tokens_seen": 79635112, "step": 118140 }, { "epoch": 2.886302005716659, "grad_norm": 0.007458243519067764, "learning_rate": 9.050077541063243e-07, "loss": 0.0283, "num_input_tokens_seen": 79638248, "step": 118145 }, { "epoch": 2.886424156548506, "grad_norm": 0.00957757979631424, "learning_rate": 9.049228624766029e-07, "loss": 0.0365, "num_input_tokens_seen": 79641256, "step": 118150 }, { "epoch": 2.8865463073803532, "grad_norm": 10.149345397949219, "learning_rate": 9.04837971538308e-07, "loss": 0.0593, "num_input_tokens_seen": 79644776, "step": 118155 }, { "epoch": 2.8866684582122004, "grad_norm": 0.0181287694722414, "learning_rate": 9.047530812920572e-07, "loss": 0.0001, "num_input_tokens_seen": 79648680, "step": 118160 }, { "epoch": 2.8867906090440476, "grad_norm": 0.01915971376001835, "learning_rate": 9.046681917384672e-07, "loss": 0.0596, "num_input_tokens_seen": 79652008, "step": 118165 }, { "epoch": 2.886912759875895, "grad_norm": 25.121864318847656, "learning_rate": 9.045833028781562e-07, "loss": 0.0422, "num_input_tokens_seen": 79655336, "step": 118170 }, { "epoch": 2.887034910707742, "grad_norm": 0.002604476409032941, "learning_rate": 9.044984147117406e-07, "loss": 0.0441, "num_input_tokens_seen": 79658408, "step": 118175 }, { "epoch": 2.887157061539589, "grad_norm": 0.1552521288394928, "learning_rate": 9.044135272398382e-07, "loss": 0.0003, "num_input_tokens_seen": 79661480, "step": 118180 }, { "epoch": 2.8872792123714364, "grad_norm": 0.04718397930264473, "learning_rate": 9.043286404630668e-07, "loss": 0.0311, "num_input_tokens_seen": 79664808, "step": 118185 }, { "epoch": 2.8874013632032836, "grad_norm": 22.923688888549805, "learning_rate": 9.042437543820428e-07, "loss": 0.1177, "num_input_tokens_seen": 79668072, "step": 118190 }, { "epoch": 2.8875235140351307, "grad_norm": 0.15967994928359985, "learning_rate": 9.041588689973845e-07, "loss": 0.0004, "num_input_tokens_seen": 79671336, "step": 118195 }, { "epoch": 2.8876456648669775, "grad_norm": 0.009539058431982994, "learning_rate": 9.040739843097082e-07, "loss": 0.0513, "num_input_tokens_seen": 79674856, "step": 118200 }, { "epoch": 2.887767815698825, "grad_norm": 0.019401682540774345, "learning_rate": 9.03989100319632e-07, "loss": 0.0004, "num_input_tokens_seen": 79678120, "step": 118205 }, { "epoch": 2.887889966530672, "grad_norm": 61.033382415771484, "learning_rate": 9.039042170277728e-07, "loss": 0.0283, "num_input_tokens_seen": 79681384, "step": 118210 }, { "epoch": 2.8880121173625195, "grad_norm": 0.18923260271549225, "learning_rate": 9.038193344347478e-07, "loss": 0.002, "num_input_tokens_seen": 79684712, "step": 118215 }, { "epoch": 2.8881342681943662, "grad_norm": 1.0554591417312622, "learning_rate": 9.037344525411747e-07, "loss": 0.0011, "num_input_tokens_seen": 79688168, "step": 118220 }, { "epoch": 2.8882564190262134, "grad_norm": 29.14522933959961, "learning_rate": 9.036495713476704e-07, "loss": 0.1608, "num_input_tokens_seen": 79691624, "step": 118225 }, { "epoch": 2.8883785698580606, "grad_norm": 0.2183917909860611, "learning_rate": 9.035646908548527e-07, "loss": 0.0006, "num_input_tokens_seen": 79694696, "step": 118230 }, { "epoch": 2.888500720689908, "grad_norm": 37.8150634765625, "learning_rate": 9.034798110633379e-07, "loss": 0.1012, "num_input_tokens_seen": 79698088, "step": 118235 }, { "epoch": 2.888622871521755, "grad_norm": 28.24630355834961, "learning_rate": 9.033949319737439e-07, "loss": 0.0404, "num_input_tokens_seen": 79701736, "step": 118240 }, { "epoch": 2.888745022353602, "grad_norm": 0.0458686463534832, "learning_rate": 9.033100535866885e-07, "loss": 0.0011, "num_input_tokens_seen": 79704808, "step": 118245 }, { "epoch": 2.8888671731854494, "grad_norm": 0.4126882553100586, "learning_rate": 9.032251759027881e-07, "loss": 0.0005, "num_input_tokens_seen": 79708136, "step": 118250 }, { "epoch": 2.8889893240172966, "grad_norm": 0.047328535467386246, "learning_rate": 9.031402989226603e-07, "loss": 0.0314, "num_input_tokens_seen": 79711464, "step": 118255 }, { "epoch": 2.8891114748491438, "grad_norm": 0.03057565726339817, "learning_rate": 9.030554226469222e-07, "loss": 0.0678, "num_input_tokens_seen": 79714984, "step": 118260 }, { "epoch": 2.889233625680991, "grad_norm": 0.32294678688049316, "learning_rate": 9.029705470761913e-07, "loss": 0.0005, "num_input_tokens_seen": 79718376, "step": 118265 }, { "epoch": 2.889355776512838, "grad_norm": 0.33089131116867065, "learning_rate": 9.028856722110846e-07, "loss": 0.0003, "num_input_tokens_seen": 79721576, "step": 118270 }, { "epoch": 2.8894779273446853, "grad_norm": 0.025815755128860474, "learning_rate": 9.028007980522192e-07, "loss": 0.0543, "num_input_tokens_seen": 79724712, "step": 118275 }, { "epoch": 2.8896000781765325, "grad_norm": 0.16206568479537964, "learning_rate": 9.02715924600213e-07, "loss": 0.0002, "num_input_tokens_seen": 79727848, "step": 118280 }, { "epoch": 2.8897222290083793, "grad_norm": 0.056277234107255936, "learning_rate": 9.026310518556822e-07, "loss": 0.0002, "num_input_tokens_seen": 79730920, "step": 118285 }, { "epoch": 2.889844379840227, "grad_norm": 0.020027903839945793, "learning_rate": 9.025461798192452e-07, "loss": 0.1201, "num_input_tokens_seen": 79734376, "step": 118290 }, { "epoch": 2.8899665306720737, "grad_norm": 0.054904062300920486, "learning_rate": 9.024613084915181e-07, "loss": 0.0417, "num_input_tokens_seen": 79737448, "step": 118295 }, { "epoch": 2.8900886815039213, "grad_norm": 0.0019138812785968184, "learning_rate": 9.023764378731189e-07, "loss": 0.0603, "num_input_tokens_seen": 79740392, "step": 118300 }, { "epoch": 2.890210832335768, "grad_norm": 0.16153118014335632, "learning_rate": 9.022915679646643e-07, "loss": 0.0005, "num_input_tokens_seen": 79743848, "step": 118305 }, { "epoch": 2.890332983167615, "grad_norm": 0.005311083514243364, "learning_rate": 9.022066987667717e-07, "loss": 0.0001, "num_input_tokens_seen": 79746920, "step": 118310 }, { "epoch": 2.8904551339994624, "grad_norm": 13.794023513793945, "learning_rate": 9.021218302800586e-07, "loss": 0.0396, "num_input_tokens_seen": 79750184, "step": 118315 }, { "epoch": 2.8905772848313096, "grad_norm": 0.019158679991960526, "learning_rate": 9.020369625051414e-07, "loss": 0.0003, "num_input_tokens_seen": 79753640, "step": 118320 }, { "epoch": 2.890699435663157, "grad_norm": 0.02267547883093357, "learning_rate": 9.019520954426383e-07, "loss": 0.0379, "num_input_tokens_seen": 79756904, "step": 118325 }, { "epoch": 2.890821586495004, "grad_norm": 0.3091530203819275, "learning_rate": 9.018672290931654e-07, "loss": 0.0003, "num_input_tokens_seen": 79759720, "step": 118330 }, { "epoch": 2.890943737326851, "grad_norm": 0.02154541015625, "learning_rate": 9.017823634573404e-07, "loss": 0.0473, "num_input_tokens_seen": 79762856, "step": 118335 }, { "epoch": 2.8910658881586984, "grad_norm": 15.876699447631836, "learning_rate": 9.01697498535781e-07, "loss": 0.0804, "num_input_tokens_seen": 79765864, "step": 118340 }, { "epoch": 2.8911880389905456, "grad_norm": 0.03007902391254902, "learning_rate": 9.016126343291033e-07, "loss": 0.0002, "num_input_tokens_seen": 79769384, "step": 118345 }, { "epoch": 2.8913101898223927, "grad_norm": 0.01837383396923542, "learning_rate": 9.015277708379254e-07, "loss": 0.0477, "num_input_tokens_seen": 79772840, "step": 118350 }, { "epoch": 2.89143234065424, "grad_norm": 31.521644592285156, "learning_rate": 9.014429080628636e-07, "loss": 0.0381, "num_input_tokens_seen": 79775912, "step": 118355 }, { "epoch": 2.891554491486087, "grad_norm": 0.011032157577574253, "learning_rate": 9.01358046004536e-07, "loss": 0.0549, "num_input_tokens_seen": 79779112, "step": 118360 }, { "epoch": 2.8916766423179343, "grad_norm": 32.51725769042969, "learning_rate": 9.012731846635589e-07, "loss": 0.0525, "num_input_tokens_seen": 79782248, "step": 118365 }, { "epoch": 2.8917987931497815, "grad_norm": 0.015014342032372952, "learning_rate": 9.011883240405496e-07, "loss": 0.0164, "num_input_tokens_seen": 79785512, "step": 118370 }, { "epoch": 2.8919209439816287, "grad_norm": 0.032123107463121414, "learning_rate": 9.011034641361259e-07, "loss": 0.0004, "num_input_tokens_seen": 79788712, "step": 118375 }, { "epoch": 2.8920430948134754, "grad_norm": 0.1224132776260376, "learning_rate": 9.010186049509038e-07, "loss": 0.0002, "num_input_tokens_seen": 79792424, "step": 118380 }, { "epoch": 2.892165245645323, "grad_norm": 0.01400213222950697, "learning_rate": 9.009337464855016e-07, "loss": 0.0003, "num_input_tokens_seen": 79795560, "step": 118385 }, { "epoch": 2.89228739647717, "grad_norm": 1.1057496070861816, "learning_rate": 9.008488887405354e-07, "loss": 0.0077, "num_input_tokens_seen": 79798824, "step": 118390 }, { "epoch": 2.8924095473090174, "grad_norm": 0.032255593687295914, "learning_rate": 9.007640317166228e-07, "loss": 0.0389, "num_input_tokens_seen": 79802664, "step": 118395 }, { "epoch": 2.892531698140864, "grad_norm": 0.11206360161304474, "learning_rate": 9.006791754143812e-07, "loss": 0.0001, "num_input_tokens_seen": 79805928, "step": 118400 }, { "epoch": 2.8926538489727114, "grad_norm": 0.29293307662010193, "learning_rate": 9.005943198344271e-07, "loss": 0.0004, "num_input_tokens_seen": 79808936, "step": 118405 }, { "epoch": 2.8927759998045586, "grad_norm": 0.0059355502016842365, "learning_rate": 9.005094649773779e-07, "loss": 0.0001, "num_input_tokens_seen": 79812520, "step": 118410 }, { "epoch": 2.8928981506364058, "grad_norm": 0.06507638841867447, "learning_rate": 9.004246108438505e-07, "loss": 0.066, "num_input_tokens_seen": 79815720, "step": 118415 }, { "epoch": 2.893020301468253, "grad_norm": 0.008765455335378647, "learning_rate": 9.003397574344624e-07, "loss": 0.1074, "num_input_tokens_seen": 79819368, "step": 118420 }, { "epoch": 2.8931424523001, "grad_norm": 0.025619253516197205, "learning_rate": 9.002549047498301e-07, "loss": 0.0001, "num_input_tokens_seen": 79822632, "step": 118425 }, { "epoch": 2.8932646031319473, "grad_norm": 0.018892310559749603, "learning_rate": 9.001700527905709e-07, "loss": 0.0616, "num_input_tokens_seen": 79826152, "step": 118430 }, { "epoch": 2.8933867539637945, "grad_norm": 0.1085629016160965, "learning_rate": 9.000852015573024e-07, "loss": 0.0312, "num_input_tokens_seen": 79829288, "step": 118435 }, { "epoch": 2.8935089047956417, "grad_norm": 22.9990234375, "learning_rate": 9.000003510506407e-07, "loss": 0.0863, "num_input_tokens_seen": 79832680, "step": 118440 }, { "epoch": 2.893631055627489, "grad_norm": 0.1559326946735382, "learning_rate": 8.999155012712036e-07, "loss": 0.0971, "num_input_tokens_seen": 79836200, "step": 118445 }, { "epoch": 2.893753206459336, "grad_norm": 0.017499227076768875, "learning_rate": 8.998306522196077e-07, "loss": 0.0002, "num_input_tokens_seen": 79839592, "step": 118450 }, { "epoch": 2.8938753572911833, "grad_norm": 0.3136901259422302, "learning_rate": 8.997458038964706e-07, "loss": 0.0369, "num_input_tokens_seen": 79842984, "step": 118455 }, { "epoch": 2.8939975081230305, "grad_norm": 54.96400451660156, "learning_rate": 8.996609563024084e-07, "loss": 0.0433, "num_input_tokens_seen": 79846184, "step": 118460 }, { "epoch": 2.894119658954877, "grad_norm": 27.167482376098633, "learning_rate": 8.995761094380392e-07, "loss": 0.0678, "num_input_tokens_seen": 79850024, "step": 118465 }, { "epoch": 2.894241809786725, "grad_norm": 0.40186038613319397, "learning_rate": 8.994912633039796e-07, "loss": 0.0003, "num_input_tokens_seen": 79853416, "step": 118470 }, { "epoch": 2.8943639606185716, "grad_norm": 0.07050628960132599, "learning_rate": 8.994064179008461e-07, "loss": 0.1456, "num_input_tokens_seen": 79856680, "step": 118475 }, { "epoch": 2.8944861114504192, "grad_norm": 0.1932118982076645, "learning_rate": 8.993215732292567e-07, "loss": 0.0005, "num_input_tokens_seen": 79860072, "step": 118480 }, { "epoch": 2.894608262282266, "grad_norm": 30.953916549682617, "learning_rate": 8.992367292898274e-07, "loss": 0.069, "num_input_tokens_seen": 79863528, "step": 118485 }, { "epoch": 2.894730413114113, "grad_norm": 0.2753456234931946, "learning_rate": 8.991518860831758e-07, "loss": 0.0002, "num_input_tokens_seen": 79866600, "step": 118490 }, { "epoch": 2.8948525639459604, "grad_norm": 0.0773712545633316, "learning_rate": 8.990670436099192e-07, "loss": 0.002, "num_input_tokens_seen": 79869800, "step": 118495 }, { "epoch": 2.8949747147778075, "grad_norm": 0.15468727052211761, "learning_rate": 8.989822018706738e-07, "loss": 0.0002, "num_input_tokens_seen": 79873064, "step": 118500 }, { "epoch": 2.8950968656096547, "grad_norm": 0.02235586568713188, "learning_rate": 8.988973608660572e-07, "loss": 0.0997, "num_input_tokens_seen": 79876776, "step": 118505 }, { "epoch": 2.895219016441502, "grad_norm": 44.47627639770508, "learning_rate": 8.988125205966861e-07, "loss": 0.102, "num_input_tokens_seen": 79880232, "step": 118510 }, { "epoch": 2.895341167273349, "grad_norm": 0.013052013702690601, "learning_rate": 8.987276810631779e-07, "loss": 0.0003, "num_input_tokens_seen": 79883624, "step": 118515 }, { "epoch": 2.8954633181051963, "grad_norm": 0.09096262603998184, "learning_rate": 8.986428422661489e-07, "loss": 0.113, "num_input_tokens_seen": 79886760, "step": 118520 }, { "epoch": 2.8955854689370435, "grad_norm": 0.007621260825544596, "learning_rate": 8.985580042062163e-07, "loss": 0.0002, "num_input_tokens_seen": 79890408, "step": 118525 }, { "epoch": 2.8957076197688907, "grad_norm": 22.843814849853516, "learning_rate": 8.984731668839976e-07, "loss": 0.117, "num_input_tokens_seen": 79893800, "step": 118530 }, { "epoch": 2.895829770600738, "grad_norm": 19.730802536010742, "learning_rate": 8.983883303001088e-07, "loss": 0.0008, "num_input_tokens_seen": 79897000, "step": 118535 }, { "epoch": 2.895951921432585, "grad_norm": 0.05201861262321472, "learning_rate": 8.98303494455168e-07, "loss": 0.0004, "num_input_tokens_seen": 79900328, "step": 118540 }, { "epoch": 2.8960740722644323, "grad_norm": 0.8403574824333191, "learning_rate": 8.982186593497909e-07, "loss": 0.0257, "num_input_tokens_seen": 79903272, "step": 118545 }, { "epoch": 2.8961962230962794, "grad_norm": 0.0860314890742302, "learning_rate": 8.981338249845952e-07, "loss": 0.0007, "num_input_tokens_seen": 79907240, "step": 118550 }, { "epoch": 2.8963183739281266, "grad_norm": 0.011775447055697441, "learning_rate": 8.980489913601982e-07, "loss": 0.046, "num_input_tokens_seen": 79910248, "step": 118555 }, { "epoch": 2.8964405247599734, "grad_norm": 11.870306968688965, "learning_rate": 8.979641584772161e-07, "loss": 0.0285, "num_input_tokens_seen": 79913832, "step": 118560 }, { "epoch": 2.896562675591821, "grad_norm": 0.008414355106651783, "learning_rate": 8.97879326336266e-07, "loss": 0.03, "num_input_tokens_seen": 79917608, "step": 118565 }, { "epoch": 2.8966848264236678, "grad_norm": 0.4362105131149292, "learning_rate": 8.977944949379652e-07, "loss": 0.038, "num_input_tokens_seen": 79921192, "step": 118570 }, { "epoch": 2.896806977255515, "grad_norm": 0.07210730016231537, "learning_rate": 8.977096642829301e-07, "loss": 0.0002, "num_input_tokens_seen": 79924264, "step": 118575 }, { "epoch": 2.896929128087362, "grad_norm": 25.157201766967773, "learning_rate": 8.976248343717778e-07, "loss": 0.101, "num_input_tokens_seen": 79927656, "step": 118580 }, { "epoch": 2.8970512789192093, "grad_norm": 0.6024566292762756, "learning_rate": 8.97540005205125e-07, "loss": 0.0175, "num_input_tokens_seen": 79931112, "step": 118585 }, { "epoch": 2.8971734297510565, "grad_norm": 0.12825407087802887, "learning_rate": 8.974551767835893e-07, "loss": 0.052, "num_input_tokens_seen": 79934184, "step": 118590 }, { "epoch": 2.8972955805829037, "grad_norm": 22.66798210144043, "learning_rate": 8.973703491077867e-07, "loss": 0.076, "num_input_tokens_seen": 79938152, "step": 118595 }, { "epoch": 2.897417731414751, "grad_norm": 0.0005638687289319932, "learning_rate": 8.972855221783351e-07, "loss": 0.001, "num_input_tokens_seen": 79941992, "step": 118600 }, { "epoch": 2.897539882246598, "grad_norm": 0.007802937179803848, "learning_rate": 8.972006959958502e-07, "loss": 0.108, "num_input_tokens_seen": 79945512, "step": 118605 }, { "epoch": 2.8976620330784453, "grad_norm": 27.239303588867188, "learning_rate": 8.9711587056095e-07, "loss": 0.1229, "num_input_tokens_seen": 79948520, "step": 118610 }, { "epoch": 2.8977841839102925, "grad_norm": 0.05133926868438721, "learning_rate": 8.970310458742505e-07, "loss": 0.0696, "num_input_tokens_seen": 79951976, "step": 118615 }, { "epoch": 2.8979063347421397, "grad_norm": 0.03441181033849716, "learning_rate": 8.969462219363691e-07, "loss": 0.0124, "num_input_tokens_seen": 79955304, "step": 118620 }, { "epoch": 2.898028485573987, "grad_norm": 0.005173355340957642, "learning_rate": 8.968613987479227e-07, "loss": 0.0007, "num_input_tokens_seen": 79958632, "step": 118625 }, { "epoch": 2.898150636405834, "grad_norm": 33.71710205078125, "learning_rate": 8.967765763095274e-07, "loss": 0.1746, "num_input_tokens_seen": 79961960, "step": 118630 }, { "epoch": 2.8982727872376812, "grad_norm": 0.1187821552157402, "learning_rate": 8.966917546218012e-07, "loss": 0.0957, "num_input_tokens_seen": 79965672, "step": 118635 }, { "epoch": 2.8983949380695284, "grad_norm": 0.04378324747085571, "learning_rate": 8.966069336853598e-07, "loss": 0.0004, "num_input_tokens_seen": 79969128, "step": 118640 }, { "epoch": 2.898517088901375, "grad_norm": 0.30410701036453247, "learning_rate": 8.965221135008207e-07, "loss": 0.0006, "num_input_tokens_seen": 79973096, "step": 118645 }, { "epoch": 2.898639239733223, "grad_norm": 0.022590233013033867, "learning_rate": 8.96437294068801e-07, "loss": 0.0502, "num_input_tokens_seen": 79976936, "step": 118650 }, { "epoch": 2.8987613905650695, "grad_norm": 0.03725120425224304, "learning_rate": 8.963524753899167e-07, "loss": 0.0165, "num_input_tokens_seen": 79980264, "step": 118655 }, { "epoch": 2.898883541396917, "grad_norm": 0.4708126187324524, "learning_rate": 8.962676574647855e-07, "loss": 0.0442, "num_input_tokens_seen": 79984040, "step": 118660 }, { "epoch": 2.899005692228764, "grad_norm": 0.024750245735049248, "learning_rate": 8.961828402940233e-07, "loss": 0.0008, "num_input_tokens_seen": 79987240, "step": 118665 }, { "epoch": 2.899127843060611, "grad_norm": 0.057560380548238754, "learning_rate": 8.96098023878248e-07, "loss": 0.0009, "num_input_tokens_seen": 79990568, "step": 118670 }, { "epoch": 2.8992499938924583, "grad_norm": 2.3662123680114746, "learning_rate": 8.960132082180755e-07, "loss": 0.0004, "num_input_tokens_seen": 79994664, "step": 118675 }, { "epoch": 2.8993721447243055, "grad_norm": 0.03252064064145088, "learning_rate": 8.959283933141227e-07, "loss": 0.0007, "num_input_tokens_seen": 79998312, "step": 118680 }, { "epoch": 2.8994942955561527, "grad_norm": 29.49829864501953, "learning_rate": 8.958435791670071e-07, "loss": 0.0703, "num_input_tokens_seen": 80001960, "step": 118685 }, { "epoch": 2.899616446388, "grad_norm": 0.45613086223602295, "learning_rate": 8.957587657773447e-07, "loss": 0.0645, "num_input_tokens_seen": 80005544, "step": 118690 }, { "epoch": 2.899738597219847, "grad_norm": 0.009808986447751522, "learning_rate": 8.956739531457528e-07, "loss": 0.0504, "num_input_tokens_seen": 80009128, "step": 118695 }, { "epoch": 2.8998607480516942, "grad_norm": 0.34509333968162537, "learning_rate": 8.955891412728476e-07, "loss": 0.0348, "num_input_tokens_seen": 80012136, "step": 118700 }, { "epoch": 2.8999828988835414, "grad_norm": 0.0034415319096297026, "learning_rate": 8.955043301592463e-07, "loss": 0.0459, "num_input_tokens_seen": 80016296, "step": 118705 }, { "epoch": 2.9001050497153886, "grad_norm": 0.04839680716395378, "learning_rate": 8.954195198055659e-07, "loss": 0.1135, "num_input_tokens_seen": 80019368, "step": 118710 }, { "epoch": 2.900227200547236, "grad_norm": 0.041439514607191086, "learning_rate": 8.953347102124229e-07, "loss": 0.0673, "num_input_tokens_seen": 80022440, "step": 118715 }, { "epoch": 2.900349351379083, "grad_norm": 0.20260454714298248, "learning_rate": 8.952499013804339e-07, "loss": 0.0003, "num_input_tokens_seen": 80025960, "step": 118720 }, { "epoch": 2.90047150221093, "grad_norm": 0.02303224429488182, "learning_rate": 8.951650933102158e-07, "loss": 0.0003, "num_input_tokens_seen": 80029224, "step": 118725 }, { "epoch": 2.9005936530427774, "grad_norm": 0.21073459088802338, "learning_rate": 8.950802860023854e-07, "loss": 0.0007, "num_input_tokens_seen": 80032296, "step": 118730 }, { "epoch": 2.9007158038746246, "grad_norm": 0.17983902990818024, "learning_rate": 8.949954794575593e-07, "loss": 0.0003, "num_input_tokens_seen": 80035560, "step": 118735 }, { "epoch": 2.9008379547064713, "grad_norm": 0.009431470185518265, "learning_rate": 8.949106736763541e-07, "loss": 0.033, "num_input_tokens_seen": 80038760, "step": 118740 }, { "epoch": 2.900960105538319, "grad_norm": 0.48249107599258423, "learning_rate": 8.948258686593872e-07, "loss": 0.0003, "num_input_tokens_seen": 80042024, "step": 118745 }, { "epoch": 2.9010822563701657, "grad_norm": 0.07081861048936844, "learning_rate": 8.947410644072745e-07, "loss": 0.0002, "num_input_tokens_seen": 80045288, "step": 118750 }, { "epoch": 2.901204407202013, "grad_norm": 0.12552383542060852, "learning_rate": 8.946562609206334e-07, "loss": 0.0002, "num_input_tokens_seen": 80048488, "step": 118755 }, { "epoch": 2.90132655803386, "grad_norm": 0.033153653144836426, "learning_rate": 8.9457145820008e-07, "loss": 0.0005, "num_input_tokens_seen": 80051752, "step": 118760 }, { "epoch": 2.9014487088657073, "grad_norm": 0.006722915451973677, "learning_rate": 8.944866562462317e-07, "loss": 0.0, "num_input_tokens_seen": 80055144, "step": 118765 }, { "epoch": 2.9015708596975545, "grad_norm": 0.16946537792682648, "learning_rate": 8.944018550597043e-07, "loss": 0.0548, "num_input_tokens_seen": 80058664, "step": 118770 }, { "epoch": 2.9016930105294017, "grad_norm": 0.028818394988775253, "learning_rate": 8.943170546411153e-07, "loss": 0.0006, "num_input_tokens_seen": 80061672, "step": 118775 }, { "epoch": 2.901815161361249, "grad_norm": 0.01083712000399828, "learning_rate": 8.942322549910813e-07, "loss": 0.0002, "num_input_tokens_seen": 80064744, "step": 118780 }, { "epoch": 2.901937312193096, "grad_norm": 0.0124315544962883, "learning_rate": 8.941474561102185e-07, "loss": 0.0701, "num_input_tokens_seen": 80068200, "step": 118785 }, { "epoch": 2.902059463024943, "grad_norm": 0.02468065172433853, "learning_rate": 8.940626579991442e-07, "loss": 0.0, "num_input_tokens_seen": 80071464, "step": 118790 }, { "epoch": 2.9021816138567904, "grad_norm": 36.39851379394531, "learning_rate": 8.939778606584743e-07, "loss": 0.0936, "num_input_tokens_seen": 80074728, "step": 118795 }, { "epoch": 2.9023037646886376, "grad_norm": 0.20101211965084076, "learning_rate": 8.938930640888258e-07, "loss": 0.035, "num_input_tokens_seen": 80077928, "step": 118800 }, { "epoch": 2.902425915520485, "grad_norm": 25.310131072998047, "learning_rate": 8.93808268290816e-07, "loss": 0.1283, "num_input_tokens_seen": 80080936, "step": 118805 }, { "epoch": 2.902548066352332, "grad_norm": 4.265520095825195, "learning_rate": 8.937234732650606e-07, "loss": 0.0009, "num_input_tokens_seen": 80084648, "step": 118810 }, { "epoch": 2.902670217184179, "grad_norm": 0.2513897120952606, "learning_rate": 8.936386790121772e-07, "loss": 0.0339, "num_input_tokens_seen": 80088168, "step": 118815 }, { "epoch": 2.9027923680160264, "grad_norm": 0.02958909422159195, "learning_rate": 8.935538855327814e-07, "loss": 0.1282, "num_input_tokens_seen": 80091432, "step": 118820 }, { "epoch": 2.902914518847873, "grad_norm": 0.1474994271993637, "learning_rate": 8.934690928274908e-07, "loss": 0.0003, "num_input_tokens_seen": 80094824, "step": 118825 }, { "epoch": 2.9030366696797207, "grad_norm": 1.460278868675232, "learning_rate": 8.933843008969215e-07, "loss": 0.0645, "num_input_tokens_seen": 80098344, "step": 118830 }, { "epoch": 2.9031588205115675, "grad_norm": 0.08092187345027924, "learning_rate": 8.9329950974169e-07, "loss": 0.0006, "num_input_tokens_seen": 80102312, "step": 118835 }, { "epoch": 2.903280971343415, "grad_norm": 0.02012321911752224, "learning_rate": 8.932147193624135e-07, "loss": 0.0003, "num_input_tokens_seen": 80105640, "step": 118840 }, { "epoch": 2.903403122175262, "grad_norm": 0.08701890707015991, "learning_rate": 8.931299297597079e-07, "loss": 0.0005, "num_input_tokens_seen": 80108968, "step": 118845 }, { "epoch": 2.903525273007109, "grad_norm": 0.006065746303647757, "learning_rate": 8.930451409341908e-07, "loss": 0.0633, "num_input_tokens_seen": 80112232, "step": 118850 }, { "epoch": 2.9036474238389562, "grad_norm": 0.07881610095500946, "learning_rate": 8.929603528864775e-07, "loss": 0.0004, "num_input_tokens_seen": 80115176, "step": 118855 }, { "epoch": 2.9037695746708034, "grad_norm": 0.11350667476654053, "learning_rate": 8.928755656171853e-07, "loss": 0.0004, "num_input_tokens_seen": 80118632, "step": 118860 }, { "epoch": 2.9038917255026506, "grad_norm": 0.05210879072546959, "learning_rate": 8.927907791269314e-07, "loss": 0.0001, "num_input_tokens_seen": 80122152, "step": 118865 }, { "epoch": 2.904013876334498, "grad_norm": 0.02516857162117958, "learning_rate": 8.927059934163316e-07, "loss": 0.0005, "num_input_tokens_seen": 80125672, "step": 118870 }, { "epoch": 2.904136027166345, "grad_norm": 0.021097134798765182, "learning_rate": 8.926212084860025e-07, "loss": 0.0004, "num_input_tokens_seen": 80129064, "step": 118875 }, { "epoch": 2.904258177998192, "grad_norm": 0.27393844723701477, "learning_rate": 8.925364243365609e-07, "loss": 0.0452, "num_input_tokens_seen": 80132520, "step": 118880 }, { "epoch": 2.9043803288300394, "grad_norm": 0.19845488667488098, "learning_rate": 8.924516409686235e-07, "loss": 0.0003, "num_input_tokens_seen": 80136104, "step": 118885 }, { "epoch": 2.9045024796618866, "grad_norm": 0.014890705235302448, "learning_rate": 8.923668583828066e-07, "loss": 0.0, "num_input_tokens_seen": 80139304, "step": 118890 }, { "epoch": 2.9046246304937338, "grad_norm": 0.07981786876916885, "learning_rate": 8.922820765797265e-07, "loss": 0.0005, "num_input_tokens_seen": 80142760, "step": 118895 }, { "epoch": 2.904746781325581, "grad_norm": 0.09087047725915909, "learning_rate": 8.921972955600006e-07, "loss": 0.0583, "num_input_tokens_seen": 80146664, "step": 118900 }, { "epoch": 2.904868932157428, "grad_norm": 0.19944734871387482, "learning_rate": 8.921125153242447e-07, "loss": 0.0017, "num_input_tokens_seen": 80150184, "step": 118905 }, { "epoch": 2.904991082989275, "grad_norm": 0.46191540360450745, "learning_rate": 8.920277358730759e-07, "loss": 0.0004, "num_input_tokens_seen": 80153448, "step": 118910 }, { "epoch": 2.9051132338211225, "grad_norm": 0.025700677186250687, "learning_rate": 8.9194295720711e-07, "loss": 0.0466, "num_input_tokens_seen": 80156904, "step": 118915 }, { "epoch": 2.9052353846529693, "grad_norm": 32.34309005737305, "learning_rate": 8.918581793269645e-07, "loss": 0.0635, "num_input_tokens_seen": 80159848, "step": 118920 }, { "epoch": 2.905357535484817, "grad_norm": 0.011389784514904022, "learning_rate": 8.917734022332549e-07, "loss": 0.0002, "num_input_tokens_seen": 80163304, "step": 118925 }, { "epoch": 2.9054796863166636, "grad_norm": 0.04762762784957886, "learning_rate": 8.916886259265985e-07, "loss": 0.0615, "num_input_tokens_seen": 80166632, "step": 118930 }, { "epoch": 2.905601837148511, "grad_norm": 0.018525706604123116, "learning_rate": 8.916038504076117e-07, "loss": 0.0403, "num_input_tokens_seen": 80169832, "step": 118935 }, { "epoch": 2.905723987980358, "grad_norm": 0.010757447220385075, "learning_rate": 8.915190756769104e-07, "loss": 0.0435, "num_input_tokens_seen": 80173224, "step": 118940 }, { "epoch": 2.905846138812205, "grad_norm": 0.012967376969754696, "learning_rate": 8.91434301735112e-07, "loss": 0.0, "num_input_tokens_seen": 80176680, "step": 118945 }, { "epoch": 2.9059682896440524, "grad_norm": 0.03979997709393501, "learning_rate": 8.913495285828323e-07, "loss": 0.0005, "num_input_tokens_seen": 80179688, "step": 118950 }, { "epoch": 2.9060904404758996, "grad_norm": 0.012289268895983696, "learning_rate": 8.912647562206879e-07, "loss": 0.0277, "num_input_tokens_seen": 80183208, "step": 118955 }, { "epoch": 2.906212591307747, "grad_norm": 2.190854072570801, "learning_rate": 8.911799846492959e-07, "loss": 0.0351, "num_input_tokens_seen": 80186280, "step": 118960 }, { "epoch": 2.906334742139594, "grad_norm": 0.36408019065856934, "learning_rate": 8.910952138692718e-07, "loss": 0.0683, "num_input_tokens_seen": 80189224, "step": 118965 }, { "epoch": 2.906456892971441, "grad_norm": 13.64661693572998, "learning_rate": 8.910104438812332e-07, "loss": 0.0866, "num_input_tokens_seen": 80192488, "step": 118970 }, { "epoch": 2.9065790438032884, "grad_norm": 0.03847251832485199, "learning_rate": 8.909256746857953e-07, "loss": 0.1192, "num_input_tokens_seen": 80196008, "step": 118975 }, { "epoch": 2.9067011946351355, "grad_norm": 0.044732265174388885, "learning_rate": 8.908409062835759e-07, "loss": 0.0297, "num_input_tokens_seen": 80199976, "step": 118980 }, { "epoch": 2.9068233454669827, "grad_norm": 0.18884044885635376, "learning_rate": 8.907561386751905e-07, "loss": 0.001, "num_input_tokens_seen": 80203048, "step": 118985 }, { "epoch": 2.90694549629883, "grad_norm": 0.014213848859071732, "learning_rate": 8.906713718612555e-07, "loss": 0.0578, "num_input_tokens_seen": 80206248, "step": 118990 }, { "epoch": 2.907067647130677, "grad_norm": 0.019253773614764214, "learning_rate": 8.905866058423884e-07, "loss": 0.0005, "num_input_tokens_seen": 80209384, "step": 118995 }, { "epoch": 2.9071897979625243, "grad_norm": 0.015433445572853088, "learning_rate": 8.905018406192042e-07, "loss": 0.0004, "num_input_tokens_seen": 80212776, "step": 119000 }, { "epoch": 2.907311948794371, "grad_norm": 0.019289663061499596, "learning_rate": 8.904170761923206e-07, "loss": 0.0001, "num_input_tokens_seen": 80216040, "step": 119005 }, { "epoch": 2.9074340996262187, "grad_norm": 19.523340225219727, "learning_rate": 8.903323125623531e-07, "loss": 0.0406, "num_input_tokens_seen": 80219624, "step": 119010 }, { "epoch": 2.9075562504580654, "grad_norm": 0.09069832414388657, "learning_rate": 8.90247549729919e-07, "loss": 0.0843, "num_input_tokens_seen": 80223144, "step": 119015 }, { "epoch": 2.907678401289913, "grad_norm": 12.428088188171387, "learning_rate": 8.901627876956337e-07, "loss": 0.1005, "num_input_tokens_seen": 80226856, "step": 119020 }, { "epoch": 2.90780055212176, "grad_norm": 0.13833267986774445, "learning_rate": 8.900780264601144e-07, "loss": 0.0006, "num_input_tokens_seen": 80230120, "step": 119025 }, { "epoch": 2.907922702953607, "grad_norm": 0.11480916291475296, "learning_rate": 8.899932660239773e-07, "loss": 0.0362, "num_input_tokens_seen": 80233512, "step": 119030 }, { "epoch": 2.908044853785454, "grad_norm": 0.019525211304426193, "learning_rate": 8.899085063878387e-07, "loss": 0.0922, "num_input_tokens_seen": 80236648, "step": 119035 }, { "epoch": 2.9081670046173014, "grad_norm": 0.29217401146888733, "learning_rate": 8.89823747552315e-07, "loss": 0.0004, "num_input_tokens_seen": 80239656, "step": 119040 }, { "epoch": 2.9082891554491486, "grad_norm": 0.0739276260137558, "learning_rate": 8.897389895180228e-07, "loss": 0.0003, "num_input_tokens_seen": 80243176, "step": 119045 }, { "epoch": 2.9084113062809958, "grad_norm": 0.012618579901754856, "learning_rate": 8.89654232285578e-07, "loss": 0.0005, "num_input_tokens_seen": 80246440, "step": 119050 }, { "epoch": 2.908533457112843, "grad_norm": 0.032436829060316086, "learning_rate": 8.895694758555979e-07, "loss": 0.1446, "num_input_tokens_seen": 80249832, "step": 119055 }, { "epoch": 2.90865560794469, "grad_norm": 0.4129319489002228, "learning_rate": 8.894847202286976e-07, "loss": 0.0979, "num_input_tokens_seen": 80253864, "step": 119060 }, { "epoch": 2.9087777587765373, "grad_norm": 0.024983467534184456, "learning_rate": 8.893999654054947e-07, "loss": 0.0005, "num_input_tokens_seen": 80257064, "step": 119065 }, { "epoch": 2.9088999096083845, "grad_norm": 0.00642590643838048, "learning_rate": 8.893152113866045e-07, "loss": 0.0008, "num_input_tokens_seen": 80260456, "step": 119070 }, { "epoch": 2.9090220604402317, "grad_norm": 0.025908183306455612, "learning_rate": 8.892304581726444e-07, "loss": 0.0608, "num_input_tokens_seen": 80263912, "step": 119075 }, { "epoch": 2.909144211272079, "grad_norm": 0.08259736746549606, "learning_rate": 8.891457057642296e-07, "loss": 0.0639, "num_input_tokens_seen": 80267240, "step": 119080 }, { "epoch": 2.909266362103926, "grad_norm": 14.999557495117188, "learning_rate": 8.890609541619775e-07, "loss": 0.0015, "num_input_tokens_seen": 80270504, "step": 119085 }, { "epoch": 2.909388512935773, "grad_norm": 0.385803759098053, "learning_rate": 8.88976203366504e-07, "loss": 0.0003, "num_input_tokens_seen": 80273960, "step": 119090 }, { "epoch": 2.9095106637676205, "grad_norm": 13.95687198638916, "learning_rate": 8.88891453378425e-07, "loss": 0.0896, "num_input_tokens_seen": 80277160, "step": 119095 }, { "epoch": 2.909632814599467, "grad_norm": 0.04457539692521095, "learning_rate": 8.888067041983577e-07, "loss": 0.082, "num_input_tokens_seen": 80280168, "step": 119100 }, { "epoch": 2.909754965431315, "grad_norm": 0.0129321264103055, "learning_rate": 8.887219558269176e-07, "loss": 0.0943, "num_input_tokens_seen": 80283496, "step": 119105 }, { "epoch": 2.9098771162631616, "grad_norm": 0.028965329751372337, "learning_rate": 8.886372082647212e-07, "loss": 0.0014, "num_input_tokens_seen": 80286952, "step": 119110 }, { "epoch": 2.909999267095009, "grad_norm": 0.08315041661262512, "learning_rate": 8.885524615123855e-07, "loss": 0.0001, "num_input_tokens_seen": 80290216, "step": 119115 }, { "epoch": 2.910121417926856, "grad_norm": 0.0037849927321076393, "learning_rate": 8.88467715570526e-07, "loss": 0.0558, "num_input_tokens_seen": 80293608, "step": 119120 }, { "epoch": 2.910243568758703, "grad_norm": 0.017975879833102226, "learning_rate": 8.883829704397594e-07, "loss": 0.0561, "num_input_tokens_seen": 80297064, "step": 119125 }, { "epoch": 2.9103657195905503, "grad_norm": 43.18073272705078, "learning_rate": 8.882982261207016e-07, "loss": 0.135, "num_input_tokens_seen": 80300136, "step": 119130 }, { "epoch": 2.9104878704223975, "grad_norm": 3.4434423446655273, "learning_rate": 8.882134826139695e-07, "loss": 0.0004, "num_input_tokens_seen": 80303720, "step": 119135 }, { "epoch": 2.9106100212542447, "grad_norm": 0.0011810072464868426, "learning_rate": 8.881287399201789e-07, "loss": 0.029, "num_input_tokens_seen": 80306728, "step": 119140 }, { "epoch": 2.910732172086092, "grad_norm": 0.07076526433229446, "learning_rate": 8.880439980399459e-07, "loss": 0.0001, "num_input_tokens_seen": 80309864, "step": 119145 }, { "epoch": 2.910854322917939, "grad_norm": 0.020179910585284233, "learning_rate": 8.879592569738875e-07, "loss": 0.0003, "num_input_tokens_seen": 80313320, "step": 119150 }, { "epoch": 2.9109764737497863, "grad_norm": 0.06361079961061478, "learning_rate": 8.878745167226192e-07, "loss": 0.0001, "num_input_tokens_seen": 80316456, "step": 119155 }, { "epoch": 2.9110986245816335, "grad_norm": 18.111434936523438, "learning_rate": 8.877897772867579e-07, "loss": 0.0462, "num_input_tokens_seen": 80319720, "step": 119160 }, { "epoch": 2.9112207754134807, "grad_norm": 0.03135787695646286, "learning_rate": 8.877050386669191e-07, "loss": 0.069, "num_input_tokens_seen": 80322920, "step": 119165 }, { "epoch": 2.911342926245328, "grad_norm": 0.07542268186807632, "learning_rate": 8.876203008637198e-07, "loss": 0.0012, "num_input_tokens_seen": 80325864, "step": 119170 }, { "epoch": 2.911465077077175, "grad_norm": 0.04872267693281174, "learning_rate": 8.875355638777756e-07, "loss": 0.0837, "num_input_tokens_seen": 80328808, "step": 119175 }, { "epoch": 2.9115872279090222, "grad_norm": 0.04887533187866211, "learning_rate": 8.874508277097033e-07, "loss": 0.0001, "num_input_tokens_seen": 80332200, "step": 119180 }, { "epoch": 2.911709378740869, "grad_norm": 0.006453742738813162, "learning_rate": 8.873660923601187e-07, "loss": 0.0373, "num_input_tokens_seen": 80335528, "step": 119185 }, { "epoch": 2.9118315295727166, "grad_norm": 0.6347325444221497, "learning_rate": 8.872813578296382e-07, "loss": 0.0005, "num_input_tokens_seen": 80339176, "step": 119190 }, { "epoch": 2.9119536804045634, "grad_norm": 0.0005772336153313518, "learning_rate": 8.871966241188781e-07, "loss": 0.0008, "num_input_tokens_seen": 80342440, "step": 119195 }, { "epoch": 2.9120758312364106, "grad_norm": 0.1035570576786995, "learning_rate": 8.871118912284543e-07, "loss": 0.0004, "num_input_tokens_seen": 80345896, "step": 119200 }, { "epoch": 2.9121979820682578, "grad_norm": 0.29160940647125244, "learning_rate": 8.870271591589831e-07, "loss": 0.0496, "num_input_tokens_seen": 80349032, "step": 119205 }, { "epoch": 2.912320132900105, "grad_norm": 16.458744049072266, "learning_rate": 8.869424279110812e-07, "loss": 0.0837, "num_input_tokens_seen": 80352424, "step": 119210 }, { "epoch": 2.912442283731952, "grad_norm": 0.21430669724941254, "learning_rate": 8.86857697485364e-07, "loss": 0.0317, "num_input_tokens_seen": 80355560, "step": 119215 }, { "epoch": 2.9125644345637993, "grad_norm": 17.024089813232422, "learning_rate": 8.867729678824484e-07, "loss": 0.0856, "num_input_tokens_seen": 80359272, "step": 119220 }, { "epoch": 2.9126865853956465, "grad_norm": 45.32529067993164, "learning_rate": 8.866882391029498e-07, "loss": 0.1324, "num_input_tokens_seen": 80362472, "step": 119225 }, { "epoch": 2.9128087362274937, "grad_norm": 0.015839653089642525, "learning_rate": 8.866035111474853e-07, "loss": 0.0467, "num_input_tokens_seen": 80365672, "step": 119230 }, { "epoch": 2.912930887059341, "grad_norm": 0.04247782379388809, "learning_rate": 8.865187840166701e-07, "loss": 0.0007, "num_input_tokens_seen": 80368872, "step": 119235 }, { "epoch": 2.913053037891188, "grad_norm": 0.02549228072166443, "learning_rate": 8.86434057711121e-07, "loss": 0.0001, "num_input_tokens_seen": 80371816, "step": 119240 }, { "epoch": 2.9131751887230353, "grad_norm": 0.02016402594745159, "learning_rate": 8.863493322314543e-07, "loss": 0.1107, "num_input_tokens_seen": 80375144, "step": 119245 }, { "epoch": 2.9132973395548825, "grad_norm": 0.2670440971851349, "learning_rate": 8.862646075782852e-07, "loss": 0.0006, "num_input_tokens_seen": 80378216, "step": 119250 }, { "epoch": 2.9134194903867296, "grad_norm": 0.09133932739496231, "learning_rate": 8.861798837522311e-07, "loss": 0.0002, "num_input_tokens_seen": 80381288, "step": 119255 }, { "epoch": 2.913541641218577, "grad_norm": 0.1471731960773468, "learning_rate": 8.86095160753907e-07, "loss": 0.0004, "num_input_tokens_seen": 80384488, "step": 119260 }, { "epoch": 2.913663792050424, "grad_norm": 0.08419156074523926, "learning_rate": 8.860104385839295e-07, "loss": 0.0001, "num_input_tokens_seen": 80388072, "step": 119265 }, { "epoch": 2.9137859428822708, "grad_norm": 0.18315629661083221, "learning_rate": 8.859257172429153e-07, "loss": 0.0408, "num_input_tokens_seen": 80391400, "step": 119270 }, { "epoch": 2.9139080937141184, "grad_norm": 0.09528139978647232, "learning_rate": 8.858409967314792e-07, "loss": 0.001, "num_input_tokens_seen": 80394472, "step": 119275 }, { "epoch": 2.914030244545965, "grad_norm": 0.030033722519874573, "learning_rate": 8.857562770502389e-07, "loss": 0.0334, "num_input_tokens_seen": 80398568, "step": 119280 }, { "epoch": 2.914152395377813, "grad_norm": 0.01004182081669569, "learning_rate": 8.856715581998091e-07, "loss": 0.0006, "num_input_tokens_seen": 80401640, "step": 119285 }, { "epoch": 2.9142745462096595, "grad_norm": 0.031097983941435814, "learning_rate": 8.855868401808069e-07, "loss": 0.0001, "num_input_tokens_seen": 80405224, "step": 119290 }, { "epoch": 2.9143966970415067, "grad_norm": 0.029696395620703697, "learning_rate": 8.855021229938478e-07, "loss": 0.0001, "num_input_tokens_seen": 80408808, "step": 119295 }, { "epoch": 2.914518847873354, "grad_norm": 0.38885918259620667, "learning_rate": 8.854174066395476e-07, "loss": 0.0503, "num_input_tokens_seen": 80412776, "step": 119300 }, { "epoch": 2.914640998705201, "grad_norm": 19.236915588378906, "learning_rate": 8.853326911185236e-07, "loss": 0.043, "num_input_tokens_seen": 80416488, "step": 119305 }, { "epoch": 2.9147631495370483, "grad_norm": 30.828025817871094, "learning_rate": 8.852479764313905e-07, "loss": 0.1288, "num_input_tokens_seen": 80419752, "step": 119310 }, { "epoch": 2.9148853003688955, "grad_norm": 0.02660001441836357, "learning_rate": 8.851632625787655e-07, "loss": 0.0579, "num_input_tokens_seen": 80422952, "step": 119315 }, { "epoch": 2.9150074512007427, "grad_norm": 93.14715576171875, "learning_rate": 8.850785495612636e-07, "loss": 0.031, "num_input_tokens_seen": 80426216, "step": 119320 }, { "epoch": 2.91512960203259, "grad_norm": 0.06403271853923798, "learning_rate": 8.84993837379502e-07, "loss": 0.0002, "num_input_tokens_seen": 80429544, "step": 119325 }, { "epoch": 2.915251752864437, "grad_norm": 185.7920684814453, "learning_rate": 8.849091260340955e-07, "loss": 0.1621, "num_input_tokens_seen": 80432936, "step": 119330 }, { "epoch": 2.9153739036962842, "grad_norm": 142.79888916015625, "learning_rate": 8.848244155256613e-07, "loss": 0.0385, "num_input_tokens_seen": 80436456, "step": 119335 }, { "epoch": 2.9154960545281314, "grad_norm": 0.07249239832162857, "learning_rate": 8.847397058548146e-07, "loss": 0.0395, "num_input_tokens_seen": 80440296, "step": 119340 }, { "epoch": 2.9156182053599786, "grad_norm": 0.09680524468421936, "learning_rate": 8.846549970221719e-07, "loss": 0.0423, "num_input_tokens_seen": 80443880, "step": 119345 }, { "epoch": 2.915740356191826, "grad_norm": 0.011335140094161034, "learning_rate": 8.845702890283492e-07, "loss": 0.0439, "num_input_tokens_seen": 80447464, "step": 119350 }, { "epoch": 2.9158625070236726, "grad_norm": 0.022819865494966507, "learning_rate": 8.844855818739623e-07, "loss": 0.0005, "num_input_tokens_seen": 80451496, "step": 119355 }, { "epoch": 2.91598465785552, "grad_norm": 0.002536680083721876, "learning_rate": 8.844008755596271e-07, "loss": 0.0001, "num_input_tokens_seen": 80454888, "step": 119360 }, { "epoch": 2.916106808687367, "grad_norm": 0.019164428114891052, "learning_rate": 8.843161700859602e-07, "loss": 0.0004, "num_input_tokens_seen": 80458280, "step": 119365 }, { "epoch": 2.9162289595192146, "grad_norm": 0.04097279906272888, "learning_rate": 8.842314654535769e-07, "loss": 0.0002, "num_input_tokens_seen": 80461352, "step": 119370 }, { "epoch": 2.9163511103510613, "grad_norm": 0.015141436830163002, "learning_rate": 8.841467616630939e-07, "loss": 0.0729, "num_input_tokens_seen": 80464936, "step": 119375 }, { "epoch": 2.9164732611829085, "grad_norm": 0.08130350708961487, "learning_rate": 8.840620587151264e-07, "loss": 0.0541, "num_input_tokens_seen": 80468200, "step": 119380 }, { "epoch": 2.9165954120147557, "grad_norm": 19.866153717041016, "learning_rate": 8.839773566102912e-07, "loss": 0.0778, "num_input_tokens_seen": 80471784, "step": 119385 }, { "epoch": 2.916717562846603, "grad_norm": 0.0013215701328590512, "learning_rate": 8.838926553492035e-07, "loss": 0.0001, "num_input_tokens_seen": 80475560, "step": 119390 }, { "epoch": 2.91683971367845, "grad_norm": 0.014515292830765247, "learning_rate": 8.838079549324797e-07, "loss": 0.0434, "num_input_tokens_seen": 80478440, "step": 119395 }, { "epoch": 2.9169618645102973, "grad_norm": 0.00209752912633121, "learning_rate": 8.837232553607361e-07, "loss": 0.1949, "num_input_tokens_seen": 80482152, "step": 119400 }, { "epoch": 2.9170840153421445, "grad_norm": 0.349534273147583, "learning_rate": 8.836385566345878e-07, "loss": 0.0005, "num_input_tokens_seen": 80485288, "step": 119405 }, { "epoch": 2.9172061661739916, "grad_norm": 0.04242877662181854, "learning_rate": 8.835538587546515e-07, "loss": 0.0007, "num_input_tokens_seen": 80488424, "step": 119410 }, { "epoch": 2.917328317005839, "grad_norm": 0.23315057158470154, "learning_rate": 8.834691617215425e-07, "loss": 0.0004, "num_input_tokens_seen": 80491880, "step": 119415 }, { "epoch": 2.917450467837686, "grad_norm": 0.08955276757478714, "learning_rate": 8.833844655358772e-07, "loss": 0.0002, "num_input_tokens_seen": 80495464, "step": 119420 }, { "epoch": 2.917572618669533, "grad_norm": 0.05814218521118164, "learning_rate": 8.832997701982718e-07, "loss": 0.0002, "num_input_tokens_seen": 80499176, "step": 119425 }, { "epoch": 2.9176947695013804, "grad_norm": 0.07658043503761292, "learning_rate": 8.832150757093414e-07, "loss": 0.038, "num_input_tokens_seen": 80502440, "step": 119430 }, { "epoch": 2.9178169203332276, "grad_norm": 0.33819350600242615, "learning_rate": 8.831303820697028e-07, "loss": 0.0007, "num_input_tokens_seen": 80505896, "step": 119435 }, { "epoch": 2.917939071165075, "grad_norm": 0.06955944001674652, "learning_rate": 8.830456892799712e-07, "loss": 0.0727, "num_input_tokens_seen": 80509608, "step": 119440 }, { "epoch": 2.918061221996922, "grad_norm": 0.09534032642841339, "learning_rate": 8.82960997340763e-07, "loss": 0.0004, "num_input_tokens_seen": 80513064, "step": 119445 }, { "epoch": 2.9181833728287687, "grad_norm": 0.06615091860294342, "learning_rate": 8.828763062526938e-07, "loss": 0.0613, "num_input_tokens_seen": 80516200, "step": 119450 }, { "epoch": 2.9183055236606164, "grad_norm": 0.04451657086610794, "learning_rate": 8.827916160163794e-07, "loss": 0.0504, "num_input_tokens_seen": 80519464, "step": 119455 }, { "epoch": 2.918427674492463, "grad_norm": 0.133866086602211, "learning_rate": 8.827069266324364e-07, "loss": 0.0493, "num_input_tokens_seen": 80522856, "step": 119460 }, { "epoch": 2.9185498253243107, "grad_norm": 26.166975021362305, "learning_rate": 8.826222381014796e-07, "loss": 0.0707, "num_input_tokens_seen": 80526184, "step": 119465 }, { "epoch": 2.9186719761561575, "grad_norm": 0.007928439415991306, "learning_rate": 8.82537550424126e-07, "loss": 0.0001, "num_input_tokens_seen": 80529384, "step": 119470 }, { "epoch": 2.9187941269880047, "grad_norm": 0.13122475147247314, "learning_rate": 8.824528636009904e-07, "loss": 0.0003, "num_input_tokens_seen": 80532904, "step": 119475 }, { "epoch": 2.918916277819852, "grad_norm": 0.03710798919200897, "learning_rate": 8.823681776326898e-07, "loss": 0.0424, "num_input_tokens_seen": 80536232, "step": 119480 }, { "epoch": 2.919038428651699, "grad_norm": 0.006326170172542334, "learning_rate": 8.822834925198389e-07, "loss": 0.0442, "num_input_tokens_seen": 80539432, "step": 119485 }, { "epoch": 2.9191605794835462, "grad_norm": 0.0011891796020790935, "learning_rate": 8.82198808263054e-07, "loss": 0.0366, "num_input_tokens_seen": 80542952, "step": 119490 }, { "epoch": 2.9192827303153934, "grad_norm": 13.90089225769043, "learning_rate": 8.821141248629516e-07, "loss": 0.1097, "num_input_tokens_seen": 80546280, "step": 119495 }, { "epoch": 2.9194048811472406, "grad_norm": 0.0893154889345169, "learning_rate": 8.820294423201469e-07, "loss": 0.0007, "num_input_tokens_seen": 80549608, "step": 119500 }, { "epoch": 2.919527031979088, "grad_norm": 0.6648651361465454, "learning_rate": 8.819447606352557e-07, "loss": 0.0007, "num_input_tokens_seen": 80552488, "step": 119505 }, { "epoch": 2.919649182810935, "grad_norm": 0.14524568617343903, "learning_rate": 8.818600798088939e-07, "loss": 0.0443, "num_input_tokens_seen": 80555880, "step": 119510 }, { "epoch": 2.919771333642782, "grad_norm": 0.07950985431671143, "learning_rate": 8.817753998416772e-07, "loss": 0.0484, "num_input_tokens_seen": 80559208, "step": 119515 }, { "epoch": 2.9198934844746294, "grad_norm": 0.06964538246393204, "learning_rate": 8.81690720734222e-07, "loss": 0.0005, "num_input_tokens_seen": 80562472, "step": 119520 }, { "epoch": 2.9200156353064766, "grad_norm": 0.13740044832229614, "learning_rate": 8.816060424871433e-07, "loss": 0.0001, "num_input_tokens_seen": 80565864, "step": 119525 }, { "epoch": 2.9201377861383238, "grad_norm": 0.11162356287240982, "learning_rate": 8.815213651010578e-07, "loss": 0.0344, "num_input_tokens_seen": 80569896, "step": 119530 }, { "epoch": 2.9202599369701705, "grad_norm": 0.15913273394107819, "learning_rate": 8.814366885765802e-07, "loss": 0.0379, "num_input_tokens_seen": 80573160, "step": 119535 }, { "epoch": 2.920382087802018, "grad_norm": 0.013078569434583187, "learning_rate": 8.813520129143275e-07, "loss": 0.0001, "num_input_tokens_seen": 80576360, "step": 119540 }, { "epoch": 2.920504238633865, "grad_norm": 0.2981469929218292, "learning_rate": 8.812673381149143e-07, "loss": 0.1346, "num_input_tokens_seen": 80579560, "step": 119545 }, { "epoch": 2.9206263894657125, "grad_norm": 0.20622022449970245, "learning_rate": 8.811826641789572e-07, "loss": 0.0011, "num_input_tokens_seen": 80582824, "step": 119550 }, { "epoch": 2.9207485402975593, "grad_norm": 26.90651512145996, "learning_rate": 8.810979911070721e-07, "loss": 0.0475, "num_input_tokens_seen": 80586088, "step": 119555 }, { "epoch": 2.9208706911294064, "grad_norm": 14.657966613769531, "learning_rate": 8.810133188998739e-07, "loss": 0.0415, "num_input_tokens_seen": 80589352, "step": 119560 }, { "epoch": 2.9209928419612536, "grad_norm": 0.016939891502261162, "learning_rate": 8.809286475579791e-07, "loss": 0.0768, "num_input_tokens_seen": 80592424, "step": 119565 }, { "epoch": 2.921114992793101, "grad_norm": 0.1867605596780777, "learning_rate": 8.808439770820028e-07, "loss": 0.0411, "num_input_tokens_seen": 80595560, "step": 119570 }, { "epoch": 2.921237143624948, "grad_norm": 59.19410705566406, "learning_rate": 8.807593074725618e-07, "loss": 0.038, "num_input_tokens_seen": 80598760, "step": 119575 }, { "epoch": 2.921359294456795, "grad_norm": 0.22791574895381927, "learning_rate": 8.806746387302706e-07, "loss": 0.0347, "num_input_tokens_seen": 80602088, "step": 119580 }, { "epoch": 2.9214814452886424, "grad_norm": 22.170124053955078, "learning_rate": 8.805899708557454e-07, "loss": 0.0412, "num_input_tokens_seen": 80605608, "step": 119585 }, { "epoch": 2.9216035961204896, "grad_norm": 224.7540283203125, "learning_rate": 8.805053038496028e-07, "loss": 0.017, "num_input_tokens_seen": 80608680, "step": 119590 }, { "epoch": 2.9217257469523368, "grad_norm": 0.733668863773346, "learning_rate": 8.804206377124571e-07, "loss": 0.0006, "num_input_tokens_seen": 80612264, "step": 119595 }, { "epoch": 2.921847897784184, "grad_norm": 0.04018937796354294, "learning_rate": 8.803359724449252e-07, "loss": 0.0524, "num_input_tokens_seen": 80615784, "step": 119600 }, { "epoch": 2.921970048616031, "grad_norm": 0.4537980556488037, "learning_rate": 8.80251308047622e-07, "loss": 0.0002, "num_input_tokens_seen": 80618664, "step": 119605 }, { "epoch": 2.9220921994478783, "grad_norm": 0.2348211258649826, "learning_rate": 8.801666445211634e-07, "loss": 0.0706, "num_input_tokens_seen": 80621480, "step": 119610 }, { "epoch": 2.9222143502797255, "grad_norm": 0.01142832450568676, "learning_rate": 8.800819818661655e-07, "loss": 0.0004, "num_input_tokens_seen": 80625000, "step": 119615 }, { "epoch": 2.9223365011115727, "grad_norm": 0.31771257519721985, "learning_rate": 8.799973200832434e-07, "loss": 0.0479, "num_input_tokens_seen": 80627944, "step": 119620 }, { "epoch": 2.92245865194342, "grad_norm": 0.05220457911491394, "learning_rate": 8.799126591730134e-07, "loss": 0.0536, "num_input_tokens_seen": 80631528, "step": 119625 }, { "epoch": 2.9225808027752667, "grad_norm": 0.051249198615550995, "learning_rate": 8.798279991360904e-07, "loss": 0.0004, "num_input_tokens_seen": 80634600, "step": 119630 }, { "epoch": 2.9227029536071143, "grad_norm": 0.6016253232955933, "learning_rate": 8.797433399730909e-07, "loss": 0.0009, "num_input_tokens_seen": 80637736, "step": 119635 }, { "epoch": 2.922825104438961, "grad_norm": 0.05893409997224808, "learning_rate": 8.796586816846299e-07, "loss": 0.0002, "num_input_tokens_seen": 80640936, "step": 119640 }, { "epoch": 2.9229472552708082, "grad_norm": 0.14922615885734558, "learning_rate": 8.795740242713232e-07, "loss": 0.0851, "num_input_tokens_seen": 80644520, "step": 119645 }, { "epoch": 2.9230694061026554, "grad_norm": 0.3645789921283722, "learning_rate": 8.794893677337872e-07, "loss": 0.0007, "num_input_tokens_seen": 80647848, "step": 119650 }, { "epoch": 2.9231915569345026, "grad_norm": 0.025446340441703796, "learning_rate": 8.794047120726364e-07, "loss": 0.085, "num_input_tokens_seen": 80651048, "step": 119655 }, { "epoch": 2.92331370776635, "grad_norm": 0.23881550133228302, "learning_rate": 8.793200572884873e-07, "loss": 0.0325, "num_input_tokens_seen": 80654696, "step": 119660 }, { "epoch": 2.923435858598197, "grad_norm": 0.03703530132770538, "learning_rate": 8.792354033819549e-07, "loss": 0.0006, "num_input_tokens_seen": 80657832, "step": 119665 }, { "epoch": 2.923558009430044, "grad_norm": 0.11316487938165665, "learning_rate": 8.79150750353655e-07, "loss": 0.0542, "num_input_tokens_seen": 80661032, "step": 119670 }, { "epoch": 2.9236801602618914, "grad_norm": 0.0426037423312664, "learning_rate": 8.79066098204204e-07, "loss": 0.0402, "num_input_tokens_seen": 80664872, "step": 119675 }, { "epoch": 2.9238023110937386, "grad_norm": 0.36164647340774536, "learning_rate": 8.789814469342161e-07, "loss": 0.0004, "num_input_tokens_seen": 80668136, "step": 119680 }, { "epoch": 2.9239244619255857, "grad_norm": 1.571184515953064, "learning_rate": 8.788967965443083e-07, "loss": 0.0003, "num_input_tokens_seen": 80671272, "step": 119685 }, { "epoch": 2.924046612757433, "grad_norm": 28.044828414916992, "learning_rate": 8.788121470350952e-07, "loss": 0.0599, "num_input_tokens_seen": 80674472, "step": 119690 }, { "epoch": 2.92416876358928, "grad_norm": 0.016553180292248726, "learning_rate": 8.78727498407193e-07, "loss": 0.0005, "num_input_tokens_seen": 80677864, "step": 119695 }, { "epoch": 2.9242909144211273, "grad_norm": 0.01405636128038168, "learning_rate": 8.786428506612168e-07, "loss": 0.0001, "num_input_tokens_seen": 80681320, "step": 119700 }, { "epoch": 2.9244130652529745, "grad_norm": 0.007759598549455404, "learning_rate": 8.785582037977826e-07, "loss": 0.0456, "num_input_tokens_seen": 80684136, "step": 119705 }, { "epoch": 2.9245352160848217, "grad_norm": 46.18053436279297, "learning_rate": 8.78473557817506e-07, "loss": 0.079, "num_input_tokens_seen": 80687848, "step": 119710 }, { "epoch": 2.9246573669166684, "grad_norm": 0.003461407730355859, "learning_rate": 8.783889127210019e-07, "loss": 0.0381, "num_input_tokens_seen": 80692008, "step": 119715 }, { "epoch": 2.924779517748516, "grad_norm": 36.74126052856445, "learning_rate": 8.783042685088869e-07, "loss": 0.0368, "num_input_tokens_seen": 80695464, "step": 119720 }, { "epoch": 2.924901668580363, "grad_norm": 0.4897412955760956, "learning_rate": 8.782196251817754e-07, "loss": 0.0003, "num_input_tokens_seen": 80698856, "step": 119725 }, { "epoch": 2.9250238194122105, "grad_norm": 0.06261391192674637, "learning_rate": 8.78134982740284e-07, "loss": 0.0534, "num_input_tokens_seen": 80702440, "step": 119730 }, { "epoch": 2.925145970244057, "grad_norm": 0.012732506729662418, "learning_rate": 8.780503411850273e-07, "loss": 0.0969, "num_input_tokens_seen": 80705832, "step": 119735 }, { "epoch": 2.9252681210759044, "grad_norm": 0.03934415802359581, "learning_rate": 8.779657005166215e-07, "loss": 0.0409, "num_input_tokens_seen": 80709096, "step": 119740 }, { "epoch": 2.9253902719077516, "grad_norm": 0.05687893554568291, "learning_rate": 8.778810607356822e-07, "loss": 0.0003, "num_input_tokens_seen": 80712552, "step": 119745 }, { "epoch": 2.9255124227395988, "grad_norm": 0.0480547770857811, "learning_rate": 8.777964218428243e-07, "loss": 0.0006, "num_input_tokens_seen": 80716200, "step": 119750 }, { "epoch": 2.925634573571446, "grad_norm": 0.0338495597243309, "learning_rate": 8.777117838386642e-07, "loss": 0.0347, "num_input_tokens_seen": 80719208, "step": 119755 }, { "epoch": 2.925756724403293, "grad_norm": 0.23030585050582886, "learning_rate": 8.776271467238166e-07, "loss": 0.0806, "num_input_tokens_seen": 80722344, "step": 119760 }, { "epoch": 2.9258788752351403, "grad_norm": 0.025462327525019646, "learning_rate": 8.775425104988971e-07, "loss": 0.0008, "num_input_tokens_seen": 80726184, "step": 119765 }, { "epoch": 2.9260010260669875, "grad_norm": 0.3015349805355072, "learning_rate": 8.774578751645219e-07, "loss": 0.0004, "num_input_tokens_seen": 80729384, "step": 119770 }, { "epoch": 2.9261231768988347, "grad_norm": 18.960887908935547, "learning_rate": 8.773732407213055e-07, "loss": 0.0429, "num_input_tokens_seen": 80732712, "step": 119775 }, { "epoch": 2.926245327730682, "grad_norm": 0.013392589055001736, "learning_rate": 8.772886071698643e-07, "loss": 0.1408, "num_input_tokens_seen": 80735848, "step": 119780 }, { "epoch": 2.926367478562529, "grad_norm": 0.004601533990353346, "learning_rate": 8.772039745108129e-07, "loss": 0.1689, "num_input_tokens_seen": 80739112, "step": 119785 }, { "epoch": 2.9264896293943763, "grad_norm": 0.014628436416387558, "learning_rate": 8.771193427447677e-07, "loss": 0.0001, "num_input_tokens_seen": 80742184, "step": 119790 }, { "epoch": 2.9266117802262235, "grad_norm": 63.195289611816406, "learning_rate": 8.770347118723433e-07, "loss": 0.0205, "num_input_tokens_seen": 80745704, "step": 119795 }, { "epoch": 2.9267339310580707, "grad_norm": 0.010293773375451565, "learning_rate": 8.769500818941555e-07, "loss": 0.0481, "num_input_tokens_seen": 80749096, "step": 119800 }, { "epoch": 2.926856081889918, "grad_norm": 0.019898995757102966, "learning_rate": 8.768654528108202e-07, "loss": 0.0003, "num_input_tokens_seen": 80753064, "step": 119805 }, { "epoch": 2.9269782327217646, "grad_norm": 0.0067049735225737095, "learning_rate": 8.767808246229523e-07, "loss": 0.0007, "num_input_tokens_seen": 80756328, "step": 119810 }, { "epoch": 2.9271003835536122, "grad_norm": 0.0841495618224144, "learning_rate": 8.766961973311674e-07, "loss": 0.0002, "num_input_tokens_seen": 80759592, "step": 119815 }, { "epoch": 2.927222534385459, "grad_norm": 0.12519319355487823, "learning_rate": 8.766115709360808e-07, "loss": 0.0002, "num_input_tokens_seen": 80762664, "step": 119820 }, { "epoch": 2.927344685217306, "grad_norm": 21.410520553588867, "learning_rate": 8.76526945438308e-07, "loss": 0.0503, "num_input_tokens_seen": 80766056, "step": 119825 }, { "epoch": 2.9274668360491534, "grad_norm": 0.0299088042229414, "learning_rate": 8.764423208384647e-07, "loss": 0.0013, "num_input_tokens_seen": 80769256, "step": 119830 }, { "epoch": 2.9275889868810006, "grad_norm": 0.353015273809433, "learning_rate": 8.763576971371658e-07, "loss": 0.0003, "num_input_tokens_seen": 80772712, "step": 119835 }, { "epoch": 2.9277111377128477, "grad_norm": 0.7852094769477844, "learning_rate": 8.762730743350273e-07, "loss": 0.0003, "num_input_tokens_seen": 80776232, "step": 119840 }, { "epoch": 2.927833288544695, "grad_norm": 0.002658928046002984, "learning_rate": 8.761884524326639e-07, "loss": 0.0004, "num_input_tokens_seen": 80779432, "step": 119845 }, { "epoch": 2.927955439376542, "grad_norm": 0.039048757404088974, "learning_rate": 8.761038314306918e-07, "loss": 0.0, "num_input_tokens_seen": 80782696, "step": 119850 }, { "epoch": 2.9280775902083893, "grad_norm": 23.21377182006836, "learning_rate": 8.760192113297255e-07, "loss": 0.0748, "num_input_tokens_seen": 80786408, "step": 119855 }, { "epoch": 2.9281997410402365, "grad_norm": 0.22337211668491364, "learning_rate": 8.759345921303811e-07, "loss": 0.0003, "num_input_tokens_seen": 80789416, "step": 119860 }, { "epoch": 2.9283218918720837, "grad_norm": 0.001950685866177082, "learning_rate": 8.75849973833274e-07, "loss": 0.0001, "num_input_tokens_seen": 80792424, "step": 119865 }, { "epoch": 2.928444042703931, "grad_norm": 0.011079180054366589, "learning_rate": 8.757653564390187e-07, "loss": 0.0918, "num_input_tokens_seen": 80795496, "step": 119870 }, { "epoch": 2.928566193535778, "grad_norm": 0.01707855612039566, "learning_rate": 8.756807399482316e-07, "loss": 0.0604, "num_input_tokens_seen": 80798952, "step": 119875 }, { "epoch": 2.9286883443676253, "grad_norm": 0.010524839162826538, "learning_rate": 8.755961243615273e-07, "loss": 0.0002, "num_input_tokens_seen": 80802408, "step": 119880 }, { "epoch": 2.9288104951994725, "grad_norm": 0.025105202570557594, "learning_rate": 8.755115096795218e-07, "loss": 0.0001, "num_input_tokens_seen": 80805736, "step": 119885 }, { "epoch": 2.9289326460313196, "grad_norm": 0.08292187005281448, "learning_rate": 8.754268959028297e-07, "loss": 0.0742, "num_input_tokens_seen": 80809064, "step": 119890 }, { "epoch": 2.9290547968631664, "grad_norm": 0.06168559938669205, "learning_rate": 8.753422830320666e-07, "loss": 0.0888, "num_input_tokens_seen": 80812200, "step": 119895 }, { "epoch": 2.929176947695014, "grad_norm": 0.013893499039113522, "learning_rate": 8.752576710678484e-07, "loss": 0.0339, "num_input_tokens_seen": 80815592, "step": 119900 }, { "epoch": 2.9292990985268608, "grad_norm": 0.019977891817688942, "learning_rate": 8.751730600107896e-07, "loss": 0.0002, "num_input_tokens_seen": 80818792, "step": 119905 }, { "epoch": 2.9294212493587084, "grad_norm": 296.9643859863281, "learning_rate": 8.750884498615063e-07, "loss": 0.1176, "num_input_tokens_seen": 80822248, "step": 119910 }, { "epoch": 2.929543400190555, "grad_norm": 0.3068140149116516, "learning_rate": 8.750038406206131e-07, "loss": 0.0195, "num_input_tokens_seen": 80825512, "step": 119915 }, { "epoch": 2.9296655510224023, "grad_norm": 0.07228963822126389, "learning_rate": 8.749192322887255e-07, "loss": 0.0833, "num_input_tokens_seen": 80828392, "step": 119920 }, { "epoch": 2.9297877018542495, "grad_norm": 0.2793293595314026, "learning_rate": 8.748346248664593e-07, "loss": 0.064, "num_input_tokens_seen": 80831656, "step": 119925 }, { "epoch": 2.9299098526860967, "grad_norm": 0.023717185482382774, "learning_rate": 8.74750018354429e-07, "loss": 0.0004, "num_input_tokens_seen": 80835048, "step": 119930 }, { "epoch": 2.930032003517944, "grad_norm": 1083.84228515625, "learning_rate": 8.746654127532505e-07, "loss": 0.0185, "num_input_tokens_seen": 80838184, "step": 119935 }, { "epoch": 2.930154154349791, "grad_norm": 0.176435187458992, "learning_rate": 8.745808080635385e-07, "loss": 0.0455, "num_input_tokens_seen": 80841128, "step": 119940 }, { "epoch": 2.9302763051816383, "grad_norm": 0.1183871254324913, "learning_rate": 8.744962042859089e-07, "loss": 0.0456, "num_input_tokens_seen": 80844456, "step": 119945 }, { "epoch": 2.9303984560134855, "grad_norm": 0.006031445227563381, "learning_rate": 8.744116014209763e-07, "loss": 0.0452, "num_input_tokens_seen": 80847848, "step": 119950 }, { "epoch": 2.9305206068453327, "grad_norm": 0.01688563823699951, "learning_rate": 8.743269994693565e-07, "loss": 0.0002, "num_input_tokens_seen": 80851048, "step": 119955 }, { "epoch": 2.93064275767718, "grad_norm": 0.010305249132215977, "learning_rate": 8.742423984316648e-07, "loss": 0.0404, "num_input_tokens_seen": 80853992, "step": 119960 }, { "epoch": 2.930764908509027, "grad_norm": 0.006608907133340836, "learning_rate": 8.741577983085161e-07, "loss": 0.0002, "num_input_tokens_seen": 80858088, "step": 119965 }, { "epoch": 2.9308870593408742, "grad_norm": 0.041544102132320404, "learning_rate": 8.740731991005257e-07, "loss": 0.0668, "num_input_tokens_seen": 80861480, "step": 119970 }, { "epoch": 2.9310092101727214, "grad_norm": 0.04633386433124542, "learning_rate": 8.739886008083088e-07, "loss": 0.041, "num_input_tokens_seen": 80864808, "step": 119975 }, { "epoch": 2.931131361004568, "grad_norm": 0.048316024243831635, "learning_rate": 8.739040034324805e-07, "loss": 0.0002, "num_input_tokens_seen": 80868328, "step": 119980 }, { "epoch": 2.931253511836416, "grad_norm": 0.004913606680929661, "learning_rate": 8.738194069736566e-07, "loss": 0.0014, "num_input_tokens_seen": 80871656, "step": 119985 }, { "epoch": 2.9313756626682625, "grad_norm": 0.6599655151367188, "learning_rate": 8.737348114324516e-07, "loss": 0.0004, "num_input_tokens_seen": 80874984, "step": 119990 }, { "epoch": 2.93149781350011, "grad_norm": 0.007945040240883827, "learning_rate": 8.736502168094814e-07, "loss": 0.0002, "num_input_tokens_seen": 80877864, "step": 119995 }, { "epoch": 2.931619964331957, "grad_norm": 0.010767627507448196, "learning_rate": 8.735656231053603e-07, "loss": 0.0002, "num_input_tokens_seen": 80881576, "step": 120000 }, { "epoch": 2.931742115163804, "grad_norm": 0.12437177449464798, "learning_rate": 8.734810303207046e-07, "loss": 0.0001, "num_input_tokens_seen": 80885160, "step": 120005 }, { "epoch": 2.9318642659956513, "grad_norm": 0.032584793865680695, "learning_rate": 8.733964384561282e-07, "loss": 0.0001, "num_input_tokens_seen": 80888616, "step": 120010 }, { "epoch": 2.9319864168274985, "grad_norm": 16.456106185913086, "learning_rate": 8.733118475122473e-07, "loss": 0.0528, "num_input_tokens_seen": 80892200, "step": 120015 }, { "epoch": 2.9321085676593457, "grad_norm": 0.02670753188431263, "learning_rate": 8.732272574896769e-07, "loss": 0.0002, "num_input_tokens_seen": 80895400, "step": 120020 }, { "epoch": 2.932230718491193, "grad_norm": 0.4635147154331207, "learning_rate": 8.731426683890315e-07, "loss": 0.0595, "num_input_tokens_seen": 80898344, "step": 120025 }, { "epoch": 2.93235286932304, "grad_norm": 0.0504097044467926, "learning_rate": 8.730580802109273e-07, "loss": 0.0001, "num_input_tokens_seen": 80901992, "step": 120030 }, { "epoch": 2.9324750201548873, "grad_norm": 18.078235626220703, "learning_rate": 8.729734929559785e-07, "loss": 0.081, "num_input_tokens_seen": 80905576, "step": 120035 }, { "epoch": 2.9325971709867344, "grad_norm": 0.055162858217954636, "learning_rate": 8.728889066248009e-07, "loss": 0.0757, "num_input_tokens_seen": 80908904, "step": 120040 }, { "epoch": 2.9327193218185816, "grad_norm": 0.0030497191473841667, "learning_rate": 8.72804321218009e-07, "loss": 0.0523, "num_input_tokens_seen": 80912040, "step": 120045 }, { "epoch": 2.932841472650429, "grad_norm": 0.20927371084690094, "learning_rate": 8.727197367362182e-07, "loss": 0.0001, "num_input_tokens_seen": 80915816, "step": 120050 }, { "epoch": 2.932963623482276, "grad_norm": 21.464340209960938, "learning_rate": 8.726351531800442e-07, "loss": 0.0552, "num_input_tokens_seen": 80919592, "step": 120055 }, { "epoch": 2.933085774314123, "grad_norm": 14.764019966125488, "learning_rate": 8.725505705501012e-07, "loss": 0.0407, "num_input_tokens_seen": 80923112, "step": 120060 }, { "epoch": 2.9332079251459704, "grad_norm": 0.012927616015076637, "learning_rate": 8.724659888470052e-07, "loss": 0.1252, "num_input_tokens_seen": 80926696, "step": 120065 }, { "epoch": 2.9333300759778176, "grad_norm": 65.81498718261719, "learning_rate": 8.723814080713705e-07, "loss": 0.1547, "num_input_tokens_seen": 80929896, "step": 120070 }, { "epoch": 2.9334522268096643, "grad_norm": 0.012749478220939636, "learning_rate": 8.722968282238124e-07, "loss": 0.0726, "num_input_tokens_seen": 80933608, "step": 120075 }, { "epoch": 2.933574377641512, "grad_norm": 9.03498363494873, "learning_rate": 8.722122493049465e-07, "loss": 0.0006, "num_input_tokens_seen": 80937320, "step": 120080 }, { "epoch": 2.9336965284733587, "grad_norm": 0.04102368652820587, "learning_rate": 8.721276713153871e-07, "loss": 0.0139, "num_input_tokens_seen": 80940264, "step": 120085 }, { "epoch": 2.9338186793052063, "grad_norm": 0.007833786308765411, "learning_rate": 8.720430942557502e-07, "loss": 0.0001, "num_input_tokens_seen": 80943784, "step": 120090 }, { "epoch": 2.933940830137053, "grad_norm": 0.055653203278779984, "learning_rate": 8.719585181266498e-07, "loss": 0.0395, "num_input_tokens_seen": 80946728, "step": 120095 }, { "epoch": 2.9340629809689003, "grad_norm": 0.4759756326675415, "learning_rate": 8.718739429287018e-07, "loss": 0.0009, "num_input_tokens_seen": 80950312, "step": 120100 }, { "epoch": 2.9341851318007475, "grad_norm": 0.05956454947590828, "learning_rate": 8.717893686625206e-07, "loss": 0.0001, "num_input_tokens_seen": 80953256, "step": 120105 }, { "epoch": 2.9343072826325947, "grad_norm": 0.041279036551713943, "learning_rate": 8.717047953287217e-07, "loss": 0.0003, "num_input_tokens_seen": 80956392, "step": 120110 }, { "epoch": 2.934429433464442, "grad_norm": 0.16914759576320648, "learning_rate": 8.716202229279204e-07, "loss": 0.0006, "num_input_tokens_seen": 80959464, "step": 120115 }, { "epoch": 2.934551584296289, "grad_norm": 33.73630905151367, "learning_rate": 8.715356514607312e-07, "loss": 0.1412, "num_input_tokens_seen": 80962984, "step": 120120 }, { "epoch": 2.9346737351281362, "grad_norm": 30.533674240112305, "learning_rate": 8.714510809277692e-07, "loss": 0.0863, "num_input_tokens_seen": 80966568, "step": 120125 }, { "epoch": 2.9347958859599834, "grad_norm": 0.09490475803613663, "learning_rate": 8.713665113296495e-07, "loss": 0.0521, "num_input_tokens_seen": 80969960, "step": 120130 }, { "epoch": 2.9349180367918306, "grad_norm": 0.041476789861917496, "learning_rate": 8.71281942666987e-07, "loss": 0.0003, "num_input_tokens_seen": 80973544, "step": 120135 }, { "epoch": 2.935040187623678, "grad_norm": 0.12009302526712418, "learning_rate": 8.711973749403974e-07, "loss": 0.0001, "num_input_tokens_seen": 80976936, "step": 120140 }, { "epoch": 2.935162338455525, "grad_norm": 0.02471102401614189, "learning_rate": 8.711128081504945e-07, "loss": 0.0008, "num_input_tokens_seen": 80980392, "step": 120145 }, { "epoch": 2.935284489287372, "grad_norm": 0.21336673200130463, "learning_rate": 8.710282422978942e-07, "loss": 0.0004, "num_input_tokens_seen": 80983464, "step": 120150 }, { "epoch": 2.9354066401192194, "grad_norm": 0.035361818969249725, "learning_rate": 8.709436773832111e-07, "loss": 0.0001, "num_input_tokens_seen": 80987112, "step": 120155 }, { "epoch": 2.935528790951066, "grad_norm": 0.33538514375686646, "learning_rate": 8.708591134070607e-07, "loss": 0.0004, "num_input_tokens_seen": 80990440, "step": 120160 }, { "epoch": 2.9356509417829137, "grad_norm": 0.006556864362210035, "learning_rate": 8.707745503700569e-07, "loss": 0.0001, "num_input_tokens_seen": 80993768, "step": 120165 }, { "epoch": 2.9357730926147605, "grad_norm": 0.0718025341629982, "learning_rate": 8.706899882728157e-07, "loss": 0.0545, "num_input_tokens_seen": 80997032, "step": 120170 }, { "epoch": 2.935895243446608, "grad_norm": 0.013668089173734188, "learning_rate": 8.70605427115952e-07, "loss": 0.0672, "num_input_tokens_seen": 81000744, "step": 120175 }, { "epoch": 2.936017394278455, "grad_norm": 0.03084360808134079, "learning_rate": 8.705208669000798e-07, "loss": 0.0002, "num_input_tokens_seen": 81003624, "step": 120180 }, { "epoch": 2.936139545110302, "grad_norm": 0.06931090354919434, "learning_rate": 8.704363076258152e-07, "loss": 0.0415, "num_input_tokens_seen": 81007144, "step": 120185 }, { "epoch": 2.9362616959421493, "grad_norm": 0.23784157633781433, "learning_rate": 8.703517492937721e-07, "loss": 0.0488, "num_input_tokens_seen": 81010536, "step": 120190 }, { "epoch": 2.9363838467739964, "grad_norm": 0.10758914798498154, "learning_rate": 8.702671919045665e-07, "loss": 0.0002, "num_input_tokens_seen": 81014120, "step": 120195 }, { "epoch": 2.9365059976058436, "grad_norm": 0.03329499065876007, "learning_rate": 8.701826354588123e-07, "loss": 0.0001, "num_input_tokens_seen": 81017128, "step": 120200 }, { "epoch": 2.936628148437691, "grad_norm": 0.023509986698627472, "learning_rate": 8.70098079957125e-07, "loss": 0.0001, "num_input_tokens_seen": 81020392, "step": 120205 }, { "epoch": 2.936750299269538, "grad_norm": 0.06350888311862946, "learning_rate": 8.700135254001197e-07, "loss": 0.0936, "num_input_tokens_seen": 81023400, "step": 120210 }, { "epoch": 2.936872450101385, "grad_norm": 0.16623754799365997, "learning_rate": 8.699289717884106e-07, "loss": 0.0829, "num_input_tokens_seen": 81026920, "step": 120215 }, { "epoch": 2.9369946009332324, "grad_norm": 0.2236860990524292, "learning_rate": 8.698444191226134e-07, "loss": 0.0003, "num_input_tokens_seen": 81030504, "step": 120220 }, { "epoch": 2.9371167517650796, "grad_norm": 0.21353180706501007, "learning_rate": 8.697598674033424e-07, "loss": 0.0006, "num_input_tokens_seen": 81033832, "step": 120225 }, { "epoch": 2.9372389025969268, "grad_norm": 0.1755683422088623, "learning_rate": 8.696753166312125e-07, "loss": 0.0007, "num_input_tokens_seen": 81037160, "step": 120230 }, { "epoch": 2.937361053428774, "grad_norm": 1.234663486480713, "learning_rate": 8.695907668068392e-07, "loss": 0.0003, "num_input_tokens_seen": 81040680, "step": 120235 }, { "epoch": 2.937483204260621, "grad_norm": 43.08966827392578, "learning_rate": 8.695062179308365e-07, "loss": 0.0676, "num_input_tokens_seen": 81044264, "step": 120240 }, { "epoch": 2.9376053550924683, "grad_norm": 0.07131534814834595, "learning_rate": 8.694216700038199e-07, "loss": 0.0002, "num_input_tokens_seen": 81047144, "step": 120245 }, { "epoch": 2.9377275059243155, "grad_norm": 85.09132385253906, "learning_rate": 8.693371230264038e-07, "loss": 0.0415, "num_input_tokens_seen": 81050792, "step": 120250 }, { "epoch": 2.9378496567561623, "grad_norm": 0.09336002171039581, "learning_rate": 8.692525769992037e-07, "loss": 0.0001, "num_input_tokens_seen": 81054056, "step": 120255 }, { "epoch": 2.93797180758801, "grad_norm": 0.011122871190309525, "learning_rate": 8.691680319228337e-07, "loss": 0.0015, "num_input_tokens_seen": 81057704, "step": 120260 }, { "epoch": 2.9380939584198567, "grad_norm": 28.55598258972168, "learning_rate": 8.690834877979087e-07, "loss": 0.0378, "num_input_tokens_seen": 81061096, "step": 120265 }, { "epoch": 2.938216109251704, "grad_norm": 0.23469318449497223, "learning_rate": 8.689989446250444e-07, "loss": 0.0002, "num_input_tokens_seen": 81064360, "step": 120270 }, { "epoch": 2.938338260083551, "grad_norm": 17.726032257080078, "learning_rate": 8.689144024048549e-07, "loss": 0.0424, "num_input_tokens_seen": 81067880, "step": 120275 }, { "epoch": 2.9384604109153982, "grad_norm": 0.0037503752391785383, "learning_rate": 8.688298611379548e-07, "loss": 0.0002, "num_input_tokens_seen": 81071400, "step": 120280 }, { "epoch": 2.9385825617472454, "grad_norm": 0.04066101089119911, "learning_rate": 8.687453208249594e-07, "loss": 0.0571, "num_input_tokens_seen": 81075240, "step": 120285 }, { "epoch": 2.9387047125790926, "grad_norm": 0.7134239077568054, "learning_rate": 8.686607814664836e-07, "loss": 0.0005, "num_input_tokens_seen": 81079016, "step": 120290 }, { "epoch": 2.93882686341094, "grad_norm": 0.0060598282143473625, "learning_rate": 8.685762430631415e-07, "loss": 0.0893, "num_input_tokens_seen": 81082600, "step": 120295 }, { "epoch": 2.938949014242787, "grad_norm": 0.23264743387699127, "learning_rate": 8.684917056155482e-07, "loss": 0.0513, "num_input_tokens_seen": 81086568, "step": 120300 }, { "epoch": 2.939071165074634, "grad_norm": 0.006453213281929493, "learning_rate": 8.684071691243191e-07, "loss": 0.0001, "num_input_tokens_seen": 81090600, "step": 120305 }, { "epoch": 2.9391933159064814, "grad_norm": 0.01573999412357807, "learning_rate": 8.68322633590068e-07, "loss": 0.0467, "num_input_tokens_seen": 81093928, "step": 120310 }, { "epoch": 2.9393154667383286, "grad_norm": 0.05469103530049324, "learning_rate": 8.682380990134106e-07, "loss": 0.0514, "num_input_tokens_seen": 81097192, "step": 120315 }, { "epoch": 2.9394376175701757, "grad_norm": 274.07611083984375, "learning_rate": 8.681535653949607e-07, "loss": 0.0329, "num_input_tokens_seen": 81100840, "step": 120320 }, { "epoch": 2.939559768402023, "grad_norm": 0.011432591825723648, "learning_rate": 8.680690327353338e-07, "loss": 0.043, "num_input_tokens_seen": 81104104, "step": 120325 }, { "epoch": 2.93968191923387, "grad_norm": 0.18098084628582, "learning_rate": 8.679845010351446e-07, "loss": 0.0461, "num_input_tokens_seen": 81107368, "step": 120330 }, { "epoch": 2.9398040700657173, "grad_norm": 0.02071589045226574, "learning_rate": 8.678999702950069e-07, "loss": 0.0743, "num_input_tokens_seen": 81110504, "step": 120335 }, { "epoch": 2.939926220897564, "grad_norm": 0.03295678645372391, "learning_rate": 8.678154405155369e-07, "loss": 0.0001, "num_input_tokens_seen": 81114024, "step": 120340 }, { "epoch": 2.9400483717294117, "grad_norm": 0.027914108708500862, "learning_rate": 8.677309116973481e-07, "loss": 0.0901, "num_input_tokens_seen": 81117352, "step": 120345 }, { "epoch": 2.9401705225612584, "grad_norm": 0.03139597550034523, "learning_rate": 8.67646383841056e-07, "loss": 0.0569, "num_input_tokens_seen": 81120616, "step": 120350 }, { "epoch": 2.940292673393106, "grad_norm": 0.025647137314081192, "learning_rate": 8.675618569472747e-07, "loss": 0.0745, "num_input_tokens_seen": 81123560, "step": 120355 }, { "epoch": 2.940414824224953, "grad_norm": 18.67249298095703, "learning_rate": 8.674773310166191e-07, "loss": 0.049, "num_input_tokens_seen": 81127080, "step": 120360 }, { "epoch": 2.9405369750568, "grad_norm": 1.2997747659683228, "learning_rate": 8.673928060497045e-07, "loss": 0.0007, "num_input_tokens_seen": 81130920, "step": 120365 }, { "epoch": 2.940659125888647, "grad_norm": 0.03595520183444023, "learning_rate": 8.673082820471447e-07, "loss": 0.0001, "num_input_tokens_seen": 81133992, "step": 120370 }, { "epoch": 2.9407812767204944, "grad_norm": 0.2902233898639679, "learning_rate": 8.67223759009555e-07, "loss": 0.0162, "num_input_tokens_seen": 81137448, "step": 120375 }, { "epoch": 2.9409034275523416, "grad_norm": 0.024199893698096275, "learning_rate": 8.671392369375498e-07, "loss": 0.0002, "num_input_tokens_seen": 81140904, "step": 120380 }, { "epoch": 2.9410255783841888, "grad_norm": 0.0339377336204052, "learning_rate": 8.670547158317434e-07, "loss": 0.0001, "num_input_tokens_seen": 81144552, "step": 120385 }, { "epoch": 2.941147729216036, "grad_norm": 0.008640998043119907, "learning_rate": 8.669701956927515e-07, "loss": 0.0562, "num_input_tokens_seen": 81148072, "step": 120390 }, { "epoch": 2.941269880047883, "grad_norm": 0.28882527351379395, "learning_rate": 8.668856765211876e-07, "loss": 0.1314, "num_input_tokens_seen": 81151208, "step": 120395 }, { "epoch": 2.9413920308797303, "grad_norm": 0.2839096784591675, "learning_rate": 8.668011583176673e-07, "loss": 0.0001, "num_input_tokens_seen": 81154792, "step": 120400 }, { "epoch": 2.9415141817115775, "grad_norm": 0.021755626425147057, "learning_rate": 8.667166410828044e-07, "loss": 0.0786, "num_input_tokens_seen": 81158056, "step": 120405 }, { "epoch": 2.9416363325434247, "grad_norm": 0.060528360307216644, "learning_rate": 8.666321248172143e-07, "loss": 0.0003, "num_input_tokens_seen": 81161768, "step": 120410 }, { "epoch": 2.941758483375272, "grad_norm": 0.2548329532146454, "learning_rate": 8.665476095215109e-07, "loss": 0.0005, "num_input_tokens_seen": 81165288, "step": 120415 }, { "epoch": 2.941880634207119, "grad_norm": 0.022125931456685066, "learning_rate": 8.664630951963091e-07, "loss": 0.0666, "num_input_tokens_seen": 81168680, "step": 120420 }, { "epoch": 2.942002785038966, "grad_norm": 0.15540054440498352, "learning_rate": 8.66378581842224e-07, "loss": 0.0008, "num_input_tokens_seen": 81172584, "step": 120425 }, { "epoch": 2.9421249358708135, "grad_norm": 3.678313078125939e-05, "learning_rate": 8.662940694598697e-07, "loss": 0.0002, "num_input_tokens_seen": 81175784, "step": 120430 }, { "epoch": 2.94224708670266, "grad_norm": 0.26085910201072693, "learning_rate": 8.662095580498607e-07, "loss": 0.0002, "num_input_tokens_seen": 81179496, "step": 120435 }, { "epoch": 2.942369237534508, "grad_norm": 0.03058680146932602, "learning_rate": 8.66125047612812e-07, "loss": 0.0003, "num_input_tokens_seen": 81182888, "step": 120440 }, { "epoch": 2.9424913883663546, "grad_norm": 0.19108319282531738, "learning_rate": 8.660405381493381e-07, "loss": 0.0444, "num_input_tokens_seen": 81186472, "step": 120445 }, { "epoch": 2.942613539198202, "grad_norm": 0.0032404169905930758, "learning_rate": 8.65956029660053e-07, "loss": 0.0001, "num_input_tokens_seen": 81189928, "step": 120450 }, { "epoch": 2.942735690030049, "grad_norm": 0.5902910828590393, "learning_rate": 8.658715221455717e-07, "loss": 0.0373, "num_input_tokens_seen": 81193000, "step": 120455 }, { "epoch": 2.942857840861896, "grad_norm": 11.769871711730957, "learning_rate": 8.657870156065091e-07, "loss": 0.0634, "num_input_tokens_seen": 81196136, "step": 120460 }, { "epoch": 2.9429799916937434, "grad_norm": 0.14178988337516785, "learning_rate": 8.657025100434792e-07, "loss": 0.0003, "num_input_tokens_seen": 81199144, "step": 120465 }, { "epoch": 2.9431021425255905, "grad_norm": 0.004357943311333656, "learning_rate": 8.65618005457097e-07, "loss": 0.0535, "num_input_tokens_seen": 81202728, "step": 120470 }, { "epoch": 2.9432242933574377, "grad_norm": 0.005334364715963602, "learning_rate": 8.655335018479764e-07, "loss": 0.0379, "num_input_tokens_seen": 81205864, "step": 120475 }, { "epoch": 2.943346444189285, "grad_norm": 0.10710705816745758, "learning_rate": 8.654489992167326e-07, "loss": 0.0515, "num_input_tokens_seen": 81209192, "step": 120480 }, { "epoch": 2.943468595021132, "grad_norm": 0.02609104849398136, "learning_rate": 8.653644975639802e-07, "loss": 0.0004, "num_input_tokens_seen": 81212904, "step": 120485 }, { "epoch": 2.9435907458529793, "grad_norm": 0.015001550316810608, "learning_rate": 8.652799968903328e-07, "loss": 0.0001, "num_input_tokens_seen": 81216360, "step": 120490 }, { "epoch": 2.9437128966848265, "grad_norm": 0.004880082793533802, "learning_rate": 8.65195497196406e-07, "loss": 0.0606, "num_input_tokens_seen": 81218984, "step": 120495 }, { "epoch": 2.9438350475166737, "grad_norm": 0.02117271162569523, "learning_rate": 8.651109984828133e-07, "loss": 0.0446, "num_input_tokens_seen": 81222184, "step": 120500 }, { "epoch": 2.943957198348521, "grad_norm": 0.14116646349430084, "learning_rate": 8.650265007501702e-07, "loss": 0.0004, "num_input_tokens_seen": 81225192, "step": 120505 }, { "epoch": 2.944079349180368, "grad_norm": 0.031078439205884933, "learning_rate": 8.649420039990904e-07, "loss": 0.0001, "num_input_tokens_seen": 81229224, "step": 120510 }, { "epoch": 2.9442015000122153, "grad_norm": 0.1605730801820755, "learning_rate": 8.648575082301884e-07, "loss": 0.0379, "num_input_tokens_seen": 81232552, "step": 120515 }, { "epoch": 2.944323650844062, "grad_norm": 0.08591590076684952, "learning_rate": 8.647730134440796e-07, "loss": 0.0001, "num_input_tokens_seen": 81236264, "step": 120520 }, { "epoch": 2.9444458016759096, "grad_norm": 2.821213960647583, "learning_rate": 8.646885196413772e-07, "loss": 0.0005, "num_input_tokens_seen": 81239720, "step": 120525 }, { "epoch": 2.9445679525077564, "grad_norm": 0.0029579047113656998, "learning_rate": 8.646040268226969e-07, "loss": 0.0001, "num_input_tokens_seen": 81244200, "step": 120530 }, { "epoch": 2.944690103339604, "grad_norm": 0.012535029090940952, "learning_rate": 8.645195349886522e-07, "loss": 0.0627, "num_input_tokens_seen": 81247336, "step": 120535 }, { "epoch": 2.9448122541714508, "grad_norm": 0.010929737240076065, "learning_rate": 8.644350441398575e-07, "loss": 0.0279, "num_input_tokens_seen": 81250792, "step": 120540 }, { "epoch": 2.944934405003298, "grad_norm": 0.29619693756103516, "learning_rate": 8.643505542769283e-07, "loss": 0.0466, "num_input_tokens_seen": 81253992, "step": 120545 }, { "epoch": 2.945056555835145, "grad_norm": 0.08591552823781967, "learning_rate": 8.642660654004779e-07, "loss": 0.0002, "num_input_tokens_seen": 81257448, "step": 120550 }, { "epoch": 2.9451787066669923, "grad_norm": 0.09558862447738647, "learning_rate": 8.641815775111216e-07, "loss": 0.0001, "num_input_tokens_seen": 81260584, "step": 120555 }, { "epoch": 2.9453008574988395, "grad_norm": 27.3420352935791, "learning_rate": 8.640970906094729e-07, "loss": 0.0765, "num_input_tokens_seen": 81263848, "step": 120560 }, { "epoch": 2.9454230083306867, "grad_norm": 0.0957900658249855, "learning_rate": 8.640126046961473e-07, "loss": 0.0003, "num_input_tokens_seen": 81267688, "step": 120565 }, { "epoch": 2.945545159162534, "grad_norm": 0.054024793207645416, "learning_rate": 8.639281197717579e-07, "loss": 0.0001, "num_input_tokens_seen": 81271016, "step": 120570 }, { "epoch": 2.945667309994381, "grad_norm": 0.016774863004684448, "learning_rate": 8.6384363583692e-07, "loss": 0.0001, "num_input_tokens_seen": 81274408, "step": 120575 }, { "epoch": 2.9457894608262283, "grad_norm": 0.029413677752017975, "learning_rate": 8.637591528922482e-07, "loss": 0.0, "num_input_tokens_seen": 81277544, "step": 120580 }, { "epoch": 2.9459116116580755, "grad_norm": 0.0019817219581454992, "learning_rate": 8.636746709383563e-07, "loss": 0.0515, "num_input_tokens_seen": 81281000, "step": 120585 }, { "epoch": 2.9460337624899227, "grad_norm": 266.28082275390625, "learning_rate": 8.635901899758589e-07, "loss": 0.0343, "num_input_tokens_seen": 81284264, "step": 120590 }, { "epoch": 2.94615591332177, "grad_norm": 0.043926727026700974, "learning_rate": 8.635057100053702e-07, "loss": 0.0548, "num_input_tokens_seen": 81287528, "step": 120595 }, { "epoch": 2.946278064153617, "grad_norm": 0.010100879706442356, "learning_rate": 8.63421231027505e-07, "loss": 0.0001, "num_input_tokens_seen": 81290792, "step": 120600 }, { "epoch": 2.946400214985464, "grad_norm": 0.10049595683813095, "learning_rate": 8.633367530428769e-07, "loss": 0.0002, "num_input_tokens_seen": 81294056, "step": 120605 }, { "epoch": 2.9465223658173114, "grad_norm": 0.026556089520454407, "learning_rate": 8.632522760521006e-07, "loss": 0.0001, "num_input_tokens_seen": 81297704, "step": 120610 }, { "epoch": 2.946644516649158, "grad_norm": 0.14986656606197357, "learning_rate": 8.63167800055791e-07, "loss": 0.0008, "num_input_tokens_seen": 81300904, "step": 120615 }, { "epoch": 2.946766667481006, "grad_norm": 0.001587200560607016, "learning_rate": 8.630833250545616e-07, "loss": 0.059, "num_input_tokens_seen": 81304360, "step": 120620 }, { "epoch": 2.9468888183128525, "grad_norm": 0.03377887234091759, "learning_rate": 8.629988510490274e-07, "loss": 0.0009, "num_input_tokens_seen": 81307880, "step": 120625 }, { "epoch": 2.9470109691446997, "grad_norm": 0.13490785658359528, "learning_rate": 8.629143780398022e-07, "loss": 0.0728, "num_input_tokens_seen": 81311144, "step": 120630 }, { "epoch": 2.947133119976547, "grad_norm": 0.0219797994941473, "learning_rate": 8.628299060275006e-07, "loss": 0.0006, "num_input_tokens_seen": 81314408, "step": 120635 }, { "epoch": 2.947255270808394, "grad_norm": 0.031006071716547012, "learning_rate": 8.62745435012737e-07, "loss": 0.0001, "num_input_tokens_seen": 81317736, "step": 120640 }, { "epoch": 2.9473774216402413, "grad_norm": 0.01624114066362381, "learning_rate": 8.62660964996125e-07, "loss": 0.0788, "num_input_tokens_seen": 81321064, "step": 120645 }, { "epoch": 2.9474995724720885, "grad_norm": 0.4532836377620697, "learning_rate": 8.625764959782799e-07, "loss": 0.0423, "num_input_tokens_seen": 81324200, "step": 120650 }, { "epoch": 2.9476217233039357, "grad_norm": 26.661325454711914, "learning_rate": 8.624920279598152e-07, "loss": 0.1261, "num_input_tokens_seen": 81327784, "step": 120655 }, { "epoch": 2.947743874135783, "grad_norm": 0.06300724297761917, "learning_rate": 8.624075609413457e-07, "loss": 0.0719, "num_input_tokens_seen": 81330984, "step": 120660 }, { "epoch": 2.94786602496763, "grad_norm": 0.014966240152716637, "learning_rate": 8.623230949234851e-07, "loss": 0.0002, "num_input_tokens_seen": 81334184, "step": 120665 }, { "epoch": 2.9479881757994773, "grad_norm": 24.074207305908203, "learning_rate": 8.62238629906848e-07, "loss": 0.1365, "num_input_tokens_seen": 81337256, "step": 120670 }, { "epoch": 2.9481103266313244, "grad_norm": 0.0069862534292042255, "learning_rate": 8.62154165892049e-07, "loss": 0.0002, "num_input_tokens_seen": 81340456, "step": 120675 }, { "epoch": 2.9482324774631716, "grad_norm": 0.008710438385605812, "learning_rate": 8.620697028797016e-07, "loss": 0.0003, "num_input_tokens_seen": 81343720, "step": 120680 }, { "epoch": 2.948354628295019, "grad_norm": 0.13439220190048218, "learning_rate": 8.619852408704208e-07, "loss": 0.0586, "num_input_tokens_seen": 81348008, "step": 120685 }, { "epoch": 2.948476779126866, "grad_norm": 0.017029277980327606, "learning_rate": 8.619007798648202e-07, "loss": 0.0004, "num_input_tokens_seen": 81351592, "step": 120690 }, { "epoch": 2.948598929958713, "grad_norm": 0.018834611400961876, "learning_rate": 8.618163198635142e-07, "loss": 0.0006, "num_input_tokens_seen": 81355048, "step": 120695 }, { "epoch": 2.94872108079056, "grad_norm": 0.03875388205051422, "learning_rate": 8.617318608671174e-07, "loss": 0.0334, "num_input_tokens_seen": 81358440, "step": 120700 }, { "epoch": 2.9488432316224076, "grad_norm": 14.059005737304688, "learning_rate": 8.616474028762432e-07, "loss": 0.0654, "num_input_tokens_seen": 81361704, "step": 120705 }, { "epoch": 2.9489653824542543, "grad_norm": 0.020959537476301193, "learning_rate": 8.615629458915069e-07, "loss": 0.0003, "num_input_tokens_seen": 81365224, "step": 120710 }, { "epoch": 2.9490875332861015, "grad_norm": 0.1255888193845749, "learning_rate": 8.614784899135216e-07, "loss": 0.0775, "num_input_tokens_seen": 81368680, "step": 120715 }, { "epoch": 2.9492096841179487, "grad_norm": 0.014902031049132347, "learning_rate": 8.613940349429024e-07, "loss": 0.0002, "num_input_tokens_seen": 81371560, "step": 120720 }, { "epoch": 2.949331834949796, "grad_norm": 0.1372278332710266, "learning_rate": 8.613095809802626e-07, "loss": 0.0003, "num_input_tokens_seen": 81375016, "step": 120725 }, { "epoch": 2.949453985781643, "grad_norm": 0.001658071530982852, "learning_rate": 8.612251280262168e-07, "loss": 0.0294, "num_input_tokens_seen": 81378664, "step": 120730 }, { "epoch": 2.9495761366134903, "grad_norm": 0.059731800109148026, "learning_rate": 8.611406760813797e-07, "loss": 0.1169, "num_input_tokens_seen": 81381544, "step": 120735 }, { "epoch": 2.9496982874453375, "grad_norm": 0.007839690893888474, "learning_rate": 8.610562251463648e-07, "loss": 0.0003, "num_input_tokens_seen": 81384936, "step": 120740 }, { "epoch": 2.9498204382771847, "grad_norm": 22.30178451538086, "learning_rate": 8.609717752217864e-07, "loss": 0.1131, "num_input_tokens_seen": 81388392, "step": 120745 }, { "epoch": 2.949942589109032, "grad_norm": 0.015643972903490067, "learning_rate": 8.608873263082584e-07, "loss": 0.058, "num_input_tokens_seen": 81391656, "step": 120750 }, { "epoch": 2.950064739940879, "grad_norm": 0.017678532749414444, "learning_rate": 8.608028784063957e-07, "loss": 0.0001, "num_input_tokens_seen": 81394792, "step": 120755 }, { "epoch": 2.9501868907727262, "grad_norm": 9.996286392211914, "learning_rate": 8.607184315168112e-07, "loss": 0.1291, "num_input_tokens_seen": 81397928, "step": 120760 }, { "epoch": 2.9503090416045734, "grad_norm": 9.89952278137207, "learning_rate": 8.6063398564012e-07, "loss": 0.0884, "num_input_tokens_seen": 81401512, "step": 120765 }, { "epoch": 2.9504311924364206, "grad_norm": 0.048212021589279175, "learning_rate": 8.605495407769362e-07, "loss": 0.0323, "num_input_tokens_seen": 81404520, "step": 120770 }, { "epoch": 2.950553343268268, "grad_norm": 0.2680891752243042, "learning_rate": 8.604650969278733e-07, "loss": 0.0005, "num_input_tokens_seen": 81408104, "step": 120775 }, { "epoch": 2.950675494100115, "grad_norm": 0.014988810755312443, "learning_rate": 8.603806540935461e-07, "loss": 0.0481, "num_input_tokens_seen": 81411880, "step": 120780 }, { "epoch": 2.9507976449319617, "grad_norm": 0.09640608727931976, "learning_rate": 8.602962122745679e-07, "loss": 0.0005, "num_input_tokens_seen": 81415528, "step": 120785 }, { "epoch": 2.9509197957638094, "grad_norm": 0.10828623175621033, "learning_rate": 8.602117714715536e-07, "loss": 0.0011, "num_input_tokens_seen": 81418984, "step": 120790 }, { "epoch": 2.951041946595656, "grad_norm": 0.13522803783416748, "learning_rate": 8.601273316851168e-07, "loss": 0.0006, "num_input_tokens_seen": 81422248, "step": 120795 }, { "epoch": 2.9511640974275037, "grad_norm": 0.036873117089271545, "learning_rate": 8.600428929158715e-07, "loss": 0.0004, "num_input_tokens_seen": 81425320, "step": 120800 }, { "epoch": 2.9512862482593505, "grad_norm": 0.15291425585746765, "learning_rate": 8.599584551644324e-07, "loss": 0.1369, "num_input_tokens_seen": 81428712, "step": 120805 }, { "epoch": 2.9514083990911977, "grad_norm": 0.17997407913208008, "learning_rate": 8.598740184314124e-07, "loss": 0.0005, "num_input_tokens_seen": 81431784, "step": 120810 }, { "epoch": 2.951530549923045, "grad_norm": 0.15903061628341675, "learning_rate": 8.597895827174269e-07, "loss": 0.0376, "num_input_tokens_seen": 81435176, "step": 120815 }, { "epoch": 2.951652700754892, "grad_norm": 0.06580569595098495, "learning_rate": 8.597051480230886e-07, "loss": 0.0005, "num_input_tokens_seen": 81438760, "step": 120820 }, { "epoch": 2.9517748515867392, "grad_norm": 0.052439238876104355, "learning_rate": 8.596207143490123e-07, "loss": 0.0537, "num_input_tokens_seen": 81441960, "step": 120825 }, { "epoch": 2.9518970024185864, "grad_norm": 0.025906480848789215, "learning_rate": 8.595362816958124e-07, "loss": 0.001, "num_input_tokens_seen": 81445288, "step": 120830 }, { "epoch": 2.9520191532504336, "grad_norm": 0.01771625503897667, "learning_rate": 8.594518500641019e-07, "loss": 0.0002, "num_input_tokens_seen": 81448424, "step": 120835 }, { "epoch": 2.952141304082281, "grad_norm": 0.2788228392601013, "learning_rate": 8.59367419454496e-07, "loss": 0.0427, "num_input_tokens_seen": 81451368, "step": 120840 }, { "epoch": 2.952263454914128, "grad_norm": 0.26164335012435913, "learning_rate": 8.592829898676076e-07, "loss": 0.0004, "num_input_tokens_seen": 81454760, "step": 120845 }, { "epoch": 2.952385605745975, "grad_norm": 12.802289962768555, "learning_rate": 8.591985613040511e-07, "loss": 0.1223, "num_input_tokens_seen": 81457960, "step": 120850 }, { "epoch": 2.9525077565778224, "grad_norm": 0.7145296931266785, "learning_rate": 8.591141337644409e-07, "loss": 0.0372, "num_input_tokens_seen": 81461352, "step": 120855 }, { "epoch": 2.9526299074096696, "grad_norm": 0.05967799574136734, "learning_rate": 8.590297072493901e-07, "loss": 0.0711, "num_input_tokens_seen": 81464552, "step": 120860 }, { "epoch": 2.9527520582415168, "grad_norm": 0.08640944957733154, "learning_rate": 8.589452817595138e-07, "loss": 0.0007, "num_input_tokens_seen": 81467752, "step": 120865 }, { "epoch": 2.952874209073364, "grad_norm": 0.059327781200408936, "learning_rate": 8.588608572954248e-07, "loss": 0.0001, "num_input_tokens_seen": 81471208, "step": 120870 }, { "epoch": 2.952996359905211, "grad_norm": 8.785379031905904e-05, "learning_rate": 8.587764338577381e-07, "loss": 0.0384, "num_input_tokens_seen": 81474664, "step": 120875 }, { "epoch": 2.953118510737058, "grad_norm": 0.17147451639175415, "learning_rate": 8.586920114470666e-07, "loss": 0.0005, "num_input_tokens_seen": 81478696, "step": 120880 }, { "epoch": 2.9532406615689055, "grad_norm": 0.017553070560097694, "learning_rate": 8.586075900640248e-07, "loss": 0.117, "num_input_tokens_seen": 81481640, "step": 120885 }, { "epoch": 2.9533628124007523, "grad_norm": 0.023216333240270615, "learning_rate": 8.585231697092272e-07, "loss": 0.0002, "num_input_tokens_seen": 81485416, "step": 120890 }, { "epoch": 2.9534849632325995, "grad_norm": 65.75865936279297, "learning_rate": 8.584387503832868e-07, "loss": 0.0558, "num_input_tokens_seen": 81488616, "step": 120895 }, { "epoch": 2.9536071140644466, "grad_norm": 0.9762405753135681, "learning_rate": 8.583543320868181e-07, "loss": 0.0993, "num_input_tokens_seen": 81492008, "step": 120900 }, { "epoch": 2.953729264896294, "grad_norm": 0.018305018544197083, "learning_rate": 8.582699148204347e-07, "loss": 0.0326, "num_input_tokens_seen": 81495336, "step": 120905 }, { "epoch": 2.953851415728141, "grad_norm": 0.005096158478409052, "learning_rate": 8.581854985847508e-07, "loss": 0.0003, "num_input_tokens_seen": 81498536, "step": 120910 }, { "epoch": 2.953973566559988, "grad_norm": 0.06799504905939102, "learning_rate": 8.581010833803795e-07, "loss": 0.0576, "num_input_tokens_seen": 81502184, "step": 120915 }, { "epoch": 2.9540957173918354, "grad_norm": 0.09852556139230728, "learning_rate": 8.580166692079355e-07, "loss": 0.0441, "num_input_tokens_seen": 81505384, "step": 120920 }, { "epoch": 2.9542178682236826, "grad_norm": 69.60293579101562, "learning_rate": 8.579322560680329e-07, "loss": 0.0665, "num_input_tokens_seen": 81508456, "step": 120925 }, { "epoch": 2.95434001905553, "grad_norm": 0.3892046809196472, "learning_rate": 8.578478439612846e-07, "loss": 0.0293, "num_input_tokens_seen": 81511464, "step": 120930 }, { "epoch": 2.954462169887377, "grad_norm": 0.019820692017674446, "learning_rate": 8.577634328883055e-07, "loss": 0.0624, "num_input_tokens_seen": 81514792, "step": 120935 }, { "epoch": 2.954584320719224, "grad_norm": 0.05053357407450676, "learning_rate": 8.576790228497085e-07, "loss": 0.0006, "num_input_tokens_seen": 81518440, "step": 120940 }, { "epoch": 2.9547064715510714, "grad_norm": 0.2523508667945862, "learning_rate": 8.575946138461082e-07, "loss": 0.0003, "num_input_tokens_seen": 81522408, "step": 120945 }, { "epoch": 2.9548286223829185, "grad_norm": 0.08181318640708923, "learning_rate": 8.575102058781181e-07, "loss": 0.0007, "num_input_tokens_seen": 81525800, "step": 120950 }, { "epoch": 2.9549507732147657, "grad_norm": 32.51930618286133, "learning_rate": 8.574257989463522e-07, "loss": 0.0014, "num_input_tokens_seen": 81529064, "step": 120955 }, { "epoch": 2.955072924046613, "grad_norm": 0.07995011657476425, "learning_rate": 8.573413930514244e-07, "loss": 0.0004, "num_input_tokens_seen": 81532584, "step": 120960 }, { "epoch": 2.9551950748784597, "grad_norm": 0.014723896980285645, "learning_rate": 8.57256988193948e-07, "loss": 0.0609, "num_input_tokens_seen": 81535912, "step": 120965 }, { "epoch": 2.9553172257103073, "grad_norm": 42.25886917114258, "learning_rate": 8.571725843745374e-07, "loss": 0.1942, "num_input_tokens_seen": 81539304, "step": 120970 }, { "epoch": 2.955439376542154, "grad_norm": 0.14818468689918518, "learning_rate": 8.57088181593806e-07, "loss": 0.0004, "num_input_tokens_seen": 81542632, "step": 120975 }, { "epoch": 2.9555615273740017, "grad_norm": 0.336948424577713, "learning_rate": 8.570037798523677e-07, "loss": 0.0085, "num_input_tokens_seen": 81545832, "step": 120980 }, { "epoch": 2.9556836782058484, "grad_norm": 0.010888462886214256, "learning_rate": 8.569193791508368e-07, "loss": 0.0031, "num_input_tokens_seen": 81548840, "step": 120985 }, { "epoch": 2.9558058290376956, "grad_norm": 0.026835812255740166, "learning_rate": 8.568349794898262e-07, "loss": 0.0003, "num_input_tokens_seen": 81552040, "step": 120990 }, { "epoch": 2.955927979869543, "grad_norm": 0.006106141954660416, "learning_rate": 8.567505808699506e-07, "loss": 0.0001, "num_input_tokens_seen": 81555240, "step": 120995 }, { "epoch": 2.95605013070139, "grad_norm": 0.024814531207084656, "learning_rate": 8.566661832918231e-07, "loss": 0.0002, "num_input_tokens_seen": 81558760, "step": 121000 }, { "epoch": 2.956172281533237, "grad_norm": 0.035326164215803146, "learning_rate": 8.565817867560576e-07, "loss": 0.0002, "num_input_tokens_seen": 81561640, "step": 121005 }, { "epoch": 2.9562944323650844, "grad_norm": 49.551002502441406, "learning_rate": 8.564973912632679e-07, "loss": 0.0845, "num_input_tokens_seen": 81564776, "step": 121010 }, { "epoch": 2.9564165831969316, "grad_norm": 0.08276200294494629, "learning_rate": 8.564129968140677e-07, "loss": 0.0002, "num_input_tokens_seen": 81567848, "step": 121015 }, { "epoch": 2.9565387340287788, "grad_norm": 0.03254903480410576, "learning_rate": 8.563286034090711e-07, "loss": 0.0001, "num_input_tokens_seen": 81571368, "step": 121020 }, { "epoch": 2.956660884860626, "grad_norm": 0.025688692927360535, "learning_rate": 8.562442110488911e-07, "loss": 0.049, "num_input_tokens_seen": 81574824, "step": 121025 }, { "epoch": 2.956783035692473, "grad_norm": 0.43042516708374023, "learning_rate": 8.561598197341424e-07, "loss": 0.0005, "num_input_tokens_seen": 81578024, "step": 121030 }, { "epoch": 2.9569051865243203, "grad_norm": 0.02726716548204422, "learning_rate": 8.560754294654377e-07, "loss": 0.048, "num_input_tokens_seen": 81582120, "step": 121035 }, { "epoch": 2.9570273373561675, "grad_norm": 0.0018384922295808792, "learning_rate": 8.559910402433912e-07, "loss": 0.0003, "num_input_tokens_seen": 81585640, "step": 121040 }, { "epoch": 2.9571494881880147, "grad_norm": 15.961065292358398, "learning_rate": 8.55906652068617e-07, "loss": 0.0613, "num_input_tokens_seen": 81588520, "step": 121045 }, { "epoch": 2.9572716390198615, "grad_norm": 0.10140183568000793, "learning_rate": 8.558222649417282e-07, "loss": 0.0588, "num_input_tokens_seen": 81592296, "step": 121050 }, { "epoch": 2.957393789851709, "grad_norm": 0.019582247361540794, "learning_rate": 8.557378788633386e-07, "loss": 0.0004, "num_input_tokens_seen": 81595688, "step": 121055 }, { "epoch": 2.957515940683556, "grad_norm": 236.25596618652344, "learning_rate": 8.55653493834062e-07, "loss": 0.0867, "num_input_tokens_seen": 81599208, "step": 121060 }, { "epoch": 2.9576380915154035, "grad_norm": 0.0025045794900506735, "learning_rate": 8.555691098545122e-07, "loss": 0.0317, "num_input_tokens_seen": 81602984, "step": 121065 }, { "epoch": 2.95776024234725, "grad_norm": 0.029240965843200684, "learning_rate": 8.554847269253023e-07, "loss": 0.0933, "num_input_tokens_seen": 81606312, "step": 121070 }, { "epoch": 2.9578823931790974, "grad_norm": 55.595611572265625, "learning_rate": 8.554003450470463e-07, "loss": 0.0954, "num_input_tokens_seen": 81609832, "step": 121075 }, { "epoch": 2.9580045440109446, "grad_norm": 0.018441449850797653, "learning_rate": 8.553159642203584e-07, "loss": 0.0003, "num_input_tokens_seen": 81613096, "step": 121080 }, { "epoch": 2.958126694842792, "grad_norm": 0.039422810077667236, "learning_rate": 8.552315844458511e-07, "loss": 0.049, "num_input_tokens_seen": 81616552, "step": 121085 }, { "epoch": 2.958248845674639, "grad_norm": 0.2605257034301758, "learning_rate": 8.551472057241393e-07, "loss": 0.0008, "num_input_tokens_seen": 81619560, "step": 121090 }, { "epoch": 2.958370996506486, "grad_norm": 0.006907718721777201, "learning_rate": 8.550628280558354e-07, "loss": 0.0002, "num_input_tokens_seen": 81622824, "step": 121095 }, { "epoch": 2.9584931473383334, "grad_norm": 0.018841054290533066, "learning_rate": 8.549784514415539e-07, "loss": 0.0003, "num_input_tokens_seen": 81625896, "step": 121100 }, { "epoch": 2.9586152981701805, "grad_norm": 0.058261509984731674, "learning_rate": 8.548940758819081e-07, "loss": 0.0002, "num_input_tokens_seen": 81629224, "step": 121105 }, { "epoch": 2.9587374490020277, "grad_norm": 18.50106430053711, "learning_rate": 8.548097013775116e-07, "loss": 0.0641, "num_input_tokens_seen": 81632680, "step": 121110 }, { "epoch": 2.958859599833875, "grad_norm": 0.10784649848937988, "learning_rate": 8.547253279289781e-07, "loss": 0.0467, "num_input_tokens_seen": 81635880, "step": 121115 }, { "epoch": 2.958981750665722, "grad_norm": 0.014018794521689415, "learning_rate": 8.546409555369207e-07, "loss": 0.0001, "num_input_tokens_seen": 81638888, "step": 121120 }, { "epoch": 2.9591039014975693, "grad_norm": 0.02228376641869545, "learning_rate": 8.545565842019539e-07, "loss": 0.0001, "num_input_tokens_seen": 81643944, "step": 121125 }, { "epoch": 2.9592260523294165, "grad_norm": 39.539794921875, "learning_rate": 8.544722139246902e-07, "loss": 0.1017, "num_input_tokens_seen": 81647016, "step": 121130 }, { "epoch": 2.9593482031612637, "grad_norm": 0.007597757503390312, "learning_rate": 8.543878447057439e-07, "loss": 0.0002, "num_input_tokens_seen": 81650792, "step": 121135 }, { "epoch": 2.959470353993111, "grad_norm": 0.02928669936954975, "learning_rate": 8.543034765457286e-07, "loss": 0.0007, "num_input_tokens_seen": 81654184, "step": 121140 }, { "epoch": 2.9595925048249576, "grad_norm": 0.054370008409023285, "learning_rate": 8.542191094452574e-07, "loss": 0.0001, "num_input_tokens_seen": 81657384, "step": 121145 }, { "epoch": 2.9597146556568052, "grad_norm": 0.011456976644694805, "learning_rate": 8.541347434049442e-07, "loss": 0.0003, "num_input_tokens_seen": 81661032, "step": 121150 }, { "epoch": 2.959836806488652, "grad_norm": 0.005163121968507767, "learning_rate": 8.540503784254023e-07, "loss": 0.0001, "num_input_tokens_seen": 81664488, "step": 121155 }, { "epoch": 2.9599589573204996, "grad_norm": 0.3065810203552246, "learning_rate": 8.539660145072452e-07, "loss": 0.0002, "num_input_tokens_seen": 81667688, "step": 121160 }, { "epoch": 2.9600811081523464, "grad_norm": 0.016518019139766693, "learning_rate": 8.538816516510866e-07, "loss": 0.0002, "num_input_tokens_seen": 81671016, "step": 121165 }, { "epoch": 2.9602032589841936, "grad_norm": 54.666160583496094, "learning_rate": 8.537972898575398e-07, "loss": 0.0225, "num_input_tokens_seen": 81674728, "step": 121170 }, { "epoch": 2.9603254098160408, "grad_norm": 0.020525958389043808, "learning_rate": 8.537129291272187e-07, "loss": 0.073, "num_input_tokens_seen": 81678312, "step": 121175 }, { "epoch": 2.960447560647888, "grad_norm": 0.07339372485876083, "learning_rate": 8.536285694607361e-07, "loss": 0.0501, "num_input_tokens_seen": 81681448, "step": 121180 }, { "epoch": 2.960569711479735, "grad_norm": 0.045110974460840225, "learning_rate": 8.535442108587066e-07, "loss": 0.1142, "num_input_tokens_seen": 81685416, "step": 121185 }, { "epoch": 2.9606918623115823, "grad_norm": 0.035503778606653214, "learning_rate": 8.534598533217423e-07, "loss": 0.0001, "num_input_tokens_seen": 81688872, "step": 121190 }, { "epoch": 2.9608140131434295, "grad_norm": 0.04830501601099968, "learning_rate": 8.533754968504574e-07, "loss": 0.0535, "num_input_tokens_seen": 81692008, "step": 121195 }, { "epoch": 2.9609361639752767, "grad_norm": 0.006708557717502117, "learning_rate": 8.532911414454657e-07, "loss": 0.0858, "num_input_tokens_seen": 81695272, "step": 121200 }, { "epoch": 2.961058314807124, "grad_norm": 0.021845147013664246, "learning_rate": 8.532067871073803e-07, "loss": 0.0493, "num_input_tokens_seen": 81698536, "step": 121205 }, { "epoch": 2.961180465638971, "grad_norm": 15.514656066894531, "learning_rate": 8.531224338368144e-07, "loss": 0.1358, "num_input_tokens_seen": 81701736, "step": 121210 }, { "epoch": 2.9613026164708183, "grad_norm": 0.1647643893957138, "learning_rate": 8.530380816343818e-07, "loss": 0.0583, "num_input_tokens_seen": 81705128, "step": 121215 }, { "epoch": 2.9614247673026655, "grad_norm": 0.048709526658058167, "learning_rate": 8.52953730500696e-07, "loss": 0.1345, "num_input_tokens_seen": 81708200, "step": 121220 }, { "epoch": 2.9615469181345127, "grad_norm": 0.07313637435436249, "learning_rate": 8.528693804363697e-07, "loss": 0.0674, "num_input_tokens_seen": 81711784, "step": 121225 }, { "epoch": 2.9616690689663594, "grad_norm": 0.0985637828707695, "learning_rate": 8.527850314420169e-07, "loss": 0.0849, "num_input_tokens_seen": 81714600, "step": 121230 }, { "epoch": 2.961791219798207, "grad_norm": 0.11501231044530869, "learning_rate": 8.527006835182514e-07, "loss": 0.0005, "num_input_tokens_seen": 81717736, "step": 121235 }, { "epoch": 2.9619133706300538, "grad_norm": 0.049504294991493225, "learning_rate": 8.526163366656857e-07, "loss": 0.0429, "num_input_tokens_seen": 81721128, "step": 121240 }, { "epoch": 2.9620355214619014, "grad_norm": 0.17213276028633118, "learning_rate": 8.52531990884934e-07, "loss": 0.0115, "num_input_tokens_seen": 81724200, "step": 121245 }, { "epoch": 2.962157672293748, "grad_norm": 0.20793768763542175, "learning_rate": 8.52447646176609e-07, "loss": 0.0013, "num_input_tokens_seen": 81727464, "step": 121250 }, { "epoch": 2.9622798231255953, "grad_norm": 0.01568690501153469, "learning_rate": 8.523633025413246e-07, "loss": 0.0742, "num_input_tokens_seen": 81730536, "step": 121255 }, { "epoch": 2.9624019739574425, "grad_norm": 0.1056603267788887, "learning_rate": 8.522789599796939e-07, "loss": 0.0003, "num_input_tokens_seen": 81733544, "step": 121260 }, { "epoch": 2.9625241247892897, "grad_norm": 0.031303521245718, "learning_rate": 8.521946184923304e-07, "loss": 0.1397, "num_input_tokens_seen": 81736744, "step": 121265 }, { "epoch": 2.962646275621137, "grad_norm": 0.49149009585380554, "learning_rate": 8.521102780798475e-07, "loss": 0.0798, "num_input_tokens_seen": 81740136, "step": 121270 }, { "epoch": 2.962768426452984, "grad_norm": 0.1194659098982811, "learning_rate": 8.520259387428582e-07, "loss": 0.0639, "num_input_tokens_seen": 81743336, "step": 121275 }, { "epoch": 2.9628905772848313, "grad_norm": 0.09376633167266846, "learning_rate": 8.519416004819764e-07, "loss": 0.0005, "num_input_tokens_seen": 81746344, "step": 121280 }, { "epoch": 2.9630127281166785, "grad_norm": 0.6304414868354797, "learning_rate": 8.518572632978147e-07, "loss": 0.0007, "num_input_tokens_seen": 81749608, "step": 121285 }, { "epoch": 2.9631348789485257, "grad_norm": 0.13901102542877197, "learning_rate": 8.517729271909869e-07, "loss": 0.1305, "num_input_tokens_seen": 81752872, "step": 121290 }, { "epoch": 2.963257029780373, "grad_norm": 0.006146088242530823, "learning_rate": 8.516885921621064e-07, "loss": 0.0004, "num_input_tokens_seen": 81755944, "step": 121295 }, { "epoch": 2.96337918061222, "grad_norm": 0.11365926265716553, "learning_rate": 8.516042582117862e-07, "loss": 0.0011, "num_input_tokens_seen": 81759720, "step": 121300 }, { "epoch": 2.9635013314440672, "grad_norm": 0.007527015637606382, "learning_rate": 8.5151992534064e-07, "loss": 0.0005, "num_input_tokens_seen": 81763368, "step": 121305 }, { "epoch": 2.9636234822759144, "grad_norm": 0.020057743415236473, "learning_rate": 8.514355935492806e-07, "loss": 0.1187, "num_input_tokens_seen": 81766824, "step": 121310 }, { "epoch": 2.9637456331077616, "grad_norm": 0.029346054419875145, "learning_rate": 8.513512628383217e-07, "loss": 0.0003, "num_input_tokens_seen": 81769896, "step": 121315 }, { "epoch": 2.963867783939609, "grad_norm": 0.07407413423061371, "learning_rate": 8.512669332083763e-07, "loss": 0.0732, "num_input_tokens_seen": 81773608, "step": 121320 }, { "epoch": 2.9639899347714556, "grad_norm": 0.2536911368370056, "learning_rate": 8.511826046600575e-07, "loss": 0.0336, "num_input_tokens_seen": 81777000, "step": 121325 }, { "epoch": 2.964112085603303, "grad_norm": 0.06144652143120766, "learning_rate": 8.510982771939794e-07, "loss": 0.035, "num_input_tokens_seen": 81780712, "step": 121330 }, { "epoch": 2.96423423643515, "grad_norm": 0.08548964560031891, "learning_rate": 8.510139508107541e-07, "loss": 0.0515, "num_input_tokens_seen": 81783720, "step": 121335 }, { "epoch": 2.964356387266997, "grad_norm": 0.010217788629233837, "learning_rate": 8.509296255109959e-07, "loss": 0.0001, "num_input_tokens_seen": 81787112, "step": 121340 }, { "epoch": 2.9644785380988443, "grad_norm": 0.0944608524441719, "learning_rate": 8.508453012953172e-07, "loss": 0.0445, "num_input_tokens_seen": 81790568, "step": 121345 }, { "epoch": 2.9646006889306915, "grad_norm": 0.12509916722774506, "learning_rate": 8.507609781643316e-07, "loss": 0.0408, "num_input_tokens_seen": 81793576, "step": 121350 }, { "epoch": 2.9647228397625387, "grad_norm": 0.06585719436407089, "learning_rate": 8.506766561186526e-07, "loss": 0.0401, "num_input_tokens_seen": 81797096, "step": 121355 }, { "epoch": 2.964844990594386, "grad_norm": 1.7288832664489746, "learning_rate": 8.505923351588931e-07, "loss": 0.0665, "num_input_tokens_seen": 81800296, "step": 121360 }, { "epoch": 2.964967141426233, "grad_norm": 0.03702880069613457, "learning_rate": 8.505080152856661e-07, "loss": 0.0237, "num_input_tokens_seen": 81803432, "step": 121365 }, { "epoch": 2.9650892922580803, "grad_norm": 0.8792369365692139, "learning_rate": 8.504236964995851e-07, "loss": 0.0005, "num_input_tokens_seen": 81806568, "step": 121370 }, { "epoch": 2.9652114430899275, "grad_norm": 0.18109741806983948, "learning_rate": 8.503393788012635e-07, "loss": 0.0384, "num_input_tokens_seen": 81810152, "step": 121375 }, { "epoch": 2.9653335939217746, "grad_norm": 60.54440689086914, "learning_rate": 8.502550621913137e-07, "loss": 0.0011, "num_input_tokens_seen": 81813800, "step": 121380 }, { "epoch": 2.965455744753622, "grad_norm": 0.0288938470184803, "learning_rate": 8.501707466703494e-07, "loss": 0.128, "num_input_tokens_seen": 81816616, "step": 121385 }, { "epoch": 2.965577895585469, "grad_norm": 0.09552355855703354, "learning_rate": 8.50086432238984e-07, "loss": 0.0599, "num_input_tokens_seen": 81819560, "step": 121390 }, { "epoch": 2.965700046417316, "grad_norm": 0.2099023461341858, "learning_rate": 8.500021188978301e-07, "loss": 0.0009, "num_input_tokens_seen": 81822760, "step": 121395 }, { "epoch": 2.9658221972491634, "grad_norm": 0.0012449991190806031, "learning_rate": 8.499178066475016e-07, "loss": 0.049, "num_input_tokens_seen": 81826024, "step": 121400 }, { "epoch": 2.9659443480810106, "grad_norm": 0.37859848141670227, "learning_rate": 8.498334954886107e-07, "loss": 0.0004, "num_input_tokens_seen": 81829224, "step": 121405 }, { "epoch": 2.9660664989128573, "grad_norm": 0.6261986494064331, "learning_rate": 8.497491854217713e-07, "loss": 0.0006, "num_input_tokens_seen": 81832680, "step": 121410 }, { "epoch": 2.966188649744705, "grad_norm": 0.20188039541244507, "learning_rate": 8.496648764475961e-07, "loss": 0.0006, "num_input_tokens_seen": 81836072, "step": 121415 }, { "epoch": 2.9663108005765517, "grad_norm": 0.12402575463056564, "learning_rate": 8.495805685666985e-07, "loss": 0.0003, "num_input_tokens_seen": 81839592, "step": 121420 }, { "epoch": 2.9664329514083994, "grad_norm": 0.037398580461740494, "learning_rate": 8.494962617796915e-07, "loss": 0.0445, "num_input_tokens_seen": 81843048, "step": 121425 }, { "epoch": 2.966555102240246, "grad_norm": 0.1763017773628235, "learning_rate": 8.494119560871879e-07, "loss": 0.0362, "num_input_tokens_seen": 81845928, "step": 121430 }, { "epoch": 2.9666772530720933, "grad_norm": 0.7172082662582397, "learning_rate": 8.493276514898014e-07, "loss": 0.0004, "num_input_tokens_seen": 81849704, "step": 121435 }, { "epoch": 2.9667994039039405, "grad_norm": 0.0016529584536328912, "learning_rate": 8.492433479881444e-07, "loss": 0.001, "num_input_tokens_seen": 81853032, "step": 121440 }, { "epoch": 2.9669215547357877, "grad_norm": 0.0239529088139534, "learning_rate": 8.491590455828302e-07, "loss": 0.0002, "num_input_tokens_seen": 81856360, "step": 121445 }, { "epoch": 2.967043705567635, "grad_norm": 0.011942592449486256, "learning_rate": 8.490747442744725e-07, "loss": 0.0014, "num_input_tokens_seen": 81859624, "step": 121450 }, { "epoch": 2.967165856399482, "grad_norm": 0.047506652772426605, "learning_rate": 8.489904440636833e-07, "loss": 0.0002, "num_input_tokens_seen": 81863144, "step": 121455 }, { "epoch": 2.9672880072313292, "grad_norm": 0.04296977072954178, "learning_rate": 8.489061449510768e-07, "loss": 0.0291, "num_input_tokens_seen": 81866408, "step": 121460 }, { "epoch": 2.9674101580631764, "grad_norm": 0.010503513738512993, "learning_rate": 8.488218469372652e-07, "loss": 0.0001, "num_input_tokens_seen": 81869480, "step": 121465 }, { "epoch": 2.9675323088950236, "grad_norm": 0.11669360101222992, "learning_rate": 8.487375500228617e-07, "loss": 0.0002, "num_input_tokens_seen": 81873192, "step": 121470 }, { "epoch": 2.967654459726871, "grad_norm": 0.25940388441085815, "learning_rate": 8.486532542084795e-07, "loss": 0.0002, "num_input_tokens_seen": 81876584, "step": 121475 }, { "epoch": 2.967776610558718, "grad_norm": 0.13429482281208038, "learning_rate": 8.485689594947314e-07, "loss": 0.0008, "num_input_tokens_seen": 81880040, "step": 121480 }, { "epoch": 2.967898761390565, "grad_norm": 23.280900955200195, "learning_rate": 8.484846658822308e-07, "loss": 0.1177, "num_input_tokens_seen": 81883560, "step": 121485 }, { "epoch": 2.9680209122224124, "grad_norm": 0.06801927834749222, "learning_rate": 8.484003733715902e-07, "loss": 0.027, "num_input_tokens_seen": 81887016, "step": 121490 }, { "epoch": 2.968143063054259, "grad_norm": 0.002891592215746641, "learning_rate": 8.483160819634232e-07, "loss": 0.0002, "num_input_tokens_seen": 81890792, "step": 121495 }, { "epoch": 2.9682652138861068, "grad_norm": 0.0072302971966564655, "learning_rate": 8.482317916583422e-07, "loss": 0.0389, "num_input_tokens_seen": 81894056, "step": 121500 }, { "epoch": 2.9683873647179535, "grad_norm": 0.05241973325610161, "learning_rate": 8.481475024569602e-07, "loss": 0.0513, "num_input_tokens_seen": 81897320, "step": 121505 }, { "epoch": 2.968509515549801, "grad_norm": 0.029552537947893143, "learning_rate": 8.480632143598909e-07, "loss": 0.1138, "num_input_tokens_seen": 81900392, "step": 121510 }, { "epoch": 2.968631666381648, "grad_norm": 0.042538851499557495, "learning_rate": 8.479789273677465e-07, "loss": 0.0574, "num_input_tokens_seen": 81903656, "step": 121515 }, { "epoch": 2.968753817213495, "grad_norm": 0.05005302652716637, "learning_rate": 8.478946414811403e-07, "loss": 0.0004, "num_input_tokens_seen": 81906856, "step": 121520 }, { "epoch": 2.9688759680453423, "grad_norm": 0.1344650685787201, "learning_rate": 8.478103567006853e-07, "loss": 0.0002, "num_input_tokens_seen": 81910312, "step": 121525 }, { "epoch": 2.9689981188771895, "grad_norm": 0.02481931447982788, "learning_rate": 8.477260730269944e-07, "loss": 0.0725, "num_input_tokens_seen": 81913576, "step": 121530 }, { "epoch": 2.9691202697090366, "grad_norm": 0.001746109570376575, "learning_rate": 8.4764179046068e-07, "loss": 0.0002, "num_input_tokens_seen": 81917224, "step": 121535 }, { "epoch": 2.969242420540884, "grad_norm": 0.00772502226755023, "learning_rate": 8.475575090023555e-07, "loss": 0.0007, "num_input_tokens_seen": 81920744, "step": 121540 }, { "epoch": 2.969364571372731, "grad_norm": 0.010284801945090294, "learning_rate": 8.474732286526342e-07, "loss": 0.0429, "num_input_tokens_seen": 81924072, "step": 121545 }, { "epoch": 2.969486722204578, "grad_norm": 17.817644119262695, "learning_rate": 8.473889494121282e-07, "loss": 0.0468, "num_input_tokens_seen": 81927336, "step": 121550 }, { "epoch": 2.9696088730364254, "grad_norm": 0.07845073193311691, "learning_rate": 8.473046712814513e-07, "loss": 0.0852, "num_input_tokens_seen": 81930664, "step": 121555 }, { "epoch": 2.9697310238682726, "grad_norm": 0.22656197845935822, "learning_rate": 8.472203942612154e-07, "loss": 0.0011, "num_input_tokens_seen": 81933736, "step": 121560 }, { "epoch": 2.96985317470012, "grad_norm": 0.041642528027296066, "learning_rate": 8.471361183520341e-07, "loss": 0.042, "num_input_tokens_seen": 81937384, "step": 121565 }, { "epoch": 2.969975325531967, "grad_norm": 0.018616296350955963, "learning_rate": 8.470518435545202e-07, "loss": 0.0002, "num_input_tokens_seen": 81940776, "step": 121570 }, { "epoch": 2.970097476363814, "grad_norm": 0.033992212265729904, "learning_rate": 8.469675698692862e-07, "loss": 0.0001, "num_input_tokens_seen": 81944168, "step": 121575 }, { "epoch": 2.9702196271956613, "grad_norm": 0.1472531408071518, "learning_rate": 8.468832972969457e-07, "loss": 0.0002, "num_input_tokens_seen": 81947752, "step": 121580 }, { "epoch": 2.9703417780275085, "grad_norm": 33.03204345703125, "learning_rate": 8.467990258381104e-07, "loss": 0.0919, "num_input_tokens_seen": 81950952, "step": 121585 }, { "epoch": 2.9704639288593553, "grad_norm": 0.008867146447300911, "learning_rate": 8.467147554933942e-07, "loss": 0.0725, "num_input_tokens_seen": 81954216, "step": 121590 }, { "epoch": 2.970586079691203, "grad_norm": 0.011977802962064743, "learning_rate": 8.466304862634092e-07, "loss": 0.0004, "num_input_tokens_seen": 81957480, "step": 121595 }, { "epoch": 2.9707082305230497, "grad_norm": 0.12904588878154755, "learning_rate": 8.465462181487684e-07, "loss": 0.0345, "num_input_tokens_seen": 81960808, "step": 121600 }, { "epoch": 2.9708303813548973, "grad_norm": 0.07471118867397308, "learning_rate": 8.464619511500855e-07, "loss": 0.0002, "num_input_tokens_seen": 81964328, "step": 121605 }, { "epoch": 2.970952532186744, "grad_norm": 0.11465713381767273, "learning_rate": 8.463776852679718e-07, "loss": 0.0376, "num_input_tokens_seen": 81968296, "step": 121610 }, { "epoch": 2.9710746830185912, "grad_norm": 0.15388906002044678, "learning_rate": 8.462934205030417e-07, "loss": 0.0004, "num_input_tokens_seen": 81971624, "step": 121615 }, { "epoch": 2.9711968338504384, "grad_norm": 0.013246401213109493, "learning_rate": 8.462091568559067e-07, "loss": 0.0007, "num_input_tokens_seen": 81974952, "step": 121620 }, { "epoch": 2.9713189846822856, "grad_norm": 0.03343416750431061, "learning_rate": 8.461248943271802e-07, "loss": 0.0005, "num_input_tokens_seen": 81978600, "step": 121625 }, { "epoch": 2.971441135514133, "grad_norm": 0.03627277538180351, "learning_rate": 8.460406329174748e-07, "loss": 0.0003, "num_input_tokens_seen": 81981608, "step": 121630 }, { "epoch": 2.97156328634598, "grad_norm": 0.10073908418416977, "learning_rate": 8.459563726274031e-07, "loss": 0.0003, "num_input_tokens_seen": 81984872, "step": 121635 }, { "epoch": 2.971685437177827, "grad_norm": 0.1816432625055313, "learning_rate": 8.458721134575785e-07, "loss": 0.0568, "num_input_tokens_seen": 81988008, "step": 121640 }, { "epoch": 2.9718075880096744, "grad_norm": 0.016053643077611923, "learning_rate": 8.457878554086129e-07, "loss": 0.0342, "num_input_tokens_seen": 81990952, "step": 121645 }, { "epoch": 2.9719297388415216, "grad_norm": 0.09785997867584229, "learning_rate": 8.4570359848112e-07, "loss": 0.0345, "num_input_tokens_seen": 81994344, "step": 121650 }, { "epoch": 2.9720518896733688, "grad_norm": 0.00311757018789649, "learning_rate": 8.456193426757117e-07, "loss": 0.0003, "num_input_tokens_seen": 81997672, "step": 121655 }, { "epoch": 2.972174040505216, "grad_norm": 0.08872219175100327, "learning_rate": 8.455350879930009e-07, "loss": 0.0001, "num_input_tokens_seen": 82000680, "step": 121660 }, { "epoch": 2.972296191337063, "grad_norm": 0.017512673512101173, "learning_rate": 8.454508344336009e-07, "loss": 0.0002, "num_input_tokens_seen": 82003560, "step": 121665 }, { "epoch": 2.9724183421689103, "grad_norm": 0.03185400366783142, "learning_rate": 8.453665819981239e-07, "loss": 0.0001, "num_input_tokens_seen": 82006824, "step": 121670 }, { "epoch": 2.972540493000757, "grad_norm": 18.32367706298828, "learning_rate": 8.452823306871826e-07, "loss": 0.0457, "num_input_tokens_seen": 82010024, "step": 121675 }, { "epoch": 2.9726626438326047, "grad_norm": 0.19949117302894592, "learning_rate": 8.451980805013898e-07, "loss": 0.1149, "num_input_tokens_seen": 82013224, "step": 121680 }, { "epoch": 2.9727847946644514, "grad_norm": 0.14638371765613556, "learning_rate": 8.451138314413586e-07, "loss": 0.0007, "num_input_tokens_seen": 82016680, "step": 121685 }, { "epoch": 2.972906945496299, "grad_norm": 12.467910766601562, "learning_rate": 8.450295835077007e-07, "loss": 0.071, "num_input_tokens_seen": 82020072, "step": 121690 }, { "epoch": 2.973029096328146, "grad_norm": 0.11415675282478333, "learning_rate": 8.449453367010293e-07, "loss": 0.0457, "num_input_tokens_seen": 82023400, "step": 121695 }, { "epoch": 2.973151247159993, "grad_norm": 0.0506068654358387, "learning_rate": 8.448610910219577e-07, "loss": 0.0517, "num_input_tokens_seen": 82026728, "step": 121700 }, { "epoch": 2.97327339799184, "grad_norm": 0.025008076801896095, "learning_rate": 8.447768464710974e-07, "loss": 0.0089, "num_input_tokens_seen": 82030120, "step": 121705 }, { "epoch": 2.9733955488236874, "grad_norm": 0.03348412737250328, "learning_rate": 8.446926030490622e-07, "loss": 0.0001, "num_input_tokens_seen": 82033768, "step": 121710 }, { "epoch": 2.9735176996555346, "grad_norm": 0.518605649471283, "learning_rate": 8.446083607564636e-07, "loss": 0.0642, "num_input_tokens_seen": 82037224, "step": 121715 }, { "epoch": 2.9736398504873818, "grad_norm": 0.0018251192523166537, "learning_rate": 8.445241195939152e-07, "loss": 0.1574, "num_input_tokens_seen": 82040616, "step": 121720 }, { "epoch": 2.973762001319229, "grad_norm": 14.896915435791016, "learning_rate": 8.444398795620289e-07, "loss": 0.0398, "num_input_tokens_seen": 82043688, "step": 121725 }, { "epoch": 2.973884152151076, "grad_norm": 1260.587158203125, "learning_rate": 8.443556406614179e-07, "loss": 0.0905, "num_input_tokens_seen": 82047528, "step": 121730 }, { "epoch": 2.9740063029829233, "grad_norm": 0.005130738485604525, "learning_rate": 8.442714028926946e-07, "loss": 0.0304, "num_input_tokens_seen": 82050920, "step": 121735 }, { "epoch": 2.9741284538147705, "grad_norm": 42.79816818237305, "learning_rate": 8.441871662564712e-07, "loss": 0.0853, "num_input_tokens_seen": 82054120, "step": 121740 }, { "epoch": 2.9742506046466177, "grad_norm": 0.3749528229236603, "learning_rate": 8.44102930753361e-07, "loss": 0.0006, "num_input_tokens_seen": 82057384, "step": 121745 }, { "epoch": 2.974372755478465, "grad_norm": 0.13021938502788544, "learning_rate": 8.44018696383976e-07, "loss": 0.0003, "num_input_tokens_seen": 82060648, "step": 121750 }, { "epoch": 2.974494906310312, "grad_norm": 0.08072230964899063, "learning_rate": 8.439344631489287e-07, "loss": 0.0006, "num_input_tokens_seen": 82064104, "step": 121755 }, { "epoch": 2.9746170571421593, "grad_norm": 0.3794505000114441, "learning_rate": 8.438502310488326e-07, "loss": 0.0518, "num_input_tokens_seen": 82067112, "step": 121760 }, { "epoch": 2.9747392079740065, "grad_norm": 0.29038816690444946, "learning_rate": 8.437660000842991e-07, "loss": 0.0008, "num_input_tokens_seen": 82070312, "step": 121765 }, { "epoch": 2.9748613588058532, "grad_norm": 0.034577976912260056, "learning_rate": 8.436817702559417e-07, "loss": 0.1004, "num_input_tokens_seen": 82073384, "step": 121770 }, { "epoch": 2.974983509637701, "grad_norm": 33.719852447509766, "learning_rate": 8.435975415643724e-07, "loss": 0.1113, "num_input_tokens_seen": 82077032, "step": 121775 }, { "epoch": 2.9751056604695476, "grad_norm": 0.037223588675260544, "learning_rate": 8.435133140102036e-07, "loss": 0.0006, "num_input_tokens_seen": 82080040, "step": 121780 }, { "epoch": 2.975227811301395, "grad_norm": 0.01888175681233406, "learning_rate": 8.434290875940483e-07, "loss": 0.0452, "num_input_tokens_seen": 82083112, "step": 121785 }, { "epoch": 2.975349962133242, "grad_norm": 0.0968722254037857, "learning_rate": 8.433448623165185e-07, "loss": 0.0002, "num_input_tokens_seen": 82086440, "step": 121790 }, { "epoch": 2.975472112965089, "grad_norm": 0.010138538666069508, "learning_rate": 8.432606381782275e-07, "loss": 0.0002, "num_input_tokens_seen": 82089896, "step": 121795 }, { "epoch": 2.9755942637969364, "grad_norm": 0.039139967411756516, "learning_rate": 8.431764151797867e-07, "loss": 0.0003, "num_input_tokens_seen": 82092712, "step": 121800 }, { "epoch": 2.9757164146287836, "grad_norm": 0.06282107532024384, "learning_rate": 8.430921933218097e-07, "loss": 0.0001, "num_input_tokens_seen": 82095592, "step": 121805 }, { "epoch": 2.9758385654606307, "grad_norm": 0.027126094326376915, "learning_rate": 8.430079726049081e-07, "loss": 0.0001, "num_input_tokens_seen": 82098984, "step": 121810 }, { "epoch": 2.975960716292478, "grad_norm": 0.1266213208436966, "learning_rate": 8.429237530296946e-07, "loss": 0.0666, "num_input_tokens_seen": 82102184, "step": 121815 }, { "epoch": 2.976082867124325, "grad_norm": 0.0039200736209750175, "learning_rate": 8.428395345967825e-07, "loss": 0.0001, "num_input_tokens_seen": 82105192, "step": 121820 }, { "epoch": 2.9762050179561723, "grad_norm": 0.11012466996908188, "learning_rate": 8.427553173067832e-07, "loss": 0.0002, "num_input_tokens_seen": 82108392, "step": 121825 }, { "epoch": 2.9763271687880195, "grad_norm": 28.55767250061035, "learning_rate": 8.426711011603094e-07, "loss": 0.0345, "num_input_tokens_seen": 82112040, "step": 121830 }, { "epoch": 2.9764493196198667, "grad_norm": 0.008977274410426617, "learning_rate": 8.425868861579739e-07, "loss": 0.0001, "num_input_tokens_seen": 82115624, "step": 121835 }, { "epoch": 2.976571470451714, "grad_norm": 0.036314770579338074, "learning_rate": 8.425026723003889e-07, "loss": 0.0657, "num_input_tokens_seen": 82118824, "step": 121840 }, { "epoch": 2.976693621283561, "grad_norm": 0.13658422231674194, "learning_rate": 8.424184595881666e-07, "loss": 0.0003, "num_input_tokens_seen": 82122472, "step": 121845 }, { "epoch": 2.9768157721154083, "grad_norm": 0.05026088282465935, "learning_rate": 8.423342480219195e-07, "loss": 0.0441, "num_input_tokens_seen": 82126248, "step": 121850 }, { "epoch": 2.976937922947255, "grad_norm": 14.070030212402344, "learning_rate": 8.422500376022607e-07, "loss": 0.0734, "num_input_tokens_seen": 82129704, "step": 121855 }, { "epoch": 2.9770600737791026, "grad_norm": 0.08053138852119446, "learning_rate": 8.421658283298017e-07, "loss": 0.0002, "num_input_tokens_seen": 82133224, "step": 121860 }, { "epoch": 2.9771822246109494, "grad_norm": 0.018369318917393684, "learning_rate": 8.420816202051555e-07, "loss": 0.0003, "num_input_tokens_seen": 82136616, "step": 121865 }, { "epoch": 2.977304375442797, "grad_norm": 0.20027559995651245, "learning_rate": 8.419974132289338e-07, "loss": 0.0505, "num_input_tokens_seen": 82140136, "step": 121870 }, { "epoch": 2.9774265262746438, "grad_norm": 0.044966600835323334, "learning_rate": 8.419132074017499e-07, "loss": 0.0423, "num_input_tokens_seen": 82143144, "step": 121875 }, { "epoch": 2.977548677106491, "grad_norm": 6.6325201988220215, "learning_rate": 8.418290027242153e-07, "loss": 0.0182, "num_input_tokens_seen": 82146408, "step": 121880 }, { "epoch": 2.977670827938338, "grad_norm": 0.023646734654903412, "learning_rate": 8.417447991969429e-07, "loss": 0.0004, "num_input_tokens_seen": 82149864, "step": 121885 }, { "epoch": 2.9777929787701853, "grad_norm": 0.043020982295274734, "learning_rate": 8.41660596820545e-07, "loss": 0.0002, "num_input_tokens_seen": 82153512, "step": 121890 }, { "epoch": 2.9779151296020325, "grad_norm": 9.485835075378418, "learning_rate": 8.415763955956336e-07, "loss": 0.0526, "num_input_tokens_seen": 82156648, "step": 121895 }, { "epoch": 2.9780372804338797, "grad_norm": 21.41651153564453, "learning_rate": 8.414921955228216e-07, "loss": 0.0731, "num_input_tokens_seen": 82159528, "step": 121900 }, { "epoch": 2.978159431265727, "grad_norm": 0.009164262562990189, "learning_rate": 8.414079966027206e-07, "loss": 0.0002, "num_input_tokens_seen": 82162984, "step": 121905 }, { "epoch": 2.978281582097574, "grad_norm": 0.1760968714952469, "learning_rate": 8.413237988359432e-07, "loss": 0.0691, "num_input_tokens_seen": 82166440, "step": 121910 }, { "epoch": 2.9784037329294213, "grad_norm": 0.04876111447811127, "learning_rate": 8.412396022231023e-07, "loss": 0.049, "num_input_tokens_seen": 82170472, "step": 121915 }, { "epoch": 2.9785258837612685, "grad_norm": 277.4755859375, "learning_rate": 8.411554067648092e-07, "loss": 0.1157, "num_input_tokens_seen": 82173544, "step": 121920 }, { "epoch": 2.9786480345931157, "grad_norm": 0.037194229662418365, "learning_rate": 8.410712124616773e-07, "loss": 0.0765, "num_input_tokens_seen": 82177384, "step": 121925 }, { "epoch": 2.978770185424963, "grad_norm": 0.0221418384462595, "learning_rate": 8.409870193143179e-07, "loss": 0.0004, "num_input_tokens_seen": 82180776, "step": 121930 }, { "epoch": 2.97889233625681, "grad_norm": 0.011147744953632355, "learning_rate": 8.409028273233439e-07, "loss": 0.0006, "num_input_tokens_seen": 82184168, "step": 121935 }, { "epoch": 2.9790144870886572, "grad_norm": 0.011297612451016903, "learning_rate": 8.40818636489367e-07, "loss": 0.0001, "num_input_tokens_seen": 82187944, "step": 121940 }, { "epoch": 2.9791366379205044, "grad_norm": 0.02041480876505375, "learning_rate": 8.407344468129998e-07, "loss": 0.0001, "num_input_tokens_seen": 82191528, "step": 121945 }, { "epoch": 2.979258788752351, "grad_norm": 0.01931638829410076, "learning_rate": 8.40650258294855e-07, "loss": 0.0849, "num_input_tokens_seen": 82195048, "step": 121950 }, { "epoch": 2.979380939584199, "grad_norm": 0.007796446327120066, "learning_rate": 8.405660709355439e-07, "loss": 0.0, "num_input_tokens_seen": 82198248, "step": 121955 }, { "epoch": 2.9795030904160456, "grad_norm": 0.012662877328693867, "learning_rate": 8.404818847356796e-07, "loss": 0.0002, "num_input_tokens_seen": 82201704, "step": 121960 }, { "epoch": 2.9796252412478927, "grad_norm": 0.1940654069185257, "learning_rate": 8.403976996958735e-07, "loss": 0.0003, "num_input_tokens_seen": 82204968, "step": 121965 }, { "epoch": 2.97974739207974, "grad_norm": 0.0024405743461102247, "learning_rate": 8.403135158167382e-07, "loss": 0.0454, "num_input_tokens_seen": 82208104, "step": 121970 }, { "epoch": 2.979869542911587, "grad_norm": 0.010657384060323238, "learning_rate": 8.402293330988866e-07, "loss": 0.0002, "num_input_tokens_seen": 82211368, "step": 121975 }, { "epoch": 2.9799916937434343, "grad_norm": 0.06140420585870743, "learning_rate": 8.401451515429299e-07, "loss": 0.1058, "num_input_tokens_seen": 82214440, "step": 121980 }, { "epoch": 2.9801138445752815, "grad_norm": 0.022319672629237175, "learning_rate": 8.400609711494807e-07, "loss": 0.0618, "num_input_tokens_seen": 82217832, "step": 121985 }, { "epoch": 2.9802359954071287, "grad_norm": 0.05925625190138817, "learning_rate": 8.399767919191511e-07, "loss": 0.0467, "num_input_tokens_seen": 82221416, "step": 121990 }, { "epoch": 2.980358146238976, "grad_norm": 0.07047688215970993, "learning_rate": 8.398926138525536e-07, "loss": 0.0016, "num_input_tokens_seen": 82224360, "step": 121995 }, { "epoch": 2.980480297070823, "grad_norm": 25.92572021484375, "learning_rate": 8.398084369502996e-07, "loss": 0.0424, "num_input_tokens_seen": 82227560, "step": 122000 }, { "epoch": 2.9806024479026703, "grad_norm": 0.0666838213801384, "learning_rate": 8.397242612130017e-07, "loss": 0.0332, "num_input_tokens_seen": 82230632, "step": 122005 }, { "epoch": 2.9807245987345174, "grad_norm": 0.7408774495124817, "learning_rate": 8.396400866412725e-07, "loss": 0.0683, "num_input_tokens_seen": 82234088, "step": 122010 }, { "epoch": 2.9808467495663646, "grad_norm": 0.01501146424561739, "learning_rate": 8.395559132357234e-07, "loss": 0.0003, "num_input_tokens_seen": 82237416, "step": 122015 }, { "epoch": 2.980968900398212, "grad_norm": 0.35236650705337524, "learning_rate": 8.394717409969671e-07, "loss": 0.0005, "num_input_tokens_seen": 82240872, "step": 122020 }, { "epoch": 2.981091051230059, "grad_norm": 0.029871536418795586, "learning_rate": 8.393875699256152e-07, "loss": 0.1045, "num_input_tokens_seen": 82244776, "step": 122025 }, { "epoch": 2.981213202061906, "grad_norm": 0.04119180887937546, "learning_rate": 8.393034000222805e-07, "loss": 0.056, "num_input_tokens_seen": 82247912, "step": 122030 }, { "epoch": 2.981335352893753, "grad_norm": 0.037397608160972595, "learning_rate": 8.392192312875742e-07, "loss": 0.0002, "num_input_tokens_seen": 82251624, "step": 122035 }, { "epoch": 2.9814575037256006, "grad_norm": 0.009144660085439682, "learning_rate": 8.391350637221092e-07, "loss": 0.0001, "num_input_tokens_seen": 82254696, "step": 122040 }, { "epoch": 2.9815796545574473, "grad_norm": 0.04723630100488663, "learning_rate": 8.390508973264974e-07, "loss": 0.0001, "num_input_tokens_seen": 82258216, "step": 122045 }, { "epoch": 2.981701805389295, "grad_norm": 26.548973083496094, "learning_rate": 8.389667321013505e-07, "loss": 0.0971, "num_input_tokens_seen": 82261544, "step": 122050 }, { "epoch": 2.9818239562211417, "grad_norm": 0.1542336344718933, "learning_rate": 8.388825680472811e-07, "loss": 0.0002, "num_input_tokens_seen": 82264808, "step": 122055 }, { "epoch": 2.981946107052989, "grad_norm": 0.018932661041617393, "learning_rate": 8.387984051649006e-07, "loss": 0.0009, "num_input_tokens_seen": 82268072, "step": 122060 }, { "epoch": 2.982068257884836, "grad_norm": 0.2454744130373001, "learning_rate": 8.387142434548216e-07, "loss": 0.0004, "num_input_tokens_seen": 82271592, "step": 122065 }, { "epoch": 2.9821904087166833, "grad_norm": 23.384599685668945, "learning_rate": 8.386300829176563e-07, "loss": 0.0491, "num_input_tokens_seen": 82274408, "step": 122070 }, { "epoch": 2.9823125595485305, "grad_norm": 0.22480256855487823, "learning_rate": 8.38545923554016e-07, "loss": 0.0003, "num_input_tokens_seen": 82277672, "step": 122075 }, { "epoch": 2.9824347103803777, "grad_norm": 0.08951090276241302, "learning_rate": 8.384617653645136e-07, "loss": 0.0442, "num_input_tokens_seen": 82281064, "step": 122080 }, { "epoch": 2.982556861212225, "grad_norm": 8.396158218383789, "learning_rate": 8.383776083497604e-07, "loss": 0.0342, "num_input_tokens_seen": 82285096, "step": 122085 }, { "epoch": 2.982679012044072, "grad_norm": 0.02514975517988205, "learning_rate": 8.382934525103688e-07, "loss": 0.1432, "num_input_tokens_seen": 82288296, "step": 122090 }, { "epoch": 2.9828011628759192, "grad_norm": 0.1363772749900818, "learning_rate": 8.382092978469508e-07, "loss": 0.0003, "num_input_tokens_seen": 82291880, "step": 122095 }, { "epoch": 2.9829233137077664, "grad_norm": 0.01438989583402872, "learning_rate": 8.381251443601181e-07, "loss": 0.0005, "num_input_tokens_seen": 82295208, "step": 122100 }, { "epoch": 2.9830454645396136, "grad_norm": 0.04141591861844063, "learning_rate": 8.380409920504832e-07, "loss": 0.062, "num_input_tokens_seen": 82298664, "step": 122105 }, { "epoch": 2.983167615371461, "grad_norm": 0.08502558618783951, "learning_rate": 8.379568409186573e-07, "loss": 0.0502, "num_input_tokens_seen": 82301992, "step": 122110 }, { "epoch": 2.983289766203308, "grad_norm": 0.022353440523147583, "learning_rate": 8.378726909652533e-07, "loss": 0.0002, "num_input_tokens_seen": 82305448, "step": 122115 }, { "epoch": 2.9834119170351547, "grad_norm": 0.013993551954627037, "learning_rate": 8.377885421908824e-07, "loss": 0.0539, "num_input_tokens_seen": 82308392, "step": 122120 }, { "epoch": 2.9835340678670024, "grad_norm": 0.23815952241420746, "learning_rate": 8.377043945961566e-07, "loss": 0.034, "num_input_tokens_seen": 82311848, "step": 122125 }, { "epoch": 2.983656218698849, "grad_norm": 0.05029185861349106, "learning_rate": 8.376202481816888e-07, "loss": 0.0556, "num_input_tokens_seen": 82315048, "step": 122130 }, { "epoch": 2.9837783695306968, "grad_norm": 0.1486971527338028, "learning_rate": 8.375361029480898e-07, "loss": 0.0474, "num_input_tokens_seen": 82318312, "step": 122135 }, { "epoch": 2.9839005203625435, "grad_norm": 0.01855803281068802, "learning_rate": 8.374519588959721e-07, "loss": 0.0002, "num_input_tokens_seen": 82322024, "step": 122140 }, { "epoch": 2.9840226711943907, "grad_norm": 0.12863339483737946, "learning_rate": 8.373678160259474e-07, "loss": 0.0384, "num_input_tokens_seen": 82325672, "step": 122145 }, { "epoch": 2.984144822026238, "grad_norm": 0.30401062965393066, "learning_rate": 8.372836743386279e-07, "loss": 0.0317, "num_input_tokens_seen": 82329128, "step": 122150 }, { "epoch": 2.984266972858085, "grad_norm": 0.0029473446775227785, "learning_rate": 8.371995338346249e-07, "loss": 0.0001, "num_input_tokens_seen": 82332264, "step": 122155 }, { "epoch": 2.9843891236899323, "grad_norm": 0.019141053780913353, "learning_rate": 8.371153945145506e-07, "loss": 0.0006, "num_input_tokens_seen": 82335848, "step": 122160 }, { "epoch": 2.9845112745217794, "grad_norm": 0.013371425680816174, "learning_rate": 8.370312563790174e-07, "loss": 0.1012, "num_input_tokens_seen": 82339176, "step": 122165 }, { "epoch": 2.9846334253536266, "grad_norm": 0.045894015580415726, "learning_rate": 8.369471194286364e-07, "loss": 0.0005, "num_input_tokens_seen": 82342824, "step": 122170 }, { "epoch": 2.984755576185474, "grad_norm": 0.1080714762210846, "learning_rate": 8.368629836640202e-07, "loss": 0.0002, "num_input_tokens_seen": 82346024, "step": 122175 }, { "epoch": 2.984877727017321, "grad_norm": 479.940673828125, "learning_rate": 8.367788490857798e-07, "loss": 0.0257, "num_input_tokens_seen": 82349672, "step": 122180 }, { "epoch": 2.984999877849168, "grad_norm": 0.02565705217421055, "learning_rate": 8.366947156945279e-07, "loss": 0.0003, "num_input_tokens_seen": 82352872, "step": 122185 }, { "epoch": 2.9851220286810154, "grad_norm": 0.020736481994390488, "learning_rate": 8.366105834908756e-07, "loss": 0.0001, "num_input_tokens_seen": 82356200, "step": 122190 }, { "epoch": 2.9852441795128626, "grad_norm": 0.01151752658188343, "learning_rate": 8.365264524754353e-07, "loss": 0.0564, "num_input_tokens_seen": 82359848, "step": 122195 }, { "epoch": 2.9853663303447098, "grad_norm": 0.0341997891664505, "learning_rate": 8.364423226488187e-07, "loss": 0.0002, "num_input_tokens_seen": 82363176, "step": 122200 }, { "epoch": 2.985488481176557, "grad_norm": 0.0014073860365897417, "learning_rate": 8.363581940116373e-07, "loss": 0.0343, "num_input_tokens_seen": 82366632, "step": 122205 }, { "epoch": 2.985610632008404, "grad_norm": 0.041219040751457214, "learning_rate": 8.362740665645034e-07, "loss": 0.0003, "num_input_tokens_seen": 82370216, "step": 122210 }, { "epoch": 2.985732782840251, "grad_norm": 78.13631439208984, "learning_rate": 8.361899403080282e-07, "loss": 0.062, "num_input_tokens_seen": 82373864, "step": 122215 }, { "epoch": 2.9858549336720985, "grad_norm": 0.0018047604244202375, "learning_rate": 8.361058152428238e-07, "loss": 0.0334, "num_input_tokens_seen": 82377384, "step": 122220 }, { "epoch": 2.9859770845039453, "grad_norm": 0.004586088005453348, "learning_rate": 8.360216913695023e-07, "loss": 0.0373, "num_input_tokens_seen": 82380328, "step": 122225 }, { "epoch": 2.986099235335793, "grad_norm": 32.516361236572266, "learning_rate": 8.359375686886748e-07, "loss": 0.1049, "num_input_tokens_seen": 82383784, "step": 122230 }, { "epoch": 2.9862213861676397, "grad_norm": 0.005690659396350384, "learning_rate": 8.358534472009538e-07, "loss": 0.0002, "num_input_tokens_seen": 82387624, "step": 122235 }, { "epoch": 2.986343536999487, "grad_norm": 30.37091636657715, "learning_rate": 8.357693269069506e-07, "loss": 0.0445, "num_input_tokens_seen": 82391144, "step": 122240 }, { "epoch": 2.986465687831334, "grad_norm": 0.008414661511778831, "learning_rate": 8.356852078072769e-07, "loss": 0.0002, "num_input_tokens_seen": 82394280, "step": 122245 }, { "epoch": 2.9865878386631812, "grad_norm": 0.019386325031518936, "learning_rate": 8.356010899025448e-07, "loss": 0.0002, "num_input_tokens_seen": 82397608, "step": 122250 }, { "epoch": 2.9867099894950284, "grad_norm": 0.02001720480620861, "learning_rate": 8.355169731933654e-07, "loss": 0.0866, "num_input_tokens_seen": 82400936, "step": 122255 }, { "epoch": 2.9868321403268756, "grad_norm": 0.12658660113811493, "learning_rate": 8.354328576803511e-07, "loss": 0.1082, "num_input_tokens_seen": 82404072, "step": 122260 }, { "epoch": 2.986954291158723, "grad_norm": 2.110907554626465, "learning_rate": 8.353487433641131e-07, "loss": 0.001, "num_input_tokens_seen": 82407912, "step": 122265 }, { "epoch": 2.98707644199057, "grad_norm": 0.002116686664521694, "learning_rate": 8.352646302452637e-07, "loss": 0.0002, "num_input_tokens_seen": 82411112, "step": 122270 }, { "epoch": 2.987198592822417, "grad_norm": 0.05068863555788994, "learning_rate": 8.351805183244137e-07, "loss": 0.0992, "num_input_tokens_seen": 82414696, "step": 122275 }, { "epoch": 2.9873207436542644, "grad_norm": 0.03697684034705162, "learning_rate": 8.350964076021754e-07, "loss": 0.0001, "num_input_tokens_seen": 82418216, "step": 122280 }, { "epoch": 2.9874428944861116, "grad_norm": 0.3347267210483551, "learning_rate": 8.350122980791608e-07, "loss": 0.0007, "num_input_tokens_seen": 82421352, "step": 122285 }, { "epoch": 2.9875650453179587, "grad_norm": 0.14682719111442566, "learning_rate": 8.349281897559808e-07, "loss": 0.0323, "num_input_tokens_seen": 82424680, "step": 122290 }, { "epoch": 2.987687196149806, "grad_norm": 0.3761098384857178, "learning_rate": 8.348440826332477e-07, "loss": 0.0002, "num_input_tokens_seen": 82427816, "step": 122295 }, { "epoch": 2.9878093469816527, "grad_norm": 0.006452401168644428, "learning_rate": 8.347599767115726e-07, "loss": 0.0663, "num_input_tokens_seen": 82431464, "step": 122300 }, { "epoch": 2.9879314978135003, "grad_norm": 0.0023743468336760998, "learning_rate": 8.346758719915677e-07, "loss": 0.0001, "num_input_tokens_seen": 82434664, "step": 122305 }, { "epoch": 2.988053648645347, "grad_norm": 0.5880225896835327, "learning_rate": 8.345917684738439e-07, "loss": 0.0007, "num_input_tokens_seen": 82438056, "step": 122310 }, { "epoch": 2.9881757994771947, "grad_norm": 0.16489212214946747, "learning_rate": 8.345076661590133e-07, "loss": 0.0277, "num_input_tokens_seen": 82441448, "step": 122315 }, { "epoch": 2.9882979503090414, "grad_norm": 0.000908148183953017, "learning_rate": 8.344235650476878e-07, "loss": 0.0001, "num_input_tokens_seen": 82447208, "step": 122320 }, { "epoch": 2.9884201011408886, "grad_norm": 0.015482887625694275, "learning_rate": 8.343394651404783e-07, "loss": 0.0547, "num_input_tokens_seen": 82450472, "step": 122325 }, { "epoch": 2.988542251972736, "grad_norm": 0.1507757008075714, "learning_rate": 8.342553664379971e-07, "loss": 0.0001, "num_input_tokens_seen": 82454184, "step": 122330 }, { "epoch": 2.988664402804583, "grad_norm": 0.09877260029315948, "learning_rate": 8.341712689408551e-07, "loss": 0.1056, "num_input_tokens_seen": 82457640, "step": 122335 }, { "epoch": 2.98878655363643, "grad_norm": 0.24767570197582245, "learning_rate": 8.340871726496646e-07, "loss": 0.1443, "num_input_tokens_seen": 82460904, "step": 122340 }, { "epoch": 2.9889087044682774, "grad_norm": 0.012325254268944263, "learning_rate": 8.340030775650364e-07, "loss": 0.0491, "num_input_tokens_seen": 82464808, "step": 122345 }, { "epoch": 2.9890308553001246, "grad_norm": 0.045981768518686295, "learning_rate": 8.339189836875827e-07, "loss": 0.0002, "num_input_tokens_seen": 82468136, "step": 122350 }, { "epoch": 2.9891530061319718, "grad_norm": 0.023901034146547318, "learning_rate": 8.338348910179151e-07, "loss": 0.0624, "num_input_tokens_seen": 82471400, "step": 122355 }, { "epoch": 2.989275156963819, "grad_norm": 121.82840728759766, "learning_rate": 8.337507995566444e-07, "loss": 0.0363, "num_input_tokens_seen": 82474792, "step": 122360 }, { "epoch": 2.989397307795666, "grad_norm": 0.009395765140652657, "learning_rate": 8.33666709304383e-07, "loss": 0.0002, "num_input_tokens_seen": 82478248, "step": 122365 }, { "epoch": 2.9895194586275133, "grad_norm": 0.12960787117481232, "learning_rate": 8.335826202617416e-07, "loss": 0.0351, "num_input_tokens_seen": 82481960, "step": 122370 }, { "epoch": 2.9896416094593605, "grad_norm": 0.004650300834327936, "learning_rate": 8.334985324293321e-07, "loss": 0.0016, "num_input_tokens_seen": 82484904, "step": 122375 }, { "epoch": 2.9897637602912077, "grad_norm": 0.07671540230512619, "learning_rate": 8.334144458077665e-07, "loss": 0.0666, "num_input_tokens_seen": 82488488, "step": 122380 }, { "epoch": 2.989885911123055, "grad_norm": 0.06331785023212433, "learning_rate": 8.333303603976555e-07, "loss": 0.0276, "num_input_tokens_seen": 82491432, "step": 122385 }, { "epoch": 2.990008061954902, "grad_norm": 0.011702315881848335, "learning_rate": 8.332462761996114e-07, "loss": 0.0001, "num_input_tokens_seen": 82494568, "step": 122390 }, { "epoch": 2.990130212786749, "grad_norm": 0.0312877893447876, "learning_rate": 8.33162193214245e-07, "loss": 0.0772, "num_input_tokens_seen": 82497640, "step": 122395 }, { "epoch": 2.9902523636185965, "grad_norm": 29.36800193786621, "learning_rate": 8.330781114421678e-07, "loss": 0.0973, "num_input_tokens_seen": 82501352, "step": 122400 }, { "epoch": 2.990374514450443, "grad_norm": 0.15311099588871002, "learning_rate": 8.329940308839918e-07, "loss": 0.0007, "num_input_tokens_seen": 82504424, "step": 122405 }, { "epoch": 2.9904966652822904, "grad_norm": 0.008352646604180336, "learning_rate": 8.329099515403277e-07, "loss": 0.0004, "num_input_tokens_seen": 82507752, "step": 122410 }, { "epoch": 2.9906188161141376, "grad_norm": 0.03414013236761093, "learning_rate": 8.328258734117879e-07, "loss": 0.0001, "num_input_tokens_seen": 82510952, "step": 122415 }, { "epoch": 2.990740966945985, "grad_norm": 0.03758617490530014, "learning_rate": 8.327417964989827e-07, "loss": 0.0692, "num_input_tokens_seen": 82513896, "step": 122420 }, { "epoch": 2.990863117777832, "grad_norm": 0.05610216408967972, "learning_rate": 8.326577208025247e-07, "loss": 0.0001, "num_input_tokens_seen": 82517160, "step": 122425 }, { "epoch": 2.990985268609679, "grad_norm": 0.025218749418854713, "learning_rate": 8.325736463230244e-07, "loss": 0.0538, "num_input_tokens_seen": 82521064, "step": 122430 }, { "epoch": 2.9911074194415264, "grad_norm": 0.19947555661201477, "learning_rate": 8.324895730610939e-07, "loss": 0.0005, "num_input_tokens_seen": 82524264, "step": 122435 }, { "epoch": 2.9912295702733736, "grad_norm": 0.022488972172141075, "learning_rate": 8.32405501017344e-07, "loss": 0.0017, "num_input_tokens_seen": 82527208, "step": 122440 }, { "epoch": 2.9913517211052207, "grad_norm": 0.1366833597421646, "learning_rate": 8.323214301923865e-07, "loss": 0.0107, "num_input_tokens_seen": 82530472, "step": 122445 }, { "epoch": 2.991473871937068, "grad_norm": 0.5030769109725952, "learning_rate": 8.322373605868326e-07, "loss": 0.0723, "num_input_tokens_seen": 82534056, "step": 122450 }, { "epoch": 2.991596022768915, "grad_norm": 0.016812171787023544, "learning_rate": 8.321532922012937e-07, "loss": 0.0001, "num_input_tokens_seen": 82537832, "step": 122455 }, { "epoch": 2.9917181736007623, "grad_norm": 0.004857075400650501, "learning_rate": 8.320692250363816e-07, "loss": 0.0827, "num_input_tokens_seen": 82541352, "step": 122460 }, { "epoch": 2.9918403244326095, "grad_norm": 0.17521388828754425, "learning_rate": 8.319851590927067e-07, "loss": 0.0002, "num_input_tokens_seen": 82544744, "step": 122465 }, { "epoch": 2.9919624752644567, "grad_norm": 0.08017278462648392, "learning_rate": 8.31901094370881e-07, "loss": 0.0006, "num_input_tokens_seen": 82548584, "step": 122470 }, { "epoch": 2.992084626096304, "grad_norm": 0.027329741045832634, "learning_rate": 8.318170308715161e-07, "loss": 0.0001, "num_input_tokens_seen": 82551976, "step": 122475 }, { "epoch": 2.9922067769281506, "grad_norm": 0.011738654226064682, "learning_rate": 8.317329685952226e-07, "loss": 0.1283, "num_input_tokens_seen": 82555304, "step": 122480 }, { "epoch": 2.9923289277599983, "grad_norm": 0.06388230621814728, "learning_rate": 8.316489075426127e-07, "loss": 0.0862, "num_input_tokens_seen": 82558312, "step": 122485 }, { "epoch": 2.992451078591845, "grad_norm": 0.16729193925857544, "learning_rate": 8.315648477142967e-07, "loss": 0.0003, "num_input_tokens_seen": 82561320, "step": 122490 }, { "epoch": 2.9925732294236926, "grad_norm": 0.012937177903950214, "learning_rate": 8.314807891108869e-07, "loss": 0.0004, "num_input_tokens_seen": 82564776, "step": 122495 }, { "epoch": 2.9926953802555394, "grad_norm": 0.0036519125569611788, "learning_rate": 8.313967317329936e-07, "loss": 0.0001, "num_input_tokens_seen": 82568296, "step": 122500 }, { "epoch": 2.9928175310873866, "grad_norm": 0.005260920617729425, "learning_rate": 8.313126755812289e-07, "loss": 0.0401, "num_input_tokens_seen": 82571304, "step": 122505 }, { "epoch": 2.9929396819192338, "grad_norm": 0.007121355272829533, "learning_rate": 8.31228620656204e-07, "loss": 0.0003, "num_input_tokens_seen": 82574312, "step": 122510 }, { "epoch": 2.993061832751081, "grad_norm": 0.03562138229608536, "learning_rate": 8.311445669585297e-07, "loss": 0.0303, "num_input_tokens_seen": 82578536, "step": 122515 }, { "epoch": 2.993183983582928, "grad_norm": 0.07383058965206146, "learning_rate": 8.310605144888177e-07, "loss": 0.0379, "num_input_tokens_seen": 82581736, "step": 122520 }, { "epoch": 2.9933061344147753, "grad_norm": 37.444820404052734, "learning_rate": 8.309764632476788e-07, "loss": 0.0499, "num_input_tokens_seen": 82584872, "step": 122525 }, { "epoch": 2.9934282852466225, "grad_norm": 16.04447364807129, "learning_rate": 8.308924132357245e-07, "loss": 0.0547, "num_input_tokens_seen": 82587752, "step": 122530 }, { "epoch": 2.9935504360784697, "grad_norm": 0.05529216304421425, "learning_rate": 8.308083644535665e-07, "loss": 0.0685, "num_input_tokens_seen": 82590952, "step": 122535 }, { "epoch": 2.993672586910317, "grad_norm": 0.19819395244121552, "learning_rate": 8.307243169018151e-07, "loss": 0.1417, "num_input_tokens_seen": 82594344, "step": 122540 }, { "epoch": 2.993794737742164, "grad_norm": 0.3649430274963379, "learning_rate": 8.306402705810824e-07, "loss": 0.1196, "num_input_tokens_seen": 82597864, "step": 122545 }, { "epoch": 2.9939168885740113, "grad_norm": 1411.5084228515625, "learning_rate": 8.305562254919791e-07, "loss": 0.035, "num_input_tokens_seen": 82601064, "step": 122550 }, { "epoch": 2.9940390394058585, "grad_norm": 0.06733270734548569, "learning_rate": 8.304721816351164e-07, "loss": 0.0011, "num_input_tokens_seen": 82604520, "step": 122555 }, { "epoch": 2.9941611902377057, "grad_norm": 0.03253524750471115, "learning_rate": 8.303881390111056e-07, "loss": 0.0345, "num_input_tokens_seen": 82607720, "step": 122560 }, { "epoch": 2.9942833410695524, "grad_norm": 0.035487256944179535, "learning_rate": 8.303040976205578e-07, "loss": 0.0002, "num_input_tokens_seen": 82610920, "step": 122565 }, { "epoch": 2.9944054919014, "grad_norm": 0.019332388415932655, "learning_rate": 8.302200574640845e-07, "loss": 0.0001, "num_input_tokens_seen": 82614632, "step": 122570 }, { "epoch": 2.994527642733247, "grad_norm": 0.35883399844169617, "learning_rate": 8.301360185422963e-07, "loss": 0.0005, "num_input_tokens_seen": 82617896, "step": 122575 }, { "epoch": 2.9946497935650944, "grad_norm": 0.93114173412323, "learning_rate": 8.30051980855805e-07, "loss": 0.1424, "num_input_tokens_seen": 82621608, "step": 122580 }, { "epoch": 2.994771944396941, "grad_norm": 0.0395393930375576, "learning_rate": 8.29967944405221e-07, "loss": 0.0424, "num_input_tokens_seen": 82624680, "step": 122585 }, { "epoch": 2.9948940952287884, "grad_norm": 0.058739159256219864, "learning_rate": 8.298839091911562e-07, "loss": 0.0853, "num_input_tokens_seen": 82628200, "step": 122590 }, { "epoch": 2.9950162460606355, "grad_norm": 0.02455005794763565, "learning_rate": 8.297998752142211e-07, "loss": 0.0002, "num_input_tokens_seen": 82631720, "step": 122595 }, { "epoch": 2.9951383968924827, "grad_norm": 0.5319987535476685, "learning_rate": 8.297158424750272e-07, "loss": 0.0006, "num_input_tokens_seen": 82635112, "step": 122600 }, { "epoch": 2.99526054772433, "grad_norm": 0.1334734559059143, "learning_rate": 8.296318109741856e-07, "loss": 0.1043, "num_input_tokens_seen": 82638632, "step": 122605 }, { "epoch": 2.995382698556177, "grad_norm": 0.023001959547400475, "learning_rate": 8.295477807123071e-07, "loss": 0.0001, "num_input_tokens_seen": 82642152, "step": 122610 }, { "epoch": 2.9955048493880243, "grad_norm": 10.496086120605469, "learning_rate": 8.294637516900034e-07, "loss": 0.1018, "num_input_tokens_seen": 82645544, "step": 122615 }, { "epoch": 2.9956270002198715, "grad_norm": 0.02662235125899315, "learning_rate": 8.293797239078846e-07, "loss": 0.0574, "num_input_tokens_seen": 82648936, "step": 122620 }, { "epoch": 2.9957491510517187, "grad_norm": 0.2781931459903717, "learning_rate": 8.292956973665624e-07, "loss": 0.0003, "num_input_tokens_seen": 82652008, "step": 122625 }, { "epoch": 2.995871301883566, "grad_norm": 0.027957888320088387, "learning_rate": 8.292116720666482e-07, "loss": 0.0001, "num_input_tokens_seen": 82655336, "step": 122630 }, { "epoch": 2.995993452715413, "grad_norm": 0.04008051007986069, "learning_rate": 8.291276480087522e-07, "loss": 0.0002, "num_input_tokens_seen": 82658280, "step": 122635 }, { "epoch": 2.9961156035472603, "grad_norm": 0.09257989376783371, "learning_rate": 8.290436251934865e-07, "loss": 0.0589, "num_input_tokens_seen": 82661864, "step": 122640 }, { "epoch": 2.9962377543791074, "grad_norm": 0.09206127375364304, "learning_rate": 8.289596036214609e-07, "loss": 0.0005, "num_input_tokens_seen": 82664936, "step": 122645 }, { "epoch": 2.9963599052109546, "grad_norm": 0.2243671715259552, "learning_rate": 8.288755832932877e-07, "loss": 0.0001, "num_input_tokens_seen": 82668584, "step": 122650 }, { "epoch": 2.996482056042802, "grad_norm": 0.06528321653604507, "learning_rate": 8.287915642095766e-07, "loss": 0.0285, "num_input_tokens_seen": 82671528, "step": 122655 }, { "epoch": 2.9966042068746486, "grad_norm": 0.01274157129228115, "learning_rate": 8.287075463709396e-07, "loss": 0.0394, "num_input_tokens_seen": 82674600, "step": 122660 }, { "epoch": 2.996726357706496, "grad_norm": 0.09428822249174118, "learning_rate": 8.286235297779878e-07, "loss": 0.0007, "num_input_tokens_seen": 82677800, "step": 122665 }, { "epoch": 2.996848508538343, "grad_norm": 0.008868207223713398, "learning_rate": 8.285395144313312e-07, "loss": 0.002, "num_input_tokens_seen": 82681320, "step": 122670 }, { "epoch": 2.9969706593701906, "grad_norm": 0.005609673913568258, "learning_rate": 8.284555003315819e-07, "loss": 0.0726, "num_input_tokens_seen": 82684072, "step": 122675 }, { "epoch": 2.9970928102020373, "grad_norm": 0.2403886914253235, "learning_rate": 8.283714874793497e-07, "loss": 0.0866, "num_input_tokens_seen": 82687528, "step": 122680 }, { "epoch": 2.9972149610338845, "grad_norm": 224.93780517578125, "learning_rate": 8.282874758752464e-07, "loss": 0.0152, "num_input_tokens_seen": 82691048, "step": 122685 }, { "epoch": 2.9973371118657317, "grad_norm": 0.33052942156791687, "learning_rate": 8.28203465519883e-07, "loss": 0.0684, "num_input_tokens_seen": 82694120, "step": 122690 }, { "epoch": 2.997459262697579, "grad_norm": 0.11541890352964401, "learning_rate": 8.2811945641387e-07, "loss": 0.0517, "num_input_tokens_seen": 82697512, "step": 122695 }, { "epoch": 2.997581413529426, "grad_norm": 0.03359320014715195, "learning_rate": 8.280354485578188e-07, "loss": 0.0149, "num_input_tokens_seen": 82700648, "step": 122700 }, { "epoch": 2.9977035643612733, "grad_norm": 0.10430304706096649, "learning_rate": 8.2795144195234e-07, "loss": 0.0664, "num_input_tokens_seen": 82704232, "step": 122705 }, { "epoch": 2.9978257151931205, "grad_norm": 0.04724357649683952, "learning_rate": 8.278674365980445e-07, "loss": 0.0004, "num_input_tokens_seen": 82707688, "step": 122710 }, { "epoch": 2.9979478660249677, "grad_norm": 0.059274137020111084, "learning_rate": 8.277834324955433e-07, "loss": 0.0002, "num_input_tokens_seen": 82710760, "step": 122715 }, { "epoch": 2.998070016856815, "grad_norm": 0.0280942153185606, "learning_rate": 8.276994296454471e-07, "loss": 0.0002, "num_input_tokens_seen": 82714024, "step": 122720 }, { "epoch": 2.998192167688662, "grad_norm": 0.48792269825935364, "learning_rate": 8.276154280483674e-07, "loss": 0.0016, "num_input_tokens_seen": 82717352, "step": 122725 }, { "epoch": 2.9983143185205092, "grad_norm": 0.20319870114326477, "learning_rate": 8.275314277049144e-07, "loss": 0.038, "num_input_tokens_seen": 82720616, "step": 122730 }, { "epoch": 2.9984364693523564, "grad_norm": 0.09327713400125504, "learning_rate": 8.274474286156994e-07, "loss": 0.0933, "num_input_tokens_seen": 82723688, "step": 122735 }, { "epoch": 2.9985586201842036, "grad_norm": 0.06357068568468094, "learning_rate": 8.273634307813329e-07, "loss": 0.0555, "num_input_tokens_seen": 82726888, "step": 122740 }, { "epoch": 2.9986807710160504, "grad_norm": 0.10009672492742538, "learning_rate": 8.272794342024263e-07, "loss": 0.0492, "num_input_tokens_seen": 82730216, "step": 122745 }, { "epoch": 2.998802921847898, "grad_norm": 0.1424786001443863, "learning_rate": 8.271954388795897e-07, "loss": 0.0003, "num_input_tokens_seen": 82733480, "step": 122750 }, { "epoch": 2.9989250726797447, "grad_norm": 0.007452080957591534, "learning_rate": 8.271114448134345e-07, "loss": 0.0561, "num_input_tokens_seen": 82736680, "step": 122755 }, { "epoch": 2.9990472235115924, "grad_norm": 0.02739526517689228, "learning_rate": 8.270274520045715e-07, "loss": 0.0001, "num_input_tokens_seen": 82740072, "step": 122760 }, { "epoch": 2.999169374343439, "grad_norm": 0.11577534675598145, "learning_rate": 8.269434604536113e-07, "loss": 0.0002, "num_input_tokens_seen": 82743272, "step": 122765 }, { "epoch": 2.9992915251752863, "grad_norm": 0.02286659926176071, "learning_rate": 8.268594701611651e-07, "loss": 0.0002, "num_input_tokens_seen": 82746600, "step": 122770 }, { "epoch": 2.9994136760071335, "grad_norm": 33.681095123291016, "learning_rate": 8.267754811278429e-07, "loss": 0.0518, "num_input_tokens_seen": 82750248, "step": 122775 }, { "epoch": 2.9995358268389807, "grad_norm": 0.03711327537894249, "learning_rate": 8.266914933542559e-07, "loss": 0.0044, "num_input_tokens_seen": 82753768, "step": 122780 }, { "epoch": 2.999657977670828, "grad_norm": 54.62702941894531, "learning_rate": 8.266075068410156e-07, "loss": 0.1181, "num_input_tokens_seen": 82757416, "step": 122785 }, { "epoch": 2.999780128502675, "grad_norm": 0.03556351736187935, "learning_rate": 8.265235215887317e-07, "loss": 0.0627, "num_input_tokens_seen": 82760936, "step": 122790 }, { "epoch": 2.9999022793345222, "grad_norm": 0.003964877687394619, "learning_rate": 8.264395375980156e-07, "loss": 0.0001, "num_input_tokens_seen": 82763880, "step": 122795 }, { "epoch": 3.0000244301663694, "grad_norm": 0.03248461335897446, "learning_rate": 8.263555548694777e-07, "loss": 0.0002, "num_input_tokens_seen": 82767184, "step": 122800 }, { "epoch": 3.0001465809982166, "grad_norm": 0.0407659187912941, "learning_rate": 8.262715734037292e-07, "loss": 0.0003, "num_input_tokens_seen": 82770064, "step": 122805 }, { "epoch": 3.000219871497325, "eval_loss": 0.19360828399658203, "eval_runtime": 47.722, "eval_samples_per_second": 762.437, "eval_steps_per_second": 95.323, "num_input_tokens_seen": 82772304, "step": 122808 }, { "epoch": 3.000268731830064, "grad_norm": 0.08304096758365631, "learning_rate": 8.261875932013802e-07, "loss": 0.0303, "num_input_tokens_seen": 82774032, "step": 122810 }, { "epoch": 3.000390882661911, "grad_norm": 0.012755513191223145, "learning_rate": 8.26103614263042e-07, "loss": 0.0002, "num_input_tokens_seen": 82777744, "step": 122815 }, { "epoch": 3.000513033493758, "grad_norm": 16.014375686645508, "learning_rate": 8.260196365893252e-07, "loss": 0.0634, "num_input_tokens_seen": 82781072, "step": 122820 }, { "epoch": 3.0006351843256054, "grad_norm": 0.07999828457832336, "learning_rate": 8.2593566018084e-07, "loss": 0.0001, "num_input_tokens_seen": 82784400, "step": 122825 }, { "epoch": 3.0007573351574526, "grad_norm": 0.029006067663431168, "learning_rate": 8.25851685038198e-07, "loss": 0.0001, "num_input_tokens_seen": 82787472, "step": 122830 }, { "epoch": 3.0008794859892998, "grad_norm": 0.030935434624552727, "learning_rate": 8.257677111620089e-07, "loss": 0.0002, "num_input_tokens_seen": 82790864, "step": 122835 }, { "epoch": 3.001001636821147, "grad_norm": 60.06044006347656, "learning_rate": 8.256837385528839e-07, "loss": 0.0135, "num_input_tokens_seen": 82794384, "step": 122840 }, { "epoch": 3.0011237876529937, "grad_norm": 0.09562061727046967, "learning_rate": 8.25599767211434e-07, "loss": 0.0002, "num_input_tokens_seen": 82798288, "step": 122845 }, { "epoch": 3.001245938484841, "grad_norm": 0.004097535274922848, "learning_rate": 8.255157971382691e-07, "loss": 0.0, "num_input_tokens_seen": 82801232, "step": 122850 }, { "epoch": 3.001368089316688, "grad_norm": 0.052044086158275604, "learning_rate": 8.254318283340007e-07, "loss": 0.0683, "num_input_tokens_seen": 82804304, "step": 122855 }, { "epoch": 3.0014902401485353, "grad_norm": 0.09695622324943542, "learning_rate": 8.253478607992388e-07, "loss": 0.0001, "num_input_tokens_seen": 82807568, "step": 122860 }, { "epoch": 3.0016123909803825, "grad_norm": 0.013213934376835823, "learning_rate": 8.252638945345942e-07, "loss": 0.0001, "num_input_tokens_seen": 82810768, "step": 122865 }, { "epoch": 3.0017345418122297, "grad_norm": 0.07854120433330536, "learning_rate": 8.251799295406776e-07, "loss": 0.0001, "num_input_tokens_seen": 82814096, "step": 122870 }, { "epoch": 3.001856692644077, "grad_norm": 0.04113243520259857, "learning_rate": 8.250959658180993e-07, "loss": 0.0, "num_input_tokens_seen": 82817872, "step": 122875 }, { "epoch": 3.001978843475924, "grad_norm": 0.014978439547121525, "learning_rate": 8.250120033674706e-07, "loss": 0.0002, "num_input_tokens_seen": 82821456, "step": 122880 }, { "epoch": 3.002100994307771, "grad_norm": 0.09572244435548782, "learning_rate": 8.249280421894012e-07, "loss": 0.0001, "num_input_tokens_seen": 82824720, "step": 122885 }, { "epoch": 3.0022231451396184, "grad_norm": 0.00526698911562562, "learning_rate": 8.248440822845028e-07, "loss": 0.0, "num_input_tokens_seen": 82828432, "step": 122890 }, { "epoch": 3.0023452959714656, "grad_norm": 0.03695838525891304, "learning_rate": 8.247601236533848e-07, "loss": 0.001, "num_input_tokens_seen": 82831440, "step": 122895 }, { "epoch": 3.002467446803313, "grad_norm": 0.04748551920056343, "learning_rate": 8.246761662966587e-07, "loss": 0.0001, "num_input_tokens_seen": 82835152, "step": 122900 }, { "epoch": 3.00258959763516, "grad_norm": 0.005985293071717024, "learning_rate": 8.245922102149343e-07, "loss": 0.0001, "num_input_tokens_seen": 82838800, "step": 122905 }, { "epoch": 3.002711748467007, "grad_norm": 0.02861677296459675, "learning_rate": 8.245082554088228e-07, "loss": 0.0001, "num_input_tokens_seen": 82841936, "step": 122910 }, { "epoch": 3.0028338992988544, "grad_norm": 0.03024284727871418, "learning_rate": 8.244243018789343e-07, "loss": 0.0001, "num_input_tokens_seen": 82845328, "step": 122915 }, { "epoch": 3.0029560501307015, "grad_norm": 0.010547067038714886, "learning_rate": 8.243403496258797e-07, "loss": 0.0391, "num_input_tokens_seen": 82848592, "step": 122920 }, { "epoch": 3.0030782009625487, "grad_norm": 0.02874787710607052, "learning_rate": 8.242563986502693e-07, "loss": 0.0016, "num_input_tokens_seen": 82852368, "step": 122925 }, { "epoch": 3.003200351794396, "grad_norm": 0.0258303415030241, "learning_rate": 8.241724489527134e-07, "loss": 0.0001, "num_input_tokens_seen": 82856016, "step": 122930 }, { "epoch": 3.0033225026262427, "grad_norm": 0.012022108770906925, "learning_rate": 8.240885005338227e-07, "loss": 0.0094, "num_input_tokens_seen": 82859344, "step": 122935 }, { "epoch": 3.00344465345809, "grad_norm": 0.28652423620224, "learning_rate": 8.240045533942081e-07, "loss": 0.0081, "num_input_tokens_seen": 82862288, "step": 122940 }, { "epoch": 3.003566804289937, "grad_norm": 0.0074856579303741455, "learning_rate": 8.239206075344793e-07, "loss": 0.0, "num_input_tokens_seen": 82865296, "step": 122945 }, { "epoch": 3.0036889551217842, "grad_norm": 0.003348552156239748, "learning_rate": 8.238366629552478e-07, "loss": 0.0001, "num_input_tokens_seen": 82868880, "step": 122950 }, { "epoch": 3.0038111059536314, "grad_norm": 0.008183478377759457, "learning_rate": 8.237527196571229e-07, "loss": 0.0431, "num_input_tokens_seen": 82872144, "step": 122955 }, { "epoch": 3.0039332567854786, "grad_norm": 0.15718647837638855, "learning_rate": 8.23668777640716e-07, "loss": 0.0001, "num_input_tokens_seen": 82875216, "step": 122960 }, { "epoch": 3.004055407617326, "grad_norm": 236.15402221679688, "learning_rate": 8.23584836906637e-07, "loss": 0.0294, "num_input_tokens_seen": 82878736, "step": 122965 }, { "epoch": 3.004177558449173, "grad_norm": 0.007492092438042164, "learning_rate": 8.235008974554964e-07, "loss": 0.0, "num_input_tokens_seen": 82882384, "step": 122970 }, { "epoch": 3.00429970928102, "grad_norm": 0.00372646888718009, "learning_rate": 8.234169592879053e-07, "loss": 0.0, "num_input_tokens_seen": 82885648, "step": 122975 }, { "epoch": 3.0044218601128674, "grad_norm": 0.006116523407399654, "learning_rate": 8.233330224044728e-07, "loss": 0.0002, "num_input_tokens_seen": 82889104, "step": 122980 }, { "epoch": 3.0045440109447146, "grad_norm": 0.0030098408460617065, "learning_rate": 8.232490868058106e-07, "loss": 0.0, "num_input_tokens_seen": 82892688, "step": 122985 }, { "epoch": 3.0046661617765618, "grad_norm": 0.1593664586544037, "learning_rate": 8.231651524925283e-07, "loss": 0.0001, "num_input_tokens_seen": 82895888, "step": 122990 }, { "epoch": 3.004788312608409, "grad_norm": 0.022927800193428993, "learning_rate": 8.230812194652369e-07, "loss": 0.0002, "num_input_tokens_seen": 82899024, "step": 122995 }, { "epoch": 3.004910463440256, "grad_norm": 0.007119093555957079, "learning_rate": 8.229972877245461e-07, "loss": 0.0, "num_input_tokens_seen": 82902352, "step": 123000 }, { "epoch": 3.0050326142721033, "grad_norm": 0.002705874852836132, "learning_rate": 8.229133572710665e-07, "loss": 0.0, "num_input_tokens_seen": 82905424, "step": 123005 }, { "epoch": 3.0051547651039505, "grad_norm": 0.041886989027261734, "learning_rate": 8.228294281054091e-07, "loss": 0.0001, "num_input_tokens_seen": 82908432, "step": 123010 }, { "epoch": 3.0052769159357977, "grad_norm": 0.10048436373472214, "learning_rate": 8.227455002281835e-07, "loss": 0.0002, "num_input_tokens_seen": 82911888, "step": 123015 }, { "epoch": 3.005399066767645, "grad_norm": 0.006059782113879919, "learning_rate": 8.226615736400004e-07, "loss": 0.0, "num_input_tokens_seen": 82915152, "step": 123020 }, { "epoch": 3.0055212175994916, "grad_norm": 0.03320590406656265, "learning_rate": 8.225776483414699e-07, "loss": 0.0001, "num_input_tokens_seen": 82918416, "step": 123025 }, { "epoch": 3.005643368431339, "grad_norm": 0.0008360522333532572, "learning_rate": 8.224937243332024e-07, "loss": 0.0, "num_input_tokens_seen": 82922128, "step": 123030 }, { "epoch": 3.005765519263186, "grad_norm": 0.0301240012049675, "learning_rate": 8.224098016158087e-07, "loss": 0.0001, "num_input_tokens_seen": 82925456, "step": 123035 }, { "epoch": 3.005887670095033, "grad_norm": 0.00424900371581316, "learning_rate": 8.223258801898981e-07, "loss": 0.0476, "num_input_tokens_seen": 82928528, "step": 123040 }, { "epoch": 3.0060098209268804, "grad_norm": 0.012241334654390812, "learning_rate": 8.22241960056082e-07, "loss": 0.0001, "num_input_tokens_seen": 82932112, "step": 123045 }, { "epoch": 3.0061319717587276, "grad_norm": 0.006143426522612572, "learning_rate": 8.221580412149697e-07, "loss": 0.0307, "num_input_tokens_seen": 82935184, "step": 123050 }, { "epoch": 3.006254122590575, "grad_norm": 0.009116101078689098, "learning_rate": 8.220741236671726e-07, "loss": 0.0001, "num_input_tokens_seen": 82938704, "step": 123055 }, { "epoch": 3.006376273422422, "grad_norm": 0.02133636362850666, "learning_rate": 8.219902074132996e-07, "loss": 0.0, "num_input_tokens_seen": 82941840, "step": 123060 }, { "epoch": 3.006498424254269, "grad_norm": 0.26472222805023193, "learning_rate": 8.219062924539621e-07, "loss": 0.0003, "num_input_tokens_seen": 82944912, "step": 123065 }, { "epoch": 3.0066205750861164, "grad_norm": 0.041839998215436935, "learning_rate": 8.218223787897699e-07, "loss": 0.0001, "num_input_tokens_seen": 82947600, "step": 123070 }, { "epoch": 3.0067427259179635, "grad_norm": 0.02455715648829937, "learning_rate": 8.217384664213332e-07, "loss": 0.0, "num_input_tokens_seen": 82950928, "step": 123075 }, { "epoch": 3.0068648767498107, "grad_norm": 0.25165486335754395, "learning_rate": 8.216545553492626e-07, "loss": 0.0001, "num_input_tokens_seen": 82954256, "step": 123080 }, { "epoch": 3.006987027581658, "grad_norm": 0.0012642904184758663, "learning_rate": 8.215706455741677e-07, "loss": 0.0, "num_input_tokens_seen": 82957904, "step": 123085 }, { "epoch": 3.007109178413505, "grad_norm": 0.0021325668785721064, "learning_rate": 8.214867370966589e-07, "loss": 0.0, "num_input_tokens_seen": 82961616, "step": 123090 }, { "epoch": 3.0072313292453523, "grad_norm": 0.013873937539756298, "learning_rate": 8.214028299173471e-07, "loss": 0.0, "num_input_tokens_seen": 82966032, "step": 123095 }, { "epoch": 3.0073534800771995, "grad_norm": 0.007746911607682705, "learning_rate": 8.213189240368416e-07, "loss": 0.0001, "num_input_tokens_seen": 82969424, "step": 123100 }, { "epoch": 3.0074756309090467, "grad_norm": 27.39247703552246, "learning_rate": 8.212350194557532e-07, "loss": 0.0028, "num_input_tokens_seen": 82973264, "step": 123105 }, { "epoch": 3.007597781740894, "grad_norm": 0.000948448374401778, "learning_rate": 8.211511161746914e-07, "loss": 0.0, "num_input_tokens_seen": 82976592, "step": 123110 }, { "epoch": 3.0077199325727406, "grad_norm": 0.16879381239414215, "learning_rate": 8.210672141942674e-07, "loss": 0.0028, "num_input_tokens_seen": 82979856, "step": 123115 }, { "epoch": 3.007842083404588, "grad_norm": 0.0199663657695055, "learning_rate": 8.209833135150901e-07, "loss": 0.1153, "num_input_tokens_seen": 82983184, "step": 123120 }, { "epoch": 3.007964234236435, "grad_norm": 0.0003089674864895642, "learning_rate": 8.208994141377706e-07, "loss": 0.0, "num_input_tokens_seen": 82986256, "step": 123125 }, { "epoch": 3.008086385068282, "grad_norm": 0.013704835437238216, "learning_rate": 8.20815516062919e-07, "loss": 0.0001, "num_input_tokens_seen": 82990032, "step": 123130 }, { "epoch": 3.0082085359001294, "grad_norm": 0.0007210209732875228, "learning_rate": 8.207316192911447e-07, "loss": 0.0001, "num_input_tokens_seen": 82993936, "step": 123135 }, { "epoch": 3.0083306867319766, "grad_norm": 0.00036419389653019607, "learning_rate": 8.206477238230587e-07, "loss": 0.0001, "num_input_tokens_seen": 82997072, "step": 123140 }, { "epoch": 3.0084528375638238, "grad_norm": 0.001539978664368391, "learning_rate": 8.205638296592703e-07, "loss": 0.0001, "num_input_tokens_seen": 83000528, "step": 123145 }, { "epoch": 3.008574988395671, "grad_norm": 0.1176891177892685, "learning_rate": 8.204799368003903e-07, "loss": 0.0002, "num_input_tokens_seen": 83003728, "step": 123150 }, { "epoch": 3.008697139227518, "grad_norm": 0.004794170614331961, "learning_rate": 8.203960452470282e-07, "loss": 0.0001, "num_input_tokens_seen": 83006928, "step": 123155 }, { "epoch": 3.0088192900593653, "grad_norm": 0.0019680424593389034, "learning_rate": 8.203121549997942e-07, "loss": 0.0764, "num_input_tokens_seen": 83010064, "step": 123160 }, { "epoch": 3.0089414408912125, "grad_norm": 0.005374094936996698, "learning_rate": 8.202282660592992e-07, "loss": 0.0001, "num_input_tokens_seen": 83013520, "step": 123165 }, { "epoch": 3.0090635917230597, "grad_norm": 206.52078247070312, "learning_rate": 8.201443784261522e-07, "loss": 0.0589, "num_input_tokens_seen": 83016912, "step": 123170 }, { "epoch": 3.009185742554907, "grad_norm": 0.041995588690042496, "learning_rate": 8.200604921009637e-07, "loss": 0.0002, "num_input_tokens_seen": 83020368, "step": 123175 }, { "epoch": 3.009307893386754, "grad_norm": 0.002659060060977936, "learning_rate": 8.199766070843437e-07, "loss": 0.0, "num_input_tokens_seen": 83023760, "step": 123180 }, { "epoch": 3.0094300442186013, "grad_norm": 0.0031865271739661694, "learning_rate": 8.198927233769021e-07, "loss": 0.0, "num_input_tokens_seen": 83027088, "step": 123185 }, { "epoch": 3.0095521950504485, "grad_norm": 0.006657351739704609, "learning_rate": 8.198088409792495e-07, "loss": 0.0001, "num_input_tokens_seen": 83031056, "step": 123190 }, { "epoch": 3.0096743458822957, "grad_norm": 0.021974647417664528, "learning_rate": 8.197249598919949e-07, "loss": 0.0, "num_input_tokens_seen": 83034192, "step": 123195 }, { "epoch": 3.009796496714143, "grad_norm": 0.2845902144908905, "learning_rate": 8.196410801157494e-07, "loss": 0.0001, "num_input_tokens_seen": 83037904, "step": 123200 }, { "epoch": 3.0099186475459896, "grad_norm": 0.0019516038009896874, "learning_rate": 8.19557201651122e-07, "loss": 0.0001, "num_input_tokens_seen": 83042000, "step": 123205 }, { "epoch": 3.010040798377837, "grad_norm": 0.002426578663289547, "learning_rate": 8.194733244987235e-07, "loss": 0.0, "num_input_tokens_seen": 83045456, "step": 123210 }, { "epoch": 3.010162949209684, "grad_norm": 0.002065720036625862, "learning_rate": 8.193894486591633e-07, "loss": 0.0, "num_input_tokens_seen": 83048528, "step": 123215 }, { "epoch": 3.010285100041531, "grad_norm": 0.0328923799097538, "learning_rate": 8.193055741330517e-07, "loss": 0.0, "num_input_tokens_seen": 83051472, "step": 123220 }, { "epoch": 3.0104072508733783, "grad_norm": 0.016800448298454285, "learning_rate": 8.192217009209986e-07, "loss": 0.0637, "num_input_tokens_seen": 83054672, "step": 123225 }, { "epoch": 3.0105294017052255, "grad_norm": 0.0432097427546978, "learning_rate": 8.191378290236139e-07, "loss": 0.0001, "num_input_tokens_seen": 83058192, "step": 123230 }, { "epoch": 3.0106515525370727, "grad_norm": 0.001382253016345203, "learning_rate": 8.19053958441508e-07, "loss": 0.0001, "num_input_tokens_seen": 83061392, "step": 123235 }, { "epoch": 3.01077370336892, "grad_norm": 110.7890396118164, "learning_rate": 8.189700891752897e-07, "loss": 0.0532, "num_input_tokens_seen": 83064592, "step": 123240 }, { "epoch": 3.010895854200767, "grad_norm": 0.01439904235303402, "learning_rate": 8.188862212255696e-07, "loss": 0.0002, "num_input_tokens_seen": 83067728, "step": 123245 }, { "epoch": 3.0110180050326143, "grad_norm": 0.012585053220391273, "learning_rate": 8.188023545929581e-07, "loss": 0.0, "num_input_tokens_seen": 83070736, "step": 123250 }, { "epoch": 3.0111401558644615, "grad_norm": 0.02939104288816452, "learning_rate": 8.187184892780641e-07, "loss": 0.0001, "num_input_tokens_seen": 83074384, "step": 123255 }, { "epoch": 3.0112623066963087, "grad_norm": 0.0023191035725176334, "learning_rate": 8.186346252814986e-07, "loss": 0.0, "num_input_tokens_seen": 83077456, "step": 123260 }, { "epoch": 3.011384457528156, "grad_norm": 0.001554151182062924, "learning_rate": 8.185507626038703e-07, "loss": 0.0, "num_input_tokens_seen": 83080976, "step": 123265 }, { "epoch": 3.011506608360003, "grad_norm": 0.0005073735374026, "learning_rate": 8.184669012457902e-07, "loss": 0.0, "num_input_tokens_seen": 83084944, "step": 123270 }, { "epoch": 3.0116287591918502, "grad_norm": 0.013489312492311, "learning_rate": 8.183830412078671e-07, "loss": 0.0002, "num_input_tokens_seen": 83088144, "step": 123275 }, { "epoch": 3.0117509100236974, "grad_norm": 0.0010727399494498968, "learning_rate": 8.182991824907118e-07, "loss": 0.0, "num_input_tokens_seen": 83091472, "step": 123280 }, { "epoch": 3.0118730608555446, "grad_norm": 0.007046072278171778, "learning_rate": 8.182153250949336e-07, "loss": 0.0, "num_input_tokens_seen": 83095056, "step": 123285 }, { "epoch": 3.0119952116873914, "grad_norm": 0.0008582680602557957, "learning_rate": 8.181314690211422e-07, "loss": 0.0, "num_input_tokens_seen": 83098384, "step": 123290 }, { "epoch": 3.0121173625192386, "grad_norm": 0.0054124463349580765, "learning_rate": 8.180476142699482e-07, "loss": 0.0001, "num_input_tokens_seen": 83101648, "step": 123295 }, { "epoch": 3.0122395133510858, "grad_norm": 0.20177757740020752, "learning_rate": 8.179637608419603e-07, "loss": 0.0002, "num_input_tokens_seen": 83105360, "step": 123300 }, { "epoch": 3.012361664182933, "grad_norm": 0.07805618643760681, "learning_rate": 8.178799087377893e-07, "loss": 0.0017, "num_input_tokens_seen": 83108368, "step": 123305 }, { "epoch": 3.01248381501478, "grad_norm": 0.07807561010122299, "learning_rate": 8.177960579580443e-07, "loss": 0.0001, "num_input_tokens_seen": 83111440, "step": 123310 }, { "epoch": 3.0126059658466273, "grad_norm": 0.001814626739360392, "learning_rate": 8.177122085033352e-07, "loss": 0.0, "num_input_tokens_seen": 83114896, "step": 123315 }, { "epoch": 3.0127281166784745, "grad_norm": 0.0006838240078650415, "learning_rate": 8.176283603742726e-07, "loss": 0.0179, "num_input_tokens_seen": 83117968, "step": 123320 }, { "epoch": 3.0128502675103217, "grad_norm": 0.002631385810673237, "learning_rate": 8.175445135714653e-07, "loss": 0.0, "num_input_tokens_seen": 83121104, "step": 123325 }, { "epoch": 3.012972418342169, "grad_norm": 0.005652591120451689, "learning_rate": 8.174606680955232e-07, "loss": 0.0, "num_input_tokens_seen": 83124432, "step": 123330 }, { "epoch": 3.013094569174016, "grad_norm": 412.9010925292969, "learning_rate": 8.173768239470564e-07, "loss": 0.0534, "num_input_tokens_seen": 83127504, "step": 123335 }, { "epoch": 3.0132167200058633, "grad_norm": 0.0015622521750628948, "learning_rate": 8.172929811266744e-07, "loss": 0.0, "num_input_tokens_seen": 83131280, "step": 123340 }, { "epoch": 3.0133388708377105, "grad_norm": 0.001484617474488914, "learning_rate": 8.172091396349871e-07, "loss": 0.0, "num_input_tokens_seen": 83134544, "step": 123345 }, { "epoch": 3.0134610216695576, "grad_norm": 0.0010365790221840143, "learning_rate": 8.171252994726039e-07, "loss": 0.0, "num_input_tokens_seen": 83137616, "step": 123350 }, { "epoch": 3.013583172501405, "grad_norm": 0.013608659617602825, "learning_rate": 8.17041460640135e-07, "loss": 0.0001, "num_input_tokens_seen": 83141008, "step": 123355 }, { "epoch": 3.013705323333252, "grad_norm": 0.03848070278763771, "learning_rate": 8.169576231381894e-07, "loss": 0.0, "num_input_tokens_seen": 83144912, "step": 123360 }, { "epoch": 3.013827474165099, "grad_norm": 0.034808073192834854, "learning_rate": 8.168737869673776e-07, "loss": 0.0, "num_input_tokens_seen": 83148560, "step": 123365 }, { "epoch": 3.0139496249969464, "grad_norm": 0.1787676066160202, "learning_rate": 8.167899521283086e-07, "loss": 0.0001, "num_input_tokens_seen": 83151824, "step": 123370 }, { "epoch": 3.0140717758287936, "grad_norm": 0.08518391847610474, "learning_rate": 8.167061186215925e-07, "loss": 0.0002, "num_input_tokens_seen": 83155472, "step": 123375 }, { "epoch": 3.0141939266606403, "grad_norm": 0.0024606117513030767, "learning_rate": 8.166222864478387e-07, "loss": 0.0, "num_input_tokens_seen": 83158864, "step": 123380 }, { "epoch": 3.0143160774924875, "grad_norm": 0.0019267149036750197, "learning_rate": 8.16538455607657e-07, "loss": 0.0, "num_input_tokens_seen": 83161936, "step": 123385 }, { "epoch": 3.0144382283243347, "grad_norm": 35.64485549926758, "learning_rate": 8.164546261016572e-07, "loss": 0.0565, "num_input_tokens_seen": 83165008, "step": 123390 }, { "epoch": 3.014560379156182, "grad_norm": 24.512760162353516, "learning_rate": 8.163707979304483e-07, "loss": 0.0688, "num_input_tokens_seen": 83168976, "step": 123395 }, { "epoch": 3.014682529988029, "grad_norm": 0.02174725942313671, "learning_rate": 8.162869710946404e-07, "loss": 0.0, "num_input_tokens_seen": 83172368, "step": 123400 }, { "epoch": 3.0148046808198763, "grad_norm": 0.002265357645228505, "learning_rate": 8.162031455948435e-07, "loss": 0.0, "num_input_tokens_seen": 83175824, "step": 123405 }, { "epoch": 3.0149268316517235, "grad_norm": 0.0016550914151594043, "learning_rate": 8.161193214316662e-07, "loss": 0.0, "num_input_tokens_seen": 83178896, "step": 123410 }, { "epoch": 3.0150489824835707, "grad_norm": 0.004780837334692478, "learning_rate": 8.160354986057192e-07, "loss": 0.0001, "num_input_tokens_seen": 83181968, "step": 123415 }, { "epoch": 3.015171133315418, "grad_norm": 0.002558564767241478, "learning_rate": 8.15951677117611e-07, "loss": 0.0, "num_input_tokens_seen": 83185680, "step": 123420 }, { "epoch": 3.015293284147265, "grad_norm": 0.00885737594217062, "learning_rate": 8.158678569679523e-07, "loss": 0.0, "num_input_tokens_seen": 83188944, "step": 123425 }, { "epoch": 3.0154154349791122, "grad_norm": 0.00036788417492061853, "learning_rate": 8.157840381573515e-07, "loss": 0.0, "num_input_tokens_seen": 83192080, "step": 123430 }, { "epoch": 3.0155375858109594, "grad_norm": 0.0042150323279201984, "learning_rate": 8.15700220686419e-07, "loss": 0.0, "num_input_tokens_seen": 83195152, "step": 123435 }, { "epoch": 3.0156597366428066, "grad_norm": 0.04398500174283981, "learning_rate": 8.156164045557643e-07, "loss": 0.0, "num_input_tokens_seen": 83198544, "step": 123440 }, { "epoch": 3.015781887474654, "grad_norm": 0.002602602820843458, "learning_rate": 8.155325897659963e-07, "loss": 0.0, "num_input_tokens_seen": 83202192, "step": 123445 }, { "epoch": 3.015904038306501, "grad_norm": 0.002433629473671317, "learning_rate": 8.154487763177252e-07, "loss": 0.0003, "num_input_tokens_seen": 83205648, "step": 123450 }, { "epoch": 3.016026189138348, "grad_norm": 0.0015204442897811532, "learning_rate": 8.1536496421156e-07, "loss": 0.0001, "num_input_tokens_seen": 83209296, "step": 123455 }, { "epoch": 3.0161483399701954, "grad_norm": 0.05968998745083809, "learning_rate": 8.152811534481108e-07, "loss": 0.0551, "num_input_tokens_seen": 83212560, "step": 123460 }, { "epoch": 3.0162704908020426, "grad_norm": 0.0054749189876019955, "learning_rate": 8.151973440279862e-07, "loss": 0.0, "num_input_tokens_seen": 83216016, "step": 123465 }, { "epoch": 3.0163926416338893, "grad_norm": 0.0608709380030632, "learning_rate": 8.151135359517963e-07, "loss": 0.0001, "num_input_tokens_seen": 83219152, "step": 123470 }, { "epoch": 3.0165147924657365, "grad_norm": 0.13947315514087677, "learning_rate": 8.150297292201509e-07, "loss": 0.0183, "num_input_tokens_seen": 83222608, "step": 123475 }, { "epoch": 3.0166369432975837, "grad_norm": 0.0014263535849750042, "learning_rate": 8.149459238336589e-07, "loss": 0.0, "num_input_tokens_seen": 83226000, "step": 123480 }, { "epoch": 3.016759094129431, "grad_norm": 0.27640414237976074, "learning_rate": 8.148621197929298e-07, "loss": 0.0002, "num_input_tokens_seen": 83228880, "step": 123485 }, { "epoch": 3.016881244961278, "grad_norm": 0.0012199397897347808, "learning_rate": 8.147783170985734e-07, "loss": 0.0, "num_input_tokens_seen": 83232272, "step": 123490 }, { "epoch": 3.0170033957931253, "grad_norm": 0.026773726567626, "learning_rate": 8.146945157511984e-07, "loss": 0.0, "num_input_tokens_seen": 83235536, "step": 123495 }, { "epoch": 3.0171255466249725, "grad_norm": 0.019472522661089897, "learning_rate": 8.146107157514152e-07, "loss": 0.0, "num_input_tokens_seen": 83239056, "step": 123500 }, { "epoch": 3.0172476974568196, "grad_norm": 0.017728516831994057, "learning_rate": 8.145269170998326e-07, "loss": 0.0001, "num_input_tokens_seen": 83242320, "step": 123505 }, { "epoch": 3.017369848288667, "grad_norm": 0.0009785944130271673, "learning_rate": 8.144431197970602e-07, "loss": 0.0, "num_input_tokens_seen": 83245648, "step": 123510 }, { "epoch": 3.017491999120514, "grad_norm": 0.008329205214977264, "learning_rate": 8.143593238437072e-07, "loss": 0.0, "num_input_tokens_seen": 83249744, "step": 123515 }, { "epoch": 3.017614149952361, "grad_norm": 0.0062654935754835606, "learning_rate": 8.142755292403833e-07, "loss": 0.0, "num_input_tokens_seen": 83253072, "step": 123520 }, { "epoch": 3.0177363007842084, "grad_norm": 0.002115017967298627, "learning_rate": 8.141917359876975e-07, "loss": 0.0, "num_input_tokens_seen": 83256848, "step": 123525 }, { "epoch": 3.0178584516160556, "grad_norm": 0.0019589269068092108, "learning_rate": 8.141079440862595e-07, "loss": 0.0, "num_input_tokens_seen": 83260496, "step": 123530 }, { "epoch": 3.017980602447903, "grad_norm": 0.003976363688707352, "learning_rate": 8.140241535366785e-07, "loss": 0.0, "num_input_tokens_seen": 83263824, "step": 123535 }, { "epoch": 3.01810275327975, "grad_norm": 0.0012698272475972772, "learning_rate": 8.139403643395639e-07, "loss": 0.0004, "num_input_tokens_seen": 83267152, "step": 123540 }, { "epoch": 3.018224904111597, "grad_norm": 0.00013846179354004562, "learning_rate": 8.138565764955252e-07, "loss": 0.0, "num_input_tokens_seen": 83270544, "step": 123545 }, { "epoch": 3.0183470549434444, "grad_norm": 0.002126675797626376, "learning_rate": 8.137727900051712e-07, "loss": 0.0, "num_input_tokens_seen": 83273424, "step": 123550 }, { "epoch": 3.0184692057752915, "grad_norm": 0.014070043340325356, "learning_rate": 8.136890048691116e-07, "loss": 0.0, "num_input_tokens_seen": 83276944, "step": 123555 }, { "epoch": 3.0185913566071383, "grad_norm": 0.3048165440559387, "learning_rate": 8.136052210879559e-07, "loss": 0.0001, "num_input_tokens_seen": 83280144, "step": 123560 }, { "epoch": 3.0187135074389855, "grad_norm": 0.00013424389180727303, "learning_rate": 8.135214386623128e-07, "loss": 0.0, "num_input_tokens_seen": 83283728, "step": 123565 }, { "epoch": 3.0188356582708327, "grad_norm": 0.0036720738280564547, "learning_rate": 8.134376575927924e-07, "loss": 0.0667, "num_input_tokens_seen": 83286928, "step": 123570 }, { "epoch": 3.01895780910268, "grad_norm": 0.001070171594619751, "learning_rate": 8.133538778800032e-07, "loss": 0.0, "num_input_tokens_seen": 83290576, "step": 123575 }, { "epoch": 3.019079959934527, "grad_norm": 0.001411976758390665, "learning_rate": 8.132700995245552e-07, "loss": 0.0, "num_input_tokens_seen": 83293776, "step": 123580 }, { "epoch": 3.0192021107663742, "grad_norm": 0.0027057963889092207, "learning_rate": 8.131863225270568e-07, "loss": 0.0, "num_input_tokens_seen": 83297040, "step": 123585 }, { "epoch": 3.0193242615982214, "grad_norm": 0.006092796102166176, "learning_rate": 8.131025468881179e-07, "loss": 0.0001, "num_input_tokens_seen": 83300816, "step": 123590 }, { "epoch": 3.0194464124300686, "grad_norm": 0.10362302511930466, "learning_rate": 8.130187726083477e-07, "loss": 0.0533, "num_input_tokens_seen": 83304336, "step": 123595 }, { "epoch": 3.019568563261916, "grad_norm": 0.005874832160770893, "learning_rate": 8.12934999688355e-07, "loss": 0.0, "num_input_tokens_seen": 83307600, "step": 123600 }, { "epoch": 3.019690714093763, "grad_norm": 13.92029094696045, "learning_rate": 8.128512281287496e-07, "loss": 0.0414, "num_input_tokens_seen": 83311184, "step": 123605 }, { "epoch": 3.01981286492561, "grad_norm": 0.002932375529780984, "learning_rate": 8.1276745793014e-07, "loss": 0.0006, "num_input_tokens_seen": 83314384, "step": 123610 }, { "epoch": 3.0199350157574574, "grad_norm": 0.3603280484676361, "learning_rate": 8.126836890931363e-07, "loss": 0.0001, "num_input_tokens_seen": 83317328, "step": 123615 }, { "epoch": 3.0200571665893046, "grad_norm": 0.04042335972189903, "learning_rate": 8.125999216183466e-07, "loss": 0.0, "num_input_tokens_seen": 83321040, "step": 123620 }, { "epoch": 3.0201793174211518, "grad_norm": 0.0001353430125163868, "learning_rate": 8.125161555063809e-07, "loss": 0.0, "num_input_tokens_seen": 83324368, "step": 123625 }, { "epoch": 3.020301468252999, "grad_norm": 0.0132133224979043, "learning_rate": 8.124323907578485e-07, "loss": 0.0203, "num_input_tokens_seen": 83328016, "step": 123630 }, { "epoch": 3.020423619084846, "grad_norm": 0.004359879996627569, "learning_rate": 8.12348627373358e-07, "loss": 0.0, "num_input_tokens_seen": 83331792, "step": 123635 }, { "epoch": 3.0205457699166933, "grad_norm": 0.013602170161902905, "learning_rate": 8.122648653535187e-07, "loss": 0.0, "num_input_tokens_seen": 83335248, "step": 123640 }, { "epoch": 3.0206679207485405, "grad_norm": 0.004602179396897554, "learning_rate": 8.121811046989397e-07, "loss": 0.0739, "num_input_tokens_seen": 83338448, "step": 123645 }, { "epoch": 3.0207900715803873, "grad_norm": 0.004291081335395575, "learning_rate": 8.120973454102303e-07, "loss": 0.0001, "num_input_tokens_seen": 83341712, "step": 123650 }, { "epoch": 3.0209122224122344, "grad_norm": 0.0004687369801104069, "learning_rate": 8.120135874879998e-07, "loss": 0.0501, "num_input_tokens_seen": 83345296, "step": 123655 }, { "epoch": 3.0210343732440816, "grad_norm": 0.003266794141381979, "learning_rate": 8.119298309328565e-07, "loss": 0.0, "num_input_tokens_seen": 83348496, "step": 123660 }, { "epoch": 3.021156524075929, "grad_norm": 0.0014626506017521024, "learning_rate": 8.118460757454107e-07, "loss": 0.0, "num_input_tokens_seen": 83351824, "step": 123665 }, { "epoch": 3.021278674907776, "grad_norm": 0.06713787466287613, "learning_rate": 8.117623219262702e-07, "loss": 0.0001, "num_input_tokens_seen": 83355216, "step": 123670 }, { "epoch": 3.021400825739623, "grad_norm": 0.0004317023267503828, "learning_rate": 8.116785694760453e-07, "loss": 0.0, "num_input_tokens_seen": 83358928, "step": 123675 }, { "epoch": 3.0215229765714704, "grad_norm": 0.0015613315626978874, "learning_rate": 8.115948183953441e-07, "loss": 0.0, "num_input_tokens_seen": 83362128, "step": 123680 }, { "epoch": 3.0216451274033176, "grad_norm": 20.940481185913086, "learning_rate": 8.115110686847762e-07, "loss": 0.0907, "num_input_tokens_seen": 83365328, "step": 123685 }, { "epoch": 3.0217672782351648, "grad_norm": 0.0010938859777525067, "learning_rate": 8.114273203449504e-07, "loss": 0.0, "num_input_tokens_seen": 83368592, "step": 123690 }, { "epoch": 3.021889429067012, "grad_norm": 0.0003897510759998113, "learning_rate": 8.11343573376476e-07, "loss": 0.0377, "num_input_tokens_seen": 83371920, "step": 123695 }, { "epoch": 3.022011579898859, "grad_norm": 0.018642354756593704, "learning_rate": 8.112598277799621e-07, "loss": 0.0, "num_input_tokens_seen": 83375312, "step": 123700 }, { "epoch": 3.0221337307307063, "grad_norm": 0.005920675583183765, "learning_rate": 8.111760835560171e-07, "loss": 0.0, "num_input_tokens_seen": 83378896, "step": 123705 }, { "epoch": 3.0222558815625535, "grad_norm": 0.009021357633173466, "learning_rate": 8.110923407052507e-07, "loss": 0.0001, "num_input_tokens_seen": 83381904, "step": 123710 }, { "epoch": 3.0223780323944007, "grad_norm": 0.00507028354331851, "learning_rate": 8.110085992282713e-07, "loss": 0.0001, "num_input_tokens_seen": 83385360, "step": 123715 }, { "epoch": 3.022500183226248, "grad_norm": 0.6077379584312439, "learning_rate": 8.10924859125688e-07, "loss": 0.0002, "num_input_tokens_seen": 83389264, "step": 123720 }, { "epoch": 3.022622334058095, "grad_norm": 10.27883529663086, "learning_rate": 8.108411203981106e-07, "loss": 0.0444, "num_input_tokens_seen": 83392784, "step": 123725 }, { "epoch": 3.0227444848899423, "grad_norm": 0.01629662699997425, "learning_rate": 8.107573830461469e-07, "loss": 0.0, "num_input_tokens_seen": 83396240, "step": 123730 }, { "epoch": 3.0228666357217895, "grad_norm": 0.06298413127660751, "learning_rate": 8.10673647070407e-07, "loss": 0.0, "num_input_tokens_seen": 83399696, "step": 123735 }, { "epoch": 3.0229887865536362, "grad_norm": 268.79547119140625, "learning_rate": 8.105899124714987e-07, "loss": 0.0022, "num_input_tokens_seen": 83403408, "step": 123740 }, { "epoch": 3.0231109373854834, "grad_norm": 0.002456031972542405, "learning_rate": 8.105061792500317e-07, "loss": 0.0, "num_input_tokens_seen": 83406608, "step": 123745 }, { "epoch": 3.0232330882173306, "grad_norm": 0.028353121131658554, "learning_rate": 8.10422447406615e-07, "loss": 0.0001, "num_input_tokens_seen": 83410128, "step": 123750 }, { "epoch": 3.023355239049178, "grad_norm": 0.052956756204366684, "learning_rate": 8.10338716941857e-07, "loss": 0.0, "num_input_tokens_seen": 83413840, "step": 123755 }, { "epoch": 3.023477389881025, "grad_norm": 0.0012758343946188688, "learning_rate": 8.10254987856367e-07, "loss": 0.0, "num_input_tokens_seen": 83416848, "step": 123760 }, { "epoch": 3.023599540712872, "grad_norm": 0.0033717567566782236, "learning_rate": 8.101712601507535e-07, "loss": 0.0688, "num_input_tokens_seen": 83420368, "step": 123765 }, { "epoch": 3.0237216915447194, "grad_norm": 0.025035880506038666, "learning_rate": 8.10087533825626e-07, "loss": 0.0312, "num_input_tokens_seen": 83423632, "step": 123770 }, { "epoch": 3.0238438423765666, "grad_norm": 0.025489607825875282, "learning_rate": 8.100038088815925e-07, "loss": 0.0001, "num_input_tokens_seen": 83426896, "step": 123775 }, { "epoch": 3.0239659932084137, "grad_norm": 0.012176979333162308, "learning_rate": 8.099200853192627e-07, "loss": 0.0001, "num_input_tokens_seen": 83430352, "step": 123780 }, { "epoch": 3.024088144040261, "grad_norm": 7.685334276175126e-05, "learning_rate": 8.098363631392454e-07, "loss": 0.0, "num_input_tokens_seen": 83433744, "step": 123785 }, { "epoch": 3.024210294872108, "grad_norm": 0.0011124322190880775, "learning_rate": 8.09752642342149e-07, "loss": 0.0, "num_input_tokens_seen": 83437328, "step": 123790 }, { "epoch": 3.0243324457039553, "grad_norm": 0.0012952962424606085, "learning_rate": 8.096689229285827e-07, "loss": 0.0001, "num_input_tokens_seen": 83440656, "step": 123795 }, { "epoch": 3.0244545965358025, "grad_norm": 0.3104526698589325, "learning_rate": 8.095852048991551e-07, "loss": 0.0002, "num_input_tokens_seen": 83443856, "step": 123800 }, { "epoch": 3.0245767473676497, "grad_norm": 0.0013774075778201222, "learning_rate": 8.095014882544749e-07, "loss": 0.0, "num_input_tokens_seen": 83446864, "step": 123805 }, { "epoch": 3.024698898199497, "grad_norm": 0.0012292321771383286, "learning_rate": 8.094177729951515e-07, "loss": 0.0, "num_input_tokens_seen": 83450000, "step": 123810 }, { "epoch": 3.024821049031344, "grad_norm": 0.009159058332443237, "learning_rate": 8.093340591217928e-07, "loss": 0.0, "num_input_tokens_seen": 83453712, "step": 123815 }, { "epoch": 3.0249431998631913, "grad_norm": 0.006682706065475941, "learning_rate": 8.092503466350086e-07, "loss": 0.0001, "num_input_tokens_seen": 83457040, "step": 123820 }, { "epoch": 3.025065350695038, "grad_norm": 0.007138276472687721, "learning_rate": 8.091666355354069e-07, "loss": 0.0, "num_input_tokens_seen": 83460496, "step": 123825 }, { "epoch": 3.025187501526885, "grad_norm": 0.0004544088733382523, "learning_rate": 8.09082925823597e-07, "loss": 0.0, "num_input_tokens_seen": 83463376, "step": 123830 }, { "epoch": 3.0253096523587324, "grad_norm": 0.04789057374000549, "learning_rate": 8.089992175001871e-07, "loss": 0.0659, "num_input_tokens_seen": 83466448, "step": 123835 }, { "epoch": 3.0254318031905796, "grad_norm": 0.007176238112151623, "learning_rate": 8.089155105657864e-07, "loss": 0.0, "num_input_tokens_seen": 83469712, "step": 123840 }, { "epoch": 3.0255539540224268, "grad_norm": 17.64391326904297, "learning_rate": 8.088318050210036e-07, "loss": 0.0427, "num_input_tokens_seen": 83472784, "step": 123845 }, { "epoch": 3.025676104854274, "grad_norm": 0.09517499804496765, "learning_rate": 8.087481008664471e-07, "loss": 0.0, "num_input_tokens_seen": 83476304, "step": 123850 }, { "epoch": 3.025798255686121, "grad_norm": 0.03949768841266632, "learning_rate": 8.086643981027264e-07, "loss": 0.0, "num_input_tokens_seen": 83479824, "step": 123855 }, { "epoch": 3.0259204065179683, "grad_norm": 0.01607385464012623, "learning_rate": 8.085806967304491e-07, "loss": 0.0, "num_input_tokens_seen": 83483792, "step": 123860 }, { "epoch": 3.0260425573498155, "grad_norm": 0.0017718808958306909, "learning_rate": 8.084969967502248e-07, "loss": 0.0, "num_input_tokens_seen": 83487184, "step": 123865 }, { "epoch": 3.0261647081816627, "grad_norm": 0.06724121421575546, "learning_rate": 8.084132981626615e-07, "loss": 0.0, "num_input_tokens_seen": 83490448, "step": 123870 }, { "epoch": 3.02628685901351, "grad_norm": 0.004997440613806248, "learning_rate": 8.083296009683683e-07, "loss": 0.0, "num_input_tokens_seen": 83493840, "step": 123875 }, { "epoch": 3.026409009845357, "grad_norm": 0.0055533465929329395, "learning_rate": 8.08245905167954e-07, "loss": 0.0, "num_input_tokens_seen": 83497424, "step": 123880 }, { "epoch": 3.0265311606772043, "grad_norm": 0.01228258665651083, "learning_rate": 8.081622107620267e-07, "loss": 0.0001, "num_input_tokens_seen": 83500688, "step": 123885 }, { "epoch": 3.0266533115090515, "grad_norm": 0.00043016637209802866, "learning_rate": 8.08078517751196e-07, "loss": 0.0, "num_input_tokens_seen": 83503696, "step": 123890 }, { "epoch": 3.0267754623408987, "grad_norm": 0.00011228966468479484, "learning_rate": 8.079948261360693e-07, "loss": 0.0002, "num_input_tokens_seen": 83507088, "step": 123895 }, { "epoch": 3.026897613172746, "grad_norm": 0.0008924832800403237, "learning_rate": 8.079111359172561e-07, "loss": 0.0, "num_input_tokens_seen": 83510224, "step": 123900 }, { "epoch": 3.027019764004593, "grad_norm": 0.06737366318702698, "learning_rate": 8.078274470953652e-07, "loss": 0.0, "num_input_tokens_seen": 83513232, "step": 123905 }, { "epoch": 3.0271419148364402, "grad_norm": 0.00100752804428339, "learning_rate": 8.077437596710042e-07, "loss": 0.0, "num_input_tokens_seen": 83516240, "step": 123910 }, { "epoch": 3.027264065668287, "grad_norm": 40.753448486328125, "learning_rate": 8.076600736447827e-07, "loss": 0.068, "num_input_tokens_seen": 83519824, "step": 123915 }, { "epoch": 3.027386216500134, "grad_norm": 0.00984196923673153, "learning_rate": 8.075763890173086e-07, "loss": 0.0, "num_input_tokens_seen": 83523408, "step": 123920 }, { "epoch": 3.0275083673319814, "grad_norm": 0.010168238542973995, "learning_rate": 8.074927057891911e-07, "loss": 0.0, "num_input_tokens_seen": 83526416, "step": 123925 }, { "epoch": 3.0276305181638286, "grad_norm": 69.27855682373047, "learning_rate": 8.07409023961038e-07, "loss": 0.074, "num_input_tokens_seen": 83529552, "step": 123930 }, { "epoch": 3.0277526689956757, "grad_norm": 0.0016448360402137041, "learning_rate": 8.073253435334582e-07, "loss": 0.0, "num_input_tokens_seen": 83532880, "step": 123935 }, { "epoch": 3.027874819827523, "grad_norm": 0.010067627765238285, "learning_rate": 8.072416645070607e-07, "loss": 0.0, "num_input_tokens_seen": 83536400, "step": 123940 }, { "epoch": 3.02799697065937, "grad_norm": 0.0036460966803133488, "learning_rate": 8.071579868824536e-07, "loss": 0.0, "num_input_tokens_seen": 83539728, "step": 123945 }, { "epoch": 3.0281191214912173, "grad_norm": 0.0032593992073088884, "learning_rate": 8.070743106602455e-07, "loss": 0.0, "num_input_tokens_seen": 83542992, "step": 123950 }, { "epoch": 3.0282412723230645, "grad_norm": 0.0018469596980139613, "learning_rate": 8.069906358410448e-07, "loss": 0.0, "num_input_tokens_seen": 83546384, "step": 123955 }, { "epoch": 3.0283634231549117, "grad_norm": 0.004029898438602686, "learning_rate": 8.0690696242546e-07, "loss": 0.0, "num_input_tokens_seen": 83549520, "step": 123960 }, { "epoch": 3.028485573986759, "grad_norm": 0.020568979904055595, "learning_rate": 8.068232904141002e-07, "loss": 0.0, "num_input_tokens_seen": 83553552, "step": 123965 }, { "epoch": 3.028607724818606, "grad_norm": 0.0037417744752019644, "learning_rate": 8.067396198075727e-07, "loss": 0.0001, "num_input_tokens_seen": 83556816, "step": 123970 }, { "epoch": 3.0287298756504533, "grad_norm": 0.18165583908557892, "learning_rate": 8.066559506064873e-07, "loss": 0.0001, "num_input_tokens_seen": 83559824, "step": 123975 }, { "epoch": 3.0288520264823005, "grad_norm": 0.011169048957526684, "learning_rate": 8.065722828114513e-07, "loss": 0.0, "num_input_tokens_seen": 83563088, "step": 123980 }, { "epoch": 3.0289741773141476, "grad_norm": 0.014875008724629879, "learning_rate": 8.064886164230742e-07, "loss": 0.0002, "num_input_tokens_seen": 83566736, "step": 123985 }, { "epoch": 3.029096328145995, "grad_norm": 0.0020445336122065783, "learning_rate": 8.064049514419635e-07, "loss": 0.0, "num_input_tokens_seen": 83570704, "step": 123990 }, { "epoch": 3.029218478977842, "grad_norm": 0.01352640800178051, "learning_rate": 8.063212878687282e-07, "loss": 0.0, "num_input_tokens_seen": 83574096, "step": 123995 }, { "epoch": 3.029340629809689, "grad_norm": 0.0033647569362074137, "learning_rate": 8.062376257039766e-07, "loss": 0.0, "num_input_tokens_seen": 83577296, "step": 124000 }, { "epoch": 3.029462780641536, "grad_norm": 0.0048996442928910255, "learning_rate": 8.061539649483171e-07, "loss": 0.0003, "num_input_tokens_seen": 83580560, "step": 124005 }, { "epoch": 3.029584931473383, "grad_norm": 0.0022092899307608604, "learning_rate": 8.060703056023583e-07, "loss": 0.0, "num_input_tokens_seen": 83583696, "step": 124010 }, { "epoch": 3.0297070823052303, "grad_norm": 38.923072814941406, "learning_rate": 8.059866476667081e-07, "loss": 0.0359, "num_input_tokens_seen": 83586704, "step": 124015 }, { "epoch": 3.0298292331370775, "grad_norm": 0.006466974038630724, "learning_rate": 8.059029911419755e-07, "loss": 0.0002, "num_input_tokens_seen": 83590160, "step": 124020 }, { "epoch": 3.0299513839689247, "grad_norm": 0.017574403434991837, "learning_rate": 8.058193360287681e-07, "loss": 0.0, "num_input_tokens_seen": 83593296, "step": 124025 }, { "epoch": 3.030073534800772, "grad_norm": 0.0008997087134048343, "learning_rate": 8.057356823276947e-07, "loss": 0.0, "num_input_tokens_seen": 83596944, "step": 124030 }, { "epoch": 3.030195685632619, "grad_norm": 32.1270637512207, "learning_rate": 8.056520300393642e-07, "loss": 0.0513, "num_input_tokens_seen": 83600144, "step": 124035 }, { "epoch": 3.0303178364644663, "grad_norm": 0.007690535392612219, "learning_rate": 8.055683791643839e-07, "loss": 0.0348, "num_input_tokens_seen": 83603152, "step": 124040 }, { "epoch": 3.0304399872963135, "grad_norm": 0.1448715180158615, "learning_rate": 8.054847297033633e-07, "loss": 0.0882, "num_input_tokens_seen": 83606288, "step": 124045 }, { "epoch": 3.0305621381281607, "grad_norm": 0.009605436585843563, "learning_rate": 8.054010816569094e-07, "loss": 0.0, "num_input_tokens_seen": 83609872, "step": 124050 }, { "epoch": 3.030684288960008, "grad_norm": 0.18111363053321838, "learning_rate": 8.053174350256313e-07, "loss": 0.0001, "num_input_tokens_seen": 83613520, "step": 124055 }, { "epoch": 3.030806439791855, "grad_norm": 0.0024255227763205767, "learning_rate": 8.052337898101376e-07, "loss": 0.0, "num_input_tokens_seen": 83616720, "step": 124060 }, { "epoch": 3.0309285906237022, "grad_norm": 0.09568436443805695, "learning_rate": 8.051501460110357e-07, "loss": 0.0001, "num_input_tokens_seen": 83619984, "step": 124065 }, { "epoch": 3.0310507414555494, "grad_norm": 0.0018284890102222562, "learning_rate": 8.050665036289347e-07, "loss": 0.0, "num_input_tokens_seen": 83622992, "step": 124070 }, { "epoch": 3.0311728922873966, "grad_norm": 0.016492174938321114, "learning_rate": 8.049828626644422e-07, "loss": 0.0, "num_input_tokens_seen": 83626128, "step": 124075 }, { "epoch": 3.031295043119244, "grad_norm": 0.0006634180899709463, "learning_rate": 8.048992231181671e-07, "loss": 0.0098, "num_input_tokens_seen": 83630352, "step": 124080 }, { "epoch": 3.031417193951091, "grad_norm": 0.042172130197286606, "learning_rate": 8.048155849907168e-07, "loss": 0.0, "num_input_tokens_seen": 83633616, "step": 124085 }, { "epoch": 3.031539344782938, "grad_norm": 0.0017520836554467678, "learning_rate": 8.047319482827003e-07, "loss": 0.0571, "num_input_tokens_seen": 83636688, "step": 124090 }, { "epoch": 3.031661495614785, "grad_norm": 0.010810492560267448, "learning_rate": 8.046483129947259e-07, "loss": 0.0, "num_input_tokens_seen": 83640336, "step": 124095 }, { "epoch": 3.031783646446632, "grad_norm": 0.002398022450506687, "learning_rate": 8.045646791274011e-07, "loss": 0.0, "num_input_tokens_seen": 83643600, "step": 124100 }, { "epoch": 3.0319057972784793, "grad_norm": 0.03453930467367172, "learning_rate": 8.04481046681335e-07, "loss": 0.0, "num_input_tokens_seen": 83646992, "step": 124105 }, { "epoch": 3.0320279481103265, "grad_norm": 0.0012411042116582394, "learning_rate": 8.043974156571351e-07, "loss": 0.0001, "num_input_tokens_seen": 83650256, "step": 124110 }, { "epoch": 3.0321500989421737, "grad_norm": 0.0029082736000418663, "learning_rate": 8.043137860554094e-07, "loss": 0.0325, "num_input_tokens_seen": 83653584, "step": 124115 }, { "epoch": 3.032272249774021, "grad_norm": 50.47825622558594, "learning_rate": 8.042301578767671e-07, "loss": 0.0514, "num_input_tokens_seen": 83656592, "step": 124120 }, { "epoch": 3.032394400605868, "grad_norm": 0.06554097682237625, "learning_rate": 8.041465311218153e-07, "loss": 0.0, "num_input_tokens_seen": 83659920, "step": 124125 }, { "epoch": 3.0325165514377153, "grad_norm": 0.03542274609208107, "learning_rate": 8.040629057911629e-07, "loss": 0.0001, "num_input_tokens_seen": 83663376, "step": 124130 }, { "epoch": 3.0326387022695624, "grad_norm": 0.08334421366453171, "learning_rate": 8.039792818854175e-07, "loss": 0.0002, "num_input_tokens_seen": 83666576, "step": 124135 }, { "epoch": 3.0327608531014096, "grad_norm": 0.002318961312994361, "learning_rate": 8.038956594051878e-07, "loss": 0.0, "num_input_tokens_seen": 83670352, "step": 124140 }, { "epoch": 3.032883003933257, "grad_norm": 0.0056270817294716835, "learning_rate": 8.038120383510813e-07, "loss": 0.0501, "num_input_tokens_seen": 83673744, "step": 124145 }, { "epoch": 3.033005154765104, "grad_norm": 0.04084935039281845, "learning_rate": 8.037284187237065e-07, "loss": 0.0001, "num_input_tokens_seen": 83676816, "step": 124150 }, { "epoch": 3.033127305596951, "grad_norm": 0.028429778292775154, "learning_rate": 8.036448005236715e-07, "loss": 0.0, "num_input_tokens_seen": 83680464, "step": 124155 }, { "epoch": 3.0332494564287984, "grad_norm": 0.009558050893247128, "learning_rate": 8.035611837515843e-07, "loss": 0.0, "num_input_tokens_seen": 83683856, "step": 124160 }, { "epoch": 3.0333716072606456, "grad_norm": 0.009309230372309685, "learning_rate": 8.034775684080532e-07, "loss": 0.085, "num_input_tokens_seen": 83686864, "step": 124165 }, { "epoch": 3.0334937580924928, "grad_norm": 0.08569590002298355, "learning_rate": 8.033939544936857e-07, "loss": 0.0001, "num_input_tokens_seen": 83690384, "step": 124170 }, { "epoch": 3.03361590892434, "grad_norm": 0.47182491421699524, "learning_rate": 8.033103420090906e-07, "loss": 0.0002, "num_input_tokens_seen": 83693968, "step": 124175 }, { "epoch": 3.033738059756187, "grad_norm": 0.005380032118409872, "learning_rate": 8.032267309548752e-07, "loss": 0.0, "num_input_tokens_seen": 83697104, "step": 124180 }, { "epoch": 3.033860210588034, "grad_norm": 0.0026039707008749247, "learning_rate": 8.03143121331648e-07, "loss": 0.0, "num_input_tokens_seen": 83700688, "step": 124185 }, { "epoch": 3.033982361419881, "grad_norm": 0.14887675642967224, "learning_rate": 8.030595131400174e-07, "loss": 0.0001, "num_input_tokens_seen": 83704016, "step": 124190 }, { "epoch": 3.0341045122517283, "grad_norm": 0.0026839664205908775, "learning_rate": 8.029759063805906e-07, "loss": 0.0, "num_input_tokens_seen": 83708240, "step": 124195 }, { "epoch": 3.0342266630835755, "grad_norm": 0.01625916361808777, "learning_rate": 8.028923010539763e-07, "loss": 0.0, "num_input_tokens_seen": 83711568, "step": 124200 }, { "epoch": 3.0343488139154227, "grad_norm": 0.01792566291987896, "learning_rate": 8.028086971607818e-07, "loss": 0.0439, "num_input_tokens_seen": 83714896, "step": 124205 }, { "epoch": 3.03447096474727, "grad_norm": 0.001258393982425332, "learning_rate": 8.027250947016157e-07, "loss": 0.0, "num_input_tokens_seen": 83718160, "step": 124210 }, { "epoch": 3.034593115579117, "grad_norm": 0.5693371891975403, "learning_rate": 8.026414936770861e-07, "loss": 0.0002, "num_input_tokens_seen": 83721488, "step": 124215 }, { "epoch": 3.0347152664109642, "grad_norm": 0.017172690480947495, "learning_rate": 8.025578940878001e-07, "loss": 0.0751, "num_input_tokens_seen": 83724432, "step": 124220 }, { "epoch": 3.0348374172428114, "grad_norm": 0.009208094328641891, "learning_rate": 8.024742959343667e-07, "loss": 0.0001, "num_input_tokens_seen": 83728144, "step": 124225 }, { "epoch": 3.0349595680746586, "grad_norm": 0.008969802409410477, "learning_rate": 8.023906992173929e-07, "loss": 0.041, "num_input_tokens_seen": 83731280, "step": 124230 }, { "epoch": 3.035081718906506, "grad_norm": 0.015506453812122345, "learning_rate": 8.023071039374875e-07, "loss": 0.0002, "num_input_tokens_seen": 83734608, "step": 124235 }, { "epoch": 3.035203869738353, "grad_norm": 0.023344608023762703, "learning_rate": 8.022235100952576e-07, "loss": 0.0, "num_input_tokens_seen": 83737936, "step": 124240 }, { "epoch": 3.0353260205702, "grad_norm": 0.0039008702151477337, "learning_rate": 8.021399176913115e-07, "loss": 0.0002, "num_input_tokens_seen": 83741456, "step": 124245 }, { "epoch": 3.0354481714020474, "grad_norm": 0.03828645497560501, "learning_rate": 8.020563267262576e-07, "loss": 0.0, "num_input_tokens_seen": 83744720, "step": 124250 }, { "epoch": 3.0355703222338946, "grad_norm": 0.0011847520945593715, "learning_rate": 8.019727372007028e-07, "loss": 0.0001, "num_input_tokens_seen": 83748368, "step": 124255 }, { "epoch": 3.0356924730657417, "grad_norm": 0.00173528294544667, "learning_rate": 8.01889149115256e-07, "loss": 0.006, "num_input_tokens_seen": 83751376, "step": 124260 }, { "epoch": 3.035814623897589, "grad_norm": 0.0002933296491391957, "learning_rate": 8.018055624705244e-07, "loss": 0.0, "num_input_tokens_seen": 83755152, "step": 124265 }, { "epoch": 3.0359367747294357, "grad_norm": 0.25443074107170105, "learning_rate": 8.017219772671158e-07, "loss": 0.0001, "num_input_tokens_seen": 83758352, "step": 124270 }, { "epoch": 3.036058925561283, "grad_norm": 0.0008950105402618647, "learning_rate": 8.016383935056389e-07, "loss": 0.0, "num_input_tokens_seen": 83761744, "step": 124275 }, { "epoch": 3.03618107639313, "grad_norm": 0.006972425617277622, "learning_rate": 8.015548111867003e-07, "loss": 0.0, "num_input_tokens_seen": 83764816, "step": 124280 }, { "epoch": 3.0363032272249773, "grad_norm": 0.0014357100008055568, "learning_rate": 8.014712303109092e-07, "loss": 0.0001, "num_input_tokens_seen": 83768528, "step": 124285 }, { "epoch": 3.0364253780568244, "grad_norm": 110.88409423828125, "learning_rate": 8.01387650878872e-07, "loss": 0.0751, "num_input_tokens_seen": 83771728, "step": 124290 }, { "epoch": 3.0365475288886716, "grad_norm": 0.001392767415381968, "learning_rate": 8.013040728911977e-07, "loss": 0.0673, "num_input_tokens_seen": 83774928, "step": 124295 }, { "epoch": 3.036669679720519, "grad_norm": 0.0041596959345042706, "learning_rate": 8.012204963484934e-07, "loss": 0.0, "num_input_tokens_seen": 83778512, "step": 124300 }, { "epoch": 3.036791830552366, "grad_norm": 0.0020632941741496325, "learning_rate": 8.011369212513671e-07, "loss": 0.0, "num_input_tokens_seen": 83781520, "step": 124305 }, { "epoch": 3.036913981384213, "grad_norm": 0.006585871335119009, "learning_rate": 8.010533476004267e-07, "loss": 0.0, "num_input_tokens_seen": 83784784, "step": 124310 }, { "epoch": 3.0370361322160604, "grad_norm": 0.0029705450870096684, "learning_rate": 8.009697753962799e-07, "loss": 0.0, "num_input_tokens_seen": 83788112, "step": 124315 }, { "epoch": 3.0371582830479076, "grad_norm": 0.006383042316883802, "learning_rate": 8.008862046395346e-07, "loss": 0.0, "num_input_tokens_seen": 83791376, "step": 124320 }, { "epoch": 3.0372804338797548, "grad_norm": 0.15197601914405823, "learning_rate": 8.00802635330798e-07, "loss": 0.0001, "num_input_tokens_seen": 83795536, "step": 124325 }, { "epoch": 3.037402584711602, "grad_norm": 0.023821311071515083, "learning_rate": 8.007190674706786e-07, "loss": 0.0, "num_input_tokens_seen": 83798480, "step": 124330 }, { "epoch": 3.037524735543449, "grad_norm": 0.0059493849985301495, "learning_rate": 8.006355010597832e-07, "loss": 0.0, "num_input_tokens_seen": 83801680, "step": 124335 }, { "epoch": 3.0376468863752963, "grad_norm": 0.002639399142935872, "learning_rate": 8.005519360987201e-07, "loss": 0.0001, "num_input_tokens_seen": 83805008, "step": 124340 }, { "epoch": 3.0377690372071435, "grad_norm": 0.00016760479775257409, "learning_rate": 8.004683725880976e-07, "loss": 0.0, "num_input_tokens_seen": 83808464, "step": 124345 }, { "epoch": 3.0378911880389907, "grad_norm": 0.046830035746097565, "learning_rate": 8.00384810528522e-07, "loss": 0.0, "num_input_tokens_seen": 83811728, "step": 124350 }, { "epoch": 3.038013338870838, "grad_norm": 0.0029815714806318283, "learning_rate": 8.003012499206025e-07, "loss": 0.0, "num_input_tokens_seen": 83814992, "step": 124355 }, { "epoch": 3.038135489702685, "grad_norm": 0.0005295452428981662, "learning_rate": 8.002176907649454e-07, "loss": 0.0, "num_input_tokens_seen": 83818256, "step": 124360 }, { "epoch": 3.038257640534532, "grad_norm": 0.0025638635270297527, "learning_rate": 8.001341330621593e-07, "loss": 0.0, "num_input_tokens_seen": 83821520, "step": 124365 }, { "epoch": 3.038379791366379, "grad_norm": 0.009190984070301056, "learning_rate": 8.000505768128517e-07, "loss": 0.0, "num_input_tokens_seen": 83824912, "step": 124370 }, { "epoch": 3.0385019421982262, "grad_norm": 0.030233608558773994, "learning_rate": 7.999670220176297e-07, "loss": 0.0, "num_input_tokens_seen": 83828304, "step": 124375 }, { "epoch": 3.0386240930300734, "grad_norm": 0.002085483865812421, "learning_rate": 7.998834686771016e-07, "loss": 0.0, "num_input_tokens_seen": 83832528, "step": 124380 }, { "epoch": 3.0387462438619206, "grad_norm": 27.48931312561035, "learning_rate": 7.997999167918745e-07, "loss": 0.0572, "num_input_tokens_seen": 83835856, "step": 124385 }, { "epoch": 3.038868394693768, "grad_norm": 0.0012941723689436913, "learning_rate": 7.997163663625566e-07, "loss": 0.0, "num_input_tokens_seen": 83839184, "step": 124390 }, { "epoch": 3.038990545525615, "grad_norm": 0.04093189164996147, "learning_rate": 7.996328173897548e-07, "loss": 0.0, "num_input_tokens_seen": 83842448, "step": 124395 }, { "epoch": 3.039112696357462, "grad_norm": 0.0002823161776177585, "learning_rate": 7.995492698740769e-07, "loss": 0.0, "num_input_tokens_seen": 83846288, "step": 124400 }, { "epoch": 3.0392348471893094, "grad_norm": 0.004458566661924124, "learning_rate": 7.994657238161311e-07, "loss": 0.0, "num_input_tokens_seen": 83849872, "step": 124405 }, { "epoch": 3.0393569980211566, "grad_norm": 0.0033806823194026947, "learning_rate": 7.99382179216524e-07, "loss": 0.0, "num_input_tokens_seen": 83853136, "step": 124410 }, { "epoch": 3.0394791488530037, "grad_norm": 0.01399586908519268, "learning_rate": 7.992986360758643e-07, "loss": 0.0302, "num_input_tokens_seen": 83856464, "step": 124415 }, { "epoch": 3.039601299684851, "grad_norm": 0.008270226418972015, "learning_rate": 7.992150943947586e-07, "loss": 0.0, "num_input_tokens_seen": 83859664, "step": 124420 }, { "epoch": 3.039723450516698, "grad_norm": 0.00548640126362443, "learning_rate": 7.991315541738147e-07, "loss": 0.0001, "num_input_tokens_seen": 83862928, "step": 124425 }, { "epoch": 3.0398456013485453, "grad_norm": 0.00551306689158082, "learning_rate": 7.990480154136401e-07, "loss": 0.0, "num_input_tokens_seen": 83865808, "step": 124430 }, { "epoch": 3.0399677521803925, "grad_norm": 0.004245460033416748, "learning_rate": 7.989644781148422e-07, "loss": 0.0001, "num_input_tokens_seen": 83869072, "step": 124435 }, { "epoch": 3.0400899030122397, "grad_norm": 0.008234397508203983, "learning_rate": 7.988809422780292e-07, "loss": 0.0001, "num_input_tokens_seen": 83872592, "step": 124440 }, { "epoch": 3.040212053844087, "grad_norm": 0.008850080892443657, "learning_rate": 7.987974079038076e-07, "loss": 0.0, "num_input_tokens_seen": 83875728, "step": 124445 }, { "epoch": 3.0403342046759336, "grad_norm": 170.21939086914062, "learning_rate": 7.987138749927858e-07, "loss": 0.1003, "num_input_tokens_seen": 83878992, "step": 124450 }, { "epoch": 3.040456355507781, "grad_norm": 0.014337592758238316, "learning_rate": 7.986303435455705e-07, "loss": 0.0, "num_input_tokens_seen": 83881936, "step": 124455 }, { "epoch": 3.040578506339628, "grad_norm": 0.005087694618850946, "learning_rate": 7.985468135627696e-07, "loss": 0.0005, "num_input_tokens_seen": 83885648, "step": 124460 }, { "epoch": 3.040700657171475, "grad_norm": 0.0010110632283613086, "learning_rate": 7.984632850449903e-07, "loss": 0.0, "num_input_tokens_seen": 83889232, "step": 124465 }, { "epoch": 3.0408228080033224, "grad_norm": 0.009701843373477459, "learning_rate": 7.983797579928406e-07, "loss": 0.0, "num_input_tokens_seen": 83892944, "step": 124470 }, { "epoch": 3.0409449588351696, "grad_norm": 0.08975927531719208, "learning_rate": 7.982962324069275e-07, "loss": 0.0, "num_input_tokens_seen": 83895888, "step": 124475 }, { "epoch": 3.0410671096670168, "grad_norm": 0.010530831292271614, "learning_rate": 7.98212708287858e-07, "loss": 0.0, "num_input_tokens_seen": 83899856, "step": 124480 }, { "epoch": 3.041189260498864, "grad_norm": 0.0028245660942047834, "learning_rate": 7.981291856362405e-07, "loss": 0.0, "num_input_tokens_seen": 83903120, "step": 124485 }, { "epoch": 3.041311411330711, "grad_norm": 0.000383858714485541, "learning_rate": 7.980456644526813e-07, "loss": 0.0, "num_input_tokens_seen": 83906384, "step": 124490 }, { "epoch": 3.0414335621625583, "grad_norm": 0.001335768261924386, "learning_rate": 7.979621447377885e-07, "loss": 0.0001, "num_input_tokens_seen": 83909648, "step": 124495 }, { "epoch": 3.0415557129944055, "grad_norm": 0.0008865381241776049, "learning_rate": 7.978786264921695e-07, "loss": 0.0372, "num_input_tokens_seen": 83913040, "step": 124500 }, { "epoch": 3.0416778638262527, "grad_norm": 9.840297570917755e-05, "learning_rate": 7.977951097164312e-07, "loss": 0.0001, "num_input_tokens_seen": 83916944, "step": 124505 }, { "epoch": 3.0418000146581, "grad_norm": 0.012626949697732925, "learning_rate": 7.977115944111819e-07, "loss": 0.0002, "num_input_tokens_seen": 83920016, "step": 124510 }, { "epoch": 3.041922165489947, "grad_norm": 0.0019248174503445625, "learning_rate": 7.976280805770275e-07, "loss": 0.0, "num_input_tokens_seen": 83923600, "step": 124515 }, { "epoch": 3.0420443163217943, "grad_norm": 0.0012115843128412962, "learning_rate": 7.975445682145766e-07, "loss": 0.0, "num_input_tokens_seen": 83926992, "step": 124520 }, { "epoch": 3.0421664671536415, "grad_norm": 0.00032460439251735806, "learning_rate": 7.974610573244362e-07, "loss": 0.0, "num_input_tokens_seen": 83930576, "step": 124525 }, { "epoch": 3.0422886179854887, "grad_norm": 0.009787623770534992, "learning_rate": 7.97377547907213e-07, "loss": 0.0, "num_input_tokens_seen": 83933648, "step": 124530 }, { "epoch": 3.042410768817336, "grad_norm": 0.0008228658698499203, "learning_rate": 7.972940399635153e-07, "loss": 0.0, "num_input_tokens_seen": 83937040, "step": 124535 }, { "epoch": 3.0425329196491826, "grad_norm": 0.003583153011277318, "learning_rate": 7.972105334939493e-07, "loss": 0.0465, "num_input_tokens_seen": 83940240, "step": 124540 }, { "epoch": 3.04265507048103, "grad_norm": 0.0009180400520563126, "learning_rate": 7.971270284991234e-07, "loss": 0.0, "num_input_tokens_seen": 83943376, "step": 124545 }, { "epoch": 3.042777221312877, "grad_norm": 0.04430723562836647, "learning_rate": 7.970435249796438e-07, "loss": 0.0, "num_input_tokens_seen": 83946512, "step": 124550 }, { "epoch": 3.042899372144724, "grad_norm": 0.00021339183149393648, "learning_rate": 7.969600229361181e-07, "loss": 0.0, "num_input_tokens_seen": 83950160, "step": 124555 }, { "epoch": 3.0430215229765714, "grad_norm": 0.001275393646210432, "learning_rate": 7.968765223691544e-07, "loss": 0.0, "num_input_tokens_seen": 83953168, "step": 124560 }, { "epoch": 3.0431436738084185, "grad_norm": 0.20841944217681885, "learning_rate": 7.967930232793589e-07, "loss": 0.0001, "num_input_tokens_seen": 83956176, "step": 124565 }, { "epoch": 3.0432658246402657, "grad_norm": 0.02498796209692955, "learning_rate": 7.967095256673395e-07, "loss": 0.0001, "num_input_tokens_seen": 83959632, "step": 124570 }, { "epoch": 3.043387975472113, "grad_norm": 0.01822241209447384, "learning_rate": 7.966260295337029e-07, "loss": 0.0, "num_input_tokens_seen": 83963280, "step": 124575 }, { "epoch": 3.04351012630396, "grad_norm": 0.010968970134854317, "learning_rate": 7.965425348790564e-07, "loss": 0.0, "num_input_tokens_seen": 83966352, "step": 124580 }, { "epoch": 3.0436322771358073, "grad_norm": 50.25225830078125, "learning_rate": 7.964590417040075e-07, "loss": 0.0751, "num_input_tokens_seen": 83969744, "step": 124585 }, { "epoch": 3.0437544279676545, "grad_norm": 0.0015823058784008026, "learning_rate": 7.963755500091629e-07, "loss": 0.0, "num_input_tokens_seen": 83972944, "step": 124590 }, { "epoch": 3.0438765787995017, "grad_norm": 0.0021922453306615353, "learning_rate": 7.962920597951305e-07, "loss": 0.0001, "num_input_tokens_seen": 83976720, "step": 124595 }, { "epoch": 3.043998729631349, "grad_norm": 0.000679630262311548, "learning_rate": 7.962085710625166e-07, "loss": 0.0, "num_input_tokens_seen": 83979984, "step": 124600 }, { "epoch": 3.044120880463196, "grad_norm": 0.0018834839574992657, "learning_rate": 7.961250838119292e-07, "loss": 0.0, "num_input_tokens_seen": 83983184, "step": 124605 }, { "epoch": 3.0442430312950433, "grad_norm": 0.010868406854569912, "learning_rate": 7.960415980439747e-07, "loss": 0.0727, "num_input_tokens_seen": 83986576, "step": 124610 }, { "epoch": 3.0443651821268904, "grad_norm": 0.029458479955792427, "learning_rate": 7.959581137592606e-07, "loss": 0.0001, "num_input_tokens_seen": 83989776, "step": 124615 }, { "epoch": 3.0444873329587376, "grad_norm": 0.002476650755852461, "learning_rate": 7.95874630958394e-07, "loss": 0.0001, "num_input_tokens_seen": 83993360, "step": 124620 }, { "epoch": 3.044609483790585, "grad_norm": 0.02898203395307064, "learning_rate": 7.957911496419821e-07, "loss": 0.0, "num_input_tokens_seen": 83996752, "step": 124625 }, { "epoch": 3.0447316346224316, "grad_norm": 3.074690539506264e-05, "learning_rate": 7.95707669810632e-07, "loss": 0.0, "num_input_tokens_seen": 84000272, "step": 124630 }, { "epoch": 3.0448537854542788, "grad_norm": 0.00030377108487300575, "learning_rate": 7.956241914649503e-07, "loss": 0.0, "num_input_tokens_seen": 84003536, "step": 124635 }, { "epoch": 3.044975936286126, "grad_norm": 0.005053306929767132, "learning_rate": 7.955407146055448e-07, "loss": 0.0, "num_input_tokens_seen": 84007056, "step": 124640 }, { "epoch": 3.045098087117973, "grad_norm": 0.0005974622326903045, "learning_rate": 7.954572392330219e-07, "loss": 0.0, "num_input_tokens_seen": 84010064, "step": 124645 }, { "epoch": 3.0452202379498203, "grad_norm": 30.14084243774414, "learning_rate": 7.953737653479889e-07, "loss": 0.0814, "num_input_tokens_seen": 84013584, "step": 124650 }, { "epoch": 3.0453423887816675, "grad_norm": 0.0002505712036509067, "learning_rate": 7.952902929510534e-07, "loss": 0.0625, "num_input_tokens_seen": 84017168, "step": 124655 }, { "epoch": 3.0454645396135147, "grad_norm": 0.0006067760987207294, "learning_rate": 7.952068220428215e-07, "loss": 0.0, "num_input_tokens_seen": 84020944, "step": 124660 }, { "epoch": 3.045586690445362, "grad_norm": 0.017085615545511246, "learning_rate": 7.951233526239012e-07, "loss": 0.0225, "num_input_tokens_seen": 84024336, "step": 124665 }, { "epoch": 3.045708841277209, "grad_norm": 0.003791298484429717, "learning_rate": 7.950398846948984e-07, "loss": 0.0, "num_input_tokens_seen": 84027728, "step": 124670 }, { "epoch": 3.0458309921090563, "grad_norm": 0.012227640487253666, "learning_rate": 7.949564182564209e-07, "loss": 0.0, "num_input_tokens_seen": 84030928, "step": 124675 }, { "epoch": 3.0459531429409035, "grad_norm": 0.001645068172365427, "learning_rate": 7.948729533090758e-07, "loss": 0.0, "num_input_tokens_seen": 84034448, "step": 124680 }, { "epoch": 3.0460752937727507, "grad_norm": 0.0035828205291181803, "learning_rate": 7.947894898534693e-07, "loss": 0.0378, "num_input_tokens_seen": 84037520, "step": 124685 }, { "epoch": 3.046197444604598, "grad_norm": 0.0012684467947110534, "learning_rate": 7.947060278902091e-07, "loss": 0.0, "num_input_tokens_seen": 84041488, "step": 124690 }, { "epoch": 3.046319595436445, "grad_norm": 0.00011037226795451716, "learning_rate": 7.946225674199017e-07, "loss": 0.0411, "num_input_tokens_seen": 84044624, "step": 124695 }, { "epoch": 3.0464417462682922, "grad_norm": 0.0008721097256056964, "learning_rate": 7.945391084431546e-07, "loss": 0.0, "num_input_tokens_seen": 84047888, "step": 124700 }, { "epoch": 3.0465638971001394, "grad_norm": 0.012080611661076546, "learning_rate": 7.944556509605737e-07, "loss": 0.0, "num_input_tokens_seen": 84051344, "step": 124705 }, { "epoch": 3.0466860479319866, "grad_norm": 0.0017130423802882433, "learning_rate": 7.943721949727668e-07, "loss": 0.0002, "num_input_tokens_seen": 84054416, "step": 124710 }, { "epoch": 3.046808198763834, "grad_norm": 0.0068734074011445045, "learning_rate": 7.94288740480341e-07, "loss": 0.0, "num_input_tokens_seen": 84057872, "step": 124715 }, { "epoch": 3.0469303495956805, "grad_norm": 0.02271157130599022, "learning_rate": 7.942052874839024e-07, "loss": 0.0, "num_input_tokens_seen": 84060880, "step": 124720 }, { "epoch": 3.0470525004275277, "grad_norm": 0.009061945602297783, "learning_rate": 7.941218359840587e-07, "loss": 0.0, "num_input_tokens_seen": 84063952, "step": 124725 }, { "epoch": 3.047174651259375, "grad_norm": 22.329303741455078, "learning_rate": 7.940383859814162e-07, "loss": 0.0725, "num_input_tokens_seen": 84067472, "step": 124730 }, { "epoch": 3.047296802091222, "grad_norm": 0.033782485872507095, "learning_rate": 7.93954937476582e-07, "loss": 0.0, "num_input_tokens_seen": 84071312, "step": 124735 }, { "epoch": 3.0474189529230693, "grad_norm": 0.001782033359631896, "learning_rate": 7.938714904701627e-07, "loss": 0.0001, "num_input_tokens_seen": 84074960, "step": 124740 }, { "epoch": 3.0475411037549165, "grad_norm": 0.001810292131267488, "learning_rate": 7.937880449627655e-07, "loss": 0.0365, "num_input_tokens_seen": 84078224, "step": 124745 }, { "epoch": 3.0476632545867637, "grad_norm": 0.0007144034607335925, "learning_rate": 7.937046009549971e-07, "loss": 0.012, "num_input_tokens_seen": 84081680, "step": 124750 }, { "epoch": 3.047785405418611, "grad_norm": 0.020684758201241493, "learning_rate": 7.936211584474641e-07, "loss": 0.0002, "num_input_tokens_seen": 84085008, "step": 124755 }, { "epoch": 3.047907556250458, "grad_norm": 0.0005914786597713828, "learning_rate": 7.935377174407742e-07, "loss": 0.1145, "num_input_tokens_seen": 84088080, "step": 124760 }, { "epoch": 3.0480297070823053, "grad_norm": 0.0006892129895277321, "learning_rate": 7.934542779355329e-07, "loss": 0.0, "num_input_tokens_seen": 84091472, "step": 124765 }, { "epoch": 3.0481518579141524, "grad_norm": 194.6983184814453, "learning_rate": 7.933708399323478e-07, "loss": 0.035, "num_input_tokens_seen": 84094736, "step": 124770 }, { "epoch": 3.0482740087459996, "grad_norm": 0.06471144407987595, "learning_rate": 7.932874034318256e-07, "loss": 0.0, "num_input_tokens_seen": 84098000, "step": 124775 }, { "epoch": 3.048396159577847, "grad_norm": 0.000554750207811594, "learning_rate": 7.932039684345731e-07, "loss": 0.0738, "num_input_tokens_seen": 84101008, "step": 124780 }, { "epoch": 3.048518310409694, "grad_norm": 0.002123037585988641, "learning_rate": 7.93120534941197e-07, "loss": 0.0001, "num_input_tokens_seen": 84104080, "step": 124785 }, { "epoch": 3.048640461241541, "grad_norm": 0.002121489029377699, "learning_rate": 7.930371029523037e-07, "loss": 0.0001, "num_input_tokens_seen": 84107792, "step": 124790 }, { "epoch": 3.0487626120733884, "grad_norm": 0.0026022300589829683, "learning_rate": 7.929536724685006e-07, "loss": 0.0004, "num_input_tokens_seen": 84111504, "step": 124795 }, { "epoch": 3.0488847629052356, "grad_norm": 0.006417084950953722, "learning_rate": 7.928702434903938e-07, "loss": 0.0001, "num_input_tokens_seen": 84114832, "step": 124800 }, { "epoch": 3.0490069137370828, "grad_norm": 0.001482818159274757, "learning_rate": 7.927868160185901e-07, "loss": 0.0002, "num_input_tokens_seen": 84118224, "step": 124805 }, { "epoch": 3.0491290645689295, "grad_norm": 0.02109672501683235, "learning_rate": 7.92703390053697e-07, "loss": 0.0, "num_input_tokens_seen": 84121360, "step": 124810 }, { "epoch": 3.0492512154007767, "grad_norm": 0.017897358164191246, "learning_rate": 7.926199655963201e-07, "loss": 0.0004, "num_input_tokens_seen": 84124432, "step": 124815 }, { "epoch": 3.049373366232624, "grad_norm": 0.01913967914879322, "learning_rate": 7.92536542647067e-07, "loss": 0.0, "num_input_tokens_seen": 84127376, "step": 124820 }, { "epoch": 3.049495517064471, "grad_norm": 0.12012538313865662, "learning_rate": 7.924531212065436e-07, "loss": 0.0001, "num_input_tokens_seen": 84130384, "step": 124825 }, { "epoch": 3.0496176678963183, "grad_norm": 0.01247513946145773, "learning_rate": 7.923697012753572e-07, "loss": 0.0, "num_input_tokens_seen": 84133328, "step": 124830 }, { "epoch": 3.0497398187281655, "grad_norm": 0.000963643251452595, "learning_rate": 7.922862828541143e-07, "loss": 0.0, "num_input_tokens_seen": 84136592, "step": 124835 }, { "epoch": 3.0498619695600127, "grad_norm": 0.01684432663023472, "learning_rate": 7.922028659434209e-07, "loss": 0.0, "num_input_tokens_seen": 84140112, "step": 124840 }, { "epoch": 3.04998412039186, "grad_norm": 0.007283932529389858, "learning_rate": 7.921194505438848e-07, "loss": 0.0001, "num_input_tokens_seen": 84143760, "step": 124845 }, { "epoch": 3.050106271223707, "grad_norm": 0.004185799043625593, "learning_rate": 7.920360366561113e-07, "loss": 0.0, "num_input_tokens_seen": 84147152, "step": 124850 }, { "epoch": 3.0502284220555542, "grad_norm": 0.002467324025928974, "learning_rate": 7.919526242807082e-07, "loss": 0.0, "num_input_tokens_seen": 84150288, "step": 124855 }, { "epoch": 3.0503505728874014, "grad_norm": 0.007444701623171568, "learning_rate": 7.918692134182812e-07, "loss": 0.0, "num_input_tokens_seen": 84153424, "step": 124860 }, { "epoch": 3.0504727237192486, "grad_norm": 0.006697532255202532, "learning_rate": 7.917858040694372e-07, "loss": 0.0, "num_input_tokens_seen": 84156880, "step": 124865 }, { "epoch": 3.050594874551096, "grad_norm": 0.003599474672228098, "learning_rate": 7.917023962347833e-07, "loss": 0.0, "num_input_tokens_seen": 84160336, "step": 124870 }, { "epoch": 3.050717025382943, "grad_norm": 0.0016461735358461738, "learning_rate": 7.916189899149251e-07, "loss": 0.0, "num_input_tokens_seen": 84164048, "step": 124875 }, { "epoch": 3.05083917621479, "grad_norm": 0.0001344871852779761, "learning_rate": 7.915355851104701e-07, "loss": 0.0, "num_input_tokens_seen": 84167376, "step": 124880 }, { "epoch": 3.0509613270466374, "grad_norm": 0.192990243434906, "learning_rate": 7.914521818220243e-07, "loss": 0.0001, "num_input_tokens_seen": 84170640, "step": 124885 }, { "epoch": 3.0510834778784846, "grad_norm": 0.006739953067153692, "learning_rate": 7.913687800501942e-07, "loss": 0.0679, "num_input_tokens_seen": 84174352, "step": 124890 }, { "epoch": 3.0512056287103313, "grad_norm": 0.0011157767148688436, "learning_rate": 7.912853797955866e-07, "loss": 0.0001, "num_input_tokens_seen": 84177616, "step": 124895 }, { "epoch": 3.0513277795421785, "grad_norm": 0.0006541142356581986, "learning_rate": 7.912019810588075e-07, "loss": 0.0, "num_input_tokens_seen": 84180752, "step": 124900 }, { "epoch": 3.0514499303740257, "grad_norm": 0.013899991288781166, "learning_rate": 7.911185838404642e-07, "loss": 0.0, "num_input_tokens_seen": 84183824, "step": 124905 }, { "epoch": 3.051572081205873, "grad_norm": 0.11817273497581482, "learning_rate": 7.910351881411624e-07, "loss": 0.0, "num_input_tokens_seen": 84187280, "step": 124910 }, { "epoch": 3.05169423203772, "grad_norm": 0.020235439762473106, "learning_rate": 7.909517939615092e-07, "loss": 0.0002, "num_input_tokens_seen": 84190480, "step": 124915 }, { "epoch": 3.0518163828695672, "grad_norm": 0.003976785112172365, "learning_rate": 7.908684013021106e-07, "loss": 0.0, "num_input_tokens_seen": 84194448, "step": 124920 }, { "epoch": 3.0519385337014144, "grad_norm": 0.014983579516410828, "learning_rate": 7.907850101635731e-07, "loss": 0.0, "num_input_tokens_seen": 84198224, "step": 124925 }, { "epoch": 3.0520606845332616, "grad_norm": 0.00013353618851397187, "learning_rate": 7.907016205465035e-07, "loss": 0.0, "num_input_tokens_seen": 84201488, "step": 124930 }, { "epoch": 3.052182835365109, "grad_norm": 0.005031253211200237, "learning_rate": 7.906182324515079e-07, "loss": 0.0648, "num_input_tokens_seen": 84205072, "step": 124935 }, { "epoch": 3.052304986196956, "grad_norm": 8.619089203421026e-05, "learning_rate": 7.905348458791932e-07, "loss": 0.0, "num_input_tokens_seen": 84209040, "step": 124940 }, { "epoch": 3.052427137028803, "grad_norm": 0.0009526963112875819, "learning_rate": 7.904514608301649e-07, "loss": 0.0001, "num_input_tokens_seen": 84212368, "step": 124945 }, { "epoch": 3.0525492878606504, "grad_norm": 0.0012412937358021736, "learning_rate": 7.903680773050304e-07, "loss": 0.0, "num_input_tokens_seen": 84215568, "step": 124950 }, { "epoch": 3.0526714386924976, "grad_norm": 0.020034687593579292, "learning_rate": 7.902846953043951e-07, "loss": 0.0, "num_input_tokens_seen": 84219024, "step": 124955 }, { "epoch": 3.0527935895243448, "grad_norm": 0.0002694456488825381, "learning_rate": 7.902013148288661e-07, "loss": 0.0245, "num_input_tokens_seen": 84222544, "step": 124960 }, { "epoch": 3.052915740356192, "grad_norm": 0.10132147371768951, "learning_rate": 7.901179358790499e-07, "loss": 0.0001, "num_input_tokens_seen": 84225680, "step": 124965 }, { "epoch": 3.053037891188039, "grad_norm": 0.009896229021251202, "learning_rate": 7.90034558455552e-07, "loss": 0.0001, "num_input_tokens_seen": 84229392, "step": 124970 }, { "epoch": 3.0531600420198863, "grad_norm": 0.004478112794458866, "learning_rate": 7.899511825589798e-07, "loss": 0.0, "num_input_tokens_seen": 84232528, "step": 124975 }, { "epoch": 3.0532821928517335, "grad_norm": 0.044881828129291534, "learning_rate": 7.898678081899386e-07, "loss": 0.0, "num_input_tokens_seen": 84235600, "step": 124980 }, { "epoch": 3.0534043436835803, "grad_norm": 0.007525295484811068, "learning_rate": 7.897844353490355e-07, "loss": 0.0, "num_input_tokens_seen": 84239440, "step": 124985 }, { "epoch": 3.0535264945154275, "grad_norm": 0.03217082843184471, "learning_rate": 7.897010640368767e-07, "loss": 0.0, "num_input_tokens_seen": 84242896, "step": 124990 }, { "epoch": 3.0536486453472746, "grad_norm": 0.005499767605215311, "learning_rate": 7.89617694254068e-07, "loss": 0.004, "num_input_tokens_seen": 84246352, "step": 124995 }, { "epoch": 3.053770796179122, "grad_norm": 0.011192183010280132, "learning_rate": 7.895343260012163e-07, "loss": 0.0328, "num_input_tokens_seen": 84249488, "step": 125000 }, { "epoch": 3.053892947010969, "grad_norm": 0.0028834501281380653, "learning_rate": 7.894509592789272e-07, "loss": 0.0572, "num_input_tokens_seen": 84252688, "step": 125005 }, { "epoch": 3.054015097842816, "grad_norm": 0.001355986576527357, "learning_rate": 7.893675940878079e-07, "loss": 0.0455, "num_input_tokens_seen": 84256336, "step": 125010 }, { "epoch": 3.0541372486746634, "grad_norm": 0.007849025540053844, "learning_rate": 7.892842304284634e-07, "loss": 0.0, "num_input_tokens_seen": 84259920, "step": 125015 }, { "epoch": 3.0542593995065106, "grad_norm": 0.002404062310233712, "learning_rate": 7.892008683015009e-07, "loss": 0.0291, "num_input_tokens_seen": 84262992, "step": 125020 }, { "epoch": 3.054381550338358, "grad_norm": 0.0010292022489011288, "learning_rate": 7.891175077075267e-07, "loss": 0.0408, "num_input_tokens_seen": 84266640, "step": 125025 }, { "epoch": 3.054503701170205, "grad_norm": 0.0029042158275842667, "learning_rate": 7.890341486471464e-07, "loss": 0.0, "num_input_tokens_seen": 84270544, "step": 125030 }, { "epoch": 3.054625852002052, "grad_norm": 0.008272890001535416, "learning_rate": 7.889507911209669e-07, "loss": 0.0, "num_input_tokens_seen": 84274000, "step": 125035 }, { "epoch": 3.0547480028338994, "grad_norm": 0.0023922626860439777, "learning_rate": 7.888674351295937e-07, "loss": 0.0, "num_input_tokens_seen": 84277008, "step": 125040 }, { "epoch": 3.0548701536657465, "grad_norm": 0.10242076218128204, "learning_rate": 7.887840806736335e-07, "loss": 0.0001, "num_input_tokens_seen": 84280912, "step": 125045 }, { "epoch": 3.0549923044975937, "grad_norm": 0.002840844914317131, "learning_rate": 7.88700727753692e-07, "loss": 0.0002, "num_input_tokens_seen": 84283984, "step": 125050 }, { "epoch": 3.055114455329441, "grad_norm": 0.0061529530212283134, "learning_rate": 7.886173763703756e-07, "loss": 0.0002, "num_input_tokens_seen": 84287056, "step": 125055 }, { "epoch": 3.055236606161288, "grad_norm": 0.0021191227715462446, "learning_rate": 7.885340265242909e-07, "loss": 0.0, "num_input_tokens_seen": 84291088, "step": 125060 }, { "epoch": 3.0553587569931353, "grad_norm": 0.0029413315933197737, "learning_rate": 7.884506782160431e-07, "loss": 0.0, "num_input_tokens_seen": 84294352, "step": 125065 }, { "epoch": 3.0554809078249825, "grad_norm": 0.04265553131699562, "learning_rate": 7.883673314462394e-07, "loss": 0.0572, "num_input_tokens_seen": 84297680, "step": 125070 }, { "epoch": 3.0556030586568292, "grad_norm": 0.002523405710235238, "learning_rate": 7.882839862154849e-07, "loss": 0.0003, "num_input_tokens_seen": 84300880, "step": 125075 }, { "epoch": 3.0557252094886764, "grad_norm": 0.004484863486140966, "learning_rate": 7.882006425243866e-07, "loss": 0.0002, "num_input_tokens_seen": 84304272, "step": 125080 }, { "epoch": 3.0558473603205236, "grad_norm": 0.05211414024233818, "learning_rate": 7.8811730037355e-07, "loss": 0.0001, "num_input_tokens_seen": 84307472, "step": 125085 }, { "epoch": 3.055969511152371, "grad_norm": 0.0016424644272774458, "learning_rate": 7.880339597635814e-07, "loss": 0.0, "num_input_tokens_seen": 84310736, "step": 125090 }, { "epoch": 3.056091661984218, "grad_norm": 0.0005251190159469843, "learning_rate": 7.879506206950872e-07, "loss": 0.0, "num_input_tokens_seen": 84314000, "step": 125095 }, { "epoch": 3.056213812816065, "grad_norm": 0.0010125606786459684, "learning_rate": 7.878672831686725e-07, "loss": 0.0, "num_input_tokens_seen": 84317200, "step": 125100 }, { "epoch": 3.0563359636479124, "grad_norm": 0.006290761288255453, "learning_rate": 7.877839471849446e-07, "loss": 0.0, "num_input_tokens_seen": 84321232, "step": 125105 }, { "epoch": 3.0564581144797596, "grad_norm": 0.0041360314935445786, "learning_rate": 7.877006127445084e-07, "loss": 0.0, "num_input_tokens_seen": 84325264, "step": 125110 }, { "epoch": 3.0565802653116068, "grad_norm": 0.0011427260469645262, "learning_rate": 7.876172798479703e-07, "loss": 0.0034, "num_input_tokens_seen": 84328592, "step": 125115 }, { "epoch": 3.056702416143454, "grad_norm": 0.004253493621945381, "learning_rate": 7.875339484959371e-07, "loss": 0.0, "num_input_tokens_seen": 84331728, "step": 125120 }, { "epoch": 3.056824566975301, "grad_norm": 0.038184262812137604, "learning_rate": 7.874506186890138e-07, "loss": 0.0001, "num_input_tokens_seen": 84335248, "step": 125125 }, { "epoch": 3.0569467178071483, "grad_norm": 0.0009650553110986948, "learning_rate": 7.873672904278069e-07, "loss": 0.0, "num_input_tokens_seen": 84338384, "step": 125130 }, { "epoch": 3.0570688686389955, "grad_norm": 0.0030284132808446884, "learning_rate": 7.87283963712922e-07, "loss": 0.0001, "num_input_tokens_seen": 84342032, "step": 125135 }, { "epoch": 3.0571910194708427, "grad_norm": 15.960134506225586, "learning_rate": 7.872006385449658e-07, "loss": 0.035, "num_input_tokens_seen": 84345232, "step": 125140 }, { "epoch": 3.05731317030269, "grad_norm": 0.0062057022005319595, "learning_rate": 7.871173149245436e-07, "loss": 0.0, "num_input_tokens_seen": 84348880, "step": 125145 }, { "epoch": 3.057435321134537, "grad_norm": 0.0008573241648264229, "learning_rate": 7.870339928522613e-07, "loss": 0.0, "num_input_tokens_seen": 84352400, "step": 125150 }, { "epoch": 3.0575574719663843, "grad_norm": 0.0005532007780857384, "learning_rate": 7.869506723287254e-07, "loss": 0.0, "num_input_tokens_seen": 84355728, "step": 125155 }, { "epoch": 3.0576796227982315, "grad_norm": 0.022176533937454224, "learning_rate": 7.868673533545411e-07, "loss": 0.0, "num_input_tokens_seen": 84359568, "step": 125160 }, { "epoch": 3.057801773630078, "grad_norm": 0.01318781916052103, "learning_rate": 7.867840359303153e-07, "loss": 0.0, "num_input_tokens_seen": 84362768, "step": 125165 }, { "epoch": 3.0579239244619254, "grad_norm": 0.00014520602417178452, "learning_rate": 7.867007200566527e-07, "loss": 0.0465, "num_input_tokens_seen": 84366480, "step": 125170 }, { "epoch": 3.0580460752937726, "grad_norm": 0.04776505380868912, "learning_rate": 7.866174057341601e-07, "loss": 0.0788, "num_input_tokens_seen": 84369744, "step": 125175 }, { "epoch": 3.05816822612562, "grad_norm": 5.101992792333476e-05, "learning_rate": 7.865340929634434e-07, "loss": 0.0002, "num_input_tokens_seen": 84372816, "step": 125180 }, { "epoch": 3.058290376957467, "grad_norm": 0.004976267460733652, "learning_rate": 7.864507817451079e-07, "loss": 0.031, "num_input_tokens_seen": 84376208, "step": 125185 }, { "epoch": 3.058412527789314, "grad_norm": 0.013150850310921669, "learning_rate": 7.8636747207976e-07, "loss": 0.0, "num_input_tokens_seen": 84379856, "step": 125190 }, { "epoch": 3.0585346786211614, "grad_norm": 0.0034766318276524544, "learning_rate": 7.862841639680052e-07, "loss": 0.0, "num_input_tokens_seen": 84383248, "step": 125195 }, { "epoch": 3.0586568294530085, "grad_norm": 0.002520632930099964, "learning_rate": 7.862008574104493e-07, "loss": 0.0, "num_input_tokens_seen": 84386448, "step": 125200 }, { "epoch": 3.0587789802848557, "grad_norm": 0.012875786982476711, "learning_rate": 7.861175524076984e-07, "loss": 0.0, "num_input_tokens_seen": 84389840, "step": 125205 }, { "epoch": 3.058901131116703, "grad_norm": 0.009824838489294052, "learning_rate": 7.860342489603578e-07, "loss": 0.0, "num_input_tokens_seen": 84392976, "step": 125210 }, { "epoch": 3.05902328194855, "grad_norm": 0.0018767307046800852, "learning_rate": 7.859509470690343e-07, "loss": 0.0383, "num_input_tokens_seen": 84396368, "step": 125215 }, { "epoch": 3.0591454327803973, "grad_norm": 0.001162689528428018, "learning_rate": 7.858676467343326e-07, "loss": 0.0, "num_input_tokens_seen": 84399376, "step": 125220 }, { "epoch": 3.0592675836122445, "grad_norm": 0.0010780501179397106, "learning_rate": 7.857843479568595e-07, "loss": 0.0008, "num_input_tokens_seen": 84402576, "step": 125225 }, { "epoch": 3.0593897344440917, "grad_norm": 0.0029626989271491766, "learning_rate": 7.857010507372197e-07, "loss": 0.0, "num_input_tokens_seen": 84405904, "step": 125230 }, { "epoch": 3.059511885275939, "grad_norm": 120.70935821533203, "learning_rate": 7.856177550760197e-07, "loss": 0.0201, "num_input_tokens_seen": 84409104, "step": 125235 }, { "epoch": 3.059634036107786, "grad_norm": 0.0020443014800548553, "learning_rate": 7.85534460973865e-07, "loss": 0.0, "num_input_tokens_seen": 84412752, "step": 125240 }, { "epoch": 3.0597561869396332, "grad_norm": 0.00010733292583609, "learning_rate": 7.854511684313615e-07, "loss": 0.0, "num_input_tokens_seen": 84415952, "step": 125245 }, { "epoch": 3.0598783377714804, "grad_norm": 0.00694976095110178, "learning_rate": 7.85367877449115e-07, "loss": 0.0523, "num_input_tokens_seen": 84419216, "step": 125250 }, { "epoch": 3.060000488603327, "grad_norm": 0.0008842891547828913, "learning_rate": 7.852845880277306e-07, "loss": 0.0, "num_input_tokens_seen": 84422928, "step": 125255 }, { "epoch": 3.0601226394351744, "grad_norm": 0.0027291791047900915, "learning_rate": 7.852013001678149e-07, "loss": 0.0871, "num_input_tokens_seen": 84426384, "step": 125260 }, { "epoch": 3.0602447902670216, "grad_norm": 0.032263994216918945, "learning_rate": 7.851180138699725e-07, "loss": 0.0001, "num_input_tokens_seen": 84429776, "step": 125265 }, { "epoch": 3.0603669410988688, "grad_norm": 0.035991132259368896, "learning_rate": 7.850347291348098e-07, "loss": 0.0001, "num_input_tokens_seen": 84433424, "step": 125270 }, { "epoch": 3.060489091930716, "grad_norm": 0.09364461898803711, "learning_rate": 7.849514459629329e-07, "loss": 0.0001, "num_input_tokens_seen": 84436432, "step": 125275 }, { "epoch": 3.060611242762563, "grad_norm": 0.009549515321850777, "learning_rate": 7.848681643549464e-07, "loss": 0.0, "num_input_tokens_seen": 84440016, "step": 125280 }, { "epoch": 3.0607333935944103, "grad_norm": 0.009542601183056831, "learning_rate": 7.84784884311457e-07, "loss": 0.0, "num_input_tokens_seen": 84443024, "step": 125285 }, { "epoch": 3.0608555444262575, "grad_norm": 0.0017092217458412051, "learning_rate": 7.847016058330692e-07, "loss": 0.0001, "num_input_tokens_seen": 84446416, "step": 125290 }, { "epoch": 3.0609776952581047, "grad_norm": 0.0002163954050047323, "learning_rate": 7.846183289203898e-07, "loss": 0.0325, "num_input_tokens_seen": 84449488, "step": 125295 }, { "epoch": 3.061099846089952, "grad_norm": 0.0009846043540164828, "learning_rate": 7.845350535740236e-07, "loss": 0.0, "num_input_tokens_seen": 84453328, "step": 125300 }, { "epoch": 3.061221996921799, "grad_norm": 0.002706692088395357, "learning_rate": 7.844517797945763e-07, "loss": 0.0002, "num_input_tokens_seen": 84456656, "step": 125305 }, { "epoch": 3.0613441477536463, "grad_norm": 0.0011171639198437333, "learning_rate": 7.843685075826538e-07, "loss": 0.0001, "num_input_tokens_seen": 84459728, "step": 125310 }, { "epoch": 3.0614662985854935, "grad_norm": 0.1492723524570465, "learning_rate": 7.842852369388612e-07, "loss": 0.0001, "num_input_tokens_seen": 84462800, "step": 125315 }, { "epoch": 3.0615884494173407, "grad_norm": 0.0004917579353787005, "learning_rate": 7.84201967863805e-07, "loss": 0.0, "num_input_tokens_seen": 84465872, "step": 125320 }, { "epoch": 3.061710600249188, "grad_norm": 0.0017128594918176532, "learning_rate": 7.841187003580895e-07, "loss": 0.0, "num_input_tokens_seen": 84470032, "step": 125325 }, { "epoch": 3.061832751081035, "grad_norm": 0.00024660531198605895, "learning_rate": 7.84035434422321e-07, "loss": 0.0, "num_input_tokens_seen": 84473296, "step": 125330 }, { "epoch": 3.061954901912882, "grad_norm": 0.0023638547863811255, "learning_rate": 7.839521700571053e-07, "loss": 0.0453, "num_input_tokens_seen": 84476816, "step": 125335 }, { "epoch": 3.062077052744729, "grad_norm": 0.0002841923851519823, "learning_rate": 7.838689072630471e-07, "loss": 0.0, "num_input_tokens_seen": 84480528, "step": 125340 }, { "epoch": 3.062199203576576, "grad_norm": 0.0006632845615968108, "learning_rate": 7.837856460407527e-07, "loss": 0.0, "num_input_tokens_seen": 84484112, "step": 125345 }, { "epoch": 3.0623213544084233, "grad_norm": 0.12877187132835388, "learning_rate": 7.837023863908271e-07, "loss": 0.0, "num_input_tokens_seen": 84487696, "step": 125350 }, { "epoch": 3.0624435052402705, "grad_norm": 0.008744907565414906, "learning_rate": 7.836191283138759e-07, "loss": 0.0, "num_input_tokens_seen": 84491280, "step": 125355 }, { "epoch": 3.0625656560721177, "grad_norm": 0.0188330989331007, "learning_rate": 7.835358718105046e-07, "loss": 0.0, "num_input_tokens_seen": 84494416, "step": 125360 }, { "epoch": 3.062687806903965, "grad_norm": 0.014633757062256336, "learning_rate": 7.834526168813185e-07, "loss": 0.0001, "num_input_tokens_seen": 84497360, "step": 125365 }, { "epoch": 3.062809957735812, "grad_norm": 0.0016165561974048615, "learning_rate": 7.833693635269236e-07, "loss": 0.0, "num_input_tokens_seen": 84500624, "step": 125370 }, { "epoch": 3.0629321085676593, "grad_norm": 0.000309811148326844, "learning_rate": 7.832861117479245e-07, "loss": 0.0, "num_input_tokens_seen": 84503952, "step": 125375 }, { "epoch": 3.0630542593995065, "grad_norm": 0.017329243943095207, "learning_rate": 7.832028615449275e-07, "loss": 0.0001, "num_input_tokens_seen": 84507152, "step": 125380 }, { "epoch": 3.0631764102313537, "grad_norm": 0.0019927939865738153, "learning_rate": 7.831196129185371e-07, "loss": 0.0, "num_input_tokens_seen": 84510416, "step": 125385 }, { "epoch": 3.063298561063201, "grad_norm": 0.0007572838803753257, "learning_rate": 7.830363658693596e-07, "loss": 0.0, "num_input_tokens_seen": 84513936, "step": 125390 }, { "epoch": 3.063420711895048, "grad_norm": 0.003489340888336301, "learning_rate": 7.82953120398e-07, "loss": 0.0001, "num_input_tokens_seen": 84517392, "step": 125395 }, { "epoch": 3.0635428627268952, "grad_norm": 0.05194596201181412, "learning_rate": 7.828698765050636e-07, "loss": 0.0001, "num_input_tokens_seen": 84520784, "step": 125400 }, { "epoch": 3.0636650135587424, "grad_norm": 0.01658715307712555, "learning_rate": 7.827866341911558e-07, "loss": 0.0, "num_input_tokens_seen": 84524048, "step": 125405 }, { "epoch": 3.0637871643905896, "grad_norm": 0.00041749561205506325, "learning_rate": 7.827033934568821e-07, "loss": 0.0, "num_input_tokens_seen": 84527248, "step": 125410 }, { "epoch": 3.063909315222437, "grad_norm": 0.001444365014322102, "learning_rate": 7.82620154302848e-07, "loss": 0.0, "num_input_tokens_seen": 84530576, "step": 125415 }, { "epoch": 3.064031466054284, "grad_norm": 0.0007564245024695992, "learning_rate": 7.825369167296581e-07, "loss": 0.0, "num_input_tokens_seen": 84533776, "step": 125420 }, { "epoch": 3.064153616886131, "grad_norm": 0.0001561980025144294, "learning_rate": 7.824536807379183e-07, "loss": 0.0001, "num_input_tokens_seen": 84537424, "step": 125425 }, { "epoch": 3.0642757677179784, "grad_norm": 0.002753429114818573, "learning_rate": 7.823704463282342e-07, "loss": 0.0, "num_input_tokens_seen": 84540688, "step": 125430 }, { "epoch": 3.064397918549825, "grad_norm": 0.0050643994472920895, "learning_rate": 7.822872135012104e-07, "loss": 0.0, "num_input_tokens_seen": 84544144, "step": 125435 }, { "epoch": 3.0645200693816723, "grad_norm": 0.0004163897247053683, "learning_rate": 7.82203982257453e-07, "loss": 0.0003, "num_input_tokens_seen": 84547536, "step": 125440 }, { "epoch": 3.0646422202135195, "grad_norm": 0.004049566108733416, "learning_rate": 7.821207525975664e-07, "loss": 0.0, "num_input_tokens_seen": 84551056, "step": 125445 }, { "epoch": 3.0647643710453667, "grad_norm": 0.0017293097916990519, "learning_rate": 7.820375245221567e-07, "loss": 0.0, "num_input_tokens_seen": 84554448, "step": 125450 }, { "epoch": 3.064886521877214, "grad_norm": 0.0007199915708042681, "learning_rate": 7.819542980318283e-07, "loss": 0.0, "num_input_tokens_seen": 84557840, "step": 125455 }, { "epoch": 3.065008672709061, "grad_norm": 0.007718597538769245, "learning_rate": 7.81871073127187e-07, "loss": 0.0, "num_input_tokens_seen": 84561424, "step": 125460 }, { "epoch": 3.0651308235409083, "grad_norm": 0.005239197518676519, "learning_rate": 7.817878498088382e-07, "loss": 0.0, "num_input_tokens_seen": 84565136, "step": 125465 }, { "epoch": 3.0652529743727555, "grad_norm": 0.0002189621445722878, "learning_rate": 7.817046280773864e-07, "loss": 0.0, "num_input_tokens_seen": 84568592, "step": 125470 }, { "epoch": 3.0653751252046026, "grad_norm": 0.0008673128322698176, "learning_rate": 7.816214079334378e-07, "loss": 0.0, "num_input_tokens_seen": 84571920, "step": 125475 }, { "epoch": 3.06549727603645, "grad_norm": 7.186421134974808e-05, "learning_rate": 7.815381893775965e-07, "loss": 0.0001, "num_input_tokens_seen": 84574736, "step": 125480 }, { "epoch": 3.065619426868297, "grad_norm": 0.0008313908474519849, "learning_rate": 7.814549724104683e-07, "loss": 0.0, "num_input_tokens_seen": 84578064, "step": 125485 }, { "epoch": 3.065741577700144, "grad_norm": 0.00026990409241989255, "learning_rate": 7.813717570326588e-07, "loss": 0.0, "num_input_tokens_seen": 84581200, "step": 125490 }, { "epoch": 3.0658637285319914, "grad_norm": 0.02783309668302536, "learning_rate": 7.812885432447722e-07, "loss": 0.0, "num_input_tokens_seen": 84584464, "step": 125495 }, { "epoch": 3.0659858793638386, "grad_norm": 0.0011254458222538233, "learning_rate": 7.812053310474146e-07, "loss": 0.0016, "num_input_tokens_seen": 84587984, "step": 125500 }, { "epoch": 3.066108030195686, "grad_norm": 23.020633697509766, "learning_rate": 7.811221204411905e-07, "loss": 0.0716, "num_input_tokens_seen": 84591376, "step": 125505 }, { "epoch": 3.066230181027533, "grad_norm": 0.0007930602878332138, "learning_rate": 7.810389114267051e-07, "loss": 0.0, "num_input_tokens_seen": 84594512, "step": 125510 }, { "epoch": 3.06635233185938, "grad_norm": 0.0006310031749308109, "learning_rate": 7.809557040045637e-07, "loss": 0.0, "num_input_tokens_seen": 84597712, "step": 125515 }, { "epoch": 3.066474482691227, "grad_norm": 0.015794072300195694, "learning_rate": 7.808724981753712e-07, "loss": 0.0, "num_input_tokens_seen": 84601040, "step": 125520 }, { "epoch": 3.066596633523074, "grad_norm": 0.00023171577777247876, "learning_rate": 7.807892939397331e-07, "loss": 0.0001, "num_input_tokens_seen": 84604176, "step": 125525 }, { "epoch": 3.0667187843549213, "grad_norm": 0.012235916219651699, "learning_rate": 7.807060912982538e-07, "loss": 0.0001, "num_input_tokens_seen": 84607632, "step": 125530 }, { "epoch": 3.0668409351867685, "grad_norm": 0.0012531871907413006, "learning_rate": 7.806228902515393e-07, "loss": 0.0, "num_input_tokens_seen": 84611344, "step": 125535 }, { "epoch": 3.0669630860186157, "grad_norm": 0.0702088475227356, "learning_rate": 7.805396908001938e-07, "loss": 0.0001, "num_input_tokens_seen": 84615568, "step": 125540 }, { "epoch": 3.067085236850463, "grad_norm": 0.0007449003169313073, "learning_rate": 7.804564929448227e-07, "loss": 0.054, "num_input_tokens_seen": 84618896, "step": 125545 }, { "epoch": 3.06720738768231, "grad_norm": 0.010736105963587761, "learning_rate": 7.803732966860311e-07, "loss": 0.0, "num_input_tokens_seen": 84622096, "step": 125550 }, { "epoch": 3.0673295385141572, "grad_norm": 0.05345999822020531, "learning_rate": 7.80290102024424e-07, "loss": 0.0001, "num_input_tokens_seen": 84625680, "step": 125555 }, { "epoch": 3.0674516893460044, "grad_norm": 0.0007100952789187431, "learning_rate": 7.802069089606064e-07, "loss": 0.0, "num_input_tokens_seen": 84628944, "step": 125560 }, { "epoch": 3.0675738401778516, "grad_norm": 0.06198933348059654, "learning_rate": 7.801237174951833e-07, "loss": 0.0, "num_input_tokens_seen": 84632528, "step": 125565 }, { "epoch": 3.067695991009699, "grad_norm": 0.00022255294607020915, "learning_rate": 7.800405276287599e-07, "loss": 0.0, "num_input_tokens_seen": 84635664, "step": 125570 }, { "epoch": 3.067818141841546, "grad_norm": 0.0003733917837962508, "learning_rate": 7.799573393619403e-07, "loss": 0.0, "num_input_tokens_seen": 84638928, "step": 125575 }, { "epoch": 3.067940292673393, "grad_norm": 0.0009603972430340946, "learning_rate": 7.798741526953303e-07, "loss": 0.0, "num_input_tokens_seen": 84642448, "step": 125580 }, { "epoch": 3.0680624435052404, "grad_norm": 0.0003000438155140728, "learning_rate": 7.797909676295351e-07, "loss": 0.0, "num_input_tokens_seen": 84645840, "step": 125585 }, { "epoch": 3.0681845943370876, "grad_norm": 0.0011194392573088408, "learning_rate": 7.797077841651587e-07, "loss": 0.0308, "num_input_tokens_seen": 84648592, "step": 125590 }, { "epoch": 3.0683067451689348, "grad_norm": 0.009211283177137375, "learning_rate": 7.79624602302807e-07, "loss": 0.0, "num_input_tokens_seen": 84652112, "step": 125595 }, { "epoch": 3.068428896000782, "grad_norm": 0.0015620322665199637, "learning_rate": 7.79541422043084e-07, "loss": 0.0001, "num_input_tokens_seen": 84655376, "step": 125600 }, { "epoch": 3.068551046832629, "grad_norm": 0.0010103716049343348, "learning_rate": 7.794582433865956e-07, "loss": 0.0, "num_input_tokens_seen": 84658640, "step": 125605 }, { "epoch": 3.068673197664476, "grad_norm": 0.03660206124186516, "learning_rate": 7.793750663339459e-07, "loss": 0.0, "num_input_tokens_seen": 84661456, "step": 125610 }, { "epoch": 3.068795348496323, "grad_norm": 0.0035117187071591616, "learning_rate": 7.792918908857399e-07, "loss": 0.0, "num_input_tokens_seen": 84664784, "step": 125615 }, { "epoch": 3.0689174993281703, "grad_norm": 0.0003971233672928065, "learning_rate": 7.792087170425829e-07, "loss": 0.0, "num_input_tokens_seen": 84668176, "step": 125620 }, { "epoch": 3.0690396501600175, "grad_norm": 0.0019440487958490849, "learning_rate": 7.791255448050793e-07, "loss": 0.0003, "num_input_tokens_seen": 84671248, "step": 125625 }, { "epoch": 3.0691618009918646, "grad_norm": 0.002487177960574627, "learning_rate": 7.790423741738344e-07, "loss": 0.0, "num_input_tokens_seen": 84674576, "step": 125630 }, { "epoch": 3.069283951823712, "grad_norm": 0.001050000311806798, "learning_rate": 7.789592051494524e-07, "loss": 0.0, "num_input_tokens_seen": 84677904, "step": 125635 }, { "epoch": 3.069406102655559, "grad_norm": 0.1311432123184204, "learning_rate": 7.788760377325385e-07, "loss": 0.0, "num_input_tokens_seen": 84681360, "step": 125640 }, { "epoch": 3.069528253487406, "grad_norm": 0.0007992003229446709, "learning_rate": 7.78792871923698e-07, "loss": 0.0, "num_input_tokens_seen": 84684752, "step": 125645 }, { "epoch": 3.0696504043192534, "grad_norm": 0.000563696725293994, "learning_rate": 7.787097077235348e-07, "loss": 0.0, "num_input_tokens_seen": 84688464, "step": 125650 }, { "epoch": 3.0697725551511006, "grad_norm": 0.00010724652383942157, "learning_rate": 7.786265451326546e-07, "loss": 0.0, "num_input_tokens_seen": 84692368, "step": 125655 }, { "epoch": 3.069894705982948, "grad_norm": 0.0006794543587602675, "learning_rate": 7.785433841516614e-07, "loss": 0.0, "num_input_tokens_seen": 84695824, "step": 125660 }, { "epoch": 3.070016856814795, "grad_norm": 0.010498473420739174, "learning_rate": 7.784602247811603e-07, "loss": 0.0, "num_input_tokens_seen": 84699472, "step": 125665 }, { "epoch": 3.070139007646642, "grad_norm": 0.003232223680242896, "learning_rate": 7.783770670217561e-07, "loss": 0.041, "num_input_tokens_seen": 84703184, "step": 125670 }, { "epoch": 3.0702611584784893, "grad_norm": 0.06351174414157867, "learning_rate": 7.782939108740532e-07, "loss": 0.0, "num_input_tokens_seen": 84706768, "step": 125675 }, { "epoch": 3.0703833093103365, "grad_norm": 0.0006841022404842079, "learning_rate": 7.782107563386571e-07, "loss": 0.0, "num_input_tokens_seen": 84709776, "step": 125680 }, { "epoch": 3.0705054601421837, "grad_norm": 0.006533041596412659, "learning_rate": 7.781276034161714e-07, "loss": 0.0284, "num_input_tokens_seen": 84713552, "step": 125685 }, { "epoch": 3.070627610974031, "grad_norm": 0.01073833741247654, "learning_rate": 7.780444521072021e-07, "loss": 0.0, "num_input_tokens_seen": 84716816, "step": 125690 }, { "epoch": 3.070749761805878, "grad_norm": 0.00148698256816715, "learning_rate": 7.779613024123528e-07, "loss": 0.0, "num_input_tokens_seen": 84720592, "step": 125695 }, { "epoch": 3.070871912637725, "grad_norm": 0.0003934977576136589, "learning_rate": 7.778781543322289e-07, "loss": 0.0, "num_input_tokens_seen": 84723984, "step": 125700 }, { "epoch": 3.070994063469572, "grad_norm": 0.0019493974978104234, "learning_rate": 7.777950078674345e-07, "loss": 0.0, "num_input_tokens_seen": 84727312, "step": 125705 }, { "epoch": 3.0711162143014192, "grad_norm": 0.0042640152387320995, "learning_rate": 7.777118630185748e-07, "loss": 0.0, "num_input_tokens_seen": 84730768, "step": 125710 }, { "epoch": 3.0712383651332664, "grad_norm": 0.0034844614565372467, "learning_rate": 7.776287197862541e-07, "loss": 0.0, "num_input_tokens_seen": 84734032, "step": 125715 }, { "epoch": 3.0713605159651136, "grad_norm": 0.000586310459766537, "learning_rate": 7.775455781710774e-07, "loss": 0.0, "num_input_tokens_seen": 84737296, "step": 125720 }, { "epoch": 3.071482666796961, "grad_norm": 0.0002898842503782362, "learning_rate": 7.774624381736489e-07, "loss": 0.0001, "num_input_tokens_seen": 84740752, "step": 125725 }, { "epoch": 3.071604817628808, "grad_norm": 1.425169825553894, "learning_rate": 7.773792997945734e-07, "loss": 0.0003, "num_input_tokens_seen": 84744528, "step": 125730 }, { "epoch": 3.071726968460655, "grad_norm": 0.000697058392688632, "learning_rate": 7.772961630344552e-07, "loss": 0.0, "num_input_tokens_seen": 84747984, "step": 125735 }, { "epoch": 3.0718491192925024, "grad_norm": 0.0004514774482231587, "learning_rate": 7.772130278938999e-07, "loss": 0.0001, "num_input_tokens_seen": 84751184, "step": 125740 }, { "epoch": 3.0719712701243496, "grad_norm": 0.00027701619546860456, "learning_rate": 7.771298943735108e-07, "loss": 0.0, "num_input_tokens_seen": 84755088, "step": 125745 }, { "epoch": 3.0720934209561968, "grad_norm": 0.0006469250656664371, "learning_rate": 7.770467624738935e-07, "loss": 0.0, "num_input_tokens_seen": 84758544, "step": 125750 }, { "epoch": 3.072215571788044, "grad_norm": 0.05088665336370468, "learning_rate": 7.769636321956516e-07, "loss": 0.0, "num_input_tokens_seen": 84762192, "step": 125755 }, { "epoch": 3.072337722619891, "grad_norm": 0.01325522642582655, "learning_rate": 7.768805035393907e-07, "loss": 0.0, "num_input_tokens_seen": 84765328, "step": 125760 }, { "epoch": 3.0724598734517383, "grad_norm": 28.06966209411621, "learning_rate": 7.767973765057145e-07, "loss": 0.1304, "num_input_tokens_seen": 84768912, "step": 125765 }, { "epoch": 3.0725820242835855, "grad_norm": 0.0073534781113266945, "learning_rate": 7.767142510952277e-07, "loss": 0.0, "num_input_tokens_seen": 84772112, "step": 125770 }, { "epoch": 3.0727041751154327, "grad_norm": 0.001848505693487823, "learning_rate": 7.766311273085353e-07, "loss": 0.0072, "num_input_tokens_seen": 84774992, "step": 125775 }, { "epoch": 3.07282632594728, "grad_norm": 0.031989775598049164, "learning_rate": 7.765480051462409e-07, "loss": 0.0, "num_input_tokens_seen": 84777936, "step": 125780 }, { "epoch": 3.072948476779127, "grad_norm": 0.00015862514555919915, "learning_rate": 7.764648846089501e-07, "loss": 0.0318, "num_input_tokens_seen": 84781456, "step": 125785 }, { "epoch": 3.073070627610974, "grad_norm": 0.0009046062477864325, "learning_rate": 7.763817656972662e-07, "loss": 0.0418, "num_input_tokens_seen": 84784656, "step": 125790 }, { "epoch": 3.073192778442821, "grad_norm": 0.002990515436977148, "learning_rate": 7.762986484117943e-07, "loss": 0.0, "num_input_tokens_seen": 84787600, "step": 125795 }, { "epoch": 3.073314929274668, "grad_norm": 0.0013474200386554003, "learning_rate": 7.762155327531392e-07, "loss": 0.0307, "num_input_tokens_seen": 84790736, "step": 125800 }, { "epoch": 3.0734370801065154, "grad_norm": 0.005178503692150116, "learning_rate": 7.761324187219046e-07, "loss": 0.0, "num_input_tokens_seen": 84794192, "step": 125805 }, { "epoch": 3.0735592309383626, "grad_norm": 0.010057356208562851, "learning_rate": 7.760493063186957e-07, "loss": 0.0, "num_input_tokens_seen": 84797456, "step": 125810 }, { "epoch": 3.0736813817702098, "grad_norm": 0.0014329110272228718, "learning_rate": 7.759661955441161e-07, "loss": 0.0001, "num_input_tokens_seen": 84801104, "step": 125815 }, { "epoch": 3.073803532602057, "grad_norm": 0.005202101077884436, "learning_rate": 7.758830863987707e-07, "loss": 0.0, "num_input_tokens_seen": 84804496, "step": 125820 }, { "epoch": 3.073925683433904, "grad_norm": 0.0016744352178648114, "learning_rate": 7.757999788832639e-07, "loss": 0.0001, "num_input_tokens_seen": 84807952, "step": 125825 }, { "epoch": 3.0740478342657513, "grad_norm": 0.00039133901009336114, "learning_rate": 7.757168729981996e-07, "loss": 0.0, "num_input_tokens_seen": 84811088, "step": 125830 }, { "epoch": 3.0741699850975985, "grad_norm": 0.009436092339456081, "learning_rate": 7.75633768744183e-07, "loss": 0.0489, "num_input_tokens_seen": 84814608, "step": 125835 }, { "epoch": 3.0742921359294457, "grad_norm": 0.04576566070318222, "learning_rate": 7.755506661218174e-07, "loss": 0.0, "num_input_tokens_seen": 84817680, "step": 125840 }, { "epoch": 3.074414286761293, "grad_norm": 0.0022139260545372963, "learning_rate": 7.754675651317083e-07, "loss": 0.0005, "num_input_tokens_seen": 84821328, "step": 125845 }, { "epoch": 3.07453643759314, "grad_norm": 0.00011124507727799937, "learning_rate": 7.753844657744591e-07, "loss": 0.0001, "num_input_tokens_seen": 84824848, "step": 125850 }, { "epoch": 3.0746585884249873, "grad_norm": 0.0007051240536384284, "learning_rate": 7.753013680506747e-07, "loss": 0.0, "num_input_tokens_seen": 84827792, "step": 125855 }, { "epoch": 3.0747807392568345, "grad_norm": 0.0007837665034458041, "learning_rate": 7.752182719609589e-07, "loss": 0.0, "num_input_tokens_seen": 84831248, "step": 125860 }, { "epoch": 3.0749028900886817, "grad_norm": 0.0006996446172706783, "learning_rate": 7.751351775059165e-07, "loss": 0.0, "num_input_tokens_seen": 84834448, "step": 125865 }, { "epoch": 3.075025040920529, "grad_norm": 0.02283223159611225, "learning_rate": 7.750520846861515e-07, "loss": 0.0, "num_input_tokens_seen": 84837968, "step": 125870 }, { "epoch": 3.075147191752376, "grad_norm": 0.0009633800364099443, "learning_rate": 7.749689935022683e-07, "loss": 0.0, "num_input_tokens_seen": 84840976, "step": 125875 }, { "epoch": 3.075269342584223, "grad_norm": 0.0018521981546655297, "learning_rate": 7.748859039548713e-07, "loss": 0.0, "num_input_tokens_seen": 84844240, "step": 125880 }, { "epoch": 3.07539149341607, "grad_norm": 0.0005762922228313982, "learning_rate": 7.748028160445641e-07, "loss": 0.05, "num_input_tokens_seen": 84848656, "step": 125885 }, { "epoch": 3.075513644247917, "grad_norm": 0.005822031758725643, "learning_rate": 7.747197297719515e-07, "loss": 0.0, "num_input_tokens_seen": 84851856, "step": 125890 }, { "epoch": 3.0756357950797644, "grad_norm": 0.011522366665303707, "learning_rate": 7.746366451376379e-07, "loss": 0.0002, "num_input_tokens_seen": 84855056, "step": 125895 }, { "epoch": 3.0757579459116116, "grad_norm": 0.16439086198806763, "learning_rate": 7.745535621422269e-07, "loss": 0.0001, "num_input_tokens_seen": 84858704, "step": 125900 }, { "epoch": 3.0758800967434587, "grad_norm": 0.00033246594830416143, "learning_rate": 7.744704807863234e-07, "loss": 0.0761, "num_input_tokens_seen": 84862480, "step": 125905 }, { "epoch": 3.076002247575306, "grad_norm": 0.1619579792022705, "learning_rate": 7.743874010705308e-07, "loss": 0.0001, "num_input_tokens_seen": 84866256, "step": 125910 }, { "epoch": 3.076124398407153, "grad_norm": 0.0010160075034946203, "learning_rate": 7.743043229954542e-07, "loss": 0.0002, "num_input_tokens_seen": 84869776, "step": 125915 }, { "epoch": 3.0762465492390003, "grad_norm": 0.000158069291501306, "learning_rate": 7.74221246561697e-07, "loss": 0.0444, "num_input_tokens_seen": 84873488, "step": 125920 }, { "epoch": 3.0763687000708475, "grad_norm": 0.0011093869106844068, "learning_rate": 7.741381717698634e-07, "loss": 0.0, "num_input_tokens_seen": 84876880, "step": 125925 }, { "epoch": 3.0764908509026947, "grad_norm": 0.0034708997700363398, "learning_rate": 7.740550986205582e-07, "loss": 0.0, "num_input_tokens_seen": 84880208, "step": 125930 }, { "epoch": 3.076613001734542, "grad_norm": 0.006403934210538864, "learning_rate": 7.739720271143847e-07, "loss": 0.0001, "num_input_tokens_seen": 84883088, "step": 125935 }, { "epoch": 3.076735152566389, "grad_norm": 0.0007148389122448862, "learning_rate": 7.738889572519478e-07, "loss": 0.0, "num_input_tokens_seen": 84886800, "step": 125940 }, { "epoch": 3.0768573033982363, "grad_norm": 0.02925863303244114, "learning_rate": 7.738058890338508e-07, "loss": 0.0013, "num_input_tokens_seen": 84890256, "step": 125945 }, { "epoch": 3.0769794542300835, "grad_norm": 0.001407115370966494, "learning_rate": 7.737228224606982e-07, "loss": 0.0, "num_input_tokens_seen": 84893520, "step": 125950 }, { "epoch": 3.0771016050619306, "grad_norm": 0.0008443163824267685, "learning_rate": 7.736397575330946e-07, "loss": 0.0001, "num_input_tokens_seen": 84897232, "step": 125955 }, { "epoch": 3.077223755893778, "grad_norm": 0.003405147697776556, "learning_rate": 7.735566942516431e-07, "loss": 0.0, "num_input_tokens_seen": 84900688, "step": 125960 }, { "epoch": 3.0773459067256246, "grad_norm": 0.005903087090700865, "learning_rate": 7.734736326169487e-07, "loss": 0.0, "num_input_tokens_seen": 84904208, "step": 125965 }, { "epoch": 3.0774680575574718, "grad_norm": 0.011653275229036808, "learning_rate": 7.733905726296146e-07, "loss": 0.0, "num_input_tokens_seen": 84907280, "step": 125970 }, { "epoch": 3.077590208389319, "grad_norm": 0.0014293314889073372, "learning_rate": 7.733075142902453e-07, "loss": 0.0, "num_input_tokens_seen": 84910416, "step": 125975 }, { "epoch": 3.077712359221166, "grad_norm": 0.0037776813842356205, "learning_rate": 7.732244575994447e-07, "loss": 0.0, "num_input_tokens_seen": 84913808, "step": 125980 }, { "epoch": 3.0778345100530133, "grad_norm": 16.149124145507812, "learning_rate": 7.731414025578168e-07, "loss": 0.0553, "num_input_tokens_seen": 84917136, "step": 125985 }, { "epoch": 3.0779566608848605, "grad_norm": 0.0021692479494959116, "learning_rate": 7.730583491659659e-07, "loss": 0.0, "num_input_tokens_seen": 84920784, "step": 125990 }, { "epoch": 3.0780788117167077, "grad_norm": 0.003136912826448679, "learning_rate": 7.729752974244953e-07, "loss": 0.0, "num_input_tokens_seen": 84924368, "step": 125995 }, { "epoch": 3.078200962548555, "grad_norm": 0.0010444693034514785, "learning_rate": 7.728922473340098e-07, "loss": 0.0663, "num_input_tokens_seen": 84928080, "step": 126000 }, { "epoch": 3.078323113380402, "grad_norm": 0.0024431466590613127, "learning_rate": 7.728091988951126e-07, "loss": 0.0002, "num_input_tokens_seen": 84931280, "step": 126005 }, { "epoch": 3.0784452642122493, "grad_norm": 0.011616510339081287, "learning_rate": 7.727261521084085e-07, "loss": 0.0, "num_input_tokens_seen": 84934672, "step": 126010 }, { "epoch": 3.0785674150440965, "grad_norm": 0.0012070763623341918, "learning_rate": 7.726431069745004e-07, "loss": 0.0, "num_input_tokens_seen": 84938256, "step": 126015 }, { "epoch": 3.0786895658759437, "grad_norm": 0.002955326810479164, "learning_rate": 7.725600634939931e-07, "loss": 0.0, "num_input_tokens_seen": 84941328, "step": 126020 }, { "epoch": 3.078811716707791, "grad_norm": 0.001535121351480484, "learning_rate": 7.724770216674901e-07, "loss": 0.0, "num_input_tokens_seen": 84944720, "step": 126025 }, { "epoch": 3.078933867539638, "grad_norm": 0.004278361797332764, "learning_rate": 7.723939814955955e-07, "loss": 0.0, "num_input_tokens_seen": 84947792, "step": 126030 }, { "epoch": 3.0790560183714852, "grad_norm": 0.018353991210460663, "learning_rate": 7.723109429789132e-07, "loss": 0.0, "num_input_tokens_seen": 84951248, "step": 126035 }, { "epoch": 3.0791781692033324, "grad_norm": 0.08312410116195679, "learning_rate": 7.722279061180465e-07, "loss": 0.0, "num_input_tokens_seen": 84954128, "step": 126040 }, { "epoch": 3.0793003200351796, "grad_norm": 0.007280240301042795, "learning_rate": 7.721448709135998e-07, "loss": 0.0, "num_input_tokens_seen": 84957648, "step": 126045 }, { "epoch": 3.079422470867027, "grad_norm": 0.0015203187940642238, "learning_rate": 7.720618373661774e-07, "loss": 0.0001, "num_input_tokens_seen": 84960912, "step": 126050 }, { "epoch": 3.079544621698874, "grad_norm": 0.001035342924296856, "learning_rate": 7.719788054763821e-07, "loss": 0.0, "num_input_tokens_seen": 84964048, "step": 126055 }, { "epoch": 3.0796667725307207, "grad_norm": 0.001273818896152079, "learning_rate": 7.718957752448187e-07, "loss": 0.0, "num_input_tokens_seen": 84967696, "step": 126060 }, { "epoch": 3.079788923362568, "grad_norm": 23.178403854370117, "learning_rate": 7.718127466720901e-07, "loss": 0.0857, "num_input_tokens_seen": 84971088, "step": 126065 }, { "epoch": 3.079911074194415, "grad_norm": 0.0031698124948889017, "learning_rate": 7.71729719758801e-07, "loss": 0.0003, "num_input_tokens_seen": 84974608, "step": 126070 }, { "epoch": 3.0800332250262623, "grad_norm": 0.00352927902713418, "learning_rate": 7.716466945055546e-07, "loss": 0.0838, "num_input_tokens_seen": 84977936, "step": 126075 }, { "epoch": 3.0801553758581095, "grad_norm": 0.0046735224314033985, "learning_rate": 7.715636709129547e-07, "loss": 0.0003, "num_input_tokens_seen": 84980880, "step": 126080 }, { "epoch": 3.0802775266899567, "grad_norm": 0.21390970051288605, "learning_rate": 7.714806489816056e-07, "loss": 0.0002, "num_input_tokens_seen": 84983696, "step": 126085 }, { "epoch": 3.080399677521804, "grad_norm": 0.0023563490249216557, "learning_rate": 7.713976287121102e-07, "loss": 0.0, "num_input_tokens_seen": 84987344, "step": 126090 }, { "epoch": 3.080521828353651, "grad_norm": 0.004802169743925333, "learning_rate": 7.713146101050733e-07, "loss": 0.0, "num_input_tokens_seen": 84990736, "step": 126095 }, { "epoch": 3.0806439791854983, "grad_norm": 0.14076021313667297, "learning_rate": 7.712315931610976e-07, "loss": 0.0001, "num_input_tokens_seen": 84994064, "step": 126100 }, { "epoch": 3.0807661300173454, "grad_norm": 23.542325973510742, "learning_rate": 7.711485778807872e-07, "loss": 0.0533, "num_input_tokens_seen": 84997264, "step": 126105 }, { "epoch": 3.0808882808491926, "grad_norm": 0.0003235576150473207, "learning_rate": 7.710655642647463e-07, "loss": 0.0, "num_input_tokens_seen": 85000656, "step": 126110 }, { "epoch": 3.08101043168104, "grad_norm": 0.0005096203531138599, "learning_rate": 7.709825523135778e-07, "loss": 0.0003, "num_input_tokens_seen": 85004048, "step": 126115 }, { "epoch": 3.081132582512887, "grad_norm": 0.00011765821545850486, "learning_rate": 7.708995420278864e-07, "loss": 0.0, "num_input_tokens_seen": 85007568, "step": 126120 }, { "epoch": 3.081254733344734, "grad_norm": 0.0091170035302639, "learning_rate": 7.708165334082746e-07, "loss": 0.0, "num_input_tokens_seen": 85011344, "step": 126125 }, { "epoch": 3.0813768841765814, "grad_norm": 0.004456630442291498, "learning_rate": 7.707335264553469e-07, "loss": 0.0, "num_input_tokens_seen": 85014544, "step": 126130 }, { "epoch": 3.0814990350084286, "grad_norm": 0.004846032243221998, "learning_rate": 7.706505211697065e-07, "loss": 0.0, "num_input_tokens_seen": 85018064, "step": 126135 }, { "epoch": 3.081621185840276, "grad_norm": 0.005253411363810301, "learning_rate": 7.70567517551957e-07, "loss": 0.0, "num_input_tokens_seen": 85021392, "step": 126140 }, { "epoch": 3.0817433366721225, "grad_norm": 0.0008864422561600804, "learning_rate": 7.704845156027027e-07, "loss": 0.0002, "num_input_tokens_seen": 85024528, "step": 126145 }, { "epoch": 3.0818654875039697, "grad_norm": 101.36573028564453, "learning_rate": 7.704015153225463e-07, "loss": 0.0418, "num_input_tokens_seen": 85027728, "step": 126150 }, { "epoch": 3.081987638335817, "grad_norm": 0.0012882634764537215, "learning_rate": 7.703185167120922e-07, "loss": 0.0, "num_input_tokens_seen": 85030864, "step": 126155 }, { "epoch": 3.082109789167664, "grad_norm": 0.0049063474871218204, "learning_rate": 7.702355197719432e-07, "loss": 0.0, "num_input_tokens_seen": 85034512, "step": 126160 }, { "epoch": 3.0822319399995113, "grad_norm": 0.007059808354824781, "learning_rate": 7.701525245027039e-07, "loss": 0.067, "num_input_tokens_seen": 85038160, "step": 126165 }, { "epoch": 3.0823540908313585, "grad_norm": 0.007309371139854193, "learning_rate": 7.700695309049766e-07, "loss": 0.0, "num_input_tokens_seen": 85041616, "step": 126170 }, { "epoch": 3.0824762416632057, "grad_norm": 0.0031766737811267376, "learning_rate": 7.699865389793659e-07, "loss": 0.0, "num_input_tokens_seen": 85044880, "step": 126175 }, { "epoch": 3.082598392495053, "grad_norm": 0.0005614809924736619, "learning_rate": 7.699035487264749e-07, "loss": 0.0, "num_input_tokens_seen": 85047760, "step": 126180 }, { "epoch": 3.0827205433269, "grad_norm": 0.008097624406218529, "learning_rate": 7.698205601469072e-07, "loss": 0.0, "num_input_tokens_seen": 85051216, "step": 126185 }, { "epoch": 3.0828426941587472, "grad_norm": 0.002300823340192437, "learning_rate": 7.697375732412665e-07, "loss": 0.0, "num_input_tokens_seen": 85054736, "step": 126190 }, { "epoch": 3.0829648449905944, "grad_norm": 0.00025516541791148484, "learning_rate": 7.696545880101556e-07, "loss": 0.0, "num_input_tokens_seen": 85058192, "step": 126195 }, { "epoch": 3.0830869958224416, "grad_norm": 0.9779638648033142, "learning_rate": 7.695716044541786e-07, "loss": 0.0002, "num_input_tokens_seen": 85061456, "step": 126200 }, { "epoch": 3.083209146654289, "grad_norm": 0.002006381982937455, "learning_rate": 7.694886225739392e-07, "loss": 0.0, "num_input_tokens_seen": 85064656, "step": 126205 }, { "epoch": 3.083331297486136, "grad_norm": 0.024691827595233917, "learning_rate": 7.694056423700401e-07, "loss": 0.0, "num_input_tokens_seen": 85067856, "step": 126210 }, { "epoch": 3.083453448317983, "grad_norm": 0.008902209810912609, "learning_rate": 7.693226638430857e-07, "loss": 0.0433, "num_input_tokens_seen": 85071120, "step": 126215 }, { "epoch": 3.0835755991498304, "grad_norm": 0.0031682380940765142, "learning_rate": 7.692396869936784e-07, "loss": 0.0, "num_input_tokens_seen": 85074384, "step": 126220 }, { "epoch": 3.0836977499816776, "grad_norm": 0.0010243651922792196, "learning_rate": 7.691567118224228e-07, "loss": 0.0, "num_input_tokens_seen": 85078224, "step": 126225 }, { "epoch": 3.0838199008135248, "grad_norm": 0.03558716922998428, "learning_rate": 7.690737383299212e-07, "loss": 0.0, "num_input_tokens_seen": 85081296, "step": 126230 }, { "epoch": 3.0839420516453715, "grad_norm": 0.007264365907758474, "learning_rate": 7.689907665167775e-07, "loss": 0.0, "num_input_tokens_seen": 85084816, "step": 126235 }, { "epoch": 3.0840642024772187, "grad_norm": 0.002083815401419997, "learning_rate": 7.689077963835955e-07, "loss": 0.0, "num_input_tokens_seen": 85088272, "step": 126240 }, { "epoch": 3.084186353309066, "grad_norm": 0.00029359650216065347, "learning_rate": 7.688248279309776e-07, "loss": 0.0, "num_input_tokens_seen": 85091792, "step": 126245 }, { "epoch": 3.084308504140913, "grad_norm": 0.001116403378546238, "learning_rate": 7.687418611595282e-07, "loss": 0.0541, "num_input_tokens_seen": 85094992, "step": 126250 }, { "epoch": 3.0844306549727603, "grad_norm": 0.002682020887732506, "learning_rate": 7.686588960698497e-07, "loss": 0.0002, "num_input_tokens_seen": 85098512, "step": 126255 }, { "epoch": 3.0845528058046074, "grad_norm": 0.0009902494493871927, "learning_rate": 7.685759326625461e-07, "loss": 0.0, "num_input_tokens_seen": 85101712, "step": 126260 }, { "epoch": 3.0846749566364546, "grad_norm": 0.05289221554994583, "learning_rate": 7.684929709382209e-07, "loss": 0.0, "num_input_tokens_seen": 85105040, "step": 126265 }, { "epoch": 3.084797107468302, "grad_norm": 0.00030390024767257273, "learning_rate": 7.684100108974766e-07, "loss": 0.0001, "num_input_tokens_seen": 85108304, "step": 126270 }, { "epoch": 3.084919258300149, "grad_norm": 0.0018810959300026298, "learning_rate": 7.683270525409175e-07, "loss": 0.0, "num_input_tokens_seen": 85111632, "step": 126275 }, { "epoch": 3.085041409131996, "grad_norm": 0.025849755853414536, "learning_rate": 7.682440958691461e-07, "loss": 0.0003, "num_input_tokens_seen": 85115408, "step": 126280 }, { "epoch": 3.0851635599638434, "grad_norm": 0.0540730357170105, "learning_rate": 7.681611408827661e-07, "loss": 0.0001, "num_input_tokens_seen": 85119056, "step": 126285 }, { "epoch": 3.0852857107956906, "grad_norm": 0.038840051740407944, "learning_rate": 7.680781875823805e-07, "loss": 0.0, "num_input_tokens_seen": 85122384, "step": 126290 }, { "epoch": 3.0854078616275378, "grad_norm": 0.0032664036843925714, "learning_rate": 7.679952359685925e-07, "loss": 0.0, "num_input_tokens_seen": 85125584, "step": 126295 }, { "epoch": 3.085530012459385, "grad_norm": 0.00981723889708519, "learning_rate": 7.679122860420059e-07, "loss": 0.0002, "num_input_tokens_seen": 85129808, "step": 126300 }, { "epoch": 3.085652163291232, "grad_norm": 0.00037645341944880784, "learning_rate": 7.678293378032234e-07, "loss": 0.0, "num_input_tokens_seen": 85132752, "step": 126305 }, { "epoch": 3.0857743141230793, "grad_norm": 0.0018895752727985382, "learning_rate": 7.677463912528487e-07, "loss": 0.0, "num_input_tokens_seen": 85135888, "step": 126310 }, { "epoch": 3.0858964649549265, "grad_norm": 0.059392791241407394, "learning_rate": 7.676634463914843e-07, "loss": 0.0, "num_input_tokens_seen": 85139728, "step": 126315 }, { "epoch": 3.0860186157867737, "grad_norm": 0.0005743264337070286, "learning_rate": 7.675805032197342e-07, "loss": 0.0, "num_input_tokens_seen": 85143248, "step": 126320 }, { "epoch": 3.0861407666186205, "grad_norm": 0.6246962547302246, "learning_rate": 7.674975617382007e-07, "loss": 0.0003, "num_input_tokens_seen": 85146576, "step": 126325 }, { "epoch": 3.0862629174504677, "grad_norm": 0.016369406133890152, "learning_rate": 7.674146219474877e-07, "loss": 0.0, "num_input_tokens_seen": 85149904, "step": 126330 }, { "epoch": 3.086385068282315, "grad_norm": 0.004355208016932011, "learning_rate": 7.673316838481982e-07, "loss": 0.0, "num_input_tokens_seen": 85153232, "step": 126335 }, { "epoch": 3.086507219114162, "grad_norm": 0.034227363765239716, "learning_rate": 7.672487474409353e-07, "loss": 0.0, "num_input_tokens_seen": 85156304, "step": 126340 }, { "epoch": 3.0866293699460092, "grad_norm": 0.00046993931755423546, "learning_rate": 7.671658127263023e-07, "loss": 0.0, "num_input_tokens_seen": 85159824, "step": 126345 }, { "epoch": 3.0867515207778564, "grad_norm": 0.009331612847745419, "learning_rate": 7.670828797049017e-07, "loss": 0.0535, "num_input_tokens_seen": 85163088, "step": 126350 }, { "epoch": 3.0868736716097036, "grad_norm": 0.0005309986299835145, "learning_rate": 7.66999948377337e-07, "loss": 0.0002, "num_input_tokens_seen": 85166928, "step": 126355 }, { "epoch": 3.086995822441551, "grad_norm": 0.0022755253594368696, "learning_rate": 7.669170187442119e-07, "loss": 0.0, "num_input_tokens_seen": 85170064, "step": 126360 }, { "epoch": 3.087117973273398, "grad_norm": 0.006053520832210779, "learning_rate": 7.668340908061284e-07, "loss": 0.0, "num_input_tokens_seen": 85173712, "step": 126365 }, { "epoch": 3.087240124105245, "grad_norm": 0.0020175171084702015, "learning_rate": 7.667511645636905e-07, "loss": 0.0, "num_input_tokens_seen": 85176848, "step": 126370 }, { "epoch": 3.0873622749370924, "grad_norm": 0.0023011830635368824, "learning_rate": 7.666682400175005e-07, "loss": 0.0, "num_input_tokens_seen": 85179920, "step": 126375 }, { "epoch": 3.0874844257689396, "grad_norm": 0.002503541298210621, "learning_rate": 7.665853171681622e-07, "loss": 0.0001, "num_input_tokens_seen": 85182928, "step": 126380 }, { "epoch": 3.0876065766007867, "grad_norm": 0.000122290279250592, "learning_rate": 7.665023960162781e-07, "loss": 0.0, "num_input_tokens_seen": 85186064, "step": 126385 }, { "epoch": 3.087728727432634, "grad_norm": 0.0057792747393250465, "learning_rate": 7.664194765624512e-07, "loss": 0.0, "num_input_tokens_seen": 85189072, "step": 126390 }, { "epoch": 3.087850878264481, "grad_norm": 32.82571792602539, "learning_rate": 7.66336558807285e-07, "loss": 0.0546, "num_input_tokens_seen": 85192528, "step": 126395 }, { "epoch": 3.0879730290963283, "grad_norm": 0.002696740673854947, "learning_rate": 7.662536427513818e-07, "loss": 0.0, "num_input_tokens_seen": 85196240, "step": 126400 }, { "epoch": 3.0880951799281755, "grad_norm": 0.005557489115744829, "learning_rate": 7.661707283953455e-07, "loss": 0.0, "num_input_tokens_seen": 85199632, "step": 126405 }, { "epoch": 3.0882173307600223, "grad_norm": 0.0012418734841048717, "learning_rate": 7.660878157397779e-07, "loss": 0.0, "num_input_tokens_seen": 85203280, "step": 126410 }, { "epoch": 3.0883394815918694, "grad_norm": 0.010740534402430058, "learning_rate": 7.660049047852834e-07, "loss": 0.0, "num_input_tokens_seen": 85206800, "step": 126415 }, { "epoch": 3.0884616324237166, "grad_norm": 0.0020704532507807016, "learning_rate": 7.659219955324635e-07, "loss": 0.0, "num_input_tokens_seen": 85209808, "step": 126420 }, { "epoch": 3.088583783255564, "grad_norm": 0.0006153500871732831, "learning_rate": 7.658390879819218e-07, "loss": 0.0868, "num_input_tokens_seen": 85213328, "step": 126425 }, { "epoch": 3.088705934087411, "grad_norm": 0.00045507983304560184, "learning_rate": 7.657561821342617e-07, "loss": 0.0, "num_input_tokens_seen": 85216400, "step": 126430 }, { "epoch": 3.088828084919258, "grad_norm": 0.02990969642996788, "learning_rate": 7.656732779900856e-07, "loss": 0.0, "num_input_tokens_seen": 85219600, "step": 126435 }, { "epoch": 3.0889502357511054, "grad_norm": 0.0035446241963654757, "learning_rate": 7.655903755499961e-07, "loss": 0.0, "num_input_tokens_seen": 85222736, "step": 126440 }, { "epoch": 3.0890723865829526, "grad_norm": 0.000591681688092649, "learning_rate": 7.655074748145968e-07, "loss": 0.0, "num_input_tokens_seen": 85225872, "step": 126445 }, { "epoch": 3.0891945374147998, "grad_norm": 0.050365399569272995, "learning_rate": 7.654245757844897e-07, "loss": 0.0, "num_input_tokens_seen": 85229584, "step": 126450 }, { "epoch": 3.089316688246647, "grad_norm": 0.00013743084855377674, "learning_rate": 7.653416784602789e-07, "loss": 0.0, "num_input_tokens_seen": 85233232, "step": 126455 }, { "epoch": 3.089438839078494, "grad_norm": 0.001835143892094493, "learning_rate": 7.652587828425659e-07, "loss": 0.0526, "num_input_tokens_seen": 85236688, "step": 126460 }, { "epoch": 3.0895609899103413, "grad_norm": 0.012828472070395947, "learning_rate": 7.651758889319547e-07, "loss": 0.0, "num_input_tokens_seen": 85239888, "step": 126465 }, { "epoch": 3.0896831407421885, "grad_norm": 0.007912257686257362, "learning_rate": 7.650929967290471e-07, "loss": 0.0001, "num_input_tokens_seen": 85243024, "step": 126470 }, { "epoch": 3.0898052915740357, "grad_norm": 0.0017077409429475665, "learning_rate": 7.650101062344468e-07, "loss": 0.0, "num_input_tokens_seen": 85246288, "step": 126475 }, { "epoch": 3.089927442405883, "grad_norm": 0.0008549513877369463, "learning_rate": 7.649272174487558e-07, "loss": 0.0, "num_input_tokens_seen": 85249104, "step": 126480 }, { "epoch": 3.09004959323773, "grad_norm": 0.00041189262992702425, "learning_rate": 7.648443303725775e-07, "loss": 0.0, "num_input_tokens_seen": 85252880, "step": 126485 }, { "epoch": 3.0901717440695773, "grad_norm": 0.04326731711626053, "learning_rate": 7.647614450065145e-07, "loss": 0.0, "num_input_tokens_seen": 85256400, "step": 126490 }, { "epoch": 3.0902938949014245, "grad_norm": 0.03192279860377312, "learning_rate": 7.646785613511696e-07, "loss": 0.0004, "num_input_tokens_seen": 85259600, "step": 126495 }, { "epoch": 3.0904160457332717, "grad_norm": 0.0032559670507907867, "learning_rate": 7.645956794071457e-07, "loss": 0.0, "num_input_tokens_seen": 85262928, "step": 126500 }, { "epoch": 3.0905381965651184, "grad_norm": 0.002402453450486064, "learning_rate": 7.645127991750449e-07, "loss": 0.0, "num_input_tokens_seen": 85266384, "step": 126505 }, { "epoch": 3.0906603473969656, "grad_norm": 0.20489975810050964, "learning_rate": 7.644299206554702e-07, "loss": 0.0001, "num_input_tokens_seen": 85269456, "step": 126510 }, { "epoch": 3.090782498228813, "grad_norm": 0.0044357432052493095, "learning_rate": 7.643470438490252e-07, "loss": 0.0, "num_input_tokens_seen": 85272528, "step": 126515 }, { "epoch": 3.09090464906066, "grad_norm": 0.38196611404418945, "learning_rate": 7.642641687563112e-07, "loss": 0.0002, "num_input_tokens_seen": 85275920, "step": 126520 }, { "epoch": 3.091026799892507, "grad_norm": 0.002942395396530628, "learning_rate": 7.641812953779322e-07, "loss": 0.0, "num_input_tokens_seen": 85279632, "step": 126525 }, { "epoch": 3.0911489507243544, "grad_norm": 0.001338522182777524, "learning_rate": 7.640984237144898e-07, "loss": 0.0, "num_input_tokens_seen": 85282768, "step": 126530 }, { "epoch": 3.0912711015562016, "grad_norm": 0.004309714771807194, "learning_rate": 7.640155537665875e-07, "loss": 0.0, "num_input_tokens_seen": 85285840, "step": 126535 }, { "epoch": 3.0913932523880487, "grad_norm": 0.0006489718798547983, "learning_rate": 7.639326855348273e-07, "loss": 0.0, "num_input_tokens_seen": 85289360, "step": 126540 }, { "epoch": 3.091515403219896, "grad_norm": 0.001993774203583598, "learning_rate": 7.638498190198119e-07, "loss": 0.0, "num_input_tokens_seen": 85292368, "step": 126545 }, { "epoch": 3.091637554051743, "grad_norm": 8.232580148614943e-05, "learning_rate": 7.637669542221445e-07, "loss": 0.0, "num_input_tokens_seen": 85296144, "step": 126550 }, { "epoch": 3.0917597048835903, "grad_norm": 0.014385731890797615, "learning_rate": 7.63684091142427e-07, "loss": 0.0242, "num_input_tokens_seen": 85299408, "step": 126555 }, { "epoch": 3.0918818557154375, "grad_norm": 0.0003742116969078779, "learning_rate": 7.636012297812627e-07, "loss": 0.0812, "num_input_tokens_seen": 85302992, "step": 126560 }, { "epoch": 3.0920040065472847, "grad_norm": 0.004770467523485422, "learning_rate": 7.635183701392536e-07, "loss": 0.0, "num_input_tokens_seen": 85306320, "step": 126565 }, { "epoch": 3.092126157379132, "grad_norm": 0.0011331579880788922, "learning_rate": 7.634355122170028e-07, "loss": 0.0001, "num_input_tokens_seen": 85309264, "step": 126570 }, { "epoch": 3.092248308210979, "grad_norm": 0.0013692784123122692, "learning_rate": 7.633526560151121e-07, "loss": 0.0, "num_input_tokens_seen": 85312656, "step": 126575 }, { "epoch": 3.0923704590428263, "grad_norm": 0.0005857199430465698, "learning_rate": 7.632698015341846e-07, "loss": 0.0279, "num_input_tokens_seen": 85315792, "step": 126580 }, { "epoch": 3.0924926098746734, "grad_norm": 0.0007058735354803503, "learning_rate": 7.631869487748232e-07, "loss": 0.0, "num_input_tokens_seen": 85318928, "step": 126585 }, { "epoch": 3.09261476070652, "grad_norm": 0.0006951163522899151, "learning_rate": 7.631040977376299e-07, "loss": 0.0, "num_input_tokens_seen": 85321936, "step": 126590 }, { "epoch": 3.0927369115383674, "grad_norm": 0.0006128123495727777, "learning_rate": 7.630212484232072e-07, "loss": 0.0, "num_input_tokens_seen": 85325520, "step": 126595 }, { "epoch": 3.0928590623702146, "grad_norm": 55.12501907348633, "learning_rate": 7.629384008321578e-07, "loss": 0.0346, "num_input_tokens_seen": 85328464, "step": 126600 }, { "epoch": 3.0929812132020618, "grad_norm": 0.05353361740708351, "learning_rate": 7.628555549650838e-07, "loss": 0.0, "num_input_tokens_seen": 85331536, "step": 126605 }, { "epoch": 3.093103364033909, "grad_norm": 0.0017707470105960965, "learning_rate": 7.627727108225886e-07, "loss": 0.0, "num_input_tokens_seen": 85334800, "step": 126610 }, { "epoch": 3.093225514865756, "grad_norm": 0.003513862146064639, "learning_rate": 7.626898684052734e-07, "loss": 0.0, "num_input_tokens_seen": 85338064, "step": 126615 }, { "epoch": 3.0933476656976033, "grad_norm": 0.010240820236504078, "learning_rate": 7.626070277137417e-07, "loss": 0.0, "num_input_tokens_seen": 85341520, "step": 126620 }, { "epoch": 3.0934698165294505, "grad_norm": 0.0003203249361831695, "learning_rate": 7.625241887485954e-07, "loss": 0.0453, "num_input_tokens_seen": 85344400, "step": 126625 }, { "epoch": 3.0935919673612977, "grad_norm": 0.002799415495246649, "learning_rate": 7.624413515104373e-07, "loss": 0.0, "num_input_tokens_seen": 85347664, "step": 126630 }, { "epoch": 3.093714118193145, "grad_norm": 9.273582691093907e-05, "learning_rate": 7.623585159998692e-07, "loss": 0.0, "num_input_tokens_seen": 85350992, "step": 126635 }, { "epoch": 3.093836269024992, "grad_norm": 0.0013542450033128262, "learning_rate": 7.622756822174941e-07, "loss": 0.0, "num_input_tokens_seen": 85354384, "step": 126640 }, { "epoch": 3.0939584198568393, "grad_norm": 0.0002636424032971263, "learning_rate": 7.621928501639142e-07, "loss": 0.0, "num_input_tokens_seen": 85357328, "step": 126645 }, { "epoch": 3.0940805706886865, "grad_norm": 0.00017485875287093222, "learning_rate": 7.621100198397318e-07, "loss": 0.0, "num_input_tokens_seen": 85360912, "step": 126650 }, { "epoch": 3.0942027215205337, "grad_norm": 0.012742831371724606, "learning_rate": 7.620271912455496e-07, "loss": 0.0, "num_input_tokens_seen": 85364112, "step": 126655 }, { "epoch": 3.094324872352381, "grad_norm": 0.06087416782975197, "learning_rate": 7.619443643819691e-07, "loss": 0.0, "num_input_tokens_seen": 85367440, "step": 126660 }, { "epoch": 3.094447023184228, "grad_norm": 0.003833112772554159, "learning_rate": 7.618615392495935e-07, "loss": 0.0001, "num_input_tokens_seen": 85371024, "step": 126665 }, { "epoch": 3.0945691740160752, "grad_norm": 0.001838882453739643, "learning_rate": 7.61778715849025e-07, "loss": 0.0, "num_input_tokens_seen": 85374992, "step": 126670 }, { "epoch": 3.0946913248479224, "grad_norm": 0.0028312301728874445, "learning_rate": 7.616958941808654e-07, "loss": 0.0, "num_input_tokens_seen": 85378000, "step": 126675 }, { "epoch": 3.094813475679769, "grad_norm": 0.0475040078163147, "learning_rate": 7.616130742457178e-07, "loss": 0.0, "num_input_tokens_seen": 85380944, "step": 126680 }, { "epoch": 3.0949356265116164, "grad_norm": 5.351084837457165e-05, "learning_rate": 7.615302560441837e-07, "loss": 0.0, "num_input_tokens_seen": 85384272, "step": 126685 }, { "epoch": 3.0950577773434635, "grad_norm": 0.005748182535171509, "learning_rate": 7.614474395768661e-07, "loss": 0.0, "num_input_tokens_seen": 85387664, "step": 126690 }, { "epoch": 3.0951799281753107, "grad_norm": 0.011121712625026703, "learning_rate": 7.613646248443666e-07, "loss": 0.0317, "num_input_tokens_seen": 85390992, "step": 126695 }, { "epoch": 3.095302079007158, "grad_norm": 0.022904004901647568, "learning_rate": 7.612818118472875e-07, "loss": 0.0, "num_input_tokens_seen": 85394000, "step": 126700 }, { "epoch": 3.095424229839005, "grad_norm": 0.005358532536774874, "learning_rate": 7.611990005862318e-07, "loss": 0.0, "num_input_tokens_seen": 85397584, "step": 126705 }, { "epoch": 3.0955463806708523, "grad_norm": 0.0005007089930586517, "learning_rate": 7.611161910618007e-07, "loss": 0.0716, "num_input_tokens_seen": 85401616, "step": 126710 }, { "epoch": 3.0956685315026995, "grad_norm": 0.002198169706389308, "learning_rate": 7.610333832745974e-07, "loss": 0.0, "num_input_tokens_seen": 85405392, "step": 126715 }, { "epoch": 3.0957906823345467, "grad_norm": 0.001933772349730134, "learning_rate": 7.609505772252232e-07, "loss": 0.0, "num_input_tokens_seen": 85408656, "step": 126720 }, { "epoch": 3.095912833166394, "grad_norm": 0.0009322002879343927, "learning_rate": 7.60867772914281e-07, "loss": 0.0, "num_input_tokens_seen": 85411920, "step": 126725 }, { "epoch": 3.096034983998241, "grad_norm": 0.0036974812392145395, "learning_rate": 7.607849703423723e-07, "loss": 0.0, "num_input_tokens_seen": 85415376, "step": 126730 }, { "epoch": 3.0961571348300883, "grad_norm": 0.007073775865137577, "learning_rate": 7.607021695100997e-07, "loss": 0.0, "num_input_tokens_seen": 85418448, "step": 126735 }, { "epoch": 3.0962792856619354, "grad_norm": 207.88877868652344, "learning_rate": 7.606193704180655e-07, "loss": 0.0533, "num_input_tokens_seen": 85421648, "step": 126740 }, { "epoch": 3.0964014364937826, "grad_norm": 8.36336039355956e-05, "learning_rate": 7.605365730668717e-07, "loss": 0.0, "num_input_tokens_seen": 85425168, "step": 126745 }, { "epoch": 3.09652358732563, "grad_norm": 0.31576985120773315, "learning_rate": 7.604537774571203e-07, "loss": 0.0002, "num_input_tokens_seen": 85428496, "step": 126750 }, { "epoch": 3.096645738157477, "grad_norm": 0.03226271644234657, "learning_rate": 7.603709835894133e-07, "loss": 0.0, "num_input_tokens_seen": 85432272, "step": 126755 }, { "epoch": 3.096767888989324, "grad_norm": 0.0003956420405302197, "learning_rate": 7.602881914643529e-07, "loss": 0.0, "num_input_tokens_seen": 85435664, "step": 126760 }, { "epoch": 3.0968900398211714, "grad_norm": 0.0004236078239046037, "learning_rate": 7.602054010825415e-07, "loss": 0.0, "num_input_tokens_seen": 85439248, "step": 126765 }, { "epoch": 3.097012190653018, "grad_norm": 0.00017692515393719077, "learning_rate": 7.601226124445806e-07, "loss": 0.0204, "num_input_tokens_seen": 85442640, "step": 126770 }, { "epoch": 3.0971343414848653, "grad_norm": 0.05441778898239136, "learning_rate": 7.60039825551073e-07, "loss": 0.0, "num_input_tokens_seen": 85446032, "step": 126775 }, { "epoch": 3.0972564923167125, "grad_norm": 0.0025518974289298058, "learning_rate": 7.599570404026199e-07, "loss": 0.0, "num_input_tokens_seen": 85449168, "step": 126780 }, { "epoch": 3.0973786431485597, "grad_norm": 0.000867149792611599, "learning_rate": 7.598742569998243e-07, "loss": 0.0, "num_input_tokens_seen": 85452496, "step": 126785 }, { "epoch": 3.097500793980407, "grad_norm": 0.007959526032209396, "learning_rate": 7.597914753432871e-07, "loss": 0.0359, "num_input_tokens_seen": 85455824, "step": 126790 }, { "epoch": 3.097622944812254, "grad_norm": 0.00035928128636442125, "learning_rate": 7.597086954336112e-07, "loss": 0.0, "num_input_tokens_seen": 85459920, "step": 126795 }, { "epoch": 3.0977450956441013, "grad_norm": 0.0035206761676818132, "learning_rate": 7.596259172713982e-07, "loss": 0.0383, "num_input_tokens_seen": 85463312, "step": 126800 }, { "epoch": 3.0978672464759485, "grad_norm": 0.00011418592475820333, "learning_rate": 7.595431408572504e-07, "loss": 0.0, "num_input_tokens_seen": 85466256, "step": 126805 }, { "epoch": 3.0979893973077957, "grad_norm": 0.0019963637460023165, "learning_rate": 7.594603661917695e-07, "loss": 0.0, "num_input_tokens_seen": 85470096, "step": 126810 }, { "epoch": 3.098111548139643, "grad_norm": 0.0009545327047817409, "learning_rate": 7.593775932755573e-07, "loss": 0.0726, "num_input_tokens_seen": 85473232, "step": 126815 }, { "epoch": 3.09823369897149, "grad_norm": 0.002073055598884821, "learning_rate": 7.592948221092158e-07, "loss": 0.0366, "num_input_tokens_seen": 85476560, "step": 126820 }, { "epoch": 3.0983558498033372, "grad_norm": 0.0013438011519610882, "learning_rate": 7.592120526933477e-07, "loss": 0.0, "num_input_tokens_seen": 85479632, "step": 126825 }, { "epoch": 3.0984780006351844, "grad_norm": 0.005760578904300928, "learning_rate": 7.591292850285538e-07, "loss": 0.0, "num_input_tokens_seen": 85482704, "step": 126830 }, { "epoch": 3.0986001514670316, "grad_norm": 0.00027516690897755325, "learning_rate": 7.59046519115437e-07, "loss": 0.0, "num_input_tokens_seen": 85486288, "step": 126835 }, { "epoch": 3.098722302298879, "grad_norm": 0.002451132982969284, "learning_rate": 7.589637549545981e-07, "loss": 0.0001, "num_input_tokens_seen": 85489680, "step": 126840 }, { "epoch": 3.098844453130726, "grad_norm": 0.0006566059309989214, "learning_rate": 7.588809925466402e-07, "loss": 0.0, "num_input_tokens_seen": 85493008, "step": 126845 }, { "epoch": 3.098966603962573, "grad_norm": 0.0007958338828757405, "learning_rate": 7.587982318921643e-07, "loss": 0.0, "num_input_tokens_seen": 85496208, "step": 126850 }, { "epoch": 3.0990887547944204, "grad_norm": 0.43890684843063354, "learning_rate": 7.587154729917724e-07, "loss": 0.0001, "num_input_tokens_seen": 85499280, "step": 126855 }, { "epoch": 3.099210905626267, "grad_norm": 0.047244809567928314, "learning_rate": 7.586327158460668e-07, "loss": 0.0966, "num_input_tokens_seen": 85502608, "step": 126860 }, { "epoch": 3.0993330564581143, "grad_norm": 0.0005038433591835201, "learning_rate": 7.585499604556487e-07, "loss": 0.0, "num_input_tokens_seen": 85505424, "step": 126865 }, { "epoch": 3.0994552072899615, "grad_norm": 0.011972165666520596, "learning_rate": 7.584672068211205e-07, "loss": 0.0, "num_input_tokens_seen": 85508816, "step": 126870 }, { "epoch": 3.0995773581218087, "grad_norm": 0.00880894623696804, "learning_rate": 7.583844549430835e-07, "loss": 0.0, "num_input_tokens_seen": 85512272, "step": 126875 }, { "epoch": 3.099699508953656, "grad_norm": 0.0007148732547648251, "learning_rate": 7.583017048221401e-07, "loss": 0.0, "num_input_tokens_seen": 85515472, "step": 126880 }, { "epoch": 3.099821659785503, "grad_norm": 0.0030713542364537716, "learning_rate": 7.582189564588912e-07, "loss": 0.0, "num_input_tokens_seen": 85518544, "step": 126885 }, { "epoch": 3.0999438106173502, "grad_norm": 0.001756560057401657, "learning_rate": 7.581362098539391e-07, "loss": 0.0, "num_input_tokens_seen": 85521616, "step": 126890 }, { "epoch": 3.1000659614491974, "grad_norm": 0.005060962401330471, "learning_rate": 7.58053465007886e-07, "loss": 0.0, "num_input_tokens_seen": 85524368, "step": 126895 }, { "epoch": 3.1001881122810446, "grad_norm": 0.0010907781543210149, "learning_rate": 7.57970721921333e-07, "loss": 0.0, "num_input_tokens_seen": 85527376, "step": 126900 }, { "epoch": 3.100310263112892, "grad_norm": 0.3052005171775818, "learning_rate": 7.578879805948819e-07, "loss": 0.0001, "num_input_tokens_seen": 85531472, "step": 126905 }, { "epoch": 3.100432413944739, "grad_norm": 0.00023940950632095337, "learning_rate": 7.578052410291346e-07, "loss": 0.0, "num_input_tokens_seen": 85534672, "step": 126910 }, { "epoch": 3.100554564776586, "grad_norm": 0.002871927572414279, "learning_rate": 7.577225032246925e-07, "loss": 0.0, "num_input_tokens_seen": 85537872, "step": 126915 }, { "epoch": 3.1006767156084334, "grad_norm": 0.0004271389334462583, "learning_rate": 7.576397671821579e-07, "loss": 0.0, "num_input_tokens_seen": 85541584, "step": 126920 }, { "epoch": 3.1007988664402806, "grad_norm": 0.0003819416160695255, "learning_rate": 7.575570329021316e-07, "loss": 0.0, "num_input_tokens_seen": 85544720, "step": 126925 }, { "epoch": 3.1009210172721278, "grad_norm": 0.0009660868090577424, "learning_rate": 7.574743003852164e-07, "loss": 0.0, "num_input_tokens_seen": 85547984, "step": 126930 }, { "epoch": 3.101043168103975, "grad_norm": 0.13878706097602844, "learning_rate": 7.573915696320128e-07, "loss": 0.0, "num_input_tokens_seen": 85550928, "step": 126935 }, { "epoch": 3.101165318935822, "grad_norm": 0.6104390025138855, "learning_rate": 7.573088406431236e-07, "loss": 0.0005, "num_input_tokens_seen": 85554512, "step": 126940 }, { "epoch": 3.1012874697676693, "grad_norm": 0.001662836642935872, "learning_rate": 7.57226113419149e-07, "loss": 0.0001, "num_input_tokens_seen": 85557648, "step": 126945 }, { "epoch": 3.101409620599516, "grad_norm": 0.003861561883240938, "learning_rate": 7.571433879606918e-07, "loss": 0.0, "num_input_tokens_seen": 85561936, "step": 126950 }, { "epoch": 3.1015317714313633, "grad_norm": 0.00627047149464488, "learning_rate": 7.570606642683531e-07, "loss": 0.0, "num_input_tokens_seen": 85565264, "step": 126955 }, { "epoch": 3.1016539222632105, "grad_norm": 28.827089309692383, "learning_rate": 7.569779423427347e-07, "loss": 0.075, "num_input_tokens_seen": 85568400, "step": 126960 }, { "epoch": 3.1017760730950577, "grad_norm": 0.002582937479019165, "learning_rate": 7.568952221844383e-07, "loss": 0.0001, "num_input_tokens_seen": 85571984, "step": 126965 }, { "epoch": 3.101898223926905, "grad_norm": 0.005004982929676771, "learning_rate": 7.568125037940647e-07, "loss": 0.0, "num_input_tokens_seen": 85575312, "step": 126970 }, { "epoch": 3.102020374758752, "grad_norm": 0.0010750489309430122, "learning_rate": 7.56729787172216e-07, "loss": 0.0, "num_input_tokens_seen": 85579152, "step": 126975 }, { "epoch": 3.102142525590599, "grad_norm": 0.00830749236047268, "learning_rate": 7.566470723194942e-07, "loss": 0.0001, "num_input_tokens_seen": 85582672, "step": 126980 }, { "epoch": 3.1022646764224464, "grad_norm": 0.0012467604828998446, "learning_rate": 7.565643592364999e-07, "loss": 0.0003, "num_input_tokens_seen": 85585872, "step": 126985 }, { "epoch": 3.1023868272542936, "grad_norm": 0.0003762088599614799, "learning_rate": 7.564816479238355e-07, "loss": 0.0425, "num_input_tokens_seen": 85589456, "step": 126990 }, { "epoch": 3.102508978086141, "grad_norm": 0.0005477772792801261, "learning_rate": 7.563989383821017e-07, "loss": 0.0, "num_input_tokens_seen": 85592784, "step": 126995 }, { "epoch": 3.102631128917988, "grad_norm": 0.008774727582931519, "learning_rate": 7.563162306119006e-07, "loss": 0.0, "num_input_tokens_seen": 85595920, "step": 127000 }, { "epoch": 3.102753279749835, "grad_norm": 817.6456909179688, "learning_rate": 7.562335246138333e-07, "loss": 0.029, "num_input_tokens_seen": 85599184, "step": 127005 }, { "epoch": 3.1028754305816824, "grad_norm": 0.0020112013444304466, "learning_rate": 7.561508203885012e-07, "loss": 0.0005, "num_input_tokens_seen": 85602384, "step": 127010 }, { "epoch": 3.1029975814135295, "grad_norm": 0.00014069992175791413, "learning_rate": 7.560681179365062e-07, "loss": 0.0, "num_input_tokens_seen": 85605648, "step": 127015 }, { "epoch": 3.1031197322453767, "grad_norm": 0.003962227143347263, "learning_rate": 7.559854172584491e-07, "loss": 0.0, "num_input_tokens_seen": 85609424, "step": 127020 }, { "epoch": 3.103241883077224, "grad_norm": 4.573463593260385e-05, "learning_rate": 7.559027183549322e-07, "loss": 0.0, "num_input_tokens_seen": 85612752, "step": 127025 }, { "epoch": 3.103364033909071, "grad_norm": 0.0379859022796154, "learning_rate": 7.558200212265558e-07, "loss": 0.0001, "num_input_tokens_seen": 85616336, "step": 127030 }, { "epoch": 3.103486184740918, "grad_norm": 0.0006606020033359528, "learning_rate": 7.557373258739224e-07, "loss": 0.1186, "num_input_tokens_seen": 85619792, "step": 127035 }, { "epoch": 3.103608335572765, "grad_norm": 0.01268491055816412, "learning_rate": 7.556546322976324e-07, "loss": 0.0, "num_input_tokens_seen": 85623248, "step": 127040 }, { "epoch": 3.1037304864046122, "grad_norm": 0.0007136997883208096, "learning_rate": 7.555719404982878e-07, "loss": 0.0, "num_input_tokens_seen": 85626896, "step": 127045 }, { "epoch": 3.1038526372364594, "grad_norm": 0.009720143862068653, "learning_rate": 7.5548925047649e-07, "loss": 0.0001, "num_input_tokens_seen": 85630160, "step": 127050 }, { "epoch": 3.1039747880683066, "grad_norm": 0.006915161851793528, "learning_rate": 7.5540656223284e-07, "loss": 0.0, "num_input_tokens_seen": 85633040, "step": 127055 }, { "epoch": 3.104096938900154, "grad_norm": 0.00606177793815732, "learning_rate": 7.553238757679393e-07, "loss": 0.0001, "num_input_tokens_seen": 85636176, "step": 127060 }, { "epoch": 3.104219089732001, "grad_norm": 0.0016445768997073174, "learning_rate": 7.552411910823891e-07, "loss": 0.076, "num_input_tokens_seen": 85639568, "step": 127065 }, { "epoch": 3.104341240563848, "grad_norm": 0.002737719565629959, "learning_rate": 7.551585081767906e-07, "loss": 0.0, "num_input_tokens_seen": 85642896, "step": 127070 }, { "epoch": 3.1044633913956954, "grad_norm": 0.0015813398640602827, "learning_rate": 7.550758270517458e-07, "loss": 0.0, "num_input_tokens_seen": 85645968, "step": 127075 }, { "epoch": 3.1045855422275426, "grad_norm": 0.01730138435959816, "learning_rate": 7.54993147707855e-07, "loss": 0.0, "num_input_tokens_seen": 85649424, "step": 127080 }, { "epoch": 3.1047076930593898, "grad_norm": 0.0005342924268916249, "learning_rate": 7.549104701457203e-07, "loss": 0.0, "num_input_tokens_seen": 85652688, "step": 127085 }, { "epoch": 3.104829843891237, "grad_norm": 1.5160987377166748, "learning_rate": 7.548277943659422e-07, "loss": 0.0002, "num_input_tokens_seen": 85655952, "step": 127090 }, { "epoch": 3.104951994723084, "grad_norm": 0.013315865769982338, "learning_rate": 7.547451203691227e-07, "loss": 0.0, "num_input_tokens_seen": 85659600, "step": 127095 }, { "epoch": 3.1050741455549313, "grad_norm": 0.00107246870175004, "learning_rate": 7.546624481558623e-07, "loss": 0.0001, "num_input_tokens_seen": 85662864, "step": 127100 }, { "epoch": 3.1051962963867785, "grad_norm": 0.006762396544218063, "learning_rate": 7.545797777267627e-07, "loss": 0.075, "num_input_tokens_seen": 85666448, "step": 127105 }, { "epoch": 3.1053184472186257, "grad_norm": 0.01796438731253147, "learning_rate": 7.54497109082425e-07, "loss": 0.0, "num_input_tokens_seen": 85669840, "step": 127110 }, { "epoch": 3.105440598050473, "grad_norm": 0.0015615341253578663, "learning_rate": 7.544144422234504e-07, "loss": 0.0, "num_input_tokens_seen": 85673296, "step": 127115 }, { "epoch": 3.10556274888232, "grad_norm": 0.00022422667825594544, "learning_rate": 7.543317771504402e-07, "loss": 0.0, "num_input_tokens_seen": 85676624, "step": 127120 }, { "epoch": 3.1056848997141673, "grad_norm": 0.00045156272244639695, "learning_rate": 7.542491138639951e-07, "loss": 0.0, "num_input_tokens_seen": 85679888, "step": 127125 }, { "epoch": 3.105807050546014, "grad_norm": 0.05712326988577843, "learning_rate": 7.541664523647168e-07, "loss": 0.0001, "num_input_tokens_seen": 85683664, "step": 127130 }, { "epoch": 3.105929201377861, "grad_norm": 0.04789616912603378, "learning_rate": 7.540837926532057e-07, "loss": 0.0, "num_input_tokens_seen": 85687312, "step": 127135 }, { "epoch": 3.1060513522097084, "grad_norm": 0.0007048913976177573, "learning_rate": 7.540011347300635e-07, "loss": 0.0, "num_input_tokens_seen": 85690512, "step": 127140 }, { "epoch": 3.1061735030415556, "grad_norm": 0.0033927359618246555, "learning_rate": 7.539184785958916e-07, "loss": 0.0, "num_input_tokens_seen": 85693648, "step": 127145 }, { "epoch": 3.106295653873403, "grad_norm": 0.00035177572863176465, "learning_rate": 7.538358242512905e-07, "loss": 0.0003, "num_input_tokens_seen": 85696848, "step": 127150 }, { "epoch": 3.10641780470525, "grad_norm": 0.0007704797317273915, "learning_rate": 7.537531716968617e-07, "loss": 0.0, "num_input_tokens_seen": 85700368, "step": 127155 }, { "epoch": 3.106539955537097, "grad_norm": 0.0019401259487494826, "learning_rate": 7.536705209332059e-07, "loss": 0.0, "num_input_tokens_seen": 85703568, "step": 127160 }, { "epoch": 3.1066621063689444, "grad_norm": 0.0008472330518998206, "learning_rate": 7.535878719609241e-07, "loss": 0.0, "num_input_tokens_seen": 85707216, "step": 127165 }, { "epoch": 3.1067842572007915, "grad_norm": 0.0016105916583910584, "learning_rate": 7.535052247806179e-07, "loss": 0.0, "num_input_tokens_seen": 85710736, "step": 127170 }, { "epoch": 3.1069064080326387, "grad_norm": 0.00024303798272740096, "learning_rate": 7.534225793928878e-07, "loss": 0.0, "num_input_tokens_seen": 85714064, "step": 127175 }, { "epoch": 3.107028558864486, "grad_norm": 0.00046464145998470485, "learning_rate": 7.533399357983353e-07, "loss": 0.0, "num_input_tokens_seen": 85717328, "step": 127180 }, { "epoch": 3.107150709696333, "grad_norm": 0.011482232250273228, "learning_rate": 7.532572939975608e-07, "loss": 0.0, "num_input_tokens_seen": 85721040, "step": 127185 }, { "epoch": 3.1072728605281803, "grad_norm": 0.002534418599680066, "learning_rate": 7.53174653991166e-07, "loss": 0.0, "num_input_tokens_seen": 85724112, "step": 127190 }, { "epoch": 3.1073950113600275, "grad_norm": 0.00020533621136564761, "learning_rate": 7.530920157797511e-07, "loss": 0.0, "num_input_tokens_seen": 85728336, "step": 127195 }, { "epoch": 3.1075171621918747, "grad_norm": 0.0007511446019634604, "learning_rate": 7.530093793639174e-07, "loss": 0.0001, "num_input_tokens_seen": 85731920, "step": 127200 }, { "epoch": 3.107639313023722, "grad_norm": 0.01170251052826643, "learning_rate": 7.529267447442664e-07, "loss": 0.0, "num_input_tokens_seen": 85735696, "step": 127205 }, { "epoch": 3.107761463855569, "grad_norm": 0.000938538636546582, "learning_rate": 7.528441119213984e-07, "loss": 0.0365, "num_input_tokens_seen": 85739152, "step": 127210 }, { "epoch": 3.107883614687416, "grad_norm": 0.05814709886908531, "learning_rate": 7.527614808959144e-07, "loss": 0.0, "num_input_tokens_seen": 85742416, "step": 127215 }, { "epoch": 3.108005765519263, "grad_norm": 0.0004410638939589262, "learning_rate": 7.526788516684155e-07, "loss": 0.0, "num_input_tokens_seen": 85745488, "step": 127220 }, { "epoch": 3.10812791635111, "grad_norm": 0.00028008874505758286, "learning_rate": 7.525962242395022e-07, "loss": 0.0583, "num_input_tokens_seen": 85748560, "step": 127225 }, { "epoch": 3.1082500671829574, "grad_norm": 0.03178201988339424, "learning_rate": 7.525135986097762e-07, "loss": 0.0, "num_input_tokens_seen": 85752272, "step": 127230 }, { "epoch": 3.1083722180148046, "grad_norm": 0.0007714095409028232, "learning_rate": 7.524309747798374e-07, "loss": 0.0774, "num_input_tokens_seen": 85755472, "step": 127235 }, { "epoch": 3.1084943688466518, "grad_norm": 0.004785490222275257, "learning_rate": 7.523483527502876e-07, "loss": 0.0, "num_input_tokens_seen": 85758480, "step": 127240 }, { "epoch": 3.108616519678499, "grad_norm": 0.002524032723158598, "learning_rate": 7.522657325217267e-07, "loss": 0.0, "num_input_tokens_seen": 85761424, "step": 127245 }, { "epoch": 3.108738670510346, "grad_norm": 0.11010199785232544, "learning_rate": 7.521831140947566e-07, "loss": 0.0001, "num_input_tokens_seen": 85764688, "step": 127250 }, { "epoch": 3.1088608213421933, "grad_norm": 0.16963838040828705, "learning_rate": 7.52100497469977e-07, "loss": 0.0, "num_input_tokens_seen": 85767760, "step": 127255 }, { "epoch": 3.1089829721740405, "grad_norm": 0.0005663972115144134, "learning_rate": 7.520178826479895e-07, "loss": 0.0, "num_input_tokens_seen": 85770832, "step": 127260 }, { "epoch": 3.1091051230058877, "grad_norm": 0.0009025745093822479, "learning_rate": 7.519352696293948e-07, "loss": 0.0001, "num_input_tokens_seen": 85774544, "step": 127265 }, { "epoch": 3.109227273837735, "grad_norm": 0.004693191032856703, "learning_rate": 7.518526584147934e-07, "loss": 0.0325, "num_input_tokens_seen": 85778064, "step": 127270 }, { "epoch": 3.109349424669582, "grad_norm": 0.0005293733556754887, "learning_rate": 7.517700490047864e-07, "loss": 0.0, "num_input_tokens_seen": 85781328, "step": 127275 }, { "epoch": 3.1094715755014293, "grad_norm": 0.3933495879173279, "learning_rate": 7.516874413999739e-07, "loss": 0.0002, "num_input_tokens_seen": 85784400, "step": 127280 }, { "epoch": 3.1095937263332765, "grad_norm": 0.0027819527313113213, "learning_rate": 7.516048356009577e-07, "loss": 0.0, "num_input_tokens_seen": 85787600, "step": 127285 }, { "epoch": 3.1097158771651237, "grad_norm": 0.002181408926844597, "learning_rate": 7.515222316083374e-07, "loss": 0.0, "num_input_tokens_seen": 85790864, "step": 127290 }, { "epoch": 3.109838027996971, "grad_norm": 0.00033735146280378103, "learning_rate": 7.514396294227143e-07, "loss": 0.0, "num_input_tokens_seen": 85794320, "step": 127295 }, { "epoch": 3.109960178828818, "grad_norm": 0.0002902150445152074, "learning_rate": 7.513570290446896e-07, "loss": 0.0, "num_input_tokens_seen": 85797584, "step": 127300 }, { "epoch": 3.110082329660665, "grad_norm": 0.0033535470720380545, "learning_rate": 7.512744304748629e-07, "loss": 0.0, "num_input_tokens_seen": 85801104, "step": 127305 }, { "epoch": 3.110204480492512, "grad_norm": 0.00014512380585074425, "learning_rate": 7.511918337138359e-07, "loss": 0.0, "num_input_tokens_seen": 85804368, "step": 127310 }, { "epoch": 3.110326631324359, "grad_norm": 0.004475784488022327, "learning_rate": 7.511092387622086e-07, "loss": 0.0001, "num_input_tokens_seen": 85807696, "step": 127315 }, { "epoch": 3.1104487821562063, "grad_norm": 0.013879061676561832, "learning_rate": 7.510266456205816e-07, "loss": 0.0001, "num_input_tokens_seen": 85810512, "step": 127320 }, { "epoch": 3.1105709329880535, "grad_norm": 0.004589783027768135, "learning_rate": 7.509440542895562e-07, "loss": 0.0, "num_input_tokens_seen": 85813520, "step": 127325 }, { "epoch": 3.1106930838199007, "grad_norm": 0.00585208460688591, "learning_rate": 7.508614647697324e-07, "loss": 0.0002, "num_input_tokens_seen": 85816784, "step": 127330 }, { "epoch": 3.110815234651748, "grad_norm": 0.0008580581634305418, "learning_rate": 7.507788770617111e-07, "loss": 0.0, "num_input_tokens_seen": 85820112, "step": 127335 }, { "epoch": 3.110937385483595, "grad_norm": 0.0012955941492691636, "learning_rate": 7.506962911660927e-07, "loss": 0.0, "num_input_tokens_seen": 85823056, "step": 127340 }, { "epoch": 3.1110595363154423, "grad_norm": 0.0007464304217137396, "learning_rate": 7.506137070834784e-07, "loss": 0.0, "num_input_tokens_seen": 85826448, "step": 127345 }, { "epoch": 3.1111816871472895, "grad_norm": 0.002111968584358692, "learning_rate": 7.505311248144677e-07, "loss": 0.0, "num_input_tokens_seen": 85829456, "step": 127350 }, { "epoch": 3.1113038379791367, "grad_norm": 0.0009733681799843907, "learning_rate": 7.504485443596619e-07, "loss": 0.0, "num_input_tokens_seen": 85832720, "step": 127355 }, { "epoch": 3.111425988810984, "grad_norm": 0.003962097689509392, "learning_rate": 7.503659657196617e-07, "loss": 0.0, "num_input_tokens_seen": 85836048, "step": 127360 }, { "epoch": 3.111548139642831, "grad_norm": 0.0002371447772020474, "learning_rate": 7.502833888950672e-07, "loss": 0.0, "num_input_tokens_seen": 85839376, "step": 127365 }, { "epoch": 3.1116702904746782, "grad_norm": 0.003351636463776231, "learning_rate": 7.502008138864791e-07, "loss": 0.0, "num_input_tokens_seen": 85842448, "step": 127370 }, { "epoch": 3.1117924413065254, "grad_norm": 3.950299742427887e-06, "learning_rate": 7.501182406944977e-07, "loss": 0.0442, "num_input_tokens_seen": 85845648, "step": 127375 }, { "epoch": 3.1119145921383726, "grad_norm": 0.00042691812268458307, "learning_rate": 7.500356693197236e-07, "loss": 0.0, "num_input_tokens_seen": 85848784, "step": 127380 }, { "epoch": 3.11203674297022, "grad_norm": 13.983359336853027, "learning_rate": 7.499530997627576e-07, "loss": 0.0551, "num_input_tokens_seen": 85851920, "step": 127385 }, { "epoch": 3.112158893802067, "grad_norm": 28.192888259887695, "learning_rate": 7.498705320241998e-07, "loss": 0.0619, "num_input_tokens_seen": 85855056, "step": 127390 }, { "epoch": 3.1122810446339138, "grad_norm": 0.020545123144984245, "learning_rate": 7.49787966104651e-07, "loss": 0.0, "num_input_tokens_seen": 85858256, "step": 127395 }, { "epoch": 3.112403195465761, "grad_norm": 0.25158601999282837, "learning_rate": 7.49705402004711e-07, "loss": 0.0002, "num_input_tokens_seen": 85861712, "step": 127400 }, { "epoch": 3.112525346297608, "grad_norm": 0.047194406390190125, "learning_rate": 7.49622839724981e-07, "loss": 0.0, "num_input_tokens_seen": 85865104, "step": 127405 }, { "epoch": 3.1126474971294553, "grad_norm": 0.0534670390188694, "learning_rate": 7.495402792660608e-07, "loss": 0.0002, "num_input_tokens_seen": 85868368, "step": 127410 }, { "epoch": 3.1127696479613025, "grad_norm": 0.015193069353699684, "learning_rate": 7.494577206285511e-07, "loss": 0.0, "num_input_tokens_seen": 85871440, "step": 127415 }, { "epoch": 3.1128917987931497, "grad_norm": 0.005533255636692047, "learning_rate": 7.493751638130523e-07, "loss": 0.0001, "num_input_tokens_seen": 85875472, "step": 127420 }, { "epoch": 3.113013949624997, "grad_norm": 0.0008053280180320144, "learning_rate": 7.492926088201648e-07, "loss": 0.0001, "num_input_tokens_seen": 85878672, "step": 127425 }, { "epoch": 3.113136100456844, "grad_norm": 0.008561119437217712, "learning_rate": 7.49210055650489e-07, "loss": 0.0, "num_input_tokens_seen": 85881936, "step": 127430 }, { "epoch": 3.1132582512886913, "grad_norm": 0.0030915874522179365, "learning_rate": 7.491275043046246e-07, "loss": 0.0, "num_input_tokens_seen": 85885264, "step": 127435 }, { "epoch": 3.1133804021205385, "grad_norm": 32.49300765991211, "learning_rate": 7.49044954783173e-07, "loss": 0.0663, "num_input_tokens_seen": 85888400, "step": 127440 }, { "epoch": 3.1135025529523856, "grad_norm": 0.000495272921398282, "learning_rate": 7.489624070867337e-07, "loss": 0.0, "num_input_tokens_seen": 85892048, "step": 127445 }, { "epoch": 3.113624703784233, "grad_norm": 0.0019321962026879191, "learning_rate": 7.48879861215907e-07, "loss": 0.0, "num_input_tokens_seen": 85895056, "step": 127450 }, { "epoch": 3.11374685461608, "grad_norm": 0.6780271530151367, "learning_rate": 7.487973171712942e-07, "loss": 0.0006, "num_input_tokens_seen": 85898000, "step": 127455 }, { "epoch": 3.113869005447927, "grad_norm": 0.0012302626855671406, "learning_rate": 7.487147749534943e-07, "loss": 0.0, "num_input_tokens_seen": 85901584, "step": 127460 }, { "epoch": 3.1139911562797744, "grad_norm": 0.00070422631688416, "learning_rate": 7.486322345631086e-07, "loss": 0.0, "num_input_tokens_seen": 85904784, "step": 127465 }, { "epoch": 3.1141133071116216, "grad_norm": 0.00759016303345561, "learning_rate": 7.485496960007367e-07, "loss": 0.0, "num_input_tokens_seen": 85908048, "step": 127470 }, { "epoch": 3.114235457943469, "grad_norm": 0.009760981425642967, "learning_rate": 7.484671592669789e-07, "loss": 0.0, "num_input_tokens_seen": 85911248, "step": 127475 }, { "epoch": 3.1143576087753155, "grad_norm": 0.16156858205795288, "learning_rate": 7.483846243624359e-07, "loss": 0.0001, "num_input_tokens_seen": 85914768, "step": 127480 }, { "epoch": 3.1144797596071627, "grad_norm": 26.11109733581543, "learning_rate": 7.483020912877072e-07, "loss": 0.0927, "num_input_tokens_seen": 85918032, "step": 127485 }, { "epoch": 3.11460191043901, "grad_norm": 0.019733689725399017, "learning_rate": 7.482195600433938e-07, "loss": 0.0, "num_input_tokens_seen": 85921552, "step": 127490 }, { "epoch": 3.114724061270857, "grad_norm": 6.98584844940342e-05, "learning_rate": 7.481370306300949e-07, "loss": 0.0, "num_input_tokens_seen": 85925200, "step": 127495 }, { "epoch": 3.1148462121027043, "grad_norm": 0.327136367559433, "learning_rate": 7.48054503048412e-07, "loss": 0.0001, "num_input_tokens_seen": 85928528, "step": 127500 }, { "epoch": 3.1149683629345515, "grad_norm": 0.014904793351888657, "learning_rate": 7.479719772989439e-07, "loss": 0.0, "num_input_tokens_seen": 85932112, "step": 127505 }, { "epoch": 3.1150905137663987, "grad_norm": 0.12061361223459244, "learning_rate": 7.478894533822914e-07, "loss": 0.0004, "num_input_tokens_seen": 85935120, "step": 127510 }, { "epoch": 3.115212664598246, "grad_norm": 0.002666124375537038, "learning_rate": 7.478069312990549e-07, "loss": 0.0, "num_input_tokens_seen": 85938320, "step": 127515 }, { "epoch": 3.115334815430093, "grad_norm": 0.02227841690182686, "learning_rate": 7.477244110498342e-07, "loss": 0.0313, "num_input_tokens_seen": 85941456, "step": 127520 }, { "epoch": 3.1154569662619402, "grad_norm": 0.15282024443149567, "learning_rate": 7.476418926352295e-07, "loss": 0.0676, "num_input_tokens_seen": 85944656, "step": 127525 }, { "epoch": 3.1155791170937874, "grad_norm": 0.0020133310463279486, "learning_rate": 7.475593760558406e-07, "loss": 0.0, "num_input_tokens_seen": 85947728, "step": 127530 }, { "epoch": 3.1157012679256346, "grad_norm": 0.027102556079626083, "learning_rate": 7.474768613122678e-07, "loss": 0.0, "num_input_tokens_seen": 85950672, "step": 127535 }, { "epoch": 3.115823418757482, "grad_norm": 47.2922248840332, "learning_rate": 7.473943484051115e-07, "loss": 0.0224, "num_input_tokens_seen": 85954384, "step": 127540 }, { "epoch": 3.115945569589329, "grad_norm": 1599.6219482421875, "learning_rate": 7.473118373349709e-07, "loss": 0.0348, "num_input_tokens_seen": 85957520, "step": 127545 }, { "epoch": 3.116067720421176, "grad_norm": 0.002141246572136879, "learning_rate": 7.472293281024474e-07, "loss": 0.0, "num_input_tokens_seen": 85960720, "step": 127550 }, { "epoch": 3.1161898712530234, "grad_norm": 0.02709318697452545, "learning_rate": 7.471468207081394e-07, "loss": 0.0, "num_input_tokens_seen": 85963984, "step": 127555 }, { "epoch": 3.1163120220848706, "grad_norm": 0.005332686472684145, "learning_rate": 7.470643151526483e-07, "loss": 0.041, "num_input_tokens_seen": 85966992, "step": 127560 }, { "epoch": 3.1164341729167178, "grad_norm": 0.0005027592997066677, "learning_rate": 7.469818114365732e-07, "loss": 0.0, "num_input_tokens_seen": 85970128, "step": 127565 }, { "epoch": 3.116556323748565, "grad_norm": 0.0007410330581478775, "learning_rate": 7.468993095605143e-07, "loss": 0.0, "num_input_tokens_seen": 85973392, "step": 127570 }, { "epoch": 3.1166784745804117, "grad_norm": 0.010723591782152653, "learning_rate": 7.46816809525072e-07, "loss": 0.0, "num_input_tokens_seen": 85976784, "step": 127575 }, { "epoch": 3.116800625412259, "grad_norm": 0.020599083974957466, "learning_rate": 7.467343113308459e-07, "loss": 0.0, "num_input_tokens_seen": 85980432, "step": 127580 }, { "epoch": 3.116922776244106, "grad_norm": 6.975384894758463e-05, "learning_rate": 7.466518149784362e-07, "loss": 0.0788, "num_input_tokens_seen": 85984144, "step": 127585 }, { "epoch": 3.1170449270759533, "grad_norm": 0.0009970880346372724, "learning_rate": 7.465693204684422e-07, "loss": 0.0, "num_input_tokens_seen": 85987280, "step": 127590 }, { "epoch": 3.1171670779078005, "grad_norm": 0.0002569013158790767, "learning_rate": 7.464868278014647e-07, "loss": 0.0, "num_input_tokens_seen": 85990544, "step": 127595 }, { "epoch": 3.1172892287396476, "grad_norm": 0.00037578216870315373, "learning_rate": 7.464043369781027e-07, "loss": 0.0, "num_input_tokens_seen": 85994064, "step": 127600 }, { "epoch": 3.117411379571495, "grad_norm": 0.0027975961565971375, "learning_rate": 7.463218479989568e-07, "loss": 0.0, "num_input_tokens_seen": 85997712, "step": 127605 }, { "epoch": 3.117533530403342, "grad_norm": 0.0024667629040777683, "learning_rate": 7.462393608646269e-07, "loss": 0.0, "num_input_tokens_seen": 86001488, "step": 127610 }, { "epoch": 3.117655681235189, "grad_norm": 0.0030389088205993176, "learning_rate": 7.461568755757122e-07, "loss": 0.0, "num_input_tokens_seen": 86004880, "step": 127615 }, { "epoch": 3.1177778320670364, "grad_norm": 0.001127371215261519, "learning_rate": 7.460743921328134e-07, "loss": 0.0, "num_input_tokens_seen": 86008272, "step": 127620 }, { "epoch": 3.1178999828988836, "grad_norm": 0.0004868946853093803, "learning_rate": 7.459919105365297e-07, "loss": 0.0864, "num_input_tokens_seen": 86011408, "step": 127625 }, { "epoch": 3.118022133730731, "grad_norm": 0.0030997958965599537, "learning_rate": 7.459094307874609e-07, "loss": 0.0001, "num_input_tokens_seen": 86015184, "step": 127630 }, { "epoch": 3.118144284562578, "grad_norm": 0.0009643174707889557, "learning_rate": 7.458269528862075e-07, "loss": 0.0003, "num_input_tokens_seen": 86018256, "step": 127635 }, { "epoch": 3.118266435394425, "grad_norm": 0.0002588916686363518, "learning_rate": 7.457444768333686e-07, "loss": 0.058, "num_input_tokens_seen": 86021392, "step": 127640 }, { "epoch": 3.1183885862262724, "grad_norm": 0.006056373938918114, "learning_rate": 7.456620026295446e-07, "loss": 0.0, "num_input_tokens_seen": 86025296, "step": 127645 }, { "epoch": 3.1185107370581195, "grad_norm": 0.0017953312490135431, "learning_rate": 7.455795302753345e-07, "loss": 0.0, "num_input_tokens_seen": 86028880, "step": 127650 }, { "epoch": 3.1186328878899667, "grad_norm": 0.001279030810110271, "learning_rate": 7.454970597713388e-07, "loss": 0.0, "num_input_tokens_seen": 86032272, "step": 127655 }, { "epoch": 3.1187550387218135, "grad_norm": 0.009960848838090897, "learning_rate": 7.454145911181566e-07, "loss": 0.0, "num_input_tokens_seen": 86035280, "step": 127660 }, { "epoch": 3.1188771895536607, "grad_norm": 0.003310409840196371, "learning_rate": 7.453321243163879e-07, "loss": 0.0, "num_input_tokens_seen": 86038416, "step": 127665 }, { "epoch": 3.118999340385508, "grad_norm": 0.0014001899398863316, "learning_rate": 7.452496593666329e-07, "loss": 0.0002, "num_input_tokens_seen": 86041488, "step": 127670 }, { "epoch": 3.119121491217355, "grad_norm": 0.006665100809186697, "learning_rate": 7.451671962694907e-07, "loss": 0.0003, "num_input_tokens_seen": 86044880, "step": 127675 }, { "epoch": 3.1192436420492022, "grad_norm": 0.00016490106645505875, "learning_rate": 7.450847350255613e-07, "loss": 0.0, "num_input_tokens_seen": 86048400, "step": 127680 }, { "epoch": 3.1193657928810494, "grad_norm": 0.0031723338179290295, "learning_rate": 7.450022756354443e-07, "loss": 0.0, "num_input_tokens_seen": 86051728, "step": 127685 }, { "epoch": 3.1194879437128966, "grad_norm": 0.0014128751354292035, "learning_rate": 7.449198180997389e-07, "loss": 0.0, "num_input_tokens_seen": 86054992, "step": 127690 }, { "epoch": 3.119610094544744, "grad_norm": 0.006229817401617765, "learning_rate": 7.448373624190458e-07, "loss": 0.0, "num_input_tokens_seen": 86057936, "step": 127695 }, { "epoch": 3.119732245376591, "grad_norm": 0.05980667844414711, "learning_rate": 7.447549085939636e-07, "loss": 0.0609, "num_input_tokens_seen": 86061200, "step": 127700 }, { "epoch": 3.119854396208438, "grad_norm": 6.3800071075093e-06, "learning_rate": 7.446724566250927e-07, "loss": 0.0001, "num_input_tokens_seen": 86064528, "step": 127705 }, { "epoch": 3.1199765470402854, "grad_norm": 0.07691963762044907, "learning_rate": 7.44590006513032e-07, "loss": 0.0001, "num_input_tokens_seen": 86067728, "step": 127710 }, { "epoch": 3.1200986978721326, "grad_norm": 0.0028570157010108232, "learning_rate": 7.445075582583819e-07, "loss": 0.0001, "num_input_tokens_seen": 86070928, "step": 127715 }, { "epoch": 3.1202208487039798, "grad_norm": 0.07462915778160095, "learning_rate": 7.444251118617411e-07, "loss": 0.0, "num_input_tokens_seen": 86074256, "step": 127720 }, { "epoch": 3.120342999535827, "grad_norm": 0.008847139775753021, "learning_rate": 7.443426673237098e-07, "loss": 0.0, "num_input_tokens_seen": 86077520, "step": 127725 }, { "epoch": 3.120465150367674, "grad_norm": 0.04461780562996864, "learning_rate": 7.442602246448875e-07, "loss": 0.0, "num_input_tokens_seen": 86080720, "step": 127730 }, { "epoch": 3.1205873011995213, "grad_norm": 4.41354131908156e-05, "learning_rate": 7.441777838258736e-07, "loss": 0.0002, "num_input_tokens_seen": 86084304, "step": 127735 }, { "epoch": 3.1207094520313685, "grad_norm": 0.013822735287249088, "learning_rate": 7.440953448672678e-07, "loss": 0.0, "num_input_tokens_seen": 86087504, "step": 127740 }, { "epoch": 3.1208316028632157, "grad_norm": 0.0070150066167116165, "learning_rate": 7.440129077696691e-07, "loss": 0.0, "num_input_tokens_seen": 86090832, "step": 127745 }, { "epoch": 3.1209537536950624, "grad_norm": 0.006862407084554434, "learning_rate": 7.439304725336778e-07, "loss": 0.0, "num_input_tokens_seen": 86094352, "step": 127750 }, { "epoch": 3.1210759045269096, "grad_norm": 0.0013574488693848252, "learning_rate": 7.438480391598925e-07, "loss": 0.0836, "num_input_tokens_seen": 86097488, "step": 127755 }, { "epoch": 3.121198055358757, "grad_norm": 0.0021971026435494423, "learning_rate": 7.437656076489133e-07, "loss": 0.0, "num_input_tokens_seen": 86100624, "step": 127760 }, { "epoch": 3.121320206190604, "grad_norm": 0.0034022280015051365, "learning_rate": 7.436831780013398e-07, "loss": 0.0052, "num_input_tokens_seen": 86104336, "step": 127765 }, { "epoch": 3.121442357022451, "grad_norm": 0.09795635938644409, "learning_rate": 7.436007502177708e-07, "loss": 0.0003, "num_input_tokens_seen": 86108240, "step": 127770 }, { "epoch": 3.1215645078542984, "grad_norm": 0.0012237750925123692, "learning_rate": 7.435183242988066e-07, "loss": 0.0, "num_input_tokens_seen": 86111440, "step": 127775 }, { "epoch": 3.1216866586861456, "grad_norm": 0.0012300205416977406, "learning_rate": 7.434359002450458e-07, "loss": 0.0, "num_input_tokens_seen": 86115216, "step": 127780 }, { "epoch": 3.1218088095179928, "grad_norm": 0.37210115790367126, "learning_rate": 7.433534780570881e-07, "loss": 0.0002, "num_input_tokens_seen": 86118672, "step": 127785 }, { "epoch": 3.12193096034984, "grad_norm": 0.0007529134163632989, "learning_rate": 7.432710577355332e-07, "loss": 0.0, "num_input_tokens_seen": 86121552, "step": 127790 }, { "epoch": 3.122053111181687, "grad_norm": 0.0013815355487167835, "learning_rate": 7.431886392809799e-07, "loss": 0.131, "num_input_tokens_seen": 86125136, "step": 127795 }, { "epoch": 3.1221752620135343, "grad_norm": 0.03114251233637333, "learning_rate": 7.431062226940281e-07, "loss": 0.0, "num_input_tokens_seen": 86128336, "step": 127800 }, { "epoch": 3.1222974128453815, "grad_norm": 0.0010919078486040235, "learning_rate": 7.430238079752768e-07, "loss": 0.0, "num_input_tokens_seen": 86131536, "step": 127805 }, { "epoch": 3.1224195636772287, "grad_norm": 0.0005009145243093371, "learning_rate": 7.429413951253259e-07, "loss": 0.0, "num_input_tokens_seen": 86134672, "step": 127810 }, { "epoch": 3.122541714509076, "grad_norm": 0.0005763991503044963, "learning_rate": 7.428589841447737e-07, "loss": 0.0, "num_input_tokens_seen": 86138512, "step": 127815 }, { "epoch": 3.122663865340923, "grad_norm": 0.0008991776849143207, "learning_rate": 7.427765750342201e-07, "loss": 0.0, "num_input_tokens_seen": 86141648, "step": 127820 }, { "epoch": 3.1227860161727703, "grad_norm": 0.0005382252857089043, "learning_rate": 7.42694167794265e-07, "loss": 0.0, "num_input_tokens_seen": 86145168, "step": 127825 }, { "epoch": 3.1229081670046175, "grad_norm": 0.000699529075063765, "learning_rate": 7.426117624255068e-07, "loss": 0.0001, "num_input_tokens_seen": 86148304, "step": 127830 }, { "epoch": 3.1230303178364647, "grad_norm": 0.0004691076755989343, "learning_rate": 7.425293589285453e-07, "loss": 0.0, "num_input_tokens_seen": 86151440, "step": 127835 }, { "epoch": 3.1231524686683114, "grad_norm": 0.0002849614538718015, "learning_rate": 7.424469573039793e-07, "loss": 0.0, "num_input_tokens_seen": 86154832, "step": 127840 }, { "epoch": 3.1232746195001586, "grad_norm": 0.001343605574220419, "learning_rate": 7.423645575524087e-07, "loss": 0.1155, "num_input_tokens_seen": 86158480, "step": 127845 }, { "epoch": 3.123396770332006, "grad_norm": 0.0011115703964605927, "learning_rate": 7.422821596744318e-07, "loss": 0.0, "num_input_tokens_seen": 86162128, "step": 127850 }, { "epoch": 3.123518921163853, "grad_norm": 0.0004212648200336844, "learning_rate": 7.421997636706486e-07, "loss": 0.0, "num_input_tokens_seen": 86165456, "step": 127855 }, { "epoch": 3.1236410719957, "grad_norm": 0.0056180949322879314, "learning_rate": 7.421173695416582e-07, "loss": 0.0978, "num_input_tokens_seen": 86168912, "step": 127860 }, { "epoch": 3.1237632228275474, "grad_norm": 0.00836129393428564, "learning_rate": 7.420349772880592e-07, "loss": 0.0001, "num_input_tokens_seen": 86172432, "step": 127865 }, { "epoch": 3.1238853736593946, "grad_norm": 0.0003254195617046207, "learning_rate": 7.419525869104518e-07, "loss": 0.0, "num_input_tokens_seen": 86175568, "step": 127870 }, { "epoch": 3.1240075244912417, "grad_norm": 0.00044932254240848124, "learning_rate": 7.418701984094341e-07, "loss": 0.0, "num_input_tokens_seen": 86178832, "step": 127875 }, { "epoch": 3.124129675323089, "grad_norm": 0.006995463743805885, "learning_rate": 7.41787811785606e-07, "loss": 0.0, "num_input_tokens_seen": 86181968, "step": 127880 }, { "epoch": 3.124251826154936, "grad_norm": 0.00013451272388920188, "learning_rate": 7.417054270395664e-07, "loss": 0.0, "num_input_tokens_seen": 86185360, "step": 127885 }, { "epoch": 3.1243739769867833, "grad_norm": 0.016113661229610443, "learning_rate": 7.416230441719143e-07, "loss": 0.0, "num_input_tokens_seen": 86188944, "step": 127890 }, { "epoch": 3.1244961278186305, "grad_norm": 0.0011379237985238433, "learning_rate": 7.415406631832493e-07, "loss": 0.0, "num_input_tokens_seen": 86192336, "step": 127895 }, { "epoch": 3.1246182786504777, "grad_norm": 0.012022904120385647, "learning_rate": 7.414582840741696e-07, "loss": 0.0001, "num_input_tokens_seen": 86196368, "step": 127900 }, { "epoch": 3.124740429482325, "grad_norm": 0.01843303069472313, "learning_rate": 7.413759068452752e-07, "loss": 0.0, "num_input_tokens_seen": 86199760, "step": 127905 }, { "epoch": 3.124862580314172, "grad_norm": 0.0009940828895196319, "learning_rate": 7.412935314971643e-07, "loss": 0.0339, "num_input_tokens_seen": 86203216, "step": 127910 }, { "epoch": 3.1249847311460193, "grad_norm": 0.0009316856157965958, "learning_rate": 7.412111580304366e-07, "loss": 0.0, "num_input_tokens_seen": 86206224, "step": 127915 }, { "epoch": 3.1251068819778665, "grad_norm": 0.013916014693677425, "learning_rate": 7.411287864456912e-07, "loss": 0.0001, "num_input_tokens_seen": 86209616, "step": 127920 }, { "epoch": 3.125229032809713, "grad_norm": 0.0019733826629817486, "learning_rate": 7.410464167435265e-07, "loss": 0.0001, "num_input_tokens_seen": 86212880, "step": 127925 }, { "epoch": 3.1253511836415604, "grad_norm": 0.0007011366542428732, "learning_rate": 7.409640489245423e-07, "loss": 0.0, "num_input_tokens_seen": 86216400, "step": 127930 }, { "epoch": 3.1254733344734076, "grad_norm": 0.0009087013895623386, "learning_rate": 7.408816829893371e-07, "loss": 0.0, "num_input_tokens_seen": 86219472, "step": 127935 }, { "epoch": 3.1255954853052548, "grad_norm": 0.008903411217033863, "learning_rate": 7.407993189385098e-07, "loss": 0.0001, "num_input_tokens_seen": 86223184, "step": 127940 }, { "epoch": 3.125717636137102, "grad_norm": 0.004272781778126955, "learning_rate": 7.407169567726599e-07, "loss": 0.0588, "num_input_tokens_seen": 86226320, "step": 127945 }, { "epoch": 3.125839786968949, "grad_norm": 0.006159925367683172, "learning_rate": 7.406345964923857e-07, "loss": 0.0, "num_input_tokens_seen": 86229840, "step": 127950 }, { "epoch": 3.1259619378007963, "grad_norm": 0.0068336776457726955, "learning_rate": 7.40552238098287e-07, "loss": 0.0, "num_input_tokens_seen": 86233744, "step": 127955 }, { "epoch": 3.1260840886326435, "grad_norm": 0.022573009133338928, "learning_rate": 7.404698815909616e-07, "loss": 0.0002, "num_input_tokens_seen": 86237264, "step": 127960 }, { "epoch": 3.1262062394644907, "grad_norm": 0.005695146508514881, "learning_rate": 7.403875269710093e-07, "loss": 0.0024, "num_input_tokens_seen": 86240720, "step": 127965 }, { "epoch": 3.126328390296338, "grad_norm": 0.06871867179870605, "learning_rate": 7.403051742390285e-07, "loss": 0.0, "num_input_tokens_seen": 86243856, "step": 127970 }, { "epoch": 3.126450541128185, "grad_norm": 0.015363491140305996, "learning_rate": 7.402228233956184e-07, "loss": 0.0345, "num_input_tokens_seen": 86247248, "step": 127975 }, { "epoch": 3.1265726919600323, "grad_norm": 0.03830622509121895, "learning_rate": 7.401404744413782e-07, "loss": 0.0, "num_input_tokens_seen": 86250832, "step": 127980 }, { "epoch": 3.1266948427918795, "grad_norm": 0.0008308925316669047, "learning_rate": 7.40058127376906e-07, "loss": 0.0, "num_input_tokens_seen": 86254800, "step": 127985 }, { "epoch": 3.1268169936237267, "grad_norm": 0.0017075296491384506, "learning_rate": 7.399757822028011e-07, "loss": 0.0, "num_input_tokens_seen": 86258256, "step": 127990 }, { "epoch": 3.126939144455574, "grad_norm": 0.007749969605356455, "learning_rate": 7.398934389196622e-07, "loss": 0.0, "num_input_tokens_seen": 86261456, "step": 127995 }, { "epoch": 3.127061295287421, "grad_norm": 0.0004765443445648998, "learning_rate": 7.398110975280884e-07, "loss": 0.0, "num_input_tokens_seen": 86265424, "step": 128000 }, { "epoch": 3.1271834461192682, "grad_norm": 0.0021843453869223595, "learning_rate": 7.39728758028678e-07, "loss": 0.0, "num_input_tokens_seen": 86269008, "step": 128005 }, { "epoch": 3.1273055969511154, "grad_norm": 0.0013905063970014453, "learning_rate": 7.3964642042203e-07, "loss": 0.0002, "num_input_tokens_seen": 86272464, "step": 128010 }, { "epoch": 3.1274277477829626, "grad_norm": 0.03797272965312004, "learning_rate": 7.395640847087436e-07, "loss": 0.0, "num_input_tokens_seen": 86275856, "step": 128015 }, { "epoch": 3.1275498986148094, "grad_norm": 0.09020286798477173, "learning_rate": 7.394817508894169e-07, "loss": 0.0447, "num_input_tokens_seen": 86279184, "step": 128020 }, { "epoch": 3.1276720494466566, "grad_norm": 0.0005714365397579968, "learning_rate": 7.393994189646493e-07, "loss": 0.0, "num_input_tokens_seen": 86282320, "step": 128025 }, { "epoch": 3.1277942002785037, "grad_norm": 0.004440112039446831, "learning_rate": 7.393170889350388e-07, "loss": 0.0, "num_input_tokens_seen": 86285712, "step": 128030 }, { "epoch": 3.127916351110351, "grad_norm": 0.0017021347302943468, "learning_rate": 7.392347608011849e-07, "loss": 0.0, "num_input_tokens_seen": 86289616, "step": 128035 }, { "epoch": 3.128038501942198, "grad_norm": 0.0017139667179435492, "learning_rate": 7.391524345636859e-07, "loss": 0.0, "num_input_tokens_seen": 86293072, "step": 128040 }, { "epoch": 3.1281606527740453, "grad_norm": 0.02372741885483265, "learning_rate": 7.390701102231405e-07, "loss": 0.0, "num_input_tokens_seen": 86296592, "step": 128045 }, { "epoch": 3.1282828036058925, "grad_norm": 0.0010716107208281755, "learning_rate": 7.389877877801476e-07, "loss": 0.0, "num_input_tokens_seen": 86299792, "step": 128050 }, { "epoch": 3.1284049544377397, "grad_norm": 0.002866087481379509, "learning_rate": 7.389054672353054e-07, "loss": 0.0, "num_input_tokens_seen": 86303568, "step": 128055 }, { "epoch": 3.128527105269587, "grad_norm": 0.002422518329694867, "learning_rate": 7.388231485892132e-07, "loss": 0.0, "num_input_tokens_seen": 86306960, "step": 128060 }, { "epoch": 3.128649256101434, "grad_norm": 0.002960509154945612, "learning_rate": 7.38740831842469e-07, "loss": 0.0, "num_input_tokens_seen": 86310544, "step": 128065 }, { "epoch": 3.1287714069332813, "grad_norm": 0.013696005567908287, "learning_rate": 7.386585169956717e-07, "loss": 0.0553, "num_input_tokens_seen": 86313808, "step": 128070 }, { "epoch": 3.1288935577651285, "grad_norm": 0.005466190166771412, "learning_rate": 7.385762040494203e-07, "loss": 0.0, "num_input_tokens_seen": 86317200, "step": 128075 }, { "epoch": 3.1290157085969756, "grad_norm": 0.00015159814211074263, "learning_rate": 7.384938930043128e-07, "loss": 0.0, "num_input_tokens_seen": 86320784, "step": 128080 }, { "epoch": 3.129137859428823, "grad_norm": 0.03894127905368805, "learning_rate": 7.384115838609483e-07, "loss": 0.0, "num_input_tokens_seen": 86324240, "step": 128085 }, { "epoch": 3.12926001026067, "grad_norm": 0.0035981256514787674, "learning_rate": 7.38329276619925e-07, "loss": 0.0, "num_input_tokens_seen": 86327632, "step": 128090 }, { "epoch": 3.129382161092517, "grad_norm": 43.11925506591797, "learning_rate": 7.382469712818413e-07, "loss": 0.0439, "num_input_tokens_seen": 86330960, "step": 128095 }, { "epoch": 3.1295043119243644, "grad_norm": 11.641146659851074, "learning_rate": 7.381646678472965e-07, "loss": 0.0949, "num_input_tokens_seen": 86334352, "step": 128100 }, { "epoch": 3.129626462756211, "grad_norm": 0.0009966957150027156, "learning_rate": 7.380823663168882e-07, "loss": 0.0, "num_input_tokens_seen": 86338000, "step": 128105 }, { "epoch": 3.1297486135880583, "grad_norm": 0.0005179878207854927, "learning_rate": 7.380000666912158e-07, "loss": 0.0439, "num_input_tokens_seen": 86341840, "step": 128110 }, { "epoch": 3.1298707644199055, "grad_norm": 0.000786024727858603, "learning_rate": 7.379177689708771e-07, "loss": 0.0, "num_input_tokens_seen": 86345040, "step": 128115 }, { "epoch": 3.1299929152517527, "grad_norm": 1.1530879735946655, "learning_rate": 7.378354731564711e-07, "loss": 0.0408, "num_input_tokens_seen": 86348688, "step": 128120 }, { "epoch": 3.1301150660836, "grad_norm": 16.093496322631836, "learning_rate": 7.377531792485958e-07, "loss": 0.0332, "num_input_tokens_seen": 86351760, "step": 128125 }, { "epoch": 3.130237216915447, "grad_norm": 0.03726727515459061, "learning_rate": 7.376708872478499e-07, "loss": 0.0, "num_input_tokens_seen": 86354832, "step": 128130 }, { "epoch": 3.1303593677472943, "grad_norm": 0.0029666442424058914, "learning_rate": 7.375885971548321e-07, "loss": 0.0003, "num_input_tokens_seen": 86358416, "step": 128135 }, { "epoch": 3.1304815185791415, "grad_norm": 0.002378805074840784, "learning_rate": 7.375063089701405e-07, "loss": 0.0, "num_input_tokens_seen": 86361680, "step": 128140 }, { "epoch": 3.1306036694109887, "grad_norm": 0.12748949229717255, "learning_rate": 7.374240226943737e-07, "loss": 0.0001, "num_input_tokens_seen": 86364880, "step": 128145 }, { "epoch": 3.130725820242836, "grad_norm": 20.794109344482422, "learning_rate": 7.3734173832813e-07, "loss": 0.0008, "num_input_tokens_seen": 86368016, "step": 128150 }, { "epoch": 3.130847971074683, "grad_norm": 0.0004399059107527137, "learning_rate": 7.37259455872008e-07, "loss": 0.0, "num_input_tokens_seen": 86371088, "step": 128155 }, { "epoch": 3.1309701219065302, "grad_norm": 0.00037931985571049154, "learning_rate": 7.371771753266055e-07, "loss": 0.0, "num_input_tokens_seen": 86374608, "step": 128160 }, { "epoch": 3.1310922727383774, "grad_norm": 0.00013678277900908142, "learning_rate": 7.370948966925212e-07, "loss": 0.0, "num_input_tokens_seen": 86378384, "step": 128165 }, { "epoch": 3.1312144235702246, "grad_norm": 0.0024177455343306065, "learning_rate": 7.370126199703541e-07, "loss": 0.0621, "num_input_tokens_seen": 86381264, "step": 128170 }, { "epoch": 3.131336574402072, "grad_norm": 769.744873046875, "learning_rate": 7.369303451607014e-07, "loss": 0.0168, "num_input_tokens_seen": 86384336, "step": 128175 }, { "epoch": 3.131458725233919, "grad_norm": 0.003459363942965865, "learning_rate": 7.368480722641626e-07, "loss": 0.0, "num_input_tokens_seen": 86388112, "step": 128180 }, { "epoch": 3.131580876065766, "grad_norm": 0.0007123707910068333, "learning_rate": 7.367658012813347e-07, "loss": 0.0, "num_input_tokens_seen": 86392016, "step": 128185 }, { "epoch": 3.1317030268976134, "grad_norm": 0.0074523743242025375, "learning_rate": 7.366835322128171e-07, "loss": 0.0, "num_input_tokens_seen": 86395344, "step": 128190 }, { "epoch": 3.1318251777294606, "grad_norm": 0.008593794889748096, "learning_rate": 7.366012650592076e-07, "loss": 0.0001, "num_input_tokens_seen": 86398736, "step": 128195 }, { "epoch": 3.1319473285613073, "grad_norm": 0.07377134263515472, "learning_rate": 7.365189998211046e-07, "loss": 0.0001, "num_input_tokens_seen": 86402384, "step": 128200 }, { "epoch": 3.1320694793931545, "grad_norm": 0.000612644711509347, "learning_rate": 7.364367364991064e-07, "loss": 0.0, "num_input_tokens_seen": 86406032, "step": 128205 }, { "epoch": 3.1321916302250017, "grad_norm": 0.00483801169320941, "learning_rate": 7.363544750938109e-07, "loss": 0.0, "num_input_tokens_seen": 86410128, "step": 128210 }, { "epoch": 3.132313781056849, "grad_norm": 0.00010632204066496342, "learning_rate": 7.362722156058169e-07, "loss": 0.0002, "num_input_tokens_seen": 86413520, "step": 128215 }, { "epoch": 3.132435931888696, "grad_norm": 0.0007703229202888906, "learning_rate": 7.361899580357219e-07, "loss": 0.0, "num_input_tokens_seen": 86416592, "step": 128220 }, { "epoch": 3.1325580827205433, "grad_norm": 0.0008033441845327616, "learning_rate": 7.361077023841244e-07, "loss": 0.0, "num_input_tokens_seen": 86419984, "step": 128225 }, { "epoch": 3.1326802335523904, "grad_norm": 0.0024972441606223583, "learning_rate": 7.360254486516231e-07, "loss": 0.0, "num_input_tokens_seen": 86423568, "step": 128230 }, { "epoch": 3.1328023843842376, "grad_norm": 0.0006891600205563009, "learning_rate": 7.359431968388153e-07, "loss": 0.0001, "num_input_tokens_seen": 86427088, "step": 128235 }, { "epoch": 3.132924535216085, "grad_norm": 0.0033626670483499765, "learning_rate": 7.358609469463e-07, "loss": 0.0, "num_input_tokens_seen": 86430160, "step": 128240 }, { "epoch": 3.133046686047932, "grad_norm": 0.49775242805480957, "learning_rate": 7.357786989746748e-07, "loss": 0.0001, "num_input_tokens_seen": 86433232, "step": 128245 }, { "epoch": 3.133168836879779, "grad_norm": 0.00190130271948874, "learning_rate": 7.356964529245378e-07, "loss": 0.0002, "num_input_tokens_seen": 86436432, "step": 128250 }, { "epoch": 3.1332909877116264, "grad_norm": 0.0011852835305035114, "learning_rate": 7.356142087964876e-07, "loss": 0.0002, "num_input_tokens_seen": 86439888, "step": 128255 }, { "epoch": 3.1334131385434736, "grad_norm": 6.330687756417319e-05, "learning_rate": 7.355319665911217e-07, "loss": 0.0, "num_input_tokens_seen": 86443408, "step": 128260 }, { "epoch": 3.1335352893753208, "grad_norm": 0.00018317383364774287, "learning_rate": 7.354497263090386e-07, "loss": 0.0, "num_input_tokens_seen": 86446608, "step": 128265 }, { "epoch": 3.133657440207168, "grad_norm": 0.001978239743039012, "learning_rate": 7.353674879508363e-07, "loss": 0.0, "num_input_tokens_seen": 86450448, "step": 128270 }, { "epoch": 3.133779591039015, "grad_norm": 0.003964691422879696, "learning_rate": 7.352852515171128e-07, "loss": 0.0001, "num_input_tokens_seen": 86453648, "step": 128275 }, { "epoch": 3.1339017418708623, "grad_norm": 0.015365658327937126, "learning_rate": 7.35203017008466e-07, "loss": 0.0, "num_input_tokens_seen": 86457104, "step": 128280 }, { "epoch": 3.134023892702709, "grad_norm": 0.002852953039109707, "learning_rate": 7.351207844254938e-07, "loss": 0.0, "num_input_tokens_seen": 86460560, "step": 128285 }, { "epoch": 3.1341460435345563, "grad_norm": 0.2430233508348465, "learning_rate": 7.350385537687951e-07, "loss": 0.0001, "num_input_tokens_seen": 86463824, "step": 128290 }, { "epoch": 3.1342681943664035, "grad_norm": 0.0005990845966152847, "learning_rate": 7.349563250389672e-07, "loss": 0.0, "num_input_tokens_seen": 86467088, "step": 128295 }, { "epoch": 3.1343903451982507, "grad_norm": 0.0022348244674503803, "learning_rate": 7.34874098236608e-07, "loss": 0.0002, "num_input_tokens_seen": 86470736, "step": 128300 }, { "epoch": 3.134512496030098, "grad_norm": 0.0038873967714607716, "learning_rate": 7.347918733623157e-07, "loss": 0.0, "num_input_tokens_seen": 86474576, "step": 128305 }, { "epoch": 3.134634646861945, "grad_norm": 0.0013605405110865831, "learning_rate": 7.347096504166885e-07, "loss": 0.0, "num_input_tokens_seen": 86478160, "step": 128310 }, { "epoch": 3.1347567976937922, "grad_norm": 0.02028125710785389, "learning_rate": 7.346274294003237e-07, "loss": 0.0, "num_input_tokens_seen": 86481552, "step": 128315 }, { "epoch": 3.1348789485256394, "grad_norm": 0.00044287380296736956, "learning_rate": 7.345452103138195e-07, "loss": 0.0, "num_input_tokens_seen": 86485200, "step": 128320 }, { "epoch": 3.1350010993574866, "grad_norm": 0.008367608301341534, "learning_rate": 7.344629931577744e-07, "loss": 0.0, "num_input_tokens_seen": 86488592, "step": 128325 }, { "epoch": 3.135123250189334, "grad_norm": 0.044535327702760696, "learning_rate": 7.343807779327855e-07, "loss": 0.0, "num_input_tokens_seen": 86491792, "step": 128330 }, { "epoch": 3.135245401021181, "grad_norm": 0.0006210493156686425, "learning_rate": 7.342985646394513e-07, "loss": 0.0001, "num_input_tokens_seen": 86495888, "step": 128335 }, { "epoch": 3.135367551853028, "grad_norm": 0.0006383709842339158, "learning_rate": 7.342163532783689e-07, "loss": 0.0, "num_input_tokens_seen": 86499024, "step": 128340 }, { "epoch": 3.1354897026848754, "grad_norm": 1.4991077478043735e-05, "learning_rate": 7.341341438501372e-07, "loss": 0.0, "num_input_tokens_seen": 86502608, "step": 128345 }, { "epoch": 3.1356118535167226, "grad_norm": 0.001342342933639884, "learning_rate": 7.340519363553532e-07, "loss": 0.0, "num_input_tokens_seen": 86506128, "step": 128350 }, { "epoch": 3.1357340043485697, "grad_norm": 4.490639184950851e-05, "learning_rate": 7.339697307946152e-07, "loss": 0.0, "num_input_tokens_seen": 86509776, "step": 128355 }, { "epoch": 3.135856155180417, "grad_norm": 0.0005294650327414274, "learning_rate": 7.338875271685211e-07, "loss": 0.0, "num_input_tokens_seen": 86513168, "step": 128360 }, { "epoch": 3.135978306012264, "grad_norm": 0.0015506184427067637, "learning_rate": 7.33805325477668e-07, "loss": 0.0001, "num_input_tokens_seen": 86516560, "step": 128365 }, { "epoch": 3.136100456844111, "grad_norm": 0.0003006265906151384, "learning_rate": 7.337231257226546e-07, "loss": 0.0005, "num_input_tokens_seen": 86520080, "step": 128370 }, { "epoch": 3.1362226076759585, "grad_norm": 0.026055965572595596, "learning_rate": 7.336409279040778e-07, "loss": 0.0, "num_input_tokens_seen": 86523536, "step": 128375 }, { "epoch": 3.1363447585078053, "grad_norm": 0.000478060421301052, "learning_rate": 7.335587320225359e-07, "loss": 0.0, "num_input_tokens_seen": 86526864, "step": 128380 }, { "epoch": 3.1364669093396524, "grad_norm": 0.5377205014228821, "learning_rate": 7.33476538078627e-07, "loss": 0.0001, "num_input_tokens_seen": 86530384, "step": 128385 }, { "epoch": 3.1365890601714996, "grad_norm": 0.00033786255517043173, "learning_rate": 7.333943460729479e-07, "loss": 0.0, "num_input_tokens_seen": 86533776, "step": 128390 }, { "epoch": 3.136711211003347, "grad_norm": 0.011143106035888195, "learning_rate": 7.333121560060973e-07, "loss": 0.0, "num_input_tokens_seen": 86537424, "step": 128395 }, { "epoch": 3.136833361835194, "grad_norm": 0.00022679645917378366, "learning_rate": 7.332299678786722e-07, "loss": 0.0, "num_input_tokens_seen": 86541520, "step": 128400 }, { "epoch": 3.136955512667041, "grad_norm": 0.00040304975118488073, "learning_rate": 7.331477816912703e-07, "loss": 0.0329, "num_input_tokens_seen": 86544464, "step": 128405 }, { "epoch": 3.1370776634988884, "grad_norm": 0.0011125191813334823, "learning_rate": 7.330655974444899e-07, "loss": 0.0, "num_input_tokens_seen": 86547664, "step": 128410 }, { "epoch": 3.1371998143307356, "grad_norm": 0.0013933493755757809, "learning_rate": 7.329834151389278e-07, "loss": 0.0, "num_input_tokens_seen": 86550544, "step": 128415 }, { "epoch": 3.1373219651625828, "grad_norm": 0.0029185281600803137, "learning_rate": 7.329012347751827e-07, "loss": 0.0406, "num_input_tokens_seen": 86553872, "step": 128420 }, { "epoch": 3.13744411599443, "grad_norm": 0.00033394224010407925, "learning_rate": 7.328190563538512e-07, "loss": 0.0, "num_input_tokens_seen": 86557072, "step": 128425 }, { "epoch": 3.137566266826277, "grad_norm": 0.004945480264723301, "learning_rate": 7.327368798755318e-07, "loss": 0.0, "num_input_tokens_seen": 86560656, "step": 128430 }, { "epoch": 3.1376884176581243, "grad_norm": 0.033799320459365845, "learning_rate": 7.326547053408212e-07, "loss": 0.0, "num_input_tokens_seen": 86564368, "step": 128435 }, { "epoch": 3.1378105684899715, "grad_norm": 0.008554290048778057, "learning_rate": 7.325725327503175e-07, "loss": 0.0001, "num_input_tokens_seen": 86567312, "step": 128440 }, { "epoch": 3.1379327193218187, "grad_norm": 0.0007735177641734481, "learning_rate": 7.324903621046188e-07, "loss": 0.0, "num_input_tokens_seen": 86570512, "step": 128445 }, { "epoch": 3.138054870153666, "grad_norm": 0.0013030408881604671, "learning_rate": 7.324081934043218e-07, "loss": 0.0, "num_input_tokens_seen": 86574992, "step": 128450 }, { "epoch": 3.138177020985513, "grad_norm": 0.0024377258960157633, "learning_rate": 7.323260266500244e-07, "loss": 0.0479, "num_input_tokens_seen": 86578384, "step": 128455 }, { "epoch": 3.1382991718173603, "grad_norm": 0.005287248641252518, "learning_rate": 7.322438618423241e-07, "loss": 0.0, "num_input_tokens_seen": 86583760, "step": 128460 }, { "epoch": 3.138421322649207, "grad_norm": 10.693768501281738, "learning_rate": 7.321616989818189e-07, "loss": 0.0339, "num_input_tokens_seen": 86587600, "step": 128465 }, { "epoch": 3.1385434734810542, "grad_norm": 0.0022159749642014503, "learning_rate": 7.320795380691051e-07, "loss": 0.0587, "num_input_tokens_seen": 86590992, "step": 128470 }, { "epoch": 3.1386656243129014, "grad_norm": 0.04777996987104416, "learning_rate": 7.319973791047813e-07, "loss": 0.0, "num_input_tokens_seen": 86594320, "step": 128475 }, { "epoch": 3.1387877751447486, "grad_norm": 0.12414655089378357, "learning_rate": 7.319152220894449e-07, "loss": 0.0, "num_input_tokens_seen": 86597584, "step": 128480 }, { "epoch": 3.138909925976596, "grad_norm": 0.0007596567156724632, "learning_rate": 7.318330670236927e-07, "loss": 0.0255, "num_input_tokens_seen": 86601488, "step": 128485 }, { "epoch": 3.139032076808443, "grad_norm": 0.7451971173286438, "learning_rate": 7.31750913908123e-07, "loss": 0.0696, "num_input_tokens_seen": 86604880, "step": 128490 }, { "epoch": 3.13915422764029, "grad_norm": 717.7139282226562, "learning_rate": 7.316687627433323e-07, "loss": 0.0703, "num_input_tokens_seen": 86608592, "step": 128495 }, { "epoch": 3.1392763784721374, "grad_norm": 0.001274026115424931, "learning_rate": 7.315866135299189e-07, "loss": 0.0002, "num_input_tokens_seen": 86612112, "step": 128500 }, { "epoch": 3.1393985293039846, "grad_norm": 0.005512653850018978, "learning_rate": 7.315044662684797e-07, "loss": 0.0, "num_input_tokens_seen": 86615184, "step": 128505 }, { "epoch": 3.1395206801358317, "grad_norm": 0.45431196689605713, "learning_rate": 7.314223209596122e-07, "loss": 0.0003, "num_input_tokens_seen": 86618384, "step": 128510 }, { "epoch": 3.139642830967679, "grad_norm": 0.3715175688266754, "learning_rate": 7.313401776039142e-07, "loss": 0.0001, "num_input_tokens_seen": 86621648, "step": 128515 }, { "epoch": 3.139764981799526, "grad_norm": 1.0766215324401855, "learning_rate": 7.312580362019822e-07, "loss": 0.0003, "num_input_tokens_seen": 86624912, "step": 128520 }, { "epoch": 3.1398871326313733, "grad_norm": 0.00018668769916985184, "learning_rate": 7.311758967544143e-07, "loss": 0.0, "num_input_tokens_seen": 86628112, "step": 128525 }, { "epoch": 3.1400092834632205, "grad_norm": 1.5411447748192586e-05, "learning_rate": 7.310937592618074e-07, "loss": 0.0716, "num_input_tokens_seen": 86631440, "step": 128530 }, { "epoch": 3.1401314342950677, "grad_norm": 0.04406026378273964, "learning_rate": 7.31011623724759e-07, "loss": 0.0, "num_input_tokens_seen": 86635088, "step": 128535 }, { "epoch": 3.140253585126915, "grad_norm": 0.25144264101982117, "learning_rate": 7.309294901438667e-07, "loss": 0.0001, "num_input_tokens_seen": 86638480, "step": 128540 }, { "epoch": 3.140375735958762, "grad_norm": 0.003910064697265625, "learning_rate": 7.308473585197272e-07, "loss": 0.0, "num_input_tokens_seen": 86641936, "step": 128545 }, { "epoch": 3.140497886790609, "grad_norm": 0.008174985647201538, "learning_rate": 7.307652288529385e-07, "loss": 0.0, "num_input_tokens_seen": 86644880, "step": 128550 }, { "epoch": 3.140620037622456, "grad_norm": 0.0032011514995247126, "learning_rate": 7.306831011440971e-07, "loss": 0.0, "num_input_tokens_seen": 86649168, "step": 128555 }, { "epoch": 3.140742188454303, "grad_norm": 0.0005409402656368911, "learning_rate": 7.30600975393801e-07, "loss": 0.0, "num_input_tokens_seen": 86652432, "step": 128560 }, { "epoch": 3.1408643392861504, "grad_norm": 0.0008832211024127901, "learning_rate": 7.305188516026468e-07, "loss": 0.0822, "num_input_tokens_seen": 86655696, "step": 128565 }, { "epoch": 3.1409864901179976, "grad_norm": 0.004884419031441212, "learning_rate": 7.304367297712318e-07, "loss": 0.0048, "num_input_tokens_seen": 86659088, "step": 128570 }, { "epoch": 3.1411086409498448, "grad_norm": 0.004036621190607548, "learning_rate": 7.303546099001539e-07, "loss": 0.0, "num_input_tokens_seen": 86663056, "step": 128575 }, { "epoch": 3.141230791781692, "grad_norm": 0.0024037614930421114, "learning_rate": 7.302724919900093e-07, "loss": 0.0, "num_input_tokens_seen": 86666128, "step": 128580 }, { "epoch": 3.141352942613539, "grad_norm": 0.022274015471339226, "learning_rate": 7.301903760413961e-07, "loss": 0.036, "num_input_tokens_seen": 86669776, "step": 128585 }, { "epoch": 3.1414750934453863, "grad_norm": 0.001364888739772141, "learning_rate": 7.301082620549107e-07, "loss": 0.0002, "num_input_tokens_seen": 86672656, "step": 128590 }, { "epoch": 3.1415972442772335, "grad_norm": 0.00043467117939144373, "learning_rate": 7.300261500311507e-07, "loss": 0.0541, "num_input_tokens_seen": 86675984, "step": 128595 }, { "epoch": 3.1417193951090807, "grad_norm": 0.003563940990716219, "learning_rate": 7.299440399707133e-07, "loss": 0.0001, "num_input_tokens_seen": 86679376, "step": 128600 }, { "epoch": 3.141841545940928, "grad_norm": 0.04951038211584091, "learning_rate": 7.298619318741955e-07, "loss": 0.0, "num_input_tokens_seen": 86682576, "step": 128605 }, { "epoch": 3.141963696772775, "grad_norm": 0.00022587290732190013, "learning_rate": 7.297798257421944e-07, "loss": 0.0, "num_input_tokens_seen": 86685712, "step": 128610 }, { "epoch": 3.1420858476046223, "grad_norm": 0.003272018162533641, "learning_rate": 7.296977215753069e-07, "loss": 0.0, "num_input_tokens_seen": 86688528, "step": 128615 }, { "epoch": 3.1422079984364695, "grad_norm": 0.005255506839603186, "learning_rate": 7.296156193741305e-07, "loss": 0.0, "num_input_tokens_seen": 86691856, "step": 128620 }, { "epoch": 3.1423301492683167, "grad_norm": 3.095484134973958e-05, "learning_rate": 7.295335191392617e-07, "loss": 0.0, "num_input_tokens_seen": 86694928, "step": 128625 }, { "epoch": 3.142452300100164, "grad_norm": 0.05205165222287178, "learning_rate": 7.294514208712979e-07, "loss": 0.0004, "num_input_tokens_seen": 86698320, "step": 128630 }, { "epoch": 3.142574450932011, "grad_norm": 0.08933182805776596, "learning_rate": 7.293693245708365e-07, "loss": 0.0001, "num_input_tokens_seen": 86701904, "step": 128635 }, { "epoch": 3.1426966017638582, "grad_norm": 0.0016661847475916147, "learning_rate": 7.29287230238474e-07, "loss": 0.0546, "num_input_tokens_seen": 86705680, "step": 128640 }, { "epoch": 3.142818752595705, "grad_norm": 0.0012546000070869923, "learning_rate": 7.292051378748076e-07, "loss": 0.0, "num_input_tokens_seen": 86709072, "step": 128645 }, { "epoch": 3.142940903427552, "grad_norm": 0.0005776460748165846, "learning_rate": 7.291230474804342e-07, "loss": 0.0, "num_input_tokens_seen": 86712336, "step": 128650 }, { "epoch": 3.1430630542593994, "grad_norm": 0.11688629537820816, "learning_rate": 7.290409590559508e-07, "loss": 0.0458, "num_input_tokens_seen": 86715216, "step": 128655 }, { "epoch": 3.1431852050912465, "grad_norm": 0.00021849303448107094, "learning_rate": 7.289588726019547e-07, "loss": 0.0, "num_input_tokens_seen": 86718672, "step": 128660 }, { "epoch": 3.1433073559230937, "grad_norm": 0.006894854828715324, "learning_rate": 7.288767881190423e-07, "loss": 0.0, "num_input_tokens_seen": 86721872, "step": 128665 }, { "epoch": 3.143429506754941, "grad_norm": 0.0018162491032853723, "learning_rate": 7.287947056078112e-07, "loss": 0.1059, "num_input_tokens_seen": 86725392, "step": 128670 }, { "epoch": 3.143551657586788, "grad_norm": 0.010568572208285332, "learning_rate": 7.287126250688575e-07, "loss": 0.0001, "num_input_tokens_seen": 86728464, "step": 128675 }, { "epoch": 3.1436738084186353, "grad_norm": 0.0048435525968670845, "learning_rate": 7.286305465027789e-07, "loss": 0.0, "num_input_tokens_seen": 86731536, "step": 128680 }, { "epoch": 3.1437959592504825, "grad_norm": 0.0015782549744471908, "learning_rate": 7.285484699101716e-07, "loss": 0.0, "num_input_tokens_seen": 86734736, "step": 128685 }, { "epoch": 3.1439181100823297, "grad_norm": 2663.33740234375, "learning_rate": 7.284663952916328e-07, "loss": 0.0305, "num_input_tokens_seen": 86738128, "step": 128690 }, { "epoch": 3.144040260914177, "grad_norm": 0.007821955718100071, "learning_rate": 7.283843226477598e-07, "loss": 0.0001, "num_input_tokens_seen": 86741328, "step": 128695 }, { "epoch": 3.144162411746024, "grad_norm": 0.030453966930508614, "learning_rate": 7.283022519791487e-07, "loss": 0.0, "num_input_tokens_seen": 86744720, "step": 128700 }, { "epoch": 3.1442845625778713, "grad_norm": 0.015875792130827904, "learning_rate": 7.282201832863972e-07, "loss": 0.0, "num_input_tokens_seen": 86748688, "step": 128705 }, { "epoch": 3.1444067134097184, "grad_norm": 0.20353585481643677, "learning_rate": 7.281381165701011e-07, "loss": 0.0001, "num_input_tokens_seen": 86752144, "step": 128710 }, { "epoch": 3.1445288642415656, "grad_norm": 0.0022369814105331898, "learning_rate": 7.280560518308582e-07, "loss": 0.0, "num_input_tokens_seen": 86755600, "step": 128715 }, { "epoch": 3.144651015073413, "grad_norm": 0.0004540746449492872, "learning_rate": 7.279739890692646e-07, "loss": 0.0002, "num_input_tokens_seen": 86758864, "step": 128720 }, { "epoch": 3.14477316590526, "grad_norm": 0.007710547186434269, "learning_rate": 7.27891928285917e-07, "loss": 0.0, "num_input_tokens_seen": 86762448, "step": 128725 }, { "epoch": 3.1448953167371068, "grad_norm": 0.00036782852839678526, "learning_rate": 7.278098694814131e-07, "loss": 0.0343, "num_input_tokens_seen": 86765840, "step": 128730 }, { "epoch": 3.145017467568954, "grad_norm": 0.3705562949180603, "learning_rate": 7.277278126563485e-07, "loss": 0.0003, "num_input_tokens_seen": 86769296, "step": 128735 }, { "epoch": 3.145139618400801, "grad_norm": 0.0008896345389075577, "learning_rate": 7.27645757811321e-07, "loss": 0.0, "num_input_tokens_seen": 86772560, "step": 128740 }, { "epoch": 3.1452617692326483, "grad_norm": 0.005860117729753256, "learning_rate": 7.275637049469263e-07, "loss": 0.0, "num_input_tokens_seen": 86776080, "step": 128745 }, { "epoch": 3.1453839200644955, "grad_norm": 9.167318057734519e-05, "learning_rate": 7.274816540637616e-07, "loss": 0.0, "num_input_tokens_seen": 86779088, "step": 128750 }, { "epoch": 3.1455060708963427, "grad_norm": 0.00035717521677725017, "learning_rate": 7.27399605162424e-07, "loss": 0.036, "num_input_tokens_seen": 86782544, "step": 128755 }, { "epoch": 3.14562822172819, "grad_norm": 0.003054143860936165, "learning_rate": 7.273175582435098e-07, "loss": 0.0, "num_input_tokens_seen": 86785680, "step": 128760 }, { "epoch": 3.145750372560037, "grad_norm": 0.004068409558385611, "learning_rate": 7.272355133076154e-07, "loss": 0.0, "num_input_tokens_seen": 86789136, "step": 128765 }, { "epoch": 3.1458725233918843, "grad_norm": 0.008344685658812523, "learning_rate": 7.271534703553379e-07, "loss": 0.0, "num_input_tokens_seen": 86792656, "step": 128770 }, { "epoch": 3.1459946742237315, "grad_norm": 0.00015507386706303805, "learning_rate": 7.270714293872738e-07, "loss": 0.0003, "num_input_tokens_seen": 86795792, "step": 128775 }, { "epoch": 3.1461168250555787, "grad_norm": 0.024332698434591293, "learning_rate": 7.269893904040194e-07, "loss": 0.0, "num_input_tokens_seen": 86799696, "step": 128780 }, { "epoch": 3.146238975887426, "grad_norm": 0.000505484058521688, "learning_rate": 7.269073534061715e-07, "loss": 0.0001, "num_input_tokens_seen": 86803216, "step": 128785 }, { "epoch": 3.146361126719273, "grad_norm": 0.005547806154936552, "learning_rate": 7.268253183943271e-07, "loss": 0.0, "num_input_tokens_seen": 86806224, "step": 128790 }, { "epoch": 3.1464832775511202, "grad_norm": 0.001450459472835064, "learning_rate": 7.267432853690823e-07, "loss": 0.0477, "num_input_tokens_seen": 86809168, "step": 128795 }, { "epoch": 3.1466054283829674, "grad_norm": 0.0012143428903073072, "learning_rate": 7.266612543310339e-07, "loss": 0.0, "num_input_tokens_seen": 86812688, "step": 128800 }, { "epoch": 3.1467275792148146, "grad_norm": 382.2807312011719, "learning_rate": 7.265792252807783e-07, "loss": 0.0772, "num_input_tokens_seen": 86815696, "step": 128805 }, { "epoch": 3.146849730046662, "grad_norm": 2.9038077627774328e-05, "learning_rate": 7.264971982189122e-07, "loss": 0.0288, "num_input_tokens_seen": 86819344, "step": 128810 }, { "epoch": 3.146971880878509, "grad_norm": 8.047770825214684e-05, "learning_rate": 7.26415173146032e-07, "loss": 0.0001, "num_input_tokens_seen": 86822928, "step": 128815 }, { "epoch": 3.147094031710356, "grad_norm": 0.0018185937078669667, "learning_rate": 7.263331500627343e-07, "loss": 0.0, "num_input_tokens_seen": 86826192, "step": 128820 }, { "epoch": 3.147216182542203, "grad_norm": 0.0006772976485081017, "learning_rate": 7.262511289696158e-07, "loss": 0.0, "num_input_tokens_seen": 86829264, "step": 128825 }, { "epoch": 3.14733833337405, "grad_norm": 0.001433266093954444, "learning_rate": 7.261691098672722e-07, "loss": 0.0, "num_input_tokens_seen": 86833424, "step": 128830 }, { "epoch": 3.1474604842058973, "grad_norm": 0.000816820771433413, "learning_rate": 7.260870927563009e-07, "loss": 0.0, "num_input_tokens_seen": 86836432, "step": 128835 }, { "epoch": 3.1475826350377445, "grad_norm": 0.000719138071872294, "learning_rate": 7.260050776372974e-07, "loss": 0.0, "num_input_tokens_seen": 86839952, "step": 128840 }, { "epoch": 3.1477047858695917, "grad_norm": 0.0024840179830789566, "learning_rate": 7.259230645108589e-07, "loss": 0.0003, "num_input_tokens_seen": 86843344, "step": 128845 }, { "epoch": 3.147826936701439, "grad_norm": 0.04995972290635109, "learning_rate": 7.258410533775819e-07, "loss": 0.0, "num_input_tokens_seen": 86846608, "step": 128850 }, { "epoch": 3.147949087533286, "grad_norm": 0.0003871772496495396, "learning_rate": 7.257590442380621e-07, "loss": 0.0, "num_input_tokens_seen": 86849808, "step": 128855 }, { "epoch": 3.1480712383651333, "grad_norm": 0.0005167672061361372, "learning_rate": 7.256770370928968e-07, "loss": 0.0, "num_input_tokens_seen": 86853264, "step": 128860 }, { "epoch": 3.1481933891969804, "grad_norm": 0.002442282158881426, "learning_rate": 7.255950319426814e-07, "loss": 0.0, "num_input_tokens_seen": 86857040, "step": 128865 }, { "epoch": 3.1483155400288276, "grad_norm": 0.00010521677177166566, "learning_rate": 7.25513028788013e-07, "loss": 0.0, "num_input_tokens_seen": 86860752, "step": 128870 }, { "epoch": 3.148437690860675, "grad_norm": 27.28692626953125, "learning_rate": 7.254310276294876e-07, "loss": 0.0538, "num_input_tokens_seen": 86863824, "step": 128875 }, { "epoch": 3.148559841692522, "grad_norm": 0.005157478619366884, "learning_rate": 7.253490284677015e-07, "loss": 0.0, "num_input_tokens_seen": 86866896, "step": 128880 }, { "epoch": 3.148681992524369, "grad_norm": 0.1106048971414566, "learning_rate": 7.252670313032514e-07, "loss": 0.0003, "num_input_tokens_seen": 86870224, "step": 128885 }, { "epoch": 3.1488041433562164, "grad_norm": 0.0023445556871593, "learning_rate": 7.251850361367329e-07, "loss": 0.0, "num_input_tokens_seen": 86873552, "step": 128890 }, { "epoch": 3.1489262941880636, "grad_norm": 0.004360957071185112, "learning_rate": 7.251030429687433e-07, "loss": 0.0, "num_input_tokens_seen": 86876816, "step": 128895 }, { "epoch": 3.1490484450199108, "grad_norm": 0.010129369795322418, "learning_rate": 7.250210517998778e-07, "loss": 0.0, "num_input_tokens_seen": 86880144, "step": 128900 }, { "epoch": 3.149170595851758, "grad_norm": 0.0009373872308060527, "learning_rate": 7.249390626307332e-07, "loss": 0.0001, "num_input_tokens_seen": 86883664, "step": 128905 }, { "epoch": 3.1492927466836047, "grad_norm": 0.0001356559369014576, "learning_rate": 7.248570754619061e-07, "loss": 0.0013, "num_input_tokens_seen": 86886736, "step": 128910 }, { "epoch": 3.149414897515452, "grad_norm": 0.00227438029833138, "learning_rate": 7.247750902939922e-07, "loss": 0.0, "num_input_tokens_seen": 86889808, "step": 128915 }, { "epoch": 3.149537048347299, "grad_norm": 0.0019287688191980124, "learning_rate": 7.246931071275879e-07, "loss": 0.0, "num_input_tokens_seen": 86893072, "step": 128920 }, { "epoch": 3.1496591991791463, "grad_norm": 0.000745182391256094, "learning_rate": 7.246111259632892e-07, "loss": 0.0001, "num_input_tokens_seen": 86896208, "step": 128925 }, { "epoch": 3.1497813500109935, "grad_norm": 0.0013668572064489126, "learning_rate": 7.245291468016928e-07, "loss": 0.0, "num_input_tokens_seen": 86899536, "step": 128930 }, { "epoch": 3.1499035008428407, "grad_norm": 0.0015289505245164037, "learning_rate": 7.244471696433943e-07, "loss": 0.0, "num_input_tokens_seen": 86902672, "step": 128935 }, { "epoch": 3.150025651674688, "grad_norm": 0.005079837050288916, "learning_rate": 7.243651944889897e-07, "loss": 0.0, "num_input_tokens_seen": 86905808, "step": 128940 }, { "epoch": 3.150147802506535, "grad_norm": 0.0121584078297019, "learning_rate": 7.242832213390763e-07, "loss": 0.0, "num_input_tokens_seen": 86909072, "step": 128945 }, { "epoch": 3.1502699533383822, "grad_norm": 4.289217758923769e-05, "learning_rate": 7.24201250194249e-07, "loss": 0.0001, "num_input_tokens_seen": 86912208, "step": 128950 }, { "epoch": 3.1503921041702294, "grad_norm": 0.0025455676950514317, "learning_rate": 7.241192810551047e-07, "loss": 0.0574, "num_input_tokens_seen": 86915280, "step": 128955 }, { "epoch": 3.1505142550020766, "grad_norm": 0.000352776434738189, "learning_rate": 7.240373139222387e-07, "loss": 0.0, "num_input_tokens_seen": 86918864, "step": 128960 }, { "epoch": 3.150636405833924, "grad_norm": 0.5395686030387878, "learning_rate": 7.239553487962479e-07, "loss": 0.0001, "num_input_tokens_seen": 86922000, "step": 128965 }, { "epoch": 3.150758556665771, "grad_norm": 0.0004787775978911668, "learning_rate": 7.238733856777281e-07, "loss": 0.0005, "num_input_tokens_seen": 86925264, "step": 128970 }, { "epoch": 3.150880707497618, "grad_norm": 0.0027044913731515408, "learning_rate": 7.237914245672752e-07, "loss": 0.0, "num_input_tokens_seen": 86928784, "step": 128975 }, { "epoch": 3.1510028583294654, "grad_norm": 0.001796882483176887, "learning_rate": 7.237094654654857e-07, "loss": 0.0, "num_input_tokens_seen": 86932240, "step": 128980 }, { "epoch": 3.1511250091613126, "grad_norm": 0.0006195952300913632, "learning_rate": 7.236275083729546e-07, "loss": 0.0, "num_input_tokens_seen": 86935440, "step": 128985 }, { "epoch": 3.1512471599931597, "grad_norm": 0.004765995312482119, "learning_rate": 7.235455532902793e-07, "loss": 0.0, "num_input_tokens_seen": 86938704, "step": 128990 }, { "epoch": 3.1513693108250065, "grad_norm": 0.01532856933772564, "learning_rate": 7.234636002180545e-07, "loss": 0.0, "num_input_tokens_seen": 86942160, "step": 128995 }, { "epoch": 3.1514914616568537, "grad_norm": 0.0055182985961437225, "learning_rate": 7.233816491568768e-07, "loss": 0.0001, "num_input_tokens_seen": 86945872, "step": 129000 }, { "epoch": 3.151613612488701, "grad_norm": 0.003385876538231969, "learning_rate": 7.232997001073427e-07, "loss": 0.0, "num_input_tokens_seen": 86948944, "step": 129005 }, { "epoch": 3.151735763320548, "grad_norm": 0.07067646086215973, "learning_rate": 7.23217753070047e-07, "loss": 0.0, "num_input_tokens_seen": 86952080, "step": 129010 }, { "epoch": 3.1518579141523952, "grad_norm": 0.0036404673010110855, "learning_rate": 7.231358080455868e-07, "loss": 0.0002, "num_input_tokens_seen": 86955088, "step": 129015 }, { "epoch": 3.1519800649842424, "grad_norm": 0.002148119965568185, "learning_rate": 7.23053865034557e-07, "loss": 0.0348, "num_input_tokens_seen": 86958288, "step": 129020 }, { "epoch": 3.1521022158160896, "grad_norm": 0.0029477067291736603, "learning_rate": 7.229719240375545e-07, "loss": 0.0, "num_input_tokens_seen": 86961744, "step": 129025 }, { "epoch": 3.152224366647937, "grad_norm": 0.0029479314107447863, "learning_rate": 7.228899850551743e-07, "loss": 0.0, "num_input_tokens_seen": 86965264, "step": 129030 }, { "epoch": 3.152346517479784, "grad_norm": 0.0006279372028075159, "learning_rate": 7.228080480880125e-07, "loss": 0.0, "num_input_tokens_seen": 86968336, "step": 129035 }, { "epoch": 3.152468668311631, "grad_norm": 0.0009111549588851631, "learning_rate": 7.227261131366655e-07, "loss": 0.0, "num_input_tokens_seen": 86971984, "step": 129040 }, { "epoch": 3.1525908191434784, "grad_norm": 0.011269640177488327, "learning_rate": 7.226441802017286e-07, "loss": 0.0, "num_input_tokens_seen": 86975120, "step": 129045 }, { "epoch": 3.1527129699753256, "grad_norm": 0.01214879471808672, "learning_rate": 7.22562249283798e-07, "loss": 0.0, "num_input_tokens_seen": 86978576, "step": 129050 }, { "epoch": 3.1528351208071728, "grad_norm": 0.20093731582164764, "learning_rate": 7.224803203834691e-07, "loss": 0.0001, "num_input_tokens_seen": 86981648, "step": 129055 }, { "epoch": 3.15295727163902, "grad_norm": 0.0008670971728861332, "learning_rate": 7.223983935013378e-07, "loss": 0.0416, "num_input_tokens_seen": 86984912, "step": 129060 }, { "epoch": 3.153079422470867, "grad_norm": 0.017721112817525864, "learning_rate": 7.223164686380005e-07, "loss": 0.0001, "num_input_tokens_seen": 86988240, "step": 129065 }, { "epoch": 3.1532015733027143, "grad_norm": 0.13180363178253174, "learning_rate": 7.222345457940524e-07, "loss": 0.1514, "num_input_tokens_seen": 86991248, "step": 129070 }, { "epoch": 3.1533237241345615, "grad_norm": 0.03970232978463173, "learning_rate": 7.221526249700893e-07, "loss": 0.0, "num_input_tokens_seen": 86994896, "step": 129075 }, { "epoch": 3.1534458749664087, "grad_norm": 0.1309601068496704, "learning_rate": 7.220707061667072e-07, "loss": 0.0464, "num_input_tokens_seen": 86998160, "step": 129080 }, { "epoch": 3.153568025798256, "grad_norm": 39.45444869995117, "learning_rate": 7.219887893845018e-07, "loss": 0.0489, "num_input_tokens_seen": 87001680, "step": 129085 }, { "epoch": 3.1536901766301026, "grad_norm": 0.00731156300753355, "learning_rate": 7.219068746240682e-07, "loss": 0.0688, "num_input_tokens_seen": 87005200, "step": 129090 }, { "epoch": 3.15381232746195, "grad_norm": 0.02709462121129036, "learning_rate": 7.218249618860026e-07, "loss": 0.0005, "num_input_tokens_seen": 87008720, "step": 129095 }, { "epoch": 3.153934478293797, "grad_norm": 0.0009111324907280505, "learning_rate": 7.217430511709013e-07, "loss": 0.0, "num_input_tokens_seen": 87012560, "step": 129100 }, { "epoch": 3.154056629125644, "grad_norm": 0.0053654201328754425, "learning_rate": 7.216611424793588e-07, "loss": 0.0, "num_input_tokens_seen": 87015504, "step": 129105 }, { "epoch": 3.1541787799574914, "grad_norm": 8.82080930750817e-05, "learning_rate": 7.215792358119718e-07, "loss": 0.0, "num_input_tokens_seen": 87018704, "step": 129110 }, { "epoch": 3.1543009307893386, "grad_norm": 0.002231209073215723, "learning_rate": 7.21497331169335e-07, "loss": 0.0, "num_input_tokens_seen": 87022288, "step": 129115 }, { "epoch": 3.154423081621186, "grad_norm": 0.00520332483574748, "learning_rate": 7.214154285520451e-07, "loss": 0.0, "num_input_tokens_seen": 87025872, "step": 129120 }, { "epoch": 3.154545232453033, "grad_norm": 497.58929443359375, "learning_rate": 7.213335279606965e-07, "loss": 0.0131, "num_input_tokens_seen": 87029456, "step": 129125 }, { "epoch": 3.15466738328488, "grad_norm": 0.0042228191159665585, "learning_rate": 7.212516293958857e-07, "loss": 0.0001, "num_input_tokens_seen": 87033104, "step": 129130 }, { "epoch": 3.1547895341167274, "grad_norm": 0.000230711157200858, "learning_rate": 7.211697328582082e-07, "loss": 0.0002, "num_input_tokens_seen": 87036368, "step": 129135 }, { "epoch": 3.1549116849485745, "grad_norm": 0.05502479523420334, "learning_rate": 7.210878383482593e-07, "loss": 0.0001, "num_input_tokens_seen": 87039568, "step": 129140 }, { "epoch": 3.1550338357804217, "grad_norm": 0.11637943238019943, "learning_rate": 7.210059458666348e-07, "loss": 0.0, "num_input_tokens_seen": 87042704, "step": 129145 }, { "epoch": 3.155155986612269, "grad_norm": 0.002879110863432288, "learning_rate": 7.209240554139296e-07, "loss": 0.1131, "num_input_tokens_seen": 87046416, "step": 129150 }, { "epoch": 3.155278137444116, "grad_norm": 0.0007435963489115238, "learning_rate": 7.208421669907398e-07, "loss": 0.0002, "num_input_tokens_seen": 87049872, "step": 129155 }, { "epoch": 3.1554002882759633, "grad_norm": 0.23474089801311493, "learning_rate": 7.207602805976613e-07, "loss": 0.0001, "num_input_tokens_seen": 87053200, "step": 129160 }, { "epoch": 3.1555224391078105, "grad_norm": 0.09376419335603714, "learning_rate": 7.206783962352889e-07, "loss": 0.0, "num_input_tokens_seen": 87057936, "step": 129165 }, { "epoch": 3.1556445899396577, "grad_norm": 0.0002012960467254743, "learning_rate": 7.205965139042186e-07, "loss": 0.0, "num_input_tokens_seen": 87061520, "step": 129170 }, { "epoch": 3.1557667407715044, "grad_norm": 0.00446065841242671, "learning_rate": 7.205146336050451e-07, "loss": 0.0, "num_input_tokens_seen": 87064848, "step": 129175 }, { "epoch": 3.1558888916033516, "grad_norm": 0.0006037917919456959, "learning_rate": 7.204327553383649e-07, "loss": 0.0501, "num_input_tokens_seen": 87067984, "step": 129180 }, { "epoch": 3.156011042435199, "grad_norm": 0.0025712205097079277, "learning_rate": 7.203508791047727e-07, "loss": 0.0055, "num_input_tokens_seen": 87070928, "step": 129185 }, { "epoch": 3.156133193267046, "grad_norm": 0.0012938089203089476, "learning_rate": 7.202690049048638e-07, "loss": 0.0003, "num_input_tokens_seen": 87074128, "step": 129190 }, { "epoch": 3.156255344098893, "grad_norm": 0.010054078884422779, "learning_rate": 7.201871327392344e-07, "loss": 0.0, "num_input_tokens_seen": 87077392, "step": 129195 }, { "epoch": 3.1563774949307404, "grad_norm": 0.0007030934211798012, "learning_rate": 7.201052626084792e-07, "loss": 0.0004, "num_input_tokens_seen": 87080592, "step": 129200 }, { "epoch": 3.1564996457625876, "grad_norm": 8.724342478672042e-05, "learning_rate": 7.200233945131939e-07, "loss": 0.0, "num_input_tokens_seen": 87083856, "step": 129205 }, { "epoch": 3.1566217965944348, "grad_norm": 0.0009425784228369594, "learning_rate": 7.199415284539736e-07, "loss": 0.0, "num_input_tokens_seen": 87087056, "step": 129210 }, { "epoch": 3.156743947426282, "grad_norm": 0.00022201616957318038, "learning_rate": 7.198596644314137e-07, "loss": 0.0001, "num_input_tokens_seen": 87090256, "step": 129215 }, { "epoch": 3.156866098258129, "grad_norm": 299.904052734375, "learning_rate": 7.1977780244611e-07, "loss": 0.043, "num_input_tokens_seen": 87093520, "step": 129220 }, { "epoch": 3.1569882490899763, "grad_norm": 0.398426353931427, "learning_rate": 7.196959424986575e-07, "loss": 0.0001, "num_input_tokens_seen": 87096656, "step": 129225 }, { "epoch": 3.1571103999218235, "grad_norm": 0.00035814818693324924, "learning_rate": 7.196140845896514e-07, "loss": 0.0, "num_input_tokens_seen": 87100048, "step": 129230 }, { "epoch": 3.1572325507536707, "grad_norm": 0.0038671051152050495, "learning_rate": 7.195322287196872e-07, "loss": 0.0, "num_input_tokens_seen": 87103824, "step": 129235 }, { "epoch": 3.157354701585518, "grad_norm": 4.015643571619876e-05, "learning_rate": 7.194503748893601e-07, "loss": 0.0003, "num_input_tokens_seen": 87107344, "step": 129240 }, { "epoch": 3.157476852417365, "grad_norm": 6.89021180733107e-06, "learning_rate": 7.193685230992651e-07, "loss": 0.0, "num_input_tokens_seen": 87110736, "step": 129245 }, { "epoch": 3.1575990032492123, "grad_norm": 0.0034669388551265, "learning_rate": 7.192866733499976e-07, "loss": 0.0002, "num_input_tokens_seen": 87113936, "step": 129250 }, { "epoch": 3.1577211540810595, "grad_norm": 0.00022201616957318038, "learning_rate": 7.192048256421532e-07, "loss": 0.0, "num_input_tokens_seen": 87117328, "step": 129255 }, { "epoch": 3.1578433049129067, "grad_norm": 0.16952480375766754, "learning_rate": 7.191229799763265e-07, "loss": 0.0, "num_input_tokens_seen": 87120592, "step": 129260 }, { "epoch": 3.157965455744754, "grad_norm": 0.003934331238269806, "learning_rate": 7.190411363531136e-07, "loss": 0.0, "num_input_tokens_seen": 87123856, "step": 129265 }, { "epoch": 3.1580876065766006, "grad_norm": 9.973491978598759e-05, "learning_rate": 7.189592947731085e-07, "loss": 0.0489, "num_input_tokens_seen": 87127184, "step": 129270 }, { "epoch": 3.158209757408448, "grad_norm": 0.0008483565761707723, "learning_rate": 7.188774552369077e-07, "loss": 0.0, "num_input_tokens_seen": 87130512, "step": 129275 }, { "epoch": 3.158331908240295, "grad_norm": 0.0038336871657520533, "learning_rate": 7.187956177451049e-07, "loss": 0.0, "num_input_tokens_seen": 87133904, "step": 129280 }, { "epoch": 3.158454059072142, "grad_norm": 0.0014283207710832357, "learning_rate": 7.187137822982965e-07, "loss": 0.0001, "num_input_tokens_seen": 87137168, "step": 129285 }, { "epoch": 3.1585762099039894, "grad_norm": 0.017337150871753693, "learning_rate": 7.186319488970771e-07, "loss": 0.0, "num_input_tokens_seen": 87140560, "step": 129290 }, { "epoch": 3.1586983607358365, "grad_norm": 0.00010832039697561413, "learning_rate": 7.185501175420416e-07, "loss": 0.0001, "num_input_tokens_seen": 87144208, "step": 129295 }, { "epoch": 3.1588205115676837, "grad_norm": 0.0023732376284897327, "learning_rate": 7.184682882337856e-07, "loss": 0.0, "num_input_tokens_seen": 87147472, "step": 129300 }, { "epoch": 3.158942662399531, "grad_norm": 0.0010419761529192328, "learning_rate": 7.183864609729037e-07, "loss": 0.0, "num_input_tokens_seen": 87150672, "step": 129305 }, { "epoch": 3.159064813231378, "grad_norm": 0.0005405220435932279, "learning_rate": 7.183046357599912e-07, "loss": 0.0, "num_input_tokens_seen": 87153744, "step": 129310 }, { "epoch": 3.1591869640632253, "grad_norm": 3.672197999549098e-05, "learning_rate": 7.182228125956433e-07, "loss": 0.0291, "num_input_tokens_seen": 87156944, "step": 129315 }, { "epoch": 3.1593091148950725, "grad_norm": 0.0006655405159108341, "learning_rate": 7.181409914804547e-07, "loss": 0.0001, "num_input_tokens_seen": 87160528, "step": 129320 }, { "epoch": 3.1594312657269197, "grad_norm": 0.0002669768873602152, "learning_rate": 7.18059172415021e-07, "loss": 0.0591, "num_input_tokens_seen": 87164176, "step": 129325 }, { "epoch": 3.159553416558767, "grad_norm": 0.0010073547018691897, "learning_rate": 7.179773553999364e-07, "loss": 0.0, "num_input_tokens_seen": 87167696, "step": 129330 }, { "epoch": 3.159675567390614, "grad_norm": 0.015864400193095207, "learning_rate": 7.178955404357967e-07, "loss": 0.0, "num_input_tokens_seen": 87170896, "step": 129335 }, { "epoch": 3.1597977182224612, "grad_norm": 0.05096196010708809, "learning_rate": 7.178137275231963e-07, "loss": 0.0001, "num_input_tokens_seen": 87174032, "step": 129340 }, { "epoch": 3.1599198690543084, "grad_norm": 3.69091285392642e-05, "learning_rate": 7.177319166627304e-07, "loss": 0.0001, "num_input_tokens_seen": 87177296, "step": 129345 }, { "epoch": 3.1600420198861556, "grad_norm": 0.00034703267738223076, "learning_rate": 7.176501078549941e-07, "loss": 0.0, "num_input_tokens_seen": 87181200, "step": 129350 }, { "epoch": 3.1601641707180024, "grad_norm": 1.9219897985458374, "learning_rate": 7.175683011005818e-07, "loss": 0.0001, "num_input_tokens_seen": 87184656, "step": 129355 }, { "epoch": 3.1602863215498496, "grad_norm": 0.007372839376330376, "learning_rate": 7.174864964000893e-07, "loss": 0.0, "num_input_tokens_seen": 87187728, "step": 129360 }, { "epoch": 3.1604084723816968, "grad_norm": 0.2552824318408966, "learning_rate": 7.174046937541103e-07, "loss": 0.0443, "num_input_tokens_seen": 87191184, "step": 129365 }, { "epoch": 3.160530623213544, "grad_norm": 0.007546910550445318, "learning_rate": 7.173228931632406e-07, "loss": 0.0, "num_input_tokens_seen": 87194128, "step": 129370 }, { "epoch": 3.160652774045391, "grad_norm": 0.00017476998618803918, "learning_rate": 7.172410946280752e-07, "loss": 0.0, "num_input_tokens_seen": 87197712, "step": 129375 }, { "epoch": 3.1607749248772383, "grad_norm": 0.0020659775473177433, "learning_rate": 7.171592981492085e-07, "loss": 0.0001, "num_input_tokens_seen": 87201040, "step": 129380 }, { "epoch": 3.1608970757090855, "grad_norm": 0.002920554019510746, "learning_rate": 7.170775037272353e-07, "loss": 0.0, "num_input_tokens_seen": 87204432, "step": 129385 }, { "epoch": 3.1610192265409327, "grad_norm": 0.00036389954038895667, "learning_rate": 7.169957113627507e-07, "loss": 0.0626, "num_input_tokens_seen": 87207504, "step": 129390 }, { "epoch": 3.16114137737278, "grad_norm": 0.00018603085482027382, "learning_rate": 7.169139210563495e-07, "loss": 0.0, "num_input_tokens_seen": 87211024, "step": 129395 }, { "epoch": 3.161263528204627, "grad_norm": 0.0009080006857402623, "learning_rate": 7.168321328086262e-07, "loss": 0.0269, "num_input_tokens_seen": 87214288, "step": 129400 }, { "epoch": 3.1613856790364743, "grad_norm": 0.0012143454514443874, "learning_rate": 7.167503466201757e-07, "loss": 0.0003, "num_input_tokens_seen": 87218000, "step": 129405 }, { "epoch": 3.1615078298683215, "grad_norm": 0.0028844240587204695, "learning_rate": 7.166685624915931e-07, "loss": 0.0688, "num_input_tokens_seen": 87221072, "step": 129410 }, { "epoch": 3.1616299807001687, "grad_norm": 0.03779517486691475, "learning_rate": 7.165867804234727e-07, "loss": 0.0, "num_input_tokens_seen": 87224720, "step": 129415 }, { "epoch": 3.161752131532016, "grad_norm": 0.01557319238781929, "learning_rate": 7.165050004164098e-07, "loss": 0.0, "num_input_tokens_seen": 87228368, "step": 129420 }, { "epoch": 3.161874282363863, "grad_norm": 0.0016049350379034877, "learning_rate": 7.164232224709984e-07, "loss": 0.0, "num_input_tokens_seen": 87231696, "step": 129425 }, { "epoch": 3.16199643319571, "grad_norm": 0.0009750159224495292, "learning_rate": 7.16341446587834e-07, "loss": 0.0489, "num_input_tokens_seen": 87235024, "step": 129430 }, { "epoch": 3.1621185840275574, "grad_norm": 0.004246531054377556, "learning_rate": 7.162596727675105e-07, "loss": 0.0, "num_input_tokens_seen": 87238800, "step": 129435 }, { "epoch": 3.1622407348594046, "grad_norm": 0.00011263210762990639, "learning_rate": 7.161779010106233e-07, "loss": 0.0642, "num_input_tokens_seen": 87242064, "step": 129440 }, { "epoch": 3.162362885691252, "grad_norm": 0.0018355419160798192, "learning_rate": 7.160961313177667e-07, "loss": 0.0, "num_input_tokens_seen": 87244944, "step": 129445 }, { "epoch": 3.1624850365230985, "grad_norm": 0.047277096658945084, "learning_rate": 7.16014363689535e-07, "loss": 0.0, "num_input_tokens_seen": 87248400, "step": 129450 }, { "epoch": 3.1626071873549457, "grad_norm": 0.004752216394990683, "learning_rate": 7.159325981265238e-07, "loss": 0.0, "num_input_tokens_seen": 87251920, "step": 129455 }, { "epoch": 3.162729338186793, "grad_norm": 0.00715651735663414, "learning_rate": 7.158508346293268e-07, "loss": 0.0001, "num_input_tokens_seen": 87255824, "step": 129460 }, { "epoch": 3.16285148901864, "grad_norm": 0.0004549931618385017, "learning_rate": 7.157690731985388e-07, "loss": 0.0477, "num_input_tokens_seen": 87259408, "step": 129465 }, { "epoch": 3.1629736398504873, "grad_norm": 39.06941223144531, "learning_rate": 7.15687313834755e-07, "loss": 0.0637, "num_input_tokens_seen": 87262672, "step": 129470 }, { "epoch": 3.1630957906823345, "grad_norm": 0.14959043264389038, "learning_rate": 7.156055565385692e-07, "loss": 0.0, "num_input_tokens_seen": 87265744, "step": 129475 }, { "epoch": 3.1632179415141817, "grad_norm": 0.00015907990746200085, "learning_rate": 7.155238013105765e-07, "loss": 0.0, "num_input_tokens_seen": 87269072, "step": 129480 }, { "epoch": 3.163340092346029, "grad_norm": 0.001588508952409029, "learning_rate": 7.15442048151371e-07, "loss": 0.0001, "num_input_tokens_seen": 87272144, "step": 129485 }, { "epoch": 3.163462243177876, "grad_norm": 3.941430350096198e-06, "learning_rate": 7.153602970615478e-07, "loss": 0.0, "num_input_tokens_seen": 87275984, "step": 129490 }, { "epoch": 3.1635843940097232, "grad_norm": 0.0015727278077974916, "learning_rate": 7.152785480417009e-07, "loss": 0.0, "num_input_tokens_seen": 87279312, "step": 129495 }, { "epoch": 3.1637065448415704, "grad_norm": 0.030083678662776947, "learning_rate": 7.151968010924247e-07, "loss": 0.0718, "num_input_tokens_seen": 87282512, "step": 129500 }, { "epoch": 3.1638286956734176, "grad_norm": 0.0015011405339464545, "learning_rate": 7.151150562143145e-07, "loss": 0.0, "num_input_tokens_seen": 87285968, "step": 129505 }, { "epoch": 3.163950846505265, "grad_norm": 0.003872362896800041, "learning_rate": 7.150333134079636e-07, "loss": 0.0007, "num_input_tokens_seen": 87289296, "step": 129510 }, { "epoch": 3.164072997337112, "grad_norm": 0.0008269310346804559, "learning_rate": 7.149515726739677e-07, "loss": 0.0, "num_input_tokens_seen": 87292752, "step": 129515 }, { "epoch": 3.164195148168959, "grad_norm": 0.00023637239064555615, "learning_rate": 7.148698340129202e-07, "loss": 0.0, "num_input_tokens_seen": 87296464, "step": 129520 }, { "epoch": 3.1643172990008064, "grad_norm": 0.017714515328407288, "learning_rate": 7.14788097425416e-07, "loss": 0.0, "num_input_tokens_seen": 87299536, "step": 129525 }, { "epoch": 3.1644394498326536, "grad_norm": 0.04738396778702736, "learning_rate": 7.147063629120497e-07, "loss": 0.0, "num_input_tokens_seen": 87302736, "step": 129530 }, { "epoch": 3.1645616006645003, "grad_norm": 0.0005483126733452082, "learning_rate": 7.146246304734155e-07, "loss": 0.0001, "num_input_tokens_seen": 87305744, "step": 129535 }, { "epoch": 3.1646837514963475, "grad_norm": 0.06283750385046005, "learning_rate": 7.145429001101077e-07, "loss": 0.0, "num_input_tokens_seen": 87308752, "step": 129540 }, { "epoch": 3.1648059023281947, "grad_norm": 4.195101428194903e-05, "learning_rate": 7.144611718227206e-07, "loss": 0.0, "num_input_tokens_seen": 87312144, "step": 129545 }, { "epoch": 3.164928053160042, "grad_norm": 0.04716356843709946, "learning_rate": 7.143794456118488e-07, "loss": 0.0, "num_input_tokens_seen": 87315344, "step": 129550 }, { "epoch": 3.165050203991889, "grad_norm": 0.00015722461102996022, "learning_rate": 7.142977214780863e-07, "loss": 0.0, "num_input_tokens_seen": 87318736, "step": 129555 }, { "epoch": 3.1651723548237363, "grad_norm": 0.00816582702100277, "learning_rate": 7.142159994220274e-07, "loss": 0.0, "num_input_tokens_seen": 87322384, "step": 129560 }, { "epoch": 3.1652945056555835, "grad_norm": 0.0011006930144503713, "learning_rate": 7.141342794442671e-07, "loss": 0.0001, "num_input_tokens_seen": 87325648, "step": 129565 }, { "epoch": 3.1654166564874306, "grad_norm": 0.09024383872747421, "learning_rate": 7.140525615453989e-07, "loss": 0.0001, "num_input_tokens_seen": 87329488, "step": 129570 }, { "epoch": 3.165538807319278, "grad_norm": 0.007687109988182783, "learning_rate": 7.139708457260176e-07, "loss": 0.0, "num_input_tokens_seen": 87332560, "step": 129575 }, { "epoch": 3.165660958151125, "grad_norm": 0.02901403419673443, "learning_rate": 7.138891319867171e-07, "loss": 0.0685, "num_input_tokens_seen": 87335440, "step": 129580 }, { "epoch": 3.165783108982972, "grad_norm": 0.0005344308447092772, "learning_rate": 7.138074203280919e-07, "loss": 0.0402, "num_input_tokens_seen": 87338448, "step": 129585 }, { "epoch": 3.1659052598148194, "grad_norm": 0.0023302987683564425, "learning_rate": 7.137257107507359e-07, "loss": 0.0001, "num_input_tokens_seen": 87342736, "step": 129590 }, { "epoch": 3.1660274106466666, "grad_norm": 0.0006107247318141162, "learning_rate": 7.136440032552437e-07, "loss": 0.0, "num_input_tokens_seen": 87345872, "step": 129595 }, { "epoch": 3.166149561478514, "grad_norm": 0.0016856311121955514, "learning_rate": 7.135622978422096e-07, "loss": 0.0, "num_input_tokens_seen": 87349392, "step": 129600 }, { "epoch": 3.166271712310361, "grad_norm": 0.022728141397237778, "learning_rate": 7.13480594512227e-07, "loss": 0.0, "num_input_tokens_seen": 87352784, "step": 129605 }, { "epoch": 3.166393863142208, "grad_norm": 0.001035742461681366, "learning_rate": 7.13398893265891e-07, "loss": 0.0, "num_input_tokens_seen": 87356304, "step": 129610 }, { "epoch": 3.1665160139740554, "grad_norm": 0.015423404984176159, "learning_rate": 7.13317194103795e-07, "loss": 0.0, "num_input_tokens_seen": 87359568, "step": 129615 }, { "epoch": 3.166638164805902, "grad_norm": 8.489882748108357e-05, "learning_rate": 7.132354970265333e-07, "loss": 0.0, "num_input_tokens_seen": 87363472, "step": 129620 }, { "epoch": 3.1667603156377493, "grad_norm": 0.005331840366125107, "learning_rate": 7.131538020347007e-07, "loss": 0.0, "num_input_tokens_seen": 87366608, "step": 129625 }, { "epoch": 3.1668824664695965, "grad_norm": 0.0009954558918252587, "learning_rate": 7.130721091288905e-07, "loss": 0.0543, "num_input_tokens_seen": 87369808, "step": 129630 }, { "epoch": 3.1670046173014437, "grad_norm": 0.004612566903233528, "learning_rate": 7.129904183096973e-07, "loss": 0.0, "num_input_tokens_seen": 87373584, "step": 129635 }, { "epoch": 3.167126768133291, "grad_norm": 0.007426253519952297, "learning_rate": 7.129087295777148e-07, "loss": 0.0441, "num_input_tokens_seen": 87377296, "step": 129640 }, { "epoch": 3.167248918965138, "grad_norm": 0.000458728609373793, "learning_rate": 7.128270429335375e-07, "loss": 0.1569, "num_input_tokens_seen": 87380560, "step": 129645 }, { "epoch": 3.1673710697969852, "grad_norm": 0.014502973295748234, "learning_rate": 7.12745358377759e-07, "loss": 0.0, "num_input_tokens_seen": 87383888, "step": 129650 }, { "epoch": 3.1674932206288324, "grad_norm": 0.0007057562470436096, "learning_rate": 7.126636759109732e-07, "loss": 0.0, "num_input_tokens_seen": 87387152, "step": 129655 }, { "epoch": 3.1676153714606796, "grad_norm": 0.004495688248425722, "learning_rate": 7.12581995533775e-07, "loss": 0.0, "num_input_tokens_seen": 87390928, "step": 129660 }, { "epoch": 3.167737522292527, "grad_norm": 0.006966794840991497, "learning_rate": 7.125003172467574e-07, "loss": 0.0, "num_input_tokens_seen": 87394384, "step": 129665 }, { "epoch": 3.167859673124374, "grad_norm": 0.006203863769769669, "learning_rate": 7.124186410505153e-07, "loss": 0.0001, "num_input_tokens_seen": 87397584, "step": 129670 }, { "epoch": 3.167981823956221, "grad_norm": 0.0003877569397445768, "learning_rate": 7.123369669456417e-07, "loss": 0.0455, "num_input_tokens_seen": 87400976, "step": 129675 }, { "epoch": 3.1681039747880684, "grad_norm": 0.009295029565691948, "learning_rate": 7.12255294932731e-07, "loss": 0.0002, "num_input_tokens_seen": 87404240, "step": 129680 }, { "epoch": 3.1682261256199156, "grad_norm": 0.002795857610180974, "learning_rate": 7.121736250123777e-07, "loss": 0.0, "num_input_tokens_seen": 87407376, "step": 129685 }, { "epoch": 3.1683482764517628, "grad_norm": 0.0059104301035404205, "learning_rate": 7.120919571851749e-07, "loss": 0.0001, "num_input_tokens_seen": 87410832, "step": 129690 }, { "epoch": 3.16847042728361, "grad_norm": 0.02798088826239109, "learning_rate": 7.120102914517168e-07, "loss": 0.0, "num_input_tokens_seen": 87414416, "step": 129695 }, { "epoch": 3.168592578115457, "grad_norm": 0.001736186444759369, "learning_rate": 7.119286278125976e-07, "loss": 0.0, "num_input_tokens_seen": 87417488, "step": 129700 }, { "epoch": 3.1687147289473043, "grad_norm": 0.03984348848462105, "learning_rate": 7.118469662684108e-07, "loss": 0.0001, "num_input_tokens_seen": 87420752, "step": 129705 }, { "epoch": 3.1688368797791515, "grad_norm": 0.0017900230595842004, "learning_rate": 7.1176530681975e-07, "loss": 0.0875, "num_input_tokens_seen": 87424144, "step": 129710 }, { "epoch": 3.1689590306109983, "grad_norm": 0.0016544137615710497, "learning_rate": 7.116836494672096e-07, "loss": 0.0002, "num_input_tokens_seen": 87427792, "step": 129715 }, { "epoch": 3.1690811814428455, "grad_norm": 0.00012129753304179758, "learning_rate": 7.116019942113835e-07, "loss": 0.0, "num_input_tokens_seen": 87432336, "step": 129720 }, { "epoch": 3.1692033322746926, "grad_norm": 19.250612258911133, "learning_rate": 7.115203410528649e-07, "loss": 0.0255, "num_input_tokens_seen": 87435536, "step": 129725 }, { "epoch": 3.16932548310654, "grad_norm": 0.006029466167092323, "learning_rate": 7.114386899922483e-07, "loss": 0.0, "num_input_tokens_seen": 87438800, "step": 129730 }, { "epoch": 3.169447633938387, "grad_norm": 0.032365381717681885, "learning_rate": 7.113570410301268e-07, "loss": 0.0001, "num_input_tokens_seen": 87442576, "step": 129735 }, { "epoch": 3.169569784770234, "grad_norm": 0.007656916975975037, "learning_rate": 7.11275394167095e-07, "loss": 0.0, "num_input_tokens_seen": 87446288, "step": 129740 }, { "epoch": 3.1696919356020814, "grad_norm": 0.09838418662548065, "learning_rate": 7.111937494037457e-07, "loss": 0.0001, "num_input_tokens_seen": 87449552, "step": 129745 }, { "epoch": 3.1698140864339286, "grad_norm": 0.004372446797788143, "learning_rate": 7.111121067406735e-07, "loss": 0.0, "num_input_tokens_seen": 87452496, "step": 129750 }, { "epoch": 3.169936237265776, "grad_norm": 0.006624558009207249, "learning_rate": 7.110304661784719e-07, "loss": 0.0, "num_input_tokens_seen": 87456272, "step": 129755 }, { "epoch": 3.170058388097623, "grad_norm": 0.10447652637958527, "learning_rate": 7.10948827717734e-07, "loss": 0.0001, "num_input_tokens_seen": 87459984, "step": 129760 }, { "epoch": 3.17018053892947, "grad_norm": 0.0034557366743683815, "learning_rate": 7.108671913590543e-07, "loss": 0.0001, "num_input_tokens_seen": 87463632, "step": 129765 }, { "epoch": 3.1703026897613173, "grad_norm": 0.001495639211498201, "learning_rate": 7.107855571030259e-07, "loss": 0.0, "num_input_tokens_seen": 87467088, "step": 129770 }, { "epoch": 3.1704248405931645, "grad_norm": 0.013662347570061684, "learning_rate": 7.107039249502427e-07, "loss": 0.0001, "num_input_tokens_seen": 87470288, "step": 129775 }, { "epoch": 3.1705469914250117, "grad_norm": 0.00015722191892564297, "learning_rate": 7.106222949012988e-07, "loss": 0.0, "num_input_tokens_seen": 87473424, "step": 129780 }, { "epoch": 3.170669142256859, "grad_norm": 0.012680304236710072, "learning_rate": 7.105406669567869e-07, "loss": 0.0087, "num_input_tokens_seen": 87476688, "step": 129785 }, { "epoch": 3.170791293088706, "grad_norm": 0.0014011618914082646, "learning_rate": 7.104590411173014e-07, "loss": 0.0001, "num_input_tokens_seen": 87479952, "step": 129790 }, { "epoch": 3.1709134439205533, "grad_norm": 0.03720277547836304, "learning_rate": 7.103774173834354e-07, "loss": 0.0, "num_input_tokens_seen": 87483536, "step": 129795 }, { "epoch": 3.1710355947524, "grad_norm": 0.013112449087202549, "learning_rate": 7.102957957557831e-07, "loss": 0.0, "num_input_tokens_seen": 87487184, "step": 129800 }, { "epoch": 3.1711577455842472, "grad_norm": 0.001320056733675301, "learning_rate": 7.102141762349376e-07, "loss": 0.0001, "num_input_tokens_seen": 87490768, "step": 129805 }, { "epoch": 3.1712798964160944, "grad_norm": 0.00020275999850127846, "learning_rate": 7.101325588214923e-07, "loss": 0.0, "num_input_tokens_seen": 87494096, "step": 129810 }, { "epoch": 3.1714020472479416, "grad_norm": 0.012677210383117199, "learning_rate": 7.100509435160413e-07, "loss": 0.0, "num_input_tokens_seen": 87497168, "step": 129815 }, { "epoch": 3.171524198079789, "grad_norm": 0.0008101258426904678, "learning_rate": 7.099693303191775e-07, "loss": 0.0, "num_input_tokens_seen": 87500624, "step": 129820 }, { "epoch": 3.171646348911636, "grad_norm": 0.0013864908833056688, "learning_rate": 7.098877192314952e-07, "loss": 0.0, "num_input_tokens_seen": 87503952, "step": 129825 }, { "epoch": 3.171768499743483, "grad_norm": 0.007696992717683315, "learning_rate": 7.098061102535868e-07, "loss": 0.0, "num_input_tokens_seen": 87507408, "step": 129830 }, { "epoch": 3.1718906505753304, "grad_norm": 0.0013759899884462357, "learning_rate": 7.09724503386047e-07, "loss": 0.0501, "num_input_tokens_seen": 87510480, "step": 129835 }, { "epoch": 3.1720128014071776, "grad_norm": 0.0023257506545633078, "learning_rate": 7.096428986294682e-07, "loss": 0.0001, "num_input_tokens_seen": 87513936, "step": 129840 }, { "epoch": 3.1721349522390248, "grad_norm": 0.0013343783793970942, "learning_rate": 7.095612959844447e-07, "loss": 0.0001, "num_input_tokens_seen": 87517904, "step": 129845 }, { "epoch": 3.172257103070872, "grad_norm": 0.028019046410918236, "learning_rate": 7.094796954515695e-07, "loss": 0.0003, "num_input_tokens_seen": 87520976, "step": 129850 }, { "epoch": 3.172379253902719, "grad_norm": 0.00030671211425215006, "learning_rate": 7.093980970314361e-07, "loss": 0.0003, "num_input_tokens_seen": 87523984, "step": 129855 }, { "epoch": 3.1725014047345663, "grad_norm": 0.007012230344116688, "learning_rate": 7.09316500724638e-07, "loss": 0.0251, "num_input_tokens_seen": 87527120, "step": 129860 }, { "epoch": 3.1726235555664135, "grad_norm": 0.0017949631437659264, "learning_rate": 7.092349065317683e-07, "loss": 0.0688, "num_input_tokens_seen": 87530576, "step": 129865 }, { "epoch": 3.1727457063982607, "grad_norm": 0.008416776545345783, "learning_rate": 7.091533144534203e-07, "loss": 0.0, "num_input_tokens_seen": 87533968, "step": 129870 }, { "epoch": 3.172867857230108, "grad_norm": 0.02502255141735077, "learning_rate": 7.090717244901883e-07, "loss": 0.0, "num_input_tokens_seen": 87537744, "step": 129875 }, { "epoch": 3.172990008061955, "grad_norm": 0.0020456870552152395, "learning_rate": 7.089901366426642e-07, "loss": 0.0, "num_input_tokens_seen": 87541200, "step": 129880 }, { "epoch": 3.1731121588938023, "grad_norm": 0.014949319884181023, "learning_rate": 7.089085509114428e-07, "loss": 0.0001, "num_input_tokens_seen": 87544272, "step": 129885 }, { "epoch": 3.1732343097256495, "grad_norm": 0.005121736787259579, "learning_rate": 7.088269672971164e-07, "loss": 0.0, "num_input_tokens_seen": 87547216, "step": 129890 }, { "epoch": 3.173356460557496, "grad_norm": 0.03437983617186546, "learning_rate": 7.087453858002787e-07, "loss": 0.0, "num_input_tokens_seen": 87550608, "step": 129895 }, { "epoch": 3.1734786113893434, "grad_norm": 0.005664190277457237, "learning_rate": 7.086638064215226e-07, "loss": 0.0, "num_input_tokens_seen": 87553680, "step": 129900 }, { "epoch": 3.1736007622211906, "grad_norm": 0.00024182397464755923, "learning_rate": 7.085822291614419e-07, "loss": 0.0, "num_input_tokens_seen": 87556816, "step": 129905 }, { "epoch": 3.1737229130530378, "grad_norm": 0.03858715668320656, "learning_rate": 7.085006540206298e-07, "loss": 0.0, "num_input_tokens_seen": 87560592, "step": 129910 }, { "epoch": 3.173845063884885, "grad_norm": 0.0002090064954245463, "learning_rate": 7.08419080999679e-07, "loss": 0.0, "num_input_tokens_seen": 87563856, "step": 129915 }, { "epoch": 3.173967214716732, "grad_norm": 0.026446470990777016, "learning_rate": 7.083375100991835e-07, "loss": 0.0, "num_input_tokens_seen": 87567248, "step": 129920 }, { "epoch": 3.1740893655485793, "grad_norm": 0.0023130772169679403, "learning_rate": 7.082559413197356e-07, "loss": 0.0, "num_input_tokens_seen": 87570576, "step": 129925 }, { "epoch": 3.1742115163804265, "grad_norm": 0.0021503111347556114, "learning_rate": 7.081743746619289e-07, "loss": 0.0427, "num_input_tokens_seen": 87573776, "step": 129930 }, { "epoch": 3.1743336672122737, "grad_norm": 0.002262742491438985, "learning_rate": 7.080928101263571e-07, "loss": 0.0, "num_input_tokens_seen": 87576912, "step": 129935 }, { "epoch": 3.174455818044121, "grad_norm": 0.00038311193929985166, "learning_rate": 7.080112477136124e-07, "loss": 0.0001, "num_input_tokens_seen": 87580240, "step": 129940 }, { "epoch": 3.174577968875968, "grad_norm": 0.0581546351313591, "learning_rate": 7.07929687424289e-07, "loss": 0.0, "num_input_tokens_seen": 87583824, "step": 129945 }, { "epoch": 3.1747001197078153, "grad_norm": 0.017676610499620438, "learning_rate": 7.078481292589791e-07, "loss": 0.0679, "num_input_tokens_seen": 87587280, "step": 129950 }, { "epoch": 3.1748222705396625, "grad_norm": 0.09328219294548035, "learning_rate": 7.077665732182765e-07, "loss": 0.0637, "num_input_tokens_seen": 87590480, "step": 129955 }, { "epoch": 3.1749444213715097, "grad_norm": 0.0017453327309340239, "learning_rate": 7.076850193027737e-07, "loss": 0.0388, "num_input_tokens_seen": 87593936, "step": 129960 }, { "epoch": 3.175066572203357, "grad_norm": 0.005118220578879118, "learning_rate": 7.07603467513064e-07, "loss": 0.0001, "num_input_tokens_seen": 87597392, "step": 129965 }, { "epoch": 3.175188723035204, "grad_norm": 0.0016354554099962115, "learning_rate": 7.075219178497409e-07, "loss": 0.0, "num_input_tokens_seen": 87601872, "step": 129970 }, { "epoch": 3.1753108738670512, "grad_norm": 0.012382320128381252, "learning_rate": 7.074403703133967e-07, "loss": 0.0001, "num_input_tokens_seen": 87605264, "step": 129975 }, { "epoch": 3.175433024698898, "grad_norm": 0.00264959828928113, "learning_rate": 7.073588249046252e-07, "loss": 0.0001, "num_input_tokens_seen": 87609168, "step": 129980 }, { "epoch": 3.175555175530745, "grad_norm": 0.09764406085014343, "learning_rate": 7.072772816240184e-07, "loss": 0.0001, "num_input_tokens_seen": 87612304, "step": 129985 }, { "epoch": 3.1756773263625924, "grad_norm": 0.023879224434494972, "learning_rate": 7.071957404721707e-07, "loss": 0.0001, "num_input_tokens_seen": 87615312, "step": 129990 }, { "epoch": 3.1757994771944396, "grad_norm": 0.00015118512965273112, "learning_rate": 7.071142014496737e-07, "loss": 0.0001, "num_input_tokens_seen": 87618960, "step": 129995 }, { "epoch": 3.1759216280262867, "grad_norm": 0.0017053249757736921, "learning_rate": 7.070326645571213e-07, "loss": 0.0003, "num_input_tokens_seen": 87622096, "step": 130000 }, { "epoch": 3.176043778858134, "grad_norm": 44.842586517333984, "learning_rate": 7.06951129795106e-07, "loss": 0.062, "num_input_tokens_seen": 87625040, "step": 130005 }, { "epoch": 3.176165929689981, "grad_norm": 0.005887690931558609, "learning_rate": 7.068695971642212e-07, "loss": 0.0, "num_input_tokens_seen": 87628624, "step": 130010 }, { "epoch": 3.1762880805218283, "grad_norm": 0.0014447735156863928, "learning_rate": 7.067880666650594e-07, "loss": 0.0835, "num_input_tokens_seen": 87631888, "step": 130015 }, { "epoch": 3.1764102313536755, "grad_norm": 0.0010694863740354776, "learning_rate": 7.067065382982136e-07, "loss": 0.0, "num_input_tokens_seen": 87635472, "step": 130020 }, { "epoch": 3.1765323821855227, "grad_norm": 0.006981464568525553, "learning_rate": 7.066250120642765e-07, "loss": 0.0, "num_input_tokens_seen": 87638992, "step": 130025 }, { "epoch": 3.17665453301737, "grad_norm": 0.17222914099693298, "learning_rate": 7.065434879638417e-07, "loss": 0.0001, "num_input_tokens_seen": 87642192, "step": 130030 }, { "epoch": 3.176776683849217, "grad_norm": 0.0022465409711003304, "learning_rate": 7.064619659975012e-07, "loss": 0.0, "num_input_tokens_seen": 87645392, "step": 130035 }, { "epoch": 3.1768988346810643, "grad_norm": 0.04304928332567215, "learning_rate": 7.063804461658486e-07, "loss": 0.0, "num_input_tokens_seen": 87648912, "step": 130040 }, { "epoch": 3.1770209855129115, "grad_norm": 0.002591552911326289, "learning_rate": 7.06298928469476e-07, "loss": 0.0001, "num_input_tokens_seen": 87652560, "step": 130045 }, { "epoch": 3.1771431363447586, "grad_norm": 0.0006279898807406425, "learning_rate": 7.06217412908977e-07, "loss": 0.0001, "num_input_tokens_seen": 87656144, "step": 130050 }, { "epoch": 3.177265287176606, "grad_norm": 0.03918922692537308, "learning_rate": 7.061358994849434e-07, "loss": 0.0, "num_input_tokens_seen": 87659344, "step": 130055 }, { "epoch": 3.177387438008453, "grad_norm": 0.015605071559548378, "learning_rate": 7.06054388197969e-07, "loss": 0.0, "num_input_tokens_seen": 87662800, "step": 130060 }, { "epoch": 3.1775095888402998, "grad_norm": 0.0005128192133270204, "learning_rate": 7.059728790486463e-07, "loss": 0.0162, "num_input_tokens_seen": 87666128, "step": 130065 }, { "epoch": 3.177631739672147, "grad_norm": 0.005295256618410349, "learning_rate": 7.058913720375674e-07, "loss": 0.0002, "num_input_tokens_seen": 87669072, "step": 130070 }, { "epoch": 3.177753890503994, "grad_norm": 0.02350236475467682, "learning_rate": 7.058098671653261e-07, "loss": 0.0, "num_input_tokens_seen": 87672208, "step": 130075 }, { "epoch": 3.1778760413358413, "grad_norm": 0.0005129770725034177, "learning_rate": 7.057283644325141e-07, "loss": 0.0001, "num_input_tokens_seen": 87676112, "step": 130080 }, { "epoch": 3.1779981921676885, "grad_norm": 0.009734749794006348, "learning_rate": 7.056468638397246e-07, "loss": 0.0, "num_input_tokens_seen": 87679376, "step": 130085 }, { "epoch": 3.1781203429995357, "grad_norm": 0.0009069366496987641, "learning_rate": 7.055653653875507e-07, "loss": 0.0655, "num_input_tokens_seen": 87682576, "step": 130090 }, { "epoch": 3.178242493831383, "grad_norm": 0.002525150077417493, "learning_rate": 7.054838690765843e-07, "loss": 0.0, "num_input_tokens_seen": 87685968, "step": 130095 }, { "epoch": 3.17836464466323, "grad_norm": 0.0003225205873604864, "learning_rate": 7.054023749074188e-07, "loss": 0.0, "num_input_tokens_seen": 87689168, "step": 130100 }, { "epoch": 3.1784867954950773, "grad_norm": 0.0007413416169583797, "learning_rate": 7.053208828806459e-07, "loss": 0.0, "num_input_tokens_seen": 87692752, "step": 130105 }, { "epoch": 3.1786089463269245, "grad_norm": 0.0032514086924493313, "learning_rate": 7.052393929968593e-07, "loss": 0.0007, "num_input_tokens_seen": 87695632, "step": 130110 }, { "epoch": 3.1787310971587717, "grad_norm": 0.0037922996561974287, "learning_rate": 7.05157905256651e-07, "loss": 0.0002, "num_input_tokens_seen": 87698768, "step": 130115 }, { "epoch": 3.178853247990619, "grad_norm": 0.0006531036924570799, "learning_rate": 7.050764196606134e-07, "loss": 0.0, "num_input_tokens_seen": 87702288, "step": 130120 }, { "epoch": 3.178975398822466, "grad_norm": 0.00046883506001904607, "learning_rate": 7.049949362093399e-07, "loss": 0.0, "num_input_tokens_seen": 87705232, "step": 130125 }, { "epoch": 3.1790975496543132, "grad_norm": 0.0008409220608882606, "learning_rate": 7.049134549034222e-07, "loss": 0.0511, "num_input_tokens_seen": 87709136, "step": 130130 }, { "epoch": 3.1792197004861604, "grad_norm": 0.0014450735179707408, "learning_rate": 7.048319757434535e-07, "loss": 0.0001, "num_input_tokens_seen": 87712400, "step": 130135 }, { "epoch": 3.1793418513180076, "grad_norm": 0.0005200757295824587, "learning_rate": 7.047504987300256e-07, "loss": 0.0, "num_input_tokens_seen": 87715600, "step": 130140 }, { "epoch": 3.179464002149855, "grad_norm": 0.013696745969355106, "learning_rate": 7.046690238637321e-07, "loss": 0.0, "num_input_tokens_seen": 87719184, "step": 130145 }, { "epoch": 3.179586152981702, "grad_norm": 0.0015738914953544736, "learning_rate": 7.045875511451642e-07, "loss": 0.0, "num_input_tokens_seen": 87722512, "step": 130150 }, { "epoch": 3.179708303813549, "grad_norm": 0.00783845316618681, "learning_rate": 7.045060805749156e-07, "loss": 0.0, "num_input_tokens_seen": 87725648, "step": 130155 }, { "epoch": 3.179830454645396, "grad_norm": 0.0014518832322210073, "learning_rate": 7.044246121535781e-07, "loss": 0.0613, "num_input_tokens_seen": 87728848, "step": 130160 }, { "epoch": 3.179952605477243, "grad_norm": 0.15988725423812866, "learning_rate": 7.043431458817444e-07, "loss": 0.0001, "num_input_tokens_seen": 87731920, "step": 130165 }, { "epoch": 3.1800747563090903, "grad_norm": 0.0022228602319955826, "learning_rate": 7.042616817600067e-07, "loss": 0.0, "num_input_tokens_seen": 87735184, "step": 130170 }, { "epoch": 3.1801969071409375, "grad_norm": 0.029602840542793274, "learning_rate": 7.041802197889577e-07, "loss": 0.0366, "num_input_tokens_seen": 87738704, "step": 130175 }, { "epoch": 3.1803190579727847, "grad_norm": 51.33023452758789, "learning_rate": 7.040987599691895e-07, "loss": 0.0402, "num_input_tokens_seen": 87741840, "step": 130180 }, { "epoch": 3.180441208804632, "grad_norm": 0.00034280677209608257, "learning_rate": 7.040173023012952e-07, "loss": 0.0, "num_input_tokens_seen": 87745104, "step": 130185 }, { "epoch": 3.180563359636479, "grad_norm": 0.017405470833182335, "learning_rate": 7.039358467858662e-07, "loss": 0.0001, "num_input_tokens_seen": 87749008, "step": 130190 }, { "epoch": 3.1806855104683263, "grad_norm": 0.008447892032563686, "learning_rate": 7.038543934234957e-07, "loss": 0.0, "num_input_tokens_seen": 87752848, "step": 130195 }, { "epoch": 3.1808076613001735, "grad_norm": 0.04451584443449974, "learning_rate": 7.037729422147754e-07, "loss": 0.0, "num_input_tokens_seen": 87756048, "step": 130200 }, { "epoch": 3.1809298121320206, "grad_norm": 0.015546695329248905, "learning_rate": 7.036914931602984e-07, "loss": 0.0, "num_input_tokens_seen": 87759504, "step": 130205 }, { "epoch": 3.181051962963868, "grad_norm": 0.0012945537455379963, "learning_rate": 7.03610046260656e-07, "loss": 0.0003, "num_input_tokens_seen": 87762576, "step": 130210 }, { "epoch": 3.181174113795715, "grad_norm": 0.00038110712193883955, "learning_rate": 7.035286015164413e-07, "loss": 0.0, "num_input_tokens_seen": 87765712, "step": 130215 }, { "epoch": 3.181296264627562, "grad_norm": 0.0018213241128250957, "learning_rate": 7.034471589282467e-07, "loss": 0.0003, "num_input_tokens_seen": 87769168, "step": 130220 }, { "epoch": 3.1814184154594094, "grad_norm": 13.02833366394043, "learning_rate": 7.033657184966634e-07, "loss": 0.0399, "num_input_tokens_seen": 87772624, "step": 130225 }, { "epoch": 3.1815405662912566, "grad_norm": 0.023209894075989723, "learning_rate": 7.032842802222851e-07, "loss": 0.0, "num_input_tokens_seen": 87776080, "step": 130230 }, { "epoch": 3.181662717123104, "grad_norm": 0.07949075847864151, "learning_rate": 7.032028441057028e-07, "loss": 0.0672, "num_input_tokens_seen": 87779472, "step": 130235 }, { "epoch": 3.181784867954951, "grad_norm": 0.016844136640429497, "learning_rate": 7.031214101475092e-07, "loss": 0.0, "num_input_tokens_seen": 87782800, "step": 130240 }, { "epoch": 3.1819070187867977, "grad_norm": 0.0032533081248402596, "learning_rate": 7.030399783482971e-07, "loss": 0.0, "num_input_tokens_seen": 87786064, "step": 130245 }, { "epoch": 3.182029169618645, "grad_norm": 0.0273179579526186, "learning_rate": 7.029585487086576e-07, "loss": 0.0, "num_input_tokens_seen": 87789072, "step": 130250 }, { "epoch": 3.182151320450492, "grad_norm": 0.000918433303013444, "learning_rate": 7.028771212291839e-07, "loss": 0.0, "num_input_tokens_seen": 87792272, "step": 130255 }, { "epoch": 3.1822734712823393, "grad_norm": 0.02920876070857048, "learning_rate": 7.027956959104673e-07, "loss": 0.0002, "num_input_tokens_seen": 87795472, "step": 130260 }, { "epoch": 3.1823956221141865, "grad_norm": 6.971739639993757e-05, "learning_rate": 7.027142727531008e-07, "loss": 0.0002, "num_input_tokens_seen": 87798800, "step": 130265 }, { "epoch": 3.1825177729460337, "grad_norm": 0.05342421680688858, "learning_rate": 7.026328517576757e-07, "loss": 0.0, "num_input_tokens_seen": 87801872, "step": 130270 }, { "epoch": 3.182639923777881, "grad_norm": 0.0004939790815114975, "learning_rate": 7.025514329247844e-07, "loss": 0.0004, "num_input_tokens_seen": 87804880, "step": 130275 }, { "epoch": 3.182762074609728, "grad_norm": 0.18766658008098602, "learning_rate": 7.024700162550194e-07, "loss": 0.0002, "num_input_tokens_seen": 87808272, "step": 130280 }, { "epoch": 3.1828842254415752, "grad_norm": 0.0023981656413525343, "learning_rate": 7.023886017489721e-07, "loss": 0.0, "num_input_tokens_seen": 87811856, "step": 130285 }, { "epoch": 3.1830063762734224, "grad_norm": 0.00666560186073184, "learning_rate": 7.023071894072354e-07, "loss": 0.0317, "num_input_tokens_seen": 87815120, "step": 130290 }, { "epoch": 3.1831285271052696, "grad_norm": 0.004666912369430065, "learning_rate": 7.022257792304005e-07, "loss": 0.0, "num_input_tokens_seen": 87818832, "step": 130295 }, { "epoch": 3.183250677937117, "grad_norm": 0.005564829334616661, "learning_rate": 7.021443712190601e-07, "loss": 0.0005, "num_input_tokens_seen": 87822544, "step": 130300 }, { "epoch": 3.183372828768964, "grad_norm": 0.09069501608610153, "learning_rate": 7.020629653738056e-07, "loss": 0.0, "num_input_tokens_seen": 87826064, "step": 130305 }, { "epoch": 3.183494979600811, "grad_norm": 6.888069765409455e-05, "learning_rate": 7.019815616952295e-07, "loss": 0.0, "num_input_tokens_seen": 87829520, "step": 130310 }, { "epoch": 3.1836171304326584, "grad_norm": 0.005070709623396397, "learning_rate": 7.019001601839238e-07, "loss": 0.0, "num_input_tokens_seen": 87832976, "step": 130315 }, { "epoch": 3.1837392812645056, "grad_norm": 0.00020985525043215603, "learning_rate": 7.018187608404802e-07, "loss": 0.0674, "num_input_tokens_seen": 87836112, "step": 130320 }, { "epoch": 3.1838614320963528, "grad_norm": 7.797882426530123e-05, "learning_rate": 7.017373636654908e-07, "loss": 0.0703, "num_input_tokens_seen": 87839056, "step": 130325 }, { "epoch": 3.1839835829282, "grad_norm": 0.006128346081823111, "learning_rate": 7.016559686595475e-07, "loss": 0.0, "num_input_tokens_seen": 87842320, "step": 130330 }, { "epoch": 3.184105733760047, "grad_norm": 0.15260010957717896, "learning_rate": 7.015745758232421e-07, "loss": 0.0, "num_input_tokens_seen": 87845776, "step": 130335 }, { "epoch": 3.184227884591894, "grad_norm": 0.15395669639110565, "learning_rate": 7.01493185157167e-07, "loss": 0.0001, "num_input_tokens_seen": 87848848, "step": 130340 }, { "epoch": 3.184350035423741, "grad_norm": 4.5210264943307266e-05, "learning_rate": 7.014117966619133e-07, "loss": 0.0, "num_input_tokens_seen": 87852880, "step": 130345 }, { "epoch": 3.1844721862555883, "grad_norm": 0.06586100906133652, "learning_rate": 7.013304103380738e-07, "loss": 0.0, "num_input_tokens_seen": 87855952, "step": 130350 }, { "epoch": 3.1845943370874354, "grad_norm": 0.24961978197097778, "learning_rate": 7.012490261862394e-07, "loss": 0.0001, "num_input_tokens_seen": 87859472, "step": 130355 }, { "epoch": 3.1847164879192826, "grad_norm": 12.334030151367188, "learning_rate": 7.011676442070029e-07, "loss": 0.031, "num_input_tokens_seen": 87862608, "step": 130360 }, { "epoch": 3.18483863875113, "grad_norm": 0.00022853047994431108, "learning_rate": 7.010862644009553e-07, "loss": 0.0, "num_input_tokens_seen": 87866320, "step": 130365 }, { "epoch": 3.184960789582977, "grad_norm": 0.06349430978298187, "learning_rate": 7.010048867686889e-07, "loss": 0.0001, "num_input_tokens_seen": 87869712, "step": 130370 }, { "epoch": 3.185082940414824, "grad_norm": 0.009467333555221558, "learning_rate": 7.009235113107956e-07, "loss": 0.0, "num_input_tokens_seen": 87873424, "step": 130375 }, { "epoch": 3.1852050912466714, "grad_norm": 0.00011473613267298788, "learning_rate": 7.008421380278666e-07, "loss": 0.0311, "num_input_tokens_seen": 87876496, "step": 130380 }, { "epoch": 3.1853272420785186, "grad_norm": 0.03573605790734291, "learning_rate": 7.007607669204944e-07, "loss": 0.0317, "num_input_tokens_seen": 87879824, "step": 130385 }, { "epoch": 3.1854493929103658, "grad_norm": 0.007226187270134687, "learning_rate": 7.006793979892702e-07, "loss": 0.0418, "num_input_tokens_seen": 87883152, "step": 130390 }, { "epoch": 3.185571543742213, "grad_norm": 0.00042921036947518587, "learning_rate": 7.005980312347856e-07, "loss": 0.0, "num_input_tokens_seen": 87886736, "step": 130395 }, { "epoch": 3.18569369457406, "grad_norm": 17.04238510131836, "learning_rate": 7.005166666576333e-07, "loss": 0.033, "num_input_tokens_seen": 87890320, "step": 130400 }, { "epoch": 3.1858158454059073, "grad_norm": 0.07590338587760925, "learning_rate": 7.004353042584038e-07, "loss": 0.0002, "num_input_tokens_seen": 87894288, "step": 130405 }, { "epoch": 3.1859379962377545, "grad_norm": 0.0023550644982606173, "learning_rate": 7.003539440376898e-07, "loss": 0.0, "num_input_tokens_seen": 87897872, "step": 130410 }, { "epoch": 3.1860601470696017, "grad_norm": 0.02361527644097805, "learning_rate": 7.002725859960821e-07, "loss": 0.0, "num_input_tokens_seen": 87902224, "step": 130415 }, { "epoch": 3.186182297901449, "grad_norm": 0.05243421345949173, "learning_rate": 7.001912301341732e-07, "loss": 0.0, "num_input_tokens_seen": 87905872, "step": 130420 }, { "epoch": 3.1863044487332957, "grad_norm": 0.00032037662458606064, "learning_rate": 7.001098764525542e-07, "loss": 0.0001, "num_input_tokens_seen": 87909072, "step": 130425 }, { "epoch": 3.186426599565143, "grad_norm": 0.043554555624723434, "learning_rate": 7.000285249518164e-07, "loss": 0.0, "num_input_tokens_seen": 87912336, "step": 130430 }, { "epoch": 3.18654875039699, "grad_norm": 0.04560072347521782, "learning_rate": 6.999471756325523e-07, "loss": 0.0, "num_input_tokens_seen": 87915792, "step": 130435 }, { "epoch": 3.1866709012288372, "grad_norm": 0.24238801002502441, "learning_rate": 6.998658284953528e-07, "loss": 0.0001, "num_input_tokens_seen": 87919056, "step": 130440 }, { "epoch": 3.1867930520606844, "grad_norm": 0.11142058670520782, "learning_rate": 6.9978448354081e-07, "loss": 0.0572, "num_input_tokens_seen": 87922448, "step": 130445 }, { "epoch": 3.1869152028925316, "grad_norm": 0.050994254648685455, "learning_rate": 6.997031407695148e-07, "loss": 0.0, "num_input_tokens_seen": 87926032, "step": 130450 }, { "epoch": 3.187037353724379, "grad_norm": 0.012314151972532272, "learning_rate": 6.996218001820596e-07, "loss": 0.0555, "num_input_tokens_seen": 87929296, "step": 130455 }, { "epoch": 3.187159504556226, "grad_norm": 0.0014616765547543764, "learning_rate": 6.99540461779035e-07, "loss": 0.0, "num_input_tokens_seen": 87932944, "step": 130460 }, { "epoch": 3.187281655388073, "grad_norm": 0.006061887834221125, "learning_rate": 6.994591255610331e-07, "loss": 0.0, "num_input_tokens_seen": 87936016, "step": 130465 }, { "epoch": 3.1874038062199204, "grad_norm": 0.00010483522055437788, "learning_rate": 6.993777915286455e-07, "loss": 0.0, "num_input_tokens_seen": 87939600, "step": 130470 }, { "epoch": 3.1875259570517676, "grad_norm": 0.001160849235020578, "learning_rate": 6.992964596824633e-07, "loss": 0.0, "num_input_tokens_seen": 87942608, "step": 130475 }, { "epoch": 3.1876481078836147, "grad_norm": 2.6711461544036865, "learning_rate": 6.992151300230782e-07, "loss": 0.0002, "num_input_tokens_seen": 87946000, "step": 130480 }, { "epoch": 3.187770258715462, "grad_norm": 0.000411342567531392, "learning_rate": 6.991338025510816e-07, "loss": 0.0, "num_input_tokens_seen": 87949520, "step": 130485 }, { "epoch": 3.187892409547309, "grad_norm": 0.004299001768231392, "learning_rate": 6.990524772670645e-07, "loss": 0.0, "num_input_tokens_seen": 87953104, "step": 130490 }, { "epoch": 3.1880145603791563, "grad_norm": 0.0024759077932685614, "learning_rate": 6.989711541716192e-07, "loss": 0.0, "num_input_tokens_seen": 87956624, "step": 130495 }, { "epoch": 3.1881367112110035, "grad_norm": 0.002334713703021407, "learning_rate": 6.988898332653363e-07, "loss": 0.0, "num_input_tokens_seen": 87960016, "step": 130500 }, { "epoch": 3.1882588620428507, "grad_norm": 0.0018174410797655582, "learning_rate": 6.988085145488081e-07, "loss": 0.0, "num_input_tokens_seen": 87963152, "step": 130505 }, { "epoch": 3.188381012874698, "grad_norm": 0.00905153714120388, "learning_rate": 6.987271980226247e-07, "loss": 0.0, "num_input_tokens_seen": 87966544, "step": 130510 }, { "epoch": 3.188503163706545, "grad_norm": 0.0009720994858071208, "learning_rate": 6.986458836873787e-07, "loss": 0.0, "num_input_tokens_seen": 87969616, "step": 130515 }, { "epoch": 3.188625314538392, "grad_norm": 0.0017524209106341004, "learning_rate": 6.985645715436605e-07, "loss": 0.0, "num_input_tokens_seen": 87973264, "step": 130520 }, { "epoch": 3.188747465370239, "grad_norm": 0.0002401109377387911, "learning_rate": 6.98483261592062e-07, "loss": 0.0365, "num_input_tokens_seen": 87976464, "step": 130525 }, { "epoch": 3.188869616202086, "grad_norm": 0.5011038184165955, "learning_rate": 6.984019538331745e-07, "loss": 0.0001, "num_input_tokens_seen": 87979600, "step": 130530 }, { "epoch": 3.1889917670339334, "grad_norm": 0.014322505332529545, "learning_rate": 6.983206482675885e-07, "loss": 0.0, "num_input_tokens_seen": 87982864, "step": 130535 }, { "epoch": 3.1891139178657806, "grad_norm": 0.01264337170869112, "learning_rate": 6.982393448958965e-07, "loss": 0.0002, "num_input_tokens_seen": 87986448, "step": 130540 }, { "epoch": 3.1892360686976278, "grad_norm": 0.0003539649769663811, "learning_rate": 6.981580437186887e-07, "loss": 0.0, "num_input_tokens_seen": 87989904, "step": 130545 }, { "epoch": 3.189358219529475, "grad_norm": 0.004508215002715588, "learning_rate": 6.980767447365574e-07, "loss": 0.0, "num_input_tokens_seen": 87993168, "step": 130550 }, { "epoch": 3.189480370361322, "grad_norm": 0.001551062916405499, "learning_rate": 6.979954479500924e-07, "loss": 0.0, "num_input_tokens_seen": 87996240, "step": 130555 }, { "epoch": 3.1896025211931693, "grad_norm": 0.00028249152819626033, "learning_rate": 6.979141533598861e-07, "loss": 0.0001, "num_input_tokens_seen": 88000080, "step": 130560 }, { "epoch": 3.1897246720250165, "grad_norm": 0.0010627760784700513, "learning_rate": 6.978328609665296e-07, "loss": 0.0, "num_input_tokens_seen": 88003408, "step": 130565 }, { "epoch": 3.1898468228568637, "grad_norm": 0.0011315912706777453, "learning_rate": 6.977515707706134e-07, "loss": 0.0498, "num_input_tokens_seen": 88006480, "step": 130570 }, { "epoch": 3.189968973688711, "grad_norm": 0.03361840918660164, "learning_rate": 6.976702827727294e-07, "loss": 0.0001, "num_input_tokens_seen": 88009872, "step": 130575 }, { "epoch": 3.190091124520558, "grad_norm": 0.0004706278850790113, "learning_rate": 6.975889969734684e-07, "loss": 0.0002, "num_input_tokens_seen": 88013328, "step": 130580 }, { "epoch": 3.1902132753524053, "grad_norm": 0.004012112505733967, "learning_rate": 6.975077133734213e-07, "loss": 0.0, "num_input_tokens_seen": 88017232, "step": 130585 }, { "epoch": 3.1903354261842525, "grad_norm": 0.0007510602008551359, "learning_rate": 6.974264319731797e-07, "loss": 0.0, "num_input_tokens_seen": 88020688, "step": 130590 }, { "epoch": 3.1904575770160997, "grad_norm": 0.0016171341994777322, "learning_rate": 6.973451527733343e-07, "loss": 0.0, "num_input_tokens_seen": 88023568, "step": 130595 }, { "epoch": 3.190579727847947, "grad_norm": 0.0002590622170828283, "learning_rate": 6.972638757744766e-07, "loss": 0.0, "num_input_tokens_seen": 88027024, "step": 130600 }, { "epoch": 3.1907018786797936, "grad_norm": 0.0003849154163617641, "learning_rate": 6.971826009771971e-07, "loss": 0.0, "num_input_tokens_seen": 88030544, "step": 130605 }, { "epoch": 3.190824029511641, "grad_norm": 0.00034108353429473937, "learning_rate": 6.971013283820876e-07, "loss": 0.0001, "num_input_tokens_seen": 88034064, "step": 130610 }, { "epoch": 3.190946180343488, "grad_norm": 0.002630336210131645, "learning_rate": 6.970200579897382e-07, "loss": 0.0, "num_input_tokens_seen": 88037328, "step": 130615 }, { "epoch": 3.191068331175335, "grad_norm": 0.0024332611355930567, "learning_rate": 6.969387898007406e-07, "loss": 0.0, "num_input_tokens_seen": 88041040, "step": 130620 }, { "epoch": 3.1911904820071824, "grad_norm": 0.00011417076166253537, "learning_rate": 6.968575238156858e-07, "loss": 0.0, "num_input_tokens_seen": 88044624, "step": 130625 }, { "epoch": 3.1913126328390296, "grad_norm": 0.0007042978540994227, "learning_rate": 6.967762600351646e-07, "loss": 0.0, "num_input_tokens_seen": 88047952, "step": 130630 }, { "epoch": 3.1914347836708767, "grad_norm": 0.00012770794273819774, "learning_rate": 6.966949984597679e-07, "loss": 0.0, "num_input_tokens_seen": 88051216, "step": 130635 }, { "epoch": 3.191556934502724, "grad_norm": 0.027314137667417526, "learning_rate": 6.966137390900868e-07, "loss": 0.0001, "num_input_tokens_seen": 88054352, "step": 130640 }, { "epoch": 3.191679085334571, "grad_norm": 52.50052261352539, "learning_rate": 6.96532481926712e-07, "loss": 0.0467, "num_input_tokens_seen": 88057872, "step": 130645 }, { "epoch": 3.1918012361664183, "grad_norm": 0.00997094251215458, "learning_rate": 6.96451226970235e-07, "loss": 0.0, "num_input_tokens_seen": 88060880, "step": 130650 }, { "epoch": 3.1919233869982655, "grad_norm": 0.012899408116936684, "learning_rate": 6.963699742212459e-07, "loss": 0.0, "num_input_tokens_seen": 88064272, "step": 130655 }, { "epoch": 3.1920455378301127, "grad_norm": 0.00294029014185071, "learning_rate": 6.962887236803363e-07, "loss": 0.0418, "num_input_tokens_seen": 88067920, "step": 130660 }, { "epoch": 3.19216768866196, "grad_norm": 0.0002910926123149693, "learning_rate": 6.962074753480966e-07, "loss": 0.0, "num_input_tokens_seen": 88071504, "step": 130665 }, { "epoch": 3.192289839493807, "grad_norm": 0.017607729882001877, "learning_rate": 6.961262292251182e-07, "loss": 0.0, "num_input_tokens_seen": 88074576, "step": 130670 }, { "epoch": 3.1924119903256543, "grad_norm": 9.501243039267138e-05, "learning_rate": 6.96044985311991e-07, "loss": 0.0, "num_input_tokens_seen": 88078032, "step": 130675 }, { "epoch": 3.1925341411575014, "grad_norm": 0.001667727599851787, "learning_rate": 6.959637436093069e-07, "loss": 0.0, "num_input_tokens_seen": 88081488, "step": 130680 }, { "epoch": 3.1926562919893486, "grad_norm": 0.00019331845396663994, "learning_rate": 6.958825041176564e-07, "loss": 0.0, "num_input_tokens_seen": 88084752, "step": 130685 }, { "epoch": 3.1927784428211954, "grad_norm": 0.000298037106404081, "learning_rate": 6.958012668376295e-07, "loss": 0.0, "num_input_tokens_seen": 88087952, "step": 130690 }, { "epoch": 3.1929005936530426, "grad_norm": 0.0023864801041781902, "learning_rate": 6.957200317698182e-07, "loss": 0.0, "num_input_tokens_seen": 88091280, "step": 130695 }, { "epoch": 3.1930227444848898, "grad_norm": 0.13202454149723053, "learning_rate": 6.956387989148123e-07, "loss": 0.0001, "num_input_tokens_seen": 88094800, "step": 130700 }, { "epoch": 3.193144895316737, "grad_norm": 0.018603404983878136, "learning_rate": 6.955575682732032e-07, "loss": 0.0, "num_input_tokens_seen": 88097872, "step": 130705 }, { "epoch": 3.193267046148584, "grad_norm": 0.00045014871284365654, "learning_rate": 6.95476339845581e-07, "loss": 0.0, "num_input_tokens_seen": 88101648, "step": 130710 }, { "epoch": 3.1933891969804313, "grad_norm": 3.039315743080806e-05, "learning_rate": 6.953951136325367e-07, "loss": 0.0002, "num_input_tokens_seen": 88105296, "step": 130715 }, { "epoch": 3.1935113478122785, "grad_norm": 0.00015498968423344195, "learning_rate": 6.953138896346615e-07, "loss": 0.0, "num_input_tokens_seen": 88108688, "step": 130720 }, { "epoch": 3.1936334986441257, "grad_norm": 21.384185791015625, "learning_rate": 6.952326678525452e-07, "loss": 0.0903, "num_input_tokens_seen": 88112400, "step": 130725 }, { "epoch": 3.193755649475973, "grad_norm": 0.01024781446903944, "learning_rate": 6.951514482867794e-07, "loss": 0.0, "num_input_tokens_seen": 88115728, "step": 130730 }, { "epoch": 3.19387780030782, "grad_norm": 0.0065758079290390015, "learning_rate": 6.95070230937954e-07, "loss": 0.0005, "num_input_tokens_seen": 88118672, "step": 130735 }, { "epoch": 3.1939999511396673, "grad_norm": 0.0006454504327848554, "learning_rate": 6.949890158066598e-07, "loss": 0.0001, "num_input_tokens_seen": 88122064, "step": 130740 }, { "epoch": 3.1941221019715145, "grad_norm": 0.005921836942434311, "learning_rate": 6.949078028934879e-07, "loss": 0.0, "num_input_tokens_seen": 88125136, "step": 130745 }, { "epoch": 3.1942442528033617, "grad_norm": 0.00011554537923075259, "learning_rate": 6.94826592199028e-07, "loss": 0.0002, "num_input_tokens_seen": 88128208, "step": 130750 }, { "epoch": 3.194366403635209, "grad_norm": 0.007586855906993151, "learning_rate": 6.947453837238716e-07, "loss": 0.0, "num_input_tokens_seen": 88131984, "step": 130755 }, { "epoch": 3.194488554467056, "grad_norm": 0.0022421926259994507, "learning_rate": 6.946641774686085e-07, "loss": 0.1728, "num_input_tokens_seen": 88135696, "step": 130760 }, { "epoch": 3.1946107052989032, "grad_norm": 0.05754800885915756, "learning_rate": 6.945829734338301e-07, "loss": 0.0, "num_input_tokens_seen": 88139088, "step": 130765 }, { "epoch": 3.1947328561307504, "grad_norm": 104.725341796875, "learning_rate": 6.94501771620126e-07, "loss": 0.0379, "num_input_tokens_seen": 88142224, "step": 130770 }, { "epoch": 3.1948550069625976, "grad_norm": 0.0007363682962022722, "learning_rate": 6.944205720280875e-07, "loss": 0.0019, "num_input_tokens_seen": 88145488, "step": 130775 }, { "epoch": 3.194977157794445, "grad_norm": 0.06700855493545532, "learning_rate": 6.943393746583047e-07, "loss": 0.0001, "num_input_tokens_seen": 88148624, "step": 130780 }, { "epoch": 3.1950993086262915, "grad_norm": 0.00593652855604887, "learning_rate": 6.942581795113681e-07, "loss": 0.0, "num_input_tokens_seen": 88152336, "step": 130785 }, { "epoch": 3.1952214594581387, "grad_norm": 0.0031424618791788816, "learning_rate": 6.941769865878684e-07, "loss": 0.0, "num_input_tokens_seen": 88155600, "step": 130790 }, { "epoch": 3.195343610289986, "grad_norm": 0.00047509727301076055, "learning_rate": 6.940957958883957e-07, "loss": 0.0003, "num_input_tokens_seen": 88159248, "step": 130795 }, { "epoch": 3.195465761121833, "grad_norm": 0.00025798825663514435, "learning_rate": 6.940146074135406e-07, "loss": 0.0683, "num_input_tokens_seen": 88162832, "step": 130800 }, { "epoch": 3.1955879119536803, "grad_norm": 0.0051554907113313675, "learning_rate": 6.93933421163894e-07, "loss": 0.0465, "num_input_tokens_seen": 88166096, "step": 130805 }, { "epoch": 3.1957100627855275, "grad_norm": 0.001389906508848071, "learning_rate": 6.938522371400455e-07, "loss": 0.0, "num_input_tokens_seen": 88169552, "step": 130810 }, { "epoch": 3.1958322136173747, "grad_norm": 0.01061465684324503, "learning_rate": 6.937710553425862e-07, "loss": 0.0, "num_input_tokens_seen": 88172688, "step": 130815 }, { "epoch": 3.195954364449222, "grad_norm": 0.0011689724633470178, "learning_rate": 6.936898757721059e-07, "loss": 0.0, "num_input_tokens_seen": 88175952, "step": 130820 }, { "epoch": 3.196076515281069, "grad_norm": 0.01699061505496502, "learning_rate": 6.936086984291954e-07, "loss": 0.0, "num_input_tokens_seen": 88179472, "step": 130825 }, { "epoch": 3.1961986661129163, "grad_norm": 0.0058431727811694145, "learning_rate": 6.935275233144447e-07, "loss": 0.0407, "num_input_tokens_seen": 88182928, "step": 130830 }, { "epoch": 3.1963208169447634, "grad_norm": 0.0075033195316791534, "learning_rate": 6.934463504284442e-07, "loss": 0.0, "num_input_tokens_seen": 88186064, "step": 130835 }, { "epoch": 3.1964429677766106, "grad_norm": 0.002028069458901882, "learning_rate": 6.933651797717847e-07, "loss": 0.0002, "num_input_tokens_seen": 88189072, "step": 130840 }, { "epoch": 3.196565118608458, "grad_norm": 0.0006016287952661514, "learning_rate": 6.932840113450555e-07, "loss": 0.0, "num_input_tokens_seen": 88192336, "step": 130845 }, { "epoch": 3.196687269440305, "grad_norm": 0.0034821913577616215, "learning_rate": 6.932028451488481e-07, "loss": 0.0, "num_input_tokens_seen": 88195728, "step": 130850 }, { "epoch": 3.196809420272152, "grad_norm": 8.157228876370937e-05, "learning_rate": 6.931216811837515e-07, "loss": 0.0, "num_input_tokens_seen": 88199312, "step": 130855 }, { "epoch": 3.1969315711039994, "grad_norm": 0.0009059004369191825, "learning_rate": 6.930405194503571e-07, "loss": 0.0, "num_input_tokens_seen": 88202576, "step": 130860 }, { "epoch": 3.1970537219358466, "grad_norm": 0.006109096109867096, "learning_rate": 6.929593599492543e-07, "loss": 0.0, "num_input_tokens_seen": 88205648, "step": 130865 }, { "epoch": 3.1971758727676933, "grad_norm": 0.0026130101177841425, "learning_rate": 6.928782026810333e-07, "loss": 0.0558, "num_input_tokens_seen": 88208784, "step": 130870 }, { "epoch": 3.1972980235995405, "grad_norm": 2.651002250786405e-05, "learning_rate": 6.927970476462853e-07, "loss": 0.0, "num_input_tokens_seen": 88211920, "step": 130875 }, { "epoch": 3.1974201744313877, "grad_norm": 0.016481151804327965, "learning_rate": 6.927158948455994e-07, "loss": 0.0, "num_input_tokens_seen": 88215376, "step": 130880 }, { "epoch": 3.197542325263235, "grad_norm": 0.15974406898021698, "learning_rate": 6.926347442795664e-07, "loss": 0.0006, "num_input_tokens_seen": 88219152, "step": 130885 }, { "epoch": 3.197664476095082, "grad_norm": 0.08364619314670563, "learning_rate": 6.925535959487761e-07, "loss": 0.0001, "num_input_tokens_seen": 88222992, "step": 130890 }, { "epoch": 3.1977866269269293, "grad_norm": 0.0003432470257394016, "learning_rate": 6.924724498538186e-07, "loss": 0.0, "num_input_tokens_seen": 88226320, "step": 130895 }, { "epoch": 3.1979087777587765, "grad_norm": 0.0074369474314153194, "learning_rate": 6.923913059952844e-07, "loss": 0.0, "num_input_tokens_seen": 88230032, "step": 130900 }, { "epoch": 3.1980309285906237, "grad_norm": 0.0005242819897830486, "learning_rate": 6.92310164373763e-07, "loss": 0.0, "num_input_tokens_seen": 88232976, "step": 130905 }, { "epoch": 3.198153079422471, "grad_norm": 0.004553093109279871, "learning_rate": 6.922290249898454e-07, "loss": 0.0, "num_input_tokens_seen": 88236432, "step": 130910 }, { "epoch": 3.198275230254318, "grad_norm": 0.0007548279245384037, "learning_rate": 6.921478878441206e-07, "loss": 0.0, "num_input_tokens_seen": 88240336, "step": 130915 }, { "epoch": 3.1983973810861652, "grad_norm": 3.5369539546081796e-05, "learning_rate": 6.920667529371795e-07, "loss": 0.0, "num_input_tokens_seen": 88243536, "step": 130920 }, { "epoch": 3.1985195319180124, "grad_norm": 0.005292957182973623, "learning_rate": 6.919856202696115e-07, "loss": 0.0, "num_input_tokens_seen": 88247440, "step": 130925 }, { "epoch": 3.1986416827498596, "grad_norm": 0.0005814823089167476, "learning_rate": 6.919044898420072e-07, "loss": 0.0, "num_input_tokens_seen": 88251472, "step": 130930 }, { "epoch": 3.198763833581707, "grad_norm": 0.01645071804523468, "learning_rate": 6.918233616549561e-07, "loss": 0.0, "num_input_tokens_seen": 88254864, "step": 130935 }, { "epoch": 3.198885984413554, "grad_norm": 0.0033709302078932524, "learning_rate": 6.917422357090485e-07, "loss": 0.0001, "num_input_tokens_seen": 88258128, "step": 130940 }, { "epoch": 3.199008135245401, "grad_norm": 0.04572080448269844, "learning_rate": 6.916611120048743e-07, "loss": 0.0, "num_input_tokens_seen": 88261584, "step": 130945 }, { "epoch": 3.1991302860772484, "grad_norm": 0.022833306342363358, "learning_rate": 6.915799905430233e-07, "loss": 0.0, "num_input_tokens_seen": 88264976, "step": 130950 }, { "epoch": 3.1992524369090956, "grad_norm": 0.0016878378810361028, "learning_rate": 6.914988713240855e-07, "loss": 0.0, "num_input_tokens_seen": 88268048, "step": 130955 }, { "epoch": 3.1993745877409427, "grad_norm": 0.07270152121782303, "learning_rate": 6.914177543486512e-07, "loss": 0.0, "num_input_tokens_seen": 88271568, "step": 130960 }, { "epoch": 3.1994967385727895, "grad_norm": 0.0013511275174096227, "learning_rate": 6.913366396173097e-07, "loss": 0.0555, "num_input_tokens_seen": 88275088, "step": 130965 }, { "epoch": 3.1996188894046367, "grad_norm": 0.002923275576904416, "learning_rate": 6.912555271306515e-07, "loss": 0.0, "num_input_tokens_seen": 88278352, "step": 130970 }, { "epoch": 3.199741040236484, "grad_norm": 0.00026840600185096264, "learning_rate": 6.911744168892657e-07, "loss": 0.0, "num_input_tokens_seen": 88281680, "step": 130975 }, { "epoch": 3.199863191068331, "grad_norm": 0.0009354413487017155, "learning_rate": 6.910933088937432e-07, "loss": 0.0619, "num_input_tokens_seen": 88284752, "step": 130980 }, { "epoch": 3.1999853419001782, "grad_norm": 0.0007869818946346641, "learning_rate": 6.910122031446726e-07, "loss": 0.0, "num_input_tokens_seen": 88287760, "step": 130985 }, { "epoch": 3.2001074927320254, "grad_norm": 0.00020354626758489758, "learning_rate": 6.909310996426447e-07, "loss": 0.0, "num_input_tokens_seen": 88291024, "step": 130990 }, { "epoch": 3.2002296435638726, "grad_norm": 0.0014506744919344783, "learning_rate": 6.908499983882492e-07, "loss": 0.0, "num_input_tokens_seen": 88294672, "step": 130995 }, { "epoch": 3.20035179439572, "grad_norm": 0.0020283234771341085, "learning_rate": 6.907688993820752e-07, "loss": 0.0, "num_input_tokens_seen": 88297936, "step": 131000 }, { "epoch": 3.200473945227567, "grad_norm": 0.013011399656534195, "learning_rate": 6.906878026247134e-07, "loss": 0.0001, "num_input_tokens_seen": 88301072, "step": 131005 }, { "epoch": 3.200596096059414, "grad_norm": 0.011319443583488464, "learning_rate": 6.906067081167526e-07, "loss": 0.0, "num_input_tokens_seen": 88304336, "step": 131010 }, { "epoch": 3.2007182468912614, "grad_norm": 0.04771203547716141, "learning_rate": 6.905256158587836e-07, "loss": 0.0001, "num_input_tokens_seen": 88307664, "step": 131015 }, { "epoch": 3.2008403977231086, "grad_norm": 5.148741245269775, "learning_rate": 6.904445258513949e-07, "loss": 0.0003, "num_input_tokens_seen": 88311056, "step": 131020 }, { "epoch": 3.2009625485549558, "grad_norm": 93.61454772949219, "learning_rate": 6.903634380951771e-07, "loss": 0.0625, "num_input_tokens_seen": 88314320, "step": 131025 }, { "epoch": 3.201084699386803, "grad_norm": 0.0014936138177290559, "learning_rate": 6.9028235259072e-07, "loss": 0.0, "num_input_tokens_seen": 88317712, "step": 131030 }, { "epoch": 3.20120685021865, "grad_norm": 0.0007606488652527332, "learning_rate": 6.902012693386123e-07, "loss": 0.0, "num_input_tokens_seen": 88321104, "step": 131035 }, { "epoch": 3.2013290010504973, "grad_norm": 0.003329877508804202, "learning_rate": 6.90120188339445e-07, "loss": 0.0, "num_input_tokens_seen": 88324368, "step": 131040 }, { "epoch": 3.2014511518823445, "grad_norm": 0.0009622600628063083, "learning_rate": 6.900391095938068e-07, "loss": 0.0, "num_input_tokens_seen": 88328208, "step": 131045 }, { "epoch": 3.2015733027141913, "grad_norm": 0.001263272948563099, "learning_rate": 6.899580331022873e-07, "loss": 0.0, "num_input_tokens_seen": 88331152, "step": 131050 }, { "epoch": 3.2016954535460385, "grad_norm": 0.018948759883642197, "learning_rate": 6.898769588654767e-07, "loss": 0.0, "num_input_tokens_seen": 88333904, "step": 131055 }, { "epoch": 3.2018176043778857, "grad_norm": 0.014238434843719006, "learning_rate": 6.89795886883964e-07, "loss": 0.0, "num_input_tokens_seen": 88336784, "step": 131060 }, { "epoch": 3.201939755209733, "grad_norm": 0.34706351161003113, "learning_rate": 6.897148171583394e-07, "loss": 0.034, "num_input_tokens_seen": 88340112, "step": 131065 }, { "epoch": 3.20206190604158, "grad_norm": 0.0014285926008597016, "learning_rate": 6.896337496891917e-07, "loss": 0.0, "num_input_tokens_seen": 88343312, "step": 131070 }, { "epoch": 3.202184056873427, "grad_norm": 0.008070665411651134, "learning_rate": 6.895526844771112e-07, "loss": 0.0, "num_input_tokens_seen": 88346896, "step": 131075 }, { "epoch": 3.2023062077052744, "grad_norm": 0.0005574771203100681, "learning_rate": 6.894716215226868e-07, "loss": 0.0, "num_input_tokens_seen": 88350480, "step": 131080 }, { "epoch": 3.2024283585371216, "grad_norm": 0.000559286039788276, "learning_rate": 6.893905608265084e-07, "loss": 0.0, "num_input_tokens_seen": 88354256, "step": 131085 }, { "epoch": 3.202550509368969, "grad_norm": 0.0003937402507290244, "learning_rate": 6.893095023891653e-07, "loss": 0.0, "num_input_tokens_seen": 88357584, "step": 131090 }, { "epoch": 3.202672660200816, "grad_norm": 0.09179575741291046, "learning_rate": 6.892284462112472e-07, "loss": 0.0, "num_input_tokens_seen": 88360976, "step": 131095 }, { "epoch": 3.202794811032663, "grad_norm": 0.0010902261128649116, "learning_rate": 6.891473922933435e-07, "loss": 0.0, "num_input_tokens_seen": 88364368, "step": 131100 }, { "epoch": 3.2029169618645104, "grad_norm": 0.0002610184019431472, "learning_rate": 6.890663406360434e-07, "loss": 0.0, "num_input_tokens_seen": 88367696, "step": 131105 }, { "epoch": 3.2030391126963575, "grad_norm": 0.0027821410913020372, "learning_rate": 6.889852912399363e-07, "loss": 0.0001, "num_input_tokens_seen": 88370896, "step": 131110 }, { "epoch": 3.2031612635282047, "grad_norm": 0.0007937629707157612, "learning_rate": 6.889042441056123e-07, "loss": 0.0, "num_input_tokens_seen": 88374544, "step": 131115 }, { "epoch": 3.203283414360052, "grad_norm": 0.0003279813681729138, "learning_rate": 6.888231992336599e-07, "loss": 0.0, "num_input_tokens_seen": 88378128, "step": 131120 }, { "epoch": 3.203405565191899, "grad_norm": 0.00046456785639747977, "learning_rate": 6.887421566246694e-07, "loss": 0.0305, "num_input_tokens_seen": 88381392, "step": 131125 }, { "epoch": 3.2035277160237463, "grad_norm": 0.08387045562267303, "learning_rate": 6.886611162792291e-07, "loss": 0.0, "num_input_tokens_seen": 88384592, "step": 131130 }, { "epoch": 3.203649866855593, "grad_norm": 0.01048725750297308, "learning_rate": 6.885800781979294e-07, "loss": 0.0, "num_input_tokens_seen": 88387984, "step": 131135 }, { "epoch": 3.2037720176874402, "grad_norm": 0.0010767376516014338, "learning_rate": 6.884990423813586e-07, "loss": 0.0, "num_input_tokens_seen": 88391632, "step": 131140 }, { "epoch": 3.2038941685192874, "grad_norm": 0.002124819438904524, "learning_rate": 6.884180088301068e-07, "loss": 0.0, "num_input_tokens_seen": 88394576, "step": 131145 }, { "epoch": 3.2040163193511346, "grad_norm": 0.002192183630540967, "learning_rate": 6.883369775447633e-07, "loss": 0.0616, "num_input_tokens_seen": 88397840, "step": 131150 }, { "epoch": 3.204138470182982, "grad_norm": 0.0002685803337953985, "learning_rate": 6.882559485259167e-07, "loss": 0.0, "num_input_tokens_seen": 88401296, "step": 131155 }, { "epoch": 3.204260621014829, "grad_norm": 0.001198447891511023, "learning_rate": 6.88174921774157e-07, "loss": 0.0, "num_input_tokens_seen": 88404880, "step": 131160 }, { "epoch": 3.204382771846676, "grad_norm": 0.001283994410187006, "learning_rate": 6.880938972900729e-07, "loss": 0.0332, "num_input_tokens_seen": 88408656, "step": 131165 }, { "epoch": 3.2045049226785234, "grad_norm": 0.003258587559685111, "learning_rate": 6.880128750742542e-07, "loss": 0.0, "num_input_tokens_seen": 88412432, "step": 131170 }, { "epoch": 3.2046270735103706, "grad_norm": 0.004086503759026527, "learning_rate": 6.879318551272894e-07, "loss": 0.0, "num_input_tokens_seen": 88415440, "step": 131175 }, { "epoch": 3.2047492243422178, "grad_norm": 0.00019456451991572976, "learning_rate": 6.87850837449768e-07, "loss": 0.0, "num_input_tokens_seen": 88418960, "step": 131180 }, { "epoch": 3.204871375174065, "grad_norm": 0.003474778262898326, "learning_rate": 6.877698220422799e-07, "loss": 0.0546, "num_input_tokens_seen": 88422352, "step": 131185 }, { "epoch": 3.204993526005912, "grad_norm": 0.012960633262991905, "learning_rate": 6.87688808905413e-07, "loss": 0.0, "num_input_tokens_seen": 88425424, "step": 131190 }, { "epoch": 3.2051156768377593, "grad_norm": 0.02482810989022255, "learning_rate": 6.876077980397575e-07, "loss": 0.0, "num_input_tokens_seen": 88428816, "step": 131195 }, { "epoch": 3.2052378276696065, "grad_norm": 0.05032927915453911, "learning_rate": 6.87526789445902e-07, "loss": 0.0002, "num_input_tokens_seen": 88432208, "step": 131200 }, { "epoch": 3.2053599785014537, "grad_norm": 0.11770392954349518, "learning_rate": 6.874457831244355e-07, "loss": 0.0001, "num_input_tokens_seen": 88435920, "step": 131205 }, { "epoch": 3.205482129333301, "grad_norm": 0.03212438151240349, "learning_rate": 6.873647790759478e-07, "loss": 0.0, "num_input_tokens_seen": 88440080, "step": 131210 }, { "epoch": 3.205604280165148, "grad_norm": 0.0017605361063033342, "learning_rate": 6.87283777301027e-07, "loss": 0.0, "num_input_tokens_seen": 88443536, "step": 131215 }, { "epoch": 3.2057264309969953, "grad_norm": 0.0023195864632725716, "learning_rate": 6.872027778002632e-07, "loss": 0.0001, "num_input_tokens_seen": 88447056, "step": 131220 }, { "epoch": 3.2058485818288425, "grad_norm": 0.3160790205001831, "learning_rate": 6.871217805742444e-07, "loss": 0.0002, "num_input_tokens_seen": 88450640, "step": 131225 }, { "epoch": 3.205970732660689, "grad_norm": 0.13889338076114655, "learning_rate": 6.870407856235608e-07, "loss": 0.0001, "num_input_tokens_seen": 88454352, "step": 131230 }, { "epoch": 3.2060928834925364, "grad_norm": 0.0015086831990629435, "learning_rate": 6.869597929488e-07, "loss": 0.0, "num_input_tokens_seen": 88458192, "step": 131235 }, { "epoch": 3.2062150343243836, "grad_norm": 0.017581794410943985, "learning_rate": 6.868788025505523e-07, "loss": 0.0, "num_input_tokens_seen": 88461648, "step": 131240 }, { "epoch": 3.206337185156231, "grad_norm": 0.002975418232381344, "learning_rate": 6.86797814429406e-07, "loss": 0.0, "num_input_tokens_seen": 88465616, "step": 131245 }, { "epoch": 3.206459335988078, "grad_norm": 0.0019408295629546046, "learning_rate": 6.867168285859504e-07, "loss": 0.0, "num_input_tokens_seen": 88469136, "step": 131250 }, { "epoch": 3.206581486819925, "grad_norm": 0.0003184815577697009, "learning_rate": 6.866358450207741e-07, "loss": 0.0001, "num_input_tokens_seen": 88473040, "step": 131255 }, { "epoch": 3.2067036376517724, "grad_norm": 0.004589362069964409, "learning_rate": 6.865548637344664e-07, "loss": 0.0, "num_input_tokens_seen": 88475984, "step": 131260 }, { "epoch": 3.2068257884836195, "grad_norm": 3.0053229331970215, "learning_rate": 6.864738847276162e-07, "loss": 0.0, "num_input_tokens_seen": 88479568, "step": 131265 }, { "epoch": 3.2069479393154667, "grad_norm": 21.975263595581055, "learning_rate": 6.863929080008118e-07, "loss": 0.0501, "num_input_tokens_seen": 88482640, "step": 131270 }, { "epoch": 3.207070090147314, "grad_norm": 0.0006934599950909615, "learning_rate": 6.863119335546423e-07, "loss": 0.0, "num_input_tokens_seen": 88486032, "step": 131275 }, { "epoch": 3.207192240979161, "grad_norm": 0.006196063477545977, "learning_rate": 6.862309613896975e-07, "loss": 0.0, "num_input_tokens_seen": 88490000, "step": 131280 }, { "epoch": 3.2073143918110083, "grad_norm": 0.000710879685357213, "learning_rate": 6.86149991506565e-07, "loss": 0.0, "num_input_tokens_seen": 88493264, "step": 131285 }, { "epoch": 3.2074365426428555, "grad_norm": 0.0015197633765637875, "learning_rate": 6.860690239058347e-07, "loss": 0.0, "num_input_tokens_seen": 88496976, "step": 131290 }, { "epoch": 3.2075586934747027, "grad_norm": 0.005204588640481234, "learning_rate": 6.859880585880944e-07, "loss": 0.0553, "num_input_tokens_seen": 88500432, "step": 131295 }, { "epoch": 3.20768084430655, "grad_norm": 0.0007456667954102159, "learning_rate": 6.859070955539337e-07, "loss": 0.0, "num_input_tokens_seen": 88503952, "step": 131300 }, { "epoch": 3.207802995138397, "grad_norm": 0.00033635873114690185, "learning_rate": 6.858261348039411e-07, "loss": 0.0377, "num_input_tokens_seen": 88507280, "step": 131305 }, { "epoch": 3.2079251459702443, "grad_norm": 0.003113040467724204, "learning_rate": 6.85745176338705e-07, "loss": 0.0, "num_input_tokens_seen": 88510672, "step": 131310 }, { "epoch": 3.208047296802091, "grad_norm": 0.00037906321813352406, "learning_rate": 6.856642201588149e-07, "loss": 0.0, "num_input_tokens_seen": 88514576, "step": 131315 }, { "epoch": 3.208169447633938, "grad_norm": 0.0035938401706516743, "learning_rate": 6.855832662648589e-07, "loss": 0.0, "num_input_tokens_seen": 88518224, "step": 131320 }, { "epoch": 3.2082915984657854, "grad_norm": 0.002629627473652363, "learning_rate": 6.855023146574262e-07, "loss": 0.0, "num_input_tokens_seen": 88521872, "step": 131325 }, { "epoch": 3.2084137492976326, "grad_norm": 67.88191986083984, "learning_rate": 6.854213653371049e-07, "loss": 0.0526, "num_input_tokens_seen": 88525264, "step": 131330 }, { "epoch": 3.2085359001294798, "grad_norm": 0.011571920476853848, "learning_rate": 6.85340418304484e-07, "loss": 0.0001, "num_input_tokens_seen": 88528272, "step": 131335 }, { "epoch": 3.208658050961327, "grad_norm": 0.0011708816746249795, "learning_rate": 6.852594735601527e-07, "loss": 0.0, "num_input_tokens_seen": 88531728, "step": 131340 }, { "epoch": 3.208780201793174, "grad_norm": 0.0015514298574998975, "learning_rate": 6.851785311046987e-07, "loss": 0.0154, "num_input_tokens_seen": 88534992, "step": 131345 }, { "epoch": 3.2089023526250213, "grad_norm": 0.03095664456486702, "learning_rate": 6.850975909387115e-07, "loss": 0.0543, "num_input_tokens_seen": 88538384, "step": 131350 }, { "epoch": 3.2090245034568685, "grad_norm": 0.2210804671049118, "learning_rate": 6.850166530627791e-07, "loss": 0.0552, "num_input_tokens_seen": 88541392, "step": 131355 }, { "epoch": 3.2091466542887157, "grad_norm": 1.6713513135910034, "learning_rate": 6.849357174774901e-07, "loss": 0.0504, "num_input_tokens_seen": 88544656, "step": 131360 }, { "epoch": 3.209268805120563, "grad_norm": 0.00557379936799407, "learning_rate": 6.848547841834338e-07, "loss": 0.0004, "num_input_tokens_seen": 88548176, "step": 131365 }, { "epoch": 3.20939095595241, "grad_norm": 0.004929456394165754, "learning_rate": 6.847738531811978e-07, "loss": 0.0, "num_input_tokens_seen": 88551312, "step": 131370 }, { "epoch": 3.2095131067842573, "grad_norm": 0.22072072327136993, "learning_rate": 6.846929244713717e-07, "loss": 0.0, "num_input_tokens_seen": 88554832, "step": 131375 }, { "epoch": 3.2096352576161045, "grad_norm": 0.002108390908688307, "learning_rate": 6.846119980545429e-07, "loss": 0.0, "num_input_tokens_seen": 88557712, "step": 131380 }, { "epoch": 3.2097574084479517, "grad_norm": 0.004526549018919468, "learning_rate": 6.845310739313008e-07, "loss": 0.0, "num_input_tokens_seen": 88560912, "step": 131385 }, { "epoch": 3.209879559279799, "grad_norm": 0.0006033982499502599, "learning_rate": 6.844501521022333e-07, "loss": 0.0003, "num_input_tokens_seen": 88564176, "step": 131390 }, { "epoch": 3.210001710111646, "grad_norm": 0.0009200237691402435, "learning_rate": 6.843692325679293e-07, "loss": 0.0052, "num_input_tokens_seen": 88567440, "step": 131395 }, { "epoch": 3.2101238609434932, "grad_norm": 0.00029823146178387105, "learning_rate": 6.842883153289771e-07, "loss": 0.0001, "num_input_tokens_seen": 88571152, "step": 131400 }, { "epoch": 3.2102460117753404, "grad_norm": 0.004292737692594528, "learning_rate": 6.842074003859654e-07, "loss": 0.0, "num_input_tokens_seen": 88574800, "step": 131405 }, { "epoch": 3.210368162607187, "grad_norm": 0.016947563737630844, "learning_rate": 6.841264877394822e-07, "loss": 0.0, "num_input_tokens_seen": 88578640, "step": 131410 }, { "epoch": 3.2104903134390343, "grad_norm": 0.17108048498630524, "learning_rate": 6.840455773901163e-07, "loss": 0.0, "num_input_tokens_seen": 88582032, "step": 131415 }, { "epoch": 3.2106124642708815, "grad_norm": 0.00048168751527555287, "learning_rate": 6.839646693384559e-07, "loss": 0.0, "num_input_tokens_seen": 88585488, "step": 131420 }, { "epoch": 3.2107346151027287, "grad_norm": 0.0014778217300772667, "learning_rate": 6.838837635850894e-07, "loss": 0.0, "num_input_tokens_seen": 88588688, "step": 131425 }, { "epoch": 3.210856765934576, "grad_norm": 3.849087443086319e-05, "learning_rate": 6.83802860130605e-07, "loss": 0.0, "num_input_tokens_seen": 88592400, "step": 131430 }, { "epoch": 3.210978916766423, "grad_norm": 0.00019800262816715986, "learning_rate": 6.837219589755916e-07, "loss": 0.1358, "num_input_tokens_seen": 88595536, "step": 131435 }, { "epoch": 3.2111010675982703, "grad_norm": 0.00025983096566051245, "learning_rate": 6.836410601206368e-07, "loss": 0.0, "num_input_tokens_seen": 88598800, "step": 131440 }, { "epoch": 3.2112232184301175, "grad_norm": 0.0001302832388319075, "learning_rate": 6.835601635663297e-07, "loss": 0.0003, "num_input_tokens_seen": 88602128, "step": 131445 }, { "epoch": 3.2113453692619647, "grad_norm": 0.16828882694244385, "learning_rate": 6.834792693132578e-07, "loss": 0.0001, "num_input_tokens_seen": 88605456, "step": 131450 }, { "epoch": 3.211467520093812, "grad_norm": 0.1079067662358284, "learning_rate": 6.8339837736201e-07, "loss": 0.0, "num_input_tokens_seen": 88608656, "step": 131455 }, { "epoch": 3.211589670925659, "grad_norm": 0.0008097724057734013, "learning_rate": 6.833174877131746e-07, "loss": 0.0002, "num_input_tokens_seen": 88611728, "step": 131460 }, { "epoch": 3.2117118217575062, "grad_norm": 0.00048446975415572524, "learning_rate": 6.832366003673392e-07, "loss": 0.1079, "num_input_tokens_seen": 88615056, "step": 131465 }, { "epoch": 3.2118339725893534, "grad_norm": 0.0007245682063512504, "learning_rate": 6.831557153250929e-07, "loss": 0.0, "num_input_tokens_seen": 88618128, "step": 131470 }, { "epoch": 3.2119561234212006, "grad_norm": 0.0002163596946047619, "learning_rate": 6.830748325870229e-07, "loss": 0.0006, "num_input_tokens_seen": 88621136, "step": 131475 }, { "epoch": 3.212078274253048, "grad_norm": 34.87563705444336, "learning_rate": 6.829939521537184e-07, "loss": 0.0466, "num_input_tokens_seen": 88624784, "step": 131480 }, { "epoch": 3.212200425084895, "grad_norm": 34.09080123901367, "learning_rate": 6.829130740257668e-07, "loss": 0.0501, "num_input_tokens_seen": 88627792, "step": 131485 }, { "epoch": 3.212322575916742, "grad_norm": 0.01155630499124527, "learning_rate": 6.828321982037566e-07, "loss": 0.0589, "num_input_tokens_seen": 88631376, "step": 131490 }, { "epoch": 3.212444726748589, "grad_norm": 0.004634066019207239, "learning_rate": 6.827513246882763e-07, "loss": 0.0, "num_input_tokens_seen": 88634576, "step": 131495 }, { "epoch": 3.212566877580436, "grad_norm": 0.00048246190999634564, "learning_rate": 6.826704534799135e-07, "loss": 0.0917, "num_input_tokens_seen": 88637584, "step": 131500 }, { "epoch": 3.2126890284122833, "grad_norm": 0.009547368623316288, "learning_rate": 6.825895845792567e-07, "loss": 0.0, "num_input_tokens_seen": 88641296, "step": 131505 }, { "epoch": 3.2128111792441305, "grad_norm": 0.030916303396224976, "learning_rate": 6.825087179868935e-07, "loss": 0.0, "num_input_tokens_seen": 88644624, "step": 131510 }, { "epoch": 3.2129333300759777, "grad_norm": 0.012111157178878784, "learning_rate": 6.824278537034125e-07, "loss": 0.0, "num_input_tokens_seen": 88647696, "step": 131515 }, { "epoch": 3.213055480907825, "grad_norm": 0.009429936297237873, "learning_rate": 6.823469917294017e-07, "loss": 0.0001, "num_input_tokens_seen": 88651088, "step": 131520 }, { "epoch": 3.213177631739672, "grad_norm": 0.0030681893695145845, "learning_rate": 6.822661320654486e-07, "loss": 0.0, "num_input_tokens_seen": 88654160, "step": 131525 }, { "epoch": 3.2132997825715193, "grad_norm": 0.0012531710090115666, "learning_rate": 6.821852747121422e-07, "loss": 0.0001, "num_input_tokens_seen": 88658000, "step": 131530 }, { "epoch": 3.2134219334033665, "grad_norm": 0.003988673444837332, "learning_rate": 6.821044196700694e-07, "loss": 0.0002, "num_input_tokens_seen": 88661392, "step": 131535 }, { "epoch": 3.2135440842352136, "grad_norm": 0.0003183768130838871, "learning_rate": 6.820235669398192e-07, "loss": 0.0001, "num_input_tokens_seen": 88665488, "step": 131540 }, { "epoch": 3.213666235067061, "grad_norm": 0.0008758779149502516, "learning_rate": 6.819427165219789e-07, "loss": 0.0001, "num_input_tokens_seen": 88668880, "step": 131545 }, { "epoch": 3.213788385898908, "grad_norm": 0.0005794035969302058, "learning_rate": 6.818618684171367e-07, "loss": 0.0, "num_input_tokens_seen": 88672528, "step": 131550 }, { "epoch": 3.213910536730755, "grad_norm": 0.00545384269207716, "learning_rate": 6.817810226258806e-07, "loss": 0.0348, "num_input_tokens_seen": 88675536, "step": 131555 }, { "epoch": 3.2140326875626024, "grad_norm": 0.0026415474712848663, "learning_rate": 6.817001791487986e-07, "loss": 0.0, "num_input_tokens_seen": 88678800, "step": 131560 }, { "epoch": 3.2141548383944496, "grad_norm": 0.024035947397351265, "learning_rate": 6.816193379864785e-07, "loss": 0.0525, "num_input_tokens_seen": 88682320, "step": 131565 }, { "epoch": 3.214276989226297, "grad_norm": 0.0018660565838217735, "learning_rate": 6.815384991395081e-07, "loss": 0.0, "num_input_tokens_seen": 88686096, "step": 131570 }, { "epoch": 3.214399140058144, "grad_norm": 0.0012469434877857566, "learning_rate": 6.814576626084757e-07, "loss": 0.0001, "num_input_tokens_seen": 88689296, "step": 131575 }, { "epoch": 3.214521290889991, "grad_norm": 0.0005144423921592534, "learning_rate": 6.813768283939683e-07, "loss": 0.0, "num_input_tokens_seen": 88692368, "step": 131580 }, { "epoch": 3.2146434417218384, "grad_norm": 0.007941600866615772, "learning_rate": 6.812959964965746e-07, "loss": 0.0, "num_input_tokens_seen": 88695632, "step": 131585 }, { "epoch": 3.214765592553685, "grad_norm": 0.002308615716174245, "learning_rate": 6.812151669168821e-07, "loss": 0.0, "num_input_tokens_seen": 88699088, "step": 131590 }, { "epoch": 3.2148877433855323, "grad_norm": 0.0004510280559770763, "learning_rate": 6.811343396554786e-07, "loss": 0.0838, "num_input_tokens_seen": 88702096, "step": 131595 }, { "epoch": 3.2150098942173795, "grad_norm": 0.4567067325115204, "learning_rate": 6.810535147129524e-07, "loss": 0.0002, "num_input_tokens_seen": 88704848, "step": 131600 }, { "epoch": 3.2151320450492267, "grad_norm": 0.00045220693573355675, "learning_rate": 6.809726920898902e-07, "loss": 0.0513, "num_input_tokens_seen": 88708432, "step": 131605 }, { "epoch": 3.215254195881074, "grad_norm": 0.003778433194383979, "learning_rate": 6.808918717868805e-07, "loss": 0.0002, "num_input_tokens_seen": 88711696, "step": 131610 }, { "epoch": 3.215376346712921, "grad_norm": 0.01336110569536686, "learning_rate": 6.808110538045114e-07, "loss": 0.0001, "num_input_tokens_seen": 88714960, "step": 131615 }, { "epoch": 3.2154984975447682, "grad_norm": 0.008328008465468884, "learning_rate": 6.807302381433697e-07, "loss": 0.0, "num_input_tokens_seen": 88718672, "step": 131620 }, { "epoch": 3.2156206483766154, "grad_norm": 0.0011965453159064054, "learning_rate": 6.806494248040439e-07, "loss": 0.0688, "num_input_tokens_seen": 88722128, "step": 131625 }, { "epoch": 3.2157427992084626, "grad_norm": 0.0028489800170063972, "learning_rate": 6.805686137871211e-07, "loss": 0.0371, "num_input_tokens_seen": 88725200, "step": 131630 }, { "epoch": 3.21586495004031, "grad_norm": 0.0015178770991042256, "learning_rate": 6.804878050931895e-07, "loss": 0.0002, "num_input_tokens_seen": 88728144, "step": 131635 }, { "epoch": 3.215987100872157, "grad_norm": 0.025252878665924072, "learning_rate": 6.804069987228363e-07, "loss": 0.0002, "num_input_tokens_seen": 88731536, "step": 131640 }, { "epoch": 3.216109251704004, "grad_norm": 0.0018437153194099665, "learning_rate": 6.803261946766492e-07, "loss": 0.0001, "num_input_tokens_seen": 88734864, "step": 131645 }, { "epoch": 3.2162314025358514, "grad_norm": 0.0007309846114367247, "learning_rate": 6.802453929552165e-07, "loss": 0.0551, "num_input_tokens_seen": 88738256, "step": 131650 }, { "epoch": 3.2163535533676986, "grad_norm": 0.04808569699525833, "learning_rate": 6.801645935591249e-07, "loss": 0.0355, "num_input_tokens_seen": 88741392, "step": 131655 }, { "epoch": 3.2164757041995458, "grad_norm": 0.001859356532804668, "learning_rate": 6.800837964889627e-07, "loss": 0.0002, "num_input_tokens_seen": 88744976, "step": 131660 }, { "epoch": 3.216597855031393, "grad_norm": 0.012184222228825092, "learning_rate": 6.800030017453171e-07, "loss": 0.0001, "num_input_tokens_seen": 88748304, "step": 131665 }, { "epoch": 3.21672000586324, "grad_norm": 0.0017921392573043704, "learning_rate": 6.799222093287756e-07, "loss": 0.0, "num_input_tokens_seen": 88751504, "step": 131670 }, { "epoch": 3.216842156695087, "grad_norm": 0.0525604784488678, "learning_rate": 6.79841419239926e-07, "loss": 0.0001, "num_input_tokens_seen": 88754256, "step": 131675 }, { "epoch": 3.216964307526934, "grad_norm": 0.008412518538534641, "learning_rate": 6.797606314793556e-07, "loss": 0.0, "num_input_tokens_seen": 88757264, "step": 131680 }, { "epoch": 3.2170864583587813, "grad_norm": 0.006365551147609949, "learning_rate": 6.796798460476523e-07, "loss": 0.0001, "num_input_tokens_seen": 88760400, "step": 131685 }, { "epoch": 3.2172086091906285, "grad_norm": 0.039969027042388916, "learning_rate": 6.795990629454029e-07, "loss": 0.0004, "num_input_tokens_seen": 88763920, "step": 131690 }, { "epoch": 3.2173307600224756, "grad_norm": 0.00221992377191782, "learning_rate": 6.795182821731957e-07, "loss": 0.0001, "num_input_tokens_seen": 88767056, "step": 131695 }, { "epoch": 3.217452910854323, "grad_norm": 0.002486801240593195, "learning_rate": 6.794375037316173e-07, "loss": 0.0, "num_input_tokens_seen": 88770064, "step": 131700 }, { "epoch": 3.21757506168617, "grad_norm": 0.003136269748210907, "learning_rate": 6.793567276212557e-07, "loss": 0.0, "num_input_tokens_seen": 88773136, "step": 131705 }, { "epoch": 3.217697212518017, "grad_norm": 0.0005143572925589979, "learning_rate": 6.792759538426987e-07, "loss": 0.0, "num_input_tokens_seen": 88776208, "step": 131710 }, { "epoch": 3.2178193633498644, "grad_norm": 0.00440189940854907, "learning_rate": 6.791951823965327e-07, "loss": 0.0001, "num_input_tokens_seen": 88779664, "step": 131715 }, { "epoch": 3.2179415141817116, "grad_norm": 0.000603349064476788, "learning_rate": 6.791144132833459e-07, "loss": 0.0, "num_input_tokens_seen": 88783056, "step": 131720 }, { "epoch": 3.218063665013559, "grad_norm": 0.0004256120591890067, "learning_rate": 6.790336465037254e-07, "loss": 0.0, "num_input_tokens_seen": 88786128, "step": 131725 }, { "epoch": 3.218185815845406, "grad_norm": 0.00017496588407084346, "learning_rate": 6.789528820582587e-07, "loss": 0.0001, "num_input_tokens_seen": 88789456, "step": 131730 }, { "epoch": 3.218307966677253, "grad_norm": 0.04625757038593292, "learning_rate": 6.788721199475325e-07, "loss": 0.0, "num_input_tokens_seen": 88793168, "step": 131735 }, { "epoch": 3.2184301175091004, "grad_norm": 0.011211957782506943, "learning_rate": 6.787913601721346e-07, "loss": 0.0001, "num_input_tokens_seen": 88796368, "step": 131740 }, { "epoch": 3.2185522683409475, "grad_norm": 0.00312986271455884, "learning_rate": 6.787106027326529e-07, "loss": 0.0, "num_input_tokens_seen": 88799824, "step": 131745 }, { "epoch": 3.2186744191727947, "grad_norm": 0.020161407068371773, "learning_rate": 6.786298476296737e-07, "loss": 0.0, "num_input_tokens_seen": 88803280, "step": 131750 }, { "epoch": 3.218796570004642, "grad_norm": 0.00044350314419716597, "learning_rate": 6.785490948637849e-07, "loss": 0.0595, "num_input_tokens_seen": 88806928, "step": 131755 }, { "epoch": 3.2189187208364887, "grad_norm": 0.0013805481139570475, "learning_rate": 6.784683444355732e-07, "loss": 0.105, "num_input_tokens_seen": 88810192, "step": 131760 }, { "epoch": 3.219040871668336, "grad_norm": 0.01782987453043461, "learning_rate": 6.783875963456264e-07, "loss": 0.0001, "num_input_tokens_seen": 88813456, "step": 131765 }, { "epoch": 3.219163022500183, "grad_norm": 0.0005228363443166018, "learning_rate": 6.783068505945319e-07, "loss": 0.0, "num_input_tokens_seen": 88817360, "step": 131770 }, { "epoch": 3.2192851733320302, "grad_norm": 0.12143167853355408, "learning_rate": 6.782261071828759e-07, "loss": 0.0382, "num_input_tokens_seen": 88820560, "step": 131775 }, { "epoch": 3.2194073241638774, "grad_norm": 0.09187977761030197, "learning_rate": 6.781453661112466e-07, "loss": 0.0001, "num_input_tokens_seen": 88823824, "step": 131780 }, { "epoch": 3.2195294749957246, "grad_norm": 0.004339349456131458, "learning_rate": 6.780646273802304e-07, "loss": 0.0, "num_input_tokens_seen": 88827408, "step": 131785 }, { "epoch": 3.219651625827572, "grad_norm": 0.0006389893242157996, "learning_rate": 6.779838909904152e-07, "loss": 0.0002, "num_input_tokens_seen": 88830480, "step": 131790 }, { "epoch": 3.219773776659419, "grad_norm": 0.0051666791550815105, "learning_rate": 6.779031569423874e-07, "loss": 0.0, "num_input_tokens_seen": 88834000, "step": 131795 }, { "epoch": 3.219895927491266, "grad_norm": 0.004778078757226467, "learning_rate": 6.778224252367344e-07, "loss": 0.0002, "num_input_tokens_seen": 88837072, "step": 131800 }, { "epoch": 3.2200180783231134, "grad_norm": 0.008272204548120499, "learning_rate": 6.777416958740438e-07, "loss": 0.0, "num_input_tokens_seen": 88840272, "step": 131805 }, { "epoch": 3.2201402291549606, "grad_norm": 0.008195118978619576, "learning_rate": 6.77660968854902e-07, "loss": 0.0, "num_input_tokens_seen": 88843664, "step": 131810 }, { "epoch": 3.2202623799868078, "grad_norm": 0.016739945858716965, "learning_rate": 6.775802441798966e-07, "loss": 0.0, "num_input_tokens_seen": 88847056, "step": 131815 }, { "epoch": 3.220384530818655, "grad_norm": 0.010644571855664253, "learning_rate": 6.774995218496142e-07, "loss": 0.0, "num_input_tokens_seen": 88850384, "step": 131820 }, { "epoch": 3.220506681650502, "grad_norm": 0.00225659366697073, "learning_rate": 6.774188018646421e-07, "loss": 0.0001, "num_input_tokens_seen": 88853392, "step": 131825 }, { "epoch": 3.2206288324823493, "grad_norm": 0.0010362331522628665, "learning_rate": 6.773380842255671e-07, "loss": 0.0, "num_input_tokens_seen": 88856976, "step": 131830 }, { "epoch": 3.2207509833141965, "grad_norm": 0.03388144075870514, "learning_rate": 6.772573689329763e-07, "loss": 0.0, "num_input_tokens_seen": 88860496, "step": 131835 }, { "epoch": 3.2208731341460437, "grad_norm": 0.014827468432486057, "learning_rate": 6.77176655987457e-07, "loss": 0.0, "num_input_tokens_seen": 88864336, "step": 131840 }, { "epoch": 3.220995284977891, "grad_norm": 0.009838799946010113, "learning_rate": 6.770959453895955e-07, "loss": 0.0, "num_input_tokens_seen": 88867856, "step": 131845 }, { "epoch": 3.221117435809738, "grad_norm": 0.005385571159422398, "learning_rate": 6.770152371399795e-07, "loss": 0.0, "num_input_tokens_seen": 88871632, "step": 131850 }, { "epoch": 3.221239586641585, "grad_norm": 0.09350186586380005, "learning_rate": 6.769345312391952e-07, "loss": 0.0, "num_input_tokens_seen": 88874960, "step": 131855 }, { "epoch": 3.221361737473432, "grad_norm": 0.0004697141412179917, "learning_rate": 6.7685382768783e-07, "loss": 0.0, "num_input_tokens_seen": 88878416, "step": 131860 }, { "epoch": 3.221483888305279, "grad_norm": 35.65409851074219, "learning_rate": 6.767731264864709e-07, "loss": 0.0563, "num_input_tokens_seen": 88881744, "step": 131865 }, { "epoch": 3.2216060391371264, "grad_norm": 0.0021280786022543907, "learning_rate": 6.766924276357044e-07, "loss": 0.0, "num_input_tokens_seen": 88884944, "step": 131870 }, { "epoch": 3.2217281899689736, "grad_norm": 0.001438329927623272, "learning_rate": 6.766117311361177e-07, "loss": 0.0, "num_input_tokens_seen": 88888016, "step": 131875 }, { "epoch": 3.2218503408008208, "grad_norm": 0.0007603879203088582, "learning_rate": 6.765310369882973e-07, "loss": 0.0, "num_input_tokens_seen": 88891728, "step": 131880 }, { "epoch": 3.221972491632668, "grad_norm": 0.0012879845453426242, "learning_rate": 6.764503451928305e-07, "loss": 0.0, "num_input_tokens_seen": 88894736, "step": 131885 }, { "epoch": 3.222094642464515, "grad_norm": 0.001136791193857789, "learning_rate": 6.763696557503034e-07, "loss": 0.0181, "num_input_tokens_seen": 88898064, "step": 131890 }, { "epoch": 3.2222167932963623, "grad_norm": 0.083034448325634, "learning_rate": 6.762889686613032e-07, "loss": 0.0378, "num_input_tokens_seen": 88901456, "step": 131895 }, { "epoch": 3.2223389441282095, "grad_norm": 0.0010081586660817266, "learning_rate": 6.762082839264169e-07, "loss": 0.0, "num_input_tokens_seen": 88904912, "step": 131900 }, { "epoch": 3.2224610949600567, "grad_norm": 0.0007607207517139614, "learning_rate": 6.761276015462309e-07, "loss": 0.0001, "num_input_tokens_seen": 88907984, "step": 131905 }, { "epoch": 3.222583245791904, "grad_norm": 0.0006288554286584258, "learning_rate": 6.760469215213324e-07, "loss": 0.0, "num_input_tokens_seen": 88911632, "step": 131910 }, { "epoch": 3.222705396623751, "grad_norm": 0.004640562925487757, "learning_rate": 6.759662438523074e-07, "loss": 0.0317, "num_input_tokens_seen": 88915152, "step": 131915 }, { "epoch": 3.2228275474555983, "grad_norm": 0.002924085594713688, "learning_rate": 6.758855685397432e-07, "loss": 0.0, "num_input_tokens_seen": 88918416, "step": 131920 }, { "epoch": 3.2229496982874455, "grad_norm": 0.0007594149792566895, "learning_rate": 6.758048955842266e-07, "loss": 0.0, "num_input_tokens_seen": 88921488, "step": 131925 }, { "epoch": 3.2230718491192927, "grad_norm": 219.20193481445312, "learning_rate": 6.757242249863435e-07, "loss": 0.0663, "num_input_tokens_seen": 88924944, "step": 131930 }, { "epoch": 3.22319399995114, "grad_norm": 0.006914704106748104, "learning_rate": 6.756435567466816e-07, "loss": 0.0397, "num_input_tokens_seen": 88928144, "step": 131935 }, { "epoch": 3.2233161507829866, "grad_norm": 0.003912906628102064, "learning_rate": 6.755628908658265e-07, "loss": 0.0, "num_input_tokens_seen": 88931472, "step": 131940 }, { "epoch": 3.223438301614834, "grad_norm": 0.0022785658948123455, "learning_rate": 6.754822273443657e-07, "loss": 0.0, "num_input_tokens_seen": 88935248, "step": 131945 }, { "epoch": 3.223560452446681, "grad_norm": 0.0023476288188248873, "learning_rate": 6.754015661828851e-07, "loss": 0.0, "num_input_tokens_seen": 88938576, "step": 131950 }, { "epoch": 3.223682603278528, "grad_norm": 0.06342912465333939, "learning_rate": 6.753209073819717e-07, "loss": 0.0, "num_input_tokens_seen": 88942032, "step": 131955 }, { "epoch": 3.2238047541103754, "grad_norm": 0.0013723451411351562, "learning_rate": 6.752402509422121e-07, "loss": 0.0, "num_input_tokens_seen": 88945104, "step": 131960 }, { "epoch": 3.2239269049422226, "grad_norm": 0.002334070857614279, "learning_rate": 6.751595968641925e-07, "loss": 0.0, "num_input_tokens_seen": 88949072, "step": 131965 }, { "epoch": 3.2240490557740697, "grad_norm": 0.005523113068193197, "learning_rate": 6.750789451485002e-07, "loss": 0.0728, "num_input_tokens_seen": 88952976, "step": 131970 }, { "epoch": 3.224171206605917, "grad_norm": 0.08320626616477966, "learning_rate": 6.74998295795721e-07, "loss": 0.0, "num_input_tokens_seen": 88956304, "step": 131975 }, { "epoch": 3.224293357437764, "grad_norm": 0.0005697169108316302, "learning_rate": 6.749176488064414e-07, "loss": 0.0, "num_input_tokens_seen": 88959696, "step": 131980 }, { "epoch": 3.2244155082696113, "grad_norm": 0.002113824477419257, "learning_rate": 6.748370041812482e-07, "loss": 0.0, "num_input_tokens_seen": 88963088, "step": 131985 }, { "epoch": 3.2245376591014585, "grad_norm": 0.0005294289439916611, "learning_rate": 6.747563619207276e-07, "loss": 0.0006, "num_input_tokens_seen": 88966288, "step": 131990 }, { "epoch": 3.2246598099333057, "grad_norm": 0.003573896363377571, "learning_rate": 6.746757220254667e-07, "loss": 0.0, "num_input_tokens_seen": 88970256, "step": 131995 }, { "epoch": 3.224781960765153, "grad_norm": 0.0011918267700821161, "learning_rate": 6.745950844960509e-07, "loss": 0.0001, "num_input_tokens_seen": 88973520, "step": 132000 }, { "epoch": 3.224904111597, "grad_norm": 0.033025145530700684, "learning_rate": 6.745144493330676e-07, "loss": 0.0, "num_input_tokens_seen": 88977104, "step": 132005 }, { "epoch": 3.2250262624288473, "grad_norm": 0.000612385047134012, "learning_rate": 6.744338165371023e-07, "loss": 0.0002, "num_input_tokens_seen": 88980560, "step": 132010 }, { "epoch": 3.2251484132606945, "grad_norm": 0.001855800743214786, "learning_rate": 6.74353186108742e-07, "loss": 0.0, "num_input_tokens_seen": 88983696, "step": 132015 }, { "epoch": 3.2252705640925416, "grad_norm": 0.01201531384140253, "learning_rate": 6.742725580485732e-07, "loss": 0.0001, "num_input_tokens_seen": 88986704, "step": 132020 }, { "epoch": 3.225392714924389, "grad_norm": 0.002281307941302657, "learning_rate": 6.741919323571818e-07, "loss": 0.0001, "num_input_tokens_seen": 88989584, "step": 132025 }, { "epoch": 3.225514865756236, "grad_norm": 0.008396490477025509, "learning_rate": 6.741113090351544e-07, "loss": 0.0636, "num_input_tokens_seen": 88992976, "step": 132030 }, { "epoch": 3.2256370165880828, "grad_norm": 0.28072237968444824, "learning_rate": 6.740306880830771e-07, "loss": 0.0001, "num_input_tokens_seen": 88996368, "step": 132035 }, { "epoch": 3.22575916741993, "grad_norm": 0.46522748470306396, "learning_rate": 6.739500695015365e-07, "loss": 0.0001, "num_input_tokens_seen": 88999568, "step": 132040 }, { "epoch": 3.225881318251777, "grad_norm": 2.098423480987549, "learning_rate": 6.738694532911184e-07, "loss": 0.0004, "num_input_tokens_seen": 89002896, "step": 132045 }, { "epoch": 3.2260034690836243, "grad_norm": 0.030650269240140915, "learning_rate": 6.737888394524092e-07, "loss": 0.0001, "num_input_tokens_seen": 89006480, "step": 132050 }, { "epoch": 3.2261256199154715, "grad_norm": 0.007767067290842533, "learning_rate": 6.737082279859958e-07, "loss": 0.0002, "num_input_tokens_seen": 89009936, "step": 132055 }, { "epoch": 3.2262477707473187, "grad_norm": 0.019312912598252296, "learning_rate": 6.736276188924636e-07, "loss": 0.0, "num_input_tokens_seen": 89013136, "step": 132060 }, { "epoch": 3.226369921579166, "grad_norm": 0.0337306447327137, "learning_rate": 6.735470121723994e-07, "loss": 0.0, "num_input_tokens_seen": 89016528, "step": 132065 }, { "epoch": 3.226492072411013, "grad_norm": 0.00048753563896752894, "learning_rate": 6.734664078263887e-07, "loss": 0.0, "num_input_tokens_seen": 89020368, "step": 132070 }, { "epoch": 3.2266142232428603, "grad_norm": 0.0010278359986841679, "learning_rate": 6.733858058550185e-07, "loss": 0.0, "num_input_tokens_seen": 89023568, "step": 132075 }, { "epoch": 3.2267363740747075, "grad_norm": 0.00571131706237793, "learning_rate": 6.733052062588747e-07, "loss": 0.0001, "num_input_tokens_seen": 89026960, "step": 132080 }, { "epoch": 3.2268585249065547, "grad_norm": 0.0007071031141094863, "learning_rate": 6.732246090385428e-07, "loss": 0.0001, "num_input_tokens_seen": 89030480, "step": 132085 }, { "epoch": 3.226980675738402, "grad_norm": 0.0007928020786494017, "learning_rate": 6.7314401419461e-07, "loss": 0.0, "num_input_tokens_seen": 89034064, "step": 132090 }, { "epoch": 3.227102826570249, "grad_norm": 0.012956708669662476, "learning_rate": 6.730634217276614e-07, "loss": 0.0, "num_input_tokens_seen": 89037584, "step": 132095 }, { "epoch": 3.2272249774020962, "grad_norm": 0.016126209869980812, "learning_rate": 6.729828316382837e-07, "loss": 0.0, "num_input_tokens_seen": 89041040, "step": 132100 }, { "epoch": 3.2273471282339434, "grad_norm": 0.0004620710969902575, "learning_rate": 6.729022439270627e-07, "loss": 0.0, "num_input_tokens_seen": 89044048, "step": 132105 }, { "epoch": 3.2274692790657906, "grad_norm": 0.012120189145207405, "learning_rate": 6.728216585945845e-07, "loss": 0.0, "num_input_tokens_seen": 89047376, "step": 132110 }, { "epoch": 3.227591429897638, "grad_norm": 0.019521376118063927, "learning_rate": 6.727410756414356e-07, "loss": 0.0, "num_input_tokens_seen": 89050768, "step": 132115 }, { "epoch": 3.2277135807294846, "grad_norm": 0.0017054117051884532, "learning_rate": 6.726604950682011e-07, "loss": 0.0, "num_input_tokens_seen": 89053968, "step": 132120 }, { "epoch": 3.2278357315613317, "grad_norm": 0.2144937962293625, "learning_rate": 6.72579916875468e-07, "loss": 0.0001, "num_input_tokens_seen": 89057488, "step": 132125 }, { "epoch": 3.227957882393179, "grad_norm": 0.00031233998015522957, "learning_rate": 6.724993410638216e-07, "loss": 0.0, "num_input_tokens_seen": 89060880, "step": 132130 }, { "epoch": 3.228080033225026, "grad_norm": 0.0021750705782324076, "learning_rate": 6.724187676338481e-07, "loss": 0.0, "num_input_tokens_seen": 89064016, "step": 132135 }, { "epoch": 3.2282021840568733, "grad_norm": 0.00044330034870654345, "learning_rate": 6.723381965861334e-07, "loss": 0.0, "num_input_tokens_seen": 89067216, "step": 132140 }, { "epoch": 3.2283243348887205, "grad_norm": 6.203007797012106e-05, "learning_rate": 6.722576279212632e-07, "loss": 0.0, "num_input_tokens_seen": 89070544, "step": 132145 }, { "epoch": 3.2284464857205677, "grad_norm": 0.002357953228056431, "learning_rate": 6.721770616398242e-07, "loss": 0.0, "num_input_tokens_seen": 89074320, "step": 132150 }, { "epoch": 3.228568636552415, "grad_norm": 0.0007738954154774547, "learning_rate": 6.720964977424013e-07, "loss": 0.0, "num_input_tokens_seen": 89077776, "step": 132155 }, { "epoch": 3.228690787384262, "grad_norm": 0.001500914222560823, "learning_rate": 6.720159362295814e-07, "loss": 0.0, "num_input_tokens_seen": 89080912, "step": 132160 }, { "epoch": 3.2288129382161093, "grad_norm": 0.0029234839603304863, "learning_rate": 6.719353771019494e-07, "loss": 0.0, "num_input_tokens_seen": 89084560, "step": 132165 }, { "epoch": 3.2289350890479565, "grad_norm": 0.0030644559301435947, "learning_rate": 6.718548203600915e-07, "loss": 0.0023, "num_input_tokens_seen": 89087888, "step": 132170 }, { "epoch": 3.2290572398798036, "grad_norm": 0.00027050048811361194, "learning_rate": 6.71774266004594e-07, "loss": 0.0488, "num_input_tokens_seen": 89091024, "step": 132175 }, { "epoch": 3.229179390711651, "grad_norm": 0.0002629015943966806, "learning_rate": 6.716937140360421e-07, "loss": 0.0, "num_input_tokens_seen": 89094160, "step": 132180 }, { "epoch": 3.229301541543498, "grad_norm": 0.0009435914689674973, "learning_rate": 6.716131644550218e-07, "loss": 0.0875, "num_input_tokens_seen": 89097552, "step": 132185 }, { "epoch": 3.229423692375345, "grad_norm": 0.021992305293679237, "learning_rate": 6.71532617262119e-07, "loss": 0.0, "num_input_tokens_seen": 89100816, "step": 132190 }, { "epoch": 3.2295458432071924, "grad_norm": 0.00021161261247470975, "learning_rate": 6.714520724579196e-07, "loss": 0.0002, "num_input_tokens_seen": 89103888, "step": 132195 }, { "epoch": 3.2296679940390396, "grad_norm": 0.002651083981618285, "learning_rate": 6.713715300430085e-07, "loss": 0.0, "num_input_tokens_seen": 89107024, "step": 132200 }, { "epoch": 3.2297901448708863, "grad_norm": 0.0004908027476631105, "learning_rate": 6.712909900179722e-07, "loss": 0.0, "num_input_tokens_seen": 89110288, "step": 132205 }, { "epoch": 3.229912295702734, "grad_norm": 0.0019102852093055844, "learning_rate": 6.712104523833965e-07, "loss": 0.0, "num_input_tokens_seen": 89113552, "step": 132210 }, { "epoch": 3.2300344465345807, "grad_norm": 23.559524536132812, "learning_rate": 6.711299171398664e-07, "loss": 0.0579, "num_input_tokens_seen": 89117136, "step": 132215 }, { "epoch": 3.230156597366428, "grad_norm": 0.053920187056064606, "learning_rate": 6.710493842879685e-07, "loss": 0.0, "num_input_tokens_seen": 89120016, "step": 132220 }, { "epoch": 3.230278748198275, "grad_norm": 0.001154446741566062, "learning_rate": 6.709688538282876e-07, "loss": 0.0001, "num_input_tokens_seen": 89122896, "step": 132225 }, { "epoch": 3.2304008990301223, "grad_norm": 0.0008815564215183258, "learning_rate": 6.708883257614098e-07, "loss": 0.0, "num_input_tokens_seen": 89126544, "step": 132230 }, { "epoch": 3.2305230498619695, "grad_norm": 0.0008381134830415249, "learning_rate": 6.708078000879209e-07, "loss": 0.0, "num_input_tokens_seen": 89129744, "step": 132235 }, { "epoch": 3.2306452006938167, "grad_norm": 0.010292504914104939, "learning_rate": 6.707272768084057e-07, "loss": 0.0001, "num_input_tokens_seen": 89132816, "step": 132240 }, { "epoch": 3.230767351525664, "grad_norm": 0.004390857182443142, "learning_rate": 6.706467559234507e-07, "loss": 0.0, "num_input_tokens_seen": 89135760, "step": 132245 }, { "epoch": 3.230889502357511, "grad_norm": 0.01249841507524252, "learning_rate": 6.705662374336409e-07, "loss": 0.0, "num_input_tokens_seen": 89139088, "step": 132250 }, { "epoch": 3.2310116531893582, "grad_norm": 6.614337326027453e-05, "learning_rate": 6.704857213395622e-07, "loss": 0.0, "num_input_tokens_seen": 89142288, "step": 132255 }, { "epoch": 3.2311338040212054, "grad_norm": 0.0024311612360179424, "learning_rate": 6.704052076417996e-07, "loss": 0.0, "num_input_tokens_seen": 89145552, "step": 132260 }, { "epoch": 3.2312559548530526, "grad_norm": 0.0019033217104151845, "learning_rate": 6.70324696340939e-07, "loss": 0.0, "num_input_tokens_seen": 89148880, "step": 132265 }, { "epoch": 3.2313781056849, "grad_norm": 0.0026095532812178135, "learning_rate": 6.702441874375664e-07, "loss": 0.0, "num_input_tokens_seen": 89152400, "step": 132270 }, { "epoch": 3.231500256516747, "grad_norm": 0.02475583739578724, "learning_rate": 6.701636809322662e-07, "loss": 0.0001, "num_input_tokens_seen": 89155408, "step": 132275 }, { "epoch": 3.231622407348594, "grad_norm": 0.0015918344724923372, "learning_rate": 6.700831768256249e-07, "loss": 0.0, "num_input_tokens_seen": 89158800, "step": 132280 }, { "epoch": 3.2317445581804414, "grad_norm": 0.0030924296006560326, "learning_rate": 6.700026751182273e-07, "loss": 0.0, "num_input_tokens_seen": 89162192, "step": 132285 }, { "epoch": 3.2318667090122886, "grad_norm": 0.00015647780674044043, "learning_rate": 6.69922175810659e-07, "loss": 0.0001, "num_input_tokens_seen": 89165136, "step": 132290 }, { "epoch": 3.2319888598441358, "grad_norm": 0.0005808421992696822, "learning_rate": 6.698416789035053e-07, "loss": 0.0, "num_input_tokens_seen": 89168336, "step": 132295 }, { "epoch": 3.2321110106759825, "grad_norm": 0.0008498340612277389, "learning_rate": 6.697611843973517e-07, "loss": 0.0266, "num_input_tokens_seen": 89171472, "step": 132300 }, { "epoch": 3.2322331615078297, "grad_norm": 0.001342020696029067, "learning_rate": 6.696806922927838e-07, "loss": 0.0, "num_input_tokens_seen": 89174864, "step": 132305 }, { "epoch": 3.232355312339677, "grad_norm": 8.064936264418066e-05, "learning_rate": 6.696002025903864e-07, "loss": 0.0, "num_input_tokens_seen": 89178064, "step": 132310 }, { "epoch": 3.232477463171524, "grad_norm": 0.0025094160810112953, "learning_rate": 6.695197152907457e-07, "loss": 0.0, "num_input_tokens_seen": 89181008, "step": 132315 }, { "epoch": 3.2325996140033713, "grad_norm": 0.0005574112292379141, "learning_rate": 6.694392303944461e-07, "loss": 0.0, "num_input_tokens_seen": 89184336, "step": 132320 }, { "epoch": 3.2327217648352184, "grad_norm": 0.019411450251936913, "learning_rate": 6.693587479020732e-07, "loss": 0.0, "num_input_tokens_seen": 89187792, "step": 132325 }, { "epoch": 3.2328439156670656, "grad_norm": 0.05258939042687416, "learning_rate": 6.692782678142129e-07, "loss": 0.0, "num_input_tokens_seen": 89191184, "step": 132330 }, { "epoch": 3.232966066498913, "grad_norm": 0.002094942843541503, "learning_rate": 6.691977901314498e-07, "loss": 0.0462, "num_input_tokens_seen": 89194384, "step": 132335 }, { "epoch": 3.23308821733076, "grad_norm": 0.0006676348857581615, "learning_rate": 6.691173148543694e-07, "loss": 0.0, "num_input_tokens_seen": 89198224, "step": 132340 }, { "epoch": 3.233210368162607, "grad_norm": 0.002458383562043309, "learning_rate": 6.690368419835569e-07, "loss": 0.0, "num_input_tokens_seen": 89201360, "step": 132345 }, { "epoch": 3.2333325189944544, "grad_norm": 0.0006629899726249278, "learning_rate": 6.689563715195975e-07, "loss": 0.0, "num_input_tokens_seen": 89204560, "step": 132350 }, { "epoch": 3.2334546698263016, "grad_norm": 6.684491381747648e-05, "learning_rate": 6.688759034630761e-07, "loss": 0.0, "num_input_tokens_seen": 89207952, "step": 132355 }, { "epoch": 3.2335768206581488, "grad_norm": 0.0046476987190544605, "learning_rate": 6.687954378145782e-07, "loss": 0.0, "num_input_tokens_seen": 89211024, "step": 132360 }, { "epoch": 3.233698971489996, "grad_norm": 0.0023857601918280125, "learning_rate": 6.687149745746894e-07, "loss": 0.1191, "num_input_tokens_seen": 89214608, "step": 132365 }, { "epoch": 3.233821122321843, "grad_norm": 82.63002014160156, "learning_rate": 6.686345137439939e-07, "loss": 0.1076, "num_input_tokens_seen": 89218384, "step": 132370 }, { "epoch": 3.2339432731536903, "grad_norm": 0.015677260234951973, "learning_rate": 6.68554055323078e-07, "loss": 0.0, "num_input_tokens_seen": 89221456, "step": 132375 }, { "epoch": 3.2340654239855375, "grad_norm": 0.0005482754786498845, "learning_rate": 6.684735993125255e-07, "loss": 0.0043, "num_input_tokens_seen": 89225040, "step": 132380 }, { "epoch": 3.2341875748173843, "grad_norm": 0.0008770662243478, "learning_rate": 6.683931457129224e-07, "loss": 0.0, "num_input_tokens_seen": 89228240, "step": 132385 }, { "epoch": 3.2343097256492315, "grad_norm": 0.022349568083882332, "learning_rate": 6.683126945248538e-07, "loss": 0.0, "num_input_tokens_seen": 89231376, "step": 132390 }, { "epoch": 3.2344318764810787, "grad_norm": 0.00225584814324975, "learning_rate": 6.682322457489042e-07, "loss": 0.0, "num_input_tokens_seen": 89234576, "step": 132395 }, { "epoch": 3.234554027312926, "grad_norm": 0.01475659478455782, "learning_rate": 6.681517993856592e-07, "loss": 0.0, "num_input_tokens_seen": 89237840, "step": 132400 }, { "epoch": 3.234676178144773, "grad_norm": 0.000728962360881269, "learning_rate": 6.680713554357031e-07, "loss": 0.0354, "num_input_tokens_seen": 89241360, "step": 132405 }, { "epoch": 3.2347983289766202, "grad_norm": 0.00011187187919858843, "learning_rate": 6.679909138996219e-07, "loss": 0.0, "num_input_tokens_seen": 89244880, "step": 132410 }, { "epoch": 3.2349204798084674, "grad_norm": 0.0008701798506081104, "learning_rate": 6.679104747779996e-07, "loss": 0.0, "num_input_tokens_seen": 89247952, "step": 132415 }, { "epoch": 3.2350426306403146, "grad_norm": 0.0016978960484266281, "learning_rate": 6.678300380714217e-07, "loss": 0.0001, "num_input_tokens_seen": 89251472, "step": 132420 }, { "epoch": 3.235164781472162, "grad_norm": 2.5931992530822754, "learning_rate": 6.677496037804735e-07, "loss": 0.0239, "num_input_tokens_seen": 89255120, "step": 132425 }, { "epoch": 3.235286932304009, "grad_norm": 0.025977689772844315, "learning_rate": 6.676691719057393e-07, "loss": 0.0, "num_input_tokens_seen": 89258640, "step": 132430 }, { "epoch": 3.235409083135856, "grad_norm": 9.993995627155527e-05, "learning_rate": 6.675887424478044e-07, "loss": 0.0, "num_input_tokens_seen": 89262224, "step": 132435 }, { "epoch": 3.2355312339677034, "grad_norm": 0.03302092105150223, "learning_rate": 6.675083154072535e-07, "loss": 0.0, "num_input_tokens_seen": 89265808, "step": 132440 }, { "epoch": 3.2356533847995506, "grad_norm": 0.006590451113879681, "learning_rate": 6.674278907846715e-07, "loss": 0.0001, "num_input_tokens_seen": 89269072, "step": 132445 }, { "epoch": 3.2357755356313977, "grad_norm": 0.02588418684899807, "learning_rate": 6.673474685806435e-07, "loss": 0.0001, "num_input_tokens_seen": 89272336, "step": 132450 }, { "epoch": 3.235897686463245, "grad_norm": 0.01116380374878645, "learning_rate": 6.672670487957538e-07, "loss": 0.0, "num_input_tokens_seen": 89275600, "step": 132455 }, { "epoch": 3.236019837295092, "grad_norm": 0.015107502229511738, "learning_rate": 6.671866314305881e-07, "loss": 0.0, "num_input_tokens_seen": 89278736, "step": 132460 }, { "epoch": 3.2361419881269393, "grad_norm": 0.0018150904215872288, "learning_rate": 6.671062164857303e-07, "loss": 0.0, "num_input_tokens_seen": 89282192, "step": 132465 }, { "epoch": 3.2362641389587865, "grad_norm": 0.0016039460897445679, "learning_rate": 6.67025803961766e-07, "loss": 0.0002, "num_input_tokens_seen": 89285072, "step": 132470 }, { "epoch": 3.2363862897906337, "grad_norm": 0.007005858235061169, "learning_rate": 6.669453938592793e-07, "loss": 0.0, "num_input_tokens_seen": 89288400, "step": 132475 }, { "epoch": 3.2365084406224804, "grad_norm": 0.0012618335895240307, "learning_rate": 6.668649861788551e-07, "loss": 0.0325, "num_input_tokens_seen": 89291344, "step": 132480 }, { "epoch": 3.2366305914543276, "grad_norm": 0.004066959489136934, "learning_rate": 6.66784580921079e-07, "loss": 0.0663, "num_input_tokens_seen": 89295184, "step": 132485 }, { "epoch": 3.236752742286175, "grad_norm": 0.024879004806280136, "learning_rate": 6.667041780865347e-07, "loss": 0.0439, "num_input_tokens_seen": 89298192, "step": 132490 }, { "epoch": 3.236874893118022, "grad_norm": 0.00970000121742487, "learning_rate": 6.666237776758072e-07, "loss": 0.0, "num_input_tokens_seen": 89301392, "step": 132495 }, { "epoch": 3.236997043949869, "grad_norm": 0.02402373217046261, "learning_rate": 6.665433796894812e-07, "loss": 0.0, "num_input_tokens_seen": 89304784, "step": 132500 }, { "epoch": 3.2371191947817164, "grad_norm": 0.012411472387611866, "learning_rate": 6.66462984128142e-07, "loss": 0.0, "num_input_tokens_seen": 89308048, "step": 132505 }, { "epoch": 3.2372413456135636, "grad_norm": 0.013225167989730835, "learning_rate": 6.663825909923729e-07, "loss": 0.0001, "num_input_tokens_seen": 89311376, "step": 132510 }, { "epoch": 3.2373634964454108, "grad_norm": 0.004108354914933443, "learning_rate": 6.663022002827595e-07, "loss": 0.0, "num_input_tokens_seen": 89314448, "step": 132515 }, { "epoch": 3.237485647277258, "grad_norm": 0.34476473927497864, "learning_rate": 6.662218119998866e-07, "loss": 0.001, "num_input_tokens_seen": 89317520, "step": 132520 }, { "epoch": 3.237607798109105, "grad_norm": 0.010773766785860062, "learning_rate": 6.661414261443381e-07, "loss": 0.0, "num_input_tokens_seen": 89320720, "step": 132525 }, { "epoch": 3.2377299489409523, "grad_norm": 0.02545316517353058, "learning_rate": 6.660610427166993e-07, "loss": 0.0, "num_input_tokens_seen": 89323856, "step": 132530 }, { "epoch": 3.2378520997727995, "grad_norm": 0.014912369661033154, "learning_rate": 6.659806617175541e-07, "loss": 0.0, "num_input_tokens_seen": 89328016, "step": 132535 }, { "epoch": 3.2379742506046467, "grad_norm": 0.10480382293462753, "learning_rate": 6.659002831474878e-07, "loss": 0.0, "num_input_tokens_seen": 89331344, "step": 132540 }, { "epoch": 3.238096401436494, "grad_norm": 0.008487637154757977, "learning_rate": 6.658199070070842e-07, "loss": 0.0, "num_input_tokens_seen": 89334992, "step": 132545 }, { "epoch": 3.238218552268341, "grad_norm": 8.822243398753926e-05, "learning_rate": 6.657395332969279e-07, "loss": 0.0, "num_input_tokens_seen": 89338128, "step": 132550 }, { "epoch": 3.2383407031001883, "grad_norm": 0.0040891217067837715, "learning_rate": 6.656591620176041e-07, "loss": 0.0, "num_input_tokens_seen": 89341648, "step": 132555 }, { "epoch": 3.2384628539320355, "grad_norm": 0.0024746954441070557, "learning_rate": 6.655787931696964e-07, "loss": 0.0, "num_input_tokens_seen": 89345104, "step": 132560 }, { "epoch": 3.2385850047638822, "grad_norm": 0.1467559039592743, "learning_rate": 6.6549842675379e-07, "loss": 0.0001, "num_input_tokens_seen": 89347856, "step": 132565 }, { "epoch": 3.2387071555957294, "grad_norm": 0.0009347347659058869, "learning_rate": 6.654180627704687e-07, "loss": 0.0001, "num_input_tokens_seen": 89350992, "step": 132570 }, { "epoch": 3.2388293064275766, "grad_norm": 0.0004353010153863579, "learning_rate": 6.653377012203171e-07, "loss": 0.0, "num_input_tokens_seen": 89354000, "step": 132575 }, { "epoch": 3.238951457259424, "grad_norm": 0.007408044300973415, "learning_rate": 6.652573421039203e-07, "loss": 0.1238, "num_input_tokens_seen": 89357904, "step": 132580 }, { "epoch": 3.239073608091271, "grad_norm": 0.0021132344845682383, "learning_rate": 6.651769854218615e-07, "loss": 0.0, "num_input_tokens_seen": 89361616, "step": 132585 }, { "epoch": 3.239195758923118, "grad_norm": 0.0005457843071781099, "learning_rate": 6.650966311747263e-07, "loss": 0.0, "num_input_tokens_seen": 89365200, "step": 132590 }, { "epoch": 3.2393179097549654, "grad_norm": 0.0004142549878451973, "learning_rate": 6.650162793630982e-07, "loss": 0.0, "num_input_tokens_seen": 89369168, "step": 132595 }, { "epoch": 3.2394400605868126, "grad_norm": 0.00022182843531481922, "learning_rate": 6.649359299875619e-07, "loss": 0.0, "num_input_tokens_seen": 89372624, "step": 132600 }, { "epoch": 3.2395622114186597, "grad_norm": 0.032848212867975235, "learning_rate": 6.648555830487018e-07, "loss": 0.0001, "num_input_tokens_seen": 89375888, "step": 132605 }, { "epoch": 3.239684362250507, "grad_norm": 3.7265719583956525e-05, "learning_rate": 6.647752385471015e-07, "loss": 0.0, "num_input_tokens_seen": 89379280, "step": 132610 }, { "epoch": 3.239806513082354, "grad_norm": 0.005500871688127518, "learning_rate": 6.646948964833465e-07, "loss": 0.0003, "num_input_tokens_seen": 89382416, "step": 132615 }, { "epoch": 3.2399286639142013, "grad_norm": 0.006964322179555893, "learning_rate": 6.646145568580198e-07, "loss": 0.0, "num_input_tokens_seen": 89385872, "step": 132620 }, { "epoch": 3.2400508147460485, "grad_norm": 0.004247676581144333, "learning_rate": 6.645342196717067e-07, "loss": 0.0, "num_input_tokens_seen": 89389136, "step": 132625 }, { "epoch": 3.2401729655778957, "grad_norm": 0.033477533608675, "learning_rate": 6.644538849249907e-07, "loss": 0.0, "num_input_tokens_seen": 89394768, "step": 132630 }, { "epoch": 3.240295116409743, "grad_norm": 0.0007979911752045155, "learning_rate": 6.643735526184562e-07, "loss": 0.0, "num_input_tokens_seen": 89398608, "step": 132635 }, { "epoch": 3.24041726724159, "grad_norm": 9.7431001663208, "learning_rate": 6.64293222752688e-07, "loss": 0.0285, "num_input_tokens_seen": 89402128, "step": 132640 }, { "epoch": 3.2405394180734373, "grad_norm": 0.006626779213547707, "learning_rate": 6.642128953282695e-07, "loss": 0.0087, "num_input_tokens_seen": 89405456, "step": 132645 }, { "epoch": 3.2406615689052845, "grad_norm": 4.8968089686241e-05, "learning_rate": 6.641325703457852e-07, "loss": 0.0684, "num_input_tokens_seen": 89409680, "step": 132650 }, { "epoch": 3.2407837197371316, "grad_norm": 0.00018853062647394836, "learning_rate": 6.64052247805819e-07, "loss": 0.0, "num_input_tokens_seen": 89413264, "step": 132655 }, { "epoch": 3.2409058705689784, "grad_norm": 0.042871687561273575, "learning_rate": 6.639719277089556e-07, "loss": 0.0, "num_input_tokens_seen": 89416592, "step": 132660 }, { "epoch": 3.2410280214008256, "grad_norm": 0.00719387736171484, "learning_rate": 6.638916100557782e-07, "loss": 0.0001, "num_input_tokens_seen": 89420048, "step": 132665 }, { "epoch": 3.2411501722326728, "grad_norm": 0.0006991358241066337, "learning_rate": 6.638112948468715e-07, "loss": 0.0, "num_input_tokens_seen": 89423056, "step": 132670 }, { "epoch": 3.24127232306452, "grad_norm": 0.004646055866032839, "learning_rate": 6.637309820828199e-07, "loss": 0.0, "num_input_tokens_seen": 89426768, "step": 132675 }, { "epoch": 3.241394473896367, "grad_norm": 0.00044314860133454204, "learning_rate": 6.636506717642066e-07, "loss": 0.0, "num_input_tokens_seen": 89430032, "step": 132680 }, { "epoch": 3.2415166247282143, "grad_norm": 0.00037013122346252203, "learning_rate": 6.635703638916165e-07, "loss": 0.0203, "num_input_tokens_seen": 89433424, "step": 132685 }, { "epoch": 3.2416387755600615, "grad_norm": 0.0005003222031518817, "learning_rate": 6.634900584656328e-07, "loss": 0.0, "num_input_tokens_seen": 89437264, "step": 132690 }, { "epoch": 3.2417609263919087, "grad_norm": 0.0033043583389371634, "learning_rate": 6.634097554868403e-07, "loss": 0.0, "num_input_tokens_seen": 89440464, "step": 132695 }, { "epoch": 3.241883077223756, "grad_norm": 0.0024806768633425236, "learning_rate": 6.633294549558223e-07, "loss": 0.0, "num_input_tokens_seen": 89443792, "step": 132700 }, { "epoch": 3.242005228055603, "grad_norm": 0.000671973277349025, "learning_rate": 6.63249156873163e-07, "loss": 0.0158, "num_input_tokens_seen": 89447184, "step": 132705 }, { "epoch": 3.2421273788874503, "grad_norm": 0.07534290105104446, "learning_rate": 6.631688612394469e-07, "loss": 0.0, "num_input_tokens_seen": 89450960, "step": 132710 }, { "epoch": 3.2422495297192975, "grad_norm": 0.013921644538640976, "learning_rate": 6.630885680552567e-07, "loss": 0.0, "num_input_tokens_seen": 89454224, "step": 132715 }, { "epoch": 3.2423716805511447, "grad_norm": 0.006058692466467619, "learning_rate": 6.630082773211777e-07, "loss": 0.0, "num_input_tokens_seen": 89457808, "step": 132720 }, { "epoch": 3.242493831382992, "grad_norm": 0.003040890209376812, "learning_rate": 6.629279890377926e-07, "loss": 0.0519, "num_input_tokens_seen": 89461520, "step": 132725 }, { "epoch": 3.242615982214839, "grad_norm": 0.0009857604745775461, "learning_rate": 6.62847703205686e-07, "loss": 0.0, "num_input_tokens_seen": 89464912, "step": 132730 }, { "epoch": 3.2427381330466862, "grad_norm": 0.035643454641103745, "learning_rate": 6.627674198254419e-07, "loss": 0.0961, "num_input_tokens_seen": 89467984, "step": 132735 }, { "epoch": 3.2428602838785334, "grad_norm": 0.00444948673248291, "learning_rate": 6.626871388976433e-07, "loss": 0.0001, "num_input_tokens_seen": 89470992, "step": 132740 }, { "epoch": 3.24298243471038, "grad_norm": 0.0002501757408026606, "learning_rate": 6.626068604228752e-07, "loss": 0.0, "num_input_tokens_seen": 89474256, "step": 132745 }, { "epoch": 3.2431045855422274, "grad_norm": 0.0025310420896857977, "learning_rate": 6.625265844017205e-07, "loss": 0.0, "num_input_tokens_seen": 89477328, "step": 132750 }, { "epoch": 3.2432267363740745, "grad_norm": 0.0023761270567774773, "learning_rate": 6.624463108347631e-07, "loss": 0.0, "num_input_tokens_seen": 89481040, "step": 132755 }, { "epoch": 3.2433488872059217, "grad_norm": 0.001543680438771844, "learning_rate": 6.62366039722587e-07, "loss": 0.0, "num_input_tokens_seen": 89484560, "step": 132760 }, { "epoch": 3.243471038037769, "grad_norm": 0.0036118794232606888, "learning_rate": 6.622857710657757e-07, "loss": 0.0, "num_input_tokens_seen": 89488016, "step": 132765 }, { "epoch": 3.243593188869616, "grad_norm": 0.00880009587854147, "learning_rate": 6.622055048649135e-07, "loss": 0.0, "num_input_tokens_seen": 89491344, "step": 132770 }, { "epoch": 3.2437153397014633, "grad_norm": 0.00033158602309413254, "learning_rate": 6.621252411205834e-07, "loss": 0.0, "num_input_tokens_seen": 89494864, "step": 132775 }, { "epoch": 3.2438374905333105, "grad_norm": 0.01662488840520382, "learning_rate": 6.620449798333696e-07, "loss": 0.0001, "num_input_tokens_seen": 89498384, "step": 132780 }, { "epoch": 3.2439596413651577, "grad_norm": 0.0022721965797245502, "learning_rate": 6.619647210038554e-07, "loss": 0.0, "num_input_tokens_seen": 89501584, "step": 132785 }, { "epoch": 3.244081792197005, "grad_norm": 0.003767449175938964, "learning_rate": 6.618844646326245e-07, "loss": 0.0001, "num_input_tokens_seen": 89504656, "step": 132790 }, { "epoch": 3.244203943028852, "grad_norm": 0.0007159715751186013, "learning_rate": 6.618042107202613e-07, "loss": 0.0066, "num_input_tokens_seen": 89507984, "step": 132795 }, { "epoch": 3.2443260938606993, "grad_norm": 0.007665713783353567, "learning_rate": 6.617239592673485e-07, "loss": 0.0, "num_input_tokens_seen": 89511440, "step": 132800 }, { "epoch": 3.2444482446925464, "grad_norm": 7.725616887910292e-05, "learning_rate": 6.616437102744701e-07, "loss": 0.0, "num_input_tokens_seen": 89514704, "step": 132805 }, { "epoch": 3.2445703955243936, "grad_norm": 0.004263072274625301, "learning_rate": 6.615634637422097e-07, "loss": 0.0, "num_input_tokens_seen": 89517712, "step": 132810 }, { "epoch": 3.244692546356241, "grad_norm": 0.00012406962923705578, "learning_rate": 6.61483219671151e-07, "loss": 0.0, "num_input_tokens_seen": 89521232, "step": 132815 }, { "epoch": 3.244814697188088, "grad_norm": 9.752336336532608e-05, "learning_rate": 6.61402978061877e-07, "loss": 0.0468, "num_input_tokens_seen": 89525008, "step": 132820 }, { "epoch": 3.244936848019935, "grad_norm": 0.16568030416965485, "learning_rate": 6.613227389149716e-07, "loss": 0.0001, "num_input_tokens_seen": 89527824, "step": 132825 }, { "epoch": 3.245058998851782, "grad_norm": 0.002355449367314577, "learning_rate": 6.612425022310186e-07, "loss": 0.0, "num_input_tokens_seen": 89531280, "step": 132830 }, { "epoch": 3.245181149683629, "grad_norm": 0.01688223145902157, "learning_rate": 6.611622680106011e-07, "loss": 0.0, "num_input_tokens_seen": 89534992, "step": 132835 }, { "epoch": 3.2453033005154763, "grad_norm": 0.0006499082082882524, "learning_rate": 6.610820362543028e-07, "loss": 0.0543, "num_input_tokens_seen": 89538640, "step": 132840 }, { "epoch": 3.2454254513473235, "grad_norm": 0.8445472121238708, "learning_rate": 6.61001806962707e-07, "loss": 0.0003, "num_input_tokens_seen": 89541904, "step": 132845 }, { "epoch": 3.2455476021791707, "grad_norm": 0.0038771627005189657, "learning_rate": 6.609215801363974e-07, "loss": 0.0, "num_input_tokens_seen": 89545040, "step": 132850 }, { "epoch": 3.245669753011018, "grad_norm": 3.451886004768312e-05, "learning_rate": 6.608413557759572e-07, "loss": 0.0003, "num_input_tokens_seen": 89548432, "step": 132855 }, { "epoch": 3.245791903842865, "grad_norm": 0.03163387253880501, "learning_rate": 6.607611338819696e-07, "loss": 0.0, "num_input_tokens_seen": 89551696, "step": 132860 }, { "epoch": 3.2459140546747123, "grad_norm": 0.0013465473894029856, "learning_rate": 6.606809144550188e-07, "loss": 0.0001, "num_input_tokens_seen": 89554960, "step": 132865 }, { "epoch": 3.2460362055065595, "grad_norm": 0.28843867778778076, "learning_rate": 6.606006974956872e-07, "loss": 0.0831, "num_input_tokens_seen": 89558544, "step": 132870 }, { "epoch": 3.2461583563384067, "grad_norm": 0.0004849781689699739, "learning_rate": 6.60520483004559e-07, "loss": 0.0, "num_input_tokens_seen": 89561872, "step": 132875 }, { "epoch": 3.246280507170254, "grad_norm": 0.0038369568064808846, "learning_rate": 6.604402709822168e-07, "loss": 0.0001, "num_input_tokens_seen": 89565648, "step": 132880 }, { "epoch": 3.246402658002101, "grad_norm": 0.0021499425638467073, "learning_rate": 6.603600614292441e-07, "loss": 0.0, "num_input_tokens_seen": 89569424, "step": 132885 }, { "epoch": 3.2465248088339482, "grad_norm": 0.001384978648275137, "learning_rate": 6.602798543462252e-07, "loss": 0.0, "num_input_tokens_seen": 89572752, "step": 132890 }, { "epoch": 3.2466469596657954, "grad_norm": 0.0010479361517354846, "learning_rate": 6.601996497337418e-07, "loss": 0.0, "num_input_tokens_seen": 89576272, "step": 132895 }, { "epoch": 3.2467691104976426, "grad_norm": 0.054091040045022964, "learning_rate": 6.601194475923784e-07, "loss": 0.0, "num_input_tokens_seen": 89578960, "step": 132900 }, { "epoch": 3.24689126132949, "grad_norm": 0.002501105424016714, "learning_rate": 6.600392479227177e-07, "loss": 0.0, "num_input_tokens_seen": 89582288, "step": 132905 }, { "epoch": 3.247013412161337, "grad_norm": 0.0004881916393060237, "learning_rate": 6.599590507253429e-07, "loss": 0.0625, "num_input_tokens_seen": 89586320, "step": 132910 }, { "epoch": 3.247135562993184, "grad_norm": 0.00018641616043169051, "learning_rate": 6.598788560008375e-07, "loss": 0.0, "num_input_tokens_seen": 89590096, "step": 132915 }, { "epoch": 3.2472577138250314, "grad_norm": 0.00022535701282322407, "learning_rate": 6.597986637497841e-07, "loss": 0.0, "num_input_tokens_seen": 89593360, "step": 132920 }, { "epoch": 3.247379864656878, "grad_norm": 0.0004712426452897489, "learning_rate": 6.597184739727669e-07, "loss": 0.0001, "num_input_tokens_seen": 89596944, "step": 132925 }, { "epoch": 3.2475020154887253, "grad_norm": 0.0014859620714560151, "learning_rate": 6.59638286670368e-07, "loss": 0.0, "num_input_tokens_seen": 89600272, "step": 132930 }, { "epoch": 3.2476241663205725, "grad_norm": 0.7832088470458984, "learning_rate": 6.595581018431715e-07, "loss": 0.0206, "num_input_tokens_seen": 89603472, "step": 132935 }, { "epoch": 3.2477463171524197, "grad_norm": 0.0012433022493496537, "learning_rate": 6.594779194917596e-07, "loss": 0.0, "num_input_tokens_seen": 89607056, "step": 132940 }, { "epoch": 3.247868467984267, "grad_norm": 0.0006725243292748928, "learning_rate": 6.593977396167159e-07, "loss": 0.0, "num_input_tokens_seen": 89610704, "step": 132945 }, { "epoch": 3.247990618816114, "grad_norm": 0.006654083263128996, "learning_rate": 6.59317562218624e-07, "loss": 0.0433, "num_input_tokens_seen": 89614096, "step": 132950 }, { "epoch": 3.2481127696479613, "grad_norm": 0.0048959460109472275, "learning_rate": 6.59237387298066e-07, "loss": 0.0, "num_input_tokens_seen": 89617424, "step": 132955 }, { "epoch": 3.2482349204798084, "grad_norm": 0.0009554842254146934, "learning_rate": 6.591572148556254e-07, "loss": 0.0, "num_input_tokens_seen": 89620560, "step": 132960 }, { "epoch": 3.2483570713116556, "grad_norm": 0.00032741823815740645, "learning_rate": 6.590770448918852e-07, "loss": 0.0, "num_input_tokens_seen": 89624144, "step": 132965 }, { "epoch": 3.248479222143503, "grad_norm": 0.007848912850022316, "learning_rate": 6.589968774074287e-07, "loss": 0.0, "num_input_tokens_seen": 89627856, "step": 132970 }, { "epoch": 3.24860137297535, "grad_norm": 0.0036281703505665064, "learning_rate": 6.589167124028382e-07, "loss": 0.0002, "num_input_tokens_seen": 89631312, "step": 132975 }, { "epoch": 3.248723523807197, "grad_norm": 0.003891072468832135, "learning_rate": 6.588365498786972e-07, "loss": 0.0, "num_input_tokens_seen": 89634384, "step": 132980 }, { "epoch": 3.2488456746390444, "grad_norm": 0.0014572052750736475, "learning_rate": 6.587563898355888e-07, "loss": 0.0, "num_input_tokens_seen": 89638032, "step": 132985 }, { "epoch": 3.2489678254708916, "grad_norm": 0.0013541283551603556, "learning_rate": 6.586762322740953e-07, "loss": 0.0, "num_input_tokens_seen": 89642000, "step": 132990 }, { "epoch": 3.2490899763027388, "grad_norm": 0.0017062696861103177, "learning_rate": 6.585960771948006e-07, "loss": 0.0, "num_input_tokens_seen": 89645392, "step": 132995 }, { "epoch": 3.249212127134586, "grad_norm": 0.000382839934900403, "learning_rate": 6.585159245982866e-07, "loss": 0.0, "num_input_tokens_seen": 89648720, "step": 133000 }, { "epoch": 3.249334277966433, "grad_norm": 0.0014269673265516758, "learning_rate": 6.584357744851369e-07, "loss": 0.0005, "num_input_tokens_seen": 89651920, "step": 133005 }, { "epoch": 3.24945642879828, "grad_norm": 0.009629074484109879, "learning_rate": 6.583556268559343e-07, "loss": 0.0, "num_input_tokens_seen": 89654864, "step": 133010 }, { "epoch": 3.249578579630127, "grad_norm": 0.014059150591492653, "learning_rate": 6.582754817112609e-07, "loss": 0.0, "num_input_tokens_seen": 89658384, "step": 133015 }, { "epoch": 3.2497007304619743, "grad_norm": 0.00020276792929507792, "learning_rate": 6.581953390517007e-07, "loss": 0.0572, "num_input_tokens_seen": 89661392, "step": 133020 }, { "epoch": 3.2498228812938215, "grad_norm": 3.3943728340091184e-05, "learning_rate": 6.581151988778354e-07, "loss": 0.0, "num_input_tokens_seen": 89664592, "step": 133025 }, { "epoch": 3.2499450321256687, "grad_norm": 0.00014127862232271582, "learning_rate": 6.580350611902488e-07, "loss": 0.0, "num_input_tokens_seen": 89668176, "step": 133030 }, { "epoch": 3.250067182957516, "grad_norm": 0.0010063429363071918, "learning_rate": 6.579549259895227e-07, "loss": 0.0, "num_input_tokens_seen": 89671440, "step": 133035 }, { "epoch": 3.250189333789363, "grad_norm": 0.014567175880074501, "learning_rate": 6.578747932762405e-07, "loss": 0.0, "num_input_tokens_seen": 89674576, "step": 133040 }, { "epoch": 3.250238194122102, "eval_loss": 0.26102444529533386, "eval_runtime": 47.8303, "eval_samples_per_second": 760.71, "eval_steps_per_second": 95.107, "num_input_tokens_seen": 89675984, "step": 133042 }, { "epoch": 3.2503114846212102, "grad_norm": 0.0018988935044035316, "learning_rate": 6.577946630509852e-07, "loss": 0.0, "num_input_tokens_seen": 89678096, "step": 133045 }, { "epoch": 3.2504336354530574, "grad_norm": 0.03488035500049591, "learning_rate": 6.577145353143388e-07, "loss": 0.1774, "num_input_tokens_seen": 89681424, "step": 133050 }, { "epoch": 3.2505557862849046, "grad_norm": 0.0004958320059813559, "learning_rate": 6.576344100668847e-07, "loss": 0.0, "num_input_tokens_seen": 89684240, "step": 133055 }, { "epoch": 3.250677937116752, "grad_norm": 0.08107688277959824, "learning_rate": 6.575542873092051e-07, "loss": 0.0, "num_input_tokens_seen": 89688208, "step": 133060 }, { "epoch": 3.250800087948599, "grad_norm": 0.04220094159245491, "learning_rate": 6.574741670418829e-07, "loss": 0.0943, "num_input_tokens_seen": 89691536, "step": 133065 }, { "epoch": 3.250922238780446, "grad_norm": 0.026985283941030502, "learning_rate": 6.573940492655005e-07, "loss": 0.0, "num_input_tokens_seen": 89694864, "step": 133070 }, { "epoch": 3.2510443896122934, "grad_norm": 0.004324234090745449, "learning_rate": 6.573139339806406e-07, "loss": 0.0, "num_input_tokens_seen": 89698064, "step": 133075 }, { "epoch": 3.2511665404441406, "grad_norm": 0.0011103182332590222, "learning_rate": 6.572338211878864e-07, "loss": 0.0, "num_input_tokens_seen": 89701328, "step": 133080 }, { "epoch": 3.2512886912759877, "grad_norm": 0.00011528417962836102, "learning_rate": 6.571537108878195e-07, "loss": 0.0313, "num_input_tokens_seen": 89705040, "step": 133085 }, { "epoch": 3.251410842107835, "grad_norm": 0.008831367827951908, "learning_rate": 6.570736030810236e-07, "loss": 0.0, "num_input_tokens_seen": 89708880, "step": 133090 }, { "epoch": 3.2515329929396817, "grad_norm": 0.015606212429702282, "learning_rate": 6.569934977680802e-07, "loss": 0.0, "num_input_tokens_seen": 89712464, "step": 133095 }, { "epoch": 3.2516551437715293, "grad_norm": 0.183375284075737, "learning_rate": 6.569133949495724e-07, "loss": 0.0001, "num_input_tokens_seen": 89716176, "step": 133100 }, { "epoch": 3.251777294603376, "grad_norm": 0.00039575432310812175, "learning_rate": 6.568332946260831e-07, "loss": 0.0, "num_input_tokens_seen": 89719056, "step": 133105 }, { "epoch": 3.2518994454352232, "grad_norm": 0.005938323680311441, "learning_rate": 6.56753196798194e-07, "loss": 0.0, "num_input_tokens_seen": 89722448, "step": 133110 }, { "epoch": 3.2520215962670704, "grad_norm": 0.14990012347698212, "learning_rate": 6.566731014664881e-07, "loss": 0.0001, "num_input_tokens_seen": 89725648, "step": 133115 }, { "epoch": 3.2521437470989176, "grad_norm": 0.009630163200199604, "learning_rate": 6.565930086315479e-07, "loss": 0.0001, "num_input_tokens_seen": 89728592, "step": 133120 }, { "epoch": 3.252265897930765, "grad_norm": 0.0037281611002981663, "learning_rate": 6.565129182939557e-07, "loss": 0.0, "num_input_tokens_seen": 89731792, "step": 133125 }, { "epoch": 3.252388048762612, "grad_norm": 0.023433174937963486, "learning_rate": 6.564328304542936e-07, "loss": 0.0, "num_input_tokens_seen": 89735504, "step": 133130 }, { "epoch": 3.252510199594459, "grad_norm": 0.17902742326259613, "learning_rate": 6.563527451131443e-07, "loss": 0.0002, "num_input_tokens_seen": 89739280, "step": 133135 }, { "epoch": 3.2526323504263064, "grad_norm": 0.0012498385040089488, "learning_rate": 6.562726622710908e-07, "loss": 0.0, "num_input_tokens_seen": 89743248, "step": 133140 }, { "epoch": 3.2527545012581536, "grad_norm": 0.002557901432737708, "learning_rate": 6.561925819287144e-07, "loss": 0.0, "num_input_tokens_seen": 89746384, "step": 133145 }, { "epoch": 3.2528766520900008, "grad_norm": 0.0017938632518053055, "learning_rate": 6.561125040865984e-07, "loss": 0.0838, "num_input_tokens_seen": 89749968, "step": 133150 }, { "epoch": 3.252998802921848, "grad_norm": 0.017057769000530243, "learning_rate": 6.560324287453246e-07, "loss": 0.0, "num_input_tokens_seen": 89753040, "step": 133155 }, { "epoch": 3.253120953753695, "grad_norm": 0.0012168893590569496, "learning_rate": 6.559523559054758e-07, "loss": 0.0778, "num_input_tokens_seen": 89756496, "step": 133160 }, { "epoch": 3.2532431045855423, "grad_norm": 0.0019093046430498362, "learning_rate": 6.558722855676335e-07, "loss": 0.0002, "num_input_tokens_seen": 89759440, "step": 133165 }, { "epoch": 3.2533652554173895, "grad_norm": 0.001984116854146123, "learning_rate": 6.557922177323807e-07, "loss": 0.0611, "num_input_tokens_seen": 89762960, "step": 133170 }, { "epoch": 3.2534874062492367, "grad_norm": 0.00024715482140891254, "learning_rate": 6.557121524002998e-07, "loss": 0.0609, "num_input_tokens_seen": 89766544, "step": 133175 }, { "epoch": 3.253609557081084, "grad_norm": 0.007147402036935091, "learning_rate": 6.556320895719723e-07, "loss": 0.0, "num_input_tokens_seen": 89769744, "step": 133180 }, { "epoch": 3.253731707912931, "grad_norm": 0.01977328583598137, "learning_rate": 6.555520292479812e-07, "loss": 0.0, "num_input_tokens_seen": 89773584, "step": 133185 }, { "epoch": 3.253853858744778, "grad_norm": 0.07471085339784622, "learning_rate": 6.554719714289081e-07, "loss": 0.0002, "num_input_tokens_seen": 89776656, "step": 133190 }, { "epoch": 3.253976009576625, "grad_norm": 0.014332527294754982, "learning_rate": 6.553919161153354e-07, "loss": 0.0557, "num_input_tokens_seen": 89779984, "step": 133195 }, { "epoch": 3.254098160408472, "grad_norm": 0.001091811340302229, "learning_rate": 6.553118633078457e-07, "loss": 0.0, "num_input_tokens_seen": 89783568, "step": 133200 }, { "epoch": 3.2542203112403194, "grad_norm": 0.016809549182653427, "learning_rate": 6.552318130070206e-07, "loss": 0.0001, "num_input_tokens_seen": 89786896, "step": 133205 }, { "epoch": 3.2543424620721666, "grad_norm": 0.09317447245121002, "learning_rate": 6.551517652134428e-07, "loss": 0.0513, "num_input_tokens_seen": 89789904, "step": 133210 }, { "epoch": 3.254464612904014, "grad_norm": 0.025950223207473755, "learning_rate": 6.550717199276939e-07, "loss": 0.0001, "num_input_tokens_seen": 89793104, "step": 133215 }, { "epoch": 3.254586763735861, "grad_norm": 1.1682426929473877, "learning_rate": 6.549916771503564e-07, "loss": 0.0344, "num_input_tokens_seen": 89796304, "step": 133220 }, { "epoch": 3.254708914567708, "grad_norm": 0.012416906654834747, "learning_rate": 6.549116368820121e-07, "loss": 0.0058, "num_input_tokens_seen": 89799184, "step": 133225 }, { "epoch": 3.2548310653995554, "grad_norm": 0.010654439218342304, "learning_rate": 6.548315991232428e-07, "loss": 0.0377, "num_input_tokens_seen": 89802384, "step": 133230 }, { "epoch": 3.2549532162314025, "grad_norm": 0.0026855203323066235, "learning_rate": 6.547515638746315e-07, "loss": 0.0001, "num_input_tokens_seen": 89806032, "step": 133235 }, { "epoch": 3.2550753670632497, "grad_norm": 0.004891504999250174, "learning_rate": 6.546715311367593e-07, "loss": 0.0137, "num_input_tokens_seen": 89808976, "step": 133240 }, { "epoch": 3.255197517895097, "grad_norm": 1.8451565504074097, "learning_rate": 6.545915009102091e-07, "loss": 0.0284, "num_input_tokens_seen": 89813200, "step": 133245 }, { "epoch": 3.255319668726944, "grad_norm": 0.2361879199743271, "learning_rate": 6.545114731955619e-07, "loss": 0.0005, "num_input_tokens_seen": 89816400, "step": 133250 }, { "epoch": 3.2554418195587913, "grad_norm": 0.019942492246627808, "learning_rate": 6.544314479934005e-07, "loss": 0.0, "num_input_tokens_seen": 89819728, "step": 133255 }, { "epoch": 3.2555639703906385, "grad_norm": 0.0032596394885331392, "learning_rate": 6.543514253043063e-07, "loss": 0.0001, "num_input_tokens_seen": 89823312, "step": 133260 }, { "epoch": 3.2556861212224857, "grad_norm": 0.0006665443652309477, "learning_rate": 6.542714051288618e-07, "loss": 0.0, "num_input_tokens_seen": 89826832, "step": 133265 }, { "epoch": 3.255808272054333, "grad_norm": 0.024253949522972107, "learning_rate": 6.541913874676486e-07, "loss": 0.0, "num_input_tokens_seen": 89829968, "step": 133270 }, { "epoch": 3.2559304228861796, "grad_norm": 0.01717350445687771, "learning_rate": 6.541113723212484e-07, "loss": 0.0465, "num_input_tokens_seen": 89833168, "step": 133275 }, { "epoch": 3.2560525737180273, "grad_norm": 0.34456226229667664, "learning_rate": 6.540313596902438e-07, "loss": 0.0002, "num_input_tokens_seen": 89836752, "step": 133280 }, { "epoch": 3.256174724549874, "grad_norm": 0.0388353168964386, "learning_rate": 6.539513495752155e-07, "loss": 0.0, "num_input_tokens_seen": 89840208, "step": 133285 }, { "epoch": 3.256296875381721, "grad_norm": 0.0007972258608788252, "learning_rate": 6.538713419767463e-07, "loss": 0.0002, "num_input_tokens_seen": 89843088, "step": 133290 }, { "epoch": 3.2564190262135684, "grad_norm": 0.0009614526061341166, "learning_rate": 6.537913368954182e-07, "loss": 0.0, "num_input_tokens_seen": 89846416, "step": 133295 }, { "epoch": 3.2565411770454156, "grad_norm": 0.0011288266396149993, "learning_rate": 6.537113343318122e-07, "loss": 0.0001, "num_input_tokens_seen": 89850000, "step": 133300 }, { "epoch": 3.2566633278772628, "grad_norm": 0.0012508789077401161, "learning_rate": 6.536313342865109e-07, "loss": 0.0001, "num_input_tokens_seen": 89853008, "step": 133305 }, { "epoch": 3.25678547870911, "grad_norm": 0.015493168495595455, "learning_rate": 6.535513367600953e-07, "loss": 0.0, "num_input_tokens_seen": 89856464, "step": 133310 }, { "epoch": 3.256907629540957, "grad_norm": 21.916187286376953, "learning_rate": 6.534713417531479e-07, "loss": 0.0343, "num_input_tokens_seen": 89859216, "step": 133315 }, { "epoch": 3.2570297803728043, "grad_norm": 0.0007719859713688493, "learning_rate": 6.533913492662497e-07, "loss": 0.0001, "num_input_tokens_seen": 89862992, "step": 133320 }, { "epoch": 3.2571519312046515, "grad_norm": 0.027441492304205894, "learning_rate": 6.533113592999833e-07, "loss": 0.0, "num_input_tokens_seen": 89866000, "step": 133325 }, { "epoch": 3.2572740820364987, "grad_norm": 0.038071949034929276, "learning_rate": 6.532313718549299e-07, "loss": 0.0247, "num_input_tokens_seen": 89869200, "step": 133330 }, { "epoch": 3.257396232868346, "grad_norm": 0.0006605487433262169, "learning_rate": 6.531513869316707e-07, "loss": 0.0, "num_input_tokens_seen": 89872528, "step": 133335 }, { "epoch": 3.257518383700193, "grad_norm": 0.052122194319963455, "learning_rate": 6.530714045307886e-07, "loss": 0.0, "num_input_tokens_seen": 89875920, "step": 133340 }, { "epoch": 3.2576405345320403, "grad_norm": 0.00033679555053822696, "learning_rate": 6.52991424652864e-07, "loss": 0.0, "num_input_tokens_seen": 89879568, "step": 133345 }, { "epoch": 3.2577626853638875, "grad_norm": 0.2269451916217804, "learning_rate": 6.529114472984791e-07, "loss": 0.0418, "num_input_tokens_seen": 89882640, "step": 133350 }, { "epoch": 3.2578848361957347, "grad_norm": 0.00019456748850643635, "learning_rate": 6.52831472468216e-07, "loss": 0.0001, "num_input_tokens_seen": 89886160, "step": 133355 }, { "epoch": 3.258006987027582, "grad_norm": 0.0026399213820695877, "learning_rate": 6.527515001626554e-07, "loss": 0.0004, "num_input_tokens_seen": 89889680, "step": 133360 }, { "epoch": 3.258129137859429, "grad_norm": 0.0004979751538485289, "learning_rate": 6.526715303823795e-07, "loss": 0.0, "num_input_tokens_seen": 89893264, "step": 133365 }, { "epoch": 3.258251288691276, "grad_norm": 0.0011222100583836436, "learning_rate": 6.525915631279697e-07, "loss": 0.0, "num_input_tokens_seen": 89896592, "step": 133370 }, { "epoch": 3.258373439523123, "grad_norm": 0.008360485546290874, "learning_rate": 6.525115984000073e-07, "loss": 0.0, "num_input_tokens_seen": 89900048, "step": 133375 }, { "epoch": 3.25849559035497, "grad_norm": 0.09326830506324768, "learning_rate": 6.524316361990741e-07, "loss": 0.036, "num_input_tokens_seen": 89903760, "step": 133380 }, { "epoch": 3.2586177411868174, "grad_norm": 0.08574612438678741, "learning_rate": 6.523516765257513e-07, "loss": 0.0, "num_input_tokens_seen": 89907344, "step": 133385 }, { "epoch": 3.2587398920186645, "grad_norm": 0.006928337272256613, "learning_rate": 6.522717193806211e-07, "loss": 0.0, "num_input_tokens_seen": 89910416, "step": 133390 }, { "epoch": 3.2588620428505117, "grad_norm": 0.0026560239493846893, "learning_rate": 6.52191764764264e-07, "loss": 0.0, "num_input_tokens_seen": 89914064, "step": 133395 }, { "epoch": 3.258984193682359, "grad_norm": 0.005287462379783392, "learning_rate": 6.521118126772625e-07, "loss": 0.0, "num_input_tokens_seen": 89917520, "step": 133400 }, { "epoch": 3.259106344514206, "grad_norm": 0.0896107479929924, "learning_rate": 6.520318631201969e-07, "loss": 0.0001, "num_input_tokens_seen": 89920464, "step": 133405 }, { "epoch": 3.2592284953460533, "grad_norm": 0.0007962300442159176, "learning_rate": 6.519519160936495e-07, "loss": 0.0, "num_input_tokens_seen": 89923536, "step": 133410 }, { "epoch": 3.2593506461779005, "grad_norm": 0.0007463557994924486, "learning_rate": 6.518719715982011e-07, "loss": 0.0, "num_input_tokens_seen": 89927184, "step": 133415 }, { "epoch": 3.2594727970097477, "grad_norm": 0.07562704384326935, "learning_rate": 6.517920296344335e-07, "loss": 0.0001, "num_input_tokens_seen": 89930512, "step": 133420 }, { "epoch": 3.259594947841595, "grad_norm": 0.5774443745613098, "learning_rate": 6.517120902029281e-07, "loss": 0.0003, "num_input_tokens_seen": 89933776, "step": 133425 }, { "epoch": 3.259717098673442, "grad_norm": 0.0017166045727208257, "learning_rate": 6.516321533042659e-07, "loss": 0.0, "num_input_tokens_seen": 89936976, "step": 133430 }, { "epoch": 3.2598392495052892, "grad_norm": 0.013317457400262356, "learning_rate": 6.515522189390286e-07, "loss": 0.0309, "num_input_tokens_seen": 89940368, "step": 133435 }, { "epoch": 3.2599614003371364, "grad_norm": 0.0011091303313151002, "learning_rate": 6.514722871077969e-07, "loss": 0.0, "num_input_tokens_seen": 89943376, "step": 133440 }, { "epoch": 3.2600835511689836, "grad_norm": 0.002225500764325261, "learning_rate": 6.513923578111525e-07, "loss": 0.0, "num_input_tokens_seen": 89946320, "step": 133445 }, { "epoch": 3.260205702000831, "grad_norm": 0.01922401413321495, "learning_rate": 6.513124310496769e-07, "loss": 0.0, "num_input_tokens_seen": 89949648, "step": 133450 }, { "epoch": 3.2603278528326776, "grad_norm": 0.00025927339447662234, "learning_rate": 6.512325068239508e-07, "loss": 0.0, "num_input_tokens_seen": 89952912, "step": 133455 }, { "epoch": 3.260450003664525, "grad_norm": 0.0014124533627182245, "learning_rate": 6.511525851345562e-07, "loss": 0.0, "num_input_tokens_seen": 89956560, "step": 133460 }, { "epoch": 3.260572154496372, "grad_norm": 0.027523819357156754, "learning_rate": 6.510726659820733e-07, "loss": 0.0265, "num_input_tokens_seen": 89959632, "step": 133465 }, { "epoch": 3.260694305328219, "grad_norm": 0.0057300967164337635, "learning_rate": 6.509927493670842e-07, "loss": 0.0002, "num_input_tokens_seen": 89962704, "step": 133470 }, { "epoch": 3.2608164561600663, "grad_norm": 0.004024703986942768, "learning_rate": 6.509128352901694e-07, "loss": 0.0, "num_input_tokens_seen": 89965712, "step": 133475 }, { "epoch": 3.2609386069919135, "grad_norm": 0.000581703323405236, "learning_rate": 6.508329237519106e-07, "loss": 0.0, "num_input_tokens_seen": 89968976, "step": 133480 }, { "epoch": 3.2610607578237607, "grad_norm": 0.00022578010975848883, "learning_rate": 6.507530147528888e-07, "loss": 0.0, "num_input_tokens_seen": 89972176, "step": 133485 }, { "epoch": 3.261182908655608, "grad_norm": 3.6321653169579804e-05, "learning_rate": 6.506731082936845e-07, "loss": 0.0, "num_input_tokens_seen": 89975056, "step": 133490 }, { "epoch": 3.261305059487455, "grad_norm": 0.0028527742251753807, "learning_rate": 6.505932043748798e-07, "loss": 0.0, "num_input_tokens_seen": 89978640, "step": 133495 }, { "epoch": 3.2614272103193023, "grad_norm": 2.6254167556762695, "learning_rate": 6.505133029970551e-07, "loss": 0.0002, "num_input_tokens_seen": 89982288, "step": 133500 }, { "epoch": 3.2615493611511495, "grad_norm": 0.0006126816151663661, "learning_rate": 6.504334041607914e-07, "loss": 0.0, "num_input_tokens_seen": 89986064, "step": 133505 }, { "epoch": 3.2616715119829967, "grad_norm": 0.0002679953468032181, "learning_rate": 6.503535078666705e-07, "loss": 0.0, "num_input_tokens_seen": 89989328, "step": 133510 }, { "epoch": 3.261793662814844, "grad_norm": 42.283355712890625, "learning_rate": 6.502736141152724e-07, "loss": 0.0527, "num_input_tokens_seen": 89992592, "step": 133515 }, { "epoch": 3.261915813646691, "grad_norm": 74.90733337402344, "learning_rate": 6.501937229071793e-07, "loss": 0.0548, "num_input_tokens_seen": 89995856, "step": 133520 }, { "epoch": 3.262037964478538, "grad_norm": 0.010414835065603256, "learning_rate": 6.501138342429713e-07, "loss": 0.0, "num_input_tokens_seen": 89999312, "step": 133525 }, { "epoch": 3.2621601153103854, "grad_norm": 0.002175811445340514, "learning_rate": 6.500339481232296e-07, "loss": 0.0, "num_input_tokens_seen": 90002512, "step": 133530 }, { "epoch": 3.2622822661422326, "grad_norm": 0.003502947511151433, "learning_rate": 6.49954064548535e-07, "loss": 0.0, "num_input_tokens_seen": 90005520, "step": 133535 }, { "epoch": 3.2624044169740793, "grad_norm": 0.00042188982479274273, "learning_rate": 6.498741835194684e-07, "loss": 0.0, "num_input_tokens_seen": 90008656, "step": 133540 }, { "epoch": 3.262526567805927, "grad_norm": 0.005062382202595472, "learning_rate": 6.497943050366115e-07, "loss": 0.0, "num_input_tokens_seen": 90012624, "step": 133545 }, { "epoch": 3.2626487186377737, "grad_norm": 0.0034066669177263975, "learning_rate": 6.49714429100544e-07, "loss": 0.0, "num_input_tokens_seen": 90015632, "step": 133550 }, { "epoch": 3.262770869469621, "grad_norm": 0.0008335746242664754, "learning_rate": 6.496345557118478e-07, "loss": 0.0441, "num_input_tokens_seen": 90019344, "step": 133555 }, { "epoch": 3.262893020301468, "grad_norm": 0.0030794290360063314, "learning_rate": 6.495546848711031e-07, "loss": 0.075, "num_input_tokens_seen": 90022672, "step": 133560 }, { "epoch": 3.2630151711333153, "grad_norm": 0.003941728733479977, "learning_rate": 6.494748165788912e-07, "loss": 0.0, "num_input_tokens_seen": 90026128, "step": 133565 }, { "epoch": 3.2631373219651625, "grad_norm": 0.0013275842647999525, "learning_rate": 6.493949508357926e-07, "loss": 0.0004, "num_input_tokens_seen": 90029456, "step": 133570 }, { "epoch": 3.2632594727970097, "grad_norm": 0.003424836788326502, "learning_rate": 6.493150876423882e-07, "loss": 0.0, "num_input_tokens_seen": 90032592, "step": 133575 }, { "epoch": 3.263381623628857, "grad_norm": 0.0038157880771905184, "learning_rate": 6.492352269992588e-07, "loss": 0.0, "num_input_tokens_seen": 90036240, "step": 133580 }, { "epoch": 3.263503774460704, "grad_norm": 0.00199596188031137, "learning_rate": 6.491553689069853e-07, "loss": 0.0, "num_input_tokens_seen": 90039376, "step": 133585 }, { "epoch": 3.2636259252925512, "grad_norm": 0.03811373934149742, "learning_rate": 6.490755133661484e-07, "loss": 0.1105, "num_input_tokens_seen": 90042448, "step": 133590 }, { "epoch": 3.2637480761243984, "grad_norm": 0.0061827911995351315, "learning_rate": 6.489956603773284e-07, "loss": 0.0406, "num_input_tokens_seen": 90045648, "step": 133595 }, { "epoch": 3.2638702269562456, "grad_norm": 0.0010410583345219493, "learning_rate": 6.489158099411062e-07, "loss": 0.0001, "num_input_tokens_seen": 90049104, "step": 133600 }, { "epoch": 3.263992377788093, "grad_norm": 0.0003505503118503839, "learning_rate": 6.488359620580634e-07, "loss": 0.056, "num_input_tokens_seen": 90052560, "step": 133605 }, { "epoch": 3.26411452861994, "grad_norm": 0.0007625527214258909, "learning_rate": 6.487561167287794e-07, "loss": 0.0, "num_input_tokens_seen": 90055760, "step": 133610 }, { "epoch": 3.264236679451787, "grad_norm": 0.003039886010810733, "learning_rate": 6.486762739538356e-07, "loss": 0.0, "num_input_tokens_seen": 90059216, "step": 133615 }, { "epoch": 3.2643588302836344, "grad_norm": 0.00022629981685895473, "learning_rate": 6.485964337338124e-07, "loss": 0.0348, "num_input_tokens_seen": 90062672, "step": 133620 }, { "epoch": 3.2644809811154816, "grad_norm": 0.0004150950408075005, "learning_rate": 6.485165960692906e-07, "loss": 0.0686, "num_input_tokens_seen": 90065808, "step": 133625 }, { "epoch": 3.2646031319473288, "grad_norm": 0.008487959392368793, "learning_rate": 6.484367609608502e-07, "loss": 0.0004, "num_input_tokens_seen": 90069392, "step": 133630 }, { "epoch": 3.2647252827791755, "grad_norm": 0.00031680116080679, "learning_rate": 6.483569284090725e-07, "loss": 0.0, "num_input_tokens_seen": 90072400, "step": 133635 }, { "epoch": 3.2648474336110227, "grad_norm": 0.0003479302395135164, "learning_rate": 6.482770984145381e-07, "loss": 0.0, "num_input_tokens_seen": 90076048, "step": 133640 }, { "epoch": 3.26496958444287, "grad_norm": 0.03978421166539192, "learning_rate": 6.481972709778267e-07, "loss": 0.0, "num_input_tokens_seen": 90078992, "step": 133645 }, { "epoch": 3.265091735274717, "grad_norm": 0.0023317488376051188, "learning_rate": 6.481174460995198e-07, "loss": 0.0, "num_input_tokens_seen": 90082576, "step": 133650 }, { "epoch": 3.2652138861065643, "grad_norm": 0.0005702111520804465, "learning_rate": 6.480376237801973e-07, "loss": 0.0, "num_input_tokens_seen": 90085712, "step": 133655 }, { "epoch": 3.2653360369384115, "grad_norm": 0.0005525056039914489, "learning_rate": 6.479578040204396e-07, "loss": 0.0, "num_input_tokens_seen": 90088848, "step": 133660 }, { "epoch": 3.2654581877702586, "grad_norm": 0.0002772066800389439, "learning_rate": 6.478779868208278e-07, "loss": 0.0, "num_input_tokens_seen": 90091920, "step": 133665 }, { "epoch": 3.265580338602106, "grad_norm": 0.003932084422558546, "learning_rate": 6.477981721819416e-07, "loss": 0.0, "num_input_tokens_seen": 90095120, "step": 133670 }, { "epoch": 3.265702489433953, "grad_norm": 0.0019422955811023712, "learning_rate": 6.477183601043625e-07, "loss": 0.0, "num_input_tokens_seen": 90098192, "step": 133675 }, { "epoch": 3.2658246402658, "grad_norm": 0.010535065084695816, "learning_rate": 6.476385505886698e-07, "loss": 0.0, "num_input_tokens_seen": 90101264, "step": 133680 }, { "epoch": 3.2659467910976474, "grad_norm": 0.0017384805250912905, "learning_rate": 6.475587436354443e-07, "loss": 0.0, "num_input_tokens_seen": 90104400, "step": 133685 }, { "epoch": 3.2660689419294946, "grad_norm": 0.0010419674217700958, "learning_rate": 6.474789392452666e-07, "loss": 0.0, "num_input_tokens_seen": 90108112, "step": 133690 }, { "epoch": 3.266191092761342, "grad_norm": 0.016605474054813385, "learning_rate": 6.473991374187166e-07, "loss": 0.0001, "num_input_tokens_seen": 90111056, "step": 133695 }, { "epoch": 3.266313243593189, "grad_norm": 0.07536473870277405, "learning_rate": 6.473193381563753e-07, "loss": 0.0, "num_input_tokens_seen": 90114512, "step": 133700 }, { "epoch": 3.266435394425036, "grad_norm": 0.0019029227551072836, "learning_rate": 6.472395414588222e-07, "loss": 0.0, "num_input_tokens_seen": 90117840, "step": 133705 }, { "epoch": 3.2665575452568834, "grad_norm": 0.00034090832923538983, "learning_rate": 6.471597473266385e-07, "loss": 0.0359, "num_input_tokens_seen": 90121424, "step": 133710 }, { "epoch": 3.2666796960887305, "grad_norm": 0.027212005108594894, "learning_rate": 6.470799557604035e-07, "loss": 0.0, "num_input_tokens_seen": 90124880, "step": 133715 }, { "epoch": 3.2668018469205773, "grad_norm": 0.0004326793714426458, "learning_rate": 6.470001667606986e-07, "loss": 0.0, "num_input_tokens_seen": 90127888, "step": 133720 }, { "epoch": 3.266923997752425, "grad_norm": 0.0014250059612095356, "learning_rate": 6.469203803281027e-07, "loss": 0.0, "num_input_tokens_seen": 90131344, "step": 133725 }, { "epoch": 3.2670461485842717, "grad_norm": 0.03089824691414833, "learning_rate": 6.468405964631972e-07, "loss": 0.0453, "num_input_tokens_seen": 90134288, "step": 133730 }, { "epoch": 3.267168299416119, "grad_norm": 0.0011771543649956584, "learning_rate": 6.467608151665618e-07, "loss": 0.0, "num_input_tokens_seen": 90137616, "step": 133735 }, { "epoch": 3.267290450247966, "grad_norm": 0.0012687522685155272, "learning_rate": 6.46681036438777e-07, "loss": 0.0, "num_input_tokens_seen": 90141264, "step": 133740 }, { "epoch": 3.2674126010798132, "grad_norm": 0.0001758733851602301, "learning_rate": 6.466012602804225e-07, "loss": 0.0, "num_input_tokens_seen": 90144400, "step": 133745 }, { "epoch": 3.2675347519116604, "grad_norm": 0.0003743020643014461, "learning_rate": 6.465214866920785e-07, "loss": 0.0, "num_input_tokens_seen": 90147856, "step": 133750 }, { "epoch": 3.2676569027435076, "grad_norm": 0.024355005472898483, "learning_rate": 6.464417156743253e-07, "loss": 0.0002, "num_input_tokens_seen": 90151504, "step": 133755 }, { "epoch": 3.267779053575355, "grad_norm": 0.020076554268598557, "learning_rate": 6.463619472277436e-07, "loss": 0.0, "num_input_tokens_seen": 90155088, "step": 133760 }, { "epoch": 3.267901204407202, "grad_norm": 9.398308611707762e-05, "learning_rate": 6.462821813529125e-07, "loss": 0.0, "num_input_tokens_seen": 90158032, "step": 133765 }, { "epoch": 3.268023355239049, "grad_norm": 0.00041264158789999783, "learning_rate": 6.462024180504128e-07, "loss": 0.0043, "num_input_tokens_seen": 90161040, "step": 133770 }, { "epoch": 3.2681455060708964, "grad_norm": 0.00012087931827409193, "learning_rate": 6.461226573208239e-07, "loss": 0.0001, "num_input_tokens_seen": 90164368, "step": 133775 }, { "epoch": 3.2682676569027436, "grad_norm": 7.979868678376079e-05, "learning_rate": 6.460428991647265e-07, "loss": 0.0, "num_input_tokens_seen": 90167504, "step": 133780 }, { "epoch": 3.2683898077345908, "grad_norm": 0.008352191187441349, "learning_rate": 6.459631435827001e-07, "loss": 0.0, "num_input_tokens_seen": 90170960, "step": 133785 }, { "epoch": 3.268511958566438, "grad_norm": 0.0003512290713842958, "learning_rate": 6.458833905753251e-07, "loss": 0.0, "num_input_tokens_seen": 90174160, "step": 133790 }, { "epoch": 3.268634109398285, "grad_norm": 8.37286570458673e-05, "learning_rate": 6.458036401431816e-07, "loss": 0.0, "num_input_tokens_seen": 90177616, "step": 133795 }, { "epoch": 3.2687562602301323, "grad_norm": 0.00048232992412522435, "learning_rate": 6.457238922868487e-07, "loss": 0.0, "num_input_tokens_seen": 90181200, "step": 133800 }, { "epoch": 3.2688784110619795, "grad_norm": 0.0005159936263225973, "learning_rate": 6.456441470069076e-07, "loss": 0.0, "num_input_tokens_seen": 90184656, "step": 133805 }, { "epoch": 3.2690005618938267, "grad_norm": 0.0002815297630149871, "learning_rate": 6.45564404303937e-07, "loss": 0.0, "num_input_tokens_seen": 90187920, "step": 133810 }, { "epoch": 3.2691227127256735, "grad_norm": 0.0007183253183029592, "learning_rate": 6.454846641785174e-07, "loss": 0.0, "num_input_tokens_seen": 90191184, "step": 133815 }, { "epoch": 3.2692448635575206, "grad_norm": 0.007655336521565914, "learning_rate": 6.454049266312291e-07, "loss": 0.0, "num_input_tokens_seen": 90194448, "step": 133820 }, { "epoch": 3.269367014389368, "grad_norm": 0.015166401863098145, "learning_rate": 6.453251916626512e-07, "loss": 0.0, "num_input_tokens_seen": 90197584, "step": 133825 }, { "epoch": 3.269489165221215, "grad_norm": 0.00043149187695235014, "learning_rate": 6.452454592733642e-07, "loss": 0.0, "num_input_tokens_seen": 90200720, "step": 133830 }, { "epoch": 3.269611316053062, "grad_norm": 93.59136199951172, "learning_rate": 6.451657294639475e-07, "loss": 0.0838, "num_input_tokens_seen": 90204496, "step": 133835 }, { "epoch": 3.2697334668849094, "grad_norm": 0.06872415542602539, "learning_rate": 6.450860022349811e-07, "loss": 0.0318, "num_input_tokens_seen": 90208144, "step": 133840 }, { "epoch": 3.2698556177167566, "grad_norm": 0.00017835324979387224, "learning_rate": 6.450062775870446e-07, "loss": 0.0204, "num_input_tokens_seen": 90211600, "step": 133845 }, { "epoch": 3.269977768548604, "grad_norm": 0.0010258352849632502, "learning_rate": 6.44926555520718e-07, "loss": 0.0, "num_input_tokens_seen": 90214928, "step": 133850 }, { "epoch": 3.270099919380451, "grad_norm": 0.0022669630125164986, "learning_rate": 6.44846836036581e-07, "loss": 0.0451, "num_input_tokens_seen": 90218640, "step": 133855 }, { "epoch": 3.270222070212298, "grad_norm": 1563.13525390625, "learning_rate": 6.447671191352134e-07, "loss": 0.105, "num_input_tokens_seen": 90221840, "step": 133860 }, { "epoch": 3.2703442210441453, "grad_norm": 0.008289139717817307, "learning_rate": 6.446874048171948e-07, "loss": 0.0, "num_input_tokens_seen": 90225168, "step": 133865 }, { "epoch": 3.2704663718759925, "grad_norm": 0.003361885203048587, "learning_rate": 6.446076930831049e-07, "loss": 0.0, "num_input_tokens_seen": 90228368, "step": 133870 }, { "epoch": 3.2705885227078397, "grad_norm": 0.01708057150244713, "learning_rate": 6.445279839335237e-07, "loss": 0.0, "num_input_tokens_seen": 90231504, "step": 133875 }, { "epoch": 3.270710673539687, "grad_norm": 0.010200517252087593, "learning_rate": 6.444482773690303e-07, "loss": 0.0, "num_input_tokens_seen": 90234832, "step": 133880 }, { "epoch": 3.270832824371534, "grad_norm": 0.0008809025166556239, "learning_rate": 6.443685733902046e-07, "loss": 0.0451, "num_input_tokens_seen": 90238032, "step": 133885 }, { "epoch": 3.2709549752033813, "grad_norm": 0.018013739958405495, "learning_rate": 6.442888719976266e-07, "loss": 0.0, "num_input_tokens_seen": 90241488, "step": 133890 }, { "epoch": 3.2710771260352285, "grad_norm": 0.00027410429902374744, "learning_rate": 6.442091731918756e-07, "loss": 0.0001, "num_input_tokens_seen": 90244944, "step": 133895 }, { "epoch": 3.2711992768670752, "grad_norm": 0.09898103773593903, "learning_rate": 6.441294769735312e-07, "loss": 0.0, "num_input_tokens_seen": 90248656, "step": 133900 }, { "epoch": 3.271321427698923, "grad_norm": 0.0009784942958503962, "learning_rate": 6.440497833431726e-07, "loss": 0.0203, "num_input_tokens_seen": 90251984, "step": 133905 }, { "epoch": 3.2714435785307696, "grad_norm": 0.003985030576586723, "learning_rate": 6.439700923013798e-07, "loss": 0.0, "num_input_tokens_seen": 90254992, "step": 133910 }, { "epoch": 3.271565729362617, "grad_norm": 0.02826862223446369, "learning_rate": 6.438904038487325e-07, "loss": 0.0, "num_input_tokens_seen": 90258256, "step": 133915 }, { "epoch": 3.271687880194464, "grad_norm": 0.00011547313624760136, "learning_rate": 6.438107179858097e-07, "loss": 0.0, "num_input_tokens_seen": 90261520, "step": 133920 }, { "epoch": 3.271810031026311, "grad_norm": 73.17760467529297, "learning_rate": 6.437310347131915e-07, "loss": 0.0688, "num_input_tokens_seen": 90264848, "step": 133925 }, { "epoch": 3.2719321818581584, "grad_norm": 0.0009607910178601742, "learning_rate": 6.436513540314566e-07, "loss": 0.0, "num_input_tokens_seen": 90268304, "step": 133930 }, { "epoch": 3.2720543326900056, "grad_norm": 0.014877952635288239, "learning_rate": 6.435716759411853e-07, "loss": 0.0, "num_input_tokens_seen": 90271632, "step": 133935 }, { "epoch": 3.2721764835218528, "grad_norm": 97.39641571044922, "learning_rate": 6.434920004429564e-07, "loss": 0.0115, "num_input_tokens_seen": 90275280, "step": 133940 }, { "epoch": 3.2722986343537, "grad_norm": 0.002261359943076968, "learning_rate": 6.434123275373496e-07, "loss": 0.0, "num_input_tokens_seen": 90278544, "step": 133945 }, { "epoch": 3.272420785185547, "grad_norm": 15.63397216796875, "learning_rate": 6.433326572249446e-07, "loss": 0.0225, "num_input_tokens_seen": 90281552, "step": 133950 }, { "epoch": 3.2725429360173943, "grad_norm": 0.01223745010793209, "learning_rate": 6.432529895063199e-07, "loss": 0.0, "num_input_tokens_seen": 90285200, "step": 133955 }, { "epoch": 3.2726650868492415, "grad_norm": 0.0026963527780026197, "learning_rate": 6.431733243820558e-07, "loss": 0.0001, "num_input_tokens_seen": 90289040, "step": 133960 }, { "epoch": 3.2727872376810887, "grad_norm": 0.005582587327808142, "learning_rate": 6.430936618527311e-07, "loss": 0.0, "num_input_tokens_seen": 90292752, "step": 133965 }, { "epoch": 3.272909388512936, "grad_norm": 0.0012265495024621487, "learning_rate": 6.430140019189254e-07, "loss": 0.0, "num_input_tokens_seen": 90296336, "step": 133970 }, { "epoch": 3.273031539344783, "grad_norm": 0.01831926219165325, "learning_rate": 6.429343445812177e-07, "loss": 0.0, "num_input_tokens_seen": 90299280, "step": 133975 }, { "epoch": 3.2731536901766303, "grad_norm": 0.00041980401147156954, "learning_rate": 6.428546898401874e-07, "loss": 0.0, "num_input_tokens_seen": 90302416, "step": 133980 }, { "epoch": 3.2732758410084775, "grad_norm": 0.0010928019182756543, "learning_rate": 6.427750376964143e-07, "loss": 0.0, "num_input_tokens_seen": 90305616, "step": 133985 }, { "epoch": 3.2733979918403247, "grad_norm": 0.0043005309998989105, "learning_rate": 6.42695388150477e-07, "loss": 0.0, "num_input_tokens_seen": 90309456, "step": 133990 }, { "epoch": 3.2735201426721714, "grad_norm": 0.01648613251745701, "learning_rate": 6.426157412029549e-07, "loss": 0.0027, "num_input_tokens_seen": 90312720, "step": 133995 }, { "epoch": 3.2736422935040186, "grad_norm": 0.001249257242307067, "learning_rate": 6.425360968544272e-07, "loss": 0.0, "num_input_tokens_seen": 90316304, "step": 134000 }, { "epoch": 3.2737644443358658, "grad_norm": 0.0011085321893915534, "learning_rate": 6.42456455105473e-07, "loss": 0.0021, "num_input_tokens_seen": 90319376, "step": 134005 }, { "epoch": 3.273886595167713, "grad_norm": 0.0005377625930123031, "learning_rate": 6.42376815956672e-07, "loss": 0.0, "num_input_tokens_seen": 90322832, "step": 134010 }, { "epoch": 3.27400874599956, "grad_norm": 0.05542025715112686, "learning_rate": 6.422971794086028e-07, "loss": 0.0, "num_input_tokens_seen": 90325840, "step": 134015 }, { "epoch": 3.2741308968314073, "grad_norm": 23.629776000976562, "learning_rate": 6.422175454618448e-07, "loss": 0.0455, "num_input_tokens_seen": 90329488, "step": 134020 }, { "epoch": 3.2742530476632545, "grad_norm": 0.024139046669006348, "learning_rate": 6.421379141169769e-07, "loss": 0.095, "num_input_tokens_seen": 90333072, "step": 134025 }, { "epoch": 3.2743751984951017, "grad_norm": 0.005243145395070314, "learning_rate": 6.420582853745787e-07, "loss": 0.0001, "num_input_tokens_seen": 90336144, "step": 134030 }, { "epoch": 3.274497349326949, "grad_norm": 0.0035617812536656857, "learning_rate": 6.419786592352283e-07, "loss": 0.0591, "num_input_tokens_seen": 90339664, "step": 134035 }, { "epoch": 3.274619500158796, "grad_norm": 0.011242561042308807, "learning_rate": 6.418990356995058e-07, "loss": 0.0, "num_input_tokens_seen": 90342736, "step": 134040 }, { "epoch": 3.2747416509906433, "grad_norm": 0.01859310083091259, "learning_rate": 6.418194147679898e-07, "loss": 0.0, "num_input_tokens_seen": 90346128, "step": 134045 }, { "epoch": 3.2748638018224905, "grad_norm": 0.05717771500349045, "learning_rate": 6.417397964412594e-07, "loss": 0.0, "num_input_tokens_seen": 90349520, "step": 134050 }, { "epoch": 3.2749859526543377, "grad_norm": 0.004490395076572895, "learning_rate": 6.416601807198936e-07, "loss": 0.0214, "num_input_tokens_seen": 90352848, "step": 134055 }, { "epoch": 3.275108103486185, "grad_norm": 224.87542724609375, "learning_rate": 6.41580567604471e-07, "loss": 0.0247, "num_input_tokens_seen": 90356432, "step": 134060 }, { "epoch": 3.275230254318032, "grad_norm": 0.0020325894001871347, "learning_rate": 6.415009570955709e-07, "loss": 0.0001, "num_input_tokens_seen": 90359632, "step": 134065 }, { "epoch": 3.2753524051498792, "grad_norm": 0.008936960250139236, "learning_rate": 6.414213491937728e-07, "loss": 0.0348, "num_input_tokens_seen": 90363536, "step": 134070 }, { "epoch": 3.2754745559817264, "grad_norm": 0.0028429508674889803, "learning_rate": 6.413417438996547e-07, "loss": 0.0001, "num_input_tokens_seen": 90366928, "step": 134075 }, { "epoch": 3.275596706813573, "grad_norm": 0.08826093375682831, "learning_rate": 6.412621412137962e-07, "loss": 0.0001, "num_input_tokens_seen": 90370256, "step": 134080 }, { "epoch": 3.275718857645421, "grad_norm": 0.000634827243629843, "learning_rate": 6.411825411367755e-07, "loss": 0.0001, "num_input_tokens_seen": 90373456, "step": 134085 }, { "epoch": 3.2758410084772676, "grad_norm": 0.002248347969725728, "learning_rate": 6.411029436691723e-07, "loss": 0.0, "num_input_tokens_seen": 90376592, "step": 134090 }, { "epoch": 3.2759631593091147, "grad_norm": 0.04211549088358879, "learning_rate": 6.410233488115646e-07, "loss": 0.0, "num_input_tokens_seen": 90380112, "step": 134095 }, { "epoch": 3.276085310140962, "grad_norm": 2.0768942704307847e-05, "learning_rate": 6.409437565645319e-07, "loss": 0.0, "num_input_tokens_seen": 90383568, "step": 134100 }, { "epoch": 3.276207460972809, "grad_norm": 0.011450367048382759, "learning_rate": 6.408641669286529e-07, "loss": 0.0, "num_input_tokens_seen": 90386960, "step": 134105 }, { "epoch": 3.2763296118046563, "grad_norm": 0.007889004424214363, "learning_rate": 6.40784579904506e-07, "loss": 0.0024, "num_input_tokens_seen": 90390096, "step": 134110 }, { "epoch": 3.2764517626365035, "grad_norm": 0.0003262287937104702, "learning_rate": 6.407049954926705e-07, "loss": 0.0653, "num_input_tokens_seen": 90393424, "step": 134115 }, { "epoch": 3.2765739134683507, "grad_norm": 0.008806253783404827, "learning_rate": 6.406254136937246e-07, "loss": 0.0, "num_input_tokens_seen": 90396624, "step": 134120 }, { "epoch": 3.276696064300198, "grad_norm": 9.64621503953822e-05, "learning_rate": 6.405458345082477e-07, "loss": 0.0001, "num_input_tokens_seen": 90399824, "step": 134125 }, { "epoch": 3.276818215132045, "grad_norm": 0.010716472752392292, "learning_rate": 6.404662579368178e-07, "loss": 0.0002, "num_input_tokens_seen": 90403344, "step": 134130 }, { "epoch": 3.2769403659638923, "grad_norm": 88.94921112060547, "learning_rate": 6.403866839800141e-07, "loss": 0.1229, "num_input_tokens_seen": 90406416, "step": 134135 }, { "epoch": 3.2770625167957395, "grad_norm": 0.004718456882983446, "learning_rate": 6.403071126384154e-07, "loss": 0.0001, "num_input_tokens_seen": 90410064, "step": 134140 }, { "epoch": 3.2771846676275866, "grad_norm": 0.0020348127000033855, "learning_rate": 6.402275439126e-07, "loss": 0.0, "num_input_tokens_seen": 90413328, "step": 134145 }, { "epoch": 3.277306818459434, "grad_norm": 0.0047275470569729805, "learning_rate": 6.401479778031467e-07, "loss": 0.0, "num_input_tokens_seen": 90416592, "step": 134150 }, { "epoch": 3.277428969291281, "grad_norm": 0.002068186178803444, "learning_rate": 6.40068414310634e-07, "loss": 0.0, "num_input_tokens_seen": 90419664, "step": 134155 }, { "epoch": 3.277551120123128, "grad_norm": 0.002157870912924409, "learning_rate": 6.399888534356404e-07, "loss": 0.0, "num_input_tokens_seen": 90422928, "step": 134160 }, { "epoch": 3.277673270954975, "grad_norm": 0.006738911848515272, "learning_rate": 6.399092951787451e-07, "loss": 0.0, "num_input_tokens_seen": 90426256, "step": 134165 }, { "epoch": 3.2777954217868226, "grad_norm": 0.01319417729973793, "learning_rate": 6.398297395405259e-07, "loss": 0.0, "num_input_tokens_seen": 90429712, "step": 134170 }, { "epoch": 3.2779175726186693, "grad_norm": 298.2821350097656, "learning_rate": 6.39750186521562e-07, "loss": 0.0214, "num_input_tokens_seen": 90432912, "step": 134175 }, { "epoch": 3.2780397234505165, "grad_norm": 0.002581668319180608, "learning_rate": 6.396706361224313e-07, "loss": 0.0, "num_input_tokens_seen": 90436624, "step": 134180 }, { "epoch": 3.2781618742823637, "grad_norm": 0.00018515576084610075, "learning_rate": 6.395910883437132e-07, "loss": 0.0, "num_input_tokens_seen": 90440016, "step": 134185 }, { "epoch": 3.278284025114211, "grad_norm": 0.0025104996748268604, "learning_rate": 6.395115431859851e-07, "loss": 0.0, "num_input_tokens_seen": 90443344, "step": 134190 }, { "epoch": 3.278406175946058, "grad_norm": 0.01018584705889225, "learning_rate": 6.394320006498262e-07, "loss": 0.0, "num_input_tokens_seen": 90446608, "step": 134195 }, { "epoch": 3.2785283267779053, "grad_norm": 0.48450037837028503, "learning_rate": 6.393524607358149e-07, "loss": 0.006, "num_input_tokens_seen": 90449808, "step": 134200 }, { "epoch": 3.2786504776097525, "grad_norm": 0.00024042440054472536, "learning_rate": 6.392729234445293e-07, "loss": 0.0, "num_input_tokens_seen": 90454352, "step": 134205 }, { "epoch": 3.2787726284415997, "grad_norm": 0.01791437901556492, "learning_rate": 6.391933887765484e-07, "loss": 0.0001, "num_input_tokens_seen": 90458128, "step": 134210 }, { "epoch": 3.278894779273447, "grad_norm": 0.034773245453834534, "learning_rate": 6.391138567324497e-07, "loss": 0.0, "num_input_tokens_seen": 90461776, "step": 134215 }, { "epoch": 3.279016930105294, "grad_norm": 0.0008298733155243099, "learning_rate": 6.390343273128123e-07, "loss": 0.0, "num_input_tokens_seen": 90465168, "step": 134220 }, { "epoch": 3.2791390809371412, "grad_norm": 0.00022249763424042612, "learning_rate": 6.389548005182146e-07, "loss": 0.0001, "num_input_tokens_seen": 90468304, "step": 134225 }, { "epoch": 3.2792612317689884, "grad_norm": 0.0016737374244257808, "learning_rate": 6.388752763492344e-07, "loss": 0.0544, "num_input_tokens_seen": 90471760, "step": 134230 }, { "epoch": 3.2793833826008356, "grad_norm": 0.0006355304503813386, "learning_rate": 6.387957548064505e-07, "loss": 0.129, "num_input_tokens_seen": 90474896, "step": 134235 }, { "epoch": 3.279505533432683, "grad_norm": 0.00043898748117499053, "learning_rate": 6.387162358904408e-07, "loss": 0.0568, "num_input_tokens_seen": 90478096, "step": 134240 }, { "epoch": 3.27962768426453, "grad_norm": 0.002303024521097541, "learning_rate": 6.386367196017842e-07, "loss": 0.0546, "num_input_tokens_seen": 90482064, "step": 134245 }, { "epoch": 3.279749835096377, "grad_norm": 0.11519376933574677, "learning_rate": 6.385572059410583e-07, "loss": 0.0775, "num_input_tokens_seen": 90485456, "step": 134250 }, { "epoch": 3.2798719859282244, "grad_norm": 0.0054914443753659725, "learning_rate": 6.384776949088416e-07, "loss": 0.0155, "num_input_tokens_seen": 90488464, "step": 134255 }, { "epoch": 3.279994136760071, "grad_norm": 0.024513721466064453, "learning_rate": 6.383981865057125e-07, "loss": 0.0, "num_input_tokens_seen": 90491920, "step": 134260 }, { "epoch": 3.2801162875919183, "grad_norm": 0.07861457020044327, "learning_rate": 6.383186807322488e-07, "loss": 0.0, "num_input_tokens_seen": 90495504, "step": 134265 }, { "epoch": 3.2802384384237655, "grad_norm": 0.002974571194499731, "learning_rate": 6.382391775890293e-07, "loss": 0.1083, "num_input_tokens_seen": 90498896, "step": 134270 }, { "epoch": 3.2803605892556127, "grad_norm": 0.009940390475094318, "learning_rate": 6.381596770766313e-07, "loss": 0.0001, "num_input_tokens_seen": 90502480, "step": 134275 }, { "epoch": 3.28048274008746, "grad_norm": 0.8798280954360962, "learning_rate": 6.380801791956341e-07, "loss": 0.0007, "num_input_tokens_seen": 90505552, "step": 134280 }, { "epoch": 3.280604890919307, "grad_norm": 0.007819697260856628, "learning_rate": 6.380006839466146e-07, "loss": 0.0008, "num_input_tokens_seen": 90509008, "step": 134285 }, { "epoch": 3.2807270417511543, "grad_norm": 0.001056055654771626, "learning_rate": 6.379211913301514e-07, "loss": 0.0192, "num_input_tokens_seen": 90512592, "step": 134290 }, { "epoch": 3.2808491925830015, "grad_norm": 0.015260012820363045, "learning_rate": 6.378417013468233e-07, "loss": 0.0, "num_input_tokens_seen": 90515536, "step": 134295 }, { "epoch": 3.2809713434148486, "grad_norm": 0.0004757667484227568, "learning_rate": 6.377622139972074e-07, "loss": 0.0, "num_input_tokens_seen": 90518800, "step": 134300 }, { "epoch": 3.281093494246696, "grad_norm": 0.014724750071763992, "learning_rate": 6.376827292818822e-07, "loss": 0.0001, "num_input_tokens_seen": 90522000, "step": 134305 }, { "epoch": 3.281215645078543, "grad_norm": 0.001603720709681511, "learning_rate": 6.376032472014256e-07, "loss": 0.0, "num_input_tokens_seen": 90525200, "step": 134310 }, { "epoch": 3.28133779591039, "grad_norm": 13.405024528503418, "learning_rate": 6.375237677564154e-07, "loss": 0.029, "num_input_tokens_seen": 90528272, "step": 134315 }, { "epoch": 3.2814599467422374, "grad_norm": 0.00246247136965394, "learning_rate": 6.374442909474304e-07, "loss": 0.0, "num_input_tokens_seen": 90531216, "step": 134320 }, { "epoch": 3.2815820975740846, "grad_norm": 0.002576568629592657, "learning_rate": 6.373648167750475e-07, "loss": 0.0001, "num_input_tokens_seen": 90534672, "step": 134325 }, { "epoch": 3.281704248405932, "grad_norm": 0.004765174351632595, "learning_rate": 6.372853452398457e-07, "loss": 0.0, "num_input_tokens_seen": 90538000, "step": 134330 }, { "epoch": 3.281826399237779, "grad_norm": 0.07081875205039978, "learning_rate": 6.37205876342402e-07, "loss": 0.0002, "num_input_tokens_seen": 90541392, "step": 134335 }, { "epoch": 3.281948550069626, "grad_norm": 0.35364988446235657, "learning_rate": 6.371264100832951e-07, "loss": 0.057, "num_input_tokens_seen": 90544976, "step": 134340 }, { "epoch": 3.282070700901473, "grad_norm": 0.006533215753734112, "learning_rate": 6.370469464631021e-07, "loss": 0.0, "num_input_tokens_seen": 90547920, "step": 134345 }, { "epoch": 3.2821928517333205, "grad_norm": 0.04874301329255104, "learning_rate": 6.369674854824017e-07, "loss": 0.0, "num_input_tokens_seen": 90550992, "step": 134350 }, { "epoch": 3.2823150025651673, "grad_norm": 0.0069482699036598206, "learning_rate": 6.368880271417712e-07, "loss": 0.0001, "num_input_tokens_seen": 90553872, "step": 134355 }, { "epoch": 3.2824371533970145, "grad_norm": 0.0007645434234291315, "learning_rate": 6.368085714417888e-07, "loss": 0.0, "num_input_tokens_seen": 90557584, "step": 134360 }, { "epoch": 3.2825593042288617, "grad_norm": 0.0026880523655563593, "learning_rate": 6.367291183830322e-07, "loss": 0.0, "num_input_tokens_seen": 90561104, "step": 134365 }, { "epoch": 3.282681455060709, "grad_norm": 40.059661865234375, "learning_rate": 6.366496679660789e-07, "loss": 0.0144, "num_input_tokens_seen": 90564560, "step": 134370 }, { "epoch": 3.282803605892556, "grad_norm": 0.848136305809021, "learning_rate": 6.365702201915069e-07, "loss": 0.0507, "num_input_tokens_seen": 90567632, "step": 134375 }, { "epoch": 3.2829257567244032, "grad_norm": 0.01674298197031021, "learning_rate": 6.364907750598942e-07, "loss": 0.0, "num_input_tokens_seen": 90571088, "step": 134380 }, { "epoch": 3.2830479075562504, "grad_norm": 0.0930301696062088, "learning_rate": 6.364113325718183e-07, "loss": 0.0, "num_input_tokens_seen": 90574416, "step": 134385 }, { "epoch": 3.2831700583880976, "grad_norm": 0.024816663935780525, "learning_rate": 6.363318927278571e-07, "loss": 0.0, "num_input_tokens_seen": 90577744, "step": 134390 }, { "epoch": 3.283292209219945, "grad_norm": 0.002348346170037985, "learning_rate": 6.36252455528588e-07, "loss": 0.0395, "num_input_tokens_seen": 90580752, "step": 134395 }, { "epoch": 3.283414360051792, "grad_norm": 0.0015244438545778394, "learning_rate": 6.361730209745891e-07, "loss": 0.0001, "num_input_tokens_seen": 90583888, "step": 134400 }, { "epoch": 3.283536510883639, "grad_norm": 0.005531086586415768, "learning_rate": 6.360935890664376e-07, "loss": 0.0, "num_input_tokens_seen": 90587408, "step": 134405 }, { "epoch": 3.2836586617154864, "grad_norm": 0.030012428760528564, "learning_rate": 6.360141598047115e-07, "loss": 0.0, "num_input_tokens_seen": 90591248, "step": 134410 }, { "epoch": 3.2837808125473336, "grad_norm": 0.40428632497787476, "learning_rate": 6.359347331899887e-07, "loss": 0.0001, "num_input_tokens_seen": 90595280, "step": 134415 }, { "epoch": 3.2839029633791808, "grad_norm": 0.03745681792497635, "learning_rate": 6.358553092228458e-07, "loss": 0.0, "num_input_tokens_seen": 90598288, "step": 134420 }, { "epoch": 3.284025114211028, "grad_norm": 0.01733500324189663, "learning_rate": 6.357758879038617e-07, "loss": 0.0001, "num_input_tokens_seen": 90601488, "step": 134425 }, { "epoch": 3.284147265042875, "grad_norm": 0.0021801122929900885, "learning_rate": 6.356964692336127e-07, "loss": 0.0, "num_input_tokens_seen": 90604944, "step": 134430 }, { "epoch": 3.2842694158747223, "grad_norm": 0.00015866855392232537, "learning_rate": 6.356170532126774e-07, "loss": 0.0, "num_input_tokens_seen": 90608272, "step": 134435 }, { "epoch": 3.284391566706569, "grad_norm": 86.30461120605469, "learning_rate": 6.355376398416325e-07, "loss": 0.0718, "num_input_tokens_seen": 90611408, "step": 134440 }, { "epoch": 3.2845137175384163, "grad_norm": 0.0021867312025278807, "learning_rate": 6.354582291210559e-07, "loss": 0.0, "num_input_tokens_seen": 90614288, "step": 134445 }, { "epoch": 3.2846358683702634, "grad_norm": 0.0006792885833419859, "learning_rate": 6.353788210515255e-07, "loss": 0.0, "num_input_tokens_seen": 90617488, "step": 134450 }, { "epoch": 3.2847580192021106, "grad_norm": 0.01702149212360382, "learning_rate": 6.352994156336182e-07, "loss": 0.0, "num_input_tokens_seen": 90620688, "step": 134455 }, { "epoch": 3.284880170033958, "grad_norm": 0.0014157212572172284, "learning_rate": 6.352200128679117e-07, "loss": 0.0001, "num_input_tokens_seen": 90623888, "step": 134460 }, { "epoch": 3.285002320865805, "grad_norm": 0.00022495303710456938, "learning_rate": 6.351406127549834e-07, "loss": 0.0, "num_input_tokens_seen": 90627088, "step": 134465 }, { "epoch": 3.285124471697652, "grad_norm": 159.6038360595703, "learning_rate": 6.350612152954105e-07, "loss": 0.0512, "num_input_tokens_seen": 90631184, "step": 134470 }, { "epoch": 3.2852466225294994, "grad_norm": 19.899839401245117, "learning_rate": 6.349818204897708e-07, "loss": 0.0382, "num_input_tokens_seen": 90634576, "step": 134475 }, { "epoch": 3.2853687733613466, "grad_norm": 0.0002656186406966299, "learning_rate": 6.349024283386413e-07, "loss": 0.0, "num_input_tokens_seen": 90638032, "step": 134480 }, { "epoch": 3.2854909241931938, "grad_norm": 0.0022826252970844507, "learning_rate": 6.348230388425999e-07, "loss": 0.0, "num_input_tokens_seen": 90641424, "step": 134485 }, { "epoch": 3.285613075025041, "grad_norm": 0.0040170783177018166, "learning_rate": 6.347436520022231e-07, "loss": 0.0, "num_input_tokens_seen": 90644688, "step": 134490 }, { "epoch": 3.285735225856888, "grad_norm": 0.02008282206952572, "learning_rate": 6.346642678180891e-07, "loss": 0.0134, "num_input_tokens_seen": 90648016, "step": 134495 }, { "epoch": 3.2858573766887353, "grad_norm": 0.001128724543377757, "learning_rate": 6.345848862907746e-07, "loss": 0.0007, "num_input_tokens_seen": 90651344, "step": 134500 }, { "epoch": 3.2859795275205825, "grad_norm": 0.0005452800542116165, "learning_rate": 6.34505507420857e-07, "loss": 0.0, "num_input_tokens_seen": 90654864, "step": 134505 }, { "epoch": 3.2861016783524297, "grad_norm": 0.026881614699959755, "learning_rate": 6.344261312089138e-07, "loss": 0.0, "num_input_tokens_seen": 90658576, "step": 134510 }, { "epoch": 3.286223829184277, "grad_norm": 0.003952720668166876, "learning_rate": 6.343467576555222e-07, "loss": 0.0002, "num_input_tokens_seen": 90661648, "step": 134515 }, { "epoch": 3.286345980016124, "grad_norm": 0.23101937770843506, "learning_rate": 6.342673867612594e-07, "loss": 0.0001, "num_input_tokens_seen": 90665424, "step": 134520 }, { "epoch": 3.286468130847971, "grad_norm": 0.10918232798576355, "learning_rate": 6.341880185267021e-07, "loss": 0.0001, "num_input_tokens_seen": 90668624, "step": 134525 }, { "epoch": 3.2865902816798185, "grad_norm": 0.0025677045341581106, "learning_rate": 6.34108652952428e-07, "loss": 0.0, "num_input_tokens_seen": 90671952, "step": 134530 }, { "epoch": 3.2867124325116652, "grad_norm": 0.04821763187646866, "learning_rate": 6.340292900390146e-07, "loss": 0.0, "num_input_tokens_seen": 90675536, "step": 134535 }, { "epoch": 3.2868345833435124, "grad_norm": 0.0005546039319597185, "learning_rate": 6.339499297870382e-07, "loss": 0.0, "num_input_tokens_seen": 90678416, "step": 134540 }, { "epoch": 3.2869567341753596, "grad_norm": 0.0003405036695767194, "learning_rate": 6.338705721970768e-07, "loss": 0.0, "num_input_tokens_seen": 90681616, "step": 134545 }, { "epoch": 3.287078885007207, "grad_norm": 0.0017196012195199728, "learning_rate": 6.337912172697066e-07, "loss": 0.0, "num_input_tokens_seen": 90685072, "step": 134550 }, { "epoch": 3.287201035839054, "grad_norm": 6.0141754150390625, "learning_rate": 6.337118650055056e-07, "loss": 0.0003, "num_input_tokens_seen": 90688080, "step": 134555 }, { "epoch": 3.287323186670901, "grad_norm": 0.010194681584835052, "learning_rate": 6.336325154050502e-07, "loss": 0.0002, "num_input_tokens_seen": 90691280, "step": 134560 }, { "epoch": 3.2874453375027484, "grad_norm": 0.0034128446131944656, "learning_rate": 6.335531684689177e-07, "loss": 0.0, "num_input_tokens_seen": 90694928, "step": 134565 }, { "epoch": 3.2875674883345956, "grad_norm": 0.004803723655641079, "learning_rate": 6.334738241976854e-07, "loss": 0.0319, "num_input_tokens_seen": 90698448, "step": 134570 }, { "epoch": 3.2876896391664427, "grad_norm": 0.00041958095971494913, "learning_rate": 6.333944825919295e-07, "loss": 0.0001, "num_input_tokens_seen": 90701648, "step": 134575 }, { "epoch": 3.28781178999829, "grad_norm": 0.0028518533799797297, "learning_rate": 6.333151436522282e-07, "loss": 0.0, "num_input_tokens_seen": 90705424, "step": 134580 }, { "epoch": 3.287933940830137, "grad_norm": 0.0027906023897230625, "learning_rate": 6.33235807379157e-07, "loss": 0.0, "num_input_tokens_seen": 90708880, "step": 134585 }, { "epoch": 3.2880560916619843, "grad_norm": 31.440628051757812, "learning_rate": 6.331564737732944e-07, "loss": 0.1364, "num_input_tokens_seen": 90711888, "step": 134590 }, { "epoch": 3.2881782424938315, "grad_norm": 49.43909454345703, "learning_rate": 6.330771428352161e-07, "loss": 0.0446, "num_input_tokens_seen": 90714768, "step": 134595 }, { "epoch": 3.2883003933256787, "grad_norm": 0.00022165839618537575, "learning_rate": 6.329978145654994e-07, "loss": 0.0, "num_input_tokens_seen": 90717904, "step": 134600 }, { "epoch": 3.288422544157526, "grad_norm": 0.0024501916486769915, "learning_rate": 6.329184889647219e-07, "loss": 0.0, "num_input_tokens_seen": 90722000, "step": 134605 }, { "epoch": 3.2885446949893726, "grad_norm": 0.1524658054113388, "learning_rate": 6.328391660334596e-07, "loss": 0.0001, "num_input_tokens_seen": 90725008, "step": 134610 }, { "epoch": 3.2886668458212203, "grad_norm": 569.6676635742188, "learning_rate": 6.327598457722896e-07, "loss": 0.0378, "num_input_tokens_seen": 90728336, "step": 134615 }, { "epoch": 3.288788996653067, "grad_norm": 0.019205080345273018, "learning_rate": 6.326805281817887e-07, "loss": 0.0, "num_input_tokens_seen": 90731792, "step": 134620 }, { "epoch": 3.288911147484914, "grad_norm": 0.0007060529896989465, "learning_rate": 6.326012132625338e-07, "loss": 0.0, "num_input_tokens_seen": 90735504, "step": 134625 }, { "epoch": 3.2890332983167614, "grad_norm": 0.0064339907839894295, "learning_rate": 6.32521901015102e-07, "loss": 0.0, "num_input_tokens_seen": 90738960, "step": 134630 }, { "epoch": 3.2891554491486086, "grad_norm": 0.0008209880325011909, "learning_rate": 6.324425914400693e-07, "loss": 0.0, "num_input_tokens_seen": 90741840, "step": 134635 }, { "epoch": 3.2892775999804558, "grad_norm": 0.0007558853249065578, "learning_rate": 6.323632845380134e-07, "loss": 0.0028, "num_input_tokens_seen": 90745104, "step": 134640 }, { "epoch": 3.289399750812303, "grad_norm": 0.0004715778341051191, "learning_rate": 6.322839803095102e-07, "loss": 0.0, "num_input_tokens_seen": 90748432, "step": 134645 }, { "epoch": 3.28952190164415, "grad_norm": 0.0012815663358196616, "learning_rate": 6.322046787551372e-07, "loss": 0.0, "num_input_tokens_seen": 90751440, "step": 134650 }, { "epoch": 3.2896440524759973, "grad_norm": 0.0019647274166345596, "learning_rate": 6.321253798754702e-07, "loss": 0.0, "num_input_tokens_seen": 90755024, "step": 134655 }, { "epoch": 3.2897662033078445, "grad_norm": 0.9728825688362122, "learning_rate": 6.320460836710866e-07, "loss": 0.0002, "num_input_tokens_seen": 90758544, "step": 134660 }, { "epoch": 3.2898883541396917, "grad_norm": 0.014007066376507282, "learning_rate": 6.319667901425629e-07, "loss": 0.0, "num_input_tokens_seen": 90761936, "step": 134665 }, { "epoch": 3.290010504971539, "grad_norm": 0.001597664668224752, "learning_rate": 6.318874992904757e-07, "loss": 0.0, "num_input_tokens_seen": 90765008, "step": 134670 }, { "epoch": 3.290132655803386, "grad_norm": 0.13420917093753815, "learning_rate": 6.318082111154018e-07, "loss": 0.0, "num_input_tokens_seen": 90768336, "step": 134675 }, { "epoch": 3.2902548066352333, "grad_norm": 0.0011054584756493568, "learning_rate": 6.317289256179172e-07, "loss": 0.0, "num_input_tokens_seen": 90771536, "step": 134680 }, { "epoch": 3.2903769574670805, "grad_norm": 0.014804008416831493, "learning_rate": 6.316496427985995e-07, "loss": 0.0004, "num_input_tokens_seen": 90774928, "step": 134685 }, { "epoch": 3.2904991082989277, "grad_norm": 0.0023647164925932884, "learning_rate": 6.31570362658024e-07, "loss": 0.0083, "num_input_tokens_seen": 90778320, "step": 134690 }, { "epoch": 3.290621259130775, "grad_norm": 0.001251070643775165, "learning_rate": 6.31491085196768e-07, "loss": 0.0, "num_input_tokens_seen": 90781520, "step": 134695 }, { "epoch": 3.290743409962622, "grad_norm": 0.0034466327633708715, "learning_rate": 6.314118104154084e-07, "loss": 0.0, "num_input_tokens_seen": 90784784, "step": 134700 }, { "epoch": 3.290865560794469, "grad_norm": 0.00014235633716452867, "learning_rate": 6.313325383145208e-07, "loss": 0.0001, "num_input_tokens_seen": 90787664, "step": 134705 }, { "epoch": 3.290987711626316, "grad_norm": 0.03204859048128128, "learning_rate": 6.312532688946826e-07, "loss": 0.0, "num_input_tokens_seen": 90790800, "step": 134710 }, { "epoch": 3.291109862458163, "grad_norm": 0.05829733610153198, "learning_rate": 6.311740021564693e-07, "loss": 0.0, "num_input_tokens_seen": 90793872, "step": 134715 }, { "epoch": 3.2912320132900104, "grad_norm": 0.0008750380948185921, "learning_rate": 6.310947381004582e-07, "loss": 0.0001, "num_input_tokens_seen": 90797136, "step": 134720 }, { "epoch": 3.2913541641218576, "grad_norm": 0.0014805016107857227, "learning_rate": 6.310154767272255e-07, "loss": 0.0, "num_input_tokens_seen": 90800528, "step": 134725 }, { "epoch": 3.2914763149537047, "grad_norm": 0.008990327827632427, "learning_rate": 6.309362180373472e-07, "loss": 0.0609, "num_input_tokens_seen": 90806800, "step": 134730 }, { "epoch": 3.291598465785552, "grad_norm": 0.0007601430988870561, "learning_rate": 6.308569620314003e-07, "loss": 0.0, "num_input_tokens_seen": 90810128, "step": 134735 }, { "epoch": 3.291720616617399, "grad_norm": 0.0013207076117396355, "learning_rate": 6.307777087099603e-07, "loss": 0.0, "num_input_tokens_seen": 90813008, "step": 134740 }, { "epoch": 3.2918427674492463, "grad_norm": 0.0012732624309137464, "learning_rate": 6.306984580736048e-07, "loss": 0.0299, "num_input_tokens_seen": 90816208, "step": 134745 }, { "epoch": 3.2919649182810935, "grad_norm": 0.0010163038969039917, "learning_rate": 6.306192101229089e-07, "loss": 0.0011, "num_input_tokens_seen": 90819664, "step": 134750 }, { "epoch": 3.2920870691129407, "grad_norm": 0.00017418651259504259, "learning_rate": 6.305399648584495e-07, "loss": 0.0, "num_input_tokens_seen": 90822864, "step": 134755 }, { "epoch": 3.292209219944788, "grad_norm": 0.00439677806571126, "learning_rate": 6.304607222808032e-07, "loss": 0.0, "num_input_tokens_seen": 90826064, "step": 134760 }, { "epoch": 3.292331370776635, "grad_norm": 0.005836864467710257, "learning_rate": 6.303814823905458e-07, "loss": 0.0, "num_input_tokens_seen": 90829072, "step": 134765 }, { "epoch": 3.2924535216084823, "grad_norm": 0.0002580733271315694, "learning_rate": 6.303022451882536e-07, "loss": 0.0679, "num_input_tokens_seen": 90832592, "step": 134770 }, { "epoch": 3.2925756724403294, "grad_norm": 0.0020530004985630512, "learning_rate": 6.30223010674503e-07, "loss": 0.0, "num_input_tokens_seen": 90836048, "step": 134775 }, { "epoch": 3.2926978232721766, "grad_norm": 0.0024392507039010525, "learning_rate": 6.301437788498698e-07, "loss": 0.0, "num_input_tokens_seen": 90839312, "step": 134780 }, { "epoch": 3.292819974104024, "grad_norm": 0.7272111177444458, "learning_rate": 6.30064549714931e-07, "loss": 0.0002, "num_input_tokens_seen": 90842576, "step": 134785 }, { "epoch": 3.2929421249358706, "grad_norm": 0.02128804847598076, "learning_rate": 6.299853232702619e-07, "loss": 0.0, "num_input_tokens_seen": 90845648, "step": 134790 }, { "epoch": 3.293064275767718, "grad_norm": 0.0028478854801505804, "learning_rate": 6.299060995164394e-07, "loss": 0.056, "num_input_tokens_seen": 90848784, "step": 134795 }, { "epoch": 3.293186426599565, "grad_norm": 0.0012133775744587183, "learning_rate": 6.298268784540389e-07, "loss": 0.0, "num_input_tokens_seen": 90852304, "step": 134800 }, { "epoch": 3.293308577431412, "grad_norm": 0.030529484152793884, "learning_rate": 6.297476600836374e-07, "loss": 0.0, "num_input_tokens_seen": 90855312, "step": 134805 }, { "epoch": 3.2934307282632593, "grad_norm": 1.027057409286499, "learning_rate": 6.2966844440581e-07, "loss": 0.0003, "num_input_tokens_seen": 90858768, "step": 134810 }, { "epoch": 3.2935528790951065, "grad_norm": 0.0038657994009554386, "learning_rate": 6.295892314211334e-07, "loss": 0.0002, "num_input_tokens_seen": 90862032, "step": 134815 }, { "epoch": 3.2936750299269537, "grad_norm": 0.0011094954097643495, "learning_rate": 6.295100211301836e-07, "loss": 0.0001, "num_input_tokens_seen": 90865552, "step": 134820 }, { "epoch": 3.293797180758801, "grad_norm": 0.00212948489934206, "learning_rate": 6.294308135335367e-07, "loss": 0.0, "num_input_tokens_seen": 90868688, "step": 134825 }, { "epoch": 3.293919331590648, "grad_norm": 0.011899611912667751, "learning_rate": 6.293516086317687e-07, "loss": 0.0, "num_input_tokens_seen": 90872016, "step": 134830 }, { "epoch": 3.2940414824224953, "grad_norm": 0.005786948837339878, "learning_rate": 6.292724064254551e-07, "loss": 0.0, "num_input_tokens_seen": 90875216, "step": 134835 }, { "epoch": 3.2941636332543425, "grad_norm": 0.00014639328583143651, "learning_rate": 6.291932069151726e-07, "loss": 0.0001, "num_input_tokens_seen": 90878288, "step": 134840 }, { "epoch": 3.2942857840861897, "grad_norm": 0.0010571659076958895, "learning_rate": 6.291140101014966e-07, "loss": 0.0, "num_input_tokens_seen": 90881616, "step": 134845 }, { "epoch": 3.294407934918037, "grad_norm": 0.00961493793874979, "learning_rate": 6.290348159850032e-07, "loss": 0.0536, "num_input_tokens_seen": 90884688, "step": 134850 }, { "epoch": 3.294530085749884, "grad_norm": 0.0011426964774727821, "learning_rate": 6.289556245662687e-07, "loss": 0.0, "num_input_tokens_seen": 90888720, "step": 134855 }, { "epoch": 3.2946522365817312, "grad_norm": 0.060139358043670654, "learning_rate": 6.288764358458685e-07, "loss": 0.0001, "num_input_tokens_seen": 90891728, "step": 134860 }, { "epoch": 3.2947743874135784, "grad_norm": 0.004513459745794535, "learning_rate": 6.287972498243788e-07, "loss": 0.0, "num_input_tokens_seen": 90895056, "step": 134865 }, { "epoch": 3.2948965382454256, "grad_norm": 0.007571101188659668, "learning_rate": 6.287180665023751e-07, "loss": 0.0, "num_input_tokens_seen": 90898320, "step": 134870 }, { "epoch": 3.295018689077273, "grad_norm": 0.003237637458369136, "learning_rate": 6.286388858804337e-07, "loss": 0.0, "num_input_tokens_seen": 90901840, "step": 134875 }, { "epoch": 3.29514083990912, "grad_norm": 0.001679329783655703, "learning_rate": 6.285597079591305e-07, "loss": 0.0001, "num_input_tokens_seen": 90905104, "step": 134880 }, { "epoch": 3.2952629907409667, "grad_norm": 0.00029259934672154486, "learning_rate": 6.284805327390404e-07, "loss": 0.0, "num_input_tokens_seen": 90908496, "step": 134885 }, { "epoch": 3.295385141572814, "grad_norm": 0.003385191550478339, "learning_rate": 6.284013602207403e-07, "loss": 0.0, "num_input_tokens_seen": 90911760, "step": 134890 }, { "epoch": 3.295507292404661, "grad_norm": 0.0001985864364542067, "learning_rate": 6.283221904048051e-07, "loss": 0.0, "num_input_tokens_seen": 90915024, "step": 134895 }, { "epoch": 3.2956294432365083, "grad_norm": 0.004313347861170769, "learning_rate": 6.282430232918112e-07, "loss": 0.0773, "num_input_tokens_seen": 90918288, "step": 134900 }, { "epoch": 3.2957515940683555, "grad_norm": 0.0002821721136569977, "learning_rate": 6.281638588823337e-07, "loss": 0.0365, "num_input_tokens_seen": 90921360, "step": 134905 }, { "epoch": 3.2958737449002027, "grad_norm": 0.002907287096604705, "learning_rate": 6.280846971769486e-07, "loss": 0.0978, "num_input_tokens_seen": 90924432, "step": 134910 }, { "epoch": 3.29599589573205, "grad_norm": 0.0004542749666143209, "learning_rate": 6.280055381762319e-07, "loss": 0.0001, "num_input_tokens_seen": 90928144, "step": 134915 }, { "epoch": 3.296118046563897, "grad_norm": 0.004509879741817713, "learning_rate": 6.27926381880759e-07, "loss": 0.0002, "num_input_tokens_seen": 90931600, "step": 134920 }, { "epoch": 3.2962401973957443, "grad_norm": 0.012882355600595474, "learning_rate": 6.278472282911054e-07, "loss": 0.0003, "num_input_tokens_seen": 90934672, "step": 134925 }, { "epoch": 3.2963623482275914, "grad_norm": 0.000582054490223527, "learning_rate": 6.277680774078469e-07, "loss": 0.0, "num_input_tokens_seen": 90938256, "step": 134930 }, { "epoch": 3.2964844990594386, "grad_norm": 0.0010208197636529803, "learning_rate": 6.276889292315588e-07, "loss": 0.0643, "num_input_tokens_seen": 90941456, "step": 134935 }, { "epoch": 3.296606649891286, "grad_norm": 0.004477804992347956, "learning_rate": 6.276097837628174e-07, "loss": 0.0, "num_input_tokens_seen": 90944656, "step": 134940 }, { "epoch": 3.296728800723133, "grad_norm": 0.007399188820272684, "learning_rate": 6.275306410021974e-07, "loss": 0.0565, "num_input_tokens_seen": 90947856, "step": 134945 }, { "epoch": 3.29685095155498, "grad_norm": 0.05458146706223488, "learning_rate": 6.274515009502751e-07, "loss": 0.0555, "num_input_tokens_seen": 90951376, "step": 134950 }, { "epoch": 3.2969731023868274, "grad_norm": 0.0012545207282528281, "learning_rate": 6.273723636076254e-07, "loss": 0.0, "num_input_tokens_seen": 90954832, "step": 134955 }, { "epoch": 3.2970952532186746, "grad_norm": 0.007827935740351677, "learning_rate": 6.272932289748244e-07, "loss": 0.0005, "num_input_tokens_seen": 90958224, "step": 134960 }, { "epoch": 3.2972174040505218, "grad_norm": 0.0029065613634884357, "learning_rate": 6.272140970524469e-07, "loss": 0.0, "num_input_tokens_seen": 90961552, "step": 134965 }, { "epoch": 3.2973395548823685, "grad_norm": 0.002743184333667159, "learning_rate": 6.27134967841069e-07, "loss": 0.0001, "num_input_tokens_seen": 90964880, "step": 134970 }, { "epoch": 3.297461705714216, "grad_norm": 0.0002498124958947301, "learning_rate": 6.270558413412659e-07, "loss": 0.0, "num_input_tokens_seen": 90968208, "step": 134975 }, { "epoch": 3.297583856546063, "grad_norm": 0.0028212815523147583, "learning_rate": 6.26976717553613e-07, "loss": 0.0001, "num_input_tokens_seen": 90971536, "step": 134980 }, { "epoch": 3.29770600737791, "grad_norm": 0.014758952893316746, "learning_rate": 6.26897596478686e-07, "loss": 0.0489, "num_input_tokens_seen": 90974672, "step": 134985 }, { "epoch": 3.2978281582097573, "grad_norm": 0.004514573607593775, "learning_rate": 6.268184781170596e-07, "loss": 0.0003, "num_input_tokens_seen": 90978320, "step": 134990 }, { "epoch": 3.2979503090416045, "grad_norm": 0.0010999558726325631, "learning_rate": 6.2673936246931e-07, "loss": 0.0, "num_input_tokens_seen": 90981968, "step": 134995 }, { "epoch": 3.2980724598734517, "grad_norm": 0.004744267091155052, "learning_rate": 6.266602495360116e-07, "loss": 0.0, "num_input_tokens_seen": 90985040, "step": 135000 }, { "epoch": 3.298194610705299, "grad_norm": 0.0059790643863379955, "learning_rate": 6.265811393177405e-07, "loss": 0.0, "num_input_tokens_seen": 90988240, "step": 135005 }, { "epoch": 3.298316761537146, "grad_norm": 0.022460024803876877, "learning_rate": 6.265020318150721e-07, "loss": 0.0, "num_input_tokens_seen": 90991696, "step": 135010 }, { "epoch": 3.2984389123689932, "grad_norm": 0.003150185802951455, "learning_rate": 6.26422927028581e-07, "loss": 0.0, "num_input_tokens_seen": 90995024, "step": 135015 }, { "epoch": 3.2985610632008404, "grad_norm": 0.001140235923230648, "learning_rate": 6.263438249588433e-07, "loss": 0.0, "num_input_tokens_seen": 90998672, "step": 135020 }, { "epoch": 3.2986832140326876, "grad_norm": 4.278022606740706e-05, "learning_rate": 6.262647256064333e-07, "loss": 0.0, "num_input_tokens_seen": 91002384, "step": 135025 }, { "epoch": 3.298805364864535, "grad_norm": 0.010379807092249393, "learning_rate": 6.26185628971927e-07, "loss": 0.0, "num_input_tokens_seen": 91005648, "step": 135030 }, { "epoch": 3.298927515696382, "grad_norm": 0.007365286350250244, "learning_rate": 6.261065350558996e-07, "loss": 0.0, "num_input_tokens_seen": 91009040, "step": 135035 }, { "epoch": 3.299049666528229, "grad_norm": 0.05299066752195358, "learning_rate": 6.260274438589254e-07, "loss": 0.0, "num_input_tokens_seen": 91012688, "step": 135040 }, { "epoch": 3.2991718173600764, "grad_norm": 0.002590995281934738, "learning_rate": 6.25948355381581e-07, "loss": 0.0, "num_input_tokens_seen": 91016272, "step": 135045 }, { "epoch": 3.2992939681919236, "grad_norm": 0.019199777394533157, "learning_rate": 6.258692696244401e-07, "loss": 0.0, "num_input_tokens_seen": 91019792, "step": 135050 }, { "epoch": 3.2994161190237707, "grad_norm": 0.009994552470743656, "learning_rate": 6.257901865880791e-07, "loss": 0.0, "num_input_tokens_seen": 91022992, "step": 135055 }, { "epoch": 3.299538269855618, "grad_norm": 0.006808954291045666, "learning_rate": 6.257111062730718e-07, "loss": 0.0, "num_input_tokens_seen": 91026192, "step": 135060 }, { "epoch": 3.2996604206874647, "grad_norm": 0.01960168033838272, "learning_rate": 6.256320286799944e-07, "loss": 0.0685, "num_input_tokens_seen": 91029200, "step": 135065 }, { "epoch": 3.299782571519312, "grad_norm": 0.0008817182388156652, "learning_rate": 6.255529538094216e-07, "loss": 0.0, "num_input_tokens_seen": 91032272, "step": 135070 }, { "epoch": 3.299904722351159, "grad_norm": 0.00037948612589389086, "learning_rate": 6.254738816619285e-07, "loss": 0.0, "num_input_tokens_seen": 91035664, "step": 135075 }, { "epoch": 3.3000268731830062, "grad_norm": 0.0006895409314893186, "learning_rate": 6.253948122380898e-07, "loss": 0.0, "num_input_tokens_seen": 91039248, "step": 135080 }, { "epoch": 3.3001490240148534, "grad_norm": 0.0011778641492128372, "learning_rate": 6.25315745538481e-07, "loss": 0.0, "num_input_tokens_seen": 91043216, "step": 135085 }, { "epoch": 3.3002711748467006, "grad_norm": 0.0011863255640491843, "learning_rate": 6.252366815636767e-07, "loss": 0.0, "num_input_tokens_seen": 91046992, "step": 135090 }, { "epoch": 3.300393325678548, "grad_norm": 0.0013244269648566842, "learning_rate": 6.251576203142524e-07, "loss": 0.0, "num_input_tokens_seen": 91050192, "step": 135095 }, { "epoch": 3.300515476510395, "grad_norm": 0.04680801182985306, "learning_rate": 6.250785617907822e-07, "loss": 0.0, "num_input_tokens_seen": 91053456, "step": 135100 }, { "epoch": 3.300637627342242, "grad_norm": 0.0024712015874683857, "learning_rate": 6.249995059938421e-07, "loss": 0.0001, "num_input_tokens_seen": 91056784, "step": 135105 }, { "epoch": 3.3007597781740894, "grad_norm": 0.001630530459806323, "learning_rate": 6.24920452924006e-07, "loss": 0.0001, "num_input_tokens_seen": 91060240, "step": 135110 }, { "epoch": 3.3008819290059366, "grad_norm": 159.94313049316406, "learning_rate": 6.248414025818496e-07, "loss": 0.096, "num_input_tokens_seen": 91063504, "step": 135115 }, { "epoch": 3.3010040798377838, "grad_norm": 0.02005532756447792, "learning_rate": 6.247623549679471e-07, "loss": 0.0, "num_input_tokens_seen": 91066768, "step": 135120 }, { "epoch": 3.301126230669631, "grad_norm": 0.0057156141847372055, "learning_rate": 6.246833100828738e-07, "loss": 0.0, "num_input_tokens_seen": 91070032, "step": 135125 }, { "epoch": 3.301248381501478, "grad_norm": 0.001090741716325283, "learning_rate": 6.246042679272044e-07, "loss": 0.0, "num_input_tokens_seen": 91073552, "step": 135130 }, { "epoch": 3.3013705323333253, "grad_norm": 17.12499237060547, "learning_rate": 6.245252285015139e-07, "loss": 0.0696, "num_input_tokens_seen": 91077392, "step": 135135 }, { "epoch": 3.3014926831651725, "grad_norm": 0.0025136242620646954, "learning_rate": 6.24446191806377e-07, "loss": 0.0, "num_input_tokens_seen": 91080592, "step": 135140 }, { "epoch": 3.3016148339970197, "grad_norm": 0.017409253865480423, "learning_rate": 6.243671578423679e-07, "loss": 0.0001, "num_input_tokens_seen": 91083920, "step": 135145 }, { "epoch": 3.3017369848288665, "grad_norm": 0.001015423913486302, "learning_rate": 6.242881266100625e-07, "loss": 0.0, "num_input_tokens_seen": 91087056, "step": 135150 }, { "epoch": 3.301859135660714, "grad_norm": 0.013307973742485046, "learning_rate": 6.242090981100343e-07, "loss": 0.0, "num_input_tokens_seen": 91090192, "step": 135155 }, { "epoch": 3.301981286492561, "grad_norm": 0.02078000269830227, "learning_rate": 6.241300723428587e-07, "loss": 0.0, "num_input_tokens_seen": 91093328, "step": 135160 }, { "epoch": 3.302103437324408, "grad_norm": 0.013913290575146675, "learning_rate": 6.240510493091108e-07, "loss": 0.0, "num_input_tokens_seen": 91096464, "step": 135165 }, { "epoch": 3.302225588156255, "grad_norm": 20.877248764038086, "learning_rate": 6.239720290093642e-07, "loss": 0.0362, "num_input_tokens_seen": 91099664, "step": 135170 }, { "epoch": 3.3023477389881024, "grad_norm": 0.00022836009156890213, "learning_rate": 6.238930114441947e-07, "loss": 0.0, "num_input_tokens_seen": 91102864, "step": 135175 }, { "epoch": 3.3024698898199496, "grad_norm": 0.007412285078316927, "learning_rate": 6.23813996614176e-07, "loss": 0.0, "num_input_tokens_seen": 91106320, "step": 135180 }, { "epoch": 3.302592040651797, "grad_norm": 0.0035116132348775864, "learning_rate": 6.237349845198831e-07, "loss": 0.0, "num_input_tokens_seen": 91109648, "step": 135185 }, { "epoch": 3.302714191483644, "grad_norm": 0.21899212896823883, "learning_rate": 6.23655975161891e-07, "loss": 0.0001, "num_input_tokens_seen": 91112848, "step": 135190 }, { "epoch": 3.302836342315491, "grad_norm": 0.04682852700352669, "learning_rate": 6.235769685407734e-07, "loss": 0.0875, "num_input_tokens_seen": 91116048, "step": 135195 }, { "epoch": 3.3029584931473384, "grad_norm": 0.005642554722726345, "learning_rate": 6.234979646571057e-07, "loss": 0.0, "num_input_tokens_seen": 91119568, "step": 135200 }, { "epoch": 3.3030806439791855, "grad_norm": 0.010612928308546543, "learning_rate": 6.234189635114617e-07, "loss": 0.0, "num_input_tokens_seen": 91122960, "step": 135205 }, { "epoch": 3.3032027948110327, "grad_norm": 0.033904921263456345, "learning_rate": 6.233399651044167e-07, "loss": 0.0, "num_input_tokens_seen": 91125840, "step": 135210 }, { "epoch": 3.30332494564288, "grad_norm": 0.014243927784264088, "learning_rate": 6.232609694365443e-07, "loss": 0.0, "num_input_tokens_seen": 91129168, "step": 135215 }, { "epoch": 3.303447096474727, "grad_norm": 0.0019423479679971933, "learning_rate": 6.231819765084195e-07, "loss": 0.0, "num_input_tokens_seen": 91132688, "step": 135220 }, { "epoch": 3.3035692473065743, "grad_norm": 221.44570922851562, "learning_rate": 6.231029863206172e-07, "loss": 0.005, "num_input_tokens_seen": 91136208, "step": 135225 }, { "epoch": 3.3036913981384215, "grad_norm": 0.008687580935657024, "learning_rate": 6.23023998873711e-07, "loss": 0.0, "num_input_tokens_seen": 91139408, "step": 135230 }, { "epoch": 3.3038135489702682, "grad_norm": 0.005624264944344759, "learning_rate": 6.229450141682758e-07, "loss": 0.0001, "num_input_tokens_seen": 91142480, "step": 135235 }, { "epoch": 3.303935699802116, "grad_norm": 0.001273568719625473, "learning_rate": 6.228660322048858e-07, "loss": 0.0004, "num_input_tokens_seen": 91146064, "step": 135240 }, { "epoch": 3.3040578506339626, "grad_norm": 0.00161461450625211, "learning_rate": 6.227870529841155e-07, "loss": 0.029, "num_input_tokens_seen": 91149776, "step": 135245 }, { "epoch": 3.30418000146581, "grad_norm": 0.002101946622133255, "learning_rate": 6.227080765065392e-07, "loss": 0.0, "num_input_tokens_seen": 91152976, "step": 135250 }, { "epoch": 3.304302152297657, "grad_norm": 0.0007854495197534561, "learning_rate": 6.226291027727311e-07, "loss": 0.0, "num_input_tokens_seen": 91156240, "step": 135255 }, { "epoch": 3.304424303129504, "grad_norm": 0.0015872808871790767, "learning_rate": 6.22550131783266e-07, "loss": 0.0, "num_input_tokens_seen": 91159632, "step": 135260 }, { "epoch": 3.3045464539613514, "grad_norm": 0.0007271585636772215, "learning_rate": 6.224711635387174e-07, "loss": 0.0, "num_input_tokens_seen": 91162704, "step": 135265 }, { "epoch": 3.3046686047931986, "grad_norm": 0.4559313654899597, "learning_rate": 6.223921980396606e-07, "loss": 0.0001, "num_input_tokens_seen": 91166160, "step": 135270 }, { "epoch": 3.3047907556250458, "grad_norm": 0.0029029580764472485, "learning_rate": 6.223132352866688e-07, "loss": 0.0, "num_input_tokens_seen": 91169424, "step": 135275 }, { "epoch": 3.304912906456893, "grad_norm": 0.07211734354496002, "learning_rate": 6.22234275280317e-07, "loss": 0.0, "num_input_tokens_seen": 91172432, "step": 135280 }, { "epoch": 3.30503505728874, "grad_norm": 0.002586309565231204, "learning_rate": 6.221553180211791e-07, "loss": 0.0001, "num_input_tokens_seen": 91175504, "step": 135285 }, { "epoch": 3.3051572081205873, "grad_norm": 16.60679054260254, "learning_rate": 6.220763635098294e-07, "loss": 0.0339, "num_input_tokens_seen": 91178768, "step": 135290 }, { "epoch": 3.3052793589524345, "grad_norm": 0.001326601137407124, "learning_rate": 6.21997411746842e-07, "loss": 0.0, "num_input_tokens_seen": 91182032, "step": 135295 }, { "epoch": 3.3054015097842817, "grad_norm": 0.011996055953204632, "learning_rate": 6.21918462732791e-07, "loss": 0.0001, "num_input_tokens_seen": 91185616, "step": 135300 }, { "epoch": 3.305523660616129, "grad_norm": 6.76007111906074e-05, "learning_rate": 6.218395164682509e-07, "loss": 0.0, "num_input_tokens_seen": 91188624, "step": 135305 }, { "epoch": 3.305645811447976, "grad_norm": 0.0028651319444179535, "learning_rate": 6.217605729537952e-07, "loss": 0.1288, "num_input_tokens_seen": 91191632, "step": 135310 }, { "epoch": 3.3057679622798233, "grad_norm": 0.0010443766368553042, "learning_rate": 6.216816321899984e-07, "loss": 0.0002, "num_input_tokens_seen": 91194960, "step": 135315 }, { "epoch": 3.3058901131116705, "grad_norm": 0.0022625199053436518, "learning_rate": 6.216026941774348e-07, "loss": 0.0034, "num_input_tokens_seen": 91198608, "step": 135320 }, { "epoch": 3.3060122639435177, "grad_norm": 0.0015713103348389268, "learning_rate": 6.215237589166778e-07, "loss": 0.0, "num_input_tokens_seen": 91201488, "step": 135325 }, { "epoch": 3.3061344147753644, "grad_norm": 0.0026938188821077347, "learning_rate": 6.214448264083024e-07, "loss": 0.0002, "num_input_tokens_seen": 91204816, "step": 135330 }, { "epoch": 3.3062565656072116, "grad_norm": 0.0012552423868328333, "learning_rate": 6.213658966528814e-07, "loss": 0.0001, "num_input_tokens_seen": 91208400, "step": 135335 }, { "epoch": 3.306378716439059, "grad_norm": 0.0023763503413647413, "learning_rate": 6.212869696509896e-07, "loss": 0.0, "num_input_tokens_seen": 91211280, "step": 135340 }, { "epoch": 3.306500867270906, "grad_norm": 0.0035993000492453575, "learning_rate": 6.212080454032013e-07, "loss": 0.0, "num_input_tokens_seen": 91214672, "step": 135345 }, { "epoch": 3.306623018102753, "grad_norm": 0.0008662957116030157, "learning_rate": 6.211291239100893e-07, "loss": 0.0, "num_input_tokens_seen": 91218512, "step": 135350 }, { "epoch": 3.3067451689346004, "grad_norm": 0.010020371526479721, "learning_rate": 6.210502051722289e-07, "loss": 0.0576, "num_input_tokens_seen": 91222288, "step": 135355 }, { "epoch": 3.3068673197664475, "grad_norm": 0.005591185763478279, "learning_rate": 6.209712891901927e-07, "loss": 0.0, "num_input_tokens_seen": 91225616, "step": 135360 }, { "epoch": 3.3069894705982947, "grad_norm": 0.008373158052563667, "learning_rate": 6.208923759645557e-07, "loss": 0.0, "num_input_tokens_seen": 91229072, "step": 135365 }, { "epoch": 3.307111621430142, "grad_norm": 0.0013248298782855272, "learning_rate": 6.20813465495891e-07, "loss": 0.0, "num_input_tokens_seen": 91232400, "step": 135370 }, { "epoch": 3.307233772261989, "grad_norm": 0.0008385555702261627, "learning_rate": 6.207345577847727e-07, "loss": 0.0, "num_input_tokens_seen": 91235536, "step": 135375 }, { "epoch": 3.3073559230938363, "grad_norm": 23.962797164916992, "learning_rate": 6.20655652831775e-07, "loss": 0.0256, "num_input_tokens_seen": 91238928, "step": 135380 }, { "epoch": 3.3074780739256835, "grad_norm": 0.016070673242211342, "learning_rate": 6.205767506374713e-07, "loss": 0.0, "num_input_tokens_seen": 91242128, "step": 135385 }, { "epoch": 3.3076002247575307, "grad_norm": 0.011566529050469398, "learning_rate": 6.204978512024355e-07, "loss": 0.0, "num_input_tokens_seen": 91245520, "step": 135390 }, { "epoch": 3.307722375589378, "grad_norm": 0.27769777178764343, "learning_rate": 6.204189545272415e-07, "loss": 0.0001, "num_input_tokens_seen": 91248848, "step": 135395 }, { "epoch": 3.307844526421225, "grad_norm": 0.0017242628382518888, "learning_rate": 6.203400606124629e-07, "loss": 0.0418, "num_input_tokens_seen": 91251792, "step": 135400 }, { "epoch": 3.3079666772530723, "grad_norm": 0.011997255496680737, "learning_rate": 6.202611694586735e-07, "loss": 0.0334, "num_input_tokens_seen": 91255376, "step": 135405 }, { "epoch": 3.3080888280849194, "grad_norm": 12.103500366210938, "learning_rate": 6.201822810664468e-07, "loss": 0.0397, "num_input_tokens_seen": 91258448, "step": 135410 }, { "epoch": 3.308210978916766, "grad_norm": 0.012106460519134998, "learning_rate": 6.201033954363571e-07, "loss": 0.0214, "num_input_tokens_seen": 91262096, "step": 135415 }, { "epoch": 3.308333129748614, "grad_norm": 0.021580029278993607, "learning_rate": 6.200245125689774e-07, "loss": 0.0, "num_input_tokens_seen": 91265552, "step": 135420 }, { "epoch": 3.3084552805804606, "grad_norm": 0.001080291927792132, "learning_rate": 6.19945632464882e-07, "loss": 0.0, "num_input_tokens_seen": 91268816, "step": 135425 }, { "epoch": 3.3085774314123078, "grad_norm": 0.0014836577465757728, "learning_rate": 6.198667551246437e-07, "loss": 0.0001, "num_input_tokens_seen": 91272208, "step": 135430 }, { "epoch": 3.308699582244155, "grad_norm": 0.0157320536673069, "learning_rate": 6.197878805488368e-07, "loss": 0.0, "num_input_tokens_seen": 91275280, "step": 135435 }, { "epoch": 3.308821733076002, "grad_norm": 0.017448775470256805, "learning_rate": 6.197090087380348e-07, "loss": 0.0004, "num_input_tokens_seen": 91278736, "step": 135440 }, { "epoch": 3.3089438839078493, "grad_norm": 0.004054314456880093, "learning_rate": 6.196301396928109e-07, "loss": 0.0, "num_input_tokens_seen": 91282512, "step": 135445 }, { "epoch": 3.3090660347396965, "grad_norm": 0.0010950237046927214, "learning_rate": 6.195512734137395e-07, "loss": 0.0726, "num_input_tokens_seen": 91285200, "step": 135450 }, { "epoch": 3.3091881855715437, "grad_norm": 0.027147632092237473, "learning_rate": 6.194724099013929e-07, "loss": 0.0016, "num_input_tokens_seen": 91288208, "step": 135455 }, { "epoch": 3.309310336403391, "grad_norm": 0.0017834958853200078, "learning_rate": 6.193935491563458e-07, "loss": 0.0001, "num_input_tokens_seen": 91292112, "step": 135460 }, { "epoch": 3.309432487235238, "grad_norm": 0.0024778808001428843, "learning_rate": 6.19314691179171e-07, "loss": 0.0001, "num_input_tokens_seen": 91295376, "step": 135465 }, { "epoch": 3.3095546380670853, "grad_norm": 11.526365280151367, "learning_rate": 6.192358359704417e-07, "loss": 0.0011, "num_input_tokens_seen": 91298832, "step": 135470 }, { "epoch": 3.3096767888989325, "grad_norm": 0.007461852394044399, "learning_rate": 6.191569835307324e-07, "loss": 0.0, "num_input_tokens_seen": 91302672, "step": 135475 }, { "epoch": 3.3097989397307797, "grad_norm": 0.0007520471699535847, "learning_rate": 6.190781338606157e-07, "loss": 0.0, "num_input_tokens_seen": 91306000, "step": 135480 }, { "epoch": 3.309921090562627, "grad_norm": 0.0005934469518251717, "learning_rate": 6.189992869606655e-07, "loss": 0.0, "num_input_tokens_seen": 91309392, "step": 135485 }, { "epoch": 3.310043241394474, "grad_norm": 0.0005032624467276037, "learning_rate": 6.189204428314547e-07, "loss": 0.0001, "num_input_tokens_seen": 91312592, "step": 135490 }, { "epoch": 3.3101653922263212, "grad_norm": 0.0002889156749006361, "learning_rate": 6.18841601473557e-07, "loss": 0.0, "num_input_tokens_seen": 91315920, "step": 135495 }, { "epoch": 3.3102875430581684, "grad_norm": 0.0017207504715770483, "learning_rate": 6.18762762887546e-07, "loss": 0.0027, "num_input_tokens_seen": 91319056, "step": 135500 }, { "epoch": 3.3104096938900156, "grad_norm": 0.00313011952675879, "learning_rate": 6.186839270739943e-07, "loss": 0.0, "num_input_tokens_seen": 91322000, "step": 135505 }, { "epoch": 3.3105318447218623, "grad_norm": 0.007349675986915827, "learning_rate": 6.18605094033476e-07, "loss": 0.0001, "num_input_tokens_seen": 91325136, "step": 135510 }, { "epoch": 3.3106539955537095, "grad_norm": 0.0009442095761187375, "learning_rate": 6.185262637665636e-07, "loss": 0.0, "num_input_tokens_seen": 91328016, "step": 135515 }, { "epoch": 3.3107761463855567, "grad_norm": 0.00473749591037631, "learning_rate": 6.184474362738314e-07, "loss": 0.0311, "num_input_tokens_seen": 91331472, "step": 135520 }, { "epoch": 3.310898297217404, "grad_norm": 0.00017679596203379333, "learning_rate": 6.183686115558515e-07, "loss": 0.1495, "num_input_tokens_seen": 91334864, "step": 135525 }, { "epoch": 3.311020448049251, "grad_norm": 0.022237082943320274, "learning_rate": 6.182897896131977e-07, "loss": 0.0895, "num_input_tokens_seen": 91338000, "step": 135530 }, { "epoch": 3.3111425988810983, "grad_norm": 0.0017483303090557456, "learning_rate": 6.182109704464438e-07, "loss": 0.0, "num_input_tokens_seen": 91341456, "step": 135535 }, { "epoch": 3.3112647497129455, "grad_norm": 0.009556563571095467, "learning_rate": 6.181321540561619e-07, "loss": 0.0, "num_input_tokens_seen": 91344656, "step": 135540 }, { "epoch": 3.3113869005447927, "grad_norm": 0.013166706077754498, "learning_rate": 6.18053340442926e-07, "loss": 0.0, "num_input_tokens_seen": 91347792, "step": 135545 }, { "epoch": 3.31150905137664, "grad_norm": 0.00014834990724921227, "learning_rate": 6.179745296073087e-07, "loss": 0.0, "num_input_tokens_seen": 91351440, "step": 135550 }, { "epoch": 3.311631202208487, "grad_norm": 0.020284490659832954, "learning_rate": 6.178957215498836e-07, "loss": 0.0, "num_input_tokens_seen": 91355152, "step": 135555 }, { "epoch": 3.3117533530403342, "grad_norm": 0.03961563855409622, "learning_rate": 6.178169162712234e-07, "loss": 0.0366, "num_input_tokens_seen": 91358160, "step": 135560 }, { "epoch": 3.3118755038721814, "grad_norm": 0.029587585479021072, "learning_rate": 6.177381137719013e-07, "loss": 0.0, "num_input_tokens_seen": 91361360, "step": 135565 }, { "epoch": 3.3119976547040286, "grad_norm": 0.07171474397182465, "learning_rate": 6.176593140524909e-07, "loss": 0.0002, "num_input_tokens_seen": 91365072, "step": 135570 }, { "epoch": 3.312119805535876, "grad_norm": 0.0006190555286593735, "learning_rate": 6.175805171135642e-07, "loss": 0.0, "num_input_tokens_seen": 91368784, "step": 135575 }, { "epoch": 3.312241956367723, "grad_norm": 0.001581455348059535, "learning_rate": 6.175017229556953e-07, "loss": 0.0, "num_input_tokens_seen": 91371984, "step": 135580 }, { "epoch": 3.31236410719957, "grad_norm": 0.027697941288352013, "learning_rate": 6.174229315794564e-07, "loss": 0.0658, "num_input_tokens_seen": 91374736, "step": 135585 }, { "epoch": 3.3124862580314174, "grad_norm": 0.009913300164043903, "learning_rate": 6.17344142985421e-07, "loss": 0.0009, "num_input_tokens_seen": 91377936, "step": 135590 }, { "epoch": 3.312608408863264, "grad_norm": 0.02108718268573284, "learning_rate": 6.17265357174162e-07, "loss": 0.0, "num_input_tokens_seen": 91381200, "step": 135595 }, { "epoch": 3.3127305596951118, "grad_norm": 0.0025244238786399364, "learning_rate": 6.171865741462522e-07, "loss": 0.0123, "num_input_tokens_seen": 91384208, "step": 135600 }, { "epoch": 3.3128527105269585, "grad_norm": 0.02910471521317959, "learning_rate": 6.171077939022649e-07, "loss": 0.0, "num_input_tokens_seen": 91387728, "step": 135605 }, { "epoch": 3.3129748613588057, "grad_norm": 0.003600621595978737, "learning_rate": 6.170290164427721e-07, "loss": 0.0001, "num_input_tokens_seen": 91390736, "step": 135610 }, { "epoch": 3.313097012190653, "grad_norm": 0.0006075279670767486, "learning_rate": 6.169502417683478e-07, "loss": 0.0002, "num_input_tokens_seen": 91394000, "step": 135615 }, { "epoch": 3.3132191630225, "grad_norm": 0.027739422395825386, "learning_rate": 6.16871469879564e-07, "loss": 0.0, "num_input_tokens_seen": 91397392, "step": 135620 }, { "epoch": 3.3133413138543473, "grad_norm": 0.013399646617472172, "learning_rate": 6.16792700776994e-07, "loss": 0.0403, "num_input_tokens_seen": 91400656, "step": 135625 }, { "epoch": 3.3134634646861945, "grad_norm": 0.003509106580168009, "learning_rate": 6.167139344612108e-07, "loss": 0.0, "num_input_tokens_seen": 91403984, "step": 135630 }, { "epoch": 3.3135856155180416, "grad_norm": 0.0002290195261593908, "learning_rate": 6.166351709327866e-07, "loss": 0.0, "num_input_tokens_seen": 91407120, "step": 135635 }, { "epoch": 3.313707766349889, "grad_norm": 0.0010968875139951706, "learning_rate": 6.16556410192295e-07, "loss": 0.0, "num_input_tokens_seen": 91410640, "step": 135640 }, { "epoch": 3.313829917181736, "grad_norm": 57.31441879272461, "learning_rate": 6.164776522403079e-07, "loss": 0.0575, "num_input_tokens_seen": 91413712, "step": 135645 }, { "epoch": 3.313952068013583, "grad_norm": 0.0006980210309848189, "learning_rate": 6.163988970773985e-07, "loss": 0.041, "num_input_tokens_seen": 91417680, "step": 135650 }, { "epoch": 3.3140742188454304, "grad_norm": 0.00029445570544339716, "learning_rate": 6.163201447041399e-07, "loss": 0.0, "num_input_tokens_seen": 91420752, "step": 135655 }, { "epoch": 3.3141963696772776, "grad_norm": 0.0027940631844103336, "learning_rate": 6.162413951211041e-07, "loss": 0.0003, "num_input_tokens_seen": 91424336, "step": 135660 }, { "epoch": 3.314318520509125, "grad_norm": 0.02510049007833004, "learning_rate": 6.161626483288643e-07, "loss": 0.0, "num_input_tokens_seen": 91427536, "step": 135665 }, { "epoch": 3.314440671340972, "grad_norm": 0.007939993403851986, "learning_rate": 6.160839043279926e-07, "loss": 0.0, "num_input_tokens_seen": 91430544, "step": 135670 }, { "epoch": 3.314562822172819, "grad_norm": 0.0004061466024722904, "learning_rate": 6.160051631190623e-07, "loss": 0.0, "num_input_tokens_seen": 91433872, "step": 135675 }, { "epoch": 3.314684973004666, "grad_norm": 0.010502016171813011, "learning_rate": 6.159264247026456e-07, "loss": 0.0, "num_input_tokens_seen": 91437200, "step": 135680 }, { "epoch": 3.3148071238365135, "grad_norm": 0.0011987907346338034, "learning_rate": 6.158476890793152e-07, "loss": 0.0, "num_input_tokens_seen": 91440528, "step": 135685 }, { "epoch": 3.3149292746683603, "grad_norm": 0.000779199821408838, "learning_rate": 6.157689562496439e-07, "loss": 0.0709, "num_input_tokens_seen": 91443984, "step": 135690 }, { "epoch": 3.3150514255002075, "grad_norm": 0.0010632823687046766, "learning_rate": 6.15690226214204e-07, "loss": 0.0, "num_input_tokens_seen": 91447248, "step": 135695 }, { "epoch": 3.3151735763320547, "grad_norm": 0.005994278471916914, "learning_rate": 6.156114989735682e-07, "loss": 0.0, "num_input_tokens_seen": 91450448, "step": 135700 }, { "epoch": 3.315295727163902, "grad_norm": 0.00121176743414253, "learning_rate": 6.15532774528309e-07, "loss": 0.0, "num_input_tokens_seen": 91454096, "step": 135705 }, { "epoch": 3.315417877995749, "grad_norm": 0.0034481610637158155, "learning_rate": 6.154540528789988e-07, "loss": 0.0, "num_input_tokens_seen": 91457552, "step": 135710 }, { "epoch": 3.3155400288275962, "grad_norm": 0.0012256080517545342, "learning_rate": 6.153753340262101e-07, "loss": 0.0, "num_input_tokens_seen": 91461264, "step": 135715 }, { "epoch": 3.3156621796594434, "grad_norm": 0.003028532722964883, "learning_rate": 6.152966179705154e-07, "loss": 0.0, "num_input_tokens_seen": 91464656, "step": 135720 }, { "epoch": 3.3157843304912906, "grad_norm": 35.18305587768555, "learning_rate": 6.152179047124875e-07, "loss": 0.2296, "num_input_tokens_seen": 91467728, "step": 135725 }, { "epoch": 3.315906481323138, "grad_norm": 0.003076448105275631, "learning_rate": 6.15139194252698e-07, "loss": 0.0, "num_input_tokens_seen": 91470928, "step": 135730 }, { "epoch": 3.316028632154985, "grad_norm": 0.0004749661311507225, "learning_rate": 6.150604865917201e-07, "loss": 0.0001, "num_input_tokens_seen": 91474448, "step": 135735 }, { "epoch": 3.316150782986832, "grad_norm": 0.006054687313735485, "learning_rate": 6.149817817301257e-07, "loss": 0.0, "num_input_tokens_seen": 91478032, "step": 135740 }, { "epoch": 3.3162729338186794, "grad_norm": 0.003576445858925581, "learning_rate": 6.149030796684875e-07, "loss": 0.0, "num_input_tokens_seen": 91481744, "step": 135745 }, { "epoch": 3.3163950846505266, "grad_norm": 0.0043333168141543865, "learning_rate": 6.148243804073776e-07, "loss": 0.0, "num_input_tokens_seen": 91485072, "step": 135750 }, { "epoch": 3.3165172354823738, "grad_norm": 0.016035636886954308, "learning_rate": 6.147456839473684e-07, "loss": 0.0, "num_input_tokens_seen": 91488464, "step": 135755 }, { "epoch": 3.316639386314221, "grad_norm": 0.06646835058927536, "learning_rate": 6.146669902890324e-07, "loss": 0.0001, "num_input_tokens_seen": 91491792, "step": 135760 }, { "epoch": 3.316761537146068, "grad_norm": 0.02964874729514122, "learning_rate": 6.145882994329414e-07, "loss": 0.0, "num_input_tokens_seen": 91495056, "step": 135765 }, { "epoch": 3.3168836879779153, "grad_norm": 0.06201769784092903, "learning_rate": 6.145096113796684e-07, "loss": 0.0, "num_input_tokens_seen": 91498256, "step": 135770 }, { "epoch": 3.317005838809762, "grad_norm": 52.88552474975586, "learning_rate": 6.144309261297847e-07, "loss": 0.0546, "num_input_tokens_seen": 91501776, "step": 135775 }, { "epoch": 3.3171279896416097, "grad_norm": 0.17194008827209473, "learning_rate": 6.143522436838628e-07, "loss": 0.0001, "num_input_tokens_seen": 91504656, "step": 135780 }, { "epoch": 3.3172501404734565, "grad_norm": 0.00464673014357686, "learning_rate": 6.142735640424759e-07, "loss": 0.0027, "num_input_tokens_seen": 91507792, "step": 135785 }, { "epoch": 3.3173722913053036, "grad_norm": 0.009759887121617794, "learning_rate": 6.141948872061947e-07, "loss": 0.0001, "num_input_tokens_seen": 91511184, "step": 135790 }, { "epoch": 3.317494442137151, "grad_norm": 0.19676682353019714, "learning_rate": 6.141162131755926e-07, "loss": 0.0, "num_input_tokens_seen": 91514576, "step": 135795 }, { "epoch": 3.317616592968998, "grad_norm": 0.15152515470981598, "learning_rate": 6.140375419512406e-07, "loss": 0.0001, "num_input_tokens_seen": 91517840, "step": 135800 }, { "epoch": 3.317738743800845, "grad_norm": 0.014471475966274738, "learning_rate": 6.139588735337118e-07, "loss": 0.0, "num_input_tokens_seen": 91521232, "step": 135805 }, { "epoch": 3.3178608946326924, "grad_norm": 0.001642052666284144, "learning_rate": 6.138802079235781e-07, "loss": 0.0, "num_input_tokens_seen": 91524240, "step": 135810 }, { "epoch": 3.3179830454645396, "grad_norm": 0.013218896463513374, "learning_rate": 6.138015451214109e-07, "loss": 0.0002, "num_input_tokens_seen": 91527696, "step": 135815 }, { "epoch": 3.318105196296387, "grad_norm": 0.0030538730788975954, "learning_rate": 6.137228851277831e-07, "loss": 0.0001, "num_input_tokens_seen": 91531920, "step": 135820 }, { "epoch": 3.318227347128234, "grad_norm": 0.006154006812721491, "learning_rate": 6.136442279432661e-07, "loss": 0.058, "num_input_tokens_seen": 91535824, "step": 135825 }, { "epoch": 3.318349497960081, "grad_norm": 0.008251064456999302, "learning_rate": 6.135655735684327e-07, "loss": 0.0468, "num_input_tokens_seen": 91539216, "step": 135830 }, { "epoch": 3.3184716487919284, "grad_norm": 28.197277069091797, "learning_rate": 6.134869220038537e-07, "loss": 0.0526, "num_input_tokens_seen": 91542864, "step": 135835 }, { "epoch": 3.3185937996237755, "grad_norm": 0.0020110972691327333, "learning_rate": 6.134082732501018e-07, "loss": 0.0, "num_input_tokens_seen": 91546000, "step": 135840 }, { "epoch": 3.3187159504556227, "grad_norm": 0.0014587831683456898, "learning_rate": 6.133296273077495e-07, "loss": 0.0, "num_input_tokens_seen": 91549392, "step": 135845 }, { "epoch": 3.31883810128747, "grad_norm": 0.003994397819042206, "learning_rate": 6.132509841773678e-07, "loss": 0.0001, "num_input_tokens_seen": 91552912, "step": 135850 }, { "epoch": 3.318960252119317, "grad_norm": 0.42959192395210266, "learning_rate": 6.131723438595291e-07, "loss": 0.0, "num_input_tokens_seen": 91555984, "step": 135855 }, { "epoch": 3.319082402951164, "grad_norm": 0.0016079478664323688, "learning_rate": 6.13093706354805e-07, "loss": 0.0, "num_input_tokens_seen": 91559312, "step": 135860 }, { "epoch": 3.3192045537830115, "grad_norm": 0.10521599650382996, "learning_rate": 6.130150716637677e-07, "loss": 0.0, "num_input_tokens_seen": 91563280, "step": 135865 }, { "epoch": 3.3193267046148582, "grad_norm": 0.006913943216204643, "learning_rate": 6.129364397869887e-07, "loss": 0.0, "num_input_tokens_seen": 91566736, "step": 135870 }, { "epoch": 3.3194488554467054, "grad_norm": 0.0004993074107915163, "learning_rate": 6.128578107250399e-07, "loss": 0.0, "num_input_tokens_seen": 91570256, "step": 135875 }, { "epoch": 3.3195710062785526, "grad_norm": 0.0032401448115706444, "learning_rate": 6.127791844784937e-07, "loss": 0.0, "num_input_tokens_seen": 91573968, "step": 135880 }, { "epoch": 3.3196931571104, "grad_norm": 0.0026708885561674833, "learning_rate": 6.12700561047921e-07, "loss": 0.0, "num_input_tokens_seen": 91577424, "step": 135885 }, { "epoch": 3.319815307942247, "grad_norm": 0.41900894045829773, "learning_rate": 6.126219404338944e-07, "loss": 0.0002, "num_input_tokens_seen": 91581072, "step": 135890 }, { "epoch": 3.319937458774094, "grad_norm": 0.002558025298640132, "learning_rate": 6.125433226369847e-07, "loss": 0.0095, "num_input_tokens_seen": 91584848, "step": 135895 }, { "epoch": 3.3200596096059414, "grad_norm": 0.00013529101852327585, "learning_rate": 6.124647076577644e-07, "loss": 0.0, "num_input_tokens_seen": 91588240, "step": 135900 }, { "epoch": 3.3201817604377886, "grad_norm": 0.002773279557004571, "learning_rate": 6.123860954968051e-07, "loss": 0.0, "num_input_tokens_seen": 91591184, "step": 135905 }, { "epoch": 3.3203039112696358, "grad_norm": 0.00456498796120286, "learning_rate": 6.123074861546783e-07, "loss": 0.0477, "num_input_tokens_seen": 91594576, "step": 135910 }, { "epoch": 3.320426062101483, "grad_norm": 0.010695680975914001, "learning_rate": 6.122288796319559e-07, "loss": 0.0001, "num_input_tokens_seen": 91597776, "step": 135915 }, { "epoch": 3.32054821293333, "grad_norm": 0.007244298234581947, "learning_rate": 6.121502759292091e-07, "loss": 0.0001, "num_input_tokens_seen": 91601168, "step": 135920 }, { "epoch": 3.3206703637651773, "grad_norm": 0.0032030600123107433, "learning_rate": 6.120716750470102e-07, "loss": 0.0, "num_input_tokens_seen": 91604496, "step": 135925 }, { "epoch": 3.3207925145970245, "grad_norm": 0.00389327434822917, "learning_rate": 6.119930769859299e-07, "loss": 0.0, "num_input_tokens_seen": 91607696, "step": 135930 }, { "epoch": 3.3209146654288717, "grad_norm": 0.025663211941719055, "learning_rate": 6.119144817465405e-07, "loss": 0.0001, "num_input_tokens_seen": 91611216, "step": 135935 }, { "epoch": 3.321036816260719, "grad_norm": 0.0016724220477044582, "learning_rate": 6.118358893294135e-07, "loss": 0.0, "num_input_tokens_seen": 91614608, "step": 135940 }, { "epoch": 3.321158967092566, "grad_norm": 0.0005501621635630727, "learning_rate": 6.117572997351201e-07, "loss": 0.0, "num_input_tokens_seen": 91617680, "step": 135945 }, { "epoch": 3.3212811179244133, "grad_norm": 0.003155820071697235, "learning_rate": 6.116787129642324e-07, "loss": 0.0, "num_input_tokens_seen": 91620944, "step": 135950 }, { "epoch": 3.32140326875626, "grad_norm": 0.004578068852424622, "learning_rate": 6.116001290173211e-07, "loss": 0.0, "num_input_tokens_seen": 91624464, "step": 135955 }, { "epoch": 3.321525419588107, "grad_norm": 0.0008005460840649903, "learning_rate": 6.115215478949587e-07, "loss": 0.0, "num_input_tokens_seen": 91627728, "step": 135960 }, { "epoch": 3.3216475704199544, "grad_norm": 0.009103553369641304, "learning_rate": 6.114429695977157e-07, "loss": 0.0, "num_input_tokens_seen": 91631184, "step": 135965 }, { "epoch": 3.3217697212518016, "grad_norm": 0.0013579517835751176, "learning_rate": 6.113643941261639e-07, "loss": 0.0, "num_input_tokens_seen": 91634384, "step": 135970 }, { "epoch": 3.3218918720836488, "grad_norm": 9.428247722098604e-05, "learning_rate": 6.112858214808749e-07, "loss": 0.0, "num_input_tokens_seen": 91638032, "step": 135975 }, { "epoch": 3.322014022915496, "grad_norm": 0.004081009421497583, "learning_rate": 6.112072516624198e-07, "loss": 0.0, "num_input_tokens_seen": 91641552, "step": 135980 }, { "epoch": 3.322136173747343, "grad_norm": 0.011382028460502625, "learning_rate": 6.111286846713704e-07, "loss": 0.0, "num_input_tokens_seen": 91644816, "step": 135985 }, { "epoch": 3.3222583245791903, "grad_norm": 0.07787194848060608, "learning_rate": 6.110501205082976e-07, "loss": 0.0001, "num_input_tokens_seen": 91648016, "step": 135990 }, { "epoch": 3.3223804754110375, "grad_norm": 0.006711010821163654, "learning_rate": 6.109715591737727e-07, "loss": 0.0512, "num_input_tokens_seen": 91651088, "step": 135995 }, { "epoch": 3.3225026262428847, "grad_norm": 0.0014324527001008391, "learning_rate": 6.108930006683678e-07, "loss": 0.0356, "num_input_tokens_seen": 91654352, "step": 136000 }, { "epoch": 3.322624777074732, "grad_norm": 0.0011398588540032506, "learning_rate": 6.108144449926533e-07, "loss": 0.0, "num_input_tokens_seen": 91657360, "step": 136005 }, { "epoch": 3.322746927906579, "grad_norm": 0.02468794398009777, "learning_rate": 6.10735892147201e-07, "loss": 0.0, "num_input_tokens_seen": 91660560, "step": 136010 }, { "epoch": 3.3228690787384263, "grad_norm": 0.0005330094136297703, "learning_rate": 6.10657342132582e-07, "loss": 0.0407, "num_input_tokens_seen": 91663824, "step": 136015 }, { "epoch": 3.3229912295702735, "grad_norm": 0.0012963847257196903, "learning_rate": 6.105787949493675e-07, "loss": 0.0, "num_input_tokens_seen": 91666896, "step": 136020 }, { "epoch": 3.3231133804021207, "grad_norm": 0.0028438479639589787, "learning_rate": 6.105002505981287e-07, "loss": 0.0, "num_input_tokens_seen": 91670224, "step": 136025 }, { "epoch": 3.323235531233968, "grad_norm": 0.033919740468263626, "learning_rate": 6.104217090794365e-07, "loss": 0.0, "num_input_tokens_seen": 91673616, "step": 136030 }, { "epoch": 3.323357682065815, "grad_norm": 0.015069283545017242, "learning_rate": 6.10343170393863e-07, "loss": 0.0001, "num_input_tokens_seen": 91677136, "step": 136035 }, { "epoch": 3.323479832897662, "grad_norm": 0.009964990429580212, "learning_rate": 6.102646345419784e-07, "loss": 0.0, "num_input_tokens_seen": 91680208, "step": 136040 }, { "epoch": 3.3236019837295094, "grad_norm": 0.0020420521032065153, "learning_rate": 6.101861015243546e-07, "loss": 0.0, "num_input_tokens_seen": 91683984, "step": 136045 }, { "epoch": 3.323724134561356, "grad_norm": 0.0036547784693539143, "learning_rate": 6.101075713415617e-07, "loss": 0.0256, "num_input_tokens_seen": 91687120, "step": 136050 }, { "epoch": 3.3238462853932034, "grad_norm": 0.0009308999869972467, "learning_rate": 6.100290439941718e-07, "loss": 0.0001, "num_input_tokens_seen": 91690576, "step": 136055 }, { "epoch": 3.3239684362250506, "grad_norm": 18.14400291442871, "learning_rate": 6.099505194827557e-07, "loss": 0.0653, "num_input_tokens_seen": 91693648, "step": 136060 }, { "epoch": 3.3240905870568977, "grad_norm": 0.033947087824344635, "learning_rate": 6.098719978078841e-07, "loss": 0.0001, "num_input_tokens_seen": 91696976, "step": 136065 }, { "epoch": 3.324212737888745, "grad_norm": 0.06831333786249161, "learning_rate": 6.097934789701285e-07, "loss": 0.0, "num_input_tokens_seen": 91700816, "step": 136070 }, { "epoch": 3.324334888720592, "grad_norm": 0.023533586412668228, "learning_rate": 6.097149629700593e-07, "loss": 0.0, "num_input_tokens_seen": 91704336, "step": 136075 }, { "epoch": 3.3244570395524393, "grad_norm": 126.68274688720703, "learning_rate": 6.096364498082483e-07, "loss": 0.0022, "num_input_tokens_seen": 91707536, "step": 136080 }, { "epoch": 3.3245791903842865, "grad_norm": 0.0028027426451444626, "learning_rate": 6.095579394852657e-07, "loss": 0.0, "num_input_tokens_seen": 91710800, "step": 136085 }, { "epoch": 3.3247013412161337, "grad_norm": 8.272482872009277, "learning_rate": 6.094794320016826e-07, "loss": 0.0003, "num_input_tokens_seen": 91713872, "step": 136090 }, { "epoch": 3.324823492047981, "grad_norm": 0.0009194356389343739, "learning_rate": 6.094009273580707e-07, "loss": 0.0, "num_input_tokens_seen": 91717520, "step": 136095 }, { "epoch": 3.324945642879828, "grad_norm": 0.0003622818912845105, "learning_rate": 6.093224255549998e-07, "loss": 0.0, "num_input_tokens_seen": 91720912, "step": 136100 }, { "epoch": 3.3250677937116753, "grad_norm": 0.0019771524239331484, "learning_rate": 6.092439265930416e-07, "loss": 0.0385, "num_input_tokens_seen": 91724240, "step": 136105 }, { "epoch": 3.3251899445435225, "grad_norm": 0.0015008923364803195, "learning_rate": 6.091654304727665e-07, "loss": 0.0001, "num_input_tokens_seen": 91727504, "step": 136110 }, { "epoch": 3.3253120953753696, "grad_norm": 0.0024910790380090475, "learning_rate": 6.090869371947458e-07, "loss": 0.0, "num_input_tokens_seen": 91730704, "step": 136115 }, { "epoch": 3.325434246207217, "grad_norm": 0.0058991131372749805, "learning_rate": 6.090084467595497e-07, "loss": 0.0, "num_input_tokens_seen": 91734224, "step": 136120 }, { "epoch": 3.325556397039064, "grad_norm": 0.002062541898339987, "learning_rate": 6.089299591677492e-07, "loss": 0.0, "num_input_tokens_seen": 91737360, "step": 136125 }, { "epoch": 3.325678547870911, "grad_norm": 0.001725882524624467, "learning_rate": 6.088514744199158e-07, "loss": 0.0, "num_input_tokens_seen": 91740432, "step": 136130 }, { "epoch": 3.325800698702758, "grad_norm": 0.004223139490932226, "learning_rate": 6.087729925166191e-07, "loss": 0.0005, "num_input_tokens_seen": 91743952, "step": 136135 }, { "epoch": 3.325922849534605, "grad_norm": 0.00504926685243845, "learning_rate": 6.086945134584311e-07, "loss": 0.0, "num_input_tokens_seen": 91747792, "step": 136140 }, { "epoch": 3.3260450003664523, "grad_norm": 0.001159909414127469, "learning_rate": 6.086160372459211e-07, "loss": 0.0, "num_input_tokens_seen": 91751312, "step": 136145 }, { "epoch": 3.3261671511982995, "grad_norm": 0.005098773166537285, "learning_rate": 6.085375638796608e-07, "loss": 0.0, "num_input_tokens_seen": 91754576, "step": 136150 }, { "epoch": 3.3262893020301467, "grad_norm": 0.10678897053003311, "learning_rate": 6.084590933602209e-07, "loss": 0.0, "num_input_tokens_seen": 91757968, "step": 136155 }, { "epoch": 3.326411452861994, "grad_norm": 0.009015164338052273, "learning_rate": 6.083806256881716e-07, "loss": 0.0, "num_input_tokens_seen": 91761360, "step": 136160 }, { "epoch": 3.326533603693841, "grad_norm": 0.0019480243790894747, "learning_rate": 6.083021608640837e-07, "loss": 0.0, "num_input_tokens_seen": 91764944, "step": 136165 }, { "epoch": 3.3266557545256883, "grad_norm": 0.00024862895952537656, "learning_rate": 6.082236988885279e-07, "loss": 0.0, "num_input_tokens_seen": 91768016, "step": 136170 }, { "epoch": 3.3267779053575355, "grad_norm": 0.0010257846442982554, "learning_rate": 6.081452397620747e-07, "loss": 0.0414, "num_input_tokens_seen": 91770960, "step": 136175 }, { "epoch": 3.3269000561893827, "grad_norm": 0.0008336947648786008, "learning_rate": 6.080667834852948e-07, "loss": 0.0, "num_input_tokens_seen": 91774160, "step": 136180 }, { "epoch": 3.32702220702123, "grad_norm": 0.04634743928909302, "learning_rate": 6.079883300587583e-07, "loss": 0.0, "num_input_tokens_seen": 91776912, "step": 136185 }, { "epoch": 3.327144357853077, "grad_norm": 0.004024902358651161, "learning_rate": 6.079098794830366e-07, "loss": 0.0001, "num_input_tokens_seen": 91780176, "step": 136190 }, { "epoch": 3.3272665086849242, "grad_norm": 0.018328385427594185, "learning_rate": 6.078314317586992e-07, "loss": 0.0001, "num_input_tokens_seen": 91783120, "step": 136195 }, { "epoch": 3.3273886595167714, "grad_norm": 0.013721533119678497, "learning_rate": 6.077529868863178e-07, "loss": 0.0, "num_input_tokens_seen": 91786512, "step": 136200 }, { "epoch": 3.3275108103486186, "grad_norm": 0.002645154483616352, "learning_rate": 6.076745448664616e-07, "loss": 0.0, "num_input_tokens_seen": 91789648, "step": 136205 }, { "epoch": 3.327632961180466, "grad_norm": 0.0104436706751585, "learning_rate": 6.075961056997017e-07, "loss": 0.0, "num_input_tokens_seen": 91792848, "step": 136210 }, { "epoch": 3.327755112012313, "grad_norm": 0.036987703293561935, "learning_rate": 6.075176693866086e-07, "loss": 0.0433, "num_input_tokens_seen": 91796176, "step": 136215 }, { "epoch": 3.3278772628441597, "grad_norm": 0.004183395765721798, "learning_rate": 6.074392359277526e-07, "loss": 0.0353, "num_input_tokens_seen": 91799568, "step": 136220 }, { "epoch": 3.3279994136760074, "grad_norm": 0.003504760330542922, "learning_rate": 6.073608053237042e-07, "loss": 0.0001, "num_input_tokens_seen": 91802960, "step": 136225 }, { "epoch": 3.328121564507854, "grad_norm": 0.0017839574720710516, "learning_rate": 6.072823775750333e-07, "loss": 0.0001, "num_input_tokens_seen": 91805904, "step": 136230 }, { "epoch": 3.3282437153397013, "grad_norm": 0.00135110376868397, "learning_rate": 6.072039526823109e-07, "loss": 0.0, "num_input_tokens_seen": 91809232, "step": 136235 }, { "epoch": 3.3283658661715485, "grad_norm": 0.0043913801200687885, "learning_rate": 6.071255306461067e-07, "loss": 0.0, "num_input_tokens_seen": 91812368, "step": 136240 }, { "epoch": 3.3284880170033957, "grad_norm": 0.0013979102950543165, "learning_rate": 6.070471114669913e-07, "loss": 0.0001, "num_input_tokens_seen": 91815632, "step": 136245 }, { "epoch": 3.328610167835243, "grad_norm": 0.013382709585130215, "learning_rate": 6.069686951455353e-07, "loss": 0.0, "num_input_tokens_seen": 91818768, "step": 136250 }, { "epoch": 3.32873231866709, "grad_norm": 0.0029500352684408426, "learning_rate": 6.068902816823083e-07, "loss": 0.0, "num_input_tokens_seen": 91822480, "step": 136255 }, { "epoch": 3.3288544694989373, "grad_norm": 0.0005986875621601939, "learning_rate": 6.068118710778813e-07, "loss": 0.0001, "num_input_tokens_seen": 91825680, "step": 136260 }, { "epoch": 3.3289766203307845, "grad_norm": 0.0019035233417525887, "learning_rate": 6.067334633328237e-07, "loss": 0.0001, "num_input_tokens_seen": 91829392, "step": 136265 }, { "epoch": 3.3290987711626316, "grad_norm": 0.0014049690216779709, "learning_rate": 6.066550584477065e-07, "loss": 0.0001, "num_input_tokens_seen": 91832784, "step": 136270 }, { "epoch": 3.329220921994479, "grad_norm": 0.002635091543197632, "learning_rate": 6.065766564230995e-07, "loss": 0.0, "num_input_tokens_seen": 91836176, "step": 136275 }, { "epoch": 3.329343072826326, "grad_norm": 0.0010414626449346542, "learning_rate": 6.064982572595725e-07, "loss": 0.0607, "num_input_tokens_seen": 91839312, "step": 136280 }, { "epoch": 3.329465223658173, "grad_norm": 0.0004415341536514461, "learning_rate": 6.064198609576965e-07, "loss": 0.0, "num_input_tokens_seen": 91843216, "step": 136285 }, { "epoch": 3.3295873744900204, "grad_norm": 0.02237209863960743, "learning_rate": 6.063414675180407e-07, "loss": 0.0, "num_input_tokens_seen": 91846480, "step": 136290 }, { "epoch": 3.3297095253218676, "grad_norm": 0.008092211559414864, "learning_rate": 6.06263076941176e-07, "loss": 0.0, "num_input_tokens_seen": 91850320, "step": 136295 }, { "epoch": 3.329831676153715, "grad_norm": 0.0001304069155594334, "learning_rate": 6.061846892276718e-07, "loss": 0.0, "num_input_tokens_seen": 91853712, "step": 136300 }, { "epoch": 3.3299538269855615, "grad_norm": 0.0009956281865015626, "learning_rate": 6.061063043780985e-07, "loss": 0.0, "num_input_tokens_seen": 91857040, "step": 136305 }, { "epoch": 3.330075977817409, "grad_norm": 0.0025123651139438152, "learning_rate": 6.060279223930263e-07, "loss": 0.0343, "num_input_tokens_seen": 91860368, "step": 136310 }, { "epoch": 3.330198128649256, "grad_norm": 0.001795079791918397, "learning_rate": 6.059495432730248e-07, "loss": 0.0, "num_input_tokens_seen": 91863632, "step": 136315 }, { "epoch": 3.330320279481103, "grad_norm": 0.0031533746514469385, "learning_rate": 6.058711670186645e-07, "loss": 0.0, "num_input_tokens_seen": 91867024, "step": 136320 }, { "epoch": 3.3304424303129503, "grad_norm": 0.0018401495181024075, "learning_rate": 6.057927936305149e-07, "loss": 0.0006, "num_input_tokens_seen": 91870160, "step": 136325 }, { "epoch": 3.3305645811447975, "grad_norm": 0.019284943118691444, "learning_rate": 6.057144231091461e-07, "loss": 0.0, "num_input_tokens_seen": 91873232, "step": 136330 }, { "epoch": 3.3306867319766447, "grad_norm": 0.008021565154194832, "learning_rate": 6.056360554551281e-07, "loss": 0.0, "num_input_tokens_seen": 91876048, "step": 136335 }, { "epoch": 3.330808882808492, "grad_norm": 0.014595117419958115, "learning_rate": 6.055576906690306e-07, "loss": 0.0001, "num_input_tokens_seen": 91879120, "step": 136340 }, { "epoch": 3.330931033640339, "grad_norm": 0.07980555295944214, "learning_rate": 6.054793287514241e-07, "loss": 0.0, "num_input_tokens_seen": 91882320, "step": 136345 }, { "epoch": 3.3310531844721862, "grad_norm": 0.1366054266691208, "learning_rate": 6.054009697028776e-07, "loss": 0.0, "num_input_tokens_seen": 91885776, "step": 136350 }, { "epoch": 3.3311753353040334, "grad_norm": 0.05009021982550621, "learning_rate": 6.053226135239618e-07, "loss": 0.0, "num_input_tokens_seen": 91889040, "step": 136355 }, { "epoch": 3.3312974861358806, "grad_norm": 0.00617753341794014, "learning_rate": 6.052442602152457e-07, "loss": 0.0002, "num_input_tokens_seen": 91892496, "step": 136360 }, { "epoch": 3.331419636967728, "grad_norm": 0.007933614775538445, "learning_rate": 6.051659097772996e-07, "loss": 0.0, "num_input_tokens_seen": 91895760, "step": 136365 }, { "epoch": 3.331541787799575, "grad_norm": 0.09009955078363419, "learning_rate": 6.050875622106932e-07, "loss": 0.0, "num_input_tokens_seen": 91898704, "step": 136370 }, { "epoch": 3.331663938631422, "grad_norm": 0.00013575205230154097, "learning_rate": 6.050092175159964e-07, "loss": 0.0667, "num_input_tokens_seen": 91901968, "step": 136375 }, { "epoch": 3.3317860894632694, "grad_norm": 0.00041842387872748077, "learning_rate": 6.04930875693779e-07, "loss": 0.0, "num_input_tokens_seen": 91904976, "step": 136380 }, { "epoch": 3.3319082402951166, "grad_norm": 0.0005398113862611353, "learning_rate": 6.048525367446102e-07, "loss": 0.0, "num_input_tokens_seen": 91908368, "step": 136385 }, { "epoch": 3.3320303911269638, "grad_norm": 0.019100764766335487, "learning_rate": 6.047742006690602e-07, "loss": 0.0527, "num_input_tokens_seen": 91911312, "step": 136390 }, { "epoch": 3.332152541958811, "grad_norm": 0.0005279332399368286, "learning_rate": 6.046958674676983e-07, "loss": 0.0, "num_input_tokens_seen": 91914512, "step": 136395 }, { "epoch": 3.3322746927906577, "grad_norm": 0.00022604659898206592, "learning_rate": 6.046175371410944e-07, "loss": 0.0, "num_input_tokens_seen": 91917968, "step": 136400 }, { "epoch": 3.332396843622505, "grad_norm": 0.00010783207835629582, "learning_rate": 6.045392096898184e-07, "loss": 0.0, "num_input_tokens_seen": 91921488, "step": 136405 }, { "epoch": 3.332518994454352, "grad_norm": 0.011618511751294136, "learning_rate": 6.044608851144392e-07, "loss": 0.0, "num_input_tokens_seen": 91924368, "step": 136410 }, { "epoch": 3.3326411452861993, "grad_norm": 0.03842709958553314, "learning_rate": 6.043825634155274e-07, "loss": 0.0453, "num_input_tokens_seen": 91927568, "step": 136415 }, { "epoch": 3.3327632961180464, "grad_norm": 0.001610856968909502, "learning_rate": 6.043042445936515e-07, "loss": 0.0, "num_input_tokens_seen": 91930832, "step": 136420 }, { "epoch": 3.3328854469498936, "grad_norm": 0.008025610819458961, "learning_rate": 6.04225928649382e-07, "loss": 0.0, "num_input_tokens_seen": 91934608, "step": 136425 }, { "epoch": 3.333007597781741, "grad_norm": 0.000821794499643147, "learning_rate": 6.041476155832877e-07, "loss": 0.0, "num_input_tokens_seen": 91937936, "step": 136430 }, { "epoch": 3.333129748613588, "grad_norm": 0.007527175359427929, "learning_rate": 6.040693053959384e-07, "loss": 0.0, "num_input_tokens_seen": 91941200, "step": 136435 }, { "epoch": 3.333251899445435, "grad_norm": 0.01938558928668499, "learning_rate": 6.039909980879039e-07, "loss": 0.0, "num_input_tokens_seen": 91944400, "step": 136440 }, { "epoch": 3.3333740502772824, "grad_norm": 0.031942564994096756, "learning_rate": 6.039126936597529e-07, "loss": 0.0, "num_input_tokens_seen": 91947920, "step": 136445 }, { "epoch": 3.3334962011091296, "grad_norm": 0.00023766096273902804, "learning_rate": 6.038343921120558e-07, "loss": 0.0, "num_input_tokens_seen": 91951120, "step": 136450 }, { "epoch": 3.3336183519409768, "grad_norm": 0.0017525376752018929, "learning_rate": 6.037560934453812e-07, "loss": 0.0, "num_input_tokens_seen": 91954192, "step": 136455 }, { "epoch": 3.333740502772824, "grad_norm": 0.7847245335578918, "learning_rate": 6.036777976602987e-07, "loss": 0.0001, "num_input_tokens_seen": 91957392, "step": 136460 }, { "epoch": 3.333862653604671, "grad_norm": 0.006502887699753046, "learning_rate": 6.035995047573785e-07, "loss": 0.0, "num_input_tokens_seen": 91960464, "step": 136465 }, { "epoch": 3.3339848044365183, "grad_norm": 0.004712434485554695, "learning_rate": 6.035212147371887e-07, "loss": 0.0, "num_input_tokens_seen": 91964112, "step": 136470 }, { "epoch": 3.3341069552683655, "grad_norm": 0.05862710252404213, "learning_rate": 6.034429276002996e-07, "loss": 0.0, "num_input_tokens_seen": 91967184, "step": 136475 }, { "epoch": 3.3342291061002127, "grad_norm": 0.002594059333205223, "learning_rate": 6.033646433472803e-07, "loss": 0.0, "num_input_tokens_seen": 91970704, "step": 136480 }, { "epoch": 3.3343512569320595, "grad_norm": 3.1144535541534424, "learning_rate": 6.032863619786999e-07, "loss": 0.0005, "num_input_tokens_seen": 91974224, "step": 136485 }, { "epoch": 3.334473407763907, "grad_norm": 0.008164148777723312, "learning_rate": 6.032080834951276e-07, "loss": 0.0001, "num_input_tokens_seen": 91977488, "step": 136490 }, { "epoch": 3.334595558595754, "grad_norm": 0.0005573926609940827, "learning_rate": 6.031298078971328e-07, "loss": 0.0, "num_input_tokens_seen": 91981136, "step": 136495 }, { "epoch": 3.334717709427601, "grad_norm": 0.0009567593806423247, "learning_rate": 6.030515351852852e-07, "loss": 0.0, "num_input_tokens_seen": 91984656, "step": 136500 }, { "epoch": 3.3348398602594482, "grad_norm": 0.00125235749874264, "learning_rate": 6.029732653601531e-07, "loss": 0.055, "num_input_tokens_seen": 91987664, "step": 136505 }, { "epoch": 3.3349620110912954, "grad_norm": 0.22148816287517548, "learning_rate": 6.028949984223066e-07, "loss": 0.0001, "num_input_tokens_seen": 91991120, "step": 136510 }, { "epoch": 3.3350841619231426, "grad_norm": 0.009758691303431988, "learning_rate": 6.028167343723142e-07, "loss": 0.0645, "num_input_tokens_seen": 91994064, "step": 136515 }, { "epoch": 3.33520631275499, "grad_norm": 0.0007590046152472496, "learning_rate": 6.027384732107455e-07, "loss": 0.0, "num_input_tokens_seen": 91997072, "step": 136520 }, { "epoch": 3.335328463586837, "grad_norm": 0.01003759540617466, "learning_rate": 6.026602149381694e-07, "loss": 0.0, "num_input_tokens_seen": 91999952, "step": 136525 }, { "epoch": 3.335450614418684, "grad_norm": 0.0017680259188637137, "learning_rate": 6.025819595551551e-07, "loss": 0.0, "num_input_tokens_seen": 92003664, "step": 136530 }, { "epoch": 3.3355727652505314, "grad_norm": 16.769784927368164, "learning_rate": 6.025037070622719e-07, "loss": 0.0406, "num_input_tokens_seen": 92006928, "step": 136535 }, { "epoch": 3.3356949160823786, "grad_norm": 0.0016546935075893998, "learning_rate": 6.024254574600884e-07, "loss": 0.0, "num_input_tokens_seen": 92010320, "step": 136540 }, { "epoch": 3.3358170669142257, "grad_norm": 0.001963790040463209, "learning_rate": 6.02347210749174e-07, "loss": 0.0, "num_input_tokens_seen": 92013456, "step": 136545 }, { "epoch": 3.335939217746073, "grad_norm": 0.0012200935743749142, "learning_rate": 6.022689669300976e-07, "loss": 0.0662, "num_input_tokens_seen": 92016976, "step": 136550 }, { "epoch": 3.33606136857792, "grad_norm": 0.0235135480761528, "learning_rate": 6.02190726003428e-07, "loss": 0.0001, "num_input_tokens_seen": 92020368, "step": 136555 }, { "epoch": 3.3361835194097673, "grad_norm": 0.003793837269768119, "learning_rate": 6.021124879697348e-07, "loss": 0.0, "num_input_tokens_seen": 92023376, "step": 136560 }, { "epoch": 3.3363056702416145, "grad_norm": 0.0012269438011571765, "learning_rate": 6.020342528295863e-07, "loss": 0.0, "num_input_tokens_seen": 92026512, "step": 136565 }, { "epoch": 3.3364278210734617, "grad_norm": 0.00037712950143031776, "learning_rate": 6.019560205835522e-07, "loss": 0.0, "num_input_tokens_seen": 92029520, "step": 136570 }, { "epoch": 3.336549971905309, "grad_norm": 0.0009124455391429365, "learning_rate": 6.018777912322007e-07, "loss": 0.0002, "num_input_tokens_seen": 92032784, "step": 136575 }, { "epoch": 3.3366721227371556, "grad_norm": 0.03195099160075188, "learning_rate": 6.01799564776101e-07, "loss": 0.0, "num_input_tokens_seen": 92036432, "step": 136580 }, { "epoch": 3.336794273569003, "grad_norm": 0.0006369482143782079, "learning_rate": 6.017213412158221e-07, "loss": 0.0, "num_input_tokens_seen": 92040080, "step": 136585 }, { "epoch": 3.33691642440085, "grad_norm": 0.25829169154167175, "learning_rate": 6.016431205519323e-07, "loss": 0.0001, "num_input_tokens_seen": 92044112, "step": 136590 }, { "epoch": 3.337038575232697, "grad_norm": 0.0028545642271637917, "learning_rate": 6.015649027850015e-07, "loss": 0.0002, "num_input_tokens_seen": 92047504, "step": 136595 }, { "epoch": 3.3371607260645444, "grad_norm": 0.0025095988530665636, "learning_rate": 6.014866879155973e-07, "loss": 0.0, "num_input_tokens_seen": 92050704, "step": 136600 }, { "epoch": 3.3372828768963916, "grad_norm": 0.000693855807185173, "learning_rate": 6.014084759442897e-07, "loss": 0.0, "num_input_tokens_seen": 92054416, "step": 136605 }, { "epoch": 3.3374050277282388, "grad_norm": 0.03811664879322052, "learning_rate": 6.013302668716462e-07, "loss": 0.0, "num_input_tokens_seen": 92057808, "step": 136610 }, { "epoch": 3.337527178560086, "grad_norm": 138.32643127441406, "learning_rate": 6.012520606982365e-07, "loss": 0.0131, "num_input_tokens_seen": 92061328, "step": 136615 }, { "epoch": 3.337649329391933, "grad_norm": 0.006058790720999241, "learning_rate": 6.011738574246294e-07, "loss": 0.0003, "num_input_tokens_seen": 92064912, "step": 136620 }, { "epoch": 3.3377714802237803, "grad_norm": 0.0011405334807932377, "learning_rate": 6.010956570513927e-07, "loss": 0.0, "num_input_tokens_seen": 92068112, "step": 136625 }, { "epoch": 3.3378936310556275, "grad_norm": 0.0002621239691507071, "learning_rate": 6.010174595790963e-07, "loss": 0.0, "num_input_tokens_seen": 92071440, "step": 136630 }, { "epoch": 3.3380157818874747, "grad_norm": 0.0019386817002668977, "learning_rate": 6.009392650083079e-07, "loss": 0.2199, "num_input_tokens_seen": 92075024, "step": 136635 }, { "epoch": 3.338137932719322, "grad_norm": 0.0012935078702867031, "learning_rate": 6.008610733395965e-07, "loss": 0.0284, "num_input_tokens_seen": 92078352, "step": 136640 }, { "epoch": 3.338260083551169, "grad_norm": 0.007016130723059177, "learning_rate": 6.007828845735308e-07, "loss": 0.0, "num_input_tokens_seen": 92082192, "step": 136645 }, { "epoch": 3.3383822343830163, "grad_norm": 0.0018813019851222634, "learning_rate": 6.007046987106792e-07, "loss": 0.0, "num_input_tokens_seen": 92085136, "step": 136650 }, { "epoch": 3.3385043852148635, "grad_norm": 0.0037900370080024004, "learning_rate": 6.006265157516106e-07, "loss": 0.0, "num_input_tokens_seen": 92088400, "step": 136655 }, { "epoch": 3.3386265360467107, "grad_norm": 0.004031932447105646, "learning_rate": 6.005483356968932e-07, "loss": 0.0774, "num_input_tokens_seen": 92091216, "step": 136660 }, { "epoch": 3.3387486868785574, "grad_norm": 0.0031838095746934414, "learning_rate": 6.004701585470961e-07, "loss": 0.0, "num_input_tokens_seen": 92094544, "step": 136665 }, { "epoch": 3.338870837710405, "grad_norm": 0.0348113477230072, "learning_rate": 6.00391984302787e-07, "loss": 0.0, "num_input_tokens_seen": 92098000, "step": 136670 }, { "epoch": 3.338992988542252, "grad_norm": 0.0009348982712253928, "learning_rate": 6.003138129645353e-07, "loss": 0.0, "num_input_tokens_seen": 92101200, "step": 136675 }, { "epoch": 3.339115139374099, "grad_norm": 0.003419789019972086, "learning_rate": 6.002356445329088e-07, "loss": 0.0, "num_input_tokens_seen": 92104848, "step": 136680 }, { "epoch": 3.339237290205946, "grad_norm": 0.007083706092089415, "learning_rate": 6.001574790084763e-07, "loss": 0.0, "num_input_tokens_seen": 92108240, "step": 136685 }, { "epoch": 3.3393594410377934, "grad_norm": 0.030707471072673798, "learning_rate": 6.000793163918063e-07, "loss": 0.0, "num_input_tokens_seen": 92111696, "step": 136690 }, { "epoch": 3.3394815918696406, "grad_norm": 0.0045353625901043415, "learning_rate": 6.000011566834667e-07, "loss": 0.0, "num_input_tokens_seen": 92115152, "step": 136695 }, { "epoch": 3.3396037427014877, "grad_norm": 0.002015952952206135, "learning_rate": 5.999229998840268e-07, "loss": 0.0, "num_input_tokens_seen": 92118352, "step": 136700 }, { "epoch": 3.339725893533335, "grad_norm": 7.489074050681666e-05, "learning_rate": 5.998448459940539e-07, "loss": 0.0001, "num_input_tokens_seen": 92121744, "step": 136705 }, { "epoch": 3.339848044365182, "grad_norm": 0.057606253772974014, "learning_rate": 5.997666950141169e-07, "loss": 0.0, "num_input_tokens_seen": 92124752, "step": 136710 }, { "epoch": 3.3399701951970293, "grad_norm": 0.0022049969993531704, "learning_rate": 5.996885469447847e-07, "loss": 0.0, "num_input_tokens_seen": 92128336, "step": 136715 }, { "epoch": 3.3400923460288765, "grad_norm": 89.3311538696289, "learning_rate": 5.996104017866245e-07, "loss": 0.0918, "num_input_tokens_seen": 92131792, "step": 136720 }, { "epoch": 3.3402144968607237, "grad_norm": 0.000725537771359086, "learning_rate": 5.995322595402057e-07, "loss": 0.0001, "num_input_tokens_seen": 92135056, "step": 136725 }, { "epoch": 3.340336647692571, "grad_norm": 0.0006241207593120635, "learning_rate": 5.994541202060955e-07, "loss": 0.0, "num_input_tokens_seen": 92138384, "step": 136730 }, { "epoch": 3.340458798524418, "grad_norm": 0.003467158181592822, "learning_rate": 5.993759837848631e-07, "loss": 0.0, "num_input_tokens_seen": 92142480, "step": 136735 }, { "epoch": 3.3405809493562653, "grad_norm": 0.0008007539436221123, "learning_rate": 5.99297850277076e-07, "loss": 0.0, "num_input_tokens_seen": 92145744, "step": 136740 }, { "epoch": 3.3407031001881125, "grad_norm": 0.008967030793428421, "learning_rate": 5.992197196833026e-07, "loss": 0.0, "num_input_tokens_seen": 92149264, "step": 136745 }, { "epoch": 3.340825251019959, "grad_norm": 0.00047545693814754486, "learning_rate": 5.991415920041117e-07, "loss": 0.0009, "num_input_tokens_seen": 92152656, "step": 136750 }, { "epoch": 3.340947401851807, "grad_norm": 28.655147552490234, "learning_rate": 5.990634672400705e-07, "loss": 0.0902, "num_input_tokens_seen": 92155920, "step": 136755 }, { "epoch": 3.3410695526836536, "grad_norm": 0.0035769431851804256, "learning_rate": 5.98985345391748e-07, "loss": 0.0001, "num_input_tokens_seen": 92158928, "step": 136760 }, { "epoch": 3.3411917035155008, "grad_norm": 0.027868378907442093, "learning_rate": 5.989072264597115e-07, "loss": 0.0001, "num_input_tokens_seen": 92162000, "step": 136765 }, { "epoch": 3.341313854347348, "grad_norm": 0.0004942073719576001, "learning_rate": 5.988291104445296e-07, "loss": 0.0, "num_input_tokens_seen": 92165136, "step": 136770 }, { "epoch": 3.341436005179195, "grad_norm": 0.0015912620583549142, "learning_rate": 5.987509973467706e-07, "loss": 0.0, "num_input_tokens_seen": 92168976, "step": 136775 }, { "epoch": 3.3415581560110423, "grad_norm": 86.06957244873047, "learning_rate": 5.98672887167002e-07, "loss": 0.0235, "num_input_tokens_seen": 92172176, "step": 136780 }, { "epoch": 3.3416803068428895, "grad_norm": 0.0011086321901530027, "learning_rate": 5.985947799057924e-07, "loss": 0.0001, "num_input_tokens_seen": 92175504, "step": 136785 }, { "epoch": 3.3418024576747367, "grad_norm": 0.007813183590769768, "learning_rate": 5.985166755637092e-07, "loss": 0.0002, "num_input_tokens_seen": 92178768, "step": 136790 }, { "epoch": 3.341924608506584, "grad_norm": 0.00011821733642136678, "learning_rate": 5.984385741413209e-07, "loss": 0.0414, "num_input_tokens_seen": 92181904, "step": 136795 }, { "epoch": 3.342046759338431, "grad_norm": 37.56241989135742, "learning_rate": 5.983604756391954e-07, "loss": 0.1131, "num_input_tokens_seen": 92185488, "step": 136800 }, { "epoch": 3.3421689101702783, "grad_norm": 0.0004566130228340626, "learning_rate": 5.982823800579002e-07, "loss": 0.0001, "num_input_tokens_seen": 92189584, "step": 136805 }, { "epoch": 3.3422910610021255, "grad_norm": 0.03750479221343994, "learning_rate": 5.98204287398004e-07, "loss": 0.0001, "num_input_tokens_seen": 92193808, "step": 136810 }, { "epoch": 3.3424132118339727, "grad_norm": 0.008922645822167397, "learning_rate": 5.981261976600738e-07, "loss": 0.0, "num_input_tokens_seen": 92197072, "step": 136815 }, { "epoch": 3.34253536266582, "grad_norm": 0.010072392411530018, "learning_rate": 5.980481108446786e-07, "loss": 0.0, "num_input_tokens_seen": 92201296, "step": 136820 }, { "epoch": 3.342657513497667, "grad_norm": 0.015576275065541267, "learning_rate": 5.97970026952385e-07, "loss": 0.0001, "num_input_tokens_seen": 92204496, "step": 136825 }, { "epoch": 3.3427796643295142, "grad_norm": 0.16503624618053436, "learning_rate": 5.978919459837621e-07, "loss": 0.0001, "num_input_tokens_seen": 92207888, "step": 136830 }, { "epoch": 3.3429018151613614, "grad_norm": 0.004576961509883404, "learning_rate": 5.978138679393766e-07, "loss": 0.0, "num_input_tokens_seen": 92210896, "step": 136835 }, { "epoch": 3.3430239659932086, "grad_norm": 0.002583190565928817, "learning_rate": 5.977357928197971e-07, "loss": 0.0, "num_input_tokens_seen": 92213904, "step": 136840 }, { "epoch": 3.3431461168250554, "grad_norm": 0.002497687004506588, "learning_rate": 5.976577206255913e-07, "loss": 0.0, "num_input_tokens_seen": 92217232, "step": 136845 }, { "epoch": 3.343268267656903, "grad_norm": 0.0009140447364188731, "learning_rate": 5.975796513573263e-07, "loss": 0.0002, "num_input_tokens_seen": 92220240, "step": 136850 }, { "epoch": 3.3433904184887497, "grad_norm": 0.04105198755860329, "learning_rate": 5.975015850155708e-07, "loss": 0.0, "num_input_tokens_seen": 92224016, "step": 136855 }, { "epoch": 3.343512569320597, "grad_norm": 0.015119964256882668, "learning_rate": 5.974235216008916e-07, "loss": 0.0, "num_input_tokens_seen": 92227408, "step": 136860 }, { "epoch": 3.343634720152444, "grad_norm": 0.0003713774203788489, "learning_rate": 5.973454611138568e-07, "loss": 0.0001, "num_input_tokens_seen": 92230480, "step": 136865 }, { "epoch": 3.3437568709842913, "grad_norm": 0.010512808337807655, "learning_rate": 5.972674035550345e-07, "loss": 0.0921, "num_input_tokens_seen": 92234832, "step": 136870 }, { "epoch": 3.3438790218161385, "grad_norm": 0.0023981162812560797, "learning_rate": 5.971893489249917e-07, "loss": 0.0001, "num_input_tokens_seen": 92238224, "step": 136875 }, { "epoch": 3.3440011726479857, "grad_norm": 0.003602446988224983, "learning_rate": 5.971112972242966e-07, "loss": 0.0, "num_input_tokens_seen": 92241104, "step": 136880 }, { "epoch": 3.344123323479833, "grad_norm": 0.009283242747187614, "learning_rate": 5.970332484535161e-07, "loss": 0.0, "num_input_tokens_seen": 92244112, "step": 136885 }, { "epoch": 3.34424547431168, "grad_norm": 0.0038792439736425877, "learning_rate": 5.969552026132186e-07, "loss": 0.0, "num_input_tokens_seen": 92247376, "step": 136890 }, { "epoch": 3.3443676251435273, "grad_norm": 0.001365432282909751, "learning_rate": 5.968771597039711e-07, "loss": 0.0, "num_input_tokens_seen": 92250960, "step": 136895 }, { "epoch": 3.3444897759753744, "grad_norm": 0.0032672842498868704, "learning_rate": 5.967991197263412e-07, "loss": 0.0, "num_input_tokens_seen": 92254096, "step": 136900 }, { "epoch": 3.3446119268072216, "grad_norm": 0.05888065695762634, "learning_rate": 5.967210826808968e-07, "loss": 0.0, "num_input_tokens_seen": 92257680, "step": 136905 }, { "epoch": 3.344734077639069, "grad_norm": 0.006529807113111019, "learning_rate": 5.966430485682048e-07, "loss": 0.0, "num_input_tokens_seen": 92260688, "step": 136910 }, { "epoch": 3.344856228470916, "grad_norm": 0.011194230057299137, "learning_rate": 5.965650173888334e-07, "loss": 0.0, "num_input_tokens_seen": 92264784, "step": 136915 }, { "epoch": 3.344978379302763, "grad_norm": 0.00022088276455178857, "learning_rate": 5.964869891433494e-07, "loss": 0.0, "num_input_tokens_seen": 92268240, "step": 136920 }, { "epoch": 3.3451005301346104, "grad_norm": 0.0009188788244500756, "learning_rate": 5.964089638323204e-07, "loss": 0.0, "num_input_tokens_seen": 92271632, "step": 136925 }, { "epoch": 3.345222680966457, "grad_norm": 0.016729068011045456, "learning_rate": 5.963309414563146e-07, "loss": 0.0001, "num_input_tokens_seen": 92275280, "step": 136930 }, { "epoch": 3.3453448317983048, "grad_norm": 0.0008369608549401164, "learning_rate": 5.962529220158983e-07, "loss": 0.0, "num_input_tokens_seen": 92278672, "step": 136935 }, { "epoch": 3.3454669826301515, "grad_norm": 0.0001761860039550811, "learning_rate": 5.961749055116396e-07, "loss": 0.0, "num_input_tokens_seen": 92282192, "step": 136940 }, { "epoch": 3.3455891334619987, "grad_norm": 0.001143355155363679, "learning_rate": 5.960968919441055e-07, "loss": 0.0, "num_input_tokens_seen": 92285904, "step": 136945 }, { "epoch": 3.345711284293846, "grad_norm": 0.010505764745175838, "learning_rate": 5.960188813138634e-07, "loss": 0.0001, "num_input_tokens_seen": 92289104, "step": 136950 }, { "epoch": 3.345833435125693, "grad_norm": 0.02035011164844036, "learning_rate": 5.959408736214807e-07, "loss": 0.0001, "num_input_tokens_seen": 92292368, "step": 136955 }, { "epoch": 3.3459555859575403, "grad_norm": 0.0037497461307793856, "learning_rate": 5.958628688675244e-07, "loss": 0.0, "num_input_tokens_seen": 92295632, "step": 136960 }, { "epoch": 3.3460777367893875, "grad_norm": 0.002685282379388809, "learning_rate": 5.957848670525624e-07, "loss": 0.0, "num_input_tokens_seen": 92299536, "step": 136965 }, { "epoch": 3.3461998876212347, "grad_norm": 0.004526887554675341, "learning_rate": 5.957068681771613e-07, "loss": 0.0006, "num_input_tokens_seen": 92302608, "step": 136970 }, { "epoch": 3.346322038453082, "grad_norm": 0.002396609168499708, "learning_rate": 5.95628872241889e-07, "loss": 0.0, "num_input_tokens_seen": 92306000, "step": 136975 }, { "epoch": 3.346444189284929, "grad_norm": 0.0007503124652430415, "learning_rate": 5.955508792473118e-07, "loss": 0.0, "num_input_tokens_seen": 92309456, "step": 136980 }, { "epoch": 3.3465663401167762, "grad_norm": 0.0008637371938675642, "learning_rate": 5.954728891939977e-07, "loss": 0.0, "num_input_tokens_seen": 92313488, "step": 136985 }, { "epoch": 3.3466884909486234, "grad_norm": 0.0006992123671807349, "learning_rate": 5.953949020825133e-07, "loss": 0.0, "num_input_tokens_seen": 92316688, "step": 136990 }, { "epoch": 3.3468106417804706, "grad_norm": 0.004576565232127905, "learning_rate": 5.95316917913426e-07, "loss": 0.0, "num_input_tokens_seen": 92320080, "step": 136995 }, { "epoch": 3.346932792612318, "grad_norm": 0.010585155338048935, "learning_rate": 5.952389366873034e-07, "loss": 0.0, "num_input_tokens_seen": 92323664, "step": 137000 }, { "epoch": 3.347054943444165, "grad_norm": 0.0007783559849485755, "learning_rate": 5.951609584047117e-07, "loss": 0.0, "num_input_tokens_seen": 92327376, "step": 137005 }, { "epoch": 3.347177094276012, "grad_norm": 0.001543479971587658, "learning_rate": 5.950829830662186e-07, "loss": 0.0, "num_input_tokens_seen": 92330448, "step": 137010 }, { "epoch": 3.3472992451078594, "grad_norm": 0.0005223507178016007, "learning_rate": 5.950050106723907e-07, "loss": 0.0, "num_input_tokens_seen": 92334096, "step": 137015 }, { "epoch": 3.3474213959397066, "grad_norm": 19.34946060180664, "learning_rate": 5.949270412237953e-07, "loss": 0.039, "num_input_tokens_seen": 92337488, "step": 137020 }, { "epoch": 3.3475435467715533, "grad_norm": 0.0002101163554470986, "learning_rate": 5.948490747209997e-07, "loss": 0.043, "num_input_tokens_seen": 92341008, "step": 137025 }, { "epoch": 3.3476656976034005, "grad_norm": 0.10596171766519547, "learning_rate": 5.947711111645703e-07, "loss": 0.0001, "num_input_tokens_seen": 92344016, "step": 137030 }, { "epoch": 3.3477878484352477, "grad_norm": 0.021134018898010254, "learning_rate": 5.946931505550746e-07, "loss": 0.0, "num_input_tokens_seen": 92347408, "step": 137035 }, { "epoch": 3.347909999267095, "grad_norm": 0.011571722105145454, "learning_rate": 5.946151928930792e-07, "loss": 0.0, "num_input_tokens_seen": 92351120, "step": 137040 }, { "epoch": 3.348032150098942, "grad_norm": 0.07473000884056091, "learning_rate": 5.945372381791513e-07, "loss": 0.0, "num_input_tokens_seen": 92355216, "step": 137045 }, { "epoch": 3.3481543009307893, "grad_norm": 0.00010492785804672167, "learning_rate": 5.944592864138575e-07, "loss": 0.0, "num_input_tokens_seen": 92358288, "step": 137050 }, { "epoch": 3.3482764517626364, "grad_norm": 0.0009476658888161182, "learning_rate": 5.943813375977647e-07, "loss": 0.0, "num_input_tokens_seen": 92361296, "step": 137055 }, { "epoch": 3.3483986025944836, "grad_norm": 0.0033659671898931265, "learning_rate": 5.943033917314404e-07, "loss": 0.0, "num_input_tokens_seen": 92364560, "step": 137060 }, { "epoch": 3.348520753426331, "grad_norm": 0.00013656694500241429, "learning_rate": 5.942254488154504e-07, "loss": 0.0, "num_input_tokens_seen": 92368080, "step": 137065 }, { "epoch": 3.348642904258178, "grad_norm": 0.002999598393216729, "learning_rate": 5.941475088503627e-07, "loss": 0.0, "num_input_tokens_seen": 92371408, "step": 137070 }, { "epoch": 3.348765055090025, "grad_norm": 0.0001747731730574742, "learning_rate": 5.940695718367428e-07, "loss": 0.0, "num_input_tokens_seen": 92374416, "step": 137075 }, { "epoch": 3.3488872059218724, "grad_norm": 0.0003013430687133223, "learning_rate": 5.939916377751584e-07, "loss": 0.0, "num_input_tokens_seen": 92378256, "step": 137080 }, { "epoch": 3.3490093567537196, "grad_norm": 0.005943240597844124, "learning_rate": 5.939137066661763e-07, "loss": 0.0418, "num_input_tokens_seen": 92381584, "step": 137085 }, { "epoch": 3.3491315075855668, "grad_norm": 0.009656358510255814, "learning_rate": 5.938357785103625e-07, "loss": 0.0, "num_input_tokens_seen": 92384912, "step": 137090 }, { "epoch": 3.349253658417414, "grad_norm": 0.00010450145782670006, "learning_rate": 5.937578533082846e-07, "loss": 0.0, "num_input_tokens_seen": 92388752, "step": 137095 }, { "epoch": 3.349375809249261, "grad_norm": 0.0005017042858526111, "learning_rate": 5.936799310605087e-07, "loss": 0.0, "num_input_tokens_seen": 92391696, "step": 137100 }, { "epoch": 3.3494979600811083, "grad_norm": 0.10877280682325363, "learning_rate": 5.936020117676015e-07, "loss": 0.0001, "num_input_tokens_seen": 92395088, "step": 137105 }, { "epoch": 3.349620110912955, "grad_norm": 0.003869707463309169, "learning_rate": 5.9352409543013e-07, "loss": 0.0, "num_input_tokens_seen": 92398608, "step": 137110 }, { "epoch": 3.3497422617448027, "grad_norm": 0.02804212085902691, "learning_rate": 5.934461820486603e-07, "loss": 0.0, "num_input_tokens_seen": 92402064, "step": 137115 }, { "epoch": 3.3498644125766495, "grad_norm": 0.0017672295216470957, "learning_rate": 5.933682716237596e-07, "loss": 0.0, "num_input_tokens_seen": 92405520, "step": 137120 }, { "epoch": 3.3499865634084967, "grad_norm": 0.6739102005958557, "learning_rate": 5.932903641559939e-07, "loss": 0.0007, "num_input_tokens_seen": 92409104, "step": 137125 }, { "epoch": 3.350108714240344, "grad_norm": 0.00022642611293122172, "learning_rate": 5.932124596459305e-07, "loss": 0.0, "num_input_tokens_seen": 92412432, "step": 137130 }, { "epoch": 3.350230865072191, "grad_norm": 0.0005862355465069413, "learning_rate": 5.93134558094135e-07, "loss": 0.0, "num_input_tokens_seen": 92416016, "step": 137135 }, { "epoch": 3.3503530159040382, "grad_norm": 0.0008308440446853638, "learning_rate": 5.930566595011749e-07, "loss": 0.0, "num_input_tokens_seen": 92419152, "step": 137140 }, { "epoch": 3.3504751667358854, "grad_norm": 0.030860837548971176, "learning_rate": 5.929787638676158e-07, "loss": 0.0001, "num_input_tokens_seen": 92422224, "step": 137145 }, { "epoch": 3.3505973175677326, "grad_norm": 0.0021630236878991127, "learning_rate": 5.929008711940249e-07, "loss": 0.0001, "num_input_tokens_seen": 92425872, "step": 137150 }, { "epoch": 3.35071946839958, "grad_norm": 0.13139772415161133, "learning_rate": 5.928229814809684e-07, "loss": 0.0001, "num_input_tokens_seen": 92429584, "step": 137155 }, { "epoch": 3.350841619231427, "grad_norm": 0.02592483162879944, "learning_rate": 5.927450947290125e-07, "loss": 0.0001, "num_input_tokens_seen": 92432592, "step": 137160 }, { "epoch": 3.350963770063274, "grad_norm": 447.85107421875, "learning_rate": 5.926672109387241e-07, "loss": 0.0204, "num_input_tokens_seen": 92436304, "step": 137165 }, { "epoch": 3.3510859208951214, "grad_norm": 0.0019791650120168924, "learning_rate": 5.925893301106688e-07, "loss": 0.0, "num_input_tokens_seen": 92439760, "step": 137170 }, { "epoch": 3.3512080717269686, "grad_norm": 8.97948193596676e-05, "learning_rate": 5.925114522454136e-07, "loss": 0.0001, "num_input_tokens_seen": 92442896, "step": 137175 }, { "epoch": 3.3513302225588157, "grad_norm": 0.0074377055279910564, "learning_rate": 5.924335773435251e-07, "loss": 0.0, "num_input_tokens_seen": 92445712, "step": 137180 }, { "epoch": 3.351452373390663, "grad_norm": 0.0001644604344619438, "learning_rate": 5.923557054055688e-07, "loss": 0.0, "num_input_tokens_seen": 92448848, "step": 137185 }, { "epoch": 3.35157452422251, "grad_norm": 0.0013965462567284703, "learning_rate": 5.922778364321119e-07, "loss": 0.0, "num_input_tokens_seen": 92452048, "step": 137190 }, { "epoch": 3.3516966750543573, "grad_norm": 0.0002730031847022474, "learning_rate": 5.921999704237197e-07, "loss": 0.0, "num_input_tokens_seen": 92455056, "step": 137195 }, { "epoch": 3.3518188258862045, "grad_norm": 0.004419253673404455, "learning_rate": 5.921221073809596e-07, "loss": 0.0, "num_input_tokens_seen": 92458704, "step": 137200 }, { "epoch": 3.3519409767180512, "grad_norm": 0.025197016075253487, "learning_rate": 5.92044247304397e-07, "loss": 0.0004, "num_input_tokens_seen": 92461712, "step": 137205 }, { "epoch": 3.3520631275498984, "grad_norm": 5.4128915508044884e-05, "learning_rate": 5.919663901945982e-07, "loss": 0.0003, "num_input_tokens_seen": 92464720, "step": 137210 }, { "epoch": 3.3521852783817456, "grad_norm": 0.0008574188686907291, "learning_rate": 5.918885360521297e-07, "loss": 0.0, "num_input_tokens_seen": 92467728, "step": 137215 }, { "epoch": 3.352307429213593, "grad_norm": 0.0039085992611944675, "learning_rate": 5.918106848775574e-07, "loss": 0.0, "num_input_tokens_seen": 92470928, "step": 137220 }, { "epoch": 3.35242958004544, "grad_norm": 0.0001256961259059608, "learning_rate": 5.917328366714479e-07, "loss": 0.0001, "num_input_tokens_seen": 92474384, "step": 137225 }, { "epoch": 3.352551730877287, "grad_norm": 0.0015291773015633225, "learning_rate": 5.916549914343667e-07, "loss": 0.0, "num_input_tokens_seen": 92477776, "step": 137230 }, { "epoch": 3.3526738817091344, "grad_norm": 70.6291732788086, "learning_rate": 5.915771491668801e-07, "loss": 0.0838, "num_input_tokens_seen": 92481296, "step": 137235 }, { "epoch": 3.3527960325409816, "grad_norm": 0.07619927823543549, "learning_rate": 5.914993098695548e-07, "loss": 0.0, "num_input_tokens_seen": 92484560, "step": 137240 }, { "epoch": 3.3529181833728288, "grad_norm": 2.3242442694026977e-05, "learning_rate": 5.914214735429559e-07, "loss": 0.0, "num_input_tokens_seen": 92488592, "step": 137245 }, { "epoch": 3.353040334204676, "grad_norm": 0.002132971538230777, "learning_rate": 5.913436401876505e-07, "loss": 0.0, "num_input_tokens_seen": 92491984, "step": 137250 }, { "epoch": 3.353162485036523, "grad_norm": 7.056258618831635e-05, "learning_rate": 5.912658098042038e-07, "loss": 0.0, "num_input_tokens_seen": 92495376, "step": 137255 }, { "epoch": 3.3532846358683703, "grad_norm": 0.0008625510963611305, "learning_rate": 5.91187982393182e-07, "loss": 0.0, "num_input_tokens_seen": 92498256, "step": 137260 }, { "epoch": 3.3534067867002175, "grad_norm": 0.0012392710195854306, "learning_rate": 5.911101579551511e-07, "loss": 0.0, "num_input_tokens_seen": 92501776, "step": 137265 }, { "epoch": 3.3535289375320647, "grad_norm": 0.0004566275456454605, "learning_rate": 5.910323364906771e-07, "loss": 0.0, "num_input_tokens_seen": 92504784, "step": 137270 }, { "epoch": 3.353651088363912, "grad_norm": 0.0005576558178290725, "learning_rate": 5.909545180003262e-07, "loss": 0.0001, "num_input_tokens_seen": 92507984, "step": 137275 }, { "epoch": 3.353773239195759, "grad_norm": 0.00011330114648444578, "learning_rate": 5.908767024846637e-07, "loss": 0.0001, "num_input_tokens_seen": 92511248, "step": 137280 }, { "epoch": 3.3538953900276063, "grad_norm": 0.0003647850244306028, "learning_rate": 5.907988899442565e-07, "loss": 0.0, "num_input_tokens_seen": 92514512, "step": 137285 }, { "epoch": 3.354017540859453, "grad_norm": 5.280201730784029e-05, "learning_rate": 5.90721080379669e-07, "loss": 0.0, "num_input_tokens_seen": 92517456, "step": 137290 }, { "epoch": 3.3541396916913007, "grad_norm": 0.7083552479743958, "learning_rate": 5.906432737914686e-07, "loss": 0.0001, "num_input_tokens_seen": 92520912, "step": 137295 }, { "epoch": 3.3542618425231474, "grad_norm": 0.009471097961068153, "learning_rate": 5.905654701802198e-07, "loss": 0.0, "num_input_tokens_seen": 92524368, "step": 137300 }, { "epoch": 3.3543839933549946, "grad_norm": 1.3663839126820676e-05, "learning_rate": 5.904876695464894e-07, "loss": 0.0, "num_input_tokens_seen": 92527696, "step": 137305 }, { "epoch": 3.354506144186842, "grad_norm": 0.000423252786276862, "learning_rate": 5.90409871890843e-07, "loss": 0.0761, "num_input_tokens_seen": 92531088, "step": 137310 }, { "epoch": 3.354628295018689, "grad_norm": 0.0005392113816924393, "learning_rate": 5.903320772138458e-07, "loss": 0.0, "num_input_tokens_seen": 92534480, "step": 137315 }, { "epoch": 3.354750445850536, "grad_norm": 0.002171800471842289, "learning_rate": 5.902542855160641e-07, "loss": 0.0, "num_input_tokens_seen": 92537872, "step": 137320 }, { "epoch": 3.3548725966823834, "grad_norm": 0.00034207970020361245, "learning_rate": 5.901764967980634e-07, "loss": 0.0, "num_input_tokens_seen": 92541904, "step": 137325 }, { "epoch": 3.3549947475142305, "grad_norm": 0.0010893391445279121, "learning_rate": 5.900987110604092e-07, "loss": 0.0, "num_input_tokens_seen": 92544976, "step": 137330 }, { "epoch": 3.3551168983460777, "grad_norm": 0.001828734646551311, "learning_rate": 5.900209283036677e-07, "loss": 0.0, "num_input_tokens_seen": 92548240, "step": 137335 }, { "epoch": 3.355239049177925, "grad_norm": 0.0322355292737484, "learning_rate": 5.899431485284041e-07, "loss": 0.0, "num_input_tokens_seen": 92551568, "step": 137340 }, { "epoch": 3.355361200009772, "grad_norm": 0.11325082927942276, "learning_rate": 5.898653717351847e-07, "loss": 0.0003, "num_input_tokens_seen": 92554960, "step": 137345 }, { "epoch": 3.3554833508416193, "grad_norm": 3.2338393793907017e-05, "learning_rate": 5.89787597924574e-07, "loss": 0.0, "num_input_tokens_seen": 92558608, "step": 137350 }, { "epoch": 3.3556055016734665, "grad_norm": 0.00022079696645960212, "learning_rate": 5.897098270971388e-07, "loss": 0.0001, "num_input_tokens_seen": 92561808, "step": 137355 }, { "epoch": 3.3557276525053137, "grad_norm": 0.001337683410383761, "learning_rate": 5.896320592534438e-07, "loss": 0.0, "num_input_tokens_seen": 92564880, "step": 137360 }, { "epoch": 3.355849803337161, "grad_norm": 28.633352279663086, "learning_rate": 5.895542943940546e-07, "loss": 0.081, "num_input_tokens_seen": 92567888, "step": 137365 }, { "epoch": 3.355971954169008, "grad_norm": 0.00016493708244524896, "learning_rate": 5.894765325195374e-07, "loss": 0.0, "num_input_tokens_seen": 92571280, "step": 137370 }, { "epoch": 3.356094105000855, "grad_norm": 0.6020975708961487, "learning_rate": 5.893987736304569e-07, "loss": 0.0, "num_input_tokens_seen": 92574864, "step": 137375 }, { "epoch": 3.3562162558327024, "grad_norm": 0.0003073678817600012, "learning_rate": 5.893210177273793e-07, "loss": 0.0, "num_input_tokens_seen": 92578640, "step": 137380 }, { "epoch": 3.356338406664549, "grad_norm": 0.014538398012518883, "learning_rate": 5.892432648108694e-07, "loss": 0.0, "num_input_tokens_seen": 92581584, "step": 137385 }, { "epoch": 3.3564605574963964, "grad_norm": 9.626116661820561e-05, "learning_rate": 5.891655148814934e-07, "loss": 0.0001, "num_input_tokens_seen": 92585872, "step": 137390 }, { "epoch": 3.3565827083282436, "grad_norm": 0.0016042754286900163, "learning_rate": 5.890877679398158e-07, "loss": 0.0, "num_input_tokens_seen": 92588944, "step": 137395 }, { "epoch": 3.3567048591600908, "grad_norm": 0.0007302347803488374, "learning_rate": 5.890100239864024e-07, "loss": 0.0879, "num_input_tokens_seen": 92592400, "step": 137400 }, { "epoch": 3.356827009991938, "grad_norm": 0.00015298521611839533, "learning_rate": 5.88932283021819e-07, "loss": 0.0, "num_input_tokens_seen": 92595920, "step": 137405 }, { "epoch": 3.356949160823785, "grad_norm": 0.004179175477474928, "learning_rate": 5.888545450466307e-07, "loss": 0.0, "num_input_tokens_seen": 92599504, "step": 137410 }, { "epoch": 3.3570713116556323, "grad_norm": 8.79567232914269e-05, "learning_rate": 5.887768100614026e-07, "loss": 0.0001, "num_input_tokens_seen": 92602704, "step": 137415 }, { "epoch": 3.3571934624874795, "grad_norm": 104.23986053466797, "learning_rate": 5.886990780667e-07, "loss": 0.0007, "num_input_tokens_seen": 92605776, "step": 137420 }, { "epoch": 3.3573156133193267, "grad_norm": 0.005066916812211275, "learning_rate": 5.886213490630883e-07, "loss": 0.0, "num_input_tokens_seen": 92609232, "step": 137425 }, { "epoch": 3.357437764151174, "grad_norm": 20.911270141601562, "learning_rate": 5.885436230511332e-07, "loss": 0.0012, "num_input_tokens_seen": 92612304, "step": 137430 }, { "epoch": 3.357559914983021, "grad_norm": 0.0022453595884144306, "learning_rate": 5.884659000313989e-07, "loss": 0.0, "num_input_tokens_seen": 92615568, "step": 137435 }, { "epoch": 3.3576820658148683, "grad_norm": 0.0018182910280302167, "learning_rate": 5.883881800044519e-07, "loss": 0.0, "num_input_tokens_seen": 92619472, "step": 137440 }, { "epoch": 3.3578042166467155, "grad_norm": 0.015961581841111183, "learning_rate": 5.883104629708563e-07, "loss": 0.0, "num_input_tokens_seen": 92623376, "step": 137445 }, { "epoch": 3.3579263674785627, "grad_norm": 0.00026710316888056695, "learning_rate": 5.882327489311781e-07, "loss": 0.043, "num_input_tokens_seen": 92626768, "step": 137450 }, { "epoch": 3.35804851831041, "grad_norm": 0.00041441788198426366, "learning_rate": 5.881550378859817e-07, "loss": 0.0, "num_input_tokens_seen": 92630224, "step": 137455 }, { "epoch": 3.358170669142257, "grad_norm": 0.00719722593203187, "learning_rate": 5.88077329835833e-07, "loss": 0.0003, "num_input_tokens_seen": 92633552, "step": 137460 }, { "epoch": 3.3582928199741042, "grad_norm": 0.0013678967952728271, "learning_rate": 5.879996247812969e-07, "loss": 0.039, "num_input_tokens_seen": 92636560, "step": 137465 }, { "epoch": 3.358414970805951, "grad_norm": 0.000781642273068428, "learning_rate": 5.879219227229378e-07, "loss": 0.0353, "num_input_tokens_seen": 92639888, "step": 137470 }, { "epoch": 3.358537121637798, "grad_norm": 0.0018686053808778524, "learning_rate": 5.878442236613217e-07, "loss": 0.0, "num_input_tokens_seen": 92643408, "step": 137475 }, { "epoch": 3.3586592724696454, "grad_norm": 0.004237237386405468, "learning_rate": 5.87766527597013e-07, "loss": 0.0001, "num_input_tokens_seen": 92646864, "step": 137480 }, { "epoch": 3.3587814233014925, "grad_norm": 0.0021779872477054596, "learning_rate": 5.876888345305769e-07, "loss": 0.0, "num_input_tokens_seen": 92650320, "step": 137485 }, { "epoch": 3.3589035741333397, "grad_norm": 0.0003856025286950171, "learning_rate": 5.87611144462579e-07, "loss": 0.0, "num_input_tokens_seen": 92653264, "step": 137490 }, { "epoch": 3.359025724965187, "grad_norm": 0.002911294810473919, "learning_rate": 5.875334573935833e-07, "loss": 0.0346, "num_input_tokens_seen": 92656592, "step": 137495 }, { "epoch": 3.359147875797034, "grad_norm": 0.026074426248669624, "learning_rate": 5.874557733241557e-07, "loss": 0.0, "num_input_tokens_seen": 92660240, "step": 137500 }, { "epoch": 3.3592700266288813, "grad_norm": 0.0025548043195158243, "learning_rate": 5.873780922548602e-07, "loss": 0.0, "num_input_tokens_seen": 92663632, "step": 137505 }, { "epoch": 3.3593921774607285, "grad_norm": 0.00036749220453202724, "learning_rate": 5.873004141862626e-07, "loss": 0.0, "num_input_tokens_seen": 92667088, "step": 137510 }, { "epoch": 3.3595143282925757, "grad_norm": 33.46885299682617, "learning_rate": 5.872227391189273e-07, "loss": 0.0619, "num_input_tokens_seen": 92670160, "step": 137515 }, { "epoch": 3.359636479124423, "grad_norm": 54.26581954956055, "learning_rate": 5.871450670534189e-07, "loss": 0.0596, "num_input_tokens_seen": 92673488, "step": 137520 }, { "epoch": 3.35975862995627, "grad_norm": 0.0010688966140151024, "learning_rate": 5.870673979903031e-07, "loss": 0.0, "num_input_tokens_seen": 92676496, "step": 137525 }, { "epoch": 3.3598807807881172, "grad_norm": 0.0011966531164944172, "learning_rate": 5.869897319301438e-07, "loss": 0.0, "num_input_tokens_seen": 92679760, "step": 137530 }, { "epoch": 3.3600029316199644, "grad_norm": 0.001972771715372801, "learning_rate": 5.869120688735067e-07, "loss": 0.0, "num_input_tokens_seen": 92683728, "step": 137535 }, { "epoch": 3.3601250824518116, "grad_norm": 0.0003887968778144568, "learning_rate": 5.868344088209558e-07, "loss": 0.0414, "num_input_tokens_seen": 92686864, "step": 137540 }, { "epoch": 3.360247233283659, "grad_norm": 0.0006644058739766479, "learning_rate": 5.867567517730565e-07, "loss": 0.0477, "num_input_tokens_seen": 92690256, "step": 137545 }, { "epoch": 3.360369384115506, "grad_norm": 0.00020107610907871276, "learning_rate": 5.866790977303729e-07, "loss": 0.0, "num_input_tokens_seen": 92693584, "step": 137550 }, { "epoch": 3.3604915349473528, "grad_norm": 0.008693316951394081, "learning_rate": 5.866014466934701e-07, "loss": 0.0325, "num_input_tokens_seen": 92697104, "step": 137555 }, { "epoch": 3.3606136857792004, "grad_norm": 13.368460655212402, "learning_rate": 5.865237986629132e-07, "loss": 0.065, "num_input_tokens_seen": 92700624, "step": 137560 }, { "epoch": 3.360735836611047, "grad_norm": 0.0017884562257677317, "learning_rate": 5.864461536392662e-07, "loss": 0.0, "num_input_tokens_seen": 92704080, "step": 137565 }, { "epoch": 3.3608579874428943, "grad_norm": 0.002366987755522132, "learning_rate": 5.863685116230939e-07, "loss": 0.0406, "num_input_tokens_seen": 92707024, "step": 137570 }, { "epoch": 3.3609801382747415, "grad_norm": 0.003429947653785348, "learning_rate": 5.862908726149611e-07, "loss": 0.0, "num_input_tokens_seen": 92710224, "step": 137575 }, { "epoch": 3.3611022891065887, "grad_norm": 0.0011162642622366548, "learning_rate": 5.862132366154322e-07, "loss": 0.0, "num_input_tokens_seen": 92713744, "step": 137580 }, { "epoch": 3.361224439938436, "grad_norm": 0.01790856011211872, "learning_rate": 5.861356036250724e-07, "loss": 0.0, "num_input_tokens_seen": 92716816, "step": 137585 }, { "epoch": 3.361346590770283, "grad_norm": 0.003260530298575759, "learning_rate": 5.860579736444453e-07, "loss": 0.0, "num_input_tokens_seen": 92720272, "step": 137590 }, { "epoch": 3.3614687416021303, "grad_norm": 0.537703812122345, "learning_rate": 5.859803466741164e-07, "loss": 0.0007, "num_input_tokens_seen": 92723280, "step": 137595 }, { "epoch": 3.3615908924339775, "grad_norm": 0.003660767339169979, "learning_rate": 5.859027227146493e-07, "loss": 0.0, "num_input_tokens_seen": 92727120, "step": 137600 }, { "epoch": 3.3617130432658247, "grad_norm": 0.25912341475486755, "learning_rate": 5.858251017666095e-07, "loss": 0.0005, "num_input_tokens_seen": 92730576, "step": 137605 }, { "epoch": 3.361835194097672, "grad_norm": 0.0004168798914179206, "learning_rate": 5.857474838305605e-07, "loss": 0.0, "num_input_tokens_seen": 92734416, "step": 137610 }, { "epoch": 3.361957344929519, "grad_norm": 0.005119622685015202, "learning_rate": 5.856698689070674e-07, "loss": 0.0, "num_input_tokens_seen": 92738064, "step": 137615 }, { "epoch": 3.362079495761366, "grad_norm": 0.006182185839861631, "learning_rate": 5.855922569966945e-07, "loss": 0.0, "num_input_tokens_seen": 92741520, "step": 137620 }, { "epoch": 3.3622016465932134, "grad_norm": 0.0035552720073610544, "learning_rate": 5.85514648100006e-07, "loss": 0.0551, "num_input_tokens_seen": 92745168, "step": 137625 }, { "epoch": 3.3623237974250606, "grad_norm": 0.0037220127414911985, "learning_rate": 5.854370422175668e-07, "loss": 0.0, "num_input_tokens_seen": 92748432, "step": 137630 }, { "epoch": 3.362445948256908, "grad_norm": 19.34136199951172, "learning_rate": 5.853594393499406e-07, "loss": 0.039, "num_input_tokens_seen": 92751440, "step": 137635 }, { "epoch": 3.362568099088755, "grad_norm": 0.000591250485740602, "learning_rate": 5.852818394976919e-07, "loss": 0.0, "num_input_tokens_seen": 92754640, "step": 137640 }, { "epoch": 3.362690249920602, "grad_norm": 0.0014338805340230465, "learning_rate": 5.852042426613858e-07, "loss": 0.0542, "num_input_tokens_seen": 92757776, "step": 137645 }, { "epoch": 3.362812400752449, "grad_norm": 0.020083287730813026, "learning_rate": 5.851266488415856e-07, "loss": 0.0, "num_input_tokens_seen": 92760912, "step": 137650 }, { "epoch": 3.362934551584296, "grad_norm": 0.0002926739689428359, "learning_rate": 5.850490580388562e-07, "loss": 0.0, "num_input_tokens_seen": 92764240, "step": 137655 }, { "epoch": 3.3630567024161433, "grad_norm": 0.0002616309793666005, "learning_rate": 5.849714702537615e-07, "loss": 0.0, "num_input_tokens_seen": 92767568, "step": 137660 }, { "epoch": 3.3631788532479905, "grad_norm": 0.0019339133286848664, "learning_rate": 5.848938854868661e-07, "loss": 0.0, "num_input_tokens_seen": 92770704, "step": 137665 }, { "epoch": 3.3633010040798377, "grad_norm": 0.004293452017009258, "learning_rate": 5.848163037387339e-07, "loss": 0.0, "num_input_tokens_seen": 92773776, "step": 137670 }, { "epoch": 3.363423154911685, "grad_norm": 0.012178168632090092, "learning_rate": 5.847387250099292e-07, "loss": 0.0, "num_input_tokens_seen": 92777296, "step": 137675 }, { "epoch": 3.363545305743532, "grad_norm": 0.12999624013900757, "learning_rate": 5.846611493010163e-07, "loss": 0.0475, "num_input_tokens_seen": 92780688, "step": 137680 }, { "epoch": 3.3636674565753792, "grad_norm": 0.008777649141848087, "learning_rate": 5.845835766125589e-07, "loss": 0.0, "num_input_tokens_seen": 92784144, "step": 137685 }, { "epoch": 3.3637896074072264, "grad_norm": 0.0008219497394748032, "learning_rate": 5.84506006945122e-07, "loss": 0.0001, "num_input_tokens_seen": 92787664, "step": 137690 }, { "epoch": 3.3639117582390736, "grad_norm": 0.004280789755284786, "learning_rate": 5.844284402992685e-07, "loss": 0.0, "num_input_tokens_seen": 92790736, "step": 137695 }, { "epoch": 3.364033909070921, "grad_norm": 20.881895065307617, "learning_rate": 5.843508766755638e-07, "loss": 0.0418, "num_input_tokens_seen": 92793808, "step": 137700 }, { "epoch": 3.364156059902768, "grad_norm": 0.0034292656928300858, "learning_rate": 5.842733160745709e-07, "loss": 0.0, "num_input_tokens_seen": 92797712, "step": 137705 }, { "epoch": 3.364278210734615, "grad_norm": 0.0009706453420221806, "learning_rate": 5.841957584968542e-07, "loss": 0.0001, "num_input_tokens_seen": 92801296, "step": 137710 }, { "epoch": 3.3644003615664624, "grad_norm": 0.02814302034676075, "learning_rate": 5.841182039429782e-07, "loss": 0.0001, "num_input_tokens_seen": 92804880, "step": 137715 }, { "epoch": 3.3645225123983096, "grad_norm": 0.001046680612489581, "learning_rate": 5.840406524135061e-07, "loss": 0.0, "num_input_tokens_seen": 92808272, "step": 137720 }, { "epoch": 3.3646446632301568, "grad_norm": 0.0003182195359840989, "learning_rate": 5.839631039090025e-07, "loss": 0.0002, "num_input_tokens_seen": 92811408, "step": 137725 }, { "epoch": 3.364766814062004, "grad_norm": 0.0013458180474117398, "learning_rate": 5.838855584300311e-07, "loss": 0.0, "num_input_tokens_seen": 92815120, "step": 137730 }, { "epoch": 3.3648889648938507, "grad_norm": 0.011692610569298267, "learning_rate": 5.838080159771556e-07, "loss": 0.0, "num_input_tokens_seen": 92818448, "step": 137735 }, { "epoch": 3.3650111157256983, "grad_norm": 0.0017371824942529202, "learning_rate": 5.837304765509405e-07, "loss": 0.0, "num_input_tokens_seen": 92821648, "step": 137740 }, { "epoch": 3.365133266557545, "grad_norm": 0.006624247413128614, "learning_rate": 5.83652940151949e-07, "loss": 0.0, "num_input_tokens_seen": 92824848, "step": 137745 }, { "epoch": 3.3652554173893923, "grad_norm": 0.004637475591152906, "learning_rate": 5.835754067807457e-07, "loss": 0.0, "num_input_tokens_seen": 92827664, "step": 137750 }, { "epoch": 3.3653775682212395, "grad_norm": 0.009000650607049465, "learning_rate": 5.834978764378935e-07, "loss": 0.0, "num_input_tokens_seen": 92831056, "step": 137755 }, { "epoch": 3.3654997190530866, "grad_norm": 0.00272104749456048, "learning_rate": 5.834203491239574e-07, "loss": 0.0, "num_input_tokens_seen": 92834448, "step": 137760 }, { "epoch": 3.365621869884934, "grad_norm": 0.0005968745681457222, "learning_rate": 5.833428248395e-07, "loss": 0.0, "num_input_tokens_seen": 92838032, "step": 137765 }, { "epoch": 3.365744020716781, "grad_norm": 0.10684335976839066, "learning_rate": 5.832653035850856e-07, "loss": 0.005, "num_input_tokens_seen": 92841168, "step": 137770 }, { "epoch": 3.365866171548628, "grad_norm": 0.011952157132327557, "learning_rate": 5.831877853612785e-07, "loss": 0.0, "num_input_tokens_seen": 92844624, "step": 137775 }, { "epoch": 3.3659883223804754, "grad_norm": 0.0029711390379816294, "learning_rate": 5.831102701686416e-07, "loss": 0.0257, "num_input_tokens_seen": 92848080, "step": 137780 }, { "epoch": 3.3661104732123226, "grad_norm": 0.006985391024500132, "learning_rate": 5.830327580077392e-07, "loss": 0.0, "num_input_tokens_seen": 92851088, "step": 137785 }, { "epoch": 3.36623262404417, "grad_norm": 0.00044560953392647207, "learning_rate": 5.829552488791345e-07, "loss": 0.0, "num_input_tokens_seen": 92854800, "step": 137790 }, { "epoch": 3.366354774876017, "grad_norm": 0.003403823124244809, "learning_rate": 5.828777427833917e-07, "loss": 0.0478, "num_input_tokens_seen": 92858000, "step": 137795 }, { "epoch": 3.366476925707864, "grad_norm": 0.003880531992763281, "learning_rate": 5.82800239721074e-07, "loss": 0.0001, "num_input_tokens_seen": 92861776, "step": 137800 }, { "epoch": 3.3665990765397114, "grad_norm": 0.0010169809684157372, "learning_rate": 5.82722739692745e-07, "loss": 0.0676, "num_input_tokens_seen": 92864592, "step": 137805 }, { "epoch": 3.3667212273715585, "grad_norm": 0.001201188424602151, "learning_rate": 5.826452426989688e-07, "loss": 0.0, "num_input_tokens_seen": 92867792, "step": 137810 }, { "epoch": 3.3668433782034057, "grad_norm": 0.000344475352903828, "learning_rate": 5.825677487403082e-07, "loss": 0.0846, "num_input_tokens_seen": 92870928, "step": 137815 }, { "epoch": 3.366965529035253, "grad_norm": 0.00039618040318600833, "learning_rate": 5.824902578173278e-07, "loss": 0.0, "num_input_tokens_seen": 92874576, "step": 137820 }, { "epoch": 3.3670876798671, "grad_norm": 0.0006792770582251251, "learning_rate": 5.824127699305899e-07, "loss": 0.0, "num_input_tokens_seen": 92878672, "step": 137825 }, { "epoch": 3.367209830698947, "grad_norm": 0.012354973703622818, "learning_rate": 5.823352850806587e-07, "loss": 0.0, "num_input_tokens_seen": 92882000, "step": 137830 }, { "epoch": 3.367331981530794, "grad_norm": 0.027326742187142372, "learning_rate": 5.822578032680983e-07, "loss": 0.0, "num_input_tokens_seen": 92885392, "step": 137835 }, { "epoch": 3.3674541323626412, "grad_norm": 0.0008394501637667418, "learning_rate": 5.821803244934708e-07, "loss": 0.0001, "num_input_tokens_seen": 92888976, "step": 137840 }, { "epoch": 3.3675762831944884, "grad_norm": 0.00104551634285599, "learning_rate": 5.821028487573408e-07, "loss": 0.0001, "num_input_tokens_seen": 92892560, "step": 137845 }, { "epoch": 3.3676984340263356, "grad_norm": 0.0046212682500481606, "learning_rate": 5.82025376060271e-07, "loss": 0.0027, "num_input_tokens_seen": 92895696, "step": 137850 }, { "epoch": 3.367820584858183, "grad_norm": 0.20989792048931122, "learning_rate": 5.819479064028254e-07, "loss": 0.0001, "num_input_tokens_seen": 92898960, "step": 137855 }, { "epoch": 3.36794273569003, "grad_norm": 0.00013297107943799347, "learning_rate": 5.818704397855667e-07, "loss": 0.0237, "num_input_tokens_seen": 92902224, "step": 137860 }, { "epoch": 3.368064886521877, "grad_norm": 0.0011130105704069138, "learning_rate": 5.817929762090588e-07, "loss": 0.0, "num_input_tokens_seen": 92905744, "step": 137865 }, { "epoch": 3.3681870373537244, "grad_norm": 0.0032320625614374876, "learning_rate": 5.81715515673865e-07, "loss": 0.0, "num_input_tokens_seen": 92909136, "step": 137870 }, { "epoch": 3.3683091881855716, "grad_norm": 0.0019834605045616627, "learning_rate": 5.816380581805482e-07, "loss": 0.0, "num_input_tokens_seen": 92912528, "step": 137875 }, { "epoch": 3.3684313390174188, "grad_norm": 0.0003283233963884413, "learning_rate": 5.815606037296723e-07, "loss": 0.0, "num_input_tokens_seen": 92915856, "step": 137880 }, { "epoch": 3.368553489849266, "grad_norm": 0.030905848369002342, "learning_rate": 5.814831523217998e-07, "loss": 0.0001, "num_input_tokens_seen": 92919184, "step": 137885 }, { "epoch": 3.368675640681113, "grad_norm": 0.0006857877597212791, "learning_rate": 5.814057039574944e-07, "loss": 0.0, "num_input_tokens_seen": 92922064, "step": 137890 }, { "epoch": 3.3687977915129603, "grad_norm": 0.8784730434417725, "learning_rate": 5.813282586373198e-07, "loss": 0.0002, "num_input_tokens_seen": 92925392, "step": 137895 }, { "epoch": 3.3689199423448075, "grad_norm": 0.010333950631320477, "learning_rate": 5.812508163618389e-07, "loss": 0.0, "num_input_tokens_seen": 92928848, "step": 137900 }, { "epoch": 3.3690420931766547, "grad_norm": 0.0007572348113171756, "learning_rate": 5.811733771316139e-07, "loss": 0.0, "num_input_tokens_seen": 92931984, "step": 137905 }, { "epoch": 3.369164244008502, "grad_norm": 0.006056161597371101, "learning_rate": 5.810959409472093e-07, "loss": 0.0, "num_input_tokens_seen": 92935440, "step": 137910 }, { "epoch": 3.3692863948403486, "grad_norm": 0.0007624893332831562, "learning_rate": 5.810185078091879e-07, "loss": 0.0, "num_input_tokens_seen": 92939088, "step": 137915 }, { "epoch": 3.3694085456721963, "grad_norm": 0.00043983652722090483, "learning_rate": 5.809410777181118e-07, "loss": 0.0, "num_input_tokens_seen": 92942544, "step": 137920 }, { "epoch": 3.369530696504043, "grad_norm": 0.3516944646835327, "learning_rate": 5.808636506745453e-07, "loss": 0.0001, "num_input_tokens_seen": 92945360, "step": 137925 }, { "epoch": 3.36965284733589, "grad_norm": 0.03775249049067497, "learning_rate": 5.807862266790512e-07, "loss": 0.0, "num_input_tokens_seen": 92948944, "step": 137930 }, { "epoch": 3.3697749981677374, "grad_norm": 0.00020264858903829008, "learning_rate": 5.807088057321921e-07, "loss": 0.068, "num_input_tokens_seen": 92952400, "step": 137935 }, { "epoch": 3.3698971489995846, "grad_norm": 0.003877782030031085, "learning_rate": 5.806313878345317e-07, "loss": 0.0, "num_input_tokens_seen": 92955664, "step": 137940 }, { "epoch": 3.370019299831432, "grad_norm": 0.1308281123638153, "learning_rate": 5.805539729866322e-07, "loss": 0.0001, "num_input_tokens_seen": 92959376, "step": 137945 }, { "epoch": 3.370141450663279, "grad_norm": 0.009636446833610535, "learning_rate": 5.804765611890576e-07, "loss": 0.0, "num_input_tokens_seen": 92962768, "step": 137950 }, { "epoch": 3.370263601495126, "grad_norm": 0.003599069779738784, "learning_rate": 5.803991524423698e-07, "loss": 0.0, "num_input_tokens_seen": 92965776, "step": 137955 }, { "epoch": 3.3703857523269733, "grad_norm": 21.893583297729492, "learning_rate": 5.803217467471322e-07, "loss": 0.1704, "num_input_tokens_seen": 92968848, "step": 137960 }, { "epoch": 3.3705079031588205, "grad_norm": 0.07198493182659149, "learning_rate": 5.802443441039082e-07, "loss": 0.0, "num_input_tokens_seen": 92971728, "step": 137965 }, { "epoch": 3.3706300539906677, "grad_norm": 0.003822456346824765, "learning_rate": 5.801669445132597e-07, "loss": 0.0007, "num_input_tokens_seen": 92975440, "step": 137970 }, { "epoch": 3.370752204822515, "grad_norm": 0.0025848846416920424, "learning_rate": 5.800895479757506e-07, "loss": 0.0, "num_input_tokens_seen": 92978832, "step": 137975 }, { "epoch": 3.370874355654362, "grad_norm": 0.0029444515239447355, "learning_rate": 5.800121544919429e-07, "loss": 0.0, "num_input_tokens_seen": 92982288, "step": 137980 }, { "epoch": 3.3709965064862093, "grad_norm": 0.006925372406840324, "learning_rate": 5.799347640623997e-07, "loss": 0.0001, "num_input_tokens_seen": 92985744, "step": 137985 }, { "epoch": 3.3711186573180565, "grad_norm": 0.0038767820224165916, "learning_rate": 5.798573766876841e-07, "loss": 0.0, "num_input_tokens_seen": 92989328, "step": 137990 }, { "epoch": 3.3712408081499037, "grad_norm": 0.06253549456596375, "learning_rate": 5.797799923683586e-07, "loss": 0.0, "num_input_tokens_seen": 92992720, "step": 137995 }, { "epoch": 3.3713629589817504, "grad_norm": 0.00161257095169276, "learning_rate": 5.797026111049863e-07, "loss": 0.0, "num_input_tokens_seen": 92995984, "step": 138000 }, { "epoch": 3.371485109813598, "grad_norm": 0.0027689035050570965, "learning_rate": 5.796252328981295e-07, "loss": 0.0, "num_input_tokens_seen": 93000208, "step": 138005 }, { "epoch": 3.371607260645445, "grad_norm": 0.00044218875700607896, "learning_rate": 5.795478577483508e-07, "loss": 0.0003, "num_input_tokens_seen": 93003920, "step": 138010 }, { "epoch": 3.371729411477292, "grad_norm": 0.004747296683490276, "learning_rate": 5.794704856562136e-07, "loss": 0.0096, "num_input_tokens_seen": 93007504, "step": 138015 }, { "epoch": 3.371851562309139, "grad_norm": 0.00363075640052557, "learning_rate": 5.793931166222798e-07, "loss": 0.0002, "num_input_tokens_seen": 93010896, "step": 138020 }, { "epoch": 3.3719737131409864, "grad_norm": 0.00033181399339810014, "learning_rate": 5.793157506471127e-07, "loss": 0.0, "num_input_tokens_seen": 93014160, "step": 138025 }, { "epoch": 3.3720958639728336, "grad_norm": 5.022310506319627e-05, "learning_rate": 5.79238387731274e-07, "loss": 0.0, "num_input_tokens_seen": 93017424, "step": 138030 }, { "epoch": 3.3722180148046808, "grad_norm": 0.0004518816713243723, "learning_rate": 5.791610278753276e-07, "loss": 0.0, "num_input_tokens_seen": 93021456, "step": 138035 }, { "epoch": 3.372340165636528, "grad_norm": 0.08886852115392685, "learning_rate": 5.79083671079835e-07, "loss": 0.0334, "num_input_tokens_seen": 93024464, "step": 138040 }, { "epoch": 3.372462316468375, "grad_norm": 0.0003890712687280029, "learning_rate": 5.79006317345359e-07, "loss": 0.0, "num_input_tokens_seen": 93027600, "step": 138045 }, { "epoch": 3.3725844673002223, "grad_norm": 0.1624067723751068, "learning_rate": 5.789289666724629e-07, "loss": 0.0, "num_input_tokens_seen": 93031248, "step": 138050 }, { "epoch": 3.3727066181320695, "grad_norm": 0.0007669181213714182, "learning_rate": 5.78851619061708e-07, "loss": 0.0, "num_input_tokens_seen": 93034576, "step": 138055 }, { "epoch": 3.3728287689639167, "grad_norm": 0.009815610013902187, "learning_rate": 5.787742745136579e-07, "loss": 0.0, "num_input_tokens_seen": 93037648, "step": 138060 }, { "epoch": 3.372950919795764, "grad_norm": 0.00123111205175519, "learning_rate": 5.786969330288741e-07, "loss": 0.0, "num_input_tokens_seen": 93040720, "step": 138065 }, { "epoch": 3.373073070627611, "grad_norm": 0.004505062475800514, "learning_rate": 5.7861959460792e-07, "loss": 0.0726, "num_input_tokens_seen": 93043984, "step": 138070 }, { "epoch": 3.3731952214594583, "grad_norm": 0.005152316763997078, "learning_rate": 5.785422592513572e-07, "loss": 0.0, "num_input_tokens_seen": 93046992, "step": 138075 }, { "epoch": 3.3733173722913055, "grad_norm": 0.0056071956641972065, "learning_rate": 5.784649269597482e-07, "loss": 0.0, "num_input_tokens_seen": 93050192, "step": 138080 }, { "epoch": 3.3734395231231527, "grad_norm": 0.011997929774224758, "learning_rate": 5.783875977336563e-07, "loss": 0.0, "num_input_tokens_seen": 93053456, "step": 138085 }, { "epoch": 3.373561673955, "grad_norm": 0.0025390381924808025, "learning_rate": 5.783102715736426e-07, "loss": 0.0377, "num_input_tokens_seen": 93057040, "step": 138090 }, { "epoch": 3.3736838247868466, "grad_norm": 0.007399492897093296, "learning_rate": 5.782329484802706e-07, "loss": 0.0, "num_input_tokens_seen": 93060240, "step": 138095 }, { "epoch": 3.3738059756186938, "grad_norm": 0.09200409799814224, "learning_rate": 5.781556284541015e-07, "loss": 0.0, "num_input_tokens_seen": 93064016, "step": 138100 }, { "epoch": 3.373928126450541, "grad_norm": 0.0009583776700310409, "learning_rate": 5.780783114956986e-07, "loss": 0.0, "num_input_tokens_seen": 93066896, "step": 138105 }, { "epoch": 3.374050277282388, "grad_norm": 21.754066467285156, "learning_rate": 5.780009976056237e-07, "loss": 0.056, "num_input_tokens_seen": 93070480, "step": 138110 }, { "epoch": 3.3741724281142353, "grad_norm": 0.0002358252095291391, "learning_rate": 5.779236867844385e-07, "loss": 0.0, "num_input_tokens_seen": 93074128, "step": 138115 }, { "epoch": 3.3742945789460825, "grad_norm": 0.001320650801062584, "learning_rate": 5.778463790327064e-07, "loss": 0.0, "num_input_tokens_seen": 93077456, "step": 138120 }, { "epoch": 3.3744167297779297, "grad_norm": 0.005088005214929581, "learning_rate": 5.777690743509885e-07, "loss": 0.0, "num_input_tokens_seen": 93080976, "step": 138125 }, { "epoch": 3.374538880609777, "grad_norm": 0.00016588116704951972, "learning_rate": 5.776917727398478e-07, "loss": 0.0, "num_input_tokens_seen": 93084560, "step": 138130 }, { "epoch": 3.374661031441624, "grad_norm": 0.023031558841466904, "learning_rate": 5.776144741998457e-07, "loss": 0.0, "num_input_tokens_seen": 93088208, "step": 138135 }, { "epoch": 3.3747831822734713, "grad_norm": 0.0005032282206229866, "learning_rate": 5.775371787315448e-07, "loss": 0.0, "num_input_tokens_seen": 93091792, "step": 138140 }, { "epoch": 3.3749053331053185, "grad_norm": 0.06005355343222618, "learning_rate": 5.774598863355077e-07, "loss": 0.0001, "num_input_tokens_seen": 93095440, "step": 138145 }, { "epoch": 3.3750274839371657, "grad_norm": 0.0024213064461946487, "learning_rate": 5.773825970122954e-07, "loss": 0.0454, "num_input_tokens_seen": 93098768, "step": 138150 }, { "epoch": 3.375149634769013, "grad_norm": 0.0022115923929959536, "learning_rate": 5.773053107624711e-07, "loss": 0.0569, "num_input_tokens_seen": 93102096, "step": 138155 }, { "epoch": 3.37527178560086, "grad_norm": 0.0012570025864988565, "learning_rate": 5.772280275865955e-07, "loss": 0.0, "num_input_tokens_seen": 93105616, "step": 138160 }, { "epoch": 3.3753939364327072, "grad_norm": 0.0030597366858273745, "learning_rate": 5.771507474852322e-07, "loss": 0.0, "num_input_tokens_seen": 93109200, "step": 138165 }, { "epoch": 3.3755160872645544, "grad_norm": 0.0005051979096606374, "learning_rate": 5.770734704589417e-07, "loss": 0.0643, "num_input_tokens_seen": 93112208, "step": 138170 }, { "epoch": 3.3756382380964016, "grad_norm": 0.00468916492536664, "learning_rate": 5.769961965082868e-07, "loss": 0.0, "num_input_tokens_seen": 93115856, "step": 138175 }, { "epoch": 3.3757603889282484, "grad_norm": 0.019855257123708725, "learning_rate": 5.769189256338299e-07, "loss": 0.0, "num_input_tokens_seen": 93119056, "step": 138180 }, { "epoch": 3.375882539760096, "grad_norm": 0.014095536433160305, "learning_rate": 5.768416578361317e-07, "loss": 0.0, "num_input_tokens_seen": 93122704, "step": 138185 }, { "epoch": 3.3760046905919427, "grad_norm": 0.045923784375190735, "learning_rate": 5.767643931157552e-07, "loss": 0.0, "num_input_tokens_seen": 93125904, "step": 138190 }, { "epoch": 3.37612684142379, "grad_norm": 0.20710352063179016, "learning_rate": 5.766871314732616e-07, "loss": 0.0, "num_input_tokens_seen": 93129168, "step": 138195 }, { "epoch": 3.376248992255637, "grad_norm": 0.0003032416570931673, "learning_rate": 5.76609872909213e-07, "loss": 0.0464, "num_input_tokens_seen": 93132496, "step": 138200 }, { "epoch": 3.3763711430874843, "grad_norm": 0.0032075492199510336, "learning_rate": 5.765326174241716e-07, "loss": 0.0588, "num_input_tokens_seen": 93135888, "step": 138205 }, { "epoch": 3.3764932939193315, "grad_norm": 0.02502131648361683, "learning_rate": 5.76455365018699e-07, "loss": 0.0, "num_input_tokens_seen": 93139088, "step": 138210 }, { "epoch": 3.3766154447511787, "grad_norm": 0.0029747423250228167, "learning_rate": 5.763781156933565e-07, "loss": 0.0, "num_input_tokens_seen": 93142864, "step": 138215 }, { "epoch": 3.376737595583026, "grad_norm": 0.11040252447128296, "learning_rate": 5.763008694487066e-07, "loss": 0.0001, "num_input_tokens_seen": 93146256, "step": 138220 }, { "epoch": 3.376859746414873, "grad_norm": 0.004152897745370865, "learning_rate": 5.762236262853108e-07, "loss": 0.0, "num_input_tokens_seen": 93149584, "step": 138225 }, { "epoch": 3.3769818972467203, "grad_norm": 0.007863405160605907, "learning_rate": 5.761463862037304e-07, "loss": 0.0, "num_input_tokens_seen": 93152976, "step": 138230 }, { "epoch": 3.3771040480785675, "grad_norm": 0.005199017468839884, "learning_rate": 5.760691492045275e-07, "loss": 0.0001, "num_input_tokens_seen": 93156048, "step": 138235 }, { "epoch": 3.3772261989104146, "grad_norm": 0.0022649518214166164, "learning_rate": 5.75991915288264e-07, "loss": 0.0, "num_input_tokens_seen": 93159184, "step": 138240 }, { "epoch": 3.377348349742262, "grad_norm": 0.024286113679409027, "learning_rate": 5.759146844555011e-07, "loss": 0.0, "num_input_tokens_seen": 93162960, "step": 138245 }, { "epoch": 3.377470500574109, "grad_norm": 0.00025870741228573024, "learning_rate": 5.758374567068011e-07, "loss": 0.0, "num_input_tokens_seen": 93166544, "step": 138250 }, { "epoch": 3.377592651405956, "grad_norm": 0.0024284597020596266, "learning_rate": 5.757602320427248e-07, "loss": 0.0013, "num_input_tokens_seen": 93169616, "step": 138255 }, { "epoch": 3.3777148022378034, "grad_norm": 0.00405424740165472, "learning_rate": 5.756830104638345e-07, "loss": 0.0, "num_input_tokens_seen": 93173200, "step": 138260 }, { "epoch": 3.3778369530696506, "grad_norm": 0.004090134520083666, "learning_rate": 5.756057919706912e-07, "loss": 0.0001, "num_input_tokens_seen": 93176272, "step": 138265 }, { "epoch": 3.377959103901498, "grad_norm": 0.07391510158777237, "learning_rate": 5.755285765638565e-07, "loss": 0.0, "num_input_tokens_seen": 93179536, "step": 138270 }, { "epoch": 3.3780812547333445, "grad_norm": 0.01007003989070654, "learning_rate": 5.754513642438928e-07, "loss": 0.0, "num_input_tokens_seen": 93182928, "step": 138275 }, { "epoch": 3.3782034055651917, "grad_norm": 0.01484597846865654, "learning_rate": 5.753741550113605e-07, "loss": 0.0001, "num_input_tokens_seen": 93186128, "step": 138280 }, { "epoch": 3.378325556397039, "grad_norm": 0.0003579423646442592, "learning_rate": 5.752969488668218e-07, "loss": 0.0246, "num_input_tokens_seen": 93189776, "step": 138285 }, { "epoch": 3.378447707228886, "grad_norm": 0.003087420715019107, "learning_rate": 5.752197458108376e-07, "loss": 0.0512, "num_input_tokens_seen": 93193424, "step": 138290 }, { "epoch": 3.3785698580607333, "grad_norm": 0.005263794679194689, "learning_rate": 5.751425458439698e-07, "loss": 0.0, "num_input_tokens_seen": 93196688, "step": 138295 }, { "epoch": 3.3786920088925805, "grad_norm": 0.011670518666505814, "learning_rate": 5.750653489667801e-07, "loss": 0.0001, "num_input_tokens_seen": 93200080, "step": 138300 }, { "epoch": 3.3788141597244277, "grad_norm": 23.023923873901367, "learning_rate": 5.749881551798288e-07, "loss": 0.0213, "num_input_tokens_seen": 93203152, "step": 138305 }, { "epoch": 3.378936310556275, "grad_norm": 0.0015072767855599523, "learning_rate": 5.749109644836786e-07, "loss": 0.0, "num_input_tokens_seen": 93206032, "step": 138310 }, { "epoch": 3.379058461388122, "grad_norm": 0.0019943653605878353, "learning_rate": 5.748337768788901e-07, "loss": 0.0, "num_input_tokens_seen": 93209040, "step": 138315 }, { "epoch": 3.3791806122199692, "grad_norm": 0.03423043712973595, "learning_rate": 5.747565923660244e-07, "loss": 0.0, "num_input_tokens_seen": 93212816, "step": 138320 }, { "epoch": 3.3793027630518164, "grad_norm": 0.007128616329282522, "learning_rate": 5.746794109456434e-07, "loss": 0.0, "num_input_tokens_seen": 93216272, "step": 138325 }, { "epoch": 3.3794249138836636, "grad_norm": 0.0021170915570110083, "learning_rate": 5.746022326183079e-07, "loss": 0.0, "num_input_tokens_seen": 93219664, "step": 138330 }, { "epoch": 3.379547064715511, "grad_norm": 0.0022564528044313192, "learning_rate": 5.745250573845797e-07, "loss": 0.0246, "num_input_tokens_seen": 93222864, "step": 138335 }, { "epoch": 3.379669215547358, "grad_norm": 0.005194546654820442, "learning_rate": 5.744478852450192e-07, "loss": 0.0001, "num_input_tokens_seen": 93226192, "step": 138340 }, { "epoch": 3.379791366379205, "grad_norm": 0.040994592010974884, "learning_rate": 5.743707162001888e-07, "loss": 0.0002, "num_input_tokens_seen": 93229392, "step": 138345 }, { "epoch": 3.3799135172110524, "grad_norm": 0.004443665035068989, "learning_rate": 5.742935502506484e-07, "loss": 0.0, "num_input_tokens_seen": 93232784, "step": 138350 }, { "epoch": 3.3800356680428996, "grad_norm": 0.003939436282962561, "learning_rate": 5.742163873969599e-07, "loss": 0.0001, "num_input_tokens_seen": 93235984, "step": 138355 }, { "epoch": 3.3801578188747463, "grad_norm": 0.0042626261711120605, "learning_rate": 5.741392276396847e-07, "loss": 0.0, "num_input_tokens_seen": 93239248, "step": 138360 }, { "epoch": 3.380279969706594, "grad_norm": 0.0003855399845633656, "learning_rate": 5.740620709793832e-07, "loss": 0.0002, "num_input_tokens_seen": 93242448, "step": 138365 }, { "epoch": 3.3804021205384407, "grad_norm": 0.052950721234083176, "learning_rate": 5.739849174166173e-07, "loss": 0.0, "num_input_tokens_seen": 93245456, "step": 138370 }, { "epoch": 3.380524271370288, "grad_norm": 0.00018201395869255066, "learning_rate": 5.739077669519473e-07, "loss": 0.0, "num_input_tokens_seen": 93248336, "step": 138375 }, { "epoch": 3.380646422202135, "grad_norm": 0.017797749489545822, "learning_rate": 5.738306195859351e-07, "loss": 0.0, "num_input_tokens_seen": 93251472, "step": 138380 }, { "epoch": 3.3807685730339823, "grad_norm": 0.002218404784798622, "learning_rate": 5.737534753191406e-07, "loss": 0.0, "num_input_tokens_seen": 93254672, "step": 138385 }, { "epoch": 3.3808907238658295, "grad_norm": 0.013328985311090946, "learning_rate": 5.736763341521256e-07, "loss": 0.0001, "num_input_tokens_seen": 93258064, "step": 138390 }, { "epoch": 3.3810128746976766, "grad_norm": 37.32630157470703, "learning_rate": 5.735991960854514e-07, "loss": 0.0696, "num_input_tokens_seen": 93261648, "step": 138395 }, { "epoch": 3.381135025529524, "grad_norm": 0.030253412202000618, "learning_rate": 5.735220611196781e-07, "loss": 0.0008, "num_input_tokens_seen": 93264912, "step": 138400 }, { "epoch": 3.381257176361371, "grad_norm": 0.00933878030627966, "learning_rate": 5.734449292553675e-07, "loss": 0.0, "num_input_tokens_seen": 93268368, "step": 138405 }, { "epoch": 3.381379327193218, "grad_norm": 8.085768786258996e-05, "learning_rate": 5.733678004930798e-07, "loss": 0.0, "num_input_tokens_seen": 93271824, "step": 138410 }, { "epoch": 3.3815014780250654, "grad_norm": 0.006211719010025263, "learning_rate": 5.732906748333766e-07, "loss": 0.0, "num_input_tokens_seen": 93275344, "step": 138415 }, { "epoch": 3.3816236288569126, "grad_norm": 0.0006274699117057025, "learning_rate": 5.732135522768182e-07, "loss": 0.0001, "num_input_tokens_seen": 93278992, "step": 138420 }, { "epoch": 3.38174577968876, "grad_norm": 0.0006471577798947692, "learning_rate": 5.731364328239654e-07, "loss": 0.0, "num_input_tokens_seen": 93282000, "step": 138425 }, { "epoch": 3.381867930520607, "grad_norm": 0.030883746221661568, "learning_rate": 5.730593164753795e-07, "loss": 0.1003, "num_input_tokens_seen": 93285584, "step": 138430 }, { "epoch": 3.381990081352454, "grad_norm": 0.008337740786373615, "learning_rate": 5.729822032316208e-07, "loss": 0.0, "num_input_tokens_seen": 93288912, "step": 138435 }, { "epoch": 3.3821122321843013, "grad_norm": 279.927978515625, "learning_rate": 5.729050930932508e-07, "loss": 0.0022, "num_input_tokens_seen": 93292112, "step": 138440 }, { "epoch": 3.382234383016148, "grad_norm": 0.00017066705913748592, "learning_rate": 5.728279860608294e-07, "loss": 0.0, "num_input_tokens_seen": 93295184, "step": 138445 }, { "epoch": 3.3823565338479957, "grad_norm": 0.0031765324529260397, "learning_rate": 5.727508821349178e-07, "loss": 0.0, "num_input_tokens_seen": 93298896, "step": 138450 }, { "epoch": 3.3824786846798425, "grad_norm": 0.007076940033584833, "learning_rate": 5.726737813160771e-07, "loss": 0.0, "num_input_tokens_seen": 93302288, "step": 138455 }, { "epoch": 3.3826008355116897, "grad_norm": 0.022491326555609703, "learning_rate": 5.725966836048671e-07, "loss": 0.0, "num_input_tokens_seen": 93305552, "step": 138460 }, { "epoch": 3.382722986343537, "grad_norm": 0.005003460217267275, "learning_rate": 5.725195890018495e-07, "loss": 0.0, "num_input_tokens_seen": 93308880, "step": 138465 }, { "epoch": 3.382845137175384, "grad_norm": 0.0008077129605226219, "learning_rate": 5.72442497507584e-07, "loss": 0.0001, "num_input_tokens_seen": 93312272, "step": 138470 }, { "epoch": 3.3829672880072312, "grad_norm": 0.0005891541368328035, "learning_rate": 5.72365409122632e-07, "loss": 0.0, "num_input_tokens_seen": 93315408, "step": 138475 }, { "epoch": 3.3830894388390784, "grad_norm": 1.0880800485610962, "learning_rate": 5.722883238475535e-07, "loss": 0.0002, "num_input_tokens_seen": 93319120, "step": 138480 }, { "epoch": 3.3832115896709256, "grad_norm": 0.027302728965878487, "learning_rate": 5.722112416829092e-07, "loss": 0.0, "num_input_tokens_seen": 93322640, "step": 138485 }, { "epoch": 3.383333740502773, "grad_norm": 0.0027143717743456364, "learning_rate": 5.721341626292603e-07, "loss": 0.0, "num_input_tokens_seen": 93326224, "step": 138490 }, { "epoch": 3.38345589133462, "grad_norm": 0.00450406176969409, "learning_rate": 5.720570866871664e-07, "loss": 0.0452, "num_input_tokens_seen": 93330256, "step": 138495 }, { "epoch": 3.383578042166467, "grad_norm": 0.0004445763770490885, "learning_rate": 5.719800138571889e-07, "loss": 0.0, "num_input_tokens_seen": 93333520, "step": 138500 }, { "epoch": 3.3837001929983144, "grad_norm": 0.007731878198683262, "learning_rate": 5.719029441398875e-07, "loss": 0.0, "num_input_tokens_seen": 93336848, "step": 138505 }, { "epoch": 3.3838223438301616, "grad_norm": 0.03202767297625542, "learning_rate": 5.718258775358229e-07, "loss": 0.0001, "num_input_tokens_seen": 93340048, "step": 138510 }, { "epoch": 3.3839444946620088, "grad_norm": 0.0012034822721034288, "learning_rate": 5.717488140455562e-07, "loss": 0.0, "num_input_tokens_seen": 93343504, "step": 138515 }, { "epoch": 3.384066645493856, "grad_norm": 0.0018715094774961472, "learning_rate": 5.716717536696473e-07, "loss": 0.0, "num_input_tokens_seen": 93347024, "step": 138520 }, { "epoch": 3.384188796325703, "grad_norm": 0.001975530991330743, "learning_rate": 5.715946964086562e-07, "loss": 0.0638, "num_input_tokens_seen": 93350224, "step": 138525 }, { "epoch": 3.3843109471575503, "grad_norm": 0.002123733516782522, "learning_rate": 5.71517642263144e-07, "loss": 0.0, "num_input_tokens_seen": 93354192, "step": 138530 }, { "epoch": 3.3844330979893975, "grad_norm": 0.0031818312127143145, "learning_rate": 5.714405912336708e-07, "loss": 0.0, "num_input_tokens_seen": 93357712, "step": 138535 }, { "epoch": 3.3845552488212443, "grad_norm": 0.2843506634235382, "learning_rate": 5.713635433207966e-07, "loss": 0.0489, "num_input_tokens_seen": 93361040, "step": 138540 }, { "epoch": 3.3846773996530914, "grad_norm": 0.0008113362709991634, "learning_rate": 5.71286498525082e-07, "loss": 0.0, "num_input_tokens_seen": 93364560, "step": 138545 }, { "epoch": 3.3847995504849386, "grad_norm": 0.010063768364489079, "learning_rate": 5.712094568470875e-07, "loss": 0.0, "num_input_tokens_seen": 93368336, "step": 138550 }, { "epoch": 3.384921701316786, "grad_norm": 0.006297845859080553, "learning_rate": 5.711324182873729e-07, "loss": 0.0, "num_input_tokens_seen": 93371472, "step": 138555 }, { "epoch": 3.385043852148633, "grad_norm": 96.34769439697266, "learning_rate": 5.710553828464993e-07, "loss": 0.0546, "num_input_tokens_seen": 93374480, "step": 138560 }, { "epoch": 3.38516600298048, "grad_norm": 0.003234319156035781, "learning_rate": 5.709783505250256e-07, "loss": 0.0, "num_input_tokens_seen": 93378000, "step": 138565 }, { "epoch": 3.3852881538123274, "grad_norm": 0.007625074591487646, "learning_rate": 5.709013213235133e-07, "loss": 0.0, "num_input_tokens_seen": 93381200, "step": 138570 }, { "epoch": 3.3854103046441746, "grad_norm": 0.020981401205062866, "learning_rate": 5.708242952425216e-07, "loss": 0.0, "num_input_tokens_seen": 93384592, "step": 138575 }, { "epoch": 3.3855324554760218, "grad_norm": 0.0004391383845359087, "learning_rate": 5.707472722826109e-07, "loss": 0.0, "num_input_tokens_seen": 93389264, "step": 138580 }, { "epoch": 3.385654606307869, "grad_norm": 0.03177093714475632, "learning_rate": 5.706702524443419e-07, "loss": 0.0, "num_input_tokens_seen": 93392784, "step": 138585 }, { "epoch": 3.385776757139716, "grad_norm": 0.00039413681952282786, "learning_rate": 5.705932357282741e-07, "loss": 0.0001, "num_input_tokens_seen": 93396176, "step": 138590 }, { "epoch": 3.3858989079715633, "grad_norm": 0.00042011006735265255, "learning_rate": 5.705162221349681e-07, "loss": 0.0, "num_input_tokens_seen": 93399376, "step": 138595 }, { "epoch": 3.3860210588034105, "grad_norm": 0.00041277159471064806, "learning_rate": 5.704392116649832e-07, "loss": 0.0, "num_input_tokens_seen": 93403216, "step": 138600 }, { "epoch": 3.3861432096352577, "grad_norm": 0.002446555532515049, "learning_rate": 5.703622043188799e-07, "loss": 0.0684, "num_input_tokens_seen": 93406416, "step": 138605 }, { "epoch": 3.386265360467105, "grad_norm": 0.0015684259124100208, "learning_rate": 5.702852000972187e-07, "loss": 0.0371, "num_input_tokens_seen": 93409360, "step": 138610 }, { "epoch": 3.386387511298952, "grad_norm": 0.005386325065046549, "learning_rate": 5.702081990005587e-07, "loss": 0.0001, "num_input_tokens_seen": 93412240, "step": 138615 }, { "epoch": 3.3865096621307993, "grad_norm": 0.004681466147303581, "learning_rate": 5.701312010294606e-07, "loss": 0.0, "num_input_tokens_seen": 93415760, "step": 138620 }, { "epoch": 3.386631812962646, "grad_norm": 0.4638597369194031, "learning_rate": 5.700542061844839e-07, "loss": 0.0003, "num_input_tokens_seen": 93418768, "step": 138625 }, { "epoch": 3.3867539637944937, "grad_norm": 0.00034154325840063393, "learning_rate": 5.699772144661885e-07, "loss": 0.0002, "num_input_tokens_seen": 93422096, "step": 138630 }, { "epoch": 3.3868761146263404, "grad_norm": 1.7362202405929565, "learning_rate": 5.699002258751348e-07, "loss": 0.0419, "num_input_tokens_seen": 93425488, "step": 138635 }, { "epoch": 3.3869982654581876, "grad_norm": 0.0004423426289577037, "learning_rate": 5.698232404118819e-07, "loss": 0.0, "num_input_tokens_seen": 93429136, "step": 138640 }, { "epoch": 3.387120416290035, "grad_norm": 0.004544767551124096, "learning_rate": 5.697462580769905e-07, "loss": 0.0, "num_input_tokens_seen": 93432464, "step": 138645 }, { "epoch": 3.387242567121882, "grad_norm": 0.001746030175127089, "learning_rate": 5.696692788710196e-07, "loss": 0.0001, "num_input_tokens_seen": 93435792, "step": 138650 }, { "epoch": 3.387364717953729, "grad_norm": 0.00814051739871502, "learning_rate": 5.6959230279453e-07, "loss": 0.0, "num_input_tokens_seen": 93439568, "step": 138655 }, { "epoch": 3.3874868687855764, "grad_norm": 0.0016798563301563263, "learning_rate": 5.695153298480803e-07, "loss": 0.0, "num_input_tokens_seen": 93443152, "step": 138660 }, { "epoch": 3.3876090196174236, "grad_norm": 25.824024200439453, "learning_rate": 5.694383600322314e-07, "loss": 0.0431, "num_input_tokens_seen": 93446736, "step": 138665 }, { "epoch": 3.3877311704492707, "grad_norm": 0.0016559103969484568, "learning_rate": 5.693613933475423e-07, "loss": 0.0, "num_input_tokens_seen": 93449872, "step": 138670 }, { "epoch": 3.387853321281118, "grad_norm": 0.02711871638894081, "learning_rate": 5.692844297945728e-07, "loss": 0.0155, "num_input_tokens_seen": 93453200, "step": 138675 }, { "epoch": 3.387975472112965, "grad_norm": 0.0018949408549815416, "learning_rate": 5.692074693738833e-07, "loss": 0.0, "num_input_tokens_seen": 93456464, "step": 138680 }, { "epoch": 3.3880976229448123, "grad_norm": 0.0003768306924030185, "learning_rate": 5.691305120860323e-07, "loss": 0.1295, "num_input_tokens_seen": 93459664, "step": 138685 }, { "epoch": 3.3882197737766595, "grad_norm": 0.018024956807494164, "learning_rate": 5.690535579315809e-07, "loss": 0.0, "num_input_tokens_seen": 93463056, "step": 138690 }, { "epoch": 3.3883419246085067, "grad_norm": 0.004028045106679201, "learning_rate": 5.689766069110873e-07, "loss": 0.0, "num_input_tokens_seen": 93466192, "step": 138695 }, { "epoch": 3.388464075440354, "grad_norm": 0.004355765879154205, "learning_rate": 5.688996590251118e-07, "loss": 0.0, "num_input_tokens_seen": 93469392, "step": 138700 }, { "epoch": 3.388586226272201, "grad_norm": 0.0004680180863942951, "learning_rate": 5.688227142742143e-07, "loss": 0.0, "num_input_tokens_seen": 93472592, "step": 138705 }, { "epoch": 3.3887083771040483, "grad_norm": 0.01364127453416586, "learning_rate": 5.687457726589535e-07, "loss": 0.0, "num_input_tokens_seen": 93475792, "step": 138710 }, { "epoch": 3.3888305279358955, "grad_norm": 0.0096084950491786, "learning_rate": 5.6866883417989e-07, "loss": 0.0001, "num_input_tokens_seen": 93478864, "step": 138715 }, { "epoch": 3.388952678767742, "grad_norm": 7.81436829129234e-05, "learning_rate": 5.685918988375823e-07, "loss": 0.0722, "num_input_tokens_seen": 93482640, "step": 138720 }, { "epoch": 3.3890748295995894, "grad_norm": 0.0009174890001304448, "learning_rate": 5.685149666325907e-07, "loss": 0.0001, "num_input_tokens_seen": 93486160, "step": 138725 }, { "epoch": 3.3891969804314366, "grad_norm": 0.5889875888824463, "learning_rate": 5.684380375654744e-07, "loss": 0.0002, "num_input_tokens_seen": 93489488, "step": 138730 }, { "epoch": 3.3893191312632838, "grad_norm": 0.00953624863177538, "learning_rate": 5.683611116367924e-07, "loss": 0.0, "num_input_tokens_seen": 93492880, "step": 138735 }, { "epoch": 3.389441282095131, "grad_norm": 0.0017142931465059519, "learning_rate": 5.682841888471047e-07, "loss": 0.0005, "num_input_tokens_seen": 93496144, "step": 138740 }, { "epoch": 3.389563432926978, "grad_norm": 0.001251032343134284, "learning_rate": 5.682072691969701e-07, "loss": 0.0, "num_input_tokens_seen": 93499408, "step": 138745 }, { "epoch": 3.3896855837588253, "grad_norm": 0.00032911982270888984, "learning_rate": 5.68130352686949e-07, "loss": 0.0, "num_input_tokens_seen": 93503056, "step": 138750 }, { "epoch": 3.3898077345906725, "grad_norm": 0.0022663853596895933, "learning_rate": 5.680534393175997e-07, "loss": 0.0, "num_input_tokens_seen": 93506384, "step": 138755 }, { "epoch": 3.3899298854225197, "grad_norm": 9.262767707696185e-05, "learning_rate": 5.679765290894818e-07, "loss": 0.0, "num_input_tokens_seen": 93509776, "step": 138760 }, { "epoch": 3.390052036254367, "grad_norm": 0.007468585856258869, "learning_rate": 5.678996220031553e-07, "loss": 0.0, "num_input_tokens_seen": 93513296, "step": 138765 }, { "epoch": 3.390174187086214, "grad_norm": 0.023306336253881454, "learning_rate": 5.678227180591786e-07, "loss": 0.0, "num_input_tokens_seen": 93516624, "step": 138770 }, { "epoch": 3.3902963379180613, "grad_norm": 0.0007523863459937274, "learning_rate": 5.677458172581115e-07, "loss": 0.0, "num_input_tokens_seen": 93520144, "step": 138775 }, { "epoch": 3.3904184887499085, "grad_norm": 0.01631038449704647, "learning_rate": 5.676689196005129e-07, "loss": 0.0, "num_input_tokens_seen": 93523408, "step": 138780 }, { "epoch": 3.3905406395817557, "grad_norm": 0.0019541881047189236, "learning_rate": 5.675920250869426e-07, "loss": 0.0, "num_input_tokens_seen": 93526928, "step": 138785 }, { "epoch": 3.390662790413603, "grad_norm": 0.0017687291838228703, "learning_rate": 5.67515133717959e-07, "loss": 0.0, "num_input_tokens_seen": 93530192, "step": 138790 }, { "epoch": 3.39078494124545, "grad_norm": 0.0010739255230873823, "learning_rate": 5.674382454941215e-07, "loss": 0.0001, "num_input_tokens_seen": 93533584, "step": 138795 }, { "epoch": 3.3909070920772972, "grad_norm": 0.0009468430071137846, "learning_rate": 5.6736136041599e-07, "loss": 0.0, "num_input_tokens_seen": 93536848, "step": 138800 }, { "epoch": 3.391029242909144, "grad_norm": 0.001576863694936037, "learning_rate": 5.672844784841226e-07, "loss": 0.0, "num_input_tokens_seen": 93540240, "step": 138805 }, { "epoch": 3.3911513937409916, "grad_norm": 0.0003011640510521829, "learning_rate": 5.672075996990792e-07, "loss": 0.0, "num_input_tokens_seen": 93543760, "step": 138810 }, { "epoch": 3.3912735445728384, "grad_norm": 0.0003061066963709891, "learning_rate": 5.671307240614183e-07, "loss": 0.0, "num_input_tokens_seen": 93546704, "step": 138815 }, { "epoch": 3.3913956954046856, "grad_norm": 0.0020857774652540684, "learning_rate": 5.670538515716996e-07, "loss": 0.0, "num_input_tokens_seen": 93549904, "step": 138820 }, { "epoch": 3.3915178462365327, "grad_norm": 714.5460815429688, "learning_rate": 5.669769822304812e-07, "loss": 0.0227, "num_input_tokens_seen": 93553552, "step": 138825 }, { "epoch": 3.39163999706838, "grad_norm": 0.0003291108296252787, "learning_rate": 5.669001160383231e-07, "loss": 0.0, "num_input_tokens_seen": 93557072, "step": 138830 }, { "epoch": 3.391762147900227, "grad_norm": 1.0141626596450806, "learning_rate": 5.668232529957835e-07, "loss": 0.0002, "num_input_tokens_seen": 93560208, "step": 138835 }, { "epoch": 3.3918842987320743, "grad_norm": 0.016728295013308525, "learning_rate": 5.667463931034219e-07, "loss": 0.0, "num_input_tokens_seen": 93563344, "step": 138840 }, { "epoch": 3.3920064495639215, "grad_norm": 0.40863490104675293, "learning_rate": 5.666695363617972e-07, "loss": 0.0002, "num_input_tokens_seen": 93566480, "step": 138845 }, { "epoch": 3.3921286003957687, "grad_norm": 0.0022530490532517433, "learning_rate": 5.66592682771468e-07, "loss": 0.0001, "num_input_tokens_seen": 93569808, "step": 138850 }, { "epoch": 3.392250751227616, "grad_norm": 0.0001802055921871215, "learning_rate": 5.66515832332993e-07, "loss": 0.0001, "num_input_tokens_seen": 93573264, "step": 138855 }, { "epoch": 3.392372902059463, "grad_norm": 1311.0792236328125, "learning_rate": 5.664389850469322e-07, "loss": 0.0144, "num_input_tokens_seen": 93576400, "step": 138860 }, { "epoch": 3.3924950528913103, "grad_norm": 0.0003263753023929894, "learning_rate": 5.663621409138431e-07, "loss": 0.0, "num_input_tokens_seen": 93579728, "step": 138865 }, { "epoch": 3.3926172037231574, "grad_norm": 0.0007934704190120101, "learning_rate": 5.662852999342856e-07, "loss": 0.0, "num_input_tokens_seen": 93583184, "step": 138870 }, { "epoch": 3.3927393545550046, "grad_norm": 0.000460325856693089, "learning_rate": 5.662084621088177e-07, "loss": 0.0, "num_input_tokens_seen": 93586256, "step": 138875 }, { "epoch": 3.392861505386852, "grad_norm": 0.012204419821500778, "learning_rate": 5.66131627437999e-07, "loss": 0.0395, "num_input_tokens_seen": 93589776, "step": 138880 }, { "epoch": 3.392983656218699, "grad_norm": 0.0014695656718686223, "learning_rate": 5.660547959223871e-07, "loss": 0.0, "num_input_tokens_seen": 93593360, "step": 138885 }, { "epoch": 3.393105807050546, "grad_norm": 0.027506841346621513, "learning_rate": 5.659779675625418e-07, "loss": 0.0, "num_input_tokens_seen": 93596624, "step": 138890 }, { "epoch": 3.3932279578823934, "grad_norm": 0.0004509424907155335, "learning_rate": 5.659011423590217e-07, "loss": 0.0004, "num_input_tokens_seen": 93599952, "step": 138895 }, { "epoch": 3.39335010871424, "grad_norm": 0.004649386275559664, "learning_rate": 5.658243203123848e-07, "loss": 0.0001, "num_input_tokens_seen": 93603088, "step": 138900 }, { "epoch": 3.3934722595460873, "grad_norm": 0.018392464146018028, "learning_rate": 5.657475014231908e-07, "loss": 0.0, "num_input_tokens_seen": 93606928, "step": 138905 }, { "epoch": 3.3935944103779345, "grad_norm": 0.008321767672896385, "learning_rate": 5.656706856919971e-07, "loss": 0.0, "num_input_tokens_seen": 93610768, "step": 138910 }, { "epoch": 3.3937165612097817, "grad_norm": 0.03646848350763321, "learning_rate": 5.655938731193633e-07, "loss": 0.0, "num_input_tokens_seen": 93614032, "step": 138915 }, { "epoch": 3.393838712041629, "grad_norm": 0.000169451828696765, "learning_rate": 5.655170637058479e-07, "loss": 0.0, "num_input_tokens_seen": 93617232, "step": 138920 }, { "epoch": 3.393960862873476, "grad_norm": 0.0003246811975259334, "learning_rate": 5.654402574520088e-07, "loss": 0.0, "num_input_tokens_seen": 93620880, "step": 138925 }, { "epoch": 3.3940830137053233, "grad_norm": 0.028859952464699745, "learning_rate": 5.653634543584056e-07, "loss": 0.0, "num_input_tokens_seen": 93624016, "step": 138930 }, { "epoch": 3.3942051645371705, "grad_norm": 0.0009226136025972664, "learning_rate": 5.652866544255962e-07, "loss": 0.0, "num_input_tokens_seen": 93627216, "step": 138935 }, { "epoch": 3.3943273153690177, "grad_norm": 0.0003126481897197664, "learning_rate": 5.652098576541387e-07, "loss": 0.0739, "num_input_tokens_seen": 93630288, "step": 138940 }, { "epoch": 3.394449466200865, "grad_norm": 0.001821165787987411, "learning_rate": 5.651330640445926e-07, "loss": 0.0, "num_input_tokens_seen": 93633296, "step": 138945 }, { "epoch": 3.394571617032712, "grad_norm": 0.0009765501017682254, "learning_rate": 5.650562735975152e-07, "loss": 0.0, "num_input_tokens_seen": 93636816, "step": 138950 }, { "epoch": 3.3946937678645592, "grad_norm": 8.84903347468935e-05, "learning_rate": 5.649794863134663e-07, "loss": 0.1083, "num_input_tokens_seen": 93640144, "step": 138955 }, { "epoch": 3.3948159186964064, "grad_norm": 0.00939985178411007, "learning_rate": 5.649027021930031e-07, "loss": 0.0, "num_input_tokens_seen": 93643472, "step": 138960 }, { "epoch": 3.3949380695282536, "grad_norm": 0.0015276978956535459, "learning_rate": 5.648259212366847e-07, "loss": 0.0, "num_input_tokens_seen": 93646928, "step": 138965 }, { "epoch": 3.395060220360101, "grad_norm": 0.002236619358882308, "learning_rate": 5.647491434450688e-07, "loss": 0.0, "num_input_tokens_seen": 93650512, "step": 138970 }, { "epoch": 3.395182371191948, "grad_norm": 0.03943828493356705, "learning_rate": 5.646723688187148e-07, "loss": 0.0, "num_input_tokens_seen": 93653712, "step": 138975 }, { "epoch": 3.395304522023795, "grad_norm": 0.015553759410977364, "learning_rate": 5.645955973581799e-07, "loss": 0.0005, "num_input_tokens_seen": 93656976, "step": 138980 }, { "epoch": 3.395426672855642, "grad_norm": 0.0021713352762162685, "learning_rate": 5.645188290640231e-07, "loss": 0.0, "num_input_tokens_seen": 93660432, "step": 138985 }, { "epoch": 3.3955488236874896, "grad_norm": 0.00021980589372105896, "learning_rate": 5.644420639368028e-07, "loss": 0.0, "num_input_tokens_seen": 93663696, "step": 138990 }, { "epoch": 3.3956709745193363, "grad_norm": 0.027932588011026382, "learning_rate": 5.643653019770764e-07, "loss": 0.0355, "num_input_tokens_seen": 93666896, "step": 138995 }, { "epoch": 3.3957931253511835, "grad_norm": 0.014813977293670177, "learning_rate": 5.642885431854034e-07, "loss": 0.0, "num_input_tokens_seen": 93669904, "step": 139000 }, { "epoch": 3.3959152761830307, "grad_norm": 0.00031996812322176993, "learning_rate": 5.642117875623406e-07, "loss": 0.0, "num_input_tokens_seen": 93673424, "step": 139005 }, { "epoch": 3.396037427014878, "grad_norm": 0.00020197322010062635, "learning_rate": 5.641350351084471e-07, "loss": 0.0, "num_input_tokens_seen": 93676432, "step": 139010 }, { "epoch": 3.396159577846725, "grad_norm": 99.5778579711914, "learning_rate": 5.640582858242812e-07, "loss": 0.0412, "num_input_tokens_seen": 93680272, "step": 139015 }, { "epoch": 3.3962817286785723, "grad_norm": 0.0011479872046038508, "learning_rate": 5.639815397104004e-07, "loss": 0.0, "num_input_tokens_seen": 93683664, "step": 139020 }, { "epoch": 3.3964038795104194, "grad_norm": 68.51597595214844, "learning_rate": 5.639047967673634e-07, "loss": 0.0011, "num_input_tokens_seen": 93687120, "step": 139025 }, { "epoch": 3.3965260303422666, "grad_norm": 0.0023358529433608055, "learning_rate": 5.638280569957277e-07, "loss": 0.0, "num_input_tokens_seen": 93690384, "step": 139030 }, { "epoch": 3.396648181174114, "grad_norm": 0.0002345311950193718, "learning_rate": 5.637513203960519e-07, "loss": 0.0, "num_input_tokens_seen": 93693328, "step": 139035 }, { "epoch": 3.396770332005961, "grad_norm": 0.004020425956696272, "learning_rate": 5.636745869688939e-07, "loss": 0.0001, "num_input_tokens_seen": 93696592, "step": 139040 }, { "epoch": 3.396892482837808, "grad_norm": 0.0003916302521247417, "learning_rate": 5.635978567148114e-07, "loss": 0.0, "num_input_tokens_seen": 93699856, "step": 139045 }, { "epoch": 3.3970146336696554, "grad_norm": 0.001366269774734974, "learning_rate": 5.63521129634363e-07, "loss": 0.0, "num_input_tokens_seen": 93702800, "step": 139050 }, { "epoch": 3.3971367845015026, "grad_norm": 0.002023897599428892, "learning_rate": 5.634444057281058e-07, "loss": 0.0, "num_input_tokens_seen": 93706256, "step": 139055 }, { "epoch": 3.3972589353333498, "grad_norm": 75.8708267211914, "learning_rate": 5.633676849965989e-07, "loss": 0.0868, "num_input_tokens_seen": 93710032, "step": 139060 }, { "epoch": 3.397381086165197, "grad_norm": 0.001355248736217618, "learning_rate": 5.632909674403991e-07, "loss": 0.0, "num_input_tokens_seen": 93713104, "step": 139065 }, { "epoch": 3.3975032369970437, "grad_norm": 0.0011240398744121194, "learning_rate": 5.63214253060065e-07, "loss": 0.0, "num_input_tokens_seen": 93716752, "step": 139070 }, { "epoch": 3.3976253878288913, "grad_norm": 0.00047678040573373437, "learning_rate": 5.631375418561546e-07, "loss": 0.0, "num_input_tokens_seen": 93720400, "step": 139075 }, { "epoch": 3.397747538660738, "grad_norm": 0.0010390763636678457, "learning_rate": 5.630608338292251e-07, "loss": 0.0909, "num_input_tokens_seen": 93723792, "step": 139080 }, { "epoch": 3.3978696894925853, "grad_norm": 0.00013290536298882216, "learning_rate": 5.629841289798352e-07, "loss": 0.0, "num_input_tokens_seen": 93727184, "step": 139085 }, { "epoch": 3.3979918403244325, "grad_norm": 0.0025743693113327026, "learning_rate": 5.629074273085419e-07, "loss": 0.0, "num_input_tokens_seen": 93731024, "step": 139090 }, { "epoch": 3.3981139911562797, "grad_norm": 0.0016093073645606637, "learning_rate": 5.628307288159035e-07, "loss": 0.0, "num_input_tokens_seen": 93734224, "step": 139095 }, { "epoch": 3.398236141988127, "grad_norm": 0.001873745582997799, "learning_rate": 5.627540335024776e-07, "loss": 0.0, "num_input_tokens_seen": 93737168, "step": 139100 }, { "epoch": 3.398358292819974, "grad_norm": 0.0914987251162529, "learning_rate": 5.626773413688218e-07, "loss": 0.0, "num_input_tokens_seen": 93740368, "step": 139105 }, { "epoch": 3.3984804436518212, "grad_norm": 0.0004763550532516092, "learning_rate": 5.626006524154943e-07, "loss": 0.0448, "num_input_tokens_seen": 93743568, "step": 139110 }, { "epoch": 3.3986025944836684, "grad_norm": 0.0016596581554040313, "learning_rate": 5.625239666430521e-07, "loss": 0.0365, "num_input_tokens_seen": 93747088, "step": 139115 }, { "epoch": 3.3987247453155156, "grad_norm": 0.012288597412407398, "learning_rate": 5.624472840520538e-07, "loss": 0.0, "num_input_tokens_seen": 93751568, "step": 139120 }, { "epoch": 3.398846896147363, "grad_norm": 0.0013532297452911735, "learning_rate": 5.623706046430561e-07, "loss": 0.0, "num_input_tokens_seen": 93754704, "step": 139125 }, { "epoch": 3.39896904697921, "grad_norm": 0.0021572858095169067, "learning_rate": 5.622939284166175e-07, "loss": 0.0, "num_input_tokens_seen": 93758288, "step": 139130 }, { "epoch": 3.399091197811057, "grad_norm": 0.00035751090035773814, "learning_rate": 5.622172553732946e-07, "loss": 0.0, "num_input_tokens_seen": 93763856, "step": 139135 }, { "epoch": 3.3992133486429044, "grad_norm": 0.009177071042358875, "learning_rate": 5.621405855136463e-07, "loss": 0.0, "num_input_tokens_seen": 93766864, "step": 139140 }, { "epoch": 3.3993354994747516, "grad_norm": 0.03870631381869316, "learning_rate": 5.620639188382287e-07, "loss": 0.0001, "num_input_tokens_seen": 93770064, "step": 139145 }, { "epoch": 3.3994576503065987, "grad_norm": 0.003173073288053274, "learning_rate": 5.619872553476007e-07, "loss": 0.0, "num_input_tokens_seen": 93773584, "step": 139150 }, { "epoch": 3.399579801138446, "grad_norm": 0.000830693868920207, "learning_rate": 5.619105950423191e-07, "loss": 0.0001, "num_input_tokens_seen": 93777104, "step": 139155 }, { "epoch": 3.399701951970293, "grad_norm": 0.004456925205886364, "learning_rate": 5.618339379229411e-07, "loss": 0.0, "num_input_tokens_seen": 93780176, "step": 139160 }, { "epoch": 3.39982410280214, "grad_norm": 0.037957482039928436, "learning_rate": 5.617572839900246e-07, "loss": 0.0, "num_input_tokens_seen": 93783888, "step": 139165 }, { "epoch": 3.399946253633987, "grad_norm": 0.0009529789094813168, "learning_rate": 5.616806332441274e-07, "loss": 0.0, "num_input_tokens_seen": 93787472, "step": 139170 }, { "epoch": 3.4000684044658342, "grad_norm": 0.43681278824806213, "learning_rate": 5.616039856858062e-07, "loss": 0.0001, "num_input_tokens_seen": 93790864, "step": 139175 }, { "epoch": 3.4001905552976814, "grad_norm": 0.0007161188987083733, "learning_rate": 5.61527341315619e-07, "loss": 0.0, "num_input_tokens_seen": 93794064, "step": 139180 }, { "epoch": 3.4003127061295286, "grad_norm": 0.0001278212876059115, "learning_rate": 5.614507001341224e-07, "loss": 0.0, "num_input_tokens_seen": 93797264, "step": 139185 }, { "epoch": 3.400434856961376, "grad_norm": 0.00162774499040097, "learning_rate": 5.613740621418748e-07, "loss": 0.1035, "num_input_tokens_seen": 93800592, "step": 139190 }, { "epoch": 3.400557007793223, "grad_norm": 0.01283679436892271, "learning_rate": 5.612974273394327e-07, "loss": 0.0, "num_input_tokens_seen": 93803920, "step": 139195 }, { "epoch": 3.40067915862507, "grad_norm": 0.5930418372154236, "learning_rate": 5.612207957273535e-07, "loss": 0.0002, "num_input_tokens_seen": 93807248, "step": 139200 }, { "epoch": 3.4008013094569174, "grad_norm": 0.0012030928628519177, "learning_rate": 5.611441673061951e-07, "loss": 0.0, "num_input_tokens_seen": 93810704, "step": 139205 }, { "epoch": 3.4009234602887646, "grad_norm": 0.0014273381093516946, "learning_rate": 5.610675420765141e-07, "loss": 0.0, "num_input_tokens_seen": 93813968, "step": 139210 }, { "epoch": 3.4010456111206118, "grad_norm": 0.0039851912297308445, "learning_rate": 5.609909200388683e-07, "loss": 0.0002, "num_input_tokens_seen": 93817104, "step": 139215 }, { "epoch": 3.401167761952459, "grad_norm": 3.220369035261683e-05, "learning_rate": 5.609143011938143e-07, "loss": 0.0, "num_input_tokens_seen": 93822544, "step": 139220 }, { "epoch": 3.401289912784306, "grad_norm": 0.0013037320459261537, "learning_rate": 5.608376855419094e-07, "loss": 0.0, "num_input_tokens_seen": 93825872, "step": 139225 }, { "epoch": 3.4014120636161533, "grad_norm": 0.00025415164418518543, "learning_rate": 5.607610730837116e-07, "loss": 0.0, "num_input_tokens_seen": 93828752, "step": 139230 }, { "epoch": 3.4015342144480005, "grad_norm": 0.0004327208735048771, "learning_rate": 5.60684463819777e-07, "loss": 0.0003, "num_input_tokens_seen": 93832272, "step": 139235 }, { "epoch": 3.4016563652798477, "grad_norm": 5.300765405991115e-05, "learning_rate": 5.606078577506635e-07, "loss": 0.0, "num_input_tokens_seen": 93835664, "step": 139240 }, { "epoch": 3.401778516111695, "grad_norm": 0.003021878655999899, "learning_rate": 5.605312548769278e-07, "loss": 0.0, "num_input_tokens_seen": 93839312, "step": 139245 }, { "epoch": 3.4019006669435417, "grad_norm": 0.00036664100480265915, "learning_rate": 5.604546551991266e-07, "loss": 0.0, "num_input_tokens_seen": 93842512, "step": 139250 }, { "epoch": 3.4020228177753893, "grad_norm": 0.0010006529046222568, "learning_rate": 5.603780587178177e-07, "loss": 0.0399, "num_input_tokens_seen": 93845776, "step": 139255 }, { "epoch": 3.402144968607236, "grad_norm": 0.0046476093120872974, "learning_rate": 5.603014654335576e-07, "loss": 0.0, "num_input_tokens_seen": 93848784, "step": 139260 }, { "epoch": 3.402267119439083, "grad_norm": 0.004209481179714203, "learning_rate": 5.602248753469039e-07, "loss": 0.0, "num_input_tokens_seen": 93852752, "step": 139265 }, { "epoch": 3.4023892702709304, "grad_norm": 0.0006373856449499726, "learning_rate": 5.601482884584125e-07, "loss": 0.0, "num_input_tokens_seen": 93856016, "step": 139270 }, { "epoch": 3.4025114211027776, "grad_norm": 0.020606614649295807, "learning_rate": 5.600717047686417e-07, "loss": 0.0425, "num_input_tokens_seen": 93859728, "step": 139275 }, { "epoch": 3.402633571934625, "grad_norm": 0.00027014079387299716, "learning_rate": 5.599951242781473e-07, "loss": 0.135, "num_input_tokens_seen": 93862544, "step": 139280 }, { "epoch": 3.402755722766472, "grad_norm": 0.00013495008170139045, "learning_rate": 5.599185469874872e-07, "loss": 0.0, "num_input_tokens_seen": 93866000, "step": 139285 }, { "epoch": 3.402877873598319, "grad_norm": 2.1405741790658794e-05, "learning_rate": 5.598419728972174e-07, "loss": 0.0763, "num_input_tokens_seen": 93869456, "step": 139290 }, { "epoch": 3.4030000244301664, "grad_norm": 138.9090576171875, "learning_rate": 5.59765402007895e-07, "loss": 0.0026, "num_input_tokens_seen": 93872784, "step": 139295 }, { "epoch": 3.4031221752620135, "grad_norm": 0.0009671871666796505, "learning_rate": 5.596888343200776e-07, "loss": 0.0, "num_input_tokens_seen": 93876304, "step": 139300 }, { "epoch": 3.4032443260938607, "grad_norm": 0.003403621492907405, "learning_rate": 5.596122698343208e-07, "loss": 0.0, "num_input_tokens_seen": 93879760, "step": 139305 }, { "epoch": 3.403366476925708, "grad_norm": 0.02258618362247944, "learning_rate": 5.595357085511827e-07, "loss": 0.0004, "num_input_tokens_seen": 93883024, "step": 139310 }, { "epoch": 3.403488627757555, "grad_norm": 0.46484342217445374, "learning_rate": 5.594591504712189e-07, "loss": 0.0003, "num_input_tokens_seen": 93886224, "step": 139315 }, { "epoch": 3.4036107785894023, "grad_norm": 0.005803953390568495, "learning_rate": 5.593825955949865e-07, "loss": 0.0279, "num_input_tokens_seen": 93889552, "step": 139320 }, { "epoch": 3.4037329294212495, "grad_norm": 0.024452105164527893, "learning_rate": 5.593060439230429e-07, "loss": 0.0, "num_input_tokens_seen": 93893072, "step": 139325 }, { "epoch": 3.4038550802530967, "grad_norm": 0.029246680438518524, "learning_rate": 5.592294954559439e-07, "loss": 0.0, "num_input_tokens_seen": 93896656, "step": 139330 }, { "epoch": 3.403977231084944, "grad_norm": 0.009791702963411808, "learning_rate": 5.591529501942469e-07, "loss": 0.0, "num_input_tokens_seen": 93900304, "step": 139335 }, { "epoch": 3.404099381916791, "grad_norm": 0.009990615770220757, "learning_rate": 5.590764081385078e-07, "loss": 0.0, "num_input_tokens_seen": 93903760, "step": 139340 }, { "epoch": 3.404221532748638, "grad_norm": 0.0006494000554084778, "learning_rate": 5.589998692892841e-07, "loss": 0.0626, "num_input_tokens_seen": 93907216, "step": 139345 }, { "epoch": 3.404343683580485, "grad_norm": 0.004100476857274771, "learning_rate": 5.589233336471316e-07, "loss": 0.0524, "num_input_tokens_seen": 93910544, "step": 139350 }, { "epoch": 3.404465834412332, "grad_norm": 0.01545916311442852, "learning_rate": 5.588468012126076e-07, "loss": 0.0, "num_input_tokens_seen": 93914000, "step": 139355 }, { "epoch": 3.4045879852441794, "grad_norm": 0.012155533768236637, "learning_rate": 5.587702719862683e-07, "loss": 0.0002, "num_input_tokens_seen": 93917584, "step": 139360 }, { "epoch": 3.4047101360760266, "grad_norm": 193.21961975097656, "learning_rate": 5.586937459686701e-07, "loss": 0.0134, "num_input_tokens_seen": 93920720, "step": 139365 }, { "epoch": 3.4048322869078738, "grad_norm": 0.7182050943374634, "learning_rate": 5.586172231603697e-07, "loss": 0.0004, "num_input_tokens_seen": 93923856, "step": 139370 }, { "epoch": 3.404954437739721, "grad_norm": 0.00684003159403801, "learning_rate": 5.585407035619234e-07, "loss": 0.0001, "num_input_tokens_seen": 93927056, "step": 139375 }, { "epoch": 3.405076588571568, "grad_norm": 0.012966359034180641, "learning_rate": 5.584641871738882e-07, "loss": 0.0, "num_input_tokens_seen": 93930640, "step": 139380 }, { "epoch": 3.4051987394034153, "grad_norm": 0.0029723867774009705, "learning_rate": 5.583876739968197e-07, "loss": 0.0, "num_input_tokens_seen": 93934096, "step": 139385 }, { "epoch": 3.4053208902352625, "grad_norm": 0.03468893840909004, "learning_rate": 5.58311164031275e-07, "loss": 0.0005, "num_input_tokens_seen": 93937168, "step": 139390 }, { "epoch": 3.4054430410671097, "grad_norm": 28.802310943603516, "learning_rate": 5.582346572778104e-07, "loss": 0.0956, "num_input_tokens_seen": 93940496, "step": 139395 }, { "epoch": 3.405565191898957, "grad_norm": 0.0013457908062264323, "learning_rate": 5.581581537369821e-07, "loss": 0.0378, "num_input_tokens_seen": 93943696, "step": 139400 }, { "epoch": 3.405687342730804, "grad_norm": 0.0074604772962629795, "learning_rate": 5.580816534093468e-07, "loss": 0.0002, "num_input_tokens_seen": 93947088, "step": 139405 }, { "epoch": 3.4058094935626513, "grad_norm": 0.00541321886703372, "learning_rate": 5.580051562954602e-07, "loss": 0.0, "num_input_tokens_seen": 93950224, "step": 139410 }, { "epoch": 3.4059316443944985, "grad_norm": 0.010462961159646511, "learning_rate": 5.57928662395879e-07, "loss": 0.0001, "num_input_tokens_seen": 93954000, "step": 139415 }, { "epoch": 3.4060537952263457, "grad_norm": 0.000587663846090436, "learning_rate": 5.5785217171116e-07, "loss": 0.0, "num_input_tokens_seen": 93957712, "step": 139420 }, { "epoch": 3.406175946058193, "grad_norm": 0.10726846754550934, "learning_rate": 5.577756842418584e-07, "loss": 0.0, "num_input_tokens_seen": 93961360, "step": 139425 }, { "epoch": 3.4062980968900396, "grad_norm": 0.0019450212130323052, "learning_rate": 5.576991999885313e-07, "loss": 0.0, "num_input_tokens_seen": 93964496, "step": 139430 }, { "epoch": 3.4064202477218872, "grad_norm": 0.009344382211565971, "learning_rate": 5.576227189517343e-07, "loss": 0.0, "num_input_tokens_seen": 93968016, "step": 139435 }, { "epoch": 3.406542398553734, "grad_norm": 0.012868695892393589, "learning_rate": 5.575462411320241e-07, "loss": 0.0, "num_input_tokens_seen": 93971664, "step": 139440 }, { "epoch": 3.406664549385581, "grad_norm": 0.001059131813235581, "learning_rate": 5.574697665299565e-07, "loss": 0.0, "num_input_tokens_seen": 93974992, "step": 139445 }, { "epoch": 3.4067867002174284, "grad_norm": 0.0033619788009673357, "learning_rate": 5.573932951460881e-07, "loss": 0.0514, "num_input_tokens_seen": 93978064, "step": 139450 }, { "epoch": 3.4069088510492755, "grad_norm": 0.0006246289703994989, "learning_rate": 5.573168269809743e-07, "loss": 0.0, "num_input_tokens_seen": 93981520, "step": 139455 }, { "epoch": 3.4070310018811227, "grad_norm": 0.0036987571511417627, "learning_rate": 5.572403620351718e-07, "loss": 0.0, "num_input_tokens_seen": 93984848, "step": 139460 }, { "epoch": 3.40715315271297, "grad_norm": 0.0031094530131667852, "learning_rate": 5.571639003092368e-07, "loss": 0.0, "num_input_tokens_seen": 93988496, "step": 139465 }, { "epoch": 3.407275303544817, "grad_norm": 0.0013259407132863998, "learning_rate": 5.570874418037244e-07, "loss": 0.0, "num_input_tokens_seen": 93991440, "step": 139470 }, { "epoch": 3.4073974543766643, "grad_norm": 0.04478773847222328, "learning_rate": 5.570109865191912e-07, "loss": 0.0, "num_input_tokens_seen": 93994896, "step": 139475 }, { "epoch": 3.4075196052085115, "grad_norm": 0.014282858930528164, "learning_rate": 5.569345344561938e-07, "loss": 0.0, "num_input_tokens_seen": 93998032, "step": 139480 }, { "epoch": 3.4076417560403587, "grad_norm": 0.004201257135719061, "learning_rate": 5.568580856152873e-07, "loss": 0.0, "num_input_tokens_seen": 94000784, "step": 139485 }, { "epoch": 3.407763906872206, "grad_norm": 0.006269012577831745, "learning_rate": 5.567816399970282e-07, "loss": 0.0002, "num_input_tokens_seen": 94004048, "step": 139490 }, { "epoch": 3.407886057704053, "grad_norm": 0.27850908041000366, "learning_rate": 5.567051976019719e-07, "loss": 0.0001, "num_input_tokens_seen": 94006928, "step": 139495 }, { "epoch": 3.4080082085359003, "grad_norm": 0.00017745274817571044, "learning_rate": 5.566287584306751e-07, "loss": 0.0569, "num_input_tokens_seen": 94010128, "step": 139500 }, { "epoch": 3.4081303593677474, "grad_norm": 0.0006029402720741928, "learning_rate": 5.565523224836928e-07, "loss": 0.0003, "num_input_tokens_seen": 94013200, "step": 139505 }, { "epoch": 3.4082525101995946, "grad_norm": 0.0006123344646766782, "learning_rate": 5.564758897615813e-07, "loss": 0.0003, "num_input_tokens_seen": 94016528, "step": 139510 }, { "epoch": 3.4083746610314414, "grad_norm": 0.0006965179345570505, "learning_rate": 5.563994602648967e-07, "loss": 0.0001, "num_input_tokens_seen": 94019728, "step": 139515 }, { "epoch": 3.408496811863289, "grad_norm": 0.010701261460781097, "learning_rate": 5.563230339941942e-07, "loss": 0.0, "num_input_tokens_seen": 94023184, "step": 139520 }, { "epoch": 3.4086189626951358, "grad_norm": 0.006104510743170977, "learning_rate": 5.562466109500304e-07, "loss": 0.0, "num_input_tokens_seen": 94026320, "step": 139525 }, { "epoch": 3.408741113526983, "grad_norm": 0.0008365048561245203, "learning_rate": 5.561701911329602e-07, "loss": 0.0, "num_input_tokens_seen": 94029456, "step": 139530 }, { "epoch": 3.40886326435883, "grad_norm": 0.042535971850156784, "learning_rate": 5.560937745435401e-07, "loss": 0.05, "num_input_tokens_seen": 94033168, "step": 139535 }, { "epoch": 3.4089854151906773, "grad_norm": 0.00022195244673639536, "learning_rate": 5.560173611823251e-07, "loss": 0.0001, "num_input_tokens_seen": 94036624, "step": 139540 }, { "epoch": 3.4091075660225245, "grad_norm": 0.12215565890073776, "learning_rate": 5.559409510498711e-07, "loss": 0.0001, "num_input_tokens_seen": 94041040, "step": 139545 }, { "epoch": 3.4092297168543717, "grad_norm": 0.0006495245615951717, "learning_rate": 5.558645441467346e-07, "loss": 0.0, "num_input_tokens_seen": 94044240, "step": 139550 }, { "epoch": 3.409351867686219, "grad_norm": 0.011606546118855476, "learning_rate": 5.557881404734705e-07, "loss": 0.0, "num_input_tokens_seen": 94047184, "step": 139555 }, { "epoch": 3.409474018518066, "grad_norm": 0.0022992929443717003, "learning_rate": 5.557117400306341e-07, "loss": 0.0, "num_input_tokens_seen": 94050448, "step": 139560 }, { "epoch": 3.4095961693499133, "grad_norm": 9.92760033113882e-05, "learning_rate": 5.556353428187818e-07, "loss": 0.0, "num_input_tokens_seen": 94054160, "step": 139565 }, { "epoch": 3.4097183201817605, "grad_norm": 0.0028340895660221577, "learning_rate": 5.555589488384685e-07, "loss": 0.0, "num_input_tokens_seen": 94057616, "step": 139570 }, { "epoch": 3.4098404710136077, "grad_norm": 0.0007459719781763852, "learning_rate": 5.554825580902503e-07, "loss": 0.0, "num_input_tokens_seen": 94060752, "step": 139575 }, { "epoch": 3.409962621845455, "grad_norm": 0.0012573867570608854, "learning_rate": 5.554061705746822e-07, "loss": 0.0, "num_input_tokens_seen": 94064656, "step": 139580 }, { "epoch": 3.410084772677302, "grad_norm": 0.10932556539773941, "learning_rate": 5.553297862923203e-07, "loss": 0.0, "num_input_tokens_seen": 94068368, "step": 139585 }, { "epoch": 3.4102069235091492, "grad_norm": 0.0009823766304180026, "learning_rate": 5.552534052437195e-07, "loss": 0.0, "num_input_tokens_seen": 94072144, "step": 139590 }, { "epoch": 3.4103290743409964, "grad_norm": 0.006408262066543102, "learning_rate": 5.551770274294361e-07, "loss": 0.0, "num_input_tokens_seen": 94075472, "step": 139595 }, { "epoch": 3.4104512251728436, "grad_norm": 0.0034182979725301266, "learning_rate": 5.551006528500244e-07, "loss": 0.0804, "num_input_tokens_seen": 94078800, "step": 139600 }, { "epoch": 3.410573376004691, "grad_norm": 0.2944185435771942, "learning_rate": 5.550242815060404e-07, "loss": 0.0001, "num_input_tokens_seen": 94081936, "step": 139605 }, { "epoch": 3.4106955268365375, "grad_norm": 0.0003999386972282082, "learning_rate": 5.5494791339804e-07, "loss": 0.0, "num_input_tokens_seen": 94085584, "step": 139610 }, { "epoch": 3.4108176776683847, "grad_norm": 0.047348976135253906, "learning_rate": 5.548715485265776e-07, "loss": 0.0001, "num_input_tokens_seen": 94089040, "step": 139615 }, { "epoch": 3.410939828500232, "grad_norm": 0.012279681861400604, "learning_rate": 5.547951868922096e-07, "loss": 0.0, "num_input_tokens_seen": 94092048, "step": 139620 }, { "epoch": 3.411061979332079, "grad_norm": 0.006022414658218622, "learning_rate": 5.547188284954902e-07, "loss": 0.0, "num_input_tokens_seen": 94095120, "step": 139625 }, { "epoch": 3.4111841301639263, "grad_norm": 0.0033210585825145245, "learning_rate": 5.546424733369752e-07, "loss": 0.0, "num_input_tokens_seen": 94098768, "step": 139630 }, { "epoch": 3.4113062809957735, "grad_norm": 0.0012283911928534508, "learning_rate": 5.545661214172204e-07, "loss": 0.0, "num_input_tokens_seen": 94101520, "step": 139635 }, { "epoch": 3.4114284318276207, "grad_norm": 0.0016081221401691437, "learning_rate": 5.544897727367802e-07, "loss": 0.0694, "num_input_tokens_seen": 94104528, "step": 139640 }, { "epoch": 3.411550582659468, "grad_norm": 0.02599150314927101, "learning_rate": 5.544134272962105e-07, "loss": 0.0, "num_input_tokens_seen": 94107856, "step": 139645 }, { "epoch": 3.411672733491315, "grad_norm": 0.0009995142463594675, "learning_rate": 5.543370850960659e-07, "loss": 0.0001, "num_input_tokens_seen": 94110928, "step": 139650 }, { "epoch": 3.4117948843231622, "grad_norm": 0.0031279893592000008, "learning_rate": 5.542607461369022e-07, "loss": 0.0, "num_input_tokens_seen": 94114256, "step": 139655 }, { "epoch": 3.4119170351550094, "grad_norm": 0.0002375991316512227, "learning_rate": 5.54184410419274e-07, "loss": 0.0, "num_input_tokens_seen": 94117776, "step": 139660 }, { "epoch": 3.4120391859868566, "grad_norm": 1.0746980905532837, "learning_rate": 5.54108077943737e-07, "loss": 0.0004, "num_input_tokens_seen": 94121232, "step": 139665 }, { "epoch": 3.412161336818704, "grad_norm": 0.000253146281465888, "learning_rate": 5.540317487108459e-07, "loss": 0.0, "num_input_tokens_seen": 94124560, "step": 139670 }, { "epoch": 3.412283487650551, "grad_norm": 0.0034022240433841944, "learning_rate": 5.539554227211556e-07, "loss": 0.0, "num_input_tokens_seen": 94127952, "step": 139675 }, { "epoch": 3.412405638482398, "grad_norm": 0.004152646288275719, "learning_rate": 5.538790999752217e-07, "loss": 0.0, "num_input_tokens_seen": 94130960, "step": 139680 }, { "epoch": 3.4125277893142454, "grad_norm": 0.0016099411295726895, "learning_rate": 5.538027804735987e-07, "loss": 0.0, "num_input_tokens_seen": 94134288, "step": 139685 }, { "epoch": 3.4126499401460926, "grad_norm": 0.003590294159948826, "learning_rate": 5.537264642168422e-07, "loss": 0.0, "num_input_tokens_seen": 94137424, "step": 139690 }, { "epoch": 3.4127720909779393, "grad_norm": 0.010240147821605206, "learning_rate": 5.536501512055065e-07, "loss": 0.0, "num_input_tokens_seen": 94140944, "step": 139695 }, { "epoch": 3.412894241809787, "grad_norm": 0.0274630356580019, "learning_rate": 5.535738414401471e-07, "loss": 0.065, "num_input_tokens_seen": 94143824, "step": 139700 }, { "epoch": 3.4130163926416337, "grad_norm": 0.0014623597962781787, "learning_rate": 5.534975349213191e-07, "loss": 0.0376, "num_input_tokens_seen": 94147024, "step": 139705 }, { "epoch": 3.413138543473481, "grad_norm": 0.0355786494910717, "learning_rate": 5.534212316495767e-07, "loss": 0.0, "num_input_tokens_seen": 94150864, "step": 139710 }, { "epoch": 3.413260694305328, "grad_norm": 0.0009701123344711959, "learning_rate": 5.533449316254756e-07, "loss": 0.0, "num_input_tokens_seen": 94154256, "step": 139715 }, { "epoch": 3.4133828451371753, "grad_norm": 0.0032924406696110964, "learning_rate": 5.532686348495699e-07, "loss": 0.0, "num_input_tokens_seen": 94157904, "step": 139720 }, { "epoch": 3.4135049959690225, "grad_norm": 0.004160938318818808, "learning_rate": 5.531923413224148e-07, "loss": 0.0, "num_input_tokens_seen": 94161168, "step": 139725 }, { "epoch": 3.4136271468008696, "grad_norm": 0.0015338974772021174, "learning_rate": 5.531160510445657e-07, "loss": 0.0, "num_input_tokens_seen": 94164432, "step": 139730 }, { "epoch": 3.413749297632717, "grad_norm": 0.02621505782008171, "learning_rate": 5.530397640165765e-07, "loss": 0.0004, "num_input_tokens_seen": 94168528, "step": 139735 }, { "epoch": 3.413871448464564, "grad_norm": 0.004979231394827366, "learning_rate": 5.529634802390026e-07, "loss": 0.0256, "num_input_tokens_seen": 94171536, "step": 139740 }, { "epoch": 3.413993599296411, "grad_norm": 0.005769277922809124, "learning_rate": 5.528871997123981e-07, "loss": 0.0, "num_input_tokens_seen": 94174608, "step": 139745 }, { "epoch": 3.4141157501282584, "grad_norm": 0.0031776968389749527, "learning_rate": 5.528109224373186e-07, "loss": 0.0501, "num_input_tokens_seen": 94177744, "step": 139750 }, { "epoch": 3.4142379009601056, "grad_norm": 0.0034165666438639164, "learning_rate": 5.52734648414318e-07, "loss": 0.0, "num_input_tokens_seen": 94181072, "step": 139755 }, { "epoch": 3.414360051791953, "grad_norm": 0.0006008553318679333, "learning_rate": 5.526583776439517e-07, "loss": 0.0, "num_input_tokens_seen": 94184656, "step": 139760 }, { "epoch": 3.4144822026238, "grad_norm": 0.022039469331502914, "learning_rate": 5.525821101267735e-07, "loss": 0.0, "num_input_tokens_seen": 94188496, "step": 139765 }, { "epoch": 3.414604353455647, "grad_norm": 0.0023359765764325857, "learning_rate": 5.525058458633391e-07, "loss": 0.0, "num_input_tokens_seen": 94191696, "step": 139770 }, { "epoch": 3.4147265042874944, "grad_norm": 6.296150240814313e-05, "learning_rate": 5.524295848542025e-07, "loss": 0.0001, "num_input_tokens_seen": 94195344, "step": 139775 }, { "epoch": 3.4148486551193415, "grad_norm": 0.0014544177101925015, "learning_rate": 5.523533270999179e-07, "loss": 0.0, "num_input_tokens_seen": 94198800, "step": 139780 }, { "epoch": 3.4149708059511887, "grad_norm": 0.019036872312426567, "learning_rate": 5.522770726010404e-07, "loss": 0.0, "num_input_tokens_seen": 94202576, "step": 139785 }, { "epoch": 3.4150929567830355, "grad_norm": 0.0024766286369413137, "learning_rate": 5.522008213581249e-07, "loss": 0.0, "num_input_tokens_seen": 94205968, "step": 139790 }, { "epoch": 3.4152151076148827, "grad_norm": 0.029975347220897675, "learning_rate": 5.521245733717248e-07, "loss": 0.0716, "num_input_tokens_seen": 94209168, "step": 139795 }, { "epoch": 3.41533725844673, "grad_norm": 0.0008781835786066949, "learning_rate": 5.520483286423958e-07, "loss": 0.0, "num_input_tokens_seen": 94212688, "step": 139800 }, { "epoch": 3.415459409278577, "grad_norm": 0.0005944212316535413, "learning_rate": 5.519720871706916e-07, "loss": 0.0002, "num_input_tokens_seen": 94215696, "step": 139805 }, { "epoch": 3.4155815601104242, "grad_norm": 0.024179916828870773, "learning_rate": 5.51895848957167e-07, "loss": 0.0, "num_input_tokens_seen": 94219280, "step": 139810 }, { "epoch": 3.4157037109422714, "grad_norm": 0.0005148306954652071, "learning_rate": 5.518196140023761e-07, "loss": 0.0002, "num_input_tokens_seen": 94222928, "step": 139815 }, { "epoch": 3.4158258617741186, "grad_norm": 0.017895027995109558, "learning_rate": 5.517433823068736e-07, "loss": 0.0, "num_input_tokens_seen": 94226128, "step": 139820 }, { "epoch": 3.415948012605966, "grad_norm": 0.00047818353050388396, "learning_rate": 5.51667153871214e-07, "loss": 0.0001, "num_input_tokens_seen": 94229200, "step": 139825 }, { "epoch": 3.416070163437813, "grad_norm": 0.0006948122172616422, "learning_rate": 5.51590928695951e-07, "loss": 0.0, "num_input_tokens_seen": 94232400, "step": 139830 }, { "epoch": 3.41619231426966, "grad_norm": 0.06057262420654297, "learning_rate": 5.515147067816399e-07, "loss": 0.0002, "num_input_tokens_seen": 94235920, "step": 139835 }, { "epoch": 3.4163144651015074, "grad_norm": 0.0007530879229307175, "learning_rate": 5.514384881288341e-07, "loss": 0.0, "num_input_tokens_seen": 94239568, "step": 139840 }, { "epoch": 3.4164366159333546, "grad_norm": 0.003584317397326231, "learning_rate": 5.513622727380887e-07, "loss": 0.0, "num_input_tokens_seen": 94242960, "step": 139845 }, { "epoch": 3.4165587667652018, "grad_norm": 0.0016264979494735599, "learning_rate": 5.51286060609957e-07, "loss": 0.0, "num_input_tokens_seen": 94245776, "step": 139850 }, { "epoch": 3.416680917597049, "grad_norm": 0.00211779261007905, "learning_rate": 5.512098517449938e-07, "loss": 0.0, "num_input_tokens_seen": 94249168, "step": 139855 }, { "epoch": 3.416803068428896, "grad_norm": 0.8464714884757996, "learning_rate": 5.511336461437536e-07, "loss": 0.0003, "num_input_tokens_seen": 94252496, "step": 139860 }, { "epoch": 3.4169252192607433, "grad_norm": 0.00245960452593863, "learning_rate": 5.510574438067904e-07, "loss": 0.0, "num_input_tokens_seen": 94256080, "step": 139865 }, { "epoch": 3.4170473700925905, "grad_norm": 0.04314202442765236, "learning_rate": 5.509812447346578e-07, "loss": 0.0001, "num_input_tokens_seen": 94259792, "step": 139870 }, { "epoch": 3.4171695209244373, "grad_norm": 0.060364704579114914, "learning_rate": 5.509050489279107e-07, "loss": 0.0, "num_input_tokens_seen": 94262800, "step": 139875 }, { "epoch": 3.417291671756285, "grad_norm": 0.0005380921647883952, "learning_rate": 5.508288563871024e-07, "loss": 0.0, "num_input_tokens_seen": 94266064, "step": 139880 }, { "epoch": 3.4174138225881316, "grad_norm": 0.0008233811822719872, "learning_rate": 5.50752667112788e-07, "loss": 0.0, "num_input_tokens_seen": 94270288, "step": 139885 }, { "epoch": 3.417535973419979, "grad_norm": 0.0003207654517609626, "learning_rate": 5.506764811055206e-07, "loss": 0.0, "num_input_tokens_seen": 94273680, "step": 139890 }, { "epoch": 3.417658124251826, "grad_norm": 0.004454872105270624, "learning_rate": 5.506002983658551e-07, "loss": 0.0, "num_input_tokens_seen": 94276880, "step": 139895 }, { "epoch": 3.417780275083673, "grad_norm": 0.0006709819426760077, "learning_rate": 5.505241188943446e-07, "loss": 0.0001, "num_input_tokens_seen": 94280464, "step": 139900 }, { "epoch": 3.4179024259155204, "grad_norm": 0.0037196956109255552, "learning_rate": 5.504479426915441e-07, "loss": 0.0402, "num_input_tokens_seen": 94283600, "step": 139905 }, { "epoch": 3.4180245767473676, "grad_norm": 0.005503419786691666, "learning_rate": 5.503717697580067e-07, "loss": 0.0, "num_input_tokens_seen": 94287248, "step": 139910 }, { "epoch": 3.418146727579215, "grad_norm": 0.09051766246557236, "learning_rate": 5.502956000942867e-07, "loss": 0.0, "num_input_tokens_seen": 94290320, "step": 139915 }, { "epoch": 3.418268878411062, "grad_norm": 0.013865584507584572, "learning_rate": 5.502194337009384e-07, "loss": 0.1, "num_input_tokens_seen": 94293328, "step": 139920 }, { "epoch": 3.418391029242909, "grad_norm": 0.00011315821029711515, "learning_rate": 5.501432705785149e-07, "loss": 0.0, "num_input_tokens_seen": 94298704, "step": 139925 }, { "epoch": 3.4185131800747564, "grad_norm": 0.0008986890898086131, "learning_rate": 5.500671107275712e-07, "loss": 0.0, "num_input_tokens_seen": 94301840, "step": 139930 }, { "epoch": 3.4186353309066035, "grad_norm": 0.00023561430862173438, "learning_rate": 5.499909541486598e-07, "loss": 0.0, "num_input_tokens_seen": 94304976, "step": 139935 }, { "epoch": 3.4187574817384507, "grad_norm": 0.013146194629371166, "learning_rate": 5.499148008423353e-07, "loss": 0.0, "num_input_tokens_seen": 94308368, "step": 139940 }, { "epoch": 3.418879632570298, "grad_norm": 0.0015010988572612405, "learning_rate": 5.49838650809152e-07, "loss": 0.0, "num_input_tokens_seen": 94311760, "step": 139945 }, { "epoch": 3.419001783402145, "grad_norm": 0.012417850084602833, "learning_rate": 5.497625040496625e-07, "loss": 0.0, "num_input_tokens_seen": 94314960, "step": 139950 }, { "epoch": 3.4191239342339923, "grad_norm": 0.0008566909236833453, "learning_rate": 5.496863605644215e-07, "loss": 0.0, "num_input_tokens_seen": 94318224, "step": 139955 }, { "epoch": 3.4192460850658395, "grad_norm": 0.02170843631029129, "learning_rate": 5.496102203539823e-07, "loss": 0.0, "num_input_tokens_seen": 94321872, "step": 139960 }, { "epoch": 3.4193682358976867, "grad_norm": 0.006882964167743921, "learning_rate": 5.495340834188988e-07, "loss": 0.0475, "num_input_tokens_seen": 94325200, "step": 139965 }, { "epoch": 3.4194903867295334, "grad_norm": 0.002628026297315955, "learning_rate": 5.494579497597245e-07, "loss": 0.0, "num_input_tokens_seen": 94328528, "step": 139970 }, { "epoch": 3.4196125375613806, "grad_norm": 0.0007482045330107212, "learning_rate": 5.493818193770134e-07, "loss": 0.0, "num_input_tokens_seen": 94331792, "step": 139975 }, { "epoch": 3.419734688393228, "grad_norm": 0.014378773048520088, "learning_rate": 5.493056922713189e-07, "loss": 0.0365, "num_input_tokens_seen": 94335760, "step": 139980 }, { "epoch": 3.419856839225075, "grad_norm": 0.00155660230666399, "learning_rate": 5.492295684431942e-07, "loss": 0.0001, "num_input_tokens_seen": 94339088, "step": 139985 }, { "epoch": 3.419978990056922, "grad_norm": 0.0016220946563407779, "learning_rate": 5.491534478931939e-07, "loss": 0.0001, "num_input_tokens_seen": 94342352, "step": 139990 }, { "epoch": 3.4201011408887694, "grad_norm": 0.0007400620961561799, "learning_rate": 5.490773306218705e-07, "loss": 0.0001, "num_input_tokens_seen": 94345488, "step": 139995 }, { "epoch": 3.4202232917206166, "grad_norm": 0.007975745014846325, "learning_rate": 5.490012166297783e-07, "loss": 0.0, "num_input_tokens_seen": 94349136, "step": 140000 }, { "epoch": 3.4203454425524638, "grad_norm": 0.016189241781830788, "learning_rate": 5.489251059174705e-07, "loss": 0.0, "num_input_tokens_seen": 94352272, "step": 140005 }, { "epoch": 3.420467593384311, "grad_norm": 0.00473784189671278, "learning_rate": 5.488489984855002e-07, "loss": 0.0001, "num_input_tokens_seen": 94355728, "step": 140010 }, { "epoch": 3.420589744216158, "grad_norm": 0.004506468307226896, "learning_rate": 5.487728943344221e-07, "loss": 0.0, "num_input_tokens_seen": 94359248, "step": 140015 }, { "epoch": 3.4207118950480053, "grad_norm": 27.068483352661133, "learning_rate": 5.486967934647884e-07, "loss": 0.0799, "num_input_tokens_seen": 94362256, "step": 140020 }, { "epoch": 3.4208340458798525, "grad_norm": 0.00046828857739456, "learning_rate": 5.486206958771534e-07, "loss": 0.0, "num_input_tokens_seen": 94366096, "step": 140025 }, { "epoch": 3.4209561967116997, "grad_norm": 144.9590606689453, "learning_rate": 5.485446015720695e-07, "loss": 0.002, "num_input_tokens_seen": 94369104, "step": 140030 }, { "epoch": 3.421078347543547, "grad_norm": 0.0013556088088080287, "learning_rate": 5.484685105500908e-07, "loss": 0.0955, "num_input_tokens_seen": 94372496, "step": 140035 }, { "epoch": 3.421200498375394, "grad_norm": 0.009067345410585403, "learning_rate": 5.48392422811771e-07, "loss": 0.0, "num_input_tokens_seen": 94375568, "step": 140040 }, { "epoch": 3.4213226492072413, "grad_norm": 0.001369845005683601, "learning_rate": 5.483163383576626e-07, "loss": 0.0001, "num_input_tokens_seen": 94379216, "step": 140045 }, { "epoch": 3.4214448000390885, "grad_norm": 0.003838911419734359, "learning_rate": 5.482402571883196e-07, "loss": 0.0, "num_input_tokens_seen": 94382736, "step": 140050 }, { "epoch": 3.421566950870935, "grad_norm": 0.02284904755651951, "learning_rate": 5.481641793042945e-07, "loss": 0.0, "num_input_tokens_seen": 94386576, "step": 140055 }, { "epoch": 3.421689101702783, "grad_norm": 0.004888457246124744, "learning_rate": 5.480881047061415e-07, "loss": 0.0001, "num_input_tokens_seen": 94390288, "step": 140060 }, { "epoch": 3.4218112525346296, "grad_norm": 0.004341055639088154, "learning_rate": 5.48012033394413e-07, "loss": 0.0, "num_input_tokens_seen": 94393680, "step": 140065 }, { "epoch": 3.421933403366477, "grad_norm": 0.0021836755331605673, "learning_rate": 5.47935965369663e-07, "loss": 0.0, "num_input_tokens_seen": 94397392, "step": 140070 }, { "epoch": 3.422055554198324, "grad_norm": 0.0013760802103206515, "learning_rate": 5.478599006324436e-07, "loss": 0.0, "num_input_tokens_seen": 94400720, "step": 140075 }, { "epoch": 3.422177705030171, "grad_norm": 0.011365426704287529, "learning_rate": 5.477838391833092e-07, "loss": 0.0, "num_input_tokens_seen": 94404048, "step": 140080 }, { "epoch": 3.4222998558620183, "grad_norm": 0.002810131059959531, "learning_rate": 5.477077810228123e-07, "loss": 0.0, "num_input_tokens_seen": 94407248, "step": 140085 }, { "epoch": 3.4224220066938655, "grad_norm": 0.19500964879989624, "learning_rate": 5.476317261515058e-07, "loss": 0.0001, "num_input_tokens_seen": 94410576, "step": 140090 }, { "epoch": 3.4225441575257127, "grad_norm": 0.019051941111683846, "learning_rate": 5.475556745699433e-07, "loss": 0.0007, "num_input_tokens_seen": 94413776, "step": 140095 }, { "epoch": 3.42266630835756, "grad_norm": 0.040146760642528534, "learning_rate": 5.474796262786773e-07, "loss": 0.0, "num_input_tokens_seen": 94417232, "step": 140100 }, { "epoch": 3.422788459189407, "grad_norm": 0.003175535937771201, "learning_rate": 5.474035812782611e-07, "loss": 0.0, "num_input_tokens_seen": 94420176, "step": 140105 }, { "epoch": 3.4229106100212543, "grad_norm": 0.003321741707623005, "learning_rate": 5.473275395692483e-07, "loss": 0.0, "num_input_tokens_seen": 94423312, "step": 140110 }, { "epoch": 3.4230327608531015, "grad_norm": 0.0006850691861473024, "learning_rate": 5.472515011521908e-07, "loss": 0.083, "num_input_tokens_seen": 94426896, "step": 140115 }, { "epoch": 3.4231549116849487, "grad_norm": 0.04458747059106827, "learning_rate": 5.471754660276428e-07, "loss": 0.0, "num_input_tokens_seen": 94430352, "step": 140120 }, { "epoch": 3.423277062516796, "grad_norm": 0.009945902973413467, "learning_rate": 5.470994341961561e-07, "loss": 0.0, "num_input_tokens_seen": 94434128, "step": 140125 }, { "epoch": 3.423399213348643, "grad_norm": 0.0006958251469768584, "learning_rate": 5.47023405658284e-07, "loss": 0.0548, "num_input_tokens_seen": 94437200, "step": 140130 }, { "epoch": 3.4235213641804902, "grad_norm": 0.006527234800159931, "learning_rate": 5.469473804145801e-07, "loss": 0.0, "num_input_tokens_seen": 94440528, "step": 140135 }, { "epoch": 3.423643515012337, "grad_norm": 0.0007295972318388522, "learning_rate": 5.468713584655963e-07, "loss": 0.0, "num_input_tokens_seen": 94443792, "step": 140140 }, { "epoch": 3.4237656658441846, "grad_norm": 0.0014944429276511073, "learning_rate": 5.467953398118863e-07, "loss": 0.0489, "num_input_tokens_seen": 94446864, "step": 140145 }, { "epoch": 3.4238878166760314, "grad_norm": 0.05061039701104164, "learning_rate": 5.467193244540019e-07, "loss": 0.031, "num_input_tokens_seen": 94450192, "step": 140150 }, { "epoch": 3.4240099675078786, "grad_norm": 0.20036552846431732, "learning_rate": 5.466433123924969e-07, "loss": 0.0366, "num_input_tokens_seen": 94453968, "step": 140155 }, { "epoch": 3.4241321183397257, "grad_norm": 0.0480385348200798, "learning_rate": 5.465673036279235e-07, "loss": 0.0, "num_input_tokens_seen": 94457296, "step": 140160 }, { "epoch": 3.424254269171573, "grad_norm": 0.008224071003496647, "learning_rate": 5.464912981608345e-07, "loss": 0.0, "num_input_tokens_seen": 94460176, "step": 140165 }, { "epoch": 3.42437642000342, "grad_norm": 0.00030735330074094236, "learning_rate": 5.464152959917831e-07, "loss": 0.0, "num_input_tokens_seen": 94463184, "step": 140170 }, { "epoch": 3.4244985708352673, "grad_norm": 0.1763540804386139, "learning_rate": 5.463392971213218e-07, "loss": 0.0001, "num_input_tokens_seen": 94466320, "step": 140175 }, { "epoch": 3.4246207216671145, "grad_norm": 0.007921542041003704, "learning_rate": 5.462633015500027e-07, "loss": 0.0, "num_input_tokens_seen": 94469392, "step": 140180 }, { "epoch": 3.4247428724989617, "grad_norm": 0.0009218865307047963, "learning_rate": 5.461873092783792e-07, "loss": 0.027, "num_input_tokens_seen": 94472656, "step": 140185 }, { "epoch": 3.424865023330809, "grad_norm": 0.0015647134277969599, "learning_rate": 5.461113203070033e-07, "loss": 0.0, "num_input_tokens_seen": 94476688, "step": 140190 }, { "epoch": 3.424987174162656, "grad_norm": 2.1352298259735107, "learning_rate": 5.460353346364284e-07, "loss": 0.001, "num_input_tokens_seen": 94479888, "step": 140195 }, { "epoch": 3.4251093249945033, "grad_norm": 0.002515707165002823, "learning_rate": 5.459593522672063e-07, "loss": 0.0, "num_input_tokens_seen": 94483152, "step": 140200 }, { "epoch": 3.4252314758263505, "grad_norm": 0.002265633549541235, "learning_rate": 5.458833731998902e-07, "loss": 0.0, "num_input_tokens_seen": 94486480, "step": 140205 }, { "epoch": 3.4253536266581976, "grad_norm": 0.07212600111961365, "learning_rate": 5.45807397435032e-07, "loss": 0.0388, "num_input_tokens_seen": 94489744, "step": 140210 }, { "epoch": 3.425475777490045, "grad_norm": 0.00012162972416263074, "learning_rate": 5.45731424973185e-07, "loss": 0.0, "num_input_tokens_seen": 94493072, "step": 140215 }, { "epoch": 3.425597928321892, "grad_norm": 0.038783859461545944, "learning_rate": 5.456554558149008e-07, "loss": 0.0, "num_input_tokens_seen": 94496400, "step": 140220 }, { "epoch": 3.425720079153739, "grad_norm": 5.232390685705468e-05, "learning_rate": 5.455794899607324e-07, "loss": 0.0001, "num_input_tokens_seen": 94499920, "step": 140225 }, { "epoch": 3.4258422299855864, "grad_norm": 8.634371624793857e-05, "learning_rate": 5.455035274112325e-07, "loss": 0.0644, "num_input_tokens_seen": 94503888, "step": 140230 }, { "epoch": 3.425964380817433, "grad_norm": 0.0008194191614165902, "learning_rate": 5.454275681669529e-07, "loss": 0.0001, "num_input_tokens_seen": 94507728, "step": 140235 }, { "epoch": 3.4260865316492803, "grad_norm": 0.07005437463521957, "learning_rate": 5.453516122284465e-07, "loss": 0.0, "num_input_tokens_seen": 94511184, "step": 140240 }, { "epoch": 3.4262086824811275, "grad_norm": 0.011823393404483795, "learning_rate": 5.452756595962653e-07, "loss": 0.0, "num_input_tokens_seen": 94514576, "step": 140245 }, { "epoch": 3.4263308333129747, "grad_norm": 0.0013865167275071144, "learning_rate": 5.45199710270962e-07, "loss": 0.0, "num_input_tokens_seen": 94517840, "step": 140250 }, { "epoch": 3.426452984144822, "grad_norm": 0.0023665507324039936, "learning_rate": 5.451237642530884e-07, "loss": 0.0, "num_input_tokens_seen": 94521040, "step": 140255 }, { "epoch": 3.426575134976669, "grad_norm": 0.0009652891894802451, "learning_rate": 5.450478215431973e-07, "loss": 0.0, "num_input_tokens_seen": 94524368, "step": 140260 }, { "epoch": 3.4266972858085163, "grad_norm": 0.00044628497562371194, "learning_rate": 5.44971882141841e-07, "loss": 0.0576, "num_input_tokens_seen": 94527632, "step": 140265 }, { "epoch": 3.4268194366403635, "grad_norm": 0.022095222026109695, "learning_rate": 5.448959460495712e-07, "loss": 0.0001, "num_input_tokens_seen": 94531024, "step": 140270 }, { "epoch": 3.4269415874722107, "grad_norm": 0.0030793990008533, "learning_rate": 5.44820013266941e-07, "loss": 0.0, "num_input_tokens_seen": 94534032, "step": 140275 }, { "epoch": 3.427063738304058, "grad_norm": 0.00028188584838062525, "learning_rate": 5.447440837945015e-07, "loss": 0.0, "num_input_tokens_seen": 94537680, "step": 140280 }, { "epoch": 3.427185889135905, "grad_norm": 0.0007988949655555189, "learning_rate": 5.446681576328061e-07, "loss": 0.0002, "num_input_tokens_seen": 94540624, "step": 140285 }, { "epoch": 3.4273080399677522, "grad_norm": 0.0007177051738835871, "learning_rate": 5.445922347824062e-07, "loss": 0.0001, "num_input_tokens_seen": 94543952, "step": 140290 }, { "epoch": 3.4274301907995994, "grad_norm": 0.001201879233121872, "learning_rate": 5.445163152438535e-07, "loss": 0.0, "num_input_tokens_seen": 94547344, "step": 140295 }, { "epoch": 3.4275523416314466, "grad_norm": 0.008907120674848557, "learning_rate": 5.444403990177013e-07, "loss": 0.047, "num_input_tokens_seen": 94550928, "step": 140300 }, { "epoch": 3.427674492463294, "grad_norm": 0.001095519750379026, "learning_rate": 5.443644861045006e-07, "loss": 0.0, "num_input_tokens_seen": 94554128, "step": 140305 }, { "epoch": 3.427796643295141, "grad_norm": 0.1418331414461136, "learning_rate": 5.442885765048042e-07, "loss": 0.0001, "num_input_tokens_seen": 94557584, "step": 140310 }, { "epoch": 3.427918794126988, "grad_norm": 0.022201886400580406, "learning_rate": 5.442126702191637e-07, "loss": 0.0, "num_input_tokens_seen": 94560720, "step": 140315 }, { "epoch": 3.428040944958835, "grad_norm": 0.0010307944612577558, "learning_rate": 5.44136767248131e-07, "loss": 0.0325, "num_input_tokens_seen": 94564176, "step": 140320 }, { "epoch": 3.4281630957906826, "grad_norm": 0.00235103745944798, "learning_rate": 5.440608675922589e-07, "loss": 0.0366, "num_input_tokens_seen": 94567568, "step": 140325 }, { "epoch": 3.4282852466225293, "grad_norm": 0.019659819081425667, "learning_rate": 5.439849712520983e-07, "loss": 0.0, "num_input_tokens_seen": 94570896, "step": 140330 }, { "epoch": 3.4284073974543765, "grad_norm": 0.002396960277110338, "learning_rate": 5.439090782282021e-07, "loss": 0.0, "num_input_tokens_seen": 94574736, "step": 140335 }, { "epoch": 3.4285295482862237, "grad_norm": 0.003646480618044734, "learning_rate": 5.438331885211212e-07, "loss": 0.0, "num_input_tokens_seen": 94577744, "step": 140340 }, { "epoch": 3.428651699118071, "grad_norm": 0.0033332430757582188, "learning_rate": 5.437573021314083e-07, "loss": 0.0569, "num_input_tokens_seen": 94580880, "step": 140345 }, { "epoch": 3.428773849949918, "grad_norm": 0.0023963816929608583, "learning_rate": 5.436814190596153e-07, "loss": 0.0248, "num_input_tokens_seen": 94584080, "step": 140350 }, { "epoch": 3.4288960007817653, "grad_norm": 0.0009214163874275982, "learning_rate": 5.436055393062933e-07, "loss": 0.0, "num_input_tokens_seen": 94587536, "step": 140355 }, { "epoch": 3.4290181516136125, "grad_norm": 37.51756286621094, "learning_rate": 5.43529662871995e-07, "loss": 0.0017, "num_input_tokens_seen": 94590928, "step": 140360 }, { "epoch": 3.4291403024454596, "grad_norm": 0.0013852888951078057, "learning_rate": 5.434537897572713e-07, "loss": 0.0, "num_input_tokens_seen": 94594320, "step": 140365 }, { "epoch": 3.429262453277307, "grad_norm": 0.010237696580588818, "learning_rate": 5.43377919962675e-07, "loss": 0.0, "num_input_tokens_seen": 94597776, "step": 140370 }, { "epoch": 3.429384604109154, "grad_norm": 0.0037443661130964756, "learning_rate": 5.433020534887568e-07, "loss": 0.043, "num_input_tokens_seen": 94601424, "step": 140375 }, { "epoch": 3.429506754941001, "grad_norm": 0.07455271482467651, "learning_rate": 5.432261903360693e-07, "loss": 0.0, "num_input_tokens_seen": 94604624, "step": 140380 }, { "epoch": 3.4296289057728484, "grad_norm": 0.002680698409676552, "learning_rate": 5.431503305051634e-07, "loss": 0.0, "num_input_tokens_seen": 94607760, "step": 140385 }, { "epoch": 3.4297510566046956, "grad_norm": 0.06336840987205505, "learning_rate": 5.430744739965915e-07, "loss": 0.0572, "num_input_tokens_seen": 94611152, "step": 140390 }, { "epoch": 3.429873207436543, "grad_norm": 0.0008052958874031901, "learning_rate": 5.429986208109052e-07, "loss": 0.0001, "num_input_tokens_seen": 94614544, "step": 140395 }, { "epoch": 3.42999535826839, "grad_norm": 0.02269018441438675, "learning_rate": 5.429227709486552e-07, "loss": 0.0, "num_input_tokens_seen": 94618448, "step": 140400 }, { "epoch": 3.430117509100237, "grad_norm": 0.00405275821685791, "learning_rate": 5.428469244103941e-07, "loss": 0.0549, "num_input_tokens_seen": 94621840, "step": 140405 }, { "epoch": 3.4302396599320844, "grad_norm": 0.6425538659095764, "learning_rate": 5.427710811966729e-07, "loss": 0.0004, "num_input_tokens_seen": 94625488, "step": 140410 }, { "epoch": 3.430361810763931, "grad_norm": 0.001985025592148304, "learning_rate": 5.426952413080433e-07, "loss": 0.0058, "num_input_tokens_seen": 94629264, "step": 140415 }, { "epoch": 3.4304839615957783, "grad_norm": 16.643945693969727, "learning_rate": 5.426194047450574e-07, "loss": 0.0299, "num_input_tokens_seen": 94633104, "step": 140420 }, { "epoch": 3.4306061124276255, "grad_norm": 0.11192969977855682, "learning_rate": 5.425435715082657e-07, "loss": 0.0656, "num_input_tokens_seen": 94636176, "step": 140425 }, { "epoch": 3.4307282632594727, "grad_norm": 0.03374035283923149, "learning_rate": 5.424677415982206e-07, "loss": 0.0, "num_input_tokens_seen": 94639632, "step": 140430 }, { "epoch": 3.43085041409132, "grad_norm": 0.011221355758607388, "learning_rate": 5.423919150154727e-07, "loss": 0.0444, "num_input_tokens_seen": 94642896, "step": 140435 }, { "epoch": 3.430972564923167, "grad_norm": 0.000732203247025609, "learning_rate": 5.42316091760574e-07, "loss": 0.0422, "num_input_tokens_seen": 94646480, "step": 140440 }, { "epoch": 3.4310947157550142, "grad_norm": 0.1238001212477684, "learning_rate": 5.42240271834076e-07, "loss": 0.0001, "num_input_tokens_seen": 94650128, "step": 140445 }, { "epoch": 3.4312168665868614, "grad_norm": 0.01326525118201971, "learning_rate": 5.421644552365296e-07, "loss": 0.0, "num_input_tokens_seen": 94653776, "step": 140450 }, { "epoch": 3.4313390174187086, "grad_norm": 0.011391034349799156, "learning_rate": 5.420886419684869e-07, "loss": 0.0, "num_input_tokens_seen": 94656976, "step": 140455 }, { "epoch": 3.431461168250556, "grad_norm": 0.00035396870225667953, "learning_rate": 5.420128320304982e-07, "loss": 0.0, "num_input_tokens_seen": 94660304, "step": 140460 }, { "epoch": 3.431583319082403, "grad_norm": 0.0002599718573037535, "learning_rate": 5.419370254231159e-07, "loss": 0.0001, "num_input_tokens_seen": 94663632, "step": 140465 }, { "epoch": 3.43170546991425, "grad_norm": 0.07461328059434891, "learning_rate": 5.418612221468902e-07, "loss": 0.0, "num_input_tokens_seen": 94667152, "step": 140470 }, { "epoch": 3.4318276207460974, "grad_norm": 0.12314751744270325, "learning_rate": 5.41785422202373e-07, "loss": 0.0, "num_input_tokens_seen": 94670544, "step": 140475 }, { "epoch": 3.4319497715779446, "grad_norm": 0.004574818070977926, "learning_rate": 5.417096255901159e-07, "loss": 0.1119, "num_input_tokens_seen": 94674064, "step": 140480 }, { "epoch": 3.4320719224097918, "grad_norm": 0.005468493793159723, "learning_rate": 5.416338323106697e-07, "loss": 0.0, "num_input_tokens_seen": 94677584, "step": 140485 }, { "epoch": 3.432194073241639, "grad_norm": 0.02408468723297119, "learning_rate": 5.41558042364585e-07, "loss": 0.0004, "num_input_tokens_seen": 94680784, "step": 140490 }, { "epoch": 3.432316224073486, "grad_norm": 0.009744217619299889, "learning_rate": 5.41482255752414e-07, "loss": 0.1002, "num_input_tokens_seen": 94683920, "step": 140495 }, { "epoch": 3.432438374905333, "grad_norm": 0.6703230738639832, "learning_rate": 5.414064724747069e-07, "loss": 0.0002, "num_input_tokens_seen": 94686800, "step": 140500 }, { "epoch": 3.4325605257371805, "grad_norm": 0.005938725546002388, "learning_rate": 5.413306925320158e-07, "loss": 0.0, "num_input_tokens_seen": 94690768, "step": 140505 }, { "epoch": 3.4326826765690273, "grad_norm": 0.09531354159116745, "learning_rate": 5.412549159248909e-07, "loss": 0.0006, "num_input_tokens_seen": 94693776, "step": 140510 }, { "epoch": 3.4328048274008744, "grad_norm": 0.0047434065490961075, "learning_rate": 5.411791426538839e-07, "loss": 0.0, "num_input_tokens_seen": 94697488, "step": 140515 }, { "epoch": 3.4329269782327216, "grad_norm": 0.0113271065056324, "learning_rate": 5.411033727195453e-07, "loss": 0.0203, "num_input_tokens_seen": 94700688, "step": 140520 }, { "epoch": 3.433049129064569, "grad_norm": 0.0006195983733050525, "learning_rate": 5.410276061224267e-07, "loss": 0.0, "num_input_tokens_seen": 94704336, "step": 140525 }, { "epoch": 3.433171279896416, "grad_norm": 0.05286652222275734, "learning_rate": 5.409518428630785e-07, "loss": 0.0, "num_input_tokens_seen": 94707600, "step": 140530 }, { "epoch": 3.433293430728263, "grad_norm": 0.00045306552783586085, "learning_rate": 5.408760829420519e-07, "loss": 0.0, "num_input_tokens_seen": 94710864, "step": 140535 }, { "epoch": 3.4334155815601104, "grad_norm": 0.06464538723230362, "learning_rate": 5.408003263598984e-07, "loss": 0.0, "num_input_tokens_seen": 94714128, "step": 140540 }, { "epoch": 3.4335377323919576, "grad_norm": 0.0013601994141936302, "learning_rate": 5.407245731171679e-07, "loss": 0.0, "num_input_tokens_seen": 94717264, "step": 140545 }, { "epoch": 3.4336598832238048, "grad_norm": 0.006994623225182295, "learning_rate": 5.406488232144122e-07, "loss": 0.0, "num_input_tokens_seen": 94720848, "step": 140550 }, { "epoch": 3.433782034055652, "grad_norm": 26.440235137939453, "learning_rate": 5.405730766521815e-07, "loss": 0.0694, "num_input_tokens_seen": 94724368, "step": 140555 }, { "epoch": 3.433904184887499, "grad_norm": 0.23649762570858002, "learning_rate": 5.404973334310274e-07, "loss": 0.0606, "num_input_tokens_seen": 94727824, "step": 140560 }, { "epoch": 3.4340263357193463, "grad_norm": 0.01221111137419939, "learning_rate": 5.404215935514999e-07, "loss": 0.0, "num_input_tokens_seen": 94730960, "step": 140565 }, { "epoch": 3.4341484865511935, "grad_norm": 0.0008569937199354172, "learning_rate": 5.403458570141502e-07, "loss": 0.0001, "num_input_tokens_seen": 94734224, "step": 140570 }, { "epoch": 3.4342706373830407, "grad_norm": 0.002068551490083337, "learning_rate": 5.402701238195293e-07, "loss": 0.0001, "num_input_tokens_seen": 94737488, "step": 140575 }, { "epoch": 3.434392788214888, "grad_norm": 0.018063923344016075, "learning_rate": 5.401943939681875e-07, "loss": 0.0, "num_input_tokens_seen": 94741008, "step": 140580 }, { "epoch": 3.4345149390467347, "grad_norm": 0.019212810322642326, "learning_rate": 5.401186674606759e-07, "loss": 0.0004, "num_input_tokens_seen": 94744400, "step": 140585 }, { "epoch": 3.4346370898785823, "grad_norm": 0.002542370930314064, "learning_rate": 5.400429442975448e-07, "loss": 0.0186, "num_input_tokens_seen": 94747984, "step": 140590 }, { "epoch": 3.434759240710429, "grad_norm": 0.016300778836011887, "learning_rate": 5.399672244793455e-07, "loss": 0.0, "num_input_tokens_seen": 94751568, "step": 140595 }, { "epoch": 3.4348813915422762, "grad_norm": 0.019658824428915977, "learning_rate": 5.398915080066283e-07, "loss": 0.0001, "num_input_tokens_seen": 94754832, "step": 140600 }, { "epoch": 3.4350035423741234, "grad_norm": 0.014098125509917736, "learning_rate": 5.398157948799434e-07, "loss": 0.0, "num_input_tokens_seen": 94758032, "step": 140605 }, { "epoch": 3.4351256932059706, "grad_norm": 0.007338009774684906, "learning_rate": 5.397400850998421e-07, "loss": 0.0187, "num_input_tokens_seen": 94761552, "step": 140610 }, { "epoch": 3.435247844037818, "grad_norm": 0.0005641011521220207, "learning_rate": 5.396643786668744e-07, "loss": 0.0, "num_input_tokens_seen": 94764752, "step": 140615 }, { "epoch": 3.435369994869665, "grad_norm": 0.03765435144305229, "learning_rate": 5.395886755815918e-07, "loss": 0.0, "num_input_tokens_seen": 94768272, "step": 140620 }, { "epoch": 3.435492145701512, "grad_norm": 0.013260685838758945, "learning_rate": 5.395129758445433e-07, "loss": 0.0, "num_input_tokens_seen": 94771664, "step": 140625 }, { "epoch": 3.4356142965333594, "grad_norm": 0.004746972117573023, "learning_rate": 5.394372794562805e-07, "loss": 0.0005, "num_input_tokens_seen": 94774928, "step": 140630 }, { "epoch": 3.4357364473652066, "grad_norm": 0.0005900778924115002, "learning_rate": 5.393615864173542e-07, "loss": 0.0001, "num_input_tokens_seen": 94778384, "step": 140635 }, { "epoch": 3.4358585981970537, "grad_norm": 0.0015556630678474903, "learning_rate": 5.392858967283138e-07, "loss": 0.0, "num_input_tokens_seen": 94782032, "step": 140640 }, { "epoch": 3.435980749028901, "grad_norm": 0.00017046624270733446, "learning_rate": 5.392102103897108e-07, "loss": 0.0006, "num_input_tokens_seen": 94785488, "step": 140645 }, { "epoch": 3.436102899860748, "grad_norm": 0.0002703327627386898, "learning_rate": 5.391345274020946e-07, "loss": 0.0, "num_input_tokens_seen": 94788752, "step": 140650 }, { "epoch": 3.4362250506925953, "grad_norm": 0.002478924347087741, "learning_rate": 5.390588477660161e-07, "loss": 0.0001, "num_input_tokens_seen": 94792016, "step": 140655 }, { "epoch": 3.4363472015244425, "grad_norm": 0.01701575517654419, "learning_rate": 5.389831714820261e-07, "loss": 0.0, "num_input_tokens_seen": 94794960, "step": 140660 }, { "epoch": 3.4364693523562897, "grad_norm": 0.021169807761907578, "learning_rate": 5.389074985506739e-07, "loss": 0.0001, "num_input_tokens_seen": 94798032, "step": 140665 }, { "epoch": 3.436591503188137, "grad_norm": 0.001484877779148519, "learning_rate": 5.38831828972511e-07, "loss": 0.0, "num_input_tokens_seen": 94801232, "step": 140670 }, { "epoch": 3.436713654019984, "grad_norm": 0.6949237585067749, "learning_rate": 5.387561627480865e-07, "loss": 0.0003, "num_input_tokens_seen": 94804880, "step": 140675 }, { "epoch": 3.436835804851831, "grad_norm": 0.0015133284032344818, "learning_rate": 5.386804998779517e-07, "loss": 0.0421, "num_input_tokens_seen": 94808336, "step": 140680 }, { "epoch": 3.436957955683678, "grad_norm": 0.25915995240211487, "learning_rate": 5.386048403626561e-07, "loss": 0.0001, "num_input_tokens_seen": 94811600, "step": 140685 }, { "epoch": 3.437080106515525, "grad_norm": 0.009517625905573368, "learning_rate": 5.385291842027505e-07, "loss": 0.0, "num_input_tokens_seen": 94815248, "step": 140690 }, { "epoch": 3.4372022573473724, "grad_norm": 0.011960485950112343, "learning_rate": 5.384535313987844e-07, "loss": 0.0, "num_input_tokens_seen": 94818704, "step": 140695 }, { "epoch": 3.4373244081792196, "grad_norm": 0.0006022618617862463, "learning_rate": 5.383778819513088e-07, "loss": 0.0, "num_input_tokens_seen": 94821776, "step": 140700 }, { "epoch": 3.4374465590110668, "grad_norm": 5.928589962422848e-05, "learning_rate": 5.383022358608733e-07, "loss": 0.0, "num_input_tokens_seen": 94825680, "step": 140705 }, { "epoch": 3.437568709842914, "grad_norm": 0.0023734017740935087, "learning_rate": 5.382265931280279e-07, "loss": 0.0003, "num_input_tokens_seen": 94829328, "step": 140710 }, { "epoch": 3.437690860674761, "grad_norm": 0.00043151769204996526, "learning_rate": 5.381509537533231e-07, "loss": 0.0, "num_input_tokens_seen": 94832464, "step": 140715 }, { "epoch": 3.4378130115066083, "grad_norm": 0.0004412507696542889, "learning_rate": 5.380753177373085e-07, "loss": 0.0002, "num_input_tokens_seen": 94835408, "step": 140720 }, { "epoch": 3.4379351623384555, "grad_norm": 0.002872526179999113, "learning_rate": 5.379996850805344e-07, "loss": 0.0, "num_input_tokens_seen": 94838416, "step": 140725 }, { "epoch": 3.4380573131703027, "grad_norm": 0.022230220958590508, "learning_rate": 5.379240557835514e-07, "loss": 0.0001, "num_input_tokens_seen": 94842000, "step": 140730 }, { "epoch": 3.43817946400215, "grad_norm": 0.003806622000411153, "learning_rate": 5.378484298469084e-07, "loss": 0.0, "num_input_tokens_seen": 94845008, "step": 140735 }, { "epoch": 3.438301614833997, "grad_norm": 0.0002141565055353567, "learning_rate": 5.377728072711563e-07, "loss": 0.0, "num_input_tokens_seen": 94848464, "step": 140740 }, { "epoch": 3.4384237656658443, "grad_norm": 0.0003093667037319392, "learning_rate": 5.376971880568444e-07, "loss": 0.0371, "num_input_tokens_seen": 94851984, "step": 140745 }, { "epoch": 3.4385459164976915, "grad_norm": 0.04661043733358383, "learning_rate": 5.376215722045227e-07, "loss": 0.0, "num_input_tokens_seen": 94855120, "step": 140750 }, { "epoch": 3.4386680673295387, "grad_norm": 0.0001445067609893158, "learning_rate": 5.375459597147419e-07, "loss": 0.0002, "num_input_tokens_seen": 94858704, "step": 140755 }, { "epoch": 3.438790218161386, "grad_norm": 0.006317912135273218, "learning_rate": 5.374703505880507e-07, "loss": 0.0, "num_input_tokens_seen": 94861904, "step": 140760 }, { "epoch": 3.4389123689932326, "grad_norm": 0.009212841279804707, "learning_rate": 5.373947448250001e-07, "loss": 0.039, "num_input_tokens_seen": 94865296, "step": 140765 }, { "epoch": 3.4390345198250802, "grad_norm": 0.02927383780479431, "learning_rate": 5.373191424261388e-07, "loss": 0.0, "num_input_tokens_seen": 94868624, "step": 140770 }, { "epoch": 3.439156670656927, "grad_norm": 0.009841068647801876, "learning_rate": 5.372435433920175e-07, "loss": 0.0, "num_input_tokens_seen": 94871824, "step": 140775 }, { "epoch": 3.439278821488774, "grad_norm": 0.0017802307847887278, "learning_rate": 5.371679477231852e-07, "loss": 0.0001, "num_input_tokens_seen": 94875344, "step": 140780 }, { "epoch": 3.4394009723206214, "grad_norm": 0.0039372132159769535, "learning_rate": 5.370923554201923e-07, "loss": 0.0, "num_input_tokens_seen": 94879120, "step": 140785 }, { "epoch": 3.4395231231524686, "grad_norm": 0.00011069678294006735, "learning_rate": 5.370167664835885e-07, "loss": 0.0001, "num_input_tokens_seen": 94882000, "step": 140790 }, { "epoch": 3.4396452739843157, "grad_norm": 0.018852299079298973, "learning_rate": 5.369411809139232e-07, "loss": 0.005, "num_input_tokens_seen": 94886160, "step": 140795 }, { "epoch": 3.439767424816163, "grad_norm": 0.0011196270352229476, "learning_rate": 5.36865598711746e-07, "loss": 0.0, "num_input_tokens_seen": 94889552, "step": 140800 }, { "epoch": 3.43988957564801, "grad_norm": 8.786971739027649e-05, "learning_rate": 5.367900198776072e-07, "loss": 0.0011, "num_input_tokens_seen": 94893328, "step": 140805 }, { "epoch": 3.4400117264798573, "grad_norm": 0.0004634481738321483, "learning_rate": 5.367144444120553e-07, "loss": 0.0001, "num_input_tokens_seen": 94896528, "step": 140810 }, { "epoch": 3.4401338773117045, "grad_norm": 0.0002953216608148068, "learning_rate": 5.366388723156412e-07, "loss": 0.0, "num_input_tokens_seen": 94900048, "step": 140815 }, { "epoch": 3.4402560281435517, "grad_norm": 0.004183363169431686, "learning_rate": 5.365633035889135e-07, "loss": 0.0685, "num_input_tokens_seen": 94903056, "step": 140820 }, { "epoch": 3.440378178975399, "grad_norm": 0.011689604260027409, "learning_rate": 5.364877382324222e-07, "loss": 0.0276, "num_input_tokens_seen": 94906320, "step": 140825 }, { "epoch": 3.440500329807246, "grad_norm": 0.005056587513536215, "learning_rate": 5.364121762467165e-07, "loss": 0.0, "num_input_tokens_seen": 94909840, "step": 140830 }, { "epoch": 3.4406224806390933, "grad_norm": 347.3509216308594, "learning_rate": 5.363366176323465e-07, "loss": 0.0737, "num_input_tokens_seen": 94913360, "step": 140835 }, { "epoch": 3.4407446314709405, "grad_norm": 0.0028221230022609234, "learning_rate": 5.362610623898612e-07, "loss": 0.0, "num_input_tokens_seen": 94916816, "step": 140840 }, { "epoch": 3.4408667823027876, "grad_norm": 0.004818981513381004, "learning_rate": 5.361855105198098e-07, "loss": 0.0, "num_input_tokens_seen": 94920336, "step": 140845 }, { "epoch": 3.440988933134635, "grad_norm": 0.000316784338792786, "learning_rate": 5.361099620227427e-07, "loss": 0.0279, "num_input_tokens_seen": 94923408, "step": 140850 }, { "epoch": 3.441111083966482, "grad_norm": 0.0016813983675092459, "learning_rate": 5.360344168992083e-07, "loss": 0.0, "num_input_tokens_seen": 94926416, "step": 140855 }, { "epoch": 3.4412332347983288, "grad_norm": 27.457794189453125, "learning_rate": 5.359588751497568e-07, "loss": 0.095, "num_input_tokens_seen": 94929744, "step": 140860 }, { "epoch": 3.441355385630176, "grad_norm": 0.001600230229087174, "learning_rate": 5.358833367749368e-07, "loss": 0.0, "num_input_tokens_seen": 94933072, "step": 140865 }, { "epoch": 3.441477536462023, "grad_norm": 0.035153310745954514, "learning_rate": 5.358078017752984e-07, "loss": 0.0, "num_input_tokens_seen": 94936912, "step": 140870 }, { "epoch": 3.4415996872938703, "grad_norm": 0.0017619016580283642, "learning_rate": 5.357322701513901e-07, "loss": 0.0, "num_input_tokens_seen": 94940752, "step": 140875 }, { "epoch": 3.4417218381257175, "grad_norm": 0.002070831134915352, "learning_rate": 5.356567419037616e-07, "loss": 0.0002, "num_input_tokens_seen": 94944080, "step": 140880 }, { "epoch": 3.4418439889575647, "grad_norm": 0.0022519580088555813, "learning_rate": 5.355812170329626e-07, "loss": 0.0, "num_input_tokens_seen": 94947344, "step": 140885 }, { "epoch": 3.441966139789412, "grad_norm": 1.9227795600891113, "learning_rate": 5.355056955395415e-07, "loss": 0.0006, "num_input_tokens_seen": 94950544, "step": 140890 }, { "epoch": 3.442088290621259, "grad_norm": 0.0021642029751092196, "learning_rate": 5.354301774240483e-07, "loss": 0.0, "num_input_tokens_seen": 94953488, "step": 140895 }, { "epoch": 3.4422104414531063, "grad_norm": 0.000459485687315464, "learning_rate": 5.353546626870313e-07, "loss": 0.0, "num_input_tokens_seen": 94956496, "step": 140900 }, { "epoch": 3.4423325922849535, "grad_norm": 0.0010341558372601867, "learning_rate": 5.352791513290408e-07, "loss": 0.0001, "num_input_tokens_seen": 94959952, "step": 140905 }, { "epoch": 3.4424547431168007, "grad_norm": 0.0008501124102622271, "learning_rate": 5.352036433506251e-07, "loss": 0.0, "num_input_tokens_seen": 94963600, "step": 140910 }, { "epoch": 3.442576893948648, "grad_norm": 0.0004239061672706157, "learning_rate": 5.351281387523332e-07, "loss": 0.0, "num_input_tokens_seen": 94967056, "step": 140915 }, { "epoch": 3.442699044780495, "grad_norm": 0.0022695395164191723, "learning_rate": 5.35052637534715e-07, "loss": 0.0, "num_input_tokens_seen": 94970064, "step": 140920 }, { "epoch": 3.4428211956123422, "grad_norm": 5.309248808771372e-05, "learning_rate": 5.349771396983185e-07, "loss": 0.0, "num_input_tokens_seen": 94973392, "step": 140925 }, { "epoch": 3.4429433464441894, "grad_norm": 0.004750723950564861, "learning_rate": 5.349016452436938e-07, "loss": 0.0001, "num_input_tokens_seen": 94976656, "step": 140930 }, { "epoch": 3.4430654972760366, "grad_norm": 0.0031600017100572586, "learning_rate": 5.348261541713891e-07, "loss": 0.0, "num_input_tokens_seen": 94979728, "step": 140935 }, { "epoch": 3.443187648107884, "grad_norm": 0.000454644177807495, "learning_rate": 5.347506664819536e-07, "loss": 0.0, "num_input_tokens_seen": 94983056, "step": 140940 }, { "epoch": 3.4433097989397305, "grad_norm": 166.5770263671875, "learning_rate": 5.346751821759369e-07, "loss": 0.043, "num_input_tokens_seen": 94986832, "step": 140945 }, { "epoch": 3.443431949771578, "grad_norm": 0.05186281353235245, "learning_rate": 5.34599701253887e-07, "loss": 0.0, "num_input_tokens_seen": 94990096, "step": 140950 }, { "epoch": 3.443554100603425, "grad_norm": 0.004760414827615023, "learning_rate": 5.345242237163537e-07, "loss": 0.0, "num_input_tokens_seen": 94993552, "step": 140955 }, { "epoch": 3.443676251435272, "grad_norm": 0.002913939068093896, "learning_rate": 5.34448749563885e-07, "loss": 0.0, "num_input_tokens_seen": 94997584, "step": 140960 }, { "epoch": 3.4437984022671193, "grad_norm": 0.001271998044103384, "learning_rate": 5.343732787970305e-07, "loss": 0.0, "num_input_tokens_seen": 95000912, "step": 140965 }, { "epoch": 3.4439205530989665, "grad_norm": 0.001133499899879098, "learning_rate": 5.342978114163384e-07, "loss": 0.0001, "num_input_tokens_seen": 95004240, "step": 140970 }, { "epoch": 3.4440427039308137, "grad_norm": 130.31053161621094, "learning_rate": 5.342223474223579e-07, "loss": 0.0204, "num_input_tokens_seen": 95007952, "step": 140975 }, { "epoch": 3.444164854762661, "grad_norm": 0.009950289502739906, "learning_rate": 5.341468868156382e-07, "loss": 0.081, "num_input_tokens_seen": 95011152, "step": 140980 }, { "epoch": 3.444287005594508, "grad_norm": 0.006339088547974825, "learning_rate": 5.340714295967272e-07, "loss": 0.0001, "num_input_tokens_seen": 95014480, "step": 140985 }, { "epoch": 3.4444091564263553, "grad_norm": 45.47343826293945, "learning_rate": 5.339959757661745e-07, "loss": 0.0439, "num_input_tokens_seen": 95018000, "step": 140990 }, { "epoch": 3.4445313072582024, "grad_norm": 0.000821881287265569, "learning_rate": 5.339205253245281e-07, "loss": 0.0, "num_input_tokens_seen": 95021264, "step": 140995 }, { "epoch": 3.4446534580900496, "grad_norm": 0.2967153787612915, "learning_rate": 5.338450782723373e-07, "loss": 0.0001, "num_input_tokens_seen": 95024528, "step": 141000 }, { "epoch": 3.444775608921897, "grad_norm": 0.0012332163751125336, "learning_rate": 5.337696346101502e-07, "loss": 0.0001, "num_input_tokens_seen": 95028176, "step": 141005 }, { "epoch": 3.444897759753744, "grad_norm": 0.08027886599302292, "learning_rate": 5.336941943385158e-07, "loss": 0.0, "num_input_tokens_seen": 95031312, "step": 141010 }, { "epoch": 3.445019910585591, "grad_norm": 0.032805878669023514, "learning_rate": 5.336187574579829e-07, "loss": 0.0, "num_input_tokens_seen": 95034640, "step": 141015 }, { "epoch": 3.4451420614174384, "grad_norm": 0.011173062957823277, "learning_rate": 5.335433239690994e-07, "loss": 0.0, "num_input_tokens_seen": 95037968, "step": 141020 }, { "epoch": 3.4452642122492856, "grad_norm": 0.006212008185684681, "learning_rate": 5.334678938724147e-07, "loss": 0.0, "num_input_tokens_seen": 95041296, "step": 141025 }, { "epoch": 3.4453863630811328, "grad_norm": 0.0015581079060211778, "learning_rate": 5.333924671684765e-07, "loss": 0.0, "num_input_tokens_seen": 95044624, "step": 141030 }, { "epoch": 3.44550851391298, "grad_norm": 2.255311965942383, "learning_rate": 5.333170438578338e-07, "loss": 0.0016, "num_input_tokens_seen": 95047824, "step": 141035 }, { "epoch": 3.4456306647448267, "grad_norm": 0.002275418723002076, "learning_rate": 5.332416239410355e-07, "loss": 0.0, "num_input_tokens_seen": 95051472, "step": 141040 }, { "epoch": 3.445752815576674, "grad_norm": 1.5443556308746338, "learning_rate": 5.331662074186294e-07, "loss": 0.0011, "num_input_tokens_seen": 95054544, "step": 141045 }, { "epoch": 3.445874966408521, "grad_norm": 0.0001924506650539115, "learning_rate": 5.330907942911644e-07, "loss": 0.0, "num_input_tokens_seen": 95058000, "step": 141050 }, { "epoch": 3.4459971172403683, "grad_norm": 0.003207719186320901, "learning_rate": 5.330153845591884e-07, "loss": 0.0, "num_input_tokens_seen": 95061520, "step": 141055 }, { "epoch": 3.4461192680722155, "grad_norm": 0.00038838028558529913, "learning_rate": 5.329399782232501e-07, "loss": 0.0, "num_input_tokens_seen": 95065104, "step": 141060 }, { "epoch": 3.4462414189040627, "grad_norm": 0.016583800315856934, "learning_rate": 5.328645752838983e-07, "loss": 0.0, "num_input_tokens_seen": 95068496, "step": 141065 }, { "epoch": 3.44636356973591, "grad_norm": 0.0007583460537716746, "learning_rate": 5.327891757416806e-07, "loss": 0.0, "num_input_tokens_seen": 95071760, "step": 141070 }, { "epoch": 3.446485720567757, "grad_norm": 0.00037626130506396294, "learning_rate": 5.327137795971461e-07, "loss": 0.0, "num_input_tokens_seen": 95075024, "step": 141075 }, { "epoch": 3.4466078713996042, "grad_norm": 0.0008241772884503007, "learning_rate": 5.326383868508422e-07, "loss": 0.0, "num_input_tokens_seen": 95078032, "step": 141080 }, { "epoch": 3.4467300222314514, "grad_norm": 0.00028776045655831695, "learning_rate": 5.325629975033181e-07, "loss": 0.0024, "num_input_tokens_seen": 95082320, "step": 141085 }, { "epoch": 3.4468521730632986, "grad_norm": 0.006339185405522585, "learning_rate": 5.324876115551211e-07, "loss": 0.0, "num_input_tokens_seen": 95085584, "step": 141090 }, { "epoch": 3.446974323895146, "grad_norm": 0.0002104215818690136, "learning_rate": 5.324122290068001e-07, "loss": 0.0002, "num_input_tokens_seen": 95088912, "step": 141095 }, { "epoch": 3.447096474726993, "grad_norm": 0.0013865433866158128, "learning_rate": 5.323368498589035e-07, "loss": 0.0, "num_input_tokens_seen": 95091664, "step": 141100 }, { "epoch": 3.44721862555884, "grad_norm": 0.07491642981767654, "learning_rate": 5.322614741119791e-07, "loss": 0.0, "num_input_tokens_seen": 95094928, "step": 141105 }, { "epoch": 3.4473407763906874, "grad_norm": 0.0023299320600926876, "learning_rate": 5.321861017665745e-07, "loss": 0.0, "num_input_tokens_seen": 95098256, "step": 141110 }, { "epoch": 3.4474629272225346, "grad_norm": 0.01721266657114029, "learning_rate": 5.321107328232391e-07, "loss": 0.0, "num_input_tokens_seen": 95101328, "step": 141115 }, { "epoch": 3.4475850780543817, "grad_norm": 0.0050105806440114975, "learning_rate": 5.320353672825197e-07, "loss": 0.0256, "num_input_tokens_seen": 95104656, "step": 141120 }, { "epoch": 3.4477072288862285, "grad_norm": 0.00042763014789670706, "learning_rate": 5.319600051449653e-07, "loss": 0.0, "num_input_tokens_seen": 95108432, "step": 141125 }, { "epoch": 3.447829379718076, "grad_norm": 0.0009441131260246038, "learning_rate": 5.318846464111235e-07, "loss": 0.0, "num_input_tokens_seen": 95111760, "step": 141130 }, { "epoch": 3.447951530549923, "grad_norm": 0.0003792133938986808, "learning_rate": 5.318092910815426e-07, "loss": 0.0, "num_input_tokens_seen": 95114896, "step": 141135 }, { "epoch": 3.44807368138177, "grad_norm": 0.001232424401678145, "learning_rate": 5.317339391567702e-07, "loss": 0.0, "num_input_tokens_seen": 95118160, "step": 141140 }, { "epoch": 3.4481958322136173, "grad_norm": 0.0013748533092439175, "learning_rate": 5.31658590637355e-07, "loss": 0.0, "num_input_tokens_seen": 95121808, "step": 141145 }, { "epoch": 3.4483179830454644, "grad_norm": 0.00029413148877210915, "learning_rate": 5.315832455238439e-07, "loss": 0.0, "num_input_tokens_seen": 95125328, "step": 141150 }, { "epoch": 3.4484401338773116, "grad_norm": 0.014625326730310917, "learning_rate": 5.315079038167856e-07, "loss": 0.0, "num_input_tokens_seen": 95128656, "step": 141155 }, { "epoch": 3.448562284709159, "grad_norm": 0.001389988581649959, "learning_rate": 5.314325655167282e-07, "loss": 0.0, "num_input_tokens_seen": 95131920, "step": 141160 }, { "epoch": 3.448684435541006, "grad_norm": 0.004245539661496878, "learning_rate": 5.313572306242187e-07, "loss": 0.0, "num_input_tokens_seen": 95135568, "step": 141165 }, { "epoch": 3.448806586372853, "grad_norm": 0.0002586462942417711, "learning_rate": 5.312818991398061e-07, "loss": 0.143, "num_input_tokens_seen": 95138768, "step": 141170 }, { "epoch": 3.4489287372047004, "grad_norm": 4.186246223980561e-05, "learning_rate": 5.31206571064037e-07, "loss": 0.0, "num_input_tokens_seen": 95142352, "step": 141175 }, { "epoch": 3.4490508880365476, "grad_norm": 0.0017056518699973822, "learning_rate": 5.311312463974602e-07, "loss": 0.0266, "num_input_tokens_seen": 95145680, "step": 141180 }, { "epoch": 3.4491730388683948, "grad_norm": 0.015404156409204006, "learning_rate": 5.310559251406228e-07, "loss": 0.0, "num_input_tokens_seen": 95149456, "step": 141185 }, { "epoch": 3.449295189700242, "grad_norm": 0.14585693180561066, "learning_rate": 5.309806072940728e-07, "loss": 0.0001, "num_input_tokens_seen": 95152592, "step": 141190 }, { "epoch": 3.449417340532089, "grad_norm": 0.0002526379539631307, "learning_rate": 5.309052928583583e-07, "loss": 0.0804, "num_input_tokens_seen": 95156304, "step": 141195 }, { "epoch": 3.4495394913639363, "grad_norm": 0.004309144802391529, "learning_rate": 5.308299818340263e-07, "loss": 0.0568, "num_input_tokens_seen": 95159568, "step": 141200 }, { "epoch": 3.4496616421957835, "grad_norm": 0.00161842186935246, "learning_rate": 5.307546742216253e-07, "loss": 0.0, "num_input_tokens_seen": 95162832, "step": 141205 }, { "epoch": 3.4497837930276303, "grad_norm": 0.0006941687897779047, "learning_rate": 5.306793700217021e-07, "loss": 0.0, "num_input_tokens_seen": 95166352, "step": 141210 }, { "epoch": 3.449905943859478, "grad_norm": 0.005880511831492186, "learning_rate": 5.306040692348053e-07, "loss": 0.0625, "num_input_tokens_seen": 95169936, "step": 141215 }, { "epoch": 3.4500280946913247, "grad_norm": 0.001196865108795464, "learning_rate": 5.305287718614818e-07, "loss": 0.0, "num_input_tokens_seen": 95173008, "step": 141220 }, { "epoch": 3.450150245523172, "grad_norm": 0.00487381499260664, "learning_rate": 5.304534779022789e-07, "loss": 0.0888, "num_input_tokens_seen": 95176080, "step": 141225 }, { "epoch": 3.450272396355019, "grad_norm": 0.0032687492202967405, "learning_rate": 5.303781873577451e-07, "loss": 0.0227, "num_input_tokens_seen": 95179152, "step": 141230 }, { "epoch": 3.4503945471868662, "grad_norm": 0.00017934656352736056, "learning_rate": 5.303029002284271e-07, "loss": 0.0, "num_input_tokens_seen": 95182352, "step": 141235 }, { "epoch": 3.4505166980187134, "grad_norm": 0.6623572111129761, "learning_rate": 5.30227616514873e-07, "loss": 0.0002, "num_input_tokens_seen": 95185808, "step": 141240 }, { "epoch": 3.4506388488505606, "grad_norm": 0.006883205845952034, "learning_rate": 5.301523362176297e-07, "loss": 0.0, "num_input_tokens_seen": 95189136, "step": 141245 }, { "epoch": 3.450760999682408, "grad_norm": 0.0020480111707001925, "learning_rate": 5.30077059337245e-07, "loss": 0.0, "num_input_tokens_seen": 95192528, "step": 141250 }, { "epoch": 3.450883150514255, "grad_norm": 0.10637383162975311, "learning_rate": 5.300017858742667e-07, "loss": 0.028, "num_input_tokens_seen": 95195728, "step": 141255 }, { "epoch": 3.451005301346102, "grad_norm": 0.41171160340309143, "learning_rate": 5.299265158292414e-07, "loss": 0.0001, "num_input_tokens_seen": 95198992, "step": 141260 }, { "epoch": 3.4511274521779494, "grad_norm": 0.001396168489009142, "learning_rate": 5.298512492027174e-07, "loss": 0.0266, "num_input_tokens_seen": 95202256, "step": 141265 }, { "epoch": 3.4512496030097966, "grad_norm": 0.2164297252893448, "learning_rate": 5.297759859952411e-07, "loss": 0.0006, "num_input_tokens_seen": 95205264, "step": 141270 }, { "epoch": 3.4513717538416437, "grad_norm": 0.008003800176084042, "learning_rate": 5.297007262073607e-07, "loss": 0.0002, "num_input_tokens_seen": 95208208, "step": 141275 }, { "epoch": 3.451493904673491, "grad_norm": 0.0007602174300700426, "learning_rate": 5.296254698396227e-07, "loss": 0.1171, "num_input_tokens_seen": 95211600, "step": 141280 }, { "epoch": 3.451616055505338, "grad_norm": 0.007521396037191153, "learning_rate": 5.295502168925749e-07, "loss": 0.0, "num_input_tokens_seen": 95214608, "step": 141285 }, { "epoch": 3.4517382063371853, "grad_norm": 0.09310326725244522, "learning_rate": 5.294749673667646e-07, "loss": 0.0, "num_input_tokens_seen": 95217616, "step": 141290 }, { "epoch": 3.4518603571690325, "grad_norm": 0.012504089623689651, "learning_rate": 5.293997212627388e-07, "loss": 0.0, "num_input_tokens_seen": 95221072, "step": 141295 }, { "epoch": 3.4519825080008797, "grad_norm": 0.03356561064720154, "learning_rate": 5.293244785810451e-07, "loss": 0.0, "num_input_tokens_seen": 95224208, "step": 141300 }, { "epoch": 3.4521046588327264, "grad_norm": 0.0009436097461730242, "learning_rate": 5.292492393222299e-07, "loss": 0.0, "num_input_tokens_seen": 95227536, "step": 141305 }, { "epoch": 3.4522268096645736, "grad_norm": 0.029604079201817513, "learning_rate": 5.291740034868413e-07, "loss": 0.0001, "num_input_tokens_seen": 95230736, "step": 141310 }, { "epoch": 3.452348960496421, "grad_norm": 0.0026405539829283953, "learning_rate": 5.290987710754256e-07, "loss": 0.0001, "num_input_tokens_seen": 95234384, "step": 141315 }, { "epoch": 3.452471111328268, "grad_norm": 0.020678304135799408, "learning_rate": 5.290235420885307e-07, "loss": 0.0, "num_input_tokens_seen": 95237904, "step": 141320 }, { "epoch": 3.452593262160115, "grad_norm": 0.002151084365323186, "learning_rate": 5.289483165267033e-07, "loss": 0.0, "num_input_tokens_seen": 95240848, "step": 141325 }, { "epoch": 3.4527154129919624, "grad_norm": 0.006780423689633608, "learning_rate": 5.2887309439049e-07, "loss": 0.0348, "num_input_tokens_seen": 95243984, "step": 141330 }, { "epoch": 3.4528375638238096, "grad_norm": 0.002874805126339197, "learning_rate": 5.287978756804387e-07, "loss": 0.0439, "num_input_tokens_seen": 95247376, "step": 141335 }, { "epoch": 3.4529597146556568, "grad_norm": 0.008217284455895424, "learning_rate": 5.287226603970956e-07, "loss": 0.0, "num_input_tokens_seen": 95250576, "step": 141340 }, { "epoch": 3.453081865487504, "grad_norm": 0.00020205184409860522, "learning_rate": 5.28647448541008e-07, "loss": 0.0685, "num_input_tokens_seen": 95253904, "step": 141345 }, { "epoch": 3.453204016319351, "grad_norm": 0.011114844121038914, "learning_rate": 5.285722401127234e-07, "loss": 0.0, "num_input_tokens_seen": 95256592, "step": 141350 }, { "epoch": 3.4533261671511983, "grad_norm": 0.005883699748665094, "learning_rate": 5.284970351127878e-07, "loss": 0.0, "num_input_tokens_seen": 95260112, "step": 141355 }, { "epoch": 3.4534483179830455, "grad_norm": 0.004646258894354105, "learning_rate": 5.284218335417489e-07, "loss": 0.0001, "num_input_tokens_seen": 95263312, "step": 141360 }, { "epoch": 3.4535704688148927, "grad_norm": 0.010385840199887753, "learning_rate": 5.283466354001531e-07, "loss": 0.0, "num_input_tokens_seen": 95266832, "step": 141365 }, { "epoch": 3.45369261964674, "grad_norm": 0.0005899182870052755, "learning_rate": 5.282714406885478e-07, "loss": 0.0, "num_input_tokens_seen": 95270160, "step": 141370 }, { "epoch": 3.453814770478587, "grad_norm": 0.036543406546115875, "learning_rate": 5.28196249407479e-07, "loss": 0.044, "num_input_tokens_seen": 95273168, "step": 141375 }, { "epoch": 3.4539369213104343, "grad_norm": 0.015833524987101555, "learning_rate": 5.281210615574939e-07, "loss": 0.0, "num_input_tokens_seen": 95276496, "step": 141380 }, { "epoch": 3.4540590721422815, "grad_norm": 0.012564883567392826, "learning_rate": 5.280458771391398e-07, "loss": 0.0001, "num_input_tokens_seen": 95279760, "step": 141385 }, { "epoch": 3.454181222974128, "grad_norm": 0.0949430912733078, "learning_rate": 5.279706961529627e-07, "loss": 0.0006, "num_input_tokens_seen": 95283728, "step": 141390 }, { "epoch": 3.454303373805976, "grad_norm": 53.003692626953125, "learning_rate": 5.2789551859951e-07, "loss": 0.108, "num_input_tokens_seen": 95286992, "step": 141395 }, { "epoch": 3.4544255246378226, "grad_norm": 0.007030909415334463, "learning_rate": 5.278203444793276e-07, "loss": 0.0003, "num_input_tokens_seen": 95290512, "step": 141400 }, { "epoch": 3.45454767546967, "grad_norm": 0.00644741440191865, "learning_rate": 5.277451737929628e-07, "loss": 0.0, "num_input_tokens_seen": 95293904, "step": 141405 }, { "epoch": 3.454669826301517, "grad_norm": 0.01929423399269581, "learning_rate": 5.276700065409623e-07, "loss": 0.0, "num_input_tokens_seen": 95297552, "step": 141410 }, { "epoch": 3.454791977133364, "grad_norm": 0.0031475010327994823, "learning_rate": 5.275948427238727e-07, "loss": 0.0, "num_input_tokens_seen": 95301200, "step": 141415 }, { "epoch": 3.4549141279652114, "grad_norm": 0.011686863377690315, "learning_rate": 5.275196823422401e-07, "loss": 0.0, "num_input_tokens_seen": 95304976, "step": 141420 }, { "epoch": 3.4550362787970585, "grad_norm": 0.0013666352024301887, "learning_rate": 5.274445253966118e-07, "loss": 0.0002, "num_input_tokens_seen": 95308304, "step": 141425 }, { "epoch": 3.4551584296289057, "grad_norm": 0.0014562977012246847, "learning_rate": 5.273693718875336e-07, "loss": 0.0, "num_input_tokens_seen": 95311696, "step": 141430 }, { "epoch": 3.455280580460753, "grad_norm": 0.005508221220225096, "learning_rate": 5.27294221815553e-07, "loss": 0.0, "num_input_tokens_seen": 95314960, "step": 141435 }, { "epoch": 3.4554027312926, "grad_norm": 0.0002619369188323617, "learning_rate": 5.272190751812154e-07, "loss": 0.0129, "num_input_tokens_seen": 95318672, "step": 141440 }, { "epoch": 3.4555248821244473, "grad_norm": 0.0007996332133188844, "learning_rate": 5.271439319850682e-07, "loss": 0.0, "num_input_tokens_seen": 95322128, "step": 141445 }, { "epoch": 3.4556470329562945, "grad_norm": 0.1051895022392273, "learning_rate": 5.270687922276573e-07, "loss": 0.0001, "num_input_tokens_seen": 95325264, "step": 141450 }, { "epoch": 3.4557691837881417, "grad_norm": 0.002971762791275978, "learning_rate": 5.269936559095296e-07, "loss": 0.0002, "num_input_tokens_seen": 95328336, "step": 141455 }, { "epoch": 3.455891334619989, "grad_norm": 0.0001414813450537622, "learning_rate": 5.269185230312309e-07, "loss": 0.0, "num_input_tokens_seen": 95331664, "step": 141460 }, { "epoch": 3.456013485451836, "grad_norm": 0.0005533331423066556, "learning_rate": 5.268433935933079e-07, "loss": 0.0, "num_input_tokens_seen": 95334608, "step": 141465 }, { "epoch": 3.4561356362836833, "grad_norm": 0.005247303284704685, "learning_rate": 5.267682675963074e-07, "loss": 0.0, "num_input_tokens_seen": 95338256, "step": 141470 }, { "epoch": 3.4562577871155304, "grad_norm": 0.0007101392257027328, "learning_rate": 5.26693145040775e-07, "loss": 0.0064, "num_input_tokens_seen": 95341584, "step": 141475 }, { "epoch": 3.4563799379473776, "grad_norm": 0.000519787659868598, "learning_rate": 5.266180259272578e-07, "loss": 0.0, "num_input_tokens_seen": 95344720, "step": 141480 }, { "epoch": 3.4565020887792244, "grad_norm": 0.5096776485443115, "learning_rate": 5.265429102563012e-07, "loss": 0.0002, "num_input_tokens_seen": 95347728, "step": 141485 }, { "epoch": 3.4566242396110716, "grad_norm": 0.0044595482759177685, "learning_rate": 5.264677980284522e-07, "loss": 0.0, "num_input_tokens_seen": 95350928, "step": 141490 }, { "epoch": 3.4567463904429188, "grad_norm": 0.0008935718215070665, "learning_rate": 5.263926892442565e-07, "loss": 0.0, "num_input_tokens_seen": 95354000, "step": 141495 }, { "epoch": 3.456868541274766, "grad_norm": 0.0032971524633467197, "learning_rate": 5.263175839042604e-07, "loss": 0.0, "num_input_tokens_seen": 95357328, "step": 141500 }, { "epoch": 3.456990692106613, "grad_norm": 0.07441108673810959, "learning_rate": 5.262424820090108e-07, "loss": 0.0, "num_input_tokens_seen": 95360592, "step": 141505 }, { "epoch": 3.4571128429384603, "grad_norm": 0.0006337398081086576, "learning_rate": 5.261673835590527e-07, "loss": 0.0002, "num_input_tokens_seen": 95363664, "step": 141510 }, { "epoch": 3.4572349937703075, "grad_norm": 0.005729390308260918, "learning_rate": 5.260922885549333e-07, "loss": 0.0, "num_input_tokens_seen": 95366736, "step": 141515 }, { "epoch": 3.4573571446021547, "grad_norm": 0.00012647430412471294, "learning_rate": 5.260171969971981e-07, "loss": 0.0004, "num_input_tokens_seen": 95370256, "step": 141520 }, { "epoch": 3.457479295434002, "grad_norm": 0.001564518897794187, "learning_rate": 5.259421088863934e-07, "loss": 0.0, "num_input_tokens_seen": 95373392, "step": 141525 }, { "epoch": 3.457601446265849, "grad_norm": 0.01004080194979906, "learning_rate": 5.258670242230653e-07, "loss": 0.0, "num_input_tokens_seen": 95376784, "step": 141530 }, { "epoch": 3.4577235970976963, "grad_norm": 0.0010182727128267288, "learning_rate": 5.257919430077594e-07, "loss": 0.0, "num_input_tokens_seen": 95380048, "step": 141535 }, { "epoch": 3.4578457479295435, "grad_norm": 0.010390793904662132, "learning_rate": 5.257168652410223e-07, "loss": 0.0, "num_input_tokens_seen": 95383440, "step": 141540 }, { "epoch": 3.4579678987613907, "grad_norm": 0.22023968398571014, "learning_rate": 5.256417909233993e-07, "loss": 0.0002, "num_input_tokens_seen": 95387088, "step": 141545 }, { "epoch": 3.458090049593238, "grad_norm": 0.003728673327714205, "learning_rate": 5.255667200554372e-07, "loss": 0.0, "num_input_tokens_seen": 95390544, "step": 141550 }, { "epoch": 3.458212200425085, "grad_norm": 0.04721221327781677, "learning_rate": 5.254916526376813e-07, "loss": 0.0, "num_input_tokens_seen": 95394256, "step": 141555 }, { "epoch": 3.4583343512569322, "grad_norm": 0.05539275333285332, "learning_rate": 5.254165886706776e-07, "loss": 0.0, "num_input_tokens_seen": 95397584, "step": 141560 }, { "epoch": 3.4584565020887794, "grad_norm": 0.0006743664271198213, "learning_rate": 5.253415281549726e-07, "loss": 0.0, "num_input_tokens_seen": 95401232, "step": 141565 }, { "epoch": 3.458578652920626, "grad_norm": 0.2516691982746124, "learning_rate": 5.252664710911112e-07, "loss": 0.0395, "num_input_tokens_seen": 95404176, "step": 141570 }, { "epoch": 3.458700803752474, "grad_norm": 0.0004871607816312462, "learning_rate": 5.251914174796401e-07, "loss": 0.1123, "num_input_tokens_seen": 95407952, "step": 141575 }, { "epoch": 3.4588229545843205, "grad_norm": 0.004576869774609804, "learning_rate": 5.251163673211043e-07, "loss": 0.0645, "num_input_tokens_seen": 95411216, "step": 141580 }, { "epoch": 3.4589451054161677, "grad_norm": 0.05273722484707832, "learning_rate": 5.250413206160505e-07, "loss": 0.0, "num_input_tokens_seen": 95414480, "step": 141585 }, { "epoch": 3.459067256248015, "grad_norm": 0.0011433085892349482, "learning_rate": 5.249662773650235e-07, "loss": 0.0, "num_input_tokens_seen": 95417808, "step": 141590 }, { "epoch": 3.459189407079862, "grad_norm": 0.0031519471667706966, "learning_rate": 5.248912375685694e-07, "loss": 0.0, "num_input_tokens_seen": 95421264, "step": 141595 }, { "epoch": 3.4593115579117093, "grad_norm": 0.004848890472203493, "learning_rate": 5.248162012272345e-07, "loss": 0.0, "num_input_tokens_seen": 95424272, "step": 141600 }, { "epoch": 3.4594337087435565, "grad_norm": 0.009992300532758236, "learning_rate": 5.247411683415637e-07, "loss": 0.0, "num_input_tokens_seen": 95427728, "step": 141605 }, { "epoch": 3.4595558595754037, "grad_norm": 7.762262976029888e-05, "learning_rate": 5.246661389121032e-07, "loss": 0.0, "num_input_tokens_seen": 95430992, "step": 141610 }, { "epoch": 3.459678010407251, "grad_norm": 0.006131031550467014, "learning_rate": 5.24591112939398e-07, "loss": 0.0, "num_input_tokens_seen": 95434448, "step": 141615 }, { "epoch": 3.459800161239098, "grad_norm": 0.00018021617142949253, "learning_rate": 5.245160904239947e-07, "loss": 0.0, "num_input_tokens_seen": 95438032, "step": 141620 }, { "epoch": 3.4599223120709452, "grad_norm": 0.0003632458974607289, "learning_rate": 5.244410713664377e-07, "loss": 0.0, "num_input_tokens_seen": 95441040, "step": 141625 }, { "epoch": 3.4600444629027924, "grad_norm": 0.02345282770693302, "learning_rate": 5.243660557672736e-07, "loss": 0.0, "num_input_tokens_seen": 95444816, "step": 141630 }, { "epoch": 3.4601666137346396, "grad_norm": 0.004612599965184927, "learning_rate": 5.242910436270474e-07, "loss": 0.0, "num_input_tokens_seen": 95447696, "step": 141635 }, { "epoch": 3.460288764566487, "grad_norm": 2.76730479527032e-05, "learning_rate": 5.242160349463043e-07, "loss": 0.0, "num_input_tokens_seen": 95450896, "step": 141640 }, { "epoch": 3.460410915398334, "grad_norm": 106.46083068847656, "learning_rate": 5.241410297255906e-07, "loss": 0.0738, "num_input_tokens_seen": 95454096, "step": 141645 }, { "epoch": 3.460533066230181, "grad_norm": 0.0009458428248763084, "learning_rate": 5.240660279654509e-07, "loss": 0.0, "num_input_tokens_seen": 95457360, "step": 141650 }, { "epoch": 3.460655217062028, "grad_norm": 884.1512451171875, "learning_rate": 5.239910296664311e-07, "loss": 0.0108, "num_input_tokens_seen": 95460560, "step": 141655 }, { "epoch": 3.4607773678938756, "grad_norm": 0.18908710777759552, "learning_rate": 5.239160348290769e-07, "loss": 0.0001, "num_input_tokens_seen": 95464400, "step": 141660 }, { "epoch": 3.4608995187257223, "grad_norm": 0.019987985491752625, "learning_rate": 5.238410434539329e-07, "loss": 0.0395, "num_input_tokens_seen": 95467792, "step": 141665 }, { "epoch": 3.4610216695575695, "grad_norm": 0.002345374319702387, "learning_rate": 5.237660555415454e-07, "loss": 0.0, "num_input_tokens_seen": 95470928, "step": 141670 }, { "epoch": 3.4611438203894167, "grad_norm": 0.030504705384373665, "learning_rate": 5.236910710924588e-07, "loss": 0.0003, "num_input_tokens_seen": 95474128, "step": 141675 }, { "epoch": 3.461265971221264, "grad_norm": 0.0014592435909435153, "learning_rate": 5.236160901072193e-07, "loss": 0.0001, "num_input_tokens_seen": 95477712, "step": 141680 }, { "epoch": 3.461388122053111, "grad_norm": 0.0011629685759544373, "learning_rate": 5.235411125863713e-07, "loss": 0.0, "num_input_tokens_seen": 95480912, "step": 141685 }, { "epoch": 3.4615102728849583, "grad_norm": 0.0006028559291735291, "learning_rate": 5.234661385304603e-07, "loss": 0.0, "num_input_tokens_seen": 95483920, "step": 141690 }, { "epoch": 3.4616324237168055, "grad_norm": 0.000889910152181983, "learning_rate": 5.233911679400324e-07, "loss": 0.0, "num_input_tokens_seen": 95486992, "step": 141695 }, { "epoch": 3.4617545745486527, "grad_norm": 0.0008395583718083799, "learning_rate": 5.233162008156316e-07, "loss": 0.0, "num_input_tokens_seen": 95490448, "step": 141700 }, { "epoch": 3.4618767253805, "grad_norm": 0.06065589189529419, "learning_rate": 5.232412371578038e-07, "loss": 0.0, "num_input_tokens_seen": 95494032, "step": 141705 }, { "epoch": 3.461998876212347, "grad_norm": 0.009806448593735695, "learning_rate": 5.231662769670939e-07, "loss": 0.0, "num_input_tokens_seen": 95497616, "step": 141710 }, { "epoch": 3.462121027044194, "grad_norm": 0.0029943319968879223, "learning_rate": 5.230913202440469e-07, "loss": 0.0917, "num_input_tokens_seen": 95501328, "step": 141715 }, { "epoch": 3.4622431778760414, "grad_norm": 0.023564515635371208, "learning_rate": 5.230163669892085e-07, "loss": 0.0172, "num_input_tokens_seen": 95504784, "step": 141720 }, { "epoch": 3.4623653287078886, "grad_norm": 0.0008361052605323493, "learning_rate": 5.229414172031235e-07, "loss": 0.0, "num_input_tokens_seen": 95507920, "step": 141725 }, { "epoch": 3.462487479539736, "grad_norm": 0.0017536578234285116, "learning_rate": 5.228664708863362e-07, "loss": 0.0, "num_input_tokens_seen": 95511248, "step": 141730 }, { "epoch": 3.462609630371583, "grad_norm": 0.7590802907943726, "learning_rate": 5.227915280393928e-07, "loss": 0.0004, "num_input_tokens_seen": 95514256, "step": 141735 }, { "epoch": 3.46273178120343, "grad_norm": 0.007260952610522509, "learning_rate": 5.227165886628373e-07, "loss": 0.0, "num_input_tokens_seen": 95517392, "step": 141740 }, { "epoch": 3.4628539320352774, "grad_norm": 0.014699455350637436, "learning_rate": 5.226416527572157e-07, "loss": 0.0, "num_input_tokens_seen": 95520976, "step": 141745 }, { "epoch": 3.462976082867124, "grad_norm": 0.0004249770427122712, "learning_rate": 5.225667203230719e-07, "loss": 0.0, "num_input_tokens_seen": 95524496, "step": 141750 }, { "epoch": 3.4630982336989717, "grad_norm": 0.011417384259402752, "learning_rate": 5.224917913609517e-07, "loss": 0.0, "num_input_tokens_seen": 95528208, "step": 141755 }, { "epoch": 3.4632203845308185, "grad_norm": 0.022379083558917046, "learning_rate": 5.224168658713992e-07, "loss": 0.0, "num_input_tokens_seen": 95531856, "step": 141760 }, { "epoch": 3.4633425353626657, "grad_norm": 0.00799702201038599, "learning_rate": 5.223419438549601e-07, "loss": 0.0, "num_input_tokens_seen": 95535120, "step": 141765 }, { "epoch": 3.463464686194513, "grad_norm": 0.0009740145178511739, "learning_rate": 5.222670253121786e-07, "loss": 0.0, "num_input_tokens_seen": 95538192, "step": 141770 }, { "epoch": 3.46358683702636, "grad_norm": 0.000678222393617034, "learning_rate": 5.221921102435995e-07, "loss": 0.0, "num_input_tokens_seen": 95541200, "step": 141775 }, { "epoch": 3.4637089878582072, "grad_norm": 0.008515672758221626, "learning_rate": 5.221171986497686e-07, "loss": 0.0001, "num_input_tokens_seen": 95544784, "step": 141780 }, { "epoch": 3.4638311386900544, "grad_norm": 0.000750542851164937, "learning_rate": 5.220422905312293e-07, "loss": 0.0406, "num_input_tokens_seen": 95548304, "step": 141785 }, { "epoch": 3.4639532895219016, "grad_norm": 0.00016197515651583672, "learning_rate": 5.219673858885276e-07, "loss": 0.0, "num_input_tokens_seen": 95551312, "step": 141790 }, { "epoch": 3.464075440353749, "grad_norm": 0.004807916935533285, "learning_rate": 5.218924847222073e-07, "loss": 0.0, "num_input_tokens_seen": 95554640, "step": 141795 }, { "epoch": 3.464197591185596, "grad_norm": 0.01425571646541357, "learning_rate": 5.218175870328136e-07, "loss": 0.0, "num_input_tokens_seen": 95557520, "step": 141800 }, { "epoch": 3.464319742017443, "grad_norm": 0.03530505299568176, "learning_rate": 5.217426928208908e-07, "loss": 0.0, "num_input_tokens_seen": 95560592, "step": 141805 }, { "epoch": 3.4644418928492904, "grad_norm": 0.0017172127263620496, "learning_rate": 5.216678020869838e-07, "loss": 0.0, "num_input_tokens_seen": 95563408, "step": 141810 }, { "epoch": 3.4645640436811376, "grad_norm": 0.290860652923584, "learning_rate": 5.215929148316376e-07, "loss": 0.0001, "num_input_tokens_seen": 95566992, "step": 141815 }, { "epoch": 3.4646861945129848, "grad_norm": 0.04317009821534157, "learning_rate": 5.215180310553959e-07, "loss": 0.0, "num_input_tokens_seen": 95570384, "step": 141820 }, { "epoch": 3.464808345344832, "grad_norm": 0.027695301920175552, "learning_rate": 5.214431507588042e-07, "loss": 0.0781, "num_input_tokens_seen": 95573648, "step": 141825 }, { "epoch": 3.464930496176679, "grad_norm": 3.70340421795845e-05, "learning_rate": 5.213682739424063e-07, "loss": 0.0, "num_input_tokens_seen": 95577232, "step": 141830 }, { "epoch": 3.465052647008526, "grad_norm": 0.0006201051292009652, "learning_rate": 5.212934006067474e-07, "loss": 0.0, "num_input_tokens_seen": 95580752, "step": 141835 }, { "epoch": 3.4651747978403735, "grad_norm": 0.028088655322790146, "learning_rate": 5.212185307523716e-07, "loss": 0.0, "num_input_tokens_seen": 95583888, "step": 141840 }, { "epoch": 3.4652969486722203, "grad_norm": 0.025718344375491142, "learning_rate": 5.211436643798231e-07, "loss": 0.0, "num_input_tokens_seen": 95587536, "step": 141845 }, { "epoch": 3.4654190995040675, "grad_norm": 0.006959665101021528, "learning_rate": 5.21068801489647e-07, "loss": 0.0236, "num_input_tokens_seen": 95590992, "step": 141850 }, { "epoch": 3.4655412503359146, "grad_norm": 0.0032126219011843204, "learning_rate": 5.20993942082387e-07, "loss": 0.0, "num_input_tokens_seen": 95594832, "step": 141855 }, { "epoch": 3.465663401167762, "grad_norm": 0.0005849121371284127, "learning_rate": 5.209190861585883e-07, "loss": 0.0001, "num_input_tokens_seen": 95598352, "step": 141860 }, { "epoch": 3.465785551999609, "grad_norm": 0.09262830764055252, "learning_rate": 5.208442337187945e-07, "loss": 0.0001, "num_input_tokens_seen": 95601616, "step": 141865 }, { "epoch": 3.465907702831456, "grad_norm": 0.004266612231731415, "learning_rate": 5.207693847635503e-07, "loss": 0.0, "num_input_tokens_seen": 95605072, "step": 141870 }, { "epoch": 3.4660298536633034, "grad_norm": 0.0007697442197240889, "learning_rate": 5.206945392934004e-07, "loss": 0.0, "num_input_tokens_seen": 95608912, "step": 141875 }, { "epoch": 3.4661520044951506, "grad_norm": 0.00639304518699646, "learning_rate": 5.206196973088884e-07, "loss": 0.0096, "num_input_tokens_seen": 95614480, "step": 141880 }, { "epoch": 3.466274155326998, "grad_norm": 0.0009124985663220286, "learning_rate": 5.205448588105592e-07, "loss": 0.0477, "num_input_tokens_seen": 95618192, "step": 141885 }, { "epoch": 3.466396306158845, "grad_norm": 0.048047225922346115, "learning_rate": 5.204700237989563e-07, "loss": 0.0, "num_input_tokens_seen": 95620944, "step": 141890 }, { "epoch": 3.466518456990692, "grad_norm": 0.007053458597511053, "learning_rate": 5.203951922746249e-07, "loss": 0.0, "num_input_tokens_seen": 95624400, "step": 141895 }, { "epoch": 3.4666406078225394, "grad_norm": 0.009079281240701675, "learning_rate": 5.203203642381083e-07, "loss": 0.0, "num_input_tokens_seen": 95627664, "step": 141900 }, { "epoch": 3.4667627586543865, "grad_norm": 0.0011568315094336867, "learning_rate": 5.202455396899508e-07, "loss": 0.0, "num_input_tokens_seen": 95630928, "step": 141905 }, { "epoch": 3.4668849094862337, "grad_norm": 8.400007209274918e-05, "learning_rate": 5.201707186306974e-07, "loss": 0.0, "num_input_tokens_seen": 95633808, "step": 141910 }, { "epoch": 3.467007060318081, "grad_norm": 0.004403356928378344, "learning_rate": 5.20095901060891e-07, "loss": 0.0, "num_input_tokens_seen": 95637648, "step": 141915 }, { "epoch": 3.467129211149928, "grad_norm": 0.0006242926465347409, "learning_rate": 5.200210869810768e-07, "loss": 0.0002, "num_input_tokens_seen": 95641296, "step": 141920 }, { "epoch": 3.4672513619817753, "grad_norm": 0.40456339716911316, "learning_rate": 5.19946276391798e-07, "loss": 0.0001, "num_input_tokens_seen": 95644624, "step": 141925 }, { "epoch": 3.467373512813622, "grad_norm": 0.0003231973387300968, "learning_rate": 5.198714692935993e-07, "loss": 0.0, "num_input_tokens_seen": 95647888, "step": 141930 }, { "epoch": 3.4674956636454692, "grad_norm": 0.020619744434952736, "learning_rate": 5.19796665687024e-07, "loss": 0.0, "num_input_tokens_seen": 95651216, "step": 141935 }, { "epoch": 3.4676178144773164, "grad_norm": 0.011790621094405651, "learning_rate": 5.197218655726171e-07, "loss": 0.0, "num_input_tokens_seen": 95654544, "step": 141940 }, { "epoch": 3.4677399653091636, "grad_norm": 0.0007443840149790049, "learning_rate": 5.196470689509218e-07, "loss": 0.0, "num_input_tokens_seen": 95658192, "step": 141945 }, { "epoch": 3.467862116141011, "grad_norm": 0.01050039753317833, "learning_rate": 5.195722758224819e-07, "loss": 0.0, "num_input_tokens_seen": 95662352, "step": 141950 }, { "epoch": 3.467984266972858, "grad_norm": 0.005819221027195454, "learning_rate": 5.19497486187842e-07, "loss": 0.0, "num_input_tokens_seen": 95665744, "step": 141955 }, { "epoch": 3.468106417804705, "grad_norm": 0.0011456216452643275, "learning_rate": 5.19422700047545e-07, "loss": 0.0, "num_input_tokens_seen": 95669200, "step": 141960 }, { "epoch": 3.4682285686365524, "grad_norm": 0.15970462560653687, "learning_rate": 5.193479174021358e-07, "loss": 0.0, "num_input_tokens_seen": 95672592, "step": 141965 }, { "epoch": 3.4683507194683996, "grad_norm": 0.0024676870089024305, "learning_rate": 5.19273138252158e-07, "loss": 0.0675, "num_input_tokens_seen": 95676368, "step": 141970 }, { "epoch": 3.4684728703002468, "grad_norm": 0.002797144465148449, "learning_rate": 5.191983625981549e-07, "loss": 0.0, "num_input_tokens_seen": 95679632, "step": 141975 }, { "epoch": 3.468595021132094, "grad_norm": 0.05274736136198044, "learning_rate": 5.191235904406709e-07, "loss": 0.0039, "num_input_tokens_seen": 95683088, "step": 141980 }, { "epoch": 3.468717171963941, "grad_norm": 0.011775636114180088, "learning_rate": 5.190488217802492e-07, "loss": 0.0, "num_input_tokens_seen": 95686672, "step": 141985 }, { "epoch": 3.4688393227957883, "grad_norm": 0.0019723197910934687, "learning_rate": 5.189740566174341e-07, "loss": 0.0, "num_input_tokens_seen": 95690512, "step": 141990 }, { "epoch": 3.4689614736276355, "grad_norm": 0.0013073725858703256, "learning_rate": 5.188992949527688e-07, "loss": 0.0399, "num_input_tokens_seen": 95694096, "step": 141995 }, { "epoch": 3.4690836244594827, "grad_norm": 0.009154350496828556, "learning_rate": 5.188245367867971e-07, "loss": 0.0009, "num_input_tokens_seen": 95697296, "step": 142000 }, { "epoch": 3.46920577529133, "grad_norm": 19.28621482849121, "learning_rate": 5.187497821200633e-07, "loss": 0.0556, "num_input_tokens_seen": 95700688, "step": 142005 }, { "epoch": 3.469327926123177, "grad_norm": 0.016265923157334328, "learning_rate": 5.1867503095311e-07, "loss": 0.0, "num_input_tokens_seen": 95704080, "step": 142010 }, { "epoch": 3.469450076955024, "grad_norm": 0.0036928714253008366, "learning_rate": 5.186002832864819e-07, "loss": 0.0399, "num_input_tokens_seen": 95707408, "step": 142015 }, { "epoch": 3.4695722277868715, "grad_norm": 3.3985830668825656e-05, "learning_rate": 5.185255391207215e-07, "loss": 0.0001, "num_input_tokens_seen": 95710800, "step": 142020 }, { "epoch": 3.469694378618718, "grad_norm": 0.009393775835633278, "learning_rate": 5.18450798456373e-07, "loss": 0.0, "num_input_tokens_seen": 95714192, "step": 142025 }, { "epoch": 3.4698165294505654, "grad_norm": 0.001170952571555972, "learning_rate": 5.1837606129398e-07, "loss": 0.0, "num_input_tokens_seen": 95717456, "step": 142030 }, { "epoch": 3.4699386802824126, "grad_norm": 0.0034615262411534786, "learning_rate": 5.183013276340859e-07, "loss": 0.0, "num_input_tokens_seen": 95720528, "step": 142035 }, { "epoch": 3.47006083111426, "grad_norm": 0.005982245784252882, "learning_rate": 5.182265974772339e-07, "loss": 0.0001, "num_input_tokens_seen": 95724048, "step": 142040 }, { "epoch": 3.470182981946107, "grad_norm": 0.001264628954231739, "learning_rate": 5.181518708239679e-07, "loss": 0.0444, "num_input_tokens_seen": 95727312, "step": 142045 }, { "epoch": 3.470305132777954, "grad_norm": 0.0009982496267184615, "learning_rate": 5.180771476748307e-07, "loss": 0.0, "num_input_tokens_seen": 95730640, "step": 142050 }, { "epoch": 3.4704272836098013, "grad_norm": 0.0025757618714123964, "learning_rate": 5.180024280303665e-07, "loss": 0.0, "num_input_tokens_seen": 95733648, "step": 142055 }, { "epoch": 3.4705494344416485, "grad_norm": 0.00035017167101614177, "learning_rate": 5.17927711891118e-07, "loss": 0.0, "num_input_tokens_seen": 95737232, "step": 142060 }, { "epoch": 3.4706715852734957, "grad_norm": 2.5826502678683028e-05, "learning_rate": 5.178529992576291e-07, "loss": 0.0079, "num_input_tokens_seen": 95740368, "step": 142065 }, { "epoch": 3.470793736105343, "grad_norm": 3.1048228265717626e-05, "learning_rate": 5.177782901304426e-07, "loss": 0.0, "num_input_tokens_seen": 95743568, "step": 142070 }, { "epoch": 3.47091588693719, "grad_norm": 15.827798843383789, "learning_rate": 5.177035845101023e-07, "loss": 0.0422, "num_input_tokens_seen": 95747088, "step": 142075 }, { "epoch": 3.4710380377690373, "grad_norm": 0.003537968033924699, "learning_rate": 5.176288823971511e-07, "loss": 0.0001, "num_input_tokens_seen": 95750480, "step": 142080 }, { "epoch": 3.4711601886008845, "grad_norm": 0.0037160757929086685, "learning_rate": 5.175541837921326e-07, "loss": 0.0, "num_input_tokens_seen": 95753488, "step": 142085 }, { "epoch": 3.4712823394327317, "grad_norm": 0.0007948851562105119, "learning_rate": 5.174794886955895e-07, "loss": 0.0001, "num_input_tokens_seen": 95756880, "step": 142090 }, { "epoch": 3.471404490264579, "grad_norm": 0.0015356955118477345, "learning_rate": 5.174047971080653e-07, "loss": 0.0, "num_input_tokens_seen": 95760144, "step": 142095 }, { "epoch": 3.471526641096426, "grad_norm": 0.0012752095935866237, "learning_rate": 5.173301090301036e-07, "loss": 0.0002, "num_input_tokens_seen": 95764112, "step": 142100 }, { "epoch": 3.4716487919282732, "grad_norm": 0.071896031498909, "learning_rate": 5.172554244622469e-07, "loss": 0.0, "num_input_tokens_seen": 95767568, "step": 142105 }, { "epoch": 3.47177094276012, "grad_norm": 0.0005444808048196137, "learning_rate": 5.171807434050389e-07, "loss": 0.0, "num_input_tokens_seen": 95771216, "step": 142110 }, { "epoch": 3.471893093591967, "grad_norm": 0.01946081407368183, "learning_rate": 5.17106065859022e-07, "loss": 0.012, "num_input_tokens_seen": 95774352, "step": 142115 }, { "epoch": 3.4720152444238144, "grad_norm": 0.021580923348665237, "learning_rate": 5.170313918247397e-07, "loss": 0.0, "num_input_tokens_seen": 95778000, "step": 142120 }, { "epoch": 3.4721373952556616, "grad_norm": 0.005905607715249062, "learning_rate": 5.169567213027355e-07, "loss": 0.0001, "num_input_tokens_seen": 95781456, "step": 142125 }, { "epoch": 3.4722595460875088, "grad_norm": 0.003950274549424648, "learning_rate": 5.168820542935514e-07, "loss": 0.0001, "num_input_tokens_seen": 95785168, "step": 142130 }, { "epoch": 3.472381696919356, "grad_norm": 0.0005195545381866395, "learning_rate": 5.168073907977315e-07, "loss": 0.0, "num_input_tokens_seen": 95790480, "step": 142135 }, { "epoch": 3.472503847751203, "grad_norm": 0.10520616173744202, "learning_rate": 5.167327308158177e-07, "loss": 0.0, "num_input_tokens_seen": 95793744, "step": 142140 }, { "epoch": 3.4726259985830503, "grad_norm": 0.0003425517934374511, "learning_rate": 5.166580743483539e-07, "loss": 0.0, "num_input_tokens_seen": 95797200, "step": 142145 }, { "epoch": 3.4727481494148975, "grad_norm": 0.0060327257961034775, "learning_rate": 5.165834213958825e-07, "loss": 0.0, "num_input_tokens_seen": 95800528, "step": 142150 }, { "epoch": 3.4728703002467447, "grad_norm": 0.019918235018849373, "learning_rate": 5.165087719589462e-07, "loss": 0.0938, "num_input_tokens_seen": 95803728, "step": 142155 }, { "epoch": 3.472992451078592, "grad_norm": 0.49219003319740295, "learning_rate": 5.164341260380885e-07, "loss": 0.0001, "num_input_tokens_seen": 95807248, "step": 142160 }, { "epoch": 3.473114601910439, "grad_norm": 0.008874580264091492, "learning_rate": 5.163594836338515e-07, "loss": 0.0001, "num_input_tokens_seen": 95810320, "step": 142165 }, { "epoch": 3.4732367527422863, "grad_norm": 0.0008690126123838127, "learning_rate": 5.162848447467789e-07, "loss": 0.0524, "num_input_tokens_seen": 95813712, "step": 142170 }, { "epoch": 3.4733589035741335, "grad_norm": 0.0009398649563081563, "learning_rate": 5.162102093774126e-07, "loss": 0.0382, "num_input_tokens_seen": 95817232, "step": 142175 }, { "epoch": 3.4734810544059807, "grad_norm": 111.28203582763672, "learning_rate": 5.161355775262957e-07, "loss": 0.0341, "num_input_tokens_seen": 95820432, "step": 142180 }, { "epoch": 3.473603205237828, "grad_norm": 0.008379044011235237, "learning_rate": 5.160609491939713e-07, "loss": 0.0, "num_input_tokens_seen": 95824272, "step": 142185 }, { "epoch": 3.473725356069675, "grad_norm": 0.050469011068344116, "learning_rate": 5.159863243809816e-07, "loss": 0.0001, "num_input_tokens_seen": 95828112, "step": 142190 }, { "epoch": 3.4738475069015218, "grad_norm": 0.006850233767181635, "learning_rate": 5.159117030878699e-07, "loss": 0.0, "num_input_tokens_seen": 95831376, "step": 142195 }, { "epoch": 3.4739696577333694, "grad_norm": 0.0007184845162555575, "learning_rate": 5.158370853151783e-07, "loss": 0.0, "num_input_tokens_seen": 95834640, "step": 142200 }, { "epoch": 3.474091808565216, "grad_norm": 0.0031210724264383316, "learning_rate": 5.157624710634499e-07, "loss": 0.0005, "num_input_tokens_seen": 95838416, "step": 142205 }, { "epoch": 3.4742139593970633, "grad_norm": 0.0003850772918667644, "learning_rate": 5.156878603332265e-07, "loss": 0.0, "num_input_tokens_seen": 95841296, "step": 142210 }, { "epoch": 3.4743361102289105, "grad_norm": 0.13583695888519287, "learning_rate": 5.156132531250515e-07, "loss": 0.0, "num_input_tokens_seen": 95844880, "step": 142215 }, { "epoch": 3.4744582610607577, "grad_norm": 0.0015774507774040103, "learning_rate": 5.155386494394674e-07, "loss": 0.0, "num_input_tokens_seen": 95848528, "step": 142220 }, { "epoch": 3.474580411892605, "grad_norm": 0.023258036002516747, "learning_rate": 5.154640492770161e-07, "loss": 0.0, "num_input_tokens_seen": 95851920, "step": 142225 }, { "epoch": 3.474702562724452, "grad_norm": 0.0010493238223716617, "learning_rate": 5.153894526382412e-07, "loss": 0.0, "num_input_tokens_seen": 95855440, "step": 142230 }, { "epoch": 3.4748247135562993, "grad_norm": 0.001293657929636538, "learning_rate": 5.153148595236839e-07, "loss": 0.0001, "num_input_tokens_seen": 95859088, "step": 142235 }, { "epoch": 3.4749468643881465, "grad_norm": 0.11125042289495468, "learning_rate": 5.152402699338878e-07, "loss": 0.0001, "num_input_tokens_seen": 95862032, "step": 142240 }, { "epoch": 3.4750690152199937, "grad_norm": 0.00025333481607958674, "learning_rate": 5.151656838693945e-07, "loss": 0.0, "num_input_tokens_seen": 95865616, "step": 142245 }, { "epoch": 3.475191166051841, "grad_norm": 0.0008209923398680985, "learning_rate": 5.150911013307471e-07, "loss": 0.0, "num_input_tokens_seen": 95869264, "step": 142250 }, { "epoch": 3.475313316883688, "grad_norm": 0.00028186949202790856, "learning_rate": 5.150165223184877e-07, "loss": 0.0, "num_input_tokens_seen": 95873488, "step": 142255 }, { "epoch": 3.4754354677155352, "grad_norm": 0.001031126594170928, "learning_rate": 5.149419468331582e-07, "loss": 0.0754, "num_input_tokens_seen": 95876944, "step": 142260 }, { "epoch": 3.4755576185473824, "grad_norm": 41.8818244934082, "learning_rate": 5.148673748753017e-07, "loss": 0.0454, "num_input_tokens_seen": 95880080, "step": 142265 }, { "epoch": 3.4756797693792296, "grad_norm": 22.44657325744629, "learning_rate": 5.147928064454597e-07, "loss": 0.1032, "num_input_tokens_seen": 95883472, "step": 142270 }, { "epoch": 3.475801920211077, "grad_norm": 0.0021455264650285244, "learning_rate": 5.147182415441749e-07, "loss": 0.0, "num_input_tokens_seen": 95886992, "step": 142275 }, { "epoch": 3.4759240710429236, "grad_norm": 0.0005806908593513072, "learning_rate": 5.1464368017199e-07, "loss": 0.0422, "num_input_tokens_seen": 95890384, "step": 142280 }, { "epoch": 3.476046221874771, "grad_norm": 0.0041892873123288155, "learning_rate": 5.145691223294464e-07, "loss": 0.0001, "num_input_tokens_seen": 95893712, "step": 142285 }, { "epoch": 3.476168372706618, "grad_norm": 0.005277763120830059, "learning_rate": 5.144945680170871e-07, "loss": 0.0001, "num_input_tokens_seen": 95897168, "step": 142290 }, { "epoch": 3.476290523538465, "grad_norm": 0.009297845885157585, "learning_rate": 5.144200172354534e-07, "loss": 0.0, "num_input_tokens_seen": 95900624, "step": 142295 }, { "epoch": 3.4764126743703123, "grad_norm": 0.01249323133379221, "learning_rate": 5.143454699850884e-07, "loss": 0.0002, "num_input_tokens_seen": 95904016, "step": 142300 }, { "epoch": 3.4765348252021595, "grad_norm": 0.002345341257750988, "learning_rate": 5.142709262665334e-07, "loss": 0.0001, "num_input_tokens_seen": 95907600, "step": 142305 }, { "epoch": 3.4766569760340067, "grad_norm": 0.008824708871543407, "learning_rate": 5.14196386080331e-07, "loss": 0.0, "num_input_tokens_seen": 95910736, "step": 142310 }, { "epoch": 3.476779126865854, "grad_norm": 0.009206004440784454, "learning_rate": 5.141218494270234e-07, "loss": 0.0, "num_input_tokens_seen": 95914640, "step": 142315 }, { "epoch": 3.476901277697701, "grad_norm": 0.09007237106561661, "learning_rate": 5.14047316307152e-07, "loss": 0.0001, "num_input_tokens_seen": 95918032, "step": 142320 }, { "epoch": 3.4770234285295483, "grad_norm": 0.0011511669727042317, "learning_rate": 5.139727867212596e-07, "loss": 0.0, "num_input_tokens_seen": 95921808, "step": 142325 }, { "epoch": 3.4771455793613955, "grad_norm": 0.014105385169386864, "learning_rate": 5.138982606698876e-07, "loss": 0.0026, "num_input_tokens_seen": 95925328, "step": 142330 }, { "epoch": 3.4772677301932426, "grad_norm": 0.00038664639578200877, "learning_rate": 5.13823738153578e-07, "loss": 0.0, "num_input_tokens_seen": 95928528, "step": 142335 }, { "epoch": 3.47738988102509, "grad_norm": 0.0021773031912744045, "learning_rate": 5.137492191728734e-07, "loss": 0.0695, "num_input_tokens_seen": 95931408, "step": 142340 }, { "epoch": 3.477512031856937, "grad_norm": 0.0014116725651547313, "learning_rate": 5.136747037283149e-07, "loss": 0.0, "num_input_tokens_seen": 95935120, "step": 142345 }, { "epoch": 3.477634182688784, "grad_norm": 0.030428191646933556, "learning_rate": 5.136001918204451e-07, "loss": 0.0439, "num_input_tokens_seen": 95938576, "step": 142350 }, { "epoch": 3.4777563335206314, "grad_norm": 0.0033477810211479664, "learning_rate": 5.135256834498054e-07, "loss": 0.0, "num_input_tokens_seen": 95942096, "step": 142355 }, { "epoch": 3.4778784843524786, "grad_norm": 8.172683010343462e-05, "learning_rate": 5.134511786169376e-07, "loss": 0.0, "num_input_tokens_seen": 95945360, "step": 142360 }, { "epoch": 3.478000635184326, "grad_norm": 0.006972254253923893, "learning_rate": 5.133766773223839e-07, "loss": 0.0, "num_input_tokens_seen": 95948688, "step": 142365 }, { "epoch": 3.478122786016173, "grad_norm": 0.0022087888792157173, "learning_rate": 5.133021795666858e-07, "loss": 0.0002, "num_input_tokens_seen": 95952144, "step": 142370 }, { "epoch": 3.4782449368480197, "grad_norm": 0.001995604485273361, "learning_rate": 5.132276853503853e-07, "loss": 0.0, "num_input_tokens_seen": 95955472, "step": 142375 }, { "epoch": 3.478367087679867, "grad_norm": 0.009456205181777477, "learning_rate": 5.131531946740238e-07, "loss": 0.0, "num_input_tokens_seen": 95958736, "step": 142380 }, { "epoch": 3.478489238511714, "grad_norm": 3.635671964730136e-05, "learning_rate": 5.130787075381433e-07, "loss": 0.0001, "num_input_tokens_seen": 95962256, "step": 142385 }, { "epoch": 3.4786113893435613, "grad_norm": 0.0007822351763024926, "learning_rate": 5.130042239432853e-07, "loss": 0.0, "num_input_tokens_seen": 95965328, "step": 142390 }, { "epoch": 3.4787335401754085, "grad_norm": 0.0020497075747698545, "learning_rate": 5.129297438899918e-07, "loss": 0.0, "num_input_tokens_seen": 95968848, "step": 142395 }, { "epoch": 3.4788556910072557, "grad_norm": 0.0031505257356911898, "learning_rate": 5.128552673788038e-07, "loss": 0.0, "num_input_tokens_seen": 95972176, "step": 142400 }, { "epoch": 3.478977841839103, "grad_norm": 0.006290497723966837, "learning_rate": 5.127807944102634e-07, "loss": 0.0, "num_input_tokens_seen": 95975056, "step": 142405 }, { "epoch": 3.47909999267095, "grad_norm": 0.004650265444070101, "learning_rate": 5.127063249849125e-07, "loss": 0.0, "num_input_tokens_seen": 95978704, "step": 142410 }, { "epoch": 3.4792221435027972, "grad_norm": 0.0017632690723985434, "learning_rate": 5.126318591032919e-07, "loss": 0.0166, "num_input_tokens_seen": 95982608, "step": 142415 }, { "epoch": 3.4793442943346444, "grad_norm": 0.0033946416806429625, "learning_rate": 5.125573967659437e-07, "loss": 0.0, "num_input_tokens_seen": 95986640, "step": 142420 }, { "epoch": 3.4794664451664916, "grad_norm": 0.003886127145960927, "learning_rate": 5.124829379734091e-07, "loss": 0.0001, "num_input_tokens_seen": 95990096, "step": 142425 }, { "epoch": 3.479588595998339, "grad_norm": 0.004299870692193508, "learning_rate": 5.124084827262297e-07, "loss": 0.0, "num_input_tokens_seen": 95993168, "step": 142430 }, { "epoch": 3.479710746830186, "grad_norm": 0.0008910160977393389, "learning_rate": 5.123340310249471e-07, "loss": 0.0, "num_input_tokens_seen": 95996816, "step": 142435 }, { "epoch": 3.479832897662033, "grad_norm": 0.00021660601487383246, "learning_rate": 5.122595828701024e-07, "loss": 0.0, "num_input_tokens_seen": 95999952, "step": 142440 }, { "epoch": 3.4799550484938804, "grad_norm": 0.0003907645004801452, "learning_rate": 5.121851382622375e-07, "loss": 0.0001, "num_input_tokens_seen": 96003408, "step": 142445 }, { "epoch": 3.4800771993257276, "grad_norm": 0.0013325160834938288, "learning_rate": 5.121106972018931e-07, "loss": 0.0, "num_input_tokens_seen": 96006800, "step": 142450 }, { "epoch": 3.4801993501575748, "grad_norm": 0.0025354530662298203, "learning_rate": 5.120362596896115e-07, "loss": 0.0, "num_input_tokens_seen": 96010384, "step": 142455 }, { "epoch": 3.4803215009894215, "grad_norm": 0.0005585558828897774, "learning_rate": 5.119618257259333e-07, "loss": 0.0, "num_input_tokens_seen": 96013712, "step": 142460 }, { "epoch": 3.480443651821269, "grad_norm": 0.0003892200766131282, "learning_rate": 5.118873953113995e-07, "loss": 0.0192, "num_input_tokens_seen": 96017168, "step": 142465 }, { "epoch": 3.480565802653116, "grad_norm": 0.0006179861375130713, "learning_rate": 5.118129684465524e-07, "loss": 0.0001, "num_input_tokens_seen": 96020688, "step": 142470 }, { "epoch": 3.480687953484963, "grad_norm": 0.0033679294865578413, "learning_rate": 5.117385451319322e-07, "loss": 0.0001, "num_input_tokens_seen": 96023632, "step": 142475 }, { "epoch": 3.4808101043168103, "grad_norm": 0.004632589407265186, "learning_rate": 5.116641253680811e-07, "loss": 0.0, "num_input_tokens_seen": 96027024, "step": 142480 }, { "epoch": 3.4809322551486575, "grad_norm": 0.0003384651499800384, "learning_rate": 5.115897091555394e-07, "loss": 0.0343, "num_input_tokens_seen": 96030224, "step": 142485 }, { "epoch": 3.4810544059805046, "grad_norm": 0.0016163821564987302, "learning_rate": 5.115152964948487e-07, "loss": 0.0, "num_input_tokens_seen": 96033616, "step": 142490 }, { "epoch": 3.481176556812352, "grad_norm": 0.002763167954981327, "learning_rate": 5.114408873865505e-07, "loss": 0.0, "num_input_tokens_seen": 96036944, "step": 142495 }, { "epoch": 3.481298707644199, "grad_norm": 0.17083217203617096, "learning_rate": 5.113664818311852e-07, "loss": 0.0, "num_input_tokens_seen": 96040272, "step": 142500 }, { "epoch": 3.481420858476046, "grad_norm": 0.03899205103516579, "learning_rate": 5.112920798292947e-07, "loss": 0.0, "num_input_tokens_seen": 96043792, "step": 142505 }, { "epoch": 3.4815430093078934, "grad_norm": 0.03535054251551628, "learning_rate": 5.112176813814193e-07, "loss": 0.0, "num_input_tokens_seen": 96046928, "step": 142510 }, { "epoch": 3.4816651601397406, "grad_norm": 0.00012026626791339368, "learning_rate": 5.111432864881007e-07, "loss": 0.0305, "num_input_tokens_seen": 96049936, "step": 142515 }, { "epoch": 3.481787310971588, "grad_norm": 0.005253299605101347, "learning_rate": 5.110688951498792e-07, "loss": 0.0, "num_input_tokens_seen": 96053520, "step": 142520 }, { "epoch": 3.481909461803435, "grad_norm": 0.15855756402015686, "learning_rate": 5.109945073672963e-07, "loss": 0.0, "num_input_tokens_seen": 96056528, "step": 142525 }, { "epoch": 3.482031612635282, "grad_norm": 4.489319690037519e-05, "learning_rate": 5.109201231408931e-07, "loss": 0.0001, "num_input_tokens_seen": 96060240, "step": 142530 }, { "epoch": 3.4821537634671293, "grad_norm": 0.0007414538995362818, "learning_rate": 5.1084574247121e-07, "loss": 0.0, "num_input_tokens_seen": 96064016, "step": 142535 }, { "epoch": 3.4822759142989765, "grad_norm": 0.005368458107113838, "learning_rate": 5.107713653587886e-07, "loss": 0.0, "num_input_tokens_seen": 96067472, "step": 142540 }, { "epoch": 3.4823980651308237, "grad_norm": 0.49003827571868896, "learning_rate": 5.106969918041692e-07, "loss": 0.0001, "num_input_tokens_seen": 96070736, "step": 142545 }, { "epoch": 3.482520215962671, "grad_norm": 0.009398811496794224, "learning_rate": 5.106226218078931e-07, "loss": 0.0, "num_input_tokens_seen": 96074192, "step": 142550 }, { "epoch": 3.4826423667945177, "grad_norm": 0.0036050749476999044, "learning_rate": 5.105482553705005e-07, "loss": 0.0321, "num_input_tokens_seen": 96077328, "step": 142555 }, { "epoch": 3.482764517626365, "grad_norm": 0.0015523568727076054, "learning_rate": 5.104738924925331e-07, "loss": 0.0, "num_input_tokens_seen": 96080464, "step": 142560 }, { "epoch": 3.482886668458212, "grad_norm": 0.0035881816875189543, "learning_rate": 5.103995331745313e-07, "loss": 0.0, "num_input_tokens_seen": 96083792, "step": 142565 }, { "epoch": 3.4830088192900592, "grad_norm": 0.004073690623044968, "learning_rate": 5.103251774170352e-07, "loss": 0.0, "num_input_tokens_seen": 96087376, "step": 142570 }, { "epoch": 3.4831309701219064, "grad_norm": 0.0024040727876126766, "learning_rate": 5.102508252205866e-07, "loss": 0.0, "num_input_tokens_seen": 96090448, "step": 142575 }, { "epoch": 3.4832531209537536, "grad_norm": 0.001509516965597868, "learning_rate": 5.101764765857254e-07, "loss": 0.0291, "num_input_tokens_seen": 96093584, "step": 142580 }, { "epoch": 3.483375271785601, "grad_norm": 0.054433681070804596, "learning_rate": 5.101021315129925e-07, "loss": 0.0, "num_input_tokens_seen": 96096976, "step": 142585 }, { "epoch": 3.483497422617448, "grad_norm": 0.00018331894534640014, "learning_rate": 5.10027790002929e-07, "loss": 0.0, "num_input_tokens_seen": 96100880, "step": 142590 }, { "epoch": 3.483619573449295, "grad_norm": 0.003632371546700597, "learning_rate": 5.099534520560751e-07, "loss": 0.0, "num_input_tokens_seen": 96104272, "step": 142595 }, { "epoch": 3.4837417242811424, "grad_norm": 0.00011348898988217115, "learning_rate": 5.098791176729716e-07, "loss": 0.0524, "num_input_tokens_seen": 96107216, "step": 142600 }, { "epoch": 3.4838638751129896, "grad_norm": 0.0023140916600823402, "learning_rate": 5.098047868541587e-07, "loss": 0.0284, "num_input_tokens_seen": 96110480, "step": 142605 }, { "epoch": 3.4839860259448368, "grad_norm": 0.00012285502452868968, "learning_rate": 5.097304596001777e-07, "loss": 0.0339, "num_input_tokens_seen": 96113936, "step": 142610 }, { "epoch": 3.484108176776684, "grad_norm": 0.001920671435073018, "learning_rate": 5.096561359115682e-07, "loss": 0.0001, "num_input_tokens_seen": 96117072, "step": 142615 }, { "epoch": 3.484230327608531, "grad_norm": 0.013230564072728157, "learning_rate": 5.095818157888712e-07, "loss": 0.0, "num_input_tokens_seen": 96120272, "step": 142620 }, { "epoch": 3.4843524784403783, "grad_norm": 2.9572129249572754, "learning_rate": 5.095074992326274e-07, "loss": 0.0531, "num_input_tokens_seen": 96123600, "step": 142625 }, { "epoch": 3.4844746292722255, "grad_norm": 0.03145867958664894, "learning_rate": 5.094331862433768e-07, "loss": 0.0, "num_input_tokens_seen": 96127376, "step": 142630 }, { "epoch": 3.4845967801040727, "grad_norm": 0.01226538885384798, "learning_rate": 5.093588768216602e-07, "loss": 0.0546, "num_input_tokens_seen": 96131088, "step": 142635 }, { "epoch": 3.4847189309359194, "grad_norm": 0.17331823706626892, "learning_rate": 5.092845709680176e-07, "loss": 0.0001, "num_input_tokens_seen": 96134224, "step": 142640 }, { "epoch": 3.484841081767767, "grad_norm": 0.005320474039763212, "learning_rate": 5.092102686829896e-07, "loss": 0.0, "num_input_tokens_seen": 96137296, "step": 142645 }, { "epoch": 3.484963232599614, "grad_norm": 0.0061260429210960865, "learning_rate": 5.091359699671168e-07, "loss": 0.0, "num_input_tokens_seen": 96140944, "step": 142650 }, { "epoch": 3.485085383431461, "grad_norm": 0.0017569613410159945, "learning_rate": 5.090616748209388e-07, "loss": 0.0, "num_input_tokens_seen": 96144464, "step": 142655 }, { "epoch": 3.485207534263308, "grad_norm": 0.000533196609467268, "learning_rate": 5.089873832449969e-07, "loss": 0.0, "num_input_tokens_seen": 96147792, "step": 142660 }, { "epoch": 3.4853296850951554, "grad_norm": 0.01179375872015953, "learning_rate": 5.089130952398308e-07, "loss": 0.0003, "num_input_tokens_seen": 96151312, "step": 142665 }, { "epoch": 3.4854518359270026, "grad_norm": 0.0049562300555408, "learning_rate": 5.088388108059802e-07, "loss": 0.0849, "num_input_tokens_seen": 96154640, "step": 142670 }, { "epoch": 3.4855739867588498, "grad_norm": 0.0013419613242149353, "learning_rate": 5.087645299439864e-07, "loss": 0.0001, "num_input_tokens_seen": 96158032, "step": 142675 }, { "epoch": 3.485696137590697, "grad_norm": 0.32624852657318115, "learning_rate": 5.086902526543889e-07, "loss": 0.0001, "num_input_tokens_seen": 96160976, "step": 142680 }, { "epoch": 3.485818288422544, "grad_norm": 0.00047253796947188675, "learning_rate": 5.08615978937728e-07, "loss": 0.0, "num_input_tokens_seen": 96164944, "step": 142685 }, { "epoch": 3.4859404392543913, "grad_norm": 0.22956159710884094, "learning_rate": 5.085417087945436e-07, "loss": 0.0001, "num_input_tokens_seen": 96168272, "step": 142690 }, { "epoch": 3.4860625900862385, "grad_norm": 0.2009653002023697, "learning_rate": 5.084674422253767e-07, "loss": 0.0, "num_input_tokens_seen": 96171984, "step": 142695 }, { "epoch": 3.4861847409180857, "grad_norm": 0.001368940225802362, "learning_rate": 5.083931792307661e-07, "loss": 0.0, "num_input_tokens_seen": 96175312, "step": 142700 }, { "epoch": 3.486306891749933, "grad_norm": 0.012675776146352291, "learning_rate": 5.08318919811253e-07, "loss": 0.049, "num_input_tokens_seen": 96178768, "step": 142705 }, { "epoch": 3.48642904258178, "grad_norm": 0.0061261216178536415, "learning_rate": 5.082446639673766e-07, "loss": 0.0, "num_input_tokens_seen": 96182224, "step": 142710 }, { "epoch": 3.4865511934136273, "grad_norm": 0.002247494412586093, "learning_rate": 5.081704116996773e-07, "loss": 0.0005, "num_input_tokens_seen": 96185808, "step": 142715 }, { "epoch": 3.4866733442454745, "grad_norm": 0.005125206429511309, "learning_rate": 5.080961630086954e-07, "loss": 0.001, "num_input_tokens_seen": 96189008, "step": 142720 }, { "epoch": 3.4867954950773212, "grad_norm": 0.011667453683912754, "learning_rate": 5.080219178949701e-07, "loss": 0.0157, "num_input_tokens_seen": 96192272, "step": 142725 }, { "epoch": 3.486917645909169, "grad_norm": 0.01347960066050291, "learning_rate": 5.079476763590422e-07, "loss": 0.0, "num_input_tokens_seen": 96195856, "step": 142730 }, { "epoch": 3.4870397967410156, "grad_norm": 0.008868924342095852, "learning_rate": 5.078734384014507e-07, "loss": 0.0663, "num_input_tokens_seen": 96199184, "step": 142735 }, { "epoch": 3.487161947572863, "grad_norm": 0.15017327666282654, "learning_rate": 5.07799204022736e-07, "loss": 0.0001, "num_input_tokens_seen": 96202320, "step": 142740 }, { "epoch": 3.48728409840471, "grad_norm": 0.0016383582260459661, "learning_rate": 5.077249732234381e-07, "loss": 0.0, "num_input_tokens_seen": 96205776, "step": 142745 }, { "epoch": 3.487406249236557, "grad_norm": 0.00033586935023777187, "learning_rate": 5.076507460040964e-07, "loss": 0.0, "num_input_tokens_seen": 96208912, "step": 142750 }, { "epoch": 3.4875284000684044, "grad_norm": 0.0009348663734272122, "learning_rate": 5.075765223652511e-07, "loss": 0.0, "num_input_tokens_seen": 96212048, "step": 142755 }, { "epoch": 3.4876505509002516, "grad_norm": 0.043417029082775116, "learning_rate": 5.075023023074415e-07, "loss": 0.0, "num_input_tokens_seen": 96216016, "step": 142760 }, { "epoch": 3.4877727017320987, "grad_norm": 0.0003841420984826982, "learning_rate": 5.07428085831208e-07, "loss": 0.0, "num_input_tokens_seen": 96219536, "step": 142765 }, { "epoch": 3.487894852563946, "grad_norm": 0.00028211314929649234, "learning_rate": 5.0735387293709e-07, "loss": 0.0, "num_input_tokens_seen": 96223056, "step": 142770 }, { "epoch": 3.488017003395793, "grad_norm": 0.0009631828288547695, "learning_rate": 5.072796636256267e-07, "loss": 0.0693, "num_input_tokens_seen": 96226128, "step": 142775 }, { "epoch": 3.4881391542276403, "grad_norm": 0.01604449190199375, "learning_rate": 5.072054578973585e-07, "loss": 0.0005, "num_input_tokens_seen": 96229392, "step": 142780 }, { "epoch": 3.4882613050594875, "grad_norm": 0.0003695189079735428, "learning_rate": 5.071312557528244e-07, "loss": 0.0003, "num_input_tokens_seen": 96232336, "step": 142785 }, { "epoch": 3.4883834558913347, "grad_norm": 0.0021992342080920935, "learning_rate": 5.07057057192565e-07, "loss": 0.0, "num_input_tokens_seen": 96235792, "step": 142790 }, { "epoch": 3.488505606723182, "grad_norm": 0.22512054443359375, "learning_rate": 5.069828622171186e-07, "loss": 0.0001, "num_input_tokens_seen": 96238992, "step": 142795 }, { "epoch": 3.488627757555029, "grad_norm": 0.02057253010571003, "learning_rate": 5.06908670827026e-07, "loss": 0.0, "num_input_tokens_seen": 96242768, "step": 142800 }, { "epoch": 3.4887499083868763, "grad_norm": 0.006819578818976879, "learning_rate": 5.068344830228257e-07, "loss": 0.0, "num_input_tokens_seen": 96245968, "step": 142805 }, { "epoch": 3.4888720592187235, "grad_norm": 0.05661904439330101, "learning_rate": 5.067602988050576e-07, "loss": 0.0, "num_input_tokens_seen": 96249360, "step": 142810 }, { "epoch": 3.4889942100505706, "grad_norm": 0.0019289409974589944, "learning_rate": 5.066861181742619e-07, "loss": 0.0, "num_input_tokens_seen": 96252432, "step": 142815 }, { "epoch": 3.4891163608824174, "grad_norm": 0.001519133453257382, "learning_rate": 5.066119411309769e-07, "loss": 0.0, "num_input_tokens_seen": 96255824, "step": 142820 }, { "epoch": 3.489238511714265, "grad_norm": 0.0009457020205445588, "learning_rate": 5.065377676757428e-07, "loss": 0.0, "num_input_tokens_seen": 96258960, "step": 142825 }, { "epoch": 3.4893606625461118, "grad_norm": 0.015754206106066704, "learning_rate": 5.064635978090986e-07, "loss": 0.0, "num_input_tokens_seen": 96262160, "step": 142830 }, { "epoch": 3.489482813377959, "grad_norm": 0.005095080006867647, "learning_rate": 5.063894315315837e-07, "loss": 0.0245, "num_input_tokens_seen": 96265424, "step": 142835 }, { "epoch": 3.489604964209806, "grad_norm": 0.004009610507637262, "learning_rate": 5.063152688437382e-07, "loss": 0.0, "num_input_tokens_seen": 96268752, "step": 142840 }, { "epoch": 3.4897271150416533, "grad_norm": 0.004639514721930027, "learning_rate": 5.062411097461004e-07, "loss": 0.0, "num_input_tokens_seen": 96272208, "step": 142845 }, { "epoch": 3.4898492658735005, "grad_norm": 0.0006230053259059787, "learning_rate": 5.061669542392104e-07, "loss": 0.0, "num_input_tokens_seen": 96276048, "step": 142850 }, { "epoch": 3.4899714167053477, "grad_norm": 2334.111328125, "learning_rate": 5.060928023236069e-07, "loss": 0.0918, "num_input_tokens_seen": 96279312, "step": 142855 }, { "epoch": 3.490093567537195, "grad_norm": 0.009122136048972607, "learning_rate": 5.060186539998295e-07, "loss": 0.0, "num_input_tokens_seen": 96282256, "step": 142860 }, { "epoch": 3.490215718369042, "grad_norm": 0.004901864565908909, "learning_rate": 5.059445092684171e-07, "loss": 0.0, "num_input_tokens_seen": 96285136, "step": 142865 }, { "epoch": 3.4903378692008893, "grad_norm": 33.27792739868164, "learning_rate": 5.058703681299094e-07, "loss": 0.0548, "num_input_tokens_seen": 96288336, "step": 142870 }, { "epoch": 3.4904600200327365, "grad_norm": 0.06755754351615906, "learning_rate": 5.057962305848454e-07, "loss": 0.0002, "num_input_tokens_seen": 96291280, "step": 142875 }, { "epoch": 3.4905821708645837, "grad_norm": 0.007401663344353437, "learning_rate": 5.057220966337638e-07, "loss": 0.0, "num_input_tokens_seen": 96294800, "step": 142880 }, { "epoch": 3.490704321696431, "grad_norm": 0.0013410568935796618, "learning_rate": 5.056479662772042e-07, "loss": 0.0008, "num_input_tokens_seen": 96297808, "step": 142885 }, { "epoch": 3.490826472528278, "grad_norm": 0.06911446154117584, "learning_rate": 5.055738395157055e-07, "loss": 0.0511, "num_input_tokens_seen": 96301136, "step": 142890 }, { "epoch": 3.4909486233601252, "grad_norm": 0.0003165986272506416, "learning_rate": 5.054997163498065e-07, "loss": 0.0, "num_input_tokens_seen": 96304528, "step": 142895 }, { "epoch": 3.4910707741919724, "grad_norm": 0.050810977816581726, "learning_rate": 5.054255967800471e-07, "loss": 0.0, "num_input_tokens_seen": 96307600, "step": 142900 }, { "epoch": 3.491192925023819, "grad_norm": 0.009707154706120491, "learning_rate": 5.053514808069655e-07, "loss": 0.0, "num_input_tokens_seen": 96310928, "step": 142905 }, { "epoch": 3.491315075855667, "grad_norm": 0.001085141790099442, "learning_rate": 5.052773684311011e-07, "loss": 0.0, "num_input_tokens_seen": 96314512, "step": 142910 }, { "epoch": 3.4914372266875136, "grad_norm": 0.002273534657433629, "learning_rate": 5.052032596529926e-07, "loss": 0.0, "num_input_tokens_seen": 96318096, "step": 142915 }, { "epoch": 3.4915593775193607, "grad_norm": 0.001621173694729805, "learning_rate": 5.051291544731794e-07, "loss": 0.0268, "num_input_tokens_seen": 96321296, "step": 142920 }, { "epoch": 3.491681528351208, "grad_norm": 0.0009241417865268886, "learning_rate": 5.050550528921998e-07, "loss": 0.0001, "num_input_tokens_seen": 96324496, "step": 142925 }, { "epoch": 3.491803679183055, "grad_norm": 0.0004545733390841633, "learning_rate": 5.049809549105928e-07, "loss": 0.0001, "num_input_tokens_seen": 96327760, "step": 142930 }, { "epoch": 3.4919258300149023, "grad_norm": 0.0037344826851040125, "learning_rate": 5.049068605288978e-07, "loss": 0.0002, "num_input_tokens_seen": 96330640, "step": 142935 }, { "epoch": 3.4920479808467495, "grad_norm": 0.0003664505493361503, "learning_rate": 5.04832769747653e-07, "loss": 0.0, "num_input_tokens_seen": 96333584, "step": 142940 }, { "epoch": 3.4921701316785967, "grad_norm": 0.07739343494176865, "learning_rate": 5.047586825673978e-07, "loss": 0.0016, "num_input_tokens_seen": 96336848, "step": 142945 }, { "epoch": 3.492292282510444, "grad_norm": 0.00023452962341252714, "learning_rate": 5.046845989886703e-07, "loss": 0.0, "num_input_tokens_seen": 96340496, "step": 142950 }, { "epoch": 3.492414433342291, "grad_norm": 0.0009708444122225046, "learning_rate": 5.0461051901201e-07, "loss": 0.0, "num_input_tokens_seen": 96343568, "step": 142955 }, { "epoch": 3.4925365841741383, "grad_norm": 0.0010330878430977464, "learning_rate": 5.04536442637955e-07, "loss": 0.0, "num_input_tokens_seen": 96347216, "step": 142960 }, { "epoch": 3.4926587350059854, "grad_norm": 32.308738708496094, "learning_rate": 5.044623698670441e-07, "loss": 0.0489, "num_input_tokens_seen": 96349968, "step": 142965 }, { "epoch": 3.4927808858378326, "grad_norm": 0.007837752811610699, "learning_rate": 5.043883006998166e-07, "loss": 0.0, "num_input_tokens_seen": 96353744, "step": 142970 }, { "epoch": 3.49290303666968, "grad_norm": 0.002741006202995777, "learning_rate": 5.043142351368106e-07, "loss": 0.0609, "num_input_tokens_seen": 96356816, "step": 142975 }, { "epoch": 3.493025187501527, "grad_norm": 0.0050845337100327015, "learning_rate": 5.042401731785645e-07, "loss": 0.0, "num_input_tokens_seen": 96360400, "step": 142980 }, { "epoch": 3.493147338333374, "grad_norm": 0.022080646827816963, "learning_rate": 5.041661148256175e-07, "loss": 0.0001, "num_input_tokens_seen": 96363600, "step": 142985 }, { "epoch": 3.4932694891652214, "grad_norm": 0.0163921769708395, "learning_rate": 5.040920600785075e-07, "loss": 0.0, "num_input_tokens_seen": 96367120, "step": 142990 }, { "epoch": 3.4933916399970686, "grad_norm": 0.00010943515371764079, "learning_rate": 5.04018008937774e-07, "loss": 0.0006, "num_input_tokens_seen": 96370256, "step": 142995 }, { "epoch": 3.4935137908289153, "grad_norm": 0.012624138034880161, "learning_rate": 5.039439614039543e-07, "loss": 0.0, "num_input_tokens_seen": 96373648, "step": 143000 }, { "epoch": 3.4936359416607625, "grad_norm": 0.005008554086089134, "learning_rate": 5.03869917477588e-07, "loss": 0.0, "num_input_tokens_seen": 96377232, "step": 143005 }, { "epoch": 3.4937580924926097, "grad_norm": 0.021310361102223396, "learning_rate": 5.037958771592128e-07, "loss": 0.0, "num_input_tokens_seen": 96380368, "step": 143010 }, { "epoch": 3.493880243324457, "grad_norm": 0.011983363889157772, "learning_rate": 5.037218404493677e-07, "loss": 0.0001, "num_input_tokens_seen": 96384080, "step": 143015 }, { "epoch": 3.494002394156304, "grad_norm": 0.022541791200637817, "learning_rate": 5.036478073485906e-07, "loss": 0.0, "num_input_tokens_seen": 96387728, "step": 143020 }, { "epoch": 3.4941245449881513, "grad_norm": 0.0031625598203390837, "learning_rate": 5.035737778574202e-07, "loss": 0.0, "num_input_tokens_seen": 96391184, "step": 143025 }, { "epoch": 3.4942466958199985, "grad_norm": 21.59922218322754, "learning_rate": 5.034997519763951e-07, "loss": 0.0355, "num_input_tokens_seen": 96394448, "step": 143030 }, { "epoch": 3.4943688466518457, "grad_norm": 0.0019981954246759415, "learning_rate": 5.034257297060529e-07, "loss": 0.0, "num_input_tokens_seen": 96398032, "step": 143035 }, { "epoch": 3.494490997483693, "grad_norm": 11.361743927001953, "learning_rate": 5.033517110469327e-07, "loss": 0.001, "num_input_tokens_seen": 96401168, "step": 143040 }, { "epoch": 3.49461314831554, "grad_norm": 6.632530130445957e-05, "learning_rate": 5.032776959995721e-07, "loss": 0.0002, "num_input_tokens_seen": 96404688, "step": 143045 }, { "epoch": 3.4947352991473872, "grad_norm": 0.00037045919452793896, "learning_rate": 5.032036845645099e-07, "loss": 0.0, "num_input_tokens_seen": 96407824, "step": 143050 }, { "epoch": 3.4948574499792344, "grad_norm": 0.0032580657862126827, "learning_rate": 5.031296767422844e-07, "loss": 0.0842, "num_input_tokens_seen": 96411088, "step": 143055 }, { "epoch": 3.4949796008110816, "grad_norm": 0.013785989955067635, "learning_rate": 5.030556725334331e-07, "loss": 0.0, "num_input_tokens_seen": 96414672, "step": 143060 }, { "epoch": 3.495101751642929, "grad_norm": 0.00632657203823328, "learning_rate": 5.029816719384949e-07, "loss": 0.0, "num_input_tokens_seen": 96417808, "step": 143065 }, { "epoch": 3.495223902474776, "grad_norm": 0.00011165008618263528, "learning_rate": 5.029076749580075e-07, "loss": 0.0001, "num_input_tokens_seen": 96421264, "step": 143070 }, { "epoch": 3.495346053306623, "grad_norm": 0.0034451729152351618, "learning_rate": 5.028336815925094e-07, "loss": 0.0, "num_input_tokens_seen": 96424848, "step": 143075 }, { "epoch": 3.4954682041384704, "grad_norm": 0.05073514208197594, "learning_rate": 5.027596918425386e-07, "loss": 0.0172, "num_input_tokens_seen": 96429072, "step": 143080 }, { "epoch": 3.495590354970317, "grad_norm": 0.0018720118096098304, "learning_rate": 5.026857057086325e-07, "loss": 0.0, "num_input_tokens_seen": 96432400, "step": 143085 }, { "epoch": 3.4957125058021647, "grad_norm": 0.00102695869281888, "learning_rate": 5.026117231913303e-07, "loss": 0.0, "num_input_tokens_seen": 96435600, "step": 143090 }, { "epoch": 3.4958346566340115, "grad_norm": 0.002525731222704053, "learning_rate": 5.025377442911689e-07, "loss": 0.0, "num_input_tokens_seen": 96439056, "step": 143095 }, { "epoch": 3.4959568074658587, "grad_norm": 176.42254638671875, "learning_rate": 5.024637690086873e-07, "loss": 0.1003, "num_input_tokens_seen": 96442192, "step": 143100 }, { "epoch": 3.496078958297706, "grad_norm": 0.001519545796327293, "learning_rate": 5.023897973444226e-07, "loss": 0.0, "num_input_tokens_seen": 96445776, "step": 143105 }, { "epoch": 3.496201109129553, "grad_norm": 0.0004945505643263459, "learning_rate": 5.023158292989135e-07, "loss": 0.0, "num_input_tokens_seen": 96449104, "step": 143110 }, { "epoch": 3.4963232599614003, "grad_norm": 0.04455680400133133, "learning_rate": 5.022418648726972e-07, "loss": 0.0001, "num_input_tokens_seen": 96451920, "step": 143115 }, { "epoch": 3.4964454107932474, "grad_norm": 0.006775304209440947, "learning_rate": 5.021679040663118e-07, "loss": 0.0002, "num_input_tokens_seen": 96454864, "step": 143120 }, { "epoch": 3.4965675616250946, "grad_norm": 0.0007572094327770174, "learning_rate": 5.020939468802958e-07, "loss": 0.0, "num_input_tokens_seen": 96458768, "step": 143125 }, { "epoch": 3.496689712456942, "grad_norm": 0.000804522424004972, "learning_rate": 5.020199933151862e-07, "loss": 0.0, "num_input_tokens_seen": 96462160, "step": 143130 }, { "epoch": 3.496811863288789, "grad_norm": 0.006477277260273695, "learning_rate": 5.019460433715214e-07, "loss": 0.0, "num_input_tokens_seen": 96465296, "step": 143135 }, { "epoch": 3.496934014120636, "grad_norm": 0.002193046035245061, "learning_rate": 5.018720970498387e-07, "loss": 0.0, "num_input_tokens_seen": 96468880, "step": 143140 }, { "epoch": 3.4970561649524834, "grad_norm": 48.42259216308594, "learning_rate": 5.01798154350676e-07, "loss": 0.0269, "num_input_tokens_seen": 96472208, "step": 143145 }, { "epoch": 3.4971783157843306, "grad_norm": 0.009474172256886959, "learning_rate": 5.017242152745715e-07, "loss": 0.0, "num_input_tokens_seen": 96475728, "step": 143150 }, { "epoch": 3.4973004666161778, "grad_norm": 0.0001851016713771969, "learning_rate": 5.016502798220622e-07, "loss": 0.0, "num_input_tokens_seen": 96479184, "step": 143155 }, { "epoch": 3.497422617448025, "grad_norm": 0.0032584320288151503, "learning_rate": 5.015763479936865e-07, "loss": 0.0001, "num_input_tokens_seen": 96482320, "step": 143160 }, { "epoch": 3.497544768279872, "grad_norm": 0.0007037792238406837, "learning_rate": 5.015024197899812e-07, "loss": 0.0, "num_input_tokens_seen": 96485264, "step": 143165 }, { "epoch": 3.4976669191117193, "grad_norm": 0.31896787881851196, "learning_rate": 5.014284952114848e-07, "loss": 0.0001, "num_input_tokens_seen": 96488848, "step": 143170 }, { "epoch": 3.4977890699435665, "grad_norm": 0.0003134756116196513, "learning_rate": 5.013545742587341e-07, "loss": 0.0, "num_input_tokens_seen": 96492496, "step": 143175 }, { "epoch": 3.4979112207754133, "grad_norm": 0.011219010688364506, "learning_rate": 5.012806569322674e-07, "loss": 0.0344, "num_input_tokens_seen": 96496208, "step": 143180 }, { "epoch": 3.4980333716072605, "grad_norm": 0.12939979135990143, "learning_rate": 5.012067432326219e-07, "loss": 0.0642, "num_input_tokens_seen": 96499408, "step": 143185 }, { "epoch": 3.4981555224391077, "grad_norm": 0.18904073536396027, "learning_rate": 5.011328331603348e-07, "loss": 0.0001, "num_input_tokens_seen": 96502672, "step": 143190 }, { "epoch": 3.498277673270955, "grad_norm": 0.0017373156733810902, "learning_rate": 5.010589267159443e-07, "loss": 0.0, "num_input_tokens_seen": 96506000, "step": 143195 }, { "epoch": 3.498399824102802, "grad_norm": 0.0005279670003801584, "learning_rate": 5.00985023899987e-07, "loss": 0.0, "num_input_tokens_seen": 96509584, "step": 143200 }, { "epoch": 3.4985219749346492, "grad_norm": 0.005878926254808903, "learning_rate": 5.00911124713001e-07, "loss": 0.0, "num_input_tokens_seen": 96512592, "step": 143205 }, { "epoch": 3.4986441257664964, "grad_norm": 0.00023229350335896015, "learning_rate": 5.008372291555238e-07, "loss": 0.0, "num_input_tokens_seen": 96516176, "step": 143210 }, { "epoch": 3.4987662765983436, "grad_norm": 0.001065135351382196, "learning_rate": 5.007633372280921e-07, "loss": 0.0, "num_input_tokens_seen": 96519440, "step": 143215 }, { "epoch": 3.498888427430191, "grad_norm": 0.02967376634478569, "learning_rate": 5.006894489312442e-07, "loss": 0.0, "num_input_tokens_seen": 96522512, "step": 143220 }, { "epoch": 3.499010578262038, "grad_norm": 0.0024210901465266943, "learning_rate": 5.006155642655165e-07, "loss": 0.0, "num_input_tokens_seen": 96525840, "step": 143225 }, { "epoch": 3.499132729093885, "grad_norm": 0.0014005607226863503, "learning_rate": 5.005416832314471e-07, "loss": 0.0, "num_input_tokens_seen": 96529424, "step": 143230 }, { "epoch": 3.4992548799257324, "grad_norm": 0.009456724859774113, "learning_rate": 5.004678058295726e-07, "loss": 0.0, "num_input_tokens_seen": 96532560, "step": 143235 }, { "epoch": 3.4993770307575796, "grad_norm": 0.002857289044186473, "learning_rate": 5.003939320604304e-07, "loss": 0.0, "num_input_tokens_seen": 96535760, "step": 143240 }, { "epoch": 3.4994991815894267, "grad_norm": 0.0014447863213717937, "learning_rate": 5.003200619245584e-07, "loss": 0.0, "num_input_tokens_seen": 96539152, "step": 143245 }, { "epoch": 3.499621332421274, "grad_norm": 0.0642884150147438, "learning_rate": 5.00246195422493e-07, "loss": 0.0, "num_input_tokens_seen": 96542672, "step": 143250 }, { "epoch": 3.499743483253121, "grad_norm": 0.0013544763205572963, "learning_rate": 5.00172332554772e-07, "loss": 0.0421, "num_input_tokens_seen": 96546128, "step": 143255 }, { "epoch": 3.4998656340849683, "grad_norm": 0.02764141745865345, "learning_rate": 5.000984733219318e-07, "loss": 0.0001, "num_input_tokens_seen": 96549840, "step": 143260 }, { "epoch": 3.499987784916815, "grad_norm": 0.001067040953785181, "learning_rate": 5.000246177245104e-07, "loss": 0.0, "num_input_tokens_seen": 96553104, "step": 143265 }, { "epoch": 3.5001099357486627, "grad_norm": 0.006503286771476269, "learning_rate": 4.999507657630441e-07, "loss": 0.1349, "num_input_tokens_seen": 96557008, "step": 143270 }, { "epoch": 3.5002320865805094, "grad_norm": 7.654829823877662e-05, "learning_rate": 4.998769174380703e-07, "loss": 0.0332, "num_input_tokens_seen": 96560080, "step": 143275 }, { "epoch": 3.5002565167468793, "eval_loss": 0.24939614534378052, "eval_runtime": 47.9318, "eval_samples_per_second": 759.099, "eval_steps_per_second": 94.906, "num_input_tokens_seen": 96560720, "step": 143276 }, { "epoch": 3.5003542374123566, "grad_norm": 0.0012859876733273268, "learning_rate": 4.998030727501263e-07, "loss": 0.0, "num_input_tokens_seen": 96563536, "step": 143280 }, { "epoch": 3.500476388244204, "grad_norm": 0.002032454591244459, "learning_rate": 4.997292316997492e-07, "loss": 0.0563, "num_input_tokens_seen": 96566736, "step": 143285 }, { "epoch": 3.500598539076051, "grad_norm": 0.002231521299108863, "learning_rate": 4.996553942874751e-07, "loss": 0.0, "num_input_tokens_seen": 96570192, "step": 143290 }, { "epoch": 3.500720689907898, "grad_norm": 66.57640838623047, "learning_rate": 4.995815605138419e-07, "loss": 0.0762, "num_input_tokens_seen": 96573392, "step": 143295 }, { "epoch": 3.5008428407397454, "grad_norm": 0.0034801210276782513, "learning_rate": 4.995077303793859e-07, "loss": 0.0, "num_input_tokens_seen": 96576848, "step": 143300 }, { "epoch": 3.5009649915715926, "grad_norm": 0.0004456123278941959, "learning_rate": 4.994339038846447e-07, "loss": 0.0, "num_input_tokens_seen": 96580112, "step": 143305 }, { "epoch": 3.5010871424034398, "grad_norm": 0.0003277628275100142, "learning_rate": 4.993600810301543e-07, "loss": 0.0, "num_input_tokens_seen": 96583568, "step": 143310 }, { "epoch": 3.501209293235287, "grad_norm": 0.00047519730287604034, "learning_rate": 4.992862618164525e-07, "loss": 0.0, "num_input_tokens_seen": 96586960, "step": 143315 }, { "epoch": 3.501331444067134, "grad_norm": 0.001118892221711576, "learning_rate": 4.992124462440754e-07, "loss": 0.0016, "num_input_tokens_seen": 96590224, "step": 143320 }, { "epoch": 3.5014535948989813, "grad_norm": 0.0010593585902824998, "learning_rate": 4.991386343135602e-07, "loss": 0.0, "num_input_tokens_seen": 96593680, "step": 143325 }, { "epoch": 3.5015757457308285, "grad_norm": 0.00285542756319046, "learning_rate": 4.990648260254434e-07, "loss": 0.0, "num_input_tokens_seen": 96596688, "step": 143330 }, { "epoch": 3.5016978965626757, "grad_norm": 0.030459964647889137, "learning_rate": 4.989910213802618e-07, "loss": 0.0, "num_input_tokens_seen": 96599824, "step": 143335 }, { "epoch": 3.501820047394523, "grad_norm": 0.0019312178483232856, "learning_rate": 4.989172203785528e-07, "loss": 0.0526, "num_input_tokens_seen": 96603216, "step": 143340 }, { "epoch": 3.50194219822637, "grad_norm": 0.0007082682568579912, "learning_rate": 4.98843423020852e-07, "loss": 0.0, "num_input_tokens_seen": 96607248, "step": 143345 }, { "epoch": 3.502064349058217, "grad_norm": 0.0170864537358284, "learning_rate": 4.98769629307697e-07, "loss": 0.0002, "num_input_tokens_seen": 96610704, "step": 143350 }, { "epoch": 3.5021864998900645, "grad_norm": 0.006353064905852079, "learning_rate": 4.986958392396239e-07, "loss": 0.0377, "num_input_tokens_seen": 96613904, "step": 143355 }, { "epoch": 3.502308650721911, "grad_norm": 0.00010134075273526832, "learning_rate": 4.986220528171692e-07, "loss": 0.0, "num_input_tokens_seen": 96617360, "step": 143360 }, { "epoch": 3.502430801553759, "grad_norm": 0.0008548396872356534, "learning_rate": 4.985482700408704e-07, "loss": 0.0, "num_input_tokens_seen": 96621008, "step": 143365 }, { "epoch": 3.5025529523856056, "grad_norm": 0.073929063975811, "learning_rate": 4.98474490911263e-07, "loss": 0.0, "num_input_tokens_seen": 96624208, "step": 143370 }, { "epoch": 3.502675103217453, "grad_norm": 0.0011899123201146722, "learning_rate": 4.984007154288843e-07, "loss": 0.0, "num_input_tokens_seen": 96627920, "step": 143375 }, { "epoch": 3.5027972540493, "grad_norm": 0.010921421460807323, "learning_rate": 4.983269435942702e-07, "loss": 0.0, "num_input_tokens_seen": 96631120, "step": 143380 }, { "epoch": 3.502919404881147, "grad_norm": 0.0010460818884894252, "learning_rate": 4.98253175407958e-07, "loss": 0.0626, "num_input_tokens_seen": 96634704, "step": 143385 }, { "epoch": 3.5030415557129944, "grad_norm": 0.0009477305575273931, "learning_rate": 4.981794108704834e-07, "loss": 0.087, "num_input_tokens_seen": 96637968, "step": 143390 }, { "epoch": 3.5031637065448415, "grad_norm": 0.14386871457099915, "learning_rate": 4.981056499823829e-07, "loss": 0.0001, "num_input_tokens_seen": 96641552, "step": 143395 }, { "epoch": 3.5032858573766887, "grad_norm": 0.0003375703818164766, "learning_rate": 4.980318927441934e-07, "loss": 0.0491, "num_input_tokens_seen": 96645008, "step": 143400 }, { "epoch": 3.503408008208536, "grad_norm": 0.0025346807669848204, "learning_rate": 4.979581391564507e-07, "loss": 0.0, "num_input_tokens_seen": 96648400, "step": 143405 }, { "epoch": 3.503530159040383, "grad_norm": 0.009452375583350658, "learning_rate": 4.978843892196918e-07, "loss": 0.0, "num_input_tokens_seen": 96651792, "step": 143410 }, { "epoch": 3.5036523098722303, "grad_norm": 0.07168054580688477, "learning_rate": 4.978106429344523e-07, "loss": 0.0548, "num_input_tokens_seen": 96655248, "step": 143415 }, { "epoch": 3.5037744607040775, "grad_norm": 0.011760186403989792, "learning_rate": 4.977369003012691e-07, "loss": 0.0, "num_input_tokens_seen": 96658192, "step": 143420 }, { "epoch": 3.5038966115359247, "grad_norm": 0.0027387761510908604, "learning_rate": 4.976631613206781e-07, "loss": 0.0, "num_input_tokens_seen": 96661456, "step": 143425 }, { "epoch": 3.504018762367772, "grad_norm": 0.0015713631873950362, "learning_rate": 4.975894259932156e-07, "loss": 0.0, "num_input_tokens_seen": 96664912, "step": 143430 }, { "epoch": 3.5041409131996186, "grad_norm": 0.0005512706702575088, "learning_rate": 4.975156943194183e-07, "loss": 0.0001, "num_input_tokens_seen": 96667856, "step": 143435 }, { "epoch": 3.5042630640314663, "grad_norm": 0.0021763816475868225, "learning_rate": 4.974419662998216e-07, "loss": 0.0, "num_input_tokens_seen": 96671312, "step": 143440 }, { "epoch": 3.504385214863313, "grad_norm": 0.009386450052261353, "learning_rate": 4.973682419349625e-07, "loss": 0.0001, "num_input_tokens_seen": 96674832, "step": 143445 }, { "epoch": 3.5045073656951606, "grad_norm": 0.00046236676280386746, "learning_rate": 4.972945212253764e-07, "loss": 0.031, "num_input_tokens_seen": 96678032, "step": 143450 }, { "epoch": 3.5046295165270074, "grad_norm": 0.003409997094422579, "learning_rate": 4.972208041715997e-07, "loss": 0.0, "num_input_tokens_seen": 96680976, "step": 143455 }, { "epoch": 3.5047516673588546, "grad_norm": 0.00525322463363409, "learning_rate": 4.971470907741691e-07, "loss": 0.0, "num_input_tokens_seen": 96684560, "step": 143460 }, { "epoch": 3.5048738181907018, "grad_norm": 0.001424750778824091, "learning_rate": 4.970733810336196e-07, "loss": 0.0, "num_input_tokens_seen": 96687888, "step": 143465 }, { "epoch": 3.504995969022549, "grad_norm": 0.0075294580310583115, "learning_rate": 4.96999674950488e-07, "loss": 0.0, "num_input_tokens_seen": 96691408, "step": 143470 }, { "epoch": 3.505118119854396, "grad_norm": 0.0034216614440083504, "learning_rate": 4.969259725253098e-07, "loss": 0.0, "num_input_tokens_seen": 96694736, "step": 143475 }, { "epoch": 3.5052402706862433, "grad_norm": 0.013915733434259892, "learning_rate": 4.968522737586216e-07, "loss": 0.0, "num_input_tokens_seen": 96698192, "step": 143480 }, { "epoch": 3.5053624215180905, "grad_norm": 0.000553418998606503, "learning_rate": 4.967785786509586e-07, "loss": 0.0001, "num_input_tokens_seen": 96701904, "step": 143485 }, { "epoch": 3.5054845723499377, "grad_norm": 0.08207833021879196, "learning_rate": 4.967048872028575e-07, "loss": 0.0, "num_input_tokens_seen": 96705168, "step": 143490 }, { "epoch": 3.505606723181785, "grad_norm": 0.0022202276159077883, "learning_rate": 4.966311994148539e-07, "loss": 0.0, "num_input_tokens_seen": 96708560, "step": 143495 }, { "epoch": 3.505728874013632, "grad_norm": 0.04817364364862442, "learning_rate": 4.965575152874833e-07, "loss": 0.0001, "num_input_tokens_seen": 96711632, "step": 143500 }, { "epoch": 3.5058510248454793, "grad_norm": 0.0006875473191030324, "learning_rate": 4.96483834821282e-07, "loss": 0.0307, "num_input_tokens_seen": 96715280, "step": 143505 }, { "epoch": 3.5059731756773265, "grad_norm": 0.0027317036874592304, "learning_rate": 4.964101580167855e-07, "loss": 0.0, "num_input_tokens_seen": 96718352, "step": 143510 }, { "epoch": 3.5060953265091737, "grad_norm": 0.15406769514083862, "learning_rate": 4.963364848745301e-07, "loss": 0.0001, "num_input_tokens_seen": 96721744, "step": 143515 }, { "epoch": 3.506217477341021, "grad_norm": 0.001995170721784234, "learning_rate": 4.962628153950508e-07, "loss": 0.092, "num_input_tokens_seen": 96724816, "step": 143520 }, { "epoch": 3.506339628172868, "grad_norm": 0.08986975252628326, "learning_rate": 4.961891495788838e-07, "loss": 0.0, "num_input_tokens_seen": 96728272, "step": 143525 }, { "epoch": 3.506461779004715, "grad_norm": 0.00025660096434876323, "learning_rate": 4.961154874265653e-07, "loss": 0.0, "num_input_tokens_seen": 96731600, "step": 143530 }, { "epoch": 3.5065839298365624, "grad_norm": 0.027546176686882973, "learning_rate": 4.9604182893863e-07, "loss": 0.0, "num_input_tokens_seen": 96735312, "step": 143535 }, { "epoch": 3.506706080668409, "grad_norm": 0.0038960163947194815, "learning_rate": 4.959681741156146e-07, "loss": 0.0671, "num_input_tokens_seen": 96738128, "step": 143540 }, { "epoch": 3.5068282315002564, "grad_norm": 0.00791318528354168, "learning_rate": 4.958945229580537e-07, "loss": 0.0001, "num_input_tokens_seen": 96741520, "step": 143545 }, { "epoch": 3.5069503823321035, "grad_norm": 0.014546308666467667, "learning_rate": 4.958208754664834e-07, "loss": 0.0001, "num_input_tokens_seen": 96744784, "step": 143550 }, { "epoch": 3.5070725331639507, "grad_norm": 0.0013046764070168138, "learning_rate": 4.957472316414398e-07, "loss": 0.0341, "num_input_tokens_seen": 96748560, "step": 143555 }, { "epoch": 3.507194683995798, "grad_norm": 0.002422966528683901, "learning_rate": 4.956735914834576e-07, "loss": 0.0359, "num_input_tokens_seen": 96752144, "step": 143560 }, { "epoch": 3.507316834827645, "grad_norm": 0.002931036287918687, "learning_rate": 4.95599954993073e-07, "loss": 0.0001, "num_input_tokens_seen": 96755536, "step": 143565 }, { "epoch": 3.5074389856594923, "grad_norm": 0.004579018801450729, "learning_rate": 4.955263221708206e-07, "loss": 0.0002, "num_input_tokens_seen": 96758864, "step": 143570 }, { "epoch": 3.5075611364913395, "grad_norm": 0.0005623329780064523, "learning_rate": 4.954526930172371e-07, "loss": 0.0001, "num_input_tokens_seen": 96762640, "step": 143575 }, { "epoch": 3.5076832873231867, "grad_norm": 0.20815621316432953, "learning_rate": 4.953790675328569e-07, "loss": 0.0271, "num_input_tokens_seen": 96766160, "step": 143580 }, { "epoch": 3.507805438155034, "grad_norm": 0.0022279792465269566, "learning_rate": 4.95305445718216e-07, "loss": 0.0414, "num_input_tokens_seen": 96769744, "step": 143585 }, { "epoch": 3.507927588986881, "grad_norm": 0.01307701226323843, "learning_rate": 4.952318275738499e-07, "loss": 0.0, "num_input_tokens_seen": 96773392, "step": 143590 }, { "epoch": 3.5080497398187283, "grad_norm": 0.09603697061538696, "learning_rate": 4.951582131002936e-07, "loss": 0.0001, "num_input_tokens_seen": 96776592, "step": 143595 }, { "epoch": 3.5081718906505754, "grad_norm": 0.0005847231950610876, "learning_rate": 4.950846022980822e-07, "loss": 0.0, "num_input_tokens_seen": 96779856, "step": 143600 }, { "epoch": 3.5082940414824226, "grad_norm": 0.001085648313164711, "learning_rate": 4.950109951677519e-07, "loss": 0.0011, "num_input_tokens_seen": 96783056, "step": 143605 }, { "epoch": 3.50841619231427, "grad_norm": 0.0025777278933674097, "learning_rate": 4.94937391709837e-07, "loss": 0.0, "num_input_tokens_seen": 96787088, "step": 143610 }, { "epoch": 3.5085383431461166, "grad_norm": 0.0006910903030075133, "learning_rate": 4.948637919248736e-07, "loss": 0.0, "num_input_tokens_seen": 96790544, "step": 143615 }, { "epoch": 3.508660493977964, "grad_norm": 0.0012758640805259347, "learning_rate": 4.947901958133962e-07, "loss": 0.0, "num_input_tokens_seen": 96794384, "step": 143620 }, { "epoch": 3.508782644809811, "grad_norm": 0.0014997717225924134, "learning_rate": 4.947166033759408e-07, "loss": 0.0, "num_input_tokens_seen": 96797840, "step": 143625 }, { "epoch": 3.5089047956416586, "grad_norm": 0.0014699621824547648, "learning_rate": 4.946430146130419e-07, "loss": 0.0, "num_input_tokens_seen": 96801232, "step": 143630 }, { "epoch": 3.5090269464735053, "grad_norm": 0.035319749265909195, "learning_rate": 4.94569429525235e-07, "loss": 0.0, "num_input_tokens_seen": 96804688, "step": 143635 }, { "epoch": 3.5091490973053525, "grad_norm": 0.001275359420105815, "learning_rate": 4.94495848113055e-07, "loss": 0.0, "num_input_tokens_seen": 96808528, "step": 143640 }, { "epoch": 3.5092712481371997, "grad_norm": 0.003758045844733715, "learning_rate": 4.944222703770371e-07, "loss": 0.0, "num_input_tokens_seen": 96811792, "step": 143645 }, { "epoch": 3.509393398969047, "grad_norm": 0.0023038359358906746, "learning_rate": 4.943486963177168e-07, "loss": 0.0, "num_input_tokens_seen": 96814992, "step": 143650 }, { "epoch": 3.509515549800894, "grad_norm": 0.00477098673582077, "learning_rate": 4.942751259356285e-07, "loss": 0.0, "num_input_tokens_seen": 96818896, "step": 143655 }, { "epoch": 3.5096377006327413, "grad_norm": 0.06560824066400528, "learning_rate": 4.942015592313078e-07, "loss": 0.0213, "num_input_tokens_seen": 96821840, "step": 143660 }, { "epoch": 3.5097598514645885, "grad_norm": 0.0005781487561762333, "learning_rate": 4.94127996205289e-07, "loss": 0.0001, "num_input_tokens_seen": 96825232, "step": 143665 }, { "epoch": 3.5098820022964357, "grad_norm": 0.05620527267456055, "learning_rate": 4.940544368581079e-07, "loss": 0.0, "num_input_tokens_seen": 96829136, "step": 143670 }, { "epoch": 3.510004153128283, "grad_norm": 0.11132854223251343, "learning_rate": 4.939808811902986e-07, "loss": 0.0003, "num_input_tokens_seen": 96832784, "step": 143675 }, { "epoch": 3.51012630396013, "grad_norm": 0.003071943297982216, "learning_rate": 4.939073292023965e-07, "loss": 0.0001, "num_input_tokens_seen": 96835792, "step": 143680 }, { "epoch": 3.5102484547919772, "grad_norm": 0.017345771193504333, "learning_rate": 4.938337808949368e-07, "loss": 0.0, "num_input_tokens_seen": 96838672, "step": 143685 }, { "epoch": 3.5103706056238244, "grad_norm": 3.049597978591919, "learning_rate": 4.937602362684535e-07, "loss": 0.0007, "num_input_tokens_seen": 96841744, "step": 143690 }, { "epoch": 3.5104927564556716, "grad_norm": 0.23833641409873962, "learning_rate": 4.936866953234824e-07, "loss": 0.0, "num_input_tokens_seen": 96845264, "step": 143695 }, { "epoch": 3.510614907287519, "grad_norm": 0.012993131764233112, "learning_rate": 4.936131580605578e-07, "loss": 0.0, "num_input_tokens_seen": 96848336, "step": 143700 }, { "epoch": 3.510737058119366, "grad_norm": 0.002314453711733222, "learning_rate": 4.935396244802142e-07, "loss": 0.0691, "num_input_tokens_seen": 96851472, "step": 143705 }, { "epoch": 3.5108592089512127, "grad_norm": 0.015659581869840622, "learning_rate": 4.934660945829869e-07, "loss": 0.0, "num_input_tokens_seen": 96854992, "step": 143710 }, { "epoch": 3.5109813597830604, "grad_norm": 0.0017577859107404947, "learning_rate": 4.933925683694101e-07, "loss": 0.0001, "num_input_tokens_seen": 96857872, "step": 143715 }, { "epoch": 3.511103510614907, "grad_norm": 0.053707629442214966, "learning_rate": 4.933190458400193e-07, "loss": 0.0, "num_input_tokens_seen": 96861008, "step": 143720 }, { "epoch": 3.5112256614467543, "grad_norm": 0.00028163212118670344, "learning_rate": 4.932455269953482e-07, "loss": 0.0, "num_input_tokens_seen": 96864208, "step": 143725 }, { "epoch": 3.5113478122786015, "grad_norm": 1.548169493675232, "learning_rate": 4.931720118359323e-07, "loss": 0.0005, "num_input_tokens_seen": 96867536, "step": 143730 }, { "epoch": 3.5114699631104487, "grad_norm": 0.03325950354337692, "learning_rate": 4.930985003623054e-07, "loss": 0.0004, "num_input_tokens_seen": 96871440, "step": 143735 }, { "epoch": 3.511592113942296, "grad_norm": 4.091006278991699, "learning_rate": 4.930249925750026e-07, "loss": 0.0906, "num_input_tokens_seen": 96874768, "step": 143740 }, { "epoch": 3.511714264774143, "grad_norm": 0.054243385791778564, "learning_rate": 4.929514884745588e-07, "loss": 0.0, "num_input_tokens_seen": 96878672, "step": 143745 }, { "epoch": 3.5118364156059902, "grad_norm": 0.0010780526790767908, "learning_rate": 4.928779880615078e-07, "loss": 0.0, "num_input_tokens_seen": 96881808, "step": 143750 }, { "epoch": 3.5119585664378374, "grad_norm": 0.0017064374405890703, "learning_rate": 4.928044913363849e-07, "loss": 0.0421, "num_input_tokens_seen": 96885264, "step": 143755 }, { "epoch": 3.5120807172696846, "grad_norm": 0.002845099428668618, "learning_rate": 4.927309982997237e-07, "loss": 0.0, "num_input_tokens_seen": 96888912, "step": 143760 }, { "epoch": 3.512202868101532, "grad_norm": 0.0018212158465757966, "learning_rate": 4.926575089520592e-07, "loss": 0.0, "num_input_tokens_seen": 96892304, "step": 143765 }, { "epoch": 3.512325018933379, "grad_norm": 0.0034192639868706465, "learning_rate": 4.925840232939261e-07, "loss": 0.0975, "num_input_tokens_seen": 96895632, "step": 143770 }, { "epoch": 3.512447169765226, "grad_norm": 0.000690255023073405, "learning_rate": 4.92510541325858e-07, "loss": 0.0002, "num_input_tokens_seen": 96899152, "step": 143775 }, { "epoch": 3.5125693205970734, "grad_norm": 0.023969320580363274, "learning_rate": 4.924370630483902e-07, "loss": 0.0, "num_input_tokens_seen": 96902736, "step": 143780 }, { "epoch": 3.5126914714289206, "grad_norm": 0.4486314356327057, "learning_rate": 4.923635884620561e-07, "loss": 0.0002, "num_input_tokens_seen": 96906000, "step": 143785 }, { "epoch": 3.5128136222607678, "grad_norm": 0.0013574823969975114, "learning_rate": 4.92290117567391e-07, "loss": 0.0, "num_input_tokens_seen": 96909584, "step": 143790 }, { "epoch": 3.5129357730926145, "grad_norm": 0.3799872100353241, "learning_rate": 4.922166503649284e-07, "loss": 0.0001, "num_input_tokens_seen": 96912912, "step": 143795 }, { "epoch": 3.513057923924462, "grad_norm": 0.0018316078931093216, "learning_rate": 4.921431868552032e-07, "loss": 0.0, "num_input_tokens_seen": 96916176, "step": 143800 }, { "epoch": 3.513180074756309, "grad_norm": 0.0247394610196352, "learning_rate": 4.920697270387489e-07, "loss": 0.0, "num_input_tokens_seen": 96919760, "step": 143805 }, { "epoch": 3.5133022255881565, "grad_norm": 0.003527220571413636, "learning_rate": 4.919962709161008e-07, "loss": 0.0, "num_input_tokens_seen": 96923344, "step": 143810 }, { "epoch": 3.5134243764200033, "grad_norm": 0.014180098660290241, "learning_rate": 4.919228184877922e-07, "loss": 0.0, "num_input_tokens_seen": 96926352, "step": 143815 }, { "epoch": 3.5135465272518505, "grad_norm": 0.0015301750972867012, "learning_rate": 4.918493697543572e-07, "loss": 0.0001, "num_input_tokens_seen": 96930000, "step": 143820 }, { "epoch": 3.5136686780836976, "grad_norm": 0.059794213622808456, "learning_rate": 4.917759247163307e-07, "loss": 0.0001, "num_input_tokens_seen": 96933520, "step": 143825 }, { "epoch": 3.513790828915545, "grad_norm": 0.0013804766349494457, "learning_rate": 4.917024833742459e-07, "loss": 0.0, "num_input_tokens_seen": 96936528, "step": 143830 }, { "epoch": 3.513912979747392, "grad_norm": 0.007687632460147142, "learning_rate": 4.916290457286374e-07, "loss": 0.0002, "num_input_tokens_seen": 96939792, "step": 143835 }, { "epoch": 3.514035130579239, "grad_norm": 0.0029032202437520027, "learning_rate": 4.915556117800395e-07, "loss": 0.0, "num_input_tokens_seen": 96942928, "step": 143840 }, { "epoch": 3.5141572814110864, "grad_norm": 0.007129203528165817, "learning_rate": 4.914821815289858e-07, "loss": 0.0, "num_input_tokens_seen": 96946064, "step": 143845 }, { "epoch": 3.5142794322429336, "grad_norm": 0.0005458451923914254, "learning_rate": 4.914087549760106e-07, "loss": 0.0001, "num_input_tokens_seen": 96949136, "step": 143850 }, { "epoch": 3.514401583074781, "grad_norm": 0.0006653392338193953, "learning_rate": 4.913353321216475e-07, "loss": 0.0, "num_input_tokens_seen": 96952464, "step": 143855 }, { "epoch": 3.514523733906628, "grad_norm": 6.379980087280273, "learning_rate": 4.912619129664306e-07, "loss": 0.0008, "num_input_tokens_seen": 96955792, "step": 143860 }, { "epoch": 3.514645884738475, "grad_norm": 0.0027823809068650007, "learning_rate": 4.911884975108943e-07, "loss": 0.0001, "num_input_tokens_seen": 96959696, "step": 143865 }, { "epoch": 3.5147680355703224, "grad_norm": 0.0003252919123042375, "learning_rate": 4.911150857555717e-07, "loss": 0.0, "num_input_tokens_seen": 96963280, "step": 143870 }, { "epoch": 3.5148901864021695, "grad_norm": 0.004755501169711351, "learning_rate": 4.910416777009975e-07, "loss": 0.0, "num_input_tokens_seen": 96966544, "step": 143875 }, { "epoch": 3.5150123372340163, "grad_norm": 0.0003794467484112829, "learning_rate": 4.909682733477047e-07, "loss": 0.0613, "num_input_tokens_seen": 96970064, "step": 143880 }, { "epoch": 3.515134488065864, "grad_norm": 0.0023390930145978928, "learning_rate": 4.90894872696228e-07, "loss": 0.0046, "num_input_tokens_seen": 96973456, "step": 143885 }, { "epoch": 3.5152566388977107, "grad_norm": 0.021191345527768135, "learning_rate": 4.908214757471002e-07, "loss": 0.0, "num_input_tokens_seen": 96977168, "step": 143890 }, { "epoch": 3.5153787897295583, "grad_norm": 0.0015332361217588186, "learning_rate": 4.907480825008556e-07, "loss": 0.0, "num_input_tokens_seen": 96980496, "step": 143895 }, { "epoch": 3.515500940561405, "grad_norm": 0.00018658730550669134, "learning_rate": 4.906746929580284e-07, "loss": 0.0, "num_input_tokens_seen": 96984208, "step": 143900 }, { "epoch": 3.5156230913932522, "grad_norm": 0.0002967533946502954, "learning_rate": 4.906013071191517e-07, "loss": 0.0003, "num_input_tokens_seen": 96987088, "step": 143905 }, { "epoch": 3.5157452422250994, "grad_norm": 0.004914917983114719, "learning_rate": 4.90527924984759e-07, "loss": 0.0002, "num_input_tokens_seen": 96990352, "step": 143910 }, { "epoch": 3.5158673930569466, "grad_norm": 0.003742861095815897, "learning_rate": 4.904545465553847e-07, "loss": 0.0, "num_input_tokens_seen": 96993552, "step": 143915 }, { "epoch": 3.515989543888794, "grad_norm": 0.0010810464154928923, "learning_rate": 4.903811718315615e-07, "loss": 0.0, "num_input_tokens_seen": 96996880, "step": 143920 }, { "epoch": 3.516111694720641, "grad_norm": 0.1657690852880478, "learning_rate": 4.903078008138239e-07, "loss": 0.0001, "num_input_tokens_seen": 97000272, "step": 143925 }, { "epoch": 3.516233845552488, "grad_norm": 0.0005674972198903561, "learning_rate": 4.902344335027047e-07, "loss": 0.0003, "num_input_tokens_seen": 97003792, "step": 143930 }, { "epoch": 3.5163559963843354, "grad_norm": 0.0016297880792990327, "learning_rate": 4.901610698987381e-07, "loss": 0.0, "num_input_tokens_seen": 97006672, "step": 143935 }, { "epoch": 3.5164781472161826, "grad_norm": 0.013815726153552532, "learning_rate": 4.900877100024571e-07, "loss": 0.0, "num_input_tokens_seen": 97009936, "step": 143940 }, { "epoch": 3.5166002980480298, "grad_norm": 0.0040595149621367455, "learning_rate": 4.900143538143958e-07, "loss": 0.0, "num_input_tokens_seen": 97013328, "step": 143945 }, { "epoch": 3.516722448879877, "grad_norm": 0.00040780974086374044, "learning_rate": 4.899410013350867e-07, "loss": 0.0009, "num_input_tokens_seen": 97016656, "step": 143950 }, { "epoch": 3.516844599711724, "grad_norm": 0.00041566326399333775, "learning_rate": 4.898676525650639e-07, "loss": 0.0002, "num_input_tokens_seen": 97019536, "step": 143955 }, { "epoch": 3.5169667505435713, "grad_norm": 0.026442712172865868, "learning_rate": 4.897943075048612e-07, "loss": 0.0, "num_input_tokens_seen": 97022736, "step": 143960 }, { "epoch": 3.5170889013754185, "grad_norm": 0.12575189769268036, "learning_rate": 4.897209661550111e-07, "loss": 0.0, "num_input_tokens_seen": 97026960, "step": 143965 }, { "epoch": 3.5172110522072657, "grad_norm": 0.0001432756835129112, "learning_rate": 4.896476285160479e-07, "loss": 0.0, "num_input_tokens_seen": 97030544, "step": 143970 }, { "epoch": 3.5173332030391125, "grad_norm": 17.486061096191406, "learning_rate": 4.895742945885038e-07, "loss": 0.0002, "num_input_tokens_seen": 97036240, "step": 143975 }, { "epoch": 3.51745535387096, "grad_norm": 0.0007020349148660898, "learning_rate": 4.895009643729133e-07, "loss": 0.0, "num_input_tokens_seen": 97039632, "step": 143980 }, { "epoch": 3.517577504702807, "grad_norm": 0.001066066906787455, "learning_rate": 4.894276378698087e-07, "loss": 0.0, "num_input_tokens_seen": 97043152, "step": 143985 }, { "epoch": 3.5176996555346545, "grad_norm": 0.0017511459300294518, "learning_rate": 4.893543150797236e-07, "loss": 0.0, "num_input_tokens_seen": 97046928, "step": 143990 }, { "epoch": 3.517821806366501, "grad_norm": 0.004090828821063042, "learning_rate": 4.892809960031916e-07, "loss": 0.0, "num_input_tokens_seen": 97050384, "step": 143995 }, { "epoch": 3.5179439571983484, "grad_norm": 0.0008125362219288945, "learning_rate": 4.892076806407451e-07, "loss": 0.0001, "num_input_tokens_seen": 97053648, "step": 144000 }, { "epoch": 3.5180661080301956, "grad_norm": 0.001185380737297237, "learning_rate": 4.891343689929182e-07, "loss": 0.0, "num_input_tokens_seen": 97056912, "step": 144005 }, { "epoch": 3.518188258862043, "grad_norm": 4.846961975097656, "learning_rate": 4.890610610602437e-07, "loss": 0.0017, "num_input_tokens_seen": 97060496, "step": 144010 }, { "epoch": 3.51831040969389, "grad_norm": 0.00037370057543739676, "learning_rate": 4.889877568432541e-07, "loss": 0.0, "num_input_tokens_seen": 97064272, "step": 144015 }, { "epoch": 3.518432560525737, "grad_norm": 0.02818647213280201, "learning_rate": 4.889144563424834e-07, "loss": 0.0, "num_input_tokens_seen": 97067408, "step": 144020 }, { "epoch": 3.5185547113575844, "grad_norm": 0.010698727332055569, "learning_rate": 4.888411595584639e-07, "loss": 0.0, "num_input_tokens_seen": 97070736, "step": 144025 }, { "epoch": 3.5186768621894315, "grad_norm": 0.0008168493513949215, "learning_rate": 4.887678664917292e-07, "loss": 0.0, "num_input_tokens_seen": 97074192, "step": 144030 }, { "epoch": 3.5187990130212787, "grad_norm": 0.057550206780433655, "learning_rate": 4.886945771428118e-07, "loss": 0.0, "num_input_tokens_seen": 97077840, "step": 144035 }, { "epoch": 3.518921163853126, "grad_norm": 1.4165632724761963, "learning_rate": 4.886212915122453e-07, "loss": 0.0003, "num_input_tokens_seen": 97080912, "step": 144040 }, { "epoch": 3.519043314684973, "grad_norm": 0.0002670617832336575, "learning_rate": 4.88548009600562e-07, "loss": 0.0001, "num_input_tokens_seen": 97084240, "step": 144045 }, { "epoch": 3.5191654655168203, "grad_norm": 0.010491433553397655, "learning_rate": 4.884747314082951e-07, "loss": 0.0, "num_input_tokens_seen": 97087696, "step": 144050 }, { "epoch": 3.5192876163486675, "grad_norm": 0.0011365021346136928, "learning_rate": 4.884014569359779e-07, "loss": 0.0, "num_input_tokens_seen": 97091280, "step": 144055 }, { "epoch": 3.5194097671805142, "grad_norm": 0.009114922024309635, "learning_rate": 4.883281861841425e-07, "loss": 0.0, "num_input_tokens_seen": 97094544, "step": 144060 }, { "epoch": 3.519531918012362, "grad_norm": 0.004652893636375666, "learning_rate": 4.882549191533226e-07, "loss": 0.0, "num_input_tokens_seen": 97097808, "step": 144065 }, { "epoch": 3.5196540688442086, "grad_norm": 0.0016463432693853974, "learning_rate": 4.881816558440501e-07, "loss": 0.0, "num_input_tokens_seen": 97101072, "step": 144070 }, { "epoch": 3.5197762196760563, "grad_norm": 0.0022587061394006014, "learning_rate": 4.881083962568587e-07, "loss": 0.0, "num_input_tokens_seen": 97104144, "step": 144075 }, { "epoch": 3.519898370507903, "grad_norm": 0.0006902991444803774, "learning_rate": 4.880351403922804e-07, "loss": 0.0224, "num_input_tokens_seen": 97107280, "step": 144080 }, { "epoch": 3.52002052133975, "grad_norm": 0.0041884733363986015, "learning_rate": 4.879618882508481e-07, "loss": 0.0, "num_input_tokens_seen": 97110480, "step": 144085 }, { "epoch": 3.5201426721715974, "grad_norm": 75.95166778564453, "learning_rate": 4.878886398330952e-07, "loss": 0.0922, "num_input_tokens_seen": 97114000, "step": 144090 }, { "epoch": 3.5202648230034446, "grad_norm": 0.332274466753006, "learning_rate": 4.878153951395535e-07, "loss": 0.0005, "num_input_tokens_seen": 97117840, "step": 144095 }, { "epoch": 3.5203869738352918, "grad_norm": 0.0002197538415202871, "learning_rate": 4.877421541707563e-07, "loss": 0.0, "num_input_tokens_seen": 97120912, "step": 144100 }, { "epoch": 3.520509124667139, "grad_norm": 0.0009739008964970708, "learning_rate": 4.876689169272355e-07, "loss": 0.0, "num_input_tokens_seen": 97124048, "step": 144105 }, { "epoch": 3.520631275498986, "grad_norm": 0.002452496439218521, "learning_rate": 4.875956834095247e-07, "loss": 0.0, "num_input_tokens_seen": 97127760, "step": 144110 }, { "epoch": 3.5207534263308333, "grad_norm": 0.01981990784406662, "learning_rate": 4.875224536181553e-07, "loss": 0.0, "num_input_tokens_seen": 97131536, "step": 144115 }, { "epoch": 3.5208755771626805, "grad_norm": 0.0002594464167486876, "learning_rate": 4.87449227553661e-07, "loss": 0.0, "num_input_tokens_seen": 97134864, "step": 144120 }, { "epoch": 3.5209977279945277, "grad_norm": 0.0004595739010255784, "learning_rate": 4.873760052165737e-07, "loss": 0.0, "num_input_tokens_seen": 97138512, "step": 144125 }, { "epoch": 3.521119878826375, "grad_norm": 0.012149565853178501, "learning_rate": 4.873027866074258e-07, "loss": 0.0003, "num_input_tokens_seen": 97142288, "step": 144130 }, { "epoch": 3.521242029658222, "grad_norm": 0.007893732748925686, "learning_rate": 4.8722957172675e-07, "loss": 0.0, "num_input_tokens_seen": 97145616, "step": 144135 }, { "epoch": 3.5213641804900693, "grad_norm": 0.0037115265149623156, "learning_rate": 4.871563605750785e-07, "loss": 0.0, "num_input_tokens_seen": 97149136, "step": 144140 }, { "epoch": 3.5214863313219165, "grad_norm": 0.0001909531856654212, "learning_rate": 4.870831531529438e-07, "loss": 0.0001, "num_input_tokens_seen": 97152656, "step": 144145 }, { "epoch": 3.5216084821537637, "grad_norm": 0.013603954575955868, "learning_rate": 4.870099494608788e-07, "loss": 0.0, "num_input_tokens_seen": 97156112, "step": 144150 }, { "epoch": 3.5217306329856104, "grad_norm": 0.0094887875020504, "learning_rate": 4.86936749499415e-07, "loss": 0.0, "num_input_tokens_seen": 97159632, "step": 144155 }, { "epoch": 3.521852783817458, "grad_norm": 0.0001825742656365037, "learning_rate": 4.868635532690856e-07, "loss": 0.0, "num_input_tokens_seen": 97163408, "step": 144160 }, { "epoch": 3.521974934649305, "grad_norm": 0.0007969773141667247, "learning_rate": 4.867903607704219e-07, "loss": 0.0, "num_input_tokens_seen": 97166864, "step": 144165 }, { "epoch": 3.522097085481152, "grad_norm": 0.00028032498084940016, "learning_rate": 4.867171720039569e-07, "loss": 0.0, "num_input_tokens_seen": 97170064, "step": 144170 }, { "epoch": 3.522219236312999, "grad_norm": 0.0015606768429279327, "learning_rate": 4.86643986970223e-07, "loss": 0.0, "num_input_tokens_seen": 97173712, "step": 144175 }, { "epoch": 3.5223413871448463, "grad_norm": 0.0021179928444325924, "learning_rate": 4.865708056697517e-07, "loss": 0.0, "num_input_tokens_seen": 97177424, "step": 144180 }, { "epoch": 3.5224635379766935, "grad_norm": 0.0003003637830261141, "learning_rate": 4.864976281030761e-07, "loss": 0.0032, "num_input_tokens_seen": 97180560, "step": 144185 }, { "epoch": 3.5225856888085407, "grad_norm": 0.001165257883258164, "learning_rate": 4.864244542707274e-07, "loss": 0.0, "num_input_tokens_seen": 97183504, "step": 144190 }, { "epoch": 3.522707839640388, "grad_norm": 0.00015066112973727286, "learning_rate": 4.863512841732386e-07, "loss": 0.0, "num_input_tokens_seen": 97186640, "step": 144195 }, { "epoch": 3.522829990472235, "grad_norm": 0.001419951906427741, "learning_rate": 4.86278117811141e-07, "loss": 0.0924, "num_input_tokens_seen": 97189968, "step": 144200 }, { "epoch": 3.5229521413040823, "grad_norm": 7.131059828680009e-05, "learning_rate": 4.862049551849671e-07, "loss": 0.1, "num_input_tokens_seen": 97193104, "step": 144205 }, { "epoch": 3.5230742921359295, "grad_norm": 0.00625317357480526, "learning_rate": 4.861317962952494e-07, "loss": 0.0, "num_input_tokens_seen": 97196752, "step": 144210 }, { "epoch": 3.5231964429677767, "grad_norm": 0.00016985490219667554, "learning_rate": 4.860586411425195e-07, "loss": 0.0, "num_input_tokens_seen": 97200208, "step": 144215 }, { "epoch": 3.523318593799624, "grad_norm": 0.0004274469683878124, "learning_rate": 4.859854897273089e-07, "loss": 0.0, "num_input_tokens_seen": 97203472, "step": 144220 }, { "epoch": 3.523440744631471, "grad_norm": 0.00014868633297737688, "learning_rate": 4.859123420501506e-07, "loss": 0.0308, "num_input_tokens_seen": 97207248, "step": 144225 }, { "epoch": 3.5235628954633182, "grad_norm": 0.000561856955755502, "learning_rate": 4.858391981115759e-07, "loss": 0.0, "num_input_tokens_seen": 97211152, "step": 144230 }, { "epoch": 3.5236850462951654, "grad_norm": 0.511713445186615, "learning_rate": 4.857660579121164e-07, "loss": 0.0003, "num_input_tokens_seen": 97214544, "step": 144235 }, { "epoch": 3.523807197127012, "grad_norm": 0.0024884133599698544, "learning_rate": 4.856929214523047e-07, "loss": 0.0, "num_input_tokens_seen": 97218256, "step": 144240 }, { "epoch": 3.52392934795886, "grad_norm": 0.0004282156878616661, "learning_rate": 4.856197887326726e-07, "loss": 0.0671, "num_input_tokens_seen": 97221456, "step": 144245 }, { "epoch": 3.5240514987907066, "grad_norm": 0.00036971422377973795, "learning_rate": 4.855466597537514e-07, "loss": 0.0, "num_input_tokens_seen": 97224656, "step": 144250 }, { "epoch": 3.524173649622554, "grad_norm": 0.003901298623532057, "learning_rate": 4.854735345160736e-07, "loss": 0.0, "num_input_tokens_seen": 97227984, "step": 144255 }, { "epoch": 3.524295800454401, "grad_norm": 0.0001063659947249107, "learning_rate": 4.854004130201704e-07, "loss": 0.0965, "num_input_tokens_seen": 97230992, "step": 144260 }, { "epoch": 3.524417951286248, "grad_norm": 0.0028174621984362602, "learning_rate": 4.853272952665737e-07, "loss": 0.0, "num_input_tokens_seen": 97234064, "step": 144265 }, { "epoch": 3.5245401021180953, "grad_norm": 0.054041311144828796, "learning_rate": 4.852541812558158e-07, "loss": 0.0, "num_input_tokens_seen": 97237328, "step": 144270 }, { "epoch": 3.5246622529499425, "grad_norm": 0.0009716590284369886, "learning_rate": 4.851810709884274e-07, "loss": 0.0739, "num_input_tokens_seen": 97240592, "step": 144275 }, { "epoch": 3.5247844037817897, "grad_norm": 0.005417170003056526, "learning_rate": 4.851079644649412e-07, "loss": 0.0003, "num_input_tokens_seen": 97243856, "step": 144280 }, { "epoch": 3.524906554613637, "grad_norm": 0.0002540026616770774, "learning_rate": 4.850348616858881e-07, "loss": 0.0, "num_input_tokens_seen": 97247312, "step": 144285 }, { "epoch": 3.525028705445484, "grad_norm": 0.010472937487065792, "learning_rate": 4.849617626518002e-07, "loss": 0.0653, "num_input_tokens_seen": 97251216, "step": 144290 }, { "epoch": 3.5251508562773313, "grad_norm": 0.0028327424079179764, "learning_rate": 4.848886673632086e-07, "loss": 0.0, "num_input_tokens_seen": 97254416, "step": 144295 }, { "epoch": 3.5252730071091785, "grad_norm": 0.010983242653310299, "learning_rate": 4.848155758206452e-07, "loss": 0.0, "num_input_tokens_seen": 97257872, "step": 144300 }, { "epoch": 3.5253951579410256, "grad_norm": 0.013581220991909504, "learning_rate": 4.847424880246417e-07, "loss": 0.0738, "num_input_tokens_seen": 97261072, "step": 144305 }, { "epoch": 3.525517308772873, "grad_norm": 0.0014813852030783892, "learning_rate": 4.846694039757292e-07, "loss": 0.0001, "num_input_tokens_seen": 97264464, "step": 144310 }, { "epoch": 3.52563945960472, "grad_norm": 0.0008495126385241747, "learning_rate": 4.845963236744397e-07, "loss": 0.0001, "num_input_tokens_seen": 97267728, "step": 144315 }, { "epoch": 3.525761610436567, "grad_norm": 0.0009643576922826469, "learning_rate": 4.845232471213045e-07, "loss": 0.0, "num_input_tokens_seen": 97270800, "step": 144320 }, { "epoch": 3.525883761268414, "grad_norm": 0.24121108651161194, "learning_rate": 4.844501743168543e-07, "loss": 0.0001, "num_input_tokens_seen": 97274768, "step": 144325 }, { "epoch": 3.5260059121002616, "grad_norm": 0.00016662731650285423, "learning_rate": 4.843771052616216e-07, "loss": 0.0625, "num_input_tokens_seen": 97278608, "step": 144330 }, { "epoch": 3.5261280629321083, "grad_norm": 0.01039132196456194, "learning_rate": 4.843040399561369e-07, "loss": 0.0, "num_input_tokens_seen": 97282960, "step": 144335 }, { "epoch": 3.526250213763956, "grad_norm": 23.242433547973633, "learning_rate": 4.842309784009323e-07, "loss": 0.0389, "num_input_tokens_seen": 97285840, "step": 144340 }, { "epoch": 3.5263723645958027, "grad_norm": 0.00015703374810982496, "learning_rate": 4.841579205965384e-07, "loss": 0.0, "num_input_tokens_seen": 97289168, "step": 144345 }, { "epoch": 3.52649451542765, "grad_norm": 0.035092271864414215, "learning_rate": 4.840848665434872e-07, "loss": 0.0, "num_input_tokens_seen": 97292240, "step": 144350 }, { "epoch": 3.526616666259497, "grad_norm": 0.004903553519397974, "learning_rate": 4.840118162423092e-07, "loss": 0.0, "num_input_tokens_seen": 97295632, "step": 144355 }, { "epoch": 3.5267388170913443, "grad_norm": 0.0005345535464584827, "learning_rate": 4.839387696935361e-07, "loss": 0.0, "num_input_tokens_seen": 97298832, "step": 144360 }, { "epoch": 3.5268609679231915, "grad_norm": 0.023823775351047516, "learning_rate": 4.838657268976994e-07, "loss": 0.0001, "num_input_tokens_seen": 97301968, "step": 144365 }, { "epoch": 3.5269831187550387, "grad_norm": 0.00848571676760912, "learning_rate": 4.837926878553296e-07, "loss": 0.0, "num_input_tokens_seen": 97305040, "step": 144370 }, { "epoch": 3.527105269586886, "grad_norm": 0.00041069305734708905, "learning_rate": 4.837196525669586e-07, "loss": 0.0, "num_input_tokens_seen": 97309008, "step": 144375 }, { "epoch": 3.527227420418733, "grad_norm": 0.024357300251722336, "learning_rate": 4.836466210331168e-07, "loss": 0.0, "num_input_tokens_seen": 97312656, "step": 144380 }, { "epoch": 3.5273495712505802, "grad_norm": 36.91389465332031, "learning_rate": 4.83573593254336e-07, "loss": 0.0573, "num_input_tokens_seen": 97315792, "step": 144385 }, { "epoch": 3.5274717220824274, "grad_norm": 0.003210177179425955, "learning_rate": 4.835005692311466e-07, "loss": 0.0, "num_input_tokens_seen": 97319952, "step": 144390 }, { "epoch": 3.5275938729142746, "grad_norm": 6.834761734353378e-05, "learning_rate": 4.834275489640799e-07, "loss": 0.0852, "num_input_tokens_seen": 97323856, "step": 144395 }, { "epoch": 3.527716023746122, "grad_norm": 0.001592350541613996, "learning_rate": 4.833545324536674e-07, "loss": 0.0, "num_input_tokens_seen": 97327696, "step": 144400 }, { "epoch": 3.527838174577969, "grad_norm": 0.0010718260891735554, "learning_rate": 4.832815197004394e-07, "loss": 0.0004, "num_input_tokens_seen": 97330640, "step": 144405 }, { "epoch": 3.527960325409816, "grad_norm": 0.09385642409324646, "learning_rate": 4.832085107049275e-07, "loss": 0.0, "num_input_tokens_seen": 97333840, "step": 144410 }, { "epoch": 3.5280824762416634, "grad_norm": 0.003379811067134142, "learning_rate": 4.83135505467662e-07, "loss": 0.0, "num_input_tokens_seen": 97337168, "step": 144415 }, { "epoch": 3.52820462707351, "grad_norm": 0.08003751188516617, "learning_rate": 4.830625039891744e-07, "loss": 0.0, "num_input_tokens_seen": 97340560, "step": 144420 }, { "epoch": 3.5283267779053578, "grad_norm": 0.0010598560329526663, "learning_rate": 4.82989506269995e-07, "loss": 0.0, "num_input_tokens_seen": 97344016, "step": 144425 }, { "epoch": 3.5284489287372045, "grad_norm": 0.000324626627843827, "learning_rate": 4.829165123106552e-07, "loss": 0.0348, "num_input_tokens_seen": 97347280, "step": 144430 }, { "epoch": 3.528571079569052, "grad_norm": 0.009058714844286442, "learning_rate": 4.828435221116858e-07, "loss": 0.0001, "num_input_tokens_seen": 97350160, "step": 144435 }, { "epoch": 3.528693230400899, "grad_norm": 0.03952280059456825, "learning_rate": 4.827705356736169e-07, "loss": 0.0, "num_input_tokens_seen": 97353296, "step": 144440 }, { "epoch": 3.528815381232746, "grad_norm": 0.005792593117803335, "learning_rate": 4.826975529969802e-07, "loss": 0.0, "num_input_tokens_seen": 97356432, "step": 144445 }, { "epoch": 3.5289375320645933, "grad_norm": 0.0001956968626473099, "learning_rate": 4.826245740823056e-07, "loss": 0.0001, "num_input_tokens_seen": 97359824, "step": 144450 }, { "epoch": 3.5290596828964405, "grad_norm": 0.00083877460565418, "learning_rate": 4.825515989301244e-07, "loss": 0.0, "num_input_tokens_seen": 97363536, "step": 144455 }, { "epoch": 3.5291818337282876, "grad_norm": 0.00042945987661369145, "learning_rate": 4.824786275409675e-07, "loss": 0.0, "num_input_tokens_seen": 97366800, "step": 144460 }, { "epoch": 3.529303984560135, "grad_norm": 0.013885509222745895, "learning_rate": 4.824056599153646e-07, "loss": 0.0, "num_input_tokens_seen": 97370256, "step": 144465 }, { "epoch": 3.529426135391982, "grad_norm": 0.9156922101974487, "learning_rate": 4.823326960538476e-07, "loss": 0.0, "num_input_tokens_seen": 97373392, "step": 144470 }, { "epoch": 3.529548286223829, "grad_norm": 0.004656149540096521, "learning_rate": 4.82259735956946e-07, "loss": 0.0, "num_input_tokens_seen": 97377104, "step": 144475 }, { "epoch": 3.5296704370556764, "grad_norm": 0.0002942743303719908, "learning_rate": 4.821867796251908e-07, "loss": 0.0, "num_input_tokens_seen": 97380368, "step": 144480 }, { "epoch": 3.5297925878875236, "grad_norm": 0.004940942395478487, "learning_rate": 4.82113827059113e-07, "loss": 0.0, "num_input_tokens_seen": 97383568, "step": 144485 }, { "epoch": 3.529914738719371, "grad_norm": 0.0011132443323731422, "learning_rate": 4.820408782592425e-07, "loss": 0.0002, "num_input_tokens_seen": 97387024, "step": 144490 }, { "epoch": 3.530036889551218, "grad_norm": 0.001001048949547112, "learning_rate": 4.819679332261104e-07, "loss": 0.0, "num_input_tokens_seen": 97390224, "step": 144495 }, { "epoch": 3.530159040383065, "grad_norm": 0.00019185699056833982, "learning_rate": 4.818949919602465e-07, "loss": 0.0, "num_input_tokens_seen": 97393296, "step": 144500 }, { "epoch": 3.530281191214912, "grad_norm": 0.0046676155179739, "learning_rate": 4.818220544621817e-07, "loss": 0.0, "num_input_tokens_seen": 97396496, "step": 144505 }, { "epoch": 3.5304033420467595, "grad_norm": 0.26163390278816223, "learning_rate": 4.817491207324461e-07, "loss": 0.0001, "num_input_tokens_seen": 97399760, "step": 144510 }, { "epoch": 3.5305254928786063, "grad_norm": 7.951348379720002e-05, "learning_rate": 4.816761907715702e-07, "loss": 0.0, "num_input_tokens_seen": 97403152, "step": 144515 }, { "epoch": 3.530647643710454, "grad_norm": 0.0023116308730095625, "learning_rate": 4.81603264580085e-07, "loss": 0.0, "num_input_tokens_seen": 97406736, "step": 144520 }, { "epoch": 3.5307697945423007, "grad_norm": 0.0011757491156458855, "learning_rate": 4.8153034215852e-07, "loss": 0.0001, "num_input_tokens_seen": 97409744, "step": 144525 }, { "epoch": 3.530891945374148, "grad_norm": 0.0014412114396691322, "learning_rate": 4.814574235074056e-07, "loss": 0.0, "num_input_tokens_seen": 97413328, "step": 144530 }, { "epoch": 3.531014096205995, "grad_norm": 0.0006095138960517943, "learning_rate": 4.813845086272727e-07, "loss": 0.0003, "num_input_tokens_seen": 97416720, "step": 144535 }, { "epoch": 3.5311362470378422, "grad_norm": 0.0010900633642449975, "learning_rate": 4.813115975186512e-07, "loss": 0.0, "num_input_tokens_seen": 97420176, "step": 144540 }, { "epoch": 3.5312583978696894, "grad_norm": 0.0006364626460708678, "learning_rate": 4.812386901820708e-07, "loss": 0.0, "num_input_tokens_seen": 97423312, "step": 144545 }, { "epoch": 3.5313805487015366, "grad_norm": 0.004302667919546366, "learning_rate": 4.811657866180621e-07, "loss": 0.0, "num_input_tokens_seen": 97426960, "step": 144550 }, { "epoch": 3.531502699533384, "grad_norm": 0.0002200361923314631, "learning_rate": 4.810928868271558e-07, "loss": 0.0, "num_input_tokens_seen": 97429968, "step": 144555 }, { "epoch": 3.531624850365231, "grad_norm": 0.00010742614540504292, "learning_rate": 4.810199908098813e-07, "loss": 0.0, "num_input_tokens_seen": 97433616, "step": 144560 }, { "epoch": 3.531747001197078, "grad_norm": 0.0015959254233166575, "learning_rate": 4.809470985667692e-07, "loss": 0.0001, "num_input_tokens_seen": 97436624, "step": 144565 }, { "epoch": 3.5318691520289254, "grad_norm": 0.00047675054520368576, "learning_rate": 4.808742100983492e-07, "loss": 0.0, "num_input_tokens_seen": 97440144, "step": 144570 }, { "epoch": 3.5319913028607726, "grad_norm": 0.002196983899921179, "learning_rate": 4.808013254051514e-07, "loss": 0.0947, "num_input_tokens_seen": 97443856, "step": 144575 }, { "epoch": 3.5321134536926198, "grad_norm": 0.0001492752635385841, "learning_rate": 4.807284444877066e-07, "loss": 0.0, "num_input_tokens_seen": 97447184, "step": 144580 }, { "epoch": 3.532235604524467, "grad_norm": 0.0891515240073204, "learning_rate": 4.806555673465437e-07, "loss": 0.0001, "num_input_tokens_seen": 97450448, "step": 144585 }, { "epoch": 3.532357755356314, "grad_norm": 0.002806537551805377, "learning_rate": 4.805826939821937e-07, "loss": 0.0, "num_input_tokens_seen": 97453520, "step": 144590 }, { "epoch": 3.5324799061881613, "grad_norm": 0.0002000067033804953, "learning_rate": 4.805098243951855e-07, "loss": 0.0, "num_input_tokens_seen": 97456656, "step": 144595 }, { "epoch": 3.532602057020008, "grad_norm": 0.0005195115809328854, "learning_rate": 4.8043695858605e-07, "loss": 0.0, "num_input_tokens_seen": 97459728, "step": 144600 }, { "epoch": 3.5327242078518557, "grad_norm": 0.025334253907203674, "learning_rate": 4.803640965553164e-07, "loss": 0.0002, "num_input_tokens_seen": 97462736, "step": 144605 }, { "epoch": 3.5328463586837024, "grad_norm": 0.00040619299397803843, "learning_rate": 4.802912383035148e-07, "loss": 0.0003, "num_input_tokens_seen": 97465808, "step": 144610 }, { "epoch": 3.5329685095155496, "grad_norm": 0.00015731289749965072, "learning_rate": 4.802183838311755e-07, "loss": 0.0, "num_input_tokens_seen": 97469456, "step": 144615 }, { "epoch": 3.533090660347397, "grad_norm": 0.0008111689821816981, "learning_rate": 4.801455331388275e-07, "loss": 0.0, "num_input_tokens_seen": 97472720, "step": 144620 }, { "epoch": 3.533212811179244, "grad_norm": 0.013854150660336018, "learning_rate": 4.800726862270014e-07, "loss": 0.0, "num_input_tokens_seen": 97475920, "step": 144625 }, { "epoch": 3.533334962011091, "grad_norm": 0.0011478314409032464, "learning_rate": 4.799998430962267e-07, "loss": 0.0316, "num_input_tokens_seen": 97479248, "step": 144630 }, { "epoch": 3.5334571128429384, "grad_norm": 0.0010483600199222565, "learning_rate": 4.799270037470324e-07, "loss": 0.0, "num_input_tokens_seen": 97482768, "step": 144635 }, { "epoch": 3.5335792636747856, "grad_norm": 0.013378694653511047, "learning_rate": 4.798541681799494e-07, "loss": 0.0, "num_input_tokens_seen": 97485968, "step": 144640 }, { "epoch": 3.5337014145066328, "grad_norm": 0.006969807669520378, "learning_rate": 4.797813363955064e-07, "loss": 0.0, "num_input_tokens_seen": 97489296, "step": 144645 }, { "epoch": 3.53382356533848, "grad_norm": 0.00013974278408568352, "learning_rate": 4.797085083942336e-07, "loss": 0.0, "num_input_tokens_seen": 97492816, "step": 144650 }, { "epoch": 3.533945716170327, "grad_norm": 0.24057985842227936, "learning_rate": 4.796356841766602e-07, "loss": 0.0001, "num_input_tokens_seen": 97496272, "step": 144655 }, { "epoch": 3.5340678670021743, "grad_norm": 0.005007751751691103, "learning_rate": 4.795628637433165e-07, "loss": 0.0, "num_input_tokens_seen": 97499792, "step": 144660 }, { "epoch": 3.5341900178340215, "grad_norm": 0.002173068467527628, "learning_rate": 4.794900470947312e-07, "loss": 0.0, "num_input_tokens_seen": 97503504, "step": 144665 }, { "epoch": 3.5343121686658687, "grad_norm": 0.00048343787784688175, "learning_rate": 4.794172342314345e-07, "loss": 0.0, "num_input_tokens_seen": 97506768, "step": 144670 }, { "epoch": 3.534434319497716, "grad_norm": 35.95384979248047, "learning_rate": 4.793444251539558e-07, "loss": 0.1554, "num_input_tokens_seen": 97509968, "step": 144675 }, { "epoch": 3.534556470329563, "grad_norm": 0.0020706241484731436, "learning_rate": 4.792716198628242e-07, "loss": 0.0002, "num_input_tokens_seen": 97513616, "step": 144680 }, { "epoch": 3.53467862116141, "grad_norm": 0.0008316741441376507, "learning_rate": 4.791988183585697e-07, "loss": 0.0001, "num_input_tokens_seen": 97517392, "step": 144685 }, { "epoch": 3.5348007719932575, "grad_norm": 0.0002605855988804251, "learning_rate": 4.791260206417212e-07, "loss": 0.0001, "num_input_tokens_seen": 97520912, "step": 144690 }, { "epoch": 3.5349229228251042, "grad_norm": 0.046921804547309875, "learning_rate": 4.790532267128088e-07, "loss": 0.0, "num_input_tokens_seen": 97524304, "step": 144695 }, { "epoch": 3.535045073656952, "grad_norm": 0.00059171934844926, "learning_rate": 4.78980436572361e-07, "loss": 0.0, "num_input_tokens_seen": 97528592, "step": 144700 }, { "epoch": 3.5351672244887986, "grad_norm": 4.206080120638944e-05, "learning_rate": 4.789076502209077e-07, "loss": 0.0, "num_input_tokens_seen": 97532624, "step": 144705 }, { "epoch": 3.535289375320646, "grad_norm": 0.004249978344887495, "learning_rate": 4.788348676589784e-07, "loss": 0.0, "num_input_tokens_seen": 97535760, "step": 144710 }, { "epoch": 3.535411526152493, "grad_norm": 0.0190388523042202, "learning_rate": 4.787620888871018e-07, "loss": 0.0815, "num_input_tokens_seen": 97539216, "step": 144715 }, { "epoch": 3.53553367698434, "grad_norm": 0.0013153469190001488, "learning_rate": 4.786893139058078e-07, "loss": 0.0, "num_input_tokens_seen": 97542736, "step": 144720 }, { "epoch": 3.5356558278161874, "grad_norm": 0.0047118207439780235, "learning_rate": 4.78616542715625e-07, "loss": 0.0, "num_input_tokens_seen": 97545936, "step": 144725 }, { "epoch": 3.5357779786480346, "grad_norm": 0.0007281777216121554, "learning_rate": 4.785437753170832e-07, "loss": 0.0, "num_input_tokens_seen": 97549136, "step": 144730 }, { "epoch": 3.5359001294798817, "grad_norm": 0.20080366730690002, "learning_rate": 4.784710117107112e-07, "loss": 0.0, "num_input_tokens_seen": 97552464, "step": 144735 }, { "epoch": 3.536022280311729, "grad_norm": 0.0007743925671093166, "learning_rate": 4.783982518970384e-07, "loss": 0.0, "num_input_tokens_seen": 97556112, "step": 144740 }, { "epoch": 3.536144431143576, "grad_norm": 0.16999106109142303, "learning_rate": 4.783254958765939e-07, "loss": 0.0, "num_input_tokens_seen": 97559440, "step": 144745 }, { "epoch": 3.5362665819754233, "grad_norm": 0.0008060486288741231, "learning_rate": 4.782527436499063e-07, "loss": 0.0, "num_input_tokens_seen": 97562576, "step": 144750 }, { "epoch": 3.5363887328072705, "grad_norm": 0.02622039243578911, "learning_rate": 4.781799952175056e-07, "loss": 0.0, "num_input_tokens_seen": 97566160, "step": 144755 }, { "epoch": 3.5365108836391177, "grad_norm": 8.060546679189429e-05, "learning_rate": 4.781072505799197e-07, "loss": 0.0, "num_input_tokens_seen": 97570064, "step": 144760 }, { "epoch": 3.536633034470965, "grad_norm": 0.007018039468675852, "learning_rate": 4.780345097376784e-07, "loss": 0.0003, "num_input_tokens_seen": 97573200, "step": 144765 }, { "epoch": 3.536755185302812, "grad_norm": 0.008267217315733433, "learning_rate": 4.779617726913109e-07, "loss": 0.0829, "num_input_tokens_seen": 97576528, "step": 144770 }, { "epoch": 3.5368773361346593, "grad_norm": 0.001004086690954864, "learning_rate": 4.778890394413454e-07, "loss": 0.0, "num_input_tokens_seen": 97580176, "step": 144775 }, { "epoch": 3.536999486966506, "grad_norm": 0.0035942229442298412, "learning_rate": 4.778163099883117e-07, "loss": 0.0003, "num_input_tokens_seen": 97583440, "step": 144780 }, { "epoch": 3.5371216377983536, "grad_norm": 0.002467522630468011, "learning_rate": 4.777435843327377e-07, "loss": 0.0, "num_input_tokens_seen": 97586896, "step": 144785 }, { "epoch": 3.5372437886302004, "grad_norm": 0.00476666959002614, "learning_rate": 4.776708624751535e-07, "loss": 0.0, "num_input_tokens_seen": 97590160, "step": 144790 }, { "epoch": 3.5373659394620476, "grad_norm": 4.979291043127887e-05, "learning_rate": 4.775981444160865e-07, "loss": 0.0846, "num_input_tokens_seen": 97593552, "step": 144795 }, { "epoch": 3.5374880902938948, "grad_norm": 0.0007779115112498403, "learning_rate": 4.775254301560666e-07, "loss": 0.0, "num_input_tokens_seen": 97596816, "step": 144800 }, { "epoch": 3.537610241125742, "grad_norm": 0.6420580148696899, "learning_rate": 4.774527196956226e-07, "loss": 0.0005, "num_input_tokens_seen": 97600080, "step": 144805 }, { "epoch": 3.537732391957589, "grad_norm": 0.0033624963834881783, "learning_rate": 4.773800130352825e-07, "loss": 0.0, "num_input_tokens_seen": 97603152, "step": 144810 }, { "epoch": 3.5378545427894363, "grad_norm": 0.002699097152799368, "learning_rate": 4.77307310175576e-07, "loss": 0.0, "num_input_tokens_seen": 97606608, "step": 144815 }, { "epoch": 3.5379766936212835, "grad_norm": 0.00019098362827207893, "learning_rate": 4.772346111170309e-07, "loss": 0.0, "num_input_tokens_seen": 97609872, "step": 144820 }, { "epoch": 3.5380988444531307, "grad_norm": 0.028840744867920876, "learning_rate": 4.771619158601764e-07, "loss": 0.0, "num_input_tokens_seen": 97613328, "step": 144825 }, { "epoch": 3.538220995284978, "grad_norm": 0.011287868022918701, "learning_rate": 4.770892244055413e-07, "loss": 0.0, "num_input_tokens_seen": 97616592, "step": 144830 }, { "epoch": 3.538343146116825, "grad_norm": 0.02073443867266178, "learning_rate": 4.770165367536541e-07, "loss": 0.0, "num_input_tokens_seen": 97620240, "step": 144835 }, { "epoch": 3.5384652969486723, "grad_norm": 0.0004936923505738378, "learning_rate": 4.76943852905043e-07, "loss": 0.0001, "num_input_tokens_seen": 97623824, "step": 144840 }, { "epoch": 3.5385874477805195, "grad_norm": 0.0005841117817908525, "learning_rate": 4.768711728602371e-07, "loss": 0.0, "num_input_tokens_seen": 97626896, "step": 144845 }, { "epoch": 3.5387095986123667, "grad_norm": 0.0016818649601191282, "learning_rate": 4.767984966197649e-07, "loss": 0.0001, "num_input_tokens_seen": 97630224, "step": 144850 }, { "epoch": 3.538831749444214, "grad_norm": 0.007975401356816292, "learning_rate": 4.767258241841543e-07, "loss": 0.0, "num_input_tokens_seen": 97633552, "step": 144855 }, { "epoch": 3.538953900276061, "grad_norm": 0.03557858616113663, "learning_rate": 4.766531555539343e-07, "loss": 0.0, "num_input_tokens_seen": 97637648, "step": 144860 }, { "epoch": 3.539076051107908, "grad_norm": 0.0010234951041638851, "learning_rate": 4.7658049072963357e-07, "loss": 0.0, "num_input_tokens_seen": 97641488, "step": 144865 }, { "epoch": 3.5391982019397554, "grad_norm": 0.3270286023616791, "learning_rate": 4.7650782971178003e-07, "loss": 0.0, "num_input_tokens_seen": 97644432, "step": 144870 }, { "epoch": 3.539320352771602, "grad_norm": 0.0560191385447979, "learning_rate": 4.764351725009027e-07, "loss": 0.0698, "num_input_tokens_seen": 97647632, "step": 144875 }, { "epoch": 3.53944250360345, "grad_norm": 0.004076603800058365, "learning_rate": 4.763625190975292e-07, "loss": 0.0002, "num_input_tokens_seen": 97650896, "step": 144880 }, { "epoch": 3.5395646544352966, "grad_norm": 0.0007098827045410872, "learning_rate": 4.762898695021884e-07, "loss": 0.0501, "num_input_tokens_seen": 97653776, "step": 144885 }, { "epoch": 3.5396868052671437, "grad_norm": 0.009917641058564186, "learning_rate": 4.7621722371540884e-07, "loss": 0.0325, "num_input_tokens_seen": 97656976, "step": 144890 }, { "epoch": 3.539808956098991, "grad_norm": 0.010203611105680466, "learning_rate": 4.7614458173771807e-07, "loss": 0.0, "num_input_tokens_seen": 97660560, "step": 144895 }, { "epoch": 3.539931106930838, "grad_norm": 0.004964059218764305, "learning_rate": 4.760719435696453e-07, "loss": 0.0, "num_input_tokens_seen": 97664080, "step": 144900 }, { "epoch": 3.5400532577626853, "grad_norm": 0.0006427750922739506, "learning_rate": 4.759993092117178e-07, "loss": 0.0, "num_input_tokens_seen": 97667472, "step": 144905 }, { "epoch": 3.5401754085945325, "grad_norm": 0.004524386487901211, "learning_rate": 4.759266786644648e-07, "loss": 0.0371, "num_input_tokens_seen": 97670672, "step": 144910 }, { "epoch": 3.5402975594263797, "grad_norm": 8.596708357799798e-05, "learning_rate": 4.7585405192841343e-07, "loss": 0.0, "num_input_tokens_seen": 97674000, "step": 144915 }, { "epoch": 3.540419710258227, "grad_norm": 0.01520247757434845, "learning_rate": 4.7578142900409237e-07, "loss": 0.0, "num_input_tokens_seen": 97677392, "step": 144920 }, { "epoch": 3.540541861090074, "grad_norm": 0.0006964769563637674, "learning_rate": 4.7570880989203023e-07, "loss": 0.1114, "num_input_tokens_seen": 97680784, "step": 144925 }, { "epoch": 3.5406640119219213, "grad_norm": 0.0032320874743163586, "learning_rate": 4.756361945927542e-07, "loss": 0.0, "num_input_tokens_seen": 97684048, "step": 144930 }, { "epoch": 3.5407861627537685, "grad_norm": 0.0014720155159011483, "learning_rate": 4.755635831067931e-07, "loss": 0.0224, "num_input_tokens_seen": 97687376, "step": 144935 }, { "epoch": 3.5409083135856156, "grad_norm": 0.00014576480316463858, "learning_rate": 4.754909754346748e-07, "loss": 0.0, "num_input_tokens_seen": 97691088, "step": 144940 }, { "epoch": 3.541030464417463, "grad_norm": 0.0016071463469415903, "learning_rate": 4.7541837157692676e-07, "loss": 0.0, "num_input_tokens_seen": 97694736, "step": 144945 }, { "epoch": 3.5411526152493096, "grad_norm": 0.027005070820450783, "learning_rate": 4.753457715340778e-07, "loss": 0.0002, "num_input_tokens_seen": 97698064, "step": 144950 }, { "epoch": 3.541274766081157, "grad_norm": 0.018544316291809082, "learning_rate": 4.752731753066552e-07, "loss": 0.0, "num_input_tokens_seen": 97701392, "step": 144955 }, { "epoch": 3.541396916913004, "grad_norm": 0.025899503380060196, "learning_rate": 4.7520058289518747e-07, "loss": 0.0, "num_input_tokens_seen": 97704720, "step": 144960 }, { "epoch": 3.5415190677448516, "grad_norm": 0.0339164175093174, "learning_rate": 4.75127994300202e-07, "loss": 0.0, "num_input_tokens_seen": 97707856, "step": 144965 }, { "epoch": 3.5416412185766983, "grad_norm": 0.01129910722374916, "learning_rate": 4.7505540952222725e-07, "loss": 0.0, "num_input_tokens_seen": 97711312, "step": 144970 }, { "epoch": 3.5417633694085455, "grad_norm": 0.001934434287250042, "learning_rate": 4.749828285617904e-07, "loss": 0.0, "num_input_tokens_seen": 97714704, "step": 144975 }, { "epoch": 3.5418855202403927, "grad_norm": 0.0007302694721147418, "learning_rate": 4.7491025141941955e-07, "loss": 0.0348, "num_input_tokens_seen": 97718032, "step": 144980 }, { "epoch": 3.54200767107224, "grad_norm": 0.0018314080080017447, "learning_rate": 4.74837678095643e-07, "loss": 0.0, "num_input_tokens_seen": 97721360, "step": 144985 }, { "epoch": 3.542129821904087, "grad_norm": 0.0010636234655976295, "learning_rate": 4.7476510859098775e-07, "loss": 0.0, "num_input_tokens_seen": 97724560, "step": 144990 }, { "epoch": 3.5422519727359343, "grad_norm": 0.01839928887784481, "learning_rate": 4.7469254290598224e-07, "loss": 0.0001, "num_input_tokens_seen": 97728272, "step": 144995 }, { "epoch": 3.5423741235677815, "grad_norm": 0.00023793902073521167, "learning_rate": 4.7461998104115355e-07, "loss": 0.0, "num_input_tokens_seen": 97731920, "step": 145000 }, { "epoch": 3.5424962743996287, "grad_norm": 0.0005881048273295164, "learning_rate": 4.7454742299703e-07, "loss": 0.0, "num_input_tokens_seen": 97734928, "step": 145005 }, { "epoch": 3.542618425231476, "grad_norm": 0.0031892170663923025, "learning_rate": 4.744748687741386e-07, "loss": 0.0, "num_input_tokens_seen": 97738000, "step": 145010 }, { "epoch": 3.542740576063323, "grad_norm": 0.0010000730399042368, "learning_rate": 4.7440231837300716e-07, "loss": 0.0, "num_input_tokens_seen": 97741904, "step": 145015 }, { "epoch": 3.5428627268951702, "grad_norm": 0.0004860509361606091, "learning_rate": 4.743297717941639e-07, "loss": 0.0, "num_input_tokens_seen": 97744912, "step": 145020 }, { "epoch": 3.5429848777270174, "grad_norm": 0.0049315025098621845, "learning_rate": 4.7425722903813556e-07, "loss": 0.0, "num_input_tokens_seen": 97748112, "step": 145025 }, { "epoch": 3.5431070285588646, "grad_norm": 0.0005576725816354156, "learning_rate": 4.7418469010545036e-07, "loss": 0.0, "num_input_tokens_seen": 97751440, "step": 145030 }, { "epoch": 3.543229179390712, "grad_norm": 0.01474781148135662, "learning_rate": 4.7411215499663525e-07, "loss": 0.0, "num_input_tokens_seen": 97754640, "step": 145035 }, { "epoch": 3.543351330222559, "grad_norm": 0.0008148238994181156, "learning_rate": 4.7403962371221837e-07, "loss": 0.0, "num_input_tokens_seen": 97758032, "step": 145040 }, { "epoch": 3.5434734810544057, "grad_norm": 0.002185927936807275, "learning_rate": 4.7396709625272636e-07, "loss": 0.0, "num_input_tokens_seen": 97761680, "step": 145045 }, { "epoch": 3.5435956318862534, "grad_norm": 0.08207400143146515, "learning_rate": 4.738945726186875e-07, "loss": 0.0, "num_input_tokens_seen": 97765264, "step": 145050 }, { "epoch": 3.5437177827181, "grad_norm": 0.0013452256098389626, "learning_rate": 4.738220528106288e-07, "loss": 0.0, "num_input_tokens_seen": 97768336, "step": 145055 }, { "epoch": 3.5438399335499478, "grad_norm": 0.0007345146732404828, "learning_rate": 4.7374953682907736e-07, "loss": 0.0001, "num_input_tokens_seen": 97771536, "step": 145060 }, { "epoch": 3.5439620843817945, "grad_norm": 0.002797122113406658, "learning_rate": 4.7367702467456115e-07, "loss": 0.0, "num_input_tokens_seen": 97774544, "step": 145065 }, { "epoch": 3.5440842352136417, "grad_norm": 0.00030433552456088364, "learning_rate": 4.736045163476068e-07, "loss": 0.0, "num_input_tokens_seen": 97777808, "step": 145070 }, { "epoch": 3.544206386045489, "grad_norm": 0.0008012225735001266, "learning_rate": 4.73532011848742e-07, "loss": 0.0, "num_input_tokens_seen": 97781392, "step": 145075 }, { "epoch": 3.544328536877336, "grad_norm": 0.0005928860628046095, "learning_rate": 4.734595111784945e-07, "loss": 0.0663, "num_input_tokens_seen": 97784528, "step": 145080 }, { "epoch": 3.5444506877091833, "grad_norm": 0.0003491929383017123, "learning_rate": 4.733870143373905e-07, "loss": 0.0001, "num_input_tokens_seen": 97787664, "step": 145085 }, { "epoch": 3.5445728385410304, "grad_norm": 0.016333796083927155, "learning_rate": 4.7331452132595827e-07, "loss": 0.0, "num_input_tokens_seen": 97790544, "step": 145090 }, { "epoch": 3.5446949893728776, "grad_norm": 0.001521604834124446, "learning_rate": 4.7324203214472403e-07, "loss": 0.0, "num_input_tokens_seen": 97794000, "step": 145095 }, { "epoch": 3.544817140204725, "grad_norm": 0.13567009568214417, "learning_rate": 4.731695467942158e-07, "loss": 0.0, "num_input_tokens_seen": 97797392, "step": 145100 }, { "epoch": 3.544939291036572, "grad_norm": 0.00036377584910951555, "learning_rate": 4.730970652749601e-07, "loss": 0.0, "num_input_tokens_seen": 97800400, "step": 145105 }, { "epoch": 3.545061441868419, "grad_norm": 0.00028549678972922266, "learning_rate": 4.730245875874841e-07, "loss": 0.0, "num_input_tokens_seen": 97803536, "step": 145110 }, { "epoch": 3.5451835927002664, "grad_norm": 0.002109118504449725, "learning_rate": 4.7295211373231546e-07, "loss": 0.0, "num_input_tokens_seen": 97806672, "step": 145115 }, { "epoch": 3.5453057435321136, "grad_norm": 0.012760130688548088, "learning_rate": 4.7287964370998043e-07, "loss": 0.0, "num_input_tokens_seen": 97810064, "step": 145120 }, { "epoch": 3.5454278943639608, "grad_norm": 0.0016017990419641137, "learning_rate": 4.7280717752100683e-07, "loss": 0.0, "num_input_tokens_seen": 97813584, "step": 145125 }, { "epoch": 3.5455500451958075, "grad_norm": 0.00019777823763433844, "learning_rate": 4.7273471516592076e-07, "loss": 0.0, "num_input_tokens_seen": 97816848, "step": 145130 }, { "epoch": 3.545672196027655, "grad_norm": 0.00033244179212488234, "learning_rate": 4.726622566452497e-07, "loss": 0.0, "num_input_tokens_seen": 97820240, "step": 145135 }, { "epoch": 3.545794346859502, "grad_norm": 0.006279574707150459, "learning_rate": 4.7258980195952103e-07, "loss": 0.0435, "num_input_tokens_seen": 97823184, "step": 145140 }, { "epoch": 3.5459164976913495, "grad_norm": 0.00035050706355832517, "learning_rate": 4.7251735110926103e-07, "loss": 0.0, "num_input_tokens_seen": 97826512, "step": 145145 }, { "epoch": 3.5460386485231963, "grad_norm": 0.013322637416422367, "learning_rate": 4.724449040949965e-07, "loss": 0.0, "num_input_tokens_seen": 97829584, "step": 145150 }, { "epoch": 3.5461607993550435, "grad_norm": 0.0018419913249090314, "learning_rate": 4.723724609172548e-07, "loss": 0.0001, "num_input_tokens_seen": 97832464, "step": 145155 }, { "epoch": 3.5462829501868907, "grad_norm": 0.0012325738789513707, "learning_rate": 4.7230002157656245e-07, "loss": 0.0001, "num_input_tokens_seen": 97835600, "step": 145160 }, { "epoch": 3.546405101018738, "grad_norm": 0.00032701349118724465, "learning_rate": 4.7222758607344593e-07, "loss": 0.1, "num_input_tokens_seen": 97838928, "step": 145165 }, { "epoch": 3.546527251850585, "grad_norm": 0.0011002069804817438, "learning_rate": 4.7215515440843236e-07, "loss": 0.0841, "num_input_tokens_seen": 97842384, "step": 145170 }, { "epoch": 3.5466494026824322, "grad_norm": 0.02300787903368473, "learning_rate": 4.720827265820489e-07, "loss": 0.0, "num_input_tokens_seen": 97845584, "step": 145175 }, { "epoch": 3.5467715535142794, "grad_norm": 0.03971827030181885, "learning_rate": 4.7201030259482146e-07, "loss": 0.0, "num_input_tokens_seen": 97849168, "step": 145180 }, { "epoch": 3.5468937043461266, "grad_norm": 0.08529645949602127, "learning_rate": 4.719378824472774e-07, "loss": 0.0, "num_input_tokens_seen": 97852688, "step": 145185 }, { "epoch": 3.547015855177974, "grad_norm": 0.00528777576982975, "learning_rate": 4.7186546613994283e-07, "loss": 0.0002, "num_input_tokens_seen": 97855888, "step": 145190 }, { "epoch": 3.547138006009821, "grad_norm": 0.002800324000418186, "learning_rate": 4.7179305367334453e-07, "loss": 0.0001, "num_input_tokens_seen": 97859216, "step": 145195 }, { "epoch": 3.547260156841668, "grad_norm": 0.00263691577129066, "learning_rate": 4.7172064504800967e-07, "loss": 0.0, "num_input_tokens_seen": 97862416, "step": 145200 }, { "epoch": 3.5473823076735154, "grad_norm": 0.03326334431767464, "learning_rate": 4.7164824026446405e-07, "loss": 0.0332, "num_input_tokens_seen": 97865488, "step": 145205 }, { "epoch": 3.5475044585053626, "grad_norm": 0.004230371210724115, "learning_rate": 4.7157583932323475e-07, "loss": 0.0001, "num_input_tokens_seen": 97868624, "step": 145210 }, { "epoch": 3.5476266093372097, "grad_norm": 0.0032722600735723972, "learning_rate": 4.7150344222484786e-07, "loss": 0.0001, "num_input_tokens_seen": 97871568, "step": 145215 }, { "epoch": 3.547748760169057, "grad_norm": 0.014357690699398518, "learning_rate": 4.714310489698303e-07, "loss": 0.0, "num_input_tokens_seen": 97874832, "step": 145220 }, { "epoch": 3.5478709110009037, "grad_norm": 0.003965605981647968, "learning_rate": 4.7135865955870803e-07, "loss": 0.062, "num_input_tokens_seen": 97877968, "step": 145225 }, { "epoch": 3.5479930618327513, "grad_norm": 0.0004163807607255876, "learning_rate": 4.7128627399200784e-07, "loss": 0.0, "num_input_tokens_seen": 97881296, "step": 145230 }, { "epoch": 3.548115212664598, "grad_norm": 0.0007733021629974246, "learning_rate": 4.712138922702563e-07, "loss": 0.0001, "num_input_tokens_seen": 97885456, "step": 145235 }, { "epoch": 3.5482373634964453, "grad_norm": 0.01566707342863083, "learning_rate": 4.7114151439397933e-07, "loss": 0.0194, "num_input_tokens_seen": 97888592, "step": 145240 }, { "epoch": 3.5483595143282924, "grad_norm": 0.07242065668106079, "learning_rate": 4.710691403637038e-07, "loss": 0.0002, "num_input_tokens_seen": 97891728, "step": 145245 }, { "epoch": 3.5484816651601396, "grad_norm": 0.005784235429018736, "learning_rate": 4.7099677017995575e-07, "loss": 0.0393, "num_input_tokens_seen": 97894992, "step": 145250 }, { "epoch": 3.548603815991987, "grad_norm": 0.002436217153444886, "learning_rate": 4.7092440384326113e-07, "loss": 0.0001, "num_input_tokens_seen": 97898064, "step": 145255 }, { "epoch": 3.548725966823834, "grad_norm": 0.005221229046583176, "learning_rate": 4.708520413541469e-07, "loss": 0.0001, "num_input_tokens_seen": 97901328, "step": 145260 }, { "epoch": 3.548848117655681, "grad_norm": 0.01007154956459999, "learning_rate": 4.7077968271313863e-07, "loss": 0.0477, "num_input_tokens_seen": 97904464, "step": 145265 }, { "epoch": 3.5489702684875284, "grad_norm": 0.07373490929603577, "learning_rate": 4.707073279207632e-07, "loss": 0.0002, "num_input_tokens_seen": 97907664, "step": 145270 }, { "epoch": 3.5490924193193756, "grad_norm": 0.046758461743593216, "learning_rate": 4.706349769775461e-07, "loss": 0.0536, "num_input_tokens_seen": 97910864, "step": 145275 }, { "epoch": 3.5492145701512228, "grad_norm": 0.0076475548557937145, "learning_rate": 4.705626298840141e-07, "loss": 0.0001, "num_input_tokens_seen": 97914256, "step": 145280 }, { "epoch": 3.54933672098307, "grad_norm": 0.016452349722385406, "learning_rate": 4.7049028664069266e-07, "loss": 0.0004, "num_input_tokens_seen": 97917328, "step": 145285 }, { "epoch": 3.549458871814917, "grad_norm": 0.04465107619762421, "learning_rate": 4.7041794724810846e-07, "loss": 0.0001, "num_input_tokens_seen": 97920464, "step": 145290 }, { "epoch": 3.5495810226467643, "grad_norm": 0.26384222507476807, "learning_rate": 4.703456117067877e-07, "loss": 0.0001, "num_input_tokens_seen": 97923600, "step": 145295 }, { "epoch": 3.5497031734786115, "grad_norm": 0.028273768723011017, "learning_rate": 4.702732800172556e-07, "loss": 0.0, "num_input_tokens_seen": 97926928, "step": 145300 }, { "epoch": 3.5498253243104587, "grad_norm": 0.043451856821775436, "learning_rate": 4.702009521800392e-07, "loss": 0.0365, "num_input_tokens_seen": 97930192, "step": 145305 }, { "epoch": 3.5499474751423055, "grad_norm": 0.002773558022454381, "learning_rate": 4.701286281956636e-07, "loss": 0.0, "num_input_tokens_seen": 97933456, "step": 145310 }, { "epoch": 3.550069625974153, "grad_norm": 0.017502794042229652, "learning_rate": 4.7005630806465547e-07, "loss": 0.0001, "num_input_tokens_seen": 97936592, "step": 145315 }, { "epoch": 3.550191776806, "grad_norm": 0.0033604062628000975, "learning_rate": 4.6998399178754e-07, "loss": 0.0, "num_input_tokens_seen": 97940240, "step": 145320 }, { "epoch": 3.5503139276378475, "grad_norm": 0.007517929654568434, "learning_rate": 4.6991167936484346e-07, "loss": 0.0, "num_input_tokens_seen": 97943632, "step": 145325 }, { "epoch": 3.5504360784696942, "grad_norm": 0.49256661534309387, "learning_rate": 4.698393707970922e-07, "loss": 0.0006, "num_input_tokens_seen": 97947088, "step": 145330 }, { "epoch": 3.5505582293015414, "grad_norm": 0.000612253847066313, "learning_rate": 4.697670660848113e-07, "loss": 0.0, "num_input_tokens_seen": 97950864, "step": 145335 }, { "epoch": 3.5506803801333886, "grad_norm": 0.024863019585609436, "learning_rate": 4.6969476522852726e-07, "loss": 0.0001, "num_input_tokens_seen": 97954384, "step": 145340 }, { "epoch": 3.550802530965236, "grad_norm": 0.017794562503695488, "learning_rate": 4.696224682287652e-07, "loss": 0.0, "num_input_tokens_seen": 97957712, "step": 145345 }, { "epoch": 3.550924681797083, "grad_norm": 0.037794046103954315, "learning_rate": 4.695501750860514e-07, "loss": 0.0, "num_input_tokens_seen": 97960848, "step": 145350 }, { "epoch": 3.55104683262893, "grad_norm": 0.012041272595524788, "learning_rate": 4.694778858009112e-07, "loss": 0.0, "num_input_tokens_seen": 97964560, "step": 145355 }, { "epoch": 3.5511689834607774, "grad_norm": 0.022189682349562645, "learning_rate": 4.694056003738708e-07, "loss": 0.0, "num_input_tokens_seen": 97967184, "step": 145360 }, { "epoch": 3.5512911342926246, "grad_norm": 0.0005078144022263587, "learning_rate": 4.693333188054556e-07, "loss": 0.0, "num_input_tokens_seen": 97970064, "step": 145365 }, { "epoch": 3.5514132851244717, "grad_norm": 73.60726928710938, "learning_rate": 4.692610410961909e-07, "loss": 0.1659, "num_input_tokens_seen": 97973264, "step": 145370 }, { "epoch": 3.551535435956319, "grad_norm": 0.003191084135323763, "learning_rate": 4.6918876724660296e-07, "loss": 0.0, "num_input_tokens_seen": 97976784, "step": 145375 }, { "epoch": 3.551657586788166, "grad_norm": 0.011187938041985035, "learning_rate": 4.691164972572168e-07, "loss": 0.0, "num_input_tokens_seen": 97980048, "step": 145380 }, { "epoch": 3.5517797376200133, "grad_norm": 0.0007194733479991555, "learning_rate": 4.690442311285582e-07, "loss": 0.0, "num_input_tokens_seen": 97983504, "step": 145385 }, { "epoch": 3.5519018884518605, "grad_norm": 0.0015291129238903522, "learning_rate": 4.689719688611532e-07, "loss": 0.0235, "num_input_tokens_seen": 97987216, "step": 145390 }, { "epoch": 3.5520240392837072, "grad_norm": 0.05800214782357216, "learning_rate": 4.6889971045552636e-07, "loss": 0.0, "num_input_tokens_seen": 97990480, "step": 145395 }, { "epoch": 3.552146190115555, "grad_norm": 0.00901162251830101, "learning_rate": 4.6882745591220417e-07, "loss": 0.0, "num_input_tokens_seen": 97993680, "step": 145400 }, { "epoch": 3.5522683409474016, "grad_norm": 0.006305050104856491, "learning_rate": 4.687552052317112e-07, "loss": 0.0, "num_input_tokens_seen": 97997008, "step": 145405 }, { "epoch": 3.5523904917792493, "grad_norm": 0.007316413801163435, "learning_rate": 4.6868295841457363e-07, "loss": 0.0, "num_input_tokens_seen": 98000272, "step": 145410 }, { "epoch": 3.552512642611096, "grad_norm": 0.014707071706652641, "learning_rate": 4.68610715461316e-07, "loss": 0.0001, "num_input_tokens_seen": 98003600, "step": 145415 }, { "epoch": 3.552634793442943, "grad_norm": 0.0018783895066007972, "learning_rate": 4.6853847637246433e-07, "loss": 0.0001, "num_input_tokens_seen": 98006800, "step": 145420 }, { "epoch": 3.5527569442747904, "grad_norm": 0.017941100522875786, "learning_rate": 4.6846624114854415e-07, "loss": 0.06, "num_input_tokens_seen": 98010064, "step": 145425 }, { "epoch": 3.5528790951066376, "grad_norm": 0.001249508699402213, "learning_rate": 4.6839400979008005e-07, "loss": 0.0288, "num_input_tokens_seen": 98013456, "step": 145430 }, { "epoch": 3.5530012459384848, "grad_norm": 0.002429740270599723, "learning_rate": 4.683217822975981e-07, "loss": 0.0606, "num_input_tokens_seen": 98016912, "step": 145435 }, { "epoch": 3.553123396770332, "grad_norm": 0.0017890139715746045, "learning_rate": 4.6824955867162276e-07, "loss": 0.0246, "num_input_tokens_seen": 98020496, "step": 145440 }, { "epoch": 3.553245547602179, "grad_norm": 0.001725839334540069, "learning_rate": 4.681773389126795e-07, "loss": 0.0, "num_input_tokens_seen": 98023632, "step": 145445 }, { "epoch": 3.5533676984340263, "grad_norm": 0.0002874275960493833, "learning_rate": 4.681051230212942e-07, "loss": 0.0, "num_input_tokens_seen": 98027664, "step": 145450 }, { "epoch": 3.5534898492658735, "grad_norm": 0.0017251042881980538, "learning_rate": 4.680329109979916e-07, "loss": 0.0, "num_input_tokens_seen": 98030800, "step": 145455 }, { "epoch": 3.5536120000977207, "grad_norm": 0.004043647553771734, "learning_rate": 4.679607028432961e-07, "loss": 0.0, "num_input_tokens_seen": 98034576, "step": 145460 }, { "epoch": 3.553734150929568, "grad_norm": 0.0002633123949635774, "learning_rate": 4.6788849855773413e-07, "loss": 0.0001, "num_input_tokens_seen": 98037840, "step": 145465 }, { "epoch": 3.553856301761415, "grad_norm": 22.955411911010742, "learning_rate": 4.6781629814183e-07, "loss": 0.0667, "num_input_tokens_seen": 98040976, "step": 145470 }, { "epoch": 3.5539784525932623, "grad_norm": 0.020732754841446877, "learning_rate": 4.6774410159610847e-07, "loss": 0.0, "num_input_tokens_seen": 98043984, "step": 145475 }, { "epoch": 3.5541006034251095, "grad_norm": 0.10230601578950882, "learning_rate": 4.676719089210951e-07, "loss": 0.0344, "num_input_tokens_seen": 98047504, "step": 145480 }, { "epoch": 3.5542227542569567, "grad_norm": 0.0032723443582654, "learning_rate": 4.675997201173151e-07, "loss": 0.0, "num_input_tokens_seen": 98050448, "step": 145485 }, { "epoch": 3.5543449050888034, "grad_norm": 0.013427302241325378, "learning_rate": 4.6752753518529276e-07, "loss": 0.0601, "num_input_tokens_seen": 98053648, "step": 145490 }, { "epoch": 3.554467055920651, "grad_norm": 0.0018329521408304572, "learning_rate": 4.674553541255537e-07, "loss": 0.0024, "num_input_tokens_seen": 98056912, "step": 145495 }, { "epoch": 3.554589206752498, "grad_norm": 0.004841540474444628, "learning_rate": 4.673831769386223e-07, "loss": 0.0, "num_input_tokens_seen": 98060304, "step": 145500 }, { "epoch": 3.5547113575843454, "grad_norm": 0.0034890659153461456, "learning_rate": 4.67311003625024e-07, "loss": 0.0, "num_input_tokens_seen": 98063440, "step": 145505 }, { "epoch": 3.554833508416192, "grad_norm": 0.9182288646697998, "learning_rate": 4.67238834185283e-07, "loss": 0.0002, "num_input_tokens_seen": 98066832, "step": 145510 }, { "epoch": 3.5549556592480394, "grad_norm": 0.012411080300807953, "learning_rate": 4.6716666861992447e-07, "loss": 0.0001, "num_input_tokens_seen": 98069904, "step": 145515 }, { "epoch": 3.5550778100798865, "grad_norm": 0.019958067685365677, "learning_rate": 4.6709450692947363e-07, "loss": 0.0474, "num_input_tokens_seen": 98073680, "step": 145520 }, { "epoch": 3.5551999609117337, "grad_norm": 0.0059996098279953, "learning_rate": 4.670223491144545e-07, "loss": 0.0, "num_input_tokens_seen": 98076752, "step": 145525 }, { "epoch": 3.555322111743581, "grad_norm": 0.0008659661398269236, "learning_rate": 4.6695019517539257e-07, "loss": 0.0501, "num_input_tokens_seen": 98080144, "step": 145530 }, { "epoch": 3.555444262575428, "grad_norm": 0.0021337266080081463, "learning_rate": 4.6687804511281183e-07, "loss": 0.0, "num_input_tokens_seen": 98083344, "step": 145535 }, { "epoch": 3.5555664134072753, "grad_norm": 0.008302521891891956, "learning_rate": 4.668058989272373e-07, "loss": 0.0667, "num_input_tokens_seen": 98086672, "step": 145540 }, { "epoch": 3.5556885642391225, "grad_norm": 0.0024677154142409563, "learning_rate": 4.667337566191941e-07, "loss": 0.0, "num_input_tokens_seen": 98089872, "step": 145545 }, { "epoch": 3.5558107150709697, "grad_norm": 0.05019146203994751, "learning_rate": 4.666616181892061e-07, "loss": 0.0, "num_input_tokens_seen": 98092944, "step": 145550 }, { "epoch": 3.555932865902817, "grad_norm": 0.0019049012335017323, "learning_rate": 4.665894836377986e-07, "loss": 0.0352, "num_input_tokens_seen": 98096016, "step": 145555 }, { "epoch": 3.556055016734664, "grad_norm": 0.002492862520739436, "learning_rate": 4.665173529654959e-07, "loss": 0.0, "num_input_tokens_seen": 98099280, "step": 145560 }, { "epoch": 3.5561771675665113, "grad_norm": 0.008258125744760036, "learning_rate": 4.6644522617282203e-07, "loss": 0.0727, "num_input_tokens_seen": 98102864, "step": 145565 }, { "epoch": 3.5562993183983584, "grad_norm": 0.027965500950813293, "learning_rate": 4.6637310326030243e-07, "loss": 0.0569, "num_input_tokens_seen": 98106064, "step": 145570 }, { "epoch": 3.556421469230205, "grad_norm": 0.006361914332956076, "learning_rate": 4.663009842284608e-07, "loss": 0.0, "num_input_tokens_seen": 98109456, "step": 145575 }, { "epoch": 3.556543620062053, "grad_norm": 0.002534546423703432, "learning_rate": 4.662288690778222e-07, "loss": 0.0001, "num_input_tokens_seen": 98112784, "step": 145580 }, { "epoch": 3.5566657708938996, "grad_norm": 0.0019274181686341763, "learning_rate": 4.661567578089105e-07, "loss": 0.0, "num_input_tokens_seen": 98116112, "step": 145585 }, { "epoch": 3.556787921725747, "grad_norm": 0.0018334973137825727, "learning_rate": 4.660846504222509e-07, "loss": 0.0003, "num_input_tokens_seen": 98119312, "step": 145590 }, { "epoch": 3.556910072557594, "grad_norm": 0.0011634392431005836, "learning_rate": 4.660125469183669e-07, "loss": 0.0, "num_input_tokens_seen": 98122448, "step": 145595 }, { "epoch": 3.557032223389441, "grad_norm": 0.0033267210237681866, "learning_rate": 4.6594044729778336e-07, "loss": 0.0259, "num_input_tokens_seen": 98125904, "step": 145600 }, { "epoch": 3.5571543742212883, "grad_norm": 0.004412208218127489, "learning_rate": 4.658683515610248e-07, "loss": 0.0, "num_input_tokens_seen": 98129232, "step": 145605 }, { "epoch": 3.5572765250531355, "grad_norm": 0.11257799714803696, "learning_rate": 4.6579625970861494e-07, "loss": 0.0001, "num_input_tokens_seen": 98132624, "step": 145610 }, { "epoch": 3.5573986758849827, "grad_norm": 0.006835788954049349, "learning_rate": 4.657241717410787e-07, "loss": 0.0404, "num_input_tokens_seen": 98135888, "step": 145615 }, { "epoch": 3.55752082671683, "grad_norm": 0.0053612408228218555, "learning_rate": 4.656520876589397e-07, "loss": 0.0, "num_input_tokens_seen": 98139088, "step": 145620 }, { "epoch": 3.557642977548677, "grad_norm": 0.013923496007919312, "learning_rate": 4.6558000746272276e-07, "loss": 0.0001, "num_input_tokens_seen": 98142544, "step": 145625 }, { "epoch": 3.5577651283805243, "grad_norm": 0.004267881624400616, "learning_rate": 4.655079311529513e-07, "loss": 0.0307, "num_input_tokens_seen": 98145808, "step": 145630 }, { "epoch": 3.5578872792123715, "grad_norm": 0.006597915198653936, "learning_rate": 4.6543585873015e-07, "loss": 0.0004, "num_input_tokens_seen": 98149072, "step": 145635 }, { "epoch": 3.5580094300442187, "grad_norm": 0.01169089786708355, "learning_rate": 4.6536379019484327e-07, "loss": 0.0, "num_input_tokens_seen": 98152528, "step": 145640 }, { "epoch": 3.558131580876066, "grad_norm": 0.0013577293138951063, "learning_rate": 4.6529172554755446e-07, "loss": 0.0048, "num_input_tokens_seen": 98155728, "step": 145645 }, { "epoch": 3.558253731707913, "grad_norm": 0.05119822919368744, "learning_rate": 4.6521966478880846e-07, "loss": 0.0006, "num_input_tokens_seen": 98159696, "step": 145650 }, { "epoch": 3.5583758825397602, "grad_norm": 0.0024902746081352234, "learning_rate": 4.6514760791912853e-07, "loss": 0.0, "num_input_tokens_seen": 98163152, "step": 145655 }, { "epoch": 3.5584980333716074, "grad_norm": 0.00010411084076622501, "learning_rate": 4.6507555493903936e-07, "loss": 0.0, "num_input_tokens_seen": 98166992, "step": 145660 }, { "epoch": 3.5586201842034546, "grad_norm": 0.00025045243091881275, "learning_rate": 4.6500350584906435e-07, "loss": 0.0, "num_input_tokens_seen": 98170192, "step": 145665 }, { "epoch": 3.5587423350353014, "grad_norm": 0.0014511343324556947, "learning_rate": 4.649314606497281e-07, "loss": 0.0488, "num_input_tokens_seen": 98173264, "step": 145670 }, { "epoch": 3.558864485867149, "grad_norm": 0.0007764685433357954, "learning_rate": 4.6485941934155413e-07, "loss": 0.0, "num_input_tokens_seen": 98176464, "step": 145675 }, { "epoch": 3.5589866366989957, "grad_norm": 0.0007026352686807513, "learning_rate": 4.6478738192506607e-07, "loss": 0.049, "num_input_tokens_seen": 98179984, "step": 145680 }, { "epoch": 3.559108787530843, "grad_norm": 0.002035627607256174, "learning_rate": 4.647153484007884e-07, "loss": 0.0008, "num_input_tokens_seen": 98183376, "step": 145685 }, { "epoch": 3.55923093836269, "grad_norm": 27.889453887939453, "learning_rate": 4.6464331876924443e-07, "loss": 0.0477, "num_input_tokens_seen": 98187216, "step": 145690 }, { "epoch": 3.5593530891945373, "grad_norm": 0.02954605594277382, "learning_rate": 4.645712930309582e-07, "loss": 0.0001, "num_input_tokens_seen": 98190672, "step": 145695 }, { "epoch": 3.5594752400263845, "grad_norm": 0.0005955526721663773, "learning_rate": 4.6449927118645393e-07, "loss": 0.0, "num_input_tokens_seen": 98194384, "step": 145700 }, { "epoch": 3.5595973908582317, "grad_norm": 0.005972879007458687, "learning_rate": 4.6442725323625467e-07, "loss": 0.0, "num_input_tokens_seen": 98197904, "step": 145705 }, { "epoch": 3.559719541690079, "grad_norm": 0.0004364507331047207, "learning_rate": 4.6435523918088473e-07, "loss": 0.0, "num_input_tokens_seen": 98201808, "step": 145710 }, { "epoch": 3.559841692521926, "grad_norm": 0.014401191845536232, "learning_rate": 4.642832290208672e-07, "loss": 0.0, "num_input_tokens_seen": 98205328, "step": 145715 }, { "epoch": 3.5599638433537732, "grad_norm": 0.0015330385649576783, "learning_rate": 4.642112227567265e-07, "loss": 0.0, "num_input_tokens_seen": 98208656, "step": 145720 }, { "epoch": 3.5600859941856204, "grad_norm": 0.0011512466007843614, "learning_rate": 4.641392203889857e-07, "loss": 0.0, "num_input_tokens_seen": 98211792, "step": 145725 }, { "epoch": 3.5602081450174676, "grad_norm": 0.0002796232874970883, "learning_rate": 4.640672219181684e-07, "loss": 0.0, "num_input_tokens_seen": 98214800, "step": 145730 }, { "epoch": 3.560330295849315, "grad_norm": 0.011278784833848476, "learning_rate": 4.639952273447989e-07, "loss": 0.0001, "num_input_tokens_seen": 98218256, "step": 145735 }, { "epoch": 3.560452446681162, "grad_norm": 0.004169910680502653, "learning_rate": 4.639232366693998e-07, "loss": 0.0, "num_input_tokens_seen": 98221648, "step": 145740 }, { "epoch": 3.560574597513009, "grad_norm": 0.004272952675819397, "learning_rate": 4.638512498924956e-07, "loss": 0.0, "num_input_tokens_seen": 98224784, "step": 145745 }, { "epoch": 3.5606967483448564, "grad_norm": 8.410751615883783e-05, "learning_rate": 4.637792670146089e-07, "loss": 0.0, "num_input_tokens_seen": 98228368, "step": 145750 }, { "epoch": 3.560818899176703, "grad_norm": 0.004245266318321228, "learning_rate": 4.6370728803626357e-07, "loss": 0.0399, "num_input_tokens_seen": 98231568, "step": 145755 }, { "epoch": 3.5609410500085508, "grad_norm": 0.00774649903178215, "learning_rate": 4.6363531295798344e-07, "loss": 0.0479, "num_input_tokens_seen": 98234960, "step": 145760 }, { "epoch": 3.5610632008403975, "grad_norm": 0.010200269520282745, "learning_rate": 4.635633417802917e-07, "loss": 0.0001, "num_input_tokens_seen": 98238672, "step": 145765 }, { "epoch": 3.561185351672245, "grad_norm": 0.0071225594729185104, "learning_rate": 4.634913745037111e-07, "loss": 0.0, "num_input_tokens_seen": 98242384, "step": 145770 }, { "epoch": 3.561307502504092, "grad_norm": 0.007639218121767044, "learning_rate": 4.6341941112876593e-07, "loss": 0.0, "num_input_tokens_seen": 98245520, "step": 145775 }, { "epoch": 3.561429653335939, "grad_norm": 0.008603011257946491, "learning_rate": 4.633474516559792e-07, "loss": 0.0, "num_input_tokens_seen": 98248656, "step": 145780 }, { "epoch": 3.5615518041677863, "grad_norm": 0.010492679663002491, "learning_rate": 4.632754960858738e-07, "loss": 0.0001, "num_input_tokens_seen": 98251472, "step": 145785 }, { "epoch": 3.5616739549996335, "grad_norm": 0.00014071117038838565, "learning_rate": 4.6320354441897326e-07, "loss": 0.0001, "num_input_tokens_seen": 98254928, "step": 145790 }, { "epoch": 3.5617961058314807, "grad_norm": 0.003771584015339613, "learning_rate": 4.6313159665580124e-07, "loss": 0.0454, "num_input_tokens_seen": 98258256, "step": 145795 }, { "epoch": 3.561918256663328, "grad_norm": 0.007178634870797396, "learning_rate": 4.630596527968804e-07, "loss": 0.0, "num_input_tokens_seen": 98261264, "step": 145800 }, { "epoch": 3.562040407495175, "grad_norm": 0.006300019100308418, "learning_rate": 4.629877128427345e-07, "loss": 0.0, "num_input_tokens_seen": 98264784, "step": 145805 }, { "epoch": 3.562162558327022, "grad_norm": 0.00243266299366951, "learning_rate": 4.6291577679388607e-07, "loss": 0.0, "num_input_tokens_seen": 98267920, "step": 145810 }, { "epoch": 3.5622847091588694, "grad_norm": 0.0011975874658674002, "learning_rate": 4.628438446508589e-07, "loss": 0.0, "num_input_tokens_seen": 98271440, "step": 145815 }, { "epoch": 3.5624068599907166, "grad_norm": 0.0013561134692281485, "learning_rate": 4.6277191641417547e-07, "loss": 0.0, "num_input_tokens_seen": 98275152, "step": 145820 }, { "epoch": 3.562529010822564, "grad_norm": 0.006943685468286276, "learning_rate": 4.6269999208435903e-07, "loss": 0.0, "num_input_tokens_seen": 98278480, "step": 145825 }, { "epoch": 3.562651161654411, "grad_norm": 0.0019534756429493427, "learning_rate": 4.6262807166193316e-07, "loss": 0.0, "num_input_tokens_seen": 98282256, "step": 145830 }, { "epoch": 3.562773312486258, "grad_norm": 0.0007993488106876612, "learning_rate": 4.6255615514742016e-07, "loss": 0.0, "num_input_tokens_seen": 98285392, "step": 145835 }, { "epoch": 3.5628954633181054, "grad_norm": 0.008926840499043465, "learning_rate": 4.6248424254134376e-07, "loss": 0.0, "num_input_tokens_seen": 98288528, "step": 145840 }, { "epoch": 3.5630176141499525, "grad_norm": 0.005140448454767466, "learning_rate": 4.6241233384422616e-07, "loss": 0.0, "num_input_tokens_seen": 98291792, "step": 145845 }, { "epoch": 3.5631397649817993, "grad_norm": 0.017702613025903702, "learning_rate": 4.6234042905659066e-07, "loss": 0.0, "num_input_tokens_seen": 98295696, "step": 145850 }, { "epoch": 3.563261915813647, "grad_norm": 0.0010711681097745895, "learning_rate": 4.6226852817896046e-07, "loss": 0.0, "num_input_tokens_seen": 98299216, "step": 145855 }, { "epoch": 3.5633840666454937, "grad_norm": 0.0027515243273228407, "learning_rate": 4.621966312118578e-07, "loss": 0.0, "num_input_tokens_seen": 98302800, "step": 145860 }, { "epoch": 3.563506217477341, "grad_norm": 0.0022552255541086197, "learning_rate": 4.621247381558063e-07, "loss": 0.0002, "num_input_tokens_seen": 98306192, "step": 145865 }, { "epoch": 3.563628368309188, "grad_norm": 0.0019252895144745708, "learning_rate": 4.620528490113284e-07, "loss": 0.0, "num_input_tokens_seen": 98309136, "step": 145870 }, { "epoch": 3.5637505191410352, "grad_norm": 0.009776824153959751, "learning_rate": 4.6198096377894644e-07, "loss": 0.0, "num_input_tokens_seen": 98312144, "step": 145875 }, { "epoch": 3.5638726699728824, "grad_norm": 0.009079672396183014, "learning_rate": 4.61909082459184e-07, "loss": 0.0, "num_input_tokens_seen": 98315856, "step": 145880 }, { "epoch": 3.5639948208047296, "grad_norm": 0.0010798972798511386, "learning_rate": 4.618372050525631e-07, "loss": 0.0682, "num_input_tokens_seen": 98319440, "step": 145885 }, { "epoch": 3.564116971636577, "grad_norm": 0.020890379324555397, "learning_rate": 4.617653315596072e-07, "loss": 0.0, "num_input_tokens_seen": 98322512, "step": 145890 }, { "epoch": 3.564239122468424, "grad_norm": 0.11921865493059158, "learning_rate": 4.616934619808381e-07, "loss": 0.0, "num_input_tokens_seen": 98325776, "step": 145895 }, { "epoch": 3.564361273300271, "grad_norm": 0.0025778845883905888, "learning_rate": 4.6162159631677946e-07, "loss": 0.0, "num_input_tokens_seen": 98329040, "step": 145900 }, { "epoch": 3.5644834241321184, "grad_norm": 0.00012149583199061453, "learning_rate": 4.615497345679529e-07, "loss": 0.0, "num_input_tokens_seen": 98332496, "step": 145905 }, { "epoch": 3.5646055749639656, "grad_norm": 33.14250183105469, "learning_rate": 4.614778767348815e-07, "loss": 0.0835, "num_input_tokens_seen": 98335440, "step": 145910 }, { "epoch": 3.5647277257958128, "grad_norm": 0.00033949079806916416, "learning_rate": 4.6140602281808816e-07, "loss": 0.0, "num_input_tokens_seen": 98338768, "step": 145915 }, { "epoch": 3.56484987662766, "grad_norm": 0.0017763259820640087, "learning_rate": 4.613341728180947e-07, "loss": 0.0225, "num_input_tokens_seen": 98342352, "step": 145920 }, { "epoch": 3.564972027459507, "grad_norm": 0.007347355131059885, "learning_rate": 4.6126232673542456e-07, "loss": 0.0001, "num_input_tokens_seen": 98346192, "step": 145925 }, { "epoch": 3.5650941782913543, "grad_norm": 0.09457369893789291, "learning_rate": 4.6119048457059916e-07, "loss": 0.0, "num_input_tokens_seen": 98349456, "step": 145930 }, { "epoch": 3.565216329123201, "grad_norm": 0.021234316751360893, "learning_rate": 4.611186463241419e-07, "loss": 0.0001, "num_input_tokens_seen": 98352912, "step": 145935 }, { "epoch": 3.5653384799550487, "grad_norm": 0.008176828734576702, "learning_rate": 4.610468119965744e-07, "loss": 0.0, "num_input_tokens_seen": 98355856, "step": 145940 }, { "epoch": 3.5654606307868955, "grad_norm": 0.0014455120544880629, "learning_rate": 4.609749815884194e-07, "loss": 0.0, "num_input_tokens_seen": 98359376, "step": 145945 }, { "epoch": 3.565582781618743, "grad_norm": 0.020845487713813782, "learning_rate": 4.609031551001997e-07, "loss": 0.0002, "num_input_tokens_seen": 98362448, "step": 145950 }, { "epoch": 3.56570493245059, "grad_norm": 16.44266700744629, "learning_rate": 4.608313325324369e-07, "loss": 0.0235, "num_input_tokens_seen": 98365968, "step": 145955 }, { "epoch": 3.565827083282437, "grad_norm": 0.0004650065384339541, "learning_rate": 4.6075951388565414e-07, "loss": 0.0, "num_input_tokens_seen": 98369104, "step": 145960 }, { "epoch": 3.565949234114284, "grad_norm": 0.003706740215420723, "learning_rate": 4.6068769916037277e-07, "loss": 0.0001, "num_input_tokens_seen": 98372560, "step": 145965 }, { "epoch": 3.5660713849461314, "grad_norm": 0.0027632254641503096, "learning_rate": 4.6061588835711583e-07, "loss": 0.0, "num_input_tokens_seen": 98375824, "step": 145970 }, { "epoch": 3.5661935357779786, "grad_norm": 0.012814885936677456, "learning_rate": 4.605440814764049e-07, "loss": 0.0, "num_input_tokens_seen": 98379280, "step": 145975 }, { "epoch": 3.566315686609826, "grad_norm": 0.040037527680397034, "learning_rate": 4.604722785187629e-07, "loss": 0.0002, "num_input_tokens_seen": 98382608, "step": 145980 }, { "epoch": 3.566437837441673, "grad_norm": 0.0004106028936803341, "learning_rate": 4.604004794847116e-07, "loss": 0.0, "num_input_tokens_seen": 98385872, "step": 145985 }, { "epoch": 3.56655998827352, "grad_norm": 0.0010544214164838195, "learning_rate": 4.603286843747728e-07, "loss": 0.0, "num_input_tokens_seen": 98389584, "step": 145990 }, { "epoch": 3.5666821391053674, "grad_norm": 0.0008923725690692663, "learning_rate": 4.602568931894694e-07, "loss": 0.0, "num_input_tokens_seen": 98393296, "step": 145995 }, { "epoch": 3.5668042899372145, "grad_norm": 0.0021879971027374268, "learning_rate": 4.601851059293225e-07, "loss": 0.0, "num_input_tokens_seen": 98396496, "step": 146000 }, { "epoch": 3.5669264407690617, "grad_norm": 0.041268907487392426, "learning_rate": 4.601133225948548e-07, "loss": 0.0001, "num_input_tokens_seen": 98399632, "step": 146005 }, { "epoch": 3.567048591600909, "grad_norm": 0.03337256982922554, "learning_rate": 4.600415431865886e-07, "loss": 0.0003, "num_input_tokens_seen": 98403024, "step": 146010 }, { "epoch": 3.567170742432756, "grad_norm": 0.00034684070851653814, "learning_rate": 4.5996976770504514e-07, "loss": 0.0005, "num_input_tokens_seen": 98406352, "step": 146015 }, { "epoch": 3.567292893264603, "grad_norm": 0.0028094665613025427, "learning_rate": 4.598979961507471e-07, "loss": 0.0, "num_input_tokens_seen": 98409488, "step": 146020 }, { "epoch": 3.5674150440964505, "grad_norm": 0.058483824133872986, "learning_rate": 4.598262285242158e-07, "loss": 0.0, "num_input_tokens_seen": 98412752, "step": 146025 }, { "epoch": 3.5675371949282972, "grad_norm": 0.013209797441959381, "learning_rate": 4.597544648259738e-07, "loss": 0.0546, "num_input_tokens_seen": 98416336, "step": 146030 }, { "epoch": 3.567659345760145, "grad_norm": 0.006027190946042538, "learning_rate": 4.5968270505654227e-07, "loss": 0.062, "num_input_tokens_seen": 98419664, "step": 146035 }, { "epoch": 3.5677814965919916, "grad_norm": 0.05212077498435974, "learning_rate": 4.596109492164435e-07, "loss": 0.0, "num_input_tokens_seen": 98422800, "step": 146040 }, { "epoch": 3.567903647423839, "grad_norm": 0.013847493566572666, "learning_rate": 4.595391973061995e-07, "loss": 0.0, "num_input_tokens_seen": 98425872, "step": 146045 }, { "epoch": 3.568025798255686, "grad_norm": 0.0033159477170556784, "learning_rate": 4.5946744932633155e-07, "loss": 0.0, "num_input_tokens_seen": 98429392, "step": 146050 }, { "epoch": 3.568147949087533, "grad_norm": 0.03926025703549385, "learning_rate": 4.5939570527736203e-07, "loss": 0.0001, "num_input_tokens_seen": 98433040, "step": 146055 }, { "epoch": 3.5682700999193804, "grad_norm": 0.002551204524934292, "learning_rate": 4.59323965159812e-07, "loss": 0.0563, "num_input_tokens_seen": 98436176, "step": 146060 }, { "epoch": 3.5683922507512276, "grad_norm": 0.10834618657827377, "learning_rate": 4.592522289742036e-07, "loss": 0.0308, "num_input_tokens_seen": 98439568, "step": 146065 }, { "epoch": 3.5685144015830748, "grad_norm": 0.0003172959550283849, "learning_rate": 4.591804967210586e-07, "loss": 0.0, "num_input_tokens_seen": 98442640, "step": 146070 }, { "epoch": 3.568636552414922, "grad_norm": 11.231805801391602, "learning_rate": 4.5910876840089865e-07, "loss": 0.0002, "num_input_tokens_seen": 98446096, "step": 146075 }, { "epoch": 3.568758703246769, "grad_norm": 0.0025172054301947355, "learning_rate": 4.590370440142448e-07, "loss": 0.0, "num_input_tokens_seen": 98449168, "step": 146080 }, { "epoch": 3.5688808540786163, "grad_norm": 0.001789450878277421, "learning_rate": 4.5896532356161944e-07, "loss": 0.0001, "num_input_tokens_seen": 98452688, "step": 146085 }, { "epoch": 3.5690030049104635, "grad_norm": 0.002166353864595294, "learning_rate": 4.5889360704354375e-07, "loss": 0.0, "num_input_tokens_seen": 98455952, "step": 146090 }, { "epoch": 3.5691251557423107, "grad_norm": 0.0020952769555151463, "learning_rate": 4.58821894460539e-07, "loss": 0.0, "num_input_tokens_seen": 98459408, "step": 146095 }, { "epoch": 3.569247306574158, "grad_norm": 0.12789571285247803, "learning_rate": 4.5875018581312684e-07, "loss": 0.0504, "num_input_tokens_seen": 98462736, "step": 146100 }, { "epoch": 3.569369457406005, "grad_norm": 0.06485660374164581, "learning_rate": 4.5867848110182937e-07, "loss": 0.0001, "num_input_tokens_seen": 98466192, "step": 146105 }, { "epoch": 3.5694916082378523, "grad_norm": 0.02141660824418068, "learning_rate": 4.5860678032716724e-07, "loss": 0.0361, "num_input_tokens_seen": 98469520, "step": 146110 }, { "epoch": 3.569613759069699, "grad_norm": 0.0036332227755337954, "learning_rate": 4.5853508348966253e-07, "loss": 0.0001, "num_input_tokens_seen": 98472720, "step": 146115 }, { "epoch": 3.5697359099015467, "grad_norm": 0.0006790324696339667, "learning_rate": 4.5846339058983595e-07, "loss": 0.0, "num_input_tokens_seen": 98476304, "step": 146120 }, { "epoch": 3.5698580607333934, "grad_norm": 0.007151363417506218, "learning_rate": 4.583917016282097e-07, "loss": 0.0001, "num_input_tokens_seen": 98479312, "step": 146125 }, { "epoch": 3.569980211565241, "grad_norm": 0.05629653483629227, "learning_rate": 4.583200166053043e-07, "loss": 0.0, "num_input_tokens_seen": 98482640, "step": 146130 }, { "epoch": 3.570102362397088, "grad_norm": 0.0005009549204260111, "learning_rate": 4.5824833552164134e-07, "loss": 0.0625, "num_input_tokens_seen": 98485904, "step": 146135 }, { "epoch": 3.570224513228935, "grad_norm": 0.011462909169495106, "learning_rate": 4.5817665837774265e-07, "loss": 0.0, "num_input_tokens_seen": 98489424, "step": 146140 }, { "epoch": 3.570346664060782, "grad_norm": 0.011264095082879066, "learning_rate": 4.581049851741287e-07, "loss": 0.0, "num_input_tokens_seen": 98492752, "step": 146145 }, { "epoch": 3.5704688148926294, "grad_norm": 0.004408912267535925, "learning_rate": 4.580333159113213e-07, "loss": 0.0, "num_input_tokens_seen": 98496336, "step": 146150 }, { "epoch": 3.5705909657244765, "grad_norm": 0.001395556260831654, "learning_rate": 4.5796165058984104e-07, "loss": 0.0001, "num_input_tokens_seen": 98499536, "step": 146155 }, { "epoch": 3.5707131165563237, "grad_norm": 0.010997572913765907, "learning_rate": 4.578899892102095e-07, "loss": 0.0, "num_input_tokens_seen": 98502544, "step": 146160 }, { "epoch": 3.570835267388171, "grad_norm": 0.0001377869484713301, "learning_rate": 4.5781833177294815e-07, "loss": 0.0, "num_input_tokens_seen": 98505808, "step": 146165 }, { "epoch": 3.570957418220018, "grad_norm": 0.0016566417180001736, "learning_rate": 4.577466782785774e-07, "loss": 0.0002, "num_input_tokens_seen": 98509392, "step": 146170 }, { "epoch": 3.5710795690518653, "grad_norm": 0.0005025031859986484, "learning_rate": 4.5767502872761885e-07, "loss": 0.0009, "num_input_tokens_seen": 98512656, "step": 146175 }, { "epoch": 3.5712017198837125, "grad_norm": 0.001447174116037786, "learning_rate": 4.576033831205935e-07, "loss": 0.0, "num_input_tokens_seen": 98515728, "step": 146180 }, { "epoch": 3.5713238707155597, "grad_norm": 0.0032353862188756466, "learning_rate": 4.5753174145802185e-07, "loss": 0.0, "num_input_tokens_seen": 98518800, "step": 146185 }, { "epoch": 3.571446021547407, "grad_norm": 0.0006234863540157676, "learning_rate": 4.5746010374042567e-07, "loss": 0.0, "num_input_tokens_seen": 98521936, "step": 146190 }, { "epoch": 3.571568172379254, "grad_norm": 0.003277813782915473, "learning_rate": 4.5738846996832505e-07, "loss": 0.0306, "num_input_tokens_seen": 98525072, "step": 146195 }, { "epoch": 3.571690323211101, "grad_norm": 0.002012363402172923, "learning_rate": 4.573168401422419e-07, "loss": 0.041, "num_input_tokens_seen": 98528592, "step": 146200 }, { "epoch": 3.5718124740429484, "grad_norm": 0.000628719397354871, "learning_rate": 4.5724521426269626e-07, "loss": 0.0, "num_input_tokens_seen": 98532112, "step": 146205 }, { "epoch": 3.571934624874795, "grad_norm": 0.010826955549418926, "learning_rate": 4.571735923302098e-07, "loss": 0.0001, "num_input_tokens_seen": 98535568, "step": 146210 }, { "epoch": 3.572056775706643, "grad_norm": 0.000483804993564263, "learning_rate": 4.571019743453025e-07, "loss": 0.0566, "num_input_tokens_seen": 98539344, "step": 146215 }, { "epoch": 3.5721789265384896, "grad_norm": 0.0004217389796394855, "learning_rate": 4.5703036030849617e-07, "loss": 0.0, "num_input_tokens_seen": 98543056, "step": 146220 }, { "epoch": 3.5723010773703368, "grad_norm": 0.02112424373626709, "learning_rate": 4.5695875022031073e-07, "loss": 0.0, "num_input_tokens_seen": 98546640, "step": 146225 }, { "epoch": 3.572423228202184, "grad_norm": 0.2892974019050598, "learning_rate": 4.5688714408126717e-07, "loss": 0.0002, "num_input_tokens_seen": 98549712, "step": 146230 }, { "epoch": 3.572545379034031, "grad_norm": 0.008840722031891346, "learning_rate": 4.5681554189188684e-07, "loss": 0.0, "num_input_tokens_seen": 98553104, "step": 146235 }, { "epoch": 3.5726675298658783, "grad_norm": 0.023208221420645714, "learning_rate": 4.5674394365268966e-07, "loss": 0.0003, "num_input_tokens_seen": 98556496, "step": 146240 }, { "epoch": 3.5727896806977255, "grad_norm": 0.1287384033203125, "learning_rate": 4.56672349364197e-07, "loss": 0.0007, "num_input_tokens_seen": 98560208, "step": 146245 }, { "epoch": 3.5729118315295727, "grad_norm": 0.0059753842651844025, "learning_rate": 4.5660075902692877e-07, "loss": 0.0001, "num_input_tokens_seen": 98563984, "step": 146250 }, { "epoch": 3.57303398236142, "grad_norm": 0.00015051568334456533, "learning_rate": 4.565291726414059e-07, "loss": 0.0001, "num_input_tokens_seen": 98567696, "step": 146255 }, { "epoch": 3.573156133193267, "grad_norm": 0.0029284560587257147, "learning_rate": 4.5645759020814955e-07, "loss": 0.0, "num_input_tokens_seen": 98570704, "step": 146260 }, { "epoch": 3.5732782840251143, "grad_norm": 0.1056944951415062, "learning_rate": 4.5638601172767934e-07, "loss": 0.0893, "num_input_tokens_seen": 98573968, "step": 146265 }, { "epoch": 3.5734004348569615, "grad_norm": 0.0052173323929309845, "learning_rate": 4.5631443720051667e-07, "loss": 0.0, "num_input_tokens_seen": 98577040, "step": 146270 }, { "epoch": 3.5735225856888087, "grad_norm": 0.0021510545630007982, "learning_rate": 4.5624286662718124e-07, "loss": 0.0225, "num_input_tokens_seen": 98580176, "step": 146275 }, { "epoch": 3.573644736520656, "grad_norm": 0.0036637140437960625, "learning_rate": 4.5617130000819435e-07, "loss": 0.0, "num_input_tokens_seen": 98583184, "step": 146280 }, { "epoch": 3.573766887352503, "grad_norm": 0.000627799890935421, "learning_rate": 4.560997373440757e-07, "loss": 0.0, "num_input_tokens_seen": 98586640, "step": 146285 }, { "epoch": 3.57388903818435, "grad_norm": 0.012774244882166386, "learning_rate": 4.560281786353464e-07, "loss": 0.0, "num_input_tokens_seen": 98589776, "step": 146290 }, { "epoch": 3.574011189016197, "grad_norm": 0.0006221451330929995, "learning_rate": 4.5595662388252643e-07, "loss": 0.0001, "num_input_tokens_seen": 98592912, "step": 146295 }, { "epoch": 3.5741333398480446, "grad_norm": 0.00047458289191126823, "learning_rate": 4.55885073086136e-07, "loss": 0.0, "num_input_tokens_seen": 98596176, "step": 146300 }, { "epoch": 3.5742554906798913, "grad_norm": 0.0028228475712239742, "learning_rate": 4.558135262466959e-07, "loss": 0.0058, "num_input_tokens_seen": 98599312, "step": 146305 }, { "epoch": 3.5743776415117385, "grad_norm": 0.0960443764925003, "learning_rate": 4.557419833647258e-07, "loss": 0.0, "num_input_tokens_seen": 98602512, "step": 146310 }, { "epoch": 3.5744997923435857, "grad_norm": 0.013317099772393703, "learning_rate": 4.556704444407465e-07, "loss": 0.0001, "num_input_tokens_seen": 98605904, "step": 146315 }, { "epoch": 3.574621943175433, "grad_norm": 0.0006582035566680133, "learning_rate": 4.5559890947527843e-07, "loss": 0.0003, "num_input_tokens_seen": 98609680, "step": 146320 }, { "epoch": 3.57474409400728, "grad_norm": 0.13117016851902008, "learning_rate": 4.555273784688413e-07, "loss": 0.0657, "num_input_tokens_seen": 98613072, "step": 146325 }, { "epoch": 3.5748662448391273, "grad_norm": 0.043668705970048904, "learning_rate": 4.554558514219557e-07, "loss": 0.0, "num_input_tokens_seen": 98616656, "step": 146330 }, { "epoch": 3.5749883956709745, "grad_norm": 0.017411423847079277, "learning_rate": 4.553843283351413e-07, "loss": 0.1131, "num_input_tokens_seen": 98619920, "step": 146335 }, { "epoch": 3.5751105465028217, "grad_norm": 0.001685897121205926, "learning_rate": 4.553128092089189e-07, "loss": 0.0, "num_input_tokens_seen": 98622864, "step": 146340 }, { "epoch": 3.575232697334669, "grad_norm": 0.0014981109416112304, "learning_rate": 4.5524129404380794e-07, "loss": 0.0001, "num_input_tokens_seen": 98626448, "step": 146345 }, { "epoch": 3.575354848166516, "grad_norm": 0.0020773394498974085, "learning_rate": 4.551697828403288e-07, "loss": 0.0, "num_input_tokens_seen": 98629904, "step": 146350 }, { "epoch": 3.5754769989983632, "grad_norm": 0.001966852927580476, "learning_rate": 4.5509827559900194e-07, "loss": 0.0, "num_input_tokens_seen": 98633616, "step": 146355 }, { "epoch": 3.5755991498302104, "grad_norm": 0.00928665604442358, "learning_rate": 4.550267723203466e-07, "loss": 0.0, "num_input_tokens_seen": 98636816, "step": 146360 }, { "epoch": 3.5757213006620576, "grad_norm": 0.003639796283096075, "learning_rate": 4.5495527300488346e-07, "loss": 0.0001, "num_input_tokens_seen": 98640720, "step": 146365 }, { "epoch": 3.575843451493905, "grad_norm": 0.002453819615766406, "learning_rate": 4.548837776531318e-07, "loss": 0.0, "num_input_tokens_seen": 98644432, "step": 146370 }, { "epoch": 3.575965602325752, "grad_norm": 64.8224105834961, "learning_rate": 4.548122862656124e-07, "loss": 0.0023, "num_input_tokens_seen": 98647824, "step": 146375 }, { "epoch": 3.5760877531575987, "grad_norm": 0.0014054341008886695, "learning_rate": 4.547407988428442e-07, "loss": 0.0, "num_input_tokens_seen": 98651024, "step": 146380 }, { "epoch": 3.5762099039894464, "grad_norm": 0.005636067595332861, "learning_rate": 4.5466931538534804e-07, "loss": 0.0001, "num_input_tokens_seen": 98654608, "step": 146385 }, { "epoch": 3.576332054821293, "grad_norm": 0.014609677717089653, "learning_rate": 4.545978358936429e-07, "loss": 0.0, "num_input_tokens_seen": 98658320, "step": 146390 }, { "epoch": 3.5764542056531408, "grad_norm": 0.011487624607980251, "learning_rate": 4.5452636036824933e-07, "loss": 0.0, "num_input_tokens_seen": 98661712, "step": 146395 }, { "epoch": 3.5765763564849875, "grad_norm": 0.18441209197044373, "learning_rate": 4.5445488880968673e-07, "loss": 0.0001, "num_input_tokens_seen": 98664720, "step": 146400 }, { "epoch": 3.5766985073168347, "grad_norm": 0.029274463653564453, "learning_rate": 4.543834212184746e-07, "loss": 0.0003, "num_input_tokens_seen": 98667984, "step": 146405 }, { "epoch": 3.576820658148682, "grad_norm": 0.02028658241033554, "learning_rate": 4.543119575951331e-07, "loss": 0.0, "num_input_tokens_seen": 98671248, "step": 146410 }, { "epoch": 3.576942808980529, "grad_norm": 0.049594081938266754, "learning_rate": 4.5424049794018203e-07, "loss": 0.0001, "num_input_tokens_seen": 98674896, "step": 146415 }, { "epoch": 3.5770649598123763, "grad_norm": 0.052891720086336136, "learning_rate": 4.5416904225414055e-07, "loss": 0.0, "num_input_tokens_seen": 98678288, "step": 146420 }, { "epoch": 3.5771871106442235, "grad_norm": 0.04260937124490738, "learning_rate": 4.540975905375289e-07, "loss": 0.0, "num_input_tokens_seen": 98682064, "step": 146425 }, { "epoch": 3.5773092614760706, "grad_norm": 0.00021985071361996233, "learning_rate": 4.5402614279086617e-07, "loss": 0.0019, "num_input_tokens_seen": 98685648, "step": 146430 }, { "epoch": 3.577431412307918, "grad_norm": 0.000840392371173948, "learning_rate": 4.539546990146724e-07, "loss": 0.0334, "num_input_tokens_seen": 98689488, "step": 146435 }, { "epoch": 3.577553563139765, "grad_norm": 0.00027688511181622744, "learning_rate": 4.538832592094666e-07, "loss": 0.0, "num_input_tokens_seen": 98693136, "step": 146440 }, { "epoch": 3.577675713971612, "grad_norm": 0.012098210863769054, "learning_rate": 4.538118233757686e-07, "loss": 0.0, "num_input_tokens_seen": 98696720, "step": 146445 }, { "epoch": 3.5777978648034594, "grad_norm": 0.04021194577217102, "learning_rate": 4.5374039151409836e-07, "loss": 0.0, "num_input_tokens_seen": 98700432, "step": 146450 }, { "epoch": 3.5779200156353066, "grad_norm": 0.008958011865615845, "learning_rate": 4.5366896362497464e-07, "loss": 0.0001, "num_input_tokens_seen": 98703632, "step": 146455 }, { "epoch": 3.578042166467154, "grad_norm": 0.003126812633126974, "learning_rate": 4.5359753970891735e-07, "loss": 0.0, "num_input_tokens_seen": 98706832, "step": 146460 }, { "epoch": 3.5781643172990005, "grad_norm": 0.0010996502824127674, "learning_rate": 4.535261197664455e-07, "loss": 0.111, "num_input_tokens_seen": 98710160, "step": 146465 }, { "epoch": 3.578286468130848, "grad_norm": 0.00027585600037127733, "learning_rate": 4.534547037980786e-07, "loss": 0.0001, "num_input_tokens_seen": 98713552, "step": 146470 }, { "epoch": 3.578408618962695, "grad_norm": 0.001792923198081553, "learning_rate": 4.533832918043364e-07, "loss": 0.0, "num_input_tokens_seen": 98716880, "step": 146475 }, { "epoch": 3.5785307697945425, "grad_norm": 0.0010359457228332758, "learning_rate": 4.533118837857377e-07, "loss": 0.0, "num_input_tokens_seen": 98720464, "step": 146480 }, { "epoch": 3.5786529206263893, "grad_norm": 0.09417528659105301, "learning_rate": 4.532404797428023e-07, "loss": 0.0001, "num_input_tokens_seen": 98723472, "step": 146485 }, { "epoch": 3.5787750714582365, "grad_norm": 0.006773589178919792, "learning_rate": 4.531690796760492e-07, "loss": 0.0, "num_input_tokens_seen": 98727248, "step": 146490 }, { "epoch": 3.5788972222900837, "grad_norm": 0.000443848519353196, "learning_rate": 4.530976835859973e-07, "loss": 0.0001, "num_input_tokens_seen": 98730832, "step": 146495 }, { "epoch": 3.579019373121931, "grad_norm": 0.0023871995508670807, "learning_rate": 4.530262914731665e-07, "loss": 0.0268, "num_input_tokens_seen": 98734160, "step": 146500 }, { "epoch": 3.579141523953778, "grad_norm": 0.0007982885581441224, "learning_rate": 4.529549033380753e-07, "loss": 0.0001, "num_input_tokens_seen": 98737552, "step": 146505 }, { "epoch": 3.5792636747856252, "grad_norm": 0.00018090193043462932, "learning_rate": 4.528835191812435e-07, "loss": 0.0002, "num_input_tokens_seen": 98740560, "step": 146510 }, { "epoch": 3.5793858256174724, "grad_norm": 0.0023492518812417984, "learning_rate": 4.5281213900318947e-07, "loss": 0.0001, "num_input_tokens_seen": 98743568, "step": 146515 }, { "epoch": 3.5795079764493196, "grad_norm": 0.003083728486672044, "learning_rate": 4.527407628044332e-07, "loss": 0.0, "num_input_tokens_seen": 98746832, "step": 146520 }, { "epoch": 3.579630127281167, "grad_norm": 0.002531050704419613, "learning_rate": 4.526693905854929e-07, "loss": 0.0, "num_input_tokens_seen": 98750544, "step": 146525 }, { "epoch": 3.579752278113014, "grad_norm": 0.00050026283133775, "learning_rate": 4.5259802234688836e-07, "loss": 0.0001, "num_input_tokens_seen": 98753808, "step": 146530 }, { "epoch": 3.579874428944861, "grad_norm": 0.03973764181137085, "learning_rate": 4.525266580891379e-07, "loss": 0.0002, "num_input_tokens_seen": 98757008, "step": 146535 }, { "epoch": 3.5799965797767084, "grad_norm": 0.00027027149917557836, "learning_rate": 4.5245529781276083e-07, "loss": 0.0, "num_input_tokens_seen": 98760272, "step": 146540 }, { "epoch": 3.5801187306085556, "grad_norm": 0.0016394499689340591, "learning_rate": 4.523839415182765e-07, "loss": 0.0001, "num_input_tokens_seen": 98763600, "step": 146545 }, { "epoch": 3.5802408814404028, "grad_norm": 0.0012777202064171433, "learning_rate": 4.5231258920620305e-07, "loss": 0.0, "num_input_tokens_seen": 98766992, "step": 146550 }, { "epoch": 3.58036303227225, "grad_norm": 0.0009179083281196654, "learning_rate": 4.522412408770602e-07, "loss": 0.0, "num_input_tokens_seen": 98770256, "step": 146555 }, { "epoch": 3.5804851831040967, "grad_norm": 0.005039707757532597, "learning_rate": 4.5216989653136584e-07, "loss": 0.0, "num_input_tokens_seen": 98773712, "step": 146560 }, { "epoch": 3.5806073339359443, "grad_norm": 0.001076148939318955, "learning_rate": 4.5209855616963945e-07, "loss": 0.0, "num_input_tokens_seen": 98777104, "step": 146565 }, { "epoch": 3.580729484767791, "grad_norm": 0.000977373798377812, "learning_rate": 4.520272197924001e-07, "loss": 0.0631, "num_input_tokens_seen": 98780432, "step": 146570 }, { "epoch": 3.5808516355996387, "grad_norm": 0.0024073768872767687, "learning_rate": 4.519558874001658e-07, "loss": 0.0, "num_input_tokens_seen": 98784080, "step": 146575 }, { "epoch": 3.5809737864314855, "grad_norm": 0.06418831646442413, "learning_rate": 4.51884558993456e-07, "loss": 0.0001, "num_input_tokens_seen": 98786832, "step": 146580 }, { "epoch": 3.5810959372633326, "grad_norm": 0.002536097541451454, "learning_rate": 4.518132345727889e-07, "loss": 0.0366, "num_input_tokens_seen": 98790096, "step": 146585 }, { "epoch": 3.58121808809518, "grad_norm": 0.0007362982723861933, "learning_rate": 4.5174191413868354e-07, "loss": 0.0001, "num_input_tokens_seen": 98793680, "step": 146590 }, { "epoch": 3.581340238927027, "grad_norm": 0.0036098656710237265, "learning_rate": 4.5167059769165827e-07, "loss": 0.0, "num_input_tokens_seen": 98796816, "step": 146595 }, { "epoch": 3.581462389758874, "grad_norm": 0.0015288054710254073, "learning_rate": 4.5159928523223224e-07, "loss": 0.0, "num_input_tokens_seen": 98800400, "step": 146600 }, { "epoch": 3.5815845405907214, "grad_norm": 0.001292704837396741, "learning_rate": 4.5152797676092367e-07, "loss": 0.0, "num_input_tokens_seen": 98803600, "step": 146605 }, { "epoch": 3.5817066914225686, "grad_norm": 0.0005294968141242862, "learning_rate": 4.514566722782508e-07, "loss": 0.0, "num_input_tokens_seen": 98806992, "step": 146610 }, { "epoch": 3.581828842254416, "grad_norm": 0.04279835522174835, "learning_rate": 4.513853717847329e-07, "loss": 0.0, "num_input_tokens_seen": 98810384, "step": 146615 }, { "epoch": 3.581950993086263, "grad_norm": 0.0010195255745202303, "learning_rate": 4.513140752808878e-07, "loss": 0.0, "num_input_tokens_seen": 98813136, "step": 146620 }, { "epoch": 3.58207314391811, "grad_norm": 0.011961672455072403, "learning_rate": 4.512427827672344e-07, "loss": 0.0, "num_input_tokens_seen": 98816976, "step": 146625 }, { "epoch": 3.5821952947499573, "grad_norm": 0.0007193966303020716, "learning_rate": 4.5117149424429135e-07, "loss": 0.0001, "num_input_tokens_seen": 98819856, "step": 146630 }, { "epoch": 3.5823174455818045, "grad_norm": 0.0016777556156739593, "learning_rate": 4.5110020971257645e-07, "loss": 0.0, "num_input_tokens_seen": 98823696, "step": 146635 }, { "epoch": 3.5824395964136517, "grad_norm": 0.0003486250643618405, "learning_rate": 4.510289291726088e-07, "loss": 0.0001, "num_input_tokens_seen": 98827280, "step": 146640 }, { "epoch": 3.5825617472454985, "grad_norm": 0.00026854375028051436, "learning_rate": 4.5095765262490614e-07, "loss": 0.0, "num_input_tokens_seen": 98830736, "step": 146645 }, { "epoch": 3.582683898077346, "grad_norm": 0.00011141942377435043, "learning_rate": 4.5088638006998745e-07, "loss": 0.0, "num_input_tokens_seen": 98833872, "step": 146650 }, { "epoch": 3.582806048909193, "grad_norm": 0.013646208681166172, "learning_rate": 4.508151115083703e-07, "loss": 0.0002, "num_input_tokens_seen": 98837200, "step": 146655 }, { "epoch": 3.5829281997410405, "grad_norm": 0.005915411747992039, "learning_rate": 4.5074384694057334e-07, "loss": 0.0, "num_input_tokens_seen": 98840656, "step": 146660 }, { "epoch": 3.5830503505728872, "grad_norm": 4.565409471979365e-05, "learning_rate": 4.5067258636711536e-07, "loss": 0.0, "num_input_tokens_seen": 98843984, "step": 146665 }, { "epoch": 3.5831725014047344, "grad_norm": 0.003964328207075596, "learning_rate": 4.5060132978851364e-07, "loss": 0.0313, "num_input_tokens_seen": 98847248, "step": 146670 }, { "epoch": 3.5832946522365816, "grad_norm": 0.0006252607563510537, "learning_rate": 4.5053007720528713e-07, "loss": 0.0001, "num_input_tokens_seen": 98850384, "step": 146675 }, { "epoch": 3.583416803068429, "grad_norm": 0.016371555626392365, "learning_rate": 4.5045882861795337e-07, "loss": 0.0, "num_input_tokens_seen": 98853776, "step": 146680 }, { "epoch": 3.583538953900276, "grad_norm": 0.0004095694748684764, "learning_rate": 4.503875840270311e-07, "loss": 0.0001, "num_input_tokens_seen": 98857040, "step": 146685 }, { "epoch": 3.583661104732123, "grad_norm": 0.01132937241345644, "learning_rate": 4.503163434330379e-07, "loss": 0.0, "num_input_tokens_seen": 98860944, "step": 146690 }, { "epoch": 3.5837832555639704, "grad_norm": 0.0022024072241038084, "learning_rate": 4.502451068364925e-07, "loss": 0.0, "num_input_tokens_seen": 98864208, "step": 146695 }, { "epoch": 3.5839054063958176, "grad_norm": 0.009176064282655716, "learning_rate": 4.501738742379121e-07, "loss": 0.0, "num_input_tokens_seen": 98867344, "step": 146700 }, { "epoch": 3.5840275572276648, "grad_norm": 0.07557198405265808, "learning_rate": 4.5010264563781554e-07, "loss": 0.0, "num_input_tokens_seen": 98870224, "step": 146705 }, { "epoch": 3.584149708059512, "grad_norm": 0.0012070549419149756, "learning_rate": 4.5003142103672045e-07, "loss": 0.0, "num_input_tokens_seen": 98873232, "step": 146710 }, { "epoch": 3.584271858891359, "grad_norm": 0.01205280888825655, "learning_rate": 4.499602004351445e-07, "loss": 0.0001, "num_input_tokens_seen": 98876816, "step": 146715 }, { "epoch": 3.5843940097232063, "grad_norm": 0.0006821723654866219, "learning_rate": 4.4988898383360576e-07, "loss": 0.0002, "num_input_tokens_seen": 98880208, "step": 146720 }, { "epoch": 3.5845161605550535, "grad_norm": 0.005529592279344797, "learning_rate": 4.498177712326228e-07, "loss": 0.0, "num_input_tokens_seen": 98883728, "step": 146725 }, { "epoch": 3.5846383113869007, "grad_norm": 0.0008852336904965341, "learning_rate": 4.4974656263271247e-07, "loss": 0.0, "num_input_tokens_seen": 98886864, "step": 146730 }, { "epoch": 3.584760462218748, "grad_norm": 0.0005162259330973029, "learning_rate": 4.496753580343937e-07, "loss": 0.0, "num_input_tokens_seen": 98890000, "step": 146735 }, { "epoch": 3.5848826130505946, "grad_norm": 0.0826941654086113, "learning_rate": 4.4960415743818327e-07, "loss": 0.0, "num_input_tokens_seen": 98893520, "step": 146740 }, { "epoch": 3.5850047638824423, "grad_norm": 0.0002892829361371696, "learning_rate": 4.495329608445998e-07, "loss": 0.0, "num_input_tokens_seen": 98896528, "step": 146745 }, { "epoch": 3.585126914714289, "grad_norm": 0.06882300227880478, "learning_rate": 4.494617682541604e-07, "loss": 0.062, "num_input_tokens_seen": 98899856, "step": 146750 }, { "epoch": 3.585249065546136, "grad_norm": 0.0031155727338045835, "learning_rate": 4.4939057966738304e-07, "loss": 0.0, "num_input_tokens_seen": 98903440, "step": 146755 }, { "epoch": 3.5853712163779834, "grad_norm": 0.00041419637273065746, "learning_rate": 4.4931939508478575e-07, "loss": 0.0, "num_input_tokens_seen": 98906512, "step": 146760 }, { "epoch": 3.5854933672098306, "grad_norm": 0.0037223040126264095, "learning_rate": 4.4924821450688575e-07, "loss": 0.0, "num_input_tokens_seen": 98909520, "step": 146765 }, { "epoch": 3.5856155180416778, "grad_norm": 1.3689956665039062, "learning_rate": 4.4917703793420116e-07, "loss": 0.0003, "num_input_tokens_seen": 98912400, "step": 146770 }, { "epoch": 3.585737668873525, "grad_norm": 0.0005561459111049771, "learning_rate": 4.4910586536724893e-07, "loss": 0.0, "num_input_tokens_seen": 98915920, "step": 146775 }, { "epoch": 3.585859819705372, "grad_norm": 7.715138781350106e-05, "learning_rate": 4.4903469680654703e-07, "loss": 0.0, "num_input_tokens_seen": 98919184, "step": 146780 }, { "epoch": 3.5859819705372193, "grad_norm": 0.002350148744881153, "learning_rate": 4.489635322526134e-07, "loss": 0.0, "num_input_tokens_seen": 98922640, "step": 146785 }, { "epoch": 3.5861041213690665, "grad_norm": 0.013689137063920498, "learning_rate": 4.488923717059647e-07, "loss": 0.0, "num_input_tokens_seen": 98925712, "step": 146790 }, { "epoch": 3.5862262722009137, "grad_norm": 0.006791385821998119, "learning_rate": 4.4882121516711937e-07, "loss": 0.0, "num_input_tokens_seen": 98929168, "step": 146795 }, { "epoch": 3.586348423032761, "grad_norm": 0.15933595597743988, "learning_rate": 4.4875006263659445e-07, "loss": 0.0001, "num_input_tokens_seen": 98932752, "step": 146800 }, { "epoch": 3.586470573864608, "grad_norm": 60.48442077636719, "learning_rate": 4.486789141149069e-07, "loss": 0.083, "num_input_tokens_seen": 98936208, "step": 146805 }, { "epoch": 3.5865927246964553, "grad_norm": 0.0009146135998889804, "learning_rate": 4.4860776960257495e-07, "loss": 0.0001, "num_input_tokens_seen": 98939280, "step": 146810 }, { "epoch": 3.5867148755283025, "grad_norm": 0.0010512598091736436, "learning_rate": 4.4853662910011524e-07, "loss": 0.0, "num_input_tokens_seen": 98942352, "step": 146815 }, { "epoch": 3.5868370263601497, "grad_norm": 0.008385025896131992, "learning_rate": 4.484654926080459e-07, "loss": 0.0002, "num_input_tokens_seen": 98945680, "step": 146820 }, { "epoch": 3.5869591771919964, "grad_norm": 0.00016320282884407789, "learning_rate": 4.4839436012688336e-07, "loss": 0.0, "num_input_tokens_seen": 98949520, "step": 146825 }, { "epoch": 3.587081328023844, "grad_norm": 0.009338638745248318, "learning_rate": 4.483232316571459e-07, "loss": 0.0, "num_input_tokens_seen": 98952848, "step": 146830 }, { "epoch": 3.587203478855691, "grad_norm": 0.15116487443447113, "learning_rate": 4.482521071993498e-07, "loss": 0.0001, "num_input_tokens_seen": 98956112, "step": 146835 }, { "epoch": 3.5873256296875384, "grad_norm": 0.0012497843708842993, "learning_rate": 4.481809867540133e-07, "loss": 0.0, "num_input_tokens_seen": 98959376, "step": 146840 }, { "epoch": 3.587447780519385, "grad_norm": 0.0009556132717989385, "learning_rate": 4.4810987032165257e-07, "loss": 0.0, "num_input_tokens_seen": 98963536, "step": 146845 }, { "epoch": 3.5875699313512324, "grad_norm": 0.00022231374168768525, "learning_rate": 4.480387579027853e-07, "loss": 0.0, "num_input_tokens_seen": 98966800, "step": 146850 }, { "epoch": 3.5876920821830796, "grad_norm": 0.0007160860113799572, "learning_rate": 4.479676494979291e-07, "loss": 0.0, "num_input_tokens_seen": 98970192, "step": 146855 }, { "epoch": 3.5878142330149267, "grad_norm": 0.003910457249730825, "learning_rate": 4.4789654510760023e-07, "loss": 0.0, "num_input_tokens_seen": 98973072, "step": 146860 }, { "epoch": 3.587936383846774, "grad_norm": 0.011815412901341915, "learning_rate": 4.478254447323165e-07, "loss": 0.0453, "num_input_tokens_seen": 98976272, "step": 146865 }, { "epoch": 3.588058534678621, "grad_norm": 7.326767081394792e-05, "learning_rate": 4.477543483725944e-07, "loss": 0.0, "num_input_tokens_seen": 98979728, "step": 146870 }, { "epoch": 3.5881806855104683, "grad_norm": 0.11559463292360306, "learning_rate": 4.4768325602895116e-07, "loss": 0.0, "num_input_tokens_seen": 98983312, "step": 146875 }, { "epoch": 3.5883028363423155, "grad_norm": 0.004063963890075684, "learning_rate": 4.476121677019042e-07, "loss": 0.0, "num_input_tokens_seen": 98986512, "step": 146880 }, { "epoch": 3.5884249871741627, "grad_norm": 0.007323488127440214, "learning_rate": 4.4754108339196974e-07, "loss": 0.0224, "num_input_tokens_seen": 98989584, "step": 146885 }, { "epoch": 3.58854713800601, "grad_norm": 0.0003849711793009192, "learning_rate": 4.4747000309966553e-07, "loss": 0.0, "num_input_tokens_seen": 98993104, "step": 146890 }, { "epoch": 3.588669288837857, "grad_norm": 0.00032672181259840727, "learning_rate": 4.4739892682550763e-07, "loss": 0.0, "num_input_tokens_seen": 98996240, "step": 146895 }, { "epoch": 3.5887914396697043, "grad_norm": 0.02428433485329151, "learning_rate": 4.4732785457001375e-07, "loss": 0.0, "num_input_tokens_seen": 98999696, "step": 146900 }, { "epoch": 3.5889135905015515, "grad_norm": 0.0003919892478734255, "learning_rate": 4.472567863337001e-07, "loss": 0.0003, "num_input_tokens_seen": 99003344, "step": 146905 }, { "epoch": 3.5890357413333986, "grad_norm": 0.002559587825089693, "learning_rate": 4.4718572211708406e-07, "loss": 0.0001, "num_input_tokens_seen": 99007184, "step": 146910 }, { "epoch": 3.589157892165246, "grad_norm": 0.0003037639253307134, "learning_rate": 4.4711466192068215e-07, "loss": 0.0, "num_input_tokens_seen": 99010704, "step": 146915 }, { "epoch": 3.5892800429970926, "grad_norm": 0.0018825943116098642, "learning_rate": 4.4704360574501075e-07, "loss": 0.0725, "num_input_tokens_seen": 99014416, "step": 146920 }, { "epoch": 3.58940219382894, "grad_norm": 0.00019972519658040255, "learning_rate": 4.469725535905873e-07, "loss": 0.0, "num_input_tokens_seen": 99018384, "step": 146925 }, { "epoch": 3.589524344660787, "grad_norm": 0.0024977638386189938, "learning_rate": 4.4690150545792784e-07, "loss": 0.0, "num_input_tokens_seen": 99021392, "step": 146930 }, { "epoch": 3.589646495492634, "grad_norm": 0.0011735991574823856, "learning_rate": 4.4683046134754976e-07, "loss": 0.0, "num_input_tokens_seen": 99024528, "step": 146935 }, { "epoch": 3.5897686463244813, "grad_norm": 0.0015995007706806064, "learning_rate": 4.46759421259969e-07, "loss": 0.0545, "num_input_tokens_seen": 99027536, "step": 146940 }, { "epoch": 3.5898907971563285, "grad_norm": 0.0017878710059449077, "learning_rate": 4.466883851957026e-07, "loss": 0.0, "num_input_tokens_seen": 99033232, "step": 146945 }, { "epoch": 3.5900129479881757, "grad_norm": 0.00105475343298167, "learning_rate": 4.466173531552674e-07, "loss": 0.0001, "num_input_tokens_seen": 99036368, "step": 146950 }, { "epoch": 3.590135098820023, "grad_norm": 0.0045726606622338295, "learning_rate": 4.465463251391792e-07, "loss": 0.0, "num_input_tokens_seen": 99039632, "step": 146955 }, { "epoch": 3.59025724965187, "grad_norm": 0.0008261458133347332, "learning_rate": 4.464753011479555e-07, "loss": 0.0, "num_input_tokens_seen": 99043152, "step": 146960 }, { "epoch": 3.5903794004837173, "grad_norm": 0.35889339447021484, "learning_rate": 4.464042811821118e-07, "loss": 0.0001, "num_input_tokens_seen": 99048400, "step": 146965 }, { "epoch": 3.5905015513155645, "grad_norm": 0.0002090990892611444, "learning_rate": 4.4633326524216517e-07, "loss": 0.0001, "num_input_tokens_seen": 99051664, "step": 146970 }, { "epoch": 3.5906237021474117, "grad_norm": 0.001780422287993133, "learning_rate": 4.462622533286322e-07, "loss": 0.0, "num_input_tokens_seen": 99054608, "step": 146975 }, { "epoch": 3.590745852979259, "grad_norm": 0.0003043642791453749, "learning_rate": 4.461912454420288e-07, "loss": 0.0, "num_input_tokens_seen": 99058320, "step": 146980 }, { "epoch": 3.590868003811106, "grad_norm": 0.0010360745945945382, "learning_rate": 4.4612024158287196e-07, "loss": 0.0, "num_input_tokens_seen": 99061648, "step": 146985 }, { "epoch": 3.5909901546429532, "grad_norm": 0.013427109457552433, "learning_rate": 4.4604924175167737e-07, "loss": 0.0, "num_input_tokens_seen": 99065104, "step": 146990 }, { "epoch": 3.5911123054748004, "grad_norm": 0.0010030671255663037, "learning_rate": 4.4597824594896194e-07, "loss": 0.0, "num_input_tokens_seen": 99068432, "step": 146995 }, { "epoch": 3.5912344563066476, "grad_norm": 0.00030616443837061524, "learning_rate": 4.459072541752414e-07, "loss": 0.0, "num_input_tokens_seen": 99071888, "step": 147000 }, { "epoch": 3.5913566071384944, "grad_norm": 0.0007564072730019689, "learning_rate": 4.4583626643103276e-07, "loss": 0.1026, "num_input_tokens_seen": 99075152, "step": 147005 }, { "epoch": 3.591478757970342, "grad_norm": 0.003279258031398058, "learning_rate": 4.4576528271685143e-07, "loss": 0.0, "num_input_tokens_seen": 99078608, "step": 147010 }, { "epoch": 3.5916009088021887, "grad_norm": 0.0007738301646895707, "learning_rate": 4.4569430303321445e-07, "loss": 0.0, "num_input_tokens_seen": 99082256, "step": 147015 }, { "epoch": 3.5917230596340364, "grad_norm": 0.0013760982546955347, "learning_rate": 4.456233273806376e-07, "loss": 0.0003, "num_input_tokens_seen": 99085520, "step": 147020 }, { "epoch": 3.591845210465883, "grad_norm": 0.012807670049369335, "learning_rate": 4.4555235575963655e-07, "loss": 0.0, "num_input_tokens_seen": 99089040, "step": 147025 }, { "epoch": 3.5919673612977303, "grad_norm": 0.00666635949164629, "learning_rate": 4.45481388170728e-07, "loss": 0.0, "num_input_tokens_seen": 99092560, "step": 147030 }, { "epoch": 3.5920895121295775, "grad_norm": 44.692710876464844, "learning_rate": 4.4541042461442824e-07, "loss": 0.0414, "num_input_tokens_seen": 99095952, "step": 147035 }, { "epoch": 3.5922116629614247, "grad_norm": 0.0031907649245113134, "learning_rate": 4.4533946509125267e-07, "loss": 0.0, "num_input_tokens_seen": 99099536, "step": 147040 }, { "epoch": 3.592333813793272, "grad_norm": 0.011112304404377937, "learning_rate": 4.4526850960171813e-07, "loss": 0.0, "num_input_tokens_seen": 99102544, "step": 147045 }, { "epoch": 3.592455964625119, "grad_norm": 0.0003050380910281092, "learning_rate": 4.4519755814633974e-07, "loss": 0.0, "num_input_tokens_seen": 99105616, "step": 147050 }, { "epoch": 3.5925781154569663, "grad_norm": 0.0003009406791534275, "learning_rate": 4.451266107256344e-07, "loss": 0.0, "num_input_tokens_seen": 99109008, "step": 147055 }, { "epoch": 3.5927002662888134, "grad_norm": 0.00098048010841012, "learning_rate": 4.4505566734011713e-07, "loss": 0.0304, "num_input_tokens_seen": 99111888, "step": 147060 }, { "epoch": 3.5928224171206606, "grad_norm": 0.0001502260274719447, "learning_rate": 4.449847279903044e-07, "loss": 0.0898, "num_input_tokens_seen": 99115024, "step": 147065 }, { "epoch": 3.592944567952508, "grad_norm": 0.008276687934994698, "learning_rate": 4.4491379267671227e-07, "loss": 0.0, "num_input_tokens_seen": 99118288, "step": 147070 }, { "epoch": 3.593066718784355, "grad_norm": 0.0011936317896470428, "learning_rate": 4.4484286139985604e-07, "loss": 0.0, "num_input_tokens_seen": 99121744, "step": 147075 }, { "epoch": 3.593188869616202, "grad_norm": 0.001968457130715251, "learning_rate": 4.447719341602522e-07, "loss": 0.0, "num_input_tokens_seen": 99125136, "step": 147080 }, { "epoch": 3.5933110204480494, "grad_norm": 0.0005487522576004267, "learning_rate": 4.4470101095841584e-07, "loss": 0.0, "num_input_tokens_seen": 99128336, "step": 147085 }, { "epoch": 3.593433171279896, "grad_norm": 0.001973625971004367, "learning_rate": 4.4463009179486344e-07, "loss": 0.0, "num_input_tokens_seen": 99131728, "step": 147090 }, { "epoch": 3.5935553221117438, "grad_norm": 0.0011422754032537341, "learning_rate": 4.4455917667011e-07, "loss": 0.0, "num_input_tokens_seen": 99135568, "step": 147095 }, { "epoch": 3.5936774729435905, "grad_norm": 0.33334842324256897, "learning_rate": 4.444882655846717e-07, "loss": 0.0001, "num_input_tokens_seen": 99138512, "step": 147100 }, { "epoch": 3.593799623775438, "grad_norm": 0.0004663039289880544, "learning_rate": 4.4441735853906437e-07, "loss": 0.0, "num_input_tokens_seen": 99141648, "step": 147105 }, { "epoch": 3.593921774607285, "grad_norm": 0.01186708640307188, "learning_rate": 4.443464555338031e-07, "loss": 0.0524, "num_input_tokens_seen": 99144592, "step": 147110 }, { "epoch": 3.594043925439132, "grad_norm": 0.007880584336817265, "learning_rate": 4.4427555656940443e-07, "loss": 0.0, "num_input_tokens_seen": 99147664, "step": 147115 }, { "epoch": 3.5941660762709793, "grad_norm": 0.0002760709321592003, "learning_rate": 4.4420466164638316e-07, "loss": 0.0001, "num_input_tokens_seen": 99150736, "step": 147120 }, { "epoch": 3.5942882271028265, "grad_norm": 0.06267506629228592, "learning_rate": 4.4413377076525484e-07, "loss": 0.0, "num_input_tokens_seen": 99153936, "step": 147125 }, { "epoch": 3.5944103779346737, "grad_norm": 0.0029153351206332445, "learning_rate": 4.4406288392653556e-07, "loss": 0.0001, "num_input_tokens_seen": 99157136, "step": 147130 }, { "epoch": 3.594532528766521, "grad_norm": 0.0005900236428715289, "learning_rate": 4.4399200113074006e-07, "loss": 0.0, "num_input_tokens_seen": 99160592, "step": 147135 }, { "epoch": 3.594654679598368, "grad_norm": 0.005414668936282396, "learning_rate": 4.439211223783849e-07, "loss": 0.0286, "num_input_tokens_seen": 99164112, "step": 147140 }, { "epoch": 3.5947768304302152, "grad_norm": 19.3892822265625, "learning_rate": 4.4385024766998426e-07, "loss": 0.0962, "num_input_tokens_seen": 99167120, "step": 147145 }, { "epoch": 3.5948989812620624, "grad_norm": 0.0007410175749100745, "learning_rate": 4.4377937700605473e-07, "loss": 0.0, "num_input_tokens_seen": 99170448, "step": 147150 }, { "epoch": 3.5950211320939096, "grad_norm": 0.003837596857920289, "learning_rate": 4.437085103871108e-07, "loss": 0.0, "num_input_tokens_seen": 99173776, "step": 147155 }, { "epoch": 3.595143282925757, "grad_norm": 0.00045322536607272923, "learning_rate": 4.4363764781366817e-07, "loss": 0.0, "num_input_tokens_seen": 99177360, "step": 147160 }, { "epoch": 3.595265433757604, "grad_norm": 0.0013312663650140166, "learning_rate": 4.435667892862426e-07, "loss": 0.0, "num_input_tokens_seen": 99180752, "step": 147165 }, { "epoch": 3.595387584589451, "grad_norm": 0.006688072811812162, "learning_rate": 4.4349593480534854e-07, "loss": 0.0052, "num_input_tokens_seen": 99184208, "step": 147170 }, { "epoch": 3.5955097354212984, "grad_norm": 0.05550628900527954, "learning_rate": 4.4342508437150214e-07, "loss": 0.0, "num_input_tokens_seen": 99187536, "step": 147175 }, { "epoch": 3.5956318862531456, "grad_norm": 0.002747165272012353, "learning_rate": 4.43354237985218e-07, "loss": 0.0, "num_input_tokens_seen": 99190736, "step": 147180 }, { "epoch": 3.5957540370849923, "grad_norm": 0.00022270670160651207, "learning_rate": 4.4328339564701143e-07, "loss": 0.0001, "num_input_tokens_seen": 99194448, "step": 147185 }, { "epoch": 3.59587618791684, "grad_norm": 0.016678297892212868, "learning_rate": 4.4321255735739816e-07, "loss": 0.0, "num_input_tokens_seen": 99197584, "step": 147190 }, { "epoch": 3.5959983387486867, "grad_norm": 0.007575098890811205, "learning_rate": 4.4314172311689244e-07, "loss": 0.0003, "num_input_tokens_seen": 99201616, "step": 147195 }, { "epoch": 3.5961204895805343, "grad_norm": 0.027997225522994995, "learning_rate": 4.430708929260104e-07, "loss": 0.0, "num_input_tokens_seen": 99204944, "step": 147200 }, { "epoch": 3.596242640412381, "grad_norm": 0.0010609532473608851, "learning_rate": 4.4300006678526615e-07, "loss": 0.0, "num_input_tokens_seen": 99208272, "step": 147205 }, { "epoch": 3.5963647912442283, "grad_norm": 0.0009776917286217213, "learning_rate": 4.429292446951757e-07, "loss": 0.0, "num_input_tokens_seen": 99211664, "step": 147210 }, { "epoch": 3.5964869420760754, "grad_norm": 0.0003319795650895685, "learning_rate": 4.4285842665625317e-07, "loss": 0.0, "num_input_tokens_seen": 99214992, "step": 147215 }, { "epoch": 3.5966090929079226, "grad_norm": 0.00017459203081671149, "learning_rate": 4.4278761266901444e-07, "loss": 0.0, "num_input_tokens_seen": 99218384, "step": 147220 }, { "epoch": 3.59673124373977, "grad_norm": 0.09625773131847382, "learning_rate": 4.4271680273397404e-07, "loss": 0.0567, "num_input_tokens_seen": 99221840, "step": 147225 }, { "epoch": 3.596853394571617, "grad_norm": 0.0011675640707835555, "learning_rate": 4.426459968516466e-07, "loss": 0.0, "num_input_tokens_seen": 99224784, "step": 147230 }, { "epoch": 3.596975545403464, "grad_norm": 0.01471780426800251, "learning_rate": 4.425751950225477e-07, "loss": 0.0, "num_input_tokens_seen": 99227536, "step": 147235 }, { "epoch": 3.5970976962353114, "grad_norm": 0.001742966822348535, "learning_rate": 4.425043972471916e-07, "loss": 0.0, "num_input_tokens_seen": 99231184, "step": 147240 }, { "epoch": 3.5972198470671586, "grad_norm": 0.0022475863806903362, "learning_rate": 4.424336035260937e-07, "loss": 0.0, "num_input_tokens_seen": 99234768, "step": 147245 }, { "epoch": 3.5973419978990058, "grad_norm": 0.006544165778905153, "learning_rate": 4.423628138597684e-07, "loss": 0.0, "num_input_tokens_seen": 99238032, "step": 147250 }, { "epoch": 3.597464148730853, "grad_norm": 0.006454230286180973, "learning_rate": 4.4229202824873056e-07, "loss": 0.0, "num_input_tokens_seen": 99241168, "step": 147255 }, { "epoch": 3.5975862995627, "grad_norm": 0.002534851199015975, "learning_rate": 4.422212466934956e-07, "loss": 0.0035, "num_input_tokens_seen": 99244560, "step": 147260 }, { "epoch": 3.5977084503945473, "grad_norm": 0.0012128119124099612, "learning_rate": 4.421504691945773e-07, "loss": 0.0001, "num_input_tokens_seen": 99247824, "step": 147265 }, { "epoch": 3.597830601226394, "grad_norm": 0.001539935707114637, "learning_rate": 4.4207969575249126e-07, "loss": 0.075, "num_input_tokens_seen": 99251280, "step": 147270 }, { "epoch": 3.5979527520582417, "grad_norm": 0.004851538222283125, "learning_rate": 4.420089263677513e-07, "loss": 0.0, "num_input_tokens_seen": 99254800, "step": 147275 }, { "epoch": 3.5980749028900885, "grad_norm": 0.0061553469859063625, "learning_rate": 4.4193816104087257e-07, "loss": 0.0, "num_input_tokens_seen": 99258256, "step": 147280 }, { "epoch": 3.598197053721936, "grad_norm": 0.0033780173398554325, "learning_rate": 4.4186739977236997e-07, "loss": 0.0, "num_input_tokens_seen": 99261392, "step": 147285 }, { "epoch": 3.598319204553783, "grad_norm": 0.011429624632000923, "learning_rate": 4.417966425627574e-07, "loss": 0.0, "num_input_tokens_seen": 99264592, "step": 147290 }, { "epoch": 3.59844135538563, "grad_norm": 16.78447723388672, "learning_rate": 4.417258894125502e-07, "loss": 0.0326, "num_input_tokens_seen": 99268048, "step": 147295 }, { "epoch": 3.5985635062174772, "grad_norm": 0.000813651888165623, "learning_rate": 4.4165514032226205e-07, "loss": 0.0001, "num_input_tokens_seen": 99271504, "step": 147300 }, { "epoch": 3.5986856570493244, "grad_norm": 0.3993775546550751, "learning_rate": 4.4158439529240834e-07, "loss": 0.0003, "num_input_tokens_seen": 99274768, "step": 147305 }, { "epoch": 3.5988078078811716, "grad_norm": 0.0026703726034611464, "learning_rate": 4.4151365432350264e-07, "loss": 0.0, "num_input_tokens_seen": 99278416, "step": 147310 }, { "epoch": 3.598929958713019, "grad_norm": 0.0016981259686872363, "learning_rate": 4.414429174160603e-07, "loss": 0.0, "num_input_tokens_seen": 99281680, "step": 147315 }, { "epoch": 3.599052109544866, "grad_norm": 0.00385949295014143, "learning_rate": 4.4137218457059487e-07, "loss": 0.0, "num_input_tokens_seen": 99285072, "step": 147320 }, { "epoch": 3.599174260376713, "grad_norm": 0.001972557744011283, "learning_rate": 4.413014557876216e-07, "loss": 0.0224, "num_input_tokens_seen": 99288144, "step": 147325 }, { "epoch": 3.5992964112085604, "grad_norm": 0.0009733116021379828, "learning_rate": 4.412307310676544e-07, "loss": 0.0, "num_input_tokens_seen": 99291472, "step": 147330 }, { "epoch": 3.5994185620404076, "grad_norm": 0.002177638001739979, "learning_rate": 4.4116001041120723e-07, "loss": 0.0, "num_input_tokens_seen": 99294992, "step": 147335 }, { "epoch": 3.5995407128722547, "grad_norm": 0.0006871279911138117, "learning_rate": 4.410892938187948e-07, "loss": 0.0, "num_input_tokens_seen": 99298448, "step": 147340 }, { "epoch": 3.599662863704102, "grad_norm": 0.007582599762827158, "learning_rate": 4.410185812909316e-07, "loss": 0.0, "num_input_tokens_seen": 99301584, "step": 147345 }, { "epoch": 3.599785014535949, "grad_norm": 0.006781270261853933, "learning_rate": 4.409478728281314e-07, "loss": 0.0, "num_input_tokens_seen": 99305424, "step": 147350 }, { "epoch": 3.5999071653677963, "grad_norm": 7.533630559919402e-05, "learning_rate": 4.4087716843090897e-07, "loss": 0.0, "num_input_tokens_seen": 99308944, "step": 147355 }, { "epoch": 3.6000293161996435, "grad_norm": 5.5826687457738444e-05, "learning_rate": 4.408064680997777e-07, "loss": 0.0, "num_input_tokens_seen": 99313040, "step": 147360 }, { "epoch": 3.6001514670314902, "grad_norm": 0.0023900719825178385, "learning_rate": 4.407357718352527e-07, "loss": 0.0, "num_input_tokens_seen": 99316368, "step": 147365 }, { "epoch": 3.600273617863338, "grad_norm": 0.0005167967756278813, "learning_rate": 4.406650796378472e-07, "loss": 0.0, "num_input_tokens_seen": 99319440, "step": 147370 }, { "epoch": 3.6003957686951846, "grad_norm": 0.0021112055983394384, "learning_rate": 4.4059439150807566e-07, "loss": 0.0, "num_input_tokens_seen": 99322960, "step": 147375 }, { "epoch": 3.600517919527032, "grad_norm": 0.00010200147517025471, "learning_rate": 4.405237074464525e-07, "loss": 0.1013, "num_input_tokens_seen": 99326416, "step": 147380 }, { "epoch": 3.600640070358879, "grad_norm": 3.1429230148205534e-05, "learning_rate": 4.404530274534911e-07, "loss": 0.0, "num_input_tokens_seen": 99329744, "step": 147385 }, { "epoch": 3.600762221190726, "grad_norm": 0.001426719594746828, "learning_rate": 4.4038235152970606e-07, "loss": 0.0, "num_input_tokens_seen": 99333200, "step": 147390 }, { "epoch": 3.6008843720225734, "grad_norm": 0.02611630968749523, "learning_rate": 4.403116796756108e-07, "loss": 0.0, "num_input_tokens_seen": 99336784, "step": 147395 }, { "epoch": 3.6010065228544206, "grad_norm": 0.0002963593287859112, "learning_rate": 4.402410118917199e-07, "loss": 0.0001, "num_input_tokens_seen": 99340048, "step": 147400 }, { "epoch": 3.6011286736862678, "grad_norm": 0.010210043750703335, "learning_rate": 4.401703481785466e-07, "loss": 0.0001, "num_input_tokens_seen": 99343568, "step": 147405 }, { "epoch": 3.601250824518115, "grad_norm": 0.6873293519020081, "learning_rate": 4.4009968853660496e-07, "loss": 0.059, "num_input_tokens_seen": 99346576, "step": 147410 }, { "epoch": 3.601372975349962, "grad_norm": 0.00032992829801514745, "learning_rate": 4.4002903296640947e-07, "loss": 0.0, "num_input_tokens_seen": 99349840, "step": 147415 }, { "epoch": 3.6014951261818093, "grad_norm": 0.003081189002841711, "learning_rate": 4.399583814684731e-07, "loss": 0.0, "num_input_tokens_seen": 99353360, "step": 147420 }, { "epoch": 3.6016172770136565, "grad_norm": 0.00017771427519619465, "learning_rate": 4.3988773404331027e-07, "loss": 0.0001, "num_input_tokens_seen": 99356880, "step": 147425 }, { "epoch": 3.6017394278455037, "grad_norm": 0.001582012395374477, "learning_rate": 4.398170906914346e-07, "loss": 0.0, "num_input_tokens_seen": 99359952, "step": 147430 }, { "epoch": 3.601861578677351, "grad_norm": 0.0001949633879121393, "learning_rate": 4.397464514133593e-07, "loss": 0.0, "num_input_tokens_seen": 99363408, "step": 147435 }, { "epoch": 3.601983729509198, "grad_norm": 0.006579456850886345, "learning_rate": 4.3967581620959893e-07, "loss": 0.0349, "num_input_tokens_seen": 99367184, "step": 147440 }, { "epoch": 3.6021058803410453, "grad_norm": 0.0007167812436819077, "learning_rate": 4.3960518508066624e-07, "loss": 0.0002, "num_input_tokens_seen": 99370640, "step": 147445 }, { "epoch": 3.602228031172892, "grad_norm": 371.5362548828125, "learning_rate": 4.3953455802707587e-07, "loss": 0.0122, "num_input_tokens_seen": 99373904, "step": 147450 }, { "epoch": 3.6023501820047397, "grad_norm": 0.09295155853033066, "learning_rate": 4.3946393504934054e-07, "loss": 0.0, "num_input_tokens_seen": 99377744, "step": 147455 }, { "epoch": 3.6024723328365864, "grad_norm": 0.008168951608240604, "learning_rate": 4.3939331614797457e-07, "loss": 0.0001, "num_input_tokens_seen": 99380752, "step": 147460 }, { "epoch": 3.602594483668434, "grad_norm": 0.003420223481953144, "learning_rate": 4.393227013234908e-07, "loss": 0.0, "num_input_tokens_seen": 99383568, "step": 147465 }, { "epoch": 3.602716634500281, "grad_norm": 0.03465414419770241, "learning_rate": 4.392520905764032e-07, "loss": 0.0, "num_input_tokens_seen": 99386704, "step": 147470 }, { "epoch": 3.602838785332128, "grad_norm": 0.00030073043308220804, "learning_rate": 4.391814839072255e-07, "loss": 0.0, "num_input_tokens_seen": 99390032, "step": 147475 }, { "epoch": 3.602960936163975, "grad_norm": 0.002083902945742011, "learning_rate": 4.3911088131647066e-07, "loss": 0.0, "num_input_tokens_seen": 99392976, "step": 147480 }, { "epoch": 3.6030830869958224, "grad_norm": 0.005192081443965435, "learning_rate": 4.3904028280465267e-07, "loss": 0.0, "num_input_tokens_seen": 99396496, "step": 147485 }, { "epoch": 3.6032052378276695, "grad_norm": 0.0013596249045804143, "learning_rate": 4.389696883722842e-07, "loss": 0.0, "num_input_tokens_seen": 99400400, "step": 147490 }, { "epoch": 3.6033273886595167, "grad_norm": 0.0020743070635944605, "learning_rate": 4.3889909801987955e-07, "loss": 0.0, "num_input_tokens_seen": 99404496, "step": 147495 }, { "epoch": 3.603449539491364, "grad_norm": 0.027771538123488426, "learning_rate": 4.3882851174795113e-07, "loss": 0.0001, "num_input_tokens_seen": 99408208, "step": 147500 }, { "epoch": 3.603571690323211, "grad_norm": 0.0006288870936259627, "learning_rate": 4.387579295570126e-07, "loss": 0.0001, "num_input_tokens_seen": 99411280, "step": 147505 }, { "epoch": 3.6036938411550583, "grad_norm": 59.22822570800781, "learning_rate": 4.3868735144757787e-07, "loss": 0.0457, "num_input_tokens_seen": 99414672, "step": 147510 }, { "epoch": 3.6038159919869055, "grad_norm": 0.006329473108053207, "learning_rate": 4.3861677742015937e-07, "loss": 0.0, "num_input_tokens_seen": 99417872, "step": 147515 }, { "epoch": 3.6039381428187527, "grad_norm": 0.13014653325080872, "learning_rate": 4.3854620747527095e-07, "loss": 0.0001, "num_input_tokens_seen": 99421328, "step": 147520 }, { "epoch": 3.6040602936506, "grad_norm": 0.0023727233055979013, "learning_rate": 4.384756416134251e-07, "loss": 0.0, "num_input_tokens_seen": 99424528, "step": 147525 }, { "epoch": 3.604182444482447, "grad_norm": 0.00016004152712412179, "learning_rate": 4.3840507983513585e-07, "loss": 0.0, "num_input_tokens_seen": 99427664, "step": 147530 }, { "epoch": 3.604304595314294, "grad_norm": 0.00026830736896954477, "learning_rate": 4.383345221409159e-07, "loss": 0.0, "num_input_tokens_seen": 99430864, "step": 147535 }, { "epoch": 3.6044267461461414, "grad_norm": 0.04292040690779686, "learning_rate": 4.38263968531278e-07, "loss": 0.1113, "num_input_tokens_seen": 99434128, "step": 147540 }, { "epoch": 3.604548896977988, "grad_norm": 0.06387756019830704, "learning_rate": 4.38193419006736e-07, "loss": 0.0002, "num_input_tokens_seen": 99437584, "step": 147545 }, { "epoch": 3.604671047809836, "grad_norm": 0.006251856684684753, "learning_rate": 4.381228735678022e-07, "loss": 0.0, "num_input_tokens_seen": 99440656, "step": 147550 }, { "epoch": 3.6047931986416826, "grad_norm": 0.06982557475566864, "learning_rate": 4.380523322149903e-07, "loss": 0.0, "num_input_tokens_seen": 99444560, "step": 147555 }, { "epoch": 3.6049153494735298, "grad_norm": 0.004347453825175762, "learning_rate": 4.379817949488127e-07, "loss": 0.0, "num_input_tokens_seen": 99447824, "step": 147560 }, { "epoch": 3.605037500305377, "grad_norm": 0.00021586604998447, "learning_rate": 4.3791126176978254e-07, "loss": 0.0001, "num_input_tokens_seen": 99452304, "step": 147565 }, { "epoch": 3.605159651137224, "grad_norm": 0.0011594532988965511, "learning_rate": 4.378407326784134e-07, "loss": 0.0, "num_input_tokens_seen": 99455888, "step": 147570 }, { "epoch": 3.6052818019690713, "grad_norm": 0.00046078336890786886, "learning_rate": 4.377702076752171e-07, "loss": 0.0001, "num_input_tokens_seen": 99459408, "step": 147575 }, { "epoch": 3.6054039528009185, "grad_norm": 0.01156389620155096, "learning_rate": 4.3769968676070757e-07, "loss": 0.0, "num_input_tokens_seen": 99462928, "step": 147580 }, { "epoch": 3.6055261036327657, "grad_norm": 0.008867439813911915, "learning_rate": 4.376291699353968e-07, "loss": 0.0, "num_input_tokens_seen": 99466128, "step": 147585 }, { "epoch": 3.605648254464613, "grad_norm": 9.969405800802633e-05, "learning_rate": 4.375586571997979e-07, "loss": 0.0, "num_input_tokens_seen": 99469328, "step": 147590 }, { "epoch": 3.60577040529646, "grad_norm": 6.560365727636963e-05, "learning_rate": 4.374881485544241e-07, "loss": 0.0, "num_input_tokens_seen": 99472848, "step": 147595 }, { "epoch": 3.6058925561283073, "grad_norm": 0.0015670241555199027, "learning_rate": 4.3741764399978743e-07, "loss": 0.0, "num_input_tokens_seen": 99476560, "step": 147600 }, { "epoch": 3.6060147069601545, "grad_norm": 0.00015558060840703547, "learning_rate": 4.3734714353640124e-07, "loss": 0.0, "num_input_tokens_seen": 99480016, "step": 147605 }, { "epoch": 3.6061368577920017, "grad_norm": 0.001948792953044176, "learning_rate": 4.3727664716477773e-07, "loss": 0.0, "num_input_tokens_seen": 99484304, "step": 147610 }, { "epoch": 3.606259008623849, "grad_norm": 0.0182126946747303, "learning_rate": 4.372061548854301e-07, "loss": 0.0, "num_input_tokens_seen": 99487440, "step": 147615 }, { "epoch": 3.606381159455696, "grad_norm": 0.0016897412715479732, "learning_rate": 4.371356666988704e-07, "loss": 0.0258, "num_input_tokens_seen": 99490640, "step": 147620 }, { "epoch": 3.6065033102875432, "grad_norm": 0.010075540281832218, "learning_rate": 4.3706518260561175e-07, "loss": 0.0, "num_input_tokens_seen": 99493776, "step": 147625 }, { "epoch": 3.60662546111939, "grad_norm": 31.596620559692383, "learning_rate": 4.369947026061662e-07, "loss": 0.0891, "num_input_tokens_seen": 99496912, "step": 147630 }, { "epoch": 3.6067476119512376, "grad_norm": 0.0003606700920499861, "learning_rate": 4.36924226701047e-07, "loss": 0.0, "num_input_tokens_seen": 99500304, "step": 147635 }, { "epoch": 3.6068697627830844, "grad_norm": 0.004157507326453924, "learning_rate": 4.368537548907663e-07, "loss": 0.0, "num_input_tokens_seen": 99503504, "step": 147640 }, { "epoch": 3.606991913614932, "grad_norm": 0.0006897756829857826, "learning_rate": 4.3678328717583614e-07, "loss": 0.0001, "num_input_tokens_seen": 99507408, "step": 147645 }, { "epoch": 3.6071140644467787, "grad_norm": 0.00255883508361876, "learning_rate": 4.3671282355676976e-07, "loss": 0.1092, "num_input_tokens_seen": 99510928, "step": 147650 }, { "epoch": 3.607236215278626, "grad_norm": 0.05700772628188133, "learning_rate": 4.3664236403407886e-07, "loss": 0.0476, "num_input_tokens_seen": 99514192, "step": 147655 }, { "epoch": 3.607358366110473, "grad_norm": 0.011682671494781971, "learning_rate": 4.365719086082762e-07, "loss": 0.0003, "num_input_tokens_seen": 99517392, "step": 147660 }, { "epoch": 3.6074805169423203, "grad_norm": 0.0015562445623800159, "learning_rate": 4.365014572798745e-07, "loss": 0.0533, "num_input_tokens_seen": 99520784, "step": 147665 }, { "epoch": 3.6076026677741675, "grad_norm": 0.0011776462197303772, "learning_rate": 4.364310100493855e-07, "loss": 0.0, "num_input_tokens_seen": 99524304, "step": 147670 }, { "epoch": 3.6077248186060147, "grad_norm": 0.00629253638908267, "learning_rate": 4.3636056691732214e-07, "loss": 0.0, "num_input_tokens_seen": 99527632, "step": 147675 }, { "epoch": 3.607846969437862, "grad_norm": 0.08474673330783844, "learning_rate": 4.3629012788419597e-07, "loss": 0.0002, "num_input_tokens_seen": 99530896, "step": 147680 }, { "epoch": 3.607969120269709, "grad_norm": 0.0008923867717385292, "learning_rate": 4.3621969295051953e-07, "loss": 0.0, "num_input_tokens_seen": 99534032, "step": 147685 }, { "epoch": 3.6080912711015563, "grad_norm": 0.007003965321928263, "learning_rate": 4.361492621168056e-07, "loss": 0.0, "num_input_tokens_seen": 99537680, "step": 147690 }, { "epoch": 3.6082134219334034, "grad_norm": 0.006107357330620289, "learning_rate": 4.360788353835654e-07, "loss": 0.0, "num_input_tokens_seen": 99541136, "step": 147695 }, { "epoch": 3.6083355727652506, "grad_norm": 0.003756742924451828, "learning_rate": 4.3600841275131204e-07, "loss": 0.0001, "num_input_tokens_seen": 99544336, "step": 147700 }, { "epoch": 3.608457723597098, "grad_norm": 0.0005559317069128156, "learning_rate": 4.359379942205568e-07, "loss": 0.0, "num_input_tokens_seen": 99548176, "step": 147705 }, { "epoch": 3.608579874428945, "grad_norm": 0.004919090308248997, "learning_rate": 4.3586757979181254e-07, "loss": 0.0, "num_input_tokens_seen": 99551376, "step": 147710 }, { "epoch": 3.6087020252607918, "grad_norm": 0.003267875872552395, "learning_rate": 4.3579716946559064e-07, "loss": 0.0, "num_input_tokens_seen": 99554704, "step": 147715 }, { "epoch": 3.6088241760926394, "grad_norm": 0.005583520978689194, "learning_rate": 4.357267632424034e-07, "loss": 0.0, "num_input_tokens_seen": 99558224, "step": 147720 }, { "epoch": 3.608946326924486, "grad_norm": 0.003153529716655612, "learning_rate": 4.356563611227634e-07, "loss": 0.0451, "num_input_tokens_seen": 99561168, "step": 147725 }, { "epoch": 3.6090684777563338, "grad_norm": 0.001594212488271296, "learning_rate": 4.3558596310718166e-07, "loss": 0.0, "num_input_tokens_seen": 99564560, "step": 147730 }, { "epoch": 3.6091906285881805, "grad_norm": 0.0006904274341650307, "learning_rate": 4.355155691961711e-07, "loss": 0.0, "num_input_tokens_seen": 99567696, "step": 147735 }, { "epoch": 3.6093127794200277, "grad_norm": 0.0009084900375455618, "learning_rate": 4.3544517939024304e-07, "loss": 0.0, "num_input_tokens_seen": 99570640, "step": 147740 }, { "epoch": 3.609434930251875, "grad_norm": 0.4451712965965271, "learning_rate": 4.3537479368990917e-07, "loss": 0.0001, "num_input_tokens_seen": 99574096, "step": 147745 }, { "epoch": 3.609557081083722, "grad_norm": 0.011974464170634747, "learning_rate": 4.353044120956819e-07, "loss": 0.0, "num_input_tokens_seen": 99577616, "step": 147750 }, { "epoch": 3.6096792319155693, "grad_norm": 0.0011010023299604654, "learning_rate": 4.352340346080726e-07, "loss": 0.0, "num_input_tokens_seen": 99581456, "step": 147755 }, { "epoch": 3.6098013827474165, "grad_norm": 0.0023438516072928905, "learning_rate": 4.351636612275936e-07, "loss": 0.0001, "num_input_tokens_seen": 99584912, "step": 147760 }, { "epoch": 3.6099235335792637, "grad_norm": 0.027717119082808495, "learning_rate": 4.350932919547561e-07, "loss": 0.0, "num_input_tokens_seen": 99588112, "step": 147765 }, { "epoch": 3.610045684411111, "grad_norm": 0.0070284148678183556, "learning_rate": 4.350229267900725e-07, "loss": 0.0, "num_input_tokens_seen": 99592400, "step": 147770 }, { "epoch": 3.610167835242958, "grad_norm": 6.282122194534168e-05, "learning_rate": 4.349525657340536e-07, "loss": 0.0, "num_input_tokens_seen": 99595664, "step": 147775 }, { "epoch": 3.6102899860748052, "grad_norm": 0.028719637542963028, "learning_rate": 4.348822087872117e-07, "loss": 0.0005, "num_input_tokens_seen": 99598608, "step": 147780 }, { "epoch": 3.6104121369066524, "grad_norm": 0.004688702989369631, "learning_rate": 4.3481185595005875e-07, "loss": 0.0711, "num_input_tokens_seen": 99601616, "step": 147785 }, { "epoch": 3.6105342877384996, "grad_norm": 0.1381920576095581, "learning_rate": 4.347415072231058e-07, "loss": 0.0001, "num_input_tokens_seen": 99604944, "step": 147790 }, { "epoch": 3.610656438570347, "grad_norm": 32.13233947753906, "learning_rate": 4.3467116260686474e-07, "loss": 0.0344, "num_input_tokens_seen": 99608272, "step": 147795 }, { "epoch": 3.610778589402194, "grad_norm": 0.002420686185359955, "learning_rate": 4.3460082210184677e-07, "loss": 0.0, "num_input_tokens_seen": 99611664, "step": 147800 }, { "epoch": 3.610900740234041, "grad_norm": 0.00014533349894918501, "learning_rate": 4.345304857085642e-07, "loss": 0.0, "num_input_tokens_seen": 99614800, "step": 147805 }, { "epoch": 3.611022891065888, "grad_norm": 0.00020457235223148018, "learning_rate": 4.344601534275275e-07, "loss": 0.0, "num_input_tokens_seen": 99618000, "step": 147810 }, { "epoch": 3.6111450418977356, "grad_norm": 0.008015124127268791, "learning_rate": 4.3438982525924874e-07, "loss": 0.0, "num_input_tokens_seen": 99621456, "step": 147815 }, { "epoch": 3.6112671927295823, "grad_norm": 0.0028267372399568558, "learning_rate": 4.3431950120423963e-07, "loss": 0.0, "num_input_tokens_seen": 99625040, "step": 147820 }, { "epoch": 3.6113893435614295, "grad_norm": 0.0005698919994756579, "learning_rate": 4.342491812630109e-07, "loss": 0.0, "num_input_tokens_seen": 99628752, "step": 147825 }, { "epoch": 3.6115114943932767, "grad_norm": 0.004629726056009531, "learning_rate": 4.3417886543607474e-07, "loss": 0.0, "num_input_tokens_seen": 99632464, "step": 147830 }, { "epoch": 3.611633645225124, "grad_norm": 0.0010731581132858992, "learning_rate": 4.341085537239416e-07, "loss": 0.0775, "num_input_tokens_seen": 99635472, "step": 147835 }, { "epoch": 3.611755796056971, "grad_norm": 0.0005755699821747839, "learning_rate": 4.340382461271237e-07, "loss": 0.0, "num_input_tokens_seen": 99638928, "step": 147840 }, { "epoch": 3.6118779468888182, "grad_norm": 0.029588622972369194, "learning_rate": 4.339679426461319e-07, "loss": 0.0, "num_input_tokens_seen": 99642320, "step": 147845 }, { "epoch": 3.6120000977206654, "grad_norm": 0.002909613074734807, "learning_rate": 4.3389764328147706e-07, "loss": 0.0, "num_input_tokens_seen": 99646352, "step": 147850 }, { "epoch": 3.6121222485525126, "grad_norm": 0.036447200924158096, "learning_rate": 4.338273480336712e-07, "loss": 0.0453, "num_input_tokens_seen": 99649552, "step": 147855 }, { "epoch": 3.61224439938436, "grad_norm": 0.00029940472450107336, "learning_rate": 4.3375705690322474e-07, "loss": 0.0, "num_input_tokens_seen": 99653136, "step": 147860 }, { "epoch": 3.612366550216207, "grad_norm": 0.00088671495905146, "learning_rate": 4.336867698906497e-07, "loss": 0.0, "num_input_tokens_seen": 99655888, "step": 147865 }, { "epoch": 3.612488701048054, "grad_norm": 0.01289328932762146, "learning_rate": 4.3361648699645623e-07, "loss": 0.0, "num_input_tokens_seen": 99659408, "step": 147870 }, { "epoch": 3.6126108518799014, "grad_norm": 0.0001773501862771809, "learning_rate": 4.335462082211563e-07, "loss": 0.0, "num_input_tokens_seen": 99663184, "step": 147875 }, { "epoch": 3.6127330027117486, "grad_norm": 0.46676599979400635, "learning_rate": 4.3347593356526093e-07, "loss": 0.0002, "num_input_tokens_seen": 99666512, "step": 147880 }, { "epoch": 3.6128551535435958, "grad_norm": 0.0017477070214226842, "learning_rate": 4.334056630292805e-07, "loss": 0.0, "num_input_tokens_seen": 99669840, "step": 147885 }, { "epoch": 3.612977304375443, "grad_norm": 0.0036420223768800497, "learning_rate": 4.33335396613727e-07, "loss": 0.0001, "num_input_tokens_seen": 99673232, "step": 147890 }, { "epoch": 3.6130994552072897, "grad_norm": 0.08501607924699783, "learning_rate": 4.332651343191104e-07, "loss": 0.0001, "num_input_tokens_seen": 99676304, "step": 147895 }, { "epoch": 3.6132216060391373, "grad_norm": 0.0049306293949484825, "learning_rate": 4.331948761459423e-07, "loss": 0.0009, "num_input_tokens_seen": 99679568, "step": 147900 }, { "epoch": 3.613343756870984, "grad_norm": 0.014766248874366283, "learning_rate": 4.331246220947338e-07, "loss": 0.0, "num_input_tokens_seen": 99682896, "step": 147905 }, { "epoch": 3.6134659077028317, "grad_norm": 0.01937950775027275, "learning_rate": 4.3305437216599517e-07, "loss": 0.0, "num_input_tokens_seen": 99686544, "step": 147910 }, { "epoch": 3.6135880585346785, "grad_norm": 0.0013723402516916394, "learning_rate": 4.3298412636023797e-07, "loss": 0.0001, "num_input_tokens_seen": 99690192, "step": 147915 }, { "epoch": 3.6137102093665256, "grad_norm": 0.002116596093401313, "learning_rate": 4.329138846779724e-07, "loss": 0.0, "num_input_tokens_seen": 99693840, "step": 147920 }, { "epoch": 3.613832360198373, "grad_norm": 0.0024759876541793346, "learning_rate": 4.3284364711971e-07, "loss": 0.0006, "num_input_tokens_seen": 99697040, "step": 147925 }, { "epoch": 3.61395451103022, "grad_norm": 0.0009556720615364611, "learning_rate": 4.3277341368596066e-07, "loss": 0.0406, "num_input_tokens_seen": 99700304, "step": 147930 }, { "epoch": 3.614076661862067, "grad_norm": 0.0030979400034993887, "learning_rate": 4.327031843772361e-07, "loss": 0.049, "num_input_tokens_seen": 99703760, "step": 147935 }, { "epoch": 3.6141988126939144, "grad_norm": 0.012933360412716866, "learning_rate": 4.3263295919404605e-07, "loss": 0.0001, "num_input_tokens_seen": 99707152, "step": 147940 }, { "epoch": 3.6143209635257616, "grad_norm": 0.01721920073032379, "learning_rate": 4.3256273813690227e-07, "loss": 0.0, "num_input_tokens_seen": 99710416, "step": 147945 }, { "epoch": 3.614443114357609, "grad_norm": 0.00520155718550086, "learning_rate": 4.3249252120631474e-07, "loss": 0.0001, "num_input_tokens_seen": 99713872, "step": 147950 }, { "epoch": 3.614565265189456, "grad_norm": 0.0013286650646477938, "learning_rate": 4.3242230840279393e-07, "loss": 0.0947, "num_input_tokens_seen": 99717456, "step": 147955 }, { "epoch": 3.614687416021303, "grad_norm": 0.005257413722574711, "learning_rate": 4.3235209972685117e-07, "loss": 0.0002, "num_input_tokens_seen": 99721168, "step": 147960 }, { "epoch": 3.6148095668531504, "grad_norm": 0.03281658515334129, "learning_rate": 4.3228189517899616e-07, "loss": 0.0001, "num_input_tokens_seen": 99724880, "step": 147965 }, { "epoch": 3.6149317176849975, "grad_norm": 0.14259420335292816, "learning_rate": 4.3221169475973994e-07, "loss": 0.0, "num_input_tokens_seen": 99728144, "step": 147970 }, { "epoch": 3.6150538685168447, "grad_norm": 0.00042377988575026393, "learning_rate": 4.3214149846959336e-07, "loss": 0.0, "num_input_tokens_seen": 99731152, "step": 147975 }, { "epoch": 3.615176019348692, "grad_norm": 0.008627206087112427, "learning_rate": 4.320713063090662e-07, "loss": 0.0, "num_input_tokens_seen": 99734416, "step": 147980 }, { "epoch": 3.615298170180539, "grad_norm": 104.67365264892578, "learning_rate": 4.320011182786696e-07, "loss": 0.0224, "num_input_tokens_seen": 99737872, "step": 147985 }, { "epoch": 3.615420321012386, "grad_norm": 0.0013059648917987943, "learning_rate": 4.319309343789133e-07, "loss": 0.0, "num_input_tokens_seen": 99741136, "step": 147990 }, { "epoch": 3.6155424718442335, "grad_norm": 0.0005610277294181287, "learning_rate": 4.3186075461030803e-07, "loss": 0.0, "num_input_tokens_seen": 99744208, "step": 147995 }, { "epoch": 3.6156646226760802, "grad_norm": 0.0007101591327227652, "learning_rate": 4.317905789733645e-07, "loss": 0.0441, "num_input_tokens_seen": 99748624, "step": 148000 }, { "epoch": 3.6157867735079274, "grad_norm": 0.0024499285500496626, "learning_rate": 4.317204074685924e-07, "loss": 0.0, "num_input_tokens_seen": 99751632, "step": 148005 }, { "epoch": 3.6159089243397746, "grad_norm": 0.00240133423358202, "learning_rate": 4.316502400965026e-07, "loss": 0.0, "num_input_tokens_seen": 99754896, "step": 148010 }, { "epoch": 3.616031075171622, "grad_norm": 0.0021777721121907234, "learning_rate": 4.315800768576049e-07, "loss": 0.0, "num_input_tokens_seen": 99758288, "step": 148015 }, { "epoch": 3.616153226003469, "grad_norm": 0.0013742581941187382, "learning_rate": 4.3150991775241e-07, "loss": 0.0, "num_input_tokens_seen": 99761104, "step": 148020 }, { "epoch": 3.616275376835316, "grad_norm": 0.0009071017266251147, "learning_rate": 4.314397627814276e-07, "loss": 0.0001, "num_input_tokens_seen": 99764240, "step": 148025 }, { "epoch": 3.6163975276671634, "grad_norm": 0.013118072412908077, "learning_rate": 4.3136961194516817e-07, "loss": 0.0235, "num_input_tokens_seen": 99767696, "step": 148030 }, { "epoch": 3.6165196784990106, "grad_norm": 0.0009623137302696705, "learning_rate": 4.312994652441422e-07, "loss": 0.0382, "num_input_tokens_seen": 99770704, "step": 148035 }, { "epoch": 3.6166418293308578, "grad_norm": 0.0022995336912572384, "learning_rate": 4.3122932267885917e-07, "loss": 0.0, "num_input_tokens_seen": 99774096, "step": 148040 }, { "epoch": 3.616763980162705, "grad_norm": 0.0017281353939324617, "learning_rate": 4.311591842498298e-07, "loss": 0.0, "num_input_tokens_seen": 99777616, "step": 148045 }, { "epoch": 3.616886130994552, "grad_norm": 0.000500478723552078, "learning_rate": 4.310890499575638e-07, "loss": 0.0, "num_input_tokens_seen": 99780816, "step": 148050 }, { "epoch": 3.6170082818263993, "grad_norm": 0.0009686108096502721, "learning_rate": 4.3101891980257086e-07, "loss": 0.0001, "num_input_tokens_seen": 99783952, "step": 148055 }, { "epoch": 3.6171304326582465, "grad_norm": 0.004932576324790716, "learning_rate": 4.3094879378536185e-07, "loss": 0.0001, "num_input_tokens_seen": 99787472, "step": 148060 }, { "epoch": 3.6172525834900937, "grad_norm": 0.0024955112021416426, "learning_rate": 4.308786719064459e-07, "loss": 0.0, "num_input_tokens_seen": 99790416, "step": 148065 }, { "epoch": 3.617374734321941, "grad_norm": 0.000285703397821635, "learning_rate": 4.3080855416633363e-07, "loss": 0.0005, "num_input_tokens_seen": 99793872, "step": 148070 }, { "epoch": 3.6174968851537876, "grad_norm": 0.0008046041475608945, "learning_rate": 4.307384405655343e-07, "loss": 0.0, "num_input_tokens_seen": 99796816, "step": 148075 }, { "epoch": 3.6176190359856353, "grad_norm": 0.0003960691683460027, "learning_rate": 4.3066833110455845e-07, "loss": 0.0, "num_input_tokens_seen": 99800080, "step": 148080 }, { "epoch": 3.617741186817482, "grad_norm": 13.642656326293945, "learning_rate": 4.3059822578391536e-07, "loss": 0.0265, "num_input_tokens_seen": 99803792, "step": 148085 }, { "epoch": 3.6178633376493297, "grad_norm": 0.0023105733562260866, "learning_rate": 4.305281246041151e-07, "loss": 0.0, "num_input_tokens_seen": 99807376, "step": 148090 }, { "epoch": 3.6179854884811764, "grad_norm": 25.20553970336914, "learning_rate": 4.3045802756566787e-07, "loss": 0.0399, "num_input_tokens_seen": 99810704, "step": 148095 }, { "epoch": 3.6181076393130236, "grad_norm": 0.007407731376588345, "learning_rate": 4.3038793466908266e-07, "loss": 0.0, "num_input_tokens_seen": 99814544, "step": 148100 }, { "epoch": 3.618229790144871, "grad_norm": 0.1815802901983261, "learning_rate": 4.303178459148699e-07, "loss": 0.0001, "num_input_tokens_seen": 99818512, "step": 148105 }, { "epoch": 3.618351940976718, "grad_norm": 0.0026310270186513662, "learning_rate": 4.3024776130353866e-07, "loss": 0.0, "num_input_tokens_seen": 99822160, "step": 148110 }, { "epoch": 3.618474091808565, "grad_norm": 0.0008320335182361305, "learning_rate": 4.3017768083559933e-07, "loss": 0.0001, "num_input_tokens_seen": 99825552, "step": 148115 }, { "epoch": 3.6185962426404124, "grad_norm": 0.0030299080535769463, "learning_rate": 4.301076045115608e-07, "loss": 0.0001, "num_input_tokens_seen": 99828624, "step": 148120 }, { "epoch": 3.6187183934722595, "grad_norm": 0.019902754575014114, "learning_rate": 4.3003753233193305e-07, "loss": 0.0, "num_input_tokens_seen": 99831888, "step": 148125 }, { "epoch": 3.6188405443041067, "grad_norm": 0.01636345684528351, "learning_rate": 4.299674642972261e-07, "loss": 0.0, "num_input_tokens_seen": 99835280, "step": 148130 }, { "epoch": 3.618962695135954, "grad_norm": 0.01897534541785717, "learning_rate": 4.2989740040794864e-07, "loss": 0.0, "num_input_tokens_seen": 99838608, "step": 148135 }, { "epoch": 3.619084845967801, "grad_norm": 0.0227005984634161, "learning_rate": 4.29827340664611e-07, "loss": 0.0, "num_input_tokens_seen": 99841872, "step": 148140 }, { "epoch": 3.6192069967996483, "grad_norm": 0.025067569687962532, "learning_rate": 4.2975728506772193e-07, "loss": 0.0001, "num_input_tokens_seen": 99845008, "step": 148145 }, { "epoch": 3.6193291476314955, "grad_norm": 0.0004105870029889047, "learning_rate": 4.296872336177916e-07, "loss": 0.0002, "num_input_tokens_seen": 99848464, "step": 148150 }, { "epoch": 3.6194512984633427, "grad_norm": 0.000708704290445894, "learning_rate": 4.2961718631532905e-07, "loss": 0.0, "num_input_tokens_seen": 99851984, "step": 148155 }, { "epoch": 3.6195734492951894, "grad_norm": 0.012791010551154613, "learning_rate": 4.295471431608435e-07, "loss": 0.0, "num_input_tokens_seen": 99855312, "step": 148160 }, { "epoch": 3.619695600127037, "grad_norm": 0.6077525019645691, "learning_rate": 4.2947710415484486e-07, "loss": 0.0001, "num_input_tokens_seen": 99858896, "step": 148165 }, { "epoch": 3.619817750958884, "grad_norm": 0.007468364201486111, "learning_rate": 4.2940706929784176e-07, "loss": 0.0, "num_input_tokens_seen": 99862160, "step": 148170 }, { "epoch": 3.6199399017907314, "grad_norm": 0.00035050525912083685, "learning_rate": 4.2933703859034444e-07, "loss": 0.016, "num_input_tokens_seen": 99865680, "step": 148175 }, { "epoch": 3.620062052622578, "grad_norm": 0.004140862729400396, "learning_rate": 4.292670120328612e-07, "loss": 0.0, "num_input_tokens_seen": 99868816, "step": 148180 }, { "epoch": 3.6201842034544254, "grad_norm": 7.159739470807835e-05, "learning_rate": 4.291969896259017e-07, "loss": 0.0, "num_input_tokens_seen": 99872144, "step": 148185 }, { "epoch": 3.6203063542862726, "grad_norm": 0.00017383306112606078, "learning_rate": 4.291269713699757e-07, "loss": 0.0001, "num_input_tokens_seen": 99875664, "step": 148190 }, { "epoch": 3.6204285051181198, "grad_norm": 0.0002737454778980464, "learning_rate": 4.2905695726559145e-07, "loss": 0.0, "num_input_tokens_seen": 99879312, "step": 148195 }, { "epoch": 3.620550655949967, "grad_norm": 0.0008277028682641685, "learning_rate": 4.289869473132589e-07, "loss": 0.0003, "num_input_tokens_seen": 99882640, "step": 148200 }, { "epoch": 3.620672806781814, "grad_norm": 29.86298370361328, "learning_rate": 4.2891694151348654e-07, "loss": 0.0004, "num_input_tokens_seen": 99885968, "step": 148205 }, { "epoch": 3.6207949576136613, "grad_norm": 0.0006161229102872312, "learning_rate": 4.288469398667842e-07, "loss": 0.0, "num_input_tokens_seen": 99889104, "step": 148210 }, { "epoch": 3.6209171084455085, "grad_norm": 0.00639741774648428, "learning_rate": 4.2877694237366014e-07, "loss": 0.0, "num_input_tokens_seen": 99892304, "step": 148215 }, { "epoch": 3.6210392592773557, "grad_norm": 0.42625725269317627, "learning_rate": 4.2870694903462377e-07, "loss": 0.0002, "num_input_tokens_seen": 99897680, "step": 148220 }, { "epoch": 3.621161410109203, "grad_norm": 0.004575707484036684, "learning_rate": 4.2863695985018453e-07, "loss": 0.0004, "num_input_tokens_seen": 99901328, "step": 148225 }, { "epoch": 3.62128356094105, "grad_norm": 0.006831346545368433, "learning_rate": 4.285669748208507e-07, "loss": 0.0, "num_input_tokens_seen": 99904336, "step": 148230 }, { "epoch": 3.6214057117728973, "grad_norm": 0.00025160593213513494, "learning_rate": 4.284969939471318e-07, "loss": 0.0, "num_input_tokens_seen": 99907472, "step": 148235 }, { "epoch": 3.6215278626047445, "grad_norm": 0.4540623128414154, "learning_rate": 4.284270172295361e-07, "loss": 0.0003, "num_input_tokens_seen": 99910800, "step": 148240 }, { "epoch": 3.6216500134365917, "grad_norm": 0.019050270318984985, "learning_rate": 4.2835704466857325e-07, "loss": 0.0, "num_input_tokens_seen": 99914512, "step": 148245 }, { "epoch": 3.621772164268439, "grad_norm": 0.0001939299254445359, "learning_rate": 4.2828707626475133e-07, "loss": 0.0, "num_input_tokens_seen": 99917712, "step": 148250 }, { "epoch": 3.6218943151002856, "grad_norm": 0.00030984191107563674, "learning_rate": 4.2821711201858e-07, "loss": 0.0, "num_input_tokens_seen": 99920848, "step": 148255 }, { "epoch": 3.6220164659321332, "grad_norm": 0.00048598801367916167, "learning_rate": 4.281471519305676e-07, "loss": 0.0, "num_input_tokens_seen": 99924304, "step": 148260 }, { "epoch": 3.62213861676398, "grad_norm": 0.0002089657646138221, "learning_rate": 4.280771960012225e-07, "loss": 0.0, "num_input_tokens_seen": 99927824, "step": 148265 }, { "epoch": 3.6222607675958276, "grad_norm": 0.00018494176038075238, "learning_rate": 4.2800724423105427e-07, "loss": 0.0, "num_input_tokens_seen": 99931344, "step": 148270 }, { "epoch": 3.6223829184276743, "grad_norm": 0.00010118465434061363, "learning_rate": 4.279372966205709e-07, "loss": 0.0433, "num_input_tokens_seen": 99935056, "step": 148275 }, { "epoch": 3.6225050692595215, "grad_norm": 0.011214503087103367, "learning_rate": 4.278673531702812e-07, "loss": 0.0, "num_input_tokens_seen": 99938512, "step": 148280 }, { "epoch": 3.6226272200913687, "grad_norm": 0.00042954322998411953, "learning_rate": 4.2779741388069445e-07, "loss": 0.0, "num_input_tokens_seen": 99941904, "step": 148285 }, { "epoch": 3.622749370923216, "grad_norm": 0.0013367931824177504, "learning_rate": 4.277274787523185e-07, "loss": 0.0, "num_input_tokens_seen": 99945232, "step": 148290 }, { "epoch": 3.622871521755063, "grad_norm": 0.05535469576716423, "learning_rate": 4.2765754778566255e-07, "loss": 0.0001, "num_input_tokens_seen": 99948688, "step": 148295 }, { "epoch": 3.6229936725869103, "grad_norm": 0.0002708320098463446, "learning_rate": 4.275876209812346e-07, "loss": 0.0, "num_input_tokens_seen": 99952144, "step": 148300 }, { "epoch": 3.6231158234187575, "grad_norm": 0.00510826800018549, "learning_rate": 4.2751769833954334e-07, "loss": 0.0001, "num_input_tokens_seen": 99955536, "step": 148305 }, { "epoch": 3.6232379742506047, "grad_norm": 8.865822019288316e-05, "learning_rate": 4.274477798610977e-07, "loss": 0.0, "num_input_tokens_seen": 99958928, "step": 148310 }, { "epoch": 3.623360125082452, "grad_norm": 0.0006897286511957645, "learning_rate": 4.2737786554640543e-07, "loss": 0.0, "num_input_tokens_seen": 99962256, "step": 148315 }, { "epoch": 3.623482275914299, "grad_norm": 0.0003042963217012584, "learning_rate": 4.2730795539597575e-07, "loss": 0.0, "num_input_tokens_seen": 99966608, "step": 148320 }, { "epoch": 3.6236044267461462, "grad_norm": 34.62126159667969, "learning_rate": 4.272380494103163e-07, "loss": 0.0688, "num_input_tokens_seen": 99969744, "step": 148325 }, { "epoch": 3.6237265775779934, "grad_norm": 0.006418231409043074, "learning_rate": 4.2716814758993614e-07, "loss": 0.0, "num_input_tokens_seen": 99973072, "step": 148330 }, { "epoch": 3.6238487284098406, "grad_norm": 0.0013086996041238308, "learning_rate": 4.270982499353429e-07, "loss": 0.0, "num_input_tokens_seen": 99976528, "step": 148335 }, { "epoch": 3.6239708792416874, "grad_norm": 0.006796195171773434, "learning_rate": 4.2702835644704535e-07, "loss": 0.0, "num_input_tokens_seen": 99979920, "step": 148340 }, { "epoch": 3.624093030073535, "grad_norm": 0.002164926379919052, "learning_rate": 4.2695846712555204e-07, "loss": 0.0001, "num_input_tokens_seen": 99983376, "step": 148345 }, { "epoch": 3.6242151809053818, "grad_norm": 0.0018303889082744718, "learning_rate": 4.2688858197137047e-07, "loss": 0.0, "num_input_tokens_seen": 99987216, "step": 148350 }, { "epoch": 3.6243373317372294, "grad_norm": 85.07524108886719, "learning_rate": 4.268187009850097e-07, "loss": 0.0478, "num_input_tokens_seen": 99990352, "step": 148355 }, { "epoch": 3.624459482569076, "grad_norm": 0.00017215909610968083, "learning_rate": 4.2674882416697746e-07, "loss": 0.0891, "num_input_tokens_seen": 99993680, "step": 148360 }, { "epoch": 3.6245816334009233, "grad_norm": 0.1743038147687912, "learning_rate": 4.2667895151778167e-07, "loss": 0.0, "num_input_tokens_seen": 99996560, "step": 148365 }, { "epoch": 3.6247037842327705, "grad_norm": 0.0016746616456657648, "learning_rate": 4.266090830379311e-07, "loss": 0.0, "num_input_tokens_seen": 99999696, "step": 148370 }, { "epoch": 3.6248259350646177, "grad_norm": 0.00013440451584756374, "learning_rate": 4.265392187279331e-07, "loss": 0.0, "num_input_tokens_seen": 100002832, "step": 148375 }, { "epoch": 3.624948085896465, "grad_norm": 0.004280713852494955, "learning_rate": 4.2646935858829644e-07, "loss": 0.0001, "num_input_tokens_seen": 100006608, "step": 148380 }, { "epoch": 3.625070236728312, "grad_norm": 0.00028567464323714375, "learning_rate": 4.2639950261952863e-07, "loss": 0.0001, "num_input_tokens_seen": 100010000, "step": 148385 }, { "epoch": 3.6251923875601593, "grad_norm": 0.00025981743237935007, "learning_rate": 4.263296508221381e-07, "loss": 0.0, "num_input_tokens_seen": 100013328, "step": 148390 }, { "epoch": 3.6253145383920065, "grad_norm": 9.093299740925431e-05, "learning_rate": 4.262598031966325e-07, "loss": 0.0, "num_input_tokens_seen": 100016528, "step": 148395 }, { "epoch": 3.6254366892238536, "grad_norm": 0.0031217436771839857, "learning_rate": 4.261899597435198e-07, "loss": 0.0, "num_input_tokens_seen": 100019728, "step": 148400 }, { "epoch": 3.625558840055701, "grad_norm": 0.0002645206986926496, "learning_rate": 4.2612012046330846e-07, "loss": 0.0001, "num_input_tokens_seen": 100022864, "step": 148405 }, { "epoch": 3.625680990887548, "grad_norm": 7.561508391518146e-05, "learning_rate": 4.2605028535650553e-07, "loss": 0.0313, "num_input_tokens_seen": 100026256, "step": 148410 }, { "epoch": 3.625803141719395, "grad_norm": 0.0027498200070112944, "learning_rate": 4.259804544236197e-07, "loss": 0.0, "num_input_tokens_seen": 100029648, "step": 148415 }, { "epoch": 3.6259252925512424, "grad_norm": 0.0022068258840590715, "learning_rate": 4.2591062766515806e-07, "loss": 0.0667, "num_input_tokens_seen": 100032720, "step": 148420 }, { "epoch": 3.6260474433830896, "grad_norm": 0.02030225098133087, "learning_rate": 4.258408050816291e-07, "loss": 0.0, "num_input_tokens_seen": 100035856, "step": 148425 }, { "epoch": 3.626169594214937, "grad_norm": 0.0027770877350121737, "learning_rate": 4.2577098667353996e-07, "loss": 0.0, "num_input_tokens_seen": 100039248, "step": 148430 }, { "epoch": 3.6262917450467835, "grad_norm": 0.7698575258255005, "learning_rate": 4.257011724413985e-07, "loss": 0.0001, "num_input_tokens_seen": 100042896, "step": 148435 }, { "epoch": 3.626413895878631, "grad_norm": 0.007915056310594082, "learning_rate": 4.2563136238571307e-07, "loss": 0.0002, "num_input_tokens_seen": 100045904, "step": 148440 }, { "epoch": 3.626536046710478, "grad_norm": 0.0012001445284113288, "learning_rate": 4.2556155650699045e-07, "loss": 0.0, "num_input_tokens_seen": 100049296, "step": 148445 }, { "epoch": 3.626658197542325, "grad_norm": 0.0002976596006192267, "learning_rate": 4.2549175480573897e-07, "loss": 0.0, "num_input_tokens_seen": 100052624, "step": 148450 }, { "epoch": 3.6267803483741723, "grad_norm": 0.1230919361114502, "learning_rate": 4.2542195728246565e-07, "loss": 0.0001, "num_input_tokens_seen": 100056016, "step": 148455 }, { "epoch": 3.6269024992060195, "grad_norm": 0.00040737129165790975, "learning_rate": 4.253521639376788e-07, "loss": 0.0, "num_input_tokens_seen": 100058960, "step": 148460 }, { "epoch": 3.6270246500378667, "grad_norm": 0.00524023687466979, "learning_rate": 4.2528237477188566e-07, "loss": 0.0, "num_input_tokens_seen": 100062672, "step": 148465 }, { "epoch": 3.627146800869714, "grad_norm": 0.004324682056903839, "learning_rate": 4.2521258978559315e-07, "loss": 0.0, "num_input_tokens_seen": 100066064, "step": 148470 }, { "epoch": 3.627268951701561, "grad_norm": 0.10980729758739471, "learning_rate": 4.2514280897930977e-07, "loss": 0.0, "num_input_tokens_seen": 100069648, "step": 148475 }, { "epoch": 3.6273911025334082, "grad_norm": 0.31824973225593567, "learning_rate": 4.250730323535421e-07, "loss": 0.0, "num_input_tokens_seen": 100072784, "step": 148480 }, { "epoch": 3.6275132533652554, "grad_norm": 0.0008663343614898622, "learning_rate": 4.2500325990879835e-07, "loss": 0.0, "num_input_tokens_seen": 100075920, "step": 148485 }, { "epoch": 3.6276354041971026, "grad_norm": 36.30690002441406, "learning_rate": 4.249334916455851e-07, "loss": 0.0655, "num_input_tokens_seen": 100079632, "step": 148490 }, { "epoch": 3.62775755502895, "grad_norm": 0.0030439416877925396, "learning_rate": 4.2486372756441027e-07, "loss": 0.0257, "num_input_tokens_seen": 100083024, "step": 148495 }, { "epoch": 3.627879705860797, "grad_norm": 0.004145446699112654, "learning_rate": 4.247939676657815e-07, "loss": 0.0001, "num_input_tokens_seen": 100086160, "step": 148500 }, { "epoch": 3.628001856692644, "grad_norm": 0.1289975494146347, "learning_rate": 4.2472421195020525e-07, "loss": 0.0002, "num_input_tokens_seen": 100089552, "step": 148505 }, { "epoch": 3.6281240075244914, "grad_norm": 0.006557599641382694, "learning_rate": 4.2465446041818966e-07, "loss": 0.0, "num_input_tokens_seen": 100092624, "step": 148510 }, { "epoch": 3.6282461583563386, "grad_norm": 0.0008815338369458914, "learning_rate": 4.245847130702412e-07, "loss": 0.0002, "num_input_tokens_seen": 100096400, "step": 148515 }, { "epoch": 3.6283683091881853, "grad_norm": 0.02695058286190033, "learning_rate": 4.2451496990686784e-07, "loss": 0.0, "num_input_tokens_seen": 100100240, "step": 148520 }, { "epoch": 3.628490460020033, "grad_norm": 0.298431932926178, "learning_rate": 4.244452309285761e-07, "loss": 0.0002, "num_input_tokens_seen": 100103440, "step": 148525 }, { "epoch": 3.6286126108518797, "grad_norm": 0.005468748044222593, "learning_rate": 4.243754961358733e-07, "loss": 0.075, "num_input_tokens_seen": 100107408, "step": 148530 }, { "epoch": 3.6287347616837273, "grad_norm": 0.0004392905975691974, "learning_rate": 4.243057655292672e-07, "loss": 0.0, "num_input_tokens_seen": 100110416, "step": 148535 }, { "epoch": 3.628856912515574, "grad_norm": 0.0009532935218885541, "learning_rate": 4.242360391092641e-07, "loss": 0.0, "num_input_tokens_seen": 100113808, "step": 148540 }, { "epoch": 3.6289790633474213, "grad_norm": 0.0005135192768648267, "learning_rate": 4.2416631687637173e-07, "loss": 0.0738, "num_input_tokens_seen": 100116880, "step": 148545 }, { "epoch": 3.6291012141792685, "grad_norm": 0.0001638806570554152, "learning_rate": 4.240965988310963e-07, "loss": 0.0, "num_input_tokens_seen": 100120272, "step": 148550 }, { "epoch": 3.6292233650111156, "grad_norm": 0.0005596952978521585, "learning_rate": 4.240268849739458e-07, "loss": 0.0739, "num_input_tokens_seen": 100123472, "step": 148555 }, { "epoch": 3.629345515842963, "grad_norm": 0.06809617578983307, "learning_rate": 4.239571753054263e-07, "loss": 0.0881, "num_input_tokens_seen": 100127120, "step": 148560 }, { "epoch": 3.62946766667481, "grad_norm": 0.002564807888120413, "learning_rate": 4.2388746982604553e-07, "loss": 0.0, "num_input_tokens_seen": 100130384, "step": 148565 }, { "epoch": 3.629589817506657, "grad_norm": 0.0007430867408402264, "learning_rate": 4.2381776853630955e-07, "loss": 0.0, "num_input_tokens_seen": 100133776, "step": 148570 }, { "epoch": 3.6297119683385044, "grad_norm": 315.9742126464844, "learning_rate": 4.237480714367262e-07, "loss": 0.008, "num_input_tokens_seen": 100137104, "step": 148575 }, { "epoch": 3.6298341191703516, "grad_norm": 0.0024084101896733046, "learning_rate": 4.236783785278019e-07, "loss": 0.0, "num_input_tokens_seen": 100140496, "step": 148580 }, { "epoch": 3.629956270002199, "grad_norm": 0.007453474681824446, "learning_rate": 4.2360868981004305e-07, "loss": 0.0001, "num_input_tokens_seen": 100144208, "step": 148585 }, { "epoch": 3.630078420834046, "grad_norm": 0.0015486471820622683, "learning_rate": 4.235390052839568e-07, "loss": 0.0, "num_input_tokens_seen": 100147792, "step": 148590 }, { "epoch": 3.630200571665893, "grad_norm": 0.005932774394750595, "learning_rate": 4.2346932495005037e-07, "loss": 0.0004, "num_input_tokens_seen": 100150928, "step": 148595 }, { "epoch": 3.6303227224977404, "grad_norm": 0.00043449728400446475, "learning_rate": 4.2339964880882974e-07, "loss": 0.0014, "num_input_tokens_seen": 100154192, "step": 148600 }, { "epoch": 3.6304448733295875, "grad_norm": 0.0025287808384746313, "learning_rate": 4.233299768608022e-07, "loss": 0.0, "num_input_tokens_seen": 100157712, "step": 148605 }, { "epoch": 3.6305670241614347, "grad_norm": 0.16306915879249573, "learning_rate": 4.232603091064739e-07, "loss": 0.0001, "num_input_tokens_seen": 100161232, "step": 148610 }, { "epoch": 3.6306891749932815, "grad_norm": 0.0046179573982954025, "learning_rate": 4.2319064554635174e-07, "loss": 0.0, "num_input_tokens_seen": 100165456, "step": 148615 }, { "epoch": 3.630811325825129, "grad_norm": 0.0018630430568009615, "learning_rate": 4.231209861809427e-07, "loss": 0.0, "num_input_tokens_seen": 100168464, "step": 148620 }, { "epoch": 3.630933476656976, "grad_norm": 0.0014654771657660604, "learning_rate": 4.2305133101075264e-07, "loss": 0.0, "num_input_tokens_seen": 100172880, "step": 148625 }, { "epoch": 3.631055627488823, "grad_norm": 0.004783432465046644, "learning_rate": 4.2298168003628885e-07, "loss": 0.0, "num_input_tokens_seen": 100176144, "step": 148630 }, { "epoch": 3.6311777783206702, "grad_norm": 0.0012913690879940987, "learning_rate": 4.2291203325805715e-07, "loss": 0.0001, "num_input_tokens_seen": 100179216, "step": 148635 }, { "epoch": 3.6312999291525174, "grad_norm": 0.001476217177696526, "learning_rate": 4.228423906765647e-07, "loss": 0.0, "num_input_tokens_seen": 100182608, "step": 148640 }, { "epoch": 3.6314220799843646, "grad_norm": 0.000511297257617116, "learning_rate": 4.2277275229231726e-07, "loss": 0.0276, "num_input_tokens_seen": 100186256, "step": 148645 }, { "epoch": 3.631544230816212, "grad_norm": 0.16869373619556427, "learning_rate": 4.227031181058216e-07, "loss": 0.0007, "num_input_tokens_seen": 100189328, "step": 148650 }, { "epoch": 3.631666381648059, "grad_norm": 0.0005099636036902666, "learning_rate": 4.226334881175846e-07, "loss": 0.0, "num_input_tokens_seen": 100192592, "step": 148655 }, { "epoch": 3.631788532479906, "grad_norm": 0.9059097170829773, "learning_rate": 4.225638623281117e-07, "loss": 0.0007, "num_input_tokens_seen": 100195856, "step": 148660 }, { "epoch": 3.6319106833117534, "grad_norm": 0.003063419833779335, "learning_rate": 4.2249424073791006e-07, "loss": 0.0017, "num_input_tokens_seen": 100199120, "step": 148665 }, { "epoch": 3.6320328341436006, "grad_norm": 0.10458869487047195, "learning_rate": 4.224246233474857e-07, "loss": 0.0001, "num_input_tokens_seen": 100202448, "step": 148670 }, { "epoch": 3.6321549849754478, "grad_norm": 0.018070032820105553, "learning_rate": 4.2235501015734445e-07, "loss": 0.0, "num_input_tokens_seen": 100205904, "step": 148675 }, { "epoch": 3.632277135807295, "grad_norm": 0.003459717845544219, "learning_rate": 4.2228540116799326e-07, "loss": 0.0002, "num_input_tokens_seen": 100209168, "step": 148680 }, { "epoch": 3.632399286639142, "grad_norm": 0.0011072818888351321, "learning_rate": 4.2221579637993766e-07, "loss": 0.0, "num_input_tokens_seen": 100212624, "step": 148685 }, { "epoch": 3.6325214374709893, "grad_norm": 0.002741535659879446, "learning_rate": 4.221461957936846e-07, "loss": 0.0266, "num_input_tokens_seen": 100215760, "step": 148690 }, { "epoch": 3.6326435883028365, "grad_norm": 0.0002010358584811911, "learning_rate": 4.220765994097395e-07, "loss": 0.0, "num_input_tokens_seen": 100219024, "step": 148695 }, { "epoch": 3.6327657391346833, "grad_norm": 0.00028791220393031836, "learning_rate": 4.2200700722860906e-07, "loss": 0.0, "num_input_tokens_seen": 100222224, "step": 148700 }, { "epoch": 3.632887889966531, "grad_norm": 0.0003893005778081715, "learning_rate": 4.219374192507988e-07, "loss": 0.0001, "num_input_tokens_seen": 100225232, "step": 148705 }, { "epoch": 3.6330100407983776, "grad_norm": 0.0009554345160722733, "learning_rate": 4.2186783547681516e-07, "loss": 0.0619, "num_input_tokens_seen": 100228688, "step": 148710 }, { "epoch": 3.6331321916302253, "grad_norm": 0.0010442471830174327, "learning_rate": 4.2179825590716445e-07, "loss": 0.0, "num_input_tokens_seen": 100231888, "step": 148715 }, { "epoch": 3.633254342462072, "grad_norm": 0.00042447238229215145, "learning_rate": 4.21728680542352e-07, "loss": 0.0, "num_input_tokens_seen": 100235088, "step": 148720 }, { "epoch": 3.633376493293919, "grad_norm": 0.0007365150959230959, "learning_rate": 4.216591093828844e-07, "loss": 0.0, "num_input_tokens_seen": 100238352, "step": 148725 }, { "epoch": 3.6334986441257664, "grad_norm": 0.002207621233537793, "learning_rate": 4.21589542429267e-07, "loss": 0.0, "num_input_tokens_seen": 100242832, "step": 148730 }, { "epoch": 3.6336207949576136, "grad_norm": 0.004213795997202396, "learning_rate": 4.215199796820064e-07, "loss": 0.0, "num_input_tokens_seen": 100245968, "step": 148735 }, { "epoch": 3.6337429457894608, "grad_norm": 0.13924796879291534, "learning_rate": 4.2145042114160776e-07, "loss": 0.0, "num_input_tokens_seen": 100249232, "step": 148740 }, { "epoch": 3.633865096621308, "grad_norm": 0.06136344373226166, "learning_rate": 4.213808668085772e-07, "loss": 0.0, "num_input_tokens_seen": 100252688, "step": 148745 }, { "epoch": 3.633987247453155, "grad_norm": 0.0009907983476296067, "learning_rate": 4.2131131668342103e-07, "loss": 0.0, "num_input_tokens_seen": 100255952, "step": 148750 }, { "epoch": 3.6341093982850023, "grad_norm": 0.0021776894573122263, "learning_rate": 4.212417707666442e-07, "loss": 0.0, "num_input_tokens_seen": 100259088, "step": 148755 }, { "epoch": 3.6342315491168495, "grad_norm": 0.0004358404839877039, "learning_rate": 4.2117222905875327e-07, "loss": 0.0, "num_input_tokens_seen": 100262416, "step": 148760 }, { "epoch": 3.6343536999486967, "grad_norm": 0.03200365602970123, "learning_rate": 4.2110269156025327e-07, "loss": 0.0, "num_input_tokens_seen": 100265936, "step": 148765 }, { "epoch": 3.634475850780544, "grad_norm": 0.002977890195325017, "learning_rate": 4.2103315827165043e-07, "loss": 0.0, "num_input_tokens_seen": 100269776, "step": 148770 }, { "epoch": 3.634598001612391, "grad_norm": 0.02720186486840248, "learning_rate": 4.209636291934503e-07, "loss": 0.0, "num_input_tokens_seen": 100273616, "step": 148775 }, { "epoch": 3.6347201524442383, "grad_norm": 0.0012489106738939881, "learning_rate": 4.20894104326158e-07, "loss": 0.0, "num_input_tokens_seen": 100276880, "step": 148780 }, { "epoch": 3.634842303276085, "grad_norm": 0.0008800218929536641, "learning_rate": 4.2082458367027986e-07, "loss": 0.0, "num_input_tokens_seen": 100280208, "step": 148785 }, { "epoch": 3.6349644541079327, "grad_norm": 0.0018722382374107838, "learning_rate": 4.207550672263208e-07, "loss": 0.0001, "num_input_tokens_seen": 100283792, "step": 148790 }, { "epoch": 3.6350866049397794, "grad_norm": 0.010279770940542221, "learning_rate": 4.206855549947871e-07, "loss": 0.0, "num_input_tokens_seen": 100287184, "step": 148795 }, { "epoch": 3.635208755771627, "grad_norm": 0.006634359247982502, "learning_rate": 4.2061604697618347e-07, "loss": 0.0, "num_input_tokens_seen": 100290448, "step": 148800 }, { "epoch": 3.635330906603474, "grad_norm": 25.89348030090332, "learning_rate": 4.205465431710158e-07, "loss": 0.0433, "num_input_tokens_seen": 100293520, "step": 148805 }, { "epoch": 3.635453057435321, "grad_norm": 0.0019238588865846395, "learning_rate": 4.2047704357978975e-07, "loss": 0.0, "num_input_tokens_seen": 100297168, "step": 148810 }, { "epoch": 3.635575208267168, "grad_norm": 0.010663384571671486, "learning_rate": 4.204075482030103e-07, "loss": 0.0, "num_input_tokens_seen": 100301072, "step": 148815 }, { "epoch": 3.6356973590990154, "grad_norm": 0.0011268631787970662, "learning_rate": 4.203380570411833e-07, "loss": 0.0004, "num_input_tokens_seen": 100304784, "step": 148820 }, { "epoch": 3.6358195099308626, "grad_norm": 0.26588326692581177, "learning_rate": 4.2026857009481363e-07, "loss": 0.0922, "num_input_tokens_seen": 100308112, "step": 148825 }, { "epoch": 3.6359416607627097, "grad_norm": 0.0016133766621351242, "learning_rate": 4.201990873644071e-07, "loss": 0.0, "num_input_tokens_seen": 100311248, "step": 148830 }, { "epoch": 3.636063811594557, "grad_norm": 25.05052947998047, "learning_rate": 4.2012960885046846e-07, "loss": 0.0607, "num_input_tokens_seen": 100314512, "step": 148835 }, { "epoch": 3.636185962426404, "grad_norm": 31.458444595336914, "learning_rate": 4.200601345535032e-07, "loss": 0.0953, "num_input_tokens_seen": 100317712, "step": 148840 }, { "epoch": 3.6363081132582513, "grad_norm": 0.004293879494071007, "learning_rate": 4.1999066447401707e-07, "loss": 0.0, "num_input_tokens_seen": 100321040, "step": 148845 }, { "epoch": 3.6364302640900985, "grad_norm": 0.17925694584846497, "learning_rate": 4.1992119861251443e-07, "loss": 0.0567, "num_input_tokens_seen": 100324112, "step": 148850 }, { "epoch": 3.6365524149219457, "grad_norm": 0.024837162345647812, "learning_rate": 4.1985173696950125e-07, "loss": 0.0, "num_input_tokens_seen": 100327504, "step": 148855 }, { "epoch": 3.636674565753793, "grad_norm": 19.125436782836914, "learning_rate": 4.1978227954548183e-07, "loss": 0.0235, "num_input_tokens_seen": 100331152, "step": 148860 }, { "epoch": 3.63679671658564, "grad_norm": 0.007109674625098705, "learning_rate": 4.197128263409622e-07, "loss": 0.0591, "num_input_tokens_seen": 100334288, "step": 148865 }, { "epoch": 3.6369188674174873, "grad_norm": 0.002395701128989458, "learning_rate": 4.196433773564465e-07, "loss": 0.0001, "num_input_tokens_seen": 100337744, "step": 148870 }, { "epoch": 3.6370410182493345, "grad_norm": 0.000553856254555285, "learning_rate": 4.195739325924407e-07, "loss": 0.0, "num_input_tokens_seen": 100341008, "step": 148875 }, { "epoch": 3.637163169081181, "grad_norm": 0.0023220237344503403, "learning_rate": 4.1950449204944905e-07, "loss": 0.0, "num_input_tokens_seen": 100344528, "step": 148880 }, { "epoch": 3.637285319913029, "grad_norm": 0.0013494978193193674, "learning_rate": 4.1943505572797713e-07, "loss": 0.0, "num_input_tokens_seen": 100347920, "step": 148885 }, { "epoch": 3.6374074707448756, "grad_norm": 0.0008220344316214323, "learning_rate": 4.1936562362852966e-07, "loss": 0.0, "num_input_tokens_seen": 100351056, "step": 148890 }, { "epoch": 3.6375296215767228, "grad_norm": 0.006795614026486874, "learning_rate": 4.1929619575161126e-07, "loss": 0.0, "num_input_tokens_seen": 100354000, "step": 148895 }, { "epoch": 3.63765177240857, "grad_norm": 0.006160368211567402, "learning_rate": 4.192267720977271e-07, "loss": 0.0001, "num_input_tokens_seen": 100357520, "step": 148900 }, { "epoch": 3.637773923240417, "grad_norm": 0.0003869648789986968, "learning_rate": 4.1915735266738237e-07, "loss": 0.0, "num_input_tokens_seen": 100360720, "step": 148905 }, { "epoch": 3.6378960740722643, "grad_norm": 87.1167221069336, "learning_rate": 4.190879374610813e-07, "loss": 0.043, "num_input_tokens_seen": 100363984, "step": 148910 }, { "epoch": 3.6380182249041115, "grad_norm": 0.02374916709959507, "learning_rate": 4.190185264793292e-07, "loss": 0.0, "num_input_tokens_seen": 100367056, "step": 148915 }, { "epoch": 3.6381403757359587, "grad_norm": 0.0508439764380455, "learning_rate": 4.189491197226305e-07, "loss": 0.0, "num_input_tokens_seen": 100370384, "step": 148920 }, { "epoch": 3.638262526567806, "grad_norm": 0.0017856404883787036, "learning_rate": 4.188797171914903e-07, "loss": 0.0, "num_input_tokens_seen": 100373968, "step": 148925 }, { "epoch": 3.638384677399653, "grad_norm": 0.010784510523080826, "learning_rate": 4.1881031888641285e-07, "loss": 0.0, "num_input_tokens_seen": 100377616, "step": 148930 }, { "epoch": 3.6385068282315003, "grad_norm": 0.00013639629469253123, "learning_rate": 4.18740924807903e-07, "loss": 0.0, "num_input_tokens_seen": 100381136, "step": 148935 }, { "epoch": 3.6386289790633475, "grad_norm": 0.018810724839568138, "learning_rate": 4.186715349564658e-07, "loss": 0.0, "num_input_tokens_seen": 100384592, "step": 148940 }, { "epoch": 3.6387511298951947, "grad_norm": 0.6784608960151672, "learning_rate": 4.186021493326053e-07, "loss": 0.0001, "num_input_tokens_seen": 100387664, "step": 148945 }, { "epoch": 3.638873280727042, "grad_norm": 0.0006472144741564989, "learning_rate": 4.185327679368267e-07, "loss": 0.0, "num_input_tokens_seen": 100391568, "step": 148950 }, { "epoch": 3.638995431558889, "grad_norm": 0.516774594783783, "learning_rate": 4.184633907696338e-07, "loss": 0.0002, "num_input_tokens_seen": 100394704, "step": 148955 }, { "epoch": 3.6391175823907362, "grad_norm": 2.933089308498893e-05, "learning_rate": 4.183940178315315e-07, "loss": 0.0002, "num_input_tokens_seen": 100398032, "step": 148960 }, { "epoch": 3.639239733222583, "grad_norm": 0.000815013307146728, "learning_rate": 4.183246491230248e-07, "loss": 0.0, "num_input_tokens_seen": 100401296, "step": 148965 }, { "epoch": 3.6393618840544306, "grad_norm": 0.021738985553383827, "learning_rate": 4.1825528464461725e-07, "loss": 0.0002, "num_input_tokens_seen": 100404816, "step": 148970 }, { "epoch": 3.6394840348862774, "grad_norm": 0.015505907125771046, "learning_rate": 4.1818592439681413e-07, "loss": 0.0001, "num_input_tokens_seen": 100407824, "step": 148975 }, { "epoch": 3.639606185718125, "grad_norm": 0.00117449217941612, "learning_rate": 4.1811656838011946e-07, "loss": 0.0, "num_input_tokens_seen": 100410832, "step": 148980 }, { "epoch": 3.6397283365499717, "grad_norm": 8.240255556302145e-05, "learning_rate": 4.180472165950373e-07, "loss": 0.0671, "num_input_tokens_seen": 100413968, "step": 148985 }, { "epoch": 3.639850487381819, "grad_norm": 0.0006316679064184427, "learning_rate": 4.1797786904207254e-07, "loss": 0.0833, "num_input_tokens_seen": 100418256, "step": 148990 }, { "epoch": 3.639972638213666, "grad_norm": 0.0015859379200264812, "learning_rate": 4.17908525721729e-07, "loss": 0.0, "num_input_tokens_seen": 100422416, "step": 148995 }, { "epoch": 3.6400947890455133, "grad_norm": 0.052952248603105545, "learning_rate": 4.178391866345116e-07, "loss": 0.0003, "num_input_tokens_seen": 100426000, "step": 149000 }, { "epoch": 3.6402169398773605, "grad_norm": 0.21487371623516083, "learning_rate": 4.1776985178092383e-07, "loss": 0.0001, "num_input_tokens_seen": 100429200, "step": 149005 }, { "epoch": 3.6403390907092077, "grad_norm": 0.991147518157959, "learning_rate": 4.177005211614706e-07, "loss": 0.001, "num_input_tokens_seen": 100432144, "step": 149010 }, { "epoch": 3.640461241541055, "grad_norm": 0.0008949601906351745, "learning_rate": 4.176311947766555e-07, "loss": 0.0, "num_input_tokens_seen": 100435600, "step": 149015 }, { "epoch": 3.640583392372902, "grad_norm": 0.000679291202686727, "learning_rate": 4.1756187262698305e-07, "loss": 0.0, "num_input_tokens_seen": 100438800, "step": 149020 }, { "epoch": 3.6407055432047493, "grad_norm": 0.003153888275846839, "learning_rate": 4.1749255471295755e-07, "loss": 0.0, "num_input_tokens_seen": 100441680, "step": 149025 }, { "epoch": 3.6408276940365965, "grad_norm": 0.00218926346860826, "learning_rate": 4.174232410350826e-07, "loss": 0.0, "num_input_tokens_seen": 100444624, "step": 149030 }, { "epoch": 3.6409498448684436, "grad_norm": 0.0038852146826684475, "learning_rate": 4.173539315938629e-07, "loss": 0.0, "num_input_tokens_seen": 100447888, "step": 149035 }, { "epoch": 3.641071995700291, "grad_norm": 27.48082733154297, "learning_rate": 4.1728462638980164e-07, "loss": 0.0989, "num_input_tokens_seen": 100451152, "step": 149040 }, { "epoch": 3.641194146532138, "grad_norm": 0.004733655601739883, "learning_rate": 4.172153254234038e-07, "loss": 0.0, "num_input_tokens_seen": 100454608, "step": 149045 }, { "epoch": 3.641316297363985, "grad_norm": 0.1282656043767929, "learning_rate": 4.171460286951725e-07, "loss": 0.0001, "num_input_tokens_seen": 100458000, "step": 149050 }, { "epoch": 3.6414384481958324, "grad_norm": 50.21139907836914, "learning_rate": 4.17076736205612e-07, "loss": 0.1442, "num_input_tokens_seen": 100461776, "step": 149055 }, { "epoch": 3.641560599027679, "grad_norm": 0.001987928058952093, "learning_rate": 4.170074479552266e-07, "loss": 0.0476, "num_input_tokens_seen": 100464848, "step": 149060 }, { "epoch": 3.641682749859527, "grad_norm": 17.490524291992188, "learning_rate": 4.1693816394451954e-07, "loss": 0.0922, "num_input_tokens_seen": 100468560, "step": 149065 }, { "epoch": 3.6418049006913735, "grad_norm": 0.000808164884801954, "learning_rate": 4.1686888417399537e-07, "loss": 0.0001, "num_input_tokens_seen": 100471888, "step": 149070 }, { "epoch": 3.6419270515232207, "grad_norm": 0.00014078160165809095, "learning_rate": 4.167996086441571e-07, "loss": 0.0, "num_input_tokens_seen": 100475472, "step": 149075 }, { "epoch": 3.642049202355068, "grad_norm": 0.011615934781730175, "learning_rate": 4.167303373555092e-07, "loss": 0.0, "num_input_tokens_seen": 100478608, "step": 149080 }, { "epoch": 3.642171353186915, "grad_norm": 0.0010337198618799448, "learning_rate": 4.1666107030855535e-07, "loss": 0.0, "num_input_tokens_seen": 100482384, "step": 149085 }, { "epoch": 3.6422935040187623, "grad_norm": 22.374544143676758, "learning_rate": 4.165918075037986e-07, "loss": 0.058, "num_input_tokens_seen": 100485520, "step": 149090 }, { "epoch": 3.6424156548506095, "grad_norm": 0.012283634394407272, "learning_rate": 4.1652254894174357e-07, "loss": 0.0, "num_input_tokens_seen": 100488656, "step": 149095 }, { "epoch": 3.6425378056824567, "grad_norm": 0.0002704682119656354, "learning_rate": 4.1645329462289314e-07, "loss": 0.0716, "num_input_tokens_seen": 100492240, "step": 149100 }, { "epoch": 3.642659956514304, "grad_norm": 0.46763280034065247, "learning_rate": 4.163840445477517e-07, "loss": 0.0001, "num_input_tokens_seen": 100496720, "step": 149105 }, { "epoch": 3.642782107346151, "grad_norm": 0.01930251717567444, "learning_rate": 4.1631479871682195e-07, "loss": 0.0526, "num_input_tokens_seen": 100500048, "step": 149110 }, { "epoch": 3.6429042581779982, "grad_norm": 0.011264493688941002, "learning_rate": 4.1624555713060815e-07, "loss": 0.0, "num_input_tokens_seen": 100503184, "step": 149115 }, { "epoch": 3.6430264090098454, "grad_norm": 0.1804090440273285, "learning_rate": 4.1617631978961396e-07, "loss": 0.0001, "num_input_tokens_seen": 100506384, "step": 149120 }, { "epoch": 3.6431485598416926, "grad_norm": 0.007700651418417692, "learning_rate": 4.1610708669434224e-07, "loss": 0.0, "num_input_tokens_seen": 100509584, "step": 149125 }, { "epoch": 3.64327071067354, "grad_norm": 0.008980309590697289, "learning_rate": 4.1603785784529724e-07, "loss": 0.0002, "num_input_tokens_seen": 100512592, "step": 149130 }, { "epoch": 3.643392861505387, "grad_norm": 0.015934381633996964, "learning_rate": 4.1596863324298157e-07, "loss": 0.0, "num_input_tokens_seen": 100515920, "step": 149135 }, { "epoch": 3.643515012337234, "grad_norm": 0.020287154242396355, "learning_rate": 4.158994128878994e-07, "loss": 0.0557, "num_input_tokens_seen": 100519184, "step": 149140 }, { "epoch": 3.643637163169081, "grad_norm": 0.008383965119719505, "learning_rate": 4.158301967805535e-07, "loss": 0.0, "num_input_tokens_seen": 100522640, "step": 149145 }, { "epoch": 3.6437593140009286, "grad_norm": 0.4676218628883362, "learning_rate": 4.1576098492144763e-07, "loss": 0.0002, "num_input_tokens_seen": 100526672, "step": 149150 }, { "epoch": 3.6438814648327753, "grad_norm": 0.09821810573339462, "learning_rate": 4.1569177731108526e-07, "loss": 0.0739, "num_input_tokens_seen": 100530512, "step": 149155 }, { "epoch": 3.644003615664623, "grad_norm": 0.16381299495697021, "learning_rate": 4.1562257394996913e-07, "loss": 0.0001, "num_input_tokens_seen": 100533584, "step": 149160 }, { "epoch": 3.6441257664964697, "grad_norm": 0.0033917089458554983, "learning_rate": 4.155533748386032e-07, "loss": 0.0, "num_input_tokens_seen": 100536976, "step": 149165 }, { "epoch": 3.644247917328317, "grad_norm": 0.0065474179573357105, "learning_rate": 4.1548417997749e-07, "loss": 0.0, "num_input_tokens_seen": 100540304, "step": 149170 }, { "epoch": 3.644370068160164, "grad_norm": 0.001454808283597231, "learning_rate": 4.154149893671334e-07, "loss": 0.0, "num_input_tokens_seen": 100543376, "step": 149175 }, { "epoch": 3.6444922189920113, "grad_norm": 0.0016932397847995162, "learning_rate": 4.153458030080358e-07, "loss": 0.0, "num_input_tokens_seen": 100546832, "step": 149180 }, { "epoch": 3.6446143698238584, "grad_norm": 0.03147650137543678, "learning_rate": 4.1527662090070113e-07, "loss": 0.0001, "num_input_tokens_seen": 100550352, "step": 149185 }, { "epoch": 3.6447365206557056, "grad_norm": 0.021766608580946922, "learning_rate": 4.1520744304563185e-07, "loss": 0.0, "num_input_tokens_seen": 100553488, "step": 149190 }, { "epoch": 3.644858671487553, "grad_norm": 0.0049246856942772865, "learning_rate": 4.151382694433316e-07, "loss": 0.0002, "num_input_tokens_seen": 100556688, "step": 149195 }, { "epoch": 3.6449808223194, "grad_norm": 0.30315637588500977, "learning_rate": 4.150691000943033e-07, "loss": 0.0001, "num_input_tokens_seen": 100560208, "step": 149200 }, { "epoch": 3.645102973151247, "grad_norm": 0.01731988787651062, "learning_rate": 4.149999349990494e-07, "loss": 0.0347, "num_input_tokens_seen": 100563344, "step": 149205 }, { "epoch": 3.6452251239830944, "grad_norm": 0.11529941856861115, "learning_rate": 4.149307741580733e-07, "loss": 0.0001, "num_input_tokens_seen": 100566224, "step": 149210 }, { "epoch": 3.6453472748149416, "grad_norm": 0.08941885083913803, "learning_rate": 4.148616175718783e-07, "loss": 0.0001, "num_input_tokens_seen": 100569488, "step": 149215 }, { "epoch": 3.6454694256467888, "grad_norm": 24.89301109313965, "learning_rate": 4.1479246524096676e-07, "loss": 0.0093, "num_input_tokens_seen": 100572880, "step": 149220 }, { "epoch": 3.645591576478636, "grad_norm": 0.0017372871516272426, "learning_rate": 4.147233171658421e-07, "loss": 0.0, "num_input_tokens_seen": 100575824, "step": 149225 }, { "epoch": 3.6457137273104827, "grad_norm": 0.0009719376685097814, "learning_rate": 4.146541733470066e-07, "loss": 0.0451, "num_input_tokens_seen": 100579600, "step": 149230 }, { "epoch": 3.6458358781423303, "grad_norm": 0.08995156735181808, "learning_rate": 4.145850337849637e-07, "loss": 0.0, "num_input_tokens_seen": 100583376, "step": 149235 }, { "epoch": 3.645958028974177, "grad_norm": 0.005203651264309883, "learning_rate": 4.145158984802155e-07, "loss": 0.0614, "num_input_tokens_seen": 100586640, "step": 149240 }, { "epoch": 3.6460801798060247, "grad_norm": 0.012616139836609364, "learning_rate": 4.144467674332651e-07, "loss": 0.0, "num_input_tokens_seen": 100590160, "step": 149245 }, { "epoch": 3.6462023306378715, "grad_norm": 0.008399611338973045, "learning_rate": 4.143776406446158e-07, "loss": 0.0, "num_input_tokens_seen": 100593488, "step": 149250 }, { "epoch": 3.6463244814697187, "grad_norm": 0.0006929049268364906, "learning_rate": 4.143085181147694e-07, "loss": 0.0001, "num_input_tokens_seen": 100596432, "step": 149255 }, { "epoch": 3.646446632301566, "grad_norm": 0.0006431890651583672, "learning_rate": 4.142393998442294e-07, "loss": 0.0002, "num_input_tokens_seen": 100599696, "step": 149260 }, { "epoch": 3.646568783133413, "grad_norm": 0.0022328917402774096, "learning_rate": 4.1417028583349766e-07, "loss": 0.0, "num_input_tokens_seen": 100603088, "step": 149265 }, { "epoch": 3.6466909339652602, "grad_norm": 0.0004925087559968233, "learning_rate": 4.1410117608307716e-07, "loss": 0.0, "num_input_tokens_seen": 100606352, "step": 149270 }, { "epoch": 3.6468130847971074, "grad_norm": 0.0036744000390172005, "learning_rate": 4.140320705934708e-07, "loss": 0.0, "num_input_tokens_seen": 100609808, "step": 149275 }, { "epoch": 3.6469352356289546, "grad_norm": 0.013988408260047436, "learning_rate": 4.1396296936518047e-07, "loss": 0.0, "num_input_tokens_seen": 100613072, "step": 149280 }, { "epoch": 3.647057386460802, "grad_norm": 0.022471310570836067, "learning_rate": 4.1389387239870945e-07, "loss": 0.0648, "num_input_tokens_seen": 100616336, "step": 149285 }, { "epoch": 3.647179537292649, "grad_norm": 0.026236383244395256, "learning_rate": 4.138247796945599e-07, "loss": 0.0003, "num_input_tokens_seen": 100619408, "step": 149290 }, { "epoch": 3.647301688124496, "grad_norm": 0.01007220707833767, "learning_rate": 4.1375569125323374e-07, "loss": 0.0005, "num_input_tokens_seen": 100622928, "step": 149295 }, { "epoch": 3.6474238389563434, "grad_norm": 0.020251065492630005, "learning_rate": 4.136866070752343e-07, "loss": 0.0, "num_input_tokens_seen": 100626128, "step": 149300 }, { "epoch": 3.6475459897881906, "grad_norm": 0.007953628897666931, "learning_rate": 4.1361752716106315e-07, "loss": 0.0801, "num_input_tokens_seen": 100629584, "step": 149305 }, { "epoch": 3.6476681406200377, "grad_norm": 0.0007968613062985241, "learning_rate": 4.1354845151122344e-07, "loss": 0.0563, "num_input_tokens_seen": 100632848, "step": 149310 }, { "epoch": 3.647790291451885, "grad_norm": 0.023661449551582336, "learning_rate": 4.1347938012621675e-07, "loss": 0.0, "num_input_tokens_seen": 100636432, "step": 149315 }, { "epoch": 3.647912442283732, "grad_norm": 0.0034785429015755653, "learning_rate": 4.1341031300654615e-07, "loss": 0.0, "num_input_tokens_seen": 100639952, "step": 149320 }, { "epoch": 3.648034593115579, "grad_norm": 0.0030977909918874502, "learning_rate": 4.1334125015271316e-07, "loss": 0.0, "num_input_tokens_seen": 100643344, "step": 149325 }, { "epoch": 3.6481567439474265, "grad_norm": 0.00491897389292717, "learning_rate": 4.1327219156522043e-07, "loss": 0.0001, "num_input_tokens_seen": 100646608, "step": 149330 }, { "epoch": 3.6482788947792733, "grad_norm": 0.005906306207180023, "learning_rate": 4.1320313724457046e-07, "loss": 0.0, "num_input_tokens_seen": 100650320, "step": 149335 }, { "epoch": 3.648401045611121, "grad_norm": 0.01143131498247385, "learning_rate": 4.1313408719126475e-07, "loss": 0.0, "num_input_tokens_seen": 100653648, "step": 149340 }, { "epoch": 3.6485231964429676, "grad_norm": 0.000916936609428376, "learning_rate": 4.130650414058061e-07, "loss": 0.0, "num_input_tokens_seen": 100656784, "step": 149345 }, { "epoch": 3.648645347274815, "grad_norm": 0.00964375026524067, "learning_rate": 4.1299599988869606e-07, "loss": 0.0001, "num_input_tokens_seen": 100660304, "step": 149350 }, { "epoch": 3.648767498106662, "grad_norm": 0.004201000090688467, "learning_rate": 4.1292696264043724e-07, "loss": 0.0001, "num_input_tokens_seen": 100663312, "step": 149355 }, { "epoch": 3.648889648938509, "grad_norm": 0.0007439203909598291, "learning_rate": 4.128579296615312e-07, "loss": 0.0, "num_input_tokens_seen": 100666448, "step": 149360 }, { "epoch": 3.6490117997703564, "grad_norm": 0.005541152320802212, "learning_rate": 4.127889009524802e-07, "loss": 0.0, "num_input_tokens_seen": 100669712, "step": 149365 }, { "epoch": 3.6491339506022036, "grad_norm": 0.001362864044494927, "learning_rate": 4.127198765137866e-07, "loss": 0.0009, "num_input_tokens_seen": 100672976, "step": 149370 }, { "epoch": 3.6492561014340508, "grad_norm": 0.017730576917529106, "learning_rate": 4.1265085634595167e-07, "loss": 0.0, "num_input_tokens_seen": 100676944, "step": 149375 }, { "epoch": 3.649378252265898, "grad_norm": 0.029552794992923737, "learning_rate": 4.125818404494781e-07, "loss": 0.0, "num_input_tokens_seen": 100680784, "step": 149380 }, { "epoch": 3.649500403097745, "grad_norm": 0.01852906122803688, "learning_rate": 4.125128288248669e-07, "loss": 0.0009, "num_input_tokens_seen": 100683856, "step": 149385 }, { "epoch": 3.6496225539295923, "grad_norm": 0.460417240858078, "learning_rate": 4.1244382147262093e-07, "loss": 0.0002, "num_input_tokens_seen": 100687376, "step": 149390 }, { "epoch": 3.6497447047614395, "grad_norm": 0.014944472350180149, "learning_rate": 4.123748183932414e-07, "loss": 0.0, "num_input_tokens_seen": 100690832, "step": 149395 }, { "epoch": 3.6498668555932867, "grad_norm": 0.006090358830988407, "learning_rate": 4.1230581958723e-07, "loss": 0.0001, "num_input_tokens_seen": 100694544, "step": 149400 }, { "epoch": 3.649989006425134, "grad_norm": 0.022321002557873726, "learning_rate": 4.122368250550889e-07, "loss": 0.0003, "num_input_tokens_seen": 100697616, "step": 149405 }, { "epoch": 3.6501111572569807, "grad_norm": 0.0012846055906265974, "learning_rate": 4.121678347973195e-07, "loss": 0.0, "num_input_tokens_seen": 100700816, "step": 149410 }, { "epoch": 3.6502333080888283, "grad_norm": 0.0003858658601529896, "learning_rate": 4.1209884881442413e-07, "loss": 0.0162, "num_input_tokens_seen": 100703824, "step": 149415 }, { "epoch": 3.650355458920675, "grad_norm": 0.0029156107921153307, "learning_rate": 4.1202986710690356e-07, "loss": 0.0, "num_input_tokens_seen": 100707408, "step": 149420 }, { "epoch": 3.6504776097525227, "grad_norm": 18.649917602539062, "learning_rate": 4.1196088967526e-07, "loss": 0.0527, "num_input_tokens_seen": 100711056, "step": 149425 }, { "epoch": 3.6505997605843694, "grad_norm": 0.012175104580819607, "learning_rate": 4.1189191651999546e-07, "loss": 0.0, "num_input_tokens_seen": 100714448, "step": 149430 }, { "epoch": 3.6507219114162166, "grad_norm": 0.00016269886691588908, "learning_rate": 4.118229476416106e-07, "loss": 0.0348, "num_input_tokens_seen": 100717840, "step": 149435 }, { "epoch": 3.650844062248064, "grad_norm": 0.004647035151720047, "learning_rate": 4.1175398304060783e-07, "loss": 0.0, "num_input_tokens_seen": 100721424, "step": 149440 }, { "epoch": 3.650966213079911, "grad_norm": 0.0009960465831682086, "learning_rate": 4.1168502271748794e-07, "loss": 0.0, "num_input_tokens_seen": 100725008, "step": 149445 }, { "epoch": 3.651088363911758, "grad_norm": 0.0012097298167645931, "learning_rate": 4.1161606667275327e-07, "loss": 0.0, "num_input_tokens_seen": 100728656, "step": 149450 }, { "epoch": 3.6512105147436054, "grad_norm": 0.002824085298925638, "learning_rate": 4.1154711490690443e-07, "loss": 0.0, "num_input_tokens_seen": 100731664, "step": 149455 }, { "epoch": 3.6513326655754526, "grad_norm": 0.0038389968685805798, "learning_rate": 4.1147816742044317e-07, "loss": 0.0, "num_input_tokens_seen": 100734800, "step": 149460 }, { "epoch": 3.6514548164072997, "grad_norm": 0.0008183844038285315, "learning_rate": 4.1140922421387137e-07, "loss": 0.0, "num_input_tokens_seen": 100738384, "step": 149465 }, { "epoch": 3.651576967239147, "grad_norm": 0.08280424773693085, "learning_rate": 4.113402852876897e-07, "loss": 0.0, "num_input_tokens_seen": 100741712, "step": 149470 }, { "epoch": 3.651699118070994, "grad_norm": 0.0005090846680104733, "learning_rate": 4.1127135064240003e-07, "loss": 0.0, "num_input_tokens_seen": 100744848, "step": 149475 }, { "epoch": 3.6518212689028413, "grad_norm": 0.015799952670931816, "learning_rate": 4.112024202785033e-07, "loss": 0.0, "num_input_tokens_seen": 100748176, "step": 149480 }, { "epoch": 3.6519434197346885, "grad_norm": 0.0010082110529765487, "learning_rate": 4.1113349419650113e-07, "loss": 0.0, "num_input_tokens_seen": 100751696, "step": 149485 }, { "epoch": 3.6520655705665357, "grad_norm": 0.010985535569489002, "learning_rate": 4.1106457239689417e-07, "loss": 0.0, "num_input_tokens_seen": 100755408, "step": 149490 }, { "epoch": 3.652187721398383, "grad_norm": 0.0007222816930152476, "learning_rate": 4.109956548801845e-07, "loss": 0.0, "num_input_tokens_seen": 100758416, "step": 149495 }, { "epoch": 3.65230987223023, "grad_norm": 0.0009421857539564371, "learning_rate": 4.1092674164687247e-07, "loss": 0.0, "num_input_tokens_seen": 100761296, "step": 149500 }, { "epoch": 3.652432023062077, "grad_norm": 0.0004014758742414415, "learning_rate": 4.1085783269746e-07, "loss": 0.0, "num_input_tokens_seen": 100764432, "step": 149505 }, { "epoch": 3.6525541738939244, "grad_norm": 0.003766036592423916, "learning_rate": 4.107889280324478e-07, "loss": 0.0, "num_input_tokens_seen": 100767824, "step": 149510 }, { "epoch": 3.652676324725771, "grad_norm": 0.006593499332666397, "learning_rate": 4.107200276523367e-07, "loss": 0.0, "num_input_tokens_seen": 100771344, "step": 149515 }, { "epoch": 3.6527984755576184, "grad_norm": 0.26415514945983887, "learning_rate": 4.10651131557628e-07, "loss": 0.0001, "num_input_tokens_seen": 100774864, "step": 149520 }, { "epoch": 3.6529206263894656, "grad_norm": 0.0013655334478244185, "learning_rate": 4.105822397488231e-07, "loss": 0.0313, "num_input_tokens_seen": 100777808, "step": 149525 }, { "epoch": 3.6530427772213128, "grad_norm": 0.01848735846579075, "learning_rate": 4.1051335222642247e-07, "loss": 0.0, "num_input_tokens_seen": 100781712, "step": 149530 }, { "epoch": 3.65316492805316, "grad_norm": 0.0029542753472924232, "learning_rate": 4.1044446899092756e-07, "loss": 0.0, "num_input_tokens_seen": 100785360, "step": 149535 }, { "epoch": 3.653287078885007, "grad_norm": 0.10529420524835587, "learning_rate": 4.1037559004283863e-07, "loss": 0.0672, "num_input_tokens_seen": 100788368, "step": 149540 }, { "epoch": 3.6534092297168543, "grad_norm": 0.3563624918460846, "learning_rate": 4.103067153826575e-07, "loss": 0.0004, "num_input_tokens_seen": 100791824, "step": 149545 }, { "epoch": 3.6535313805487015, "grad_norm": 0.046180255711078644, "learning_rate": 4.1023784501088407e-07, "loss": 0.0, "num_input_tokens_seen": 100795344, "step": 149550 }, { "epoch": 3.6536535313805487, "grad_norm": 0.0006740966928191483, "learning_rate": 4.101689789280197e-07, "loss": 0.0, "num_input_tokens_seen": 100798800, "step": 149555 }, { "epoch": 3.653775682212396, "grad_norm": 0.011169800534844398, "learning_rate": 4.101001171345655e-07, "loss": 0.0002, "num_input_tokens_seen": 100802448, "step": 149560 }, { "epoch": 3.653897833044243, "grad_norm": 0.0009329328895546496, "learning_rate": 4.100312596310216e-07, "loss": 0.0, "num_input_tokens_seen": 100806032, "step": 149565 }, { "epoch": 3.6540199838760903, "grad_norm": 0.010429616086184978, "learning_rate": 4.0996240641788936e-07, "loss": 0.0, "num_input_tokens_seen": 100809360, "step": 149570 }, { "epoch": 3.6541421347079375, "grad_norm": 0.0010302531300112605, "learning_rate": 4.0989355749566887e-07, "loss": 0.0, "num_input_tokens_seen": 100812880, "step": 149575 }, { "epoch": 3.6542642855397847, "grad_norm": 42.75357437133789, "learning_rate": 4.098247128648611e-07, "loss": 0.0536, "num_input_tokens_seen": 100816528, "step": 149580 }, { "epoch": 3.654386436371632, "grad_norm": 0.0007744549075141549, "learning_rate": 4.097558725259672e-07, "loss": 0.0, "num_input_tokens_seen": 100819664, "step": 149585 }, { "epoch": 3.6545085872034786, "grad_norm": 0.002218458568677306, "learning_rate": 4.09687036479487e-07, "loss": 0.0, "num_input_tokens_seen": 100823504, "step": 149590 }, { "epoch": 3.6546307380353262, "grad_norm": 0.0005056411027908325, "learning_rate": 4.0961820472592167e-07, "loss": 0.0002, "num_input_tokens_seen": 100827024, "step": 149595 }, { "epoch": 3.654752888867173, "grad_norm": 0.0013489199336618185, "learning_rate": 4.0954937726577165e-07, "loss": 0.0455, "num_input_tokens_seen": 100830352, "step": 149600 }, { "epoch": 3.6548750396990206, "grad_norm": 0.001258478732779622, "learning_rate": 4.094805540995371e-07, "loss": 0.0, "num_input_tokens_seen": 100834000, "step": 149605 }, { "epoch": 3.6549971905308674, "grad_norm": 0.00742685841396451, "learning_rate": 4.09411735227719e-07, "loss": 0.0474, "num_input_tokens_seen": 100837008, "step": 149610 }, { "epoch": 3.6551193413627145, "grad_norm": 0.016420450061559677, "learning_rate": 4.0934292065081733e-07, "loss": 0.0267, "num_input_tokens_seen": 100840848, "step": 149615 }, { "epoch": 3.6552414921945617, "grad_norm": 0.01596057042479515, "learning_rate": 4.0927411036933314e-07, "loss": 0.0, "num_input_tokens_seen": 100843984, "step": 149620 }, { "epoch": 3.655363643026409, "grad_norm": 0.006191764958202839, "learning_rate": 4.092053043837661e-07, "loss": 0.0, "num_input_tokens_seen": 100847504, "step": 149625 }, { "epoch": 3.655485793858256, "grad_norm": 0.01052805408835411, "learning_rate": 4.091365026946174e-07, "loss": 0.0001, "num_input_tokens_seen": 100850640, "step": 149630 }, { "epoch": 3.6556079446901033, "grad_norm": 0.002658533863723278, "learning_rate": 4.0906770530238667e-07, "loss": 0.0, "num_input_tokens_seen": 100854736, "step": 149635 }, { "epoch": 3.6557300955219505, "grad_norm": 0.030460629612207413, "learning_rate": 4.089989122075748e-07, "loss": 0.0, "num_input_tokens_seen": 100858000, "step": 149640 }, { "epoch": 3.6558522463537977, "grad_norm": 0.021629929542541504, "learning_rate": 4.0893012341068146e-07, "loss": 0.1434, "num_input_tokens_seen": 100861136, "step": 149645 }, { "epoch": 3.655974397185645, "grad_norm": 0.0006607657414861023, "learning_rate": 4.088613389122072e-07, "loss": 0.0, "num_input_tokens_seen": 100864848, "step": 149650 }, { "epoch": 3.656096548017492, "grad_norm": 0.013342432677745819, "learning_rate": 4.087925587126527e-07, "loss": 0.0012, "num_input_tokens_seen": 100868240, "step": 149655 }, { "epoch": 3.6562186988493393, "grad_norm": 0.3374849855899811, "learning_rate": 4.087237828125174e-07, "loss": 0.0003, "num_input_tokens_seen": 100871888, "step": 149660 }, { "epoch": 3.6563408496811864, "grad_norm": 0.006206023972481489, "learning_rate": 4.0865501121230205e-07, "loss": 0.0, "num_input_tokens_seen": 100875344, "step": 149665 }, { "epoch": 3.6564630005130336, "grad_norm": 1.4731436967849731, "learning_rate": 4.085862439125063e-07, "loss": 0.0003, "num_input_tokens_seen": 100878544, "step": 149670 }, { "epoch": 3.656585151344881, "grad_norm": 0.0007857258315198123, "learning_rate": 4.0851748091363036e-07, "loss": 0.0, "num_input_tokens_seen": 100881616, "step": 149675 }, { "epoch": 3.656707302176728, "grad_norm": 0.0003224741667509079, "learning_rate": 4.084487222161748e-07, "loss": 0.0001, "num_input_tokens_seen": 100885072, "step": 149680 }, { "epoch": 3.6568294530085748, "grad_norm": 0.008007667027413845, "learning_rate": 4.0837996782063876e-07, "loss": 0.0, "num_input_tokens_seen": 100888592, "step": 149685 }, { "epoch": 3.6569516038404224, "grad_norm": 0.15671700239181519, "learning_rate": 4.083112177275232e-07, "loss": 0.0005, "num_input_tokens_seen": 100891984, "step": 149690 }, { "epoch": 3.657073754672269, "grad_norm": 0.041414983570575714, "learning_rate": 4.082424719373272e-07, "loss": 0.0309, "num_input_tokens_seen": 100895760, "step": 149695 }, { "epoch": 3.6571959055041163, "grad_norm": 0.0028408959042280912, "learning_rate": 4.0817373045055134e-07, "loss": 0.0, "num_input_tokens_seen": 100899600, "step": 149700 }, { "epoch": 3.6573180563359635, "grad_norm": 0.017122983932495117, "learning_rate": 4.0810499326769557e-07, "loss": 0.0974, "num_input_tokens_seen": 100902672, "step": 149705 }, { "epoch": 3.6574402071678107, "grad_norm": 0.00024130615929607302, "learning_rate": 4.080362603892589e-07, "loss": 0.0, "num_input_tokens_seen": 100905872, "step": 149710 }, { "epoch": 3.657562357999658, "grad_norm": 0.015229341574013233, "learning_rate": 4.079675318157423e-07, "loss": 0.0, "num_input_tokens_seen": 100909200, "step": 149715 }, { "epoch": 3.657684508831505, "grad_norm": 0.051498375833034515, "learning_rate": 4.078988075476445e-07, "loss": 0.0, "num_input_tokens_seen": 100912144, "step": 149720 }, { "epoch": 3.6578066596633523, "grad_norm": 0.04249678924679756, "learning_rate": 4.0783008758546633e-07, "loss": 0.0, "num_input_tokens_seen": 100915536, "step": 149725 }, { "epoch": 3.6579288104951995, "grad_norm": 0.008442615158855915, "learning_rate": 4.0776137192970664e-07, "loss": 0.0, "num_input_tokens_seen": 100918544, "step": 149730 }, { "epoch": 3.6580509613270467, "grad_norm": 0.004928910173475742, "learning_rate": 4.076926605808655e-07, "loss": 0.0, "num_input_tokens_seen": 100921744, "step": 149735 }, { "epoch": 3.658173112158894, "grad_norm": 0.02472076192498207, "learning_rate": 4.0762395353944303e-07, "loss": 0.0, "num_input_tokens_seen": 100924944, "step": 149740 }, { "epoch": 3.658295262990741, "grad_norm": 0.22950170934200287, "learning_rate": 4.075552508059382e-07, "loss": 0.0002, "num_input_tokens_seen": 100928656, "step": 149745 }, { "epoch": 3.6584174138225882, "grad_norm": 0.0042465124279260635, "learning_rate": 4.0748655238085115e-07, "loss": 0.0, "num_input_tokens_seen": 100931920, "step": 149750 }, { "epoch": 3.6585395646544354, "grad_norm": 0.10099215805530548, "learning_rate": 4.074178582646811e-07, "loss": 0.0, "num_input_tokens_seen": 100935312, "step": 149755 }, { "epoch": 3.6586617154862826, "grad_norm": 0.1387917548418045, "learning_rate": 4.07349168457928e-07, "loss": 0.0001, "num_input_tokens_seen": 100938832, "step": 149760 }, { "epoch": 3.65878386631813, "grad_norm": 0.001386315911076963, "learning_rate": 4.0728048296109084e-07, "loss": 0.0, "num_input_tokens_seen": 100942096, "step": 149765 }, { "epoch": 3.6589060171499765, "grad_norm": 0.005705169402062893, "learning_rate": 4.072118017746694e-07, "loss": 0.0001, "num_input_tokens_seen": 100945424, "step": 149770 }, { "epoch": 3.659028167981824, "grad_norm": 0.0016926834359765053, "learning_rate": 4.0714312489916347e-07, "loss": 0.0, "num_input_tokens_seen": 100948880, "step": 149775 }, { "epoch": 3.659150318813671, "grad_norm": 0.013110811822116375, "learning_rate": 4.07074452335072e-07, "loss": 0.0, "num_input_tokens_seen": 100954448, "step": 149780 }, { "epoch": 3.6592724696455186, "grad_norm": 0.030060919001698494, "learning_rate": 4.0700578408289477e-07, "loss": 0.0, "num_input_tokens_seen": 100958288, "step": 149785 }, { "epoch": 3.6593946204773653, "grad_norm": 0.0444689504802227, "learning_rate": 4.069371201431308e-07, "loss": 0.0514, "num_input_tokens_seen": 100961296, "step": 149790 }, { "epoch": 3.6595167713092125, "grad_norm": 0.0010941436048597097, "learning_rate": 4.068684605162798e-07, "loss": 0.0, "num_input_tokens_seen": 100964432, "step": 149795 }, { "epoch": 3.6596389221410597, "grad_norm": 0.01235596090555191, "learning_rate": 4.067998052028406e-07, "loss": 0.0, "num_input_tokens_seen": 100967824, "step": 149800 }, { "epoch": 3.659761072972907, "grad_norm": 0.11753348261117935, "learning_rate": 4.0673115420331315e-07, "loss": 0.0001, "num_input_tokens_seen": 100971472, "step": 149805 }, { "epoch": 3.659883223804754, "grad_norm": 0.0018995238933712244, "learning_rate": 4.0666250751819597e-07, "loss": 0.0, "num_input_tokens_seen": 100974864, "step": 149810 }, { "epoch": 3.6600053746366012, "grad_norm": 0.00396025599911809, "learning_rate": 4.0659386514798887e-07, "loss": 0.0, "num_input_tokens_seen": 100978576, "step": 149815 }, { "epoch": 3.6601275254684484, "grad_norm": 0.0511053092777729, "learning_rate": 4.065252270931909e-07, "loss": 0.0, "num_input_tokens_seen": 100982032, "step": 149820 }, { "epoch": 3.6602496763002956, "grad_norm": 0.0021754498593509197, "learning_rate": 4.0645659335430073e-07, "loss": 0.0, "num_input_tokens_seen": 100985296, "step": 149825 }, { "epoch": 3.660371827132143, "grad_norm": 0.047641050070524216, "learning_rate": 4.063879639318178e-07, "loss": 0.0001, "num_input_tokens_seen": 100988496, "step": 149830 }, { "epoch": 3.66049397796399, "grad_norm": 0.001030646963045001, "learning_rate": 4.063193388262417e-07, "loss": 0.0, "num_input_tokens_seen": 100991696, "step": 149835 }, { "epoch": 3.660616128795837, "grad_norm": 0.0016205195570364594, "learning_rate": 4.062507180380707e-07, "loss": 0.0006, "num_input_tokens_seen": 100995088, "step": 149840 }, { "epoch": 3.6607382796276844, "grad_norm": 0.003273698966950178, "learning_rate": 4.061821015678044e-07, "loss": 0.0, "num_input_tokens_seen": 100998224, "step": 149845 }, { "epoch": 3.6608604304595316, "grad_norm": 0.00023401729413308203, "learning_rate": 4.061134894159413e-07, "loss": 0.0, "num_input_tokens_seen": 101001872, "step": 149850 }, { "epoch": 3.6609825812913783, "grad_norm": 0.006904236972332001, "learning_rate": 4.06044881582981e-07, "loss": 0.0001, "num_input_tokens_seen": 101005328, "step": 149855 }, { "epoch": 3.661104732123226, "grad_norm": 0.0033744508400559425, "learning_rate": 4.059762780694217e-07, "loss": 0.0, "num_input_tokens_seen": 101008720, "step": 149860 }, { "epoch": 3.6612268829550727, "grad_norm": 0.0004270582285244018, "learning_rate": 4.059076788757627e-07, "loss": 0.0002, "num_input_tokens_seen": 101012112, "step": 149865 }, { "epoch": 3.6613490337869203, "grad_norm": 0.0033920772839337587, "learning_rate": 4.058390840025032e-07, "loss": 0.0, "num_input_tokens_seen": 101015248, "step": 149870 }, { "epoch": 3.661471184618767, "grad_norm": 0.004283586982637644, "learning_rate": 4.0577049345014137e-07, "loss": 0.06, "num_input_tokens_seen": 101018128, "step": 149875 }, { "epoch": 3.6615933354506143, "grad_norm": 0.00031772934016771615, "learning_rate": 4.057019072191766e-07, "loss": 0.0, "num_input_tokens_seen": 101021136, "step": 149880 }, { "epoch": 3.6617154862824615, "grad_norm": 0.022333379834890366, "learning_rate": 4.056333253101072e-07, "loss": 0.0003, "num_input_tokens_seen": 101024848, "step": 149885 }, { "epoch": 3.6618376371143087, "grad_norm": 0.3499573767185211, "learning_rate": 4.0556474772343194e-07, "loss": 0.0003, "num_input_tokens_seen": 101028176, "step": 149890 }, { "epoch": 3.661959787946156, "grad_norm": 0.015845805406570435, "learning_rate": 4.0549617445965023e-07, "loss": 0.0, "num_input_tokens_seen": 101031248, "step": 149895 }, { "epoch": 3.662081938778003, "grad_norm": 0.0006837969995103776, "learning_rate": 4.0542760551925983e-07, "loss": 0.0366, "num_input_tokens_seen": 101034960, "step": 149900 }, { "epoch": 3.66220408960985, "grad_norm": 0.014118622057139874, "learning_rate": 4.053590409027602e-07, "loss": 0.0, "num_input_tokens_seen": 101038224, "step": 149905 }, { "epoch": 3.6623262404416974, "grad_norm": 0.0033027688041329384, "learning_rate": 4.052904806106495e-07, "loss": 0.0, "num_input_tokens_seen": 101041360, "step": 149910 }, { "epoch": 3.6624483912735446, "grad_norm": 0.18004614114761353, "learning_rate": 4.052219246434261e-07, "loss": 0.0001, "num_input_tokens_seen": 101044816, "step": 149915 }, { "epoch": 3.662570542105392, "grad_norm": 0.002139299176633358, "learning_rate": 4.0515337300158914e-07, "loss": 0.0, "num_input_tokens_seen": 101047824, "step": 149920 }, { "epoch": 3.662692692937239, "grad_norm": 0.00020565226441249251, "learning_rate": 4.050848256856365e-07, "loss": 0.0975, "num_input_tokens_seen": 101051280, "step": 149925 }, { "epoch": 3.662814843769086, "grad_norm": 0.003223554929718375, "learning_rate": 4.0501628269606735e-07, "loss": 0.0, "num_input_tokens_seen": 101055056, "step": 149930 }, { "epoch": 3.6629369946009334, "grad_norm": 51.59830093383789, "learning_rate": 4.049477440333795e-07, "loss": 0.001, "num_input_tokens_seen": 101058512, "step": 149935 }, { "epoch": 3.6630591454327806, "grad_norm": 0.0013691794592887163, "learning_rate": 4.04879209698072e-07, "loss": 0.0299, "num_input_tokens_seen": 101061648, "step": 149940 }, { "epoch": 3.6631812962646277, "grad_norm": 0.0034709975589066744, "learning_rate": 4.048106796906426e-07, "loss": 0.0, "num_input_tokens_seen": 101065360, "step": 149945 }, { "epoch": 3.6633034470964745, "grad_norm": 0.0040048398077487946, "learning_rate": 4.047421540115905e-07, "loss": 0.0, "num_input_tokens_seen": 101068880, "step": 149950 }, { "epoch": 3.663425597928322, "grad_norm": 0.08709995448589325, "learning_rate": 4.0467363266141317e-07, "loss": 0.0, "num_input_tokens_seen": 101071888, "step": 149955 }, { "epoch": 3.663547748760169, "grad_norm": 0.004366215784102678, "learning_rate": 4.046051156406093e-07, "loss": 0.0, "num_input_tokens_seen": 101075472, "step": 149960 }, { "epoch": 3.6636698995920165, "grad_norm": 0.0002989302738569677, "learning_rate": 4.045366029496774e-07, "loss": 0.0, "num_input_tokens_seen": 101078672, "step": 149965 }, { "epoch": 3.6637920504238632, "grad_norm": 0.002677259035408497, "learning_rate": 4.044680945891152e-07, "loss": 0.0001, "num_input_tokens_seen": 101082064, "step": 149970 }, { "epoch": 3.6639142012557104, "grad_norm": 0.011708454228937626, "learning_rate": 4.0439959055942163e-07, "loss": 0.0, "num_input_tokens_seen": 101085520, "step": 149975 }, { "epoch": 3.6640363520875576, "grad_norm": 0.00015541176253464073, "learning_rate": 4.0433109086109407e-07, "loss": 0.0875, "num_input_tokens_seen": 101089168, "step": 149980 }, { "epoch": 3.664158502919405, "grad_norm": 0.0014233732363209128, "learning_rate": 4.042625954946309e-07, "loss": 0.0, "num_input_tokens_seen": 101092560, "step": 149985 }, { "epoch": 3.664280653751252, "grad_norm": 0.013304512947797775, "learning_rate": 4.0419410446053095e-07, "loss": 0.0839, "num_input_tokens_seen": 101096400, "step": 149990 }, { "epoch": 3.664402804583099, "grad_norm": 0.0023726148065179586, "learning_rate": 4.0412561775929123e-07, "loss": 0.0001, "num_input_tokens_seen": 101099856, "step": 149995 }, { "epoch": 3.6645249554149464, "grad_norm": 0.0012767596635967493, "learning_rate": 4.040571353914106e-07, "loss": 0.0, "num_input_tokens_seen": 101102992, "step": 150000 }, { "epoch": 3.6646471062467936, "grad_norm": 0.002920180559158325, "learning_rate": 4.039886573573864e-07, "loss": 0.0, "num_input_tokens_seen": 101106320, "step": 150005 }, { "epoch": 3.6647692570786408, "grad_norm": 0.00018298810755368322, "learning_rate": 4.039201836577175e-07, "loss": 0.0, "num_input_tokens_seen": 101109328, "step": 150010 }, { "epoch": 3.664891407910488, "grad_norm": 0.01957639306783676, "learning_rate": 4.038517142929012e-07, "loss": 0.0001, "num_input_tokens_seen": 101112400, "step": 150015 }, { "epoch": 3.665013558742335, "grad_norm": 0.0002295639569638297, "learning_rate": 4.037832492634353e-07, "loss": 0.0, "num_input_tokens_seen": 101115408, "step": 150020 }, { "epoch": 3.6651357095741823, "grad_norm": 0.00030080214492045343, "learning_rate": 4.0371478856981834e-07, "loss": 0.0, "num_input_tokens_seen": 101118800, "step": 150025 }, { "epoch": 3.6652578604060295, "grad_norm": 0.0009421844151802361, "learning_rate": 4.036463322125474e-07, "loss": 0.0, "num_input_tokens_seen": 101122064, "step": 150030 }, { "epoch": 3.6653800112378763, "grad_norm": 0.0010937204351648688, "learning_rate": 4.0357788019212116e-07, "loss": 0.0813, "num_input_tokens_seen": 101125776, "step": 150035 }, { "epoch": 3.665502162069724, "grad_norm": 0.005235959775745869, "learning_rate": 4.0350943250903657e-07, "loss": 0.0, "num_input_tokens_seen": 101129168, "step": 150040 }, { "epoch": 3.6656243129015706, "grad_norm": 0.0004736567207146436, "learning_rate": 4.034409891637919e-07, "loss": 0.0001, "num_input_tokens_seen": 101132880, "step": 150045 }, { "epoch": 3.6657464637334183, "grad_norm": 0.00039471962372772396, "learning_rate": 4.033725501568851e-07, "loss": 0.0, "num_input_tokens_seen": 101136272, "step": 150050 }, { "epoch": 3.665868614565265, "grad_norm": 0.006417789030820131, "learning_rate": 4.0330411548881325e-07, "loss": 0.0, "num_input_tokens_seen": 101139728, "step": 150055 }, { "epoch": 3.665990765397112, "grad_norm": 0.006520419381558895, "learning_rate": 4.032356851600748e-07, "loss": 0.0, "num_input_tokens_seen": 101142608, "step": 150060 }, { "epoch": 3.6661129162289594, "grad_norm": 0.000228529519517906, "learning_rate": 4.0316725917116645e-07, "loss": 0.0004, "num_input_tokens_seen": 101145936, "step": 150065 }, { "epoch": 3.6662350670608066, "grad_norm": 0.00034973511355929077, "learning_rate": 4.030988375225868e-07, "loss": 0.0, "num_input_tokens_seen": 101149840, "step": 150070 }, { "epoch": 3.666357217892654, "grad_norm": 0.003269769949838519, "learning_rate": 4.0303042021483256e-07, "loss": 0.0, "num_input_tokens_seen": 101153168, "step": 150075 }, { "epoch": 3.666479368724501, "grad_norm": 0.00031599405338056386, "learning_rate": 4.029620072484017e-07, "loss": 0.0, "num_input_tokens_seen": 101156560, "step": 150080 }, { "epoch": 3.666601519556348, "grad_norm": 0.013983500190079212, "learning_rate": 4.02893598623792e-07, "loss": 0.0, "num_input_tokens_seen": 101160208, "step": 150085 }, { "epoch": 3.6667236703881954, "grad_norm": 0.0007696465472690761, "learning_rate": 4.028251943415003e-07, "loss": 0.0, "num_input_tokens_seen": 101163792, "step": 150090 }, { "epoch": 3.6668458212200425, "grad_norm": 0.012345520779490471, "learning_rate": 4.027567944020248e-07, "loss": 0.0, "num_input_tokens_seen": 101167184, "step": 150095 }, { "epoch": 3.6669679720518897, "grad_norm": 0.0010798462899401784, "learning_rate": 4.0268839880586214e-07, "loss": 0.0269, "num_input_tokens_seen": 101170448, "step": 150100 }, { "epoch": 3.667090122883737, "grad_norm": 0.004565829876810312, "learning_rate": 4.026200075535104e-07, "loss": 0.0, "num_input_tokens_seen": 101173712, "step": 150105 }, { "epoch": 3.667212273715584, "grad_norm": 0.0010166483698412776, "learning_rate": 4.0255162064546644e-07, "loss": 0.0, "num_input_tokens_seen": 101177296, "step": 150110 }, { "epoch": 3.6673344245474313, "grad_norm": 0.9598413705825806, "learning_rate": 4.0248323808222803e-07, "loss": 0.0008, "num_input_tokens_seen": 101181072, "step": 150115 }, { "epoch": 3.6674565753792785, "grad_norm": 0.0005369943683035672, "learning_rate": 4.024148598642919e-07, "loss": 0.0, "num_input_tokens_seen": 101184016, "step": 150120 }, { "epoch": 3.6675787262111257, "grad_norm": 0.01920846663415432, "learning_rate": 4.0234648599215606e-07, "loss": 0.0, "num_input_tokens_seen": 101186832, "step": 150125 }, { "epoch": 3.6677008770429724, "grad_norm": 0.002244969131425023, "learning_rate": 4.022781164663173e-07, "loss": 0.0321, "num_input_tokens_seen": 101189968, "step": 150130 }, { "epoch": 3.66782302787482, "grad_norm": 0.1661972850561142, "learning_rate": 4.0220975128727244e-07, "loss": 0.0001, "num_input_tokens_seen": 101193232, "step": 150135 }, { "epoch": 3.667945178706667, "grad_norm": 0.00025612005265429616, "learning_rate": 4.02141390455519e-07, "loss": 0.0334, "num_input_tokens_seen": 101196368, "step": 150140 }, { "epoch": 3.668067329538514, "grad_norm": 0.004735205322504044, "learning_rate": 4.0207303397155467e-07, "loss": 0.0002, "num_input_tokens_seen": 101199824, "step": 150145 }, { "epoch": 3.668189480370361, "grad_norm": 0.0003075416316278279, "learning_rate": 4.020046818358755e-07, "loss": 0.0001, "num_input_tokens_seen": 101203088, "step": 150150 }, { "epoch": 3.6683116312022084, "grad_norm": 47.425045013427734, "learning_rate": 4.0193633404897973e-07, "loss": 0.1163, "num_input_tokens_seen": 101206800, "step": 150155 }, { "epoch": 3.6684337820340556, "grad_norm": 0.0007772819953970611, "learning_rate": 4.0186799061136334e-07, "loss": 0.1, "num_input_tokens_seen": 101209872, "step": 150160 }, { "epoch": 3.6685559328659028, "grad_norm": 0.001310055493377149, "learning_rate": 4.0179965152352413e-07, "loss": 0.0268, "num_input_tokens_seen": 101213648, "step": 150165 }, { "epoch": 3.66867808369775, "grad_norm": 0.0007551017333753407, "learning_rate": 4.0173131678595837e-07, "loss": 0.0001, "num_input_tokens_seen": 101216912, "step": 150170 }, { "epoch": 3.668800234529597, "grad_norm": 0.018435023725032806, "learning_rate": 4.016629863991634e-07, "loss": 0.0371, "num_input_tokens_seen": 101221136, "step": 150175 }, { "epoch": 3.6689223853614443, "grad_norm": 49.14809036254883, "learning_rate": 4.0159466036363654e-07, "loss": 0.0526, "num_input_tokens_seen": 101224720, "step": 150180 }, { "epoch": 3.6690445361932915, "grad_norm": 0.005340251140296459, "learning_rate": 4.0152633867987383e-07, "loss": 0.0, "num_input_tokens_seen": 101228368, "step": 150185 }, { "epoch": 3.6691666870251387, "grad_norm": 1.001558780670166, "learning_rate": 4.01458021348373e-07, "loss": 0.0, "num_input_tokens_seen": 101231888, "step": 150190 }, { "epoch": 3.669288837856986, "grad_norm": 0.00035684643080458045, "learning_rate": 4.0138970836963006e-07, "loss": 0.0383, "num_input_tokens_seen": 101235088, "step": 150195 }, { "epoch": 3.669410988688833, "grad_norm": 0.0006180881755426526, "learning_rate": 4.0132139974414247e-07, "loss": 0.0201, "num_input_tokens_seen": 101238352, "step": 150200 }, { "epoch": 3.6695331395206803, "grad_norm": 0.002350478433072567, "learning_rate": 4.012530954724064e-07, "loss": 0.0, "num_input_tokens_seen": 101241552, "step": 150205 }, { "epoch": 3.6696552903525275, "grad_norm": 0.0037568961270153522, "learning_rate": 4.011847955549188e-07, "loss": 0.075, "num_input_tokens_seen": 101244816, "step": 150210 }, { "epoch": 3.669777441184374, "grad_norm": 0.0006394311785697937, "learning_rate": 4.011164999921768e-07, "loss": 0.0001, "num_input_tokens_seen": 101248080, "step": 150215 }, { "epoch": 3.669899592016222, "grad_norm": 0.0004662454593926668, "learning_rate": 4.010482087846766e-07, "loss": 0.0, "num_input_tokens_seen": 101251856, "step": 150220 }, { "epoch": 3.6700217428480686, "grad_norm": 0.004368456080555916, "learning_rate": 4.0097992193291474e-07, "loss": 0.0, "num_input_tokens_seen": 101255376, "step": 150225 }, { "epoch": 3.6701438936799162, "grad_norm": 0.009613312780857086, "learning_rate": 4.0091163943738825e-07, "loss": 0.0, "num_input_tokens_seen": 101258640, "step": 150230 }, { "epoch": 3.670266044511763, "grad_norm": 0.0005940728588029742, "learning_rate": 4.008433612985931e-07, "loss": 0.0, "num_input_tokens_seen": 101261648, "step": 150235 }, { "epoch": 3.67038819534361, "grad_norm": 16.313629150390625, "learning_rate": 4.007750875170266e-07, "loss": 0.0373, "num_input_tokens_seen": 101264976, "step": 150240 }, { "epoch": 3.6705103461754574, "grad_norm": 0.3082471787929535, "learning_rate": 4.007068180931844e-07, "loss": 0.0002, "num_input_tokens_seen": 101268176, "step": 150245 }, { "epoch": 3.6706324970073045, "grad_norm": 0.004585791379213333, "learning_rate": 4.006385530275638e-07, "loss": 0.0457, "num_input_tokens_seen": 101271120, "step": 150250 }, { "epoch": 3.6707546478391517, "grad_norm": 0.000260774337220937, "learning_rate": 4.005702923206605e-07, "loss": 0.0, "num_input_tokens_seen": 101274320, "step": 150255 }, { "epoch": 3.670876798670999, "grad_norm": 9.134371794061735e-05, "learning_rate": 4.005020359729716e-07, "loss": 0.0, "num_input_tokens_seen": 101277840, "step": 150260 }, { "epoch": 3.670998949502846, "grad_norm": 0.00024886432220228016, "learning_rate": 4.0043378398499286e-07, "loss": 0.0, "num_input_tokens_seen": 101280784, "step": 150265 }, { "epoch": 3.6711211003346933, "grad_norm": 0.4519248604774475, "learning_rate": 4.0036553635722083e-07, "loss": 0.0001, "num_input_tokens_seen": 101284176, "step": 150270 }, { "epoch": 3.6712432511665405, "grad_norm": 0.00011098060349468142, "learning_rate": 4.0029729309015224e-07, "loss": 0.0, "num_input_tokens_seen": 101287184, "step": 150275 }, { "epoch": 3.6713654019983877, "grad_norm": 0.0005101999267935753, "learning_rate": 4.0022905418428275e-07, "loss": 0.0, "num_input_tokens_seen": 101290512, "step": 150280 }, { "epoch": 3.671487552830235, "grad_norm": 0.00131460081320256, "learning_rate": 4.0016081964010927e-07, "loss": 0.0001, "num_input_tokens_seen": 101294608, "step": 150285 }, { "epoch": 3.671609703662082, "grad_norm": 0.005014736205339432, "learning_rate": 4.000925894581272e-07, "loss": 0.0, "num_input_tokens_seen": 101298128, "step": 150290 }, { "epoch": 3.6717318544939292, "grad_norm": 0.005378643982112408, "learning_rate": 4.000243636388332e-07, "loss": 0.0, "num_input_tokens_seen": 101301456, "step": 150295 }, { "epoch": 3.671854005325776, "grad_norm": 0.0007074220338836312, "learning_rate": 3.9995614218272377e-07, "loss": 0.0, "num_input_tokens_seen": 101305040, "step": 150300 }, { "epoch": 3.6719761561576236, "grad_norm": 0.34038740396499634, "learning_rate": 3.9988792509029435e-07, "loss": 0.0002, "num_input_tokens_seen": 101308240, "step": 150305 }, { "epoch": 3.6720983069894704, "grad_norm": 0.044417813420295715, "learning_rate": 3.998197123620417e-07, "loss": 0.0, "num_input_tokens_seen": 101311504, "step": 150310 }, { "epoch": 3.672220457821318, "grad_norm": 0.0010615488281473517, "learning_rate": 3.997515039984611e-07, "loss": 0.0, "num_input_tokens_seen": 101314896, "step": 150315 }, { "epoch": 3.6723426086531648, "grad_norm": 0.02319425158202648, "learning_rate": 3.9968330000004944e-07, "loss": 0.0001, "num_input_tokens_seen": 101318032, "step": 150320 }, { "epoch": 3.672464759485012, "grad_norm": 0.0005021728575229645, "learning_rate": 3.9961510036730227e-07, "loss": 0.0002, "num_input_tokens_seen": 101321808, "step": 150325 }, { "epoch": 3.672586910316859, "grad_norm": 0.0013200391549617052, "learning_rate": 3.995469051007152e-07, "loss": 0.0001, "num_input_tokens_seen": 101325008, "step": 150330 }, { "epoch": 3.6727090611487063, "grad_norm": 0.000464181590359658, "learning_rate": 3.9947871420078495e-07, "loss": 0.0, "num_input_tokens_seen": 101328144, "step": 150335 }, { "epoch": 3.6728312119805535, "grad_norm": 0.0006909414078108966, "learning_rate": 3.9941052766800656e-07, "loss": 0.0, "num_input_tokens_seen": 101331536, "step": 150340 }, { "epoch": 3.6729533628124007, "grad_norm": 0.0009814815130084753, "learning_rate": 3.993423455028767e-07, "loss": 0.0, "num_input_tokens_seen": 101335120, "step": 150345 }, { "epoch": 3.673075513644248, "grad_norm": 0.0008539154659956694, "learning_rate": 3.992741677058906e-07, "loss": 0.0451, "num_input_tokens_seen": 101338320, "step": 150350 }, { "epoch": 3.673197664476095, "grad_norm": 0.0025035631842911243, "learning_rate": 3.9920599427754465e-07, "loss": 0.0, "num_input_tokens_seen": 101341776, "step": 150355 }, { "epoch": 3.6733198153079423, "grad_norm": 0.00038199685513973236, "learning_rate": 3.991378252183339e-07, "loss": 0.0002, "num_input_tokens_seen": 101345680, "step": 150360 }, { "epoch": 3.6734419661397895, "grad_norm": 0.0012773294001817703, "learning_rate": 3.9906966052875457e-07, "loss": 0.0, "num_input_tokens_seen": 101349008, "step": 150365 }, { "epoch": 3.6735641169716367, "grad_norm": 0.00031368996133096516, "learning_rate": 3.990015002093027e-07, "loss": 0.0, "num_input_tokens_seen": 101352336, "step": 150370 }, { "epoch": 3.673686267803484, "grad_norm": 0.0028718202374875546, "learning_rate": 3.989333442604731e-07, "loss": 0.0, "num_input_tokens_seen": 101355536, "step": 150375 }, { "epoch": 3.673808418635331, "grad_norm": 0.014222218655049801, "learning_rate": 3.988651926827623e-07, "loss": 0.0466, "num_input_tokens_seen": 101358416, "step": 150380 }, { "epoch": 3.673930569467178, "grad_norm": 0.00805890653282404, "learning_rate": 3.9879704547666517e-07, "loss": 0.0488, "num_input_tokens_seen": 101361424, "step": 150385 }, { "epoch": 3.6740527202990254, "grad_norm": 0.0034299599938094616, "learning_rate": 3.987289026426776e-07, "loss": 0.0, "num_input_tokens_seen": 101364432, "step": 150390 }, { "epoch": 3.674174871130872, "grad_norm": 0.011425524950027466, "learning_rate": 3.9866076418129545e-07, "loss": 0.0, "num_input_tokens_seen": 101368272, "step": 150395 }, { "epoch": 3.67429702196272, "grad_norm": 0.011008653789758682, "learning_rate": 3.985926300930137e-07, "loss": 0.0297, "num_input_tokens_seen": 101371536, "step": 150400 }, { "epoch": 3.6744191727945665, "grad_norm": 0.0011469714809209108, "learning_rate": 3.985245003783284e-07, "loss": 0.0001, "num_input_tokens_seen": 101375568, "step": 150405 }, { "epoch": 3.674541323626414, "grad_norm": 0.0007156103383749723, "learning_rate": 3.9845637503773443e-07, "loss": 0.0001, "num_input_tokens_seen": 101378768, "step": 150410 }, { "epoch": 3.674663474458261, "grad_norm": 0.0012320448877289891, "learning_rate": 3.9838825407172784e-07, "loss": 0.062, "num_input_tokens_seen": 101381776, "step": 150415 }, { "epoch": 3.674785625290108, "grad_norm": 0.0015208465047180653, "learning_rate": 3.983201374808033e-07, "loss": 0.0001, "num_input_tokens_seen": 101384656, "step": 150420 }, { "epoch": 3.6749077761219553, "grad_norm": 0.04762888699769974, "learning_rate": 3.982520252654569e-07, "loss": 0.0001, "num_input_tokens_seen": 101388048, "step": 150425 }, { "epoch": 3.6750299269538025, "grad_norm": 0.009869548492133617, "learning_rate": 3.981839174261833e-07, "loss": 0.0001, "num_input_tokens_seen": 101391184, "step": 150430 }, { "epoch": 3.6751520777856497, "grad_norm": 0.0018234923481941223, "learning_rate": 3.9811581396347835e-07, "loss": 0.0388, "num_input_tokens_seen": 101394512, "step": 150435 }, { "epoch": 3.675274228617497, "grad_norm": 0.0007763886242173612, "learning_rate": 3.9804771487783727e-07, "loss": 0.0, "num_input_tokens_seen": 101397904, "step": 150440 }, { "epoch": 3.675396379449344, "grad_norm": 0.002413820242509246, "learning_rate": 3.9797962016975463e-07, "loss": 0.0, "num_input_tokens_seen": 101401232, "step": 150445 }, { "epoch": 3.6755185302811912, "grad_norm": 0.014111106283962727, "learning_rate": 3.979115298397262e-07, "loss": 0.0004, "num_input_tokens_seen": 101404944, "step": 150450 }, { "epoch": 3.6756406811130384, "grad_norm": 0.010848580859601498, "learning_rate": 3.978434438882474e-07, "loss": 0.0917, "num_input_tokens_seen": 101408208, "step": 150455 }, { "epoch": 3.6757628319448856, "grad_norm": 77.76248931884766, "learning_rate": 3.9777536231581265e-07, "loss": 0.0012, "num_input_tokens_seen": 101411536, "step": 150460 }, { "epoch": 3.675884982776733, "grad_norm": 0.002751800697296858, "learning_rate": 3.9770728512291785e-07, "loss": 0.0001, "num_input_tokens_seen": 101414736, "step": 150465 }, { "epoch": 3.67600713360858, "grad_norm": 0.002995507325977087, "learning_rate": 3.9763921231005726e-07, "loss": 0.0001, "num_input_tokens_seen": 101418128, "step": 150470 }, { "epoch": 3.676129284440427, "grad_norm": 0.031024383381009102, "learning_rate": 3.975711438777267e-07, "loss": 0.0, "num_input_tokens_seen": 101421520, "step": 150475 }, { "epoch": 3.676251435272274, "grad_norm": 0.17898677289485931, "learning_rate": 3.975030798264205e-07, "loss": 0.0003, "num_input_tokens_seen": 101425232, "step": 150480 }, { "epoch": 3.6763735861041216, "grad_norm": 0.01304326206445694, "learning_rate": 3.974350201566339e-07, "loss": 0.0001, "num_input_tokens_seen": 101428304, "step": 150485 }, { "epoch": 3.6764957369359683, "grad_norm": 0.0001925857359310612, "learning_rate": 3.973669648688622e-07, "loss": 0.0, "num_input_tokens_seen": 101431568, "step": 150490 }, { "epoch": 3.676617887767816, "grad_norm": 0.0064493948593735695, "learning_rate": 3.9729891396359984e-07, "loss": 0.0365, "num_input_tokens_seen": 101435216, "step": 150495 }, { "epoch": 3.6767400385996627, "grad_norm": 0.0003160308697260916, "learning_rate": 3.9723086744134216e-07, "loss": 0.0, "num_input_tokens_seen": 101438480, "step": 150500 }, { "epoch": 3.67686218943151, "grad_norm": 51.7661018371582, "learning_rate": 3.971628253025834e-07, "loss": 0.1552, "num_input_tokens_seen": 101442064, "step": 150505 }, { "epoch": 3.676984340263357, "grad_norm": 0.05616138502955437, "learning_rate": 3.97094787547819e-07, "loss": 0.0, "num_input_tokens_seen": 101445328, "step": 150510 }, { "epoch": 3.6771064910952043, "grad_norm": 0.0024035891983658075, "learning_rate": 3.9702675417754317e-07, "loss": 0.0, "num_input_tokens_seen": 101448464, "step": 150515 }, { "epoch": 3.6772286419270515, "grad_norm": 0.06979092210531235, "learning_rate": 3.969587251922509e-07, "loss": 0.0, "num_input_tokens_seen": 101451984, "step": 150520 }, { "epoch": 3.6773507927588986, "grad_norm": 0.0018006553873419762, "learning_rate": 3.9689070059243745e-07, "loss": 0.0, "num_input_tokens_seen": 101455568, "step": 150525 }, { "epoch": 3.677472943590746, "grad_norm": 0.004438464529812336, "learning_rate": 3.96822680378597e-07, "loss": 0.0, "num_input_tokens_seen": 101458768, "step": 150530 }, { "epoch": 3.677595094422593, "grad_norm": 0.004643600899726152, "learning_rate": 3.967546645512239e-07, "loss": 0.0, "num_input_tokens_seen": 101462352, "step": 150535 }, { "epoch": 3.67771724525444, "grad_norm": 0.004531925544142723, "learning_rate": 3.9668665311081337e-07, "loss": 0.0, "num_input_tokens_seen": 101465424, "step": 150540 }, { "epoch": 3.6778393960862874, "grad_norm": 0.001109713688492775, "learning_rate": 3.966186460578596e-07, "loss": 0.0, "num_input_tokens_seen": 101468752, "step": 150545 }, { "epoch": 3.6779615469181346, "grad_norm": 27.36225128173828, "learning_rate": 3.965506433928576e-07, "loss": 0.0332, "num_input_tokens_seen": 101471824, "step": 150550 }, { "epoch": 3.678083697749982, "grad_norm": 0.0016479621408507228, "learning_rate": 3.9648264511630125e-07, "loss": 0.0, "num_input_tokens_seen": 101475344, "step": 150555 }, { "epoch": 3.678205848581829, "grad_norm": 0.0013086326653137803, "learning_rate": 3.964146512286858e-07, "loss": 0.0, "num_input_tokens_seen": 101478544, "step": 150560 }, { "epoch": 3.678327999413676, "grad_norm": 0.006203626748174429, "learning_rate": 3.96346661730505e-07, "loss": 0.0, "num_input_tokens_seen": 101482448, "step": 150565 }, { "epoch": 3.6784501502455234, "grad_norm": 0.0003723462868947536, "learning_rate": 3.9627867662225403e-07, "loss": 0.0, "num_input_tokens_seen": 101485776, "step": 150570 }, { "epoch": 3.67857230107737, "grad_norm": 44.98166275024414, "learning_rate": 3.962106959044265e-07, "loss": 0.0854, "num_input_tokens_seen": 101488784, "step": 150575 }, { "epoch": 3.6786944519092177, "grad_norm": 0.0014090328477323055, "learning_rate": 3.961427195775171e-07, "loss": 0.0001, "num_input_tokens_seen": 101491984, "step": 150580 }, { "epoch": 3.6788166027410645, "grad_norm": 0.010927320457994938, "learning_rate": 3.9607474764202073e-07, "loss": 0.0001, "num_input_tokens_seen": 101495632, "step": 150585 }, { "epoch": 3.6789387535729117, "grad_norm": 0.007265997119247913, "learning_rate": 3.960067800984309e-07, "loss": 0.075, "num_input_tokens_seen": 101499536, "step": 150590 }, { "epoch": 3.679060904404759, "grad_norm": 0.4248805642127991, "learning_rate": 3.9593881694724253e-07, "loss": 0.0002, "num_input_tokens_seen": 101502800, "step": 150595 }, { "epoch": 3.679183055236606, "grad_norm": 0.00709194503724575, "learning_rate": 3.958708581889493e-07, "loss": 0.0, "num_input_tokens_seen": 101506384, "step": 150600 }, { "epoch": 3.6793052060684532, "grad_norm": 0.005410181358456612, "learning_rate": 3.9580290382404546e-07, "loss": 0.0001, "num_input_tokens_seen": 101510160, "step": 150605 }, { "epoch": 3.6794273569003004, "grad_norm": 0.004404694773256779, "learning_rate": 3.957349538530259e-07, "loss": 0.0004, "num_input_tokens_seen": 101513872, "step": 150610 }, { "epoch": 3.6795495077321476, "grad_norm": 0.004392651841044426, "learning_rate": 3.95667008276384e-07, "loss": 0.0, "num_input_tokens_seen": 101516816, "step": 150615 }, { "epoch": 3.679671658563995, "grad_norm": 0.0029403301887214184, "learning_rate": 3.9559906709461445e-07, "loss": 0.0, "num_input_tokens_seen": 101520208, "step": 150620 }, { "epoch": 3.679793809395842, "grad_norm": 0.010532950982451439, "learning_rate": 3.9553113030821064e-07, "loss": 0.0, "num_input_tokens_seen": 101523792, "step": 150625 }, { "epoch": 3.679915960227689, "grad_norm": 0.0017830339493229985, "learning_rate": 3.954631979176675e-07, "loss": 0.0, "num_input_tokens_seen": 101526992, "step": 150630 }, { "epoch": 3.6800381110595364, "grad_norm": 0.027786199003458023, "learning_rate": 3.953952699234785e-07, "loss": 0.0, "num_input_tokens_seen": 101530320, "step": 150635 }, { "epoch": 3.6801602618913836, "grad_norm": 0.07802188396453857, "learning_rate": 3.953273463261374e-07, "loss": 0.0001, "num_input_tokens_seen": 101533712, "step": 150640 }, { "epoch": 3.6802824127232308, "grad_norm": 0.05071675032377243, "learning_rate": 3.952594271261388e-07, "loss": 0.0001, "num_input_tokens_seen": 101536976, "step": 150645 }, { "epoch": 3.680404563555078, "grad_norm": 0.0006053565302863717, "learning_rate": 3.95191512323976e-07, "loss": 0.0, "num_input_tokens_seen": 101540240, "step": 150650 }, { "epoch": 3.680526714386925, "grad_norm": 0.0005544578889384866, "learning_rate": 3.951236019201435e-07, "loss": 0.0, "num_input_tokens_seen": 101543696, "step": 150655 }, { "epoch": 3.680648865218772, "grad_norm": 0.002754826098680496, "learning_rate": 3.9505569591513444e-07, "loss": 0.0, "num_input_tokens_seen": 101546768, "step": 150660 }, { "epoch": 3.6807710160506195, "grad_norm": 0.007763294503092766, "learning_rate": 3.949877943094435e-07, "loss": 0.0001, "num_input_tokens_seen": 101550032, "step": 150665 }, { "epoch": 3.6808931668824663, "grad_norm": 0.05538984388113022, "learning_rate": 3.949198971035638e-07, "loss": 0.0, "num_input_tokens_seen": 101553040, "step": 150670 }, { "epoch": 3.681015317714314, "grad_norm": 0.0809285044670105, "learning_rate": 3.9485200429798914e-07, "loss": 0.0001, "num_input_tokens_seen": 101556560, "step": 150675 }, { "epoch": 3.6811374685461606, "grad_norm": 0.0003654407919384539, "learning_rate": 3.947841158932139e-07, "loss": 0.0, "num_input_tokens_seen": 101560400, "step": 150680 }, { "epoch": 3.681259619378008, "grad_norm": 0.0008546562166884542, "learning_rate": 3.9471623188973115e-07, "loss": 0.0, "num_input_tokens_seen": 101564432, "step": 150685 }, { "epoch": 3.681381770209855, "grad_norm": 5.210339546203613, "learning_rate": 3.9464835228803494e-07, "loss": 0.0006, "num_input_tokens_seen": 101567248, "step": 150690 }, { "epoch": 3.681503921041702, "grad_norm": 0.0010287740733474493, "learning_rate": 3.945804770886184e-07, "loss": 0.0, "num_input_tokens_seen": 101570512, "step": 150695 }, { "epoch": 3.6816260718735494, "grad_norm": 0.003941098693758249, "learning_rate": 3.9451260629197557e-07, "loss": 0.0, "num_input_tokens_seen": 101573520, "step": 150700 }, { "epoch": 3.6817482227053966, "grad_norm": 0.005360405892133713, "learning_rate": 3.9444473989860017e-07, "loss": 0.0, "num_input_tokens_seen": 101576720, "step": 150705 }, { "epoch": 3.681870373537244, "grad_norm": 0.001114573678933084, "learning_rate": 3.943768779089852e-07, "loss": 0.0, "num_input_tokens_seen": 101580368, "step": 150710 }, { "epoch": 3.681992524369091, "grad_norm": 0.002144193509593606, "learning_rate": 3.943090203236248e-07, "loss": 0.0002, "num_input_tokens_seen": 101583696, "step": 150715 }, { "epoch": 3.682114675200938, "grad_norm": 0.000986748025752604, "learning_rate": 3.942411671430118e-07, "loss": 0.0, "num_input_tokens_seen": 101587600, "step": 150720 }, { "epoch": 3.6822368260327853, "grad_norm": 0.006151706911623478, "learning_rate": 3.941733183676402e-07, "loss": 0.0, "num_input_tokens_seen": 101590992, "step": 150725 }, { "epoch": 3.6823589768646325, "grad_norm": 0.025515733286738396, "learning_rate": 3.94105473998003e-07, "loss": 0.0, "num_input_tokens_seen": 101594512, "step": 150730 }, { "epoch": 3.6824811276964797, "grad_norm": 0.15389606356620789, "learning_rate": 3.94037634034594e-07, "loss": 0.0001, "num_input_tokens_seen": 101598352, "step": 150735 }, { "epoch": 3.682603278528327, "grad_norm": 0.0027224128134548664, "learning_rate": 3.9396979847790603e-07, "loss": 0.0, "num_input_tokens_seen": 101601872, "step": 150740 }, { "epoch": 3.682725429360174, "grad_norm": 0.014924484305083752, "learning_rate": 3.9390196732843294e-07, "loss": 0.0, "num_input_tokens_seen": 101605456, "step": 150745 }, { "epoch": 3.6828475801920213, "grad_norm": 0.015203488990664482, "learning_rate": 3.9383414058666784e-07, "loss": 0.0001, "num_input_tokens_seen": 101608912, "step": 150750 }, { "epoch": 3.682969731023868, "grad_norm": 0.3671652674674988, "learning_rate": 3.9376631825310345e-07, "loss": 0.0692, "num_input_tokens_seen": 101612240, "step": 150755 }, { "epoch": 3.6830918818557157, "grad_norm": 0.01368219405412674, "learning_rate": 3.936985003282336e-07, "loss": 0.0, "num_input_tokens_seen": 101615312, "step": 150760 }, { "epoch": 3.6832140326875624, "grad_norm": 0.001394348219037056, "learning_rate": 3.936306868125516e-07, "loss": 0.0, "num_input_tokens_seen": 101618896, "step": 150765 }, { "epoch": 3.6833361835194096, "grad_norm": 0.007090140599757433, "learning_rate": 3.9356287770654993e-07, "loss": 0.0001, "num_input_tokens_seen": 101622096, "step": 150770 }, { "epoch": 3.683458334351257, "grad_norm": 0.00044982563122175634, "learning_rate": 3.934950730107226e-07, "loss": 0.0349, "num_input_tokens_seen": 101625680, "step": 150775 }, { "epoch": 3.683580485183104, "grad_norm": 23.51737403869629, "learning_rate": 3.9342727272556186e-07, "loss": 0.0235, "num_input_tokens_seen": 101628816, "step": 150780 }, { "epoch": 3.683702636014951, "grad_norm": 0.005765157286077738, "learning_rate": 3.933594768515615e-07, "loss": 0.0, "num_input_tokens_seen": 101632144, "step": 150785 }, { "epoch": 3.6838247868467984, "grad_norm": 0.0034340538550168276, "learning_rate": 3.932916853892138e-07, "loss": 0.0, "num_input_tokens_seen": 101636304, "step": 150790 }, { "epoch": 3.6839469376786456, "grad_norm": 0.011951752938330173, "learning_rate": 3.9322389833901205e-07, "loss": 0.0, "num_input_tokens_seen": 101640208, "step": 150795 }, { "epoch": 3.6840690885104928, "grad_norm": 0.0030271399300545454, "learning_rate": 3.931561157014498e-07, "loss": 0.0, "num_input_tokens_seen": 101643472, "step": 150800 }, { "epoch": 3.68419123934234, "grad_norm": 0.002439115894958377, "learning_rate": 3.930883374770191e-07, "loss": 0.0512, "num_input_tokens_seen": 101647696, "step": 150805 }, { "epoch": 3.684313390174187, "grad_norm": 0.004594883415848017, "learning_rate": 3.9302056366621363e-07, "loss": 0.0, "num_input_tokens_seen": 101651088, "step": 150810 }, { "epoch": 3.6844355410060343, "grad_norm": 0.0011603219900280237, "learning_rate": 3.929527942695254e-07, "loss": 0.0938, "num_input_tokens_seen": 101654480, "step": 150815 }, { "epoch": 3.6845576918378815, "grad_norm": 0.0017992982175201178, "learning_rate": 3.9288502928744824e-07, "loss": 0.0, "num_input_tokens_seen": 101657552, "step": 150820 }, { "epoch": 3.6846798426697287, "grad_norm": 0.009408014826476574, "learning_rate": 3.9281726872047403e-07, "loss": 0.0001, "num_input_tokens_seen": 101660816, "step": 150825 }, { "epoch": 3.684801993501576, "grad_norm": 0.003088391851633787, "learning_rate": 3.927495125690959e-07, "loss": 0.0728, "num_input_tokens_seen": 101663952, "step": 150830 }, { "epoch": 3.684924144333423, "grad_norm": 0.030103838071227074, "learning_rate": 3.926817608338071e-07, "loss": 0.0, "num_input_tokens_seen": 101667408, "step": 150835 }, { "epoch": 3.68504629516527, "grad_norm": 0.0010745770996436477, "learning_rate": 3.926140135150998e-07, "loss": 0.0, "num_input_tokens_seen": 101670608, "step": 150840 }, { "epoch": 3.6851684459971175, "grad_norm": 0.0004344276967458427, "learning_rate": 3.9254627061346655e-07, "loss": 0.0, "num_input_tokens_seen": 101674128, "step": 150845 }, { "epoch": 3.685290596828964, "grad_norm": 0.0007442793576046824, "learning_rate": 3.9247853212940043e-07, "loss": 0.1039, "num_input_tokens_seen": 101677648, "step": 150850 }, { "epoch": 3.685412747660812, "grad_norm": 1.0434240102767944, "learning_rate": 3.924107980633935e-07, "loss": 0.0003, "num_input_tokens_seen": 101680784, "step": 150855 }, { "epoch": 3.6855348984926586, "grad_norm": 0.0017332849092781544, "learning_rate": 3.92343068415939e-07, "loss": 0.0, "num_input_tokens_seen": 101684560, "step": 150860 }, { "epoch": 3.6856570493245058, "grad_norm": 0.011752471327781677, "learning_rate": 3.9227534318752887e-07, "loss": 0.0004, "num_input_tokens_seen": 101687824, "step": 150865 }, { "epoch": 3.685779200156353, "grad_norm": 0.021650463342666626, "learning_rate": 3.922076223786561e-07, "loss": 0.0002, "num_input_tokens_seen": 101691536, "step": 150870 }, { "epoch": 3.6859013509882, "grad_norm": 0.00036399438977241516, "learning_rate": 3.9213990598981283e-07, "loss": 0.0001, "num_input_tokens_seen": 101695184, "step": 150875 }, { "epoch": 3.6860235018200473, "grad_norm": 0.0023778865579515696, "learning_rate": 3.9207219402149183e-07, "loss": 0.0548, "num_input_tokens_seen": 101699152, "step": 150880 }, { "epoch": 3.6861456526518945, "grad_norm": 0.0013532171724364161, "learning_rate": 3.920044864741852e-07, "loss": 0.0, "num_input_tokens_seen": 101702288, "step": 150885 }, { "epoch": 3.6862678034837417, "grad_norm": 0.005285841412842274, "learning_rate": 3.919367833483852e-07, "loss": 0.0, "num_input_tokens_seen": 101705616, "step": 150890 }, { "epoch": 3.686389954315589, "grad_norm": 0.01104725431650877, "learning_rate": 3.91869084644585e-07, "loss": 0.0, "num_input_tokens_seen": 101708944, "step": 150895 }, { "epoch": 3.686512105147436, "grad_norm": 0.0009050817461684346, "learning_rate": 3.91801390363276e-07, "loss": 0.0001, "num_input_tokens_seen": 101712528, "step": 150900 }, { "epoch": 3.6866342559792833, "grad_norm": 0.0777590200304985, "learning_rate": 3.9173370050495123e-07, "loss": 0.0001, "num_input_tokens_seen": 101715920, "step": 150905 }, { "epoch": 3.6867564068111305, "grad_norm": 0.0008035873179323971, "learning_rate": 3.916660150701022e-07, "loss": 0.0225, "num_input_tokens_seen": 101719184, "step": 150910 }, { "epoch": 3.6868785576429777, "grad_norm": 0.00047678741975687444, "learning_rate": 3.9159833405922193e-07, "loss": 0.0, "num_input_tokens_seen": 101722448, "step": 150915 }, { "epoch": 3.687000708474825, "grad_norm": 0.0004096681368537247, "learning_rate": 3.915306574728019e-07, "loss": 0.0527, "num_input_tokens_seen": 101725712, "step": 150920 }, { "epoch": 3.6871228593066716, "grad_norm": 0.013166260905563831, "learning_rate": 3.914629853113345e-07, "loss": 0.0, "num_input_tokens_seen": 101729232, "step": 150925 }, { "epoch": 3.6872450101385192, "grad_norm": 128.74935913085938, "learning_rate": 3.913953175753123e-07, "loss": 0.0489, "num_input_tokens_seen": 101733200, "step": 150930 }, { "epoch": 3.687367160970366, "grad_norm": 0.008455926552414894, "learning_rate": 3.913276542652267e-07, "loss": 0.0, "num_input_tokens_seen": 101736272, "step": 150935 }, { "epoch": 3.6874893118022136, "grad_norm": 0.003050909610465169, "learning_rate": 3.912599953815705e-07, "loss": 0.0, "num_input_tokens_seen": 101739472, "step": 150940 }, { "epoch": 3.6876114626340604, "grad_norm": 0.0009746898431330919, "learning_rate": 3.911923409248353e-07, "loss": 0.0001, "num_input_tokens_seen": 101742800, "step": 150945 }, { "epoch": 3.6877336134659076, "grad_norm": 0.006257293745875359, "learning_rate": 3.911246908955129e-07, "loss": 0.0001, "num_input_tokens_seen": 101745936, "step": 150950 }, { "epoch": 3.6878557642977547, "grad_norm": 0.03148772194981575, "learning_rate": 3.9105704529409575e-07, "loss": 0.0, "num_input_tokens_seen": 101749136, "step": 150955 }, { "epoch": 3.687977915129602, "grad_norm": 0.17039455473423004, "learning_rate": 3.909894041210753e-07, "loss": 0.0003, "num_input_tokens_seen": 101752784, "step": 150960 }, { "epoch": 3.688100065961449, "grad_norm": 0.009566457010805607, "learning_rate": 3.90921767376944e-07, "loss": 0.0, "num_input_tokens_seen": 101755792, "step": 150965 }, { "epoch": 3.6882222167932963, "grad_norm": 0.006611944641917944, "learning_rate": 3.9085413506219313e-07, "loss": 0.0465, "num_input_tokens_seen": 101758992, "step": 150970 }, { "epoch": 3.6883443676251435, "grad_norm": 0.0023192299995571375, "learning_rate": 3.907865071773151e-07, "loss": 0.0, "num_input_tokens_seen": 101762640, "step": 150975 }, { "epoch": 3.6884665184569907, "grad_norm": 0.0027550682425498962, "learning_rate": 3.9071888372280113e-07, "loss": 0.0002, "num_input_tokens_seen": 101765904, "step": 150980 }, { "epoch": 3.688588669288838, "grad_norm": 15.158320426940918, "learning_rate": 3.906512646991433e-07, "loss": 0.0468, "num_input_tokens_seen": 101769552, "step": 150985 }, { "epoch": 3.688710820120685, "grad_norm": 0.00859321653842926, "learning_rate": 3.9058365010683383e-07, "loss": 0.0, "num_input_tokens_seen": 101772880, "step": 150990 }, { "epoch": 3.6888329709525323, "grad_norm": 0.0005637798458337784, "learning_rate": 3.905160399463635e-07, "loss": 0.0, "num_input_tokens_seen": 101776400, "step": 150995 }, { "epoch": 3.6889551217843795, "grad_norm": 0.004457912407815456, "learning_rate": 3.9044843421822485e-07, "loss": 0.0, "num_input_tokens_seen": 101779664, "step": 151000 }, { "epoch": 3.6890772726162266, "grad_norm": 0.0030633006244897842, "learning_rate": 3.903808329229087e-07, "loss": 0.0, "num_input_tokens_seen": 101782800, "step": 151005 }, { "epoch": 3.689199423448074, "grad_norm": 0.0062855626456439495, "learning_rate": 3.9031323606090717e-07, "loss": 0.0786, "num_input_tokens_seen": 101786128, "step": 151010 }, { "epoch": 3.689321574279921, "grad_norm": 0.005475311540067196, "learning_rate": 3.902456436327122e-07, "loss": 0.0, "num_input_tokens_seen": 101789392, "step": 151015 }, { "epoch": 3.6894437251117678, "grad_norm": 0.02409266121685505, "learning_rate": 3.9017805563881446e-07, "loss": 0.0, "num_input_tokens_seen": 101792592, "step": 151020 }, { "epoch": 3.6895658759436154, "grad_norm": 0.00048403823166154325, "learning_rate": 3.901104720797063e-07, "loss": 0.0001, "num_input_tokens_seen": 101795792, "step": 151025 }, { "epoch": 3.689688026775462, "grad_norm": 0.0028270462062209845, "learning_rate": 3.9004289295587845e-07, "loss": 0.0001, "num_input_tokens_seen": 101799312, "step": 151030 }, { "epoch": 3.68981017760731, "grad_norm": 0.02731265313923359, "learning_rate": 3.8997531826782315e-07, "loss": 0.0003, "num_input_tokens_seen": 101802640, "step": 151035 }, { "epoch": 3.6899323284391565, "grad_norm": 0.0027652860153466463, "learning_rate": 3.89907748016031e-07, "loss": 0.0, "num_input_tokens_seen": 101806288, "step": 151040 }, { "epoch": 3.6900544792710037, "grad_norm": 0.006285255309194326, "learning_rate": 3.898401822009942e-07, "loss": 0.0, "num_input_tokens_seen": 101810896, "step": 151045 }, { "epoch": 3.690176630102851, "grad_norm": 0.001907885423861444, "learning_rate": 3.8977262082320337e-07, "loss": 0.0, "num_input_tokens_seen": 101813968, "step": 151050 }, { "epoch": 3.690298780934698, "grad_norm": 18.86277198791504, "learning_rate": 3.897050638831505e-07, "loss": 0.0667, "num_input_tokens_seen": 101817360, "step": 151055 }, { "epoch": 3.6904209317665453, "grad_norm": 0.022184569388628006, "learning_rate": 3.896375113813265e-07, "loss": 0.0, "num_input_tokens_seen": 101820304, "step": 151060 }, { "epoch": 3.6905430825983925, "grad_norm": 0.0037373073864728212, "learning_rate": 3.8956996331822243e-07, "loss": 0.0, "num_input_tokens_seen": 101824016, "step": 151065 }, { "epoch": 3.6906652334302397, "grad_norm": 0.0009017616976052523, "learning_rate": 3.895024196943301e-07, "loss": 0.0, "num_input_tokens_seen": 101827856, "step": 151070 }, { "epoch": 3.690787384262087, "grad_norm": 0.009098973125219345, "learning_rate": 3.8943488051013997e-07, "loss": 0.0, "num_input_tokens_seen": 101830928, "step": 151075 }, { "epoch": 3.690909535093934, "grad_norm": 0.0026963918935507536, "learning_rate": 3.8936734576614374e-07, "loss": 0.0371, "num_input_tokens_seen": 101835280, "step": 151080 }, { "epoch": 3.6910316859257812, "grad_norm": 0.0007805816712789237, "learning_rate": 3.8929981546283266e-07, "loss": 0.0, "num_input_tokens_seen": 101838352, "step": 151085 }, { "epoch": 3.6911538367576284, "grad_norm": 0.004396933596581221, "learning_rate": 3.8923228960069723e-07, "loss": 0.0, "num_input_tokens_seen": 101841552, "step": 151090 }, { "epoch": 3.6912759875894756, "grad_norm": 0.002870592987164855, "learning_rate": 3.8916476818022914e-07, "loss": 0.0001, "num_input_tokens_seen": 101844688, "step": 151095 }, { "epoch": 3.691398138421323, "grad_norm": 0.007208169437944889, "learning_rate": 3.8909725120191893e-07, "loss": 0.0515, "num_input_tokens_seen": 101847952, "step": 151100 }, { "epoch": 3.6915202892531696, "grad_norm": 0.008845353499054909, "learning_rate": 3.890297386662578e-07, "loss": 0.0, "num_input_tokens_seen": 101851344, "step": 151105 }, { "epoch": 3.691642440085017, "grad_norm": 0.014517847448587418, "learning_rate": 3.88962230573737e-07, "loss": 0.0, "num_input_tokens_seen": 101854544, "step": 151110 }, { "epoch": 3.691764590916864, "grad_norm": 0.0029305708594620228, "learning_rate": 3.8889472692484703e-07, "loss": 0.0002, "num_input_tokens_seen": 101857744, "step": 151115 }, { "epoch": 3.6918867417487116, "grad_norm": 0.039825934916734695, "learning_rate": 3.8882722772007914e-07, "loss": 0.0001, "num_input_tokens_seen": 101861136, "step": 151120 }, { "epoch": 3.6920088925805583, "grad_norm": 0.0230876412242651, "learning_rate": 3.8875973295992383e-07, "loss": 0.0, "num_input_tokens_seen": 101864400, "step": 151125 }, { "epoch": 3.6921310434124055, "grad_norm": 0.00439818948507309, "learning_rate": 3.8869224264487244e-07, "loss": 0.0, "num_input_tokens_seen": 101867984, "step": 151130 }, { "epoch": 3.6922531942442527, "grad_norm": 0.009379718452692032, "learning_rate": 3.886247567754151e-07, "loss": 0.0001, "num_input_tokens_seen": 101871440, "step": 151135 }, { "epoch": 3.6923753450761, "grad_norm": 0.001351098413579166, "learning_rate": 3.88557275352043e-07, "loss": 0.0001, "num_input_tokens_seen": 101875152, "step": 151140 }, { "epoch": 3.692497495907947, "grad_norm": 0.0009771710028871894, "learning_rate": 3.884897983752472e-07, "loss": 0.0, "num_input_tokens_seen": 101878800, "step": 151145 }, { "epoch": 3.6926196467397943, "grad_norm": 0.0007362841279245913, "learning_rate": 3.88422325845518e-07, "loss": 0.0, "num_input_tokens_seen": 101881872, "step": 151150 }, { "epoch": 3.6927417975716414, "grad_norm": 0.003438524901866913, "learning_rate": 3.88354857763346e-07, "loss": 0.0, "num_input_tokens_seen": 101885264, "step": 151155 }, { "epoch": 3.6928639484034886, "grad_norm": 0.0023364173248410225, "learning_rate": 3.882873941292221e-07, "loss": 0.0001, "num_input_tokens_seen": 101888208, "step": 151160 }, { "epoch": 3.692986099235336, "grad_norm": 0.0015807130839675665, "learning_rate": 3.8821993494363657e-07, "loss": 0.0, "num_input_tokens_seen": 101891088, "step": 151165 }, { "epoch": 3.693108250067183, "grad_norm": 0.0011053653433918953, "learning_rate": 3.881524802070806e-07, "loss": 0.0, "num_input_tokens_seen": 101894352, "step": 151170 }, { "epoch": 3.69323040089903, "grad_norm": 0.012983616441488266, "learning_rate": 3.880850299200439e-07, "loss": 0.0001, "num_input_tokens_seen": 101897232, "step": 151175 }, { "epoch": 3.6933525517308774, "grad_norm": 0.0016408859519287944, "learning_rate": 3.880175840830179e-07, "loss": 0.0, "num_input_tokens_seen": 101900368, "step": 151180 }, { "epoch": 3.6934747025627246, "grad_norm": 0.0003170320705976337, "learning_rate": 3.879501426964922e-07, "loss": 0.0, "num_input_tokens_seen": 101903504, "step": 151185 }, { "epoch": 3.6935968533945718, "grad_norm": 0.1406192183494568, "learning_rate": 3.8788270576095806e-07, "loss": 0.0004, "num_input_tokens_seen": 101906832, "step": 151190 }, { "epoch": 3.693719004226419, "grad_norm": 0.16041308641433716, "learning_rate": 3.8781527327690523e-07, "loss": 0.0, "num_input_tokens_seen": 101909968, "step": 151195 }, { "epoch": 3.6938411550582657, "grad_norm": 0.04108577221632004, "learning_rate": 3.8774784524482426e-07, "loss": 0.0, "num_input_tokens_seen": 101913296, "step": 151200 }, { "epoch": 3.6939633058901133, "grad_norm": 0.10066086798906326, "learning_rate": 3.876804216652061e-07, "loss": 0.0001, "num_input_tokens_seen": 101916688, "step": 151205 }, { "epoch": 3.69408545672196, "grad_norm": 0.015516743063926697, "learning_rate": 3.876130025385402e-07, "loss": 0.0001, "num_input_tokens_seen": 101920336, "step": 151210 }, { "epoch": 3.6942076075538073, "grad_norm": 0.010554834268987179, "learning_rate": 3.8754558786531767e-07, "loss": 0.0, "num_input_tokens_seen": 101923728, "step": 151215 }, { "epoch": 3.6943297583856545, "grad_norm": 0.020320434123277664, "learning_rate": 3.87478177646028e-07, "loss": 0.0, "num_input_tokens_seen": 101927312, "step": 151220 }, { "epoch": 3.6944519092175017, "grad_norm": 0.020632604137063026, "learning_rate": 3.87410771881162e-07, "loss": 0.0001, "num_input_tokens_seen": 101931216, "step": 151225 }, { "epoch": 3.694574060049349, "grad_norm": 0.0003973071579821408, "learning_rate": 3.8734337057120945e-07, "loss": 0.0245, "num_input_tokens_seen": 101934416, "step": 151230 }, { "epoch": 3.694696210881196, "grad_norm": 0.0013479648623615503, "learning_rate": 3.8727597371666067e-07, "loss": 0.0, "num_input_tokens_seen": 101937424, "step": 151235 }, { "epoch": 3.6948183617130432, "grad_norm": 0.1412498652935028, "learning_rate": 3.8720858131800605e-07, "loss": 0.0001, "num_input_tokens_seen": 101940816, "step": 151240 }, { "epoch": 3.6949405125448904, "grad_norm": 0.012594731524586678, "learning_rate": 3.8714119337573513e-07, "loss": 0.0, "num_input_tokens_seen": 101944464, "step": 151245 }, { "epoch": 3.6950626633767376, "grad_norm": 0.42286956310272217, "learning_rate": 3.8707380989033866e-07, "loss": 0.0001, "num_input_tokens_seen": 101947984, "step": 151250 }, { "epoch": 3.695184814208585, "grad_norm": 0.0012724720872938633, "learning_rate": 3.870064308623063e-07, "loss": 0.0001, "num_input_tokens_seen": 101951184, "step": 151255 }, { "epoch": 3.695306965040432, "grad_norm": 0.0007571643218398094, "learning_rate": 3.8693905629212775e-07, "loss": 0.0001, "num_input_tokens_seen": 101955024, "step": 151260 }, { "epoch": 3.695429115872279, "grad_norm": 0.002359704114496708, "learning_rate": 3.8687168618029366e-07, "loss": 0.0, "num_input_tokens_seen": 101958736, "step": 151265 }, { "epoch": 3.6955512667041264, "grad_norm": 0.0016779140569269657, "learning_rate": 3.8680432052729304e-07, "loss": 0.0, "num_input_tokens_seen": 101962448, "step": 151270 }, { "epoch": 3.6956734175359736, "grad_norm": 0.0044897920452058315, "learning_rate": 3.867369593336168e-07, "loss": 0.0, "num_input_tokens_seen": 101966736, "step": 151275 }, { "epoch": 3.6957955683678207, "grad_norm": 19.066856384277344, "learning_rate": 3.86669602599754e-07, "loss": 0.115, "num_input_tokens_seen": 101969808, "step": 151280 }, { "epoch": 3.6959177191996675, "grad_norm": 23.83672523498535, "learning_rate": 3.866022503261952e-07, "loss": 0.0538, "num_input_tokens_seen": 101973008, "step": 151285 }, { "epoch": 3.696039870031515, "grad_norm": 0.005208525341004133, "learning_rate": 3.8653490251342945e-07, "loss": 0.031, "num_input_tokens_seen": 101976272, "step": 151290 }, { "epoch": 3.696162020863362, "grad_norm": 0.011254152283072472, "learning_rate": 3.8646755916194685e-07, "loss": 0.0, "num_input_tokens_seen": 101979344, "step": 151295 }, { "epoch": 3.6962841716952095, "grad_norm": 0.007257348857820034, "learning_rate": 3.864002202722375e-07, "loss": 0.0001, "num_input_tokens_seen": 101982928, "step": 151300 }, { "epoch": 3.6964063225270563, "grad_norm": 0.059719085693359375, "learning_rate": 3.863328858447905e-07, "loss": 0.0, "num_input_tokens_seen": 101986064, "step": 151305 }, { "epoch": 3.6965284733589034, "grad_norm": 0.0017241544555872679, "learning_rate": 3.8626555588009614e-07, "loss": 0.0075, "num_input_tokens_seen": 101989136, "step": 151310 }, { "epoch": 3.6966506241907506, "grad_norm": 0.17825163900852203, "learning_rate": 3.861982303786434e-07, "loss": 0.0001, "num_input_tokens_seen": 101992272, "step": 151315 }, { "epoch": 3.696772775022598, "grad_norm": 0.0026821442879736423, "learning_rate": 3.861309093409222e-07, "loss": 0.0, "num_input_tokens_seen": 101995280, "step": 151320 }, { "epoch": 3.696894925854445, "grad_norm": 0.004103350918740034, "learning_rate": 3.860635927674225e-07, "loss": 0.0, "num_input_tokens_seen": 101998992, "step": 151325 }, { "epoch": 3.697017076686292, "grad_norm": 0.0009725447162054479, "learning_rate": 3.859962806586331e-07, "loss": 0.0, "num_input_tokens_seen": 102002448, "step": 151330 }, { "epoch": 3.6971392275181394, "grad_norm": 0.0006192554719746113, "learning_rate": 3.8592897301504436e-07, "loss": 0.0287, "num_input_tokens_seen": 102006032, "step": 151335 }, { "epoch": 3.6972613783499866, "grad_norm": 0.0013876816956326365, "learning_rate": 3.8586166983714475e-07, "loss": 0.0, "num_input_tokens_seen": 102009488, "step": 151340 }, { "epoch": 3.6973835291818338, "grad_norm": 0.06822085380554199, "learning_rate": 3.8579437112542476e-07, "loss": 0.0, "num_input_tokens_seen": 102012880, "step": 151345 }, { "epoch": 3.697505680013681, "grad_norm": 0.055710822343826294, "learning_rate": 3.857270768803729e-07, "loss": 0.0, "num_input_tokens_seen": 102016272, "step": 151350 }, { "epoch": 3.697627830845528, "grad_norm": 0.06808667629957199, "learning_rate": 3.856597871024794e-07, "loss": 0.0001, "num_input_tokens_seen": 102019536, "step": 151355 }, { "epoch": 3.6977499816773753, "grad_norm": 0.13779473304748535, "learning_rate": 3.855925017922327e-07, "loss": 0.0001, "num_input_tokens_seen": 102022800, "step": 151360 }, { "epoch": 3.6978721325092225, "grad_norm": 0.0018543327460065484, "learning_rate": 3.8552522095012296e-07, "loss": 0.0, "num_input_tokens_seen": 102026064, "step": 151365 }, { "epoch": 3.6979942833410693, "grad_norm": 0.004081390332430601, "learning_rate": 3.8545794457663903e-07, "loss": 0.0001, "num_input_tokens_seen": 102029136, "step": 151370 }, { "epoch": 3.698116434172917, "grad_norm": 0.0007932799635455012, "learning_rate": 3.8539067267227e-07, "loss": 0.0, "num_input_tokens_seen": 102032464, "step": 151375 }, { "epoch": 3.6982385850047637, "grad_norm": 0.06597576290369034, "learning_rate": 3.853234052375055e-07, "loss": 0.0, "num_input_tokens_seen": 102035920, "step": 151380 }, { "epoch": 3.6983607358366113, "grad_norm": 0.04877420514822006, "learning_rate": 3.852561422728343e-07, "loss": 0.0001, "num_input_tokens_seen": 102038992, "step": 151385 }, { "epoch": 3.698482886668458, "grad_norm": 0.005778729449957609, "learning_rate": 3.851888837787457e-07, "loss": 0.0, "num_input_tokens_seen": 102042064, "step": 151390 }, { "epoch": 3.6986050375003052, "grad_norm": 0.000778566172812134, "learning_rate": 3.851216297557294e-07, "loss": 0.0, "num_input_tokens_seen": 102046160, "step": 151395 }, { "epoch": 3.6987271883321524, "grad_norm": 0.006441682111471891, "learning_rate": 3.850543802042735e-07, "loss": 0.0, "num_input_tokens_seen": 102049680, "step": 151400 }, { "epoch": 3.6988493391639996, "grad_norm": 0.037144798785448074, "learning_rate": 3.849871351248679e-07, "loss": 0.0, "num_input_tokens_seen": 102053392, "step": 151405 }, { "epoch": 3.698971489995847, "grad_norm": 0.00173321389593184, "learning_rate": 3.84919894518001e-07, "loss": 0.0, "num_input_tokens_seen": 102056784, "step": 151410 }, { "epoch": 3.699093640827694, "grad_norm": 0.09905073791742325, "learning_rate": 3.84852658384162e-07, "loss": 0.0008, "num_input_tokens_seen": 102060048, "step": 151415 }, { "epoch": 3.699215791659541, "grad_norm": 0.006596543826162815, "learning_rate": 3.847854267238403e-07, "loss": 0.0, "num_input_tokens_seen": 102063696, "step": 151420 }, { "epoch": 3.6993379424913884, "grad_norm": 0.01068484503775835, "learning_rate": 3.8471819953752404e-07, "loss": 0.0, "num_input_tokens_seen": 102067408, "step": 151425 }, { "epoch": 3.6994600933232356, "grad_norm": 23.218570709228516, "learning_rate": 3.84650976825703e-07, "loss": 0.0557, "num_input_tokens_seen": 102070672, "step": 151430 }, { "epoch": 3.6995822441550827, "grad_norm": 0.026982801035046577, "learning_rate": 3.8458375858886513e-07, "loss": 0.0, "num_input_tokens_seen": 102073872, "step": 151435 }, { "epoch": 3.69970439498693, "grad_norm": 0.0017854716861620545, "learning_rate": 3.8451654482750006e-07, "loss": 0.0, "num_input_tokens_seen": 102077200, "step": 151440 }, { "epoch": 3.699826545818777, "grad_norm": 0.0003598171751946211, "learning_rate": 3.844493355420958e-07, "loss": 0.0, "num_input_tokens_seen": 102080400, "step": 151445 }, { "epoch": 3.6999486966506243, "grad_norm": 0.0013809788506478071, "learning_rate": 3.8438213073314164e-07, "loss": 0.0, "num_input_tokens_seen": 102083792, "step": 151450 }, { "epoch": 3.7000708474824715, "grad_norm": 0.00033191696275025606, "learning_rate": 3.843149304011265e-07, "loss": 0.0001, "num_input_tokens_seen": 102087120, "step": 151455 }, { "epoch": 3.7001929983143187, "grad_norm": 0.0005849574226886034, "learning_rate": 3.842477345465388e-07, "loss": 0.0, "num_input_tokens_seen": 102090384, "step": 151460 }, { "epoch": 3.7003151491461654, "grad_norm": 0.0002934445801656693, "learning_rate": 3.841805431698669e-07, "loss": 0.0, "num_input_tokens_seen": 102093904, "step": 151465 }, { "epoch": 3.700437299978013, "grad_norm": 0.0008289655088447034, "learning_rate": 3.841133562716e-07, "loss": 0.0, "num_input_tokens_seen": 102097104, "step": 151470 }, { "epoch": 3.70055945080986, "grad_norm": 0.031241346150636673, "learning_rate": 3.8404617385222615e-07, "loss": 0.0168, "num_input_tokens_seen": 102100432, "step": 151475 }, { "epoch": 3.7006816016417075, "grad_norm": 0.0025893929414451122, "learning_rate": 3.839789959122345e-07, "loss": 0.0, "num_input_tokens_seen": 102104080, "step": 151480 }, { "epoch": 3.700803752473554, "grad_norm": 0.0004473034350667149, "learning_rate": 3.8391182245211283e-07, "loss": 0.0, "num_input_tokens_seen": 102107216, "step": 151485 }, { "epoch": 3.7009259033054014, "grad_norm": 0.0008554542437195778, "learning_rate": 3.8384465347235064e-07, "loss": 0.0, "num_input_tokens_seen": 102110608, "step": 151490 }, { "epoch": 3.7010480541372486, "grad_norm": 0.01684235781431198, "learning_rate": 3.837774889734353e-07, "loss": 0.0, "num_input_tokens_seen": 102114320, "step": 151495 }, { "epoch": 3.7011702049690958, "grad_norm": 0.37367013096809387, "learning_rate": 3.837103289558563e-07, "loss": 0.0001, "num_input_tokens_seen": 102118352, "step": 151500 }, { "epoch": 3.701292355800943, "grad_norm": 0.0028496081940829754, "learning_rate": 3.836431734201012e-07, "loss": 0.0, "num_input_tokens_seen": 102121552, "step": 151505 }, { "epoch": 3.70141450663279, "grad_norm": 0.4270290732383728, "learning_rate": 3.8357602236665867e-07, "loss": 0.0007, "num_input_tokens_seen": 102124944, "step": 151510 }, { "epoch": 3.7015366574646373, "grad_norm": 0.004349404014647007, "learning_rate": 3.8350887579601744e-07, "loss": 0.0, "num_input_tokens_seen": 102128464, "step": 151515 }, { "epoch": 3.7016588082964845, "grad_norm": 0.0028605188708752394, "learning_rate": 3.834417337086652e-07, "loss": 0.0, "num_input_tokens_seen": 102131664, "step": 151520 }, { "epoch": 3.7017809591283317, "grad_norm": 0.0024505862966179848, "learning_rate": 3.833745961050908e-07, "loss": 0.0, "num_input_tokens_seen": 102135120, "step": 151525 }, { "epoch": 3.701903109960179, "grad_norm": 0.0017270263051614165, "learning_rate": 3.833074629857819e-07, "loss": 0.0001, "num_input_tokens_seen": 102138768, "step": 151530 }, { "epoch": 3.702025260792026, "grad_norm": 0.002492433413863182, "learning_rate": 3.8324033435122727e-07, "loss": 0.0003, "num_input_tokens_seen": 102141968, "step": 151535 }, { "epoch": 3.7021474116238733, "grad_norm": 0.005628492683172226, "learning_rate": 3.831732102019145e-07, "loss": 0.0282, "num_input_tokens_seen": 102145616, "step": 151540 }, { "epoch": 3.7022695624557205, "grad_norm": 0.0043680984526872635, "learning_rate": 3.83106090538332e-07, "loss": 0.0002, "num_input_tokens_seen": 102148944, "step": 151545 }, { "epoch": 3.702391713287567, "grad_norm": 4.06487743020989e-05, "learning_rate": 3.830389753609684e-07, "loss": 0.0, "num_input_tokens_seen": 102152336, "step": 151550 }, { "epoch": 3.702513864119415, "grad_norm": 0.0013830072712153196, "learning_rate": 3.8297186467031083e-07, "loss": 0.0, "num_input_tokens_seen": 102155600, "step": 151555 }, { "epoch": 3.7026360149512616, "grad_norm": 0.0003124834911432117, "learning_rate": 3.829047584668483e-07, "loss": 0.0, "num_input_tokens_seen": 102158928, "step": 151560 }, { "epoch": 3.7027581657831092, "grad_norm": 0.006732202600687742, "learning_rate": 3.8283765675106795e-07, "loss": 0.0, "num_input_tokens_seen": 102161808, "step": 151565 }, { "epoch": 3.702880316614956, "grad_norm": 0.0012146084336563945, "learning_rate": 3.8277055952345847e-07, "loss": 0.0, "num_input_tokens_seen": 102165008, "step": 151570 }, { "epoch": 3.703002467446803, "grad_norm": 0.0025280537083745003, "learning_rate": 3.827034667845075e-07, "loss": 0.0001, "num_input_tokens_seen": 102168592, "step": 151575 }, { "epoch": 3.7031246182786504, "grad_norm": 0.008870430290699005, "learning_rate": 3.8263637853470266e-07, "loss": 0.0, "num_input_tokens_seen": 102172368, "step": 151580 }, { "epoch": 3.7032467691104975, "grad_norm": 0.11202926933765411, "learning_rate": 3.825692947745324e-07, "loss": 0.0, "num_input_tokens_seen": 102175632, "step": 151585 }, { "epoch": 3.7033689199423447, "grad_norm": 0.0003817217075265944, "learning_rate": 3.82502215504484e-07, "loss": 0.0, "num_input_tokens_seen": 102179024, "step": 151590 }, { "epoch": 3.703491070774192, "grad_norm": 0.18531584739685059, "learning_rate": 3.824351407250459e-07, "loss": 0.0, "num_input_tokens_seen": 102182352, "step": 151595 }, { "epoch": 3.703613221606039, "grad_norm": 0.0019869126845151186, "learning_rate": 3.823680704367053e-07, "loss": 0.0, "num_input_tokens_seen": 102185808, "step": 151600 }, { "epoch": 3.7037353724378863, "grad_norm": 0.0009062121971510351, "learning_rate": 3.823010046399501e-07, "loss": 0.0, "num_input_tokens_seen": 102189328, "step": 151605 }, { "epoch": 3.7038575232697335, "grad_norm": 0.00044265558244660497, "learning_rate": 3.8223394333526846e-07, "loss": 0.0, "num_input_tokens_seen": 102192784, "step": 151610 }, { "epoch": 3.7039796741015807, "grad_norm": 0.10403814166784286, "learning_rate": 3.8216688652314753e-07, "loss": 0.0, "num_input_tokens_seen": 102196240, "step": 151615 }, { "epoch": 3.704101824933428, "grad_norm": 0.00029964270652271807, "learning_rate": 3.8209983420407543e-07, "loss": 0.0, "num_input_tokens_seen": 102199312, "step": 151620 }, { "epoch": 3.704223975765275, "grad_norm": 0.0005833769682794809, "learning_rate": 3.820327863785392e-07, "loss": 0.0001, "num_input_tokens_seen": 102202832, "step": 151625 }, { "epoch": 3.7043461265971223, "grad_norm": 77.15211486816406, "learning_rate": 3.819657430470271e-07, "loss": 0.1065, "num_input_tokens_seen": 102206032, "step": 151630 }, { "epoch": 3.7044682774289694, "grad_norm": 0.0013740085996687412, "learning_rate": 3.81898704210026e-07, "loss": 0.0, "num_input_tokens_seen": 102209296, "step": 151635 }, { "epoch": 3.7045904282608166, "grad_norm": 0.002347599947825074, "learning_rate": 3.8183166986802384e-07, "loss": 0.0, "num_input_tokens_seen": 102213008, "step": 151640 }, { "epoch": 3.7047125790926634, "grad_norm": 0.0009971833787858486, "learning_rate": 3.8176464002150833e-07, "loss": 0.0, "num_input_tokens_seen": 102216528, "step": 151645 }, { "epoch": 3.704834729924511, "grad_norm": 0.0005341061041690409, "learning_rate": 3.816976146709663e-07, "loss": 0.0, "num_input_tokens_seen": 102219920, "step": 151650 }, { "epoch": 3.7049568807563578, "grad_norm": 0.00022218092635739595, "learning_rate": 3.8163059381688587e-07, "loss": 0.0, "num_input_tokens_seen": 102222992, "step": 151655 }, { "epoch": 3.705079031588205, "grad_norm": 0.00023510765458922833, "learning_rate": 3.8156357745975375e-07, "loss": 0.113, "num_input_tokens_seen": 102226640, "step": 151660 }, { "epoch": 3.705201182420052, "grad_norm": 0.0005348159465938807, "learning_rate": 3.8149656560005794e-07, "loss": 0.0202, "num_input_tokens_seen": 102229840, "step": 151665 }, { "epoch": 3.7053233332518993, "grad_norm": 0.00038441031938418746, "learning_rate": 3.8142955823828517e-07, "loss": 0.0, "num_input_tokens_seen": 102233616, "step": 151670 }, { "epoch": 3.7054454840837465, "grad_norm": 0.0006266786367632449, "learning_rate": 3.8136255537492333e-07, "loss": 0.0, "num_input_tokens_seen": 102237008, "step": 151675 }, { "epoch": 3.7055676349155937, "grad_norm": 0.0018625278025865555, "learning_rate": 3.8129555701045936e-07, "loss": 0.0, "num_input_tokens_seen": 102240080, "step": 151680 }, { "epoch": 3.705689785747441, "grad_norm": 0.0017500559333711863, "learning_rate": 3.812285631453802e-07, "loss": 0.0, "num_input_tokens_seen": 102243664, "step": 151685 }, { "epoch": 3.705811936579288, "grad_norm": 0.00026514322962611914, "learning_rate": 3.8116157378017377e-07, "loss": 0.0, "num_input_tokens_seen": 102247184, "step": 151690 }, { "epoch": 3.7059340874111353, "grad_norm": 50.65078353881836, "learning_rate": 3.810945889153264e-07, "loss": 0.0005, "num_input_tokens_seen": 102250704, "step": 151695 }, { "epoch": 3.7060562382429825, "grad_norm": 0.0005604383768513799, "learning_rate": 3.8102760855132567e-07, "loss": 0.0, "num_input_tokens_seen": 102254352, "step": 151700 }, { "epoch": 3.7061783890748297, "grad_norm": 0.0015173493884503841, "learning_rate": 3.8096063268865897e-07, "loss": 0.0696, "num_input_tokens_seen": 102258192, "step": 151705 }, { "epoch": 3.706300539906677, "grad_norm": 0.00023253409017343074, "learning_rate": 3.8089366132781277e-07, "loss": 0.0, "num_input_tokens_seen": 102261648, "step": 151710 }, { "epoch": 3.706422690738524, "grad_norm": 0.002127137267962098, "learning_rate": 3.808266944692746e-07, "loss": 0.0001, "num_input_tokens_seen": 102265296, "step": 151715 }, { "epoch": 3.7065448415703712, "grad_norm": 0.002646263688802719, "learning_rate": 3.8075973211353107e-07, "loss": 0.085, "num_input_tokens_seen": 102268624, "step": 151720 }, { "epoch": 3.7066669924022184, "grad_norm": 0.003195775207132101, "learning_rate": 3.8069277426106917e-07, "loss": 0.0, "num_input_tokens_seen": 102272208, "step": 151725 }, { "epoch": 3.706789143234065, "grad_norm": 0.0419430248439312, "learning_rate": 3.8062582091237637e-07, "loss": 0.0, "num_input_tokens_seen": 102275728, "step": 151730 }, { "epoch": 3.706911294065913, "grad_norm": 68.80615234375, "learning_rate": 3.805588720679389e-07, "loss": 0.044, "num_input_tokens_seen": 102279120, "step": 151735 }, { "epoch": 3.7070334448977595, "grad_norm": 0.002701559802517295, "learning_rate": 3.8049192772824435e-07, "loss": 0.0, "num_input_tokens_seen": 102282192, "step": 151740 }, { "epoch": 3.707155595729607, "grad_norm": 0.0026187242474406958, "learning_rate": 3.8042498789377863e-07, "loss": 0.0, "num_input_tokens_seen": 102285392, "step": 151745 }, { "epoch": 3.707277746561454, "grad_norm": 0.0021011237986385822, "learning_rate": 3.803580525650294e-07, "loss": 0.0, "num_input_tokens_seen": 102289168, "step": 151750 }, { "epoch": 3.707399897393301, "grad_norm": 0.0002951928472612053, "learning_rate": 3.802911217424828e-07, "loss": 0.0001, "num_input_tokens_seen": 102292432, "step": 151755 }, { "epoch": 3.7075220482251483, "grad_norm": 0.10593866556882858, "learning_rate": 3.8022419542662585e-07, "loss": 0.0, "num_input_tokens_seen": 102295696, "step": 151760 }, { "epoch": 3.7076441990569955, "grad_norm": 0.014959882013499737, "learning_rate": 3.8015727361794547e-07, "loss": 0.0, "num_input_tokens_seen": 102299152, "step": 151765 }, { "epoch": 3.7077663498888427, "grad_norm": 0.00046182944788597524, "learning_rate": 3.800903563169283e-07, "loss": 0.0, "num_input_tokens_seen": 102302608, "step": 151770 }, { "epoch": 3.70788850072069, "grad_norm": 0.38408032059669495, "learning_rate": 3.800234435240604e-07, "loss": 0.0001, "num_input_tokens_seen": 102305680, "step": 151775 }, { "epoch": 3.708010651552537, "grad_norm": 0.0023694150149822235, "learning_rate": 3.7995653523982896e-07, "loss": 0.0, "num_input_tokens_seen": 102308752, "step": 151780 }, { "epoch": 3.7081328023843843, "grad_norm": 0.0032163665164262056, "learning_rate": 3.7988963146472053e-07, "loss": 0.0002, "num_input_tokens_seen": 102312080, "step": 151785 }, { "epoch": 3.7082549532162314, "grad_norm": 0.0014809854328632355, "learning_rate": 3.798227321992211e-07, "loss": 0.0, "num_input_tokens_seen": 102315472, "step": 151790 }, { "epoch": 3.7083771040480786, "grad_norm": 0.002615902107208967, "learning_rate": 3.7975583744381757e-07, "loss": 0.0443, "num_input_tokens_seen": 102318800, "step": 151795 }, { "epoch": 3.708499254879926, "grad_norm": 82.20895385742188, "learning_rate": 3.796889471989967e-07, "loss": 0.0822, "num_input_tokens_seen": 102321808, "step": 151800 }, { "epoch": 3.708621405711773, "grad_norm": 6.49347566650249e-05, "learning_rate": 3.7962206146524435e-07, "loss": 0.0, "num_input_tokens_seen": 102325200, "step": 151805 }, { "epoch": 3.70874355654362, "grad_norm": 0.043628934770822525, "learning_rate": 3.7955518024304756e-07, "loss": 0.0, "num_input_tokens_seen": 102328784, "step": 151810 }, { "epoch": 3.7088657073754674, "grad_norm": 0.0645923987030983, "learning_rate": 3.794883035328921e-07, "loss": 0.0, "num_input_tokens_seen": 102331984, "step": 151815 }, { "epoch": 3.7089878582073146, "grad_norm": 0.0026498925872147083, "learning_rate": 3.794214313352646e-07, "loss": 0.0001, "num_input_tokens_seen": 102335248, "step": 151820 }, { "epoch": 3.7091100090391613, "grad_norm": 0.000418938638176769, "learning_rate": 3.7935456365065164e-07, "loss": 0.0001, "num_input_tokens_seen": 102338512, "step": 151825 }, { "epoch": 3.709232159871009, "grad_norm": 0.0551934652030468, "learning_rate": 3.7928770047953883e-07, "loss": 0.0001, "num_input_tokens_seen": 102342032, "step": 151830 }, { "epoch": 3.7093543107028557, "grad_norm": 0.04365779459476471, "learning_rate": 3.792208418224133e-07, "loss": 0.0, "num_input_tokens_seen": 102345744, "step": 151835 }, { "epoch": 3.709476461534703, "grad_norm": 0.005952696315944195, "learning_rate": 3.7915398767976037e-07, "loss": 0.0, "num_input_tokens_seen": 102348752, "step": 151840 }, { "epoch": 3.70959861236655, "grad_norm": 39.83332061767578, "learning_rate": 3.7908713805206694e-07, "loss": 0.0631, "num_input_tokens_seen": 102351888, "step": 151845 }, { "epoch": 3.7097207631983973, "grad_norm": 0.0012893755920231342, "learning_rate": 3.7902029293981854e-07, "loss": 0.0, "num_input_tokens_seen": 102355472, "step": 151850 }, { "epoch": 3.7098429140302445, "grad_norm": 0.00010355439007980749, "learning_rate": 3.7895345234350163e-07, "loss": 0.0001, "num_input_tokens_seen": 102358800, "step": 151855 }, { "epoch": 3.7099650648620917, "grad_norm": 0.024188674986362457, "learning_rate": 3.788866162636025e-07, "loss": 0.0, "num_input_tokens_seen": 102361872, "step": 151860 }, { "epoch": 3.710087215693939, "grad_norm": 0.0009943852201104164, "learning_rate": 3.788197847006067e-07, "loss": 0.0, "num_input_tokens_seen": 102365136, "step": 151865 }, { "epoch": 3.710209366525786, "grad_norm": 0.09669873118400574, "learning_rate": 3.787529576550008e-07, "loss": 0.0, "num_input_tokens_seen": 102368400, "step": 151870 }, { "epoch": 3.7103315173576332, "grad_norm": 0.012280414812266827, "learning_rate": 3.786861351272702e-07, "loss": 0.0, "num_input_tokens_seen": 102371728, "step": 151875 }, { "epoch": 3.7104536681894804, "grad_norm": 0.00038101509562693536, "learning_rate": 3.786193171179014e-07, "loss": 0.0, "num_input_tokens_seen": 102374992, "step": 151880 }, { "epoch": 3.7105758190213276, "grad_norm": 0.9798285961151123, "learning_rate": 3.7855250362738014e-07, "loss": 0.0004, "num_input_tokens_seen": 102378576, "step": 151885 }, { "epoch": 3.710697969853175, "grad_norm": 0.00015900557627901435, "learning_rate": 3.7848569465619187e-07, "loss": 0.0, "num_input_tokens_seen": 102381840, "step": 151890 }, { "epoch": 3.710820120685022, "grad_norm": 0.001267539686523378, "learning_rate": 3.7841889020482307e-07, "loss": 0.0, "num_input_tokens_seen": 102385232, "step": 151895 }, { "epoch": 3.710942271516869, "grad_norm": 0.0014602263690903783, "learning_rate": 3.7835209027375894e-07, "loss": 0.0, "num_input_tokens_seen": 102388304, "step": 151900 }, { "epoch": 3.7110644223487164, "grad_norm": 0.00064032559748739, "learning_rate": 3.7828529486348604e-07, "loss": 0.0384, "num_input_tokens_seen": 102391632, "step": 151905 }, { "epoch": 3.711186573180563, "grad_norm": 0.006628748960793018, "learning_rate": 3.782185039744893e-07, "loss": 0.0, "num_input_tokens_seen": 102394768, "step": 151910 }, { "epoch": 3.7113087240124107, "grad_norm": 0.0025127448607236147, "learning_rate": 3.781517176072548e-07, "loss": 0.0, "num_input_tokens_seen": 102397840, "step": 151915 }, { "epoch": 3.7114308748442575, "grad_norm": 0.0008406085544265807, "learning_rate": 3.7808493576226863e-07, "loss": 0.0, "num_input_tokens_seen": 102401104, "step": 151920 }, { "epoch": 3.711553025676105, "grad_norm": 0.0004238552937749773, "learning_rate": 3.780181584400158e-07, "loss": 0.0293, "num_input_tokens_seen": 102404752, "step": 151925 }, { "epoch": 3.711675176507952, "grad_norm": 0.22442388534545898, "learning_rate": 3.779513856409825e-07, "loss": 0.0, "num_input_tokens_seen": 102407952, "step": 151930 }, { "epoch": 3.711797327339799, "grad_norm": 0.034784946590662, "learning_rate": 3.778846173656538e-07, "loss": 0.0002, "num_input_tokens_seen": 102411088, "step": 151935 }, { "epoch": 3.7119194781716462, "grad_norm": 0.0021735504269599915, "learning_rate": 3.778178536145157e-07, "loss": 0.0, "num_input_tokens_seen": 102414096, "step": 151940 }, { "epoch": 3.7120416290034934, "grad_norm": 0.004135854076594114, "learning_rate": 3.777510943880532e-07, "loss": 0.0009, "num_input_tokens_seen": 102417168, "step": 151945 }, { "epoch": 3.7121637798353406, "grad_norm": 0.00016273342771455646, "learning_rate": 3.776843396867522e-07, "loss": 0.0, "num_input_tokens_seen": 102420752, "step": 151950 }, { "epoch": 3.712285930667188, "grad_norm": 0.0016355100087821484, "learning_rate": 3.7761758951109836e-07, "loss": 0.0, "num_input_tokens_seen": 102424144, "step": 151955 }, { "epoch": 3.712408081499035, "grad_norm": 0.003920779097825289, "learning_rate": 3.7755084386157643e-07, "loss": 0.0, "num_input_tokens_seen": 102427280, "step": 151960 }, { "epoch": 3.712530232330882, "grad_norm": 0.0015011931536719203, "learning_rate": 3.7748410273867247e-07, "loss": 0.0, "num_input_tokens_seen": 102431184, "step": 151965 }, { "epoch": 3.7126523831627294, "grad_norm": 0.0028277873061597347, "learning_rate": 3.7741736614287135e-07, "loss": 0.0, "num_input_tokens_seen": 102434448, "step": 151970 }, { "epoch": 3.7127745339945766, "grad_norm": 0.00037950114347040653, "learning_rate": 3.7735063407465886e-07, "loss": 0.0001, "num_input_tokens_seen": 102437648, "step": 151975 }, { "epoch": 3.7128966848264238, "grad_norm": 0.00014305794320534915, "learning_rate": 3.772839065345197e-07, "loss": 0.0, "num_input_tokens_seen": 102440912, "step": 151980 }, { "epoch": 3.713018835658271, "grad_norm": 0.00024351131287403405, "learning_rate": 3.7721718352293976e-07, "loss": 0.0004, "num_input_tokens_seen": 102444048, "step": 151985 }, { "epoch": 3.713140986490118, "grad_norm": 0.0007917290786281228, "learning_rate": 3.7715046504040406e-07, "loss": 0.0, "num_input_tokens_seen": 102447568, "step": 151990 }, { "epoch": 3.713263137321965, "grad_norm": 0.0008913589990697801, "learning_rate": 3.770837510873972e-07, "loss": 0.0, "num_input_tokens_seen": 102450960, "step": 151995 }, { "epoch": 3.7133852881538125, "grad_norm": 4.2157516872975975e-05, "learning_rate": 3.770170416644054e-07, "loss": 0.0, "num_input_tokens_seen": 102454928, "step": 152000 }, { "epoch": 3.7135074389856593, "grad_norm": 0.003875978058204055, "learning_rate": 3.7695033677191277e-07, "loss": 0.0, "num_input_tokens_seen": 102458384, "step": 152005 }, { "epoch": 3.713629589817507, "grad_norm": 27.588912963867188, "learning_rate": 3.7688363641040486e-07, "loss": 0.0359, "num_input_tokens_seen": 102461648, "step": 152010 }, { "epoch": 3.7137517406493536, "grad_norm": 0.00013868967653252184, "learning_rate": 3.7681694058036715e-07, "loss": 0.0, "num_input_tokens_seen": 102465488, "step": 152015 }, { "epoch": 3.713873891481201, "grad_norm": 0.001578146475367248, "learning_rate": 3.7675024928228393e-07, "loss": 0.0, "num_input_tokens_seen": 102469008, "step": 152020 }, { "epoch": 3.713996042313048, "grad_norm": 0.0010178780648857355, "learning_rate": 3.7668356251664077e-07, "loss": 0.0, "num_input_tokens_seen": 102472656, "step": 152025 }, { "epoch": 3.714118193144895, "grad_norm": 0.00019512797007337213, "learning_rate": 3.766168802839221e-07, "loss": 0.0, "num_input_tokens_seen": 102476112, "step": 152030 }, { "epoch": 3.7142403439767424, "grad_norm": 0.7737483382225037, "learning_rate": 3.765502025846132e-07, "loss": 0.0001, "num_input_tokens_seen": 102479440, "step": 152035 }, { "epoch": 3.7143624948085896, "grad_norm": 0.0016303518787026405, "learning_rate": 3.7648352941919924e-07, "loss": 0.0, "num_input_tokens_seen": 102482576, "step": 152040 }, { "epoch": 3.714484645640437, "grad_norm": 0.0010339925065636635, "learning_rate": 3.764168607881644e-07, "loss": 0.0001, "num_input_tokens_seen": 102485584, "step": 152045 }, { "epoch": 3.714606796472284, "grad_norm": 0.0014051726320758462, "learning_rate": 3.763501966919942e-07, "loss": 0.0001, "num_input_tokens_seen": 102488848, "step": 152050 }, { "epoch": 3.714728947304131, "grad_norm": 0.0021041228901594877, "learning_rate": 3.762835371311728e-07, "loss": 0.1307, "num_input_tokens_seen": 102492176, "step": 152055 }, { "epoch": 3.7148510981359784, "grad_norm": 0.01120474748313427, "learning_rate": 3.762168821061856e-07, "loss": 0.0003, "num_input_tokens_seen": 102495184, "step": 152060 }, { "epoch": 3.7149732489678255, "grad_norm": 0.00018623789947014302, "learning_rate": 3.761502316175167e-07, "loss": 0.0, "num_input_tokens_seen": 102498768, "step": 152065 }, { "epoch": 3.7150953997996727, "grad_norm": 0.0017337838653475046, "learning_rate": 3.760835856656511e-07, "loss": 0.0, "num_input_tokens_seen": 102501712, "step": 152070 }, { "epoch": 3.71521755063152, "grad_norm": 0.0008111335337162018, "learning_rate": 3.760169442510738e-07, "loss": 0.0, "num_input_tokens_seen": 102505360, "step": 152075 }, { "epoch": 3.715339701463367, "grad_norm": 0.2772418260574341, "learning_rate": 3.7595030737426916e-07, "loss": 0.062, "num_input_tokens_seen": 102508560, "step": 152080 }, { "epoch": 3.7154618522952143, "grad_norm": 0.00469400966539979, "learning_rate": 3.758836750357213e-07, "loss": 0.0, "num_input_tokens_seen": 102511824, "step": 152085 }, { "epoch": 3.715584003127061, "grad_norm": 0.0017776049207895994, "learning_rate": 3.758170472359156e-07, "loss": 0.0, "num_input_tokens_seen": 102515024, "step": 152090 }, { "epoch": 3.7157061539589087, "grad_norm": 0.002591691678389907, "learning_rate": 3.7575042397533627e-07, "loss": 0.0008, "num_input_tokens_seen": 102518096, "step": 152095 }, { "epoch": 3.7158283047907554, "grad_norm": 0.00019121443619951606, "learning_rate": 3.756838052544674e-07, "loss": 0.0433, "num_input_tokens_seen": 102521488, "step": 152100 }, { "epoch": 3.715950455622603, "grad_norm": 0.0015384487342089415, "learning_rate": 3.756171910737938e-07, "loss": 0.0, "num_input_tokens_seen": 102525008, "step": 152105 }, { "epoch": 3.71607260645445, "grad_norm": 0.011667652055621147, "learning_rate": 3.7555058143380024e-07, "loss": 0.0, "num_input_tokens_seen": 102528016, "step": 152110 }, { "epoch": 3.716194757286297, "grad_norm": 0.00015528489893767983, "learning_rate": 3.754839763349704e-07, "loss": 0.0, "num_input_tokens_seen": 102531536, "step": 152115 }, { "epoch": 3.716316908118144, "grad_norm": 0.0011144173331558704, "learning_rate": 3.7541737577778956e-07, "loss": 0.0, "num_input_tokens_seen": 102535120, "step": 152120 }, { "epoch": 3.7164390589499914, "grad_norm": 0.00029413026641122997, "learning_rate": 3.753507797627412e-07, "loss": 0.0, "num_input_tokens_seen": 102538640, "step": 152125 }, { "epoch": 3.7165612097818386, "grad_norm": 0.00037484162021428347, "learning_rate": 3.7528418829030986e-07, "loss": 0.0, "num_input_tokens_seen": 102541904, "step": 152130 }, { "epoch": 3.7166833606136858, "grad_norm": 0.004265445750206709, "learning_rate": 3.752176013609804e-07, "loss": 0.0, "num_input_tokens_seen": 102545360, "step": 152135 }, { "epoch": 3.716805511445533, "grad_norm": 0.0016600849339738488, "learning_rate": 3.7515101897523616e-07, "loss": 0.0, "num_input_tokens_seen": 102548944, "step": 152140 }, { "epoch": 3.71692766227738, "grad_norm": 0.0005052868509665132, "learning_rate": 3.750844411335622e-07, "loss": 0.0, "num_input_tokens_seen": 102552464, "step": 152145 }, { "epoch": 3.7170498131092273, "grad_norm": 0.0002263938804389909, "learning_rate": 3.7501786783644183e-07, "loss": 0.0, "num_input_tokens_seen": 102556048, "step": 152150 }, { "epoch": 3.7171719639410745, "grad_norm": 0.0004822145856451243, "learning_rate": 3.7495129908436e-07, "loss": 0.0, "num_input_tokens_seen": 102559504, "step": 152155 }, { "epoch": 3.7172941147729217, "grad_norm": 0.01699310727417469, "learning_rate": 3.7488473487780004e-07, "loss": 0.0001, "num_input_tokens_seen": 102562768, "step": 152160 }, { "epoch": 3.717416265604769, "grad_norm": 8.816047920845449e-05, "learning_rate": 3.7481817521724655e-07, "loss": 0.0, "num_input_tokens_seen": 102565904, "step": 152165 }, { "epoch": 3.717538416436616, "grad_norm": 0.003461300628259778, "learning_rate": 3.7475162010318374e-07, "loss": 0.0, "num_input_tokens_seen": 102569232, "step": 152170 }, { "epoch": 3.717660567268463, "grad_norm": 0.00039457378443330526, "learning_rate": 3.746850695360949e-07, "loss": 0.0, "num_input_tokens_seen": 102572496, "step": 152175 }, { "epoch": 3.7177827181003105, "grad_norm": 0.0001832624984672293, "learning_rate": 3.7461852351646483e-07, "loss": 0.0, "num_input_tokens_seen": 102576016, "step": 152180 }, { "epoch": 3.717904868932157, "grad_norm": 0.0015027448534965515, "learning_rate": 3.745519820447768e-07, "loss": 0.0002, "num_input_tokens_seen": 102579216, "step": 152185 }, { "epoch": 3.718027019764005, "grad_norm": 0.0005207133945077658, "learning_rate": 3.7448544512151514e-07, "loss": 0.0234, "num_input_tokens_seen": 102582480, "step": 152190 }, { "epoch": 3.7181491705958516, "grad_norm": 0.0003728516458068043, "learning_rate": 3.7441891274716375e-07, "loss": 0.0, "num_input_tokens_seen": 102586064, "step": 152195 }, { "epoch": 3.718271321427699, "grad_norm": 0.00019803232862614095, "learning_rate": 3.743523849222059e-07, "loss": 0.0, "num_input_tokens_seen": 102589456, "step": 152200 }, { "epoch": 3.718393472259546, "grad_norm": 0.008787796832621098, "learning_rate": 3.7428586164712604e-07, "loss": 0.0, "num_input_tokens_seen": 102592976, "step": 152205 }, { "epoch": 3.718515623091393, "grad_norm": 0.00048126160982064903, "learning_rate": 3.742193429224074e-07, "loss": 0.0, "num_input_tokens_seen": 102596112, "step": 152210 }, { "epoch": 3.7186377739232404, "grad_norm": 0.001404481241479516, "learning_rate": 3.741528287485344e-07, "loss": 0.0, "num_input_tokens_seen": 102599504, "step": 152215 }, { "epoch": 3.7187599247550875, "grad_norm": 0.0026575650554150343, "learning_rate": 3.7408631912599e-07, "loss": 0.0, "num_input_tokens_seen": 102602640, "step": 152220 }, { "epoch": 3.7188820755869347, "grad_norm": 0.013400964438915253, "learning_rate": 3.740198140552582e-07, "loss": 0.0, "num_input_tokens_seen": 102606544, "step": 152225 }, { "epoch": 3.719004226418782, "grad_norm": 0.0002321565116290003, "learning_rate": 3.7395331353682305e-07, "loss": 0.0, "num_input_tokens_seen": 102609488, "step": 152230 }, { "epoch": 3.719126377250629, "grad_norm": 0.00038578774547204375, "learning_rate": 3.7388681757116736e-07, "loss": 0.0, "num_input_tokens_seen": 102612688, "step": 152235 }, { "epoch": 3.7192485280824763, "grad_norm": 51.554908752441406, "learning_rate": 3.7382032615877554e-07, "loss": 0.0631, "num_input_tokens_seen": 102616528, "step": 152240 }, { "epoch": 3.7193706789143235, "grad_norm": 0.002198802540078759, "learning_rate": 3.7375383930013037e-07, "loss": 0.0462, "num_input_tokens_seen": 102619536, "step": 152245 }, { "epoch": 3.7194928297461707, "grad_norm": 0.000260370085015893, "learning_rate": 3.736873569957162e-07, "loss": 0.0383, "num_input_tokens_seen": 102622608, "step": 152250 }, { "epoch": 3.719614980578018, "grad_norm": 0.010937336832284927, "learning_rate": 3.736208792460156e-07, "loss": 0.0, "num_input_tokens_seen": 102626320, "step": 152255 }, { "epoch": 3.719737131409865, "grad_norm": 0.00016129412688314915, "learning_rate": 3.7355440605151236e-07, "loss": 0.0414, "num_input_tokens_seen": 102629968, "step": 152260 }, { "epoch": 3.7198592822417123, "grad_norm": 0.0022476690355688334, "learning_rate": 3.7348793741269036e-07, "loss": 0.0, "num_input_tokens_seen": 102633232, "step": 152265 }, { "epoch": 3.719981433073559, "grad_norm": 0.04069744423031807, "learning_rate": 3.7342147333003227e-07, "loss": 0.0, "num_input_tokens_seen": 102636432, "step": 152270 }, { "epoch": 3.7201035839054066, "grad_norm": 30.509748458862305, "learning_rate": 3.733550138040221e-07, "loss": 0.081, "num_input_tokens_seen": 102639696, "step": 152275 }, { "epoch": 3.7202257347372534, "grad_norm": 0.00010525318066356704, "learning_rate": 3.7328855883514244e-07, "loss": 0.0, "num_input_tokens_seen": 102642896, "step": 152280 }, { "epoch": 3.7203478855691006, "grad_norm": 0.016809474676847458, "learning_rate": 3.7322210842387734e-07, "loss": 0.0001, "num_input_tokens_seen": 102645840, "step": 152285 }, { "epoch": 3.7204700364009478, "grad_norm": 0.023841066285967827, "learning_rate": 3.731556625707093e-07, "loss": 0.0, "num_input_tokens_seen": 102649104, "step": 152290 }, { "epoch": 3.720592187232795, "grad_norm": 0.0014904678100720048, "learning_rate": 3.730892212761222e-07, "loss": 0.0, "num_input_tokens_seen": 102652176, "step": 152295 }, { "epoch": 3.720714338064642, "grad_norm": 0.003293322864919901, "learning_rate": 3.730227845405989e-07, "loss": 0.0003, "num_input_tokens_seen": 102655120, "step": 152300 }, { "epoch": 3.7208364888964893, "grad_norm": 0.007259721867740154, "learning_rate": 3.729563523646222e-07, "loss": 0.0, "num_input_tokens_seen": 102658448, "step": 152305 }, { "epoch": 3.7209586397283365, "grad_norm": 0.00021232757717370987, "learning_rate": 3.72889924748676e-07, "loss": 0.0, "num_input_tokens_seen": 102661520, "step": 152310 }, { "epoch": 3.7210807905601837, "grad_norm": 0.001801141886971891, "learning_rate": 3.728235016932425e-07, "loss": 0.0, "num_input_tokens_seen": 102664592, "step": 152315 }, { "epoch": 3.721202941392031, "grad_norm": 0.0009091881802305579, "learning_rate": 3.7275708319880516e-07, "loss": 0.0, "num_input_tokens_seen": 102668048, "step": 152320 }, { "epoch": 3.721325092223878, "grad_norm": 0.03039466217160225, "learning_rate": 3.7269066926584746e-07, "loss": 0.0, "num_input_tokens_seen": 102671504, "step": 152325 }, { "epoch": 3.7214472430557253, "grad_norm": 0.008603163994848728, "learning_rate": 3.7262425989485145e-07, "loss": 0.0, "num_input_tokens_seen": 102675152, "step": 152330 }, { "epoch": 3.7215693938875725, "grad_norm": 0.0013472189893946052, "learning_rate": 3.725578550863011e-07, "loss": 0.0402, "num_input_tokens_seen": 102678544, "step": 152335 }, { "epoch": 3.7216915447194197, "grad_norm": 0.00042434909846633673, "learning_rate": 3.7249145484067835e-07, "loss": 0.0, "num_input_tokens_seen": 102681872, "step": 152340 }, { "epoch": 3.721813695551267, "grad_norm": 0.004947059787809849, "learning_rate": 3.7242505915846677e-07, "loss": 0.0, "num_input_tokens_seen": 102684880, "step": 152345 }, { "epoch": 3.721935846383114, "grad_norm": 0.00031549998675473034, "learning_rate": 3.723586680401487e-07, "loss": 0.0, "num_input_tokens_seen": 102687888, "step": 152350 }, { "epoch": 3.722057997214961, "grad_norm": 0.0046067675575613976, "learning_rate": 3.7229228148620726e-07, "loss": 0.0, "num_input_tokens_seen": 102691536, "step": 152355 }, { "epoch": 3.7221801480468084, "grad_norm": 0.00043726101284846663, "learning_rate": 3.722258994971255e-07, "loss": 0.0, "num_input_tokens_seen": 102694800, "step": 152360 }, { "epoch": 3.722302298878655, "grad_norm": 0.0015163227217271924, "learning_rate": 3.7215952207338543e-07, "loss": 0.0, "num_input_tokens_seen": 102698576, "step": 152365 }, { "epoch": 3.722424449710503, "grad_norm": 0.07668344676494598, "learning_rate": 3.7209314921547066e-07, "loss": 0.0, "num_input_tokens_seen": 102701840, "step": 152370 }, { "epoch": 3.7225466005423495, "grad_norm": 0.0085446797311306, "learning_rate": 3.7202678092386296e-07, "loss": 0.0, "num_input_tokens_seen": 102705296, "step": 152375 }, { "epoch": 3.7226687513741967, "grad_norm": 0.0001068677956936881, "learning_rate": 3.7196041719904536e-07, "loss": 0.0, "num_input_tokens_seen": 102709776, "step": 152380 }, { "epoch": 3.722790902206044, "grad_norm": 0.0018057903507724404, "learning_rate": 3.71894058041501e-07, "loss": 0.0001, "num_input_tokens_seen": 102712912, "step": 152385 }, { "epoch": 3.722913053037891, "grad_norm": 0.0006664296961389482, "learning_rate": 3.7182770345171187e-07, "loss": 0.0, "num_input_tokens_seen": 102716240, "step": 152390 }, { "epoch": 3.7230352038697383, "grad_norm": 0.010751817375421524, "learning_rate": 3.7176135343016036e-07, "loss": 0.0, "num_input_tokens_seen": 102719248, "step": 152395 }, { "epoch": 3.7231573547015855, "grad_norm": 0.002380897058174014, "learning_rate": 3.7169500797732966e-07, "loss": 0.0, "num_input_tokens_seen": 102722384, "step": 152400 }, { "epoch": 3.7232795055334327, "grad_norm": 0.00012997696467209607, "learning_rate": 3.716286670937018e-07, "loss": 0.0435, "num_input_tokens_seen": 102725328, "step": 152405 }, { "epoch": 3.72340165636528, "grad_norm": 0.0108709204941988, "learning_rate": 3.7156233077975895e-07, "loss": 0.0566, "num_input_tokens_seen": 102728400, "step": 152410 }, { "epoch": 3.723523807197127, "grad_norm": 77.4952163696289, "learning_rate": 3.714959990359838e-07, "loss": 0.0031, "num_input_tokens_seen": 102731728, "step": 152415 }, { "epoch": 3.7236459580289742, "grad_norm": 0.11548440158367157, "learning_rate": 3.7142967186285924e-07, "loss": 0.0, "num_input_tokens_seen": 102735952, "step": 152420 }, { "epoch": 3.7237681088608214, "grad_norm": 0.007672048639506102, "learning_rate": 3.7136334926086676e-07, "loss": 0.0001, "num_input_tokens_seen": 102739408, "step": 152425 }, { "epoch": 3.7238902596926686, "grad_norm": 0.005651409272104502, "learning_rate": 3.712970312304894e-07, "loss": 0.0001, "num_input_tokens_seen": 102742800, "step": 152430 }, { "epoch": 3.724012410524516, "grad_norm": 0.00011328620894346386, "learning_rate": 3.7123071777220884e-07, "loss": 0.0012, "num_input_tokens_seen": 102746000, "step": 152435 }, { "epoch": 3.7241345613563626, "grad_norm": 0.00017487969307694584, "learning_rate": 3.711644088865076e-07, "loss": 0.0, "num_input_tokens_seen": 102749392, "step": 152440 }, { "epoch": 3.72425671218821, "grad_norm": 5.3928306442685425e-05, "learning_rate": 3.7109810457386825e-07, "loss": 0.0, "num_input_tokens_seen": 102752592, "step": 152445 }, { "epoch": 3.724378863020057, "grad_norm": 0.00017148379993159324, "learning_rate": 3.7103180483477234e-07, "loss": 0.0001, "num_input_tokens_seen": 102755920, "step": 152450 }, { "epoch": 3.7245010138519046, "grad_norm": 0.0005869636661373079, "learning_rate": 3.7096550966970264e-07, "loss": 0.0, "num_input_tokens_seen": 102759504, "step": 152455 }, { "epoch": 3.7246231646837513, "grad_norm": 0.0002772973384708166, "learning_rate": 3.7089921907914056e-07, "loss": 0.0, "num_input_tokens_seen": 102762960, "step": 152460 }, { "epoch": 3.7247453155155985, "grad_norm": 0.00029023404931649566, "learning_rate": 3.70832933063569e-07, "loss": 0.0882, "num_input_tokens_seen": 102766224, "step": 152465 }, { "epoch": 3.7248674663474457, "grad_norm": 0.0037888879887759686, "learning_rate": 3.707666516234692e-07, "loss": 0.0, "num_input_tokens_seen": 102769424, "step": 152470 }, { "epoch": 3.724989617179293, "grad_norm": 0.003013129811733961, "learning_rate": 3.7070037475932346e-07, "loss": 0.0, "num_input_tokens_seen": 102772496, "step": 152475 }, { "epoch": 3.72511176801114, "grad_norm": 0.0013928675325587392, "learning_rate": 3.706341024716143e-07, "loss": 0.0, "num_input_tokens_seen": 102776080, "step": 152480 }, { "epoch": 3.7252339188429873, "grad_norm": 0.012922985479235649, "learning_rate": 3.705678347608229e-07, "loss": 0.0, "num_input_tokens_seen": 102779408, "step": 152485 }, { "epoch": 3.7253560696748345, "grad_norm": 0.0011642073513939977, "learning_rate": 3.705015716274318e-07, "loss": 0.0, "num_input_tokens_seen": 102782864, "step": 152490 }, { "epoch": 3.7254782205066816, "grad_norm": 0.0006689711008220911, "learning_rate": 3.704353130719222e-07, "loss": 0.0, "num_input_tokens_seen": 102786128, "step": 152495 }, { "epoch": 3.725600371338529, "grad_norm": 47.64103698730469, "learning_rate": 3.7036905909477666e-07, "loss": 0.1, "num_input_tokens_seen": 102789520, "step": 152500 }, { "epoch": 3.725722522170376, "grad_norm": 0.0068311383947730064, "learning_rate": 3.7030280969647676e-07, "loss": 0.0, "num_input_tokens_seen": 102792592, "step": 152505 }, { "epoch": 3.725844673002223, "grad_norm": 0.044286441057920456, "learning_rate": 3.702365648775039e-07, "loss": 0.0383, "num_input_tokens_seen": 102796048, "step": 152510 }, { "epoch": 3.7259668238340704, "grad_norm": 0.002478921553120017, "learning_rate": 3.701703246383403e-07, "loss": 0.0, "num_input_tokens_seen": 102799504, "step": 152515 }, { "epoch": 3.7260889746659176, "grad_norm": 0.0020042003598064184, "learning_rate": 3.701040889794673e-07, "loss": 0.0728, "num_input_tokens_seen": 102802384, "step": 152520 }, { "epoch": 3.726211125497765, "grad_norm": 0.0006063705077394843, "learning_rate": 3.700378579013671e-07, "loss": 0.0, "num_input_tokens_seen": 102805840, "step": 152525 }, { "epoch": 3.726333276329612, "grad_norm": 0.00019261521811131388, "learning_rate": 3.699716314045207e-07, "loss": 0.0, "num_input_tokens_seen": 102808976, "step": 152530 }, { "epoch": 3.7264554271614587, "grad_norm": 0.00046472440590150654, "learning_rate": 3.6990540948940995e-07, "loss": 0.0, "num_input_tokens_seen": 102812240, "step": 152535 }, { "epoch": 3.7265775779933064, "grad_norm": 0.0002507887838874012, "learning_rate": 3.6983919215651704e-07, "loss": 0.0, "num_input_tokens_seen": 102815504, "step": 152540 }, { "epoch": 3.726699728825153, "grad_norm": 0.00016412966942880303, "learning_rate": 3.6977297940632257e-07, "loss": 0.0001, "num_input_tokens_seen": 102818512, "step": 152545 }, { "epoch": 3.7268218796570007, "grad_norm": 0.0005654366686940193, "learning_rate": 3.697067712393088e-07, "loss": 0.0, "num_input_tokens_seen": 102821584, "step": 152550 }, { "epoch": 3.7269440304888475, "grad_norm": 0.0047607459127902985, "learning_rate": 3.696405676559567e-07, "loss": 0.0, "num_input_tokens_seen": 102825040, "step": 152555 }, { "epoch": 3.7270661813206947, "grad_norm": 0.0005605295300483704, "learning_rate": 3.6957436865674817e-07, "loss": 0.0, "num_input_tokens_seen": 102828304, "step": 152560 }, { "epoch": 3.727188332152542, "grad_norm": 0.0003506525536067784, "learning_rate": 3.695081742421642e-07, "loss": 0.0, "num_input_tokens_seen": 102831824, "step": 152565 }, { "epoch": 3.727310482984389, "grad_norm": 0.0026996464002877474, "learning_rate": 3.6944198441268626e-07, "loss": 0.0, "num_input_tokens_seen": 102835024, "step": 152570 }, { "epoch": 3.7274326338162362, "grad_norm": 0.0029354402795434, "learning_rate": 3.6937579916879614e-07, "loss": 0.0, "num_input_tokens_seen": 102838288, "step": 152575 }, { "epoch": 3.7275547846480834, "grad_norm": 0.0009059472940862179, "learning_rate": 3.6930961851097454e-07, "loss": 0.0, "num_input_tokens_seen": 102841232, "step": 152580 }, { "epoch": 3.7276769354799306, "grad_norm": 3.0146076824166812e-05, "learning_rate": 3.692434424397033e-07, "loss": 0.0001, "num_input_tokens_seen": 102844816, "step": 152585 }, { "epoch": 3.727799086311778, "grad_norm": 0.012793498113751411, "learning_rate": 3.6917727095546314e-07, "loss": 0.0, "num_input_tokens_seen": 102847888, "step": 152590 }, { "epoch": 3.727921237143625, "grad_norm": 0.0031087123788893223, "learning_rate": 3.691111040587358e-07, "loss": 0.0001, "num_input_tokens_seen": 102850896, "step": 152595 }, { "epoch": 3.728043387975472, "grad_norm": 0.0017248626099899411, "learning_rate": 3.69044941750002e-07, "loss": 0.0, "num_input_tokens_seen": 102854032, "step": 152600 }, { "epoch": 3.7281655388073194, "grad_norm": 0.02217789925634861, "learning_rate": 3.6897878402974324e-07, "loss": 0.0, "num_input_tokens_seen": 102857296, "step": 152605 }, { "epoch": 3.7282876896391666, "grad_norm": 0.0074479603208601475, "learning_rate": 3.689126308984406e-07, "loss": 0.0, "num_input_tokens_seen": 102860624, "step": 152610 }, { "epoch": 3.7284098404710138, "grad_norm": 0.04629134759306908, "learning_rate": 3.688464823565747e-07, "loss": 0.0335, "num_input_tokens_seen": 102864528, "step": 152615 }, { "epoch": 3.7285319913028605, "grad_norm": 0.0012515024282038212, "learning_rate": 3.687803384046273e-07, "loss": 0.0001, "num_input_tokens_seen": 102867728, "step": 152620 }, { "epoch": 3.728654142134708, "grad_norm": 0.0010610457975417376, "learning_rate": 3.687141990430787e-07, "loss": 0.0, "num_input_tokens_seen": 102871248, "step": 152625 }, { "epoch": 3.728776292966555, "grad_norm": 0.017728229984641075, "learning_rate": 3.686480642724102e-07, "loss": 0.0, "num_input_tokens_seen": 102874448, "step": 152630 }, { "epoch": 3.7288984437984025, "grad_norm": 0.0009284796542488039, "learning_rate": 3.685819340931031e-07, "loss": 0.0, "num_input_tokens_seen": 102877584, "step": 152635 }, { "epoch": 3.7290205946302493, "grad_norm": 0.005502808839082718, "learning_rate": 3.685158085056378e-07, "loss": 0.0, "num_input_tokens_seen": 102880976, "step": 152640 }, { "epoch": 3.7291427454620965, "grad_norm": 0.003437718376517296, "learning_rate": 3.6844968751049566e-07, "loss": 0.0388, "num_input_tokens_seen": 102884432, "step": 152645 }, { "epoch": 3.7292648962939436, "grad_norm": 0.0014162580482661724, "learning_rate": 3.683835711081569e-07, "loss": 0.0063, "num_input_tokens_seen": 102887760, "step": 152650 }, { "epoch": 3.729387047125791, "grad_norm": 0.05147543549537659, "learning_rate": 3.6831745929910306e-07, "loss": 0.0, "num_input_tokens_seen": 102890896, "step": 152655 }, { "epoch": 3.729509197957638, "grad_norm": 24.238784790039062, "learning_rate": 3.682513520838142e-07, "loss": 0.0893, "num_input_tokens_seen": 102894288, "step": 152660 }, { "epoch": 3.729631348789485, "grad_norm": 0.0032417848706245422, "learning_rate": 3.681852494627714e-07, "loss": 0.0728, "num_input_tokens_seen": 102897552, "step": 152665 }, { "epoch": 3.7297534996213324, "grad_norm": 0.0010357864666730165, "learning_rate": 3.681191514364558e-07, "loss": 0.0, "num_input_tokens_seen": 102901072, "step": 152670 }, { "epoch": 3.7298756504531796, "grad_norm": 0.0016148955328390002, "learning_rate": 3.6805305800534726e-07, "loss": 0.0, "num_input_tokens_seen": 102905488, "step": 152675 }, { "epoch": 3.729997801285027, "grad_norm": 0.0070404973812401295, "learning_rate": 3.679869691699273e-07, "loss": 0.0, "num_input_tokens_seen": 102908752, "step": 152680 }, { "epoch": 3.730119952116874, "grad_norm": 0.00028679074603132904, "learning_rate": 3.6792088493067576e-07, "loss": 0.0, "num_input_tokens_seen": 102911760, "step": 152685 }, { "epoch": 3.730242102948721, "grad_norm": 0.0007746733026579022, "learning_rate": 3.6785480528807343e-07, "loss": 0.0, "num_input_tokens_seen": 102914832, "step": 152690 }, { "epoch": 3.7303642537805684, "grad_norm": 0.0019843345507979393, "learning_rate": 3.677887302426014e-07, "loss": 0.0598, "num_input_tokens_seen": 102918096, "step": 152695 }, { "epoch": 3.7304864046124155, "grad_norm": 0.9899024963378906, "learning_rate": 3.677226597947398e-07, "loss": 0.0002, "num_input_tokens_seen": 102921552, "step": 152700 }, { "epoch": 3.7306085554442627, "grad_norm": 0.14979158341884613, "learning_rate": 3.676565939449687e-07, "loss": 0.0, "num_input_tokens_seen": 102925072, "step": 152705 }, { "epoch": 3.73073070627611, "grad_norm": 0.09974019974470139, "learning_rate": 3.6759053269376927e-07, "loss": 0.0001, "num_input_tokens_seen": 102928848, "step": 152710 }, { "epoch": 3.7308528571079567, "grad_norm": 0.0010769544169306755, "learning_rate": 3.6752447604162165e-07, "loss": 0.0001, "num_input_tokens_seen": 102932432, "step": 152715 }, { "epoch": 3.7309750079398043, "grad_norm": 0.05534740537405014, "learning_rate": 3.674584239890057e-07, "loss": 0.0, "num_input_tokens_seen": 102935568, "step": 152720 }, { "epoch": 3.731097158771651, "grad_norm": 0.009643294848501682, "learning_rate": 3.673923765364022e-07, "loss": 0.0002, "num_input_tokens_seen": 102938576, "step": 152725 }, { "epoch": 3.7312193096034982, "grad_norm": 0.017214003950357437, "learning_rate": 3.67326333684292e-07, "loss": 0.0001, "num_input_tokens_seen": 102942160, "step": 152730 }, { "epoch": 3.7313414604353454, "grad_norm": 0.00015420409908983856, "learning_rate": 3.672602954331544e-07, "loss": 0.0, "num_input_tokens_seen": 102945296, "step": 152735 }, { "epoch": 3.7314636112671926, "grad_norm": 0.19823694229125977, "learning_rate": 3.671942617834705e-07, "loss": 0.0001, "num_input_tokens_seen": 102948816, "step": 152740 }, { "epoch": 3.73158576209904, "grad_norm": 0.0025073799770325422, "learning_rate": 3.671282327357198e-07, "loss": 0.0, "num_input_tokens_seen": 102952336, "step": 152745 }, { "epoch": 3.731707912930887, "grad_norm": 0.016093574464321136, "learning_rate": 3.670622082903828e-07, "loss": 0.0, "num_input_tokens_seen": 102955536, "step": 152750 }, { "epoch": 3.731830063762734, "grad_norm": 0.0363631546497345, "learning_rate": 3.6699618844794e-07, "loss": 0.0, "num_input_tokens_seen": 102959632, "step": 152755 }, { "epoch": 3.7319522145945814, "grad_norm": 0.0034554863814264536, "learning_rate": 3.6693017320887076e-07, "loss": 0.0, "num_input_tokens_seen": 102963536, "step": 152760 }, { "epoch": 3.7320743654264286, "grad_norm": 0.31483185291290283, "learning_rate": 3.6686416257365603e-07, "loss": 0.0001, "num_input_tokens_seen": 102966928, "step": 152765 }, { "epoch": 3.7321965162582758, "grad_norm": 0.0005507257301360369, "learning_rate": 3.66798156542775e-07, "loss": 0.0, "num_input_tokens_seen": 102970704, "step": 152770 }, { "epoch": 3.732318667090123, "grad_norm": 0.0034630734007805586, "learning_rate": 3.667321551167085e-07, "loss": 0.0, "num_input_tokens_seen": 102973648, "step": 152775 }, { "epoch": 3.73244081792197, "grad_norm": 0.0002851722820196301, "learning_rate": 3.666661582959357e-07, "loss": 0.0, "num_input_tokens_seen": 102977104, "step": 152780 }, { "epoch": 3.7325629687538173, "grad_norm": 0.005233187228441238, "learning_rate": 3.666001660809369e-07, "loss": 0.0001, "num_input_tokens_seen": 102980368, "step": 152785 }, { "epoch": 3.7326851195856645, "grad_norm": 0.010846148245036602, "learning_rate": 3.665341784721925e-07, "loss": 0.0, "num_input_tokens_seen": 102983632, "step": 152790 }, { "epoch": 3.7328072704175117, "grad_norm": 0.00087640032870695, "learning_rate": 3.6646819547018147e-07, "loss": 0.0714, "num_input_tokens_seen": 102986896, "step": 152795 }, { "epoch": 3.7329294212493584, "grad_norm": 0.0027219816111028194, "learning_rate": 3.6640221707538455e-07, "loss": 0.0, "num_input_tokens_seen": 102990224, "step": 152800 }, { "epoch": 3.733051572081206, "grad_norm": 0.03295997157692909, "learning_rate": 3.6633624328828085e-07, "loss": 0.0, "num_input_tokens_seen": 102993552, "step": 152805 }, { "epoch": 3.733173722913053, "grad_norm": 0.01818210259079933, "learning_rate": 3.6627027410935063e-07, "loss": 0.0, "num_input_tokens_seen": 102996944, "step": 152810 }, { "epoch": 3.7332958737449005, "grad_norm": 0.008293227292597294, "learning_rate": 3.662043095390736e-07, "loss": 0.0, "num_input_tokens_seen": 102999952, "step": 152815 }, { "epoch": 3.733418024576747, "grad_norm": 0.0011386339319869876, "learning_rate": 3.661383495779288e-07, "loss": 0.0, "num_input_tokens_seen": 103003280, "step": 152820 }, { "epoch": 3.7335401754085944, "grad_norm": 0.017195656895637512, "learning_rate": 3.660723942263967e-07, "loss": 0.0, "num_input_tokens_seen": 103006416, "step": 152825 }, { "epoch": 3.7336623262404416, "grad_norm": 0.0023615004029124975, "learning_rate": 3.660064434849565e-07, "loss": 0.0, "num_input_tokens_seen": 103009488, "step": 152830 }, { "epoch": 3.7337844770722888, "grad_norm": 0.007179385516792536, "learning_rate": 3.6594049735408816e-07, "loss": 0.0845, "num_input_tokens_seen": 103012496, "step": 152835 }, { "epoch": 3.733906627904136, "grad_norm": 0.000365541287465021, "learning_rate": 3.6587455583427074e-07, "loss": 0.0, "num_input_tokens_seen": 103016016, "step": 152840 }, { "epoch": 3.734028778735983, "grad_norm": 0.004545575473457575, "learning_rate": 3.6580861892598423e-07, "loss": 0.0, "num_input_tokens_seen": 103019088, "step": 152845 }, { "epoch": 3.7341509295678303, "grad_norm": 0.1034526601433754, "learning_rate": 3.657426866297082e-07, "loss": 0.0002, "num_input_tokens_seen": 103022416, "step": 152850 }, { "epoch": 3.7342730803996775, "grad_norm": 0.0006500378367491066, "learning_rate": 3.6567675894592174e-07, "loss": 0.0, "num_input_tokens_seen": 103025936, "step": 152855 }, { "epoch": 3.7343952312315247, "grad_norm": 0.003577360650524497, "learning_rate": 3.656108358751048e-07, "loss": 0.012, "num_input_tokens_seen": 103028880, "step": 152860 }, { "epoch": 3.734517382063372, "grad_norm": 0.0014856354100629687, "learning_rate": 3.655449174177361e-07, "loss": 0.0, "num_input_tokens_seen": 103032336, "step": 152865 }, { "epoch": 3.734639532895219, "grad_norm": 0.00800647959113121, "learning_rate": 3.6547900357429585e-07, "loss": 0.0, "num_input_tokens_seen": 103035728, "step": 152870 }, { "epoch": 3.7347616837270663, "grad_norm": 0.01941361092031002, "learning_rate": 3.654130943452625e-07, "loss": 0.0, "num_input_tokens_seen": 103038800, "step": 152875 }, { "epoch": 3.7348838345589135, "grad_norm": 0.18271203339099884, "learning_rate": 3.653471897311159e-07, "loss": 0.0001, "num_input_tokens_seen": 103042576, "step": 152880 }, { "epoch": 3.7350059853907607, "grad_norm": 0.003417690983042121, "learning_rate": 3.6528128973233554e-07, "loss": 0.0, "num_input_tokens_seen": 103045904, "step": 152885 }, { "epoch": 3.735128136222608, "grad_norm": 0.0034222102258354425, "learning_rate": 3.652153943494e-07, "loss": 0.0002, "num_input_tokens_seen": 103049360, "step": 152890 }, { "epoch": 3.7352502870544546, "grad_norm": 0.0035051971208304167, "learning_rate": 3.6514950358278917e-07, "loss": 0.0, "num_input_tokens_seen": 103052752, "step": 152895 }, { "epoch": 3.7353724378863022, "grad_norm": 0.05147337168455124, "learning_rate": 3.6508361743298167e-07, "loss": 0.0001, "num_input_tokens_seen": 103055568, "step": 152900 }, { "epoch": 3.735494588718149, "grad_norm": 0.002973938826471567, "learning_rate": 3.6501773590045713e-07, "loss": 0.0643, "num_input_tokens_seen": 103059088, "step": 152905 }, { "epoch": 3.735616739549996, "grad_norm": 0.0009580811602063477, "learning_rate": 3.6495185898569405e-07, "loss": 0.0, "num_input_tokens_seen": 103061840, "step": 152910 }, { "epoch": 3.7357388903818434, "grad_norm": 0.1412133127450943, "learning_rate": 3.6488598668917224e-07, "loss": 0.0001, "num_input_tokens_seen": 103065360, "step": 152915 }, { "epoch": 3.7358610412136906, "grad_norm": 0.005600213538855314, "learning_rate": 3.648201190113703e-07, "loss": 0.0, "num_input_tokens_seen": 103068688, "step": 152920 }, { "epoch": 3.7359831920455377, "grad_norm": 0.002254885621368885, "learning_rate": 3.647542559527671e-07, "loss": 0.0001, "num_input_tokens_seen": 103072208, "step": 152925 }, { "epoch": 3.736105342877385, "grad_norm": 0.0008223111508414149, "learning_rate": 3.646883975138421e-07, "loss": 0.0, "num_input_tokens_seen": 103075856, "step": 152930 }, { "epoch": 3.736227493709232, "grad_norm": 0.03418153151869774, "learning_rate": 3.646225436950735e-07, "loss": 0.0001, "num_input_tokens_seen": 103079312, "step": 152935 }, { "epoch": 3.7363496445410793, "grad_norm": 0.1998567283153534, "learning_rate": 3.6455669449694073e-07, "loss": 0.0001, "num_input_tokens_seen": 103082704, "step": 152940 }, { "epoch": 3.7364717953729265, "grad_norm": 0.009525242261588573, "learning_rate": 3.6449084991992295e-07, "loss": 0.0, "num_input_tokens_seen": 103086288, "step": 152945 }, { "epoch": 3.7365939462047737, "grad_norm": 0.07257004827260971, "learning_rate": 3.644250099644983e-07, "loss": 0.0003, "num_input_tokens_seen": 103089424, "step": 152950 }, { "epoch": 3.736716097036621, "grad_norm": 0.00019916010205633938, "learning_rate": 3.643591746311462e-07, "loss": 0.0, "num_input_tokens_seen": 103093584, "step": 152955 }, { "epoch": 3.736838247868468, "grad_norm": 0.012999081052839756, "learning_rate": 3.642933439203448e-07, "loss": 0.0, "num_input_tokens_seen": 103096592, "step": 152960 }, { "epoch": 3.7369603987003153, "grad_norm": 0.000976978917606175, "learning_rate": 3.6422751783257364e-07, "loss": 0.0, "num_input_tokens_seen": 103100240, "step": 152965 }, { "epoch": 3.7370825495321625, "grad_norm": 0.002696326235309243, "learning_rate": 3.6416169636831064e-07, "loss": 0.0, "num_input_tokens_seen": 103103376, "step": 152970 }, { "epoch": 3.7372047003640096, "grad_norm": 0.013375307433307171, "learning_rate": 3.640958795280347e-07, "loss": 0.0, "num_input_tokens_seen": 103106448, "step": 152975 }, { "epoch": 3.7373268511958564, "grad_norm": 0.00033097839332185686, "learning_rate": 3.6403006731222496e-07, "loss": 0.0001, "num_input_tokens_seen": 103110032, "step": 152980 }, { "epoch": 3.737449002027704, "grad_norm": 0.00204194663092494, "learning_rate": 3.6396425972135923e-07, "loss": 0.0, "num_input_tokens_seen": 103113360, "step": 152985 }, { "epoch": 3.7375711528595508, "grad_norm": 0.03878295421600342, "learning_rate": 3.638984567559169e-07, "loss": 0.0001, "num_input_tokens_seen": 103116496, "step": 152990 }, { "epoch": 3.7376933036913984, "grad_norm": 0.00504625029861927, "learning_rate": 3.6383265841637567e-07, "loss": 0.0, "num_input_tokens_seen": 103119952, "step": 152995 }, { "epoch": 3.737815454523245, "grad_norm": 0.592056930065155, "learning_rate": 3.6376686470321447e-07, "loss": 0.0, "num_input_tokens_seen": 103123344, "step": 153000 }, { "epoch": 3.7379376053550923, "grad_norm": 0.0003572091518435627, "learning_rate": 3.637010756169121e-07, "loss": 0.0, "num_input_tokens_seen": 103126480, "step": 153005 }, { "epoch": 3.7380597561869395, "grad_norm": 0.0050292848609387875, "learning_rate": 3.6363529115794667e-07, "loss": 0.0001, "num_input_tokens_seen": 103129936, "step": 153010 }, { "epoch": 3.7381819070187867, "grad_norm": 0.0025767607148736715, "learning_rate": 3.6356951132679626e-07, "loss": 0.0, "num_input_tokens_seen": 103133584, "step": 153015 }, { "epoch": 3.738304057850634, "grad_norm": 0.0029860185459256172, "learning_rate": 3.635037361239398e-07, "loss": 0.0, "num_input_tokens_seen": 103136848, "step": 153020 }, { "epoch": 3.738426208682481, "grad_norm": 0.0033167738001793623, "learning_rate": 3.6343796554985504e-07, "loss": 0.0, "num_input_tokens_seen": 103140048, "step": 153025 }, { "epoch": 3.7385483595143283, "grad_norm": 0.02676066942512989, "learning_rate": 3.6337219960502106e-07, "loss": 0.0, "num_input_tokens_seen": 103143376, "step": 153030 }, { "epoch": 3.7386705103461755, "grad_norm": 0.0005148989730514586, "learning_rate": 3.633064382899153e-07, "loss": 0.0, "num_input_tokens_seen": 103146640, "step": 153035 }, { "epoch": 3.7387926611780227, "grad_norm": 0.01030823029577732, "learning_rate": 3.632406816050166e-07, "loss": 0.0, "num_input_tokens_seen": 103149648, "step": 153040 }, { "epoch": 3.73891481200987, "grad_norm": 0.012857099995017052, "learning_rate": 3.6317492955080263e-07, "loss": 0.0, "num_input_tokens_seen": 103152656, "step": 153045 }, { "epoch": 3.739036962841717, "grad_norm": 0.002937320852652192, "learning_rate": 3.6310918212775223e-07, "loss": 0.0, "num_input_tokens_seen": 103155984, "step": 153050 }, { "epoch": 3.7391591136735642, "grad_norm": 0.0008897144580259919, "learning_rate": 3.6304343933634284e-07, "loss": 0.0, "num_input_tokens_seen": 103159568, "step": 153055 }, { "epoch": 3.7392812645054114, "grad_norm": 0.0007698034751228988, "learning_rate": 3.629777011770532e-07, "loss": 0.0, "num_input_tokens_seen": 103162896, "step": 153060 }, { "epoch": 3.739403415337258, "grad_norm": 0.002032829448580742, "learning_rate": 3.629119676503607e-07, "loss": 0.0, "num_input_tokens_seen": 103165840, "step": 153065 }, { "epoch": 3.739525566169106, "grad_norm": 0.0007218793034553528, "learning_rate": 3.628462387567437e-07, "loss": 0.0, "num_input_tokens_seen": 103169168, "step": 153070 }, { "epoch": 3.7396477170009526, "grad_norm": 0.0006760008982382715, "learning_rate": 3.6278051449668067e-07, "loss": 0.0, "num_input_tokens_seen": 103171792, "step": 153075 }, { "epoch": 3.7397698678328, "grad_norm": 0.0005427119904197752, "learning_rate": 3.627147948706487e-07, "loss": 0.0, "num_input_tokens_seen": 103175120, "step": 153080 }, { "epoch": 3.739892018664647, "grad_norm": 0.23871496319770813, "learning_rate": 3.626490798791265e-07, "loss": 0.0182, "num_input_tokens_seen": 103178000, "step": 153085 }, { "epoch": 3.740014169496494, "grad_norm": 0.00962295476347208, "learning_rate": 3.6258336952259127e-07, "loss": 0.0, "num_input_tokens_seen": 103181200, "step": 153090 }, { "epoch": 3.7401363203283413, "grad_norm": 0.026946377009153366, "learning_rate": 3.6251766380152127e-07, "loss": 0.0001, "num_input_tokens_seen": 103185040, "step": 153095 }, { "epoch": 3.7402584711601885, "grad_norm": 0.007624130696058273, "learning_rate": 3.6245196271639457e-07, "loss": 0.031, "num_input_tokens_seen": 103188880, "step": 153100 }, { "epoch": 3.7403806219920357, "grad_norm": 0.00016881691408343613, "learning_rate": 3.623862662676884e-07, "loss": 0.0, "num_input_tokens_seen": 103192336, "step": 153105 }, { "epoch": 3.740502772823883, "grad_norm": 0.0038916917983442545, "learning_rate": 3.6232057445588107e-07, "loss": 0.0955, "num_input_tokens_seen": 103195984, "step": 153110 }, { "epoch": 3.74062492365573, "grad_norm": 0.000500963709782809, "learning_rate": 3.622548872814497e-07, "loss": 0.0, "num_input_tokens_seen": 103199120, "step": 153115 }, { "epoch": 3.7407470744875773, "grad_norm": 0.0008566455217078328, "learning_rate": 3.621892047448727e-07, "loss": 0.0, "num_input_tokens_seen": 103202000, "step": 153120 }, { "epoch": 3.7408692253194245, "grad_norm": 0.0006452181842178106, "learning_rate": 3.6212352684662737e-07, "loss": 0.0, "num_input_tokens_seen": 103205392, "step": 153125 }, { "epoch": 3.7409913761512716, "grad_norm": 0.0011283294297754765, "learning_rate": 3.62057853587191e-07, "loss": 0.0001, "num_input_tokens_seen": 103208848, "step": 153130 }, { "epoch": 3.741113526983119, "grad_norm": 21.12510108947754, "learning_rate": 3.6199218496704175e-07, "loss": 0.0969, "num_input_tokens_seen": 103211984, "step": 153135 }, { "epoch": 3.741235677814966, "grad_norm": 0.0006625877576880157, "learning_rate": 3.619265209866567e-07, "loss": 0.0, "num_input_tokens_seen": 103215504, "step": 153140 }, { "epoch": 3.741357828646813, "grad_norm": 0.0018214958254247904, "learning_rate": 3.6186086164651387e-07, "loss": 0.0, "num_input_tokens_seen": 103218832, "step": 153145 }, { "epoch": 3.7414799794786604, "grad_norm": 27.20746421813965, "learning_rate": 3.617952069470902e-07, "loss": 0.0788, "num_input_tokens_seen": 103222288, "step": 153150 }, { "epoch": 3.7416021303105076, "grad_norm": 0.011315912939608097, "learning_rate": 3.6172955688886343e-07, "loss": 0.031, "num_input_tokens_seen": 103225552, "step": 153155 }, { "epoch": 3.7417242811423543, "grad_norm": 0.00682856747880578, "learning_rate": 3.6166391147231126e-07, "loss": 0.0, "num_input_tokens_seen": 103230352, "step": 153160 }, { "epoch": 3.741846431974202, "grad_norm": 0.08006688207387924, "learning_rate": 3.615982706979106e-07, "loss": 0.0001, "num_input_tokens_seen": 103233488, "step": 153165 }, { "epoch": 3.7419685828060487, "grad_norm": 0.004424131475389004, "learning_rate": 3.6153263456613925e-07, "loss": 0.0, "num_input_tokens_seen": 103237264, "step": 153170 }, { "epoch": 3.7420907336378963, "grad_norm": 0.0051756673492491245, "learning_rate": 3.6146700307747403e-07, "loss": 0.0, "num_input_tokens_seen": 103240464, "step": 153175 }, { "epoch": 3.742212884469743, "grad_norm": 0.11179034411907196, "learning_rate": 3.6140137623239287e-07, "loss": 0.0001, "num_input_tokens_seen": 103243600, "step": 153180 }, { "epoch": 3.7423350353015903, "grad_norm": 0.001237615942955017, "learning_rate": 3.613357540313723e-07, "loss": 0.0007, "num_input_tokens_seen": 103246672, "step": 153185 }, { "epoch": 3.7424571861334375, "grad_norm": 0.006782029289752245, "learning_rate": 3.612701364748899e-07, "loss": 0.0001, "num_input_tokens_seen": 103250064, "step": 153190 }, { "epoch": 3.7425793369652847, "grad_norm": 0.00018652534345164895, "learning_rate": 3.612045235634232e-07, "loss": 0.0, "num_input_tokens_seen": 103253136, "step": 153195 }, { "epoch": 3.742701487797132, "grad_norm": 0.001206182991154492, "learning_rate": 3.6113891529744864e-07, "loss": 0.0, "num_input_tokens_seen": 103256592, "step": 153200 }, { "epoch": 3.742823638628979, "grad_norm": 0.002523811301216483, "learning_rate": 3.610733116774441e-07, "loss": 0.0, "num_input_tokens_seen": 103259728, "step": 153205 }, { "epoch": 3.7429457894608262, "grad_norm": 0.005748201161623001, "learning_rate": 3.6100771270388606e-07, "loss": 0.0563, "num_input_tokens_seen": 103262864, "step": 153210 }, { "epoch": 3.7430679402926734, "grad_norm": 0.0019488586112856865, "learning_rate": 3.6094211837725197e-07, "loss": 0.0, "num_input_tokens_seen": 103266192, "step": 153215 }, { "epoch": 3.7431900911245206, "grad_norm": 0.0014601226430386305, "learning_rate": 3.6087652869801846e-07, "loss": 0.0, "num_input_tokens_seen": 103269584, "step": 153220 }, { "epoch": 3.743312241956368, "grad_norm": 0.0004819185414817184, "learning_rate": 3.60810943666663e-07, "loss": 0.0, "num_input_tokens_seen": 103272784, "step": 153225 }, { "epoch": 3.743434392788215, "grad_norm": 6.37445118627511e-05, "learning_rate": 3.6074536328366235e-07, "loss": 0.0, "num_input_tokens_seen": 103276368, "step": 153230 }, { "epoch": 3.743556543620062, "grad_norm": 0.004168359562754631, "learning_rate": 3.606797875494929e-07, "loss": 0.0, "num_input_tokens_seen": 103279120, "step": 153235 }, { "epoch": 3.7436786944519094, "grad_norm": 0.0013751707738265395, "learning_rate": 3.606142164646324e-07, "loss": 0.0, "num_input_tokens_seen": 103282320, "step": 153240 }, { "epoch": 3.743800845283756, "grad_norm": 0.009026645682752132, "learning_rate": 3.60548650029557e-07, "loss": 0.0001, "num_input_tokens_seen": 103285648, "step": 153245 }, { "epoch": 3.7439229961156038, "grad_norm": 0.004914161749184132, "learning_rate": 3.604830882447438e-07, "loss": 0.0, "num_input_tokens_seen": 103288592, "step": 153250 }, { "epoch": 3.7440451469474505, "grad_norm": 0.07660754024982452, "learning_rate": 3.6041753111066987e-07, "loss": 0.0, "num_input_tokens_seen": 103291792, "step": 153255 }, { "epoch": 3.744167297779298, "grad_norm": 0.00439593568444252, "learning_rate": 3.603519786278114e-07, "loss": 0.0, "num_input_tokens_seen": 103295184, "step": 153260 }, { "epoch": 3.744289448611145, "grad_norm": 0.0007788151269778609, "learning_rate": 3.602864307966457e-07, "loss": 0.1206, "num_input_tokens_seen": 103298320, "step": 153265 }, { "epoch": 3.744411599442992, "grad_norm": 0.13851021230220795, "learning_rate": 3.6022088761764877e-07, "loss": 0.0388, "num_input_tokens_seen": 103301584, "step": 153270 }, { "epoch": 3.7445337502748393, "grad_norm": 0.00768559193238616, "learning_rate": 3.6015534909129796e-07, "loss": 0.0449, "num_input_tokens_seen": 103304784, "step": 153275 }, { "epoch": 3.7446559011066864, "grad_norm": 0.003182411892339587, "learning_rate": 3.600898152180692e-07, "loss": 0.0, "num_input_tokens_seen": 103308304, "step": 153280 }, { "epoch": 3.7447780519385336, "grad_norm": 0.0014601831790059805, "learning_rate": 3.600242859984395e-07, "loss": 0.0383, "num_input_tokens_seen": 103311760, "step": 153285 }, { "epoch": 3.744900202770381, "grad_norm": 0.17673958837985992, "learning_rate": 3.599587614328856e-07, "loss": 0.0001, "num_input_tokens_seen": 103314768, "step": 153290 }, { "epoch": 3.745022353602228, "grad_norm": 0.007135581690818071, "learning_rate": 3.598932415218835e-07, "loss": 0.0, "num_input_tokens_seen": 103318416, "step": 153295 }, { "epoch": 3.745144504434075, "grad_norm": 0.010046053677797318, "learning_rate": 3.598277262659102e-07, "loss": 0.0, "num_input_tokens_seen": 103321296, "step": 153300 }, { "epoch": 3.7452666552659224, "grad_norm": 0.005226328037679195, "learning_rate": 3.597622156654414e-07, "loss": 0.0, "num_input_tokens_seen": 103325264, "step": 153305 }, { "epoch": 3.7453888060977696, "grad_norm": 0.03831545636057854, "learning_rate": 3.596967097209541e-07, "loss": 0.0003, "num_input_tokens_seen": 103328080, "step": 153310 }, { "epoch": 3.7455109569296168, "grad_norm": 0.0007643011631444097, "learning_rate": 3.596312084329248e-07, "loss": 0.0, "num_input_tokens_seen": 103331664, "step": 153315 }, { "epoch": 3.745633107761464, "grad_norm": 0.0006596199818886817, "learning_rate": 3.595657118018297e-07, "loss": 0.0, "num_input_tokens_seen": 103335184, "step": 153320 }, { "epoch": 3.745755258593311, "grad_norm": 0.006933595985174179, "learning_rate": 3.595002198281446e-07, "loss": 0.0002, "num_input_tokens_seen": 103338704, "step": 153325 }, { "epoch": 3.7458774094251583, "grad_norm": 35.37353515625, "learning_rate": 3.5943473251234656e-07, "loss": 0.0651, "num_input_tokens_seen": 103341840, "step": 153330 }, { "epoch": 3.7459995602570055, "grad_norm": 0.0031136884354054928, "learning_rate": 3.5936924985491104e-07, "loss": 0.0, "num_input_tokens_seen": 103346000, "step": 153335 }, { "epoch": 3.7461217110888523, "grad_norm": 0.0010474255541339517, "learning_rate": 3.59303771856315e-07, "loss": 0.0, "num_input_tokens_seen": 103349136, "step": 153340 }, { "epoch": 3.7462438619207, "grad_norm": 0.051398538053035736, "learning_rate": 3.592382985170339e-07, "loss": 0.0348, "num_input_tokens_seen": 103352336, "step": 153345 }, { "epoch": 3.7463660127525467, "grad_norm": 0.0012231292203068733, "learning_rate": 3.591728298375446e-07, "loss": 0.0, "num_input_tokens_seen": 103355408, "step": 153350 }, { "epoch": 3.746488163584394, "grad_norm": 0.01951722800731659, "learning_rate": 3.5910736581832246e-07, "loss": 0.0, "num_input_tokens_seen": 103358352, "step": 153355 }, { "epoch": 3.746610314416241, "grad_norm": 0.003924138844013214, "learning_rate": 3.5904190645984434e-07, "loss": 0.0, "num_input_tokens_seen": 103361680, "step": 153360 }, { "epoch": 3.7467324652480882, "grad_norm": 0.009085427969694138, "learning_rate": 3.589764517625855e-07, "loss": 0.0, "num_input_tokens_seen": 103365456, "step": 153365 }, { "epoch": 3.7468546160799354, "grad_norm": 0.010301058180630207, "learning_rate": 3.5891100172702273e-07, "loss": 0.0, "num_input_tokens_seen": 103369232, "step": 153370 }, { "epoch": 3.7469767669117826, "grad_norm": 0.050948645919561386, "learning_rate": 3.588455563536311e-07, "loss": 0.0003, "num_input_tokens_seen": 103372560, "step": 153375 }, { "epoch": 3.74709891774363, "grad_norm": 0.004248334560543299, "learning_rate": 3.5878011564288714e-07, "loss": 0.0, "num_input_tokens_seen": 103376016, "step": 153380 }, { "epoch": 3.747221068575477, "grad_norm": 0.001246364787220955, "learning_rate": 3.587146795952669e-07, "loss": 0.0, "num_input_tokens_seen": 103379344, "step": 153385 }, { "epoch": 3.747343219407324, "grad_norm": 0.0011733114952221513, "learning_rate": 3.5864924821124575e-07, "loss": 0.0, "num_input_tokens_seen": 103382480, "step": 153390 }, { "epoch": 3.7474653702391714, "grad_norm": 465.39776611328125, "learning_rate": 3.585838214913001e-07, "loss": 0.0085, "num_input_tokens_seen": 103385936, "step": 153395 }, { "epoch": 3.7475875210710186, "grad_norm": 0.4323878586292267, "learning_rate": 3.58518399435905e-07, "loss": 0.0001, "num_input_tokens_seen": 103389392, "step": 153400 }, { "epoch": 3.7477096719028657, "grad_norm": 0.012481575831770897, "learning_rate": 3.5845298204553676e-07, "loss": 0.0, "num_input_tokens_seen": 103393680, "step": 153405 }, { "epoch": 3.747831822734713, "grad_norm": 0.0003565066435839981, "learning_rate": 3.5838756932067126e-07, "loss": 0.0001, "num_input_tokens_seen": 103397008, "step": 153410 }, { "epoch": 3.74795397356656, "grad_norm": 0.0006096562137827277, "learning_rate": 3.5832216126178363e-07, "loss": 0.0321, "num_input_tokens_seen": 103400784, "step": 153415 }, { "epoch": 3.7480761243984073, "grad_norm": 0.0034066636580973864, "learning_rate": 3.5825675786935006e-07, "loss": 0.0738, "num_input_tokens_seen": 103404496, "step": 153420 }, { "epoch": 3.748198275230254, "grad_norm": 0.0015200987691059709, "learning_rate": 3.581913591438457e-07, "loss": 0.0, "num_input_tokens_seen": 103407696, "step": 153425 }, { "epoch": 3.7483204260621017, "grad_norm": 0.0004193643108010292, "learning_rate": 3.5812596508574675e-07, "loss": 0.0, "num_input_tokens_seen": 103411216, "step": 153430 }, { "epoch": 3.7484425768939484, "grad_norm": 0.2604064643383026, "learning_rate": 3.580605756955284e-07, "loss": 0.0001, "num_input_tokens_seen": 103414160, "step": 153435 }, { "epoch": 3.748564727725796, "grad_norm": 0.004687589127570391, "learning_rate": 3.5799519097366593e-07, "loss": 0.0, "num_input_tokens_seen": 103417360, "step": 153440 }, { "epoch": 3.748686878557643, "grad_norm": 49.20878219604492, "learning_rate": 3.579298109206353e-07, "loss": 0.0418, "num_input_tokens_seen": 103421072, "step": 153445 }, { "epoch": 3.74880902938949, "grad_norm": 0.0016561738448217511, "learning_rate": 3.578644355369116e-07, "loss": 0.0001, "num_input_tokens_seen": 103424080, "step": 153450 }, { "epoch": 3.748931180221337, "grad_norm": 0.000623115396592766, "learning_rate": 3.5779906482297073e-07, "loss": 0.0, "num_input_tokens_seen": 103427088, "step": 153455 }, { "epoch": 3.7490533310531844, "grad_norm": 0.0064060743898153305, "learning_rate": 3.577336987792874e-07, "loss": 0.0, "num_input_tokens_seen": 103430736, "step": 153460 }, { "epoch": 3.7491754818850316, "grad_norm": 0.00033515694667585194, "learning_rate": 3.576683374063374e-07, "loss": 0.0418, "num_input_tokens_seen": 103433744, "step": 153465 }, { "epoch": 3.7492976327168788, "grad_norm": 0.10513308644294739, "learning_rate": 3.576029807045964e-07, "loss": 0.0, "num_input_tokens_seen": 103438416, "step": 153470 }, { "epoch": 3.749419783548726, "grad_norm": 0.008837641216814518, "learning_rate": 3.5753762867453885e-07, "loss": 0.075, "num_input_tokens_seen": 103442128, "step": 153475 }, { "epoch": 3.749541934380573, "grad_norm": 0.013438341207802296, "learning_rate": 3.574722813166409e-07, "loss": 0.0, "num_input_tokens_seen": 103445776, "step": 153480 }, { "epoch": 3.7496640852124203, "grad_norm": 0.001661556656472385, "learning_rate": 3.5740693863137696e-07, "loss": 0.0, "num_input_tokens_seen": 103448784, "step": 153485 }, { "epoch": 3.7497862360442675, "grad_norm": 0.0008842450333759189, "learning_rate": 3.5734160061922304e-07, "loss": 0.0, "num_input_tokens_seen": 103452112, "step": 153490 }, { "epoch": 3.7499083868761147, "grad_norm": 0.0011517205275595188, "learning_rate": 3.572762672806534e-07, "loss": 0.0354, "num_input_tokens_seen": 103455696, "step": 153495 }, { "epoch": 3.750030537707962, "grad_norm": 0.0016859594034031034, "learning_rate": 3.572109386161436e-07, "loss": 0.0001, "num_input_tokens_seen": 103458896, "step": 153500 }, { "epoch": 3.750152688539809, "grad_norm": 0.0004544697585515678, "learning_rate": 3.571456146261691e-07, "loss": 0.0, "num_input_tokens_seen": 103462352, "step": 153505 }, { "epoch": 3.750274839371656, "grad_norm": 0.002033928642049432, "learning_rate": 3.5708029531120433e-07, "loss": 0.0, "num_input_tokens_seen": 103465808, "step": 153510 }, { "epoch": 3.750274839371656, "eval_loss": 0.24138930439949036, "eval_runtime": 47.7875, "eval_samples_per_second": 761.392, "eval_steps_per_second": 95.192, "num_input_tokens_seen": 103465808, "step": 153510 }, { "epoch": 3.7503969902035035, "grad_norm": 0.0008828761638142169, "learning_rate": 3.5701498067172487e-07, "loss": 0.0, "num_input_tokens_seen": 103468944, "step": 153515 }, { "epoch": 3.7505191410353502, "grad_norm": 0.0001341873430646956, "learning_rate": 3.5694967070820514e-07, "loss": 0.0, "num_input_tokens_seen": 103472464, "step": 153520 }, { "epoch": 3.750641291867198, "grad_norm": 0.006082539912313223, "learning_rate": 3.5688436542112054e-07, "loss": 0.0002, "num_input_tokens_seen": 103476176, "step": 153525 }, { "epoch": 3.7507634426990446, "grad_norm": 0.0005558205884881318, "learning_rate": 3.5681906481094557e-07, "loss": 0.0, "num_input_tokens_seen": 103480336, "step": 153530 }, { "epoch": 3.750885593530892, "grad_norm": 0.0016473763389512897, "learning_rate": 3.5675376887815577e-07, "loss": 0.0002, "num_input_tokens_seen": 103483728, "step": 153535 }, { "epoch": 3.751007744362739, "grad_norm": 0.011921056546270847, "learning_rate": 3.566884776232255e-07, "loss": 0.0, "num_input_tokens_seen": 103487248, "step": 153540 }, { "epoch": 3.751129895194586, "grad_norm": 0.00046353053767234087, "learning_rate": 3.566231910466293e-07, "loss": 0.0, "num_input_tokens_seen": 103490704, "step": 153545 }, { "epoch": 3.7512520460264334, "grad_norm": 0.001445167581550777, "learning_rate": 3.5655790914884264e-07, "loss": 0.0, "num_input_tokens_seen": 103494160, "step": 153550 }, { "epoch": 3.7513741968582806, "grad_norm": 0.003341602860018611, "learning_rate": 3.5649263193033964e-07, "loss": 0.0002, "num_input_tokens_seen": 103497424, "step": 153555 }, { "epoch": 3.7514963476901277, "grad_norm": 0.005969295743852854, "learning_rate": 3.564273593915953e-07, "loss": 0.0, "num_input_tokens_seen": 103500944, "step": 153560 }, { "epoch": 3.751618498521975, "grad_norm": 0.01053598988801241, "learning_rate": 3.563620915330846e-07, "loss": 0.0, "num_input_tokens_seen": 103504720, "step": 153565 }, { "epoch": 3.751740649353822, "grad_norm": 0.0017889856826514006, "learning_rate": 3.5629682835528153e-07, "loss": 0.0, "num_input_tokens_seen": 103507664, "step": 153570 }, { "epoch": 3.7518628001856693, "grad_norm": 39.256404876708984, "learning_rate": 3.562315698586614e-07, "loss": 0.0563, "num_input_tokens_seen": 103510928, "step": 153575 }, { "epoch": 3.7519849510175165, "grad_norm": 0.0006424240418709815, "learning_rate": 3.561663160436982e-07, "loss": 0.0001, "num_input_tokens_seen": 103514128, "step": 153580 }, { "epoch": 3.7521071018493637, "grad_norm": 0.00029739251476712525, "learning_rate": 3.5610106691086696e-07, "loss": 0.0, "num_input_tokens_seen": 103518096, "step": 153585 }, { "epoch": 3.752229252681211, "grad_norm": 49.057334899902344, "learning_rate": 3.5603582246064165e-07, "loss": 0.0297, "num_input_tokens_seen": 103521424, "step": 153590 }, { "epoch": 3.752351403513058, "grad_norm": 0.0034984839148819447, "learning_rate": 3.55970582693497e-07, "loss": 0.0, "num_input_tokens_seen": 103524880, "step": 153595 }, { "epoch": 3.7524735543449053, "grad_norm": 0.00019109105051029474, "learning_rate": 3.55905347609908e-07, "loss": 0.075, "num_input_tokens_seen": 103528784, "step": 153600 }, { "epoch": 3.752595705176752, "grad_norm": 0.028764499351382256, "learning_rate": 3.55840117210348e-07, "loss": 0.0, "num_input_tokens_seen": 103532560, "step": 153605 }, { "epoch": 3.7527178560085996, "grad_norm": 0.05303001031279564, "learning_rate": 3.557748914952924e-07, "loss": 0.0, "num_input_tokens_seen": 103535504, "step": 153610 }, { "epoch": 3.7528400068404464, "grad_norm": 0.002212024061009288, "learning_rate": 3.557096704652147e-07, "loss": 0.0, "num_input_tokens_seen": 103538448, "step": 153615 }, { "epoch": 3.752962157672294, "grad_norm": 0.0033569305669516325, "learning_rate": 3.5564445412058984e-07, "loss": 0.0, "num_input_tokens_seen": 103541584, "step": 153620 }, { "epoch": 3.7530843085041408, "grad_norm": 0.021345417946577072, "learning_rate": 3.5557924246189153e-07, "loss": 0.0, "num_input_tokens_seen": 103544528, "step": 153625 }, { "epoch": 3.753206459335988, "grad_norm": 0.0057569025084376335, "learning_rate": 3.555140354895947e-07, "loss": 0.0, "num_input_tokens_seen": 103547728, "step": 153630 }, { "epoch": 3.753328610167835, "grad_norm": 9.103876072913408e-05, "learning_rate": 3.5544883320417276e-07, "loss": 0.0, "num_input_tokens_seen": 103550928, "step": 153635 }, { "epoch": 3.7534507609996823, "grad_norm": 0.0010332964593544602, "learning_rate": 3.553836356061005e-07, "loss": 0.0, "num_input_tokens_seen": 103554064, "step": 153640 }, { "epoch": 3.7535729118315295, "grad_norm": 0.3927779793739319, "learning_rate": 3.5531844269585164e-07, "loss": 0.0001, "num_input_tokens_seen": 103557200, "step": 153645 }, { "epoch": 3.7536950626633767, "grad_norm": 0.0036919882986694574, "learning_rate": 3.5525325447390075e-07, "loss": 0.0002, "num_input_tokens_seen": 103560528, "step": 153650 }, { "epoch": 3.753817213495224, "grad_norm": 0.003987174481153488, "learning_rate": 3.5518807094072123e-07, "loss": 0.0001, "num_input_tokens_seen": 103563920, "step": 153655 }, { "epoch": 3.753939364327071, "grad_norm": 65.58088684082031, "learning_rate": 3.5512289209678794e-07, "loss": 0.0536, "num_input_tokens_seen": 103566864, "step": 153660 }, { "epoch": 3.7540615151589183, "grad_norm": 0.0005408762372098863, "learning_rate": 3.55057717942574e-07, "loss": 0.0, "num_input_tokens_seen": 103570960, "step": 153665 }, { "epoch": 3.7541836659907655, "grad_norm": 0.0010324962204322219, "learning_rate": 3.549925484785541e-07, "loss": 0.0003, "num_input_tokens_seen": 103574608, "step": 153670 }, { "epoch": 3.7543058168226127, "grad_norm": 0.002635074546560645, "learning_rate": 3.5492738370520157e-07, "loss": 0.0, "num_input_tokens_seen": 103578832, "step": 153675 }, { "epoch": 3.75442796765446, "grad_norm": 0.0025064474903047085, "learning_rate": 3.54862223622991e-07, "loss": 0.0, "num_input_tokens_seen": 103582032, "step": 153680 }, { "epoch": 3.754550118486307, "grad_norm": 0.0022307131439447403, "learning_rate": 3.5479706823239554e-07, "loss": 0.0, "num_input_tokens_seen": 103585040, "step": 153685 }, { "epoch": 3.754672269318154, "grad_norm": 0.00514277582988143, "learning_rate": 3.5473191753388923e-07, "loss": 0.0003, "num_input_tokens_seen": 103588240, "step": 153690 }, { "epoch": 3.7547944201500014, "grad_norm": 0.008436059579253197, "learning_rate": 3.5466677152794634e-07, "loss": 0.0, "num_input_tokens_seen": 103591632, "step": 153695 }, { "epoch": 3.754916570981848, "grad_norm": 0.002367235254496336, "learning_rate": 3.5460163021503996e-07, "loss": 0.0, "num_input_tokens_seen": 103594960, "step": 153700 }, { "epoch": 3.755038721813696, "grad_norm": 2.3470609188079834, "learning_rate": 3.545364935956445e-07, "loss": 0.0001, "num_input_tokens_seen": 103598544, "step": 153705 }, { "epoch": 3.7551608726455425, "grad_norm": 0.0022390945814549923, "learning_rate": 3.5447136167023286e-07, "loss": 0.0001, "num_input_tokens_seen": 103602064, "step": 153710 }, { "epoch": 3.7552830234773897, "grad_norm": 0.0007741377921774983, "learning_rate": 3.544062344392791e-07, "loss": 0.0, "num_input_tokens_seen": 103605136, "step": 153715 }, { "epoch": 3.755405174309237, "grad_norm": 0.0004159125383011997, "learning_rate": 3.5434111190325724e-07, "loss": 0.0, "num_input_tokens_seen": 103608336, "step": 153720 }, { "epoch": 3.755527325141084, "grad_norm": 0.0019826493225991726, "learning_rate": 3.542759940626401e-07, "loss": 0.0, "num_input_tokens_seen": 103611600, "step": 153725 }, { "epoch": 3.7556494759729313, "grad_norm": 0.08303279429674149, "learning_rate": 3.54210880917902e-07, "loss": 0.0001, "num_input_tokens_seen": 103614480, "step": 153730 }, { "epoch": 3.7557716268047785, "grad_norm": 0.003127832431346178, "learning_rate": 3.541457724695156e-07, "loss": 0.0002, "num_input_tokens_seen": 103617872, "step": 153735 }, { "epoch": 3.7558937776366257, "grad_norm": 0.0069281598553061485, "learning_rate": 3.540806687179553e-07, "loss": 0.002, "num_input_tokens_seen": 103621008, "step": 153740 }, { "epoch": 3.756015928468473, "grad_norm": 0.007652719039469957, "learning_rate": 3.5401556966369405e-07, "loss": 0.0, "num_input_tokens_seen": 103624272, "step": 153745 }, { "epoch": 3.75613807930032, "grad_norm": 0.001770931645296514, "learning_rate": 3.5395047530720513e-07, "loss": 0.0002, "num_input_tokens_seen": 103627216, "step": 153750 }, { "epoch": 3.7562602301321673, "grad_norm": 36.390316009521484, "learning_rate": 3.5388538564896233e-07, "loss": 0.1025, "num_input_tokens_seen": 103630544, "step": 153755 }, { "epoch": 3.7563823809640144, "grad_norm": 0.03158455714583397, "learning_rate": 3.5382030068943845e-07, "loss": 0.0, "num_input_tokens_seen": 103633680, "step": 153760 }, { "epoch": 3.7565045317958616, "grad_norm": 0.0006475714035332203, "learning_rate": 3.5375522042910756e-07, "loss": 0.0, "num_input_tokens_seen": 103636752, "step": 153765 }, { "epoch": 3.756626682627709, "grad_norm": 0.00044167076703161, "learning_rate": 3.5369014486844205e-07, "loss": 0.0, "num_input_tokens_seen": 103639888, "step": 153770 }, { "epoch": 3.756748833459556, "grad_norm": 0.0006852021906524897, "learning_rate": 3.536250740079161e-07, "loss": 0.0, "num_input_tokens_seen": 103643024, "step": 153775 }, { "epoch": 3.756870984291403, "grad_norm": 0.0010416110744699836, "learning_rate": 3.535600078480021e-07, "loss": 0.0006, "num_input_tokens_seen": 103646160, "step": 153780 }, { "epoch": 3.75699313512325, "grad_norm": 53.415870666503906, "learning_rate": 3.5349494638917354e-07, "loss": 0.0691, "num_input_tokens_seen": 103648976, "step": 153785 }, { "epoch": 3.7571152859550976, "grad_norm": 0.0010279123671352863, "learning_rate": 3.53429889631904e-07, "loss": 0.0, "num_input_tokens_seen": 103652112, "step": 153790 }, { "epoch": 3.7572374367869443, "grad_norm": 0.0022128450218588114, "learning_rate": 3.533648375766659e-07, "loss": 0.0, "num_input_tokens_seen": 103655056, "step": 153795 }, { "epoch": 3.7573595876187915, "grad_norm": 0.0005283782375045121, "learning_rate": 3.5329979022393296e-07, "loss": 0.0007, "num_input_tokens_seen": 103658448, "step": 153800 }, { "epoch": 3.7574817384506387, "grad_norm": 0.03994615003466606, "learning_rate": 3.532347475741776e-07, "loss": 0.0, "num_input_tokens_seen": 103661584, "step": 153805 }, { "epoch": 3.757603889282486, "grad_norm": 28.16742515563965, "learning_rate": 3.5316970962787295e-07, "loss": 0.0566, "num_input_tokens_seen": 103665296, "step": 153810 }, { "epoch": 3.757726040114333, "grad_norm": 0.0012795196380466223, "learning_rate": 3.5310467638549256e-07, "loss": 0.0, "num_input_tokens_seen": 103668240, "step": 153815 }, { "epoch": 3.7578481909461803, "grad_norm": 0.0019318463746458292, "learning_rate": 3.5303964784750875e-07, "loss": 0.0, "num_input_tokens_seen": 103671568, "step": 153820 }, { "epoch": 3.7579703417780275, "grad_norm": 0.008436155505478382, "learning_rate": 3.529746240143948e-07, "loss": 0.0001, "num_input_tokens_seen": 103674832, "step": 153825 }, { "epoch": 3.7580924926098747, "grad_norm": 0.0005548412445932627, "learning_rate": 3.5290960488662316e-07, "loss": 0.0, "num_input_tokens_seen": 103679312, "step": 153830 }, { "epoch": 3.758214643441722, "grad_norm": 0.0015689019346609712, "learning_rate": 3.528445904646672e-07, "loss": 0.0, "num_input_tokens_seen": 103683088, "step": 153835 }, { "epoch": 3.758336794273569, "grad_norm": 0.00017826675320975482, "learning_rate": 3.527795807489992e-07, "loss": 0.0751, "num_input_tokens_seen": 103687312, "step": 153840 }, { "epoch": 3.7584589451054162, "grad_norm": 0.058404359966516495, "learning_rate": 3.5271457574009246e-07, "loss": 0.0, "num_input_tokens_seen": 103690832, "step": 153845 }, { "epoch": 3.7585810959372634, "grad_norm": 0.0002906270674429834, "learning_rate": 3.5264957543841935e-07, "loss": 0.0, "num_input_tokens_seen": 103694032, "step": 153850 }, { "epoch": 3.7587032467691106, "grad_norm": 0.0008786988910287619, "learning_rate": 3.5258457984445234e-07, "loss": 0.0, "num_input_tokens_seen": 103697488, "step": 153855 }, { "epoch": 3.758825397600958, "grad_norm": 0.008816512301564217, "learning_rate": 3.5251958895866487e-07, "loss": 0.0, "num_input_tokens_seen": 103701072, "step": 153860 }, { "epoch": 3.758947548432805, "grad_norm": 0.0047731283120810986, "learning_rate": 3.5245460278152863e-07, "loss": 0.0, "num_input_tokens_seen": 103704016, "step": 153865 }, { "epoch": 3.7590696992646517, "grad_norm": 0.24734684824943542, "learning_rate": 3.523896213135167e-07, "loss": 0.0834, "num_input_tokens_seen": 103707472, "step": 153870 }, { "epoch": 3.7591918500964994, "grad_norm": 0.005535752046853304, "learning_rate": 3.523246445551019e-07, "loss": 0.0489, "num_input_tokens_seen": 103711120, "step": 153875 }, { "epoch": 3.759314000928346, "grad_norm": 0.026731478050351143, "learning_rate": 3.5225967250675623e-07, "loss": 0.0, "num_input_tokens_seen": 103714192, "step": 153880 }, { "epoch": 3.7594361517601937, "grad_norm": 0.0007929136045277119, "learning_rate": 3.521947051689528e-07, "loss": 0.0, "num_input_tokens_seen": 103717648, "step": 153885 }, { "epoch": 3.7595583025920405, "grad_norm": 0.01964680105447769, "learning_rate": 3.5212974254216343e-07, "loss": 0.0, "num_input_tokens_seen": 103721104, "step": 153890 }, { "epoch": 3.7596804534238877, "grad_norm": 0.072999969124794, "learning_rate": 3.5206478462686106e-07, "loss": 0.0, "num_input_tokens_seen": 103724432, "step": 153895 }, { "epoch": 3.759802604255735, "grad_norm": 0.005552014335989952, "learning_rate": 3.5199983142351753e-07, "loss": 0.0, "num_input_tokens_seen": 103728016, "step": 153900 }, { "epoch": 3.759924755087582, "grad_norm": 46.817413330078125, "learning_rate": 3.5193488293260554e-07, "loss": 0.0336, "num_input_tokens_seen": 103731472, "step": 153905 }, { "epoch": 3.7600469059194292, "grad_norm": 0.0019489700207486749, "learning_rate": 3.5186993915459773e-07, "loss": 0.104, "num_input_tokens_seen": 103734864, "step": 153910 }, { "epoch": 3.7601690567512764, "grad_norm": 0.019930588081479073, "learning_rate": 3.5180500008996574e-07, "loss": 0.0838, "num_input_tokens_seen": 103738384, "step": 153915 }, { "epoch": 3.7602912075831236, "grad_norm": 0.013438960537314415, "learning_rate": 3.517400657391824e-07, "loss": 0.0201, "num_input_tokens_seen": 103741776, "step": 153920 }, { "epoch": 3.760413358414971, "grad_norm": 0.00897922832518816, "learning_rate": 3.516751361027194e-07, "loss": 0.0004, "num_input_tokens_seen": 103744848, "step": 153925 }, { "epoch": 3.760535509246818, "grad_norm": 0.017759116366505623, "learning_rate": 3.516102111810494e-07, "loss": 0.0003, "num_input_tokens_seen": 103747920, "step": 153930 }, { "epoch": 3.760657660078665, "grad_norm": 0.2966800332069397, "learning_rate": 3.5154529097464413e-07, "loss": 0.0715, "num_input_tokens_seen": 103751056, "step": 153935 }, { "epoch": 3.7607798109105124, "grad_norm": 0.0029404417145997286, "learning_rate": 3.5148037548397616e-07, "loss": 0.0, "num_input_tokens_seen": 103754448, "step": 153940 }, { "epoch": 3.7609019617423596, "grad_norm": 0.018912794068455696, "learning_rate": 3.514154647095171e-07, "loss": 0.0224, "num_input_tokens_seen": 103757648, "step": 153945 }, { "epoch": 3.7610241125742068, "grad_norm": 0.008374440483748913, "learning_rate": 3.5135055865173943e-07, "loss": 0.0, "num_input_tokens_seen": 103760528, "step": 153950 }, { "epoch": 3.761146263406054, "grad_norm": 0.05202620476484299, "learning_rate": 3.512856573111147e-07, "loss": 0.0001, "num_input_tokens_seen": 103763728, "step": 153955 }, { "epoch": 3.761268414237901, "grad_norm": 0.008036044426262379, "learning_rate": 3.512207606881156e-07, "loss": 0.0001, "num_input_tokens_seen": 103766928, "step": 153960 }, { "epoch": 3.761390565069748, "grad_norm": 26.79295539855957, "learning_rate": 3.511558687832131e-07, "loss": 0.0205, "num_input_tokens_seen": 103771280, "step": 153965 }, { "epoch": 3.7615127159015955, "grad_norm": 0.0021897167898714542, "learning_rate": 3.510909815968801e-07, "loss": 0.0001, "num_input_tokens_seen": 103774480, "step": 153970 }, { "epoch": 3.7616348667334423, "grad_norm": 0.1314050406217575, "learning_rate": 3.510260991295876e-07, "loss": 0.0001, "num_input_tokens_seen": 103777552, "step": 153975 }, { "epoch": 3.7617570175652895, "grad_norm": 0.0008685789653100073, "learning_rate": 3.509612213818083e-07, "loss": 0.0, "num_input_tokens_seen": 103781264, "step": 153980 }, { "epoch": 3.7618791683971367, "grad_norm": 0.0010705238673835993, "learning_rate": 3.5089634835401317e-07, "loss": 0.0, "num_input_tokens_seen": 103785104, "step": 153985 }, { "epoch": 3.762001319228984, "grad_norm": 0.0003547889646142721, "learning_rate": 3.5083148004667474e-07, "loss": 0.0003, "num_input_tokens_seen": 103788688, "step": 153990 }, { "epoch": 3.762123470060831, "grad_norm": 0.003215220058336854, "learning_rate": 3.5076661646026396e-07, "loss": 0.0, "num_input_tokens_seen": 103791760, "step": 153995 }, { "epoch": 3.762245620892678, "grad_norm": 0.07013550400733948, "learning_rate": 3.507017575952531e-07, "loss": 0.0002, "num_input_tokens_seen": 103795024, "step": 154000 }, { "epoch": 3.7623677717245254, "grad_norm": 0.0026873864699155092, "learning_rate": 3.5063690345211396e-07, "loss": 0.0, "num_input_tokens_seen": 103798480, "step": 154005 }, { "epoch": 3.7624899225563726, "grad_norm": 0.004696927964687347, "learning_rate": 3.505720540313176e-07, "loss": 0.0002, "num_input_tokens_seen": 103801616, "step": 154010 }, { "epoch": 3.76261207338822, "grad_norm": 0.01624949276447296, "learning_rate": 3.5050720933333634e-07, "loss": 0.0001, "num_input_tokens_seen": 103804496, "step": 154015 }, { "epoch": 3.762734224220067, "grad_norm": 0.003025223733857274, "learning_rate": 3.504423693586409e-07, "loss": 0.0, "num_input_tokens_seen": 103808208, "step": 154020 }, { "epoch": 3.762856375051914, "grad_norm": 83.87301635742188, "learning_rate": 3.503775341077033e-07, "loss": 0.0018, "num_input_tokens_seen": 103811920, "step": 154025 }, { "epoch": 3.7629785258837614, "grad_norm": 0.0039696102030575275, "learning_rate": 3.503127035809953e-07, "loss": 0.0001, "num_input_tokens_seen": 103815120, "step": 154030 }, { "epoch": 3.7631006767156086, "grad_norm": 0.0053939297795295715, "learning_rate": 3.502478777789878e-07, "loss": 0.0, "num_input_tokens_seen": 103818640, "step": 154035 }, { "epoch": 3.7632228275474557, "grad_norm": 0.0019495252054184675, "learning_rate": 3.501830567021528e-07, "loss": 0.0, "num_input_tokens_seen": 103822672, "step": 154040 }, { "epoch": 3.763344978379303, "grad_norm": 0.024338144809007645, "learning_rate": 3.5011824035096104e-07, "loss": 0.0, "num_input_tokens_seen": 103825680, "step": 154045 }, { "epoch": 3.7634671292111497, "grad_norm": 0.026606090366840363, "learning_rate": 3.500534287258846e-07, "loss": 0.0, "num_input_tokens_seen": 103828880, "step": 154050 }, { "epoch": 3.7635892800429973, "grad_norm": 0.0008966674213297665, "learning_rate": 3.4998862182739444e-07, "loss": 0.0, "num_input_tokens_seen": 103832336, "step": 154055 }, { "epoch": 3.763711430874844, "grad_norm": 0.0002649786474648863, "learning_rate": 3.499238196559615e-07, "loss": 0.0, "num_input_tokens_seen": 103836112, "step": 154060 }, { "epoch": 3.7638335817066917, "grad_norm": 0.005120360292494297, "learning_rate": 3.4985902221205775e-07, "loss": 0.0, "num_input_tokens_seen": 103839248, "step": 154065 }, { "epoch": 3.7639557325385384, "grad_norm": 0.07683884352445602, "learning_rate": 3.497942294961537e-07, "loss": 0.0001, "num_input_tokens_seen": 103842512, "step": 154070 }, { "epoch": 3.7640778833703856, "grad_norm": 0.016874030232429504, "learning_rate": 3.497294415087212e-07, "loss": 0.0, "num_input_tokens_seen": 103846544, "step": 154075 }, { "epoch": 3.764200034202233, "grad_norm": 0.004456295166164637, "learning_rate": 3.496646582502308e-07, "loss": 0.043, "num_input_tokens_seen": 103850000, "step": 154080 }, { "epoch": 3.76432218503408, "grad_norm": 0.0006547545199282467, "learning_rate": 3.4959987972115437e-07, "loss": 0.0001, "num_input_tokens_seen": 103853520, "step": 154085 }, { "epoch": 3.764444335865927, "grad_norm": 0.011782302521169186, "learning_rate": 3.49535105921962e-07, "loss": 0.0002, "num_input_tokens_seen": 103856848, "step": 154090 }, { "epoch": 3.7645664866977744, "grad_norm": 36.85483932495117, "learning_rate": 3.494703368531254e-07, "loss": 0.0337, "num_input_tokens_seen": 103860112, "step": 154095 }, { "epoch": 3.7646886375296216, "grad_norm": 0.0012486326741054654, "learning_rate": 3.494055725151158e-07, "loss": 0.0, "num_input_tokens_seen": 103863312, "step": 154100 }, { "epoch": 3.7648107883614688, "grad_norm": 0.0011928676394745708, "learning_rate": 3.4934081290840367e-07, "loss": 0.0, "num_input_tokens_seen": 103866576, "step": 154105 }, { "epoch": 3.764932939193316, "grad_norm": 0.0329124741256237, "learning_rate": 3.492760580334603e-07, "loss": 0.0004, "num_input_tokens_seen": 103869968, "step": 154110 }, { "epoch": 3.765055090025163, "grad_norm": 0.00763581832870841, "learning_rate": 3.492113078907563e-07, "loss": 0.0001, "num_input_tokens_seen": 103873296, "step": 154115 }, { "epoch": 3.7651772408570103, "grad_norm": 0.12216723710298538, "learning_rate": 3.4914656248076256e-07, "loss": 0.0001, "num_input_tokens_seen": 103876560, "step": 154120 }, { "epoch": 3.7652993916888575, "grad_norm": 7.318980351556093e-05, "learning_rate": 3.490818218039504e-07, "loss": 0.0, "num_input_tokens_seen": 103879632, "step": 154125 }, { "epoch": 3.7654215425207047, "grad_norm": 0.14317382872104645, "learning_rate": 3.4901708586079003e-07, "loss": 0.0, "num_input_tokens_seen": 103883216, "step": 154130 }, { "epoch": 3.7655436933525515, "grad_norm": 0.06965726613998413, "learning_rate": 3.4895235465175286e-07, "loss": 0.0, "num_input_tokens_seen": 103886672, "step": 154135 }, { "epoch": 3.765665844184399, "grad_norm": 0.005927124992012978, "learning_rate": 3.488876281773089e-07, "loss": 0.0, "num_input_tokens_seen": 103889680, "step": 154140 }, { "epoch": 3.765787995016246, "grad_norm": 0.0025228143204003572, "learning_rate": 3.4882290643792967e-07, "loss": 0.0001, "num_input_tokens_seen": 103893584, "step": 154145 }, { "epoch": 3.7659101458480935, "grad_norm": 0.0008066084701567888, "learning_rate": 3.4875818943408496e-07, "loss": 0.0, "num_input_tokens_seen": 103897168, "step": 154150 }, { "epoch": 3.76603229667994, "grad_norm": 0.0005203133332543075, "learning_rate": 3.486934771662462e-07, "loss": 0.0, "num_input_tokens_seen": 103900624, "step": 154155 }, { "epoch": 3.7661544475117874, "grad_norm": 0.018812181428074837, "learning_rate": 3.4862876963488375e-07, "loss": 0.0001, "num_input_tokens_seen": 103903376, "step": 154160 }, { "epoch": 3.7662765983436346, "grad_norm": 0.005005580838769674, "learning_rate": 3.4856406684046767e-07, "loss": 0.0143, "num_input_tokens_seen": 103906704, "step": 154165 }, { "epoch": 3.766398749175482, "grad_norm": 0.0023675684351474047, "learning_rate": 3.484993687834693e-07, "loss": 0.0, "num_input_tokens_seen": 103910160, "step": 154170 }, { "epoch": 3.766520900007329, "grad_norm": 0.5319631099700928, "learning_rate": 3.4843467546435836e-07, "loss": 0.0002, "num_input_tokens_seen": 103913424, "step": 154175 }, { "epoch": 3.766643050839176, "grad_norm": 0.001158917206339538, "learning_rate": 3.4836998688360576e-07, "loss": 0.0001, "num_input_tokens_seen": 103916688, "step": 154180 }, { "epoch": 3.7667652016710234, "grad_norm": 0.0007587404688820243, "learning_rate": 3.4830530304168216e-07, "loss": 0.0, "num_input_tokens_seen": 103919952, "step": 154185 }, { "epoch": 3.7668873525028705, "grad_norm": 0.0037255992647260427, "learning_rate": 3.482406239390574e-07, "loss": 0.0001, "num_input_tokens_seen": 103923280, "step": 154190 }, { "epoch": 3.7670095033347177, "grad_norm": 0.00011486539733596146, "learning_rate": 3.4817594957620243e-07, "loss": 0.0, "num_input_tokens_seen": 103926544, "step": 154195 }, { "epoch": 3.767131654166565, "grad_norm": 0.0011237571015954018, "learning_rate": 3.4811127995358693e-07, "loss": 0.0, "num_input_tokens_seen": 103929936, "step": 154200 }, { "epoch": 3.767253804998412, "grad_norm": 0.0001079161957022734, "learning_rate": 3.4804661507168186e-07, "loss": 0.0, "num_input_tokens_seen": 103933456, "step": 154205 }, { "epoch": 3.7673759558302593, "grad_norm": 0.0612591914832592, "learning_rate": 3.4798195493095683e-07, "loss": 0.075, "num_input_tokens_seen": 103936656, "step": 154210 }, { "epoch": 3.7674981066621065, "grad_norm": 0.0009961389005184174, "learning_rate": 3.4791729953188243e-07, "loss": 0.0, "num_input_tokens_seen": 103940048, "step": 154215 }, { "epoch": 3.7676202574939537, "grad_norm": 0.0005742756766267121, "learning_rate": 3.4785264887492914e-07, "loss": 0.0, "num_input_tokens_seen": 103943376, "step": 154220 }, { "epoch": 3.767742408325801, "grad_norm": 0.0006218006019480526, "learning_rate": 3.477880029605665e-07, "loss": 0.0004, "num_input_tokens_seen": 103947216, "step": 154225 }, { "epoch": 3.7678645591576476, "grad_norm": 0.020232796669006348, "learning_rate": 3.477233617892652e-07, "loss": 0.0004, "num_input_tokens_seen": 103951120, "step": 154230 }, { "epoch": 3.7679867099894953, "grad_norm": 0.003948678728193045, "learning_rate": 3.476587253614948e-07, "loss": 0.0, "num_input_tokens_seen": 103954576, "step": 154235 }, { "epoch": 3.768108860821342, "grad_norm": 0.001216245349496603, "learning_rate": 3.4759409367772586e-07, "loss": 0.0001, "num_input_tokens_seen": 103957840, "step": 154240 }, { "epoch": 3.7682310116531896, "grad_norm": 0.00024403842689935118, "learning_rate": 3.475294667384279e-07, "loss": 0.0, "num_input_tokens_seen": 103961296, "step": 154245 }, { "epoch": 3.7683531624850364, "grad_norm": 0.0011553462827578187, "learning_rate": 3.4746484454407135e-07, "loss": 0.0, "num_input_tokens_seen": 103964752, "step": 154250 }, { "epoch": 3.7684753133168836, "grad_norm": 0.003239996265619993, "learning_rate": 3.4740022709512575e-07, "loss": 0.0, "num_input_tokens_seen": 103968592, "step": 154255 }, { "epoch": 3.7685974641487308, "grad_norm": 0.0013838638551533222, "learning_rate": 3.473356143920615e-07, "loss": 0.0869, "num_input_tokens_seen": 103971984, "step": 154260 }, { "epoch": 3.768719614980578, "grad_norm": 0.0005694458959624171, "learning_rate": 3.472710064353478e-07, "loss": 0.0, "num_input_tokens_seen": 103975120, "step": 154265 }, { "epoch": 3.768841765812425, "grad_norm": 0.0004933263990096748, "learning_rate": 3.4720640322545537e-07, "loss": 0.0, "num_input_tokens_seen": 103978512, "step": 154270 }, { "epoch": 3.7689639166442723, "grad_norm": 0.0007346657221205533, "learning_rate": 3.471418047628532e-07, "loss": 0.0, "num_input_tokens_seen": 103981584, "step": 154275 }, { "epoch": 3.7690860674761195, "grad_norm": 0.6496771574020386, "learning_rate": 3.470772110480117e-07, "loss": 0.0003, "num_input_tokens_seen": 103985040, "step": 154280 }, { "epoch": 3.7692082183079667, "grad_norm": 0.0006116953445598483, "learning_rate": 3.4701262208140004e-07, "loss": 0.0, "num_input_tokens_seen": 103988304, "step": 154285 }, { "epoch": 3.769330369139814, "grad_norm": 0.005050478503108025, "learning_rate": 3.4694803786348857e-07, "loss": 0.0, "num_input_tokens_seen": 103991696, "step": 154290 }, { "epoch": 3.769452519971661, "grad_norm": 0.0015640381025150418, "learning_rate": 3.468834583947462e-07, "loss": 0.0, "num_input_tokens_seen": 103995152, "step": 154295 }, { "epoch": 3.7695746708035083, "grad_norm": 0.004787995480000973, "learning_rate": 3.468188836756435e-07, "loss": 0.0, "num_input_tokens_seen": 103998736, "step": 154300 }, { "epoch": 3.7696968216353555, "grad_norm": 0.0012390019837766886, "learning_rate": 3.467543137066491e-07, "loss": 0.0, "num_input_tokens_seen": 104001872, "step": 154305 }, { "epoch": 3.7698189724672027, "grad_norm": 0.003966461401432753, "learning_rate": 3.4668974848823294e-07, "loss": 0.0, "num_input_tokens_seen": 104005392, "step": 154310 }, { "epoch": 3.7699411232990494, "grad_norm": 0.009088732302188873, "learning_rate": 3.4662518802086516e-07, "loss": 0.0005, "num_input_tokens_seen": 104008336, "step": 154315 }, { "epoch": 3.770063274130897, "grad_norm": 0.0006300138775259256, "learning_rate": 3.465606323050143e-07, "loss": 0.0001, "num_input_tokens_seen": 104011408, "step": 154320 }, { "epoch": 3.770185424962744, "grad_norm": 0.0006197233451530337, "learning_rate": 3.4649608134115074e-07, "loss": 0.0, "num_input_tokens_seen": 104014544, "step": 154325 }, { "epoch": 3.7703075757945914, "grad_norm": 0.0037844895850867033, "learning_rate": 3.4643153512974297e-07, "loss": 0.0, "num_input_tokens_seen": 104017616, "step": 154330 }, { "epoch": 3.770429726626438, "grad_norm": 0.0016215958166867495, "learning_rate": 3.463669936712613e-07, "loss": 0.0, "num_input_tokens_seen": 104021008, "step": 154335 }, { "epoch": 3.7705518774582854, "grad_norm": 0.008326614275574684, "learning_rate": 3.463024569661743e-07, "loss": 0.0, "num_input_tokens_seen": 104024208, "step": 154340 }, { "epoch": 3.7706740282901325, "grad_norm": 0.0009549881215207279, "learning_rate": 3.462379250149516e-07, "loss": 0.0, "num_input_tokens_seen": 104027344, "step": 154345 }, { "epoch": 3.7707961791219797, "grad_norm": 2.1092417227919213e-05, "learning_rate": 3.4617339781806296e-07, "loss": 0.0, "num_input_tokens_seen": 104030544, "step": 154350 }, { "epoch": 3.770918329953827, "grad_norm": 2.773488449747674e-05, "learning_rate": 3.4610887537597687e-07, "loss": 0.0, "num_input_tokens_seen": 104034064, "step": 154355 }, { "epoch": 3.771040480785674, "grad_norm": 0.0017581800930202007, "learning_rate": 3.460443576891632e-07, "loss": 0.0001, "num_input_tokens_seen": 104037328, "step": 154360 }, { "epoch": 3.7711626316175213, "grad_norm": 0.0019256254890933633, "learning_rate": 3.4597984475809094e-07, "loss": 0.0, "num_input_tokens_seen": 104040912, "step": 154365 }, { "epoch": 3.7712847824493685, "grad_norm": 0.0006136553129181266, "learning_rate": 3.459153365832288e-07, "loss": 0.0, "num_input_tokens_seen": 104044688, "step": 154370 }, { "epoch": 3.7714069332812157, "grad_norm": 0.00688550528138876, "learning_rate": 3.458508331650465e-07, "loss": 0.0, "num_input_tokens_seen": 104047824, "step": 154375 }, { "epoch": 3.771529084113063, "grad_norm": 0.01938728056848049, "learning_rate": 3.457863345040126e-07, "loss": 0.0002, "num_input_tokens_seen": 104050768, "step": 154380 }, { "epoch": 3.77165123494491, "grad_norm": 0.0019597031641751528, "learning_rate": 3.457218406005968e-07, "loss": 0.0001, "num_input_tokens_seen": 104054480, "step": 154385 }, { "epoch": 3.7717733857767572, "grad_norm": 0.0019120193319395185, "learning_rate": 3.456573514552675e-07, "loss": 0.0, "num_input_tokens_seen": 104057936, "step": 154390 }, { "epoch": 3.7718955366086044, "grad_norm": 0.003979104571044445, "learning_rate": 3.4559286706849424e-07, "loss": 0.0002, "num_input_tokens_seen": 104060944, "step": 154395 }, { "epoch": 3.7720176874404516, "grad_norm": 0.0009600624325685203, "learning_rate": 3.455283874407452e-07, "loss": 0.0, "num_input_tokens_seen": 104064336, "step": 154400 }, { "epoch": 3.772139838272299, "grad_norm": 0.0037294672802090645, "learning_rate": 3.4546391257248985e-07, "loss": 0.0234, "num_input_tokens_seen": 104067600, "step": 154405 }, { "epoch": 3.7722619891041456, "grad_norm": 0.00026753474958240986, "learning_rate": 3.453994424641973e-07, "loss": 0.0, "num_input_tokens_seen": 104071120, "step": 154410 }, { "epoch": 3.772384139935993, "grad_norm": 0.0002816287742462009, "learning_rate": 3.453349771163357e-07, "loss": 0.0274, "num_input_tokens_seen": 104074384, "step": 154415 }, { "epoch": 3.77250629076784, "grad_norm": 0.00024800936807878315, "learning_rate": 3.4527051652937467e-07, "loss": 0.0442, "num_input_tokens_seen": 104077392, "step": 154420 }, { "epoch": 3.772628441599687, "grad_norm": 0.004503862001001835, "learning_rate": 3.452060607037821e-07, "loss": 0.0, "num_input_tokens_seen": 104080656, "step": 154425 }, { "epoch": 3.7727505924315343, "grad_norm": 0.0010897937463596463, "learning_rate": 3.4514160964002725e-07, "loss": 0.0604, "num_input_tokens_seen": 104083792, "step": 154430 }, { "epoch": 3.7728727432633815, "grad_norm": 0.0008708562818355858, "learning_rate": 3.450771633385791e-07, "loss": 0.0, "num_input_tokens_seen": 104086992, "step": 154435 }, { "epoch": 3.7729948940952287, "grad_norm": 0.003654726780951023, "learning_rate": 3.450127217999055e-07, "loss": 0.0, "num_input_tokens_seen": 104090512, "step": 154440 }, { "epoch": 3.773117044927076, "grad_norm": 0.031255174428224564, "learning_rate": 3.44948285024476e-07, "loss": 0.0, "num_input_tokens_seen": 104093776, "step": 154445 }, { "epoch": 3.773239195758923, "grad_norm": 0.00021036301041021943, "learning_rate": 3.4488385301275833e-07, "loss": 0.0, "num_input_tokens_seen": 104097680, "step": 154450 }, { "epoch": 3.7733613465907703, "grad_norm": 0.0038913488388061523, "learning_rate": 3.448194257652219e-07, "loss": 0.0671, "num_input_tokens_seen": 104100688, "step": 154455 }, { "epoch": 3.7734834974226175, "grad_norm": 0.001282164128497243, "learning_rate": 3.447550032823345e-07, "loss": 0.0, "num_input_tokens_seen": 104103824, "step": 154460 }, { "epoch": 3.7736056482544647, "grad_norm": 0.007293464615941048, "learning_rate": 3.446905855645653e-07, "loss": 0.0, "num_input_tokens_seen": 104107216, "step": 154465 }, { "epoch": 3.773727799086312, "grad_norm": 0.0012252561282366514, "learning_rate": 3.4462617261238245e-07, "loss": 0.0787, "num_input_tokens_seen": 104110608, "step": 154470 }, { "epoch": 3.773849949918159, "grad_norm": 0.006050746422261, "learning_rate": 3.4456176442625393e-07, "loss": 0.0001, "num_input_tokens_seen": 104114000, "step": 154475 }, { "epoch": 3.773972100750006, "grad_norm": 0.0004838865133933723, "learning_rate": 3.4449736100664895e-07, "loss": 0.0, "num_input_tokens_seen": 104117648, "step": 154480 }, { "epoch": 3.7740942515818534, "grad_norm": 0.004921168088912964, "learning_rate": 3.4443296235403507e-07, "loss": 0.0, "num_input_tokens_seen": 104120784, "step": 154485 }, { "epoch": 3.7742164024137006, "grad_norm": 0.0025191386230289936, "learning_rate": 3.443685684688814e-07, "loss": 0.0403, "num_input_tokens_seen": 104124432, "step": 154490 }, { "epoch": 3.7743385532455473, "grad_norm": 0.0045335073955357075, "learning_rate": 3.4430417935165547e-07, "loss": 0.0, "num_input_tokens_seen": 104127568, "step": 154495 }, { "epoch": 3.774460704077395, "grad_norm": 0.0010492325527593493, "learning_rate": 3.44239795002826e-07, "loss": 0.0, "num_input_tokens_seen": 104130704, "step": 154500 }, { "epoch": 3.7745828549092417, "grad_norm": 0.013704109005630016, "learning_rate": 3.4417541542286134e-07, "loss": 0.0, "num_input_tokens_seen": 104133776, "step": 154505 }, { "epoch": 3.7747050057410894, "grad_norm": 0.031134042888879776, "learning_rate": 3.4411104061222916e-07, "loss": 0.0, "num_input_tokens_seen": 104137104, "step": 154510 }, { "epoch": 3.774827156572936, "grad_norm": 0.024660363793373108, "learning_rate": 3.4404667057139827e-07, "loss": 0.0001, "num_input_tokens_seen": 104141776, "step": 154515 }, { "epoch": 3.7749493074047833, "grad_norm": 0.002114421920850873, "learning_rate": 3.4398230530083596e-07, "loss": 0.0, "num_input_tokens_seen": 104145680, "step": 154520 }, { "epoch": 3.7750714582366305, "grad_norm": 0.000558870960958302, "learning_rate": 3.4391794480101087e-07, "loss": 0.0, "num_input_tokens_seen": 104148880, "step": 154525 }, { "epoch": 3.7751936090684777, "grad_norm": 0.0018979088636115193, "learning_rate": 3.4385358907239135e-07, "loss": 0.0, "num_input_tokens_seen": 104151952, "step": 154530 }, { "epoch": 3.775315759900325, "grad_norm": 0.019822288304567337, "learning_rate": 3.437892381154446e-07, "loss": 0.06, "num_input_tokens_seen": 104155728, "step": 154535 }, { "epoch": 3.775437910732172, "grad_norm": 0.0004573519399855286, "learning_rate": 3.4372489193063935e-07, "loss": 0.0, "num_input_tokens_seen": 104158672, "step": 154540 }, { "epoch": 3.7755600615640192, "grad_norm": 0.0019705379381775856, "learning_rate": 3.436605505184429e-07, "loss": 0.0, "num_input_tokens_seen": 104162384, "step": 154545 }, { "epoch": 3.7756822123958664, "grad_norm": 0.04014309123158455, "learning_rate": 3.435962138793237e-07, "loss": 0.0, "num_input_tokens_seen": 104165584, "step": 154550 }, { "epoch": 3.7758043632277136, "grad_norm": 0.0014087480958551168, "learning_rate": 3.4353188201374915e-07, "loss": 0.0, "num_input_tokens_seen": 104168848, "step": 154555 }, { "epoch": 3.775926514059561, "grad_norm": 30.96991729736328, "learning_rate": 3.434675549221876e-07, "loss": 0.0313, "num_input_tokens_seen": 104171984, "step": 154560 }, { "epoch": 3.776048664891408, "grad_norm": 0.01162485871464014, "learning_rate": 3.434032326051063e-07, "loss": 0.0, "num_input_tokens_seen": 104174992, "step": 154565 }, { "epoch": 3.776170815723255, "grad_norm": 0.0003187482070643455, "learning_rate": 3.4333891506297365e-07, "loss": 0.0, "num_input_tokens_seen": 104178512, "step": 154570 }, { "epoch": 3.7762929665551024, "grad_norm": 0.012058389373123646, "learning_rate": 3.432746022962566e-07, "loss": 0.0, "num_input_tokens_seen": 104181904, "step": 154575 }, { "epoch": 3.7764151173869496, "grad_norm": 0.019850589334964752, "learning_rate": 3.432102943054237e-07, "loss": 0.0, "num_input_tokens_seen": 104185104, "step": 154580 }, { "epoch": 3.7765372682187968, "grad_norm": 0.0027016245294362307, "learning_rate": 3.4314599109094176e-07, "loss": 0.0, "num_input_tokens_seen": 104188304, "step": 154585 }, { "epoch": 3.7766594190506435, "grad_norm": 0.0011109261540696025, "learning_rate": 3.4308169265327926e-07, "loss": 0.0, "num_input_tokens_seen": 104191504, "step": 154590 }, { "epoch": 3.776781569882491, "grad_norm": 29.25566864013672, "learning_rate": 3.4301739899290303e-07, "loss": 0.0255, "num_input_tokens_seen": 104194960, "step": 154595 }, { "epoch": 3.776903720714338, "grad_norm": 0.004831159487366676, "learning_rate": 3.429531101102814e-07, "loss": 0.0001, "num_input_tokens_seen": 104198864, "step": 154600 }, { "epoch": 3.777025871546185, "grad_norm": 0.05014396086335182, "learning_rate": 3.42888826005881e-07, "loss": 0.0, "num_input_tokens_seen": 104202448, "step": 154605 }, { "epoch": 3.7771480223780323, "grad_norm": 0.001068123267032206, "learning_rate": 3.428245466801701e-07, "loss": 0.0, "num_input_tokens_seen": 104205904, "step": 154610 }, { "epoch": 3.7772701732098795, "grad_norm": 0.07515928149223328, "learning_rate": 3.427602721336157e-07, "loss": 0.0, "num_input_tokens_seen": 104208976, "step": 154615 }, { "epoch": 3.7773923240417266, "grad_norm": 0.008085944689810276, "learning_rate": 3.426960023666853e-07, "loss": 0.0538, "num_input_tokens_seen": 104212240, "step": 154620 }, { "epoch": 3.777514474873574, "grad_norm": 0.004900203552097082, "learning_rate": 3.426317373798466e-07, "loss": 0.0, "num_input_tokens_seen": 104216080, "step": 154625 }, { "epoch": 3.777636625705421, "grad_norm": 0.11813540011644363, "learning_rate": 3.425674771735665e-07, "loss": 0.0001, "num_input_tokens_seen": 104219536, "step": 154630 }, { "epoch": 3.777758776537268, "grad_norm": 0.0009974022395908833, "learning_rate": 3.4250322174831294e-07, "loss": 0.0, "num_input_tokens_seen": 104222864, "step": 154635 }, { "epoch": 3.7778809273691154, "grad_norm": 0.00033019485999830067, "learning_rate": 3.424389711045523e-07, "loss": 0.0, "num_input_tokens_seen": 104226448, "step": 154640 }, { "epoch": 3.7780030782009626, "grad_norm": 0.1484830528497696, "learning_rate": 3.4237472524275266e-07, "loss": 0.0001, "num_input_tokens_seen": 104229904, "step": 154645 }, { "epoch": 3.77812522903281, "grad_norm": 0.05555364117026329, "learning_rate": 3.423104841633807e-07, "loss": 0.0, "num_input_tokens_seen": 104233488, "step": 154650 }, { "epoch": 3.778247379864657, "grad_norm": 0.001281336648389697, "learning_rate": 3.422462478669037e-07, "loss": 0.0002, "num_input_tokens_seen": 104237264, "step": 154655 }, { "epoch": 3.778369530696504, "grad_norm": 0.02976313605904579, "learning_rate": 3.4218201635378927e-07, "loss": 0.0515, "num_input_tokens_seen": 104240336, "step": 154660 }, { "epoch": 3.7784916815283514, "grad_norm": 0.000876471633091569, "learning_rate": 3.4211778962450376e-07, "loss": 0.0001, "num_input_tokens_seen": 104243984, "step": 154665 }, { "epoch": 3.7786138323601985, "grad_norm": 0.0004510780854616314, "learning_rate": 3.4205356767951497e-07, "loss": 0.0, "num_input_tokens_seen": 104246992, "step": 154670 }, { "epoch": 3.7787359831920453, "grad_norm": 0.000282302382402122, "learning_rate": 3.4198935051928967e-07, "loss": 0.1016, "num_input_tokens_seen": 104250320, "step": 154675 }, { "epoch": 3.778858134023893, "grad_norm": 0.001659899833612144, "learning_rate": 3.419251381442945e-07, "loss": 0.0591, "num_input_tokens_seen": 104253712, "step": 154680 }, { "epoch": 3.7789802848557397, "grad_norm": 0.0009223993984051049, "learning_rate": 3.41860930554997e-07, "loss": 0.0, "num_input_tokens_seen": 104257104, "step": 154685 }, { "epoch": 3.7791024356875873, "grad_norm": 0.002383466577157378, "learning_rate": 3.4179672775186344e-07, "loss": 0.0341, "num_input_tokens_seen": 104260368, "step": 154690 }, { "epoch": 3.779224586519434, "grad_norm": 0.00289145833812654, "learning_rate": 3.417325297353615e-07, "loss": 0.0, "num_input_tokens_seen": 104263696, "step": 154695 }, { "epoch": 3.7793467373512812, "grad_norm": 0.004132087808102369, "learning_rate": 3.4166833650595725e-07, "loss": 0.0, "num_input_tokens_seen": 104267088, "step": 154700 }, { "epoch": 3.7794688881831284, "grad_norm": 0.0008576092659495771, "learning_rate": 3.4160414806411844e-07, "loss": 0.0, "num_input_tokens_seen": 104269968, "step": 154705 }, { "epoch": 3.7795910390149756, "grad_norm": 0.03509823605418205, "learning_rate": 3.4153996441031086e-07, "loss": 0.0002, "num_input_tokens_seen": 104272848, "step": 154710 }, { "epoch": 3.779713189846823, "grad_norm": 0.010340539738535881, "learning_rate": 3.4147578554500177e-07, "loss": 0.0, "num_input_tokens_seen": 104276176, "step": 154715 }, { "epoch": 3.77983534067867, "grad_norm": 0.07541664689779282, "learning_rate": 3.4141161146865825e-07, "loss": 0.0, "num_input_tokens_seen": 104279120, "step": 154720 }, { "epoch": 3.779957491510517, "grad_norm": 0.07795143872499466, "learning_rate": 3.413474421817464e-07, "loss": 0.0003, "num_input_tokens_seen": 104282704, "step": 154725 }, { "epoch": 3.7800796423423644, "grad_norm": 0.003080329392105341, "learning_rate": 3.412832776847333e-07, "loss": 0.0001, "num_input_tokens_seen": 104285904, "step": 154730 }, { "epoch": 3.7802017931742116, "grad_norm": 0.12578627467155457, "learning_rate": 3.412191179780851e-07, "loss": 0.0, "num_input_tokens_seen": 104289040, "step": 154735 }, { "epoch": 3.7803239440060588, "grad_norm": 0.0012306523276492953, "learning_rate": 3.4115496306226863e-07, "loss": 0.0, "num_input_tokens_seen": 104292368, "step": 154740 }, { "epoch": 3.780446094837906, "grad_norm": 0.002234894782304764, "learning_rate": 3.410908129377509e-07, "loss": 0.0, "num_input_tokens_seen": 104296144, "step": 154745 }, { "epoch": 3.780568245669753, "grad_norm": 0.003981317859143019, "learning_rate": 3.4102666760499753e-07, "loss": 0.0, "num_input_tokens_seen": 104299600, "step": 154750 }, { "epoch": 3.7806903965016003, "grad_norm": 0.007521201390773058, "learning_rate": 3.4096252706447595e-07, "loss": 0.0, "num_input_tokens_seen": 104303120, "step": 154755 }, { "epoch": 3.780812547333447, "grad_norm": 0.0024240443017333746, "learning_rate": 3.4089839131665175e-07, "loss": 0.0566, "num_input_tokens_seen": 104306128, "step": 154760 }, { "epoch": 3.7809346981652947, "grad_norm": 0.0009506293572485447, "learning_rate": 3.4083426036199203e-07, "loss": 0.0, "num_input_tokens_seen": 104309328, "step": 154765 }, { "epoch": 3.7810568489971415, "grad_norm": 0.0022911010310053825, "learning_rate": 3.4077013420096255e-07, "loss": 0.0006, "num_input_tokens_seen": 104312400, "step": 154770 }, { "epoch": 3.781178999828989, "grad_norm": 0.009430350735783577, "learning_rate": 3.4070601283403033e-07, "loss": 0.0, "num_input_tokens_seen": 104315600, "step": 154775 }, { "epoch": 3.781301150660836, "grad_norm": 0.002623270731419325, "learning_rate": 3.406418962616612e-07, "loss": 0.0, "num_input_tokens_seen": 104318544, "step": 154780 }, { "epoch": 3.781423301492683, "grad_norm": 0.014037542045116425, "learning_rate": 3.4057778448432127e-07, "loss": 0.0, "num_input_tokens_seen": 104322064, "step": 154785 }, { "epoch": 3.78154545232453, "grad_norm": 0.0013723873998969793, "learning_rate": 3.405136775024775e-07, "loss": 0.0, "num_input_tokens_seen": 104325776, "step": 154790 }, { "epoch": 3.7816676031563774, "grad_norm": 0.007315513212233782, "learning_rate": 3.4044957531659514e-07, "loss": 0.0453, "num_input_tokens_seen": 104329360, "step": 154795 }, { "epoch": 3.7817897539882246, "grad_norm": 0.005057999864220619, "learning_rate": 3.4038547792714135e-07, "loss": 0.0, "num_input_tokens_seen": 104332880, "step": 154800 }, { "epoch": 3.781911904820072, "grad_norm": 87.0716552734375, "learning_rate": 3.403213853345813e-07, "loss": 0.0607, "num_input_tokens_seen": 104336528, "step": 154805 }, { "epoch": 3.782034055651919, "grad_norm": 0.013916331343352795, "learning_rate": 3.402572975393817e-07, "loss": 0.0, "num_input_tokens_seen": 104339600, "step": 154810 }, { "epoch": 3.782156206483766, "grad_norm": 1.0355695486068726, "learning_rate": 3.401932145420088e-07, "loss": 0.0678, "num_input_tokens_seen": 104343248, "step": 154815 }, { "epoch": 3.7822783573156133, "grad_norm": 0.01644453965127468, "learning_rate": 3.4012913634292796e-07, "loss": 0.0001, "num_input_tokens_seen": 104346640, "step": 154820 }, { "epoch": 3.7824005081474605, "grad_norm": 0.08392349630594254, "learning_rate": 3.400650629426057e-07, "loss": 0.0, "num_input_tokens_seen": 104350352, "step": 154825 }, { "epoch": 3.7825226589793077, "grad_norm": 0.0005958887049928308, "learning_rate": 3.400009943415076e-07, "loss": 0.0001, "num_input_tokens_seen": 104353424, "step": 154830 }, { "epoch": 3.782644809811155, "grad_norm": 0.0016513823065906763, "learning_rate": 3.3993693054009986e-07, "loss": 0.0001, "num_input_tokens_seen": 104356944, "step": 154835 }, { "epoch": 3.782766960643002, "grad_norm": 0.0036543281748890877, "learning_rate": 3.3987287153884856e-07, "loss": 0.0, "num_input_tokens_seen": 104360400, "step": 154840 }, { "epoch": 3.7828891114748493, "grad_norm": 0.00038208148907870054, "learning_rate": 3.3980881733821895e-07, "loss": 0.0, "num_input_tokens_seen": 104363728, "step": 154845 }, { "epoch": 3.7830112623066965, "grad_norm": 0.0020735624711960554, "learning_rate": 3.3974476793867755e-07, "loss": 0.0383, "num_input_tokens_seen": 104366992, "step": 154850 }, { "epoch": 3.7831334131385432, "grad_norm": 0.05217864736914635, "learning_rate": 3.396807233406894e-07, "loss": 0.0, "num_input_tokens_seen": 104370128, "step": 154855 }, { "epoch": 3.783255563970391, "grad_norm": 0.0008077344973571599, "learning_rate": 3.3961668354472107e-07, "loss": 0.0, "num_input_tokens_seen": 104373264, "step": 154860 }, { "epoch": 3.7833777148022376, "grad_norm": 0.0003835418028756976, "learning_rate": 3.3955264855123747e-07, "loss": 0.0, "num_input_tokens_seen": 104376336, "step": 154865 }, { "epoch": 3.783499865634085, "grad_norm": 0.005073685199022293, "learning_rate": 3.3948861836070463e-07, "loss": 0.0013, "num_input_tokens_seen": 104379408, "step": 154870 }, { "epoch": 3.783622016465932, "grad_norm": 0.001436670427210629, "learning_rate": 3.394245929735885e-07, "loss": 0.0001, "num_input_tokens_seen": 104382864, "step": 154875 }, { "epoch": 3.783744167297779, "grad_norm": 0.006241243798285723, "learning_rate": 3.3936057239035445e-07, "loss": 0.0001, "num_input_tokens_seen": 104386448, "step": 154880 }, { "epoch": 3.7838663181296264, "grad_norm": 0.00015113978588487953, "learning_rate": 3.392965566114676e-07, "loss": 0.0001, "num_input_tokens_seen": 104389968, "step": 154885 }, { "epoch": 3.7839884689614736, "grad_norm": 0.0023052634205669165, "learning_rate": 3.392325456373943e-07, "loss": 0.0, "num_input_tokens_seen": 104393424, "step": 154890 }, { "epoch": 3.7841106197933208, "grad_norm": 0.005526892840862274, "learning_rate": 3.3916853946859936e-07, "loss": 0.0, "num_input_tokens_seen": 104396304, "step": 154895 }, { "epoch": 3.784232770625168, "grad_norm": 0.008645119145512581, "learning_rate": 3.3910453810554884e-07, "loss": 0.0, "num_input_tokens_seen": 104399568, "step": 154900 }, { "epoch": 3.784354921457015, "grad_norm": 2.139812204404734e-05, "learning_rate": 3.390405415487075e-07, "loss": 0.0, "num_input_tokens_seen": 104402896, "step": 154905 }, { "epoch": 3.7844770722888623, "grad_norm": 0.0003892584063578397, "learning_rate": 3.389765497985415e-07, "loss": 0.0, "num_input_tokens_seen": 104405904, "step": 154910 }, { "epoch": 3.7845992231207095, "grad_norm": 0.0018738510552793741, "learning_rate": 3.389125628555155e-07, "loss": 0.0, "num_input_tokens_seen": 104409360, "step": 154915 }, { "epoch": 3.7847213739525567, "grad_norm": 0.0837446004152298, "learning_rate": 3.3884858072009546e-07, "loss": 0.0, "num_input_tokens_seen": 104412112, "step": 154920 }, { "epoch": 3.784843524784404, "grad_norm": 0.001453855657018721, "learning_rate": 3.387846033927461e-07, "loss": 0.0, "num_input_tokens_seen": 104415632, "step": 154925 }, { "epoch": 3.784965675616251, "grad_norm": 0.0007801406900398433, "learning_rate": 3.387206308739329e-07, "loss": 0.0436, "num_input_tokens_seen": 104418832, "step": 154930 }, { "epoch": 3.7850878264480983, "grad_norm": 5.12362239533104e-05, "learning_rate": 3.3865666316412143e-07, "loss": 0.0, "num_input_tokens_seen": 104422224, "step": 154935 }, { "epoch": 3.785209977279945, "grad_norm": 0.0006093584233894944, "learning_rate": 3.385927002637763e-07, "loss": 0.0181, "num_input_tokens_seen": 104425936, "step": 154940 }, { "epoch": 3.7853321281117926, "grad_norm": 0.0009082345641218126, "learning_rate": 3.3852874217336323e-07, "loss": 0.0, "num_input_tokens_seen": 104429328, "step": 154945 }, { "epoch": 3.7854542789436394, "grad_norm": 0.0011150073260068893, "learning_rate": 3.3846478889334673e-07, "loss": 0.0, "num_input_tokens_seen": 104433168, "step": 154950 }, { "epoch": 3.785576429775487, "grad_norm": 0.001950499601662159, "learning_rate": 3.384008404241926e-07, "loss": 0.092, "num_input_tokens_seen": 104436304, "step": 154955 }, { "epoch": 3.7856985806073338, "grad_norm": 0.0005495021468959749, "learning_rate": 3.3833689676636525e-07, "loss": 0.0, "num_input_tokens_seen": 104439760, "step": 154960 }, { "epoch": 3.785820731439181, "grad_norm": 0.0005154844257049263, "learning_rate": 3.3827295792032984e-07, "loss": 0.0001, "num_input_tokens_seen": 104443088, "step": 154965 }, { "epoch": 3.785942882271028, "grad_norm": 0.0010477579198777676, "learning_rate": 3.382090238865518e-07, "loss": 0.0, "num_input_tokens_seen": 104446800, "step": 154970 }, { "epoch": 3.7860650331028753, "grad_norm": 0.005031648091971874, "learning_rate": 3.3814509466549545e-07, "loss": 0.0001, "num_input_tokens_seen": 104450768, "step": 154975 }, { "epoch": 3.7861871839347225, "grad_norm": 0.14880934357643127, "learning_rate": 3.3808117025762626e-07, "loss": 0.0002, "num_input_tokens_seen": 104454032, "step": 154980 }, { "epoch": 3.7863093347665697, "grad_norm": 0.00034068856621161103, "learning_rate": 3.380172506634089e-07, "loss": 0.0, "num_input_tokens_seen": 104457744, "step": 154985 }, { "epoch": 3.786431485598417, "grad_norm": 0.009613665752112865, "learning_rate": 3.379533358833078e-07, "loss": 0.1035, "num_input_tokens_seen": 104460880, "step": 154990 }, { "epoch": 3.786553636430264, "grad_norm": 0.0013408566592261195, "learning_rate": 3.3788942591778836e-07, "loss": 0.0, "num_input_tokens_seen": 104464208, "step": 154995 }, { "epoch": 3.7866757872621113, "grad_norm": 0.00530999107286334, "learning_rate": 3.3782552076731487e-07, "loss": 0.0, "num_input_tokens_seen": 104467280, "step": 155000 }, { "epoch": 3.7867979380939585, "grad_norm": 0.00015352222544606775, "learning_rate": 3.377616204323526e-07, "loss": 0.0, "num_input_tokens_seen": 104470800, "step": 155005 }, { "epoch": 3.7869200889258057, "grad_norm": 0.008193439804017544, "learning_rate": 3.3769772491336554e-07, "loss": 0.0, "num_input_tokens_seen": 104473872, "step": 155010 }, { "epoch": 3.787042239757653, "grad_norm": 0.001924472744576633, "learning_rate": 3.3763383421081927e-07, "loss": 0.0, "num_input_tokens_seen": 104477136, "step": 155015 }, { "epoch": 3.7871643905895, "grad_norm": 0.006006884854286909, "learning_rate": 3.3756994832517737e-07, "loss": 0.0, "num_input_tokens_seen": 104480592, "step": 155020 }, { "epoch": 3.7872865414213472, "grad_norm": 0.0006222435622476041, "learning_rate": 3.3750606725690513e-07, "loss": 0.0, "num_input_tokens_seen": 104483920, "step": 155025 }, { "epoch": 3.7874086922531944, "grad_norm": 0.009881896898150444, "learning_rate": 3.374421910064672e-07, "loss": 0.0, "num_input_tokens_seen": 104487056, "step": 155030 }, { "epoch": 3.787530843085041, "grad_norm": 0.07544638216495514, "learning_rate": 3.3737831957432763e-07, "loss": 0.0, "num_input_tokens_seen": 104490320, "step": 155035 }, { "epoch": 3.787652993916889, "grad_norm": 0.0014049513265490532, "learning_rate": 3.373144529609514e-07, "loss": 0.0, "num_input_tokens_seen": 104493328, "step": 155040 }, { "epoch": 3.7877751447487356, "grad_norm": 0.002917036646977067, "learning_rate": 3.3725059116680245e-07, "loss": 0.0446, "num_input_tokens_seen": 104496400, "step": 155045 }, { "epoch": 3.7878972955805827, "grad_norm": 8.429832087131217e-05, "learning_rate": 3.3718673419234565e-07, "loss": 0.0003, "num_input_tokens_seen": 104499856, "step": 155050 }, { "epoch": 3.78801944641243, "grad_norm": 0.09984175115823746, "learning_rate": 3.37122882038045e-07, "loss": 0.0, "num_input_tokens_seen": 104503696, "step": 155055 }, { "epoch": 3.788141597244277, "grad_norm": 0.0014615656109526753, "learning_rate": 3.3705903470436504e-07, "loss": 0.0, "num_input_tokens_seen": 104506768, "step": 155060 }, { "epoch": 3.7882637480761243, "grad_norm": 0.005124914925545454, "learning_rate": 3.369951921917703e-07, "loss": 0.0, "num_input_tokens_seen": 104510160, "step": 155065 }, { "epoch": 3.7883858989079715, "grad_norm": 0.00016769897774793208, "learning_rate": 3.369313545007246e-07, "loss": 0.0001, "num_input_tokens_seen": 104513680, "step": 155070 }, { "epoch": 3.7885080497398187, "grad_norm": 0.0012042616726830602, "learning_rate": 3.3686752163169275e-07, "loss": 0.0002, "num_input_tokens_seen": 104517200, "step": 155075 }, { "epoch": 3.788630200571666, "grad_norm": 0.0004410717519931495, "learning_rate": 3.368036935851384e-07, "loss": 0.0, "num_input_tokens_seen": 104520784, "step": 155080 }, { "epoch": 3.788752351403513, "grad_norm": 0.00887715257704258, "learning_rate": 3.367398703615262e-07, "loss": 0.0714, "num_input_tokens_seen": 104524496, "step": 155085 }, { "epoch": 3.7888745022353603, "grad_norm": 0.0005934710497967899, "learning_rate": 3.366760519613201e-07, "loss": 0.0, "num_input_tokens_seen": 104527888, "step": 155090 }, { "epoch": 3.7889966530672075, "grad_norm": 0.000675864634104073, "learning_rate": 3.3661223838498374e-07, "loss": 0.0, "num_input_tokens_seen": 104531408, "step": 155095 }, { "epoch": 3.7891188038990546, "grad_norm": 0.005420152563601732, "learning_rate": 3.36548429632982e-07, "loss": 0.0001, "num_input_tokens_seen": 104535056, "step": 155100 }, { "epoch": 3.789240954730902, "grad_norm": 0.003639468690380454, "learning_rate": 3.364846257057783e-07, "loss": 0.0, "num_input_tokens_seen": 104538320, "step": 155105 }, { "epoch": 3.789363105562749, "grad_norm": 0.005886682774871588, "learning_rate": 3.364208266038371e-07, "loss": 0.0002, "num_input_tokens_seen": 104541648, "step": 155110 }, { "epoch": 3.789485256394596, "grad_norm": 0.0007768772193230689, "learning_rate": 3.363570323276218e-07, "loss": 0.0, "num_input_tokens_seen": 104544784, "step": 155115 }, { "epoch": 3.789607407226443, "grad_norm": 0.0006768378661945462, "learning_rate": 3.3629324287759666e-07, "loss": 0.0, "num_input_tokens_seen": 104548304, "step": 155120 }, { "epoch": 3.7897295580582906, "grad_norm": 0.0010963869281113148, "learning_rate": 3.362294582542259e-07, "loss": 0.0193, "num_input_tokens_seen": 104551952, "step": 155125 }, { "epoch": 3.7898517088901373, "grad_norm": 0.001188731868751347, "learning_rate": 3.3616567845797273e-07, "loss": 0.0, "num_input_tokens_seen": 104555728, "step": 155130 }, { "epoch": 3.789973859721985, "grad_norm": 0.0016654833452776074, "learning_rate": 3.3610190348930157e-07, "loss": 0.0002, "num_input_tokens_seen": 104558928, "step": 155135 }, { "epoch": 3.7900960105538317, "grad_norm": 0.006672716233879328, "learning_rate": 3.360381333486757e-07, "loss": 0.0001, "num_input_tokens_seen": 104562512, "step": 155140 }, { "epoch": 3.790218161385679, "grad_norm": 0.00013601829414255917, "learning_rate": 3.359743680365591e-07, "loss": 0.0, "num_input_tokens_seen": 104566096, "step": 155145 }, { "epoch": 3.790340312217526, "grad_norm": 0.001462005078792572, "learning_rate": 3.3591060755341583e-07, "loss": 0.0, "num_input_tokens_seen": 104569872, "step": 155150 }, { "epoch": 3.7904624630493733, "grad_norm": 0.005027350969612598, "learning_rate": 3.3584685189970886e-07, "loss": 0.0, "num_input_tokens_seen": 104573648, "step": 155155 }, { "epoch": 3.7905846138812205, "grad_norm": 0.0005566252511925995, "learning_rate": 3.357831010759026e-07, "loss": 0.2054, "num_input_tokens_seen": 104577936, "step": 155160 }, { "epoch": 3.7907067647130677, "grad_norm": 0.00025076669408008456, "learning_rate": 3.3571935508245986e-07, "loss": 0.0, "num_input_tokens_seen": 104581392, "step": 155165 }, { "epoch": 3.790828915544915, "grad_norm": 0.011523941531777382, "learning_rate": 3.35655613919845e-07, "loss": 0.0, "num_input_tokens_seen": 104584784, "step": 155170 }, { "epoch": 3.790951066376762, "grad_norm": 0.008374925702810287, "learning_rate": 3.355918775885209e-07, "loss": 0.0, "num_input_tokens_seen": 104587856, "step": 155175 }, { "epoch": 3.7910732172086092, "grad_norm": 0.005891414824873209, "learning_rate": 3.355281460889514e-07, "loss": 0.0, "num_input_tokens_seen": 104591056, "step": 155180 }, { "epoch": 3.7911953680404564, "grad_norm": 25.315025329589844, "learning_rate": 3.3546441942160033e-07, "loss": 0.1111, "num_input_tokens_seen": 104594192, "step": 155185 }, { "epoch": 3.7913175188723036, "grad_norm": 0.021479131653904915, "learning_rate": 3.3540069758693056e-07, "loss": 0.0, "num_input_tokens_seen": 104597456, "step": 155190 }, { "epoch": 3.791439669704151, "grad_norm": 0.0012764681596308947, "learning_rate": 3.353369805854055e-07, "loss": 0.0, "num_input_tokens_seen": 104600720, "step": 155195 }, { "epoch": 3.791561820535998, "grad_norm": 0.025395803153514862, "learning_rate": 3.3527326841748894e-07, "loss": 0.0, "num_input_tokens_seen": 104603920, "step": 155200 }, { "epoch": 3.7916839713678447, "grad_norm": 0.2535885274410248, "learning_rate": 3.3520956108364397e-07, "loss": 0.0004, "num_input_tokens_seen": 104607056, "step": 155205 }, { "epoch": 3.7918061221996924, "grad_norm": 0.0025765015743672848, "learning_rate": 3.351458585843335e-07, "loss": 0.0, "num_input_tokens_seen": 104610192, "step": 155210 }, { "epoch": 3.791928273031539, "grad_norm": 4.375946082291193e-05, "learning_rate": 3.350821609200213e-07, "loss": 0.0, "num_input_tokens_seen": 104613584, "step": 155215 }, { "epoch": 3.7920504238633868, "grad_norm": 0.0034888247027993202, "learning_rate": 3.3501846809117075e-07, "loss": 0.0, "num_input_tokens_seen": 104617104, "step": 155220 }, { "epoch": 3.7921725746952335, "grad_norm": 0.0013499618507921696, "learning_rate": 3.349547800982444e-07, "loss": 0.0235, "num_input_tokens_seen": 104620304, "step": 155225 }, { "epoch": 3.7922947255270807, "grad_norm": 0.01719053089618683, "learning_rate": 3.3489109694170604e-07, "loss": 0.0, "num_input_tokens_seen": 104623888, "step": 155230 }, { "epoch": 3.792416876358928, "grad_norm": 0.0007886372623033822, "learning_rate": 3.3482741862201827e-07, "loss": 0.0, "num_input_tokens_seen": 104627600, "step": 155235 }, { "epoch": 3.792539027190775, "grad_norm": 0.0006685940898023546, "learning_rate": 3.3476374513964444e-07, "loss": 0.0, "num_input_tokens_seen": 104630928, "step": 155240 }, { "epoch": 3.7926611780226223, "grad_norm": 0.004218700807541609, "learning_rate": 3.3470007649504783e-07, "loss": 0.0, "num_input_tokens_seen": 104634128, "step": 155245 }, { "epoch": 3.7927833288544694, "grad_norm": 0.0038865339010953903, "learning_rate": 3.3463641268869093e-07, "loss": 0.0, "num_input_tokens_seen": 104637776, "step": 155250 }, { "epoch": 3.7929054796863166, "grad_norm": 0.0005819426150992513, "learning_rate": 3.345727537210373e-07, "loss": 0.0, "num_input_tokens_seen": 104640848, "step": 155255 }, { "epoch": 3.793027630518164, "grad_norm": 0.00021406644373200834, "learning_rate": 3.3450909959254937e-07, "loss": 0.0005, "num_input_tokens_seen": 104643984, "step": 155260 }, { "epoch": 3.793149781350011, "grad_norm": 3.5937824577558786e-05, "learning_rate": 3.344454503036904e-07, "loss": 0.0006, "num_input_tokens_seen": 104647120, "step": 155265 }, { "epoch": 3.793271932181858, "grad_norm": 0.001609228434972465, "learning_rate": 3.3438180585492294e-07, "loss": 0.0, "num_input_tokens_seen": 104650640, "step": 155270 }, { "epoch": 3.7933940830137054, "grad_norm": 0.006711960770189762, "learning_rate": 3.3431816624670995e-07, "loss": 0.0, "num_input_tokens_seen": 104653648, "step": 155275 }, { "epoch": 3.7935162338455526, "grad_norm": 0.000300501094898209, "learning_rate": 3.3425453147951466e-07, "loss": 0.0001, "num_input_tokens_seen": 104656656, "step": 155280 }, { "epoch": 3.7936383846773998, "grad_norm": 0.0032590716145932674, "learning_rate": 3.3419090155379913e-07, "loss": 0.0001, "num_input_tokens_seen": 104659792, "step": 155285 }, { "epoch": 3.793760535509247, "grad_norm": 0.03171639144420624, "learning_rate": 3.341272764700268e-07, "loss": 0.0, "num_input_tokens_seen": 104663248, "step": 155290 }, { "epoch": 3.793882686341094, "grad_norm": 0.0020887863356620073, "learning_rate": 3.340636562286601e-07, "loss": 0.0, "num_input_tokens_seen": 104666512, "step": 155295 }, { "epoch": 3.794004837172941, "grad_norm": 0.0004597719816956669, "learning_rate": 3.340000408301611e-07, "loss": 0.0, "num_input_tokens_seen": 104669648, "step": 155300 }, { "epoch": 3.7941269880047885, "grad_norm": 0.001281373668462038, "learning_rate": 3.339364302749933e-07, "loss": 0.0, "num_input_tokens_seen": 104672912, "step": 155305 }, { "epoch": 3.7942491388366353, "grad_norm": 0.006146824918687344, "learning_rate": 3.3387282456361867e-07, "loss": 0.0, "num_input_tokens_seen": 104676304, "step": 155310 }, { "epoch": 3.794371289668483, "grad_norm": 0.0005169722135178745, "learning_rate": 3.3380922369650035e-07, "loss": 0.0952, "num_input_tokens_seen": 104679632, "step": 155315 }, { "epoch": 3.7944934405003297, "grad_norm": 0.01592920534312725, "learning_rate": 3.337456276741002e-07, "loss": 0.0, "num_input_tokens_seen": 104682704, "step": 155320 }, { "epoch": 3.794615591332177, "grad_norm": 0.002830099780112505, "learning_rate": 3.336820364968813e-07, "loss": 0.0, "num_input_tokens_seen": 104686352, "step": 155325 }, { "epoch": 3.794737742164024, "grad_norm": 0.00489602517336607, "learning_rate": 3.3361845016530566e-07, "loss": 0.05, "num_input_tokens_seen": 104689872, "step": 155330 }, { "epoch": 3.7948598929958712, "grad_norm": 0.029066437855362892, "learning_rate": 3.3355486867983573e-07, "loss": 0.0, "num_input_tokens_seen": 104693520, "step": 155335 }, { "epoch": 3.7949820438277184, "grad_norm": 0.038426849991083145, "learning_rate": 3.334912920409345e-07, "loss": 0.0, "num_input_tokens_seen": 104697104, "step": 155340 }, { "epoch": 3.7951041946595656, "grad_norm": 0.0005690338439308107, "learning_rate": 3.334277202490635e-07, "loss": 0.0002, "num_input_tokens_seen": 104700432, "step": 155345 }, { "epoch": 3.795226345491413, "grad_norm": 0.0005368631100282073, "learning_rate": 3.333641533046857e-07, "loss": 0.0706, "num_input_tokens_seen": 104703888, "step": 155350 }, { "epoch": 3.79534849632326, "grad_norm": 0.09150570631027222, "learning_rate": 3.333005912082628e-07, "loss": 0.0, "num_input_tokens_seen": 104707024, "step": 155355 }, { "epoch": 3.795470647155107, "grad_norm": 0.005836548749357462, "learning_rate": 3.332370339602576e-07, "loss": 0.0002, "num_input_tokens_seen": 104710736, "step": 155360 }, { "epoch": 3.7955927979869544, "grad_norm": 0.010113383643329144, "learning_rate": 3.331734815611318e-07, "loss": 0.0, "num_input_tokens_seen": 104713744, "step": 155365 }, { "epoch": 3.7957149488188016, "grad_norm": 0.6457812190055847, "learning_rate": 3.3310993401134767e-07, "loss": 0.0002, "num_input_tokens_seen": 104717200, "step": 155370 }, { "epoch": 3.7958370996506487, "grad_norm": 0.0009153983555734158, "learning_rate": 3.330463913113679e-07, "loss": 0.0001, "num_input_tokens_seen": 104720656, "step": 155375 }, { "epoch": 3.795959250482496, "grad_norm": 0.0020990949124097824, "learning_rate": 3.329828534616538e-07, "loss": 0.0, "num_input_tokens_seen": 104723984, "step": 155380 }, { "epoch": 3.7960814013143427, "grad_norm": 0.00040871353121474385, "learning_rate": 3.3291932046266804e-07, "loss": 0.0, "num_input_tokens_seen": 104726800, "step": 155385 }, { "epoch": 3.7962035521461903, "grad_norm": 0.000676620053127408, "learning_rate": 3.328557923148722e-07, "loss": 0.0, "num_input_tokens_seen": 104730128, "step": 155390 }, { "epoch": 3.796325702978037, "grad_norm": 0.00033176448778249323, "learning_rate": 3.327922690187287e-07, "loss": 0.0418, "num_input_tokens_seen": 104733392, "step": 155395 }, { "epoch": 3.7964478538098847, "grad_norm": 0.004591756500303745, "learning_rate": 3.327287505746993e-07, "loss": 0.0, "num_input_tokens_seen": 104736656, "step": 155400 }, { "epoch": 3.7965700046417314, "grad_norm": 0.026520751416683197, "learning_rate": 3.3266523698324564e-07, "loss": 0.0001, "num_input_tokens_seen": 104739984, "step": 155405 }, { "epoch": 3.7966921554735786, "grad_norm": 0.0005124854505993426, "learning_rate": 3.3260172824483013e-07, "loss": 0.0004, "num_input_tokens_seen": 104743504, "step": 155410 }, { "epoch": 3.796814306305426, "grad_norm": 0.006375231314450502, "learning_rate": 3.325382243599141e-07, "loss": 0.0001, "num_input_tokens_seen": 104746576, "step": 155415 }, { "epoch": 3.796936457137273, "grad_norm": 0.001261277706362307, "learning_rate": 3.324747253289599e-07, "loss": 0.0, "num_input_tokens_seen": 104749648, "step": 155420 }, { "epoch": 3.79705860796912, "grad_norm": 0.001110946643166244, "learning_rate": 3.3241123115242873e-07, "loss": 0.0, "num_input_tokens_seen": 104753168, "step": 155425 }, { "epoch": 3.7971807588009674, "grad_norm": 40.781349182128906, "learning_rate": 3.323477418307826e-07, "loss": 0.0296, "num_input_tokens_seen": 104756688, "step": 155430 }, { "epoch": 3.7973029096328146, "grad_norm": 0.00038077402859926224, "learning_rate": 3.322842573644837e-07, "loss": 0.0027, "num_input_tokens_seen": 104760400, "step": 155435 }, { "epoch": 3.7974250604646618, "grad_norm": 0.00400801794603467, "learning_rate": 3.3222077775399295e-07, "loss": 0.0256, "num_input_tokens_seen": 104764176, "step": 155440 }, { "epoch": 3.797547211296509, "grad_norm": 0.00038191594649106264, "learning_rate": 3.321573029997725e-07, "loss": 0.0, "num_input_tokens_seen": 104767696, "step": 155445 }, { "epoch": 3.797669362128356, "grad_norm": 0.0004798115696758032, "learning_rate": 3.3209383310228355e-07, "loss": 0.0, "num_input_tokens_seen": 104771088, "step": 155450 }, { "epoch": 3.7977915129602033, "grad_norm": 0.0025597454514354467, "learning_rate": 3.3203036806198783e-07, "loss": 0.0, "num_input_tokens_seen": 104774160, "step": 155455 }, { "epoch": 3.7979136637920505, "grad_norm": 0.005515729542821646, "learning_rate": 3.3196690787934734e-07, "loss": 0.0, "num_input_tokens_seen": 104777424, "step": 155460 }, { "epoch": 3.7980358146238977, "grad_norm": 0.036175571382045746, "learning_rate": 3.3190345255482276e-07, "loss": 0.0, "num_input_tokens_seen": 104780752, "step": 155465 }, { "epoch": 3.798157965455745, "grad_norm": 0.0009230665164068341, "learning_rate": 3.318400020888764e-07, "loss": 0.0, "num_input_tokens_seen": 104784208, "step": 155470 }, { "epoch": 3.798280116287592, "grad_norm": 22.15227699279785, "learning_rate": 3.317765564819689e-07, "loss": 0.0343, "num_input_tokens_seen": 104787728, "step": 155475 }, { "epoch": 3.798402267119439, "grad_norm": 0.0009698776993900537, "learning_rate": 3.317131157345623e-07, "loss": 0.0, "num_input_tokens_seen": 104791248, "step": 155480 }, { "epoch": 3.7985244179512865, "grad_norm": 0.011073552072048187, "learning_rate": 3.316496798471173e-07, "loss": 0.0299, "num_input_tokens_seen": 104794768, "step": 155485 }, { "epoch": 3.7986465687831332, "grad_norm": 0.0010699655395001173, "learning_rate": 3.3158624882009567e-07, "loss": 0.0, "num_input_tokens_seen": 104798224, "step": 155490 }, { "epoch": 3.7987687196149804, "grad_norm": 0.002896029269322753, "learning_rate": 3.3152282265395895e-07, "loss": 0.0, "num_input_tokens_seen": 104802000, "step": 155495 }, { "epoch": 3.7988908704468276, "grad_norm": 0.0017450011800974607, "learning_rate": 3.314594013491681e-07, "loss": 0.0, "num_input_tokens_seen": 104805136, "step": 155500 }, { "epoch": 3.799013021278675, "grad_norm": 0.002626369008794427, "learning_rate": 3.313959849061838e-07, "loss": 0.0006, "num_input_tokens_seen": 104808336, "step": 155505 }, { "epoch": 3.799135172110522, "grad_norm": 0.002531877951696515, "learning_rate": 3.313325733254682e-07, "loss": 0.0, "num_input_tokens_seen": 104811472, "step": 155510 }, { "epoch": 3.799257322942369, "grad_norm": 0.0009253830648958683, "learning_rate": 3.3126916660748194e-07, "loss": 0.0, "num_input_tokens_seen": 104814928, "step": 155515 }, { "epoch": 3.7993794737742164, "grad_norm": 0.0009583283099345863, "learning_rate": 3.312057647526858e-07, "loss": 0.0, "num_input_tokens_seen": 104818000, "step": 155520 }, { "epoch": 3.7995016246060636, "grad_norm": 0.0002628966176416725, "learning_rate": 3.311423677615414e-07, "loss": 0.0, "num_input_tokens_seen": 104821072, "step": 155525 }, { "epoch": 3.7996237754379107, "grad_norm": 0.0020152684301137924, "learning_rate": 3.310789756345097e-07, "loss": 0.0444, "num_input_tokens_seen": 104824336, "step": 155530 }, { "epoch": 3.799745926269758, "grad_norm": 0.0011446630815044045, "learning_rate": 3.310155883720513e-07, "loss": 0.0, "num_input_tokens_seen": 104827472, "step": 155535 }, { "epoch": 3.799868077101605, "grad_norm": 0.0017714033601805568, "learning_rate": 3.309522059746279e-07, "loss": 0.0, "num_input_tokens_seen": 104830672, "step": 155540 }, { "epoch": 3.7999902279334523, "grad_norm": 0.0008015253115445375, "learning_rate": 3.308888284426997e-07, "loss": 0.0, "num_input_tokens_seen": 104834128, "step": 155545 }, { "epoch": 3.8001123787652995, "grad_norm": 0.022300535812973976, "learning_rate": 3.308254557767279e-07, "loss": 0.0, "num_input_tokens_seen": 104837584, "step": 155550 }, { "epoch": 3.8002345295971467, "grad_norm": 0.00024351937463507056, "learning_rate": 3.3076208797717366e-07, "loss": 0.0, "num_input_tokens_seen": 104841232, "step": 155555 }, { "epoch": 3.800356680428994, "grad_norm": 68.36043548583984, "learning_rate": 3.3069872504449723e-07, "loss": 0.0213, "num_input_tokens_seen": 104844752, "step": 155560 }, { "epoch": 3.8004788312608406, "grad_norm": 0.00022004992933943868, "learning_rate": 3.3063536697915995e-07, "loss": 0.0, "num_input_tokens_seen": 104848208, "step": 155565 }, { "epoch": 3.8006009820926883, "grad_norm": 0.0009000792051665485, "learning_rate": 3.30572013781622e-07, "loss": 0.0, "num_input_tokens_seen": 104851792, "step": 155570 }, { "epoch": 3.800723132924535, "grad_norm": 0.012199978344142437, "learning_rate": 3.305086654523449e-07, "loss": 0.0001, "num_input_tokens_seen": 104855120, "step": 155575 }, { "epoch": 3.8008452837563826, "grad_norm": 0.0024338264483958483, "learning_rate": 3.304453219917883e-07, "loss": 0.1219, "num_input_tokens_seen": 104858192, "step": 155580 }, { "epoch": 3.8009674345882294, "grad_norm": 0.00391667103394866, "learning_rate": 3.3038198340041356e-07, "loss": 0.0293, "num_input_tokens_seen": 104862352, "step": 155585 }, { "epoch": 3.8010895854200766, "grad_norm": 0.003353690728545189, "learning_rate": 3.3031864967868153e-07, "loss": 0.0, "num_input_tokens_seen": 104865168, "step": 155590 }, { "epoch": 3.8012117362519238, "grad_norm": 1647.1788330078125, "learning_rate": 3.302553208270519e-07, "loss": 0.016, "num_input_tokens_seen": 104869008, "step": 155595 }, { "epoch": 3.801333887083771, "grad_norm": 0.020366232842206955, "learning_rate": 3.301919968459861e-07, "loss": 0.0, "num_input_tokens_seen": 104872144, "step": 155600 }, { "epoch": 3.801456037915618, "grad_norm": 0.007908672094345093, "learning_rate": 3.3012867773594434e-07, "loss": 0.0, "num_input_tokens_seen": 104875280, "step": 155605 }, { "epoch": 3.8015781887474653, "grad_norm": 0.010846472345292568, "learning_rate": 3.3006536349738654e-07, "loss": 0.0, "num_input_tokens_seen": 104878864, "step": 155610 }, { "epoch": 3.8017003395793125, "grad_norm": 0.00580504909157753, "learning_rate": 3.300020541307741e-07, "loss": 0.0001, "num_input_tokens_seen": 104881680, "step": 155615 }, { "epoch": 3.8018224904111597, "grad_norm": 0.005202096421271563, "learning_rate": 3.2993874963656645e-07, "loss": 0.0001, "num_input_tokens_seen": 104884496, "step": 155620 }, { "epoch": 3.801944641243007, "grad_norm": 0.005184181034564972, "learning_rate": 3.298754500152249e-07, "loss": 0.0001, "num_input_tokens_seen": 104888016, "step": 155625 }, { "epoch": 3.802066792074854, "grad_norm": 0.011404477059841156, "learning_rate": 3.298121552672088e-07, "loss": 0.0, "num_input_tokens_seen": 104891216, "step": 155630 }, { "epoch": 3.8021889429067013, "grad_norm": 0.00029271142557263374, "learning_rate": 3.297488653929794e-07, "loss": 0.0, "num_input_tokens_seen": 104895184, "step": 155635 }, { "epoch": 3.8023110937385485, "grad_norm": 0.0004036023165099323, "learning_rate": 3.2968558039299633e-07, "loss": 0.0, "num_input_tokens_seen": 104899152, "step": 155640 }, { "epoch": 3.8024332445703957, "grad_norm": 20.258140563964844, "learning_rate": 3.296223002677199e-07, "loss": 0.0869, "num_input_tokens_seen": 104902416, "step": 155645 }, { "epoch": 3.802555395402243, "grad_norm": 0.0014357427135109901, "learning_rate": 3.2955902501761067e-07, "loss": 0.0001, "num_input_tokens_seen": 104905808, "step": 155650 }, { "epoch": 3.80267754623409, "grad_norm": 0.004626494366675615, "learning_rate": 3.294957546431283e-07, "loss": 0.0, "num_input_tokens_seen": 104908752, "step": 155655 }, { "epoch": 3.802799697065937, "grad_norm": 37.42518997192383, "learning_rate": 3.294324891447334e-07, "loss": 0.0559, "num_input_tokens_seen": 104912208, "step": 155660 }, { "epoch": 3.8029218478977844, "grad_norm": 0.0018757034558802843, "learning_rate": 3.293692285228855e-07, "loss": 0.0, "num_input_tokens_seen": 104915920, "step": 155665 }, { "epoch": 3.803043998729631, "grad_norm": 0.004554565064609051, "learning_rate": 3.2930597277804537e-07, "loss": 0.0, "num_input_tokens_seen": 104919056, "step": 155670 }, { "epoch": 3.8031661495614784, "grad_norm": 0.0016938750632107258, "learning_rate": 3.2924272191067215e-07, "loss": 0.0, "num_input_tokens_seen": 104922384, "step": 155675 }, { "epoch": 3.8032883003933255, "grad_norm": 0.00745719950646162, "learning_rate": 3.291794759212263e-07, "loss": 0.0336, "num_input_tokens_seen": 104925840, "step": 155680 }, { "epoch": 3.8034104512251727, "grad_norm": 0.001887062331661582, "learning_rate": 3.2911623481016814e-07, "loss": 0.0606, "num_input_tokens_seen": 104929168, "step": 155685 }, { "epoch": 3.80353260205702, "grad_norm": 0.047493480145931244, "learning_rate": 3.2905299857795675e-07, "loss": 0.0, "num_input_tokens_seen": 104932752, "step": 155690 }, { "epoch": 3.803654752888867, "grad_norm": 0.006993389688432217, "learning_rate": 3.289897672250528e-07, "loss": 0.0332, "num_input_tokens_seen": 104935888, "step": 155695 }, { "epoch": 3.8037769037207143, "grad_norm": 0.0030944752506911755, "learning_rate": 3.289265407519154e-07, "loss": 0.074, "num_input_tokens_seen": 104939216, "step": 155700 }, { "epoch": 3.8038990545525615, "grad_norm": 0.14342844486236572, "learning_rate": 3.28863319159005e-07, "loss": 0.0001, "num_input_tokens_seen": 104942800, "step": 155705 }, { "epoch": 3.8040212053844087, "grad_norm": 0.007254133466631174, "learning_rate": 3.28800102446781e-07, "loss": 0.0, "num_input_tokens_seen": 104946064, "step": 155710 }, { "epoch": 3.804143356216256, "grad_norm": 0.0028453157283365726, "learning_rate": 3.2873689061570297e-07, "loss": 0.0, "num_input_tokens_seen": 104949776, "step": 155715 }, { "epoch": 3.804265507048103, "grad_norm": 0.0009071178501471877, "learning_rate": 3.286736836662311e-07, "loss": 0.0064, "num_input_tokens_seen": 104953104, "step": 155720 }, { "epoch": 3.8043876578799503, "grad_norm": 0.003251630812883377, "learning_rate": 3.286104815988244e-07, "loss": 0.0001, "num_input_tokens_seen": 104956240, "step": 155725 }, { "epoch": 3.8045098087117974, "grad_norm": 0.012875410728156567, "learning_rate": 3.285472844139432e-07, "loss": 0.0001, "num_input_tokens_seen": 104959248, "step": 155730 }, { "epoch": 3.8046319595436446, "grad_norm": 0.004795216489583254, "learning_rate": 3.2848409211204653e-07, "loss": 0.0, "num_input_tokens_seen": 104962256, "step": 155735 }, { "epoch": 3.804754110375492, "grad_norm": 0.0023328224197030067, "learning_rate": 3.2842090469359406e-07, "loss": 0.0001, "num_input_tokens_seen": 104965328, "step": 155740 }, { "epoch": 3.8048762612073386, "grad_norm": 0.007147334516048431, "learning_rate": 3.283577221590457e-07, "loss": 0.0002, "num_input_tokens_seen": 104968592, "step": 155745 }, { "epoch": 3.804998412039186, "grad_norm": 0.001518872333690524, "learning_rate": 3.282945445088604e-07, "loss": 0.0, "num_input_tokens_seen": 104972368, "step": 155750 }, { "epoch": 3.805120562871033, "grad_norm": 0.006218430120497942, "learning_rate": 3.28231371743498e-07, "loss": 0.0, "num_input_tokens_seen": 104975696, "step": 155755 }, { "epoch": 3.8052427137028806, "grad_norm": 0.2355884313583374, "learning_rate": 3.281682038634176e-07, "loss": 0.0002, "num_input_tokens_seen": 104979792, "step": 155760 }, { "epoch": 3.8053648645347273, "grad_norm": 0.0006400325219146907, "learning_rate": 3.28105040869079e-07, "loss": 0.0, "num_input_tokens_seen": 104983376, "step": 155765 }, { "epoch": 3.8054870153665745, "grad_norm": 0.0002564819878898561, "learning_rate": 3.280418827609409e-07, "loss": 0.0415, "num_input_tokens_seen": 104986768, "step": 155770 }, { "epoch": 3.8056091661984217, "grad_norm": 0.004700619261711836, "learning_rate": 3.2797872953946305e-07, "loss": 0.0, "num_input_tokens_seen": 104990416, "step": 155775 }, { "epoch": 3.805731317030269, "grad_norm": 0.0008923843270167708, "learning_rate": 3.279155812051049e-07, "loss": 0.0, "num_input_tokens_seen": 104994064, "step": 155780 }, { "epoch": 3.805853467862116, "grad_norm": 0.001144541660323739, "learning_rate": 3.2785243775832505e-07, "loss": 0.0001, "num_input_tokens_seen": 104997520, "step": 155785 }, { "epoch": 3.8059756186939633, "grad_norm": 0.00030647515086457133, "learning_rate": 3.277892991995834e-07, "loss": 0.0, "num_input_tokens_seen": 105000720, "step": 155790 }, { "epoch": 3.8060977695258105, "grad_norm": 0.010796795599162579, "learning_rate": 3.277261655293384e-07, "loss": 0.0, "num_input_tokens_seen": 105004432, "step": 155795 }, { "epoch": 3.8062199203576577, "grad_norm": 0.19322262704372406, "learning_rate": 3.2766303674804964e-07, "loss": 0.0001, "num_input_tokens_seen": 105007824, "step": 155800 }, { "epoch": 3.806342071189505, "grad_norm": 18.42563247680664, "learning_rate": 3.275999128561764e-07, "loss": 0.0619, "num_input_tokens_seen": 105010832, "step": 155805 }, { "epoch": 3.806464222021352, "grad_norm": 21.573936462402344, "learning_rate": 3.2753679385417745e-07, "loss": 0.0279, "num_input_tokens_seen": 105014160, "step": 155810 }, { "epoch": 3.8065863728531992, "grad_norm": 0.001498056692071259, "learning_rate": 3.274736797425115e-07, "loss": 0.0, "num_input_tokens_seen": 105017808, "step": 155815 }, { "epoch": 3.8067085236850464, "grad_norm": 20.64427947998047, "learning_rate": 3.2741057052163814e-07, "loss": 0.0546, "num_input_tokens_seen": 105021392, "step": 155820 }, { "epoch": 3.8068306745168936, "grad_norm": 0.01792875863611698, "learning_rate": 3.27347466192016e-07, "loss": 0.0, "num_input_tokens_seen": 105024848, "step": 155825 }, { "epoch": 3.8069528253487404, "grad_norm": 0.0008755176095291972, "learning_rate": 3.2728436675410376e-07, "loss": 0.0, "num_input_tokens_seen": 105027856, "step": 155830 }, { "epoch": 3.807074976180588, "grad_norm": 0.10433609038591385, "learning_rate": 3.2722127220836047e-07, "loss": 0.0, "num_input_tokens_seen": 105031120, "step": 155835 }, { "epoch": 3.8071971270124347, "grad_norm": 0.00405006343498826, "learning_rate": 3.271581825552454e-07, "loss": 0.0012, "num_input_tokens_seen": 105034320, "step": 155840 }, { "epoch": 3.8073192778442824, "grad_norm": 0.004479088354855776, "learning_rate": 3.270950977952166e-07, "loss": 0.0, "num_input_tokens_seen": 105037328, "step": 155845 }, { "epoch": 3.807441428676129, "grad_norm": 0.025686856359243393, "learning_rate": 3.270320179287337e-07, "loss": 0.0, "num_input_tokens_seen": 105040592, "step": 155850 }, { "epoch": 3.8075635795079763, "grad_norm": 0.028088264167308807, "learning_rate": 3.2696894295625456e-07, "loss": 0.0353, "num_input_tokens_seen": 105043856, "step": 155855 }, { "epoch": 3.8076857303398235, "grad_norm": 0.0788717269897461, "learning_rate": 3.2690587287823824e-07, "loss": 0.0005, "num_input_tokens_seen": 105047376, "step": 155860 }, { "epoch": 3.8078078811716707, "grad_norm": 0.31897079944610596, "learning_rate": 3.2684280769514384e-07, "loss": 0.0003, "num_input_tokens_seen": 105050832, "step": 155865 }, { "epoch": 3.807930032003518, "grad_norm": 0.003613928332924843, "learning_rate": 3.267797474074293e-07, "loss": 0.0467, "num_input_tokens_seen": 105054032, "step": 155870 }, { "epoch": 3.808052182835365, "grad_norm": 0.008457034826278687, "learning_rate": 3.267166920155537e-07, "loss": 0.0, "num_input_tokens_seen": 105057616, "step": 155875 }, { "epoch": 3.8081743336672123, "grad_norm": 21.948022842407227, "learning_rate": 3.266536415199753e-07, "loss": 0.0264, "num_input_tokens_seen": 105060880, "step": 155880 }, { "epoch": 3.8082964844990594, "grad_norm": 0.563244640827179, "learning_rate": 3.265905959211529e-07, "loss": 0.0001, "num_input_tokens_seen": 105064208, "step": 155885 }, { "epoch": 3.8084186353309066, "grad_norm": 0.6338911652565002, "learning_rate": 3.2652755521954456e-07, "loss": 0.1409, "num_input_tokens_seen": 105067280, "step": 155890 }, { "epoch": 3.808540786162754, "grad_norm": 0.007176944054663181, "learning_rate": 3.2646451941560895e-07, "loss": 0.0, "num_input_tokens_seen": 105070864, "step": 155895 }, { "epoch": 3.808662936994601, "grad_norm": 0.03897714242339134, "learning_rate": 3.264014885098049e-07, "loss": 0.0001, "num_input_tokens_seen": 105074320, "step": 155900 }, { "epoch": 3.808785087826448, "grad_norm": 0.021542305126786232, "learning_rate": 3.2633846250259e-07, "loss": 0.0, "num_input_tokens_seen": 105077584, "step": 155905 }, { "epoch": 3.8089072386582954, "grad_norm": 0.0006053220713511109, "learning_rate": 3.262754413944233e-07, "loss": 0.0, "num_input_tokens_seen": 105080848, "step": 155910 }, { "epoch": 3.8090293894901426, "grad_norm": 0.0019855385180562735, "learning_rate": 3.2621242518576286e-07, "loss": 0.0, "num_input_tokens_seen": 105084176, "step": 155915 }, { "epoch": 3.8091515403219898, "grad_norm": 0.01248567271977663, "learning_rate": 3.261494138770665e-07, "loss": 0.0, "num_input_tokens_seen": 105087376, "step": 155920 }, { "epoch": 3.8092736911538365, "grad_norm": 0.001591844018548727, "learning_rate": 3.260864074687932e-07, "loss": 0.0002, "num_input_tokens_seen": 105090640, "step": 155925 }, { "epoch": 3.809395841985684, "grad_norm": 0.00416983338072896, "learning_rate": 3.260234059614005e-07, "loss": 0.0, "num_input_tokens_seen": 105094288, "step": 155930 }, { "epoch": 3.809517992817531, "grad_norm": 0.0006153019494377077, "learning_rate": 3.259604093553472e-07, "loss": 0.06, "num_input_tokens_seen": 105097360, "step": 155935 }, { "epoch": 3.8096401436493785, "grad_norm": 0.00553601048886776, "learning_rate": 3.258974176510908e-07, "loss": 0.0, "num_input_tokens_seen": 105101392, "step": 155940 }, { "epoch": 3.8097622944812253, "grad_norm": 88.26294708251953, "learning_rate": 3.258344308490899e-07, "loss": 0.0563, "num_input_tokens_seen": 105104848, "step": 155945 }, { "epoch": 3.8098844453130725, "grad_norm": 58.035316467285156, "learning_rate": 3.2577144894980213e-07, "loss": 0.0378, "num_input_tokens_seen": 105108048, "step": 155950 }, { "epoch": 3.8100065961449197, "grad_norm": 0.007737881503999233, "learning_rate": 3.2570847195368565e-07, "loss": 0.0, "num_input_tokens_seen": 105110928, "step": 155955 }, { "epoch": 3.810128746976767, "grad_norm": 0.03556019812822342, "learning_rate": 3.256454998611989e-07, "loss": 0.0, "num_input_tokens_seen": 105114640, "step": 155960 }, { "epoch": 3.810250897808614, "grad_norm": 0.0022949352860450745, "learning_rate": 3.2558253267279923e-07, "loss": 0.0614, "num_input_tokens_seen": 105118224, "step": 155965 }, { "epoch": 3.8103730486404612, "grad_norm": 0.011481580324470997, "learning_rate": 3.25519570388945e-07, "loss": 0.0002, "num_input_tokens_seen": 105121424, "step": 155970 }, { "epoch": 3.8104951994723084, "grad_norm": 0.0012904921313747764, "learning_rate": 3.254566130100935e-07, "loss": 0.0, "num_input_tokens_seen": 105125200, "step": 155975 }, { "epoch": 3.8106173503041556, "grad_norm": 0.004640914965420961, "learning_rate": 3.253936605367034e-07, "loss": 0.0, "num_input_tokens_seen": 105128272, "step": 155980 }, { "epoch": 3.810739501136003, "grad_norm": 1.151449203491211, "learning_rate": 3.2533071296923154e-07, "loss": 0.0003, "num_input_tokens_seen": 105131408, "step": 155985 }, { "epoch": 3.81086165196785, "grad_norm": 0.008894668892025948, "learning_rate": 3.2526777030813636e-07, "loss": 0.0, "num_input_tokens_seen": 105134672, "step": 155990 }, { "epoch": 3.810983802799697, "grad_norm": 0.0030117423739284277, "learning_rate": 3.2520483255387567e-07, "loss": 0.0569, "num_input_tokens_seen": 105138000, "step": 155995 }, { "epoch": 3.8111059536315444, "grad_norm": 0.0029887051787227392, "learning_rate": 3.2514189970690666e-07, "loss": 0.0, "num_input_tokens_seen": 105141456, "step": 156000 }, { "epoch": 3.8112281044633916, "grad_norm": 0.002914925804361701, "learning_rate": 3.2507897176768753e-07, "loss": 0.0, "num_input_tokens_seen": 105144912, "step": 156005 }, { "epoch": 3.8113502552952383, "grad_norm": 0.006630240008234978, "learning_rate": 3.250160487366753e-07, "loss": 0.0001, "num_input_tokens_seen": 105148240, "step": 156010 }, { "epoch": 3.811472406127086, "grad_norm": 0.0067259520292282104, "learning_rate": 3.2495313061432836e-07, "loss": 0.0, "num_input_tokens_seen": 105151184, "step": 156015 }, { "epoch": 3.8115945569589327, "grad_norm": 0.0004644088912755251, "learning_rate": 3.2489021740110344e-07, "loss": 0.0, "num_input_tokens_seen": 105154640, "step": 156020 }, { "epoch": 3.8117167077907803, "grad_norm": 0.010894610546529293, "learning_rate": 3.2482730909745883e-07, "loss": 0.0, "num_input_tokens_seen": 105157968, "step": 156025 }, { "epoch": 3.811838858622627, "grad_norm": 0.384575754404068, "learning_rate": 3.2476440570385155e-07, "loss": 0.0001, "num_input_tokens_seen": 105161168, "step": 156030 }, { "epoch": 3.8119610094544742, "grad_norm": 0.0025056188460439444, "learning_rate": 3.2470150722073875e-07, "loss": 0.0007, "num_input_tokens_seen": 105164432, "step": 156035 }, { "epoch": 3.8120831602863214, "grad_norm": 0.006241832859814167, "learning_rate": 3.246386136485786e-07, "loss": 0.0, "num_input_tokens_seen": 105167888, "step": 156040 }, { "epoch": 3.8122053111181686, "grad_norm": 0.002331847557798028, "learning_rate": 3.2457572498782783e-07, "loss": 0.0, "num_input_tokens_seen": 105171472, "step": 156045 }, { "epoch": 3.812327461950016, "grad_norm": 0.003435875289142132, "learning_rate": 3.2451284123894394e-07, "loss": 0.0, "num_input_tokens_seen": 105174800, "step": 156050 }, { "epoch": 3.812449612781863, "grad_norm": 0.00930994562804699, "learning_rate": 3.2444996240238474e-07, "loss": 0.0, "num_input_tokens_seen": 105178128, "step": 156055 }, { "epoch": 3.81257176361371, "grad_norm": 0.0016094446182250977, "learning_rate": 3.2438708847860684e-07, "loss": 0.0001, "num_input_tokens_seen": 105181904, "step": 156060 }, { "epoch": 3.8126939144455574, "grad_norm": 0.001043045660480857, "learning_rate": 3.243242194680681e-07, "loss": 0.0879, "num_input_tokens_seen": 105185744, "step": 156065 }, { "epoch": 3.8128160652774046, "grad_norm": 0.021015044301748276, "learning_rate": 3.242613553712249e-07, "loss": 0.0, "num_input_tokens_seen": 105189392, "step": 156070 }, { "epoch": 3.8129382161092518, "grad_norm": 0.004099312704056501, "learning_rate": 3.2419849618853535e-07, "loss": 0.0671, "num_input_tokens_seen": 105192720, "step": 156075 }, { "epoch": 3.813060366941099, "grad_norm": 0.001958550186827779, "learning_rate": 3.2413564192045573e-07, "loss": 0.0, "num_input_tokens_seen": 105195792, "step": 156080 }, { "epoch": 3.813182517772946, "grad_norm": 0.09461662918329239, "learning_rate": 3.2407279256744344e-07, "loss": 0.0302, "num_input_tokens_seen": 105198864, "step": 156085 }, { "epoch": 3.8133046686047933, "grad_norm": 0.004242750816047192, "learning_rate": 3.2400994812995595e-07, "loss": 0.0001, "num_input_tokens_seen": 105202448, "step": 156090 }, { "epoch": 3.8134268194366405, "grad_norm": 0.00951340515166521, "learning_rate": 3.2394710860844963e-07, "loss": 0.0, "num_input_tokens_seen": 105205840, "step": 156095 }, { "epoch": 3.8135489702684877, "grad_norm": 16.03184700012207, "learning_rate": 3.2388427400338215e-07, "loss": 0.0173, "num_input_tokens_seen": 105209360, "step": 156100 }, { "epoch": 3.8136711211003345, "grad_norm": 0.0026069078594446182, "learning_rate": 3.2382144431520975e-07, "loss": 0.0, "num_input_tokens_seen": 105213008, "step": 156105 }, { "epoch": 3.813793271932182, "grad_norm": 0.0012540466850623488, "learning_rate": 3.2375861954438967e-07, "loss": 0.0359, "num_input_tokens_seen": 105216592, "step": 156110 }, { "epoch": 3.813915422764029, "grad_norm": 0.006343018263578415, "learning_rate": 3.236957996913791e-07, "loss": 0.0, "num_input_tokens_seen": 105220176, "step": 156115 }, { "epoch": 3.814037573595876, "grad_norm": 0.6488041877746582, "learning_rate": 3.236329847566346e-07, "loss": 0.0003, "num_input_tokens_seen": 105223824, "step": 156120 }, { "epoch": 3.814159724427723, "grad_norm": 0.011054744012653828, "learning_rate": 3.2357017474061255e-07, "loss": 0.0, "num_input_tokens_seen": 105227024, "step": 156125 }, { "epoch": 3.8142818752595704, "grad_norm": 0.008859804831445217, "learning_rate": 3.2350736964377045e-07, "loss": 0.0001, "num_input_tokens_seen": 105230224, "step": 156130 }, { "epoch": 3.8144040260914176, "grad_norm": 0.0750259980559349, "learning_rate": 3.2344456946656485e-07, "loss": 0.0513, "num_input_tokens_seen": 105233616, "step": 156135 }, { "epoch": 3.814526176923265, "grad_norm": 0.02294854074716568, "learning_rate": 3.233817742094519e-07, "loss": 0.0, "num_input_tokens_seen": 105236880, "step": 156140 }, { "epoch": 3.814648327755112, "grad_norm": 0.0033728990238159895, "learning_rate": 3.233189838728887e-07, "loss": 0.0001, "num_input_tokens_seen": 105239952, "step": 156145 }, { "epoch": 3.814770478586959, "grad_norm": 0.012777571566402912, "learning_rate": 3.232561984573321e-07, "loss": 0.0, "num_input_tokens_seen": 105243216, "step": 156150 }, { "epoch": 3.8148926294188064, "grad_norm": 0.09926612675189972, "learning_rate": 3.2319341796323817e-07, "loss": 0.0002, "num_input_tokens_seen": 105246416, "step": 156155 }, { "epoch": 3.8150147802506535, "grad_norm": 0.002459646435454488, "learning_rate": 3.231306423910641e-07, "loss": 0.0, "num_input_tokens_seen": 105249680, "step": 156160 }, { "epoch": 3.8151369310825007, "grad_norm": 0.0014912053011357784, "learning_rate": 3.230678717412657e-07, "loss": 0.0314, "num_input_tokens_seen": 105253008, "step": 156165 }, { "epoch": 3.815259081914348, "grad_norm": 0.007945233955979347, "learning_rate": 3.230051060142999e-07, "loss": 0.0001, "num_input_tokens_seen": 105255888, "step": 156170 }, { "epoch": 3.815381232746195, "grad_norm": 0.14787600934505463, "learning_rate": 3.2294234521062325e-07, "loss": 0.0001, "num_input_tokens_seen": 105259408, "step": 156175 }, { "epoch": 3.8155033835780423, "grad_norm": 0.015462593175470829, "learning_rate": 3.228795893306917e-07, "loss": 0.0, "num_input_tokens_seen": 105262352, "step": 156180 }, { "epoch": 3.8156255344098895, "grad_norm": 0.0005006550345569849, "learning_rate": 3.228168383749622e-07, "loss": 0.0001, "num_input_tokens_seen": 105265680, "step": 156185 }, { "epoch": 3.8157476852417362, "grad_norm": 0.3698059916496277, "learning_rate": 3.2275409234389053e-07, "loss": 0.0323, "num_input_tokens_seen": 105269008, "step": 156190 }, { "epoch": 3.815869836073584, "grad_norm": 0.020955270156264305, "learning_rate": 3.226913512379336e-07, "loss": 0.0, "num_input_tokens_seen": 105272080, "step": 156195 }, { "epoch": 3.8159919869054306, "grad_norm": 0.00198382674716413, "learning_rate": 3.226286150575469e-07, "loss": 0.0, "num_input_tokens_seen": 105275216, "step": 156200 }, { "epoch": 3.8161141377372783, "grad_norm": 0.008968944661319256, "learning_rate": 3.225658838031872e-07, "loss": 0.0, "num_input_tokens_seen": 105278672, "step": 156205 }, { "epoch": 3.816236288569125, "grad_norm": 0.0022881953045725822, "learning_rate": 3.225031574753109e-07, "loss": 0.0, "num_input_tokens_seen": 105281744, "step": 156210 }, { "epoch": 3.816358439400972, "grad_norm": 0.011253178119659424, "learning_rate": 3.2244043607437353e-07, "loss": 0.0, "num_input_tokens_seen": 105284880, "step": 156215 }, { "epoch": 3.8164805902328194, "grad_norm": 0.013502773828804493, "learning_rate": 3.223777196008318e-07, "loss": 0.0, "num_input_tokens_seen": 105287952, "step": 156220 }, { "epoch": 3.8166027410646666, "grad_norm": 26.68203353881836, "learning_rate": 3.2231500805514167e-07, "loss": 0.0277, "num_input_tokens_seen": 105292048, "step": 156225 }, { "epoch": 3.8167248918965138, "grad_norm": 0.027704458683729172, "learning_rate": 3.222523014377587e-07, "loss": 0.0, "num_input_tokens_seen": 105295376, "step": 156230 }, { "epoch": 3.816847042728361, "grad_norm": 0.011415623128414154, "learning_rate": 3.2218959974913963e-07, "loss": 0.0, "num_input_tokens_seen": 105298960, "step": 156235 }, { "epoch": 3.816969193560208, "grad_norm": 0.0035196489188820124, "learning_rate": 3.2212690298973976e-07, "loss": 0.0, "num_input_tokens_seen": 105302096, "step": 156240 }, { "epoch": 3.8170913443920553, "grad_norm": 0.0004433184221852571, "learning_rate": 3.220642111600157e-07, "loss": 0.0001, "num_input_tokens_seen": 105305104, "step": 156245 }, { "epoch": 3.8172134952239025, "grad_norm": 0.0012700185179710388, "learning_rate": 3.220015242604227e-07, "loss": 0.0, "num_input_tokens_seen": 105308176, "step": 156250 }, { "epoch": 3.8173356460557497, "grad_norm": 0.007200855761766434, "learning_rate": 3.219388422914173e-07, "loss": 0.0001, "num_input_tokens_seen": 105311376, "step": 156255 }, { "epoch": 3.817457796887597, "grad_norm": 0.004918563179671764, "learning_rate": 3.2187616525345474e-07, "loss": 0.0, "num_input_tokens_seen": 105315152, "step": 156260 }, { "epoch": 3.817579947719444, "grad_norm": 0.008288434706628323, "learning_rate": 3.2181349314699115e-07, "loss": 0.0, "num_input_tokens_seen": 105318544, "step": 156265 }, { "epoch": 3.8177020985512913, "grad_norm": 0.0018310670275241137, "learning_rate": 3.217508259724825e-07, "loss": 0.0, "num_input_tokens_seen": 105321680, "step": 156270 }, { "epoch": 3.817824249383138, "grad_norm": 0.017679639160633087, "learning_rate": 3.216881637303839e-07, "loss": 0.0002, "num_input_tokens_seen": 105324688, "step": 156275 }, { "epoch": 3.8179464002149857, "grad_norm": 0.00957119558006525, "learning_rate": 3.216255064211517e-07, "loss": 0.0007, "num_input_tokens_seen": 105328080, "step": 156280 }, { "epoch": 3.8180685510468324, "grad_norm": 0.0067603024654090405, "learning_rate": 3.215628540452411e-07, "loss": 0.0, "num_input_tokens_seen": 105331856, "step": 156285 }, { "epoch": 3.81819070187868, "grad_norm": 0.005869430955499411, "learning_rate": 3.215002066031082e-07, "loss": 0.0, "num_input_tokens_seen": 105335312, "step": 156290 }, { "epoch": 3.818312852710527, "grad_norm": 0.04536915570497513, "learning_rate": 3.2143756409520783e-07, "loss": 0.0834, "num_input_tokens_seen": 105338704, "step": 156295 }, { "epoch": 3.818435003542374, "grad_norm": 0.004794863983988762, "learning_rate": 3.213749265219962e-07, "loss": 0.0001, "num_input_tokens_seen": 105341968, "step": 156300 }, { "epoch": 3.818557154374221, "grad_norm": 0.009946302510797977, "learning_rate": 3.2131229388392877e-07, "loss": 0.0, "num_input_tokens_seen": 105345360, "step": 156305 }, { "epoch": 3.8186793052060684, "grad_norm": 0.016436627134680748, "learning_rate": 3.2124966618146066e-07, "loss": 0.0001, "num_input_tokens_seen": 105349264, "step": 156310 }, { "epoch": 3.8188014560379155, "grad_norm": 0.0038346415385603905, "learning_rate": 3.211870434150479e-07, "loss": 0.0, "num_input_tokens_seen": 105352528, "step": 156315 }, { "epoch": 3.8189236068697627, "grad_norm": 0.0010770164662972093, "learning_rate": 3.211244255851452e-07, "loss": 0.0, "num_input_tokens_seen": 105356112, "step": 156320 }, { "epoch": 3.81904575770161, "grad_norm": 0.01691940799355507, "learning_rate": 3.2106181269220856e-07, "loss": 0.0, "num_input_tokens_seen": 105359568, "step": 156325 }, { "epoch": 3.819167908533457, "grad_norm": 0.11155705899000168, "learning_rate": 3.209992047366927e-07, "loss": 0.0001, "num_input_tokens_seen": 105363152, "step": 156330 }, { "epoch": 3.8192900593653043, "grad_norm": 0.004868871998041868, "learning_rate": 3.209366017190536e-07, "loss": 0.0, "num_input_tokens_seen": 105366608, "step": 156335 }, { "epoch": 3.8194122101971515, "grad_norm": 0.0012334700440987945, "learning_rate": 3.2087400363974615e-07, "loss": 0.0001, "num_input_tokens_seen": 105370000, "step": 156340 }, { "epoch": 3.8195343610289987, "grad_norm": 0.002305307425558567, "learning_rate": 3.2081141049922534e-07, "loss": 0.0, "num_input_tokens_seen": 105373456, "step": 156345 }, { "epoch": 3.819656511860846, "grad_norm": 0.0002718440373428166, "learning_rate": 3.207488222979469e-07, "loss": 0.0, "num_input_tokens_seen": 105376912, "step": 156350 }, { "epoch": 3.819778662692693, "grad_norm": 0.0016530955908820033, "learning_rate": 3.2068623903636536e-07, "loss": 0.0, "num_input_tokens_seen": 105380368, "step": 156355 }, { "epoch": 3.8199008135245403, "grad_norm": 0.0009915755363181233, "learning_rate": 3.206236607149363e-07, "loss": 0.0, "num_input_tokens_seen": 105384144, "step": 156360 }, { "epoch": 3.8200229643563874, "grad_norm": 0.006974408403038979, "learning_rate": 3.2056108733411504e-07, "loss": 0.0, "num_input_tokens_seen": 105387152, "step": 156365 }, { "epoch": 3.820145115188234, "grad_norm": 0.0008302279748022556, "learning_rate": 3.2049851889435585e-07, "loss": 0.0, "num_input_tokens_seen": 105390608, "step": 156370 }, { "epoch": 3.820267266020082, "grad_norm": 0.0007241375278681517, "learning_rate": 3.2043595539611455e-07, "loss": 0.0, "num_input_tokens_seen": 105394256, "step": 156375 }, { "epoch": 3.8203894168519286, "grad_norm": 0.0036159860901534557, "learning_rate": 3.2037339683984554e-07, "loss": 0.0, "num_input_tokens_seen": 105397648, "step": 156380 }, { "epoch": 3.820511567683776, "grad_norm": 0.0054675801657140255, "learning_rate": 3.203108432260042e-07, "loss": 0.0479, "num_input_tokens_seen": 105400848, "step": 156385 }, { "epoch": 3.820633718515623, "grad_norm": 0.0014255117857828736, "learning_rate": 3.20248294555045e-07, "loss": 0.0, "num_input_tokens_seen": 105404688, "step": 156390 }, { "epoch": 3.82075586934747, "grad_norm": 0.0033376587089151144, "learning_rate": 3.201857508274231e-07, "loss": 0.0, "num_input_tokens_seen": 105408272, "step": 156395 }, { "epoch": 3.8208780201793173, "grad_norm": 135.96827697753906, "learning_rate": 3.201232120435934e-07, "loss": 0.0166, "num_input_tokens_seen": 105411344, "step": 156400 }, { "epoch": 3.8210001710111645, "grad_norm": 8.144104957580566, "learning_rate": 3.2006067820401026e-07, "loss": 0.1224, "num_input_tokens_seen": 105414864, "step": 156405 }, { "epoch": 3.8211223218430117, "grad_norm": 0.004194645211100578, "learning_rate": 3.1999814930912914e-07, "loss": 0.0, "num_input_tokens_seen": 105418640, "step": 156410 }, { "epoch": 3.821244472674859, "grad_norm": 0.0026404806412756443, "learning_rate": 3.1993562535940413e-07, "loss": 0.0, "num_input_tokens_seen": 105422288, "step": 156415 }, { "epoch": 3.821366623506706, "grad_norm": 0.02045687474310398, "learning_rate": 3.198731063552901e-07, "loss": 0.0, "num_input_tokens_seen": 105425744, "step": 156420 }, { "epoch": 3.8214887743385533, "grad_norm": 0.02419469691812992, "learning_rate": 3.1981059229724205e-07, "loss": 0.0, "num_input_tokens_seen": 105428944, "step": 156425 }, { "epoch": 3.8216109251704005, "grad_norm": 0.03584418073296547, "learning_rate": 3.197480831857143e-07, "loss": 0.0001, "num_input_tokens_seen": 105432400, "step": 156430 }, { "epoch": 3.8217330760022477, "grad_norm": 0.05083722621202469, "learning_rate": 3.1968557902116124e-07, "loss": 0.0, "num_input_tokens_seen": 105435664, "step": 156435 }, { "epoch": 3.821855226834095, "grad_norm": 0.0034922566264867783, "learning_rate": 3.196230798040379e-07, "loss": 0.0, "num_input_tokens_seen": 105439248, "step": 156440 }, { "epoch": 3.821977377665942, "grad_norm": 0.011057974770665169, "learning_rate": 3.195605855347985e-07, "loss": 0.0, "num_input_tokens_seen": 105442576, "step": 156445 }, { "epoch": 3.8220995284977892, "grad_norm": 0.0003465529007371515, "learning_rate": 3.194980962138972e-07, "loss": 0.0442, "num_input_tokens_seen": 105445776, "step": 156450 }, { "epoch": 3.822221679329636, "grad_norm": 0.004655024968087673, "learning_rate": 3.1943561184178893e-07, "loss": 0.0698, "num_input_tokens_seen": 105449744, "step": 156455 }, { "epoch": 3.8223438301614836, "grad_norm": 55.83885192871094, "learning_rate": 3.1937313241892806e-07, "loss": 0.0703, "num_input_tokens_seen": 105452880, "step": 156460 }, { "epoch": 3.8224659809933303, "grad_norm": 0.009342874400317669, "learning_rate": 3.1931065794576863e-07, "loss": 0.0317, "num_input_tokens_seen": 105456208, "step": 156465 }, { "epoch": 3.822588131825178, "grad_norm": 0.0033157255966216326, "learning_rate": 3.1924818842276547e-07, "loss": 0.0, "num_input_tokens_seen": 105459472, "step": 156470 }, { "epoch": 3.8227102826570247, "grad_norm": 0.004607651382684708, "learning_rate": 3.1918572385037225e-07, "loss": 0.0, "num_input_tokens_seen": 105462608, "step": 156475 }, { "epoch": 3.822832433488872, "grad_norm": 0.0009573953575454652, "learning_rate": 3.191232642290439e-07, "loss": 0.0, "num_input_tokens_seen": 105466640, "step": 156480 }, { "epoch": 3.822954584320719, "grad_norm": 0.021095087751746178, "learning_rate": 3.19060809559234e-07, "loss": 0.0001, "num_input_tokens_seen": 105470224, "step": 156485 }, { "epoch": 3.8230767351525663, "grad_norm": 0.0032122910488396883, "learning_rate": 3.18998359841397e-07, "loss": 0.0, "num_input_tokens_seen": 105473360, "step": 156490 }, { "epoch": 3.8231988859844135, "grad_norm": 0.19686217606067657, "learning_rate": 3.189359150759875e-07, "loss": 0.0001, "num_input_tokens_seen": 105476496, "step": 156495 }, { "epoch": 3.8233210368162607, "grad_norm": 0.04594238102436066, "learning_rate": 3.1887347526345885e-07, "loss": 0.0001, "num_input_tokens_seen": 105479568, "step": 156500 }, { "epoch": 3.823443187648108, "grad_norm": 0.0016377909341827035, "learning_rate": 3.1881104040426574e-07, "loss": 0.0, "num_input_tokens_seen": 105482512, "step": 156505 }, { "epoch": 3.823565338479955, "grad_norm": 0.003330622799694538, "learning_rate": 3.187486104988617e-07, "loss": 0.0, "num_input_tokens_seen": 105485968, "step": 156510 }, { "epoch": 3.8236874893118022, "grad_norm": 0.001057515270076692, "learning_rate": 3.18686185547701e-07, "loss": 0.0003, "num_input_tokens_seen": 105489680, "step": 156515 }, { "epoch": 3.8238096401436494, "grad_norm": 0.00812963955104351, "learning_rate": 3.1862376555123795e-07, "loss": 0.0, "num_input_tokens_seen": 105492880, "step": 156520 }, { "epoch": 3.8239317909754966, "grad_norm": 0.004067501053214073, "learning_rate": 3.1856135050992584e-07, "loss": 0.0, "num_input_tokens_seen": 105495952, "step": 156525 }, { "epoch": 3.824053941807344, "grad_norm": 0.005482273641973734, "learning_rate": 3.184989404242191e-07, "loss": 0.0002, "num_input_tokens_seen": 105499472, "step": 156530 }, { "epoch": 3.824176092639191, "grad_norm": 0.005263707600533962, "learning_rate": 3.184365352945715e-07, "loss": 0.0001, "num_input_tokens_seen": 105502672, "step": 156535 }, { "epoch": 3.824298243471038, "grad_norm": 0.0015458049019798636, "learning_rate": 3.183741351214363e-07, "loss": 0.0, "num_input_tokens_seen": 105505936, "step": 156540 }, { "epoch": 3.8244203943028854, "grad_norm": 0.000650758680421859, "learning_rate": 3.1831173990526806e-07, "loss": 0.0, "num_input_tokens_seen": 105509200, "step": 156545 }, { "epoch": 3.824542545134732, "grad_norm": 0.0012706829002127051, "learning_rate": 3.1824934964652e-07, "loss": 0.0, "num_input_tokens_seen": 105512528, "step": 156550 }, { "epoch": 3.8246646959665798, "grad_norm": 0.4105299711227417, "learning_rate": 3.1818696434564616e-07, "loss": 0.0016, "num_input_tokens_seen": 105515728, "step": 156555 }, { "epoch": 3.8247868467984265, "grad_norm": 0.007774420082569122, "learning_rate": 3.1812458400309993e-07, "loss": 0.0001, "num_input_tokens_seen": 105518800, "step": 156560 }, { "epoch": 3.8249089976302737, "grad_norm": 0.033843837678432465, "learning_rate": 3.180622086193354e-07, "loss": 0.0, "num_input_tokens_seen": 105522832, "step": 156565 }, { "epoch": 3.825031148462121, "grad_norm": 0.0010137784993276, "learning_rate": 3.1799983819480557e-07, "loss": 0.0, "num_input_tokens_seen": 105526224, "step": 156570 }, { "epoch": 3.825153299293968, "grad_norm": 0.000436730042565614, "learning_rate": 3.179374727299644e-07, "loss": 0.0, "num_input_tokens_seen": 105529744, "step": 156575 }, { "epoch": 3.8252754501258153, "grad_norm": 0.017465995624661446, "learning_rate": 3.1787511222526565e-07, "loss": 0.0, "num_input_tokens_seen": 105533136, "step": 156580 }, { "epoch": 3.8253976009576625, "grad_norm": 0.0004046593385282904, "learning_rate": 3.1781275668116225e-07, "loss": 0.0332, "num_input_tokens_seen": 105536528, "step": 156585 }, { "epoch": 3.8255197517895096, "grad_norm": 0.00676988298073411, "learning_rate": 3.177504060981083e-07, "loss": 0.0, "num_input_tokens_seen": 105539984, "step": 156590 }, { "epoch": 3.825641902621357, "grad_norm": 0.00046601903159171343, "learning_rate": 3.176880604765565e-07, "loss": 0.0, "num_input_tokens_seen": 105543376, "step": 156595 }, { "epoch": 3.825764053453204, "grad_norm": 0.0029104005079716444, "learning_rate": 3.1762571981696106e-07, "loss": 0.0, "num_input_tokens_seen": 105547024, "step": 156600 }, { "epoch": 3.825886204285051, "grad_norm": 0.00048557791160419583, "learning_rate": 3.175633841197746e-07, "loss": 0.0, "num_input_tokens_seen": 105550160, "step": 156605 }, { "epoch": 3.8260083551168984, "grad_norm": 0.0008252342231571674, "learning_rate": 3.1750105338545075e-07, "loss": 0.0, "num_input_tokens_seen": 105553424, "step": 156610 }, { "epoch": 3.8261305059487456, "grad_norm": 0.00999493058770895, "learning_rate": 3.174387276144431e-07, "loss": 0.0, "num_input_tokens_seen": 105556880, "step": 156615 }, { "epoch": 3.826252656780593, "grad_norm": 0.0012048380449414253, "learning_rate": 3.1737640680720433e-07, "loss": 0.0, "num_input_tokens_seen": 105560208, "step": 156620 }, { "epoch": 3.82637480761244, "grad_norm": 0.004898492246866226, "learning_rate": 3.173140909641883e-07, "loss": 0.0, "num_input_tokens_seen": 105564176, "step": 156625 }, { "epoch": 3.826496958444287, "grad_norm": 0.8940964937210083, "learning_rate": 3.1725178008584743e-07, "loss": 0.0002, "num_input_tokens_seen": 105567440, "step": 156630 }, { "epoch": 3.826619109276134, "grad_norm": 0.0011455508647486567, "learning_rate": 3.1718947417263553e-07, "loss": 0.0, "num_input_tokens_seen": 105570576, "step": 156635 }, { "epoch": 3.8267412601079815, "grad_norm": 0.0053358012810349464, "learning_rate": 3.1712717322500514e-07, "loss": 0.0, "num_input_tokens_seen": 105573520, "step": 156640 }, { "epoch": 3.8268634109398283, "grad_norm": 0.002351941540837288, "learning_rate": 3.1706487724341e-07, "loss": 0.0, "num_input_tokens_seen": 105576592, "step": 156645 }, { "epoch": 3.826985561771676, "grad_norm": 0.06902336329221725, "learning_rate": 3.1700258622830265e-07, "loss": 0.0626, "num_input_tokens_seen": 105580304, "step": 156650 }, { "epoch": 3.8271077126035227, "grad_norm": 0.0015805003931745887, "learning_rate": 3.1694030018013596e-07, "loss": 0.0, "num_input_tokens_seen": 105583888, "step": 156655 }, { "epoch": 3.82722986343537, "grad_norm": 0.01736919768154621, "learning_rate": 3.168780190993634e-07, "loss": 0.0, "num_input_tokens_seen": 105587024, "step": 156660 }, { "epoch": 3.827352014267217, "grad_norm": 0.003065595170482993, "learning_rate": 3.1681574298643743e-07, "loss": 0.0, "num_input_tokens_seen": 105590288, "step": 156665 }, { "epoch": 3.8274741650990642, "grad_norm": 0.017361000180244446, "learning_rate": 3.16753471841811e-07, "loss": 0.0, "num_input_tokens_seen": 105593232, "step": 156670 }, { "epoch": 3.8275963159309114, "grad_norm": 0.001208508969284594, "learning_rate": 3.1669120566593745e-07, "loss": 0.0, "num_input_tokens_seen": 105596368, "step": 156675 }, { "epoch": 3.8277184667627586, "grad_norm": 0.16313768923282623, "learning_rate": 3.166289444592689e-07, "loss": 0.0, "num_input_tokens_seen": 105599568, "step": 156680 }, { "epoch": 3.827840617594606, "grad_norm": 0.0008120430866256356, "learning_rate": 3.1656668822225884e-07, "loss": 0.0, "num_input_tokens_seen": 105603152, "step": 156685 }, { "epoch": 3.827962768426453, "grad_norm": 0.5106744170188904, "learning_rate": 3.165044369553592e-07, "loss": 0.0001, "num_input_tokens_seen": 105606736, "step": 156690 }, { "epoch": 3.8280849192583, "grad_norm": 0.0019487874815240502, "learning_rate": 3.1644219065902366e-07, "loss": 0.0001, "num_input_tokens_seen": 105610448, "step": 156695 }, { "epoch": 3.8282070700901474, "grad_norm": 0.0008269055979326367, "learning_rate": 3.1637994933370393e-07, "loss": 0.0, "num_input_tokens_seen": 105613456, "step": 156700 }, { "epoch": 3.8283292209219946, "grad_norm": 0.0038787401281297207, "learning_rate": 3.16317712979853e-07, "loss": 0.0, "num_input_tokens_seen": 105616784, "step": 156705 }, { "epoch": 3.8284513717538418, "grad_norm": 0.00037652463652193546, "learning_rate": 3.16255481597924e-07, "loss": 0.0, "num_input_tokens_seen": 105620624, "step": 156710 }, { "epoch": 3.828573522585689, "grad_norm": 0.03159145638346672, "learning_rate": 3.161932551883687e-07, "loss": 0.0, "num_input_tokens_seen": 105624080, "step": 156715 }, { "epoch": 3.828695673417536, "grad_norm": 0.0006402541184797883, "learning_rate": 3.161310337516402e-07, "loss": 0.0001, "num_input_tokens_seen": 105627344, "step": 156720 }, { "epoch": 3.8288178242493833, "grad_norm": 0.0021268008276820183, "learning_rate": 3.1606881728819057e-07, "loss": 0.0, "num_input_tokens_seen": 105630416, "step": 156725 }, { "epoch": 3.82893997508123, "grad_norm": 0.010209254920482635, "learning_rate": 3.160066057984724e-07, "loss": 0.0, "num_input_tokens_seen": 105633680, "step": 156730 }, { "epoch": 3.8290621259130777, "grad_norm": 0.007120275404304266, "learning_rate": 3.1594439928293847e-07, "loss": 0.0, "num_input_tokens_seen": 105637136, "step": 156735 }, { "epoch": 3.8291842767449245, "grad_norm": 0.00281558302231133, "learning_rate": 3.1588219774204085e-07, "loss": 0.0001, "num_input_tokens_seen": 105640080, "step": 156740 }, { "epoch": 3.8293064275767716, "grad_norm": 0.0013388247461989522, "learning_rate": 3.1582000117623154e-07, "loss": 0.0, "num_input_tokens_seen": 105643856, "step": 156745 }, { "epoch": 3.829428578408619, "grad_norm": 0.0020310573745518923, "learning_rate": 3.1575780958596353e-07, "loss": 0.0001, "num_input_tokens_seen": 105646928, "step": 156750 }, { "epoch": 3.829550729240466, "grad_norm": 0.0013612187467515469, "learning_rate": 3.1569562297168883e-07, "loss": 0.0, "num_input_tokens_seen": 105650064, "step": 156755 }, { "epoch": 3.829672880072313, "grad_norm": 0.0015260720392689109, "learning_rate": 3.1563344133385927e-07, "loss": 0.0, "num_input_tokens_seen": 105653648, "step": 156760 }, { "epoch": 3.8297950309041604, "grad_norm": 0.003066976321861148, "learning_rate": 3.155712646729275e-07, "loss": 0.0002, "num_input_tokens_seen": 105656528, "step": 156765 }, { "epoch": 3.8299171817360076, "grad_norm": 0.0018471548100933433, "learning_rate": 3.155090929893458e-07, "loss": 0.0, "num_input_tokens_seen": 105660112, "step": 156770 }, { "epoch": 3.830039332567855, "grad_norm": 0.002350366208702326, "learning_rate": 3.154469262835657e-07, "loss": 0.0, "num_input_tokens_seen": 105663504, "step": 156775 }, { "epoch": 3.830161483399702, "grad_norm": 35.2591438293457, "learning_rate": 3.153847645560401e-07, "loss": 0.0762, "num_input_tokens_seen": 105666960, "step": 156780 }, { "epoch": 3.830283634231549, "grad_norm": 0.016773177310824394, "learning_rate": 3.153226078072202e-07, "loss": 0.0838, "num_input_tokens_seen": 105670800, "step": 156785 }, { "epoch": 3.8304057850633964, "grad_norm": 0.0027613737620413303, "learning_rate": 3.152604560375589e-07, "loss": 0.0, "num_input_tokens_seen": 105673872, "step": 156790 }, { "epoch": 3.8305279358952435, "grad_norm": 522.2001953125, "learning_rate": 3.1519830924750734e-07, "loss": 0.003, "num_input_tokens_seen": 105677008, "step": 156795 }, { "epoch": 3.8306500867270907, "grad_norm": 0.0006445105536840856, "learning_rate": 3.151361674375179e-07, "loss": 0.05, "num_input_tokens_seen": 105680912, "step": 156800 }, { "epoch": 3.830772237558938, "grad_norm": 0.0025355806574225426, "learning_rate": 3.1507403060804274e-07, "loss": 0.0, "num_input_tokens_seen": 105684624, "step": 156805 }, { "epoch": 3.830894388390785, "grad_norm": 0.007931312546133995, "learning_rate": 3.1501189875953314e-07, "loss": 0.0, "num_input_tokens_seen": 105687696, "step": 156810 }, { "epoch": 3.831016539222632, "grad_norm": 0.19625136256217957, "learning_rate": 3.1494977189244166e-07, "loss": 0.0003, "num_input_tokens_seen": 105690960, "step": 156815 }, { "epoch": 3.8311386900544795, "grad_norm": 0.0006346600712276995, "learning_rate": 3.148876500072193e-07, "loss": 0.0, "num_input_tokens_seen": 105694608, "step": 156820 }, { "epoch": 3.8312608408863262, "grad_norm": 255.44143676757812, "learning_rate": 3.1482553310431816e-07, "loss": 0.0444, "num_input_tokens_seen": 105698000, "step": 156825 }, { "epoch": 3.831382991718174, "grad_norm": 0.001601660973392427, "learning_rate": 3.147634211841904e-07, "loss": 0.0, "num_input_tokens_seen": 105701456, "step": 156830 }, { "epoch": 3.8315051425500206, "grad_norm": 0.005953413899987936, "learning_rate": 3.1470131424728707e-07, "loss": 0.0002, "num_input_tokens_seen": 105705232, "step": 156835 }, { "epoch": 3.831627293381868, "grad_norm": 0.0003653077292256057, "learning_rate": 3.146392122940604e-07, "loss": 0.0, "num_input_tokens_seen": 105708176, "step": 156840 }, { "epoch": 3.831749444213715, "grad_norm": 0.0009022181620821357, "learning_rate": 3.145771153249618e-07, "loss": 0.0, "num_input_tokens_seen": 105711056, "step": 156845 }, { "epoch": 3.831871595045562, "grad_norm": 0.00146490428596735, "learning_rate": 3.145150233404423e-07, "loss": 0.0, "num_input_tokens_seen": 105714064, "step": 156850 }, { "epoch": 3.8319937458774094, "grad_norm": 0.0021413289941847324, "learning_rate": 3.144529363409544e-07, "loss": 0.0, "num_input_tokens_seen": 105717328, "step": 156855 }, { "epoch": 3.8321158967092566, "grad_norm": 0.021374104544520378, "learning_rate": 3.1439085432694866e-07, "loss": 0.175, "num_input_tokens_seen": 105720528, "step": 156860 }, { "epoch": 3.8322380475411038, "grad_norm": 0.0032985706347972155, "learning_rate": 3.143287772988774e-07, "loss": 0.0, "num_input_tokens_seen": 105723728, "step": 156865 }, { "epoch": 3.832360198372951, "grad_norm": 0.006007062271237373, "learning_rate": 3.142667052571915e-07, "loss": 0.0003, "num_input_tokens_seen": 105726672, "step": 156870 }, { "epoch": 3.832482349204798, "grad_norm": 0.004959517158567905, "learning_rate": 3.1420463820234266e-07, "loss": 0.0, "num_input_tokens_seen": 105729936, "step": 156875 }, { "epoch": 3.8326045000366453, "grad_norm": 0.009409020654857159, "learning_rate": 3.14142576134782e-07, "loss": 0.0544, "num_input_tokens_seen": 105733072, "step": 156880 }, { "epoch": 3.8327266508684925, "grad_norm": 65.75674438476562, "learning_rate": 3.140805190549609e-07, "loss": 0.0538, "num_input_tokens_seen": 105736208, "step": 156885 }, { "epoch": 3.8328488017003397, "grad_norm": 0.006501436233520508, "learning_rate": 3.140184669633311e-07, "loss": 0.0, "num_input_tokens_seen": 105739408, "step": 156890 }, { "epoch": 3.832970952532187, "grad_norm": 0.00046390751958824694, "learning_rate": 3.1395641986034324e-07, "loss": 0.0, "num_input_tokens_seen": 105742992, "step": 156895 }, { "epoch": 3.8330931033640336, "grad_norm": 0.006816296372562647, "learning_rate": 3.138943777464491e-07, "loss": 0.0001, "num_input_tokens_seen": 105746384, "step": 156900 }, { "epoch": 3.8332152541958813, "grad_norm": 26.399097442626953, "learning_rate": 3.138323406220993e-07, "loss": 0.0245, "num_input_tokens_seen": 105749904, "step": 156905 }, { "epoch": 3.833337405027728, "grad_norm": 0.014151382260024548, "learning_rate": 3.1377030848774565e-07, "loss": 0.0, "num_input_tokens_seen": 105753808, "step": 156910 }, { "epoch": 3.8334595558595757, "grad_norm": 0.0056682913564145565, "learning_rate": 3.1370828134383845e-07, "loss": 0.0, "num_input_tokens_seen": 105757136, "step": 156915 }, { "epoch": 3.8335817066914224, "grad_norm": 0.017652269452810287, "learning_rate": 3.136462591908293e-07, "loss": 0.0, "num_input_tokens_seen": 105759888, "step": 156920 }, { "epoch": 3.8337038575232696, "grad_norm": 0.32824185490608215, "learning_rate": 3.1358424202916945e-07, "loss": 0.0361, "num_input_tokens_seen": 105763280, "step": 156925 }, { "epoch": 3.8338260083551168, "grad_norm": 0.0020293546840548515, "learning_rate": 3.135222298593093e-07, "loss": 0.0, "num_input_tokens_seen": 105766288, "step": 156930 }, { "epoch": 3.833948159186964, "grad_norm": 0.018652616068720818, "learning_rate": 3.134602226817005e-07, "loss": 0.0, "num_input_tokens_seen": 105769680, "step": 156935 }, { "epoch": 3.834070310018811, "grad_norm": 0.0029221416916698217, "learning_rate": 3.1339822049679323e-07, "loss": 0.0, "num_input_tokens_seen": 105773584, "step": 156940 }, { "epoch": 3.8341924608506583, "grad_norm": 0.004153568297624588, "learning_rate": 3.1333622330503905e-07, "loss": 0.0406, "num_input_tokens_seen": 105776656, "step": 156945 }, { "epoch": 3.8343146116825055, "grad_norm": 0.0019725682213902473, "learning_rate": 3.1327423110688835e-07, "loss": 0.0, "num_input_tokens_seen": 105780240, "step": 156950 }, { "epoch": 3.8344367625143527, "grad_norm": 0.006787466816604137, "learning_rate": 3.1321224390279235e-07, "loss": 0.0005, "num_input_tokens_seen": 105784144, "step": 156955 }, { "epoch": 3.8345589133462, "grad_norm": 0.0011359815252944827, "learning_rate": 3.1315026169320167e-07, "loss": 0.0001, "num_input_tokens_seen": 105787728, "step": 156960 }, { "epoch": 3.834681064178047, "grad_norm": 0.004417893476784229, "learning_rate": 3.130882844785667e-07, "loss": 0.0, "num_input_tokens_seen": 105791760, "step": 156965 }, { "epoch": 3.8348032150098943, "grad_norm": 0.00016033170686569065, "learning_rate": 3.1302631225933884e-07, "loss": 0.0, "num_input_tokens_seen": 105795216, "step": 156970 }, { "epoch": 3.8349253658417415, "grad_norm": 0.01326667983084917, "learning_rate": 3.1296434503596815e-07, "loss": 0.0399, "num_input_tokens_seen": 105798736, "step": 156975 }, { "epoch": 3.8350475166735887, "grad_norm": 0.038337502628564835, "learning_rate": 3.1290238280890547e-07, "loss": 0.0, "num_input_tokens_seen": 105802384, "step": 156980 }, { "epoch": 3.835169667505436, "grad_norm": 0.0024482160806655884, "learning_rate": 3.128404255786017e-07, "loss": 0.0001, "num_input_tokens_seen": 105805776, "step": 156985 }, { "epoch": 3.835291818337283, "grad_norm": 0.6643205881118774, "learning_rate": 3.12778473345507e-07, "loss": 0.0737, "num_input_tokens_seen": 105809040, "step": 156990 }, { "epoch": 3.83541396916913, "grad_norm": 0.050306521356105804, "learning_rate": 3.1271652611007226e-07, "loss": 0.0001, "num_input_tokens_seen": 105812304, "step": 156995 }, { "epoch": 3.8355361200009774, "grad_norm": 0.00011481645924504846, "learning_rate": 3.126545838727476e-07, "loss": 0.0, "num_input_tokens_seen": 105815824, "step": 157000 }, { "epoch": 3.835658270832824, "grad_norm": 0.010054918937385082, "learning_rate": 3.12592646633984e-07, "loss": 0.0, "num_input_tokens_seen": 105819280, "step": 157005 }, { "epoch": 3.835780421664672, "grad_norm": 0.0001712543162284419, "learning_rate": 3.1253071439423116e-07, "loss": 0.0, "num_input_tokens_seen": 105822992, "step": 157010 }, { "epoch": 3.8359025724965186, "grad_norm": 0.0005007112631574273, "learning_rate": 3.1246878715393996e-07, "loss": 0.0, "num_input_tokens_seen": 105826512, "step": 157015 }, { "epoch": 3.8360247233283657, "grad_norm": 0.014416385442018509, "learning_rate": 3.1240686491356096e-07, "loss": 0.0, "num_input_tokens_seen": 105829904, "step": 157020 }, { "epoch": 3.836146874160213, "grad_norm": 0.006188525352627039, "learning_rate": 3.123449476735438e-07, "loss": 0.0, "num_input_tokens_seen": 105832912, "step": 157025 }, { "epoch": 3.83626902499206, "grad_norm": 0.0016473844880238175, "learning_rate": 3.1228303543433953e-07, "loss": 0.0, "num_input_tokens_seen": 105836368, "step": 157030 }, { "epoch": 3.8363911758239073, "grad_norm": 0.017037464305758476, "learning_rate": 3.1222112819639755e-07, "loss": 0.0, "num_input_tokens_seen": 105839760, "step": 157035 }, { "epoch": 3.8365133266557545, "grad_norm": 0.002478261012583971, "learning_rate": 3.1215922596016897e-07, "loss": 0.0, "num_input_tokens_seen": 105843280, "step": 157040 }, { "epoch": 3.8366354774876017, "grad_norm": 0.000509330362547189, "learning_rate": 3.1209732872610316e-07, "loss": 0.0, "num_input_tokens_seen": 105846416, "step": 157045 }, { "epoch": 3.836757628319449, "grad_norm": 200.7996063232422, "learning_rate": 3.1203543649465093e-07, "loss": 0.0174, "num_input_tokens_seen": 105849680, "step": 157050 }, { "epoch": 3.836879779151296, "grad_norm": 0.08813425898551941, "learning_rate": 3.1197354926626175e-07, "loss": 0.0001, "num_input_tokens_seen": 105853328, "step": 157055 }, { "epoch": 3.8370019299831433, "grad_norm": 0.00032203836599364877, "learning_rate": 3.1191166704138614e-07, "loss": 0.0, "num_input_tokens_seen": 105856528, "step": 157060 }, { "epoch": 3.8371240808149905, "grad_norm": 0.003999975975602865, "learning_rate": 3.118497898204742e-07, "loss": 0.0, "num_input_tokens_seen": 105859792, "step": 157065 }, { "epoch": 3.8372462316468376, "grad_norm": 0.02387079782783985, "learning_rate": 3.117879176039752e-07, "loss": 0.0, "num_input_tokens_seen": 105863312, "step": 157070 }, { "epoch": 3.837368382478685, "grad_norm": 0.0012991444673389196, "learning_rate": 3.117260503923396e-07, "loss": 0.0001, "num_input_tokens_seen": 105867024, "step": 157075 }, { "epoch": 3.8374905333105316, "grad_norm": 0.002810559468343854, "learning_rate": 3.1166418818601757e-07, "loss": 0.1088, "num_input_tokens_seen": 105870288, "step": 157080 }, { "epoch": 3.837612684142379, "grad_norm": 0.009368712082505226, "learning_rate": 3.116023309854584e-07, "loss": 0.0, "num_input_tokens_seen": 105873744, "step": 157085 }, { "epoch": 3.837734834974226, "grad_norm": 0.01500675454735756, "learning_rate": 3.1154047879111256e-07, "loss": 0.0, "num_input_tokens_seen": 105876880, "step": 157090 }, { "epoch": 3.8378569858060736, "grad_norm": 0.15511620044708252, "learning_rate": 3.1147863160342923e-07, "loss": 0.0001, "num_input_tokens_seen": 105880336, "step": 157095 }, { "epoch": 3.8379791366379203, "grad_norm": 0.0014603231102228165, "learning_rate": 3.114167894228589e-07, "loss": 0.0, "num_input_tokens_seen": 105883792, "step": 157100 }, { "epoch": 3.8381012874697675, "grad_norm": 0.0008880642708390951, "learning_rate": 3.1135495224985043e-07, "loss": 0.0, "num_input_tokens_seen": 105887312, "step": 157105 }, { "epoch": 3.8382234383016147, "grad_norm": 0.0009251810261048377, "learning_rate": 3.11293120084854e-07, "loss": 0.0, "num_input_tokens_seen": 105890704, "step": 157110 }, { "epoch": 3.838345589133462, "grad_norm": 0.006861668545752764, "learning_rate": 3.112312929283195e-07, "loss": 0.0, "num_input_tokens_seen": 105893904, "step": 157115 }, { "epoch": 3.838467739965309, "grad_norm": 0.031088093295693398, "learning_rate": 3.111694707806961e-07, "loss": 0.0325, "num_input_tokens_seen": 105897424, "step": 157120 }, { "epoch": 3.8385898907971563, "grad_norm": 0.0009894341928884387, "learning_rate": 3.111076536424337e-07, "loss": 0.0, "num_input_tokens_seen": 105900496, "step": 157125 }, { "epoch": 3.8387120416290035, "grad_norm": 0.0024139767047017813, "learning_rate": 3.1104584151398163e-07, "loss": 0.0, "num_input_tokens_seen": 105904208, "step": 157130 }, { "epoch": 3.8388341924608507, "grad_norm": 0.0013710018247365952, "learning_rate": 3.1098403439578945e-07, "loss": 0.0, "num_input_tokens_seen": 105907728, "step": 157135 }, { "epoch": 3.838956343292698, "grad_norm": 0.000259570952039212, "learning_rate": 3.1092223228830696e-07, "loss": 0.0002, "num_input_tokens_seen": 105911056, "step": 157140 }, { "epoch": 3.839078494124545, "grad_norm": 0.0033034791704267263, "learning_rate": 3.1086043519198315e-07, "loss": 0.0, "num_input_tokens_seen": 105913936, "step": 157145 }, { "epoch": 3.8392006449563922, "grad_norm": 11.38533878326416, "learning_rate": 3.107986431072678e-07, "loss": 0.0924, "num_input_tokens_seen": 105917072, "step": 157150 }, { "epoch": 3.8393227957882394, "grad_norm": 0.003908009268343449, "learning_rate": 3.107368560346101e-07, "loss": 0.0003, "num_input_tokens_seen": 105920272, "step": 157155 }, { "epoch": 3.8394449466200866, "grad_norm": 0.007159855682402849, "learning_rate": 3.1067507397445904e-07, "loss": 0.0001, "num_input_tokens_seen": 105923344, "step": 157160 }, { "epoch": 3.839567097451934, "grad_norm": 0.025989584624767303, "learning_rate": 3.106132969272646e-07, "loss": 0.0001, "num_input_tokens_seen": 105926480, "step": 157165 }, { "epoch": 3.839689248283781, "grad_norm": 0.0034603348467499018, "learning_rate": 3.1055152489347535e-07, "loss": 0.0, "num_input_tokens_seen": 105929872, "step": 157170 }, { "epoch": 3.8398113991156277, "grad_norm": 0.0010983675019815564, "learning_rate": 3.1048975787354126e-07, "loss": 0.0, "num_input_tokens_seen": 105933392, "step": 157175 }, { "epoch": 3.8399335499474754, "grad_norm": 0.003446553135290742, "learning_rate": 3.104279958679107e-07, "loss": 0.0588, "num_input_tokens_seen": 105936784, "step": 157180 }, { "epoch": 3.840055700779322, "grad_norm": 0.006336711347103119, "learning_rate": 3.103662388770335e-07, "loss": 0.0, "num_input_tokens_seen": 105939920, "step": 157185 }, { "epoch": 3.8401778516111693, "grad_norm": 0.21213245391845703, "learning_rate": 3.1030448690135824e-07, "loss": 0.0001, "num_input_tokens_seen": 105943312, "step": 157190 }, { "epoch": 3.8403000024430165, "grad_norm": 0.008288219571113586, "learning_rate": 3.1024273994133465e-07, "loss": 0.0, "num_input_tokens_seen": 105946832, "step": 157195 }, { "epoch": 3.8404221532748637, "grad_norm": 0.0031236589420586824, "learning_rate": 3.1018099799741095e-07, "loss": 0.0, "num_input_tokens_seen": 105950352, "step": 157200 }, { "epoch": 3.840544304106711, "grad_norm": 0.14038878679275513, "learning_rate": 3.101192610700366e-07, "loss": 0.0001, "num_input_tokens_seen": 105953680, "step": 157205 }, { "epoch": 3.840666454938558, "grad_norm": 0.005273896735161543, "learning_rate": 3.100575291596609e-07, "loss": 0.0, "num_input_tokens_seen": 105957072, "step": 157210 }, { "epoch": 3.8407886057704053, "grad_norm": 0.007837706245481968, "learning_rate": 3.0999580226673207e-07, "loss": 0.0, "num_input_tokens_seen": 105960656, "step": 157215 }, { "epoch": 3.8409107566022525, "grad_norm": 0.0034366052132099867, "learning_rate": 3.0993408039169964e-07, "loss": 0.0, "num_input_tokens_seen": 105964176, "step": 157220 }, { "epoch": 3.8410329074340996, "grad_norm": 0.03968435898423195, "learning_rate": 3.098723635350119e-07, "loss": 0.0, "num_input_tokens_seen": 105967952, "step": 157225 }, { "epoch": 3.841155058265947, "grad_norm": 0.025878041982650757, "learning_rate": 3.0981065169711793e-07, "loss": 0.0, "num_input_tokens_seen": 105971344, "step": 157230 }, { "epoch": 3.841277209097794, "grad_norm": 0.00534423440694809, "learning_rate": 3.0974894487846693e-07, "loss": 0.0009, "num_input_tokens_seen": 105974800, "step": 157235 }, { "epoch": 3.841399359929641, "grad_norm": 0.01658131182193756, "learning_rate": 3.096872430795069e-07, "loss": 0.0, "num_input_tokens_seen": 105978640, "step": 157240 }, { "epoch": 3.8415215107614884, "grad_norm": 0.0042229569517076015, "learning_rate": 3.0962554630068716e-07, "loss": 0.0563, "num_input_tokens_seen": 105981712, "step": 157245 }, { "epoch": 3.8416436615933356, "grad_norm": 0.0016310889041051269, "learning_rate": 3.095638545424559e-07, "loss": 0.0003, "num_input_tokens_seen": 105984656, "step": 157250 }, { "epoch": 3.841765812425183, "grad_norm": 0.0044392179697752, "learning_rate": 3.095021678052623e-07, "loss": 0.036, "num_input_tokens_seen": 105987984, "step": 157255 }, { "epoch": 3.8418879632570295, "grad_norm": 0.0019932505674660206, "learning_rate": 3.094404860895543e-07, "loss": 0.0, "num_input_tokens_seen": 105991376, "step": 157260 }, { "epoch": 3.842010114088877, "grad_norm": 0.0009364528232254088, "learning_rate": 3.0937880939578123e-07, "loss": 0.0001, "num_input_tokens_seen": 105994640, "step": 157265 }, { "epoch": 3.842132264920724, "grad_norm": 0.021783795207738876, "learning_rate": 3.0931713772439106e-07, "loss": 0.0, "num_input_tokens_seen": 105997648, "step": 157270 }, { "epoch": 3.8422544157525715, "grad_norm": 0.00874258577823639, "learning_rate": 3.0925547107583225e-07, "loss": 0.0, "num_input_tokens_seen": 106000848, "step": 157275 }, { "epoch": 3.8423765665844183, "grad_norm": 0.0005422328831627965, "learning_rate": 3.0919380945055374e-07, "loss": 0.0, "num_input_tokens_seen": 106004624, "step": 157280 }, { "epoch": 3.8424987174162655, "grad_norm": 0.004375748336315155, "learning_rate": 3.091321528490033e-07, "loss": 0.0, "num_input_tokens_seen": 106007568, "step": 157285 }, { "epoch": 3.8426208682481127, "grad_norm": 0.011253989301621914, "learning_rate": 3.090705012716297e-07, "loss": 0.0, "num_input_tokens_seen": 106010960, "step": 157290 }, { "epoch": 3.84274301907996, "grad_norm": 0.004648137837648392, "learning_rate": 3.090088547188815e-07, "loss": 0.0, "num_input_tokens_seen": 106014224, "step": 157295 }, { "epoch": 3.842865169911807, "grad_norm": 0.0027260452043265104, "learning_rate": 3.0894721319120654e-07, "loss": 0.0, "num_input_tokens_seen": 106017424, "step": 157300 }, { "epoch": 3.8429873207436542, "grad_norm": 0.00022834166884422302, "learning_rate": 3.088855766890536e-07, "loss": 0.0004, "num_input_tokens_seen": 106020816, "step": 157305 }, { "epoch": 3.8431094715755014, "grad_norm": 0.025838159024715424, "learning_rate": 3.0882394521287023e-07, "loss": 0.0002, "num_input_tokens_seen": 106024208, "step": 157310 }, { "epoch": 3.8432316224073486, "grad_norm": 0.0010606314754113555, "learning_rate": 3.0876231876310544e-07, "loss": 0.0466, "num_input_tokens_seen": 106027344, "step": 157315 }, { "epoch": 3.843353773239196, "grad_norm": 0.010034746490418911, "learning_rate": 3.0870069734020665e-07, "loss": 0.0625, "num_input_tokens_seen": 106030480, "step": 157320 }, { "epoch": 3.843475924071043, "grad_norm": 0.00046287436271086335, "learning_rate": 3.086390809446223e-07, "loss": 0.0002, "num_input_tokens_seen": 106033872, "step": 157325 }, { "epoch": 3.84359807490289, "grad_norm": 0.004671595059335232, "learning_rate": 3.0857746957680096e-07, "loss": 0.0, "num_input_tokens_seen": 106037328, "step": 157330 }, { "epoch": 3.8437202257347374, "grad_norm": 0.012156388722360134, "learning_rate": 3.085158632371898e-07, "loss": 0.0, "num_input_tokens_seen": 106040464, "step": 157335 }, { "epoch": 3.8438423765665846, "grad_norm": 0.00037465174682438374, "learning_rate": 3.084542619262376e-07, "loss": 0.0, "num_input_tokens_seen": 106043792, "step": 157340 }, { "epoch": 3.8439645273984313, "grad_norm": 0.0014692572876811028, "learning_rate": 3.083926656443917e-07, "loss": 0.0272, "num_input_tokens_seen": 106047184, "step": 157345 }, { "epoch": 3.844086678230279, "grad_norm": 0.10350882261991501, "learning_rate": 3.083310743921006e-07, "loss": 0.0, "num_input_tokens_seen": 106050640, "step": 157350 }, { "epoch": 3.8442088290621257, "grad_norm": 0.018523210659623146, "learning_rate": 3.082694881698118e-07, "loss": 0.0002, "num_input_tokens_seen": 106054416, "step": 157355 }, { "epoch": 3.8443309798939733, "grad_norm": 0.0012730018934234977, "learning_rate": 3.082079069779735e-07, "loss": 0.0, "num_input_tokens_seen": 106057808, "step": 157360 }, { "epoch": 3.84445313072582, "grad_norm": 0.0022517370525747538, "learning_rate": 3.081463308170331e-07, "loss": 0.0, "num_input_tokens_seen": 106061136, "step": 157365 }, { "epoch": 3.8445752815576673, "grad_norm": 0.0016175990458577871, "learning_rate": 3.0808475968743907e-07, "loss": 0.0, "num_input_tokens_seen": 106064272, "step": 157370 }, { "epoch": 3.8446974323895144, "grad_norm": 0.09517046064138412, "learning_rate": 3.080231935896387e-07, "loss": 0.0001, "num_input_tokens_seen": 106067792, "step": 157375 }, { "epoch": 3.8448195832213616, "grad_norm": 27.080537796020508, "learning_rate": 3.0796163252407946e-07, "loss": 0.0924, "num_input_tokens_seen": 106070928, "step": 157380 }, { "epoch": 3.844941734053209, "grad_norm": 0.0014318153262138367, "learning_rate": 3.079000764912093e-07, "loss": 0.0001, "num_input_tokens_seen": 106074064, "step": 157385 }, { "epoch": 3.845063884885056, "grad_norm": 75.27845764160156, "learning_rate": 3.078385254914764e-07, "loss": 0.0155, "num_input_tokens_seen": 106077776, "step": 157390 }, { "epoch": 3.845186035716903, "grad_norm": 0.06256076693534851, "learning_rate": 3.077769795253276e-07, "loss": 0.0001, "num_input_tokens_seen": 106081360, "step": 157395 }, { "epoch": 3.8453081865487504, "grad_norm": 0.0019677793607115746, "learning_rate": 3.077154385932109e-07, "loss": 0.0, "num_input_tokens_seen": 106084368, "step": 157400 }, { "epoch": 3.8454303373805976, "grad_norm": 0.07321988046169281, "learning_rate": 3.0765390269557356e-07, "loss": 0.0, "num_input_tokens_seen": 106087824, "step": 157405 }, { "epoch": 3.8455524882124448, "grad_norm": 0.0027987107168883085, "learning_rate": 3.0759237183286356e-07, "loss": 0.0, "num_input_tokens_seen": 106091088, "step": 157410 }, { "epoch": 3.845674639044292, "grad_norm": 0.013440297916531563, "learning_rate": 3.075308460055278e-07, "loss": 0.0002, "num_input_tokens_seen": 106094224, "step": 157415 }, { "epoch": 3.845796789876139, "grad_norm": 0.002260087290778756, "learning_rate": 3.074693252140139e-07, "loss": 0.0, "num_input_tokens_seen": 106097424, "step": 157420 }, { "epoch": 3.8459189407079863, "grad_norm": 0.017164267599582672, "learning_rate": 3.0740780945876963e-07, "loss": 0.0002, "num_input_tokens_seen": 106100816, "step": 157425 }, { "epoch": 3.8460410915398335, "grad_norm": 0.09083389490842819, "learning_rate": 3.0734629874024187e-07, "loss": 0.0202, "num_input_tokens_seen": 106104080, "step": 157430 }, { "epoch": 3.8461632423716807, "grad_norm": 0.00015704776160418987, "learning_rate": 3.072847930588783e-07, "loss": 0.0, "num_input_tokens_seen": 106107664, "step": 157435 }, { "epoch": 3.8462853932035275, "grad_norm": 0.005764813628047705, "learning_rate": 3.072232924151258e-07, "loss": 0.0, "num_input_tokens_seen": 106111760, "step": 157440 }, { "epoch": 3.846407544035375, "grad_norm": 0.004832027480006218, "learning_rate": 3.071617968094319e-07, "loss": 0.0, "num_input_tokens_seen": 106114704, "step": 157445 }, { "epoch": 3.846529694867222, "grad_norm": 9.927536302711815e-05, "learning_rate": 3.0710030624224405e-07, "loss": 0.0, "num_input_tokens_seen": 106117840, "step": 157450 }, { "epoch": 3.8466518456990695, "grad_norm": 0.007467443123459816, "learning_rate": 3.070388207140088e-07, "loss": 0.0001, "num_input_tokens_seen": 106120912, "step": 157455 }, { "epoch": 3.8467739965309162, "grad_norm": 0.05119911581277847, "learning_rate": 3.0697734022517386e-07, "loss": 0.0, "num_input_tokens_seen": 106124624, "step": 157460 }, { "epoch": 3.8468961473627634, "grad_norm": 0.0014485791325569153, "learning_rate": 3.0691586477618614e-07, "loss": 0.0, "num_input_tokens_seen": 106128400, "step": 157465 }, { "epoch": 3.8470182981946106, "grad_norm": 0.004423577804118395, "learning_rate": 3.0685439436749237e-07, "loss": 0.0614, "num_input_tokens_seen": 106131792, "step": 157470 }, { "epoch": 3.847140449026458, "grad_norm": 0.0003178124316036701, "learning_rate": 3.067929289995402e-07, "loss": 0.0, "num_input_tokens_seen": 106135824, "step": 157475 }, { "epoch": 3.847262599858305, "grad_norm": 0.0001393468992318958, "learning_rate": 3.067314686727761e-07, "loss": 0.0, "num_input_tokens_seen": 106139152, "step": 157480 }, { "epoch": 3.847384750690152, "grad_norm": 0.0022997669875621796, "learning_rate": 3.066700133876474e-07, "loss": 0.0, "num_input_tokens_seen": 106142032, "step": 157485 }, { "epoch": 3.8475069015219994, "grad_norm": 0.24477975070476532, "learning_rate": 3.066085631446006e-07, "loss": 0.0001, "num_input_tokens_seen": 106145616, "step": 157490 }, { "epoch": 3.8476290523538466, "grad_norm": 0.0012179150944575667, "learning_rate": 3.0654711794408304e-07, "loss": 0.0, "num_input_tokens_seen": 106148944, "step": 157495 }, { "epoch": 3.8477512031856937, "grad_norm": 0.3981289267539978, "learning_rate": 3.064856777865412e-07, "loss": 0.0002, "num_input_tokens_seen": 106152528, "step": 157500 }, { "epoch": 3.847873354017541, "grad_norm": 0.32446688413619995, "learning_rate": 3.064242426724223e-07, "loss": 0.0002, "num_input_tokens_seen": 106156048, "step": 157505 }, { "epoch": 3.847995504849388, "grad_norm": 0.0024203043431043625, "learning_rate": 3.0636281260217243e-07, "loss": 0.0, "num_input_tokens_seen": 106159184, "step": 157510 }, { "epoch": 3.8481176556812353, "grad_norm": 0.005391005426645279, "learning_rate": 3.0630138757623893e-07, "loss": 0.0433, "num_input_tokens_seen": 106162832, "step": 157515 }, { "epoch": 3.8482398065130825, "grad_norm": 0.0027643253561109304, "learning_rate": 3.0623996759506855e-07, "loss": 0.0, "num_input_tokens_seen": 106166096, "step": 157520 }, { "epoch": 3.8483619573449293, "grad_norm": 16.738208770751953, "learning_rate": 3.061785526591073e-07, "loss": 0.1025, "num_input_tokens_seen": 106169552, "step": 157525 }, { "epoch": 3.848484108176777, "grad_norm": 0.0038885173853486776, "learning_rate": 3.0611714276880273e-07, "loss": 0.0, "num_input_tokens_seen": 106173008, "step": 157530 }, { "epoch": 3.8486062590086236, "grad_norm": 0.0036091655492782593, "learning_rate": 3.060557379246005e-07, "loss": 0.0, "num_input_tokens_seen": 106176400, "step": 157535 }, { "epoch": 3.8487284098404713, "grad_norm": 0.0007444451912306249, "learning_rate": 3.0599433812694765e-07, "loss": 0.0283, "num_input_tokens_seen": 106179600, "step": 157540 }, { "epoch": 3.848850560672318, "grad_norm": 0.0016477408353239298, "learning_rate": 3.0593294337629097e-07, "loss": 0.0, "num_input_tokens_seen": 106183632, "step": 157545 }, { "epoch": 3.848972711504165, "grad_norm": 0.025943255051970482, "learning_rate": 3.0587155367307614e-07, "loss": 0.0001, "num_input_tokens_seen": 106186960, "step": 157550 }, { "epoch": 3.8490948623360124, "grad_norm": 0.0005402068491093814, "learning_rate": 3.0581016901775057e-07, "loss": 0.0713, "num_input_tokens_seen": 106190288, "step": 157555 }, { "epoch": 3.8492170131678596, "grad_norm": 0.7853077054023743, "learning_rate": 3.057487894107598e-07, "loss": 0.0002, "num_input_tokens_seen": 106193808, "step": 157560 }, { "epoch": 3.8493391639997068, "grad_norm": 0.023170972242951393, "learning_rate": 3.056874148525508e-07, "loss": 0.0, "num_input_tokens_seen": 106196944, "step": 157565 }, { "epoch": 3.849461314831554, "grad_norm": 0.0006037909188307822, "learning_rate": 3.056260453435694e-07, "loss": 0.0001, "num_input_tokens_seen": 106200208, "step": 157570 }, { "epoch": 3.849583465663401, "grad_norm": 0.0042616198770701885, "learning_rate": 3.0556468088426256e-07, "loss": 0.0001, "num_input_tokens_seen": 106203728, "step": 157575 }, { "epoch": 3.8497056164952483, "grad_norm": 0.0017427516868337989, "learning_rate": 3.0550332147507606e-07, "loss": 0.0, "num_input_tokens_seen": 106206864, "step": 157580 }, { "epoch": 3.8498277673270955, "grad_norm": 0.000949391454923898, "learning_rate": 3.0544196711645586e-07, "loss": 0.0, "num_input_tokens_seen": 106210128, "step": 157585 }, { "epoch": 3.8499499181589427, "grad_norm": 0.07061922550201416, "learning_rate": 3.053806178088488e-07, "loss": 0.0001, "num_input_tokens_seen": 106213456, "step": 157590 }, { "epoch": 3.85007206899079, "grad_norm": 0.0007843165658414364, "learning_rate": 3.053192735527005e-07, "loss": 0.0001, "num_input_tokens_seen": 106216656, "step": 157595 }, { "epoch": 3.850194219822637, "grad_norm": 0.011597550474107265, "learning_rate": 3.052579343484573e-07, "loss": 0.0619, "num_input_tokens_seen": 106220048, "step": 157600 }, { "epoch": 3.8503163706544843, "grad_norm": 0.0038043519016355276, "learning_rate": 3.0519660019656544e-07, "loss": 0.0005, "num_input_tokens_seen": 106223312, "step": 157605 }, { "epoch": 3.8504385214863315, "grad_norm": 0.12183055281639099, "learning_rate": 3.051352710974706e-07, "loss": 0.0, "num_input_tokens_seen": 106226448, "step": 157610 }, { "epoch": 3.8505606723181787, "grad_norm": 0.0008379321661777794, "learning_rate": 3.0507394705161913e-07, "loss": 0.0, "num_input_tokens_seen": 106229776, "step": 157615 }, { "epoch": 3.8506828231500254, "grad_norm": 0.0189727284014225, "learning_rate": 3.050126280594567e-07, "loss": 0.0, "num_input_tokens_seen": 106232976, "step": 157620 }, { "epoch": 3.850804973981873, "grad_norm": 0.007365493103861809, "learning_rate": 3.0495131412142963e-07, "loss": 0.0, "num_input_tokens_seen": 106236624, "step": 157625 }, { "epoch": 3.85092712481372, "grad_norm": 17.411718368530273, "learning_rate": 3.0489000523798316e-07, "loss": 0.1139, "num_input_tokens_seen": 106240528, "step": 157630 }, { "epoch": 3.851049275645567, "grad_norm": 0.0039583053439855576, "learning_rate": 3.048287014095635e-07, "loss": 0.0, "num_input_tokens_seen": 106244304, "step": 157635 }, { "epoch": 3.851171426477414, "grad_norm": 0.00854410044848919, "learning_rate": 3.0476740263661693e-07, "loss": 0.0, "num_input_tokens_seen": 106248272, "step": 157640 }, { "epoch": 3.8512935773092614, "grad_norm": 0.4559212923049927, "learning_rate": 3.0470610891958836e-07, "loss": 0.0001, "num_input_tokens_seen": 106251600, "step": 157645 }, { "epoch": 3.8514157281411086, "grad_norm": 0.00413027498871088, "learning_rate": 3.0464482025892444e-07, "loss": 0.0, "num_input_tokens_seen": 106254864, "step": 157650 }, { "epoch": 3.8515378789729557, "grad_norm": 0.004023184534162283, "learning_rate": 3.0458353665507e-07, "loss": 0.0, "num_input_tokens_seen": 106258064, "step": 157655 }, { "epoch": 3.851660029804803, "grad_norm": 0.0012542590266093612, "learning_rate": 3.0452225810847153e-07, "loss": 0.0, "num_input_tokens_seen": 106261584, "step": 157660 }, { "epoch": 3.85178218063665, "grad_norm": 0.002548971213400364, "learning_rate": 3.0446098461957383e-07, "loss": 0.0001, "num_input_tokens_seen": 106265168, "step": 157665 }, { "epoch": 3.8519043314684973, "grad_norm": 0.001746556255966425, "learning_rate": 3.043997161888233e-07, "loss": 0.0, "num_input_tokens_seen": 106268880, "step": 157670 }, { "epoch": 3.8520264823003445, "grad_norm": 0.0014939203392714262, "learning_rate": 3.0433845281666484e-07, "loss": 0.0, "num_input_tokens_seen": 106271824, "step": 157675 }, { "epoch": 3.8521486331321917, "grad_norm": 0.02588808350265026, "learning_rate": 3.0427719450354463e-07, "loss": 0.0, "num_input_tokens_seen": 106275472, "step": 157680 }, { "epoch": 3.852270783964039, "grad_norm": 0.02764168195426464, "learning_rate": 3.042159412499077e-07, "loss": 0.0001, "num_input_tokens_seen": 106278992, "step": 157685 }, { "epoch": 3.852392934795886, "grad_norm": 0.0657828077673912, "learning_rate": 3.041546930561992e-07, "loss": 0.0001, "num_input_tokens_seen": 106282320, "step": 157690 }, { "epoch": 3.8525150856277333, "grad_norm": 0.029806675389409065, "learning_rate": 3.04093449922865e-07, "loss": 0.0, "num_input_tokens_seen": 106285904, "step": 157695 }, { "epoch": 3.8526372364595804, "grad_norm": 0.0013426138320937753, "learning_rate": 3.0403221185035075e-07, "loss": 0.0, "num_input_tokens_seen": 106288912, "step": 157700 }, { "epoch": 3.852759387291427, "grad_norm": 0.0034400273580104113, "learning_rate": 3.0397097883910116e-07, "loss": 0.0, "num_input_tokens_seen": 106291664, "step": 157705 }, { "epoch": 3.852881538123275, "grad_norm": 0.007743450812995434, "learning_rate": 3.0390975088956207e-07, "loss": 0.0, "num_input_tokens_seen": 106294928, "step": 157710 }, { "epoch": 3.8530036889551216, "grad_norm": 0.01713470369577408, "learning_rate": 3.038485280021783e-07, "loss": 0.0, "num_input_tokens_seen": 106298448, "step": 157715 }, { "epoch": 3.853125839786969, "grad_norm": 0.0026307678781449795, "learning_rate": 3.0378731017739547e-07, "loss": 0.0, "num_input_tokens_seen": 106301968, "step": 157720 }, { "epoch": 3.853247990618816, "grad_norm": 0.026001354679465294, "learning_rate": 3.0372609741565824e-07, "loss": 0.0, "num_input_tokens_seen": 106305296, "step": 157725 }, { "epoch": 3.853370141450663, "grad_norm": 0.013364420272409916, "learning_rate": 3.0366488971741224e-07, "loss": 0.0, "num_input_tokens_seen": 106308304, "step": 157730 }, { "epoch": 3.8534922922825103, "grad_norm": 2.170867919921875, "learning_rate": 3.036036870831027e-07, "loss": 0.0002, "num_input_tokens_seen": 106311824, "step": 157735 }, { "epoch": 3.8536144431143575, "grad_norm": 0.0966757982969284, "learning_rate": 3.0354248951317407e-07, "loss": 0.022, "num_input_tokens_seen": 106315216, "step": 157740 }, { "epoch": 3.8537365939462047, "grad_norm": 0.018707867711782455, "learning_rate": 3.034812970080721e-07, "loss": 0.0, "num_input_tokens_seen": 106318288, "step": 157745 }, { "epoch": 3.853858744778052, "grad_norm": 0.011910786852240562, "learning_rate": 3.034201095682413e-07, "loss": 0.0, "num_input_tokens_seen": 106321168, "step": 157750 }, { "epoch": 3.853980895609899, "grad_norm": 0.0008571508224122226, "learning_rate": 3.0335892719412704e-07, "loss": 0.0, "num_input_tokens_seen": 106325072, "step": 157755 }, { "epoch": 3.8541030464417463, "grad_norm": 0.002870930125936866, "learning_rate": 3.032977498861737e-07, "loss": 0.0, "num_input_tokens_seen": 106328464, "step": 157760 }, { "epoch": 3.8542251972735935, "grad_norm": 0.013235303573310375, "learning_rate": 3.032365776448266e-07, "loss": 0.0, "num_input_tokens_seen": 106331472, "step": 157765 }, { "epoch": 3.8543473481054407, "grad_norm": 0.05652252584695816, "learning_rate": 3.0317541047053074e-07, "loss": 0.0003, "num_input_tokens_seen": 106335120, "step": 157770 }, { "epoch": 3.854469498937288, "grad_norm": 0.0021372237242758274, "learning_rate": 3.031142483637308e-07, "loss": 0.0, "num_input_tokens_seen": 106338320, "step": 157775 }, { "epoch": 3.854591649769135, "grad_norm": 0.00040053142583929, "learning_rate": 3.030530913248711e-07, "loss": 0.0, "num_input_tokens_seen": 106341264, "step": 157780 }, { "epoch": 3.8547138006009822, "grad_norm": 0.000844833324663341, "learning_rate": 3.0299193935439714e-07, "loss": 0.0, "num_input_tokens_seen": 106344080, "step": 157785 }, { "epoch": 3.8548359514328294, "grad_norm": 0.004650758113712072, "learning_rate": 3.0293079245275297e-07, "loss": 0.0, "num_input_tokens_seen": 106347536, "step": 157790 }, { "epoch": 3.8549581022646766, "grad_norm": 0.0012025663163512945, "learning_rate": 3.0286965062038383e-07, "loss": 0.0001, "num_input_tokens_seen": 106350608, "step": 157795 }, { "epoch": 3.8550802530965234, "grad_norm": 0.0009295342024415731, "learning_rate": 3.028085138577338e-07, "loss": 0.0, "num_input_tokens_seen": 106353616, "step": 157800 }, { "epoch": 3.855202403928371, "grad_norm": 0.005253209732472897, "learning_rate": 3.027473821652481e-07, "loss": 0.0, "num_input_tokens_seen": 106357328, "step": 157805 }, { "epoch": 3.8553245547602177, "grad_norm": 0.3880409002304077, "learning_rate": 3.0268625554337067e-07, "loss": 0.0001, "num_input_tokens_seen": 106360528, "step": 157810 }, { "epoch": 3.855446705592065, "grad_norm": 0.00103185314219445, "learning_rate": 3.026251339925466e-07, "loss": 0.0, "num_input_tokens_seen": 106364176, "step": 157815 }, { "epoch": 3.855568856423912, "grad_norm": 0.0018289852887392044, "learning_rate": 3.025640175132199e-07, "loss": 0.0, "num_input_tokens_seen": 106367696, "step": 157820 }, { "epoch": 3.8556910072557593, "grad_norm": 0.010241532698273659, "learning_rate": 3.025029061058352e-07, "loss": 0.0, "num_input_tokens_seen": 106370896, "step": 157825 }, { "epoch": 3.8558131580876065, "grad_norm": 0.14833614230155945, "learning_rate": 3.0244179977083727e-07, "loss": 0.0001, "num_input_tokens_seen": 106374480, "step": 157830 }, { "epoch": 3.8559353089194537, "grad_norm": 0.023858023807406425, "learning_rate": 3.023806985086699e-07, "loss": 0.0, "num_input_tokens_seen": 106377872, "step": 157835 }, { "epoch": 3.856057459751301, "grad_norm": 0.16358113288879395, "learning_rate": 3.0231960231977803e-07, "loss": 0.0, "num_input_tokens_seen": 106381520, "step": 157840 }, { "epoch": 3.856179610583148, "grad_norm": 0.00500052236020565, "learning_rate": 3.022585112046053e-07, "loss": 0.0001, "num_input_tokens_seen": 106384464, "step": 157845 }, { "epoch": 3.8563017614149953, "grad_norm": 0.0005464443238452077, "learning_rate": 3.021974251635965e-07, "loss": 0.0001, "num_input_tokens_seen": 106388112, "step": 157850 }, { "epoch": 3.8564239122468424, "grad_norm": 0.000844131747726351, "learning_rate": 3.021363441971959e-07, "loss": 0.0001, "num_input_tokens_seen": 106391312, "step": 157855 }, { "epoch": 3.8565460630786896, "grad_norm": 0.0025310174096375704, "learning_rate": 3.020752683058473e-07, "loss": 0.0, "num_input_tokens_seen": 106394832, "step": 157860 }, { "epoch": 3.856668213910537, "grad_norm": 0.0003100551257375628, "learning_rate": 3.0201419748999524e-07, "loss": 0.0, "num_input_tokens_seen": 106398160, "step": 157865 }, { "epoch": 3.856790364742384, "grad_norm": 0.0006610352429561317, "learning_rate": 3.019531317500834e-07, "loss": 0.0, "num_input_tokens_seen": 106401488, "step": 157870 }, { "epoch": 3.856912515574231, "grad_norm": 0.0011790769640356302, "learning_rate": 3.0189207108655656e-07, "loss": 0.0, "num_input_tokens_seen": 106404816, "step": 157875 }, { "epoch": 3.8570346664060784, "grad_norm": 0.0006918759318068624, "learning_rate": 3.018310154998579e-07, "loss": 0.0465, "num_input_tokens_seen": 106408080, "step": 157880 }, { "epoch": 3.857156817237925, "grad_norm": 0.005680972710251808, "learning_rate": 3.017699649904323e-07, "loss": 0.0, "num_input_tokens_seen": 106411664, "step": 157885 }, { "epoch": 3.8572789680697728, "grad_norm": 0.002036899561062455, "learning_rate": 3.017089195587232e-07, "loss": 0.0, "num_input_tokens_seen": 106414800, "step": 157890 }, { "epoch": 3.8574011189016195, "grad_norm": 0.00011878184886882082, "learning_rate": 3.0164787920517445e-07, "loss": 0.0001, "num_input_tokens_seen": 106418448, "step": 157895 }, { "epoch": 3.857523269733467, "grad_norm": 0.5946331024169922, "learning_rate": 3.0158684393023035e-07, "loss": 0.0002, "num_input_tokens_seen": 106421904, "step": 157900 }, { "epoch": 3.857645420565314, "grad_norm": 0.00022538656776305288, "learning_rate": 3.015258137343344e-07, "loss": 0.0, "num_input_tokens_seen": 106425104, "step": 157905 }, { "epoch": 3.857767571397161, "grad_norm": 0.0005389982834458351, "learning_rate": 3.0146478861793076e-07, "loss": 0.0, "num_input_tokens_seen": 106428752, "step": 157910 }, { "epoch": 3.8578897222290083, "grad_norm": 0.0007590112509205937, "learning_rate": 3.0140376858146286e-07, "loss": 0.0, "num_input_tokens_seen": 106432016, "step": 157915 }, { "epoch": 3.8580118730608555, "grad_norm": 0.005567790009081364, "learning_rate": 3.0134275362537465e-07, "loss": 0.0, "num_input_tokens_seen": 106435024, "step": 157920 }, { "epoch": 3.8581340238927027, "grad_norm": 0.006411381531506777, "learning_rate": 3.012817437501102e-07, "loss": 0.0006, "num_input_tokens_seen": 106438544, "step": 157925 }, { "epoch": 3.85825617472455, "grad_norm": 0.5736486315727234, "learning_rate": 3.0122073895611244e-07, "loss": 0.0468, "num_input_tokens_seen": 106441616, "step": 157930 }, { "epoch": 3.858378325556397, "grad_norm": 0.002640968654304743, "learning_rate": 3.011597392438258e-07, "loss": 0.0, "num_input_tokens_seen": 106444752, "step": 157935 }, { "epoch": 3.8585004763882442, "grad_norm": 0.00707990862429142, "learning_rate": 3.010987446136931e-07, "loss": 0.0835, "num_input_tokens_seen": 106448208, "step": 157940 }, { "epoch": 3.8586226272200914, "grad_norm": 0.001580284209921956, "learning_rate": 3.0103775506615837e-07, "loss": 0.0, "num_input_tokens_seen": 106452176, "step": 157945 }, { "epoch": 3.8587447780519386, "grad_norm": 0.0004957346827723086, "learning_rate": 3.0097677060166536e-07, "loss": 0.0, "num_input_tokens_seen": 106455952, "step": 157950 }, { "epoch": 3.858866928883786, "grad_norm": 0.0006093760603107512, "learning_rate": 3.00915791220657e-07, "loss": 0.0, "num_input_tokens_seen": 106459152, "step": 157955 }, { "epoch": 3.858989079715633, "grad_norm": 0.005039932671934366, "learning_rate": 3.008548169235774e-07, "loss": 0.0, "num_input_tokens_seen": 106462800, "step": 157960 }, { "epoch": 3.85911123054748, "grad_norm": 0.005193840246647596, "learning_rate": 3.0079384771086924e-07, "loss": 0.0, "num_input_tokens_seen": 106466320, "step": 157965 }, { "epoch": 3.859233381379327, "grad_norm": 0.006895958911627531, "learning_rate": 3.0073288358297656e-07, "loss": 0.0, "num_input_tokens_seen": 106469456, "step": 157970 }, { "epoch": 3.8593555322111746, "grad_norm": 0.01828017272055149, "learning_rate": 3.0067192454034217e-07, "loss": 0.0, "num_input_tokens_seen": 106472656, "step": 157975 }, { "epoch": 3.8594776830430213, "grad_norm": 0.004485220182687044, "learning_rate": 3.0061097058341e-07, "loss": 0.0, "num_input_tokens_seen": 106476240, "step": 157980 }, { "epoch": 3.859599833874869, "grad_norm": 0.00388152408413589, "learning_rate": 3.005500217126226e-07, "loss": 0.0003, "num_input_tokens_seen": 106479696, "step": 157985 }, { "epoch": 3.8597219847067157, "grad_norm": 0.0043313889764249325, "learning_rate": 3.004890779284239e-07, "loss": 0.0796, "num_input_tokens_seen": 106483216, "step": 157990 }, { "epoch": 3.859844135538563, "grad_norm": 0.0008505122968927026, "learning_rate": 3.0042813923125675e-07, "loss": 0.0003, "num_input_tokens_seen": 106486864, "step": 157995 }, { "epoch": 3.85996628637041, "grad_norm": 0.005160463973879814, "learning_rate": 3.0036720562156406e-07, "loss": 0.0, "num_input_tokens_seen": 106490512, "step": 158000 }, { "epoch": 3.8600884372022572, "grad_norm": 0.0028276483062654734, "learning_rate": 3.003062770997892e-07, "loss": 0.0, "num_input_tokens_seen": 106494032, "step": 158005 }, { "epoch": 3.8602105880341044, "grad_norm": 0.008698842488229275, "learning_rate": 3.002453536663756e-07, "loss": 0.0385, "num_input_tokens_seen": 106497552, "step": 158010 }, { "epoch": 3.8603327388659516, "grad_norm": 0.0025066733360290527, "learning_rate": 3.001844353217657e-07, "loss": 0.0, "num_input_tokens_seen": 106501136, "step": 158015 }, { "epoch": 3.860454889697799, "grad_norm": 0.00978843867778778, "learning_rate": 3.0012352206640313e-07, "loss": 0.0, "num_input_tokens_seen": 106504080, "step": 158020 }, { "epoch": 3.860577040529646, "grad_norm": 0.016681527718901634, "learning_rate": 3.000626139007302e-07, "loss": 0.0, "num_input_tokens_seen": 106507280, "step": 158025 }, { "epoch": 3.860699191361493, "grad_norm": 0.03341963514685631, "learning_rate": 3.0000171082519056e-07, "loss": 0.0, "num_input_tokens_seen": 106510288, "step": 158030 }, { "epoch": 3.8608213421933404, "grad_norm": 0.002291738986968994, "learning_rate": 2.999408128402264e-07, "loss": 0.0, "num_input_tokens_seen": 106513808, "step": 158035 }, { "epoch": 3.8609434930251876, "grad_norm": 0.0023505198769271374, "learning_rate": 2.9987991994628094e-07, "loss": 0.0, "num_input_tokens_seen": 106517008, "step": 158040 }, { "epoch": 3.8610656438570348, "grad_norm": 0.0010157792130485177, "learning_rate": 2.998190321437973e-07, "loss": 0.0, "num_input_tokens_seen": 106520144, "step": 158045 }, { "epoch": 3.861187794688882, "grad_norm": 0.0016791699454188347, "learning_rate": 2.9975814943321774e-07, "loss": 0.0436, "num_input_tokens_seen": 106523152, "step": 158050 }, { "epoch": 3.861309945520729, "grad_norm": 0.0008609518990851939, "learning_rate": 2.9969727181498563e-07, "loss": 0.0, "num_input_tokens_seen": 106526736, "step": 158055 }, { "epoch": 3.8614320963525763, "grad_norm": 0.004202486481517553, "learning_rate": 2.996363992895429e-07, "loss": 0.0, "num_input_tokens_seen": 106530256, "step": 158060 }, { "epoch": 3.861554247184423, "grad_norm": 0.0281804371625185, "learning_rate": 2.9957553185733295e-07, "loss": 0.0, "num_input_tokens_seen": 106533584, "step": 158065 }, { "epoch": 3.8616763980162707, "grad_norm": 0.03088965080678463, "learning_rate": 2.995146695187979e-07, "loss": 0.0, "num_input_tokens_seen": 106536976, "step": 158070 }, { "epoch": 3.8617985488481175, "grad_norm": 0.0011702068150043488, "learning_rate": 2.994538122743806e-07, "loss": 0.0003, "num_input_tokens_seen": 106540176, "step": 158075 }, { "epoch": 3.861920699679965, "grad_norm": 0.002321459585800767, "learning_rate": 2.993929601245239e-07, "loss": 0.0, "num_input_tokens_seen": 106543312, "step": 158080 }, { "epoch": 3.862042850511812, "grad_norm": 0.0071487524546682835, "learning_rate": 2.993321130696699e-07, "loss": 0.0, "num_input_tokens_seen": 106546576, "step": 158085 }, { "epoch": 3.862165001343659, "grad_norm": 0.011662092991173267, "learning_rate": 2.9927127111026094e-07, "loss": 0.0, "num_input_tokens_seen": 106549712, "step": 158090 }, { "epoch": 3.862287152175506, "grad_norm": 0.029429420828819275, "learning_rate": 2.992104342467402e-07, "loss": 0.0, "num_input_tokens_seen": 106552912, "step": 158095 }, { "epoch": 3.8624093030073534, "grad_norm": 0.005462816916406155, "learning_rate": 2.9914960247954936e-07, "loss": 0.0, "num_input_tokens_seen": 106556560, "step": 158100 }, { "epoch": 3.8625314538392006, "grad_norm": 0.016585996374487877, "learning_rate": 2.9908877580913126e-07, "loss": 0.0, "num_input_tokens_seen": 106559568, "step": 158105 }, { "epoch": 3.862653604671048, "grad_norm": 0.001426746603101492, "learning_rate": 2.99027954235928e-07, "loss": 0.0693, "num_input_tokens_seen": 106563088, "step": 158110 }, { "epoch": 3.862775755502895, "grad_norm": 0.0006431220099329948, "learning_rate": 2.989671377603822e-07, "loss": 0.0, "num_input_tokens_seen": 106566800, "step": 158115 }, { "epoch": 3.862897906334742, "grad_norm": 0.007045199163258076, "learning_rate": 2.989063263829357e-07, "loss": 0.0002, "num_input_tokens_seen": 106570256, "step": 158120 }, { "epoch": 3.8630200571665894, "grad_norm": 0.0010710848728194833, "learning_rate": 2.9884552010403106e-07, "loss": 0.0, "num_input_tokens_seen": 106573520, "step": 158125 }, { "epoch": 3.8631422079984366, "grad_norm": 0.008409908041357994, "learning_rate": 2.987847189241103e-07, "loss": 0.0, "num_input_tokens_seen": 106576848, "step": 158130 }, { "epoch": 3.8632643588302837, "grad_norm": 0.003941592760384083, "learning_rate": 2.987239228436156e-07, "loss": 0.0353, "num_input_tokens_seen": 106580048, "step": 158135 }, { "epoch": 3.863386509662131, "grad_norm": 0.0005290769040584564, "learning_rate": 2.9866313186298944e-07, "loss": 0.0775, "num_input_tokens_seen": 106583632, "step": 158140 }, { "epoch": 3.863508660493978, "grad_norm": 28.251291275024414, "learning_rate": 2.9860234598267333e-07, "loss": 0.0489, "num_input_tokens_seen": 106586960, "step": 158145 }, { "epoch": 3.863630811325825, "grad_norm": 0.01787254773080349, "learning_rate": 2.985415652031099e-07, "loss": 0.0, "num_input_tokens_seen": 106590096, "step": 158150 }, { "epoch": 3.8637529621576725, "grad_norm": 0.006238003261387348, "learning_rate": 2.9848078952474063e-07, "loss": 0.0, "num_input_tokens_seen": 106593232, "step": 158155 }, { "epoch": 3.8638751129895192, "grad_norm": 0.002003963105380535, "learning_rate": 2.984200189480077e-07, "loss": 0.0, "num_input_tokens_seen": 106596688, "step": 158160 }, { "epoch": 3.863997263821367, "grad_norm": 0.008238325826823711, "learning_rate": 2.983592534733533e-07, "loss": 0.0, "num_input_tokens_seen": 106600272, "step": 158165 }, { "epoch": 3.8641194146532136, "grad_norm": 0.007528550922870636, "learning_rate": 2.98298493101219e-07, "loss": 0.0258, "num_input_tokens_seen": 106603472, "step": 158170 }, { "epoch": 3.864241565485061, "grad_norm": 0.004292473196983337, "learning_rate": 2.982377378320471e-07, "loss": 0.0, "num_input_tokens_seen": 106606480, "step": 158175 }, { "epoch": 3.864363716316908, "grad_norm": 0.014148806221783161, "learning_rate": 2.981769876662786e-07, "loss": 0.0001, "num_input_tokens_seen": 106609936, "step": 158180 }, { "epoch": 3.864485867148755, "grad_norm": 0.002944579115137458, "learning_rate": 2.981162426043563e-07, "loss": 0.0, "num_input_tokens_seen": 106613264, "step": 158185 }, { "epoch": 3.8646080179806024, "grad_norm": 0.07395872473716736, "learning_rate": 2.980555026467212e-07, "loss": 0.0004, "num_input_tokens_seen": 106616272, "step": 158190 }, { "epoch": 3.8647301688124496, "grad_norm": 0.005129363853484392, "learning_rate": 2.9799476779381547e-07, "loss": 0.0, "num_input_tokens_seen": 106619728, "step": 158195 }, { "epoch": 3.8648523196442968, "grad_norm": 0.0018878680421039462, "learning_rate": 2.9793403804608066e-07, "loss": 0.0, "num_input_tokens_seen": 106623888, "step": 158200 }, { "epoch": 3.864974470476144, "grad_norm": 0.03187033161520958, "learning_rate": 2.9787331340395807e-07, "loss": 0.0, "num_input_tokens_seen": 106627664, "step": 158205 }, { "epoch": 3.865096621307991, "grad_norm": 0.0018504534382373095, "learning_rate": 2.9781259386788984e-07, "loss": 0.0, "num_input_tokens_seen": 106631248, "step": 158210 }, { "epoch": 3.8652187721398383, "grad_norm": 0.0012075145496055484, "learning_rate": 2.97751879438317e-07, "loss": 0.0, "num_input_tokens_seen": 106634704, "step": 158215 }, { "epoch": 3.8653409229716855, "grad_norm": 0.023051664233207703, "learning_rate": 2.976911701156818e-07, "loss": 0.001, "num_input_tokens_seen": 106638416, "step": 158220 }, { "epoch": 3.8654630738035327, "grad_norm": 0.0033154578413814306, "learning_rate": 2.9763046590042487e-07, "loss": 0.0, "num_input_tokens_seen": 106641616, "step": 158225 }, { "epoch": 3.86558522463538, "grad_norm": 0.001119441818445921, "learning_rate": 2.9756976679298805e-07, "loss": 0.0536, "num_input_tokens_seen": 106644688, "step": 158230 }, { "epoch": 3.865707375467227, "grad_norm": 0.0025297042448073626, "learning_rate": 2.9750907279381333e-07, "loss": 0.0, "num_input_tokens_seen": 106648464, "step": 158235 }, { "epoch": 3.8658295262990743, "grad_norm": 0.0012945241760462523, "learning_rate": 2.97448383903341e-07, "loss": 0.0, "num_input_tokens_seen": 106651792, "step": 158240 }, { "epoch": 3.865951677130921, "grad_norm": 0.08160996437072754, "learning_rate": 2.973877001220135e-07, "loss": 0.0002, "num_input_tokens_seen": 106655440, "step": 158245 }, { "epoch": 3.8660738279627687, "grad_norm": 0.0054352604784071445, "learning_rate": 2.9732702145027136e-07, "loss": 0.0, "num_input_tokens_seen": 106658576, "step": 158250 }, { "epoch": 3.8661959787946154, "grad_norm": 0.03615583851933479, "learning_rate": 2.97266347888556e-07, "loss": 0.0002, "num_input_tokens_seen": 106662288, "step": 158255 }, { "epoch": 3.8663181296264626, "grad_norm": 0.007702614646404982, "learning_rate": 2.9720567943730913e-07, "loss": 0.0447, "num_input_tokens_seen": 106665168, "step": 158260 }, { "epoch": 3.86644028045831, "grad_norm": 0.013397863134741783, "learning_rate": 2.971450160969712e-07, "loss": 0.0, "num_input_tokens_seen": 106668112, "step": 158265 }, { "epoch": 3.866562431290157, "grad_norm": 0.0030918209813535213, "learning_rate": 2.9708435786798414e-07, "loss": 0.0, "num_input_tokens_seen": 106672208, "step": 158270 }, { "epoch": 3.866684582122004, "grad_norm": 0.0047109597362577915, "learning_rate": 2.970237047507883e-07, "loss": 0.0, "num_input_tokens_seen": 106675408, "step": 158275 }, { "epoch": 3.8668067329538514, "grad_norm": 0.0012370293261483312, "learning_rate": 2.9696305674582553e-07, "loss": 0.0001, "num_input_tokens_seen": 106678992, "step": 158280 }, { "epoch": 3.8669288837856985, "grad_norm": 0.005313628353178501, "learning_rate": 2.969024138535362e-07, "loss": 0.0, "num_input_tokens_seen": 106682384, "step": 158285 }, { "epoch": 3.8670510346175457, "grad_norm": 0.000701016397215426, "learning_rate": 2.96841776074362e-07, "loss": 0.058, "num_input_tokens_seen": 106685520, "step": 158290 }, { "epoch": 3.867173185449393, "grad_norm": 0.0036516785621643066, "learning_rate": 2.9678114340874317e-07, "loss": 0.0, "num_input_tokens_seen": 106689040, "step": 158295 }, { "epoch": 3.86729533628124, "grad_norm": 0.2513877749443054, "learning_rate": 2.967205158571212e-07, "loss": 0.0204, "num_input_tokens_seen": 106692880, "step": 158300 }, { "epoch": 3.8674174871130873, "grad_norm": 0.00019543587404768914, "learning_rate": 2.966598934199369e-07, "loss": 0.0001, "num_input_tokens_seen": 106696528, "step": 158305 }, { "epoch": 3.8675396379449345, "grad_norm": 0.012953980825841427, "learning_rate": 2.965992760976308e-07, "loss": 0.0001, "num_input_tokens_seen": 106699728, "step": 158310 }, { "epoch": 3.8676617887767817, "grad_norm": 0.024555783718824387, "learning_rate": 2.9653866389064387e-07, "loss": 0.0, "num_input_tokens_seen": 106702800, "step": 158315 }, { "epoch": 3.867783939608629, "grad_norm": 0.06984036415815353, "learning_rate": 2.9647805679941726e-07, "loss": 0.0, "num_input_tokens_seen": 106705808, "step": 158320 }, { "epoch": 3.867906090440476, "grad_norm": 0.005320082418620586, "learning_rate": 2.9641745482439115e-07, "loss": 0.0004, "num_input_tokens_seen": 106709264, "step": 158325 }, { "epoch": 3.868028241272323, "grad_norm": 0.03669695928692818, "learning_rate": 2.9635685796600695e-07, "loss": 0.0, "num_input_tokens_seen": 106712336, "step": 158330 }, { "epoch": 3.8681503921041704, "grad_norm": 0.0015380196273326874, "learning_rate": 2.962962662247045e-07, "loss": 0.0, "num_input_tokens_seen": 106715600, "step": 158335 }, { "epoch": 3.868272542936017, "grad_norm": 0.0006915759877301753, "learning_rate": 2.962356796009253e-07, "loss": 0.0, "num_input_tokens_seen": 106719184, "step": 158340 }, { "epoch": 3.868394693767865, "grad_norm": 95.70181274414062, "learning_rate": 2.961750980951091e-07, "loss": 0.0771, "num_input_tokens_seen": 106722512, "step": 158345 }, { "epoch": 3.8685168445997116, "grad_norm": 49.37448501586914, "learning_rate": 2.9611452170769704e-07, "loss": 0.0329, "num_input_tokens_seen": 106725584, "step": 158350 }, { "epoch": 3.8686389954315588, "grad_norm": 0.009279985912144184, "learning_rate": 2.960539504391297e-07, "loss": 0.0001, "num_input_tokens_seen": 106728784, "step": 158355 }, { "epoch": 3.868761146263406, "grad_norm": 0.06280479580163956, "learning_rate": 2.959933842898471e-07, "loss": 0.0, "num_input_tokens_seen": 106731984, "step": 158360 }, { "epoch": 3.868883297095253, "grad_norm": 0.0013443909119814634, "learning_rate": 2.959328232602902e-07, "loss": 0.0, "num_input_tokens_seen": 106735312, "step": 158365 }, { "epoch": 3.8690054479271003, "grad_norm": 0.0017101116245612502, "learning_rate": 2.95872267350899e-07, "loss": 0.0, "num_input_tokens_seen": 106739024, "step": 158370 }, { "epoch": 3.8691275987589475, "grad_norm": 0.0016042448114603758, "learning_rate": 2.9581171656211423e-07, "loss": 0.0439, "num_input_tokens_seen": 106742224, "step": 158375 }, { "epoch": 3.8692497495907947, "grad_norm": 0.03807985782623291, "learning_rate": 2.9575117089437584e-07, "loss": 0.0, "num_input_tokens_seen": 106745360, "step": 158380 }, { "epoch": 3.869371900422642, "grad_norm": 0.0057759396731853485, "learning_rate": 2.956906303481244e-07, "loss": 0.0, "num_input_tokens_seen": 106748880, "step": 158385 }, { "epoch": 3.869494051254489, "grad_norm": 0.003695416497066617, "learning_rate": 2.956300949238003e-07, "loss": 0.0, "num_input_tokens_seen": 106752272, "step": 158390 }, { "epoch": 3.8696162020863363, "grad_norm": 0.0005830421578139067, "learning_rate": 2.955695646218437e-07, "loss": 0.0, "num_input_tokens_seen": 106755664, "step": 158395 }, { "epoch": 3.8697383529181835, "grad_norm": 0.0017493544146418571, "learning_rate": 2.9550903944269445e-07, "loss": 0.0489, "num_input_tokens_seen": 106758992, "step": 158400 }, { "epoch": 3.8698605037500307, "grad_norm": 0.04775483161211014, "learning_rate": 2.9544851938679314e-07, "loss": 0.0544, "num_input_tokens_seen": 106762448, "step": 158405 }, { "epoch": 3.869982654581878, "grad_norm": 0.09650695323944092, "learning_rate": 2.9538800445457946e-07, "loss": 0.0005, "num_input_tokens_seen": 106766352, "step": 158410 }, { "epoch": 3.8701048054137246, "grad_norm": 0.010348550975322723, "learning_rate": 2.95327494646494e-07, "loss": 0.0, "num_input_tokens_seen": 106769808, "step": 158415 }, { "epoch": 3.8702269562455722, "grad_norm": 0.00027812286862172186, "learning_rate": 2.9526698996297615e-07, "loss": 0.0536, "num_input_tokens_seen": 106773264, "step": 158420 }, { "epoch": 3.870349107077419, "grad_norm": 0.001886223559267819, "learning_rate": 2.952064904044668e-07, "loss": 0.0002, "num_input_tokens_seen": 106776336, "step": 158425 }, { "epoch": 3.8704712579092666, "grad_norm": 0.0028586441185325384, "learning_rate": 2.951459959714049e-07, "loss": 0.0, "num_input_tokens_seen": 106779280, "step": 158430 }, { "epoch": 3.8705934087411134, "grad_norm": 0.010470589622855186, "learning_rate": 2.9508550666423136e-07, "loss": 0.0, "num_input_tokens_seen": 106782288, "step": 158435 }, { "epoch": 3.8707155595729605, "grad_norm": 0.00042181566823273897, "learning_rate": 2.9502502248338525e-07, "loss": 0.0372, "num_input_tokens_seen": 106785680, "step": 158440 }, { "epoch": 3.8708377104048077, "grad_norm": 0.0008364489185623825, "learning_rate": 2.9496454342930674e-07, "loss": 0.0, "num_input_tokens_seen": 106788944, "step": 158445 }, { "epoch": 3.870959861236655, "grad_norm": 0.0018208818510174751, "learning_rate": 2.949040695024361e-07, "loss": 0.0, "num_input_tokens_seen": 106792464, "step": 158450 }, { "epoch": 3.871082012068502, "grad_norm": 3.7427899837493896, "learning_rate": 2.9484360070321236e-07, "loss": 0.0006, "num_input_tokens_seen": 106795920, "step": 158455 }, { "epoch": 3.8712041629003493, "grad_norm": 0.008290299214422703, "learning_rate": 2.94783137032076e-07, "loss": 0.0, "num_input_tokens_seen": 106798928, "step": 158460 }, { "epoch": 3.8713263137321965, "grad_norm": 0.0034195426851511, "learning_rate": 2.94722678489466e-07, "loss": 0.0427, "num_input_tokens_seen": 106802256, "step": 158465 }, { "epoch": 3.8714484645640437, "grad_norm": 0.00021721194207202643, "learning_rate": 2.946622250758226e-07, "loss": 0.0, "num_input_tokens_seen": 106805584, "step": 158470 }, { "epoch": 3.871570615395891, "grad_norm": 0.0020799501799046993, "learning_rate": 2.9460177679158505e-07, "loss": 0.0, "num_input_tokens_seen": 106809232, "step": 158475 }, { "epoch": 3.871692766227738, "grad_norm": 0.012277498841285706, "learning_rate": 2.9454133363719304e-07, "loss": 0.0, "num_input_tokens_seen": 106812112, "step": 158480 }, { "epoch": 3.8718149170595852, "grad_norm": 0.0012988023227080703, "learning_rate": 2.944808956130864e-07, "loss": 0.0489, "num_input_tokens_seen": 106815952, "step": 158485 }, { "epoch": 3.8719370678914324, "grad_norm": 0.005164467729628086, "learning_rate": 2.944204627197042e-07, "loss": 0.0003, "num_input_tokens_seen": 106819088, "step": 158490 }, { "epoch": 3.8720592187232796, "grad_norm": 0.003667938755825162, "learning_rate": 2.9436003495748664e-07, "loss": 0.0, "num_input_tokens_seen": 106821968, "step": 158495 }, { "epoch": 3.872181369555127, "grad_norm": 0.0016687294701114297, "learning_rate": 2.942996123268722e-07, "loss": 0.0, "num_input_tokens_seen": 106825616, "step": 158500 }, { "epoch": 3.872303520386974, "grad_norm": 0.0008459803066216409, "learning_rate": 2.942391948283012e-07, "loss": 0.0, "num_input_tokens_seen": 106829072, "step": 158505 }, { "epoch": 3.8724256712188208, "grad_norm": 0.0023603031877428293, "learning_rate": 2.941787824622125e-07, "loss": 0.0, "num_input_tokens_seen": 106832336, "step": 158510 }, { "epoch": 3.8725478220506684, "grad_norm": 0.015080592595040798, "learning_rate": 2.9411837522904536e-07, "loss": 0.0, "num_input_tokens_seen": 106835920, "step": 158515 }, { "epoch": 3.872669972882515, "grad_norm": 0.009865859523415565, "learning_rate": 2.940579731292395e-07, "loss": 0.0, "num_input_tokens_seen": 106839504, "step": 158520 }, { "epoch": 3.8727921237143628, "grad_norm": 248.03933715820312, "learning_rate": 2.9399757616323363e-07, "loss": 0.0327, "num_input_tokens_seen": 106842832, "step": 158525 }, { "epoch": 3.8729142745462095, "grad_norm": 0.005308913998305798, "learning_rate": 2.9393718433146766e-07, "loss": 0.0, "num_input_tokens_seen": 106846160, "step": 158530 }, { "epoch": 3.8730364253780567, "grad_norm": 0.002870805561542511, "learning_rate": 2.938767976343799e-07, "loss": 0.0, "num_input_tokens_seen": 106849296, "step": 158535 }, { "epoch": 3.873158576209904, "grad_norm": 0.0016712587093934417, "learning_rate": 2.9381641607241014e-07, "loss": 0.0001, "num_input_tokens_seen": 106852944, "step": 158540 }, { "epoch": 3.873280727041751, "grad_norm": 0.006070550996810198, "learning_rate": 2.937560396459976e-07, "loss": 0.0, "num_input_tokens_seen": 106856720, "step": 158545 }, { "epoch": 3.8734028778735983, "grad_norm": 0.00014672131510451436, "learning_rate": 2.936956683555808e-07, "loss": 0.0004, "num_input_tokens_seen": 106859728, "step": 158550 }, { "epoch": 3.8735250287054455, "grad_norm": 0.001376955071464181, "learning_rate": 2.936353022015994e-07, "loss": 0.0, "num_input_tokens_seen": 106863376, "step": 158555 }, { "epoch": 3.8736471795372927, "grad_norm": 0.0015214919112622738, "learning_rate": 2.935749411844918e-07, "loss": 0.0, "num_input_tokens_seen": 106866576, "step": 158560 }, { "epoch": 3.87376933036914, "grad_norm": 0.0040117776952683926, "learning_rate": 2.9351458530469707e-07, "loss": 0.0, "num_input_tokens_seen": 106869904, "step": 158565 }, { "epoch": 3.873891481200987, "grad_norm": 0.0029242250602692366, "learning_rate": 2.9345423456265474e-07, "loss": 0.0325, "num_input_tokens_seen": 106873232, "step": 158570 }, { "epoch": 3.874013632032834, "grad_norm": 0.0031731261406093836, "learning_rate": 2.933938889588029e-07, "loss": 0.0, "num_input_tokens_seen": 106876688, "step": 158575 }, { "epoch": 3.8741357828646814, "grad_norm": 0.0031085580121725798, "learning_rate": 2.933335484935812e-07, "loss": 0.0, "num_input_tokens_seen": 106879952, "step": 158580 }, { "epoch": 3.8742579336965286, "grad_norm": 0.0030465181916952133, "learning_rate": 2.932732131674275e-07, "loss": 0.0953, "num_input_tokens_seen": 106883600, "step": 158585 }, { "epoch": 3.874380084528376, "grad_norm": 0.0008837351924739778, "learning_rate": 2.932128829807815e-07, "loss": 0.0, "num_input_tokens_seen": 106886864, "step": 158590 }, { "epoch": 3.8745022353602225, "grad_norm": 0.01298034843057394, "learning_rate": 2.931525579340811e-07, "loss": 0.0524, "num_input_tokens_seen": 106890320, "step": 158595 }, { "epoch": 3.87462438619207, "grad_norm": 0.0018864242592826486, "learning_rate": 2.9309223802776585e-07, "loss": 0.0, "num_input_tokens_seen": 106894096, "step": 158600 }, { "epoch": 3.874746537023917, "grad_norm": 0.0011013778857886791, "learning_rate": 2.9303192326227365e-07, "loss": 0.0, "num_input_tokens_seen": 106898000, "step": 158605 }, { "epoch": 3.8748686878557645, "grad_norm": 0.008215600624680519, "learning_rate": 2.929716136380438e-07, "loss": 0.0004, "num_input_tokens_seen": 106901008, "step": 158610 }, { "epoch": 3.8749908386876113, "grad_norm": 0.0011877411743625998, "learning_rate": 2.9291130915551443e-07, "loss": 0.0, "num_input_tokens_seen": 106904144, "step": 158615 }, { "epoch": 3.8751129895194585, "grad_norm": 0.004192714113742113, "learning_rate": 2.928510098151239e-07, "loss": 0.0, "num_input_tokens_seen": 106907536, "step": 158620 }, { "epoch": 3.8752351403513057, "grad_norm": 0.0015647343825548887, "learning_rate": 2.927907156173114e-07, "loss": 0.0001, "num_input_tokens_seen": 106911184, "step": 158625 }, { "epoch": 3.875357291183153, "grad_norm": 0.005396246910095215, "learning_rate": 2.927304265625148e-07, "loss": 0.003, "num_input_tokens_seen": 106914320, "step": 158630 }, { "epoch": 3.875479442015, "grad_norm": 0.005837199278175831, "learning_rate": 2.9267014265117264e-07, "loss": 0.0, "num_input_tokens_seen": 106917776, "step": 158635 }, { "epoch": 3.8756015928468472, "grad_norm": 0.003539950354024768, "learning_rate": 2.9260986388372377e-07, "loss": 0.0, "num_input_tokens_seen": 106921680, "step": 158640 }, { "epoch": 3.8757237436786944, "grad_norm": 0.01194003690034151, "learning_rate": 2.92549590260606e-07, "loss": 0.0001, "num_input_tokens_seen": 106924816, "step": 158645 }, { "epoch": 3.8758458945105416, "grad_norm": 0.0017153606750071049, "learning_rate": 2.9248932178225813e-07, "loss": 0.0, "num_input_tokens_seen": 106927696, "step": 158650 }, { "epoch": 3.875968045342389, "grad_norm": 0.00042412380571477115, "learning_rate": 2.9242905844911794e-07, "loss": 0.0002, "num_input_tokens_seen": 106931024, "step": 158655 }, { "epoch": 3.876090196174236, "grad_norm": 0.03366420045495033, "learning_rate": 2.923688002616239e-07, "loss": 0.0001, "num_input_tokens_seen": 106934160, "step": 158660 }, { "epoch": 3.876212347006083, "grad_norm": 0.004773721564561129, "learning_rate": 2.9230854722021456e-07, "loss": 0.0002, "num_input_tokens_seen": 106937360, "step": 158665 }, { "epoch": 3.8763344978379304, "grad_norm": 0.0011197905987501144, "learning_rate": 2.922482993253277e-07, "loss": 0.0001, "num_input_tokens_seen": 106940624, "step": 158670 }, { "epoch": 3.8764566486697776, "grad_norm": 0.006437378935515881, "learning_rate": 2.921880565774016e-07, "loss": 0.0, "num_input_tokens_seen": 106944080, "step": 158675 }, { "epoch": 3.8765787995016248, "grad_norm": 0.0029959857929497957, "learning_rate": 2.9212781897687424e-07, "loss": 0.0, "num_input_tokens_seen": 106947408, "step": 158680 }, { "epoch": 3.876700950333472, "grad_norm": 0.03182037174701691, "learning_rate": 2.920675865241841e-07, "loss": 0.0, "num_input_tokens_seen": 106951312, "step": 158685 }, { "epoch": 3.8768231011653187, "grad_norm": 0.015112731605768204, "learning_rate": 2.920073592197684e-07, "loss": 0.0002, "num_input_tokens_seen": 106954576, "step": 158690 }, { "epoch": 3.8769452519971663, "grad_norm": 0.0007206939626485109, "learning_rate": 2.919471370640657e-07, "loss": 0.0002, "num_input_tokens_seen": 106957776, "step": 158695 }, { "epoch": 3.877067402829013, "grad_norm": 0.0029561310075223446, "learning_rate": 2.918869200575141e-07, "loss": 0.0444, "num_input_tokens_seen": 106961488, "step": 158700 }, { "epoch": 3.8771895536608603, "grad_norm": 0.0027475692331790924, "learning_rate": 2.918267082005513e-07, "loss": 0.0, "num_input_tokens_seen": 106964944, "step": 158705 }, { "epoch": 3.8773117044927075, "grad_norm": 0.0017425743862986565, "learning_rate": 2.9176650149361495e-07, "loss": 0.0, "num_input_tokens_seen": 106968208, "step": 158710 }, { "epoch": 3.8774338553245546, "grad_norm": 0.008587181568145752, "learning_rate": 2.9170629993714336e-07, "loss": 0.0, "num_input_tokens_seen": 106971216, "step": 158715 }, { "epoch": 3.877556006156402, "grad_norm": 0.21515829861164093, "learning_rate": 2.9164610353157373e-07, "loss": 0.0001, "num_input_tokens_seen": 106974672, "step": 158720 }, { "epoch": 3.877678156988249, "grad_norm": 0.043689239770174026, "learning_rate": 2.915859122773444e-07, "loss": 0.0, "num_input_tokens_seen": 106977744, "step": 158725 }, { "epoch": 3.877800307820096, "grad_norm": 0.010470571927726269, "learning_rate": 2.915257261748927e-07, "loss": 0.017, "num_input_tokens_seen": 106981392, "step": 158730 }, { "epoch": 3.8779224586519434, "grad_norm": 0.002375473501160741, "learning_rate": 2.9146554522465674e-07, "loss": 0.0, "num_input_tokens_seen": 106984784, "step": 158735 }, { "epoch": 3.8780446094837906, "grad_norm": 0.12363884598016739, "learning_rate": 2.914053694270735e-07, "loss": 0.0002, "num_input_tokens_seen": 106988432, "step": 158740 }, { "epoch": 3.878166760315638, "grad_norm": 0.09494874626398087, "learning_rate": 2.9134519878258133e-07, "loss": 0.0001, "num_input_tokens_seen": 106992080, "step": 158745 }, { "epoch": 3.878288911147485, "grad_norm": 0.005724775139242411, "learning_rate": 2.9128503329161724e-07, "loss": 0.0, "num_input_tokens_seen": 106995792, "step": 158750 }, { "epoch": 3.878411061979332, "grad_norm": 0.018899552524089813, "learning_rate": 2.912248729546191e-07, "loss": 0.0, "num_input_tokens_seen": 106999184, "step": 158755 }, { "epoch": 3.8785332128111794, "grad_norm": 0.35931485891342163, "learning_rate": 2.9116471777202445e-07, "loss": 0.0623, "num_input_tokens_seen": 107002128, "step": 158760 }, { "epoch": 3.8786553636430265, "grad_norm": 0.3836727738380432, "learning_rate": 2.911045677442704e-07, "loss": 0.0002, "num_input_tokens_seen": 107005840, "step": 158765 }, { "epoch": 3.8787775144748737, "grad_norm": 0.08484736829996109, "learning_rate": 2.910444228717949e-07, "loss": 0.0, "num_input_tokens_seen": 107008784, "step": 158770 }, { "epoch": 3.8788996653067205, "grad_norm": 0.02943931147456169, "learning_rate": 2.9098428315503466e-07, "loss": 0.0001, "num_input_tokens_seen": 107011728, "step": 158775 }, { "epoch": 3.879021816138568, "grad_norm": 0.008347810246050358, "learning_rate": 2.9092414859442784e-07, "loss": 0.0, "num_input_tokens_seen": 107015056, "step": 158780 }, { "epoch": 3.879143966970415, "grad_norm": 0.003075164509937167, "learning_rate": 2.908640191904109e-07, "loss": 0.0001, "num_input_tokens_seen": 107018128, "step": 158785 }, { "epoch": 3.8792661178022625, "grad_norm": 0.0002224208292318508, "learning_rate": 2.908038949434216e-07, "loss": 0.0, "num_input_tokens_seen": 107021200, "step": 158790 }, { "epoch": 3.8793882686341092, "grad_norm": 45.61159896850586, "learning_rate": 2.907437758538975e-07, "loss": 0.07, "num_input_tokens_seen": 107024400, "step": 158795 }, { "epoch": 3.8795104194659564, "grad_norm": 0.00022618890216108412, "learning_rate": 2.906836619222751e-07, "loss": 0.0001, "num_input_tokens_seen": 107028048, "step": 158800 }, { "epoch": 3.8796325702978036, "grad_norm": 0.0012848754413425922, "learning_rate": 2.906235531489921e-07, "loss": 0.0, "num_input_tokens_seen": 107031184, "step": 158805 }, { "epoch": 3.879754721129651, "grad_norm": 0.0028317791875451803, "learning_rate": 2.905634495344853e-07, "loss": 0.0, "num_input_tokens_seen": 107034640, "step": 158810 }, { "epoch": 3.879876871961498, "grad_norm": 0.005218177102506161, "learning_rate": 2.905033510791921e-07, "loss": 0.0, "num_input_tokens_seen": 107037776, "step": 158815 }, { "epoch": 3.879999022793345, "grad_norm": 0.0034669225569814444, "learning_rate": 2.9044325778354937e-07, "loss": 0.0, "num_input_tokens_seen": 107040976, "step": 158820 }, { "epoch": 3.8801211736251924, "grad_norm": 0.018067175522446632, "learning_rate": 2.903831696479938e-07, "loss": 0.0, "num_input_tokens_seen": 107044496, "step": 158825 }, { "epoch": 3.8802433244570396, "grad_norm": 0.0030394871719181538, "learning_rate": 2.90323086672963e-07, "loss": 0.0, "num_input_tokens_seen": 107047696, "step": 158830 }, { "epoch": 3.8803654752888868, "grad_norm": 0.00258993124589324, "learning_rate": 2.9026300885889333e-07, "loss": 0.0, "num_input_tokens_seen": 107050704, "step": 158835 }, { "epoch": 3.880487626120734, "grad_norm": 0.0005346864345483482, "learning_rate": 2.9020293620622214e-07, "loss": 0.0631, "num_input_tokens_seen": 107053968, "step": 158840 }, { "epoch": 3.880609776952581, "grad_norm": 0.0007554969051852822, "learning_rate": 2.90142868715386e-07, "loss": 0.0, "num_input_tokens_seen": 107057104, "step": 158845 }, { "epoch": 3.8807319277844283, "grad_norm": 0.0022322291042655706, "learning_rate": 2.900828063868216e-07, "loss": 0.0325, "num_input_tokens_seen": 107060368, "step": 158850 }, { "epoch": 3.8808540786162755, "grad_norm": 6.525689968839288e-05, "learning_rate": 2.9002274922096646e-07, "loss": 0.0, "num_input_tokens_seen": 107063888, "step": 158855 }, { "epoch": 3.8809762294481227, "grad_norm": 0.00232454901561141, "learning_rate": 2.899626972182565e-07, "loss": 0.0, "num_input_tokens_seen": 107067344, "step": 158860 }, { "epoch": 3.88109838027997, "grad_norm": 0.01140713132917881, "learning_rate": 2.899026503791291e-07, "loss": 0.0, "num_input_tokens_seen": 107070608, "step": 158865 }, { "epoch": 3.8812205311118166, "grad_norm": 0.00026834936579689384, "learning_rate": 2.898426087040203e-07, "loss": 0.0, "num_input_tokens_seen": 107073872, "step": 158870 }, { "epoch": 3.8813426819436643, "grad_norm": 0.0015741335228085518, "learning_rate": 2.89782572193367e-07, "loss": 0.0, "num_input_tokens_seen": 107077456, "step": 158875 }, { "epoch": 3.881464832775511, "grad_norm": 0.02891607955098152, "learning_rate": 2.8972254084760626e-07, "loss": 0.0, "num_input_tokens_seen": 107080784, "step": 158880 }, { "epoch": 3.881586983607358, "grad_norm": 0.012207439169287682, "learning_rate": 2.8966251466717395e-07, "loss": 0.0366, "num_input_tokens_seen": 107083728, "step": 158885 }, { "epoch": 3.8817091344392054, "grad_norm": 0.6531633734703064, "learning_rate": 2.896024936525071e-07, "loss": 0.0002, "num_input_tokens_seen": 107086800, "step": 158890 }, { "epoch": 3.8818312852710526, "grad_norm": 0.007391483057290316, "learning_rate": 2.895424778040417e-07, "loss": 0.0001, "num_input_tokens_seen": 107091344, "step": 158895 }, { "epoch": 3.8819534361029, "grad_norm": 0.000366843567462638, "learning_rate": 2.894824671222149e-07, "loss": 0.0, "num_input_tokens_seen": 107094864, "step": 158900 }, { "epoch": 3.882075586934747, "grad_norm": 0.0009734971099533141, "learning_rate": 2.894224616074623e-07, "loss": 0.0, "num_input_tokens_seen": 107098320, "step": 158905 }, { "epoch": 3.882197737766594, "grad_norm": 0.019451703876256943, "learning_rate": 2.89362461260221e-07, "loss": 0.0001, "num_input_tokens_seen": 107101264, "step": 158910 }, { "epoch": 3.8823198885984413, "grad_norm": 0.012309051118791103, "learning_rate": 2.893024660809268e-07, "loss": 0.0, "num_input_tokens_seen": 107104720, "step": 158915 }, { "epoch": 3.8824420394302885, "grad_norm": 0.0053732809610664845, "learning_rate": 2.892424760700164e-07, "loss": 0.0, "num_input_tokens_seen": 107108112, "step": 158920 }, { "epoch": 3.8825641902621357, "grad_norm": 0.0009634348680265248, "learning_rate": 2.89182491227926e-07, "loss": 0.0, "num_input_tokens_seen": 107111312, "step": 158925 }, { "epoch": 3.882686341093983, "grad_norm": 0.029144437983632088, "learning_rate": 2.891225115550914e-07, "loss": 0.0, "num_input_tokens_seen": 107115280, "step": 158930 }, { "epoch": 3.88280849192583, "grad_norm": 41.72419357299805, "learning_rate": 2.890625370519493e-07, "loss": 0.0838, "num_input_tokens_seen": 107118608, "step": 158935 }, { "epoch": 3.8829306427576773, "grad_norm": 0.0015868102200329304, "learning_rate": 2.8900256771893536e-07, "loss": 0.0, "num_input_tokens_seen": 107121616, "step": 158940 }, { "epoch": 3.8830527935895245, "grad_norm": 0.006282476708292961, "learning_rate": 2.8894260355648605e-07, "loss": 0.0378, "num_input_tokens_seen": 107124624, "step": 158945 }, { "epoch": 3.8831749444213717, "grad_norm": 0.0006471476517617702, "learning_rate": 2.888826445650376e-07, "loss": 0.0, "num_input_tokens_seen": 107127760, "step": 158950 }, { "epoch": 3.8832970952532184, "grad_norm": 0.005045460537075996, "learning_rate": 2.8882269074502565e-07, "loss": 0.0, "num_input_tokens_seen": 107130960, "step": 158955 }, { "epoch": 3.883419246085066, "grad_norm": 0.002895316807553172, "learning_rate": 2.887627420968867e-07, "loss": 0.0, "num_input_tokens_seen": 107134096, "step": 158960 }, { "epoch": 3.883541396916913, "grad_norm": 0.30166134238243103, "learning_rate": 2.8870279862105596e-07, "loss": 0.0, "num_input_tokens_seen": 107137360, "step": 158965 }, { "epoch": 3.8836635477487604, "grad_norm": 0.16179952025413513, "learning_rate": 2.886428603179698e-07, "loss": 0.0, "num_input_tokens_seen": 107140496, "step": 158970 }, { "epoch": 3.883785698580607, "grad_norm": 0.0012596636079251766, "learning_rate": 2.8858292718806443e-07, "loss": 0.0, "num_input_tokens_seen": 107143952, "step": 158975 }, { "epoch": 3.8839078494124544, "grad_norm": 0.03619730472564697, "learning_rate": 2.88522999231775e-07, "loss": 0.0, "num_input_tokens_seen": 107147664, "step": 158980 }, { "epoch": 3.8840300002443016, "grad_norm": 0.001989589538425207, "learning_rate": 2.8846307644953803e-07, "loss": 0.0, "num_input_tokens_seen": 107151376, "step": 158985 }, { "epoch": 3.8841521510761488, "grad_norm": 0.0026612537913024426, "learning_rate": 2.884031588417887e-07, "loss": 0.0, "num_input_tokens_seen": 107154832, "step": 158990 }, { "epoch": 3.884274301907996, "grad_norm": 0.0006597295869141817, "learning_rate": 2.8834324640896325e-07, "loss": 0.0, "num_input_tokens_seen": 107157904, "step": 158995 }, { "epoch": 3.884396452739843, "grad_norm": 0.0057218982838094234, "learning_rate": 2.8828333915149674e-07, "loss": 0.0, "num_input_tokens_seen": 107161296, "step": 159000 }, { "epoch": 3.8845186035716903, "grad_norm": 0.00010775305418064818, "learning_rate": 2.882234370698253e-07, "loss": 0.0, "num_input_tokens_seen": 107164624, "step": 159005 }, { "epoch": 3.8846407544035375, "grad_norm": 0.0007086143596097827, "learning_rate": 2.8816354016438483e-07, "loss": 0.0, "num_input_tokens_seen": 107168528, "step": 159010 }, { "epoch": 3.8847629052353847, "grad_norm": 0.007450388744473457, "learning_rate": 2.881036484356104e-07, "loss": 0.0, "num_input_tokens_seen": 107171856, "step": 159015 }, { "epoch": 3.884885056067232, "grad_norm": 0.220229834318161, "learning_rate": 2.8804376188393755e-07, "loss": 0.0001, "num_input_tokens_seen": 107174992, "step": 159020 }, { "epoch": 3.885007206899079, "grad_norm": 0.000642230617813766, "learning_rate": 2.8798388050980225e-07, "loss": 0.0, "num_input_tokens_seen": 107178384, "step": 159025 }, { "epoch": 3.8851293577309263, "grad_norm": 0.03365645557641983, "learning_rate": 2.8792400431363927e-07, "loss": 0.0001, "num_input_tokens_seen": 107181776, "step": 159030 }, { "epoch": 3.8852515085627735, "grad_norm": 0.0017283897614106536, "learning_rate": 2.8786413329588486e-07, "loss": 0.0, "num_input_tokens_seen": 107184976, "step": 159035 }, { "epoch": 3.88537365939462, "grad_norm": 0.00508853467181325, "learning_rate": 2.8780426745697374e-07, "loss": 0.0, "num_input_tokens_seen": 107188816, "step": 159040 }, { "epoch": 3.885495810226468, "grad_norm": 0.003997340332716703, "learning_rate": 2.877444067973418e-07, "loss": 0.0, "num_input_tokens_seen": 107192208, "step": 159045 }, { "epoch": 3.8856179610583146, "grad_norm": 28.383333206176758, "learning_rate": 2.8768455131742373e-07, "loss": 0.0566, "num_input_tokens_seen": 107195728, "step": 159050 }, { "epoch": 3.885740111890162, "grad_norm": 0.01382320187985897, "learning_rate": 2.876247010176556e-07, "loss": 0.0353, "num_input_tokens_seen": 107199120, "step": 159055 }, { "epoch": 3.885862262722009, "grad_norm": 0.002845111768692732, "learning_rate": 2.875648558984719e-07, "loss": 0.0, "num_input_tokens_seen": 107202256, "step": 159060 }, { "epoch": 3.885984413553856, "grad_norm": 0.001575396629050374, "learning_rate": 2.875050159603082e-07, "loss": 0.0, "num_input_tokens_seen": 107205520, "step": 159065 }, { "epoch": 3.8861065643857033, "grad_norm": 90.33045959472656, "learning_rate": 2.8744518120359997e-07, "loss": 0.0253, "num_input_tokens_seen": 107208848, "step": 159070 }, { "epoch": 3.8862287152175505, "grad_norm": 0.0005721577326767147, "learning_rate": 2.8738535162878173e-07, "loss": 0.0, "num_input_tokens_seen": 107211984, "step": 159075 }, { "epoch": 3.8863508660493977, "grad_norm": 0.0007707910262979567, "learning_rate": 2.873255272362891e-07, "loss": 0.0002, "num_input_tokens_seen": 107215504, "step": 159080 }, { "epoch": 3.886473016881245, "grad_norm": 0.0014013528125360608, "learning_rate": 2.872657080265567e-07, "loss": 0.0001, "num_input_tokens_seen": 107218576, "step": 159085 }, { "epoch": 3.886595167713092, "grad_norm": 0.0005075766239315271, "learning_rate": 2.8720589400002014e-07, "loss": 0.0005, "num_input_tokens_seen": 107221584, "step": 159090 }, { "epoch": 3.8867173185449393, "grad_norm": 0.0056504616513848305, "learning_rate": 2.871460851571137e-07, "loss": 0.0, "num_input_tokens_seen": 107225104, "step": 159095 }, { "epoch": 3.8868394693767865, "grad_norm": 0.0012615727027878165, "learning_rate": 2.870862814982726e-07, "loss": 0.0, "num_input_tokens_seen": 107228944, "step": 159100 }, { "epoch": 3.8869616202086337, "grad_norm": 0.002773595042526722, "learning_rate": 2.8702648302393217e-07, "loss": 0.0, "num_input_tokens_seen": 107232272, "step": 159105 }, { "epoch": 3.887083771040481, "grad_norm": 0.013990364968776703, "learning_rate": 2.869666897345265e-07, "loss": 0.0, "num_input_tokens_seen": 107235920, "step": 159110 }, { "epoch": 3.887205921872328, "grad_norm": 0.002354390686377883, "learning_rate": 2.8690690163049136e-07, "loss": 0.0, "num_input_tokens_seen": 107238992, "step": 159115 }, { "epoch": 3.8873280727041752, "grad_norm": 0.0010478426702320576, "learning_rate": 2.868471187122606e-07, "loss": 0.0, "num_input_tokens_seen": 107242832, "step": 159120 }, { "epoch": 3.8874502235360224, "grad_norm": 0.0021144491620361805, "learning_rate": 2.8678734098026967e-07, "loss": 0.0001, "num_input_tokens_seen": 107245904, "step": 159125 }, { "epoch": 3.8875723743678696, "grad_norm": 0.0012400305131450295, "learning_rate": 2.8672756843495316e-07, "loss": 0.0, "num_input_tokens_seen": 107249104, "step": 159130 }, { "epoch": 3.8876945251997164, "grad_norm": 0.0015789041062816978, "learning_rate": 2.8666780107674524e-07, "loss": 0.0002, "num_input_tokens_seen": 107252112, "step": 159135 }, { "epoch": 3.887816676031564, "grad_norm": 0.0018158305902034044, "learning_rate": 2.8660803890608123e-07, "loss": 0.0, "num_input_tokens_seen": 107255440, "step": 159140 }, { "epoch": 3.8879388268634107, "grad_norm": 0.008747609332203865, "learning_rate": 2.865482819233951e-07, "loss": 0.0001, "num_input_tokens_seen": 107258448, "step": 159145 }, { "epoch": 3.8880609776952584, "grad_norm": 0.0015083423350006342, "learning_rate": 2.864885301291221e-07, "loss": 0.0534, "num_input_tokens_seen": 107261520, "step": 159150 }, { "epoch": 3.888183128527105, "grad_norm": 0.00502125546336174, "learning_rate": 2.8642878352369616e-07, "loss": 0.0, "num_input_tokens_seen": 107264784, "step": 159155 }, { "epoch": 3.8883052793589523, "grad_norm": 0.002522894414141774, "learning_rate": 2.8636904210755196e-07, "loss": 0.1633, "num_input_tokens_seen": 107267984, "step": 159160 }, { "epoch": 3.8884274301907995, "grad_norm": 0.0046274131163954735, "learning_rate": 2.8630930588112443e-07, "loss": 0.0, "num_input_tokens_seen": 107271312, "step": 159165 }, { "epoch": 3.8885495810226467, "grad_norm": 0.884371817111969, "learning_rate": 2.8624957484484723e-07, "loss": 0.0002, "num_input_tokens_seen": 107274512, "step": 159170 }, { "epoch": 3.888671731854494, "grad_norm": 0.0001581492106197402, "learning_rate": 2.8618984899915533e-07, "loss": 0.0, "num_input_tokens_seen": 107278288, "step": 159175 }, { "epoch": 3.888793882686341, "grad_norm": 0.0011975031811743975, "learning_rate": 2.861301283444827e-07, "loss": 0.0, "num_input_tokens_seen": 107281808, "step": 159180 }, { "epoch": 3.8889160335181883, "grad_norm": 0.002455994253978133, "learning_rate": 2.8607041288126396e-07, "loss": 0.0, "num_input_tokens_seen": 107285200, "step": 159185 }, { "epoch": 3.8890381843500355, "grad_norm": 0.04356267303228378, "learning_rate": 2.8601070260993287e-07, "loss": 0.0, "num_input_tokens_seen": 107288272, "step": 159190 }, { "epoch": 3.8891603351818826, "grad_norm": 0.01476934365928173, "learning_rate": 2.859509975309241e-07, "loss": 0.0, "num_input_tokens_seen": 107291472, "step": 159195 }, { "epoch": 3.88928248601373, "grad_norm": 0.0009058124851435423, "learning_rate": 2.8589129764467203e-07, "loss": 0.0, "num_input_tokens_seen": 107294736, "step": 159200 }, { "epoch": 3.889404636845577, "grad_norm": 0.0037373830564320087, "learning_rate": 2.858316029516101e-07, "loss": 0.0245, "num_input_tokens_seen": 107298128, "step": 159205 }, { "epoch": 3.889526787677424, "grad_norm": 0.11109024286270142, "learning_rate": 2.8577191345217324e-07, "loss": 0.0, "num_input_tokens_seen": 107302096, "step": 159210 }, { "epoch": 3.8896489385092714, "grad_norm": 0.017915405333042145, "learning_rate": 2.857122291467948e-07, "loss": 0.0, "num_input_tokens_seen": 107305296, "step": 159215 }, { "epoch": 3.889771089341118, "grad_norm": 0.0032476552296429873, "learning_rate": 2.856525500359095e-07, "loss": 0.0001, "num_input_tokens_seen": 107309136, "step": 159220 }, { "epoch": 3.889893240172966, "grad_norm": 0.09382815659046173, "learning_rate": 2.855928761199505e-07, "loss": 0.0, "num_input_tokens_seen": 107312272, "step": 159225 }, { "epoch": 3.8900153910048125, "grad_norm": 0.025482621043920517, "learning_rate": 2.855332073993528e-07, "loss": 0.0, "num_input_tokens_seen": 107315408, "step": 159230 }, { "epoch": 3.89013754183666, "grad_norm": 0.028035694733262062, "learning_rate": 2.854735438745497e-07, "loss": 0.0, "num_input_tokens_seen": 107318608, "step": 159235 }, { "epoch": 3.890259692668507, "grad_norm": 0.016784196719527245, "learning_rate": 2.854138855459748e-07, "loss": 0.0, "num_input_tokens_seen": 107321936, "step": 159240 }, { "epoch": 3.890381843500354, "grad_norm": 0.030693599954247475, "learning_rate": 2.8535423241406274e-07, "loss": 0.0, "num_input_tokens_seen": 107324880, "step": 159245 }, { "epoch": 3.8905039943322013, "grad_norm": 0.0026803172659128904, "learning_rate": 2.8529458447924646e-07, "loss": 0.0, "num_input_tokens_seen": 107328912, "step": 159250 }, { "epoch": 3.8906261451640485, "grad_norm": 0.0023972035851329565, "learning_rate": 2.852349417419604e-07, "loss": 0.0204, "num_input_tokens_seen": 107332048, "step": 159255 }, { "epoch": 3.8907482959958957, "grad_norm": 0.0074956901371479034, "learning_rate": 2.8517530420263826e-07, "loss": 0.0, "num_input_tokens_seen": 107335248, "step": 159260 }, { "epoch": 3.890870446827743, "grad_norm": 0.001000861986540258, "learning_rate": 2.8511567186171327e-07, "loss": 0.0, "num_input_tokens_seen": 107338256, "step": 159265 }, { "epoch": 3.89099259765959, "grad_norm": 0.018955791369080544, "learning_rate": 2.8505604471961975e-07, "loss": 0.0001, "num_input_tokens_seen": 107341840, "step": 159270 }, { "epoch": 3.8911147484914372, "grad_norm": 0.02058357745409012, "learning_rate": 2.849964227767906e-07, "loss": 0.0, "num_input_tokens_seen": 107345424, "step": 159275 }, { "epoch": 3.8912368993232844, "grad_norm": 0.004790999460965395, "learning_rate": 2.849368060336599e-07, "loss": 0.0001, "num_input_tokens_seen": 107348432, "step": 159280 }, { "epoch": 3.8913590501551316, "grad_norm": 0.03545403480529785, "learning_rate": 2.848771944906613e-07, "loss": 0.1123, "num_input_tokens_seen": 107352720, "step": 159285 }, { "epoch": 3.891481200986979, "grad_norm": 0.0004981185775250196, "learning_rate": 2.8481758814822777e-07, "loss": 0.0, "num_input_tokens_seen": 107355792, "step": 159290 }, { "epoch": 3.891603351818826, "grad_norm": 0.0007688974146731198, "learning_rate": 2.8475798700679344e-07, "loss": 0.0, "num_input_tokens_seen": 107359568, "step": 159295 }, { "epoch": 3.891725502650673, "grad_norm": 0.00931483879685402, "learning_rate": 2.846983910667911e-07, "loss": 0.0, "num_input_tokens_seen": 107362896, "step": 159300 }, { "epoch": 3.8918476534825204, "grad_norm": 0.0016654456267133355, "learning_rate": 2.8463880032865463e-07, "loss": 0.0588, "num_input_tokens_seen": 107366544, "step": 159305 }, { "epoch": 3.8919698043143676, "grad_norm": 0.002245159586891532, "learning_rate": 2.84579214792817e-07, "loss": 0.0, "num_input_tokens_seen": 107369616, "step": 159310 }, { "epoch": 3.8920919551462143, "grad_norm": 0.0015120379393920302, "learning_rate": 2.8451963445971184e-07, "loss": 0.0002, "num_input_tokens_seen": 107373136, "step": 159315 }, { "epoch": 3.892214105978062, "grad_norm": 0.008667359128594398, "learning_rate": 2.844600593297726e-07, "loss": 0.0, "num_input_tokens_seen": 107376400, "step": 159320 }, { "epoch": 3.8923362568099087, "grad_norm": 0.0029568129684776068, "learning_rate": 2.8440048940343185e-07, "loss": 0.0002, "num_input_tokens_seen": 107379792, "step": 159325 }, { "epoch": 3.892458407641756, "grad_norm": 0.005628951825201511, "learning_rate": 2.843409246811236e-07, "loss": 0.0, "num_input_tokens_seen": 107382800, "step": 159330 }, { "epoch": 3.892580558473603, "grad_norm": 0.012718535959720612, "learning_rate": 2.842813651632806e-07, "loss": 0.0, "num_input_tokens_seen": 107385936, "step": 159335 }, { "epoch": 3.8927027093054503, "grad_norm": 16.744064331054688, "learning_rate": 2.8422181085033583e-07, "loss": 0.0256, "num_input_tokens_seen": 107389264, "step": 159340 }, { "epoch": 3.8928248601372974, "grad_norm": 0.0019720131531357765, "learning_rate": 2.841622617427227e-07, "loss": 0.0488, "num_input_tokens_seen": 107392400, "step": 159345 }, { "epoch": 3.8929470109691446, "grad_norm": 0.0032979014795273542, "learning_rate": 2.841027178408739e-07, "loss": 0.0488, "num_input_tokens_seen": 107395600, "step": 159350 }, { "epoch": 3.893069161800992, "grad_norm": 0.00013743384624831378, "learning_rate": 2.8404317914522304e-07, "loss": 0.0, "num_input_tokens_seen": 107398864, "step": 159355 }, { "epoch": 3.893191312632839, "grad_norm": 0.008313512429594994, "learning_rate": 2.839836456562025e-07, "loss": 0.0, "num_input_tokens_seen": 107402960, "step": 159360 }, { "epoch": 3.893313463464686, "grad_norm": 0.0017402205849066377, "learning_rate": 2.839241173742456e-07, "loss": 0.0, "num_input_tokens_seen": 107406096, "step": 159365 }, { "epoch": 3.8934356142965334, "grad_norm": 0.0016363713657483459, "learning_rate": 2.838645942997849e-07, "loss": 0.0005, "num_input_tokens_seen": 107409296, "step": 159370 }, { "epoch": 3.8935577651283806, "grad_norm": 0.0010019043693318963, "learning_rate": 2.8380507643325357e-07, "loss": 0.0, "num_input_tokens_seen": 107412688, "step": 159375 }, { "epoch": 3.8936799159602278, "grad_norm": 0.006046359892934561, "learning_rate": 2.8374556377508463e-07, "loss": 0.0441, "num_input_tokens_seen": 107415952, "step": 159380 }, { "epoch": 3.893802066792075, "grad_norm": 0.000366544903954491, "learning_rate": 2.8368605632571017e-07, "loss": 0.0048, "num_input_tokens_seen": 107419472, "step": 159385 }, { "epoch": 3.893924217623922, "grad_norm": 0.0009933729888871312, "learning_rate": 2.836265540855638e-07, "loss": 0.0, "num_input_tokens_seen": 107422736, "step": 159390 }, { "epoch": 3.8940463684557693, "grad_norm": 0.0023379067424684763, "learning_rate": 2.835670570550774e-07, "loss": 0.0, "num_input_tokens_seen": 107426576, "step": 159395 }, { "epoch": 3.894168519287616, "grad_norm": 0.0043589891865849495, "learning_rate": 2.8350756523468454e-07, "loss": 0.0, "num_input_tokens_seen": 107430416, "step": 159400 }, { "epoch": 3.8942906701194637, "grad_norm": 30.011795043945312, "learning_rate": 2.834480786248169e-07, "loss": 0.0489, "num_input_tokens_seen": 107433488, "step": 159405 }, { "epoch": 3.8944128209513105, "grad_norm": 17.220151901245117, "learning_rate": 2.833885972259077e-07, "loss": 0.0476, "num_input_tokens_seen": 107436496, "step": 159410 }, { "epoch": 3.894534971783158, "grad_norm": 0.0010251520434394479, "learning_rate": 2.8332912103838957e-07, "loss": 0.0, "num_input_tokens_seen": 107440080, "step": 159415 }, { "epoch": 3.894657122615005, "grad_norm": 0.00047361108590848744, "learning_rate": 2.8326965006269454e-07, "loss": 0.0, "num_input_tokens_seen": 107443536, "step": 159420 }, { "epoch": 3.894779273446852, "grad_norm": 0.029390431940555573, "learning_rate": 2.832101842992558e-07, "loss": 0.0, "num_input_tokens_seen": 107446672, "step": 159425 }, { "epoch": 3.8949014242786992, "grad_norm": 0.031576137989759445, "learning_rate": 2.8315072374850504e-07, "loss": 0.0, "num_input_tokens_seen": 107449808, "step": 159430 }, { "epoch": 3.8950235751105464, "grad_norm": 0.015351535752415657, "learning_rate": 2.8309126841087527e-07, "loss": 0.0, "num_input_tokens_seen": 107453200, "step": 159435 }, { "epoch": 3.8951457259423936, "grad_norm": 28.911039352416992, "learning_rate": 2.8303181828679857e-07, "loss": 0.0512, "num_input_tokens_seen": 107456976, "step": 159440 }, { "epoch": 3.895267876774241, "grad_norm": 0.012119249440729618, "learning_rate": 2.829723733767071e-07, "loss": 0.0002, "num_input_tokens_seen": 107460368, "step": 159445 }, { "epoch": 3.895390027606088, "grad_norm": 0.0006879116408526897, "learning_rate": 2.8291293368103374e-07, "loss": 0.0, "num_input_tokens_seen": 107463440, "step": 159450 }, { "epoch": 3.895512178437935, "grad_norm": 0.0032404428347945213, "learning_rate": 2.8285349920021006e-07, "loss": 0.0006, "num_input_tokens_seen": 107466960, "step": 159455 }, { "epoch": 3.8956343292697824, "grad_norm": 0.017269376665353775, "learning_rate": 2.8279406993466893e-07, "loss": 0.0, "num_input_tokens_seen": 107470224, "step": 159460 }, { "epoch": 3.8957564801016296, "grad_norm": 0.0010835862485691905, "learning_rate": 2.8273464588484186e-07, "loss": 0.0001, "num_input_tokens_seen": 107474192, "step": 159465 }, { "epoch": 3.8958786309334767, "grad_norm": 0.00023717455042060465, "learning_rate": 2.826752270511614e-07, "loss": 0.0, "num_input_tokens_seen": 107477520, "step": 159470 }, { "epoch": 3.896000781765324, "grad_norm": 0.0019396002171561122, "learning_rate": 2.8261581343406005e-07, "loss": 0.0, "num_input_tokens_seen": 107480720, "step": 159475 }, { "epoch": 3.896122932597171, "grad_norm": 0.0029893694445490837, "learning_rate": 2.8255640503396903e-07, "loss": 0.0001, "num_input_tokens_seen": 107484624, "step": 159480 }, { "epoch": 3.896245083429018, "grad_norm": 0.005537273362278938, "learning_rate": 2.8249700185132107e-07, "loss": 0.0, "num_input_tokens_seen": 107488080, "step": 159485 }, { "epoch": 3.8963672342608655, "grad_norm": 0.00046166477841325104, "learning_rate": 2.824376038865477e-07, "loss": 0.0, "num_input_tokens_seen": 107491408, "step": 159490 }, { "epoch": 3.8964893850927123, "grad_norm": 0.0005455143982544541, "learning_rate": 2.823782111400813e-07, "loss": 0.0, "num_input_tokens_seen": 107494992, "step": 159495 }, { "epoch": 3.89661153592456, "grad_norm": 0.006175261456519365, "learning_rate": 2.823188236123533e-07, "loss": 0.0, "num_input_tokens_seen": 107498320, "step": 159500 }, { "epoch": 3.8967336867564066, "grad_norm": 0.0041683511808514595, "learning_rate": 2.8225944130379586e-07, "loss": 0.0001, "num_input_tokens_seen": 107501392, "step": 159505 }, { "epoch": 3.896855837588254, "grad_norm": 0.003727443516254425, "learning_rate": 2.822000642148411e-07, "loss": 0.0, "num_input_tokens_seen": 107504912, "step": 159510 }, { "epoch": 3.896977988420101, "grad_norm": 0.010317213833332062, "learning_rate": 2.821406923459202e-07, "loss": 0.0, "num_input_tokens_seen": 107508240, "step": 159515 }, { "epoch": 3.897100139251948, "grad_norm": 0.001735912635922432, "learning_rate": 2.8208132569746555e-07, "loss": 0.0399, "num_input_tokens_seen": 107511248, "step": 159520 }, { "epoch": 3.8972222900837954, "grad_norm": 0.009436232969164848, "learning_rate": 2.8202196426990844e-07, "loss": 0.0, "num_input_tokens_seen": 107514576, "step": 159525 }, { "epoch": 3.8973444409156426, "grad_norm": 0.0007831714465282857, "learning_rate": 2.819626080636809e-07, "loss": 0.0, "num_input_tokens_seen": 107518608, "step": 159530 }, { "epoch": 3.8974665917474898, "grad_norm": 0.034116633236408234, "learning_rate": 2.8190325707921416e-07, "loss": 0.1042, "num_input_tokens_seen": 107522128, "step": 159535 }, { "epoch": 3.897588742579337, "grad_norm": 0.0010338969295844436, "learning_rate": 2.818439113169403e-07, "loss": 0.0001, "num_input_tokens_seen": 107525968, "step": 159540 }, { "epoch": 3.897710893411184, "grad_norm": 0.002002190565690398, "learning_rate": 2.817845707772908e-07, "loss": 0.0001, "num_input_tokens_seen": 107528784, "step": 159545 }, { "epoch": 3.8978330442430313, "grad_norm": 0.002047667745500803, "learning_rate": 2.817252354606966e-07, "loss": 0.0, "num_input_tokens_seen": 107531856, "step": 159550 }, { "epoch": 3.8979551950748785, "grad_norm": 0.03190112113952637, "learning_rate": 2.8166590536759015e-07, "loss": 0.0, "num_input_tokens_seen": 107535056, "step": 159555 }, { "epoch": 3.8980773459067257, "grad_norm": 0.005458956118673086, "learning_rate": 2.8160658049840205e-07, "loss": 0.1033, "num_input_tokens_seen": 107538512, "step": 159560 }, { "epoch": 3.898199496738573, "grad_norm": 0.0033516723196953535, "learning_rate": 2.815472608535642e-07, "loss": 0.0, "num_input_tokens_seen": 107541968, "step": 159565 }, { "epoch": 3.89832164757042, "grad_norm": 0.0010578599758446217, "learning_rate": 2.8148794643350816e-07, "loss": 0.0318, "num_input_tokens_seen": 107545360, "step": 159570 }, { "epoch": 3.8984437984022673, "grad_norm": 0.00019192988111171871, "learning_rate": 2.8142863723866475e-07, "loss": 0.0, "num_input_tokens_seen": 107548368, "step": 159575 }, { "epoch": 3.898565949234114, "grad_norm": 15.34018325805664, "learning_rate": 2.8136933326946574e-07, "loss": 0.058, "num_input_tokens_seen": 107551824, "step": 159580 }, { "epoch": 3.8986881000659617, "grad_norm": 0.009013521485030651, "learning_rate": 2.813100345263421e-07, "loss": 0.0, "num_input_tokens_seen": 107555088, "step": 159585 }, { "epoch": 3.8988102508978084, "grad_norm": 0.008238616399466991, "learning_rate": 2.812507410097251e-07, "loss": 0.0, "num_input_tokens_seen": 107558608, "step": 159590 }, { "epoch": 3.898932401729656, "grad_norm": 0.005177440121769905, "learning_rate": 2.811914527200463e-07, "loss": 0.0008, "num_input_tokens_seen": 107562000, "step": 159595 }, { "epoch": 3.899054552561503, "grad_norm": 0.0011088430183008313, "learning_rate": 2.8113216965773634e-07, "loss": 0.0, "num_input_tokens_seen": 107565456, "step": 159600 }, { "epoch": 3.89917670339335, "grad_norm": 0.024311434477567673, "learning_rate": 2.810728918232269e-07, "loss": 0.0, "num_input_tokens_seen": 107568592, "step": 159605 }, { "epoch": 3.899298854225197, "grad_norm": 0.006240412592887878, "learning_rate": 2.8101361921694854e-07, "loss": 0.0, "num_input_tokens_seen": 107572432, "step": 159610 }, { "epoch": 3.8994210050570444, "grad_norm": 0.008073143661022186, "learning_rate": 2.8095435183933267e-07, "loss": 0.0572, "num_input_tokens_seen": 107575504, "step": 159615 }, { "epoch": 3.8995431558888916, "grad_norm": 0.12511250376701355, "learning_rate": 2.8089508969081e-07, "loss": 0.0002, "num_input_tokens_seen": 107578896, "step": 159620 }, { "epoch": 3.8996653067207387, "grad_norm": 0.0018284362740814686, "learning_rate": 2.8083583277181154e-07, "loss": 0.0433, "num_input_tokens_seen": 107582096, "step": 159625 }, { "epoch": 3.899787457552586, "grad_norm": 0.004669949412345886, "learning_rate": 2.807765810827687e-07, "loss": 0.0002, "num_input_tokens_seen": 107585616, "step": 159630 }, { "epoch": 3.899909608384433, "grad_norm": 0.0008054210338741541, "learning_rate": 2.807173346241116e-07, "loss": 0.081, "num_input_tokens_seen": 107588624, "step": 159635 }, { "epoch": 3.9000317592162803, "grad_norm": 0.0017973963404074311, "learning_rate": 2.80658093396272e-07, "loss": 0.0003, "num_input_tokens_seen": 107592144, "step": 159640 }, { "epoch": 3.9001539100481275, "grad_norm": 0.053213659673929214, "learning_rate": 2.8059885739968e-07, "loss": 0.0, "num_input_tokens_seen": 107595344, "step": 159645 }, { "epoch": 3.9002760608799747, "grad_norm": 0.027758058160543442, "learning_rate": 2.805396266347665e-07, "loss": 0.0478, "num_input_tokens_seen": 107598928, "step": 159650 }, { "epoch": 3.900398211711822, "grad_norm": 0.003646234283223748, "learning_rate": 2.804804011019626e-07, "loss": 0.0, "num_input_tokens_seen": 107601936, "step": 159655 }, { "epoch": 3.900520362543669, "grad_norm": 0.006015193648636341, "learning_rate": 2.8042118080169843e-07, "loss": 0.0, "num_input_tokens_seen": 107605456, "step": 159660 }, { "epoch": 3.900642513375516, "grad_norm": 0.0008773684385232627, "learning_rate": 2.803619657344053e-07, "loss": 0.0, "num_input_tokens_seen": 107608656, "step": 159665 }, { "epoch": 3.9007646642073635, "grad_norm": 0.0009342681732960045, "learning_rate": 2.803027559005131e-07, "loss": 0.0, "num_input_tokens_seen": 107611984, "step": 159670 }, { "epoch": 3.90088681503921, "grad_norm": 0.008757350035011768, "learning_rate": 2.8024355130045316e-07, "loss": 0.0439, "num_input_tokens_seen": 107615056, "step": 159675 }, { "epoch": 3.901008965871058, "grad_norm": 0.0015640563797205687, "learning_rate": 2.801843519346555e-07, "loss": 0.0, "num_input_tokens_seen": 107618512, "step": 159680 }, { "epoch": 3.9011311167029046, "grad_norm": 0.02840656414628029, "learning_rate": 2.8012515780355084e-07, "loss": 0.0, "num_input_tokens_seen": 107622160, "step": 159685 }, { "epoch": 3.9012532675347518, "grad_norm": 0.0020951058249920607, "learning_rate": 2.8006596890756995e-07, "loss": 0.0, "num_input_tokens_seen": 107625616, "step": 159690 }, { "epoch": 3.901375418366599, "grad_norm": 0.0030957581475377083, "learning_rate": 2.8000678524714263e-07, "loss": 0.0, "num_input_tokens_seen": 107629264, "step": 159695 }, { "epoch": 3.901497569198446, "grad_norm": 0.04345306009054184, "learning_rate": 2.7994760682269993e-07, "loss": 0.0001, "num_input_tokens_seen": 107632528, "step": 159700 }, { "epoch": 3.9016197200302933, "grad_norm": 0.0007010184344835579, "learning_rate": 2.798884336346716e-07, "loss": 0.0, "num_input_tokens_seen": 107635984, "step": 159705 }, { "epoch": 3.9017418708621405, "grad_norm": 0.0029301580507308245, "learning_rate": 2.7982926568348853e-07, "loss": 0.0, "num_input_tokens_seen": 107639184, "step": 159710 }, { "epoch": 3.9018640216939877, "grad_norm": 0.043662700802087784, "learning_rate": 2.797701029695805e-07, "loss": 0.0, "num_input_tokens_seen": 107642768, "step": 159715 }, { "epoch": 3.901986172525835, "grad_norm": 0.018145162612199783, "learning_rate": 2.7971094549337805e-07, "loss": 0.0, "num_input_tokens_seen": 107646160, "step": 159720 }, { "epoch": 3.902108323357682, "grad_norm": 0.0647394210100174, "learning_rate": 2.7965179325531154e-07, "loss": 0.0, "num_input_tokens_seen": 107649872, "step": 159725 }, { "epoch": 3.9022304741895293, "grad_norm": 0.014430363662540913, "learning_rate": 2.7959264625581067e-07, "loss": 0.0, "num_input_tokens_seen": 107653648, "step": 159730 }, { "epoch": 3.9023526250213765, "grad_norm": 0.015603567473590374, "learning_rate": 2.795335044953061e-07, "loss": 0.0, "num_input_tokens_seen": 107657296, "step": 159735 }, { "epoch": 3.9024747758532237, "grad_norm": 0.00026981995324604213, "learning_rate": 2.794743679742274e-07, "loss": 0.0, "num_input_tokens_seen": 107660432, "step": 159740 }, { "epoch": 3.902596926685071, "grad_norm": 0.004165019374340773, "learning_rate": 2.7941523669300527e-07, "loss": 0.0, "num_input_tokens_seen": 107664144, "step": 159745 }, { "epoch": 3.902719077516918, "grad_norm": 0.002123868092894554, "learning_rate": 2.793561106520693e-07, "loss": 0.0, "num_input_tokens_seen": 107667344, "step": 159750 }, { "epoch": 3.9028412283487652, "grad_norm": 0.005422023590654135, "learning_rate": 2.7929698985184923e-07, "loss": 0.0, "num_input_tokens_seen": 107670544, "step": 159755 }, { "epoch": 3.902963379180612, "grad_norm": 0.032854385673999786, "learning_rate": 2.792378742927756e-07, "loss": 0.0001, "num_input_tokens_seen": 107674320, "step": 159760 }, { "epoch": 3.9030855300124596, "grad_norm": 0.5987409949302673, "learning_rate": 2.791787639752776e-07, "loss": 0.0001, "num_input_tokens_seen": 107677776, "step": 159765 }, { "epoch": 3.9032076808443064, "grad_norm": 0.0016436355654150248, "learning_rate": 2.7911965889978595e-07, "loss": 0.0001, "num_input_tokens_seen": 107681488, "step": 159770 }, { "epoch": 3.9033298316761535, "grad_norm": 0.00790480338037014, "learning_rate": 2.7906055906672965e-07, "loss": 0.0001, "num_input_tokens_seen": 107685328, "step": 159775 }, { "epoch": 3.9034519825080007, "grad_norm": 0.10544174164533615, "learning_rate": 2.7900146447653895e-07, "loss": 0.0, "num_input_tokens_seen": 107688336, "step": 159780 }, { "epoch": 3.903574133339848, "grad_norm": 0.0011935977963730693, "learning_rate": 2.789423751296438e-07, "loss": 0.0, "num_input_tokens_seen": 107691536, "step": 159785 }, { "epoch": 3.903696284171695, "grad_norm": 0.002410582033917308, "learning_rate": 2.788832910264732e-07, "loss": 0.0, "num_input_tokens_seen": 107695120, "step": 159790 }, { "epoch": 3.9038184350035423, "grad_norm": 0.0012916321866214275, "learning_rate": 2.7882421216745776e-07, "loss": 0.0, "num_input_tokens_seen": 107698768, "step": 159795 }, { "epoch": 3.9039405858353895, "grad_norm": 0.002064318861812353, "learning_rate": 2.787651385530263e-07, "loss": 0.0, "num_input_tokens_seen": 107702224, "step": 159800 }, { "epoch": 3.9040627366672367, "grad_norm": 0.050407975912094116, "learning_rate": 2.787060701836089e-07, "loss": 0.0, "num_input_tokens_seen": 107705424, "step": 159805 }, { "epoch": 3.904184887499084, "grad_norm": 0.02390783280134201, "learning_rate": 2.7864700705963484e-07, "loss": 0.0, "num_input_tokens_seen": 107708688, "step": 159810 }, { "epoch": 3.904307038330931, "grad_norm": 0.005413845181465149, "learning_rate": 2.785879491815336e-07, "loss": 0.0, "num_input_tokens_seen": 107712016, "step": 159815 }, { "epoch": 3.9044291891627783, "grad_norm": 0.0004383553168736398, "learning_rate": 2.785288965497352e-07, "loss": 0.0, "num_input_tokens_seen": 107715152, "step": 159820 }, { "epoch": 3.9045513399946254, "grad_norm": 0.0010350412921980023, "learning_rate": 2.784698491646684e-07, "loss": 0.0, "num_input_tokens_seen": 107718032, "step": 159825 }, { "epoch": 3.9046734908264726, "grad_norm": 0.09186630696058273, "learning_rate": 2.7841080702676336e-07, "loss": 0.0, "num_input_tokens_seen": 107721552, "step": 159830 }, { "epoch": 3.90479564165832, "grad_norm": 0.0008800149080343544, "learning_rate": 2.783517701364485e-07, "loss": 0.0, "num_input_tokens_seen": 107725328, "step": 159835 }, { "epoch": 3.904917792490167, "grad_norm": 0.004721642006188631, "learning_rate": 2.782927384941541e-07, "loss": 0.0, "num_input_tokens_seen": 107728336, "step": 159840 }, { "epoch": 3.9050399433220138, "grad_norm": 0.0003880482690874487, "learning_rate": 2.7823371210030865e-07, "loss": 0.0, "num_input_tokens_seen": 107731472, "step": 159845 }, { "epoch": 3.9051620941538614, "grad_norm": 0.0011299933539703488, "learning_rate": 2.781746909553422e-07, "loss": 0.0, "num_input_tokens_seen": 107734736, "step": 159850 }, { "epoch": 3.905284244985708, "grad_norm": 0.01743919961154461, "learning_rate": 2.7811567505968346e-07, "loss": 0.0, "num_input_tokens_seen": 107738832, "step": 159855 }, { "epoch": 3.9054063958175558, "grad_norm": 0.002373091410845518, "learning_rate": 2.780566644137614e-07, "loss": 0.0, "num_input_tokens_seen": 107742224, "step": 159860 }, { "epoch": 3.9055285466494025, "grad_norm": 0.0019706422463059425, "learning_rate": 2.7799765901800576e-07, "loss": 0.0, "num_input_tokens_seen": 107745424, "step": 159865 }, { "epoch": 3.9056506974812497, "grad_norm": 0.041707828640937805, "learning_rate": 2.779386588728451e-07, "loss": 0.0, "num_input_tokens_seen": 107748560, "step": 159870 }, { "epoch": 3.905772848313097, "grad_norm": 0.002031422220170498, "learning_rate": 2.7787966397870855e-07, "loss": 0.0001, "num_input_tokens_seen": 107751952, "step": 159875 }, { "epoch": 3.905894999144944, "grad_norm": 0.0063712685368955135, "learning_rate": 2.7782067433602574e-07, "loss": 0.0, "num_input_tokens_seen": 107755216, "step": 159880 }, { "epoch": 3.9060171499767913, "grad_norm": 0.0017904570559039712, "learning_rate": 2.777616899452249e-07, "loss": 0.0, "num_input_tokens_seen": 107758480, "step": 159885 }, { "epoch": 3.9061393008086385, "grad_norm": 0.46773773431777954, "learning_rate": 2.7770271080673566e-07, "loss": 0.0002, "num_input_tokens_seen": 107761872, "step": 159890 }, { "epoch": 3.9062614516404857, "grad_norm": 0.004684579558670521, "learning_rate": 2.776437369209862e-07, "loss": 0.0, "num_input_tokens_seen": 107765584, "step": 159895 }, { "epoch": 3.906383602472333, "grad_norm": 0.5561292767524719, "learning_rate": 2.7758476828840615e-07, "loss": 0.0001, "num_input_tokens_seen": 107768592, "step": 159900 }, { "epoch": 3.90650575330418, "grad_norm": 0.0021681429352611303, "learning_rate": 2.775258049094236e-07, "loss": 0.0, "num_input_tokens_seen": 107771664, "step": 159905 }, { "epoch": 3.9066279041360272, "grad_norm": 0.0008911622571758926, "learning_rate": 2.7746684678446776e-07, "loss": 0.0, "num_input_tokens_seen": 107774800, "step": 159910 }, { "epoch": 3.9067500549678744, "grad_norm": 0.0005695793661288917, "learning_rate": 2.774078939139677e-07, "loss": 0.0001, "num_input_tokens_seen": 107778000, "step": 159915 }, { "epoch": 3.9068722057997216, "grad_norm": 0.0030290824361145496, "learning_rate": 2.773489462983514e-07, "loss": 0.0, "num_input_tokens_seen": 107781264, "step": 159920 }, { "epoch": 3.906994356631569, "grad_norm": 0.004985318053513765, "learning_rate": 2.7729000393804825e-07, "loss": 0.0, "num_input_tokens_seen": 107785040, "step": 159925 }, { "epoch": 3.907116507463416, "grad_norm": 0.0003536255971994251, "learning_rate": 2.772310668334863e-07, "loss": 0.0, "num_input_tokens_seen": 107788560, "step": 159930 }, { "epoch": 3.907238658295263, "grad_norm": 0.04345562309026718, "learning_rate": 2.771721349850944e-07, "loss": 0.0001, "num_input_tokens_seen": 107792080, "step": 159935 }, { "epoch": 3.90736080912711, "grad_norm": 0.04325956106185913, "learning_rate": 2.7711320839330155e-07, "loss": 0.0, "num_input_tokens_seen": 107795472, "step": 159940 }, { "epoch": 3.9074829599589576, "grad_norm": 0.0022824618499726057, "learning_rate": 2.7705428705853537e-07, "loss": 0.0, "num_input_tokens_seen": 107798928, "step": 159945 }, { "epoch": 3.9076051107908043, "grad_norm": 0.3876858949661255, "learning_rate": 2.769953709812254e-07, "loss": 0.0001, "num_input_tokens_seen": 107802576, "step": 159950 }, { "epoch": 3.9077272616226515, "grad_norm": 0.037881772965192795, "learning_rate": 2.769364601617994e-07, "loss": 0.0, "num_input_tokens_seen": 107805776, "step": 159955 }, { "epoch": 3.9078494124544987, "grad_norm": 0.0005450754542835057, "learning_rate": 2.7687755460068575e-07, "loss": 0.0, "num_input_tokens_seen": 107809040, "step": 159960 }, { "epoch": 3.907971563286346, "grad_norm": 0.0026407239492982626, "learning_rate": 2.768186542983133e-07, "loss": 0.0667, "num_input_tokens_seen": 107812304, "step": 159965 }, { "epoch": 3.908093714118193, "grad_norm": 0.0014854839537292719, "learning_rate": 2.767597592551097e-07, "loss": 0.0, "num_input_tokens_seen": 107815696, "step": 159970 }, { "epoch": 3.9082158649500403, "grad_norm": 45.480194091796875, "learning_rate": 2.767008694715041e-07, "loss": 0.0632, "num_input_tokens_seen": 107819344, "step": 159975 }, { "epoch": 3.9083380157818874, "grad_norm": 0.0038357700686901808, "learning_rate": 2.766419849479239e-07, "loss": 0.0001, "num_input_tokens_seen": 107822544, "step": 159980 }, { "epoch": 3.9084601666137346, "grad_norm": 0.01778177171945572, "learning_rate": 2.765831056847981e-07, "loss": 0.0, "num_input_tokens_seen": 107825808, "step": 159985 }, { "epoch": 3.908582317445582, "grad_norm": 0.009620671160519123, "learning_rate": 2.765242316825542e-07, "loss": 0.0001, "num_input_tokens_seen": 107829328, "step": 159990 }, { "epoch": 3.908704468277429, "grad_norm": 0.0016746602486819029, "learning_rate": 2.764653629416208e-07, "loss": 0.0, "num_input_tokens_seen": 107832720, "step": 159995 }, { "epoch": 3.908826619109276, "grad_norm": 0.0006174160516820848, "learning_rate": 2.7640649946242613e-07, "loss": 0.0, "num_input_tokens_seen": 107835920, "step": 160000 }, { "epoch": 3.9089487699411234, "grad_norm": 0.0012764016864821315, "learning_rate": 2.7634764124539765e-07, "loss": 0.0, "num_input_tokens_seen": 107839696, "step": 160005 }, { "epoch": 3.9090709207729706, "grad_norm": 0.000690353917889297, "learning_rate": 2.762887882909641e-07, "loss": 0.0638, "num_input_tokens_seen": 107843664, "step": 160010 }, { "epoch": 3.9091930716048178, "grad_norm": 0.000673630100209266, "learning_rate": 2.7622994059955287e-07, "loss": 0.0, "num_input_tokens_seen": 107847120, "step": 160015 }, { "epoch": 3.909315222436665, "grad_norm": 0.0010738215642049909, "learning_rate": 2.7617109817159244e-07, "loss": 0.0, "num_input_tokens_seen": 107850512, "step": 160020 }, { "epoch": 3.9094373732685117, "grad_norm": 0.0006555470172315836, "learning_rate": 2.7611226100751016e-07, "loss": 0.0001, "num_input_tokens_seen": 107854352, "step": 160025 }, { "epoch": 3.9095595241003593, "grad_norm": 0.0011639671865850687, "learning_rate": 2.760534291077343e-07, "loss": 0.0, "num_input_tokens_seen": 107857744, "step": 160030 }, { "epoch": 3.909681674932206, "grad_norm": 0.04372568055987358, "learning_rate": 2.759946024726928e-07, "loss": 0.0388, "num_input_tokens_seen": 107861136, "step": 160035 }, { "epoch": 3.9098038257640537, "grad_norm": 0.010761997662484646, "learning_rate": 2.7593578110281314e-07, "loss": 0.0001, "num_input_tokens_seen": 107864400, "step": 160040 }, { "epoch": 3.9099259765959005, "grad_norm": 0.08728259056806564, "learning_rate": 2.758769649985234e-07, "loss": 0.0, "num_input_tokens_seen": 107867920, "step": 160045 }, { "epoch": 3.9100481274277477, "grad_norm": 0.0009333228808827698, "learning_rate": 2.7581815416025087e-07, "loss": 0.0, "num_input_tokens_seen": 107871376, "step": 160050 }, { "epoch": 3.910170278259595, "grad_norm": 0.0002366242988500744, "learning_rate": 2.757593485884238e-07, "loss": 0.0, "num_input_tokens_seen": 107874576, "step": 160055 }, { "epoch": 3.910292429091442, "grad_norm": 0.011959303170442581, "learning_rate": 2.7570054828346957e-07, "loss": 0.0, "num_input_tokens_seen": 107878416, "step": 160060 }, { "epoch": 3.9104145799232892, "grad_norm": 0.019165946170687675, "learning_rate": 2.756417532458154e-07, "loss": 0.0, "num_input_tokens_seen": 107882256, "step": 160065 }, { "epoch": 3.9105367307551364, "grad_norm": 0.0004379362508188933, "learning_rate": 2.755829634758896e-07, "loss": 0.0003, "num_input_tokens_seen": 107885520, "step": 160070 }, { "epoch": 3.9106588815869836, "grad_norm": 0.0002538120897952467, "learning_rate": 2.7552417897411905e-07, "loss": 0.0, "num_input_tokens_seen": 107888976, "step": 160075 }, { "epoch": 3.910781032418831, "grad_norm": 0.007831291295588017, "learning_rate": 2.7546539974093175e-07, "loss": 0.0, "num_input_tokens_seen": 107892688, "step": 160080 }, { "epoch": 3.910903183250678, "grad_norm": 0.0030617325101047754, "learning_rate": 2.7540662577675477e-07, "loss": 0.0001, "num_input_tokens_seen": 107895824, "step": 160085 }, { "epoch": 3.911025334082525, "grad_norm": 0.00044064526446163654, "learning_rate": 2.753478570820156e-07, "loss": 0.0, "num_input_tokens_seen": 107899024, "step": 160090 }, { "epoch": 3.9111474849143724, "grad_norm": 0.0005494902725331485, "learning_rate": 2.7528909365714205e-07, "loss": 0.0002, "num_input_tokens_seen": 107902096, "step": 160095 }, { "epoch": 3.9112696357462196, "grad_norm": 0.0003827103937510401, "learning_rate": 2.752303355025608e-07, "loss": 0.0, "num_input_tokens_seen": 107905872, "step": 160100 }, { "epoch": 3.9113917865780667, "grad_norm": 0.00016441762272734195, "learning_rate": 2.751715826186998e-07, "loss": 0.0, "num_input_tokens_seen": 107909008, "step": 160105 }, { "epoch": 3.9115139374099135, "grad_norm": 0.003068072721362114, "learning_rate": 2.751128350059857e-07, "loss": 0.0, "num_input_tokens_seen": 107912400, "step": 160110 }, { "epoch": 3.911636088241761, "grad_norm": 0.045241840183734894, "learning_rate": 2.7505409266484636e-07, "loss": 0.0, "num_input_tokens_seen": 107915856, "step": 160115 }, { "epoch": 3.911758239073608, "grad_norm": 0.0033214271534234285, "learning_rate": 2.749953555957083e-07, "loss": 0.0, "num_input_tokens_seen": 107919376, "step": 160120 }, { "epoch": 3.9118803899054555, "grad_norm": 0.0077246990986168385, "learning_rate": 2.7493662379899906e-07, "loss": 0.0, "num_input_tokens_seen": 107922704, "step": 160125 }, { "epoch": 3.9120025407373022, "grad_norm": 0.0009203380905091763, "learning_rate": 2.748778972751461e-07, "loss": 0.0, "num_input_tokens_seen": 107926160, "step": 160130 }, { "epoch": 3.9121246915691494, "grad_norm": 0.02601289190351963, "learning_rate": 2.748191760245756e-07, "loss": 0.0003, "num_input_tokens_seen": 107929360, "step": 160135 }, { "epoch": 3.9122468424009966, "grad_norm": 0.010707485489547253, "learning_rate": 2.7476046004771557e-07, "loss": 0.0001, "num_input_tokens_seen": 107932688, "step": 160140 }, { "epoch": 3.912368993232844, "grad_norm": 0.003142469795420766, "learning_rate": 2.747017493449922e-07, "loss": 0.0, "num_input_tokens_seen": 107936144, "step": 160145 }, { "epoch": 3.912491144064691, "grad_norm": 0.0026227415073662996, "learning_rate": 2.74643043916833e-07, "loss": 0.0, "num_input_tokens_seen": 107939856, "step": 160150 }, { "epoch": 3.912613294896538, "grad_norm": 0.0004527507408056408, "learning_rate": 2.7458434376366457e-07, "loss": 0.0, "num_input_tokens_seen": 107943056, "step": 160155 }, { "epoch": 3.9127354457283854, "grad_norm": 0.017700159922242165, "learning_rate": 2.74525648885914e-07, "loss": 0.0, "num_input_tokens_seen": 107946512, "step": 160160 }, { "epoch": 3.9128575965602326, "grad_norm": 0.004137058742344379, "learning_rate": 2.744669592840082e-07, "loss": 0.0684, "num_input_tokens_seen": 107949904, "step": 160165 }, { "epoch": 3.9129797473920798, "grad_norm": 0.0004117517964914441, "learning_rate": 2.744082749583734e-07, "loss": 0.0, "num_input_tokens_seen": 107953168, "step": 160170 }, { "epoch": 3.913101898223927, "grad_norm": 0.005193542223423719, "learning_rate": 2.743495959094372e-07, "loss": 0.0, "num_input_tokens_seen": 107956176, "step": 160175 }, { "epoch": 3.913224049055774, "grad_norm": 26.64436912536621, "learning_rate": 2.742909221376255e-07, "loss": 0.0633, "num_input_tokens_seen": 107959696, "step": 160180 }, { "epoch": 3.9133461998876213, "grad_norm": 0.0012446003966033459, "learning_rate": 2.742322536433654e-07, "loss": 0.0, "num_input_tokens_seen": 107962960, "step": 160185 }, { "epoch": 3.9134683507194685, "grad_norm": 0.0022602214012295008, "learning_rate": 2.74173590427084e-07, "loss": 0.0, "num_input_tokens_seen": 107966352, "step": 160190 }, { "epoch": 3.9135905015513157, "grad_norm": 0.006497112102806568, "learning_rate": 2.74114932489207e-07, "loss": 0.0005, "num_input_tokens_seen": 107969552, "step": 160195 }, { "epoch": 3.913712652383163, "grad_norm": 0.00037623572279699147, "learning_rate": 2.7405627983016186e-07, "loss": 0.0001, "num_input_tokens_seen": 107972560, "step": 160200 }, { "epoch": 3.9138348032150097, "grad_norm": 0.014156325720250607, "learning_rate": 2.7399763245037444e-07, "loss": 0.0, "num_input_tokens_seen": 107975824, "step": 160205 }, { "epoch": 3.9139569540468573, "grad_norm": 1.580858042871114e-05, "learning_rate": 2.739389903502718e-07, "loss": 0.0, "num_input_tokens_seen": 107979280, "step": 160210 }, { "epoch": 3.914079104878704, "grad_norm": 0.0027877134270966053, "learning_rate": 2.738803535302797e-07, "loss": 0.0, "num_input_tokens_seen": 107982544, "step": 160215 }, { "epoch": 3.9142012557105517, "grad_norm": 0.0003772991767618805, "learning_rate": 2.738217219908251e-07, "loss": 0.0, "num_input_tokens_seen": 107985872, "step": 160220 }, { "epoch": 3.9143234065423984, "grad_norm": 0.0005691954283975065, "learning_rate": 2.7376309573233446e-07, "loss": 0.0, "num_input_tokens_seen": 107989776, "step": 160225 }, { "epoch": 3.9144455573742456, "grad_norm": 0.04178770259022713, "learning_rate": 2.7370447475523363e-07, "loss": 0.0, "num_input_tokens_seen": 107993104, "step": 160230 }, { "epoch": 3.914567708206093, "grad_norm": 0.015368896536529064, "learning_rate": 2.7364585905994953e-07, "loss": 0.0003, "num_input_tokens_seen": 107996624, "step": 160235 }, { "epoch": 3.91468985903794, "grad_norm": 0.0012327000731602311, "learning_rate": 2.7358724864690783e-07, "loss": 0.0, "num_input_tokens_seen": 108000656, "step": 160240 }, { "epoch": 3.914812009869787, "grad_norm": 0.00057915726210922, "learning_rate": 2.7352864351653503e-07, "loss": 0.0146, "num_input_tokens_seen": 108003728, "step": 160245 }, { "epoch": 3.9149341607016344, "grad_norm": 0.007998412474989891, "learning_rate": 2.7347004366925764e-07, "loss": 0.0, "num_input_tokens_seen": 108006800, "step": 160250 }, { "epoch": 3.9150563115334815, "grad_norm": 0.007969462312757969, "learning_rate": 2.7341144910550116e-07, "loss": 0.0001, "num_input_tokens_seen": 108010576, "step": 160255 }, { "epoch": 3.9151784623653287, "grad_norm": 0.0016169307054951787, "learning_rate": 2.7335285982569247e-07, "loss": 0.0, "num_input_tokens_seen": 108013520, "step": 160260 }, { "epoch": 3.915300613197176, "grad_norm": 0.0054115429520606995, "learning_rate": 2.732942758302571e-07, "loss": 0.0001, "num_input_tokens_seen": 108016528, "step": 160265 }, { "epoch": 3.915422764029023, "grad_norm": 0.0002611346426419914, "learning_rate": 2.732356971196209e-07, "loss": 0.0, "num_input_tokens_seen": 108019600, "step": 160270 }, { "epoch": 3.9155449148608703, "grad_norm": 0.06928683817386627, "learning_rate": 2.7317712369421053e-07, "loss": 0.0, "num_input_tokens_seen": 108022864, "step": 160275 }, { "epoch": 3.9156670656927175, "grad_norm": 0.0011088034370914102, "learning_rate": 2.731185555544514e-07, "loss": 0.0002, "num_input_tokens_seen": 108026064, "step": 160280 }, { "epoch": 3.9157892165245647, "grad_norm": 0.00044814738794229925, "learning_rate": 2.7305999270076985e-07, "loss": 0.0024, "num_input_tokens_seen": 108029712, "step": 160285 }, { "epoch": 3.9159113673564114, "grad_norm": 3.936755092581734e-05, "learning_rate": 2.730014351335913e-07, "loss": 0.0609, "num_input_tokens_seen": 108032784, "step": 160290 }, { "epoch": 3.916033518188259, "grad_norm": 0.0029017627239227295, "learning_rate": 2.729428828533421e-07, "loss": 0.0, "num_input_tokens_seen": 108035728, "step": 160295 }, { "epoch": 3.916155669020106, "grad_norm": 0.00020281197794247419, "learning_rate": 2.7288433586044746e-07, "loss": 0.0, "num_input_tokens_seen": 108039760, "step": 160300 }, { "epoch": 3.9162778198519534, "grad_norm": 0.0007818678277544677, "learning_rate": 2.728257941553336e-07, "loss": 0.0, "num_input_tokens_seen": 108043088, "step": 160305 }, { "epoch": 3.9163999706838, "grad_norm": 0.00039589000516571105, "learning_rate": 2.7276725773842646e-07, "loss": 0.0, "num_input_tokens_seen": 108046864, "step": 160310 }, { "epoch": 3.9165221215156474, "grad_norm": 0.0013458137400448322, "learning_rate": 2.727087266101511e-07, "loss": 0.0, "num_input_tokens_seen": 108050960, "step": 160315 }, { "epoch": 3.9166442723474946, "grad_norm": 0.002179963979870081, "learning_rate": 2.726502007709338e-07, "loss": 0.0, "num_input_tokens_seen": 108053968, "step": 160320 }, { "epoch": 3.9167664231793418, "grad_norm": 0.042245928198099136, "learning_rate": 2.725916802211995e-07, "loss": 0.0, "num_input_tokens_seen": 108057232, "step": 160325 }, { "epoch": 3.916888574011189, "grad_norm": 0.004011943470686674, "learning_rate": 2.7253316496137457e-07, "loss": 0.0464, "num_input_tokens_seen": 108060496, "step": 160330 }, { "epoch": 3.917010724843036, "grad_norm": 0.047628071159124374, "learning_rate": 2.7247465499188373e-07, "loss": 0.0774, "num_input_tokens_seen": 108065744, "step": 160335 }, { "epoch": 3.9171328756748833, "grad_norm": 0.0027071163058280945, "learning_rate": 2.724161503131529e-07, "loss": 0.0, "num_input_tokens_seen": 108069136, "step": 160340 }, { "epoch": 3.9172550265067305, "grad_norm": 0.000492277555167675, "learning_rate": 2.7235765092560794e-07, "loss": 0.0816, "num_input_tokens_seen": 108072272, "step": 160345 }, { "epoch": 3.9173771773385777, "grad_norm": 1.0742985068645794e-05, "learning_rate": 2.722991568296734e-07, "loss": 0.0, "num_input_tokens_seen": 108075216, "step": 160350 }, { "epoch": 3.917499328170425, "grad_norm": 0.00816552434116602, "learning_rate": 2.7224066802577547e-07, "loss": 0.0001, "num_input_tokens_seen": 108079120, "step": 160355 }, { "epoch": 3.917621479002272, "grad_norm": 0.00256815692409873, "learning_rate": 2.721821845143388e-07, "loss": 0.019, "num_input_tokens_seen": 108082512, "step": 160360 }, { "epoch": 3.9177436298341193, "grad_norm": 0.002888076938688755, "learning_rate": 2.721237062957894e-07, "loss": 0.0007, "num_input_tokens_seen": 108085648, "step": 160365 }, { "epoch": 3.9178657806659665, "grad_norm": 0.0006103937048465014, "learning_rate": 2.720652333705522e-07, "loss": 0.0, "num_input_tokens_seen": 108088976, "step": 160370 }, { "epoch": 3.9179879314978137, "grad_norm": 0.0034813780803233385, "learning_rate": 2.720067657390521e-07, "loss": 0.0, "num_input_tokens_seen": 108092432, "step": 160375 }, { "epoch": 3.918110082329661, "grad_norm": 0.008327373303472996, "learning_rate": 2.7194830340171494e-07, "loss": 0.0, "num_input_tokens_seen": 108095760, "step": 160380 }, { "epoch": 3.9182322331615076, "grad_norm": 0.025573179125785828, "learning_rate": 2.7188984635896516e-07, "loss": 0.0, "num_input_tokens_seen": 108099024, "step": 160385 }, { "epoch": 3.9183543839933552, "grad_norm": 0.054493945091962814, "learning_rate": 2.718313946112286e-07, "loss": 0.0, "num_input_tokens_seen": 108102480, "step": 160390 }, { "epoch": 3.918476534825202, "grad_norm": 0.003784226719290018, "learning_rate": 2.717729481589297e-07, "loss": 0.0, "num_input_tokens_seen": 108105872, "step": 160395 }, { "epoch": 3.918598685657049, "grad_norm": 0.0017452125903218985, "learning_rate": 2.7171450700249375e-07, "loss": 0.0, "num_input_tokens_seen": 108108880, "step": 160400 }, { "epoch": 3.9187208364888964, "grad_norm": 0.0016395858256146312, "learning_rate": 2.7165607114234614e-07, "loss": 0.0, "num_input_tokens_seen": 108112336, "step": 160405 }, { "epoch": 3.9188429873207435, "grad_norm": 0.0008384010288864374, "learning_rate": 2.715976405789111e-07, "loss": 0.0, "num_input_tokens_seen": 108115408, "step": 160410 }, { "epoch": 3.9189651381525907, "grad_norm": 0.0010273084044456482, "learning_rate": 2.7153921531261436e-07, "loss": 0.0, "num_input_tokens_seen": 108118608, "step": 160415 }, { "epoch": 3.919087288984438, "grad_norm": 4.537554195849225e-05, "learning_rate": 2.7148079534388004e-07, "loss": 0.0, "num_input_tokens_seen": 108122448, "step": 160420 }, { "epoch": 3.919209439816285, "grad_norm": 0.0004955941112712026, "learning_rate": 2.714223806731335e-07, "loss": 0.0, "num_input_tokens_seen": 108126352, "step": 160425 }, { "epoch": 3.9193315906481323, "grad_norm": 0.002432050183415413, "learning_rate": 2.7136397130079926e-07, "loss": 0.0021, "num_input_tokens_seen": 108129552, "step": 160430 }, { "epoch": 3.9194537414799795, "grad_norm": 0.1448434293270111, "learning_rate": 2.71305567227302e-07, "loss": 0.0, "num_input_tokens_seen": 108133008, "step": 160435 }, { "epoch": 3.9195758923118267, "grad_norm": 0.054726745933294296, "learning_rate": 2.7124716845306717e-07, "loss": 0.0, "num_input_tokens_seen": 108136400, "step": 160440 }, { "epoch": 3.919698043143674, "grad_norm": 0.02143864706158638, "learning_rate": 2.7118877497851844e-07, "loss": 0.0, "num_input_tokens_seen": 108139984, "step": 160445 }, { "epoch": 3.919820193975521, "grad_norm": 0.10433755069971085, "learning_rate": 2.711303868040814e-07, "loss": 0.0, "num_input_tokens_seen": 108143248, "step": 160450 }, { "epoch": 3.9199423448073683, "grad_norm": 0.001276486786082387, "learning_rate": 2.7107200393017994e-07, "loss": 0.0, "num_input_tokens_seen": 108146448, "step": 160455 }, { "epoch": 3.9200644956392154, "grad_norm": 0.0015387848252430558, "learning_rate": 2.710136263572391e-07, "loss": 0.0, "num_input_tokens_seen": 108149776, "step": 160460 }, { "epoch": 3.9201866464710626, "grad_norm": 0.003469153307378292, "learning_rate": 2.7095525408568297e-07, "loss": 0.0, "num_input_tokens_seen": 108152848, "step": 160465 }, { "epoch": 3.9203087973029094, "grad_norm": 0.008718705736100674, "learning_rate": 2.7089688711593674e-07, "loss": 0.0, "num_input_tokens_seen": 108155792, "step": 160470 }, { "epoch": 3.920430948134757, "grad_norm": 0.002317639999091625, "learning_rate": 2.7083852544842433e-07, "loss": 0.0, "num_input_tokens_seen": 108158736, "step": 160475 }, { "epoch": 3.9205530989666038, "grad_norm": 0.004345919005572796, "learning_rate": 2.7078016908357004e-07, "loss": 0.0, "num_input_tokens_seen": 108162000, "step": 160480 }, { "epoch": 3.9206752497984514, "grad_norm": 0.08784374594688416, "learning_rate": 2.707218180217988e-07, "loss": 0.0, "num_input_tokens_seen": 108165648, "step": 160485 }, { "epoch": 3.920797400630298, "grad_norm": 0.0225373525172472, "learning_rate": 2.7066347226353435e-07, "loss": 0.0, "num_input_tokens_seen": 108169296, "step": 160490 }, { "epoch": 3.9209195514621453, "grad_norm": 0.0019363034516572952, "learning_rate": 2.706051318092013e-07, "loss": 0.0, "num_input_tokens_seen": 108172368, "step": 160495 }, { "epoch": 3.9210417022939925, "grad_norm": 0.0003040806914214045, "learning_rate": 2.705467966592242e-07, "loss": 0.0001, "num_input_tokens_seen": 108175568, "step": 160500 }, { "epoch": 3.9211638531258397, "grad_norm": 6.52025337330997e-05, "learning_rate": 2.704884668140267e-07, "loss": 0.0, "num_input_tokens_seen": 108179088, "step": 160505 }, { "epoch": 3.921286003957687, "grad_norm": 0.00024085829500108957, "learning_rate": 2.704301422740336e-07, "loss": 0.0, "num_input_tokens_seen": 108182096, "step": 160510 }, { "epoch": 3.921408154789534, "grad_norm": 0.002301350235939026, "learning_rate": 2.703718230396683e-07, "loss": 0.0, "num_input_tokens_seen": 108185744, "step": 160515 }, { "epoch": 3.9215303056213813, "grad_norm": 0.0006811128696426749, "learning_rate": 2.703135091113559e-07, "loss": 0.0, "num_input_tokens_seen": 108189328, "step": 160520 }, { "epoch": 3.9216524564532285, "grad_norm": 0.0007640509284101427, "learning_rate": 2.7025520048951944e-07, "loss": 0.0, "num_input_tokens_seen": 108192656, "step": 160525 }, { "epoch": 3.9217746072850757, "grad_norm": 0.001994823105633259, "learning_rate": 2.701968971745835e-07, "loss": 0.0, "num_input_tokens_seen": 108196240, "step": 160530 }, { "epoch": 3.921896758116923, "grad_norm": 0.007130780257284641, "learning_rate": 2.701385991669722e-07, "loss": 0.074, "num_input_tokens_seen": 108199376, "step": 160535 }, { "epoch": 3.92201890894877, "grad_norm": 0.005872517358511686, "learning_rate": 2.7008030646710923e-07, "loss": 0.0, "num_input_tokens_seen": 108202384, "step": 160540 }, { "epoch": 3.9221410597806172, "grad_norm": 4.299524784088135, "learning_rate": 2.7002201907541875e-07, "loss": 0.0007, "num_input_tokens_seen": 108205712, "step": 160545 }, { "epoch": 3.9222632106124644, "grad_norm": 0.0014040902024134994, "learning_rate": 2.699637369923242e-07, "loss": 0.0069, "num_input_tokens_seen": 108209168, "step": 160550 }, { "epoch": 3.9223853614443116, "grad_norm": 7.01907993061468e-05, "learning_rate": 2.699054602182498e-07, "loss": 0.0, "num_input_tokens_seen": 108212688, "step": 160555 }, { "epoch": 3.922507512276159, "grad_norm": 0.0007399121532216668, "learning_rate": 2.6984718875361947e-07, "loss": 0.0, "num_input_tokens_seen": 108216080, "step": 160560 }, { "epoch": 3.9226296631080055, "grad_norm": 0.006925436668097973, "learning_rate": 2.6978892259885657e-07, "loss": 0.0321, "num_input_tokens_seen": 108219472, "step": 160565 }, { "epoch": 3.922751813939853, "grad_norm": 0.0036607375368475914, "learning_rate": 2.697306617543852e-07, "loss": 0.0, "num_input_tokens_seen": 108222800, "step": 160570 }, { "epoch": 3.9228739647717, "grad_norm": 0.001552755944430828, "learning_rate": 2.6967240622062895e-07, "loss": 0.0, "num_input_tokens_seen": 108225936, "step": 160575 }, { "epoch": 3.922996115603547, "grad_norm": 0.0007583214901387691, "learning_rate": 2.696141559980111e-07, "loss": 0.0, "num_input_tokens_seen": 108229392, "step": 160580 }, { "epoch": 3.9231182664353943, "grad_norm": 0.008268969133496284, "learning_rate": 2.6955591108695585e-07, "loss": 0.0, "num_input_tokens_seen": 108232784, "step": 160585 }, { "epoch": 3.9232404172672415, "grad_norm": 5.6169724302890245e-06, "learning_rate": 2.6949767148788615e-07, "loss": 0.0, "num_input_tokens_seen": 108236496, "step": 160590 }, { "epoch": 3.9233625680990887, "grad_norm": 0.0036725711543112993, "learning_rate": 2.694394372012262e-07, "loss": 0.0, "num_input_tokens_seen": 108240400, "step": 160595 }, { "epoch": 3.923484718930936, "grad_norm": 0.0005443750414997339, "learning_rate": 2.6938120822739884e-07, "loss": 0.0, "num_input_tokens_seen": 108244240, "step": 160600 }, { "epoch": 3.923606869762783, "grad_norm": 0.12630638480186462, "learning_rate": 2.693229845668281e-07, "loss": 0.0, "num_input_tokens_seen": 108248976, "step": 160605 }, { "epoch": 3.9237290205946302, "grad_norm": 0.9307985305786133, "learning_rate": 2.6926476621993697e-07, "loss": 0.0003, "num_input_tokens_seen": 108252432, "step": 160610 }, { "epoch": 3.9238511714264774, "grad_norm": 0.0034127281978726387, "learning_rate": 2.6920655318714923e-07, "loss": 0.0, "num_input_tokens_seen": 108255568, "step": 160615 }, { "epoch": 3.9239733222583246, "grad_norm": 0.025321057066321373, "learning_rate": 2.6914834546888766e-07, "loss": 0.0, "num_input_tokens_seen": 108259280, "step": 160620 }, { "epoch": 3.924095473090172, "grad_norm": 0.00016328954370692372, "learning_rate": 2.69090143065576e-07, "loss": 0.0, "num_input_tokens_seen": 108262608, "step": 160625 }, { "epoch": 3.924217623922019, "grad_norm": 0.05200456827878952, "learning_rate": 2.690319459776376e-07, "loss": 0.0001, "num_input_tokens_seen": 108266000, "step": 160630 }, { "epoch": 3.924339774753866, "grad_norm": 0.008335310034453869, "learning_rate": 2.689737542054953e-07, "loss": 0.0, "num_input_tokens_seen": 108269136, "step": 160635 }, { "epoch": 3.9244619255857134, "grad_norm": 0.00032875704346224666, "learning_rate": 2.689155677495727e-07, "loss": 0.0, "num_input_tokens_seen": 108272144, "step": 160640 }, { "epoch": 3.9245840764175606, "grad_norm": 0.021563809365034103, "learning_rate": 2.6885738661029246e-07, "loss": 0.0, "num_input_tokens_seen": 108275152, "step": 160645 }, { "epoch": 3.9247062272494073, "grad_norm": 0.0003353093343321234, "learning_rate": 2.687992107880779e-07, "loss": 0.0, "num_input_tokens_seen": 108278608, "step": 160650 }, { "epoch": 3.924828378081255, "grad_norm": 0.0016764416359364986, "learning_rate": 2.6874104028335256e-07, "loss": 0.0, "num_input_tokens_seen": 108281808, "step": 160655 }, { "epoch": 3.9249505289131017, "grad_norm": 0.0027599085588008165, "learning_rate": 2.686828750965386e-07, "loss": 0.0553, "num_input_tokens_seen": 108285136, "step": 160660 }, { "epoch": 3.9250726797449493, "grad_norm": 0.0004885609960183501, "learning_rate": 2.6862471522805995e-07, "loss": 0.0, "num_input_tokens_seen": 108288400, "step": 160665 }, { "epoch": 3.925194830576796, "grad_norm": 0.0022552688606083393, "learning_rate": 2.685665606783387e-07, "loss": 0.0822, "num_input_tokens_seen": 108291792, "step": 160670 }, { "epoch": 3.9253169814086433, "grad_norm": 0.0019291022326797247, "learning_rate": 2.6850841144779844e-07, "loss": 0.0, "num_input_tokens_seen": 108295312, "step": 160675 }, { "epoch": 3.9254391322404905, "grad_norm": 0.7367055416107178, "learning_rate": 2.684502675368617e-07, "loss": 0.0002, "num_input_tokens_seen": 108298768, "step": 160680 }, { "epoch": 3.9255612830723376, "grad_norm": 0.0002957978576887399, "learning_rate": 2.683921289459512e-07, "loss": 0.0, "num_input_tokens_seen": 108302096, "step": 160685 }, { "epoch": 3.925683433904185, "grad_norm": 0.00023816576867830008, "learning_rate": 2.683339956754902e-07, "loss": 0.0, "num_input_tokens_seen": 108305040, "step": 160690 }, { "epoch": 3.925805584736032, "grad_norm": 0.001074330066330731, "learning_rate": 2.6827586772590084e-07, "loss": 0.0, "num_input_tokens_seen": 108308624, "step": 160695 }, { "epoch": 3.925927735567879, "grad_norm": 0.005222479347139597, "learning_rate": 2.6821774509760655e-07, "loss": 0.0, "num_input_tokens_seen": 108311696, "step": 160700 }, { "epoch": 3.9260498863997264, "grad_norm": 0.001443124609068036, "learning_rate": 2.681596277910293e-07, "loss": 0.0, "num_input_tokens_seen": 108315088, "step": 160705 }, { "epoch": 3.9261720372315736, "grad_norm": 0.001377643900923431, "learning_rate": 2.68101515806592e-07, "loss": 0.0, "num_input_tokens_seen": 108318416, "step": 160710 }, { "epoch": 3.926294188063421, "grad_norm": 0.000944781641010195, "learning_rate": 2.680434091447177e-07, "loss": 0.0, "num_input_tokens_seen": 108321616, "step": 160715 }, { "epoch": 3.926416338895268, "grad_norm": 0.0011502342531457543, "learning_rate": 2.6798530780582826e-07, "loss": 0.0353, "num_input_tokens_seen": 108324688, "step": 160720 }, { "epoch": 3.926538489727115, "grad_norm": 0.0038087046705186367, "learning_rate": 2.6792721179034695e-07, "loss": 0.0, "num_input_tokens_seen": 108327824, "step": 160725 }, { "epoch": 3.9266606405589624, "grad_norm": 0.0036730014253407717, "learning_rate": 2.678691210986955e-07, "loss": 0.0, "num_input_tokens_seen": 108331472, "step": 160730 }, { "epoch": 3.926782791390809, "grad_norm": 0.0008776450995355844, "learning_rate": 2.67811035731297e-07, "loss": 0.0003, "num_input_tokens_seen": 108334544, "step": 160735 }, { "epoch": 3.9269049422226567, "grad_norm": 21.63974380493164, "learning_rate": 2.677529556885734e-07, "loss": 0.074, "num_input_tokens_seen": 108337616, "step": 160740 }, { "epoch": 3.9270270930545035, "grad_norm": 0.003887888975441456, "learning_rate": 2.6769488097094704e-07, "loss": 0.0, "num_input_tokens_seen": 108342096, "step": 160745 }, { "epoch": 3.927149243886351, "grad_norm": 0.0010327683994546533, "learning_rate": 2.67636811578841e-07, "loss": 0.0, "num_input_tokens_seen": 108345360, "step": 160750 }, { "epoch": 3.927271394718198, "grad_norm": 0.0014810446882620454, "learning_rate": 2.675787475126766e-07, "loss": 0.0, "num_input_tokens_seen": 108348880, "step": 160755 }, { "epoch": 3.927393545550045, "grad_norm": 0.002494688145816326, "learning_rate": 2.675206887728769e-07, "loss": 0.0, "num_input_tokens_seen": 108351824, "step": 160760 }, { "epoch": 3.9275156963818922, "grad_norm": 0.00014557481335941702, "learning_rate": 2.6746263535986345e-07, "loss": 0.0003, "num_input_tokens_seen": 108355216, "step": 160765 }, { "epoch": 3.9276378472137394, "grad_norm": 0.0002793877793010324, "learning_rate": 2.6740458727405903e-07, "loss": 0.0, "num_input_tokens_seen": 108358672, "step": 160770 }, { "epoch": 3.9277599980455866, "grad_norm": 0.0017380811041221023, "learning_rate": 2.6734654451588524e-07, "loss": 0.0, "num_input_tokens_seen": 108362064, "step": 160775 }, { "epoch": 3.927882148877434, "grad_norm": 0.004759801551699638, "learning_rate": 2.6728850708576467e-07, "loss": 0.0, "num_input_tokens_seen": 108365392, "step": 160780 }, { "epoch": 3.928004299709281, "grad_norm": 0.001996064791455865, "learning_rate": 2.672304749841189e-07, "loss": 0.0, "num_input_tokens_seen": 108368656, "step": 160785 }, { "epoch": 3.928126450541128, "grad_norm": 0.0019026733934879303, "learning_rate": 2.671724482113705e-07, "loss": 0.0, "num_input_tokens_seen": 108372496, "step": 160790 }, { "epoch": 3.9282486013729754, "grad_norm": 0.0004214816144667566, "learning_rate": 2.6711442676794117e-07, "loss": 0.0001, "num_input_tokens_seen": 108375824, "step": 160795 }, { "epoch": 3.9283707522048226, "grad_norm": 0.001756381243467331, "learning_rate": 2.6705641065425255e-07, "loss": 0.0001, "num_input_tokens_seen": 108378960, "step": 160800 }, { "epoch": 3.9284929030366698, "grad_norm": 0.11266539990901947, "learning_rate": 2.669983998707268e-07, "loss": 0.0, "num_input_tokens_seen": 108381712, "step": 160805 }, { "epoch": 3.928615053868517, "grad_norm": 0.0004792583640664816, "learning_rate": 2.669403944177863e-07, "loss": 0.0, "num_input_tokens_seen": 108385040, "step": 160810 }, { "epoch": 3.928737204700364, "grad_norm": 0.0034123146906495094, "learning_rate": 2.668823942958519e-07, "loss": 0.0, "num_input_tokens_seen": 108388176, "step": 160815 }, { "epoch": 3.9288593555322113, "grad_norm": 0.002358450088649988, "learning_rate": 2.668243995053464e-07, "loss": 0.0, "num_input_tokens_seen": 108391632, "step": 160820 }, { "epoch": 3.9289815063640585, "grad_norm": 0.0016407499788329005, "learning_rate": 2.667664100466906e-07, "loss": 0.0, "num_input_tokens_seen": 108395472, "step": 160825 }, { "epoch": 3.9291036571959053, "grad_norm": 0.007201909553259611, "learning_rate": 2.6670842592030706e-07, "loss": 0.0, "num_input_tokens_seen": 108398992, "step": 160830 }, { "epoch": 3.929225808027753, "grad_norm": 0.00013028294779360294, "learning_rate": 2.6665044712661687e-07, "loss": 0.0, "num_input_tokens_seen": 108402512, "step": 160835 }, { "epoch": 3.9293479588595996, "grad_norm": 0.00041084669646807015, "learning_rate": 2.665924736660418e-07, "loss": 0.0402, "num_input_tokens_seen": 108405776, "step": 160840 }, { "epoch": 3.929470109691447, "grad_norm": 6.577320891665295e-05, "learning_rate": 2.6653450553900383e-07, "loss": 0.0, "num_input_tokens_seen": 108408976, "step": 160845 }, { "epoch": 3.929592260523294, "grad_norm": 0.02033839002251625, "learning_rate": 2.664765427459239e-07, "loss": 0.0001, "num_input_tokens_seen": 108412880, "step": 160850 }, { "epoch": 3.929714411355141, "grad_norm": 7.121515955077484e-05, "learning_rate": 2.6641858528722403e-07, "loss": 0.0489, "num_input_tokens_seen": 108416080, "step": 160855 }, { "epoch": 3.9298365621869884, "grad_norm": 0.002095576375722885, "learning_rate": 2.6636063316332535e-07, "loss": 0.0, "num_input_tokens_seen": 108419280, "step": 160860 }, { "epoch": 3.9299587130188356, "grad_norm": 0.0001592604094184935, "learning_rate": 2.663026863746495e-07, "loss": 0.0, "num_input_tokens_seen": 108422864, "step": 160865 }, { "epoch": 3.930080863850683, "grad_norm": 0.0009298757067881525, "learning_rate": 2.662447449216181e-07, "loss": 0.0, "num_input_tokens_seen": 108425936, "step": 160870 }, { "epoch": 3.93020301468253, "grad_norm": 0.007275915704667568, "learning_rate": 2.6618680880465207e-07, "loss": 0.0882, "num_input_tokens_seen": 108429456, "step": 160875 }, { "epoch": 3.930325165514377, "grad_norm": 17.31068992614746, "learning_rate": 2.6612887802417307e-07, "loss": 0.0373, "num_input_tokens_seen": 108432272, "step": 160880 }, { "epoch": 3.9304473163462244, "grad_norm": 0.03703254088759422, "learning_rate": 2.660709525806024e-07, "loss": 0.035, "num_input_tokens_seen": 108435408, "step": 160885 }, { "epoch": 3.9305694671780715, "grad_norm": 0.0028196689672768116, "learning_rate": 2.660130324743608e-07, "loss": 0.0, "num_input_tokens_seen": 108438736, "step": 160890 }, { "epoch": 3.9306916180099187, "grad_norm": 0.0006376370438374579, "learning_rate": 2.659551177058701e-07, "loss": 0.0, "num_input_tokens_seen": 108442000, "step": 160895 }, { "epoch": 3.930813768841766, "grad_norm": 0.0008170445216819644, "learning_rate": 2.6589720827555094e-07, "loss": 0.0, "num_input_tokens_seen": 108445264, "step": 160900 }, { "epoch": 3.930935919673613, "grad_norm": 0.3517223298549652, "learning_rate": 2.6583930418382507e-07, "loss": 0.0729, "num_input_tokens_seen": 108448656, "step": 160905 }, { "epoch": 3.9310580705054603, "grad_norm": 0.0012302043614909053, "learning_rate": 2.6578140543111293e-07, "loss": 0.0, "num_input_tokens_seen": 108452304, "step": 160910 }, { "epoch": 3.931180221337307, "grad_norm": 0.002731229877099395, "learning_rate": 2.6572351201783625e-07, "loss": 0.0, "num_input_tokens_seen": 108455632, "step": 160915 }, { "epoch": 3.9313023721691547, "grad_norm": 2.0680634406744502e-05, "learning_rate": 2.656656239444153e-07, "loss": 0.0, "num_input_tokens_seen": 108459216, "step": 160920 }, { "epoch": 3.9314245230010014, "grad_norm": 0.0033918926492333412, "learning_rate": 2.6560774121127185e-07, "loss": 0.0869, "num_input_tokens_seen": 108462352, "step": 160925 }, { "epoch": 3.931546673832849, "grad_norm": 0.0004112799360882491, "learning_rate": 2.6554986381882603e-07, "loss": 0.0, "num_input_tokens_seen": 108466000, "step": 160930 }, { "epoch": 3.931668824664696, "grad_norm": 0.0012325807474553585, "learning_rate": 2.6549199176749915e-07, "loss": 0.0, "num_input_tokens_seen": 108469264, "step": 160935 }, { "epoch": 3.931790975496543, "grad_norm": 0.0005546318716369569, "learning_rate": 2.654341250577125e-07, "loss": 0.0, "num_input_tokens_seen": 108472528, "step": 160940 }, { "epoch": 3.93191312632839, "grad_norm": 0.0006047788774594665, "learning_rate": 2.6537626368988595e-07, "loss": 0.0, "num_input_tokens_seen": 108476048, "step": 160945 }, { "epoch": 3.9320352771602374, "grad_norm": 0.02619839832186699, "learning_rate": 2.6531840766444127e-07, "loss": 0.0, "num_input_tokens_seen": 108479312, "step": 160950 }, { "epoch": 3.9321574279920846, "grad_norm": 2.9269987862790003e-05, "learning_rate": 2.6526055698179826e-07, "loss": 0.0, "num_input_tokens_seen": 108482448, "step": 160955 }, { "epoch": 3.9322795788239318, "grad_norm": 0.0012526396894827485, "learning_rate": 2.652027116423783e-07, "loss": 0.0, "num_input_tokens_seen": 108486288, "step": 160960 }, { "epoch": 3.932401729655779, "grad_norm": 0.003693891456350684, "learning_rate": 2.65144871646602e-07, "loss": 0.0, "num_input_tokens_seen": 108489936, "step": 160965 }, { "epoch": 3.932523880487626, "grad_norm": 0.001101264264434576, "learning_rate": 2.6508703699488964e-07, "loss": 0.0, "num_input_tokens_seen": 108493456, "step": 160970 }, { "epoch": 3.9326460313194733, "grad_norm": 0.001372728613205254, "learning_rate": 2.6502920768766234e-07, "loss": 0.0, "num_input_tokens_seen": 108496784, "step": 160975 }, { "epoch": 3.9327681821513205, "grad_norm": 0.0008805957622826099, "learning_rate": 2.6497138372534e-07, "loss": 0.0, "num_input_tokens_seen": 108500304, "step": 160980 }, { "epoch": 3.9328903329831677, "grad_norm": 0.001661680988036096, "learning_rate": 2.6491356510834374e-07, "loss": 0.0, "num_input_tokens_seen": 108503632, "step": 160985 }, { "epoch": 3.933012483815015, "grad_norm": 0.003996791783720255, "learning_rate": 2.6485575183709375e-07, "loss": 0.0, "num_input_tokens_seen": 108506832, "step": 160990 }, { "epoch": 3.933134634646862, "grad_norm": 0.0058486550115048885, "learning_rate": 2.6479794391201005e-07, "loss": 0.0, "num_input_tokens_seen": 108510352, "step": 160995 }, { "epoch": 3.9332567854787093, "grad_norm": 0.00618613138794899, "learning_rate": 2.6474014133351383e-07, "loss": 0.0446, "num_input_tokens_seen": 108514128, "step": 161000 }, { "epoch": 3.9333789363105565, "grad_norm": 0.03912936523556709, "learning_rate": 2.6468234410202484e-07, "loss": 0.0001, "num_input_tokens_seen": 108517328, "step": 161005 }, { "epoch": 3.933501087142403, "grad_norm": 0.0005721602938137949, "learning_rate": 2.6462455221796386e-07, "loss": 0.1235, "num_input_tokens_seen": 108520848, "step": 161010 }, { "epoch": 3.933623237974251, "grad_norm": 0.0008427056600339711, "learning_rate": 2.645667656817506e-07, "loss": 0.0, "num_input_tokens_seen": 108523856, "step": 161015 }, { "epoch": 3.9337453888060976, "grad_norm": 0.0006119231111370027, "learning_rate": 2.6450898449380575e-07, "loss": 0.0, "num_input_tokens_seen": 108527184, "step": 161020 }, { "epoch": 3.9338675396379448, "grad_norm": 0.005697476677596569, "learning_rate": 2.6445120865454964e-07, "loss": 0.0, "num_input_tokens_seen": 108530576, "step": 161025 }, { "epoch": 3.933989690469792, "grad_norm": 0.001607921440154314, "learning_rate": 2.643934381644017e-07, "loss": 0.0, "num_input_tokens_seen": 108534672, "step": 161030 }, { "epoch": 3.934111841301639, "grad_norm": 28.863571166992188, "learning_rate": 2.64335673023783e-07, "loss": 0.0524, "num_input_tokens_seen": 108538384, "step": 161035 }, { "epoch": 3.9342339921334863, "grad_norm": 0.0019764634780585766, "learning_rate": 2.6427791323311287e-07, "loss": 0.0, "num_input_tokens_seen": 108542288, "step": 161040 }, { "epoch": 3.9343561429653335, "grad_norm": 0.009618827141821384, "learning_rate": 2.642201587928119e-07, "loss": 0.0, "num_input_tokens_seen": 108546128, "step": 161045 }, { "epoch": 3.9344782937971807, "grad_norm": 0.0010088874259963632, "learning_rate": 2.641624097032995e-07, "loss": 0.0, "num_input_tokens_seen": 108549520, "step": 161050 }, { "epoch": 3.934600444629028, "grad_norm": 0.0008176874252967536, "learning_rate": 2.64104665964996e-07, "loss": 0.0, "num_input_tokens_seen": 108553104, "step": 161055 }, { "epoch": 3.934722595460875, "grad_norm": 0.00025323365116491914, "learning_rate": 2.640469275783217e-07, "loss": 0.0, "num_input_tokens_seen": 108556304, "step": 161060 }, { "epoch": 3.9348447462927223, "grad_norm": 0.005040105897933245, "learning_rate": 2.6398919454369564e-07, "loss": 0.0, "num_input_tokens_seen": 108560016, "step": 161065 }, { "epoch": 3.9349668971245695, "grad_norm": 0.0007410483667626977, "learning_rate": 2.639314668615384e-07, "loss": 0.0, "num_input_tokens_seen": 108562960, "step": 161070 }, { "epoch": 3.9350890479564167, "grad_norm": 0.015878252685070038, "learning_rate": 2.638737445322694e-07, "loss": 0.0, "num_input_tokens_seen": 108566480, "step": 161075 }, { "epoch": 3.935211198788264, "grad_norm": 0.014976591803133488, "learning_rate": 2.638160275563087e-07, "loss": 0.0, "num_input_tokens_seen": 108569232, "step": 161080 }, { "epoch": 3.935333349620111, "grad_norm": 0.007671588100492954, "learning_rate": 2.637583159340756e-07, "loss": 0.0479, "num_input_tokens_seen": 108572368, "step": 161085 }, { "epoch": 3.9354555004519582, "grad_norm": 0.03282986581325531, "learning_rate": 2.637006096659903e-07, "loss": 0.0, "num_input_tokens_seen": 108575248, "step": 161090 }, { "epoch": 3.935577651283805, "grad_norm": 0.006898272316902876, "learning_rate": 2.6364290875247195e-07, "loss": 0.0, "num_input_tokens_seen": 108578640, "step": 161095 }, { "epoch": 3.9356998021156526, "grad_norm": 0.003733513643965125, "learning_rate": 2.635852131939407e-07, "loss": 0.0, "num_input_tokens_seen": 108582160, "step": 161100 }, { "epoch": 3.9358219529474994, "grad_norm": 0.00028158118948340416, "learning_rate": 2.635275229908158e-07, "loss": 0.0, "num_input_tokens_seen": 108585424, "step": 161105 }, { "epoch": 3.935944103779347, "grad_norm": 0.002537839813157916, "learning_rate": 2.6346983814351667e-07, "loss": 0.0001, "num_input_tokens_seen": 108588432, "step": 161110 }, { "epoch": 3.9360662546111937, "grad_norm": 0.000527587253600359, "learning_rate": 2.634121586524629e-07, "loss": 0.0, "num_input_tokens_seen": 108591952, "step": 161115 }, { "epoch": 3.936188405443041, "grad_norm": 0.015813853591680527, "learning_rate": 2.633544845180743e-07, "loss": 0.0, "num_input_tokens_seen": 108595408, "step": 161120 }, { "epoch": 3.936310556274888, "grad_norm": 0.0024629030376672745, "learning_rate": 2.632968157407698e-07, "loss": 0.0, "num_input_tokens_seen": 108598608, "step": 161125 }, { "epoch": 3.9364327071067353, "grad_norm": 67.15186309814453, "learning_rate": 2.632391523209693e-07, "loss": 0.0778, "num_input_tokens_seen": 108601808, "step": 161130 }, { "epoch": 3.9365548579385825, "grad_norm": 0.0006878372514620423, "learning_rate": 2.631814942590914e-07, "loss": 0.0, "num_input_tokens_seen": 108604880, "step": 161135 }, { "epoch": 3.9366770087704297, "grad_norm": 0.007203788496553898, "learning_rate": 2.631238415555563e-07, "loss": 0.0, "num_input_tokens_seen": 108608016, "step": 161140 }, { "epoch": 3.936799159602277, "grad_norm": 0.00016854728164616972, "learning_rate": 2.6306619421078245e-07, "loss": 0.1047, "num_input_tokens_seen": 108611920, "step": 161145 }, { "epoch": 3.936921310434124, "grad_norm": 0.0006344046560116112, "learning_rate": 2.630085522251896e-07, "loss": 0.0, "num_input_tokens_seen": 108614736, "step": 161150 }, { "epoch": 3.9370434612659713, "grad_norm": 0.007249165792018175, "learning_rate": 2.629509155991969e-07, "loss": 0.0, "num_input_tokens_seen": 108617936, "step": 161155 }, { "epoch": 3.9371656120978185, "grad_norm": 0.0009594596922397614, "learning_rate": 2.6289328433322323e-07, "loss": 0.0, "num_input_tokens_seen": 108621072, "step": 161160 }, { "epoch": 3.9372877629296656, "grad_norm": 0.0017339460318908095, "learning_rate": 2.6283565842768807e-07, "loss": 0.0, "num_input_tokens_seen": 108624336, "step": 161165 }, { "epoch": 3.937409913761513, "grad_norm": 0.000566888484172523, "learning_rate": 2.627780378830099e-07, "loss": 0.0, "num_input_tokens_seen": 108627792, "step": 161170 }, { "epoch": 3.93753206459336, "grad_norm": 0.0023493324406445026, "learning_rate": 2.6272042269960856e-07, "loss": 0.0001, "num_input_tokens_seen": 108631120, "step": 161175 }, { "epoch": 3.9376542154252068, "grad_norm": 0.0009030869114212692, "learning_rate": 2.6266281287790225e-07, "loss": 0.0002, "num_input_tokens_seen": 108634384, "step": 161180 }, { "epoch": 3.9377763662570544, "grad_norm": 0.0036799830850213766, "learning_rate": 2.6260520841831037e-07, "loss": 0.0631, "num_input_tokens_seen": 108637584, "step": 161185 }, { "epoch": 3.937898517088901, "grad_norm": 0.004490790888667107, "learning_rate": 2.6254760932125184e-07, "loss": 0.0, "num_input_tokens_seen": 108641168, "step": 161190 }, { "epoch": 3.938020667920749, "grad_norm": 0.0012281035305932164, "learning_rate": 2.624900155871457e-07, "loss": 0.0001, "num_input_tokens_seen": 108644560, "step": 161195 }, { "epoch": 3.9381428187525955, "grad_norm": 0.009919474832713604, "learning_rate": 2.624324272164101e-07, "loss": 0.0001, "num_input_tokens_seen": 108648912, "step": 161200 }, { "epoch": 3.9382649695844427, "grad_norm": 0.013267102651298046, "learning_rate": 2.6237484420946456e-07, "loss": 0.1652, "num_input_tokens_seen": 108652240, "step": 161205 }, { "epoch": 3.93838712041629, "grad_norm": 0.001139350119046867, "learning_rate": 2.6231726656672726e-07, "loss": 0.0, "num_input_tokens_seen": 108655376, "step": 161210 }, { "epoch": 3.938509271248137, "grad_norm": 0.023086732253432274, "learning_rate": 2.622596942886175e-07, "loss": 0.0, "num_input_tokens_seen": 108658704, "step": 161215 }, { "epoch": 3.9386314220799843, "grad_norm": 0.0004796923603862524, "learning_rate": 2.622021273755535e-07, "loss": 0.0, "num_input_tokens_seen": 108661776, "step": 161220 }, { "epoch": 3.9387535729118315, "grad_norm": 0.0027204875368624926, "learning_rate": 2.621445658279542e-07, "loss": 0.0, "num_input_tokens_seen": 108665040, "step": 161225 }, { "epoch": 3.9388757237436787, "grad_norm": 0.010596921667456627, "learning_rate": 2.6208700964623785e-07, "loss": 0.0001, "num_input_tokens_seen": 108668240, "step": 161230 }, { "epoch": 3.938997874575526, "grad_norm": 0.022362949326634407, "learning_rate": 2.620294588308235e-07, "loss": 0.0, "num_input_tokens_seen": 108671376, "step": 161235 }, { "epoch": 3.939120025407373, "grad_norm": 0.04769204929471016, "learning_rate": 2.619719133821292e-07, "loss": 0.0, "num_input_tokens_seen": 108674576, "step": 161240 }, { "epoch": 3.9392421762392202, "grad_norm": 0.001487054629251361, "learning_rate": 2.6191437330057364e-07, "loss": 0.0, "num_input_tokens_seen": 108678160, "step": 161245 }, { "epoch": 3.9393643270710674, "grad_norm": 0.003391053294762969, "learning_rate": 2.6185683858657546e-07, "loss": 0.0, "num_input_tokens_seen": 108681680, "step": 161250 }, { "epoch": 3.9394864779029146, "grad_norm": 0.6954625248908997, "learning_rate": 2.617993092405527e-07, "loss": 0.0001, "num_input_tokens_seen": 108685008, "step": 161255 }, { "epoch": 3.939608628734762, "grad_norm": 0.0019832707475870848, "learning_rate": 2.6174178526292424e-07, "loss": 0.0, "num_input_tokens_seen": 108688528, "step": 161260 }, { "epoch": 3.939730779566609, "grad_norm": 0.06212802603840828, "learning_rate": 2.616842666541077e-07, "loss": 0.0001, "num_input_tokens_seen": 108691984, "step": 161265 }, { "epoch": 3.939852930398456, "grad_norm": 0.004584174137562513, "learning_rate": 2.616267534145218e-07, "loss": 0.0648, "num_input_tokens_seen": 108695312, "step": 161270 }, { "epoch": 3.939975081230303, "grad_norm": 0.006212098989635706, "learning_rate": 2.6156924554458506e-07, "loss": 0.0, "num_input_tokens_seen": 108698576, "step": 161275 }, { "epoch": 3.9400972320621506, "grad_norm": 0.000308558956021443, "learning_rate": 2.61511743044715e-07, "loss": 0.0, "num_input_tokens_seen": 108701584, "step": 161280 }, { "epoch": 3.9402193828939973, "grad_norm": 0.00427340529859066, "learning_rate": 2.614542459153306e-07, "loss": 0.0, "num_input_tokens_seen": 108705040, "step": 161285 }, { "epoch": 3.940341533725845, "grad_norm": 0.0022537033073604107, "learning_rate": 2.6139675415684914e-07, "loss": 0.0, "num_input_tokens_seen": 108708176, "step": 161290 }, { "epoch": 3.9404636845576917, "grad_norm": 0.007408153731375933, "learning_rate": 2.613392677696895e-07, "loss": 0.0002, "num_input_tokens_seen": 108711376, "step": 161295 }, { "epoch": 3.940585835389539, "grad_norm": 0.036171622574329376, "learning_rate": 2.612817867542694e-07, "loss": 0.0001, "num_input_tokens_seen": 108714512, "step": 161300 }, { "epoch": 3.940707986221386, "grad_norm": 0.0014052970800548792, "learning_rate": 2.612243111110065e-07, "loss": 0.0, "num_input_tokens_seen": 108717584, "step": 161305 }, { "epoch": 3.9408301370532333, "grad_norm": 0.0031262929551303387, "learning_rate": 2.611668408403195e-07, "loss": 0.0001, "num_input_tokens_seen": 108720976, "step": 161310 }, { "epoch": 3.9409522878850805, "grad_norm": 0.0026391837745904922, "learning_rate": 2.611093759426256e-07, "loss": 0.0, "num_input_tokens_seen": 108724304, "step": 161315 }, { "epoch": 3.9410744387169276, "grad_norm": 33.50833511352539, "learning_rate": 2.6105191641834337e-07, "loss": 0.0456, "num_input_tokens_seen": 108727504, "step": 161320 }, { "epoch": 3.941196589548775, "grad_norm": 0.007016018498688936, "learning_rate": 2.6099446226789e-07, "loss": 0.0, "num_input_tokens_seen": 108730576, "step": 161325 }, { "epoch": 3.941318740380622, "grad_norm": 0.03891550377011299, "learning_rate": 2.6093701349168396e-07, "loss": 0.0, "num_input_tokens_seen": 108734096, "step": 161330 }, { "epoch": 3.941440891212469, "grad_norm": 20.601884841918945, "learning_rate": 2.608795700901425e-07, "loss": 0.0457, "num_input_tokens_seen": 108737552, "step": 161335 }, { "epoch": 3.9415630420443164, "grad_norm": 0.001628112862817943, "learning_rate": 2.608221320636836e-07, "loss": 0.0, "num_input_tokens_seen": 108740816, "step": 161340 }, { "epoch": 3.9416851928761636, "grad_norm": 0.0007106566918082535, "learning_rate": 2.607646994127253e-07, "loss": 0.0003, "num_input_tokens_seen": 108744208, "step": 161345 }, { "epoch": 3.941807343708011, "grad_norm": 19.420751571655273, "learning_rate": 2.6070727213768464e-07, "loss": 0.0418, "num_input_tokens_seen": 108747408, "step": 161350 }, { "epoch": 3.941929494539858, "grad_norm": 0.2629610002040863, "learning_rate": 2.606498502389798e-07, "loss": 0.0006, "num_input_tokens_seen": 108750608, "step": 161355 }, { "epoch": 3.9420516453717047, "grad_norm": 0.0015320018865168095, "learning_rate": 2.6059243371702775e-07, "loss": 0.0001, "num_input_tokens_seen": 108753808, "step": 161360 }, { "epoch": 3.9421737962035523, "grad_norm": 0.002873330609872937, "learning_rate": 2.605350225722465e-07, "loss": 0.0, "num_input_tokens_seen": 108756752, "step": 161365 }, { "epoch": 3.942295947035399, "grad_norm": 0.007898210547864437, "learning_rate": 2.6047761680505367e-07, "loss": 0.0003, "num_input_tokens_seen": 108759632, "step": 161370 }, { "epoch": 3.9424180978672467, "grad_norm": 0.0009628442348912358, "learning_rate": 2.604202164158663e-07, "loss": 0.0, "num_input_tokens_seen": 108762960, "step": 161375 }, { "epoch": 3.9425402486990935, "grad_norm": 0.28202518820762634, "learning_rate": 2.6036282140510224e-07, "loss": 0.0001, "num_input_tokens_seen": 108766352, "step": 161380 }, { "epoch": 3.9426623995309407, "grad_norm": 0.0036664672661572695, "learning_rate": 2.6030543177317853e-07, "loss": 0.0652, "num_input_tokens_seen": 108769872, "step": 161385 }, { "epoch": 3.942784550362788, "grad_norm": 0.005653384141623974, "learning_rate": 2.602480475205129e-07, "loss": 0.0489, "num_input_tokens_seen": 108773200, "step": 161390 }, { "epoch": 3.942906701194635, "grad_norm": 0.001647413824684918, "learning_rate": 2.6019066864752206e-07, "loss": 0.0006, "num_input_tokens_seen": 108777040, "step": 161395 }, { "epoch": 3.9430288520264822, "grad_norm": 0.3349778950214386, "learning_rate": 2.60133295154624e-07, "loss": 0.0001, "num_input_tokens_seen": 108780368, "step": 161400 }, { "epoch": 3.9431510028583294, "grad_norm": 0.6160761713981628, "learning_rate": 2.600759270422355e-07, "loss": 0.0001, "num_input_tokens_seen": 108783440, "step": 161405 }, { "epoch": 3.9432731536901766, "grad_norm": 0.0059109157882630825, "learning_rate": 2.6001856431077395e-07, "loss": 0.0, "num_input_tokens_seen": 108786512, "step": 161410 }, { "epoch": 3.943395304522024, "grad_norm": 0.0066981189884245396, "learning_rate": 2.599612069606565e-07, "loss": 0.0, "num_input_tokens_seen": 108790096, "step": 161415 }, { "epoch": 3.943517455353871, "grad_norm": 0.011463329195976257, "learning_rate": 2.5990385499229994e-07, "loss": 0.011, "num_input_tokens_seen": 108793360, "step": 161420 }, { "epoch": 3.943639606185718, "grad_norm": 0.0017142114229500294, "learning_rate": 2.5984650840612157e-07, "loss": 0.0, "num_input_tokens_seen": 108796688, "step": 161425 }, { "epoch": 3.9437617570175654, "grad_norm": 0.055649321526288986, "learning_rate": 2.5978916720253873e-07, "loss": 0.0014, "num_input_tokens_seen": 108799760, "step": 161430 }, { "epoch": 3.9438839078494126, "grad_norm": 0.012390898540616035, "learning_rate": 2.5973183138196785e-07, "loss": 0.0, "num_input_tokens_seen": 108803344, "step": 161435 }, { "epoch": 3.9440060586812598, "grad_norm": 0.0008434390183538198, "learning_rate": 2.5967450094482657e-07, "loss": 0.0003, "num_input_tokens_seen": 108806480, "step": 161440 }, { "epoch": 3.944128209513107, "grad_norm": 0.0012261479860171676, "learning_rate": 2.596171758915312e-07, "loss": 0.0, "num_input_tokens_seen": 108810320, "step": 161445 }, { "epoch": 3.944250360344954, "grad_norm": 0.0011466493597254157, "learning_rate": 2.595598562224991e-07, "loss": 0.0, "num_input_tokens_seen": 108813712, "step": 161450 }, { "epoch": 3.944372511176801, "grad_norm": 0.007299667224287987, "learning_rate": 2.5950254193814655e-07, "loss": 0.0001, "num_input_tokens_seen": 108816656, "step": 161455 }, { "epoch": 3.9444946620086485, "grad_norm": 0.005376824643462896, "learning_rate": 2.5944523303889065e-07, "loss": 0.0001, "num_input_tokens_seen": 108820304, "step": 161460 }, { "epoch": 3.9446168128404953, "grad_norm": 0.0009492832468822598, "learning_rate": 2.593879295251485e-07, "loss": 0.0, "num_input_tokens_seen": 108823440, "step": 161465 }, { "epoch": 3.9447389636723424, "grad_norm": 0.0012723127147182822, "learning_rate": 2.5933063139733637e-07, "loss": 0.0, "num_input_tokens_seen": 108826704, "step": 161470 }, { "epoch": 3.9448611145041896, "grad_norm": 0.0002114681847160682, "learning_rate": 2.592733386558713e-07, "loss": 0.0, "num_input_tokens_seen": 108830864, "step": 161475 }, { "epoch": 3.944983265336037, "grad_norm": 0.0071832574903965, "learning_rate": 2.5921605130116954e-07, "loss": 0.0, "num_input_tokens_seen": 108834320, "step": 161480 }, { "epoch": 3.945105416167884, "grad_norm": 0.00037165748653933406, "learning_rate": 2.591587693336481e-07, "loss": 0.0, "num_input_tokens_seen": 108837712, "step": 161485 }, { "epoch": 3.945227566999731, "grad_norm": 0.014342446811497211, "learning_rate": 2.5910149275372305e-07, "loss": 0.0, "num_input_tokens_seen": 108840976, "step": 161490 }, { "epoch": 3.9453497178315784, "grad_norm": 0.03518125042319298, "learning_rate": 2.5904422156181126e-07, "loss": 0.0, "num_input_tokens_seen": 108844176, "step": 161495 }, { "epoch": 3.9454718686634256, "grad_norm": 0.006159580312669277, "learning_rate": 2.589869557583294e-07, "loss": 0.0, "num_input_tokens_seen": 108847504, "step": 161500 }, { "epoch": 3.9455940194952728, "grad_norm": 0.0010740587022155523, "learning_rate": 2.589296953436938e-07, "loss": 0.0, "num_input_tokens_seen": 108850576, "step": 161505 }, { "epoch": 3.94571617032712, "grad_norm": 0.0005017686635255814, "learning_rate": 2.5887244031832043e-07, "loss": 0.0, "num_input_tokens_seen": 108854032, "step": 161510 }, { "epoch": 3.945838321158967, "grad_norm": 0.0033837794326245785, "learning_rate": 2.5881519068262635e-07, "loss": 0.0, "num_input_tokens_seen": 108857104, "step": 161515 }, { "epoch": 3.9459604719908143, "grad_norm": 0.03228937089443207, "learning_rate": 2.587579464370273e-07, "loss": 0.0001, "num_input_tokens_seen": 108860560, "step": 161520 }, { "epoch": 3.9460826228226615, "grad_norm": 0.002921751234680414, "learning_rate": 2.587007075819401e-07, "loss": 0.0, "num_input_tokens_seen": 108864080, "step": 161525 }, { "epoch": 3.9462047736545087, "grad_norm": 0.047707222402095795, "learning_rate": 2.586434741177804e-07, "loss": 0.0, "num_input_tokens_seen": 108867344, "step": 161530 }, { "epoch": 3.946326924486356, "grad_norm": 0.008138305507600307, "learning_rate": 2.5858624604496504e-07, "loss": 0.0, "num_input_tokens_seen": 108870544, "step": 161535 }, { "epoch": 3.9464490753182027, "grad_norm": 0.3126929998397827, "learning_rate": 2.585290233639097e-07, "loss": 0.0001, "num_input_tokens_seen": 108873936, "step": 161540 }, { "epoch": 3.9465712261500503, "grad_norm": 0.001759407576173544, "learning_rate": 2.584718060750309e-07, "loss": 0.0, "num_input_tokens_seen": 108876944, "step": 161545 }, { "epoch": 3.946693376981897, "grad_norm": 0.0005770392599515617, "learning_rate": 2.584145941787444e-07, "loss": 0.0153, "num_input_tokens_seen": 108880144, "step": 161550 }, { "epoch": 3.9468155278137447, "grad_norm": 0.0005854017799720168, "learning_rate": 2.5835738767546647e-07, "loss": 0.0, "num_input_tokens_seen": 108883664, "step": 161555 }, { "epoch": 3.9469376786455914, "grad_norm": 0.0027297306805849075, "learning_rate": 2.5830018656561325e-07, "loss": 0.0, "num_input_tokens_seen": 108886992, "step": 161560 }, { "epoch": 3.9470598294774386, "grad_norm": 0.00015110817912500352, "learning_rate": 2.582429908496003e-07, "loss": 0.0, "num_input_tokens_seen": 108891024, "step": 161565 }, { "epoch": 3.947181980309286, "grad_norm": 0.0010867511155083776, "learning_rate": 2.581858005278442e-07, "loss": 0.0, "num_input_tokens_seen": 108894736, "step": 161570 }, { "epoch": 3.947304131141133, "grad_norm": 0.02226412296295166, "learning_rate": 2.581286156007602e-07, "loss": 0.0, "num_input_tokens_seen": 108898192, "step": 161575 }, { "epoch": 3.94742628197298, "grad_norm": 6.201533687999472e-05, "learning_rate": 2.5807143606876436e-07, "loss": 0.0, "num_input_tokens_seen": 108902032, "step": 161580 }, { "epoch": 3.9475484328048274, "grad_norm": 0.0005853096372447908, "learning_rate": 2.5801426193227296e-07, "loss": 0.0002, "num_input_tokens_seen": 108905552, "step": 161585 }, { "epoch": 3.9476705836366746, "grad_norm": 0.054668866097927094, "learning_rate": 2.5795709319170114e-07, "loss": 0.0, "num_input_tokens_seen": 108908560, "step": 161590 }, { "epoch": 3.9477927344685217, "grad_norm": 0.008940774016082287, "learning_rate": 2.578999298474651e-07, "loss": 0.0875, "num_input_tokens_seen": 108911440, "step": 161595 }, { "epoch": 3.947914885300369, "grad_norm": 0.002913819393143058, "learning_rate": 2.5784277189998016e-07, "loss": 0.0001, "num_input_tokens_seen": 108914960, "step": 161600 }, { "epoch": 3.948037036132216, "grad_norm": 11.425198554992676, "learning_rate": 2.577856193496625e-07, "loss": 0.0019, "num_input_tokens_seen": 108918288, "step": 161605 }, { "epoch": 3.9481591869640633, "grad_norm": 0.004938581492751837, "learning_rate": 2.577284721969274e-07, "loss": 0.0, "num_input_tokens_seen": 108921808, "step": 161610 }, { "epoch": 3.9482813377959105, "grad_norm": 0.46004217863082886, "learning_rate": 2.576713304421902e-07, "loss": 0.0001, "num_input_tokens_seen": 108924752, "step": 161615 }, { "epoch": 3.9484034886277577, "grad_norm": 0.008259247988462448, "learning_rate": 2.57614194085867e-07, "loss": 0.0001, "num_input_tokens_seen": 108928208, "step": 161620 }, { "epoch": 3.948525639459605, "grad_norm": 0.00015825018635950983, "learning_rate": 2.575570631283729e-07, "loss": 0.0, "num_input_tokens_seen": 108931472, "step": 161625 }, { "epoch": 3.948647790291452, "grad_norm": 0.02835937775671482, "learning_rate": 2.574999375701238e-07, "loss": 0.0546, "num_input_tokens_seen": 108934736, "step": 161630 }, { "epoch": 3.948769941123299, "grad_norm": 45.61603546142578, "learning_rate": 2.574428174115345e-07, "loss": 0.0978, "num_input_tokens_seen": 108938128, "step": 161635 }, { "epoch": 3.9488920919551465, "grad_norm": 0.026648221537470818, "learning_rate": 2.573857026530211e-07, "loss": 0.0001, "num_input_tokens_seen": 108941712, "step": 161640 }, { "epoch": 3.949014242786993, "grad_norm": 0.0013561674859374762, "learning_rate": 2.5732859329499825e-07, "loss": 0.0, "num_input_tokens_seen": 108945808, "step": 161645 }, { "epoch": 3.9491363936188404, "grad_norm": 0.003145823022350669, "learning_rate": 2.572714893378817e-07, "loss": 0.0, "num_input_tokens_seen": 108950160, "step": 161650 }, { "epoch": 3.9492585444506876, "grad_norm": 24.384469985961914, "learning_rate": 2.5721439078208686e-07, "loss": 0.0288, "num_input_tokens_seen": 108953744, "step": 161655 }, { "epoch": 3.9493806952825348, "grad_norm": 223.1640167236328, "learning_rate": 2.571572976280285e-07, "loss": 0.075, "num_input_tokens_seen": 108956752, "step": 161660 }, { "epoch": 3.949502846114382, "grad_norm": 0.299441397190094, "learning_rate": 2.5710020987612234e-07, "loss": 0.0466, "num_input_tokens_seen": 108959760, "step": 161665 }, { "epoch": 3.949624996946229, "grad_norm": 0.007449428550899029, "learning_rate": 2.57043127526783e-07, "loss": 0.0, "num_input_tokens_seen": 108963088, "step": 161670 }, { "epoch": 3.9497471477780763, "grad_norm": 0.3708611726760864, "learning_rate": 2.569860505804259e-07, "loss": 0.0002, "num_input_tokens_seen": 108966544, "step": 161675 }, { "epoch": 3.9498692986099235, "grad_norm": 0.01345105655491352, "learning_rate": 2.5692897903746635e-07, "loss": 0.0003, "num_input_tokens_seen": 108969552, "step": 161680 }, { "epoch": 3.9499914494417707, "grad_norm": 0.4047967791557312, "learning_rate": 2.568719128983189e-07, "loss": 0.0001, "num_input_tokens_seen": 108973008, "step": 161685 }, { "epoch": 3.950113600273618, "grad_norm": 0.002885033143684268, "learning_rate": 2.5681485216339907e-07, "loss": 0.0, "num_input_tokens_seen": 108976208, "step": 161690 }, { "epoch": 3.950235751105465, "grad_norm": 0.0001398777967551723, "learning_rate": 2.5675779683312115e-07, "loss": 0.0001, "num_input_tokens_seen": 108979472, "step": 161695 }, { "epoch": 3.9503579019373123, "grad_norm": 0.0028743662405759096, "learning_rate": 2.5670074690790065e-07, "loss": 0.0, "num_input_tokens_seen": 108982800, "step": 161700 }, { "epoch": 3.9504800527691595, "grad_norm": 0.0034815717954188585, "learning_rate": 2.5664370238815214e-07, "loss": 0.0, "num_input_tokens_seen": 108986192, "step": 161705 }, { "epoch": 3.9506022036010067, "grad_norm": 0.00020252345711924136, "learning_rate": 2.565866632742908e-07, "loss": 0.0, "num_input_tokens_seen": 108989072, "step": 161710 }, { "epoch": 3.950724354432854, "grad_norm": 0.002388434950262308, "learning_rate": 2.5652962956673086e-07, "loss": 0.0, "num_input_tokens_seen": 108992656, "step": 161715 }, { "epoch": 3.9508465052647006, "grad_norm": 0.15899832546710968, "learning_rate": 2.5647260126588775e-07, "loss": 0.0001, "num_input_tokens_seen": 108995536, "step": 161720 }, { "epoch": 3.9509686560965482, "grad_norm": 0.0023573962971568108, "learning_rate": 2.5641557837217586e-07, "loss": 0.0, "num_input_tokens_seen": 108999312, "step": 161725 }, { "epoch": 3.951090806928395, "grad_norm": 0.00032487220596522093, "learning_rate": 2.563585608860096e-07, "loss": 0.1403, "num_input_tokens_seen": 109002576, "step": 161730 }, { "epoch": 3.9512129577602426, "grad_norm": 32.855960845947266, "learning_rate": 2.563015488078039e-07, "loss": 0.0348, "num_input_tokens_seen": 109005776, "step": 161735 }, { "epoch": 3.9513351085920894, "grad_norm": 0.05380634963512421, "learning_rate": 2.5624454213797366e-07, "loss": 0.0001, "num_input_tokens_seen": 109008912, "step": 161740 }, { "epoch": 3.9514572594239366, "grad_norm": 0.0002731550484895706, "learning_rate": 2.5618754087693283e-07, "loss": 0.0, "num_input_tokens_seen": 109012240, "step": 161745 }, { "epoch": 3.9515794102557837, "grad_norm": 0.08828431367874146, "learning_rate": 2.5613054502509655e-07, "loss": 0.0001, "num_input_tokens_seen": 109015504, "step": 161750 }, { "epoch": 3.951701561087631, "grad_norm": 0.04316204786300659, "learning_rate": 2.560735545828787e-07, "loss": 0.0, "num_input_tokens_seen": 109018704, "step": 161755 }, { "epoch": 3.951823711919478, "grad_norm": 0.0059670270420610905, "learning_rate": 2.560165695506945e-07, "loss": 0.0002, "num_input_tokens_seen": 109022352, "step": 161760 }, { "epoch": 3.9519458627513253, "grad_norm": 0.0006971214897930622, "learning_rate": 2.559595899289575e-07, "loss": 0.0005, "num_input_tokens_seen": 109026192, "step": 161765 }, { "epoch": 3.9520680135831725, "grad_norm": 0.00021715546608902514, "learning_rate": 2.5590261571808247e-07, "loss": 0.0, "num_input_tokens_seen": 109029520, "step": 161770 }, { "epoch": 3.9521901644150197, "grad_norm": 38.267738342285156, "learning_rate": 2.558456469184841e-07, "loss": 0.0542, "num_input_tokens_seen": 109033168, "step": 161775 }, { "epoch": 3.952312315246867, "grad_norm": 0.002327230293303728, "learning_rate": 2.55788683530576e-07, "loss": 0.0, "num_input_tokens_seen": 109037200, "step": 161780 }, { "epoch": 3.952434466078714, "grad_norm": 0.00969710759818554, "learning_rate": 2.5573172555477316e-07, "loss": 0.0392, "num_input_tokens_seen": 109040400, "step": 161785 }, { "epoch": 3.9525566169105613, "grad_norm": 0.023102683946490288, "learning_rate": 2.55674772991489e-07, "loss": 0.0001, "num_input_tokens_seen": 109043664, "step": 161790 }, { "epoch": 3.9526787677424084, "grad_norm": 0.0002032756310654804, "learning_rate": 2.5561782584113845e-07, "loss": 0.0, "num_input_tokens_seen": 109046928, "step": 161795 }, { "epoch": 3.9528009185742556, "grad_norm": 0.40158918499946594, "learning_rate": 2.55560884104135e-07, "loss": 0.0001, "num_input_tokens_seen": 109049744, "step": 161800 }, { "epoch": 3.9529230694061024, "grad_norm": 0.0035159934777766466, "learning_rate": 2.555039477808929e-07, "loss": 0.0, "num_input_tokens_seen": 109053264, "step": 161805 }, { "epoch": 3.95304522023795, "grad_norm": 0.0007293214439414442, "learning_rate": 2.5544701687182677e-07, "loss": 0.0, "num_input_tokens_seen": 109056272, "step": 161810 }, { "epoch": 3.9531673710697968, "grad_norm": 0.0006925832713022828, "learning_rate": 2.5539009137735013e-07, "loss": 0.0, "num_input_tokens_seen": 109059536, "step": 161815 }, { "epoch": 3.9532895219016444, "grad_norm": 0.48797106742858887, "learning_rate": 2.553331712978768e-07, "loss": 0.0003, "num_input_tokens_seen": 109062800, "step": 161820 }, { "epoch": 3.953411672733491, "grad_norm": 0.0024608115199953318, "learning_rate": 2.552762566338211e-07, "loss": 0.0, "num_input_tokens_seen": 109066128, "step": 161825 }, { "epoch": 3.9535338235653383, "grad_norm": 0.0036889470648020506, "learning_rate": 2.552193473855966e-07, "loss": 0.0, "num_input_tokens_seen": 109069840, "step": 161830 }, { "epoch": 3.9536559743971855, "grad_norm": 0.0005082987481728196, "learning_rate": 2.551624435536176e-07, "loss": 0.0, "num_input_tokens_seen": 109073168, "step": 161835 }, { "epoch": 3.9537781252290327, "grad_norm": 65.21703338623047, "learning_rate": 2.551055451382973e-07, "loss": 0.0184, "num_input_tokens_seen": 109076240, "step": 161840 }, { "epoch": 3.95390027606088, "grad_norm": 0.0011391066946089268, "learning_rate": 2.550486521400501e-07, "loss": 0.0, "num_input_tokens_seen": 109080016, "step": 161845 }, { "epoch": 3.954022426892727, "grad_norm": 0.009924840182065964, "learning_rate": 2.5499176455928927e-07, "loss": 0.0005, "num_input_tokens_seen": 109083536, "step": 161850 }, { "epoch": 3.9541445777245743, "grad_norm": 0.0007699128473177552, "learning_rate": 2.5493488239642904e-07, "loss": 0.0354, "num_input_tokens_seen": 109087184, "step": 161855 }, { "epoch": 3.9542667285564215, "grad_norm": 0.040427979081869125, "learning_rate": 2.5487800565188236e-07, "loss": 0.0001, "num_input_tokens_seen": 109090640, "step": 161860 }, { "epoch": 3.9543888793882687, "grad_norm": 0.001991491997614503, "learning_rate": 2.548211343260632e-07, "loss": 0.0554, "num_input_tokens_seen": 109093776, "step": 161865 }, { "epoch": 3.954511030220116, "grad_norm": 0.001988684758543968, "learning_rate": 2.5476426841938545e-07, "loss": 0.0, "num_input_tokens_seen": 109096976, "step": 161870 }, { "epoch": 3.954633181051963, "grad_norm": 0.012328723445534706, "learning_rate": 2.547074079322622e-07, "loss": 0.0, "num_input_tokens_seen": 109100560, "step": 161875 }, { "epoch": 3.9547553318838102, "grad_norm": 0.00014424313849303871, "learning_rate": 2.5465055286510737e-07, "loss": 0.0, "num_input_tokens_seen": 109103888, "step": 161880 }, { "epoch": 3.9548774827156574, "grad_norm": 0.030226293951272964, "learning_rate": 2.5459370321833396e-07, "loss": 0.0, "num_input_tokens_seen": 109107344, "step": 161885 }, { "epoch": 3.9549996335475046, "grad_norm": 0.009276174008846283, "learning_rate": 2.545368589923559e-07, "loss": 0.0, "num_input_tokens_seen": 109110544, "step": 161890 }, { "epoch": 3.955121784379352, "grad_norm": 0.00403105653822422, "learning_rate": 2.54480020187586e-07, "loss": 0.0, "num_input_tokens_seen": 109113680, "step": 161895 }, { "epoch": 3.9552439352111985, "grad_norm": 0.0010097883641719818, "learning_rate": 2.54423186804438e-07, "loss": 0.0, "num_input_tokens_seen": 109117008, "step": 161900 }, { "epoch": 3.955366086043046, "grad_norm": 0.0010946433758363128, "learning_rate": 2.5436635884332526e-07, "loss": 0.0, "num_input_tokens_seen": 109120336, "step": 161905 }, { "epoch": 3.955488236874893, "grad_norm": 0.012915721163153648, "learning_rate": 2.5430953630466067e-07, "loss": 0.0, "num_input_tokens_seen": 109123792, "step": 161910 }, { "epoch": 3.95561038770674, "grad_norm": 0.003288612002506852, "learning_rate": 2.54252719188858e-07, "loss": 0.0, "num_input_tokens_seen": 109127568, "step": 161915 }, { "epoch": 3.9557325385385873, "grad_norm": 0.00014995834499131888, "learning_rate": 2.5419590749633014e-07, "loss": 0.0527, "num_input_tokens_seen": 109130960, "step": 161920 }, { "epoch": 3.9558546893704345, "grad_norm": 0.01613342948257923, "learning_rate": 2.5413910122748996e-07, "loss": 0.0, "num_input_tokens_seen": 109134288, "step": 161925 }, { "epoch": 3.9559768402022817, "grad_norm": 0.001566616352647543, "learning_rate": 2.5408230038275115e-07, "loss": 0.0, "num_input_tokens_seen": 109137552, "step": 161930 }, { "epoch": 3.956098991034129, "grad_norm": 0.0040521277114748955, "learning_rate": 2.5402550496252616e-07, "loss": 0.0, "num_input_tokens_seen": 109141200, "step": 161935 }, { "epoch": 3.956221141865976, "grad_norm": 0.007688730955123901, "learning_rate": 2.539687149672287e-07, "loss": 0.0843, "num_input_tokens_seen": 109144272, "step": 161940 }, { "epoch": 3.9563432926978233, "grad_norm": 0.005946619436144829, "learning_rate": 2.53911930397271e-07, "loss": 0.0, "num_input_tokens_seen": 109147664, "step": 161945 }, { "epoch": 3.9564654435296704, "grad_norm": 0.0005307059618644416, "learning_rate": 2.538551512530668e-07, "loss": 0.0, "num_input_tokens_seen": 109151056, "step": 161950 }, { "epoch": 3.9565875943615176, "grad_norm": 0.001883355900645256, "learning_rate": 2.537983775350283e-07, "loss": 0.0001, "num_input_tokens_seen": 109154576, "step": 161955 }, { "epoch": 3.956709745193365, "grad_norm": 0.001277736620977521, "learning_rate": 2.5374160924356867e-07, "loss": 0.0002, "num_input_tokens_seen": 109157968, "step": 161960 }, { "epoch": 3.956831896025212, "grad_norm": 6.472727545769885e-05, "learning_rate": 2.5368484637910117e-07, "loss": 0.0, "num_input_tokens_seen": 109161360, "step": 161965 }, { "epoch": 3.956954046857059, "grad_norm": 0.006716002244502306, "learning_rate": 2.536280889420378e-07, "loss": 0.0, "num_input_tokens_seen": 109164944, "step": 161970 }, { "epoch": 3.9570761976889064, "grad_norm": 0.06781076639890671, "learning_rate": 2.535713369327921e-07, "loss": 0.0, "num_input_tokens_seen": 109168912, "step": 161975 }, { "epoch": 3.9571983485207536, "grad_norm": 0.015026175417006016, "learning_rate": 2.5351459035177604e-07, "loss": 0.0224, "num_input_tokens_seen": 109172048, "step": 161980 }, { "epoch": 3.9573204993526003, "grad_norm": 0.0010686632012948394, "learning_rate": 2.534578491994026e-07, "loss": 0.0, "num_input_tokens_seen": 109175184, "step": 161985 }, { "epoch": 3.957442650184448, "grad_norm": 0.17090198397636414, "learning_rate": 2.534011134760848e-07, "loss": 0.0001, "num_input_tokens_seen": 109178384, "step": 161990 }, { "epoch": 3.9575648010162947, "grad_norm": 0.01744556427001953, "learning_rate": 2.533443831822347e-07, "loss": 0.0, "num_input_tokens_seen": 109181648, "step": 161995 }, { "epoch": 3.9576869518481423, "grad_norm": 0.0013410678366199136, "learning_rate": 2.5328765831826537e-07, "loss": 0.0, "num_input_tokens_seen": 109185040, "step": 162000 }, { "epoch": 3.957809102679989, "grad_norm": 0.002465799218043685, "learning_rate": 2.532309388845887e-07, "loss": 0.0, "num_input_tokens_seen": 109188752, "step": 162005 }, { "epoch": 3.9579312535118363, "grad_norm": 0.0010966339614242315, "learning_rate": 2.531742248816178e-07, "loss": 0.0, "num_input_tokens_seen": 109192656, "step": 162010 }, { "epoch": 3.9580534043436835, "grad_norm": 0.06205965578556061, "learning_rate": 2.531175163097645e-07, "loss": 0.0001, "num_input_tokens_seen": 109196048, "step": 162015 }, { "epoch": 3.9581755551755307, "grad_norm": 0.07128459960222244, "learning_rate": 2.5306081316944185e-07, "loss": 0.0, "num_input_tokens_seen": 109199120, "step": 162020 }, { "epoch": 3.958297706007378, "grad_norm": 0.0008230642415583134, "learning_rate": 2.530041154610615e-07, "loss": 0.0, "num_input_tokens_seen": 109202448, "step": 162025 }, { "epoch": 3.958419856839225, "grad_norm": 0.0010746166808530688, "learning_rate": 2.529474231850365e-07, "loss": 0.0, "num_input_tokens_seen": 109205456, "step": 162030 }, { "epoch": 3.9585420076710722, "grad_norm": 0.0005239786696620286, "learning_rate": 2.528907363417787e-07, "loss": 0.0237, "num_input_tokens_seen": 109208208, "step": 162035 }, { "epoch": 3.9586641585029194, "grad_norm": 0.0008065864094533026, "learning_rate": 2.528340549317002e-07, "loss": 0.0, "num_input_tokens_seen": 109211792, "step": 162040 }, { "epoch": 3.9587863093347666, "grad_norm": 0.0016470836708322167, "learning_rate": 2.5277737895521365e-07, "loss": 0.0001, "num_input_tokens_seen": 109214992, "step": 162045 }, { "epoch": 3.958908460166614, "grad_norm": 8.174134563887492e-05, "learning_rate": 2.5272070841273076e-07, "loss": 0.0, "num_input_tokens_seen": 109218192, "step": 162050 }, { "epoch": 3.959030610998461, "grad_norm": 0.004978245124220848, "learning_rate": 2.526640433046638e-07, "loss": 0.0, "num_input_tokens_seen": 109221328, "step": 162055 }, { "epoch": 3.959152761830308, "grad_norm": 0.0015048523200675845, "learning_rate": 2.526073836314252e-07, "loss": 0.0, "num_input_tokens_seen": 109224656, "step": 162060 }, { "epoch": 3.9592749126621554, "grad_norm": 0.009316632524132729, "learning_rate": 2.525507293934265e-07, "loss": 0.0, "num_input_tokens_seen": 109227920, "step": 162065 }, { "epoch": 3.9593970634940026, "grad_norm": 0.0022315524984151125, "learning_rate": 2.524940805910802e-07, "loss": 0.0, "num_input_tokens_seen": 109231632, "step": 162070 }, { "epoch": 3.9595192143258497, "grad_norm": 0.0013435684377327561, "learning_rate": 2.524374372247977e-07, "loss": 0.0, "num_input_tokens_seen": 109234832, "step": 162075 }, { "epoch": 3.9596413651576965, "grad_norm": 0.0003065931668970734, "learning_rate": 2.523807992949912e-07, "loss": 0.0576, "num_input_tokens_seen": 109237968, "step": 162080 }, { "epoch": 3.959763515989544, "grad_norm": 0.06455646455287933, "learning_rate": 2.52324166802073e-07, "loss": 0.0001, "num_input_tokens_seen": 109241232, "step": 162085 }, { "epoch": 3.959885666821391, "grad_norm": 0.0016405819915235043, "learning_rate": 2.5226753974645423e-07, "loss": 0.0824, "num_input_tokens_seen": 109244496, "step": 162090 }, { "epoch": 3.960007817653238, "grad_norm": 0.002664495026692748, "learning_rate": 2.522109181285473e-07, "loss": 0.0, "num_input_tokens_seen": 109247696, "step": 162095 }, { "epoch": 3.9601299684850853, "grad_norm": 0.0009869072819128633, "learning_rate": 2.5215430194876343e-07, "loss": 0.0, "num_input_tokens_seen": 109251152, "step": 162100 }, { "epoch": 3.9602521193169324, "grad_norm": 0.0019497988978400826, "learning_rate": 2.520976912075149e-07, "loss": 0.0, "num_input_tokens_seen": 109254608, "step": 162105 }, { "epoch": 3.9603742701487796, "grad_norm": 0.24836663901805878, "learning_rate": 2.5204108590521277e-07, "loss": 0.0003, "num_input_tokens_seen": 109258000, "step": 162110 }, { "epoch": 3.960496420980627, "grad_norm": 0.0007280535064637661, "learning_rate": 2.519844860422692e-07, "loss": 0.0513, "num_input_tokens_seen": 109261264, "step": 162115 }, { "epoch": 3.960618571812474, "grad_norm": 0.001604035496711731, "learning_rate": 2.519278916190958e-07, "loss": 0.0, "num_input_tokens_seen": 109264848, "step": 162120 }, { "epoch": 3.960740722644321, "grad_norm": 0.0037682605907320976, "learning_rate": 2.51871302636104e-07, "loss": 0.0, "num_input_tokens_seen": 109268048, "step": 162125 }, { "epoch": 3.9608628734761684, "grad_norm": 0.0007140697562135756, "learning_rate": 2.51814719093705e-07, "loss": 0.0, "num_input_tokens_seen": 109271504, "step": 162130 }, { "epoch": 3.9609850243080156, "grad_norm": 0.006422894541174173, "learning_rate": 2.5175814099231096e-07, "loss": 0.0, "num_input_tokens_seen": 109274640, "step": 162135 }, { "epoch": 3.9611071751398628, "grad_norm": 0.061705734580755234, "learning_rate": 2.5170156833233256e-07, "loss": 0.0001, "num_input_tokens_seen": 109278096, "step": 162140 }, { "epoch": 3.96122932597171, "grad_norm": 0.0003613313310779631, "learning_rate": 2.516450011141821e-07, "loss": 0.0019, "num_input_tokens_seen": 109282000, "step": 162145 }, { "epoch": 3.961351476803557, "grad_norm": 0.0009361098054796457, "learning_rate": 2.5158843933827e-07, "loss": 0.0001, "num_input_tokens_seen": 109285648, "step": 162150 }, { "epoch": 3.9614736276354043, "grad_norm": 0.18613314628601074, "learning_rate": 2.515318830050085e-07, "loss": 0.0001, "num_input_tokens_seen": 109289040, "step": 162155 }, { "epoch": 3.9615957784672515, "grad_norm": 0.005279130302369595, "learning_rate": 2.514753321148081e-07, "loss": 0.0, "num_input_tokens_seen": 109292432, "step": 162160 }, { "epoch": 3.9617179292990983, "grad_norm": 0.0038733906112611294, "learning_rate": 2.514187866680807e-07, "loss": 0.0, "num_input_tokens_seen": 109295824, "step": 162165 }, { "epoch": 3.961840080130946, "grad_norm": 0.0029186783358454704, "learning_rate": 2.5136224666523696e-07, "loss": 0.0, "num_input_tokens_seen": 109298832, "step": 162170 }, { "epoch": 3.9619622309627927, "grad_norm": 0.0011153332889080048, "learning_rate": 2.5130571210668825e-07, "loss": 0.0571, "num_input_tokens_seen": 109301968, "step": 162175 }, { "epoch": 3.9620843817946403, "grad_norm": 0.0011105951853096485, "learning_rate": 2.5124918299284615e-07, "loss": 0.0, "num_input_tokens_seen": 109305360, "step": 162180 }, { "epoch": 3.962206532626487, "grad_norm": 0.0008032761397771537, "learning_rate": 2.5119265932412105e-07, "loss": 0.0, "num_input_tokens_seen": 109308624, "step": 162185 }, { "epoch": 3.962328683458334, "grad_norm": 0.0009347455343231559, "learning_rate": 2.511361411009246e-07, "loss": 0.0, "num_input_tokens_seen": 109311824, "step": 162190 }, { "epoch": 3.9624508342901814, "grad_norm": 0.0006021481240168214, "learning_rate": 2.5107962832366735e-07, "loss": 0.0, "num_input_tokens_seen": 109315408, "step": 162195 }, { "epoch": 3.9625729851220286, "grad_norm": 0.00013758940622210503, "learning_rate": 2.510231209927608e-07, "loss": 0.0, "num_input_tokens_seen": 109318864, "step": 162200 }, { "epoch": 3.962695135953876, "grad_norm": 5.944971561431885, "learning_rate": 2.509666191086152e-07, "loss": 0.0013, "num_input_tokens_seen": 109322000, "step": 162205 }, { "epoch": 3.962817286785723, "grad_norm": 0.09964948892593384, "learning_rate": 2.509101226716418e-07, "loss": 0.0598, "num_input_tokens_seen": 109325200, "step": 162210 }, { "epoch": 3.96293943761757, "grad_norm": 0.000213745137443766, "learning_rate": 2.5085363168225173e-07, "loss": 0.0, "num_input_tokens_seen": 109329168, "step": 162215 }, { "epoch": 3.9630615884494174, "grad_norm": 0.001257982337847352, "learning_rate": 2.5079714614085535e-07, "loss": 0.0583, "num_input_tokens_seen": 109332880, "step": 162220 }, { "epoch": 3.9631837392812646, "grad_norm": 0.0010682143038138747, "learning_rate": 2.5074066604786383e-07, "loss": 0.0001, "num_input_tokens_seen": 109335952, "step": 162225 }, { "epoch": 3.9633058901131117, "grad_norm": 0.0014836126938462257, "learning_rate": 2.506841914036878e-07, "loss": 0.0, "num_input_tokens_seen": 109338960, "step": 162230 }, { "epoch": 3.963428040944959, "grad_norm": 0.004185094032436609, "learning_rate": 2.506277222087375e-07, "loss": 0.1063, "num_input_tokens_seen": 109342288, "step": 162235 }, { "epoch": 3.963550191776806, "grad_norm": 0.0005608157953247428, "learning_rate": 2.505712584634243e-07, "loss": 0.0, "num_input_tokens_seen": 109345680, "step": 162240 }, { "epoch": 3.9636723426086533, "grad_norm": 0.009780601598322392, "learning_rate": 2.505148001681582e-07, "loss": 0.0, "num_input_tokens_seen": 109349648, "step": 162245 }, { "epoch": 3.9637944934405, "grad_norm": 0.00024733677855692804, "learning_rate": 2.5045834732335024e-07, "loss": 0.0, "num_input_tokens_seen": 109353360, "step": 162250 }, { "epoch": 3.9639166442723477, "grad_norm": 0.1472015380859375, "learning_rate": 2.5040189992941063e-07, "loss": 0.0456, "num_input_tokens_seen": 109356368, "step": 162255 }, { "epoch": 3.9640387951041944, "grad_norm": 0.11164890229701996, "learning_rate": 2.5034545798675024e-07, "loss": 0.0, "num_input_tokens_seen": 109359504, "step": 162260 }, { "epoch": 3.964160945936042, "grad_norm": 0.004264294635504484, "learning_rate": 2.50289021495779e-07, "loss": 0.0, "num_input_tokens_seen": 109362960, "step": 162265 }, { "epoch": 3.964283096767889, "grad_norm": 0.005529699381440878, "learning_rate": 2.502325904569077e-07, "loss": 0.0, "num_input_tokens_seen": 109366160, "step": 162270 }, { "epoch": 3.964405247599736, "grad_norm": 0.09107805043458939, "learning_rate": 2.5017616487054694e-07, "loss": 0.0001, "num_input_tokens_seen": 109369488, "step": 162275 }, { "epoch": 3.964527398431583, "grad_norm": 0.000815638224594295, "learning_rate": 2.501197447371065e-07, "loss": 0.0, "num_input_tokens_seen": 109372624, "step": 162280 }, { "epoch": 3.9646495492634304, "grad_norm": 0.0033097947016358376, "learning_rate": 2.5006333005699734e-07, "loss": 0.0, "num_input_tokens_seen": 109375824, "step": 162285 }, { "epoch": 3.9647717000952776, "grad_norm": 0.01763514243066311, "learning_rate": 2.5000692083062893e-07, "loss": 0.0849, "num_input_tokens_seen": 109379280, "step": 162290 }, { "epoch": 3.9648938509271248, "grad_norm": 0.00030953448731452227, "learning_rate": 2.49950517058412e-07, "loss": 0.0, "num_input_tokens_seen": 109382608, "step": 162295 }, { "epoch": 3.965016001758972, "grad_norm": 0.002793027088046074, "learning_rate": 2.498941187407568e-07, "loss": 0.0, "num_input_tokens_seen": 109386128, "step": 162300 }, { "epoch": 3.965138152590819, "grad_norm": 0.006183539051562548, "learning_rate": 2.498377258780732e-07, "loss": 0.0407, "num_input_tokens_seen": 109389200, "step": 162305 }, { "epoch": 3.9652603034226663, "grad_norm": 0.021969813853502274, "learning_rate": 2.4978133847077163e-07, "loss": 0.0353, "num_input_tokens_seen": 109392272, "step": 162310 }, { "epoch": 3.9653824542545135, "grad_norm": 0.306232213973999, "learning_rate": 2.497249565192617e-07, "loss": 0.0002, "num_input_tokens_seen": 109395792, "step": 162315 }, { "epoch": 3.9655046050863607, "grad_norm": 0.004216625355184078, "learning_rate": 2.4966858002395396e-07, "loss": 0.0, "num_input_tokens_seen": 109398672, "step": 162320 }, { "epoch": 3.965626755918208, "grad_norm": 0.0067145368084311485, "learning_rate": 2.496122089852578e-07, "loss": 0.0, "num_input_tokens_seen": 109402000, "step": 162325 }, { "epoch": 3.965748906750055, "grad_norm": 0.028485514223575592, "learning_rate": 2.495558434035838e-07, "loss": 0.0, "num_input_tokens_seen": 109405584, "step": 162330 }, { "epoch": 3.9658710575819023, "grad_norm": 0.0013114007888361812, "learning_rate": 2.4949948327934134e-07, "loss": 0.0, "num_input_tokens_seen": 109408592, "step": 162335 }, { "epoch": 3.9659932084137495, "grad_norm": 0.0038664492312818766, "learning_rate": 2.494431286129407e-07, "loss": 0.0001, "num_input_tokens_seen": 109411984, "step": 162340 }, { "epoch": 3.966115359245596, "grad_norm": 16.34613800048828, "learning_rate": 2.493867794047916e-07, "loss": 0.075, "num_input_tokens_seen": 109415248, "step": 162345 }, { "epoch": 3.966237510077444, "grad_norm": 0.0032380574848502874, "learning_rate": 2.493304356553033e-07, "loss": 0.0, "num_input_tokens_seen": 109419280, "step": 162350 }, { "epoch": 3.9663596609092906, "grad_norm": 0.002148458966985345, "learning_rate": 2.492740973648864e-07, "loss": 0.0, "num_input_tokens_seen": 109422480, "step": 162355 }, { "epoch": 3.9664818117411382, "grad_norm": 0.005396352615207434, "learning_rate": 2.492177645339497e-07, "loss": 0.0, "num_input_tokens_seen": 109425808, "step": 162360 }, { "epoch": 3.966603962572985, "grad_norm": 0.0016786479391157627, "learning_rate": 2.491614371629035e-07, "loss": 0.0002, "num_input_tokens_seen": 109428816, "step": 162365 }, { "epoch": 3.966726113404832, "grad_norm": 0.0009584561921656132, "learning_rate": 2.491051152521576e-07, "loss": 0.0725, "num_input_tokens_seen": 109431952, "step": 162370 }, { "epoch": 3.9668482642366794, "grad_norm": 0.06761808693408966, "learning_rate": 2.4904879880212094e-07, "loss": 0.0, "num_input_tokens_seen": 109435216, "step": 162375 }, { "epoch": 3.9669704150685265, "grad_norm": 0.0004407799569889903, "learning_rate": 2.489924878132036e-07, "loss": 0.0001, "num_input_tokens_seen": 109438544, "step": 162380 }, { "epoch": 3.9670925659003737, "grad_norm": 0.013496466912329197, "learning_rate": 2.489361822858147e-07, "loss": 0.0006, "num_input_tokens_seen": 109442192, "step": 162385 }, { "epoch": 3.967214716732221, "grad_norm": 0.01726139336824417, "learning_rate": 2.488798822203638e-07, "loss": 0.0008, "num_input_tokens_seen": 109445456, "step": 162390 }, { "epoch": 3.967336867564068, "grad_norm": 0.03320024162530899, "learning_rate": 2.488235876172609e-07, "loss": 0.0001, "num_input_tokens_seen": 109448656, "step": 162395 }, { "epoch": 3.9674590183959153, "grad_norm": 0.0007737289415672421, "learning_rate": 2.4876729847691445e-07, "loss": 0.0, "num_input_tokens_seen": 109452176, "step": 162400 }, { "epoch": 3.9675811692277625, "grad_norm": 0.011182458139955997, "learning_rate": 2.4871101479973456e-07, "loss": 0.0, "num_input_tokens_seen": 109455312, "step": 162405 }, { "epoch": 3.9677033200596097, "grad_norm": 0.006535296328365803, "learning_rate": 2.4865473658613e-07, "loss": 0.0001, "num_input_tokens_seen": 109458832, "step": 162410 }, { "epoch": 3.967825470891457, "grad_norm": 0.011599404737353325, "learning_rate": 2.485984638365106e-07, "loss": 0.0, "num_input_tokens_seen": 109462096, "step": 162415 }, { "epoch": 3.967947621723304, "grad_norm": 0.02636815793812275, "learning_rate": 2.4854219655128493e-07, "loss": 0.0, "num_input_tokens_seen": 109465808, "step": 162420 }, { "epoch": 3.9680697725551513, "grad_norm": 54.23196029663086, "learning_rate": 2.4848593473086253e-07, "loss": 0.0444, "num_input_tokens_seen": 109469456, "step": 162425 }, { "epoch": 3.968191923386998, "grad_norm": 0.002171638421714306, "learning_rate": 2.4842967837565287e-07, "loss": 0.0, "num_input_tokens_seen": 109473488, "step": 162430 }, { "epoch": 3.9683140742188456, "grad_norm": 0.011168444529175758, "learning_rate": 2.483734274860647e-07, "loss": 0.0, "num_input_tokens_seen": 109476752, "step": 162435 }, { "epoch": 3.9684362250506924, "grad_norm": 0.0005473028868436813, "learning_rate": 2.4831718206250694e-07, "loss": 0.0003, "num_input_tokens_seen": 109480080, "step": 162440 }, { "epoch": 3.96855837588254, "grad_norm": 0.00041105563286691904, "learning_rate": 2.4826094210538895e-07, "loss": 0.0002, "num_input_tokens_seen": 109483344, "step": 162445 }, { "epoch": 3.9686805267143868, "grad_norm": 0.00028596349875442684, "learning_rate": 2.482047076151197e-07, "loss": 0.0001, "num_input_tokens_seen": 109486224, "step": 162450 }, { "epoch": 3.968802677546234, "grad_norm": 0.12528088688850403, "learning_rate": 2.4814847859210763e-07, "loss": 0.0, "num_input_tokens_seen": 109489232, "step": 162455 }, { "epoch": 3.968924828378081, "grad_norm": 0.0037132389843463898, "learning_rate": 2.480922550367621e-07, "loss": 0.0, "num_input_tokens_seen": 109492368, "step": 162460 }, { "epoch": 3.9690469792099283, "grad_norm": 0.005314487498253584, "learning_rate": 2.480360369494923e-07, "loss": 0.0, "num_input_tokens_seen": 109495440, "step": 162465 }, { "epoch": 3.9691691300417755, "grad_norm": 0.02827450819313526, "learning_rate": 2.479798243307063e-07, "loss": 0.0, "num_input_tokens_seen": 109498640, "step": 162470 }, { "epoch": 3.9692912808736227, "grad_norm": 0.023080473765730858, "learning_rate": 2.479236171808137e-07, "loss": 0.0, "num_input_tokens_seen": 109502288, "step": 162475 }, { "epoch": 3.96941343170547, "grad_norm": 0.019256386905908585, "learning_rate": 2.478674155002224e-07, "loss": 0.0001, "num_input_tokens_seen": 109506128, "step": 162480 }, { "epoch": 3.969535582537317, "grad_norm": 0.0003879503929056227, "learning_rate": 2.4781121928934155e-07, "loss": 0.0, "num_input_tokens_seen": 109509968, "step": 162485 }, { "epoch": 3.9696577333691643, "grad_norm": 0.002552243648096919, "learning_rate": 2.477550285485802e-07, "loss": 0.1131, "num_input_tokens_seen": 109513488, "step": 162490 }, { "epoch": 3.9697798842010115, "grad_norm": 0.09655870497226715, "learning_rate": 2.476988432783463e-07, "loss": 0.0001, "num_input_tokens_seen": 109517200, "step": 162495 }, { "epoch": 3.9699020350328587, "grad_norm": 0.021147755905985832, "learning_rate": 2.4764266347904905e-07, "loss": 0.0, "num_input_tokens_seen": 109520400, "step": 162500 }, { "epoch": 3.970024185864706, "grad_norm": 0.0016340258298441768, "learning_rate": 2.4758648915109636e-07, "loss": 0.0393, "num_input_tokens_seen": 109524048, "step": 162505 }, { "epoch": 3.970146336696553, "grad_norm": 0.009415844455361366, "learning_rate": 2.4753032029489753e-07, "loss": 0.0, "num_input_tokens_seen": 109527376, "step": 162510 }, { "epoch": 3.9702684875284002, "grad_norm": 0.00024968807701952755, "learning_rate": 2.4747415691086013e-07, "loss": 0.0, "num_input_tokens_seen": 109530768, "step": 162515 }, { "epoch": 3.9703906383602474, "grad_norm": 0.009475484490394592, "learning_rate": 2.474179989993932e-07, "loss": 0.0001, "num_input_tokens_seen": 109534032, "step": 162520 }, { "epoch": 3.970512789192094, "grad_norm": 0.011886761523783207, "learning_rate": 2.473618465609053e-07, "loss": 0.0001, "num_input_tokens_seen": 109537296, "step": 162525 }, { "epoch": 3.970634940023942, "grad_norm": 0.011090297251939774, "learning_rate": 2.4730569959580416e-07, "loss": 0.0, "num_input_tokens_seen": 109540304, "step": 162530 }, { "epoch": 3.9707570908557885, "grad_norm": 0.00524521991610527, "learning_rate": 2.4724955810449865e-07, "loss": 0.0, "num_input_tokens_seen": 109543440, "step": 162535 }, { "epoch": 3.9708792416876357, "grad_norm": 0.05811067670583725, "learning_rate": 2.471934220873969e-07, "loss": 0.0, "num_input_tokens_seen": 109547152, "step": 162540 }, { "epoch": 3.971001392519483, "grad_norm": 0.0004150404129177332, "learning_rate": 2.471372915449067e-07, "loss": 0.0, "num_input_tokens_seen": 109550160, "step": 162545 }, { "epoch": 3.97112354335133, "grad_norm": 0.1513708084821701, "learning_rate": 2.4708116647743696e-07, "loss": 0.0001, "num_input_tokens_seen": 109553296, "step": 162550 }, { "epoch": 3.9712456941831773, "grad_norm": 0.003029992338269949, "learning_rate": 2.4702504688539516e-07, "loss": 0.0893, "num_input_tokens_seen": 109556688, "step": 162555 }, { "epoch": 3.9713678450150245, "grad_norm": 0.11696822941303253, "learning_rate": 2.469689327691901e-07, "loss": 0.0817, "num_input_tokens_seen": 109560336, "step": 162560 }, { "epoch": 3.9714899958468717, "grad_norm": 0.016360236331820488, "learning_rate": 2.4691282412922923e-07, "loss": 0.0, "num_input_tokens_seen": 109563792, "step": 162565 }, { "epoch": 3.971612146678719, "grad_norm": 0.0006904311594553292, "learning_rate": 2.4685672096592105e-07, "loss": 0.0, "num_input_tokens_seen": 109566992, "step": 162570 }, { "epoch": 3.971734297510566, "grad_norm": 0.0017873753095045686, "learning_rate": 2.468006232796731e-07, "loss": 0.0, "num_input_tokens_seen": 109570384, "step": 162575 }, { "epoch": 3.9718564483424132, "grad_norm": 0.003968059550970793, "learning_rate": 2.4674453107089356e-07, "loss": 0.0, "num_input_tokens_seen": 109573520, "step": 162580 }, { "epoch": 3.9719785991742604, "grad_norm": 1.919407606124878, "learning_rate": 2.4668844433999083e-07, "loss": 0.0015, "num_input_tokens_seen": 109576912, "step": 162585 }, { "epoch": 3.9721007500061076, "grad_norm": 0.01713302545249462, "learning_rate": 2.466323630873719e-07, "loss": 0.0, "num_input_tokens_seen": 109580240, "step": 162590 }, { "epoch": 3.972222900837955, "grad_norm": 0.013763492926955223, "learning_rate": 2.465762873134455e-07, "loss": 0.0, "num_input_tokens_seen": 109583440, "step": 162595 }, { "epoch": 3.972345051669802, "grad_norm": 0.0006270576850511134, "learning_rate": 2.465202170186186e-07, "loss": 0.0, "num_input_tokens_seen": 109587024, "step": 162600 }, { "epoch": 3.972467202501649, "grad_norm": 0.12661677598953247, "learning_rate": 2.4646415220329963e-07, "loss": 0.0001, "num_input_tokens_seen": 109590224, "step": 162605 }, { "epoch": 3.972589353333496, "grad_norm": 0.003163676941767335, "learning_rate": 2.4640809286789575e-07, "loss": 0.0, "num_input_tokens_seen": 109593232, "step": 162610 }, { "epoch": 3.9727115041653436, "grad_norm": 0.0038118986412882805, "learning_rate": 2.463520390128149e-07, "loss": 0.0, "num_input_tokens_seen": 109596944, "step": 162615 }, { "epoch": 3.9728336549971903, "grad_norm": 0.0014689149102196097, "learning_rate": 2.4629599063846494e-07, "loss": 0.0, "num_input_tokens_seen": 109600080, "step": 162620 }, { "epoch": 3.972955805829038, "grad_norm": 0.00022433280537370592, "learning_rate": 2.4623994774525313e-07, "loss": 0.0001, "num_input_tokens_seen": 109603216, "step": 162625 }, { "epoch": 3.9730779566608847, "grad_norm": 0.004913068376481533, "learning_rate": 2.461839103335873e-07, "loss": 0.0377, "num_input_tokens_seen": 109606352, "step": 162630 }, { "epoch": 3.973200107492732, "grad_norm": 0.0011740082409232855, "learning_rate": 2.461278784038747e-07, "loss": 0.0, "num_input_tokens_seen": 109609808, "step": 162635 }, { "epoch": 3.973322258324579, "grad_norm": 0.002981620142236352, "learning_rate": 2.4607185195652315e-07, "loss": 0.0, "num_input_tokens_seen": 109613200, "step": 162640 }, { "epoch": 3.9734444091564263, "grad_norm": 0.016984274610877037, "learning_rate": 2.460158309919396e-07, "loss": 0.0, "num_input_tokens_seen": 109616784, "step": 162645 }, { "epoch": 3.9735665599882735, "grad_norm": 0.0023610140196979046, "learning_rate": 2.4595981551053193e-07, "loss": 0.0, "num_input_tokens_seen": 109620240, "step": 162650 }, { "epoch": 3.9736887108201207, "grad_norm": 0.07518626004457474, "learning_rate": 2.459038055127073e-07, "loss": 0.0002, "num_input_tokens_seen": 109623440, "step": 162655 }, { "epoch": 3.973810861651968, "grad_norm": 0.0030677923932671547, "learning_rate": 2.458478009988728e-07, "loss": 0.0, "num_input_tokens_seen": 109626704, "step": 162660 }, { "epoch": 3.973933012483815, "grad_norm": 0.0016324043972417712, "learning_rate": 2.4579180196943614e-07, "loss": 0.0001, "num_input_tokens_seen": 109630416, "step": 162665 }, { "epoch": 3.974055163315662, "grad_norm": 0.006567919161170721, "learning_rate": 2.4573580842480424e-07, "loss": 0.0, "num_input_tokens_seen": 109634064, "step": 162670 }, { "epoch": 3.9741773141475094, "grad_norm": 0.0009271908784285188, "learning_rate": 2.456798203653843e-07, "loss": 0.0, "num_input_tokens_seen": 109637648, "step": 162675 }, { "epoch": 3.9742994649793566, "grad_norm": 74.43541717529297, "learning_rate": 2.456238377915839e-07, "loss": 0.0523, "num_input_tokens_seen": 109641104, "step": 162680 }, { "epoch": 3.974421615811204, "grad_norm": 0.0008278028690256178, "learning_rate": 2.4556786070380954e-07, "loss": 0.0, "num_input_tokens_seen": 109644944, "step": 162685 }, { "epoch": 3.974543766643051, "grad_norm": 0.0014180800644680858, "learning_rate": 2.455118891024689e-07, "loss": 0.0, "num_input_tokens_seen": 109648080, "step": 162690 }, { "epoch": 3.974665917474898, "grad_norm": 0.001689577242359519, "learning_rate": 2.454559229879685e-07, "loss": 0.0, "num_input_tokens_seen": 109651664, "step": 162695 }, { "epoch": 3.9747880683067454, "grad_norm": 0.015290713869035244, "learning_rate": 2.453999623607155e-07, "loss": 0.0, "num_input_tokens_seen": 109655120, "step": 162700 }, { "epoch": 3.974910219138592, "grad_norm": 0.3379508852958679, "learning_rate": 2.4534400722111724e-07, "loss": 0.0, "num_input_tokens_seen": 109658128, "step": 162705 }, { "epoch": 3.9750323699704397, "grad_norm": 0.02128562517464161, "learning_rate": 2.4528805756958004e-07, "loss": 0.0, "num_input_tokens_seen": 109661776, "step": 162710 }, { "epoch": 3.9751545208022865, "grad_norm": 0.022409426048398018, "learning_rate": 2.452321134065114e-07, "loss": 0.0, "num_input_tokens_seen": 109664976, "step": 162715 }, { "epoch": 3.9752766716341337, "grad_norm": 0.0011433304753154516, "learning_rate": 2.4517617473231755e-07, "loss": 0.0, "num_input_tokens_seen": 109668304, "step": 162720 }, { "epoch": 3.975398822465981, "grad_norm": 0.0014628785429522395, "learning_rate": 2.4512024154740594e-07, "loss": 0.0001, "num_input_tokens_seen": 109671120, "step": 162725 }, { "epoch": 3.975520973297828, "grad_norm": 0.0012373403878882527, "learning_rate": 2.450643138521826e-07, "loss": 0.0, "num_input_tokens_seen": 109674320, "step": 162730 }, { "epoch": 3.9756431241296752, "grad_norm": 0.010684294626116753, "learning_rate": 2.4500839164705464e-07, "loss": 0.0004, "num_input_tokens_seen": 109677776, "step": 162735 }, { "epoch": 3.9757652749615224, "grad_norm": 0.005231752991676331, "learning_rate": 2.4495247493242896e-07, "loss": 0.0, "num_input_tokens_seen": 109680976, "step": 162740 }, { "epoch": 3.9758874257933696, "grad_norm": 0.004223552066832781, "learning_rate": 2.4489656370871205e-07, "loss": 0.06, "num_input_tokens_seen": 109683920, "step": 162745 }, { "epoch": 3.976009576625217, "grad_norm": 0.004554894287139177, "learning_rate": 2.4484065797631015e-07, "loss": 0.0, "num_input_tokens_seen": 109687312, "step": 162750 }, { "epoch": 3.976131727457064, "grad_norm": 0.04216483607888222, "learning_rate": 2.447847577356303e-07, "loss": 0.0, "num_input_tokens_seen": 109690576, "step": 162755 }, { "epoch": 3.976253878288911, "grad_norm": 0.1104845255613327, "learning_rate": 2.4472886298707883e-07, "loss": 0.0001, "num_input_tokens_seen": 109693776, "step": 162760 }, { "epoch": 3.9763760291207584, "grad_norm": 0.000709182524587959, "learning_rate": 2.44672973731062e-07, "loss": 0.0, "num_input_tokens_seen": 109696912, "step": 162765 }, { "epoch": 3.9764981799526056, "grad_norm": 0.0018730240408331156, "learning_rate": 2.4461708996798634e-07, "loss": 0.0001, "num_input_tokens_seen": 109700304, "step": 162770 }, { "epoch": 3.9766203307844528, "grad_norm": 0.002656224649399519, "learning_rate": 2.445612116982588e-07, "loss": 0.0, "num_input_tokens_seen": 109703248, "step": 162775 }, { "epoch": 3.9767424816163, "grad_norm": 0.006554523948580027, "learning_rate": 2.44505338922285e-07, "loss": 0.0, "num_input_tokens_seen": 109706576, "step": 162780 }, { "epoch": 3.976864632448147, "grad_norm": 0.00015725352568551898, "learning_rate": 2.444494716404718e-07, "loss": 0.0004, "num_input_tokens_seen": 109709648, "step": 162785 }, { "epoch": 3.976986783279994, "grad_norm": 0.012902540154755116, "learning_rate": 2.4439360985322497e-07, "loss": 0.0, "num_input_tokens_seen": 109713040, "step": 162790 }, { "epoch": 3.9771089341118415, "grad_norm": 0.00813246052712202, "learning_rate": 2.443377535609511e-07, "loss": 0.0, "num_input_tokens_seen": 109716240, "step": 162795 }, { "epoch": 3.9772310849436883, "grad_norm": 0.00023855824838392437, "learning_rate": 2.4428190276405657e-07, "loss": 0.0, "num_input_tokens_seen": 109719504, "step": 162800 }, { "epoch": 3.977353235775536, "grad_norm": 0.002531233709305525, "learning_rate": 2.4422605746294713e-07, "loss": 0.0578, "num_input_tokens_seen": 109722768, "step": 162805 }, { "epoch": 3.9774753866073826, "grad_norm": 0.0015762082766741514, "learning_rate": 2.4417021765802923e-07, "loss": 0.0, "num_input_tokens_seen": 109725904, "step": 162810 }, { "epoch": 3.97759753743923, "grad_norm": 0.0006869949284009635, "learning_rate": 2.4411438334970856e-07, "loss": 0.0, "num_input_tokens_seen": 109729808, "step": 162815 }, { "epoch": 3.977719688271077, "grad_norm": 0.04206356033682823, "learning_rate": 2.4405855453839174e-07, "loss": 0.0, "num_input_tokens_seen": 109732944, "step": 162820 }, { "epoch": 3.977841839102924, "grad_norm": 0.00013251813652459532, "learning_rate": 2.4400273122448413e-07, "loss": 0.0, "num_input_tokens_seen": 109736336, "step": 162825 }, { "epoch": 3.9779639899347714, "grad_norm": 0.0011560230050235987, "learning_rate": 2.43946913408392e-07, "loss": 0.0453, "num_input_tokens_seen": 109739600, "step": 162830 }, { "epoch": 3.9780861407666186, "grad_norm": 0.0013253620127215981, "learning_rate": 2.438911010905216e-07, "loss": 0.0, "num_input_tokens_seen": 109742864, "step": 162835 }, { "epoch": 3.978208291598466, "grad_norm": 0.0038731468375772238, "learning_rate": 2.4383529427127804e-07, "loss": 0.0, "num_input_tokens_seen": 109746448, "step": 162840 }, { "epoch": 3.978330442430313, "grad_norm": 0.018492287024855614, "learning_rate": 2.43779492951068e-07, "loss": 0.0, "num_input_tokens_seen": 109750032, "step": 162845 }, { "epoch": 3.97845259326216, "grad_norm": 0.001245662453584373, "learning_rate": 2.4372369713029683e-07, "loss": 0.0, "num_input_tokens_seen": 109753360, "step": 162850 }, { "epoch": 3.9785747440940074, "grad_norm": 0.0058091082610189915, "learning_rate": 2.436679068093701e-07, "loss": 0.0, "num_input_tokens_seen": 109756624, "step": 162855 }, { "epoch": 3.9786968949258545, "grad_norm": 56.89339828491211, "learning_rate": 2.436121219886941e-07, "loss": 0.0489, "num_input_tokens_seen": 109759760, "step": 162860 }, { "epoch": 3.9788190457577017, "grad_norm": 0.0014791139401495457, "learning_rate": 2.4355634266867387e-07, "loss": 0.0, "num_input_tokens_seen": 109763152, "step": 162865 }, { "epoch": 3.978941196589549, "grad_norm": 0.0012810054467990994, "learning_rate": 2.435005688497157e-07, "loss": 0.0, "num_input_tokens_seen": 109766288, "step": 162870 }, { "epoch": 3.9790633474213957, "grad_norm": 0.0035496437922120094, "learning_rate": 2.434448005322245e-07, "loss": 0.0, "num_input_tokens_seen": 109769360, "step": 162875 }, { "epoch": 3.9791854982532433, "grad_norm": 0.0032136563677340746, "learning_rate": 2.4338903771660656e-07, "loss": 0.0001, "num_input_tokens_seen": 109772624, "step": 162880 }, { "epoch": 3.97930764908509, "grad_norm": 0.0025195456109941006, "learning_rate": 2.433332804032667e-07, "loss": 0.0001, "num_input_tokens_seen": 109775568, "step": 162885 }, { "epoch": 3.9794297999169377, "grad_norm": 0.009100309573113918, "learning_rate": 2.4327752859261074e-07, "loss": 0.0, "num_input_tokens_seen": 109779088, "step": 162890 }, { "epoch": 3.9795519507487844, "grad_norm": 0.002834005979821086, "learning_rate": 2.432217822850445e-07, "loss": 0.0, "num_input_tokens_seen": 109782032, "step": 162895 }, { "epoch": 3.9796741015806316, "grad_norm": 0.0029583594296127558, "learning_rate": 2.4316604148097264e-07, "loss": 0.0, "num_input_tokens_seen": 109785296, "step": 162900 }, { "epoch": 3.979796252412479, "grad_norm": 0.0010822449112311006, "learning_rate": 2.431103061808012e-07, "loss": 0.0479, "num_input_tokens_seen": 109788816, "step": 162905 }, { "epoch": 3.979918403244326, "grad_norm": 0.0011741408379748464, "learning_rate": 2.43054576384935e-07, "loss": 0.0004, "num_input_tokens_seen": 109792080, "step": 162910 }, { "epoch": 3.980040554076173, "grad_norm": 11.589085578918457, "learning_rate": 2.429988520937797e-07, "loss": 0.041, "num_input_tokens_seen": 109795216, "step": 162915 }, { "epoch": 3.9801627049080204, "grad_norm": 0.013919665478169918, "learning_rate": 2.4294313330773995e-07, "loss": 0.0004, "num_input_tokens_seen": 109798800, "step": 162920 }, { "epoch": 3.9802848557398676, "grad_norm": 0.002399963326752186, "learning_rate": 2.428874200272215e-07, "loss": 0.0, "num_input_tokens_seen": 109802384, "step": 162925 }, { "epoch": 3.9804070065717148, "grad_norm": 0.011189553886651993, "learning_rate": 2.4283171225262967e-07, "loss": 0.0, "num_input_tokens_seen": 109806416, "step": 162930 }, { "epoch": 3.980529157403562, "grad_norm": 0.0008779754862189293, "learning_rate": 2.42776009984369e-07, "loss": 0.0, "num_input_tokens_seen": 109810192, "step": 162935 }, { "epoch": 3.980651308235409, "grad_norm": 0.0033224131911993027, "learning_rate": 2.427203132228451e-07, "loss": 0.0, "num_input_tokens_seen": 109813584, "step": 162940 }, { "epoch": 3.9807734590672563, "grad_norm": 2.35109018831281e-05, "learning_rate": 2.426646219684625e-07, "loss": 0.0001, "num_input_tokens_seen": 109816592, "step": 162945 }, { "epoch": 3.9808956098991035, "grad_norm": 0.06689867377281189, "learning_rate": 2.426089362216267e-07, "loss": 0.0, "num_input_tokens_seen": 109819728, "step": 162950 }, { "epoch": 3.9810177607309507, "grad_norm": 0.008919079788029194, "learning_rate": 2.4255325598274225e-07, "loss": 0.0, "num_input_tokens_seen": 109822864, "step": 162955 }, { "epoch": 3.981139911562798, "grad_norm": 0.01653454639017582, "learning_rate": 2.424975812522144e-07, "loss": 0.0, "num_input_tokens_seen": 109826320, "step": 162960 }, { "epoch": 3.981262062394645, "grad_norm": 0.016628887504339218, "learning_rate": 2.424419120304481e-07, "loss": 0.0, "num_input_tokens_seen": 109829584, "step": 162965 }, { "epoch": 3.981384213226492, "grad_norm": 0.0006351763149723411, "learning_rate": 2.423862483178475e-07, "loss": 0.0, "num_input_tokens_seen": 109833040, "step": 162970 }, { "epoch": 3.9815063640583395, "grad_norm": 0.004808998201042414, "learning_rate": 2.4233059011481817e-07, "loss": 0.0, "num_input_tokens_seen": 109836176, "step": 162975 }, { "epoch": 3.981628514890186, "grad_norm": 0.0008432915201410651, "learning_rate": 2.422749374217643e-07, "loss": 0.0, "num_input_tokens_seen": 109839248, "step": 162980 }, { "epoch": 3.981750665722034, "grad_norm": 0.0023574333172291517, "learning_rate": 2.4221929023909096e-07, "loss": 0.0, "num_input_tokens_seen": 109842704, "step": 162985 }, { "epoch": 3.9818728165538806, "grad_norm": 0.003182420041412115, "learning_rate": 2.4216364856720295e-07, "loss": 0.0, "num_input_tokens_seen": 109846608, "step": 162990 }, { "epoch": 3.981994967385728, "grad_norm": 0.039140839129686356, "learning_rate": 2.421080124065045e-07, "loss": 0.0093, "num_input_tokens_seen": 109849616, "step": 162995 }, { "epoch": 3.982117118217575, "grad_norm": 0.0021024311427026987, "learning_rate": 2.4205238175740075e-07, "loss": 0.0, "num_input_tokens_seen": 109852880, "step": 163000 }, { "epoch": 3.982239269049422, "grad_norm": 0.0015542428009212017, "learning_rate": 2.4199675662029563e-07, "loss": 0.0, "num_input_tokens_seen": 109856336, "step": 163005 }, { "epoch": 3.9823614198812693, "grad_norm": 6.775200017727911e-05, "learning_rate": 2.4194113699559395e-07, "loss": 0.0, "num_input_tokens_seen": 109859600, "step": 163010 }, { "epoch": 3.9824835707131165, "grad_norm": 0.0029845689423382282, "learning_rate": 2.4188552288370043e-07, "loss": 0.0523, "num_input_tokens_seen": 109862928, "step": 163015 }, { "epoch": 3.9826057215449637, "grad_norm": 0.1912604123353958, "learning_rate": 2.4182991428501906e-07, "loss": 0.0, "num_input_tokens_seen": 109866256, "step": 163020 }, { "epoch": 3.982727872376811, "grad_norm": 0.0005314030568115413, "learning_rate": 2.4177431119995483e-07, "loss": 0.0, "num_input_tokens_seen": 109869584, "step": 163025 }, { "epoch": 3.982850023208658, "grad_norm": 0.005421550944447517, "learning_rate": 2.417187136289115e-07, "loss": 0.0, "num_input_tokens_seen": 109872912, "step": 163030 }, { "epoch": 3.9829721740405053, "grad_norm": 0.0011578707490116358, "learning_rate": 2.4166312157229384e-07, "loss": 0.1016, "num_input_tokens_seen": 109875984, "step": 163035 }, { "epoch": 3.9830943248723525, "grad_norm": 0.11689208447933197, "learning_rate": 2.416075350305056e-07, "loss": 0.0001, "num_input_tokens_seen": 109879184, "step": 163040 }, { "epoch": 3.9832164757041997, "grad_norm": 0.0014069858007133007, "learning_rate": 2.4155195400395144e-07, "loss": 0.0002, "num_input_tokens_seen": 109882704, "step": 163045 }, { "epoch": 3.983338626536047, "grad_norm": 0.00013945819227956235, "learning_rate": 2.414963784930357e-07, "loss": 0.0, "num_input_tokens_seen": 109886288, "step": 163050 }, { "epoch": 3.9834607773678936, "grad_norm": 0.00327501748688519, "learning_rate": 2.414408084981623e-07, "loss": 0.0, "num_input_tokens_seen": 109890064, "step": 163055 }, { "epoch": 3.9835829281997412, "grad_norm": 0.00012703366519417614, "learning_rate": 2.4138524401973515e-07, "loss": 0.0, "num_input_tokens_seen": 109893264, "step": 163060 }, { "epoch": 3.983705079031588, "grad_norm": 0.012055044062435627, "learning_rate": 2.4132968505815874e-07, "loss": 0.0, "num_input_tokens_seen": 109896208, "step": 163065 }, { "epoch": 3.9838272298634356, "grad_norm": 0.3332982063293457, "learning_rate": 2.4127413161383693e-07, "loss": 0.0001, "num_input_tokens_seen": 109899472, "step": 163070 }, { "epoch": 3.9839493806952824, "grad_norm": 0.00024611069238744676, "learning_rate": 2.412185836871735e-07, "loss": 0.0, "num_input_tokens_seen": 109902480, "step": 163075 }, { "epoch": 3.9840715315271296, "grad_norm": 0.00047673837980255485, "learning_rate": 2.4116304127857256e-07, "loss": 0.0, "num_input_tokens_seen": 109905744, "step": 163080 }, { "epoch": 3.9841936823589768, "grad_norm": 0.0008909995085559785, "learning_rate": 2.411075043884384e-07, "loss": 0.0534, "num_input_tokens_seen": 109909264, "step": 163085 }, { "epoch": 3.984315833190824, "grad_norm": 0.0021835891529917717, "learning_rate": 2.410519730171743e-07, "loss": 0.0001, "num_input_tokens_seen": 109912784, "step": 163090 }, { "epoch": 3.984437984022671, "grad_norm": 0.0039231725968420506, "learning_rate": 2.4099644716518464e-07, "loss": 0.0001, "num_input_tokens_seen": 109915920, "step": 163095 }, { "epoch": 3.9845601348545183, "grad_norm": 0.0007445600931532681, "learning_rate": 2.409409268328727e-07, "loss": 0.0114, "num_input_tokens_seen": 109919760, "step": 163100 }, { "epoch": 3.9846822856863655, "grad_norm": 0.012176787480711937, "learning_rate": 2.4088541202064247e-07, "loss": 0.0, "num_input_tokens_seen": 109923536, "step": 163105 }, { "epoch": 3.9848044365182127, "grad_norm": 0.004063439555466175, "learning_rate": 2.4082990272889804e-07, "loss": 0.0, "num_input_tokens_seen": 109927184, "step": 163110 }, { "epoch": 3.98492658735006, "grad_norm": 0.026905439794063568, "learning_rate": 2.407743989580424e-07, "loss": 0.0002, "num_input_tokens_seen": 109930384, "step": 163115 }, { "epoch": 3.985048738181907, "grad_norm": 0.029015744104981422, "learning_rate": 2.407189007084799e-07, "loss": 0.0, "num_input_tokens_seen": 109934096, "step": 163120 }, { "epoch": 3.9851708890137543, "grad_norm": 0.0023757724557071924, "learning_rate": 2.4066340798061344e-07, "loss": 0.0, "num_input_tokens_seen": 109937744, "step": 163125 }, { "epoch": 3.9852930398456015, "grad_norm": 0.09235114604234695, "learning_rate": 2.4060792077484727e-07, "loss": 0.0, "num_input_tokens_seen": 109940944, "step": 163130 }, { "epoch": 3.9854151906774486, "grad_norm": 0.0007887451793067157, "learning_rate": 2.405524390915842e-07, "loss": 0.0001, "num_input_tokens_seen": 109944336, "step": 163135 }, { "epoch": 3.985537341509296, "grad_norm": 0.014184202067553997, "learning_rate": 2.4049696293122803e-07, "loss": 0.0004, "num_input_tokens_seen": 109947856, "step": 163140 }, { "epoch": 3.985659492341143, "grad_norm": 0.004972795955836773, "learning_rate": 2.4044149229418255e-07, "loss": 0.0, "num_input_tokens_seen": 109951184, "step": 163145 }, { "epoch": 3.9857816431729898, "grad_norm": 0.2864531874656677, "learning_rate": 2.4038602718085057e-07, "loss": 0.0001, "num_input_tokens_seen": 109954512, "step": 163150 }, { "epoch": 3.9859037940048374, "grad_norm": 0.0016255693044513464, "learning_rate": 2.4033056759163597e-07, "loss": 0.0, "num_input_tokens_seen": 109958096, "step": 163155 }, { "epoch": 3.986025944836684, "grad_norm": 0.00540508795529604, "learning_rate": 2.402751135269417e-07, "loss": 0.0465, "num_input_tokens_seen": 109961296, "step": 163160 }, { "epoch": 3.9861480956685313, "grad_norm": 0.012475240975618362, "learning_rate": 2.4021966498717107e-07, "loss": 0.0, "num_input_tokens_seen": 109965072, "step": 163165 }, { "epoch": 3.9862702465003785, "grad_norm": 0.00033634447026997805, "learning_rate": 2.4016422197272757e-07, "loss": 0.0, "num_input_tokens_seen": 109967952, "step": 163170 }, { "epoch": 3.9863923973322257, "grad_norm": 0.0004531031008809805, "learning_rate": 2.4010878448401393e-07, "loss": 0.0, "num_input_tokens_seen": 109971088, "step": 163175 }, { "epoch": 3.986514548164073, "grad_norm": 0.000985053600743413, "learning_rate": 2.4005335252143387e-07, "loss": 0.0001, "num_input_tokens_seen": 109974672, "step": 163180 }, { "epoch": 3.98663669899592, "grad_norm": 0.005329513922333717, "learning_rate": 2.3999792608539005e-07, "loss": 0.0, "num_input_tokens_seen": 109977808, "step": 163185 }, { "epoch": 3.9867588498277673, "grad_norm": 0.0018812778871506453, "learning_rate": 2.3994250517628587e-07, "loss": 0.0, "num_input_tokens_seen": 109981328, "step": 163190 }, { "epoch": 3.9868810006596145, "grad_norm": 0.017847029492259026, "learning_rate": 2.398870897945241e-07, "loss": 0.0, "num_input_tokens_seen": 109985040, "step": 163195 }, { "epoch": 3.9870031514914617, "grad_norm": 0.03550269454717636, "learning_rate": 2.398316799405077e-07, "loss": 0.0, "num_input_tokens_seen": 109988752, "step": 163200 }, { "epoch": 3.987125302323309, "grad_norm": 0.0018748401198536158, "learning_rate": 2.397762756146402e-07, "loss": 0.0, "num_input_tokens_seen": 109992464, "step": 163205 }, { "epoch": 3.987247453155156, "grad_norm": 0.0008736561867408454, "learning_rate": 2.3972087681732367e-07, "loss": 0.0, "num_input_tokens_seen": 109995792, "step": 163210 }, { "epoch": 3.9873696039870032, "grad_norm": 0.00046163939987309277, "learning_rate": 2.396654835489618e-07, "loss": 0.0001, "num_input_tokens_seen": 109999888, "step": 163215 }, { "epoch": 3.9874917548188504, "grad_norm": 0.0022144047543406487, "learning_rate": 2.396100958099567e-07, "loss": 0.0, "num_input_tokens_seen": 110003024, "step": 163220 }, { "epoch": 3.9876139056506976, "grad_norm": 0.004857294727116823, "learning_rate": 2.395547136007119e-07, "loss": 0.0, "num_input_tokens_seen": 110006288, "step": 163225 }, { "epoch": 3.987736056482545, "grad_norm": 0.0012780651450157166, "learning_rate": 2.3949933692162936e-07, "loss": 0.0, "num_input_tokens_seen": 110010320, "step": 163230 }, { "epoch": 3.9878582073143916, "grad_norm": 0.00010625671711750329, "learning_rate": 2.394439657731122e-07, "loss": 0.0, "num_input_tokens_seen": 110013840, "step": 163235 }, { "epoch": 3.987980358146239, "grad_norm": 0.0005658533773384988, "learning_rate": 2.393886001555634e-07, "loss": 0.0, "num_input_tokens_seen": 110017232, "step": 163240 }, { "epoch": 3.988102508978086, "grad_norm": 0.04258429631590843, "learning_rate": 2.3933324006938503e-07, "loss": 0.0, "num_input_tokens_seen": 110020624, "step": 163245 }, { "epoch": 3.9882246598099336, "grad_norm": 0.01064017042517662, "learning_rate": 2.3927788551498016e-07, "loss": 0.0, "num_input_tokens_seen": 110023760, "step": 163250 }, { "epoch": 3.9883468106417803, "grad_norm": 27.123470306396484, "learning_rate": 2.392225364927508e-07, "loss": 0.0371, "num_input_tokens_seen": 110026832, "step": 163255 }, { "epoch": 3.9884689614736275, "grad_norm": 0.00024280698562506586, "learning_rate": 2.3916719300310017e-07, "loss": 0.0371, "num_input_tokens_seen": 110030224, "step": 163260 }, { "epoch": 3.9885911123054747, "grad_norm": 0.0007595649221912026, "learning_rate": 2.3911185504642993e-07, "loss": 0.0679, "num_input_tokens_seen": 110033616, "step": 163265 }, { "epoch": 3.988713263137322, "grad_norm": 0.052835941314697266, "learning_rate": 2.3905652262314335e-07, "loss": 0.0, "num_input_tokens_seen": 110037072, "step": 163270 }, { "epoch": 3.988835413969169, "grad_norm": 0.0002746016252785921, "learning_rate": 2.390011957336424e-07, "loss": 0.0, "num_input_tokens_seen": 110040400, "step": 163275 }, { "epoch": 3.9889575648010163, "grad_norm": 0.010532891377806664, "learning_rate": 2.3894587437832903e-07, "loss": 0.0, "num_input_tokens_seen": 110043280, "step": 163280 }, { "epoch": 3.9890797156328635, "grad_norm": 0.02451830357313156, "learning_rate": 2.388905585576063e-07, "loss": 0.0, "num_input_tokens_seen": 110046928, "step": 163285 }, { "epoch": 3.9892018664647106, "grad_norm": 0.0006182059296406806, "learning_rate": 2.3883524827187593e-07, "loss": 0.0439, "num_input_tokens_seen": 110050128, "step": 163290 }, { "epoch": 3.989324017296558, "grad_norm": 0.034917838871479034, "learning_rate": 2.387799435215403e-07, "loss": 0.0626, "num_input_tokens_seen": 110053712, "step": 163295 }, { "epoch": 3.989446168128405, "grad_norm": 0.035469673573970795, "learning_rate": 2.3872464430700203e-07, "loss": 0.0667, "num_input_tokens_seen": 110057232, "step": 163300 }, { "epoch": 3.989568318960252, "grad_norm": 0.00020851861336268485, "learning_rate": 2.3866935062866254e-07, "loss": 0.0, "num_input_tokens_seen": 110060432, "step": 163305 }, { "epoch": 3.9896904697920994, "grad_norm": 0.003421532455831766, "learning_rate": 2.3861406248692463e-07, "loss": 0.0002, "num_input_tokens_seen": 110063824, "step": 163310 }, { "epoch": 3.9898126206239466, "grad_norm": 0.004031584598124027, "learning_rate": 2.3855877988218974e-07, "loss": 0.0, "num_input_tokens_seen": 110067216, "step": 163315 }, { "epoch": 3.9899347714557933, "grad_norm": 0.001301066018640995, "learning_rate": 2.3850350281486044e-07, "loss": 0.0, "num_input_tokens_seen": 110070544, "step": 163320 }, { "epoch": 3.990056922287641, "grad_norm": 0.008654467761516571, "learning_rate": 2.384482312853383e-07, "loss": 0.0, "num_input_tokens_seen": 110073360, "step": 163325 }, { "epoch": 3.9901790731194877, "grad_norm": 0.008370326831936836, "learning_rate": 2.383929652940253e-07, "loss": 0.0565, "num_input_tokens_seen": 110076688, "step": 163330 }, { "epoch": 3.9903012239513354, "grad_norm": 0.006358446553349495, "learning_rate": 2.3833770484132398e-07, "loss": 0.0, "num_input_tokens_seen": 110080272, "step": 163335 }, { "epoch": 3.990423374783182, "grad_norm": 0.0030323562677949667, "learning_rate": 2.3828244992763536e-07, "loss": 0.0, "num_input_tokens_seen": 110083664, "step": 163340 }, { "epoch": 3.9905455256150293, "grad_norm": 0.04232315719127655, "learning_rate": 2.3822720055336188e-07, "loss": 0.0001, "num_input_tokens_seen": 110086928, "step": 163345 }, { "epoch": 3.9906676764468765, "grad_norm": 0.0002105066378135234, "learning_rate": 2.381719567189049e-07, "loss": 0.0, "num_input_tokens_seen": 110090192, "step": 163350 }, { "epoch": 3.9907898272787237, "grad_norm": 0.015618708916008472, "learning_rate": 2.381167184246663e-07, "loss": 0.0001, "num_input_tokens_seen": 110093584, "step": 163355 }, { "epoch": 3.990911978110571, "grad_norm": 0.03186903893947601, "learning_rate": 2.380614856710481e-07, "loss": 0.0, "num_input_tokens_seen": 110096656, "step": 163360 }, { "epoch": 3.991034128942418, "grad_norm": 0.027114881202578545, "learning_rate": 2.380062584584518e-07, "loss": 0.0002, "num_input_tokens_seen": 110100112, "step": 163365 }, { "epoch": 3.9911562797742652, "grad_norm": 0.03611982986330986, "learning_rate": 2.3795103678727857e-07, "loss": 0.0002, "num_input_tokens_seen": 110103440, "step": 163370 }, { "epoch": 3.9912784306061124, "grad_norm": 0.004180221818387508, "learning_rate": 2.3789582065793068e-07, "loss": 0.0607, "num_input_tokens_seen": 110106640, "step": 163375 }, { "epoch": 3.9914005814379596, "grad_norm": 0.001564105274155736, "learning_rate": 2.3784061007080937e-07, "loss": 0.0003, "num_input_tokens_seen": 110110096, "step": 163380 }, { "epoch": 3.991522732269807, "grad_norm": 0.01233486458659172, "learning_rate": 2.3778540502631583e-07, "loss": 0.0, "num_input_tokens_seen": 110113744, "step": 163385 }, { "epoch": 3.991644883101654, "grad_norm": 0.000720239186193794, "learning_rate": 2.377302055248519e-07, "loss": 0.0888, "num_input_tokens_seen": 110117648, "step": 163390 }, { "epoch": 3.991767033933501, "grad_norm": 0.0004685970488935709, "learning_rate": 2.3767501156681923e-07, "loss": 0.0, "num_input_tokens_seen": 110120912, "step": 163395 }, { "epoch": 3.9918891847653484, "grad_norm": 0.0007855355506762862, "learning_rate": 2.3761982315261853e-07, "loss": 0.0372, "num_input_tokens_seen": 110124176, "step": 163400 }, { "epoch": 3.9920113355971956, "grad_norm": 0.002032736549153924, "learning_rate": 2.375646402826519e-07, "loss": 0.0, "num_input_tokens_seen": 110127184, "step": 163405 }, { "epoch": 3.9921334864290428, "grad_norm": 0.0016364410985261202, "learning_rate": 2.3750946295732e-07, "loss": 0.0001, "num_input_tokens_seen": 110130512, "step": 163410 }, { "epoch": 3.9922556372608895, "grad_norm": 0.00046291478793136775, "learning_rate": 2.374542911770243e-07, "loss": 0.039, "num_input_tokens_seen": 110133584, "step": 163415 }, { "epoch": 3.992377788092737, "grad_norm": 0.00025917578022927046, "learning_rate": 2.3739912494216641e-07, "loss": 0.0, "num_input_tokens_seen": 110137360, "step": 163420 }, { "epoch": 3.992499938924584, "grad_norm": 0.0023525061551481485, "learning_rate": 2.3734396425314695e-07, "loss": 0.0693, "num_input_tokens_seen": 110140560, "step": 163425 }, { "epoch": 3.9926220897564315, "grad_norm": 0.013277255930006504, "learning_rate": 2.3728880911036752e-07, "loss": 0.0001, "num_input_tokens_seen": 110143632, "step": 163430 }, { "epoch": 3.9927442405882783, "grad_norm": 0.002585779642686248, "learning_rate": 2.372336595142288e-07, "loss": 0.0, "num_input_tokens_seen": 110146896, "step": 163435 }, { "epoch": 3.9928663914201254, "grad_norm": 0.00874535832554102, "learning_rate": 2.3717851546513234e-07, "loss": 0.0, "num_input_tokens_seen": 110150864, "step": 163440 }, { "epoch": 3.9929885422519726, "grad_norm": 0.002351542469114065, "learning_rate": 2.3712337696347863e-07, "loss": 0.0, "num_input_tokens_seen": 110154384, "step": 163445 }, { "epoch": 3.99311069308382, "grad_norm": 0.0005293526337482035, "learning_rate": 2.3706824400966886e-07, "loss": 0.0, "num_input_tokens_seen": 110157904, "step": 163450 }, { "epoch": 3.993232843915667, "grad_norm": 0.002115802839398384, "learning_rate": 2.3701311660410438e-07, "loss": 0.0446, "num_input_tokens_seen": 110161104, "step": 163455 }, { "epoch": 3.993354994747514, "grad_norm": 0.0006367531605064869, "learning_rate": 2.3695799474718537e-07, "loss": 0.0, "num_input_tokens_seen": 110164688, "step": 163460 }, { "epoch": 3.9934771455793614, "grad_norm": 0.030436921864748, "learning_rate": 2.3690287843931334e-07, "loss": 0.0317, "num_input_tokens_seen": 110168720, "step": 163465 }, { "epoch": 3.9935992964112086, "grad_norm": 0.007803088985383511, "learning_rate": 2.3684776768088887e-07, "loss": 0.0, "num_input_tokens_seen": 110172048, "step": 163470 }, { "epoch": 3.9937214472430558, "grad_norm": 0.003044381272047758, "learning_rate": 2.3679266247231244e-07, "loss": 0.0001, "num_input_tokens_seen": 110177424, "step": 163475 }, { "epoch": 3.993843598074903, "grad_norm": 0.009955629706382751, "learning_rate": 2.3673756281398528e-07, "loss": 0.0, "num_input_tokens_seen": 110181008, "step": 163480 }, { "epoch": 3.99396574890675, "grad_norm": 0.0034395726397633553, "learning_rate": 2.3668246870630759e-07, "loss": 0.0003, "num_input_tokens_seen": 110184336, "step": 163485 }, { "epoch": 3.9940878997385973, "grad_norm": 0.03659478574991226, "learning_rate": 2.3662738014968054e-07, "loss": 0.0, "num_input_tokens_seen": 110187856, "step": 163490 }, { "epoch": 3.9942100505704445, "grad_norm": 0.0005764599773101509, "learning_rate": 2.3657229714450422e-07, "loss": 0.0, "num_input_tokens_seen": 110190800, "step": 163495 }, { "epoch": 3.9943322014022913, "grad_norm": 0.06950213015079498, "learning_rate": 2.365172196911799e-07, "loss": 0.0, "num_input_tokens_seen": 110194384, "step": 163500 }, { "epoch": 3.994454352234139, "grad_norm": 29.22477149963379, "learning_rate": 2.3646214779010732e-07, "loss": 0.1123, "num_input_tokens_seen": 110197840, "step": 163505 }, { "epoch": 3.9945765030659857, "grad_norm": 0.0926981195807457, "learning_rate": 2.364070814416873e-07, "loss": 0.0, "num_input_tokens_seen": 110201872, "step": 163510 }, { "epoch": 3.9946986538978333, "grad_norm": 28.436185836791992, "learning_rate": 2.3635202064632075e-07, "loss": 0.0336, "num_input_tokens_seen": 110204752, "step": 163515 }, { "epoch": 3.99482080472968, "grad_norm": 0.012042644433677197, "learning_rate": 2.3629696540440735e-07, "loss": 0.0, "num_input_tokens_seen": 110208208, "step": 163520 }, { "epoch": 3.9949429555615272, "grad_norm": 0.010948545299470425, "learning_rate": 2.3624191571634822e-07, "loss": 0.0, "num_input_tokens_seen": 110211472, "step": 163525 }, { "epoch": 3.9950651063933744, "grad_norm": 0.003581192810088396, "learning_rate": 2.3618687158254292e-07, "loss": 0.0, "num_input_tokens_seen": 110214672, "step": 163530 }, { "epoch": 3.9951872572252216, "grad_norm": 0.0026018167845904827, "learning_rate": 2.3613183300339246e-07, "loss": 0.0, "num_input_tokens_seen": 110218128, "step": 163535 }, { "epoch": 3.995309408057069, "grad_norm": 0.0834880992770195, "learning_rate": 2.3607679997929652e-07, "loss": 0.0, "num_input_tokens_seen": 110221392, "step": 163540 }, { "epoch": 3.995431558888916, "grad_norm": 0.00801916979253292, "learning_rate": 2.3602177251065548e-07, "loss": 0.0, "num_input_tokens_seen": 110224464, "step": 163545 }, { "epoch": 3.995553709720763, "grad_norm": 0.01131167821586132, "learning_rate": 2.3596675059786998e-07, "loss": 0.0, "num_input_tokens_seen": 110227856, "step": 163550 }, { "epoch": 3.9956758605526104, "grad_norm": 0.00454461295157671, "learning_rate": 2.3591173424133937e-07, "loss": 0.0, "num_input_tokens_seen": 110231248, "step": 163555 }, { "epoch": 3.9957980113844576, "grad_norm": 0.0033284425735473633, "learning_rate": 2.3585672344146457e-07, "loss": 0.0645, "num_input_tokens_seen": 110234640, "step": 163560 }, { "epoch": 3.9959201622163047, "grad_norm": 0.010862693190574646, "learning_rate": 2.358017181986448e-07, "loss": 0.0, "num_input_tokens_seen": 110237712, "step": 163565 }, { "epoch": 3.996042313048152, "grad_norm": 2.686147672648076e-05, "learning_rate": 2.3574671851328077e-07, "loss": 0.0, "num_input_tokens_seen": 110240720, "step": 163570 }, { "epoch": 3.996164463879999, "grad_norm": 0.020719388499855995, "learning_rate": 2.3569172438577189e-07, "loss": 0.0, "num_input_tokens_seen": 110244368, "step": 163575 }, { "epoch": 3.9962866147118463, "grad_norm": 0.03670404851436615, "learning_rate": 2.3563673581651866e-07, "loss": 0.0005, "num_input_tokens_seen": 110247376, "step": 163580 }, { "epoch": 3.9964087655436935, "grad_norm": 0.0005382307572290301, "learning_rate": 2.3558175280592075e-07, "loss": 0.0, "num_input_tokens_seen": 110250576, "step": 163585 }, { "epoch": 3.9965309163755407, "grad_norm": 0.16491441428661346, "learning_rate": 2.355267753543776e-07, "loss": 0.0, "num_input_tokens_seen": 110253776, "step": 163590 }, { "epoch": 3.9966530672073874, "grad_norm": 0.0002645579806994647, "learning_rate": 2.3547180346228957e-07, "loss": 0.0, "num_input_tokens_seen": 110257296, "step": 163595 }, { "epoch": 3.996775218039235, "grad_norm": 0.0007041199132800102, "learning_rate": 2.35416837130056e-07, "loss": 0.0, "num_input_tokens_seen": 110260688, "step": 163600 }, { "epoch": 3.996897368871082, "grad_norm": 0.00015905153122730553, "learning_rate": 2.353618763580768e-07, "loss": 0.0, "num_input_tokens_seen": 110264656, "step": 163605 }, { "epoch": 3.997019519702929, "grad_norm": 0.0004138974763918668, "learning_rate": 2.353069211467521e-07, "loss": 0.0, "num_input_tokens_seen": 110267920, "step": 163610 }, { "epoch": 3.997141670534776, "grad_norm": 0.021569538861513138, "learning_rate": 2.352519714964808e-07, "loss": 0.0, "num_input_tokens_seen": 110271312, "step": 163615 }, { "epoch": 3.9972638213666234, "grad_norm": 0.014961598441004753, "learning_rate": 2.3519702740766312e-07, "loss": 0.0002, "num_input_tokens_seen": 110274576, "step": 163620 }, { "epoch": 3.9973859721984706, "grad_norm": 0.001369571196846664, "learning_rate": 2.3514208888069798e-07, "loss": 0.0, "num_input_tokens_seen": 110277392, "step": 163625 }, { "epoch": 3.9975081230303178, "grad_norm": 0.0006057513528503478, "learning_rate": 2.3508715591598572e-07, "loss": 0.0, "num_input_tokens_seen": 110280592, "step": 163630 }, { "epoch": 3.997630273862165, "grad_norm": 0.000672376248985529, "learning_rate": 2.3503222851392513e-07, "loss": 0.0, "num_input_tokens_seen": 110283984, "step": 163635 }, { "epoch": 3.997752424694012, "grad_norm": 0.0005224617780186236, "learning_rate": 2.3497730667491577e-07, "loss": 0.0663, "num_input_tokens_seen": 110287376, "step": 163640 }, { "epoch": 3.9978745755258593, "grad_norm": 0.38089480996131897, "learning_rate": 2.3492239039935756e-07, "loss": 0.0002, "num_input_tokens_seen": 110290960, "step": 163645 }, { "epoch": 3.9979967263577065, "grad_norm": 0.0008336842292919755, "learning_rate": 2.348674796876493e-07, "loss": 0.0, "num_input_tokens_seen": 110294416, "step": 163650 }, { "epoch": 3.9981188771895537, "grad_norm": 0.0014327648095786572, "learning_rate": 2.3481257454019078e-07, "loss": 0.0, "num_input_tokens_seen": 110298512, "step": 163655 }, { "epoch": 3.998241028021401, "grad_norm": 115.09334564208984, "learning_rate": 2.3475767495738075e-07, "loss": 0.0535, "num_input_tokens_seen": 110302160, "step": 163660 }, { "epoch": 3.998363178853248, "grad_norm": 0.00022477505262941122, "learning_rate": 2.347027809396186e-07, "loss": 0.0, "num_input_tokens_seen": 110305616, "step": 163665 }, { "epoch": 3.9984853296850953, "grad_norm": 46.11375045776367, "learning_rate": 2.346478924873041e-07, "loss": 0.0135, "num_input_tokens_seen": 110308752, "step": 163670 }, { "epoch": 3.9986074805169425, "grad_norm": 0.012809050269424915, "learning_rate": 2.3459300960083593e-07, "loss": 0.0, "num_input_tokens_seen": 110312080, "step": 163675 }, { "epoch": 3.9987296313487892, "grad_norm": 0.0009718817891553044, "learning_rate": 2.3453813228061302e-07, "loss": 0.0975, "num_input_tokens_seen": 110315280, "step": 163680 }, { "epoch": 3.998851782180637, "grad_norm": 0.01494208537042141, "learning_rate": 2.3448326052703492e-07, "loss": 0.0191, "num_input_tokens_seen": 110318544, "step": 163685 }, { "epoch": 3.9989739330124836, "grad_norm": 0.0013464835938066244, "learning_rate": 2.3442839434050043e-07, "loss": 0.0003, "num_input_tokens_seen": 110322256, "step": 163690 }, { "epoch": 3.9990960838443312, "grad_norm": 0.10594768822193146, "learning_rate": 2.3437353372140833e-07, "loss": 0.0, "num_input_tokens_seen": 110325520, "step": 163695 }, { "epoch": 3.999218234676178, "grad_norm": 0.008476635441184044, "learning_rate": 2.3431867867015788e-07, "loss": 0.0, "num_input_tokens_seen": 110328976, "step": 163700 }, { "epoch": 3.999340385508025, "grad_norm": 0.03905835002660751, "learning_rate": 2.3426382918714815e-07, "loss": 0.0001, "num_input_tokens_seen": 110332048, "step": 163705 }, { "epoch": 3.9994625363398724, "grad_norm": 0.0044195763766765594, "learning_rate": 2.3420898527277754e-07, "loss": 0.0, "num_input_tokens_seen": 110335568, "step": 163710 }, { "epoch": 3.9995846871717196, "grad_norm": 0.0023455540649592876, "learning_rate": 2.341541469274454e-07, "loss": 0.0, "num_input_tokens_seen": 110339024, "step": 163715 }, { "epoch": 3.9997068380035667, "grad_norm": 0.12929129600524902, "learning_rate": 2.3409931415155003e-07, "loss": 0.0002, "num_input_tokens_seen": 110342032, "step": 163720 }, { "epoch": 3.999828988835414, "grad_norm": 0.013033738359808922, "learning_rate": 2.340444869454905e-07, "loss": 0.0001, "num_input_tokens_seen": 110345680, "step": 163725 }, { "epoch": 3.999951139667261, "grad_norm": 0.004790800623595715, "learning_rate": 2.339896653096658e-07, "loss": 0.0, "num_input_tokens_seen": 110348688, "step": 163730 }, { "epoch": 4.000073290499108, "grad_norm": 0.0014749355614185333, "learning_rate": 2.3393484924447392e-07, "loss": 0.0, "num_input_tokens_seen": 110351272, "step": 163735 }, { "epoch": 4.0001954413309555, "grad_norm": 0.0026620272547006607, "learning_rate": 2.3388003875031415e-07, "loss": 0.0, "num_input_tokens_seen": 110354664, "step": 163740 }, { "epoch": 4.000293161996433, "eval_loss": 0.247334286570549, "eval_runtime": 47.9251, "eval_samples_per_second": 759.206, "eval_steps_per_second": 94.919, "num_input_tokens_seen": 110357352, "step": 163744 }, { "epoch": 4.000317592162802, "grad_norm": 0.0002669579698704183, "learning_rate": 2.3382523382758456e-07, "loss": 0.0, "num_input_tokens_seen": 110357928, "step": 163745 }, { "epoch": 4.00043974299465, "grad_norm": 0.019051766023039818, "learning_rate": 2.337704344766842e-07, "loss": 0.0, "num_input_tokens_seen": 110361320, "step": 163750 }, { "epoch": 4.000561893826497, "grad_norm": 0.0015128724044188857, "learning_rate": 2.337156406980111e-07, "loss": 0.0, "num_input_tokens_seen": 110364584, "step": 163755 }, { "epoch": 4.000684044658344, "grad_norm": 0.1339402049779892, "learning_rate": 2.3366085249196387e-07, "loss": 0.0001, "num_input_tokens_seen": 110368168, "step": 163760 }, { "epoch": 4.000806195490191, "grad_norm": 0.012656701728701591, "learning_rate": 2.3360606985894138e-07, "loss": 0.0, "num_input_tokens_seen": 110371176, "step": 163765 }, { "epoch": 4.000928346322039, "grad_norm": 0.0021968057844787836, "learning_rate": 2.335512927993414e-07, "loss": 0.0, "num_input_tokens_seen": 110374312, "step": 163770 }, { "epoch": 4.001050497153885, "grad_norm": 0.001350610051304102, "learning_rate": 2.3349652131356278e-07, "loss": 0.0, "num_input_tokens_seen": 110377512, "step": 163775 }, { "epoch": 4.001172647985733, "grad_norm": 0.0015590637922286987, "learning_rate": 2.334417554020035e-07, "loss": 0.0419, "num_input_tokens_seen": 110381096, "step": 163780 }, { "epoch": 4.00129479881758, "grad_norm": 0.0005102349678054452, "learning_rate": 2.333869950650621e-07, "loss": 0.0, "num_input_tokens_seen": 110384296, "step": 163785 }, { "epoch": 4.001416949649427, "grad_norm": 0.004541546106338501, "learning_rate": 2.333322403031367e-07, "loss": 0.0, "num_input_tokens_seen": 110387496, "step": 163790 }, { "epoch": 4.001539100481274, "grad_norm": 0.01784665323793888, "learning_rate": 2.332774911166252e-07, "loss": 0.0, "num_input_tokens_seen": 110391208, "step": 163795 }, { "epoch": 4.001661251313122, "grad_norm": 0.03898552432656288, "learning_rate": 2.332227475059263e-07, "loss": 0.0, "num_input_tokens_seen": 110394536, "step": 163800 }, { "epoch": 4.0017834021449685, "grad_norm": 0.003088279627263546, "learning_rate": 2.3316800947143744e-07, "loss": 0.0, "num_input_tokens_seen": 110397672, "step": 163805 }, { "epoch": 4.001905552976816, "grad_norm": 0.006609124131500721, "learning_rate": 2.3311327701355743e-07, "loss": 0.0, "num_input_tokens_seen": 110401320, "step": 163810 }, { "epoch": 4.002027703808663, "grad_norm": 0.008004154078662395, "learning_rate": 2.3305855013268372e-07, "loss": 0.0, "num_input_tokens_seen": 110404776, "step": 163815 }, { "epoch": 4.0021498546405105, "grad_norm": 0.007009640336036682, "learning_rate": 2.3300382882921444e-07, "loss": 0.0002, "num_input_tokens_seen": 110408296, "step": 163820 }, { "epoch": 4.002272005472357, "grad_norm": 0.004657072480767965, "learning_rate": 2.329491131035478e-07, "loss": 0.0, "num_input_tokens_seen": 110411240, "step": 163825 }, { "epoch": 4.002394156304204, "grad_norm": 0.00040120657649822533, "learning_rate": 2.3289440295608142e-07, "loss": 0.0, "num_input_tokens_seen": 110414888, "step": 163830 }, { "epoch": 4.002516307136052, "grad_norm": 0.003007831983268261, "learning_rate": 2.328396983872134e-07, "loss": 0.0, "num_input_tokens_seen": 110418216, "step": 163835 }, { "epoch": 4.002638457967898, "grad_norm": 0.018329862505197525, "learning_rate": 2.327849993973413e-07, "loss": 0.0, "num_input_tokens_seen": 110421544, "step": 163840 }, { "epoch": 4.002760608799746, "grad_norm": 0.007019806187599897, "learning_rate": 2.3273030598686317e-07, "loss": 0.0, "num_input_tokens_seen": 110424744, "step": 163845 }, { "epoch": 4.002882759631593, "grad_norm": 0.0743463933467865, "learning_rate": 2.3267561815617641e-07, "loss": 0.0, "num_input_tokens_seen": 110428648, "step": 163850 }, { "epoch": 4.00300491046344, "grad_norm": 0.0006112426053732634, "learning_rate": 2.32620935905679e-07, "loss": 0.0, "num_input_tokens_seen": 110432424, "step": 163855 }, { "epoch": 4.003127061295287, "grad_norm": 0.002231811173260212, "learning_rate": 2.3256625923576877e-07, "loss": 0.0, "num_input_tokens_seen": 110435752, "step": 163860 }, { "epoch": 4.003249212127135, "grad_norm": 0.00027201403281651437, "learning_rate": 2.325115881468428e-07, "loss": 0.0, "num_input_tokens_seen": 110438952, "step": 163865 }, { "epoch": 4.0033713629589815, "grad_norm": 0.006251008715480566, "learning_rate": 2.324569226392994e-07, "loss": 0.0, "num_input_tokens_seen": 110442024, "step": 163870 }, { "epoch": 4.003493513790829, "grad_norm": 0.0033184706699103117, "learning_rate": 2.3240226271353525e-07, "loss": 0.0, "num_input_tokens_seen": 110445928, "step": 163875 }, { "epoch": 4.003615664622676, "grad_norm": 0.0009782484266906977, "learning_rate": 2.323476083699487e-07, "loss": 0.0, "num_input_tokens_seen": 110448936, "step": 163880 }, { "epoch": 4.003737815454524, "grad_norm": 0.0054573486559093, "learning_rate": 2.3229295960893647e-07, "loss": 0.0, "num_input_tokens_seen": 110452648, "step": 163885 }, { "epoch": 4.00385996628637, "grad_norm": 0.00012896816770080477, "learning_rate": 2.3223831643089664e-07, "loss": 0.0, "num_input_tokens_seen": 110456040, "step": 163890 }, { "epoch": 4.003982117118218, "grad_norm": 0.002094441093504429, "learning_rate": 2.3218367883622635e-07, "loss": 0.0, "num_input_tokens_seen": 110459176, "step": 163895 }, { "epoch": 4.004104267950065, "grad_norm": 0.0022213133051991463, "learning_rate": 2.3212904682532242e-07, "loss": 0.0, "num_input_tokens_seen": 110462760, "step": 163900 }, { "epoch": 4.004226418781912, "grad_norm": 0.008752007968723774, "learning_rate": 2.3207442039858306e-07, "loss": 0.0001, "num_input_tokens_seen": 110466152, "step": 163905 }, { "epoch": 4.004348569613759, "grad_norm": 0.001377705717459321, "learning_rate": 2.320197995564046e-07, "loss": 0.0, "num_input_tokens_seen": 110469544, "step": 163910 }, { "epoch": 4.004470720445606, "grad_norm": 0.0014387565897777677, "learning_rate": 2.3196518429918488e-07, "loss": 0.0534, "num_input_tokens_seen": 110473320, "step": 163915 }, { "epoch": 4.0045928712774534, "grad_norm": 0.0043650115840137005, "learning_rate": 2.319105746273211e-07, "loss": 0.0, "num_input_tokens_seen": 110476520, "step": 163920 }, { "epoch": 4.0047150221093, "grad_norm": 0.0008779270574450493, "learning_rate": 2.3185597054120999e-07, "loss": 0.0, "num_input_tokens_seen": 110479976, "step": 163925 }, { "epoch": 4.004837172941148, "grad_norm": 0.02200518734753132, "learning_rate": 2.3180137204124905e-07, "loss": 0.0, "num_input_tokens_seen": 110483496, "step": 163930 }, { "epoch": 4.004959323772995, "grad_norm": 0.09678073972463608, "learning_rate": 2.317467791278349e-07, "loss": 0.0, "num_input_tokens_seen": 110486632, "step": 163935 }, { "epoch": 4.005081474604842, "grad_norm": 0.003165224101394415, "learning_rate": 2.3169219180136513e-07, "loss": 0.0, "num_input_tokens_seen": 110490152, "step": 163940 }, { "epoch": 4.005203625436689, "grad_norm": 0.0009228963754139841, "learning_rate": 2.3163761006223616e-07, "loss": 0.0, "num_input_tokens_seen": 110494120, "step": 163945 }, { "epoch": 4.005325776268537, "grad_norm": 0.0017742099007591605, "learning_rate": 2.315830339108451e-07, "loss": 0.0, "num_input_tokens_seen": 110497896, "step": 163950 }, { "epoch": 4.005447927100383, "grad_norm": 0.00030024818261153996, "learning_rate": 2.3152846334758912e-07, "loss": 0.0, "num_input_tokens_seen": 110501032, "step": 163955 }, { "epoch": 4.005570077932231, "grad_norm": 0.0026951045729219913, "learning_rate": 2.314738983728647e-07, "loss": 0.0, "num_input_tokens_seen": 110504616, "step": 163960 }, { "epoch": 4.005692228764078, "grad_norm": 0.0005750693962909281, "learning_rate": 2.31419338987069e-07, "loss": 0.0, "num_input_tokens_seen": 110508200, "step": 163965 }, { "epoch": 4.005814379595925, "grad_norm": 0.0014873980544507504, "learning_rate": 2.3136478519059832e-07, "loss": 0.0, "num_input_tokens_seen": 110511720, "step": 163970 }, { "epoch": 4.005936530427772, "grad_norm": 0.0025685285218060017, "learning_rate": 2.3131023698384966e-07, "loss": 0.0002, "num_input_tokens_seen": 110516008, "step": 163975 }, { "epoch": 4.00605868125962, "grad_norm": 0.002139053773134947, "learning_rate": 2.3125569436721993e-07, "loss": 0.0, "num_input_tokens_seen": 110519016, "step": 163980 }, { "epoch": 4.0061808320914665, "grad_norm": 0.004334287252277136, "learning_rate": 2.3120115734110556e-07, "loss": 0.0, "num_input_tokens_seen": 110522408, "step": 163985 }, { "epoch": 4.006302982923314, "grad_norm": 0.0014206593623384833, "learning_rate": 2.3114662590590294e-07, "loss": 0.0, "num_input_tokens_seen": 110526056, "step": 163990 }, { "epoch": 4.006425133755161, "grad_norm": 0.0003952729166485369, "learning_rate": 2.310921000620092e-07, "loss": 0.0202, "num_input_tokens_seen": 110528936, "step": 163995 }, { "epoch": 4.006547284587008, "grad_norm": 0.020650498569011688, "learning_rate": 2.3103757980982042e-07, "loss": 0.0, "num_input_tokens_seen": 110532264, "step": 164000 }, { "epoch": 4.006669435418855, "grad_norm": 0.0008711630944162607, "learning_rate": 2.3098306514973287e-07, "loss": 0.0, "num_input_tokens_seen": 110535400, "step": 164005 }, { "epoch": 4.006791586250702, "grad_norm": 0.0007402771152555943, "learning_rate": 2.3092855608214345e-07, "loss": 0.0, "num_input_tokens_seen": 110538600, "step": 164010 }, { "epoch": 4.00691373708255, "grad_norm": 0.00012223249359522015, "learning_rate": 2.3087405260744852e-07, "loss": 0.0467, "num_input_tokens_seen": 110541928, "step": 164015 }, { "epoch": 4.007035887914396, "grad_norm": 0.0020083903800696135, "learning_rate": 2.3081955472604419e-07, "loss": 0.0, "num_input_tokens_seen": 110545192, "step": 164020 }, { "epoch": 4.007158038746244, "grad_norm": 0.006110670510679483, "learning_rate": 2.3076506243832727e-07, "loss": 0.0, "num_input_tokens_seen": 110548456, "step": 164025 }, { "epoch": 4.007280189578091, "grad_norm": 0.00020064758427906781, "learning_rate": 2.3071057574469332e-07, "loss": 0.0, "num_input_tokens_seen": 110551592, "step": 164030 }, { "epoch": 4.007402340409938, "grad_norm": 0.00031926928204484284, "learning_rate": 2.3065609464553937e-07, "loss": 0.0, "num_input_tokens_seen": 110554664, "step": 164035 }, { "epoch": 4.007524491241785, "grad_norm": 0.0028003801126033068, "learning_rate": 2.3060161914126086e-07, "loss": 0.0, "num_input_tokens_seen": 110558056, "step": 164040 }, { "epoch": 4.007646642073633, "grad_norm": 0.009324025362730026, "learning_rate": 2.305471492322544e-07, "loss": 0.0, "num_input_tokens_seen": 110561192, "step": 164045 }, { "epoch": 4.0077687929054795, "grad_norm": 0.002270081313326955, "learning_rate": 2.3049268491891615e-07, "loss": 0.0, "num_input_tokens_seen": 110565160, "step": 164050 }, { "epoch": 4.007890943737327, "grad_norm": 0.00206292187795043, "learning_rate": 2.3043822620164187e-07, "loss": 0.0003, "num_input_tokens_seen": 110568616, "step": 164055 }, { "epoch": 4.008013094569174, "grad_norm": 0.00012831814819946885, "learning_rate": 2.3038377308082812e-07, "loss": 0.0, "num_input_tokens_seen": 110572712, "step": 164060 }, { "epoch": 4.0081352454010215, "grad_norm": 0.0010039202170446515, "learning_rate": 2.3032932555687033e-07, "loss": 0.0003, "num_input_tokens_seen": 110576552, "step": 164065 }, { "epoch": 4.008257396232868, "grad_norm": 0.0008860706002451479, "learning_rate": 2.3027488363016458e-07, "loss": 0.0, "num_input_tokens_seen": 110579752, "step": 164070 }, { "epoch": 4.008379547064716, "grad_norm": 0.0006712638423778117, "learning_rate": 2.3022044730110723e-07, "loss": 0.0, "num_input_tokens_seen": 110583144, "step": 164075 }, { "epoch": 4.008501697896563, "grad_norm": 0.0006703397375531495, "learning_rate": 2.301660165700936e-07, "loss": 0.0002, "num_input_tokens_seen": 110586344, "step": 164080 }, { "epoch": 4.00862384872841, "grad_norm": 0.0009707873105071485, "learning_rate": 2.3011159143752e-07, "loss": 0.0, "num_input_tokens_seen": 110590120, "step": 164085 }, { "epoch": 4.008745999560257, "grad_norm": 0.0013939151540398598, "learning_rate": 2.300571719037817e-07, "loss": 0.0, "num_input_tokens_seen": 110593448, "step": 164090 }, { "epoch": 4.008868150392104, "grad_norm": 0.0066462913528084755, "learning_rate": 2.3000275796927504e-07, "loss": 0.0, "num_input_tokens_seen": 110597288, "step": 164095 }, { "epoch": 4.008990301223951, "grad_norm": 0.0016602538526058197, "learning_rate": 2.2994834963439547e-07, "loss": 0.0, "num_input_tokens_seen": 110600168, "step": 164100 }, { "epoch": 4.009112452055798, "grad_norm": 0.00018916718545369804, "learning_rate": 2.2989394689953824e-07, "loss": 0.0, "num_input_tokens_seen": 110603944, "step": 164105 }, { "epoch": 4.009234602887646, "grad_norm": 0.0003112444537691772, "learning_rate": 2.2983954976509967e-07, "loss": 0.0, "num_input_tokens_seen": 110606888, "step": 164110 }, { "epoch": 4.0093567537194925, "grad_norm": 0.001570153865031898, "learning_rate": 2.2978515823147481e-07, "loss": 0.0, "num_input_tokens_seen": 110610088, "step": 164115 }, { "epoch": 4.00947890455134, "grad_norm": 0.0008236741996370256, "learning_rate": 2.2973077229905967e-07, "loss": 0.0, "num_input_tokens_seen": 110613416, "step": 164120 }, { "epoch": 4.009601055383187, "grad_norm": 0.0006083215121179819, "learning_rate": 2.2967639196824928e-07, "loss": 0.0002, "num_input_tokens_seen": 110616488, "step": 164125 }, { "epoch": 4.0097232062150345, "grad_norm": 0.011240467429161072, "learning_rate": 2.296220172394394e-07, "loss": 0.0, "num_input_tokens_seen": 110619624, "step": 164130 }, { "epoch": 4.009845357046881, "grad_norm": 0.001044436008669436, "learning_rate": 2.2956764811302564e-07, "loss": 0.0, "num_input_tokens_seen": 110622952, "step": 164135 }, { "epoch": 4.009967507878729, "grad_norm": 0.0005551927606575191, "learning_rate": 2.295132845894029e-07, "loss": 0.058, "num_input_tokens_seen": 110626280, "step": 164140 }, { "epoch": 4.010089658710576, "grad_norm": 0.00011305078805889934, "learning_rate": 2.2945892666896705e-07, "loss": 0.0, "num_input_tokens_seen": 110630056, "step": 164145 }, { "epoch": 4.010211809542423, "grad_norm": 0.0020253423135727644, "learning_rate": 2.2940457435211292e-07, "loss": 0.0001, "num_input_tokens_seen": 110633320, "step": 164150 }, { "epoch": 4.01033396037427, "grad_norm": 0.000794794992543757, "learning_rate": 2.2935022763923618e-07, "loss": 0.0, "num_input_tokens_seen": 110636904, "step": 164155 }, { "epoch": 4.010456111206118, "grad_norm": 0.003235079115256667, "learning_rate": 2.2929588653073163e-07, "loss": 0.0, "num_input_tokens_seen": 110640104, "step": 164160 }, { "epoch": 4.010578262037964, "grad_norm": 0.00310494820587337, "learning_rate": 2.2924155102699472e-07, "loss": 0.0, "num_input_tokens_seen": 110643304, "step": 164165 }, { "epoch": 4.010700412869812, "grad_norm": 0.01030244305729866, "learning_rate": 2.2918722112842082e-07, "loss": 0.0, "num_input_tokens_seen": 110646568, "step": 164170 }, { "epoch": 4.010822563701659, "grad_norm": 0.0007277664844878018, "learning_rate": 2.291328968354045e-07, "loss": 0.0001, "num_input_tokens_seen": 110649896, "step": 164175 }, { "epoch": 4.0109447145335055, "grad_norm": 0.03205833584070206, "learning_rate": 2.2907857814834132e-07, "loss": 0.0, "num_input_tokens_seen": 110653160, "step": 164180 }, { "epoch": 4.011066865365353, "grad_norm": 0.01856006681919098, "learning_rate": 2.2902426506762574e-07, "loss": 0.0, "num_input_tokens_seen": 110656552, "step": 164185 }, { "epoch": 4.0111890161972, "grad_norm": 0.0006459152791649103, "learning_rate": 2.2896995759365344e-07, "loss": 0.0, "num_input_tokens_seen": 110659688, "step": 164190 }, { "epoch": 4.0113111670290476, "grad_norm": 0.00521416962146759, "learning_rate": 2.289156557268187e-07, "loss": 0.0, "num_input_tokens_seen": 110663144, "step": 164195 }, { "epoch": 4.011433317860894, "grad_norm": 0.0003529897076077759, "learning_rate": 2.2886135946751706e-07, "loss": 0.0, "num_input_tokens_seen": 110666152, "step": 164200 }, { "epoch": 4.011555468692742, "grad_norm": 0.016934869810938835, "learning_rate": 2.2880706881614298e-07, "loss": 0.0, "num_input_tokens_seen": 110669352, "step": 164205 }, { "epoch": 4.011677619524589, "grad_norm": 0.001921301824040711, "learning_rate": 2.2875278377309093e-07, "loss": 0.0, "num_input_tokens_seen": 110673064, "step": 164210 }, { "epoch": 4.011799770356436, "grad_norm": 0.005222341511398554, "learning_rate": 2.2869850433875648e-07, "loss": 0.0, "num_input_tokens_seen": 110676328, "step": 164215 }, { "epoch": 4.011921921188283, "grad_norm": 0.000702301156707108, "learning_rate": 2.2864423051353366e-07, "loss": 0.0, "num_input_tokens_seen": 110679784, "step": 164220 }, { "epoch": 4.012044072020131, "grad_norm": 0.0010769871296361089, "learning_rate": 2.2858996229781745e-07, "loss": 0.0, "num_input_tokens_seen": 110682856, "step": 164225 }, { "epoch": 4.012166222851977, "grad_norm": 0.011183716356754303, "learning_rate": 2.285356996920028e-07, "loss": 0.0, "num_input_tokens_seen": 110686248, "step": 164230 }, { "epoch": 4.012288373683825, "grad_norm": 0.0006547856028191745, "learning_rate": 2.2848144269648373e-07, "loss": 0.0, "num_input_tokens_seen": 110689960, "step": 164235 }, { "epoch": 4.012410524515672, "grad_norm": 0.03447788953781128, "learning_rate": 2.2842719131165544e-07, "loss": 0.0235, "num_input_tokens_seen": 110693672, "step": 164240 }, { "epoch": 4.0125326753475195, "grad_norm": 0.0019113948801532388, "learning_rate": 2.2837294553791186e-07, "loss": 0.0, "num_input_tokens_seen": 110696936, "step": 164245 }, { "epoch": 4.012654826179366, "grad_norm": 0.006821444723755121, "learning_rate": 2.2831870537564814e-07, "loss": 0.0, "num_input_tokens_seen": 110700328, "step": 164250 }, { "epoch": 4.012776977011214, "grad_norm": 0.00023176280956249684, "learning_rate": 2.282644708252579e-07, "loss": 0.0, "num_input_tokens_seen": 110703720, "step": 164255 }, { "epoch": 4.012899127843061, "grad_norm": 0.007748324424028397, "learning_rate": 2.2821024188713612e-07, "loss": 0.0, "num_input_tokens_seen": 110707240, "step": 164260 }, { "epoch": 4.013021278674908, "grad_norm": 0.00011586442997213453, "learning_rate": 2.2815601856167722e-07, "loss": 0.0, "num_input_tokens_seen": 110710568, "step": 164265 }, { "epoch": 4.013143429506755, "grad_norm": 0.0004410957044456154, "learning_rate": 2.281018008492751e-07, "loss": 0.0, "num_input_tokens_seen": 110714280, "step": 164270 }, { "epoch": 4.013265580338602, "grad_norm": 0.0011323723010718822, "learning_rate": 2.280475887503247e-07, "loss": 0.0, "num_input_tokens_seen": 110717736, "step": 164275 }, { "epoch": 4.013387731170449, "grad_norm": 0.018920201808214188, "learning_rate": 2.2799338226521947e-07, "loss": 0.0, "num_input_tokens_seen": 110720872, "step": 164280 }, { "epoch": 4.013509882002296, "grad_norm": 0.002477227710187435, "learning_rate": 2.279391813943541e-07, "loss": 0.0, "num_input_tokens_seen": 110724264, "step": 164285 }, { "epoch": 4.013632032834144, "grad_norm": 0.010635924525558949, "learning_rate": 2.2788498613812279e-07, "loss": 0.0, "num_input_tokens_seen": 110728552, "step": 164290 }, { "epoch": 4.0137541836659905, "grad_norm": 0.0006659305654466152, "learning_rate": 2.278307964969196e-07, "loss": 0.0, "num_input_tokens_seen": 110732136, "step": 164295 }, { "epoch": 4.013876334497838, "grad_norm": 8.711554983165115e-05, "learning_rate": 2.2777661247113832e-07, "loss": 0.0, "num_input_tokens_seen": 110735400, "step": 164300 }, { "epoch": 4.013998485329685, "grad_norm": 0.0004100829246453941, "learning_rate": 2.2772243406117353e-07, "loss": 0.0, "num_input_tokens_seen": 110738920, "step": 164305 }, { "epoch": 4.0141206361615325, "grad_norm": 0.001897158334031701, "learning_rate": 2.2766826126741877e-07, "loss": 0.0, "num_input_tokens_seen": 110742376, "step": 164310 }, { "epoch": 4.014242786993379, "grad_norm": 0.0074745300225913525, "learning_rate": 2.27614094090268e-07, "loss": 0.0, "num_input_tokens_seen": 110745576, "step": 164315 }, { "epoch": 4.014364937825227, "grad_norm": 0.0002957666292786598, "learning_rate": 2.275599325301153e-07, "loss": 0.0, "num_input_tokens_seen": 110748968, "step": 164320 }, { "epoch": 4.014487088657074, "grad_norm": 0.008016318082809448, "learning_rate": 2.275057765873547e-07, "loss": 0.0, "num_input_tokens_seen": 110752232, "step": 164325 }, { "epoch": 4.014609239488921, "grad_norm": 0.001003736280836165, "learning_rate": 2.274516262623797e-07, "loss": 0.0004, "num_input_tokens_seen": 110755624, "step": 164330 }, { "epoch": 4.014731390320768, "grad_norm": 0.00033409715979360044, "learning_rate": 2.2739748155558448e-07, "loss": 0.0, "num_input_tokens_seen": 110758760, "step": 164335 }, { "epoch": 4.014853541152616, "grad_norm": 0.0016860624309629202, "learning_rate": 2.273433424673622e-07, "loss": 0.0, "num_input_tokens_seen": 110761960, "step": 164340 }, { "epoch": 4.014975691984462, "grad_norm": 0.0037533806171268225, "learning_rate": 2.2728920899810734e-07, "loss": 0.0, "num_input_tokens_seen": 110765160, "step": 164345 }, { "epoch": 4.01509784281631, "grad_norm": 0.12728959321975708, "learning_rate": 2.27235081148213e-07, "loss": 0.0001, "num_input_tokens_seen": 110768616, "step": 164350 }, { "epoch": 4.015219993648157, "grad_norm": 0.0001371447870042175, "learning_rate": 2.2718095891807287e-07, "loss": 0.0, "num_input_tokens_seen": 110771880, "step": 164355 }, { "epoch": 4.0153421444800035, "grad_norm": 0.00028730809572152793, "learning_rate": 2.27126842308081e-07, "loss": 0.0, "num_input_tokens_seen": 110775272, "step": 164360 }, { "epoch": 4.015464295311851, "grad_norm": 0.0014190376969054341, "learning_rate": 2.2707273131863025e-07, "loss": 0.0001, "num_input_tokens_seen": 110778216, "step": 164365 }, { "epoch": 4.015586446143698, "grad_norm": 0.008561410009860992, "learning_rate": 2.270186259501149e-07, "loss": 0.0, "num_input_tokens_seen": 110781608, "step": 164370 }, { "epoch": 4.0157085969755455, "grad_norm": 0.0006715949275530875, "learning_rate": 2.269645262029276e-07, "loss": 0.0, "num_input_tokens_seen": 110784872, "step": 164375 }, { "epoch": 4.015830747807392, "grad_norm": 0.0041750771924853325, "learning_rate": 2.269104320774623e-07, "loss": 0.0, "num_input_tokens_seen": 110788392, "step": 164380 }, { "epoch": 4.01595289863924, "grad_norm": 0.007014765869826078, "learning_rate": 2.2685634357411242e-07, "loss": 0.0, "num_input_tokens_seen": 110791720, "step": 164385 }, { "epoch": 4.016075049471087, "grad_norm": 0.0028299784753471613, "learning_rate": 2.2680226069327102e-07, "loss": 0.0, "num_input_tokens_seen": 110794792, "step": 164390 }, { "epoch": 4.016197200302934, "grad_norm": 0.0004960571532137692, "learning_rate": 2.2674818343533175e-07, "loss": 0.0, "num_input_tokens_seen": 110798184, "step": 164395 }, { "epoch": 4.016319351134781, "grad_norm": 0.008951370604336262, "learning_rate": 2.2669411180068742e-07, "loss": 0.0, "num_input_tokens_seen": 110801256, "step": 164400 }, { "epoch": 4.016441501966629, "grad_norm": 0.004213410429656506, "learning_rate": 2.2664004578973173e-07, "loss": 0.0, "num_input_tokens_seen": 110804712, "step": 164405 }, { "epoch": 4.016563652798475, "grad_norm": 0.00019606153364293277, "learning_rate": 2.2658598540285767e-07, "loss": 0.0, "num_input_tokens_seen": 110807720, "step": 164410 }, { "epoch": 4.016685803630323, "grad_norm": 0.0010281888535246253, "learning_rate": 2.2653193064045807e-07, "loss": 0.0, "num_input_tokens_seen": 110811048, "step": 164415 }, { "epoch": 4.01680795446217, "grad_norm": 0.0025090575218200684, "learning_rate": 2.2647788150292657e-07, "loss": 0.0, "num_input_tokens_seen": 110814440, "step": 164420 }, { "epoch": 4.016930105294017, "grad_norm": 0.0008665485656820238, "learning_rate": 2.2642383799065578e-07, "loss": 0.0, "num_input_tokens_seen": 110817640, "step": 164425 }, { "epoch": 4.017052256125864, "grad_norm": 0.009751011617481709, "learning_rate": 2.2636980010403904e-07, "loss": 0.0, "num_input_tokens_seen": 110821096, "step": 164430 }, { "epoch": 4.017174406957712, "grad_norm": 0.005670327227562666, "learning_rate": 2.2631576784346906e-07, "loss": 0.0, "num_input_tokens_seen": 110824616, "step": 164435 }, { "epoch": 4.0172965577895585, "grad_norm": 0.0025142088998109102, "learning_rate": 2.2626174120933882e-07, "loss": 0.0, "num_input_tokens_seen": 110828008, "step": 164440 }, { "epoch": 4.017418708621406, "grad_norm": 0.0021428910549730062, "learning_rate": 2.262077202020416e-07, "loss": 0.0001, "num_input_tokens_seen": 110831144, "step": 164445 }, { "epoch": 4.017540859453253, "grad_norm": 0.00027007071184925735, "learning_rate": 2.261537048219697e-07, "loss": 0.0, "num_input_tokens_seen": 110834216, "step": 164450 }, { "epoch": 4.0176630102851, "grad_norm": 0.011947103776037693, "learning_rate": 2.2609969506951655e-07, "loss": 0.0, "num_input_tokens_seen": 110837352, "step": 164455 }, { "epoch": 4.017785161116947, "grad_norm": 8.05276504252106e-05, "learning_rate": 2.260456909450742e-07, "loss": 0.0, "num_input_tokens_seen": 110840808, "step": 164460 }, { "epoch": 4.017907311948794, "grad_norm": 0.00022645792341791093, "learning_rate": 2.2599169244903604e-07, "loss": 0.0, "num_input_tokens_seen": 110843752, "step": 164465 }, { "epoch": 4.018029462780642, "grad_norm": 0.0010104464599862695, "learning_rate": 2.259376995817942e-07, "loss": 0.0, "num_input_tokens_seen": 110847016, "step": 164470 }, { "epoch": 4.018151613612488, "grad_norm": 0.0003966574731748551, "learning_rate": 2.258837123437416e-07, "loss": 0.0, "num_input_tokens_seen": 110850280, "step": 164475 }, { "epoch": 4.018273764444336, "grad_norm": 0.00997732114046812, "learning_rate": 2.258297307352711e-07, "loss": 0.0, "num_input_tokens_seen": 110853224, "step": 164480 }, { "epoch": 4.018395915276183, "grad_norm": 0.0007848578388802707, "learning_rate": 2.257757547567748e-07, "loss": 0.0, "num_input_tokens_seen": 110857000, "step": 164485 }, { "epoch": 4.01851806610803, "grad_norm": 0.000499740825034678, "learning_rate": 2.2572178440864575e-07, "loss": 0.0, "num_input_tokens_seen": 110860136, "step": 164490 }, { "epoch": 4.018640216939877, "grad_norm": 0.0006135724834166467, "learning_rate": 2.256678196912758e-07, "loss": 0.0002, "num_input_tokens_seen": 110863080, "step": 164495 }, { "epoch": 4.018762367771725, "grad_norm": 0.00032030860893428326, "learning_rate": 2.2561386060505805e-07, "loss": 0.0, "num_input_tokens_seen": 110866664, "step": 164500 }, { "epoch": 4.0188845186035715, "grad_norm": 0.001445369329303503, "learning_rate": 2.2555990715038432e-07, "loss": 0.0, "num_input_tokens_seen": 110869480, "step": 164505 }, { "epoch": 4.019006669435419, "grad_norm": 0.004807790741324425, "learning_rate": 2.255059593276476e-07, "loss": 0.0, "num_input_tokens_seen": 110872296, "step": 164510 }, { "epoch": 4.019128820267266, "grad_norm": 0.0006840810528956354, "learning_rate": 2.254520171372397e-07, "loss": 0.0, "num_input_tokens_seen": 110876072, "step": 164515 }, { "epoch": 4.019250971099114, "grad_norm": 0.00278223748318851, "learning_rate": 2.253980805795529e-07, "loss": 0.0, "num_input_tokens_seen": 110879720, "step": 164520 }, { "epoch": 4.01937312193096, "grad_norm": 0.00030470662750303745, "learning_rate": 2.2534414965497984e-07, "loss": 0.0, "num_input_tokens_seen": 110882920, "step": 164525 }, { "epoch": 4.019495272762808, "grad_norm": 0.0013142710085958242, "learning_rate": 2.2529022436391221e-07, "loss": 0.0, "num_input_tokens_seen": 110886568, "step": 164530 }, { "epoch": 4.019617423594655, "grad_norm": 0.0029207654297351837, "learning_rate": 2.2523630470674238e-07, "loss": 0.0, "num_input_tokens_seen": 110889832, "step": 164535 }, { "epoch": 4.019739574426501, "grad_norm": 0.07291989028453827, "learning_rate": 2.251823906838629e-07, "loss": 0.0, "num_input_tokens_seen": 110893672, "step": 164540 }, { "epoch": 4.019861725258349, "grad_norm": 0.005791286937892437, "learning_rate": 2.2512848229566517e-07, "loss": 0.0, "num_input_tokens_seen": 110897384, "step": 164545 }, { "epoch": 4.019983876090196, "grad_norm": 0.005201366264373064, "learning_rate": 2.2507457954254173e-07, "loss": 0.0, "num_input_tokens_seen": 110900584, "step": 164550 }, { "epoch": 4.020106026922043, "grad_norm": 0.0014897704822942615, "learning_rate": 2.2502068242488414e-07, "loss": 0.0, "num_input_tokens_seen": 110903656, "step": 164555 }, { "epoch": 4.02022817775389, "grad_norm": 0.0009115237044170499, "learning_rate": 2.249667909430849e-07, "loss": 0.0, "num_input_tokens_seen": 110906984, "step": 164560 }, { "epoch": 4.020350328585738, "grad_norm": 0.0010413050185889006, "learning_rate": 2.2491290509753536e-07, "loss": 0.0, "num_input_tokens_seen": 110910376, "step": 164565 }, { "epoch": 4.020472479417585, "grad_norm": 0.00037109790719114244, "learning_rate": 2.2485902488862763e-07, "loss": 0.0, "num_input_tokens_seen": 110913384, "step": 164570 }, { "epoch": 4.020594630249432, "grad_norm": 0.0031259816605597734, "learning_rate": 2.2480515031675384e-07, "loss": 0.0, "num_input_tokens_seen": 110917224, "step": 164575 }, { "epoch": 4.020716781081279, "grad_norm": 0.00027198364841751754, "learning_rate": 2.2475128138230516e-07, "loss": 0.0, "num_input_tokens_seen": 110920360, "step": 164580 }, { "epoch": 4.020838931913127, "grad_norm": 0.25161880254745483, "learning_rate": 2.24697418085674e-07, "loss": 0.0, "num_input_tokens_seen": 110923496, "step": 164585 }, { "epoch": 4.020961082744973, "grad_norm": 0.015751512721180916, "learning_rate": 2.2464356042725152e-07, "loss": 0.0, "num_input_tokens_seen": 110926888, "step": 164590 }, { "epoch": 4.021083233576821, "grad_norm": 0.012762527912855148, "learning_rate": 2.2458970840742976e-07, "loss": 0.0, "num_input_tokens_seen": 110930152, "step": 164595 }, { "epoch": 4.021205384408668, "grad_norm": 0.020243745297193527, "learning_rate": 2.2453586202660003e-07, "loss": 0.0001, "num_input_tokens_seen": 110933672, "step": 164600 }, { "epoch": 4.021327535240515, "grad_norm": 0.0001934144675033167, "learning_rate": 2.2448202128515436e-07, "loss": 0.0, "num_input_tokens_seen": 110936936, "step": 164605 }, { "epoch": 4.021449686072362, "grad_norm": 0.0003170445270370692, "learning_rate": 2.2442818618348368e-07, "loss": 0.0, "num_input_tokens_seen": 110939816, "step": 164610 }, { "epoch": 4.02157183690421, "grad_norm": 0.00047657734830863774, "learning_rate": 2.2437435672198014e-07, "loss": 0.0, "num_input_tokens_seen": 110943272, "step": 164615 }, { "epoch": 4.0216939877360565, "grad_norm": 0.005242410581558943, "learning_rate": 2.243205329010349e-07, "loss": 0.0, "num_input_tokens_seen": 110946408, "step": 164620 }, { "epoch": 4.021816138567903, "grad_norm": 0.0006164037040434778, "learning_rate": 2.242667147210392e-07, "loss": 0.0002, "num_input_tokens_seen": 110949864, "step": 164625 }, { "epoch": 4.021938289399751, "grad_norm": 0.00393176032230258, "learning_rate": 2.2421290218238442e-07, "loss": 0.0, "num_input_tokens_seen": 110953832, "step": 164630 }, { "epoch": 4.022060440231598, "grad_norm": 0.00013939302880316973, "learning_rate": 2.241590952854625e-07, "loss": 0.0, "num_input_tokens_seen": 110956968, "step": 164635 }, { "epoch": 4.022182591063445, "grad_norm": 0.0005854598130099475, "learning_rate": 2.241052940306639e-07, "loss": 0.0, "num_input_tokens_seen": 110960168, "step": 164640 }, { "epoch": 4.022304741895292, "grad_norm": 0.00018355067004449666, "learning_rate": 2.2405149841838068e-07, "loss": 0.0, "num_input_tokens_seen": 110963496, "step": 164645 }, { "epoch": 4.02242689272714, "grad_norm": 0.0023532859049737453, "learning_rate": 2.2399770844900334e-07, "loss": 0.0, "num_input_tokens_seen": 110966632, "step": 164650 }, { "epoch": 4.022549043558986, "grad_norm": 0.0013373279944062233, "learning_rate": 2.2394392412292372e-07, "loss": 0.0, "num_input_tokens_seen": 110969896, "step": 164655 }, { "epoch": 4.022671194390834, "grad_norm": 0.8230629563331604, "learning_rate": 2.238901454405323e-07, "loss": 0.0005, "num_input_tokens_seen": 110973352, "step": 164660 }, { "epoch": 4.022793345222681, "grad_norm": 0.0009062674944289029, "learning_rate": 2.2383637240222052e-07, "loss": 0.0, "num_input_tokens_seen": 110976680, "step": 164665 }, { "epoch": 4.022915496054528, "grad_norm": 0.0002744555240496993, "learning_rate": 2.2378260500837965e-07, "loss": 0.0, "num_input_tokens_seen": 110980136, "step": 164670 }, { "epoch": 4.023037646886375, "grad_norm": 0.00032696948619559407, "learning_rate": 2.2372884325940013e-07, "loss": 0.0, "num_input_tokens_seen": 110983400, "step": 164675 }, { "epoch": 4.023159797718223, "grad_norm": 0.0033456883393228054, "learning_rate": 2.2367508715567364e-07, "loss": 0.0, "num_input_tokens_seen": 110987240, "step": 164680 }, { "epoch": 4.0232819485500695, "grad_norm": 0.002025763038545847, "learning_rate": 2.236213366975903e-07, "loss": 0.0, "num_input_tokens_seen": 110990568, "step": 164685 }, { "epoch": 4.023404099381917, "grad_norm": 0.029058869928121567, "learning_rate": 2.2356759188554153e-07, "loss": 0.0, "num_input_tokens_seen": 110994024, "step": 164690 }, { "epoch": 4.023526250213764, "grad_norm": 0.01186623889952898, "learning_rate": 2.235138527199184e-07, "loss": 0.0, "num_input_tokens_seen": 110997096, "step": 164695 }, { "epoch": 4.0236484010456115, "grad_norm": 0.007000217214226723, "learning_rate": 2.2346011920111095e-07, "loss": 0.0, "num_input_tokens_seen": 111000744, "step": 164700 }, { "epoch": 4.023770551877458, "grad_norm": 0.0017543296562507749, "learning_rate": 2.2340639132951077e-07, "loss": 0.0, "num_input_tokens_seen": 111004008, "step": 164705 }, { "epoch": 4.023892702709306, "grad_norm": 0.0025902055203914642, "learning_rate": 2.2335266910550787e-07, "loss": 0.0, "num_input_tokens_seen": 111007528, "step": 164710 }, { "epoch": 4.024014853541153, "grad_norm": 0.0019935499876737595, "learning_rate": 2.2329895252949348e-07, "loss": 0.0, "num_input_tokens_seen": 111010600, "step": 164715 }, { "epoch": 4.024137004372999, "grad_norm": 0.0051970952190458775, "learning_rate": 2.2324524160185808e-07, "loss": 0.0, "num_input_tokens_seen": 111014696, "step": 164720 }, { "epoch": 4.024259155204847, "grad_norm": 0.0010366010246798396, "learning_rate": 2.2319153632299192e-07, "loss": 0.0698, "num_input_tokens_seen": 111017832, "step": 164725 }, { "epoch": 4.024381306036694, "grad_norm": 0.0009999765316024423, "learning_rate": 2.2313783669328613e-07, "loss": 0.0, "num_input_tokens_seen": 111020968, "step": 164730 }, { "epoch": 4.024503456868541, "grad_norm": 0.00013882630446460098, "learning_rate": 2.230841427131307e-07, "loss": 0.0, "num_input_tokens_seen": 111024232, "step": 164735 }, { "epoch": 4.024625607700388, "grad_norm": 0.0006216170149855316, "learning_rate": 2.2303045438291656e-07, "loss": 0.0, "num_input_tokens_seen": 111027816, "step": 164740 }, { "epoch": 4.024747758532236, "grad_norm": 0.0018986144568771124, "learning_rate": 2.2297677170303363e-07, "loss": 0.0, "num_input_tokens_seen": 111030568, "step": 164745 }, { "epoch": 4.0248699093640825, "grad_norm": 0.0037460937164723873, "learning_rate": 2.2292309467387293e-07, "loss": 0.0, "num_input_tokens_seen": 111033576, "step": 164750 }, { "epoch": 4.02499206019593, "grad_norm": 0.002591182477772236, "learning_rate": 2.2286942329582425e-07, "loss": 0.0, "num_input_tokens_seen": 111037352, "step": 164755 }, { "epoch": 4.025114211027777, "grad_norm": 0.002688974840566516, "learning_rate": 2.2281575756927818e-07, "loss": 0.0, "num_input_tokens_seen": 111040488, "step": 164760 }, { "epoch": 4.0252363618596245, "grad_norm": 0.0012876774417236447, "learning_rate": 2.2276209749462516e-07, "loss": 0.0, "num_input_tokens_seen": 111043752, "step": 164765 }, { "epoch": 4.025358512691471, "grad_norm": 0.00019919118494726717, "learning_rate": 2.227084430722549e-07, "loss": 0.0, "num_input_tokens_seen": 111047080, "step": 164770 }, { "epoch": 4.025480663523319, "grad_norm": 0.00011809620627900586, "learning_rate": 2.226547943025583e-07, "loss": 0.0, "num_input_tokens_seen": 111050216, "step": 164775 }, { "epoch": 4.025602814355166, "grad_norm": 0.00037095736479386687, "learning_rate": 2.226011511859247e-07, "loss": 0.0, "num_input_tokens_seen": 111053416, "step": 164780 }, { "epoch": 4.025724965187013, "grad_norm": 0.0015142523916438222, "learning_rate": 2.2254751372274473e-07, "loss": 0.0, "num_input_tokens_seen": 111057000, "step": 164785 }, { "epoch": 4.02584711601886, "grad_norm": 0.005019500385969877, "learning_rate": 2.2249388191340857e-07, "loss": 0.0, "num_input_tokens_seen": 111060264, "step": 164790 }, { "epoch": 4.025969266850708, "grad_norm": 0.0001835352013586089, "learning_rate": 2.2244025575830582e-07, "loss": 0.0, "num_input_tokens_seen": 111063336, "step": 164795 }, { "epoch": 4.026091417682554, "grad_norm": 0.0005110033671371639, "learning_rate": 2.2238663525782687e-07, "loss": 0.0, "num_input_tokens_seen": 111066728, "step": 164800 }, { "epoch": 4.026213568514401, "grad_norm": 0.0037860642187297344, "learning_rate": 2.2233302041236124e-07, "loss": 0.0, "num_input_tokens_seen": 111070248, "step": 164805 }, { "epoch": 4.026335719346249, "grad_norm": 0.000749050872400403, "learning_rate": 2.222794112222993e-07, "loss": 0.0, "num_input_tokens_seen": 111073256, "step": 164810 }, { "epoch": 4.0264578701780955, "grad_norm": 0.008735577575862408, "learning_rate": 2.2222580768803045e-07, "loss": 0.0, "num_input_tokens_seen": 111076264, "step": 164815 }, { "epoch": 4.026580021009943, "grad_norm": 6.568283879460068e-06, "learning_rate": 2.221722098099449e-07, "loss": 0.0, "num_input_tokens_seen": 111080040, "step": 164820 }, { "epoch": 4.02670217184179, "grad_norm": 0.0029867016710340977, "learning_rate": 2.221186175884323e-07, "loss": 0.0, "num_input_tokens_seen": 111083240, "step": 164825 }, { "epoch": 4.0268243226736375, "grad_norm": 0.0010198794770985842, "learning_rate": 2.2206503102388207e-07, "loss": 0.0, "num_input_tokens_seen": 111086184, "step": 164830 }, { "epoch": 4.026946473505484, "grad_norm": 0.0011232220567762852, "learning_rate": 2.2201145011668443e-07, "loss": 0.0, "num_input_tokens_seen": 111089320, "step": 164835 }, { "epoch": 4.027068624337332, "grad_norm": 0.000603526714257896, "learning_rate": 2.219578748672285e-07, "loss": 0.0, "num_input_tokens_seen": 111092648, "step": 164840 }, { "epoch": 4.027190775169179, "grad_norm": 0.0009548215894028544, "learning_rate": 2.219043052759042e-07, "loss": 0.0, "num_input_tokens_seen": 111096104, "step": 164845 }, { "epoch": 4.027312926001026, "grad_norm": 0.049355845898389816, "learning_rate": 2.2185074134310134e-07, "loss": 0.0, "num_input_tokens_seen": 111099304, "step": 164850 }, { "epoch": 4.027435076832873, "grad_norm": 0.013851494528353214, "learning_rate": 2.217971830692089e-07, "loss": 0.0, "num_input_tokens_seen": 111102824, "step": 164855 }, { "epoch": 4.027557227664721, "grad_norm": 0.0009064696496352553, "learning_rate": 2.2174363045461697e-07, "loss": 0.0, "num_input_tokens_seen": 111106280, "step": 164860 }, { "epoch": 4.027679378496567, "grad_norm": 0.0007289585773833096, "learning_rate": 2.216900834997143e-07, "loss": 0.0, "num_input_tokens_seen": 111109416, "step": 164865 }, { "epoch": 4.027801529328415, "grad_norm": 0.001933575258590281, "learning_rate": 2.2163654220489102e-07, "loss": 0.0, "num_input_tokens_seen": 111112744, "step": 164870 }, { "epoch": 4.027923680160262, "grad_norm": 0.0033129544463008642, "learning_rate": 2.2158300657053596e-07, "loss": 0.0, "num_input_tokens_seen": 111116328, "step": 164875 }, { "epoch": 4.028045830992109, "grad_norm": 0.0009052801760844886, "learning_rate": 2.215294765970386e-07, "loss": 0.0, "num_input_tokens_seen": 111119272, "step": 164880 }, { "epoch": 4.028167981823956, "grad_norm": 5.468447125167586e-05, "learning_rate": 2.2147595228478844e-07, "loss": 0.0, "num_input_tokens_seen": 111122600, "step": 164885 }, { "epoch": 4.028290132655804, "grad_norm": 0.0022210185416042805, "learning_rate": 2.2142243363417446e-07, "loss": 0.0002, "num_input_tokens_seen": 111126184, "step": 164890 }, { "epoch": 4.028412283487651, "grad_norm": 0.0010133266914635897, "learning_rate": 2.213689206455861e-07, "loss": 0.0, "num_input_tokens_seen": 111129256, "step": 164895 }, { "epoch": 4.028534434319497, "grad_norm": 0.00019960623467341065, "learning_rate": 2.2131541331941216e-07, "loss": 0.0, "num_input_tokens_seen": 111132136, "step": 164900 }, { "epoch": 4.028656585151345, "grad_norm": 195.2945098876953, "learning_rate": 2.2126191165604214e-07, "loss": 0.0464, "num_input_tokens_seen": 111135528, "step": 164905 }, { "epoch": 4.028778735983192, "grad_norm": 4.072034789714962e-05, "learning_rate": 2.2120841565586479e-07, "loss": 0.0, "num_input_tokens_seen": 111138792, "step": 164910 }, { "epoch": 4.028900886815039, "grad_norm": 0.021570960059762, "learning_rate": 2.211549253192696e-07, "loss": 0.0, "num_input_tokens_seen": 111141864, "step": 164915 }, { "epoch": 4.029023037646886, "grad_norm": 0.002378236735239625, "learning_rate": 2.2110144064664493e-07, "loss": 0.0, "num_input_tokens_seen": 111145320, "step": 164920 }, { "epoch": 4.029145188478734, "grad_norm": 0.0004563156981021166, "learning_rate": 2.2104796163838036e-07, "loss": 0.0, "num_input_tokens_seen": 111148264, "step": 164925 }, { "epoch": 4.0292673393105805, "grad_norm": 0.008762065321207047, "learning_rate": 2.2099448829486455e-07, "loss": 0.0, "num_input_tokens_seen": 111152168, "step": 164930 }, { "epoch": 4.029389490142428, "grad_norm": 0.003576488932594657, "learning_rate": 2.2094102061648613e-07, "loss": 0.0, "num_input_tokens_seen": 111155432, "step": 164935 }, { "epoch": 4.029511640974275, "grad_norm": 0.003416407387703657, "learning_rate": 2.2088755860363406e-07, "loss": 0.0, "num_input_tokens_seen": 111158952, "step": 164940 }, { "epoch": 4.0296337918061225, "grad_norm": 0.01084967702627182, "learning_rate": 2.2083410225669752e-07, "loss": 0.0, "num_input_tokens_seen": 111162472, "step": 164945 }, { "epoch": 4.029755942637969, "grad_norm": 0.0006958711892366409, "learning_rate": 2.2078065157606473e-07, "loss": 0.0, "num_input_tokens_seen": 111165928, "step": 164950 }, { "epoch": 4.029878093469817, "grad_norm": 0.003058177651837468, "learning_rate": 2.2072720656212483e-07, "loss": 0.0, "num_input_tokens_seen": 111168936, "step": 164955 }, { "epoch": 4.030000244301664, "grad_norm": 0.0013698239345103502, "learning_rate": 2.206737672152661e-07, "loss": 0.0, "num_input_tokens_seen": 111172456, "step": 164960 }, { "epoch": 4.030122395133511, "grad_norm": 0.00022180116502568126, "learning_rate": 2.206203335358776e-07, "loss": 0.0, "num_input_tokens_seen": 111176040, "step": 164965 }, { "epoch": 4.030244545965358, "grad_norm": 0.0037178494967520237, "learning_rate": 2.2056690552434732e-07, "loss": 0.0, "num_input_tokens_seen": 111179176, "step": 164970 }, { "epoch": 4.030366696797206, "grad_norm": 0.0011469481978565454, "learning_rate": 2.2051348318106421e-07, "loss": 0.0, "num_input_tokens_seen": 111182376, "step": 164975 }, { "epoch": 4.030488847629052, "grad_norm": 0.007485619746148586, "learning_rate": 2.2046006650641692e-07, "loss": 0.0, "num_input_tokens_seen": 111185448, "step": 164980 }, { "epoch": 4.030610998460899, "grad_norm": 0.00013309967471286654, "learning_rate": 2.204066555007935e-07, "loss": 0.0, "num_input_tokens_seen": 111188328, "step": 164985 }, { "epoch": 4.030733149292747, "grad_norm": 0.0012170979753136635, "learning_rate": 2.2035325016458273e-07, "loss": 0.0, "num_input_tokens_seen": 111191528, "step": 164990 }, { "epoch": 4.0308553001245935, "grad_norm": 0.0009001771104522049, "learning_rate": 2.2029985049817268e-07, "loss": 0.0, "num_input_tokens_seen": 111194984, "step": 164995 }, { "epoch": 4.030977450956441, "grad_norm": 0.0003405268071219325, "learning_rate": 2.2024645650195174e-07, "loss": 0.0, "num_input_tokens_seen": 111198440, "step": 165000 }, { "epoch": 4.031099601788288, "grad_norm": 0.0001786830252967775, "learning_rate": 2.2019306817630856e-07, "loss": 0.0, "num_input_tokens_seen": 111201704, "step": 165005 }, { "epoch": 4.0312217526201355, "grad_norm": 0.00017067190492525697, "learning_rate": 2.2013968552163098e-07, "loss": 0.0, "num_input_tokens_seen": 111205224, "step": 165010 }, { "epoch": 4.031343903451982, "grad_norm": 0.0004245521849952638, "learning_rate": 2.2008630853830755e-07, "loss": 0.0, "num_input_tokens_seen": 111208296, "step": 165015 }, { "epoch": 4.03146605428383, "grad_norm": 0.248831644654274, "learning_rate": 2.20032937226726e-07, "loss": 0.0003, "num_input_tokens_seen": 111211816, "step": 165020 }, { "epoch": 4.031588205115677, "grad_norm": 0.000578263308852911, "learning_rate": 2.19979571587275e-07, "loss": 0.0, "num_input_tokens_seen": 111215016, "step": 165025 }, { "epoch": 4.031710355947524, "grad_norm": 0.00015196092135738581, "learning_rate": 2.1992621162034232e-07, "loss": 0.0, "num_input_tokens_seen": 111218408, "step": 165030 }, { "epoch": 4.031832506779371, "grad_norm": 0.005513553041964769, "learning_rate": 2.1987285732631577e-07, "loss": 0.0, "num_input_tokens_seen": 111221544, "step": 165035 }, { "epoch": 4.031954657611219, "grad_norm": 8.522966527380049e-05, "learning_rate": 2.1981950870558385e-07, "loss": 0.0, "num_input_tokens_seen": 111226088, "step": 165040 }, { "epoch": 4.032076808443065, "grad_norm": 0.00015558319864794612, "learning_rate": 2.1976616575853412e-07, "loss": 0.0, "num_input_tokens_seen": 111229608, "step": 165045 }, { "epoch": 4.032198959274913, "grad_norm": 0.03268592432141304, "learning_rate": 2.1971282848555495e-07, "loss": 0.0, "num_input_tokens_seen": 111232552, "step": 165050 }, { "epoch": 4.03232111010676, "grad_norm": 0.0004529871221166104, "learning_rate": 2.1965949688703368e-07, "loss": 0.0, "num_input_tokens_seen": 111236200, "step": 165055 }, { "epoch": 4.032443260938607, "grad_norm": 0.0002521305868867785, "learning_rate": 2.1960617096335876e-07, "loss": 0.0, "num_input_tokens_seen": 111239464, "step": 165060 }, { "epoch": 4.032565411770454, "grad_norm": 0.00017973648209590465, "learning_rate": 2.1955285071491724e-07, "loss": 0.0, "num_input_tokens_seen": 111242600, "step": 165065 }, { "epoch": 4.032687562602302, "grad_norm": 0.0015271722804754972, "learning_rate": 2.194995361420975e-07, "loss": 0.0, "num_input_tokens_seen": 111246056, "step": 165070 }, { "epoch": 4.0328097134341485, "grad_norm": 0.0016294847009703517, "learning_rate": 2.1944622724528716e-07, "loss": 0.0, "num_input_tokens_seen": 111249320, "step": 165075 }, { "epoch": 4.032931864265995, "grad_norm": 0.0002762658696155995, "learning_rate": 2.1939292402487363e-07, "loss": 0.0609, "num_input_tokens_seen": 111252776, "step": 165080 }, { "epoch": 4.033054015097843, "grad_norm": 0.020646601915359497, "learning_rate": 2.1933962648124505e-07, "loss": 0.0, "num_input_tokens_seen": 111255912, "step": 165085 }, { "epoch": 4.03317616592969, "grad_norm": 0.00040234896005131304, "learning_rate": 2.1928633461478828e-07, "loss": 0.062, "num_input_tokens_seen": 111259240, "step": 165090 }, { "epoch": 4.033298316761537, "grad_norm": 0.0001797089062165469, "learning_rate": 2.192330484258913e-07, "loss": 0.0, "num_input_tokens_seen": 111262760, "step": 165095 }, { "epoch": 4.033420467593384, "grad_norm": 0.002640556776896119, "learning_rate": 2.1917976791494186e-07, "loss": 0.0, "num_input_tokens_seen": 111266088, "step": 165100 }, { "epoch": 4.033542618425232, "grad_norm": 0.008661020547151566, "learning_rate": 2.1912649308232688e-07, "loss": 0.0, "num_input_tokens_seen": 111269608, "step": 165105 }, { "epoch": 4.033664769257078, "grad_norm": 0.0007709195488132536, "learning_rate": 2.190732239284344e-07, "loss": 0.0, "num_input_tokens_seen": 111272488, "step": 165110 }, { "epoch": 4.033786920088926, "grad_norm": 0.0018076790729537606, "learning_rate": 2.1901996045365123e-07, "loss": 0.0, "num_input_tokens_seen": 111275944, "step": 165115 }, { "epoch": 4.033909070920773, "grad_norm": 0.00034789761411957443, "learning_rate": 2.1896670265836516e-07, "loss": 0.0, "num_input_tokens_seen": 111279336, "step": 165120 }, { "epoch": 4.03403122175262, "grad_norm": 0.0001242852013092488, "learning_rate": 2.1891345054296306e-07, "loss": 0.0, "num_input_tokens_seen": 111283048, "step": 165125 }, { "epoch": 4.034153372584467, "grad_norm": 0.00019528514530975372, "learning_rate": 2.188602041078328e-07, "loss": 0.0, "num_input_tokens_seen": 111285992, "step": 165130 }, { "epoch": 4.034275523416315, "grad_norm": 0.00017401730292476714, "learning_rate": 2.1880696335336114e-07, "loss": 0.0, "num_input_tokens_seen": 111289064, "step": 165135 }, { "epoch": 4.0343976742481615, "grad_norm": 0.00035434920573607087, "learning_rate": 2.1875372827993499e-07, "loss": 0.0, "num_input_tokens_seen": 111292712, "step": 165140 }, { "epoch": 4.034519825080009, "grad_norm": 0.06561069935560226, "learning_rate": 2.1870049888794228e-07, "loss": 0.0, "num_input_tokens_seen": 111296232, "step": 165145 }, { "epoch": 4.034641975911856, "grad_norm": 0.0016677493695169687, "learning_rate": 2.1864727517776938e-07, "loss": 0.0001, "num_input_tokens_seen": 111299432, "step": 165150 }, { "epoch": 4.0347641267437035, "grad_norm": 0.0006396759999915957, "learning_rate": 2.1859405714980394e-07, "loss": 0.0, "num_input_tokens_seen": 111302440, "step": 165155 }, { "epoch": 4.03488627757555, "grad_norm": 0.0030758215580135584, "learning_rate": 2.1854084480443237e-07, "loss": 0.0, "num_input_tokens_seen": 111305768, "step": 165160 }, { "epoch": 4.035008428407397, "grad_norm": 0.002016278449445963, "learning_rate": 2.1848763814204197e-07, "loss": 0.0, "num_input_tokens_seen": 111309032, "step": 165165 }, { "epoch": 4.035130579239245, "grad_norm": 0.0009641702054068446, "learning_rate": 2.1843443716301991e-07, "loss": 0.0, "num_input_tokens_seen": 111312424, "step": 165170 }, { "epoch": 4.035252730071091, "grad_norm": 0.00021530137746594846, "learning_rate": 2.1838124186775265e-07, "loss": 0.0, "num_input_tokens_seen": 111318184, "step": 165175 }, { "epoch": 4.035374880902939, "grad_norm": 0.0003201996732968837, "learning_rate": 2.1832805225662742e-07, "loss": 0.0, "num_input_tokens_seen": 111321320, "step": 165180 }, { "epoch": 4.035497031734786, "grad_norm": 0.002409678651019931, "learning_rate": 2.1827486833003062e-07, "loss": 0.0479, "num_input_tokens_seen": 111324840, "step": 165185 }, { "epoch": 4.035619182566633, "grad_norm": 0.00077003677142784, "learning_rate": 2.1822169008834924e-07, "loss": 0.0, "num_input_tokens_seen": 111328616, "step": 165190 }, { "epoch": 4.03574133339848, "grad_norm": 0.004224866628646851, "learning_rate": 2.181685175319702e-07, "loss": 0.0, "num_input_tokens_seen": 111331752, "step": 165195 }, { "epoch": 4.035863484230328, "grad_norm": 0.0008089193142950535, "learning_rate": 2.1811535066127983e-07, "loss": 0.0002, "num_input_tokens_seen": 111335272, "step": 165200 }, { "epoch": 4.035985635062175, "grad_norm": 0.0007582316757179797, "learning_rate": 2.180621894766651e-07, "loss": 0.0, "num_input_tokens_seen": 111339112, "step": 165205 }, { "epoch": 4.036107785894022, "grad_norm": 0.0345303900539875, "learning_rate": 2.1800903397851222e-07, "loss": 0.0, "num_input_tokens_seen": 111342312, "step": 165210 }, { "epoch": 4.036229936725869, "grad_norm": 0.016312891617417336, "learning_rate": 2.1795588416720822e-07, "loss": 0.0, "num_input_tokens_seen": 111345832, "step": 165215 }, { "epoch": 4.036352087557717, "grad_norm": 0.0017842523520812392, "learning_rate": 2.1790274004313912e-07, "loss": 0.0, "num_input_tokens_seen": 111348776, "step": 165220 }, { "epoch": 4.036474238389563, "grad_norm": 0.0002951300411950797, "learning_rate": 2.1784960160669197e-07, "loss": 0.0, "num_input_tokens_seen": 111352552, "step": 165225 }, { "epoch": 4.036596389221411, "grad_norm": 0.037422504276037216, "learning_rate": 2.1779646885825264e-07, "loss": 0.0, "num_input_tokens_seen": 111355688, "step": 165230 }, { "epoch": 4.036718540053258, "grad_norm": 0.002478367416188121, "learning_rate": 2.1774334179820797e-07, "loss": 0.0, "num_input_tokens_seen": 111359208, "step": 165235 }, { "epoch": 4.036840690885105, "grad_norm": 0.0008452267502434552, "learning_rate": 2.1769022042694385e-07, "loss": 0.0168, "num_input_tokens_seen": 111362728, "step": 165240 }, { "epoch": 4.036962841716952, "grad_norm": 0.004490249324589968, "learning_rate": 2.176371047448472e-07, "loss": 0.0, "num_input_tokens_seen": 111366568, "step": 165245 }, { "epoch": 4.037084992548799, "grad_norm": 0.0013267035828903317, "learning_rate": 2.175839947523036e-07, "loss": 0.0, "num_input_tokens_seen": 111369768, "step": 165250 }, { "epoch": 4.0372071433806465, "grad_norm": 0.007490311283618212, "learning_rate": 2.1753089044969997e-07, "loss": 0.0001, "num_input_tokens_seen": 111373288, "step": 165255 }, { "epoch": 4.037329294212493, "grad_norm": 0.030202653259038925, "learning_rate": 2.1747779183742187e-07, "loss": 0.0004, "num_input_tokens_seen": 111377128, "step": 165260 }, { "epoch": 4.037451445044341, "grad_norm": 0.002074525225907564, "learning_rate": 2.17424698915856e-07, "loss": 0.0, "num_input_tokens_seen": 111380968, "step": 165265 }, { "epoch": 4.037573595876188, "grad_norm": 0.0004991142195649445, "learning_rate": 2.1737161168538787e-07, "loss": 0.0, "num_input_tokens_seen": 111383912, "step": 165270 }, { "epoch": 4.037695746708035, "grad_norm": 0.0005876885261386633, "learning_rate": 2.1731853014640422e-07, "loss": 0.0, "num_input_tokens_seen": 111387688, "step": 165275 }, { "epoch": 4.037817897539882, "grad_norm": 0.0024651093408465385, "learning_rate": 2.1726545429929055e-07, "loss": 0.0, "num_input_tokens_seen": 111390632, "step": 165280 }, { "epoch": 4.03794004837173, "grad_norm": 0.0009485721820965409, "learning_rate": 2.1721238414443287e-07, "loss": 0.0, "num_input_tokens_seen": 111394728, "step": 165285 }, { "epoch": 4.038062199203576, "grad_norm": 0.00034085451625287533, "learning_rate": 2.1715931968221768e-07, "loss": 0.0, "num_input_tokens_seen": 111398504, "step": 165290 }, { "epoch": 4.038184350035424, "grad_norm": 0.00034384272294119, "learning_rate": 2.1710626091303008e-07, "loss": 0.0, "num_input_tokens_seen": 111401320, "step": 165295 }, { "epoch": 4.038306500867271, "grad_norm": 0.00017364125233143568, "learning_rate": 2.1705320783725667e-07, "loss": 0.0, "num_input_tokens_seen": 111404840, "step": 165300 }, { "epoch": 4.038428651699118, "grad_norm": 0.0002893655910156667, "learning_rate": 2.170001604552827e-07, "loss": 0.0, "num_input_tokens_seen": 111408168, "step": 165305 }, { "epoch": 4.038550802530965, "grad_norm": 0.0010250789346173406, "learning_rate": 2.1694711876749438e-07, "loss": 0.0, "num_input_tokens_seen": 111411688, "step": 165310 }, { "epoch": 4.038672953362813, "grad_norm": 0.00014593149535357952, "learning_rate": 2.168940827742769e-07, "loss": 0.0, "num_input_tokens_seen": 111415144, "step": 165315 }, { "epoch": 4.0387951041946595, "grad_norm": 0.0019269100157544017, "learning_rate": 2.1684105247601635e-07, "loss": 0.0, "num_input_tokens_seen": 111418664, "step": 165320 }, { "epoch": 4.038917255026507, "grad_norm": 0.0007912858854979277, "learning_rate": 2.1678802787309857e-07, "loss": 0.0, "num_input_tokens_seen": 111421608, "step": 165325 }, { "epoch": 4.039039405858354, "grad_norm": 0.0006178620969876647, "learning_rate": 2.167350089659087e-07, "loss": 0.0, "num_input_tokens_seen": 111424808, "step": 165330 }, { "epoch": 4.0391615566902015, "grad_norm": 0.021620342507958412, "learning_rate": 2.166819957548327e-07, "loss": 0.0, "num_input_tokens_seen": 111428456, "step": 165335 }, { "epoch": 4.039283707522048, "grad_norm": 0.0001561202952871099, "learning_rate": 2.1662898824025588e-07, "loss": 0.0, "num_input_tokens_seen": 111431656, "step": 165340 }, { "epoch": 4.039405858353895, "grad_norm": 0.020325763151049614, "learning_rate": 2.1657598642256358e-07, "loss": 0.0, "num_input_tokens_seen": 111434792, "step": 165345 }, { "epoch": 4.039528009185743, "grad_norm": 0.0001341603638138622, "learning_rate": 2.165229903021417e-07, "loss": 0.0, "num_input_tokens_seen": 111438120, "step": 165350 }, { "epoch": 4.039650160017589, "grad_norm": 0.00033617831650190055, "learning_rate": 2.1646999987937497e-07, "loss": 0.0, "num_input_tokens_seen": 111441448, "step": 165355 }, { "epoch": 4.039772310849437, "grad_norm": 0.000377911317627877, "learning_rate": 2.164170151546496e-07, "loss": 0.0, "num_input_tokens_seen": 111444520, "step": 165360 }, { "epoch": 4.039894461681284, "grad_norm": 0.0014009472215548158, "learning_rate": 2.1636403612835007e-07, "loss": 0.0, "num_input_tokens_seen": 111447848, "step": 165365 }, { "epoch": 4.040016612513131, "grad_norm": 0.0025961874052882195, "learning_rate": 2.1631106280086232e-07, "loss": 0.0349, "num_input_tokens_seen": 111451240, "step": 165370 }, { "epoch": 4.040138763344978, "grad_norm": 0.00035863713128492236, "learning_rate": 2.1625809517257098e-07, "loss": 0.0, "num_input_tokens_seen": 111454632, "step": 165375 }, { "epoch": 4.040260914176826, "grad_norm": 0.0002058813552139327, "learning_rate": 2.162051332438617e-07, "loss": 0.0, "num_input_tokens_seen": 111458152, "step": 165380 }, { "epoch": 4.0403830650086725, "grad_norm": 0.0014873233158141375, "learning_rate": 2.1615217701511967e-07, "loss": 0.0, "num_input_tokens_seen": 111461480, "step": 165385 }, { "epoch": 4.04050521584052, "grad_norm": 0.0005732462159357965, "learning_rate": 2.1609922648672962e-07, "loss": 0.0, "num_input_tokens_seen": 111464552, "step": 165390 }, { "epoch": 4.040627366672367, "grad_norm": 0.0001214592921314761, "learning_rate": 2.1604628165907712e-07, "loss": 0.0, "num_input_tokens_seen": 111468072, "step": 165395 }, { "epoch": 4.0407495175042145, "grad_norm": 7.036358874756843e-05, "learning_rate": 2.1599334253254665e-07, "loss": 0.0, "num_input_tokens_seen": 111471592, "step": 165400 }, { "epoch": 4.040871668336061, "grad_norm": 1.940254878718406e-05, "learning_rate": 2.1594040910752344e-07, "loss": 0.0, "num_input_tokens_seen": 111474984, "step": 165405 }, { "epoch": 4.040993819167909, "grad_norm": 0.0024557760916650295, "learning_rate": 2.1588748138439271e-07, "loss": 0.0348, "num_input_tokens_seen": 111478504, "step": 165410 }, { "epoch": 4.041115969999756, "grad_norm": 0.00025112141156569123, "learning_rate": 2.1583455936353888e-07, "loss": 0.0, "num_input_tokens_seen": 111481640, "step": 165415 }, { "epoch": 4.041238120831603, "grad_norm": 0.0030566269997507334, "learning_rate": 2.157816430453473e-07, "loss": 0.0, "num_input_tokens_seen": 111485480, "step": 165420 }, { "epoch": 4.04136027166345, "grad_norm": 9.694881009636447e-05, "learning_rate": 2.1572873243020228e-07, "loss": 0.0003, "num_input_tokens_seen": 111488616, "step": 165425 }, { "epoch": 4.041482422495297, "grad_norm": 0.00042643630877137184, "learning_rate": 2.1567582751848913e-07, "loss": 0.0, "num_input_tokens_seen": 111491624, "step": 165430 }, { "epoch": 4.041604573327144, "grad_norm": 0.0017886931309476495, "learning_rate": 2.1562292831059203e-07, "loss": 0.0, "num_input_tokens_seen": 111494952, "step": 165435 }, { "epoch": 4.041726724158991, "grad_norm": 0.00014759103942196816, "learning_rate": 2.1557003480689627e-07, "loss": 0.0, "num_input_tokens_seen": 111497896, "step": 165440 }, { "epoch": 4.041848874990839, "grad_norm": 0.0017384887905791402, "learning_rate": 2.1551714700778623e-07, "loss": 0.0, "num_input_tokens_seen": 111501032, "step": 165445 }, { "epoch": 4.0419710258226855, "grad_norm": 0.00014177450793795288, "learning_rate": 2.1546426491364622e-07, "loss": 0.0, "num_input_tokens_seen": 111504168, "step": 165450 }, { "epoch": 4.042093176654533, "grad_norm": 0.007318509742617607, "learning_rate": 2.154113885248613e-07, "loss": 0.0882, "num_input_tokens_seen": 111507688, "step": 165455 }, { "epoch": 4.04221532748638, "grad_norm": 0.0010406904621049762, "learning_rate": 2.1535851784181558e-07, "loss": 0.0, "num_input_tokens_seen": 111510760, "step": 165460 }, { "epoch": 4.0423374783182275, "grad_norm": 0.002014661906287074, "learning_rate": 2.15305652864894e-07, "loss": 0.0, "num_input_tokens_seen": 111513832, "step": 165465 }, { "epoch": 4.042459629150074, "grad_norm": 0.10928776115179062, "learning_rate": 2.1525279359448046e-07, "loss": 0.0, "num_input_tokens_seen": 111517224, "step": 165470 }, { "epoch": 4.042581779981922, "grad_norm": 0.0005027904408052564, "learning_rate": 2.1519994003095976e-07, "loss": 0.0, "num_input_tokens_seen": 111520680, "step": 165475 }, { "epoch": 4.042703930813769, "grad_norm": 0.0004568375297822058, "learning_rate": 2.1514709217471638e-07, "loss": 0.0, "num_input_tokens_seen": 111523688, "step": 165480 }, { "epoch": 4.042826081645616, "grad_norm": 0.016826355829834938, "learning_rate": 2.1509425002613424e-07, "loss": 0.0, "num_input_tokens_seen": 111526824, "step": 165485 }, { "epoch": 4.042948232477463, "grad_norm": 6.418684642994776e-05, "learning_rate": 2.1504141358559812e-07, "loss": 0.0, "num_input_tokens_seen": 111530216, "step": 165490 }, { "epoch": 4.043070383309311, "grad_norm": 0.0003524493076838553, "learning_rate": 2.1498858285349164e-07, "loss": 0.0, "num_input_tokens_seen": 111533224, "step": 165495 }, { "epoch": 4.043192534141157, "grad_norm": 0.00021575384016614407, "learning_rate": 2.1493575783019934e-07, "loss": 0.0, "num_input_tokens_seen": 111536872, "step": 165500 }, { "epoch": 4.043314684973005, "grad_norm": 0.0024450430646538734, "learning_rate": 2.148829385161056e-07, "loss": 0.0, "num_input_tokens_seen": 111540072, "step": 165505 }, { "epoch": 4.043436835804852, "grad_norm": 4.44727556896396e-05, "learning_rate": 2.1483012491159404e-07, "loss": 0.0, "num_input_tokens_seen": 111543656, "step": 165510 }, { "epoch": 4.0435589866366985, "grad_norm": 0.001662745140492916, "learning_rate": 2.1477731701704927e-07, "loss": 0.0, "num_input_tokens_seen": 111547560, "step": 165515 }, { "epoch": 4.043681137468546, "grad_norm": 0.0012131177354604006, "learning_rate": 2.147245148328548e-07, "loss": 0.0, "num_input_tokens_seen": 111550824, "step": 165520 }, { "epoch": 4.043803288300393, "grad_norm": 0.005215724930167198, "learning_rate": 2.1467171835939525e-07, "loss": 0.0, "num_input_tokens_seen": 111554344, "step": 165525 }, { "epoch": 4.043925439132241, "grad_norm": 0.003541510319337249, "learning_rate": 2.146189275970538e-07, "loss": 0.0, "num_input_tokens_seen": 111557608, "step": 165530 }, { "epoch": 4.044047589964087, "grad_norm": 0.0022107716649770737, "learning_rate": 2.1456614254621497e-07, "loss": 0.0, "num_input_tokens_seen": 111561640, "step": 165535 }, { "epoch": 4.044169740795935, "grad_norm": 0.0003690843877848238, "learning_rate": 2.1451336320726222e-07, "loss": 0.0, "num_input_tokens_seen": 111564776, "step": 165540 }, { "epoch": 4.044291891627782, "grad_norm": 0.0020510528702288866, "learning_rate": 2.1446058958057978e-07, "loss": 0.0001, "num_input_tokens_seen": 111567912, "step": 165545 }, { "epoch": 4.044414042459629, "grad_norm": 0.03945672884583473, "learning_rate": 2.1440782166655101e-07, "loss": 0.0001, "num_input_tokens_seen": 111571176, "step": 165550 }, { "epoch": 4.044536193291476, "grad_norm": 0.0011049823369830847, "learning_rate": 2.1435505946556008e-07, "loss": 0.0, "num_input_tokens_seen": 111574440, "step": 165555 }, { "epoch": 4.044658344123324, "grad_norm": 0.003834398230537772, "learning_rate": 2.1430230297799024e-07, "loss": 0.0, "num_input_tokens_seen": 111578088, "step": 165560 }, { "epoch": 4.0447804949551704, "grad_norm": 0.00021740724332630634, "learning_rate": 2.142495522042257e-07, "loss": 0.0, "num_input_tokens_seen": 111581544, "step": 165565 }, { "epoch": 4.044902645787018, "grad_norm": 0.00013685625162906945, "learning_rate": 2.141968071446494e-07, "loss": 0.0, "num_input_tokens_seen": 111585128, "step": 165570 }, { "epoch": 4.045024796618865, "grad_norm": 0.0021345813293009996, "learning_rate": 2.1414406779964555e-07, "loss": 0.0, "num_input_tokens_seen": 111588712, "step": 165575 }, { "epoch": 4.0451469474507125, "grad_norm": 0.10980292409658432, "learning_rate": 2.1409133416959712e-07, "loss": 0.0, "num_input_tokens_seen": 111592744, "step": 165580 }, { "epoch": 4.045269098282559, "grad_norm": 0.00013364851474761963, "learning_rate": 2.1403860625488823e-07, "loss": 0.0, "num_input_tokens_seen": 111596456, "step": 165585 }, { "epoch": 4.045391249114407, "grad_norm": 0.002575997728854418, "learning_rate": 2.1398588405590168e-07, "loss": 0.0, "num_input_tokens_seen": 111599720, "step": 165590 }, { "epoch": 4.045513399946254, "grad_norm": 0.0008017385262064636, "learning_rate": 2.1393316757302116e-07, "loss": 0.0, "num_input_tokens_seen": 111603496, "step": 165595 }, { "epoch": 4.045635550778101, "grad_norm": 5.82604952796828e-05, "learning_rate": 2.1388045680663047e-07, "loss": 0.0, "num_input_tokens_seen": 111606824, "step": 165600 }, { "epoch": 4.045757701609948, "grad_norm": 0.0010002893395721912, "learning_rate": 2.1382775175711222e-07, "loss": 0.0, "num_input_tokens_seen": 111612008, "step": 165605 }, { "epoch": 4.045879852441795, "grad_norm": 0.0016825840575620532, "learning_rate": 2.1377505242485018e-07, "loss": 0.0, "num_input_tokens_seen": 111615016, "step": 165610 }, { "epoch": 4.046002003273642, "grad_norm": 0.0027573788538575172, "learning_rate": 2.1372235881022726e-07, "loss": 0.0, "num_input_tokens_seen": 111618600, "step": 165615 }, { "epoch": 4.046124154105489, "grad_norm": 0.0046699317172169685, "learning_rate": 2.1366967091362708e-07, "loss": 0.0, "num_input_tokens_seen": 111622248, "step": 165620 }, { "epoch": 4.046246304937337, "grad_norm": 0.00024322244280483574, "learning_rate": 2.136169887354322e-07, "loss": 0.0, "num_input_tokens_seen": 111625576, "step": 165625 }, { "epoch": 4.0463684557691835, "grad_norm": 0.0006262024398893118, "learning_rate": 2.1356431227602624e-07, "loss": 0.0, "num_input_tokens_seen": 111629608, "step": 165630 }, { "epoch": 4.046490606601031, "grad_norm": 0.0033578008878976107, "learning_rate": 2.1351164153579226e-07, "loss": 0.0, "num_input_tokens_seen": 111633000, "step": 165635 }, { "epoch": 4.046612757432878, "grad_norm": 0.0009691547602415085, "learning_rate": 2.1345897651511292e-07, "loss": 0.0, "num_input_tokens_seen": 111636072, "step": 165640 }, { "epoch": 4.0467349082647255, "grad_norm": 0.0001765676133800298, "learning_rate": 2.1340631721437174e-07, "loss": 0.0, "num_input_tokens_seen": 111639592, "step": 165645 }, { "epoch": 4.046857059096572, "grad_norm": 9.644962847232819e-05, "learning_rate": 2.1335366363395147e-07, "loss": 0.0, "num_input_tokens_seen": 111643432, "step": 165650 }, { "epoch": 4.04697920992842, "grad_norm": 0.004384058527648449, "learning_rate": 2.1330101577423453e-07, "loss": 0.0, "num_input_tokens_seen": 111646696, "step": 165655 }, { "epoch": 4.047101360760267, "grad_norm": 0.0009935772977769375, "learning_rate": 2.1324837363560456e-07, "loss": 0.0, "num_input_tokens_seen": 111650152, "step": 165660 }, { "epoch": 4.047223511592114, "grad_norm": 0.0007049691630527377, "learning_rate": 2.1319573721844376e-07, "loss": 0.0, "num_input_tokens_seen": 111653416, "step": 165665 }, { "epoch": 4.047345662423961, "grad_norm": 0.003179597668349743, "learning_rate": 2.131431065231355e-07, "loss": 0.0, "num_input_tokens_seen": 111657064, "step": 165670 }, { "epoch": 4.047467813255809, "grad_norm": 0.0008164419559761882, "learning_rate": 2.1309048155006183e-07, "loss": 0.0, "num_input_tokens_seen": 111660200, "step": 165675 }, { "epoch": 4.047589964087655, "grad_norm": 4.1886734834406525e-05, "learning_rate": 2.1303786229960618e-07, "loss": 0.0, "num_input_tokens_seen": 111663592, "step": 165680 }, { "epoch": 4.047712114919503, "grad_norm": 0.00021079998987261206, "learning_rate": 2.1298524877215052e-07, "loss": 0.0, "num_input_tokens_seen": 111666664, "step": 165685 }, { "epoch": 4.04783426575135, "grad_norm": 7.617528171977028e-05, "learning_rate": 2.1293264096807783e-07, "loss": 0.0, "num_input_tokens_seen": 111669800, "step": 165690 }, { "epoch": 4.0479564165831965, "grad_norm": 0.0013283933512866497, "learning_rate": 2.1288003888777096e-07, "loss": 0.0004, "num_input_tokens_seen": 111672808, "step": 165695 }, { "epoch": 4.048078567415044, "grad_norm": 0.0020888675935566425, "learning_rate": 2.128274425316119e-07, "loss": 0.0, "num_input_tokens_seen": 111676520, "step": 165700 }, { "epoch": 4.048200718246891, "grad_norm": 4.661543425754644e-05, "learning_rate": 2.1277485189998366e-07, "loss": 0.0, "num_input_tokens_seen": 111679848, "step": 165705 }, { "epoch": 4.0483228690787385, "grad_norm": 0.0011659510200843215, "learning_rate": 2.127222669932681e-07, "loss": 0.0, "num_input_tokens_seen": 111683304, "step": 165710 }, { "epoch": 4.048445019910585, "grad_norm": 0.0026223217137157917, "learning_rate": 2.1266968781184814e-07, "loss": 0.0, "num_input_tokens_seen": 111686504, "step": 165715 }, { "epoch": 4.048567170742433, "grad_norm": 0.0005963681032881141, "learning_rate": 2.1261711435610607e-07, "loss": 0.0, "num_input_tokens_seen": 111690088, "step": 165720 }, { "epoch": 4.04868932157428, "grad_norm": 0.00229801912792027, "learning_rate": 2.1256454662642398e-07, "loss": 0.0, "num_input_tokens_seen": 111693544, "step": 165725 }, { "epoch": 4.048811472406127, "grad_norm": 3.6713732697535306e-05, "learning_rate": 2.1251198462318444e-07, "loss": 0.0, "num_input_tokens_seen": 111696680, "step": 165730 }, { "epoch": 4.048933623237974, "grad_norm": 0.00027011564816348255, "learning_rate": 2.1245942834676944e-07, "loss": 0.0, "num_input_tokens_seen": 111700328, "step": 165735 }, { "epoch": 4.049055774069822, "grad_norm": 2.1074765754747204e-05, "learning_rate": 2.124068777975615e-07, "loss": 0.0, "num_input_tokens_seen": 111703848, "step": 165740 }, { "epoch": 4.049177924901668, "grad_norm": 0.002213638974353671, "learning_rate": 2.123543329759423e-07, "loss": 0.0, "num_input_tokens_seen": 111707240, "step": 165745 }, { "epoch": 4.049300075733516, "grad_norm": 0.0069284215569496155, "learning_rate": 2.123017938822945e-07, "loss": 0.0001, "num_input_tokens_seen": 111710056, "step": 165750 }, { "epoch": 4.049422226565363, "grad_norm": 1.297972266911529e-05, "learning_rate": 2.1224926051699987e-07, "loss": 0.0001, "num_input_tokens_seen": 111713768, "step": 165755 }, { "epoch": 4.04954437739721, "grad_norm": 0.0045747156254947186, "learning_rate": 2.121967328804404e-07, "loss": 0.0, "num_input_tokens_seen": 111717096, "step": 165760 }, { "epoch": 4.049666528229057, "grad_norm": 0.0001304529287153855, "learning_rate": 2.1214421097299828e-07, "loss": 0.0, "num_input_tokens_seen": 111720232, "step": 165765 }, { "epoch": 4.049788679060905, "grad_norm": 9.356778900837526e-05, "learning_rate": 2.1209169479505519e-07, "loss": 0.0001, "num_input_tokens_seen": 111723560, "step": 165770 }, { "epoch": 4.0499108298927515, "grad_norm": 0.0004932557349093258, "learning_rate": 2.1203918434699342e-07, "loss": 0.0, "num_input_tokens_seen": 111727336, "step": 165775 }, { "epoch": 4.050032980724599, "grad_norm": 0.0008772346773184836, "learning_rate": 2.1198667962919437e-07, "loss": 0.0, "num_input_tokens_seen": 111730536, "step": 165780 }, { "epoch": 4.050155131556446, "grad_norm": 0.00020624265016522259, "learning_rate": 2.1193418064204016e-07, "loss": 0.0, "num_input_tokens_seen": 111734440, "step": 165785 }, { "epoch": 4.050277282388293, "grad_norm": 0.0005781511426903307, "learning_rate": 2.1188168738591284e-07, "loss": 0.0, "num_input_tokens_seen": 111737832, "step": 165790 }, { "epoch": 4.05039943322014, "grad_norm": 4.9924346967600286e-05, "learning_rate": 2.1182919986119364e-07, "loss": 0.0591, "num_input_tokens_seen": 111741160, "step": 165795 }, { "epoch": 4.050521584051987, "grad_norm": 0.008669133298099041, "learning_rate": 2.117767180682647e-07, "loss": 0.0, "num_input_tokens_seen": 111744488, "step": 165800 }, { "epoch": 4.050643734883835, "grad_norm": 8.618797437520698e-05, "learning_rate": 2.1172424200750715e-07, "loss": 0.0, "num_input_tokens_seen": 111747880, "step": 165805 }, { "epoch": 4.050765885715681, "grad_norm": 5.16844738740474e-05, "learning_rate": 2.1167177167930307e-07, "loss": 0.0, "num_input_tokens_seen": 111751016, "step": 165810 }, { "epoch": 4.050888036547529, "grad_norm": 0.00011682142212521285, "learning_rate": 2.1161930708403407e-07, "loss": 0.0001, "num_input_tokens_seen": 111754024, "step": 165815 }, { "epoch": 4.051010187379376, "grad_norm": 0.0013719471171498299, "learning_rate": 2.1156684822208127e-07, "loss": 0.0, "num_input_tokens_seen": 111758184, "step": 165820 }, { "epoch": 4.051132338211223, "grad_norm": 0.0009899984579533339, "learning_rate": 2.1151439509382674e-07, "loss": 0.0, "num_input_tokens_seen": 111761384, "step": 165825 }, { "epoch": 4.05125448904307, "grad_norm": 0.00012213773152325302, "learning_rate": 2.1146194769965132e-07, "loss": 0.0, "num_input_tokens_seen": 111764392, "step": 165830 }, { "epoch": 4.051376639874918, "grad_norm": 0.006048544310033321, "learning_rate": 2.114095060399369e-07, "loss": 0.0, "num_input_tokens_seen": 111767208, "step": 165835 }, { "epoch": 4.0514987907067646, "grad_norm": 0.00441456213593483, "learning_rate": 2.1135707011506442e-07, "loss": 0.0, "num_input_tokens_seen": 111770088, "step": 165840 }, { "epoch": 4.051620941538612, "grad_norm": 0.008306225761771202, "learning_rate": 2.113046399254157e-07, "loss": 0.0, "num_input_tokens_seen": 111773416, "step": 165845 }, { "epoch": 4.051743092370459, "grad_norm": 0.00023795659944880754, "learning_rate": 2.112522154713715e-07, "loss": 0.0, "num_input_tokens_seen": 111776488, "step": 165850 }, { "epoch": 4.051865243202307, "grad_norm": 3.328266757307574e-05, "learning_rate": 2.111997967533137e-07, "loss": 0.0, "num_input_tokens_seen": 111779624, "step": 165855 }, { "epoch": 4.051987394034153, "grad_norm": 0.00015622578212060034, "learning_rate": 2.1114738377162279e-07, "loss": 0.0, "num_input_tokens_seen": 111783208, "step": 165860 }, { "epoch": 4.052109544866001, "grad_norm": 0.0018481943989172578, "learning_rate": 2.1109497652668052e-07, "loss": 0.0, "num_input_tokens_seen": 111786728, "step": 165865 }, { "epoch": 4.052231695697848, "grad_norm": 0.0011088012252002954, "learning_rate": 2.110425750188679e-07, "loss": 0.0, "num_input_tokens_seen": 111789864, "step": 165870 }, { "epoch": 4.052353846529694, "grad_norm": 0.0009757218649610877, "learning_rate": 2.1099017924856544e-07, "loss": 0.0, "num_input_tokens_seen": 111793256, "step": 165875 }, { "epoch": 4.052475997361542, "grad_norm": 3.3502219594083726e-05, "learning_rate": 2.109377892161547e-07, "loss": 0.0, "num_input_tokens_seen": 111796840, "step": 165880 }, { "epoch": 4.052598148193389, "grad_norm": 0.02216157130897045, "learning_rate": 2.108854049220169e-07, "loss": 0.0, "num_input_tokens_seen": 111800104, "step": 165885 }, { "epoch": 4.0527202990252365, "grad_norm": 0.001708241063170135, "learning_rate": 2.1083302636653234e-07, "loss": 0.0, "num_input_tokens_seen": 111803112, "step": 165890 }, { "epoch": 4.052842449857083, "grad_norm": 0.0003455929982010275, "learning_rate": 2.1078065355008257e-07, "loss": 0.0, "num_input_tokens_seen": 111806888, "step": 165895 }, { "epoch": 4.052964600688931, "grad_norm": 0.00027073745150119066, "learning_rate": 2.1072828647304795e-07, "loss": 0.0, "num_input_tokens_seen": 111809896, "step": 165900 }, { "epoch": 4.053086751520778, "grad_norm": 6.761745316907763e-05, "learning_rate": 2.1067592513580944e-07, "loss": 0.0001, "num_input_tokens_seen": 111813352, "step": 165905 }, { "epoch": 4.053208902352625, "grad_norm": 0.00017820534412749112, "learning_rate": 2.1062356953874815e-07, "loss": 0.0, "num_input_tokens_seen": 111816360, "step": 165910 }, { "epoch": 4.053331053184472, "grad_norm": 0.0018837273819372058, "learning_rate": 2.1057121968224445e-07, "loss": 0.0, "num_input_tokens_seen": 111819688, "step": 165915 }, { "epoch": 4.05345320401632, "grad_norm": 6.063660293875728e-06, "learning_rate": 2.1051887556667937e-07, "loss": 0.0224, "num_input_tokens_seen": 111823336, "step": 165920 }, { "epoch": 4.053575354848166, "grad_norm": 0.002080413280054927, "learning_rate": 2.10466537192433e-07, "loss": 0.0, "num_input_tokens_seen": 111826344, "step": 165925 }, { "epoch": 4.053697505680014, "grad_norm": 0.0037358372937887907, "learning_rate": 2.1041420455988668e-07, "loss": 0.0, "num_input_tokens_seen": 111829992, "step": 165930 }, { "epoch": 4.053819656511861, "grad_norm": 0.00466674380004406, "learning_rate": 2.1036187766942037e-07, "loss": 0.0, "num_input_tokens_seen": 111833512, "step": 165935 }, { "epoch": 4.053941807343708, "grad_norm": 0.0008783154771663249, "learning_rate": 2.103095565214149e-07, "loss": 0.0, "num_input_tokens_seen": 111836904, "step": 165940 }, { "epoch": 4.054063958175555, "grad_norm": 0.0030196059960871935, "learning_rate": 2.1025724111625099e-07, "loss": 0.0, "num_input_tokens_seen": 111840616, "step": 165945 }, { "epoch": 4.054186109007403, "grad_norm": 0.00019221876573283225, "learning_rate": 2.1020493145430851e-07, "loss": 0.0, "num_input_tokens_seen": 111843752, "step": 165950 }, { "epoch": 4.0543082598392495, "grad_norm": 0.0013134641340002418, "learning_rate": 2.1015262753596853e-07, "loss": 0.0, "num_input_tokens_seen": 111847720, "step": 165955 }, { "epoch": 4.054430410671097, "grad_norm": 0.0001785226777428761, "learning_rate": 2.1010032936161103e-07, "loss": 0.0007, "num_input_tokens_seen": 111851176, "step": 165960 }, { "epoch": 4.054552561502944, "grad_norm": 0.001006326638162136, "learning_rate": 2.100480369316162e-07, "loss": 0.0, "num_input_tokens_seen": 111854376, "step": 165965 }, { "epoch": 4.054674712334791, "grad_norm": 0.004637191072106361, "learning_rate": 2.0999575024636474e-07, "loss": 0.0, "num_input_tokens_seen": 111857576, "step": 165970 }, { "epoch": 4.054796863166638, "grad_norm": 0.00010858668974833563, "learning_rate": 2.0994346930623642e-07, "loss": 0.0, "num_input_tokens_seen": 111861224, "step": 165975 }, { "epoch": 4.054919013998485, "grad_norm": 7.806902431184426e-05, "learning_rate": 2.0989119411161194e-07, "loss": 0.0, "num_input_tokens_seen": 111864424, "step": 165980 }, { "epoch": 4.055041164830333, "grad_norm": 0.0072829751297831535, "learning_rate": 2.09838924662871e-07, "loss": 0.0, "num_input_tokens_seen": 111867496, "step": 165985 }, { "epoch": 4.055163315662179, "grad_norm": 0.004414117429405451, "learning_rate": 2.097866609603941e-07, "loss": 0.0, "num_input_tokens_seen": 111870568, "step": 165990 }, { "epoch": 4.055285466494027, "grad_norm": 0.004452398046851158, "learning_rate": 2.097344030045609e-07, "loss": 0.0, "num_input_tokens_seen": 111873960, "step": 165995 }, { "epoch": 4.055407617325874, "grad_norm": 0.00011855754564749077, "learning_rate": 2.096821507957517e-07, "loss": 0.0, "num_input_tokens_seen": 111877416, "step": 166000 }, { "epoch": 4.055529768157721, "grad_norm": 0.0006512808613479137, "learning_rate": 2.096299043343468e-07, "loss": 0.0, "num_input_tokens_seen": 111880360, "step": 166005 }, { "epoch": 4.055651918989568, "grad_norm": 0.001110451645217836, "learning_rate": 2.0957766362072548e-07, "loss": 0.0, "num_input_tokens_seen": 111883432, "step": 166010 }, { "epoch": 4.055774069821416, "grad_norm": 0.0010831262916326523, "learning_rate": 2.0952542865526824e-07, "loss": 0.0, "num_input_tokens_seen": 111886632, "step": 166015 }, { "epoch": 4.0558962206532625, "grad_norm": 0.0010897867614403367, "learning_rate": 2.094731994383544e-07, "loss": 0.0, "num_input_tokens_seen": 111889832, "step": 166020 }, { "epoch": 4.05601837148511, "grad_norm": 0.004145484417676926, "learning_rate": 2.0942097597036446e-07, "loss": 0.0, "num_input_tokens_seen": 111893160, "step": 166025 }, { "epoch": 4.056140522316957, "grad_norm": 3.431236109463498e-05, "learning_rate": 2.0936875825167744e-07, "loss": 0.0005, "num_input_tokens_seen": 111896936, "step": 166030 }, { "epoch": 4.0562626731488045, "grad_norm": 0.0006043767789378762, "learning_rate": 2.093165462826736e-07, "loss": 0.0, "num_input_tokens_seen": 111900712, "step": 166035 }, { "epoch": 4.056384823980651, "grad_norm": 0.002768630860373378, "learning_rate": 2.0926434006373261e-07, "loss": 0.0, "num_input_tokens_seen": 111903976, "step": 166040 }, { "epoch": 4.056506974812499, "grad_norm": 0.0006307873409241438, "learning_rate": 2.0921213959523388e-07, "loss": 0.0, "num_input_tokens_seen": 111907176, "step": 166045 }, { "epoch": 4.056629125644346, "grad_norm": 0.0002128913183696568, "learning_rate": 2.091599448775574e-07, "loss": 0.0, "num_input_tokens_seen": 111910440, "step": 166050 }, { "epoch": 4.056751276476192, "grad_norm": 0.001996406354010105, "learning_rate": 2.091077559110822e-07, "loss": 0.0246, "num_input_tokens_seen": 111913512, "step": 166055 }, { "epoch": 4.05687342730804, "grad_norm": 0.001469214097596705, "learning_rate": 2.0905557269618845e-07, "loss": 0.0, "num_input_tokens_seen": 111916840, "step": 166060 }, { "epoch": 4.056995578139887, "grad_norm": 0.0005221847095526755, "learning_rate": 2.0900339523325528e-07, "loss": 0.0, "num_input_tokens_seen": 111920296, "step": 166065 }, { "epoch": 4.057117728971734, "grad_norm": 0.0004611036856658757, "learning_rate": 2.0895122352266194e-07, "loss": 0.0, "num_input_tokens_seen": 111923624, "step": 166070 }, { "epoch": 4.057239879803581, "grad_norm": 0.004240771755576134, "learning_rate": 2.0889905756478833e-07, "loss": 0.0, "num_input_tokens_seen": 111926888, "step": 166075 }, { "epoch": 4.057362030635429, "grad_norm": 0.0005182889872230589, "learning_rate": 2.0884689736001316e-07, "loss": 0.0, "num_input_tokens_seen": 111930152, "step": 166080 }, { "epoch": 4.0574841814672755, "grad_norm": 0.00043798956903629005, "learning_rate": 2.0879474290871656e-07, "loss": 0.0, "num_input_tokens_seen": 111933544, "step": 166085 }, { "epoch": 4.057606332299123, "grad_norm": 0.0013579648220911622, "learning_rate": 2.0874259421127706e-07, "loss": 0.0, "num_input_tokens_seen": 111936616, "step": 166090 }, { "epoch": 4.05772848313097, "grad_norm": 0.0017074431525543332, "learning_rate": 2.0869045126807427e-07, "loss": 0.0, "num_input_tokens_seen": 111939496, "step": 166095 }, { "epoch": 4.0578506339628175, "grad_norm": 5.534076990443282e-05, "learning_rate": 2.0863831407948763e-07, "loss": 0.0, "num_input_tokens_seen": 111942888, "step": 166100 }, { "epoch": 4.057972784794664, "grad_norm": 0.0015134341083467007, "learning_rate": 2.0858618264589577e-07, "loss": 0.0, "num_input_tokens_seen": 111946344, "step": 166105 }, { "epoch": 4.058094935626512, "grad_norm": 38.33672332763672, "learning_rate": 2.0853405696767823e-07, "loss": 0.0293, "num_input_tokens_seen": 111949288, "step": 166110 }, { "epoch": 4.058217086458359, "grad_norm": 0.008847609162330627, "learning_rate": 2.0848193704521378e-07, "loss": 0.0, "num_input_tokens_seen": 111952616, "step": 166115 }, { "epoch": 4.058339237290206, "grad_norm": 5.5585911468369886e-05, "learning_rate": 2.0842982287888145e-07, "loss": 0.0, "num_input_tokens_seen": 111956328, "step": 166120 }, { "epoch": 4.058461388122053, "grad_norm": 0.0008487804443575442, "learning_rate": 2.0837771446906073e-07, "loss": 0.0, "num_input_tokens_seen": 111959400, "step": 166125 }, { "epoch": 4.058583538953901, "grad_norm": 0.00027049699565395713, "learning_rate": 2.0832561181612985e-07, "loss": 0.0, "num_input_tokens_seen": 111962856, "step": 166130 }, { "epoch": 4.058705689785747, "grad_norm": 0.002194001106545329, "learning_rate": 2.082735149204683e-07, "loss": 0.0, "num_input_tokens_seen": 111966184, "step": 166135 }, { "epoch": 4.058827840617594, "grad_norm": 0.0029670060612261295, "learning_rate": 2.0822142378245444e-07, "loss": 0.0, "num_input_tokens_seen": 111969448, "step": 166140 }, { "epoch": 4.058949991449442, "grad_norm": 0.0005519984406419098, "learning_rate": 2.0816933840246776e-07, "loss": 0.0418, "num_input_tokens_seen": 111972904, "step": 166145 }, { "epoch": 4.0590721422812885, "grad_norm": 0.0005892548360861838, "learning_rate": 2.0811725878088615e-07, "loss": 0.0, "num_input_tokens_seen": 111975976, "step": 166150 }, { "epoch": 4.059194293113136, "grad_norm": 0.0005409923614934087, "learning_rate": 2.0806518491808923e-07, "loss": 0.0004, "num_input_tokens_seen": 111979624, "step": 166155 }, { "epoch": 4.059316443944983, "grad_norm": 0.011323503218591213, "learning_rate": 2.08013116814455e-07, "loss": 0.0, "num_input_tokens_seen": 111982952, "step": 166160 }, { "epoch": 4.0594385947768306, "grad_norm": 0.03984846919775009, "learning_rate": 2.079610544703626e-07, "loss": 0.0, "num_input_tokens_seen": 111986088, "step": 166165 }, { "epoch": 4.059560745608677, "grad_norm": 0.01349696982651949, "learning_rate": 2.0790899788619033e-07, "loss": 0.0, "num_input_tokens_seen": 111989416, "step": 166170 }, { "epoch": 4.059682896440525, "grad_norm": 0.0021861952263861895, "learning_rate": 2.0785694706231693e-07, "loss": 0.0, "num_input_tokens_seen": 111992360, "step": 166175 }, { "epoch": 4.059805047272372, "grad_norm": 6.701894744765013e-05, "learning_rate": 2.0780490199912103e-07, "loss": 0.0, "num_input_tokens_seen": 111996072, "step": 166180 }, { "epoch": 4.059927198104219, "grad_norm": 0.014640755020081997, "learning_rate": 2.0775286269698066e-07, "loss": 0.0, "num_input_tokens_seen": 111999272, "step": 166185 }, { "epoch": 4.060049348936066, "grad_norm": 0.00035089225275442004, "learning_rate": 2.077008291562745e-07, "loss": 0.0, "num_input_tokens_seen": 112002216, "step": 166190 }, { "epoch": 4.060171499767914, "grad_norm": 4.5431013859342784e-05, "learning_rate": 2.076488013773814e-07, "loss": 0.0, "num_input_tokens_seen": 112005288, "step": 166195 }, { "epoch": 4.06029365059976, "grad_norm": 4.9018626668839715e-06, "learning_rate": 2.0759677936067899e-07, "loss": 0.0, "num_input_tokens_seen": 112008808, "step": 166200 }, { "epoch": 4.060415801431608, "grad_norm": 0.0014554978115484118, "learning_rate": 2.0754476310654611e-07, "loss": 0.0, "num_input_tokens_seen": 112011880, "step": 166205 }, { "epoch": 4.060537952263455, "grad_norm": 0.0021777262445539236, "learning_rate": 2.074927526153607e-07, "loss": 0.0, "num_input_tokens_seen": 112015336, "step": 166210 }, { "epoch": 4.0606601030953025, "grad_norm": 0.002060574246570468, "learning_rate": 2.074407478875012e-07, "loss": 0.0, "num_input_tokens_seen": 112018536, "step": 166215 }, { "epoch": 4.060782253927149, "grad_norm": 0.0008180902805179358, "learning_rate": 2.073887489233459e-07, "loss": 0.0, "num_input_tokens_seen": 112021480, "step": 166220 }, { "epoch": 4.060904404758997, "grad_norm": 0.00047382028424181044, "learning_rate": 2.0733675572327258e-07, "loss": 0.0, "num_input_tokens_seen": 112024488, "step": 166225 }, { "epoch": 4.061026555590844, "grad_norm": 0.004084159154444933, "learning_rate": 2.0728476828765996e-07, "loss": 0.0, "num_input_tokens_seen": 112027688, "step": 166230 }, { "epoch": 4.06114870642269, "grad_norm": 0.0006773903151042759, "learning_rate": 2.0723278661688526e-07, "loss": 0.0475, "num_input_tokens_seen": 112031272, "step": 166235 }, { "epoch": 4.061270857254538, "grad_norm": 0.0005431032041087747, "learning_rate": 2.0718081071132732e-07, "loss": 0.0, "num_input_tokens_seen": 112034600, "step": 166240 }, { "epoch": 4.061393008086385, "grad_norm": 5.953190702712163e-05, "learning_rate": 2.0712884057136348e-07, "loss": 0.0, "num_input_tokens_seen": 112037672, "step": 166245 }, { "epoch": 4.061515158918232, "grad_norm": 0.00018572367844171822, "learning_rate": 2.07076876197372e-07, "loss": 0.0, "num_input_tokens_seen": 112041192, "step": 166250 }, { "epoch": 4.061637309750079, "grad_norm": 0.0017650712979957461, "learning_rate": 2.0702491758973105e-07, "loss": 0.0, "num_input_tokens_seen": 112044584, "step": 166255 }, { "epoch": 4.061759460581927, "grad_norm": 0.00657630106434226, "learning_rate": 2.0697296474881787e-07, "loss": 0.0, "num_input_tokens_seen": 112047784, "step": 166260 }, { "epoch": 4.0618816114137735, "grad_norm": 0.00039903729339130223, "learning_rate": 2.069210176750108e-07, "loss": 0.0, "num_input_tokens_seen": 112051112, "step": 166265 }, { "epoch": 4.062003762245621, "grad_norm": 0.01160856056958437, "learning_rate": 2.0686907636868746e-07, "loss": 0.0, "num_input_tokens_seen": 112054184, "step": 166270 }, { "epoch": 4.062125913077468, "grad_norm": 0.0013919977936893702, "learning_rate": 2.0681714083022527e-07, "loss": 0.0, "num_input_tokens_seen": 112057448, "step": 166275 }, { "epoch": 4.0622480639093155, "grad_norm": 0.0015453466912731528, "learning_rate": 2.0676521106000245e-07, "loss": 0.0, "num_input_tokens_seen": 112060520, "step": 166280 }, { "epoch": 4.062370214741162, "grad_norm": 0.0008798211347311735, "learning_rate": 2.0671328705839608e-07, "loss": 0.0, "num_input_tokens_seen": 112063848, "step": 166285 }, { "epoch": 4.06249236557301, "grad_norm": 0.005919753108173609, "learning_rate": 2.066613688257842e-07, "loss": 0.0, "num_input_tokens_seen": 112067240, "step": 166290 }, { "epoch": 4.062614516404857, "grad_norm": 0.000499585410580039, "learning_rate": 2.066094563625441e-07, "loss": 0.0, "num_input_tokens_seen": 112070248, "step": 166295 }, { "epoch": 4.062736667236704, "grad_norm": 0.0014772225404158235, "learning_rate": 2.065575496690537e-07, "loss": 0.0, "num_input_tokens_seen": 112073832, "step": 166300 }, { "epoch": 4.062858818068551, "grad_norm": 9.564452193444595e-05, "learning_rate": 2.0650564874568988e-07, "loss": 0.0, "num_input_tokens_seen": 112077096, "step": 166305 }, { "epoch": 4.062980968900399, "grad_norm": 6.883578316774219e-05, "learning_rate": 2.0645375359283045e-07, "loss": 0.0, "num_input_tokens_seen": 112080296, "step": 166310 }, { "epoch": 4.063103119732245, "grad_norm": 0.037576161324977875, "learning_rate": 2.0640186421085303e-07, "loss": 0.0, "num_input_tokens_seen": 112083880, "step": 166315 }, { "epoch": 4.063225270564092, "grad_norm": 0.0019669902976602316, "learning_rate": 2.063499806001344e-07, "loss": 0.0, "num_input_tokens_seen": 112086824, "step": 166320 }, { "epoch": 4.06334742139594, "grad_norm": 0.00033250940032303333, "learning_rate": 2.0629810276105252e-07, "loss": 0.0, "num_input_tokens_seen": 112090024, "step": 166325 }, { "epoch": 4.0634695722277865, "grad_norm": 0.002947803121060133, "learning_rate": 2.0624623069398407e-07, "loss": 0.0, "num_input_tokens_seen": 112093416, "step": 166330 }, { "epoch": 4.063591723059634, "grad_norm": 0.0019779824651777744, "learning_rate": 2.061943643993067e-07, "loss": 0.0, "num_input_tokens_seen": 112097000, "step": 166335 }, { "epoch": 4.063713873891481, "grad_norm": 0.004801144357770681, "learning_rate": 2.061425038773972e-07, "loss": 0.0, "num_input_tokens_seen": 112100456, "step": 166340 }, { "epoch": 4.0638360247233285, "grad_norm": 0.002539390465244651, "learning_rate": 2.0609064912863284e-07, "loss": 0.0, "num_input_tokens_seen": 112103656, "step": 166345 }, { "epoch": 4.063958175555175, "grad_norm": 0.0009300903766416013, "learning_rate": 2.0603880015339115e-07, "loss": 0.0, "num_input_tokens_seen": 112107048, "step": 166350 }, { "epoch": 4.064080326387023, "grad_norm": 0.0006863299640826881, "learning_rate": 2.059869569520486e-07, "loss": 0.0, "num_input_tokens_seen": 112110696, "step": 166355 }, { "epoch": 4.06420247721887, "grad_norm": 0.0005132516380399466, "learning_rate": 2.0593511952498277e-07, "loss": 0.0, "num_input_tokens_seen": 112113896, "step": 166360 }, { "epoch": 4.064324628050717, "grad_norm": 0.00027452048379927874, "learning_rate": 2.0588328787257004e-07, "loss": 0.0, "num_input_tokens_seen": 112117224, "step": 166365 }, { "epoch": 4.064446778882564, "grad_norm": 0.00030005615553818643, "learning_rate": 2.0583146199518787e-07, "loss": 0.0, "num_input_tokens_seen": 112120552, "step": 166370 }, { "epoch": 4.064568929714412, "grad_norm": 0.01976819522678852, "learning_rate": 2.0577964189321284e-07, "loss": 0.0, "num_input_tokens_seen": 112123880, "step": 166375 }, { "epoch": 4.064691080546258, "grad_norm": 0.001043083262629807, "learning_rate": 2.0572782756702168e-07, "loss": 0.0, "num_input_tokens_seen": 112127080, "step": 166380 }, { "epoch": 4.064813231378106, "grad_norm": 0.0010621962137520313, "learning_rate": 2.0567601901699173e-07, "loss": 0.0, "num_input_tokens_seen": 112130664, "step": 166385 }, { "epoch": 4.064935382209953, "grad_norm": 0.0005313365836627781, "learning_rate": 2.0562421624349903e-07, "loss": 0.0, "num_input_tokens_seen": 112133992, "step": 166390 }, { "epoch": 4.0650575330418, "grad_norm": 3.122011185041629e-05, "learning_rate": 2.0557241924692103e-07, "loss": 0.0, "num_input_tokens_seen": 112137448, "step": 166395 }, { "epoch": 4.065179683873647, "grad_norm": 0.004067976027727127, "learning_rate": 2.0552062802763382e-07, "loss": 0.0, "num_input_tokens_seen": 112140584, "step": 166400 }, { "epoch": 4.065301834705495, "grad_norm": 0.0010395898716524243, "learning_rate": 2.0546884258601427e-07, "loss": 0.0001, "num_input_tokens_seen": 112143976, "step": 166405 }, { "epoch": 4.0654239855373415, "grad_norm": 0.00014114049554336816, "learning_rate": 2.0541706292243921e-07, "loss": 0.0, "num_input_tokens_seen": 112147560, "step": 166410 }, { "epoch": 4.065546136369188, "grad_norm": 0.001295996829867363, "learning_rate": 2.0536528903728478e-07, "loss": 0.0, "num_input_tokens_seen": 112151016, "step": 166415 }, { "epoch": 4.065668287201036, "grad_norm": 0.00015398616960737854, "learning_rate": 2.053135209309279e-07, "loss": 0.0, "num_input_tokens_seen": 112154408, "step": 166420 }, { "epoch": 4.065790438032883, "grad_norm": 5.002156103728339e-05, "learning_rate": 2.0526175860374462e-07, "loss": 0.0, "num_input_tokens_seen": 112157864, "step": 166425 }, { "epoch": 4.06591258886473, "grad_norm": 0.021989166736602783, "learning_rate": 2.0521000205611162e-07, "loss": 0.0, "num_input_tokens_seen": 112161000, "step": 166430 }, { "epoch": 4.066034739696577, "grad_norm": 0.00031291740015149117, "learning_rate": 2.0515825128840548e-07, "loss": 0.0, "num_input_tokens_seen": 112164008, "step": 166435 }, { "epoch": 4.066156890528425, "grad_norm": 0.00011651839304249734, "learning_rate": 2.0510650630100212e-07, "loss": 0.0, "num_input_tokens_seen": 112167272, "step": 166440 }, { "epoch": 4.066279041360271, "grad_norm": 0.00010879627370741218, "learning_rate": 2.0505476709427827e-07, "loss": 0.0, "num_input_tokens_seen": 112170600, "step": 166445 }, { "epoch": 4.066401192192119, "grad_norm": 0.0010120809311047196, "learning_rate": 2.050030336686097e-07, "loss": 0.0, "num_input_tokens_seen": 112174056, "step": 166450 }, { "epoch": 4.066523343023966, "grad_norm": 0.0003149090916849673, "learning_rate": 2.0495130602437315e-07, "loss": 0.0, "num_input_tokens_seen": 112177192, "step": 166455 }, { "epoch": 4.066645493855813, "grad_norm": 0.0001731503289192915, "learning_rate": 2.048995841619443e-07, "loss": 0.0, "num_input_tokens_seen": 112180776, "step": 166460 }, { "epoch": 4.06676764468766, "grad_norm": 0.00022798238205723464, "learning_rate": 2.0484786808169975e-07, "loss": 0.0, "num_input_tokens_seen": 112184168, "step": 166465 }, { "epoch": 4.066889795519508, "grad_norm": 0.002712154295295477, "learning_rate": 2.0479615778401517e-07, "loss": 0.0, "num_input_tokens_seen": 112187880, "step": 166470 }, { "epoch": 4.0670119463513545, "grad_norm": 0.00457811402156949, "learning_rate": 2.0474445326926703e-07, "loss": 0.0, "num_input_tokens_seen": 112191592, "step": 166475 }, { "epoch": 4.067134097183202, "grad_norm": 0.00020053704793099314, "learning_rate": 2.0469275453783098e-07, "loss": 0.0, "num_input_tokens_seen": 112194792, "step": 166480 }, { "epoch": 4.067256248015049, "grad_norm": 0.0011597948614507914, "learning_rate": 2.046410615900832e-07, "loss": 0.0, "num_input_tokens_seen": 112198376, "step": 166485 }, { "epoch": 4.067378398846897, "grad_norm": 0.0028763189911842346, "learning_rate": 2.0458937442639968e-07, "loss": 0.0, "num_input_tokens_seen": 112201896, "step": 166490 }, { "epoch": 4.067500549678743, "grad_norm": 0.008178922347724438, "learning_rate": 2.0453769304715586e-07, "loss": 0.0, "num_input_tokens_seen": 112205160, "step": 166495 }, { "epoch": 4.06762270051059, "grad_norm": 0.00021302708773873746, "learning_rate": 2.0448601745272797e-07, "loss": 0.0, "num_input_tokens_seen": 112208232, "step": 166500 }, { "epoch": 4.067744851342438, "grad_norm": 0.0008370587020181119, "learning_rate": 2.044343476434919e-07, "loss": 0.0, "num_input_tokens_seen": 112212008, "step": 166505 }, { "epoch": 4.067867002174284, "grad_norm": 0.001814281684346497, "learning_rate": 2.0438268361982303e-07, "loss": 0.0, "num_input_tokens_seen": 112215464, "step": 166510 }, { "epoch": 4.067989153006132, "grad_norm": 0.0003151059499941766, "learning_rate": 2.0433102538209745e-07, "loss": 0.0, "num_input_tokens_seen": 112218856, "step": 166515 }, { "epoch": 4.068111303837979, "grad_norm": 0.00011449768499005586, "learning_rate": 2.0427937293069042e-07, "loss": 0.0, "num_input_tokens_seen": 112221992, "step": 166520 }, { "epoch": 4.068233454669826, "grad_norm": 0.00038926705019548535, "learning_rate": 2.0422772626597796e-07, "loss": 0.0512, "num_input_tokens_seen": 112225128, "step": 166525 }, { "epoch": 4.068355605501673, "grad_norm": 19.979957580566406, "learning_rate": 2.0417608538833563e-07, "loss": 0.0204, "num_input_tokens_seen": 112228328, "step": 166530 }, { "epoch": 4.068477756333521, "grad_norm": 0.003946941811591387, "learning_rate": 2.0412445029813863e-07, "loss": 0.0, "num_input_tokens_seen": 112231528, "step": 166535 }, { "epoch": 4.068599907165368, "grad_norm": 0.04863838851451874, "learning_rate": 2.0407282099576295e-07, "loss": 0.0001, "num_input_tokens_seen": 112234600, "step": 166540 }, { "epoch": 4.068722057997215, "grad_norm": 0.00047266719047911465, "learning_rate": 2.0402119748158352e-07, "loss": 0.0, "num_input_tokens_seen": 112237800, "step": 166545 }, { "epoch": 4.068844208829062, "grad_norm": 0.0022663248237222433, "learning_rate": 2.039695797559763e-07, "loss": 0.0, "num_input_tokens_seen": 112240872, "step": 166550 }, { "epoch": 4.06896635966091, "grad_norm": 0.001182141830213368, "learning_rate": 2.0391796781931615e-07, "loss": 0.0, "num_input_tokens_seen": 112244392, "step": 166555 }, { "epoch": 4.069088510492756, "grad_norm": 0.0005252945702522993, "learning_rate": 2.0386636167197868e-07, "loss": 0.0, "num_input_tokens_seen": 112247976, "step": 166560 }, { "epoch": 4.069210661324604, "grad_norm": 0.00026759880711324513, "learning_rate": 2.038147613143394e-07, "loss": 0.0, "num_input_tokens_seen": 112251112, "step": 166565 }, { "epoch": 4.069332812156451, "grad_norm": 0.0017746612429618835, "learning_rate": 2.0376316674677306e-07, "loss": 0.0, "num_input_tokens_seen": 112254184, "step": 166570 }, { "epoch": 4.069454962988298, "grad_norm": 0.0013021057238802314, "learning_rate": 2.0371157796965544e-07, "loss": 0.0, "num_input_tokens_seen": 112257576, "step": 166575 }, { "epoch": 4.069577113820145, "grad_norm": 0.039894040673971176, "learning_rate": 2.0365999498336138e-07, "loss": 0.0, "num_input_tokens_seen": 112260904, "step": 166580 }, { "epoch": 4.069699264651993, "grad_norm": 0.0003660069196484983, "learning_rate": 2.0360841778826576e-07, "loss": 0.0, "num_input_tokens_seen": 112264040, "step": 166585 }, { "epoch": 4.0698214154838395, "grad_norm": 5.454904749058187e-05, "learning_rate": 2.0355684638474412e-07, "loss": 0.0, "num_input_tokens_seen": 112267240, "step": 166590 }, { "epoch": 4.069943566315686, "grad_norm": 0.0005420465604402125, "learning_rate": 2.035052807731712e-07, "loss": 0.0, "num_input_tokens_seen": 112270696, "step": 166595 }, { "epoch": 4.070065717147534, "grad_norm": 0.000395093928091228, "learning_rate": 2.034537209539222e-07, "loss": 0.0, "num_input_tokens_seen": 112274152, "step": 166600 }, { "epoch": 4.070187867979381, "grad_norm": 0.00028210054733790457, "learning_rate": 2.0340216692737188e-07, "loss": 0.0, "num_input_tokens_seen": 112278056, "step": 166605 }, { "epoch": 4.070310018811228, "grad_norm": 0.0013541615335270762, "learning_rate": 2.0335061869389547e-07, "loss": 0.0, "num_input_tokens_seen": 112281448, "step": 166610 }, { "epoch": 4.070432169643075, "grad_norm": 5.062710988568142e-05, "learning_rate": 2.0329907625386733e-07, "loss": 0.0, "num_input_tokens_seen": 112284904, "step": 166615 }, { "epoch": 4.070554320474923, "grad_norm": 0.0002720316406339407, "learning_rate": 2.0324753960766262e-07, "loss": 0.0, "num_input_tokens_seen": 112288616, "step": 166620 }, { "epoch": 4.070676471306769, "grad_norm": 0.039632584899663925, "learning_rate": 2.0319600875565635e-07, "loss": 0.0, "num_input_tokens_seen": 112291752, "step": 166625 }, { "epoch": 4.070798622138617, "grad_norm": 0.0007308509666472673, "learning_rate": 2.031444836982228e-07, "loss": 0.0, "num_input_tokens_seen": 112295400, "step": 166630 }, { "epoch": 4.070920772970464, "grad_norm": 0.0018601133488118649, "learning_rate": 2.030929644357371e-07, "loss": 0.0, "num_input_tokens_seen": 112298664, "step": 166635 }, { "epoch": 4.071042923802311, "grad_norm": 7.507337431889027e-05, "learning_rate": 2.030414509685734e-07, "loss": 0.0, "num_input_tokens_seen": 112301736, "step": 166640 }, { "epoch": 4.071165074634158, "grad_norm": 0.016003567725419998, "learning_rate": 2.0298994329710694e-07, "loss": 0.0, "num_input_tokens_seen": 112305064, "step": 166645 }, { "epoch": 4.071287225466006, "grad_norm": 0.0015702687669545412, "learning_rate": 2.029384414217118e-07, "loss": 0.0, "num_input_tokens_seen": 112308328, "step": 166650 }, { "epoch": 4.0714093762978525, "grad_norm": 0.00132664293050766, "learning_rate": 2.0288694534276262e-07, "loss": 0.0, "num_input_tokens_seen": 112311592, "step": 166655 }, { "epoch": 4.0715315271297, "grad_norm": 0.003941015340387821, "learning_rate": 2.0283545506063426e-07, "loss": 0.0, "num_input_tokens_seen": 112314728, "step": 166660 }, { "epoch": 4.071653677961547, "grad_norm": 0.0004780443850904703, "learning_rate": 2.0278397057570063e-07, "loss": 0.0005, "num_input_tokens_seen": 112317672, "step": 166665 }, { "epoch": 4.0717758287933945, "grad_norm": 0.026970183476805687, "learning_rate": 2.0273249188833652e-07, "loss": 0.0, "num_input_tokens_seen": 112321128, "step": 166670 }, { "epoch": 4.071897979625241, "grad_norm": 0.006872775498777628, "learning_rate": 2.026810189989161e-07, "loss": 0.0667, "num_input_tokens_seen": 112324840, "step": 166675 }, { "epoch": 4.072020130457088, "grad_norm": 0.0005837523494847119, "learning_rate": 2.0262955190781393e-07, "loss": 0.0, "num_input_tokens_seen": 112327848, "step": 166680 }, { "epoch": 4.072142281288936, "grad_norm": 0.001965815667062998, "learning_rate": 2.025780906154041e-07, "loss": 0.0, "num_input_tokens_seen": 112331240, "step": 166685 }, { "epoch": 4.072264432120782, "grad_norm": 0.00028454093262553215, "learning_rate": 2.025266351220607e-07, "loss": 0.0, "num_input_tokens_seen": 112334312, "step": 166690 }, { "epoch": 4.07238658295263, "grad_norm": 0.00013930317072663456, "learning_rate": 2.0247518542815822e-07, "loss": 0.0, "num_input_tokens_seen": 112337256, "step": 166695 }, { "epoch": 4.072508733784477, "grad_norm": 3.846707477350719e-05, "learning_rate": 2.024237415340706e-07, "loss": 0.0, "num_input_tokens_seen": 112340648, "step": 166700 }, { "epoch": 4.072630884616324, "grad_norm": 0.0019840323366224766, "learning_rate": 2.023723034401722e-07, "loss": 0.0, "num_input_tokens_seen": 112343720, "step": 166705 }, { "epoch": 4.072753035448171, "grad_norm": 0.0004893506411463022, "learning_rate": 2.0232087114683672e-07, "loss": 0.0, "num_input_tokens_seen": 112346792, "step": 166710 }, { "epoch": 4.072875186280019, "grad_norm": 0.0010090161813423038, "learning_rate": 2.022694446544385e-07, "loss": 0.0, "num_input_tokens_seen": 112350888, "step": 166715 }, { "epoch": 4.0729973371118655, "grad_norm": 0.00045400112867355347, "learning_rate": 2.0221802396335164e-07, "loss": 0.0003, "num_input_tokens_seen": 112354408, "step": 166720 }, { "epoch": 4.073119487943713, "grad_norm": 0.00026314205024391413, "learning_rate": 2.0216660907394955e-07, "loss": 0.0, "num_input_tokens_seen": 112357544, "step": 166725 }, { "epoch": 4.07324163877556, "grad_norm": 0.004530659411102533, "learning_rate": 2.0211519998660687e-07, "loss": 0.0001, "num_input_tokens_seen": 112361000, "step": 166730 }, { "epoch": 4.0733637896074075, "grad_norm": 0.0017997390823438764, "learning_rate": 2.020637967016967e-07, "loss": 0.0, "num_input_tokens_seen": 112363944, "step": 166735 }, { "epoch": 4.073485940439254, "grad_norm": 0.005322501994669437, "learning_rate": 2.0201239921959346e-07, "loss": 0.0, "num_input_tokens_seen": 112367080, "step": 166740 }, { "epoch": 4.073608091271102, "grad_norm": 0.0005989481578581035, "learning_rate": 2.0196100754067046e-07, "loss": 0.0, "num_input_tokens_seen": 112370472, "step": 166745 }, { "epoch": 4.073730242102949, "grad_norm": 0.003723002038896084, "learning_rate": 2.0190962166530167e-07, "loss": 0.0, "num_input_tokens_seen": 112373800, "step": 166750 }, { "epoch": 4.073852392934796, "grad_norm": 0.000128319050418213, "learning_rate": 2.018582415938611e-07, "loss": 0.0, "num_input_tokens_seen": 112376872, "step": 166755 }, { "epoch": 4.073974543766643, "grad_norm": 0.016991496086120605, "learning_rate": 2.018068673267217e-07, "loss": 0.0, "num_input_tokens_seen": 112379816, "step": 166760 }, { "epoch": 4.07409669459849, "grad_norm": 1.5704474208178e-05, "learning_rate": 2.017554988642578e-07, "loss": 0.0, "num_input_tokens_seen": 112383144, "step": 166765 }, { "epoch": 4.074218845430337, "grad_norm": 0.0006832077633589506, "learning_rate": 2.0170413620684222e-07, "loss": 0.0, "num_input_tokens_seen": 112386344, "step": 166770 }, { "epoch": 4.074340996262184, "grad_norm": 0.00029440299840644, "learning_rate": 2.0165277935484926e-07, "loss": 0.0, "num_input_tokens_seen": 112389672, "step": 166775 }, { "epoch": 4.074463147094032, "grad_norm": 4.601407272275537e-05, "learning_rate": 2.016014283086518e-07, "loss": 0.0, "num_input_tokens_seen": 112392744, "step": 166780 }, { "epoch": 4.0745852979258785, "grad_norm": 0.0020400311332195997, "learning_rate": 2.0155008306862366e-07, "loss": 0.0, "num_input_tokens_seen": 112396136, "step": 166785 }, { "epoch": 4.074707448757726, "grad_norm": 0.0016668371390551329, "learning_rate": 2.0149874363513775e-07, "loss": 0.0, "num_input_tokens_seen": 112399720, "step": 166790 }, { "epoch": 4.074829599589573, "grad_norm": 0.006800004281103611, "learning_rate": 2.0144741000856813e-07, "loss": 0.0, "num_input_tokens_seen": 112402536, "step": 166795 }, { "epoch": 4.0749517504214205, "grad_norm": 0.0035012788139283657, "learning_rate": 2.0139608218928772e-07, "loss": 0.0, "num_input_tokens_seen": 112406120, "step": 166800 }, { "epoch": 4.075073901253267, "grad_norm": 0.00147433637175709, "learning_rate": 2.0134476017766943e-07, "loss": 0.0, "num_input_tokens_seen": 112409384, "step": 166805 }, { "epoch": 4.075196052085115, "grad_norm": 0.020915621891617775, "learning_rate": 2.0129344397408698e-07, "loss": 0.0, "num_input_tokens_seen": 112412648, "step": 166810 }, { "epoch": 4.075318202916962, "grad_norm": 0.00011061589611927047, "learning_rate": 2.0124213357891362e-07, "loss": 0.0, "num_input_tokens_seen": 112415720, "step": 166815 }, { "epoch": 4.075440353748809, "grad_norm": 3.273623588029295e-05, "learning_rate": 2.0119082899252216e-07, "loss": 0.0, "num_input_tokens_seen": 112419496, "step": 166820 }, { "epoch": 4.075562504580656, "grad_norm": 0.00029083856497891247, "learning_rate": 2.0113953021528595e-07, "loss": 0.0, "num_input_tokens_seen": 112422824, "step": 166825 }, { "epoch": 4.075684655412504, "grad_norm": 0.00011449142039055005, "learning_rate": 2.0108823724757772e-07, "loss": 0.0, "num_input_tokens_seen": 112426280, "step": 166830 }, { "epoch": 4.07580680624435, "grad_norm": 0.0016644089482724667, "learning_rate": 2.0103695008977083e-07, "loss": 0.0, "num_input_tokens_seen": 112429608, "step": 166835 }, { "epoch": 4.075928957076198, "grad_norm": 0.0005640396266244352, "learning_rate": 2.0098566874223833e-07, "loss": 0.0, "num_input_tokens_seen": 112432744, "step": 166840 }, { "epoch": 4.076051107908045, "grad_norm": 0.0011225900379940867, "learning_rate": 2.0093439320535267e-07, "loss": 0.0, "num_input_tokens_seen": 112436392, "step": 166845 }, { "epoch": 4.0761732587398924, "grad_norm": 0.0030262626241892576, "learning_rate": 2.008831234794872e-07, "loss": 0.0, "num_input_tokens_seen": 112440104, "step": 166850 }, { "epoch": 4.076295409571739, "grad_norm": 0.0018415100639685988, "learning_rate": 2.0083185956501447e-07, "loss": 0.0, "num_input_tokens_seen": 112443112, "step": 166855 }, { "epoch": 4.076417560403586, "grad_norm": 0.0006190946442075074, "learning_rate": 2.0078060146230758e-07, "loss": 0.0, "num_input_tokens_seen": 112446184, "step": 166860 }, { "epoch": 4.076539711235434, "grad_norm": 1.6634949133731425e-05, "learning_rate": 2.007293491717389e-07, "loss": 0.0, "num_input_tokens_seen": 112449640, "step": 166865 }, { "epoch": 4.07666186206728, "grad_norm": 0.0008858484798111022, "learning_rate": 2.0067810269368136e-07, "loss": 0.0, "num_input_tokens_seen": 112452904, "step": 166870 }, { "epoch": 4.076784012899128, "grad_norm": 0.004251960664987564, "learning_rate": 2.0062686202850797e-07, "loss": 0.0, "num_input_tokens_seen": 112456488, "step": 166875 }, { "epoch": 4.076906163730975, "grad_norm": 5.69775584153831e-05, "learning_rate": 2.005756271765907e-07, "loss": 0.0, "num_input_tokens_seen": 112459688, "step": 166880 }, { "epoch": 4.077028314562822, "grad_norm": 0.0008310135453939438, "learning_rate": 2.005243981383028e-07, "loss": 0.0, "num_input_tokens_seen": 112463400, "step": 166885 }, { "epoch": 4.077150465394669, "grad_norm": 0.0076802265830338, "learning_rate": 2.004731749140165e-07, "loss": 0.0, "num_input_tokens_seen": 112466600, "step": 166890 }, { "epoch": 4.077272616226517, "grad_norm": 0.00616258243098855, "learning_rate": 2.0042195750410406e-07, "loss": 0.0, "num_input_tokens_seen": 112470056, "step": 166895 }, { "epoch": 4.0773947670583635, "grad_norm": 0.00012890664220321923, "learning_rate": 2.0037074590893842e-07, "loss": 0.0, "num_input_tokens_seen": 112473576, "step": 166900 }, { "epoch": 4.077516917890211, "grad_norm": 0.0006692970637232065, "learning_rate": 2.0031954012889153e-07, "loss": 0.0917, "num_input_tokens_seen": 112476904, "step": 166905 }, { "epoch": 4.077639068722058, "grad_norm": 0.000872828415594995, "learning_rate": 2.0026834016433635e-07, "loss": 0.0, "num_input_tokens_seen": 112480488, "step": 166910 }, { "epoch": 4.0777612195539055, "grad_norm": 0.0008254763088189065, "learning_rate": 2.0021714601564464e-07, "loss": 0.0, "num_input_tokens_seen": 112484264, "step": 166915 }, { "epoch": 4.077883370385752, "grad_norm": 0.0012822564458474517, "learning_rate": 2.0016595768318922e-07, "loss": 0.0, "num_input_tokens_seen": 112487464, "step": 166920 }, { "epoch": 4.0780055212176, "grad_norm": 0.0002115043462254107, "learning_rate": 2.0011477516734175e-07, "loss": 0.0, "num_input_tokens_seen": 112491176, "step": 166925 }, { "epoch": 4.078127672049447, "grad_norm": 0.0035075817722827196, "learning_rate": 2.0006359846847487e-07, "loss": 0.0, "num_input_tokens_seen": 112493928, "step": 166930 }, { "epoch": 4.078249822881294, "grad_norm": 0.00015296187484636903, "learning_rate": 2.000124275869609e-07, "loss": 0.0, "num_input_tokens_seen": 112497576, "step": 166935 }, { "epoch": 4.078371973713141, "grad_norm": 0.016465021297335625, "learning_rate": 1.9996126252317146e-07, "loss": 0.0, "num_input_tokens_seen": 112500584, "step": 166940 }, { "epoch": 4.078494124544988, "grad_norm": 0.002111559733748436, "learning_rate": 1.9991010327747915e-07, "loss": 0.0, "num_input_tokens_seen": 112503784, "step": 166945 }, { "epoch": 4.078616275376835, "grad_norm": 0.006358523387461901, "learning_rate": 1.9985894985025542e-07, "loss": 0.0, "num_input_tokens_seen": 112507240, "step": 166950 }, { "epoch": 4.078738426208682, "grad_norm": 0.000776168773882091, "learning_rate": 1.99807802241873e-07, "loss": 0.0, "num_input_tokens_seen": 112510184, "step": 166955 }, { "epoch": 4.07886057704053, "grad_norm": 2.3206490368465893e-05, "learning_rate": 1.9975666045270323e-07, "loss": 0.0, "num_input_tokens_seen": 112514280, "step": 166960 }, { "epoch": 4.0789827278723765, "grad_norm": 0.0020938734523952007, "learning_rate": 1.9970552448311818e-07, "loss": 0.0, "num_input_tokens_seen": 112517608, "step": 166965 }, { "epoch": 4.079104878704224, "grad_norm": 0.000599257240537554, "learning_rate": 1.9965439433349008e-07, "loss": 0.0, "num_input_tokens_seen": 112520808, "step": 166970 }, { "epoch": 4.079227029536071, "grad_norm": 0.0007910645217634737, "learning_rate": 1.9960327000419032e-07, "loss": 0.0, "num_input_tokens_seen": 112524008, "step": 166975 }, { "epoch": 4.0793491803679185, "grad_norm": 0.0006026038900017738, "learning_rate": 1.9955215149559101e-07, "loss": 0.0, "num_input_tokens_seen": 112526952, "step": 166980 }, { "epoch": 4.079471331199765, "grad_norm": 0.00035834472510032356, "learning_rate": 1.9950103880806357e-07, "loss": 0.0, "num_input_tokens_seen": 112530664, "step": 166985 }, { "epoch": 4.079593482031613, "grad_norm": 0.004703744780272245, "learning_rate": 1.9944993194198012e-07, "loss": 0.0018, "num_input_tokens_seen": 112534184, "step": 166990 }, { "epoch": 4.07971563286346, "grad_norm": 0.0003986228839494288, "learning_rate": 1.9939883089771203e-07, "loss": 0.0, "num_input_tokens_seen": 112537576, "step": 166995 }, { "epoch": 4.079837783695307, "grad_norm": 0.00031761577702127397, "learning_rate": 1.9934773567563079e-07, "loss": 0.0308, "num_input_tokens_seen": 112540840, "step": 167000 }, { "epoch": 4.079959934527154, "grad_norm": 0.0002825024421326816, "learning_rate": 1.9929664627610842e-07, "loss": 0.0392, "num_input_tokens_seen": 112544232, "step": 167005 }, { "epoch": 4.080082085359002, "grad_norm": 7.694314263062552e-05, "learning_rate": 1.9924556269951587e-07, "loss": 0.0359, "num_input_tokens_seen": 112547624, "step": 167010 }, { "epoch": 4.080204236190848, "grad_norm": 0.0013545994879677892, "learning_rate": 1.9919448494622526e-07, "loss": 0.0, "num_input_tokens_seen": 112551272, "step": 167015 }, { "epoch": 4.080326387022696, "grad_norm": 0.0005039193201810122, "learning_rate": 1.9914341301660752e-07, "loss": 0.0, "num_input_tokens_seen": 112554344, "step": 167020 }, { "epoch": 4.080448537854543, "grad_norm": 0.00019876591977663338, "learning_rate": 1.9909234691103426e-07, "loss": 0.0, "num_input_tokens_seen": 112557736, "step": 167025 }, { "epoch": 4.08057068868639, "grad_norm": 0.00017606680921744555, "learning_rate": 1.9904128662987717e-07, "loss": 0.0, "num_input_tokens_seen": 112561128, "step": 167030 }, { "epoch": 4.080692839518237, "grad_norm": 0.001958389300853014, "learning_rate": 1.9899023217350697e-07, "loss": 0.0, "num_input_tokens_seen": 112564264, "step": 167035 }, { "epoch": 4.080814990350084, "grad_norm": 0.0005842614336870611, "learning_rate": 1.9893918354229554e-07, "loss": 0.0, "num_input_tokens_seen": 112567784, "step": 167040 }, { "epoch": 4.0809371411819315, "grad_norm": 0.00013599508383776993, "learning_rate": 1.9888814073661353e-07, "loss": 0.0, "num_input_tokens_seen": 112571112, "step": 167045 }, { "epoch": 4.081059292013778, "grad_norm": 270.36590576171875, "learning_rate": 1.9883710375683273e-07, "loss": 0.054, "num_input_tokens_seen": 112574760, "step": 167050 }, { "epoch": 4.081181442845626, "grad_norm": 5.747499017161317e-05, "learning_rate": 1.987860726033237e-07, "loss": 0.0, "num_input_tokens_seen": 112578152, "step": 167055 }, { "epoch": 4.081303593677473, "grad_norm": 6.630839197896421e-05, "learning_rate": 1.9873504727645784e-07, "loss": 0.0, "num_input_tokens_seen": 112581928, "step": 167060 }, { "epoch": 4.08142574450932, "grad_norm": 0.0018387179588899016, "learning_rate": 1.9868402777660652e-07, "loss": 0.0, "num_input_tokens_seen": 112585320, "step": 167065 }, { "epoch": 4.081547895341167, "grad_norm": 0.0008636590791866183, "learning_rate": 1.9863301410414024e-07, "loss": 0.0, "num_input_tokens_seen": 112588968, "step": 167070 }, { "epoch": 4.081670046173015, "grad_norm": 0.006454425398260355, "learning_rate": 1.9858200625943044e-07, "loss": 0.0, "num_input_tokens_seen": 112591976, "step": 167075 }, { "epoch": 4.081792197004861, "grad_norm": 0.00018307649588678032, "learning_rate": 1.9853100424284764e-07, "loss": 0.0001, "num_input_tokens_seen": 112595048, "step": 167080 }, { "epoch": 4.081914347836709, "grad_norm": 0.0006819216068834066, "learning_rate": 1.9848000805476284e-07, "loss": 0.0, "num_input_tokens_seen": 112598888, "step": 167085 }, { "epoch": 4.082036498668556, "grad_norm": 0.005882292054593563, "learning_rate": 1.9842901769554742e-07, "loss": 0.0, "num_input_tokens_seen": 112601768, "step": 167090 }, { "epoch": 4.082158649500403, "grad_norm": 0.0009726961143314838, "learning_rate": 1.9837803316557167e-07, "loss": 0.0, "num_input_tokens_seen": 112605096, "step": 167095 }, { "epoch": 4.08228080033225, "grad_norm": 0.001027434947900474, "learning_rate": 1.9832705446520625e-07, "loss": 0.0, "num_input_tokens_seen": 112608680, "step": 167100 }, { "epoch": 4.082402951164098, "grad_norm": 0.00011184045433765277, "learning_rate": 1.9827608159482235e-07, "loss": 0.0, "num_input_tokens_seen": 112612136, "step": 167105 }, { "epoch": 4.0825251019959445, "grad_norm": 0.0006892028613947332, "learning_rate": 1.9822511455479041e-07, "loss": 0.0, "num_input_tokens_seen": 112615656, "step": 167110 }, { "epoch": 4.082647252827792, "grad_norm": 5.178718492970802e-05, "learning_rate": 1.9817415334548093e-07, "loss": 0.0, "num_input_tokens_seen": 112618728, "step": 167115 }, { "epoch": 4.082769403659639, "grad_norm": 0.0028919263277202845, "learning_rate": 1.9812319796726452e-07, "loss": 0.0, "num_input_tokens_seen": 112621992, "step": 167120 }, { "epoch": 4.082891554491486, "grad_norm": 0.00037826705374754965, "learning_rate": 1.980722484205123e-07, "loss": 0.0, "num_input_tokens_seen": 112625192, "step": 167125 }, { "epoch": 4.083013705323333, "grad_norm": 0.0008559504640288651, "learning_rate": 1.9802130470559397e-07, "loss": 0.0, "num_input_tokens_seen": 112628392, "step": 167130 }, { "epoch": 4.08313585615518, "grad_norm": 0.0019665653817355633, "learning_rate": 1.9797036682288083e-07, "loss": 0.0, "num_input_tokens_seen": 112631976, "step": 167135 }, { "epoch": 4.083258006987028, "grad_norm": 0.0028887365479022264, "learning_rate": 1.9791943477274255e-07, "loss": 0.0, "num_input_tokens_seen": 112636136, "step": 167140 }, { "epoch": 4.083380157818874, "grad_norm": 0.0007512049051001668, "learning_rate": 1.9786850855554993e-07, "loss": 0.0, "num_input_tokens_seen": 112639656, "step": 167145 }, { "epoch": 4.083502308650722, "grad_norm": 8.230555249610916e-05, "learning_rate": 1.9781758817167348e-07, "loss": 0.0, "num_input_tokens_seen": 112642792, "step": 167150 }, { "epoch": 4.083624459482569, "grad_norm": 0.0015477788401767612, "learning_rate": 1.9776667362148303e-07, "loss": 0.0, "num_input_tokens_seen": 112646120, "step": 167155 }, { "epoch": 4.083746610314416, "grad_norm": 0.0016721126157790422, "learning_rate": 1.9771576490534935e-07, "loss": 0.0, "num_input_tokens_seen": 112649576, "step": 167160 }, { "epoch": 4.083868761146263, "grad_norm": 0.000851157121360302, "learning_rate": 1.976648620236422e-07, "loss": 0.0, "num_input_tokens_seen": 112652840, "step": 167165 }, { "epoch": 4.083990911978111, "grad_norm": 0.00016223567945417017, "learning_rate": 1.976139649767322e-07, "loss": 0.0, "num_input_tokens_seen": 112656040, "step": 167170 }, { "epoch": 4.084113062809958, "grad_norm": 0.003818761557340622, "learning_rate": 1.9756307376498905e-07, "loss": 0.0, "num_input_tokens_seen": 112659112, "step": 167175 }, { "epoch": 4.084235213641805, "grad_norm": 0.0004354110569693148, "learning_rate": 1.9751218838878304e-07, "loss": 0.0, "num_input_tokens_seen": 112662184, "step": 167180 }, { "epoch": 4.084357364473652, "grad_norm": 0.025302061811089516, "learning_rate": 1.9746130884848445e-07, "loss": 0.0, "num_input_tokens_seen": 112665512, "step": 167185 }, { "epoch": 4.0844795153055, "grad_norm": 9.590527952241246e-06, "learning_rate": 1.9741043514446288e-07, "loss": 0.0, "num_input_tokens_seen": 112668840, "step": 167190 }, { "epoch": 4.084601666137346, "grad_norm": 0.01576736755669117, "learning_rate": 1.973595672770887e-07, "loss": 0.0, "num_input_tokens_seen": 112672168, "step": 167195 }, { "epoch": 4.084723816969194, "grad_norm": 0.00047771475510671735, "learning_rate": 1.9730870524673172e-07, "loss": 0.0, "num_input_tokens_seen": 112675368, "step": 167200 }, { "epoch": 4.084845967801041, "grad_norm": 8.826627890812233e-05, "learning_rate": 1.972578490537614e-07, "loss": 0.0, "num_input_tokens_seen": 112678504, "step": 167205 }, { "epoch": 4.084968118632888, "grad_norm": 0.00017720961477607489, "learning_rate": 1.9720699869854817e-07, "loss": 0.0, "num_input_tokens_seen": 112681960, "step": 167210 }, { "epoch": 4.085090269464735, "grad_norm": 0.0009463115711696446, "learning_rate": 1.9715615418146138e-07, "loss": 0.0, "num_input_tokens_seen": 112685288, "step": 167215 }, { "epoch": 4.085212420296582, "grad_norm": 1.9413335394347087e-05, "learning_rate": 1.9710531550287112e-07, "loss": 0.0, "num_input_tokens_seen": 112688360, "step": 167220 }, { "epoch": 4.0853345711284295, "grad_norm": 0.006827809847891331, "learning_rate": 1.9705448266314685e-07, "loss": 0.0, "num_input_tokens_seen": 112692136, "step": 167225 }, { "epoch": 4.085456721960276, "grad_norm": 6.61760859657079e-05, "learning_rate": 1.9700365566265852e-07, "loss": 0.0, "num_input_tokens_seen": 112695336, "step": 167230 }, { "epoch": 4.085578872792124, "grad_norm": 2.5695724616525695e-05, "learning_rate": 1.9695283450177523e-07, "loss": 0.0004, "num_input_tokens_seen": 112698536, "step": 167235 }, { "epoch": 4.085701023623971, "grad_norm": 0.002052685245871544, "learning_rate": 1.9690201918086712e-07, "loss": 0.0001, "num_input_tokens_seen": 112701736, "step": 167240 }, { "epoch": 4.085823174455818, "grad_norm": 0.004446287173777819, "learning_rate": 1.9685120970030366e-07, "loss": 0.0, "num_input_tokens_seen": 112705064, "step": 167245 }, { "epoch": 4.085945325287665, "grad_norm": 0.0018460979918017983, "learning_rate": 1.9680040606045402e-07, "loss": 0.0, "num_input_tokens_seen": 112708520, "step": 167250 }, { "epoch": 4.086067476119513, "grad_norm": 6.863919406896457e-05, "learning_rate": 1.9674960826168807e-07, "loss": 0.0, "num_input_tokens_seen": 112711400, "step": 167255 }, { "epoch": 4.086189626951359, "grad_norm": 0.00014254235429689288, "learning_rate": 1.966988163043748e-07, "loss": 0.0, "num_input_tokens_seen": 112714472, "step": 167260 }, { "epoch": 4.086311777783207, "grad_norm": 0.026957418769598007, "learning_rate": 1.966480301888841e-07, "loss": 0.0, "num_input_tokens_seen": 112717864, "step": 167265 }, { "epoch": 4.086433928615054, "grad_norm": 0.00010942335211439058, "learning_rate": 1.9659724991558467e-07, "loss": 0.0, "num_input_tokens_seen": 112720872, "step": 167270 }, { "epoch": 4.086556079446901, "grad_norm": 1.6432888514827937e-05, "learning_rate": 1.9654647548484615e-07, "loss": 0.0, "num_input_tokens_seen": 112724648, "step": 167275 }, { "epoch": 4.086678230278748, "grad_norm": 6.028258940204978e-05, "learning_rate": 1.96495706897038e-07, "loss": 0.0, "num_input_tokens_seen": 112727848, "step": 167280 }, { "epoch": 4.086800381110596, "grad_norm": 0.006534240674227476, "learning_rate": 1.9644494415252887e-07, "loss": 0.0, "num_input_tokens_seen": 112731176, "step": 167285 }, { "epoch": 4.0869225319424425, "grad_norm": 0.0022399064619094133, "learning_rate": 1.9639418725168866e-07, "loss": 0.0, "num_input_tokens_seen": 112734312, "step": 167290 }, { "epoch": 4.08704468277429, "grad_norm": 0.0006240478251129389, "learning_rate": 1.963434361948857e-07, "loss": 0.0, "num_input_tokens_seen": 112737320, "step": 167295 }, { "epoch": 4.087166833606137, "grad_norm": 0.0010657204547896981, "learning_rate": 1.9629269098248967e-07, "loss": 0.0, "num_input_tokens_seen": 112740328, "step": 167300 }, { "epoch": 4.087288984437984, "grad_norm": 0.00023298896849155426, "learning_rate": 1.9624195161486945e-07, "loss": 0.0, "num_input_tokens_seen": 112743336, "step": 167305 }, { "epoch": 4.087411135269831, "grad_norm": 2.725253943935968e-05, "learning_rate": 1.961912180923936e-07, "loss": 0.0, "num_input_tokens_seen": 112746792, "step": 167310 }, { "epoch": 4.087533286101678, "grad_norm": 0.0026628663763403893, "learning_rate": 1.961404904154317e-07, "loss": 0.0, "num_input_tokens_seen": 112750312, "step": 167315 }, { "epoch": 4.087655436933526, "grad_norm": 0.00037412403617054224, "learning_rate": 1.960897685843521e-07, "loss": 0.0, "num_input_tokens_seen": 112753448, "step": 167320 }, { "epoch": 4.087777587765372, "grad_norm": 0.0007068165577948093, "learning_rate": 1.9603905259952426e-07, "loss": 0.0, "num_input_tokens_seen": 112756840, "step": 167325 }, { "epoch": 4.08789973859722, "grad_norm": 7.83190771471709e-05, "learning_rate": 1.9598834246131634e-07, "loss": 0.0, "num_input_tokens_seen": 112760296, "step": 167330 }, { "epoch": 4.088021889429067, "grad_norm": 0.0009760848479345441, "learning_rate": 1.9593763817009745e-07, "loss": 0.0, "num_input_tokens_seen": 112763496, "step": 167335 }, { "epoch": 4.088144040260914, "grad_norm": 0.0001355950371362269, "learning_rate": 1.958869397262366e-07, "loss": 0.0, "num_input_tokens_seen": 112767016, "step": 167340 }, { "epoch": 4.088266191092761, "grad_norm": 0.0007737476262263954, "learning_rate": 1.9583624713010183e-07, "loss": 0.0, "num_input_tokens_seen": 112769896, "step": 167345 }, { "epoch": 4.088388341924609, "grad_norm": 0.000765020027756691, "learning_rate": 1.9578556038206262e-07, "loss": 0.0, "num_input_tokens_seen": 112773032, "step": 167350 }, { "epoch": 4.0885104927564555, "grad_norm": 0.00048364579561166465, "learning_rate": 1.9573487948248668e-07, "loss": 0.0, "num_input_tokens_seen": 112776424, "step": 167355 }, { "epoch": 4.088632643588303, "grad_norm": 0.002045189728960395, "learning_rate": 1.9568420443174338e-07, "loss": 0.0, "num_input_tokens_seen": 112779496, "step": 167360 }, { "epoch": 4.08875479442015, "grad_norm": 0.00012536265421658754, "learning_rate": 1.9563353523020066e-07, "loss": 0.0, "num_input_tokens_seen": 112782504, "step": 167365 }, { "epoch": 4.0888769452519975, "grad_norm": 0.0009548076777718961, "learning_rate": 1.9558287187822707e-07, "loss": 0.0, "num_input_tokens_seen": 112786216, "step": 167370 }, { "epoch": 4.088999096083844, "grad_norm": 6.926884816493839e-05, "learning_rate": 1.955322143761916e-07, "loss": 0.0, "num_input_tokens_seen": 112789992, "step": 167375 }, { "epoch": 4.089121246915692, "grad_norm": 0.00932652223855257, "learning_rate": 1.9548156272446194e-07, "loss": 0.0, "num_input_tokens_seen": 112793192, "step": 167380 }, { "epoch": 4.089243397747539, "grad_norm": 0.00042319862404838204, "learning_rate": 1.954309169234071e-07, "loss": 0.0, "num_input_tokens_seen": 112796648, "step": 167385 }, { "epoch": 4.089365548579385, "grad_norm": 5.152716039447114e-05, "learning_rate": 1.9538027697339455e-07, "loss": 0.0, "num_input_tokens_seen": 112800104, "step": 167390 }, { "epoch": 4.089487699411233, "grad_norm": 5.449349919217639e-05, "learning_rate": 1.9532964287479325e-07, "loss": 0.0, "num_input_tokens_seen": 112803688, "step": 167395 }, { "epoch": 4.08960985024308, "grad_norm": 0.00032072042813524604, "learning_rate": 1.9527901462797136e-07, "loss": 0.0, "num_input_tokens_seen": 112807400, "step": 167400 }, { "epoch": 4.089732001074927, "grad_norm": 0.0025550902355462313, "learning_rate": 1.95228392233297e-07, "loss": 0.0, "num_input_tokens_seen": 112810792, "step": 167405 }, { "epoch": 4.089854151906774, "grad_norm": 0.000510639336425811, "learning_rate": 1.9517777569113792e-07, "loss": 0.0, "num_input_tokens_seen": 112814184, "step": 167410 }, { "epoch": 4.089976302738622, "grad_norm": 2.887546543206554e-05, "learning_rate": 1.9512716500186277e-07, "loss": 0.0, "num_input_tokens_seen": 112817896, "step": 167415 }, { "epoch": 4.0900984535704685, "grad_norm": 0.00021238067711237818, "learning_rate": 1.950765601658394e-07, "loss": 0.0, "num_input_tokens_seen": 112821800, "step": 167420 }, { "epoch": 4.090220604402316, "grad_norm": 0.00016823512851260602, "learning_rate": 1.950259611834355e-07, "loss": 0.0, "num_input_tokens_seen": 112825064, "step": 167425 }, { "epoch": 4.090342755234163, "grad_norm": 0.005793475545942783, "learning_rate": 1.9497536805501934e-07, "loss": 0.0, "num_input_tokens_seen": 112828200, "step": 167430 }, { "epoch": 4.0904649060660105, "grad_norm": 0.00038167531602084637, "learning_rate": 1.9492478078095909e-07, "loss": 0.0, "num_input_tokens_seen": 112832104, "step": 167435 }, { "epoch": 4.090587056897857, "grad_norm": 0.00012043816968798637, "learning_rate": 1.948741993616221e-07, "loss": 0.0, "num_input_tokens_seen": 112835496, "step": 167440 }, { "epoch": 4.090709207729705, "grad_norm": 0.00020174685050733387, "learning_rate": 1.948236237973767e-07, "loss": 0.0, "num_input_tokens_seen": 112838888, "step": 167445 }, { "epoch": 4.090831358561552, "grad_norm": 0.001029736828058958, "learning_rate": 1.9477305408859023e-07, "loss": 0.0, "num_input_tokens_seen": 112842856, "step": 167450 }, { "epoch": 4.090953509393399, "grad_norm": 0.0009005250176414847, "learning_rate": 1.9472249023563103e-07, "loss": 0.0, "num_input_tokens_seen": 112846184, "step": 167455 }, { "epoch": 4.091075660225246, "grad_norm": 0.00013175183266866952, "learning_rate": 1.9467193223886613e-07, "loss": 0.0, "num_input_tokens_seen": 112849512, "step": 167460 }, { "epoch": 4.091197811057094, "grad_norm": 0.006021356210112572, "learning_rate": 1.9462138009866357e-07, "loss": 0.0, "num_input_tokens_seen": 112853096, "step": 167465 }, { "epoch": 4.09131996188894, "grad_norm": 2.9928800358902663e-05, "learning_rate": 1.945708338153913e-07, "loss": 0.0011, "num_input_tokens_seen": 112855912, "step": 167470 }, { "epoch": 4.091442112720788, "grad_norm": 2.2655483917333186e-05, "learning_rate": 1.9452029338941623e-07, "loss": 0.0, "num_input_tokens_seen": 112859560, "step": 167475 }, { "epoch": 4.091564263552635, "grad_norm": 0.0012740838574245572, "learning_rate": 1.944697588211064e-07, "loss": 0.0, "num_input_tokens_seen": 112863080, "step": 167480 }, { "epoch": 4.0916864143844816, "grad_norm": 0.020395029336214066, "learning_rate": 1.9441923011082905e-07, "loss": 0.0, "num_input_tokens_seen": 112866408, "step": 167485 }, { "epoch": 4.091808565216329, "grad_norm": 0.0010367101058363914, "learning_rate": 1.943687072589516e-07, "loss": 0.0, "num_input_tokens_seen": 112869672, "step": 167490 }, { "epoch": 4.091930716048176, "grad_norm": 0.0001547683059470728, "learning_rate": 1.9431819026584196e-07, "loss": 0.0, "num_input_tokens_seen": 112872808, "step": 167495 }, { "epoch": 4.092052866880024, "grad_norm": 0.0013382845791056752, "learning_rate": 1.942676791318668e-07, "loss": 0.0, "num_input_tokens_seen": 112876072, "step": 167500 }, { "epoch": 4.09217501771187, "grad_norm": 0.0001629707112442702, "learning_rate": 1.942171738573941e-07, "loss": 0.0, "num_input_tokens_seen": 112879464, "step": 167505 }, { "epoch": 4.092297168543718, "grad_norm": 7.951433872221969e-06, "learning_rate": 1.941666744427909e-07, "loss": 0.0, "num_input_tokens_seen": 112882792, "step": 167510 }, { "epoch": 4.092419319375565, "grad_norm": 0.0005250205867923796, "learning_rate": 1.9411618088842396e-07, "loss": 0.05, "num_input_tokens_seen": 112886568, "step": 167515 }, { "epoch": 4.092541470207412, "grad_norm": 0.004777814727276564, "learning_rate": 1.9406569319466136e-07, "loss": 0.0, "num_input_tokens_seen": 112890088, "step": 167520 }, { "epoch": 4.092663621039259, "grad_norm": 0.001972571946680546, "learning_rate": 1.9401521136186937e-07, "loss": 0.0, "num_input_tokens_seen": 112893608, "step": 167525 }, { "epoch": 4.092785771871107, "grad_norm": 0.00029019240173511207, "learning_rate": 1.939647353904159e-07, "loss": 0.0, "num_input_tokens_seen": 112896552, "step": 167530 }, { "epoch": 4.0929079227029534, "grad_norm": 0.01268207747489214, "learning_rate": 1.9391426528066744e-07, "loss": 0.0, "num_input_tokens_seen": 112899880, "step": 167535 }, { "epoch": 4.093030073534801, "grad_norm": 0.0008546076714992523, "learning_rate": 1.938638010329915e-07, "loss": 0.0, "num_input_tokens_seen": 112902824, "step": 167540 }, { "epoch": 4.093152224366648, "grad_norm": 0.001968494150787592, "learning_rate": 1.9381334264775462e-07, "loss": 0.0, "num_input_tokens_seen": 112906344, "step": 167545 }, { "epoch": 4.0932743751984955, "grad_norm": 0.001512790215201676, "learning_rate": 1.9376289012532388e-07, "loss": 0.0, "num_input_tokens_seen": 112909928, "step": 167550 }, { "epoch": 4.093396526030342, "grad_norm": 0.0003397958935238421, "learning_rate": 1.937124434660664e-07, "loss": 0.0, "num_input_tokens_seen": 112913576, "step": 167555 }, { "epoch": 4.09351867686219, "grad_norm": 0.00014833496243227273, "learning_rate": 1.9366200267034882e-07, "loss": 0.0, "num_input_tokens_seen": 112916776, "step": 167560 }, { "epoch": 4.093640827694037, "grad_norm": 2.349866190343164e-05, "learning_rate": 1.9361156773853826e-07, "loss": 0.0, "num_input_tokens_seen": 112920104, "step": 167565 }, { "epoch": 4.093762978525883, "grad_norm": 218.95103454589844, "learning_rate": 1.9356113867100089e-07, "loss": 0.0526, "num_input_tokens_seen": 112923752, "step": 167570 }, { "epoch": 4.093885129357731, "grad_norm": 0.00042152617243118584, "learning_rate": 1.9351071546810428e-07, "loss": 0.0, "num_input_tokens_seen": 112926696, "step": 167575 }, { "epoch": 4.094007280189578, "grad_norm": 0.0010712286457419395, "learning_rate": 1.9346029813021425e-07, "loss": 0.0, "num_input_tokens_seen": 112929960, "step": 167580 }, { "epoch": 4.094129431021425, "grad_norm": 6.560736892424757e-06, "learning_rate": 1.9340988665769786e-07, "loss": 0.0, "num_input_tokens_seen": 112932904, "step": 167585 }, { "epoch": 4.094251581853272, "grad_norm": 0.00016035842418204993, "learning_rate": 1.9335948105092203e-07, "loss": 0.0, "num_input_tokens_seen": 112936168, "step": 167590 }, { "epoch": 4.09437373268512, "grad_norm": 0.00047581008402630687, "learning_rate": 1.9330908131025282e-07, "loss": 0.0, "num_input_tokens_seen": 112939624, "step": 167595 }, { "epoch": 4.0944958835169665, "grad_norm": 0.006864218972623348, "learning_rate": 1.9325868743605711e-07, "loss": 0.0, "num_input_tokens_seen": 112943016, "step": 167600 }, { "epoch": 4.094618034348814, "grad_norm": 0.0001095055922633037, "learning_rate": 1.93208299428701e-07, "loss": 0.0, "num_input_tokens_seen": 112946216, "step": 167605 }, { "epoch": 4.094740185180661, "grad_norm": 0.00017072324408218265, "learning_rate": 1.9315791728855136e-07, "loss": 0.0, "num_input_tokens_seen": 112949224, "step": 167610 }, { "epoch": 4.0948623360125085, "grad_norm": 0.0022964330855757, "learning_rate": 1.9310754101597437e-07, "loss": 0.0, "num_input_tokens_seen": 112952552, "step": 167615 }, { "epoch": 4.094984486844355, "grad_norm": 0.0011054405476897955, "learning_rate": 1.930571706113362e-07, "loss": 0.0, "num_input_tokens_seen": 112955880, "step": 167620 }, { "epoch": 4.095106637676203, "grad_norm": 0.0005825211410410702, "learning_rate": 1.9300680607500354e-07, "loss": 0.0, "num_input_tokens_seen": 112958760, "step": 167625 }, { "epoch": 4.09522878850805, "grad_norm": 0.0003326810256112367, "learning_rate": 1.9295644740734207e-07, "loss": 0.0, "num_input_tokens_seen": 112962216, "step": 167630 }, { "epoch": 4.095350939339897, "grad_norm": 9.163653157884255e-05, "learning_rate": 1.9290609460871876e-07, "loss": 0.0, "num_input_tokens_seen": 112966440, "step": 167635 }, { "epoch": 4.095473090171744, "grad_norm": 0.0004953984171152115, "learning_rate": 1.928557476794991e-07, "loss": 0.0, "num_input_tokens_seen": 112969448, "step": 167640 }, { "epoch": 4.095595241003592, "grad_norm": 4.5307486288947985e-05, "learning_rate": 1.928054066200495e-07, "loss": 0.0698, "num_input_tokens_seen": 112973288, "step": 167645 }, { "epoch": 4.095717391835438, "grad_norm": 0.00015699061623308808, "learning_rate": 1.9275507143073645e-07, "loss": 0.0, "num_input_tokens_seen": 112976744, "step": 167650 }, { "epoch": 4.095839542667285, "grad_norm": 2.502801362425089e-05, "learning_rate": 1.9270474211192534e-07, "loss": 0.0, "num_input_tokens_seen": 112979688, "step": 167655 }, { "epoch": 4.095961693499133, "grad_norm": 0.0003214066382497549, "learning_rate": 1.926544186639828e-07, "loss": 0.0, "num_input_tokens_seen": 112982888, "step": 167660 }, { "epoch": 4.0960838443309795, "grad_norm": 0.0001289208885282278, "learning_rate": 1.9260410108727408e-07, "loss": 0.0, "num_input_tokens_seen": 112986024, "step": 167665 }, { "epoch": 4.096205995162827, "grad_norm": 0.0011240255553275347, "learning_rate": 1.9255378938216583e-07, "loss": 0.0, "num_input_tokens_seen": 112989288, "step": 167670 }, { "epoch": 4.096328145994674, "grad_norm": 0.00025505939265713096, "learning_rate": 1.9250348354902335e-07, "loss": 0.0, "num_input_tokens_seen": 112992232, "step": 167675 }, { "epoch": 4.0964502968265215, "grad_norm": 0.00030178253655321896, "learning_rate": 1.9245318358821272e-07, "loss": 0.0, "num_input_tokens_seen": 112995688, "step": 167680 }, { "epoch": 4.096572447658368, "grad_norm": 0.02885306626558304, "learning_rate": 1.9240288950010008e-07, "loss": 0.0, "num_input_tokens_seen": 112998888, "step": 167685 }, { "epoch": 4.096694598490216, "grad_norm": 0.0009121938492171466, "learning_rate": 1.923526012850505e-07, "loss": 0.0, "num_input_tokens_seen": 113002600, "step": 167690 }, { "epoch": 4.096816749322063, "grad_norm": 0.0020662290044128895, "learning_rate": 1.9230231894343029e-07, "loss": 0.0, "num_input_tokens_seen": 113005672, "step": 167695 }, { "epoch": 4.09693890015391, "grad_norm": 0.002329219365492463, "learning_rate": 1.9225204247560467e-07, "loss": 0.0, "num_input_tokens_seen": 113009192, "step": 167700 }, { "epoch": 4.097061050985757, "grad_norm": 0.002412031404674053, "learning_rate": 1.9220177188193942e-07, "loss": 0.0, "num_input_tokens_seen": 113012264, "step": 167705 }, { "epoch": 4.097183201817605, "grad_norm": 0.0004434922302607447, "learning_rate": 1.9215150716280037e-07, "loss": 0.0, "num_input_tokens_seen": 113015336, "step": 167710 }, { "epoch": 4.097305352649451, "grad_norm": 3.532558184815571e-05, "learning_rate": 1.921012483185529e-07, "loss": 0.0, "num_input_tokens_seen": 113018984, "step": 167715 }, { "epoch": 4.097427503481299, "grad_norm": 0.0013077000621706247, "learning_rate": 1.9205099534956214e-07, "loss": 0.0, "num_input_tokens_seen": 113021928, "step": 167720 }, { "epoch": 4.097549654313146, "grad_norm": 0.00023376866010949016, "learning_rate": 1.9200074825619418e-07, "loss": 0.0002, "num_input_tokens_seen": 113025384, "step": 167725 }, { "epoch": 4.097671805144993, "grad_norm": 0.00019923056242987514, "learning_rate": 1.91950507038814e-07, "loss": 0.0, "num_input_tokens_seen": 113028904, "step": 167730 }, { "epoch": 4.09779395597684, "grad_norm": 0.0014188940403982997, "learning_rate": 1.9190027169778688e-07, "loss": 0.0, "num_input_tokens_seen": 113032168, "step": 167735 }, { "epoch": 4.097916106808688, "grad_norm": 0.00020696816500276327, "learning_rate": 1.9185004223347834e-07, "loss": 0.0, "num_input_tokens_seen": 113035240, "step": 167740 }, { "epoch": 4.0980382576405345, "grad_norm": 0.0024865081068128347, "learning_rate": 1.9179981864625394e-07, "loss": 0.0, "num_input_tokens_seen": 113038888, "step": 167745 }, { "epoch": 4.098160408472381, "grad_norm": 0.0010457972530275583, "learning_rate": 1.917496009364784e-07, "loss": 0.0, "num_input_tokens_seen": 113042408, "step": 167750 }, { "epoch": 4.098282559304229, "grad_norm": 0.0010362849570810795, "learning_rate": 1.9169938910451734e-07, "loss": 0.0, "num_input_tokens_seen": 113045672, "step": 167755 }, { "epoch": 4.098404710136076, "grad_norm": 0.00017162924632430077, "learning_rate": 1.9164918315073552e-07, "loss": 0.0, "num_input_tokens_seen": 113048872, "step": 167760 }, { "epoch": 4.098526860967923, "grad_norm": 0.00047827913658693433, "learning_rate": 1.915989830754985e-07, "loss": 0.0002, "num_input_tokens_seen": 113051944, "step": 167765 }, { "epoch": 4.09864901179977, "grad_norm": 3.9480975829064846e-05, "learning_rate": 1.915487888791708e-07, "loss": 0.0, "num_input_tokens_seen": 113054952, "step": 167770 }, { "epoch": 4.098771162631618, "grad_norm": 0.003575035370886326, "learning_rate": 1.9149860056211787e-07, "loss": 0.0, "num_input_tokens_seen": 113058024, "step": 167775 }, { "epoch": 4.098893313463464, "grad_norm": 0.0023885369300842285, "learning_rate": 1.9144841812470468e-07, "loss": 0.0, "num_input_tokens_seen": 113061288, "step": 167780 }, { "epoch": 4.099015464295312, "grad_norm": 0.039635781198740005, "learning_rate": 1.913982415672959e-07, "loss": 0.0, "num_input_tokens_seen": 113065448, "step": 167785 }, { "epoch": 4.099137615127159, "grad_norm": 0.0010953181190416217, "learning_rate": 1.9134807089025695e-07, "loss": 0.0, "num_input_tokens_seen": 113068712, "step": 167790 }, { "epoch": 4.099259765959006, "grad_norm": 0.014914610423147678, "learning_rate": 1.9129790609395192e-07, "loss": 0.0, "num_input_tokens_seen": 113071848, "step": 167795 }, { "epoch": 4.099381916790853, "grad_norm": 0.0008530065533705056, "learning_rate": 1.9124774717874603e-07, "loss": 0.0, "num_input_tokens_seen": 113074728, "step": 167800 }, { "epoch": 4.099504067622701, "grad_norm": 0.00020005644182674587, "learning_rate": 1.9119759414500447e-07, "loss": 0.0, "num_input_tokens_seen": 113078120, "step": 167805 }, { "epoch": 4.0996262184545476, "grad_norm": 0.00026042881654575467, "learning_rate": 1.9114744699309117e-07, "loss": 0.0, "num_input_tokens_seen": 113081832, "step": 167810 }, { "epoch": 4.099748369286395, "grad_norm": 0.00039418303640559316, "learning_rate": 1.9109730572337146e-07, "loss": 0.058, "num_input_tokens_seen": 113085032, "step": 167815 }, { "epoch": 4.099870520118242, "grad_norm": 0.0019248060416430235, "learning_rate": 1.9104717033620965e-07, "loss": 0.0, "num_input_tokens_seen": 113088296, "step": 167820 }, { "epoch": 4.09999267095009, "grad_norm": 0.009397185407578945, "learning_rate": 1.9099704083197023e-07, "loss": 0.0, "num_input_tokens_seen": 113091496, "step": 167825 }, { "epoch": 4.100114821781936, "grad_norm": 0.001163032022304833, "learning_rate": 1.9094691721101818e-07, "loss": 0.0, "num_input_tokens_seen": 113094824, "step": 167830 }, { "epoch": 4.100236972613784, "grad_norm": 0.0006426633917726576, "learning_rate": 1.908967994737175e-07, "loss": 0.0001, "num_input_tokens_seen": 113098408, "step": 167835 }, { "epoch": 4.100359123445631, "grad_norm": 0.0019152211025357246, "learning_rate": 1.908466876204331e-07, "loss": 0.0, "num_input_tokens_seen": 113101672, "step": 167840 }, { "epoch": 4.100481274277477, "grad_norm": 0.0011807752307504416, "learning_rate": 1.90796581651529e-07, "loss": 0.0, "num_input_tokens_seen": 113104616, "step": 167845 }, { "epoch": 4.100603425109325, "grad_norm": 0.0011360227363184094, "learning_rate": 1.9074648156737017e-07, "loss": 0.0, "num_input_tokens_seen": 113108136, "step": 167850 }, { "epoch": 4.100725575941172, "grad_norm": 5.969356789137237e-05, "learning_rate": 1.9069638736832016e-07, "loss": 0.0, "num_input_tokens_seen": 113112168, "step": 167855 }, { "epoch": 4.1008477267730195, "grad_norm": 0.02083604969084263, "learning_rate": 1.9064629905474384e-07, "loss": 0.0, "num_input_tokens_seen": 113115112, "step": 167860 }, { "epoch": 4.100969877604866, "grad_norm": 0.00034951046109199524, "learning_rate": 1.9059621662700554e-07, "loss": 0.0325, "num_input_tokens_seen": 113118248, "step": 167865 }, { "epoch": 4.101092028436714, "grad_norm": 0.0008968613692559302, "learning_rate": 1.9054614008546888e-07, "loss": 0.0, "num_input_tokens_seen": 113121896, "step": 167870 }, { "epoch": 4.101214179268561, "grad_norm": 0.0006697792559862137, "learning_rate": 1.9049606943049878e-07, "loss": 0.0, "num_input_tokens_seen": 113125224, "step": 167875 }, { "epoch": 4.101336330100408, "grad_norm": 0.0009942372562363744, "learning_rate": 1.9044600466245875e-07, "loss": 0.0, "num_input_tokens_seen": 113128424, "step": 167880 }, { "epoch": 4.101458480932255, "grad_norm": 6.113015842856839e-05, "learning_rate": 1.9039594578171336e-07, "loss": 0.0, "num_input_tokens_seen": 113131752, "step": 167885 }, { "epoch": 4.101580631764103, "grad_norm": 0.0002360446087550372, "learning_rate": 1.9034589278862612e-07, "loss": 0.0, "num_input_tokens_seen": 113134888, "step": 167890 }, { "epoch": 4.101702782595949, "grad_norm": 0.00013911745918449014, "learning_rate": 1.9029584568356138e-07, "loss": 0.0, "num_input_tokens_seen": 113138024, "step": 167895 }, { "epoch": 4.101824933427797, "grad_norm": 8.226054342230782e-05, "learning_rate": 1.902458044668832e-07, "loss": 0.0, "num_input_tokens_seen": 113141288, "step": 167900 }, { "epoch": 4.101947084259644, "grad_norm": 0.0007206815644167364, "learning_rate": 1.901957691389552e-07, "loss": 0.0, "num_input_tokens_seen": 113144616, "step": 167905 }, { "epoch": 4.102069235091491, "grad_norm": 0.00041117100045084953, "learning_rate": 1.9014573970014147e-07, "loss": 0.0, "num_input_tokens_seen": 113148520, "step": 167910 }, { "epoch": 4.102191385923338, "grad_norm": 3.6475325032370165e-05, "learning_rate": 1.9009571615080555e-07, "loss": 0.0, "num_input_tokens_seen": 113151912, "step": 167915 }, { "epoch": 4.102313536755186, "grad_norm": 0.00016955210594460368, "learning_rate": 1.900456984913117e-07, "loss": 0.0, "num_input_tokens_seen": 113155240, "step": 167920 }, { "epoch": 4.1024356875870325, "grad_norm": 0.0006664685788564384, "learning_rate": 1.8999568672202338e-07, "loss": 0.0, "num_input_tokens_seen": 113158696, "step": 167925 }, { "epoch": 4.102557838418879, "grad_norm": 0.000883599161170423, "learning_rate": 1.899456808433041e-07, "loss": 0.0, "num_input_tokens_seen": 113161832, "step": 167930 }, { "epoch": 4.102679989250727, "grad_norm": 0.0010841061593964696, "learning_rate": 1.898956808555179e-07, "loss": 0.0, "num_input_tokens_seen": 113165096, "step": 167935 }, { "epoch": 4.102802140082574, "grad_norm": 0.0007180902175605297, "learning_rate": 1.898456867590279e-07, "loss": 0.0, "num_input_tokens_seen": 113168360, "step": 167940 }, { "epoch": 4.102924290914421, "grad_norm": 0.0037924540229141712, "learning_rate": 1.897956985541983e-07, "loss": 0.0, "num_input_tokens_seen": 113171880, "step": 167945 }, { "epoch": 4.103046441746268, "grad_norm": 0.0029051261954009533, "learning_rate": 1.8974571624139201e-07, "loss": 0.0, "num_input_tokens_seen": 113174952, "step": 167950 }, { "epoch": 4.103168592578116, "grad_norm": 0.002228027442470193, "learning_rate": 1.8969573982097288e-07, "loss": 0.0, "num_input_tokens_seen": 113178152, "step": 167955 }, { "epoch": 4.103290743409962, "grad_norm": 0.0008051015320234001, "learning_rate": 1.8964576929330444e-07, "loss": 0.0, "num_input_tokens_seen": 113181480, "step": 167960 }, { "epoch": 4.10341289424181, "grad_norm": 0.00023929915914777666, "learning_rate": 1.895958046587497e-07, "loss": 0.0, "num_input_tokens_seen": 113185576, "step": 167965 }, { "epoch": 4.103535045073657, "grad_norm": 0.00011397549678804353, "learning_rate": 1.8954584591767241e-07, "loss": 0.0, "num_input_tokens_seen": 113189096, "step": 167970 }, { "epoch": 4.103657195905504, "grad_norm": 8.767266263021156e-05, "learning_rate": 1.8949589307043555e-07, "loss": 0.0, "num_input_tokens_seen": 113192360, "step": 167975 }, { "epoch": 4.103779346737351, "grad_norm": 0.0018837309908121824, "learning_rate": 1.8944594611740282e-07, "loss": 0.0, "num_input_tokens_seen": 113195368, "step": 167980 }, { "epoch": 4.103901497569199, "grad_norm": 0.005669779144227505, "learning_rate": 1.8939600505893693e-07, "loss": 0.0, "num_input_tokens_seen": 113199080, "step": 167985 }, { "epoch": 4.1040236484010455, "grad_norm": 0.0024404674768447876, "learning_rate": 1.8934606989540125e-07, "loss": 0.0, "num_input_tokens_seen": 113202920, "step": 167990 }, { "epoch": 4.104145799232893, "grad_norm": 0.0001181487605208531, "learning_rate": 1.8929614062715927e-07, "loss": 0.0001, "num_input_tokens_seen": 113206248, "step": 167995 }, { "epoch": 4.10426795006474, "grad_norm": 0.0009015746763907373, "learning_rate": 1.8924621725457357e-07, "loss": 0.0, "num_input_tokens_seen": 113210152, "step": 168000 }, { "epoch": 4.1043901008965875, "grad_norm": 0.0009523396729491651, "learning_rate": 1.8919629977800767e-07, "loss": 0.0, "num_input_tokens_seen": 113213416, "step": 168005 }, { "epoch": 4.104512251728434, "grad_norm": 0.001039717230014503, "learning_rate": 1.8914638819782414e-07, "loss": 0.0, "num_input_tokens_seen": 113216936, "step": 168010 }, { "epoch": 4.104634402560281, "grad_norm": 9.482148925599176e-06, "learning_rate": 1.8909648251438648e-07, "loss": 0.0, "num_input_tokens_seen": 113221288, "step": 168015 }, { "epoch": 4.104756553392129, "grad_norm": 2.5004270355566405e-05, "learning_rate": 1.8904658272805696e-07, "loss": 0.0, "num_input_tokens_seen": 113225128, "step": 168020 }, { "epoch": 4.104878704223975, "grad_norm": 0.006195750553160906, "learning_rate": 1.8899668883919907e-07, "loss": 0.0, "num_input_tokens_seen": 113228328, "step": 168025 }, { "epoch": 4.105000855055823, "grad_norm": 0.0005033480701968074, "learning_rate": 1.8894680084817516e-07, "loss": 0.0, "num_input_tokens_seen": 113231528, "step": 168030 }, { "epoch": 4.10512300588767, "grad_norm": 0.001200584345497191, "learning_rate": 1.8889691875534853e-07, "loss": 0.0, "num_input_tokens_seen": 113235048, "step": 168035 }, { "epoch": 4.105245156719517, "grad_norm": 0.0023539774119853973, "learning_rate": 1.8884704256108163e-07, "loss": 0.0, "num_input_tokens_seen": 113238504, "step": 168040 }, { "epoch": 4.105367307551364, "grad_norm": 8.967569738160819e-05, "learning_rate": 1.8879717226573698e-07, "loss": 0.0, "num_input_tokens_seen": 113241960, "step": 168045 }, { "epoch": 4.105489458383212, "grad_norm": 0.0012145297368988395, "learning_rate": 1.8874730786967752e-07, "loss": 0.0, "num_input_tokens_seen": 113245288, "step": 168050 }, { "epoch": 4.1056116092150585, "grad_norm": 0.0001042317962856032, "learning_rate": 1.8869744937326603e-07, "loss": 0.0001, "num_input_tokens_seen": 113248616, "step": 168055 }, { "epoch": 4.105733760046906, "grad_norm": 0.001420485437847674, "learning_rate": 1.886475967768647e-07, "loss": 0.0, "num_input_tokens_seen": 113253160, "step": 168060 }, { "epoch": 4.105855910878753, "grad_norm": 0.00025929929688572884, "learning_rate": 1.8859775008083646e-07, "loss": 0.0, "num_input_tokens_seen": 113256296, "step": 168065 }, { "epoch": 4.1059780617106005, "grad_norm": 0.0011569394264370203, "learning_rate": 1.8854790928554343e-07, "loss": 0.0, "num_input_tokens_seen": 113259624, "step": 168070 }, { "epoch": 4.106100212542447, "grad_norm": 0.00021515910339076072, "learning_rate": 1.8849807439134847e-07, "loss": 0.0, "num_input_tokens_seen": 113262952, "step": 168075 }, { "epoch": 4.106222363374295, "grad_norm": 0.0018044417956843972, "learning_rate": 1.8844824539861348e-07, "loss": 0.0, "num_input_tokens_seen": 113266856, "step": 168080 }, { "epoch": 4.106344514206142, "grad_norm": 0.00029858926427550614, "learning_rate": 1.883984223077012e-07, "loss": 0.0, "num_input_tokens_seen": 113270376, "step": 168085 }, { "epoch": 4.106466665037989, "grad_norm": 0.00017355407180730253, "learning_rate": 1.883486051189742e-07, "loss": 0.0466, "num_input_tokens_seen": 113273512, "step": 168090 }, { "epoch": 4.106588815869836, "grad_norm": 0.00020966355805285275, "learning_rate": 1.882987938327941e-07, "loss": 0.0, "num_input_tokens_seen": 113277096, "step": 168095 }, { "epoch": 4.106710966701684, "grad_norm": 3.090937389060855e-05, "learning_rate": 1.8824898844952374e-07, "loss": 0.0, "num_input_tokens_seen": 113280296, "step": 168100 }, { "epoch": 4.10683311753353, "grad_norm": 0.0008340792264789343, "learning_rate": 1.8819918896952492e-07, "loss": 0.0, "num_input_tokens_seen": 113284072, "step": 168105 }, { "epoch": 4.106955268365377, "grad_norm": 5.1596016419352964e-05, "learning_rate": 1.8814939539315987e-07, "loss": 0.0, "num_input_tokens_seen": 113287400, "step": 168110 }, { "epoch": 4.107077419197225, "grad_norm": 0.00011981278657913208, "learning_rate": 1.8809960772079103e-07, "loss": 0.0, "num_input_tokens_seen": 113290600, "step": 168115 }, { "epoch": 4.1071995700290715, "grad_norm": 0.006958621088415384, "learning_rate": 1.8804982595277995e-07, "loss": 0.0, "num_input_tokens_seen": 113293672, "step": 168120 }, { "epoch": 4.107321720860919, "grad_norm": 0.00043330652988515794, "learning_rate": 1.8800005008948928e-07, "loss": 0.0, "num_input_tokens_seen": 113296488, "step": 168125 }, { "epoch": 4.107443871692766, "grad_norm": 0.00015095339040271938, "learning_rate": 1.8795028013128055e-07, "loss": 0.0, "num_input_tokens_seen": 113299816, "step": 168130 }, { "epoch": 4.107566022524614, "grad_norm": 0.00016991773736663163, "learning_rate": 1.8790051607851553e-07, "loss": 0.0, "num_input_tokens_seen": 113303208, "step": 168135 }, { "epoch": 4.10768817335646, "grad_norm": 9.44330167840235e-06, "learning_rate": 1.878507579315567e-07, "loss": 0.0, "num_input_tokens_seen": 113306600, "step": 168140 }, { "epoch": 4.107810324188308, "grad_norm": 0.0006781097617931664, "learning_rate": 1.878010056907653e-07, "loss": 0.0, "num_input_tokens_seen": 113310120, "step": 168145 }, { "epoch": 4.107932475020155, "grad_norm": 0.00011159246059833094, "learning_rate": 1.877512593565037e-07, "loss": 0.0, "num_input_tokens_seen": 113313768, "step": 168150 }, { "epoch": 4.108054625852002, "grad_norm": 1.1732379789464176e-05, "learning_rate": 1.8770151892913322e-07, "loss": 0.0, "num_input_tokens_seen": 113317352, "step": 168155 }, { "epoch": 4.108176776683849, "grad_norm": 0.4344090223312378, "learning_rate": 1.8765178440901596e-07, "loss": 0.0003, "num_input_tokens_seen": 113320808, "step": 168160 }, { "epoch": 4.108298927515697, "grad_norm": 0.00014278020535130054, "learning_rate": 1.8760205579651323e-07, "loss": 0.0, "num_input_tokens_seen": 113324328, "step": 168165 }, { "epoch": 4.108421078347543, "grad_norm": 0.00021015606762375683, "learning_rate": 1.8755233309198704e-07, "loss": 0.0, "num_input_tokens_seen": 113327720, "step": 168170 }, { "epoch": 4.108543229179391, "grad_norm": 0.00013545910769607872, "learning_rate": 1.8750261629579867e-07, "loss": 0.0, "num_input_tokens_seen": 113330792, "step": 168175 }, { "epoch": 4.108665380011238, "grad_norm": 0.0018144716741517186, "learning_rate": 1.8745290540830972e-07, "loss": 0.0, "num_input_tokens_seen": 113333992, "step": 168180 }, { "epoch": 4.1087875308430855, "grad_norm": 0.00032248892239294946, "learning_rate": 1.87403200429882e-07, "loss": 0.0, "num_input_tokens_seen": 113337320, "step": 168185 }, { "epoch": 4.108909681674932, "grad_norm": 4.1384933865629137e-05, "learning_rate": 1.8735350136087658e-07, "loss": 0.0, "num_input_tokens_seen": 113340776, "step": 168190 }, { "epoch": 4.109031832506779, "grad_norm": 0.0021187819074839354, "learning_rate": 1.8730380820165537e-07, "loss": 0.0, "num_input_tokens_seen": 113344168, "step": 168195 }, { "epoch": 4.109153983338627, "grad_norm": 0.00021328141156118363, "learning_rate": 1.8725412095257908e-07, "loss": 0.0, "num_input_tokens_seen": 113347432, "step": 168200 }, { "epoch": 4.109276134170473, "grad_norm": 7.013930735411122e-05, "learning_rate": 1.8720443961400944e-07, "loss": 0.0, "num_input_tokens_seen": 113350952, "step": 168205 }, { "epoch": 4.109398285002321, "grad_norm": 0.0004714243405032903, "learning_rate": 1.8715476418630805e-07, "loss": 0.0, "num_input_tokens_seen": 113354344, "step": 168210 }, { "epoch": 4.109520435834168, "grad_norm": 1.5150175386224873e-05, "learning_rate": 1.871050946698356e-07, "loss": 0.0, "num_input_tokens_seen": 113357800, "step": 168215 }, { "epoch": 4.109642586666015, "grad_norm": 1.0064273737953044e-05, "learning_rate": 1.870554310649538e-07, "loss": 0.0, "num_input_tokens_seen": 113361064, "step": 168220 }, { "epoch": 4.109764737497862, "grad_norm": 0.002622292609885335, "learning_rate": 1.8700577337202327e-07, "loss": 0.0, "num_input_tokens_seen": 113364200, "step": 168225 }, { "epoch": 4.10988688832971, "grad_norm": 0.007907216437160969, "learning_rate": 1.8695612159140572e-07, "loss": 0.0, "num_input_tokens_seen": 113367720, "step": 168230 }, { "epoch": 4.1100090391615565, "grad_norm": 0.01983102411031723, "learning_rate": 1.8690647572346185e-07, "loss": 0.0, "num_input_tokens_seen": 113370984, "step": 168235 }, { "epoch": 4.110131189993404, "grad_norm": 0.00025274226209148765, "learning_rate": 1.868568357685526e-07, "loss": 0.0, "num_input_tokens_seen": 113373864, "step": 168240 }, { "epoch": 4.110253340825251, "grad_norm": 0.0006391624920070171, "learning_rate": 1.8680720172703946e-07, "loss": 0.0, "num_input_tokens_seen": 113376872, "step": 168245 }, { "epoch": 4.1103754916570985, "grad_norm": 7.626259321114048e-05, "learning_rate": 1.867575735992827e-07, "loss": 0.0, "num_input_tokens_seen": 113380840, "step": 168250 }, { "epoch": 4.110497642488945, "grad_norm": 0.0005927463644184172, "learning_rate": 1.8670795138564387e-07, "loss": 0.0, "num_input_tokens_seen": 113384040, "step": 168255 }, { "epoch": 4.110619793320793, "grad_norm": 1.2069443073414732e-05, "learning_rate": 1.8665833508648344e-07, "loss": 0.0, "num_input_tokens_seen": 113387752, "step": 168260 }, { "epoch": 4.11074194415264, "grad_norm": 0.00023556312953587621, "learning_rate": 1.8660872470216215e-07, "loss": 0.0, "num_input_tokens_seen": 113391208, "step": 168265 }, { "epoch": 4.110864094984487, "grad_norm": 0.0015209665289148688, "learning_rate": 1.8655912023304143e-07, "loss": 0.0, "num_input_tokens_seen": 113395560, "step": 168270 }, { "epoch": 4.110986245816334, "grad_norm": 0.00024280369689222425, "learning_rate": 1.865095216794812e-07, "loss": 0.0001, "num_input_tokens_seen": 113398888, "step": 168275 }, { "epoch": 4.111108396648181, "grad_norm": 5.196717393118888e-05, "learning_rate": 1.8645992904184282e-07, "loss": 0.0, "num_input_tokens_seen": 113401960, "step": 168280 }, { "epoch": 4.111230547480028, "grad_norm": 0.00012093011901015416, "learning_rate": 1.8641034232048647e-07, "loss": 0.0, "num_input_tokens_seen": 113405736, "step": 168285 }, { "epoch": 4.111352698311875, "grad_norm": 0.00019129080465063453, "learning_rate": 1.8636076151577317e-07, "loss": 0.0, "num_input_tokens_seen": 113409384, "step": 168290 }, { "epoch": 4.111474849143723, "grad_norm": 5.972016879240982e-05, "learning_rate": 1.8631118662806288e-07, "loss": 0.0, "num_input_tokens_seen": 113413224, "step": 168295 }, { "epoch": 4.1115969999755695, "grad_norm": 6.202772783581167e-05, "learning_rate": 1.8626161765771665e-07, "loss": 0.0, "num_input_tokens_seen": 113416488, "step": 168300 }, { "epoch": 4.111719150807417, "grad_norm": 0.00035181641578674316, "learning_rate": 1.8621205460509504e-07, "loss": 0.0, "num_input_tokens_seen": 113419688, "step": 168305 }, { "epoch": 4.111841301639264, "grad_norm": 0.0006404395098797977, "learning_rate": 1.861624974705579e-07, "loss": 0.0, "num_input_tokens_seen": 113423528, "step": 168310 }, { "epoch": 4.1119634524711115, "grad_norm": 0.00018085418560076505, "learning_rate": 1.8611294625446628e-07, "loss": 0.0, "num_input_tokens_seen": 113426792, "step": 168315 }, { "epoch": 4.112085603302958, "grad_norm": 0.00031348931952379644, "learning_rate": 1.8606340095717999e-07, "loss": 0.0, "num_input_tokens_seen": 113430312, "step": 168320 }, { "epoch": 4.112207754134806, "grad_norm": 0.002586101181805134, "learning_rate": 1.8601386157905974e-07, "loss": 0.0, "num_input_tokens_seen": 113434024, "step": 168325 }, { "epoch": 4.112329904966653, "grad_norm": 0.000644766609184444, "learning_rate": 1.8596432812046548e-07, "loss": 0.0, "num_input_tokens_seen": 113437736, "step": 168330 }, { "epoch": 4.1124520557985, "grad_norm": 0.07879499346017838, "learning_rate": 1.859148005817578e-07, "loss": 0.0, "num_input_tokens_seen": 113441000, "step": 168335 }, { "epoch": 4.112574206630347, "grad_norm": 0.003919265698641539, "learning_rate": 1.858652789632964e-07, "loss": 0.0, "num_input_tokens_seen": 113443816, "step": 168340 }, { "epoch": 4.112696357462195, "grad_norm": 0.004266014322638512, "learning_rate": 1.858157632654419e-07, "loss": 0.0, "num_input_tokens_seen": 113446952, "step": 168345 }, { "epoch": 4.112818508294041, "grad_norm": 0.0012039493303745985, "learning_rate": 1.8576625348855411e-07, "loss": 0.0, "num_input_tokens_seen": 113450088, "step": 168350 }, { "epoch": 4.112940659125889, "grad_norm": 0.001107099698856473, "learning_rate": 1.85716749632993e-07, "loss": 0.0, "num_input_tokens_seen": 113453416, "step": 168355 }, { "epoch": 4.113062809957736, "grad_norm": 0.00014701222244184464, "learning_rate": 1.8566725169911858e-07, "loss": 0.0, "num_input_tokens_seen": 113456552, "step": 168360 }, { "epoch": 4.113184960789583, "grad_norm": 0.009403909556567669, "learning_rate": 1.856177596872913e-07, "loss": 0.0, "num_input_tokens_seen": 113459880, "step": 168365 }, { "epoch": 4.11330711162143, "grad_norm": 0.002776139182969928, "learning_rate": 1.8556827359787042e-07, "loss": 0.0, "num_input_tokens_seen": 113463208, "step": 168370 }, { "epoch": 4.113429262453277, "grad_norm": 0.0017793282167986035, "learning_rate": 1.8551879343121635e-07, "loss": 0.0, "num_input_tokens_seen": 113466344, "step": 168375 }, { "epoch": 4.1135514132851245, "grad_norm": 7.744233153061941e-05, "learning_rate": 1.854693191876884e-07, "loss": 0.0, "num_input_tokens_seen": 113469928, "step": 168380 }, { "epoch": 4.113673564116971, "grad_norm": 0.0009350012987852097, "learning_rate": 1.8541985086764688e-07, "loss": 0.0, "num_input_tokens_seen": 113473384, "step": 168385 }, { "epoch": 4.113795714948819, "grad_norm": 8.169966167770326e-05, "learning_rate": 1.8537038847145116e-07, "loss": 0.0, "num_input_tokens_seen": 113476840, "step": 168390 }, { "epoch": 4.113917865780666, "grad_norm": 0.014098173938691616, "learning_rate": 1.8532093199946098e-07, "loss": 0.0, "num_input_tokens_seen": 113480232, "step": 168395 }, { "epoch": 4.114040016612513, "grad_norm": 0.0029436154291033745, "learning_rate": 1.852714814520364e-07, "loss": 0.0, "num_input_tokens_seen": 113483304, "step": 168400 }, { "epoch": 4.11416216744436, "grad_norm": 0.08929237723350525, "learning_rate": 1.852220368295364e-07, "loss": 0.0, "num_input_tokens_seen": 113486440, "step": 168405 }, { "epoch": 4.114284318276208, "grad_norm": 0.0003501186438370496, "learning_rate": 1.8517259813232122e-07, "loss": 0.0, "num_input_tokens_seen": 113489960, "step": 168410 }, { "epoch": 4.114406469108054, "grad_norm": 0.022616511210799217, "learning_rate": 1.851231653607499e-07, "loss": 0.0, "num_input_tokens_seen": 113493416, "step": 168415 }, { "epoch": 4.114528619939902, "grad_norm": 0.00025067225215025246, "learning_rate": 1.8507373851518204e-07, "loss": 0.0, "num_input_tokens_seen": 113496488, "step": 168420 }, { "epoch": 4.114650770771749, "grad_norm": 0.437266081571579, "learning_rate": 1.8502431759597737e-07, "loss": 0.0002, "num_input_tokens_seen": 113499560, "step": 168425 }, { "epoch": 4.114772921603596, "grad_norm": 0.002507508499547839, "learning_rate": 1.849749026034948e-07, "loss": 0.0, "num_input_tokens_seen": 113502696, "step": 168430 }, { "epoch": 4.114895072435443, "grad_norm": 0.00028929513064213097, "learning_rate": 1.8492549353809416e-07, "loss": 0.0, "num_input_tokens_seen": 113506216, "step": 168435 }, { "epoch": 4.115017223267291, "grad_norm": 0.0007330722874030471, "learning_rate": 1.8487609040013463e-07, "loss": 0.0, "num_input_tokens_seen": 113509736, "step": 168440 }, { "epoch": 4.1151393740991375, "grad_norm": 0.000801653484813869, "learning_rate": 1.8482669318997524e-07, "loss": 0.0, "num_input_tokens_seen": 113513512, "step": 168445 }, { "epoch": 4.115261524930985, "grad_norm": 0.0018105350900441408, "learning_rate": 1.8477730190797548e-07, "loss": 0.0, "num_input_tokens_seen": 113516648, "step": 168450 }, { "epoch": 4.115383675762832, "grad_norm": 0.00022335609537549317, "learning_rate": 1.8472791655449426e-07, "loss": 0.0, "num_input_tokens_seen": 113522280, "step": 168455 }, { "epoch": 4.115505826594679, "grad_norm": 2.3306838556891307e-05, "learning_rate": 1.8467853712989123e-07, "loss": 0.0, "num_input_tokens_seen": 113525160, "step": 168460 }, { "epoch": 4.115627977426526, "grad_norm": 0.0005397560307756066, "learning_rate": 1.8462916363452486e-07, "loss": 0.0, "num_input_tokens_seen": 113528936, "step": 168465 }, { "epoch": 4.115750128258373, "grad_norm": 1.6428799426648766e-05, "learning_rate": 1.8457979606875483e-07, "loss": 0.0001, "num_input_tokens_seen": 113531880, "step": 168470 }, { "epoch": 4.115872279090221, "grad_norm": 0.0009806773159652948, "learning_rate": 1.8453043443293946e-07, "loss": 0.0, "num_input_tokens_seen": 113535272, "step": 168475 }, { "epoch": 4.115994429922067, "grad_norm": 8.44751630211249e-05, "learning_rate": 1.8448107872743855e-07, "loss": 0.0, "num_input_tokens_seen": 113538408, "step": 168480 }, { "epoch": 4.116116580753915, "grad_norm": 0.00013254370423965156, "learning_rate": 1.8443172895261016e-07, "loss": 0.0, "num_input_tokens_seen": 113541928, "step": 168485 }, { "epoch": 4.116238731585762, "grad_norm": 0.16897857189178467, "learning_rate": 1.8438238510881365e-07, "loss": 0.0001, "num_input_tokens_seen": 113545000, "step": 168490 }, { "epoch": 4.1163608824176094, "grad_norm": 0.0007797760772518814, "learning_rate": 1.8433304719640807e-07, "loss": 0.0, "num_input_tokens_seen": 113548264, "step": 168495 }, { "epoch": 4.116483033249456, "grad_norm": 0.0005675862776115537, "learning_rate": 1.8428371521575182e-07, "loss": 0.0, "num_input_tokens_seen": 113552104, "step": 168500 }, { "epoch": 4.116605184081304, "grad_norm": 0.002730336971580982, "learning_rate": 1.842343891672039e-07, "loss": 0.0, "num_input_tokens_seen": 113555176, "step": 168505 }, { "epoch": 4.116727334913151, "grad_norm": 2.090478847094346e-05, "learning_rate": 1.8418506905112274e-07, "loss": 0.0, "num_input_tokens_seen": 113558824, "step": 168510 }, { "epoch": 4.116849485744998, "grad_norm": 0.00023547621094621718, "learning_rate": 1.8413575486786713e-07, "loss": 0.0, "num_input_tokens_seen": 113562024, "step": 168515 }, { "epoch": 4.116971636576845, "grad_norm": 0.0004209127801004797, "learning_rate": 1.8408644661779605e-07, "loss": 0.0, "num_input_tokens_seen": 113565544, "step": 168520 }, { "epoch": 4.117093787408693, "grad_norm": 0.0011463210685178638, "learning_rate": 1.8403714430126748e-07, "loss": 0.0001, "num_input_tokens_seen": 113568936, "step": 168525 }, { "epoch": 4.117215938240539, "grad_norm": 0.0006217953632585704, "learning_rate": 1.8398784791864052e-07, "loss": 0.0, "num_input_tokens_seen": 113572584, "step": 168530 }, { "epoch": 4.117338089072387, "grad_norm": 0.011418608017265797, "learning_rate": 1.839385574702732e-07, "loss": 0.0, "num_input_tokens_seen": 113575720, "step": 168535 }, { "epoch": 4.117460239904234, "grad_norm": 0.00016718072583898902, "learning_rate": 1.8388927295652444e-07, "loss": 0.0, "num_input_tokens_seen": 113579368, "step": 168540 }, { "epoch": 4.117582390736081, "grad_norm": 5.693855928257108e-05, "learning_rate": 1.838399943777521e-07, "loss": 0.0, "num_input_tokens_seen": 113582824, "step": 168545 }, { "epoch": 4.117704541567928, "grad_norm": 0.00029498033109121025, "learning_rate": 1.837907217343151e-07, "loss": 0.0, "num_input_tokens_seen": 113586344, "step": 168550 }, { "epoch": 4.117826692399775, "grad_norm": 0.000566729751881212, "learning_rate": 1.8374145502657157e-07, "loss": 0.0, "num_input_tokens_seen": 113589608, "step": 168555 }, { "epoch": 4.1179488432316225, "grad_norm": 0.0009026674670167267, "learning_rate": 1.8369219425487935e-07, "loss": 0.0, "num_input_tokens_seen": 113593576, "step": 168560 }, { "epoch": 4.118070994063469, "grad_norm": 0.0011875737691298127, "learning_rate": 1.836429394195974e-07, "loss": 0.0, "num_input_tokens_seen": 113597160, "step": 168565 }, { "epoch": 4.118193144895317, "grad_norm": 0.00010860639304155484, "learning_rate": 1.8359369052108332e-07, "loss": 0.0, "num_input_tokens_seen": 113601128, "step": 168570 }, { "epoch": 4.118315295727164, "grad_norm": 0.0002416908391751349, "learning_rate": 1.8354444755969579e-07, "loss": 0.0, "num_input_tokens_seen": 113604456, "step": 168575 }, { "epoch": 4.118437446559011, "grad_norm": 8.761444405536167e-06, "learning_rate": 1.8349521053579232e-07, "loss": 0.0, "num_input_tokens_seen": 113607976, "step": 168580 }, { "epoch": 4.118559597390858, "grad_norm": 0.0002650410169735551, "learning_rate": 1.8344597944973129e-07, "loss": 0.0, "num_input_tokens_seen": 113612008, "step": 168585 }, { "epoch": 4.118681748222706, "grad_norm": 0.0030864370055496693, "learning_rate": 1.8339675430187097e-07, "loss": 0.0, "num_input_tokens_seen": 113615272, "step": 168590 }, { "epoch": 4.118803899054552, "grad_norm": 5.8644753153203055e-05, "learning_rate": 1.8334753509256883e-07, "loss": 0.0, "num_input_tokens_seen": 113618408, "step": 168595 }, { "epoch": 4.1189260498864, "grad_norm": 0.0005382047384046018, "learning_rate": 1.8329832182218341e-07, "loss": 0.0, "num_input_tokens_seen": 113621800, "step": 168600 }, { "epoch": 4.119048200718247, "grad_norm": 0.00048400447121821344, "learning_rate": 1.8324911449107195e-07, "loss": 0.0, "num_input_tokens_seen": 113625448, "step": 168605 }, { "epoch": 4.119170351550094, "grad_norm": 0.0004027804243378341, "learning_rate": 1.831999130995926e-07, "loss": 0.0, "num_input_tokens_seen": 113628840, "step": 168610 }, { "epoch": 4.119292502381941, "grad_norm": 0.0004047253751195967, "learning_rate": 1.8315071764810342e-07, "loss": 0.0, "num_input_tokens_seen": 113631720, "step": 168615 }, { "epoch": 4.119414653213789, "grad_norm": 3.090100653935224e-05, "learning_rate": 1.8310152813696166e-07, "loss": 0.0, "num_input_tokens_seen": 113634664, "step": 168620 }, { "epoch": 4.1195368040456355, "grad_norm": 12.117880821228027, "learning_rate": 1.8305234456652562e-07, "loss": 0.0003, "num_input_tokens_seen": 113638824, "step": 168625 }, { "epoch": 4.119658954877483, "grad_norm": 0.002483411692082882, "learning_rate": 1.8300316693715234e-07, "loss": 0.0, "num_input_tokens_seen": 113642216, "step": 168630 }, { "epoch": 4.11978110570933, "grad_norm": 0.0002661602047737688, "learning_rate": 1.8295399524920008e-07, "loss": 0.0, "num_input_tokens_seen": 113645480, "step": 168635 }, { "epoch": 4.119903256541177, "grad_norm": 0.0006768704042769969, "learning_rate": 1.829048295030259e-07, "loss": 0.0, "num_input_tokens_seen": 113649000, "step": 168640 }, { "epoch": 4.120025407373024, "grad_norm": 5.930757833993994e-05, "learning_rate": 1.828556696989878e-07, "loss": 0.0, "num_input_tokens_seen": 113652200, "step": 168645 }, { "epoch": 4.120147558204871, "grad_norm": 0.007106540724635124, "learning_rate": 1.8280651583744278e-07, "loss": 0.0477, "num_input_tokens_seen": 113655336, "step": 168650 }, { "epoch": 4.120269709036719, "grad_norm": 0.00035729375667870045, "learning_rate": 1.8275736791874885e-07, "loss": 0.0, "num_input_tokens_seen": 113658728, "step": 168655 }, { "epoch": 4.120391859868565, "grad_norm": 0.00504821864888072, "learning_rate": 1.8270822594326308e-07, "loss": 0.0, "num_input_tokens_seen": 113661928, "step": 168660 }, { "epoch": 4.120514010700413, "grad_norm": 0.008736646734178066, "learning_rate": 1.826590899113427e-07, "loss": 0.0, "num_input_tokens_seen": 113665128, "step": 168665 }, { "epoch": 4.12063616153226, "grad_norm": 9.719293302623555e-05, "learning_rate": 1.8260995982334538e-07, "loss": 0.0, "num_input_tokens_seen": 113668456, "step": 168670 }, { "epoch": 4.120758312364107, "grad_norm": 0.000476063578389585, "learning_rate": 1.8256083567962843e-07, "loss": 0.0, "num_input_tokens_seen": 113671272, "step": 168675 }, { "epoch": 4.120880463195954, "grad_norm": 0.00019063352374359965, "learning_rate": 1.825117174805486e-07, "loss": 0.0, "num_input_tokens_seen": 113674920, "step": 168680 }, { "epoch": 4.121002614027802, "grad_norm": 0.00020761760242749006, "learning_rate": 1.8246260522646385e-07, "loss": 0.0, "num_input_tokens_seen": 113677928, "step": 168685 }, { "epoch": 4.1211247648596485, "grad_norm": 0.007399700582027435, "learning_rate": 1.8241349891773062e-07, "loss": 0.0, "num_input_tokens_seen": 113681064, "step": 168690 }, { "epoch": 4.121246915691496, "grad_norm": 0.0027653025463223457, "learning_rate": 1.8236439855470654e-07, "loss": 0.0, "num_input_tokens_seen": 113684264, "step": 168695 }, { "epoch": 4.121369066523343, "grad_norm": 0.00011870275920955464, "learning_rate": 1.8231530413774833e-07, "loss": 0.0, "num_input_tokens_seen": 113687400, "step": 168700 }, { "epoch": 4.1214912173551905, "grad_norm": 4.691482172347605e-05, "learning_rate": 1.82266215667213e-07, "loss": 0.0, "num_input_tokens_seen": 113690920, "step": 168705 }, { "epoch": 4.121613368187037, "grad_norm": 0.0061193970032036304, "learning_rate": 1.822171331434581e-07, "loss": 0.0001, "num_input_tokens_seen": 113694376, "step": 168710 }, { "epoch": 4.121735519018885, "grad_norm": 6.885571201564744e-05, "learning_rate": 1.8216805656683986e-07, "loss": 0.0, "num_input_tokens_seen": 113697576, "step": 168715 }, { "epoch": 4.121857669850732, "grad_norm": 0.0019095209427177906, "learning_rate": 1.8211898593771568e-07, "loss": 0.0, "num_input_tokens_seen": 113701352, "step": 168720 }, { "epoch": 4.121979820682579, "grad_norm": 0.00321991671808064, "learning_rate": 1.8206992125644205e-07, "loss": 0.0001, "num_input_tokens_seen": 113704616, "step": 168725 }, { "epoch": 4.122101971514426, "grad_norm": 0.004618339240550995, "learning_rate": 1.8202086252337611e-07, "loss": 0.0, "num_input_tokens_seen": 113707752, "step": 168730 }, { "epoch": 4.122224122346273, "grad_norm": 0.0008676843135617673, "learning_rate": 1.8197180973887428e-07, "loss": 0.0, "num_input_tokens_seen": 113711080, "step": 168735 }, { "epoch": 4.12234627317812, "grad_norm": 0.0010523677337914705, "learning_rate": 1.8192276290329345e-07, "loss": 0.0, "num_input_tokens_seen": 113714344, "step": 168740 }, { "epoch": 4.122468424009967, "grad_norm": 0.002487873425707221, "learning_rate": 1.8187372201699058e-07, "loss": 0.0, "num_input_tokens_seen": 113717544, "step": 168745 }, { "epoch": 4.122590574841815, "grad_norm": 0.0005908418097533286, "learning_rate": 1.8182468708032205e-07, "loss": 0.0, "num_input_tokens_seen": 113721000, "step": 168750 }, { "epoch": 4.1227127256736615, "grad_norm": 0.000158902897965163, "learning_rate": 1.8177565809364426e-07, "loss": 0.0, "num_input_tokens_seen": 113724712, "step": 168755 }, { "epoch": 4.122834876505509, "grad_norm": 5.1024628191953525e-05, "learning_rate": 1.817266350573141e-07, "loss": 0.0, "num_input_tokens_seen": 113728552, "step": 168760 }, { "epoch": 4.122957027337356, "grad_norm": 0.001737323822453618, "learning_rate": 1.816776179716878e-07, "loss": 0.0, "num_input_tokens_seen": 113732136, "step": 168765 }, { "epoch": 4.1230791781692036, "grad_norm": 7.92595892562531e-05, "learning_rate": 1.8162860683712212e-07, "loss": 0.0, "num_input_tokens_seen": 113735272, "step": 168770 }, { "epoch": 4.12320132900105, "grad_norm": 0.0005044394638389349, "learning_rate": 1.8157960165397312e-07, "loss": 0.0, "num_input_tokens_seen": 113738728, "step": 168775 }, { "epoch": 4.123323479832898, "grad_norm": 0.0007095523760654032, "learning_rate": 1.8153060242259765e-07, "loss": 0.0021, "num_input_tokens_seen": 113742056, "step": 168780 }, { "epoch": 4.123445630664745, "grad_norm": 0.00040243950206786394, "learning_rate": 1.8148160914335153e-07, "loss": 0.0, "num_input_tokens_seen": 113745640, "step": 168785 }, { "epoch": 4.123567781496592, "grad_norm": 8.54555910336785e-05, "learning_rate": 1.8143262181659157e-07, "loss": 0.0, "num_input_tokens_seen": 113749160, "step": 168790 }, { "epoch": 4.123689932328439, "grad_norm": 0.00021532121172640473, "learning_rate": 1.813836404426734e-07, "loss": 0.0, "num_input_tokens_seen": 113752808, "step": 168795 }, { "epoch": 4.123812083160287, "grad_norm": 7.620219548698515e-05, "learning_rate": 1.8133466502195372e-07, "loss": 0.0, "num_input_tokens_seen": 113756008, "step": 168800 }, { "epoch": 4.123934233992133, "grad_norm": 0.0002450960746500641, "learning_rate": 1.8128569555478867e-07, "loss": 0.0, "num_input_tokens_seen": 113758952, "step": 168805 }, { "epoch": 4.124056384823981, "grad_norm": 2.163955468859058e-05, "learning_rate": 1.812367320415341e-07, "loss": 0.0, "num_input_tokens_seen": 113762280, "step": 168810 }, { "epoch": 4.124178535655828, "grad_norm": 4.057705154991709e-05, "learning_rate": 1.811877744825464e-07, "loss": 0.0, "num_input_tokens_seen": 113765736, "step": 168815 }, { "epoch": 4.124300686487675, "grad_norm": 0.005065068136900663, "learning_rate": 1.8113882287818127e-07, "loss": 0.0, "num_input_tokens_seen": 113769128, "step": 168820 }, { "epoch": 4.124422837319522, "grad_norm": 0.0001583754929015413, "learning_rate": 1.8108987722879487e-07, "loss": 0.0, "num_input_tokens_seen": 113773160, "step": 168825 }, { "epoch": 4.124544988151369, "grad_norm": 0.00011476362851681188, "learning_rate": 1.8104093753474336e-07, "loss": 0.0, "num_input_tokens_seen": 113776552, "step": 168830 }, { "epoch": 4.124667138983217, "grad_norm": 9.519248851574957e-05, "learning_rate": 1.8099200379638212e-07, "loss": 0.0, "num_input_tokens_seen": 113779944, "step": 168835 }, { "epoch": 4.124789289815063, "grad_norm": 365.7969970703125, "learning_rate": 1.8094307601406767e-07, "loss": 0.0028, "num_input_tokens_seen": 113783592, "step": 168840 }, { "epoch": 4.124911440646911, "grad_norm": 0.008785773068666458, "learning_rate": 1.8089415418815512e-07, "loss": 0.0, "num_input_tokens_seen": 113787240, "step": 168845 }, { "epoch": 4.125033591478758, "grad_norm": 1.7524176655570045e-05, "learning_rate": 1.8084523831900089e-07, "loss": 0.0, "num_input_tokens_seen": 113790888, "step": 168850 }, { "epoch": 4.125155742310605, "grad_norm": 0.00023619976127520204, "learning_rate": 1.8079632840696023e-07, "loss": 0.0, "num_input_tokens_seen": 113794280, "step": 168855 }, { "epoch": 4.125277893142452, "grad_norm": 1.563422119943425e-05, "learning_rate": 1.807474244523891e-07, "loss": 0.0, "num_input_tokens_seen": 113797480, "step": 168860 }, { "epoch": 4.1254000439743, "grad_norm": 5.026695362175815e-05, "learning_rate": 1.806985264556432e-07, "loss": 0.0, "num_input_tokens_seen": 113800744, "step": 168865 }, { "epoch": 4.1255221948061465, "grad_norm": 0.1911337971687317, "learning_rate": 1.806496344170777e-07, "loss": 0.0001, "num_input_tokens_seen": 113804008, "step": 168870 }, { "epoch": 4.125644345637994, "grad_norm": 0.0031086925882846117, "learning_rate": 1.8060074833704863e-07, "loss": 0.0, "num_input_tokens_seen": 113807784, "step": 168875 }, { "epoch": 4.125766496469841, "grad_norm": 2.847278847184498e-05, "learning_rate": 1.8055186821591107e-07, "loss": 0.0, "num_input_tokens_seen": 113811240, "step": 168880 }, { "epoch": 4.1258886473016885, "grad_norm": 0.00010783431935124099, "learning_rate": 1.8050299405402102e-07, "loss": 0.0526, "num_input_tokens_seen": 113814696, "step": 168885 }, { "epoch": 4.126010798133535, "grad_norm": 9.459498687647283e-05, "learning_rate": 1.8045412585173336e-07, "loss": 0.0, "num_input_tokens_seen": 113818024, "step": 168890 }, { "epoch": 4.126132948965383, "grad_norm": 4.6049019147176296e-05, "learning_rate": 1.804052636094038e-07, "loss": 0.0, "num_input_tokens_seen": 113821032, "step": 168895 }, { "epoch": 4.12625509979723, "grad_norm": 0.0009420507121831179, "learning_rate": 1.8035640732738766e-07, "loss": 0.0, "num_input_tokens_seen": 113824488, "step": 168900 }, { "epoch": 4.126377250629076, "grad_norm": 4.2329302232246846e-05, "learning_rate": 1.8030755700604007e-07, "loss": 0.0, "num_input_tokens_seen": 113827816, "step": 168905 }, { "epoch": 4.126499401460924, "grad_norm": 4.134025584789924e-05, "learning_rate": 1.8025871264571668e-07, "loss": 0.0, "num_input_tokens_seen": 113831592, "step": 168910 }, { "epoch": 4.126621552292771, "grad_norm": 0.002498138230293989, "learning_rate": 1.8020987424677203e-07, "loss": 0.0, "num_input_tokens_seen": 113834856, "step": 168915 }, { "epoch": 4.126743703124618, "grad_norm": 0.005550534464418888, "learning_rate": 1.801610418095618e-07, "loss": 0.0, "num_input_tokens_seen": 113838440, "step": 168920 }, { "epoch": 4.126865853956465, "grad_norm": 0.00012591673294082284, "learning_rate": 1.801122153344412e-07, "loss": 0.0, "num_input_tokens_seen": 113841512, "step": 168925 }, { "epoch": 4.126988004788313, "grad_norm": 2.2473011995316483e-05, "learning_rate": 1.800633948217648e-07, "loss": 0.0, "num_input_tokens_seen": 113845032, "step": 168930 }, { "epoch": 4.1271101556201595, "grad_norm": 0.00018835488299373537, "learning_rate": 1.800145802718882e-07, "loss": 0.0, "num_input_tokens_seen": 113848232, "step": 168935 }, { "epoch": 4.127232306452007, "grad_norm": 0.00011231577082071453, "learning_rate": 1.799657716851659e-07, "loss": 0.0, "num_input_tokens_seen": 113851304, "step": 168940 }, { "epoch": 4.127354457283854, "grad_norm": 6.715896597597748e-05, "learning_rate": 1.7991696906195332e-07, "loss": 0.0, "num_input_tokens_seen": 113854824, "step": 168945 }, { "epoch": 4.1274766081157015, "grad_norm": 0.026994898915290833, "learning_rate": 1.7986817240260487e-07, "loss": 0.0, "num_input_tokens_seen": 113858472, "step": 168950 }, { "epoch": 4.127598758947548, "grad_norm": 2.0782459614565596e-05, "learning_rate": 1.7981938170747591e-07, "loss": 0.0, "num_input_tokens_seen": 113861800, "step": 168955 }, { "epoch": 4.127720909779396, "grad_norm": 0.0001835390430642292, "learning_rate": 1.7977059697692065e-07, "loss": 0.0, "num_input_tokens_seen": 113865192, "step": 168960 }, { "epoch": 4.127843060611243, "grad_norm": 0.0012272525345906615, "learning_rate": 1.7972181821129462e-07, "loss": 0.0, "num_input_tokens_seen": 113868456, "step": 168965 }, { "epoch": 4.12796521144309, "grad_norm": 0.00022447301307693124, "learning_rate": 1.7967304541095206e-07, "loss": 0.0, "num_input_tokens_seen": 113871784, "step": 168970 }, { "epoch": 4.128087362274937, "grad_norm": 0.00019253823847975582, "learning_rate": 1.7962427857624752e-07, "loss": 0.0, "num_input_tokens_seen": 113874984, "step": 168975 }, { "epoch": 4.128209513106785, "grad_norm": 1.2826269085053355e-05, "learning_rate": 1.7957551770753598e-07, "loss": 0.0, "num_input_tokens_seen": 113878120, "step": 168980 }, { "epoch": 4.128331663938631, "grad_norm": 0.0008162598242051899, "learning_rate": 1.795267628051721e-07, "loss": 0.0012, "num_input_tokens_seen": 113881128, "step": 168985 }, { "epoch": 4.128453814770479, "grad_norm": 0.00021831518097314984, "learning_rate": 1.7947801386951e-07, "loss": 0.0, "num_input_tokens_seen": 113884200, "step": 168990 }, { "epoch": 4.128575965602326, "grad_norm": 0.0020679819863289595, "learning_rate": 1.7942927090090476e-07, "loss": 0.0, "num_input_tokens_seen": 113888232, "step": 168995 }, { "epoch": 4.1286981164341725, "grad_norm": 0.00011742662900360301, "learning_rate": 1.793805338997104e-07, "loss": 0.0, "num_input_tokens_seen": 113891304, "step": 169000 }, { "epoch": 4.12882026726602, "grad_norm": 4.990437446394935e-06, "learning_rate": 1.793318028662817e-07, "loss": 0.0, "num_input_tokens_seen": 113894568, "step": 169005 }, { "epoch": 4.128942418097867, "grad_norm": 3.3043686471501132e-06, "learning_rate": 1.7928307780097264e-07, "loss": 0.0, "num_input_tokens_seen": 113897768, "step": 169010 }, { "epoch": 4.1290645689297145, "grad_norm": 5.002227408112958e-05, "learning_rate": 1.7923435870413773e-07, "loss": 0.0, "num_input_tokens_seen": 113901096, "step": 169015 }, { "epoch": 4.129186719761561, "grad_norm": 7.49848986743018e-05, "learning_rate": 1.7918564557613157e-07, "loss": 0.0, "num_input_tokens_seen": 113904424, "step": 169020 }, { "epoch": 4.129308870593409, "grad_norm": 3.239734360249713e-05, "learning_rate": 1.7913693841730792e-07, "loss": 0.0, "num_input_tokens_seen": 113907304, "step": 169025 }, { "epoch": 4.129431021425256, "grad_norm": 8.012790203792974e-05, "learning_rate": 1.7908823722802157e-07, "loss": 0.0, "num_input_tokens_seen": 113910440, "step": 169030 }, { "epoch": 4.129553172257103, "grad_norm": 2.7389121896703728e-05, "learning_rate": 1.7903954200862602e-07, "loss": 0.0, "num_input_tokens_seen": 113913512, "step": 169035 }, { "epoch": 4.12967532308895, "grad_norm": 0.0003001104050781578, "learning_rate": 1.78990852759476e-07, "loss": 0.0, "num_input_tokens_seen": 113916456, "step": 169040 }, { "epoch": 4.129797473920798, "grad_norm": 2.86813810816966e-05, "learning_rate": 1.789421694809251e-07, "loss": 0.0, "num_input_tokens_seen": 113919656, "step": 169045 }, { "epoch": 4.129919624752644, "grad_norm": 7.979344809427857e-05, "learning_rate": 1.788934921733276e-07, "loss": 0.0, "num_input_tokens_seen": 113923112, "step": 169050 }, { "epoch": 4.130041775584492, "grad_norm": 0.00016391415556427091, "learning_rate": 1.7884482083703766e-07, "loss": 0.0, "num_input_tokens_seen": 113926568, "step": 169055 }, { "epoch": 4.130163926416339, "grad_norm": 0.001250895787961781, "learning_rate": 1.787961554724091e-07, "loss": 0.0, "num_input_tokens_seen": 113929640, "step": 169060 }, { "epoch": 4.130286077248186, "grad_norm": 0.0009996866574510932, "learning_rate": 1.7874749607979556e-07, "loss": 0.0, "num_input_tokens_seen": 113933288, "step": 169065 }, { "epoch": 4.130408228080033, "grad_norm": 6.69181245029904e-05, "learning_rate": 1.7869884265955127e-07, "loss": 0.0, "num_input_tokens_seen": 113936232, "step": 169070 }, { "epoch": 4.130530378911881, "grad_norm": 0.008389724418520927, "learning_rate": 1.7865019521202973e-07, "loss": 0.0, "num_input_tokens_seen": 113939560, "step": 169075 }, { "epoch": 4.1306525297437275, "grad_norm": 0.0025512150023132563, "learning_rate": 1.7860155373758511e-07, "loss": 0.0, "num_input_tokens_seen": 113942888, "step": 169080 }, { "epoch": 4.130774680575575, "grad_norm": 9.098489681491628e-05, "learning_rate": 1.785529182365707e-07, "loss": 0.0, "num_input_tokens_seen": 113945896, "step": 169085 }, { "epoch": 4.130896831407422, "grad_norm": 0.0035372497513890266, "learning_rate": 1.7850428870934052e-07, "loss": 0.0, "num_input_tokens_seen": 113949288, "step": 169090 }, { "epoch": 4.131018982239269, "grad_norm": 0.00010230082989437506, "learning_rate": 1.7845566515624798e-07, "loss": 0.0, "num_input_tokens_seen": 113952488, "step": 169095 }, { "epoch": 4.131141133071116, "grad_norm": 0.0007599542150273919, "learning_rate": 1.7840704757764712e-07, "loss": 0.0, "num_input_tokens_seen": 113955432, "step": 169100 }, { "epoch": 4.131263283902963, "grad_norm": 6.62624233882525e-06, "learning_rate": 1.7835843597389088e-07, "loss": 0.0002, "num_input_tokens_seen": 113959208, "step": 169105 }, { "epoch": 4.131385434734811, "grad_norm": 0.010468224994838238, "learning_rate": 1.783098303453331e-07, "loss": 0.0, "num_input_tokens_seen": 113962920, "step": 169110 }, { "epoch": 4.131507585566657, "grad_norm": 0.00017902413674164563, "learning_rate": 1.7826123069232746e-07, "loss": 0.0, "num_input_tokens_seen": 113966184, "step": 169115 }, { "epoch": 4.131629736398505, "grad_norm": 7.583387923659757e-05, "learning_rate": 1.7821263701522694e-07, "loss": 0.0, "num_input_tokens_seen": 113969320, "step": 169120 }, { "epoch": 4.131751887230352, "grad_norm": 0.01053790282458067, "learning_rate": 1.7816404931438533e-07, "loss": 0.0, "num_input_tokens_seen": 113972584, "step": 169125 }, { "epoch": 4.131874038062199, "grad_norm": 0.0004985374980606139, "learning_rate": 1.781154675901556e-07, "loss": 0.0, "num_input_tokens_seen": 113975656, "step": 169130 }, { "epoch": 4.131996188894046, "grad_norm": 0.0007123488467186689, "learning_rate": 1.7806689184289136e-07, "loss": 0.0, "num_input_tokens_seen": 113979432, "step": 169135 }, { "epoch": 4.132118339725894, "grad_norm": 1.9404064005357213e-05, "learning_rate": 1.7801832207294587e-07, "loss": 0.0, "num_input_tokens_seen": 113982760, "step": 169140 }, { "epoch": 4.132240490557741, "grad_norm": 4.020150299766101e-06, "learning_rate": 1.7796975828067206e-07, "loss": 0.0, "num_input_tokens_seen": 113986856, "step": 169145 }, { "epoch": 4.132362641389588, "grad_norm": 0.00013273171498440206, "learning_rate": 1.7792120046642344e-07, "loss": 0.0, "num_input_tokens_seen": 113990568, "step": 169150 }, { "epoch": 4.132484792221435, "grad_norm": 0.0015003958251327276, "learning_rate": 1.7787264863055273e-07, "loss": 0.0, "num_input_tokens_seen": 113994536, "step": 169155 }, { "epoch": 4.132606943053283, "grad_norm": 7.169459422584623e-05, "learning_rate": 1.7782410277341352e-07, "loss": 0.0, "num_input_tokens_seen": 113997672, "step": 169160 }, { "epoch": 4.132729093885129, "grad_norm": 0.00012075284757884219, "learning_rate": 1.7777556289535834e-07, "loss": 0.0, "num_input_tokens_seen": 114001064, "step": 169165 }, { "epoch": 4.132851244716976, "grad_norm": 4.0698819248063955e-06, "learning_rate": 1.7772702899674063e-07, "loss": 0.0, "num_input_tokens_seen": 114004392, "step": 169170 }, { "epoch": 4.132973395548824, "grad_norm": 0.000144742734846659, "learning_rate": 1.7767850107791316e-07, "loss": 0.0, "num_input_tokens_seen": 114008168, "step": 169175 }, { "epoch": 4.1330955463806704, "grad_norm": 0.0016985450638458133, "learning_rate": 1.7762997913922862e-07, "loss": 0.0, "num_input_tokens_seen": 114011752, "step": 169180 }, { "epoch": 4.133217697212518, "grad_norm": 6.355442565109115e-06, "learning_rate": 1.7758146318104018e-07, "loss": 0.0, "num_input_tokens_seen": 114015144, "step": 169185 }, { "epoch": 4.133339848044365, "grad_norm": 0.001065228134393692, "learning_rate": 1.7753295320370043e-07, "loss": 0.0, "num_input_tokens_seen": 114018152, "step": 169190 }, { "epoch": 4.1334619988762125, "grad_norm": 0.873834490776062, "learning_rate": 1.7748444920756245e-07, "loss": 0.0003, "num_input_tokens_seen": 114021416, "step": 169195 }, { "epoch": 4.133584149708059, "grad_norm": 0.00012380085536278784, "learning_rate": 1.774359511929785e-07, "loss": 0.0, "num_input_tokens_seen": 114024616, "step": 169200 }, { "epoch": 4.133706300539907, "grad_norm": 0.0001145715796155855, "learning_rate": 1.773874591603015e-07, "loss": 0.0, "num_input_tokens_seen": 114027944, "step": 169205 }, { "epoch": 4.133828451371754, "grad_norm": 1.7328757166978903e-05, "learning_rate": 1.7733897310988456e-07, "loss": 0.0, "num_input_tokens_seen": 114031400, "step": 169210 }, { "epoch": 4.133950602203601, "grad_norm": 3.638927228166722e-05, "learning_rate": 1.7729049304207955e-07, "loss": 0.0, "num_input_tokens_seen": 114034792, "step": 169215 }, { "epoch": 4.134072753035448, "grad_norm": 0.00015678427007514983, "learning_rate": 1.7724201895723956e-07, "loss": 0.0, "num_input_tokens_seen": 114038568, "step": 169220 }, { "epoch": 4.134194903867296, "grad_norm": 0.00022379684378392994, "learning_rate": 1.7719355085571676e-07, "loss": 0.0, "num_input_tokens_seen": 114041896, "step": 169225 }, { "epoch": 4.134317054699142, "grad_norm": 0.000458498892839998, "learning_rate": 1.771450887378637e-07, "loss": 0.0, "num_input_tokens_seen": 114045224, "step": 169230 }, { "epoch": 4.13443920553099, "grad_norm": 5.889030217076652e-05, "learning_rate": 1.7709663260403307e-07, "loss": 0.0, "num_input_tokens_seen": 114048552, "step": 169235 }, { "epoch": 4.134561356362837, "grad_norm": 0.000463417760329321, "learning_rate": 1.7704818245457686e-07, "loss": 0.0, "num_input_tokens_seen": 114051816, "step": 169240 }, { "epoch": 4.134683507194684, "grad_norm": 0.00025956856552511454, "learning_rate": 1.7699973828984794e-07, "loss": 0.0, "num_input_tokens_seen": 114054952, "step": 169245 }, { "epoch": 4.134805658026531, "grad_norm": 0.0002999178832396865, "learning_rate": 1.76951300110198e-07, "loss": 0.0, "num_input_tokens_seen": 114058024, "step": 169250 }, { "epoch": 4.134927808858379, "grad_norm": 3.823560109594837e-05, "learning_rate": 1.7690286791597973e-07, "loss": 0.0, "num_input_tokens_seen": 114061352, "step": 169255 }, { "epoch": 4.1350499596902255, "grad_norm": 2.4604967620689422e-05, "learning_rate": 1.7685444170754503e-07, "loss": 0.0, "num_input_tokens_seen": 114064552, "step": 169260 }, { "epoch": 4.135172110522072, "grad_norm": 1.1779735359596089e-05, "learning_rate": 1.7680602148524649e-07, "loss": 0.0, "num_input_tokens_seen": 114067624, "step": 169265 }, { "epoch": 4.13529426135392, "grad_norm": 0.0051346332766115665, "learning_rate": 1.7675760724943567e-07, "loss": 0.0, "num_input_tokens_seen": 114071144, "step": 169270 }, { "epoch": 4.135416412185767, "grad_norm": 0.0003757499507628381, "learning_rate": 1.767091990004652e-07, "loss": 0.0, "num_input_tokens_seen": 114074472, "step": 169275 }, { "epoch": 4.135538563017614, "grad_norm": 9.444287570659071e-06, "learning_rate": 1.7666079673868695e-07, "loss": 0.0, "num_input_tokens_seen": 114078952, "step": 169280 }, { "epoch": 4.135660713849461, "grad_norm": 7.637072849320248e-05, "learning_rate": 1.7661240046445259e-07, "loss": 0.0, "num_input_tokens_seen": 114082536, "step": 169285 }, { "epoch": 4.135782864681309, "grad_norm": 0.002652720781043172, "learning_rate": 1.7656401017811451e-07, "loss": 0.0, "num_input_tokens_seen": 114085992, "step": 169290 }, { "epoch": 4.135905015513155, "grad_norm": 5.934476575930603e-05, "learning_rate": 1.7651562588002412e-07, "loss": 0.0, "num_input_tokens_seen": 114089192, "step": 169295 }, { "epoch": 4.136027166345003, "grad_norm": 2.6133950086659752e-05, "learning_rate": 1.7646724757053366e-07, "loss": 0.0, "num_input_tokens_seen": 114092456, "step": 169300 }, { "epoch": 4.13614931717685, "grad_norm": 0.00019350307411514223, "learning_rate": 1.7641887524999511e-07, "loss": 0.0, "num_input_tokens_seen": 114095592, "step": 169305 }, { "epoch": 4.136271468008697, "grad_norm": 0.0005644158809445798, "learning_rate": 1.7637050891875983e-07, "loss": 0.0, "num_input_tokens_seen": 114099048, "step": 169310 }, { "epoch": 4.136393618840544, "grad_norm": 0.00013100756041239947, "learning_rate": 1.7632214857717997e-07, "loss": 0.0, "num_input_tokens_seen": 114102248, "step": 169315 }, { "epoch": 4.136515769672392, "grad_norm": 9.739145752973855e-05, "learning_rate": 1.7627379422560672e-07, "loss": 0.0, "num_input_tokens_seen": 114105960, "step": 169320 }, { "epoch": 4.1366379205042385, "grad_norm": 1.3738956113229506e-05, "learning_rate": 1.762254458643919e-07, "loss": 0.0, "num_input_tokens_seen": 114109352, "step": 169325 }, { "epoch": 4.136760071336086, "grad_norm": 0.00037023035110905766, "learning_rate": 1.7617710349388758e-07, "loss": 0.0005, "num_input_tokens_seen": 114113000, "step": 169330 }, { "epoch": 4.136882222167933, "grad_norm": 9.684554242994636e-05, "learning_rate": 1.761287671144447e-07, "loss": 0.0, "num_input_tokens_seen": 114116072, "step": 169335 }, { "epoch": 4.1370043729997805, "grad_norm": 1.5194071238511242e-05, "learning_rate": 1.7608043672641516e-07, "loss": 0.0, "num_input_tokens_seen": 114119272, "step": 169340 }, { "epoch": 4.137126523831627, "grad_norm": 6.645849498454481e-05, "learning_rate": 1.7603211233015013e-07, "loss": 0.0, "num_input_tokens_seen": 114122664, "step": 169345 }, { "epoch": 4.137248674663475, "grad_norm": 0.0013586204731836915, "learning_rate": 1.7598379392600137e-07, "loss": 0.0, "num_input_tokens_seen": 114126120, "step": 169350 }, { "epoch": 4.137370825495322, "grad_norm": 5.228986992733553e-05, "learning_rate": 1.759354815143199e-07, "loss": 0.0, "num_input_tokens_seen": 114129448, "step": 169355 }, { "epoch": 4.137492976327168, "grad_norm": 0.0018259503412991762, "learning_rate": 1.7588717509545738e-07, "loss": 0.0, "num_input_tokens_seen": 114132584, "step": 169360 }, { "epoch": 4.137615127159016, "grad_norm": 0.0023885120172053576, "learning_rate": 1.75838874669765e-07, "loss": 0.0, "num_input_tokens_seen": 114135912, "step": 169365 }, { "epoch": 4.137737277990863, "grad_norm": 0.00011633247777353972, "learning_rate": 1.757905802375942e-07, "loss": 0.0, "num_input_tokens_seen": 114139304, "step": 169370 }, { "epoch": 4.13785942882271, "grad_norm": 0.00010378471779404208, "learning_rate": 1.7574229179929556e-07, "loss": 0.0, "num_input_tokens_seen": 114142504, "step": 169375 }, { "epoch": 4.137981579654557, "grad_norm": 0.0009791553020477295, "learning_rate": 1.7569400935522105e-07, "loss": 0.0, "num_input_tokens_seen": 114146344, "step": 169380 }, { "epoch": 4.138103730486405, "grad_norm": 0.0005662897019647062, "learning_rate": 1.7564573290572115e-07, "loss": 0.0, "num_input_tokens_seen": 114149800, "step": 169385 }, { "epoch": 4.1382258813182515, "grad_norm": 1.4119503248366527e-05, "learning_rate": 1.7559746245114737e-07, "loss": 0.0, "num_input_tokens_seen": 114152872, "step": 169390 }, { "epoch": 4.138348032150099, "grad_norm": 0.00040378831909038126, "learning_rate": 1.7554919799185041e-07, "loss": 0.0, "num_input_tokens_seen": 114156136, "step": 169395 }, { "epoch": 4.138470182981946, "grad_norm": 0.00027075386606156826, "learning_rate": 1.7550093952818168e-07, "loss": 0.0, "num_input_tokens_seen": 114159272, "step": 169400 }, { "epoch": 4.1385923338137935, "grad_norm": 0.00012230666470713913, "learning_rate": 1.7545268706049155e-07, "loss": 0.0, "num_input_tokens_seen": 114162920, "step": 169405 }, { "epoch": 4.13871448464564, "grad_norm": 1.6268022591248155e-05, "learning_rate": 1.7540444058913162e-07, "loss": 0.0, "num_input_tokens_seen": 114166248, "step": 169410 }, { "epoch": 4.138836635477488, "grad_norm": 0.0005964138545095921, "learning_rate": 1.7535620011445208e-07, "loss": 0.0419, "num_input_tokens_seen": 114169640, "step": 169415 }, { "epoch": 4.138958786309335, "grad_norm": 0.00023118879471439868, "learning_rate": 1.7530796563680406e-07, "loss": 0.0, "num_input_tokens_seen": 114173224, "step": 169420 }, { "epoch": 4.139080937141182, "grad_norm": 0.0001617353700567037, "learning_rate": 1.752597371565385e-07, "loss": 0.0, "num_input_tokens_seen": 114176424, "step": 169425 }, { "epoch": 4.139203087973029, "grad_norm": 0.0001720954751363024, "learning_rate": 1.7521151467400585e-07, "loss": 0.0, "num_input_tokens_seen": 114179880, "step": 169430 }, { "epoch": 4.139325238804877, "grad_norm": 0.00021559244487434626, "learning_rate": 1.7516329818955712e-07, "loss": 0.0, "num_input_tokens_seen": 114183016, "step": 169435 }, { "epoch": 4.139447389636723, "grad_norm": 4.029813680972438e-06, "learning_rate": 1.7511508770354243e-07, "loss": 0.0, "num_input_tokens_seen": 114186152, "step": 169440 }, { "epoch": 4.13956954046857, "grad_norm": 0.00014288983948063105, "learning_rate": 1.75066883216313e-07, "loss": 0.0, "num_input_tokens_seen": 114189544, "step": 169445 }, { "epoch": 4.139691691300418, "grad_norm": 6.273949838941917e-05, "learning_rate": 1.750186847282188e-07, "loss": 0.0, "num_input_tokens_seen": 114193000, "step": 169450 }, { "epoch": 4.1398138421322646, "grad_norm": 7.3529131441318896e-06, "learning_rate": 1.7497049223961058e-07, "loss": 0.0, "num_input_tokens_seen": 114196648, "step": 169455 }, { "epoch": 4.139935992964112, "grad_norm": 0.007190991658717394, "learning_rate": 1.749223057508391e-07, "loss": 0.0, "num_input_tokens_seen": 114200104, "step": 169460 }, { "epoch": 4.140058143795959, "grad_norm": 0.004899477120488882, "learning_rate": 1.748741252622543e-07, "loss": 0.0, "num_input_tokens_seen": 114204008, "step": 169465 }, { "epoch": 4.140180294627807, "grad_norm": 4.425868246471509e-05, "learning_rate": 1.7482595077420713e-07, "loss": 0.0, "num_input_tokens_seen": 114206888, "step": 169470 }, { "epoch": 4.140302445459653, "grad_norm": 0.0001238979893969372, "learning_rate": 1.7477778228704732e-07, "loss": 0.0, "num_input_tokens_seen": 114210408, "step": 169475 }, { "epoch": 4.140424596291501, "grad_norm": 0.0001803866762202233, "learning_rate": 1.7472961980112556e-07, "loss": 0.0, "num_input_tokens_seen": 114213416, "step": 169480 }, { "epoch": 4.140546747123348, "grad_norm": 5.6156062782974914e-05, "learning_rate": 1.746814633167921e-07, "loss": 0.0, "num_input_tokens_seen": 114217000, "step": 169485 }, { "epoch": 4.140668897955195, "grad_norm": 0.0022617189679294825, "learning_rate": 1.7463331283439664e-07, "loss": 0.0, "num_input_tokens_seen": 114220072, "step": 169490 }, { "epoch": 4.140791048787042, "grad_norm": 5.546949978452176e-05, "learning_rate": 1.7458516835429016e-07, "loss": 0.0, "num_input_tokens_seen": 114223592, "step": 169495 }, { "epoch": 4.14091319961889, "grad_norm": 0.00012014965614071116, "learning_rate": 1.7453702987682195e-07, "loss": 0.0, "num_input_tokens_seen": 114227304, "step": 169500 }, { "epoch": 4.1410353504507365, "grad_norm": 0.0006364333676174283, "learning_rate": 1.7448889740234273e-07, "loss": 0.0, "num_input_tokens_seen": 114231208, "step": 169505 }, { "epoch": 4.141157501282584, "grad_norm": 0.0007857916643843055, "learning_rate": 1.7444077093120214e-07, "loss": 0.0, "num_input_tokens_seen": 114234408, "step": 169510 }, { "epoch": 4.141279652114431, "grad_norm": 0.00018345196440350264, "learning_rate": 1.743926504637503e-07, "loss": 0.0, "num_input_tokens_seen": 114237736, "step": 169515 }, { "epoch": 4.1414018029462785, "grad_norm": 0.0016277572140097618, "learning_rate": 1.7434453600033728e-07, "loss": 0.0, "num_input_tokens_seen": 114241128, "step": 169520 }, { "epoch": 4.141523953778125, "grad_norm": 1.3486666830431204e-05, "learning_rate": 1.742964275413128e-07, "loss": 0.0, "num_input_tokens_seen": 114244584, "step": 169525 }, { "epoch": 4.141646104609972, "grad_norm": 1.944943687703926e-05, "learning_rate": 1.7424832508702692e-07, "loss": 0.0, "num_input_tokens_seen": 114247912, "step": 169530 }, { "epoch": 4.14176825544182, "grad_norm": 0.0033633532002568245, "learning_rate": 1.74200228637829e-07, "loss": 0.0, "num_input_tokens_seen": 114250984, "step": 169535 }, { "epoch": 4.141890406273666, "grad_norm": 0.0007912274450063705, "learning_rate": 1.7415213819406926e-07, "loss": 0.0, "num_input_tokens_seen": 114254248, "step": 169540 }, { "epoch": 4.142012557105514, "grad_norm": 0.00033459014957770705, "learning_rate": 1.741040537560976e-07, "loss": 0.0001, "num_input_tokens_seen": 114257384, "step": 169545 }, { "epoch": 4.142134707937361, "grad_norm": 4.042280852445401e-05, "learning_rate": 1.7405597532426297e-07, "loss": 0.0, "num_input_tokens_seen": 114260392, "step": 169550 }, { "epoch": 4.142256858769208, "grad_norm": 0.0008538342663086951, "learning_rate": 1.7400790289891588e-07, "loss": 0.0, "num_input_tokens_seen": 114263848, "step": 169555 }, { "epoch": 4.142379009601055, "grad_norm": 1.0966689842462074e-05, "learning_rate": 1.7395983648040513e-07, "loss": 0.0, "num_input_tokens_seen": 114267432, "step": 169560 }, { "epoch": 4.142501160432903, "grad_norm": 0.0021321589592844248, "learning_rate": 1.7391177606908081e-07, "loss": 0.0, "num_input_tokens_seen": 114270824, "step": 169565 }, { "epoch": 4.1426233112647495, "grad_norm": 9.802708518691361e-06, "learning_rate": 1.7386372166529218e-07, "loss": 0.0, "num_input_tokens_seen": 114274280, "step": 169570 }, { "epoch": 4.142745462096597, "grad_norm": 8.670361239637714e-06, "learning_rate": 1.7381567326938883e-07, "loss": 0.0, "num_input_tokens_seen": 114277608, "step": 169575 }, { "epoch": 4.142867612928444, "grad_norm": 7.37422305974178e-05, "learning_rate": 1.7376763088171998e-07, "loss": 0.0, "num_input_tokens_seen": 114281064, "step": 169580 }, { "epoch": 4.1429897637602915, "grad_norm": 0.0022815996780991554, "learning_rate": 1.737195945026354e-07, "loss": 0.0, "num_input_tokens_seen": 114284200, "step": 169585 }, { "epoch": 4.143111914592138, "grad_norm": 0.0026343739591538906, "learning_rate": 1.7367156413248408e-07, "loss": 0.0, "num_input_tokens_seen": 114287400, "step": 169590 }, { "epoch": 4.143234065423986, "grad_norm": 9.867559128906578e-06, "learning_rate": 1.7362353977161527e-07, "loss": 0.0, "num_input_tokens_seen": 114291112, "step": 169595 }, { "epoch": 4.143356216255833, "grad_norm": 0.002449099440127611, "learning_rate": 1.7357552142037856e-07, "loss": 0.0, "num_input_tokens_seen": 114295144, "step": 169600 }, { "epoch": 4.14347836708768, "grad_norm": 8.740804332774132e-05, "learning_rate": 1.735275090791226e-07, "loss": 0.0, "num_input_tokens_seen": 114298856, "step": 169605 }, { "epoch": 4.143600517919527, "grad_norm": 8.52627053973265e-05, "learning_rate": 1.73479502748197e-07, "loss": 0.0, "num_input_tokens_seen": 114301992, "step": 169610 }, { "epoch": 4.143722668751375, "grad_norm": 1.3284211490827147e-05, "learning_rate": 1.7343150242795102e-07, "loss": 0.0, "num_input_tokens_seen": 114305064, "step": 169615 }, { "epoch": 4.143844819583221, "grad_norm": 0.0002733416040427983, "learning_rate": 1.7338350811873314e-07, "loss": 0.0, "num_input_tokens_seen": 114308200, "step": 169620 }, { "epoch": 4.143966970415068, "grad_norm": 0.0004196310183033347, "learning_rate": 1.73335519820893e-07, "loss": 0.0, "num_input_tokens_seen": 114310952, "step": 169625 }, { "epoch": 4.144089121246916, "grad_norm": 1.7216229025507346e-05, "learning_rate": 1.732875375347791e-07, "loss": 0.0, "num_input_tokens_seen": 114314408, "step": 169630 }, { "epoch": 4.1442112720787625, "grad_norm": 0.00020091682381462306, "learning_rate": 1.7323956126074057e-07, "loss": 0.0, "num_input_tokens_seen": 114317928, "step": 169635 }, { "epoch": 4.14433342291061, "grad_norm": 5.992214209982194e-05, "learning_rate": 1.731915909991265e-07, "loss": 0.0, "num_input_tokens_seen": 114320808, "step": 169640 }, { "epoch": 4.144455573742457, "grad_norm": 0.0003436962724663317, "learning_rate": 1.7314362675028537e-07, "loss": 0.0, "num_input_tokens_seen": 114324584, "step": 169645 }, { "epoch": 4.1445777245743045, "grad_norm": 0.00014694590936414897, "learning_rate": 1.7309566851456647e-07, "loss": 0.0, "num_input_tokens_seen": 114328296, "step": 169650 }, { "epoch": 4.144699875406151, "grad_norm": 0.05490154027938843, "learning_rate": 1.7304771629231796e-07, "loss": 0.0, "num_input_tokens_seen": 114331624, "step": 169655 }, { "epoch": 4.144822026237999, "grad_norm": 0.0006583016365766525, "learning_rate": 1.7299977008388923e-07, "loss": 0.0, "num_input_tokens_seen": 114334824, "step": 169660 }, { "epoch": 4.144944177069846, "grad_norm": 0.00024803922860883176, "learning_rate": 1.729518298896282e-07, "loss": 0.0, "num_input_tokens_seen": 114337768, "step": 169665 }, { "epoch": 4.145066327901693, "grad_norm": 1.9043955035158433e-05, "learning_rate": 1.7290389570988406e-07, "loss": 0.0, "num_input_tokens_seen": 114340712, "step": 169670 }, { "epoch": 4.14518847873354, "grad_norm": 0.0005053699715062976, "learning_rate": 1.728559675450054e-07, "loss": 0.0, "num_input_tokens_seen": 114344232, "step": 169675 }, { "epoch": 4.145310629565388, "grad_norm": 0.00019391553360037506, "learning_rate": 1.7280804539534066e-07, "loss": 0.0, "num_input_tokens_seen": 114347752, "step": 169680 }, { "epoch": 4.145432780397234, "grad_norm": 0.01149059645831585, "learning_rate": 1.7276012926123807e-07, "loss": 0.0, "num_input_tokens_seen": 114351144, "step": 169685 }, { "epoch": 4.145554931229082, "grad_norm": 3.013821697095409e-06, "learning_rate": 1.7271221914304657e-07, "loss": 0.0, "num_input_tokens_seen": 114354408, "step": 169690 }, { "epoch": 4.145677082060929, "grad_norm": 0.00018444034503772855, "learning_rate": 1.7266431504111413e-07, "loss": 0.0, "num_input_tokens_seen": 114357544, "step": 169695 }, { "epoch": 4.145799232892776, "grad_norm": 6.898889751028037e-06, "learning_rate": 1.7261641695578943e-07, "loss": 0.0, "num_input_tokens_seen": 114360872, "step": 169700 }, { "epoch": 4.145921383724623, "grad_norm": 5.794294338556938e-06, "learning_rate": 1.7256852488742057e-07, "loss": 0.0, "num_input_tokens_seen": 114363944, "step": 169705 }, { "epoch": 4.14604353455647, "grad_norm": 0.00010420309990877286, "learning_rate": 1.7252063883635604e-07, "loss": 0.0, "num_input_tokens_seen": 114367080, "step": 169710 }, { "epoch": 4.1461656853883175, "grad_norm": 0.00022488638933282346, "learning_rate": 1.7247275880294388e-07, "loss": 0.0, "num_input_tokens_seen": 114370152, "step": 169715 }, { "epoch": 4.146287836220164, "grad_norm": 0.000128116414998658, "learning_rate": 1.7242488478753258e-07, "loss": 0.0, "num_input_tokens_seen": 114373928, "step": 169720 }, { "epoch": 4.146409987052012, "grad_norm": 0.0007236965466290712, "learning_rate": 1.723770167904699e-07, "loss": 0.0, "num_input_tokens_seen": 114377000, "step": 169725 }, { "epoch": 4.146532137883859, "grad_norm": 4.138689109822735e-05, "learning_rate": 1.723291548121042e-07, "loss": 0.0, "num_input_tokens_seen": 114380584, "step": 169730 }, { "epoch": 4.146654288715706, "grad_norm": 2.3288272132049315e-05, "learning_rate": 1.7228129885278364e-07, "loss": 0.0, "num_input_tokens_seen": 114383976, "step": 169735 }, { "epoch": 4.146776439547553, "grad_norm": 0.00036748574348166585, "learning_rate": 1.7223344891285584e-07, "loss": 0.0, "num_input_tokens_seen": 114387112, "step": 169740 }, { "epoch": 4.146898590379401, "grad_norm": 0.0008554519154131413, "learning_rate": 1.7218560499266943e-07, "loss": 0.0, "num_input_tokens_seen": 114390248, "step": 169745 }, { "epoch": 4.147020741211247, "grad_norm": 0.0006384583539329469, "learning_rate": 1.7213776709257165e-07, "loss": 0.0, "num_input_tokens_seen": 114393704, "step": 169750 }, { "epoch": 4.147142892043095, "grad_norm": 4.4409707697923295e-06, "learning_rate": 1.7208993521291092e-07, "loss": 0.0, "num_input_tokens_seen": 114397224, "step": 169755 }, { "epoch": 4.147265042874942, "grad_norm": 3.955944976041792e-06, "learning_rate": 1.7204210935403462e-07, "loss": 0.0, "num_input_tokens_seen": 114400168, "step": 169760 }, { "epoch": 4.147387193706789, "grad_norm": 2.9546312362072058e-05, "learning_rate": 1.7199428951629082e-07, "loss": 0.0, "num_input_tokens_seen": 114403240, "step": 169765 }, { "epoch": 4.147509344538636, "grad_norm": 0.00019152191816829145, "learning_rate": 1.7194647570002741e-07, "loss": 0.0, "num_input_tokens_seen": 114406120, "step": 169770 }, { "epoch": 4.147631495370484, "grad_norm": 0.0013317177072167397, "learning_rate": 1.718986679055918e-07, "loss": 0.0, "num_input_tokens_seen": 114409512, "step": 169775 }, { "epoch": 4.147753646202331, "grad_norm": 0.00021308651776053011, "learning_rate": 1.71850866133332e-07, "loss": 0.0, "num_input_tokens_seen": 114412968, "step": 169780 }, { "epoch": 4.147875797034178, "grad_norm": 3.8212063373066485e-05, "learning_rate": 1.718030703835952e-07, "loss": 0.0, "num_input_tokens_seen": 114415784, "step": 169785 }, { "epoch": 4.147997947866025, "grad_norm": 7.068268314469606e-05, "learning_rate": 1.717552806567295e-07, "loss": 0.0, "num_input_tokens_seen": 114418920, "step": 169790 }, { "epoch": 4.148120098697872, "grad_norm": 1.8192162315244786e-05, "learning_rate": 1.7170749695308228e-07, "loss": 0.0, "num_input_tokens_seen": 114421992, "step": 169795 }, { "epoch": 4.148242249529719, "grad_norm": 0.0006818032707087696, "learning_rate": 1.716597192730005e-07, "loss": 0.0, "num_input_tokens_seen": 114425192, "step": 169800 }, { "epoch": 4.148364400361566, "grad_norm": 0.000235591855016537, "learning_rate": 1.716119476168324e-07, "loss": 0.0, "num_input_tokens_seen": 114428520, "step": 169805 }, { "epoch": 4.148486551193414, "grad_norm": 6.817103439971106e-06, "learning_rate": 1.7156418198492473e-07, "loss": 0.0, "num_input_tokens_seen": 114432296, "step": 169810 }, { "epoch": 4.14860870202526, "grad_norm": 0.0003154922742396593, "learning_rate": 1.7151642237762543e-07, "loss": 0.0, "num_input_tokens_seen": 114435880, "step": 169815 }, { "epoch": 4.148730852857108, "grad_norm": 0.002449472900480032, "learning_rate": 1.7146866879528122e-07, "loss": 0.0, "num_input_tokens_seen": 114439208, "step": 169820 }, { "epoch": 4.148853003688955, "grad_norm": 0.00023228446661960334, "learning_rate": 1.714209212382398e-07, "loss": 0.0, "num_input_tokens_seen": 114442984, "step": 169825 }, { "epoch": 4.1489751545208025, "grad_norm": 1.0799426490848418e-05, "learning_rate": 1.7137317970684851e-07, "loss": 0.0, "num_input_tokens_seen": 114445992, "step": 169830 }, { "epoch": 4.149097305352649, "grad_norm": 0.005137881729751825, "learning_rate": 1.71325444201454e-07, "loss": 0.0, "num_input_tokens_seen": 114449192, "step": 169835 }, { "epoch": 4.149219456184497, "grad_norm": 0.00022352815722115338, "learning_rate": 1.7127771472240404e-07, "loss": 0.0, "num_input_tokens_seen": 114452392, "step": 169840 }, { "epoch": 4.149341607016344, "grad_norm": 0.00036647875094786286, "learning_rate": 1.7122999127004522e-07, "loss": 0.0, "num_input_tokens_seen": 114456104, "step": 169845 }, { "epoch": 4.149463757848191, "grad_norm": 6.18748672422953e-05, "learning_rate": 1.7118227384472482e-07, "loss": 0.0, "num_input_tokens_seen": 114459112, "step": 169850 }, { "epoch": 4.149585908680038, "grad_norm": 0.0069971526972949505, "learning_rate": 1.7113456244679014e-07, "loss": 0.0, "num_input_tokens_seen": 114463016, "step": 169855 }, { "epoch": 4.149708059511886, "grad_norm": 0.00015763036208227277, "learning_rate": 1.7108685707658754e-07, "loss": 0.0716, "num_input_tokens_seen": 114466088, "step": 169860 }, { "epoch": 4.149830210343732, "grad_norm": 0.00014392206503544003, "learning_rate": 1.7103915773446453e-07, "loss": 0.0, "num_input_tokens_seen": 114469416, "step": 169865 }, { "epoch": 4.14995236117558, "grad_norm": 2.112441507051699e-05, "learning_rate": 1.709914644207675e-07, "loss": 0.0, "num_input_tokens_seen": 114472552, "step": 169870 }, { "epoch": 4.150074512007427, "grad_norm": 0.00011392030137358233, "learning_rate": 1.7094377713584374e-07, "loss": 0.0, "num_input_tokens_seen": 114475560, "step": 169875 }, { "epoch": 4.150196662839274, "grad_norm": 1.2522552424343303e-05, "learning_rate": 1.7089609588003962e-07, "loss": 0.0, "num_input_tokens_seen": 114478888, "step": 169880 }, { "epoch": 4.150318813671121, "grad_norm": 0.00044914192403666675, "learning_rate": 1.7084842065370232e-07, "loss": 0.0, "num_input_tokens_seen": 114482088, "step": 169885 }, { "epoch": 4.150440964502968, "grad_norm": 0.001731325639411807, "learning_rate": 1.7080075145717798e-07, "loss": 0.0, "num_input_tokens_seen": 114485992, "step": 169890 }, { "epoch": 4.1505631153348155, "grad_norm": 1.6076908650575206e-05, "learning_rate": 1.707530882908139e-07, "loss": 0.0, "num_input_tokens_seen": 114488936, "step": 169895 }, { "epoch": 4.150685266166662, "grad_norm": 0.4748183786869049, "learning_rate": 1.707054311549565e-07, "loss": 0.0001, "num_input_tokens_seen": 114492584, "step": 169900 }, { "epoch": 4.15080741699851, "grad_norm": 0.0008500635158270597, "learning_rate": 1.706577800499519e-07, "loss": 0.0, "num_input_tokens_seen": 114495912, "step": 169905 }, { "epoch": 4.150929567830357, "grad_norm": 3.9711067074676976e-05, "learning_rate": 1.706101349761473e-07, "loss": 0.0, "num_input_tokens_seen": 114499432, "step": 169910 }, { "epoch": 4.151051718662204, "grad_norm": 0.00018771788745652884, "learning_rate": 1.7056249593388862e-07, "loss": 0.0, "num_input_tokens_seen": 114502248, "step": 169915 }, { "epoch": 4.151173869494051, "grad_norm": 0.002624510321766138, "learning_rate": 1.7051486292352258e-07, "loss": 0.0, "num_input_tokens_seen": 114505448, "step": 169920 }, { "epoch": 4.151296020325899, "grad_norm": 3.772515265154652e-05, "learning_rate": 1.704672359453958e-07, "loss": 0.0, "num_input_tokens_seen": 114508520, "step": 169925 }, { "epoch": 4.151418171157745, "grad_norm": 0.000177039866684936, "learning_rate": 1.7041961499985414e-07, "loss": 0.0, "num_input_tokens_seen": 114511656, "step": 169930 }, { "epoch": 4.151540321989593, "grad_norm": 0.00017326019587926567, "learning_rate": 1.703720000872444e-07, "loss": 0.0, "num_input_tokens_seen": 114515560, "step": 169935 }, { "epoch": 4.15166247282144, "grad_norm": 0.004151183646172285, "learning_rate": 1.703243912079123e-07, "loss": 0.0, "num_input_tokens_seen": 114518760, "step": 169940 }, { "epoch": 4.151784623653287, "grad_norm": 3.284201739006676e-05, "learning_rate": 1.702767883622045e-07, "loss": 0.0, "num_input_tokens_seen": 114522472, "step": 169945 }, { "epoch": 4.151906774485134, "grad_norm": 1.2182783393654972e-05, "learning_rate": 1.7022919155046722e-07, "loss": 0.0, "num_input_tokens_seen": 114526184, "step": 169950 }, { "epoch": 4.152028925316982, "grad_norm": 1.4844258657831233e-05, "learning_rate": 1.7018160077304633e-07, "loss": 0.0, "num_input_tokens_seen": 114529576, "step": 169955 }, { "epoch": 4.1521510761488285, "grad_norm": 5.435540060716448e-06, "learning_rate": 1.7013401603028822e-07, "loss": 0.0, "num_input_tokens_seen": 114532584, "step": 169960 }, { "epoch": 4.152273226980676, "grad_norm": 0.0001301129232160747, "learning_rate": 1.7008643732253848e-07, "loss": 0.0, "num_input_tokens_seen": 114535592, "step": 169965 }, { "epoch": 4.152395377812523, "grad_norm": 0.00018831691704690456, "learning_rate": 1.7003886465014362e-07, "loss": 0.0, "num_input_tokens_seen": 114538728, "step": 169970 }, { "epoch": 4.1525175286443705, "grad_norm": 0.0016461594495922327, "learning_rate": 1.6999129801344914e-07, "loss": 0.0, "num_input_tokens_seen": 114541992, "step": 169975 }, { "epoch": 4.152639679476217, "grad_norm": 0.00039509753696620464, "learning_rate": 1.699437374128011e-07, "loss": 0.0, "num_input_tokens_seen": 114545768, "step": 169980 }, { "epoch": 4.152761830308064, "grad_norm": 0.032739847898483276, "learning_rate": 1.698961828485458e-07, "loss": 0.0001, "num_input_tokens_seen": 114549096, "step": 169985 }, { "epoch": 4.152883981139912, "grad_norm": 0.0007005423540249467, "learning_rate": 1.698486343210288e-07, "loss": 0.0, "num_input_tokens_seen": 114552424, "step": 169990 }, { "epoch": 4.153006131971758, "grad_norm": 0.0006590968114323914, "learning_rate": 1.6980109183059544e-07, "loss": 0.0, "num_input_tokens_seen": 114555304, "step": 169995 }, { "epoch": 4.153128282803606, "grad_norm": 8.95965495146811e-05, "learning_rate": 1.6975355537759217e-07, "loss": 0.0, "num_input_tokens_seen": 114558440, "step": 170000 }, { "epoch": 4.153250433635453, "grad_norm": 3.4148648410337046e-05, "learning_rate": 1.6970602496236409e-07, "loss": 0.0, "num_input_tokens_seen": 114562088, "step": 170005 }, { "epoch": 4.1533725844673, "grad_norm": 3.2041094527812675e-05, "learning_rate": 1.6965850058525732e-07, "loss": 0.0, "num_input_tokens_seen": 114565160, "step": 170010 }, { "epoch": 4.153494735299147, "grad_norm": 0.0001849216059781611, "learning_rate": 1.6961098224661707e-07, "loss": 0.0, "num_input_tokens_seen": 114568232, "step": 170015 }, { "epoch": 4.153616886130995, "grad_norm": 9.298501936427783e-06, "learning_rate": 1.6956346994678926e-07, "loss": 0.0, "num_input_tokens_seen": 114572072, "step": 170020 }, { "epoch": 4.1537390369628415, "grad_norm": 0.00011811690637841821, "learning_rate": 1.695159636861191e-07, "loss": 0.0, "num_input_tokens_seen": 114575208, "step": 170025 }, { "epoch": 4.153861187794689, "grad_norm": 0.0005981141002848744, "learning_rate": 1.6946846346495248e-07, "loss": 0.0, "num_input_tokens_seen": 114578728, "step": 170030 }, { "epoch": 4.153983338626536, "grad_norm": 8.729274122742936e-05, "learning_rate": 1.6942096928363426e-07, "loss": 0.0, "num_input_tokens_seen": 114582312, "step": 170035 }, { "epoch": 4.1541054894583835, "grad_norm": 2.739803858275991e-05, "learning_rate": 1.6937348114251026e-07, "loss": 0.0, "num_input_tokens_seen": 114585768, "step": 170040 }, { "epoch": 4.15422764029023, "grad_norm": 0.00011632242240011692, "learning_rate": 1.693259990419259e-07, "loss": 0.0, "num_input_tokens_seen": 114589224, "step": 170045 }, { "epoch": 4.154349791122078, "grad_norm": 0.000368999462807551, "learning_rate": 1.69278522982226e-07, "loss": 0.0, "num_input_tokens_seen": 114592616, "step": 170050 }, { "epoch": 4.154471941953925, "grad_norm": 0.00010005565854953602, "learning_rate": 1.6923105296375638e-07, "loss": 0.0, "num_input_tokens_seen": 114596136, "step": 170055 }, { "epoch": 4.154594092785772, "grad_norm": 0.00019996245100628585, "learning_rate": 1.691835889868618e-07, "loss": 0.0, "num_input_tokens_seen": 114600488, "step": 170060 }, { "epoch": 4.154716243617619, "grad_norm": 7.243484287755564e-05, "learning_rate": 1.6913613105188785e-07, "loss": 0.0, "num_input_tokens_seen": 114603816, "step": 170065 }, { "epoch": 4.154838394449466, "grad_norm": 1.4196218216966372e-05, "learning_rate": 1.6908867915917924e-07, "loss": 0.0, "num_input_tokens_seen": 114607528, "step": 170070 }, { "epoch": 4.154960545281313, "grad_norm": 4.784507837030105e-05, "learning_rate": 1.6904123330908117e-07, "loss": 0.0, "num_input_tokens_seen": 114610856, "step": 170075 }, { "epoch": 4.15508269611316, "grad_norm": 7.885733793955296e-05, "learning_rate": 1.68993793501939e-07, "loss": 0.0, "num_input_tokens_seen": 114614248, "step": 170080 }, { "epoch": 4.155204846945008, "grad_norm": 0.0017045887652784586, "learning_rate": 1.6894635973809725e-07, "loss": 0.0, "num_input_tokens_seen": 114617512, "step": 170085 }, { "epoch": 4.1553269977768545, "grad_norm": 0.00018425517191644758, "learning_rate": 1.688989320179014e-07, "loss": 0.0, "num_input_tokens_seen": 114620776, "step": 170090 }, { "epoch": 4.155449148608702, "grad_norm": 2.087597022182308e-05, "learning_rate": 1.6885151034169577e-07, "loss": 0.0, "num_input_tokens_seen": 114623848, "step": 170095 }, { "epoch": 4.155571299440549, "grad_norm": 0.0001408069219905883, "learning_rate": 1.688040947098257e-07, "loss": 0.0, "num_input_tokens_seen": 114627176, "step": 170100 }, { "epoch": 4.155693450272397, "grad_norm": 0.00032099412055686116, "learning_rate": 1.6875668512263587e-07, "loss": 0.0, "num_input_tokens_seen": 114630952, "step": 170105 }, { "epoch": 4.155815601104243, "grad_norm": 5.4309595725499094e-05, "learning_rate": 1.6870928158047072e-07, "loss": 0.1071, "num_input_tokens_seen": 114634024, "step": 170110 }, { "epoch": 4.155937751936091, "grad_norm": 0.00032432761508971453, "learning_rate": 1.6866188408367553e-07, "loss": 0.0, "num_input_tokens_seen": 114637288, "step": 170115 }, { "epoch": 4.156059902767938, "grad_norm": 0.04092469438910484, "learning_rate": 1.6861449263259453e-07, "loss": 0.0, "num_input_tokens_seen": 114640680, "step": 170120 }, { "epoch": 4.156182053599785, "grad_norm": 0.03474748134613037, "learning_rate": 1.6856710722757273e-07, "loss": 0.0, "num_input_tokens_seen": 114644072, "step": 170125 }, { "epoch": 4.156304204431632, "grad_norm": 0.0026629886124283075, "learning_rate": 1.685197278689543e-07, "loss": 0.0, "num_input_tokens_seen": 114647528, "step": 170130 }, { "epoch": 4.15642635526348, "grad_norm": 0.00020655507978517562, "learning_rate": 1.6847235455708408e-07, "loss": 0.0, "num_input_tokens_seen": 114650792, "step": 170135 }, { "epoch": 4.1565485060953264, "grad_norm": 0.0001946508709806949, "learning_rate": 1.6842498729230682e-07, "loss": 0.0, "num_input_tokens_seen": 114653928, "step": 170140 }, { "epoch": 4.156670656927174, "grad_norm": 0.00028996478067710996, "learning_rate": 1.6837762607496654e-07, "loss": 0.0, "num_input_tokens_seen": 114658088, "step": 170145 }, { "epoch": 4.156792807759021, "grad_norm": 8.436523785348982e-05, "learning_rate": 1.6833027090540797e-07, "loss": 0.0, "num_input_tokens_seen": 114661288, "step": 170150 }, { "epoch": 4.156914958590868, "grad_norm": 0.00046180543722584844, "learning_rate": 1.6828292178397508e-07, "loss": 0.0, "num_input_tokens_seen": 114664552, "step": 170155 }, { "epoch": 4.157037109422715, "grad_norm": 5.629775841953233e-06, "learning_rate": 1.682355787110128e-07, "loss": 0.0, "num_input_tokens_seen": 114668840, "step": 170160 }, { "epoch": 4.157159260254562, "grad_norm": 0.023796724155545235, "learning_rate": 1.6818824168686486e-07, "loss": 0.0, "num_input_tokens_seen": 114671848, "step": 170165 }, { "epoch": 4.15728141108641, "grad_norm": 3.74505361833144e-05, "learning_rate": 1.6814091071187586e-07, "loss": 0.0, "num_input_tokens_seen": 114675240, "step": 170170 }, { "epoch": 4.157403561918256, "grad_norm": 0.0002260417240904644, "learning_rate": 1.6809358578639e-07, "loss": 0.0224, "num_input_tokens_seen": 114679016, "step": 170175 }, { "epoch": 4.157525712750104, "grad_norm": 0.00018604165234137326, "learning_rate": 1.680462669107512e-07, "loss": 0.0, "num_input_tokens_seen": 114682344, "step": 170180 }, { "epoch": 4.157647863581951, "grad_norm": 0.00011431350139901042, "learning_rate": 1.6799895408530385e-07, "loss": 0.0, "num_input_tokens_seen": 114685672, "step": 170185 }, { "epoch": 4.157770014413798, "grad_norm": 0.000264197209617123, "learning_rate": 1.679516473103917e-07, "loss": 0.0, "num_input_tokens_seen": 114689128, "step": 170190 }, { "epoch": 4.157892165245645, "grad_norm": 8.455871284240857e-05, "learning_rate": 1.6790434658635922e-07, "loss": 0.0, "num_input_tokens_seen": 114692136, "step": 170195 }, { "epoch": 4.158014316077493, "grad_norm": 1.8691958757699467e-05, "learning_rate": 1.6785705191354983e-07, "loss": 0.0, "num_input_tokens_seen": 114695592, "step": 170200 }, { "epoch": 4.1581364669093395, "grad_norm": 0.00012280538794584572, "learning_rate": 1.678097632923081e-07, "loss": 0.0, "num_input_tokens_seen": 114698856, "step": 170205 }, { "epoch": 4.158258617741187, "grad_norm": 0.00011621593876043335, "learning_rate": 1.677624807229776e-07, "loss": 0.0, "num_input_tokens_seen": 114702248, "step": 170210 }, { "epoch": 4.158380768573034, "grad_norm": 0.0011803361121565104, "learning_rate": 1.677152042059019e-07, "loss": 0.0, "num_input_tokens_seen": 114705448, "step": 170215 }, { "epoch": 4.1585029194048815, "grad_norm": 0.00014835498586762697, "learning_rate": 1.676679337414254e-07, "loss": 0.0, "num_input_tokens_seen": 114709096, "step": 170220 }, { "epoch": 4.158625070236728, "grad_norm": 6.039536674506962e-05, "learning_rate": 1.6762066932989128e-07, "loss": 0.0, "num_input_tokens_seen": 114712488, "step": 170225 }, { "epoch": 4.158747221068576, "grad_norm": 0.001145469374023378, "learning_rate": 1.6757341097164345e-07, "loss": 0.0, "num_input_tokens_seen": 114715752, "step": 170230 }, { "epoch": 4.158869371900423, "grad_norm": 8.06782190920785e-06, "learning_rate": 1.67526158667026e-07, "loss": 0.0, "num_input_tokens_seen": 114718760, "step": 170235 }, { "epoch": 4.15899152273227, "grad_norm": 2.5670822651591152e-05, "learning_rate": 1.67478912416382e-07, "loss": 0.0, "num_input_tokens_seen": 114722472, "step": 170240 }, { "epoch": 4.159113673564117, "grad_norm": 0.0001012508655549027, "learning_rate": 1.674316722200555e-07, "loss": 0.0, "num_input_tokens_seen": 114725928, "step": 170245 }, { "epoch": 4.159235824395964, "grad_norm": 0.0019791119266301394, "learning_rate": 1.6738443807838952e-07, "loss": 0.0, "num_input_tokens_seen": 114729768, "step": 170250 }, { "epoch": 4.159357975227811, "grad_norm": 4.3414085666881874e-05, "learning_rate": 1.6733720999172786e-07, "loss": 0.0, "num_input_tokens_seen": 114733224, "step": 170255 }, { "epoch": 4.159480126059658, "grad_norm": 0.00107354368083179, "learning_rate": 1.6728998796041428e-07, "loss": 0.0, "num_input_tokens_seen": 114736424, "step": 170260 }, { "epoch": 4.159602276891506, "grad_norm": 6.924165063537657e-05, "learning_rate": 1.6724277198479163e-07, "loss": 0.0, "num_input_tokens_seen": 114739368, "step": 170265 }, { "epoch": 4.1597244277233525, "grad_norm": 8.328318654093891e-05, "learning_rate": 1.6719556206520368e-07, "loss": 0.0, "num_input_tokens_seen": 114742376, "step": 170270 }, { "epoch": 4.1598465785552, "grad_norm": 0.002161898883059621, "learning_rate": 1.6714835820199347e-07, "loss": 0.0, "num_input_tokens_seen": 114745576, "step": 170275 }, { "epoch": 4.159968729387047, "grad_norm": 3.9121092413552105e-05, "learning_rate": 1.671011603955046e-07, "loss": 0.0, "num_input_tokens_seen": 114749032, "step": 170280 }, { "epoch": 4.1600908802188945, "grad_norm": 1.349920694337925e-05, "learning_rate": 1.670539686460799e-07, "loss": 0.0, "num_input_tokens_seen": 114752360, "step": 170285 }, { "epoch": 4.160213031050741, "grad_norm": 0.004583634901791811, "learning_rate": 1.6700678295406267e-07, "loss": 0.0, "num_input_tokens_seen": 114755752, "step": 170290 }, { "epoch": 4.160335181882589, "grad_norm": 0.00012262725795153528, "learning_rate": 1.6695960331979652e-07, "loss": 0.0, "num_input_tokens_seen": 114758952, "step": 170295 }, { "epoch": 4.160457332714436, "grad_norm": 1.902866097225342e-05, "learning_rate": 1.6691242974362417e-07, "loss": 0.0, "num_input_tokens_seen": 114762152, "step": 170300 }, { "epoch": 4.160579483546283, "grad_norm": 0.0009349206229671836, "learning_rate": 1.6686526222588847e-07, "loss": 0.0, "num_input_tokens_seen": 114765480, "step": 170305 }, { "epoch": 4.16070163437813, "grad_norm": 4.975054707756499e-06, "learning_rate": 1.6681810076693282e-07, "loss": 0.0, "num_input_tokens_seen": 114769192, "step": 170310 }, { "epoch": 4.160823785209978, "grad_norm": 2.176424823119305e-05, "learning_rate": 1.6677094536709991e-07, "loss": 0.0436, "num_input_tokens_seen": 114772328, "step": 170315 }, { "epoch": 4.160945936041824, "grad_norm": 0.0011520327534526587, "learning_rate": 1.6672379602673303e-07, "loss": 0.0, "num_input_tokens_seen": 114775720, "step": 170320 }, { "epoch": 4.161068086873672, "grad_norm": 0.0003704590199049562, "learning_rate": 1.666766527461745e-07, "loss": 0.0001, "num_input_tokens_seen": 114778856, "step": 170325 }, { "epoch": 4.161190237705519, "grad_norm": 0.001671323669143021, "learning_rate": 1.6662951552576787e-07, "loss": 0.0, "num_input_tokens_seen": 114782504, "step": 170330 }, { "epoch": 4.1613123885373655, "grad_norm": 0.00016249853069894016, "learning_rate": 1.6658238436585515e-07, "loss": 0.0, "num_input_tokens_seen": 114785704, "step": 170335 }, { "epoch": 4.161434539369213, "grad_norm": 0.0018108749063685536, "learning_rate": 1.665352592667798e-07, "loss": 0.0, "num_input_tokens_seen": 114788648, "step": 170340 }, { "epoch": 4.16155669020106, "grad_norm": 2.1458145056385547e-05, "learning_rate": 1.6648814022888403e-07, "loss": 0.0, "num_input_tokens_seen": 114791912, "step": 170345 }, { "epoch": 4.1616788410329075, "grad_norm": 1.716206497803796e-05, "learning_rate": 1.6644102725251063e-07, "loss": 0.0, "num_input_tokens_seen": 114795368, "step": 170350 }, { "epoch": 4.161800991864754, "grad_norm": 0.0001412465499015525, "learning_rate": 1.663939203380026e-07, "loss": 0.0, "num_input_tokens_seen": 114798696, "step": 170355 }, { "epoch": 4.161923142696602, "grad_norm": 0.00016346178017556667, "learning_rate": 1.6634681948570183e-07, "loss": 0.0, "num_input_tokens_seen": 114801704, "step": 170360 }, { "epoch": 4.162045293528449, "grad_norm": 0.011184005998075008, "learning_rate": 1.6629972469595155e-07, "loss": 0.0, "num_input_tokens_seen": 114804904, "step": 170365 }, { "epoch": 4.162167444360296, "grad_norm": 0.00012003963638562709, "learning_rate": 1.6625263596909368e-07, "loss": 0.0, "num_input_tokens_seen": 114808168, "step": 170370 }, { "epoch": 4.162289595192143, "grad_norm": 1.699275162536651e-05, "learning_rate": 1.6620555330547104e-07, "loss": 0.0, "num_input_tokens_seen": 114811432, "step": 170375 }, { "epoch": 4.162411746023991, "grad_norm": 0.004154087509959936, "learning_rate": 1.6615847670542572e-07, "loss": 0.0, "num_input_tokens_seen": 114815016, "step": 170380 }, { "epoch": 4.162533896855837, "grad_norm": 0.014836370013654232, "learning_rate": 1.661114061693002e-07, "loss": 0.0, "num_input_tokens_seen": 114818408, "step": 170385 }, { "epoch": 4.162656047687685, "grad_norm": 0.0008278019959107041, "learning_rate": 1.660643416974371e-07, "loss": 0.0, "num_input_tokens_seen": 114821544, "step": 170390 }, { "epoch": 4.162778198519532, "grad_norm": 6.109980404289672e-06, "learning_rate": 1.6601728329017818e-07, "loss": 0.0, "num_input_tokens_seen": 114825192, "step": 170395 }, { "epoch": 4.162900349351379, "grad_norm": 0.0004017290484625846, "learning_rate": 1.6597023094786612e-07, "loss": 0.0122, "num_input_tokens_seen": 114828776, "step": 170400 }, { "epoch": 4.163022500183226, "grad_norm": 9.658478666096926e-05, "learning_rate": 1.6592318467084255e-07, "loss": 0.0, "num_input_tokens_seen": 114831848, "step": 170405 }, { "epoch": 4.163144651015074, "grad_norm": 0.00031634565675631166, "learning_rate": 1.658761444594502e-07, "loss": 0.0, "num_input_tokens_seen": 114835304, "step": 170410 }, { "epoch": 4.1632668018469206, "grad_norm": 0.0033153025433421135, "learning_rate": 1.658291103140309e-07, "loss": 0.0, "num_input_tokens_seen": 114838696, "step": 170415 }, { "epoch": 4.163388952678767, "grad_norm": 4.208605605526827e-05, "learning_rate": 1.657820822349264e-07, "loss": 0.0, "num_input_tokens_seen": 114841768, "step": 170420 }, { "epoch": 4.163511103510615, "grad_norm": 0.005869393702596426, "learning_rate": 1.657350602224793e-07, "loss": 0.0794, "num_input_tokens_seen": 114845032, "step": 170425 }, { "epoch": 4.163633254342462, "grad_norm": 2.9876026019337587e-05, "learning_rate": 1.6568804427703088e-07, "loss": 0.0, "num_input_tokens_seen": 114848424, "step": 170430 }, { "epoch": 4.163755405174309, "grad_norm": 6.542223127325997e-05, "learning_rate": 1.6564103439892373e-07, "loss": 0.0, "num_input_tokens_seen": 114851816, "step": 170435 }, { "epoch": 4.163877556006156, "grad_norm": 4.480009010876529e-05, "learning_rate": 1.6559403058849909e-07, "loss": 0.0, "num_input_tokens_seen": 114855464, "step": 170440 }, { "epoch": 4.163999706838004, "grad_norm": 8.49221851240145e-06, "learning_rate": 1.6554703284609918e-07, "loss": 0.0, "num_input_tokens_seen": 114858856, "step": 170445 }, { "epoch": 4.16412185766985, "grad_norm": 0.0001956393534783274, "learning_rate": 1.6550004117206583e-07, "loss": 0.0, "num_input_tokens_seen": 114862376, "step": 170450 }, { "epoch": 4.164244008501698, "grad_norm": 0.0013749731006100774, "learning_rate": 1.6545305556674038e-07, "loss": 0.0, "num_input_tokens_seen": 114865256, "step": 170455 }, { "epoch": 4.164366159333545, "grad_norm": 0.0001273680099984631, "learning_rate": 1.6540607603046508e-07, "loss": 0.0, "num_input_tokens_seen": 114868520, "step": 170460 }, { "epoch": 4.1644883101653924, "grad_norm": 8.07244359748438e-05, "learning_rate": 1.653591025635811e-07, "loss": 0.0, "num_input_tokens_seen": 114871592, "step": 170465 }, { "epoch": 4.164610460997239, "grad_norm": 0.0012089071096852422, "learning_rate": 1.6531213516643028e-07, "loss": 0.0, "num_input_tokens_seen": 114874856, "step": 170470 }, { "epoch": 4.164732611829087, "grad_norm": 3.042860953428317e-05, "learning_rate": 1.6526517383935402e-07, "loss": 0.0, "num_input_tokens_seen": 114878248, "step": 170475 }, { "epoch": 4.164854762660934, "grad_norm": 6.760417454643175e-05, "learning_rate": 1.652182185826939e-07, "loss": 0.0, "num_input_tokens_seen": 114881576, "step": 170480 }, { "epoch": 4.164976913492781, "grad_norm": 1.345566943200538e-05, "learning_rate": 1.651712693967916e-07, "loss": 0.0, "num_input_tokens_seen": 114885352, "step": 170485 }, { "epoch": 4.165099064324628, "grad_norm": 0.00022467400413006544, "learning_rate": 1.6512432628198823e-07, "loss": 0.0, "num_input_tokens_seen": 114888872, "step": 170490 }, { "epoch": 4.165221215156476, "grad_norm": 6.48816603643354e-06, "learning_rate": 1.6507738923862546e-07, "loss": 0.0, "num_input_tokens_seen": 114892072, "step": 170495 }, { "epoch": 4.165343365988322, "grad_norm": 0.00019495896412990987, "learning_rate": 1.6503045826704433e-07, "loss": 0.0, "num_input_tokens_seen": 114895528, "step": 170500 }, { "epoch": 4.16546551682017, "grad_norm": 0.0014716989826411009, "learning_rate": 1.6498353336758653e-07, "loss": 0.0, "num_input_tokens_seen": 114899432, "step": 170505 }, { "epoch": 4.165587667652017, "grad_norm": 3.384602678124793e-05, "learning_rate": 1.649366145405927e-07, "loss": 0.0, "num_input_tokens_seen": 114902888, "step": 170510 }, { "epoch": 4.1657098184838635, "grad_norm": 0.00020877330098301172, "learning_rate": 1.6488970178640483e-07, "loss": 0.0, "num_input_tokens_seen": 114906216, "step": 170515 }, { "epoch": 4.165831969315711, "grad_norm": 4.302657544030808e-05, "learning_rate": 1.6484279510536358e-07, "loss": 0.0, "num_input_tokens_seen": 114909480, "step": 170520 }, { "epoch": 4.165954120147558, "grad_norm": 0.009582330472767353, "learning_rate": 1.6479589449780984e-07, "loss": 0.0, "num_input_tokens_seen": 114912680, "step": 170525 }, { "epoch": 4.1660762709794055, "grad_norm": 0.00048023121780715883, "learning_rate": 1.6474899996408532e-07, "loss": 0.0, "num_input_tokens_seen": 114915752, "step": 170530 }, { "epoch": 4.166198421811252, "grad_norm": 8.170756336767226e-05, "learning_rate": 1.647021115045305e-07, "loss": 0.0, "num_input_tokens_seen": 114919784, "step": 170535 }, { "epoch": 4.1663205726431, "grad_norm": 0.0010021156631410122, "learning_rate": 1.646552291194866e-07, "loss": 0.0, "num_input_tokens_seen": 114923496, "step": 170540 }, { "epoch": 4.166442723474947, "grad_norm": 0.00017045924323610961, "learning_rate": 1.6460835280929474e-07, "loss": 0.0, "num_input_tokens_seen": 114926888, "step": 170545 }, { "epoch": 4.166564874306794, "grad_norm": 0.005076151341199875, "learning_rate": 1.6456148257429537e-07, "loss": 0.0, "num_input_tokens_seen": 114930024, "step": 170550 }, { "epoch": 4.166687025138641, "grad_norm": 0.00014168965572025627, "learning_rate": 1.6451461841482994e-07, "loss": 0.0002, "num_input_tokens_seen": 114933352, "step": 170555 }, { "epoch": 4.166809175970489, "grad_norm": 8.979089216154534e-06, "learning_rate": 1.6446776033123866e-07, "loss": 0.0, "num_input_tokens_seen": 114936424, "step": 170560 }, { "epoch": 4.166931326802335, "grad_norm": 0.00012583896750584245, "learning_rate": 1.6442090832386246e-07, "loss": 0.0, "num_input_tokens_seen": 114939688, "step": 170565 }, { "epoch": 4.167053477634183, "grad_norm": 0.0013897777535021305, "learning_rate": 1.6437406239304253e-07, "loss": 0.0, "num_input_tokens_seen": 114942824, "step": 170570 }, { "epoch": 4.16717562846603, "grad_norm": 0.0002839156368281692, "learning_rate": 1.643272225391188e-07, "loss": 0.0, "num_input_tokens_seen": 114946472, "step": 170575 }, { "epoch": 4.167297779297877, "grad_norm": 0.0005325472448021173, "learning_rate": 1.6428038876243266e-07, "loss": 0.0, "num_input_tokens_seen": 114949800, "step": 170580 }, { "epoch": 4.167419930129724, "grad_norm": 0.00025493022985756397, "learning_rate": 1.6423356106332398e-07, "loss": 0.0, "num_input_tokens_seen": 114953064, "step": 170585 }, { "epoch": 4.167542080961572, "grad_norm": 4.253402494214242e-06, "learning_rate": 1.641867394421339e-07, "loss": 0.0, "num_input_tokens_seen": 114956840, "step": 170590 }, { "epoch": 4.1676642317934185, "grad_norm": 0.00014483621635008603, "learning_rate": 1.641399238992024e-07, "loss": 0.0, "num_input_tokens_seen": 114959848, "step": 170595 }, { "epoch": 4.167786382625266, "grad_norm": 1.984128357435111e-05, "learning_rate": 1.640931144348703e-07, "loss": 0.0, "num_input_tokens_seen": 114963176, "step": 170600 }, { "epoch": 4.167908533457113, "grad_norm": 0.008027193136513233, "learning_rate": 1.6404631104947798e-07, "loss": 0.0, "num_input_tokens_seen": 114966888, "step": 170605 }, { "epoch": 4.16803068428896, "grad_norm": 0.002142030978575349, "learning_rate": 1.6399951374336585e-07, "loss": 0.0, "num_input_tokens_seen": 114970344, "step": 170610 }, { "epoch": 4.168152835120807, "grad_norm": 3.685107230921858e-06, "learning_rate": 1.6395272251687386e-07, "loss": 0.0, "num_input_tokens_seen": 114973864, "step": 170615 }, { "epoch": 4.168274985952654, "grad_norm": 8.950922165240627e-06, "learning_rate": 1.6390593737034276e-07, "loss": 0.0, "num_input_tokens_seen": 114976744, "step": 170620 }, { "epoch": 4.168397136784502, "grad_norm": 0.00010310253128409386, "learning_rate": 1.6385915830411223e-07, "loss": 0.0, "num_input_tokens_seen": 114979944, "step": 170625 }, { "epoch": 4.168519287616348, "grad_norm": 3.5585057048592716e-05, "learning_rate": 1.6381238531852314e-07, "loss": 0.0, "num_input_tokens_seen": 114983144, "step": 170630 }, { "epoch": 4.168641438448196, "grad_norm": 7.778478175168857e-05, "learning_rate": 1.6376561841391501e-07, "loss": 0.0, "num_input_tokens_seen": 114986472, "step": 170635 }, { "epoch": 4.168763589280043, "grad_norm": 0.00018595813889987767, "learning_rate": 1.6371885759062853e-07, "loss": 0.0, "num_input_tokens_seen": 114989608, "step": 170640 }, { "epoch": 4.16888574011189, "grad_norm": 0.00012617114407476038, "learning_rate": 1.6367210284900324e-07, "loss": 0.0, "num_input_tokens_seen": 114992808, "step": 170645 }, { "epoch": 4.169007890943737, "grad_norm": 0.000265518989181146, "learning_rate": 1.636253541893795e-07, "loss": 0.0, "num_input_tokens_seen": 114995880, "step": 170650 }, { "epoch": 4.169130041775585, "grad_norm": 0.0011026192223653197, "learning_rate": 1.6357861161209695e-07, "loss": 0.0, "num_input_tokens_seen": 114999272, "step": 170655 }, { "epoch": 4.1692521926074315, "grad_norm": 2.0162886357866228e-05, "learning_rate": 1.6353187511749565e-07, "loss": 0.0001, "num_input_tokens_seen": 115002664, "step": 170660 }, { "epoch": 4.169374343439279, "grad_norm": 4.777355206897482e-05, "learning_rate": 1.6348514470591578e-07, "loss": 0.0, "num_input_tokens_seen": 115005800, "step": 170665 }, { "epoch": 4.169496494271126, "grad_norm": 0.02091538906097412, "learning_rate": 1.6343842037769673e-07, "loss": 0.0, "num_input_tokens_seen": 115009256, "step": 170670 }, { "epoch": 4.1696186451029735, "grad_norm": 0.00016440707258880138, "learning_rate": 1.6339170213317877e-07, "loss": 0.0, "num_input_tokens_seen": 115012776, "step": 170675 }, { "epoch": 4.16974079593482, "grad_norm": 3.5134512472723145e-06, "learning_rate": 1.6334498997270108e-07, "loss": 0.0, "num_input_tokens_seen": 115015912, "step": 170680 }, { "epoch": 4.169862946766668, "grad_norm": 2.0491874238359742e-05, "learning_rate": 1.6329828389660394e-07, "loss": 0.0, "num_input_tokens_seen": 115019432, "step": 170685 }, { "epoch": 4.169985097598515, "grad_norm": 1.233548391610384e-05, "learning_rate": 1.6325158390522642e-07, "loss": 0.0, "num_input_tokens_seen": 115022568, "step": 170690 }, { "epoch": 4.170107248430361, "grad_norm": 0.00029678287683054805, "learning_rate": 1.6320488999890847e-07, "loss": 0.0, "num_input_tokens_seen": 115026088, "step": 170695 }, { "epoch": 4.170229399262209, "grad_norm": 0.0004376185534056276, "learning_rate": 1.6315820217798992e-07, "loss": 0.0001, "num_input_tokens_seen": 115029480, "step": 170700 }, { "epoch": 4.170351550094056, "grad_norm": 0.000860581174492836, "learning_rate": 1.6311152044280973e-07, "loss": 0.0, "num_input_tokens_seen": 115033128, "step": 170705 }, { "epoch": 4.170473700925903, "grad_norm": 4.864812581217848e-05, "learning_rate": 1.6306484479370786e-07, "loss": 0.0, "num_input_tokens_seen": 115036392, "step": 170710 }, { "epoch": 4.17059585175775, "grad_norm": 0.00042654649587348104, "learning_rate": 1.6301817523102335e-07, "loss": 0.0, "num_input_tokens_seen": 115040104, "step": 170715 }, { "epoch": 4.170718002589598, "grad_norm": 2.155587571905926e-05, "learning_rate": 1.6297151175509606e-07, "loss": 0.0, "num_input_tokens_seen": 115043560, "step": 170720 }, { "epoch": 4.1708401534214445, "grad_norm": 1.0149953595828265e-05, "learning_rate": 1.6292485436626502e-07, "loss": 0.0, "num_input_tokens_seen": 115046888, "step": 170725 }, { "epoch": 4.170962304253292, "grad_norm": 6.887714698677883e-05, "learning_rate": 1.6287820306486944e-07, "loss": 0.0, "num_input_tokens_seen": 115050024, "step": 170730 }, { "epoch": 4.171084455085139, "grad_norm": 4.840194378630258e-05, "learning_rate": 1.628315578512488e-07, "loss": 0.0, "num_input_tokens_seen": 115053224, "step": 170735 }, { "epoch": 4.1712066059169866, "grad_norm": 7.268014087458141e-06, "learning_rate": 1.6278491872574218e-07, "loss": 0.0, "num_input_tokens_seen": 115056424, "step": 170740 }, { "epoch": 4.171328756748833, "grad_norm": 0.002297777682542801, "learning_rate": 1.6273828568868886e-07, "loss": 0.0, "num_input_tokens_seen": 115059624, "step": 170745 }, { "epoch": 4.171450907580681, "grad_norm": 0.09131614118814468, "learning_rate": 1.6269165874042788e-07, "loss": 0.0, "num_input_tokens_seen": 115062888, "step": 170750 }, { "epoch": 4.171573058412528, "grad_norm": 0.00038020877400413156, "learning_rate": 1.6264503788129825e-07, "loss": 0.0, "num_input_tokens_seen": 115066152, "step": 170755 }, { "epoch": 4.171695209244375, "grad_norm": 8.89160655788146e-05, "learning_rate": 1.625984231116394e-07, "loss": 0.0001, "num_input_tokens_seen": 115069864, "step": 170760 }, { "epoch": 4.171817360076222, "grad_norm": 0.0017276030266657472, "learning_rate": 1.625518144317898e-07, "loss": 0.0, "num_input_tokens_seen": 115073128, "step": 170765 }, { "epoch": 4.17193951090807, "grad_norm": 0.00035557913361117244, "learning_rate": 1.6250521184208888e-07, "loss": 0.0, "num_input_tokens_seen": 115076264, "step": 170770 }, { "epoch": 4.172061661739916, "grad_norm": 6.348582974169403e-05, "learning_rate": 1.624586153428751e-07, "loss": 0.0397, "num_input_tokens_seen": 115079976, "step": 170775 }, { "epoch": 4.172183812571763, "grad_norm": 0.0005099988193251193, "learning_rate": 1.624120249344878e-07, "loss": 0.0, "num_input_tokens_seen": 115083624, "step": 170780 }, { "epoch": 4.172305963403611, "grad_norm": 0.0015803297283127904, "learning_rate": 1.623654406172652e-07, "loss": 0.0, "num_input_tokens_seen": 115086824, "step": 170785 }, { "epoch": 4.172428114235458, "grad_norm": 0.00015540645108558238, "learning_rate": 1.6231886239154647e-07, "loss": 0.0, "num_input_tokens_seen": 115090536, "step": 170790 }, { "epoch": 4.172550265067305, "grad_norm": 2.5943469154299237e-05, "learning_rate": 1.6227229025767052e-07, "loss": 0.0, "num_input_tokens_seen": 115093480, "step": 170795 }, { "epoch": 4.172672415899152, "grad_norm": 0.000604168395511806, "learning_rate": 1.6222572421597558e-07, "loss": 0.0, "num_input_tokens_seen": 115096680, "step": 170800 }, { "epoch": 4.172794566731, "grad_norm": 1.3740186659561004e-05, "learning_rate": 1.621791642668008e-07, "loss": 0.0, "num_input_tokens_seen": 115100584, "step": 170805 }, { "epoch": 4.172916717562846, "grad_norm": 8.320253073179629e-06, "learning_rate": 1.621326104104842e-07, "loss": 0.0, "num_input_tokens_seen": 115104104, "step": 170810 }, { "epoch": 4.173038868394694, "grad_norm": 7.703889423282817e-06, "learning_rate": 1.620860626473648e-07, "loss": 0.0, "num_input_tokens_seen": 115107304, "step": 170815 }, { "epoch": 4.173161019226541, "grad_norm": 2.276463965245057e-05, "learning_rate": 1.6203952097778073e-07, "loss": 0.0, "num_input_tokens_seen": 115110696, "step": 170820 }, { "epoch": 4.173283170058388, "grad_norm": 0.00026030722074210644, "learning_rate": 1.6199298540207086e-07, "loss": 0.0, "num_input_tokens_seen": 115113896, "step": 170825 }, { "epoch": 4.173405320890235, "grad_norm": 4.1503084503347054e-05, "learning_rate": 1.6194645592057343e-07, "loss": 0.0, "num_input_tokens_seen": 115117288, "step": 170830 }, { "epoch": 4.173527471722083, "grad_norm": 7.145016297727125e-06, "learning_rate": 1.6189993253362655e-07, "loss": 0.0, "num_input_tokens_seen": 115120488, "step": 170835 }, { "epoch": 4.1736496225539295, "grad_norm": 0.0001081978261936456, "learning_rate": 1.6185341524156904e-07, "loss": 0.0, "num_input_tokens_seen": 115123880, "step": 170840 }, { "epoch": 4.173771773385777, "grad_norm": 0.009872562251985073, "learning_rate": 1.6180690404473862e-07, "loss": 0.0, "num_input_tokens_seen": 115127016, "step": 170845 }, { "epoch": 4.173893924217624, "grad_norm": 0.001063234987668693, "learning_rate": 1.6176039894347382e-07, "loss": 0.0, "num_input_tokens_seen": 115130408, "step": 170850 }, { "epoch": 4.1740160750494715, "grad_norm": 9.49207696976373e-06, "learning_rate": 1.6171389993811323e-07, "loss": 0.0, "num_input_tokens_seen": 115133928, "step": 170855 }, { "epoch": 4.174138225881318, "grad_norm": 4.45809455413837e-05, "learning_rate": 1.616674070289943e-07, "loss": 0.0, "num_input_tokens_seen": 115137704, "step": 170860 }, { "epoch": 4.174260376713166, "grad_norm": 0.00028284022118896246, "learning_rate": 1.6162092021645569e-07, "loss": 0.0, "num_input_tokens_seen": 115141288, "step": 170865 }, { "epoch": 4.174382527545013, "grad_norm": 0.0019308615010231733, "learning_rate": 1.6157443950083504e-07, "loss": 0.0, "num_input_tokens_seen": 115144616, "step": 170870 }, { "epoch": 4.174504678376859, "grad_norm": 3.211089506294229e-06, "learning_rate": 1.6152796488247078e-07, "loss": 0.0, "num_input_tokens_seen": 115148072, "step": 170875 }, { "epoch": 4.174626829208707, "grad_norm": 7.433198788930895e-06, "learning_rate": 1.614814963617005e-07, "loss": 0.0, "num_input_tokens_seen": 115151464, "step": 170880 }, { "epoch": 4.174748980040554, "grad_norm": 0.00026477460050955415, "learning_rate": 1.6143503393886225e-07, "loss": 0.0, "num_input_tokens_seen": 115154920, "step": 170885 }, { "epoch": 4.174871130872401, "grad_norm": 0.0009165243827737868, "learning_rate": 1.6138857761429436e-07, "loss": 0.0, "num_input_tokens_seen": 115158184, "step": 170890 }, { "epoch": 4.174993281704248, "grad_norm": 3.91147805203218e-05, "learning_rate": 1.6134212738833385e-07, "loss": 0.0, "num_input_tokens_seen": 115161256, "step": 170895 }, { "epoch": 4.175115432536096, "grad_norm": 5.272724592941813e-05, "learning_rate": 1.6129568326131936e-07, "loss": 0.0, "num_input_tokens_seen": 115164520, "step": 170900 }, { "epoch": 4.1752375833679425, "grad_norm": 0.40251025557518005, "learning_rate": 1.6124924523358795e-07, "loss": 0.0002, "num_input_tokens_seen": 115167848, "step": 170905 }, { "epoch": 4.17535973419979, "grad_norm": 8.896431972971186e-05, "learning_rate": 1.612028133054776e-07, "loss": 0.0, "num_input_tokens_seen": 115171624, "step": 170910 }, { "epoch": 4.175481885031637, "grad_norm": 0.00010786348866531625, "learning_rate": 1.611563874773263e-07, "loss": 0.0, "num_input_tokens_seen": 115175016, "step": 170915 }, { "epoch": 4.1756040358634845, "grad_norm": 0.000168164013302885, "learning_rate": 1.6110996774947127e-07, "loss": 0.0, "num_input_tokens_seen": 115178600, "step": 170920 }, { "epoch": 4.175726186695331, "grad_norm": 0.0033566667698323727, "learning_rate": 1.6106355412225003e-07, "loss": 0.0, "num_input_tokens_seen": 115181928, "step": 170925 }, { "epoch": 4.175848337527179, "grad_norm": 2.072779716399964e-05, "learning_rate": 1.610171465960005e-07, "loss": 0.0, "num_input_tokens_seen": 115185192, "step": 170930 }, { "epoch": 4.175970488359026, "grad_norm": 0.003940373659133911, "learning_rate": 1.6097074517105967e-07, "loss": 0.0307, "num_input_tokens_seen": 115188904, "step": 170935 }, { "epoch": 4.176092639190873, "grad_norm": 8.632901881355792e-05, "learning_rate": 1.609243498477656e-07, "loss": 0.0, "num_input_tokens_seen": 115192040, "step": 170940 }, { "epoch": 4.17621479002272, "grad_norm": 0.002263226080685854, "learning_rate": 1.6087796062645499e-07, "loss": 0.0, "num_input_tokens_seen": 115195240, "step": 170945 }, { "epoch": 4.176336940854568, "grad_norm": 0.0014102370478212833, "learning_rate": 1.608315775074658e-07, "loss": 0.0, "num_input_tokens_seen": 115198760, "step": 170950 }, { "epoch": 4.176459091686414, "grad_norm": 8.044674177654088e-05, "learning_rate": 1.6078520049113485e-07, "loss": 0.0, "num_input_tokens_seen": 115201960, "step": 170955 }, { "epoch": 4.176581242518261, "grad_norm": 0.00010650189506122842, "learning_rate": 1.6073882957779993e-07, "loss": 0.0, "num_input_tokens_seen": 115205096, "step": 170960 }, { "epoch": 4.176703393350109, "grad_norm": 4.333862307248637e-05, "learning_rate": 1.6069246476779774e-07, "loss": 0.0, "num_input_tokens_seen": 115208808, "step": 170965 }, { "epoch": 4.1768255441819555, "grad_norm": 5.672447059623664e-06, "learning_rate": 1.6064610606146567e-07, "loss": 0.0, "num_input_tokens_seen": 115211816, "step": 170970 }, { "epoch": 4.176947695013803, "grad_norm": 2.8347905754344538e-05, "learning_rate": 1.60599753459141e-07, "loss": 0.0, "num_input_tokens_seen": 115215912, "step": 170975 }, { "epoch": 4.17706984584565, "grad_norm": 5.478051025420427e-05, "learning_rate": 1.605534069611606e-07, "loss": 0.0, "num_input_tokens_seen": 115219240, "step": 170980 }, { "epoch": 4.1771919966774975, "grad_norm": 8.117170182231348e-06, "learning_rate": 1.6050706656786184e-07, "loss": 0.0, "num_input_tokens_seen": 115222632, "step": 170985 }, { "epoch": 4.177314147509344, "grad_norm": 1.2525510101113468e-05, "learning_rate": 1.6046073227958123e-07, "loss": 0.0, "num_input_tokens_seen": 115226088, "step": 170990 }, { "epoch": 4.177436298341192, "grad_norm": 3.1837484129937366e-05, "learning_rate": 1.6041440409665618e-07, "loss": 0.0, "num_input_tokens_seen": 115229736, "step": 170995 }, { "epoch": 4.177558449173039, "grad_norm": 0.00021777056099381298, "learning_rate": 1.603680820194232e-07, "loss": 0.0, "num_input_tokens_seen": 115233128, "step": 171000 }, { "epoch": 4.177680600004886, "grad_norm": 0.00034911080729216337, "learning_rate": 1.6032176604821933e-07, "loss": 0.0, "num_input_tokens_seen": 115236584, "step": 171005 }, { "epoch": 4.177802750836733, "grad_norm": 3.63235485565383e-05, "learning_rate": 1.6027545618338166e-07, "loss": 0.0, "num_input_tokens_seen": 115240296, "step": 171010 }, { "epoch": 4.177924901668581, "grad_norm": 0.0006223563104867935, "learning_rate": 1.6022915242524659e-07, "loss": 0.0, "num_input_tokens_seen": 115244648, "step": 171015 }, { "epoch": 4.178047052500427, "grad_norm": 1.144667203334393e-05, "learning_rate": 1.6018285477415116e-07, "loss": 0.0, "num_input_tokens_seen": 115248488, "step": 171020 }, { "epoch": 4.178169203332275, "grad_norm": 4.863789945375174e-05, "learning_rate": 1.6013656323043166e-07, "loss": 0.0, "num_input_tokens_seen": 115252072, "step": 171025 }, { "epoch": 4.178291354164122, "grad_norm": 4.818748038815102e-06, "learning_rate": 1.6009027779442519e-07, "loss": 0.0, "num_input_tokens_seen": 115255592, "step": 171030 }, { "epoch": 4.178413504995969, "grad_norm": 0.0002015796781051904, "learning_rate": 1.600439984664681e-07, "loss": 0.0, "num_input_tokens_seen": 115259112, "step": 171035 }, { "epoch": 4.178535655827816, "grad_norm": 0.00025556082255207, "learning_rate": 1.599977252468968e-07, "loss": 0.0, "num_input_tokens_seen": 115261864, "step": 171040 }, { "epoch": 4.178657806659663, "grad_norm": 0.0024212037678807974, "learning_rate": 1.5995145813604815e-07, "loss": 0.0, "num_input_tokens_seen": 115265000, "step": 171045 }, { "epoch": 4.1787799574915105, "grad_norm": 4.863837602897547e-05, "learning_rate": 1.5990519713425832e-07, "loss": 0.0, "num_input_tokens_seen": 115268520, "step": 171050 }, { "epoch": 4.178902108323357, "grad_norm": 0.0003118966124020517, "learning_rate": 1.5985894224186401e-07, "loss": 0.0, "num_input_tokens_seen": 115271656, "step": 171055 }, { "epoch": 4.179024259155205, "grad_norm": 0.0437767393887043, "learning_rate": 1.5981269345920123e-07, "loss": 0.0, "num_input_tokens_seen": 115274728, "step": 171060 }, { "epoch": 4.179146409987052, "grad_norm": 1.679328124737367e-05, "learning_rate": 1.5976645078660643e-07, "loss": 0.0, "num_input_tokens_seen": 115278440, "step": 171065 }, { "epoch": 4.179268560818899, "grad_norm": 8.560314745409414e-05, "learning_rate": 1.597202142244164e-07, "loss": 0.0, "num_input_tokens_seen": 115281832, "step": 171070 }, { "epoch": 4.179390711650746, "grad_norm": 0.00014771960559301078, "learning_rate": 1.5967398377296658e-07, "loss": 0.0, "num_input_tokens_seen": 115285160, "step": 171075 }, { "epoch": 4.179512862482594, "grad_norm": 9.891157242236659e-05, "learning_rate": 1.59627759432594e-07, "loss": 0.0002, "num_input_tokens_seen": 115288232, "step": 171080 }, { "epoch": 4.17963501331444, "grad_norm": 3.114061109954491e-05, "learning_rate": 1.5958154120363398e-07, "loss": 0.0, "num_input_tokens_seen": 115291304, "step": 171085 }, { "epoch": 4.179757164146288, "grad_norm": 0.0009680325165390968, "learning_rate": 1.595353290864233e-07, "loss": 0.058, "num_input_tokens_seen": 115294184, "step": 171090 }, { "epoch": 4.179879314978135, "grad_norm": 1.1188540156581439e-05, "learning_rate": 1.594891230812976e-07, "loss": 0.0, "num_input_tokens_seen": 115297512, "step": 171095 }, { "epoch": 4.180001465809982, "grad_norm": 8.53309848025674e-06, "learning_rate": 1.59442923188593e-07, "loss": 0.0, "num_input_tokens_seen": 115301032, "step": 171100 }, { "epoch": 4.180123616641829, "grad_norm": 1.0541667506913655e-05, "learning_rate": 1.5939672940864578e-07, "loss": 0.0, "num_input_tokens_seen": 115303912, "step": 171105 }, { "epoch": 4.180245767473677, "grad_norm": 0.001356814056634903, "learning_rate": 1.5935054174179142e-07, "loss": 0.0, "num_input_tokens_seen": 115306856, "step": 171110 }, { "epoch": 4.180367918305524, "grad_norm": 1.8071999875246547e-05, "learning_rate": 1.5930436018836635e-07, "loss": 0.0, "num_input_tokens_seen": 115310184, "step": 171115 }, { "epoch": 4.180490069137371, "grad_norm": 0.00012888468336313963, "learning_rate": 1.5925818474870578e-07, "loss": 0.0693, "num_input_tokens_seen": 115313064, "step": 171120 }, { "epoch": 4.180612219969218, "grad_norm": 0.0025817484129220247, "learning_rate": 1.59212015423146e-07, "loss": 0.0, "num_input_tokens_seen": 115316584, "step": 171125 }, { "epoch": 4.180734370801066, "grad_norm": 0.00033562208409421146, "learning_rate": 1.5916585221202238e-07, "loss": 0.0, "num_input_tokens_seen": 115319912, "step": 171130 }, { "epoch": 4.180856521632912, "grad_norm": 1.0221515367447864e-05, "learning_rate": 1.5911969511567113e-07, "loss": 0.0, "num_input_tokens_seen": 115323112, "step": 171135 }, { "epoch": 4.180978672464759, "grad_norm": 0.000600189610850066, "learning_rate": 1.5907354413442765e-07, "loss": 0.0, "num_input_tokens_seen": 115325992, "step": 171140 }, { "epoch": 4.181100823296607, "grad_norm": 1.8580491087050177e-05, "learning_rate": 1.590273992686273e-07, "loss": 0.0, "num_input_tokens_seen": 115329576, "step": 171145 }, { "epoch": 4.1812229741284535, "grad_norm": 4.74736452815705e-06, "learning_rate": 1.5898126051860606e-07, "loss": 0.0, "num_input_tokens_seen": 115332968, "step": 171150 }, { "epoch": 4.181345124960301, "grad_norm": 7.126467153284466e-06, "learning_rate": 1.589351278846991e-07, "loss": 0.0, "num_input_tokens_seen": 115336168, "step": 171155 }, { "epoch": 4.181467275792148, "grad_norm": 5.6934957683552057e-05, "learning_rate": 1.5888900136724203e-07, "loss": 0.0, "num_input_tokens_seen": 115339624, "step": 171160 }, { "epoch": 4.1815894266239955, "grad_norm": 0.0008534068474546075, "learning_rate": 1.5884288096657071e-07, "loss": 0.0, "num_input_tokens_seen": 115343208, "step": 171165 }, { "epoch": 4.181711577455842, "grad_norm": 3.3917807741090655e-05, "learning_rate": 1.5879676668302e-07, "loss": 0.0, "num_input_tokens_seen": 115346344, "step": 171170 }, { "epoch": 4.18183372828769, "grad_norm": 0.00018031641957350075, "learning_rate": 1.587506585169256e-07, "loss": 0.0, "num_input_tokens_seen": 115349608, "step": 171175 }, { "epoch": 4.181955879119537, "grad_norm": 0.00013050250709056854, "learning_rate": 1.5870455646862246e-07, "loss": 0.0, "num_input_tokens_seen": 115353192, "step": 171180 }, { "epoch": 4.182078029951384, "grad_norm": 4.1380972106708214e-05, "learning_rate": 1.5865846053844634e-07, "loss": 0.0, "num_input_tokens_seen": 115356328, "step": 171185 }, { "epoch": 4.182200180783231, "grad_norm": 3.544270293787122e-05, "learning_rate": 1.5861237072673194e-07, "loss": 0.0, "num_input_tokens_seen": 115359720, "step": 171190 }, { "epoch": 4.182322331615079, "grad_norm": 4.615875513991341e-05, "learning_rate": 1.585662870338147e-07, "loss": 0.0, "num_input_tokens_seen": 115363048, "step": 171195 }, { "epoch": 4.182444482446925, "grad_norm": 0.00040074848220683634, "learning_rate": 1.5852020946002998e-07, "loss": 0.0, "num_input_tokens_seen": 115366568, "step": 171200 }, { "epoch": 4.182566633278773, "grad_norm": 4.493439064390259e-06, "learning_rate": 1.584741380057123e-07, "loss": 0.0, "num_input_tokens_seen": 115369832, "step": 171205 }, { "epoch": 4.18268878411062, "grad_norm": 0.004250842146575451, "learning_rate": 1.584280726711974e-07, "loss": 0.0, "num_input_tokens_seen": 115373096, "step": 171210 }, { "epoch": 4.182810934942467, "grad_norm": 8.616738341515884e-06, "learning_rate": 1.5838201345681957e-07, "loss": 0.0, "num_input_tokens_seen": 115376040, "step": 171215 }, { "epoch": 4.182933085774314, "grad_norm": 0.001658809371292591, "learning_rate": 1.5833596036291408e-07, "loss": 0.0, "num_input_tokens_seen": 115380072, "step": 171220 }, { "epoch": 4.183055236606162, "grad_norm": 0.002577235922217369, "learning_rate": 1.5828991338981623e-07, "loss": 0.0, "num_input_tokens_seen": 115383464, "step": 171225 }, { "epoch": 4.1831773874380085, "grad_norm": 0.0004686212632805109, "learning_rate": 1.5824387253786043e-07, "loss": 0.0, "num_input_tokens_seen": 115386920, "step": 171230 }, { "epoch": 4.183299538269855, "grad_norm": 0.002691245172172785, "learning_rate": 1.581978378073814e-07, "loss": 0.001, "num_input_tokens_seen": 115390632, "step": 171235 }, { "epoch": 4.183421689101703, "grad_norm": 2.6709578378358856e-05, "learning_rate": 1.581518091987144e-07, "loss": 0.0, "num_input_tokens_seen": 115393512, "step": 171240 }, { "epoch": 4.18354383993355, "grad_norm": 0.0002521930728107691, "learning_rate": 1.5810578671219355e-07, "loss": 0.0, "num_input_tokens_seen": 115396776, "step": 171245 }, { "epoch": 4.183665990765397, "grad_norm": 0.005020914599299431, "learning_rate": 1.5805977034815409e-07, "loss": 0.0, "num_input_tokens_seen": 115399912, "step": 171250 }, { "epoch": 4.183788141597244, "grad_norm": 0.000299194100080058, "learning_rate": 1.5801376010693024e-07, "loss": 0.0, "num_input_tokens_seen": 115403048, "step": 171255 }, { "epoch": 4.183910292429092, "grad_norm": 1.2573545973282307e-05, "learning_rate": 1.5796775598885703e-07, "loss": 0.0, "num_input_tokens_seen": 115406248, "step": 171260 }, { "epoch": 4.184032443260938, "grad_norm": 0.00023871743178460747, "learning_rate": 1.5792175799426855e-07, "loss": 0.0, "num_input_tokens_seen": 115409512, "step": 171265 }, { "epoch": 4.184154594092786, "grad_norm": 0.001035814406350255, "learning_rate": 1.5787576612349973e-07, "loss": 0.0, "num_input_tokens_seen": 115412840, "step": 171270 }, { "epoch": 4.184276744924633, "grad_norm": 0.0006627806578762829, "learning_rate": 1.5782978037688466e-07, "loss": 0.0, "num_input_tokens_seen": 115415976, "step": 171275 }, { "epoch": 4.18439889575648, "grad_norm": 1.587512633705046e-05, "learning_rate": 1.5778380075475818e-07, "loss": 0.0, "num_input_tokens_seen": 115419432, "step": 171280 }, { "epoch": 4.184521046588327, "grad_norm": 0.000416866154409945, "learning_rate": 1.5773782725745412e-07, "loss": 0.0, "num_input_tokens_seen": 115422952, "step": 171285 }, { "epoch": 4.184643197420175, "grad_norm": 0.024858905002474785, "learning_rate": 1.576918598853072e-07, "loss": 0.0, "num_input_tokens_seen": 115426216, "step": 171290 }, { "epoch": 4.1847653482520215, "grad_norm": 0.00018218440527562052, "learning_rate": 1.5764589863865187e-07, "loss": 0.0, "num_input_tokens_seen": 115429864, "step": 171295 }, { "epoch": 4.184887499083869, "grad_norm": 4.92211984237656e-06, "learning_rate": 1.575999435178218e-07, "loss": 0.0002, "num_input_tokens_seen": 115433640, "step": 171300 }, { "epoch": 4.185009649915716, "grad_norm": 0.00018195451411884278, "learning_rate": 1.5755399452315176e-07, "loss": 0.0, "num_input_tokens_seen": 115436840, "step": 171305 }, { "epoch": 4.185131800747563, "grad_norm": 1.3570603186963126e-05, "learning_rate": 1.575080516549755e-07, "loss": 0.0, "num_input_tokens_seen": 115440360, "step": 171310 }, { "epoch": 4.18525395157941, "grad_norm": 0.0008919899701140821, "learning_rate": 1.5746211491362726e-07, "loss": 0.0, "num_input_tokens_seen": 115443880, "step": 171315 }, { "epoch": 4.185376102411257, "grad_norm": 4.4831544073531404e-05, "learning_rate": 1.5741618429944136e-07, "loss": 0.0, "num_input_tokens_seen": 115447144, "step": 171320 }, { "epoch": 4.185498253243105, "grad_norm": 0.0010451078414916992, "learning_rate": 1.5737025981275143e-07, "loss": 0.0, "num_input_tokens_seen": 115450600, "step": 171325 }, { "epoch": 4.185620404074951, "grad_norm": 0.008530599996447563, "learning_rate": 1.5732434145389185e-07, "loss": 0.0, "num_input_tokens_seen": 115453864, "step": 171330 }, { "epoch": 4.185742554906799, "grad_norm": 7.52803825889714e-05, "learning_rate": 1.57278429223196e-07, "loss": 0.0, "num_input_tokens_seen": 115457064, "step": 171335 }, { "epoch": 4.185864705738646, "grad_norm": 0.00021106674103066325, "learning_rate": 1.5723252312099832e-07, "loss": 0.0, "num_input_tokens_seen": 115460264, "step": 171340 }, { "epoch": 4.185986856570493, "grad_norm": 0.000153837536345236, "learning_rate": 1.5718662314763242e-07, "loss": 0.0, "num_input_tokens_seen": 115463528, "step": 171345 }, { "epoch": 4.18610900740234, "grad_norm": 9.459636203246191e-06, "learning_rate": 1.571407293034319e-07, "loss": 0.0, "num_input_tokens_seen": 115466600, "step": 171350 }, { "epoch": 4.186231158234188, "grad_norm": 5.712966958526522e-05, "learning_rate": 1.5709484158873088e-07, "loss": 0.0, "num_input_tokens_seen": 115470184, "step": 171355 }, { "epoch": 4.1863533090660345, "grad_norm": 4.7370958782266825e-06, "learning_rate": 1.570489600038627e-07, "loss": 0.0, "num_input_tokens_seen": 115473832, "step": 171360 }, { "epoch": 4.186475459897882, "grad_norm": 1.8898492271546274e-05, "learning_rate": 1.5700308454916132e-07, "loss": 0.0, "num_input_tokens_seen": 115477096, "step": 171365 }, { "epoch": 4.186597610729729, "grad_norm": 0.0012110867537558079, "learning_rate": 1.5695721522496007e-07, "loss": 0.0, "num_input_tokens_seen": 115480808, "step": 171370 }, { "epoch": 4.1867197615615765, "grad_norm": 4.958015415468253e-05, "learning_rate": 1.5691135203159277e-07, "loss": 0.0, "num_input_tokens_seen": 115484392, "step": 171375 }, { "epoch": 4.186841912393423, "grad_norm": 0.000878807797562331, "learning_rate": 1.5686549496939306e-07, "loss": 0.0, "num_input_tokens_seen": 115487912, "step": 171380 }, { "epoch": 4.186964063225271, "grad_norm": 1.878697548818309e-05, "learning_rate": 1.5681964403869408e-07, "loss": 0.0, "num_input_tokens_seen": 115490984, "step": 171385 }, { "epoch": 4.187086214057118, "grad_norm": 51.99897003173828, "learning_rate": 1.5677379923982958e-07, "loss": 0.0162, "num_input_tokens_seen": 115494376, "step": 171390 }, { "epoch": 4.187208364888965, "grad_norm": 0.0018260630313307047, "learning_rate": 1.5672796057313265e-07, "loss": 0.0, "num_input_tokens_seen": 115497576, "step": 171395 }, { "epoch": 4.187330515720812, "grad_norm": 1.377374428557232e-05, "learning_rate": 1.5668212803893698e-07, "loss": 0.0, "num_input_tokens_seen": 115500776, "step": 171400 }, { "epoch": 4.187452666552659, "grad_norm": 0.00017338775796815753, "learning_rate": 1.5663630163757558e-07, "loss": 0.0, "num_input_tokens_seen": 115504552, "step": 171405 }, { "epoch": 4.187574817384506, "grad_norm": 1.9353059542481788e-05, "learning_rate": 1.565904813693817e-07, "loss": 0.0, "num_input_tokens_seen": 115508136, "step": 171410 }, { "epoch": 4.187696968216353, "grad_norm": 0.0007885847007855773, "learning_rate": 1.5654466723468897e-07, "loss": 0.0, "num_input_tokens_seen": 115511400, "step": 171415 }, { "epoch": 4.187819119048201, "grad_norm": 0.00039690814446657896, "learning_rate": 1.5649885923383e-07, "loss": 0.0, "num_input_tokens_seen": 115514600, "step": 171420 }, { "epoch": 4.187941269880048, "grad_norm": 0.0032051349990069866, "learning_rate": 1.5645305736713854e-07, "loss": 0.0, "num_input_tokens_seen": 115517864, "step": 171425 }, { "epoch": 4.188063420711895, "grad_norm": 7.87381868576631e-05, "learning_rate": 1.56407261634947e-07, "loss": 0.0, "num_input_tokens_seen": 115521128, "step": 171430 }, { "epoch": 4.188185571543742, "grad_norm": 2.181102172471583e-05, "learning_rate": 1.563614720375891e-07, "loss": 0.0, "num_input_tokens_seen": 115525160, "step": 171435 }, { "epoch": 4.18830772237559, "grad_norm": 0.00012077955761924386, "learning_rate": 1.5631568857539712e-07, "loss": 0.0, "num_input_tokens_seen": 115528488, "step": 171440 }, { "epoch": 4.188429873207436, "grad_norm": 0.00010481636854819953, "learning_rate": 1.562699112487047e-07, "loss": 0.0, "num_input_tokens_seen": 115531688, "step": 171445 }, { "epoch": 4.188552024039284, "grad_norm": 0.000120364515169058, "learning_rate": 1.5622414005784434e-07, "loss": 0.0, "num_input_tokens_seen": 115535016, "step": 171450 }, { "epoch": 4.188674174871131, "grad_norm": 0.0021958048455417156, "learning_rate": 1.5617837500314879e-07, "loss": 0.0, "num_input_tokens_seen": 115538280, "step": 171455 }, { "epoch": 4.188796325702978, "grad_norm": 4.293501842767e-05, "learning_rate": 1.561326160849513e-07, "loss": 0.0, "num_input_tokens_seen": 115541352, "step": 171460 }, { "epoch": 4.188918476534825, "grad_norm": 0.00024415075313299894, "learning_rate": 1.5608686330358422e-07, "loss": 0.0, "num_input_tokens_seen": 115544744, "step": 171465 }, { "epoch": 4.189040627366673, "grad_norm": 8.518228241882753e-06, "learning_rate": 1.5604111665938035e-07, "loss": 0.0, "num_input_tokens_seen": 115547944, "step": 171470 }, { "epoch": 4.1891627781985195, "grad_norm": 0.0003425255126785487, "learning_rate": 1.5599537615267277e-07, "loss": 0.0, "num_input_tokens_seen": 115551016, "step": 171475 }, { "epoch": 4.189284929030367, "grad_norm": 0.00011390036524971947, "learning_rate": 1.5594964178379366e-07, "loss": 0.0, "num_input_tokens_seen": 115554472, "step": 171480 }, { "epoch": 4.189407079862214, "grad_norm": 0.001087276148609817, "learning_rate": 1.5590391355307587e-07, "loss": 0.0, "num_input_tokens_seen": 115557480, "step": 171485 }, { "epoch": 4.1895292306940615, "grad_norm": 0.0009388430626131594, "learning_rate": 1.5585819146085178e-07, "loss": 0.0, "num_input_tokens_seen": 115560424, "step": 171490 }, { "epoch": 4.189651381525908, "grad_norm": 0.001759289763867855, "learning_rate": 1.5581247550745402e-07, "loss": 0.0, "num_input_tokens_seen": 115564008, "step": 171495 }, { "epoch": 4.189773532357755, "grad_norm": 1.796562173694838e-05, "learning_rate": 1.557667656932149e-07, "loss": 0.0, "num_input_tokens_seen": 115567400, "step": 171500 }, { "epoch": 4.189895683189603, "grad_norm": 7.069880666676909e-05, "learning_rate": 1.5572106201846691e-07, "loss": 0.0, "num_input_tokens_seen": 115570792, "step": 171505 }, { "epoch": 4.190017834021449, "grad_norm": 3.106020812992938e-05, "learning_rate": 1.5567536448354257e-07, "loss": 0.0, "num_input_tokens_seen": 115573864, "step": 171510 }, { "epoch": 4.190139984853297, "grad_norm": 0.0006465915939770639, "learning_rate": 1.5562967308877395e-07, "loss": 0.0, "num_input_tokens_seen": 115577256, "step": 171515 }, { "epoch": 4.190262135685144, "grad_norm": 5.1399223593762144e-05, "learning_rate": 1.5558398783449366e-07, "loss": 0.0501, "num_input_tokens_seen": 115580776, "step": 171520 }, { "epoch": 4.190384286516991, "grad_norm": 0.00033536626142449677, "learning_rate": 1.5553830872103347e-07, "loss": 0.0001, "num_input_tokens_seen": 115584872, "step": 171525 }, { "epoch": 4.190506437348838, "grad_norm": 32.08860778808594, "learning_rate": 1.5549263574872585e-07, "loss": 0.0444, "num_input_tokens_seen": 115587880, "step": 171530 }, { "epoch": 4.190628588180686, "grad_norm": 0.00031860574381425977, "learning_rate": 1.554469689179032e-07, "loss": 0.0, "num_input_tokens_seen": 115590952, "step": 171535 }, { "epoch": 4.1907507390125325, "grad_norm": 0.0002085407468257472, "learning_rate": 1.5540130822889708e-07, "loss": 0.0, "num_input_tokens_seen": 115594536, "step": 171540 }, { "epoch": 4.19087288984438, "grad_norm": 0.00019028125097975135, "learning_rate": 1.5535565368204008e-07, "loss": 0.0, "num_input_tokens_seen": 115598056, "step": 171545 }, { "epoch": 4.190995040676227, "grad_norm": 6.620458862016676e-06, "learning_rate": 1.553100052776639e-07, "loss": 0.0, "num_input_tokens_seen": 115601384, "step": 171550 }, { "epoch": 4.1911171915080745, "grad_norm": 0.0010951223084703088, "learning_rate": 1.5526436301610035e-07, "loss": 0.0, "num_input_tokens_seen": 115604392, "step": 171555 }, { "epoch": 4.191239342339921, "grad_norm": 0.00011559041013242677, "learning_rate": 1.5521872689768178e-07, "loss": 0.0, "num_input_tokens_seen": 115607592, "step": 171560 }, { "epoch": 4.191361493171769, "grad_norm": 0.00027739559300243855, "learning_rate": 1.551730969227396e-07, "loss": 0.0, "num_input_tokens_seen": 115611048, "step": 171565 }, { "epoch": 4.191483644003616, "grad_norm": 0.0003272799076512456, "learning_rate": 1.5512747309160622e-07, "loss": 0.0, "num_input_tokens_seen": 115614568, "step": 171570 }, { "epoch": 4.191605794835463, "grad_norm": 0.00016274228983093053, "learning_rate": 1.5508185540461283e-07, "loss": 0.0, "num_input_tokens_seen": 115618088, "step": 171575 }, { "epoch": 4.19172794566731, "grad_norm": 0.0005291857523843646, "learning_rate": 1.5503624386209157e-07, "loss": 0.0, "num_input_tokens_seen": 115621544, "step": 171580 }, { "epoch": 4.191850096499157, "grad_norm": 5.590125965682091e-06, "learning_rate": 1.5499063846437387e-07, "loss": 0.0, "num_input_tokens_seen": 115625192, "step": 171585 }, { "epoch": 4.191972247331004, "grad_norm": 0.00010487588588148355, "learning_rate": 1.549450392117917e-07, "loss": 0.0, "num_input_tokens_seen": 115628904, "step": 171590 }, { "epoch": 4.192094398162851, "grad_norm": 0.0001090022487915121, "learning_rate": 1.5489944610467632e-07, "loss": 0.0, "num_input_tokens_seen": 115632232, "step": 171595 }, { "epoch": 4.192216548994699, "grad_norm": 0.0007822644547559321, "learning_rate": 1.5485385914335946e-07, "loss": 0.0, "num_input_tokens_seen": 115635624, "step": 171600 }, { "epoch": 4.1923386998265455, "grad_norm": 2.5184515834553167e-05, "learning_rate": 1.548082783281729e-07, "loss": 0.0, "num_input_tokens_seen": 115638760, "step": 171605 }, { "epoch": 4.192460850658393, "grad_norm": 5.217688521952368e-05, "learning_rate": 1.5476270365944766e-07, "loss": 0.0, "num_input_tokens_seen": 115641512, "step": 171610 }, { "epoch": 4.19258300149024, "grad_norm": 8.979284757515416e-06, "learning_rate": 1.547171351375155e-07, "loss": 0.0, "num_input_tokens_seen": 115645096, "step": 171615 }, { "epoch": 4.1927051523220875, "grad_norm": 0.00022331729996949434, "learning_rate": 1.546715727627076e-07, "loss": 0.0, "num_input_tokens_seen": 115648872, "step": 171620 }, { "epoch": 4.192827303153934, "grad_norm": 5.690431407856522e-06, "learning_rate": 1.5462601653535524e-07, "loss": 0.0, "num_input_tokens_seen": 115652328, "step": 171625 }, { "epoch": 4.192949453985782, "grad_norm": 9.095601126318797e-05, "learning_rate": 1.5458046645579014e-07, "loss": 0.0, "num_input_tokens_seen": 115655464, "step": 171630 }, { "epoch": 4.193071604817629, "grad_norm": 0.00010419006866868585, "learning_rate": 1.5453492252434308e-07, "loss": 0.0, "num_input_tokens_seen": 115659304, "step": 171635 }, { "epoch": 4.193193755649476, "grad_norm": 0.0002787335542961955, "learning_rate": 1.5448938474134575e-07, "loss": 0.0, "num_input_tokens_seen": 115663016, "step": 171640 }, { "epoch": 4.193315906481323, "grad_norm": 0.0004578603256959468, "learning_rate": 1.544438531071287e-07, "loss": 0.0, "num_input_tokens_seen": 115666664, "step": 171645 }, { "epoch": 4.193438057313171, "grad_norm": 0.0024992485996335745, "learning_rate": 1.5439832762202375e-07, "loss": 0.0607, "num_input_tokens_seen": 115670440, "step": 171650 }, { "epoch": 4.193560208145017, "grad_norm": 0.00013882042549084872, "learning_rate": 1.5435280828636143e-07, "loss": 0.0, "num_input_tokens_seen": 115674024, "step": 171655 }, { "epoch": 4.193682358976865, "grad_norm": 7.745954644633457e-05, "learning_rate": 1.543072951004728e-07, "loss": 0.0, "num_input_tokens_seen": 115677032, "step": 171660 }, { "epoch": 4.193804509808712, "grad_norm": 0.00017201209266204387, "learning_rate": 1.5426178806468926e-07, "loss": 0.0, "num_input_tokens_seen": 115680104, "step": 171665 }, { "epoch": 4.1939266606405585, "grad_norm": 0.00019128096755594015, "learning_rate": 1.5421628717934109e-07, "loss": 0.0, "num_input_tokens_seen": 115683304, "step": 171670 }, { "epoch": 4.194048811472406, "grad_norm": 0.00016167706053238362, "learning_rate": 1.5417079244475995e-07, "loss": 0.0, "num_input_tokens_seen": 115686568, "step": 171675 }, { "epoch": 4.194170962304253, "grad_norm": 5.020221942686476e-05, "learning_rate": 1.54125303861276e-07, "loss": 0.0, "num_input_tokens_seen": 115689960, "step": 171680 }, { "epoch": 4.1942931131361005, "grad_norm": 0.00040699797682464123, "learning_rate": 1.540798214292204e-07, "loss": 0.0, "num_input_tokens_seen": 115693096, "step": 171685 }, { "epoch": 4.194415263967947, "grad_norm": 0.0012270803563296795, "learning_rate": 1.54034345148924e-07, "loss": 0.0, "num_input_tokens_seen": 115696232, "step": 171690 }, { "epoch": 4.194537414799795, "grad_norm": 5.7696772273629904e-05, "learning_rate": 1.5398887502071722e-07, "loss": 0.0, "num_input_tokens_seen": 115700072, "step": 171695 }, { "epoch": 4.194659565631642, "grad_norm": 0.02207312174141407, "learning_rate": 1.5394341104493113e-07, "loss": 0.0, "num_input_tokens_seen": 115703336, "step": 171700 }, { "epoch": 4.194781716463489, "grad_norm": 3.2915835618041456e-05, "learning_rate": 1.538979532218959e-07, "loss": 0.0, "num_input_tokens_seen": 115706792, "step": 171705 }, { "epoch": 4.194903867295336, "grad_norm": 0.0008636588463559747, "learning_rate": 1.538525015519425e-07, "loss": 0.0477, "num_input_tokens_seen": 115710248, "step": 171710 }, { "epoch": 4.195026018127184, "grad_norm": 0.0001473453885409981, "learning_rate": 1.5380705603540112e-07, "loss": 0.0, "num_input_tokens_seen": 115713704, "step": 171715 }, { "epoch": 4.19514816895903, "grad_norm": 0.0003014703397639096, "learning_rate": 1.5376161667260235e-07, "loss": 0.0, "num_input_tokens_seen": 115717480, "step": 171720 }, { "epoch": 4.195270319790878, "grad_norm": 0.0003513983974698931, "learning_rate": 1.5371618346387704e-07, "loss": 0.0, "num_input_tokens_seen": 115721256, "step": 171725 }, { "epoch": 4.195392470622725, "grad_norm": 2.8909251341247e-05, "learning_rate": 1.5367075640955495e-07, "loss": 0.0, "num_input_tokens_seen": 115724264, "step": 171730 }, { "epoch": 4.195514621454572, "grad_norm": 2.142676748917438e-05, "learning_rate": 1.5362533550996704e-07, "loss": 0.0, "num_input_tokens_seen": 115727656, "step": 171735 }, { "epoch": 4.195636772286419, "grad_norm": 0.00011070889740949497, "learning_rate": 1.5357992076544314e-07, "loss": 0.0, "num_input_tokens_seen": 115731048, "step": 171740 }, { "epoch": 4.195758923118267, "grad_norm": 3.502647814457305e-05, "learning_rate": 1.5353451217631386e-07, "loss": 0.0, "num_input_tokens_seen": 115734696, "step": 171745 }, { "epoch": 4.195881073950114, "grad_norm": 5.859508382854983e-05, "learning_rate": 1.5348910974290907e-07, "loss": 0.0, "num_input_tokens_seen": 115738216, "step": 171750 }, { "epoch": 4.196003224781961, "grad_norm": 0.0011176398256793618, "learning_rate": 1.534437134655595e-07, "loss": 0.0002, "num_input_tokens_seen": 115742184, "step": 171755 }, { "epoch": 4.196125375613808, "grad_norm": 0.00032299006124958396, "learning_rate": 1.533983233445948e-07, "loss": 0.0, "num_input_tokens_seen": 115745448, "step": 171760 }, { "epoch": 4.196247526445655, "grad_norm": 5.946209421381354e-05, "learning_rate": 1.53352939380345e-07, "loss": 0.0, "num_input_tokens_seen": 115748776, "step": 171765 }, { "epoch": 4.196369677277502, "grad_norm": 0.00039252001442946494, "learning_rate": 1.5330756157314062e-07, "loss": 0.0, "num_input_tokens_seen": 115751976, "step": 171770 }, { "epoch": 4.196491828109349, "grad_norm": 4.479515700950287e-05, "learning_rate": 1.5326218992331119e-07, "loss": 0.0001, "num_input_tokens_seen": 115755432, "step": 171775 }, { "epoch": 4.196613978941197, "grad_norm": 0.00012405213783495128, "learning_rate": 1.5321682443118677e-07, "loss": 0.0, "num_input_tokens_seen": 115758952, "step": 171780 }, { "epoch": 4.1967361297730434, "grad_norm": 0.012555737048387527, "learning_rate": 1.5317146509709767e-07, "loss": 0.0, "num_input_tokens_seen": 115761960, "step": 171785 }, { "epoch": 4.196858280604891, "grad_norm": 0.00039039074908941984, "learning_rate": 1.5312611192137313e-07, "loss": 0.0, "num_input_tokens_seen": 115765480, "step": 171790 }, { "epoch": 4.196980431436738, "grad_norm": 0.00015265199181158096, "learning_rate": 1.5308076490434352e-07, "loss": 0.0, "num_input_tokens_seen": 115769256, "step": 171795 }, { "epoch": 4.1971025822685855, "grad_norm": 0.025387544184923172, "learning_rate": 1.5303542404633818e-07, "loss": 0.0, "num_input_tokens_seen": 115772392, "step": 171800 }, { "epoch": 4.197224733100432, "grad_norm": 6.584433140233159e-05, "learning_rate": 1.529900893476873e-07, "loss": 0.0, "num_input_tokens_seen": 115775592, "step": 171805 }, { "epoch": 4.19734688393228, "grad_norm": 1.709511343506165e-05, "learning_rate": 1.5294476080872009e-07, "loss": 0.0, "num_input_tokens_seen": 115778408, "step": 171810 }, { "epoch": 4.197469034764127, "grad_norm": 3.4848755603889003e-05, "learning_rate": 1.5289943842976638e-07, "loss": 0.0, "num_input_tokens_seen": 115781672, "step": 171815 }, { "epoch": 4.197591185595974, "grad_norm": 2.3679769583395682e-05, "learning_rate": 1.5285412221115602e-07, "loss": 0.0, "num_input_tokens_seen": 115785000, "step": 171820 }, { "epoch": 4.197713336427821, "grad_norm": 0.00013830607349518687, "learning_rate": 1.5280881215321805e-07, "loss": 0.0, "num_input_tokens_seen": 115788136, "step": 171825 }, { "epoch": 4.197835487259669, "grad_norm": 0.005081809591501951, "learning_rate": 1.527635082562826e-07, "loss": 0.0, "num_input_tokens_seen": 115791400, "step": 171830 }, { "epoch": 4.197957638091515, "grad_norm": 0.0002572297817096114, "learning_rate": 1.5271821052067846e-07, "loss": 0.0, "num_input_tokens_seen": 115794664, "step": 171835 }, { "epoch": 4.198079788923363, "grad_norm": 8.138104021782055e-05, "learning_rate": 1.526729189467355e-07, "loss": 0.0009, "num_input_tokens_seen": 115797992, "step": 171840 }, { "epoch": 4.19820193975521, "grad_norm": 0.06563310325145721, "learning_rate": 1.5262763353478315e-07, "loss": 0.0, "num_input_tokens_seen": 115801256, "step": 171845 }, { "epoch": 4.198324090587057, "grad_norm": 8.636287384433672e-05, "learning_rate": 1.5258235428515033e-07, "loss": 0.0, "num_input_tokens_seen": 115804520, "step": 171850 }, { "epoch": 4.198446241418904, "grad_norm": 0.00015373101632576436, "learning_rate": 1.5253708119816676e-07, "loss": 0.0, "num_input_tokens_seen": 115808040, "step": 171855 }, { "epoch": 4.198568392250751, "grad_norm": 0.0005759844789281487, "learning_rate": 1.524918142741616e-07, "loss": 0.0, "num_input_tokens_seen": 115811112, "step": 171860 }, { "epoch": 4.1986905430825985, "grad_norm": 0.0023348061367869377, "learning_rate": 1.5244655351346357e-07, "loss": 0.0, "num_input_tokens_seen": 115814440, "step": 171865 }, { "epoch": 4.198812693914445, "grad_norm": 0.0023439086508005857, "learning_rate": 1.5240129891640242e-07, "loss": 0.0, "num_input_tokens_seen": 115817640, "step": 171870 }, { "epoch": 4.198934844746293, "grad_norm": 31.531675338745117, "learning_rate": 1.523560504833068e-07, "loss": 0.0524, "num_input_tokens_seen": 115820712, "step": 171875 }, { "epoch": 4.19905699557814, "grad_norm": 0.0009481237502768636, "learning_rate": 1.5231080821450616e-07, "loss": 0.0, "num_input_tokens_seen": 115823656, "step": 171880 }, { "epoch": 4.199179146409987, "grad_norm": 0.00029586380696855485, "learning_rate": 1.522655721103291e-07, "loss": 0.0, "num_input_tokens_seen": 115827048, "step": 171885 }, { "epoch": 4.199301297241834, "grad_norm": 0.0025091313291341066, "learning_rate": 1.5222034217110502e-07, "loss": 0.0, "num_input_tokens_seen": 115830504, "step": 171890 }, { "epoch": 4.199423448073682, "grad_norm": 0.00017374740855302662, "learning_rate": 1.5217511839716245e-07, "loss": 0.0, "num_input_tokens_seen": 115833896, "step": 171895 }, { "epoch": 4.199545598905528, "grad_norm": 0.0007191016338765621, "learning_rate": 1.521299007888307e-07, "loss": 0.125, "num_input_tokens_seen": 115837672, "step": 171900 }, { "epoch": 4.199667749737376, "grad_norm": 0.00038250116631388664, "learning_rate": 1.5208468934643815e-07, "loss": 0.0, "num_input_tokens_seen": 115841256, "step": 171905 }, { "epoch": 4.199789900569223, "grad_norm": 5.690229590982199e-05, "learning_rate": 1.5203948407031375e-07, "loss": 0.0, "num_input_tokens_seen": 115844456, "step": 171910 }, { "epoch": 4.19991205140107, "grad_norm": 0.00010644423309713602, "learning_rate": 1.5199428496078648e-07, "loss": 0.0, "num_input_tokens_seen": 115847656, "step": 171915 }, { "epoch": 4.200034202232917, "grad_norm": 1.8538225049269386e-05, "learning_rate": 1.5194909201818473e-07, "loss": 0.0, "num_input_tokens_seen": 115850856, "step": 171920 }, { "epoch": 4.200156353064765, "grad_norm": 0.009867326356470585, "learning_rate": 1.5190390524283747e-07, "loss": 0.0, "num_input_tokens_seen": 115853928, "step": 171925 }, { "epoch": 4.2002785038966115, "grad_norm": 0.00025367087800987065, "learning_rate": 1.5185872463507287e-07, "loss": 0.0, "num_input_tokens_seen": 115857448, "step": 171930 }, { "epoch": 4.200400654728458, "grad_norm": 6.811439379816875e-05, "learning_rate": 1.518135501952198e-07, "loss": 0.0, "num_input_tokens_seen": 115860264, "step": 171935 }, { "epoch": 4.200522805560306, "grad_norm": 0.00018472773081157357, "learning_rate": 1.5176838192360686e-07, "loss": 0.0, "num_input_tokens_seen": 115863720, "step": 171940 }, { "epoch": 4.200644956392153, "grad_norm": 0.00013160528033040464, "learning_rate": 1.5172321982056223e-07, "loss": 0.0, "num_input_tokens_seen": 115866920, "step": 171945 }, { "epoch": 4.200767107224, "grad_norm": 0.0074246665462851524, "learning_rate": 1.516780638864148e-07, "loss": 0.0, "num_input_tokens_seen": 115870312, "step": 171950 }, { "epoch": 4.200889258055847, "grad_norm": 0.00046449064393527806, "learning_rate": 1.5163291412149226e-07, "loss": 0.0, "num_input_tokens_seen": 115873512, "step": 171955 }, { "epoch": 4.201011408887695, "grad_norm": 9.282070823246613e-05, "learning_rate": 1.515877705261237e-07, "loss": 0.0, "num_input_tokens_seen": 115877032, "step": 171960 }, { "epoch": 4.201133559719541, "grad_norm": 2.614716322568711e-05, "learning_rate": 1.5154263310063708e-07, "loss": 0.0, "num_input_tokens_seen": 115880360, "step": 171965 }, { "epoch": 4.201255710551389, "grad_norm": 0.004874629434198141, "learning_rate": 1.5149750184536036e-07, "loss": 0.0, "num_input_tokens_seen": 115883560, "step": 171970 }, { "epoch": 4.201377861383236, "grad_norm": 1.403738042426994e-05, "learning_rate": 1.5145237676062228e-07, "loss": 0.0, "num_input_tokens_seen": 115886824, "step": 171975 }, { "epoch": 4.201500012215083, "grad_norm": 2.9235177862574346e-05, "learning_rate": 1.5140725784675057e-07, "loss": 0.0, "num_input_tokens_seen": 115890216, "step": 171980 }, { "epoch": 4.20162216304693, "grad_norm": 0.00021435305825434625, "learning_rate": 1.5136214510407364e-07, "loss": 0.0, "num_input_tokens_seen": 115893736, "step": 171985 }, { "epoch": 4.201744313878778, "grad_norm": 0.00011233131954213604, "learning_rate": 1.5131703853291934e-07, "loss": 0.0, "num_input_tokens_seen": 115897448, "step": 171990 }, { "epoch": 4.2018664647106245, "grad_norm": 0.0004522551316767931, "learning_rate": 1.5127193813361595e-07, "loss": 0.0, "num_input_tokens_seen": 115900968, "step": 171995 }, { "epoch": 4.201988615542472, "grad_norm": 1.637089553696569e-05, "learning_rate": 1.512268439064911e-07, "loss": 0.0, "num_input_tokens_seen": 115904104, "step": 172000 }, { "epoch": 4.202110766374319, "grad_norm": 0.0040295966900885105, "learning_rate": 1.5118175585187286e-07, "loss": 0.0, "num_input_tokens_seen": 115907432, "step": 172005 }, { "epoch": 4.2022329172061665, "grad_norm": 8.661628817208111e-05, "learning_rate": 1.5113667397008957e-07, "loss": 0.0, "num_input_tokens_seen": 115910696, "step": 172010 }, { "epoch": 4.202355068038013, "grad_norm": 1.427132428943878e-05, "learning_rate": 1.5109159826146834e-07, "loss": 0.0, "num_input_tokens_seen": 115914408, "step": 172015 }, { "epoch": 4.202477218869861, "grad_norm": 0.00014912939514033496, "learning_rate": 1.510465287263376e-07, "loss": 0.0, "num_input_tokens_seen": 115917672, "step": 172020 }, { "epoch": 4.202599369701708, "grad_norm": 4.3930493120569736e-05, "learning_rate": 1.5100146536502468e-07, "loss": 0.0001, "num_input_tokens_seen": 115921064, "step": 172025 }, { "epoch": 4.202721520533554, "grad_norm": 0.0004871827259194106, "learning_rate": 1.5095640817785737e-07, "loss": 0.0, "num_input_tokens_seen": 115924264, "step": 172030 }, { "epoch": 4.202843671365402, "grad_norm": 0.0010371499229222536, "learning_rate": 1.509113571651638e-07, "loss": 0.0, "num_input_tokens_seen": 115927528, "step": 172035 }, { "epoch": 4.202965822197249, "grad_norm": 0.00017510108591523021, "learning_rate": 1.5086631232727086e-07, "loss": 0.0, "num_input_tokens_seen": 115930664, "step": 172040 }, { "epoch": 4.203087973029096, "grad_norm": 4.109571091248654e-05, "learning_rate": 1.508212736645067e-07, "loss": 0.0, "num_input_tokens_seen": 115933928, "step": 172045 }, { "epoch": 4.203210123860943, "grad_norm": 0.00025954615557566285, "learning_rate": 1.5077624117719845e-07, "loss": 0.0, "num_input_tokens_seen": 115937192, "step": 172050 }, { "epoch": 4.203332274692791, "grad_norm": 3.810921043623239e-05, "learning_rate": 1.507312148656741e-07, "loss": 0.0, "num_input_tokens_seen": 115940456, "step": 172055 }, { "epoch": 4.2034544255246375, "grad_norm": 1.4034387277206406e-05, "learning_rate": 1.5068619473026045e-07, "loss": 0.0, "num_input_tokens_seen": 115944040, "step": 172060 }, { "epoch": 4.203576576356485, "grad_norm": 0.00018147245282307267, "learning_rate": 1.506411807712854e-07, "loss": 0.0, "num_input_tokens_seen": 115947688, "step": 172065 }, { "epoch": 4.203698727188332, "grad_norm": 1.4824206118646543e-05, "learning_rate": 1.5059617298907624e-07, "loss": 0.0, "num_input_tokens_seen": 115951016, "step": 172070 }, { "epoch": 4.20382087802018, "grad_norm": 0.00018080630979966372, "learning_rate": 1.505511713839599e-07, "loss": 0.0, "num_input_tokens_seen": 115955048, "step": 172075 }, { "epoch": 4.203943028852026, "grad_norm": 0.0012226704275235534, "learning_rate": 1.5050617595626424e-07, "loss": 0.0, "num_input_tokens_seen": 115958248, "step": 172080 }, { "epoch": 4.204065179683874, "grad_norm": 0.0011231348617002368, "learning_rate": 1.5046118670631581e-07, "loss": 0.0, "num_input_tokens_seen": 115961320, "step": 172085 }, { "epoch": 4.204187330515721, "grad_norm": 0.00010311927326256409, "learning_rate": 1.504162036344422e-07, "loss": 0.0004, "num_input_tokens_seen": 115964584, "step": 172090 }, { "epoch": 4.204309481347568, "grad_norm": 0.0007668191101402044, "learning_rate": 1.503712267409707e-07, "loss": 0.0, "num_input_tokens_seen": 115967912, "step": 172095 }, { "epoch": 4.204431632179415, "grad_norm": 0.00049975625006482, "learning_rate": 1.5032625602622784e-07, "loss": 0.0, "num_input_tokens_seen": 115970984, "step": 172100 }, { "epoch": 4.204553783011263, "grad_norm": 0.0009041104349307716, "learning_rate": 1.5028129149054126e-07, "loss": 0.0, "num_input_tokens_seen": 115974248, "step": 172105 }, { "epoch": 4.2046759338431094, "grad_norm": 3.793345968006179e-05, "learning_rate": 1.5023633313423745e-07, "loss": 0.0, "num_input_tokens_seen": 115977512, "step": 172110 }, { "epoch": 4.204798084674957, "grad_norm": 0.00024618953466415405, "learning_rate": 1.5019138095764383e-07, "loss": 0.0, "num_input_tokens_seen": 115980648, "step": 172115 }, { "epoch": 4.204920235506804, "grad_norm": 7.776911843393464e-06, "learning_rate": 1.5014643496108682e-07, "loss": 0.0, "num_input_tokens_seen": 115983976, "step": 172120 }, { "epoch": 4.205042386338651, "grad_norm": 0.00018268365238327533, "learning_rate": 1.5010149514489356e-07, "loss": 0.0, "num_input_tokens_seen": 115987048, "step": 172125 }, { "epoch": 4.205164537170498, "grad_norm": 0.0017578315455466509, "learning_rate": 1.5005656150939095e-07, "loss": 0.0, "num_input_tokens_seen": 115990184, "step": 172130 }, { "epoch": 4.205286688002345, "grad_norm": 0.00027843486168421805, "learning_rate": 1.5001163405490547e-07, "loss": 0.0, "num_input_tokens_seen": 115993640, "step": 172135 }, { "epoch": 4.205408838834193, "grad_norm": 0.00025260128313675523, "learning_rate": 1.499667127817642e-07, "loss": 0.0, "num_input_tokens_seen": 115997096, "step": 172140 }, { "epoch": 4.205530989666039, "grad_norm": 0.0006347659509629011, "learning_rate": 1.4992179769029346e-07, "loss": 0.0, "num_input_tokens_seen": 116000296, "step": 172145 }, { "epoch": 4.205653140497887, "grad_norm": 7.18262090231292e-05, "learning_rate": 1.4987688878082028e-07, "loss": 0.0, "num_input_tokens_seen": 116003368, "step": 172150 }, { "epoch": 4.205775291329734, "grad_norm": 0.000331896502757445, "learning_rate": 1.4983198605367075e-07, "loss": 0.0, "num_input_tokens_seen": 116006248, "step": 172155 }, { "epoch": 4.205897442161581, "grad_norm": 0.0006194012821651995, "learning_rate": 1.4978708950917162e-07, "loss": 0.0, "num_input_tokens_seen": 116009448, "step": 172160 }, { "epoch": 4.206019592993428, "grad_norm": 0.0014870319282636046, "learning_rate": 1.4974219914764986e-07, "loss": 0.0, "num_input_tokens_seen": 116012712, "step": 172165 }, { "epoch": 4.206141743825276, "grad_norm": 0.000584141060244292, "learning_rate": 1.496973149694314e-07, "loss": 0.0, "num_input_tokens_seen": 116016424, "step": 172170 }, { "epoch": 4.2062638946571225, "grad_norm": 0.0006933091790415347, "learning_rate": 1.4965243697484253e-07, "loss": 0.0, "num_input_tokens_seen": 116019816, "step": 172175 }, { "epoch": 4.20638604548897, "grad_norm": 0.00044879803317599, "learning_rate": 1.4960756516421013e-07, "loss": 0.0, "num_input_tokens_seen": 116022888, "step": 172180 }, { "epoch": 4.206508196320817, "grad_norm": 0.00214349920861423, "learning_rate": 1.4956269953785993e-07, "loss": 0.0, "num_input_tokens_seen": 116026216, "step": 172185 }, { "epoch": 4.2066303471526645, "grad_norm": 0.00808340311050415, "learning_rate": 1.495178400961188e-07, "loss": 0.0, "num_input_tokens_seen": 116029480, "step": 172190 }, { "epoch": 4.206752497984511, "grad_norm": 5.1568593335105106e-05, "learning_rate": 1.4947298683931254e-07, "loss": 0.0, "num_input_tokens_seen": 116033000, "step": 172195 }, { "epoch": 4.206874648816359, "grad_norm": 0.0003689782170113176, "learning_rate": 1.4942813976776759e-07, "loss": 0.0, "num_input_tokens_seen": 116036392, "step": 172200 }, { "epoch": 4.206996799648206, "grad_norm": 7.517338872276014e-06, "learning_rate": 1.493832988818098e-07, "loss": 0.0, "num_input_tokens_seen": 116039976, "step": 172205 }, { "epoch": 4.207118950480052, "grad_norm": 8.319402695633471e-05, "learning_rate": 1.4933846418176578e-07, "loss": 0.0, "num_input_tokens_seen": 116043368, "step": 172210 }, { "epoch": 4.2072411013119, "grad_norm": 0.0005808392306789756, "learning_rate": 1.4929363566796082e-07, "loss": 0.0, "num_input_tokens_seen": 116046824, "step": 172215 }, { "epoch": 4.207363252143747, "grad_norm": 0.0011009281734004617, "learning_rate": 1.492488133407215e-07, "loss": 0.0, "num_input_tokens_seen": 116049896, "step": 172220 }, { "epoch": 4.207485402975594, "grad_norm": 3.104091956629418e-05, "learning_rate": 1.492039972003738e-07, "loss": 0.0426, "num_input_tokens_seen": 116053096, "step": 172225 }, { "epoch": 4.207607553807441, "grad_norm": 0.00019769801292568445, "learning_rate": 1.491591872472433e-07, "loss": 0.0, "num_input_tokens_seen": 116056168, "step": 172230 }, { "epoch": 4.207729704639289, "grad_norm": 4.43961143901106e-05, "learning_rate": 1.491143834816563e-07, "loss": 0.0, "num_input_tokens_seen": 116059560, "step": 172235 }, { "epoch": 4.2078518554711355, "grad_norm": 0.0002230397949460894, "learning_rate": 1.4906958590393802e-07, "loss": 0.0, "num_input_tokens_seen": 116062824, "step": 172240 }, { "epoch": 4.207974006302983, "grad_norm": 4.6068620576988906e-05, "learning_rate": 1.4902479451441464e-07, "loss": 0.0, "num_input_tokens_seen": 116066344, "step": 172245 }, { "epoch": 4.20809615713483, "grad_norm": 1.1687574442476034e-05, "learning_rate": 1.4898000931341204e-07, "loss": 0.0, "num_input_tokens_seen": 116069352, "step": 172250 }, { "epoch": 4.2082183079666775, "grad_norm": 0.00023311935365200043, "learning_rate": 1.4893523030125544e-07, "loss": 0.0, "num_input_tokens_seen": 116072872, "step": 172255 }, { "epoch": 4.208340458798524, "grad_norm": 0.00037141842767596245, "learning_rate": 1.4889045747827111e-07, "loss": 0.0, "num_input_tokens_seen": 116075880, "step": 172260 }, { "epoch": 4.208462609630372, "grad_norm": 7.853261195123196e-05, "learning_rate": 1.4884569084478394e-07, "loss": 0.0, "num_input_tokens_seen": 116079336, "step": 172265 }, { "epoch": 4.208584760462219, "grad_norm": 5.044359568273649e-06, "learning_rate": 1.4880093040112018e-07, "loss": 0.0, "num_input_tokens_seen": 116082280, "step": 172270 }, { "epoch": 4.208706911294066, "grad_norm": 6.285287963692099e-05, "learning_rate": 1.4875617614760493e-07, "loss": 0.0, "num_input_tokens_seen": 116085608, "step": 172275 }, { "epoch": 4.208829062125913, "grad_norm": 0.0008630296215415001, "learning_rate": 1.4871142808456349e-07, "loss": 0.0, "num_input_tokens_seen": 116088744, "step": 172280 }, { "epoch": 4.208951212957761, "grad_norm": 0.001255490817129612, "learning_rate": 1.4866668621232182e-07, "loss": 0.0, "num_input_tokens_seen": 116092008, "step": 172285 }, { "epoch": 4.209073363789607, "grad_norm": 0.000520072877407074, "learning_rate": 1.4862195053120464e-07, "loss": 0.0, "num_input_tokens_seen": 116095144, "step": 172290 }, { "epoch": 4.209195514621454, "grad_norm": 0.00010253593791276217, "learning_rate": 1.4857722104153792e-07, "loss": 0.0, "num_input_tokens_seen": 116099048, "step": 172295 }, { "epoch": 4.209317665453302, "grad_norm": 0.0004774238623213023, "learning_rate": 1.485324977436464e-07, "loss": 0.0, "num_input_tokens_seen": 116102504, "step": 172300 }, { "epoch": 4.2094398162851485, "grad_norm": 0.000514692161232233, "learning_rate": 1.4848778063785583e-07, "loss": 0.0, "num_input_tokens_seen": 116105512, "step": 172305 }, { "epoch": 4.209561967116996, "grad_norm": 5.9682086430257186e-05, "learning_rate": 1.4844306972449093e-07, "loss": 0.0, "num_input_tokens_seen": 116108648, "step": 172310 }, { "epoch": 4.209684117948843, "grad_norm": 4.6664001274621114e-05, "learning_rate": 1.4839836500387703e-07, "loss": 0.0, "num_input_tokens_seen": 116111912, "step": 172315 }, { "epoch": 4.2098062687806905, "grad_norm": 0.0013582675019279122, "learning_rate": 1.4835366647633963e-07, "loss": 0.0, "num_input_tokens_seen": 116115176, "step": 172320 }, { "epoch": 4.209928419612537, "grad_norm": 4.392506070871605e-06, "learning_rate": 1.48308974142203e-07, "loss": 0.0002, "num_input_tokens_seen": 116118312, "step": 172325 }, { "epoch": 4.210050570444385, "grad_norm": 6.457888503064169e-06, "learning_rate": 1.4826428800179303e-07, "loss": 0.0, "num_input_tokens_seen": 116121704, "step": 172330 }, { "epoch": 4.210172721276232, "grad_norm": 0.0021146265789866447, "learning_rate": 1.4821960805543388e-07, "loss": 0.0, "num_input_tokens_seen": 116125352, "step": 172335 }, { "epoch": 4.210294872108079, "grad_norm": 1.908622834889684e-05, "learning_rate": 1.4817493430345084e-07, "loss": 0.0, "num_input_tokens_seen": 116128296, "step": 172340 }, { "epoch": 4.210417022939926, "grad_norm": 0.13494116067886353, "learning_rate": 1.48130266746169e-07, "loss": 0.0, "num_input_tokens_seen": 116131944, "step": 172345 }, { "epoch": 4.210539173771774, "grad_norm": 8.20185596239753e-06, "learning_rate": 1.480856053839129e-07, "loss": 0.0489, "num_input_tokens_seen": 116135528, "step": 172350 }, { "epoch": 4.21066132460362, "grad_norm": 7.075269240885973e-05, "learning_rate": 1.4804095021700746e-07, "loss": 0.0, "num_input_tokens_seen": 116139304, "step": 172355 }, { "epoch": 4.210783475435468, "grad_norm": 0.0006885943002998829, "learning_rate": 1.4799630124577733e-07, "loss": 0.0, "num_input_tokens_seen": 116142760, "step": 172360 }, { "epoch": 4.210905626267315, "grad_norm": 0.00013424194185063243, "learning_rate": 1.4795165847054735e-07, "loss": 0.0, "num_input_tokens_seen": 116145832, "step": 172365 }, { "epoch": 4.211027777099162, "grad_norm": 8.563858136767522e-05, "learning_rate": 1.4790702189164194e-07, "loss": 0.0, "num_input_tokens_seen": 116148904, "step": 172370 }, { "epoch": 4.211149927931009, "grad_norm": 0.00026543528656475246, "learning_rate": 1.4786239150938594e-07, "loss": 0.0, "num_input_tokens_seen": 116152104, "step": 172375 }, { "epoch": 4.211272078762857, "grad_norm": 0.0008876949432305992, "learning_rate": 1.47817767324104e-07, "loss": 0.0, "num_input_tokens_seen": 116155816, "step": 172380 }, { "epoch": 4.2113942295947036, "grad_norm": 9.343509009340778e-05, "learning_rate": 1.4777314933612016e-07, "loss": 0.0, "num_input_tokens_seen": 116159016, "step": 172385 }, { "epoch": 4.21151638042655, "grad_norm": 0.0004023423243779689, "learning_rate": 1.4772853754575942e-07, "loss": 0.0, "num_input_tokens_seen": 116162344, "step": 172390 }, { "epoch": 4.211638531258398, "grad_norm": 0.00024002035206649452, "learning_rate": 1.4768393195334583e-07, "loss": 0.0001, "num_input_tokens_seen": 116165480, "step": 172395 }, { "epoch": 4.211760682090245, "grad_norm": 0.0009489704389125109, "learning_rate": 1.476393325592038e-07, "loss": 0.0, "num_input_tokens_seen": 116169000, "step": 172400 }, { "epoch": 4.211882832922092, "grad_norm": 0.0010699955746531487, "learning_rate": 1.475947393636582e-07, "loss": 0.0, "num_input_tokens_seen": 116172264, "step": 172405 }, { "epoch": 4.212004983753939, "grad_norm": 2.9108750823070295e-05, "learning_rate": 1.475501523670325e-07, "loss": 0.0, "num_input_tokens_seen": 116175848, "step": 172410 }, { "epoch": 4.212127134585787, "grad_norm": 0.003317110938951373, "learning_rate": 1.475055715696517e-07, "loss": 0.0, "num_input_tokens_seen": 116179560, "step": 172415 }, { "epoch": 4.212249285417633, "grad_norm": 2.4914932510000654e-05, "learning_rate": 1.4746099697183945e-07, "loss": 0.0, "num_input_tokens_seen": 116182824, "step": 172420 }, { "epoch": 4.212371436249481, "grad_norm": 0.0004848266253247857, "learning_rate": 1.4741642857392045e-07, "loss": 0.0, "num_input_tokens_seen": 116186280, "step": 172425 }, { "epoch": 4.212493587081328, "grad_norm": 0.00012116871948819607, "learning_rate": 1.4737186637621812e-07, "loss": 0.0, "num_input_tokens_seen": 116189736, "step": 172430 }, { "epoch": 4.2126157379131755, "grad_norm": 0.019111763685941696, "learning_rate": 1.4732731037905698e-07, "loss": 0.0, "num_input_tokens_seen": 116192872, "step": 172435 }, { "epoch": 4.212737888745022, "grad_norm": 8.967510075308383e-05, "learning_rate": 1.4728276058276122e-07, "loss": 0.0, "num_input_tokens_seen": 116195752, "step": 172440 }, { "epoch": 4.21286003957687, "grad_norm": 0.0025152855087071657, "learning_rate": 1.4723821698765437e-07, "loss": 0.0, "num_input_tokens_seen": 116198824, "step": 172445 }, { "epoch": 4.212982190408717, "grad_norm": 0.0015792966587468982, "learning_rate": 1.471936795940607e-07, "loss": 0.0, "num_input_tokens_seen": 116202024, "step": 172450 }, { "epoch": 4.213104341240564, "grad_norm": 0.0015235176542773843, "learning_rate": 1.4714914840230385e-07, "loss": 0.0, "num_input_tokens_seen": 116205032, "step": 172455 }, { "epoch": 4.213226492072411, "grad_norm": 6.820956332376227e-05, "learning_rate": 1.471046234127079e-07, "loss": 0.0, "num_input_tokens_seen": 116208296, "step": 172460 }, { "epoch": 4.213348642904259, "grad_norm": 0.07334547489881516, "learning_rate": 1.4706010462559638e-07, "loss": 0.0, "num_input_tokens_seen": 116211688, "step": 172465 }, { "epoch": 4.213470793736105, "grad_norm": 0.0005423697293736041, "learning_rate": 1.470155920412932e-07, "loss": 0.0, "num_input_tokens_seen": 116214824, "step": 172470 }, { "epoch": 4.213592944567952, "grad_norm": 0.007120200432837009, "learning_rate": 1.4697108566012228e-07, "loss": 0.0, "num_input_tokens_seen": 116218088, "step": 172475 }, { "epoch": 4.2137150953998, "grad_norm": 0.01869882456958294, "learning_rate": 1.46926585482407e-07, "loss": 0.0, "num_input_tokens_seen": 116221096, "step": 172480 }, { "epoch": 4.2138372462316465, "grad_norm": 8.70914154802449e-05, "learning_rate": 1.4688209150847085e-07, "loss": 0.0, "num_input_tokens_seen": 116224040, "step": 172485 }, { "epoch": 4.213959397063494, "grad_norm": 0.0005563409649766982, "learning_rate": 1.4683760373863785e-07, "loss": 0.0, "num_input_tokens_seen": 116227048, "step": 172490 }, { "epoch": 4.214081547895341, "grad_norm": 5.803833482787013e-05, "learning_rate": 1.4679312217323102e-07, "loss": 0.0, "num_input_tokens_seen": 116230824, "step": 172495 }, { "epoch": 4.2142036987271885, "grad_norm": 1.2437372788554057e-05, "learning_rate": 1.4674864681257438e-07, "loss": 0.0, "num_input_tokens_seen": 116233960, "step": 172500 }, { "epoch": 4.214325849559035, "grad_norm": 0.004784159827977419, "learning_rate": 1.4670417765699072e-07, "loss": 0.0, "num_input_tokens_seen": 116236904, "step": 172505 }, { "epoch": 4.214448000390883, "grad_norm": 4.994161645299755e-05, "learning_rate": 1.4665971470680417e-07, "loss": 0.0, "num_input_tokens_seen": 116240296, "step": 172510 }, { "epoch": 4.21457015122273, "grad_norm": 0.0012720542727038264, "learning_rate": 1.4661525796233732e-07, "loss": 0.0, "num_input_tokens_seen": 116244072, "step": 172515 }, { "epoch": 4.214692302054577, "grad_norm": 0.0005264293286018074, "learning_rate": 1.4657080742391414e-07, "loss": 0.0, "num_input_tokens_seen": 116247400, "step": 172520 }, { "epoch": 4.214814452886424, "grad_norm": 0.010150831192731857, "learning_rate": 1.465263630918574e-07, "loss": 0.0, "num_input_tokens_seen": 116250792, "step": 172525 }, { "epoch": 4.214936603718272, "grad_norm": 0.0002481169649399817, "learning_rate": 1.4648192496649047e-07, "loss": 0.0, "num_input_tokens_seen": 116253928, "step": 172530 }, { "epoch": 4.215058754550118, "grad_norm": 4.520795846474357e-05, "learning_rate": 1.464374930481368e-07, "loss": 0.0, "num_input_tokens_seen": 116257768, "step": 172535 }, { "epoch": 4.215180905381966, "grad_norm": 8.352724398719147e-05, "learning_rate": 1.46393067337119e-07, "loss": 0.0, "num_input_tokens_seen": 116261480, "step": 172540 }, { "epoch": 4.215303056213813, "grad_norm": 4.275560422684066e-05, "learning_rate": 1.4634864783376055e-07, "loss": 0.0, "num_input_tokens_seen": 116264808, "step": 172545 }, { "epoch": 4.21542520704566, "grad_norm": 1.3493899132299703e-05, "learning_rate": 1.4630423453838427e-07, "loss": 0.0, "num_input_tokens_seen": 116268520, "step": 172550 }, { "epoch": 4.215547357877507, "grad_norm": 0.0005210431991145015, "learning_rate": 1.4625982745131315e-07, "loss": 0.0, "num_input_tokens_seen": 116271720, "step": 172555 }, { "epoch": 4.215669508709354, "grad_norm": 0.0029650393407791853, "learning_rate": 1.4621542657287033e-07, "loss": 0.0, "num_input_tokens_seen": 116274984, "step": 172560 }, { "epoch": 4.2157916595412015, "grad_norm": 6.22431471128948e-05, "learning_rate": 1.4617103190337853e-07, "loss": 0.0, "num_input_tokens_seen": 116278504, "step": 172565 }, { "epoch": 4.215913810373048, "grad_norm": 0.0007015995215624571, "learning_rate": 1.4612664344316073e-07, "loss": 0.0, "num_input_tokens_seen": 116281896, "step": 172570 }, { "epoch": 4.216035961204896, "grad_norm": 7.395140710286796e-05, "learning_rate": 1.4608226119253942e-07, "loss": 0.0, "num_input_tokens_seen": 116285416, "step": 172575 }, { "epoch": 4.216158112036743, "grad_norm": 0.0005374195170588791, "learning_rate": 1.4603788515183792e-07, "loss": 0.0, "num_input_tokens_seen": 116288808, "step": 172580 }, { "epoch": 4.21628026286859, "grad_norm": 2.6126805096282624e-05, "learning_rate": 1.4599351532137848e-07, "loss": 0.0, "num_input_tokens_seen": 116292072, "step": 172585 }, { "epoch": 4.216402413700437, "grad_norm": 0.0012434420641511679, "learning_rate": 1.459491517014837e-07, "loss": 0.0, "num_input_tokens_seen": 116295784, "step": 172590 }, { "epoch": 4.216524564532285, "grad_norm": 14.83011531829834, "learning_rate": 1.4590479429247672e-07, "loss": 0.1056, "num_input_tokens_seen": 116299624, "step": 172595 }, { "epoch": 4.216646715364131, "grad_norm": 3.7940932088531554e-05, "learning_rate": 1.458604430946795e-07, "loss": 0.0, "num_input_tokens_seen": 116302696, "step": 172600 }, { "epoch": 4.216768866195979, "grad_norm": 5.4114127124194056e-05, "learning_rate": 1.45816098108415e-07, "loss": 0.0, "num_input_tokens_seen": 116306408, "step": 172605 }, { "epoch": 4.216891017027826, "grad_norm": 1.4836638001725078e-05, "learning_rate": 1.4577175933400554e-07, "loss": 0.0, "num_input_tokens_seen": 116309416, "step": 172610 }, { "epoch": 4.217013167859673, "grad_norm": 0.0008437048527412117, "learning_rate": 1.4572742677177375e-07, "loss": 0.0, "num_input_tokens_seen": 116312168, "step": 172615 }, { "epoch": 4.21713531869152, "grad_norm": 3.5103963455185294e-05, "learning_rate": 1.4568310042204156e-07, "loss": 0.0, "num_input_tokens_seen": 116315496, "step": 172620 }, { "epoch": 4.217257469523368, "grad_norm": 0.005723453126847744, "learning_rate": 1.4563878028513177e-07, "loss": 0.0, "num_input_tokens_seen": 116318888, "step": 172625 }, { "epoch": 4.2173796203552145, "grad_norm": 0.001771946670487523, "learning_rate": 1.4559446636136675e-07, "loss": 0.0, "num_input_tokens_seen": 116322088, "step": 172630 }, { "epoch": 4.217501771187062, "grad_norm": 0.0040583363734185696, "learning_rate": 1.4555015865106835e-07, "loss": 0.0, "num_input_tokens_seen": 116325288, "step": 172635 }, { "epoch": 4.217623922018909, "grad_norm": 7.745297807559837e-06, "learning_rate": 1.455058571545593e-07, "loss": 0.0, "num_input_tokens_seen": 116328744, "step": 172640 }, { "epoch": 4.2177460728507565, "grad_norm": 0.0033273485023528337, "learning_rate": 1.454615618721612e-07, "loss": 0.0, "num_input_tokens_seen": 116332584, "step": 172645 }, { "epoch": 4.217868223682603, "grad_norm": 3.633387677837163e-05, "learning_rate": 1.4541727280419647e-07, "loss": 0.0, "num_input_tokens_seen": 116336232, "step": 172650 }, { "epoch": 4.21799037451445, "grad_norm": 0.00017991135246120393, "learning_rate": 1.4537298995098745e-07, "loss": 0.0, "num_input_tokens_seen": 116339240, "step": 172655 }, { "epoch": 4.218112525346298, "grad_norm": 3.644289608928375e-05, "learning_rate": 1.4532871331285568e-07, "loss": 0.0, "num_input_tokens_seen": 116342760, "step": 172660 }, { "epoch": 4.218234676178144, "grad_norm": 0.0003333684871904552, "learning_rate": 1.4528444289012353e-07, "loss": 0.0, "num_input_tokens_seen": 116345896, "step": 172665 }, { "epoch": 4.218356827009992, "grad_norm": 3.655048203654587e-05, "learning_rate": 1.4524017868311268e-07, "loss": 0.0001, "num_input_tokens_seen": 116349160, "step": 172670 }, { "epoch": 4.218478977841839, "grad_norm": 0.00015764005365781486, "learning_rate": 1.4519592069214538e-07, "loss": 0.0, "num_input_tokens_seen": 116352296, "step": 172675 }, { "epoch": 4.218601128673686, "grad_norm": 0.0005261016194708645, "learning_rate": 1.4515166891754292e-07, "loss": 0.0, "num_input_tokens_seen": 116355304, "step": 172680 }, { "epoch": 4.218723279505533, "grad_norm": 0.0004089116700924933, "learning_rate": 1.4510742335962777e-07, "loss": 0.0, "num_input_tokens_seen": 116358568, "step": 172685 }, { "epoch": 4.218845430337381, "grad_norm": 0.00021509006910491735, "learning_rate": 1.4506318401872143e-07, "loss": 0.0, "num_input_tokens_seen": 116362408, "step": 172690 }, { "epoch": 4.2189675811692275, "grad_norm": 0.00015573047858197242, "learning_rate": 1.4501895089514525e-07, "loss": 0.0, "num_input_tokens_seen": 116366120, "step": 172695 }, { "epoch": 4.219089732001075, "grad_norm": 4.082197392563103e-06, "learning_rate": 1.449747239892215e-07, "loss": 0.0, "num_input_tokens_seen": 116369640, "step": 172700 }, { "epoch": 4.219211882832922, "grad_norm": 8.524627628503367e-05, "learning_rate": 1.449305033012712e-07, "loss": 0.0, "num_input_tokens_seen": 116372648, "step": 172705 }, { "epoch": 4.21933403366477, "grad_norm": 0.0002137812552973628, "learning_rate": 1.4488628883161658e-07, "loss": 0.0, "num_input_tokens_seen": 116376168, "step": 172710 }, { "epoch": 4.219456184496616, "grad_norm": 0.00010211888002231717, "learning_rate": 1.4484208058057866e-07, "loss": 0.0, "num_input_tokens_seen": 116379240, "step": 172715 }, { "epoch": 4.219578335328464, "grad_norm": 0.0004470855346880853, "learning_rate": 1.4479787854847904e-07, "loss": 0.0002, "num_input_tokens_seen": 116382440, "step": 172720 }, { "epoch": 4.219700486160311, "grad_norm": 0.020762775093317032, "learning_rate": 1.447536827356396e-07, "loss": 0.0, "num_input_tokens_seen": 116385640, "step": 172725 }, { "epoch": 4.219822636992158, "grad_norm": 7.423523493343964e-05, "learning_rate": 1.4470949314238112e-07, "loss": 0.0, "num_input_tokens_seen": 116388968, "step": 172730 }, { "epoch": 4.219944787824005, "grad_norm": 0.0003863392921630293, "learning_rate": 1.4466530976902557e-07, "loss": 0.0, "num_input_tokens_seen": 116392296, "step": 172735 }, { "epoch": 4.220066938655853, "grad_norm": 1.6147825590451248e-05, "learning_rate": 1.446211326158936e-07, "loss": 0.0, "num_input_tokens_seen": 116395368, "step": 172740 }, { "epoch": 4.220189089487699, "grad_norm": 0.00059023208450526, "learning_rate": 1.445769616833069e-07, "loss": 0.0, "num_input_tokens_seen": 116399080, "step": 172745 }, { "epoch": 4.220311240319546, "grad_norm": 0.0002006474242080003, "learning_rate": 1.4453279697158683e-07, "loss": 0.0, "num_input_tokens_seen": 116402152, "step": 172750 }, { "epoch": 4.220433391151394, "grad_norm": 0.00967673771083355, "learning_rate": 1.4448863848105407e-07, "loss": 0.0, "num_input_tokens_seen": 116405736, "step": 172755 }, { "epoch": 4.220555541983241, "grad_norm": 0.00023713294649496675, "learning_rate": 1.444444862120303e-07, "loss": 0.0, "num_input_tokens_seen": 116408488, "step": 172760 }, { "epoch": 4.220677692815088, "grad_norm": 0.002359689911827445, "learning_rate": 1.4440034016483614e-07, "loss": 0.0, "num_input_tokens_seen": 116412136, "step": 172765 }, { "epoch": 4.220799843646935, "grad_norm": 0.0009306291467510164, "learning_rate": 1.4435620033979302e-07, "loss": 0.0, "num_input_tokens_seen": 116415656, "step": 172770 }, { "epoch": 4.220921994478783, "grad_norm": 4.442542831384344e-06, "learning_rate": 1.443120667372215e-07, "loss": 0.0, "num_input_tokens_seen": 116419112, "step": 172775 }, { "epoch": 4.221044145310629, "grad_norm": 0.0063672238029539585, "learning_rate": 1.4426793935744287e-07, "loss": 0.0, "num_input_tokens_seen": 116422952, "step": 172780 }, { "epoch": 4.221166296142477, "grad_norm": 4.459876072360203e-05, "learning_rate": 1.442238182007781e-07, "loss": 0.0, "num_input_tokens_seen": 116426280, "step": 172785 }, { "epoch": 4.221288446974324, "grad_norm": 5.368567872210406e-05, "learning_rate": 1.4417970326754803e-07, "loss": 0.0, "num_input_tokens_seen": 116429352, "step": 172790 }, { "epoch": 4.221410597806171, "grad_norm": 0.00011356353934388608, "learning_rate": 1.44135594558073e-07, "loss": 0.0305, "num_input_tokens_seen": 116433000, "step": 172795 }, { "epoch": 4.221532748638018, "grad_norm": 0.0008511125342920423, "learning_rate": 1.4409149207267434e-07, "loss": 0.0, "num_input_tokens_seen": 116436392, "step": 172800 }, { "epoch": 4.221654899469866, "grad_norm": 3.7000179872848094e-05, "learning_rate": 1.4404739581167236e-07, "loss": 0.0, "num_input_tokens_seen": 116439720, "step": 172805 }, { "epoch": 4.2217770503017125, "grad_norm": 3.221071165171452e-05, "learning_rate": 1.4400330577538822e-07, "loss": 0.0, "num_input_tokens_seen": 116443240, "step": 172810 }, { "epoch": 4.22189920113356, "grad_norm": 0.00017757921887096018, "learning_rate": 1.43959221964142e-07, "loss": 0.0, "num_input_tokens_seen": 116446696, "step": 172815 }, { "epoch": 4.222021351965407, "grad_norm": 0.00012855124077759683, "learning_rate": 1.439151443782548e-07, "loss": 0.0, "num_input_tokens_seen": 116450792, "step": 172820 }, { "epoch": 4.2221435027972545, "grad_norm": 2.032962402154226e-05, "learning_rate": 1.4387107301804668e-07, "loss": 0.0, "num_input_tokens_seen": 116454184, "step": 172825 }, { "epoch": 4.222265653629101, "grad_norm": 0.006621385924518108, "learning_rate": 1.4382700788383873e-07, "loss": 0.0, "num_input_tokens_seen": 116457320, "step": 172830 }, { "epoch": 4.222387804460948, "grad_norm": 8.379903192690108e-06, "learning_rate": 1.4378294897595068e-07, "loss": 0.0, "num_input_tokens_seen": 116461480, "step": 172835 }, { "epoch": 4.222509955292796, "grad_norm": 0.0001950185833266005, "learning_rate": 1.4373889629470336e-07, "loss": 0.0, "num_input_tokens_seen": 116464872, "step": 172840 }, { "epoch": 4.222632106124642, "grad_norm": 0.008499527350068092, "learning_rate": 1.4369484984041735e-07, "loss": 0.0, "num_input_tokens_seen": 116468200, "step": 172845 }, { "epoch": 4.22275425695649, "grad_norm": 0.0044451188296079636, "learning_rate": 1.4365080961341246e-07, "loss": 0.0, "num_input_tokens_seen": 116471592, "step": 172850 }, { "epoch": 4.222876407788337, "grad_norm": 7.82210463512456e-06, "learning_rate": 1.4360677561400947e-07, "loss": 0.0, "num_input_tokens_seen": 116475240, "step": 172855 }, { "epoch": 4.222998558620184, "grad_norm": 0.00022457803424913436, "learning_rate": 1.435627478425282e-07, "loss": 0.0, "num_input_tokens_seen": 116479144, "step": 172860 }, { "epoch": 4.223120709452031, "grad_norm": 0.00020594919624272734, "learning_rate": 1.4351872629928907e-07, "loss": 0.0, "num_input_tokens_seen": 116482472, "step": 172865 }, { "epoch": 4.223242860283879, "grad_norm": 0.0017935315845534205, "learning_rate": 1.4347471098461194e-07, "loss": 0.0, "num_input_tokens_seen": 116485544, "step": 172870 }, { "epoch": 4.2233650111157255, "grad_norm": 0.004511896055191755, "learning_rate": 1.434307018988171e-07, "loss": 0.0, "num_input_tokens_seen": 116488936, "step": 172875 }, { "epoch": 4.223487161947573, "grad_norm": 0.0023193045053631067, "learning_rate": 1.4338669904222478e-07, "loss": 0.0, "num_input_tokens_seen": 116492008, "step": 172880 }, { "epoch": 4.22360931277942, "grad_norm": 0.0012620068155229092, "learning_rate": 1.4334270241515466e-07, "loss": 0.0, "num_input_tokens_seen": 116495400, "step": 172885 }, { "epoch": 4.2237314636112675, "grad_norm": 0.0058755516074597836, "learning_rate": 1.4329871201792698e-07, "loss": 0.0, "num_input_tokens_seen": 116498920, "step": 172890 }, { "epoch": 4.223853614443114, "grad_norm": 0.00023124816652853042, "learning_rate": 1.4325472785086147e-07, "loss": 0.0, "num_input_tokens_seen": 116502056, "step": 172895 }, { "epoch": 4.223975765274962, "grad_norm": 0.009667105972766876, "learning_rate": 1.4321074991427785e-07, "loss": 0.0002, "num_input_tokens_seen": 116505320, "step": 172900 }, { "epoch": 4.224097916106809, "grad_norm": 0.005875737406313419, "learning_rate": 1.431667782084962e-07, "loss": 0.0, "num_input_tokens_seen": 116508520, "step": 172905 }, { "epoch": 4.224220066938656, "grad_norm": 21.689476013183594, "learning_rate": 1.4312281273383608e-07, "loss": 0.0572, "num_input_tokens_seen": 116511720, "step": 172910 }, { "epoch": 4.224342217770503, "grad_norm": 0.0006670998991467059, "learning_rate": 1.4307885349061755e-07, "loss": 0.0, "num_input_tokens_seen": 116514792, "step": 172915 }, { "epoch": 4.22446436860235, "grad_norm": 2.3548327590106055e-05, "learning_rate": 1.4303490047915989e-07, "loss": 0.0, "num_input_tokens_seen": 116518056, "step": 172920 }, { "epoch": 4.224586519434197, "grad_norm": 0.0005599820869974792, "learning_rate": 1.429909536997831e-07, "loss": 0.0, "num_input_tokens_seen": 116521192, "step": 172925 }, { "epoch": 4.224708670266044, "grad_norm": 0.07890927791595459, "learning_rate": 1.4294701315280645e-07, "loss": 0.0, "num_input_tokens_seen": 116524264, "step": 172930 }, { "epoch": 4.224830821097892, "grad_norm": 0.0008640710148029029, "learning_rate": 1.4290307883854958e-07, "loss": 0.0, "num_input_tokens_seen": 116527784, "step": 172935 }, { "epoch": 4.2249529719297385, "grad_norm": 7.143465336412191e-05, "learning_rate": 1.4285915075733225e-07, "loss": 0.0, "num_input_tokens_seen": 116530920, "step": 172940 }, { "epoch": 4.225075122761586, "grad_norm": 4.0271534089697525e-05, "learning_rate": 1.428152289094735e-07, "loss": 0.0, "num_input_tokens_seen": 116534056, "step": 172945 }, { "epoch": 4.225197273593433, "grad_norm": 8.894584607332945e-05, "learning_rate": 1.4277131329529323e-07, "loss": 0.0, "num_input_tokens_seen": 116537384, "step": 172950 }, { "epoch": 4.2253194244252805, "grad_norm": 1.7404921891284175e-05, "learning_rate": 1.427274039151103e-07, "loss": 0.0, "num_input_tokens_seen": 116540968, "step": 172955 }, { "epoch": 4.225441575257127, "grad_norm": 4.886610258836299e-05, "learning_rate": 1.426835007692443e-07, "loss": 0.0, "num_input_tokens_seen": 116544168, "step": 172960 }, { "epoch": 4.225563726088975, "grad_norm": 0.00024690740974619985, "learning_rate": 1.4263960385801465e-07, "loss": 0.0, "num_input_tokens_seen": 116547112, "step": 172965 }, { "epoch": 4.225685876920822, "grad_norm": 0.00024043295707087964, "learning_rate": 1.4259571318174014e-07, "loss": 0.0, "num_input_tokens_seen": 116550120, "step": 172970 }, { "epoch": 4.225808027752669, "grad_norm": 0.00130647758487612, "learning_rate": 1.4255182874074045e-07, "loss": 0.0, "num_input_tokens_seen": 116553320, "step": 172975 }, { "epoch": 4.225930178584516, "grad_norm": 0.0005790484719909728, "learning_rate": 1.4250795053533438e-07, "loss": 0.0, "num_input_tokens_seen": 116556776, "step": 172980 }, { "epoch": 4.226052329416364, "grad_norm": 0.0015895002288743854, "learning_rate": 1.4246407856584132e-07, "loss": 0.0, "num_input_tokens_seen": 116559912, "step": 172985 }, { "epoch": 4.22617448024821, "grad_norm": 0.0025169982109218836, "learning_rate": 1.4242021283257976e-07, "loss": 0.0, "num_input_tokens_seen": 116563112, "step": 172990 }, { "epoch": 4.226296631080058, "grad_norm": 1.5641830032109283e-05, "learning_rate": 1.4237635333586938e-07, "loss": 0.0, "num_input_tokens_seen": 116566312, "step": 172995 }, { "epoch": 4.226418781911905, "grad_norm": 0.011025205254554749, "learning_rate": 1.423325000760287e-07, "loss": 0.0529, "num_input_tokens_seen": 116569768, "step": 173000 }, { "epoch": 4.226540932743752, "grad_norm": 0.00022293122310657054, "learning_rate": 1.422886530533769e-07, "loss": 0.0, "num_input_tokens_seen": 116573032, "step": 173005 }, { "epoch": 4.226663083575599, "grad_norm": 3.6755925975739956e-05, "learning_rate": 1.422448122682327e-07, "loss": 0.0, "num_input_tokens_seen": 116576296, "step": 173010 }, { "epoch": 4.226785234407446, "grad_norm": 0.02970314212143421, "learning_rate": 1.4220097772091478e-07, "loss": 0.0002, "num_input_tokens_seen": 116580008, "step": 173015 }, { "epoch": 4.2269073852392935, "grad_norm": 0.0007604420534335077, "learning_rate": 1.4215714941174227e-07, "loss": 0.0, "num_input_tokens_seen": 116583720, "step": 173020 }, { "epoch": 4.22702953607114, "grad_norm": 0.0001692136429483071, "learning_rate": 1.4211332734103343e-07, "loss": 0.0, "num_input_tokens_seen": 116586920, "step": 173025 }, { "epoch": 4.227151686902988, "grad_norm": 2.0400722860358655e-05, "learning_rate": 1.4206951150910727e-07, "loss": 0.0, "num_input_tokens_seen": 116590440, "step": 173030 }, { "epoch": 4.227273837734835, "grad_norm": 5.274592331261374e-05, "learning_rate": 1.420257019162826e-07, "loss": 0.0, "num_input_tokens_seen": 116593576, "step": 173035 }, { "epoch": 4.227395988566682, "grad_norm": 0.00022966494725551456, "learning_rate": 1.4198189856287746e-07, "loss": 0.0, "num_input_tokens_seen": 116597096, "step": 173040 }, { "epoch": 4.227518139398529, "grad_norm": 0.001598756411112845, "learning_rate": 1.4193810144921114e-07, "loss": 0.0, "num_input_tokens_seen": 116600488, "step": 173045 }, { "epoch": 4.227640290230377, "grad_norm": 6.039086656528525e-05, "learning_rate": 1.4189431057560142e-07, "loss": 0.0, "num_input_tokens_seen": 116604072, "step": 173050 }, { "epoch": 4.227762441062223, "grad_norm": 3.7535726733040065e-05, "learning_rate": 1.4185052594236702e-07, "loss": 0.0, "num_input_tokens_seen": 116607464, "step": 173055 }, { "epoch": 4.227884591894071, "grad_norm": 0.00012207684630993754, "learning_rate": 1.418067475498267e-07, "loss": 0.0, "num_input_tokens_seen": 116610856, "step": 173060 }, { "epoch": 4.228006742725918, "grad_norm": 3.734017082024366e-05, "learning_rate": 1.417629753982983e-07, "loss": 0.0, "num_input_tokens_seen": 116613864, "step": 173065 }, { "epoch": 4.2281288935577654, "grad_norm": 0.0005823975661769509, "learning_rate": 1.4171920948810056e-07, "loss": 0.0, "num_input_tokens_seen": 116616872, "step": 173070 }, { "epoch": 4.228251044389612, "grad_norm": 0.00012418540427461267, "learning_rate": 1.4167544981955148e-07, "loss": 0.0, "num_input_tokens_seen": 116620392, "step": 173075 }, { "epoch": 4.22837319522146, "grad_norm": 3.818258846877143e-06, "learning_rate": 1.4163169639296946e-07, "loss": 0.0, "num_input_tokens_seen": 116623976, "step": 173080 }, { "epoch": 4.228495346053307, "grad_norm": 9.804531873669475e-05, "learning_rate": 1.4158794920867245e-07, "loss": 0.0, "num_input_tokens_seen": 116627752, "step": 173085 }, { "epoch": 4.228617496885154, "grad_norm": 0.0018733707256615162, "learning_rate": 1.4154420826697888e-07, "loss": 0.0, "num_input_tokens_seen": 116631144, "step": 173090 }, { "epoch": 4.228739647717001, "grad_norm": 0.00010759669385151938, "learning_rate": 1.415004735682068e-07, "loss": 0.0, "num_input_tokens_seen": 116634792, "step": 173095 }, { "epoch": 4.228861798548848, "grad_norm": 0.006699483375996351, "learning_rate": 1.4145674511267425e-07, "loss": 0.0, "num_input_tokens_seen": 116638248, "step": 173100 }, { "epoch": 4.228983949380695, "grad_norm": 4.7020821511978284e-05, "learning_rate": 1.414130229006989e-07, "loss": 0.0, "num_input_tokens_seen": 116641448, "step": 173105 }, { "epoch": 4.229106100212542, "grad_norm": 4.57140886283014e-05, "learning_rate": 1.4136930693259918e-07, "loss": 0.0, "num_input_tokens_seen": 116645096, "step": 173110 }, { "epoch": 4.22922825104439, "grad_norm": 0.0014901167014613748, "learning_rate": 1.4132559720869264e-07, "loss": 0.0, "num_input_tokens_seen": 116648680, "step": 173115 }, { "epoch": 4.2293504018762365, "grad_norm": 0.00018210777489002794, "learning_rate": 1.4128189372929755e-07, "loss": 0.0, "num_input_tokens_seen": 116651880, "step": 173120 }, { "epoch": 4.229472552708084, "grad_norm": 0.00023117540695238858, "learning_rate": 1.4123819649473123e-07, "loss": 0.0, "num_input_tokens_seen": 116655208, "step": 173125 }, { "epoch": 4.229594703539931, "grad_norm": 6.564979412360117e-05, "learning_rate": 1.4119450550531198e-07, "loss": 0.0, "num_input_tokens_seen": 116658920, "step": 173130 }, { "epoch": 4.2297168543717785, "grad_norm": 0.001030170125886798, "learning_rate": 1.411508207613571e-07, "loss": 0.0, "num_input_tokens_seen": 116662120, "step": 173135 }, { "epoch": 4.229839005203625, "grad_norm": 1.3802886314806528e-05, "learning_rate": 1.4110714226318455e-07, "loss": 0.0, "num_input_tokens_seen": 116665512, "step": 173140 }, { "epoch": 4.229961156035473, "grad_norm": 0.0002681001788005233, "learning_rate": 1.4106347001111173e-07, "loss": 0.0, "num_input_tokens_seen": 116669160, "step": 173145 }, { "epoch": 4.23008330686732, "grad_norm": 8.571257785661146e-05, "learning_rate": 1.4101980400545643e-07, "loss": 0.0, "num_input_tokens_seen": 116672488, "step": 173150 }, { "epoch": 4.230205457699167, "grad_norm": 0.0009145172662101686, "learning_rate": 1.4097614424653624e-07, "loss": 0.0, "num_input_tokens_seen": 116675752, "step": 173155 }, { "epoch": 4.230327608531014, "grad_norm": 1.8907823687186465e-05, "learning_rate": 1.409324907346685e-07, "loss": 0.0, "num_input_tokens_seen": 116678952, "step": 173160 }, { "epoch": 4.230449759362862, "grad_norm": 8.053556666709483e-05, "learning_rate": 1.4088884347017094e-07, "loss": 0.0, "num_input_tokens_seen": 116682280, "step": 173165 }, { "epoch": 4.230571910194708, "grad_norm": 0.005030336324125528, "learning_rate": 1.4084520245336052e-07, "loss": 0.0, "num_input_tokens_seen": 116685480, "step": 173170 }, { "epoch": 4.230694061026556, "grad_norm": 0.00014258605369832367, "learning_rate": 1.408015676845551e-07, "loss": 0.0, "num_input_tokens_seen": 116688808, "step": 173175 }, { "epoch": 4.230816211858403, "grad_norm": 0.001624719938263297, "learning_rate": 1.4075793916407154e-07, "loss": 0.0, "num_input_tokens_seen": 116692008, "step": 173180 }, { "epoch": 4.2309383626902495, "grad_norm": 0.00024306464183609933, "learning_rate": 1.4071431689222735e-07, "loss": 0.0, "num_input_tokens_seen": 116695208, "step": 173185 }, { "epoch": 4.231060513522097, "grad_norm": 0.0005330306012183428, "learning_rate": 1.4067070086933996e-07, "loss": 0.0, "num_input_tokens_seen": 116698536, "step": 173190 }, { "epoch": 4.231182664353944, "grad_norm": 1.0813733752002008e-05, "learning_rate": 1.4062709109572623e-07, "loss": 0.0, "num_input_tokens_seen": 116702760, "step": 173195 }, { "epoch": 4.2313048151857915, "grad_norm": 6.632073927903548e-05, "learning_rate": 1.4058348757170367e-07, "loss": 0.0, "num_input_tokens_seen": 116706216, "step": 173200 }, { "epoch": 4.231426966017638, "grad_norm": 0.02792537584900856, "learning_rate": 1.4053989029758905e-07, "loss": 0.0, "num_input_tokens_seen": 116709416, "step": 173205 }, { "epoch": 4.231549116849486, "grad_norm": 0.003411710960790515, "learning_rate": 1.4049629927369934e-07, "loss": 0.0, "num_input_tokens_seen": 116713256, "step": 173210 }, { "epoch": 4.231671267681333, "grad_norm": 0.002977790078148246, "learning_rate": 1.40452714500352e-07, "loss": 0.0, "num_input_tokens_seen": 116716648, "step": 173215 }, { "epoch": 4.23179341851318, "grad_norm": 0.034006424248218536, "learning_rate": 1.4040913597786342e-07, "loss": 0.0, "num_input_tokens_seen": 116719592, "step": 173220 }, { "epoch": 4.231915569345027, "grad_norm": 2.359913196414709e-05, "learning_rate": 1.4036556370655105e-07, "loss": 0.0, "num_input_tokens_seen": 116722984, "step": 173225 }, { "epoch": 4.232037720176875, "grad_norm": 1.3884271538699977e-05, "learning_rate": 1.4032199768673124e-07, "loss": 0.0, "num_input_tokens_seen": 116726888, "step": 173230 }, { "epoch": 4.232159871008721, "grad_norm": 0.0003605287929531187, "learning_rate": 1.402784379187213e-07, "loss": 0.0, "num_input_tokens_seen": 116730536, "step": 173235 }, { "epoch": 4.232282021840569, "grad_norm": 0.000187235651537776, "learning_rate": 1.4023488440283771e-07, "loss": 0.0001, "num_input_tokens_seen": 116733800, "step": 173240 }, { "epoch": 4.232404172672416, "grad_norm": 0.00020515448704827577, "learning_rate": 1.4019133713939713e-07, "loss": 0.0, "num_input_tokens_seen": 116737448, "step": 173245 }, { "epoch": 4.232526323504263, "grad_norm": 6.843862502137199e-05, "learning_rate": 1.4014779612871673e-07, "loss": 0.0, "num_input_tokens_seen": 116740392, "step": 173250 }, { "epoch": 4.23264847433611, "grad_norm": 0.00012958318984601647, "learning_rate": 1.4010426137111265e-07, "loss": 0.0451, "num_input_tokens_seen": 116743464, "step": 173255 }, { "epoch": 4.232770625167958, "grad_norm": 3.7418103602249175e-05, "learning_rate": 1.4006073286690178e-07, "loss": 0.0, "num_input_tokens_seen": 116747240, "step": 173260 }, { "epoch": 4.2328927759998045, "grad_norm": 3.6361041566124186e-05, "learning_rate": 1.4001721061640038e-07, "loss": 0.0, "num_input_tokens_seen": 116750760, "step": 173265 }, { "epoch": 4.233014926831652, "grad_norm": 0.00014582725998479873, "learning_rate": 1.3997369461992513e-07, "loss": 0.0, "num_input_tokens_seen": 116753960, "step": 173270 }, { "epoch": 4.233137077663499, "grad_norm": 1.2433220035745762e-05, "learning_rate": 1.3993018487779262e-07, "loss": 0.0, "num_input_tokens_seen": 116757672, "step": 173275 }, { "epoch": 4.233259228495346, "grad_norm": 0.0011232885299250484, "learning_rate": 1.39886681390319e-07, "loss": 0.0005, "num_input_tokens_seen": 116760808, "step": 173280 }, { "epoch": 4.233381379327193, "grad_norm": 8.350649295607582e-05, "learning_rate": 1.3984318415782103e-07, "loss": 0.0, "num_input_tokens_seen": 116764008, "step": 173285 }, { "epoch": 4.23350353015904, "grad_norm": 2.9423656087601557e-05, "learning_rate": 1.3979969318061457e-07, "loss": 0.0, "num_input_tokens_seen": 116767528, "step": 173290 }, { "epoch": 4.233625680990888, "grad_norm": 0.00022587507555726916, "learning_rate": 1.3975620845901624e-07, "loss": 0.0, "num_input_tokens_seen": 116771176, "step": 173295 }, { "epoch": 4.233747831822734, "grad_norm": 1.5688436178606935e-05, "learning_rate": 1.3971272999334206e-07, "loss": 0.0, "num_input_tokens_seen": 116774568, "step": 173300 }, { "epoch": 4.233869982654582, "grad_norm": 0.00046054826816543937, "learning_rate": 1.3966925778390836e-07, "loss": 0.0, "num_input_tokens_seen": 116777768, "step": 173305 }, { "epoch": 4.233992133486429, "grad_norm": 0.00010839592141564935, "learning_rate": 1.3962579183103106e-07, "loss": 0.0, "num_input_tokens_seen": 116781352, "step": 173310 }, { "epoch": 4.234114284318276, "grad_norm": 4.6599354391219094e-05, "learning_rate": 1.3958233213502669e-07, "loss": 0.0, "num_input_tokens_seen": 116784680, "step": 173315 }, { "epoch": 4.234236435150123, "grad_norm": 0.00041652043000794947, "learning_rate": 1.3953887869621095e-07, "loss": 0.0584, "num_input_tokens_seen": 116787880, "step": 173320 }, { "epoch": 4.234358585981971, "grad_norm": 0.0005544126615859568, "learning_rate": 1.3949543151489973e-07, "loss": 0.0, "num_input_tokens_seen": 116790824, "step": 173325 }, { "epoch": 4.2344807368138175, "grad_norm": 0.00018715529586188495, "learning_rate": 1.3945199059140932e-07, "loss": 0.0, "num_input_tokens_seen": 116793896, "step": 173330 }, { "epoch": 4.234602887645665, "grad_norm": 1.5036851436889265e-05, "learning_rate": 1.3940855592605538e-07, "loss": 0.0, "num_input_tokens_seen": 116796840, "step": 173335 }, { "epoch": 4.234725038477512, "grad_norm": 0.0010711431968957186, "learning_rate": 1.3936512751915387e-07, "loss": 0.0, "num_input_tokens_seen": 116799976, "step": 173340 }, { "epoch": 4.2348471893093595, "grad_norm": 0.000726178870536387, "learning_rate": 1.3932170537102084e-07, "loss": 0.0, "num_input_tokens_seen": 116803432, "step": 173345 }, { "epoch": 4.234969340141206, "grad_norm": 0.00036462812568061054, "learning_rate": 1.3927828948197162e-07, "loss": 0.0, "num_input_tokens_seen": 116806504, "step": 173350 }, { "epoch": 4.235091490973054, "grad_norm": 3.795464726863429e-05, "learning_rate": 1.392348798523225e-07, "loss": 0.0, "num_input_tokens_seen": 116810152, "step": 173355 }, { "epoch": 4.235213641804901, "grad_norm": 0.4274020493030548, "learning_rate": 1.391914764823885e-07, "loss": 0.0002, "num_input_tokens_seen": 116813352, "step": 173360 }, { "epoch": 4.235335792636748, "grad_norm": 5.8089312915399205e-06, "learning_rate": 1.3914807937248575e-07, "loss": 0.0, "num_input_tokens_seen": 116816936, "step": 173365 }, { "epoch": 4.235457943468595, "grad_norm": 7.85816228017211e-05, "learning_rate": 1.3910468852292977e-07, "loss": 0.0, "num_input_tokens_seen": 116820072, "step": 173370 }, { "epoch": 4.235580094300442, "grad_norm": 0.00032599607948213816, "learning_rate": 1.3906130393403593e-07, "loss": 0.0, "num_input_tokens_seen": 116823592, "step": 173375 }, { "epoch": 4.235702245132289, "grad_norm": 0.00011063476995332167, "learning_rate": 1.3901792560612002e-07, "loss": 0.0, "num_input_tokens_seen": 116827240, "step": 173380 }, { "epoch": 4.235824395964136, "grad_norm": 2.9659470328624593e-06, "learning_rate": 1.3897455353949715e-07, "loss": 0.0, "num_input_tokens_seen": 116830632, "step": 173385 }, { "epoch": 4.235946546795984, "grad_norm": 5.866341962246224e-05, "learning_rate": 1.389311877344832e-07, "loss": 0.0, "num_input_tokens_seen": 116834152, "step": 173390 }, { "epoch": 4.236068697627831, "grad_norm": 0.00014773337170481682, "learning_rate": 1.38887828191393e-07, "loss": 0.0, "num_input_tokens_seen": 116837224, "step": 173395 }, { "epoch": 4.236190848459678, "grad_norm": 0.00015105612692423165, "learning_rate": 1.3884447491054207e-07, "loss": 0.0, "num_input_tokens_seen": 116840488, "step": 173400 }, { "epoch": 4.236312999291525, "grad_norm": 8.750290726311505e-05, "learning_rate": 1.3880112789224596e-07, "loss": 0.0, "num_input_tokens_seen": 116844392, "step": 173405 }, { "epoch": 4.236435150123373, "grad_norm": 0.00471015740185976, "learning_rate": 1.3875778713681975e-07, "loss": 0.0, "num_input_tokens_seen": 116847720, "step": 173410 }, { "epoch": 4.236557300955219, "grad_norm": 4.11164146498777e-05, "learning_rate": 1.3871445264457826e-07, "loss": 0.0, "num_input_tokens_seen": 116850984, "step": 173415 }, { "epoch": 4.236679451787067, "grad_norm": 4.238776455167681e-05, "learning_rate": 1.3867112441583718e-07, "loss": 0.0536, "num_input_tokens_seen": 116854120, "step": 173420 }, { "epoch": 4.236801602618914, "grad_norm": 0.0008721463964320719, "learning_rate": 1.3862780245091133e-07, "loss": 0.0, "num_input_tokens_seen": 116857448, "step": 173425 }, { "epoch": 4.236923753450761, "grad_norm": 4.055196768604219e-05, "learning_rate": 1.3858448675011558e-07, "loss": 0.0, "num_input_tokens_seen": 116860648, "step": 173430 }, { "epoch": 4.237045904282608, "grad_norm": 0.0018774106865748763, "learning_rate": 1.3854117731376515e-07, "loss": 0.0, "num_input_tokens_seen": 116864296, "step": 173435 }, { "epoch": 4.237168055114456, "grad_norm": 0.00010131034650839865, "learning_rate": 1.384978741421752e-07, "loss": 0.0, "num_input_tokens_seen": 116867496, "step": 173440 }, { "epoch": 4.2372902059463025, "grad_norm": 0.0007216433878056705, "learning_rate": 1.3845457723566024e-07, "loss": 0.0563, "num_input_tokens_seen": 116870632, "step": 173445 }, { "epoch": 4.237412356778149, "grad_norm": 0.0008672663243487477, "learning_rate": 1.3841128659453548e-07, "loss": 0.0, "num_input_tokens_seen": 116874024, "step": 173450 }, { "epoch": 4.237534507609997, "grad_norm": 0.0017662736354395747, "learning_rate": 1.3836800221911537e-07, "loss": 0.0, "num_input_tokens_seen": 116877544, "step": 173455 }, { "epoch": 4.237656658441844, "grad_norm": 0.004523616284132004, "learning_rate": 1.3832472410971485e-07, "loss": 0.0, "num_input_tokens_seen": 116880744, "step": 173460 }, { "epoch": 4.237778809273691, "grad_norm": 0.00046189091517589986, "learning_rate": 1.38281452266649e-07, "loss": 0.0, "num_input_tokens_seen": 116883816, "step": 173465 }, { "epoch": 4.237900960105538, "grad_norm": 1.331346902588848e-05, "learning_rate": 1.3823818669023202e-07, "loss": 0.0, "num_input_tokens_seen": 116887144, "step": 173470 }, { "epoch": 4.238023110937386, "grad_norm": 5.324160156305879e-06, "learning_rate": 1.3819492738077887e-07, "loss": 0.0, "num_input_tokens_seen": 116890472, "step": 173475 }, { "epoch": 4.238145261769232, "grad_norm": 2.3222146034240723, "learning_rate": 1.3815167433860387e-07, "loss": 0.0004, "num_input_tokens_seen": 116893480, "step": 173480 }, { "epoch": 4.23826741260108, "grad_norm": 0.00231824885122478, "learning_rate": 1.3810842756402184e-07, "loss": 0.0, "num_input_tokens_seen": 116896744, "step": 173485 }, { "epoch": 4.238389563432927, "grad_norm": 0.00015672051813453436, "learning_rate": 1.3806518705734694e-07, "loss": 0.0, "num_input_tokens_seen": 116900072, "step": 173490 }, { "epoch": 4.238511714264774, "grad_norm": 0.0015891582006588578, "learning_rate": 1.3802195281889383e-07, "loss": 0.0, "num_input_tokens_seen": 116903272, "step": 173495 }, { "epoch": 4.238633865096621, "grad_norm": 0.0005230855895206332, "learning_rate": 1.379787248489771e-07, "loss": 0.0, "num_input_tokens_seen": 116906728, "step": 173500 }, { "epoch": 4.238756015928469, "grad_norm": 2.7853136998601258e-05, "learning_rate": 1.379355031479108e-07, "loss": 0.0, "num_input_tokens_seen": 116910248, "step": 173505 }, { "epoch": 4.2388781667603155, "grad_norm": 4.448258187039755e-05, "learning_rate": 1.3789228771600959e-07, "loss": 0.0, "num_input_tokens_seen": 116913384, "step": 173510 }, { "epoch": 4.239000317592163, "grad_norm": 0.00026565848384052515, "learning_rate": 1.378490785535875e-07, "loss": 0.0, "num_input_tokens_seen": 116919144, "step": 173515 }, { "epoch": 4.23912246842401, "grad_norm": 0.00017902874969877303, "learning_rate": 1.378058756609587e-07, "loss": 0.0, "num_input_tokens_seen": 116922408, "step": 173520 }, { "epoch": 4.2392446192558575, "grad_norm": 0.00046245678095147014, "learning_rate": 1.3776267903843763e-07, "loss": 0.0, "num_input_tokens_seen": 116926056, "step": 173525 }, { "epoch": 4.239366770087704, "grad_norm": 0.00010655471123754978, "learning_rate": 1.3771948868633797e-07, "loss": 0.0, "num_input_tokens_seen": 116929448, "step": 173530 }, { "epoch": 4.239488920919552, "grad_norm": 0.0017132902285084128, "learning_rate": 1.3767630460497447e-07, "loss": 0.0, "num_input_tokens_seen": 116932648, "step": 173535 }, { "epoch": 4.239611071751399, "grad_norm": 0.0010654488578438759, "learning_rate": 1.3763312679466054e-07, "loss": 0.0001, "num_input_tokens_seen": 116936104, "step": 173540 }, { "epoch": 4.239733222583245, "grad_norm": 6.303595000645146e-05, "learning_rate": 1.375899552557106e-07, "loss": 0.0, "num_input_tokens_seen": 116939496, "step": 173545 }, { "epoch": 4.239855373415093, "grad_norm": 3.1240680982591584e-05, "learning_rate": 1.3754678998843838e-07, "loss": 0.0, "num_input_tokens_seen": 116942888, "step": 173550 }, { "epoch": 4.23997752424694, "grad_norm": 9.1200927272439e-05, "learning_rate": 1.3750363099315777e-07, "loss": 0.0, "num_input_tokens_seen": 116946216, "step": 173555 }, { "epoch": 4.240099675078787, "grad_norm": 9.304591003456153e-06, "learning_rate": 1.3746047827018302e-07, "loss": 0.0, "num_input_tokens_seen": 116949288, "step": 173560 }, { "epoch": 4.240221825910634, "grad_norm": 1.8034832464763895e-05, "learning_rate": 1.374173318198274e-07, "loss": 0.0, "num_input_tokens_seen": 116952872, "step": 173565 }, { "epoch": 4.240343976742482, "grad_norm": 5.101427177578444e-06, "learning_rate": 1.3737419164240527e-07, "loss": 0.0, "num_input_tokens_seen": 116956776, "step": 173570 }, { "epoch": 4.2404661275743285, "grad_norm": 0.002323642373085022, "learning_rate": 1.3733105773822973e-07, "loss": 0.0, "num_input_tokens_seen": 116960104, "step": 173575 }, { "epoch": 4.240588278406176, "grad_norm": 127.75724792480469, "learning_rate": 1.3728793010761497e-07, "loss": 0.0139, "num_input_tokens_seen": 116963240, "step": 173580 }, { "epoch": 4.240710429238023, "grad_norm": 0.00025013191043399274, "learning_rate": 1.372448087508742e-07, "loss": 0.0, "num_input_tokens_seen": 116966504, "step": 173585 }, { "epoch": 4.2408325800698705, "grad_norm": 0.00014707444643136114, "learning_rate": 1.3720169366832134e-07, "loss": 0.0, "num_input_tokens_seen": 116969512, "step": 173590 }, { "epoch": 4.240954730901717, "grad_norm": 6.153630965854973e-05, "learning_rate": 1.3715858486027e-07, "loss": 0.0005, "num_input_tokens_seen": 116972456, "step": 173595 }, { "epoch": 4.241076881733565, "grad_norm": 7.514778189943172e-06, "learning_rate": 1.371154823270332e-07, "loss": 0.0, "num_input_tokens_seen": 116975592, "step": 173600 }, { "epoch": 4.241199032565412, "grad_norm": 4.237967004883103e-05, "learning_rate": 1.3707238606892503e-07, "loss": 0.0343, "num_input_tokens_seen": 116978920, "step": 173605 }, { "epoch": 4.241321183397259, "grad_norm": 0.000658139237202704, "learning_rate": 1.3702929608625823e-07, "loss": 0.0, "num_input_tokens_seen": 116981928, "step": 173610 }, { "epoch": 4.241443334229106, "grad_norm": 0.0018850330961868167, "learning_rate": 1.369862123793468e-07, "loss": 0.0, "num_input_tokens_seen": 116985576, "step": 173615 }, { "epoch": 4.241565485060954, "grad_norm": 0.00011951574560953304, "learning_rate": 1.3694313494850362e-07, "loss": 0.0, "num_input_tokens_seen": 116988968, "step": 173620 }, { "epoch": 4.2416876358928, "grad_norm": 0.0011580303544178605, "learning_rate": 1.3690006379404217e-07, "loss": 0.0, "num_input_tokens_seen": 116992680, "step": 173625 }, { "epoch": 4.241809786724648, "grad_norm": 0.0008965888991951942, "learning_rate": 1.3685699891627568e-07, "loss": 0.0003, "num_input_tokens_seen": 116995752, "step": 173630 }, { "epoch": 4.241931937556495, "grad_norm": 0.005259280558675528, "learning_rate": 1.3681394031551706e-07, "loss": 0.0, "num_input_tokens_seen": 116999528, "step": 173635 }, { "epoch": 4.2420540883883415, "grad_norm": 0.0002571164513938129, "learning_rate": 1.367708879920798e-07, "loss": 0.0, "num_input_tokens_seen": 117002920, "step": 173640 }, { "epoch": 4.242176239220189, "grad_norm": 0.00011767227988457307, "learning_rate": 1.3672784194627663e-07, "loss": 0.0, "num_input_tokens_seen": 117006568, "step": 173645 }, { "epoch": 4.242298390052036, "grad_norm": 0.0065977550111711025, "learning_rate": 1.3668480217842072e-07, "loss": 0.0, "num_input_tokens_seen": 117009640, "step": 173650 }, { "epoch": 4.2424205408838835, "grad_norm": 0.00029881237423978746, "learning_rate": 1.3664176868882537e-07, "loss": 0.0, "num_input_tokens_seen": 117012840, "step": 173655 }, { "epoch": 4.24254269171573, "grad_norm": 0.0006145496154204011, "learning_rate": 1.3659874147780314e-07, "loss": 0.0, "num_input_tokens_seen": 117016296, "step": 173660 }, { "epoch": 4.242664842547578, "grad_norm": 0.00022483298380393535, "learning_rate": 1.365557205456672e-07, "loss": 0.0, "num_input_tokens_seen": 117019944, "step": 173665 }, { "epoch": 4.242786993379425, "grad_norm": 0.0005453492049127817, "learning_rate": 1.3651270589273023e-07, "loss": 0.0, "num_input_tokens_seen": 117023848, "step": 173670 }, { "epoch": 4.242909144211272, "grad_norm": 0.0019332737429067492, "learning_rate": 1.3646969751930504e-07, "loss": 0.0, "num_input_tokens_seen": 117027176, "step": 173675 }, { "epoch": 4.243031295043119, "grad_norm": 0.0005564195453189313, "learning_rate": 1.364266954257046e-07, "loss": 0.0001, "num_input_tokens_seen": 117030440, "step": 173680 }, { "epoch": 4.243153445874967, "grad_norm": 0.10380977392196655, "learning_rate": 1.3638369961224138e-07, "loss": 0.0001, "num_input_tokens_seen": 117033896, "step": 173685 }, { "epoch": 4.243275596706813, "grad_norm": 0.00010214153735432774, "learning_rate": 1.3634071007922841e-07, "loss": 0.0, "num_input_tokens_seen": 117037480, "step": 173690 }, { "epoch": 4.243397747538661, "grad_norm": 0.00010061613284051418, "learning_rate": 1.3629772682697794e-07, "loss": 0.0, "num_input_tokens_seen": 117040552, "step": 173695 }, { "epoch": 4.243519898370508, "grad_norm": 0.0002196808491135016, "learning_rate": 1.3625474985580277e-07, "loss": 0.0, "num_input_tokens_seen": 117044008, "step": 173700 }, { "epoch": 4.243642049202355, "grad_norm": 1.6563162716920488e-05, "learning_rate": 1.3621177916601522e-07, "loss": 0.0, "num_input_tokens_seen": 117047528, "step": 173705 }, { "epoch": 4.243764200034202, "grad_norm": 0.0005563225131481886, "learning_rate": 1.3616881475792796e-07, "loss": 0.0, "num_input_tokens_seen": 117051048, "step": 173710 }, { "epoch": 4.24388635086605, "grad_norm": 0.0004958529025316238, "learning_rate": 1.3612585663185372e-07, "loss": 0.0, "num_input_tokens_seen": 117054184, "step": 173715 }, { "epoch": 4.244008501697897, "grad_norm": 0.0001543401594972238, "learning_rate": 1.3608290478810448e-07, "loss": 0.0, "num_input_tokens_seen": 117057064, "step": 173720 }, { "epoch": 4.244130652529743, "grad_norm": 0.0005324023077264428, "learning_rate": 1.3603995922699252e-07, "loss": 0.0, "num_input_tokens_seen": 117060648, "step": 173725 }, { "epoch": 4.244252803361591, "grad_norm": 0.0005624612094834447, "learning_rate": 1.3599701994883062e-07, "loss": 0.0, "num_input_tokens_seen": 117063912, "step": 173730 }, { "epoch": 4.244374954193438, "grad_norm": 0.005314116831868887, "learning_rate": 1.3595408695393072e-07, "loss": 0.0, "num_input_tokens_seen": 117067240, "step": 173735 }, { "epoch": 4.244497105025285, "grad_norm": 0.012981265783309937, "learning_rate": 1.3591116024260496e-07, "loss": 0.0, "num_input_tokens_seen": 117070376, "step": 173740 }, { "epoch": 4.244619255857132, "grad_norm": 0.0001835319126257673, "learning_rate": 1.3586823981516559e-07, "loss": 0.0, "num_input_tokens_seen": 117073832, "step": 173745 }, { "epoch": 4.24474140668898, "grad_norm": 0.00045120823779143393, "learning_rate": 1.3582532567192506e-07, "loss": 0.0, "num_input_tokens_seen": 117077224, "step": 173750 }, { "epoch": 4.2448635575208264, "grad_norm": 0.00046550267143175006, "learning_rate": 1.3578241781319498e-07, "loss": 0.0399, "num_input_tokens_seen": 117080680, "step": 173755 }, { "epoch": 4.244985708352674, "grad_norm": 2.941801540146116e-05, "learning_rate": 1.357395162392878e-07, "loss": 0.0, "num_input_tokens_seen": 117083944, "step": 173760 }, { "epoch": 4.245107859184521, "grad_norm": 0.00016879245231393725, "learning_rate": 1.3569662095051504e-07, "loss": 0.0, "num_input_tokens_seen": 117087144, "step": 173765 }, { "epoch": 4.2452300100163685, "grad_norm": 0.0005892272456549108, "learning_rate": 1.35653731947189e-07, "loss": 0.0, "num_input_tokens_seen": 117091048, "step": 173770 }, { "epoch": 4.245352160848215, "grad_norm": 7.808295777067542e-05, "learning_rate": 1.3561084922962173e-07, "loss": 0.0, "num_input_tokens_seen": 117094312, "step": 173775 }, { "epoch": 4.245474311680063, "grad_norm": 0.00033269840059801936, "learning_rate": 1.355679727981246e-07, "loss": 0.0, "num_input_tokens_seen": 117097704, "step": 173780 }, { "epoch": 4.24559646251191, "grad_norm": 9.472859528614208e-05, "learning_rate": 1.3552510265300988e-07, "loss": 0.0, "num_input_tokens_seen": 117100904, "step": 173785 }, { "epoch": 4.245718613343757, "grad_norm": 0.00011878240911755711, "learning_rate": 1.3548223879458897e-07, "loss": 0.0, "num_input_tokens_seen": 117104488, "step": 173790 }, { "epoch": 4.245840764175604, "grad_norm": 1.9101145880995318e-05, "learning_rate": 1.35439381223174e-07, "loss": 0.0, "num_input_tokens_seen": 117107432, "step": 173795 }, { "epoch": 4.245962915007452, "grad_norm": 0.0006267334683798254, "learning_rate": 1.35396529939076e-07, "loss": 0.0, "num_input_tokens_seen": 117110696, "step": 173800 }, { "epoch": 4.246085065839298, "grad_norm": 4.32271153840702e-05, "learning_rate": 1.3535368494260712e-07, "loss": 0.0, "num_input_tokens_seen": 117114024, "step": 173805 }, { "epoch": 4.246207216671145, "grad_norm": 0.00016103855159599334, "learning_rate": 1.3531084623407897e-07, "loss": 0.0, "num_input_tokens_seen": 117117480, "step": 173810 }, { "epoch": 4.246329367502993, "grad_norm": 0.00027206912636756897, "learning_rate": 1.3526801381380272e-07, "loss": 0.0, "num_input_tokens_seen": 117120872, "step": 173815 }, { "epoch": 4.2464515183348395, "grad_norm": 0.0004113983013667166, "learning_rate": 1.3522518768209034e-07, "loss": 0.0, "num_input_tokens_seen": 117123880, "step": 173820 }, { "epoch": 4.246573669166687, "grad_norm": 0.0003232089220546186, "learning_rate": 1.3518236783925296e-07, "loss": 0.0, "num_input_tokens_seen": 117127144, "step": 173825 }, { "epoch": 4.246695819998534, "grad_norm": 1.5161986993916798e-05, "learning_rate": 1.3513955428560175e-07, "loss": 0.0, "num_input_tokens_seen": 117131048, "step": 173830 }, { "epoch": 4.2468179708303815, "grad_norm": 4.181816620985046e-05, "learning_rate": 1.3509674702144859e-07, "loss": 0.0, "num_input_tokens_seen": 117134376, "step": 173835 }, { "epoch": 4.246940121662228, "grad_norm": 0.00016705627785995603, "learning_rate": 1.350539460471042e-07, "loss": 0.0, "num_input_tokens_seen": 117137704, "step": 173840 }, { "epoch": 4.247062272494076, "grad_norm": 3.5695707083505113e-06, "learning_rate": 1.3501115136288044e-07, "loss": 0.0, "num_input_tokens_seen": 117141352, "step": 173845 }, { "epoch": 4.247184423325923, "grad_norm": 0.0028615519404411316, "learning_rate": 1.3496836296908797e-07, "loss": 0.0, "num_input_tokens_seen": 117144552, "step": 173850 }, { "epoch": 4.24730657415777, "grad_norm": 0.0008766906685195863, "learning_rate": 1.3492558086603855e-07, "loss": 0.0, "num_input_tokens_seen": 117147944, "step": 173855 }, { "epoch": 4.247428724989617, "grad_norm": 0.0012991810217499733, "learning_rate": 1.348828050540427e-07, "loss": 0.0, "num_input_tokens_seen": 117151272, "step": 173860 }, { "epoch": 4.247550875821465, "grad_norm": 0.0005853850743733346, "learning_rate": 1.3484003553341183e-07, "loss": 0.0, "num_input_tokens_seen": 117154536, "step": 173865 }, { "epoch": 4.247673026653311, "grad_norm": 7.38587277737679e-06, "learning_rate": 1.3479727230445704e-07, "loss": 0.0, "num_input_tokens_seen": 117158248, "step": 173870 }, { "epoch": 4.247795177485159, "grad_norm": 0.010029040277004242, "learning_rate": 1.3475451536748906e-07, "loss": 0.0, "num_input_tokens_seen": 117161320, "step": 173875 }, { "epoch": 4.247917328317006, "grad_norm": 0.0016140680527314544, "learning_rate": 1.347117647228192e-07, "loss": 0.0513, "num_input_tokens_seen": 117164840, "step": 173880 }, { "epoch": 4.248039479148853, "grad_norm": 9.858083649305627e-05, "learning_rate": 1.3466902037075788e-07, "loss": 0.0, "num_input_tokens_seen": 117168040, "step": 173885 }, { "epoch": 4.2481616299807, "grad_norm": 0.0005197059363126755, "learning_rate": 1.3462628231161632e-07, "loss": 0.0, "num_input_tokens_seen": 117171304, "step": 173890 }, { "epoch": 4.248283780812548, "grad_norm": 0.0014987658942118287, "learning_rate": 1.3458355054570515e-07, "loss": 0.0, "num_input_tokens_seen": 117175400, "step": 173895 }, { "epoch": 4.2484059316443945, "grad_norm": 0.00018606704543344676, "learning_rate": 1.3454082507333496e-07, "loss": 0.0, "num_input_tokens_seen": 117178344, "step": 173900 }, { "epoch": 4.248528082476241, "grad_norm": 8.002785762073472e-05, "learning_rate": 1.3449810589481702e-07, "loss": 0.0, "num_input_tokens_seen": 117181480, "step": 173905 }, { "epoch": 4.248650233308089, "grad_norm": 4.034727317048237e-05, "learning_rate": 1.3445539301046148e-07, "loss": 0.0, "num_input_tokens_seen": 117184872, "step": 173910 }, { "epoch": 4.248772384139936, "grad_norm": 0.0015639587072655559, "learning_rate": 1.3441268642057923e-07, "loss": 0.0, "num_input_tokens_seen": 117188200, "step": 173915 }, { "epoch": 4.248894534971783, "grad_norm": 44.089595794677734, "learning_rate": 1.3436998612548055e-07, "loss": 0.0372, "num_input_tokens_seen": 117191656, "step": 173920 }, { "epoch": 4.24901668580363, "grad_norm": 0.004503658507019281, "learning_rate": 1.3432729212547645e-07, "loss": 0.0, "num_input_tokens_seen": 117195176, "step": 173925 }, { "epoch": 4.249138836635478, "grad_norm": 0.00021741162345279008, "learning_rate": 1.3428460442087686e-07, "loss": 0.0, "num_input_tokens_seen": 117198312, "step": 173930 }, { "epoch": 4.249260987467324, "grad_norm": 3.549739994923584e-05, "learning_rate": 1.3424192301199267e-07, "loss": 0.0001, "num_input_tokens_seen": 117201512, "step": 173935 }, { "epoch": 4.249383138299172, "grad_norm": 0.00024185300571843982, "learning_rate": 1.3419924789913407e-07, "loss": 0.0, "num_input_tokens_seen": 117205032, "step": 173940 }, { "epoch": 4.249505289131019, "grad_norm": 0.0010530028957873583, "learning_rate": 1.3415657908261113e-07, "loss": 0.0, "num_input_tokens_seen": 117208424, "step": 173945 }, { "epoch": 4.249627439962866, "grad_norm": 7.1865242716739886e-06, "learning_rate": 1.3411391656273475e-07, "loss": 0.0, "num_input_tokens_seen": 117211560, "step": 173950 }, { "epoch": 4.249749590794713, "grad_norm": 3.061044117202982e-05, "learning_rate": 1.3407126033981464e-07, "loss": 0.0, "num_input_tokens_seen": 117215336, "step": 173955 }, { "epoch": 4.249871741626561, "grad_norm": 0.0009061378077603877, "learning_rate": 1.3402861041416124e-07, "loss": 0.0, "num_input_tokens_seen": 117218344, "step": 173960 }, { "epoch": 4.2499938924584075, "grad_norm": 0.00015108681691344827, "learning_rate": 1.3398596678608488e-07, "loss": 0.0, "num_input_tokens_seen": 117221928, "step": 173965 }, { "epoch": 4.250116043290255, "grad_norm": 7.658931281184778e-06, "learning_rate": 1.3394332945589526e-07, "loss": 0.0, "num_input_tokens_seen": 117225384, "step": 173970 }, { "epoch": 4.250238194122102, "grad_norm": 0.00028781109722331166, "learning_rate": 1.3390069842390295e-07, "loss": 0.0, "num_input_tokens_seen": 117228520, "step": 173975 }, { "epoch": 4.250311484621211, "eval_loss": 0.337473601102829, "eval_runtime": 47.7909, "eval_samples_per_second": 761.337, "eval_steps_per_second": 95.185, "num_input_tokens_seen": 117230952, "step": 173978 }, { "epoch": 4.2503603449539495, "grad_norm": 3.5128767194692045e-05, "learning_rate": 1.3385807369041746e-07, "loss": 0.0, "num_input_tokens_seen": 117232040, "step": 173980 }, { "epoch": 4.250482495785796, "grad_norm": 0.0010884815128520131, "learning_rate": 1.338154552557491e-07, "loss": 0.0, "num_input_tokens_seen": 117235048, "step": 173985 }, { "epoch": 4.250604646617644, "grad_norm": 0.00018114039266947657, "learning_rate": 1.3377284312020787e-07, "loss": 0.05, "num_input_tokens_seen": 117238376, "step": 173990 }, { "epoch": 4.250726797449491, "grad_norm": 0.00021715753246098757, "learning_rate": 1.3373023728410338e-07, "loss": 0.0, "num_input_tokens_seen": 117242408, "step": 173995 }, { "epoch": 4.250848948281337, "grad_norm": 0.01458763051778078, "learning_rate": 1.336876377477457e-07, "loss": 0.0, "num_input_tokens_seen": 117245672, "step": 174000 }, { "epoch": 4.250971099113185, "grad_norm": 3.400916466489434e-05, "learning_rate": 1.3364504451144443e-07, "loss": 0.0, "num_input_tokens_seen": 117248872, "step": 174005 }, { "epoch": 4.251093249945032, "grad_norm": 0.0003229551366530359, "learning_rate": 1.3360245757550947e-07, "loss": 0.0, "num_input_tokens_seen": 117251880, "step": 174010 }, { "epoch": 4.251215400776879, "grad_norm": 4.918500781059265e-05, "learning_rate": 1.335598769402504e-07, "loss": 0.0, "num_input_tokens_seen": 117255528, "step": 174015 }, { "epoch": 4.251337551608726, "grad_norm": 1.871642416517716e-05, "learning_rate": 1.3351730260597693e-07, "loss": 0.0, "num_input_tokens_seen": 117258408, "step": 174020 }, { "epoch": 4.251459702440574, "grad_norm": 0.00018231081776320934, "learning_rate": 1.3347473457299885e-07, "loss": 0.0, "num_input_tokens_seen": 117262056, "step": 174025 }, { "epoch": 4.2515818532724206, "grad_norm": 0.00022168007853906602, "learning_rate": 1.3343217284162566e-07, "loss": 0.0, "num_input_tokens_seen": 117265448, "step": 174030 }, { "epoch": 4.251704004104268, "grad_norm": 0.0004635670338757336, "learning_rate": 1.333896174121665e-07, "loss": 0.0, "num_input_tokens_seen": 117268392, "step": 174035 }, { "epoch": 4.251826154936115, "grad_norm": 0.00015483734023291618, "learning_rate": 1.3334706828493137e-07, "loss": 0.0, "num_input_tokens_seen": 117271720, "step": 174040 }, { "epoch": 4.251948305767963, "grad_norm": 0.0002787092234939337, "learning_rate": 1.333045254602294e-07, "loss": 0.0, "num_input_tokens_seen": 117274856, "step": 174045 }, { "epoch": 4.252070456599809, "grad_norm": 0.0005139851709827781, "learning_rate": 1.3326198893836994e-07, "loss": 0.0, "num_input_tokens_seen": 117278120, "step": 174050 }, { "epoch": 4.252192607431657, "grad_norm": 2.0777453755727038e-05, "learning_rate": 1.3321945871966234e-07, "loss": 0.0, "num_input_tokens_seen": 117281256, "step": 174055 }, { "epoch": 4.252314758263504, "grad_norm": 3.087955337832682e-05, "learning_rate": 1.3317693480441615e-07, "loss": 0.0, "num_input_tokens_seen": 117284648, "step": 174060 }, { "epoch": 4.252436909095351, "grad_norm": 0.002935598837211728, "learning_rate": 1.3313441719294027e-07, "loss": 0.0, "num_input_tokens_seen": 117288552, "step": 174065 }, { "epoch": 4.252559059927198, "grad_norm": 8.68289134814404e-06, "learning_rate": 1.3309190588554432e-07, "loss": 0.0, "num_input_tokens_seen": 117292200, "step": 174070 }, { "epoch": 4.252681210759045, "grad_norm": 4.14956193708349e-05, "learning_rate": 1.330494008825369e-07, "loss": 0.0, "num_input_tokens_seen": 117295528, "step": 174075 }, { "epoch": 4.2528033615908925, "grad_norm": 0.00044843723298981786, "learning_rate": 1.330069021842275e-07, "loss": 0.0, "num_input_tokens_seen": 117298984, "step": 174080 }, { "epoch": 4.252925512422739, "grad_norm": 0.00042743535595946014, "learning_rate": 1.3296440979092527e-07, "loss": 0.0, "num_input_tokens_seen": 117302760, "step": 174085 }, { "epoch": 4.253047663254587, "grad_norm": 0.00012097960279788822, "learning_rate": 1.3292192370293887e-07, "loss": 0.0, "num_input_tokens_seen": 117305768, "step": 174090 }, { "epoch": 4.253169814086434, "grad_norm": 0.00042267670505680144, "learning_rate": 1.328794439205777e-07, "loss": 0.0, "num_input_tokens_seen": 117309224, "step": 174095 }, { "epoch": 4.253291964918281, "grad_norm": 8.828636782709509e-05, "learning_rate": 1.328369704441501e-07, "loss": 0.0, "num_input_tokens_seen": 117312488, "step": 174100 }, { "epoch": 4.253414115750128, "grad_norm": 0.00013019546167925, "learning_rate": 1.3279450327396568e-07, "loss": 0.0, "num_input_tokens_seen": 117315688, "step": 174105 }, { "epoch": 4.253536266581976, "grad_norm": 0.00131877395324409, "learning_rate": 1.3275204241033255e-07, "loss": 0.0, "num_input_tokens_seen": 117319336, "step": 174110 }, { "epoch": 4.253658417413822, "grad_norm": 8.842379611451179e-05, "learning_rate": 1.327095878535598e-07, "loss": 0.0, "num_input_tokens_seen": 117322344, "step": 174115 }, { "epoch": 4.25378056824567, "grad_norm": 0.00011061535042244941, "learning_rate": 1.3266713960395647e-07, "loss": 0.0, "num_input_tokens_seen": 117325288, "step": 174120 }, { "epoch": 4.253902719077517, "grad_norm": 0.0001622894051251933, "learning_rate": 1.3262469766183083e-07, "loss": 0.0, "num_input_tokens_seen": 117328744, "step": 174125 }, { "epoch": 4.254024869909364, "grad_norm": 0.0030617835000157356, "learning_rate": 1.325822620274918e-07, "loss": 0.0, "num_input_tokens_seen": 117331880, "step": 174130 }, { "epoch": 4.254147020741211, "grad_norm": 3.148586620227434e-05, "learning_rate": 1.325398327012479e-07, "loss": 0.0, "num_input_tokens_seen": 117335208, "step": 174135 }, { "epoch": 4.254269171573059, "grad_norm": 5.484212579176528e-06, "learning_rate": 1.324974096834075e-07, "loss": 0.0, "num_input_tokens_seen": 117338344, "step": 174140 }, { "epoch": 4.2543913224049055, "grad_norm": 9.148853132501245e-05, "learning_rate": 1.3245499297427943e-07, "loss": 0.0, "num_input_tokens_seen": 117341800, "step": 174145 }, { "epoch": 4.254513473236753, "grad_norm": 0.0003981620538979769, "learning_rate": 1.3241258257417177e-07, "loss": 0.0001, "num_input_tokens_seen": 117345064, "step": 174150 }, { "epoch": 4.2546356240686, "grad_norm": 0.00012439371494110674, "learning_rate": 1.323701784833934e-07, "loss": 0.0, "num_input_tokens_seen": 117348520, "step": 174155 }, { "epoch": 4.2547577749004475, "grad_norm": 0.00014706332876812667, "learning_rate": 1.3232778070225227e-07, "loss": 0.0, "num_input_tokens_seen": 117351976, "step": 174160 }, { "epoch": 4.254879925732294, "grad_norm": 0.0016874118009582162, "learning_rate": 1.3228538923105704e-07, "loss": 0.0, "num_input_tokens_seen": 117355432, "step": 174165 }, { "epoch": 4.255002076564141, "grad_norm": 0.00014558476686943322, "learning_rate": 1.3224300407011558e-07, "loss": 0.0, "num_input_tokens_seen": 117359016, "step": 174170 }, { "epoch": 4.255124227395989, "grad_norm": 0.00010693442891351879, "learning_rate": 1.3220062521973652e-07, "loss": 0.0, "num_input_tokens_seen": 117362728, "step": 174175 }, { "epoch": 4.255246378227835, "grad_norm": 0.00010506354738026857, "learning_rate": 1.3215825268022807e-07, "loss": 0.0, "num_input_tokens_seen": 117365928, "step": 174180 }, { "epoch": 4.255368529059683, "grad_norm": 4.111505040782504e-05, "learning_rate": 1.3211588645189809e-07, "loss": 0.0, "num_input_tokens_seen": 117369896, "step": 174185 }, { "epoch": 4.25549067989153, "grad_norm": 0.0002986934850923717, "learning_rate": 1.3207352653505488e-07, "loss": 0.0, "num_input_tokens_seen": 117373480, "step": 174190 }, { "epoch": 4.255612830723377, "grad_norm": 0.0003481210151221603, "learning_rate": 1.3203117293000632e-07, "loss": 0.0, "num_input_tokens_seen": 117376488, "step": 174195 }, { "epoch": 4.255734981555224, "grad_norm": 6.517604924738407e-05, "learning_rate": 1.3198882563706082e-07, "loss": 0.0, "num_input_tokens_seen": 117379560, "step": 174200 }, { "epoch": 4.255857132387072, "grad_norm": 7.557028584415093e-05, "learning_rate": 1.319464846565257e-07, "loss": 0.0, "num_input_tokens_seen": 117383016, "step": 174205 }, { "epoch": 4.2559792832189185, "grad_norm": 3.513231058605015e-05, "learning_rate": 1.3190414998870924e-07, "loss": 0.0, "num_input_tokens_seen": 117386792, "step": 174210 }, { "epoch": 4.256101434050766, "grad_norm": 0.000536827661562711, "learning_rate": 1.3186182163391957e-07, "loss": 0.0, "num_input_tokens_seen": 117390056, "step": 174215 }, { "epoch": 4.256223584882613, "grad_norm": 0.0003994362778030336, "learning_rate": 1.3181949959246398e-07, "loss": 0.0, "num_input_tokens_seen": 117393512, "step": 174220 }, { "epoch": 4.2563457357144605, "grad_norm": 1.1261043255217373e-05, "learning_rate": 1.3177718386465065e-07, "loss": 0.0, "num_input_tokens_seen": 117397224, "step": 174225 }, { "epoch": 4.256467886546307, "grad_norm": 0.0023173654917627573, "learning_rate": 1.3173487445078702e-07, "loss": 0.0, "num_input_tokens_seen": 117401064, "step": 174230 }, { "epoch": 4.256590037378155, "grad_norm": 0.001880491035990417, "learning_rate": 1.3169257135118118e-07, "loss": 0.0, "num_input_tokens_seen": 117404392, "step": 174235 }, { "epoch": 4.256712188210002, "grad_norm": 0.000906631350517273, "learning_rate": 1.316502745661402e-07, "loss": 0.0, "num_input_tokens_seen": 117407720, "step": 174240 }, { "epoch": 4.256834339041849, "grad_norm": 5.04143608850427e-05, "learning_rate": 1.316079840959723e-07, "loss": 0.0, "num_input_tokens_seen": 117411240, "step": 174245 }, { "epoch": 4.256956489873696, "grad_norm": 0.01137853879481554, "learning_rate": 1.3156569994098465e-07, "loss": 0.0, "num_input_tokens_seen": 117414504, "step": 174250 }, { "epoch": 4.257078640705544, "grad_norm": 6.655412107647862e-06, "learning_rate": 1.3152342210148447e-07, "loss": 0.0, "num_input_tokens_seen": 117418408, "step": 174255 }, { "epoch": 4.25720079153739, "grad_norm": 0.0011601169826462865, "learning_rate": 1.3148115057777997e-07, "loss": 0.0, "num_input_tokens_seen": 117422568, "step": 174260 }, { "epoch": 4.257322942369237, "grad_norm": 0.0007189237512648106, "learning_rate": 1.3143888537017788e-07, "loss": 0.0, "num_input_tokens_seen": 117425768, "step": 174265 }, { "epoch": 4.257445093201085, "grad_norm": 8.994324161903933e-05, "learning_rate": 1.3139662647898574e-07, "loss": 0.0, "num_input_tokens_seen": 117429096, "step": 174270 }, { "epoch": 4.2575672440329315, "grad_norm": 0.00013263424625620246, "learning_rate": 1.313543739045113e-07, "loss": 0.0, "num_input_tokens_seen": 117432168, "step": 174275 }, { "epoch": 4.257689394864779, "grad_norm": 2.323191802133806e-05, "learning_rate": 1.313121276470611e-07, "loss": 0.0, "num_input_tokens_seen": 117435688, "step": 174280 }, { "epoch": 4.257811545696626, "grad_norm": 0.0006844392628408968, "learning_rate": 1.3126988770694314e-07, "loss": 0.0, "num_input_tokens_seen": 117439016, "step": 174285 }, { "epoch": 4.2579336965284735, "grad_norm": 6.209936691448092e-05, "learning_rate": 1.31227654084464e-07, "loss": 0.0, "num_input_tokens_seen": 117442664, "step": 174290 }, { "epoch": 4.25805584736032, "grad_norm": 4.204810466035269e-05, "learning_rate": 1.3118542677993116e-07, "loss": 0.0, "num_input_tokens_seen": 117445992, "step": 174295 }, { "epoch": 4.258177998192168, "grad_norm": 0.001022745156660676, "learning_rate": 1.3114320579365134e-07, "loss": 0.0, "num_input_tokens_seen": 117449384, "step": 174300 }, { "epoch": 4.258300149024015, "grad_norm": 0.0005625466583296657, "learning_rate": 1.3110099112593199e-07, "loss": 0.0, "num_input_tokens_seen": 117452456, "step": 174305 }, { "epoch": 4.258422299855862, "grad_norm": 7.460943379555829e-06, "learning_rate": 1.3105878277707992e-07, "loss": 0.0, "num_input_tokens_seen": 117455784, "step": 174310 }, { "epoch": 4.258544450687709, "grad_norm": 0.0010313538368791342, "learning_rate": 1.3101658074740207e-07, "loss": 0.0, "num_input_tokens_seen": 117459048, "step": 174315 }, { "epoch": 4.258666601519557, "grad_norm": 0.00015061290469020605, "learning_rate": 1.3097438503720548e-07, "loss": 0.0, "num_input_tokens_seen": 117462568, "step": 174320 }, { "epoch": 4.258788752351403, "grad_norm": 0.00010983618994941935, "learning_rate": 1.309321956467968e-07, "loss": 0.0, "num_input_tokens_seen": 117465576, "step": 174325 }, { "epoch": 4.258910903183251, "grad_norm": 7.537674537161365e-05, "learning_rate": 1.308900125764828e-07, "loss": 0.0, "num_input_tokens_seen": 117469160, "step": 174330 }, { "epoch": 4.259033054015098, "grad_norm": 0.001629429985769093, "learning_rate": 1.3084783582657077e-07, "loss": 0.0, "num_input_tokens_seen": 117472744, "step": 174335 }, { "epoch": 4.2591552048469445, "grad_norm": 0.00039150213706307113, "learning_rate": 1.3080566539736691e-07, "loss": 0.0, "num_input_tokens_seen": 117476136, "step": 174340 }, { "epoch": 4.259277355678792, "grad_norm": 2.7556236091186292e-05, "learning_rate": 1.307635012891779e-07, "loss": 0.0, "num_input_tokens_seen": 117479528, "step": 174345 }, { "epoch": 4.259399506510639, "grad_norm": 1.5702662494732067e-05, "learning_rate": 1.3072134350231068e-07, "loss": 0.0, "num_input_tokens_seen": 117482920, "step": 174350 }, { "epoch": 4.2595216573424866, "grad_norm": 0.012511259876191616, "learning_rate": 1.3067919203707168e-07, "loss": 0.0, "num_input_tokens_seen": 117486248, "step": 174355 }, { "epoch": 4.259643808174333, "grad_norm": 0.004491363186389208, "learning_rate": 1.306370468937672e-07, "loss": 0.0, "num_input_tokens_seen": 117489192, "step": 174360 }, { "epoch": 4.259765959006181, "grad_norm": 9.086837053473573e-06, "learning_rate": 1.305949080727039e-07, "loss": 0.0, "num_input_tokens_seen": 117492264, "step": 174365 }, { "epoch": 4.259888109838028, "grad_norm": 8.383391104871407e-05, "learning_rate": 1.3055277557418854e-07, "loss": 0.0, "num_input_tokens_seen": 117495592, "step": 174370 }, { "epoch": 4.260010260669875, "grad_norm": 0.0011324695078656077, "learning_rate": 1.3051064939852706e-07, "loss": 0.0, "num_input_tokens_seen": 117498856, "step": 174375 }, { "epoch": 4.260132411501722, "grad_norm": 8.879219967639074e-05, "learning_rate": 1.3046852954602617e-07, "loss": 0.0, "num_input_tokens_seen": 117501736, "step": 174380 }, { "epoch": 4.26025456233357, "grad_norm": 6.638761988142505e-06, "learning_rate": 1.3042641601699178e-07, "loss": 0.0, "num_input_tokens_seen": 117505384, "step": 174385 }, { "epoch": 4.260376713165416, "grad_norm": 0.00012476768461056054, "learning_rate": 1.3038430881173035e-07, "loss": 0.0, "num_input_tokens_seen": 117508648, "step": 174390 }, { "epoch": 4.260498863997264, "grad_norm": 5.976101601845585e-05, "learning_rate": 1.303422079305484e-07, "loss": 0.0, "num_input_tokens_seen": 117511976, "step": 174395 }, { "epoch": 4.260621014829111, "grad_norm": 1.2742106264340691e-05, "learning_rate": 1.3030011337375158e-07, "loss": 0.0, "num_input_tokens_seen": 117515240, "step": 174400 }, { "epoch": 4.2607431656609585, "grad_norm": 2.1465803001774475e-05, "learning_rate": 1.3025802514164653e-07, "loss": 0.0, "num_input_tokens_seen": 117518376, "step": 174405 }, { "epoch": 4.260865316492805, "grad_norm": 7.837494922569022e-05, "learning_rate": 1.3021594323453878e-07, "loss": 0.0, "num_input_tokens_seen": 117521512, "step": 174410 }, { "epoch": 4.260987467324653, "grad_norm": 0.0001668841578066349, "learning_rate": 1.3017386765273487e-07, "loss": 0.0, "num_input_tokens_seen": 117524776, "step": 174415 }, { "epoch": 4.2611096181565, "grad_norm": 0.0011746666859835386, "learning_rate": 1.3013179839654033e-07, "loss": 0.0, "num_input_tokens_seen": 117527912, "step": 174420 }, { "epoch": 4.261231768988347, "grad_norm": 7.951833140396047e-06, "learning_rate": 1.3008973546626134e-07, "loss": 0.0, "num_input_tokens_seen": 117531240, "step": 174425 }, { "epoch": 4.261353919820194, "grad_norm": 0.0004515565815381706, "learning_rate": 1.3004767886220391e-07, "loss": 0.0001, "num_input_tokens_seen": 117535016, "step": 174430 }, { "epoch": 4.261476070652041, "grad_norm": 4.313921454013325e-05, "learning_rate": 1.3000562858467368e-07, "loss": 0.0, "num_input_tokens_seen": 117538856, "step": 174435 }, { "epoch": 4.261598221483888, "grad_norm": 0.00011880762758664787, "learning_rate": 1.2996358463397662e-07, "loss": 0.0, "num_input_tokens_seen": 117542312, "step": 174440 }, { "epoch": 4.261720372315735, "grad_norm": 0.002191459061577916, "learning_rate": 1.2992154701041836e-07, "loss": 0.0, "num_input_tokens_seen": 117545512, "step": 174445 }, { "epoch": 4.261842523147583, "grad_norm": 2.773893584162579e-06, "learning_rate": 1.2987951571430456e-07, "loss": 0.0522, "num_input_tokens_seen": 117549160, "step": 174450 }, { "epoch": 4.2619646739794295, "grad_norm": 0.0002300427295267582, "learning_rate": 1.2983749074594097e-07, "loss": 0.0, "num_input_tokens_seen": 117552616, "step": 174455 }, { "epoch": 4.262086824811277, "grad_norm": 4.851877747569233e-05, "learning_rate": 1.2979547210563313e-07, "loss": 0.0, "num_input_tokens_seen": 117555752, "step": 174460 }, { "epoch": 4.262208975643124, "grad_norm": 0.008494346402585506, "learning_rate": 1.297534597936869e-07, "loss": 0.0, "num_input_tokens_seen": 117558824, "step": 174465 }, { "epoch": 4.2623311264749715, "grad_norm": 0.00031238130759447813, "learning_rate": 1.2971145381040726e-07, "loss": 0.0, "num_input_tokens_seen": 117562088, "step": 174470 }, { "epoch": 4.262453277306818, "grad_norm": 0.00039827567525207996, "learning_rate": 1.296694541561003e-07, "loss": 0.0, "num_input_tokens_seen": 117565416, "step": 174475 }, { "epoch": 4.262575428138666, "grad_norm": 0.0004993233596906066, "learning_rate": 1.296274608310709e-07, "loss": 0.0, "num_input_tokens_seen": 117568808, "step": 174480 }, { "epoch": 4.262697578970513, "grad_norm": 0.00011720485053956509, "learning_rate": 1.2958547383562468e-07, "loss": 0.0, "num_input_tokens_seen": 117572072, "step": 174485 }, { "epoch": 4.26281972980236, "grad_norm": 1.069751124305185e-05, "learning_rate": 1.295434931700673e-07, "loss": 0.0, "num_input_tokens_seen": 117575784, "step": 174490 }, { "epoch": 4.262941880634207, "grad_norm": 0.00021572900004684925, "learning_rate": 1.295015188347035e-07, "loss": 0.0, "num_input_tokens_seen": 117579432, "step": 174495 }, { "epoch": 4.263064031466055, "grad_norm": 0.002782398136332631, "learning_rate": 1.2945955082983906e-07, "loss": 0.0, "num_input_tokens_seen": 117583016, "step": 174500 }, { "epoch": 4.263186182297901, "grad_norm": 6.141927588032559e-05, "learning_rate": 1.2941758915577862e-07, "loss": 0.0318, "num_input_tokens_seen": 117586408, "step": 174505 }, { "epoch": 4.263308333129749, "grad_norm": 0.0004919608472846448, "learning_rate": 1.293756338128279e-07, "loss": 0.0, "num_input_tokens_seen": 117589736, "step": 174510 }, { "epoch": 4.263430483961596, "grad_norm": 0.0031189259607344866, "learning_rate": 1.2933368480129148e-07, "loss": 0.0, "num_input_tokens_seen": 117592872, "step": 174515 }, { "epoch": 4.263552634793443, "grad_norm": 5.286826490191743e-05, "learning_rate": 1.2929174212147475e-07, "loss": 0.0, "num_input_tokens_seen": 117596456, "step": 174520 }, { "epoch": 4.26367478562529, "grad_norm": 0.00019402038014959544, "learning_rate": 1.2924980577368284e-07, "loss": 0.0, "num_input_tokens_seen": 117599912, "step": 174525 }, { "epoch": 4.263796936457137, "grad_norm": 0.0013834084384143353, "learning_rate": 1.2920787575822035e-07, "loss": 0.0, "num_input_tokens_seen": 117603240, "step": 174530 }, { "epoch": 4.2639190872889845, "grad_norm": 1.8565729988040403e-05, "learning_rate": 1.291659520753926e-07, "loss": 0.0, "num_input_tokens_seen": 117606952, "step": 174535 }, { "epoch": 4.264041238120831, "grad_norm": 0.0015397652750834823, "learning_rate": 1.2912403472550405e-07, "loss": 0.0, "num_input_tokens_seen": 117610472, "step": 174540 }, { "epoch": 4.264163388952679, "grad_norm": 0.002840655390173197, "learning_rate": 1.2908212370885997e-07, "loss": 0.0, "num_input_tokens_seen": 117613928, "step": 174545 }, { "epoch": 4.264285539784526, "grad_norm": 0.00011604101746343076, "learning_rate": 1.2904021902576467e-07, "loss": 0.0, "num_input_tokens_seen": 117617256, "step": 174550 }, { "epoch": 4.264407690616373, "grad_norm": 0.0003640939248725772, "learning_rate": 1.289983206765235e-07, "loss": 0.0, "num_input_tokens_seen": 117620584, "step": 174555 }, { "epoch": 4.26452984144822, "grad_norm": 0.006663096137344837, "learning_rate": 1.2895642866144075e-07, "loss": 0.0, "num_input_tokens_seen": 117624360, "step": 174560 }, { "epoch": 4.264651992280068, "grad_norm": 0.0002884053101297468, "learning_rate": 1.2891454298082084e-07, "loss": 0.0, "num_input_tokens_seen": 117627688, "step": 174565 }, { "epoch": 4.264774143111914, "grad_norm": 0.00045368506107479334, "learning_rate": 1.2887266363496897e-07, "loss": 0.0, "num_input_tokens_seen": 117631208, "step": 174570 }, { "epoch": 4.264896293943762, "grad_norm": 7.1562126322533e-06, "learning_rate": 1.2883079062418922e-07, "loss": 0.0, "num_input_tokens_seen": 117634728, "step": 174575 }, { "epoch": 4.265018444775609, "grad_norm": 0.00015068422362674028, "learning_rate": 1.2878892394878616e-07, "loss": 0.0, "num_input_tokens_seen": 117638248, "step": 174580 }, { "epoch": 4.265140595607456, "grad_norm": 7.73434730945155e-05, "learning_rate": 1.2874706360906462e-07, "loss": 0.0, "num_input_tokens_seen": 117642216, "step": 174585 }, { "epoch": 4.265262746439303, "grad_norm": 0.00013224811118561774, "learning_rate": 1.287052096053286e-07, "loss": 0.0, "num_input_tokens_seen": 117645352, "step": 174590 }, { "epoch": 4.265384897271151, "grad_norm": 0.0004519324575085193, "learning_rate": 1.2866336193788285e-07, "loss": 0.0, "num_input_tokens_seen": 117648360, "step": 174595 }, { "epoch": 4.2655070481029975, "grad_norm": 0.00014004706463310868, "learning_rate": 1.2862152060703135e-07, "loss": 0.0, "num_input_tokens_seen": 117651304, "step": 174600 }, { "epoch": 4.265629198934845, "grad_norm": 3.622357689891942e-05, "learning_rate": 1.2857968561307864e-07, "loss": 0.0105, "num_input_tokens_seen": 117654632, "step": 174605 }, { "epoch": 4.265751349766692, "grad_norm": 7.538398494943976e-05, "learning_rate": 1.285378569563287e-07, "loss": 0.0, "num_input_tokens_seen": 117657768, "step": 174610 }, { "epoch": 4.2658735005985395, "grad_norm": 9.134790161624551e-05, "learning_rate": 1.284960346370858e-07, "loss": 0.0, "num_input_tokens_seen": 117660968, "step": 174615 }, { "epoch": 4.265995651430386, "grad_norm": 0.00016150051669683307, "learning_rate": 1.284542186556543e-07, "loss": 0.0, "num_input_tokens_seen": 117664360, "step": 174620 }, { "epoch": 4.266117802262233, "grad_norm": 0.0005944840959273279, "learning_rate": 1.2841240901233796e-07, "loss": 0.0001, "num_input_tokens_seen": 117667880, "step": 174625 }, { "epoch": 4.266239953094081, "grad_norm": 4.92603903694544e-05, "learning_rate": 1.2837060570744128e-07, "loss": 0.0, "num_input_tokens_seen": 117671336, "step": 174630 }, { "epoch": 4.266362103925927, "grad_norm": 0.00011865422129631042, "learning_rate": 1.2832880874126784e-07, "loss": 0.0, "num_input_tokens_seen": 117674856, "step": 174635 }, { "epoch": 4.266484254757775, "grad_norm": 0.0009908857755362988, "learning_rate": 1.2828701811412168e-07, "loss": 0.0, "num_input_tokens_seen": 117678120, "step": 174640 }, { "epoch": 4.266606405589622, "grad_norm": 2.8118483896832913e-05, "learning_rate": 1.2824523382630692e-07, "loss": 0.0, "num_input_tokens_seen": 117681448, "step": 174645 }, { "epoch": 4.266728556421469, "grad_norm": 0.00045708840480074286, "learning_rate": 1.2820345587812743e-07, "loss": 0.0, "num_input_tokens_seen": 117684520, "step": 174650 }, { "epoch": 4.266850707253316, "grad_norm": 9.125410542765167e-06, "learning_rate": 1.281616842698866e-07, "loss": 0.0, "num_input_tokens_seen": 117688424, "step": 174655 }, { "epoch": 4.266972858085164, "grad_norm": 7.427701348206028e-05, "learning_rate": 1.2811991900188868e-07, "loss": 0.0, "num_input_tokens_seen": 117691432, "step": 174660 }, { "epoch": 4.2670950089170105, "grad_norm": 4.319162599131232e-06, "learning_rate": 1.2807816007443727e-07, "loss": 0.0, "num_input_tokens_seen": 117694760, "step": 174665 }, { "epoch": 4.267217159748858, "grad_norm": 0.0005892039043828845, "learning_rate": 1.2803640748783583e-07, "loss": 0.0, "num_input_tokens_seen": 117697896, "step": 174670 }, { "epoch": 4.267339310580705, "grad_norm": 0.0002718200266826898, "learning_rate": 1.279946612423881e-07, "loss": 0.0, "num_input_tokens_seen": 117702312, "step": 174675 }, { "epoch": 4.267461461412553, "grad_norm": 8.382520900340751e-05, "learning_rate": 1.2795292133839796e-07, "loss": 0.0, "num_input_tokens_seen": 117705576, "step": 174680 }, { "epoch": 4.267583612244399, "grad_norm": 9.732329635880888e-05, "learning_rate": 1.279111877761685e-07, "loss": 0.0, "num_input_tokens_seen": 117708904, "step": 174685 }, { "epoch": 4.267705763076247, "grad_norm": 7.27354881746578e-06, "learning_rate": 1.2786946055600367e-07, "loss": 0.0, "num_input_tokens_seen": 117712424, "step": 174690 }, { "epoch": 4.267827913908094, "grad_norm": 0.002255277708172798, "learning_rate": 1.2782773967820647e-07, "loss": 0.0, "num_input_tokens_seen": 117716328, "step": 174695 }, { "epoch": 4.26795006473994, "grad_norm": 3.05923203995917e-05, "learning_rate": 1.277860251430808e-07, "loss": 0.0, "num_input_tokens_seen": 117719528, "step": 174700 }, { "epoch": 4.268072215571788, "grad_norm": 7.840585749363527e-05, "learning_rate": 1.2774431695092958e-07, "loss": 0.0, "num_input_tokens_seen": 117722600, "step": 174705 }, { "epoch": 4.268194366403635, "grad_norm": 3.6371238820720464e-05, "learning_rate": 1.2770261510205616e-07, "loss": 0.0, "num_input_tokens_seen": 117726248, "step": 174710 }, { "epoch": 4.268316517235482, "grad_norm": 0.0004068179405294359, "learning_rate": 1.2766091959676427e-07, "loss": 0.0, "num_input_tokens_seen": 117730024, "step": 174715 }, { "epoch": 4.268438668067329, "grad_norm": 0.000273067009402439, "learning_rate": 1.276192304353565e-07, "loss": 0.0, "num_input_tokens_seen": 117733288, "step": 174720 }, { "epoch": 4.268560818899177, "grad_norm": 3.6302051739767194e-05, "learning_rate": 1.2757754761813667e-07, "loss": 0.0, "num_input_tokens_seen": 117736872, "step": 174725 }, { "epoch": 4.268682969731024, "grad_norm": 2.0907571524730884e-05, "learning_rate": 1.275358711454072e-07, "loss": 0.0, "num_input_tokens_seen": 117740392, "step": 174730 }, { "epoch": 4.268805120562871, "grad_norm": 15.842265129089355, "learning_rate": 1.2749420101747165e-07, "loss": 0.0256, "num_input_tokens_seen": 117743464, "step": 174735 }, { "epoch": 4.268927271394718, "grad_norm": 0.00020856004266534, "learning_rate": 1.2745253723463311e-07, "loss": 0.0, "num_input_tokens_seen": 117746984, "step": 174740 }, { "epoch": 4.269049422226566, "grad_norm": 0.0001524622057331726, "learning_rate": 1.2741087979719412e-07, "loss": 0.0, "num_input_tokens_seen": 117750952, "step": 174745 }, { "epoch": 4.269171573058412, "grad_norm": 5.263303319225088e-05, "learning_rate": 1.2736922870545829e-07, "loss": 0.0, "num_input_tokens_seen": 117754472, "step": 174750 }, { "epoch": 4.26929372389026, "grad_norm": 0.0003677001514006406, "learning_rate": 1.27327583959728e-07, "loss": 0.0, "num_input_tokens_seen": 117757608, "step": 174755 }, { "epoch": 4.269415874722107, "grad_norm": 0.0001912468287628144, "learning_rate": 1.2728594556030613e-07, "loss": 0.0, "num_input_tokens_seen": 117760808, "step": 174760 }, { "epoch": 4.269538025553954, "grad_norm": 0.0002439589734422043, "learning_rate": 1.2724431350749576e-07, "loss": 0.0, "num_input_tokens_seen": 117764072, "step": 174765 }, { "epoch": 4.269660176385801, "grad_norm": 2.498049070709385e-05, "learning_rate": 1.2720268780159927e-07, "loss": 0.0, "num_input_tokens_seen": 117767656, "step": 174770 }, { "epoch": 4.269782327217649, "grad_norm": 7.077799818944186e-05, "learning_rate": 1.2716106844291974e-07, "loss": 0.0, "num_input_tokens_seen": 117770792, "step": 174775 }, { "epoch": 4.2699044780494955, "grad_norm": 5.7361143262824044e-05, "learning_rate": 1.271194554317595e-07, "loss": 0.0, "num_input_tokens_seen": 117774056, "step": 174780 }, { "epoch": 4.270026628881343, "grad_norm": 0.0009771619224920869, "learning_rate": 1.2707784876842165e-07, "loss": 0.0, "num_input_tokens_seen": 117777320, "step": 174785 }, { "epoch": 4.27014877971319, "grad_norm": 3.899524381267838e-05, "learning_rate": 1.2703624845320826e-07, "loss": 0.0, "num_input_tokens_seen": 117780584, "step": 174790 }, { "epoch": 4.270270930545037, "grad_norm": 0.0007512098527513444, "learning_rate": 1.2699465448642198e-07, "loss": 0.0, "num_input_tokens_seen": 117783848, "step": 174795 }, { "epoch": 4.270393081376884, "grad_norm": 3.748635572264902e-05, "learning_rate": 1.269530668683656e-07, "loss": 0.0, "num_input_tokens_seen": 117787560, "step": 174800 }, { "epoch": 4.270515232208731, "grad_norm": 0.003991847857832909, "learning_rate": 1.2691148559934117e-07, "loss": 0.0, "num_input_tokens_seen": 117790760, "step": 174805 }, { "epoch": 4.270637383040579, "grad_norm": 0.001877957722172141, "learning_rate": 1.2686991067965147e-07, "loss": 0.0, "num_input_tokens_seen": 117793704, "step": 174810 }, { "epoch": 4.270759533872425, "grad_norm": 0.0001509853609604761, "learning_rate": 1.2682834210959847e-07, "loss": 0.0, "num_input_tokens_seen": 117796712, "step": 174815 }, { "epoch": 4.270881684704273, "grad_norm": 1.75936638697749e-05, "learning_rate": 1.2678677988948473e-07, "loss": 0.0001, "num_input_tokens_seen": 117799912, "step": 174820 }, { "epoch": 4.27100383553612, "grad_norm": 0.000486937933601439, "learning_rate": 1.2674522401961218e-07, "loss": 0.0, "num_input_tokens_seen": 117803112, "step": 174825 }, { "epoch": 4.271125986367967, "grad_norm": 0.0001492535520810634, "learning_rate": 1.2670367450028328e-07, "loss": 0.0, "num_input_tokens_seen": 117806376, "step": 174830 }, { "epoch": 4.271248137199814, "grad_norm": 0.00012599291221704334, "learning_rate": 1.2666213133180038e-07, "loss": 0.0, "num_input_tokens_seen": 117809896, "step": 174835 }, { "epoch": 4.271370288031662, "grad_norm": 4.7090870793908834e-05, "learning_rate": 1.2662059451446506e-07, "loss": 0.0, "num_input_tokens_seen": 117813224, "step": 174840 }, { "epoch": 4.2714924388635085, "grad_norm": 0.00016614743799436837, "learning_rate": 1.2657906404858e-07, "loss": 0.0, "num_input_tokens_seen": 117816488, "step": 174845 }, { "epoch": 4.271614589695356, "grad_norm": 0.00242516677826643, "learning_rate": 1.265375399344466e-07, "loss": 0.0, "num_input_tokens_seen": 117819368, "step": 174850 }, { "epoch": 4.271736740527203, "grad_norm": 0.00045968478661961854, "learning_rate": 1.2649602217236744e-07, "loss": 0.0, "num_input_tokens_seen": 117822312, "step": 174855 }, { "epoch": 4.2718588913590505, "grad_norm": 0.00011602450103964657, "learning_rate": 1.264545107626439e-07, "loss": 0.0005, "num_input_tokens_seen": 117825576, "step": 174860 }, { "epoch": 4.271981042190897, "grad_norm": 3.8411395507864654e-05, "learning_rate": 1.2641300570557834e-07, "loss": 0.0, "num_input_tokens_seen": 117828968, "step": 174865 }, { "epoch": 4.272103193022745, "grad_norm": 5.47633899259381e-05, "learning_rate": 1.2637150700147235e-07, "loss": 0.0, "num_input_tokens_seen": 117832808, "step": 174870 }, { "epoch": 4.272225343854592, "grad_norm": 0.00022426110808737576, "learning_rate": 1.2633001465062754e-07, "loss": 0.0, "num_input_tokens_seen": 117836136, "step": 174875 }, { "epoch": 4.272347494686439, "grad_norm": 6.305616261670366e-05, "learning_rate": 1.2628852865334606e-07, "loss": 0.0, "num_input_tokens_seen": 117839336, "step": 174880 }, { "epoch": 4.272469645518286, "grad_norm": 0.00013498378393705934, "learning_rate": 1.262470490099292e-07, "loss": 0.0, "num_input_tokens_seen": 117842536, "step": 174885 }, { "epoch": 4.272591796350133, "grad_norm": 0.00016713308286853135, "learning_rate": 1.262055757206788e-07, "loss": 0.0, "num_input_tokens_seen": 117845352, "step": 174890 }, { "epoch": 4.27271394718198, "grad_norm": 1.3508079064195044e-05, "learning_rate": 1.2616410878589666e-07, "loss": 0.0, "num_input_tokens_seen": 117848872, "step": 174895 }, { "epoch": 4.272836098013827, "grad_norm": 0.00011501550761749968, "learning_rate": 1.2612264820588403e-07, "loss": 0.0, "num_input_tokens_seen": 117852328, "step": 174900 }, { "epoch": 4.272958248845675, "grad_norm": 0.0009163783397525549, "learning_rate": 1.2608119398094276e-07, "loss": 0.0, "num_input_tokens_seen": 117855848, "step": 174905 }, { "epoch": 4.2730803996775215, "grad_norm": 0.00014613720122724771, "learning_rate": 1.260397461113738e-07, "loss": 0.0, "num_input_tokens_seen": 117859240, "step": 174910 }, { "epoch": 4.273202550509369, "grad_norm": 9.100071474676952e-05, "learning_rate": 1.2599830459747907e-07, "loss": 0.0, "num_input_tokens_seen": 117862632, "step": 174915 }, { "epoch": 4.273324701341216, "grad_norm": 3.082215471295058e-06, "learning_rate": 1.2595686943955964e-07, "loss": 0.0, "num_input_tokens_seen": 117866152, "step": 174920 }, { "epoch": 4.2734468521730635, "grad_norm": 0.0011469591408967972, "learning_rate": 1.2591544063791683e-07, "loss": 0.0, "num_input_tokens_seen": 117869544, "step": 174925 }, { "epoch": 4.27356900300491, "grad_norm": 0.00022313492081593722, "learning_rate": 1.2587401819285237e-07, "loss": 0.0, "num_input_tokens_seen": 117872488, "step": 174930 }, { "epoch": 4.273691153836758, "grad_norm": 0.0006101438775658607, "learning_rate": 1.2583260210466685e-07, "loss": 0.0, "num_input_tokens_seen": 117875880, "step": 174935 }, { "epoch": 4.273813304668605, "grad_norm": 0.02029566280543804, "learning_rate": 1.25791192373662e-07, "loss": 0.0, "num_input_tokens_seen": 117879144, "step": 174940 }, { "epoch": 4.273935455500452, "grad_norm": 58.66333770751953, "learning_rate": 1.2574978900013854e-07, "loss": 0.0738, "num_input_tokens_seen": 117882280, "step": 174945 }, { "epoch": 4.274057606332299, "grad_norm": 8.697695011505857e-05, "learning_rate": 1.2570839198439775e-07, "loss": 0.0, "num_input_tokens_seen": 117885608, "step": 174950 }, { "epoch": 4.274179757164147, "grad_norm": 0.0003056648129131645, "learning_rate": 1.256670013267409e-07, "loss": 0.0, "num_input_tokens_seen": 117888808, "step": 174955 }, { "epoch": 4.274301907995993, "grad_norm": 0.0003528349625412375, "learning_rate": 1.2562561702746888e-07, "loss": 0.0, "num_input_tokens_seen": 117891624, "step": 174960 }, { "epoch": 4.27442405882784, "grad_norm": 0.0005818032077513635, "learning_rate": 1.255842390868822e-07, "loss": 0.0, "num_input_tokens_seen": 117895080, "step": 174965 }, { "epoch": 4.274546209659688, "grad_norm": 0.00101753999479115, "learning_rate": 1.255428675052824e-07, "loss": 0.0, "num_input_tokens_seen": 117898280, "step": 174970 }, { "epoch": 4.2746683604915345, "grad_norm": 0.00011096594971604645, "learning_rate": 1.2550150228297007e-07, "loss": 0.0, "num_input_tokens_seen": 117901480, "step": 174975 }, { "epoch": 4.274790511323382, "grad_norm": 3.41241029673256e-05, "learning_rate": 1.254601434202458e-07, "loss": 0.0, "num_input_tokens_seen": 117904616, "step": 174980 }, { "epoch": 4.274912662155229, "grad_norm": 0.000965735933277756, "learning_rate": 1.2541879091741058e-07, "loss": 0.0, "num_input_tokens_seen": 117907752, "step": 174985 }, { "epoch": 4.2750348129870765, "grad_norm": 9.309906999988016e-06, "learning_rate": 1.253774447747653e-07, "loss": 0.0, "num_input_tokens_seen": 117910632, "step": 174990 }, { "epoch": 4.275156963818923, "grad_norm": 0.002611256204545498, "learning_rate": 1.253361049926104e-07, "loss": 0.0, "num_input_tokens_seen": 117914472, "step": 174995 }, { "epoch": 4.275279114650771, "grad_norm": 3.897144051734358e-05, "learning_rate": 1.252947715712468e-07, "loss": 0.0, "num_input_tokens_seen": 117917416, "step": 175000 }, { "epoch": 4.275401265482618, "grad_norm": 2.1422400095616467e-05, "learning_rate": 1.2525344451097465e-07, "loss": 0.0, "num_input_tokens_seen": 117920488, "step": 175005 }, { "epoch": 4.275523416314465, "grad_norm": 0.0001733067911118269, "learning_rate": 1.252121238120949e-07, "loss": 0.0, "num_input_tokens_seen": 117923944, "step": 175010 }, { "epoch": 4.275645567146312, "grad_norm": 6.505786586785689e-05, "learning_rate": 1.2517080947490765e-07, "loss": 0.0, "num_input_tokens_seen": 117927016, "step": 175015 }, { "epoch": 4.27576771797816, "grad_norm": 0.0011223267065361142, "learning_rate": 1.2512950149971357e-07, "loss": 0.0174, "num_input_tokens_seen": 117930152, "step": 175020 }, { "epoch": 4.275889868810006, "grad_norm": 0.0003185166569892317, "learning_rate": 1.250881998868134e-07, "loss": 0.0, "num_input_tokens_seen": 117933864, "step": 175025 }, { "epoch": 4.276012019641854, "grad_norm": 0.0028908748645335436, "learning_rate": 1.250469046365068e-07, "loss": 0.0, "num_input_tokens_seen": 117937320, "step": 175030 }, { "epoch": 4.276134170473701, "grad_norm": 0.0006264683324843645, "learning_rate": 1.2500561574909474e-07, "loss": 0.0, "num_input_tokens_seen": 117940520, "step": 175035 }, { "epoch": 4.2762563213055484, "grad_norm": 7.769901276333258e-05, "learning_rate": 1.2496433322487697e-07, "loss": 0.0, "num_input_tokens_seen": 117943976, "step": 175040 }, { "epoch": 4.276378472137395, "grad_norm": 4.179087045486085e-05, "learning_rate": 1.2492305706415397e-07, "loss": 0.0, "num_input_tokens_seen": 117947368, "step": 175045 }, { "epoch": 4.276500622969243, "grad_norm": 0.00018700955843087286, "learning_rate": 1.24881787267226e-07, "loss": 0.0, "num_input_tokens_seen": 117950376, "step": 175050 }, { "epoch": 4.27662277380109, "grad_norm": 0.00018165944493375719, "learning_rate": 1.2484052383439293e-07, "loss": 0.0, "num_input_tokens_seen": 117953704, "step": 175055 }, { "epoch": 4.276744924632936, "grad_norm": 0.0002837112988345325, "learning_rate": 1.247992667659551e-07, "loss": 0.0, "num_input_tokens_seen": 117956840, "step": 175060 }, { "epoch": 4.276867075464784, "grad_norm": 0.0011048574233427644, "learning_rate": 1.2475801606221236e-07, "loss": 0.0, "num_input_tokens_seen": 117960232, "step": 175065 }, { "epoch": 4.276989226296631, "grad_norm": 0.00657190615311265, "learning_rate": 1.247167717234646e-07, "loss": 0.0, "num_input_tokens_seen": 117963688, "step": 175070 }, { "epoch": 4.277111377128478, "grad_norm": 0.0002077388489851728, "learning_rate": 1.2467553375001204e-07, "loss": 0.0, "num_input_tokens_seen": 117966440, "step": 175075 }, { "epoch": 4.277233527960325, "grad_norm": 4.731911394628696e-05, "learning_rate": 1.2463430214215432e-07, "loss": 0.0, "num_input_tokens_seen": 117969448, "step": 175080 }, { "epoch": 4.277355678792173, "grad_norm": 0.011321183294057846, "learning_rate": 1.2459307690019162e-07, "loss": 0.0, "num_input_tokens_seen": 117972520, "step": 175085 }, { "epoch": 4.2774778296240195, "grad_norm": 2.9748660381301306e-05, "learning_rate": 1.2455185802442314e-07, "loss": 0.0, "num_input_tokens_seen": 117976040, "step": 175090 }, { "epoch": 4.277599980455867, "grad_norm": 2.0650611986638978e-05, "learning_rate": 1.2451064551514946e-07, "loss": 0.0, "num_input_tokens_seen": 117979048, "step": 175095 }, { "epoch": 4.277722131287714, "grad_norm": 0.010599371045827866, "learning_rate": 1.244694393726694e-07, "loss": 0.0, "num_input_tokens_seen": 117982248, "step": 175100 }, { "epoch": 4.2778442821195615, "grad_norm": 6.258305802475661e-05, "learning_rate": 1.2442823959728322e-07, "loss": 0.0, "num_input_tokens_seen": 117985832, "step": 175105 }, { "epoch": 4.277966432951408, "grad_norm": 0.0001950976438820362, "learning_rate": 1.2438704618929052e-07, "loss": 0.0, "num_input_tokens_seen": 117989224, "step": 175110 }, { "epoch": 4.278088583783256, "grad_norm": 1.6232579582720064e-05, "learning_rate": 1.2434585914899054e-07, "loss": 0.0, "num_input_tokens_seen": 117992552, "step": 175115 }, { "epoch": 4.278210734615103, "grad_norm": 0.0004744456382468343, "learning_rate": 1.2430467847668325e-07, "loss": 0.0, "num_input_tokens_seen": 117995880, "step": 175120 }, { "epoch": 4.27833288544695, "grad_norm": 0.001792342634871602, "learning_rate": 1.2426350417266762e-07, "loss": 0.0, "num_input_tokens_seen": 117998888, "step": 175125 }, { "epoch": 4.278455036278797, "grad_norm": 0.00013714176020585, "learning_rate": 1.2422233623724354e-07, "loss": 0.0, "num_input_tokens_seen": 118002088, "step": 175130 }, { "epoch": 4.278577187110645, "grad_norm": 0.00023262936156243086, "learning_rate": 1.2418117467070998e-07, "loss": 0.0, "num_input_tokens_seen": 118004904, "step": 175135 }, { "epoch": 4.278699337942491, "grad_norm": 0.0003559202014002949, "learning_rate": 1.241400194733665e-07, "loss": 0.0, "num_input_tokens_seen": 118007976, "step": 175140 }, { "epoch": 4.278821488774339, "grad_norm": 6.383551226463169e-05, "learning_rate": 1.2409887064551262e-07, "loss": 0.0, "num_input_tokens_seen": 118011112, "step": 175145 }, { "epoch": 4.278943639606186, "grad_norm": 5.636045534629375e-05, "learning_rate": 1.240577281874471e-07, "loss": 0.0, "num_input_tokens_seen": 118014504, "step": 175150 }, { "epoch": 4.2790657904380325, "grad_norm": 0.0032909191213548183, "learning_rate": 1.240165920994696e-07, "loss": 0.0182, "num_input_tokens_seen": 118017896, "step": 175155 }, { "epoch": 4.27918794126988, "grad_norm": 2.825920546456473e-06, "learning_rate": 1.2397546238187883e-07, "loss": 0.0, "num_input_tokens_seen": 118021416, "step": 175160 }, { "epoch": 4.279310092101727, "grad_norm": 0.00020561976998578757, "learning_rate": 1.239343390349743e-07, "loss": 0.0, "num_input_tokens_seen": 118025704, "step": 175165 }, { "epoch": 4.2794322429335745, "grad_norm": 0.00010846881195902824, "learning_rate": 1.2389322205905473e-07, "loss": 0.0, "num_input_tokens_seen": 118028968, "step": 175170 }, { "epoch": 4.279554393765421, "grad_norm": 8.97041263669962e-06, "learning_rate": 1.2385211145441943e-07, "loss": 0.0, "num_input_tokens_seen": 118031976, "step": 175175 }, { "epoch": 4.279676544597269, "grad_norm": 0.00014840844960417598, "learning_rate": 1.238110072213673e-07, "loss": 0.0, "num_input_tokens_seen": 118034792, "step": 175180 }, { "epoch": 4.279798695429116, "grad_norm": 3.458599167061038e-05, "learning_rate": 1.2376990936019694e-07, "loss": 0.0, "num_input_tokens_seen": 118037800, "step": 175185 }, { "epoch": 4.279920846260963, "grad_norm": 0.0011223549954593182, "learning_rate": 1.237288178712077e-07, "loss": 0.0, "num_input_tokens_seen": 118041320, "step": 175190 }, { "epoch": 4.28004299709281, "grad_norm": 8.894487837096676e-05, "learning_rate": 1.2368773275469801e-07, "loss": 0.0, "num_input_tokens_seen": 118044776, "step": 175195 }, { "epoch": 4.280165147924658, "grad_norm": 0.00013673091598320752, "learning_rate": 1.2364665401096686e-07, "loss": 0.0, "num_input_tokens_seen": 118048232, "step": 175200 }, { "epoch": 4.280287298756504, "grad_norm": 0.00022629571321886033, "learning_rate": 1.236055816403131e-07, "loss": 0.0, "num_input_tokens_seen": 118051944, "step": 175205 }, { "epoch": 4.280409449588352, "grad_norm": 0.001033892622217536, "learning_rate": 1.2356451564303504e-07, "loss": 0.0, "num_input_tokens_seen": 118054952, "step": 175210 }, { "epoch": 4.280531600420199, "grad_norm": 0.0015593706630170345, "learning_rate": 1.235234560194318e-07, "loss": 0.0, "num_input_tokens_seen": 118058152, "step": 175215 }, { "epoch": 4.280653751252046, "grad_norm": 0.003671762067824602, "learning_rate": 1.2348240276980148e-07, "loss": 0.0305, "num_input_tokens_seen": 118061160, "step": 175220 }, { "epoch": 4.280775902083893, "grad_norm": 3.475360063021071e-05, "learning_rate": 1.2344135589444315e-07, "loss": 0.0, "num_input_tokens_seen": 118064872, "step": 175225 }, { "epoch": 4.280898052915741, "grad_norm": 5.875719580217265e-06, "learning_rate": 1.2340031539365481e-07, "loss": 0.0, "num_input_tokens_seen": 118068328, "step": 175230 }, { "epoch": 4.2810202037475875, "grad_norm": 0.002820430090650916, "learning_rate": 1.233592812677352e-07, "loss": 0.0, "num_input_tokens_seen": 118071080, "step": 175235 }, { "epoch": 4.281142354579435, "grad_norm": 0.002786210970953107, "learning_rate": 1.2331825351698278e-07, "loss": 0.0, "num_input_tokens_seen": 118074472, "step": 175240 }, { "epoch": 4.281264505411282, "grad_norm": 6.457543349824846e-05, "learning_rate": 1.2327723214169572e-07, "loss": 0.0, "num_input_tokens_seen": 118077864, "step": 175245 }, { "epoch": 4.281386656243129, "grad_norm": 9.612838039174676e-05, "learning_rate": 1.2323621714217257e-07, "loss": 0.0, "num_input_tokens_seen": 118081192, "step": 175250 }, { "epoch": 4.281508807074976, "grad_norm": 0.00032473396277055144, "learning_rate": 1.2319520851871136e-07, "loss": 0.0, "num_input_tokens_seen": 118084712, "step": 175255 }, { "epoch": 4.281630957906823, "grad_norm": 2.6731300749816e-05, "learning_rate": 1.2315420627161032e-07, "loss": 0.0, "num_input_tokens_seen": 118087848, "step": 175260 }, { "epoch": 4.281753108738671, "grad_norm": 0.0761445015668869, "learning_rate": 1.2311321040116795e-07, "loss": 0.0, "num_input_tokens_seen": 118091112, "step": 175265 }, { "epoch": 4.281875259570517, "grad_norm": 0.00014299452595878392, "learning_rate": 1.230722209076822e-07, "loss": 0.0, "num_input_tokens_seen": 118094120, "step": 175270 }, { "epoch": 4.281997410402365, "grad_norm": 7.086592086125165e-05, "learning_rate": 1.2303123779145096e-07, "loss": 0.0, "num_input_tokens_seen": 118097384, "step": 175275 }, { "epoch": 4.282119561234212, "grad_norm": 0.0003347352030687034, "learning_rate": 1.2299026105277265e-07, "loss": 0.0, "num_input_tokens_seen": 118100776, "step": 175280 }, { "epoch": 4.282241712066059, "grad_norm": 0.00018632085993885994, "learning_rate": 1.2294929069194494e-07, "loss": 0.0, "num_input_tokens_seen": 118104168, "step": 175285 }, { "epoch": 4.282363862897906, "grad_norm": 5.435491766547784e-05, "learning_rate": 1.2290832670926576e-07, "loss": 0.0, "num_input_tokens_seen": 118107368, "step": 175290 }, { "epoch": 4.282486013729754, "grad_norm": 6.169379048515111e-05, "learning_rate": 1.2286736910503314e-07, "loss": 0.0, "num_input_tokens_seen": 118110440, "step": 175295 }, { "epoch": 4.2826081645616005, "grad_norm": 0.006105211563408375, "learning_rate": 1.2282641787954506e-07, "loss": 0.0, "num_input_tokens_seen": 118113448, "step": 175300 }, { "epoch": 4.282730315393448, "grad_norm": 8.213877299567685e-05, "learning_rate": 1.2278547303309905e-07, "loss": 0.0, "num_input_tokens_seen": 118116968, "step": 175305 }, { "epoch": 4.282852466225295, "grad_norm": 0.00024381156254094094, "learning_rate": 1.2274453456599333e-07, "loss": 0.0, "num_input_tokens_seen": 118120296, "step": 175310 }, { "epoch": 4.2829746170571426, "grad_norm": 0.006086864974349737, "learning_rate": 1.2270360247852496e-07, "loss": 0.0, "num_input_tokens_seen": 118123432, "step": 175315 }, { "epoch": 4.283096767888989, "grad_norm": 0.0015530579257756472, "learning_rate": 1.2266267677099219e-07, "loss": 0.0, "num_input_tokens_seen": 118127080, "step": 175320 }, { "epoch": 4.283218918720836, "grad_norm": 0.04028932377696037, "learning_rate": 1.2262175744369218e-07, "loss": 0.0553, "num_input_tokens_seen": 118130728, "step": 175325 }, { "epoch": 4.283341069552684, "grad_norm": 0.0018163869390264153, "learning_rate": 1.2258084449692286e-07, "loss": 0.0, "num_input_tokens_seen": 118134248, "step": 175330 }, { "epoch": 4.28346322038453, "grad_norm": 0.00018046381592284888, "learning_rate": 1.2253993793098171e-07, "loss": 0.0, "num_input_tokens_seen": 118137448, "step": 175335 }, { "epoch": 4.283585371216378, "grad_norm": 7.681015267735347e-05, "learning_rate": 1.2249903774616598e-07, "loss": 0.0002, "num_input_tokens_seen": 118140968, "step": 175340 }, { "epoch": 4.283707522048225, "grad_norm": 0.0006761262775398791, "learning_rate": 1.2245814394277354e-07, "loss": 0.0, "num_input_tokens_seen": 118144360, "step": 175345 }, { "epoch": 4.283829672880072, "grad_norm": 0.0010945533867925406, "learning_rate": 1.2241725652110124e-07, "loss": 0.0, "num_input_tokens_seen": 118147560, "step": 175350 }, { "epoch": 4.283951823711919, "grad_norm": 0.0016349507495760918, "learning_rate": 1.2237637548144664e-07, "loss": 0.0, "num_input_tokens_seen": 118151016, "step": 175355 }, { "epoch": 4.284073974543767, "grad_norm": 0.0002284444635733962, "learning_rate": 1.2233550082410737e-07, "loss": 0.0, "num_input_tokens_seen": 118154216, "step": 175360 }, { "epoch": 4.284196125375614, "grad_norm": 0.00017028734146151692, "learning_rate": 1.222946325493801e-07, "loss": 0.0, "num_input_tokens_seen": 118157800, "step": 175365 }, { "epoch": 4.284318276207461, "grad_norm": 0.0015447017503902316, "learning_rate": 1.222537706575627e-07, "loss": 0.0, "num_input_tokens_seen": 118160744, "step": 175370 }, { "epoch": 4.284440427039308, "grad_norm": 0.0003525497450027615, "learning_rate": 1.2221291514895182e-07, "loss": 0.0, "num_input_tokens_seen": 118164136, "step": 175375 }, { "epoch": 4.284562577871156, "grad_norm": 1.5372748748632148e-05, "learning_rate": 1.2217206602384455e-07, "loss": 0.0, "num_input_tokens_seen": 118167784, "step": 175380 }, { "epoch": 4.284684728703002, "grad_norm": 0.0003744949062820524, "learning_rate": 1.2213122328253833e-07, "loss": 0.0, "num_input_tokens_seen": 118171240, "step": 175385 }, { "epoch": 4.28480687953485, "grad_norm": 7.806552457623184e-05, "learning_rate": 1.2209038692532981e-07, "loss": 0.0004, "num_input_tokens_seen": 118174888, "step": 175390 }, { "epoch": 4.284929030366697, "grad_norm": 0.00011152249498991296, "learning_rate": 1.2204955695251628e-07, "loss": 0.0, "num_input_tokens_seen": 118177768, "step": 175395 }, { "epoch": 4.285051181198544, "grad_norm": 0.00016263005090877414, "learning_rate": 1.2200873336439442e-07, "loss": 0.0, "num_input_tokens_seen": 118180520, "step": 175400 }, { "epoch": 4.285173332030391, "grad_norm": 9.192336437990889e-05, "learning_rate": 1.2196791616126135e-07, "loss": 0.0317, "num_input_tokens_seen": 118183976, "step": 175405 }, { "epoch": 4.285295482862239, "grad_norm": 0.0002289551921421662, "learning_rate": 1.2192710534341343e-07, "loss": 0.0, "num_input_tokens_seen": 118186920, "step": 175410 }, { "epoch": 4.2854176336940855, "grad_norm": 0.0002647794899530709, "learning_rate": 1.2188630091114817e-07, "loss": 0.0, "num_input_tokens_seen": 118190440, "step": 175415 }, { "epoch": 4.285539784525932, "grad_norm": 2.581018998171203e-05, "learning_rate": 1.218455028647616e-07, "loss": 0.0, "num_input_tokens_seen": 118193448, "step": 175420 }, { "epoch": 4.28566193535778, "grad_norm": 7.212234049802646e-05, "learning_rate": 1.218047112045507e-07, "loss": 0.0, "num_input_tokens_seen": 118196520, "step": 175425 }, { "epoch": 4.285784086189627, "grad_norm": 4.635051664081402e-05, "learning_rate": 1.2176392593081242e-07, "loss": 0.0, "num_input_tokens_seen": 118199784, "step": 175430 }, { "epoch": 4.285906237021474, "grad_norm": 8.119764243019745e-05, "learning_rate": 1.2172314704384278e-07, "loss": 0.0, "num_input_tokens_seen": 118202984, "step": 175435 }, { "epoch": 4.286028387853321, "grad_norm": 7.789898518240079e-05, "learning_rate": 1.2168237454393893e-07, "loss": 0.0, "num_input_tokens_seen": 118206504, "step": 175440 }, { "epoch": 4.286150538685169, "grad_norm": 2.6979070753441192e-05, "learning_rate": 1.2164160843139693e-07, "loss": 0.0, "num_input_tokens_seen": 118209704, "step": 175445 }, { "epoch": 4.286272689517015, "grad_norm": 1.372455335513223e-05, "learning_rate": 1.2160084870651331e-07, "loss": 0.0, "num_input_tokens_seen": 118212776, "step": 175450 }, { "epoch": 4.286394840348863, "grad_norm": 0.00017044198466464877, "learning_rate": 1.2156009536958479e-07, "loss": 0.0, "num_input_tokens_seen": 118216552, "step": 175455 }, { "epoch": 4.28651699118071, "grad_norm": 28.354991912841797, "learning_rate": 1.2151934842090738e-07, "loss": 0.0325, "num_input_tokens_seen": 118219944, "step": 175460 }, { "epoch": 4.286639142012557, "grad_norm": 1.995334059756715e-05, "learning_rate": 1.2147860786077767e-07, "loss": 0.0, "num_input_tokens_seen": 118223784, "step": 175465 }, { "epoch": 4.286761292844404, "grad_norm": 0.0001449656847398728, "learning_rate": 1.2143787368949178e-07, "loss": 0.0, "num_input_tokens_seen": 118227048, "step": 175470 }, { "epoch": 4.286883443676252, "grad_norm": 4.758282011607662e-05, "learning_rate": 1.2139714590734607e-07, "loss": 0.0, "num_input_tokens_seen": 118230056, "step": 175475 }, { "epoch": 4.2870055945080985, "grad_norm": 4.159102900302969e-06, "learning_rate": 1.2135642451463635e-07, "loss": 0.0, "num_input_tokens_seen": 118233256, "step": 175480 }, { "epoch": 4.287127745339946, "grad_norm": 0.0004711418878287077, "learning_rate": 1.2131570951165936e-07, "loss": 0.0, "num_input_tokens_seen": 118236200, "step": 175485 }, { "epoch": 4.287249896171793, "grad_norm": 0.008723760023713112, "learning_rate": 1.212750008987109e-07, "loss": 0.0, "num_input_tokens_seen": 118239208, "step": 175490 }, { "epoch": 4.2873720470036405, "grad_norm": 8.196967428375501e-06, "learning_rate": 1.212342986760867e-07, "loss": 0.0, "num_input_tokens_seen": 118242920, "step": 175495 }, { "epoch": 4.287494197835487, "grad_norm": 0.005483199842274189, "learning_rate": 1.211936028440832e-07, "loss": 0.0, "num_input_tokens_seen": 118246632, "step": 175500 }, { "epoch": 4.287616348667335, "grad_norm": 0.002007837640121579, "learning_rate": 1.2115291340299604e-07, "loss": 0.0, "num_input_tokens_seen": 118249896, "step": 175505 }, { "epoch": 4.287738499499182, "grad_norm": 0.00037572745350189507, "learning_rate": 1.2111223035312136e-07, "loss": 0.0, "num_input_tokens_seen": 118253224, "step": 175510 }, { "epoch": 4.287860650331028, "grad_norm": 0.00016909983241930604, "learning_rate": 1.2107155369475496e-07, "loss": 0.0, "num_input_tokens_seen": 118256360, "step": 175515 }, { "epoch": 4.287982801162876, "grad_norm": 0.00011647819337667897, "learning_rate": 1.2103088342819256e-07, "loss": 0.0, "num_input_tokens_seen": 118259368, "step": 175520 }, { "epoch": 4.288104951994723, "grad_norm": 0.0005521110142581165, "learning_rate": 1.2099021955373013e-07, "loss": 0.0, "num_input_tokens_seen": 118262440, "step": 175525 }, { "epoch": 4.28822710282657, "grad_norm": 0.001317873946391046, "learning_rate": 1.2094956207166307e-07, "loss": 0.0004, "num_input_tokens_seen": 118265576, "step": 175530 }, { "epoch": 4.288349253658417, "grad_norm": 0.0002650999231263995, "learning_rate": 1.2090891098228739e-07, "loss": 0.0, "num_input_tokens_seen": 118269096, "step": 175535 }, { "epoch": 4.288471404490265, "grad_norm": 0.0008484512218274176, "learning_rate": 1.208682662858984e-07, "loss": 0.0, "num_input_tokens_seen": 118272616, "step": 175540 }, { "epoch": 4.2885935553221115, "grad_norm": 7.143753464333713e-06, "learning_rate": 1.208276279827919e-07, "loss": 0.0, "num_input_tokens_seen": 118276008, "step": 175545 }, { "epoch": 4.288715706153959, "grad_norm": 9.948560909833759e-05, "learning_rate": 1.2078699607326347e-07, "loss": 0.0, "num_input_tokens_seen": 118279144, "step": 175550 }, { "epoch": 4.288837856985806, "grad_norm": 0.0017997337272390723, "learning_rate": 1.2074637055760828e-07, "loss": 0.0, "num_input_tokens_seen": 118282152, "step": 175555 }, { "epoch": 4.2889600078176535, "grad_norm": 4.9702899559633806e-05, "learning_rate": 1.2070575143612217e-07, "loss": 0.0, "num_input_tokens_seen": 118284968, "step": 175560 }, { "epoch": 4.2890821586495, "grad_norm": 4.725019607576542e-06, "learning_rate": 1.2066513870910022e-07, "loss": 0.0, "num_input_tokens_seen": 118287912, "step": 175565 }, { "epoch": 4.289204309481348, "grad_norm": 9.601834608474746e-05, "learning_rate": 1.20624532376838e-07, "loss": 0.0, "num_input_tokens_seen": 118291112, "step": 175570 }, { "epoch": 4.289326460313195, "grad_norm": 4.6068747906247154e-05, "learning_rate": 1.205839324396305e-07, "loss": 0.0, "num_input_tokens_seen": 118294504, "step": 175575 }, { "epoch": 4.289448611145042, "grad_norm": 6.677881174255162e-05, "learning_rate": 1.2054333889777345e-07, "loss": 0.0, "num_input_tokens_seen": 118298024, "step": 175580 }, { "epoch": 4.289570761976889, "grad_norm": 0.00011366893886588514, "learning_rate": 1.2050275175156156e-07, "loss": 0.0, "num_input_tokens_seen": 118301352, "step": 175585 }, { "epoch": 4.289692912808736, "grad_norm": 2.1709463908337057e-05, "learning_rate": 1.2046217100129042e-07, "loss": 0.0001, "num_input_tokens_seen": 118304808, "step": 175590 }, { "epoch": 4.289815063640583, "grad_norm": 4.48407263320405e-06, "learning_rate": 1.2042159664725494e-07, "loss": 0.0, "num_input_tokens_seen": 118308072, "step": 175595 }, { "epoch": 4.28993721447243, "grad_norm": 0.0007157354848459363, "learning_rate": 1.2038102868975e-07, "loss": 0.0, "num_input_tokens_seen": 118311720, "step": 175600 }, { "epoch": 4.290059365304278, "grad_norm": 7.197898867161712e-06, "learning_rate": 1.2034046712907075e-07, "loss": 0.0, "num_input_tokens_seen": 118314984, "step": 175605 }, { "epoch": 4.2901815161361245, "grad_norm": 5.3880936320638284e-05, "learning_rate": 1.2029991196551248e-07, "loss": 0.0, "num_input_tokens_seen": 118318056, "step": 175610 }, { "epoch": 4.290303666967972, "grad_norm": 0.014211894944310188, "learning_rate": 1.2025936319936957e-07, "loss": 0.0, "num_input_tokens_seen": 118321640, "step": 175615 }, { "epoch": 4.290425817799819, "grad_norm": 1.3704505363421049e-05, "learning_rate": 1.2021882083093748e-07, "loss": 0.0, "num_input_tokens_seen": 118324840, "step": 175620 }, { "epoch": 4.2905479686316665, "grad_norm": 0.00019162753596901894, "learning_rate": 1.2017828486051052e-07, "loss": 0.0, "num_input_tokens_seen": 118328104, "step": 175625 }, { "epoch": 4.290670119463513, "grad_norm": 5.376626722863875e-05, "learning_rate": 1.2013775528838399e-07, "loss": 0.0, "num_input_tokens_seen": 118331496, "step": 175630 }, { "epoch": 4.290792270295361, "grad_norm": 5.275745934341103e-05, "learning_rate": 1.2009723211485212e-07, "loss": 0.0001, "num_input_tokens_seen": 118334696, "step": 175635 }, { "epoch": 4.290914421127208, "grad_norm": 0.0004111059242859483, "learning_rate": 1.2005671534020978e-07, "loss": 0.0, "num_input_tokens_seen": 118337832, "step": 175640 }, { "epoch": 4.291036571959055, "grad_norm": 0.00015165227523539215, "learning_rate": 1.2001620496475195e-07, "loss": 0.0, "num_input_tokens_seen": 118340776, "step": 175645 }, { "epoch": 4.291158722790902, "grad_norm": 0.00014834836474619806, "learning_rate": 1.1997570098877275e-07, "loss": 0.0, "num_input_tokens_seen": 118343912, "step": 175650 }, { "epoch": 4.29128087362275, "grad_norm": 5.52527817490045e-05, "learning_rate": 1.1993520341256713e-07, "loss": 0.0, "num_input_tokens_seen": 118347432, "step": 175655 }, { "epoch": 4.291403024454596, "grad_norm": 0.0016621008981019258, "learning_rate": 1.1989471223642923e-07, "loss": 0.0, "num_input_tokens_seen": 118350312, "step": 175660 }, { "epoch": 4.291525175286444, "grad_norm": 0.0002743805234786123, "learning_rate": 1.1985422746065367e-07, "loss": 0.0, "num_input_tokens_seen": 118353832, "step": 175665 }, { "epoch": 4.291647326118291, "grad_norm": 1.5842942957533523e-05, "learning_rate": 1.1981374908553522e-07, "loss": 0.0, "num_input_tokens_seen": 118357288, "step": 175670 }, { "epoch": 4.291769476950138, "grad_norm": 0.004092568531632423, "learning_rate": 1.1977327711136754e-07, "loss": 0.0, "num_input_tokens_seen": 118360488, "step": 175675 }, { "epoch": 4.291891627781985, "grad_norm": 3.0272083677118644e-05, "learning_rate": 1.1973281153844572e-07, "loss": 0.0, "num_input_tokens_seen": 118363816, "step": 175680 }, { "epoch": 4.292013778613832, "grad_norm": 0.0012410653289407492, "learning_rate": 1.1969235236706354e-07, "loss": 0.0784, "num_input_tokens_seen": 118367208, "step": 175685 }, { "epoch": 4.29213592944568, "grad_norm": 1.507085016783094e-05, "learning_rate": 1.196518995975152e-07, "loss": 0.0, "num_input_tokens_seen": 118370920, "step": 175690 }, { "epoch": 4.292258080277526, "grad_norm": 0.0006038264837116003, "learning_rate": 1.1961145323009526e-07, "loss": 0.0, "num_input_tokens_seen": 118374120, "step": 175695 }, { "epoch": 4.292380231109374, "grad_norm": 2.109080924128648e-05, "learning_rate": 1.1957101326509733e-07, "loss": 0.0, "num_input_tokens_seen": 118377256, "step": 175700 }, { "epoch": 4.292502381941221, "grad_norm": 0.0002546267060097307, "learning_rate": 1.195305797028161e-07, "loss": 0.0, "num_input_tokens_seen": 118380648, "step": 175705 }, { "epoch": 4.292624532773068, "grad_norm": 6.967822992010042e-05, "learning_rate": 1.194901525435451e-07, "loss": 0.0, "num_input_tokens_seen": 118383912, "step": 175710 }, { "epoch": 4.292746683604915, "grad_norm": 5.2994387260696385e-06, "learning_rate": 1.1944973178757868e-07, "loss": 0.0, "num_input_tokens_seen": 118387304, "step": 175715 }, { "epoch": 4.292868834436763, "grad_norm": 2.5682649720693007e-05, "learning_rate": 1.1940931743521044e-07, "loss": 0.0, "num_input_tokens_seen": 118390888, "step": 175720 }, { "epoch": 4.2929909852686094, "grad_norm": 0.0005329764098860323, "learning_rate": 1.1936890948673473e-07, "loss": 0.0, "num_input_tokens_seen": 118394536, "step": 175725 }, { "epoch": 4.293113136100457, "grad_norm": 7.66331868362613e-05, "learning_rate": 1.1932850794244497e-07, "loss": 0.0, "num_input_tokens_seen": 118397672, "step": 175730 }, { "epoch": 4.293235286932304, "grad_norm": 0.00011966860620304942, "learning_rate": 1.1928811280263517e-07, "loss": 0.0, "num_input_tokens_seen": 118401320, "step": 175735 }, { "epoch": 4.2933574377641515, "grad_norm": 7.26673097233288e-05, "learning_rate": 1.192477240675993e-07, "loss": 0.0, "num_input_tokens_seen": 118404840, "step": 175740 }, { "epoch": 4.293479588595998, "grad_norm": 7.701585855102167e-05, "learning_rate": 1.192073417376307e-07, "loss": 0.0, "num_input_tokens_seen": 118408232, "step": 175745 }, { "epoch": 4.293601739427846, "grad_norm": 3.5217308322899044e-05, "learning_rate": 1.1916696581302344e-07, "loss": 0.0, "num_input_tokens_seen": 118411432, "step": 175750 }, { "epoch": 4.293723890259693, "grad_norm": 9.866907930700108e-05, "learning_rate": 1.1912659629407063e-07, "loss": 0.0, "num_input_tokens_seen": 118415016, "step": 175755 }, { "epoch": 4.29384604109154, "grad_norm": 0.0011521294945850968, "learning_rate": 1.1908623318106626e-07, "loss": 0.0, "num_input_tokens_seen": 118418216, "step": 175760 }, { "epoch": 4.293968191923387, "grad_norm": 0.003137110499665141, "learning_rate": 1.1904587647430386e-07, "loss": 0.0, "num_input_tokens_seen": 118422056, "step": 175765 }, { "epoch": 4.294090342755235, "grad_norm": 0.0001456877653254196, "learning_rate": 1.1900552617407655e-07, "loss": 0.0, "num_input_tokens_seen": 118425512, "step": 175770 }, { "epoch": 4.294212493587081, "grad_norm": 5.808260539197363e-06, "learning_rate": 1.1896518228067831e-07, "loss": 0.0, "num_input_tokens_seen": 118428456, "step": 175775 }, { "epoch": 4.294334644418928, "grad_norm": 7.820190148777328e-06, "learning_rate": 1.189248447944019e-07, "loss": 0.0, "num_input_tokens_seen": 118431656, "step": 175780 }, { "epoch": 4.294456795250776, "grad_norm": 1.0643769201124087e-05, "learning_rate": 1.1888451371554132e-07, "loss": 0.0, "num_input_tokens_seen": 118435688, "step": 175785 }, { "epoch": 4.2945789460826225, "grad_norm": 7.565869054815266e-06, "learning_rate": 1.188441890443893e-07, "loss": 0.0, "num_input_tokens_seen": 118438824, "step": 175790 }, { "epoch": 4.29470109691447, "grad_norm": 7.671902494621463e-06, "learning_rate": 1.1880387078123955e-07, "loss": 0.0, "num_input_tokens_seen": 118442152, "step": 175795 }, { "epoch": 4.294823247746317, "grad_norm": 0.0015023218002170324, "learning_rate": 1.1876355892638513e-07, "loss": 0.0, "num_input_tokens_seen": 118445288, "step": 175800 }, { "epoch": 4.2949453985781645, "grad_norm": 7.464071677532047e-05, "learning_rate": 1.187232534801188e-07, "loss": 0.0, "num_input_tokens_seen": 118448552, "step": 175805 }, { "epoch": 4.295067549410011, "grad_norm": 5.0487367843743414e-05, "learning_rate": 1.1868295444273435e-07, "loss": 0.0761, "num_input_tokens_seen": 118451624, "step": 175810 }, { "epoch": 4.295189700241859, "grad_norm": 4.657682802644558e-05, "learning_rate": 1.1864266181452421e-07, "loss": 0.0, "num_input_tokens_seen": 118454952, "step": 175815 }, { "epoch": 4.295311851073706, "grad_norm": 0.00018549639207776636, "learning_rate": 1.1860237559578168e-07, "loss": 0.0, "num_input_tokens_seen": 118458536, "step": 175820 }, { "epoch": 4.295434001905553, "grad_norm": 9.009638597490266e-05, "learning_rate": 1.1856209578679998e-07, "loss": 0.0, "num_input_tokens_seen": 118461864, "step": 175825 }, { "epoch": 4.2955561527374, "grad_norm": 0.01003610622137785, "learning_rate": 1.1852182238787156e-07, "loss": 0.0, "num_input_tokens_seen": 118465192, "step": 175830 }, { "epoch": 4.295678303569248, "grad_norm": 0.003888308070600033, "learning_rate": 1.1848155539928972e-07, "loss": 0.0, "num_input_tokens_seen": 118468648, "step": 175835 }, { "epoch": 4.295800454401094, "grad_norm": 0.00016373286780435592, "learning_rate": 1.1844129482134702e-07, "loss": 0.0, "num_input_tokens_seen": 118472104, "step": 175840 }, { "epoch": 4.295922605232942, "grad_norm": 5.8873829402728006e-05, "learning_rate": 1.1840104065433642e-07, "loss": 0.0, "num_input_tokens_seen": 118475368, "step": 175845 }, { "epoch": 4.296044756064789, "grad_norm": 0.0008571085636503994, "learning_rate": 1.183607928985505e-07, "loss": 0.0, "num_input_tokens_seen": 118479144, "step": 175850 }, { "epoch": 4.2961669068966355, "grad_norm": 1.4165329048410058e-05, "learning_rate": 1.1832055155428189e-07, "loss": 0.0, "num_input_tokens_seen": 118482408, "step": 175855 }, { "epoch": 4.296289057728483, "grad_norm": 5.2919556765118614e-05, "learning_rate": 1.1828031662182358e-07, "loss": 0.0, "num_input_tokens_seen": 118485992, "step": 175860 }, { "epoch": 4.296411208560331, "grad_norm": 6.386043878592318e-06, "learning_rate": 1.1824008810146791e-07, "loss": 0.0, "num_input_tokens_seen": 118489128, "step": 175865 }, { "epoch": 4.2965333593921775, "grad_norm": 2.0044681150466204e-05, "learning_rate": 1.1819986599350751e-07, "loss": 0.0, "num_input_tokens_seen": 118492264, "step": 175870 }, { "epoch": 4.296655510224024, "grad_norm": 0.012165850028395653, "learning_rate": 1.1815965029823471e-07, "loss": 0.0, "num_input_tokens_seen": 118495848, "step": 175875 }, { "epoch": 4.296777661055872, "grad_norm": 0.0007713138475082815, "learning_rate": 1.181194410159424e-07, "loss": 0.0, "num_input_tokens_seen": 118499176, "step": 175880 }, { "epoch": 4.296899811887719, "grad_norm": 6.350491457851604e-05, "learning_rate": 1.1807923814692244e-07, "loss": 0.0017, "num_input_tokens_seen": 118502440, "step": 175885 }, { "epoch": 4.297021962719566, "grad_norm": 0.00017253389523830265, "learning_rate": 1.1803904169146773e-07, "loss": 0.0, "num_input_tokens_seen": 118506216, "step": 175890 }, { "epoch": 4.297144113551413, "grad_norm": 6.567937816726044e-05, "learning_rate": 1.179988516498701e-07, "loss": 0.0, "num_input_tokens_seen": 118509416, "step": 175895 }, { "epoch": 4.297266264383261, "grad_norm": 0.00023607707407791167, "learning_rate": 1.1795866802242216e-07, "loss": 0.0346, "num_input_tokens_seen": 118512488, "step": 175900 }, { "epoch": 4.297388415215107, "grad_norm": 0.000546814757399261, "learning_rate": 1.1791849080941618e-07, "loss": 0.0, "num_input_tokens_seen": 118515816, "step": 175905 }, { "epoch": 4.297510566046955, "grad_norm": 0.0017294178251177073, "learning_rate": 1.1787832001114384e-07, "loss": 0.0, "num_input_tokens_seen": 118519144, "step": 175910 }, { "epoch": 4.297632716878802, "grad_norm": 9.488432988291606e-05, "learning_rate": 1.1783815562789767e-07, "loss": 0.0, "num_input_tokens_seen": 118522792, "step": 175915 }, { "epoch": 4.297754867710649, "grad_norm": 0.0001220635895151645, "learning_rate": 1.1779799765997e-07, "loss": 0.0, "num_input_tokens_seen": 118525736, "step": 175920 }, { "epoch": 4.297877018542496, "grad_norm": 0.001318514347076416, "learning_rate": 1.1775784610765227e-07, "loss": 0.0, "num_input_tokens_seen": 118529064, "step": 175925 }, { "epoch": 4.297999169374344, "grad_norm": 0.0556945838034153, "learning_rate": 1.1771770097123701e-07, "loss": 0.0, "num_input_tokens_seen": 118532392, "step": 175930 }, { "epoch": 4.2981213202061905, "grad_norm": 1.180052731797332e-05, "learning_rate": 1.1767756225101566e-07, "loss": 0.0, "num_input_tokens_seen": 118535912, "step": 175935 }, { "epoch": 4.298243471038038, "grad_norm": 0.0001700563880149275, "learning_rate": 1.1763742994728077e-07, "loss": 0.0, "num_input_tokens_seen": 118539368, "step": 175940 }, { "epoch": 4.298365621869885, "grad_norm": 1.8086886484525166e-05, "learning_rate": 1.1759730406032342e-07, "loss": 0.0, "num_input_tokens_seen": 118542568, "step": 175945 }, { "epoch": 4.298487772701732, "grad_norm": 6.41237638774328e-05, "learning_rate": 1.1755718459043595e-07, "loss": 0.0, "num_input_tokens_seen": 118545512, "step": 175950 }, { "epoch": 4.298609923533579, "grad_norm": 0.0019423235207796097, "learning_rate": 1.1751707153791012e-07, "loss": 0.0, "num_input_tokens_seen": 118549160, "step": 175955 }, { "epoch": 4.298732074365426, "grad_norm": 3.768239912460558e-05, "learning_rate": 1.1747696490303727e-07, "loss": 0.0, "num_input_tokens_seen": 118553000, "step": 175960 }, { "epoch": 4.298854225197274, "grad_norm": 0.00021600848413072526, "learning_rate": 1.1743686468610958e-07, "loss": 0.0, "num_input_tokens_seen": 118556968, "step": 175965 }, { "epoch": 4.29897637602912, "grad_norm": 0.0007384647615253925, "learning_rate": 1.1739677088741817e-07, "loss": 0.0, "num_input_tokens_seen": 118559976, "step": 175970 }, { "epoch": 4.299098526860968, "grad_norm": 0.0001238829572685063, "learning_rate": 1.1735668350725481e-07, "loss": 0.0001, "num_input_tokens_seen": 118563368, "step": 175975 }, { "epoch": 4.299220677692815, "grad_norm": 0.00313155772164464, "learning_rate": 1.1731660254591124e-07, "loss": 0.0, "num_input_tokens_seen": 118566824, "step": 175980 }, { "epoch": 4.299342828524662, "grad_norm": 0.001018465030938387, "learning_rate": 1.172765280036786e-07, "loss": 0.0, "num_input_tokens_seen": 118569832, "step": 175985 }, { "epoch": 4.299464979356509, "grad_norm": 0.03963853418827057, "learning_rate": 1.1723645988084862e-07, "loss": 0.0, "num_input_tokens_seen": 118572968, "step": 175990 }, { "epoch": 4.299587130188357, "grad_norm": 1.9976530893472955e-05, "learning_rate": 1.1719639817771244e-07, "loss": 0.0, "num_input_tokens_seen": 118576680, "step": 175995 }, { "epoch": 4.2997092810202036, "grad_norm": 0.0008707842789590359, "learning_rate": 1.1715634289456156e-07, "loss": 0.0, "num_input_tokens_seen": 118580456, "step": 176000 }, { "epoch": 4.299831431852051, "grad_norm": 0.0005335027817636728, "learning_rate": 1.1711629403168733e-07, "loss": 0.0, "num_input_tokens_seen": 118583272, "step": 176005 }, { "epoch": 4.299953582683898, "grad_norm": 0.0005570012144744396, "learning_rate": 1.1707625158938062e-07, "loss": 0.0, "num_input_tokens_seen": 118586984, "step": 176010 }, { "epoch": 4.300075733515746, "grad_norm": 9.334905917057768e-05, "learning_rate": 1.1703621556793308e-07, "loss": 0.0, "num_input_tokens_seen": 118590440, "step": 176015 }, { "epoch": 4.300197884347592, "grad_norm": 0.00020337030582595617, "learning_rate": 1.1699618596763549e-07, "loss": 0.0, "num_input_tokens_seen": 118593896, "step": 176020 }, { "epoch": 4.30032003517944, "grad_norm": 1.1005053238477558e-05, "learning_rate": 1.1695616278877929e-07, "loss": 0.0001, "num_input_tokens_seen": 118597032, "step": 176025 }, { "epoch": 4.300442186011287, "grad_norm": 1.5739868103992194e-05, "learning_rate": 1.1691614603165522e-07, "loss": 0.0, "num_input_tokens_seen": 118600552, "step": 176030 }, { "epoch": 4.300564336843134, "grad_norm": 7.899448974058032e-05, "learning_rate": 1.1687613569655464e-07, "loss": 0.0, "num_input_tokens_seen": 118604072, "step": 176035 }, { "epoch": 4.300686487674981, "grad_norm": 4.880525011685677e-05, "learning_rate": 1.1683613178376816e-07, "loss": 0.0, "num_input_tokens_seen": 118607336, "step": 176040 }, { "epoch": 4.300808638506828, "grad_norm": 0.00020250272064004093, "learning_rate": 1.1679613429358681e-07, "loss": 0.0, "num_input_tokens_seen": 118610728, "step": 176045 }, { "epoch": 4.3009307893386755, "grad_norm": 0.00022577719937544316, "learning_rate": 1.1675614322630179e-07, "loss": 0.0, "num_input_tokens_seen": 118613736, "step": 176050 }, { "epoch": 4.301052940170522, "grad_norm": 1.0512053449929226e-05, "learning_rate": 1.1671615858220352e-07, "loss": 0.0, "num_input_tokens_seen": 118617000, "step": 176055 }, { "epoch": 4.30117509100237, "grad_norm": 2.0839153876295313e-05, "learning_rate": 1.16676180361583e-07, "loss": 0.0001, "num_input_tokens_seen": 118620264, "step": 176060 }, { "epoch": 4.301297241834217, "grad_norm": 9.713278996059671e-05, "learning_rate": 1.1663620856473078e-07, "loss": 0.0, "num_input_tokens_seen": 118623464, "step": 176065 }, { "epoch": 4.301419392666064, "grad_norm": 1.1436190106905997e-05, "learning_rate": 1.1659624319193751e-07, "loss": 0.0, "num_input_tokens_seen": 118627240, "step": 176070 }, { "epoch": 4.301541543497911, "grad_norm": 0.00018494624237064272, "learning_rate": 1.1655628424349428e-07, "loss": 0.0, "num_input_tokens_seen": 118630376, "step": 176075 }, { "epoch": 4.301663694329759, "grad_norm": 0.00010769705113489181, "learning_rate": 1.165163317196911e-07, "loss": 0.0852, "num_input_tokens_seen": 118633704, "step": 176080 }, { "epoch": 4.301785845161605, "grad_norm": 0.01379750669002533, "learning_rate": 1.1647638562081907e-07, "loss": 0.0, "num_input_tokens_seen": 118637352, "step": 176085 }, { "epoch": 4.301907995993453, "grad_norm": 0.0006048534414730966, "learning_rate": 1.1643644594716817e-07, "loss": 0.0, "num_input_tokens_seen": 118640616, "step": 176090 }, { "epoch": 4.3020301468253, "grad_norm": 0.0024736644700169563, "learning_rate": 1.1639651269902928e-07, "loss": 0.0, "num_input_tokens_seen": 118643944, "step": 176095 }, { "epoch": 4.302152297657147, "grad_norm": 8.007245196495205e-05, "learning_rate": 1.1635658587669239e-07, "loss": 0.0, "num_input_tokens_seen": 118647400, "step": 176100 }, { "epoch": 4.302274448488994, "grad_norm": 0.000884940498508513, "learning_rate": 1.1631666548044827e-07, "loss": 0.0, "num_input_tokens_seen": 118650664, "step": 176105 }, { "epoch": 4.302396599320842, "grad_norm": 9.08693255041726e-05, "learning_rate": 1.1627675151058703e-07, "loss": 0.0, "num_input_tokens_seen": 118653800, "step": 176110 }, { "epoch": 4.3025187501526885, "grad_norm": 3.714384729391895e-05, "learning_rate": 1.1623684396739885e-07, "loss": 0.0, "num_input_tokens_seen": 118656872, "step": 176115 }, { "epoch": 4.302640900984536, "grad_norm": 5.196634901949437e-06, "learning_rate": 1.161969428511741e-07, "loss": 0.0001, "num_input_tokens_seen": 118660392, "step": 176120 }, { "epoch": 4.302763051816383, "grad_norm": 0.003202979452908039, "learning_rate": 1.1615704816220284e-07, "loss": 0.0, "num_input_tokens_seen": 118663656, "step": 176125 }, { "epoch": 4.3028852026482305, "grad_norm": 0.000134141169837676, "learning_rate": 1.1611715990077531e-07, "loss": 0.0, "num_input_tokens_seen": 118666856, "step": 176130 }, { "epoch": 4.303007353480077, "grad_norm": 0.0003105984069406986, "learning_rate": 1.1607727806718138e-07, "loss": 0.0, "num_input_tokens_seen": 118670184, "step": 176135 }, { "epoch": 4.303129504311924, "grad_norm": 0.00014293364074546844, "learning_rate": 1.1603740266171124e-07, "loss": 0.0, "num_input_tokens_seen": 118673384, "step": 176140 }, { "epoch": 4.303251655143772, "grad_norm": 0.00013589364243671298, "learning_rate": 1.1599753368465515e-07, "loss": 0.0, "num_input_tokens_seen": 118676776, "step": 176145 }, { "epoch": 4.303373805975618, "grad_norm": 0.002318460727110505, "learning_rate": 1.159576711363025e-07, "loss": 0.0, "num_input_tokens_seen": 118679912, "step": 176150 }, { "epoch": 4.303495956807466, "grad_norm": 0.000528425385709852, "learning_rate": 1.1591781501694365e-07, "loss": 0.0, "num_input_tokens_seen": 118682984, "step": 176155 }, { "epoch": 4.303618107639313, "grad_norm": 5.2127357776043937e-05, "learning_rate": 1.15877965326868e-07, "loss": 0.0, "num_input_tokens_seen": 118686248, "step": 176160 }, { "epoch": 4.30374025847116, "grad_norm": 0.00026971526676788926, "learning_rate": 1.1583812206636556e-07, "loss": 0.0, "num_input_tokens_seen": 118689448, "step": 176165 }, { "epoch": 4.303862409303007, "grad_norm": 0.0002909989270847291, "learning_rate": 1.1579828523572632e-07, "loss": 0.0, "num_input_tokens_seen": 118692840, "step": 176170 }, { "epoch": 4.303984560134855, "grad_norm": 0.0005220604944042861, "learning_rate": 1.157584548352396e-07, "loss": 0.0, "num_input_tokens_seen": 118696040, "step": 176175 }, { "epoch": 4.3041067109667015, "grad_norm": 2.770401624729857e-05, "learning_rate": 1.157186308651955e-07, "loss": 0.0, "num_input_tokens_seen": 118699240, "step": 176180 }, { "epoch": 4.304228861798549, "grad_norm": 0.010921383276581764, "learning_rate": 1.1567881332588303e-07, "loss": 0.0, "num_input_tokens_seen": 118702376, "step": 176185 }, { "epoch": 4.304351012630396, "grad_norm": 1.1668385923258029e-05, "learning_rate": 1.1563900221759238e-07, "loss": 0.0, "num_input_tokens_seen": 118705704, "step": 176190 }, { "epoch": 4.3044731634622435, "grad_norm": 7.894026202848181e-05, "learning_rate": 1.1559919754061253e-07, "loss": 0.0, "num_input_tokens_seen": 118709032, "step": 176195 }, { "epoch": 4.30459531429409, "grad_norm": 3.2366890081902966e-05, "learning_rate": 1.155593992952334e-07, "loss": 0.0, "num_input_tokens_seen": 118712680, "step": 176200 }, { "epoch": 4.304717465125938, "grad_norm": 3.0066912586335093e-05, "learning_rate": 1.1551960748174405e-07, "loss": 0.0, "num_input_tokens_seen": 118716008, "step": 176205 }, { "epoch": 4.304839615957785, "grad_norm": 0.00018369973986409605, "learning_rate": 1.1547982210043417e-07, "loss": 0.0, "num_input_tokens_seen": 118719528, "step": 176210 }, { "epoch": 4.304961766789631, "grad_norm": 0.0011245689820498228, "learning_rate": 1.1544004315159284e-07, "loss": 0.0, "num_input_tokens_seen": 118723240, "step": 176215 }, { "epoch": 4.305083917621479, "grad_norm": 2.8072752684238367e-05, "learning_rate": 1.1540027063550939e-07, "loss": 0.0, "num_input_tokens_seen": 118726248, "step": 176220 }, { "epoch": 4.305206068453326, "grad_norm": 7.902841753093526e-05, "learning_rate": 1.1536050455247304e-07, "loss": 0.0, "num_input_tokens_seen": 118729512, "step": 176225 }, { "epoch": 4.305328219285173, "grad_norm": 0.00010827074584085494, "learning_rate": 1.1532074490277321e-07, "loss": 0.0, "num_input_tokens_seen": 118732584, "step": 176230 }, { "epoch": 4.30545037011702, "grad_norm": 0.0008183018071576953, "learning_rate": 1.152809916866987e-07, "loss": 0.0, "num_input_tokens_seen": 118735656, "step": 176235 }, { "epoch": 4.305572520948868, "grad_norm": 0.0002630564267747104, "learning_rate": 1.152412449045389e-07, "loss": 0.0, "num_input_tokens_seen": 118739368, "step": 176240 }, { "epoch": 4.3056946717807145, "grad_norm": 0.00021008927433285862, "learning_rate": 1.1520150455658261e-07, "loss": 0.0794, "num_input_tokens_seen": 118742632, "step": 176245 }, { "epoch": 4.305816822612562, "grad_norm": 2.9066259230603464e-05, "learning_rate": 1.1516177064311916e-07, "loss": 0.0, "num_input_tokens_seen": 118746280, "step": 176250 }, { "epoch": 4.305938973444409, "grad_norm": 0.022015077993273735, "learning_rate": 1.1512204316443719e-07, "loss": 0.0, "num_input_tokens_seen": 118749608, "step": 176255 }, { "epoch": 4.3060611242762565, "grad_norm": 0.0004457909963093698, "learning_rate": 1.1508232212082559e-07, "loss": 0.0, "num_input_tokens_seen": 118752936, "step": 176260 }, { "epoch": 4.306183275108103, "grad_norm": 5.9687656175810844e-05, "learning_rate": 1.1504260751257366e-07, "loss": 0.0, "num_input_tokens_seen": 118756648, "step": 176265 }, { "epoch": 4.306305425939951, "grad_norm": 0.0004831032711081207, "learning_rate": 1.1500289933996965e-07, "loss": 0.0, "num_input_tokens_seen": 118760488, "step": 176270 }, { "epoch": 4.306427576771798, "grad_norm": 2.4277231204905547e-05, "learning_rate": 1.1496319760330276e-07, "loss": 0.0, "num_input_tokens_seen": 118763944, "step": 176275 }, { "epoch": 4.306549727603645, "grad_norm": 0.0004642781859729439, "learning_rate": 1.149235023028614e-07, "loss": 0.0, "num_input_tokens_seen": 118767272, "step": 176280 }, { "epoch": 4.306671878435492, "grad_norm": 0.0004993979819118977, "learning_rate": 1.1488381343893461e-07, "loss": 0.0001, "num_input_tokens_seen": 118771048, "step": 176285 }, { "epoch": 4.30679402926734, "grad_norm": 0.0004221405542921275, "learning_rate": 1.1484413101181057e-07, "loss": 0.0, "num_input_tokens_seen": 118774120, "step": 176290 }, { "epoch": 4.306916180099186, "grad_norm": 4.720066954178037e-06, "learning_rate": 1.1480445502177805e-07, "loss": 0.0, "num_input_tokens_seen": 118777576, "step": 176295 }, { "epoch": 4.307038330931034, "grad_norm": 0.00036887277383357286, "learning_rate": 1.1476478546912582e-07, "loss": 0.0, "num_input_tokens_seen": 118781864, "step": 176300 }, { "epoch": 4.307160481762881, "grad_norm": 0.00020793842850252986, "learning_rate": 1.14725122354142e-07, "loss": 0.0, "num_input_tokens_seen": 118785448, "step": 176305 }, { "epoch": 4.3072826325947275, "grad_norm": 0.0008742841309867799, "learning_rate": 1.1468546567711545e-07, "loss": 0.0, "num_input_tokens_seen": 118788392, "step": 176310 }, { "epoch": 4.307404783426575, "grad_norm": 8.221582538681105e-05, "learning_rate": 1.1464581543833429e-07, "loss": 0.0, "num_input_tokens_seen": 118791464, "step": 176315 }, { "epoch": 4.307526934258422, "grad_norm": 0.00010175922216149047, "learning_rate": 1.1460617163808661e-07, "loss": 0.0, "num_input_tokens_seen": 118795432, "step": 176320 }, { "epoch": 4.30764908509027, "grad_norm": 0.008732876740396023, "learning_rate": 1.145665342766613e-07, "loss": 0.0465, "num_input_tokens_seen": 118798952, "step": 176325 }, { "epoch": 4.307771235922116, "grad_norm": 0.003801588201895356, "learning_rate": 1.14526903354346e-07, "loss": 0.0, "num_input_tokens_seen": 118801960, "step": 176330 }, { "epoch": 4.307893386753964, "grad_norm": 0.0003274581686127931, "learning_rate": 1.1448727887142951e-07, "loss": 0.0, "num_input_tokens_seen": 118805288, "step": 176335 }, { "epoch": 4.308015537585811, "grad_norm": 2.4809673050185665e-05, "learning_rate": 1.1444766082819945e-07, "loss": 0.0, "num_input_tokens_seen": 118808488, "step": 176340 }, { "epoch": 4.308137688417658, "grad_norm": 0.0007116202614270151, "learning_rate": 1.144080492249444e-07, "loss": 0.0, "num_input_tokens_seen": 118811624, "step": 176345 }, { "epoch": 4.308259839249505, "grad_norm": 6.148311513243243e-05, "learning_rate": 1.1436844406195211e-07, "loss": 0.0, "num_input_tokens_seen": 118814824, "step": 176350 }, { "epoch": 4.308381990081353, "grad_norm": 0.000994387548416853, "learning_rate": 1.1432884533951059e-07, "loss": 0.0213, "num_input_tokens_seen": 118817896, "step": 176355 }, { "epoch": 4.308504140913199, "grad_norm": 0.0012195755261927843, "learning_rate": 1.1428925305790815e-07, "loss": 0.0, "num_input_tokens_seen": 118821416, "step": 176360 }, { "epoch": 4.308626291745047, "grad_norm": 0.0007472949801012874, "learning_rate": 1.1424966721743224e-07, "loss": 0.0, "num_input_tokens_seen": 118824808, "step": 176365 }, { "epoch": 4.308748442576894, "grad_norm": 0.001678556320257485, "learning_rate": 1.1421008781837127e-07, "loss": 0.0, "num_input_tokens_seen": 118828392, "step": 176370 }, { "epoch": 4.3088705934087415, "grad_norm": 0.00041926439735107124, "learning_rate": 1.1417051486101248e-07, "loss": 0.0, "num_input_tokens_seen": 118831656, "step": 176375 }, { "epoch": 4.308992744240588, "grad_norm": 6.319615931715816e-05, "learning_rate": 1.1413094834564408e-07, "loss": 0.0, "num_input_tokens_seen": 118835048, "step": 176380 }, { "epoch": 4.309114895072436, "grad_norm": 8.652818360133097e-05, "learning_rate": 1.1409138827255382e-07, "loss": 0.0, "num_input_tokens_seen": 118838376, "step": 176385 }, { "epoch": 4.309237045904283, "grad_norm": 0.00018144700152333826, "learning_rate": 1.1405183464202916e-07, "loss": 0.0, "num_input_tokens_seen": 118841512, "step": 176390 }, { "epoch": 4.30935919673613, "grad_norm": 0.0005828774883411825, "learning_rate": 1.1401228745435799e-07, "loss": 0.0, "num_input_tokens_seen": 118844648, "step": 176395 }, { "epoch": 4.309481347567977, "grad_norm": 6.49345474812435e-06, "learning_rate": 1.1397274670982748e-07, "loss": 0.0, "num_input_tokens_seen": 118847848, "step": 176400 }, { "epoch": 4.309603498399824, "grad_norm": 0.004258220084011555, "learning_rate": 1.1393321240872578e-07, "loss": 0.0, "num_input_tokens_seen": 118850984, "step": 176405 }, { "epoch": 4.309725649231671, "grad_norm": 0.00043068305240012705, "learning_rate": 1.1389368455133985e-07, "loss": 0.0, "num_input_tokens_seen": 118854568, "step": 176410 }, { "epoch": 4.309847800063518, "grad_norm": 0.0021215358283370733, "learning_rate": 1.138541631379576e-07, "loss": 0.0, "num_input_tokens_seen": 118858088, "step": 176415 }, { "epoch": 4.309969950895366, "grad_norm": 1.3441069313557819e-05, "learning_rate": 1.138146481688662e-07, "loss": 0.0, "num_input_tokens_seen": 118861416, "step": 176420 }, { "epoch": 4.3100921017272125, "grad_norm": 4.286822149879299e-05, "learning_rate": 1.1377513964435292e-07, "loss": 0.0, "num_input_tokens_seen": 118865064, "step": 176425 }, { "epoch": 4.31021425255906, "grad_norm": 0.0008951426716521382, "learning_rate": 1.1373563756470527e-07, "loss": 0.0, "num_input_tokens_seen": 118868520, "step": 176430 }, { "epoch": 4.310336403390907, "grad_norm": 0.0001404430076945573, "learning_rate": 1.1369614193021027e-07, "loss": 0.0, "num_input_tokens_seen": 118871720, "step": 176435 }, { "epoch": 4.3104585542227545, "grad_norm": 0.00013012201816309243, "learning_rate": 1.1365665274115554e-07, "loss": 0.0, "num_input_tokens_seen": 118874792, "step": 176440 }, { "epoch": 4.310580705054601, "grad_norm": 0.00011794355668826029, "learning_rate": 1.1361716999782778e-07, "loss": 0.0, "num_input_tokens_seen": 118878248, "step": 176445 }, { "epoch": 4.310702855886449, "grad_norm": 0.0004219324328005314, "learning_rate": 1.135776937005144e-07, "loss": 0.0, "num_input_tokens_seen": 118881448, "step": 176450 }, { "epoch": 4.310825006718296, "grad_norm": 0.0328974686563015, "learning_rate": 1.1353822384950263e-07, "loss": 0.0, "num_input_tokens_seen": 118885096, "step": 176455 }, { "epoch": 4.310947157550143, "grad_norm": 4.000966146122664e-05, "learning_rate": 1.1349876044507922e-07, "loss": 0.0, "num_input_tokens_seen": 118888616, "step": 176460 }, { "epoch": 4.31106930838199, "grad_norm": 0.00020574162772390991, "learning_rate": 1.134593034875313e-07, "loss": 0.0, "num_input_tokens_seen": 118892072, "step": 176465 }, { "epoch": 4.311191459213838, "grad_norm": 0.0006144284270703793, "learning_rate": 1.1341985297714573e-07, "loss": 0.0001, "num_input_tokens_seen": 118895272, "step": 176470 }, { "epoch": 4.311313610045684, "grad_norm": 5.5516093198093586e-06, "learning_rate": 1.1338040891420941e-07, "loss": 0.0, "num_input_tokens_seen": 118898920, "step": 176475 }, { "epoch": 4.311435760877531, "grad_norm": 3.77788492187392e-05, "learning_rate": 1.1334097129900932e-07, "loss": 0.0, "num_input_tokens_seen": 118902568, "step": 176480 }, { "epoch": 4.311557911709379, "grad_norm": 0.00012832036009058356, "learning_rate": 1.1330154013183213e-07, "loss": 0.0, "num_input_tokens_seen": 118906024, "step": 176485 }, { "epoch": 4.311680062541226, "grad_norm": 8.782917575445026e-05, "learning_rate": 1.1326211541296471e-07, "loss": 0.0, "num_input_tokens_seen": 118909288, "step": 176490 }, { "epoch": 4.311802213373073, "grad_norm": 5.389552461565472e-05, "learning_rate": 1.1322269714269361e-07, "loss": 0.0, "num_input_tokens_seen": 118912360, "step": 176495 }, { "epoch": 4.31192436420492, "grad_norm": 0.00011952647037105635, "learning_rate": 1.1318328532130561e-07, "loss": 0.0, "num_input_tokens_seen": 118915496, "step": 176500 }, { "epoch": 4.3120465150367675, "grad_norm": 0.0003824408631771803, "learning_rate": 1.1314387994908726e-07, "loss": 0.0, "num_input_tokens_seen": 118918632, "step": 176505 }, { "epoch": 4.312168665868614, "grad_norm": 4.39748982898891e-05, "learning_rate": 1.1310448102632519e-07, "loss": 0.0, "num_input_tokens_seen": 118921640, "step": 176510 }, { "epoch": 4.312290816700462, "grad_norm": 8.625312148069497e-06, "learning_rate": 1.1306508855330576e-07, "loss": 0.0, "num_input_tokens_seen": 118925224, "step": 176515 }, { "epoch": 4.312412967532309, "grad_norm": 2.4011906134546734e-05, "learning_rate": 1.1302570253031573e-07, "loss": 0.0, "num_input_tokens_seen": 118928360, "step": 176520 }, { "epoch": 4.312535118364156, "grad_norm": 0.0008898421074263752, "learning_rate": 1.1298632295764143e-07, "loss": 0.0, "num_input_tokens_seen": 118931432, "step": 176525 }, { "epoch": 4.312657269196003, "grad_norm": 6.401252176146954e-05, "learning_rate": 1.1294694983556896e-07, "loss": 0.0, "num_input_tokens_seen": 118934824, "step": 176530 }, { "epoch": 4.312779420027851, "grad_norm": 2.1623593056574464e-05, "learning_rate": 1.1290758316438476e-07, "loss": 0.0, "num_input_tokens_seen": 118937832, "step": 176535 }, { "epoch": 4.312901570859697, "grad_norm": 0.00032035564072430134, "learning_rate": 1.1286822294437548e-07, "loss": 0.0, "num_input_tokens_seen": 118940904, "step": 176540 }, { "epoch": 4.313023721691545, "grad_norm": 0.010348553769290447, "learning_rate": 1.128288691758269e-07, "loss": 0.0, "num_input_tokens_seen": 118944552, "step": 176545 }, { "epoch": 4.313145872523392, "grad_norm": 6.47408960503526e-05, "learning_rate": 1.1278952185902557e-07, "loss": 0.0001, "num_input_tokens_seen": 118948136, "step": 176550 }, { "epoch": 4.313268023355239, "grad_norm": 0.08188897371292114, "learning_rate": 1.1275018099425738e-07, "loss": 0.0, "num_input_tokens_seen": 118951656, "step": 176555 }, { "epoch": 4.313390174187086, "grad_norm": 0.013791157864034176, "learning_rate": 1.1271084658180862e-07, "loss": 0.0203, "num_input_tokens_seen": 118954856, "step": 176560 }, { "epoch": 4.313512325018934, "grad_norm": 0.0006009989883750677, "learning_rate": 1.1267151862196501e-07, "loss": 0.0, "num_input_tokens_seen": 118958056, "step": 176565 }, { "epoch": 4.3136344758507805, "grad_norm": 0.0009822046849876642, "learning_rate": 1.1263219711501282e-07, "loss": 0.0002, "num_input_tokens_seen": 118961448, "step": 176570 }, { "epoch": 4.313756626682627, "grad_norm": 0.00011430822632974014, "learning_rate": 1.1259288206123818e-07, "loss": 0.0, "num_input_tokens_seen": 118965416, "step": 176575 }, { "epoch": 4.313878777514475, "grad_norm": 37.28138732910156, "learning_rate": 1.1255357346092653e-07, "loss": 0.0641, "num_input_tokens_seen": 118968552, "step": 176580 }, { "epoch": 4.314000928346322, "grad_norm": 0.00012667212286032736, "learning_rate": 1.125142713143642e-07, "loss": 0.0, "num_input_tokens_seen": 118971880, "step": 176585 }, { "epoch": 4.314123079178169, "grad_norm": 0.0009236466721631587, "learning_rate": 1.1247497562183661e-07, "loss": 0.0, "num_input_tokens_seen": 118975592, "step": 176590 }, { "epoch": 4.314245230010016, "grad_norm": 0.0015617778990417719, "learning_rate": 1.1243568638362988e-07, "loss": 0.0001, "num_input_tokens_seen": 118979432, "step": 176595 }, { "epoch": 4.314367380841864, "grad_norm": 0.00024131375539582223, "learning_rate": 1.1239640360002945e-07, "loss": 0.0, "num_input_tokens_seen": 118982952, "step": 176600 }, { "epoch": 4.31448953167371, "grad_norm": 0.0003638735506683588, "learning_rate": 1.1235712727132107e-07, "loss": 0.0, "num_input_tokens_seen": 118986024, "step": 176605 }, { "epoch": 4.314611682505558, "grad_norm": 0.00020771405252162367, "learning_rate": 1.1231785739779065e-07, "loss": 0.0, "num_input_tokens_seen": 118988904, "step": 176610 }, { "epoch": 4.314733833337405, "grad_norm": 5.820173100801185e-06, "learning_rate": 1.1227859397972328e-07, "loss": 0.0, "num_input_tokens_seen": 118992232, "step": 176615 }, { "epoch": 4.314855984169252, "grad_norm": 0.0002828043943736702, "learning_rate": 1.1223933701740484e-07, "loss": 0.0, "num_input_tokens_seen": 118995432, "step": 176620 }, { "epoch": 4.314978135001099, "grad_norm": 4.8877256631385535e-05, "learning_rate": 1.1220008651112089e-07, "loss": 0.0, "num_input_tokens_seen": 118998760, "step": 176625 }, { "epoch": 4.315100285832947, "grad_norm": 1.4099057807470672e-05, "learning_rate": 1.1216084246115642e-07, "loss": 0.0, "num_input_tokens_seen": 119002216, "step": 176630 }, { "epoch": 4.3152224366647935, "grad_norm": 0.0008657033322378993, "learning_rate": 1.1212160486779732e-07, "loss": 0.0002, "num_input_tokens_seen": 119005288, "step": 176635 }, { "epoch": 4.315344587496641, "grad_norm": 4.1478779166936874e-05, "learning_rate": 1.1208237373132845e-07, "loss": 0.0, "num_input_tokens_seen": 119008552, "step": 176640 }, { "epoch": 4.315466738328488, "grad_norm": 6.402322469512001e-05, "learning_rate": 1.1204314905203571e-07, "loss": 0.0, "num_input_tokens_seen": 119011880, "step": 176645 }, { "epoch": 4.315588889160336, "grad_norm": 8.852753671817482e-05, "learning_rate": 1.1200393083020376e-07, "loss": 0.0, "num_input_tokens_seen": 119015336, "step": 176650 }, { "epoch": 4.315711039992182, "grad_norm": 3.6001690659759333e-06, "learning_rate": 1.1196471906611826e-07, "loss": 0.0, "num_input_tokens_seen": 119018728, "step": 176655 }, { "epoch": 4.31583319082403, "grad_norm": 1.8246606487082317e-05, "learning_rate": 1.1192551376006398e-07, "loss": 0.0, "num_input_tokens_seen": 119022056, "step": 176660 }, { "epoch": 4.315955341655877, "grad_norm": 4.046333197038621e-05, "learning_rate": 1.1188631491232626e-07, "loss": 0.0, "num_input_tokens_seen": 119025448, "step": 176665 }, { "epoch": 4.316077492487723, "grad_norm": 0.0010642303386703134, "learning_rate": 1.1184712252319028e-07, "loss": 0.0, "num_input_tokens_seen": 119029544, "step": 176670 }, { "epoch": 4.316199643319571, "grad_norm": 0.0003580785123631358, "learning_rate": 1.1180793659294074e-07, "loss": 0.0, "num_input_tokens_seen": 119032808, "step": 176675 }, { "epoch": 4.316321794151418, "grad_norm": 0.00015057233395054936, "learning_rate": 1.1176875712186295e-07, "loss": 0.0, "num_input_tokens_seen": 119036328, "step": 176680 }, { "epoch": 4.3164439449832654, "grad_norm": 6.504006159957498e-05, "learning_rate": 1.1172958411024147e-07, "loss": 0.0029, "num_input_tokens_seen": 119039848, "step": 176685 }, { "epoch": 4.316566095815112, "grad_norm": 0.0016117613995447755, "learning_rate": 1.116904175583614e-07, "loss": 0.0, "num_input_tokens_seen": 119042984, "step": 176690 }, { "epoch": 4.31668824664696, "grad_norm": 6.902215318405069e-06, "learning_rate": 1.1165125746650771e-07, "loss": 0.0143, "num_input_tokens_seen": 119046760, "step": 176695 }, { "epoch": 4.316810397478807, "grad_norm": 0.3850659430027008, "learning_rate": 1.1161210383496478e-07, "loss": 0.0001, "num_input_tokens_seen": 119049960, "step": 176700 }, { "epoch": 4.316932548310654, "grad_norm": 0.012357803992927074, "learning_rate": 1.1157295666401789e-07, "loss": 0.0, "num_input_tokens_seen": 119052968, "step": 176705 }, { "epoch": 4.317054699142501, "grad_norm": 0.00010526581172598526, "learning_rate": 1.1153381595395117e-07, "loss": 0.0, "num_input_tokens_seen": 119056232, "step": 176710 }, { "epoch": 4.317176849974349, "grad_norm": 2.6836085453396663e-05, "learning_rate": 1.114946817050496e-07, "loss": 0.0169, "num_input_tokens_seen": 119059880, "step": 176715 }, { "epoch": 4.317299000806195, "grad_norm": 0.000445863523054868, "learning_rate": 1.1145555391759764e-07, "loss": 0.0, "num_input_tokens_seen": 119062952, "step": 176720 }, { "epoch": 4.317421151638043, "grad_norm": 0.0002292140416102484, "learning_rate": 1.1141643259187994e-07, "loss": 0.0, "num_input_tokens_seen": 119066088, "step": 176725 }, { "epoch": 4.31754330246989, "grad_norm": 0.000498281151521951, "learning_rate": 1.1137731772818105e-07, "loss": 0.0, "num_input_tokens_seen": 119069480, "step": 176730 }, { "epoch": 4.317665453301737, "grad_norm": 0.00032792615820653737, "learning_rate": 1.1133820932678506e-07, "loss": 0.0, "num_input_tokens_seen": 119072680, "step": 176735 }, { "epoch": 4.317787604133584, "grad_norm": 0.00145147112198174, "learning_rate": 1.1129910738797688e-07, "loss": 0.0, "num_input_tokens_seen": 119076200, "step": 176740 }, { "epoch": 4.317909754965432, "grad_norm": 6.6974112087336835e-06, "learning_rate": 1.1126001191204038e-07, "loss": 0.0, "num_input_tokens_seen": 119079016, "step": 176745 }, { "epoch": 4.3180319057972785, "grad_norm": 0.0011043348349630833, "learning_rate": 1.1122092289926033e-07, "loss": 0.0, "num_input_tokens_seen": 119082280, "step": 176750 }, { "epoch": 4.318154056629126, "grad_norm": 0.00018441988504491746, "learning_rate": 1.1118184034992062e-07, "loss": 0.0, "num_input_tokens_seen": 119085544, "step": 176755 }, { "epoch": 4.318276207460973, "grad_norm": 9.656700422056019e-05, "learning_rate": 1.1114276426430558e-07, "loss": 0.0, "num_input_tokens_seen": 119088616, "step": 176760 }, { "epoch": 4.31839835829282, "grad_norm": 0.004439180716872215, "learning_rate": 1.1110369464269964e-07, "loss": 0.0001, "num_input_tokens_seen": 119092776, "step": 176765 }, { "epoch": 4.318520509124667, "grad_norm": 1.4901905160513707e-05, "learning_rate": 1.1106463148538659e-07, "loss": 0.0, "num_input_tokens_seen": 119096616, "step": 176770 }, { "epoch": 4.318642659956514, "grad_norm": 0.001198622863739729, "learning_rate": 1.1102557479265074e-07, "loss": 0.0, "num_input_tokens_seen": 119100264, "step": 176775 }, { "epoch": 4.318764810788362, "grad_norm": 0.0009047950152307749, "learning_rate": 1.1098652456477586e-07, "loss": 0.0, "num_input_tokens_seen": 119103464, "step": 176780 }, { "epoch": 4.318886961620208, "grad_norm": 5.245099964668043e-05, "learning_rate": 1.1094748080204608e-07, "loss": 0.0, "num_input_tokens_seen": 119106472, "step": 176785 }, { "epoch": 4.319009112452056, "grad_norm": 3.678392749861814e-05, "learning_rate": 1.1090844350474559e-07, "loss": 0.0, "num_input_tokens_seen": 119109736, "step": 176790 }, { "epoch": 4.319131263283903, "grad_norm": 3.22956548188813e-05, "learning_rate": 1.1086941267315775e-07, "loss": 0.0, "num_input_tokens_seen": 119112936, "step": 176795 }, { "epoch": 4.31925341411575, "grad_norm": 39.47646713256836, "learning_rate": 1.1083038830756697e-07, "loss": 0.0632, "num_input_tokens_seen": 119116264, "step": 176800 }, { "epoch": 4.319375564947597, "grad_norm": 0.0001879289629869163, "learning_rate": 1.1079137040825648e-07, "loss": 0.0, "num_input_tokens_seen": 119119528, "step": 176805 }, { "epoch": 4.319497715779445, "grad_norm": 0.0004902026848867536, "learning_rate": 1.107523589755105e-07, "loss": 0.0595, "num_input_tokens_seen": 119122920, "step": 176810 }, { "epoch": 4.3196198666112915, "grad_norm": 0.000874120625667274, "learning_rate": 1.1071335400961245e-07, "loss": 0.0, "num_input_tokens_seen": 119126376, "step": 176815 }, { "epoch": 4.319742017443139, "grad_norm": 0.0103354062885046, "learning_rate": 1.1067435551084625e-07, "loss": 0.0, "num_input_tokens_seen": 119129448, "step": 176820 }, { "epoch": 4.319864168274986, "grad_norm": 0.00011349384294589981, "learning_rate": 1.1063536347949509e-07, "loss": 0.0, "num_input_tokens_seen": 119133096, "step": 176825 }, { "epoch": 4.3199863191068335, "grad_norm": 0.0028983631636947393, "learning_rate": 1.1059637791584298e-07, "loss": 0.0, "num_input_tokens_seen": 119136104, "step": 176830 }, { "epoch": 4.32010846993868, "grad_norm": 0.00027363133267499506, "learning_rate": 1.1055739882017323e-07, "loss": 0.0, "num_input_tokens_seen": 119139432, "step": 176835 }, { "epoch": 4.320230620770527, "grad_norm": 0.0002914174401666969, "learning_rate": 1.1051842619276918e-07, "loss": 0.0, "num_input_tokens_seen": 119142760, "step": 176840 }, { "epoch": 4.320352771602375, "grad_norm": 0.00014641509915236384, "learning_rate": 1.104794600339145e-07, "loss": 0.0, "num_input_tokens_seen": 119145960, "step": 176845 }, { "epoch": 4.320474922434221, "grad_norm": 62.26630783081055, "learning_rate": 1.104405003438923e-07, "loss": 0.0638, "num_input_tokens_seen": 119149032, "step": 176850 }, { "epoch": 4.320597073266069, "grad_norm": 3.0409164537559263e-05, "learning_rate": 1.1040154712298599e-07, "loss": 0.0, "num_input_tokens_seen": 119152552, "step": 176855 }, { "epoch": 4.320719224097916, "grad_norm": 0.00017086489242501557, "learning_rate": 1.1036260037147915e-07, "loss": 0.0, "num_input_tokens_seen": 119155752, "step": 176860 }, { "epoch": 4.320841374929763, "grad_norm": 0.0013268385082483292, "learning_rate": 1.1032366008965455e-07, "loss": 0.0, "num_input_tokens_seen": 119158888, "step": 176865 }, { "epoch": 4.32096352576161, "grad_norm": 0.0006828519399277866, "learning_rate": 1.1028472627779573e-07, "loss": 0.0, "num_input_tokens_seen": 119162088, "step": 176870 }, { "epoch": 4.321085676593458, "grad_norm": 0.0033956593833863735, "learning_rate": 1.1024579893618547e-07, "loss": 0.0, "num_input_tokens_seen": 119165032, "step": 176875 }, { "epoch": 4.3212078274253045, "grad_norm": 5.517356839845888e-05, "learning_rate": 1.102068780651072e-07, "loss": 0.0, "num_input_tokens_seen": 119168424, "step": 176880 }, { "epoch": 4.321329978257152, "grad_norm": 8.954934855864849e-06, "learning_rate": 1.1016796366484394e-07, "loss": 0.0, "num_input_tokens_seen": 119171560, "step": 176885 }, { "epoch": 4.321452129088999, "grad_norm": 5.516711826203391e-05, "learning_rate": 1.1012905573567843e-07, "loss": 0.0, "num_input_tokens_seen": 119174888, "step": 176890 }, { "epoch": 4.3215742799208465, "grad_norm": 0.00040915783029049635, "learning_rate": 1.1009015427789393e-07, "loss": 0.0, "num_input_tokens_seen": 119178536, "step": 176895 }, { "epoch": 4.321696430752693, "grad_norm": 2.283164212713018e-05, "learning_rate": 1.1005125929177306e-07, "loss": 0.0, "num_input_tokens_seen": 119181800, "step": 176900 }, { "epoch": 4.321818581584541, "grad_norm": 0.000558803731109947, "learning_rate": 1.1001237077759895e-07, "loss": 0.0, "num_input_tokens_seen": 119184808, "step": 176905 }, { "epoch": 4.321940732416388, "grad_norm": 6.331290933303535e-06, "learning_rate": 1.0997348873565404e-07, "loss": 0.0, "num_input_tokens_seen": 119188264, "step": 176910 }, { "epoch": 4.322062883248235, "grad_norm": 0.0001938117202371359, "learning_rate": 1.0993461316622132e-07, "loss": 0.0, "num_input_tokens_seen": 119191656, "step": 176915 }, { "epoch": 4.322185034080082, "grad_norm": 2.994046917592641e-05, "learning_rate": 1.0989574406958368e-07, "loss": 0.0, "num_input_tokens_seen": 119194856, "step": 176920 }, { "epoch": 4.32230718491193, "grad_norm": 8.466270628559869e-06, "learning_rate": 1.0985688144602346e-07, "loss": 0.0, "num_input_tokens_seen": 119198760, "step": 176925 }, { "epoch": 4.322429335743776, "grad_norm": 0.000782399030867964, "learning_rate": 1.0981802529582362e-07, "loss": 0.0, "num_input_tokens_seen": 119202280, "step": 176930 }, { "epoch": 4.322551486575623, "grad_norm": 0.00056214421056211, "learning_rate": 1.0977917561926642e-07, "loss": 0.0, "num_input_tokens_seen": 119205544, "step": 176935 }, { "epoch": 4.322673637407471, "grad_norm": 0.004536370746791363, "learning_rate": 1.0974033241663439e-07, "loss": 0.0, "num_input_tokens_seen": 119208808, "step": 176940 }, { "epoch": 4.3227957882393175, "grad_norm": 0.0001978017098736018, "learning_rate": 1.097014956882103e-07, "loss": 0.0, "num_input_tokens_seen": 119212264, "step": 176945 }, { "epoch": 4.322917939071165, "grad_norm": 0.0022211032919585705, "learning_rate": 1.0966266543427616e-07, "loss": 0.0, "num_input_tokens_seen": 119215976, "step": 176950 }, { "epoch": 4.323040089903012, "grad_norm": 0.00013491483696270734, "learning_rate": 1.0962384165511485e-07, "loss": 0.0, "num_input_tokens_seen": 119219560, "step": 176955 }, { "epoch": 4.3231622407348596, "grad_norm": 4.585308852256276e-05, "learning_rate": 1.0958502435100814e-07, "loss": 0.0, "num_input_tokens_seen": 119222952, "step": 176960 }, { "epoch": 4.323284391566706, "grad_norm": 5.633817636407912e-05, "learning_rate": 1.0954621352223892e-07, "loss": 0.0, "num_input_tokens_seen": 119226472, "step": 176965 }, { "epoch": 4.323406542398554, "grad_norm": 0.00012583695934154093, "learning_rate": 1.0950740916908896e-07, "loss": 0.0, "num_input_tokens_seen": 119229544, "step": 176970 }, { "epoch": 4.323528693230401, "grad_norm": 0.0007696277461946011, "learning_rate": 1.0946861129184048e-07, "loss": 0.0, "num_input_tokens_seen": 119232808, "step": 176975 }, { "epoch": 4.323650844062248, "grad_norm": 0.0008008855511434376, "learning_rate": 1.0942981989077615e-07, "loss": 0.0, "num_input_tokens_seen": 119235880, "step": 176980 }, { "epoch": 4.323772994894095, "grad_norm": 1.950888690771535e-05, "learning_rate": 1.093910349661774e-07, "loss": 0.0, "num_input_tokens_seen": 119239336, "step": 176985 }, { "epoch": 4.323895145725943, "grad_norm": 0.0029932560864835978, "learning_rate": 1.0935225651832691e-07, "loss": 0.0, "num_input_tokens_seen": 119242984, "step": 176990 }, { "epoch": 4.324017296557789, "grad_norm": 0.04415324330329895, "learning_rate": 1.0931348454750599e-07, "loss": 0.0, "num_input_tokens_seen": 119245992, "step": 176995 }, { "epoch": 4.324139447389637, "grad_norm": 0.0006158613250590861, "learning_rate": 1.0927471905399732e-07, "loss": 0.0, "num_input_tokens_seen": 119249192, "step": 177000 }, { "epoch": 4.324261598221484, "grad_norm": 0.0008558622212149203, "learning_rate": 1.0923596003808222e-07, "loss": 0.0, "num_input_tokens_seen": 119252264, "step": 177005 }, { "epoch": 4.3243837490533314, "grad_norm": 8.662411710247397e-05, "learning_rate": 1.091972075000428e-07, "loss": 0.0, "num_input_tokens_seen": 119255464, "step": 177010 }, { "epoch": 4.324505899885178, "grad_norm": 2.8647489671129733e-05, "learning_rate": 1.0915846144016117e-07, "loss": 0.0, "num_input_tokens_seen": 119259432, "step": 177015 }, { "epoch": 4.324628050717026, "grad_norm": 0.000773418927565217, "learning_rate": 1.0911972185871842e-07, "loss": 0.0, "num_input_tokens_seen": 119262632, "step": 177020 }, { "epoch": 4.324750201548873, "grad_norm": 0.00012162028724560514, "learning_rate": 1.0908098875599703e-07, "loss": 0.0, "num_input_tokens_seen": 119266280, "step": 177025 }, { "epoch": 4.324872352380719, "grad_norm": 0.0016422343906015158, "learning_rate": 1.0904226213227807e-07, "loss": 0.0, "num_input_tokens_seen": 119269352, "step": 177030 }, { "epoch": 4.324994503212567, "grad_norm": 0.0027196877636015415, "learning_rate": 1.0900354198784367e-07, "loss": 0.0, "num_input_tokens_seen": 119272360, "step": 177035 }, { "epoch": 4.325116654044414, "grad_norm": 7.278865814441815e-05, "learning_rate": 1.0896482832297515e-07, "loss": 0.0, "num_input_tokens_seen": 119275368, "step": 177040 }, { "epoch": 4.325238804876261, "grad_norm": 1.766349669196643e-05, "learning_rate": 1.0892612113795374e-07, "loss": 0.0, "num_input_tokens_seen": 119278312, "step": 177045 }, { "epoch": 4.325360955708108, "grad_norm": 1.544174665468745e-05, "learning_rate": 1.0888742043306154e-07, "loss": 0.0, "num_input_tokens_seen": 119281704, "step": 177050 }, { "epoch": 4.325483106539956, "grad_norm": 9.286731255997438e-06, "learning_rate": 1.0884872620857954e-07, "loss": 0.0, "num_input_tokens_seen": 119284712, "step": 177055 }, { "epoch": 4.3256052573718025, "grad_norm": 0.0032419469207525253, "learning_rate": 1.0881003846478942e-07, "loss": 0.1786, "num_input_tokens_seen": 119287912, "step": 177060 }, { "epoch": 4.32572740820365, "grad_norm": 2.6196485123364255e-05, "learning_rate": 1.0877135720197228e-07, "loss": 0.0, "num_input_tokens_seen": 119291368, "step": 177065 }, { "epoch": 4.325849559035497, "grad_norm": 0.0016508515691384673, "learning_rate": 1.0873268242040945e-07, "loss": 0.0, "num_input_tokens_seen": 119294696, "step": 177070 }, { "epoch": 4.3259717098673445, "grad_norm": 7.895252201706171e-05, "learning_rate": 1.0869401412038248e-07, "loss": 0.0, "num_input_tokens_seen": 119297960, "step": 177075 }, { "epoch": 4.326093860699191, "grad_norm": 0.011798272840678692, "learning_rate": 1.0865535230217226e-07, "loss": 0.0, "num_input_tokens_seen": 119301416, "step": 177080 }, { "epoch": 4.326216011531039, "grad_norm": 0.002923740306869149, "learning_rate": 1.0861669696606024e-07, "loss": 0.0, "num_input_tokens_seen": 119305000, "step": 177085 }, { "epoch": 4.326338162362886, "grad_norm": 0.0003772857889998704, "learning_rate": 1.0857804811232707e-07, "loss": 0.0, "num_input_tokens_seen": 119308328, "step": 177090 }, { "epoch": 4.326460313194733, "grad_norm": 0.001669778604991734, "learning_rate": 1.0853940574125419e-07, "loss": 0.0, "num_input_tokens_seen": 119311208, "step": 177095 }, { "epoch": 4.32658246402658, "grad_norm": 0.00031803478486835957, "learning_rate": 1.0850076985312262e-07, "loss": 0.0, "num_input_tokens_seen": 119314152, "step": 177100 }, { "epoch": 4.326704614858427, "grad_norm": 9.132656850852072e-05, "learning_rate": 1.0846214044821311e-07, "loss": 0.0, "num_input_tokens_seen": 119317544, "step": 177105 }, { "epoch": 4.326826765690274, "grad_norm": 0.00018010310304816812, "learning_rate": 1.084235175268069e-07, "loss": 0.0, "num_input_tokens_seen": 119320616, "step": 177110 }, { "epoch": 4.326948916522121, "grad_norm": 0.0012703038519248366, "learning_rate": 1.0838490108918452e-07, "loss": 0.0, "num_input_tokens_seen": 119324136, "step": 177115 }, { "epoch": 4.327071067353969, "grad_norm": 0.00039456746890209615, "learning_rate": 1.083462911356271e-07, "loss": 0.0, "num_input_tokens_seen": 119327912, "step": 177120 }, { "epoch": 4.3271932181858155, "grad_norm": 5.931454506935552e-05, "learning_rate": 1.0830768766641507e-07, "loss": 0.0, "num_input_tokens_seen": 119331432, "step": 177125 }, { "epoch": 4.327315369017663, "grad_norm": 1.6832897017593496e-05, "learning_rate": 1.0826909068182954e-07, "loss": 0.0, "num_input_tokens_seen": 119334888, "step": 177130 }, { "epoch": 4.32743751984951, "grad_norm": 1.2521577446023002e-05, "learning_rate": 1.0823050018215097e-07, "loss": 0.0, "num_input_tokens_seen": 119338408, "step": 177135 }, { "epoch": 4.3275596706813575, "grad_norm": 7.616833318024874e-05, "learning_rate": 1.0819191616766011e-07, "loss": 0.0, "num_input_tokens_seen": 119342184, "step": 177140 }, { "epoch": 4.327681821513204, "grad_norm": 0.0011563415173441172, "learning_rate": 1.0815333863863763e-07, "loss": 0.0, "num_input_tokens_seen": 119345768, "step": 177145 }, { "epoch": 4.327803972345052, "grad_norm": 7.111600280040875e-05, "learning_rate": 1.0811476759536364e-07, "loss": 0.0, "num_input_tokens_seen": 119348968, "step": 177150 }, { "epoch": 4.327926123176899, "grad_norm": 3.4444997254468035e-06, "learning_rate": 1.0807620303811915e-07, "loss": 0.0, "num_input_tokens_seen": 119351912, "step": 177155 }, { "epoch": 4.328048274008746, "grad_norm": 0.0074299597181379795, "learning_rate": 1.0803764496718426e-07, "loss": 0.0, "num_input_tokens_seen": 119355176, "step": 177160 }, { "epoch": 4.328170424840593, "grad_norm": 9.750135177455377e-06, "learning_rate": 1.0799909338283952e-07, "loss": 0.0, "num_input_tokens_seen": 119358248, "step": 177165 }, { "epoch": 4.328292575672441, "grad_norm": 0.0002162015880458057, "learning_rate": 1.0796054828536549e-07, "loss": 0.0, "num_input_tokens_seen": 119361576, "step": 177170 }, { "epoch": 4.328414726504287, "grad_norm": 7.276174437720329e-05, "learning_rate": 1.0792200967504206e-07, "loss": 0.0, "num_input_tokens_seen": 119364648, "step": 177175 }, { "epoch": 4.328536877336135, "grad_norm": 0.00020344149379525334, "learning_rate": 1.0788347755214999e-07, "loss": 0.0, "num_input_tokens_seen": 119368040, "step": 177180 }, { "epoch": 4.328659028167982, "grad_norm": 0.0005671592662110925, "learning_rate": 1.0784495191696897e-07, "loss": 0.0, "num_input_tokens_seen": 119371624, "step": 177185 }, { "epoch": 4.328781178999829, "grad_norm": 0.00015273607277777046, "learning_rate": 1.0780643276977941e-07, "loss": 0.0, "num_input_tokens_seen": 119374888, "step": 177190 }, { "epoch": 4.328903329831676, "grad_norm": 1.7507405573269352e-05, "learning_rate": 1.0776792011086166e-07, "loss": 0.0, "num_input_tokens_seen": 119378088, "step": 177195 }, { "epoch": 4.329025480663523, "grad_norm": 0.00021398518583737314, "learning_rate": 1.0772941394049528e-07, "loss": 0.0, "num_input_tokens_seen": 119381480, "step": 177200 }, { "epoch": 4.3291476314953705, "grad_norm": 4.134556002099998e-05, "learning_rate": 1.0769091425896093e-07, "loss": 0.0002, "num_input_tokens_seen": 119384744, "step": 177205 }, { "epoch": 4.329269782327217, "grad_norm": 0.000264695001533255, "learning_rate": 1.0765242106653805e-07, "loss": 0.0002, "num_input_tokens_seen": 119387880, "step": 177210 }, { "epoch": 4.329391933159065, "grad_norm": 2.9532797270803712e-05, "learning_rate": 1.0761393436350685e-07, "loss": 0.0, "num_input_tokens_seen": 119391144, "step": 177215 }, { "epoch": 4.329514083990912, "grad_norm": 4.648505637305789e-05, "learning_rate": 1.0757545415014702e-07, "loss": 0.0, "num_input_tokens_seen": 119394216, "step": 177220 }, { "epoch": 4.329636234822759, "grad_norm": 0.0005457483348436654, "learning_rate": 1.0753698042673853e-07, "loss": 0.0, "num_input_tokens_seen": 119397480, "step": 177225 }, { "epoch": 4.329758385654606, "grad_norm": 7.230892515508458e-05, "learning_rate": 1.074985131935614e-07, "loss": 0.0, "num_input_tokens_seen": 119400680, "step": 177230 }, { "epoch": 4.329880536486454, "grad_norm": 7.861643098294735e-06, "learning_rate": 1.0746005245089484e-07, "loss": 0.0, "num_input_tokens_seen": 119404840, "step": 177235 }, { "epoch": 4.3300026873183, "grad_norm": 0.00010462554928380996, "learning_rate": 1.0742159819901908e-07, "loss": 0.0, "num_input_tokens_seen": 119407912, "step": 177240 }, { "epoch": 4.330124838150148, "grad_norm": 0.0006451192311942577, "learning_rate": 1.0738315043821356e-07, "loss": 0.0, "num_input_tokens_seen": 119411176, "step": 177245 }, { "epoch": 4.330246988981995, "grad_norm": 0.000196227862033993, "learning_rate": 1.0734470916875771e-07, "loss": 0.0, "num_input_tokens_seen": 119414184, "step": 177250 }, { "epoch": 4.330369139813842, "grad_norm": 0.0009764119167812169, "learning_rate": 1.0730627439093131e-07, "loss": 0.0, "num_input_tokens_seen": 119417512, "step": 177255 }, { "epoch": 4.330491290645689, "grad_norm": 7.502334483433515e-05, "learning_rate": 1.0726784610501372e-07, "loss": 0.0, "num_input_tokens_seen": 119420392, "step": 177260 }, { "epoch": 4.330613441477537, "grad_norm": 0.0002198718866566196, "learning_rate": 1.0722942431128457e-07, "loss": 0.0, "num_input_tokens_seen": 119423848, "step": 177265 }, { "epoch": 4.3307355923093835, "grad_norm": 0.000420448457589373, "learning_rate": 1.0719100901002298e-07, "loss": 0.0, "num_input_tokens_seen": 119427240, "step": 177270 }, { "epoch": 4.330857743141231, "grad_norm": 1.2290911399759352e-05, "learning_rate": 1.0715260020150874e-07, "loss": 0.0, "num_input_tokens_seen": 119430248, "step": 177275 }, { "epoch": 4.330979893973078, "grad_norm": 0.00271340599283576, "learning_rate": 1.0711419788602072e-07, "loss": 0.0, "num_input_tokens_seen": 119433448, "step": 177280 }, { "epoch": 4.3311020448049256, "grad_norm": 3.577460302039981e-05, "learning_rate": 1.0707580206383837e-07, "loss": 0.0, "num_input_tokens_seen": 119436840, "step": 177285 }, { "epoch": 4.331224195636772, "grad_norm": 9.350366599392146e-05, "learning_rate": 1.0703741273524125e-07, "loss": 0.0, "num_input_tokens_seen": 119440232, "step": 177290 }, { "epoch": 4.331346346468619, "grad_norm": 3.744398782146163e-05, "learning_rate": 1.0699902990050791e-07, "loss": 0.0, "num_input_tokens_seen": 119443880, "step": 177295 }, { "epoch": 4.331468497300467, "grad_norm": 0.00029137422097846866, "learning_rate": 1.0696065355991812e-07, "loss": 0.0, "num_input_tokens_seen": 119447528, "step": 177300 }, { "epoch": 4.331590648132313, "grad_norm": 0.00011451731552369893, "learning_rate": 1.0692228371375045e-07, "loss": 0.0, "num_input_tokens_seen": 119451176, "step": 177305 }, { "epoch": 4.331712798964161, "grad_norm": 8.536599489161745e-05, "learning_rate": 1.0688392036228434e-07, "loss": 0.0, "num_input_tokens_seen": 119454440, "step": 177310 }, { "epoch": 4.331834949796008, "grad_norm": 0.029383648186922073, "learning_rate": 1.0684556350579832e-07, "loss": 0.0, "num_input_tokens_seen": 119458152, "step": 177315 }, { "epoch": 4.331957100627855, "grad_norm": 7.236431702040136e-05, "learning_rate": 1.0680721314457164e-07, "loss": 0.0, "num_input_tokens_seen": 119462120, "step": 177320 }, { "epoch": 4.332079251459702, "grad_norm": 0.005610376596450806, "learning_rate": 1.0676886927888329e-07, "loss": 0.0, "num_input_tokens_seen": 119465320, "step": 177325 }, { "epoch": 4.33220140229155, "grad_norm": 0.020806120708584785, "learning_rate": 1.0673053190901183e-07, "loss": 0.0, "num_input_tokens_seen": 119468776, "step": 177330 }, { "epoch": 4.332323553123397, "grad_norm": 0.0010466218227520585, "learning_rate": 1.0669220103523647e-07, "loss": 0.0, "num_input_tokens_seen": 119472360, "step": 177335 }, { "epoch": 4.332445703955244, "grad_norm": 0.000505940755829215, "learning_rate": 1.0665387665783532e-07, "loss": 0.0, "num_input_tokens_seen": 119475496, "step": 177340 }, { "epoch": 4.332567854787091, "grad_norm": 0.0004687255423050374, "learning_rate": 1.0661555877708783e-07, "loss": 0.0, "num_input_tokens_seen": 119478952, "step": 177345 }, { "epoch": 4.332690005618939, "grad_norm": 0.002022866625338793, "learning_rate": 1.0657724739327223e-07, "loss": 0.0, "num_input_tokens_seen": 119482152, "step": 177350 }, { "epoch": 4.332812156450785, "grad_norm": 0.00021063839085400105, "learning_rate": 1.0653894250666695e-07, "loss": 0.0, "num_input_tokens_seen": 119486696, "step": 177355 }, { "epoch": 4.332934307282633, "grad_norm": 0.0020078979432582855, "learning_rate": 1.06500644117551e-07, "loss": 0.0, "num_input_tokens_seen": 119490216, "step": 177360 }, { "epoch": 4.33305645811448, "grad_norm": 9.417716682946775e-06, "learning_rate": 1.0646235222620247e-07, "loss": 0.0002, "num_input_tokens_seen": 119493800, "step": 177365 }, { "epoch": 4.333178608946327, "grad_norm": 1.2056245395797305e-05, "learning_rate": 1.0642406683290028e-07, "loss": 0.0, "num_input_tokens_seen": 119497192, "step": 177370 }, { "epoch": 4.333300759778174, "grad_norm": 0.001364095718599856, "learning_rate": 1.0638578793792253e-07, "loss": 0.0929, "num_input_tokens_seen": 119500584, "step": 177375 }, { "epoch": 4.333422910610022, "grad_norm": 5.21705842402298e-05, "learning_rate": 1.0634751554154753e-07, "loss": 0.0, "num_input_tokens_seen": 119503656, "step": 177380 }, { "epoch": 4.3335450614418685, "grad_norm": 4.9862387641042005e-06, "learning_rate": 1.0630924964405396e-07, "loss": 0.0, "num_input_tokens_seen": 119507240, "step": 177385 }, { "epoch": 4.333667212273715, "grad_norm": 0.0001016928072203882, "learning_rate": 1.0627099024571984e-07, "loss": 0.0, "num_input_tokens_seen": 119510568, "step": 177390 }, { "epoch": 4.333789363105563, "grad_norm": 0.0002153657260350883, "learning_rate": 1.0623273734682347e-07, "loss": 0.0, "num_input_tokens_seen": 119513768, "step": 177395 }, { "epoch": 4.33391151393741, "grad_norm": 0.0023736932780593634, "learning_rate": 1.0619449094764299e-07, "loss": 0.0, "num_input_tokens_seen": 119517224, "step": 177400 }, { "epoch": 4.334033664769257, "grad_norm": 0.000669497880153358, "learning_rate": 1.0615625104845672e-07, "loss": 0.0, "num_input_tokens_seen": 119520808, "step": 177405 }, { "epoch": 4.334155815601104, "grad_norm": 0.0003056761051993817, "learning_rate": 1.0611801764954242e-07, "loss": 0.0, "num_input_tokens_seen": 119524072, "step": 177410 }, { "epoch": 4.334277966432952, "grad_norm": 0.0005257625016383827, "learning_rate": 1.0607979075117824e-07, "loss": 0.0354, "num_input_tokens_seen": 119527336, "step": 177415 }, { "epoch": 4.334400117264798, "grad_norm": 6.078456499380991e-05, "learning_rate": 1.0604157035364259e-07, "loss": 0.0, "num_input_tokens_seen": 119530856, "step": 177420 }, { "epoch": 4.334522268096646, "grad_norm": 0.0018427494214847684, "learning_rate": 1.0600335645721281e-07, "loss": 0.0, "num_input_tokens_seen": 119533864, "step": 177425 }, { "epoch": 4.334644418928493, "grad_norm": 0.0004773819528054446, "learning_rate": 1.0596514906216725e-07, "loss": 0.0, "num_input_tokens_seen": 119536808, "step": 177430 }, { "epoch": 4.33476656976034, "grad_norm": 0.0014973797369748354, "learning_rate": 1.0592694816878345e-07, "loss": 0.0, "num_input_tokens_seen": 119539880, "step": 177435 }, { "epoch": 4.334888720592187, "grad_norm": 0.00011810551950475201, "learning_rate": 1.0588875377733952e-07, "loss": 0.0, "num_input_tokens_seen": 119543272, "step": 177440 }, { "epoch": 4.335010871424035, "grad_norm": 0.00020126704475842416, "learning_rate": 1.0585056588811292e-07, "loss": 0.0, "num_input_tokens_seen": 119546792, "step": 177445 }, { "epoch": 4.3351330222558815, "grad_norm": 0.0002205111231887713, "learning_rate": 1.0581238450138163e-07, "loss": 0.0, "num_input_tokens_seen": 119549800, "step": 177450 }, { "epoch": 4.335255173087729, "grad_norm": 0.7495196461677551, "learning_rate": 1.0577420961742301e-07, "loss": 0.0004, "num_input_tokens_seen": 119553192, "step": 177455 }, { "epoch": 4.335377323919576, "grad_norm": 0.0011778388870880008, "learning_rate": 1.0573604123651503e-07, "loss": 0.0, "num_input_tokens_seen": 119556712, "step": 177460 }, { "epoch": 4.335499474751423, "grad_norm": 0.00013111214502714574, "learning_rate": 1.0569787935893514e-07, "loss": 0.0, "num_input_tokens_seen": 119559912, "step": 177465 }, { "epoch": 4.33562162558327, "grad_norm": 0.001341344672255218, "learning_rate": 1.0565972398496059e-07, "loss": 0.0, "num_input_tokens_seen": 119563240, "step": 177470 }, { "epoch": 4.335743776415117, "grad_norm": 0.002520339796319604, "learning_rate": 1.0562157511486902e-07, "loss": 0.003, "num_input_tokens_seen": 119566440, "step": 177475 }, { "epoch": 4.335865927246965, "grad_norm": 0.00025319313863292336, "learning_rate": 1.0558343274893821e-07, "loss": 0.0, "num_input_tokens_seen": 119569768, "step": 177480 }, { "epoch": 4.335988078078811, "grad_norm": 0.018951229751110077, "learning_rate": 1.0554529688744507e-07, "loss": 0.0, "num_input_tokens_seen": 119573416, "step": 177485 }, { "epoch": 4.336110228910659, "grad_norm": 0.001201994833536446, "learning_rate": 1.0550716753066724e-07, "loss": 0.0, "num_input_tokens_seen": 119576872, "step": 177490 }, { "epoch": 4.336232379742506, "grad_norm": 6.68705761199817e-05, "learning_rate": 1.0546904467888162e-07, "loss": 0.0, "num_input_tokens_seen": 119580328, "step": 177495 }, { "epoch": 4.336354530574353, "grad_norm": 0.00011509162141010165, "learning_rate": 1.0543092833236578e-07, "loss": 0.0, "num_input_tokens_seen": 119583464, "step": 177500 }, { "epoch": 4.3364766814062, "grad_norm": 0.0015443543670699, "learning_rate": 1.0539281849139703e-07, "loss": 0.0, "num_input_tokens_seen": 119586664, "step": 177505 }, { "epoch": 4.336598832238048, "grad_norm": 0.0019911492709070444, "learning_rate": 1.0535471515625216e-07, "loss": 0.0, "num_input_tokens_seen": 119590184, "step": 177510 }, { "epoch": 4.3367209830698945, "grad_norm": 2.6939822419080883e-05, "learning_rate": 1.0531661832720862e-07, "loss": 0.0, "num_input_tokens_seen": 119593192, "step": 177515 }, { "epoch": 4.336843133901742, "grad_norm": 2.7527998099685647e-05, "learning_rate": 1.0527852800454295e-07, "loss": 0.0, "num_input_tokens_seen": 119596456, "step": 177520 }, { "epoch": 4.336965284733589, "grad_norm": 3.5799188481178135e-05, "learning_rate": 1.0524044418853284e-07, "loss": 0.0, "num_input_tokens_seen": 119600424, "step": 177525 }, { "epoch": 4.3370874355654365, "grad_norm": 2.5521441784803756e-05, "learning_rate": 1.0520236687945461e-07, "loss": 0.0, "num_input_tokens_seen": 119603752, "step": 177530 }, { "epoch": 4.337209586397283, "grad_norm": 0.0010590673191472888, "learning_rate": 1.0516429607758548e-07, "loss": 0.0, "num_input_tokens_seen": 119607016, "step": 177535 }, { "epoch": 4.337331737229131, "grad_norm": 0.0003094057319685817, "learning_rate": 1.0512623178320235e-07, "loss": 0.0, "num_input_tokens_seen": 119610408, "step": 177540 }, { "epoch": 4.337453888060978, "grad_norm": 0.0007048142142593861, "learning_rate": 1.0508817399658187e-07, "loss": 0.0, "num_input_tokens_seen": 119613800, "step": 177545 }, { "epoch": 4.337576038892825, "grad_norm": 5.7931225455831736e-05, "learning_rate": 1.0505012271800107e-07, "loss": 0.0, "num_input_tokens_seen": 119616936, "step": 177550 }, { "epoch": 4.337698189724672, "grad_norm": 0.0013491696445271373, "learning_rate": 1.0501207794773647e-07, "loss": 0.0, "num_input_tokens_seen": 119620520, "step": 177555 }, { "epoch": 4.337820340556519, "grad_norm": 3.8408492400776595e-05, "learning_rate": 1.0497403968606455e-07, "loss": 0.0, "num_input_tokens_seen": 119623720, "step": 177560 }, { "epoch": 4.337942491388366, "grad_norm": 9.544839122099802e-05, "learning_rate": 1.049360079332624e-07, "loss": 0.0, "num_input_tokens_seen": 119627816, "step": 177565 }, { "epoch": 4.338064642220213, "grad_norm": 0.0001397064042976126, "learning_rate": 1.0489798268960615e-07, "loss": 0.0, "num_input_tokens_seen": 119631016, "step": 177570 }, { "epoch": 4.338186793052061, "grad_norm": 0.013521953485906124, "learning_rate": 1.0485996395537267e-07, "loss": 0.0, "num_input_tokens_seen": 119634472, "step": 177575 }, { "epoch": 4.3383089438839075, "grad_norm": 0.005240561906248331, "learning_rate": 1.0482195173083807e-07, "loss": 0.0, "num_input_tokens_seen": 119637608, "step": 177580 }, { "epoch": 4.338431094715755, "grad_norm": 0.0003331290790811181, "learning_rate": 1.0478394601627927e-07, "loss": 0.0, "num_input_tokens_seen": 119641064, "step": 177585 }, { "epoch": 4.338553245547602, "grad_norm": 1.179240189230768e-05, "learning_rate": 1.0474594681197213e-07, "loss": 0.0, "num_input_tokens_seen": 119644520, "step": 177590 }, { "epoch": 4.3386753963794495, "grad_norm": 0.00010941450454993173, "learning_rate": 1.0470795411819333e-07, "loss": 0.0, "num_input_tokens_seen": 119648104, "step": 177595 }, { "epoch": 4.338797547211296, "grad_norm": 0.00044772823457606137, "learning_rate": 1.0466996793521932e-07, "loss": 0.0, "num_input_tokens_seen": 119651624, "step": 177600 }, { "epoch": 4.338919698043144, "grad_norm": 3.42298299074173e-05, "learning_rate": 1.0463198826332587e-07, "loss": 0.0, "num_input_tokens_seen": 119655080, "step": 177605 }, { "epoch": 4.339041848874991, "grad_norm": 0.00016116289771161973, "learning_rate": 1.0459401510278965e-07, "loss": 0.0, "num_input_tokens_seen": 119658408, "step": 177610 }, { "epoch": 4.339163999706838, "grad_norm": 5.474098361446522e-05, "learning_rate": 1.0455604845388633e-07, "loss": 0.0, "num_input_tokens_seen": 119661800, "step": 177615 }, { "epoch": 4.339286150538685, "grad_norm": 4.148006337345578e-05, "learning_rate": 1.0451808831689247e-07, "loss": 0.0, "num_input_tokens_seen": 119664872, "step": 177620 }, { "epoch": 4.339408301370533, "grad_norm": 0.021591929718852043, "learning_rate": 1.0448013469208384e-07, "loss": 0.0, "num_input_tokens_seen": 119667816, "step": 177625 }, { "epoch": 4.339530452202379, "grad_norm": 0.0004449640109669417, "learning_rate": 1.0444218757973643e-07, "loss": 0.0, "num_input_tokens_seen": 119671720, "step": 177630 }, { "epoch": 4.339652603034227, "grad_norm": 0.0003688403812702745, "learning_rate": 1.0440424698012651e-07, "loss": 0.0, "num_input_tokens_seen": 119675112, "step": 177635 }, { "epoch": 4.339774753866074, "grad_norm": 0.23644912242889404, "learning_rate": 1.0436631289352959e-07, "loss": 0.0001, "num_input_tokens_seen": 119678312, "step": 177640 }, { "epoch": 4.339896904697921, "grad_norm": 8.655286364955828e-05, "learning_rate": 1.0432838532022204e-07, "loss": 0.0, "num_input_tokens_seen": 119681832, "step": 177645 }, { "epoch": 4.340019055529768, "grad_norm": 0.0001912415900733322, "learning_rate": 1.0429046426047905e-07, "loss": 0.0, "num_input_tokens_seen": 119685544, "step": 177650 }, { "epoch": 4.340141206361615, "grad_norm": 0.0002560184511821717, "learning_rate": 1.0425254971457697e-07, "loss": 0.0, "num_input_tokens_seen": 119688616, "step": 177655 }, { "epoch": 4.340263357193463, "grad_norm": 0.0001207870664075017, "learning_rate": 1.0421464168279137e-07, "loss": 0.0, "num_input_tokens_seen": 119692072, "step": 177660 }, { "epoch": 4.340385508025309, "grad_norm": 0.02083778753876686, "learning_rate": 1.0417674016539745e-07, "loss": 0.0, "num_input_tokens_seen": 119695272, "step": 177665 }, { "epoch": 4.340507658857157, "grad_norm": 0.15552714467048645, "learning_rate": 1.0413884516267158e-07, "loss": 0.0, "num_input_tokens_seen": 119699048, "step": 177670 }, { "epoch": 4.340629809689004, "grad_norm": 1.4773806469747797e-05, "learning_rate": 1.0410095667488872e-07, "loss": 0.0, "num_input_tokens_seen": 119702504, "step": 177675 }, { "epoch": 4.340751960520851, "grad_norm": 1.469433027523337e-05, "learning_rate": 1.040630747023249e-07, "loss": 0.0, "num_input_tokens_seen": 119706344, "step": 177680 }, { "epoch": 4.340874111352698, "grad_norm": 0.0002408750297036022, "learning_rate": 1.0402519924525511e-07, "loss": 0.0, "num_input_tokens_seen": 119710376, "step": 177685 }, { "epoch": 4.340996262184546, "grad_norm": 4.3600357457762584e-05, "learning_rate": 1.0398733030395512e-07, "loss": 0.0, "num_input_tokens_seen": 119713896, "step": 177690 }, { "epoch": 4.3411184130163925, "grad_norm": 7.78894464019686e-05, "learning_rate": 1.0394946787870052e-07, "loss": 0.0, "num_input_tokens_seen": 119717096, "step": 177695 }, { "epoch": 4.34124056384824, "grad_norm": 9.142693306785077e-05, "learning_rate": 1.0391161196976605e-07, "loss": 0.0, "num_input_tokens_seen": 119720104, "step": 177700 }, { "epoch": 4.341362714680087, "grad_norm": 0.004934506490826607, "learning_rate": 1.0387376257742763e-07, "loss": 0.0, "num_input_tokens_seen": 119723688, "step": 177705 }, { "epoch": 4.3414848655119345, "grad_norm": 0.0002775702450890094, "learning_rate": 1.0383591970196004e-07, "loss": 0.0, "num_input_tokens_seen": 119726888, "step": 177710 }, { "epoch": 4.341607016343781, "grad_norm": 0.001972169615328312, "learning_rate": 1.0379808334363893e-07, "loss": 0.0, "num_input_tokens_seen": 119730408, "step": 177715 }, { "epoch": 4.341729167175629, "grad_norm": 2.0951976694050245e-05, "learning_rate": 1.0376025350273898e-07, "loss": 0.0, "num_input_tokens_seen": 119733608, "step": 177720 }, { "epoch": 4.341851318007476, "grad_norm": 9.501501335762441e-05, "learning_rate": 1.0372243017953541e-07, "loss": 0.0, "num_input_tokens_seen": 119736872, "step": 177725 }, { "epoch": 4.341973468839322, "grad_norm": 3.5114768252242357e-05, "learning_rate": 1.0368461337430378e-07, "loss": 0.0, "num_input_tokens_seen": 119739944, "step": 177730 }, { "epoch": 4.34209561967117, "grad_norm": 2.0370420315884985e-05, "learning_rate": 1.0364680308731843e-07, "loss": 0.0, "num_input_tokens_seen": 119743848, "step": 177735 }, { "epoch": 4.342217770503017, "grad_norm": 0.00045057680108584464, "learning_rate": 1.036089993188548e-07, "loss": 0.0, "num_input_tokens_seen": 119747752, "step": 177740 }, { "epoch": 4.342339921334864, "grad_norm": 0.00036081598955206573, "learning_rate": 1.0357120206918746e-07, "loss": 0.0, "num_input_tokens_seen": 119750824, "step": 177745 }, { "epoch": 4.342462072166711, "grad_norm": 0.0006354292272590101, "learning_rate": 1.0353341133859161e-07, "loss": 0.0, "num_input_tokens_seen": 119754280, "step": 177750 }, { "epoch": 4.342584222998559, "grad_norm": 0.00011101896961918101, "learning_rate": 1.0349562712734173e-07, "loss": 0.0, "num_input_tokens_seen": 119757544, "step": 177755 }, { "epoch": 4.3427063738304055, "grad_norm": 3.6241308407625183e-05, "learning_rate": 1.03457849435713e-07, "loss": 0.0, "num_input_tokens_seen": 119760552, "step": 177760 }, { "epoch": 4.342828524662253, "grad_norm": 6.376237433869392e-05, "learning_rate": 1.034200782639797e-07, "loss": 0.0, "num_input_tokens_seen": 119763944, "step": 177765 }, { "epoch": 4.3429506754941, "grad_norm": 0.00014923287380952388, "learning_rate": 1.03382313612417e-07, "loss": 0.0, "num_input_tokens_seen": 119767272, "step": 177770 }, { "epoch": 4.3430728263259475, "grad_norm": 6.0516202211147174e-05, "learning_rate": 1.0334455548129928e-07, "loss": 0.0, "num_input_tokens_seen": 119771560, "step": 177775 }, { "epoch": 4.343194977157794, "grad_norm": 0.001674599014222622, "learning_rate": 1.0330680387090085e-07, "loss": 0.0, "num_input_tokens_seen": 119774952, "step": 177780 }, { "epoch": 4.343317127989642, "grad_norm": 0.00041652386425994337, "learning_rate": 1.032690587814965e-07, "loss": 0.0, "num_input_tokens_seen": 119778280, "step": 177785 }, { "epoch": 4.343439278821489, "grad_norm": 2.3158287149271928e-05, "learning_rate": 1.0323132021336101e-07, "loss": 0.0, "num_input_tokens_seen": 119781544, "step": 177790 }, { "epoch": 4.343561429653336, "grad_norm": 4.255375461070798e-05, "learning_rate": 1.0319358816676838e-07, "loss": 0.0, "num_input_tokens_seen": 119784680, "step": 177795 }, { "epoch": 4.343683580485183, "grad_norm": 0.0006458880961872637, "learning_rate": 1.0315586264199338e-07, "loss": 0.0, "num_input_tokens_seen": 119787688, "step": 177800 }, { "epoch": 4.343805731317031, "grad_norm": 0.0010130023583769798, "learning_rate": 1.0311814363930994e-07, "loss": 0.0, "num_input_tokens_seen": 119791208, "step": 177805 }, { "epoch": 4.343927882148877, "grad_norm": 0.0003084522904828191, "learning_rate": 1.0308043115899257e-07, "loss": 0.0, "num_input_tokens_seen": 119794344, "step": 177810 }, { "epoch": 4.344050032980725, "grad_norm": 7.93716826592572e-05, "learning_rate": 1.0304272520131586e-07, "loss": 0.0, "num_input_tokens_seen": 119797736, "step": 177815 }, { "epoch": 4.344172183812572, "grad_norm": 5.822316961712204e-05, "learning_rate": 1.0300502576655334e-07, "loss": 0.0, "num_input_tokens_seen": 119801256, "step": 177820 }, { "epoch": 4.3442943346444185, "grad_norm": 0.0020748593378812075, "learning_rate": 1.0296733285497982e-07, "loss": 0.0, "num_input_tokens_seen": 119804520, "step": 177825 }, { "epoch": 4.344416485476266, "grad_norm": 0.00011760352936107665, "learning_rate": 1.0292964646686897e-07, "loss": 0.0, "num_input_tokens_seen": 119807848, "step": 177830 }, { "epoch": 4.344538636308113, "grad_norm": 0.0002966334577649832, "learning_rate": 1.0289196660249521e-07, "loss": 0.0, "num_input_tokens_seen": 119811048, "step": 177835 }, { "epoch": 4.3446607871399605, "grad_norm": 0.0029755146242678165, "learning_rate": 1.0285429326213213e-07, "loss": 0.0, "num_input_tokens_seen": 119814248, "step": 177840 }, { "epoch": 4.344782937971807, "grad_norm": 0.005637416150420904, "learning_rate": 1.0281662644605394e-07, "loss": 0.0, "num_input_tokens_seen": 119817512, "step": 177845 }, { "epoch": 4.344905088803655, "grad_norm": 9.653414053900633e-06, "learning_rate": 1.0277896615453473e-07, "loss": 0.0, "num_input_tokens_seen": 119820648, "step": 177850 }, { "epoch": 4.345027239635502, "grad_norm": 0.011159008368849754, "learning_rate": 1.02741312387848e-07, "loss": 0.0, "num_input_tokens_seen": 119823976, "step": 177855 }, { "epoch": 4.345149390467349, "grad_norm": 0.002734529320150614, "learning_rate": 1.0270366514626793e-07, "loss": 0.0, "num_input_tokens_seen": 119827944, "step": 177860 }, { "epoch": 4.345271541299196, "grad_norm": 1.2335614883340895e-05, "learning_rate": 1.0266602443006822e-07, "loss": 0.0, "num_input_tokens_seen": 119831336, "step": 177865 }, { "epoch": 4.345393692131044, "grad_norm": 0.00011353209265507758, "learning_rate": 1.0262839023952241e-07, "loss": 0.0, "num_input_tokens_seen": 119835112, "step": 177870 }, { "epoch": 4.34551584296289, "grad_norm": 0.0011685584904626012, "learning_rate": 1.025907625749044e-07, "loss": 0.0, "num_input_tokens_seen": 119838568, "step": 177875 }, { "epoch": 4.345637993794738, "grad_norm": 6.6461670940043405e-06, "learning_rate": 1.0255314143648753e-07, "loss": 0.0, "num_input_tokens_seen": 119841832, "step": 177880 }, { "epoch": 4.345760144626585, "grad_norm": 0.0003076045250054449, "learning_rate": 1.025155268245459e-07, "loss": 0.0, "num_input_tokens_seen": 119844840, "step": 177885 }, { "epoch": 4.345882295458432, "grad_norm": 0.0029748149681836367, "learning_rate": 1.0247791873935241e-07, "loss": 0.0, "num_input_tokens_seen": 119849192, "step": 177890 }, { "epoch": 4.346004446290279, "grad_norm": 2.7037634936277755e-05, "learning_rate": 1.0244031718118118e-07, "loss": 0.0, "num_input_tokens_seen": 119852456, "step": 177895 }, { "epoch": 4.346126597122127, "grad_norm": 5.461079490487464e-05, "learning_rate": 1.0240272215030521e-07, "loss": 0.0816, "num_input_tokens_seen": 119855400, "step": 177900 }, { "epoch": 4.3462487479539735, "grad_norm": 0.0004930661525577307, "learning_rate": 1.0236513364699805e-07, "loss": 0.0, "num_input_tokens_seen": 119858472, "step": 177905 }, { "epoch": 4.346370898785821, "grad_norm": 0.00023198813141789287, "learning_rate": 1.0232755167153328e-07, "loss": 0.0, "num_input_tokens_seen": 119861672, "step": 177910 }, { "epoch": 4.346493049617668, "grad_norm": 0.0025924292858690023, "learning_rate": 1.0228997622418378e-07, "loss": 0.0, "num_input_tokens_seen": 119865384, "step": 177915 }, { "epoch": 4.346615200449515, "grad_norm": 0.006062481552362442, "learning_rate": 1.0225240730522322e-07, "loss": 0.0213, "num_input_tokens_seen": 119868328, "step": 177920 }, { "epoch": 4.346737351281362, "grad_norm": 0.00014382071094587445, "learning_rate": 1.0221484491492438e-07, "loss": 0.0, "num_input_tokens_seen": 119872040, "step": 177925 }, { "epoch": 4.346859502113209, "grad_norm": 2.9611932404804975e-05, "learning_rate": 1.0217728905356093e-07, "loss": 0.0, "num_input_tokens_seen": 119875048, "step": 177930 }, { "epoch": 4.346981652945057, "grad_norm": 0.0008013474871404469, "learning_rate": 1.0213973972140555e-07, "loss": 0.0, "num_input_tokens_seen": 119878248, "step": 177935 }, { "epoch": 4.347103803776903, "grad_norm": 0.0003407940675970167, "learning_rate": 1.0210219691873145e-07, "loss": 0.0, "num_input_tokens_seen": 119881704, "step": 177940 }, { "epoch": 4.347225954608751, "grad_norm": 0.0001222924911417067, "learning_rate": 1.0206466064581177e-07, "loss": 0.0, "num_input_tokens_seen": 119885288, "step": 177945 }, { "epoch": 4.347348105440598, "grad_norm": 4.897462076769443e-06, "learning_rate": 1.0202713090291937e-07, "loss": 0.0, "num_input_tokens_seen": 119888744, "step": 177950 }, { "epoch": 4.347470256272445, "grad_norm": 0.0013736019609495997, "learning_rate": 1.0198960769032728e-07, "loss": 0.0, "num_input_tokens_seen": 119892072, "step": 177955 }, { "epoch": 4.347592407104292, "grad_norm": 0.00012519631127361208, "learning_rate": 1.0195209100830815e-07, "loss": 0.0001, "num_input_tokens_seen": 119895080, "step": 177960 }, { "epoch": 4.34771455793614, "grad_norm": 0.00036417320370674133, "learning_rate": 1.0191458085713511e-07, "loss": 0.0, "num_input_tokens_seen": 119898408, "step": 177965 }, { "epoch": 4.347836708767987, "grad_norm": 0.0004924045060761273, "learning_rate": 1.0187707723708084e-07, "loss": 0.0, "num_input_tokens_seen": 119901544, "step": 177970 }, { "epoch": 4.347958859599834, "grad_norm": 2.2906911908648908e-05, "learning_rate": 1.0183958014841776e-07, "loss": 0.0, "num_input_tokens_seen": 119905192, "step": 177975 }, { "epoch": 4.348081010431681, "grad_norm": 0.38317111134529114, "learning_rate": 1.0180208959141912e-07, "loss": 0.0002, "num_input_tokens_seen": 119908456, "step": 177980 }, { "epoch": 4.348203161263529, "grad_norm": 0.000214638261240907, "learning_rate": 1.0176460556635702e-07, "loss": 0.0001, "num_input_tokens_seen": 119911656, "step": 177985 }, { "epoch": 4.348325312095375, "grad_norm": 0.0001367831719107926, "learning_rate": 1.0172712807350447e-07, "loss": 0.0536, "num_input_tokens_seen": 119915304, "step": 177990 }, { "epoch": 4.348447462927223, "grad_norm": 0.001143554924055934, "learning_rate": 1.0168965711313371e-07, "loss": 0.0, "num_input_tokens_seen": 119918568, "step": 177995 }, { "epoch": 4.34856961375907, "grad_norm": 0.00035983105772174895, "learning_rate": 1.016521926855174e-07, "loss": 0.0, "num_input_tokens_seen": 119921832, "step": 178000 }, { "epoch": 4.348691764590917, "grad_norm": 6.349872273858637e-05, "learning_rate": 1.0161473479092819e-07, "loss": 0.0, "num_input_tokens_seen": 119924904, "step": 178005 }, { "epoch": 4.348813915422764, "grad_norm": 7.528601418016478e-05, "learning_rate": 1.0157728342963801e-07, "loss": 0.0, "num_input_tokens_seen": 119928744, "step": 178010 }, { "epoch": 4.348936066254611, "grad_norm": 9.19345548027195e-05, "learning_rate": 1.0153983860191961e-07, "loss": 0.0, "num_input_tokens_seen": 119932200, "step": 178015 }, { "epoch": 4.3490582170864585, "grad_norm": 0.004338003229349852, "learning_rate": 1.0150240030804502e-07, "loss": 0.0, "num_input_tokens_seen": 119935592, "step": 178020 }, { "epoch": 4.349180367918305, "grad_norm": 7.14230554876849e-05, "learning_rate": 1.014649685482869e-07, "loss": 0.0, "num_input_tokens_seen": 119938728, "step": 178025 }, { "epoch": 4.349302518750153, "grad_norm": 0.00011825282854260877, "learning_rate": 1.0142754332291692e-07, "loss": 0.0, "num_input_tokens_seen": 119942376, "step": 178030 }, { "epoch": 4.349424669582, "grad_norm": 0.0034788944758474827, "learning_rate": 1.0139012463220764e-07, "loss": 0.0, "num_input_tokens_seen": 119945704, "step": 178035 }, { "epoch": 4.349546820413847, "grad_norm": 0.00012874802632723004, "learning_rate": 1.0135271247643117e-07, "loss": 0.0, "num_input_tokens_seen": 119948968, "step": 178040 }, { "epoch": 4.349668971245694, "grad_norm": 5.9843339840881526e-05, "learning_rate": 1.0131530685585931e-07, "loss": 0.0, "num_input_tokens_seen": 119952232, "step": 178045 }, { "epoch": 4.349791122077542, "grad_norm": 0.00011922222620341927, "learning_rate": 1.0127790777076439e-07, "loss": 0.0, "num_input_tokens_seen": 119956136, "step": 178050 }, { "epoch": 4.349913272909388, "grad_norm": 0.00035078474320471287, "learning_rate": 1.0124051522141819e-07, "loss": 0.0, "num_input_tokens_seen": 119959656, "step": 178055 }, { "epoch": 4.350035423741236, "grad_norm": 0.007354686502367258, "learning_rate": 1.0120312920809282e-07, "loss": 0.0, "num_input_tokens_seen": 119962792, "step": 178060 }, { "epoch": 4.350157574573083, "grad_norm": 0.0004929696442559361, "learning_rate": 1.0116574973105984e-07, "loss": 0.0, "num_input_tokens_seen": 119965992, "step": 178065 }, { "epoch": 4.35027972540493, "grad_norm": 7.778491635690443e-06, "learning_rate": 1.011283767905915e-07, "loss": 0.0, "num_input_tokens_seen": 119969448, "step": 178070 }, { "epoch": 4.350401876236777, "grad_norm": 0.0005125360912643373, "learning_rate": 1.0109101038695911e-07, "loss": 0.0, "num_input_tokens_seen": 119972520, "step": 178075 }, { "epoch": 4.350524027068625, "grad_norm": 5.3846797527512535e-05, "learning_rate": 1.0105365052043491e-07, "loss": 0.0, "num_input_tokens_seen": 119976104, "step": 178080 }, { "epoch": 4.3506461779004715, "grad_norm": 9.910299559123814e-05, "learning_rate": 1.0101629719129045e-07, "loss": 0.0, "num_input_tokens_seen": 119978984, "step": 178085 }, { "epoch": 4.350768328732318, "grad_norm": 1.2183804756205063e-05, "learning_rate": 1.0097895039979698e-07, "loss": 0.0, "num_input_tokens_seen": 119982056, "step": 178090 }, { "epoch": 4.350890479564166, "grad_norm": 0.00019707833416759968, "learning_rate": 1.0094161014622637e-07, "loss": 0.0, "num_input_tokens_seen": 119985384, "step": 178095 }, { "epoch": 4.351012630396013, "grad_norm": 5.629958468489349e-05, "learning_rate": 1.0090427643085043e-07, "loss": 0.0, "num_input_tokens_seen": 119988584, "step": 178100 }, { "epoch": 4.35113478122786, "grad_norm": 3.05861140077468e-05, "learning_rate": 1.0086694925394024e-07, "loss": 0.0019, "num_input_tokens_seen": 119991720, "step": 178105 }, { "epoch": 4.351256932059707, "grad_norm": 0.00027523620519787073, "learning_rate": 1.008296286157676e-07, "loss": 0.0, "num_input_tokens_seen": 119995432, "step": 178110 }, { "epoch": 4.351379082891555, "grad_norm": 8.419102960033342e-05, "learning_rate": 1.0079231451660352e-07, "loss": 0.0, "num_input_tokens_seen": 119998952, "step": 178115 }, { "epoch": 4.351501233723401, "grad_norm": 0.0008841792005114257, "learning_rate": 1.007550069567198e-07, "loss": 0.0259, "num_input_tokens_seen": 120001960, "step": 178120 }, { "epoch": 4.351623384555249, "grad_norm": 0.0002478567766956985, "learning_rate": 1.007177059363874e-07, "loss": 0.0, "num_input_tokens_seen": 120005160, "step": 178125 }, { "epoch": 4.351745535387096, "grad_norm": 30.11157989501953, "learning_rate": 1.0068041145587769e-07, "loss": 0.0403, "num_input_tokens_seen": 120009064, "step": 178130 }, { "epoch": 4.351867686218943, "grad_norm": 5.561210491578095e-05, "learning_rate": 1.00643123515462e-07, "loss": 0.0, "num_input_tokens_seen": 120012392, "step": 178135 }, { "epoch": 4.35198983705079, "grad_norm": 9.507766723632812, "learning_rate": 1.0060584211541134e-07, "loss": 0.0134, "num_input_tokens_seen": 120015720, "step": 178140 }, { "epoch": 4.352111987882638, "grad_norm": 5.897363371332176e-05, "learning_rate": 1.0056856725599704e-07, "loss": 0.0, "num_input_tokens_seen": 120019304, "step": 178145 }, { "epoch": 4.3522341387144845, "grad_norm": 0.002634249161928892, "learning_rate": 1.0053129893748991e-07, "loss": 0.0, "num_input_tokens_seen": 120022248, "step": 178150 }, { "epoch": 4.352356289546332, "grad_norm": 9.353709174320102e-05, "learning_rate": 1.0049403716016113e-07, "loss": 0.0336, "num_input_tokens_seen": 120025384, "step": 178155 }, { "epoch": 4.352478440378179, "grad_norm": 0.0002733849687501788, "learning_rate": 1.0045678192428175e-07, "loss": 0.0, "num_input_tokens_seen": 120029032, "step": 178160 }, { "epoch": 4.3526005912100265, "grad_norm": 0.0362461693584919, "learning_rate": 1.0041953323012242e-07, "loss": 0.0, "num_input_tokens_seen": 120032232, "step": 178165 }, { "epoch": 4.352722742041873, "grad_norm": 0.00040877415449358523, "learning_rate": 1.0038229107795448e-07, "loss": 0.0, "num_input_tokens_seen": 120035368, "step": 178170 }, { "epoch": 4.352844892873721, "grad_norm": 0.004401254002004862, "learning_rate": 1.0034505546804839e-07, "loss": 0.0, "num_input_tokens_seen": 120038888, "step": 178175 }, { "epoch": 4.352967043705568, "grad_norm": 0.012119931168854237, "learning_rate": 1.003078264006748e-07, "loss": 0.0, "num_input_tokens_seen": 120042664, "step": 178180 }, { "epoch": 4.353089194537414, "grad_norm": 0.0002318314218427986, "learning_rate": 1.0027060387610497e-07, "loss": 0.0, "num_input_tokens_seen": 120045672, "step": 178185 }, { "epoch": 4.353211345369262, "grad_norm": 0.00025992945302277803, "learning_rate": 1.0023338789460912e-07, "loss": 0.0, "num_input_tokens_seen": 120048744, "step": 178190 }, { "epoch": 4.353333496201109, "grad_norm": 0.0011936393566429615, "learning_rate": 1.0019617845645822e-07, "loss": 0.0, "num_input_tokens_seen": 120052200, "step": 178195 }, { "epoch": 4.353455647032956, "grad_norm": 0.00025509262923151255, "learning_rate": 1.0015897556192266e-07, "loss": 0.0, "num_input_tokens_seen": 120054952, "step": 178200 }, { "epoch": 4.353577797864803, "grad_norm": 4.1563100239727646e-05, "learning_rate": 1.0012177921127307e-07, "loss": 0.0, "num_input_tokens_seen": 120058024, "step": 178205 }, { "epoch": 4.353699948696651, "grad_norm": 0.00012275317567400634, "learning_rate": 1.0008458940477992e-07, "loss": 0.0, "num_input_tokens_seen": 120061288, "step": 178210 }, { "epoch": 4.3538220995284975, "grad_norm": 0.001947722746990621, "learning_rate": 1.0004740614271356e-07, "loss": 0.0, "num_input_tokens_seen": 120064424, "step": 178215 }, { "epoch": 4.353944250360345, "grad_norm": 8.509887993568555e-05, "learning_rate": 1.0001022942534476e-07, "loss": 0.0001, "num_input_tokens_seen": 120067688, "step": 178220 }, { "epoch": 4.354066401192192, "grad_norm": 9.287018656323198e-06, "learning_rate": 9.997305925294342e-08, "loss": 0.0, "num_input_tokens_seen": 120071464, "step": 178225 }, { "epoch": 4.3541885520240395, "grad_norm": 0.00255648884922266, "learning_rate": 9.993589562578031e-08, "loss": 0.0, "num_input_tokens_seen": 120074856, "step": 178230 }, { "epoch": 4.354310702855886, "grad_norm": 0.0007204010616987944, "learning_rate": 9.989873854412523e-08, "loss": 0.0, "num_input_tokens_seen": 120078056, "step": 178235 }, { "epoch": 4.354432853687734, "grad_norm": 0.002324800007045269, "learning_rate": 9.986158800824884e-08, "loss": 0.0, "num_input_tokens_seen": 120081448, "step": 178240 }, { "epoch": 4.354555004519581, "grad_norm": 0.0022073762957006693, "learning_rate": 9.982444401842083e-08, "loss": 0.0, "num_input_tokens_seen": 120084648, "step": 178245 }, { "epoch": 4.354677155351428, "grad_norm": 3.476447818684392e-05, "learning_rate": 9.978730657491164e-08, "loss": 0.0, "num_input_tokens_seen": 120087912, "step": 178250 }, { "epoch": 4.354799306183275, "grad_norm": 0.00024955617845989764, "learning_rate": 9.975017567799148e-08, "loss": 0.0, "num_input_tokens_seen": 120090920, "step": 178255 }, { "epoch": 4.354921457015123, "grad_norm": 0.004400161094963551, "learning_rate": 9.971305132792996e-08, "loss": 0.0, "num_input_tokens_seen": 120094248, "step": 178260 }, { "epoch": 4.355043607846969, "grad_norm": 4.038333281641826e-05, "learning_rate": 9.967593352499747e-08, "loss": 0.0, "num_input_tokens_seen": 120097128, "step": 178265 }, { "epoch": 4.355165758678817, "grad_norm": 0.0003064874326810241, "learning_rate": 9.963882226946363e-08, "loss": 0.0, "num_input_tokens_seen": 120100328, "step": 178270 }, { "epoch": 4.355287909510664, "grad_norm": 1.765959314070642e-05, "learning_rate": 9.960171756159851e-08, "loss": 0.0, "num_input_tokens_seen": 120104232, "step": 178275 }, { "epoch": 4.3554100603425105, "grad_norm": 4.4604366848943755e-05, "learning_rate": 9.956461940167193e-08, "loss": 0.0, "num_input_tokens_seen": 120107880, "step": 178280 }, { "epoch": 4.355532211174358, "grad_norm": 0.0005809883587062359, "learning_rate": 9.952752778995343e-08, "loss": 0.0001, "num_input_tokens_seen": 120111080, "step": 178285 }, { "epoch": 4.355654362006205, "grad_norm": 0.0001975120831048116, "learning_rate": 9.949044272671326e-08, "loss": 0.0313, "num_input_tokens_seen": 120114344, "step": 178290 }, { "epoch": 4.355776512838053, "grad_norm": 0.0005515380762517452, "learning_rate": 9.945336421222039e-08, "loss": 0.0, "num_input_tokens_seen": 120117736, "step": 178295 }, { "epoch": 4.355898663669899, "grad_norm": 0.001049726502969861, "learning_rate": 9.941629224674519e-08, "loss": 0.0, "num_input_tokens_seen": 120121064, "step": 178300 }, { "epoch": 4.356020814501747, "grad_norm": 1.5405454178107902e-05, "learning_rate": 9.937922683055677e-08, "loss": 0.0, "num_input_tokens_seen": 120124776, "step": 178305 }, { "epoch": 4.356142965333594, "grad_norm": 8.174588583642617e-05, "learning_rate": 9.93421679639248e-08, "loss": 0.0, "num_input_tokens_seen": 120128168, "step": 178310 }, { "epoch": 4.356265116165441, "grad_norm": 8.779927156865597e-05, "learning_rate": 9.930511564711907e-08, "loss": 0.0, "num_input_tokens_seen": 120131752, "step": 178315 }, { "epoch": 4.356387266997288, "grad_norm": 0.0010885270312428474, "learning_rate": 9.926806988040858e-08, "loss": 0.0, "num_input_tokens_seen": 120135272, "step": 178320 }, { "epoch": 4.356509417829136, "grad_norm": 9.781593689695e-05, "learning_rate": 9.923103066406314e-08, "loss": 0.0, "num_input_tokens_seen": 120138408, "step": 178325 }, { "epoch": 4.3566315686609824, "grad_norm": 0.003006650833413005, "learning_rate": 9.919399799835171e-08, "loss": 0.0, "num_input_tokens_seen": 120141928, "step": 178330 }, { "epoch": 4.35675371949283, "grad_norm": 0.00023989882902242243, "learning_rate": 9.915697188354399e-08, "loss": 0.0, "num_input_tokens_seen": 120145448, "step": 178335 }, { "epoch": 4.356875870324677, "grad_norm": 0.00019699001859407872, "learning_rate": 9.911995231990899e-08, "loss": 0.0004, "num_input_tokens_seen": 120148904, "step": 178340 }, { "epoch": 4.3569980211565245, "grad_norm": 0.0009989457903429866, "learning_rate": 9.908293930771594e-08, "loss": 0.0, "num_input_tokens_seen": 120152360, "step": 178345 }, { "epoch": 4.357120171988371, "grad_norm": 0.001904218690469861, "learning_rate": 9.904593284723417e-08, "loss": 0.0, "num_input_tokens_seen": 120155752, "step": 178350 }, { "epoch": 4.357242322820218, "grad_norm": 0.000590673356782645, "learning_rate": 9.90089329387327e-08, "loss": 0.0, "num_input_tokens_seen": 120159144, "step": 178355 }, { "epoch": 4.357364473652066, "grad_norm": 0.0008590373327024281, "learning_rate": 9.897193958248063e-08, "loss": 0.0, "num_input_tokens_seen": 120162536, "step": 178360 }, { "epoch": 4.357486624483912, "grad_norm": 3.4686174331000075e-05, "learning_rate": 9.893495277874686e-08, "loss": 0.0, "num_input_tokens_seen": 120165736, "step": 178365 }, { "epoch": 4.35760877531576, "grad_norm": 7.88893667049706e-05, "learning_rate": 9.889797252780064e-08, "loss": 0.0, "num_input_tokens_seen": 120168808, "step": 178370 }, { "epoch": 4.357730926147607, "grad_norm": 0.0003943110932596028, "learning_rate": 9.88609988299105e-08, "loss": 0.0, "num_input_tokens_seen": 120172200, "step": 178375 }, { "epoch": 4.357853076979454, "grad_norm": 4.9077701987698674e-05, "learning_rate": 9.882403168534581e-08, "loss": 0.0, "num_input_tokens_seen": 120175400, "step": 178380 }, { "epoch": 4.357975227811301, "grad_norm": 1.5061576050356962e-05, "learning_rate": 9.878707109437489e-08, "loss": 0.0, "num_input_tokens_seen": 120178856, "step": 178385 }, { "epoch": 4.358097378643149, "grad_norm": 0.00023197197879198939, "learning_rate": 9.875011705726699e-08, "loss": 0.0, "num_input_tokens_seen": 120181736, "step": 178390 }, { "epoch": 4.3582195294749955, "grad_norm": 0.026410503312945366, "learning_rate": 9.871316957429077e-08, "loss": 0.0, "num_input_tokens_seen": 120185000, "step": 178395 }, { "epoch": 4.358341680306843, "grad_norm": 0.0004599327512551099, "learning_rate": 9.867622864571445e-08, "loss": 0.0, "num_input_tokens_seen": 120188072, "step": 178400 }, { "epoch": 4.35846383113869, "grad_norm": 0.006481677293777466, "learning_rate": 9.863929427180706e-08, "loss": 0.0, "num_input_tokens_seen": 120191464, "step": 178405 }, { "epoch": 4.3585859819705375, "grad_norm": 7.190836913650855e-05, "learning_rate": 9.860236645283737e-08, "loss": 0.0, "num_input_tokens_seen": 120194920, "step": 178410 }, { "epoch": 4.358708132802384, "grad_norm": 0.00027438238612376153, "learning_rate": 9.856544518907362e-08, "loss": 0.0536, "num_input_tokens_seen": 120198376, "step": 178415 }, { "epoch": 4.358830283634232, "grad_norm": 6.237047637114301e-05, "learning_rate": 9.852853048078446e-08, "loss": 0.0655, "num_input_tokens_seen": 120201512, "step": 178420 }, { "epoch": 4.358952434466079, "grad_norm": 0.00016815854178275913, "learning_rate": 9.849162232823816e-08, "loss": 0.0, "num_input_tokens_seen": 120205608, "step": 178425 }, { "epoch": 4.359074585297926, "grad_norm": 0.002402889309450984, "learning_rate": 9.845472073170346e-08, "loss": 0.0, "num_input_tokens_seen": 120209128, "step": 178430 }, { "epoch": 4.359196736129773, "grad_norm": 6.714063147228444e-06, "learning_rate": 9.84178256914483e-08, "loss": 0.0, "num_input_tokens_seen": 120212520, "step": 178435 }, { "epoch": 4.359318886961621, "grad_norm": 0.00024982349714264274, "learning_rate": 9.83809372077412e-08, "loss": 0.0, "num_input_tokens_seen": 120215528, "step": 178440 }, { "epoch": 4.359441037793467, "grad_norm": 0.00264274631626904, "learning_rate": 9.834405528085066e-08, "loss": 0.0, "num_input_tokens_seen": 120219112, "step": 178445 }, { "epoch": 4.359563188625314, "grad_norm": 3.579885378712788e-05, "learning_rate": 9.830717991104443e-08, "loss": 0.0, "num_input_tokens_seen": 120222760, "step": 178450 }, { "epoch": 4.359685339457162, "grad_norm": 0.0001969273725990206, "learning_rate": 9.827031109859107e-08, "loss": 0.0, "num_input_tokens_seen": 120225832, "step": 178455 }, { "epoch": 4.3598074902890085, "grad_norm": 0.0016006685327738523, "learning_rate": 9.82334488437585e-08, "loss": 0.0, "num_input_tokens_seen": 120229096, "step": 178460 }, { "epoch": 4.359929641120856, "grad_norm": 7.797037687851116e-05, "learning_rate": 9.819659314681472e-08, "loss": 0.0, "num_input_tokens_seen": 120232552, "step": 178465 }, { "epoch": 4.360051791952703, "grad_norm": 0.0005138301639817655, "learning_rate": 9.815974400802807e-08, "loss": 0.0, "num_input_tokens_seen": 120236584, "step": 178470 }, { "epoch": 4.3601739427845505, "grad_norm": 0.0052763535641133785, "learning_rate": 9.812290142766622e-08, "loss": 0.0, "num_input_tokens_seen": 120239656, "step": 178475 }, { "epoch": 4.360296093616397, "grad_norm": 0.0007992389146238565, "learning_rate": 9.808606540599728e-08, "loss": 0.0, "num_input_tokens_seen": 120243112, "step": 178480 }, { "epoch": 4.360418244448245, "grad_norm": 6.511791980301496e-06, "learning_rate": 9.804923594328907e-08, "loss": 0.0, "num_input_tokens_seen": 120246568, "step": 178485 }, { "epoch": 4.360540395280092, "grad_norm": 4.3418702261988074e-05, "learning_rate": 9.801241303980934e-08, "loss": 0.0011, "num_input_tokens_seen": 120249960, "step": 178490 }, { "epoch": 4.360662546111939, "grad_norm": 0.0003371757920831442, "learning_rate": 9.7975596695826e-08, "loss": 0.0, "num_input_tokens_seen": 120253224, "step": 178495 }, { "epoch": 4.360784696943786, "grad_norm": 0.0005492345080710948, "learning_rate": 9.793878691160662e-08, "loss": 0.0, "num_input_tokens_seen": 120256360, "step": 178500 }, { "epoch": 4.360906847775634, "grad_norm": 0.0005842145183123648, "learning_rate": 9.79019836874192e-08, "loss": 0.0, "num_input_tokens_seen": 120259496, "step": 178505 }, { "epoch": 4.36102899860748, "grad_norm": 8.086705202003941e-05, "learning_rate": 9.786518702353097e-08, "loss": 0.0, "num_input_tokens_seen": 120263080, "step": 178510 }, { "epoch": 4.361151149439328, "grad_norm": 4.444857040653005e-05, "learning_rate": 9.782839692020994e-08, "loss": 0.0, "num_input_tokens_seen": 120267368, "step": 178515 }, { "epoch": 4.361273300271175, "grad_norm": 0.09220793098211288, "learning_rate": 9.779161337772323e-08, "loss": 0.0001, "num_input_tokens_seen": 120271144, "step": 178520 }, { "epoch": 4.361395451103022, "grad_norm": 4.644213277060771e-06, "learning_rate": 9.775483639633863e-08, "loss": 0.0, "num_input_tokens_seen": 120274024, "step": 178525 }, { "epoch": 4.361517601934869, "grad_norm": 1.780526690708939e-05, "learning_rate": 9.771806597632382e-08, "loss": 0.0, "num_input_tokens_seen": 120277480, "step": 178530 }, { "epoch": 4.361639752766717, "grad_norm": 5.388813224271871e-05, "learning_rate": 9.768130211794556e-08, "loss": 0.0, "num_input_tokens_seen": 120280424, "step": 178535 }, { "epoch": 4.3617619035985635, "grad_norm": 1.0598834705888294e-05, "learning_rate": 9.764454482147189e-08, "loss": 0.0, "num_input_tokens_seen": 120283304, "step": 178540 }, { "epoch": 4.36188405443041, "grad_norm": 0.0008127638720907271, "learning_rate": 9.760779408716946e-08, "loss": 0.0, "num_input_tokens_seen": 120287080, "step": 178545 }, { "epoch": 4.362006205262258, "grad_norm": 0.0010838373564183712, "learning_rate": 9.757104991530618e-08, "loss": 0.0, "num_input_tokens_seen": 120290728, "step": 178550 }, { "epoch": 4.362128356094105, "grad_norm": 0.0002147614723071456, "learning_rate": 9.753431230614873e-08, "loss": 0.0001, "num_input_tokens_seen": 120294504, "step": 178555 }, { "epoch": 4.362250506925952, "grad_norm": 0.0016481346683576703, "learning_rate": 9.749758125996444e-08, "loss": 0.0435, "num_input_tokens_seen": 120297832, "step": 178560 }, { "epoch": 4.362372657757799, "grad_norm": 0.0002112577494699508, "learning_rate": 9.746085677702065e-08, "loss": 0.0, "num_input_tokens_seen": 120301096, "step": 178565 }, { "epoch": 4.362494808589647, "grad_norm": 0.000503468094393611, "learning_rate": 9.742413885758416e-08, "loss": 0.0, "num_input_tokens_seen": 120304424, "step": 178570 }, { "epoch": 4.362616959421493, "grad_norm": 0.0016552689485251904, "learning_rate": 9.73874275019223e-08, "loss": 0.0, "num_input_tokens_seen": 120307688, "step": 178575 }, { "epoch": 4.362739110253341, "grad_norm": 0.00026949195307679474, "learning_rate": 9.735072271030165e-08, "loss": 0.0, "num_input_tokens_seen": 120311144, "step": 178580 }, { "epoch": 4.362861261085188, "grad_norm": 0.00013682998542208225, "learning_rate": 9.731402448298942e-08, "loss": 0.0, "num_input_tokens_seen": 120314216, "step": 178585 }, { "epoch": 4.362983411917035, "grad_norm": 0.000252584955887869, "learning_rate": 9.727733282025242e-08, "loss": 0.0, "num_input_tokens_seen": 120317800, "step": 178590 }, { "epoch": 4.363105562748882, "grad_norm": 0.012169472873210907, "learning_rate": 9.724064772235729e-08, "loss": 0.0, "num_input_tokens_seen": 120321576, "step": 178595 }, { "epoch": 4.36322771358073, "grad_norm": 0.002420415636152029, "learning_rate": 9.720396918957118e-08, "loss": 0.0, "num_input_tokens_seen": 120325096, "step": 178600 }, { "epoch": 4.3633498644125766, "grad_norm": 4.463781078811735e-05, "learning_rate": 9.716729722216055e-08, "loss": 0.0, "num_input_tokens_seen": 120328360, "step": 178605 }, { "epoch": 4.363472015244424, "grad_norm": 0.00020588969346135855, "learning_rate": 9.713063182039216e-08, "loss": 0.0, "num_input_tokens_seen": 120331496, "step": 178610 }, { "epoch": 4.363594166076271, "grad_norm": 0.00028956696041859686, "learning_rate": 9.709397298453259e-08, "loss": 0.0, "num_input_tokens_seen": 120334632, "step": 178615 }, { "epoch": 4.363716316908118, "grad_norm": 0.0002490598999429494, "learning_rate": 9.705732071484851e-08, "loss": 0.0, "num_input_tokens_seen": 120337832, "step": 178620 }, { "epoch": 4.363838467739965, "grad_norm": 0.0003953216946683824, "learning_rate": 9.70206750116066e-08, "loss": 0.0, "num_input_tokens_seen": 120341288, "step": 178625 }, { "epoch": 4.363960618571813, "grad_norm": 0.0002987831539940089, "learning_rate": 9.698403587507298e-08, "loss": 0.0, "num_input_tokens_seen": 120344616, "step": 178630 }, { "epoch": 4.36408276940366, "grad_norm": 37.20766830444336, "learning_rate": 9.694740330551465e-08, "loss": 0.083, "num_input_tokens_seen": 120347624, "step": 178635 }, { "epoch": 4.364204920235506, "grad_norm": 1.8995802747667767e-05, "learning_rate": 9.691077730319741e-08, "loss": 0.0, "num_input_tokens_seen": 120351336, "step": 178640 }, { "epoch": 4.364327071067354, "grad_norm": 7.500431820517406e-06, "learning_rate": 9.687415786838804e-08, "loss": 0.0, "num_input_tokens_seen": 120354664, "step": 178645 }, { "epoch": 4.364449221899201, "grad_norm": 3.114115315838717e-05, "learning_rate": 9.683754500135266e-08, "loss": 0.0, "num_input_tokens_seen": 120358184, "step": 178650 }, { "epoch": 4.3645713727310484, "grad_norm": 0.0004918667254969478, "learning_rate": 9.68009387023575e-08, "loss": 0.0, "num_input_tokens_seen": 120361192, "step": 178655 }, { "epoch": 4.364693523562895, "grad_norm": 0.00686995405703783, "learning_rate": 9.676433897166903e-08, "loss": 0.0, "num_input_tokens_seen": 120364328, "step": 178660 }, { "epoch": 4.364815674394743, "grad_norm": 0.0003014057583641261, "learning_rate": 9.6727745809553e-08, "loss": 0.0, "num_input_tokens_seen": 120367400, "step": 178665 }, { "epoch": 4.36493782522659, "grad_norm": 4.572595935314894e-05, "learning_rate": 9.669115921627602e-08, "loss": 0.0, "num_input_tokens_seen": 120370408, "step": 178670 }, { "epoch": 4.365059976058437, "grad_norm": 2.4771761673036963e-05, "learning_rate": 9.665457919210363e-08, "loss": 0.0, "num_input_tokens_seen": 120373672, "step": 178675 }, { "epoch": 4.365182126890284, "grad_norm": 9.413428779225796e-05, "learning_rate": 9.661800573730239e-08, "loss": 0.0, "num_input_tokens_seen": 120376744, "step": 178680 }, { "epoch": 4.365304277722132, "grad_norm": 1.0507080332899932e-05, "learning_rate": 9.658143885213776e-08, "loss": 0.0, "num_input_tokens_seen": 120380072, "step": 178685 }, { "epoch": 4.365426428553978, "grad_norm": 0.00014622887829318643, "learning_rate": 9.65448785368761e-08, "loss": 0.0, "num_input_tokens_seen": 120383272, "step": 178690 }, { "epoch": 4.365548579385826, "grad_norm": 0.0007955287583172321, "learning_rate": 9.650832479178283e-08, "loss": 0.0, "num_input_tokens_seen": 120386536, "step": 178695 }, { "epoch": 4.365670730217673, "grad_norm": 0.00014913473569322377, "learning_rate": 9.647177761712421e-08, "loss": 0.0, "num_input_tokens_seen": 120389928, "step": 178700 }, { "epoch": 4.36579288104952, "grad_norm": 0.0032064183615148067, "learning_rate": 9.643523701316591e-08, "loss": 0.0, "num_input_tokens_seen": 120393384, "step": 178705 }, { "epoch": 4.365915031881367, "grad_norm": 0.00015113249537535012, "learning_rate": 9.639870298017339e-08, "loss": 0.0, "num_input_tokens_seen": 120396456, "step": 178710 }, { "epoch": 4.366037182713214, "grad_norm": 0.0001086229458451271, "learning_rate": 9.636217551841253e-08, "loss": 0.0, "num_input_tokens_seen": 120399656, "step": 178715 }, { "epoch": 4.3661593335450615, "grad_norm": 0.0002728144754655659, "learning_rate": 9.632565462814923e-08, "loss": 0.0, "num_input_tokens_seen": 120402728, "step": 178720 }, { "epoch": 4.366281484376908, "grad_norm": 8.51703662192449e-05, "learning_rate": 9.628914030964863e-08, "loss": 0.0, "num_input_tokens_seen": 120405736, "step": 178725 }, { "epoch": 4.366403635208756, "grad_norm": 0.003147940617054701, "learning_rate": 9.625263256317661e-08, "loss": 0.0, "num_input_tokens_seen": 120409192, "step": 178730 }, { "epoch": 4.366525786040603, "grad_norm": 0.0001306245248997584, "learning_rate": 9.621613138899831e-08, "loss": 0.0, "num_input_tokens_seen": 120412520, "step": 178735 }, { "epoch": 4.36664793687245, "grad_norm": 0.001566228223964572, "learning_rate": 9.617963678737961e-08, "loss": 0.0, "num_input_tokens_seen": 120415656, "step": 178740 }, { "epoch": 4.366770087704297, "grad_norm": 0.009327697567641735, "learning_rate": 9.614314875858554e-08, "loss": 0.0, "num_input_tokens_seen": 120418856, "step": 178745 }, { "epoch": 4.366892238536145, "grad_norm": 0.00037339047412388027, "learning_rate": 9.610666730288152e-08, "loss": 0.0, "num_input_tokens_seen": 120422056, "step": 178750 }, { "epoch": 4.367014389367991, "grad_norm": 0.00020661455346271396, "learning_rate": 9.607019242053315e-08, "loss": 0.0, "num_input_tokens_seen": 120425384, "step": 178755 }, { "epoch": 4.367136540199839, "grad_norm": 0.02003299444913864, "learning_rate": 9.603372411180532e-08, "loss": 0.0, "num_input_tokens_seen": 120429160, "step": 178760 }, { "epoch": 4.367258691031686, "grad_norm": 0.00031955703161656857, "learning_rate": 9.599726237696359e-08, "loss": 0.0, "num_input_tokens_seen": 120432488, "step": 178765 }, { "epoch": 4.367380841863533, "grad_norm": 4.561063906294294e-05, "learning_rate": 9.596080721627264e-08, "loss": 0.0, "num_input_tokens_seen": 120435624, "step": 178770 }, { "epoch": 4.36750299269538, "grad_norm": 8.527966201654635e-06, "learning_rate": 9.592435862999793e-08, "loss": 0.0, "num_input_tokens_seen": 120439272, "step": 178775 }, { "epoch": 4.367625143527228, "grad_norm": 0.00015384901780635118, "learning_rate": 9.588791661840468e-08, "loss": 0.0, "num_input_tokens_seen": 120442280, "step": 178780 }, { "epoch": 4.3677472943590745, "grad_norm": 0.00019443745259195566, "learning_rate": 9.585148118175746e-08, "loss": 0.0, "num_input_tokens_seen": 120446248, "step": 178785 }, { "epoch": 4.367869445190922, "grad_norm": 0.031488917768001556, "learning_rate": 9.581505232032161e-08, "loss": 0.0, "num_input_tokens_seen": 120449064, "step": 178790 }, { "epoch": 4.367991596022769, "grad_norm": 0.0008396423072554171, "learning_rate": 9.577863003436182e-08, "loss": 0.0, "num_input_tokens_seen": 120452328, "step": 178795 }, { "epoch": 4.3681137468546165, "grad_norm": 0.06349749118089676, "learning_rate": 9.574221432414297e-08, "loss": 0.0, "num_input_tokens_seen": 120455208, "step": 178800 }, { "epoch": 4.368235897686463, "grad_norm": 0.00028285704320296645, "learning_rate": 9.57058051899301e-08, "loss": 0.0, "num_input_tokens_seen": 120458472, "step": 178805 }, { "epoch": 4.36835804851831, "grad_norm": 0.003374557476490736, "learning_rate": 9.566940263198764e-08, "loss": 0.0, "num_input_tokens_seen": 120462056, "step": 178810 }, { "epoch": 4.368480199350158, "grad_norm": 0.00019568443531170487, "learning_rate": 9.563300665058072e-08, "loss": 0.0, "num_input_tokens_seen": 120465640, "step": 178815 }, { "epoch": 4.368602350182004, "grad_norm": 0.016990887001156807, "learning_rate": 9.559661724597368e-08, "loss": 0.0, "num_input_tokens_seen": 120468520, "step": 178820 }, { "epoch": 4.368724501013852, "grad_norm": 0.0021610939875245094, "learning_rate": 9.556023441843142e-08, "loss": 0.0, "num_input_tokens_seen": 120471848, "step": 178825 }, { "epoch": 4.368846651845699, "grad_norm": 0.0011525845620781183, "learning_rate": 9.552385816821818e-08, "loss": 0.0003, "num_input_tokens_seen": 120475240, "step": 178830 }, { "epoch": 4.368968802677546, "grad_norm": 0.0027562808245420456, "learning_rate": 9.548748849559896e-08, "loss": 0.0, "num_input_tokens_seen": 120479208, "step": 178835 }, { "epoch": 4.369090953509393, "grad_norm": 0.002062242478132248, "learning_rate": 9.545112540083788e-08, "loss": 0.0, "num_input_tokens_seen": 120482152, "step": 178840 }, { "epoch": 4.369213104341241, "grad_norm": 0.00020986916206311435, "learning_rate": 9.541476888419942e-08, "loss": 0.0, "num_input_tokens_seen": 120485416, "step": 178845 }, { "epoch": 4.3693352551730875, "grad_norm": 0.0021745937410742044, "learning_rate": 9.537841894594823e-08, "loss": 0.0, "num_input_tokens_seen": 120488936, "step": 178850 }, { "epoch": 4.369457406004935, "grad_norm": 0.00035431631840765476, "learning_rate": 9.534207558634833e-08, "loss": 0.0, "num_input_tokens_seen": 120492584, "step": 178855 }, { "epoch": 4.369579556836782, "grad_norm": 20.136585235595703, "learning_rate": 9.53057388056644e-08, "loss": 0.0931, "num_input_tokens_seen": 120495272, "step": 178860 }, { "epoch": 4.3697017076686295, "grad_norm": 0.00022699769760947675, "learning_rate": 9.526940860416033e-08, "loss": 0.0, "num_input_tokens_seen": 120499560, "step": 178865 }, { "epoch": 4.369823858500476, "grad_norm": 0.00014428774011321366, "learning_rate": 9.523308498210036e-08, "loss": 0.0, "num_input_tokens_seen": 120502824, "step": 178870 }, { "epoch": 4.369946009332324, "grad_norm": 0.0001790342212188989, "learning_rate": 9.519676793974907e-08, "loss": 0.0, "num_input_tokens_seen": 120505896, "step": 178875 }, { "epoch": 4.370068160164171, "grad_norm": 2.688972199393902e-05, "learning_rate": 9.516045747736989e-08, "loss": 0.0, "num_input_tokens_seen": 120508840, "step": 178880 }, { "epoch": 4.370190310996018, "grad_norm": 4.677814558817772e-06, "learning_rate": 9.512415359522752e-08, "loss": 0.0, "num_input_tokens_seen": 120512040, "step": 178885 }, { "epoch": 4.370312461827865, "grad_norm": 0.0043054185807704926, "learning_rate": 9.508785629358552e-08, "loss": 0.0, "num_input_tokens_seen": 120515560, "step": 178890 }, { "epoch": 4.370434612659713, "grad_norm": 3.436887345742434e-05, "learning_rate": 9.50515655727081e-08, "loss": 0.0, "num_input_tokens_seen": 120519208, "step": 178895 }, { "epoch": 4.370556763491559, "grad_norm": 0.0003368296311236918, "learning_rate": 9.50152814328592e-08, "loss": 0.0, "num_input_tokens_seen": 120522344, "step": 178900 }, { "epoch": 4.370678914323406, "grad_norm": 0.00037761012208648026, "learning_rate": 9.497900387430236e-08, "loss": 0.0, "num_input_tokens_seen": 120525864, "step": 178905 }, { "epoch": 4.370801065155254, "grad_norm": 0.001919149188324809, "learning_rate": 9.494273289730181e-08, "loss": 0.0, "num_input_tokens_seen": 120529320, "step": 178910 }, { "epoch": 4.3709232159871005, "grad_norm": 0.0013135827612131834, "learning_rate": 9.490646850212103e-08, "loss": 0.0, "num_input_tokens_seen": 120532328, "step": 178915 }, { "epoch": 4.371045366818948, "grad_norm": 1.2294826774450485e-05, "learning_rate": 9.487021068902402e-08, "loss": 0.0, "num_input_tokens_seen": 120535976, "step": 178920 }, { "epoch": 4.371167517650795, "grad_norm": 4.1211696952814236e-05, "learning_rate": 9.483395945827399e-08, "loss": 0.0, "num_input_tokens_seen": 120539176, "step": 178925 }, { "epoch": 4.3712896684826426, "grad_norm": 2.6847643312066793e-05, "learning_rate": 9.479771481013488e-08, "loss": 0.0, "num_input_tokens_seen": 120542696, "step": 178930 }, { "epoch": 4.371411819314489, "grad_norm": 0.0009733302285894752, "learning_rate": 9.476147674487056e-08, "loss": 0.0, "num_input_tokens_seen": 120546152, "step": 178935 }, { "epoch": 4.371533970146337, "grad_norm": 0.00021564547205343843, "learning_rate": 9.472524526274394e-08, "loss": 0.0, "num_input_tokens_seen": 120549800, "step": 178940 }, { "epoch": 4.371656120978184, "grad_norm": 0.0003852611407637596, "learning_rate": 9.468902036401916e-08, "loss": 0.0, "num_input_tokens_seen": 120553448, "step": 178945 }, { "epoch": 4.371778271810031, "grad_norm": 0.00020964071154594421, "learning_rate": 9.465280204895909e-08, "loss": 0.0, "num_input_tokens_seen": 120556712, "step": 178950 }, { "epoch": 4.371900422641878, "grad_norm": 0.00012947864888701588, "learning_rate": 9.461659031782742e-08, "loss": 0.0, "num_input_tokens_seen": 120560168, "step": 178955 }, { "epoch": 4.372022573473726, "grad_norm": 0.00023771810811012983, "learning_rate": 9.45803851708874e-08, "loss": 0.0, "num_input_tokens_seen": 120563304, "step": 178960 }, { "epoch": 4.372144724305572, "grad_norm": 0.0017591394716873765, "learning_rate": 9.454418660840225e-08, "loss": 0.0, "num_input_tokens_seen": 120566504, "step": 178965 }, { "epoch": 4.37226687513742, "grad_norm": 0.007710427511483431, "learning_rate": 9.450799463063552e-08, "loss": 0.0, "num_input_tokens_seen": 120570152, "step": 178970 }, { "epoch": 4.372389025969267, "grad_norm": 0.04205526411533356, "learning_rate": 9.447180923785004e-08, "loss": 0.0, "num_input_tokens_seen": 120573800, "step": 178975 }, { "epoch": 4.372511176801114, "grad_norm": 0.0001873615983640775, "learning_rate": 9.443563043030922e-08, "loss": 0.0, "num_input_tokens_seen": 120577576, "step": 178980 }, { "epoch": 4.372633327632961, "grad_norm": 2.1078389181639068e-05, "learning_rate": 9.4399458208276e-08, "loss": 0.0, "num_input_tokens_seen": 120581288, "step": 178985 }, { "epoch": 4.372755478464808, "grad_norm": 0.0005454168422147632, "learning_rate": 9.436329257201359e-08, "loss": 0.0, "num_input_tokens_seen": 120584488, "step": 178990 }, { "epoch": 4.372877629296656, "grad_norm": 0.0002987706393469125, "learning_rate": 9.432713352178479e-08, "loss": 0.0, "num_input_tokens_seen": 120587688, "step": 178995 }, { "epoch": 4.372999780128502, "grad_norm": 6.870987999718636e-05, "learning_rate": 9.429098105785283e-08, "loss": 0.0, "num_input_tokens_seen": 120590888, "step": 179000 }, { "epoch": 4.37312193096035, "grad_norm": 0.005530540365725756, "learning_rate": 9.425483518048028e-08, "loss": 0.0, "num_input_tokens_seen": 120594152, "step": 179005 }, { "epoch": 4.373244081792197, "grad_norm": 0.00047069930587895215, "learning_rate": 9.421869588993025e-08, "loss": 0.0, "num_input_tokens_seen": 120597352, "step": 179010 }, { "epoch": 4.373366232624044, "grad_norm": 0.00013837346341460943, "learning_rate": 9.418256318646567e-08, "loss": 0.0, "num_input_tokens_seen": 120600616, "step": 179015 }, { "epoch": 4.373488383455891, "grad_norm": 0.0004524173273239285, "learning_rate": 9.414643707034886e-08, "loss": 0.0, "num_input_tokens_seen": 120603880, "step": 179020 }, { "epoch": 4.373610534287739, "grad_norm": 0.0004959152429364622, "learning_rate": 9.411031754184285e-08, "loss": 0.0, "num_input_tokens_seen": 120607656, "step": 179025 }, { "epoch": 4.3737326851195855, "grad_norm": 0.00014898374502081424, "learning_rate": 9.40742046012104e-08, "loss": 0.0, "num_input_tokens_seen": 120610920, "step": 179030 }, { "epoch": 4.373854835951433, "grad_norm": 0.005233935080468655, "learning_rate": 9.403809824871378e-08, "loss": 0.0, "num_input_tokens_seen": 120614440, "step": 179035 }, { "epoch": 4.37397698678328, "grad_norm": 0.0002437441289657727, "learning_rate": 9.400199848461598e-08, "loss": 0.0, "num_input_tokens_seen": 120617832, "step": 179040 }, { "epoch": 4.3740991376151275, "grad_norm": 5.977055479888804e-05, "learning_rate": 9.396590530917925e-08, "loss": 0.0, "num_input_tokens_seen": 120621416, "step": 179045 }, { "epoch": 4.374221288446974, "grad_norm": 0.0006878579151816666, "learning_rate": 9.392981872266626e-08, "loss": 0.0, "num_input_tokens_seen": 120625192, "step": 179050 }, { "epoch": 4.374343439278822, "grad_norm": 1.9039856852032244e-05, "learning_rate": 9.389373872533912e-08, "loss": 0.0, "num_input_tokens_seen": 120628328, "step": 179055 }, { "epoch": 4.374465590110669, "grad_norm": 0.4868165850639343, "learning_rate": 9.385766531746053e-08, "loss": 0.0001, "num_input_tokens_seen": 120631528, "step": 179060 }, { "epoch": 4.374587740942516, "grad_norm": 10.62925910949707, "learning_rate": 9.382159849929284e-08, "loss": 0.0302, "num_input_tokens_seen": 120634856, "step": 179065 }, { "epoch": 4.374709891774363, "grad_norm": 0.0009652128792367876, "learning_rate": 9.378553827109803e-08, "loss": 0.0, "num_input_tokens_seen": 120637928, "step": 179070 }, { "epoch": 4.37483204260621, "grad_norm": 7.944925891933963e-05, "learning_rate": 9.37494846331387e-08, "loss": 0.0, "num_input_tokens_seen": 120641064, "step": 179075 }, { "epoch": 4.374954193438057, "grad_norm": 3.24042011925485e-05, "learning_rate": 9.371343758567663e-08, "loss": 0.0, "num_input_tokens_seen": 120644136, "step": 179080 }, { "epoch": 4.375076344269904, "grad_norm": 0.000675307703204453, "learning_rate": 9.367739712897426e-08, "loss": 0.0, "num_input_tokens_seen": 120647592, "step": 179085 }, { "epoch": 4.375198495101752, "grad_norm": 0.0001943126117112115, "learning_rate": 9.364136326329386e-08, "loss": 0.0, "num_input_tokens_seen": 120650792, "step": 179090 }, { "epoch": 4.3753206459335985, "grad_norm": 63.127132415771484, "learning_rate": 9.360533598889708e-08, "loss": 0.0913, "num_input_tokens_seen": 120654120, "step": 179095 }, { "epoch": 4.375442796765446, "grad_norm": 2.2227972294786014e-05, "learning_rate": 9.356931530604617e-08, "loss": 0.0555, "num_input_tokens_seen": 120657256, "step": 179100 }, { "epoch": 4.375564947597293, "grad_norm": 5.614551264443435e-05, "learning_rate": 9.353330121500302e-08, "loss": 0.0, "num_input_tokens_seen": 120660392, "step": 179105 }, { "epoch": 4.3756870984291405, "grad_norm": 0.0001968853030120954, "learning_rate": 9.349729371602944e-08, "loss": 0.0, "num_input_tokens_seen": 120663720, "step": 179110 }, { "epoch": 4.375809249260987, "grad_norm": 0.00011514556535985321, "learning_rate": 9.346129280938742e-08, "loss": 0.0, "num_input_tokens_seen": 120667176, "step": 179115 }, { "epoch": 4.375931400092835, "grad_norm": 0.00011827262642327696, "learning_rate": 9.342529849533853e-08, "loss": 0.0, "num_input_tokens_seen": 120670888, "step": 179120 }, { "epoch": 4.376053550924682, "grad_norm": 0.002016911981627345, "learning_rate": 9.338931077414492e-08, "loss": 0.0, "num_input_tokens_seen": 120674280, "step": 179125 }, { "epoch": 4.376175701756529, "grad_norm": 0.0008704157080501318, "learning_rate": 9.335332964606802e-08, "loss": 0.0, "num_input_tokens_seen": 120677608, "step": 179130 }, { "epoch": 4.376297852588376, "grad_norm": 0.0019470416009426117, "learning_rate": 9.331735511136962e-08, "loss": 0.0, "num_input_tokens_seen": 120681064, "step": 179135 }, { "epoch": 4.376420003420224, "grad_norm": 2.9816957976436242e-05, "learning_rate": 9.328138717031109e-08, "loss": 0.0, "num_input_tokens_seen": 120684328, "step": 179140 }, { "epoch": 4.37654215425207, "grad_norm": 4.998807344236411e-05, "learning_rate": 9.324542582315442e-08, "loss": 0.0489, "num_input_tokens_seen": 120687400, "step": 179145 }, { "epoch": 4.376664305083918, "grad_norm": 0.007195464801043272, "learning_rate": 9.320947107016074e-08, "loss": 0.0, "num_input_tokens_seen": 120690600, "step": 179150 }, { "epoch": 4.376786455915765, "grad_norm": 0.0022986922413110733, "learning_rate": 9.317352291159164e-08, "loss": 0.0, "num_input_tokens_seen": 120694312, "step": 179155 }, { "epoch": 4.376908606747612, "grad_norm": 0.00016899174079298973, "learning_rate": 9.313758134770877e-08, "loss": 0.0, "num_input_tokens_seen": 120697576, "step": 179160 }, { "epoch": 4.377030757579459, "grad_norm": 0.02144761197268963, "learning_rate": 9.310164637877316e-08, "loss": 0.0, "num_input_tokens_seen": 120701160, "step": 179165 }, { "epoch": 4.377152908411306, "grad_norm": 0.0012190442066639662, "learning_rate": 9.306571800504648e-08, "loss": 0.0, "num_input_tokens_seen": 120705000, "step": 179170 }, { "epoch": 4.3772750592431535, "grad_norm": 0.2229158878326416, "learning_rate": 9.302979622678964e-08, "loss": 0.0001, "num_input_tokens_seen": 120708264, "step": 179175 }, { "epoch": 4.377397210075, "grad_norm": 5.557996337302029e-05, "learning_rate": 9.299388104426409e-08, "loss": 0.0, "num_input_tokens_seen": 120711400, "step": 179180 }, { "epoch": 4.377519360906848, "grad_norm": 8.016775973374024e-05, "learning_rate": 9.295797245773119e-08, "loss": 0.0, "num_input_tokens_seen": 120714344, "step": 179185 }, { "epoch": 4.377641511738695, "grad_norm": 0.00011706854274962097, "learning_rate": 9.29220704674516e-08, "loss": 0.0, "num_input_tokens_seen": 120717352, "step": 179190 }, { "epoch": 4.377763662570542, "grad_norm": 0.004367952700704336, "learning_rate": 9.288617507368701e-08, "loss": 0.0, "num_input_tokens_seen": 120720488, "step": 179195 }, { "epoch": 4.377885813402389, "grad_norm": 0.0011716161388903856, "learning_rate": 9.285028627669789e-08, "loss": 0.0, "num_input_tokens_seen": 120723752, "step": 179200 }, { "epoch": 4.378007964234237, "grad_norm": 0.000715529196895659, "learning_rate": 9.281440407674556e-08, "loss": 0.0, "num_input_tokens_seen": 120726888, "step": 179205 }, { "epoch": 4.378130115066083, "grad_norm": 0.0004990265006199479, "learning_rate": 9.277852847409107e-08, "loss": 0.0, "num_input_tokens_seen": 120730536, "step": 179210 }, { "epoch": 4.378252265897931, "grad_norm": 3.416829349589534e-05, "learning_rate": 9.274265946899484e-08, "loss": 0.0, "num_input_tokens_seen": 120734504, "step": 179215 }, { "epoch": 4.378374416729778, "grad_norm": 0.00046856681001372635, "learning_rate": 9.270679706171825e-08, "loss": 0.0, "num_input_tokens_seen": 120737448, "step": 179220 }, { "epoch": 4.378496567561625, "grad_norm": 0.00010078339255414903, "learning_rate": 9.267094125252161e-08, "loss": 0.0001, "num_input_tokens_seen": 120740328, "step": 179225 }, { "epoch": 4.378618718393472, "grad_norm": 2.9010152502451092e-05, "learning_rate": 9.26350920416662e-08, "loss": 0.0, "num_input_tokens_seen": 120743592, "step": 179230 }, { "epoch": 4.37874086922532, "grad_norm": 3.963453309552278e-06, "learning_rate": 9.259924942941222e-08, "loss": 0.0, "num_input_tokens_seen": 120746856, "step": 179235 }, { "epoch": 4.3788630200571665, "grad_norm": 0.0013897567987442017, "learning_rate": 9.256341341602059e-08, "loss": 0.0003, "num_input_tokens_seen": 120750504, "step": 179240 }, { "epoch": 4.378985170889013, "grad_norm": 0.00011837323108920828, "learning_rate": 9.25275840017521e-08, "loss": 0.0, "num_input_tokens_seen": 120754344, "step": 179245 }, { "epoch": 4.379107321720861, "grad_norm": 0.00936560146510601, "learning_rate": 9.249176118686686e-08, "loss": 0.0, "num_input_tokens_seen": 120757736, "step": 179250 }, { "epoch": 4.379229472552708, "grad_norm": 7.882525824243203e-05, "learning_rate": 9.245594497162579e-08, "loss": 0.0, "num_input_tokens_seen": 120761064, "step": 179255 }, { "epoch": 4.379351623384555, "grad_norm": 0.0018617367604747415, "learning_rate": 9.242013535628901e-08, "loss": 0.0, "num_input_tokens_seen": 120764456, "step": 179260 }, { "epoch": 4.379473774216402, "grad_norm": 0.00010539888171479106, "learning_rate": 9.238433234111731e-08, "loss": 0.0, "num_input_tokens_seen": 120767656, "step": 179265 }, { "epoch": 4.37959592504825, "grad_norm": 0.0016833314439281821, "learning_rate": 9.234853592637071e-08, "loss": 0.0, "num_input_tokens_seen": 120770792, "step": 179270 }, { "epoch": 4.379718075880096, "grad_norm": 0.000337698234943673, "learning_rate": 9.231274611230965e-08, "loss": 0.084, "num_input_tokens_seen": 120773992, "step": 179275 }, { "epoch": 4.379840226711944, "grad_norm": 0.0006232039304450154, "learning_rate": 9.227696289919462e-08, "loss": 0.0, "num_input_tokens_seen": 120777128, "step": 179280 }, { "epoch": 4.379962377543791, "grad_norm": 0.0005635755369439721, "learning_rate": 9.22411862872855e-08, "loss": 0.0, "num_input_tokens_seen": 120780264, "step": 179285 }, { "epoch": 4.380084528375638, "grad_norm": 0.0004072504525538534, "learning_rate": 9.220541627684286e-08, "loss": 0.0, "num_input_tokens_seen": 120783848, "step": 179290 }, { "epoch": 4.380206679207485, "grad_norm": 0.0003334445063956082, "learning_rate": 9.216965286812628e-08, "loss": 0.0, "num_input_tokens_seen": 120787304, "step": 179295 }, { "epoch": 4.380328830039333, "grad_norm": 0.0065982588566839695, "learning_rate": 9.213389606139643e-08, "loss": 0.0182, "num_input_tokens_seen": 120791016, "step": 179300 }, { "epoch": 4.38045098087118, "grad_norm": 0.00016611551109235734, "learning_rate": 9.2098145856913e-08, "loss": 0.0001, "num_input_tokens_seen": 120794344, "step": 179305 }, { "epoch": 4.380573131703027, "grad_norm": 5.830096051795408e-05, "learning_rate": 9.206240225493611e-08, "loss": 0.0, "num_input_tokens_seen": 120797736, "step": 179310 }, { "epoch": 4.380695282534874, "grad_norm": 0.00011630048538791016, "learning_rate": 9.202666525572545e-08, "loss": 0.0, "num_input_tokens_seen": 120800936, "step": 179315 }, { "epoch": 4.380817433366722, "grad_norm": 0.0007063053781166673, "learning_rate": 9.199093485954123e-08, "loss": 0.0, "num_input_tokens_seen": 120804840, "step": 179320 }, { "epoch": 4.380939584198568, "grad_norm": 3.338679744047113e-05, "learning_rate": 9.195521106664328e-08, "loss": 0.0, "num_input_tokens_seen": 120808168, "step": 179325 }, { "epoch": 4.381061735030416, "grad_norm": 0.0004898302140645683, "learning_rate": 9.191949387729103e-08, "loss": 0.0, "num_input_tokens_seen": 120811496, "step": 179330 }, { "epoch": 4.381183885862263, "grad_norm": 0.0033150261733680964, "learning_rate": 9.188378329174451e-08, "loss": 0.0, "num_input_tokens_seen": 120814760, "step": 179335 }, { "epoch": 4.3813060366941095, "grad_norm": 6.164521892060293e-06, "learning_rate": 9.184807931026351e-08, "loss": 0.0, "num_input_tokens_seen": 120818472, "step": 179340 }, { "epoch": 4.381428187525957, "grad_norm": 0.0019105395767837763, "learning_rate": 9.181238193310736e-08, "loss": 0.0, "num_input_tokens_seen": 120821352, "step": 179345 }, { "epoch": 4.381550338357804, "grad_norm": 0.0002401101082796231, "learning_rate": 9.177669116053599e-08, "loss": 0.0, "num_input_tokens_seen": 120824488, "step": 179350 }, { "epoch": 4.3816724891896515, "grad_norm": 0.00011806233669631183, "learning_rate": 9.174100699280862e-08, "loss": 0.0, "num_input_tokens_seen": 120827752, "step": 179355 }, { "epoch": 4.381794640021498, "grad_norm": 0.017286479473114014, "learning_rate": 9.170532943018517e-08, "loss": 0.0302, "num_input_tokens_seen": 120831656, "step": 179360 }, { "epoch": 4.381916790853346, "grad_norm": 0.0006702401442453265, "learning_rate": 9.166965847292474e-08, "loss": 0.0, "num_input_tokens_seen": 120834792, "step": 179365 }, { "epoch": 4.382038941685193, "grad_norm": 0.0017308936221525073, "learning_rate": 9.163399412128681e-08, "loss": 0.0, "num_input_tokens_seen": 120839272, "step": 179370 }, { "epoch": 4.38216109251704, "grad_norm": 0.00014859286602586508, "learning_rate": 9.159833637553094e-08, "loss": 0.0, "num_input_tokens_seen": 120842600, "step": 179375 }, { "epoch": 4.382283243348887, "grad_norm": 0.0006171285640448332, "learning_rate": 9.156268523591615e-08, "loss": 0.0, "num_input_tokens_seen": 120845864, "step": 179380 }, { "epoch": 4.382405394180735, "grad_norm": 0.0025407453067600727, "learning_rate": 9.152704070270201e-08, "loss": 0.0, "num_input_tokens_seen": 120849192, "step": 179385 }, { "epoch": 4.382527545012581, "grad_norm": 0.0007126021664589643, "learning_rate": 9.149140277614742e-08, "loss": 0.0, "num_input_tokens_seen": 120853096, "step": 179390 }, { "epoch": 4.382649695844429, "grad_norm": 0.0014084700960665941, "learning_rate": 9.145577145651173e-08, "loss": 0.0, "num_input_tokens_seen": 120856360, "step": 179395 }, { "epoch": 4.382771846676276, "grad_norm": 0.00025574007304385304, "learning_rate": 9.142014674405418e-08, "loss": 0.0, "num_input_tokens_seen": 120860072, "step": 179400 }, { "epoch": 4.382893997508123, "grad_norm": 0.041199006140232086, "learning_rate": 9.138452863903357e-08, "loss": 0.0, "num_input_tokens_seen": 120863016, "step": 179405 }, { "epoch": 4.38301614833997, "grad_norm": 0.0009356053778901696, "learning_rate": 9.134891714170911e-08, "loss": 0.0, "num_input_tokens_seen": 120866280, "step": 179410 }, { "epoch": 4.383138299171818, "grad_norm": 0.020888380706310272, "learning_rate": 9.131331225233985e-08, "loss": 0.0, "num_input_tokens_seen": 120869800, "step": 179415 }, { "epoch": 4.3832604500036645, "grad_norm": 0.00015980943862814456, "learning_rate": 9.127771397118434e-08, "loss": 0.0, "num_input_tokens_seen": 120872552, "step": 179420 }, { "epoch": 4.383382600835512, "grad_norm": 0.0007625825819559395, "learning_rate": 9.124212229850192e-08, "loss": 0.0, "num_input_tokens_seen": 120876200, "step": 179425 }, { "epoch": 4.383504751667359, "grad_norm": 0.032647211104631424, "learning_rate": 9.120653723455108e-08, "loss": 0.0, "num_input_tokens_seen": 120879016, "step": 179430 }, { "epoch": 4.383626902499206, "grad_norm": 6.470428343163803e-05, "learning_rate": 9.117095877959091e-08, "loss": 0.0, "num_input_tokens_seen": 120882408, "step": 179435 }, { "epoch": 4.383749053331053, "grad_norm": 8.53855672175996e-05, "learning_rate": 9.113538693387967e-08, "loss": 0.0, "num_input_tokens_seen": 120885608, "step": 179440 }, { "epoch": 4.3838712041629, "grad_norm": 0.00017426212434656918, "learning_rate": 9.109982169767671e-08, "loss": 0.0, "num_input_tokens_seen": 120888680, "step": 179445 }, { "epoch": 4.383993354994748, "grad_norm": 0.02529120445251465, "learning_rate": 9.106426307124004e-08, "loss": 0.0, "num_input_tokens_seen": 120891944, "step": 179450 }, { "epoch": 4.384115505826594, "grad_norm": 3.9343041862593964e-05, "learning_rate": 9.102871105482868e-08, "loss": 0.0, "num_input_tokens_seen": 120895336, "step": 179455 }, { "epoch": 4.384237656658442, "grad_norm": 0.00017238286091014743, "learning_rate": 9.099316564870086e-08, "loss": 0.0, "num_input_tokens_seen": 120898664, "step": 179460 }, { "epoch": 4.384359807490289, "grad_norm": 24.275854110717773, "learning_rate": 9.095762685311526e-08, "loss": 0.058, "num_input_tokens_seen": 120901800, "step": 179465 }, { "epoch": 4.384481958322136, "grad_norm": 0.018933162093162537, "learning_rate": 9.092209466833046e-08, "loss": 0.0, "num_input_tokens_seen": 120904808, "step": 179470 }, { "epoch": 4.384604109153983, "grad_norm": 4.2884777940344065e-05, "learning_rate": 9.088656909460446e-08, "loss": 0.0, "num_input_tokens_seen": 120908264, "step": 179475 }, { "epoch": 4.384726259985831, "grad_norm": 0.00044881939538754523, "learning_rate": 9.085105013219607e-08, "loss": 0.0, "num_input_tokens_seen": 120911272, "step": 179480 }, { "epoch": 4.3848484108176775, "grad_norm": 0.0136734489351511, "learning_rate": 9.08155377813633e-08, "loss": 0.0, "num_input_tokens_seen": 120914664, "step": 179485 }, { "epoch": 4.384970561649525, "grad_norm": 0.0005159003776498139, "learning_rate": 9.07800320423644e-08, "loss": 0.0, "num_input_tokens_seen": 120917992, "step": 179490 }, { "epoch": 4.385092712481372, "grad_norm": 8.528042963007465e-05, "learning_rate": 9.074453291545781e-08, "loss": 0.0, "num_input_tokens_seen": 120921192, "step": 179495 }, { "epoch": 4.3852148633132195, "grad_norm": 6.457211566157639e-05, "learning_rate": 9.070904040090132e-08, "loss": 0.0, "num_input_tokens_seen": 120924328, "step": 179500 }, { "epoch": 4.385337014145066, "grad_norm": 3.3956439438043162e-06, "learning_rate": 9.067355449895352e-08, "loss": 0.0, "num_input_tokens_seen": 120928104, "step": 179505 }, { "epoch": 4.385459164976914, "grad_norm": 0.0062750293873250484, "learning_rate": 9.063807520987199e-08, "loss": 0.0, "num_input_tokens_seen": 120931304, "step": 179510 }, { "epoch": 4.385581315808761, "grad_norm": 0.0009548944071866572, "learning_rate": 9.060260253391517e-08, "loss": 0.0, "num_input_tokens_seen": 120934568, "step": 179515 }, { "epoch": 4.385703466640608, "grad_norm": 0.00014670187374576926, "learning_rate": 9.056713647134084e-08, "loss": 0.0, "num_input_tokens_seen": 120937832, "step": 179520 }, { "epoch": 4.385825617472455, "grad_norm": 0.002957995282486081, "learning_rate": 9.053167702240672e-08, "loss": 0.0, "num_input_tokens_seen": 120940904, "step": 179525 }, { "epoch": 4.385947768304302, "grad_norm": 0.055948711931705475, "learning_rate": 9.049622418737102e-08, "loss": 0.0, "num_input_tokens_seen": 120944808, "step": 179530 }, { "epoch": 4.386069919136149, "grad_norm": 0.0007765132468193769, "learning_rate": 9.046077796649121e-08, "loss": 0.0, "num_input_tokens_seen": 120948200, "step": 179535 }, { "epoch": 4.386192069967996, "grad_norm": 0.0014981003478169441, "learning_rate": 9.042533836002541e-08, "loss": 0.0, "num_input_tokens_seen": 120951912, "step": 179540 }, { "epoch": 4.386314220799844, "grad_norm": 0.0016036713495850563, "learning_rate": 9.03899053682311e-08, "loss": 0.0, "num_input_tokens_seen": 120955496, "step": 179545 }, { "epoch": 4.3864363716316905, "grad_norm": 0.0004156556387897581, "learning_rate": 9.035447899136617e-08, "loss": 0.0, "num_input_tokens_seen": 120959336, "step": 179550 }, { "epoch": 4.386558522463538, "grad_norm": 0.00033517173142172396, "learning_rate": 9.031905922968797e-08, "loss": 0.0, "num_input_tokens_seen": 120963112, "step": 179555 }, { "epoch": 4.386680673295385, "grad_norm": 5.0665068556554615e-05, "learning_rate": 9.02836460834543e-08, "loss": 0.0, "num_input_tokens_seen": 120966376, "step": 179560 }, { "epoch": 4.3868028241272325, "grad_norm": 0.00043228318099863827, "learning_rate": 9.024823955292271e-08, "loss": 0.0, "num_input_tokens_seen": 120969896, "step": 179565 }, { "epoch": 4.386924974959079, "grad_norm": 0.0023719624150544405, "learning_rate": 9.021283963835058e-08, "loss": 0.0, "num_input_tokens_seen": 120973608, "step": 179570 }, { "epoch": 4.387047125790927, "grad_norm": 0.00037452831747941673, "learning_rate": 9.017744633999547e-08, "loss": 0.0, "num_input_tokens_seen": 120977000, "step": 179575 }, { "epoch": 4.387169276622774, "grad_norm": 0.0002062133135041222, "learning_rate": 9.01420596581145e-08, "loss": 0.0, "num_input_tokens_seen": 120980584, "step": 179580 }, { "epoch": 4.387291427454621, "grad_norm": 0.0007104614633135498, "learning_rate": 9.010667959296526e-08, "loss": 0.0, "num_input_tokens_seen": 120984168, "step": 179585 }, { "epoch": 4.387413578286468, "grad_norm": 0.00010111812298418954, "learning_rate": 9.007130614480507e-08, "loss": 0.0, "num_input_tokens_seen": 120987560, "step": 179590 }, { "epoch": 4.387535729118316, "grad_norm": 0.0004167421138845384, "learning_rate": 9.003593931389087e-08, "loss": 0.0, "num_input_tokens_seen": 120990696, "step": 179595 }, { "epoch": 4.387657879950162, "grad_norm": 0.0004182478878647089, "learning_rate": 9.000057910048042e-08, "loss": 0.0, "num_input_tokens_seen": 120993960, "step": 179600 }, { "epoch": 4.387780030782009, "grad_norm": 0.0013876301236450672, "learning_rate": 8.996522550483021e-08, "loss": 0.0, "num_input_tokens_seen": 120997224, "step": 179605 }, { "epoch": 4.387902181613857, "grad_norm": 4.902793807559647e-05, "learning_rate": 8.99298785271978e-08, "loss": 0.0, "num_input_tokens_seen": 121000680, "step": 179610 }, { "epoch": 4.388024332445704, "grad_norm": 0.00018190659466199577, "learning_rate": 8.989453816783998e-08, "loss": 0.0, "num_input_tokens_seen": 121003752, "step": 179615 }, { "epoch": 4.388146483277551, "grad_norm": 4.526587417785777e-06, "learning_rate": 8.985920442701411e-08, "loss": 0.0, "num_input_tokens_seen": 121007144, "step": 179620 }, { "epoch": 4.388268634109398, "grad_norm": 0.002543956506997347, "learning_rate": 8.982387730497665e-08, "loss": 0.0001, "num_input_tokens_seen": 121010152, "step": 179625 }, { "epoch": 4.388390784941246, "grad_norm": 0.004739391151815653, "learning_rate": 8.978855680198494e-08, "loss": 0.0, "num_input_tokens_seen": 121013288, "step": 179630 }, { "epoch": 4.388512935773092, "grad_norm": 7.183388515841216e-05, "learning_rate": 8.975324291829567e-08, "loss": 0.0, "num_input_tokens_seen": 121016680, "step": 179635 }, { "epoch": 4.38863508660494, "grad_norm": 1.286260703636799e-05, "learning_rate": 8.971793565416541e-08, "loss": 0.0214, "num_input_tokens_seen": 121020072, "step": 179640 }, { "epoch": 4.388757237436787, "grad_norm": 0.0037381022702902555, "learning_rate": 8.96826350098513e-08, "loss": 0.0, "num_input_tokens_seen": 121023336, "step": 179645 }, { "epoch": 4.388879388268634, "grad_norm": 2.0466199202928692e-05, "learning_rate": 8.964734098561e-08, "loss": 0.0, "num_input_tokens_seen": 121026664, "step": 179650 }, { "epoch": 4.389001539100481, "grad_norm": 0.0005669654929079115, "learning_rate": 8.961205358169788e-08, "loss": 0.0001, "num_input_tokens_seen": 121029672, "step": 179655 }, { "epoch": 4.389123689932329, "grad_norm": 21.659992218017578, "learning_rate": 8.957677279837195e-08, "loss": 0.0325, "num_input_tokens_seen": 121032744, "step": 179660 }, { "epoch": 4.3892458407641755, "grad_norm": 1.6105790564324707e-05, "learning_rate": 8.954149863588844e-08, "loss": 0.0001, "num_input_tokens_seen": 121036136, "step": 179665 }, { "epoch": 4.389367991596023, "grad_norm": 4.675076706917025e-05, "learning_rate": 8.950623109450428e-08, "loss": 0.0, "num_input_tokens_seen": 121039592, "step": 179670 }, { "epoch": 4.38949014242787, "grad_norm": 0.006997786462306976, "learning_rate": 8.947097017447546e-08, "loss": 0.0002, "num_input_tokens_seen": 121042920, "step": 179675 }, { "epoch": 4.3896122932597175, "grad_norm": 0.0003096538130193949, "learning_rate": 8.94357158760587e-08, "loss": 0.0, "num_input_tokens_seen": 121046568, "step": 179680 }, { "epoch": 4.389734444091564, "grad_norm": 0.0009752101614139974, "learning_rate": 8.940046819951052e-08, "loss": 0.0, "num_input_tokens_seen": 121050024, "step": 179685 }, { "epoch": 4.389856594923412, "grad_norm": 0.0027850079350173473, "learning_rate": 8.936522714508676e-08, "loss": 0.0, "num_input_tokens_seen": 121053096, "step": 179690 }, { "epoch": 4.389978745755259, "grad_norm": 0.0012620454654097557, "learning_rate": 8.932999271304432e-08, "loss": 0.0, "num_input_tokens_seen": 121056360, "step": 179695 }, { "epoch": 4.390100896587105, "grad_norm": 0.0006452227826230228, "learning_rate": 8.929476490363885e-08, "loss": 0.0, "num_input_tokens_seen": 121059688, "step": 179700 }, { "epoch": 4.390223047418953, "grad_norm": 0.001190279726870358, "learning_rate": 8.925954371712696e-08, "loss": 0.0, "num_input_tokens_seen": 121063720, "step": 179705 }, { "epoch": 4.3903451982508, "grad_norm": 0.0012196673778817058, "learning_rate": 8.922432915376443e-08, "loss": 0.0, "num_input_tokens_seen": 121067048, "step": 179710 }, { "epoch": 4.390467349082647, "grad_norm": 0.01806124486029148, "learning_rate": 8.918912121380761e-08, "loss": 0.0, "num_input_tokens_seen": 121070568, "step": 179715 }, { "epoch": 4.390589499914494, "grad_norm": 0.001845500897616148, "learning_rate": 8.915391989751265e-08, "loss": 0.0, "num_input_tokens_seen": 121073832, "step": 179720 }, { "epoch": 4.390711650746342, "grad_norm": 0.0005148798227310181, "learning_rate": 8.911872520513542e-08, "loss": 0.0002, "num_input_tokens_seen": 121076968, "step": 179725 }, { "epoch": 4.3908338015781885, "grad_norm": 0.00020431082521099597, "learning_rate": 8.908353713693162e-08, "loss": 0.0014, "num_input_tokens_seen": 121080552, "step": 179730 }, { "epoch": 4.390955952410036, "grad_norm": 0.0008650731761008501, "learning_rate": 8.90483556931575e-08, "loss": 0.0, "num_input_tokens_seen": 121084392, "step": 179735 }, { "epoch": 4.391078103241883, "grad_norm": 0.0010449419496580958, "learning_rate": 8.90131808740685e-08, "loss": 0.0, "num_input_tokens_seen": 121088040, "step": 179740 }, { "epoch": 4.3912002540737305, "grad_norm": 0.0005244952626526356, "learning_rate": 8.8978012679921e-08, "loss": 0.0, "num_input_tokens_seen": 121092008, "step": 179745 }, { "epoch": 4.391322404905577, "grad_norm": 0.00010663815919542685, "learning_rate": 8.894285111097011e-08, "loss": 0.0318, "num_input_tokens_seen": 121095400, "step": 179750 }, { "epoch": 4.391444555737425, "grad_norm": 4.027450268040411e-05, "learning_rate": 8.890769616747207e-08, "loss": 0.0, "num_input_tokens_seen": 121098344, "step": 179755 }, { "epoch": 4.391566706569272, "grad_norm": 7.695380918448791e-05, "learning_rate": 8.887254784968223e-08, "loss": 0.0, "num_input_tokens_seen": 121101736, "step": 179760 }, { "epoch": 4.391688857401119, "grad_norm": 0.0006177126779220998, "learning_rate": 8.88374061578564e-08, "loss": 0.0, "num_input_tokens_seen": 121104936, "step": 179765 }, { "epoch": 4.391811008232966, "grad_norm": 0.22347557544708252, "learning_rate": 8.880227109224981e-08, "loss": 0.0, "num_input_tokens_seen": 121108648, "step": 179770 }, { "epoch": 4.391933159064814, "grad_norm": 3.539887256920338e-05, "learning_rate": 8.876714265311824e-08, "loss": 0.0, "num_input_tokens_seen": 121111720, "step": 179775 }, { "epoch": 4.39205530989666, "grad_norm": 0.0023835294414311647, "learning_rate": 8.873202084071717e-08, "loss": 0.0, "num_input_tokens_seen": 121115368, "step": 179780 }, { "epoch": 4.392177460728508, "grad_norm": 0.00017241346358787268, "learning_rate": 8.869690565530185e-08, "loss": 0.0, "num_input_tokens_seen": 121118632, "step": 179785 }, { "epoch": 4.392299611560355, "grad_norm": 0.00017596670659258962, "learning_rate": 8.866179709712795e-08, "loss": 0.0, "num_input_tokens_seen": 121121512, "step": 179790 }, { "epoch": 4.3924217623922015, "grad_norm": 0.0005292627029120922, "learning_rate": 8.862669516645039e-08, "loss": 0.0288, "num_input_tokens_seen": 121125672, "step": 179795 }, { "epoch": 4.392543913224049, "grad_norm": 0.00010504711826797575, "learning_rate": 8.85915998635246e-08, "loss": 0.0, "num_input_tokens_seen": 121128680, "step": 179800 }, { "epoch": 4.392666064055896, "grad_norm": 0.0018456984544172883, "learning_rate": 8.855651118860608e-08, "loss": 0.0299, "num_input_tokens_seen": 121132264, "step": 179805 }, { "epoch": 4.3927882148877435, "grad_norm": 0.0027914142701774836, "learning_rate": 8.85214291419496e-08, "loss": 0.0, "num_input_tokens_seen": 121135528, "step": 179810 }, { "epoch": 4.39291036571959, "grad_norm": 0.01401690673083067, "learning_rate": 8.848635372381052e-08, "loss": 0.0, "num_input_tokens_seen": 121139304, "step": 179815 }, { "epoch": 4.393032516551438, "grad_norm": 9.930646228895057e-06, "learning_rate": 8.845128493444365e-08, "loss": 0.0, "num_input_tokens_seen": 121142568, "step": 179820 }, { "epoch": 4.393154667383285, "grad_norm": 0.0008397593628615141, "learning_rate": 8.841622277410455e-08, "loss": 0.0, "num_input_tokens_seen": 121146216, "step": 179825 }, { "epoch": 4.393276818215132, "grad_norm": 0.0007986437994986773, "learning_rate": 8.838116724304767e-08, "loss": 0.0, "num_input_tokens_seen": 121149480, "step": 179830 }, { "epoch": 4.393398969046979, "grad_norm": 0.0004919094499200583, "learning_rate": 8.834611834152806e-08, "loss": 0.0, "num_input_tokens_seen": 121152616, "step": 179835 }, { "epoch": 4.393521119878827, "grad_norm": 0.00010752362140920013, "learning_rate": 8.831107606980093e-08, "loss": 0.0, "num_input_tokens_seen": 121156392, "step": 179840 }, { "epoch": 4.393643270710673, "grad_norm": 0.0016955259488895535, "learning_rate": 8.827604042812054e-08, "loss": 0.0, "num_input_tokens_seen": 121159656, "step": 179845 }, { "epoch": 4.393765421542521, "grad_norm": 0.010647977702319622, "learning_rate": 8.824101141674234e-08, "loss": 0.0, "num_input_tokens_seen": 121162728, "step": 179850 }, { "epoch": 4.393887572374368, "grad_norm": 0.0001815099676605314, "learning_rate": 8.820598903592047e-08, "loss": 0.0, "num_input_tokens_seen": 121166248, "step": 179855 }, { "epoch": 4.394009723206215, "grad_norm": 0.0004509559366852045, "learning_rate": 8.817097328591005e-08, "loss": 0.0, "num_input_tokens_seen": 121169256, "step": 179860 }, { "epoch": 4.394131874038062, "grad_norm": 6.720137119293213, "learning_rate": 8.813596416696544e-08, "loss": 0.0001, "num_input_tokens_seen": 121172584, "step": 179865 }, { "epoch": 4.394254024869909, "grad_norm": 0.0016880101757124066, "learning_rate": 8.810096167934133e-08, "loss": 0.0, "num_input_tokens_seen": 121175848, "step": 179870 }, { "epoch": 4.3943761757017565, "grad_norm": 0.0022065758239477873, "learning_rate": 8.806596582329251e-08, "loss": 0.0, "num_input_tokens_seen": 121178856, "step": 179875 }, { "epoch": 4.394498326533603, "grad_norm": 0.0007906149257905781, "learning_rate": 8.8030976599073e-08, "loss": 0.0, "num_input_tokens_seen": 121182056, "step": 179880 }, { "epoch": 4.394620477365451, "grad_norm": 0.00020999391563236713, "learning_rate": 8.79959940069378e-08, "loss": 0.0, "num_input_tokens_seen": 121185832, "step": 179885 }, { "epoch": 4.394742628197298, "grad_norm": 0.0006979668978601694, "learning_rate": 8.796101804714084e-08, "loss": 0.0, "num_input_tokens_seen": 121189288, "step": 179890 }, { "epoch": 4.394864779029145, "grad_norm": 0.0028894988354295492, "learning_rate": 8.792604871993658e-08, "loss": 0.0, "num_input_tokens_seen": 121192680, "step": 179895 }, { "epoch": 4.394986929860992, "grad_norm": 0.06202784553170204, "learning_rate": 8.78910860255797e-08, "loss": 0.0003, "num_input_tokens_seen": 121196200, "step": 179900 }, { "epoch": 4.39510908069284, "grad_norm": 8.067709131864831e-05, "learning_rate": 8.78561299643239e-08, "loss": 0.0, "num_input_tokens_seen": 121199336, "step": 179905 }, { "epoch": 4.395231231524686, "grad_norm": 0.0005007724976167083, "learning_rate": 8.782118053642384e-08, "loss": 0.0, "num_input_tokens_seen": 121202472, "step": 179910 }, { "epoch": 4.395353382356534, "grad_norm": 2.4386856239289045e-05, "learning_rate": 8.778623774213345e-08, "loss": 0.0, "num_input_tokens_seen": 121205864, "step": 179915 }, { "epoch": 4.395475533188381, "grad_norm": 6.757559458492324e-05, "learning_rate": 8.775130158170697e-08, "loss": 0.0, "num_input_tokens_seen": 121208936, "step": 179920 }, { "epoch": 4.395597684020228, "grad_norm": 0.00019384313782211393, "learning_rate": 8.771637205539817e-08, "loss": 0.0, "num_input_tokens_seen": 121212136, "step": 179925 }, { "epoch": 4.395719834852075, "grad_norm": 0.0002847821160685271, "learning_rate": 8.768144916346155e-08, "loss": 0.0, "num_input_tokens_seen": 121215784, "step": 179930 }, { "epoch": 4.395841985683923, "grad_norm": 0.00039346402627415955, "learning_rate": 8.764653290615065e-08, "loss": 0.0, "num_input_tokens_seen": 121218920, "step": 179935 }, { "epoch": 4.39596413651577, "grad_norm": 9.05127526493743e-05, "learning_rate": 8.761162328371963e-08, "loss": 0.0, "num_input_tokens_seen": 121222696, "step": 179940 }, { "epoch": 4.396086287347617, "grad_norm": 0.0003977123415097594, "learning_rate": 8.757672029642238e-08, "loss": 0.0, "num_input_tokens_seen": 121225576, "step": 179945 }, { "epoch": 4.396208438179464, "grad_norm": 2.6523548513068818e-05, "learning_rate": 8.754182394451249e-08, "loss": 0.0, "num_input_tokens_seen": 121229288, "step": 179950 }, { "epoch": 4.396330589011312, "grad_norm": 0.02637898176908493, "learning_rate": 8.750693422824374e-08, "loss": 0.0336, "num_input_tokens_seen": 121232488, "step": 179955 }, { "epoch": 4.396452739843158, "grad_norm": 0.0001352629333268851, "learning_rate": 8.747205114787026e-08, "loss": 0.0, "num_input_tokens_seen": 121236200, "step": 179960 }, { "epoch": 4.396574890675005, "grad_norm": 0.0008541085990145802, "learning_rate": 8.743717470364532e-08, "loss": 0.0, "num_input_tokens_seen": 121239592, "step": 179965 }, { "epoch": 4.396697041506853, "grad_norm": 0.0006383212166838348, "learning_rate": 8.740230489582278e-08, "loss": 0.0, "num_input_tokens_seen": 121243240, "step": 179970 }, { "epoch": 4.3968191923386994, "grad_norm": 0.0004918515332974494, "learning_rate": 8.736744172465604e-08, "loss": 0.0, "num_input_tokens_seen": 121246760, "step": 179975 }, { "epoch": 4.396941343170547, "grad_norm": 0.004548352677375078, "learning_rate": 8.733258519039888e-08, "loss": 0.0, "num_input_tokens_seen": 121250280, "step": 179980 }, { "epoch": 4.397063494002394, "grad_norm": 0.0026537757366895676, "learning_rate": 8.729773529330442e-08, "loss": 0.0, "num_input_tokens_seen": 121252968, "step": 179985 }, { "epoch": 4.3971856448342415, "grad_norm": 0.01826804131269455, "learning_rate": 8.726289203362636e-08, "loss": 0.0, "num_input_tokens_seen": 121255912, "step": 179990 }, { "epoch": 4.397307795666088, "grad_norm": 0.0012325093848630786, "learning_rate": 8.722805541161826e-08, "loss": 0.0, "num_input_tokens_seen": 121259176, "step": 179995 }, { "epoch": 4.397429946497936, "grad_norm": 0.00019591573800425977, "learning_rate": 8.719322542753305e-08, "loss": 0.0, "num_input_tokens_seen": 121262248, "step": 180000 }, { "epoch": 4.397552097329783, "grad_norm": 0.0003988531534560025, "learning_rate": 8.715840208162439e-08, "loss": 0.0, "num_input_tokens_seen": 121265576, "step": 180005 }, { "epoch": 4.39767424816163, "grad_norm": 0.00022057151363696903, "learning_rate": 8.71235853741451e-08, "loss": 0.0, "num_input_tokens_seen": 121269096, "step": 180010 }, { "epoch": 4.397796398993477, "grad_norm": 4.593012636178173e-05, "learning_rate": 8.708877530534897e-08, "loss": 0.0, "num_input_tokens_seen": 121272360, "step": 180015 }, { "epoch": 4.397918549825325, "grad_norm": 0.0004291969817131758, "learning_rate": 8.705397187548846e-08, "loss": 0.0, "num_input_tokens_seen": 121275496, "step": 180020 }, { "epoch": 4.398040700657171, "grad_norm": 0.0010543052339926362, "learning_rate": 8.701917508481715e-08, "loss": 0.0, "num_input_tokens_seen": 121279272, "step": 180025 }, { "epoch": 4.398162851489019, "grad_norm": 5.976646571070887e-05, "learning_rate": 8.698438493358806e-08, "loss": 0.0, "num_input_tokens_seen": 121282472, "step": 180030 }, { "epoch": 4.398285002320866, "grad_norm": 0.001080619520507753, "learning_rate": 8.69496014220541e-08, "loss": 0.0, "num_input_tokens_seen": 121286056, "step": 180035 }, { "epoch": 4.398407153152713, "grad_norm": 0.0015729547012597322, "learning_rate": 8.691482455046806e-08, "loss": 0.0, "num_input_tokens_seen": 121289448, "step": 180040 }, { "epoch": 4.39852930398456, "grad_norm": 0.0009298583609052002, "learning_rate": 8.688005431908318e-08, "loss": 0.0, "num_input_tokens_seen": 121292840, "step": 180045 }, { "epoch": 4.398651454816408, "grad_norm": 0.0070594120770692825, "learning_rate": 8.684529072815206e-08, "loss": 0.0, "num_input_tokens_seen": 121296360, "step": 180050 }, { "epoch": 4.3987736056482545, "grad_norm": 0.0002929186448454857, "learning_rate": 8.681053377792768e-08, "loss": 0.0, "num_input_tokens_seen": 121299752, "step": 180055 }, { "epoch": 4.398895756480101, "grad_norm": 0.00671204412356019, "learning_rate": 8.677578346866254e-08, "loss": 0.0, "num_input_tokens_seen": 121303144, "step": 180060 }, { "epoch": 4.399017907311949, "grad_norm": 0.0009458751883357763, "learning_rate": 8.674103980060964e-08, "loss": 0.0, "num_input_tokens_seen": 121306856, "step": 180065 }, { "epoch": 4.399140058143796, "grad_norm": 0.002767004305496812, "learning_rate": 8.670630277402147e-08, "loss": 0.0, "num_input_tokens_seen": 121310248, "step": 180070 }, { "epoch": 4.399262208975643, "grad_norm": 0.0006484072655439377, "learning_rate": 8.667157238915079e-08, "loss": 0.0, "num_input_tokens_seen": 121313768, "step": 180075 }, { "epoch": 4.39938435980749, "grad_norm": 0.00034164972021244466, "learning_rate": 8.663684864624998e-08, "loss": 0.0, "num_input_tokens_seen": 121317736, "step": 180080 }, { "epoch": 4.399506510639338, "grad_norm": 0.00013225118163973093, "learning_rate": 8.660213154557172e-08, "loss": 0.0, "num_input_tokens_seen": 121320616, "step": 180085 }, { "epoch": 4.399628661471184, "grad_norm": 0.0022325459867715836, "learning_rate": 8.656742108736859e-08, "loss": 0.0, "num_input_tokens_seen": 121324328, "step": 180090 }, { "epoch": 4.399750812303032, "grad_norm": 0.03179163113236427, "learning_rate": 8.653271727189259e-08, "loss": 0.0, "num_input_tokens_seen": 121328296, "step": 180095 }, { "epoch": 4.399872963134879, "grad_norm": 0.0007358693983405828, "learning_rate": 8.649802009939666e-08, "loss": 0.0, "num_input_tokens_seen": 121331752, "step": 180100 }, { "epoch": 4.399995113966726, "grad_norm": 0.0012749300803989172, "learning_rate": 8.646332957013258e-08, "loss": 0.0, "num_input_tokens_seen": 121334952, "step": 180105 }, { "epoch": 4.400117264798573, "grad_norm": 0.00021154101705178618, "learning_rate": 8.642864568435281e-08, "loss": 0.0, "num_input_tokens_seen": 121338792, "step": 180110 }, { "epoch": 4.400239415630421, "grad_norm": 7.006477244431153e-05, "learning_rate": 8.639396844230995e-08, "loss": 0.09, "num_input_tokens_seen": 121342120, "step": 180115 }, { "epoch": 4.4003615664622675, "grad_norm": 0.002877721330150962, "learning_rate": 8.635929784425566e-08, "loss": 0.105, "num_input_tokens_seen": 121345576, "step": 180120 }, { "epoch": 4.400483717294115, "grad_norm": 0.0005133976228535175, "learning_rate": 8.632463389044253e-08, "loss": 0.0, "num_input_tokens_seen": 121349288, "step": 180125 }, { "epoch": 4.400605868125962, "grad_norm": 0.00018399501277599484, "learning_rate": 8.628997658112214e-08, "loss": 0.0, "num_input_tokens_seen": 121353448, "step": 180130 }, { "epoch": 4.4007280189578095, "grad_norm": 0.0013751068618148565, "learning_rate": 8.625532591654705e-08, "loss": 0.0, "num_input_tokens_seen": 121356648, "step": 180135 }, { "epoch": 4.400850169789656, "grad_norm": 8.092206553556025e-05, "learning_rate": 8.622068189696896e-08, "loss": 0.0, "num_input_tokens_seen": 121359912, "step": 180140 }, { "epoch": 4.400972320621504, "grad_norm": 0.00022807734785601497, "learning_rate": 8.618604452263967e-08, "loss": 0.0, "num_input_tokens_seen": 121363176, "step": 180145 }, { "epoch": 4.401094471453351, "grad_norm": 3.466855196165852e-05, "learning_rate": 8.615141379381141e-08, "loss": 0.0, "num_input_tokens_seen": 121366504, "step": 180150 }, { "epoch": 4.401216622285197, "grad_norm": 0.0007562345126643777, "learning_rate": 8.611678971073577e-08, "loss": 0.0, "num_input_tokens_seen": 121370088, "step": 180155 }, { "epoch": 4.401338773117045, "grad_norm": 0.005128931254148483, "learning_rate": 8.608217227366465e-08, "loss": 0.0, "num_input_tokens_seen": 121373736, "step": 180160 }, { "epoch": 4.401460923948892, "grad_norm": 8.437899668933824e-05, "learning_rate": 8.604756148284975e-08, "loss": 0.0, "num_input_tokens_seen": 121376936, "step": 180165 }, { "epoch": 4.401583074780739, "grad_norm": 0.0020770810078829527, "learning_rate": 8.601295733854297e-08, "loss": 0.0, "num_input_tokens_seen": 121380264, "step": 180170 }, { "epoch": 4.401705225612586, "grad_norm": 0.00014438113430514932, "learning_rate": 8.597835984099566e-08, "loss": 0.0001, "num_input_tokens_seen": 121383848, "step": 180175 }, { "epoch": 4.401827376444434, "grad_norm": 8.756471288506873e-06, "learning_rate": 8.59437689904594e-08, "loss": 0.0, "num_input_tokens_seen": 121387112, "step": 180180 }, { "epoch": 4.4019495272762805, "grad_norm": 0.0019400314195081592, "learning_rate": 8.590918478718623e-08, "loss": 0.0, "num_input_tokens_seen": 121390760, "step": 180185 }, { "epoch": 4.402071678108128, "grad_norm": 0.0010442528873682022, "learning_rate": 8.587460723142715e-08, "loss": 0.0, "num_input_tokens_seen": 121394088, "step": 180190 }, { "epoch": 4.402193828939975, "grad_norm": 0.0024224664084613323, "learning_rate": 8.584003632343384e-08, "loss": 0.0, "num_input_tokens_seen": 121397608, "step": 180195 }, { "epoch": 4.4023159797718225, "grad_norm": 0.0025445320643484592, "learning_rate": 8.580547206345767e-08, "loss": 0.0, "num_input_tokens_seen": 121401960, "step": 180200 }, { "epoch": 4.402438130603669, "grad_norm": 0.0017734251450747252, "learning_rate": 8.577091445174988e-08, "loss": 0.0, "num_input_tokens_seen": 121405544, "step": 180205 }, { "epoch": 4.402560281435517, "grad_norm": 0.011874084360897541, "learning_rate": 8.573636348856205e-08, "loss": 0.0, "num_input_tokens_seen": 121409128, "step": 180210 }, { "epoch": 4.402682432267364, "grad_norm": 0.01571572758257389, "learning_rate": 8.570181917414521e-08, "loss": 0.0, "num_input_tokens_seen": 121412008, "step": 180215 }, { "epoch": 4.402804583099211, "grad_norm": 0.0001697017578408122, "learning_rate": 8.56672815087508e-08, "loss": 0.0, "num_input_tokens_seen": 121415656, "step": 180220 }, { "epoch": 4.402926733931058, "grad_norm": 0.00033848744351416826, "learning_rate": 8.563275049262964e-08, "loss": 0.0, "num_input_tokens_seen": 121419240, "step": 180225 }, { "epoch": 4.403048884762905, "grad_norm": 0.00022906869708094746, "learning_rate": 8.559822612603318e-08, "loss": 0.0, "num_input_tokens_seen": 121422312, "step": 180230 }, { "epoch": 4.403171035594752, "grad_norm": 0.005000482778996229, "learning_rate": 8.556370840921235e-08, "loss": 0.0, "num_input_tokens_seen": 121425576, "step": 180235 }, { "epoch": 4.403293186426599, "grad_norm": 0.0008071277989074588, "learning_rate": 8.552919734241825e-08, "loss": 0.0, "num_input_tokens_seen": 121428776, "step": 180240 }, { "epoch": 4.403415337258447, "grad_norm": 4.6842753363307565e-05, "learning_rate": 8.549469292590161e-08, "loss": 0.0, "num_input_tokens_seen": 121431976, "step": 180245 }, { "epoch": 4.4035374880902936, "grad_norm": 0.0004724858154077083, "learning_rate": 8.546019515991376e-08, "loss": 0.0, "num_input_tokens_seen": 121434856, "step": 180250 }, { "epoch": 4.403659638922141, "grad_norm": 8.139275450957939e-05, "learning_rate": 8.542570404470529e-08, "loss": 0.0, "num_input_tokens_seen": 121438184, "step": 180255 }, { "epoch": 4.403781789753988, "grad_norm": 0.0004135536146350205, "learning_rate": 8.539121958052697e-08, "loss": 0.0, "num_input_tokens_seen": 121441768, "step": 180260 }, { "epoch": 4.403903940585836, "grad_norm": 0.00012670079013332725, "learning_rate": 8.535674176762986e-08, "loss": 0.0, "num_input_tokens_seen": 121444968, "step": 180265 }, { "epoch": 4.404026091417682, "grad_norm": 0.0009698671055957675, "learning_rate": 8.53222706062643e-08, "loss": 0.0, "num_input_tokens_seen": 121448296, "step": 180270 }, { "epoch": 4.40414824224953, "grad_norm": 0.00019362542661838233, "learning_rate": 8.52878060966813e-08, "loss": 0.0001, "num_input_tokens_seen": 121452072, "step": 180275 }, { "epoch": 4.404270393081377, "grad_norm": 0.000198545036255382, "learning_rate": 8.525334823913156e-08, "loss": 0.0003, "num_input_tokens_seen": 121455272, "step": 180280 }, { "epoch": 4.404392543913224, "grad_norm": 0.00477649737149477, "learning_rate": 8.521889703386532e-08, "loss": 0.0, "num_input_tokens_seen": 121458408, "step": 180285 }, { "epoch": 4.404514694745071, "grad_norm": 0.0031864261254668236, "learning_rate": 8.518445248113338e-08, "loss": 0.0, "num_input_tokens_seen": 121461736, "step": 180290 }, { "epoch": 4.404636845576919, "grad_norm": 0.0018196194432675838, "learning_rate": 8.515001458118609e-08, "loss": 0.0, "num_input_tokens_seen": 121465640, "step": 180295 }, { "epoch": 4.4047589964087654, "grad_norm": 0.006328812334686518, "learning_rate": 8.511558333427393e-08, "loss": 0.0, "num_input_tokens_seen": 121468968, "step": 180300 }, { "epoch": 4.404881147240613, "grad_norm": 0.0008431719616055489, "learning_rate": 8.508115874064759e-08, "loss": 0.0, "num_input_tokens_seen": 121472232, "step": 180305 }, { "epoch": 4.40500329807246, "grad_norm": 0.0002700486802496016, "learning_rate": 8.504674080055685e-08, "loss": 0.0001, "num_input_tokens_seen": 121475752, "step": 180310 }, { "epoch": 4.4051254489043075, "grad_norm": 0.00010356915299780667, "learning_rate": 8.501232951425252e-08, "loss": 0.0, "num_input_tokens_seen": 121479464, "step": 180315 }, { "epoch": 4.405247599736154, "grad_norm": 0.0001315046101808548, "learning_rate": 8.497792488198452e-08, "loss": 0.0, "num_input_tokens_seen": 121483048, "step": 180320 }, { "epoch": 4.405369750568001, "grad_norm": 0.0014662779867649078, "learning_rate": 8.494352690400319e-08, "loss": 0.0, "num_input_tokens_seen": 121486376, "step": 180325 }, { "epoch": 4.405491901399849, "grad_norm": 0.0007219245890155435, "learning_rate": 8.490913558055856e-08, "loss": 0.0, "num_input_tokens_seen": 121489448, "step": 180330 }, { "epoch": 4.405614052231695, "grad_norm": 0.00012071897799614817, "learning_rate": 8.487475091190088e-08, "loss": 0.0, "num_input_tokens_seen": 121492904, "step": 180335 }, { "epoch": 4.405736203063543, "grad_norm": 4.9478978326078504e-05, "learning_rate": 8.484037289828028e-08, "loss": 0.0, "num_input_tokens_seen": 121496424, "step": 180340 }, { "epoch": 4.40585835389539, "grad_norm": 0.003782287472859025, "learning_rate": 8.480600153994666e-08, "loss": 0.0, "num_input_tokens_seen": 121499496, "step": 180345 }, { "epoch": 4.405980504727237, "grad_norm": 0.00012144062930019572, "learning_rate": 8.477163683714972e-08, "loss": 0.0, "num_input_tokens_seen": 121502696, "step": 180350 }, { "epoch": 4.406102655559084, "grad_norm": 0.0005082075367681682, "learning_rate": 8.473727879013981e-08, "loss": 0.0, "num_input_tokens_seen": 121506088, "step": 180355 }, { "epoch": 4.406224806390932, "grad_norm": 2.384355138929095e-05, "learning_rate": 8.470292739916641e-08, "loss": 0.0, "num_input_tokens_seen": 121509672, "step": 180360 }, { "epoch": 4.4063469572227785, "grad_norm": 0.0004144615086261183, "learning_rate": 8.466858266447962e-08, "loss": 0.0004, "num_input_tokens_seen": 121513064, "step": 180365 }, { "epoch": 4.406469108054626, "grad_norm": 0.0003986510564573109, "learning_rate": 8.463424458632906e-08, "loss": 0.0, "num_input_tokens_seen": 121516456, "step": 180370 }, { "epoch": 4.406591258886473, "grad_norm": 0.0025848003569990396, "learning_rate": 8.459991316496452e-08, "loss": 0.0, "num_input_tokens_seen": 121519656, "step": 180375 }, { "epoch": 4.4067134097183205, "grad_norm": 0.0050504072569310665, "learning_rate": 8.456558840063555e-08, "loss": 0.0, "num_input_tokens_seen": 121522728, "step": 180380 }, { "epoch": 4.406835560550167, "grad_norm": 0.00011555037781363353, "learning_rate": 8.453127029359197e-08, "loss": 0.0, "num_input_tokens_seen": 121525608, "step": 180385 }, { "epoch": 4.406957711382015, "grad_norm": 0.001160181942395866, "learning_rate": 8.449695884408303e-08, "loss": 0.0, "num_input_tokens_seen": 121528936, "step": 180390 }, { "epoch": 4.407079862213862, "grad_norm": 0.002147308550775051, "learning_rate": 8.446265405235842e-08, "loss": 0.0435, "num_input_tokens_seen": 121532008, "step": 180395 }, { "epoch": 4.407202013045709, "grad_norm": 15.700284957885742, "learning_rate": 8.442835591866792e-08, "loss": 0.0286, "num_input_tokens_seen": 121535144, "step": 180400 }, { "epoch": 4.407324163877556, "grad_norm": 8.615958358859643e-05, "learning_rate": 8.439406444326047e-08, "loss": 0.0, "num_input_tokens_seen": 121538920, "step": 180405 }, { "epoch": 4.407446314709404, "grad_norm": 0.00015841845015529543, "learning_rate": 8.435977962638574e-08, "loss": 0.0, "num_input_tokens_seen": 121542120, "step": 180410 }, { "epoch": 4.40756846554125, "grad_norm": 3.081785325775854e-05, "learning_rate": 8.432550146829287e-08, "loss": 0.0, "num_input_tokens_seen": 121545512, "step": 180415 }, { "epoch": 4.407690616373097, "grad_norm": 0.004024590831249952, "learning_rate": 8.429122996923143e-08, "loss": 0.0, "num_input_tokens_seen": 121548776, "step": 180420 }, { "epoch": 4.407812767204945, "grad_norm": 8.935786900110543e-05, "learning_rate": 8.425696512945024e-08, "loss": 0.0, "num_input_tokens_seen": 121552232, "step": 180425 }, { "epoch": 4.4079349180367915, "grad_norm": 0.0006876801489852369, "learning_rate": 8.422270694919864e-08, "loss": 0.0001, "num_input_tokens_seen": 121555432, "step": 180430 }, { "epoch": 4.408057068868639, "grad_norm": 6.320657848846167e-05, "learning_rate": 8.41884554287261e-08, "loss": 0.0, "num_input_tokens_seen": 121558696, "step": 180435 }, { "epoch": 4.408179219700486, "grad_norm": 0.01520298607647419, "learning_rate": 8.415421056828132e-08, "loss": 0.0, "num_input_tokens_seen": 121562088, "step": 180440 }, { "epoch": 4.4083013705323335, "grad_norm": 0.00019980291835963726, "learning_rate": 8.411997236811352e-08, "loss": 0.0, "num_input_tokens_seen": 121565416, "step": 180445 }, { "epoch": 4.40842352136418, "grad_norm": 4.798701047548093e-05, "learning_rate": 8.408574082847164e-08, "loss": 0.0001, "num_input_tokens_seen": 121568616, "step": 180450 }, { "epoch": 4.408545672196028, "grad_norm": 0.0002813969331327826, "learning_rate": 8.405151594960447e-08, "loss": 0.0, "num_input_tokens_seen": 121572264, "step": 180455 }, { "epoch": 4.408667823027875, "grad_norm": 0.0001570227468619123, "learning_rate": 8.401729773176114e-08, "loss": 0.0, "num_input_tokens_seen": 121575528, "step": 180460 }, { "epoch": 4.408789973859722, "grad_norm": 9.800500265555456e-05, "learning_rate": 8.398308617519024e-08, "loss": 0.0, "num_input_tokens_seen": 121578536, "step": 180465 }, { "epoch": 4.408912124691569, "grad_norm": 5.589954889728688e-05, "learning_rate": 8.394888128014099e-08, "loss": 0.0, "num_input_tokens_seen": 121582184, "step": 180470 }, { "epoch": 4.409034275523417, "grad_norm": 0.0007112363819032907, "learning_rate": 8.391468304686155e-08, "loss": 0.0, "num_input_tokens_seen": 121585256, "step": 180475 }, { "epoch": 4.409156426355263, "grad_norm": 0.0006815246306359768, "learning_rate": 8.388049147560117e-08, "loss": 0.0, "num_input_tokens_seen": 121588712, "step": 180480 }, { "epoch": 4.409278577187111, "grad_norm": 0.010045552626252174, "learning_rate": 8.384630656660807e-08, "loss": 0.0, "num_input_tokens_seen": 121592104, "step": 180485 }, { "epoch": 4.409400728018958, "grad_norm": 0.015227672643959522, "learning_rate": 8.381212832013107e-08, "loss": 0.0, "num_input_tokens_seen": 121595240, "step": 180490 }, { "epoch": 4.4095228788508045, "grad_norm": 0.0007747916970402002, "learning_rate": 8.377795673641886e-08, "loss": 0.0, "num_input_tokens_seen": 121598440, "step": 180495 }, { "epoch": 4.409645029682652, "grad_norm": 0.059335384517908096, "learning_rate": 8.374379181571967e-08, "loss": 0.0, "num_input_tokens_seen": 121601704, "step": 180500 }, { "epoch": 4.409767180514499, "grad_norm": 0.00015939258446451277, "learning_rate": 8.37096335582822e-08, "loss": 0.0, "num_input_tokens_seen": 121604968, "step": 180505 }, { "epoch": 4.4098893313463465, "grad_norm": 0.0002897983358707279, "learning_rate": 8.367548196435447e-08, "loss": 0.0, "num_input_tokens_seen": 121608104, "step": 180510 }, { "epoch": 4.410011482178193, "grad_norm": 0.00018529892258811742, "learning_rate": 8.364133703418518e-08, "loss": 0.0, "num_input_tokens_seen": 121611112, "step": 180515 }, { "epoch": 4.410133633010041, "grad_norm": 0.00022547252592630684, "learning_rate": 8.360719876802269e-08, "loss": 0.0388, "num_input_tokens_seen": 121614504, "step": 180520 }, { "epoch": 4.410255783841888, "grad_norm": 1.873561996035278e-05, "learning_rate": 8.357306716611501e-08, "loss": 0.0, "num_input_tokens_seen": 121617896, "step": 180525 }, { "epoch": 4.410377934673735, "grad_norm": 0.00014970744086895138, "learning_rate": 8.35389422287106e-08, "loss": 0.0, "num_input_tokens_seen": 121621096, "step": 180530 }, { "epoch": 4.410500085505582, "grad_norm": 0.0071835326962172985, "learning_rate": 8.35048239560574e-08, "loss": 0.0, "num_input_tokens_seen": 121624616, "step": 180535 }, { "epoch": 4.41062223633743, "grad_norm": 2.6335650545661338e-05, "learning_rate": 8.347071234840374e-08, "loss": 0.0, "num_input_tokens_seen": 121627688, "step": 180540 }, { "epoch": 4.410744387169276, "grad_norm": 0.00024163529451470822, "learning_rate": 8.343660740599745e-08, "loss": 0.0, "num_input_tokens_seen": 121631144, "step": 180545 }, { "epoch": 4.410866538001124, "grad_norm": 0.00012851272185798734, "learning_rate": 8.340250912908675e-08, "loss": 0.0, "num_input_tokens_seen": 121634280, "step": 180550 }, { "epoch": 4.410988688832971, "grad_norm": 3.004685277119279e-05, "learning_rate": 8.336841751791946e-08, "loss": 0.0, "num_input_tokens_seen": 121637864, "step": 180555 }, { "epoch": 4.411110839664818, "grad_norm": 0.000623641477432102, "learning_rate": 8.333433257274369e-08, "loss": 0.0, "num_input_tokens_seen": 121641320, "step": 180560 }, { "epoch": 4.411232990496665, "grad_norm": 6.0218488215468824e-05, "learning_rate": 8.330025429380727e-08, "loss": 0.0, "num_input_tokens_seen": 121644584, "step": 180565 }, { "epoch": 4.411355141328513, "grad_norm": 0.000500671798363328, "learning_rate": 8.326618268135776e-08, "loss": 0.0, "num_input_tokens_seen": 121647592, "step": 180570 }, { "epoch": 4.4114772921603596, "grad_norm": 0.00015049302601255476, "learning_rate": 8.323211773564331e-08, "loss": 0.0, "num_input_tokens_seen": 121650664, "step": 180575 }, { "epoch": 4.411599442992207, "grad_norm": 7.418448512908071e-05, "learning_rate": 8.319805945691127e-08, "loss": 0.0421, "num_input_tokens_seen": 121653928, "step": 180580 }, { "epoch": 4.411721593824054, "grad_norm": 7.855272997403517e-05, "learning_rate": 8.316400784540966e-08, "loss": 0.0, "num_input_tokens_seen": 121657576, "step": 180585 }, { "epoch": 4.411843744655901, "grad_norm": 0.00044542065006680787, "learning_rate": 8.312996290138607e-08, "loss": 0.0, "num_input_tokens_seen": 121660904, "step": 180590 }, { "epoch": 4.411965895487748, "grad_norm": 7.770089723635465e-05, "learning_rate": 8.309592462508774e-08, "loss": 0.0, "num_input_tokens_seen": 121664232, "step": 180595 }, { "epoch": 4.412088046319595, "grad_norm": 2.8739916615450056e-06, "learning_rate": 8.306189301676281e-08, "loss": 0.0, "num_input_tokens_seen": 121667496, "step": 180600 }, { "epoch": 4.412210197151443, "grad_norm": 0.0005004116101190448, "learning_rate": 8.302786807665807e-08, "loss": 0.0, "num_input_tokens_seen": 121670440, "step": 180605 }, { "epoch": 4.412332347983289, "grad_norm": 0.00010662328713806346, "learning_rate": 8.299384980502144e-08, "loss": 0.0003, "num_input_tokens_seen": 121673576, "step": 180610 }, { "epoch": 4.412454498815137, "grad_norm": 0.0006866508047096431, "learning_rate": 8.295983820210028e-08, "loss": 0.0, "num_input_tokens_seen": 121676840, "step": 180615 }, { "epoch": 4.412576649646984, "grad_norm": 0.014268654398620129, "learning_rate": 8.292583326814173e-08, "loss": 0.0003, "num_input_tokens_seen": 121680104, "step": 180620 }, { "epoch": 4.4126988004788315, "grad_norm": 0.0005359610659070313, "learning_rate": 8.289183500339337e-08, "loss": 0.0, "num_input_tokens_seen": 121683496, "step": 180625 }, { "epoch": 4.412820951310678, "grad_norm": 0.006035420577973127, "learning_rate": 8.285784340810198e-08, "loss": 0.0, "num_input_tokens_seen": 121687016, "step": 180630 }, { "epoch": 4.412943102142526, "grad_norm": 7.512760203098878e-05, "learning_rate": 8.282385848251527e-08, "loss": 0.0, "num_input_tokens_seen": 121690152, "step": 180635 }, { "epoch": 4.413065252974373, "grad_norm": 0.0006038388819433749, "learning_rate": 8.278988022687994e-08, "loss": 0.0, "num_input_tokens_seen": 121693416, "step": 180640 }, { "epoch": 4.41318740380622, "grad_norm": 0.10882148891687393, "learning_rate": 8.275590864144333e-08, "loss": 0.0001, "num_input_tokens_seen": 121696808, "step": 180645 }, { "epoch": 4.413309554638067, "grad_norm": 6.010133438394405e-05, "learning_rate": 8.27219437264527e-08, "loss": 0.0, "num_input_tokens_seen": 121700712, "step": 180650 }, { "epoch": 4.413431705469915, "grad_norm": 0.00010217710223514587, "learning_rate": 8.268798548215472e-08, "loss": 0.0001, "num_input_tokens_seen": 121704808, "step": 180655 }, { "epoch": 4.413553856301761, "grad_norm": 0.00018335915228817612, "learning_rate": 8.265403390879633e-08, "loss": 0.0, "num_input_tokens_seen": 121707880, "step": 180660 }, { "epoch": 4.413676007133609, "grad_norm": 0.00040197354974225163, "learning_rate": 8.262008900662464e-08, "loss": 0.0354, "num_input_tokens_seen": 121711080, "step": 180665 }, { "epoch": 4.413798157965456, "grad_norm": 0.002384263789281249, "learning_rate": 8.258615077588627e-08, "loss": 0.0, "num_input_tokens_seen": 121714536, "step": 180670 }, { "epoch": 4.413920308797303, "grad_norm": 0.0006682064849883318, "learning_rate": 8.255221921682831e-08, "loss": 0.0, "num_input_tokens_seen": 121718120, "step": 180675 }, { "epoch": 4.41404245962915, "grad_norm": 0.0005522278370335698, "learning_rate": 8.251829432969726e-08, "loss": 0.0, "num_input_tokens_seen": 121721320, "step": 180680 }, { "epoch": 4.414164610460997, "grad_norm": 0.0015256262850016356, "learning_rate": 8.248437611474013e-08, "loss": 0.0, "num_input_tokens_seen": 121724776, "step": 180685 }, { "epoch": 4.4142867612928445, "grad_norm": 7.958729838719591e-06, "learning_rate": 8.245046457220317e-08, "loss": 0.0, "num_input_tokens_seen": 121728424, "step": 180690 }, { "epoch": 4.414408912124691, "grad_norm": 0.00025719229597598314, "learning_rate": 8.241655970233341e-08, "loss": 0.0, "num_input_tokens_seen": 121731496, "step": 180695 }, { "epoch": 4.414531062956539, "grad_norm": 0.00022496600286103785, "learning_rate": 8.238266150537699e-08, "loss": 0.0, "num_input_tokens_seen": 121734696, "step": 180700 }, { "epoch": 4.414653213788386, "grad_norm": 0.0012882340233772993, "learning_rate": 8.23487699815808e-08, "loss": 0.0, "num_input_tokens_seen": 121738664, "step": 180705 }, { "epoch": 4.414775364620233, "grad_norm": 0.013423847034573555, "learning_rate": 8.231488513119123e-08, "loss": 0.0, "num_input_tokens_seen": 121742120, "step": 180710 }, { "epoch": 4.41489751545208, "grad_norm": 9.619421325623989e-05, "learning_rate": 8.22810069544545e-08, "loss": 0.0, "num_input_tokens_seen": 121745960, "step": 180715 }, { "epoch": 4.415019666283928, "grad_norm": 1.3759875400864985e-05, "learning_rate": 8.224713545161732e-08, "loss": 0.0, "num_input_tokens_seen": 121749800, "step": 180720 }, { "epoch": 4.415141817115774, "grad_norm": 9.072878310689703e-05, "learning_rate": 8.221327062292571e-08, "loss": 0.0, "num_input_tokens_seen": 121753704, "step": 180725 }, { "epoch": 4.415263967947622, "grad_norm": 4.327476199250668e-05, "learning_rate": 8.217941246862614e-08, "loss": 0.0, "num_input_tokens_seen": 121756840, "step": 180730 }, { "epoch": 4.415386118779469, "grad_norm": 0.00012532465916592628, "learning_rate": 8.214556098896464e-08, "loss": 0.0, "num_input_tokens_seen": 121760296, "step": 180735 }, { "epoch": 4.415508269611316, "grad_norm": 0.0005663962219841778, "learning_rate": 8.211171618418744e-08, "loss": 0.0, "num_input_tokens_seen": 121764008, "step": 180740 }, { "epoch": 4.415630420443163, "grad_norm": 0.0007085498655214906, "learning_rate": 8.20778780545408e-08, "loss": 0.0001, "num_input_tokens_seen": 121767336, "step": 180745 }, { "epoch": 4.415752571275011, "grad_norm": 0.0009212750592269003, "learning_rate": 8.204404660027065e-08, "loss": 0.0, "num_input_tokens_seen": 121770600, "step": 180750 }, { "epoch": 4.4158747221068575, "grad_norm": 2.3134165530791506e-05, "learning_rate": 8.201022182162332e-08, "loss": 0.0, "num_input_tokens_seen": 121773864, "step": 180755 }, { "epoch": 4.415996872938704, "grad_norm": 0.00025919542531482875, "learning_rate": 8.197640371884429e-08, "loss": 0.0, "num_input_tokens_seen": 121777256, "step": 180760 }, { "epoch": 4.416119023770552, "grad_norm": 0.00022408708173315972, "learning_rate": 8.194259229218003e-08, "loss": 0.0, "num_input_tokens_seen": 121781096, "step": 180765 }, { "epoch": 4.4162411746023995, "grad_norm": 0.0001324850891251117, "learning_rate": 8.190878754187614e-08, "loss": 0.0, "num_input_tokens_seen": 121784616, "step": 180770 }, { "epoch": 4.416363325434246, "grad_norm": 3.3088224881794304e-05, "learning_rate": 8.187498946817828e-08, "loss": 0.0, "num_input_tokens_seen": 121787624, "step": 180775 }, { "epoch": 4.416485476266093, "grad_norm": 4.933834497933276e-05, "learning_rate": 8.18411980713326e-08, "loss": 0.0002, "num_input_tokens_seen": 121791272, "step": 180780 }, { "epoch": 4.416607627097941, "grad_norm": 0.00011093864304712042, "learning_rate": 8.180741335158458e-08, "loss": 0.0, "num_input_tokens_seen": 121794792, "step": 180785 }, { "epoch": 4.416729777929787, "grad_norm": 0.006448894739151001, "learning_rate": 8.177363530918013e-08, "loss": 0.0, "num_input_tokens_seen": 121798632, "step": 180790 }, { "epoch": 4.416851928761635, "grad_norm": 4.5410546590574086e-05, "learning_rate": 8.173986394436461e-08, "loss": 0.0, "num_input_tokens_seen": 121802024, "step": 180795 }, { "epoch": 4.416974079593482, "grad_norm": 0.010403868742287159, "learning_rate": 8.17060992573838e-08, "loss": 0.0, "num_input_tokens_seen": 121805736, "step": 180800 }, { "epoch": 4.417096230425329, "grad_norm": 0.00030534606776200235, "learning_rate": 8.167234124848344e-08, "loss": 0.0, "num_input_tokens_seen": 121808936, "step": 180805 }, { "epoch": 4.417218381257176, "grad_norm": 0.0001068161946022883, "learning_rate": 8.163858991790861e-08, "loss": 0.0, "num_input_tokens_seen": 121812008, "step": 180810 }, { "epoch": 4.417340532089024, "grad_norm": 9.78023890638724e-05, "learning_rate": 8.160484526590516e-08, "loss": 0.0, "num_input_tokens_seen": 121815592, "step": 180815 }, { "epoch": 4.4174626829208705, "grad_norm": 0.0007254159427247941, "learning_rate": 8.157110729271799e-08, "loss": 0.0, "num_input_tokens_seen": 121818600, "step": 180820 }, { "epoch": 4.417584833752718, "grad_norm": 0.000565720722079277, "learning_rate": 8.153737599859312e-08, "loss": 0.0, "num_input_tokens_seen": 121821672, "step": 180825 }, { "epoch": 4.417706984584565, "grad_norm": 0.005168421193957329, "learning_rate": 8.150365138377513e-08, "loss": 0.0, "num_input_tokens_seen": 121825512, "step": 180830 }, { "epoch": 4.4178291354164125, "grad_norm": 9.393416985403746e-05, "learning_rate": 8.146993344850973e-08, "loss": 0.0, "num_input_tokens_seen": 121828712, "step": 180835 }, { "epoch": 4.417951286248259, "grad_norm": 0.0016235082875937223, "learning_rate": 8.143622219304225e-08, "loss": 0.0, "num_input_tokens_seen": 121831784, "step": 180840 }, { "epoch": 4.418073437080107, "grad_norm": 3.2696454582037404e-05, "learning_rate": 8.140251761761741e-08, "loss": 0.0, "num_input_tokens_seen": 121835688, "step": 180845 }, { "epoch": 4.418195587911954, "grad_norm": 0.0008309634868055582, "learning_rate": 8.136881972248067e-08, "loss": 0.0, "num_input_tokens_seen": 121839016, "step": 180850 }, { "epoch": 4.4183177387438, "grad_norm": 0.00010062227374874055, "learning_rate": 8.133512850787682e-08, "loss": 0.0, "num_input_tokens_seen": 121842984, "step": 180855 }, { "epoch": 4.418439889575648, "grad_norm": 0.000737546244636178, "learning_rate": 8.130144397405114e-08, "loss": 0.0392, "num_input_tokens_seen": 121846504, "step": 180860 }, { "epoch": 4.418562040407495, "grad_norm": 5.2958694141125306e-05, "learning_rate": 8.12677661212483e-08, "loss": 0.0, "num_input_tokens_seen": 121849832, "step": 180865 }, { "epoch": 4.418684191239342, "grad_norm": 0.004930529743432999, "learning_rate": 8.123409494971356e-08, "loss": 0.0, "num_input_tokens_seen": 121853224, "step": 180870 }, { "epoch": 4.418806342071189, "grad_norm": 0.012600020505487919, "learning_rate": 8.120043045969161e-08, "loss": 0.0, "num_input_tokens_seen": 121856616, "step": 180875 }, { "epoch": 4.418928492903037, "grad_norm": 2.7497879273141734e-05, "learning_rate": 8.116677265142713e-08, "loss": 0.0, "num_input_tokens_seen": 121859752, "step": 180880 }, { "epoch": 4.4190506437348835, "grad_norm": 0.00012461999722290784, "learning_rate": 8.113312152516516e-08, "loss": 0.0, "num_input_tokens_seen": 121862888, "step": 180885 }, { "epoch": 4.419172794566731, "grad_norm": 7.542503590229899e-05, "learning_rate": 8.109947708115006e-08, "loss": 0.0, "num_input_tokens_seen": 121866344, "step": 180890 }, { "epoch": 4.419294945398578, "grad_norm": 0.00015688169514760375, "learning_rate": 8.106583931962674e-08, "loss": 0.0, "num_input_tokens_seen": 121869800, "step": 180895 }, { "epoch": 4.419417096230426, "grad_norm": 3.380305861355737e-05, "learning_rate": 8.103220824083989e-08, "loss": 0.0, "num_input_tokens_seen": 121873384, "step": 180900 }, { "epoch": 4.419539247062272, "grad_norm": 0.0002742180950008333, "learning_rate": 8.0998583845034e-08, "loss": 0.0, "num_input_tokens_seen": 121877224, "step": 180905 }, { "epoch": 4.41966139789412, "grad_norm": 0.0028896431904286146, "learning_rate": 8.096496613245363e-08, "loss": 0.0, "num_input_tokens_seen": 121880040, "step": 180910 }, { "epoch": 4.419783548725967, "grad_norm": 6.235108594410121e-05, "learning_rate": 8.093135510334304e-08, "loss": 0.0, "num_input_tokens_seen": 121883496, "step": 180915 }, { "epoch": 4.419905699557814, "grad_norm": 0.004725324921309948, "learning_rate": 8.089775075794691e-08, "loss": 0.0, "num_input_tokens_seen": 121886696, "step": 180920 }, { "epoch": 4.420027850389661, "grad_norm": 0.001249430701136589, "learning_rate": 8.086415309650962e-08, "loss": 0.0, "num_input_tokens_seen": 121890024, "step": 180925 }, { "epoch": 4.420150001221509, "grad_norm": 0.00012448421330191195, "learning_rate": 8.08305621192753e-08, "loss": 0.0, "num_input_tokens_seen": 121893288, "step": 180930 }, { "epoch": 4.420272152053355, "grad_norm": 8.14038940006867e-05, "learning_rate": 8.079697782648864e-08, "loss": 0.0, "num_input_tokens_seen": 121896680, "step": 180935 }, { "epoch": 4.420394302885203, "grad_norm": 0.0013667421881109476, "learning_rate": 8.076340021839323e-08, "loss": 0.0, "num_input_tokens_seen": 121899880, "step": 180940 }, { "epoch": 4.42051645371705, "grad_norm": 0.003148171119391918, "learning_rate": 8.072982929523398e-08, "loss": 0.0, "num_input_tokens_seen": 121903080, "step": 180945 }, { "epoch": 4.420638604548897, "grad_norm": 0.014106903225183487, "learning_rate": 8.069626505725435e-08, "loss": 0.0, "num_input_tokens_seen": 121906216, "step": 180950 }, { "epoch": 4.420760755380744, "grad_norm": 0.00035607683821581304, "learning_rate": 8.066270750469883e-08, "loss": 0.0, "num_input_tokens_seen": 121909160, "step": 180955 }, { "epoch": 4.420882906212591, "grad_norm": 0.00014510061009787023, "learning_rate": 8.062915663781145e-08, "loss": 0.0, "num_input_tokens_seen": 121912744, "step": 180960 }, { "epoch": 4.421005057044439, "grad_norm": 0.006395295727998018, "learning_rate": 8.059561245683622e-08, "loss": 0.0, "num_input_tokens_seen": 121916456, "step": 180965 }, { "epoch": 4.421127207876285, "grad_norm": 0.0009115340071730316, "learning_rate": 8.056207496201672e-08, "loss": 0.0, "num_input_tokens_seen": 121920424, "step": 180970 }, { "epoch": 4.421249358708133, "grad_norm": 0.0064463368616998196, "learning_rate": 8.052854415359744e-08, "loss": 0.0, "num_input_tokens_seen": 121923752, "step": 180975 }, { "epoch": 4.42137150953998, "grad_norm": 8.19075430626981e-05, "learning_rate": 8.049502003182173e-08, "loss": 0.0, "num_input_tokens_seen": 121926760, "step": 180980 }, { "epoch": 4.421493660371827, "grad_norm": 7.664481381652877e-05, "learning_rate": 8.046150259693341e-08, "loss": 0.0, "num_input_tokens_seen": 121930216, "step": 180985 }, { "epoch": 4.421615811203674, "grad_norm": 2.986722029163502e-05, "learning_rate": 8.042799184917647e-08, "loss": 0.0, "num_input_tokens_seen": 121933544, "step": 180990 }, { "epoch": 4.421737962035522, "grad_norm": 0.00506412610411644, "learning_rate": 8.039448778879465e-08, "loss": 0.0, "num_input_tokens_seen": 121937064, "step": 180995 }, { "epoch": 4.4218601128673685, "grad_norm": 0.0002981481666211039, "learning_rate": 8.036099041603117e-08, "loss": 0.0, "num_input_tokens_seen": 121940200, "step": 181000 }, { "epoch": 4.421982263699216, "grad_norm": 0.0007899164338596165, "learning_rate": 8.032749973113017e-08, "loss": 0.0, "num_input_tokens_seen": 121943784, "step": 181005 }, { "epoch": 4.422104414531063, "grad_norm": 4.1584593418519944e-05, "learning_rate": 8.029401573433481e-08, "loss": 0.0, "num_input_tokens_seen": 121946920, "step": 181010 }, { "epoch": 4.4222265653629105, "grad_norm": 0.00011753295257221907, "learning_rate": 8.026053842588876e-08, "loss": 0.0, "num_input_tokens_seen": 121950120, "step": 181015 }, { "epoch": 4.422348716194757, "grad_norm": 0.00019384206098038703, "learning_rate": 8.022706780603549e-08, "loss": 0.0, "num_input_tokens_seen": 121953768, "step": 181020 }, { "epoch": 4.422470867026605, "grad_norm": 0.0009538192534819245, "learning_rate": 8.019360387501839e-08, "loss": 0.0, "num_input_tokens_seen": 121957032, "step": 181025 }, { "epoch": 4.422593017858452, "grad_norm": 0.0008440099190920591, "learning_rate": 8.01601466330808e-08, "loss": 0.0, "num_input_tokens_seen": 121960872, "step": 181030 }, { "epoch": 4.422715168690299, "grad_norm": 0.0011175043182447553, "learning_rate": 8.012669608046596e-08, "loss": 0.0, "num_input_tokens_seen": 121964136, "step": 181035 }, { "epoch": 4.422837319522146, "grad_norm": 0.02144869789481163, "learning_rate": 8.009325221741736e-08, "loss": 0.0, "num_input_tokens_seen": 121967400, "step": 181040 }, { "epoch": 4.422959470353993, "grad_norm": 0.015995550900697708, "learning_rate": 8.00598150441778e-08, "loss": 0.0, "num_input_tokens_seen": 121970600, "step": 181045 }, { "epoch": 4.42308162118584, "grad_norm": 0.0019130867440253496, "learning_rate": 8.002638456099087e-08, "loss": 0.0, "num_input_tokens_seen": 121974120, "step": 181050 }, { "epoch": 4.423203772017687, "grad_norm": 5.404177045420511e-06, "learning_rate": 7.999296076809958e-08, "loss": 0.0, "num_input_tokens_seen": 121977640, "step": 181055 }, { "epoch": 4.423325922849535, "grad_norm": 8.673696720506996e-05, "learning_rate": 7.995954366574675e-08, "loss": 0.0, "num_input_tokens_seen": 121980776, "step": 181060 }, { "epoch": 4.4234480736813815, "grad_norm": 4.539159999694675e-05, "learning_rate": 7.992613325417574e-08, "loss": 0.0, "num_input_tokens_seen": 121984168, "step": 181065 }, { "epoch": 4.423570224513229, "grad_norm": 3.427283445489593e-05, "learning_rate": 7.989272953362924e-08, "loss": 0.0, "num_input_tokens_seen": 121987624, "step": 181070 }, { "epoch": 4.423692375345076, "grad_norm": 0.0009406374301761389, "learning_rate": 7.985933250435039e-08, "loss": 0.0, "num_input_tokens_seen": 121990952, "step": 181075 }, { "epoch": 4.4238145261769235, "grad_norm": 6.120477337390184e-05, "learning_rate": 7.9825942166582e-08, "loss": 0.0, "num_input_tokens_seen": 121994344, "step": 181080 }, { "epoch": 4.42393667700877, "grad_norm": 0.0012227625120431185, "learning_rate": 7.979255852056677e-08, "loss": 0.0, "num_input_tokens_seen": 121997416, "step": 181085 }, { "epoch": 4.424058827840618, "grad_norm": 0.006647599395364523, "learning_rate": 7.97591815665476e-08, "loss": 0.0, "num_input_tokens_seen": 122000552, "step": 181090 }, { "epoch": 4.424180978672465, "grad_norm": 0.00015228458505589515, "learning_rate": 7.972581130476707e-08, "loss": 0.0, "num_input_tokens_seen": 122003688, "step": 181095 }, { "epoch": 4.424303129504312, "grad_norm": 0.00011203240865143016, "learning_rate": 7.969244773546812e-08, "loss": 0.0224, "num_input_tokens_seen": 122006824, "step": 181100 }, { "epoch": 4.424425280336159, "grad_norm": 4.6374192606890574e-05, "learning_rate": 7.965909085889299e-08, "loss": 0.0, "num_input_tokens_seen": 122010344, "step": 181105 }, { "epoch": 4.424547431168007, "grad_norm": 0.006114604417234659, "learning_rate": 7.96257406752846e-08, "loss": 0.0, "num_input_tokens_seen": 122013352, "step": 181110 }, { "epoch": 4.424669581999853, "grad_norm": 0.0006236334447748959, "learning_rate": 7.959239718488553e-08, "loss": 0.0, "num_input_tokens_seen": 122017064, "step": 181115 }, { "epoch": 4.4247917328317, "grad_norm": 0.00038256382686086, "learning_rate": 7.955906038793791e-08, "loss": 0.0, "num_input_tokens_seen": 122020200, "step": 181120 }, { "epoch": 4.424913883663548, "grad_norm": 0.000345156091498211, "learning_rate": 7.952573028468456e-08, "loss": 0.0, "num_input_tokens_seen": 122023528, "step": 181125 }, { "epoch": 4.4250360344953945, "grad_norm": 0.00016293360386043787, "learning_rate": 7.949240687536751e-08, "loss": 0.0001, "num_input_tokens_seen": 122026600, "step": 181130 }, { "epoch": 4.425158185327242, "grad_norm": 0.0001275143149541691, "learning_rate": 7.945909016022934e-08, "loss": 0.0, "num_input_tokens_seen": 122029992, "step": 181135 }, { "epoch": 4.425280336159089, "grad_norm": 0.00010861880582524464, "learning_rate": 7.942578013951217e-08, "loss": 0.0, "num_input_tokens_seen": 122033000, "step": 181140 }, { "epoch": 4.4254024869909365, "grad_norm": 0.00043809949420392513, "learning_rate": 7.93924768134584e-08, "loss": 0.0, "num_input_tokens_seen": 122036200, "step": 181145 }, { "epoch": 4.425524637822783, "grad_norm": 0.008996120654046535, "learning_rate": 7.935918018231024e-08, "loss": 0.0, "num_input_tokens_seen": 122039848, "step": 181150 }, { "epoch": 4.425646788654631, "grad_norm": 0.0006707753636874259, "learning_rate": 7.932589024630953e-08, "loss": 0.0, "num_input_tokens_seen": 122043688, "step": 181155 }, { "epoch": 4.425768939486478, "grad_norm": 0.004772162064909935, "learning_rate": 7.929260700569884e-08, "loss": 0.0, "num_input_tokens_seen": 122047208, "step": 181160 }, { "epoch": 4.425891090318325, "grad_norm": 0.0019871399272233248, "learning_rate": 7.925933046071975e-08, "loss": 0.0, "num_input_tokens_seen": 122050600, "step": 181165 }, { "epoch": 4.426013241150172, "grad_norm": 0.00029408876434899867, "learning_rate": 7.922606061161464e-08, "loss": 0.0, "num_input_tokens_seen": 122053608, "step": 181170 }, { "epoch": 4.42613539198202, "grad_norm": 0.10013458132743835, "learning_rate": 7.919279745862505e-08, "loss": 0.0, "num_input_tokens_seen": 122056872, "step": 181175 }, { "epoch": 4.426257542813866, "grad_norm": 3.233554161852226e-05, "learning_rate": 7.915954100199328e-08, "loss": 0.0, "num_input_tokens_seen": 122060200, "step": 181180 }, { "epoch": 4.426379693645714, "grad_norm": 0.0017619574209675193, "learning_rate": 7.912629124196113e-08, "loss": 0.0, "num_input_tokens_seen": 122063464, "step": 181185 }, { "epoch": 4.426501844477561, "grad_norm": 0.0008039613021537662, "learning_rate": 7.909304817876994e-08, "loss": 0.0, "num_input_tokens_seen": 122066536, "step": 181190 }, { "epoch": 4.426623995309408, "grad_norm": 0.0007726152543909848, "learning_rate": 7.905981181266208e-08, "loss": 0.0, "num_input_tokens_seen": 122069736, "step": 181195 }, { "epoch": 4.426746146141255, "grad_norm": 0.0002162769524147734, "learning_rate": 7.90265821438788e-08, "loss": 0.0, "num_input_tokens_seen": 122073576, "step": 181200 }, { "epoch": 4.426868296973103, "grad_norm": 7.136868953239173e-05, "learning_rate": 7.89933591726618e-08, "loss": 0.0, "num_input_tokens_seen": 122076776, "step": 181205 }, { "epoch": 4.4269904478049495, "grad_norm": 0.0007138107321225107, "learning_rate": 7.896014289925312e-08, "loss": 0.0, "num_input_tokens_seen": 122079848, "step": 181210 }, { "epoch": 4.427112598636796, "grad_norm": 2.201797724410426e-05, "learning_rate": 7.892693332389378e-08, "loss": 0.0, "num_input_tokens_seen": 122083304, "step": 181215 }, { "epoch": 4.427234749468644, "grad_norm": 1.4028590157977305e-05, "learning_rate": 7.889373044682567e-08, "loss": 0.0, "num_input_tokens_seen": 122086504, "step": 181220 }, { "epoch": 4.427356900300491, "grad_norm": 0.0016195435309782624, "learning_rate": 7.886053426828998e-08, "loss": 0.0, "num_input_tokens_seen": 122089704, "step": 181225 }, { "epoch": 4.427479051132338, "grad_norm": 0.00019918143516406417, "learning_rate": 7.882734478852826e-08, "loss": 0.0, "num_input_tokens_seen": 122093224, "step": 181230 }, { "epoch": 4.427601201964185, "grad_norm": 0.00010197334631811827, "learning_rate": 7.8794162007782e-08, "loss": 0.0, "num_input_tokens_seen": 122096616, "step": 181235 }, { "epoch": 4.427723352796033, "grad_norm": 0.002117699710652232, "learning_rate": 7.876098592629221e-08, "loss": 0.0, "num_input_tokens_seen": 122100136, "step": 181240 }, { "epoch": 4.427845503627879, "grad_norm": 4.120414814678952e-05, "learning_rate": 7.872781654430039e-08, "loss": 0.0, "num_input_tokens_seen": 122103336, "step": 181245 }, { "epoch": 4.427967654459727, "grad_norm": 1.0961198313452769e-05, "learning_rate": 7.869465386204765e-08, "loss": 0.0, "num_input_tokens_seen": 122106792, "step": 181250 }, { "epoch": 4.428089805291574, "grad_norm": 0.0006331568001769483, "learning_rate": 7.866149787977528e-08, "loss": 0.0, "num_input_tokens_seen": 122109928, "step": 181255 }, { "epoch": 4.4282119561234214, "grad_norm": 5.89434894209262e-05, "learning_rate": 7.862834859772416e-08, "loss": 0.0, "num_input_tokens_seen": 122113128, "step": 181260 }, { "epoch": 4.428334106955268, "grad_norm": 0.0022513335570693016, "learning_rate": 7.859520601613545e-08, "loss": 0.0, "num_input_tokens_seen": 122116392, "step": 181265 }, { "epoch": 4.428456257787116, "grad_norm": 0.0005072103813290596, "learning_rate": 7.85620701352504e-08, "loss": 0.0, "num_input_tokens_seen": 122119848, "step": 181270 }, { "epoch": 4.428578408618963, "grad_norm": 0.0003140374319627881, "learning_rate": 7.852894095530993e-08, "loss": 0.0, "num_input_tokens_seen": 122123240, "step": 181275 }, { "epoch": 4.42870055945081, "grad_norm": 3.0278168196673505e-05, "learning_rate": 7.849581847655462e-08, "loss": 0.0002, "num_input_tokens_seen": 122126632, "step": 181280 }, { "epoch": 4.428822710282657, "grad_norm": 0.00029192675719968975, "learning_rate": 7.846270269922572e-08, "loss": 0.0, "num_input_tokens_seen": 122130600, "step": 181285 }, { "epoch": 4.428944861114505, "grad_norm": 0.004352732561528683, "learning_rate": 7.842959362356394e-08, "loss": 0.0, "num_input_tokens_seen": 122133736, "step": 181290 }, { "epoch": 4.429067011946351, "grad_norm": 0.0026824241504073143, "learning_rate": 7.839649124980985e-08, "loss": 0.0, "num_input_tokens_seen": 122136936, "step": 181295 }, { "epoch": 4.429189162778199, "grad_norm": 0.0006657766061834991, "learning_rate": 7.836339557820427e-08, "loss": 0.0, "num_input_tokens_seen": 122140648, "step": 181300 }, { "epoch": 4.429311313610046, "grad_norm": 0.005454826634377241, "learning_rate": 7.83303066089882e-08, "loss": 0.0, "num_input_tokens_seen": 122143976, "step": 181305 }, { "epoch": 4.4294334644418925, "grad_norm": 0.00019215767679270357, "learning_rate": 7.829722434240193e-08, "loss": 0.0, "num_input_tokens_seen": 122147496, "step": 181310 }, { "epoch": 4.42955561527374, "grad_norm": 0.001787459827028215, "learning_rate": 7.826414877868626e-08, "loss": 0.0, "num_input_tokens_seen": 122151016, "step": 181315 }, { "epoch": 4.429677766105587, "grad_norm": 0.00047288238420151174, "learning_rate": 7.823107991808143e-08, "loss": 0.0, "num_input_tokens_seen": 122154344, "step": 181320 }, { "epoch": 4.4297999169374345, "grad_norm": 0.0002162763848900795, "learning_rate": 7.819801776082813e-08, "loss": 0.0, "num_input_tokens_seen": 122157864, "step": 181325 }, { "epoch": 4.429922067769281, "grad_norm": 0.0003336450026836246, "learning_rate": 7.816496230716696e-08, "loss": 0.0, "num_input_tokens_seen": 122161448, "step": 181330 }, { "epoch": 4.430044218601129, "grad_norm": 0.0003785964800044894, "learning_rate": 7.813191355733806e-08, "loss": 0.0, "num_input_tokens_seen": 122165288, "step": 181335 }, { "epoch": 4.430166369432976, "grad_norm": 0.0002000385575229302, "learning_rate": 7.809887151158189e-08, "loss": 0.0, "num_input_tokens_seen": 122168360, "step": 181340 }, { "epoch": 4.430288520264823, "grad_norm": 0.00092991505516693, "learning_rate": 7.806583617013851e-08, "loss": 0.0, "num_input_tokens_seen": 122171368, "step": 181345 }, { "epoch": 4.43041067109667, "grad_norm": 0.00020382586808409542, "learning_rate": 7.80328075332486e-08, "loss": 0.0, "num_input_tokens_seen": 122175144, "step": 181350 }, { "epoch": 4.430532821928518, "grad_norm": 0.0013917014002799988, "learning_rate": 7.799978560115184e-08, "loss": 0.0, "num_input_tokens_seen": 122178344, "step": 181355 }, { "epoch": 4.430654972760364, "grad_norm": 0.00021623856446240097, "learning_rate": 7.79667703740886e-08, "loss": 0.0, "num_input_tokens_seen": 122181608, "step": 181360 }, { "epoch": 4.430777123592212, "grad_norm": 0.00034316626260988414, "learning_rate": 7.793376185229928e-08, "loss": 0.0, "num_input_tokens_seen": 122184872, "step": 181365 }, { "epoch": 4.430899274424059, "grad_norm": 0.006357030943036079, "learning_rate": 7.790076003602342e-08, "loss": 0.0, "num_input_tokens_seen": 122188136, "step": 181370 }, { "epoch": 4.431021425255906, "grad_norm": 0.0008824463002383709, "learning_rate": 7.78677649255014e-08, "loss": 0.0, "num_input_tokens_seen": 122191272, "step": 181375 }, { "epoch": 4.431143576087753, "grad_norm": 0.00017288989329244941, "learning_rate": 7.78347765209728e-08, "loss": 0.0, "num_input_tokens_seen": 122194408, "step": 181380 }, { "epoch": 4.4312657269196, "grad_norm": 0.00015481859736610204, "learning_rate": 7.780179482267802e-08, "loss": 0.0, "num_input_tokens_seen": 122198120, "step": 181385 }, { "epoch": 4.4313878777514475, "grad_norm": 4.8036527005024254e-05, "learning_rate": 7.77688198308566e-08, "loss": 0.0, "num_input_tokens_seen": 122201512, "step": 181390 }, { "epoch": 4.431510028583294, "grad_norm": 5.7792276493273675e-05, "learning_rate": 7.773585154574814e-08, "loss": 0.0, "num_input_tokens_seen": 122204904, "step": 181395 }, { "epoch": 4.431632179415142, "grad_norm": 0.00042270554695278406, "learning_rate": 7.770288996759289e-08, "loss": 0.0, "num_input_tokens_seen": 122208488, "step": 181400 }, { "epoch": 4.431754330246989, "grad_norm": 0.0004007647221442312, "learning_rate": 7.766993509663e-08, "loss": 0.0, "num_input_tokens_seen": 122211496, "step": 181405 }, { "epoch": 4.431876481078836, "grad_norm": 0.00021725076658185571, "learning_rate": 7.763698693309972e-08, "loss": 0.0, "num_input_tokens_seen": 122214696, "step": 181410 }, { "epoch": 4.431998631910683, "grad_norm": 0.00569581426680088, "learning_rate": 7.760404547724109e-08, "loss": 0.0, "num_input_tokens_seen": 122218024, "step": 181415 }, { "epoch": 4.432120782742531, "grad_norm": 0.00361349293962121, "learning_rate": 7.7571110729294e-08, "loss": 0.0, "num_input_tokens_seen": 122221864, "step": 181420 }, { "epoch": 4.432242933574377, "grad_norm": 0.00037453541881404817, "learning_rate": 7.753818268949808e-08, "loss": 0.0, "num_input_tokens_seen": 122225128, "step": 181425 }, { "epoch": 4.432365084406225, "grad_norm": 7.587824075017124e-05, "learning_rate": 7.750526135809232e-08, "loss": 0.0, "num_input_tokens_seen": 122228648, "step": 181430 }, { "epoch": 4.432487235238072, "grad_norm": 0.0013005606597289443, "learning_rate": 7.747234673531667e-08, "loss": 0.0, "num_input_tokens_seen": 122232040, "step": 181435 }, { "epoch": 4.432609386069919, "grad_norm": 0.0004842895723413676, "learning_rate": 7.743943882141013e-08, "loss": 0.0, "num_input_tokens_seen": 122235496, "step": 181440 }, { "epoch": 4.432731536901766, "grad_norm": 0.00032378273317590356, "learning_rate": 7.740653761661219e-08, "loss": 0.0, "num_input_tokens_seen": 122238696, "step": 181445 }, { "epoch": 4.432853687733614, "grad_norm": 0.00021496588306035846, "learning_rate": 7.737364312116202e-08, "loss": 0.0, "num_input_tokens_seen": 122242280, "step": 181450 }, { "epoch": 4.4329758385654605, "grad_norm": 6.006818512105383e-05, "learning_rate": 7.734075533529871e-08, "loss": 0.0, "num_input_tokens_seen": 122245480, "step": 181455 }, { "epoch": 4.433097989397308, "grad_norm": 0.0027862994465976954, "learning_rate": 7.730787425926188e-08, "loss": 0.0, "num_input_tokens_seen": 122249320, "step": 181460 }, { "epoch": 4.433220140229155, "grad_norm": 0.00013754340761806816, "learning_rate": 7.727499989329023e-08, "loss": 0.0, "num_input_tokens_seen": 122252776, "step": 181465 }, { "epoch": 4.4333422910610025, "grad_norm": 0.0007522147498093545, "learning_rate": 7.7242132237623e-08, "loss": 0.0286, "num_input_tokens_seen": 122256552, "step": 181470 }, { "epoch": 4.433464441892849, "grad_norm": 0.00010326150368200615, "learning_rate": 7.72092712924991e-08, "loss": 0.0, "num_input_tokens_seen": 122259880, "step": 181475 }, { "epoch": 4.433586592724696, "grad_norm": 3.690972880576737e-05, "learning_rate": 7.71764170581577e-08, "loss": 0.0, "num_input_tokens_seen": 122263208, "step": 181480 }, { "epoch": 4.433708743556544, "grad_norm": 3.2292551622958854e-05, "learning_rate": 7.714356953483747e-08, "loss": 0.0, "num_input_tokens_seen": 122266472, "step": 181485 }, { "epoch": 4.43383089438839, "grad_norm": 0.0006843619630672038, "learning_rate": 7.711072872277757e-08, "loss": 0.0, "num_input_tokens_seen": 122269800, "step": 181490 }, { "epoch": 4.433953045220238, "grad_norm": 0.002597218146547675, "learning_rate": 7.70778946222167e-08, "loss": 0.0, "num_input_tokens_seen": 122273128, "step": 181495 }, { "epoch": 4.434075196052085, "grad_norm": 0.003175930818542838, "learning_rate": 7.704506723339343e-08, "loss": 0.0, "num_input_tokens_seen": 122276200, "step": 181500 }, { "epoch": 4.434197346883932, "grad_norm": 0.0005056135705672204, "learning_rate": 7.701224655654682e-08, "loss": 0.0, "num_input_tokens_seen": 122279400, "step": 181505 }, { "epoch": 4.434319497715779, "grad_norm": 4.015320428152336e-06, "learning_rate": 7.69794325919153e-08, "loss": 0.0, "num_input_tokens_seen": 122282856, "step": 181510 }, { "epoch": 4.434441648547627, "grad_norm": 0.08624907582998276, "learning_rate": 7.694662533973762e-08, "loss": 0.0, "num_input_tokens_seen": 122286568, "step": 181515 }, { "epoch": 4.4345637993794735, "grad_norm": 0.0014746385859325528, "learning_rate": 7.691382480025244e-08, "loss": 0.0, "num_input_tokens_seen": 122289704, "step": 181520 }, { "epoch": 4.434685950211321, "grad_norm": 0.002636983757838607, "learning_rate": 7.688103097369803e-08, "loss": 0.0002, "num_input_tokens_seen": 122292904, "step": 181525 }, { "epoch": 4.434808101043168, "grad_norm": 0.00015489933139178902, "learning_rate": 7.68482438603133e-08, "loss": 0.0, "num_input_tokens_seen": 122296232, "step": 181530 }, { "epoch": 4.4349302518750155, "grad_norm": 0.0012338929809629917, "learning_rate": 7.681546346033618e-08, "loss": 0.0, "num_input_tokens_seen": 122299688, "step": 181535 }, { "epoch": 4.435052402706862, "grad_norm": 0.0014644035836681724, "learning_rate": 7.67826897740056e-08, "loss": 0.0, "num_input_tokens_seen": 122302696, "step": 181540 }, { "epoch": 4.43517455353871, "grad_norm": 0.11627105623483658, "learning_rate": 7.674992280155934e-08, "loss": 0.0, "num_input_tokens_seen": 122305768, "step": 181545 }, { "epoch": 4.435296704370557, "grad_norm": 0.00040323357097804546, "learning_rate": 7.671716254323601e-08, "loss": 0.0, "num_input_tokens_seen": 122308904, "step": 181550 }, { "epoch": 4.435418855202404, "grad_norm": 0.00010230207408312708, "learning_rate": 7.668440899927398e-08, "loss": 0.0, "num_input_tokens_seen": 122312168, "step": 181555 }, { "epoch": 4.435541006034251, "grad_norm": 0.013337737880647182, "learning_rate": 7.665166216991115e-08, "loss": 0.0, "num_input_tokens_seen": 122315816, "step": 181560 }, { "epoch": 4.435663156866099, "grad_norm": 0.00021302149980328977, "learning_rate": 7.66189220553859e-08, "loss": 0.0, "num_input_tokens_seen": 122318888, "step": 181565 }, { "epoch": 4.435785307697945, "grad_norm": 0.00024847377790138125, "learning_rate": 7.658618865593603e-08, "loss": 0.0, "num_input_tokens_seen": 122322088, "step": 181570 }, { "epoch": 4.435907458529792, "grad_norm": 0.00015122335753403604, "learning_rate": 7.655346197179979e-08, "loss": 0.0, "num_input_tokens_seen": 122325736, "step": 181575 }, { "epoch": 4.43602960936164, "grad_norm": 0.0009328412124887109, "learning_rate": 7.652074200321524e-08, "loss": 0.0, "num_input_tokens_seen": 122328936, "step": 181580 }, { "epoch": 4.436151760193487, "grad_norm": 0.00010473511792952195, "learning_rate": 7.648802875042038e-08, "loss": 0.0, "num_input_tokens_seen": 122332328, "step": 181585 }, { "epoch": 4.436273911025334, "grad_norm": 0.00021845597075298429, "learning_rate": 7.64553222136527e-08, "loss": 0.0003, "num_input_tokens_seen": 122335272, "step": 181590 }, { "epoch": 4.436396061857181, "grad_norm": 0.00012159592733951285, "learning_rate": 7.642262239315055e-08, "loss": 0.0, "num_input_tokens_seen": 122338792, "step": 181595 }, { "epoch": 4.436518212689029, "grad_norm": 0.0010934515157714486, "learning_rate": 7.638992928915144e-08, "loss": 0.0, "num_input_tokens_seen": 122341928, "step": 181600 }, { "epoch": 4.436640363520875, "grad_norm": 0.020605893805623055, "learning_rate": 7.635724290189305e-08, "loss": 0.0, "num_input_tokens_seen": 122345320, "step": 181605 }, { "epoch": 4.436762514352723, "grad_norm": 0.013531841337680817, "learning_rate": 7.632456323161319e-08, "loss": 0.0, "num_input_tokens_seen": 122349416, "step": 181610 }, { "epoch": 4.43688466518457, "grad_norm": 0.0014700923347845674, "learning_rate": 7.629189027854977e-08, "loss": 0.0, "num_input_tokens_seen": 122352616, "step": 181615 }, { "epoch": 4.437006816016417, "grad_norm": 8.41475193738006e-05, "learning_rate": 7.625922404293994e-08, "loss": 0.0, "num_input_tokens_seen": 122356264, "step": 181620 }, { "epoch": 4.437128966848264, "grad_norm": 0.00010870520054595545, "learning_rate": 7.622656452502174e-08, "loss": 0.0663, "num_input_tokens_seen": 122359656, "step": 181625 }, { "epoch": 4.437251117680112, "grad_norm": 0.00017314977594651282, "learning_rate": 7.61939117250322e-08, "loss": 0.0, "num_input_tokens_seen": 122362856, "step": 181630 }, { "epoch": 4.4373732685119585, "grad_norm": 0.002724649151787162, "learning_rate": 7.616126564320901e-08, "loss": 0.0, "num_input_tokens_seen": 122366120, "step": 181635 }, { "epoch": 4.437495419343806, "grad_norm": 0.022800017148256302, "learning_rate": 7.612862627978978e-08, "loss": 0.0, "num_input_tokens_seen": 122370216, "step": 181640 }, { "epoch": 4.437617570175653, "grad_norm": 7.581769750686362e-05, "learning_rate": 7.60959936350114e-08, "loss": 0.0, "num_input_tokens_seen": 122373672, "step": 181645 }, { "epoch": 4.4377397210075005, "grad_norm": 0.0020787513349205256, "learning_rate": 7.60633677091117e-08, "loss": 0.0, "num_input_tokens_seen": 122377704, "step": 181650 }, { "epoch": 4.437861871839347, "grad_norm": 0.0002737126487772912, "learning_rate": 7.60307485023276e-08, "loss": 0.0, "num_input_tokens_seen": 122380904, "step": 181655 }, { "epoch": 4.437984022671195, "grad_norm": 1.887790858745575e-05, "learning_rate": 7.599813601489646e-08, "loss": 0.0, "num_input_tokens_seen": 122384040, "step": 181660 }, { "epoch": 4.438106173503042, "grad_norm": 0.0011298698373138905, "learning_rate": 7.596553024705533e-08, "loss": 0.0, "num_input_tokens_seen": 122387176, "step": 181665 }, { "epoch": 4.438228324334888, "grad_norm": 0.0046443212777376175, "learning_rate": 7.593293119904132e-08, "loss": 0.0, "num_input_tokens_seen": 122390632, "step": 181670 }, { "epoch": 4.438350475166736, "grad_norm": 0.00015271840675268322, "learning_rate": 7.590033887109181e-08, "loss": 0.0, "num_input_tokens_seen": 122393960, "step": 181675 }, { "epoch": 4.438472625998583, "grad_norm": 0.001776915742084384, "learning_rate": 7.586775326344341e-08, "loss": 0.0, "num_input_tokens_seen": 122396648, "step": 181680 }, { "epoch": 4.43859477683043, "grad_norm": 0.0008560987189412117, "learning_rate": 7.583517437633335e-08, "loss": 0.0, "num_input_tokens_seen": 122400104, "step": 181685 }, { "epoch": 4.438716927662277, "grad_norm": 5.613368557533249e-06, "learning_rate": 7.580260220999845e-08, "loss": 0.0, "num_input_tokens_seen": 122403688, "step": 181690 }, { "epoch": 4.438839078494125, "grad_norm": 0.00016497218166477978, "learning_rate": 7.577003676467564e-08, "loss": 0.0, "num_input_tokens_seen": 122406888, "step": 181695 }, { "epoch": 4.4389612293259715, "grad_norm": 0.0033229936379939318, "learning_rate": 7.573747804060182e-08, "loss": 0.0, "num_input_tokens_seen": 122409832, "step": 181700 }, { "epoch": 4.439083380157819, "grad_norm": 6.330916949082166e-05, "learning_rate": 7.570492603801337e-08, "loss": 0.0, "num_input_tokens_seen": 122413224, "step": 181705 }, { "epoch": 4.439205530989666, "grad_norm": 0.0011732260463759303, "learning_rate": 7.567238075714755e-08, "loss": 0.0, "num_input_tokens_seen": 122416552, "step": 181710 }, { "epoch": 4.4393276818215135, "grad_norm": 0.0009869820205494761, "learning_rate": 7.56398421982406e-08, "loss": 0.0, "num_input_tokens_seen": 122419752, "step": 181715 }, { "epoch": 4.43944983265336, "grad_norm": 0.0001454023295082152, "learning_rate": 7.560731036152957e-08, "loss": 0.0, "num_input_tokens_seen": 122422824, "step": 181720 }, { "epoch": 4.439571983485208, "grad_norm": 1.622208765184041e-05, "learning_rate": 7.557478524725059e-08, "loss": 0.0, "num_input_tokens_seen": 122426024, "step": 181725 }, { "epoch": 4.439694134317055, "grad_norm": 0.00026236390112899244, "learning_rate": 7.554226685564047e-08, "loss": 0.0, "num_input_tokens_seen": 122429160, "step": 181730 }, { "epoch": 4.439816285148902, "grad_norm": 10.125528335571289, "learning_rate": 7.55097551869357e-08, "loss": 0.0384, "num_input_tokens_seen": 122432488, "step": 181735 }, { "epoch": 4.439938435980749, "grad_norm": 0.0002785317483358085, "learning_rate": 7.547725024137252e-08, "loss": 0.0, "num_input_tokens_seen": 122435624, "step": 181740 }, { "epoch": 4.440060586812596, "grad_norm": 0.0014351366553455591, "learning_rate": 7.544475201918765e-08, "loss": 0.0, "num_input_tokens_seen": 122438504, "step": 181745 }, { "epoch": 4.440182737644443, "grad_norm": 0.001117666601203382, "learning_rate": 7.5412260520617e-08, "loss": 0.0, "num_input_tokens_seen": 122441640, "step": 181750 }, { "epoch": 4.44030488847629, "grad_norm": 0.0025107613764703274, "learning_rate": 7.537977574589726e-08, "loss": 0.0, "num_input_tokens_seen": 122444776, "step": 181755 }, { "epoch": 4.440427039308138, "grad_norm": 3.955272404709831e-05, "learning_rate": 7.534729769526437e-08, "loss": 0.0, "num_input_tokens_seen": 122448040, "step": 181760 }, { "epoch": 4.4405491901399845, "grad_norm": 0.0025206992868334055, "learning_rate": 7.531482636895458e-08, "loss": 0.0, "num_input_tokens_seen": 122451496, "step": 181765 }, { "epoch": 4.440671340971832, "grad_norm": 0.00026313986745662987, "learning_rate": 7.528236176720426e-08, "loss": 0.0, "num_input_tokens_seen": 122454696, "step": 181770 }, { "epoch": 4.440793491803679, "grad_norm": 0.023444993421435356, "learning_rate": 7.52499038902491e-08, "loss": 0.0, "num_input_tokens_seen": 122458088, "step": 181775 }, { "epoch": 4.4409156426355265, "grad_norm": 0.0010252405190840364, "learning_rate": 7.521745273832558e-08, "loss": 0.0, "num_input_tokens_seen": 122461096, "step": 181780 }, { "epoch": 4.441037793467373, "grad_norm": 0.0006436887779273093, "learning_rate": 7.518500831166929e-08, "loss": 0.0, "num_input_tokens_seen": 122464168, "step": 181785 }, { "epoch": 4.441159944299221, "grad_norm": 1.9545019313227385e-05, "learning_rate": 7.515257061051661e-08, "loss": 0.0, "num_input_tokens_seen": 122467368, "step": 181790 }, { "epoch": 4.441282095131068, "grad_norm": 9.000881982501596e-05, "learning_rate": 7.5120139635103e-08, "loss": 0.0, "num_input_tokens_seen": 122470568, "step": 181795 }, { "epoch": 4.441404245962915, "grad_norm": 8.998543489724398e-05, "learning_rate": 7.508771538566461e-08, "loss": 0.0, "num_input_tokens_seen": 122473768, "step": 181800 }, { "epoch": 4.441526396794762, "grad_norm": 0.0010170077439397573, "learning_rate": 7.505529786243714e-08, "loss": 0.0, "num_input_tokens_seen": 122476776, "step": 181805 }, { "epoch": 4.44164854762661, "grad_norm": 7.858948811190203e-05, "learning_rate": 7.502288706565618e-08, "loss": 0.0, "num_input_tokens_seen": 122480296, "step": 181810 }, { "epoch": 4.441770698458456, "grad_norm": 0.014259721152484417, "learning_rate": 7.499048299555777e-08, "loss": 0.0, "num_input_tokens_seen": 122483432, "step": 181815 }, { "epoch": 4.441892849290304, "grad_norm": 0.00014318434114102274, "learning_rate": 7.495808565237716e-08, "loss": 0.0308, "num_input_tokens_seen": 122486888, "step": 181820 }, { "epoch": 4.442015000122151, "grad_norm": 0.0010650715557858348, "learning_rate": 7.492569503635015e-08, "loss": 0.0, "num_input_tokens_seen": 122490088, "step": 181825 }, { "epoch": 4.442137150953998, "grad_norm": 0.00724154943600297, "learning_rate": 7.489331114771247e-08, "loss": 0.0, "num_input_tokens_seen": 122493416, "step": 181830 }, { "epoch": 4.442259301785845, "grad_norm": 4.4712283852277324e-05, "learning_rate": 7.486093398669934e-08, "loss": 0.0, "num_input_tokens_seen": 122497576, "step": 181835 }, { "epoch": 4.442381452617692, "grad_norm": 6.589079566765577e-05, "learning_rate": 7.482856355354638e-08, "loss": 0.0, "num_input_tokens_seen": 122501480, "step": 181840 }, { "epoch": 4.4425036034495395, "grad_norm": 0.015320822596549988, "learning_rate": 7.479619984848884e-08, "loss": 0.0, "num_input_tokens_seen": 122504744, "step": 181845 }, { "epoch": 4.442625754281386, "grad_norm": 5.367151970858686e-05, "learning_rate": 7.476384287176241e-08, "loss": 0.0, "num_input_tokens_seen": 122507944, "step": 181850 }, { "epoch": 4.442747905113234, "grad_norm": 0.014334838837385178, "learning_rate": 7.473149262360201e-08, "loss": 0.0, "num_input_tokens_seen": 122511080, "step": 181855 }, { "epoch": 4.442870055945081, "grad_norm": 0.0004377129953354597, "learning_rate": 7.469914910424291e-08, "loss": 0.0, "num_input_tokens_seen": 122514600, "step": 181860 }, { "epoch": 4.442992206776928, "grad_norm": 0.0004606062138918787, "learning_rate": 7.46668123139208e-08, "loss": 0.0, "num_input_tokens_seen": 122517992, "step": 181865 }, { "epoch": 4.443114357608775, "grad_norm": 0.00025587991694919765, "learning_rate": 7.463448225287028e-08, "loss": 0.0, "num_input_tokens_seen": 122520872, "step": 181870 }, { "epoch": 4.443236508440623, "grad_norm": 8.987699402496219e-05, "learning_rate": 7.460215892132693e-08, "loss": 0.0, "num_input_tokens_seen": 122524008, "step": 181875 }, { "epoch": 4.443358659272469, "grad_norm": 0.0005700733745470643, "learning_rate": 7.456984231952535e-08, "loss": 0.0, "num_input_tokens_seen": 122527144, "step": 181880 }, { "epoch": 4.443480810104317, "grad_norm": 0.0001441869098925963, "learning_rate": 7.453753244770078e-08, "loss": 0.0, "num_input_tokens_seen": 122530856, "step": 181885 }, { "epoch": 4.443602960936164, "grad_norm": 0.0013551083393394947, "learning_rate": 7.450522930608838e-08, "loss": 0.0, "num_input_tokens_seen": 122534248, "step": 181890 }, { "epoch": 4.443725111768011, "grad_norm": 4.1691670048749074e-05, "learning_rate": 7.447293289492285e-08, "loss": 0.0, "num_input_tokens_seen": 122537192, "step": 181895 }, { "epoch": 4.443847262599858, "grad_norm": 0.0007187697337940335, "learning_rate": 7.444064321443899e-08, "loss": 0.0, "num_input_tokens_seen": 122540200, "step": 181900 }, { "epoch": 4.443969413431706, "grad_norm": 0.005256319418549538, "learning_rate": 7.440836026487184e-08, "loss": 0.0, "num_input_tokens_seen": 122543656, "step": 181905 }, { "epoch": 4.444091564263553, "grad_norm": 0.001109532080590725, "learning_rate": 7.43760840464559e-08, "loss": 0.0, "num_input_tokens_seen": 122546472, "step": 181910 }, { "epoch": 4.4442137150954, "grad_norm": 0.006649449001997709, "learning_rate": 7.434381455942617e-08, "loss": 0.0, "num_input_tokens_seen": 122550184, "step": 181915 }, { "epoch": 4.444335865927247, "grad_norm": 0.00018596640438772738, "learning_rate": 7.431155180401705e-08, "loss": 0.0, "num_input_tokens_seen": 122553320, "step": 181920 }, { "epoch": 4.444458016759095, "grad_norm": 0.00020076056534890085, "learning_rate": 7.427929578046354e-08, "loss": 0.0, "num_input_tokens_seen": 122556584, "step": 181925 }, { "epoch": 4.444580167590941, "grad_norm": 5.85622874496039e-05, "learning_rate": 7.424704648899972e-08, "loss": 0.0, "num_input_tokens_seen": 122559592, "step": 181930 }, { "epoch": 4.444702318422788, "grad_norm": 0.00037898457958362997, "learning_rate": 7.421480392986057e-08, "loss": 0.0, "num_input_tokens_seen": 122562664, "step": 181935 }, { "epoch": 4.444824469254636, "grad_norm": 0.00010478322656126693, "learning_rate": 7.418256810328016e-08, "loss": 0.0, "num_input_tokens_seen": 122565672, "step": 181940 }, { "epoch": 4.4449466200864824, "grad_norm": 0.0002965153835248202, "learning_rate": 7.415033900949319e-08, "loss": 0.0, "num_input_tokens_seen": 122569896, "step": 181945 }, { "epoch": 4.44506877091833, "grad_norm": 0.0018205991946160793, "learning_rate": 7.411811664873413e-08, "loss": 0.0, "num_input_tokens_seen": 122573480, "step": 181950 }, { "epoch": 4.445190921750177, "grad_norm": 0.0012627599062398076, "learning_rate": 7.408590102123701e-08, "loss": 0.0, "num_input_tokens_seen": 122576680, "step": 181955 }, { "epoch": 4.4453130725820245, "grad_norm": 0.00049396394751966, "learning_rate": 7.405369212723645e-08, "loss": 0.0, "num_input_tokens_seen": 122579944, "step": 181960 }, { "epoch": 4.445435223413871, "grad_norm": 0.0006146155064925551, "learning_rate": 7.402148996696622e-08, "loss": 0.0, "num_input_tokens_seen": 122583208, "step": 181965 }, { "epoch": 4.445557374245719, "grad_norm": 0.0018971695099025965, "learning_rate": 7.398929454066105e-08, "loss": 0.0, "num_input_tokens_seen": 122586472, "step": 181970 }, { "epoch": 4.445679525077566, "grad_norm": 39.05138397216797, "learning_rate": 7.395710584855452e-08, "loss": 0.0631, "num_input_tokens_seen": 122590312, "step": 181975 }, { "epoch": 4.445801675909413, "grad_norm": 0.0018318736692890525, "learning_rate": 7.392492389088112e-08, "loss": 0.0, "num_input_tokens_seen": 122593448, "step": 181980 }, { "epoch": 4.44592382674126, "grad_norm": 0.00022619598894380033, "learning_rate": 7.389274866787488e-08, "loss": 0.0, "num_input_tokens_seen": 122597352, "step": 181985 }, { "epoch": 4.446045977573108, "grad_norm": 0.28438884019851685, "learning_rate": 7.386058017976938e-08, "loss": 0.0002, "num_input_tokens_seen": 122600936, "step": 181990 }, { "epoch": 4.446168128404954, "grad_norm": 0.0005298346513882279, "learning_rate": 7.38284184267991e-08, "loss": 0.0, "num_input_tokens_seen": 122604520, "step": 181995 }, { "epoch": 4.446290279236802, "grad_norm": 0.00031661076354794204, "learning_rate": 7.379626340919754e-08, "loss": 0.0, "num_input_tokens_seen": 122607784, "step": 182000 }, { "epoch": 4.446412430068649, "grad_norm": 0.0004995632916688919, "learning_rate": 7.376411512719882e-08, "loss": 0.0, "num_input_tokens_seen": 122610536, "step": 182005 }, { "epoch": 4.4465345809004955, "grad_norm": 0.00014909429592080414, "learning_rate": 7.373197358103655e-08, "loss": 0.0, "num_input_tokens_seen": 122614568, "step": 182010 }, { "epoch": 4.446656731732343, "grad_norm": 0.0005114256637170911, "learning_rate": 7.369983877094432e-08, "loss": 0.0, "num_input_tokens_seen": 122617768, "step": 182015 }, { "epoch": 4.44677888256419, "grad_norm": 0.0002833055623341352, "learning_rate": 7.366771069715627e-08, "loss": 0.0, "num_input_tokens_seen": 122621096, "step": 182020 }, { "epoch": 4.4469010333960375, "grad_norm": 0.00017942176782526076, "learning_rate": 7.363558935990555e-08, "loss": 0.0, "num_input_tokens_seen": 122624616, "step": 182025 }, { "epoch": 4.447023184227884, "grad_norm": 0.0020283800549805164, "learning_rate": 7.360347475942618e-08, "loss": 0.0, "num_input_tokens_seen": 122627752, "step": 182030 }, { "epoch": 4.447145335059732, "grad_norm": 0.00043666703277267516, "learning_rate": 7.357136689595133e-08, "loss": 0.0, "num_input_tokens_seen": 122631400, "step": 182035 }, { "epoch": 4.447267485891579, "grad_norm": 0.0004874668666161597, "learning_rate": 7.35392657697147e-08, "loss": 0.0, "num_input_tokens_seen": 122634664, "step": 182040 }, { "epoch": 4.447389636723426, "grad_norm": 0.0012796398950740695, "learning_rate": 7.350717138094976e-08, "loss": 0.0, "num_input_tokens_seen": 122638056, "step": 182045 }, { "epoch": 4.447511787555273, "grad_norm": 0.00035492394817993045, "learning_rate": 7.347508372988986e-08, "loss": 0.0, "num_input_tokens_seen": 122641384, "step": 182050 }, { "epoch": 4.447633938387121, "grad_norm": 0.00027042333385907114, "learning_rate": 7.34430028167684e-08, "loss": 0.0, "num_input_tokens_seen": 122644520, "step": 182055 }, { "epoch": 4.447756089218967, "grad_norm": 0.00047873600851744413, "learning_rate": 7.341092864181853e-08, "loss": 0.0, "num_input_tokens_seen": 122648360, "step": 182060 }, { "epoch": 4.447878240050815, "grad_norm": 0.0010061763459816575, "learning_rate": 7.337886120527381e-08, "loss": 0.0, "num_input_tokens_seen": 122651560, "step": 182065 }, { "epoch": 4.448000390882662, "grad_norm": 0.0006434638053178787, "learning_rate": 7.334680050736707e-08, "loss": 0.0, "num_input_tokens_seen": 122654696, "step": 182070 }, { "epoch": 4.448122541714509, "grad_norm": 0.00011786912364186719, "learning_rate": 7.331474654833158e-08, "loss": 0.0, "num_input_tokens_seen": 122657832, "step": 182075 }, { "epoch": 4.448244692546356, "grad_norm": 0.007387042045593262, "learning_rate": 7.32826993284007e-08, "loss": 0.0, "num_input_tokens_seen": 122660840, "step": 182080 }, { "epoch": 4.448366843378204, "grad_norm": 0.00154625263530761, "learning_rate": 7.325065884780712e-08, "loss": 0.0, "num_input_tokens_seen": 122664168, "step": 182085 }, { "epoch": 4.4484889942100505, "grad_norm": 0.0005491775227710605, "learning_rate": 7.321862510678423e-08, "loss": 0.0, "num_input_tokens_seen": 122667880, "step": 182090 }, { "epoch": 4.448611145041898, "grad_norm": 9.22619983612094e-06, "learning_rate": 7.318659810556449e-08, "loss": 0.0, "num_input_tokens_seen": 122671720, "step": 182095 }, { "epoch": 4.448733295873745, "grad_norm": 0.0003610023995861411, "learning_rate": 7.31545778443814e-08, "loss": 0.0, "num_input_tokens_seen": 122675112, "step": 182100 }, { "epoch": 4.448855446705592, "grad_norm": 5.4004536650609225e-05, "learning_rate": 7.31225643234672e-08, "loss": 0.0224, "num_input_tokens_seen": 122678312, "step": 182105 }, { "epoch": 4.448977597537439, "grad_norm": 0.0013011741684749722, "learning_rate": 7.309055754305527e-08, "loss": 0.0, "num_input_tokens_seen": 122681320, "step": 182110 }, { "epoch": 4.449099748369286, "grad_norm": 0.0007865259540267289, "learning_rate": 7.305855750337809e-08, "loss": 0.0, "num_input_tokens_seen": 122684840, "step": 182115 }, { "epoch": 4.449221899201134, "grad_norm": 0.0013168536825105548, "learning_rate": 7.302656420466824e-08, "loss": 0.0, "num_input_tokens_seen": 122688552, "step": 182120 }, { "epoch": 4.44934405003298, "grad_norm": 3.883744648192078e-05, "learning_rate": 7.299457764715866e-08, "loss": 0.0, "num_input_tokens_seen": 122691880, "step": 182125 }, { "epoch": 4.449466200864828, "grad_norm": 0.0002559190324973315, "learning_rate": 7.296259783108171e-08, "loss": 0.0, "num_input_tokens_seen": 122695144, "step": 182130 }, { "epoch": 4.449588351696675, "grad_norm": 0.0001097548010875471, "learning_rate": 7.293062475667011e-08, "loss": 0.0, "num_input_tokens_seen": 122698536, "step": 182135 }, { "epoch": 4.449710502528522, "grad_norm": 0.030722323805093765, "learning_rate": 7.289865842415654e-08, "loss": 0.0, "num_input_tokens_seen": 122701736, "step": 182140 }, { "epoch": 4.449832653360369, "grad_norm": 0.002170925959944725, "learning_rate": 7.286669883377306e-08, "loss": 0.0, "num_input_tokens_seen": 122705128, "step": 182145 }, { "epoch": 4.449954804192217, "grad_norm": 0.001756233279593289, "learning_rate": 7.283474598575257e-08, "loss": 0.0, "num_input_tokens_seen": 122708072, "step": 182150 }, { "epoch": 4.4500769550240635, "grad_norm": 2.314039011253044e-05, "learning_rate": 7.280279988032689e-08, "loss": 0.0, "num_input_tokens_seen": 122711528, "step": 182155 }, { "epoch": 4.450199105855911, "grad_norm": 0.00042975114774890244, "learning_rate": 7.277086051772896e-08, "loss": 0.0, "num_input_tokens_seen": 122714536, "step": 182160 }, { "epoch": 4.450321256687758, "grad_norm": 3.163126530125737e-05, "learning_rate": 7.273892789819047e-08, "loss": 0.0, "num_input_tokens_seen": 122717864, "step": 182165 }, { "epoch": 4.4504434075196055, "grad_norm": 0.003855722723528743, "learning_rate": 7.270700202194391e-08, "loss": 0.0, "num_input_tokens_seen": 122721256, "step": 182170 }, { "epoch": 4.450565558351452, "grad_norm": 0.000997714465484023, "learning_rate": 7.267508288922153e-08, "loss": 0.0, "num_input_tokens_seen": 122724328, "step": 182175 }, { "epoch": 4.4506877091833, "grad_norm": 0.00014899314555805176, "learning_rate": 7.264317050025537e-08, "loss": 0.0, "num_input_tokens_seen": 122727656, "step": 182180 }, { "epoch": 4.450809860015147, "grad_norm": 0.0003528865345288068, "learning_rate": 7.261126485527757e-08, "loss": 0.0, "num_input_tokens_seen": 122731112, "step": 182185 }, { "epoch": 4.450932010846994, "grad_norm": 0.00044632842764258385, "learning_rate": 7.257936595451986e-08, "loss": 0.0, "num_input_tokens_seen": 122734312, "step": 182190 }, { "epoch": 4.451054161678841, "grad_norm": 4.655039811041206e-05, "learning_rate": 7.254747379821458e-08, "loss": 0.0, "num_input_tokens_seen": 122738472, "step": 182195 }, { "epoch": 4.451176312510688, "grad_norm": 0.000397029856685549, "learning_rate": 7.251558838659355e-08, "loss": 0.0, "num_input_tokens_seen": 122741608, "step": 182200 }, { "epoch": 4.451298463342535, "grad_norm": 0.0004111557500436902, "learning_rate": 7.24837097198887e-08, "loss": 0.0, "num_input_tokens_seen": 122744872, "step": 182205 }, { "epoch": 4.451420614174382, "grad_norm": 5.328708721208386e-05, "learning_rate": 7.245183779833163e-08, "loss": 0.0, "num_input_tokens_seen": 122748392, "step": 182210 }, { "epoch": 4.45154276500623, "grad_norm": 0.00046480150194838643, "learning_rate": 7.241997262215449e-08, "loss": 0.0, "num_input_tokens_seen": 122751912, "step": 182215 }, { "epoch": 4.4516649158380766, "grad_norm": 0.00010292190563632175, "learning_rate": 7.238811419158852e-08, "loss": 0.0, "num_input_tokens_seen": 122755240, "step": 182220 }, { "epoch": 4.451787066669924, "grad_norm": 0.0004073938471265137, "learning_rate": 7.2356262506866e-08, "loss": 0.0, "num_input_tokens_seen": 122758632, "step": 182225 }, { "epoch": 4.451909217501771, "grad_norm": 0.01925656571984291, "learning_rate": 7.232441756821794e-08, "loss": 0.0, "num_input_tokens_seen": 122761960, "step": 182230 }, { "epoch": 4.452031368333619, "grad_norm": 0.011916612274944782, "learning_rate": 7.229257937587641e-08, "loss": 0.0, "num_input_tokens_seen": 122765672, "step": 182235 }, { "epoch": 4.452153519165465, "grad_norm": 1.697085281193722e-05, "learning_rate": 7.226074793007264e-08, "loss": 0.0, "num_input_tokens_seen": 122768744, "step": 182240 }, { "epoch": 4.452275669997313, "grad_norm": 1.4337666470964905e-05, "learning_rate": 7.222892323103846e-08, "loss": 0.0, "num_input_tokens_seen": 122772136, "step": 182245 }, { "epoch": 4.45239782082916, "grad_norm": 0.006274717394262552, "learning_rate": 7.21971052790048e-08, "loss": 0.0, "num_input_tokens_seen": 122775528, "step": 182250 }, { "epoch": 4.452519971661007, "grad_norm": 0.00027175049763172865, "learning_rate": 7.216529407420357e-08, "loss": 0.0, "num_input_tokens_seen": 122779048, "step": 182255 }, { "epoch": 4.452642122492854, "grad_norm": 0.00012441864237189293, "learning_rate": 7.213348961686572e-08, "loss": 0.0, "num_input_tokens_seen": 122782568, "step": 182260 }, { "epoch": 4.452764273324702, "grad_norm": 0.0012823011493310332, "learning_rate": 7.210169190722271e-08, "loss": 0.0523, "num_input_tokens_seen": 122786088, "step": 182265 }, { "epoch": 4.4528864241565485, "grad_norm": 7.73086940171197e-05, "learning_rate": 7.206990094550592e-08, "loss": 0.0002, "num_input_tokens_seen": 122789608, "step": 182270 }, { "epoch": 4.453008574988396, "grad_norm": 0.000146789156133309, "learning_rate": 7.203811673194615e-08, "loss": 0.0, "num_input_tokens_seen": 122793000, "step": 182275 }, { "epoch": 4.453130725820243, "grad_norm": 4.125279883737676e-05, "learning_rate": 7.200633926677513e-08, "loss": 0.0, "num_input_tokens_seen": 122796456, "step": 182280 }, { "epoch": 4.4532528766520905, "grad_norm": 0.002219364047050476, "learning_rate": 7.197456855022333e-08, "loss": 0.0, "num_input_tokens_seen": 122799528, "step": 182285 }, { "epoch": 4.453375027483937, "grad_norm": 0.0004296134866308421, "learning_rate": 7.194280458252211e-08, "loss": 0.0, "num_input_tokens_seen": 122802472, "step": 182290 }, { "epoch": 4.453497178315784, "grad_norm": 0.0001687946787569672, "learning_rate": 7.191104736390252e-08, "loss": 0.0, "num_input_tokens_seen": 122805416, "step": 182295 }, { "epoch": 4.453619329147632, "grad_norm": 0.00604760879650712, "learning_rate": 7.187929689459527e-08, "loss": 0.0005, "num_input_tokens_seen": 122808424, "step": 182300 }, { "epoch": 4.453741479979478, "grad_norm": 0.0005081015406176448, "learning_rate": 7.18475531748317e-08, "loss": 0.001, "num_input_tokens_seen": 122811752, "step": 182305 }, { "epoch": 4.453863630811326, "grad_norm": 0.0017769659170880914, "learning_rate": 7.181581620484211e-08, "loss": 0.0, "num_input_tokens_seen": 122815080, "step": 182310 }, { "epoch": 4.453985781643173, "grad_norm": 0.0003629335842560977, "learning_rate": 7.178408598485775e-08, "loss": 0.0, "num_input_tokens_seen": 122818344, "step": 182315 }, { "epoch": 4.45410793247502, "grad_norm": 0.0013918086187914014, "learning_rate": 7.175236251510908e-08, "loss": 0.0, "num_input_tokens_seen": 122822312, "step": 182320 }, { "epoch": 4.454230083306867, "grad_norm": 0.00013686173770111054, "learning_rate": 7.172064579582682e-08, "loss": 0.0, "num_input_tokens_seen": 122825512, "step": 182325 }, { "epoch": 4.454352234138715, "grad_norm": 0.0005737289902754128, "learning_rate": 7.16889358272419e-08, "loss": 0.0, "num_input_tokens_seen": 122828904, "step": 182330 }, { "epoch": 4.4544743849705615, "grad_norm": 0.0011010384187102318, "learning_rate": 7.165723260958445e-08, "loss": 0.1163, "num_input_tokens_seen": 122832808, "step": 182335 }, { "epoch": 4.454596535802409, "grad_norm": 0.0005671690450981259, "learning_rate": 7.162553614308552e-08, "loss": 0.0, "num_input_tokens_seen": 122835880, "step": 182340 }, { "epoch": 4.454718686634256, "grad_norm": 1.5716286725364625e-05, "learning_rate": 7.159384642797528e-08, "loss": 0.0, "num_input_tokens_seen": 122839016, "step": 182345 }, { "epoch": 4.4548408374661035, "grad_norm": 0.002666143700480461, "learning_rate": 7.156216346448419e-08, "loss": 0.0, "num_input_tokens_seen": 122843048, "step": 182350 }, { "epoch": 4.45496298829795, "grad_norm": 0.000196826717001386, "learning_rate": 7.153048725284305e-08, "loss": 0.0, "num_input_tokens_seen": 122847016, "step": 182355 }, { "epoch": 4.455085139129798, "grad_norm": 0.00038609313196502626, "learning_rate": 7.14988177932817e-08, "loss": 0.0, "num_input_tokens_seen": 122850152, "step": 182360 }, { "epoch": 4.455207289961645, "grad_norm": 0.0003739885869435966, "learning_rate": 7.146715508603085e-08, "loss": 0.0, "num_input_tokens_seen": 122853096, "step": 182365 }, { "epoch": 4.455329440793491, "grad_norm": 0.002237460808828473, "learning_rate": 7.143549913132052e-08, "loss": 0.0, "num_input_tokens_seen": 122856488, "step": 182370 }, { "epoch": 4.455451591625339, "grad_norm": 0.0001580318494234234, "learning_rate": 7.140384992938108e-08, "loss": 0.0, "num_input_tokens_seen": 122859752, "step": 182375 }, { "epoch": 4.455573742457186, "grad_norm": 0.0003379890986252576, "learning_rate": 7.137220748044236e-08, "loss": 0.0, "num_input_tokens_seen": 122863336, "step": 182380 }, { "epoch": 4.455695893289033, "grad_norm": 8.986893953988329e-05, "learning_rate": 7.134057178473485e-08, "loss": 0.0, "num_input_tokens_seen": 122866472, "step": 182385 }, { "epoch": 4.45581804412088, "grad_norm": 0.0007048301049508154, "learning_rate": 7.130894284248856e-08, "loss": 0.0, "num_input_tokens_seen": 122870248, "step": 182390 }, { "epoch": 4.455940194952728, "grad_norm": 32.200618743896484, "learning_rate": 7.127732065393333e-08, "loss": 0.0383, "num_input_tokens_seen": 122873256, "step": 182395 }, { "epoch": 4.4560623457845745, "grad_norm": 0.00011272566189290956, "learning_rate": 7.12457052192994e-08, "loss": 0.0, "num_input_tokens_seen": 122876584, "step": 182400 }, { "epoch": 4.456184496616422, "grad_norm": 0.00023590961063746363, "learning_rate": 7.121409653881628e-08, "loss": 0.0, "num_input_tokens_seen": 122879976, "step": 182405 }, { "epoch": 4.456306647448269, "grad_norm": 0.005138612352311611, "learning_rate": 7.11824946127142e-08, "loss": 0.0, "num_input_tokens_seen": 122883240, "step": 182410 }, { "epoch": 4.4564287982801165, "grad_norm": 0.0001687046606093645, "learning_rate": 7.115089944122276e-08, "loss": 0.0, "num_input_tokens_seen": 122886312, "step": 182415 }, { "epoch": 4.456550949111963, "grad_norm": 0.00014753674622625113, "learning_rate": 7.111931102457192e-08, "loss": 0.0, "num_input_tokens_seen": 122889640, "step": 182420 }, { "epoch": 4.456673099943811, "grad_norm": 0.00025895764701999724, "learning_rate": 7.108772936299134e-08, "loss": 0.0, "num_input_tokens_seen": 122893288, "step": 182425 }, { "epoch": 4.456795250775658, "grad_norm": 4.39085197285749e-05, "learning_rate": 7.105615445671042e-08, "loss": 0.0, "num_input_tokens_seen": 122896552, "step": 182430 }, { "epoch": 4.456917401607505, "grad_norm": 0.001786267152056098, "learning_rate": 7.10245863059592e-08, "loss": 0.0, "num_input_tokens_seen": 122899880, "step": 182435 }, { "epoch": 4.457039552439352, "grad_norm": 0.0021983960177749395, "learning_rate": 7.099302491096681e-08, "loss": 0.0, "num_input_tokens_seen": 122903528, "step": 182440 }, { "epoch": 4.4571617032712, "grad_norm": 0.00025607639690861106, "learning_rate": 7.096147027196308e-08, "loss": 0.0, "num_input_tokens_seen": 122907176, "step": 182445 }, { "epoch": 4.457283854103046, "grad_norm": 0.0005844164406880736, "learning_rate": 7.092992238917761e-08, "loss": 0.0, "num_input_tokens_seen": 122910760, "step": 182450 }, { "epoch": 4.457406004934894, "grad_norm": 0.0015613926807418466, "learning_rate": 7.089838126283943e-08, "loss": 0.0, "num_input_tokens_seen": 122913960, "step": 182455 }, { "epoch": 4.457528155766741, "grad_norm": 2.1202440620982088e-05, "learning_rate": 7.086684689317834e-08, "loss": 0.0, "num_input_tokens_seen": 122916904, "step": 182460 }, { "epoch": 4.4576503065985875, "grad_norm": 4.5927870814921334e-05, "learning_rate": 7.083531928042319e-08, "loss": 0.0, "num_input_tokens_seen": 122920168, "step": 182465 }, { "epoch": 4.457772457430435, "grad_norm": 0.004868772812187672, "learning_rate": 7.080379842480378e-08, "loss": 0.0, "num_input_tokens_seen": 122923688, "step": 182470 }, { "epoch": 4.457894608262282, "grad_norm": 8.599821740062907e-05, "learning_rate": 7.077228432654881e-08, "loss": 0.0, "num_input_tokens_seen": 122926632, "step": 182475 }, { "epoch": 4.4580167590941295, "grad_norm": 0.0005928333266638219, "learning_rate": 7.074077698588777e-08, "loss": 0.0, "num_input_tokens_seen": 122930152, "step": 182480 }, { "epoch": 4.458138909925976, "grad_norm": 0.00020691838290076703, "learning_rate": 7.070927640304992e-08, "loss": 0.0, "num_input_tokens_seen": 122933288, "step": 182485 }, { "epoch": 4.458261060757824, "grad_norm": 1.5695924957981333e-05, "learning_rate": 7.067778257826395e-08, "loss": 0.0, "num_input_tokens_seen": 122936808, "step": 182490 }, { "epoch": 4.458383211589671, "grad_norm": 0.0004510094877332449, "learning_rate": 7.064629551175928e-08, "loss": 0.0, "num_input_tokens_seen": 122939880, "step": 182495 }, { "epoch": 4.458505362421518, "grad_norm": 1.2007948160171509, "learning_rate": 7.061481520376455e-08, "loss": 0.0, "num_input_tokens_seen": 122943080, "step": 182500 }, { "epoch": 4.458627513253365, "grad_norm": 0.00013166008284315467, "learning_rate": 7.058334165450885e-08, "loss": 0.0, "num_input_tokens_seen": 122946472, "step": 182505 }, { "epoch": 4.458749664085213, "grad_norm": 0.00042926755850203335, "learning_rate": 7.055187486422131e-08, "loss": 0.0, "num_input_tokens_seen": 122949672, "step": 182510 }, { "epoch": 4.458871814917059, "grad_norm": 3.18528400384821e-05, "learning_rate": 7.052041483313043e-08, "loss": 0.0, "num_input_tokens_seen": 122952936, "step": 182515 }, { "epoch": 4.458993965748907, "grad_norm": 7.602172263432294e-05, "learning_rate": 7.0488961561465e-08, "loss": 0.0, "num_input_tokens_seen": 122956136, "step": 182520 }, { "epoch": 4.459116116580754, "grad_norm": 0.031247856095433235, "learning_rate": 7.045751504945396e-08, "loss": 0.0, "num_input_tokens_seen": 122959336, "step": 182525 }, { "epoch": 4.459238267412601, "grad_norm": 0.0004095268959645182, "learning_rate": 7.04260752973258e-08, "loss": 0.0, "num_input_tokens_seen": 122962984, "step": 182530 }, { "epoch": 4.459360418244448, "grad_norm": 0.0004039282212033868, "learning_rate": 7.039464230530933e-08, "loss": 0.0, "num_input_tokens_seen": 122966056, "step": 182535 }, { "epoch": 4.459482569076296, "grad_norm": 0.00013901021156925708, "learning_rate": 7.036321607363294e-08, "loss": 0.0286, "num_input_tokens_seen": 122969128, "step": 182540 }, { "epoch": 4.459604719908143, "grad_norm": 0.00012049246288370341, "learning_rate": 7.033179660252541e-08, "loss": 0.0, "num_input_tokens_seen": 122972712, "step": 182545 }, { "epoch": 4.45972687073999, "grad_norm": 9.773957572178915e-05, "learning_rate": 7.030038389221493e-08, "loss": 0.0, "num_input_tokens_seen": 122976104, "step": 182550 }, { "epoch": 4.459849021571837, "grad_norm": 0.0012148652458563447, "learning_rate": 7.02689779429304e-08, "loss": 0.0, "num_input_tokens_seen": 122979304, "step": 182555 }, { "epoch": 4.459971172403684, "grad_norm": 0.0008318567997775972, "learning_rate": 7.023757875489967e-08, "loss": 0.0, "num_input_tokens_seen": 122982888, "step": 182560 }, { "epoch": 4.460093323235531, "grad_norm": 0.00032979255774989724, "learning_rate": 7.020618632835151e-08, "loss": 0.0, "num_input_tokens_seen": 122985896, "step": 182565 }, { "epoch": 4.460215474067378, "grad_norm": 4.526316115516238e-05, "learning_rate": 7.017480066351388e-08, "loss": 0.0001, "num_input_tokens_seen": 122989160, "step": 182570 }, { "epoch": 4.460337624899226, "grad_norm": 0.005635230336338282, "learning_rate": 7.014342176061517e-08, "loss": 0.1477, "num_input_tokens_seen": 122992424, "step": 182575 }, { "epoch": 4.460459775731072, "grad_norm": 7.891910354373977e-05, "learning_rate": 7.011204961988382e-08, "loss": 0.0, "num_input_tokens_seen": 122995880, "step": 182580 }, { "epoch": 4.46058192656292, "grad_norm": 0.00021873510559089482, "learning_rate": 7.008068424154756e-08, "loss": 0.0, "num_input_tokens_seen": 122998952, "step": 182585 }, { "epoch": 4.460704077394767, "grad_norm": 2.3276723368326202e-05, "learning_rate": 7.004932562583488e-08, "loss": 0.0, "num_input_tokens_seen": 123002024, "step": 182590 }, { "epoch": 4.4608262282266145, "grad_norm": 0.00020752607088070363, "learning_rate": 7.001797377297348e-08, "loss": 0.0, "num_input_tokens_seen": 123005480, "step": 182595 }, { "epoch": 4.460948379058461, "grad_norm": 0.02167407050728798, "learning_rate": 6.998662868319138e-08, "loss": 0.0, "num_input_tokens_seen": 123008808, "step": 182600 }, { "epoch": 4.461070529890309, "grad_norm": 3.9221591578098014e-05, "learning_rate": 6.9955290356717e-08, "loss": 0.0, "num_input_tokens_seen": 123012328, "step": 182605 }, { "epoch": 4.461192680722156, "grad_norm": 0.0007778708823025227, "learning_rate": 6.992395879377766e-08, "loss": 0.0001, "num_input_tokens_seen": 123015720, "step": 182610 }, { "epoch": 4.461314831554003, "grad_norm": 0.00041512076859362423, "learning_rate": 6.989263399460155e-08, "loss": 0.0, "num_input_tokens_seen": 123019048, "step": 182615 }, { "epoch": 4.46143698238585, "grad_norm": 0.00015520087617915124, "learning_rate": 6.986131595941624e-08, "loss": 0.0001, "num_input_tokens_seen": 123022568, "step": 182620 }, { "epoch": 4.461559133217698, "grad_norm": 6.149549881229177e-05, "learning_rate": 6.98300046884498e-08, "loss": 0.0, "num_input_tokens_seen": 123025640, "step": 182625 }, { "epoch": 4.461681284049544, "grad_norm": 0.0043769595213234425, "learning_rate": 6.97987001819298e-08, "loss": 0.0, "num_input_tokens_seen": 123029160, "step": 182630 }, { "epoch": 4.461803434881391, "grad_norm": 0.0011604566825553775, "learning_rate": 6.976740244008361e-08, "loss": 0.0, "num_input_tokens_seen": 123032360, "step": 182635 }, { "epoch": 4.461925585713239, "grad_norm": 0.0051023769192397594, "learning_rate": 6.973611146313929e-08, "loss": 0.0, "num_input_tokens_seen": 123035880, "step": 182640 }, { "epoch": 4.4620477365450855, "grad_norm": 0.003443704219534993, "learning_rate": 6.970482725132399e-08, "loss": 0.0001, "num_input_tokens_seen": 123038888, "step": 182645 }, { "epoch": 4.462169887376933, "grad_norm": 0.00011538012040546164, "learning_rate": 6.967354980486562e-08, "loss": 0.0, "num_input_tokens_seen": 123042088, "step": 182650 }, { "epoch": 4.46229203820878, "grad_norm": 0.0013685214798897505, "learning_rate": 6.964227912399123e-08, "loss": 0.0, "num_input_tokens_seen": 123045416, "step": 182655 }, { "epoch": 4.4624141890406275, "grad_norm": 0.00011652199464151636, "learning_rate": 6.961101520892831e-08, "loss": 0.0, "num_input_tokens_seen": 123048616, "step": 182660 }, { "epoch": 4.462536339872474, "grad_norm": 0.005106240976601839, "learning_rate": 6.957975805990469e-08, "loss": 0.0, "num_input_tokens_seen": 123051560, "step": 182665 }, { "epoch": 4.462658490704322, "grad_norm": 0.003788114059716463, "learning_rate": 6.954850767714704e-08, "loss": 0.0, "num_input_tokens_seen": 123055144, "step": 182670 }, { "epoch": 4.462780641536169, "grad_norm": 3.244485560571775e-05, "learning_rate": 6.951726406088309e-08, "loss": 0.0, "num_input_tokens_seen": 123059624, "step": 182675 }, { "epoch": 4.462902792368016, "grad_norm": 0.001383076305501163, "learning_rate": 6.948602721133967e-08, "loss": 0.0, "num_input_tokens_seen": 123063336, "step": 182680 }, { "epoch": 4.463024943199863, "grad_norm": 0.0005897828377783298, "learning_rate": 6.945479712874436e-08, "loss": 0.0, "num_input_tokens_seen": 123066728, "step": 182685 }, { "epoch": 4.463147094031711, "grad_norm": 2.9718587029492483e-05, "learning_rate": 6.942357381332387e-08, "loss": 0.0, "num_input_tokens_seen": 123070376, "step": 182690 }, { "epoch": 4.463269244863557, "grad_norm": 0.00011758630716940388, "learning_rate": 6.939235726530535e-08, "loss": 0.0, "num_input_tokens_seen": 123073512, "step": 182695 }, { "epoch": 4.463391395695405, "grad_norm": 0.7688287496566772, "learning_rate": 6.936114748491617e-08, "loss": 0.0005, "num_input_tokens_seen": 123076520, "step": 182700 }, { "epoch": 4.463513546527252, "grad_norm": 0.0001084234390873462, "learning_rate": 6.932994447238294e-08, "loss": 0.0, "num_input_tokens_seen": 123080360, "step": 182705 }, { "epoch": 4.463635697359099, "grad_norm": 0.0003334138309583068, "learning_rate": 6.929874822793269e-08, "loss": 0.0, "num_input_tokens_seen": 123084072, "step": 182710 }, { "epoch": 4.463757848190946, "grad_norm": 0.0001240150013472885, "learning_rate": 6.926755875179224e-08, "loss": 0.0, "num_input_tokens_seen": 123087208, "step": 182715 }, { "epoch": 4.463879999022794, "grad_norm": 0.0007824696367606521, "learning_rate": 6.923637604418853e-08, "loss": 0.0, "num_input_tokens_seen": 123090344, "step": 182720 }, { "epoch": 4.4640021498546405, "grad_norm": 7.241334969876334e-05, "learning_rate": 6.920520010534803e-08, "loss": 0.0, "num_input_tokens_seen": 123093608, "step": 182725 }, { "epoch": 4.464124300686487, "grad_norm": 0.0003914496919605881, "learning_rate": 6.91740309354979e-08, "loss": 0.0, "num_input_tokens_seen": 123097448, "step": 182730 }, { "epoch": 4.464246451518335, "grad_norm": 0.0005804976099170744, "learning_rate": 6.914286853486462e-08, "loss": 0.0, "num_input_tokens_seen": 123101352, "step": 182735 }, { "epoch": 4.464368602350182, "grad_norm": 0.0001754326221998781, "learning_rate": 6.911171290367457e-08, "loss": 0.0, "num_input_tokens_seen": 123104488, "step": 182740 }, { "epoch": 4.464490753182029, "grad_norm": 0.0002568564668763429, "learning_rate": 6.908056404215467e-08, "loss": 0.0, "num_input_tokens_seen": 123107752, "step": 182745 }, { "epoch": 4.464612904013876, "grad_norm": 7.831337279640138e-05, "learning_rate": 6.90494219505311e-08, "loss": 0.0, "num_input_tokens_seen": 123111336, "step": 182750 }, { "epoch": 4.464735054845724, "grad_norm": 0.0005972188664600253, "learning_rate": 6.901828662903054e-08, "loss": 0.0172, "num_input_tokens_seen": 123114536, "step": 182755 }, { "epoch": 4.46485720567757, "grad_norm": 0.000675778544973582, "learning_rate": 6.898715807787958e-08, "loss": 0.0003, "num_input_tokens_seen": 123117864, "step": 182760 }, { "epoch": 4.464979356509418, "grad_norm": 8.06990938144736e-05, "learning_rate": 6.895603629730429e-08, "loss": 0.0, "num_input_tokens_seen": 123120680, "step": 182765 }, { "epoch": 4.465101507341265, "grad_norm": 0.012151413597166538, "learning_rate": 6.892492128753124e-08, "loss": 0.0, "num_input_tokens_seen": 123124136, "step": 182770 }, { "epoch": 4.465223658173112, "grad_norm": 0.0002190818777307868, "learning_rate": 6.88938130487865e-08, "loss": 0.0, "num_input_tokens_seen": 123127528, "step": 182775 }, { "epoch": 4.465345809004959, "grad_norm": 0.000651566602755338, "learning_rate": 6.886271158129642e-08, "loss": 0.0, "num_input_tokens_seen": 123130600, "step": 182780 }, { "epoch": 4.465467959836807, "grad_norm": 0.00015405859448947012, "learning_rate": 6.883161688528715e-08, "loss": 0.0, "num_input_tokens_seen": 123134888, "step": 182785 }, { "epoch": 4.4655901106686535, "grad_norm": 0.0005990671343170106, "learning_rate": 6.880052896098465e-08, "loss": 0.0, "num_input_tokens_seen": 123138344, "step": 182790 }, { "epoch": 4.465712261500501, "grad_norm": 0.0010906461393460631, "learning_rate": 6.876944780861548e-08, "loss": 0.0, "num_input_tokens_seen": 123142056, "step": 182795 }, { "epoch": 4.465834412332348, "grad_norm": 0.00044133083429187536, "learning_rate": 6.873837342840516e-08, "loss": 0.0, "num_input_tokens_seen": 123145384, "step": 182800 }, { "epoch": 4.4659565631641955, "grad_norm": 0.0014121757121756673, "learning_rate": 6.870730582057993e-08, "loss": 0.0001, "num_input_tokens_seen": 123148712, "step": 182805 }, { "epoch": 4.466078713996042, "grad_norm": 0.003218111116439104, "learning_rate": 6.867624498536561e-08, "loss": 0.0, "num_input_tokens_seen": 123151848, "step": 182810 }, { "epoch": 4.46620086482789, "grad_norm": 3.127128002233803e-05, "learning_rate": 6.864519092298804e-08, "loss": 0.0, "num_input_tokens_seen": 123155112, "step": 182815 }, { "epoch": 4.466323015659737, "grad_norm": 0.0003318258677609265, "learning_rate": 6.861414363367335e-08, "loss": 0.0, "num_input_tokens_seen": 123158312, "step": 182820 }, { "epoch": 4.466445166491583, "grad_norm": 1.8997519873664714e-05, "learning_rate": 6.858310311764715e-08, "loss": 0.0, "num_input_tokens_seen": 123161640, "step": 182825 }, { "epoch": 4.466567317323431, "grad_norm": 0.00020123532158322632, "learning_rate": 6.855206937513491e-08, "loss": 0.0, "num_input_tokens_seen": 123165160, "step": 182830 }, { "epoch": 4.466689468155278, "grad_norm": 5.2726703870575875e-05, "learning_rate": 6.85210424063628e-08, "loss": 0.0, "num_input_tokens_seen": 123168808, "step": 182835 }, { "epoch": 4.466811618987125, "grad_norm": 0.0004758377617690712, "learning_rate": 6.849002221155598e-08, "loss": 0.0, "num_input_tokens_seen": 123172264, "step": 182840 }, { "epoch": 4.466933769818972, "grad_norm": 0.0007463787333108485, "learning_rate": 6.845900879094046e-08, "loss": 0.0, "num_input_tokens_seen": 123175528, "step": 182845 }, { "epoch": 4.46705592065082, "grad_norm": 9.71447370829992e-05, "learning_rate": 6.842800214474143e-08, "loss": 0.0, "num_input_tokens_seen": 123179176, "step": 182850 }, { "epoch": 4.4671780714826665, "grad_norm": 0.0004629456379916519, "learning_rate": 6.839700227318468e-08, "loss": 0.0, "num_input_tokens_seen": 123182376, "step": 182855 }, { "epoch": 4.467300222314514, "grad_norm": 0.0009171870187856257, "learning_rate": 6.836600917649538e-08, "loss": 0.0, "num_input_tokens_seen": 123185704, "step": 182860 }, { "epoch": 4.467422373146361, "grad_norm": 7.736064617347438e-06, "learning_rate": 6.833502285489911e-08, "loss": 0.0, "num_input_tokens_seen": 123189032, "step": 182865 }, { "epoch": 4.467544523978209, "grad_norm": 0.0008894494967535138, "learning_rate": 6.830404330862104e-08, "loss": 0.0, "num_input_tokens_seen": 123192808, "step": 182870 }, { "epoch": 4.467666674810055, "grad_norm": 0.0012684657704085112, "learning_rate": 6.827307053788667e-08, "loss": 0.0, "num_input_tokens_seen": 123196200, "step": 182875 }, { "epoch": 4.467788825641903, "grad_norm": 0.004361152183264494, "learning_rate": 6.8242104542921e-08, "loss": 0.0, "num_input_tokens_seen": 123199144, "step": 182880 }, { "epoch": 4.46791097647375, "grad_norm": 0.00017511250916868448, "learning_rate": 6.821114532394944e-08, "loss": 0.0, "num_input_tokens_seen": 123203048, "step": 182885 }, { "epoch": 4.468033127305597, "grad_norm": 0.0025651047471910715, "learning_rate": 6.818019288119714e-08, "loss": 0.0, "num_input_tokens_seen": 123205992, "step": 182890 }, { "epoch": 4.468155278137444, "grad_norm": 0.00010991712042596191, "learning_rate": 6.81492472148889e-08, "loss": 0.0, "num_input_tokens_seen": 123209256, "step": 182895 }, { "epoch": 4.468277428969291, "grad_norm": 0.0003879454161506146, "learning_rate": 6.811830832525023e-08, "loss": 0.0, "num_input_tokens_seen": 123212392, "step": 182900 }, { "epoch": 4.468399579801138, "grad_norm": 0.00018364388961344957, "learning_rate": 6.808737621250571e-08, "loss": 0.0, "num_input_tokens_seen": 123215784, "step": 182905 }, { "epoch": 4.468521730632986, "grad_norm": 0.0015769852325320244, "learning_rate": 6.805645087688039e-08, "loss": 0.0, "num_input_tokens_seen": 123219176, "step": 182910 }, { "epoch": 4.468643881464833, "grad_norm": 0.024887003004550934, "learning_rate": 6.80255323185993e-08, "loss": 0.0, "num_input_tokens_seen": 123222440, "step": 182915 }, { "epoch": 4.46876603229668, "grad_norm": 1.2393422366585582e-05, "learning_rate": 6.799462053788718e-08, "loss": 0.0, "num_input_tokens_seen": 123225448, "step": 182920 }, { "epoch": 4.468888183128527, "grad_norm": 0.007184472866356373, "learning_rate": 6.796371553496904e-08, "loss": 0.0, "num_input_tokens_seen": 123228456, "step": 182925 }, { "epoch": 4.469010333960374, "grad_norm": 0.029829949140548706, "learning_rate": 6.793281731006917e-08, "loss": 0.0348, "num_input_tokens_seen": 123231720, "step": 182930 }, { "epoch": 4.469132484792222, "grad_norm": 0.0022584174294024706, "learning_rate": 6.790192586341282e-08, "loss": 0.0, "num_input_tokens_seen": 123235432, "step": 182935 }, { "epoch": 4.469254635624068, "grad_norm": 2.5439127057325095e-05, "learning_rate": 6.787104119522425e-08, "loss": 0.0, "num_input_tokens_seen": 123239144, "step": 182940 }, { "epoch": 4.469376786455916, "grad_norm": 0.00019265869923401624, "learning_rate": 6.784016330572816e-08, "loss": 0.0, "num_input_tokens_seen": 123242472, "step": 182945 }, { "epoch": 4.469498937287763, "grad_norm": 0.009062398225069046, "learning_rate": 6.780929219514919e-08, "loss": 0.0, "num_input_tokens_seen": 123245608, "step": 182950 }, { "epoch": 4.46962108811961, "grad_norm": 0.007672363892197609, "learning_rate": 6.777842786371157e-08, "loss": 0.0, "num_input_tokens_seen": 123249064, "step": 182955 }, { "epoch": 4.469743238951457, "grad_norm": 0.0006050781230442226, "learning_rate": 6.774757031164025e-08, "loss": 0.0, "num_input_tokens_seen": 123252136, "step": 182960 }, { "epoch": 4.469865389783305, "grad_norm": 4.5979195419931784e-05, "learning_rate": 6.771671953915914e-08, "loss": 0.0, "num_input_tokens_seen": 123255784, "step": 182965 }, { "epoch": 4.4699875406151515, "grad_norm": 9.688878344604746e-05, "learning_rate": 6.768587554649286e-08, "loss": 0.0, "num_input_tokens_seen": 123259304, "step": 182970 }, { "epoch": 4.470109691446999, "grad_norm": 0.0017482617404311895, "learning_rate": 6.765503833386566e-08, "loss": 0.0, "num_input_tokens_seen": 123262376, "step": 182975 }, { "epoch": 4.470231842278846, "grad_norm": 8.310518751386553e-05, "learning_rate": 6.76242079015017e-08, "loss": 0.0, "num_input_tokens_seen": 123265384, "step": 182980 }, { "epoch": 4.4703539931106935, "grad_norm": 2.6630152206053026e-05, "learning_rate": 6.759338424962546e-08, "loss": 0.0, "num_input_tokens_seen": 123268904, "step": 182985 }, { "epoch": 4.47047614394254, "grad_norm": 0.00017120575648732483, "learning_rate": 6.756256737846067e-08, "loss": 0.0, "num_input_tokens_seen": 123272040, "step": 182990 }, { "epoch": 4.470598294774387, "grad_norm": 0.0010852214181795716, "learning_rate": 6.753175728823191e-08, "loss": 0.0, "num_input_tokens_seen": 123275624, "step": 182995 }, { "epoch": 4.470720445606235, "grad_norm": 0.00042630190728232265, "learning_rate": 6.750095397916289e-08, "loss": 0.0, "num_input_tokens_seen": 123279144, "step": 183000 }, { "epoch": 4.470842596438081, "grad_norm": 0.000170299899764359, "learning_rate": 6.747015745147777e-08, "loss": 0.0, "num_input_tokens_seen": 123282536, "step": 183005 }, { "epoch": 4.470964747269929, "grad_norm": 0.0007768705254420638, "learning_rate": 6.74393677054006e-08, "loss": 0.0, "num_input_tokens_seen": 123285928, "step": 183010 }, { "epoch": 4.471086898101776, "grad_norm": 0.00016575297922827303, "learning_rate": 6.740858474115496e-08, "loss": 0.0, "num_input_tokens_seen": 123289576, "step": 183015 }, { "epoch": 4.471209048933623, "grad_norm": 0.0007921412470750511, "learning_rate": 6.737780855896513e-08, "loss": 0.0, "num_input_tokens_seen": 123293160, "step": 183020 }, { "epoch": 4.47133119976547, "grad_norm": 8.899014937924221e-05, "learning_rate": 6.73470391590546e-08, "loss": 0.0, "num_input_tokens_seen": 123296808, "step": 183025 }, { "epoch": 4.471453350597318, "grad_norm": 0.0014562405413016677, "learning_rate": 6.73162765416474e-08, "loss": 0.0, "num_input_tokens_seen": 123299752, "step": 183030 }, { "epoch": 4.4715755014291645, "grad_norm": 0.0005221511819399893, "learning_rate": 6.728552070696691e-08, "loss": 0.0, "num_input_tokens_seen": 123303592, "step": 183035 }, { "epoch": 4.471697652261012, "grad_norm": 0.0009294108022004366, "learning_rate": 6.725477165523719e-08, "loss": 0.0, "num_input_tokens_seen": 123307304, "step": 183040 }, { "epoch": 4.471819803092859, "grad_norm": 0.0023763030767440796, "learning_rate": 6.722402938668158e-08, "loss": 0.0, "num_input_tokens_seen": 123310824, "step": 183045 }, { "epoch": 4.4719419539247065, "grad_norm": 0.0003205655375495553, "learning_rate": 6.719329390152361e-08, "loss": 0.0, "num_input_tokens_seen": 123314280, "step": 183050 }, { "epoch": 4.472064104756553, "grad_norm": 3.7294415960786864e-05, "learning_rate": 6.716256519998698e-08, "loss": 0.0, "num_input_tokens_seen": 123318056, "step": 183055 }, { "epoch": 4.472186255588401, "grad_norm": 0.0005499294493347406, "learning_rate": 6.713184328229505e-08, "loss": 0.0, "num_input_tokens_seen": 123321512, "step": 183060 }, { "epoch": 4.472308406420248, "grad_norm": 0.0001556864590384066, "learning_rate": 6.71011281486712e-08, "loss": 0.0, "num_input_tokens_seen": 123324840, "step": 183065 }, { "epoch": 4.472430557252095, "grad_norm": 8.681939652888104e-05, "learning_rate": 6.707041979933903e-08, "loss": 0.0606, "num_input_tokens_seen": 123327976, "step": 183070 }, { "epoch": 4.472552708083942, "grad_norm": 0.006124210078269243, "learning_rate": 6.703971823452149e-08, "loss": 0.0, "num_input_tokens_seen": 123331112, "step": 183075 }, { "epoch": 4.47267485891579, "grad_norm": 0.0030316852498799562, "learning_rate": 6.700902345444226e-08, "loss": 0.0, "num_input_tokens_seen": 123334440, "step": 183080 }, { "epoch": 4.472797009747636, "grad_norm": 0.00030044937739148736, "learning_rate": 6.697833545932419e-08, "loss": 0.0, "num_input_tokens_seen": 123339944, "step": 183085 }, { "epoch": 4.472919160579483, "grad_norm": 0.0003357155655976385, "learning_rate": 6.694765424939075e-08, "loss": 0.0, "num_input_tokens_seen": 123343208, "step": 183090 }, { "epoch": 4.473041311411331, "grad_norm": 0.0010258235270157456, "learning_rate": 6.691697982486478e-08, "loss": 0.0, "num_input_tokens_seen": 123346728, "step": 183095 }, { "epoch": 4.4731634622431775, "grad_norm": 0.000255766324698925, "learning_rate": 6.688631218596951e-08, "loss": 0.0003, "num_input_tokens_seen": 123349992, "step": 183100 }, { "epoch": 4.473285613075025, "grad_norm": 0.00048594220425002277, "learning_rate": 6.685565133292814e-08, "loss": 0.0, "num_input_tokens_seen": 123353064, "step": 183105 }, { "epoch": 4.473407763906872, "grad_norm": 0.01167520321905613, "learning_rate": 6.682499726596336e-08, "loss": 0.0, "num_input_tokens_seen": 123356072, "step": 183110 }, { "epoch": 4.4735299147387195, "grad_norm": 0.009621109813451767, "learning_rate": 6.67943499852982e-08, "loss": 0.0, "num_input_tokens_seen": 123359336, "step": 183115 }, { "epoch": 4.473652065570566, "grad_norm": 0.0003300695971120149, "learning_rate": 6.67637094911555e-08, "loss": 0.0, "num_input_tokens_seen": 123362664, "step": 183120 }, { "epoch": 4.473774216402414, "grad_norm": 0.0014379281783476472, "learning_rate": 6.67330757837582e-08, "loss": 0.0193, "num_input_tokens_seen": 123365992, "step": 183125 }, { "epoch": 4.473896367234261, "grad_norm": 0.0006100056925788522, "learning_rate": 6.670244886332888e-08, "loss": 0.0, "num_input_tokens_seen": 123369128, "step": 183130 }, { "epoch": 4.474018518066108, "grad_norm": 0.00041016406612470746, "learning_rate": 6.667182873009047e-08, "loss": 0.0, "num_input_tokens_seen": 123372648, "step": 183135 }, { "epoch": 4.474140668897955, "grad_norm": 0.27162519097328186, "learning_rate": 6.664121538426548e-08, "loss": 0.0002, "num_input_tokens_seen": 123376168, "step": 183140 }, { "epoch": 4.474262819729803, "grad_norm": 0.00015165298827923834, "learning_rate": 6.661060882607672e-08, "loss": 0.0, "num_input_tokens_seen": 123379880, "step": 183145 }, { "epoch": 4.474384970561649, "grad_norm": 8.916952356230468e-05, "learning_rate": 6.658000905574657e-08, "loss": 0.0, "num_input_tokens_seen": 123383208, "step": 183150 }, { "epoch": 4.474507121393497, "grad_norm": 5.2944047638447955e-05, "learning_rate": 6.654941607349773e-08, "loss": 0.0, "num_input_tokens_seen": 123386088, "step": 183155 }, { "epoch": 4.474629272225344, "grad_norm": 0.004181950818747282, "learning_rate": 6.651882987955249e-08, "loss": 0.0, "num_input_tokens_seen": 123389288, "step": 183160 }, { "epoch": 4.474751423057191, "grad_norm": 0.0015895121032372117, "learning_rate": 6.648825047413353e-08, "loss": 0.0, "num_input_tokens_seen": 123392488, "step": 183165 }, { "epoch": 4.474873573889038, "grad_norm": 0.0012971092946827412, "learning_rate": 6.645767785746292e-08, "loss": 0.0, "num_input_tokens_seen": 123395944, "step": 183170 }, { "epoch": 4.474995724720886, "grad_norm": 0.0003645285905804485, "learning_rate": 6.642711202976336e-08, "loss": 0.0, "num_input_tokens_seen": 123399400, "step": 183175 }, { "epoch": 4.4751178755527325, "grad_norm": 0.0002894159115385264, "learning_rate": 6.63965529912568e-08, "loss": 0.0, "num_input_tokens_seen": 123402920, "step": 183180 }, { "epoch": 4.475240026384579, "grad_norm": 0.00044401903869584203, "learning_rate": 6.636600074216569e-08, "loss": 0.0, "num_input_tokens_seen": 123406248, "step": 183185 }, { "epoch": 4.475362177216427, "grad_norm": 0.001973674399778247, "learning_rate": 6.633545528271211e-08, "loss": 0.0, "num_input_tokens_seen": 123410024, "step": 183190 }, { "epoch": 4.475484328048274, "grad_norm": 0.0008897829684428871, "learning_rate": 6.63049166131181e-08, "loss": 0.0399, "num_input_tokens_seen": 123413608, "step": 183195 }, { "epoch": 4.475606478880121, "grad_norm": 0.00013381907774601132, "learning_rate": 6.627438473360614e-08, "loss": 0.0, "num_input_tokens_seen": 123416552, "step": 183200 }, { "epoch": 4.475728629711968, "grad_norm": 0.001655498635955155, "learning_rate": 6.624385964439782e-08, "loss": 0.0, "num_input_tokens_seen": 123419752, "step": 183205 }, { "epoch": 4.475850780543816, "grad_norm": 8.101667481241748e-05, "learning_rate": 6.621334134571543e-08, "loss": 0.0, "num_input_tokens_seen": 123423144, "step": 183210 }, { "epoch": 4.475972931375662, "grad_norm": 0.0003639574279077351, "learning_rate": 6.618282983778056e-08, "loss": 0.0, "num_input_tokens_seen": 123426536, "step": 183215 }, { "epoch": 4.47609508220751, "grad_norm": 0.00012090370728401467, "learning_rate": 6.615232512081536e-08, "loss": 0.0, "num_input_tokens_seen": 123430184, "step": 183220 }, { "epoch": 4.476217233039357, "grad_norm": 2.0068329831701703e-05, "learning_rate": 6.612182719504189e-08, "loss": 0.0, "num_input_tokens_seen": 123433512, "step": 183225 }, { "epoch": 4.4763393838712044, "grad_norm": 0.00010261212446494028, "learning_rate": 6.609133606068141e-08, "loss": 0.0, "num_input_tokens_seen": 123436584, "step": 183230 }, { "epoch": 4.476461534703051, "grad_norm": 0.0005591385997831821, "learning_rate": 6.606085171795606e-08, "loss": 0.0, "num_input_tokens_seen": 123440040, "step": 183235 }, { "epoch": 4.476583685534899, "grad_norm": 0.0006489804945886135, "learning_rate": 6.603037416708734e-08, "loss": 0.0007, "num_input_tokens_seen": 123443368, "step": 183240 }, { "epoch": 4.476705836366746, "grad_norm": 0.0007108654244802892, "learning_rate": 6.599990340829697e-08, "loss": 0.0, "num_input_tokens_seen": 123446760, "step": 183245 }, { "epoch": 4.476827987198593, "grad_norm": 0.0032975731883198023, "learning_rate": 6.596943944180666e-08, "loss": 0.0, "num_input_tokens_seen": 123450408, "step": 183250 }, { "epoch": 4.47695013803044, "grad_norm": 0.0004452765570022166, "learning_rate": 6.593898226783757e-08, "loss": 0.0, "num_input_tokens_seen": 123453736, "step": 183255 }, { "epoch": 4.477072288862287, "grad_norm": 2.435279020573944e-05, "learning_rate": 6.590853188661161e-08, "loss": 0.0, "num_input_tokens_seen": 123457064, "step": 183260 }, { "epoch": 4.477194439694134, "grad_norm": 0.03045794926583767, "learning_rate": 6.587808829834984e-08, "loss": 0.0, "num_input_tokens_seen": 123460712, "step": 183265 }, { "epoch": 4.477316590525981, "grad_norm": 0.00014943921996746212, "learning_rate": 6.584765150327409e-08, "loss": 0.0, "num_input_tokens_seen": 123463848, "step": 183270 }, { "epoch": 4.477438741357829, "grad_norm": 0.0033560895826667547, "learning_rate": 6.58172215016053e-08, "loss": 0.0, "num_input_tokens_seen": 123467176, "step": 183275 }, { "epoch": 4.4775608921896755, "grad_norm": 0.00017658942670095712, "learning_rate": 6.578679829356514e-08, "loss": 0.0, "num_input_tokens_seen": 123470440, "step": 183280 }, { "epoch": 4.477683043021523, "grad_norm": 0.00010721410217229277, "learning_rate": 6.575638187937437e-08, "loss": 0.0, "num_input_tokens_seen": 123473832, "step": 183285 }, { "epoch": 4.47780519385337, "grad_norm": 0.019060831516981125, "learning_rate": 6.572597225925458e-08, "loss": 0.0, "num_input_tokens_seen": 123477928, "step": 183290 }, { "epoch": 4.4779273446852175, "grad_norm": 0.0005212004762142897, "learning_rate": 6.569556943342691e-08, "loss": 0.0, "num_input_tokens_seen": 123481512, "step": 183295 }, { "epoch": 4.478049495517064, "grad_norm": 3.1600420475006104, "learning_rate": 6.56651734021123e-08, "loss": 0.0009, "num_input_tokens_seen": 123485352, "step": 183300 }, { "epoch": 4.478171646348912, "grad_norm": 0.0009134397841989994, "learning_rate": 6.563478416553192e-08, "loss": 0.0, "num_input_tokens_seen": 123488232, "step": 183305 }, { "epoch": 4.478293797180759, "grad_norm": 0.0004978424985893071, "learning_rate": 6.560440172390658e-08, "loss": 0.0, "num_input_tokens_seen": 123491688, "step": 183310 }, { "epoch": 4.478415948012606, "grad_norm": 0.00014363221998792142, "learning_rate": 6.557402607745733e-08, "loss": 0.0, "num_input_tokens_seen": 123494888, "step": 183315 }, { "epoch": 4.478538098844453, "grad_norm": 0.0022859908640384674, "learning_rate": 6.55436572264052e-08, "loss": 0.0, "num_input_tokens_seen": 123498344, "step": 183320 }, { "epoch": 4.478660249676301, "grad_norm": 0.000360700098099187, "learning_rate": 6.551329517097092e-08, "loss": 0.0, "num_input_tokens_seen": 123501800, "step": 183325 }, { "epoch": 4.478782400508147, "grad_norm": 0.0004340151499491185, "learning_rate": 6.548293991137533e-08, "loss": 0.0, "num_input_tokens_seen": 123505448, "step": 183330 }, { "epoch": 4.478904551339995, "grad_norm": 0.0074119605123996735, "learning_rate": 6.54525914478391e-08, "loss": 0.0, "num_input_tokens_seen": 123508520, "step": 183335 }, { "epoch": 4.479026702171842, "grad_norm": 0.00487268203869462, "learning_rate": 6.542224978058309e-08, "loss": 0.0, "num_input_tokens_seen": 123511912, "step": 183340 }, { "epoch": 4.479148853003689, "grad_norm": 0.0007920575444586575, "learning_rate": 6.539191490982766e-08, "loss": 0.0, "num_input_tokens_seen": 123514984, "step": 183345 }, { "epoch": 4.479271003835536, "grad_norm": 0.0023543655406683683, "learning_rate": 6.536158683579374e-08, "loss": 0.0, "num_input_tokens_seen": 123517928, "step": 183350 }, { "epoch": 4.479393154667383, "grad_norm": 0.00011763614020310342, "learning_rate": 6.533126555870183e-08, "loss": 0.0001, "num_input_tokens_seen": 123521192, "step": 183355 }, { "epoch": 4.4795153054992305, "grad_norm": 3.650221333373338e-05, "learning_rate": 6.53009510787722e-08, "loss": 0.0, "num_input_tokens_seen": 123524264, "step": 183360 }, { "epoch": 4.479637456331077, "grad_norm": 0.07241086661815643, "learning_rate": 6.527064339622557e-08, "loss": 0.0, "num_input_tokens_seen": 123527464, "step": 183365 }, { "epoch": 4.479759607162925, "grad_norm": 2.090701127599459e-05, "learning_rate": 6.524034251128207e-08, "loss": 0.0, "num_input_tokens_seen": 123530984, "step": 183370 }, { "epoch": 4.479881757994772, "grad_norm": 0.047290850430727005, "learning_rate": 6.521004842416222e-08, "loss": 0.0, "num_input_tokens_seen": 123534376, "step": 183375 }, { "epoch": 4.480003908826619, "grad_norm": 0.007085281889885664, "learning_rate": 6.51797611350865e-08, "loss": 0.0, "num_input_tokens_seen": 123538024, "step": 183380 }, { "epoch": 4.480126059658466, "grad_norm": 0.00010538164497120306, "learning_rate": 6.514948064427484e-08, "loss": 0.0, "num_input_tokens_seen": 123541352, "step": 183385 }, { "epoch": 4.480248210490314, "grad_norm": 0.00016254704678431153, "learning_rate": 6.511920695194783e-08, "loss": 0.0, "num_input_tokens_seen": 123544552, "step": 183390 }, { "epoch": 4.48037036132216, "grad_norm": 4.432534842635505e-05, "learning_rate": 6.50889400583251e-08, "loss": 0.0, "num_input_tokens_seen": 123548072, "step": 183395 }, { "epoch": 4.480492512154008, "grad_norm": 9.711547318147495e-05, "learning_rate": 6.505867996362735e-08, "loss": 0.0, "num_input_tokens_seen": 123551656, "step": 183400 }, { "epoch": 4.480614662985855, "grad_norm": 0.0009047918720170856, "learning_rate": 6.502842666807406e-08, "loss": 0.0, "num_input_tokens_seen": 123555048, "step": 183405 }, { "epoch": 4.480736813817702, "grad_norm": 0.00018528725195210427, "learning_rate": 6.499818017188563e-08, "loss": 0.05, "num_input_tokens_seen": 123558184, "step": 183410 }, { "epoch": 4.480858964649549, "grad_norm": 7.544342952314764e-05, "learning_rate": 6.49679404752821e-08, "loss": 0.0, "num_input_tokens_seen": 123561512, "step": 183415 }, { "epoch": 4.480981115481397, "grad_norm": 0.0016681732377037406, "learning_rate": 6.493770757848294e-08, "loss": 0.0, "num_input_tokens_seen": 123565288, "step": 183420 }, { "epoch": 4.4811032663132435, "grad_norm": 0.0003314499044790864, "learning_rate": 6.490748148170844e-08, "loss": 0.0, "num_input_tokens_seen": 123568552, "step": 183425 }, { "epoch": 4.481225417145091, "grad_norm": 0.00010013279097620398, "learning_rate": 6.48772621851782e-08, "loss": 0.0, "num_input_tokens_seen": 123572136, "step": 183430 }, { "epoch": 4.481347567976938, "grad_norm": 0.001971852732822299, "learning_rate": 6.484704968911204e-08, "loss": 0.0002, "num_input_tokens_seen": 123575464, "step": 183435 }, { "epoch": 4.4814697188087855, "grad_norm": 0.0005752836004830897, "learning_rate": 6.481684399372955e-08, "loss": 0.0, "num_input_tokens_seen": 123579176, "step": 183440 }, { "epoch": 4.481591869640632, "grad_norm": 5.8873403759207577e-05, "learning_rate": 6.47866450992507e-08, "loss": 0.0, "num_input_tokens_seen": 123582504, "step": 183445 }, { "epoch": 4.481714020472479, "grad_norm": 0.0018522852333262563, "learning_rate": 6.475645300589472e-08, "loss": 0.0, "num_input_tokens_seen": 123585832, "step": 183450 }, { "epoch": 4.481836171304327, "grad_norm": 0.0020564633887261152, "learning_rate": 6.472626771388156e-08, "loss": 0.0, "num_input_tokens_seen": 123589352, "step": 183455 }, { "epoch": 4.481958322136173, "grad_norm": 0.003389239078387618, "learning_rate": 6.46960892234304e-08, "loss": 0.0005, "num_input_tokens_seen": 123592808, "step": 183460 }, { "epoch": 4.482080472968021, "grad_norm": 7.917655602796003e-05, "learning_rate": 6.466591753476092e-08, "loss": 0.0, "num_input_tokens_seen": 123596008, "step": 183465 }, { "epoch": 4.482202623799868, "grad_norm": 0.00023252793471328914, "learning_rate": 6.463575264809229e-08, "loss": 0.0, "num_input_tokens_seen": 123599592, "step": 183470 }, { "epoch": 4.482324774631715, "grad_norm": 6.5754255047068e-05, "learning_rate": 6.460559456364434e-08, "loss": 0.0, "num_input_tokens_seen": 123603048, "step": 183475 }, { "epoch": 4.482446925463562, "grad_norm": 0.00039355934131890535, "learning_rate": 6.457544328163578e-08, "loss": 0.0, "num_input_tokens_seen": 123606120, "step": 183480 }, { "epoch": 4.48256907629541, "grad_norm": 0.00019612463074736297, "learning_rate": 6.454529880228643e-08, "loss": 0.0, "num_input_tokens_seen": 123609256, "step": 183485 }, { "epoch": 4.4826912271272565, "grad_norm": 0.0003407395852264017, "learning_rate": 6.451516112581512e-08, "loss": 0.0, "num_input_tokens_seen": 123612520, "step": 183490 }, { "epoch": 4.482813377959104, "grad_norm": 2.2714593796990812e-05, "learning_rate": 6.448503025244134e-08, "loss": 0.0, "num_input_tokens_seen": 123615912, "step": 183495 }, { "epoch": 4.482935528790951, "grad_norm": 1.2123967280786019e-05, "learning_rate": 6.445490618238392e-08, "loss": 0.0, "num_input_tokens_seen": 123619176, "step": 183500 }, { "epoch": 4.4830576796227986, "grad_norm": 0.0008833124884404242, "learning_rate": 6.4424788915862e-08, "loss": 0.0, "num_input_tokens_seen": 123622696, "step": 183505 }, { "epoch": 4.483179830454645, "grad_norm": 0.0001925179094541818, "learning_rate": 6.439467845309488e-08, "loss": 0.0, "num_input_tokens_seen": 123625960, "step": 183510 }, { "epoch": 4.483301981286493, "grad_norm": 0.0018321550451219082, "learning_rate": 6.436457479430101e-08, "loss": 0.0, "num_input_tokens_seen": 123629032, "step": 183515 }, { "epoch": 4.48342413211834, "grad_norm": 0.0010063842637464404, "learning_rate": 6.433447793969982e-08, "loss": 0.0, "num_input_tokens_seen": 123632488, "step": 183520 }, { "epoch": 4.483546282950186, "grad_norm": 0.00020096627122256905, "learning_rate": 6.430438788950987e-08, "loss": 0.0, "num_input_tokens_seen": 123635944, "step": 183525 }, { "epoch": 4.483668433782034, "grad_norm": 0.015927936881780624, "learning_rate": 6.427430464395022e-08, "loss": 0.0, "num_input_tokens_seen": 123639464, "step": 183530 }, { "epoch": 4.483790584613882, "grad_norm": 0.0010257528629153967, "learning_rate": 6.424422820323938e-08, "loss": 0.0001, "num_input_tokens_seen": 123643304, "step": 183535 }, { "epoch": 4.483912735445728, "grad_norm": 0.0001770484377630055, "learning_rate": 6.421415856759616e-08, "loss": 0.0, "num_input_tokens_seen": 123646952, "step": 183540 }, { "epoch": 4.484034886277575, "grad_norm": 0.0002756124595180154, "learning_rate": 6.418409573723937e-08, "loss": 0.0, "num_input_tokens_seen": 123650280, "step": 183545 }, { "epoch": 4.484157037109423, "grad_norm": 5.871909888810478e-05, "learning_rate": 6.415403971238741e-08, "loss": 0.0, "num_input_tokens_seen": 123653352, "step": 183550 }, { "epoch": 4.48427918794127, "grad_norm": 0.0005751706194132566, "learning_rate": 6.412399049325922e-08, "loss": 0.0005, "num_input_tokens_seen": 123656552, "step": 183555 }, { "epoch": 4.484401338773117, "grad_norm": 0.00021047874179203063, "learning_rate": 6.409394808007307e-08, "loss": 0.0, "num_input_tokens_seen": 123659752, "step": 183560 }, { "epoch": 4.484523489604964, "grad_norm": 7.142380491131917e-05, "learning_rate": 6.406391247304732e-08, "loss": 0.0, "num_input_tokens_seen": 123663592, "step": 183565 }, { "epoch": 4.484645640436812, "grad_norm": 4.248780169291422e-05, "learning_rate": 6.403388367240059e-08, "loss": 0.0, "num_input_tokens_seen": 123666728, "step": 183570 }, { "epoch": 4.484767791268658, "grad_norm": 0.001250158529728651, "learning_rate": 6.400386167835115e-08, "loss": 0.0, "num_input_tokens_seen": 123669992, "step": 183575 }, { "epoch": 4.484889942100506, "grad_norm": 0.001813249895349145, "learning_rate": 6.397384649111748e-08, "loss": 0.0, "num_input_tokens_seen": 123673640, "step": 183580 }, { "epoch": 4.485012092932353, "grad_norm": 0.0007400148897431791, "learning_rate": 6.394383811091764e-08, "loss": 0.0, "num_input_tokens_seen": 123677032, "step": 183585 }, { "epoch": 4.4851342437642, "grad_norm": 8.471470209769905e-05, "learning_rate": 6.39138365379701e-08, "loss": 0.0, "num_input_tokens_seen": 123680168, "step": 183590 }, { "epoch": 4.485256394596047, "grad_norm": 0.00019484623044263572, "learning_rate": 6.388384177249273e-08, "loss": 0.0, "num_input_tokens_seen": 123683432, "step": 183595 }, { "epoch": 4.485378545427895, "grad_norm": 7.977020868565887e-05, "learning_rate": 6.385385381470388e-08, "loss": 0.0, "num_input_tokens_seen": 123686888, "step": 183600 }, { "epoch": 4.4855006962597415, "grad_norm": 0.00033194624120369554, "learning_rate": 6.382387266482182e-08, "loss": 0.0002, "num_input_tokens_seen": 123690280, "step": 183605 }, { "epoch": 4.485622847091589, "grad_norm": 0.00013592727191280574, "learning_rate": 6.379389832306415e-08, "loss": 0.0, "num_input_tokens_seen": 123693608, "step": 183610 }, { "epoch": 4.485744997923436, "grad_norm": 0.23284316062927246, "learning_rate": 6.376393078964915e-08, "loss": 0.0, "num_input_tokens_seen": 123696936, "step": 183615 }, { "epoch": 4.485867148755283, "grad_norm": 0.00017789828416425735, "learning_rate": 6.373397006479464e-08, "loss": 0.0, "num_input_tokens_seen": 123700072, "step": 183620 }, { "epoch": 4.48598929958713, "grad_norm": 0.0004446000966709107, "learning_rate": 6.370401614871845e-08, "loss": 0.0, "num_input_tokens_seen": 123703336, "step": 183625 }, { "epoch": 4.486111450418977, "grad_norm": 6.071280949981883e-05, "learning_rate": 6.367406904163863e-08, "loss": 0.0, "num_input_tokens_seen": 123706280, "step": 183630 }, { "epoch": 4.486233601250825, "grad_norm": 0.000198695546714589, "learning_rate": 6.364412874377267e-08, "loss": 0.0, "num_input_tokens_seen": 123709672, "step": 183635 }, { "epoch": 4.486355752082671, "grad_norm": 0.00022004038328304887, "learning_rate": 6.361419525533862e-08, "loss": 0.0, "num_input_tokens_seen": 123712680, "step": 183640 }, { "epoch": 4.486477902914519, "grad_norm": 9.493048855802044e-05, "learning_rate": 6.358426857655386e-08, "loss": 0.0, "num_input_tokens_seen": 123715752, "step": 183645 }, { "epoch": 4.486600053746366, "grad_norm": 0.0011744924122467637, "learning_rate": 6.355434870763632e-08, "loss": 0.0, "num_input_tokens_seen": 123719400, "step": 183650 }, { "epoch": 4.486722204578213, "grad_norm": 0.002865522401407361, "learning_rate": 6.352443564880328e-08, "loss": 0.0, "num_input_tokens_seen": 123722344, "step": 183655 }, { "epoch": 4.48684435541006, "grad_norm": 0.0005735221202485263, "learning_rate": 6.349452940027255e-08, "loss": 0.0, "num_input_tokens_seen": 123725864, "step": 183660 }, { "epoch": 4.486966506241908, "grad_norm": 0.003035858040675521, "learning_rate": 6.346462996226155e-08, "loss": 0.0, "num_input_tokens_seen": 123729512, "step": 183665 }, { "epoch": 4.4870886570737545, "grad_norm": 1.2980992323718965e-05, "learning_rate": 6.343473733498739e-08, "loss": 0.0, "num_input_tokens_seen": 123732904, "step": 183670 }, { "epoch": 4.487210807905602, "grad_norm": 1.0696382560126949e-05, "learning_rate": 6.340485151866793e-08, "loss": 0.0, "num_input_tokens_seen": 123736168, "step": 183675 }, { "epoch": 4.487332958737449, "grad_norm": 0.0003565715451259166, "learning_rate": 6.33749725135202e-08, "loss": 0.0, "num_input_tokens_seen": 123739240, "step": 183680 }, { "epoch": 4.4874551095692965, "grad_norm": 0.0007983737159520388, "learning_rate": 6.33451003197617e-08, "loss": 0.0, "num_input_tokens_seen": 123742504, "step": 183685 }, { "epoch": 4.487577260401143, "grad_norm": 0.014135781675577164, "learning_rate": 6.331523493760949e-08, "loss": 0.0, "num_input_tokens_seen": 123745896, "step": 183690 }, { "epoch": 4.487699411232991, "grad_norm": 0.00018134378478862345, "learning_rate": 6.328537636728071e-08, "loss": 0.0, "num_input_tokens_seen": 123749224, "step": 183695 }, { "epoch": 4.487821562064838, "grad_norm": 0.00032579965773038566, "learning_rate": 6.325552460899285e-08, "loss": 0.0, "num_input_tokens_seen": 123752360, "step": 183700 }, { "epoch": 4.487943712896685, "grad_norm": 0.0001060232098097913, "learning_rate": 6.322567966296255e-08, "loss": 0.0, "num_input_tokens_seen": 123755752, "step": 183705 }, { "epoch": 4.488065863728532, "grad_norm": 0.0027526814956218004, "learning_rate": 6.319584152940726e-08, "loss": 0.0, "num_input_tokens_seen": 123759528, "step": 183710 }, { "epoch": 4.488188014560379, "grad_norm": 0.0005101757124066353, "learning_rate": 6.316601020854362e-08, "loss": 0.0, "num_input_tokens_seen": 123762984, "step": 183715 }, { "epoch": 4.488310165392226, "grad_norm": 8.717154560144991e-05, "learning_rate": 6.313618570058876e-08, "loss": 0.0, "num_input_tokens_seen": 123766120, "step": 183720 }, { "epoch": 4.488432316224073, "grad_norm": 0.00010378050501458347, "learning_rate": 6.310636800575975e-08, "loss": 0.0, "num_input_tokens_seen": 123770152, "step": 183725 }, { "epoch": 4.488554467055921, "grad_norm": 1.9653307390399277e-05, "learning_rate": 6.307655712427295e-08, "loss": 0.0, "num_input_tokens_seen": 123773288, "step": 183730 }, { "epoch": 4.4886766178877675, "grad_norm": 2.4535993361496367e-05, "learning_rate": 6.304675305634566e-08, "loss": 0.0, "num_input_tokens_seen": 123776552, "step": 183735 }, { "epoch": 4.488798768719615, "grad_norm": 0.0008791940053924918, "learning_rate": 6.301695580219424e-08, "loss": 0.0, "num_input_tokens_seen": 123779688, "step": 183740 }, { "epoch": 4.488920919551462, "grad_norm": 0.00033614938729442656, "learning_rate": 6.298716536203563e-08, "loss": 0.0, "num_input_tokens_seen": 123783208, "step": 183745 }, { "epoch": 4.4890430703833095, "grad_norm": 6.0099537222413346e-05, "learning_rate": 6.295738173608622e-08, "loss": 0.0, "num_input_tokens_seen": 123786344, "step": 183750 }, { "epoch": 4.489165221215156, "grad_norm": 0.00011174487008247524, "learning_rate": 6.292760492456284e-08, "loss": 0.0, "num_input_tokens_seen": 123790184, "step": 183755 }, { "epoch": 4.489287372047004, "grad_norm": 0.006128856446594, "learning_rate": 6.289783492768208e-08, "loss": 0.0, "num_input_tokens_seen": 123793320, "step": 183760 }, { "epoch": 4.489409522878851, "grad_norm": 0.004429202992469072, "learning_rate": 6.286807174566033e-08, "loss": 0.0, "num_input_tokens_seen": 123796584, "step": 183765 }, { "epoch": 4.489531673710698, "grad_norm": 0.0015801249537616968, "learning_rate": 6.283831537871375e-08, "loss": 0.0, "num_input_tokens_seen": 123799528, "step": 183770 }, { "epoch": 4.489653824542545, "grad_norm": 0.0002338515332667157, "learning_rate": 6.280856582705929e-08, "loss": 0.0, "num_input_tokens_seen": 123802984, "step": 183775 }, { "epoch": 4.489775975374393, "grad_norm": 0.00029317050939425826, "learning_rate": 6.277882309091287e-08, "loss": 0.0, "num_input_tokens_seen": 123806184, "step": 183780 }, { "epoch": 4.489898126206239, "grad_norm": 0.021793987601995468, "learning_rate": 6.274908717049099e-08, "loss": 0.0, "num_input_tokens_seen": 123809384, "step": 183785 }, { "epoch": 4.490020277038087, "grad_norm": 0.0002981332945637405, "learning_rate": 6.27193580660098e-08, "loss": 0.0, "num_input_tokens_seen": 123812520, "step": 183790 }, { "epoch": 4.490142427869934, "grad_norm": 0.000523839786183089, "learning_rate": 6.26896357776856e-08, "loss": 0.0, "num_input_tokens_seen": 123815912, "step": 183795 }, { "epoch": 4.490264578701781, "grad_norm": 0.0005471837939694524, "learning_rate": 6.26599203057343e-08, "loss": 0.0, "num_input_tokens_seen": 123819240, "step": 183800 }, { "epoch": 4.490386729533628, "grad_norm": 0.0008639732259325683, "learning_rate": 6.263021165037241e-08, "loss": 0.0, "num_input_tokens_seen": 123822504, "step": 183805 }, { "epoch": 4.490508880365475, "grad_norm": 0.00019813806284219027, "learning_rate": 6.260050981181553e-08, "loss": 0.0, "num_input_tokens_seen": 123825832, "step": 183810 }, { "epoch": 4.4906310311973225, "grad_norm": 8.793376036919653e-05, "learning_rate": 6.257081479027993e-08, "loss": 0.0, "num_input_tokens_seen": 123829160, "step": 183815 }, { "epoch": 4.490753182029169, "grad_norm": 0.00021990187815390527, "learning_rate": 6.254112658598165e-08, "loss": 0.0, "num_input_tokens_seen": 123832296, "step": 183820 }, { "epoch": 4.490875332861017, "grad_norm": 0.00017110860790126026, "learning_rate": 6.25114451991362e-08, "loss": 0.0, "num_input_tokens_seen": 123836072, "step": 183825 }, { "epoch": 4.490997483692864, "grad_norm": 0.00019226314907427877, "learning_rate": 6.248177062995997e-08, "loss": 0.0, "num_input_tokens_seen": 123839144, "step": 183830 }, { "epoch": 4.491119634524711, "grad_norm": 0.0013531792210415006, "learning_rate": 6.245210287866821e-08, "loss": 0.0, "num_input_tokens_seen": 123842408, "step": 183835 }, { "epoch": 4.491241785356558, "grad_norm": 0.00011543887376319617, "learning_rate": 6.242244194547707e-08, "loss": 0.0, "num_input_tokens_seen": 123846440, "step": 183840 }, { "epoch": 4.491363936188406, "grad_norm": 0.007675806060433388, "learning_rate": 6.239278783060198e-08, "loss": 0.0, "num_input_tokens_seen": 123849448, "step": 183845 }, { "epoch": 4.491486087020252, "grad_norm": 0.0008180846925824881, "learning_rate": 6.236314053425873e-08, "loss": 0.0, "num_input_tokens_seen": 123853288, "step": 183850 }, { "epoch": 4.4916082378521, "grad_norm": 0.00017951830523088574, "learning_rate": 6.233350005666304e-08, "loss": 0.0, "num_input_tokens_seen": 123856872, "step": 183855 }, { "epoch": 4.491730388683947, "grad_norm": 0.001047560479491949, "learning_rate": 6.230386639803031e-08, "loss": 0.0, "num_input_tokens_seen": 123859816, "step": 183860 }, { "epoch": 4.491852539515794, "grad_norm": 0.0005243326304480433, "learning_rate": 6.227423955857614e-08, "loss": 0.0, "num_input_tokens_seen": 123863528, "step": 183865 }, { "epoch": 4.491974690347641, "grad_norm": 27.46204376220703, "learning_rate": 6.22446195385159e-08, "loss": 0.0354, "num_input_tokens_seen": 123867112, "step": 183870 }, { "epoch": 4.492096841179489, "grad_norm": 0.002453556749969721, "learning_rate": 6.221500633806487e-08, "loss": 0.0, "num_input_tokens_seen": 123870760, "step": 183875 }, { "epoch": 4.492218992011336, "grad_norm": 3.372270384716103e-06, "learning_rate": 6.218539995743865e-08, "loss": 0.0, "num_input_tokens_seen": 123873768, "step": 183880 }, { "epoch": 4.492341142843182, "grad_norm": 0.04905255511403084, "learning_rate": 6.215580039685243e-08, "loss": 0.0, "num_input_tokens_seen": 123876776, "step": 183885 }, { "epoch": 4.49246329367503, "grad_norm": 0.00012224094825796783, "learning_rate": 6.212620765652155e-08, "loss": 0.0, "num_input_tokens_seen": 123880040, "step": 183890 }, { "epoch": 4.492585444506877, "grad_norm": 1.0237623428110965e-05, "learning_rate": 6.209662173666097e-08, "loss": 0.0, "num_input_tokens_seen": 123883560, "step": 183895 }, { "epoch": 4.492707595338724, "grad_norm": 1.5369896573247388e-05, "learning_rate": 6.206704263748618e-08, "loss": 0.0, "num_input_tokens_seen": 123886952, "step": 183900 }, { "epoch": 4.492829746170571, "grad_norm": 0.00015546464419458061, "learning_rate": 6.203747035921192e-08, "loss": 0.0, "num_input_tokens_seen": 123889960, "step": 183905 }, { "epoch": 4.492951897002419, "grad_norm": 0.0012587353121489286, "learning_rate": 6.200790490205354e-08, "loss": 0.0, "num_input_tokens_seen": 123893480, "step": 183910 }, { "epoch": 4.4930740478342654, "grad_norm": 0.0001071410661097616, "learning_rate": 6.197834626622611e-08, "loss": 0.0, "num_input_tokens_seen": 123896616, "step": 183915 }, { "epoch": 4.493196198666113, "grad_norm": 0.0003027370839845389, "learning_rate": 6.194879445194434e-08, "loss": 0.0, "num_input_tokens_seen": 123899944, "step": 183920 }, { "epoch": 4.49331834949796, "grad_norm": 1.3733631021750625e-05, "learning_rate": 6.191924945942329e-08, "loss": 0.0, "num_input_tokens_seen": 123903400, "step": 183925 }, { "epoch": 4.4934405003298075, "grad_norm": 0.0011689336970448494, "learning_rate": 6.188971128887777e-08, "loss": 0.0, "num_input_tokens_seen": 123907304, "step": 183930 }, { "epoch": 4.493562651161654, "grad_norm": 0.00024081453739199787, "learning_rate": 6.18601799405224e-08, "loss": 0.0, "num_input_tokens_seen": 123910568, "step": 183935 }, { "epoch": 4.493684801993502, "grad_norm": 0.0007175906794145703, "learning_rate": 6.183065541457244e-08, "loss": 0.0, "num_input_tokens_seen": 123915112, "step": 183940 }, { "epoch": 4.493806952825349, "grad_norm": 0.013144269585609436, "learning_rate": 6.180113771124207e-08, "loss": 0.0, "num_input_tokens_seen": 123918632, "step": 183945 }, { "epoch": 4.493929103657196, "grad_norm": 0.00019422074547037482, "learning_rate": 6.177162683074633e-08, "loss": 0.0, "num_input_tokens_seen": 123921896, "step": 183950 }, { "epoch": 4.494051254489043, "grad_norm": 2.715620576054789e-05, "learning_rate": 6.174212277329949e-08, "loss": 0.0, "num_input_tokens_seen": 123925352, "step": 183955 }, { "epoch": 4.494173405320891, "grad_norm": 0.00022808015637565404, "learning_rate": 6.17126255391165e-08, "loss": 0.0, "num_input_tokens_seen": 123928872, "step": 183960 }, { "epoch": 4.494295556152737, "grad_norm": 0.06994034349918365, "learning_rate": 6.16831351284115e-08, "loss": 0.0001, "num_input_tokens_seen": 123932520, "step": 183965 }, { "epoch": 4.494417706984585, "grad_norm": 0.030225621536374092, "learning_rate": 6.165365154139924e-08, "loss": 0.0394, "num_input_tokens_seen": 123935848, "step": 183970 }, { "epoch": 4.494539857816432, "grad_norm": 8.776846516411752e-05, "learning_rate": 6.162417477829407e-08, "loss": 0.0, "num_input_tokens_seen": 123938984, "step": 183975 }, { "epoch": 4.4946620086482785, "grad_norm": 7.617447408847511e-05, "learning_rate": 6.159470483931006e-08, "loss": 0.0863, "num_input_tokens_seen": 123942632, "step": 183980 }, { "epoch": 4.494784159480126, "grad_norm": 4.946303670294583e-05, "learning_rate": 6.156524172466194e-08, "loss": 0.0, "num_input_tokens_seen": 123945832, "step": 183985 }, { "epoch": 4.494906310311973, "grad_norm": 0.00012827737373299897, "learning_rate": 6.153578543456361e-08, "loss": 0.029, "num_input_tokens_seen": 123949416, "step": 183990 }, { "epoch": 4.4950284611438205, "grad_norm": 6.737570947734639e-05, "learning_rate": 6.150633596922971e-08, "loss": 0.0568, "num_input_tokens_seen": 123952936, "step": 183995 }, { "epoch": 4.495150611975667, "grad_norm": 0.001037980429828167, "learning_rate": 6.147689332887396e-08, "loss": 0.0, "num_input_tokens_seen": 123956200, "step": 184000 }, { "epoch": 4.495272762807515, "grad_norm": 0.0002838200598489493, "learning_rate": 6.14474575137106e-08, "loss": 0.0, "num_input_tokens_seen": 123959400, "step": 184005 }, { "epoch": 4.495394913639362, "grad_norm": 0.00040364160668104887, "learning_rate": 6.141802852395406e-08, "loss": 0.0, "num_input_tokens_seen": 123962856, "step": 184010 }, { "epoch": 4.495517064471209, "grad_norm": 0.005427889991551638, "learning_rate": 6.138860635981779e-08, "loss": 0.0, "num_input_tokens_seen": 123966184, "step": 184015 }, { "epoch": 4.495639215303056, "grad_norm": 0.007369643542915583, "learning_rate": 6.135919102151632e-08, "loss": 0.0, "num_input_tokens_seen": 123969448, "step": 184020 }, { "epoch": 4.495761366134904, "grad_norm": 2.715495793381706e-05, "learning_rate": 6.132978250926302e-08, "loss": 0.0, "num_input_tokens_seen": 123973032, "step": 184025 }, { "epoch": 4.49588351696675, "grad_norm": 0.008515347726643085, "learning_rate": 6.130038082327205e-08, "loss": 0.0, "num_input_tokens_seen": 123976744, "step": 184030 }, { "epoch": 4.496005667798598, "grad_norm": 0.0004892494180239737, "learning_rate": 6.127098596375724e-08, "loss": 0.0, "num_input_tokens_seen": 123979880, "step": 184035 }, { "epoch": 4.496127818630445, "grad_norm": 0.0005637376452796161, "learning_rate": 6.124159793093231e-08, "loss": 0.0, "num_input_tokens_seen": 123983336, "step": 184040 }, { "epoch": 4.496249969462292, "grad_norm": 3.558983053153497e-06, "learning_rate": 6.121221672501108e-08, "loss": 0.0, "num_input_tokens_seen": 123986408, "step": 184045 }, { "epoch": 4.496372120294139, "grad_norm": 10.293907165527344, "learning_rate": 6.118284234620686e-08, "loss": 0.0257, "num_input_tokens_seen": 123990632, "step": 184050 }, { "epoch": 4.496494271125987, "grad_norm": 0.0009266805136576295, "learning_rate": 6.115347479473376e-08, "loss": 0.0, "num_input_tokens_seen": 123993704, "step": 184055 }, { "epoch": 4.4966164219578335, "grad_norm": 2.9098644517944194e-05, "learning_rate": 6.112411407080498e-08, "loss": 0.0, "num_input_tokens_seen": 123997224, "step": 184060 }, { "epoch": 4.496738572789681, "grad_norm": 2.041774678218644e-05, "learning_rate": 6.10947601746341e-08, "loss": 0.0, "num_input_tokens_seen": 124000872, "step": 184065 }, { "epoch": 4.496860723621528, "grad_norm": 0.003889445448294282, "learning_rate": 6.106541310643487e-08, "loss": 0.0, "num_input_tokens_seen": 124004200, "step": 184070 }, { "epoch": 4.496982874453375, "grad_norm": 0.003433778416365385, "learning_rate": 6.103607286642054e-08, "loss": 0.0, "num_input_tokens_seen": 124007656, "step": 184075 }, { "epoch": 4.497105025285222, "grad_norm": 0.00015160914335865527, "learning_rate": 6.100673945480417e-08, "loss": 0.0, "num_input_tokens_seen": 124011112, "step": 184080 }, { "epoch": 4.497227176117069, "grad_norm": 0.0003066387725993991, "learning_rate": 6.097741287179958e-08, "loss": 0.0, "num_input_tokens_seen": 124014440, "step": 184085 }, { "epoch": 4.497349326948917, "grad_norm": 0.0002169539366150275, "learning_rate": 6.094809311761961e-08, "loss": 0.0099, "num_input_tokens_seen": 124017832, "step": 184090 }, { "epoch": 4.497471477780763, "grad_norm": 0.009181767702102661, "learning_rate": 6.091878019247787e-08, "loss": 0.0, "num_input_tokens_seen": 124021160, "step": 184095 }, { "epoch": 4.497593628612611, "grad_norm": 0.00010554036998655647, "learning_rate": 6.088947409658718e-08, "loss": 0.0, "num_input_tokens_seen": 124024488, "step": 184100 }, { "epoch": 4.497715779444458, "grad_norm": 0.00019654342031572014, "learning_rate": 6.086017483016104e-08, "loss": 0.0, "num_input_tokens_seen": 124028072, "step": 184105 }, { "epoch": 4.497837930276305, "grad_norm": 0.00013703553122468293, "learning_rate": 6.083088239341206e-08, "loss": 0.0, "num_input_tokens_seen": 124031400, "step": 184110 }, { "epoch": 4.497960081108152, "grad_norm": 2.7517999114934355e-05, "learning_rate": 6.080159678655372e-08, "loss": 0.0, "num_input_tokens_seen": 124034600, "step": 184115 }, { "epoch": 4.49808223194, "grad_norm": 9.830735507421196e-06, "learning_rate": 6.077231800979865e-08, "loss": 0.0, "num_input_tokens_seen": 124037672, "step": 184120 }, { "epoch": 4.4982043827718465, "grad_norm": 8.236825669882819e-05, "learning_rate": 6.074304606335978e-08, "loss": 0.0, "num_input_tokens_seen": 124040616, "step": 184125 }, { "epoch": 4.498326533603694, "grad_norm": 0.00047793277190066874, "learning_rate": 6.071378094745039e-08, "loss": 0.0, "num_input_tokens_seen": 124043816, "step": 184130 }, { "epoch": 4.498448684435541, "grad_norm": 4.4294542021816596e-05, "learning_rate": 6.068452266228285e-08, "loss": 0.0, "num_input_tokens_seen": 124047272, "step": 184135 }, { "epoch": 4.4985708352673885, "grad_norm": 5.49412434338592e-05, "learning_rate": 6.065527120807024e-08, "loss": 0.0, "num_input_tokens_seen": 124050856, "step": 184140 }, { "epoch": 4.498692986099235, "grad_norm": 8.162522863131016e-05, "learning_rate": 6.062602658502491e-08, "loss": 0.0, "num_input_tokens_seen": 124054312, "step": 184145 }, { "epoch": 4.498815136931082, "grad_norm": 0.033382777124643326, "learning_rate": 6.059678879336005e-08, "loss": 0.0, "num_input_tokens_seen": 124057128, "step": 184150 }, { "epoch": 4.49893728776293, "grad_norm": 0.004745765123516321, "learning_rate": 6.056755783328782e-08, "loss": 0.0, "num_input_tokens_seen": 124060136, "step": 184155 }, { "epoch": 4.499059438594776, "grad_norm": 2.5301449568360113e-05, "learning_rate": 6.053833370502104e-08, "loss": 0.0, "num_input_tokens_seen": 124064040, "step": 184160 }, { "epoch": 4.499181589426624, "grad_norm": 6.118205055827275e-05, "learning_rate": 6.050911640877221e-08, "loss": 0.0, "num_input_tokens_seen": 124067880, "step": 184165 }, { "epoch": 4.499303740258471, "grad_norm": 0.00027135369600728154, "learning_rate": 6.047990594475372e-08, "loss": 0.0, "num_input_tokens_seen": 124070696, "step": 184170 }, { "epoch": 4.499425891090318, "grad_norm": 3.677931090351194e-05, "learning_rate": 6.045070231317817e-08, "loss": 0.0, "num_input_tokens_seen": 124074344, "step": 184175 }, { "epoch": 4.499548041922165, "grad_norm": 0.006334454286843538, "learning_rate": 6.042150551425784e-08, "loss": 0.0, "num_input_tokens_seen": 124077608, "step": 184180 }, { "epoch": 4.499670192754013, "grad_norm": 0.0003608764673117548, "learning_rate": 6.039231554820489e-08, "loss": 0.0, "num_input_tokens_seen": 124081384, "step": 184185 }, { "epoch": 4.4997923435858596, "grad_norm": 3.4421696909703314e-05, "learning_rate": 6.036313241523183e-08, "loss": 0.0, "num_input_tokens_seen": 124084968, "step": 184190 }, { "epoch": 4.499914494417707, "grad_norm": 5.452797267935239e-06, "learning_rate": 6.033395611555081e-08, "loss": 0.0538, "num_input_tokens_seen": 124088040, "step": 184195 }, { "epoch": 4.500036645249554, "grad_norm": 2.9334744340303587e-06, "learning_rate": 6.0304786649374e-08, "loss": 0.0, "num_input_tokens_seen": 124091240, "step": 184200 }, { "epoch": 4.500158796081402, "grad_norm": 0.00040240155067294836, "learning_rate": 6.027562401691344e-08, "loss": 0.0, "num_input_tokens_seen": 124095144, "step": 184205 }, { "epoch": 4.500280946913248, "grad_norm": 0.000127659848658368, "learning_rate": 6.024646821838142e-08, "loss": 0.0, "num_input_tokens_seen": 124098728, "step": 184210 }, { "epoch": 4.500329807245987, "eval_loss": 0.3127802014350891, "eval_runtime": 47.8244, "eval_samples_per_second": 760.805, "eval_steps_per_second": 95.119, "num_input_tokens_seen": 124100264, "step": 184212 }, { "epoch": 4.500403097745096, "grad_norm": 0.0016075852327048779, "learning_rate": 6.021731925398976e-08, "loss": 0.0, "num_input_tokens_seen": 124102632, "step": 184215 }, { "epoch": 4.500525248576943, "grad_norm": 0.0014234702102839947, "learning_rate": 6.018817712395041e-08, "loss": 0.0, "num_input_tokens_seen": 124105832, "step": 184220 }, { "epoch": 4.50064739940879, "grad_norm": 0.0013454908039420843, "learning_rate": 6.015904182847564e-08, "loss": 0.0, "num_input_tokens_seen": 124109352, "step": 184225 }, { "epoch": 4.500769550240637, "grad_norm": 0.0019803806208074093, "learning_rate": 6.012991336777695e-08, "loss": 0.0, "num_input_tokens_seen": 124112552, "step": 184230 }, { "epoch": 4.500891701072485, "grad_norm": 0.0009988030651584268, "learning_rate": 6.01007917420665e-08, "loss": 0.0, "num_input_tokens_seen": 124115816, "step": 184235 }, { "epoch": 4.5010138519043315, "grad_norm": 0.38556408882141113, "learning_rate": 6.007167695155569e-08, "loss": 0.0001, "num_input_tokens_seen": 124119528, "step": 184240 }, { "epoch": 4.501136002736178, "grad_norm": 0.01152084581553936, "learning_rate": 6.004256899645665e-08, "loss": 0.0, "num_input_tokens_seen": 124122344, "step": 184245 }, { "epoch": 4.501258153568026, "grad_norm": 0.00029359193285927176, "learning_rate": 6.001346787698069e-08, "loss": 0.0, "num_input_tokens_seen": 124125416, "step": 184250 }, { "epoch": 4.501380304399873, "grad_norm": 0.0015446188626810908, "learning_rate": 5.998437359333964e-08, "loss": 0.0, "num_input_tokens_seen": 124129000, "step": 184255 }, { "epoch": 4.50150245523172, "grad_norm": 0.0001862171193351969, "learning_rate": 5.995528614574519e-08, "loss": 0.0, "num_input_tokens_seen": 124131880, "step": 184260 }, { "epoch": 4.501624606063567, "grad_norm": 0.0001308717328356579, "learning_rate": 5.992620553440863e-08, "loss": 0.0, "num_input_tokens_seen": 124134760, "step": 184265 }, { "epoch": 4.501746756895415, "grad_norm": 4.519295907812193e-05, "learning_rate": 5.989713175954169e-08, "loss": 0.0, "num_input_tokens_seen": 124138152, "step": 184270 }, { "epoch": 4.501868907727261, "grad_norm": 2.6418254492455162e-05, "learning_rate": 5.986806482135542e-08, "loss": 0.0, "num_input_tokens_seen": 124141352, "step": 184275 }, { "epoch": 4.501991058559109, "grad_norm": 0.00016644655261188745, "learning_rate": 5.983900472006175e-08, "loss": 0.0, "num_input_tokens_seen": 124144616, "step": 184280 }, { "epoch": 4.502113209390956, "grad_norm": 0.02736012451350689, "learning_rate": 5.980995145587165e-08, "loss": 0.0, "num_input_tokens_seen": 124147944, "step": 184285 }, { "epoch": 4.502235360222803, "grad_norm": 0.0007747677154839039, "learning_rate": 5.978090502899624e-08, "loss": 0.0, "num_input_tokens_seen": 124151400, "step": 184290 }, { "epoch": 4.50235751105465, "grad_norm": 8.166118641383946e-05, "learning_rate": 5.975186543964716e-08, "loss": 0.0, "num_input_tokens_seen": 124155368, "step": 184295 }, { "epoch": 4.502479661886498, "grad_norm": 2.3084163331077434e-05, "learning_rate": 5.972283268803536e-08, "loss": 0.0, "num_input_tokens_seen": 124158632, "step": 184300 }, { "epoch": 4.5026018127183445, "grad_norm": 0.0006073784315958619, "learning_rate": 5.969380677437208e-08, "loss": 0.0, "num_input_tokens_seen": 124162344, "step": 184305 }, { "epoch": 4.502723963550192, "grad_norm": 8.503326534992084e-05, "learning_rate": 5.966478769886818e-08, "loss": 0.0, "num_input_tokens_seen": 124165672, "step": 184310 }, { "epoch": 4.502846114382039, "grad_norm": 0.00027897878317162395, "learning_rate": 5.963577546173493e-08, "loss": 0.0, "num_input_tokens_seen": 124169320, "step": 184315 }, { "epoch": 4.5029682652138865, "grad_norm": 0.0013202318223193288, "learning_rate": 5.960677006318338e-08, "loss": 0.0, "num_input_tokens_seen": 124173608, "step": 184320 }, { "epoch": 4.503090416045733, "grad_norm": 2.0044208213221282e-05, "learning_rate": 5.9577771503424135e-08, "loss": 0.0, "num_input_tokens_seen": 124177576, "step": 184325 }, { "epoch": 4.503212566877581, "grad_norm": 0.0001279815041925758, "learning_rate": 5.954877978266848e-08, "loss": 0.0, "num_input_tokens_seen": 124181288, "step": 184330 }, { "epoch": 4.503334717709428, "grad_norm": 0.0007320480654016137, "learning_rate": 5.9519794901126907e-08, "loss": 0.0, "num_input_tokens_seen": 124184296, "step": 184335 }, { "epoch": 4.503456868541274, "grad_norm": 0.0055288467556238174, "learning_rate": 5.9490816859010364e-08, "loss": 0.0, "num_input_tokens_seen": 124187944, "step": 184340 }, { "epoch": 4.503579019373122, "grad_norm": 0.002740490948781371, "learning_rate": 5.946184565652967e-08, "loss": 0.0, "num_input_tokens_seen": 124191208, "step": 184345 }, { "epoch": 4.503701170204969, "grad_norm": 2.137678711733315e-05, "learning_rate": 5.943288129389523e-08, "loss": 0.0, "num_input_tokens_seen": 124194664, "step": 184350 }, { "epoch": 4.503823321036816, "grad_norm": 0.009588432498276234, "learning_rate": 5.940392377131809e-08, "loss": 0.0, "num_input_tokens_seen": 124198184, "step": 184355 }, { "epoch": 4.503945471868663, "grad_norm": 0.0009689059224911034, "learning_rate": 5.937497308900841e-08, "loss": 0.0, "num_input_tokens_seen": 124201768, "step": 184360 }, { "epoch": 4.504067622700511, "grad_norm": 2.13335151784122e-05, "learning_rate": 5.934602924717702e-08, "loss": 0.0, "num_input_tokens_seen": 124205096, "step": 184365 }, { "epoch": 4.5041897735323575, "grad_norm": 0.00023117160890251398, "learning_rate": 5.93170922460342e-08, "loss": 0.0, "num_input_tokens_seen": 124208104, "step": 184370 }, { "epoch": 4.504311924364205, "grad_norm": 0.0012295347405597568, "learning_rate": 5.9288162085790574e-08, "loss": 0.0, "num_input_tokens_seen": 124211112, "step": 184375 }, { "epoch": 4.504434075196052, "grad_norm": 0.0013790687080472708, "learning_rate": 5.9259238766656506e-08, "loss": 0.0, "num_input_tokens_seen": 124214568, "step": 184380 }, { "epoch": 4.5045562260278995, "grad_norm": 0.0005128180491738021, "learning_rate": 5.923032228884228e-08, "loss": 0.0, "num_input_tokens_seen": 124217704, "step": 184385 }, { "epoch": 4.504678376859746, "grad_norm": 2.7716218028217554e-05, "learning_rate": 5.920141265255818e-08, "loss": 0.0, "num_input_tokens_seen": 124220776, "step": 184390 }, { "epoch": 4.504800527691594, "grad_norm": 0.002322013955563307, "learning_rate": 5.9172509858014474e-08, "loss": 0.0, "num_input_tokens_seen": 124224168, "step": 184395 }, { "epoch": 4.504922678523441, "grad_norm": 5.2793726354138926e-05, "learning_rate": 5.914361390542133e-08, "loss": 0.0, "num_input_tokens_seen": 124227240, "step": 184400 }, { "epoch": 4.505044829355288, "grad_norm": 7.494450255762786e-05, "learning_rate": 5.91147247949888e-08, "loss": 0.0, "num_input_tokens_seen": 124230312, "step": 184405 }, { "epoch": 4.505166980187135, "grad_norm": 0.006155565846711397, "learning_rate": 5.908584252692694e-08, "loss": 0.0, "num_input_tokens_seen": 124233960, "step": 184410 }, { "epoch": 4.505289131018982, "grad_norm": 0.0005912997294217348, "learning_rate": 5.905696710144614e-08, "loss": 0.0, "num_input_tokens_seen": 124236968, "step": 184415 }, { "epoch": 4.505411281850829, "grad_norm": 5.531529041036265e-06, "learning_rate": 5.902809851875601e-08, "loss": 0.0, "num_input_tokens_seen": 124240232, "step": 184420 }, { "epoch": 4.505533432682677, "grad_norm": 4.586255818139762e-05, "learning_rate": 5.899923677906682e-08, "loss": 0.0, "num_input_tokens_seen": 124243432, "step": 184425 }, { "epoch": 4.505655583514524, "grad_norm": 0.005010268650949001, "learning_rate": 5.897038188258807e-08, "loss": 0.0, "num_input_tokens_seen": 124246696, "step": 184430 }, { "epoch": 4.5057777343463705, "grad_norm": 7.66208268032642e-06, "learning_rate": 5.894153382952993e-08, "loss": 0.0, "num_input_tokens_seen": 124249768, "step": 184435 }, { "epoch": 4.505899885178218, "grad_norm": 2.6806714231497608e-05, "learning_rate": 5.891269262010212e-08, "loss": 0.0, "num_input_tokens_seen": 124253224, "step": 184440 }, { "epoch": 4.506022036010065, "grad_norm": 9.153199789579958e-05, "learning_rate": 5.888385825451414e-08, "loss": 0.0, "num_input_tokens_seen": 124256168, "step": 184445 }, { "epoch": 4.5061441868419125, "grad_norm": 0.04417189583182335, "learning_rate": 5.885503073297604e-08, "loss": 0.0, "num_input_tokens_seen": 124260328, "step": 184450 }, { "epoch": 4.506266337673759, "grad_norm": 0.000164502183906734, "learning_rate": 5.8826210055697215e-08, "loss": 0.0, "num_input_tokens_seen": 124263912, "step": 184455 }, { "epoch": 4.506388488505607, "grad_norm": 0.0004098447388969362, "learning_rate": 5.879739622288748e-08, "loss": 0.0117, "num_input_tokens_seen": 124267240, "step": 184460 }, { "epoch": 4.506510639337454, "grad_norm": 0.00029537943191826344, "learning_rate": 5.876858923475603e-08, "loss": 0.0, "num_input_tokens_seen": 124270504, "step": 184465 }, { "epoch": 4.506632790169301, "grad_norm": 4.8967107431963086e-05, "learning_rate": 5.873978909151256e-08, "loss": 0.0, "num_input_tokens_seen": 124273512, "step": 184470 }, { "epoch": 4.506754941001148, "grad_norm": 0.0012310813181102276, "learning_rate": 5.871099579336669e-08, "loss": 0.0, "num_input_tokens_seen": 124277032, "step": 184475 }, { "epoch": 4.506877091832996, "grad_norm": 0.0020224247127771378, "learning_rate": 5.868220934052748e-08, "loss": 0.0, "num_input_tokens_seen": 124280808, "step": 184480 }, { "epoch": 4.506999242664842, "grad_norm": 0.005152786150574684, "learning_rate": 5.865342973320453e-08, "loss": 0.0, "num_input_tokens_seen": 124284328, "step": 184485 }, { "epoch": 4.50712139349669, "grad_norm": 0.00016626408614683896, "learning_rate": 5.862465697160712e-08, "loss": 0.0, "num_input_tokens_seen": 124287656, "step": 184490 }, { "epoch": 4.507243544328537, "grad_norm": 0.0020967598538845778, "learning_rate": 5.85958910559442e-08, "loss": 0.0, "num_input_tokens_seen": 124291432, "step": 184495 }, { "epoch": 4.507365695160384, "grad_norm": 0.0016112698940560222, "learning_rate": 5.8567131986425265e-08, "loss": 0.0, "num_input_tokens_seen": 124295272, "step": 184500 }, { "epoch": 4.507487845992231, "grad_norm": 2.913714160968084e-05, "learning_rate": 5.853837976325926e-08, "loss": 0.0, "num_input_tokens_seen": 124298536, "step": 184505 }, { "epoch": 4.507609996824078, "grad_norm": 1.0782813660625834e-05, "learning_rate": 5.8509634386655573e-08, "loss": 0.0, "num_input_tokens_seen": 124301992, "step": 184510 }, { "epoch": 4.507732147655926, "grad_norm": 4.982833706890233e-05, "learning_rate": 5.848089585682292e-08, "loss": 0.0, "num_input_tokens_seen": 124305384, "step": 184515 }, { "epoch": 4.507854298487773, "grad_norm": 0.0008456270443275571, "learning_rate": 5.8452164173970475e-08, "loss": 0.0, "num_input_tokens_seen": 124308840, "step": 184520 }, { "epoch": 4.50797644931962, "grad_norm": 4.60719020338729e-05, "learning_rate": 5.842343933830707e-08, "loss": 0.0, "num_input_tokens_seen": 124312552, "step": 184525 }, { "epoch": 4.508098600151467, "grad_norm": 0.00022173790785018355, "learning_rate": 5.839472135004164e-08, "loss": 0.0, "num_input_tokens_seen": 124315752, "step": 184530 }, { "epoch": 4.508220750983314, "grad_norm": 0.00018909823847934604, "learning_rate": 5.8366010209383254e-08, "loss": 0.0, "num_input_tokens_seen": 124319016, "step": 184535 }, { "epoch": 4.508342901815161, "grad_norm": 0.011719591915607452, "learning_rate": 5.833730591654029e-08, "loss": 0.0, "num_input_tokens_seen": 124322664, "step": 184540 }, { "epoch": 4.508465052647009, "grad_norm": 0.00031967408722266555, "learning_rate": 5.830860847172192e-08, "loss": 0.0, "num_input_tokens_seen": 124325608, "step": 184545 }, { "epoch": 4.508587203478855, "grad_norm": 0.00010630823817336932, "learning_rate": 5.827991787513642e-08, "loss": 0.0014, "num_input_tokens_seen": 124328744, "step": 184550 }, { "epoch": 4.508709354310703, "grad_norm": 0.00011591524526011199, "learning_rate": 5.825123412699284e-08, "loss": 0.0, "num_input_tokens_seen": 124332264, "step": 184555 }, { "epoch": 4.50883150514255, "grad_norm": 0.00012368029274512082, "learning_rate": 5.822255722749947e-08, "loss": 0.0, "num_input_tokens_seen": 124335784, "step": 184560 }, { "epoch": 4.5089536559743975, "grad_norm": 0.000389666180126369, "learning_rate": 5.819388717686491e-08, "loss": 0.0, "num_input_tokens_seen": 124339304, "step": 184565 }, { "epoch": 4.509075806806244, "grad_norm": 6.443438905989751e-06, "learning_rate": 5.816522397529788e-08, "loss": 0.0, "num_input_tokens_seen": 124342440, "step": 184570 }, { "epoch": 4.509197957638092, "grad_norm": 9.82302753982367e-06, "learning_rate": 5.813656762300656e-08, "loss": 0.0, "num_input_tokens_seen": 124345896, "step": 184575 }, { "epoch": 4.509320108469939, "grad_norm": 0.00010355612903367728, "learning_rate": 5.810791812019955e-08, "loss": 0.0, "num_input_tokens_seen": 124349352, "step": 184580 }, { "epoch": 4.509442259301786, "grad_norm": 6.289218435995281e-05, "learning_rate": 5.807927546708491e-08, "loss": 0.0341, "num_input_tokens_seen": 124352424, "step": 184585 }, { "epoch": 4.509564410133633, "grad_norm": 0.0032984442077577114, "learning_rate": 5.805063966387136e-08, "loss": 0.0, "num_input_tokens_seen": 124355688, "step": 184590 }, { "epoch": 4.509686560965481, "grad_norm": 0.0005909492610953748, "learning_rate": 5.8022010710766844e-08, "loss": 0.0, "num_input_tokens_seen": 124359016, "step": 184595 }, { "epoch": 4.509808711797327, "grad_norm": 0.0003565122897271067, "learning_rate": 5.799338860797953e-08, "loss": 0.0, "num_input_tokens_seen": 124362728, "step": 184600 }, { "epoch": 4.509930862629174, "grad_norm": 0.00019694749789778143, "learning_rate": 5.796477335571781e-08, "loss": 0.0, "num_input_tokens_seen": 124365864, "step": 184605 }, { "epoch": 4.510053013461022, "grad_norm": 2.829977893270552e-05, "learning_rate": 5.793616495418951e-08, "loss": 0.0, "num_input_tokens_seen": 124369320, "step": 184610 }, { "epoch": 4.5101751642928685, "grad_norm": 0.0032207928597927094, "learning_rate": 5.790756340360292e-08, "loss": 0.0003, "num_input_tokens_seen": 124372200, "step": 184615 }, { "epoch": 4.510297315124716, "grad_norm": 0.00031161491642706096, "learning_rate": 5.7878968704165754e-08, "loss": 0.0, "num_input_tokens_seen": 124375208, "step": 184620 }, { "epoch": 4.510419465956563, "grad_norm": 0.0034799398854374886, "learning_rate": 5.785038085608607e-08, "loss": 0.0, "num_input_tokens_seen": 124378728, "step": 184625 }, { "epoch": 4.5105416167884105, "grad_norm": 0.0002824080584105104, "learning_rate": 5.782179985957214e-08, "loss": 0.0, "num_input_tokens_seen": 124382568, "step": 184630 }, { "epoch": 4.510663767620257, "grad_norm": 0.0002862545079551637, "learning_rate": 5.779322571483125e-08, "loss": 0.0, "num_input_tokens_seen": 124385896, "step": 184635 }, { "epoch": 4.510785918452105, "grad_norm": 0.03027668595314026, "learning_rate": 5.7764658422071566e-08, "loss": 0.0, "num_input_tokens_seen": 124389032, "step": 184640 }, { "epoch": 4.510908069283952, "grad_norm": 0.00020627176854759455, "learning_rate": 5.773609798150059e-08, "loss": 0.0, "num_input_tokens_seen": 124392488, "step": 184645 }, { "epoch": 4.511030220115799, "grad_norm": 2.99558541883016e-05, "learning_rate": 5.7707544393326145e-08, "loss": 0.0, "num_input_tokens_seen": 124395816, "step": 184650 }, { "epoch": 4.511152370947646, "grad_norm": 0.00017671234672889113, "learning_rate": 5.767899765775608e-08, "loss": 0.0414, "num_input_tokens_seen": 124399272, "step": 184655 }, { "epoch": 4.511274521779494, "grad_norm": 5.429242810350843e-05, "learning_rate": 5.765045777499755e-08, "loss": 0.0, "num_input_tokens_seen": 124402600, "step": 184660 }, { "epoch": 4.51139667261134, "grad_norm": 0.02834673970937729, "learning_rate": 5.7621924745258624e-08, "loss": 0.0, "num_input_tokens_seen": 124405928, "step": 184665 }, { "epoch": 4.511518823443188, "grad_norm": 0.0001638929679756984, "learning_rate": 5.759339856874634e-08, "loss": 0.0, "num_input_tokens_seen": 124409128, "step": 184670 }, { "epoch": 4.511640974275035, "grad_norm": 0.00017873983597382903, "learning_rate": 5.7564879245668444e-08, "loss": 0.0, "num_input_tokens_seen": 124412904, "step": 184675 }, { "epoch": 4.5117631251068815, "grad_norm": 0.009502957575023174, "learning_rate": 5.753636677623208e-08, "loss": 0.0, "num_input_tokens_seen": 124416360, "step": 184680 }, { "epoch": 4.511885275938729, "grad_norm": 0.00027789807063527405, "learning_rate": 5.750786116064477e-08, "loss": 0.0, "num_input_tokens_seen": 124419688, "step": 184685 }, { "epoch": 4.512007426770577, "grad_norm": 0.013086493127048016, "learning_rate": 5.7479362399113995e-08, "loss": 0.0, "num_input_tokens_seen": 124423080, "step": 184690 }, { "epoch": 4.5121295776024235, "grad_norm": 0.005364415235817432, "learning_rate": 5.7450870491846715e-08, "loss": 0.0, "num_input_tokens_seen": 124426344, "step": 184695 }, { "epoch": 4.51225172843427, "grad_norm": 1.4318278772407211e-05, "learning_rate": 5.7422385439050095e-08, "loss": 0.0, "num_input_tokens_seen": 124429800, "step": 184700 }, { "epoch": 4.512373879266118, "grad_norm": 8.754514419706538e-05, "learning_rate": 5.7393907240931624e-08, "loss": 0.0, "num_input_tokens_seen": 124433000, "step": 184705 }, { "epoch": 4.512496030097965, "grad_norm": 0.0005249512614682317, "learning_rate": 5.736543589769816e-08, "loss": 0.0, "num_input_tokens_seen": 124436392, "step": 184710 }, { "epoch": 4.512618180929812, "grad_norm": 0.00015164175420068204, "learning_rate": 5.733697140955662e-08, "loss": 0.0, "num_input_tokens_seen": 124439656, "step": 184715 }, { "epoch": 4.512740331761659, "grad_norm": 2.3307018636842258e-05, "learning_rate": 5.73085137767142e-08, "loss": 0.0003, "num_input_tokens_seen": 124442984, "step": 184720 }, { "epoch": 4.512862482593507, "grad_norm": 6.507845682790503e-05, "learning_rate": 5.728006299937793e-08, "loss": 0.0, "num_input_tokens_seen": 124445992, "step": 184725 }, { "epoch": 4.512984633425353, "grad_norm": 1.8531858586356975e-05, "learning_rate": 5.7251619077754445e-08, "loss": 0.0, "num_input_tokens_seen": 124449256, "step": 184730 }, { "epoch": 4.513106784257201, "grad_norm": 2.9950553653179668e-05, "learning_rate": 5.72231820120509e-08, "loss": 0.0, "num_input_tokens_seen": 124452264, "step": 184735 }, { "epoch": 4.513228935089048, "grad_norm": 0.00011571579670999199, "learning_rate": 5.7194751802473793e-08, "loss": 0.0, "num_input_tokens_seen": 124455976, "step": 184740 }, { "epoch": 4.513351085920895, "grad_norm": 1.489359692641301e-05, "learning_rate": 5.716632844923008e-08, "loss": 0.0, "num_input_tokens_seen": 124458920, "step": 184745 }, { "epoch": 4.513473236752742, "grad_norm": 0.0008117137476801872, "learning_rate": 5.71379119525266e-08, "loss": 0.0, "num_input_tokens_seen": 124462504, "step": 184750 }, { "epoch": 4.51359538758459, "grad_norm": 0.0002265637303935364, "learning_rate": 5.710950231256961e-08, "loss": 0.0, "num_input_tokens_seen": 124465960, "step": 184755 }, { "epoch": 4.5137175384164365, "grad_norm": 0.013958304189145565, "learning_rate": 5.708109952956608e-08, "loss": 0.0, "num_input_tokens_seen": 124469672, "step": 184760 }, { "epoch": 4.513839689248284, "grad_norm": 0.002560678403824568, "learning_rate": 5.705270360372227e-08, "loss": 0.0, "num_input_tokens_seen": 124473000, "step": 184765 }, { "epoch": 4.513961840080131, "grad_norm": 0.0003456149424891919, "learning_rate": 5.702431453524503e-08, "loss": 0.0402, "num_input_tokens_seen": 124476328, "step": 184770 }, { "epoch": 4.514083990911978, "grad_norm": 0.000688866013661027, "learning_rate": 5.699593232434041e-08, "loss": 0.0008, "num_input_tokens_seen": 124479848, "step": 184775 }, { "epoch": 4.514206141743825, "grad_norm": 1.5245567738020327e-05, "learning_rate": 5.6967556971215027e-08, "loss": 0.0, "num_input_tokens_seen": 124483240, "step": 184780 }, { "epoch": 4.514328292575673, "grad_norm": 3.2199197448790073e-05, "learning_rate": 5.693918847607526e-08, "loss": 0.0, "num_input_tokens_seen": 124486696, "step": 184785 }, { "epoch": 4.51445044340752, "grad_norm": 0.0001976760831894353, "learning_rate": 5.691082683912729e-08, "loss": 0.0, "num_input_tokens_seen": 124490152, "step": 184790 }, { "epoch": 4.514572594239366, "grad_norm": 0.004537543747574091, "learning_rate": 5.688247206057761e-08, "loss": 0.0, "num_input_tokens_seen": 124494376, "step": 184795 }, { "epoch": 4.514694745071214, "grad_norm": 0.00019870110554620624, "learning_rate": 5.6854124140632285e-08, "loss": 0.0, "num_input_tokens_seen": 124497960, "step": 184800 }, { "epoch": 4.514816895903061, "grad_norm": 0.00021802991977892816, "learning_rate": 5.682578307949726e-08, "loss": 0.0, "num_input_tokens_seen": 124500904, "step": 184805 }, { "epoch": 4.514939046734908, "grad_norm": 8.688728121342137e-05, "learning_rate": 5.679744887737903e-08, "loss": 0.0, "num_input_tokens_seen": 124504360, "step": 184810 }, { "epoch": 4.515061197566755, "grad_norm": 0.00013730123464483768, "learning_rate": 5.676912153448321e-08, "loss": 0.0, "num_input_tokens_seen": 124508072, "step": 184815 }, { "epoch": 4.515183348398603, "grad_norm": 0.00025287570315413177, "learning_rate": 5.6740801051016197e-08, "loss": 0.0, "num_input_tokens_seen": 124511336, "step": 184820 }, { "epoch": 4.5153054992304495, "grad_norm": 0.0004785667988471687, "learning_rate": 5.671248742718371e-08, "loss": 0.0, "num_input_tokens_seen": 124515432, "step": 184825 }, { "epoch": 4.515427650062297, "grad_norm": 0.002044425345957279, "learning_rate": 5.66841806631918e-08, "loss": 0.0, "num_input_tokens_seen": 124519016, "step": 184830 }, { "epoch": 4.515549800894144, "grad_norm": 0.00304840924218297, "learning_rate": 5.66558807592461e-08, "loss": 0.0, "num_input_tokens_seen": 124522152, "step": 184835 }, { "epoch": 4.515671951725992, "grad_norm": 0.00020235584815964103, "learning_rate": 5.662758771555265e-08, "loss": 0.0, "num_input_tokens_seen": 124525480, "step": 184840 }, { "epoch": 4.515794102557838, "grad_norm": 0.0005201268359087408, "learning_rate": 5.659930153231718e-08, "loss": 0.0, "num_input_tokens_seen": 124528808, "step": 184845 }, { "epoch": 4.515916253389686, "grad_norm": 0.0029112256597727537, "learning_rate": 5.657102220974519e-08, "loss": 0.0, "num_input_tokens_seen": 124532456, "step": 184850 }, { "epoch": 4.516038404221533, "grad_norm": 0.08507881313562393, "learning_rate": 5.654274974804263e-08, "loss": 0.0, "num_input_tokens_seen": 124535976, "step": 184855 }, { "epoch": 4.51616055505338, "grad_norm": 0.006632891483604908, "learning_rate": 5.651448414741489e-08, "loss": 0.0, "num_input_tokens_seen": 124539048, "step": 184860 }, { "epoch": 4.516282705885227, "grad_norm": 0.0012043018359690905, "learning_rate": 5.648622540806758e-08, "loss": 0.0, "num_input_tokens_seen": 124542568, "step": 184865 }, { "epoch": 4.516404856717074, "grad_norm": 0.0008988183690235019, "learning_rate": 5.6457973530206206e-08, "loss": 0.0, "num_input_tokens_seen": 124545832, "step": 184870 }, { "epoch": 4.5165270075489214, "grad_norm": 1.9462617274257354e-05, "learning_rate": 5.6429728514036154e-08, "loss": 0.0, "num_input_tokens_seen": 124549032, "step": 184875 }, { "epoch": 4.516649158380768, "grad_norm": 0.001904234173707664, "learning_rate": 5.640149035976305e-08, "loss": 0.0, "num_input_tokens_seen": 124552616, "step": 184880 }, { "epoch": 4.516771309212616, "grad_norm": 0.000173800130141899, "learning_rate": 5.637325906759205e-08, "loss": 0.0, "num_input_tokens_seen": 124555624, "step": 184885 }, { "epoch": 4.516893460044463, "grad_norm": 0.0005049865576438606, "learning_rate": 5.634503463772855e-08, "loss": 0.0, "num_input_tokens_seen": 124558632, "step": 184890 }, { "epoch": 4.51701561087631, "grad_norm": 0.001019941526465118, "learning_rate": 5.631681707037772e-08, "loss": 0.0, "num_input_tokens_seen": 124561832, "step": 184895 }, { "epoch": 4.517137761708157, "grad_norm": 5.146110197529197e-05, "learning_rate": 5.628860636574495e-08, "loss": 0.0, "num_input_tokens_seen": 124565352, "step": 184900 }, { "epoch": 4.517259912540005, "grad_norm": 2.4899445634218864e-05, "learning_rate": 5.626040252403519e-08, "loss": 0.0, "num_input_tokens_seen": 124568680, "step": 184905 }, { "epoch": 4.517382063371851, "grad_norm": 0.0002568171184975654, "learning_rate": 5.623220554545349e-08, "loss": 0.0, "num_input_tokens_seen": 124571944, "step": 184910 }, { "epoch": 4.517504214203699, "grad_norm": 0.00153777573723346, "learning_rate": 5.6204015430205254e-08, "loss": 0.0, "num_input_tokens_seen": 124575848, "step": 184915 }, { "epoch": 4.517626365035546, "grad_norm": 0.00015119351155590266, "learning_rate": 5.6175832178495086e-08, "loss": 0.0, "num_input_tokens_seen": 124579048, "step": 184920 }, { "epoch": 4.517748515867393, "grad_norm": 0.0012161422055214643, "learning_rate": 5.614765579052827e-08, "loss": 0.0, "num_input_tokens_seen": 124582312, "step": 184925 }, { "epoch": 4.51787066669924, "grad_norm": 0.000827375624794513, "learning_rate": 5.6119486266509306e-08, "loss": 0.0, "num_input_tokens_seen": 124585896, "step": 184930 }, { "epoch": 4.517992817531088, "grad_norm": 0.00010629823373164982, "learning_rate": 5.6091323606643484e-08, "loss": 0.0, "num_input_tokens_seen": 124590248, "step": 184935 }, { "epoch": 4.5181149683629345, "grad_norm": 0.0003775875666178763, "learning_rate": 5.606316781113551e-08, "loss": 0.0, "num_input_tokens_seen": 124593960, "step": 184940 }, { "epoch": 4.518237119194782, "grad_norm": 7.271223148563877e-05, "learning_rate": 5.603501888018991e-08, "loss": 0.0, "num_input_tokens_seen": 124596968, "step": 184945 }, { "epoch": 4.518359270026629, "grad_norm": 7.457828905899078e-05, "learning_rate": 5.6006876814011725e-08, "loss": 0.0, "num_input_tokens_seen": 124600232, "step": 184950 }, { "epoch": 4.5184814208584765, "grad_norm": 8.921558764996007e-05, "learning_rate": 5.5978741612805244e-08, "loss": 0.0, "num_input_tokens_seen": 124603624, "step": 184955 }, { "epoch": 4.518603571690323, "grad_norm": 0.005783631466329098, "learning_rate": 5.5950613276775415e-08, "loss": 0.0, "num_input_tokens_seen": 124607016, "step": 184960 }, { "epoch": 4.51872572252217, "grad_norm": 0.0010702416766434908, "learning_rate": 5.5922491806126514e-08, "loss": 0.0, "num_input_tokens_seen": 124610344, "step": 184965 }, { "epoch": 4.518847873354018, "grad_norm": 0.0003246046253480017, "learning_rate": 5.589437720106327e-08, "loss": 0.0, "num_input_tokens_seen": 124613736, "step": 184970 }, { "epoch": 4.518970024185864, "grad_norm": 3.850104258162901e-05, "learning_rate": 5.586626946179007e-08, "loss": 0.0, "num_input_tokens_seen": 124617128, "step": 184975 }, { "epoch": 4.519092175017712, "grad_norm": 0.00029245539917610586, "learning_rate": 5.58381685885112e-08, "loss": 0.0, "num_input_tokens_seen": 124620392, "step": 184980 }, { "epoch": 4.519214325849559, "grad_norm": 7.031484710751101e-05, "learning_rate": 5.581007458143128e-08, "loss": 0.0, "num_input_tokens_seen": 124623784, "step": 184985 }, { "epoch": 4.519336476681406, "grad_norm": 0.0002684597857296467, "learning_rate": 5.578198744075424e-08, "loss": 0.0, "num_input_tokens_seen": 124627112, "step": 184990 }, { "epoch": 4.519458627513253, "grad_norm": 0.00010617887164698914, "learning_rate": 5.57539071666846e-08, "loss": 0.0, "num_input_tokens_seen": 124630376, "step": 184995 }, { "epoch": 4.519580778345101, "grad_norm": 5.1525526941986755e-05, "learning_rate": 5.572583375942675e-08, "loss": 0.0, "num_input_tokens_seen": 124633832, "step": 185000 }, { "epoch": 4.5197029291769475, "grad_norm": 0.0009134263964369893, "learning_rate": 5.569776721918451e-08, "loss": 0.0, "num_input_tokens_seen": 124637288, "step": 185005 }, { "epoch": 4.519825080008795, "grad_norm": 0.0002533289953134954, "learning_rate": 5.566970754616196e-08, "loss": 0.0001, "num_input_tokens_seen": 124640424, "step": 185010 }, { "epoch": 4.519947230840642, "grad_norm": 0.01770016737282276, "learning_rate": 5.564165474056337e-08, "loss": 0.0, "num_input_tokens_seen": 124643944, "step": 185015 }, { "epoch": 4.5200693816724895, "grad_norm": 0.0004412032139953226, "learning_rate": 5.5613608802592806e-08, "loss": 0.0, "num_input_tokens_seen": 124647528, "step": 185020 }, { "epoch": 4.520191532504336, "grad_norm": 0.0002838908403646201, "learning_rate": 5.558556973245387e-08, "loss": 0.0, "num_input_tokens_seen": 124651048, "step": 185025 }, { "epoch": 4.520313683336184, "grad_norm": 5.34481341674109e-06, "learning_rate": 5.555753753035064e-08, "loss": 0.0, "num_input_tokens_seen": 124654568, "step": 185030 }, { "epoch": 4.520435834168031, "grad_norm": 0.0009813703363761306, "learning_rate": 5.552951219648727e-08, "loss": 0.0, "num_input_tokens_seen": 124657704, "step": 185035 }, { "epoch": 4.520557984999877, "grad_norm": 0.0004406738735269755, "learning_rate": 5.550149373106716e-08, "loss": 0.0, "num_input_tokens_seen": 124661672, "step": 185040 }, { "epoch": 4.520680135831725, "grad_norm": 0.00022535765310749412, "learning_rate": 5.547348213429437e-08, "loss": 0.0, "num_input_tokens_seen": 124665128, "step": 185045 }, { "epoch": 4.520802286663573, "grad_norm": 0.0007305620820261538, "learning_rate": 5.544547740637229e-08, "loss": 0.0, "num_input_tokens_seen": 124668776, "step": 185050 }, { "epoch": 4.520924437495419, "grad_norm": 2.776537621684838e-05, "learning_rate": 5.5417479547504756e-08, "loss": 0.0, "num_input_tokens_seen": 124672488, "step": 185055 }, { "epoch": 4.521046588327266, "grad_norm": 0.0010982746025547385, "learning_rate": 5.538948855789549e-08, "loss": 0.0, "num_input_tokens_seen": 124675688, "step": 185060 }, { "epoch": 4.521168739159114, "grad_norm": 3.9815080526750535e-05, "learning_rate": 5.536150443774779e-08, "loss": 0.0, "num_input_tokens_seen": 124679016, "step": 185065 }, { "epoch": 4.5212908899909605, "grad_norm": 0.0006533220293931663, "learning_rate": 5.5333527187265474e-08, "loss": 0.0, "num_input_tokens_seen": 124682664, "step": 185070 }, { "epoch": 4.521413040822808, "grad_norm": 0.0008570468053221703, "learning_rate": 5.530555680665172e-08, "loss": 0.0, "num_input_tokens_seen": 124685544, "step": 185075 }, { "epoch": 4.521535191654655, "grad_norm": 0.00027924918686039746, "learning_rate": 5.5277593296110145e-08, "loss": 0.0, "num_input_tokens_seen": 124689384, "step": 185080 }, { "epoch": 4.5216573424865025, "grad_norm": 1.934741339937318e-05, "learning_rate": 5.5249636655843924e-08, "loss": 0.0, "num_input_tokens_seen": 124692456, "step": 185085 }, { "epoch": 4.521779493318349, "grad_norm": 0.0005722602945752442, "learning_rate": 5.5221686886056326e-08, "loss": 0.0, "num_input_tokens_seen": 124695784, "step": 185090 }, { "epoch": 4.521901644150197, "grad_norm": 0.0003941961913369596, "learning_rate": 5.519374398695098e-08, "loss": 0.0, "num_input_tokens_seen": 124699624, "step": 185095 }, { "epoch": 4.522023794982044, "grad_norm": 4.406500738696195e-05, "learning_rate": 5.516580795873071e-08, "loss": 0.0713, "num_input_tokens_seen": 124702888, "step": 185100 }, { "epoch": 4.522145945813891, "grad_norm": 40.052284240722656, "learning_rate": 5.513787880159892e-08, "loss": 0.0463, "num_input_tokens_seen": 124705960, "step": 185105 }, { "epoch": 4.522268096645738, "grad_norm": 3.8516096537932754e-05, "learning_rate": 5.5109956515758674e-08, "loss": 0.0, "num_input_tokens_seen": 124709160, "step": 185110 }, { "epoch": 4.522390247477586, "grad_norm": 0.0001808811502996832, "learning_rate": 5.508204110141279e-08, "loss": 0.0, "num_input_tokens_seen": 124712296, "step": 185115 }, { "epoch": 4.522512398309432, "grad_norm": 0.0002272677666042, "learning_rate": 5.505413255876457e-08, "loss": 0.0, "num_input_tokens_seen": 124716456, "step": 185120 }, { "epoch": 4.52263454914128, "grad_norm": 0.0005010889144614339, "learning_rate": 5.502623088801672e-08, "loss": 0.0, "num_input_tokens_seen": 124719784, "step": 185125 }, { "epoch": 4.522756699973127, "grad_norm": 0.0003914596454706043, "learning_rate": 5.4998336089372546e-08, "loss": 0.0, "num_input_tokens_seen": 124723368, "step": 185130 }, { "epoch": 4.5228788508049735, "grad_norm": 8.751282621233258e-06, "learning_rate": 5.497044816303442e-08, "loss": 0.0, "num_input_tokens_seen": 124726888, "step": 185135 }, { "epoch": 4.523001001636821, "grad_norm": 0.00026843592058867216, "learning_rate": 5.494256710920542e-08, "loss": 0.0, "num_input_tokens_seen": 124730216, "step": 185140 }, { "epoch": 4.523123152468669, "grad_norm": 0.0016787907807156444, "learning_rate": 5.4914692928088257e-08, "loss": 0.0, "num_input_tokens_seen": 124733544, "step": 185145 }, { "epoch": 4.5232453033005156, "grad_norm": 0.000363263301551342, "learning_rate": 5.488682561988556e-08, "loss": 0.0, "num_input_tokens_seen": 124736552, "step": 185150 }, { "epoch": 4.523367454132362, "grad_norm": 1.498419078416191e-05, "learning_rate": 5.485896518480026e-08, "loss": 0.0, "num_input_tokens_seen": 124739944, "step": 185155 }, { "epoch": 4.52348960496421, "grad_norm": 0.0005063650896772742, "learning_rate": 5.483111162303466e-08, "loss": 0.0, "num_input_tokens_seen": 124743144, "step": 185160 }, { "epoch": 4.523611755796057, "grad_norm": 7.881235069362447e-05, "learning_rate": 5.480326493479148e-08, "loss": 0.0, "num_input_tokens_seen": 124746856, "step": 185165 }, { "epoch": 4.523733906627904, "grad_norm": 0.00028229813324287534, "learning_rate": 5.477542512027311e-08, "loss": 0.0, "num_input_tokens_seen": 124749992, "step": 185170 }, { "epoch": 4.523856057459751, "grad_norm": 0.0024047689512372017, "learning_rate": 5.474759217968228e-08, "loss": 0.0, "num_input_tokens_seen": 124753512, "step": 185175 }, { "epoch": 4.523978208291599, "grad_norm": 0.00021296401973813772, "learning_rate": 5.4719766113220936e-08, "loss": 0.0, "num_input_tokens_seen": 124756904, "step": 185180 }, { "epoch": 4.524100359123445, "grad_norm": 0.02283492125570774, "learning_rate": 5.4691946921091804e-08, "loss": 0.0, "num_input_tokens_seen": 124759976, "step": 185185 }, { "epoch": 4.524222509955293, "grad_norm": 0.0004252835351508111, "learning_rate": 5.4664134603497166e-08, "loss": 0.0, "num_input_tokens_seen": 124763112, "step": 185190 }, { "epoch": 4.52434466078714, "grad_norm": 3.6331872252048925e-05, "learning_rate": 5.463632916063909e-08, "loss": 0.0, "num_input_tokens_seen": 124766568, "step": 185195 }, { "epoch": 4.5244668116189874, "grad_norm": 4.691815047408454e-05, "learning_rate": 5.460853059272008e-08, "loss": 0.0, "num_input_tokens_seen": 124769640, "step": 185200 }, { "epoch": 4.524588962450834, "grad_norm": 4.6906861825846136e-05, "learning_rate": 5.458073889994197e-08, "loss": 0.0, "num_input_tokens_seen": 124772904, "step": 185205 }, { "epoch": 4.524711113282682, "grad_norm": 5.869155211257748e-05, "learning_rate": 5.4552954082507154e-08, "loss": 0.0, "num_input_tokens_seen": 124776552, "step": 185210 }, { "epoch": 4.524833264114529, "grad_norm": 0.04095650091767311, "learning_rate": 5.452517614061736e-08, "loss": 0.0, "num_input_tokens_seen": 124780072, "step": 185215 }, { "epoch": 4.524955414946376, "grad_norm": 0.08172313123941422, "learning_rate": 5.4497405074474976e-08, "loss": 0.0, "num_input_tokens_seen": 124783208, "step": 185220 }, { "epoch": 4.525077565778223, "grad_norm": 0.00012791799963451922, "learning_rate": 5.446964088428174e-08, "loss": 0.0, "num_input_tokens_seen": 124786408, "step": 185225 }, { "epoch": 4.52519971661007, "grad_norm": 0.0001636029628571123, "learning_rate": 5.444188357023938e-08, "loss": 0.0, "num_input_tokens_seen": 124789800, "step": 185230 }, { "epoch": 4.525321867441917, "grad_norm": 9.43487902986817e-05, "learning_rate": 5.441413313255028e-08, "loss": 0.0, "num_input_tokens_seen": 124793192, "step": 185235 }, { "epoch": 4.525444018273764, "grad_norm": 0.0008830385049805045, "learning_rate": 5.4386389571415616e-08, "loss": 0.0, "num_input_tokens_seen": 124796392, "step": 185240 }, { "epoch": 4.525566169105612, "grad_norm": 3.347361052874476e-05, "learning_rate": 5.435865288703756e-08, "loss": 0.0, "num_input_tokens_seen": 124799592, "step": 185245 }, { "epoch": 4.5256883199374585, "grad_norm": 1.570507993164938e-05, "learning_rate": 5.433092307961784e-08, "loss": 0.0, "num_input_tokens_seen": 124802472, "step": 185250 }, { "epoch": 4.525810470769306, "grad_norm": 0.000179529408342205, "learning_rate": 5.4303200149357966e-08, "loss": 0.0, "num_input_tokens_seen": 124805480, "step": 185255 }, { "epoch": 4.525932621601153, "grad_norm": 0.005788368638604879, "learning_rate": 5.4275484096459546e-08, "loss": 0.0, "num_input_tokens_seen": 124809896, "step": 185260 }, { "epoch": 4.5260547724330005, "grad_norm": 1.500210419180803e-05, "learning_rate": 5.42477749211242e-08, "loss": 0.0, "num_input_tokens_seen": 124813672, "step": 185265 }, { "epoch": 4.526176923264847, "grad_norm": 0.00018123764311894774, "learning_rate": 5.422007262355344e-08, "loss": 0.0, "num_input_tokens_seen": 124816616, "step": 185270 }, { "epoch": 4.526299074096695, "grad_norm": 0.0005241407197900116, "learning_rate": 5.419237720394865e-08, "loss": 0.0, "num_input_tokens_seen": 124820456, "step": 185275 }, { "epoch": 4.526421224928542, "grad_norm": 0.00012486950436141342, "learning_rate": 5.416468866251123e-08, "loss": 0.0, "num_input_tokens_seen": 124823656, "step": 185280 }, { "epoch": 4.526543375760389, "grad_norm": 0.0008724026847630739, "learning_rate": 5.413700699944268e-08, "loss": 0.0305, "num_input_tokens_seen": 124826792, "step": 185285 }, { "epoch": 4.526665526592236, "grad_norm": 0.10582155734300613, "learning_rate": 5.4109332214944184e-08, "loss": 0.0001, "num_input_tokens_seen": 124829928, "step": 185290 }, { "epoch": 4.526787677424084, "grad_norm": 0.00011101571726612747, "learning_rate": 5.4081664309217126e-08, "loss": 0.0, "num_input_tokens_seen": 124832808, "step": 185295 }, { "epoch": 4.52690982825593, "grad_norm": 6.935147393960506e-05, "learning_rate": 5.405400328246246e-08, "loss": 0.0, "num_input_tokens_seen": 124835944, "step": 185300 }, { "epoch": 4.527031979087777, "grad_norm": 0.0004977515200152993, "learning_rate": 5.402634913488158e-08, "loss": 0.0, "num_input_tokens_seen": 124839656, "step": 185305 }, { "epoch": 4.527154129919625, "grad_norm": 0.0001641338167246431, "learning_rate": 5.399870186667554e-08, "loss": 0.0, "num_input_tokens_seen": 124842728, "step": 185310 }, { "epoch": 4.527276280751472, "grad_norm": 0.00629672734066844, "learning_rate": 5.3971061478045533e-08, "loss": 0.0, "num_input_tokens_seen": 124845864, "step": 185315 }, { "epoch": 4.527398431583319, "grad_norm": 0.0005530017078854144, "learning_rate": 5.3943427969192154e-08, "loss": 0.0, "num_input_tokens_seen": 124849512, "step": 185320 }, { "epoch": 4.527520582415166, "grad_norm": 0.006828108802437782, "learning_rate": 5.391580134031681e-08, "loss": 0.0, "num_input_tokens_seen": 124852904, "step": 185325 }, { "epoch": 4.5276427332470135, "grad_norm": 0.033004239201545715, "learning_rate": 5.388818159162034e-08, "loss": 0.0, "num_input_tokens_seen": 124856552, "step": 185330 }, { "epoch": 4.52776488407886, "grad_norm": 0.00016071656136773527, "learning_rate": 5.386056872330325e-08, "loss": 0.0, "num_input_tokens_seen": 124859880, "step": 185335 }, { "epoch": 4.527887034910708, "grad_norm": 0.006571914069354534, "learning_rate": 5.383296273556648e-08, "loss": 0.0072, "num_input_tokens_seen": 124862952, "step": 185340 }, { "epoch": 4.528009185742555, "grad_norm": 7.050389103824273e-05, "learning_rate": 5.380536362861121e-08, "loss": 0.0, "num_input_tokens_seen": 124866024, "step": 185345 }, { "epoch": 4.528131336574402, "grad_norm": 2.7005646188626997e-05, "learning_rate": 5.377777140263762e-08, "loss": 0.0, "num_input_tokens_seen": 124869672, "step": 185350 }, { "epoch": 4.528253487406249, "grad_norm": 0.0005079191760160029, "learning_rate": 5.375018605784665e-08, "loss": 0.0, "num_input_tokens_seen": 124873064, "step": 185355 }, { "epoch": 4.528375638238097, "grad_norm": 0.00022228542366065085, "learning_rate": 5.372260759443881e-08, "loss": 0.0, "num_input_tokens_seen": 124876136, "step": 185360 }, { "epoch": 4.528497789069943, "grad_norm": 0.0002706579689402133, "learning_rate": 5.36950360126146e-08, "loss": 0.0, "num_input_tokens_seen": 124879592, "step": 185365 }, { "epoch": 4.528619939901791, "grad_norm": 2.9045746487099677e-05, "learning_rate": 5.3667471312574766e-08, "loss": 0.0, "num_input_tokens_seen": 124883048, "step": 185370 }, { "epoch": 4.528742090733638, "grad_norm": 0.00029516901122406125, "learning_rate": 5.363991349451957e-08, "loss": 0.0, "num_input_tokens_seen": 124887080, "step": 185375 }, { "epoch": 4.528864241565485, "grad_norm": 0.000799721572548151, "learning_rate": 5.3612362558649536e-08, "loss": 0.0, "num_input_tokens_seen": 124890152, "step": 185380 }, { "epoch": 4.528986392397332, "grad_norm": 0.000487153185531497, "learning_rate": 5.358481850516483e-08, "loss": 0.0, "num_input_tokens_seen": 124893096, "step": 185385 }, { "epoch": 4.52910854322918, "grad_norm": 0.0011373942252248526, "learning_rate": 5.3557281334265957e-08, "loss": 0.0, "num_input_tokens_seen": 124896360, "step": 185390 }, { "epoch": 4.5292306940610265, "grad_norm": 0.0003631273575592786, "learning_rate": 5.352975104615298e-08, "loss": 0.0489, "num_input_tokens_seen": 124899560, "step": 185395 }, { "epoch": 4.529352844892873, "grad_norm": 0.014406845904886723, "learning_rate": 5.35022276410263e-08, "loss": 0.0, "num_input_tokens_seen": 124903144, "step": 185400 }, { "epoch": 4.529474995724721, "grad_norm": 0.0010270047932863235, "learning_rate": 5.347471111908608e-08, "loss": 0.0, "num_input_tokens_seen": 124906280, "step": 185405 }, { "epoch": 4.5295971465565685, "grad_norm": 0.0006817655521444976, "learning_rate": 5.3447201480532164e-08, "loss": 0.0, "num_input_tokens_seen": 124909800, "step": 185410 }, { "epoch": 4.529719297388415, "grad_norm": 0.00252143875695765, "learning_rate": 5.3419698725564956e-08, "loss": 0.0, "num_input_tokens_seen": 124912936, "step": 185415 }, { "epoch": 4.529841448220262, "grad_norm": 7.715379615547135e-05, "learning_rate": 5.3392202854384284e-08, "loss": 0.0, "num_input_tokens_seen": 124916264, "step": 185420 }, { "epoch": 4.52996359905211, "grad_norm": 0.0008216107380576432, "learning_rate": 5.3364713867189995e-08, "loss": 0.0, "num_input_tokens_seen": 124919208, "step": 185425 }, { "epoch": 4.530085749883956, "grad_norm": 2.122437217622064e-05, "learning_rate": 5.3337231764182366e-08, "loss": 0.0, "num_input_tokens_seen": 124922664, "step": 185430 }, { "epoch": 4.530207900715804, "grad_norm": 0.00047716827248223126, "learning_rate": 5.3309756545560694e-08, "loss": 0.0, "num_input_tokens_seen": 124926056, "step": 185435 }, { "epoch": 4.530330051547651, "grad_norm": 0.00011559919221326709, "learning_rate": 5.328228821152536e-08, "loss": 0.0, "num_input_tokens_seen": 124929576, "step": 185440 }, { "epoch": 4.530452202379498, "grad_norm": 0.0006488763028755784, "learning_rate": 5.325482676227566e-08, "loss": 0.0, "num_input_tokens_seen": 124932584, "step": 185445 }, { "epoch": 4.530574353211345, "grad_norm": 0.00010975029726978391, "learning_rate": 5.3227372198011657e-08, "loss": 0.0, "num_input_tokens_seen": 124935976, "step": 185450 }, { "epoch": 4.530696504043193, "grad_norm": 7.937142072478309e-05, "learning_rate": 5.319992451893274e-08, "loss": 0.0, "num_input_tokens_seen": 124939240, "step": 185455 }, { "epoch": 4.5308186548750395, "grad_norm": 0.00013056962052360177, "learning_rate": 5.3172483725238635e-08, "loss": 0.0, "num_input_tokens_seen": 124942312, "step": 185460 }, { "epoch": 4.530940805706887, "grad_norm": 0.0003446988412179053, "learning_rate": 5.3145049817128975e-08, "loss": 0.0, "num_input_tokens_seen": 124946600, "step": 185465 }, { "epoch": 4.531062956538734, "grad_norm": 0.00012127536319894716, "learning_rate": 5.311762279480314e-08, "loss": 0.0, "num_input_tokens_seen": 124949928, "step": 185470 }, { "epoch": 4.5311851073705816, "grad_norm": 0.0005341236828826368, "learning_rate": 5.309020265846076e-08, "loss": 0.0, "num_input_tokens_seen": 124953128, "step": 185475 }, { "epoch": 4.531307258202428, "grad_norm": 0.0023311295080929995, "learning_rate": 5.306278940830089e-08, "loss": 0.0, "num_input_tokens_seen": 124956712, "step": 185480 }, { "epoch": 4.531429409034276, "grad_norm": 0.0015459018759429455, "learning_rate": 5.3035383044523266e-08, "loss": 0.0, "num_input_tokens_seen": 124960552, "step": 185485 }, { "epoch": 4.531551559866123, "grad_norm": 4.727648411062546e-05, "learning_rate": 5.3007983567326943e-08, "loss": 0.0, "num_input_tokens_seen": 124963816, "step": 185490 }, { "epoch": 4.531673710697969, "grad_norm": 0.0004621119878720492, "learning_rate": 5.298059097691132e-08, "loss": 0.0, "num_input_tokens_seen": 124967080, "step": 185495 }, { "epoch": 4.531795861529817, "grad_norm": 0.0013261305866762996, "learning_rate": 5.295320527347558e-08, "loss": 0.0, "num_input_tokens_seen": 124970856, "step": 185500 }, { "epoch": 4.531918012361664, "grad_norm": 6.2955332396086305e-06, "learning_rate": 5.292582645721877e-08, "loss": 0.0, "num_input_tokens_seen": 124975208, "step": 185505 }, { "epoch": 4.532040163193511, "grad_norm": 2.9312808692338876e-05, "learning_rate": 5.2898454528340296e-08, "loss": 0.0, "num_input_tokens_seen": 124980648, "step": 185510 }, { "epoch": 4.532162314025358, "grad_norm": 0.00013647187734022737, "learning_rate": 5.287108948703878e-08, "loss": 0.0254, "num_input_tokens_seen": 124984168, "step": 185515 }, { "epoch": 4.532284464857206, "grad_norm": 0.000533355341758579, "learning_rate": 5.284373133351361e-08, "loss": 0.0, "num_input_tokens_seen": 124987816, "step": 185520 }, { "epoch": 4.532406615689053, "grad_norm": 2.2797685232944787e-05, "learning_rate": 5.2816380067963406e-08, "loss": 0.0, "num_input_tokens_seen": 124991144, "step": 185525 }, { "epoch": 4.5325287665209, "grad_norm": 0.00010959566134260967, "learning_rate": 5.278903569058735e-08, "loss": 0.0, "num_input_tokens_seen": 124994344, "step": 185530 }, { "epoch": 4.532650917352747, "grad_norm": 0.0015156366862356663, "learning_rate": 5.276169820158427e-08, "loss": 0.0, "num_input_tokens_seen": 124997288, "step": 185535 }, { "epoch": 4.532773068184595, "grad_norm": 0.0022079667542129755, "learning_rate": 5.27343676011528e-08, "loss": 0.0, "num_input_tokens_seen": 125001192, "step": 185540 }, { "epoch": 4.532895219016441, "grad_norm": 0.022384779527783394, "learning_rate": 5.270704388949188e-08, "loss": 0.0, "num_input_tokens_seen": 125004456, "step": 185545 }, { "epoch": 4.533017369848289, "grad_norm": 0.0008203625911846757, "learning_rate": 5.2679727066799905e-08, "loss": 0.0, "num_input_tokens_seen": 125008040, "step": 185550 }, { "epoch": 4.533139520680136, "grad_norm": 6.327245500870049e-05, "learning_rate": 5.265241713327584e-08, "loss": 0.0, "num_input_tokens_seen": 125011496, "step": 185555 }, { "epoch": 4.533261671511983, "grad_norm": 0.00014428046415559947, "learning_rate": 5.262511408911841e-08, "loss": 0.0, "num_input_tokens_seen": 125015016, "step": 185560 }, { "epoch": 4.53338382234383, "grad_norm": 0.0006246797856874764, "learning_rate": 5.2597817934525776e-08, "loss": 0.0, "num_input_tokens_seen": 125018408, "step": 185565 }, { "epoch": 4.533505973175677, "grad_norm": 3.378166729817167e-05, "learning_rate": 5.257052866969669e-08, "loss": 0.0, "num_input_tokens_seen": 125021992, "step": 185570 }, { "epoch": 4.5336281240075245, "grad_norm": 0.004872548393905163, "learning_rate": 5.2543246294829426e-08, "loss": 0.0, "num_input_tokens_seen": 125025384, "step": 185575 }, { "epoch": 4.533750274839372, "grad_norm": 0.0018695322796702385, "learning_rate": 5.2515970810122715e-08, "loss": 0.0, "num_input_tokens_seen": 125028712, "step": 185580 }, { "epoch": 4.533872425671219, "grad_norm": 9.729518205858767e-05, "learning_rate": 5.248870221577451e-08, "loss": 0.0, "num_input_tokens_seen": 125031848, "step": 185585 }, { "epoch": 4.533994576503066, "grad_norm": 0.0005519227706827223, "learning_rate": 5.2461440511983424e-08, "loss": 0.0, "num_input_tokens_seen": 125034984, "step": 185590 }, { "epoch": 4.534116727334913, "grad_norm": 0.00024730851873755455, "learning_rate": 5.243418569894764e-08, "loss": 0.0, "num_input_tokens_seen": 125038248, "step": 185595 }, { "epoch": 4.53423887816676, "grad_norm": 0.0003327416779939085, "learning_rate": 5.2406937776865225e-08, "loss": 0.0, "num_input_tokens_seen": 125041640, "step": 185600 }, { "epoch": 4.534361028998608, "grad_norm": 0.00013707105244975537, "learning_rate": 5.2379696745934455e-08, "loss": 0.0, "num_input_tokens_seen": 125044776, "step": 185605 }, { "epoch": 4.534483179830454, "grad_norm": 0.0006666643312200904, "learning_rate": 5.23524626063534e-08, "loss": 0.0003, "num_input_tokens_seen": 125047912, "step": 185610 }, { "epoch": 4.534605330662302, "grad_norm": 0.0019504765514284372, "learning_rate": 5.232523535832012e-08, "loss": 0.0, "num_input_tokens_seen": 125051496, "step": 185615 }, { "epoch": 4.534727481494149, "grad_norm": 0.00022360155708156526, "learning_rate": 5.229801500203268e-08, "loss": 0.0, "num_input_tokens_seen": 125054888, "step": 185620 }, { "epoch": 4.534849632325996, "grad_norm": 0.00042555946856737137, "learning_rate": 5.2270801537689035e-08, "loss": 0.0, "num_input_tokens_seen": 125058536, "step": 185625 }, { "epoch": 4.534971783157843, "grad_norm": 0.004613831639289856, "learning_rate": 5.2243594965486916e-08, "loss": 0.0, "num_input_tokens_seen": 125061928, "step": 185630 }, { "epoch": 4.535093933989691, "grad_norm": 0.0004901207285001874, "learning_rate": 5.221639528562438e-08, "loss": 0.0, "num_input_tokens_seen": 125065704, "step": 185635 }, { "epoch": 4.5352160848215375, "grad_norm": 8.990954665932804e-05, "learning_rate": 5.218920249829906e-08, "loss": 0.0, "num_input_tokens_seen": 125068968, "step": 185640 }, { "epoch": 4.535338235653385, "grad_norm": 0.0013385992497205734, "learning_rate": 5.216201660370878e-08, "loss": 0.0, "num_input_tokens_seen": 125072168, "step": 185645 }, { "epoch": 4.535460386485232, "grad_norm": 0.0028935212176293135, "learning_rate": 5.2134837602051174e-08, "loss": 0.0, "num_input_tokens_seen": 125076008, "step": 185650 }, { "epoch": 4.5355825373170795, "grad_norm": 0.7927491068840027, "learning_rate": 5.210766549352419e-08, "loss": 0.0, "num_input_tokens_seen": 125078888, "step": 185655 }, { "epoch": 4.535704688148926, "grad_norm": 0.00037081216578371823, "learning_rate": 5.2080500278325e-08, "loss": 0.0, "num_input_tokens_seen": 125082152, "step": 185660 }, { "epoch": 4.535826838980773, "grad_norm": 0.00030138157308101654, "learning_rate": 5.2053341956651566e-08, "loss": 0.0, "num_input_tokens_seen": 125085864, "step": 185665 }, { "epoch": 4.535948989812621, "grad_norm": 0.0005100808339193463, "learning_rate": 5.202619052870105e-08, "loss": 0.0, "num_input_tokens_seen": 125089256, "step": 185670 }, { "epoch": 4.536071140644468, "grad_norm": 0.00035853905137628317, "learning_rate": 5.19990459946712e-08, "loss": 0.0, "num_input_tokens_seen": 125092968, "step": 185675 }, { "epoch": 4.536193291476315, "grad_norm": 2.2261907361098565e-05, "learning_rate": 5.1971908354759065e-08, "loss": 0.0, "num_input_tokens_seen": 125096232, "step": 185680 }, { "epoch": 4.536315442308162, "grad_norm": 0.0009718828368932009, "learning_rate": 5.194477760916227e-08, "loss": 0.0, "num_input_tokens_seen": 125099880, "step": 185685 }, { "epoch": 4.536437593140009, "grad_norm": 14.48056697845459, "learning_rate": 5.1917653758078216e-08, "loss": 0.0256, "num_input_tokens_seen": 125103080, "step": 185690 }, { "epoch": 4.536559743971856, "grad_norm": 0.00042473600478842854, "learning_rate": 5.189053680170374e-08, "loss": 0.0, "num_input_tokens_seen": 125106856, "step": 185695 }, { "epoch": 4.536681894803704, "grad_norm": 0.027816886082291603, "learning_rate": 5.186342674023647e-08, "loss": 0.0, "num_input_tokens_seen": 125110248, "step": 185700 }, { "epoch": 4.5368040456355505, "grad_norm": 0.0033893415238708258, "learning_rate": 5.1836323573873354e-08, "loss": 0.0, "num_input_tokens_seen": 125113256, "step": 185705 }, { "epoch": 4.536926196467398, "grad_norm": 1.5602147579193115, "learning_rate": 5.180922730281134e-08, "loss": 0.0006, "num_input_tokens_seen": 125116392, "step": 185710 }, { "epoch": 4.537048347299245, "grad_norm": 0.0005292331916280091, "learning_rate": 5.178213792724795e-08, "loss": 0.0, "num_input_tokens_seen": 125119592, "step": 185715 }, { "epoch": 4.5371704981310925, "grad_norm": 0.0013334914110600948, "learning_rate": 5.175505544737968e-08, "loss": 0.0, "num_input_tokens_seen": 125122856, "step": 185720 }, { "epoch": 4.537292648962939, "grad_norm": 0.0020089801400899887, "learning_rate": 5.1727979863403826e-08, "loss": 0.0, "num_input_tokens_seen": 125126312, "step": 185725 }, { "epoch": 4.537414799794787, "grad_norm": 0.00014795052993576974, "learning_rate": 5.1700911175517114e-08, "loss": 0.0474, "num_input_tokens_seen": 125129448, "step": 185730 }, { "epoch": 4.537536950626634, "grad_norm": 0.00015690297004766762, "learning_rate": 5.167384938391639e-08, "loss": 0.0, "num_input_tokens_seen": 125133096, "step": 185735 }, { "epoch": 4.537659101458481, "grad_norm": 0.005538203753530979, "learning_rate": 5.1646794488798606e-08, "loss": 0.0, "num_input_tokens_seen": 125136424, "step": 185740 }, { "epoch": 4.537781252290328, "grad_norm": 0.0001602565753273666, "learning_rate": 5.161974649036027e-08, "loss": 0.0, "num_input_tokens_seen": 125139880, "step": 185745 }, { "epoch": 4.537903403122176, "grad_norm": 0.00017262480105273426, "learning_rate": 5.159270538879834e-08, "loss": 0.0, "num_input_tokens_seen": 125143400, "step": 185750 }, { "epoch": 4.538025553954022, "grad_norm": 0.0003744983405340463, "learning_rate": 5.156567118430921e-08, "loss": 0.0, "num_input_tokens_seen": 125146856, "step": 185755 }, { "epoch": 4.538147704785869, "grad_norm": 5.3543702961178496e-05, "learning_rate": 5.1538643877089724e-08, "loss": 0.0, "num_input_tokens_seen": 125150120, "step": 185760 }, { "epoch": 4.538269855617717, "grad_norm": 0.00017326330998912454, "learning_rate": 5.151162346733629e-08, "loss": 0.0, "num_input_tokens_seen": 125153000, "step": 185765 }, { "epoch": 4.538392006449564, "grad_norm": 0.0007275182870216668, "learning_rate": 5.1484609955245395e-08, "loss": 0.0, "num_input_tokens_seen": 125156584, "step": 185770 }, { "epoch": 4.538514157281411, "grad_norm": 0.00010956819460261613, "learning_rate": 5.145760334101368e-08, "loss": 0.0, "num_input_tokens_seen": 125159848, "step": 185775 }, { "epoch": 4.538636308113258, "grad_norm": 0.00023068742302712053, "learning_rate": 5.14306036248372e-08, "loss": 0.0, "num_input_tokens_seen": 125163240, "step": 185780 }, { "epoch": 4.5387584589451055, "grad_norm": 0.0010408993111923337, "learning_rate": 5.140361080691269e-08, "loss": 0.0001, "num_input_tokens_seen": 125166440, "step": 185785 }, { "epoch": 4.538880609776952, "grad_norm": 0.0009393574437126517, "learning_rate": 5.1376624887436105e-08, "loss": 0.0, "num_input_tokens_seen": 125169576, "step": 185790 }, { "epoch": 4.5390027606088, "grad_norm": 0.0018601644551381469, "learning_rate": 5.134964586660406e-08, "loss": 0.0, "num_input_tokens_seen": 125173032, "step": 185795 }, { "epoch": 4.539124911440647, "grad_norm": 0.0002813244063872844, "learning_rate": 5.13226737446123e-08, "loss": 0.0, "num_input_tokens_seen": 125176872, "step": 185800 }, { "epoch": 4.539247062272494, "grad_norm": 0.0007881836500018835, "learning_rate": 5.129570852165732e-08, "loss": 0.0, "num_input_tokens_seen": 125179880, "step": 185805 }, { "epoch": 4.539369213104341, "grad_norm": 0.007888463325798512, "learning_rate": 5.1268750197935196e-08, "loss": 0.0, "num_input_tokens_seen": 125183848, "step": 185810 }, { "epoch": 4.539491363936189, "grad_norm": 0.0018474315293133259, "learning_rate": 5.124179877364176e-08, "loss": 0.0, "num_input_tokens_seen": 125187304, "step": 185815 }, { "epoch": 4.539613514768035, "grad_norm": 8.314439764944836e-05, "learning_rate": 5.1214854248973316e-08, "loss": 0.0, "num_input_tokens_seen": 125191400, "step": 185820 }, { "epoch": 4.539735665599883, "grad_norm": 1.6251518900389783e-05, "learning_rate": 5.118791662412558e-08, "loss": 0.0, "num_input_tokens_seen": 125195432, "step": 185825 }, { "epoch": 4.53985781643173, "grad_norm": 0.004052783828228712, "learning_rate": 5.116098589929452e-08, "loss": 0.0, "num_input_tokens_seen": 125198888, "step": 185830 }, { "epoch": 4.539979967263577, "grad_norm": 0.00011836458725156263, "learning_rate": 5.1134062074675966e-08, "loss": 0.0, "num_input_tokens_seen": 125202536, "step": 185835 }, { "epoch": 4.540102118095424, "grad_norm": 3.898305294569582e-05, "learning_rate": 5.110714515046577e-08, "loss": 0.0, "num_input_tokens_seen": 125205672, "step": 185840 }, { "epoch": 4.540224268927272, "grad_norm": 0.00021784893760923296, "learning_rate": 5.108023512685966e-08, "loss": 0.0, "num_input_tokens_seen": 125209128, "step": 185845 }, { "epoch": 4.540346419759119, "grad_norm": 9.264857362722978e-05, "learning_rate": 5.105333200405315e-08, "loss": 0.0, "num_input_tokens_seen": 125212200, "step": 185850 }, { "epoch": 4.540468570590965, "grad_norm": 0.0009690878796391189, "learning_rate": 5.102643578224219e-08, "loss": 0.0, "num_input_tokens_seen": 125216296, "step": 185855 }, { "epoch": 4.540590721422813, "grad_norm": 0.01024213619530201, "learning_rate": 5.099954646162208e-08, "loss": 0.0, "num_input_tokens_seen": 125219368, "step": 185860 }, { "epoch": 4.54071287225466, "grad_norm": 0.000201819246285595, "learning_rate": 5.0972664042388534e-08, "loss": 0.0, "num_input_tokens_seen": 125222696, "step": 185865 }, { "epoch": 4.540835023086507, "grad_norm": 0.0005003840196877718, "learning_rate": 5.0945788524737186e-08, "loss": 0.0, "num_input_tokens_seen": 125226024, "step": 185870 }, { "epoch": 4.540957173918354, "grad_norm": 0.004579643253237009, "learning_rate": 5.0918919908863214e-08, "loss": 0.0, "num_input_tokens_seen": 125229224, "step": 185875 }, { "epoch": 4.541079324750202, "grad_norm": 0.002748160855844617, "learning_rate": 5.089205819496223e-08, "loss": 0.0, "num_input_tokens_seen": 125232616, "step": 185880 }, { "epoch": 4.5412014755820485, "grad_norm": 0.0002458605740685016, "learning_rate": 5.0865203383229305e-08, "loss": 0.0402, "num_input_tokens_seen": 125235560, "step": 185885 }, { "epoch": 4.541323626413896, "grad_norm": 9.4459333922714e-05, "learning_rate": 5.0838355473860174e-08, "loss": 0.0, "num_input_tokens_seen": 125239272, "step": 185890 }, { "epoch": 4.541445777245743, "grad_norm": 0.00036171413375996053, "learning_rate": 5.081151446704956e-08, "loss": 0.0, "num_input_tokens_seen": 125242600, "step": 185895 }, { "epoch": 4.5415679280775905, "grad_norm": 0.0018198946490883827, "learning_rate": 5.0784680362992884e-08, "loss": 0.0, "num_input_tokens_seen": 125246056, "step": 185900 }, { "epoch": 4.541690078909437, "grad_norm": 0.0005793093587271869, "learning_rate": 5.075785316188552e-08, "loss": 0.0, "num_input_tokens_seen": 125249448, "step": 185905 }, { "epoch": 4.541812229741285, "grad_norm": 5.1052174967480823e-05, "learning_rate": 5.073103286392222e-08, "loss": 0.0, "num_input_tokens_seen": 125252776, "step": 185910 }, { "epoch": 4.541934380573132, "grad_norm": 3.680026202346198e-05, "learning_rate": 5.070421946929837e-08, "loss": 0.0, "num_input_tokens_seen": 125255976, "step": 185915 }, { "epoch": 4.542056531404979, "grad_norm": 3.232800372643396e-05, "learning_rate": 5.06774129782086e-08, "loss": 0.0, "num_input_tokens_seen": 125259240, "step": 185920 }, { "epoch": 4.542178682236826, "grad_norm": 0.00017554397345520556, "learning_rate": 5.0650613390847975e-08, "loss": 0.0, "num_input_tokens_seen": 125262632, "step": 185925 }, { "epoch": 4.542300833068673, "grad_norm": 0.002945692278444767, "learning_rate": 5.0623820707411556e-08, "loss": 0.0, "num_input_tokens_seen": 125265448, "step": 185930 }, { "epoch": 4.54242298390052, "grad_norm": 0.002603281522169709, "learning_rate": 5.0597034928094084e-08, "loss": 0.0, "num_input_tokens_seen": 125268904, "step": 185935 }, { "epoch": 4.542545134732368, "grad_norm": 1.2701857485808432e-05, "learning_rate": 5.057025605309029e-08, "loss": 0.0, "num_input_tokens_seen": 125272424, "step": 185940 }, { "epoch": 4.542667285564215, "grad_norm": 8.991216600406915e-05, "learning_rate": 5.054348408259501e-08, "loss": 0.0, "num_input_tokens_seen": 125275624, "step": 185945 }, { "epoch": 4.5427894363960615, "grad_norm": 0.00012111781688872725, "learning_rate": 5.051671901680288e-08, "loss": 0.0, "num_input_tokens_seen": 125279080, "step": 185950 }, { "epoch": 4.542911587227909, "grad_norm": 0.003195281373336911, "learning_rate": 5.0489960855908395e-08, "loss": 0.0, "num_input_tokens_seen": 125282344, "step": 185955 }, { "epoch": 4.543033738059756, "grad_norm": 0.0038429235573858023, "learning_rate": 5.04632096001063e-08, "loss": 0.0, "num_input_tokens_seen": 125285608, "step": 185960 }, { "epoch": 4.5431558888916035, "grad_norm": 1.9571771190385334e-05, "learning_rate": 5.043646524959133e-08, "loss": 0.0, "num_input_tokens_seen": 125289640, "step": 185965 }, { "epoch": 4.54327803972345, "grad_norm": 0.00022896616428624839, "learning_rate": 5.0409727804557655e-08, "loss": 0.0, "num_input_tokens_seen": 125292968, "step": 185970 }, { "epoch": 4.543400190555298, "grad_norm": 0.00019275565864518285, "learning_rate": 5.03829972651999e-08, "loss": 0.0, "num_input_tokens_seen": 125296360, "step": 185975 }, { "epoch": 4.543522341387145, "grad_norm": 0.0001385089854011312, "learning_rate": 5.0356273631712357e-08, "loss": 0.0, "num_input_tokens_seen": 125299624, "step": 185980 }, { "epoch": 4.543644492218992, "grad_norm": 0.00014744758664164692, "learning_rate": 5.032955690428953e-08, "loss": 0.0, "num_input_tokens_seen": 125302888, "step": 185985 }, { "epoch": 4.543766643050839, "grad_norm": 0.0001173034543171525, "learning_rate": 5.030284708312549e-08, "loss": 0.0, "num_input_tokens_seen": 125306024, "step": 185990 }, { "epoch": 4.543888793882687, "grad_norm": 0.00016871007392182946, "learning_rate": 5.027614416841453e-08, "loss": 0.0, "num_input_tokens_seen": 125309352, "step": 185995 }, { "epoch": 4.544010944714533, "grad_norm": 0.0008024271228350699, "learning_rate": 5.024944816035104e-08, "loss": 0.0, "num_input_tokens_seen": 125312680, "step": 186000 }, { "epoch": 4.544133095546381, "grad_norm": 0.013525069691240788, "learning_rate": 5.0222759059128874e-08, "loss": 0.0, "num_input_tokens_seen": 125315752, "step": 186005 }, { "epoch": 4.544255246378228, "grad_norm": 0.0012057870626449585, "learning_rate": 5.0196076864942426e-08, "loss": 0.0, "num_input_tokens_seen": 125318760, "step": 186010 }, { "epoch": 4.544377397210075, "grad_norm": 0.00020089205645490438, "learning_rate": 5.0169401577985435e-08, "loss": 0.0, "num_input_tokens_seen": 125321832, "step": 186015 }, { "epoch": 4.544499548041922, "grad_norm": 0.0002571505028754473, "learning_rate": 5.014273319845197e-08, "loss": 0.0, "num_input_tokens_seen": 125324968, "step": 186020 }, { "epoch": 4.544621698873769, "grad_norm": 6.082994514144957e-05, "learning_rate": 5.01160717265362e-08, "loss": 0.0, "num_input_tokens_seen": 125328488, "step": 186025 }, { "epoch": 4.5447438497056165, "grad_norm": 0.009646477177739143, "learning_rate": 5.008941716243176e-08, "loss": 0.0, "num_input_tokens_seen": 125332520, "step": 186030 }, { "epoch": 4.544866000537464, "grad_norm": 0.00025338766863569617, "learning_rate": 5.0062769506332704e-08, "loss": 0.0, "num_input_tokens_seen": 125335656, "step": 186035 }, { "epoch": 4.544988151369311, "grad_norm": 2.7880143534275703e-05, "learning_rate": 5.003612875843266e-08, "loss": 0.0, "num_input_tokens_seen": 125338920, "step": 186040 }, { "epoch": 4.545110302201158, "grad_norm": 0.0014782889047637582, "learning_rate": 5.000949491892525e-08, "loss": 0.0, "num_input_tokens_seen": 125342120, "step": 186045 }, { "epoch": 4.545232453033005, "grad_norm": 9.009351197164506e-05, "learning_rate": 4.998286798800444e-08, "loss": 0.0, "num_input_tokens_seen": 125345576, "step": 186050 }, { "epoch": 4.545354603864852, "grad_norm": 0.0061968364752829075, "learning_rate": 4.995624796586362e-08, "loss": 0.0, "num_input_tokens_seen": 125349096, "step": 186055 }, { "epoch": 4.5454767546967, "grad_norm": 1.4608061974286102e-05, "learning_rate": 4.992963485269663e-08, "loss": 0.0, "num_input_tokens_seen": 125352104, "step": 186060 }, { "epoch": 4.545598905528546, "grad_norm": 0.0016588385915383697, "learning_rate": 4.990302864869678e-08, "loss": 0.0, "num_input_tokens_seen": 125355752, "step": 186065 }, { "epoch": 4.545721056360394, "grad_norm": 2.921727718785405e-05, "learning_rate": 4.987642935405767e-08, "loss": 0.0, "num_input_tokens_seen": 125359144, "step": 186070 }, { "epoch": 4.545843207192241, "grad_norm": 0.00036771618761122227, "learning_rate": 4.984983696897271e-08, "loss": 0.0, "num_input_tokens_seen": 125362728, "step": 186075 }, { "epoch": 4.545965358024088, "grad_norm": 0.001058812951669097, "learning_rate": 4.98232514936352e-08, "loss": 0.0, "num_input_tokens_seen": 125365992, "step": 186080 }, { "epoch": 4.546087508855935, "grad_norm": 0.0024411482736468315, "learning_rate": 4.979667292823875e-08, "loss": 0.0, "num_input_tokens_seen": 125370088, "step": 186085 }, { "epoch": 4.546209659687783, "grad_norm": 5.861128738615662e-05, "learning_rate": 4.9770101272976316e-08, "loss": 0.0, "num_input_tokens_seen": 125373544, "step": 186090 }, { "epoch": 4.5463318105196295, "grad_norm": 0.00014239706797525287, "learning_rate": 4.974353652804142e-08, "loss": 0.0, "num_input_tokens_seen": 125376872, "step": 186095 }, { "epoch": 4.546453961351477, "grad_norm": 0.010437625460326672, "learning_rate": 4.971697869362701e-08, "loss": 0.0, "num_input_tokens_seen": 125380264, "step": 186100 }, { "epoch": 4.546576112183324, "grad_norm": 3.1913328712107614e-05, "learning_rate": 4.969042776992649e-08, "loss": 0.0, "num_input_tokens_seen": 125383400, "step": 186105 }, { "epoch": 4.5466982630151715, "grad_norm": 1.1161178917973302e-05, "learning_rate": 4.9663883757132596e-08, "loss": 0.0512, "num_input_tokens_seen": 125386728, "step": 186110 }, { "epoch": 4.546820413847018, "grad_norm": 0.0003454769030213356, "learning_rate": 4.96373466554385e-08, "loss": 0.0, "num_input_tokens_seen": 125390440, "step": 186115 }, { "epoch": 4.546942564678865, "grad_norm": 0.00020126953313592821, "learning_rate": 4.961081646503751e-08, "loss": 0.0, "num_input_tokens_seen": 125393960, "step": 186120 }, { "epoch": 4.547064715510713, "grad_norm": 0.0006144981598481536, "learning_rate": 4.9584293186122004e-08, "loss": 0.0, "num_input_tokens_seen": 125397800, "step": 186125 }, { "epoch": 4.547186866342559, "grad_norm": 0.000722163007594645, "learning_rate": 4.95577768188854e-08, "loss": 0.0, "num_input_tokens_seen": 125401512, "step": 186130 }, { "epoch": 4.547309017174407, "grad_norm": 5.897651135455817e-05, "learning_rate": 4.953126736352009e-08, "loss": 0.0, "num_input_tokens_seen": 125404968, "step": 186135 }, { "epoch": 4.547431168006254, "grad_norm": 0.002091348869726062, "learning_rate": 4.950476482021915e-08, "loss": 0.0, "num_input_tokens_seen": 125408296, "step": 186140 }, { "epoch": 4.547553318838101, "grad_norm": 0.0008667752845212817, "learning_rate": 4.947826918917519e-08, "loss": 0.0, "num_input_tokens_seen": 125411176, "step": 186145 }, { "epoch": 4.547675469669948, "grad_norm": 0.0008085620356723666, "learning_rate": 4.945178047058096e-08, "loss": 0.0, "num_input_tokens_seen": 125414760, "step": 186150 }, { "epoch": 4.547797620501796, "grad_norm": 0.001199282007291913, "learning_rate": 4.942529866462908e-08, "loss": 0.0, "num_input_tokens_seen": 125418152, "step": 186155 }, { "epoch": 4.547919771333643, "grad_norm": 0.00020651493105106056, "learning_rate": 4.9398823771511944e-08, "loss": 0.0, "num_input_tokens_seen": 125421224, "step": 186160 }, { "epoch": 4.54804192216549, "grad_norm": 0.0018644839292392135, "learning_rate": 4.9372355791422406e-08, "loss": 0.0, "num_input_tokens_seen": 125424808, "step": 186165 }, { "epoch": 4.548164072997337, "grad_norm": 0.000575466372538358, "learning_rate": 4.934589472455264e-08, "loss": 0.0, "num_input_tokens_seen": 125428392, "step": 186170 }, { "epoch": 4.548286223829185, "grad_norm": 0.0004205135628581047, "learning_rate": 4.9319440571095164e-08, "loss": 0.0, "num_input_tokens_seen": 125431784, "step": 186175 }, { "epoch": 4.548408374661031, "grad_norm": 0.0006393736694008112, "learning_rate": 4.9292993331242595e-08, "loss": 0.0, "num_input_tokens_seen": 125435048, "step": 186180 }, { "epoch": 4.548530525492879, "grad_norm": 0.00011096692469436675, "learning_rate": 4.9266553005187005e-08, "loss": 0.0, "num_input_tokens_seen": 125438504, "step": 186185 }, { "epoch": 4.548652676324726, "grad_norm": 0.00046578276669606566, "learning_rate": 4.924011959312091e-08, "loss": 0.0, "num_input_tokens_seen": 125441832, "step": 186190 }, { "epoch": 4.548774827156572, "grad_norm": 7.459130574716255e-05, "learning_rate": 4.9213693095236154e-08, "loss": 0.0, "num_input_tokens_seen": 125445288, "step": 186195 }, { "epoch": 4.54889697798842, "grad_norm": 0.00038041500374674797, "learning_rate": 4.918727351172536e-08, "loss": 0.0359, "num_input_tokens_seen": 125448168, "step": 186200 }, { "epoch": 4.549019128820268, "grad_norm": 0.0028200701344758272, "learning_rate": 4.916086084278026e-08, "loss": 0.0, "num_input_tokens_seen": 125451816, "step": 186205 }, { "epoch": 4.5491412796521145, "grad_norm": 0.0003451250959187746, "learning_rate": 4.913445508859315e-08, "loss": 0.0, "num_input_tokens_seen": 125455208, "step": 186210 }, { "epoch": 4.549263430483961, "grad_norm": 0.00034329271875321865, "learning_rate": 4.91080562493561e-08, "loss": 0.0, "num_input_tokens_seen": 125458664, "step": 186215 }, { "epoch": 4.549385581315809, "grad_norm": 0.000254724029218778, "learning_rate": 4.908166432526106e-08, "loss": 0.0, "num_input_tokens_seen": 125461672, "step": 186220 }, { "epoch": 4.549507732147656, "grad_norm": 0.0013476322637870908, "learning_rate": 4.905527931649989e-08, "loss": 0.0, "num_input_tokens_seen": 125465256, "step": 186225 }, { "epoch": 4.549629882979503, "grad_norm": 7.785054913256317e-05, "learning_rate": 4.902890122326442e-08, "loss": 0.0, "num_input_tokens_seen": 125469224, "step": 186230 }, { "epoch": 4.54975203381135, "grad_norm": 0.00030382751720026135, "learning_rate": 4.900253004574673e-08, "loss": 0.0, "num_input_tokens_seen": 125472808, "step": 186235 }, { "epoch": 4.549874184643198, "grad_norm": 9.079613846552093e-06, "learning_rate": 4.8976165784138327e-08, "loss": 0.0, "num_input_tokens_seen": 125475944, "step": 186240 }, { "epoch": 4.549996335475044, "grad_norm": 0.00020586224854923785, "learning_rate": 4.894980843863106e-08, "loss": 0.0, "num_input_tokens_seen": 125479336, "step": 186245 }, { "epoch": 4.550118486306892, "grad_norm": 0.00014247857325244695, "learning_rate": 4.892345800941655e-08, "loss": 0.0, "num_input_tokens_seen": 125482536, "step": 186250 }, { "epoch": 4.550240637138739, "grad_norm": 0.00018053391249850392, "learning_rate": 4.889711449668654e-08, "loss": 0.0, "num_input_tokens_seen": 125485608, "step": 186255 }, { "epoch": 4.550362787970586, "grad_norm": 0.0022484755609184504, "learning_rate": 4.8870777900632543e-08, "loss": 0.0, "num_input_tokens_seen": 125488808, "step": 186260 }, { "epoch": 4.550484938802433, "grad_norm": 0.0017215277766808867, "learning_rate": 4.884444822144595e-08, "loss": 0.0, "num_input_tokens_seen": 125491880, "step": 186265 }, { "epoch": 4.550607089634281, "grad_norm": 0.000490458682179451, "learning_rate": 4.88181254593184e-08, "loss": 0.0, "num_input_tokens_seen": 125494696, "step": 186270 }, { "epoch": 4.5507292404661275, "grad_norm": 7.075289613567293e-05, "learning_rate": 4.8791809614441405e-08, "loss": 0.0001, "num_input_tokens_seen": 125498600, "step": 186275 }, { "epoch": 4.550851391297975, "grad_norm": 7.022430509096012e-05, "learning_rate": 4.8765500687006024e-08, "loss": 0.0, "num_input_tokens_seen": 125501800, "step": 186280 }, { "epoch": 4.550973542129822, "grad_norm": 2.5926497983164154e-05, "learning_rate": 4.873919867720389e-08, "loss": 0.0, "num_input_tokens_seen": 125505576, "step": 186285 }, { "epoch": 4.551095692961669, "grad_norm": 0.0003630456340033561, "learning_rate": 4.871290358522606e-08, "loss": 0.0, "num_input_tokens_seen": 125509480, "step": 186290 }, { "epoch": 4.551217843793516, "grad_norm": 2.0731151380459778e-05, "learning_rate": 4.868661541126407e-08, "loss": 0.0, "num_input_tokens_seen": 125512808, "step": 186295 }, { "epoch": 4.551339994625364, "grad_norm": 0.004396742209792137, "learning_rate": 4.866033415550863e-08, "loss": 0.0, "num_input_tokens_seen": 125515944, "step": 186300 }, { "epoch": 4.551462145457211, "grad_norm": 0.004247918259352446, "learning_rate": 4.863405981815116e-08, "loss": 0.0, "num_input_tokens_seen": 125519848, "step": 186305 }, { "epoch": 4.551584296289057, "grad_norm": 0.008792792446911335, "learning_rate": 4.860779239938284e-08, "loss": 0.0, "num_input_tokens_seen": 125523432, "step": 186310 }, { "epoch": 4.551706447120905, "grad_norm": 0.0026869927532970905, "learning_rate": 4.8581531899394404e-08, "loss": 0.0, "num_input_tokens_seen": 125527336, "step": 186315 }, { "epoch": 4.551828597952752, "grad_norm": 2.581719309091568e-05, "learning_rate": 4.8555278318377136e-08, "loss": 0.0, "num_input_tokens_seen": 125531048, "step": 186320 }, { "epoch": 4.551950748784599, "grad_norm": 0.00021105869382154197, "learning_rate": 4.852903165652167e-08, "loss": 0.0, "num_input_tokens_seen": 125534440, "step": 186325 }, { "epoch": 4.552072899616446, "grad_norm": 0.0005000850069336593, "learning_rate": 4.850279191401896e-08, "loss": 0.0, "num_input_tokens_seen": 125538024, "step": 186330 }, { "epoch": 4.552195050448294, "grad_norm": 0.008285541087388992, "learning_rate": 4.8476559091059966e-08, "loss": 0.0, "num_input_tokens_seen": 125541352, "step": 186335 }, { "epoch": 4.5523172012801405, "grad_norm": 2.6498841180000454e-05, "learning_rate": 4.845033318783531e-08, "loss": 0.0, "num_input_tokens_seen": 125545000, "step": 186340 }, { "epoch": 4.552439352111988, "grad_norm": 0.00018164741049986333, "learning_rate": 4.8424114204535846e-08, "loss": 0.0, "num_input_tokens_seen": 125548136, "step": 186345 }, { "epoch": 4.552561502943835, "grad_norm": 1.322677871939959e-05, "learning_rate": 4.83979021413522e-08, "loss": 0.0, "num_input_tokens_seen": 125551528, "step": 186350 }, { "epoch": 4.5526836537756825, "grad_norm": 0.0001355686254100874, "learning_rate": 4.837169699847476e-08, "loss": 0.0, "num_input_tokens_seen": 125554600, "step": 186355 }, { "epoch": 4.552805804607529, "grad_norm": 0.00036094902316108346, "learning_rate": 4.834549877609451e-08, "loss": 0.0, "num_input_tokens_seen": 125558056, "step": 186360 }, { "epoch": 4.552927955439377, "grad_norm": 5.311130007612519e-05, "learning_rate": 4.831930747440161e-08, "loss": 0.0, "num_input_tokens_seen": 125561320, "step": 186365 }, { "epoch": 4.553050106271224, "grad_norm": 0.07028447091579437, "learning_rate": 4.8293123093586795e-08, "loss": 0.0, "num_input_tokens_seen": 125564840, "step": 186370 }, { "epoch": 4.553172257103071, "grad_norm": 0.00010695837409002706, "learning_rate": 4.8266945633840264e-08, "loss": 0.0, "num_input_tokens_seen": 125568040, "step": 186375 }, { "epoch": 4.553294407934918, "grad_norm": 1.5230105418595485e-05, "learning_rate": 4.8240775095352517e-08, "loss": 0.0, "num_input_tokens_seen": 125570984, "step": 186380 }, { "epoch": 4.553416558766765, "grad_norm": 0.00041354206041432917, "learning_rate": 4.821461147831385e-08, "loss": 0.0, "num_input_tokens_seen": 125574440, "step": 186385 }, { "epoch": 4.553538709598612, "grad_norm": 5.543339284486137e-05, "learning_rate": 4.818845478291456e-08, "loss": 0.0, "num_input_tokens_seen": 125578152, "step": 186390 }, { "epoch": 4.553660860430459, "grad_norm": 0.00212540989741683, "learning_rate": 4.8162305009344705e-08, "loss": 0.0, "num_input_tokens_seen": 125581160, "step": 186395 }, { "epoch": 4.553783011262307, "grad_norm": 0.00012192504800623283, "learning_rate": 4.81361621577947e-08, "loss": 0.0489, "num_input_tokens_seen": 125584936, "step": 186400 }, { "epoch": 4.5539051620941535, "grad_norm": 0.00018608868413139135, "learning_rate": 4.81100262284545e-08, "loss": 0.0, "num_input_tokens_seen": 125588520, "step": 186405 }, { "epoch": 4.554027312926001, "grad_norm": 0.000508465978782624, "learning_rate": 4.808389722151418e-08, "loss": 0.0, "num_input_tokens_seen": 125592296, "step": 186410 }, { "epoch": 4.554149463757848, "grad_norm": 8.924116991693154e-05, "learning_rate": 4.8057775137163913e-08, "loss": 0.0, "num_input_tokens_seen": 125595624, "step": 186415 }, { "epoch": 4.5542716145896955, "grad_norm": 2.7994163247058168e-05, "learning_rate": 4.803165997559344e-08, "loss": 0.0, "num_input_tokens_seen": 125598952, "step": 186420 }, { "epoch": 4.554393765421542, "grad_norm": 0.00013703010336030275, "learning_rate": 4.800555173699283e-08, "loss": 0.0, "num_input_tokens_seen": 125601896, "step": 186425 }, { "epoch": 4.55451591625339, "grad_norm": 0.0009500607848167419, "learning_rate": 4.797945042155194e-08, "loss": 0.0, "num_input_tokens_seen": 125605224, "step": 186430 }, { "epoch": 4.554638067085237, "grad_norm": 0.013281804509460926, "learning_rate": 4.795335602946049e-08, "loss": 0.0, "num_input_tokens_seen": 125608552, "step": 186435 }, { "epoch": 4.554760217917084, "grad_norm": 0.002333866897970438, "learning_rate": 4.7927268560908343e-08, "loss": 0.0, "num_input_tokens_seen": 125612072, "step": 186440 }, { "epoch": 4.554882368748931, "grad_norm": 9.072609827853739e-05, "learning_rate": 4.7901188016085116e-08, "loss": 0.0, "num_input_tokens_seen": 125615464, "step": 186445 }, { "epoch": 4.555004519580779, "grad_norm": 0.00028325567836873233, "learning_rate": 4.787511439518066e-08, "loss": 0.0, "num_input_tokens_seen": 125618792, "step": 186450 }, { "epoch": 4.555126670412625, "grad_norm": 7.057032053126022e-05, "learning_rate": 4.784904769838427e-08, "loss": 0.0, "num_input_tokens_seen": 125622248, "step": 186455 }, { "epoch": 4.555248821244473, "grad_norm": 4.032731521874666e-05, "learning_rate": 4.782298792588591e-08, "loss": 0.0686, "num_input_tokens_seen": 125625896, "step": 186460 }, { "epoch": 4.55537097207632, "grad_norm": 0.0002833160397130996, "learning_rate": 4.7796935077874856e-08, "loss": 0.0, "num_input_tokens_seen": 125629288, "step": 186465 }, { "epoch": 4.555493122908167, "grad_norm": 0.0002632912655826658, "learning_rate": 4.7770889154540525e-08, "loss": 0.0, "num_input_tokens_seen": 125632488, "step": 186470 }, { "epoch": 4.555615273740014, "grad_norm": 0.0001713380916044116, "learning_rate": 4.774485015607244e-08, "loss": 0.0, "num_input_tokens_seen": 125635624, "step": 186475 }, { "epoch": 4.555737424571861, "grad_norm": 2.570607466623187e-05, "learning_rate": 4.7718818082659874e-08, "loss": 0.0, "num_input_tokens_seen": 125638696, "step": 186480 }, { "epoch": 4.555859575403709, "grad_norm": 0.0002236905274912715, "learning_rate": 4.769279293449213e-08, "loss": 0.0, "num_input_tokens_seen": 125642536, "step": 186485 }, { "epoch": 4.555981726235555, "grad_norm": 0.00016828883963171393, "learning_rate": 4.766677471175873e-08, "loss": 0.0, "num_input_tokens_seen": 125645608, "step": 186490 }, { "epoch": 4.556103877067403, "grad_norm": 0.00016017680172808468, "learning_rate": 4.7640763414648624e-08, "loss": 0.0, "num_input_tokens_seen": 125649000, "step": 186495 }, { "epoch": 4.55622602789925, "grad_norm": 0.0006411911454051733, "learning_rate": 4.761475904335099e-08, "loss": 0.0, "num_input_tokens_seen": 125652392, "step": 186500 }, { "epoch": 4.556348178731097, "grad_norm": 0.0003673804458230734, "learning_rate": 4.758876159805503e-08, "loss": 0.0, "num_input_tokens_seen": 125655720, "step": 186505 }, { "epoch": 4.556470329562944, "grad_norm": 0.00026387552497908473, "learning_rate": 4.7562771078949794e-08, "loss": 0.0, "num_input_tokens_seen": 125659048, "step": 186510 }, { "epoch": 4.556592480394792, "grad_norm": 0.052026305347681046, "learning_rate": 4.753678748622414e-08, "loss": 0.0, "num_input_tokens_seen": 125662824, "step": 186515 }, { "epoch": 4.5567146312266384, "grad_norm": 0.0003993824648205191, "learning_rate": 4.751081082006714e-08, "loss": 0.0, "num_input_tokens_seen": 125665832, "step": 186520 }, { "epoch": 4.556836782058486, "grad_norm": 0.00013458718603942543, "learning_rate": 4.748484108066786e-08, "loss": 0.0, "num_input_tokens_seen": 125669416, "step": 186525 }, { "epoch": 4.556958932890333, "grad_norm": 0.0002635191776789725, "learning_rate": 4.745887826821493e-08, "loss": 0.0, "num_input_tokens_seen": 125672424, "step": 186530 }, { "epoch": 4.5570810837221805, "grad_norm": 5.497500751516782e-05, "learning_rate": 4.743292238289731e-08, "loss": 0.0305, "num_input_tokens_seen": 125675688, "step": 186535 }, { "epoch": 4.557203234554027, "grad_norm": 0.0003593292785808444, "learning_rate": 4.7406973424903626e-08, "loss": 0.0, "num_input_tokens_seen": 125678696, "step": 186540 }, { "epoch": 4.557325385385875, "grad_norm": 0.010443294420838356, "learning_rate": 4.738103139442273e-08, "loss": 0.0, "num_input_tokens_seen": 125681768, "step": 186545 }, { "epoch": 4.557447536217722, "grad_norm": 0.0002558843407314271, "learning_rate": 4.7355096291643026e-08, "loss": 0.0001, "num_input_tokens_seen": 125685608, "step": 186550 }, { "epoch": 4.557569687049568, "grad_norm": 0.0004936489858664572, "learning_rate": 4.7329168116753473e-08, "loss": 0.0, "num_input_tokens_seen": 125689000, "step": 186555 }, { "epoch": 4.557691837881416, "grad_norm": 0.003704722970724106, "learning_rate": 4.7303246869942246e-08, "loss": 0.0, "num_input_tokens_seen": 125692008, "step": 186560 }, { "epoch": 4.557813988713264, "grad_norm": 0.00018676824402064085, "learning_rate": 4.727733255139832e-08, "loss": 0.0001, "num_input_tokens_seen": 125695464, "step": 186565 }, { "epoch": 4.55793613954511, "grad_norm": 0.00029991735937073827, "learning_rate": 4.725142516130975e-08, "loss": 0.0, "num_input_tokens_seen": 125698728, "step": 186570 }, { "epoch": 4.558058290376957, "grad_norm": 0.0004272170190233737, "learning_rate": 4.722552469986507e-08, "loss": 0.0, "num_input_tokens_seen": 125701992, "step": 186575 }, { "epoch": 4.558180441208805, "grad_norm": 4.4216849346412346e-05, "learning_rate": 4.719963116725256e-08, "loss": 0.0, "num_input_tokens_seen": 125705320, "step": 186580 }, { "epoch": 4.5583025920406515, "grad_norm": 0.0008761108620092273, "learning_rate": 4.717374456366074e-08, "loss": 0.0, "num_input_tokens_seen": 125708520, "step": 186585 }, { "epoch": 4.558424742872499, "grad_norm": 0.0021364863496273756, "learning_rate": 4.714786488927758e-08, "loss": 0.0, "num_input_tokens_seen": 125711912, "step": 186590 }, { "epoch": 4.558546893704346, "grad_norm": 0.001226111315190792, "learning_rate": 4.712199214429158e-08, "loss": 0.0, "num_input_tokens_seen": 125715176, "step": 186595 }, { "epoch": 4.5586690445361935, "grad_norm": 0.0002580843574833125, "learning_rate": 4.709612632889059e-08, "loss": 0.0, "num_input_tokens_seen": 125718248, "step": 186600 }, { "epoch": 4.55879119536804, "grad_norm": 0.0003518314624670893, "learning_rate": 4.7070267443263035e-08, "loss": 0.0, "num_input_tokens_seen": 125722088, "step": 186605 }, { "epoch": 4.558913346199888, "grad_norm": 0.018274880945682526, "learning_rate": 4.7044415487596744e-08, "loss": 0.0, "num_input_tokens_seen": 125725544, "step": 186610 }, { "epoch": 4.559035497031735, "grad_norm": 1.0470458619238343e-05, "learning_rate": 4.701857046207969e-08, "loss": 0.0, "num_input_tokens_seen": 125728872, "step": 186615 }, { "epoch": 4.559157647863582, "grad_norm": 0.00013799651060253382, "learning_rate": 4.699273236690005e-08, "loss": 0.0, "num_input_tokens_seen": 125732136, "step": 186620 }, { "epoch": 4.559279798695429, "grad_norm": 1.64757548191119e-05, "learning_rate": 4.6966901202245446e-08, "loss": 0.0, "num_input_tokens_seen": 125735400, "step": 186625 }, { "epoch": 4.559401949527277, "grad_norm": 0.00010223350545857102, "learning_rate": 4.694107696830407e-08, "loss": 0.0, "num_input_tokens_seen": 125739048, "step": 186630 }, { "epoch": 4.559524100359123, "grad_norm": 0.000656140735372901, "learning_rate": 4.691525966526333e-08, "loss": 0.0, "num_input_tokens_seen": 125742120, "step": 186635 }, { "epoch": 4.559646251190971, "grad_norm": 0.0003749874304048717, "learning_rate": 4.6889449293311176e-08, "loss": 0.0, "num_input_tokens_seen": 125745704, "step": 186640 }, { "epoch": 4.559768402022818, "grad_norm": 0.0002589155628811568, "learning_rate": 4.686364585263547e-08, "loss": 0.0004, "num_input_tokens_seen": 125748968, "step": 186645 }, { "epoch": 4.5598905528546645, "grad_norm": 0.0012152070412412286, "learning_rate": 4.6837849343423494e-08, "loss": 0.0, "num_input_tokens_seen": 125751976, "step": 186650 }, { "epoch": 4.560012703686512, "grad_norm": 0.01056479662656784, "learning_rate": 4.681205976586322e-08, "loss": 0.0, "num_input_tokens_seen": 125754920, "step": 186655 }, { "epoch": 4.56013485451836, "grad_norm": 0.00018264618120156229, "learning_rate": 4.6786277120142047e-08, "loss": 0.0, "num_input_tokens_seen": 125758504, "step": 186660 }, { "epoch": 4.5602570053502065, "grad_norm": 0.02871021069586277, "learning_rate": 4.676050140644727e-08, "loss": 0.0, "num_input_tokens_seen": 125761768, "step": 186665 }, { "epoch": 4.560379156182053, "grad_norm": 3.788889080169611e-05, "learning_rate": 4.673473262496663e-08, "loss": 0.0, "num_input_tokens_seen": 125765288, "step": 186670 }, { "epoch": 4.560501307013901, "grad_norm": 0.04666898027062416, "learning_rate": 4.670897077588731e-08, "loss": 0.0, "num_input_tokens_seen": 125769832, "step": 186675 }, { "epoch": 4.560623457845748, "grad_norm": 0.0006879869033582509, "learning_rate": 4.668321585939694e-08, "loss": 0.0, "num_input_tokens_seen": 125773032, "step": 186680 }, { "epoch": 4.560745608677595, "grad_norm": 0.0001125606067944318, "learning_rate": 4.665746787568248e-08, "loss": 0.0, "num_input_tokens_seen": 125776360, "step": 186685 }, { "epoch": 4.560867759509442, "grad_norm": 0.02449299953877926, "learning_rate": 4.663172682493144e-08, "loss": 0.0, "num_input_tokens_seen": 125779944, "step": 186690 }, { "epoch": 4.56098991034129, "grad_norm": 0.00034439985756762326, "learning_rate": 4.660599270733079e-08, "loss": 0.0, "num_input_tokens_seen": 125783464, "step": 186695 }, { "epoch": 4.561112061173136, "grad_norm": 7.401494804071262e-05, "learning_rate": 4.658026552306793e-08, "loss": 0.0, "num_input_tokens_seen": 125786792, "step": 186700 }, { "epoch": 4.561234212004984, "grad_norm": 0.00045592195237986743, "learning_rate": 4.6554545272329715e-08, "loss": 0.0, "num_input_tokens_seen": 125790056, "step": 186705 }, { "epoch": 4.561356362836831, "grad_norm": 0.0002018619270529598, "learning_rate": 4.6528831955303215e-08, "loss": 0.0, "num_input_tokens_seen": 125793320, "step": 186710 }, { "epoch": 4.561478513668678, "grad_norm": 0.0051199872978031635, "learning_rate": 4.6503125572175725e-08, "loss": 0.0, "num_input_tokens_seen": 125796648, "step": 186715 }, { "epoch": 4.561600664500525, "grad_norm": 0.003007345600053668, "learning_rate": 4.6477426123133765e-08, "loss": 0.0, "num_input_tokens_seen": 125799912, "step": 186720 }, { "epoch": 4.561722815332373, "grad_norm": 0.00036833074409514666, "learning_rate": 4.645173360836463e-08, "loss": 0.0, "num_input_tokens_seen": 125802920, "step": 186725 }, { "epoch": 4.5618449661642195, "grad_norm": 0.0001462739601265639, "learning_rate": 4.642604802805472e-08, "loss": 0.0, "num_input_tokens_seen": 125805928, "step": 186730 }, { "epoch": 4.561967116996067, "grad_norm": 0.0020424015820026398, "learning_rate": 4.640036938239111e-08, "loss": 0.0, "num_input_tokens_seen": 125808936, "step": 186735 }, { "epoch": 4.562089267827914, "grad_norm": 0.0025019191671162844, "learning_rate": 4.637469767156066e-08, "loss": 0.0, "num_input_tokens_seen": 125812264, "step": 186740 }, { "epoch": 4.562211418659761, "grad_norm": 0.0013807759387418628, "learning_rate": 4.634903289574976e-08, "loss": 0.0, "num_input_tokens_seen": 125816424, "step": 186745 }, { "epoch": 4.562333569491608, "grad_norm": 0.0036531754303723574, "learning_rate": 4.6323375055145386e-08, "loss": 0.0, "num_input_tokens_seen": 125819880, "step": 186750 }, { "epoch": 4.562455720323455, "grad_norm": 0.00678299693390727, "learning_rate": 4.629772414993371e-08, "loss": 0.0, "num_input_tokens_seen": 125823336, "step": 186755 }, { "epoch": 4.562577871155303, "grad_norm": 0.0002474345965310931, "learning_rate": 4.627208018030171e-08, "loss": 0.0, "num_input_tokens_seen": 125826536, "step": 186760 }, { "epoch": 4.562700021987149, "grad_norm": 0.0001734690449666232, "learning_rate": 4.6246443146435554e-08, "loss": 0.0, "num_input_tokens_seen": 125829352, "step": 186765 }, { "epoch": 4.562822172818997, "grad_norm": 0.002393064321950078, "learning_rate": 4.622081304852177e-08, "loss": 0.0, "num_input_tokens_seen": 125832872, "step": 186770 }, { "epoch": 4.562944323650844, "grad_norm": 0.0003072379913646728, "learning_rate": 4.619518988674686e-08, "loss": 0.0, "num_input_tokens_seen": 125836456, "step": 186775 }, { "epoch": 4.563066474482691, "grad_norm": 0.00010231092164758593, "learning_rate": 4.6169573661297034e-08, "loss": 0.0, "num_input_tokens_seen": 125839464, "step": 186780 }, { "epoch": 4.563188625314538, "grad_norm": 0.01051098108291626, "learning_rate": 4.6143964372358676e-08, "loss": 0.0, "num_input_tokens_seen": 125842920, "step": 186785 }, { "epoch": 4.563310776146386, "grad_norm": 6.985938671277836e-05, "learning_rate": 4.611836202011776e-08, "loss": 0.0, "num_input_tokens_seen": 125846056, "step": 186790 }, { "epoch": 4.5634329269782326, "grad_norm": 0.001552744535729289, "learning_rate": 4.609276660476069e-08, "loss": 0.0001, "num_input_tokens_seen": 125849512, "step": 186795 }, { "epoch": 4.56355507781008, "grad_norm": 0.0006545864744111896, "learning_rate": 4.606717812647387e-08, "loss": 0.0, "num_input_tokens_seen": 125853224, "step": 186800 }, { "epoch": 4.563677228641927, "grad_norm": 0.00017454169574193656, "learning_rate": 4.604159658544282e-08, "loss": 0.0, "num_input_tokens_seen": 125856424, "step": 186805 }, { "epoch": 4.563799379473775, "grad_norm": 0.0006190399872139096, "learning_rate": 4.601602198185406e-08, "loss": 0.0, "num_input_tokens_seen": 125859816, "step": 186810 }, { "epoch": 4.563921530305621, "grad_norm": 0.0008284965879283845, "learning_rate": 4.599045431589321e-08, "loss": 0.0, "num_input_tokens_seen": 125863656, "step": 186815 }, { "epoch": 4.564043681137468, "grad_norm": 0.0007577021024189889, "learning_rate": 4.596489358774658e-08, "loss": 0.0, "num_input_tokens_seen": 125866984, "step": 186820 }, { "epoch": 4.564165831969316, "grad_norm": 9.084967314265668e-05, "learning_rate": 4.593933979759967e-08, "loss": 0.0, "num_input_tokens_seen": 125870376, "step": 186825 }, { "epoch": 4.564287982801163, "grad_norm": 0.0006819769041612744, "learning_rate": 4.5913792945638465e-08, "loss": 0.0, "num_input_tokens_seen": 125873832, "step": 186830 }, { "epoch": 4.56441013363301, "grad_norm": 0.0003621177456807345, "learning_rate": 4.5888253032048906e-08, "loss": 0.0, "num_input_tokens_seen": 125877352, "step": 186835 }, { "epoch": 4.564532284464857, "grad_norm": 0.0003841822035610676, "learning_rate": 4.586272005701652e-08, "loss": 0.0, "num_input_tokens_seen": 125880616, "step": 186840 }, { "epoch": 4.5646544352967044, "grad_norm": 6.918048165971413e-05, "learning_rate": 4.5837194020727165e-08, "loss": 0.0, "num_input_tokens_seen": 125884456, "step": 186845 }, { "epoch": 4.564776586128551, "grad_norm": 0.00011394621833460405, "learning_rate": 4.581167492336624e-08, "loss": 0.0, "num_input_tokens_seen": 125888104, "step": 186850 }, { "epoch": 4.564898736960399, "grad_norm": 0.00013341843441594392, "learning_rate": 4.5786162765119596e-08, "loss": 0.0, "num_input_tokens_seen": 125891816, "step": 186855 }, { "epoch": 4.565020887792246, "grad_norm": 0.006895654369145632, "learning_rate": 4.576065754617253e-08, "loss": 0.0, "num_input_tokens_seen": 125894760, "step": 186860 }, { "epoch": 4.565143038624093, "grad_norm": 0.0022672039922326803, "learning_rate": 4.573515926671079e-08, "loss": 0.0, "num_input_tokens_seen": 125897768, "step": 186865 }, { "epoch": 4.56526518945594, "grad_norm": 4.803660704055801e-05, "learning_rate": 4.570966792691944e-08, "loss": 0.0, "num_input_tokens_seen": 125900840, "step": 186870 }, { "epoch": 4.565387340287788, "grad_norm": 0.00014346172974910587, "learning_rate": 4.568418352698411e-08, "loss": 0.0, "num_input_tokens_seen": 125904040, "step": 186875 }, { "epoch": 4.565509491119634, "grad_norm": 0.0002021217514993623, "learning_rate": 4.5658706067090215e-08, "loss": 0.0, "num_input_tokens_seen": 125907752, "step": 186880 }, { "epoch": 4.565631641951482, "grad_norm": 0.0008756064344197512, "learning_rate": 4.563323554742271e-08, "loss": 0.0, "num_input_tokens_seen": 125911464, "step": 186885 }, { "epoch": 4.565753792783329, "grad_norm": 0.000680445518810302, "learning_rate": 4.560777196816701e-08, "loss": 0.0, "num_input_tokens_seen": 125914920, "step": 186890 }, { "epoch": 4.565875943615176, "grad_norm": 0.006386886816471815, "learning_rate": 4.5582315329508405e-08, "loss": 0.0, "num_input_tokens_seen": 125918376, "step": 186895 }, { "epoch": 4.565998094447023, "grad_norm": 0.002658127574250102, "learning_rate": 4.5556865631631856e-08, "loss": 0.0001, "num_input_tokens_seen": 125922088, "step": 186900 }, { "epoch": 4.566120245278871, "grad_norm": 0.00010610414756229147, "learning_rate": 4.5531422874722555e-08, "loss": 0.0, "num_input_tokens_seen": 125925224, "step": 186905 }, { "epoch": 4.5662423961107175, "grad_norm": 0.00031076581217348576, "learning_rate": 4.5505987058965355e-08, "loss": 0.0, "num_input_tokens_seen": 125928360, "step": 186910 }, { "epoch": 4.566364546942564, "grad_norm": 0.00034491971018724144, "learning_rate": 4.548055818454544e-08, "loss": 0.0, "num_input_tokens_seen": 125931304, "step": 186915 }, { "epoch": 4.566486697774412, "grad_norm": 0.0020436709746718407, "learning_rate": 4.545513625164754e-08, "loss": 0.0, "num_input_tokens_seen": 125934504, "step": 186920 }, { "epoch": 4.5666088486062595, "grad_norm": 0.0015757272485643625, "learning_rate": 4.5429721260456633e-08, "loss": 0.0246, "num_input_tokens_seen": 125938280, "step": 186925 }, { "epoch": 4.566730999438106, "grad_norm": 4.1816121665760875e-05, "learning_rate": 4.5404313211157675e-08, "loss": 0.0, "num_input_tokens_seen": 125941416, "step": 186930 }, { "epoch": 4.566853150269953, "grad_norm": 0.002388280350714922, "learning_rate": 4.537891210393519e-08, "loss": 0.0, "num_input_tokens_seen": 125945576, "step": 186935 }, { "epoch": 4.566975301101801, "grad_norm": 1.306969079450937e-05, "learning_rate": 4.535351793897413e-08, "loss": 0.0, "num_input_tokens_seen": 125948648, "step": 186940 }, { "epoch": 4.567097451933647, "grad_norm": 4.432052810443565e-05, "learning_rate": 4.532813071645891e-08, "loss": 0.0, "num_input_tokens_seen": 125952040, "step": 186945 }, { "epoch": 4.567219602765495, "grad_norm": 8.9752342319116e-05, "learning_rate": 4.53027504365745e-08, "loss": 0.0, "num_input_tokens_seen": 125955432, "step": 186950 }, { "epoch": 4.567341753597342, "grad_norm": 0.003290035994723439, "learning_rate": 4.5277377099505076e-08, "loss": 0.0, "num_input_tokens_seen": 125958824, "step": 186955 }, { "epoch": 4.567463904429189, "grad_norm": 3.403526352485642e-05, "learning_rate": 4.5252010705435386e-08, "loss": 0.0, "num_input_tokens_seen": 125961896, "step": 186960 }, { "epoch": 4.567586055261036, "grad_norm": 0.00044939748477190733, "learning_rate": 4.522665125454994e-08, "loss": 0.0, "num_input_tokens_seen": 125965288, "step": 186965 }, { "epoch": 4.567708206092884, "grad_norm": 0.005186882801353931, "learning_rate": 4.5201298747033155e-08, "loss": 0.0, "num_input_tokens_seen": 125968680, "step": 186970 }, { "epoch": 4.5678303569247305, "grad_norm": 0.00021564609778579324, "learning_rate": 4.517595318306911e-08, "loss": 0.0, "num_input_tokens_seen": 125971816, "step": 186975 }, { "epoch": 4.567952507756578, "grad_norm": 0.0006758919917047024, "learning_rate": 4.5150614562842635e-08, "loss": 0.0, "num_input_tokens_seen": 125975144, "step": 186980 }, { "epoch": 4.568074658588425, "grad_norm": 0.00018626233213581145, "learning_rate": 4.51252828865375e-08, "loss": 0.0, "num_input_tokens_seen": 125978408, "step": 186985 }, { "epoch": 4.5681968094202725, "grad_norm": 5.7257089792983606e-05, "learning_rate": 4.5099958154338204e-08, "loss": 0.0, "num_input_tokens_seen": 125981544, "step": 186990 }, { "epoch": 4.568318960252119, "grad_norm": 0.00012096945283701643, "learning_rate": 4.507464036642883e-08, "loss": 0.0, "num_input_tokens_seen": 125984872, "step": 186995 }, { "epoch": 4.568441111083967, "grad_norm": 0.00010047034447779879, "learning_rate": 4.504932952299356e-08, "loss": 0.0, "num_input_tokens_seen": 125988328, "step": 187000 }, { "epoch": 4.568563261915814, "grad_norm": 0.00630926201120019, "learning_rate": 4.502402562421637e-08, "loss": 0.0, "num_input_tokens_seen": 125991208, "step": 187005 }, { "epoch": 4.56868541274766, "grad_norm": 8.817094203550369e-05, "learning_rate": 4.499872867028143e-08, "loss": 0.0, "num_input_tokens_seen": 125995304, "step": 187010 }, { "epoch": 4.568807563579508, "grad_norm": 0.0006666062981821597, "learning_rate": 4.4973438661372374e-08, "loss": 0.0, "num_input_tokens_seen": 125998504, "step": 187015 }, { "epoch": 4.568929714411355, "grad_norm": 1.3411827239906415e-05, "learning_rate": 4.494815559767351e-08, "loss": 0.0, "num_input_tokens_seen": 126002024, "step": 187020 }, { "epoch": 4.569051865243202, "grad_norm": 0.0007918593473732471, "learning_rate": 4.492287947936857e-08, "loss": 0.0, "num_input_tokens_seen": 126005096, "step": 187025 }, { "epoch": 4.569174016075049, "grad_norm": 0.0014626365154981613, "learning_rate": 4.4897610306641184e-08, "loss": 0.0, "num_input_tokens_seen": 126008232, "step": 187030 }, { "epoch": 4.569296166906897, "grad_norm": 5.475069701788016e-05, "learning_rate": 4.487234807967544e-08, "loss": 0.0, "num_input_tokens_seen": 126012136, "step": 187035 }, { "epoch": 4.5694183177387435, "grad_norm": 0.0029703148175030947, "learning_rate": 4.484709279865473e-08, "loss": 0.0, "num_input_tokens_seen": 126015848, "step": 187040 }, { "epoch": 4.569540468570591, "grad_norm": 0.00021349718736018986, "learning_rate": 4.482184446376291e-08, "loss": 0.0, "num_input_tokens_seen": 126019048, "step": 187045 }, { "epoch": 4.569662619402438, "grad_norm": 0.0002331474534003064, "learning_rate": 4.479660307518363e-08, "loss": 0.0, "num_input_tokens_seen": 126022632, "step": 187050 }, { "epoch": 4.5697847702342855, "grad_norm": 0.0002785135293379426, "learning_rate": 4.477136863310016e-08, "loss": 0.0, "num_input_tokens_seen": 126026216, "step": 187055 }, { "epoch": 4.569906921066132, "grad_norm": 0.0015631432179361582, "learning_rate": 4.474614113769648e-08, "loss": 0.0, "num_input_tokens_seen": 126029352, "step": 187060 }, { "epoch": 4.57002907189798, "grad_norm": 2.8294090952840634e-05, "learning_rate": 4.472092058915567e-08, "loss": 0.0, "num_input_tokens_seen": 126032616, "step": 187065 }, { "epoch": 4.570151222729827, "grad_norm": 0.00030958701972849667, "learning_rate": 4.469570698766134e-08, "loss": 0.0, "num_input_tokens_seen": 126035880, "step": 187070 }, { "epoch": 4.570273373561674, "grad_norm": 0.00970099214464426, "learning_rate": 4.46705003333967e-08, "loss": 0.0, "num_input_tokens_seen": 126039080, "step": 187075 }, { "epoch": 4.570395524393521, "grad_norm": 0.00010253264917992055, "learning_rate": 4.4645300626545146e-08, "loss": 0.0, "num_input_tokens_seen": 126042600, "step": 187080 }, { "epoch": 4.570517675225369, "grad_norm": 0.048178721219301224, "learning_rate": 4.462010786728998e-08, "loss": 0.0, "num_input_tokens_seen": 126045736, "step": 187085 }, { "epoch": 4.570639826057215, "grad_norm": 0.0006714654737152159, "learning_rate": 4.4594922055814275e-08, "loss": 0.0, "num_input_tokens_seen": 126049000, "step": 187090 }, { "epoch": 4.570761976889063, "grad_norm": 0.004424653016030788, "learning_rate": 4.456974319230145e-08, "loss": 0.0, "num_input_tokens_seen": 126053288, "step": 187095 }, { "epoch": 4.57088412772091, "grad_norm": 2.338566628168337e-05, "learning_rate": 4.454457127693412e-08, "loss": 0.0, "num_input_tokens_seen": 126056424, "step": 187100 }, { "epoch": 4.5710062785527565, "grad_norm": 0.012012263759970665, "learning_rate": 4.4519406309895924e-08, "loss": 0.0, "num_input_tokens_seen": 126059944, "step": 187105 }, { "epoch": 4.571128429384604, "grad_norm": 0.5830227136611938, "learning_rate": 4.4494248291369495e-08, "loss": 0.0001, "num_input_tokens_seen": 126063208, "step": 187110 }, { "epoch": 4.571250580216451, "grad_norm": 0.003950295504182577, "learning_rate": 4.44690972215378e-08, "loss": 0.0, "num_input_tokens_seen": 126066088, "step": 187115 }, { "epoch": 4.5713727310482986, "grad_norm": 0.000485446973470971, "learning_rate": 4.444395310058402e-08, "loss": 0.0, "num_input_tokens_seen": 126069288, "step": 187120 }, { "epoch": 4.571494881880145, "grad_norm": 6.134338764240965e-05, "learning_rate": 4.441881592869068e-08, "loss": 0.0, "num_input_tokens_seen": 126072680, "step": 187125 }, { "epoch": 4.571617032711993, "grad_norm": 0.0011996165849268436, "learning_rate": 4.439368570604085e-08, "loss": 0.0, "num_input_tokens_seen": 126076008, "step": 187130 }, { "epoch": 4.57173918354384, "grad_norm": 0.00873605441302061, "learning_rate": 4.436856243281706e-08, "loss": 0.0, "num_input_tokens_seen": 126079400, "step": 187135 }, { "epoch": 4.571861334375687, "grad_norm": 0.000793572049587965, "learning_rate": 4.434344610920204e-08, "loss": 0.0, "num_input_tokens_seen": 126082792, "step": 187140 }, { "epoch": 4.571983485207534, "grad_norm": 2.3203620003187098e-05, "learning_rate": 4.431833673537877e-08, "loss": 0.0, "num_input_tokens_seen": 126086184, "step": 187145 }, { "epoch": 4.572105636039382, "grad_norm": 0.0007438582251779735, "learning_rate": 4.4293234311529315e-08, "loss": 0.0, "num_input_tokens_seen": 126089576, "step": 187150 }, { "epoch": 4.572227786871228, "grad_norm": 8.248721314885188e-06, "learning_rate": 4.426813883783676e-08, "loss": 0.0, "num_input_tokens_seen": 126092776, "step": 187155 }, { "epoch": 4.572349937703076, "grad_norm": 0.0004233851213939488, "learning_rate": 4.424305031448328e-08, "loss": 0.0, "num_input_tokens_seen": 126096360, "step": 187160 }, { "epoch": 4.572472088534923, "grad_norm": 0.0021329792216420174, "learning_rate": 4.4217968741651403e-08, "loss": 0.0, "num_input_tokens_seen": 126099752, "step": 187165 }, { "epoch": 4.5725942393667705, "grad_norm": 0.0002402208192506805, "learning_rate": 4.419289411952354e-08, "loss": 0.0, "num_input_tokens_seen": 126102952, "step": 187170 }, { "epoch": 4.572716390198617, "grad_norm": 0.0009508799994364381, "learning_rate": 4.4167826448282095e-08, "loss": 0.0, "num_input_tokens_seen": 126106536, "step": 187175 }, { "epoch": 4.572838541030464, "grad_norm": 0.00020646640041377395, "learning_rate": 4.414276572810915e-08, "loss": 0.0, "num_input_tokens_seen": 126110376, "step": 187180 }, { "epoch": 4.572960691862312, "grad_norm": 0.0008768560364842415, "learning_rate": 4.411771195918723e-08, "loss": 0.0, "num_input_tokens_seen": 126113896, "step": 187185 }, { "epoch": 4.573082842694159, "grad_norm": 3.4873282857006416e-05, "learning_rate": 4.409266514169841e-08, "loss": 0.0, "num_input_tokens_seen": 126117224, "step": 187190 }, { "epoch": 4.573204993526006, "grad_norm": 0.0007052597356960177, "learning_rate": 4.406762527582475e-08, "loss": 0.0017, "num_input_tokens_seen": 126120552, "step": 187195 }, { "epoch": 4.573327144357853, "grad_norm": 0.0013276163954287767, "learning_rate": 4.404259236174846e-08, "loss": 0.0, "num_input_tokens_seen": 126124136, "step": 187200 }, { "epoch": 4.5734492951897, "grad_norm": 0.0001082074231817387, "learning_rate": 4.4017566399651596e-08, "loss": 0.0, "num_input_tokens_seen": 126127464, "step": 187205 }, { "epoch": 4.573571446021547, "grad_norm": 1.8606428056955338e-05, "learning_rate": 4.399254738971603e-08, "loss": 0.0, "num_input_tokens_seen": 126130728, "step": 187210 }, { "epoch": 4.573693596853395, "grad_norm": 2.800193578877952e-05, "learning_rate": 4.396753533212394e-08, "loss": 0.0, "num_input_tokens_seen": 126134376, "step": 187215 }, { "epoch": 4.5738157476852415, "grad_norm": 0.0005040558171458542, "learning_rate": 4.394253022705696e-08, "loss": 0.0, "num_input_tokens_seen": 126138152, "step": 187220 }, { "epoch": 4.573937898517089, "grad_norm": 0.0006679038051515818, "learning_rate": 4.3917532074697175e-08, "loss": 0.0, "num_input_tokens_seen": 126141224, "step": 187225 }, { "epoch": 4.574060049348936, "grad_norm": 3.217871199012734e-05, "learning_rate": 4.389254087522609e-08, "loss": 0.0, "num_input_tokens_seen": 126144552, "step": 187230 }, { "epoch": 4.5741822001807835, "grad_norm": 0.00022618411458097398, "learning_rate": 4.386755662882558e-08, "loss": 0.0, "num_input_tokens_seen": 126147880, "step": 187235 }, { "epoch": 4.57430435101263, "grad_norm": 3.0717765184817836e-05, "learning_rate": 4.384257933567759e-08, "loss": 0.0, "num_input_tokens_seen": 126151016, "step": 187240 }, { "epoch": 4.574426501844478, "grad_norm": 0.0004580276436172426, "learning_rate": 4.381760899596332e-08, "loss": 0.0, "num_input_tokens_seen": 126154984, "step": 187245 }, { "epoch": 4.574548652676325, "grad_norm": 5.467039954965003e-05, "learning_rate": 4.379264560986473e-08, "loss": 0.0, "num_input_tokens_seen": 126158120, "step": 187250 }, { "epoch": 4.574670803508172, "grad_norm": 0.0004239397821947932, "learning_rate": 4.376768917756313e-08, "loss": 0.0, "num_input_tokens_seen": 126161320, "step": 187255 }, { "epoch": 4.574792954340019, "grad_norm": 0.029181107878684998, "learning_rate": 4.374273969924014e-08, "loss": 0.0, "num_input_tokens_seen": 126164584, "step": 187260 }, { "epoch": 4.574915105171867, "grad_norm": 1.1901041943929158e-05, "learning_rate": 4.3717797175077064e-08, "loss": 0.0, "num_input_tokens_seen": 126167976, "step": 187265 }, { "epoch": 4.575037256003713, "grad_norm": 0.00022315053502097726, "learning_rate": 4.3692861605255424e-08, "loss": 0.0, "num_input_tokens_seen": 126171176, "step": 187270 }, { "epoch": 4.57515940683556, "grad_norm": 0.00018542897305451334, "learning_rate": 4.366793298995664e-08, "loss": 0.0, "num_input_tokens_seen": 126174696, "step": 187275 }, { "epoch": 4.575281557667408, "grad_norm": 0.0003335888613946736, "learning_rate": 4.364301132936177e-08, "loss": 0.0, "num_input_tokens_seen": 126178408, "step": 187280 }, { "epoch": 4.575403708499255, "grad_norm": 4.691140566137619e-05, "learning_rate": 4.3618096623652126e-08, "loss": 0.0001, "num_input_tokens_seen": 126181672, "step": 187285 }, { "epoch": 4.575525859331102, "grad_norm": 0.0005547900800593197, "learning_rate": 4.3593188873009e-08, "loss": 0.0, "num_input_tokens_seen": 126184808, "step": 187290 }, { "epoch": 4.575648010162949, "grad_norm": 0.00023010231961961836, "learning_rate": 4.356828807761326e-08, "loss": 0.0, "num_input_tokens_seen": 126187880, "step": 187295 }, { "epoch": 4.5757701609947965, "grad_norm": 96.00919342041016, "learning_rate": 4.354339423764641e-08, "loss": 0.0667, "num_input_tokens_seen": 126191400, "step": 187300 }, { "epoch": 4.575892311826643, "grad_norm": 2.1266638214001432e-05, "learning_rate": 4.3518507353289103e-08, "loss": 0.0, "num_input_tokens_seen": 126194728, "step": 187305 }, { "epoch": 4.576014462658491, "grad_norm": 0.04412275552749634, "learning_rate": 4.349362742472251e-08, "loss": 0.0, "num_input_tokens_seen": 126198120, "step": 187310 }, { "epoch": 4.576136613490338, "grad_norm": 1.4897872461006045e-05, "learning_rate": 4.34687544521275e-08, "loss": 0.0, "num_input_tokens_seen": 126201192, "step": 187315 }, { "epoch": 4.576258764322185, "grad_norm": 0.0005763565422967076, "learning_rate": 4.344388843568503e-08, "loss": 0.0, "num_input_tokens_seen": 126204008, "step": 187320 }, { "epoch": 4.576380915154032, "grad_norm": 0.0006947465590201318, "learning_rate": 4.3419029375575844e-08, "loss": 0.0, "num_input_tokens_seen": 126207912, "step": 187325 }, { "epoch": 4.57650306598588, "grad_norm": 8.477483788738027e-05, "learning_rate": 4.339417727198069e-08, "loss": 0.0, "num_input_tokens_seen": 126210856, "step": 187330 }, { "epoch": 4.576625216817726, "grad_norm": 3.6006738810101524e-05, "learning_rate": 4.336933212508054e-08, "loss": 0.0, "num_input_tokens_seen": 126214632, "step": 187335 }, { "epoch": 4.576747367649574, "grad_norm": 0.00023908380535431206, "learning_rate": 4.334449393505579e-08, "loss": 0.0001, "num_input_tokens_seen": 126219176, "step": 187340 }, { "epoch": 4.576869518481421, "grad_norm": 0.04327535256743431, "learning_rate": 4.331966270208731e-08, "loss": 0.0, "num_input_tokens_seen": 126222312, "step": 187345 }, { "epoch": 4.576991669313268, "grad_norm": 0.0037969492841511965, "learning_rate": 4.329483842635551e-08, "loss": 0.0, "num_input_tokens_seen": 126226600, "step": 187350 }, { "epoch": 4.577113820145115, "grad_norm": 0.0007374720880761743, "learning_rate": 4.3270021108040786e-08, "loss": 0.0, "num_input_tokens_seen": 126230248, "step": 187355 }, { "epoch": 4.577235970976963, "grad_norm": 5.166658593225293e-05, "learning_rate": 4.324521074732412e-08, "loss": 0.0, "num_input_tokens_seen": 126233704, "step": 187360 }, { "epoch": 4.5773581218088095, "grad_norm": 9.717391367303208e-05, "learning_rate": 4.3220407344385365e-08, "loss": 0.0, "num_input_tokens_seen": 126237096, "step": 187365 }, { "epoch": 4.577480272640656, "grad_norm": 0.0003349487960804254, "learning_rate": 4.3195610899405266e-08, "loss": 0.0, "num_input_tokens_seen": 126240552, "step": 187370 }, { "epoch": 4.577602423472504, "grad_norm": 0.0029941475950181484, "learning_rate": 4.317082141256401e-08, "loss": 0.0, "num_input_tokens_seen": 126243496, "step": 187375 }, { "epoch": 4.577724574304351, "grad_norm": 6.836828106315807e-05, "learning_rate": 4.314603888404189e-08, "loss": 0.0, "num_input_tokens_seen": 126246952, "step": 187380 }, { "epoch": 4.577846725136198, "grad_norm": 0.0005417978391051292, "learning_rate": 4.312126331401911e-08, "loss": 0.0, "num_input_tokens_seen": 126250344, "step": 187385 }, { "epoch": 4.577968875968045, "grad_norm": 7.682857358304318e-06, "learning_rate": 4.309649470267596e-08, "loss": 0.0, "num_input_tokens_seen": 126253608, "step": 187390 }, { "epoch": 4.578091026799893, "grad_norm": 0.00015603537030983716, "learning_rate": 4.3071733050192513e-08, "loss": 0.0, "num_input_tokens_seen": 126257064, "step": 187395 }, { "epoch": 4.578213177631739, "grad_norm": 4.7801844630157575e-05, "learning_rate": 4.304697835674864e-08, "loss": 0.0, "num_input_tokens_seen": 126260584, "step": 187400 }, { "epoch": 4.578335328463587, "grad_norm": 0.001868409919552505, "learning_rate": 4.302223062252475e-08, "loss": 0.0, "num_input_tokens_seen": 126263848, "step": 187405 }, { "epoch": 4.578457479295434, "grad_norm": 0.02896163798868656, "learning_rate": 4.2997489847700354e-08, "loss": 0.0, "num_input_tokens_seen": 126266984, "step": 187410 }, { "epoch": 4.578579630127281, "grad_norm": 0.004452672321349382, "learning_rate": 4.297275603245576e-08, "loss": 0.0, "num_input_tokens_seen": 126269800, "step": 187415 }, { "epoch": 4.578701780959128, "grad_norm": 0.00024759912048466504, "learning_rate": 4.29480291769706e-08, "loss": 0.0, "num_input_tokens_seen": 126273000, "step": 187420 }, { "epoch": 4.578823931790976, "grad_norm": 0.0007450595730915666, "learning_rate": 4.2923309281424734e-08, "loss": 0.0, "num_input_tokens_seen": 126276584, "step": 187425 }, { "epoch": 4.5789460826228225, "grad_norm": 8.44811656861566e-05, "learning_rate": 4.289859634599824e-08, "loss": 0.0, "num_input_tokens_seen": 126280488, "step": 187430 }, { "epoch": 4.57906823345467, "grad_norm": 0.00011353510490152985, "learning_rate": 4.28738903708703e-08, "loss": 0.0, "num_input_tokens_seen": 126284328, "step": 187435 }, { "epoch": 4.579190384286517, "grad_norm": 0.0032558231614530087, "learning_rate": 4.2849191356221116e-08, "loss": 0.0, "num_input_tokens_seen": 126287464, "step": 187440 }, { "epoch": 4.579312535118364, "grad_norm": 0.00015557577717117965, "learning_rate": 4.282449930222987e-08, "loss": 0.0, "num_input_tokens_seen": 126291112, "step": 187445 }, { "epoch": 4.579434685950211, "grad_norm": 0.00022213808551896363, "learning_rate": 4.27998142090763e-08, "loss": 0.0001, "num_input_tokens_seen": 126294760, "step": 187450 }, { "epoch": 4.579556836782059, "grad_norm": 0.04265875741839409, "learning_rate": 4.2775136076940054e-08, "loss": 0.0, "num_input_tokens_seen": 126298408, "step": 187455 }, { "epoch": 4.579678987613906, "grad_norm": 3.826288775599096e-06, "learning_rate": 4.275046490600043e-08, "loss": 0.0, "num_input_tokens_seen": 126302184, "step": 187460 }, { "epoch": 4.579801138445752, "grad_norm": 0.0010683677392080426, "learning_rate": 4.2725800696436945e-08, "loss": 0.0, "num_input_tokens_seen": 126306152, "step": 187465 }, { "epoch": 4.5799232892776, "grad_norm": 0.004442038480192423, "learning_rate": 4.270114344842879e-08, "loss": 0.0, "num_input_tokens_seen": 126309736, "step": 187470 }, { "epoch": 4.580045440109447, "grad_norm": 0.0018260219367220998, "learning_rate": 4.26764931621556e-08, "loss": 0.0, "num_input_tokens_seen": 126313256, "step": 187475 }, { "epoch": 4.580167590941294, "grad_norm": 0.00024842077982611954, "learning_rate": 4.265184983779624e-08, "loss": 0.0, "num_input_tokens_seen": 126316456, "step": 187480 }, { "epoch": 4.580289741773141, "grad_norm": 0.009610271081328392, "learning_rate": 4.262721347553033e-08, "loss": 0.0, "num_input_tokens_seen": 126319976, "step": 187485 }, { "epoch": 4.580411892604989, "grad_norm": 0.00014046119758859277, "learning_rate": 4.260258407553663e-08, "loss": 0.0, "num_input_tokens_seen": 126323496, "step": 187490 }, { "epoch": 4.580534043436836, "grad_norm": 4.547784919850528e-05, "learning_rate": 4.257796163799454e-08, "loss": 0.0, "num_input_tokens_seen": 126326888, "step": 187495 }, { "epoch": 4.580656194268683, "grad_norm": 0.00014775693125557154, "learning_rate": 4.2553346163083146e-08, "loss": 0.0, "num_input_tokens_seen": 126330280, "step": 187500 }, { "epoch": 4.58077834510053, "grad_norm": 2.9446891858242452e-05, "learning_rate": 4.2528737650981086e-08, "loss": 0.0, "num_input_tokens_seen": 126333160, "step": 187505 }, { "epoch": 4.580900495932378, "grad_norm": 0.00039331192965619266, "learning_rate": 4.250413610186765e-08, "loss": 0.0, "num_input_tokens_seen": 126336488, "step": 187510 }, { "epoch": 4.581022646764224, "grad_norm": 0.00026621881988830864, "learning_rate": 4.2479541515921816e-08, "loss": 0.0, "num_input_tokens_seen": 126339368, "step": 187515 }, { "epoch": 4.581144797596072, "grad_norm": 1.4885709788359236e-05, "learning_rate": 4.24549538933221e-08, "loss": 0.0, "num_input_tokens_seen": 126342440, "step": 187520 }, { "epoch": 4.581266948427919, "grad_norm": 5.755107486038469e-05, "learning_rate": 4.2430373234247696e-08, "loss": 0.0, "num_input_tokens_seen": 126346536, "step": 187525 }, { "epoch": 4.581389099259766, "grad_norm": 0.0004279443237464875, "learning_rate": 4.2405799538877016e-08, "loss": 0.0, "num_input_tokens_seen": 126350056, "step": 187530 }, { "epoch": 4.581511250091613, "grad_norm": 0.0019634836353361607, "learning_rate": 4.2381232807389035e-08, "loss": 0.0, "num_input_tokens_seen": 126353768, "step": 187535 }, { "epoch": 4.58163340092346, "grad_norm": 8.21991270640865e-05, "learning_rate": 4.2356673039962265e-08, "loss": 0.0, "num_input_tokens_seen": 126357096, "step": 187540 }, { "epoch": 4.5817555517553075, "grad_norm": 0.00014001928502693772, "learning_rate": 4.233212023677524e-08, "loss": 0.0, "num_input_tokens_seen": 126360360, "step": 187545 }, { "epoch": 4.581877702587155, "grad_norm": 1.1686901416396722e-05, "learning_rate": 4.2307574398006806e-08, "loss": 0.0, "num_input_tokens_seen": 126364328, "step": 187550 }, { "epoch": 4.581999853419002, "grad_norm": 0.00013509398559108377, "learning_rate": 4.228303552383516e-08, "loss": 0.0, "num_input_tokens_seen": 126367464, "step": 187555 }, { "epoch": 4.582122004250849, "grad_norm": 4.4286636693868786e-05, "learning_rate": 4.225850361443894e-08, "loss": 0.0, "num_input_tokens_seen": 126370856, "step": 187560 }, { "epoch": 4.582244155082696, "grad_norm": 2.9124090360710397e-05, "learning_rate": 4.223397866999634e-08, "loss": 0.0, "num_input_tokens_seen": 126374248, "step": 187565 }, { "epoch": 4.582366305914543, "grad_norm": 0.0018620576011016965, "learning_rate": 4.2209460690686096e-08, "loss": 0.0, "num_input_tokens_seen": 126377832, "step": 187570 }, { "epoch": 4.582488456746391, "grad_norm": 0.00014488592569250613, "learning_rate": 4.218494967668607e-08, "loss": 0.0, "num_input_tokens_seen": 126381352, "step": 187575 }, { "epoch": 4.582610607578237, "grad_norm": 9.13929907255806e-05, "learning_rate": 4.216044562817467e-08, "loss": 0.0, "num_input_tokens_seen": 126384872, "step": 187580 }, { "epoch": 4.582732758410085, "grad_norm": 0.0017216145060956478, "learning_rate": 4.213594854533031e-08, "loss": 0.029, "num_input_tokens_seen": 126388264, "step": 187585 }, { "epoch": 4.582854909241932, "grad_norm": 0.0009816106176003814, "learning_rate": 4.211145842833097e-08, "loss": 0.0, "num_input_tokens_seen": 126391400, "step": 187590 }, { "epoch": 4.582977060073779, "grad_norm": 9.432035585632548e-05, "learning_rate": 4.2086975277354606e-08, "loss": 0.0, "num_input_tokens_seen": 126394344, "step": 187595 }, { "epoch": 4.583099210905626, "grad_norm": 4.532306775217876e-05, "learning_rate": 4.206249909257953e-08, "loss": 0.0, "num_input_tokens_seen": 126397544, "step": 187600 }, { "epoch": 4.583221361737474, "grad_norm": 0.0012224335223436356, "learning_rate": 4.203802987418348e-08, "loss": 0.0, "num_input_tokens_seen": 126401320, "step": 187605 }, { "epoch": 4.5833435125693205, "grad_norm": 0.0014491344336420298, "learning_rate": 4.201356762234476e-08, "loss": 0.0, "num_input_tokens_seen": 126404904, "step": 187610 }, { "epoch": 4.583465663401168, "grad_norm": 0.0010459222830832005, "learning_rate": 4.1989112337240784e-08, "loss": 0.0005, "num_input_tokens_seen": 126408360, "step": 187615 }, { "epoch": 4.583587814233015, "grad_norm": 0.00420120544731617, "learning_rate": 4.1964664019049855e-08, "loss": 0.0, "num_input_tokens_seen": 126411560, "step": 187620 }, { "epoch": 4.5837099650648625, "grad_norm": 2.1079378711874597e-05, "learning_rate": 4.1940222667949385e-08, "loss": 0.0, "num_input_tokens_seen": 126414888, "step": 187625 }, { "epoch": 4.583832115896709, "grad_norm": 0.0007706163451075554, "learning_rate": 4.191578828411746e-08, "loss": 0.0, "num_input_tokens_seen": 126418600, "step": 187630 }, { "epoch": 4.583954266728556, "grad_norm": 0.0009319090167991817, "learning_rate": 4.18913608677316e-08, "loss": 0.0, "num_input_tokens_seen": 126421736, "step": 187635 }, { "epoch": 4.584076417560404, "grad_norm": 8.97532154340297e-05, "learning_rate": 4.1866940418969324e-08, "loss": 0.0, "num_input_tokens_seen": 126425064, "step": 187640 }, { "epoch": 4.58419856839225, "grad_norm": 0.0001152338954852894, "learning_rate": 4.1842526938008495e-08, "loss": 0.0, "num_input_tokens_seen": 126428456, "step": 187645 }, { "epoch": 4.584320719224098, "grad_norm": 0.0002968577609863132, "learning_rate": 4.181812042502641e-08, "loss": 0.0, "num_input_tokens_seen": 126431848, "step": 187650 }, { "epoch": 4.584442870055945, "grad_norm": 0.012818018905818462, "learning_rate": 4.179372088020083e-08, "loss": 0.0, "num_input_tokens_seen": 126435496, "step": 187655 }, { "epoch": 4.584565020887792, "grad_norm": 6.266115815378726e-05, "learning_rate": 4.176932830370894e-08, "loss": 0.0, "num_input_tokens_seen": 126438952, "step": 187660 }, { "epoch": 4.584687171719639, "grad_norm": 0.0003587511891964823, "learning_rate": 4.174494269572837e-08, "loss": 0.0, "num_input_tokens_seen": 126442216, "step": 187665 }, { "epoch": 4.584809322551487, "grad_norm": 2.6462874302524142e-05, "learning_rate": 4.172056405643609e-08, "loss": 0.0, "num_input_tokens_seen": 126445736, "step": 187670 }, { "epoch": 4.5849314733833335, "grad_norm": 0.002904921304434538, "learning_rate": 4.169619238600963e-08, "loss": 0.0, "num_input_tokens_seen": 126449064, "step": 187675 }, { "epoch": 4.585053624215181, "grad_norm": 9.107970981858671e-05, "learning_rate": 4.16718276846264e-08, "loss": 0.0, "num_input_tokens_seen": 126452328, "step": 187680 }, { "epoch": 4.585175775047028, "grad_norm": 1.338696984021226e-05, "learning_rate": 4.164746995246327e-08, "loss": 0.0, "num_input_tokens_seen": 126455912, "step": 187685 }, { "epoch": 4.5852979258788755, "grad_norm": 0.0004112111055292189, "learning_rate": 4.162311918969763e-08, "loss": 0.0, "num_input_tokens_seen": 126459368, "step": 187690 }, { "epoch": 4.585420076710722, "grad_norm": 1.6203335690079257e-05, "learning_rate": 4.1598775396506246e-08, "loss": 0.0, "num_input_tokens_seen": 126462952, "step": 187695 }, { "epoch": 4.58554222754257, "grad_norm": 0.004849761724472046, "learning_rate": 4.1574438573066526e-08, "loss": 0.0028, "num_input_tokens_seen": 126466088, "step": 187700 }, { "epoch": 4.585664378374417, "grad_norm": 7.962723611854017e-05, "learning_rate": 4.155010871955522e-08, "loss": 0.0, "num_input_tokens_seen": 126469288, "step": 187705 }, { "epoch": 4.585786529206263, "grad_norm": 0.00017755954468157142, "learning_rate": 4.1525785836149294e-08, "loss": 0.0, "num_input_tokens_seen": 126472744, "step": 187710 }, { "epoch": 4.585908680038111, "grad_norm": 3.693220060085878e-05, "learning_rate": 4.150146992302572e-08, "loss": 0.0, "num_input_tokens_seen": 126476456, "step": 187715 }, { "epoch": 4.586030830869959, "grad_norm": 0.0013728952035307884, "learning_rate": 4.147716098036103e-08, "loss": 0.0, "num_input_tokens_seen": 126479976, "step": 187720 }, { "epoch": 4.586152981701805, "grad_norm": 0.0031150600407272577, "learning_rate": 4.145285900833251e-08, "loss": 0.0, "num_input_tokens_seen": 126482984, "step": 187725 }, { "epoch": 4.586275132533652, "grad_norm": 0.0005813776515424252, "learning_rate": 4.142856400711647e-08, "loss": 0.0, "num_input_tokens_seen": 126486504, "step": 187730 }, { "epoch": 4.5863972833655, "grad_norm": 0.0002588435309007764, "learning_rate": 4.1404275976889666e-08, "loss": 0.0, "num_input_tokens_seen": 126489704, "step": 187735 }, { "epoch": 4.5865194341973465, "grad_norm": 3.303175981272943e-05, "learning_rate": 4.1379994917828956e-08, "loss": 0.0001, "num_input_tokens_seen": 126492712, "step": 187740 }, { "epoch": 4.586641585029194, "grad_norm": 0.0034465112257748842, "learning_rate": 4.135572083011074e-08, "loss": 0.0, "num_input_tokens_seen": 126495976, "step": 187745 }, { "epoch": 4.586763735861041, "grad_norm": 2.2768550479668193e-05, "learning_rate": 4.133145371391156e-08, "loss": 0.0, "num_input_tokens_seen": 126499304, "step": 187750 }, { "epoch": 4.5868858866928885, "grad_norm": 0.00033273472217842937, "learning_rate": 4.130719356940782e-08, "loss": 0.0, "num_input_tokens_seen": 126502760, "step": 187755 }, { "epoch": 4.587008037524735, "grad_norm": 0.0005335175665095448, "learning_rate": 4.128294039677605e-08, "loss": 0.0, "num_input_tokens_seen": 126506152, "step": 187760 }, { "epoch": 4.587130188356583, "grad_norm": 0.00044096849160268903, "learning_rate": 4.125869419619266e-08, "loss": 0.0, "num_input_tokens_seen": 126509608, "step": 187765 }, { "epoch": 4.58725233918843, "grad_norm": 3.677398490253836e-05, "learning_rate": 4.1234454967833844e-08, "loss": 0.0, "num_input_tokens_seen": 126512808, "step": 187770 }, { "epoch": 4.587374490020277, "grad_norm": 8.349936251761392e-05, "learning_rate": 4.121022271187602e-08, "loss": 0.0, "num_input_tokens_seen": 126516136, "step": 187775 }, { "epoch": 4.587496640852124, "grad_norm": 0.005426215007901192, "learning_rate": 4.1185997428495265e-08, "loss": 0.0001, "num_input_tokens_seen": 126519336, "step": 187780 }, { "epoch": 4.587618791683972, "grad_norm": 6.722656689817086e-05, "learning_rate": 4.1161779117868004e-08, "loss": 0.0, "num_input_tokens_seen": 126522856, "step": 187785 }, { "epoch": 4.587740942515818, "grad_norm": 0.00011253041884629056, "learning_rate": 4.113756778016997e-08, "loss": 0.0, "num_input_tokens_seen": 126525928, "step": 187790 }, { "epoch": 4.587863093347666, "grad_norm": 4.811697181139607e-06, "learning_rate": 4.1113363415577583e-08, "loss": 0.0, "num_input_tokens_seen": 126529448, "step": 187795 }, { "epoch": 4.587985244179513, "grad_norm": 0.005408538971096277, "learning_rate": 4.10891660242666e-08, "loss": 0.0, "num_input_tokens_seen": 126532904, "step": 187800 }, { "epoch": 4.58810739501136, "grad_norm": 0.038692690432071686, "learning_rate": 4.10649756064132e-08, "loss": 0.0, "num_input_tokens_seen": 126536104, "step": 187805 }, { "epoch": 4.588229545843207, "grad_norm": 0.000823112262878567, "learning_rate": 4.104079216219336e-08, "loss": 0.0, "num_input_tokens_seen": 126539304, "step": 187810 }, { "epoch": 4.588351696675055, "grad_norm": 0.0004939865320920944, "learning_rate": 4.101661569178261e-08, "loss": 0.0, "num_input_tokens_seen": 126542504, "step": 187815 }, { "epoch": 4.588473847506902, "grad_norm": 7.609864405822009e-05, "learning_rate": 4.099244619535702e-08, "loss": 0.0, "num_input_tokens_seen": 126545576, "step": 187820 }, { "epoch": 4.588595998338748, "grad_norm": 0.0031696229707449675, "learning_rate": 4.0968283673092244e-08, "loss": 0.0, "num_input_tokens_seen": 126549224, "step": 187825 }, { "epoch": 4.588718149170596, "grad_norm": 0.0009845405584201217, "learning_rate": 4.0944128125164014e-08, "loss": 0.0, "num_input_tokens_seen": 126552744, "step": 187830 }, { "epoch": 4.588840300002443, "grad_norm": 0.0037003783509135246, "learning_rate": 4.091997955174831e-08, "loss": 0.0, "num_input_tokens_seen": 126556840, "step": 187835 }, { "epoch": 4.58896245083429, "grad_norm": 0.002434109104797244, "learning_rate": 4.089583795302021e-08, "loss": 0.0, "num_input_tokens_seen": 126560680, "step": 187840 }, { "epoch": 4.589084601666137, "grad_norm": 0.0022978431079536676, "learning_rate": 4.0871703329155685e-08, "loss": 0.0, "num_input_tokens_seen": 126563880, "step": 187845 }, { "epoch": 4.589206752497985, "grad_norm": 8.7481748778373e-06, "learning_rate": 4.084757568033004e-08, "loss": 0.0, "num_input_tokens_seen": 126567720, "step": 187850 }, { "epoch": 4.5893289033298315, "grad_norm": 0.000550804368685931, "learning_rate": 4.082345500671869e-08, "loss": 0.0, "num_input_tokens_seen": 126570856, "step": 187855 }, { "epoch": 4.589451054161679, "grad_norm": 0.0002906092850025743, "learning_rate": 4.079934130849738e-08, "loss": 0.0, "num_input_tokens_seen": 126573800, "step": 187860 }, { "epoch": 4.589573204993526, "grad_norm": 0.0010242760181427002, "learning_rate": 4.077523458584109e-08, "loss": 0.0, "num_input_tokens_seen": 126577960, "step": 187865 }, { "epoch": 4.5896953558253735, "grad_norm": 0.00287572480738163, "learning_rate": 4.075113483892545e-08, "loss": 0.0436, "num_input_tokens_seen": 126581032, "step": 187870 }, { "epoch": 4.58981750665722, "grad_norm": 0.0011697241570800543, "learning_rate": 4.072704206792543e-08, "loss": 0.0, "num_input_tokens_seen": 126584808, "step": 187875 }, { "epoch": 4.589939657489068, "grad_norm": 0.00040922340122051537, "learning_rate": 4.070295627301656e-08, "loss": 0.0, "num_input_tokens_seen": 126588840, "step": 187880 }, { "epoch": 4.590061808320915, "grad_norm": 1.8101340174325742e-05, "learning_rate": 4.067887745437359e-08, "loss": 0.0, "num_input_tokens_seen": 126592232, "step": 187885 }, { "epoch": 4.590183959152762, "grad_norm": 0.0011178995482623577, "learning_rate": 4.0654805612171936e-08, "loss": 0.0, "num_input_tokens_seen": 126595560, "step": 187890 }, { "epoch": 4.590306109984609, "grad_norm": 5.905812940909527e-05, "learning_rate": 4.0630740746586564e-08, "loss": 0.0, "num_input_tokens_seen": 126598952, "step": 187895 }, { "epoch": 4.590428260816456, "grad_norm": 0.013189355842769146, "learning_rate": 4.060668285779256e-08, "loss": 0.0, "num_input_tokens_seen": 126602344, "step": 187900 }, { "epoch": 4.590550411648303, "grad_norm": 7.253669900819659e-05, "learning_rate": 4.0582631945964786e-08, "loss": 0.0, "num_input_tokens_seen": 126605736, "step": 187905 }, { "epoch": 4.590672562480151, "grad_norm": 0.00012215471360832453, "learning_rate": 4.055858801127809e-08, "loss": 0.0002, "num_input_tokens_seen": 126609064, "step": 187910 }, { "epoch": 4.590794713311998, "grad_norm": 3.525710053509101e-05, "learning_rate": 4.0534551053907464e-08, "loss": 0.0, "num_input_tokens_seen": 126612392, "step": 187915 }, { "epoch": 4.5909168641438445, "grad_norm": 0.0003965279902331531, "learning_rate": 4.0510521074027636e-08, "loss": 0.0, "num_input_tokens_seen": 126615528, "step": 187920 }, { "epoch": 4.591039014975692, "grad_norm": 0.0001456565223634243, "learning_rate": 4.0486498071813256e-08, "loss": 0.0, "num_input_tokens_seen": 126618920, "step": 187925 }, { "epoch": 4.591161165807539, "grad_norm": 0.00045940681593492627, "learning_rate": 4.0462482047439295e-08, "loss": 0.0, "num_input_tokens_seen": 126622120, "step": 187930 }, { "epoch": 4.5912833166393865, "grad_norm": 0.0001200491824420169, "learning_rate": 4.043847300108016e-08, "loss": 0.0, "num_input_tokens_seen": 126625576, "step": 187935 }, { "epoch": 4.591405467471233, "grad_norm": 0.0006971561233513057, "learning_rate": 4.041447093291062e-08, "loss": 0.0, "num_input_tokens_seen": 126629096, "step": 187940 }, { "epoch": 4.591527618303081, "grad_norm": 0.00014549454499501735, "learning_rate": 4.0390475843105066e-08, "loss": 0.0, "num_input_tokens_seen": 126632424, "step": 187945 }, { "epoch": 4.591649769134928, "grad_norm": 0.00033066223841160536, "learning_rate": 4.036648773183804e-08, "loss": 0.0, "num_input_tokens_seen": 126635880, "step": 187950 }, { "epoch": 4.591771919966775, "grad_norm": 5.113217775942758e-05, "learning_rate": 4.0342506599284175e-08, "loss": 0.0, "num_input_tokens_seen": 126639208, "step": 187955 }, { "epoch": 4.591894070798622, "grad_norm": 0.0005086977034807205, "learning_rate": 4.0318532445617557e-08, "loss": 0.0, "num_input_tokens_seen": 126643112, "step": 187960 }, { "epoch": 4.59201622163047, "grad_norm": 2.0289666281314567e-05, "learning_rate": 4.0294565271012825e-08, "loss": 0.0, "num_input_tokens_seen": 126646440, "step": 187965 }, { "epoch": 4.592138372462316, "grad_norm": 1.0712580660765525e-05, "learning_rate": 4.027060507564406e-08, "loss": 0.0, "num_input_tokens_seen": 126650024, "step": 187970 }, { "epoch": 4.592260523294164, "grad_norm": 0.003349565202370286, "learning_rate": 4.0246651859685675e-08, "loss": 0.0, "num_input_tokens_seen": 126653352, "step": 187975 }, { "epoch": 4.592382674126011, "grad_norm": 14.837358474731445, "learning_rate": 4.0222705623311645e-08, "loss": 0.0354, "num_input_tokens_seen": 126656552, "step": 187980 }, { "epoch": 4.592504824957858, "grad_norm": 1.570095082570333e-05, "learning_rate": 4.019876636669628e-08, "loss": 0.0, "num_input_tokens_seen": 126660072, "step": 187985 }, { "epoch": 4.592626975789705, "grad_norm": 9.002363367471844e-05, "learning_rate": 4.017483409001376e-08, "loss": 0.0, "num_input_tokens_seen": 126663272, "step": 187990 }, { "epoch": 4.592749126621552, "grad_norm": 0.000561710970941931, "learning_rate": 4.0150908793437854e-08, "loss": 0.0, "num_input_tokens_seen": 126666984, "step": 187995 }, { "epoch": 4.5928712774533995, "grad_norm": 0.00041520060040056705, "learning_rate": 4.0126990477142854e-08, "loss": 0.0, "num_input_tokens_seen": 126670504, "step": 188000 }, { "epoch": 4.592993428285246, "grad_norm": 0.001887407386675477, "learning_rate": 4.0103079141302507e-08, "loss": 0.0, "num_input_tokens_seen": 126673896, "step": 188005 }, { "epoch": 4.593115579117094, "grad_norm": 0.0003722521068993956, "learning_rate": 4.00791747860908e-08, "loss": 0.0, "num_input_tokens_seen": 126677096, "step": 188010 }, { "epoch": 4.593237729948941, "grad_norm": 0.0004346818896010518, "learning_rate": 4.005527741168147e-08, "loss": 0.0, "num_input_tokens_seen": 126680616, "step": 188015 }, { "epoch": 4.593359880780788, "grad_norm": 0.007946284487843513, "learning_rate": 4.003138701824826e-08, "loss": 0.0, "num_input_tokens_seen": 126684264, "step": 188020 }, { "epoch": 4.593482031612635, "grad_norm": 0.00010074281453853473, "learning_rate": 4.000750360596517e-08, "loss": 0.0, "num_input_tokens_seen": 126687336, "step": 188025 }, { "epoch": 4.593604182444483, "grad_norm": 0.0014380852226167917, "learning_rate": 3.998362717500558e-08, "loss": 0.0, "num_input_tokens_seen": 126690856, "step": 188030 }, { "epoch": 4.593726333276329, "grad_norm": 0.00030201810295693576, "learning_rate": 3.995975772554339e-08, "loss": 0.0394, "num_input_tokens_seen": 126694632, "step": 188035 }, { "epoch": 4.593848484108177, "grad_norm": 0.009521891362965107, "learning_rate": 3.9935895257751984e-08, "loss": 0.0, "num_input_tokens_seen": 126698024, "step": 188040 }, { "epoch": 4.593970634940024, "grad_norm": 0.0001786075154086575, "learning_rate": 3.9912039771804903e-08, "loss": 0.0, "num_input_tokens_seen": 126701544, "step": 188045 }, { "epoch": 4.594092785771871, "grad_norm": 0.00011570060451049358, "learning_rate": 3.98881912678759e-08, "loss": 0.0, "num_input_tokens_seen": 126704936, "step": 188050 }, { "epoch": 4.594214936603718, "grad_norm": 9.638779738452286e-05, "learning_rate": 3.986434974613806e-08, "loss": 0.0, "num_input_tokens_seen": 126708264, "step": 188055 }, { "epoch": 4.594337087435566, "grad_norm": 2.7568939913180657e-05, "learning_rate": 3.984051520676501e-08, "loss": 0.0, "num_input_tokens_seen": 126711272, "step": 188060 }, { "epoch": 4.5944592382674125, "grad_norm": 5.386893008108018e-06, "learning_rate": 3.981668764992985e-08, "loss": 0.0, "num_input_tokens_seen": 126714728, "step": 188065 }, { "epoch": 4.594581389099259, "grad_norm": 3.274362825322896e-05, "learning_rate": 3.979286707580598e-08, "loss": 0.0, "num_input_tokens_seen": 126718376, "step": 188070 }, { "epoch": 4.594703539931107, "grad_norm": 0.0004523637762758881, "learning_rate": 3.976905348456683e-08, "loss": 0.0, "num_input_tokens_seen": 126721384, "step": 188075 }, { "epoch": 4.5948256907629546, "grad_norm": 0.12980130314826965, "learning_rate": 3.9745246876385255e-08, "loss": 0.0001, "num_input_tokens_seen": 126724776, "step": 188080 }, { "epoch": 4.594947841594801, "grad_norm": 0.0001941550726769492, "learning_rate": 3.972144725143456e-08, "loss": 0.0, "num_input_tokens_seen": 126728168, "step": 188085 }, { "epoch": 4.595069992426648, "grad_norm": 0.12191027402877808, "learning_rate": 3.969765460988772e-08, "loss": 0.0, "num_input_tokens_seen": 126731176, "step": 188090 }, { "epoch": 4.595192143258496, "grad_norm": 1.0134815056517255e-05, "learning_rate": 3.9673868951918045e-08, "loss": 0.0, "num_input_tokens_seen": 126734504, "step": 188095 }, { "epoch": 4.595314294090342, "grad_norm": 0.0001895191235234961, "learning_rate": 3.9650090277698054e-08, "loss": 0.0, "num_input_tokens_seen": 126737512, "step": 188100 }, { "epoch": 4.59543644492219, "grad_norm": 0.000430260319262743, "learning_rate": 3.9626318587401066e-08, "loss": 0.0, "num_input_tokens_seen": 126740456, "step": 188105 }, { "epoch": 4.595558595754037, "grad_norm": 3.98890369979199e-05, "learning_rate": 3.960255388119971e-08, "loss": 0.0, "num_input_tokens_seen": 126743592, "step": 188110 }, { "epoch": 4.595680746585884, "grad_norm": 7.241523417178541e-05, "learning_rate": 3.957879615926696e-08, "loss": 0.0, "num_input_tokens_seen": 126747176, "step": 188115 }, { "epoch": 4.595802897417731, "grad_norm": 0.00023456825874745846, "learning_rate": 3.9555045421775566e-08, "loss": 0.0, "num_input_tokens_seen": 126751144, "step": 188120 }, { "epoch": 4.595925048249579, "grad_norm": 7.77715613367036e-05, "learning_rate": 3.9531301668898066e-08, "loss": 0.0, "num_input_tokens_seen": 126754664, "step": 188125 }, { "epoch": 4.596047199081426, "grad_norm": 0.00020698907610494643, "learning_rate": 3.950756490080742e-08, "loss": 0.0, "num_input_tokens_seen": 126757864, "step": 188130 }, { "epoch": 4.596169349913273, "grad_norm": 0.0008199518779292703, "learning_rate": 3.9483835117675947e-08, "loss": 0.0, "num_input_tokens_seen": 126761576, "step": 188135 }, { "epoch": 4.59629150074512, "grad_norm": 0.00011563400767045096, "learning_rate": 3.946011231967639e-08, "loss": 0.0, "num_input_tokens_seen": 126764968, "step": 188140 }, { "epoch": 4.596413651576968, "grad_norm": 0.002665347419679165, "learning_rate": 3.9436396506981383e-08, "loss": 0.0, "num_input_tokens_seen": 126768360, "step": 188145 }, { "epoch": 4.596535802408814, "grad_norm": 4.5120101276552305e-05, "learning_rate": 3.941268767976314e-08, "loss": 0.0, "num_input_tokens_seen": 126771944, "step": 188150 }, { "epoch": 4.596657953240662, "grad_norm": 0.00019696805975399911, "learning_rate": 3.938898583819428e-08, "loss": 0.0, "num_input_tokens_seen": 126775400, "step": 188155 }, { "epoch": 4.596780104072509, "grad_norm": 0.0002855850907508284, "learning_rate": 3.936529098244701e-08, "loss": 0.0, "num_input_tokens_seen": 126778856, "step": 188160 }, { "epoch": 4.5969022549043554, "grad_norm": 0.0016741352155804634, "learning_rate": 3.934160311269374e-08, "loss": 0.0, "num_input_tokens_seen": 126781864, "step": 188165 }, { "epoch": 4.597024405736203, "grad_norm": 0.00040661616367287934, "learning_rate": 3.931792222910679e-08, "loss": 0.0, "num_input_tokens_seen": 126785832, "step": 188170 }, { "epoch": 4.597146556568051, "grad_norm": 1.6207712178584188e-05, "learning_rate": 3.929424833185824e-08, "loss": 0.0, "num_input_tokens_seen": 126789352, "step": 188175 }, { "epoch": 4.5972687073998975, "grad_norm": 6.939090962987393e-05, "learning_rate": 3.9270581421120386e-08, "loss": 0.0, "num_input_tokens_seen": 126792424, "step": 188180 }, { "epoch": 4.597390858231744, "grad_norm": 0.5053022503852844, "learning_rate": 3.92469214970651e-08, "loss": 0.0002, "num_input_tokens_seen": 126795432, "step": 188185 }, { "epoch": 4.597513009063592, "grad_norm": 0.0003090602986048907, "learning_rate": 3.9223268559864796e-08, "loss": 0.0, "num_input_tokens_seen": 126798568, "step": 188190 }, { "epoch": 4.597635159895439, "grad_norm": 0.00018028549675364047, "learning_rate": 3.919962260969123e-08, "loss": 0.0, "num_input_tokens_seen": 126801832, "step": 188195 }, { "epoch": 4.597757310727286, "grad_norm": 0.0002370340225752443, "learning_rate": 3.917598364671637e-08, "loss": 0.0, "num_input_tokens_seen": 126805416, "step": 188200 }, { "epoch": 4.597879461559133, "grad_norm": 0.0002550011849962175, "learning_rate": 3.9152351671112305e-08, "loss": 0.0, "num_input_tokens_seen": 126808680, "step": 188205 }, { "epoch": 4.598001612390981, "grad_norm": 1.735908335831482e-05, "learning_rate": 3.9128726683050675e-08, "loss": 0.0, "num_input_tokens_seen": 126812072, "step": 188210 }, { "epoch": 4.598123763222827, "grad_norm": 0.0027028501499444246, "learning_rate": 3.9105108682703447e-08, "loss": 0.0, "num_input_tokens_seen": 126815528, "step": 188215 }, { "epoch": 4.598245914054675, "grad_norm": 0.0037059872411191463, "learning_rate": 3.908149767024238e-08, "loss": 0.0, "num_input_tokens_seen": 126818344, "step": 188220 }, { "epoch": 4.598368064886522, "grad_norm": 9.531846046447754, "learning_rate": 3.9057893645839005e-08, "loss": 0.0488, "num_input_tokens_seen": 126821800, "step": 188225 }, { "epoch": 4.598490215718369, "grad_norm": 0.00041096939821727574, "learning_rate": 3.903429660966517e-08, "loss": 0.0001, "num_input_tokens_seen": 126825384, "step": 188230 }, { "epoch": 4.598612366550216, "grad_norm": 0.00016224953287746757, "learning_rate": 3.901070656189231e-08, "loss": 0.0, "num_input_tokens_seen": 126828584, "step": 188235 }, { "epoch": 4.598734517382064, "grad_norm": 0.0003495425044093281, "learning_rate": 3.898712350269218e-08, "loss": 0.0, "num_input_tokens_seen": 126832232, "step": 188240 }, { "epoch": 4.5988566682139105, "grad_norm": 0.00017650242079980671, "learning_rate": 3.8963547432236064e-08, "loss": 0.0, "num_input_tokens_seen": 126835624, "step": 188245 }, { "epoch": 4.598978819045758, "grad_norm": 0.00018021099094767123, "learning_rate": 3.8939978350695625e-08, "loss": 0.0, "num_input_tokens_seen": 126839336, "step": 188250 }, { "epoch": 4.599100969877605, "grad_norm": 2.55699414992705e-05, "learning_rate": 3.8916416258242045e-08, "loss": 0.0, "num_input_tokens_seen": 126842600, "step": 188255 }, { "epoch": 4.599223120709452, "grad_norm": 1.4513319911202416e-05, "learning_rate": 3.889286115504686e-08, "loss": 0.0, "num_input_tokens_seen": 126845608, "step": 188260 }, { "epoch": 4.599345271541299, "grad_norm": 8.852702740114182e-05, "learning_rate": 3.886931304128127e-08, "loss": 0.0, "num_input_tokens_seen": 126848936, "step": 188265 }, { "epoch": 4.599467422373146, "grad_norm": 0.0014767482643947005, "learning_rate": 3.8845771917116466e-08, "loss": 0.0, "num_input_tokens_seen": 126851816, "step": 188270 }, { "epoch": 4.599589573204994, "grad_norm": 0.0007634480716660619, "learning_rate": 3.882223778272398e-08, "loss": 0.0, "num_input_tokens_seen": 126855272, "step": 188275 }, { "epoch": 4.59971172403684, "grad_norm": 3.212130468455143e-05, "learning_rate": 3.879871063827445e-08, "loss": 0.0, "num_input_tokens_seen": 126858792, "step": 188280 }, { "epoch": 4.599833874868688, "grad_norm": 0.001050909049808979, "learning_rate": 3.87751904839394e-08, "loss": 0.0, "num_input_tokens_seen": 126862504, "step": 188285 }, { "epoch": 4.599956025700535, "grad_norm": 9.944814519258216e-05, "learning_rate": 3.8751677319889485e-08, "loss": 0.0, "num_input_tokens_seen": 126865320, "step": 188290 }, { "epoch": 4.600078176532382, "grad_norm": 0.000473526946734637, "learning_rate": 3.872817114629601e-08, "loss": 0.0, "num_input_tokens_seen": 126868264, "step": 188295 }, { "epoch": 4.600200327364229, "grad_norm": 5.8090405218536034e-05, "learning_rate": 3.8704671963329935e-08, "loss": 0.0, "num_input_tokens_seen": 126871464, "step": 188300 }, { "epoch": 4.600322478196077, "grad_norm": 0.00030619362951256335, "learning_rate": 3.868117977116192e-08, "loss": 0.0, "num_input_tokens_seen": 126874728, "step": 188305 }, { "epoch": 4.6004446290279235, "grad_norm": 0.0002061193372355774, "learning_rate": 3.865769456996304e-08, "loss": 0.0, "num_input_tokens_seen": 126878120, "step": 188310 }, { "epoch": 4.600566779859771, "grad_norm": 0.00028244717395864427, "learning_rate": 3.863421635990394e-08, "loss": 0.0682, "num_input_tokens_seen": 126881448, "step": 188315 }, { "epoch": 4.600688930691618, "grad_norm": 0.003838619915768504, "learning_rate": 3.861074514115536e-08, "loss": 0.0, "num_input_tokens_seen": 126884584, "step": 188320 }, { "epoch": 4.6008110815234655, "grad_norm": 4.583996633300558e-05, "learning_rate": 3.858728091388819e-08, "loss": 0.0, "num_input_tokens_seen": 126887784, "step": 188325 }, { "epoch": 4.600933232355312, "grad_norm": 0.00048598088324069977, "learning_rate": 3.85638236782726e-08, "loss": 0.0, "num_input_tokens_seen": 126890920, "step": 188330 }, { "epoch": 4.601055383187159, "grad_norm": 5.8064073527930304e-05, "learning_rate": 3.854037343447969e-08, "loss": 0.0, "num_input_tokens_seen": 126893928, "step": 188335 }, { "epoch": 4.601177534019007, "grad_norm": 0.00011497107334434986, "learning_rate": 3.8516930182679765e-08, "loss": 0.0, "num_input_tokens_seen": 126897256, "step": 188340 }, { "epoch": 4.601299684850854, "grad_norm": 2.2357677153195255e-05, "learning_rate": 3.849349392304335e-08, "loss": 0.0, "num_input_tokens_seen": 126900392, "step": 188345 }, { "epoch": 4.601421835682701, "grad_norm": 0.0005386814009398222, "learning_rate": 3.8470064655740655e-08, "loss": 0.0, "num_input_tokens_seen": 126903656, "step": 188350 }, { "epoch": 4.601543986514548, "grad_norm": 0.00010427878441987559, "learning_rate": 3.844664238094242e-08, "loss": 0.0, "num_input_tokens_seen": 126906792, "step": 188355 }, { "epoch": 4.601666137346395, "grad_norm": 0.00020836573094129562, "learning_rate": 3.842322709881884e-08, "loss": 0.0, "num_input_tokens_seen": 126910440, "step": 188360 }, { "epoch": 4.601788288178242, "grad_norm": 0.0036403266713023186, "learning_rate": 3.839981880954013e-08, "loss": 0.0, "num_input_tokens_seen": 126913704, "step": 188365 }, { "epoch": 4.60191043901009, "grad_norm": 2.5985518732341006e-05, "learning_rate": 3.837641751327669e-08, "loss": 0.0, "num_input_tokens_seen": 126916904, "step": 188370 }, { "epoch": 4.6020325898419365, "grad_norm": 2.1882120563532226e-05, "learning_rate": 3.835302321019851e-08, "loss": 0.0, "num_input_tokens_seen": 126920296, "step": 188375 }, { "epoch": 4.602154740673784, "grad_norm": 0.002450756961479783, "learning_rate": 3.832963590047589e-08, "loss": 0.0, "num_input_tokens_seen": 126924456, "step": 188380 }, { "epoch": 4.602276891505631, "grad_norm": 0.0028287044260650873, "learning_rate": 3.830625558427869e-08, "loss": 0.0005, "num_input_tokens_seen": 126927656, "step": 188385 }, { "epoch": 4.6023990423374785, "grad_norm": 0.00017941469559445977, "learning_rate": 3.8282882261777e-08, "loss": 0.0, "num_input_tokens_seen": 126931240, "step": 188390 }, { "epoch": 4.602521193169325, "grad_norm": 0.0003415448300074786, "learning_rate": 3.825951593314103e-08, "loss": 0.0, "num_input_tokens_seen": 126934568, "step": 188395 }, { "epoch": 4.602643344001173, "grad_norm": 0.0003005169564858079, "learning_rate": 3.82361565985404e-08, "loss": 0.0, "num_input_tokens_seen": 126938024, "step": 188400 }, { "epoch": 4.60276549483302, "grad_norm": 0.0007993972394615412, "learning_rate": 3.8212804258145324e-08, "loss": 0.0, "num_input_tokens_seen": 126941544, "step": 188405 }, { "epoch": 4.602887645664867, "grad_norm": 0.0006093989941291511, "learning_rate": 3.818945891212522e-08, "loss": 0.0, "num_input_tokens_seen": 126944488, "step": 188410 }, { "epoch": 4.603009796496714, "grad_norm": 0.00032665004255250096, "learning_rate": 3.816612056065016e-08, "loss": 0.0, "num_input_tokens_seen": 126947816, "step": 188415 }, { "epoch": 4.603131947328562, "grad_norm": 0.00045783931273035705, "learning_rate": 3.814278920388969e-08, "loss": 0.0, "num_input_tokens_seen": 126951400, "step": 188420 }, { "epoch": 4.603254098160408, "grad_norm": 0.03975209966301918, "learning_rate": 3.811946484201378e-08, "loss": 0.0764, "num_input_tokens_seen": 126954536, "step": 188425 }, { "epoch": 4.603376248992255, "grad_norm": 0.0011893432820215821, "learning_rate": 3.809614747519174e-08, "loss": 0.0, "num_input_tokens_seen": 126957288, "step": 188430 }, { "epoch": 4.603498399824103, "grad_norm": 9.247218258678913e-06, "learning_rate": 3.8072837103593106e-08, "loss": 0.0, "num_input_tokens_seen": 126960744, "step": 188435 }, { "epoch": 4.60362055065595, "grad_norm": 9.148565732175484e-05, "learning_rate": 3.804953372738762e-08, "loss": 0.0, "num_input_tokens_seen": 126963688, "step": 188440 }, { "epoch": 4.603742701487797, "grad_norm": 3.173185177729465e-05, "learning_rate": 3.8026237346744596e-08, "loss": 0.0, "num_input_tokens_seen": 126967528, "step": 188445 }, { "epoch": 4.603864852319644, "grad_norm": 0.00013901907368563116, "learning_rate": 3.8002947961833565e-08, "loss": 0.0, "num_input_tokens_seen": 126970856, "step": 188450 }, { "epoch": 4.603987003151492, "grad_norm": 0.0020034005865454674, "learning_rate": 3.797966557282384e-08, "loss": 0.062, "num_input_tokens_seen": 126974056, "step": 188455 }, { "epoch": 4.604109153983338, "grad_norm": 0.012726671993732452, "learning_rate": 3.795639017988472e-08, "loss": 0.0, "num_input_tokens_seen": 126978408, "step": 188460 }, { "epoch": 4.604231304815186, "grad_norm": 0.00035635806852951646, "learning_rate": 3.793312178318553e-08, "loss": 0.0, "num_input_tokens_seen": 126981736, "step": 188465 }, { "epoch": 4.604353455647033, "grad_norm": 6.302624387899414e-05, "learning_rate": 3.7909860382895455e-08, "loss": 0.0002, "num_input_tokens_seen": 126984936, "step": 188470 }, { "epoch": 4.60447560647888, "grad_norm": 1.5507221178268082e-05, "learning_rate": 3.788660597918347e-08, "loss": 0.0, "num_input_tokens_seen": 126988072, "step": 188475 }, { "epoch": 4.604597757310727, "grad_norm": 0.00047399455797858536, "learning_rate": 3.786335857221912e-08, "loss": 0.0, "num_input_tokens_seen": 126991464, "step": 188480 }, { "epoch": 4.604719908142575, "grad_norm": 3.249979636166245e-05, "learning_rate": 3.7840118162171033e-08, "loss": 0.0, "num_input_tokens_seen": 126994792, "step": 188485 }, { "epoch": 4.6048420589744214, "grad_norm": 0.014283166266977787, "learning_rate": 3.781688474920852e-08, "loss": 0.0, "num_input_tokens_seen": 126997800, "step": 188490 }, { "epoch": 4.604964209806269, "grad_norm": 9.774186037248e-05, "learning_rate": 3.779365833350035e-08, "loss": 0.0001, "num_input_tokens_seen": 127001000, "step": 188495 }, { "epoch": 4.605086360638116, "grad_norm": 1.9654957213788293e-05, "learning_rate": 3.7770438915215586e-08, "loss": 0.0, "num_input_tokens_seen": 127004648, "step": 188500 }, { "epoch": 4.6052085114699635, "grad_norm": 0.0008767848485149443, "learning_rate": 3.7747226494522775e-08, "loss": 0.0, "num_input_tokens_seen": 127007976, "step": 188505 }, { "epoch": 4.60533066230181, "grad_norm": 0.0007073588203638792, "learning_rate": 3.7724021071591116e-08, "loss": 0.0, "num_input_tokens_seen": 127011624, "step": 188510 }, { "epoch": 4.605452813133658, "grad_norm": 0.0001672497164690867, "learning_rate": 3.770082264658925e-08, "loss": 0.0, "num_input_tokens_seen": 127014760, "step": 188515 }, { "epoch": 4.605574963965505, "grad_norm": 0.0004079336067661643, "learning_rate": 3.7677631219685704e-08, "loss": 0.0, "num_input_tokens_seen": 127018152, "step": 188520 }, { "epoch": 4.605697114797351, "grad_norm": 0.0006070801755413413, "learning_rate": 3.765444679104934e-08, "loss": 0.0, "num_input_tokens_seen": 127021544, "step": 188525 }, { "epoch": 4.605819265629199, "grad_norm": 7.90668127592653e-05, "learning_rate": 3.7631269360848706e-08, "loss": 0.0, "num_input_tokens_seen": 127024936, "step": 188530 }, { "epoch": 4.605941416461046, "grad_norm": 7.476261089323089e-05, "learning_rate": 3.7608098929252205e-08, "loss": 0.0, "num_input_tokens_seen": 127028136, "step": 188535 }, { "epoch": 4.606063567292893, "grad_norm": 0.0001280046271858737, "learning_rate": 3.7584935496428604e-08, "loss": 0.0, "num_input_tokens_seen": 127031336, "step": 188540 }, { "epoch": 4.60618571812474, "grad_norm": 0.023287976160645485, "learning_rate": 3.756177906254609e-08, "loss": 0.0, "num_input_tokens_seen": 127035048, "step": 188545 }, { "epoch": 4.606307868956588, "grad_norm": 0.003366853343322873, "learning_rate": 3.753862962777321e-08, "loss": 0.0, "num_input_tokens_seen": 127038568, "step": 188550 }, { "epoch": 4.6064300197884345, "grad_norm": 0.0007673184736631811, "learning_rate": 3.751548719227826e-08, "loss": 0.0, "num_input_tokens_seen": 127041704, "step": 188555 }, { "epoch": 4.606552170620282, "grad_norm": 0.0015113946283236146, "learning_rate": 3.749235175622967e-08, "loss": 0.0, "num_input_tokens_seen": 127045416, "step": 188560 }, { "epoch": 4.606674321452129, "grad_norm": 0.00047893551527522504, "learning_rate": 3.746922331979552e-08, "loss": 0.0, "num_input_tokens_seen": 127048744, "step": 188565 }, { "epoch": 4.6067964722839765, "grad_norm": 0.0014517490053549409, "learning_rate": 3.744610188314401e-08, "loss": 0.0, "num_input_tokens_seen": 127052264, "step": 188570 }, { "epoch": 4.606918623115823, "grad_norm": 0.00027874510851688683, "learning_rate": 3.7422987446443455e-08, "loss": 0.0005, "num_input_tokens_seen": 127055528, "step": 188575 }, { "epoch": 4.607040773947671, "grad_norm": 1.3206047697167378e-05, "learning_rate": 3.739988000986172e-08, "loss": 0.0, "num_input_tokens_seen": 127059048, "step": 188580 }, { "epoch": 4.607162924779518, "grad_norm": 0.0002179906005039811, "learning_rate": 3.7376779573567106e-08, "loss": 0.0, "num_input_tokens_seen": 127062504, "step": 188585 }, { "epoch": 4.607285075611365, "grad_norm": 0.0002854499325621873, "learning_rate": 3.735368613772727e-08, "loss": 0.0, "num_input_tokens_seen": 127065896, "step": 188590 }, { "epoch": 4.607407226443212, "grad_norm": 0.0003474639088381082, "learning_rate": 3.733059970251051e-08, "loss": 0.0, "num_input_tokens_seen": 127069160, "step": 188595 }, { "epoch": 4.60752937727506, "grad_norm": 0.0037290072068572044, "learning_rate": 3.7307520268084483e-08, "loss": 0.0, "num_input_tokens_seen": 127072552, "step": 188600 }, { "epoch": 4.607651528106906, "grad_norm": 0.002940698992460966, "learning_rate": 3.728444783461704e-08, "loss": 0.0, "num_input_tokens_seen": 127076008, "step": 188605 }, { "epoch": 4.607773678938754, "grad_norm": 0.00025564880343154073, "learning_rate": 3.726138240227628e-08, "loss": 0.0, "num_input_tokens_seen": 127079336, "step": 188610 }, { "epoch": 4.607895829770601, "grad_norm": 0.000302201573504135, "learning_rate": 3.723832397122939e-08, "loss": 0.0, "num_input_tokens_seen": 127082664, "step": 188615 }, { "epoch": 4.6080179806024475, "grad_norm": 0.0005223817424848676, "learning_rate": 3.721527254164458e-08, "loss": 0.0, "num_input_tokens_seen": 127085672, "step": 188620 }, { "epoch": 4.608140131434295, "grad_norm": 0.00040818838169798255, "learning_rate": 3.719222811368916e-08, "loss": 0.0, "num_input_tokens_seen": 127088872, "step": 188625 }, { "epoch": 4.608262282266142, "grad_norm": 0.0005449874443002045, "learning_rate": 3.716919068753088e-08, "loss": 0.0, "num_input_tokens_seen": 127092328, "step": 188630 }, { "epoch": 4.6083844330979895, "grad_norm": 3.8353104173438624e-05, "learning_rate": 3.714616026333728e-08, "loss": 0.0, "num_input_tokens_seen": 127095656, "step": 188635 }, { "epoch": 4.608506583929836, "grad_norm": 0.0029568178579211235, "learning_rate": 3.712313684127566e-08, "loss": 0.0446, "num_input_tokens_seen": 127099176, "step": 188640 }, { "epoch": 4.608628734761684, "grad_norm": 0.00016009059618227184, "learning_rate": 3.710012042151367e-08, "loss": 0.0, "num_input_tokens_seen": 127102440, "step": 188645 }, { "epoch": 4.608750885593531, "grad_norm": 9.877441334538162e-05, "learning_rate": 3.70771110042184e-08, "loss": 0.0, "num_input_tokens_seen": 127106216, "step": 188650 }, { "epoch": 4.608873036425378, "grad_norm": 0.00012277076893951744, "learning_rate": 3.70541085895576e-08, "loss": 0.0, "num_input_tokens_seen": 127109672, "step": 188655 }, { "epoch": 4.608995187257225, "grad_norm": 0.0019494864391162992, "learning_rate": 3.703111317769814e-08, "loss": 0.0329, "num_input_tokens_seen": 127112424, "step": 188660 }, { "epoch": 4.609117338089073, "grad_norm": 0.03183303773403168, "learning_rate": 3.700812476880744e-08, "loss": 0.0, "num_input_tokens_seen": 127115624, "step": 188665 }, { "epoch": 4.609239488920919, "grad_norm": 0.000619585276581347, "learning_rate": 3.6985143363052806e-08, "loss": 0.0, "num_input_tokens_seen": 127119016, "step": 188670 }, { "epoch": 4.609361639752767, "grad_norm": 0.0004196378868073225, "learning_rate": 3.696216896060112e-08, "loss": 0.0, "num_input_tokens_seen": 127122600, "step": 188675 }, { "epoch": 4.609483790584614, "grad_norm": 0.00010272400686517358, "learning_rate": 3.693920156161967e-08, "loss": 0.0, "num_input_tokens_seen": 127126312, "step": 188680 }, { "epoch": 4.609605941416461, "grad_norm": 0.06050526350736618, "learning_rate": 3.691624116627523e-08, "loss": 0.0, "num_input_tokens_seen": 127130536, "step": 188685 }, { "epoch": 4.609728092248308, "grad_norm": 0.0011279457248747349, "learning_rate": 3.6893287774735106e-08, "loss": 0.0512, "num_input_tokens_seen": 127133800, "step": 188690 }, { "epoch": 4.609850243080155, "grad_norm": 0.0002741161733865738, "learning_rate": 3.687034138716594e-08, "loss": 0.0, "num_input_tokens_seen": 127137832, "step": 188695 }, { "epoch": 4.6099723939120025, "grad_norm": 0.019624967128038406, "learning_rate": 3.6847402003734596e-08, "loss": 0.0001, "num_input_tokens_seen": 127141160, "step": 188700 }, { "epoch": 4.61009454474385, "grad_norm": 1.8829721739166416e-05, "learning_rate": 3.682446962460817e-08, "loss": 0.0001, "num_input_tokens_seen": 127144552, "step": 188705 }, { "epoch": 4.610216695575697, "grad_norm": 0.00046129096881486475, "learning_rate": 3.680154424995319e-08, "loss": 0.0, "num_input_tokens_seen": 127148136, "step": 188710 }, { "epoch": 4.610338846407544, "grad_norm": 6.336476508295164e-05, "learning_rate": 3.677862587993652e-08, "loss": 0.0, "num_input_tokens_seen": 127151656, "step": 188715 }, { "epoch": 4.610460997239391, "grad_norm": 1.540117955300957e-05, "learning_rate": 3.675571451472459e-08, "loss": 0.0, "num_input_tokens_seen": 127155112, "step": 188720 }, { "epoch": 4.610583148071238, "grad_norm": 4.7324389015557244e-05, "learning_rate": 3.673281015448437e-08, "loss": 0.0, "num_input_tokens_seen": 127158632, "step": 188725 }, { "epoch": 4.610705298903086, "grad_norm": 0.00022235990036278963, "learning_rate": 3.670991279938218e-08, "loss": 0.0, "num_input_tokens_seen": 127162216, "step": 188730 }, { "epoch": 4.610827449734932, "grad_norm": 8.080943371169269e-05, "learning_rate": 3.668702244958466e-08, "loss": 0.0, "num_input_tokens_seen": 127165864, "step": 188735 }, { "epoch": 4.61094960056678, "grad_norm": 0.0002779005444608629, "learning_rate": 3.6664139105258115e-08, "loss": 0.0, "num_input_tokens_seen": 127169256, "step": 188740 }, { "epoch": 4.611071751398627, "grad_norm": 0.00119435612577945, "learning_rate": 3.664126276656909e-08, "loss": 0.0, "num_input_tokens_seen": 127172648, "step": 188745 }, { "epoch": 4.611193902230474, "grad_norm": 0.04748645797371864, "learning_rate": 3.6618393433684006e-08, "loss": 0.0, "num_input_tokens_seen": 127176040, "step": 188750 }, { "epoch": 4.611316053062321, "grad_norm": 0.0005814972682856023, "learning_rate": 3.6595531106768826e-08, "loss": 0.0001, "num_input_tokens_seen": 127179176, "step": 188755 }, { "epoch": 4.611438203894169, "grad_norm": 9.456976113142446e-05, "learning_rate": 3.657267578599021e-08, "loss": 0.0001, "num_input_tokens_seen": 127182184, "step": 188760 }, { "epoch": 4.6115603547260156, "grad_norm": 0.0006560469628311694, "learning_rate": 3.654982747151436e-08, "loss": 0.0, "num_input_tokens_seen": 127185512, "step": 188765 }, { "epoch": 4.611682505557863, "grad_norm": 4.1001934732776135e-05, "learning_rate": 3.652698616350713e-08, "loss": 0.0002, "num_input_tokens_seen": 127189096, "step": 188770 }, { "epoch": 4.61180465638971, "grad_norm": 0.001413383288308978, "learning_rate": 3.650415186213485e-08, "loss": 0.0, "num_input_tokens_seen": 127192744, "step": 188775 }, { "epoch": 4.611926807221558, "grad_norm": 0.06415407359600067, "learning_rate": 3.648132456756348e-08, "loss": 0.0, "num_input_tokens_seen": 127195944, "step": 188780 }, { "epoch": 4.612048958053404, "grad_norm": 0.47165924310684204, "learning_rate": 3.645850427995911e-08, "loss": 0.0002, "num_input_tokens_seen": 127199912, "step": 188785 }, { "epoch": 4.612171108885251, "grad_norm": 0.0006177661707624793, "learning_rate": 3.643569099948773e-08, "loss": 0.0, "num_input_tokens_seen": 127203560, "step": 188790 }, { "epoch": 4.612293259717099, "grad_norm": 0.0027711568400263786, "learning_rate": 3.6412884726315095e-08, "loss": 0.0, "num_input_tokens_seen": 127206760, "step": 188795 }, { "epoch": 4.612415410548946, "grad_norm": 5.98339902353473e-05, "learning_rate": 3.639008546060718e-08, "loss": 0.0, "num_input_tokens_seen": 127210344, "step": 188800 }, { "epoch": 4.612537561380793, "grad_norm": 0.0010224354919046164, "learning_rate": 3.636729320252962e-08, "loss": 0.0, "num_input_tokens_seen": 127214504, "step": 188805 }, { "epoch": 4.61265971221264, "grad_norm": 2.919272446888499e-05, "learning_rate": 3.6344507952248525e-08, "loss": 0.0002, "num_input_tokens_seen": 127217896, "step": 188810 }, { "epoch": 4.6127818630444875, "grad_norm": 6.360562838381156e-05, "learning_rate": 3.632172970992919e-08, "loss": 0.0, "num_input_tokens_seen": 127221288, "step": 188815 }, { "epoch": 4.612904013876334, "grad_norm": 0.00672942353412509, "learning_rate": 3.6298958475737384e-08, "loss": 0.0, "num_input_tokens_seen": 127224808, "step": 188820 }, { "epoch": 4.613026164708182, "grad_norm": 0.0002779511851258576, "learning_rate": 3.6276194249838855e-08, "loss": 0.0, "num_input_tokens_seen": 127228456, "step": 188825 }, { "epoch": 4.613148315540029, "grad_norm": 0.0013309585629031062, "learning_rate": 3.625343703239903e-08, "loss": 0.0, "num_input_tokens_seen": 127231976, "step": 188830 }, { "epoch": 4.613270466371876, "grad_norm": 0.00028830976225435734, "learning_rate": 3.623068682358354e-08, "loss": 0.0, "num_input_tokens_seen": 127235368, "step": 188835 }, { "epoch": 4.613392617203723, "grad_norm": 0.0012144579086452723, "learning_rate": 3.620794362355761e-08, "loss": 0.0, "num_input_tokens_seen": 127238568, "step": 188840 }, { "epoch": 4.613514768035571, "grad_norm": 7.327213825192302e-05, "learning_rate": 3.6185207432486764e-08, "loss": 0.0, "num_input_tokens_seen": 127241896, "step": 188845 }, { "epoch": 4.613636918867417, "grad_norm": 0.0004144865379203111, "learning_rate": 3.616247825053631e-08, "loss": 0.0, "num_input_tokens_seen": 127245224, "step": 188850 }, { "epoch": 4.613759069699265, "grad_norm": 8.64536632434465e-05, "learning_rate": 3.6139756077871563e-08, "loss": 0.0164, "num_input_tokens_seen": 127248488, "step": 188855 }, { "epoch": 4.613881220531112, "grad_norm": 7.078130875015631e-05, "learning_rate": 3.6117040914657726e-08, "loss": 0.0, "num_input_tokens_seen": 127251624, "step": 188860 }, { "epoch": 4.614003371362959, "grad_norm": 0.00017707333608996123, "learning_rate": 3.6094332761059995e-08, "loss": 0.0, "num_input_tokens_seen": 127254824, "step": 188865 }, { "epoch": 4.614125522194806, "grad_norm": 0.00012071825767634436, "learning_rate": 3.6071631617243694e-08, "loss": 0.0, "num_input_tokens_seen": 127258216, "step": 188870 }, { "epoch": 4.614247673026654, "grad_norm": 0.006401208695024252, "learning_rate": 3.604893748337356e-08, "loss": 0.0, "num_input_tokens_seen": 127261736, "step": 188875 }, { "epoch": 4.6143698238585005, "grad_norm": 6.188951374497265e-05, "learning_rate": 3.6026250359614926e-08, "loss": 0.0, "num_input_tokens_seen": 127265256, "step": 188880 }, { "epoch": 4.614491974690347, "grad_norm": 3.674550316645764e-05, "learning_rate": 3.600357024613265e-08, "loss": 0.0, "num_input_tokens_seen": 127269096, "step": 188885 }, { "epoch": 4.614614125522195, "grad_norm": 0.0004047330003231764, "learning_rate": 3.598089714309172e-08, "loss": 0.0, "num_input_tokens_seen": 127272744, "step": 188890 }, { "epoch": 4.614736276354042, "grad_norm": 0.0006450513028539717, "learning_rate": 3.5958231050656985e-08, "loss": 0.0, "num_input_tokens_seen": 127276136, "step": 188895 }, { "epoch": 4.614858427185889, "grad_norm": 0.00035003412631340325, "learning_rate": 3.5935571968993215e-08, "loss": 0.0, "num_input_tokens_seen": 127279400, "step": 188900 }, { "epoch": 4.614980578017736, "grad_norm": 9.33985211304389e-05, "learning_rate": 3.5912919898265394e-08, "loss": 0.0, "num_input_tokens_seen": 127283112, "step": 188905 }, { "epoch": 4.615102728849584, "grad_norm": 1.889674058475066e-05, "learning_rate": 3.5890274838638044e-08, "loss": 0.0, "num_input_tokens_seen": 127286632, "step": 188910 }, { "epoch": 4.61522487968143, "grad_norm": 0.001484995591454208, "learning_rate": 3.5867636790275933e-08, "loss": 0.0, "num_input_tokens_seen": 127289832, "step": 188915 }, { "epoch": 4.615347030513278, "grad_norm": 0.002284928457811475, "learning_rate": 3.5845005753343704e-08, "loss": 0.0, "num_input_tokens_seen": 127292840, "step": 188920 }, { "epoch": 4.615469181345125, "grad_norm": 0.0001229102781508118, "learning_rate": 3.582238172800589e-08, "loss": 0.0, "num_input_tokens_seen": 127296104, "step": 188925 }, { "epoch": 4.615591332176972, "grad_norm": 2.021228283410892e-05, "learning_rate": 3.579976471442714e-08, "loss": 0.0, "num_input_tokens_seen": 127299624, "step": 188930 }, { "epoch": 4.615713483008819, "grad_norm": 0.0012440208811312914, "learning_rate": 3.577715471277176e-08, "loss": 0.0, "num_input_tokens_seen": 127303272, "step": 188935 }, { "epoch": 4.615835633840667, "grad_norm": 0.000826433242764324, "learning_rate": 3.5754551723204404e-08, "loss": 0.0001, "num_input_tokens_seen": 127307048, "step": 188940 }, { "epoch": 4.6159577846725135, "grad_norm": 3.18732563755475e-05, "learning_rate": 3.573195574588917e-08, "loss": 0.0, "num_input_tokens_seen": 127310568, "step": 188945 }, { "epoch": 4.616079935504361, "grad_norm": 3.4316594792471733e-06, "learning_rate": 3.570936678099046e-08, "loss": 0.0, "num_input_tokens_seen": 127314280, "step": 188950 }, { "epoch": 4.616202086336208, "grad_norm": 0.0002651728573255241, "learning_rate": 3.568678482867271e-08, "loss": 0.0, "num_input_tokens_seen": 127318056, "step": 188955 }, { "epoch": 4.616324237168055, "grad_norm": 0.00045246246736496687, "learning_rate": 3.5664209889099904e-08, "loss": 0.0383, "num_input_tokens_seen": 127321768, "step": 188960 }, { "epoch": 4.616446387999902, "grad_norm": 0.0002699349424801767, "learning_rate": 3.564164196243658e-08, "loss": 0.0, "num_input_tokens_seen": 127325352, "step": 188965 }, { "epoch": 4.61656853883175, "grad_norm": 0.003011636668816209, "learning_rate": 3.5619081048846364e-08, "loss": 0.0, "num_input_tokens_seen": 127328616, "step": 188970 }, { "epoch": 4.616690689663597, "grad_norm": 0.00096094113541767, "learning_rate": 3.559652714849359e-08, "loss": 0.0, "num_input_tokens_seen": 127331816, "step": 188975 }, { "epoch": 4.616812840495443, "grad_norm": 6.9423522290890105e-06, "learning_rate": 3.5573980261542345e-08, "loss": 0.0, "num_input_tokens_seen": 127334952, "step": 188980 }, { "epoch": 4.616934991327291, "grad_norm": 0.000394241651520133, "learning_rate": 3.5551440388156494e-08, "loss": 0.0, "num_input_tokens_seen": 127338344, "step": 188985 }, { "epoch": 4.617057142159138, "grad_norm": 0.0008912490447983146, "learning_rate": 3.552890752850002e-08, "loss": 0.0, "num_input_tokens_seen": 127341288, "step": 188990 }, { "epoch": 4.617179292990985, "grad_norm": 0.0007487831171602011, "learning_rate": 3.550638168273667e-08, "loss": 0.0, "num_input_tokens_seen": 127344744, "step": 188995 }, { "epoch": 4.617301443822832, "grad_norm": 3.8883234083186835e-05, "learning_rate": 3.5483862851030444e-08, "loss": 0.0, "num_input_tokens_seen": 127347944, "step": 189000 }, { "epoch": 4.61742359465468, "grad_norm": 0.000263658759649843, "learning_rate": 3.546135103354486e-08, "loss": 0.0, "num_input_tokens_seen": 127351208, "step": 189005 }, { "epoch": 4.6175457454865265, "grad_norm": 0.002318422310054302, "learning_rate": 3.54388462304438e-08, "loss": 0.0001, "num_input_tokens_seen": 127354344, "step": 189010 }, { "epoch": 4.617667896318374, "grad_norm": 5.154572136234492e-05, "learning_rate": 3.54163484418909e-08, "loss": 0.0, "num_input_tokens_seen": 127357416, "step": 189015 }, { "epoch": 4.617790047150221, "grad_norm": 0.00021221258793957531, "learning_rate": 3.53938576680497e-08, "loss": 0.0, "num_input_tokens_seen": 127360488, "step": 189020 }, { "epoch": 4.6179121979820685, "grad_norm": 0.00020391370344441384, "learning_rate": 3.5371373909083956e-08, "loss": 0.0, "num_input_tokens_seen": 127363816, "step": 189025 }, { "epoch": 4.618034348813915, "grad_norm": 2.5762474251678213e-05, "learning_rate": 3.534889716515688e-08, "loss": 0.0, "num_input_tokens_seen": 127367144, "step": 189030 }, { "epoch": 4.618156499645763, "grad_norm": 6.636412763327826e-06, "learning_rate": 3.532642743643221e-08, "loss": 0.0, "num_input_tokens_seen": 127369960, "step": 189035 }, { "epoch": 4.61827865047761, "grad_norm": 6.774356006644666e-05, "learning_rate": 3.5303964723073174e-08, "loss": 0.0, "num_input_tokens_seen": 127374248, "step": 189040 }, { "epoch": 4.618400801309457, "grad_norm": 0.00014490798639599234, "learning_rate": 3.528150902524319e-08, "loss": 0.0, "num_input_tokens_seen": 127377256, "step": 189045 }, { "epoch": 4.618522952141304, "grad_norm": 3.6679371987702325e-05, "learning_rate": 3.525906034310555e-08, "loss": 0.0, "num_input_tokens_seen": 127380904, "step": 189050 }, { "epoch": 4.618645102973151, "grad_norm": 9.35463685891591e-05, "learning_rate": 3.523661867682348e-08, "loss": 0.0, "num_input_tokens_seen": 127384424, "step": 189055 }, { "epoch": 4.618767253804998, "grad_norm": 0.002642608480527997, "learning_rate": 3.521418402656029e-08, "loss": 0.0, "num_input_tokens_seen": 127387496, "step": 189060 }, { "epoch": 4.618889404636846, "grad_norm": 1.5124430319701787e-05, "learning_rate": 3.519175639247895e-08, "loss": 0.0, "num_input_tokens_seen": 127390632, "step": 189065 }, { "epoch": 4.619011555468693, "grad_norm": 0.0014844568213447928, "learning_rate": 3.516933577474257e-08, "loss": 0.0, "num_input_tokens_seen": 127393640, "step": 189070 }, { "epoch": 4.6191337063005395, "grad_norm": 0.0023881462402641773, "learning_rate": 3.514692217351456e-08, "loss": 0.0008, "num_input_tokens_seen": 127396840, "step": 189075 }, { "epoch": 4.619255857132387, "grad_norm": 5.589219290413894e-05, "learning_rate": 3.512451558895735e-08, "loss": 0.0, "num_input_tokens_seen": 127400616, "step": 189080 }, { "epoch": 4.619378007964234, "grad_norm": 0.0317891500890255, "learning_rate": 3.510211602123447e-08, "loss": 0.0, "num_input_tokens_seen": 127403880, "step": 189085 }, { "epoch": 4.619500158796082, "grad_norm": 0.00031270348699763417, "learning_rate": 3.507972347050825e-08, "loss": 0.0, "num_input_tokens_seen": 127407464, "step": 189090 }, { "epoch": 4.619622309627928, "grad_norm": 0.12114350497722626, "learning_rate": 3.50573379369421e-08, "loss": 0.0, "num_input_tokens_seen": 127410728, "step": 189095 }, { "epoch": 4.619744460459776, "grad_norm": 0.0023343341890722513, "learning_rate": 3.503495942069834e-08, "loss": 0.0, "num_input_tokens_seen": 127413992, "step": 189100 }, { "epoch": 4.619866611291623, "grad_norm": 4.485997123993002e-05, "learning_rate": 3.501258792193984e-08, "loss": 0.0, "num_input_tokens_seen": 127417064, "step": 189105 }, { "epoch": 4.61998876212347, "grad_norm": 8.163870006683283e-06, "learning_rate": 3.499022344082947e-08, "loss": 0.0, "num_input_tokens_seen": 127420392, "step": 189110 }, { "epoch": 4.620110912955317, "grad_norm": 0.04615478590130806, "learning_rate": 3.4967865977529655e-08, "loss": 0.0, "num_input_tokens_seen": 127423784, "step": 189115 }, { "epoch": 4.620233063787165, "grad_norm": 0.0004512005252763629, "learning_rate": 3.494551553220326e-08, "loss": 0.0, "num_input_tokens_seen": 127426600, "step": 189120 }, { "epoch": 4.620355214619011, "grad_norm": 0.0012671244330704212, "learning_rate": 3.49231721050125e-08, "loss": 0.0, "num_input_tokens_seen": 127430056, "step": 189125 }, { "epoch": 4.620477365450859, "grad_norm": 8.45419563120231e-05, "learning_rate": 3.490083569612001e-08, "loss": 0.0, "num_input_tokens_seen": 127433384, "step": 189130 }, { "epoch": 4.620599516282706, "grad_norm": 2.610344745335169e-05, "learning_rate": 3.487850630568834e-08, "loss": 0.0, "num_input_tokens_seen": 127436776, "step": 189135 }, { "epoch": 4.6207216671145535, "grad_norm": 0.0002742982469499111, "learning_rate": 3.4856183933879566e-08, "loss": 0.0, "num_input_tokens_seen": 127440488, "step": 189140 }, { "epoch": 4.6208438179464, "grad_norm": 0.0003484401968307793, "learning_rate": 3.483386858085646e-08, "loss": 0.0, "num_input_tokens_seen": 127443880, "step": 189145 }, { "epoch": 4.620965968778247, "grad_norm": 0.0003409306809771806, "learning_rate": 3.481156024678089e-08, "loss": 0.0, "num_input_tokens_seen": 127447016, "step": 189150 }, { "epoch": 4.621088119610095, "grad_norm": 0.0019779300782829523, "learning_rate": 3.478925893181528e-08, "loss": 0.0, "num_input_tokens_seen": 127450536, "step": 189155 }, { "epoch": 4.621210270441941, "grad_norm": 0.002628277288749814, "learning_rate": 3.4766964636121834e-08, "loss": 0.0, "num_input_tokens_seen": 127454824, "step": 189160 }, { "epoch": 4.621332421273789, "grad_norm": 0.0024878974072635174, "learning_rate": 3.474467735986264e-08, "loss": 0.0, "num_input_tokens_seen": 127458216, "step": 189165 }, { "epoch": 4.621454572105636, "grad_norm": 0.0001902437797980383, "learning_rate": 3.4722397103199797e-08, "loss": 0.0, "num_input_tokens_seen": 127461672, "step": 189170 }, { "epoch": 4.621576722937483, "grad_norm": 0.001445115078240633, "learning_rate": 3.4700123866295174e-08, "loss": 0.0, "num_input_tokens_seen": 127464872, "step": 189175 }, { "epoch": 4.62169887376933, "grad_norm": 6.11757131991908e-05, "learning_rate": 3.4677857649311084e-08, "loss": 0.0, "num_input_tokens_seen": 127469992, "step": 189180 }, { "epoch": 4.621821024601178, "grad_norm": 0.0021974225528538227, "learning_rate": 3.4655598452409066e-08, "loss": 0.0, "num_input_tokens_seen": 127473448, "step": 189185 }, { "epoch": 4.6219431754330245, "grad_norm": 0.002855875762179494, "learning_rate": 3.4633346275751206e-08, "loss": 0.0, "num_input_tokens_seen": 127476904, "step": 189190 }, { "epoch": 4.622065326264872, "grad_norm": 9.0890156570822e-05, "learning_rate": 3.461110111949939e-08, "loss": 0.0, "num_input_tokens_seen": 127480168, "step": 189195 }, { "epoch": 4.622187477096719, "grad_norm": 0.000494190666358918, "learning_rate": 3.458886298381525e-08, "loss": 0.0, "num_input_tokens_seen": 127483944, "step": 189200 }, { "epoch": 4.6223096279285665, "grad_norm": 0.001285312115214765, "learning_rate": 3.456663186886055e-08, "loss": 0.0, "num_input_tokens_seen": 127487080, "step": 189205 }, { "epoch": 4.622431778760413, "grad_norm": 0.0009309606975875795, "learning_rate": 3.454440777479695e-08, "loss": 0.0, "num_input_tokens_seen": 127489896, "step": 189210 }, { "epoch": 4.622553929592261, "grad_norm": 0.02289321832358837, "learning_rate": 3.45221907017863e-08, "loss": 0.0, "num_input_tokens_seen": 127492840, "step": 189215 }, { "epoch": 4.622676080424108, "grad_norm": 5.375369073590264e-05, "learning_rate": 3.4499980649989716e-08, "loss": 0.0, "num_input_tokens_seen": 127496168, "step": 189220 }, { "epoch": 4.622798231255955, "grad_norm": 3.526311047608033e-06, "learning_rate": 3.447777761956894e-08, "loss": 0.0343, "num_input_tokens_seen": 127499560, "step": 189225 }, { "epoch": 4.622920382087802, "grad_norm": 7.114775507943705e-05, "learning_rate": 3.445558161068574e-08, "loss": 0.0, "num_input_tokens_seen": 127502696, "step": 189230 }, { "epoch": 4.62304253291965, "grad_norm": 0.0013356233248487115, "learning_rate": 3.443339262350098e-08, "loss": 0.0, "num_input_tokens_seen": 127506216, "step": 189235 }, { "epoch": 4.623164683751496, "grad_norm": 9.928596409736201e-05, "learning_rate": 3.441121065817643e-08, "loss": 0.0004, "num_input_tokens_seen": 127510056, "step": 189240 }, { "epoch": 4.623286834583343, "grad_norm": 0.004730502609163523, "learning_rate": 3.438903571487317e-08, "loss": 0.0, "num_input_tokens_seen": 127514280, "step": 189245 }, { "epoch": 4.623408985415191, "grad_norm": 0.039514582604169846, "learning_rate": 3.4366867793752755e-08, "loss": 0.0001, "num_input_tokens_seen": 127517864, "step": 189250 }, { "epoch": 4.6235311362470375, "grad_norm": 0.00017600138380657881, "learning_rate": 3.434470689497615e-08, "loss": 0.0, "num_input_tokens_seen": 127521256, "step": 189255 }, { "epoch": 4.623653287078885, "grad_norm": 6.132919224910438e-05, "learning_rate": 3.432255301870435e-08, "loss": 0.0, "num_input_tokens_seen": 127524584, "step": 189260 }, { "epoch": 4.623775437910732, "grad_norm": 0.00012273050379008055, "learning_rate": 3.4300406165098884e-08, "loss": 0.0, "num_input_tokens_seen": 127528040, "step": 189265 }, { "epoch": 4.6238975887425795, "grad_norm": 0.012543193064630032, "learning_rate": 3.427826633432051e-08, "loss": 0.0, "num_input_tokens_seen": 127531112, "step": 189270 }, { "epoch": 4.624019739574426, "grad_norm": 0.0007053225999698043, "learning_rate": 3.425613352653045e-08, "loss": 0.0, "num_input_tokens_seen": 127534440, "step": 189275 }, { "epoch": 4.624141890406274, "grad_norm": 0.00041072419844567776, "learning_rate": 3.423400774188945e-08, "loss": 0.0, "num_input_tokens_seen": 127538024, "step": 189280 }, { "epoch": 4.624264041238121, "grad_norm": 0.0010990831069648266, "learning_rate": 3.421188898055838e-08, "loss": 0.0001, "num_input_tokens_seen": 127541224, "step": 189285 }, { "epoch": 4.624386192069968, "grad_norm": 0.000811230216640979, "learning_rate": 3.4189777242698447e-08, "loss": 0.0001, "num_input_tokens_seen": 127544808, "step": 189290 }, { "epoch": 4.624508342901815, "grad_norm": 0.00042687845416367054, "learning_rate": 3.416767252847008e-08, "loss": 0.0, "num_input_tokens_seen": 127548328, "step": 189295 }, { "epoch": 4.624630493733663, "grad_norm": 0.0013424467761069536, "learning_rate": 3.4145574838034264e-08, "loss": 0.0, "num_input_tokens_seen": 127551464, "step": 189300 }, { "epoch": 4.624752644565509, "grad_norm": 0.00041963253170251846, "learning_rate": 3.412348417155153e-08, "loss": 0.0, "num_input_tokens_seen": 127554792, "step": 189305 }, { "epoch": 4.624874795397357, "grad_norm": 3.8160214899107814e-05, "learning_rate": 3.410140052918275e-08, "loss": 0.0, "num_input_tokens_seen": 127558056, "step": 189310 }, { "epoch": 4.624996946229204, "grad_norm": 3.0968370992923155e-05, "learning_rate": 3.4079323911088256e-08, "loss": 0.0, "num_input_tokens_seen": 127561064, "step": 189315 }, { "epoch": 4.6251190970610505, "grad_norm": 2.0047538782819174e-05, "learning_rate": 3.4057254317428674e-08, "loss": 0.0, "num_input_tokens_seen": 127564200, "step": 189320 }, { "epoch": 4.625241247892898, "grad_norm": 0.0002797935449052602, "learning_rate": 3.4035191748364665e-08, "loss": 0.0, "num_input_tokens_seen": 127567400, "step": 189325 }, { "epoch": 4.625363398724746, "grad_norm": 0.0034103060606867075, "learning_rate": 3.4013136204056436e-08, "loss": 0.0, "num_input_tokens_seen": 127570600, "step": 189330 }, { "epoch": 4.6254855495565925, "grad_norm": 6.119706085883081e-05, "learning_rate": 3.3991087684664634e-08, "loss": 0.0, "num_input_tokens_seen": 127574312, "step": 189335 }, { "epoch": 4.625607700388439, "grad_norm": 0.0007159013766795397, "learning_rate": 3.3969046190349346e-08, "loss": 0.0, "num_input_tokens_seen": 127577704, "step": 189340 }, { "epoch": 4.625729851220287, "grad_norm": 0.00033408793387934566, "learning_rate": 3.3947011721271126e-08, "loss": 0.0, "num_input_tokens_seen": 127581352, "step": 189345 }, { "epoch": 4.625852002052134, "grad_norm": 2.0137898900429718e-05, "learning_rate": 3.392498427758994e-08, "loss": 0.0, "num_input_tokens_seen": 127585000, "step": 189350 }, { "epoch": 4.625974152883981, "grad_norm": 4.836769949179143e-05, "learning_rate": 3.390296385946623e-08, "loss": 0.0, "num_input_tokens_seen": 127588136, "step": 189355 }, { "epoch": 4.626096303715828, "grad_norm": 3.60465855919756e-05, "learning_rate": 3.3880950467059964e-08, "loss": 0.0, "num_input_tokens_seen": 127591336, "step": 189360 }, { "epoch": 4.626218454547676, "grad_norm": 0.00076403992716223, "learning_rate": 3.385894410053125e-08, "loss": 0.0, "num_input_tokens_seen": 127594856, "step": 189365 }, { "epoch": 4.626340605379522, "grad_norm": 0.00041453863377682865, "learning_rate": 3.383694476004018e-08, "loss": 0.0, "num_input_tokens_seen": 127597992, "step": 189370 }, { "epoch": 4.62646275621137, "grad_norm": 0.00014004397962708026, "learning_rate": 3.381495244574673e-08, "loss": 0.0, "num_input_tokens_seen": 127601000, "step": 189375 }, { "epoch": 4.626584907043217, "grad_norm": 0.0023981884587556124, "learning_rate": 3.379296715781066e-08, "loss": 0.0, "num_input_tokens_seen": 127604328, "step": 189380 }, { "epoch": 4.626707057875064, "grad_norm": 0.0008847627905197442, "learning_rate": 3.377098889639229e-08, "loss": 0.0, "num_input_tokens_seen": 127607336, "step": 189385 }, { "epoch": 4.626829208706911, "grad_norm": 0.00041764331399463117, "learning_rate": 3.374901766165095e-08, "loss": 0.0, "num_input_tokens_seen": 127610728, "step": 189390 }, { "epoch": 4.626951359538759, "grad_norm": 4.636571611627005e-05, "learning_rate": 3.372705345374671e-08, "loss": 0.0001, "num_input_tokens_seen": 127613928, "step": 189395 }, { "epoch": 4.6270735103706055, "grad_norm": 0.00011620017903624102, "learning_rate": 3.370509627283924e-08, "loss": 0.0, "num_input_tokens_seen": 127616936, "step": 189400 }, { "epoch": 4.627195661202453, "grad_norm": 0.00018765301501844078, "learning_rate": 3.368314611908829e-08, "loss": 0.0, "num_input_tokens_seen": 127620200, "step": 189405 }, { "epoch": 4.6273178120343, "grad_norm": 0.0007750758086331189, "learning_rate": 3.3661202992653294e-08, "loss": 0.0, "num_input_tokens_seen": 127623400, "step": 189410 }, { "epoch": 4.627439962866147, "grad_norm": 3.3744108804967254e-05, "learning_rate": 3.3639266893693894e-08, "loss": 0.0, "num_input_tokens_seen": 127626664, "step": 189415 }, { "epoch": 4.627562113697994, "grad_norm": 0.0006838897825218737, "learning_rate": 3.361733782236986e-08, "loss": 0.0, "num_input_tokens_seen": 127629864, "step": 189420 }, { "epoch": 4.627684264529842, "grad_norm": 0.0018917579436674714, "learning_rate": 3.3595415778840284e-08, "loss": 0.0, "num_input_tokens_seen": 127633512, "step": 189425 }, { "epoch": 4.627806415361689, "grad_norm": 2.9165235901018605e-05, "learning_rate": 3.357350076326493e-08, "loss": 0.0, "num_input_tokens_seen": 127636840, "step": 189430 }, { "epoch": 4.627928566193535, "grad_norm": 0.0006560211186297238, "learning_rate": 3.355159277580289e-08, "loss": 0.0, "num_input_tokens_seen": 127640680, "step": 189435 }, { "epoch": 4.628050717025383, "grad_norm": 0.00021043684682808816, "learning_rate": 3.352969181661358e-08, "loss": 0.0, "num_input_tokens_seen": 127644072, "step": 189440 }, { "epoch": 4.62817286785723, "grad_norm": 0.00036051214556209743, "learning_rate": 3.350779788585645e-08, "loss": 0.0, "num_input_tokens_seen": 127647336, "step": 189445 }, { "epoch": 4.6282950186890774, "grad_norm": 0.0009198451298289001, "learning_rate": 3.348591098369047e-08, "loss": 0.0, "num_input_tokens_seen": 127651048, "step": 189450 }, { "epoch": 4.628417169520924, "grad_norm": 1.5009105482022278e-05, "learning_rate": 3.346403111027507e-08, "loss": 0.0, "num_input_tokens_seen": 127654312, "step": 189455 }, { "epoch": 4.628539320352772, "grad_norm": 0.00016384219634346664, "learning_rate": 3.344215826576913e-08, "loss": 0.0, "num_input_tokens_seen": 127657320, "step": 189460 }, { "epoch": 4.628661471184619, "grad_norm": 0.00019981464720331132, "learning_rate": 3.342029245033162e-08, "loss": 0.0, "num_input_tokens_seen": 127660712, "step": 189465 }, { "epoch": 4.628783622016466, "grad_norm": 2.0608813429134898e-05, "learning_rate": 3.339843366412198e-08, "loss": 0.0, "num_input_tokens_seen": 127664296, "step": 189470 }, { "epoch": 4.628905772848313, "grad_norm": 15.964238166809082, "learning_rate": 3.337658190729864e-08, "loss": 0.0313, "num_input_tokens_seen": 127667432, "step": 189475 }, { "epoch": 4.629027923680161, "grad_norm": 0.0016988354036584496, "learning_rate": 3.335473718002102e-08, "loss": 0.0, "num_input_tokens_seen": 127670568, "step": 189480 }, { "epoch": 4.629150074512007, "grad_norm": 2.7631498596747406e-05, "learning_rate": 3.333289948244755e-08, "loss": 0.0, "num_input_tokens_seen": 127674472, "step": 189485 }, { "epoch": 4.629272225343855, "grad_norm": 8.974929369287565e-05, "learning_rate": 3.331106881473744e-08, "loss": 0.0, "num_input_tokens_seen": 127677736, "step": 189490 }, { "epoch": 4.629394376175702, "grad_norm": 3.777238089242019e-05, "learning_rate": 3.328924517704912e-08, "loss": 0.0631, "num_input_tokens_seen": 127681128, "step": 189495 }, { "epoch": 4.629516527007549, "grad_norm": 0.0008741427445784211, "learning_rate": 3.326742856954135e-08, "loss": 0.0, "num_input_tokens_seen": 127684072, "step": 189500 }, { "epoch": 4.629638677839396, "grad_norm": 0.0001225506712216884, "learning_rate": 3.324561899237299e-08, "loss": 0.0, "num_input_tokens_seen": 127687208, "step": 189505 }, { "epoch": 4.629760828671243, "grad_norm": 0.0024218896869570017, "learning_rate": 3.322381644570238e-08, "loss": 0.0, "num_input_tokens_seen": 127690600, "step": 189510 }, { "epoch": 4.6298829795030905, "grad_norm": 0.0008322626817971468, "learning_rate": 3.3202020929688376e-08, "loss": 0.0, "num_input_tokens_seen": 127694120, "step": 189515 }, { "epoch": 4.630005130334937, "grad_norm": 6.55809126328677e-05, "learning_rate": 3.31802324444892e-08, "loss": 0.0, "num_input_tokens_seen": 127697128, "step": 189520 }, { "epoch": 4.630127281166785, "grad_norm": 0.0007595704519189894, "learning_rate": 3.31584509902636e-08, "loss": 0.0, "num_input_tokens_seen": 127700456, "step": 189525 }, { "epoch": 4.630249431998632, "grad_norm": 0.0009916751878336072, "learning_rate": 3.3136676567169566e-08, "loss": 0.0, "num_input_tokens_seen": 127703848, "step": 189530 }, { "epoch": 4.630371582830479, "grad_norm": 0.000650413625407964, "learning_rate": 3.3114909175365635e-08, "loss": 0.0, "num_input_tokens_seen": 127707240, "step": 189535 }, { "epoch": 4.630493733662326, "grad_norm": 0.00022649105812888592, "learning_rate": 3.3093148815010355e-08, "loss": 0.0, "num_input_tokens_seen": 127711208, "step": 189540 }, { "epoch": 4.630615884494174, "grad_norm": 0.004471167456358671, "learning_rate": 3.30713954862617e-08, "loss": 0.0, "num_input_tokens_seen": 127714664, "step": 189545 }, { "epoch": 4.63073803532602, "grad_norm": 31.14780616760254, "learning_rate": 3.304964918927788e-08, "loss": 0.0619, "num_input_tokens_seen": 127718120, "step": 189550 }, { "epoch": 4.630860186157868, "grad_norm": 0.03578144684433937, "learning_rate": 3.3027909924217114e-08, "loss": 0.0, "num_input_tokens_seen": 127722088, "step": 189555 }, { "epoch": 4.630982336989715, "grad_norm": 0.0008296905434690416, "learning_rate": 3.300617769123748e-08, "loss": 0.0, "num_input_tokens_seen": 127725416, "step": 189560 }, { "epoch": 4.631104487821562, "grad_norm": 0.00012225030513945967, "learning_rate": 3.2984452490497084e-08, "loss": 0.0, "num_input_tokens_seen": 127728232, "step": 189565 }, { "epoch": 4.631226638653409, "grad_norm": 0.0011644281912595034, "learning_rate": 3.29627343221538e-08, "loss": 0.0, "num_input_tokens_seen": 127731496, "step": 189570 }, { "epoch": 4.631348789485257, "grad_norm": 0.00025070918491110206, "learning_rate": 3.29410231863656e-08, "loss": 0.0, "num_input_tokens_seen": 127734440, "step": 189575 }, { "epoch": 4.6314709403171035, "grad_norm": 0.0001868014078354463, "learning_rate": 3.291931908329026e-08, "loss": 0.0, "num_input_tokens_seen": 127737832, "step": 189580 }, { "epoch": 4.63159309114895, "grad_norm": 3.0223734938772395e-05, "learning_rate": 3.289762201308599e-08, "loss": 0.0, "num_input_tokens_seen": 127741480, "step": 189585 }, { "epoch": 4.631715241980798, "grad_norm": 7.385831122519448e-05, "learning_rate": 3.28759319759101e-08, "loss": 0.0, "num_input_tokens_seen": 127744424, "step": 189590 }, { "epoch": 4.6318373928126455, "grad_norm": 25.03947639465332, "learning_rate": 3.2854248971920574e-08, "loss": 0.0584, "num_input_tokens_seen": 127747304, "step": 189595 }, { "epoch": 4.631959543644492, "grad_norm": 5.329222767613828e-05, "learning_rate": 3.283257300127529e-08, "loss": 0.0, "num_input_tokens_seen": 127750376, "step": 189600 }, { "epoch": 4.632081694476339, "grad_norm": 0.0017787275137379766, "learning_rate": 3.281090406413145e-08, "loss": 0.0, "num_input_tokens_seen": 127754152, "step": 189605 }, { "epoch": 4.632203845308187, "grad_norm": 0.00028210910386405885, "learning_rate": 3.2789242160647046e-08, "loss": 0.0, "num_input_tokens_seen": 127757480, "step": 189610 }, { "epoch": 4.632325996140033, "grad_norm": 2.0327272068243474e-05, "learning_rate": 3.276758729097928e-08, "loss": 0.0, "num_input_tokens_seen": 127760680, "step": 189615 }, { "epoch": 4.632448146971881, "grad_norm": 0.0010251597268506885, "learning_rate": 3.274593945528581e-08, "loss": 0.0, "num_input_tokens_seen": 127763880, "step": 189620 }, { "epoch": 4.632570297803728, "grad_norm": 3.72401082131546e-05, "learning_rate": 3.272429865372406e-08, "loss": 0.0, "num_input_tokens_seen": 127767592, "step": 189625 }, { "epoch": 4.632692448635575, "grad_norm": 4.352164251031354e-05, "learning_rate": 3.270266488645124e-08, "loss": 0.0, "num_input_tokens_seen": 127770664, "step": 189630 }, { "epoch": 4.632814599467422, "grad_norm": 0.002941213781014085, "learning_rate": 3.2681038153624995e-08, "loss": 0.0, "num_input_tokens_seen": 127773736, "step": 189635 }, { "epoch": 4.63293675029927, "grad_norm": 0.0002367892739130184, "learning_rate": 3.265941845540232e-08, "loss": 0.0, "num_input_tokens_seen": 127777128, "step": 189640 }, { "epoch": 4.6330589011311165, "grad_norm": 0.00020610762294381857, "learning_rate": 3.2637805791940645e-08, "loss": 0.0, "num_input_tokens_seen": 127780648, "step": 189645 }, { "epoch": 4.633181051962964, "grad_norm": 7.007142994552851e-05, "learning_rate": 3.2616200163396834e-08, "loss": 0.0, "num_input_tokens_seen": 127784360, "step": 189650 }, { "epoch": 4.633303202794811, "grad_norm": 0.025431230664253235, "learning_rate": 3.259460156992844e-08, "loss": 0.0001, "num_input_tokens_seen": 127787496, "step": 189655 }, { "epoch": 4.6334253536266585, "grad_norm": 4.163160701864399e-05, "learning_rate": 3.2573010011692105e-08, "loss": 0.0003, "num_input_tokens_seen": 127790504, "step": 189660 }, { "epoch": 4.633547504458505, "grad_norm": 2.1552707039518282e-05, "learning_rate": 3.2551425488845264e-08, "loss": 0.0, "num_input_tokens_seen": 127793960, "step": 189665 }, { "epoch": 4.633669655290353, "grad_norm": 0.0002611360978335142, "learning_rate": 3.252984800154446e-08, "loss": 0.0, "num_input_tokens_seen": 127797736, "step": 189670 }, { "epoch": 4.6337918061222, "grad_norm": 1.6607929865131155e-05, "learning_rate": 3.250827754994701e-08, "loss": 0.0, "num_input_tokens_seen": 127801320, "step": 189675 }, { "epoch": 4.633913956954046, "grad_norm": 0.0010899596381932497, "learning_rate": 3.248671413420956e-08, "loss": 0.0, "num_input_tokens_seen": 127804584, "step": 189680 }, { "epoch": 4.634036107785894, "grad_norm": 3.3614189305808395e-05, "learning_rate": 3.246515775448877e-08, "loss": 0.0, "num_input_tokens_seen": 127807784, "step": 189685 }, { "epoch": 4.634158258617742, "grad_norm": 0.00045298977056518197, "learning_rate": 3.2443608410941624e-08, "loss": 0.0, "num_input_tokens_seen": 127810920, "step": 189690 }, { "epoch": 4.634280409449588, "grad_norm": 0.00048405147390440106, "learning_rate": 3.2422066103725e-08, "loss": 0.0, "num_input_tokens_seen": 127814440, "step": 189695 }, { "epoch": 4.634402560281435, "grad_norm": 0.0020768088288605213, "learning_rate": 3.24005308329951e-08, "loss": 0.0, "num_input_tokens_seen": 127817832, "step": 189700 }, { "epoch": 4.634524711113283, "grad_norm": 0.0004733783716801554, "learning_rate": 3.237900259890902e-08, "loss": 0.0183, "num_input_tokens_seen": 127820968, "step": 189705 }, { "epoch": 4.6346468619451295, "grad_norm": 0.03913038596510887, "learning_rate": 3.2357481401622976e-08, "loss": 0.0, "num_input_tokens_seen": 127824232, "step": 189710 }, { "epoch": 4.634769012776977, "grad_norm": 0.00033711790456436574, "learning_rate": 3.233596724129361e-08, "loss": 0.0, "num_input_tokens_seen": 127827368, "step": 189715 }, { "epoch": 4.634891163608824, "grad_norm": 0.0015374831855297089, "learning_rate": 3.231446011807737e-08, "loss": 0.0, "num_input_tokens_seen": 127830760, "step": 189720 }, { "epoch": 4.6350133144406716, "grad_norm": 0.00012308296572882682, "learning_rate": 3.229296003213056e-08, "loss": 0.0, "num_input_tokens_seen": 127834024, "step": 189725 }, { "epoch": 4.635135465272518, "grad_norm": 5.056528971181251e-05, "learning_rate": 3.227146698360983e-08, "loss": 0.0, "num_input_tokens_seen": 127837352, "step": 189730 }, { "epoch": 4.635257616104366, "grad_norm": 9.410569327883422e-05, "learning_rate": 3.224998097267106e-08, "loss": 0.0, "num_input_tokens_seen": 127841064, "step": 189735 }, { "epoch": 4.635379766936213, "grad_norm": 0.0031454835552722216, "learning_rate": 3.222850199947091e-08, "loss": 0.0, "num_input_tokens_seen": 127844264, "step": 189740 }, { "epoch": 4.63550191776806, "grad_norm": 0.018755445256829262, "learning_rate": 3.2207030064165255e-08, "loss": 0.0, "num_input_tokens_seen": 127847656, "step": 189745 }, { "epoch": 4.635624068599907, "grad_norm": 0.000981925637461245, "learning_rate": 3.21855651669104e-08, "loss": 0.0, "num_input_tokens_seen": 127850920, "step": 189750 }, { "epoch": 4.635746219431755, "grad_norm": 0.0001301287265960127, "learning_rate": 3.2164107307862456e-08, "loss": 0.0, "num_input_tokens_seen": 127854632, "step": 189755 }, { "epoch": 4.635868370263601, "grad_norm": 1.917783447424881e-05, "learning_rate": 3.21426564871774e-08, "loss": 0.0, "num_input_tokens_seen": 127857896, "step": 189760 }, { "epoch": 4.635990521095449, "grad_norm": 2.9853536034352146e-05, "learning_rate": 3.212121270501134e-08, "loss": 0.0, "num_input_tokens_seen": 127861416, "step": 189765 }, { "epoch": 4.636112671927296, "grad_norm": 0.0005437976797111332, "learning_rate": 3.209977596152025e-08, "loss": 0.0002, "num_input_tokens_seen": 127864616, "step": 189770 }, { "epoch": 4.636234822759143, "grad_norm": 3.973076672991738e-05, "learning_rate": 3.207834625685968e-08, "loss": 0.0, "num_input_tokens_seen": 127867816, "step": 189775 }, { "epoch": 4.63635697359099, "grad_norm": 0.00019530528516042978, "learning_rate": 3.205692359118595e-08, "loss": 0.0, "num_input_tokens_seen": 127871464, "step": 189780 }, { "epoch": 4.636479124422837, "grad_norm": 0.00014822077355347574, "learning_rate": 3.203550796465437e-08, "loss": 0.0, "num_input_tokens_seen": 127874664, "step": 189785 }, { "epoch": 4.636601275254685, "grad_norm": 9.349855099571869e-05, "learning_rate": 3.2014099377421165e-08, "loss": 0.0, "num_input_tokens_seen": 127877864, "step": 189790 }, { "epoch": 4.636723426086531, "grad_norm": 1.5588731912430376e-05, "learning_rate": 3.199269782964165e-08, "loss": 0.0, "num_input_tokens_seen": 127880872, "step": 189795 }, { "epoch": 4.636845576918379, "grad_norm": 0.0001132964389398694, "learning_rate": 3.197130332147169e-08, "loss": 0.0, "num_input_tokens_seen": 127884328, "step": 189800 }, { "epoch": 4.636967727750226, "grad_norm": 4.8849913582671434e-05, "learning_rate": 3.194991585306661e-08, "loss": 0.0, "num_input_tokens_seen": 127887528, "step": 189805 }, { "epoch": 4.637089878582073, "grad_norm": 5.665285061695613e-05, "learning_rate": 3.1928535424582294e-08, "loss": 0.0, "num_input_tokens_seen": 127890408, "step": 189810 }, { "epoch": 4.63721202941392, "grad_norm": 0.0005679251626133919, "learning_rate": 3.190716203617394e-08, "loss": 0.0, "num_input_tokens_seen": 127893992, "step": 189815 }, { "epoch": 4.637334180245768, "grad_norm": 0.00044258078560233116, "learning_rate": 3.1885795687997094e-08, "loss": 0.0675, "num_input_tokens_seen": 127897192, "step": 189820 }, { "epoch": 4.6374563310776145, "grad_norm": 0.00010819250019267201, "learning_rate": 3.1864436380207196e-08, "loss": 0.0, "num_input_tokens_seen": 127900520, "step": 189825 }, { "epoch": 4.637578481909462, "grad_norm": 0.0031544826924800873, "learning_rate": 3.1843084112959437e-08, "loss": 0.0, "num_input_tokens_seen": 127904168, "step": 189830 }, { "epoch": 4.637700632741309, "grad_norm": 0.00038662514998577535, "learning_rate": 3.182173888640927e-08, "loss": 0.0, "num_input_tokens_seen": 127907560, "step": 189835 }, { "epoch": 4.6378227835731565, "grad_norm": 0.0006948575028218329, "learning_rate": 3.180040070071166e-08, "loss": 0.0, "num_input_tokens_seen": 127910824, "step": 189840 }, { "epoch": 4.637944934405003, "grad_norm": 0.00039235371514223516, "learning_rate": 3.1779069556022055e-08, "loss": 0.0, "num_input_tokens_seen": 127914152, "step": 189845 }, { "epoch": 4.638067085236851, "grad_norm": 0.00019603042164817452, "learning_rate": 3.175774545249554e-08, "loss": 0.0, "num_input_tokens_seen": 127917416, "step": 189850 }, { "epoch": 4.638189236068698, "grad_norm": 7.47626181691885e-05, "learning_rate": 3.1736428390287005e-08, "loss": 0.0, "num_input_tokens_seen": 127921256, "step": 189855 }, { "epoch": 4.638311386900545, "grad_norm": 0.00021746759011875838, "learning_rate": 3.171511836955176e-08, "loss": 0.0, "num_input_tokens_seen": 127924456, "step": 189860 }, { "epoch": 4.638433537732392, "grad_norm": 0.0006375667289830744, "learning_rate": 3.1693815390444466e-08, "loss": 0.0, "num_input_tokens_seen": 127927848, "step": 189865 }, { "epoch": 4.638555688564239, "grad_norm": 0.001184274209663272, "learning_rate": 3.1672519453120325e-08, "loss": 0.0, "num_input_tokens_seen": 127931368, "step": 189870 }, { "epoch": 4.638677839396086, "grad_norm": 5.584588507190347e-05, "learning_rate": 3.1651230557733996e-08, "loss": 0.0, "num_input_tokens_seen": 127934568, "step": 189875 }, { "epoch": 4.638799990227933, "grad_norm": 0.001166929374448955, "learning_rate": 3.162994870444036e-08, "loss": 0.0, "num_input_tokens_seen": 127937384, "step": 189880 }, { "epoch": 4.638922141059781, "grad_norm": 0.00011990226630587131, "learning_rate": 3.1608673893394164e-08, "loss": 0.0, "num_input_tokens_seen": 127941352, "step": 189885 }, { "epoch": 4.6390442918916275, "grad_norm": 0.0003758897364605218, "learning_rate": 3.158740612475019e-08, "loss": 0.0, "num_input_tokens_seen": 127944424, "step": 189890 }, { "epoch": 4.639166442723475, "grad_norm": 0.0006660557119175792, "learning_rate": 3.1566145398663094e-08, "loss": 0.0, "num_input_tokens_seen": 127948136, "step": 189895 }, { "epoch": 4.639288593555322, "grad_norm": 1.3435801267623901, "learning_rate": 3.1544891715287405e-08, "loss": 0.0007, "num_input_tokens_seen": 127951592, "step": 189900 }, { "epoch": 4.6394107443871695, "grad_norm": 0.000565065536648035, "learning_rate": 3.1523645074777783e-08, "loss": 0.0001, "num_input_tokens_seen": 127954728, "step": 189905 }, { "epoch": 4.639532895219016, "grad_norm": 2.8673955966951326e-05, "learning_rate": 3.150240547728877e-08, "loss": 0.0, "num_input_tokens_seen": 127958696, "step": 189910 }, { "epoch": 4.639655046050864, "grad_norm": 9.768320160219446e-05, "learning_rate": 3.1481172922974584e-08, "loss": 0.0, "num_input_tokens_seen": 127962152, "step": 189915 }, { "epoch": 4.639777196882711, "grad_norm": 0.00023989586043171585, "learning_rate": 3.145994741198998e-08, "loss": 0.0, "num_input_tokens_seen": 127965864, "step": 189920 }, { "epoch": 4.639899347714558, "grad_norm": 1.8430131603963673e-05, "learning_rate": 3.143872894448907e-08, "loss": 0.0, "num_input_tokens_seen": 127969192, "step": 189925 }, { "epoch": 4.640021498546405, "grad_norm": 0.001935247448273003, "learning_rate": 3.141751752062627e-08, "loss": 0.0537, "num_input_tokens_seen": 127972328, "step": 189930 }, { "epoch": 4.640143649378253, "grad_norm": 0.0016746302135288715, "learning_rate": 3.1396313140555684e-08, "loss": 0.0, "num_input_tokens_seen": 127975592, "step": 189935 }, { "epoch": 4.640265800210099, "grad_norm": 0.00011264430213486776, "learning_rate": 3.137511580443175e-08, "loss": 0.0164, "num_input_tokens_seen": 127978984, "step": 189940 }, { "epoch": 4.640387951041946, "grad_norm": 0.0004881844506599009, "learning_rate": 3.135392551240856e-08, "loss": 0.0, "num_input_tokens_seen": 127982312, "step": 189945 }, { "epoch": 4.640510101873794, "grad_norm": 1.6157357094925828e-05, "learning_rate": 3.1332742264639996e-08, "loss": 0.0, "num_input_tokens_seen": 127985704, "step": 189950 }, { "epoch": 4.640632252705641, "grad_norm": 0.0005269379471428692, "learning_rate": 3.131156606128038e-08, "loss": 0.0, "num_input_tokens_seen": 127988712, "step": 189955 }, { "epoch": 4.640754403537488, "grad_norm": 0.0007660355186089873, "learning_rate": 3.129039690248359e-08, "loss": 0.0, "num_input_tokens_seen": 127992296, "step": 189960 }, { "epoch": 4.640876554369335, "grad_norm": 0.00013214911450631917, "learning_rate": 3.12692347884036e-08, "loss": 0.0, "num_input_tokens_seen": 127995496, "step": 189965 }, { "epoch": 4.6409987052011825, "grad_norm": 0.03304639831185341, "learning_rate": 3.124807971919419e-08, "loss": 0.0001, "num_input_tokens_seen": 127998824, "step": 189970 }, { "epoch": 4.641120856033029, "grad_norm": 0.00015475453983526677, "learning_rate": 3.122693169500945e-08, "loss": 0.0, "num_input_tokens_seen": 128002088, "step": 189975 }, { "epoch": 4.641243006864877, "grad_norm": 0.0007058978080749512, "learning_rate": 3.120579071600282e-08, "loss": 0.0, "num_input_tokens_seen": 128005032, "step": 189980 }, { "epoch": 4.641365157696724, "grad_norm": 0.0006571342819370329, "learning_rate": 3.118465678232851e-08, "loss": 0.0, "num_input_tokens_seen": 128008872, "step": 189985 }, { "epoch": 4.641487308528571, "grad_norm": 7.548756457254058e-06, "learning_rate": 3.1163529894139837e-08, "loss": 0.0, "num_input_tokens_seen": 128012200, "step": 189990 }, { "epoch": 4.641609459360418, "grad_norm": 0.0006949577946215868, "learning_rate": 3.1142410051590573e-08, "loss": 0.0, "num_input_tokens_seen": 128015400, "step": 189995 }, { "epoch": 4.641731610192266, "grad_norm": 0.00013040592602919787, "learning_rate": 3.112129725483425e-08, "loss": 0.0, "num_input_tokens_seen": 128018664, "step": 190000 }, { "epoch": 4.641853761024112, "grad_norm": 0.00020232108363416046, "learning_rate": 3.1100191504024545e-08, "loss": 0.0, "num_input_tokens_seen": 128021864, "step": 190005 }, { "epoch": 4.64197591185596, "grad_norm": 2.4844073777785525e-05, "learning_rate": 3.1079092799314757e-08, "loss": 0.0, "num_input_tokens_seen": 128025320, "step": 190010 }, { "epoch": 4.642098062687807, "grad_norm": 0.00045661506010219455, "learning_rate": 3.105800114085844e-08, "loss": 0.0, "num_input_tokens_seen": 128028840, "step": 190015 }, { "epoch": 4.642220213519654, "grad_norm": 0.00022646358411293477, "learning_rate": 3.103691652880891e-08, "loss": 0.0, "num_input_tokens_seen": 128032360, "step": 190020 }, { "epoch": 4.642342364351501, "grad_norm": 0.00022188508592080325, "learning_rate": 3.1015838963319605e-08, "loss": 0.0, "num_input_tokens_seen": 128035560, "step": 190025 }, { "epoch": 4.642464515183349, "grad_norm": 0.0008343648514710367, "learning_rate": 3.099476844454374e-08, "loss": 0.0, "num_input_tokens_seen": 128039208, "step": 190030 }, { "epoch": 4.6425866660151955, "grad_norm": 1.0348808245908003e-05, "learning_rate": 3.0973704972634515e-08, "loss": 0.0, "num_input_tokens_seen": 128042536, "step": 190035 }, { "epoch": 4.642708816847042, "grad_norm": 0.0011595974210649729, "learning_rate": 3.095264854774515e-08, "loss": 0.0, "num_input_tokens_seen": 128045672, "step": 190040 }, { "epoch": 4.64283096767889, "grad_norm": 0.00014588657359126955, "learning_rate": 3.0931599170028745e-08, "loss": 0.0, "num_input_tokens_seen": 128048872, "step": 190045 }, { "epoch": 4.6429531185107376, "grad_norm": 0.0019392999820411205, "learning_rate": 3.0910556839638504e-08, "loss": 0.0, "num_input_tokens_seen": 128052392, "step": 190050 }, { "epoch": 4.643075269342584, "grad_norm": 0.0005591996596194804, "learning_rate": 3.0889521556727304e-08, "loss": 0.0, "num_input_tokens_seen": 128055784, "step": 190055 }, { "epoch": 4.643197420174431, "grad_norm": 0.0012844577431678772, "learning_rate": 3.086849332144803e-08, "loss": 0.0, "num_input_tokens_seen": 128059432, "step": 190060 }, { "epoch": 4.643319571006279, "grad_norm": 6.231262523215264e-05, "learning_rate": 3.084747213395389e-08, "loss": 0.0, "num_input_tokens_seen": 128062312, "step": 190065 }, { "epoch": 4.643441721838125, "grad_norm": 8.152231021085754e-05, "learning_rate": 3.0826457994397533e-08, "loss": 0.0, "num_input_tokens_seen": 128065384, "step": 190070 }, { "epoch": 4.643563872669973, "grad_norm": 0.00023000915825832635, "learning_rate": 3.080545090293196e-08, "loss": 0.0, "num_input_tokens_seen": 128069096, "step": 190075 }, { "epoch": 4.64368602350182, "grad_norm": 0.0004738394927699119, "learning_rate": 3.078445085970982e-08, "loss": 0.0, "num_input_tokens_seen": 128072744, "step": 190080 }, { "epoch": 4.643808174333667, "grad_norm": 0.00019146144040860236, "learning_rate": 3.076345786488377e-08, "loss": 0.0, "num_input_tokens_seen": 128076200, "step": 190085 }, { "epoch": 4.643930325165514, "grad_norm": 0.00027293289895169437, "learning_rate": 3.074247191860657e-08, "loss": 0.0, "num_input_tokens_seen": 128079464, "step": 190090 }, { "epoch": 4.644052475997362, "grad_norm": 1.609481478226371e-05, "learning_rate": 3.072149302103078e-08, "loss": 0.0, "num_input_tokens_seen": 128083240, "step": 190095 }, { "epoch": 4.644174626829209, "grad_norm": 93.06655883789062, "learning_rate": 3.0700521172309035e-08, "loss": 0.0042, "num_input_tokens_seen": 128086824, "step": 190100 }, { "epoch": 4.644296777661056, "grad_norm": 0.00014359374472405761, "learning_rate": 3.067955637259367e-08, "loss": 0.0, "num_input_tokens_seen": 128090472, "step": 190105 }, { "epoch": 4.644418928492903, "grad_norm": 3.9056434616213664e-05, "learning_rate": 3.065859862203746e-08, "loss": 0.0, "num_input_tokens_seen": 128093736, "step": 190110 }, { "epoch": 4.644541079324751, "grad_norm": 0.0011020988458767533, "learning_rate": 3.06376479207926e-08, "loss": 0.0, "num_input_tokens_seen": 128096744, "step": 190115 }, { "epoch": 4.644663230156597, "grad_norm": 0.002124036429449916, "learning_rate": 3.061670426901153e-08, "loss": 0.0, "num_input_tokens_seen": 128099752, "step": 190120 }, { "epoch": 4.644785380988445, "grad_norm": 0.00013197159569244832, "learning_rate": 3.059576766684635e-08, "loss": 0.0, "num_input_tokens_seen": 128102568, "step": 190125 }, { "epoch": 4.644907531820292, "grad_norm": 0.003396212123334408, "learning_rate": 3.0574838114449605e-08, "loss": 0.0, "num_input_tokens_seen": 128105448, "step": 190130 }, { "epoch": 4.6450296826521384, "grad_norm": 0.00010021981870522723, "learning_rate": 3.05539156119734e-08, "loss": 0.0, "num_input_tokens_seen": 128108520, "step": 190135 }, { "epoch": 4.645151833483986, "grad_norm": 0.0007454652222804725, "learning_rate": 3.053300015956983e-08, "loss": 0.0, "num_input_tokens_seen": 128111912, "step": 190140 }, { "epoch": 4.645273984315833, "grad_norm": 0.0032192934304475784, "learning_rate": 3.0512091757391114e-08, "loss": 0.0, "num_input_tokens_seen": 128115240, "step": 190145 }, { "epoch": 4.6453961351476805, "grad_norm": 7.996581553015858e-05, "learning_rate": 3.049119040558912e-08, "loss": 0.0, "num_input_tokens_seen": 128118696, "step": 190150 }, { "epoch": 4.645518285979527, "grad_norm": 0.0012409802293404937, "learning_rate": 3.047029610431595e-08, "loss": 0.0, "num_input_tokens_seen": 128121640, "step": 190155 }, { "epoch": 4.645640436811375, "grad_norm": 0.00021187012316659093, "learning_rate": 3.0449408853723715e-08, "loss": 0.0, "num_input_tokens_seen": 128124904, "step": 190160 }, { "epoch": 4.645762587643222, "grad_norm": 0.00025122863007709384, "learning_rate": 3.0428528653963946e-08, "loss": 0.0, "num_input_tokens_seen": 128128232, "step": 190165 }, { "epoch": 4.645884738475069, "grad_norm": 0.0025913571007549763, "learning_rate": 3.040765550518887e-08, "loss": 0.0, "num_input_tokens_seen": 128131176, "step": 190170 }, { "epoch": 4.646006889306916, "grad_norm": 0.006195464637130499, "learning_rate": 3.0386789407550017e-08, "loss": 0.0, "num_input_tokens_seen": 128134568, "step": 190175 }, { "epoch": 4.646129040138764, "grad_norm": 0.0006205164827406406, "learning_rate": 3.036593036119928e-08, "loss": 0.0, "num_input_tokens_seen": 128137704, "step": 190180 }, { "epoch": 4.64625119097061, "grad_norm": 0.0023309632670134306, "learning_rate": 3.034507836628841e-08, "loss": 0.0, "num_input_tokens_seen": 128141160, "step": 190185 }, { "epoch": 4.646373341802458, "grad_norm": 0.0002740486233960837, "learning_rate": 3.0324233422968747e-08, "loss": 0.0, "num_input_tokens_seen": 128145128, "step": 190190 }, { "epoch": 4.646495492634305, "grad_norm": 4.773643740918487e-05, "learning_rate": 3.030339553139216e-08, "loss": 0.0, "num_input_tokens_seen": 128148648, "step": 190195 }, { "epoch": 4.646617643466152, "grad_norm": 9.275714546674863e-05, "learning_rate": 3.0282564691709975e-08, "loss": 0.0, "num_input_tokens_seen": 128152360, "step": 190200 }, { "epoch": 4.646739794297999, "grad_norm": 0.035588447004556656, "learning_rate": 3.0261740904073965e-08, "loss": 0.0, "num_input_tokens_seen": 128155688, "step": 190205 }, { "epoch": 4.646861945129846, "grad_norm": 0.018034692853689194, "learning_rate": 3.024092416863533e-08, "loss": 0.0, "num_input_tokens_seen": 128158888, "step": 190210 }, { "epoch": 4.6469840959616935, "grad_norm": 0.00043136090971529484, "learning_rate": 3.02201144855454e-08, "loss": 0.0, "num_input_tokens_seen": 128162216, "step": 190215 }, { "epoch": 4.647106246793541, "grad_norm": 0.00020243579638190567, "learning_rate": 3.0199311854955837e-08, "loss": 0.0, "num_input_tokens_seen": 128165736, "step": 190220 }, { "epoch": 4.647228397625388, "grad_norm": 0.00023691370734013617, "learning_rate": 3.017851627701762e-08, "loss": 0.0, "num_input_tokens_seen": 128168808, "step": 190225 }, { "epoch": 4.647350548457235, "grad_norm": 0.0003029266663361341, "learning_rate": 3.015772775188219e-08, "loss": 0.0, "num_input_tokens_seen": 128172584, "step": 190230 }, { "epoch": 4.647472699289082, "grad_norm": 0.0029733150731772184, "learning_rate": 3.013694627970054e-08, "loss": 0.0, "num_input_tokens_seen": 128175912, "step": 190235 }, { "epoch": 4.647594850120929, "grad_norm": 7.283730519702658e-05, "learning_rate": 3.011617186062387e-08, "loss": 0.0, "num_input_tokens_seen": 128179240, "step": 190240 }, { "epoch": 4.647717000952777, "grad_norm": 0.000707432976923883, "learning_rate": 3.00954044948033e-08, "loss": 0.0, "num_input_tokens_seen": 128182376, "step": 190245 }, { "epoch": 4.647839151784623, "grad_norm": 9.027096530189738e-05, "learning_rate": 3.0074644182389694e-08, "loss": 0.065, "num_input_tokens_seen": 128185640, "step": 190250 }, { "epoch": 4.647961302616471, "grad_norm": 0.0001138881707447581, "learning_rate": 3.0053890923534273e-08, "loss": 0.0, "num_input_tokens_seen": 128188904, "step": 190255 }, { "epoch": 4.648083453448318, "grad_norm": 0.00011583354353206232, "learning_rate": 3.00331447183878e-08, "loss": 0.0, "num_input_tokens_seen": 128192104, "step": 190260 }, { "epoch": 4.648205604280165, "grad_norm": 0.0005509683978743851, "learning_rate": 3.0012405567101275e-08, "loss": 0.0, "num_input_tokens_seen": 128195368, "step": 190265 }, { "epoch": 4.648327755112012, "grad_norm": 0.022481124848127365, "learning_rate": 2.999167346982534e-08, "loss": 0.0, "num_input_tokens_seen": 128198824, "step": 190270 }, { "epoch": 4.64844990594386, "grad_norm": 0.0005666337092407048, "learning_rate": 2.997094842671099e-08, "loss": 0.0, "num_input_tokens_seen": 128201832, "step": 190275 }, { "epoch": 4.6485720567757065, "grad_norm": 0.00038574362406507134, "learning_rate": 2.9950230437908676e-08, "loss": 0.0, "num_input_tokens_seen": 128204776, "step": 190280 }, { "epoch": 4.648694207607554, "grad_norm": 0.00022460322361439466, "learning_rate": 2.992951950356926e-08, "loss": 0.0, "num_input_tokens_seen": 128208488, "step": 190285 }, { "epoch": 4.648816358439401, "grad_norm": 0.00033724631066434085, "learning_rate": 2.990881562384318e-08, "loss": 0.0563, "num_input_tokens_seen": 128212136, "step": 190290 }, { "epoch": 4.6489385092712485, "grad_norm": 0.00024160873726941645, "learning_rate": 2.9888118798881315e-08, "loss": 0.0, "num_input_tokens_seen": 128215400, "step": 190295 }, { "epoch": 4.649060660103095, "grad_norm": 6.861024303361773e-05, "learning_rate": 2.986742902883388e-08, "loss": 0.0, "num_input_tokens_seen": 128218856, "step": 190300 }, { "epoch": 4.649182810934942, "grad_norm": 0.00023723019694443792, "learning_rate": 2.984674631385142e-08, "loss": 0.0001, "num_input_tokens_seen": 128221736, "step": 190305 }, { "epoch": 4.64930496176679, "grad_norm": 0.0017610428621992469, "learning_rate": 2.982607065408427e-08, "loss": 0.0, "num_input_tokens_seen": 128225576, "step": 190310 }, { "epoch": 4.649427112598637, "grad_norm": 0.046799734234809875, "learning_rate": 2.980540204968307e-08, "loss": 0.0001, "num_input_tokens_seen": 128228776, "step": 190315 }, { "epoch": 4.649549263430484, "grad_norm": 0.0007966930279508233, "learning_rate": 2.9784740500797822e-08, "loss": 0.0, "num_input_tokens_seen": 128232232, "step": 190320 }, { "epoch": 4.649671414262331, "grad_norm": 5.0024358642986044e-05, "learning_rate": 2.9764086007578958e-08, "loss": 0.0, "num_input_tokens_seen": 128235560, "step": 190325 }, { "epoch": 4.649793565094178, "grad_norm": 0.0002488717727828771, "learning_rate": 2.974343857017647e-08, "loss": 0.0, "num_input_tokens_seen": 128239080, "step": 190330 }, { "epoch": 4.649915715926025, "grad_norm": 0.003739925567060709, "learning_rate": 2.9722798188740907e-08, "loss": 0.0, "num_input_tokens_seen": 128242088, "step": 190335 }, { "epoch": 4.650037866757873, "grad_norm": 6.226352707017213e-05, "learning_rate": 2.970216486342192e-08, "loss": 0.0, "num_input_tokens_seen": 128245224, "step": 190340 }, { "epoch": 4.6501600175897195, "grad_norm": 0.0009980950271710753, "learning_rate": 2.9681538594369837e-08, "loss": 0.0, "num_input_tokens_seen": 128248552, "step": 190345 }, { "epoch": 4.650282168421567, "grad_norm": 0.0005805629189126194, "learning_rate": 2.9660919381734652e-08, "loss": 0.0, "num_input_tokens_seen": 128252072, "step": 190350 }, { "epoch": 4.650404319253414, "grad_norm": 4.1041876102099195e-05, "learning_rate": 2.964030722566613e-08, "loss": 0.0, "num_input_tokens_seen": 128255528, "step": 190355 }, { "epoch": 4.6505264700852615, "grad_norm": 0.00015798966342117637, "learning_rate": 2.961970212631437e-08, "loss": 0.0317, "num_input_tokens_seen": 128259176, "step": 190360 }, { "epoch": 4.650648620917108, "grad_norm": 0.00017236039275303483, "learning_rate": 2.9599104083829153e-08, "loss": 0.0, "num_input_tokens_seen": 128262504, "step": 190365 }, { "epoch": 4.650770771748956, "grad_norm": 0.00020189836504869163, "learning_rate": 2.9578513098360235e-08, "loss": 0.0, "num_input_tokens_seen": 128265512, "step": 190370 }, { "epoch": 4.650892922580803, "grad_norm": 0.00678278086706996, "learning_rate": 2.9557929170057282e-08, "loss": 0.0, "num_input_tokens_seen": 128268776, "step": 190375 }, { "epoch": 4.65101507341265, "grad_norm": 0.0014909114688634872, "learning_rate": 2.9537352299070173e-08, "loss": 0.0, "num_input_tokens_seen": 128272232, "step": 190380 }, { "epoch": 4.651137224244497, "grad_norm": 0.00044912658631801605, "learning_rate": 2.9516782485548563e-08, "loss": 0.0, "num_input_tokens_seen": 128275624, "step": 190385 }, { "epoch": 4.651259375076345, "grad_norm": 0.0003641039947979152, "learning_rate": 2.9496219729641892e-08, "loss": 0.0, "num_input_tokens_seen": 128278888, "step": 190390 }, { "epoch": 4.651381525908191, "grad_norm": 0.0002309923293069005, "learning_rate": 2.947566403149959e-08, "loss": 0.1665, "num_input_tokens_seen": 128282152, "step": 190395 }, { "epoch": 4.651503676740038, "grad_norm": 0.00023649900685995817, "learning_rate": 2.9455115391271546e-08, "loss": 0.0, "num_input_tokens_seen": 128285800, "step": 190400 }, { "epoch": 4.651625827571886, "grad_norm": 0.0006368455942720175, "learning_rate": 2.9434573809106744e-08, "loss": 0.0, "num_input_tokens_seen": 128288936, "step": 190405 }, { "epoch": 4.6517479784037326, "grad_norm": 0.10722141712903976, "learning_rate": 2.9414039285154846e-08, "loss": 0.0, "num_input_tokens_seen": 128292008, "step": 190410 }, { "epoch": 4.65187012923558, "grad_norm": 0.002036165911704302, "learning_rate": 2.9393511819565063e-08, "loss": 0.0, "num_input_tokens_seen": 128295592, "step": 190415 }, { "epoch": 4.651992280067427, "grad_norm": 0.004228158388286829, "learning_rate": 2.9372991412486836e-08, "loss": 0.0, "num_input_tokens_seen": 128299112, "step": 190420 }, { "epoch": 4.652114430899275, "grad_norm": 0.00015035481192171574, "learning_rate": 2.9352478064069152e-08, "loss": 0.0, "num_input_tokens_seen": 128302568, "step": 190425 }, { "epoch": 4.652236581731121, "grad_norm": 8.828377031022683e-05, "learning_rate": 2.933197177446145e-08, "loss": 0.0, "num_input_tokens_seen": 128306024, "step": 190430 }, { "epoch": 4.652358732562969, "grad_norm": 5.080635673948564e-06, "learning_rate": 2.931147254381261e-08, "loss": 0.0, "num_input_tokens_seen": 128309480, "step": 190435 }, { "epoch": 4.652480883394816, "grad_norm": 0.0010717608965933323, "learning_rate": 2.9290980372271736e-08, "loss": 0.0006, "num_input_tokens_seen": 128312808, "step": 190440 }, { "epoch": 4.652603034226663, "grad_norm": 4.420124605530873e-05, "learning_rate": 2.927049525998815e-08, "loss": 0.0, "num_input_tokens_seen": 128316072, "step": 190445 }, { "epoch": 4.65272518505851, "grad_norm": 0.006245040334761143, "learning_rate": 2.925001720711051e-08, "loss": 0.0002, "num_input_tokens_seen": 128319720, "step": 190450 }, { "epoch": 4.652847335890358, "grad_norm": 0.00013759582361672074, "learning_rate": 2.9229546213787925e-08, "loss": 0.0, "num_input_tokens_seen": 128322984, "step": 190455 }, { "epoch": 4.6529694867222045, "grad_norm": 0.003004379104822874, "learning_rate": 2.920908228016916e-08, "loss": 0.0, "num_input_tokens_seen": 128326824, "step": 190460 }, { "epoch": 4.653091637554052, "grad_norm": 0.0024323337711393833, "learning_rate": 2.918862540640299e-08, "loss": 0.0, "num_input_tokens_seen": 128329896, "step": 190465 }, { "epoch": 4.653213788385899, "grad_norm": 6.900146399857476e-05, "learning_rate": 2.9168175592638288e-08, "loss": 0.0, "num_input_tokens_seen": 128332776, "step": 190470 }, { "epoch": 4.653335939217746, "grad_norm": 0.00020709547970909625, "learning_rate": 2.914773283902372e-08, "loss": 0.0, "num_input_tokens_seen": 128336360, "step": 190475 }, { "epoch": 4.653458090049593, "grad_norm": 0.0002024859859375283, "learning_rate": 2.9127297145708052e-08, "loss": 0.0, "num_input_tokens_seen": 128339624, "step": 190480 }, { "epoch": 4.653580240881441, "grad_norm": 0.002173555316403508, "learning_rate": 2.9106868512839722e-08, "loss": 0.0, "num_input_tokens_seen": 128343080, "step": 190485 }, { "epoch": 4.653702391713288, "grad_norm": 0.0013618976809084415, "learning_rate": 2.90864469405675e-08, "loss": 0.0, "num_input_tokens_seen": 128346280, "step": 190490 }, { "epoch": 4.653824542545134, "grad_norm": 0.00013016993761993945, "learning_rate": 2.906603242903971e-08, "loss": 0.0, "num_input_tokens_seen": 128349096, "step": 190495 }, { "epoch": 4.653946693376982, "grad_norm": 0.0004320065781939775, "learning_rate": 2.9045624978404793e-08, "loss": 0.0, "num_input_tokens_seen": 128352168, "step": 190500 }, { "epoch": 4.654068844208829, "grad_norm": 0.0005773974698968232, "learning_rate": 2.9025224588811402e-08, "loss": 0.0, "num_input_tokens_seen": 128355560, "step": 190505 }, { "epoch": 4.654190995040676, "grad_norm": 0.004524011164903641, "learning_rate": 2.9004831260407647e-08, "loss": 0.0, "num_input_tokens_seen": 128359016, "step": 190510 }, { "epoch": 4.654313145872523, "grad_norm": 0.011500171385705471, "learning_rate": 2.898444499334196e-08, "loss": 0.0, "num_input_tokens_seen": 128362344, "step": 190515 }, { "epoch": 4.654435296704371, "grad_norm": 0.0002538670087233186, "learning_rate": 2.896406578776256e-08, "loss": 0.0, "num_input_tokens_seen": 128365416, "step": 190520 }, { "epoch": 4.6545574475362175, "grad_norm": 9.319341188529506e-05, "learning_rate": 2.8943693643817656e-08, "loss": 0.0, "num_input_tokens_seen": 128368872, "step": 190525 }, { "epoch": 4.654679598368065, "grad_norm": 3.173381264787167e-05, "learning_rate": 2.8923328561655357e-08, "loss": 0.0, "num_input_tokens_seen": 128372392, "step": 190530 }, { "epoch": 4.654801749199912, "grad_norm": 0.0012093858094885945, "learning_rate": 2.8902970541423765e-08, "loss": 0.0467, "num_input_tokens_seen": 128375720, "step": 190535 }, { "epoch": 4.6549239000317595, "grad_norm": 1.1120274393761065e-05, "learning_rate": 2.8882619583270983e-08, "loss": 0.0, "num_input_tokens_seen": 128379496, "step": 190540 }, { "epoch": 4.655046050863606, "grad_norm": 0.14474591612815857, "learning_rate": 2.8862275687345004e-08, "loss": 0.0, "num_input_tokens_seen": 128383400, "step": 190545 }, { "epoch": 4.655168201695454, "grad_norm": 0.07663882523775101, "learning_rate": 2.8841938853793823e-08, "loss": 0.0, "num_input_tokens_seen": 128386792, "step": 190550 }, { "epoch": 4.655290352527301, "grad_norm": 0.0017596009420230985, "learning_rate": 2.8821609082765207e-08, "loss": 0.0, "num_input_tokens_seen": 128390120, "step": 190555 }, { "epoch": 4.655412503359148, "grad_norm": 0.00014885960263200104, "learning_rate": 2.880128637440704e-08, "loss": 0.0, "num_input_tokens_seen": 128393576, "step": 190560 }, { "epoch": 4.655534654190995, "grad_norm": 0.0004244176670908928, "learning_rate": 2.8780970728867204e-08, "loss": 0.0, "num_input_tokens_seen": 128397032, "step": 190565 }, { "epoch": 4.655656805022842, "grad_norm": 6.144320650491863e-05, "learning_rate": 2.8760662146293357e-08, "loss": 0.0, "num_input_tokens_seen": 128400168, "step": 190570 }, { "epoch": 4.655778955854689, "grad_norm": 0.0005951565690338612, "learning_rate": 2.874036062683327e-08, "loss": 0.0, "num_input_tokens_seen": 128403560, "step": 190575 }, { "epoch": 4.655901106686537, "grad_norm": 0.006229200400412083, "learning_rate": 2.8720066170634383e-08, "loss": 0.0625, "num_input_tokens_seen": 128406632, "step": 190580 }, { "epoch": 4.656023257518384, "grad_norm": 1.0892995305766817e-05, "learning_rate": 2.8699778777844574e-08, "loss": 0.0, "num_input_tokens_seen": 128410280, "step": 190585 }, { "epoch": 4.6561454083502305, "grad_norm": 0.0011337065370753407, "learning_rate": 2.867949844861106e-08, "loss": 0.0, "num_input_tokens_seen": 128414568, "step": 190590 }, { "epoch": 4.656267559182078, "grad_norm": 0.00012795043585356325, "learning_rate": 2.8659225183081613e-08, "loss": 0.0, "num_input_tokens_seen": 128417768, "step": 190595 }, { "epoch": 4.656389710013925, "grad_norm": 0.0025257400702685118, "learning_rate": 2.863895898140345e-08, "loss": 0.0, "num_input_tokens_seen": 128421032, "step": 190600 }, { "epoch": 4.6565118608457725, "grad_norm": 0.0013095020549371839, "learning_rate": 2.8618699843724115e-08, "loss": 0.0, "num_input_tokens_seen": 128424168, "step": 190605 }, { "epoch": 4.656634011677619, "grad_norm": 0.0003252495953347534, "learning_rate": 2.8598447770190938e-08, "loss": 0.0, "num_input_tokens_seen": 128427496, "step": 190610 }, { "epoch": 4.656756162509467, "grad_norm": 0.00011898396769538522, "learning_rate": 2.857820276095091e-08, "loss": 0.0, "num_input_tokens_seen": 128430824, "step": 190615 }, { "epoch": 4.656878313341314, "grad_norm": 0.0004511339357122779, "learning_rate": 2.855796481615158e-08, "loss": 0.0, "num_input_tokens_seen": 128434088, "step": 190620 }, { "epoch": 4.657000464173161, "grad_norm": 3.237876444472931e-05, "learning_rate": 2.8537733935940055e-08, "loss": 0.0, "num_input_tokens_seen": 128437864, "step": 190625 }, { "epoch": 4.657122615005008, "grad_norm": 0.000573989178519696, "learning_rate": 2.851751012046333e-08, "loss": 0.0, "num_input_tokens_seen": 128441576, "step": 190630 }, { "epoch": 4.657244765836856, "grad_norm": 0.0007632462657056749, "learning_rate": 2.8497293369868723e-08, "loss": 0.0, "num_input_tokens_seen": 128444520, "step": 190635 }, { "epoch": 4.657366916668702, "grad_norm": 0.003807037603110075, "learning_rate": 2.8477083684302904e-08, "loss": 0.0, "num_input_tokens_seen": 128448296, "step": 190640 }, { "epoch": 4.65748906750055, "grad_norm": 0.0006615742458961904, "learning_rate": 2.8456881063913195e-08, "loss": 0.0, "num_input_tokens_seen": 128451240, "step": 190645 }, { "epoch": 4.657611218332397, "grad_norm": 1.733975477691274e-05, "learning_rate": 2.843668550884626e-08, "loss": 0.0, "num_input_tokens_seen": 128454504, "step": 190650 }, { "epoch": 4.657733369164244, "grad_norm": 0.0003319148381706327, "learning_rate": 2.8416497019249086e-08, "loss": 0.0, "num_input_tokens_seen": 128457640, "step": 190655 }, { "epoch": 4.657855519996091, "grad_norm": 0.0028955182060599327, "learning_rate": 2.839631559526856e-08, "loss": 0.0, "num_input_tokens_seen": 128461096, "step": 190660 }, { "epoch": 4.657977670827938, "grad_norm": 6.624946399824694e-05, "learning_rate": 2.8376141237051234e-08, "loss": 0.0, "num_input_tokens_seen": 128464424, "step": 190665 }, { "epoch": 4.6580998216597855, "grad_norm": 0.01330376137048006, "learning_rate": 2.8355973944743982e-08, "loss": 0.0, "num_input_tokens_seen": 128467944, "step": 190670 }, { "epoch": 4.658221972491633, "grad_norm": 0.0015481158625334501, "learning_rate": 2.8335813718493474e-08, "loss": 0.0, "num_input_tokens_seen": 128471016, "step": 190675 }, { "epoch": 4.65834412332348, "grad_norm": 0.00015667296247556806, "learning_rate": 2.8315660558446252e-08, "loss": 0.0, "num_input_tokens_seen": 128474600, "step": 190680 }, { "epoch": 4.658466274155327, "grad_norm": 0.0011868600267916918, "learning_rate": 2.829551446474887e-08, "loss": 0.0, "num_input_tokens_seen": 128478056, "step": 190685 }, { "epoch": 4.658588424987174, "grad_norm": 0.18346768617630005, "learning_rate": 2.8275375437547876e-08, "loss": 0.0001, "num_input_tokens_seen": 128481256, "step": 190690 }, { "epoch": 4.658710575819021, "grad_norm": 6.0428901633713394e-05, "learning_rate": 2.825524347698971e-08, "loss": 0.0, "num_input_tokens_seen": 128484456, "step": 190695 }, { "epoch": 4.658832726650869, "grad_norm": 0.0029399923514574766, "learning_rate": 2.8235118583220918e-08, "loss": 0.0, "num_input_tokens_seen": 128487464, "step": 190700 }, { "epoch": 4.658954877482715, "grad_norm": 0.0033414326608181, "learning_rate": 2.8215000756387496e-08, "loss": 0.0, "num_input_tokens_seen": 128490728, "step": 190705 }, { "epoch": 4.659077028314563, "grad_norm": 0.00032659669523127377, "learning_rate": 2.8194889996636217e-08, "loss": 0.0, "num_input_tokens_seen": 128494120, "step": 190710 }, { "epoch": 4.65919917914641, "grad_norm": 0.00039978005224838853, "learning_rate": 2.8174786304112853e-08, "loss": 0.0, "num_input_tokens_seen": 128497192, "step": 190715 }, { "epoch": 4.659321329978257, "grad_norm": 0.006133579649031162, "learning_rate": 2.8154689678963948e-08, "loss": 0.0002, "num_input_tokens_seen": 128500456, "step": 190720 }, { "epoch": 4.659443480810104, "grad_norm": 0.0007687496836297214, "learning_rate": 2.8134600121335506e-08, "loss": 0.0, "num_input_tokens_seen": 128503528, "step": 190725 }, { "epoch": 4.659565631641952, "grad_norm": 1.2737462520599365, "learning_rate": 2.8114517631373623e-08, "loss": 0.0002, "num_input_tokens_seen": 128506536, "step": 190730 }, { "epoch": 4.659687782473799, "grad_norm": 0.0026064272969961166, "learning_rate": 2.8094442209224412e-08, "loss": 0.0, "num_input_tokens_seen": 128509992, "step": 190735 }, { "epoch": 4.659809933305646, "grad_norm": 0.0006508814403787255, "learning_rate": 2.8074373855033862e-08, "loss": 0.0, "num_input_tokens_seen": 128513576, "step": 190740 }, { "epoch": 4.659932084137493, "grad_norm": 4.2336319893365726e-05, "learning_rate": 2.8054312568947747e-08, "loss": 0.0001, "num_input_tokens_seen": 128516968, "step": 190745 }, { "epoch": 4.660054234969341, "grad_norm": 0.0007426925003528595, "learning_rate": 2.803425835111217e-08, "loss": 0.0, "num_input_tokens_seen": 128520232, "step": 190750 }, { "epoch": 4.660176385801187, "grad_norm": 0.000315658311592415, "learning_rate": 2.801421120167291e-08, "loss": 0.0, "num_input_tokens_seen": 128523560, "step": 190755 }, { "epoch": 4.660298536633034, "grad_norm": 0.0018822962883859873, "learning_rate": 2.7994171120775732e-08, "loss": 0.0, "num_input_tokens_seen": 128526696, "step": 190760 }, { "epoch": 4.660420687464882, "grad_norm": 8.385728506254964e-06, "learning_rate": 2.7974138108566414e-08, "loss": 0.0, "num_input_tokens_seen": 128529832, "step": 190765 }, { "epoch": 4.660542838296728, "grad_norm": 8.750159031478688e-05, "learning_rate": 2.7954112165190502e-08, "loss": 0.0, "num_input_tokens_seen": 128533736, "step": 190770 }, { "epoch": 4.660664989128576, "grad_norm": 0.00037626747507601976, "learning_rate": 2.793409329079377e-08, "loss": 0.0, "num_input_tokens_seen": 128537448, "step": 190775 }, { "epoch": 4.660787139960423, "grad_norm": 2.4590988687123172e-05, "learning_rate": 2.791408148552188e-08, "loss": 0.0, "num_input_tokens_seen": 128540840, "step": 190780 }, { "epoch": 4.6609092907922705, "grad_norm": 0.00022164350957609713, "learning_rate": 2.7894076749520158e-08, "loss": 0.0001, "num_input_tokens_seen": 128544360, "step": 190785 }, { "epoch": 4.661031441624117, "grad_norm": 0.000510553945787251, "learning_rate": 2.7874079082934155e-08, "loss": 0.0, "num_input_tokens_seen": 128548456, "step": 190790 }, { "epoch": 4.661153592455965, "grad_norm": 0.0030823589768260717, "learning_rate": 2.7854088485909312e-08, "loss": 0.0, "num_input_tokens_seen": 128551784, "step": 190795 }, { "epoch": 4.661275743287812, "grad_norm": 0.00036780742811970413, "learning_rate": 2.7834104958591176e-08, "loss": 0.0, "num_input_tokens_seen": 128554792, "step": 190800 }, { "epoch": 4.661397894119659, "grad_norm": 7.580179226351902e-05, "learning_rate": 2.7814128501124856e-08, "loss": 0.0, "num_input_tokens_seen": 128557736, "step": 190805 }, { "epoch": 4.661520044951506, "grad_norm": 0.0011283807689324021, "learning_rate": 2.7794159113655567e-08, "loss": 0.0, "num_input_tokens_seen": 128560808, "step": 190810 }, { "epoch": 4.661642195783354, "grad_norm": 0.000357277225703001, "learning_rate": 2.7774196796328752e-08, "loss": 0.0, "num_input_tokens_seen": 128563944, "step": 190815 }, { "epoch": 4.6617643466152, "grad_norm": 0.0012690431904047728, "learning_rate": 2.775424154928929e-08, "loss": 0.0, "num_input_tokens_seen": 128567976, "step": 190820 }, { "epoch": 4.661886497447048, "grad_norm": 2.7468084226711653e-05, "learning_rate": 2.7734293372682737e-08, "loss": 0.0002, "num_input_tokens_seen": 128571496, "step": 190825 }, { "epoch": 4.662008648278895, "grad_norm": 0.00075659150024876, "learning_rate": 2.771435226665364e-08, "loss": 0.0, "num_input_tokens_seen": 128574568, "step": 190830 }, { "epoch": 4.6621307991107415, "grad_norm": 0.0006138571770861745, "learning_rate": 2.769441823134755e-08, "loss": 0.0, "num_input_tokens_seen": 128577640, "step": 190835 }, { "epoch": 4.662252949942589, "grad_norm": 0.004741390701383352, "learning_rate": 2.7674491266909016e-08, "loss": 0.0, "num_input_tokens_seen": 128581160, "step": 190840 }, { "epoch": 4.662375100774437, "grad_norm": 0.00347688514739275, "learning_rate": 2.765457137348304e-08, "loss": 0.0, "num_input_tokens_seen": 128584104, "step": 190845 }, { "epoch": 4.6624972516062835, "grad_norm": 7.782386092003435e-05, "learning_rate": 2.7634658551214717e-08, "loss": 0.0, "num_input_tokens_seen": 128587368, "step": 190850 }, { "epoch": 4.66261940243813, "grad_norm": 2.530582969484385e-05, "learning_rate": 2.7614752800248608e-08, "loss": 0.0, "num_input_tokens_seen": 128590696, "step": 190855 }, { "epoch": 4.662741553269978, "grad_norm": 0.00023780610354151577, "learning_rate": 2.7594854120729594e-08, "loss": 0.0, "num_input_tokens_seen": 128594024, "step": 190860 }, { "epoch": 4.662863704101825, "grad_norm": 5.801659062854014e-05, "learning_rate": 2.7574962512802334e-08, "loss": 0.0, "num_input_tokens_seen": 128597544, "step": 190865 }, { "epoch": 4.662985854933672, "grad_norm": 3.095933789154515e-05, "learning_rate": 2.7555077976611385e-08, "loss": 0.0, "num_input_tokens_seen": 128603112, "step": 190870 }, { "epoch": 4.663108005765519, "grad_norm": 0.0004294172103982419, "learning_rate": 2.7535200512301626e-08, "loss": 0.0, "num_input_tokens_seen": 128606760, "step": 190875 }, { "epoch": 4.663230156597367, "grad_norm": 9.709588630357757e-05, "learning_rate": 2.7515330120017387e-08, "loss": 0.0, "num_input_tokens_seen": 128610024, "step": 190880 }, { "epoch": 4.663352307429213, "grad_norm": 0.0002693548158276826, "learning_rate": 2.7495466799903222e-08, "loss": 0.0, "num_input_tokens_seen": 128613480, "step": 190885 }, { "epoch": 4.663474458261061, "grad_norm": 0.00010838372691068798, "learning_rate": 2.7475610552103568e-08, "loss": 0.0, "num_input_tokens_seen": 128616808, "step": 190890 }, { "epoch": 4.663596609092908, "grad_norm": 0.0002789755817502737, "learning_rate": 2.7455761376762976e-08, "loss": 0.0, "num_input_tokens_seen": 128619944, "step": 190895 }, { "epoch": 4.663718759924755, "grad_norm": 0.0007267400505952537, "learning_rate": 2.7435919274025553e-08, "loss": 0.0, "num_input_tokens_seen": 128623016, "step": 190900 }, { "epoch": 4.663840910756602, "grad_norm": 0.0002554529346525669, "learning_rate": 2.7416084244035852e-08, "loss": 0.0, "num_input_tokens_seen": 128626152, "step": 190905 }, { "epoch": 4.66396306158845, "grad_norm": 0.00016573209722992033, "learning_rate": 2.739625628693776e-08, "loss": 0.0, "num_input_tokens_seen": 128629160, "step": 190910 }, { "epoch": 4.6640852124202965, "grad_norm": 0.00022480067855212837, "learning_rate": 2.737643540287593e-08, "loss": 0.0, "num_input_tokens_seen": 128632232, "step": 190915 }, { "epoch": 4.664207363252144, "grad_norm": 0.001412676996551454, "learning_rate": 2.7356621591994146e-08, "loss": 0.0, "num_input_tokens_seen": 128635688, "step": 190920 }, { "epoch": 4.664329514083991, "grad_norm": 0.00020378133922349662, "learning_rate": 2.733681485443662e-08, "loss": 0.0, "num_input_tokens_seen": 128639336, "step": 190925 }, { "epoch": 4.664451664915838, "grad_norm": 5.148085983819328e-05, "learning_rate": 2.731701519034735e-08, "loss": 0.0, "num_input_tokens_seen": 128642856, "step": 190930 }, { "epoch": 4.664573815747685, "grad_norm": 4.189375613350421e-05, "learning_rate": 2.729722259987044e-08, "loss": 0.0, "num_input_tokens_seen": 128646184, "step": 190935 }, { "epoch": 4.664695966579533, "grad_norm": 0.00539232324808836, "learning_rate": 2.7277437083149668e-08, "loss": 0.0, "num_input_tokens_seen": 128649576, "step": 190940 }, { "epoch": 4.66481811741138, "grad_norm": 0.0003227377019356936, "learning_rate": 2.725765864032914e-08, "loss": 0.0, "num_input_tokens_seen": 128653032, "step": 190945 }, { "epoch": 4.664940268243226, "grad_norm": 0.0002500084519851953, "learning_rate": 2.7237887271552406e-08, "loss": 0.0, "num_input_tokens_seen": 128656232, "step": 190950 }, { "epoch": 4.665062419075074, "grad_norm": 0.0006451523513533175, "learning_rate": 2.7218122976963465e-08, "loss": 0.0, "num_input_tokens_seen": 128659688, "step": 190955 }, { "epoch": 4.665184569906921, "grad_norm": 0.0003521182225085795, "learning_rate": 2.7198365756705976e-08, "loss": 0.0, "num_input_tokens_seen": 128663656, "step": 190960 }, { "epoch": 4.665306720738768, "grad_norm": 0.0023154793307185173, "learning_rate": 2.7178615610923606e-08, "loss": 0.0, "num_input_tokens_seen": 128666920, "step": 190965 }, { "epoch": 4.665428871570615, "grad_norm": 4.0537401218898594e-05, "learning_rate": 2.7158872539760014e-08, "loss": 0.0, "num_input_tokens_seen": 128670632, "step": 190970 }, { "epoch": 4.665551022402463, "grad_norm": 0.00011217274004593492, "learning_rate": 2.7139136543358754e-08, "loss": 0.0, "num_input_tokens_seen": 128673896, "step": 190975 }, { "epoch": 4.6656731732343095, "grad_norm": 0.0001300013973377645, "learning_rate": 2.711940762186349e-08, "loss": 0.0, "num_input_tokens_seen": 128677096, "step": 190980 }, { "epoch": 4.665795324066157, "grad_norm": 0.0002665658830665052, "learning_rate": 2.7099685775417324e-08, "loss": 0.041, "num_input_tokens_seen": 128680232, "step": 190985 }, { "epoch": 4.665917474898004, "grad_norm": 0.0036078591365367174, "learning_rate": 2.707997100416415e-08, "loss": 0.0, "num_input_tokens_seen": 128683752, "step": 190990 }, { "epoch": 4.6660396257298515, "grad_norm": 0.00020211531955283135, "learning_rate": 2.7060263308246956e-08, "loss": 0.0, "num_input_tokens_seen": 128687400, "step": 190995 }, { "epoch": 4.666161776561698, "grad_norm": 0.0020623011514544487, "learning_rate": 2.704056268780919e-08, "loss": 0.0, "num_input_tokens_seen": 128690472, "step": 191000 }, { "epoch": 4.666283927393546, "grad_norm": 0.00035755732096731663, "learning_rate": 2.7020869142994284e-08, "loss": 0.0, "num_input_tokens_seen": 128693992, "step": 191005 }, { "epoch": 4.666406078225393, "grad_norm": 0.0050127855502069, "learning_rate": 2.7001182673945354e-08, "loss": 0.0, "num_input_tokens_seen": 128697064, "step": 191010 }, { "epoch": 4.66652822905724, "grad_norm": 0.0020575481466948986, "learning_rate": 2.6981503280805395e-08, "loss": 0.0, "num_input_tokens_seen": 128700456, "step": 191015 }, { "epoch": 4.666650379889087, "grad_norm": 5.921476986259222e-05, "learning_rate": 2.6961830963717737e-08, "loss": 0.0, "num_input_tokens_seen": 128703464, "step": 191020 }, { "epoch": 4.666772530720934, "grad_norm": 0.0003617874172050506, "learning_rate": 2.694216572282526e-08, "loss": 0.0, "num_input_tokens_seen": 128706856, "step": 191025 }, { "epoch": 4.666894681552781, "grad_norm": 0.015034444630146027, "learning_rate": 2.692250755827119e-08, "loss": 0.0, "num_input_tokens_seen": 128710888, "step": 191030 }, { "epoch": 4.667016832384628, "grad_norm": 0.0004685759777203202, "learning_rate": 2.69028564701983e-08, "loss": 0.0, "num_input_tokens_seen": 128714408, "step": 191035 }, { "epoch": 4.667138983216476, "grad_norm": 0.0005598337156698108, "learning_rate": 2.6883212458749694e-08, "loss": 0.0, "num_input_tokens_seen": 128717544, "step": 191040 }, { "epoch": 4.6672611340483225, "grad_norm": 1.4924536117177922e-05, "learning_rate": 2.686357552406793e-08, "loss": 0.0, "num_input_tokens_seen": 128721000, "step": 191045 }, { "epoch": 4.66738328488017, "grad_norm": 6.75343835609965e-05, "learning_rate": 2.684394566629611e-08, "loss": 0.0, "num_input_tokens_seen": 128724200, "step": 191050 }, { "epoch": 4.667505435712017, "grad_norm": 1.3195672181609552e-05, "learning_rate": 2.682432288557679e-08, "loss": 0.0, "num_input_tokens_seen": 128727464, "step": 191055 }, { "epoch": 4.667627586543865, "grad_norm": 0.0004628331807907671, "learning_rate": 2.6804707182052633e-08, "loss": 0.0, "num_input_tokens_seen": 128730536, "step": 191060 }, { "epoch": 4.667749737375711, "grad_norm": 0.005984678864479065, "learning_rate": 2.6785098555866635e-08, "loss": 0.0, "num_input_tokens_seen": 128734120, "step": 191065 }, { "epoch": 4.667871888207559, "grad_norm": 0.0022569738794118166, "learning_rate": 2.676549700716102e-08, "loss": 0.0, "num_input_tokens_seen": 128737640, "step": 191070 }, { "epoch": 4.667994039039406, "grad_norm": 0.00012861998402513564, "learning_rate": 2.6745902536078558e-08, "loss": 0.0, "num_input_tokens_seen": 128740904, "step": 191075 }, { "epoch": 4.668116189871253, "grad_norm": 3.7217909266473725e-05, "learning_rate": 2.6726315142761578e-08, "loss": 0.0, "num_input_tokens_seen": 128744296, "step": 191080 }, { "epoch": 4.6682383407031, "grad_norm": 0.018890826031565666, "learning_rate": 2.670673482735275e-08, "loss": 0.0, "num_input_tokens_seen": 128747624, "step": 191085 }, { "epoch": 4.668360491534948, "grad_norm": 2.0563811631291173e-05, "learning_rate": 2.668716158999418e-08, "loss": 0.0, "num_input_tokens_seen": 128751208, "step": 191090 }, { "epoch": 4.668482642366794, "grad_norm": 0.00013304762251209468, "learning_rate": 2.6667595430828417e-08, "loss": 0.0, "num_input_tokens_seen": 128754600, "step": 191095 }, { "epoch": 4.668604793198641, "grad_norm": 0.005235996562987566, "learning_rate": 2.6648036349997792e-08, "loss": 0.0, "num_input_tokens_seen": 128757992, "step": 191100 }, { "epoch": 4.668726944030489, "grad_norm": 0.022949010133743286, "learning_rate": 2.662848434764431e-08, "loss": 0.0, "num_input_tokens_seen": 128761640, "step": 191105 }, { "epoch": 4.6688490948623365, "grad_norm": 0.0007702650036662817, "learning_rate": 2.6608939423910404e-08, "loss": 0.0, "num_input_tokens_seen": 128764584, "step": 191110 }, { "epoch": 4.668971245694183, "grad_norm": 0.00018525759514886886, "learning_rate": 2.6589401578938075e-08, "loss": 0.0, "num_input_tokens_seen": 128768040, "step": 191115 }, { "epoch": 4.66909339652603, "grad_norm": 0.00010585349809844047, "learning_rate": 2.6569870812869323e-08, "loss": 0.0, "num_input_tokens_seen": 128771368, "step": 191120 }, { "epoch": 4.669215547357878, "grad_norm": 0.00026610007626004517, "learning_rate": 2.6550347125846472e-08, "loss": 0.0, "num_input_tokens_seen": 128774632, "step": 191125 }, { "epoch": 4.669337698189724, "grad_norm": 0.00032241616281680763, "learning_rate": 2.6530830518011194e-08, "loss": 0.0, "num_input_tokens_seen": 128777704, "step": 191130 }, { "epoch": 4.669459849021572, "grad_norm": 0.0034227308351546526, "learning_rate": 2.651132098950559e-08, "loss": 0.0, "num_input_tokens_seen": 128781160, "step": 191135 }, { "epoch": 4.669581999853419, "grad_norm": 0.1563401222229004, "learning_rate": 2.6491818540471446e-08, "loss": 0.0001, "num_input_tokens_seen": 128784104, "step": 191140 }, { "epoch": 4.669704150685266, "grad_norm": 8.37320931168506e-06, "learning_rate": 2.6472323171050747e-08, "loss": 0.0, "num_input_tokens_seen": 128787752, "step": 191145 }, { "epoch": 4.669826301517113, "grad_norm": 0.0012280684895813465, "learning_rate": 2.6452834881385055e-08, "loss": 0.0, "num_input_tokens_seen": 128790888, "step": 191150 }, { "epoch": 4.669948452348961, "grad_norm": 3.109731187578291e-05, "learning_rate": 2.6433353671616142e-08, "loss": 0.0, "num_input_tokens_seen": 128793960, "step": 191155 }, { "epoch": 4.6700706031808075, "grad_norm": 0.0015018595149740577, "learning_rate": 2.64138795418859e-08, "loss": 0.0, "num_input_tokens_seen": 128797416, "step": 191160 }, { "epoch": 4.670192754012655, "grad_norm": 0.0013169918674975634, "learning_rate": 2.6394412492335648e-08, "loss": 0.0, "num_input_tokens_seen": 128801128, "step": 191165 }, { "epoch": 4.670314904844502, "grad_norm": 0.0002888040617108345, "learning_rate": 2.6374952523107286e-08, "loss": 0.0, "num_input_tokens_seen": 128804136, "step": 191170 }, { "epoch": 4.6704370556763495, "grad_norm": 3.728661977220327e-05, "learning_rate": 2.6355499634341916e-08, "loss": 0.0, "num_input_tokens_seen": 128807400, "step": 191175 }, { "epoch": 4.670559206508196, "grad_norm": 0.00021267979172989726, "learning_rate": 2.6336053826181314e-08, "loss": 0.0, "num_input_tokens_seen": 128811240, "step": 191180 }, { "epoch": 4.670681357340044, "grad_norm": 7.831936090951785e-05, "learning_rate": 2.6316615098766927e-08, "loss": 0.0, "num_input_tokens_seen": 128814440, "step": 191185 }, { "epoch": 4.670803508171891, "grad_norm": 0.0009277883800677955, "learning_rate": 2.6297183452239856e-08, "loss": 0.0, "num_input_tokens_seen": 128817960, "step": 191190 }, { "epoch": 4.670925659003737, "grad_norm": 0.002798869274556637, "learning_rate": 2.6277758886741664e-08, "loss": 0.0, "num_input_tokens_seen": 128821416, "step": 191195 }, { "epoch": 4.671047809835585, "grad_norm": 0.0005861495155841112, "learning_rate": 2.6258341402413454e-08, "loss": 0.0, "num_input_tokens_seen": 128824872, "step": 191200 }, { "epoch": 4.671169960667433, "grad_norm": 0.0006913174293003976, "learning_rate": 2.6238930999396557e-08, "loss": 0.0, "num_input_tokens_seen": 128828136, "step": 191205 }, { "epoch": 4.671292111499279, "grad_norm": 0.003023535944521427, "learning_rate": 2.6219527677831976e-08, "loss": 0.0, "num_input_tokens_seen": 128831720, "step": 191210 }, { "epoch": 4.671414262331126, "grad_norm": 9.19162903301185e-06, "learning_rate": 2.6200131437861038e-08, "loss": 0.0, "num_input_tokens_seen": 128834792, "step": 191215 }, { "epoch": 4.671536413162974, "grad_norm": 0.020835041999816895, "learning_rate": 2.6180742279624523e-08, "loss": 0.0001, "num_input_tokens_seen": 128837800, "step": 191220 }, { "epoch": 4.6716585639948205, "grad_norm": 0.0006879670545458794, "learning_rate": 2.616136020326365e-08, "loss": 0.0, "num_input_tokens_seen": 128840936, "step": 191225 }, { "epoch": 4.671780714826668, "grad_norm": 0.0040020509622991085, "learning_rate": 2.6141985208919305e-08, "loss": 0.0, "num_input_tokens_seen": 128844648, "step": 191230 }, { "epoch": 4.671902865658515, "grad_norm": 0.02804373949766159, "learning_rate": 2.6122617296732376e-08, "loss": 0.0, "num_input_tokens_seen": 128847976, "step": 191235 }, { "epoch": 4.6720250164903625, "grad_norm": 0.0001148119117715396, "learning_rate": 2.610325646684375e-08, "loss": 0.0, "num_input_tokens_seen": 128850984, "step": 191240 }, { "epoch": 4.672147167322209, "grad_norm": 16.482566833496094, "learning_rate": 2.6083902719393978e-08, "loss": 0.1687, "num_input_tokens_seen": 128854760, "step": 191245 }, { "epoch": 4.672269318154057, "grad_norm": 7.6309333962854e-05, "learning_rate": 2.606455605452418e-08, "loss": 0.0, "num_input_tokens_seen": 128858088, "step": 191250 }, { "epoch": 4.672391468985904, "grad_norm": 0.00012644918751902878, "learning_rate": 2.6045216472374898e-08, "loss": 0.0, "num_input_tokens_seen": 128861672, "step": 191255 }, { "epoch": 4.672513619817751, "grad_norm": 0.0014578666305169463, "learning_rate": 2.6025883973086693e-08, "loss": 0.0, "num_input_tokens_seen": 128865576, "step": 191260 }, { "epoch": 4.672635770649598, "grad_norm": 5.851191599504091e-05, "learning_rate": 2.600655855680034e-08, "loss": 0.0, "num_input_tokens_seen": 128868904, "step": 191265 }, { "epoch": 4.672757921481446, "grad_norm": 0.0002196232817368582, "learning_rate": 2.598724022365617e-08, "loss": 0.0365, "num_input_tokens_seen": 128872488, "step": 191270 }, { "epoch": 4.672880072313292, "grad_norm": 4.39381183241494e-05, "learning_rate": 2.5967928973794738e-08, "loss": 0.0, "num_input_tokens_seen": 128875944, "step": 191275 }, { "epoch": 4.67300222314514, "grad_norm": 0.0006547580123879015, "learning_rate": 2.59486248073566e-08, "loss": 0.0, "num_input_tokens_seen": 128879016, "step": 191280 }, { "epoch": 4.673124373976987, "grad_norm": 0.00048621126916259527, "learning_rate": 2.5929327724481976e-08, "loss": 0.0, "num_input_tokens_seen": 128882472, "step": 191285 }, { "epoch": 4.6732465248088335, "grad_norm": 0.00038942944956943393, "learning_rate": 2.5910037725311418e-08, "loss": 0.0, "num_input_tokens_seen": 128885352, "step": 191290 }, { "epoch": 4.673368675640681, "grad_norm": 0.000668442458845675, "learning_rate": 2.5890754809984928e-08, "loss": 0.0, "num_input_tokens_seen": 128889128, "step": 191295 }, { "epoch": 4.673490826472528, "grad_norm": 0.00021932261006440967, "learning_rate": 2.5871478978642945e-08, "loss": 0.0, "num_input_tokens_seen": 128892200, "step": 191300 }, { "epoch": 4.6736129773043755, "grad_norm": 0.0009539284510537982, "learning_rate": 2.5852210231425475e-08, "loss": 0.0, "num_input_tokens_seen": 128896104, "step": 191305 }, { "epoch": 4.673735128136222, "grad_norm": 0.0003438375424593687, "learning_rate": 2.5832948568472733e-08, "loss": 0.0, "num_input_tokens_seen": 128899304, "step": 191310 }, { "epoch": 4.67385727896807, "grad_norm": 1.730235635477584e-05, "learning_rate": 2.5813693989924944e-08, "loss": 0.0, "num_input_tokens_seen": 128902568, "step": 191315 }, { "epoch": 4.673979429799917, "grad_norm": 0.00022493410506285727, "learning_rate": 2.5794446495921994e-08, "loss": 0.0, "num_input_tokens_seen": 128906600, "step": 191320 }, { "epoch": 4.674101580631764, "grad_norm": 0.0017360784113407135, "learning_rate": 2.5775206086603772e-08, "loss": 0.0, "num_input_tokens_seen": 128910120, "step": 191325 }, { "epoch": 4.674223731463611, "grad_norm": 2.6076373615069315e-05, "learning_rate": 2.575597276211039e-08, "loss": 0.0, "num_input_tokens_seen": 128913064, "step": 191330 }, { "epoch": 4.674345882295459, "grad_norm": 0.01238018274307251, "learning_rate": 2.573674652258151e-08, "loss": 0.0, "num_input_tokens_seen": 128916456, "step": 191335 }, { "epoch": 4.674468033127305, "grad_norm": 8.923443965613842e-05, "learning_rate": 2.5717527368157134e-08, "loss": 0.0, "num_input_tokens_seen": 128919528, "step": 191340 }, { "epoch": 4.674590183959153, "grad_norm": 0.00023435594630427659, "learning_rate": 2.5698315298976813e-08, "loss": 0.0, "num_input_tokens_seen": 128923304, "step": 191345 }, { "epoch": 4.674712334791, "grad_norm": 2.5178560463245958e-05, "learning_rate": 2.5679110315180553e-08, "loss": 0.0, "num_input_tokens_seen": 128926696, "step": 191350 }, { "epoch": 4.674834485622847, "grad_norm": 7.507026748498902e-05, "learning_rate": 2.565991241690779e-08, "loss": 0.0, "num_input_tokens_seen": 128930472, "step": 191355 }, { "epoch": 4.674956636454694, "grad_norm": 0.0008368153939954937, "learning_rate": 2.564072160429831e-08, "loss": 0.0, "num_input_tokens_seen": 128933800, "step": 191360 }, { "epoch": 4.675078787286542, "grad_norm": 0.0004949255962856114, "learning_rate": 2.562153787749144e-08, "loss": 0.0, "num_input_tokens_seen": 128936936, "step": 191365 }, { "epoch": 4.6752009381183885, "grad_norm": 0.011058218777179718, "learning_rate": 2.5602361236626736e-08, "loss": 0.0, "num_input_tokens_seen": 128941032, "step": 191370 }, { "epoch": 4.675323088950236, "grad_norm": 9.234155004378408e-05, "learning_rate": 2.5583191681843973e-08, "loss": 0.0, "num_input_tokens_seen": 128944296, "step": 191375 }, { "epoch": 4.675445239782083, "grad_norm": 0.0030573757831007242, "learning_rate": 2.5564029213282157e-08, "loss": 0.0, "num_input_tokens_seen": 128947496, "step": 191380 }, { "epoch": 4.67556739061393, "grad_norm": 0.0009128560195676982, "learning_rate": 2.554487383108095e-08, "loss": 0.0, "num_input_tokens_seen": 128950568, "step": 191385 }, { "epoch": 4.675689541445777, "grad_norm": 6.758284143870696e-05, "learning_rate": 2.552572553537935e-08, "loss": 0.0, "num_input_tokens_seen": 128954408, "step": 191390 }, { "epoch": 4.675811692277624, "grad_norm": 0.0006308990996330976, "learning_rate": 2.5506584326316916e-08, "loss": 0.0, "num_input_tokens_seen": 128958248, "step": 191395 }, { "epoch": 4.675933843109472, "grad_norm": 0.00012568126840051264, "learning_rate": 2.5487450204032644e-08, "loss": 0.0, "num_input_tokens_seen": 128961320, "step": 191400 }, { "epoch": 4.676055993941318, "grad_norm": 0.00015130749670788646, "learning_rate": 2.546832316866576e-08, "loss": 0.0, "num_input_tokens_seen": 128964776, "step": 191405 }, { "epoch": 4.676178144773166, "grad_norm": 0.0007944232784211636, "learning_rate": 2.5449203220355377e-08, "loss": 0.0, "num_input_tokens_seen": 128968296, "step": 191410 }, { "epoch": 4.676300295605013, "grad_norm": 2.92979439109331e-05, "learning_rate": 2.5430090359240486e-08, "loss": 0.0, "num_input_tokens_seen": 128971752, "step": 191415 }, { "epoch": 4.6764224464368604, "grad_norm": 0.00021781650139018893, "learning_rate": 2.5410984585460203e-08, "loss": 0.0, "num_input_tokens_seen": 128974696, "step": 191420 }, { "epoch": 4.676544597268707, "grad_norm": 0.0005129770725034177, "learning_rate": 2.539188589915331e-08, "loss": 0.0001, "num_input_tokens_seen": 128978088, "step": 191425 }, { "epoch": 4.676666748100555, "grad_norm": 0.0003045414632651955, "learning_rate": 2.537279430045869e-08, "loss": 0.0, "num_input_tokens_seen": 128981288, "step": 191430 }, { "epoch": 4.676788898932402, "grad_norm": 8.265865471912548e-05, "learning_rate": 2.5353709789515344e-08, "loss": 0.0, "num_input_tokens_seen": 128985064, "step": 191435 }, { "epoch": 4.676911049764249, "grad_norm": 9.866351319942623e-05, "learning_rate": 2.5334632366461827e-08, "loss": 0.0, "num_input_tokens_seen": 128988584, "step": 191440 }, { "epoch": 4.677033200596096, "grad_norm": 0.00010443492647027597, "learning_rate": 2.5315562031437144e-08, "loss": 0.0, "num_input_tokens_seen": 128991784, "step": 191445 }, { "epoch": 4.677155351427944, "grad_norm": 0.001498854486271739, "learning_rate": 2.5296498784579845e-08, "loss": 0.0, "num_input_tokens_seen": 128995432, "step": 191450 }, { "epoch": 4.67727750225979, "grad_norm": 3.8011914966773475e-06, "learning_rate": 2.52774426260286e-08, "loss": 0.0, "num_input_tokens_seen": 128999016, "step": 191455 }, { "epoch": 4.677399653091637, "grad_norm": 5.286881059873849e-05, "learning_rate": 2.5258393555921855e-08, "loss": 0.0, "num_input_tokens_seen": 129002408, "step": 191460 }, { "epoch": 4.677521803923485, "grad_norm": 1.5034872376418207e-05, "learning_rate": 2.523935157439816e-08, "loss": 0.0, "num_input_tokens_seen": 129006248, "step": 191465 }, { "epoch": 4.677643954755332, "grad_norm": 3.237137934775092e-05, "learning_rate": 2.52203166815963e-08, "loss": 0.0, "num_input_tokens_seen": 129009640, "step": 191470 }, { "epoch": 4.677766105587179, "grad_norm": 0.00026456735213287175, "learning_rate": 2.520128887765438e-08, "loss": 0.0, "num_input_tokens_seen": 129012968, "step": 191475 }, { "epoch": 4.677888256419026, "grad_norm": 0.00042985472828149796, "learning_rate": 2.5182268162710962e-08, "loss": 0.0, "num_input_tokens_seen": 129016104, "step": 191480 }, { "epoch": 4.6780104072508735, "grad_norm": 0.0021684430539608, "learning_rate": 2.5163254536904155e-08, "loss": 0.0, "num_input_tokens_seen": 129019304, "step": 191485 }, { "epoch": 4.67813255808272, "grad_norm": 0.0005304042715579271, "learning_rate": 2.5144248000372403e-08, "loss": 0.0, "num_input_tokens_seen": 129022696, "step": 191490 }, { "epoch": 4.678254708914568, "grad_norm": 0.0005591454682871699, "learning_rate": 2.512524855325393e-08, "loss": 0.0, "num_input_tokens_seen": 129026152, "step": 191495 }, { "epoch": 4.678376859746415, "grad_norm": 1.403722581017064e-05, "learning_rate": 2.510625619568674e-08, "loss": 0.0, "num_input_tokens_seen": 129029608, "step": 191500 }, { "epoch": 4.678499010578262, "grad_norm": 0.0003742658591363579, "learning_rate": 2.5087270927809266e-08, "loss": 0.0, "num_input_tokens_seen": 129033576, "step": 191505 }, { "epoch": 4.678621161410109, "grad_norm": 0.00023465380945708603, "learning_rate": 2.506829274975919e-08, "loss": 0.0852, "num_input_tokens_seen": 129037672, "step": 191510 }, { "epoch": 4.678743312241957, "grad_norm": 0.0006855735555291176, "learning_rate": 2.504932166167484e-08, "loss": 0.0, "num_input_tokens_seen": 129041000, "step": 191515 }, { "epoch": 4.678865463073803, "grad_norm": 0.006005747709423304, "learning_rate": 2.503035766369399e-08, "loss": 0.0, "num_input_tokens_seen": 129044328, "step": 191520 }, { "epoch": 4.678987613905651, "grad_norm": 1.0378267765045166, "learning_rate": 2.5011400755954648e-08, "loss": 0.0005, "num_input_tokens_seen": 129047976, "step": 191525 }, { "epoch": 4.679109764737498, "grad_norm": 0.0018501474987715483, "learning_rate": 2.499245093859459e-08, "loss": 0.0, "num_input_tokens_seen": 129051304, "step": 191530 }, { "epoch": 4.679231915569345, "grad_norm": 2.888561721192673e-05, "learning_rate": 2.4973508211751816e-08, "loss": 0.0, "num_input_tokens_seen": 129054504, "step": 191535 }, { "epoch": 4.679354066401192, "grad_norm": 0.0003150117408949882, "learning_rate": 2.495457257556388e-08, "loss": 0.0, "num_input_tokens_seen": 129057512, "step": 191540 }, { "epoch": 4.67947621723304, "grad_norm": 4.09619024139829e-05, "learning_rate": 2.4935644030168456e-08, "loss": 0.0, "num_input_tokens_seen": 129061032, "step": 191545 }, { "epoch": 4.6795983680648865, "grad_norm": 6.424232651625061e-06, "learning_rate": 2.491672257570343e-08, "loss": 0.0, "num_input_tokens_seen": 129064296, "step": 191550 }, { "epoch": 4.679720518896733, "grad_norm": 6.473718531196937e-05, "learning_rate": 2.4897808212306026e-08, "loss": 0.0, "num_input_tokens_seen": 129067752, "step": 191555 }, { "epoch": 4.679842669728581, "grad_norm": 4.298717976780608e-05, "learning_rate": 2.4878900940114134e-08, "loss": 0.0, "num_input_tokens_seen": 129070952, "step": 191560 }, { "epoch": 4.6799648205604285, "grad_norm": 0.00011635415285127237, "learning_rate": 2.4860000759265308e-08, "loss": 0.0501, "num_input_tokens_seen": 129074280, "step": 191565 }, { "epoch": 4.680086971392275, "grad_norm": 0.0006911892560310662, "learning_rate": 2.4841107669896668e-08, "loss": 0.0, "num_input_tokens_seen": 129077672, "step": 191570 }, { "epoch": 4.680209122224122, "grad_norm": 0.00032391652348451316, "learning_rate": 2.4822221672145872e-08, "loss": 0.0, "num_input_tokens_seen": 129081384, "step": 191575 }, { "epoch": 4.68033127305597, "grad_norm": 6.961155304452404e-05, "learning_rate": 2.4803342766150036e-08, "loss": 0.0, "num_input_tokens_seen": 129084648, "step": 191580 }, { "epoch": 4.680453423887816, "grad_norm": 0.00012113929551560432, "learning_rate": 2.4784470952046722e-08, "loss": 0.0, "num_input_tokens_seen": 129087848, "step": 191585 }, { "epoch": 4.680575574719664, "grad_norm": 0.0004131891764700413, "learning_rate": 2.4765606229973034e-08, "loss": 0.0, "num_input_tokens_seen": 129091624, "step": 191590 }, { "epoch": 4.680697725551511, "grad_norm": 0.0053316219709813595, "learning_rate": 2.474674860006609e-08, "loss": 0.0, "num_input_tokens_seen": 129094504, "step": 191595 }, { "epoch": 4.680819876383358, "grad_norm": 7.62042254791595e-05, "learning_rate": 2.4727898062463226e-08, "loss": 0.0, "num_input_tokens_seen": 129098536, "step": 191600 }, { "epoch": 4.680942027215205, "grad_norm": 0.002029527211561799, "learning_rate": 2.4709054617301218e-08, "loss": 0.0, "num_input_tokens_seen": 129101800, "step": 191605 }, { "epoch": 4.681064178047053, "grad_norm": 8.223400800488889e-05, "learning_rate": 2.4690218264717398e-08, "loss": 0.0, "num_input_tokens_seen": 129105448, "step": 191610 }, { "epoch": 4.6811863288788995, "grad_norm": 0.09763093292713165, "learning_rate": 2.4671389004848663e-08, "loss": 0.0001, "num_input_tokens_seen": 129108904, "step": 191615 }, { "epoch": 4.681308479710747, "grad_norm": 0.0001553054607938975, "learning_rate": 2.465256683783179e-08, "loss": 0.0, "num_input_tokens_seen": 129112616, "step": 191620 }, { "epoch": 4.681430630542594, "grad_norm": 0.0007918269839137793, "learning_rate": 2.4633751763804e-08, "loss": 0.0, "num_input_tokens_seen": 129116264, "step": 191625 }, { "epoch": 4.6815527813744415, "grad_norm": 6.41044243820943e-05, "learning_rate": 2.461494378290174e-08, "loss": 0.0, "num_input_tokens_seen": 129119336, "step": 191630 }, { "epoch": 4.681674932206288, "grad_norm": 0.001178376143798232, "learning_rate": 2.4596142895262017e-08, "loss": 0.0488, "num_input_tokens_seen": 129122728, "step": 191635 }, { "epoch": 4.681797083038136, "grad_norm": 0.0006886956398375332, "learning_rate": 2.4577349101021495e-08, "loss": 0.0, "num_input_tokens_seen": 129126376, "step": 191640 }, { "epoch": 4.681919233869983, "grad_norm": 0.0001021117132040672, "learning_rate": 2.455856240031684e-08, "loss": 0.0, "num_input_tokens_seen": 129129256, "step": 191645 }, { "epoch": 4.682041384701829, "grad_norm": 0.0002298145991517231, "learning_rate": 2.4539782793284723e-08, "loss": 0.0, "num_input_tokens_seen": 129132648, "step": 191650 }, { "epoch": 4.682163535533677, "grad_norm": 2.295691410836298e-05, "learning_rate": 2.4521010280061592e-08, "loss": 0.0, "num_input_tokens_seen": 129136168, "step": 191655 }, { "epoch": 4.682285686365524, "grad_norm": 0.0005900713731534779, "learning_rate": 2.4502244860784115e-08, "loss": 0.0, "num_input_tokens_seen": 129139240, "step": 191660 }, { "epoch": 4.682407837197371, "grad_norm": 0.0011430694721639156, "learning_rate": 2.4483486535588628e-08, "loss": 0.0, "num_input_tokens_seen": 129142696, "step": 191665 }, { "epoch": 4.682529988029218, "grad_norm": 0.0018797010416164994, "learning_rate": 2.4464735304611682e-08, "loss": 0.0402, "num_input_tokens_seen": 129145704, "step": 191670 }, { "epoch": 4.682652138861066, "grad_norm": 0.009413032792508602, "learning_rate": 2.444599116798951e-08, "loss": 0.0, "num_input_tokens_seen": 129148904, "step": 191675 }, { "epoch": 4.6827742896929125, "grad_norm": 0.00027055476675741374, "learning_rate": 2.4427254125858444e-08, "loss": 0.0, "num_input_tokens_seen": 129152744, "step": 191680 }, { "epoch": 4.68289644052476, "grad_norm": 0.0002803192473948002, "learning_rate": 2.440852417835482e-08, "loss": 0.0, "num_input_tokens_seen": 129156264, "step": 191685 }, { "epoch": 4.683018591356607, "grad_norm": 0.00013676565140485764, "learning_rate": 2.4389801325614855e-08, "loss": 0.0, "num_input_tokens_seen": 129159912, "step": 191690 }, { "epoch": 4.6831407421884546, "grad_norm": 9.485345799475908e-05, "learning_rate": 2.4371085567774676e-08, "loss": 0.0, "num_input_tokens_seen": 129163112, "step": 191695 }, { "epoch": 4.683262893020301, "grad_norm": 0.0976140946149826, "learning_rate": 2.4352376904970275e-08, "loss": 0.0348, "num_input_tokens_seen": 129166120, "step": 191700 }, { "epoch": 4.683385043852149, "grad_norm": 0.00010639366519171745, "learning_rate": 2.4333675337337876e-08, "loss": 0.0, "num_input_tokens_seen": 129169576, "step": 191705 }, { "epoch": 4.683507194683996, "grad_norm": 0.33715274930000305, "learning_rate": 2.431498086501338e-08, "loss": 0.0001, "num_input_tokens_seen": 129173160, "step": 191710 }, { "epoch": 4.683629345515843, "grad_norm": 0.0008040166576392949, "learning_rate": 2.429629348813278e-08, "loss": 0.0, "num_input_tokens_seen": 129176424, "step": 191715 }, { "epoch": 4.68375149634769, "grad_norm": 0.007210198789834976, "learning_rate": 2.427761320683208e-08, "loss": 0.0, "num_input_tokens_seen": 129179880, "step": 191720 }, { "epoch": 4.683873647179537, "grad_norm": 1.565586899232585e-05, "learning_rate": 2.4258940021246842e-08, "loss": 0.0, "num_input_tokens_seen": 129183656, "step": 191725 }, { "epoch": 4.683995798011384, "grad_norm": 0.0002123810991179198, "learning_rate": 2.4240273931513176e-08, "loss": 0.0, "num_input_tokens_seen": 129187112, "step": 191730 }, { "epoch": 4.684117948843232, "grad_norm": 0.0012347750598564744, "learning_rate": 2.4221614937766643e-08, "loss": 0.0, "num_input_tokens_seen": 129190376, "step": 191735 }, { "epoch": 4.684240099675079, "grad_norm": 0.0008376938058063388, "learning_rate": 2.420296304014291e-08, "loss": 0.0, "num_input_tokens_seen": 129193576, "step": 191740 }, { "epoch": 4.684362250506926, "grad_norm": 0.004786266479641199, "learning_rate": 2.4184318238777756e-08, "loss": 0.0, "num_input_tokens_seen": 129196584, "step": 191745 }, { "epoch": 4.684484401338773, "grad_norm": 0.005261608865112066, "learning_rate": 2.4165680533806632e-08, "loss": 0.0, "num_input_tokens_seen": 129200104, "step": 191750 }, { "epoch": 4.68460655217062, "grad_norm": 0.006853157188743353, "learning_rate": 2.4147049925365314e-08, "loss": 0.0, "num_input_tokens_seen": 129203688, "step": 191755 }, { "epoch": 4.684728703002468, "grad_norm": 0.0003582743520382792, "learning_rate": 2.412842641358892e-08, "loss": 0.0265, "num_input_tokens_seen": 129207080, "step": 191760 }, { "epoch": 4.684850853834314, "grad_norm": 0.0009606487583369017, "learning_rate": 2.410980999861323e-08, "loss": 0.0, "num_input_tokens_seen": 129210152, "step": 191765 }, { "epoch": 4.684973004666162, "grad_norm": 0.00018830588669516146, "learning_rate": 2.4091200680573352e-08, "loss": 0.0, "num_input_tokens_seen": 129213608, "step": 191770 }, { "epoch": 4.685095155498009, "grad_norm": 0.00023918184160720557, "learning_rate": 2.4072598459604743e-08, "loss": 0.0, "num_input_tokens_seen": 129217384, "step": 191775 }, { "epoch": 4.685217306329856, "grad_norm": 0.00038536760257557034, "learning_rate": 2.4054003335842842e-08, "loss": 0.0001, "num_input_tokens_seen": 129220840, "step": 191780 }, { "epoch": 4.685339457161703, "grad_norm": 0.0007668939069844782, "learning_rate": 2.4035415309422657e-08, "loss": 0.0, "num_input_tokens_seen": 129223912, "step": 191785 }, { "epoch": 4.685461607993551, "grad_norm": 1.1869547961396165e-05, "learning_rate": 2.401683438047941e-08, "loss": 0.0, "num_input_tokens_seen": 129227304, "step": 191790 }, { "epoch": 4.6855837588253975, "grad_norm": 0.0011348174884915352, "learning_rate": 2.399826054914822e-08, "loss": 0.0, "num_input_tokens_seen": 129230824, "step": 191795 }, { "epoch": 4.685705909657245, "grad_norm": 0.001253793016076088, "learning_rate": 2.3979693815564305e-08, "loss": 0.0, "num_input_tokens_seen": 129234024, "step": 191800 }, { "epoch": 4.685828060489092, "grad_norm": 0.11175274103879929, "learning_rate": 2.3961134179862564e-08, "loss": 0.0, "num_input_tokens_seen": 129237288, "step": 191805 }, { "epoch": 4.6859502113209395, "grad_norm": 0.000316588586429134, "learning_rate": 2.3942581642177884e-08, "loss": 0.0, "num_input_tokens_seen": 129240680, "step": 191810 }, { "epoch": 4.686072362152786, "grad_norm": 5.7418455980950966e-05, "learning_rate": 2.392403620264538e-08, "loss": 0.0, "num_input_tokens_seen": 129243880, "step": 191815 }, { "epoch": 4.686194512984633, "grad_norm": 3.4552344914118294e-06, "learning_rate": 2.3905497861399616e-08, "loss": 0.0, "num_input_tokens_seen": 129247592, "step": 191820 }, { "epoch": 4.686316663816481, "grad_norm": 0.1387357860803604, "learning_rate": 2.388696661857581e-08, "loss": 0.0, "num_input_tokens_seen": 129250600, "step": 191825 }, { "epoch": 4.686438814648328, "grad_norm": 0.004403110593557358, "learning_rate": 2.3868442474308524e-08, "loss": 0.0, "num_input_tokens_seen": 129253672, "step": 191830 }, { "epoch": 4.686560965480175, "grad_norm": 9.900266013573855e-05, "learning_rate": 2.384992542873243e-08, "loss": 0.0, "num_input_tokens_seen": 129256808, "step": 191835 }, { "epoch": 4.686683116312022, "grad_norm": 7.538765203207731e-05, "learning_rate": 2.3831415481982198e-08, "loss": 0.0, "num_input_tokens_seen": 129259944, "step": 191840 }, { "epoch": 4.686805267143869, "grad_norm": 0.0011306103551760316, "learning_rate": 2.3812912634192495e-08, "loss": 0.0, "num_input_tokens_seen": 129263336, "step": 191845 }, { "epoch": 4.686927417975716, "grad_norm": 0.0007455127197317779, "learning_rate": 2.379441688549788e-08, "loss": 0.0246, "num_input_tokens_seen": 129266408, "step": 191850 }, { "epoch": 4.687049568807564, "grad_norm": 0.0007126157288439572, "learning_rate": 2.3775928236032806e-08, "loss": 0.0, "num_input_tokens_seen": 129269928, "step": 191855 }, { "epoch": 4.6871717196394105, "grad_norm": 0.00044742575846612453, "learning_rate": 2.3757446685931826e-08, "loss": 0.0, "num_input_tokens_seen": 129273256, "step": 191860 }, { "epoch": 4.687293870471258, "grad_norm": 6.619530176976696e-05, "learning_rate": 2.3738972235329168e-08, "loss": 0.0024, "num_input_tokens_seen": 129276200, "step": 191865 }, { "epoch": 4.687416021303105, "grad_norm": 1.003375382424565e-05, "learning_rate": 2.3720504884359282e-08, "loss": 0.0, "num_input_tokens_seen": 129279400, "step": 191870 }, { "epoch": 4.6875381721349525, "grad_norm": 0.0061368816532194614, "learning_rate": 2.3702044633156503e-08, "loss": 0.0, "num_input_tokens_seen": 129282664, "step": 191875 }, { "epoch": 4.687660322966799, "grad_norm": 0.00020990378106944263, "learning_rate": 2.3683591481855058e-08, "loss": 0.0, "num_input_tokens_seen": 129285864, "step": 191880 }, { "epoch": 4.687782473798647, "grad_norm": 0.0008555403328500688, "learning_rate": 2.3665145430589173e-08, "loss": 0.0, "num_input_tokens_seen": 129289128, "step": 191885 }, { "epoch": 4.687904624630494, "grad_norm": 0.00013554960605688393, "learning_rate": 2.364670647949285e-08, "loss": 0.0, "num_input_tokens_seen": 129292008, "step": 191890 }, { "epoch": 4.688026775462341, "grad_norm": 5.329174382495694e-05, "learning_rate": 2.3628274628700318e-08, "loss": 0.0, "num_input_tokens_seen": 129295336, "step": 191895 }, { "epoch": 4.688148926294188, "grad_norm": 21.01401710510254, "learning_rate": 2.3609849878345577e-08, "loss": 0.0185, "num_input_tokens_seen": 129298984, "step": 191900 }, { "epoch": 4.688271077126036, "grad_norm": 0.0005068847676739097, "learning_rate": 2.3591432228562634e-08, "loss": 0.0, "num_input_tokens_seen": 129302824, "step": 191905 }, { "epoch": 4.688393227957882, "grad_norm": 0.004574902355670929, "learning_rate": 2.3573021679485495e-08, "loss": 0.0, "num_input_tokens_seen": 129306408, "step": 191910 }, { "epoch": 4.688515378789729, "grad_norm": 0.0023982757702469826, "learning_rate": 2.3554618231247934e-08, "loss": 0.0, "num_input_tokens_seen": 129309992, "step": 191915 }, { "epoch": 4.688637529621577, "grad_norm": 0.001703580841422081, "learning_rate": 2.3536221883983854e-08, "loss": 0.0001, "num_input_tokens_seen": 129313576, "step": 191920 }, { "epoch": 4.6887596804534235, "grad_norm": 0.000909437658265233, "learning_rate": 2.3517832637826806e-08, "loss": 0.0, "num_input_tokens_seen": 129317160, "step": 191925 }, { "epoch": 4.688881831285271, "grad_norm": 0.00022461153275799006, "learning_rate": 2.349945049291091e-08, "loss": 0.0, "num_input_tokens_seen": 129320232, "step": 191930 }, { "epoch": 4.689003982117118, "grad_norm": 1.9985896869911812e-05, "learning_rate": 2.3481075449369614e-08, "loss": 0.0, "num_input_tokens_seen": 129323112, "step": 191935 }, { "epoch": 4.6891261329489655, "grad_norm": 0.0005150790675543249, "learning_rate": 2.34627075073367e-08, "loss": 0.0022, "num_input_tokens_seen": 129326504, "step": 191940 }, { "epoch": 4.689248283780812, "grad_norm": 0.00022479926701635122, "learning_rate": 2.3444346666945503e-08, "loss": 0.0, "num_input_tokens_seen": 129329512, "step": 191945 }, { "epoch": 4.68937043461266, "grad_norm": 1.1208144314878155e-05, "learning_rate": 2.3425992928329695e-08, "loss": 0.0, "num_input_tokens_seen": 129333480, "step": 191950 }, { "epoch": 4.689492585444507, "grad_norm": 4.2835341446334496e-05, "learning_rate": 2.340764629162284e-08, "loss": 0.0, "num_input_tokens_seen": 129336872, "step": 191955 }, { "epoch": 4.689614736276354, "grad_norm": 0.0007139771478250623, "learning_rate": 2.338930675695805e-08, "loss": 0.0, "num_input_tokens_seen": 129340072, "step": 191960 }, { "epoch": 4.689736887108201, "grad_norm": 9.58927339524962e-05, "learning_rate": 2.3370974324468997e-08, "loss": 0.0, "num_input_tokens_seen": 129343848, "step": 191965 }, { "epoch": 4.689859037940049, "grad_norm": 7.770962110953405e-05, "learning_rate": 2.3352648994288905e-08, "loss": 0.0224, "num_input_tokens_seen": 129347368, "step": 191970 }, { "epoch": 4.689981188771895, "grad_norm": 0.00241994415409863, "learning_rate": 2.3334330766551002e-08, "loss": 0.0, "num_input_tokens_seen": 129350888, "step": 191975 }, { "epoch": 4.690103339603743, "grad_norm": 0.00035448124981485307, "learning_rate": 2.331601964138863e-08, "loss": 0.0, "num_input_tokens_seen": 129354728, "step": 191980 }, { "epoch": 4.69022549043559, "grad_norm": 0.0003735190839506686, "learning_rate": 2.329771561893479e-08, "loss": 0.0, "num_input_tokens_seen": 129357864, "step": 191985 }, { "epoch": 4.690347641267437, "grad_norm": 0.0018810693873092532, "learning_rate": 2.3279418699322594e-08, "loss": 0.0, "num_input_tokens_seen": 129361832, "step": 191990 }, { "epoch": 4.690469792099284, "grad_norm": 0.00018536254356149584, "learning_rate": 2.3261128882685275e-08, "loss": 0.0, "num_input_tokens_seen": 129365096, "step": 191995 }, { "epoch": 4.690591942931132, "grad_norm": 0.01687249168753624, "learning_rate": 2.3242846169155728e-08, "loss": 0.0, "num_input_tokens_seen": 129368872, "step": 192000 }, { "epoch": 4.6907140937629785, "grad_norm": 0.00037364265881478786, "learning_rate": 2.3224570558866952e-08, "loss": 0.0, "num_input_tokens_seen": 129371944, "step": 192005 }, { "epoch": 4.690836244594825, "grad_norm": 0.00013020077312830836, "learning_rate": 2.320630205195173e-08, "loss": 0.0, "num_input_tokens_seen": 129375272, "step": 192010 }, { "epoch": 4.690958395426673, "grad_norm": 0.00023853543098084629, "learning_rate": 2.3188040648543073e-08, "loss": 0.0, "num_input_tokens_seen": 129378792, "step": 192015 }, { "epoch": 4.69108054625852, "grad_norm": 0.01277604978531599, "learning_rate": 2.3169786348773644e-08, "loss": 0.0, "num_input_tokens_seen": 129382568, "step": 192020 }, { "epoch": 4.691202697090367, "grad_norm": 0.000663110229652375, "learning_rate": 2.3151539152776345e-08, "loss": 0.0, "num_input_tokens_seen": 129385832, "step": 192025 }, { "epoch": 4.691324847922214, "grad_norm": 0.0010855343425646424, "learning_rate": 2.3133299060683732e-08, "loss": 0.0, "num_input_tokens_seen": 129389224, "step": 192030 }, { "epoch": 4.691446998754062, "grad_norm": 0.00010959253268083557, "learning_rate": 2.3115066072628585e-08, "loss": 0.0, "num_input_tokens_seen": 129392424, "step": 192035 }, { "epoch": 4.691569149585908, "grad_norm": 0.0001751628442434594, "learning_rate": 2.309684018874336e-08, "loss": 0.0, "num_input_tokens_seen": 129395880, "step": 192040 }, { "epoch": 4.691691300417756, "grad_norm": 0.0004285105096641928, "learning_rate": 2.3078621409160727e-08, "loss": 0.0, "num_input_tokens_seen": 129399464, "step": 192045 }, { "epoch": 4.691813451249603, "grad_norm": 0.0003401923459023237, "learning_rate": 2.306040973401313e-08, "loss": 0.0, "num_input_tokens_seen": 129402792, "step": 192050 }, { "epoch": 4.69193560208145, "grad_norm": 0.021882670000195503, "learning_rate": 2.3042205163432914e-08, "loss": 0.0675, "num_input_tokens_seen": 129406248, "step": 192055 }, { "epoch": 4.692057752913297, "grad_norm": 0.0002050148177659139, "learning_rate": 2.302400769755264e-08, "loss": 0.0, "num_input_tokens_seen": 129409640, "step": 192060 }, { "epoch": 4.692179903745145, "grad_norm": 0.00010289058991475031, "learning_rate": 2.300581733650453e-08, "loss": 0.0, "num_input_tokens_seen": 129413224, "step": 192065 }, { "epoch": 4.692302054576992, "grad_norm": 0.00038634383236058056, "learning_rate": 2.2987634080420815e-08, "loss": 0.0, "num_input_tokens_seen": 129416808, "step": 192070 }, { "epoch": 4.692424205408839, "grad_norm": 0.00010601503163343295, "learning_rate": 2.2969457929433946e-08, "loss": 0.0, "num_input_tokens_seen": 129420712, "step": 192075 }, { "epoch": 4.692546356240686, "grad_norm": 0.001418996136635542, "learning_rate": 2.295128888367581e-08, "loss": 0.0, "num_input_tokens_seen": 129424680, "step": 192080 }, { "epoch": 4.692668507072533, "grad_norm": 0.00041618754039518535, "learning_rate": 2.2933126943278758e-08, "loss": 0.0, "num_input_tokens_seen": 129428072, "step": 192085 }, { "epoch": 4.69279065790438, "grad_norm": 3.8680613215547055e-05, "learning_rate": 2.2914972108374896e-08, "loss": 0.0, "num_input_tokens_seen": 129431528, "step": 192090 }, { "epoch": 4.692912808736228, "grad_norm": 0.000333454052451998, "learning_rate": 2.2896824379096014e-08, "loss": 0.0, "num_input_tokens_seen": 129434984, "step": 192095 }, { "epoch": 4.693034959568075, "grad_norm": 0.00013087606930639595, "learning_rate": 2.2878683755574446e-08, "loss": 0.0, "num_input_tokens_seen": 129438568, "step": 192100 }, { "epoch": 4.6931571103999215, "grad_norm": 0.002510525519028306, "learning_rate": 2.2860550237941644e-08, "loss": 0.0, "num_input_tokens_seen": 129442408, "step": 192105 }, { "epoch": 4.693279261231769, "grad_norm": 0.0001203200445161201, "learning_rate": 2.284242382632995e-08, "loss": 0.0, "num_input_tokens_seen": 129445736, "step": 192110 }, { "epoch": 4.693401412063616, "grad_norm": 0.0006897970451973379, "learning_rate": 2.2824304520870808e-08, "loss": 0.0, "num_input_tokens_seen": 129448808, "step": 192115 }, { "epoch": 4.6935235628954635, "grad_norm": 0.0007370402454398572, "learning_rate": 2.2806192321696225e-08, "loss": 0.0001, "num_input_tokens_seen": 129452136, "step": 192120 }, { "epoch": 4.69364571372731, "grad_norm": 6.168057007016614e-05, "learning_rate": 2.278808722893788e-08, "loss": 0.0, "num_input_tokens_seen": 129455464, "step": 192125 }, { "epoch": 4.693767864559158, "grad_norm": 0.00012389298353809863, "learning_rate": 2.2769989242727328e-08, "loss": 0.0, "num_input_tokens_seen": 129458472, "step": 192130 }, { "epoch": 4.693890015391005, "grad_norm": 6.390651105903089e-05, "learning_rate": 2.2751898363196354e-08, "loss": 0.0, "num_input_tokens_seen": 129462120, "step": 192135 }, { "epoch": 4.694012166222852, "grad_norm": 0.07547641545534134, "learning_rate": 2.273381459047641e-08, "loss": 0.0, "num_input_tokens_seen": 129465832, "step": 192140 }, { "epoch": 4.694134317054699, "grad_norm": 0.9471267461776733, "learning_rate": 2.271573792469905e-08, "loss": 0.0707, "num_input_tokens_seen": 129468968, "step": 192145 }, { "epoch": 4.694256467886547, "grad_norm": 0.0007499286439269781, "learning_rate": 2.2697668365995514e-08, "loss": 0.0, "num_input_tokens_seen": 129472296, "step": 192150 }, { "epoch": 4.694378618718393, "grad_norm": 0.00041503115789964795, "learning_rate": 2.2679605914497578e-08, "loss": 0.0002, "num_input_tokens_seen": 129475368, "step": 192155 }, { "epoch": 4.694500769550241, "grad_norm": 0.00028109681443311274, "learning_rate": 2.2661550570336473e-08, "loss": 0.0, "num_input_tokens_seen": 129478696, "step": 192160 }, { "epoch": 4.694622920382088, "grad_norm": 5.857786527485587e-05, "learning_rate": 2.2643502333643205e-08, "loss": 0.0, "num_input_tokens_seen": 129482088, "step": 192165 }, { "epoch": 4.694745071213935, "grad_norm": 3.311472391942516e-05, "learning_rate": 2.2625461204549444e-08, "loss": 0.0, "num_input_tokens_seen": 129485416, "step": 192170 }, { "epoch": 4.694867222045782, "grad_norm": 0.0005333948647603393, "learning_rate": 2.26074271831862e-08, "loss": 0.0, "num_input_tokens_seen": 129488808, "step": 192175 }, { "epoch": 4.694989372877629, "grad_norm": 0.0011380622163414955, "learning_rate": 2.2589400269684477e-08, "loss": 0.0359, "num_input_tokens_seen": 129492200, "step": 192180 }, { "epoch": 4.6951115237094765, "grad_norm": 0.006969318725168705, "learning_rate": 2.2571380464175725e-08, "loss": 0.0, "num_input_tokens_seen": 129495464, "step": 192185 }, { "epoch": 4.695233674541324, "grad_norm": 0.0009833202930167317, "learning_rate": 2.2553367766790622e-08, "loss": 0.0, "num_input_tokens_seen": 129499112, "step": 192190 }, { "epoch": 4.695355825373171, "grad_norm": 0.004456230904906988, "learning_rate": 2.25353621776605e-08, "loss": 0.0, "num_input_tokens_seen": 129502824, "step": 192195 }, { "epoch": 4.695477976205018, "grad_norm": 3.732401455636136e-05, "learning_rate": 2.2517363696916037e-08, "loss": 0.0667, "num_input_tokens_seen": 129506088, "step": 192200 }, { "epoch": 4.695600127036865, "grad_norm": 0.000706669467035681, "learning_rate": 2.2499372324688125e-08, "loss": 0.0, "num_input_tokens_seen": 129509352, "step": 192205 }, { "epoch": 4.695722277868712, "grad_norm": 0.00011496250226628035, "learning_rate": 2.2481388061107888e-08, "loss": 0.0029, "num_input_tokens_seen": 129513064, "step": 192210 }, { "epoch": 4.69584442870056, "grad_norm": 20.1938533782959, "learning_rate": 2.2463410906305768e-08, "loss": 0.0354, "num_input_tokens_seen": 129516264, "step": 192215 }, { "epoch": 4.695966579532406, "grad_norm": 0.0046359761618077755, "learning_rate": 2.2445440860412777e-08, "loss": 0.0, "num_input_tokens_seen": 129519976, "step": 192220 }, { "epoch": 4.696088730364254, "grad_norm": 0.0020351947750896215, "learning_rate": 2.242747792355937e-08, "loss": 0.0002, "num_input_tokens_seen": 129523816, "step": 192225 }, { "epoch": 4.696210881196101, "grad_norm": 0.0008622038294561207, "learning_rate": 2.240952209587632e-08, "loss": 0.0, "num_input_tokens_seen": 129527080, "step": 192230 }, { "epoch": 4.696333032027948, "grad_norm": 4.840063866140554e-06, "learning_rate": 2.239157337749409e-08, "loss": 0.0, "num_input_tokens_seen": 129530408, "step": 192235 }, { "epoch": 4.696455182859795, "grad_norm": 0.001740322564728558, "learning_rate": 2.2373631768543344e-08, "loss": 0.0, "num_input_tokens_seen": 129533416, "step": 192240 }, { "epoch": 4.696577333691643, "grad_norm": 0.001596541260369122, "learning_rate": 2.2355697269154537e-08, "loss": 0.0, "num_input_tokens_seen": 129537064, "step": 192245 }, { "epoch": 4.6966994845234895, "grad_norm": 0.00013007610687054694, "learning_rate": 2.2337769879458014e-08, "loss": 0.0, "num_input_tokens_seen": 129540264, "step": 192250 }, { "epoch": 4.696821635355337, "grad_norm": 6.054230107110925e-05, "learning_rate": 2.231984959958422e-08, "loss": 0.0, "num_input_tokens_seen": 129544104, "step": 192255 }, { "epoch": 4.696943786187184, "grad_norm": 0.00020400869834702462, "learning_rate": 2.230193642966338e-08, "loss": 0.0, "num_input_tokens_seen": 129547176, "step": 192260 }, { "epoch": 4.6970659370190315, "grad_norm": 0.004991667345166206, "learning_rate": 2.2284030369825956e-08, "loss": 0.0, "num_input_tokens_seen": 129550056, "step": 192265 }, { "epoch": 4.697188087850878, "grad_norm": 0.0008958621765486896, "learning_rate": 2.226613142020195e-08, "loss": 0.0004, "num_input_tokens_seen": 129553448, "step": 192270 }, { "epoch": 4.697310238682725, "grad_norm": 0.0023038180079311132, "learning_rate": 2.2248239580921478e-08, "loss": 0.0, "num_input_tokens_seen": 129556968, "step": 192275 }, { "epoch": 4.697432389514573, "grad_norm": 0.0013945907121524215, "learning_rate": 2.2230354852114998e-08, "loss": 0.0, "num_input_tokens_seen": 129560296, "step": 192280 }, { "epoch": 4.697554540346419, "grad_norm": 0.004303370136767626, "learning_rate": 2.2212477233912285e-08, "loss": 0.0, "num_input_tokens_seen": 129564136, "step": 192285 }, { "epoch": 4.697676691178267, "grad_norm": 0.00010136087803402916, "learning_rate": 2.2194606726443465e-08, "loss": 0.0, "num_input_tokens_seen": 129567528, "step": 192290 }, { "epoch": 4.697798842010114, "grad_norm": 0.08520669490098953, "learning_rate": 2.2176743329838433e-08, "loss": 0.0, "num_input_tokens_seen": 129570984, "step": 192295 }, { "epoch": 4.697920992841961, "grad_norm": 0.00013460438640322536, "learning_rate": 2.215888704422708e-08, "loss": 0.0, "num_input_tokens_seen": 129573992, "step": 192300 }, { "epoch": 4.698043143673808, "grad_norm": 7.838014425942674e-05, "learning_rate": 2.214103786973931e-08, "loss": 0.0, "num_input_tokens_seen": 129578472, "step": 192305 }, { "epoch": 4.698165294505656, "grad_norm": 0.00031517792376689613, "learning_rate": 2.2123195806505013e-08, "loss": 0.0, "num_input_tokens_seen": 129581544, "step": 192310 }, { "epoch": 4.6982874453375025, "grad_norm": 0.001368136378005147, "learning_rate": 2.2105360854653865e-08, "loss": 0.0, "num_input_tokens_seen": 129584744, "step": 192315 }, { "epoch": 4.69840959616935, "grad_norm": 0.000132130560814403, "learning_rate": 2.2087533014315428e-08, "loss": 0.0, "num_input_tokens_seen": 129588072, "step": 192320 }, { "epoch": 4.698531747001197, "grad_norm": 0.011298295110464096, "learning_rate": 2.2069712285619602e-08, "loss": 0.0, "num_input_tokens_seen": 129591016, "step": 192325 }, { "epoch": 4.6986538978330445, "grad_norm": 0.00041692276136018336, "learning_rate": 2.2051898668695724e-08, "loss": 0.0001, "num_input_tokens_seen": 129594536, "step": 192330 }, { "epoch": 4.698776048664891, "grad_norm": 4.01533288822975e-05, "learning_rate": 2.203409216367358e-08, "loss": 0.0, "num_input_tokens_seen": 129597928, "step": 192335 }, { "epoch": 4.698898199496739, "grad_norm": 0.0007659096154384315, "learning_rate": 2.201629277068251e-08, "loss": 0.0, "num_input_tokens_seen": 129600872, "step": 192340 }, { "epoch": 4.699020350328586, "grad_norm": 0.0030560491140931845, "learning_rate": 2.1998500489852077e-08, "loss": 0.0, "num_input_tokens_seen": 129604776, "step": 192345 }, { "epoch": 4.699142501160432, "grad_norm": 0.00016910966951400042, "learning_rate": 2.1980715321311515e-08, "loss": 0.0, "num_input_tokens_seen": 129607848, "step": 192350 }, { "epoch": 4.69926465199228, "grad_norm": 1.6891659470275044e-05, "learning_rate": 2.1962937265190385e-08, "loss": 0.0, "num_input_tokens_seen": 129611240, "step": 192355 }, { "epoch": 4.699386802824128, "grad_norm": 0.00044170417822897434, "learning_rate": 2.194516632161769e-08, "loss": 0.0, "num_input_tokens_seen": 129614504, "step": 192360 }, { "epoch": 4.699508953655974, "grad_norm": 0.0006980904727242887, "learning_rate": 2.192740249072289e-08, "loss": 0.0, "num_input_tokens_seen": 129618088, "step": 192365 }, { "epoch": 4.699631104487821, "grad_norm": 2.8264948923606426e-05, "learning_rate": 2.1909645772634988e-08, "loss": 0.0, "num_input_tokens_seen": 129621608, "step": 192370 }, { "epoch": 4.699753255319669, "grad_norm": 0.0015524571062996984, "learning_rate": 2.1891896167483327e-08, "loss": 0.0, "num_input_tokens_seen": 129625192, "step": 192375 }, { "epoch": 4.6998754061515156, "grad_norm": 0.0006360540282912552, "learning_rate": 2.1874153675396802e-08, "loss": 0.0, "num_input_tokens_seen": 129628328, "step": 192380 }, { "epoch": 4.699997556983363, "grad_norm": 0.00010307029151590541, "learning_rate": 2.1856418296504642e-08, "loss": 0.0, "num_input_tokens_seen": 129631720, "step": 192385 }, { "epoch": 4.70011970781521, "grad_norm": 0.00037360982969403267, "learning_rate": 2.1838690030935524e-08, "loss": 0.0, "num_input_tokens_seen": 129635880, "step": 192390 }, { "epoch": 4.700241858647058, "grad_norm": 0.001626503886654973, "learning_rate": 2.1820968878818567e-08, "loss": 0.0, "num_input_tokens_seen": 129639144, "step": 192395 }, { "epoch": 4.700364009478904, "grad_norm": 0.00037940763286314905, "learning_rate": 2.180325484028278e-08, "loss": 0.0, "num_input_tokens_seen": 129642344, "step": 192400 }, { "epoch": 4.700486160310752, "grad_norm": 0.0007786820060573518, "learning_rate": 2.1785547915456727e-08, "loss": 0.0, "num_input_tokens_seen": 129645416, "step": 192405 }, { "epoch": 4.700608311142599, "grad_norm": 3.812544309766963e-05, "learning_rate": 2.17678481044693e-08, "loss": 0.0, "num_input_tokens_seen": 129649128, "step": 192410 }, { "epoch": 4.700730461974446, "grad_norm": 5.064254582975991e-05, "learning_rate": 2.1750155407449178e-08, "loss": 0.0, "num_input_tokens_seen": 129652456, "step": 192415 }, { "epoch": 4.700852612806293, "grad_norm": 0.006544878240674734, "learning_rate": 2.1732469824525035e-08, "loss": 0.0, "num_input_tokens_seen": 129655464, "step": 192420 }, { "epoch": 4.700974763638141, "grad_norm": 0.0009501032182015479, "learning_rate": 2.1714791355825434e-08, "loss": 0.0, "num_input_tokens_seen": 129659112, "step": 192425 }, { "epoch": 4.7010969144699875, "grad_norm": 0.00031887463410384953, "learning_rate": 2.1697120001479053e-08, "loss": 0.0, "num_input_tokens_seen": 129662888, "step": 192430 }, { "epoch": 4.701219065301835, "grad_norm": 0.00013029456022195518, "learning_rate": 2.167945576161434e-08, "loss": 0.0, "num_input_tokens_seen": 129666216, "step": 192435 }, { "epoch": 4.701341216133682, "grad_norm": 7.15162095730193e-05, "learning_rate": 2.166179863635975e-08, "loss": 0.017, "num_input_tokens_seen": 129669608, "step": 192440 }, { "epoch": 4.701463366965529, "grad_norm": 0.021288808435201645, "learning_rate": 2.164414862584385e-08, "loss": 0.0, "num_input_tokens_seen": 129672744, "step": 192445 }, { "epoch": 4.701585517797376, "grad_norm": 0.00013626010331790894, "learning_rate": 2.1626505730194645e-08, "loss": 0.0, "num_input_tokens_seen": 129676648, "step": 192450 }, { "epoch": 4.701707668629224, "grad_norm": 0.0007354624103754759, "learning_rate": 2.1608869949540808e-08, "loss": 0.0, "num_input_tokens_seen": 129680104, "step": 192455 }, { "epoch": 4.701829819461071, "grad_norm": 7.168312731664628e-05, "learning_rate": 2.1591241284010242e-08, "loss": 0.0724, "num_input_tokens_seen": 129683304, "step": 192460 }, { "epoch": 4.701951970292917, "grad_norm": 4.4334596168482676e-05, "learning_rate": 2.1573619733731507e-08, "loss": 0.0002, "num_input_tokens_seen": 129686312, "step": 192465 }, { "epoch": 4.702074121124765, "grad_norm": 0.0033819645177572966, "learning_rate": 2.1556005298832502e-08, "loss": 0.0, "num_input_tokens_seen": 129690408, "step": 192470 }, { "epoch": 4.702196271956612, "grad_norm": 0.000799092638771981, "learning_rate": 2.1538397979441348e-08, "loss": 0.0, "num_input_tokens_seen": 129693800, "step": 192475 }, { "epoch": 4.702318422788459, "grad_norm": 2.3778502509230748e-05, "learning_rate": 2.1520797775686273e-08, "loss": 0.0, "num_input_tokens_seen": 129697064, "step": 192480 }, { "epoch": 4.702440573620306, "grad_norm": 0.0001850874105002731, "learning_rate": 2.1503204687694952e-08, "loss": 0.0, "num_input_tokens_seen": 129700264, "step": 192485 }, { "epoch": 4.702562724452154, "grad_norm": 0.00020812089496757835, "learning_rate": 2.148561871559562e-08, "loss": 0.0005, "num_input_tokens_seen": 129703400, "step": 192490 }, { "epoch": 4.7026848752840005, "grad_norm": 0.00045550725189968944, "learning_rate": 2.1468039859516062e-08, "loss": 0.0, "num_input_tokens_seen": 129706600, "step": 192495 }, { "epoch": 4.702807026115848, "grad_norm": 4.641317354980856e-05, "learning_rate": 2.1450468119584066e-08, "loss": 0.0, "num_input_tokens_seen": 129710056, "step": 192500 }, { "epoch": 4.702929176947695, "grad_norm": 0.00022677636297885329, "learning_rate": 2.1432903495927523e-08, "loss": 0.0, "num_input_tokens_seen": 129713576, "step": 192505 }, { "epoch": 4.7030513277795425, "grad_norm": 0.0009053811081685126, "learning_rate": 2.1415345988674006e-08, "loss": 0.0, "num_input_tokens_seen": 129716840, "step": 192510 }, { "epoch": 4.703173478611389, "grad_norm": 0.00039823848055675626, "learning_rate": 2.1397795597951406e-08, "loss": 0.0, "num_input_tokens_seen": 129720040, "step": 192515 }, { "epoch": 4.703295629443237, "grad_norm": 0.00016683421563357115, "learning_rate": 2.1380252323887182e-08, "loss": 0.0, "num_input_tokens_seen": 129723048, "step": 192520 }, { "epoch": 4.703417780275084, "grad_norm": 0.00031939937616698444, "learning_rate": 2.1362716166609008e-08, "loss": 0.0, "num_input_tokens_seen": 129726760, "step": 192525 }, { "epoch": 4.703539931106931, "grad_norm": 0.0009505918715149164, "learning_rate": 2.1345187126244335e-08, "loss": 0.0, "num_input_tokens_seen": 129730088, "step": 192530 }, { "epoch": 4.703662081938778, "grad_norm": 0.001424207934178412, "learning_rate": 2.1327665202920732e-08, "loss": 0.0, "num_input_tokens_seen": 129733544, "step": 192535 }, { "epoch": 4.703784232770625, "grad_norm": 0.0001100384906749241, "learning_rate": 2.1310150396765646e-08, "loss": 0.0, "num_input_tokens_seen": 129736424, "step": 192540 }, { "epoch": 4.703906383602472, "grad_norm": 0.00031706338631920516, "learning_rate": 2.1292642707906316e-08, "loss": 0.0, "num_input_tokens_seen": 129739432, "step": 192545 }, { "epoch": 4.704028534434319, "grad_norm": 0.01422285009175539, "learning_rate": 2.127514213647008e-08, "loss": 0.0, "num_input_tokens_seen": 129743336, "step": 192550 }, { "epoch": 4.704150685266167, "grad_norm": 3.073379775742069e-05, "learning_rate": 2.1257648682584284e-08, "loss": 0.0, "num_input_tokens_seen": 129747112, "step": 192555 }, { "epoch": 4.7042728360980135, "grad_norm": 0.00014186625776346773, "learning_rate": 2.1240162346376266e-08, "loss": 0.0, "num_input_tokens_seen": 129750504, "step": 192560 }, { "epoch": 4.704394986929861, "grad_norm": 4.058052581967786e-05, "learning_rate": 2.1222683127972817e-08, "loss": 0.0, "num_input_tokens_seen": 129753640, "step": 192565 }, { "epoch": 4.704517137761708, "grad_norm": 0.12598304450511932, "learning_rate": 2.120521102750139e-08, "loss": 0.0001, "num_input_tokens_seen": 129757160, "step": 192570 }, { "epoch": 4.7046392885935555, "grad_norm": 0.0008097590180113912, "learning_rate": 2.1187746045088996e-08, "loss": 0.0, "num_input_tokens_seen": 129760296, "step": 192575 }, { "epoch": 4.704761439425402, "grad_norm": 0.00015910847287159413, "learning_rate": 2.1170288180862528e-08, "loss": 0.0, "num_input_tokens_seen": 129763880, "step": 192580 }, { "epoch": 4.70488359025725, "grad_norm": 2.650478927535005e-05, "learning_rate": 2.115283743494889e-08, "loss": 0.0, "num_input_tokens_seen": 129766952, "step": 192585 }, { "epoch": 4.705005741089097, "grad_norm": 7.849488611100242e-05, "learning_rate": 2.1135393807475198e-08, "loss": 0.0, "num_input_tokens_seen": 129770152, "step": 192590 }, { "epoch": 4.705127891920944, "grad_norm": 0.0001116606654250063, "learning_rate": 2.1117957298568133e-08, "loss": 0.0, "num_input_tokens_seen": 129773800, "step": 192595 }, { "epoch": 4.705250042752791, "grad_norm": 0.0020123757421970367, "learning_rate": 2.1100527908354704e-08, "loss": 0.0, "num_input_tokens_seen": 129776936, "step": 192600 }, { "epoch": 4.705372193584639, "grad_norm": 0.0008393031312152743, "learning_rate": 2.1083105636961363e-08, "loss": 0.0, "num_input_tokens_seen": 129780328, "step": 192605 }, { "epoch": 4.705494344416485, "grad_norm": 0.00021688619744963944, "learning_rate": 2.1065690484515007e-08, "loss": 0.0, "num_input_tokens_seen": 129783400, "step": 192610 }, { "epoch": 4.705616495248332, "grad_norm": 2.179694092774298e-05, "learning_rate": 2.1048282451142428e-08, "loss": 0.0, "num_input_tokens_seen": 129786536, "step": 192615 }, { "epoch": 4.70573864608018, "grad_norm": 0.003509758971631527, "learning_rate": 2.1030881536969857e-08, "loss": 0.0, "num_input_tokens_seen": 129789736, "step": 192620 }, { "epoch": 4.705860796912027, "grad_norm": 2.4690003556315787e-05, "learning_rate": 2.1013487742124192e-08, "loss": 0.0, "num_input_tokens_seen": 129793000, "step": 192625 }, { "epoch": 4.705982947743874, "grad_norm": 0.012998932972550392, "learning_rate": 2.0996101066731552e-08, "loss": 0.0, "num_input_tokens_seen": 129796456, "step": 192630 }, { "epoch": 4.706105098575721, "grad_norm": 0.0010782252065837383, "learning_rate": 2.097872151091873e-08, "loss": 0.0, "num_input_tokens_seen": 129799784, "step": 192635 }, { "epoch": 4.7062272494075685, "grad_norm": 0.0006848773919045925, "learning_rate": 2.0961349074811952e-08, "loss": 0.0, "num_input_tokens_seen": 129802792, "step": 192640 }, { "epoch": 4.706349400239415, "grad_norm": 0.04416637495160103, "learning_rate": 2.0943983758537453e-08, "loss": 0.0, "num_input_tokens_seen": 129806184, "step": 192645 }, { "epoch": 4.706471551071263, "grad_norm": 0.00011389933206373826, "learning_rate": 2.09266255622218e-08, "loss": 0.0, "num_input_tokens_seen": 129809960, "step": 192650 }, { "epoch": 4.70659370190311, "grad_norm": 8.055933722062036e-05, "learning_rate": 2.0909274485991003e-08, "loss": 0.0, "num_input_tokens_seen": 129812904, "step": 192655 }, { "epoch": 4.706715852734957, "grad_norm": 2.3937989681144245e-05, "learning_rate": 2.089193052997129e-08, "loss": 0.0, "num_input_tokens_seen": 129816552, "step": 192660 }, { "epoch": 4.706838003566804, "grad_norm": 6.111896800575778e-05, "learning_rate": 2.08745936942889e-08, "loss": 0.0, "num_input_tokens_seen": 129819816, "step": 192665 }, { "epoch": 4.706960154398652, "grad_norm": 0.00013432465493679047, "learning_rate": 2.0857263979069727e-08, "loss": 0.0, "num_input_tokens_seen": 129823400, "step": 192670 }, { "epoch": 4.707082305230498, "grad_norm": 0.0024881605058908463, "learning_rate": 2.0839941384439897e-08, "loss": 0.0043, "num_input_tokens_seen": 129826856, "step": 192675 }, { "epoch": 4.707204456062346, "grad_norm": 0.0003247729910071939, "learning_rate": 2.0822625910525415e-08, "loss": 0.0, "num_input_tokens_seen": 129830440, "step": 192680 }, { "epoch": 4.707326606894193, "grad_norm": 3.557054515113123e-05, "learning_rate": 2.0805317557452184e-08, "loss": 0.0, "num_input_tokens_seen": 129834664, "step": 192685 }, { "epoch": 4.70744875772604, "grad_norm": 3.953570194425993e-05, "learning_rate": 2.078801632534588e-08, "loss": 0.0, "num_input_tokens_seen": 129837992, "step": 192690 }, { "epoch": 4.707570908557887, "grad_norm": 0.027544084936380386, "learning_rate": 2.0770722214332736e-08, "loss": 0.0, "num_input_tokens_seen": 129841192, "step": 192695 }, { "epoch": 4.707693059389735, "grad_norm": 0.00014391849981620908, "learning_rate": 2.0753435224538095e-08, "loss": 0.0, "num_input_tokens_seen": 129844648, "step": 192700 }, { "epoch": 4.707815210221582, "grad_norm": 0.006761842407286167, "learning_rate": 2.0736155356087858e-08, "loss": 0.0, "num_input_tokens_seen": 129848232, "step": 192705 }, { "epoch": 4.707937361053428, "grad_norm": 0.0003056692366953939, "learning_rate": 2.0718882609107812e-08, "loss": 0.0, "num_input_tokens_seen": 129851816, "step": 192710 }, { "epoch": 4.708059511885276, "grad_norm": 0.004368090070784092, "learning_rate": 2.0701616983723414e-08, "loss": 0.0, "num_input_tokens_seen": 129855144, "step": 192715 }, { "epoch": 4.708181662717124, "grad_norm": 0.0005663208430632949, "learning_rate": 2.0684358480060228e-08, "loss": 0.0, "num_input_tokens_seen": 129858408, "step": 192720 }, { "epoch": 4.70830381354897, "grad_norm": 0.00015874313248787075, "learning_rate": 2.0667107098243818e-08, "loss": 0.0, "num_input_tokens_seen": 129862248, "step": 192725 }, { "epoch": 4.708425964380817, "grad_norm": 1.957891981874127e-05, "learning_rate": 2.0649862838399645e-08, "loss": 0.0, "num_input_tokens_seen": 129865640, "step": 192730 }, { "epoch": 4.708548115212665, "grad_norm": 0.0003272563626524061, "learning_rate": 2.0632625700652938e-08, "loss": 0.0, "num_input_tokens_seen": 129869416, "step": 192735 }, { "epoch": 4.708670266044511, "grad_norm": 9.015477553475648e-05, "learning_rate": 2.0615395685129266e-08, "loss": 0.0, "num_input_tokens_seen": 129872680, "step": 192740 }, { "epoch": 4.708792416876359, "grad_norm": 9.212747681885958e-05, "learning_rate": 2.059817279195397e-08, "loss": 0.0, "num_input_tokens_seen": 129875944, "step": 192745 }, { "epoch": 4.708914567708206, "grad_norm": 0.0008575996034778655, "learning_rate": 2.0580957021252067e-08, "loss": 0.0, "num_input_tokens_seen": 129879336, "step": 192750 }, { "epoch": 4.7090367185400535, "grad_norm": 0.000842539535369724, "learning_rate": 2.0563748373148894e-08, "loss": 0.0, "num_input_tokens_seen": 129882728, "step": 192755 }, { "epoch": 4.7091588693719, "grad_norm": 0.0002445719437673688, "learning_rate": 2.0546546847769574e-08, "loss": 0.0, "num_input_tokens_seen": 129885800, "step": 192760 }, { "epoch": 4.709281020203748, "grad_norm": 0.00025933721917681396, "learning_rate": 2.0529352445239234e-08, "loss": 0.0, "num_input_tokens_seen": 129889128, "step": 192765 }, { "epoch": 4.709403171035595, "grad_norm": 0.00010074125020764768, "learning_rate": 2.0512165165682882e-08, "loss": 0.0, "num_input_tokens_seen": 129893160, "step": 192770 }, { "epoch": 4.709525321867442, "grad_norm": 5.31175146534224e-06, "learning_rate": 2.049498500922553e-08, "loss": 0.0001, "num_input_tokens_seen": 129896168, "step": 192775 }, { "epoch": 4.709647472699289, "grad_norm": 4.8221234465017915e-05, "learning_rate": 2.0477811975992187e-08, "loss": 0.0, "num_input_tokens_seen": 129899176, "step": 192780 }, { "epoch": 4.709769623531137, "grad_norm": 0.005339786410331726, "learning_rate": 2.0460646066107533e-08, "loss": 0.0245, "num_input_tokens_seen": 129902888, "step": 192785 }, { "epoch": 4.709891774362983, "grad_norm": 0.0022137362975627184, "learning_rate": 2.0443487279696582e-08, "loss": 0.0, "num_input_tokens_seen": 129905896, "step": 192790 }, { "epoch": 4.710013925194831, "grad_norm": 5.289226828608662e-05, "learning_rate": 2.0426335616884005e-08, "loss": 0.0, "num_input_tokens_seen": 129909352, "step": 192795 }, { "epoch": 4.710136076026678, "grad_norm": 0.0035220349673181772, "learning_rate": 2.0409191077794595e-08, "loss": 0.0, "num_input_tokens_seen": 129912424, "step": 192800 }, { "epoch": 4.7102582268585245, "grad_norm": 0.0060235122218728065, "learning_rate": 2.039205366255303e-08, "loss": 0.0, "num_input_tokens_seen": 129915752, "step": 192805 }, { "epoch": 4.710380377690372, "grad_norm": 0.00028744732844643295, "learning_rate": 2.0374923371283992e-08, "loss": 0.0, "num_input_tokens_seen": 129919208, "step": 192810 }, { "epoch": 4.71050252852222, "grad_norm": 0.0006107626832090318, "learning_rate": 2.035780020411193e-08, "loss": 0.0001, "num_input_tokens_seen": 129922792, "step": 192815 }, { "epoch": 4.7106246793540665, "grad_norm": 0.000291914155241102, "learning_rate": 2.0340684161161414e-08, "loss": 0.0, "num_input_tokens_seen": 129926312, "step": 192820 }, { "epoch": 4.710746830185913, "grad_norm": 1.9453102140687406e-05, "learning_rate": 2.0323575242557123e-08, "loss": 0.0, "num_input_tokens_seen": 129929896, "step": 192825 }, { "epoch": 4.710868981017761, "grad_norm": 0.00029763614293187857, "learning_rate": 2.0306473448423066e-08, "loss": 0.0, "num_input_tokens_seen": 129933096, "step": 192830 }, { "epoch": 4.710991131849608, "grad_norm": 0.001083080773241818, "learning_rate": 2.0289378778883924e-08, "loss": 0.0, "num_input_tokens_seen": 129936488, "step": 192835 }, { "epoch": 4.711113282681455, "grad_norm": 5.25053619639948e-05, "learning_rate": 2.027229123406393e-08, "loss": 0.0, "num_input_tokens_seen": 129939944, "step": 192840 }, { "epoch": 4.711235433513302, "grad_norm": 4.293541132938117e-05, "learning_rate": 2.025521081408732e-08, "loss": 0.0, "num_input_tokens_seen": 129943400, "step": 192845 }, { "epoch": 4.71135758434515, "grad_norm": 0.000599803461227566, "learning_rate": 2.0238137519078436e-08, "loss": 0.0, "num_input_tokens_seen": 129946728, "step": 192850 }, { "epoch": 4.711479735176996, "grad_norm": 0.00016343145398423076, "learning_rate": 2.022107134916129e-08, "loss": 0.0, "num_input_tokens_seen": 129949736, "step": 192855 }, { "epoch": 4.711601886008844, "grad_norm": 0.005201476626098156, "learning_rate": 2.0204012304460005e-08, "loss": 0.0, "num_input_tokens_seen": 129952936, "step": 192860 }, { "epoch": 4.711724036840691, "grad_norm": 0.026774518191814423, "learning_rate": 2.0186960385098707e-08, "loss": 0.0, "num_input_tokens_seen": 129956136, "step": 192865 }, { "epoch": 4.711846187672538, "grad_norm": 0.0007024348597042263, "learning_rate": 2.0169915591201403e-08, "loss": 0.0, "num_input_tokens_seen": 129959528, "step": 192870 }, { "epoch": 4.711968338504385, "grad_norm": 0.00012115760910091922, "learning_rate": 2.0152877922891996e-08, "loss": 0.0, "num_input_tokens_seen": 129962728, "step": 192875 }, { "epoch": 4.712090489336233, "grad_norm": 3.893441680702381e-05, "learning_rate": 2.013584738029439e-08, "loss": 0.0, "num_input_tokens_seen": 129965928, "step": 192880 }, { "epoch": 4.7122126401680795, "grad_norm": 1.953154060174711e-05, "learning_rate": 2.0118823963532482e-08, "loss": 0.0, "num_input_tokens_seen": 129969704, "step": 192885 }, { "epoch": 4.712334790999927, "grad_norm": 0.018396366387605667, "learning_rate": 2.0101807672729953e-08, "loss": 0.0, "num_input_tokens_seen": 129972968, "step": 192890 }, { "epoch": 4.712456941831774, "grad_norm": 5.830708323628642e-05, "learning_rate": 2.0084798508010703e-08, "loss": 0.0, "num_input_tokens_seen": 129976168, "step": 192895 }, { "epoch": 4.712579092663621, "grad_norm": 0.0005415156483650208, "learning_rate": 2.006779646949841e-08, "loss": 0.0, "num_input_tokens_seen": 129979752, "step": 192900 }, { "epoch": 4.712701243495468, "grad_norm": 5.077142486697994e-05, "learning_rate": 2.0050801557316532e-08, "loss": 0.0, "num_input_tokens_seen": 129983016, "step": 192905 }, { "epoch": 4.712823394327315, "grad_norm": 0.00036389665910974145, "learning_rate": 2.003381377158897e-08, "loss": 0.0, "num_input_tokens_seen": 129986472, "step": 192910 }, { "epoch": 4.712945545159163, "grad_norm": 0.002304884372279048, "learning_rate": 2.0016833112438958e-08, "loss": 0.0321, "num_input_tokens_seen": 129989672, "step": 192915 }, { "epoch": 4.713067695991009, "grad_norm": 1.98540001292713e-05, "learning_rate": 1.9999859579990175e-08, "loss": 0.0, "num_input_tokens_seen": 129993000, "step": 192920 }, { "epoch": 4.713189846822857, "grad_norm": 0.0007984357071109116, "learning_rate": 1.9982893174366077e-08, "loss": 0.0, "num_input_tokens_seen": 129996136, "step": 192925 }, { "epoch": 4.713311997654704, "grad_norm": 0.0018656195607036352, "learning_rate": 1.996593389568979e-08, "loss": 0.0, "num_input_tokens_seen": 129999784, "step": 192930 }, { "epoch": 4.713434148486551, "grad_norm": 0.0021639051847159863, "learning_rate": 1.9948981744084992e-08, "loss": 0.0, "num_input_tokens_seen": 130003368, "step": 192935 }, { "epoch": 4.713556299318398, "grad_norm": 0.0019073068397119641, "learning_rate": 1.9932036719674696e-08, "loss": 0.0, "num_input_tokens_seen": 130006632, "step": 192940 }, { "epoch": 4.713678450150246, "grad_norm": 0.0027616843581199646, "learning_rate": 1.9915098822582353e-08, "loss": 0.0, "num_input_tokens_seen": 130009704, "step": 192945 }, { "epoch": 4.7138006009820925, "grad_norm": 0.00016160483937710524, "learning_rate": 1.9898168052930987e-08, "loss": 0.0, "num_input_tokens_seen": 130013352, "step": 192950 }, { "epoch": 4.71392275181394, "grad_norm": 2.910471084760502e-05, "learning_rate": 1.98812444108436e-08, "loss": 0.0, "num_input_tokens_seen": 130016552, "step": 192955 }, { "epoch": 4.714044902645787, "grad_norm": 0.09401872009038925, "learning_rate": 1.9864327896443655e-08, "loss": 0.0, "num_input_tokens_seen": 130019752, "step": 192960 }, { "epoch": 4.7141670534776345, "grad_norm": 0.01751399040222168, "learning_rate": 1.984741850985383e-08, "loss": 0.0, "num_input_tokens_seen": 130023720, "step": 192965 }, { "epoch": 4.714289204309481, "grad_norm": 0.0027101587038487196, "learning_rate": 1.9830516251197247e-08, "loss": 0.0, "num_input_tokens_seen": 130026728, "step": 192970 }, { "epoch": 4.714411355141328, "grad_norm": 0.0004698086704593152, "learning_rate": 1.9813621120596703e-08, "loss": 0.0, "num_input_tokens_seen": 130029992, "step": 192975 }, { "epoch": 4.714533505973176, "grad_norm": 0.00010237730748485774, "learning_rate": 1.97967331181752e-08, "loss": 0.0, "num_input_tokens_seen": 130033192, "step": 192980 }, { "epoch": 4.714655656805023, "grad_norm": 3.8691145164193586e-05, "learning_rate": 1.977985224405554e-08, "loss": 0.0, "num_input_tokens_seen": 130036968, "step": 192985 }, { "epoch": 4.71477780763687, "grad_norm": 0.00014511265908367932, "learning_rate": 1.9762978498360393e-08, "loss": 0.0, "num_input_tokens_seen": 130039912, "step": 192990 }, { "epoch": 4.714899958468717, "grad_norm": 0.0002013398625422269, "learning_rate": 1.9746111881212556e-08, "loss": 0.0, "num_input_tokens_seen": 130043368, "step": 192995 }, { "epoch": 4.715022109300564, "grad_norm": 0.0014783921651542187, "learning_rate": 1.9729252392734597e-08, "loss": 0.0, "num_input_tokens_seen": 130046568, "step": 193000 }, { "epoch": 4.715144260132411, "grad_norm": 0.0002856815990526229, "learning_rate": 1.9712400033049194e-08, "loss": 0.0, "num_input_tokens_seen": 130049640, "step": 193005 }, { "epoch": 4.715266410964259, "grad_norm": 0.0011368409032002091, "learning_rate": 1.9695554802278803e-08, "loss": 0.0001, "num_input_tokens_seen": 130053096, "step": 193010 }, { "epoch": 4.7153885617961055, "grad_norm": 0.0006178556359373033, "learning_rate": 1.9678716700546106e-08, "loss": 0.0, "num_input_tokens_seen": 130056424, "step": 193015 }, { "epoch": 4.715510712627953, "grad_norm": 3.2776610169094056e-05, "learning_rate": 1.9661885727973448e-08, "loss": 0.0, "num_input_tokens_seen": 130059816, "step": 193020 }, { "epoch": 4.7156328634598, "grad_norm": 0.00012467149645090103, "learning_rate": 1.964506188468318e-08, "loss": 0.0, "num_input_tokens_seen": 130062888, "step": 193025 }, { "epoch": 4.715755014291648, "grad_norm": 0.00035838209441863, "learning_rate": 1.9628245170797865e-08, "loss": 0.0, "num_input_tokens_seen": 130065960, "step": 193030 }, { "epoch": 4.715877165123494, "grad_norm": 5.7584791647968814e-05, "learning_rate": 1.9611435586439405e-08, "loss": 0.0005, "num_input_tokens_seen": 130069416, "step": 193035 }, { "epoch": 4.715999315955342, "grad_norm": 7.922954559326172, "learning_rate": 1.9594633131730488e-08, "loss": 0.0224, "num_input_tokens_seen": 130073320, "step": 193040 }, { "epoch": 4.716121466787189, "grad_norm": 0.001085901283659041, "learning_rate": 1.957783780679301e-08, "loss": 0.0, "num_input_tokens_seen": 130076456, "step": 193045 }, { "epoch": 4.716243617619036, "grad_norm": 0.0024623856879770756, "learning_rate": 1.9561049611749093e-08, "loss": 0.0, "num_input_tokens_seen": 130079528, "step": 193050 }, { "epoch": 4.716365768450883, "grad_norm": 0.006301338318735361, "learning_rate": 1.9544268546721087e-08, "loss": 0.0, "num_input_tokens_seen": 130083048, "step": 193055 }, { "epoch": 4.716487919282731, "grad_norm": 4.267549957148731e-05, "learning_rate": 1.9527494611830786e-08, "loss": 0.0004, "num_input_tokens_seen": 130086184, "step": 193060 }, { "epoch": 4.7166100701145774, "grad_norm": 0.0012560115428641438, "learning_rate": 1.951072780720031e-08, "loss": 0.001, "num_input_tokens_seen": 130089512, "step": 193065 }, { "epoch": 4.716732220946424, "grad_norm": 0.0003190060087945312, "learning_rate": 1.9493968132951455e-08, "loss": 0.0, "num_input_tokens_seen": 130092520, "step": 193070 }, { "epoch": 4.716854371778272, "grad_norm": 0.00017790061247069389, "learning_rate": 1.947721558920634e-08, "loss": 0.0, "num_input_tokens_seen": 130095912, "step": 193075 }, { "epoch": 4.7169765226101195, "grad_norm": 5.468689778354019e-05, "learning_rate": 1.946047017608654e-08, "loss": 0.0, "num_input_tokens_seen": 130099304, "step": 193080 }, { "epoch": 4.717098673441966, "grad_norm": 0.00034660325036384165, "learning_rate": 1.9443731893713954e-08, "loss": 0.0, "num_input_tokens_seen": 130102568, "step": 193085 }, { "epoch": 4.717220824273813, "grad_norm": 4.338144208304584e-05, "learning_rate": 1.9427000742210376e-08, "loss": 0.0, "num_input_tokens_seen": 130106216, "step": 193090 }, { "epoch": 4.717342975105661, "grad_norm": 0.0010805005440488458, "learning_rate": 1.9410276721697262e-08, "loss": 0.0, "num_input_tokens_seen": 130109544, "step": 193095 }, { "epoch": 4.717465125937507, "grad_norm": 3.018993629666511e-05, "learning_rate": 1.9393559832296404e-08, "loss": 0.0, "num_input_tokens_seen": 130112744, "step": 193100 }, { "epoch": 4.717587276769355, "grad_norm": 0.0006496264250017703, "learning_rate": 1.9376850074129257e-08, "loss": 0.0, "num_input_tokens_seen": 130116264, "step": 193105 }, { "epoch": 4.717709427601202, "grad_norm": 3.441320586716756e-05, "learning_rate": 1.9360147447317398e-08, "loss": 0.0, "num_input_tokens_seen": 130119720, "step": 193110 }, { "epoch": 4.717831578433049, "grad_norm": 0.00019770261133089662, "learning_rate": 1.9343451951982505e-08, "loss": 0.0, "num_input_tokens_seen": 130123240, "step": 193115 }, { "epoch": 4.717953729264896, "grad_norm": 4.3698764784494415e-05, "learning_rate": 1.9326763588245587e-08, "loss": 0.0, "num_input_tokens_seen": 130127080, "step": 193120 }, { "epoch": 4.718075880096744, "grad_norm": 0.00031875044805929065, "learning_rate": 1.931008235622844e-08, "loss": 0.0, "num_input_tokens_seen": 130130792, "step": 193125 }, { "epoch": 4.7181980309285905, "grad_norm": 0.002110627479851246, "learning_rate": 1.929340825605197e-08, "loss": 0.0, "num_input_tokens_seen": 130134184, "step": 193130 }, { "epoch": 4.718320181760438, "grad_norm": 0.00013501415378414094, "learning_rate": 1.927674128783763e-08, "loss": 0.0, "num_input_tokens_seen": 130137448, "step": 193135 }, { "epoch": 4.718442332592285, "grad_norm": 0.0004318559367675334, "learning_rate": 1.9260081451706654e-08, "loss": 0.0, "num_input_tokens_seen": 130140776, "step": 193140 }, { "epoch": 4.7185644834241325, "grad_norm": 0.00030089804204180837, "learning_rate": 1.9243428747780065e-08, "loss": 0.0, "num_input_tokens_seen": 130144744, "step": 193145 }, { "epoch": 4.718686634255979, "grad_norm": 0.000204432217287831, "learning_rate": 1.9226783176179208e-08, "loss": 0.0, "num_input_tokens_seen": 130148200, "step": 193150 }, { "epoch": 4.718808785087827, "grad_norm": 0.00013242612476460636, "learning_rate": 1.921014473702476e-08, "loss": 0.0002, "num_input_tokens_seen": 130151464, "step": 193155 }, { "epoch": 4.718930935919674, "grad_norm": 0.0012007177574560046, "learning_rate": 1.919351343043818e-08, "loss": 0.0, "num_input_tokens_seen": 130154664, "step": 193160 }, { "epoch": 4.71905308675152, "grad_norm": 0.010337266139686108, "learning_rate": 1.917688925654004e-08, "loss": 0.0, "num_input_tokens_seen": 130158184, "step": 193165 }, { "epoch": 4.719175237583368, "grad_norm": 0.00024405837757512927, "learning_rate": 1.9160272215451355e-08, "loss": 0.0001, "num_input_tokens_seen": 130161704, "step": 193170 }, { "epoch": 4.719297388415215, "grad_norm": 0.002331462921574712, "learning_rate": 1.914366230729303e-08, "loss": 0.0, "num_input_tokens_seen": 130165736, "step": 193175 }, { "epoch": 4.719419539247062, "grad_norm": 0.00015924693434499204, "learning_rate": 1.9127059532185853e-08, "loss": 0.0, "num_input_tokens_seen": 130169000, "step": 193180 }, { "epoch": 4.719541690078909, "grad_norm": 0.0002346932451473549, "learning_rate": 1.9110463890250506e-08, "loss": 0.0, "num_input_tokens_seen": 130172712, "step": 193185 }, { "epoch": 4.719663840910757, "grad_norm": 0.00011881341197295114, "learning_rate": 1.909387538160767e-08, "loss": 0.0, "num_input_tokens_seen": 130176872, "step": 193190 }, { "epoch": 4.7197859917426035, "grad_norm": 0.0005221093888394535, "learning_rate": 1.907729400637803e-08, "loss": 0.0002, "num_input_tokens_seen": 130180136, "step": 193195 }, { "epoch": 4.719908142574451, "grad_norm": 0.00241106073372066, "learning_rate": 1.9060719764682155e-08, "loss": 0.0, "num_input_tokens_seen": 130183656, "step": 193200 }, { "epoch": 4.720030293406298, "grad_norm": 0.0005604327307082713, "learning_rate": 1.9044152656640498e-08, "loss": 0.0, "num_input_tokens_seen": 130186728, "step": 193205 }, { "epoch": 4.7201524442381455, "grad_norm": 0.002126067876815796, "learning_rate": 1.902759268237364e-08, "loss": 0.0, "num_input_tokens_seen": 130189992, "step": 193210 }, { "epoch": 4.720274595069992, "grad_norm": 0.0030112469103187323, "learning_rate": 1.901103984200192e-08, "loss": 0.0, "num_input_tokens_seen": 130193192, "step": 193215 }, { "epoch": 4.72039674590184, "grad_norm": 0.000552595010958612, "learning_rate": 1.899449413564591e-08, "loss": 0.0, "num_input_tokens_seen": 130196712, "step": 193220 }, { "epoch": 4.720518896733687, "grad_norm": 0.000356059375917539, "learning_rate": 1.897795556342563e-08, "loss": 0.0, "num_input_tokens_seen": 130200104, "step": 193225 }, { "epoch": 4.720641047565534, "grad_norm": 0.006424812134355307, "learning_rate": 1.8961424125461756e-08, "loss": 0.0, "num_input_tokens_seen": 130203496, "step": 193230 }, { "epoch": 4.720763198397381, "grad_norm": 0.002036610385403037, "learning_rate": 1.8944899821874083e-08, "loss": 0.0, "num_input_tokens_seen": 130206952, "step": 193235 }, { "epoch": 4.720885349229228, "grad_norm": 9.202076762448996e-05, "learning_rate": 1.892838265278296e-08, "loss": 0.0, "num_input_tokens_seen": 130210088, "step": 193240 }, { "epoch": 4.721007500061075, "grad_norm": 0.00014109651965554804, "learning_rate": 1.891187261830862e-08, "loss": 0.0, "num_input_tokens_seen": 130213608, "step": 193245 }, { "epoch": 4.721129650892923, "grad_norm": 0.0001980575470952317, "learning_rate": 1.8895369718570865e-08, "loss": 0.0, "num_input_tokens_seen": 130217128, "step": 193250 }, { "epoch": 4.72125180172477, "grad_norm": 0.0011710544349625707, "learning_rate": 1.8878873953690035e-08, "loss": 0.0, "num_input_tokens_seen": 130220456, "step": 193255 }, { "epoch": 4.7213739525566165, "grad_norm": 0.00030315376352518797, "learning_rate": 1.8862385323785813e-08, "loss": 0.0, "num_input_tokens_seen": 130223784, "step": 193260 }, { "epoch": 4.721496103388464, "grad_norm": 0.002116880612447858, "learning_rate": 1.8845903828978216e-08, "loss": 0.0, "num_input_tokens_seen": 130227304, "step": 193265 }, { "epoch": 4.721618254220311, "grad_norm": 4.5842771214665845e-06, "learning_rate": 1.882942946938726e-08, "loss": 0.0, "num_input_tokens_seen": 130230632, "step": 193270 }, { "epoch": 4.7217404050521585, "grad_norm": 0.00034518842585384846, "learning_rate": 1.8812962245132403e-08, "loss": 0.0, "num_input_tokens_seen": 130234280, "step": 193275 }, { "epoch": 4.721862555884005, "grad_norm": 0.00042717086034826934, "learning_rate": 1.879650215633377e-08, "loss": 0.0, "num_input_tokens_seen": 130237800, "step": 193280 }, { "epoch": 4.721984706715853, "grad_norm": 0.0005576208932325244, "learning_rate": 1.8780049203110714e-08, "loss": 0.0002, "num_input_tokens_seen": 130241320, "step": 193285 }, { "epoch": 4.7221068575477, "grad_norm": 0.0001096838095691055, "learning_rate": 1.876360338558325e-08, "loss": 0.0, "num_input_tokens_seen": 130244712, "step": 193290 }, { "epoch": 4.722229008379547, "grad_norm": 0.0005637186113744974, "learning_rate": 1.8747164703870722e-08, "loss": 0.0, "num_input_tokens_seen": 130247848, "step": 193295 }, { "epoch": 4.722351159211394, "grad_norm": 0.0004345967317931354, "learning_rate": 1.8730733158092593e-08, "loss": 0.0, "num_input_tokens_seen": 130251432, "step": 193300 }, { "epoch": 4.722473310043242, "grad_norm": 0.004918577615171671, "learning_rate": 1.8714308748368657e-08, "loss": 0.0, "num_input_tokens_seen": 130254696, "step": 193305 }, { "epoch": 4.722595460875088, "grad_norm": 0.00018379326502326876, "learning_rate": 1.869789147481815e-08, "loss": 0.0, "num_input_tokens_seen": 130257896, "step": 193310 }, { "epoch": 4.722717611706936, "grad_norm": 0.006261991336941719, "learning_rate": 1.8681481337560534e-08, "loss": 0.0, "num_input_tokens_seen": 130261224, "step": 193315 }, { "epoch": 4.722839762538783, "grad_norm": 0.0014488224405795336, "learning_rate": 1.8665078336715046e-08, "loss": 0.0, "num_input_tokens_seen": 130264424, "step": 193320 }, { "epoch": 4.72296191337063, "grad_norm": 0.000362872495315969, "learning_rate": 1.8648682472401033e-08, "loss": 0.0, "num_input_tokens_seen": 130267752, "step": 193325 }, { "epoch": 4.723084064202477, "grad_norm": 0.0027441405691206455, "learning_rate": 1.8632293744737958e-08, "loss": 0.0, "num_input_tokens_seen": 130270952, "step": 193330 }, { "epoch": 4.723206215034324, "grad_norm": 3.456345075392164e-05, "learning_rate": 1.8615912153844617e-08, "loss": 0.0, "num_input_tokens_seen": 130274024, "step": 193335 }, { "epoch": 4.7233283658661716, "grad_norm": 5.745082671637647e-05, "learning_rate": 1.859953769984046e-08, "loss": 0.0, "num_input_tokens_seen": 130277672, "step": 193340 }, { "epoch": 4.723450516698019, "grad_norm": 5.0481285143177956e-05, "learning_rate": 1.8583170382844294e-08, "loss": 0.0, "num_input_tokens_seen": 130281064, "step": 193345 }, { "epoch": 4.723572667529866, "grad_norm": 0.006787308491766453, "learning_rate": 1.8566810202975348e-08, "loss": 0.0, "num_input_tokens_seen": 130284584, "step": 193350 }, { "epoch": 4.723694818361713, "grad_norm": 0.0006456426926888525, "learning_rate": 1.855045716035253e-08, "loss": 0.0, "num_input_tokens_seen": 130288232, "step": 193355 }, { "epoch": 4.72381696919356, "grad_norm": 0.00033384087146259844, "learning_rate": 1.853411125509463e-08, "loss": 0.0, "num_input_tokens_seen": 130291688, "step": 193360 }, { "epoch": 4.723939120025407, "grad_norm": 0.00017671861860435456, "learning_rate": 1.8517772487320892e-08, "loss": 0.0, "num_input_tokens_seen": 130294888, "step": 193365 }, { "epoch": 4.724061270857255, "grad_norm": 0.00026331390836276114, "learning_rate": 1.8501440857149776e-08, "loss": 0.0, "num_input_tokens_seen": 130298216, "step": 193370 }, { "epoch": 4.724183421689101, "grad_norm": 0.00023717127623967826, "learning_rate": 1.848511636470018e-08, "loss": 0.0, "num_input_tokens_seen": 130301608, "step": 193375 }, { "epoch": 4.724305572520949, "grad_norm": 1.0042748726846185e-05, "learning_rate": 1.8468799010090796e-08, "loss": 0.0, "num_input_tokens_seen": 130305064, "step": 193380 }, { "epoch": 4.724427723352796, "grad_norm": 0.01970692165195942, "learning_rate": 1.8452488793440414e-08, "loss": 0.0, "num_input_tokens_seen": 130308328, "step": 193385 }, { "epoch": 4.7245498741846434, "grad_norm": 0.000939359306357801, "learning_rate": 1.8436185714867382e-08, "loss": 0.0, "num_input_tokens_seen": 130312104, "step": 193390 }, { "epoch": 4.72467202501649, "grad_norm": 0.000279458035947755, "learning_rate": 1.8419889774490494e-08, "loss": 0.0, "num_input_tokens_seen": 130315432, "step": 193395 }, { "epoch": 4.724794175848338, "grad_norm": 0.0003981810586992651, "learning_rate": 1.8403600972428322e-08, "loss": 0.0, "num_input_tokens_seen": 130319080, "step": 193400 }, { "epoch": 4.724916326680185, "grad_norm": 8.307830285048112e-05, "learning_rate": 1.8387319308799e-08, "loss": 0.0, "num_input_tokens_seen": 130322216, "step": 193405 }, { "epoch": 4.725038477512032, "grad_norm": 0.00042934861266985536, "learning_rate": 1.837104478372109e-08, "loss": 0.0, "num_input_tokens_seen": 130325480, "step": 193410 }, { "epoch": 4.725160628343879, "grad_norm": 3.530388858052902e-05, "learning_rate": 1.835477739731306e-08, "loss": 0.0, "num_input_tokens_seen": 130329128, "step": 193415 }, { "epoch": 4.725282779175727, "grad_norm": 7.023687794571742e-05, "learning_rate": 1.8338517149693034e-08, "loss": 0.0, "num_input_tokens_seen": 130332584, "step": 193420 }, { "epoch": 4.725404930007573, "grad_norm": 0.0024503623135387897, "learning_rate": 1.8322264040979472e-08, "loss": 0.0, "num_input_tokens_seen": 130335848, "step": 193425 }, { "epoch": 4.72552708083942, "grad_norm": 16.8475284576416, "learning_rate": 1.8306018071290284e-08, "loss": 0.0184, "num_input_tokens_seen": 130339112, "step": 193430 }, { "epoch": 4.725649231671268, "grad_norm": 4.4047746996511705e-06, "learning_rate": 1.828977924074393e-08, "loss": 0.0, "num_input_tokens_seen": 130342568, "step": 193435 }, { "epoch": 4.7257713825031145, "grad_norm": 0.0008143739541992545, "learning_rate": 1.8273547549458203e-08, "loss": 0.0002, "num_input_tokens_seen": 130346152, "step": 193440 }, { "epoch": 4.725893533334962, "grad_norm": 0.001741119078360498, "learning_rate": 1.825732299755145e-08, "loss": 0.0, "num_input_tokens_seen": 130349224, "step": 193445 }, { "epoch": 4.726015684166809, "grad_norm": 2.8913536880281754e-05, "learning_rate": 1.824110558514136e-08, "loss": 0.0, "num_input_tokens_seen": 130352488, "step": 193450 }, { "epoch": 4.7261378349986565, "grad_norm": 5.65302834729664e-05, "learning_rate": 1.8224895312346055e-08, "loss": 0.0, "num_input_tokens_seen": 130355816, "step": 193455 }, { "epoch": 4.726259985830503, "grad_norm": 0.0007669543847441673, "learning_rate": 1.8208692179283446e-08, "loss": 0.0, "num_input_tokens_seen": 130359144, "step": 193460 }, { "epoch": 4.726382136662351, "grad_norm": 7.570572779513896e-05, "learning_rate": 1.8192496186071216e-08, "loss": 0.0, "num_input_tokens_seen": 130362216, "step": 193465 }, { "epoch": 4.726504287494198, "grad_norm": 0.0005270456313155591, "learning_rate": 1.8176307332827378e-08, "loss": 0.0, "num_input_tokens_seen": 130365032, "step": 193470 }, { "epoch": 4.726626438326045, "grad_norm": 0.0011675796704366803, "learning_rate": 1.8160125619669285e-08, "loss": 0.0, "num_input_tokens_seen": 130368680, "step": 193475 }, { "epoch": 4.726748589157892, "grad_norm": 0.00010108354763360694, "learning_rate": 1.8143951046714957e-08, "loss": 0.0001, "num_input_tokens_seen": 130372008, "step": 193480 }, { "epoch": 4.72687073998974, "grad_norm": 0.00028355169342830777, "learning_rate": 1.8127783614081958e-08, "loss": 0.0, "num_input_tokens_seen": 130375784, "step": 193485 }, { "epoch": 4.726992890821586, "grad_norm": 0.00012076576967956498, "learning_rate": 1.811162332188776e-08, "loss": 0.0, "num_input_tokens_seen": 130379816, "step": 193490 }, { "epoch": 4.727115041653434, "grad_norm": 0.00043065642239525914, "learning_rate": 1.809547017024993e-08, "loss": 0.0, "num_input_tokens_seen": 130383208, "step": 193495 }, { "epoch": 4.727237192485281, "grad_norm": 0.00017358400509692729, "learning_rate": 1.8079324159285925e-08, "loss": 0.0, "num_input_tokens_seen": 130386920, "step": 193500 }, { "epoch": 4.727359343317128, "grad_norm": 0.0008371736039407551, "learning_rate": 1.8063185289113326e-08, "loss": 0.0, "num_input_tokens_seen": 130390824, "step": 193505 }, { "epoch": 4.727481494148975, "grad_norm": 0.1857682764530182, "learning_rate": 1.8047053559849146e-08, "loss": 0.0, "num_input_tokens_seen": 130393960, "step": 193510 }, { "epoch": 4.727603644980823, "grad_norm": 0.0001310663647018373, "learning_rate": 1.8030928971610958e-08, "loss": 0.0305, "num_input_tokens_seen": 130397160, "step": 193515 }, { "epoch": 4.7277257958126695, "grad_norm": 0.0009740307577885687, "learning_rate": 1.8014811524516006e-08, "loss": 0.0, "num_input_tokens_seen": 130400552, "step": 193520 }, { "epoch": 4.727847946644516, "grad_norm": 3.696549538290128e-05, "learning_rate": 1.7998701218681413e-08, "loss": 0.0, "num_input_tokens_seen": 130403816, "step": 193525 }, { "epoch": 4.727970097476364, "grad_norm": 6.525940261781216e-05, "learning_rate": 1.798259805422453e-08, "loss": 0.0, "num_input_tokens_seen": 130407080, "step": 193530 }, { "epoch": 4.728092248308211, "grad_norm": 1.0025220944953617e-05, "learning_rate": 1.7966502031262154e-08, "loss": 0.0, "num_input_tokens_seen": 130410088, "step": 193535 }, { "epoch": 4.728214399140058, "grad_norm": 0.0003596026508603245, "learning_rate": 1.7950413149911638e-08, "loss": 0.0, "num_input_tokens_seen": 130413992, "step": 193540 }, { "epoch": 4.728336549971905, "grad_norm": 8.720139157958329e-05, "learning_rate": 1.7934331410289773e-08, "loss": 0.0, "num_input_tokens_seen": 130417576, "step": 193545 }, { "epoch": 4.728458700803753, "grad_norm": 0.001967525575309992, "learning_rate": 1.7918256812513576e-08, "loss": 0.0, "num_input_tokens_seen": 130420968, "step": 193550 }, { "epoch": 4.728580851635599, "grad_norm": 0.002015329897403717, "learning_rate": 1.790218935670007e-08, "loss": 0.0, "num_input_tokens_seen": 130424424, "step": 193555 }, { "epoch": 4.728703002467447, "grad_norm": 0.0013729424681514502, "learning_rate": 1.7886129042965826e-08, "loss": 0.0, "num_input_tokens_seen": 130428008, "step": 193560 }, { "epoch": 4.728825153299294, "grad_norm": 0.0003414524544496089, "learning_rate": 1.787007587142797e-08, "loss": 0.0, "num_input_tokens_seen": 130431080, "step": 193565 }, { "epoch": 4.728947304131141, "grad_norm": 0.002865726361051202, "learning_rate": 1.7854029842203078e-08, "loss": 0.0, "num_input_tokens_seen": 130434536, "step": 193570 }, { "epoch": 4.729069454962988, "grad_norm": 0.00024214327277150005, "learning_rate": 1.7837990955407723e-08, "loss": 0.0, "num_input_tokens_seen": 130437864, "step": 193575 }, { "epoch": 4.729191605794836, "grad_norm": 0.001680655055679381, "learning_rate": 1.782195921115881e-08, "loss": 0.0, "num_input_tokens_seen": 130440872, "step": 193580 }, { "epoch": 4.7293137566266825, "grad_norm": 0.0011174753308296204, "learning_rate": 1.7805934609572693e-08, "loss": 0.0, "num_input_tokens_seen": 130443880, "step": 193585 }, { "epoch": 4.72943590745853, "grad_norm": 3.276481220382266e-05, "learning_rate": 1.7789917150766054e-08, "loss": 0.0, "num_input_tokens_seen": 130447400, "step": 193590 }, { "epoch": 4.729558058290377, "grad_norm": 0.005894262343645096, "learning_rate": 1.7773906834855245e-08, "loss": 0.0, "num_input_tokens_seen": 130451240, "step": 193595 }, { "epoch": 4.729680209122224, "grad_norm": 6.7249638959765434e-06, "learning_rate": 1.7757903661956842e-08, "loss": 0.0, "num_input_tokens_seen": 130454760, "step": 193600 }, { "epoch": 4.729802359954071, "grad_norm": 0.00039404755807481706, "learning_rate": 1.774190763218719e-08, "loss": 0.0, "num_input_tokens_seen": 130458728, "step": 193605 }, { "epoch": 4.729924510785919, "grad_norm": 3.2486277632415295e-05, "learning_rate": 1.7725918745662426e-08, "loss": 0.0, "num_input_tokens_seen": 130462120, "step": 193610 }, { "epoch": 4.730046661617766, "grad_norm": 0.00042969462811015546, "learning_rate": 1.770993700249912e-08, "loss": 0.0, "num_input_tokens_seen": 130465768, "step": 193615 }, { "epoch": 4.730168812449612, "grad_norm": 3.5601688068709336e-06, "learning_rate": 1.7693962402813288e-08, "loss": 0.0, "num_input_tokens_seen": 130469288, "step": 193620 }, { "epoch": 4.73029096328146, "grad_norm": 2.8810822186642326e-05, "learning_rate": 1.7677994946721286e-08, "loss": 0.0, "num_input_tokens_seen": 130472808, "step": 193625 }, { "epoch": 4.730413114113307, "grad_norm": 0.001526939682662487, "learning_rate": 1.7662034634339017e-08, "loss": 0.0, "num_input_tokens_seen": 130476264, "step": 193630 }, { "epoch": 4.730535264945154, "grad_norm": 0.00026932620676234365, "learning_rate": 1.7646081465782614e-08, "loss": 0.0, "num_input_tokens_seen": 130479272, "step": 193635 }, { "epoch": 4.730657415777001, "grad_norm": 0.0005830815061926842, "learning_rate": 1.7630135441168203e-08, "loss": 0.0, "num_input_tokens_seen": 130482728, "step": 193640 }, { "epoch": 4.730779566608849, "grad_norm": 0.00040932599222287536, "learning_rate": 1.7614196560611583e-08, "loss": 0.0, "num_input_tokens_seen": 130486056, "step": 193645 }, { "epoch": 4.7309017174406955, "grad_norm": 0.0009243565145879984, "learning_rate": 1.7598264824228883e-08, "loss": 0.0, "num_input_tokens_seen": 130489128, "step": 193650 }, { "epoch": 4.731023868272543, "grad_norm": 0.00019320863066241145, "learning_rate": 1.7582340232135782e-08, "loss": 0.0, "num_input_tokens_seen": 130492456, "step": 193655 }, { "epoch": 4.73114601910439, "grad_norm": 0.0003500950988382101, "learning_rate": 1.7566422784448087e-08, "loss": 0.0, "num_input_tokens_seen": 130495912, "step": 193660 }, { "epoch": 4.7312681699362376, "grad_norm": 0.0006958039593882859, "learning_rate": 1.7550512481281698e-08, "loss": 0.0, "num_input_tokens_seen": 130499368, "step": 193665 }, { "epoch": 4.731390320768084, "grad_norm": 2.7768734071287327e-05, "learning_rate": 1.753460932275208e-08, "loss": 0.0, "num_input_tokens_seen": 130503016, "step": 193670 }, { "epoch": 4.731512471599932, "grad_norm": 0.0002878334780689329, "learning_rate": 1.751871330897514e-08, "loss": 0.0, "num_input_tokens_seen": 130506216, "step": 193675 }, { "epoch": 4.731634622431779, "grad_norm": 6.573292921530083e-05, "learning_rate": 1.7502824440066344e-08, "loss": 0.0, "num_input_tokens_seen": 130509544, "step": 193680 }, { "epoch": 4.731756773263626, "grad_norm": 0.0005611705128103495, "learning_rate": 1.7486942716141374e-08, "loss": 0.0, "num_input_tokens_seen": 130512552, "step": 193685 }, { "epoch": 4.731878924095473, "grad_norm": 0.002522786846384406, "learning_rate": 1.747106813731547e-08, "loss": 0.0825, "num_input_tokens_seen": 130515688, "step": 193690 }, { "epoch": 4.73200107492732, "grad_norm": 0.0021701049990952015, "learning_rate": 1.7455200703704432e-08, "loss": 0.0, "num_input_tokens_seen": 130518888, "step": 193695 }, { "epoch": 4.732123225759167, "grad_norm": 0.00020151086209807545, "learning_rate": 1.7439340415423164e-08, "loss": 0.0, "num_input_tokens_seen": 130522216, "step": 193700 }, { "epoch": 4.732245376591015, "grad_norm": 0.0006844014278613031, "learning_rate": 1.7423487272587577e-08, "loss": 0.0, "num_input_tokens_seen": 130525352, "step": 193705 }, { "epoch": 4.732367527422862, "grad_norm": 0.003490231931209564, "learning_rate": 1.740764127531258e-08, "loss": 0.0, "num_input_tokens_seen": 130529128, "step": 193710 }, { "epoch": 4.732489678254709, "grad_norm": 0.010199329815804958, "learning_rate": 1.739180242371341e-08, "loss": 0.0, "num_input_tokens_seen": 130532008, "step": 193715 }, { "epoch": 4.732611829086556, "grad_norm": 0.0008412694442085922, "learning_rate": 1.7375970717905418e-08, "loss": 0.0, "num_input_tokens_seen": 130535208, "step": 193720 }, { "epoch": 4.732733979918403, "grad_norm": 0.00025614985497668386, "learning_rate": 1.736014615800352e-08, "loss": 0.0, "num_input_tokens_seen": 130538344, "step": 193725 }, { "epoch": 4.732856130750251, "grad_norm": 0.00017684178601484746, "learning_rate": 1.734432874412306e-08, "loss": 0.0, "num_input_tokens_seen": 130541416, "step": 193730 }, { "epoch": 4.732978281582097, "grad_norm": 9.338807285530493e-05, "learning_rate": 1.732851847637895e-08, "loss": 0.0, "num_input_tokens_seen": 130544552, "step": 193735 }, { "epoch": 4.733100432413945, "grad_norm": 0.0050024231895804405, "learning_rate": 1.7312715354886098e-08, "loss": 0.0, "num_input_tokens_seen": 130548008, "step": 193740 }, { "epoch": 4.733222583245792, "grad_norm": 0.00018156580335926265, "learning_rate": 1.7296919379759635e-08, "loss": 0.0, "num_input_tokens_seen": 130551208, "step": 193745 }, { "epoch": 4.733344734077639, "grad_norm": 0.00015616827295161784, "learning_rate": 1.7281130551114132e-08, "loss": 0.0, "num_input_tokens_seen": 130554792, "step": 193750 }, { "epoch": 4.733466884909486, "grad_norm": 2.225218850071542e-05, "learning_rate": 1.726534886906461e-08, "loss": 0.0, "num_input_tokens_seen": 130558376, "step": 193755 }, { "epoch": 4.733589035741334, "grad_norm": 0.0001064696698449552, "learning_rate": 1.7249574333725868e-08, "loss": 0.0, "num_input_tokens_seen": 130561896, "step": 193760 }, { "epoch": 4.7337111865731805, "grad_norm": 0.00014043239934835583, "learning_rate": 1.723380694521237e-08, "loss": 0.0, "num_input_tokens_seen": 130565480, "step": 193765 }, { "epoch": 4.733833337405028, "grad_norm": 0.0004173256456851959, "learning_rate": 1.7218046703639134e-08, "loss": 0.0, "num_input_tokens_seen": 130568616, "step": 193770 }, { "epoch": 4.733955488236875, "grad_norm": 0.0012830361956730485, "learning_rate": 1.7202293609120512e-08, "loss": 0.0, "num_input_tokens_seen": 130572264, "step": 193775 }, { "epoch": 4.7340776390687225, "grad_norm": 9.681628580437973e-05, "learning_rate": 1.71865476617713e-08, "loss": 0.0, "num_input_tokens_seen": 130575848, "step": 193780 }, { "epoch": 4.734199789900569, "grad_norm": 0.0009466626797802746, "learning_rate": 1.7170808861705633e-08, "loss": 0.0, "num_input_tokens_seen": 130579496, "step": 193785 }, { "epoch": 4.734321940732416, "grad_norm": 2.105771091009956e-05, "learning_rate": 1.7155077209038416e-08, "loss": 0.0, "num_input_tokens_seen": 130582568, "step": 193790 }, { "epoch": 4.734444091564264, "grad_norm": 0.008503005839884281, "learning_rate": 1.713935270388367e-08, "loss": 0.0, "num_input_tokens_seen": 130585640, "step": 193795 }, { "epoch": 4.73456624239611, "grad_norm": 7.453518628608435e-05, "learning_rate": 1.712363534635597e-08, "loss": 0.0, "num_input_tokens_seen": 130588904, "step": 193800 }, { "epoch": 4.734688393227958, "grad_norm": 0.00011131344945169985, "learning_rate": 1.7107925136569557e-08, "loss": 0.0, "num_input_tokens_seen": 130592296, "step": 193805 }, { "epoch": 4.734810544059805, "grad_norm": 4.882766370428726e-05, "learning_rate": 1.7092222074638674e-08, "loss": 0.0, "num_input_tokens_seen": 130595432, "step": 193810 }, { "epoch": 4.734932694891652, "grad_norm": 2.8152720915386453e-05, "learning_rate": 1.7076526160677563e-08, "loss": 0.0, "num_input_tokens_seen": 130598568, "step": 193815 }, { "epoch": 4.735054845723499, "grad_norm": 0.0005087078316137195, "learning_rate": 1.7060837394800244e-08, "loss": 0.0, "num_input_tokens_seen": 130602856, "step": 193820 }, { "epoch": 4.735176996555347, "grad_norm": 2.070388654829003e-05, "learning_rate": 1.7045155777120844e-08, "loss": 0.0, "num_input_tokens_seen": 130606248, "step": 193825 }, { "epoch": 4.7352991473871935, "grad_norm": 0.0008681362960487604, "learning_rate": 1.7029481307753613e-08, "loss": 0.0, "num_input_tokens_seen": 130610408, "step": 193830 }, { "epoch": 4.735421298219041, "grad_norm": 0.00028922062483616173, "learning_rate": 1.7013813986812233e-08, "loss": 0.0, "num_input_tokens_seen": 130615080, "step": 193835 }, { "epoch": 4.735543449050888, "grad_norm": 0.003609132720157504, "learning_rate": 1.6998153814410943e-08, "loss": 0.0, "num_input_tokens_seen": 130618152, "step": 193840 }, { "epoch": 4.7356655998827355, "grad_norm": 2.6319756216253154e-05, "learning_rate": 1.6982500790663325e-08, "loss": 0.0, "num_input_tokens_seen": 130621672, "step": 193845 }, { "epoch": 4.735787750714582, "grad_norm": 0.0005618033464998007, "learning_rate": 1.6966854915683504e-08, "loss": 0.0, "num_input_tokens_seen": 130625064, "step": 193850 }, { "epoch": 4.73590990154643, "grad_norm": 0.0007557669887319207, "learning_rate": 1.6951216189585062e-08, "loss": 0.0, "num_input_tokens_seen": 130628456, "step": 193855 }, { "epoch": 4.736032052378277, "grad_norm": 7.4730646701937076e-06, "learning_rate": 1.693558461248168e-08, "loss": 0.0, "num_input_tokens_seen": 130631848, "step": 193860 }, { "epoch": 4.736154203210123, "grad_norm": 0.00011738949979189783, "learning_rate": 1.691996018448727e-08, "loss": 0.0, "num_input_tokens_seen": 130634920, "step": 193865 }, { "epoch": 4.736276354041971, "grad_norm": 0.0007808083901181817, "learning_rate": 1.6904342905715297e-08, "loss": 0.0, "num_input_tokens_seen": 130638120, "step": 193870 }, { "epoch": 4.736398504873819, "grad_norm": 0.003294591326266527, "learning_rate": 1.6888732776279336e-08, "loss": 0.0, "num_input_tokens_seen": 130641448, "step": 193875 }, { "epoch": 4.736520655705665, "grad_norm": 7.634532084921375e-05, "learning_rate": 1.6873129796292964e-08, "loss": 0.0, "num_input_tokens_seen": 130645096, "step": 193880 }, { "epoch": 4.736642806537512, "grad_norm": 0.0010532429441809654, "learning_rate": 1.685753396586953e-08, "loss": 0.0, "num_input_tokens_seen": 130648744, "step": 193885 }, { "epoch": 4.73676495736936, "grad_norm": 0.00015523310867138207, "learning_rate": 1.6841945285122727e-08, "loss": 0.0, "num_input_tokens_seen": 130652200, "step": 193890 }, { "epoch": 4.7368871082012065, "grad_norm": 0.0006120402249507606, "learning_rate": 1.6826363754165573e-08, "loss": 0.0, "num_input_tokens_seen": 130655528, "step": 193895 }, { "epoch": 4.737009259033054, "grad_norm": 0.015912186354398727, "learning_rate": 1.6810789373111644e-08, "loss": 0.0, "num_input_tokens_seen": 130658472, "step": 193900 }, { "epoch": 4.737131409864901, "grad_norm": 0.0005159526481293142, "learning_rate": 1.6795222142073962e-08, "loss": 0.0, "num_input_tokens_seen": 130661928, "step": 193905 }, { "epoch": 4.7372535606967485, "grad_norm": 0.00624240655452013, "learning_rate": 1.677966206116599e-08, "loss": 0.0, "num_input_tokens_seen": 130665448, "step": 193910 }, { "epoch": 4.737375711528595, "grad_norm": 6.810051127104089e-05, "learning_rate": 1.676410913050086e-08, "loss": 0.0, "num_input_tokens_seen": 130668712, "step": 193915 }, { "epoch": 4.737497862360443, "grad_norm": 0.01036494504660368, "learning_rate": 1.674856335019137e-08, "loss": 0.0, "num_input_tokens_seen": 130671976, "step": 193920 }, { "epoch": 4.73762001319229, "grad_norm": 0.0002825463598128408, "learning_rate": 1.6733024720350987e-08, "loss": 0.0, "num_input_tokens_seen": 130675240, "step": 193925 }, { "epoch": 4.737742164024137, "grad_norm": 0.00041512216557748616, "learning_rate": 1.6717493241092396e-08, "loss": 0.0, "num_input_tokens_seen": 130678376, "step": 193930 }, { "epoch": 4.737864314855984, "grad_norm": 0.002276372630149126, "learning_rate": 1.670196891252873e-08, "loss": 0.0, "num_input_tokens_seen": 130681768, "step": 193935 }, { "epoch": 4.737986465687832, "grad_norm": 0.0005208499496802688, "learning_rate": 1.668645173477279e-08, "loss": 0.0, "num_input_tokens_seen": 130684968, "step": 193940 }, { "epoch": 4.738108616519678, "grad_norm": 0.0005167789640836418, "learning_rate": 1.667094170793748e-08, "loss": 0.0, "num_input_tokens_seen": 130688744, "step": 193945 }, { "epoch": 4.738230767351526, "grad_norm": 0.0034957481548190117, "learning_rate": 1.6655438832135494e-08, "loss": 0.0, "num_input_tokens_seen": 130692392, "step": 193950 }, { "epoch": 4.738352918183373, "grad_norm": 0.0005267342203296721, "learning_rate": 1.6639943107479627e-08, "loss": 0.0, "num_input_tokens_seen": 130695976, "step": 193955 }, { "epoch": 4.7384750690152195, "grad_norm": 0.0006037270068190992, "learning_rate": 1.6624454534082678e-08, "loss": 0.0, "num_input_tokens_seen": 130699560, "step": 193960 }, { "epoch": 4.738597219847067, "grad_norm": 0.00028693300555460155, "learning_rate": 1.6608973112057113e-08, "loss": 0.0, "num_input_tokens_seen": 130703528, "step": 193965 }, { "epoch": 4.738719370678915, "grad_norm": 0.01057329960167408, "learning_rate": 1.659349884151573e-08, "loss": 0.0, "num_input_tokens_seen": 130706920, "step": 193970 }, { "epoch": 4.7388415215107615, "grad_norm": 0.715209424495697, "learning_rate": 1.6578031722570774e-08, "loss": 0.0004, "num_input_tokens_seen": 130710376, "step": 193975 }, { "epoch": 4.738963672342608, "grad_norm": 1.2409607734298334e-05, "learning_rate": 1.656257175533493e-08, "loss": 0.0, "num_input_tokens_seen": 130713704, "step": 193980 }, { "epoch": 4.739085823174456, "grad_norm": 3.912892134394497e-05, "learning_rate": 1.6547118939920556e-08, "loss": 0.0, "num_input_tokens_seen": 130717032, "step": 193985 }, { "epoch": 4.739207974006303, "grad_norm": 0.0005390921141952276, "learning_rate": 1.6531673276440118e-08, "loss": 0.0, "num_input_tokens_seen": 130720936, "step": 193990 }, { "epoch": 4.73933012483815, "grad_norm": 0.00012799096293747425, "learning_rate": 1.6516234765005855e-08, "loss": 0.0001, "num_input_tokens_seen": 130724136, "step": 193995 }, { "epoch": 4.739452275669997, "grad_norm": 0.0004156142531428486, "learning_rate": 1.6500803405730013e-08, "loss": 0.0, "num_input_tokens_seen": 130727784, "step": 194000 }, { "epoch": 4.739574426501845, "grad_norm": 3.3402313420083374e-05, "learning_rate": 1.6485379198724948e-08, "loss": 0.0, "num_input_tokens_seen": 130730856, "step": 194005 }, { "epoch": 4.739696577333691, "grad_norm": 8.93677570275031e-05, "learning_rate": 1.6469962144102568e-08, "loss": 0.0, "num_input_tokens_seen": 130734312, "step": 194010 }, { "epoch": 4.739818728165539, "grad_norm": 1.1265015018580016e-05, "learning_rate": 1.645455224197534e-08, "loss": 0.0, "num_input_tokens_seen": 130737256, "step": 194015 }, { "epoch": 4.739940878997386, "grad_norm": 0.00013452736311592162, "learning_rate": 1.6439149492455172e-08, "loss": 0.0, "num_input_tokens_seen": 130740456, "step": 194020 }, { "epoch": 4.740063029829233, "grad_norm": 0.00022618900402449071, "learning_rate": 1.642375389565387e-08, "loss": 0.0, "num_input_tokens_seen": 130743528, "step": 194025 }, { "epoch": 4.74018518066108, "grad_norm": 0.003155721351504326, "learning_rate": 1.640836545168378e-08, "loss": 0.0, "num_input_tokens_seen": 130746792, "step": 194030 }, { "epoch": 4.740307331492928, "grad_norm": 0.0001418334140907973, "learning_rate": 1.6392984160656486e-08, "loss": 0.0, "num_input_tokens_seen": 130750056, "step": 194035 }, { "epoch": 4.740429482324775, "grad_norm": 0.00012222133227624, "learning_rate": 1.6377610022683897e-08, "loss": 0.0, "num_input_tokens_seen": 130752744, "step": 194040 }, { "epoch": 4.740551633156622, "grad_norm": 6.382190622389317e-05, "learning_rate": 1.6362243037878032e-08, "loss": 0.0, "num_input_tokens_seen": 130755944, "step": 194045 }, { "epoch": 4.740673783988469, "grad_norm": 16.01801300048828, "learning_rate": 1.634688320635047e-08, "loss": 0.0403, "num_input_tokens_seen": 130759464, "step": 194050 }, { "epoch": 4.740795934820316, "grad_norm": 9.348007733933628e-05, "learning_rate": 1.6331530528212902e-08, "loss": 0.0, "num_input_tokens_seen": 130762984, "step": 194055 }, { "epoch": 4.740918085652163, "grad_norm": 0.00011072187771787867, "learning_rate": 1.6316185003577008e-08, "loss": 0.0, "num_input_tokens_seen": 130766248, "step": 194060 }, { "epoch": 4.74104023648401, "grad_norm": 0.09227142482995987, "learning_rate": 1.630084663255449e-08, "loss": 0.0, "num_input_tokens_seen": 130769448, "step": 194065 }, { "epoch": 4.741162387315858, "grad_norm": 5.0311602535657585e-05, "learning_rate": 1.628551541525669e-08, "loss": 0.0, "num_input_tokens_seen": 130773032, "step": 194070 }, { "epoch": 4.7412845381477045, "grad_norm": 0.009662486612796783, "learning_rate": 1.6270191351795194e-08, "loss": 0.0, "num_input_tokens_seen": 130776424, "step": 194075 }, { "epoch": 4.741406688979552, "grad_norm": 0.00038499830407090485, "learning_rate": 1.6254874442281574e-08, "loss": 0.0, "num_input_tokens_seen": 130779944, "step": 194080 }, { "epoch": 4.741528839811399, "grad_norm": 0.01203479990363121, "learning_rate": 1.623956468682708e-08, "loss": 0.0, "num_input_tokens_seen": 130783144, "step": 194085 }, { "epoch": 4.7416509906432465, "grad_norm": 0.0028149220161139965, "learning_rate": 1.6224262085543063e-08, "loss": 0.0, "num_input_tokens_seen": 130786216, "step": 194090 }, { "epoch": 4.741773141475093, "grad_norm": 0.00014468298468273133, "learning_rate": 1.6208966638540766e-08, "loss": 0.0, "num_input_tokens_seen": 130789416, "step": 194095 }, { "epoch": 4.741895292306941, "grad_norm": 0.0025662481784820557, "learning_rate": 1.619367834593155e-08, "loss": 0.0, "num_input_tokens_seen": 130792680, "step": 194100 }, { "epoch": 4.742017443138788, "grad_norm": 0.0013403160264715552, "learning_rate": 1.6178397207826434e-08, "loss": 0.0, "num_input_tokens_seen": 130795944, "step": 194105 }, { "epoch": 4.742139593970635, "grad_norm": 0.00011099289258709177, "learning_rate": 1.616312322433666e-08, "loss": 0.0, "num_input_tokens_seen": 130799400, "step": 194110 }, { "epoch": 4.742261744802482, "grad_norm": 0.0006016991101205349, "learning_rate": 1.6147856395573258e-08, "loss": 0.0, "num_input_tokens_seen": 130802984, "step": 194115 }, { "epoch": 4.74238389563433, "grad_norm": 0.04025644809007645, "learning_rate": 1.613259672164735e-08, "loss": 0.0, "num_input_tokens_seen": 130807016, "step": 194120 }, { "epoch": 4.742506046466176, "grad_norm": 0.0006900572334416211, "learning_rate": 1.611734420266986e-08, "loss": 0.0, "num_input_tokens_seen": 130810344, "step": 194125 }, { "epoch": 4.742628197298024, "grad_norm": 0.026209458708763123, "learning_rate": 1.6102098838751465e-08, "loss": 0.0477, "num_input_tokens_seen": 130813928, "step": 194130 }, { "epoch": 4.742750348129871, "grad_norm": 0.002796885557472706, "learning_rate": 1.6086860630003418e-08, "loss": 0.0, "num_input_tokens_seen": 130817064, "step": 194135 }, { "epoch": 4.742872498961718, "grad_norm": 0.0003261294914409518, "learning_rate": 1.6071629576536295e-08, "loss": 0.0, "num_input_tokens_seen": 130820456, "step": 194140 }, { "epoch": 4.742994649793565, "grad_norm": 0.0002879606618080288, "learning_rate": 1.6056405678460892e-08, "loss": 0.0, "num_input_tokens_seen": 130823848, "step": 194145 }, { "epoch": 4.743116800625412, "grad_norm": 0.08680712431669235, "learning_rate": 1.604118893588802e-08, "loss": 0.0, "num_input_tokens_seen": 130827240, "step": 194150 }, { "epoch": 4.7432389514572595, "grad_norm": 4.954704490955919e-05, "learning_rate": 1.6025979348928242e-08, "loss": 0.0, "num_input_tokens_seen": 130830952, "step": 194155 }, { "epoch": 4.743361102289106, "grad_norm": 0.0006787815364077687, "learning_rate": 1.601077691769226e-08, "loss": 0.0, "num_input_tokens_seen": 130834088, "step": 194160 }, { "epoch": 4.743483253120954, "grad_norm": 0.00010076801117975265, "learning_rate": 1.5995581642290534e-08, "loss": 0.0, "num_input_tokens_seen": 130837352, "step": 194165 }, { "epoch": 4.743605403952801, "grad_norm": 1.2453545423340984e-05, "learning_rate": 1.5980393522833536e-08, "loss": 0.0, "num_input_tokens_seen": 130840296, "step": 194170 }, { "epoch": 4.743727554784648, "grad_norm": 0.0006471985834650695, "learning_rate": 1.596521255943184e-08, "loss": 0.0, "num_input_tokens_seen": 130844200, "step": 194175 }, { "epoch": 4.743849705616495, "grad_norm": 0.000188805308425799, "learning_rate": 1.5950038752195806e-08, "loss": 0.0, "num_input_tokens_seen": 130847208, "step": 194180 }, { "epoch": 4.743971856448343, "grad_norm": 0.0007714568055234849, "learning_rate": 1.5934872101235785e-08, "loss": 0.0, "num_input_tokens_seen": 130851368, "step": 194185 }, { "epoch": 4.744094007280189, "grad_norm": 4.891970456810668e-05, "learning_rate": 1.5919712606662027e-08, "loss": 0.0, "num_input_tokens_seen": 130854568, "step": 194190 }, { "epoch": 4.744216158112037, "grad_norm": 0.00020988327742088586, "learning_rate": 1.590456026858478e-08, "loss": 0.0, "num_input_tokens_seen": 130858600, "step": 194195 }, { "epoch": 4.744338308943884, "grad_norm": 0.00017921268590725958, "learning_rate": 1.588941508711428e-08, "loss": 0.0, "num_input_tokens_seen": 130861864, "step": 194200 }, { "epoch": 4.744460459775731, "grad_norm": 0.0008576527470722795, "learning_rate": 1.5874277062360663e-08, "loss": 0.0, "num_input_tokens_seen": 130865192, "step": 194205 }, { "epoch": 4.744582610607578, "grad_norm": 0.0070258015766739845, "learning_rate": 1.5859146194433958e-08, "loss": 0.0, "num_input_tokens_seen": 130868392, "step": 194210 }, { "epoch": 4.744704761439426, "grad_norm": 9.018840501084924e-05, "learning_rate": 1.5844022483444296e-08, "loss": 0.0, "num_input_tokens_seen": 130871592, "step": 194215 }, { "epoch": 4.7448269122712725, "grad_norm": 0.0017428853316232562, "learning_rate": 1.582890592950159e-08, "loss": 0.0, "num_input_tokens_seen": 130874664, "step": 194220 }, { "epoch": 4.744949063103119, "grad_norm": 0.007411367725580931, "learning_rate": 1.581379653271586e-08, "loss": 0.0, "num_input_tokens_seen": 130878056, "step": 194225 }, { "epoch": 4.745071213934967, "grad_norm": 0.001655631116591394, "learning_rate": 1.57986942931968e-08, "loss": 0.0, "num_input_tokens_seen": 130881768, "step": 194230 }, { "epoch": 4.7451933647668145, "grad_norm": 0.0011050160974264145, "learning_rate": 1.5783599211054434e-08, "loss": 0.0, "num_input_tokens_seen": 130885224, "step": 194235 }, { "epoch": 4.745315515598661, "grad_norm": 0.0002407036372460425, "learning_rate": 1.5768511286398446e-08, "loss": 0.0, "num_input_tokens_seen": 130888424, "step": 194240 }, { "epoch": 4.745437666430508, "grad_norm": 0.0037414035759866238, "learning_rate": 1.575343051933853e-08, "loss": 0.0, "num_input_tokens_seen": 130891624, "step": 194245 }, { "epoch": 4.745559817262356, "grad_norm": 0.00037726483424194157, "learning_rate": 1.5738356909984372e-08, "loss": 0.0, "num_input_tokens_seen": 130895016, "step": 194250 }, { "epoch": 4.745681968094202, "grad_norm": 0.0006118064629845321, "learning_rate": 1.572329045844578e-08, "loss": 0.0, "num_input_tokens_seen": 130898856, "step": 194255 }, { "epoch": 4.74580411892605, "grad_norm": 1.8054362953989767e-05, "learning_rate": 1.5708231164831998e-08, "loss": 0.0, "num_input_tokens_seen": 130902056, "step": 194260 }, { "epoch": 4.745926269757897, "grad_norm": 0.00450882688164711, "learning_rate": 1.569317902925271e-08, "loss": 0.0, "num_input_tokens_seen": 130905384, "step": 194265 }, { "epoch": 4.746048420589744, "grad_norm": 0.0017727892845869064, "learning_rate": 1.5678134051817392e-08, "loss": 0.0, "num_input_tokens_seen": 130908712, "step": 194270 }, { "epoch": 4.746170571421591, "grad_norm": 0.002703641774132848, "learning_rate": 1.56630962326354e-08, "loss": 0.0, "num_input_tokens_seen": 130911976, "step": 194275 }, { "epoch": 4.746292722253439, "grad_norm": 0.002856372855603695, "learning_rate": 1.5648065571816193e-08, "loss": 0.0, "num_input_tokens_seen": 130914728, "step": 194280 }, { "epoch": 4.7464148730852855, "grad_norm": 0.00012437388068065047, "learning_rate": 1.5633042069469025e-08, "loss": 0.0, "num_input_tokens_seen": 130917992, "step": 194285 }, { "epoch": 4.746537023917133, "grad_norm": 0.0003391630307305604, "learning_rate": 1.561802572570303e-08, "loss": 0.0, "num_input_tokens_seen": 130921512, "step": 194290 }, { "epoch": 4.74665917474898, "grad_norm": 4.5554650569101796e-05, "learning_rate": 1.560301654062768e-08, "loss": 0.0, "num_input_tokens_seen": 130925224, "step": 194295 }, { "epoch": 4.7467813255808275, "grad_norm": 2.3404105377267115e-05, "learning_rate": 1.5588014514351766e-08, "loss": 0.0, "num_input_tokens_seen": 130928424, "step": 194300 }, { "epoch": 4.746903476412674, "grad_norm": 0.0001046815377776511, "learning_rate": 1.5573019646984765e-08, "loss": 0.0001, "num_input_tokens_seen": 130932520, "step": 194305 }, { "epoch": 4.747025627244522, "grad_norm": 0.0001049903585226275, "learning_rate": 1.5558031938635474e-08, "loss": 0.0, "num_input_tokens_seen": 130936168, "step": 194310 }, { "epoch": 4.747147778076369, "grad_norm": 0.000262533692875877, "learning_rate": 1.554305138941292e-08, "loss": 0.0002, "num_input_tokens_seen": 130939240, "step": 194315 }, { "epoch": 4.747269928908215, "grad_norm": 0.000540116336196661, "learning_rate": 1.5528077999426125e-08, "loss": 0.0, "num_input_tokens_seen": 130943080, "step": 194320 }, { "epoch": 4.747392079740063, "grad_norm": 0.0008610247168689966, "learning_rate": 1.5513111768784004e-08, "loss": 0.0, "num_input_tokens_seen": 130946536, "step": 194325 }, { "epoch": 4.747514230571911, "grad_norm": 0.000537598563823849, "learning_rate": 1.5498152697595245e-08, "loss": 0.0, "num_input_tokens_seen": 130950312, "step": 194330 }, { "epoch": 4.747636381403757, "grad_norm": 0.00010188839951297268, "learning_rate": 1.5483200785968765e-08, "loss": 0.0, "num_input_tokens_seen": 130954088, "step": 194335 }, { "epoch": 4.747758532235604, "grad_norm": 1.9151226297253743e-05, "learning_rate": 1.546825603401325e-08, "loss": 0.0, "num_input_tokens_seen": 130957608, "step": 194340 }, { "epoch": 4.747880683067452, "grad_norm": 0.0001951848535099998, "learning_rate": 1.5453318441837282e-08, "loss": 0.0, "num_input_tokens_seen": 130961384, "step": 194345 }, { "epoch": 4.748002833899299, "grad_norm": 0.00010421617480460554, "learning_rate": 1.5438388009549665e-08, "loss": 0.0, "num_input_tokens_seen": 130965096, "step": 194350 }, { "epoch": 4.748124984731146, "grad_norm": 1.662445538386237e-05, "learning_rate": 1.5423464737258974e-08, "loss": 0.0, "num_input_tokens_seen": 130968488, "step": 194355 }, { "epoch": 4.748247135562993, "grad_norm": 0.0002314087760169059, "learning_rate": 1.540854862507357e-08, "loss": 0.0, "num_input_tokens_seen": 130972520, "step": 194360 }, { "epoch": 4.748369286394841, "grad_norm": 0.18845084309577942, "learning_rate": 1.5393639673102033e-08, "loss": 0.0001, "num_input_tokens_seen": 130975720, "step": 194365 }, { "epoch": 4.748491437226687, "grad_norm": 0.0035586815793067217, "learning_rate": 1.537873788145283e-08, "loss": 0.0, "num_input_tokens_seen": 130978600, "step": 194370 }, { "epoch": 4.748613588058535, "grad_norm": 0.0011052679037675261, "learning_rate": 1.536384325023421e-08, "loss": 0.0, "num_input_tokens_seen": 130982120, "step": 194375 }, { "epoch": 4.748735738890382, "grad_norm": 0.0009073576657101512, "learning_rate": 1.534895577955464e-08, "loss": 0.0, "num_input_tokens_seen": 130985384, "step": 194380 }, { "epoch": 4.748857889722229, "grad_norm": 0.004358983598649502, "learning_rate": 1.5334075469522146e-08, "loss": 0.0, "num_input_tokens_seen": 130988968, "step": 194385 }, { "epoch": 4.748980040554076, "grad_norm": 8.133276423905045e-05, "learning_rate": 1.5319202320245305e-08, "loss": 0.0, "num_input_tokens_seen": 130992616, "step": 194390 }, { "epoch": 4.749102191385924, "grad_norm": 0.029877588152885437, "learning_rate": 1.5304336331831924e-08, "loss": 0.0, "num_input_tokens_seen": 130996072, "step": 194395 }, { "epoch": 4.7492243422177705, "grad_norm": 0.02860754169523716, "learning_rate": 1.5289477504390358e-08, "loss": 0.0, "num_input_tokens_seen": 130999144, "step": 194400 }, { "epoch": 4.749346493049618, "grad_norm": 0.00028780565480701625, "learning_rate": 1.527462583802852e-08, "loss": 0.0, "num_input_tokens_seen": 131002408, "step": 194405 }, { "epoch": 4.749468643881465, "grad_norm": 0.0009346139850094914, "learning_rate": 1.5259781332854436e-08, "loss": 0.0, "num_input_tokens_seen": 131005864, "step": 194410 }, { "epoch": 4.749590794713312, "grad_norm": 0.0005739172920584679, "learning_rate": 1.5244943988976135e-08, "loss": 0.0, "num_input_tokens_seen": 131009256, "step": 194415 }, { "epoch": 4.749712945545159, "grad_norm": 0.00032403404475189745, "learning_rate": 1.523011380650141e-08, "loss": 0.0, "num_input_tokens_seen": 131012904, "step": 194420 }, { "epoch": 4.749835096377006, "grad_norm": 0.00260849273763597, "learning_rate": 1.521529078553818e-08, "loss": 0.0, "num_input_tokens_seen": 131016232, "step": 194425 }, { "epoch": 4.749957247208854, "grad_norm": 8.902177796699107e-06, "learning_rate": 1.5200474926194363e-08, "loss": 0.0006, "num_input_tokens_seen": 131019624, "step": 194430 }, { "epoch": 4.7500793980407, "grad_norm": 0.0015526276547461748, "learning_rate": 1.518566622857742e-08, "loss": 0.0058, "num_input_tokens_seen": 131022824, "step": 194435 }, { "epoch": 4.750201548872548, "grad_norm": 0.0007619461975991726, "learning_rate": 1.5170864692795272e-08, "loss": 0.0, "num_input_tokens_seen": 131026792, "step": 194440 }, { "epoch": 4.750323699704395, "grad_norm": 1.0548461432335898e-05, "learning_rate": 1.5156070318955384e-08, "loss": 0.0727, "num_input_tokens_seen": 131029672, "step": 194445 }, { "epoch": 4.750348129870765, "eval_loss": 0.3178049325942993, "eval_runtime": 47.5919, "eval_samples_per_second": 764.521, "eval_steps_per_second": 95.584, "num_input_tokens_seen": 131030440, "step": 194446 }, { "epoch": 4.750445850536242, "grad_norm": 0.0001051591825671494, "learning_rate": 1.514128310716556e-08, "loss": 0.0, "num_input_tokens_seen": 131033192, "step": 194450 }, { "epoch": 4.750568001368089, "grad_norm": 7.242747233249247e-05, "learning_rate": 1.512650305753316e-08, "loss": 0.0, "num_input_tokens_seen": 131036264, "step": 194455 }, { "epoch": 4.750690152199937, "grad_norm": 0.00018513934628572315, "learning_rate": 1.511173017016576e-08, "loss": 0.0, "num_input_tokens_seen": 131039144, "step": 194460 }, { "epoch": 4.7508123030317835, "grad_norm": 0.0014160667778924108, "learning_rate": 1.5096964445170723e-08, "loss": 0.0, "num_input_tokens_seen": 131042984, "step": 194465 }, { "epoch": 4.750934453863631, "grad_norm": 0.0006216327892616391, "learning_rate": 1.5082205882655518e-08, "loss": 0.0, "num_input_tokens_seen": 131046504, "step": 194470 }, { "epoch": 4.751056604695478, "grad_norm": 7.211205956991762e-05, "learning_rate": 1.506745448272728e-08, "loss": 0.0, "num_input_tokens_seen": 131049896, "step": 194475 }, { "epoch": 4.7511787555273255, "grad_norm": 0.0027999139856547117, "learning_rate": 1.5052710245493593e-08, "loss": 0.0, "num_input_tokens_seen": 131053288, "step": 194480 }, { "epoch": 4.751300906359172, "grad_norm": 0.0013571522431448102, "learning_rate": 1.503797317106148e-08, "loss": 0.0, "num_input_tokens_seen": 131056680, "step": 194485 }, { "epoch": 4.751423057191019, "grad_norm": 0.0003400477871764451, "learning_rate": 1.5023243259538078e-08, "loss": 0.0, "num_input_tokens_seen": 131060840, "step": 194490 }, { "epoch": 4.751545208022867, "grad_norm": 0.004371670540422201, "learning_rate": 1.5008520511030632e-08, "loss": 0.0, "num_input_tokens_seen": 131064104, "step": 194495 }, { "epoch": 4.751667358854714, "grad_norm": 0.0003847415209747851, "learning_rate": 1.499380492564617e-08, "loss": 0.0, "num_input_tokens_seen": 131067624, "step": 194500 }, { "epoch": 4.751789509686561, "grad_norm": 0.0026928375009447336, "learning_rate": 1.497909650349172e-08, "loss": 0.0, "num_input_tokens_seen": 131071144, "step": 194505 }, { "epoch": 4.751911660518408, "grad_norm": 0.00046413298696279526, "learning_rate": 1.4964395244674077e-08, "loss": 0.0489, "num_input_tokens_seen": 131074728, "step": 194510 }, { "epoch": 4.752033811350255, "grad_norm": 0.0003477961290627718, "learning_rate": 1.4949701149300385e-08, "loss": 0.0, "num_input_tokens_seen": 131077992, "step": 194515 }, { "epoch": 4.752155962182102, "grad_norm": 2.7544947442947887e-05, "learning_rate": 1.493501421747745e-08, "loss": 0.0, "num_input_tokens_seen": 131081192, "step": 194520 }, { "epoch": 4.75227811301395, "grad_norm": 0.008760979399085045, "learning_rate": 1.4920334449311957e-08, "loss": 0.0, "num_input_tokens_seen": 131084328, "step": 194525 }, { "epoch": 4.7524002638457965, "grad_norm": 6.047858278179774e-06, "learning_rate": 1.4905661844910934e-08, "loss": 0.0, "num_input_tokens_seen": 131087784, "step": 194530 }, { "epoch": 4.752522414677644, "grad_norm": 0.00041074713226407766, "learning_rate": 1.489099640438074e-08, "loss": 0.0, "num_input_tokens_seen": 131091112, "step": 194535 }, { "epoch": 4.752644565509491, "grad_norm": 0.00014632332022301853, "learning_rate": 1.4876338127828181e-08, "loss": 0.0, "num_input_tokens_seen": 131094248, "step": 194540 }, { "epoch": 4.7527667163413385, "grad_norm": 0.0059760636650025845, "learning_rate": 1.4861687015359947e-08, "loss": 0.0, "num_input_tokens_seen": 131097192, "step": 194545 }, { "epoch": 4.752888867173185, "grad_norm": 7.05088023096323e-05, "learning_rate": 1.4847043067082398e-08, "loss": 0.0266, "num_input_tokens_seen": 131100456, "step": 194550 }, { "epoch": 4.753011018005033, "grad_norm": 3.3506712497910485e-06, "learning_rate": 1.4832406283102228e-08, "loss": 0.0313, "num_input_tokens_seen": 131103720, "step": 194555 }, { "epoch": 4.75313316883688, "grad_norm": 0.0002796973567456007, "learning_rate": 1.4817776663525683e-08, "loss": 0.0, "num_input_tokens_seen": 131106920, "step": 194560 }, { "epoch": 4.753255319668727, "grad_norm": 2.2053438442526385e-05, "learning_rate": 1.4803154208459233e-08, "loss": 0.0, "num_input_tokens_seen": 131109992, "step": 194565 }, { "epoch": 4.753377470500574, "grad_norm": 0.005914537236094475, "learning_rate": 1.4788538918009242e-08, "loss": 0.0, "num_input_tokens_seen": 131113640, "step": 194570 }, { "epoch": 4.753499621332422, "grad_norm": 0.0007829848327673972, "learning_rate": 1.4773930792282064e-08, "loss": 0.0, "num_input_tokens_seen": 131116776, "step": 194575 }, { "epoch": 4.753621772164268, "grad_norm": 0.00043016657582484186, "learning_rate": 1.4759329831383837e-08, "loss": 0.0, "num_input_tokens_seen": 131120424, "step": 194580 }, { "epoch": 4.753743922996115, "grad_norm": 0.0001220947888214141, "learning_rate": 1.4744736035420702e-08, "loss": 0.0, "num_input_tokens_seen": 131123752, "step": 194585 }, { "epoch": 4.753866073827963, "grad_norm": 6.0042086261091754e-05, "learning_rate": 1.4730149404498905e-08, "loss": 0.0, "num_input_tokens_seen": 131126888, "step": 194590 }, { "epoch": 4.75398822465981, "grad_norm": 0.0038243704475462437, "learning_rate": 1.4715569938724359e-08, "loss": 0.0, "num_input_tokens_seen": 131130344, "step": 194595 }, { "epoch": 4.754110375491657, "grad_norm": 7.801556785125285e-05, "learning_rate": 1.4700997638203316e-08, "loss": 0.0, "num_input_tokens_seen": 131133800, "step": 194600 }, { "epoch": 4.754232526323504, "grad_norm": 0.00026956311194226146, "learning_rate": 1.468643250304158e-08, "loss": 0.0, "num_input_tokens_seen": 131136872, "step": 194605 }, { "epoch": 4.7543546771553515, "grad_norm": 0.0015906771877780557, "learning_rate": 1.4671874533345064e-08, "loss": 0.0, "num_input_tokens_seen": 131140200, "step": 194610 }, { "epoch": 4.754476827987198, "grad_norm": 0.0002600103907752782, "learning_rate": 1.4657323729219906e-08, "loss": 0.0, "num_input_tokens_seen": 131143912, "step": 194615 }, { "epoch": 4.754598978819046, "grad_norm": 0.013368427753448486, "learning_rate": 1.4642780090771467e-08, "loss": 0.0, "num_input_tokens_seen": 131147944, "step": 194620 }, { "epoch": 4.754721129650893, "grad_norm": 0.00011110393825219944, "learning_rate": 1.4628243618105996e-08, "loss": 0.0, "num_input_tokens_seen": 131151272, "step": 194625 }, { "epoch": 4.75484328048274, "grad_norm": 0.001707794377580285, "learning_rate": 1.4613714311328739e-08, "loss": 0.0, "num_input_tokens_seen": 131155112, "step": 194630 }, { "epoch": 4.754965431314587, "grad_norm": 0.008413401432335377, "learning_rate": 1.4599192170545838e-08, "loss": 0.0, "num_input_tokens_seen": 131158504, "step": 194635 }, { "epoch": 4.755087582146435, "grad_norm": 0.0002867542498279363, "learning_rate": 1.4584677195862538e-08, "loss": 0.0, "num_input_tokens_seen": 131161832, "step": 194640 }, { "epoch": 4.755209732978281, "grad_norm": 0.004074745811522007, "learning_rate": 1.4570169387384424e-08, "loss": 0.0, "num_input_tokens_seen": 131165864, "step": 194645 }, { "epoch": 4.755331883810129, "grad_norm": 0.02270219847559929, "learning_rate": 1.4555668745217186e-08, "loss": 0.0, "num_input_tokens_seen": 131169448, "step": 194650 }, { "epoch": 4.755454034641976, "grad_norm": 4.968816938344389e-05, "learning_rate": 1.4541175269466078e-08, "loss": 0.0, "num_input_tokens_seen": 131173096, "step": 194655 }, { "epoch": 4.755576185473823, "grad_norm": 0.0008052657940424979, "learning_rate": 1.4526688960236788e-08, "loss": 0.0, "num_input_tokens_seen": 131176232, "step": 194660 }, { "epoch": 4.75569833630567, "grad_norm": 0.008516672067344189, "learning_rate": 1.4512209817634235e-08, "loss": 0.0, "num_input_tokens_seen": 131179560, "step": 194665 }, { "epoch": 4.755820487137518, "grad_norm": 0.0010770387016236782, "learning_rate": 1.4497737841764114e-08, "loss": 0.0, "num_input_tokens_seen": 131182696, "step": 194670 }, { "epoch": 4.755942637969365, "grad_norm": 0.0012830148916691542, "learning_rate": 1.4483273032731447e-08, "loss": 0.0, "num_input_tokens_seen": 131186024, "step": 194675 }, { "epoch": 4.756064788801211, "grad_norm": 5.277749369270168e-05, "learning_rate": 1.4468815390641486e-08, "loss": 0.0, "num_input_tokens_seen": 131189416, "step": 194680 }, { "epoch": 4.756186939633059, "grad_norm": 2.9087057555443607e-05, "learning_rate": 1.4454364915599482e-08, "loss": 0.0, "num_input_tokens_seen": 131193064, "step": 194685 }, { "epoch": 4.756309090464906, "grad_norm": 0.00029995886143296957, "learning_rate": 1.4439921607710348e-08, "loss": 0.0, "num_input_tokens_seen": 131196392, "step": 194690 }, { "epoch": 4.756431241296753, "grad_norm": 0.04478108510375023, "learning_rate": 1.4425485467079113e-08, "loss": 0.0, "num_input_tokens_seen": 131199528, "step": 194695 }, { "epoch": 4.7565533921286, "grad_norm": 0.00012941205932293087, "learning_rate": 1.4411056493810913e-08, "loss": 0.0, "num_input_tokens_seen": 131203176, "step": 194700 }, { "epoch": 4.756675542960448, "grad_norm": 0.010789011605083942, "learning_rate": 1.4396634688010556e-08, "loss": 0.0, "num_input_tokens_seen": 131206248, "step": 194705 }, { "epoch": 4.7567976937922944, "grad_norm": 0.001222478342242539, "learning_rate": 1.4382220049783068e-08, "loss": 0.0, "num_input_tokens_seen": 131210088, "step": 194710 }, { "epoch": 4.756919844624142, "grad_norm": 0.00024161135661415756, "learning_rate": 1.436781257923303e-08, "loss": 0.0, "num_input_tokens_seen": 131213224, "step": 194715 }, { "epoch": 4.757041995455989, "grad_norm": 0.03183262422680855, "learning_rate": 1.4353412276465471e-08, "loss": 0.0, "num_input_tokens_seen": 131216488, "step": 194720 }, { "epoch": 4.7571641462878365, "grad_norm": 0.0014592782827094197, "learning_rate": 1.4339019141584973e-08, "loss": 0.0, "num_input_tokens_seen": 131219944, "step": 194725 }, { "epoch": 4.757286297119683, "grad_norm": 0.00017539637337904423, "learning_rate": 1.4324633174696343e-08, "loss": 0.0, "num_input_tokens_seen": 131225256, "step": 194730 }, { "epoch": 4.757408447951531, "grad_norm": 2.7212019631406292e-05, "learning_rate": 1.4310254375903941e-08, "loss": 0.0, "num_input_tokens_seen": 131228712, "step": 194735 }, { "epoch": 4.757530598783378, "grad_norm": 0.0010347808711230755, "learning_rate": 1.429588274531257e-08, "loss": 0.0, "num_input_tokens_seen": 131231912, "step": 194740 }, { "epoch": 4.757652749615225, "grad_norm": 0.9352474212646484, "learning_rate": 1.4281518283026595e-08, "loss": 0.0004, "num_input_tokens_seen": 131234984, "step": 194745 }, { "epoch": 4.757774900447072, "grad_norm": 5.460303509607911e-05, "learning_rate": 1.4267160989150595e-08, "loss": 0.0, "num_input_tokens_seen": 131237992, "step": 194750 }, { "epoch": 4.75789705127892, "grad_norm": 0.0440170057117939, "learning_rate": 1.4252810863788932e-08, "loss": 0.0, "num_input_tokens_seen": 131240744, "step": 194755 }, { "epoch": 4.758019202110766, "grad_norm": 0.0032277258578687906, "learning_rate": 1.4238467907045971e-08, "loss": 0.0, "num_input_tokens_seen": 131243752, "step": 194760 }, { "epoch": 4.758141352942614, "grad_norm": 8.80181833053939e-05, "learning_rate": 1.4224132119025956e-08, "loss": 0.0, "num_input_tokens_seen": 131246952, "step": 194765 }, { "epoch": 4.758263503774461, "grad_norm": 8.081180567387491e-05, "learning_rate": 1.420980349983325e-08, "loss": 0.0, "num_input_tokens_seen": 131250024, "step": 194770 }, { "epoch": 4.7583856546063075, "grad_norm": 2.834494262060616e-05, "learning_rate": 1.4195482049571993e-08, "loss": 0.0, "num_input_tokens_seen": 131253032, "step": 194775 }, { "epoch": 4.758507805438155, "grad_norm": 4.941626684740186e-05, "learning_rate": 1.4181167768346324e-08, "loss": 0.0, "num_input_tokens_seen": 131256360, "step": 194780 }, { "epoch": 4.758629956270002, "grad_norm": 8.536856330465525e-05, "learning_rate": 1.4166860656260271e-08, "loss": 0.0, "num_input_tokens_seen": 131259880, "step": 194785 }, { "epoch": 4.7587521071018495, "grad_norm": 0.00026171712670475245, "learning_rate": 1.4152560713418082e-08, "loss": 0.0, "num_input_tokens_seen": 131263464, "step": 194790 }, { "epoch": 4.758874257933696, "grad_norm": 0.007480157073587179, "learning_rate": 1.4138267939923565e-08, "loss": 0.0, "num_input_tokens_seen": 131266984, "step": 194795 }, { "epoch": 4.758996408765544, "grad_norm": 5.9159250668017194e-05, "learning_rate": 1.4123982335880746e-08, "loss": 0.0, "num_input_tokens_seen": 131270184, "step": 194800 }, { "epoch": 4.759118559597391, "grad_norm": 0.004247388802468777, "learning_rate": 1.4109703901393543e-08, "loss": 0.0, "num_input_tokens_seen": 131274536, "step": 194805 }, { "epoch": 4.759240710429238, "grad_norm": 0.006303729489445686, "learning_rate": 1.4095432636565763e-08, "loss": 0.0, "num_input_tokens_seen": 131278056, "step": 194810 }, { "epoch": 4.759362861261085, "grad_norm": 2.7018289983971044e-05, "learning_rate": 1.4081168541501099e-08, "loss": 0.0, "num_input_tokens_seen": 131281512, "step": 194815 }, { "epoch": 4.759485012092933, "grad_norm": 9.589582623448223e-05, "learning_rate": 1.4066911616303357e-08, "loss": 0.0, "num_input_tokens_seen": 131285096, "step": 194820 }, { "epoch": 4.759607162924779, "grad_norm": 0.0005603454192169011, "learning_rate": 1.4052661861076232e-08, "loss": 0.0, "num_input_tokens_seen": 131288680, "step": 194825 }, { "epoch": 4.759729313756627, "grad_norm": 0.000639497535303235, "learning_rate": 1.4038419275923419e-08, "loss": 0.0, "num_input_tokens_seen": 131291496, "step": 194830 }, { "epoch": 4.759851464588474, "grad_norm": 0.00018495744734536856, "learning_rate": 1.402418386094828e-08, "loss": 0.0, "num_input_tokens_seen": 131294440, "step": 194835 }, { "epoch": 4.759973615420321, "grad_norm": 0.0032376840244978666, "learning_rate": 1.400995561625451e-08, "loss": 0.0, "num_input_tokens_seen": 131297960, "step": 194840 }, { "epoch": 4.760095766252168, "grad_norm": 4.197334055788815e-05, "learning_rate": 1.3995734541945692e-08, "loss": 0.0, "num_input_tokens_seen": 131301480, "step": 194845 }, { "epoch": 4.760217917084015, "grad_norm": 0.0002773664309643209, "learning_rate": 1.3981520638124855e-08, "loss": 0.0, "num_input_tokens_seen": 131304872, "step": 194850 }, { "epoch": 4.7603400679158625, "grad_norm": 0.0001326696656178683, "learning_rate": 1.3967313904895805e-08, "loss": 0.0, "num_input_tokens_seen": 131308392, "step": 194855 }, { "epoch": 4.76046221874771, "grad_norm": 2.312642573087942e-05, "learning_rate": 1.3953114342361571e-08, "loss": 0.001, "num_input_tokens_seen": 131311912, "step": 194860 }, { "epoch": 4.760584369579557, "grad_norm": 0.02015681564807892, "learning_rate": 1.3938921950625515e-08, "loss": 0.0, "num_input_tokens_seen": 131315368, "step": 194865 }, { "epoch": 4.760706520411404, "grad_norm": 0.0017596333054825664, "learning_rate": 1.3924736729790775e-08, "loss": 0.0, "num_input_tokens_seen": 131319592, "step": 194870 }, { "epoch": 4.760828671243251, "grad_norm": 0.00010401655890746042, "learning_rate": 1.3910558679960715e-08, "loss": 0.0, "num_input_tokens_seen": 131323624, "step": 194875 }, { "epoch": 4.760950822075098, "grad_norm": 0.0013329308712854981, "learning_rate": 1.3896387801238141e-08, "loss": 0.0087, "num_input_tokens_seen": 131327400, "step": 194880 }, { "epoch": 4.761072972906946, "grad_norm": 0.0004021950880996883, "learning_rate": 1.3882224093726302e-08, "loss": 0.0, "num_input_tokens_seen": 131330600, "step": 194885 }, { "epoch": 4.761195123738792, "grad_norm": 0.025195332244038582, "learning_rate": 1.3868067557528228e-08, "loss": 0.0, "num_input_tokens_seen": 131333800, "step": 194890 }, { "epoch": 4.76131727457064, "grad_norm": 0.00035623961593955755, "learning_rate": 1.3853918192746839e-08, "loss": 0.0, "num_input_tokens_seen": 131336936, "step": 194895 }, { "epoch": 4.761439425402487, "grad_norm": 7.331221422646195e-05, "learning_rate": 1.3839775999484938e-08, "loss": 0.0, "num_input_tokens_seen": 131340456, "step": 194900 }, { "epoch": 4.761561576234334, "grad_norm": 0.008651269599795341, "learning_rate": 1.3825640977845333e-08, "loss": 0.0, "num_input_tokens_seen": 131344168, "step": 194905 }, { "epoch": 4.761683727066181, "grad_norm": 0.00015639110642950982, "learning_rate": 1.3811513127931052e-08, "loss": 0.0, "num_input_tokens_seen": 131347752, "step": 194910 }, { "epoch": 4.761805877898029, "grad_norm": 0.39226239919662476, "learning_rate": 1.379739244984468e-08, "loss": 0.0002, "num_input_tokens_seen": 131350696, "step": 194915 }, { "epoch": 4.7619280287298755, "grad_norm": 5.454105485114269e-05, "learning_rate": 1.3783278943688914e-08, "loss": 0.0, "num_input_tokens_seen": 131354216, "step": 194920 }, { "epoch": 4.762050179561723, "grad_norm": 0.006486842874437571, "learning_rate": 1.3769172609566337e-08, "loss": 0.0, "num_input_tokens_seen": 131357352, "step": 194925 }, { "epoch": 4.76217233039357, "grad_norm": 0.00033026799792423844, "learning_rate": 1.3755073447579646e-08, "loss": 0.0, "num_input_tokens_seen": 131360360, "step": 194930 }, { "epoch": 4.7622944812254175, "grad_norm": 8.527412137482315e-05, "learning_rate": 1.3740981457831424e-08, "loss": 0.0, "num_input_tokens_seen": 131363688, "step": 194935 }, { "epoch": 4.762416632057264, "grad_norm": 0.00025074477889575064, "learning_rate": 1.3726896640423924e-08, "loss": 0.0, "num_input_tokens_seen": 131367016, "step": 194940 }, { "epoch": 4.762538782889111, "grad_norm": 0.0016728693153709173, "learning_rate": 1.3712818995459841e-08, "loss": 0.0, "num_input_tokens_seen": 131370600, "step": 194945 }, { "epoch": 4.762660933720959, "grad_norm": 0.0001442194334231317, "learning_rate": 1.3698748523041314e-08, "loss": 0.0, "num_input_tokens_seen": 131373864, "step": 194950 }, { "epoch": 4.762783084552806, "grad_norm": 4.660491686081514e-05, "learning_rate": 1.368468522327082e-08, "loss": 0.0, "num_input_tokens_seen": 131377384, "step": 194955 }, { "epoch": 4.762905235384653, "grad_norm": 9.689110811450519e-06, "learning_rate": 1.3670629096250496e-08, "loss": 0.0, "num_input_tokens_seen": 131380520, "step": 194960 }, { "epoch": 4.7630273862165, "grad_norm": 0.0014319338370114565, "learning_rate": 1.3656580142082707e-08, "loss": 0.0, "num_input_tokens_seen": 131383912, "step": 194965 }, { "epoch": 4.763149537048347, "grad_norm": 0.00041627997416071594, "learning_rate": 1.3642538360869593e-08, "loss": 0.0, "num_input_tokens_seen": 131387240, "step": 194970 }, { "epoch": 4.763271687880194, "grad_norm": 0.007283204235136509, "learning_rate": 1.3628503752713183e-08, "loss": 0.0, "num_input_tokens_seen": 131390312, "step": 194975 }, { "epoch": 4.763393838712042, "grad_norm": 0.0030106627382338047, "learning_rate": 1.3614476317715618e-08, "loss": 0.0, "num_input_tokens_seen": 131393704, "step": 194980 }, { "epoch": 4.7635159895438886, "grad_norm": 0.0012613601284101605, "learning_rate": 1.3600456055978926e-08, "loss": 0.0, "num_input_tokens_seen": 131397160, "step": 194985 }, { "epoch": 4.763638140375736, "grad_norm": 8.995272219181061e-05, "learning_rate": 1.3586442967604916e-08, "loss": 0.0, "num_input_tokens_seen": 131400872, "step": 194990 }, { "epoch": 4.763760291207583, "grad_norm": 0.00010999714868376032, "learning_rate": 1.3572437052695729e-08, "loss": 0.0, "num_input_tokens_seen": 131403944, "step": 194995 }, { "epoch": 4.763882442039431, "grad_norm": 0.001251238165423274, "learning_rate": 1.355843831135306e-08, "loss": 0.0, "num_input_tokens_seen": 131407144, "step": 195000 }, { "epoch": 4.764004592871277, "grad_norm": 4.152482870267704e-05, "learning_rate": 1.3544446743678717e-08, "loss": 0.0, "num_input_tokens_seen": 131410728, "step": 195005 }, { "epoch": 4.764126743703125, "grad_norm": 0.0001953756291186437, "learning_rate": 1.3530462349774508e-08, "loss": 0.0, "num_input_tokens_seen": 131414440, "step": 195010 }, { "epoch": 4.764248894534972, "grad_norm": 0.0006240036454983056, "learning_rate": 1.3516485129742018e-08, "loss": 0.0, "num_input_tokens_seen": 131417896, "step": 195015 }, { "epoch": 4.764371045366819, "grad_norm": 0.16515390574932098, "learning_rate": 1.3502515083683164e-08, "loss": 0.0, "num_input_tokens_seen": 131420968, "step": 195020 }, { "epoch": 4.764493196198666, "grad_norm": 0.0005905067082494497, "learning_rate": 1.34885522116992e-08, "loss": 0.0, "num_input_tokens_seen": 131424104, "step": 195025 }, { "epoch": 4.764615347030514, "grad_norm": 0.0006180317723192275, "learning_rate": 1.3474596513891935e-08, "loss": 0.0, "num_input_tokens_seen": 131427304, "step": 195030 }, { "epoch": 4.7647374978623604, "grad_norm": 0.008717228658497334, "learning_rate": 1.3460647990362617e-08, "loss": 0.0, "num_input_tokens_seen": 131430760, "step": 195035 }, { "epoch": 4.764859648694207, "grad_norm": 0.0009299147059209645, "learning_rate": 1.3446706641212946e-08, "loss": 0.0, "num_input_tokens_seen": 131433768, "step": 195040 }, { "epoch": 4.764981799526055, "grad_norm": 0.0001631972409086302, "learning_rate": 1.3432772466544062e-08, "loss": 0.0, "num_input_tokens_seen": 131437096, "step": 195045 }, { "epoch": 4.765103950357902, "grad_norm": 8.587415504734963e-05, "learning_rate": 1.341884546645744e-08, "loss": 0.0, "num_input_tokens_seen": 131440296, "step": 195050 }, { "epoch": 4.765226101189749, "grad_norm": 5.65875307074748e-05, "learning_rate": 1.3404925641054331e-08, "loss": 0.0, "num_input_tokens_seen": 131443752, "step": 195055 }, { "epoch": 4.765348252021596, "grad_norm": 4.379829624667764e-05, "learning_rate": 1.3391012990436101e-08, "loss": 0.0039, "num_input_tokens_seen": 131446952, "step": 195060 }, { "epoch": 4.765470402853444, "grad_norm": 0.0006057017599232495, "learning_rate": 1.3377107514703667e-08, "loss": 0.0, "num_input_tokens_seen": 131450536, "step": 195065 }, { "epoch": 4.76559255368529, "grad_norm": 6.288501026574522e-05, "learning_rate": 1.3363209213958282e-08, "loss": 0.0, "num_input_tokens_seen": 131454184, "step": 195070 }, { "epoch": 4.765714704517138, "grad_norm": 0.024181053042411804, "learning_rate": 1.3349318088300976e-08, "loss": 0.0, "num_input_tokens_seen": 131457320, "step": 195075 }, { "epoch": 4.765836855348985, "grad_norm": 0.0012301692040637136, "learning_rate": 1.333543413783289e-08, "loss": 0.0, "num_input_tokens_seen": 131460520, "step": 195080 }, { "epoch": 4.765959006180832, "grad_norm": 0.00588964531198144, "learning_rate": 1.3321557362654833e-08, "loss": 0.0, "num_input_tokens_seen": 131463912, "step": 195085 }, { "epoch": 4.766081157012679, "grad_norm": 0.0011522286804392934, "learning_rate": 1.3307687762867836e-08, "loss": 0.0, "num_input_tokens_seen": 131467304, "step": 195090 }, { "epoch": 4.766203307844527, "grad_norm": 0.026447875425219536, "learning_rate": 1.3293825338572706e-08, "loss": 0.0, "num_input_tokens_seen": 131470888, "step": 195095 }, { "epoch": 4.7663254586763735, "grad_norm": 0.00015927547065075487, "learning_rate": 1.3279970089870251e-08, "loss": 0.0, "num_input_tokens_seen": 131474472, "step": 195100 }, { "epoch": 4.766447609508221, "grad_norm": 0.0005589300417341292, "learning_rate": 1.3266122016861392e-08, "loss": 0.0, "num_input_tokens_seen": 131477736, "step": 195105 }, { "epoch": 4.766569760340068, "grad_norm": 0.0003874331305269152, "learning_rate": 1.3252281119646491e-08, "loss": 0.0, "num_input_tokens_seen": 131481256, "step": 195110 }, { "epoch": 4.766691911171915, "grad_norm": 0.00016673367645125836, "learning_rate": 1.323844739832658e-08, "loss": 0.0, "num_input_tokens_seen": 131484072, "step": 195115 }, { "epoch": 4.766814062003762, "grad_norm": 0.042626719921827316, "learning_rate": 1.3224620853001911e-08, "loss": 0.0, "num_input_tokens_seen": 131486952, "step": 195120 }, { "epoch": 4.76693621283561, "grad_norm": 0.000655999465379864, "learning_rate": 1.3210801483773404e-08, "loss": 0.0, "num_input_tokens_seen": 131490344, "step": 195125 }, { "epoch": 4.767058363667457, "grad_norm": 0.0005754061858169734, "learning_rate": 1.3196989290741201e-08, "loss": 0.0, "num_input_tokens_seen": 131493864, "step": 195130 }, { "epoch": 4.767180514499303, "grad_norm": 0.00032430619467049837, "learning_rate": 1.3183184274005888e-08, "loss": 0.0, "num_input_tokens_seen": 131497576, "step": 195135 }, { "epoch": 4.767302665331151, "grad_norm": 0.0011158830020576715, "learning_rate": 1.316938643366805e-08, "loss": 0.0, "num_input_tokens_seen": 131500840, "step": 195140 }, { "epoch": 4.767424816162998, "grad_norm": 3.952207407564856e-05, "learning_rate": 1.3155595769827721e-08, "loss": 0.0, "num_input_tokens_seen": 131504168, "step": 195145 }, { "epoch": 4.767546966994845, "grad_norm": 0.00012400586274452507, "learning_rate": 1.3141812282585374e-08, "loss": 0.0, "num_input_tokens_seen": 131507432, "step": 195150 }, { "epoch": 4.767669117826692, "grad_norm": 1.1847202586068306e-05, "learning_rate": 1.3128035972041262e-08, "loss": 0.0, "num_input_tokens_seen": 131510888, "step": 195155 }, { "epoch": 4.76779126865854, "grad_norm": 0.0014117275131866336, "learning_rate": 1.3114266838295418e-08, "loss": 0.0, "num_input_tokens_seen": 131514408, "step": 195160 }, { "epoch": 4.7679134194903865, "grad_norm": 0.0005267454544082284, "learning_rate": 1.3100504881448093e-08, "loss": 0.0, "num_input_tokens_seen": 131517800, "step": 195165 }, { "epoch": 4.768035570322234, "grad_norm": 0.0010505430400371552, "learning_rate": 1.308675010159932e-08, "loss": 0.0, "num_input_tokens_seen": 131521384, "step": 195170 }, { "epoch": 4.768157721154081, "grad_norm": 0.00037277350202202797, "learning_rate": 1.307300249884924e-08, "loss": 0.0, "num_input_tokens_seen": 131524392, "step": 195175 }, { "epoch": 4.7682798719859285, "grad_norm": 0.029164448380470276, "learning_rate": 1.305926207329766e-08, "loss": 0.0, "num_input_tokens_seen": 131528168, "step": 195180 }, { "epoch": 4.768402022817775, "grad_norm": 4.427357271197252e-05, "learning_rate": 1.3045528825044615e-08, "loss": 0.0, "num_input_tokens_seen": 131531368, "step": 195185 }, { "epoch": 4.768524173649623, "grad_norm": 2.8566098990268074e-05, "learning_rate": 1.3031802754189913e-08, "loss": 0.0, "num_input_tokens_seen": 131534376, "step": 195190 }, { "epoch": 4.76864632448147, "grad_norm": 3.755874058697373e-05, "learning_rate": 1.3018083860833362e-08, "loss": 0.0004, "num_input_tokens_seen": 131537448, "step": 195195 }, { "epoch": 4.768768475313317, "grad_norm": 2.5464059945079498e-05, "learning_rate": 1.3004372145074883e-08, "loss": 0.0, "num_input_tokens_seen": 131540584, "step": 195200 }, { "epoch": 4.768890626145164, "grad_norm": 0.0004309054056648165, "learning_rate": 1.299066760701395e-08, "loss": 0.0, "num_input_tokens_seen": 131544360, "step": 195205 }, { "epoch": 4.769012776977011, "grad_norm": 0.0006671411683782935, "learning_rate": 1.2976970246750484e-08, "loss": 0.0, "num_input_tokens_seen": 131547816, "step": 195210 }, { "epoch": 4.769134927808858, "grad_norm": 0.00020214584947098047, "learning_rate": 1.2963280064383853e-08, "loss": 0.0, "num_input_tokens_seen": 131550824, "step": 195215 }, { "epoch": 4.769257078640706, "grad_norm": 0.0008767805411480367, "learning_rate": 1.2949597060013862e-08, "loss": 0.0, "num_input_tokens_seen": 131554280, "step": 195220 }, { "epoch": 4.769379229472553, "grad_norm": 0.0031209036242216825, "learning_rate": 1.2935921233739766e-08, "loss": 0.0, "num_input_tokens_seen": 131557480, "step": 195225 }, { "epoch": 4.7695013803043995, "grad_norm": 0.0023440788500010967, "learning_rate": 1.2922252585661153e-08, "loss": 0.0, "num_input_tokens_seen": 131560872, "step": 195230 }, { "epoch": 4.769623531136247, "grad_norm": 4.385735155665316e-05, "learning_rate": 1.2908591115877499e-08, "loss": 0.0, "num_input_tokens_seen": 131563944, "step": 195235 }, { "epoch": 4.769745681968094, "grad_norm": 0.002352142706513405, "learning_rate": 1.2894936824487945e-08, "loss": 0.0, "num_input_tokens_seen": 131567080, "step": 195240 }, { "epoch": 4.7698678327999415, "grad_norm": 3.444106914685108e-05, "learning_rate": 1.288128971159197e-08, "loss": 0.0, "num_input_tokens_seen": 131570408, "step": 195245 }, { "epoch": 4.769989983631788, "grad_norm": 0.00010280736751155928, "learning_rate": 1.2867649777288826e-08, "loss": 0.0, "num_input_tokens_seen": 131573608, "step": 195250 }, { "epoch": 4.770112134463636, "grad_norm": 0.01051387283951044, "learning_rate": 1.2854017021677543e-08, "loss": 0.0, "num_input_tokens_seen": 131578984, "step": 195255 }, { "epoch": 4.770234285295483, "grad_norm": 0.00026178904226981103, "learning_rate": 1.284039144485749e-08, "loss": 0.0, "num_input_tokens_seen": 131582120, "step": 195260 }, { "epoch": 4.77035643612733, "grad_norm": 0.0002798176428768784, "learning_rate": 1.2826773046927475e-08, "loss": 0.0, "num_input_tokens_seen": 131585448, "step": 195265 }, { "epoch": 4.770478586959177, "grad_norm": 4.130896195420064e-05, "learning_rate": 1.2813161827986752e-08, "loss": 0.0, "num_input_tokens_seen": 131589160, "step": 195270 }, { "epoch": 4.770600737791025, "grad_norm": 0.0005915540968999267, "learning_rate": 1.2799557788134241e-08, "loss": 0.0, "num_input_tokens_seen": 131592168, "step": 195275 }, { "epoch": 4.770722888622871, "grad_norm": 0.00042665572254918516, "learning_rate": 1.2785960927468863e-08, "loss": 0.0, "num_input_tokens_seen": 131595176, "step": 195280 }, { "epoch": 4.770845039454719, "grad_norm": 7.571290916530415e-05, "learning_rate": 1.2772371246089431e-08, "loss": 0.0, "num_input_tokens_seen": 131598376, "step": 195285 }, { "epoch": 4.770967190286566, "grad_norm": 0.0002932634379249066, "learning_rate": 1.2758788744094862e-08, "loss": 0.0, "num_input_tokens_seen": 131602984, "step": 195290 }, { "epoch": 4.771089341118413, "grad_norm": 0.0005674429121427238, "learning_rate": 1.2745213421583967e-08, "loss": 0.0, "num_input_tokens_seen": 131606248, "step": 195295 }, { "epoch": 4.77121149195026, "grad_norm": 0.0014069292228668928, "learning_rate": 1.2731645278655445e-08, "loss": 0.0, "num_input_tokens_seen": 131609512, "step": 195300 }, { "epoch": 4.771333642782107, "grad_norm": 0.023213129490613937, "learning_rate": 1.2718084315407995e-08, "loss": 0.0, "num_input_tokens_seen": 131612968, "step": 195305 }, { "epoch": 4.7714557936139546, "grad_norm": 7.675900997128338e-05, "learning_rate": 1.2704530531939982e-08, "loss": 0.0, "num_input_tokens_seen": 131616808, "step": 195310 }, { "epoch": 4.771577944445801, "grad_norm": 0.00011649368389043957, "learning_rate": 1.269098392835033e-08, "loss": 0.0, "num_input_tokens_seen": 131620008, "step": 195315 }, { "epoch": 4.771700095277649, "grad_norm": 0.0007854030118323863, "learning_rate": 1.26774445047374e-08, "loss": 0.0, "num_input_tokens_seen": 131623720, "step": 195320 }, { "epoch": 4.771822246109496, "grad_norm": 0.05870318040251732, "learning_rate": 1.266391226119956e-08, "loss": 0.0, "num_input_tokens_seen": 131627176, "step": 195325 }, { "epoch": 4.771944396941343, "grad_norm": 6.638450577156618e-05, "learning_rate": 1.2650387197835399e-08, "loss": 0.0, "num_input_tokens_seen": 131630888, "step": 195330 }, { "epoch": 4.77206654777319, "grad_norm": 0.0003788423491641879, "learning_rate": 1.2636869314743059e-08, "loss": 0.0, "num_input_tokens_seen": 131634536, "step": 195335 }, { "epoch": 4.772188698605038, "grad_norm": 0.0008416434284299612, "learning_rate": 1.2623358612021128e-08, "loss": 0.0001, "num_input_tokens_seen": 131637736, "step": 195340 }, { "epoch": 4.772310849436884, "grad_norm": 0.0003930965031031519, "learning_rate": 1.260985508976764e-08, "loss": 0.0, "num_input_tokens_seen": 131641000, "step": 195345 }, { "epoch": 4.772433000268732, "grad_norm": 0.0031450181268155575, "learning_rate": 1.2596358748080848e-08, "loss": 0.0, "num_input_tokens_seen": 131644712, "step": 195350 }, { "epoch": 4.772555151100579, "grad_norm": 1.319029524893267e-05, "learning_rate": 1.2582869587059008e-08, "loss": 0.0, "num_input_tokens_seen": 131648104, "step": 195355 }, { "epoch": 4.7726773019324265, "grad_norm": 0.0007021019700914621, "learning_rate": 1.2569387606800041e-08, "loss": 0.0, "num_input_tokens_seen": 131651368, "step": 195360 }, { "epoch": 4.772799452764273, "grad_norm": 0.0006127048982307315, "learning_rate": 1.2555912807402092e-08, "loss": 0.0, "num_input_tokens_seen": 131654632, "step": 195365 }, { "epoch": 4.772921603596121, "grad_norm": 0.0014195807743817568, "learning_rate": 1.254244518896308e-08, "loss": 0.0, "num_input_tokens_seen": 131657768, "step": 195370 }, { "epoch": 4.773043754427968, "grad_norm": 4.549880395643413e-05, "learning_rate": 1.2528984751581151e-08, "loss": 0.0, "num_input_tokens_seen": 131661160, "step": 195375 }, { "epoch": 4.773165905259814, "grad_norm": 0.00033199405879713595, "learning_rate": 1.2515531495353893e-08, "loss": 0.0, "num_input_tokens_seen": 131664808, "step": 195380 }, { "epoch": 4.773288056091662, "grad_norm": 0.00022149684082251042, "learning_rate": 1.2502085420379337e-08, "loss": 0.0, "num_input_tokens_seen": 131667688, "step": 195385 }, { "epoch": 4.77341020692351, "grad_norm": 7.733409438515082e-05, "learning_rate": 1.2488646526755187e-08, "loss": 0.0, "num_input_tokens_seen": 131671080, "step": 195390 }, { "epoch": 4.773532357755356, "grad_norm": 0.0018248482374474406, "learning_rate": 1.2475214814579248e-08, "loss": 0.0, "num_input_tokens_seen": 131674152, "step": 195395 }, { "epoch": 4.773654508587203, "grad_norm": 8.610795339336619e-05, "learning_rate": 1.2461790283949115e-08, "loss": 0.0, "num_input_tokens_seen": 131677416, "step": 195400 }, { "epoch": 4.773776659419051, "grad_norm": 0.06222959980368614, "learning_rate": 1.244837293496248e-08, "loss": 0.0, "num_input_tokens_seen": 131680616, "step": 195405 }, { "epoch": 4.7738988102508975, "grad_norm": 0.003180848667398095, "learning_rate": 1.2434962767716939e-08, "loss": 0.0, "num_input_tokens_seen": 131683944, "step": 195410 }, { "epoch": 4.774020961082745, "grad_norm": 0.00010856594599317759, "learning_rate": 1.2421559782309966e-08, "loss": 0.0, "num_input_tokens_seen": 131686952, "step": 195415 }, { "epoch": 4.774143111914592, "grad_norm": 0.00012096684804419056, "learning_rate": 1.2408163978839036e-08, "loss": 0.0, "num_input_tokens_seen": 131689896, "step": 195420 }, { "epoch": 4.7742652627464395, "grad_norm": 0.000282303401036188, "learning_rate": 1.2394775357401521e-08, "loss": 0.0, "num_input_tokens_seen": 131693608, "step": 195425 }, { "epoch": 4.774387413578286, "grad_norm": 0.0012512014945968986, "learning_rate": 1.2381393918094784e-08, "loss": 0.0348, "num_input_tokens_seen": 131697064, "step": 195430 }, { "epoch": 4.774509564410134, "grad_norm": 0.0003305374411866069, "learning_rate": 1.2368019661016304e-08, "loss": 0.0, "num_input_tokens_seen": 131700072, "step": 195435 }, { "epoch": 4.774631715241981, "grad_norm": 0.0003950181417167187, "learning_rate": 1.2354652586263226e-08, "loss": 0.0, "num_input_tokens_seen": 131703144, "step": 195440 }, { "epoch": 4.774753866073828, "grad_norm": 0.0002270180848427117, "learning_rate": 1.2341292693932692e-08, "loss": 0.0, "num_input_tokens_seen": 131706664, "step": 195445 }, { "epoch": 4.774876016905675, "grad_norm": 0.0008159330463968217, "learning_rate": 1.232793998412196e-08, "loss": 0.0, "num_input_tokens_seen": 131709992, "step": 195450 }, { "epoch": 4.774998167737523, "grad_norm": 0.0028800824657082558, "learning_rate": 1.2314594456928063e-08, "loss": 0.0, "num_input_tokens_seen": 131713576, "step": 195455 }, { "epoch": 4.775120318569369, "grad_norm": 0.14265161752700806, "learning_rate": 1.2301256112448144e-08, "loss": 0.0, "num_input_tokens_seen": 131716584, "step": 195460 }, { "epoch": 4.775242469401217, "grad_norm": 6.738114461768419e-05, "learning_rate": 1.2287924950779238e-08, "loss": 0.0, "num_input_tokens_seen": 131719912, "step": 195465 }, { "epoch": 4.775364620233064, "grad_norm": 0.0014600591966882348, "learning_rate": 1.2274600972018046e-08, "loss": 0.0, "num_input_tokens_seen": 131723688, "step": 195470 }, { "epoch": 4.7754867710649105, "grad_norm": 0.08191387355327606, "learning_rate": 1.226128417626171e-08, "loss": 0.0, "num_input_tokens_seen": 131726888, "step": 195475 }, { "epoch": 4.775608921896758, "grad_norm": 0.00014762043429072946, "learning_rate": 1.2247974563606823e-08, "loss": 0.0, "num_input_tokens_seen": 131730024, "step": 195480 }, { "epoch": 4.775731072728606, "grad_norm": 0.0026744746137410402, "learning_rate": 1.2234672134150525e-08, "loss": 0.0, "num_input_tokens_seen": 131733032, "step": 195485 }, { "epoch": 4.7758532235604525, "grad_norm": 0.02477974258363247, "learning_rate": 1.2221376887989298e-08, "loss": 0.0, "num_input_tokens_seen": 131736808, "step": 195490 }, { "epoch": 4.775975374392299, "grad_norm": 7.066877151373774e-05, "learning_rate": 1.2208088825219954e-08, "loss": 0.0, "num_input_tokens_seen": 131740328, "step": 195495 }, { "epoch": 4.776097525224147, "grad_norm": 0.006642943248152733, "learning_rate": 1.2194807945938967e-08, "loss": 0.0, "num_input_tokens_seen": 131743592, "step": 195500 }, { "epoch": 4.776219676055994, "grad_norm": 0.0001629340258659795, "learning_rate": 1.2181534250243041e-08, "loss": 0.0, "num_input_tokens_seen": 131746984, "step": 195505 }, { "epoch": 4.776341826887841, "grad_norm": 0.0013602145481854677, "learning_rate": 1.2168267738228765e-08, "loss": 0.0, "num_input_tokens_seen": 131750248, "step": 195510 }, { "epoch": 4.776463977719688, "grad_norm": 0.04610900953412056, "learning_rate": 1.2155008409992507e-08, "loss": 0.0, "num_input_tokens_seen": 131753768, "step": 195515 }, { "epoch": 4.776586128551536, "grad_norm": 0.0009138291934505105, "learning_rate": 1.2141756265630742e-08, "loss": 0.0, "num_input_tokens_seen": 131756904, "step": 195520 }, { "epoch": 4.776708279383382, "grad_norm": 0.004156212322413921, "learning_rate": 1.212851130523973e-08, "loss": 0.0, "num_input_tokens_seen": 131760296, "step": 195525 }, { "epoch": 4.77683043021523, "grad_norm": 0.0008036352810449898, "learning_rate": 1.211527352891606e-08, "loss": 0.0, "num_input_tokens_seen": 131763688, "step": 195530 }, { "epoch": 4.776952581047077, "grad_norm": 0.0006484703626483679, "learning_rate": 1.2102042936755652e-08, "loss": 0.0, "num_input_tokens_seen": 131767400, "step": 195535 }, { "epoch": 4.777074731878924, "grad_norm": 3.612769432947971e-05, "learning_rate": 1.2088819528854988e-08, "loss": 0.0, "num_input_tokens_seen": 131770920, "step": 195540 }, { "epoch": 4.777196882710771, "grad_norm": 5.983481969451532e-05, "learning_rate": 1.2075603305310211e-08, "loss": 0.0, "num_input_tokens_seen": 131774056, "step": 195545 }, { "epoch": 4.777319033542619, "grad_norm": 0.006965314038097858, "learning_rate": 1.2062394266217246e-08, "loss": 0.0001, "num_input_tokens_seen": 131777640, "step": 195550 }, { "epoch": 4.7774411843744655, "grad_norm": 0.0002887119189836085, "learning_rate": 1.2049192411672348e-08, "loss": 0.0, "num_input_tokens_seen": 131781032, "step": 195555 }, { "epoch": 4.777563335206313, "grad_norm": 5.628536200674716e-06, "learning_rate": 1.2035997741771442e-08, "loss": 0.0, "num_input_tokens_seen": 131784168, "step": 195560 }, { "epoch": 4.77768548603816, "grad_norm": 6.626116373809054e-05, "learning_rate": 1.202281025661045e-08, "loss": 0.0, "num_input_tokens_seen": 131787496, "step": 195565 }, { "epoch": 4.777807636870007, "grad_norm": 0.0005148024065420032, "learning_rate": 1.2009629956285405e-08, "loss": 0.0, "num_input_tokens_seen": 131790888, "step": 195570 }, { "epoch": 4.777929787701854, "grad_norm": 0.00042017377563752234, "learning_rate": 1.1996456840892011e-08, "loss": 0.0, "num_input_tokens_seen": 131793768, "step": 195575 }, { "epoch": 4.778051938533701, "grad_norm": 0.002864888869225979, "learning_rate": 1.1983290910526079e-08, "loss": 0.0, "num_input_tokens_seen": 131797416, "step": 195580 }, { "epoch": 4.778174089365549, "grad_norm": 0.0014279955066740513, "learning_rate": 1.197013216528342e-08, "loss": 0.0, "num_input_tokens_seen": 131800744, "step": 195585 }, { "epoch": 4.778296240197395, "grad_norm": 8.794532732281368e-06, "learning_rate": 1.1956980605259737e-08, "loss": 0.0, "num_input_tokens_seen": 131803944, "step": 195590 }, { "epoch": 4.778418391029243, "grad_norm": 0.0006799935363233089, "learning_rate": 1.1943836230550619e-08, "loss": 0.0, "num_input_tokens_seen": 131807208, "step": 195595 }, { "epoch": 4.77854054186109, "grad_norm": 6.255536573007703e-05, "learning_rate": 1.1930699041251657e-08, "loss": 0.0, "num_input_tokens_seen": 131810856, "step": 195600 }, { "epoch": 4.778662692692937, "grad_norm": 0.0006646077963523567, "learning_rate": 1.1917569037458553e-08, "loss": 0.0007, "num_input_tokens_seen": 131813800, "step": 195605 }, { "epoch": 4.778784843524784, "grad_norm": 0.00029163400176912546, "learning_rate": 1.1904446219266451e-08, "loss": 0.0, "num_input_tokens_seen": 131817384, "step": 195610 }, { "epoch": 4.778906994356632, "grad_norm": 0.00019608518050517887, "learning_rate": 1.1891330586771165e-08, "loss": 0.0, "num_input_tokens_seen": 131820648, "step": 195615 }, { "epoch": 4.7790291451884785, "grad_norm": 4.607576920534484e-05, "learning_rate": 1.1878222140067729e-08, "loss": 0.0, "num_input_tokens_seen": 131824168, "step": 195620 }, { "epoch": 4.779151296020326, "grad_norm": 0.00015413403161801398, "learning_rate": 1.1865120879251734e-08, "loss": 0.0, "num_input_tokens_seen": 131827240, "step": 195625 }, { "epoch": 4.779273446852173, "grad_norm": 0.0007240389240905643, "learning_rate": 1.1852026804418325e-08, "loss": 0.0, "num_input_tokens_seen": 131830184, "step": 195630 }, { "epoch": 4.779395597684021, "grad_norm": 1.851908746175468e-05, "learning_rate": 1.1838939915662761e-08, "loss": 0.0, "num_input_tokens_seen": 131833384, "step": 195635 }, { "epoch": 4.779517748515867, "grad_norm": 0.005462095607072115, "learning_rate": 1.1825860213080185e-08, "loss": 0.0, "num_input_tokens_seen": 131836968, "step": 195640 }, { "epoch": 4.779639899347715, "grad_norm": 0.000327201618347317, "learning_rate": 1.1812787696765747e-08, "loss": 0.0, "num_input_tokens_seen": 131840872, "step": 195645 }, { "epoch": 4.779762050179562, "grad_norm": 9.929361112881452e-05, "learning_rate": 1.1799722366814591e-08, "loss": 0.0, "num_input_tokens_seen": 131844008, "step": 195650 }, { "epoch": 4.779884201011409, "grad_norm": 7.03208424965851e-05, "learning_rate": 1.1786664223321529e-08, "loss": 0.0004, "num_input_tokens_seen": 131847336, "step": 195655 }, { "epoch": 4.780006351843256, "grad_norm": 3.4296645026188344e-05, "learning_rate": 1.177361326638171e-08, "loss": 0.0, "num_input_tokens_seen": 131850344, "step": 195660 }, { "epoch": 4.780128502675103, "grad_norm": 0.00012804471771232784, "learning_rate": 1.1760569496089946e-08, "loss": 0.0, "num_input_tokens_seen": 131853224, "step": 195665 }, { "epoch": 4.78025065350695, "grad_norm": 0.0007066622492857277, "learning_rate": 1.1747532912541159e-08, "loss": 0.0, "num_input_tokens_seen": 131856168, "step": 195670 }, { "epoch": 4.780372804338797, "grad_norm": 0.0008149920031428337, "learning_rate": 1.1734503515830053e-08, "loss": 0.0, "num_input_tokens_seen": 131859368, "step": 195675 }, { "epoch": 4.780494955170645, "grad_norm": 0.0002996523689944297, "learning_rate": 1.172148130605155e-08, "loss": 0.0, "num_input_tokens_seen": 131862760, "step": 195680 }, { "epoch": 4.780617106002492, "grad_norm": 0.001203904626891017, "learning_rate": 1.1708466283300245e-08, "loss": 0.0, "num_input_tokens_seen": 131866280, "step": 195685 }, { "epoch": 4.780739256834339, "grad_norm": 6.254316394915804e-05, "learning_rate": 1.1695458447670725e-08, "loss": 0.0, "num_input_tokens_seen": 131869992, "step": 195690 }, { "epoch": 4.780861407666186, "grad_norm": 0.0009527397342026234, "learning_rate": 1.1682457799257584e-08, "loss": 0.0336, "num_input_tokens_seen": 131873768, "step": 195695 }, { "epoch": 4.780983558498034, "grad_norm": 0.003038577502593398, "learning_rate": 1.1669464338155632e-08, "loss": 0.0, "num_input_tokens_seen": 131877672, "step": 195700 }, { "epoch": 4.78110570932988, "grad_norm": 0.0008341276552528143, "learning_rate": 1.1656478064459019e-08, "loss": 0.0, "num_input_tokens_seen": 131880616, "step": 195705 }, { "epoch": 4.781227860161728, "grad_norm": 0.0013110843719914556, "learning_rate": 1.1643498978262334e-08, "loss": 0.0, "num_input_tokens_seen": 131884072, "step": 195710 }, { "epoch": 4.781350010993575, "grad_norm": 5.7564688177080825e-05, "learning_rate": 1.1630527079660057e-08, "loss": 0.0, "num_input_tokens_seen": 131887464, "step": 195715 }, { "epoch": 4.781472161825422, "grad_norm": 0.0017062549013644457, "learning_rate": 1.1617562368746226e-08, "loss": 0.0, "num_input_tokens_seen": 131891048, "step": 195720 }, { "epoch": 4.781594312657269, "grad_norm": 5.923534627072513e-05, "learning_rate": 1.160460484561554e-08, "loss": 0.0, "num_input_tokens_seen": 131894312, "step": 195725 }, { "epoch": 4.781716463489117, "grad_norm": 0.08431998640298843, "learning_rate": 1.1591654510361926e-08, "loss": 0.0, "num_input_tokens_seen": 131897960, "step": 195730 }, { "epoch": 4.7818386143209635, "grad_norm": 0.00012925357441417873, "learning_rate": 1.157871136307964e-08, "loss": 0.0001, "num_input_tokens_seen": 131901544, "step": 195735 }, { "epoch": 4.78196076515281, "grad_norm": 0.00012562771735247225, "learning_rate": 1.1565775403862831e-08, "loss": 0.0, "num_input_tokens_seen": 131904680, "step": 195740 }, { "epoch": 4.782082915984658, "grad_norm": 1.1264551176282112e-05, "learning_rate": 1.1552846632805646e-08, "loss": 0.0, "num_input_tokens_seen": 131908008, "step": 195745 }, { "epoch": 4.7822050668165055, "grad_norm": 0.00039110734360292554, "learning_rate": 1.1539925050001897e-08, "loss": 0.0, "num_input_tokens_seen": 131911400, "step": 195750 }, { "epoch": 4.782327217648352, "grad_norm": 4.613706187228672e-05, "learning_rate": 1.1527010655545621e-08, "loss": 0.0436, "num_input_tokens_seen": 131915304, "step": 195755 }, { "epoch": 4.782449368480199, "grad_norm": 0.0015119805466383696, "learning_rate": 1.1514103449530966e-08, "loss": 0.0, "num_input_tokens_seen": 131918312, "step": 195760 }, { "epoch": 4.782571519312047, "grad_norm": 0.0010038913460448384, "learning_rate": 1.150120343205152e-08, "loss": 0.0, "num_input_tokens_seen": 131922280, "step": 195765 }, { "epoch": 4.782693670143893, "grad_norm": 34.064022064208984, "learning_rate": 1.1488310603201323e-08, "loss": 0.0538, "num_input_tokens_seen": 131925352, "step": 195770 }, { "epoch": 4.782815820975741, "grad_norm": 0.00014960873522795737, "learning_rate": 1.1475424963073853e-08, "loss": 0.0, "num_input_tokens_seen": 131928488, "step": 195775 }, { "epoch": 4.782937971807588, "grad_norm": 0.004268994554877281, "learning_rate": 1.1462546511763039e-08, "loss": 0.0, "num_input_tokens_seen": 131931560, "step": 195780 }, { "epoch": 4.783060122639435, "grad_norm": 0.0008855354390107095, "learning_rate": 1.1449675249362467e-08, "loss": 0.0, "num_input_tokens_seen": 131935400, "step": 195785 }, { "epoch": 4.783182273471282, "grad_norm": 6.182612560223788e-05, "learning_rate": 1.1436811175965732e-08, "loss": 0.0, "num_input_tokens_seen": 131938920, "step": 195790 }, { "epoch": 4.78330442430313, "grad_norm": 1.5357125448645093e-05, "learning_rate": 1.1423954291666427e-08, "loss": 0.0, "num_input_tokens_seen": 131942120, "step": 195795 }, { "epoch": 4.7834265751349765, "grad_norm": 1.5569354218314402e-05, "learning_rate": 1.141110459655803e-08, "loss": 0.0, "num_input_tokens_seen": 131945064, "step": 195800 }, { "epoch": 4.783548725966824, "grad_norm": 0.0012987203663215041, "learning_rate": 1.1398262090733913e-08, "loss": 0.0, "num_input_tokens_seen": 131948776, "step": 195805 }, { "epoch": 4.783670876798671, "grad_norm": 0.0008184673497453332, "learning_rate": 1.1385426774287555e-08, "loss": 0.0, "num_input_tokens_seen": 131951976, "step": 195810 }, { "epoch": 4.7837930276305185, "grad_norm": 1.7329102774965577e-05, "learning_rate": 1.1372598647312325e-08, "loss": 0.0, "num_input_tokens_seen": 131955560, "step": 195815 }, { "epoch": 4.783915178462365, "grad_norm": 0.00024611057597212493, "learning_rate": 1.1359777709901374e-08, "loss": 0.0, "num_input_tokens_seen": 131958888, "step": 195820 }, { "epoch": 4.784037329294213, "grad_norm": 0.0007355216657742858, "learning_rate": 1.134696396214807e-08, "loss": 0.0, "num_input_tokens_seen": 131962088, "step": 195825 }, { "epoch": 4.78415948012606, "grad_norm": 0.001096261665225029, "learning_rate": 1.1334157404145672e-08, "loss": 0.0, "num_input_tokens_seen": 131965352, "step": 195830 }, { "epoch": 4.784281630957906, "grad_norm": 7.370772800641134e-05, "learning_rate": 1.1321358035987106e-08, "loss": 0.0, "num_input_tokens_seen": 131968808, "step": 195835 }, { "epoch": 4.784403781789754, "grad_norm": 0.0006600241176784039, "learning_rate": 1.1308565857765517e-08, "loss": 0.0, "num_input_tokens_seen": 131972520, "step": 195840 }, { "epoch": 4.784525932621602, "grad_norm": 0.0007484956295229495, "learning_rate": 1.1295780869574056e-08, "loss": 0.0, "num_input_tokens_seen": 131975528, "step": 195845 }, { "epoch": 4.784648083453448, "grad_norm": 0.00033427434391342103, "learning_rate": 1.1283003071505426e-08, "loss": 0.0, "num_input_tokens_seen": 131978408, "step": 195850 }, { "epoch": 4.784770234285295, "grad_norm": 0.00029287466895766556, "learning_rate": 1.1270232463652884e-08, "loss": 0.0, "num_input_tokens_seen": 131981736, "step": 195855 }, { "epoch": 4.784892385117143, "grad_norm": 2.4387059966102242e-05, "learning_rate": 1.1257469046109135e-08, "loss": 0.0, "num_input_tokens_seen": 131985192, "step": 195860 }, { "epoch": 4.7850145359489895, "grad_norm": 0.025505151599645615, "learning_rate": 1.1244712818966995e-08, "loss": 0.0, "num_input_tokens_seen": 131988264, "step": 195865 }, { "epoch": 4.785136686780837, "grad_norm": 9.816163219511509e-05, "learning_rate": 1.1231963782319275e-08, "loss": 0.0527, "num_input_tokens_seen": 131992424, "step": 195870 }, { "epoch": 4.785258837612684, "grad_norm": 4.145934508414939e-05, "learning_rate": 1.1219221936258682e-08, "loss": 0.0, "num_input_tokens_seen": 131995560, "step": 195875 }, { "epoch": 4.7853809884445315, "grad_norm": 0.00012785769649781287, "learning_rate": 1.1206487280877807e-08, "loss": 0.0, "num_input_tokens_seen": 131998504, "step": 195880 }, { "epoch": 4.785503139276378, "grad_norm": 0.0005638871225528419, "learning_rate": 1.1193759816269243e-08, "loss": 0.0, "num_input_tokens_seen": 132001704, "step": 195885 }, { "epoch": 4.785625290108226, "grad_norm": 0.000946306565310806, "learning_rate": 1.1181039542525806e-08, "loss": 0.0, "num_input_tokens_seen": 132005416, "step": 195890 }, { "epoch": 4.785747440940073, "grad_norm": 0.000836696068290621, "learning_rate": 1.1168326459739642e-08, "loss": 0.0, "num_input_tokens_seen": 132008808, "step": 195895 }, { "epoch": 4.78586959177192, "grad_norm": 0.00010829546954482794, "learning_rate": 1.1155620568003455e-08, "loss": 0.0, "num_input_tokens_seen": 132011880, "step": 195900 }, { "epoch": 4.785991742603767, "grad_norm": 0.00018229872512165457, "learning_rate": 1.1142921867409505e-08, "loss": 0.0, "num_input_tokens_seen": 132015080, "step": 195905 }, { "epoch": 4.786113893435615, "grad_norm": 0.003096227301284671, "learning_rate": 1.1130230358050164e-08, "loss": 0.0, "num_input_tokens_seen": 132018344, "step": 195910 }, { "epoch": 4.786236044267461, "grad_norm": 6.02893705945462e-05, "learning_rate": 1.11175460400178e-08, "loss": 0.0, "num_input_tokens_seen": 132021352, "step": 195915 }, { "epoch": 4.786358195099309, "grad_norm": 0.00012660080392379314, "learning_rate": 1.1104868913404563e-08, "loss": 0.0, "num_input_tokens_seen": 132025064, "step": 195920 }, { "epoch": 4.786480345931156, "grad_norm": 5.217445868765935e-05, "learning_rate": 1.1092198978302824e-08, "loss": 0.0, "num_input_tokens_seen": 132028520, "step": 195925 }, { "epoch": 4.7866024967630025, "grad_norm": 2.1660089259967208e-05, "learning_rate": 1.107953623480451e-08, "loss": 0.0, "num_input_tokens_seen": 132032232, "step": 195930 }, { "epoch": 4.78672464759485, "grad_norm": 7.575411291327327e-05, "learning_rate": 1.1066880683001878e-08, "loss": 0.0, "num_input_tokens_seen": 132035496, "step": 195935 }, { "epoch": 4.786846798426697, "grad_norm": 0.02140646055340767, "learning_rate": 1.1054232322986857e-08, "loss": 0.0001, "num_input_tokens_seen": 132038760, "step": 195940 }, { "epoch": 4.7869689492585445, "grad_norm": 2.3463653633370996e-05, "learning_rate": 1.1041591154851371e-08, "loss": 0.0, "num_input_tokens_seen": 132041768, "step": 195945 }, { "epoch": 4.787091100090391, "grad_norm": 0.00011079600517405197, "learning_rate": 1.102895717868757e-08, "loss": 0.0, "num_input_tokens_seen": 132045416, "step": 195950 }, { "epoch": 4.787213250922239, "grad_norm": 0.00012676662299782038, "learning_rate": 1.1016330394587048e-08, "loss": 0.0, "num_input_tokens_seen": 132048680, "step": 195955 }, { "epoch": 4.787335401754086, "grad_norm": 0.1253872811794281, "learning_rate": 1.1003710802641842e-08, "loss": 0.0, "num_input_tokens_seen": 132052136, "step": 195960 }, { "epoch": 4.787457552585933, "grad_norm": 0.0003003769088536501, "learning_rate": 1.0991098402943655e-08, "loss": 0.0, "num_input_tokens_seen": 132056040, "step": 195965 }, { "epoch": 4.78757970341778, "grad_norm": 0.0003315807261969894, "learning_rate": 1.0978493195584193e-08, "loss": 0.0, "num_input_tokens_seen": 132060008, "step": 195970 }, { "epoch": 4.787701854249628, "grad_norm": 0.0008391228620894253, "learning_rate": 1.096589518065516e-08, "loss": 0.0, "num_input_tokens_seen": 132063528, "step": 195975 }, { "epoch": 4.787824005081474, "grad_norm": 0.004577086307108402, "learning_rate": 1.095330435824826e-08, "loss": 0.0, "num_input_tokens_seen": 132066856, "step": 195980 }, { "epoch": 4.787946155913322, "grad_norm": 0.0018010676139965653, "learning_rate": 1.0940720728454755e-08, "loss": 0.0, "num_input_tokens_seen": 132070120, "step": 195985 }, { "epoch": 4.788068306745169, "grad_norm": 0.00010236025264021009, "learning_rate": 1.092814429136646e-08, "loss": 0.0, "num_input_tokens_seen": 132073896, "step": 195990 }, { "epoch": 4.7881904575770164, "grad_norm": 0.0017252841498702765, "learning_rate": 1.0915575047074854e-08, "loss": 0.0, "num_input_tokens_seen": 132077096, "step": 195995 }, { "epoch": 4.788312608408863, "grad_norm": 0.00012550843530334532, "learning_rate": 1.090301299567098e-08, "loss": 0.0, "num_input_tokens_seen": 132080680, "step": 196000 }, { "epoch": 4.78843475924071, "grad_norm": 0.0007126462296582758, "learning_rate": 1.0890458137246539e-08, "loss": 0.0, "num_input_tokens_seen": 132083816, "step": 196005 }, { "epoch": 4.788556910072558, "grad_norm": 0.0004484814126044512, "learning_rate": 1.0877910471892793e-08, "loss": 0.0, "num_input_tokens_seen": 132087208, "step": 196010 }, { "epoch": 4.788679060904405, "grad_norm": 0.0004872412246186286, "learning_rate": 1.086536999970078e-08, "loss": 0.0, "num_input_tokens_seen": 132090472, "step": 196015 }, { "epoch": 4.788801211736252, "grad_norm": 1.7085814761230722e-05, "learning_rate": 1.0852836720761982e-08, "loss": 0.0, "num_input_tokens_seen": 132093928, "step": 196020 }, { "epoch": 4.788923362568099, "grad_norm": 0.000753185071516782, "learning_rate": 1.0840310635167216e-08, "loss": 0.0, "num_input_tokens_seen": 132097320, "step": 196025 }, { "epoch": 4.789045513399946, "grad_norm": 0.0021108719520270824, "learning_rate": 1.0827791743007852e-08, "loss": 0.0, "num_input_tokens_seen": 132100264, "step": 196030 }, { "epoch": 4.789167664231793, "grad_norm": 5.6025430239969864e-05, "learning_rate": 1.081528004437493e-08, "loss": 0.0, "num_input_tokens_seen": 132103848, "step": 196035 }, { "epoch": 4.789289815063641, "grad_norm": 0.0003986261726822704, "learning_rate": 1.0802775539359266e-08, "loss": 0.0, "num_input_tokens_seen": 132107176, "step": 196040 }, { "epoch": 4.7894119658954875, "grad_norm": 4.800490569323301e-05, "learning_rate": 1.0790278228051897e-08, "loss": 0.0, "num_input_tokens_seen": 132110376, "step": 196045 }, { "epoch": 4.789534116727335, "grad_norm": 9.925442282110453e-05, "learning_rate": 1.0777788110543751e-08, "loss": 0.0, "num_input_tokens_seen": 132113704, "step": 196050 }, { "epoch": 4.789656267559182, "grad_norm": 2.5734510927577503e-05, "learning_rate": 1.0765305186925532e-08, "loss": 0.0, "num_input_tokens_seen": 132117288, "step": 196055 }, { "epoch": 4.7897784183910295, "grad_norm": 0.0013512138975784183, "learning_rate": 1.075282945728806e-08, "loss": 0.0, "num_input_tokens_seen": 132120488, "step": 196060 }, { "epoch": 4.789900569222876, "grad_norm": 0.00018341124814469367, "learning_rate": 1.0740360921722146e-08, "loss": 0.0, "num_input_tokens_seen": 132123816, "step": 196065 }, { "epoch": 4.790022720054724, "grad_norm": 0.0020230484660714865, "learning_rate": 1.0727899580318388e-08, "loss": 0.0, "num_input_tokens_seen": 132126952, "step": 196070 }, { "epoch": 4.790144870886571, "grad_norm": 1.877875729405787e-05, "learning_rate": 1.071544543316738e-08, "loss": 0.0, "num_input_tokens_seen": 132130536, "step": 196075 }, { "epoch": 4.790267021718418, "grad_norm": 0.00011027592699974775, "learning_rate": 1.0702998480359827e-08, "loss": 0.0, "num_input_tokens_seen": 132133800, "step": 196080 }, { "epoch": 4.790389172550265, "grad_norm": 0.0023058492224663496, "learning_rate": 1.0690558721986209e-08, "loss": 0.0, "num_input_tokens_seen": 132136872, "step": 196085 }, { "epoch": 4.790511323382113, "grad_norm": 0.0016999959480017424, "learning_rate": 1.0678126158136791e-08, "loss": 0.0001, "num_input_tokens_seen": 132140392, "step": 196090 }, { "epoch": 4.790633474213959, "grad_norm": 0.00752806244418025, "learning_rate": 1.0665700788902277e-08, "loss": 0.0, "num_input_tokens_seen": 132143336, "step": 196095 }, { "epoch": 4.790755625045806, "grad_norm": 1.3469006262312178e-05, "learning_rate": 1.0653282614372705e-08, "loss": 0.0, "num_input_tokens_seen": 132146664, "step": 196100 }, { "epoch": 4.790877775877654, "grad_norm": 0.0008614024263806641, "learning_rate": 1.064087163463867e-08, "loss": 0.0, "num_input_tokens_seen": 132149928, "step": 196105 }, { "epoch": 4.790999926709501, "grad_norm": 0.01275652926415205, "learning_rate": 1.0628467849790323e-08, "loss": 0.0, "num_input_tokens_seen": 132153320, "step": 196110 }, { "epoch": 4.791122077541348, "grad_norm": 0.0005827039130963385, "learning_rate": 1.0616071259917925e-08, "loss": 0.0, "num_input_tokens_seen": 132156328, "step": 196115 }, { "epoch": 4.791244228373195, "grad_norm": 0.00018269375141244382, "learning_rate": 1.0603681865111402e-08, "loss": 0.0, "num_input_tokens_seen": 132159272, "step": 196120 }, { "epoch": 4.7913663792050425, "grad_norm": 9.967612277250737e-05, "learning_rate": 1.0591299665461128e-08, "loss": 0.0, "num_input_tokens_seen": 132162600, "step": 196125 }, { "epoch": 4.791488530036889, "grad_norm": 0.0005747093237005174, "learning_rate": 1.057892466105703e-08, "loss": 0.0001, "num_input_tokens_seen": 132165864, "step": 196130 }, { "epoch": 4.791610680868737, "grad_norm": 0.000769525533542037, "learning_rate": 1.056655685198915e-08, "loss": 0.0, "num_input_tokens_seen": 132169512, "step": 196135 }, { "epoch": 4.791732831700584, "grad_norm": 0.00016920336929615587, "learning_rate": 1.0554196238347302e-08, "loss": 0.0, "num_input_tokens_seen": 132172520, "step": 196140 }, { "epoch": 4.791854982532431, "grad_norm": 0.00163973867893219, "learning_rate": 1.0541842820221524e-08, "loss": 0.0, "num_input_tokens_seen": 132175656, "step": 196145 }, { "epoch": 4.791977133364278, "grad_norm": 0.00014396561891771853, "learning_rate": 1.0529496597701636e-08, "loss": 0.0, "num_input_tokens_seen": 132180008, "step": 196150 }, { "epoch": 4.792099284196126, "grad_norm": 0.0024907325860112906, "learning_rate": 1.0517157570877344e-08, "loss": 0.0, "num_input_tokens_seen": 132183400, "step": 196155 }, { "epoch": 4.792221435027972, "grad_norm": 0.006735434755682945, "learning_rate": 1.0504825739838353e-08, "loss": 0.0, "num_input_tokens_seen": 132186856, "step": 196160 }, { "epoch": 4.79234358585982, "grad_norm": 1.2725016858894378e-05, "learning_rate": 1.049250110467459e-08, "loss": 0.0, "num_input_tokens_seen": 132190248, "step": 196165 }, { "epoch": 4.792465736691667, "grad_norm": 1.25408132589655e-05, "learning_rate": 1.0480183665475317e-08, "loss": 0.0, "num_input_tokens_seen": 132193640, "step": 196170 }, { "epoch": 4.792587887523514, "grad_norm": 0.00020013254834339023, "learning_rate": 1.0467873422330464e-08, "loss": 0.0, "num_input_tokens_seen": 132196712, "step": 196175 }, { "epoch": 4.792710038355361, "grad_norm": 0.000348270230460912, "learning_rate": 1.0455570375329181e-08, "loss": 0.0, "num_input_tokens_seen": 132200232, "step": 196180 }, { "epoch": 4.792832189187209, "grad_norm": 0.0007520915823988616, "learning_rate": 1.0443274524561396e-08, "loss": 0.0, "num_input_tokens_seen": 132203432, "step": 196185 }, { "epoch": 4.7929543400190555, "grad_norm": 9.265771950595081e-05, "learning_rate": 1.043098587011615e-08, "loss": 0.0, "num_input_tokens_seen": 132206824, "step": 196190 }, { "epoch": 4.793076490850902, "grad_norm": 5.9360292652854696e-05, "learning_rate": 1.0418704412082924e-08, "loss": 0.0001, "num_input_tokens_seen": 132210088, "step": 196195 }, { "epoch": 4.79319864168275, "grad_norm": 0.0014555418165400624, "learning_rate": 1.0406430150551094e-08, "loss": 0.0003, "num_input_tokens_seen": 132213480, "step": 196200 }, { "epoch": 4.793320792514597, "grad_norm": 0.0003992864803876728, "learning_rate": 1.0394163085609808e-08, "loss": 0.0, "num_input_tokens_seen": 132216616, "step": 196205 }, { "epoch": 4.793442943346444, "grad_norm": 4.887767499894835e-05, "learning_rate": 1.038190321734833e-08, "loss": 0.0, "num_input_tokens_seen": 132220008, "step": 196210 }, { "epoch": 4.793565094178291, "grad_norm": 0.0018320534145459533, "learning_rate": 1.0369650545855813e-08, "loss": 0.0003, "num_input_tokens_seen": 132222952, "step": 196215 }, { "epoch": 4.793687245010139, "grad_norm": 0.003153629833832383, "learning_rate": 1.0357405071221404e-08, "loss": 0.0, "num_input_tokens_seen": 132226216, "step": 196220 }, { "epoch": 4.793809395841985, "grad_norm": 0.00641963304951787, "learning_rate": 1.0345166793534255e-08, "loss": 0.0, "num_input_tokens_seen": 132229224, "step": 196225 }, { "epoch": 4.793931546673833, "grad_norm": 0.009071718901395798, "learning_rate": 1.0332935712883073e-08, "loss": 0.0, "num_input_tokens_seen": 132232488, "step": 196230 }, { "epoch": 4.79405369750568, "grad_norm": 8.992061339085922e-05, "learning_rate": 1.032071182935701e-08, "loss": 0.0, "num_input_tokens_seen": 132235304, "step": 196235 }, { "epoch": 4.794175848337527, "grad_norm": 0.00019319630519021302, "learning_rate": 1.0308495143044993e-08, "loss": 0.0, "num_input_tokens_seen": 132238696, "step": 196240 }, { "epoch": 4.794297999169374, "grad_norm": 5.871194298379123e-05, "learning_rate": 1.029628565403573e-08, "loss": 0.0001, "num_input_tokens_seen": 132242024, "step": 196245 }, { "epoch": 4.794420150001222, "grad_norm": 0.0004964267718605697, "learning_rate": 1.028408336241804e-08, "loss": 0.0, "num_input_tokens_seen": 132245480, "step": 196250 }, { "epoch": 4.7945423008330685, "grad_norm": 0.0026195256505161524, "learning_rate": 1.0271888268280737e-08, "loss": 0.0, "num_input_tokens_seen": 132248936, "step": 196255 }, { "epoch": 4.794664451664916, "grad_norm": 0.0009615510352887213, "learning_rate": 1.0259700371712532e-08, "loss": 0.0, "num_input_tokens_seen": 132252584, "step": 196260 }, { "epoch": 4.794786602496763, "grad_norm": 0.00019157327187713236, "learning_rate": 1.0247519672801907e-08, "loss": 0.0, "num_input_tokens_seen": 132255784, "step": 196265 }, { "epoch": 4.7949087533286106, "grad_norm": 0.001955286832526326, "learning_rate": 1.0235346171637571e-08, "loss": 0.0, "num_input_tokens_seen": 132259368, "step": 196270 }, { "epoch": 4.795030904160457, "grad_norm": 0.0010139269288629293, "learning_rate": 1.0223179868308007e-08, "loss": 0.0, "num_input_tokens_seen": 132262824, "step": 196275 }, { "epoch": 4.795153054992305, "grad_norm": 1.6927018805290572e-05, "learning_rate": 1.02110207629017e-08, "loss": 0.0, "num_input_tokens_seen": 132266024, "step": 196280 }, { "epoch": 4.795275205824152, "grad_norm": 3.738186933333054e-05, "learning_rate": 1.0198868855507026e-08, "loss": 0.0, "num_input_tokens_seen": 132269672, "step": 196285 }, { "epoch": 4.795397356655998, "grad_norm": 0.0002817772619891912, "learning_rate": 1.0186724146212467e-08, "loss": 0.0, "num_input_tokens_seen": 132272872, "step": 196290 }, { "epoch": 4.795519507487846, "grad_norm": 0.001034017652273178, "learning_rate": 1.0174586635106285e-08, "loss": 0.0, "num_input_tokens_seen": 132275944, "step": 196295 }, { "epoch": 4.795641658319693, "grad_norm": 0.00042883388232439756, "learning_rate": 1.0162456322276747e-08, "loss": 0.0, "num_input_tokens_seen": 132279400, "step": 196300 }, { "epoch": 4.79576380915154, "grad_norm": 0.00022619598894380033, "learning_rate": 1.0150333207812001e-08, "loss": 0.0, "num_input_tokens_seen": 132283240, "step": 196305 }, { "epoch": 4.795885959983387, "grad_norm": 1.1148900739499368e-05, "learning_rate": 1.013821729180031e-08, "loss": 0.0, "num_input_tokens_seen": 132286248, "step": 196310 }, { "epoch": 4.796008110815235, "grad_norm": 9.672602573118638e-06, "learning_rate": 1.0126108574329718e-08, "loss": 0.0, "num_input_tokens_seen": 132289768, "step": 196315 }, { "epoch": 4.796130261647082, "grad_norm": 0.0005740622291341424, "learning_rate": 1.0114007055488261e-08, "loss": 0.0453, "num_input_tokens_seen": 132292840, "step": 196320 }, { "epoch": 4.796252412478929, "grad_norm": 3.958161687478423e-05, "learning_rate": 1.0101912735364092e-08, "loss": 0.0275, "num_input_tokens_seen": 132296168, "step": 196325 }, { "epoch": 4.796374563310776, "grad_norm": 0.0006076816935092211, "learning_rate": 1.0089825614045032e-08, "loss": 0.0, "num_input_tokens_seen": 132299688, "step": 196330 }, { "epoch": 4.796496714142624, "grad_norm": 0.00510576693341136, "learning_rate": 1.00777456916189e-08, "loss": 0.0, "num_input_tokens_seen": 132302952, "step": 196335 }, { "epoch": 4.79661886497447, "grad_norm": 0.0004893884179182351, "learning_rate": 1.0065672968173734e-08, "loss": 0.0, "num_input_tokens_seen": 132305960, "step": 196340 }, { "epoch": 4.796741015806318, "grad_norm": 0.0009445503819733858, "learning_rate": 1.0053607443797351e-08, "loss": 0.0, "num_input_tokens_seen": 132309096, "step": 196345 }, { "epoch": 4.796863166638165, "grad_norm": 0.0003557786112651229, "learning_rate": 1.0041549118577353e-08, "loss": 0.0, "num_input_tokens_seen": 132312552, "step": 196350 }, { "epoch": 4.796985317470012, "grad_norm": 8.319457265315577e-05, "learning_rate": 1.0029497992601443e-08, "loss": 0.0003, "num_input_tokens_seen": 132315624, "step": 196355 }, { "epoch": 4.797107468301859, "grad_norm": 0.0003622810763772577, "learning_rate": 1.001745406595722e-08, "loss": 0.0, "num_input_tokens_seen": 132319144, "step": 196360 }, { "epoch": 4.797229619133706, "grad_norm": 0.0001322765019722283, "learning_rate": 1.0005417338732502e-08, "loss": 0.0, "num_input_tokens_seen": 132322472, "step": 196365 }, { "epoch": 4.7973517699655535, "grad_norm": 0.00015713920583948493, "learning_rate": 9.993387811014553e-09, "loss": 0.0, "num_input_tokens_seen": 132325672, "step": 196370 }, { "epoch": 4.797473920797401, "grad_norm": 0.001111492863856256, "learning_rate": 9.98136548289097e-09, "loss": 0.0, "num_input_tokens_seen": 132328744, "step": 196375 }, { "epoch": 4.797596071629248, "grad_norm": 8.227213402278721e-05, "learning_rate": 9.969350354449236e-09, "loss": 0.0, "num_input_tokens_seen": 132331880, "step": 196380 }, { "epoch": 4.797718222461095, "grad_norm": 0.0006079964805394411, "learning_rate": 9.957342425776617e-09, "loss": 0.0, "num_input_tokens_seen": 132335208, "step": 196385 }, { "epoch": 4.797840373292942, "grad_norm": 5.064927245257422e-05, "learning_rate": 9.945341696960596e-09, "loss": 0.0, "num_input_tokens_seen": 132338536, "step": 196390 }, { "epoch": 4.797962524124789, "grad_norm": 3.677351196529344e-05, "learning_rate": 9.933348168088329e-09, "loss": 0.0, "num_input_tokens_seen": 132341928, "step": 196395 }, { "epoch": 4.798084674956637, "grad_norm": 0.001421840162947774, "learning_rate": 9.921361839246967e-09, "loss": 0.0, "num_input_tokens_seen": 132345256, "step": 196400 }, { "epoch": 4.798206825788483, "grad_norm": 0.0006268465076573193, "learning_rate": 9.909382710523773e-09, "loss": 0.0, "num_input_tokens_seen": 132348456, "step": 196405 }, { "epoch": 4.798328976620331, "grad_norm": 0.00036963573074899614, "learning_rate": 9.897410782005789e-09, "loss": 0.0, "num_input_tokens_seen": 132351464, "step": 196410 }, { "epoch": 4.798451127452178, "grad_norm": 0.00029923420515842736, "learning_rate": 9.885446053780278e-09, "loss": 0.0, "num_input_tokens_seen": 132354728, "step": 196415 }, { "epoch": 4.798573278284025, "grad_norm": 5.575628892984241e-05, "learning_rate": 9.87348852593406e-09, "loss": 0.0, "num_input_tokens_seen": 132358376, "step": 196420 }, { "epoch": 4.798695429115872, "grad_norm": 3.5729637602344155e-05, "learning_rate": 9.861538198554175e-09, "loss": 0.0, "num_input_tokens_seen": 132361448, "step": 196425 }, { "epoch": 4.79881757994772, "grad_norm": 0.0001392970298184082, "learning_rate": 9.849595071727445e-09, "loss": 0.0, "num_input_tokens_seen": 132364712, "step": 196430 }, { "epoch": 4.7989397307795665, "grad_norm": 0.004172740038484335, "learning_rate": 9.837659145540689e-09, "loss": 0.0, "num_input_tokens_seen": 132367656, "step": 196435 }, { "epoch": 4.799061881611414, "grad_norm": 0.003073500469326973, "learning_rate": 9.825730420080946e-09, "loss": 0.0, "num_input_tokens_seen": 132370920, "step": 196440 }, { "epoch": 4.799184032443261, "grad_norm": 0.0002630621602293104, "learning_rate": 9.813808895434706e-09, "loss": 0.0, "num_input_tokens_seen": 132374440, "step": 196445 }, { "epoch": 4.7993061832751085, "grad_norm": 3.272489266237244e-05, "learning_rate": 9.801894571688895e-09, "loss": 0.0001, "num_input_tokens_seen": 132377832, "step": 196450 }, { "epoch": 4.799428334106955, "grad_norm": 0.0007254548254422843, "learning_rate": 9.789987448930004e-09, "loss": 0.0, "num_input_tokens_seen": 132380904, "step": 196455 }, { "epoch": 4.799550484938802, "grad_norm": 0.00751190772280097, "learning_rate": 9.778087527244628e-09, "loss": 0.0, "num_input_tokens_seen": 132384104, "step": 196460 }, { "epoch": 4.79967263577065, "grad_norm": 0.0009871211368590593, "learning_rate": 9.766194806719364e-09, "loss": 0.0, "num_input_tokens_seen": 132387624, "step": 196465 }, { "epoch": 4.799794786602497, "grad_norm": 0.0007149404264055192, "learning_rate": 9.754309287440588e-09, "loss": 0.0, "num_input_tokens_seen": 132390568, "step": 196470 }, { "epoch": 4.799916937434344, "grad_norm": 0.0004417687305249274, "learning_rate": 9.742430969494896e-09, "loss": 0.0, "num_input_tokens_seen": 132393704, "step": 196475 }, { "epoch": 4.800039088266191, "grad_norm": 0.0023151689674705267, "learning_rate": 9.730559852968557e-09, "loss": 0.0, "num_input_tokens_seen": 132396968, "step": 196480 }, { "epoch": 4.800161239098038, "grad_norm": 0.03932206705212593, "learning_rate": 9.718695937948052e-09, "loss": 0.0, "num_input_tokens_seen": 132400488, "step": 196485 }, { "epoch": 4.800283389929885, "grad_norm": 0.000937451608479023, "learning_rate": 9.706839224519426e-09, "loss": 0.0, "num_input_tokens_seen": 132403368, "step": 196490 }, { "epoch": 4.800405540761733, "grad_norm": 0.00011623586033238098, "learning_rate": 9.694989712769053e-09, "loss": 0.0, "num_input_tokens_seen": 132406632, "step": 196495 }, { "epoch": 4.8005276915935795, "grad_norm": 0.0004854142025578767, "learning_rate": 9.683147402783088e-09, "loss": 0.0, "num_input_tokens_seen": 132409640, "step": 196500 }, { "epoch": 4.800649842425427, "grad_norm": 0.0037219214718788862, "learning_rate": 9.671312294647683e-09, "loss": 0.0, "num_input_tokens_seen": 132412840, "step": 196505 }, { "epoch": 4.800771993257274, "grad_norm": 0.00020332216809038073, "learning_rate": 9.659484388448768e-09, "loss": 0.0001, "num_input_tokens_seen": 132415976, "step": 196510 }, { "epoch": 4.8008941440891215, "grad_norm": 0.00035678790300153196, "learning_rate": 9.6476636842725e-09, "loss": 0.0, "num_input_tokens_seen": 132419688, "step": 196515 }, { "epoch": 4.801016294920968, "grad_norm": 5.017863441025838e-05, "learning_rate": 9.635850182204809e-09, "loss": 0.0, "num_input_tokens_seen": 132423272, "step": 196520 }, { "epoch": 4.801138445752816, "grad_norm": 0.0006258144276216626, "learning_rate": 9.624043882331511e-09, "loss": 0.0, "num_input_tokens_seen": 132426920, "step": 196525 }, { "epoch": 4.801260596584663, "grad_norm": 0.0013677050592377782, "learning_rate": 9.612244784738543e-09, "loss": 0.0, "num_input_tokens_seen": 132430056, "step": 196530 }, { "epoch": 4.80138274741651, "grad_norm": 0.00019165143021382391, "learning_rate": 9.600452889511835e-09, "loss": 0.0, "num_input_tokens_seen": 132433128, "step": 196535 }, { "epoch": 4.801504898248357, "grad_norm": 0.004350326023995876, "learning_rate": 9.588668196736871e-09, "loss": 0.0, "num_input_tokens_seen": 132436264, "step": 196540 }, { "epoch": 4.801627049080205, "grad_norm": 0.0016057752072811127, "learning_rate": 9.576890706499696e-09, "loss": 0.0, "num_input_tokens_seen": 132439720, "step": 196545 }, { "epoch": 4.801749199912051, "grad_norm": 5.42028974450659e-05, "learning_rate": 9.565120418885574e-09, "loss": 0.0, "num_input_tokens_seen": 132443176, "step": 196550 }, { "epoch": 4.801871350743898, "grad_norm": 0.0002965881139971316, "learning_rate": 9.553357333980438e-09, "loss": 0.0, "num_input_tokens_seen": 132446888, "step": 196555 }, { "epoch": 4.801993501575746, "grad_norm": 11.83859920501709, "learning_rate": 9.541601451869552e-09, "loss": 0.0354, "num_input_tokens_seen": 132450024, "step": 196560 }, { "epoch": 4.8021156524075925, "grad_norm": 0.05751524493098259, "learning_rate": 9.529852772638625e-09, "loss": 0.0, "num_input_tokens_seen": 132453800, "step": 196565 }, { "epoch": 4.80223780323944, "grad_norm": 3.001950062753167e-06, "learning_rate": 9.518111296372921e-09, "loss": 0.0, "num_input_tokens_seen": 132456936, "step": 196570 }, { "epoch": 4.802359954071287, "grad_norm": 0.00021984051272738725, "learning_rate": 9.506377023158042e-09, "loss": 0.0, "num_input_tokens_seen": 132459944, "step": 196575 }, { "epoch": 4.8024821049031345, "grad_norm": 4.9385958845959976e-05, "learning_rate": 9.494649953079137e-09, "loss": 0.0, "num_input_tokens_seen": 132463784, "step": 196580 }, { "epoch": 4.802604255734981, "grad_norm": 0.0006307087605819106, "learning_rate": 9.482930086221585e-09, "loss": 0.0, "num_input_tokens_seen": 132466792, "step": 196585 }, { "epoch": 4.802726406566829, "grad_norm": 0.00309332855977118, "learning_rate": 9.471217422670541e-09, "loss": 0.0, "num_input_tokens_seen": 132470696, "step": 196590 }, { "epoch": 4.802848557398676, "grad_norm": 0.00015673524467274547, "learning_rate": 9.459511962511268e-09, "loss": 0.0, "num_input_tokens_seen": 132474408, "step": 196595 }, { "epoch": 4.802970708230523, "grad_norm": 0.000370059278793633, "learning_rate": 9.44781370582881e-09, "loss": 0.0, "num_input_tokens_seen": 132477864, "step": 196600 }, { "epoch": 4.80309285906237, "grad_norm": 0.009521500207483768, "learning_rate": 9.436122652708212e-09, "loss": 0.0, "num_input_tokens_seen": 132480872, "step": 196605 }, { "epoch": 4.803215009894218, "grad_norm": 0.00011173263192176819, "learning_rate": 9.424438803234736e-09, "loss": 0.0, "num_input_tokens_seen": 132484264, "step": 196610 }, { "epoch": 4.803337160726064, "grad_norm": 3.594495137804188e-05, "learning_rate": 9.412762157493092e-09, "loss": 0.0, "num_input_tokens_seen": 132487848, "step": 196615 }, { "epoch": 4.803459311557912, "grad_norm": 0.000330898241372779, "learning_rate": 9.401092715568215e-09, "loss": 0.0, "num_input_tokens_seen": 132491304, "step": 196620 }, { "epoch": 4.803581462389759, "grad_norm": 0.000155567133333534, "learning_rate": 9.389430477545035e-09, "loss": 0.0, "num_input_tokens_seen": 132494824, "step": 196625 }, { "epoch": 4.8037036132216056, "grad_norm": 0.00020720763131976128, "learning_rate": 9.377775443508485e-09, "loss": 0.0, "num_input_tokens_seen": 132498280, "step": 196630 }, { "epoch": 4.803825764053453, "grad_norm": 1.3492269317794126e-05, "learning_rate": 9.366127613543051e-09, "loss": 0.0, "num_input_tokens_seen": 132501672, "step": 196635 }, { "epoch": 4.803947914885301, "grad_norm": 1.2948934454470873e-05, "learning_rate": 9.354486987733668e-09, "loss": 0.0, "num_input_tokens_seen": 132505000, "step": 196640 }, { "epoch": 4.804070065717148, "grad_norm": 0.0005500533152371645, "learning_rate": 9.342853566164932e-09, "loss": 0.0, "num_input_tokens_seen": 132508328, "step": 196645 }, { "epoch": 4.804192216548994, "grad_norm": 7.395833381451666e-05, "learning_rate": 9.331227348921333e-09, "loss": 0.0, "num_input_tokens_seen": 132511720, "step": 196650 }, { "epoch": 4.804314367380842, "grad_norm": 6.505249621113762e-05, "learning_rate": 9.319608336087582e-09, "loss": 0.0, "num_input_tokens_seen": 132515176, "step": 196655 }, { "epoch": 4.804436518212689, "grad_norm": 0.00017917572404257953, "learning_rate": 9.307996527747941e-09, "loss": 0.0, "num_input_tokens_seen": 132518760, "step": 196660 }, { "epoch": 4.804558669044536, "grad_norm": 0.0001549017906654626, "learning_rate": 9.296391923987235e-09, "loss": 0.0, "num_input_tokens_seen": 132521832, "step": 196665 }, { "epoch": 4.804680819876383, "grad_norm": 0.004271478857845068, "learning_rate": 9.284794524889505e-09, "loss": 0.0, "num_input_tokens_seen": 132524968, "step": 196670 }, { "epoch": 4.804802970708231, "grad_norm": 0.00041260154102928936, "learning_rate": 9.273204330539242e-09, "loss": 0.0, "num_input_tokens_seen": 132528616, "step": 196675 }, { "epoch": 4.8049251215400774, "grad_norm": 0.00033596798311918974, "learning_rate": 9.26162134102071e-09, "loss": 0.0, "num_input_tokens_seen": 132531880, "step": 196680 }, { "epoch": 4.805047272371925, "grad_norm": 7.80812042648904e-05, "learning_rate": 9.250045556418173e-09, "loss": 0.0, "num_input_tokens_seen": 132534888, "step": 196685 }, { "epoch": 4.805169423203772, "grad_norm": 3.21592997352127e-05, "learning_rate": 9.23847697681579e-09, "loss": 0.0, "num_input_tokens_seen": 132537960, "step": 196690 }, { "epoch": 4.8052915740356195, "grad_norm": 0.00014071361511014402, "learning_rate": 9.226915602297602e-09, "loss": 0.0, "num_input_tokens_seen": 132541096, "step": 196695 }, { "epoch": 4.805413724867466, "grad_norm": 0.0008824639371596277, "learning_rate": 9.215361432947877e-09, "loss": 0.0, "num_input_tokens_seen": 132544616, "step": 196700 }, { "epoch": 4.805535875699314, "grad_norm": 3.565695442375727e-05, "learning_rate": 9.203814468850547e-09, "loss": 0.0, "num_input_tokens_seen": 132548008, "step": 196705 }, { "epoch": 4.805658026531161, "grad_norm": 0.00031765305902808905, "learning_rate": 9.192274710089432e-09, "loss": 0.0, "num_input_tokens_seen": 132551784, "step": 196710 }, { "epoch": 4.805780177363008, "grad_norm": 4.537498170975596e-05, "learning_rate": 9.180742156748688e-09, "loss": 0.0, "num_input_tokens_seen": 132555112, "step": 196715 }, { "epoch": 4.805902328194855, "grad_norm": 3.123639908153564e-05, "learning_rate": 9.169216808912028e-09, "loss": 0.0536, "num_input_tokens_seen": 132558376, "step": 196720 }, { "epoch": 4.806024479026702, "grad_norm": 2.5088884285651147e-05, "learning_rate": 9.157698666663382e-09, "loss": 0.0, "num_input_tokens_seen": 132561896, "step": 196725 }, { "epoch": 4.806146629858549, "grad_norm": 0.00033539420110173523, "learning_rate": 9.146187730086463e-09, "loss": 0.0, "num_input_tokens_seen": 132564968, "step": 196730 }, { "epoch": 4.806268780690397, "grad_norm": 0.0005353165324777365, "learning_rate": 9.134683999264981e-09, "loss": 0.0, "num_input_tokens_seen": 132568168, "step": 196735 }, { "epoch": 4.806390931522244, "grad_norm": 0.0009416808607056737, "learning_rate": 9.123187474282535e-09, "loss": 0.0, "num_input_tokens_seen": 132571240, "step": 196740 }, { "epoch": 4.8065130823540905, "grad_norm": 0.00034308669273741543, "learning_rate": 9.111698155222724e-09, "loss": 0.0, "num_input_tokens_seen": 132575016, "step": 196745 }, { "epoch": 4.806635233185938, "grad_norm": 0.00025203998666256666, "learning_rate": 9.100216042169262e-09, "loss": 0.0, "num_input_tokens_seen": 132577832, "step": 196750 }, { "epoch": 4.806757384017785, "grad_norm": 0.0003336328372824937, "learning_rate": 9.088741135205525e-09, "loss": 0.0, "num_input_tokens_seen": 132581224, "step": 196755 }, { "epoch": 4.8068795348496325, "grad_norm": 4.427826570463367e-05, "learning_rate": 9.077273434415e-09, "loss": 0.0001, "num_input_tokens_seen": 132584424, "step": 196760 }, { "epoch": 4.807001685681479, "grad_norm": 0.0014946991577744484, "learning_rate": 9.065812939881067e-09, "loss": 0.0, "num_input_tokens_seen": 132587560, "step": 196765 }, { "epoch": 4.807123836513327, "grad_norm": 0.0006305626593530178, "learning_rate": 9.054359651687105e-09, "loss": 0.0, "num_input_tokens_seen": 132590568, "step": 196770 }, { "epoch": 4.807245987345174, "grad_norm": 0.001171535113826394, "learning_rate": 9.042913569916266e-09, "loss": 0.0, "num_input_tokens_seen": 132594024, "step": 196775 }, { "epoch": 4.807368138177021, "grad_norm": 0.00044857600005343556, "learning_rate": 9.03147469465193e-09, "loss": 0.0, "num_input_tokens_seen": 132597608, "step": 196780 }, { "epoch": 4.807490289008868, "grad_norm": 0.001058773836120963, "learning_rate": 9.020043025977253e-09, "loss": 0.0, "num_input_tokens_seen": 132601000, "step": 196785 }, { "epoch": 4.807612439840716, "grad_norm": 0.0021331259049475193, "learning_rate": 9.00861856397539e-09, "loss": 0.0, "num_input_tokens_seen": 132604584, "step": 196790 }, { "epoch": 4.807734590672562, "grad_norm": 0.0007856449228711426, "learning_rate": 8.997201308729385e-09, "loss": 0.0, "num_input_tokens_seen": 132607656, "step": 196795 }, { "epoch": 4.80785674150441, "grad_norm": 2.964310442621354e-05, "learning_rate": 8.985791260322283e-09, "loss": 0.0, "num_input_tokens_seen": 132611304, "step": 196800 }, { "epoch": 4.807978892336257, "grad_norm": 0.001216082600876689, "learning_rate": 8.97438841883713e-09, "loss": 0.0, "num_input_tokens_seen": 132614632, "step": 196805 }, { "epoch": 4.808101043168104, "grad_norm": 5.0105814933776855, "learning_rate": 8.962992784356749e-09, "loss": 0.001, "num_input_tokens_seen": 132618152, "step": 196810 }, { "epoch": 4.808223193999951, "grad_norm": 0.0014316142769530416, "learning_rate": 8.95160435696396e-09, "loss": 0.0, "num_input_tokens_seen": 132621544, "step": 196815 }, { "epoch": 4.808345344831798, "grad_norm": 4.400141187943518e-05, "learning_rate": 8.940223136741698e-09, "loss": 0.0, "num_input_tokens_seen": 132625128, "step": 196820 }, { "epoch": 4.8084674956636455, "grad_norm": 8.361654181499034e-05, "learning_rate": 8.928849123772674e-09, "loss": 0.0, "num_input_tokens_seen": 132628200, "step": 196825 }, { "epoch": 4.808589646495492, "grad_norm": 0.0016611508326604962, "learning_rate": 8.917482318139713e-09, "loss": 0.0, "num_input_tokens_seen": 132631848, "step": 196830 }, { "epoch": 4.80871179732734, "grad_norm": 0.0007444024668075144, "learning_rate": 8.906122719925302e-09, "loss": 0.0, "num_input_tokens_seen": 132634984, "step": 196835 }, { "epoch": 4.808833948159187, "grad_norm": 0.000621675921138376, "learning_rate": 8.894770329212154e-09, "loss": 0.0, "num_input_tokens_seen": 132638440, "step": 196840 }, { "epoch": 4.808956098991034, "grad_norm": 0.005653452128171921, "learning_rate": 8.883425146082868e-09, "loss": 0.0, "num_input_tokens_seen": 132641704, "step": 196845 }, { "epoch": 4.809078249822881, "grad_norm": 0.00017273153935093433, "learning_rate": 8.872087170619825e-09, "loss": 0.0, "num_input_tokens_seen": 132645288, "step": 196850 }, { "epoch": 4.809200400654729, "grad_norm": 0.14124557375907898, "learning_rate": 8.860756402905623e-09, "loss": 0.0001, "num_input_tokens_seen": 132648552, "step": 196855 }, { "epoch": 4.809322551486575, "grad_norm": 0.00012055077240802348, "learning_rate": 8.84943284302253e-09, "loss": 0.0, "num_input_tokens_seen": 132651368, "step": 196860 }, { "epoch": 4.809444702318423, "grad_norm": 0.0024922320153564215, "learning_rate": 8.838116491052927e-09, "loss": 0.0, "num_input_tokens_seen": 132654632, "step": 196865 }, { "epoch": 4.80956685315027, "grad_norm": 0.0005170554504729807, "learning_rate": 8.82680734707919e-09, "loss": 0.0, "num_input_tokens_seen": 132657960, "step": 196870 }, { "epoch": 4.809689003982117, "grad_norm": 0.013109724968671799, "learning_rate": 8.815505411183367e-09, "loss": 0.0, "num_input_tokens_seen": 132661288, "step": 196875 }, { "epoch": 4.809811154813964, "grad_norm": 0.0007171641918830574, "learning_rate": 8.804210683447944e-09, "loss": 0.0, "num_input_tokens_seen": 132664552, "step": 196880 }, { "epoch": 4.809933305645812, "grad_norm": 2.0580151613103226e-05, "learning_rate": 8.792923163954857e-09, "loss": 0.0, "num_input_tokens_seen": 132667624, "step": 196885 }, { "epoch": 4.8100554564776585, "grad_norm": 0.0007847507367841899, "learning_rate": 8.781642852786264e-09, "loss": 0.0, "num_input_tokens_seen": 132670888, "step": 196890 }, { "epoch": 4.810177607309506, "grad_norm": 0.00033158838050439954, "learning_rate": 8.770369750024099e-09, "loss": 0.0, "num_input_tokens_seen": 132674152, "step": 196895 }, { "epoch": 4.810299758141353, "grad_norm": 0.00032024146639741957, "learning_rate": 8.759103855750404e-09, "loss": 0.0, "num_input_tokens_seen": 132677160, "step": 196900 }, { "epoch": 4.8104219089732005, "grad_norm": 0.00017308765382040292, "learning_rate": 8.747845170047119e-09, "loss": 0.0, "num_input_tokens_seen": 132680360, "step": 196905 }, { "epoch": 4.810544059805047, "grad_norm": 8.452088877675124e-06, "learning_rate": 8.736593692996174e-09, "loss": 0.0, "num_input_tokens_seen": 132683944, "step": 196910 }, { "epoch": 4.810666210636894, "grad_norm": 0.00010812583786901087, "learning_rate": 8.725349424679396e-09, "loss": 0.0, "num_input_tokens_seen": 132687080, "step": 196915 }, { "epoch": 4.810788361468742, "grad_norm": 0.0023433612659573555, "learning_rate": 8.714112365178383e-09, "loss": 0.0, "num_input_tokens_seen": 132690344, "step": 196920 }, { "epoch": 4.810910512300588, "grad_norm": 0.0006696759373880923, "learning_rate": 8.702882514575072e-09, "loss": 0.0, "num_input_tokens_seen": 132693288, "step": 196925 }, { "epoch": 4.811032663132436, "grad_norm": 2.063168358290568e-05, "learning_rate": 8.691659872950951e-09, "loss": 0.0, "num_input_tokens_seen": 132696424, "step": 196930 }, { "epoch": 4.811154813964283, "grad_norm": 0.0022210939787328243, "learning_rate": 8.680444440387624e-09, "loss": 0.0, "num_input_tokens_seen": 132699304, "step": 196935 }, { "epoch": 4.81127696479613, "grad_norm": 0.0010496609611436725, "learning_rate": 8.669236216966913e-09, "loss": 0.0, "num_input_tokens_seen": 132702888, "step": 196940 }, { "epoch": 4.811399115627977, "grad_norm": 0.00013030644913669676, "learning_rate": 8.658035202770086e-09, "loss": 0.0, "num_input_tokens_seen": 132706728, "step": 196945 }, { "epoch": 4.811521266459825, "grad_norm": 0.001109745935536921, "learning_rate": 8.646841397878634e-09, "loss": 0.0, "num_input_tokens_seen": 132710248, "step": 196950 }, { "epoch": 4.8116434172916716, "grad_norm": 0.00022087209799792618, "learning_rate": 8.635654802374048e-09, "loss": 0.0, "num_input_tokens_seen": 132713640, "step": 196955 }, { "epoch": 4.811765568123519, "grad_norm": 0.00040978117613121867, "learning_rate": 8.624475416337596e-09, "loss": 0.0, "num_input_tokens_seen": 132717096, "step": 196960 }, { "epoch": 4.811887718955366, "grad_norm": 0.0002552904188632965, "learning_rate": 8.613303239850544e-09, "loss": 0.0, "num_input_tokens_seen": 132720552, "step": 196965 }, { "epoch": 4.812009869787214, "grad_norm": 0.0022950470447540283, "learning_rate": 8.602138272994274e-09, "loss": 0.0, "num_input_tokens_seen": 132723624, "step": 196970 }, { "epoch": 4.81213202061906, "grad_norm": 0.0010927910916507244, "learning_rate": 8.590980515849945e-09, "loss": 0.0436, "num_input_tokens_seen": 132727336, "step": 196975 }, { "epoch": 4.812254171450908, "grad_norm": 8.401823288295418e-05, "learning_rate": 8.579829968498486e-09, "loss": 0.0, "num_input_tokens_seen": 132730280, "step": 196980 }, { "epoch": 4.812376322282755, "grad_norm": 0.0002500289410818368, "learning_rate": 8.568686631021394e-09, "loss": 0.0, "num_input_tokens_seen": 132733288, "step": 196985 }, { "epoch": 4.812498473114601, "grad_norm": 1.3336955817067064e-05, "learning_rate": 8.557550503499378e-09, "loss": 0.0, "num_input_tokens_seen": 132736872, "step": 196990 }, { "epoch": 4.812620623946449, "grad_norm": 7.999356967047788e-06, "learning_rate": 8.546421586013486e-09, "loss": 0.0, "num_input_tokens_seen": 132740264, "step": 196995 }, { "epoch": 4.812742774778297, "grad_norm": 0.000838687177747488, "learning_rate": 8.535299878644653e-09, "loss": 0.0, "num_input_tokens_seen": 132743912, "step": 197000 }, { "epoch": 4.8128649256101435, "grad_norm": 0.0011125325690954924, "learning_rate": 8.524185381473815e-09, "loss": 0.0, "num_input_tokens_seen": 132747368, "step": 197005 }, { "epoch": 4.81298707644199, "grad_norm": 0.0002906046574935317, "learning_rate": 8.513078094581904e-09, "loss": 0.0, "num_input_tokens_seen": 132750376, "step": 197010 }, { "epoch": 4.813109227273838, "grad_norm": 5.050322215538472e-05, "learning_rate": 8.501978018049528e-09, "loss": 0.0, "num_input_tokens_seen": 132753640, "step": 197015 }, { "epoch": 4.813231378105685, "grad_norm": 9.830085764406249e-05, "learning_rate": 8.490885151957283e-09, "loss": 0.0, "num_input_tokens_seen": 132757736, "step": 197020 }, { "epoch": 4.813353528937532, "grad_norm": 0.001108907745219767, "learning_rate": 8.47979949638622e-09, "loss": 0.0, "num_input_tokens_seen": 132761000, "step": 197025 }, { "epoch": 4.813475679769379, "grad_norm": 3.4584136301418766e-05, "learning_rate": 8.468721051416606e-09, "loss": 0.0, "num_input_tokens_seen": 132764392, "step": 197030 }, { "epoch": 4.813597830601227, "grad_norm": 0.001427344512194395, "learning_rate": 8.457649817129153e-09, "loss": 0.0, "num_input_tokens_seen": 132767656, "step": 197035 }, { "epoch": 4.813719981433073, "grad_norm": 0.00036199059104546905, "learning_rate": 8.446585793604355e-09, "loss": 0.0, "num_input_tokens_seen": 132771048, "step": 197040 }, { "epoch": 4.813842132264921, "grad_norm": 0.005828613881021738, "learning_rate": 8.435528980922812e-09, "loss": 0.0, "num_input_tokens_seen": 132774312, "step": 197045 }, { "epoch": 4.813964283096768, "grad_norm": 0.00045282530481927097, "learning_rate": 8.424479379164684e-09, "loss": 0.0, "num_input_tokens_seen": 132777448, "step": 197050 }, { "epoch": 4.814086433928615, "grad_norm": 0.00023736456932965666, "learning_rate": 8.41343698841035e-09, "loss": 0.0, "num_input_tokens_seen": 132780584, "step": 197055 }, { "epoch": 4.814208584760462, "grad_norm": 0.00018944533076137304, "learning_rate": 8.402401808740411e-09, "loss": 0.0, "num_input_tokens_seen": 132784168, "step": 197060 }, { "epoch": 4.81433073559231, "grad_norm": 0.0011267053196206689, "learning_rate": 8.391373840234805e-09, "loss": 0.0, "num_input_tokens_seen": 132787624, "step": 197065 }, { "epoch": 4.8144528864241565, "grad_norm": 0.00018865136371459812, "learning_rate": 8.380353082973913e-09, "loss": 0.0, "num_input_tokens_seen": 132791144, "step": 197070 }, { "epoch": 4.814575037256004, "grad_norm": 0.00015997131413314492, "learning_rate": 8.369339537037668e-09, "loss": 0.0, "num_input_tokens_seen": 132794472, "step": 197075 }, { "epoch": 4.814697188087851, "grad_norm": 0.012078426778316498, "learning_rate": 8.358333202506451e-09, "loss": 0.0, "num_input_tokens_seen": 132797992, "step": 197080 }, { "epoch": 4.814819338919698, "grad_norm": 0.00041595305083319545, "learning_rate": 8.347334079459978e-09, "loss": 0.0, "num_input_tokens_seen": 132801512, "step": 197085 }, { "epoch": 4.814941489751545, "grad_norm": 0.00453962991014123, "learning_rate": 8.336342167978516e-09, "loss": 0.0, "num_input_tokens_seen": 132805032, "step": 197090 }, { "epoch": 4.815063640583393, "grad_norm": 5.149428761797026e-05, "learning_rate": 8.325357468142002e-09, "loss": 0.0, "num_input_tokens_seen": 132808744, "step": 197095 }, { "epoch": 4.81518579141524, "grad_norm": 8.893240192264784e-06, "learning_rate": 8.31437998003004e-09, "loss": 0.0012, "num_input_tokens_seen": 132812136, "step": 197100 }, { "epoch": 4.815307942247086, "grad_norm": 0.001375921769067645, "learning_rate": 8.303409703722786e-09, "loss": 0.0, "num_input_tokens_seen": 132815848, "step": 197105 }, { "epoch": 4.815430093078934, "grad_norm": 5.135929677635431e-05, "learning_rate": 8.292446639299732e-09, "loss": 0.0, "num_input_tokens_seen": 132818920, "step": 197110 }, { "epoch": 4.815552243910781, "grad_norm": 0.10179103910923004, "learning_rate": 8.281490786840927e-09, "loss": 0.0006, "num_input_tokens_seen": 132822312, "step": 197115 }, { "epoch": 4.815674394742628, "grad_norm": 0.0015505808405578136, "learning_rate": 8.270542146425751e-09, "loss": 0.0, "num_input_tokens_seen": 132826152, "step": 197120 }, { "epoch": 4.815796545574475, "grad_norm": 2.3445218175766058e-05, "learning_rate": 8.25960071813392e-09, "loss": 0.0, "num_input_tokens_seen": 132829672, "step": 197125 }, { "epoch": 4.815918696406323, "grad_norm": 0.002743740798905492, "learning_rate": 8.248666502045032e-09, "loss": 0.0, "num_input_tokens_seen": 132832552, "step": 197130 }, { "epoch": 4.8160408472381695, "grad_norm": 3.8246111216722056e-05, "learning_rate": 8.237739498238582e-09, "loss": 0.0, "num_input_tokens_seen": 132835880, "step": 197135 }, { "epoch": 4.816162998070017, "grad_norm": 0.0019803382456302643, "learning_rate": 8.226819706794063e-09, "loss": 0.0, "num_input_tokens_seen": 132839080, "step": 197140 }, { "epoch": 4.816285148901864, "grad_norm": 0.0012779106618836522, "learning_rate": 8.215907127790856e-09, "loss": 0.0, "num_input_tokens_seen": 132842536, "step": 197145 }, { "epoch": 4.8164072997337115, "grad_norm": 5.748857438447885e-05, "learning_rate": 8.205001761308228e-09, "loss": 0.0, "num_input_tokens_seen": 132845736, "step": 197150 }, { "epoch": 4.816529450565558, "grad_norm": 2.8861688406323083e-05, "learning_rate": 8.194103607425784e-09, "loss": 0.0, "num_input_tokens_seen": 132849192, "step": 197155 }, { "epoch": 4.816651601397406, "grad_norm": 0.02581091783940792, "learning_rate": 8.183212666222461e-09, "loss": 0.0, "num_input_tokens_seen": 132851944, "step": 197160 }, { "epoch": 4.816773752229253, "grad_norm": 3.863290839944966e-05, "learning_rate": 8.172328937777639e-09, "loss": 0.0, "num_input_tokens_seen": 132855272, "step": 197165 }, { "epoch": 4.8168959030611, "grad_norm": 5.909180254093371e-05, "learning_rate": 8.161452422170367e-09, "loss": 0.0, "num_input_tokens_seen": 132858408, "step": 197170 }, { "epoch": 4.817018053892947, "grad_norm": 0.00025937153259292245, "learning_rate": 8.150583119479803e-09, "loss": 0.0, "num_input_tokens_seen": 132861736, "step": 197175 }, { "epoch": 4.817140204724794, "grad_norm": 0.0043001617304980755, "learning_rate": 8.139721029784996e-09, "loss": 0.0, "num_input_tokens_seen": 132865256, "step": 197180 }, { "epoch": 4.817262355556641, "grad_norm": 7.975584594532847e-05, "learning_rate": 8.12886615316477e-09, "loss": 0.0, "num_input_tokens_seen": 132868712, "step": 197185 }, { "epoch": 4.817384506388488, "grad_norm": 0.0008708810200914741, "learning_rate": 8.118018489698396e-09, "loss": 0.0, "num_input_tokens_seen": 132871720, "step": 197190 }, { "epoch": 4.817506657220336, "grad_norm": 0.0014090074691921473, "learning_rate": 8.10717803946448e-09, "loss": 0.0, "num_input_tokens_seen": 132874856, "step": 197195 }, { "epoch": 4.8176288080521825, "grad_norm": 0.012226647697389126, "learning_rate": 8.096344802542066e-09, "loss": 0.0, "num_input_tokens_seen": 132878184, "step": 197200 }, { "epoch": 4.81775095888403, "grad_norm": 8.560111382394098e-06, "learning_rate": 8.085518779009648e-09, "loss": 0.0, "num_input_tokens_seen": 132881448, "step": 197205 }, { "epoch": 4.817873109715877, "grad_norm": 0.0011228991206735373, "learning_rate": 8.074699968946275e-09, "loss": 0.0, "num_input_tokens_seen": 132885288, "step": 197210 }, { "epoch": 4.8179952605477245, "grad_norm": 0.0005283401696942747, "learning_rate": 8.063888372430439e-09, "loss": 0.0, "num_input_tokens_seen": 132889256, "step": 197215 }, { "epoch": 4.818117411379571, "grad_norm": 0.0003394389059394598, "learning_rate": 8.053083989540743e-09, "loss": 0.0, "num_input_tokens_seen": 132892712, "step": 197220 }, { "epoch": 4.818239562211419, "grad_norm": 1.3488976037479006e-05, "learning_rate": 8.042286820355903e-09, "loss": 0.0, "num_input_tokens_seen": 132895976, "step": 197225 }, { "epoch": 4.818361713043266, "grad_norm": 0.02327442169189453, "learning_rate": 8.031496864954302e-09, "loss": 0.0, "num_input_tokens_seen": 132898792, "step": 197230 }, { "epoch": 4.818483863875113, "grad_norm": 6.318982195807621e-05, "learning_rate": 8.020714123414541e-09, "loss": 0.0, "num_input_tokens_seen": 132902248, "step": 197235 }, { "epoch": 4.81860601470696, "grad_norm": 0.0002425254788249731, "learning_rate": 8.009938595814892e-09, "loss": 0.0, "num_input_tokens_seen": 132905640, "step": 197240 }, { "epoch": 4.818728165538808, "grad_norm": 0.00015229900600388646, "learning_rate": 7.999170282233736e-09, "loss": 0.0, "num_input_tokens_seen": 132909032, "step": 197245 }, { "epoch": 4.818850316370654, "grad_norm": 0.00026307988446205854, "learning_rate": 7.988409182749567e-09, "loss": 0.0, "num_input_tokens_seen": 132912232, "step": 197250 }, { "epoch": 4.818972467202501, "grad_norm": 6.763396231690422e-05, "learning_rate": 7.977655297440433e-09, "loss": 0.0, "num_input_tokens_seen": 132915752, "step": 197255 }, { "epoch": 4.819094618034349, "grad_norm": 0.00016935997700784355, "learning_rate": 7.966908626384605e-09, "loss": 0.0, "num_input_tokens_seen": 132918824, "step": 197260 }, { "epoch": 4.819216768866196, "grad_norm": 0.00012209487613290548, "learning_rate": 7.956169169660242e-09, "loss": 0.0, "num_input_tokens_seen": 132922024, "step": 197265 }, { "epoch": 4.819338919698043, "grad_norm": 4.2448813474038616e-05, "learning_rate": 7.945436927345395e-09, "loss": 0.0, "num_input_tokens_seen": 132925736, "step": 197270 }, { "epoch": 4.81946107052989, "grad_norm": 0.0004389749956317246, "learning_rate": 7.93471189951822e-09, "loss": 0.0, "num_input_tokens_seen": 132929576, "step": 197275 }, { "epoch": 4.819583221361738, "grad_norm": 0.0021197644528001547, "learning_rate": 7.923994086256657e-09, "loss": 0.0, "num_input_tokens_seen": 132932776, "step": 197280 }, { "epoch": 4.819705372193584, "grad_norm": 2.7764235710492358e-05, "learning_rate": 7.913283487638645e-09, "loss": 0.0, "num_input_tokens_seen": 132936104, "step": 197285 }, { "epoch": 4.819827523025432, "grad_norm": 6.35007891105488e-05, "learning_rate": 7.902580103742008e-09, "loss": 0.0, "num_input_tokens_seen": 132939816, "step": 197290 }, { "epoch": 4.819949673857279, "grad_norm": 0.0019520074129104614, "learning_rate": 7.891883934644794e-09, "loss": 0.0, "num_input_tokens_seen": 132943336, "step": 197295 }, { "epoch": 4.820071824689126, "grad_norm": 0.0001843513164203614, "learning_rate": 7.8811949804245e-09, "loss": 0.0, "num_input_tokens_seen": 132946536, "step": 197300 }, { "epoch": 4.820193975520973, "grad_norm": 0.0003039787115994841, "learning_rate": 7.87051324115906e-09, "loss": 0.0, "num_input_tokens_seen": 132949864, "step": 197305 }, { "epoch": 4.820316126352821, "grad_norm": 0.009050632826983929, "learning_rate": 7.859838716926081e-09, "loss": 0.0, "num_input_tokens_seen": 132953064, "step": 197310 }, { "epoch": 4.820438277184667, "grad_norm": 0.004172494634985924, "learning_rate": 7.849171407803168e-09, "loss": 0.0, "num_input_tokens_seen": 132956008, "step": 197315 }, { "epoch": 4.820560428016515, "grad_norm": 0.003063405863940716, "learning_rate": 7.838511313868035e-09, "loss": 0.0, "num_input_tokens_seen": 132959400, "step": 197320 }, { "epoch": 4.820682578848362, "grad_norm": 0.0004874825826846063, "learning_rate": 7.827858435198176e-09, "loss": 0.0, "num_input_tokens_seen": 132962728, "step": 197325 }, { "epoch": 4.8208047296802095, "grad_norm": 0.002249550772830844, "learning_rate": 7.817212771870863e-09, "loss": 0.0, "num_input_tokens_seen": 132966312, "step": 197330 }, { "epoch": 4.820926880512056, "grad_norm": 8.849135338095948e-05, "learning_rate": 7.806574323963699e-09, "loss": 0.0, "num_input_tokens_seen": 132970024, "step": 197335 }, { "epoch": 4.821049031343904, "grad_norm": 3.668137651402503e-05, "learning_rate": 7.795943091553847e-09, "loss": 0.0, "num_input_tokens_seen": 132973224, "step": 197340 }, { "epoch": 4.821171182175751, "grad_norm": 0.0004503615782596171, "learning_rate": 7.78531907471891e-09, "loss": 0.0, "num_input_tokens_seen": 132976680, "step": 197345 }, { "epoch": 4.821293333007597, "grad_norm": 2.815644438669551e-05, "learning_rate": 7.774702273535937e-09, "loss": 0.0, "num_input_tokens_seen": 132980264, "step": 197350 }, { "epoch": 4.821415483839445, "grad_norm": 0.0013015008298680186, "learning_rate": 7.764092688082313e-09, "loss": 0.0, "num_input_tokens_seen": 132984040, "step": 197355 }, { "epoch": 4.821537634671293, "grad_norm": 0.001261149882338941, "learning_rate": 7.753490318434975e-09, "loss": 0.0, "num_input_tokens_seen": 132987560, "step": 197360 }, { "epoch": 4.821659785503139, "grad_norm": 0.05001718923449516, "learning_rate": 7.742895164671303e-09, "loss": 0.0, "num_input_tokens_seen": 132990952, "step": 197365 }, { "epoch": 4.821781936334986, "grad_norm": 0.00034191334270872176, "learning_rate": 7.732307226868017e-09, "loss": 0.0, "num_input_tokens_seen": 132993960, "step": 197370 }, { "epoch": 4.821904087166834, "grad_norm": 0.0001335551933152601, "learning_rate": 7.721726505102277e-09, "loss": 0.0004, "num_input_tokens_seen": 132997288, "step": 197375 }, { "epoch": 4.8220262379986805, "grad_norm": 0.00773590337485075, "learning_rate": 7.711152999451132e-09, "loss": 0.0, "num_input_tokens_seen": 133001192, "step": 197380 }, { "epoch": 4.822148388830528, "grad_norm": 9.39232631935738e-05, "learning_rate": 7.700586709991297e-09, "loss": 0.0001, "num_input_tokens_seen": 133004520, "step": 197385 }, { "epoch": 4.822270539662375, "grad_norm": 0.0005778432823717594, "learning_rate": 7.690027636799712e-09, "loss": 0.0, "num_input_tokens_seen": 133008232, "step": 197390 }, { "epoch": 4.8223926904942225, "grad_norm": 0.008273705840110779, "learning_rate": 7.679475779953093e-09, "loss": 0.0, "num_input_tokens_seen": 133011432, "step": 197395 }, { "epoch": 4.822514841326069, "grad_norm": 0.00024316016060765833, "learning_rate": 7.668931139528267e-09, "loss": 0.0, "num_input_tokens_seen": 133015080, "step": 197400 }, { "epoch": 4.822636992157917, "grad_norm": 0.0011960206320509315, "learning_rate": 7.658393715601951e-09, "loss": 0.0, "num_input_tokens_seen": 133018792, "step": 197405 }, { "epoch": 4.822759142989764, "grad_norm": 0.0014048997545614839, "learning_rate": 7.64786350825064e-09, "loss": 0.0, "num_input_tokens_seen": 133022248, "step": 197410 }, { "epoch": 4.822881293821611, "grad_norm": 2.924349610111676e-05, "learning_rate": 7.637340517551049e-09, "loss": 0.0, "num_input_tokens_seen": 133025896, "step": 197415 }, { "epoch": 4.823003444653458, "grad_norm": 3.692340760608204e-05, "learning_rate": 7.626824743579564e-09, "loss": 0.0, "num_input_tokens_seen": 133029864, "step": 197420 }, { "epoch": 4.823125595485306, "grad_norm": 5.631545354845002e-05, "learning_rate": 7.616316186412675e-09, "loss": 0.0, "num_input_tokens_seen": 133033448, "step": 197425 }, { "epoch": 4.823247746317152, "grad_norm": 0.00086938840104267, "learning_rate": 7.60581484612699e-09, "loss": 0.0, "num_input_tokens_seen": 133037224, "step": 197430 }, { "epoch": 4.823369897149, "grad_norm": 2.341336767130997e-05, "learning_rate": 7.59532072279867e-09, "loss": 0.0, "num_input_tokens_seen": 133041000, "step": 197435 }, { "epoch": 4.823492047980847, "grad_norm": 0.0006846496253274381, "learning_rate": 7.5848338165041e-09, "loss": 0.0, "num_input_tokens_seen": 133044392, "step": 197440 }, { "epoch": 4.8236141988126935, "grad_norm": 0.00022248385357670486, "learning_rate": 7.574354127319548e-09, "loss": 0.0, "num_input_tokens_seen": 133047976, "step": 197445 }, { "epoch": 4.823736349644541, "grad_norm": 7.169241143856198e-05, "learning_rate": 7.56388165532118e-09, "loss": 0.0, "num_input_tokens_seen": 133051240, "step": 197450 }, { "epoch": 4.823858500476388, "grad_norm": 0.0003167959803249687, "learning_rate": 7.553416400585267e-09, "loss": 0.0, "num_input_tokens_seen": 133054504, "step": 197455 }, { "epoch": 4.8239806513082355, "grad_norm": 3.2362113415729254e-05, "learning_rate": 7.542958363187746e-09, "loss": 0.0, "num_input_tokens_seen": 133057896, "step": 197460 }, { "epoch": 4.824102802140082, "grad_norm": 0.000935925985686481, "learning_rate": 7.532507543204891e-09, "loss": 0.0, "num_input_tokens_seen": 133060904, "step": 197465 }, { "epoch": 4.82422495297193, "grad_norm": 1.3999424481880851e-05, "learning_rate": 7.522063940712531e-09, "loss": 0.0, "num_input_tokens_seen": 133064296, "step": 197470 }, { "epoch": 4.824347103803777, "grad_norm": 0.00016260526899714023, "learning_rate": 7.511627555786715e-09, "loss": 0.0, "num_input_tokens_seen": 133067624, "step": 197475 }, { "epoch": 4.824469254635624, "grad_norm": 4.1454304664512165e-06, "learning_rate": 7.50119838850316e-09, "loss": 0.0, "num_input_tokens_seen": 133071720, "step": 197480 }, { "epoch": 4.824591405467471, "grad_norm": 4.6204342652345076e-05, "learning_rate": 7.490776438937918e-09, "loss": 0.0, "num_input_tokens_seen": 133075304, "step": 197485 }, { "epoch": 4.824713556299319, "grad_norm": 0.00016602440155111253, "learning_rate": 7.480361707166705e-09, "loss": 0.0, "num_input_tokens_seen": 133078568, "step": 197490 }, { "epoch": 4.824835707131165, "grad_norm": 0.00016575964400544763, "learning_rate": 7.469954193265238e-09, "loss": 0.0, "num_input_tokens_seen": 133082024, "step": 197495 }, { "epoch": 4.824957857963013, "grad_norm": 0.0069565181620419025, "learning_rate": 7.459553897309346e-09, "loss": 0.0, "num_input_tokens_seen": 133085544, "step": 197500 }, { "epoch": 4.82508000879486, "grad_norm": 0.0015840993728488684, "learning_rate": 7.4491608193744115e-09, "loss": 0.0, "num_input_tokens_seen": 133088680, "step": 197505 }, { "epoch": 4.825202159626707, "grad_norm": 8.865137351676822e-05, "learning_rate": 7.438774959536154e-09, "loss": 0.0, "num_input_tokens_seen": 133091752, "step": 197510 }, { "epoch": 4.825324310458554, "grad_norm": 0.0001753615797497332, "learning_rate": 7.428396317870067e-09, "loss": 0.0, "num_input_tokens_seen": 133094888, "step": 197515 }, { "epoch": 4.825446461290401, "grad_norm": 2.2867747247801162e-05, "learning_rate": 7.4180248944517575e-09, "loss": 0.0, "num_input_tokens_seen": 133098152, "step": 197520 }, { "epoch": 4.8255686121222485, "grad_norm": 0.0006337054655887187, "learning_rate": 7.407660689356388e-09, "loss": 0.0, "num_input_tokens_seen": 133101224, "step": 197525 }, { "epoch": 4.825690762954096, "grad_norm": 0.004857445135712624, "learning_rate": 7.397303702659674e-09, "loss": 0.0, "num_input_tokens_seen": 133104744, "step": 197530 }, { "epoch": 4.825812913785943, "grad_norm": 0.0017321788473054767, "learning_rate": 7.3869539344365575e-09, "loss": 0.0, "num_input_tokens_seen": 133108072, "step": 197535 }, { "epoch": 4.82593506461779, "grad_norm": 0.0005325916572473943, "learning_rate": 7.376611384762643e-09, "loss": 0.0, "num_input_tokens_seen": 133111784, "step": 197540 }, { "epoch": 4.826057215449637, "grad_norm": 5.844299084856175e-05, "learning_rate": 7.366276053712983e-09, "loss": 0.0001, "num_input_tokens_seen": 133115176, "step": 197545 }, { "epoch": 4.826179366281484, "grad_norm": 0.0008696087752468884, "learning_rate": 7.355947941362628e-09, "loss": 0.0, "num_input_tokens_seen": 133118312, "step": 197550 }, { "epoch": 4.826301517113332, "grad_norm": 0.00013190713070798665, "learning_rate": 7.345627047786851e-09, "loss": 0.0, "num_input_tokens_seen": 133121960, "step": 197555 }, { "epoch": 4.826423667945178, "grad_norm": 0.0003237095370423049, "learning_rate": 7.335313373060703e-09, "loss": 0.0, "num_input_tokens_seen": 133125096, "step": 197560 }, { "epoch": 4.826545818777026, "grad_norm": 1.4504607861454133e-05, "learning_rate": 7.325006917259124e-09, "loss": 0.0, "num_input_tokens_seen": 133128488, "step": 197565 }, { "epoch": 4.826667969608873, "grad_norm": 0.00011755106970667839, "learning_rate": 7.3147076804571665e-09, "loss": 0.0, "num_input_tokens_seen": 133132072, "step": 197570 }, { "epoch": 4.82679012044072, "grad_norm": 0.0031064345967024565, "learning_rate": 7.304415662729546e-09, "loss": 0.0, "num_input_tokens_seen": 133135400, "step": 197575 }, { "epoch": 4.826912271272567, "grad_norm": 0.00012528127990663052, "learning_rate": 7.294130864151315e-09, "loss": 0.0, "num_input_tokens_seen": 133138856, "step": 197580 }, { "epoch": 4.827034422104415, "grad_norm": 0.0107027068734169, "learning_rate": 7.2838532847971926e-09, "loss": 0.0, "num_input_tokens_seen": 133141992, "step": 197585 }, { "epoch": 4.8271565729362615, "grad_norm": 0.018038859590888023, "learning_rate": 7.273582924741783e-09, "loss": 0.0, "num_input_tokens_seen": 133145256, "step": 197590 }, { "epoch": 4.827278723768109, "grad_norm": 0.000366037042113021, "learning_rate": 7.263319784059918e-09, "loss": 0.0, "num_input_tokens_seen": 133148712, "step": 197595 }, { "epoch": 4.827400874599956, "grad_norm": 0.01911904290318489, "learning_rate": 7.253063862826203e-09, "loss": 0.0, "num_input_tokens_seen": 133152104, "step": 197600 }, { "epoch": 4.827523025431804, "grad_norm": 0.0010948879644274712, "learning_rate": 7.242815161115246e-09, "loss": 0.0, "num_input_tokens_seen": 133155432, "step": 197605 }, { "epoch": 4.82764517626365, "grad_norm": 0.001213407376781106, "learning_rate": 7.232573679001541e-09, "loss": 0.0256, "num_input_tokens_seen": 133158760, "step": 197610 }, { "epoch": 4.827767327095497, "grad_norm": 0.00012105743371648714, "learning_rate": 7.222339416559587e-09, "loss": 0.0762, "num_input_tokens_seen": 133163176, "step": 197615 }, { "epoch": 4.827889477927345, "grad_norm": 4.5139000576455146e-05, "learning_rate": 7.212112373863877e-09, "loss": 0.0001, "num_input_tokens_seen": 133166312, "step": 197620 }, { "epoch": 4.828011628759192, "grad_norm": 0.00018658023327589035, "learning_rate": 7.201892550988686e-09, "loss": 0.0, "num_input_tokens_seen": 133169640, "step": 197625 }, { "epoch": 4.828133779591039, "grad_norm": 0.00011530852498253807, "learning_rate": 7.191679948008289e-09, "loss": 0.0, "num_input_tokens_seen": 133173224, "step": 197630 }, { "epoch": 4.828255930422886, "grad_norm": 7.312805246328935e-05, "learning_rate": 7.1814745649971805e-09, "loss": 0.0, "num_input_tokens_seen": 133176680, "step": 197635 }, { "epoch": 4.8283780812547334, "grad_norm": 0.00026635496760718524, "learning_rate": 7.171276402029191e-09, "loss": 0.0, "num_input_tokens_seen": 133179496, "step": 197640 }, { "epoch": 4.82850023208658, "grad_norm": 0.00010877988825086504, "learning_rate": 7.161085459178928e-09, "loss": 0.0001, "num_input_tokens_seen": 133182760, "step": 197645 }, { "epoch": 4.828622382918428, "grad_norm": 6.915336416568607e-05, "learning_rate": 7.150901736520221e-09, "loss": 0.0, "num_input_tokens_seen": 133186216, "step": 197650 }, { "epoch": 4.828744533750275, "grad_norm": 0.0005507735768333077, "learning_rate": 7.140725234127231e-09, "loss": 0.0, "num_input_tokens_seen": 133189416, "step": 197655 }, { "epoch": 4.828866684582122, "grad_norm": 5.71970667806454e-05, "learning_rate": 7.130555952073792e-09, "loss": 0.0, "num_input_tokens_seen": 133193704, "step": 197660 }, { "epoch": 4.828988835413969, "grad_norm": 3.259177537984215e-05, "learning_rate": 7.120393890434173e-09, "loss": 0.0, "num_input_tokens_seen": 133197992, "step": 197665 }, { "epoch": 4.829110986245817, "grad_norm": 0.002456663642078638, "learning_rate": 7.1102390492819855e-09, "loss": 0.0, "num_input_tokens_seen": 133201640, "step": 197670 }, { "epoch": 4.829233137077663, "grad_norm": 0.0064817629754543304, "learning_rate": 7.100091428691279e-09, "loss": 0.0, "num_input_tokens_seen": 133205544, "step": 197675 }, { "epoch": 4.829355287909511, "grad_norm": 6.728667358402163e-05, "learning_rate": 7.089951028735663e-09, "loss": 0.0, "num_input_tokens_seen": 133208872, "step": 197680 }, { "epoch": 4.829477438741358, "grad_norm": 4.01813886128366e-05, "learning_rate": 7.079817849489078e-09, "loss": 0.0, "num_input_tokens_seen": 133212008, "step": 197685 }, { "epoch": 4.829599589573205, "grad_norm": 0.002144238678738475, "learning_rate": 7.069691891025132e-09, "loss": 0.0, "num_input_tokens_seen": 133215464, "step": 197690 }, { "epoch": 4.829721740405052, "grad_norm": 0.0005259595345705748, "learning_rate": 7.05957315341732e-09, "loss": 0.0, "num_input_tokens_seen": 133218536, "step": 197695 }, { "epoch": 4.8298438912369, "grad_norm": 0.031700070947408676, "learning_rate": 7.049461636739473e-09, "loss": 0.0, "num_input_tokens_seen": 133221544, "step": 197700 }, { "epoch": 4.8299660420687465, "grad_norm": 6.699377263430506e-05, "learning_rate": 7.039357341064978e-09, "loss": 0.0, "num_input_tokens_seen": 133224744, "step": 197705 }, { "epoch": 4.830088192900593, "grad_norm": 3.650726648629643e-05, "learning_rate": 7.0292602664673295e-09, "loss": 0.0, "num_input_tokens_seen": 133227944, "step": 197710 }, { "epoch": 4.830210343732441, "grad_norm": 0.0007697915425524116, "learning_rate": 7.019170413020026e-09, "loss": 0.0174, "num_input_tokens_seen": 133231784, "step": 197715 }, { "epoch": 4.8303324945642885, "grad_norm": 9.95201407931745e-05, "learning_rate": 7.009087780796452e-09, "loss": 0.0, "num_input_tokens_seen": 133235112, "step": 197720 }, { "epoch": 4.830454645396135, "grad_norm": 0.00124736491125077, "learning_rate": 6.999012369869773e-09, "loss": 0.0, "num_input_tokens_seen": 133238632, "step": 197725 }, { "epoch": 4.830576796227982, "grad_norm": 0.00044174346840009093, "learning_rate": 6.988944180313372e-09, "loss": 0.0, "num_input_tokens_seen": 133241896, "step": 197730 }, { "epoch": 4.83069894705983, "grad_norm": 0.0006960714235901833, "learning_rate": 6.978883212200526e-09, "loss": 0.0, "num_input_tokens_seen": 133244904, "step": 197735 }, { "epoch": 4.830821097891676, "grad_norm": 0.009272739291191101, "learning_rate": 6.968829465604287e-09, "loss": 0.0, "num_input_tokens_seen": 133247656, "step": 197740 }, { "epoch": 4.830943248723524, "grad_norm": 4.248324330546893e-05, "learning_rate": 6.9587829405978184e-09, "loss": 0.0, "num_input_tokens_seen": 133250984, "step": 197745 }, { "epoch": 4.831065399555371, "grad_norm": 0.0006277945940382779, "learning_rate": 6.948743637254173e-09, "loss": 0.0, "num_input_tokens_seen": 133255208, "step": 197750 }, { "epoch": 4.831187550387218, "grad_norm": 0.00044739150325767696, "learning_rate": 6.938711555646293e-09, "loss": 0.0, "num_input_tokens_seen": 133258600, "step": 197755 }, { "epoch": 4.831309701219065, "grad_norm": 4.2511983338044956e-05, "learning_rate": 6.928686695847341e-09, "loss": 0.0009, "num_input_tokens_seen": 133262120, "step": 197760 }, { "epoch": 4.831431852050913, "grad_norm": 3.650227881735191e-05, "learning_rate": 6.918669057929927e-09, "loss": 0.0, "num_input_tokens_seen": 133265448, "step": 197765 }, { "epoch": 4.8315540028827595, "grad_norm": 0.0002340917126275599, "learning_rate": 6.908658641967102e-09, "loss": 0.0, "num_input_tokens_seen": 133268584, "step": 197770 }, { "epoch": 4.831676153714607, "grad_norm": 0.003371995175257325, "learning_rate": 6.8986554480316985e-09, "loss": 0.0, "num_input_tokens_seen": 133271848, "step": 197775 }, { "epoch": 4.831798304546454, "grad_norm": 0.03181646019220352, "learning_rate": 6.888659476196323e-09, "loss": 0.0, "num_input_tokens_seen": 133275880, "step": 197780 }, { "epoch": 4.8319204553783015, "grad_norm": 0.000140202566399239, "learning_rate": 6.878670726533808e-09, "loss": 0.0, "num_input_tokens_seen": 133279656, "step": 197785 }, { "epoch": 4.832042606210148, "grad_norm": 0.00034241325920447707, "learning_rate": 6.868689199116651e-09, "loss": 0.0, "num_input_tokens_seen": 133283112, "step": 197790 }, { "epoch": 4.832164757041996, "grad_norm": 0.0015882498119026423, "learning_rate": 6.85871489401757e-09, "loss": 0.0, "num_input_tokens_seen": 133285992, "step": 197795 }, { "epoch": 4.832286907873843, "grad_norm": 0.00044084640103392303, "learning_rate": 6.8487478113089524e-09, "loss": 0.0, "num_input_tokens_seen": 133289512, "step": 197800 }, { "epoch": 4.832409058705689, "grad_norm": 5.1393995818216354e-05, "learning_rate": 6.838787951063407e-09, "loss": 0.0, "num_input_tokens_seen": 133292712, "step": 197805 }, { "epoch": 4.832531209537537, "grad_norm": 0.0006298979860730469, "learning_rate": 6.8288353133533205e-09, "loss": 0.0, "num_input_tokens_seen": 133296360, "step": 197810 }, { "epoch": 4.832653360369384, "grad_norm": 0.004993734881281853, "learning_rate": 6.818889898250968e-09, "loss": 0.0, "num_input_tokens_seen": 133299752, "step": 197815 }, { "epoch": 4.832775511201231, "grad_norm": 215.03285217285156, "learning_rate": 6.8089517058289584e-09, "loss": 0.0019, "num_input_tokens_seen": 133303272, "step": 197820 }, { "epoch": 4.832897662033078, "grad_norm": 2.1080935766804032e-05, "learning_rate": 6.7990207361593445e-09, "loss": 0.0, "num_input_tokens_seen": 133306664, "step": 197825 }, { "epoch": 4.833019812864926, "grad_norm": 0.0006089547532610595, "learning_rate": 6.789096989314291e-09, "loss": 0.0, "num_input_tokens_seen": 133310312, "step": 197830 }, { "epoch": 4.8331419636967725, "grad_norm": 0.003080077702179551, "learning_rate": 6.7791804653661855e-09, "loss": 0.0, "num_input_tokens_seen": 133313960, "step": 197835 }, { "epoch": 4.83326411452862, "grad_norm": 8.677435107529163e-05, "learning_rate": 6.769271164386969e-09, "loss": 0.0, "num_input_tokens_seen": 133316968, "step": 197840 }, { "epoch": 4.833386265360467, "grad_norm": 0.0014505936997011304, "learning_rate": 6.759369086448696e-09, "loss": 0.0001, "num_input_tokens_seen": 133319848, "step": 197845 }, { "epoch": 4.8335084161923145, "grad_norm": 5.86692440265324e-05, "learning_rate": 6.749474231623531e-09, "loss": 0.0, "num_input_tokens_seen": 133323048, "step": 197850 }, { "epoch": 4.833630567024161, "grad_norm": 0.00010189796012127772, "learning_rate": 6.739586599983416e-09, "loss": 0.0, "num_input_tokens_seen": 133326184, "step": 197855 }, { "epoch": 4.833752717856009, "grad_norm": 6.377408226398984e-06, "learning_rate": 6.7297061916000706e-09, "loss": 0.0, "num_input_tokens_seen": 133329512, "step": 197860 }, { "epoch": 4.833874868687856, "grad_norm": 0.0003621731302700937, "learning_rate": 6.719833006545439e-09, "loss": 0.0, "num_input_tokens_seen": 133332840, "step": 197865 }, { "epoch": 4.833997019519703, "grad_norm": 0.031017715111374855, "learning_rate": 6.709967044891351e-09, "loss": 0.0, "num_input_tokens_seen": 133336424, "step": 197870 }, { "epoch": 4.83411917035155, "grad_norm": 0.0003073037078138441, "learning_rate": 6.7001083067095285e-09, "loss": 0.0, "num_input_tokens_seen": 133339880, "step": 197875 }, { "epoch": 4.834241321183397, "grad_norm": 4.218026151647791e-05, "learning_rate": 6.690256792071802e-09, "loss": 0.0, "num_input_tokens_seen": 133343208, "step": 197880 }, { "epoch": 4.834363472015244, "grad_norm": 0.0009448288474231958, "learning_rate": 6.680412501049559e-09, "loss": 0.0, "num_input_tokens_seen": 133346664, "step": 197885 }, { "epoch": 4.834485622847092, "grad_norm": 3.618423215812072e-05, "learning_rate": 6.670575433714631e-09, "loss": 0.0, "num_input_tokens_seen": 133350440, "step": 197890 }, { "epoch": 4.834607773678939, "grad_norm": 0.0009504237677901983, "learning_rate": 6.660745590138406e-09, "loss": 0.0, "num_input_tokens_seen": 133353704, "step": 197895 }, { "epoch": 4.8347299245107855, "grad_norm": 0.008153039962053299, "learning_rate": 6.650922970392381e-09, "loss": 0.0, "num_input_tokens_seen": 133356648, "step": 197900 }, { "epoch": 4.834852075342633, "grad_norm": 0.07207217812538147, "learning_rate": 6.641107574548055e-09, "loss": 0.0001, "num_input_tokens_seen": 133361064, "step": 197905 }, { "epoch": 4.83497422617448, "grad_norm": 0.08829605579376221, "learning_rate": 6.6312994026768155e-09, "loss": 0.0, "num_input_tokens_seen": 133365160, "step": 197910 }, { "epoch": 4.8350963770063276, "grad_norm": 0.005756590981036425, "learning_rate": 6.621498454849939e-09, "loss": 0.0, "num_input_tokens_seen": 133368296, "step": 197915 }, { "epoch": 4.835218527838174, "grad_norm": 9.277733624912798e-05, "learning_rate": 6.6117047311387006e-09, "loss": 0.0, "num_input_tokens_seen": 133371304, "step": 197920 }, { "epoch": 4.835340678670022, "grad_norm": 0.00010662163549568504, "learning_rate": 6.601918231614267e-09, "loss": 0.0279, "num_input_tokens_seen": 133374760, "step": 197925 }, { "epoch": 4.835462829501869, "grad_norm": 0.29649490118026733, "learning_rate": 6.592138956347915e-09, "loss": 0.0001, "num_input_tokens_seen": 133378728, "step": 197930 }, { "epoch": 4.835584980333716, "grad_norm": 0.0001025107194436714, "learning_rate": 6.582366905410808e-09, "loss": 0.0, "num_input_tokens_seen": 133382376, "step": 197935 }, { "epoch": 4.835707131165563, "grad_norm": 0.0010448351968079805, "learning_rate": 6.57260207887389e-09, "loss": 0.0, "num_input_tokens_seen": 133385576, "step": 197940 }, { "epoch": 4.835829281997411, "grad_norm": 0.009936603717505932, "learning_rate": 6.562844476808216e-09, "loss": 0.0, "num_input_tokens_seen": 133388456, "step": 197945 }, { "epoch": 4.835951432829257, "grad_norm": 9.89523614407517e-05, "learning_rate": 6.553094099284617e-09, "loss": 0.0, "num_input_tokens_seen": 133393704, "step": 197950 }, { "epoch": 4.836073583661105, "grad_norm": 0.00017748077516444027, "learning_rate": 6.543350946374259e-09, "loss": 0.0, "num_input_tokens_seen": 133397096, "step": 197955 }, { "epoch": 4.836195734492952, "grad_norm": 6.688635949103627e-06, "learning_rate": 6.533615018147753e-09, "loss": 0.0, "num_input_tokens_seen": 133399976, "step": 197960 }, { "epoch": 4.8363178853247994, "grad_norm": 0.0003063578624278307, "learning_rate": 6.523886314676152e-09, "loss": 0.0, "num_input_tokens_seen": 133403688, "step": 197965 }, { "epoch": 4.836440036156646, "grad_norm": 0.002320501022040844, "learning_rate": 6.514164836029956e-09, "loss": 0.0, "num_input_tokens_seen": 133407336, "step": 197970 }, { "epoch": 4.836562186988493, "grad_norm": 0.0012531977845355868, "learning_rate": 6.504450582279997e-09, "loss": 0.0002, "num_input_tokens_seen": 133410536, "step": 197975 }, { "epoch": 4.836684337820341, "grad_norm": 0.0012084580957889557, "learning_rate": 6.494743553496884e-09, "loss": 0.0, "num_input_tokens_seen": 133413480, "step": 197980 }, { "epoch": 4.836806488652188, "grad_norm": 9.320944809587672e-05, "learning_rate": 6.485043749751229e-09, "loss": 0.0, "num_input_tokens_seen": 133417000, "step": 197985 }, { "epoch": 4.836928639484035, "grad_norm": 0.005629129242151976, "learning_rate": 6.47535117111353e-09, "loss": 0.0, "num_input_tokens_seen": 133420200, "step": 197990 }, { "epoch": 4.837050790315882, "grad_norm": 1.2280200280656572e-05, "learning_rate": 6.465665817654287e-09, "loss": 0.0, "num_input_tokens_seen": 133423528, "step": 197995 }, { "epoch": 4.837172941147729, "grad_norm": 0.0018922107992693782, "learning_rate": 6.455987689443998e-09, "loss": 0.0, "num_input_tokens_seen": 133426984, "step": 198000 }, { "epoch": 4.837295091979576, "grad_norm": 0.008688423782587051, "learning_rate": 6.446316786552941e-09, "loss": 0.0, "num_input_tokens_seen": 133430312, "step": 198005 }, { "epoch": 4.837417242811424, "grad_norm": 6.29360947641544e-05, "learning_rate": 6.436653109051615e-09, "loss": 0.0, "num_input_tokens_seen": 133434024, "step": 198010 }, { "epoch": 4.8375393936432705, "grad_norm": 2.2677289962302893e-05, "learning_rate": 6.426996657010075e-09, "loss": 0.0, "num_input_tokens_seen": 133437480, "step": 198015 }, { "epoch": 4.837661544475118, "grad_norm": 0.011553088203072548, "learning_rate": 6.4173474304987096e-09, "loss": 0.0, "num_input_tokens_seen": 133440936, "step": 198020 }, { "epoch": 4.837783695306965, "grad_norm": 0.00016699406842235476, "learning_rate": 6.407705429587573e-09, "loss": 0.0, "num_input_tokens_seen": 133444072, "step": 198025 }, { "epoch": 4.8379058461388125, "grad_norm": 0.002082869876176119, "learning_rate": 6.398070654346943e-09, "loss": 0.0, "num_input_tokens_seen": 133447848, "step": 198030 }, { "epoch": 4.838027996970659, "grad_norm": 0.0001289423234993592, "learning_rate": 6.3884431048467635e-09, "loss": 0.0, "num_input_tokens_seen": 133451432, "step": 198035 }, { "epoch": 4.838150147802507, "grad_norm": 0.0005948066245764494, "learning_rate": 6.378822781156978e-09, "loss": 0.0, "num_input_tokens_seen": 133454440, "step": 198040 }, { "epoch": 4.838272298634354, "grad_norm": 0.0036121481098234653, "learning_rate": 6.369209683347754e-09, "loss": 0.0, "num_input_tokens_seen": 133457704, "step": 198045 }, { "epoch": 4.838394449466201, "grad_norm": 0.0004089911817573011, "learning_rate": 6.3596038114888114e-09, "loss": 0.0, "num_input_tokens_seen": 133461096, "step": 198050 }, { "epoch": 4.838516600298048, "grad_norm": 0.00018862300203181803, "learning_rate": 6.350005165650207e-09, "loss": 0.0, "num_input_tokens_seen": 133464360, "step": 198055 }, { "epoch": 4.838638751129896, "grad_norm": 0.004749796353280544, "learning_rate": 6.340413745901551e-09, "loss": 0.0, "num_input_tokens_seen": 133467816, "step": 198060 }, { "epoch": 4.838760901961742, "grad_norm": 2.8277538149268366e-05, "learning_rate": 6.330829552312678e-09, "loss": 0.0, "num_input_tokens_seen": 133471080, "step": 198065 }, { "epoch": 4.838883052793589, "grad_norm": 0.11721772700548172, "learning_rate": 6.321252584953307e-09, "loss": 0.0001, "num_input_tokens_seen": 133474408, "step": 198070 }, { "epoch": 4.839005203625437, "grad_norm": 0.003499187296256423, "learning_rate": 6.31168284389294e-09, "loss": 0.0, "num_input_tokens_seen": 133477800, "step": 198075 }, { "epoch": 4.8391273544572835, "grad_norm": 0.00037740456173196435, "learning_rate": 6.302120329201411e-09, "loss": 0.0, "num_input_tokens_seen": 133480936, "step": 198080 }, { "epoch": 4.839249505289131, "grad_norm": 0.00026468883152119815, "learning_rate": 6.292565040947995e-09, "loss": 0.0, "num_input_tokens_seen": 133484392, "step": 198085 }, { "epoch": 4.839371656120978, "grad_norm": 0.0002155925176339224, "learning_rate": 6.283016979202416e-09, "loss": 0.0, "num_input_tokens_seen": 133488040, "step": 198090 }, { "epoch": 4.8394938069528255, "grad_norm": 7.394074145850027e-06, "learning_rate": 6.273476144034062e-09, "loss": 0.0, "num_input_tokens_seen": 133491112, "step": 198095 }, { "epoch": 4.839615957784672, "grad_norm": 0.0001947157143149525, "learning_rate": 6.2639425355122126e-09, "loss": 0.0, "num_input_tokens_seen": 133494504, "step": 198100 }, { "epoch": 4.83973810861652, "grad_norm": 0.00013583162217400968, "learning_rate": 6.254416153706254e-09, "loss": 0.0, "num_input_tokens_seen": 133497640, "step": 198105 }, { "epoch": 4.839860259448367, "grad_norm": 0.0065417601726949215, "learning_rate": 6.244896998685467e-09, "loss": 0.0, "num_input_tokens_seen": 133500968, "step": 198110 }, { "epoch": 4.839982410280214, "grad_norm": 3.4384058380965143e-05, "learning_rate": 6.235385070519017e-09, "loss": 0.0, "num_input_tokens_seen": 133504488, "step": 198115 }, { "epoch": 4.840104561112061, "grad_norm": 6.624732122872956e-06, "learning_rate": 6.225880369276293e-09, "loss": 0.0, "num_input_tokens_seen": 133507560, "step": 198120 }, { "epoch": 4.840226711943909, "grad_norm": 0.7948578596115112, "learning_rate": 6.216382895026129e-09, "loss": 0.0001, "num_input_tokens_seen": 133510952, "step": 198125 }, { "epoch": 4.840348862775755, "grad_norm": 0.008793001994490623, "learning_rate": 6.206892647837802e-09, "loss": 0.0, "num_input_tokens_seen": 133513768, "step": 198130 }, { "epoch": 4.840471013607603, "grad_norm": 0.0005286111263558269, "learning_rate": 6.197409627780148e-09, "loss": 0.0, "num_input_tokens_seen": 133517480, "step": 198135 }, { "epoch": 4.84059316443945, "grad_norm": 7.7891701948829e-05, "learning_rate": 6.187933834922332e-09, "loss": 0.0, "num_input_tokens_seen": 133520744, "step": 198140 }, { "epoch": 4.8407153152712965, "grad_norm": 6.683762330794707e-05, "learning_rate": 6.178465269333188e-09, "loss": 0.0, "num_input_tokens_seen": 133523944, "step": 198145 }, { "epoch": 4.840837466103144, "grad_norm": 0.0002326093817828223, "learning_rate": 6.16900393108144e-09, "loss": 0.0, "num_input_tokens_seen": 133527016, "step": 198150 }, { "epoch": 4.840959616934992, "grad_norm": 5.887150109629147e-05, "learning_rate": 6.159549820236032e-09, "loss": 0.0, "num_input_tokens_seen": 133530152, "step": 198155 }, { "epoch": 4.8410817677668385, "grad_norm": 0.00042430704343132675, "learning_rate": 6.150102936865797e-09, "loss": 0.0, "num_input_tokens_seen": 133533352, "step": 198160 }, { "epoch": 4.841203918598685, "grad_norm": 0.0015933191170915961, "learning_rate": 6.140663281039238e-09, "loss": 0.0, "num_input_tokens_seen": 133536488, "step": 198165 }, { "epoch": 4.841326069430533, "grad_norm": 2.208988917118404e-05, "learning_rate": 6.131230852825075e-09, "loss": 0.0, "num_input_tokens_seen": 133539944, "step": 198170 }, { "epoch": 4.84144822026238, "grad_norm": 0.0014280682662501931, "learning_rate": 6.1218056522919225e-09, "loss": 0.0, "num_input_tokens_seen": 133542952, "step": 198175 }, { "epoch": 4.841570371094227, "grad_norm": 0.0014833662426099181, "learning_rate": 6.11238767950839e-09, "loss": 0.0, "num_input_tokens_seen": 133546472, "step": 198180 }, { "epoch": 4.841692521926074, "grad_norm": 3.1954394216882065e-05, "learning_rate": 6.102976934542758e-09, "loss": 0.0, "num_input_tokens_seen": 133549608, "step": 198185 }, { "epoch": 4.841814672757922, "grad_norm": 0.0003832554502878338, "learning_rate": 6.0935734174637485e-09, "loss": 0.0, "num_input_tokens_seen": 133553064, "step": 198190 }, { "epoch": 4.841936823589768, "grad_norm": 8.113325748126954e-05, "learning_rate": 6.084177128339529e-09, "loss": 0.0, "num_input_tokens_seen": 133556776, "step": 198195 }, { "epoch": 4.842058974421616, "grad_norm": 6.306503200903535e-05, "learning_rate": 6.074788067238601e-09, "loss": 0.0, "num_input_tokens_seen": 133560488, "step": 198200 }, { "epoch": 4.842181125253463, "grad_norm": 3.252805254305713e-05, "learning_rate": 6.0654062342290204e-09, "loss": 0.0, "num_input_tokens_seen": 133563496, "step": 198205 }, { "epoch": 4.84230327608531, "grad_norm": 0.0006236277404241264, "learning_rate": 6.056031629379177e-09, "loss": 0.0, "num_input_tokens_seen": 133566504, "step": 198210 }, { "epoch": 4.842425426917157, "grad_norm": 0.001088500372134149, "learning_rate": 6.046664252757239e-09, "loss": 0.0, "num_input_tokens_seen": 133569832, "step": 198215 }, { "epoch": 4.842547577749005, "grad_norm": 3.684737384901382e-05, "learning_rate": 6.037304104431262e-09, "loss": 0.0, "num_input_tokens_seen": 133572904, "step": 198220 }, { "epoch": 4.8426697285808515, "grad_norm": 4.16367947764229e-05, "learning_rate": 6.027951184469416e-09, "loss": 0.0, "num_input_tokens_seen": 133576744, "step": 198225 }, { "epoch": 4.842791879412699, "grad_norm": 0.00012082437751814723, "learning_rate": 6.018605492939533e-09, "loss": 0.0, "num_input_tokens_seen": 133580648, "step": 198230 }, { "epoch": 4.842914030244546, "grad_norm": 0.0002326148678548634, "learning_rate": 6.009267029909892e-09, "loss": 0.0, "num_input_tokens_seen": 133584104, "step": 198235 }, { "epoch": 4.843036181076393, "grad_norm": 0.0003760404360946268, "learning_rate": 5.999935795447997e-09, "loss": 0.0, "num_input_tokens_seen": 133587496, "step": 198240 }, { "epoch": 4.84315833190824, "grad_norm": 0.00015414123481605202, "learning_rate": 5.990611789622013e-09, "loss": 0.0, "num_input_tokens_seen": 133590888, "step": 198245 }, { "epoch": 4.843280482740088, "grad_norm": 1.6417672668467276e-05, "learning_rate": 5.9812950124997765e-09, "loss": 0.0, "num_input_tokens_seen": 133594344, "step": 198250 }, { "epoch": 4.843402633571935, "grad_norm": 1.9476810848573223e-05, "learning_rate": 5.971985464148788e-09, "loss": 0.0, "num_input_tokens_seen": 133597608, "step": 198255 }, { "epoch": 4.843524784403781, "grad_norm": 0.0008444825652986765, "learning_rate": 5.962683144636882e-09, "loss": 0.0, "num_input_tokens_seen": 133600872, "step": 198260 }, { "epoch": 4.843646935235629, "grad_norm": 0.00028568675043061376, "learning_rate": 5.9533880540317826e-09, "loss": 0.0, "num_input_tokens_seen": 133604392, "step": 198265 }, { "epoch": 4.843769086067476, "grad_norm": 0.002278596628457308, "learning_rate": 5.944100192400992e-09, "loss": 0.0, "num_input_tokens_seen": 133607912, "step": 198270 }, { "epoch": 4.843891236899323, "grad_norm": 0.01016619149595499, "learning_rate": 5.93481955981201e-09, "loss": 0.0, "num_input_tokens_seen": 133610920, "step": 198275 }, { "epoch": 4.84401338773117, "grad_norm": 0.00020297674927860498, "learning_rate": 5.92554615633245e-09, "loss": 0.0, "num_input_tokens_seen": 133614760, "step": 198280 }, { "epoch": 4.844135538563018, "grad_norm": 0.00021635602752212435, "learning_rate": 5.916279982029704e-09, "loss": 0.0, "num_input_tokens_seen": 133618664, "step": 198285 }, { "epoch": 4.844257689394865, "grad_norm": 1.835424336604774e-05, "learning_rate": 5.90702103697105e-09, "loss": 0.0, "num_input_tokens_seen": 133621992, "step": 198290 }, { "epoch": 4.844379840226712, "grad_norm": 0.00019787390192504972, "learning_rate": 5.897769321223989e-09, "loss": 0.0, "num_input_tokens_seen": 133625192, "step": 198295 }, { "epoch": 4.844501991058559, "grad_norm": 0.00028561081853695214, "learning_rate": 5.888524834855802e-09, "loss": 0.0, "num_input_tokens_seen": 133628840, "step": 198300 }, { "epoch": 4.844624141890407, "grad_norm": 4.879441257799044e-05, "learning_rate": 5.879287577933545e-09, "loss": 0.0, "num_input_tokens_seen": 133632296, "step": 198305 }, { "epoch": 4.844746292722253, "grad_norm": 7.078771886881441e-05, "learning_rate": 5.870057550524499e-09, "loss": 0.0, "num_input_tokens_seen": 133635560, "step": 198310 }, { "epoch": 4.844868443554101, "grad_norm": 0.00010384644701844081, "learning_rate": 5.860834752695831e-09, "loss": 0.0, "num_input_tokens_seen": 133638568, "step": 198315 }, { "epoch": 4.844990594385948, "grad_norm": 0.0012810814660042524, "learning_rate": 5.851619184514489e-09, "loss": 0.0, "num_input_tokens_seen": 133642024, "step": 198320 }, { "epoch": 4.845112745217795, "grad_norm": 0.00021210868726484478, "learning_rate": 5.842410846047641e-09, "loss": 0.0, "num_input_tokens_seen": 133645544, "step": 198325 }, { "epoch": 4.845234896049642, "grad_norm": 0.00038419957854785025, "learning_rate": 5.833209737362121e-09, "loss": 0.0, "num_input_tokens_seen": 133648936, "step": 198330 }, { "epoch": 4.845357046881489, "grad_norm": 0.00927771721035242, "learning_rate": 5.8240158585249886e-09, "loss": 0.0, "num_input_tokens_seen": 133652584, "step": 198335 }, { "epoch": 4.8454791977133365, "grad_norm": 6.750579632353038e-05, "learning_rate": 5.814829209602856e-09, "loss": 0.0, "num_input_tokens_seen": 133656424, "step": 198340 }, { "epoch": 4.845601348545183, "grad_norm": 0.0001237114774994552, "learning_rate": 5.805649790662892e-09, "loss": 0.0, "num_input_tokens_seen": 133660328, "step": 198345 }, { "epoch": 4.845723499377031, "grad_norm": 0.00010960324289044365, "learning_rate": 5.796477601771488e-09, "loss": 0.0, "num_input_tokens_seen": 133664040, "step": 198350 }, { "epoch": 4.845845650208878, "grad_norm": 0.0028228070586919785, "learning_rate": 5.78731264299559e-09, "loss": 0.0, "num_input_tokens_seen": 133667496, "step": 198355 }, { "epoch": 4.845967801040725, "grad_norm": 9.504000627202913e-05, "learning_rate": 5.7781549144017e-09, "loss": 0.0, "num_input_tokens_seen": 133671464, "step": 198360 }, { "epoch": 4.846089951872572, "grad_norm": 0.001479696249589324, "learning_rate": 5.769004416056544e-09, "loss": 0.0, "num_input_tokens_seen": 133674920, "step": 198365 }, { "epoch": 4.84621210270442, "grad_norm": 0.00032303182524628937, "learning_rate": 5.759861148026624e-09, "loss": 0.0, "num_input_tokens_seen": 133678312, "step": 198370 }, { "epoch": 4.846334253536266, "grad_norm": 0.003984017763286829, "learning_rate": 5.75072511037833e-09, "loss": 0.0, "num_input_tokens_seen": 133681832, "step": 198375 }, { "epoch": 4.846456404368114, "grad_norm": 6.143693462945521e-05, "learning_rate": 5.741596303178276e-09, "loss": 0.0, "num_input_tokens_seen": 133685288, "step": 198380 }, { "epoch": 4.846578555199961, "grad_norm": 0.00015785187133587897, "learning_rate": 5.732474726492631e-09, "loss": 0.0, "num_input_tokens_seen": 133688616, "step": 198385 }, { "epoch": 4.846700706031808, "grad_norm": 2.5543968149577267e-05, "learning_rate": 5.723360380388009e-09, "loss": 0.0, "num_input_tokens_seen": 133691880, "step": 198390 }, { "epoch": 4.846822856863655, "grad_norm": 0.000297121936455369, "learning_rate": 5.714253264930357e-09, "loss": 0.0, "num_input_tokens_seen": 133695400, "step": 198395 }, { "epoch": 4.846945007695503, "grad_norm": 3.050006489502266e-05, "learning_rate": 5.7051533801861786e-09, "loss": 0.0, "num_input_tokens_seen": 133698664, "step": 198400 }, { "epoch": 4.8470671585273495, "grad_norm": 0.000478239671792835, "learning_rate": 5.696060726221641e-09, "loss": 0.0, "num_input_tokens_seen": 133702248, "step": 198405 }, { "epoch": 4.847189309359197, "grad_norm": 0.00010278217087034136, "learning_rate": 5.686975303102693e-09, "loss": 0.0, "num_input_tokens_seen": 133705832, "step": 198410 }, { "epoch": 4.847311460191044, "grad_norm": 3.768475289689377e-05, "learning_rate": 5.677897110895502e-09, "loss": 0.0818, "num_input_tokens_seen": 133709224, "step": 198415 }, { "epoch": 4.8474336110228915, "grad_norm": 0.00011841404921142384, "learning_rate": 5.6688261496661286e-09, "loss": 0.0, "num_input_tokens_seen": 133712488, "step": 198420 }, { "epoch": 4.847555761854738, "grad_norm": 4.986863132216968e-06, "learning_rate": 5.659762419480407e-09, "loss": 0.0, "num_input_tokens_seen": 133715880, "step": 198425 }, { "epoch": 4.847677912686585, "grad_norm": 0.00031835006666369736, "learning_rate": 5.650705920404397e-09, "loss": 0.0, "num_input_tokens_seen": 133719016, "step": 198430 }, { "epoch": 4.847800063518433, "grad_norm": 6.428364486055216e-06, "learning_rate": 5.641656652503934e-09, "loss": 0.0, "num_input_tokens_seen": 133722728, "step": 198435 }, { "epoch": 4.847922214350279, "grad_norm": 0.0008753464790061116, "learning_rate": 5.632614615844744e-09, "loss": 0.0, "num_input_tokens_seen": 133725864, "step": 198440 }, { "epoch": 4.848044365182127, "grad_norm": 0.000149292332935147, "learning_rate": 5.6235798104926625e-09, "loss": 0.0, "num_input_tokens_seen": 133729064, "step": 198445 }, { "epoch": 4.848166516013974, "grad_norm": 4.988710497855209e-05, "learning_rate": 5.614552236513304e-09, "loss": 0.0, "num_input_tokens_seen": 133732200, "step": 198450 }, { "epoch": 4.848288666845821, "grad_norm": 0.0008496578666381538, "learning_rate": 5.605531893972393e-09, "loss": 0.0, "num_input_tokens_seen": 133735784, "step": 198455 }, { "epoch": 4.848410817677668, "grad_norm": 8.90137926035095e-06, "learning_rate": 5.596518782935655e-09, "loss": 0.0, "num_input_tokens_seen": 133739304, "step": 198460 }, { "epoch": 4.848532968509516, "grad_norm": 0.0003163012443110347, "learning_rate": 5.587512903468372e-09, "loss": 0.0, "num_input_tokens_seen": 133742696, "step": 198465 }, { "epoch": 4.8486551193413625, "grad_norm": 3.769802060560323e-05, "learning_rate": 5.578514255636158e-09, "loss": 0.0, "num_input_tokens_seen": 133746472, "step": 198470 }, { "epoch": 4.84877727017321, "grad_norm": 0.0006406178581528366, "learning_rate": 5.5695228395045145e-09, "loss": 0.0, "num_input_tokens_seen": 133749416, "step": 198475 }, { "epoch": 4.848899421005057, "grad_norm": 4.202740819891915e-05, "learning_rate": 5.560538655138724e-09, "loss": 0.0, "num_input_tokens_seen": 133753000, "step": 198480 }, { "epoch": 4.8490215718369045, "grad_norm": 0.00010983301035594195, "learning_rate": 5.5515617026041796e-09, "loss": 0.0, "num_input_tokens_seen": 133756008, "step": 198485 }, { "epoch": 4.849143722668751, "grad_norm": 0.0002412906615063548, "learning_rate": 5.542591981966049e-09, "loss": 0.0, "num_input_tokens_seen": 133759528, "step": 198490 }, { "epoch": 4.849265873500599, "grad_norm": 0.0010208688909187913, "learning_rate": 5.5336294932898376e-09, "loss": 0.0, "num_input_tokens_seen": 133762920, "step": 198495 }, { "epoch": 4.849388024332446, "grad_norm": 2.516420317988377e-05, "learning_rate": 5.5246742366404915e-09, "loss": 0.0, "num_input_tokens_seen": 133766632, "step": 198500 }, { "epoch": 4.849510175164292, "grad_norm": 0.0001550140732433647, "learning_rate": 5.515726212083071e-09, "loss": 0.0, "num_input_tokens_seen": 133769832, "step": 198505 }, { "epoch": 4.84963232599614, "grad_norm": 1.6459995094919577e-05, "learning_rate": 5.506785419682969e-09, "loss": 0.0, "num_input_tokens_seen": 133773224, "step": 198510 }, { "epoch": 4.849754476827988, "grad_norm": 0.00033926600008271635, "learning_rate": 5.49785185950491e-09, "loss": 0.0, "num_input_tokens_seen": 133776616, "step": 198515 }, { "epoch": 4.849876627659834, "grad_norm": 8.666397479828447e-05, "learning_rate": 5.488925531613953e-09, "loss": 0.0, "num_input_tokens_seen": 133779752, "step": 198520 }, { "epoch": 4.849998778491681, "grad_norm": 0.00028835254488512874, "learning_rate": 5.480006436075046e-09, "loss": 0.0, "num_input_tokens_seen": 133783528, "step": 198525 }, { "epoch": 4.850120929323529, "grad_norm": 0.0001398788153892383, "learning_rate": 5.471094572953028e-09, "loss": 0.0, "num_input_tokens_seen": 133786920, "step": 198530 }, { "epoch": 4.8502430801553755, "grad_norm": 0.00022068715770728886, "learning_rate": 5.462189942312734e-09, "loss": 0.0, "num_input_tokens_seen": 133790248, "step": 198535 }, { "epoch": 4.850365230987223, "grad_norm": 0.0027091875672340393, "learning_rate": 5.453292544218779e-09, "loss": 0.0, "num_input_tokens_seen": 133793576, "step": 198540 }, { "epoch": 4.85048738181907, "grad_norm": 0.0008957352256402373, "learning_rate": 5.444402378736113e-09, "loss": 0.0001, "num_input_tokens_seen": 133797224, "step": 198545 }, { "epoch": 4.8506095326509175, "grad_norm": 0.0001257900585187599, "learning_rate": 5.435519445929237e-09, "loss": 0.1013, "num_input_tokens_seen": 133800360, "step": 198550 }, { "epoch": 4.850731683482764, "grad_norm": 0.00018330544116906822, "learning_rate": 5.426643745862658e-09, "loss": 0.0, "num_input_tokens_seen": 133803880, "step": 198555 }, { "epoch": 4.850853834314612, "grad_norm": 0.006341997068375349, "learning_rate": 5.4177752786011e-09, "loss": 0.0, "num_input_tokens_seen": 133807400, "step": 198560 }, { "epoch": 4.850975985146459, "grad_norm": 6.26806213404052e-05, "learning_rate": 5.408914044209068e-09, "loss": 0.0, "num_input_tokens_seen": 133810984, "step": 198565 }, { "epoch": 4.851098135978306, "grad_norm": 0.0001525198749732226, "learning_rate": 5.400060042750843e-09, "loss": 0.0, "num_input_tokens_seen": 133814632, "step": 198570 }, { "epoch": 4.851220286810153, "grad_norm": 0.00013128323189448565, "learning_rate": 5.391213274290929e-09, "loss": 0.0, "num_input_tokens_seen": 133817896, "step": 198575 }, { "epoch": 4.851342437642001, "grad_norm": 0.0014685146743431687, "learning_rate": 5.382373738893609e-09, "loss": 0.0001, "num_input_tokens_seen": 133820840, "step": 198580 }, { "epoch": 4.851464588473847, "grad_norm": 8.187860657926649e-05, "learning_rate": 5.3735414366232745e-09, "loss": 0.0, "num_input_tokens_seen": 133824232, "step": 198585 }, { "epoch": 4.851586739305695, "grad_norm": 1.9650955437100492e-05, "learning_rate": 5.3647163675439864e-09, "loss": 0.0, "num_input_tokens_seen": 133827688, "step": 198590 }, { "epoch": 4.851708890137542, "grad_norm": 0.00018294328765477985, "learning_rate": 5.3558985317200265e-09, "loss": 0.0, "num_input_tokens_seen": 133831272, "step": 198595 }, { "epoch": 4.8518310409693886, "grad_norm": 0.00021778763039037585, "learning_rate": 5.347087929215455e-09, "loss": 0.0004, "num_input_tokens_seen": 133835048, "step": 198600 }, { "epoch": 4.851953191801236, "grad_norm": 0.0007337976712733507, "learning_rate": 5.338284560094442e-09, "loss": 0.0, "num_input_tokens_seen": 133837992, "step": 198605 }, { "epoch": 4.852075342633084, "grad_norm": 4.622937194653787e-05, "learning_rate": 5.3294884244208246e-09, "loss": 0.0, "num_input_tokens_seen": 133841256, "step": 198610 }, { "epoch": 4.852197493464931, "grad_norm": 2.2683816496282816e-05, "learning_rate": 5.320699522258887e-09, "loss": 0.0, "num_input_tokens_seen": 133844520, "step": 198615 }, { "epoch": 4.852319644296777, "grad_norm": 4.410344627103768e-05, "learning_rate": 5.311917853672243e-09, "loss": 0.0, "num_input_tokens_seen": 133847848, "step": 198620 }, { "epoch": 4.852441795128625, "grad_norm": 0.0010950923897325993, "learning_rate": 5.303143418724843e-09, "loss": 0.0, "num_input_tokens_seen": 133851496, "step": 198625 }, { "epoch": 4.852563945960472, "grad_norm": 0.0003131901612505317, "learning_rate": 5.294376217480634e-09, "loss": 0.0, "num_input_tokens_seen": 133854568, "step": 198630 }, { "epoch": 4.852686096792319, "grad_norm": 0.00022891550906933844, "learning_rate": 5.285616250003233e-09, "loss": 0.0001, "num_input_tokens_seen": 133858024, "step": 198635 }, { "epoch": 4.852808247624166, "grad_norm": 0.010083446279168129, "learning_rate": 5.276863516356367e-09, "loss": 0.0, "num_input_tokens_seen": 133861224, "step": 198640 }, { "epoch": 4.852930398456014, "grad_norm": 0.00024773701443336904, "learning_rate": 5.268118016603651e-09, "loss": 0.0, "num_input_tokens_seen": 133864616, "step": 198645 }, { "epoch": 4.8530525492878605, "grad_norm": 0.00037822252488695085, "learning_rate": 5.259379750808812e-09, "loss": 0.0, "num_input_tokens_seen": 133867688, "step": 198650 }, { "epoch": 4.853174700119708, "grad_norm": 0.0006301982211880386, "learning_rate": 5.250648719035245e-09, "loss": 0.0, "num_input_tokens_seen": 133871336, "step": 198655 }, { "epoch": 4.853296850951555, "grad_norm": 0.00013490190031006932, "learning_rate": 5.241924921346564e-09, "loss": 0.0, "num_input_tokens_seen": 133875432, "step": 198660 }, { "epoch": 4.8534190017834025, "grad_norm": 0.000527222000528127, "learning_rate": 5.233208357806163e-09, "loss": 0.0, "num_input_tokens_seen": 133878760, "step": 198665 }, { "epoch": 4.853541152615249, "grad_norm": 0.00030499850981868804, "learning_rate": 5.224499028477436e-09, "loss": 0.0, "num_input_tokens_seen": 133882024, "step": 198670 }, { "epoch": 4.853663303447097, "grad_norm": 0.0008157443953678012, "learning_rate": 5.215796933423666e-09, "loss": 0.0, "num_input_tokens_seen": 133885544, "step": 198675 }, { "epoch": 4.853785454278944, "grad_norm": 0.0001473993033869192, "learning_rate": 5.207102072708247e-09, "loss": 0.0, "num_input_tokens_seen": 133888808, "step": 198680 }, { "epoch": 4.853907605110791, "grad_norm": 0.0014335029991343617, "learning_rate": 5.1984144463943505e-09, "loss": 0.0, "num_input_tokens_seen": 133892648, "step": 198685 }, { "epoch": 4.854029755942638, "grad_norm": 0.03337578475475311, "learning_rate": 5.1897340545451474e-09, "loss": 0.0, "num_input_tokens_seen": 133896360, "step": 198690 }, { "epoch": 4.854151906774485, "grad_norm": 0.00011968166654696688, "learning_rate": 5.181060897223699e-09, "loss": 0.0, "num_input_tokens_seen": 133899816, "step": 198695 }, { "epoch": 4.854274057606332, "grad_norm": 0.00016990097356028855, "learning_rate": 5.172394974493177e-09, "loss": 0.0, "num_input_tokens_seen": 133903144, "step": 198700 }, { "epoch": 4.854396208438179, "grad_norm": 9.55967916524969e-05, "learning_rate": 5.1637362864166424e-09, "loss": 0.0, "num_input_tokens_seen": 133906408, "step": 198705 }, { "epoch": 4.854518359270027, "grad_norm": 0.0006632882286794484, "learning_rate": 5.155084833056933e-09, "loss": 0.0, "num_input_tokens_seen": 133909672, "step": 198710 }, { "epoch": 4.8546405101018735, "grad_norm": 0.0002882846456486732, "learning_rate": 5.146440614476999e-09, "loss": 0.0, "num_input_tokens_seen": 133913192, "step": 198715 }, { "epoch": 4.854762660933721, "grad_norm": 8.525528392056003e-05, "learning_rate": 5.13780363073979e-09, "loss": 0.0, "num_input_tokens_seen": 133916584, "step": 198720 }, { "epoch": 4.854884811765568, "grad_norm": 0.00025111655122600496, "learning_rate": 5.129173881908033e-09, "loss": 0.0, "num_input_tokens_seen": 133919720, "step": 198725 }, { "epoch": 4.8550069625974155, "grad_norm": 0.0003259789664298296, "learning_rate": 5.120551368044568e-09, "loss": 0.0, "num_input_tokens_seen": 133922856, "step": 198730 }, { "epoch": 4.855129113429262, "grad_norm": 3.2461808586958796e-05, "learning_rate": 5.11193608921201e-09, "loss": 0.0, "num_input_tokens_seen": 133926312, "step": 198735 }, { "epoch": 4.85525126426111, "grad_norm": 8.612728561274707e-05, "learning_rate": 5.103328045472977e-09, "loss": 0.0, "num_input_tokens_seen": 133929512, "step": 198740 }, { "epoch": 4.855373415092957, "grad_norm": 0.00030700431670993567, "learning_rate": 5.094727236890195e-09, "loss": 0.0, "num_input_tokens_seen": 133932968, "step": 198745 }, { "epoch": 4.855495565924804, "grad_norm": 0.0037068724632263184, "learning_rate": 5.086133663526171e-09, "loss": 0.0695, "num_input_tokens_seen": 133936360, "step": 198750 }, { "epoch": 4.855617716756651, "grad_norm": 1.6346239135600626e-05, "learning_rate": 5.0775473254434094e-09, "loss": 0.0, "num_input_tokens_seen": 133939560, "step": 198755 }, { "epoch": 4.855739867588499, "grad_norm": 0.0004788915684912354, "learning_rate": 5.068968222704307e-09, "loss": 0.0, "num_input_tokens_seen": 133942952, "step": 198760 }, { "epoch": 4.855862018420345, "grad_norm": 0.004332916811108589, "learning_rate": 5.0603963553711435e-09, "loss": 0.0, "num_input_tokens_seen": 133945960, "step": 198765 }, { "epoch": 4.855984169252192, "grad_norm": 6.013052552589215e-05, "learning_rate": 5.051831723506539e-09, "loss": 0.0, "num_input_tokens_seen": 133949096, "step": 198770 }, { "epoch": 4.85610632008404, "grad_norm": 0.0014203329337760806, "learning_rate": 5.043274327172553e-09, "loss": 0.0, "num_input_tokens_seen": 133952680, "step": 198775 }, { "epoch": 4.856228470915887, "grad_norm": 0.0002815816842485219, "learning_rate": 5.034724166431581e-09, "loss": 0.0, "num_input_tokens_seen": 133955816, "step": 198780 }, { "epoch": 4.856350621747734, "grad_norm": 0.008390041999518871, "learning_rate": 5.026181241345573e-09, "loss": 0.0, "num_input_tokens_seen": 133959080, "step": 198785 }, { "epoch": 4.856472772579581, "grad_norm": 4.61300733149983e-05, "learning_rate": 5.017645551976812e-09, "loss": 0.0, "num_input_tokens_seen": 133962344, "step": 198790 }, { "epoch": 4.8565949234114285, "grad_norm": 0.0011516953818500042, "learning_rate": 5.009117098387472e-09, "loss": 0.0, "num_input_tokens_seen": 133965544, "step": 198795 }, { "epoch": 4.856717074243275, "grad_norm": 1.3971000043966342e-05, "learning_rate": 5.000595880639391e-09, "loss": 0.0, "num_input_tokens_seen": 133968680, "step": 198800 }, { "epoch": 4.856839225075123, "grad_norm": 0.00022225634893402457, "learning_rate": 4.9920818987945205e-09, "loss": 0.0, "num_input_tokens_seen": 133972072, "step": 198805 }, { "epoch": 4.85696137590697, "grad_norm": 0.0014802911318838596, "learning_rate": 4.98357515291492e-09, "loss": 0.0, "num_input_tokens_seen": 133975976, "step": 198810 }, { "epoch": 4.857083526738817, "grad_norm": 0.0018839415861293674, "learning_rate": 4.975075643062321e-09, "loss": 0.0, "num_input_tokens_seen": 133978920, "step": 198815 }, { "epoch": 4.857205677570664, "grad_norm": 34.527950286865234, "learning_rate": 4.966583369298782e-09, "loss": 0.0706, "num_input_tokens_seen": 133982312, "step": 198820 }, { "epoch": 4.857327828402512, "grad_norm": 4.835088475374505e-05, "learning_rate": 4.9580983316857005e-09, "loss": 0.0, "num_input_tokens_seen": 133986216, "step": 198825 }, { "epoch": 4.857449979234358, "grad_norm": 9.118052548728883e-05, "learning_rate": 4.9496205302850256e-09, "loss": 0.0, "num_input_tokens_seen": 133989288, "step": 198830 }, { "epoch": 4.857572130066206, "grad_norm": 5.738848813052755e-06, "learning_rate": 4.941149965158375e-09, "loss": 0.0, "num_input_tokens_seen": 133992680, "step": 198835 }, { "epoch": 4.857694280898053, "grad_norm": 7.152935722842813e-05, "learning_rate": 4.932686636367256e-09, "loss": 0.0, "num_input_tokens_seen": 133996008, "step": 198840 }, { "epoch": 4.8578164317299, "grad_norm": 5.278225944493897e-05, "learning_rate": 4.924230543973284e-09, "loss": 0.0, "num_input_tokens_seen": 133999528, "step": 198845 }, { "epoch": 4.857938582561747, "grad_norm": 0.0007009048713371158, "learning_rate": 4.915781688037967e-09, "loss": 0.0, "num_input_tokens_seen": 134002984, "step": 198850 }, { "epoch": 4.858060733393595, "grad_norm": 0.00017325843509752303, "learning_rate": 4.9073400686228115e-09, "loss": 0.0, "num_input_tokens_seen": 134007016, "step": 198855 }, { "epoch": 4.8581828842254415, "grad_norm": 2.3501368559664115e-05, "learning_rate": 4.89890568578899e-09, "loss": 0.0, "num_input_tokens_seen": 134010216, "step": 198860 }, { "epoch": 4.858305035057288, "grad_norm": 0.0004582449037116021, "learning_rate": 4.890478539598008e-09, "loss": 0.0, "num_input_tokens_seen": 134013416, "step": 198865 }, { "epoch": 4.858427185889136, "grad_norm": 0.00030273612355813384, "learning_rate": 4.8820586301112635e-09, "loss": 0.0, "num_input_tokens_seen": 134016680, "step": 198870 }, { "epoch": 4.8585493367209835, "grad_norm": 8.884895942173898e-05, "learning_rate": 4.873645957389705e-09, "loss": 0.0, "num_input_tokens_seen": 134019816, "step": 198875 }, { "epoch": 4.85867148755283, "grad_norm": 8.290004916489124e-05, "learning_rate": 4.865240521494729e-09, "loss": 0.0, "num_input_tokens_seen": 134023400, "step": 198880 }, { "epoch": 4.858793638384677, "grad_norm": 0.0005973275983706117, "learning_rate": 4.8568423224872866e-09, "loss": 0.0134, "num_input_tokens_seen": 134026664, "step": 198885 }, { "epoch": 4.858915789216525, "grad_norm": 0.0023913774639368057, "learning_rate": 4.848451360428551e-09, "loss": 0.0, "num_input_tokens_seen": 134029736, "step": 198890 }, { "epoch": 4.859037940048371, "grad_norm": 0.0021553239785134792, "learning_rate": 4.840067635379697e-09, "loss": 0.0, "num_input_tokens_seen": 134032808, "step": 198895 }, { "epoch": 4.859160090880219, "grad_norm": 0.0007805172353982925, "learning_rate": 4.83169114740134e-09, "loss": 0.0, "num_input_tokens_seen": 134036392, "step": 198900 }, { "epoch": 4.859282241712066, "grad_norm": 0.00021608646784443408, "learning_rate": 4.823321896554766e-09, "loss": 0.0, "num_input_tokens_seen": 134040168, "step": 198905 }, { "epoch": 4.859404392543913, "grad_norm": 2.203617441409733e-05, "learning_rate": 4.814959882900482e-09, "loss": 0.0, "num_input_tokens_seen": 134043624, "step": 198910 }, { "epoch": 4.85952654337576, "grad_norm": 0.0034446585923433304, "learning_rate": 4.806605106499661e-09, "loss": 0.0, "num_input_tokens_seen": 134046440, "step": 198915 }, { "epoch": 4.859648694207608, "grad_norm": 0.32102784514427185, "learning_rate": 4.7982575674128115e-09, "loss": 0.0001, "num_input_tokens_seen": 134049384, "step": 198920 }, { "epoch": 4.859770845039455, "grad_norm": 0.00032789475517347455, "learning_rate": 4.78991726570066e-09, "loss": 0.0, "num_input_tokens_seen": 134052392, "step": 198925 }, { "epoch": 4.859892995871302, "grad_norm": 0.00021464366000145674, "learning_rate": 4.7815842014239385e-09, "loss": 0.0, "num_input_tokens_seen": 134055976, "step": 198930 }, { "epoch": 4.860015146703149, "grad_norm": 4.051731139043113e-06, "learning_rate": 4.7732583746432635e-09, "loss": 0.0, "num_input_tokens_seen": 134060008, "step": 198935 }, { "epoch": 4.860137297534997, "grad_norm": 0.00031672645127400756, "learning_rate": 4.764939785419031e-09, "loss": 0.0, "num_input_tokens_seen": 134063400, "step": 198940 }, { "epoch": 4.860259448366843, "grad_norm": 0.00017728647799231112, "learning_rate": 4.756628433811971e-09, "loss": 0.0, "num_input_tokens_seen": 134066920, "step": 198945 }, { "epoch": 4.860381599198691, "grad_norm": 8.471917681163177e-06, "learning_rate": 4.7483243198823685e-09, "loss": 0.0, "num_input_tokens_seen": 134070312, "step": 198950 }, { "epoch": 4.860503750030538, "grad_norm": 0.00016513862647116184, "learning_rate": 4.740027443690509e-09, "loss": 0.0, "num_input_tokens_seen": 134073576, "step": 198955 }, { "epoch": 4.860625900862384, "grad_norm": 2.3647400666959584e-05, "learning_rate": 4.731737805297009e-09, "loss": 0.0, "num_input_tokens_seen": 134076840, "step": 198960 }, { "epoch": 4.860748051694232, "grad_norm": 0.00016174823394976556, "learning_rate": 4.723455404761933e-09, "loss": 0.0001, "num_input_tokens_seen": 134080040, "step": 198965 }, { "epoch": 4.860870202526079, "grad_norm": 0.00017587091133464128, "learning_rate": 4.715180242145678e-09, "loss": 0.0, "num_input_tokens_seen": 134082984, "step": 198970 }, { "epoch": 4.8609923533579265, "grad_norm": 0.00035376264713704586, "learning_rate": 4.706912317508305e-09, "loss": 0.0, "num_input_tokens_seen": 134086440, "step": 198975 }, { "epoch": 4.861114504189773, "grad_norm": 0.0009537230944260955, "learning_rate": 4.698651630909878e-09, "loss": 0.0, "num_input_tokens_seen": 134089768, "step": 198980 }, { "epoch": 4.861236655021621, "grad_norm": 7.79301262809895e-05, "learning_rate": 4.690398182410682e-09, "loss": 0.0, "num_input_tokens_seen": 134092968, "step": 198985 }, { "epoch": 4.861358805853468, "grad_norm": 4.4560383685166016e-05, "learning_rate": 4.682151972070558e-09, "loss": 0.0, "num_input_tokens_seen": 134096168, "step": 198990 }, { "epoch": 4.861480956685315, "grad_norm": 3.626396573963575e-05, "learning_rate": 4.673912999949459e-09, "loss": 0.0, "num_input_tokens_seen": 134099368, "step": 198995 }, { "epoch": 4.861603107517162, "grad_norm": 0.0009027646156027913, "learning_rate": 4.665681266107446e-09, "loss": 0.0, "num_input_tokens_seen": 134103528, "step": 199000 }, { "epoch": 4.86172525834901, "grad_norm": 0.00018381788686383516, "learning_rate": 4.657456770604362e-09, "loss": 0.0, "num_input_tokens_seen": 134106600, "step": 199005 }, { "epoch": 4.861847409180856, "grad_norm": 0.0003359086695127189, "learning_rate": 4.649239513499936e-09, "loss": 0.0, "num_input_tokens_seen": 134109864, "step": 199010 }, { "epoch": 4.861969560012704, "grad_norm": 0.012157008051872253, "learning_rate": 4.641029494853899e-09, "loss": 0.0, "num_input_tokens_seen": 134113448, "step": 199015 }, { "epoch": 4.862091710844551, "grad_norm": 0.00029743279446847737, "learning_rate": 4.632826714725979e-09, "loss": 0.0, "num_input_tokens_seen": 134116648, "step": 199020 }, { "epoch": 4.862213861676398, "grad_norm": 0.03120971843600273, "learning_rate": 4.624631173176019e-09, "loss": 0.0, "num_input_tokens_seen": 134120296, "step": 199025 }, { "epoch": 4.862336012508245, "grad_norm": 0.0002550722274463624, "learning_rate": 4.616442870263304e-09, "loss": 0.0, "num_input_tokens_seen": 134123624, "step": 199030 }, { "epoch": 4.862458163340093, "grad_norm": 0.00016976814367808402, "learning_rate": 4.608261806047675e-09, "loss": 0.0, "num_input_tokens_seen": 134126824, "step": 199035 }, { "epoch": 4.8625803141719395, "grad_norm": 0.00021611245756503195, "learning_rate": 4.600087980588418e-09, "loss": 0.0, "num_input_tokens_seen": 134130152, "step": 199040 }, { "epoch": 4.862702465003787, "grad_norm": 0.00029622766305692494, "learning_rate": 4.591921393945042e-09, "loss": 0.0, "num_input_tokens_seen": 134133800, "step": 199045 }, { "epoch": 4.862824615835634, "grad_norm": 0.00036044183070771396, "learning_rate": 4.583762046177053e-09, "loss": 0.0, "num_input_tokens_seen": 134137256, "step": 199050 }, { "epoch": 4.862946766667481, "grad_norm": 0.006442390847951174, "learning_rate": 4.575609937343517e-09, "loss": 0.0, "num_input_tokens_seen": 134140648, "step": 199055 }, { "epoch": 4.863068917499328, "grad_norm": 0.0006749753374606371, "learning_rate": 4.567465067504051e-09, "loss": 0.0, "num_input_tokens_seen": 134144040, "step": 199060 }, { "epoch": 4.863191068331175, "grad_norm": 3.755068973987363e-05, "learning_rate": 4.559327436717608e-09, "loss": 0.0, "num_input_tokens_seen": 134148200, "step": 199065 }, { "epoch": 4.863313219163023, "grad_norm": 0.0005150276701897383, "learning_rate": 4.5511970450434755e-09, "loss": 0.0, "num_input_tokens_seen": 134151528, "step": 199070 }, { "epoch": 4.863435369994869, "grad_norm": 0.0005858474760316312, "learning_rate": 4.543073892540828e-09, "loss": 0.0, "num_input_tokens_seen": 134154664, "step": 199075 }, { "epoch": 4.863557520826717, "grad_norm": 0.00026716565480455756, "learning_rate": 4.534957979268728e-09, "loss": 0.0, "num_input_tokens_seen": 134157672, "step": 199080 }, { "epoch": 4.863679671658564, "grad_norm": 0.00024082417075987905, "learning_rate": 4.526849305286129e-09, "loss": 0.0, "num_input_tokens_seen": 134160744, "step": 199085 }, { "epoch": 4.863801822490411, "grad_norm": 0.0014010306913405657, "learning_rate": 4.518747870651985e-09, "loss": 0.0, "num_input_tokens_seen": 134164392, "step": 199090 }, { "epoch": 4.863923973322258, "grad_norm": 8.185812475858256e-06, "learning_rate": 4.510653675425358e-09, "loss": 0.0, "num_input_tokens_seen": 134167208, "step": 199095 }, { "epoch": 4.864046124154106, "grad_norm": 0.0003967114898841828, "learning_rate": 4.502566719664869e-09, "loss": 0.0, "num_input_tokens_seen": 134170280, "step": 199100 }, { "epoch": 4.8641682749859525, "grad_norm": 0.0006387066678144038, "learning_rate": 4.494487003429581e-09, "loss": 0.0, "num_input_tokens_seen": 134173736, "step": 199105 }, { "epoch": 4.8642904258178, "grad_norm": 5.824856089020614e-06, "learning_rate": 4.486414526778115e-09, "loss": 0.0, "num_input_tokens_seen": 134177000, "step": 199110 }, { "epoch": 4.864412576649647, "grad_norm": 0.00041367491940036416, "learning_rate": 4.478349289769201e-09, "loss": 0.0, "num_input_tokens_seen": 134180904, "step": 199115 }, { "epoch": 4.8645347274814945, "grad_norm": 0.00021077069686725736, "learning_rate": 4.470291292461459e-09, "loss": 0.0, "num_input_tokens_seen": 134184168, "step": 199120 }, { "epoch": 4.864656878313341, "grad_norm": 0.0019649332389235497, "learning_rate": 4.462240534913508e-09, "loss": 0.0, "num_input_tokens_seen": 134187944, "step": 199125 }, { "epoch": 4.864779029145188, "grad_norm": 3.090451718890108e-05, "learning_rate": 4.45419701718397e-09, "loss": 0.0, "num_input_tokens_seen": 134191208, "step": 199130 }, { "epoch": 4.864901179977036, "grad_norm": 0.00044984908890910447, "learning_rate": 4.446160739331239e-09, "loss": 0.0, "num_input_tokens_seen": 134194472, "step": 199135 }, { "epoch": 4.865023330808883, "grad_norm": 0.0001340339076705277, "learning_rate": 4.4381317014138274e-09, "loss": 0.0, "num_input_tokens_seen": 134197480, "step": 199140 }, { "epoch": 4.86514548164073, "grad_norm": 0.0010863590287044644, "learning_rate": 4.4301099034901315e-09, "loss": 0.0, "num_input_tokens_seen": 134201128, "step": 199145 }, { "epoch": 4.865267632472577, "grad_norm": 0.00024744641268625855, "learning_rate": 4.422095345618437e-09, "loss": 0.0, "num_input_tokens_seen": 134204968, "step": 199150 }, { "epoch": 4.865389783304424, "grad_norm": 3.0349398002726957e-05, "learning_rate": 4.414088027857032e-09, "loss": 0.0, "num_input_tokens_seen": 134208616, "step": 199155 }, { "epoch": 4.865511934136271, "grad_norm": 0.00012485562183428556, "learning_rate": 4.406087950264092e-09, "loss": 0.0, "num_input_tokens_seen": 134211752, "step": 199160 }, { "epoch": 4.865634084968119, "grad_norm": 2.2154621547088027e-05, "learning_rate": 4.398095112898015e-09, "loss": 0.0, "num_input_tokens_seen": 134215144, "step": 199165 }, { "epoch": 4.8657562357999655, "grad_norm": 0.0002064045111183077, "learning_rate": 4.390109515816642e-09, "loss": 0.0, "num_input_tokens_seen": 134218728, "step": 199170 }, { "epoch": 4.865878386631813, "grad_norm": 3.425988688832149e-05, "learning_rate": 4.3821311590781505e-09, "loss": 0.0, "num_input_tokens_seen": 134222440, "step": 199175 }, { "epoch": 4.86600053746366, "grad_norm": 8.955002704169601e-05, "learning_rate": 4.374160042740716e-09, "loss": 0.0, "num_input_tokens_seen": 134225640, "step": 199180 }, { "epoch": 4.8661226882955075, "grad_norm": 1.300713665841613e-05, "learning_rate": 4.366196166862179e-09, "loss": 0.0, "num_input_tokens_seen": 134229096, "step": 199185 }, { "epoch": 4.866244839127354, "grad_norm": 0.00013864053471479565, "learning_rate": 4.358239531500385e-09, "loss": 0.0, "num_input_tokens_seen": 134232360, "step": 199190 }, { "epoch": 4.866366989959202, "grad_norm": 0.0013332118978723884, "learning_rate": 4.3502901367132864e-09, "loss": 0.0, "num_input_tokens_seen": 134235432, "step": 199195 }, { "epoch": 4.866489140791049, "grad_norm": 6.434672104660422e-05, "learning_rate": 4.342347982558614e-09, "loss": 0.0, "num_input_tokens_seen": 134239400, "step": 199200 }, { "epoch": 4.866611291622896, "grad_norm": 0.00048381893429905176, "learning_rate": 4.334413069094322e-09, "loss": 0.0, "num_input_tokens_seen": 134242984, "step": 199205 }, { "epoch": 4.866733442454743, "grad_norm": 0.0002854143676813692, "learning_rate": 4.326485396377921e-09, "loss": 0.0, "num_input_tokens_seen": 134246440, "step": 199210 }, { "epoch": 4.866855593286591, "grad_norm": 5.3498759370995685e-05, "learning_rate": 4.318564964467031e-09, "loss": 0.0, "num_input_tokens_seen": 134249896, "step": 199215 }, { "epoch": 4.866977744118437, "grad_norm": 1.6442389096482657e-05, "learning_rate": 4.3106517734194935e-09, "loss": 0.0, "num_input_tokens_seen": 134252968, "step": 199220 }, { "epoch": 4.867099894950284, "grad_norm": 0.00855648797005415, "learning_rate": 4.302745823292598e-09, "loss": 0.0, "num_input_tokens_seen": 134255976, "step": 199225 }, { "epoch": 4.867222045782132, "grad_norm": 3.934272172045894e-05, "learning_rate": 4.294847114143963e-09, "loss": 0.0, "num_input_tokens_seen": 134259944, "step": 199230 }, { "epoch": 4.867344196613979, "grad_norm": 1.6429348761448637e-05, "learning_rate": 4.286955646030988e-09, "loss": 0.0, "num_input_tokens_seen": 134263080, "step": 199235 }, { "epoch": 4.867466347445826, "grad_norm": 0.009650076739490032, "learning_rate": 4.279071419011182e-09, "loss": 0.0, "num_input_tokens_seen": 134266280, "step": 199240 }, { "epoch": 4.867588498277673, "grad_norm": 0.00010051135905086994, "learning_rate": 4.271194433141723e-09, "loss": 0.0, "num_input_tokens_seen": 134269864, "step": 199245 }, { "epoch": 4.867710649109521, "grad_norm": 0.0030795312486588955, "learning_rate": 4.263324688480008e-09, "loss": 0.0, "num_input_tokens_seen": 134273128, "step": 199250 }, { "epoch": 4.867832799941367, "grad_norm": 2.8745831514243037e-05, "learning_rate": 4.255462185083103e-09, "loss": 0.0, "num_input_tokens_seen": 134276456, "step": 199255 }, { "epoch": 4.867954950773215, "grad_norm": 0.0016464096261188388, "learning_rate": 4.2476069230084066e-09, "loss": 0.0, "num_input_tokens_seen": 134279592, "step": 199260 }, { "epoch": 4.868077101605062, "grad_norm": 0.004813062958419323, "learning_rate": 4.239758902312873e-09, "loss": 0.0001, "num_input_tokens_seen": 134284840, "step": 199265 }, { "epoch": 4.868199252436909, "grad_norm": 0.0006078015430830419, "learning_rate": 4.231918123053679e-09, "loss": 0.0, "num_input_tokens_seen": 134288424, "step": 199270 }, { "epoch": 4.868321403268756, "grad_norm": 0.0001032560394378379, "learning_rate": 4.22408458528778e-09, "loss": 0.0, "num_input_tokens_seen": 134291560, "step": 199275 }, { "epoch": 4.868443554100604, "grad_norm": 0.0002666850632522255, "learning_rate": 4.216258289072128e-09, "loss": 0.0, "num_input_tokens_seen": 134294760, "step": 199280 }, { "epoch": 4.8685657049324504, "grad_norm": 0.0008626289200037718, "learning_rate": 4.20843923446379e-09, "loss": 0.0, "num_input_tokens_seen": 134298600, "step": 199285 }, { "epoch": 4.868687855764298, "grad_norm": 0.00012125779176130891, "learning_rate": 4.200627421519498e-09, "loss": 0.0, "num_input_tokens_seen": 134301864, "step": 199290 }, { "epoch": 4.868810006596145, "grad_norm": 0.00019722123397514224, "learning_rate": 4.192822850295985e-09, "loss": 0.0, "num_input_tokens_seen": 134305064, "step": 199295 }, { "epoch": 4.8689321574279925, "grad_norm": 0.00021909379574935883, "learning_rate": 4.185025520850205e-09, "loss": 0.0, "num_input_tokens_seen": 134309224, "step": 199300 }, { "epoch": 4.869054308259839, "grad_norm": 0.0006481913733296096, "learning_rate": 4.1772354332386686e-09, "loss": 0.0, "num_input_tokens_seen": 134312488, "step": 199305 }, { "epoch": 4.869176459091687, "grad_norm": 6.552576087415218e-05, "learning_rate": 4.169452587518219e-09, "loss": 0.0, "num_input_tokens_seen": 134316328, "step": 199310 }, { "epoch": 4.869298609923534, "grad_norm": 0.028785167261958122, "learning_rate": 4.161676983745255e-09, "loss": 0.0, "num_input_tokens_seen": 134319592, "step": 199315 }, { "epoch": 4.86942076075538, "grad_norm": 0.000281141052255407, "learning_rate": 4.15390862197651e-09, "loss": 0.0, "num_input_tokens_seen": 134322856, "step": 199320 }, { "epoch": 4.869542911587228, "grad_norm": 0.00023606415197718889, "learning_rate": 4.146147502268383e-09, "loss": 0.0, "num_input_tokens_seen": 134326056, "step": 199325 }, { "epoch": 4.869665062419075, "grad_norm": 4.8425907152704895e-05, "learning_rate": 4.138393624677272e-09, "loss": 0.0, "num_input_tokens_seen": 134329384, "step": 199330 }, { "epoch": 4.869787213250922, "grad_norm": 2.464874523866456e-05, "learning_rate": 4.13064698925969e-09, "loss": 0.0, "num_input_tokens_seen": 134332328, "step": 199335 }, { "epoch": 4.869909364082769, "grad_norm": 0.0005067095626145601, "learning_rate": 4.122907596071812e-09, "loss": 0.0, "num_input_tokens_seen": 134335720, "step": 199340 }, { "epoch": 4.870031514914617, "grad_norm": 0.00011708718375302851, "learning_rate": 4.115175445170038e-09, "loss": 0.0, "num_input_tokens_seen": 134339560, "step": 199345 }, { "epoch": 4.8701536657464635, "grad_norm": 0.0003647690755315125, "learning_rate": 4.107450536610657e-09, "loss": 0.0, "num_input_tokens_seen": 134342632, "step": 199350 }, { "epoch": 4.870275816578311, "grad_norm": 0.0004586986906360835, "learning_rate": 4.099732870449624e-09, "loss": 0.0, "num_input_tokens_seen": 134346088, "step": 199355 }, { "epoch": 4.870397967410158, "grad_norm": 8.667811925988644e-05, "learning_rate": 4.092022446743337e-09, "loss": 0.0, "num_input_tokens_seen": 134350056, "step": 199360 }, { "epoch": 4.8705201182420055, "grad_norm": 6.58802455291152e-05, "learning_rate": 4.084319265547531e-09, "loss": 0.0, "num_input_tokens_seen": 134353256, "step": 199365 }, { "epoch": 4.870642269073852, "grad_norm": 3.016005030076485e-05, "learning_rate": 4.076623326918604e-09, "loss": 0.0, "num_input_tokens_seen": 134356456, "step": 199370 }, { "epoch": 4.8707644199057, "grad_norm": 0.0002504217263776809, "learning_rate": 4.068934630912291e-09, "loss": 0.0, "num_input_tokens_seen": 134359464, "step": 199375 }, { "epoch": 4.870886570737547, "grad_norm": 0.0016258807154372334, "learning_rate": 4.061253177584545e-09, "loss": 0.0, "num_input_tokens_seen": 134362664, "step": 199380 }, { "epoch": 4.871008721569394, "grad_norm": 9.487319039180875e-05, "learning_rate": 4.053578966991211e-09, "loss": 0.0, "num_input_tokens_seen": 134365928, "step": 199385 }, { "epoch": 4.871130872401241, "grad_norm": 7.63261632528156e-05, "learning_rate": 4.045911999188245e-09, "loss": 0.0, "num_input_tokens_seen": 134369256, "step": 199390 }, { "epoch": 4.871253023233088, "grad_norm": 0.0032971366308629513, "learning_rate": 4.038252274231157e-09, "loss": 0.0, "num_input_tokens_seen": 134372968, "step": 199395 }, { "epoch": 4.871375174064935, "grad_norm": 0.0007157826912589371, "learning_rate": 4.030599792175904e-09, "loss": 0.0, "num_input_tokens_seen": 134376488, "step": 199400 }, { "epoch": 4.871497324896783, "grad_norm": 0.00042528269113972783, "learning_rate": 4.022954553077884e-09, "loss": 0.0, "num_input_tokens_seen": 134379560, "step": 199405 }, { "epoch": 4.87161947572863, "grad_norm": 0.00018838932737708092, "learning_rate": 4.015316556992943e-09, "loss": 0.0, "num_input_tokens_seen": 134383016, "step": 199410 }, { "epoch": 4.8717416265604765, "grad_norm": 53.78935241699219, "learning_rate": 4.007685803976479e-09, "loss": 0.079, "num_input_tokens_seen": 134386344, "step": 199415 }, { "epoch": 4.871863777392324, "grad_norm": 0.00014842470409348607, "learning_rate": 4.0000622940838945e-09, "loss": 0.0, "num_input_tokens_seen": 134390056, "step": 199420 }, { "epoch": 4.871985928224171, "grad_norm": 0.0018565324135124683, "learning_rate": 3.99244602737081e-09, "loss": 0.0, "num_input_tokens_seen": 134393448, "step": 199425 }, { "epoch": 4.8721080790560185, "grad_norm": 0.0063219680450856686, "learning_rate": 3.9848370038926275e-09, "loss": 0.0002, "num_input_tokens_seen": 134396712, "step": 199430 }, { "epoch": 4.872230229887865, "grad_norm": 0.00012290282757021487, "learning_rate": 3.977235223704523e-09, "loss": 0.0, "num_input_tokens_seen": 134399656, "step": 199435 }, { "epoch": 4.872352380719713, "grad_norm": 0.0008801176445558667, "learning_rate": 3.969640686861897e-09, "loss": 0.0, "num_input_tokens_seen": 134403304, "step": 199440 }, { "epoch": 4.87247453155156, "grad_norm": 0.000574213161598891, "learning_rate": 3.962053393419929e-09, "loss": 0.0, "num_input_tokens_seen": 134406184, "step": 199445 }, { "epoch": 4.872596682383407, "grad_norm": 0.005858482327312231, "learning_rate": 3.954473343433795e-09, "loss": 0.0, "num_input_tokens_seen": 134409960, "step": 199450 }, { "epoch": 4.872718833215254, "grad_norm": 0.001610560342669487, "learning_rate": 3.946900536958675e-09, "loss": 0.0, "num_input_tokens_seen": 134413480, "step": 199455 }, { "epoch": 4.872840984047102, "grad_norm": 5.890335887670517e-05, "learning_rate": 3.939334974049635e-09, "loss": 0.0, "num_input_tokens_seen": 134416872, "step": 199460 }, { "epoch": 4.872963134878948, "grad_norm": 0.0007327334024012089, "learning_rate": 3.931776654761631e-09, "loss": 0.0, "num_input_tokens_seen": 134420008, "step": 199465 }, { "epoch": 4.873085285710796, "grad_norm": 0.0007441366324201226, "learning_rate": 3.924225579149621e-09, "loss": 0.0, "num_input_tokens_seen": 134423272, "step": 199470 }, { "epoch": 4.873207436542643, "grad_norm": 0.0051063066348433495, "learning_rate": 3.916681747268558e-09, "loss": 0.0, "num_input_tokens_seen": 134426536, "step": 199475 }, { "epoch": 4.87332958737449, "grad_norm": 0.0001381008914904669, "learning_rate": 3.909145159173289e-09, "loss": 0.0, "num_input_tokens_seen": 134430568, "step": 199480 }, { "epoch": 4.873451738206337, "grad_norm": 1.1536059901118279e-05, "learning_rate": 3.901615814918657e-09, "loss": 0.0, "num_input_tokens_seen": 134433640, "step": 199485 }, { "epoch": 4.873573889038184, "grad_norm": 0.0011014309711754322, "learning_rate": 3.894093714559399e-09, "loss": 0.0, "num_input_tokens_seen": 134436904, "step": 199490 }, { "epoch": 4.8736960398700315, "grad_norm": 0.0012070384109392762, "learning_rate": 3.886578858150247e-09, "loss": 0.0, "num_input_tokens_seen": 134440168, "step": 199495 }, { "epoch": 4.873818190701879, "grad_norm": 4.059781349496916e-05, "learning_rate": 3.879071245745713e-09, "loss": 0.0, "num_input_tokens_seen": 134443496, "step": 199500 }, { "epoch": 4.873940341533726, "grad_norm": 0.001782463165000081, "learning_rate": 3.871570877400643e-09, "loss": 0.0, "num_input_tokens_seen": 134447208, "step": 199505 }, { "epoch": 4.874062492365573, "grad_norm": 0.00033074626117013395, "learning_rate": 3.864077753169326e-09, "loss": 0.0, "num_input_tokens_seen": 134450792, "step": 199510 }, { "epoch": 4.87418464319742, "grad_norm": 0.00011855031334562227, "learning_rate": 3.8565918731063855e-09, "loss": 0.0, "num_input_tokens_seen": 134454952, "step": 199515 }, { "epoch": 4.874306794029267, "grad_norm": 0.009077893570065498, "learning_rate": 3.849113237266222e-09, "loss": 0.0, "num_input_tokens_seen": 134457960, "step": 199520 }, { "epoch": 4.874428944861115, "grad_norm": 0.019870661199092865, "learning_rate": 3.8416418457032365e-09, "loss": 0.0, "num_input_tokens_seen": 134461608, "step": 199525 }, { "epoch": 4.874551095692961, "grad_norm": 0.0001336112036369741, "learning_rate": 3.83417769847183e-09, "loss": 0.0, "num_input_tokens_seen": 134464360, "step": 199530 }, { "epoch": 4.874673246524809, "grad_norm": 0.00011955316585954279, "learning_rate": 3.826720795626181e-09, "loss": 0.0, "num_input_tokens_seen": 134467944, "step": 199535 }, { "epoch": 4.874795397356656, "grad_norm": 0.0008456766954623163, "learning_rate": 3.819271137220581e-09, "loss": 0.0, "num_input_tokens_seen": 134470952, "step": 199540 }, { "epoch": 4.874917548188503, "grad_norm": 0.0008179315482266247, "learning_rate": 3.8118287233090965e-09, "loss": 0.0, "num_input_tokens_seen": 134474536, "step": 199545 }, { "epoch": 4.87503969902035, "grad_norm": 2.3027980205370113e-05, "learning_rate": 3.804393553946017e-09, "loss": 0.0, "num_input_tokens_seen": 134477800, "step": 199550 }, { "epoch": 4.875161849852198, "grad_norm": 0.003511697519570589, "learning_rate": 3.7969656291853e-09, "loss": 0.0, "num_input_tokens_seen": 134481000, "step": 199555 }, { "epoch": 4.8752840006840445, "grad_norm": 9.18523728614673e-05, "learning_rate": 3.789544949081014e-09, "loss": 0.0, "num_input_tokens_seen": 134484584, "step": 199560 }, { "epoch": 4.875406151515892, "grad_norm": 0.0005713349091820419, "learning_rate": 3.7821315136871145e-09, "loss": 0.0, "num_input_tokens_seen": 134488168, "step": 199565 }, { "epoch": 4.875528302347739, "grad_norm": 0.001412555342540145, "learning_rate": 3.774725323057449e-09, "loss": 0.0, "num_input_tokens_seen": 134491432, "step": 199570 }, { "epoch": 4.875650453179587, "grad_norm": 1.3871308510715608e-05, "learning_rate": 3.767326377245972e-09, "loss": 0.0, "num_input_tokens_seen": 134494760, "step": 199575 }, { "epoch": 4.875772604011433, "grad_norm": 0.00034401044831611216, "learning_rate": 3.75993467630642e-09, "loss": 0.0, "num_input_tokens_seen": 134497896, "step": 199580 }, { "epoch": 4.87589475484328, "grad_norm": 0.00021265115356072783, "learning_rate": 3.752550220292638e-09, "loss": 0.0, "num_input_tokens_seen": 134501736, "step": 199585 }, { "epoch": 4.876016905675128, "grad_norm": 0.0026902747340500355, "learning_rate": 3.745173009258252e-09, "loss": 0.0, "num_input_tokens_seen": 134505128, "step": 199590 }, { "epoch": 4.876139056506974, "grad_norm": 0.0028633566107600927, "learning_rate": 3.737803043256993e-09, "loss": 0.0, "num_input_tokens_seen": 134508712, "step": 199595 }, { "epoch": 4.876261207338822, "grad_norm": 0.00011423487012507394, "learning_rate": 3.730440322342266e-09, "loss": 0.0, "num_input_tokens_seen": 134512424, "step": 199600 }, { "epoch": 4.876383358170669, "grad_norm": 0.001742964843288064, "learning_rate": 3.7230848465678033e-09, "loss": 0.0, "num_input_tokens_seen": 134515816, "step": 199605 }, { "epoch": 4.8765055090025164, "grad_norm": 0.0001392453268636018, "learning_rate": 3.7157366159870086e-09, "loss": 0.0, "num_input_tokens_seen": 134519144, "step": 199610 }, { "epoch": 4.876627659834363, "grad_norm": 4.974427793058567e-05, "learning_rate": 3.7083956306533936e-09, "loss": 0.0, "num_input_tokens_seen": 134522536, "step": 199615 }, { "epoch": 4.876749810666211, "grad_norm": 0.00012248499842826277, "learning_rate": 3.7010618906202494e-09, "loss": 0.0, "num_input_tokens_seen": 134525544, "step": 199620 }, { "epoch": 4.876871961498058, "grad_norm": 0.00078915199264884, "learning_rate": 3.693735395940978e-09, "loss": 0.0, "num_input_tokens_seen": 134528808, "step": 199625 }, { "epoch": 4.876994112329905, "grad_norm": 0.0011299817124381661, "learning_rate": 3.6864161466688694e-09, "loss": 0.0, "num_input_tokens_seen": 134532264, "step": 199630 }, { "epoch": 4.877116263161752, "grad_norm": 0.0001292000088142231, "learning_rate": 3.6791041428569926e-09, "loss": 0.0, "num_input_tokens_seen": 134535400, "step": 199635 }, { "epoch": 4.8772384139936, "grad_norm": 0.0001341778552159667, "learning_rate": 3.6717993845587493e-09, "loss": 0.0, "num_input_tokens_seen": 134539432, "step": 199640 }, { "epoch": 4.877360564825446, "grad_norm": 0.0006362085114233196, "learning_rate": 3.6645018718272082e-09, "loss": 0.0, "num_input_tokens_seen": 134542632, "step": 199645 }, { "epoch": 4.877482715657294, "grad_norm": 0.005953234154731035, "learning_rate": 3.6572116047153267e-09, "loss": 0.0, "num_input_tokens_seen": 134546024, "step": 199650 }, { "epoch": 4.877604866489141, "grad_norm": 0.003755022771656513, "learning_rate": 3.649928583276174e-09, "loss": 0.0, "num_input_tokens_seen": 134549160, "step": 199655 }, { "epoch": 4.8777270173209875, "grad_norm": 0.012922837398946285, "learning_rate": 3.6426528075627073e-09, "loss": 0.0, "num_input_tokens_seen": 134551848, "step": 199660 }, { "epoch": 4.877849168152835, "grad_norm": 0.004435810260474682, "learning_rate": 3.635384277627884e-09, "loss": 0.0, "num_input_tokens_seen": 134555624, "step": 199665 }, { "epoch": 4.877971318984683, "grad_norm": 0.00010084949462907389, "learning_rate": 3.6281229935245516e-09, "loss": 0.0, "num_input_tokens_seen": 134559080, "step": 199670 }, { "epoch": 4.8780934698165295, "grad_norm": 0.00020235779811628163, "learning_rate": 3.620868955305445e-09, "loss": 0.0, "num_input_tokens_seen": 134562280, "step": 199675 }, { "epoch": 4.878215620648376, "grad_norm": 8.76703197718598e-05, "learning_rate": 3.613622163023522e-09, "loss": 0.0, "num_input_tokens_seen": 134565800, "step": 199680 }, { "epoch": 4.878337771480224, "grad_norm": 4.6558223402826115e-05, "learning_rate": 3.606382616731185e-09, "loss": 0.0, "num_input_tokens_seen": 134569320, "step": 199685 }, { "epoch": 4.878459922312071, "grad_norm": 0.03909289836883545, "learning_rate": 3.599150316481281e-09, "loss": 0.0464, "num_input_tokens_seen": 134572712, "step": 199690 }, { "epoch": 4.878582073143918, "grad_norm": 3.961454058298841e-05, "learning_rate": 3.591925262326323e-09, "loss": 0.0, "num_input_tokens_seen": 134576104, "step": 199695 }, { "epoch": 4.878704223975765, "grad_norm": 2.078626857837662e-05, "learning_rate": 3.584707454318936e-09, "loss": 0.0, "num_input_tokens_seen": 134579368, "step": 199700 }, { "epoch": 4.878826374807613, "grad_norm": 9.76806586550083e-06, "learning_rate": 3.5774968925115223e-09, "loss": 0.0, "num_input_tokens_seen": 134582376, "step": 199705 }, { "epoch": 4.878948525639459, "grad_norm": 6.536354339914396e-05, "learning_rate": 3.5702935769565956e-09, "loss": 0.0, "num_input_tokens_seen": 134586024, "step": 199710 }, { "epoch": 4.879070676471307, "grad_norm": 9.415813838131726e-05, "learning_rate": 3.5630975077065583e-09, "loss": 0.0, "num_input_tokens_seen": 134589736, "step": 199715 }, { "epoch": 4.879192827303154, "grad_norm": 0.0004944285610690713, "learning_rate": 3.555908684813591e-09, "loss": 0.0, "num_input_tokens_seen": 134593384, "step": 199720 }, { "epoch": 4.879314978135001, "grad_norm": 0.00047317787539213896, "learning_rate": 3.5487271083300962e-09, "loss": 0.0, "num_input_tokens_seen": 134596968, "step": 199725 }, { "epoch": 4.879437128966848, "grad_norm": 0.0010366524802520871, "learning_rate": 3.5415527783082544e-09, "loss": 0.0, "num_input_tokens_seen": 134600424, "step": 199730 }, { "epoch": 4.879559279798696, "grad_norm": 7.29838793631643e-05, "learning_rate": 3.534385694800246e-09, "loss": 0.0, "num_input_tokens_seen": 134604072, "step": 199735 }, { "epoch": 4.8796814306305425, "grad_norm": 6.1684686443186365e-06, "learning_rate": 3.5272258578581405e-09, "loss": 0.0, "num_input_tokens_seen": 134607272, "step": 199740 }, { "epoch": 4.87980358146239, "grad_norm": 4.777483263751492e-05, "learning_rate": 3.5200732675341185e-09, "loss": 0.0, "num_input_tokens_seen": 134610408, "step": 199745 }, { "epoch": 4.879925732294237, "grad_norm": 0.00011650729720713571, "learning_rate": 3.512927923880249e-09, "loss": 0.0, "num_input_tokens_seen": 134613672, "step": 199750 }, { "epoch": 4.880047883126084, "grad_norm": 0.001114787650294602, "learning_rate": 3.505789826948269e-09, "loss": 0.0, "num_input_tokens_seen": 134616872, "step": 199755 }, { "epoch": 4.880170033957931, "grad_norm": 0.0028762961737811565, "learning_rate": 3.4986589767902476e-09, "loss": 0.0, "num_input_tokens_seen": 134620392, "step": 199760 }, { "epoch": 4.880292184789779, "grad_norm": 5.495239292940823e-06, "learning_rate": 3.4915353734580322e-09, "loss": 0.0, "num_input_tokens_seen": 134623528, "step": 199765 }, { "epoch": 4.880414335621626, "grad_norm": 0.01798221282660961, "learning_rate": 3.4844190170033596e-09, "loss": 0.0, "num_input_tokens_seen": 134626536, "step": 199770 }, { "epoch": 4.880536486453472, "grad_norm": 2.4297723939525895e-05, "learning_rate": 3.4773099074780765e-09, "loss": 0.0, "num_input_tokens_seen": 134629928, "step": 199775 }, { "epoch": 4.88065863728532, "grad_norm": 0.0001187268499052152, "learning_rate": 3.470208044933809e-09, "loss": 0.031, "num_input_tokens_seen": 134633256, "step": 199780 }, { "epoch": 4.880780788117167, "grad_norm": 0.00022896818700246513, "learning_rate": 3.463113429422182e-09, "loss": 0.0, "num_input_tokens_seen": 134636392, "step": 199785 }, { "epoch": 4.880902938949014, "grad_norm": 0.00026889523724094033, "learning_rate": 3.456026060994821e-09, "loss": 0.0005, "num_input_tokens_seen": 134639720, "step": 199790 }, { "epoch": 4.881025089780861, "grad_norm": 0.00028634234331548214, "learning_rate": 3.4489459397033514e-09, "loss": 0.0, "num_input_tokens_seen": 134643112, "step": 199795 }, { "epoch": 4.881147240612709, "grad_norm": 8.220658492064103e-05, "learning_rate": 3.441873065599066e-09, "loss": 0.0, "num_input_tokens_seen": 134646504, "step": 199800 }, { "epoch": 4.8812693914445555, "grad_norm": 0.00017027057765517384, "learning_rate": 3.4348074387337e-09, "loss": 0.0, "num_input_tokens_seen": 134650024, "step": 199805 }, { "epoch": 4.881391542276403, "grad_norm": 6.944937922526151e-05, "learning_rate": 3.4277490591583245e-09, "loss": 0.0, "num_input_tokens_seen": 134653544, "step": 199810 }, { "epoch": 4.88151369310825, "grad_norm": 0.00022692351194564253, "learning_rate": 3.420697926924454e-09, "loss": 0.0, "num_input_tokens_seen": 134657064, "step": 199815 }, { "epoch": 4.8816358439400975, "grad_norm": 3.496105273370631e-05, "learning_rate": 3.413654042083269e-09, "loss": 0.0, "num_input_tokens_seen": 134660584, "step": 199820 }, { "epoch": 4.881757994771944, "grad_norm": 0.0008579176501370966, "learning_rate": 3.406617404686063e-09, "loss": 0.0, "num_input_tokens_seen": 134664424, "step": 199825 }, { "epoch": 4.881880145603792, "grad_norm": 0.0008436653297394514, "learning_rate": 3.3995880147840163e-09, "loss": 0.0, "num_input_tokens_seen": 134667816, "step": 199830 }, { "epoch": 4.882002296435639, "grad_norm": 6.152554851723835e-05, "learning_rate": 3.392565872428199e-09, "loss": 0.0, "num_input_tokens_seen": 134671144, "step": 199835 }, { "epoch": 4.882124447267486, "grad_norm": 0.00047535798512399197, "learning_rate": 3.385550977669682e-09, "loss": 0.0, "num_input_tokens_seen": 134674152, "step": 199840 }, { "epoch": 4.882246598099333, "grad_norm": 0.00028677177033387125, "learning_rate": 3.3785433305595355e-09, "loss": 0.0, "num_input_tokens_seen": 134677480, "step": 199845 }, { "epoch": 4.88236874893118, "grad_norm": 1.8712211385718547e-05, "learning_rate": 3.371542931148608e-09, "loss": 0.058, "num_input_tokens_seen": 134680936, "step": 199850 }, { "epoch": 4.882490899763027, "grad_norm": 0.0001424605870852247, "learning_rate": 3.3645497794879684e-09, "loss": 0.0002, "num_input_tokens_seen": 134684264, "step": 199855 }, { "epoch": 4.882613050594875, "grad_norm": 0.0006394163356162608, "learning_rate": 3.3575638756283555e-09, "loss": 0.0, "num_input_tokens_seen": 134687208, "step": 199860 }, { "epoch": 4.882735201426722, "grad_norm": 3.500204547890462e-05, "learning_rate": 3.350585219620505e-09, "loss": 0.0, "num_input_tokens_seen": 134690728, "step": 199865 }, { "epoch": 4.8828573522585685, "grad_norm": 0.004288224037736654, "learning_rate": 3.343613811515378e-09, "loss": 0.0, "num_input_tokens_seen": 134694248, "step": 199870 }, { "epoch": 4.882979503090416, "grad_norm": 0.00017929727619048208, "learning_rate": 3.336649651363599e-09, "loss": 0.0318, "num_input_tokens_seen": 134698024, "step": 199875 }, { "epoch": 4.883101653922263, "grad_norm": 1.3863214007869828e-05, "learning_rate": 3.3296927392156836e-09, "loss": 0.0, "num_input_tokens_seen": 134701608, "step": 199880 }, { "epoch": 4.8832238047541106, "grad_norm": 0.00016461328777950257, "learning_rate": 3.3227430751223696e-09, "loss": 0.0, "num_input_tokens_seen": 134704616, "step": 199885 }, { "epoch": 4.883345955585957, "grad_norm": 1.3935764400230255e-05, "learning_rate": 3.3158006591340603e-09, "loss": 0.0, "num_input_tokens_seen": 134707752, "step": 199890 }, { "epoch": 4.883468106417805, "grad_norm": 3.107173688476905e-05, "learning_rate": 3.3088654913013825e-09, "loss": 0.0, "num_input_tokens_seen": 134711016, "step": 199895 }, { "epoch": 4.883590257249652, "grad_norm": 0.00013317028060555458, "learning_rate": 3.30193757167474e-09, "loss": 0.0, "num_input_tokens_seen": 134714536, "step": 199900 }, { "epoch": 4.883712408081499, "grad_norm": 4.84615111417952e-06, "learning_rate": 3.295016900304426e-09, "loss": 0.0, "num_input_tokens_seen": 134717544, "step": 199905 }, { "epoch": 4.883834558913346, "grad_norm": 8.700467151356861e-05, "learning_rate": 3.2881034772408444e-09, "loss": 0.0, "num_input_tokens_seen": 134720936, "step": 199910 }, { "epoch": 4.883956709745194, "grad_norm": 6.082547042751685e-05, "learning_rate": 3.281197302534289e-09, "loss": 0.0, "num_input_tokens_seen": 134724584, "step": 199915 }, { "epoch": 4.88407886057704, "grad_norm": 7.671982166357338e-05, "learning_rate": 3.2742983762349406e-09, "loss": 0.0, "num_input_tokens_seen": 134727720, "step": 199920 }, { "epoch": 4.884201011408888, "grad_norm": 0.002736483933404088, "learning_rate": 3.2674066983929826e-09, "loss": 0.0, "num_input_tokens_seen": 134731112, "step": 199925 }, { "epoch": 4.884323162240735, "grad_norm": 0.00012476065603550524, "learning_rate": 3.2605222690585967e-09, "loss": 0.0, "num_input_tokens_seen": 134734248, "step": 199930 }, { "epoch": 4.8844453130725825, "grad_norm": 0.007749420590698719, "learning_rate": 3.253645088281631e-09, "loss": 0.0, "num_input_tokens_seen": 134737320, "step": 199935 }, { "epoch": 4.884567463904429, "grad_norm": 0.0003622000804170966, "learning_rate": 3.2467751561123803e-09, "loss": 0.0, "num_input_tokens_seen": 134740648, "step": 199940 }, { "epoch": 4.884689614736276, "grad_norm": 0.0007984722615219653, "learning_rate": 3.2399124726005813e-09, "loss": 0.0, "num_input_tokens_seen": 134743976, "step": 199945 }, { "epoch": 4.884811765568124, "grad_norm": 0.0003382827853783965, "learning_rate": 3.2330570377963053e-09, "loss": 0.0, "num_input_tokens_seen": 134747112, "step": 199950 }, { "epoch": 4.88493391639997, "grad_norm": 0.0002902325359173119, "learning_rate": 3.2262088517492903e-09, "loss": 0.0, "num_input_tokens_seen": 134750696, "step": 199955 }, { "epoch": 4.885056067231818, "grad_norm": 0.00018129262025468051, "learning_rate": 3.2193679145093857e-09, "loss": 0.0, "num_input_tokens_seen": 134754344, "step": 199960 }, { "epoch": 4.885178218063665, "grad_norm": 5.9551683079916984e-05, "learning_rate": 3.21253422612644e-09, "loss": 0.0, "num_input_tokens_seen": 134758440, "step": 199965 }, { "epoch": 4.885300368895512, "grad_norm": 9.369918188895099e-06, "learning_rate": 3.205707786649858e-09, "loss": 0.0, "num_input_tokens_seen": 134761896, "step": 199970 }, { "epoch": 4.885422519727359, "grad_norm": 0.00010241235577268526, "learning_rate": 3.198888596129712e-09, "loss": 0.0, "num_input_tokens_seen": 134765608, "step": 199975 }, { "epoch": 4.885544670559207, "grad_norm": 0.00010554419714026153, "learning_rate": 3.1920766546151833e-09, "loss": 0.0, "num_input_tokens_seen": 134769064, "step": 199980 }, { "epoch": 4.8856668213910535, "grad_norm": 0.01280683558434248, "learning_rate": 3.185271962156011e-09, "loss": 0.0, "num_input_tokens_seen": 134772200, "step": 199985 }, { "epoch": 4.885788972222901, "grad_norm": 8.941164560383186e-05, "learning_rate": 3.1784745188017106e-09, "loss": 0.0, "num_input_tokens_seen": 134775400, "step": 199990 }, { "epoch": 4.885911123054748, "grad_norm": 0.7302924990653992, "learning_rate": 3.1716843246015757e-09, "loss": 0.0004, "num_input_tokens_seen": 134778600, "step": 199995 }, { "epoch": 4.8860332738865955, "grad_norm": 0.00028197289793752134, "learning_rate": 3.1649013796051226e-09, "loss": 0.0, "num_input_tokens_seen": 134782056, "step": 200000 }, { "epoch": 4.886155424718442, "grad_norm": 0.0009042451856657863, "learning_rate": 3.1581256838615346e-09, "loss": 0.0001, "num_input_tokens_seen": 134785448, "step": 200005 }, { "epoch": 4.88627757555029, "grad_norm": 0.0001275439135497436, "learning_rate": 3.1513572374203267e-09, "loss": 0.0, "num_input_tokens_seen": 134788968, "step": 200010 }, { "epoch": 4.886399726382137, "grad_norm": 0.00046351307537406683, "learning_rate": 3.1445960403304605e-09, "loss": 0.0, "num_input_tokens_seen": 134792488, "step": 200015 }, { "epoch": 4.886521877213983, "grad_norm": 0.0031093384604901075, "learning_rate": 3.13784209264123e-09, "loss": 0.0, "num_input_tokens_seen": 134795752, "step": 200020 }, { "epoch": 4.886644028045831, "grad_norm": 4.4503143726615235e-05, "learning_rate": 3.131095394401817e-09, "loss": 0.0, "num_input_tokens_seen": 134799400, "step": 200025 }, { "epoch": 4.886766178877679, "grad_norm": 7.445557275786996e-05, "learning_rate": 3.1243559456610726e-09, "loss": 0.0, "num_input_tokens_seen": 134802600, "step": 200030 }, { "epoch": 4.886888329709525, "grad_norm": 0.00014683169138152152, "learning_rate": 3.117623746468179e-09, "loss": 0.0, "num_input_tokens_seen": 134806056, "step": 200035 }, { "epoch": 4.887010480541372, "grad_norm": 0.00024286813277285546, "learning_rate": 3.110898796872097e-09, "loss": 0.0, "num_input_tokens_seen": 134810216, "step": 200040 }, { "epoch": 4.88713263137322, "grad_norm": 0.000297984981443733, "learning_rate": 3.1041810969216766e-09, "loss": 0.0, "num_input_tokens_seen": 134813288, "step": 200045 }, { "epoch": 4.8872547822050665, "grad_norm": 0.0006580796907655895, "learning_rate": 3.0974706466657676e-09, "loss": 0.0001, "num_input_tokens_seen": 134816360, "step": 200050 }, { "epoch": 4.887376933036914, "grad_norm": 0.0007669659098610282, "learning_rate": 3.09076744615322e-09, "loss": 0.0001, "num_input_tokens_seen": 134819816, "step": 200055 }, { "epoch": 4.887499083868761, "grad_norm": 1.6302408766932786e-05, "learning_rate": 3.0840714954326608e-09, "loss": 0.0, "num_input_tokens_seen": 134823016, "step": 200060 }, { "epoch": 4.8876212347006085, "grad_norm": 0.00024084115284495056, "learning_rate": 3.077382794552941e-09, "loss": 0.0, "num_input_tokens_seen": 134826472, "step": 200065 }, { "epoch": 4.887743385532455, "grad_norm": 0.013386818580329418, "learning_rate": 3.070701343562687e-09, "loss": 0.0, "num_input_tokens_seen": 134829800, "step": 200070 }, { "epoch": 4.887865536364303, "grad_norm": 0.00042936980025842786, "learning_rate": 3.064027142510306e-09, "loss": 0.0, "num_input_tokens_seen": 134833128, "step": 200075 }, { "epoch": 4.88798768719615, "grad_norm": 0.001953166676685214, "learning_rate": 3.057360191444536e-09, "loss": 0.0, "num_input_tokens_seen": 134836264, "step": 200080 }, { "epoch": 4.888109838027997, "grad_norm": 0.0004152859910391271, "learning_rate": 3.0507004904137823e-09, "loss": 0.0, "num_input_tokens_seen": 134839272, "step": 200085 }, { "epoch": 4.888231988859844, "grad_norm": 3.0269908165792003e-05, "learning_rate": 3.0440480394664516e-09, "loss": 0.0, "num_input_tokens_seen": 134842472, "step": 200090 }, { "epoch": 4.888354139691692, "grad_norm": 0.0003708160074893385, "learning_rate": 3.0374028386510596e-09, "loss": 0.0, "num_input_tokens_seen": 134845800, "step": 200095 }, { "epoch": 4.888476290523538, "grad_norm": 0.005872996523976326, "learning_rate": 3.0307648880156798e-09, "loss": 0.0, "num_input_tokens_seen": 134849064, "step": 200100 }, { "epoch": 4.888598441355386, "grad_norm": 0.000277300423476845, "learning_rate": 3.0241341876088287e-09, "loss": 0.0, "num_input_tokens_seen": 134852712, "step": 200105 }, { "epoch": 4.888720592187233, "grad_norm": 0.007827439345419407, "learning_rate": 3.0175107374785792e-09, "loss": 0.0, "num_input_tokens_seen": 134856040, "step": 200110 }, { "epoch": 4.8888427430190795, "grad_norm": 7.76977394707501e-05, "learning_rate": 3.0108945376732254e-09, "loss": 0.0, "num_input_tokens_seen": 134859560, "step": 200115 }, { "epoch": 4.888964893850927, "grad_norm": 4.224982330924831e-05, "learning_rate": 3.0042855882407293e-09, "loss": 0.0, "num_input_tokens_seen": 134862568, "step": 200120 }, { "epoch": 4.889087044682775, "grad_norm": 0.0004493116866797209, "learning_rate": 2.9976838892292746e-09, "loss": 0.0, "num_input_tokens_seen": 134866408, "step": 200125 }, { "epoch": 4.8892091955146215, "grad_norm": 0.003806524444371462, "learning_rate": 2.9910894406868224e-09, "loss": 0.0, "num_input_tokens_seen": 134870696, "step": 200130 }, { "epoch": 4.889331346346468, "grad_norm": 2.1448315237648785e-05, "learning_rate": 2.9845022426612243e-09, "loss": 0.0, "num_input_tokens_seen": 134874088, "step": 200135 }, { "epoch": 4.889453497178316, "grad_norm": 2.3338936443906277e-05, "learning_rate": 2.9779222952005524e-09, "loss": 0.0, "num_input_tokens_seen": 134877096, "step": 200140 }, { "epoch": 4.889575648010163, "grad_norm": 5.836586933583021e-05, "learning_rate": 2.971349598352657e-09, "loss": 0.0, "num_input_tokens_seen": 134880232, "step": 200145 }, { "epoch": 4.88969779884201, "grad_norm": 0.004671205300837755, "learning_rate": 2.9647841521652783e-09, "loss": 0.0, "num_input_tokens_seen": 134883560, "step": 200150 }, { "epoch": 4.889819949673857, "grad_norm": 0.00043029917287640274, "learning_rate": 2.9582259566860446e-09, "loss": 0.0, "num_input_tokens_seen": 134887208, "step": 200155 }, { "epoch": 4.889942100505705, "grad_norm": 3.5230928915552795e-05, "learning_rate": 2.9516750119629176e-09, "loss": 0.0224, "num_input_tokens_seen": 134890600, "step": 200160 }, { "epoch": 4.890064251337551, "grad_norm": 6.732921610819176e-05, "learning_rate": 2.9451313180431924e-09, "loss": 0.0, "num_input_tokens_seen": 134894120, "step": 200165 }, { "epoch": 4.890186402169399, "grad_norm": 2.25527910515666e-05, "learning_rate": 2.938594874974831e-09, "loss": 0.0, "num_input_tokens_seen": 134897384, "step": 200170 }, { "epoch": 4.890308553001246, "grad_norm": 0.0002783486561384052, "learning_rate": 2.9320656828050182e-09, "loss": 0.0, "num_input_tokens_seen": 134900904, "step": 200175 }, { "epoch": 4.890430703833093, "grad_norm": 8.396808698307723e-05, "learning_rate": 2.9255437415816044e-09, "loss": 0.0, "num_input_tokens_seen": 134903912, "step": 200180 }, { "epoch": 4.89055285466494, "grad_norm": 0.004095216281712055, "learning_rate": 2.9190290513516624e-09, "loss": 0.0, "num_input_tokens_seen": 134907048, "step": 200185 }, { "epoch": 4.890675005496788, "grad_norm": 0.00025316732353530824, "learning_rate": 2.9125216121628214e-09, "loss": 0.0, "num_input_tokens_seen": 134910184, "step": 200190 }, { "epoch": 4.8907971563286345, "grad_norm": 4.9143873184220865e-05, "learning_rate": 2.906021424062155e-09, "loss": 0.0, "num_input_tokens_seen": 134913192, "step": 200195 }, { "epoch": 4.890919307160482, "grad_norm": 0.00027152185793966055, "learning_rate": 2.8995284870971804e-09, "loss": 0.0, "num_input_tokens_seen": 134916456, "step": 200200 }, { "epoch": 4.891041457992329, "grad_norm": 1.3070682143734302e-05, "learning_rate": 2.893042801315082e-09, "loss": 0.0, "num_input_tokens_seen": 134919528, "step": 200205 }, { "epoch": 4.891163608824176, "grad_norm": 8.361788786714897e-05, "learning_rate": 2.8865643667629336e-09, "loss": 0.0, "num_input_tokens_seen": 134922920, "step": 200210 }, { "epoch": 4.891285759656023, "grad_norm": 0.0013510880526155233, "learning_rate": 2.8800931834878085e-09, "loss": 0.0, "num_input_tokens_seen": 134925992, "step": 200215 }, { "epoch": 4.89140791048787, "grad_norm": 0.0006011762889102101, "learning_rate": 2.873629251536891e-09, "loss": 0.0, "num_input_tokens_seen": 134928936, "step": 200220 }, { "epoch": 4.891530061319718, "grad_norm": 0.0008538027759641409, "learning_rate": 2.8671725709571437e-09, "loss": 0.0, "num_input_tokens_seen": 134932328, "step": 200225 }, { "epoch": 4.891652212151564, "grad_norm": 0.00014728435780853033, "learning_rate": 2.860723141795529e-09, "loss": 0.0, "num_input_tokens_seen": 134935592, "step": 200230 }, { "epoch": 4.891774362983412, "grad_norm": 0.0003445665934123099, "learning_rate": 2.8542809640988986e-09, "loss": 0.0, "num_input_tokens_seen": 134938664, "step": 200235 }, { "epoch": 4.891896513815259, "grad_norm": 0.000725120073184371, "learning_rate": 2.847846037914103e-09, "loss": 0.0, "num_input_tokens_seen": 134941608, "step": 200240 }, { "epoch": 4.892018664647106, "grad_norm": 2.4838760509737767e-05, "learning_rate": 2.841418363287995e-09, "loss": 0.0001, "num_input_tokens_seen": 134945192, "step": 200245 }, { "epoch": 4.892140815478953, "grad_norm": 0.0050308541394770145, "learning_rate": 2.834997940267425e-09, "loss": 0.0, "num_input_tokens_seen": 134948200, "step": 200250 }, { "epoch": 4.892262966310801, "grad_norm": 0.0032318481244146824, "learning_rate": 2.8285847688988006e-09, "loss": 0.0, "num_input_tokens_seen": 134951528, "step": 200255 }, { "epoch": 4.892385117142648, "grad_norm": 0.0008068734314292669, "learning_rate": 2.8221788492289733e-09, "loss": 0.0, "num_input_tokens_seen": 134954792, "step": 200260 }, { "epoch": 4.892507267974495, "grad_norm": 0.0009322738042101264, "learning_rate": 2.8157801813044613e-09, "loss": 0.0, "num_input_tokens_seen": 134958120, "step": 200265 }, { "epoch": 4.892629418806342, "grad_norm": 0.0007301435107365251, "learning_rate": 2.809388765171783e-09, "loss": 0.0, "num_input_tokens_seen": 134961128, "step": 200270 }, { "epoch": 4.89275156963819, "grad_norm": 0.00023704532941337675, "learning_rate": 2.8030046008774564e-09, "loss": 0.0, "num_input_tokens_seen": 134964264, "step": 200275 }, { "epoch": 4.892873720470036, "grad_norm": 0.0002072526840493083, "learning_rate": 2.79662768846789e-09, "loss": 0.0, "num_input_tokens_seen": 134967720, "step": 200280 }, { "epoch": 4.892995871301883, "grad_norm": 4.86823009850923e-05, "learning_rate": 2.7902580279894895e-09, "loss": 0.0, "num_input_tokens_seen": 134971368, "step": 200285 }, { "epoch": 4.893118022133731, "grad_norm": 0.000519820605404675, "learning_rate": 2.783895619488552e-09, "loss": 0.0, "num_input_tokens_seen": 134975080, "step": 200290 }, { "epoch": 4.893240172965578, "grad_norm": 0.00029773195274174213, "learning_rate": 2.7775404630112632e-09, "loss": 0.0, "num_input_tokens_seen": 134978472, "step": 200295 }, { "epoch": 4.893362323797425, "grad_norm": 0.00020020749070681632, "learning_rate": 2.7711925586040298e-09, "loss": 0.0, "num_input_tokens_seen": 134982248, "step": 200300 }, { "epoch": 4.893484474629272, "grad_norm": 0.00055017473641783, "learning_rate": 2.764851906312815e-09, "loss": 0.0, "num_input_tokens_seen": 134985512, "step": 200305 }, { "epoch": 4.8936066254611195, "grad_norm": 0.00029676384292542934, "learning_rate": 2.7585185061839154e-09, "loss": 0.0, "num_input_tokens_seen": 134988520, "step": 200310 }, { "epoch": 4.893728776292966, "grad_norm": 0.00014414470933843404, "learning_rate": 2.7521923582631833e-09, "loss": 0.0, "num_input_tokens_seen": 134992616, "step": 200315 }, { "epoch": 4.893850927124814, "grad_norm": 0.00025701947743073106, "learning_rate": 2.745873462596804e-09, "loss": 0.0, "num_input_tokens_seen": 134996008, "step": 200320 }, { "epoch": 4.893973077956661, "grad_norm": 0.002546359086409211, "learning_rate": 2.7395618192306292e-09, "loss": 0.0, "num_input_tokens_seen": 134999400, "step": 200325 }, { "epoch": 4.894095228788508, "grad_norm": 0.0006520068855024874, "learning_rate": 2.7332574282107335e-09, "loss": 0.0, "num_input_tokens_seen": 135002600, "step": 200330 }, { "epoch": 4.894217379620355, "grad_norm": 0.0029449693392962217, "learning_rate": 2.7269602895826362e-09, "loss": 0.0, "num_input_tokens_seen": 135005992, "step": 200335 }, { "epoch": 4.894339530452203, "grad_norm": 0.0027736180927604437, "learning_rate": 2.720670403392411e-09, "loss": 0.0, "num_input_tokens_seen": 135009128, "step": 200340 }, { "epoch": 4.894461681284049, "grad_norm": 0.0016403611516579986, "learning_rate": 2.7143877696856887e-09, "loss": 0.0, "num_input_tokens_seen": 135012648, "step": 200345 }, { "epoch": 4.894583832115897, "grad_norm": 0.00010193864000029862, "learning_rate": 2.70811238850821e-09, "loss": 0.0, "num_input_tokens_seen": 135015720, "step": 200350 }, { "epoch": 4.894705982947744, "grad_norm": 2.756047797447536e-05, "learning_rate": 2.701844259905495e-09, "loss": 0.0, "num_input_tokens_seen": 135019560, "step": 200355 }, { "epoch": 4.894828133779591, "grad_norm": 4.996271673007868e-05, "learning_rate": 2.6955833839232834e-09, "loss": 0.0, "num_input_tokens_seen": 135023400, "step": 200360 }, { "epoch": 4.894950284611438, "grad_norm": 0.0004890519776381552, "learning_rate": 2.6893297606069843e-09, "loss": 0.0, "num_input_tokens_seen": 135027432, "step": 200365 }, { "epoch": 4.895072435443286, "grad_norm": 3.164946974720806e-05, "learning_rate": 2.6830833900021166e-09, "loss": 0.0, "num_input_tokens_seen": 135030760, "step": 200370 }, { "epoch": 4.8951945862751325, "grad_norm": 0.0010964064858853817, "learning_rate": 2.6768442721541994e-09, "loss": 0.0, "num_input_tokens_seen": 135034600, "step": 200375 }, { "epoch": 4.895316737106979, "grad_norm": 1.2080210581189021e-05, "learning_rate": 2.670612407108419e-09, "loss": 0.0, "num_input_tokens_seen": 135038248, "step": 200380 }, { "epoch": 4.895438887938827, "grad_norm": 0.0004444806545507163, "learning_rate": 2.6643877949101834e-09, "loss": 0.0, "num_input_tokens_seen": 135041576, "step": 200385 }, { "epoch": 4.8955610387706745, "grad_norm": 0.0008745653321966529, "learning_rate": 2.6581704356047895e-09, "loss": 0.0, "num_input_tokens_seen": 135045288, "step": 200390 }, { "epoch": 4.895683189602521, "grad_norm": 0.003511784365400672, "learning_rate": 2.6519603292375347e-09, "loss": 0.0, "num_input_tokens_seen": 135048872, "step": 200395 }, { "epoch": 4.895805340434368, "grad_norm": 1.117614192480687e-05, "learning_rate": 2.645757475853383e-09, "loss": 0.0, "num_input_tokens_seen": 135052072, "step": 200400 }, { "epoch": 4.895927491266216, "grad_norm": 0.00029673470999114215, "learning_rate": 2.639561875497631e-09, "loss": 0.0, "num_input_tokens_seen": 135055336, "step": 200405 }, { "epoch": 4.896049642098062, "grad_norm": 0.00034569003037177026, "learning_rate": 2.6333735282151326e-09, "loss": 0.0, "num_input_tokens_seen": 135058984, "step": 200410 }, { "epoch": 4.89617179292991, "grad_norm": 4.958968565915711e-05, "learning_rate": 2.627192434050962e-09, "loss": 0.0, "num_input_tokens_seen": 135062248, "step": 200415 }, { "epoch": 4.896293943761757, "grad_norm": 0.00011519034160301089, "learning_rate": 2.621018593050195e-09, "loss": 0.0, "num_input_tokens_seen": 135065832, "step": 200420 }, { "epoch": 4.896416094593604, "grad_norm": 0.00024000953999347985, "learning_rate": 2.6148520052576838e-09, "loss": 0.0, "num_input_tokens_seen": 135069160, "step": 200425 }, { "epoch": 4.896538245425451, "grad_norm": 0.0036131960805505514, "learning_rate": 2.608692670718171e-09, "loss": 0.0, "num_input_tokens_seen": 135072552, "step": 200430 }, { "epoch": 4.896660396257299, "grad_norm": 0.0005584946484304965, "learning_rate": 2.6025405894766204e-09, "loss": 0.0, "num_input_tokens_seen": 135075688, "step": 200435 }, { "epoch": 4.8967825470891455, "grad_norm": 1.2202312973386142e-05, "learning_rate": 2.596395761577552e-09, "loss": 0.0, "num_input_tokens_seen": 135078824, "step": 200440 }, { "epoch": 4.896904697920993, "grad_norm": 0.002158592687919736, "learning_rate": 2.590258187065708e-09, "loss": 0.0, "num_input_tokens_seen": 135082280, "step": 200445 }, { "epoch": 4.89702684875284, "grad_norm": 7.275992538779974e-05, "learning_rate": 2.5841278659858303e-09, "loss": 0.0, "num_input_tokens_seen": 135085928, "step": 200450 }, { "epoch": 4.8971489995846875, "grad_norm": 0.00630525778979063, "learning_rate": 2.578004798382549e-09, "loss": 0.0538, "num_input_tokens_seen": 135089768, "step": 200455 }, { "epoch": 4.897271150416534, "grad_norm": 2.0241757738403976e-05, "learning_rate": 2.5718889843001632e-09, "loss": 0.0, "num_input_tokens_seen": 135093096, "step": 200460 }, { "epoch": 4.897393301248382, "grad_norm": 7.490185089409351e-05, "learning_rate": 2.5657804237833037e-09, "loss": 0.0, "num_input_tokens_seen": 135096552, "step": 200465 }, { "epoch": 4.897515452080229, "grad_norm": 0.000123066536616534, "learning_rate": 2.559679116876379e-09, "loss": 0.0, "num_input_tokens_seen": 135100072, "step": 200470 }, { "epoch": 4.897637602912075, "grad_norm": 0.0003549766552168876, "learning_rate": 2.5535850636237976e-09, "loss": 0.0, "num_input_tokens_seen": 135103528, "step": 200475 }, { "epoch": 4.897759753743923, "grad_norm": 0.0001572407054482028, "learning_rate": 2.5474982640697475e-09, "loss": 0.0, "num_input_tokens_seen": 135107240, "step": 200480 }, { "epoch": 4.89788190457577, "grad_norm": 0.0003219839127268642, "learning_rate": 2.5414187182586365e-09, "loss": 0.0, "num_input_tokens_seen": 135110952, "step": 200485 }, { "epoch": 4.898004055407617, "grad_norm": 0.0002576294064056128, "learning_rate": 2.5353464262345415e-09, "loss": 0.0, "num_input_tokens_seen": 135114728, "step": 200490 }, { "epoch": 4.898126206239464, "grad_norm": 0.0020776099991053343, "learning_rate": 2.5292813880417595e-09, "loss": 0.0, "num_input_tokens_seen": 135118120, "step": 200495 }, { "epoch": 4.898248357071312, "grad_norm": 7.651347550563514e-05, "learning_rate": 2.523223603724367e-09, "loss": 0.0, "num_input_tokens_seen": 135121640, "step": 200500 }, { "epoch": 4.8983705079031585, "grad_norm": 5.8618898037821054e-05, "learning_rate": 2.5171730733262175e-09, "loss": 0.0, "num_input_tokens_seen": 135125672, "step": 200505 }, { "epoch": 4.898492658735006, "grad_norm": 0.002982416423037648, "learning_rate": 2.511129796891609e-09, "loss": 0.0, "num_input_tokens_seen": 135129000, "step": 200510 }, { "epoch": 4.898614809566853, "grad_norm": 0.0001612217165529728, "learning_rate": 2.5050937744643952e-09, "loss": 0.0318, "num_input_tokens_seen": 135132392, "step": 200515 }, { "epoch": 4.8987369603987005, "grad_norm": 0.018624404445290565, "learning_rate": 2.4990650060883188e-09, "loss": 0.0, "num_input_tokens_seen": 135135656, "step": 200520 }, { "epoch": 4.898859111230547, "grad_norm": 0.0567937046289444, "learning_rate": 2.493043491807345e-09, "loss": 0.0, "num_input_tokens_seen": 135139496, "step": 200525 }, { "epoch": 4.898981262062395, "grad_norm": 1.878110742836725e-05, "learning_rate": 2.4870292316653275e-09, "loss": 0.0, "num_input_tokens_seen": 135142696, "step": 200530 }, { "epoch": 4.899103412894242, "grad_norm": 2.994788155774586e-05, "learning_rate": 2.481022225705898e-09, "loss": 0.0, "num_input_tokens_seen": 135145832, "step": 200535 }, { "epoch": 4.899225563726089, "grad_norm": 0.00013597743236459792, "learning_rate": 2.475022473972799e-09, "loss": 0.0, "num_input_tokens_seen": 135149608, "step": 200540 }, { "epoch": 4.899347714557936, "grad_norm": 0.00014619667490478605, "learning_rate": 2.469029976509662e-09, "loss": 0.0, "num_input_tokens_seen": 135153192, "step": 200545 }, { "epoch": 4.899469865389784, "grad_norm": 0.0010466484818607569, "learning_rate": 2.463044733360009e-09, "loss": 0.0, "num_input_tokens_seen": 135156712, "step": 200550 }, { "epoch": 4.89959201622163, "grad_norm": 0.00010759360156953335, "learning_rate": 2.4570667445673597e-09, "loss": 0.0, "num_input_tokens_seen": 135159912, "step": 200555 }, { "epoch": 4.899714167053478, "grad_norm": 31.909116744995117, "learning_rate": 2.4510960101752355e-09, "loss": 0.0339, "num_input_tokens_seen": 135163496, "step": 200560 }, { "epoch": 4.899836317885325, "grad_norm": 7.701950380578637e-05, "learning_rate": 2.4451325302270455e-09, "loss": 0.0, "num_input_tokens_seen": 135166888, "step": 200565 }, { "epoch": 4.8999584687171716, "grad_norm": 0.00031009703525342047, "learning_rate": 2.4391763047661997e-09, "loss": 0.0002, "num_input_tokens_seen": 135170216, "step": 200570 }, { "epoch": 4.900080619549019, "grad_norm": 0.007576879113912582, "learning_rate": 2.4332273338359965e-09, "loss": 0.0, "num_input_tokens_seen": 135173608, "step": 200575 }, { "epoch": 4.900202770380866, "grad_norm": 0.029712717980146408, "learning_rate": 2.4272856174796242e-09, "loss": 0.0, "num_input_tokens_seen": 135176552, "step": 200580 }, { "epoch": 4.900324921212714, "grad_norm": 0.00031999516068026423, "learning_rate": 2.421351155740381e-09, "loss": 0.0, "num_input_tokens_seen": 135180328, "step": 200585 }, { "epoch": 4.90044707204456, "grad_norm": 0.00021247580298222601, "learning_rate": 2.4154239486613438e-09, "loss": 0.0, "num_input_tokens_seen": 135184040, "step": 200590 }, { "epoch": 4.900569222876408, "grad_norm": 3.182264481438324e-05, "learning_rate": 2.4095039962857e-09, "loss": 0.0, "num_input_tokens_seen": 135187624, "step": 200595 }, { "epoch": 4.900691373708255, "grad_norm": 1.529901783214882e-05, "learning_rate": 2.4035912986564155e-09, "loss": 0.0, "num_input_tokens_seen": 135190760, "step": 200600 }, { "epoch": 4.900813524540102, "grad_norm": 0.00029649061616510153, "learning_rate": 2.3976858558165667e-09, "loss": 0.0002, "num_input_tokens_seen": 135194216, "step": 200605 }, { "epoch": 4.900935675371949, "grad_norm": 5.1163326133973897e-05, "learning_rate": 2.3917876678091197e-09, "loss": 0.0, "num_input_tokens_seen": 135197288, "step": 200610 }, { "epoch": 4.901057826203797, "grad_norm": 0.00202845293097198, "learning_rate": 2.385896734676818e-09, "loss": 0.0, "num_input_tokens_seen": 135200872, "step": 200615 }, { "epoch": 4.9011799770356435, "grad_norm": 0.0005032019107602537, "learning_rate": 2.3800130564627374e-09, "loss": 0.0, "num_input_tokens_seen": 135204008, "step": 200620 }, { "epoch": 4.901302127867491, "grad_norm": 0.0008152702357620001, "learning_rate": 2.3741366332094003e-09, "loss": 0.0, "num_input_tokens_seen": 135207208, "step": 200625 }, { "epoch": 4.901424278699338, "grad_norm": 4.561661626212299e-06, "learning_rate": 2.3682674649597725e-09, "loss": 0.0, "num_input_tokens_seen": 135210664, "step": 200630 }, { "epoch": 4.9015464295311855, "grad_norm": 0.00010567594290478155, "learning_rate": 2.3624055517562634e-09, "loss": 0.0, "num_input_tokens_seen": 135213800, "step": 200635 }, { "epoch": 4.901668580363032, "grad_norm": 6.133651913842186e-05, "learning_rate": 2.35655089364184e-09, "loss": 0.0, "num_input_tokens_seen": 135216744, "step": 200640 }, { "epoch": 4.901790731194879, "grad_norm": 0.0007190249161794782, "learning_rate": 2.350703490658912e-09, "loss": 0.0, "num_input_tokens_seen": 135220392, "step": 200645 }, { "epoch": 4.901912882026727, "grad_norm": 0.00048561752191744745, "learning_rate": 2.34486334284989e-09, "loss": 0.0, "num_input_tokens_seen": 135223592, "step": 200650 }, { "epoch": 4.902035032858574, "grad_norm": 0.0003284842532593757, "learning_rate": 2.3390304502575175e-09, "loss": 0.0, "num_input_tokens_seen": 135226856, "step": 200655 }, { "epoch": 4.902157183690421, "grad_norm": 0.00022950841230340302, "learning_rate": 2.3332048129238724e-09, "loss": 0.0, "num_input_tokens_seen": 135230568, "step": 200660 }, { "epoch": 4.902279334522268, "grad_norm": 6.730855966452509e-05, "learning_rate": 2.3273864308915867e-09, "loss": 0.0, "num_input_tokens_seen": 135233704, "step": 200665 }, { "epoch": 4.902401485354115, "grad_norm": 3.094383282586932e-05, "learning_rate": 2.321575304202961e-09, "loss": 0.0, "num_input_tokens_seen": 135237288, "step": 200670 }, { "epoch": 4.902523636185962, "grad_norm": 0.00031161034712567925, "learning_rate": 2.315771432900071e-09, "loss": 0.0, "num_input_tokens_seen": 135241000, "step": 200675 }, { "epoch": 4.90264578701781, "grad_norm": 2.608428076200653e-05, "learning_rate": 2.3099748170253287e-09, "loss": 0.0, "num_input_tokens_seen": 135244520, "step": 200680 }, { "epoch": 4.9027679378496565, "grad_norm": 0.0007350142695941031, "learning_rate": 2.3041854566206997e-09, "loss": 0.0, "num_input_tokens_seen": 135247976, "step": 200685 }, { "epoch": 4.902890088681504, "grad_norm": 0.00028342122095637023, "learning_rate": 2.298403351728484e-09, "loss": 0.0, "num_input_tokens_seen": 135251560, "step": 200690 }, { "epoch": 4.903012239513351, "grad_norm": 0.00029275708948262036, "learning_rate": 2.2926285023905368e-09, "loss": 0.0, "num_input_tokens_seen": 135254824, "step": 200695 }, { "epoch": 4.9031343903451985, "grad_norm": 0.001582046621479094, "learning_rate": 2.2868609086489355e-09, "loss": 0.0, "num_input_tokens_seen": 135257704, "step": 200700 }, { "epoch": 4.903256541177045, "grad_norm": 4.7803659981582314e-05, "learning_rate": 2.2811005705456466e-09, "loss": 0.0, "num_input_tokens_seen": 135261416, "step": 200705 }, { "epoch": 4.903378692008893, "grad_norm": 4.758605427923612e-05, "learning_rate": 2.2753474881226365e-09, "loss": 0.0, "num_input_tokens_seen": 135264424, "step": 200710 }, { "epoch": 4.90350084284074, "grad_norm": 1.9773568055825308e-05, "learning_rate": 2.2696016614216497e-09, "loss": 0.0, "num_input_tokens_seen": 135267816, "step": 200715 }, { "epoch": 4.903622993672587, "grad_norm": 0.00011461249232525006, "learning_rate": 2.263863090484319e-09, "loss": 0.0, "num_input_tokens_seen": 135271400, "step": 200720 }, { "epoch": 4.903745144504434, "grad_norm": 2.939927435363643e-05, "learning_rate": 2.2581317753527227e-09, "loss": 0.0438, "num_input_tokens_seen": 135274664, "step": 200725 }, { "epoch": 4.903867295336282, "grad_norm": 9.464116010349244e-05, "learning_rate": 2.252407716068272e-09, "loss": 0.0, "num_input_tokens_seen": 135278248, "step": 200730 }, { "epoch": 4.903989446168128, "grad_norm": 0.006667073350399733, "learning_rate": 2.2466909126726e-09, "loss": 0.0, "num_input_tokens_seen": 135281576, "step": 200735 }, { "epoch": 4.904111596999975, "grad_norm": 0.015178213827311993, "learning_rate": 2.2409813652074503e-09, "loss": 0.0, "num_input_tokens_seen": 135285160, "step": 200740 }, { "epoch": 4.904233747831823, "grad_norm": 0.0013789625372737646, "learning_rate": 2.2352790737142357e-09, "loss": 0.0, "num_input_tokens_seen": 135288552, "step": 200745 }, { "epoch": 4.90435589866367, "grad_norm": 0.0009246623376384377, "learning_rate": 2.2295840382344776e-09, "loss": 0.0, "num_input_tokens_seen": 135291944, "step": 200750 }, { "epoch": 4.904478049495517, "grad_norm": 0.00013876563753001392, "learning_rate": 2.2238962588094766e-09, "loss": 0.0, "num_input_tokens_seen": 135295208, "step": 200755 }, { "epoch": 4.904600200327364, "grad_norm": 0.0006278780056163669, "learning_rate": 2.2182157354807552e-09, "loss": 0.0383, "num_input_tokens_seen": 135298088, "step": 200760 }, { "epoch": 4.9047223511592115, "grad_norm": 0.0015818560495972633, "learning_rate": 2.212542468289502e-09, "loss": 0.0001, "num_input_tokens_seen": 135301864, "step": 200765 }, { "epoch": 4.904844501991058, "grad_norm": 0.0005441773100756109, "learning_rate": 2.206876457276907e-09, "loss": 0.0, "num_input_tokens_seen": 135304936, "step": 200770 }, { "epoch": 4.904966652822906, "grad_norm": 0.0003268019063398242, "learning_rate": 2.2012177024843816e-09, "loss": 0.0, "num_input_tokens_seen": 135308200, "step": 200775 }, { "epoch": 4.905088803654753, "grad_norm": 0.0003277796204201877, "learning_rate": 2.1955662039530032e-09, "loss": 0.0021, "num_input_tokens_seen": 135311464, "step": 200780 }, { "epoch": 4.9052109544866, "grad_norm": 0.00046862257295288146, "learning_rate": 2.189921961723851e-09, "loss": 0.0, "num_input_tokens_seen": 135314984, "step": 200785 }, { "epoch": 4.905333105318447, "grad_norm": 0.0011110889026895165, "learning_rate": 2.184284975837891e-09, "loss": 0.0, "num_input_tokens_seen": 135317928, "step": 200790 }, { "epoch": 4.905455256150295, "grad_norm": 0.00011374321911716834, "learning_rate": 2.178655246336203e-09, "loss": 0.0, "num_input_tokens_seen": 135321448, "step": 200795 }, { "epoch": 4.905577406982141, "grad_norm": 0.0003457633429206908, "learning_rate": 2.173032773259753e-09, "loss": 0.0, "num_input_tokens_seen": 135324648, "step": 200800 }, { "epoch": 4.905699557813989, "grad_norm": 2.1189451217651367e-05, "learning_rate": 2.167417556649287e-09, "loss": 0.0, "num_input_tokens_seen": 135328104, "step": 200805 }, { "epoch": 4.905821708645836, "grad_norm": 0.00014341072528623044, "learning_rate": 2.1618095965458826e-09, "loss": 0.0, "num_input_tokens_seen": 135331496, "step": 200810 }, { "epoch": 4.905943859477683, "grad_norm": 56.895790100097656, "learning_rate": 2.1562088929901745e-09, "loss": 0.0402, "num_input_tokens_seen": 135334504, "step": 200815 }, { "epoch": 4.90606601030953, "grad_norm": 0.00114006910007447, "learning_rate": 2.1506154460227965e-09, "loss": 0.0, "num_input_tokens_seen": 135338024, "step": 200820 }, { "epoch": 4.906188161141378, "grad_norm": 4.435875962371938e-05, "learning_rate": 2.145029255684605e-09, "loss": 0.0, "num_input_tokens_seen": 135341160, "step": 200825 }, { "epoch": 4.9063103119732245, "grad_norm": 5.82519278395921e-05, "learning_rate": 2.139450322016123e-09, "loss": 0.0, "num_input_tokens_seen": 135344360, "step": 200830 }, { "epoch": 4.906432462805071, "grad_norm": 0.0002500818227417767, "learning_rate": 2.1338786450579847e-09, "loss": 0.0, "num_input_tokens_seen": 135347752, "step": 200835 }, { "epoch": 4.906554613636919, "grad_norm": 0.0010689017362892628, "learning_rate": 2.1283142248507135e-09, "loss": 0.0, "num_input_tokens_seen": 135351080, "step": 200840 }, { "epoch": 4.906676764468766, "grad_norm": 0.0010640647960826755, "learning_rate": 2.1227570614346103e-09, "loss": 0.0, "num_input_tokens_seen": 135354536, "step": 200845 }, { "epoch": 4.906798915300613, "grad_norm": 4.3444782932056114e-05, "learning_rate": 2.117207154850309e-09, "loss": 0.0, "num_input_tokens_seen": 135357992, "step": 200850 }, { "epoch": 4.90692106613246, "grad_norm": 0.014457025565207005, "learning_rate": 2.111664505138111e-09, "loss": 0.0, "num_input_tokens_seen": 135361128, "step": 200855 }, { "epoch": 4.907043216964308, "grad_norm": 0.0001936874759849161, "learning_rate": 2.1061291123382063e-09, "loss": 0.0, "num_input_tokens_seen": 135364776, "step": 200860 }, { "epoch": 4.907165367796154, "grad_norm": 0.00028388245846144855, "learning_rate": 2.100600976491007e-09, "loss": 0.0, "num_input_tokens_seen": 135368104, "step": 200865 }, { "epoch": 4.907287518628002, "grad_norm": 0.00029171412461437285, "learning_rate": 2.095080097636592e-09, "loss": 0.0, "num_input_tokens_seen": 135370984, "step": 200870 }, { "epoch": 4.907409669459849, "grad_norm": 0.00025791852385737, "learning_rate": 2.089566475815152e-09, "loss": 0.0, "num_input_tokens_seen": 135374248, "step": 200875 }, { "epoch": 4.907531820291696, "grad_norm": 0.0002960785641334951, "learning_rate": 2.0840601110667654e-09, "loss": 0.0, "num_input_tokens_seen": 135377512, "step": 200880 }, { "epoch": 4.907653971123543, "grad_norm": 0.0001221057027578354, "learning_rate": 2.0785610034315114e-09, "loss": 0.0, "num_input_tokens_seen": 135380840, "step": 200885 }, { "epoch": 4.907776121955391, "grad_norm": 0.0012774858623743057, "learning_rate": 2.0730691529493583e-09, "loss": 0.0, "num_input_tokens_seen": 135384360, "step": 200890 }, { "epoch": 4.907898272787238, "grad_norm": 0.0003208449052181095, "learning_rate": 2.0675845596602737e-09, "loss": 0.0, "num_input_tokens_seen": 135387816, "step": 200895 }, { "epoch": 4.908020423619085, "grad_norm": 0.0002654562995303422, "learning_rate": 2.0621072236042257e-09, "loss": 0.0, "num_input_tokens_seen": 135390888, "step": 200900 }, { "epoch": 4.908142574450932, "grad_norm": 0.000628930632956326, "learning_rate": 2.0566371448208497e-09, "loss": 0.0, "num_input_tokens_seen": 135394216, "step": 200905 }, { "epoch": 4.908264725282779, "grad_norm": 0.00024314632173627615, "learning_rate": 2.0511743233500024e-09, "loss": 0.0, "num_input_tokens_seen": 135397608, "step": 200910 }, { "epoch": 4.908386876114626, "grad_norm": 6.568645039806142e-05, "learning_rate": 2.0457187592314294e-09, "loss": 0.0, "num_input_tokens_seen": 135401256, "step": 200915 }, { "epoch": 4.908509026946474, "grad_norm": 0.00040447432547807693, "learning_rate": 2.0402704525048776e-09, "loss": 0.0, "num_input_tokens_seen": 135404328, "step": 200920 }, { "epoch": 4.908631177778321, "grad_norm": 0.0004772391403093934, "learning_rate": 2.03482940320987e-09, "loss": 0.0, "num_input_tokens_seen": 135407528, "step": 200925 }, { "epoch": 4.908753328610167, "grad_norm": 0.0001811403635656461, "learning_rate": 2.029395611386042e-09, "loss": 0.0, "num_input_tokens_seen": 135410728, "step": 200930 }, { "epoch": 4.908875479442015, "grad_norm": 0.0008391044102609158, "learning_rate": 2.0239690770728068e-09, "loss": 0.0, "num_input_tokens_seen": 135414696, "step": 200935 }, { "epoch": 4.908997630273862, "grad_norm": 5.66676590096904e-06, "learning_rate": 2.018549800309688e-09, "loss": 0.0, "num_input_tokens_seen": 135418088, "step": 200940 }, { "epoch": 4.9091197811057095, "grad_norm": 0.00012580875772982836, "learning_rate": 2.0131377811360982e-09, "loss": 0.0, "num_input_tokens_seen": 135421544, "step": 200945 }, { "epoch": 4.909241931937556, "grad_norm": 0.0003512984258122742, "learning_rate": 2.0077330195914512e-09, "loss": 0.0, "num_input_tokens_seen": 135425064, "step": 200950 }, { "epoch": 4.909364082769404, "grad_norm": 0.0013358528958633542, "learning_rate": 2.0023355157149367e-09, "loss": 0.0, "num_input_tokens_seen": 135428712, "step": 200955 }, { "epoch": 4.909486233601251, "grad_norm": 0.0003753567289095372, "learning_rate": 1.9969452695458576e-09, "loss": 0.0, "num_input_tokens_seen": 135432040, "step": 200960 }, { "epoch": 4.909608384433098, "grad_norm": 0.00010384988854639232, "learning_rate": 1.9915622811235155e-09, "loss": 0.0, "num_input_tokens_seen": 135435304, "step": 200965 }, { "epoch": 4.909730535264945, "grad_norm": 0.0036878047976642847, "learning_rate": 1.9861865504868792e-09, "loss": 0.0, "num_input_tokens_seen": 135438504, "step": 200970 }, { "epoch": 4.909852686096793, "grad_norm": 0.011057667434215546, "learning_rate": 1.9808180776751393e-09, "loss": 0.0, "num_input_tokens_seen": 135441576, "step": 200975 }, { "epoch": 4.909974836928639, "grad_norm": 0.0002773915184661746, "learning_rate": 1.975456862727376e-09, "loss": 0.0, "num_input_tokens_seen": 135445160, "step": 200980 }, { "epoch": 4.910096987760487, "grad_norm": 8.614901889814064e-05, "learning_rate": 1.970102905682447e-09, "loss": 0.0, "num_input_tokens_seen": 135448872, "step": 200985 }, { "epoch": 4.910219138592334, "grad_norm": 0.0002527591132093221, "learning_rate": 1.964756206579432e-09, "loss": 0.0001, "num_input_tokens_seen": 135452328, "step": 200990 }, { "epoch": 4.910341289424181, "grad_norm": 0.0005422373651526868, "learning_rate": 1.959416765457189e-09, "loss": 0.0001, "num_input_tokens_seen": 135455976, "step": 200995 }, { "epoch": 4.910463440256028, "grad_norm": 0.0001778609148459509, "learning_rate": 1.954084582354465e-09, "loss": 0.0, "num_input_tokens_seen": 135459368, "step": 201000 }, { "epoch": 4.910585591087875, "grad_norm": 0.015482158400118351, "learning_rate": 1.948759657310006e-09, "loss": 0.0, "num_input_tokens_seen": 135462952, "step": 201005 }, { "epoch": 4.9107077419197225, "grad_norm": 0.00010658630344551057, "learning_rate": 1.9434419903626708e-09, "loss": 0.0, "num_input_tokens_seen": 135466856, "step": 201010 }, { "epoch": 4.91082989275157, "grad_norm": 0.0001746755588101223, "learning_rate": 1.9381315815510946e-09, "loss": 0.0, "num_input_tokens_seen": 135469992, "step": 201015 }, { "epoch": 4.910952043583417, "grad_norm": 0.00026380535564385355, "learning_rate": 1.9328284309138022e-09, "loss": 0.0, "num_input_tokens_seen": 135473576, "step": 201020 }, { "epoch": 4.911074194415264, "grad_norm": 4.049741255585104e-05, "learning_rate": 1.92753253848954e-09, "loss": 0.0, "num_input_tokens_seen": 135477160, "step": 201025 }, { "epoch": 4.911196345247111, "grad_norm": 0.005961469374597073, "learning_rate": 1.9222439043166116e-09, "loss": 0.0, "num_input_tokens_seen": 135480680, "step": 201030 }, { "epoch": 4.911318496078958, "grad_norm": 0.00024113174004014581, "learning_rate": 1.9169625284336523e-09, "loss": 0.0, "num_input_tokens_seen": 135485544, "step": 201035 }, { "epoch": 4.911440646910806, "grad_norm": 1.186096324090613e-05, "learning_rate": 1.9116884108789644e-09, "loss": 0.0, "num_input_tokens_seen": 135488808, "step": 201040 }, { "epoch": 4.911562797742652, "grad_norm": 9.603124635759741e-05, "learning_rate": 1.9064215516908513e-09, "loss": 0.0, "num_input_tokens_seen": 135492200, "step": 201045 }, { "epoch": 4.9116849485745, "grad_norm": 1.8488022760720924e-05, "learning_rate": 1.901161950907837e-09, "loss": 0.0, "num_input_tokens_seen": 135495912, "step": 201050 }, { "epoch": 4.911807099406347, "grad_norm": 7.88636680226773e-05, "learning_rate": 1.8959096085678915e-09, "loss": 0.0, "num_input_tokens_seen": 135498984, "step": 201055 }, { "epoch": 4.911929250238194, "grad_norm": 0.0008046218426898122, "learning_rate": 1.8906645247094288e-09, "loss": 0.0, "num_input_tokens_seen": 135502184, "step": 201060 }, { "epoch": 4.912051401070041, "grad_norm": 3.1551830034004524e-05, "learning_rate": 1.885426699370529e-09, "loss": 0.0, "num_input_tokens_seen": 135506088, "step": 201065 }, { "epoch": 4.912173551901889, "grad_norm": 6.683461833745241e-05, "learning_rate": 1.8801961325892735e-09, "loss": 0.0, "num_input_tokens_seen": 135509544, "step": 201070 }, { "epoch": 4.9122957027337355, "grad_norm": 0.00046349395415745676, "learning_rate": 1.874972824403631e-09, "loss": 0.0, "num_input_tokens_seen": 135512936, "step": 201075 }, { "epoch": 4.912417853565583, "grad_norm": 0.008031395263969898, "learning_rate": 1.869756774851683e-09, "loss": 0.0, "num_input_tokens_seen": 135516392, "step": 201080 }, { "epoch": 4.91254000439743, "grad_norm": 0.000429147039540112, "learning_rate": 1.8645479839712873e-09, "loss": 0.0, "num_input_tokens_seen": 135519912, "step": 201085 }, { "epoch": 4.9126621552292775, "grad_norm": 3.720055246958509e-05, "learning_rate": 1.8593464518004143e-09, "loss": 0.0, "num_input_tokens_seen": 135523240, "step": 201090 }, { "epoch": 4.912784306061124, "grad_norm": 24.435461044311523, "learning_rate": 1.8541521783768111e-09, "loss": 0.0422, "num_input_tokens_seen": 135526568, "step": 201095 }, { "epoch": 4.912906456892971, "grad_norm": 0.00021874564117752016, "learning_rate": 1.8489651637383363e-09, "loss": 0.0, "num_input_tokens_seen": 135530216, "step": 201100 }, { "epoch": 4.913028607724819, "grad_norm": 0.0004242685972712934, "learning_rate": 1.8437854079225158e-09, "loss": 0.0, "num_input_tokens_seen": 135533416, "step": 201105 }, { "epoch": 4.913150758556665, "grad_norm": 3.266122075729072e-05, "learning_rate": 1.8386129109673187e-09, "loss": 0.0, "num_input_tokens_seen": 135536360, "step": 201110 }, { "epoch": 4.913272909388513, "grad_norm": 1137.6353759765625, "learning_rate": 1.83344767291016e-09, "loss": 0.041, "num_input_tokens_seen": 135539624, "step": 201115 }, { "epoch": 4.91339506022036, "grad_norm": 0.00044449593406170607, "learning_rate": 1.828289693788565e-09, "loss": 0.0, "num_input_tokens_seen": 135543016, "step": 201120 }, { "epoch": 4.913517211052207, "grad_norm": 0.0006559291505254805, "learning_rate": 1.8231389736401703e-09, "loss": 0.0, "num_input_tokens_seen": 135546280, "step": 201125 }, { "epoch": 4.913639361884054, "grad_norm": 0.00026207236805930734, "learning_rate": 1.8179955125023905e-09, "loss": 0.0, "num_input_tokens_seen": 135549416, "step": 201130 }, { "epoch": 4.913761512715902, "grad_norm": 3.2648542401148006e-05, "learning_rate": 1.81285931041264e-09, "loss": 0.0, "num_input_tokens_seen": 135552872, "step": 201135 }, { "epoch": 4.9138836635477485, "grad_norm": 0.00014412151358556002, "learning_rate": 1.8077303674083332e-09, "loss": 0.0, "num_input_tokens_seen": 135556136, "step": 201140 }, { "epoch": 4.914005814379596, "grad_norm": 0.0016105592949315906, "learning_rate": 1.802608683526552e-09, "loss": 0.0, "num_input_tokens_seen": 135559784, "step": 201145 }, { "epoch": 4.914127965211443, "grad_norm": 0.000432511733379215, "learning_rate": 1.7974942588048213e-09, "loss": 0.0, "num_input_tokens_seen": 135562984, "step": 201150 }, { "epoch": 4.9142501160432905, "grad_norm": 2.4063732780632563e-05, "learning_rate": 1.7923870932801123e-09, "loss": 0.0, "num_input_tokens_seen": 135566056, "step": 201155 }, { "epoch": 4.914372266875137, "grad_norm": 0.0001285884209210053, "learning_rate": 1.7872871869896166e-09, "loss": 0.0, "num_input_tokens_seen": 135569704, "step": 201160 }, { "epoch": 4.914494417706985, "grad_norm": 13.854092597961426, "learning_rate": 1.7821945399705273e-09, "loss": 0.0558, "num_input_tokens_seen": 135573224, "step": 201165 }, { "epoch": 4.914616568538832, "grad_norm": 0.0008157655247487128, "learning_rate": 1.7771091522598146e-09, "loss": 0.0, "num_input_tokens_seen": 135576296, "step": 201170 }, { "epoch": 4.914738719370679, "grad_norm": 0.0005658494192175567, "learning_rate": 1.7720310238943381e-09, "loss": 0.0, "num_input_tokens_seen": 135580648, "step": 201175 }, { "epoch": 4.914860870202526, "grad_norm": 0.019938549026846886, "learning_rate": 1.76696015491129e-09, "loss": 0.0, "num_input_tokens_seen": 135583912, "step": 201180 }, { "epoch": 4.914983021034374, "grad_norm": 0.00011997364345006645, "learning_rate": 1.7618965453473078e-09, "loss": 0.0, "num_input_tokens_seen": 135586856, "step": 201185 }, { "epoch": 4.91510517186622, "grad_norm": 0.0046823713928461075, "learning_rate": 1.7568401952392509e-09, "loss": 0.0, "num_input_tokens_seen": 135590120, "step": 201190 }, { "epoch": 4.915227322698067, "grad_norm": 1.848404099291656e-05, "learning_rate": 1.7517911046240897e-09, "loss": 0.0, "num_input_tokens_seen": 135593000, "step": 201195 }, { "epoch": 4.915349473529915, "grad_norm": 5.2997685997979715e-05, "learning_rate": 1.7467492735383505e-09, "loss": 0.0, "num_input_tokens_seen": 135596456, "step": 201200 }, { "epoch": 4.9154716243617615, "grad_norm": 0.00031060067703947425, "learning_rate": 1.7417147020186706e-09, "loss": 0.0, "num_input_tokens_seen": 135600360, "step": 201205 }, { "epoch": 4.915593775193609, "grad_norm": 3.5080083762295544e-05, "learning_rate": 1.7366873901017987e-09, "loss": 0.0, "num_input_tokens_seen": 135603880, "step": 201210 }, { "epoch": 4.915715926025456, "grad_norm": 0.00045931426575407386, "learning_rate": 1.7316673378242609e-09, "loss": 0.0, "num_input_tokens_seen": 135607272, "step": 201215 }, { "epoch": 4.915838076857304, "grad_norm": 0.00012492769747041166, "learning_rate": 1.7266545452225835e-09, "loss": 0.0, "num_input_tokens_seen": 135610728, "step": 201220 }, { "epoch": 4.91596022768915, "grad_norm": 0.0017665666528046131, "learning_rate": 1.7216490123330707e-09, "loss": 0.0, "num_input_tokens_seen": 135613672, "step": 201225 }, { "epoch": 4.916082378520998, "grad_norm": 0.011362847872078419, "learning_rate": 1.716650739192249e-09, "loss": 0.0, "num_input_tokens_seen": 135617256, "step": 201230 }, { "epoch": 4.916204529352845, "grad_norm": 0.002289575058966875, "learning_rate": 1.711659725836534e-09, "loss": 0.0, "num_input_tokens_seen": 135620584, "step": 201235 }, { "epoch": 4.916326680184692, "grad_norm": 0.0027904491871595383, "learning_rate": 1.7066759723021185e-09, "loss": 0.0, "num_input_tokens_seen": 135623976, "step": 201240 }, { "epoch": 4.916448831016539, "grad_norm": 4.122515383642167e-05, "learning_rate": 1.7016994786251958e-09, "loss": 0.0, "num_input_tokens_seen": 135627112, "step": 201245 }, { "epoch": 4.916570981848387, "grad_norm": 0.000364553474355489, "learning_rate": 1.6967302448420707e-09, "loss": 0.0001, "num_input_tokens_seen": 135630056, "step": 201250 }, { "epoch": 4.9166931326802334, "grad_norm": 6.166181265143678e-05, "learning_rate": 1.6917682709887139e-09, "loss": 0.0, "num_input_tokens_seen": 135633256, "step": 201255 }, { "epoch": 4.916815283512081, "grad_norm": 2.0448240320547484e-05, "learning_rate": 1.6868135571015408e-09, "loss": 0.0, "num_input_tokens_seen": 135636648, "step": 201260 }, { "epoch": 4.916937434343928, "grad_norm": 1.3415295143204276e-05, "learning_rate": 1.6818661032161896e-09, "loss": 0.0, "num_input_tokens_seen": 135640360, "step": 201265 }, { "epoch": 4.917059585175775, "grad_norm": 0.0004135376075282693, "learning_rate": 1.6769259093689647e-09, "loss": 0.0, "num_input_tokens_seen": 135643880, "step": 201270 }, { "epoch": 4.917181736007622, "grad_norm": 0.00034867238719016314, "learning_rate": 1.6719929755956152e-09, "loss": 0.0018, "num_input_tokens_seen": 135647208, "step": 201275 }, { "epoch": 4.91730388683947, "grad_norm": 7.221074338303879e-05, "learning_rate": 1.6670673019320014e-09, "loss": 0.0, "num_input_tokens_seen": 135650280, "step": 201280 }, { "epoch": 4.917426037671317, "grad_norm": 3.404384187888354e-05, "learning_rate": 1.6621488884139834e-09, "loss": 0.0, "num_input_tokens_seen": 135653672, "step": 201285 }, { "epoch": 4.917548188503163, "grad_norm": 0.01465871836990118, "learning_rate": 1.6572377350774213e-09, "loss": 0.0, "num_input_tokens_seen": 135657064, "step": 201290 }, { "epoch": 4.917670339335011, "grad_norm": 7.839166210033e-05, "learning_rate": 1.6523338419578426e-09, "loss": 0.0, "num_input_tokens_seen": 135660328, "step": 201295 }, { "epoch": 4.917792490166858, "grad_norm": 5.956729728495702e-05, "learning_rate": 1.647437209091107e-09, "loss": 0.0, "num_input_tokens_seen": 135663464, "step": 201300 }, { "epoch": 4.917914640998705, "grad_norm": 0.0019718848634511232, "learning_rate": 1.6425478365126311e-09, "loss": 0.0, "num_input_tokens_seen": 135667432, "step": 201305 }, { "epoch": 4.918036791830552, "grad_norm": 0.0001525900443084538, "learning_rate": 1.6376657242581638e-09, "loss": 0.0001, "num_input_tokens_seen": 135670504, "step": 201310 }, { "epoch": 4.9181589426624, "grad_norm": 0.0029563040006905794, "learning_rate": 1.6327908723631213e-09, "loss": 0.0, "num_input_tokens_seen": 135673768, "step": 201315 }, { "epoch": 4.9182810934942465, "grad_norm": 0.0002739654737524688, "learning_rate": 1.6279232808629196e-09, "loss": 0.0, "num_input_tokens_seen": 135676840, "step": 201320 }, { "epoch": 4.918403244326094, "grad_norm": 0.00016119072097353637, "learning_rate": 1.6230629497929748e-09, "loss": 0.0, "num_input_tokens_seen": 135680104, "step": 201325 }, { "epoch": 4.918525395157941, "grad_norm": 2.614900040498469e-05, "learning_rate": 1.6182098791887033e-09, "loss": 0.0, "num_input_tokens_seen": 135683752, "step": 201330 }, { "epoch": 4.9186475459897885, "grad_norm": 0.001245207036845386, "learning_rate": 1.613364069085299e-09, "loss": 0.0, "num_input_tokens_seen": 135687272, "step": 201335 }, { "epoch": 4.918769696821635, "grad_norm": 0.0006022427114658058, "learning_rate": 1.608525519518067e-09, "loss": 0.0, "num_input_tokens_seen": 135691048, "step": 201340 }, { "epoch": 4.918891847653483, "grad_norm": 0.0015529391821473837, "learning_rate": 1.6036942305220902e-09, "loss": 0.0, "num_input_tokens_seen": 135694248, "step": 201345 }, { "epoch": 4.91901399848533, "grad_norm": 2.1091949747642502e-05, "learning_rate": 1.5988702021326738e-09, "loss": 0.0, "num_input_tokens_seen": 135698216, "step": 201350 }, { "epoch": 4.919136149317177, "grad_norm": 0.0007461210479959846, "learning_rate": 1.59405343438479e-09, "loss": 0.0, "num_input_tokens_seen": 135701544, "step": 201355 }, { "epoch": 4.919258300149024, "grad_norm": 5.899009556742385e-05, "learning_rate": 1.5892439273135216e-09, "loss": 0.0, "num_input_tokens_seen": 135705064, "step": 201360 }, { "epoch": 4.919380450980871, "grad_norm": 1.099123437597882e-05, "learning_rate": 1.5844416809537297e-09, "loss": 0.0, "num_input_tokens_seen": 135708264, "step": 201365 }, { "epoch": 4.919502601812718, "grad_norm": 7.421234477078542e-05, "learning_rate": 1.5796466953404974e-09, "loss": 0.0, "num_input_tokens_seen": 135711336, "step": 201370 }, { "epoch": 4.919624752644566, "grad_norm": 0.007350210566073656, "learning_rate": 1.5748589705085747e-09, "loss": 0.0, "num_input_tokens_seen": 135714984, "step": 201375 }, { "epoch": 4.919746903476413, "grad_norm": 1.9945811800425872e-05, "learning_rate": 1.5700785064928224e-09, "loss": 0.0, "num_input_tokens_seen": 135718440, "step": 201380 }, { "epoch": 4.9198690543082595, "grad_norm": 0.000783449097070843, "learning_rate": 1.5653053033279906e-09, "loss": 0.0, "num_input_tokens_seen": 135722536, "step": 201385 }, { "epoch": 4.919991205140107, "grad_norm": 0.0006227453704923391, "learning_rate": 1.5605393610488294e-09, "loss": 0.0, "num_input_tokens_seen": 135725992, "step": 201390 }, { "epoch": 4.920113355971954, "grad_norm": 0.00017266077338717878, "learning_rate": 1.5557806796899776e-09, "loss": 0.0, "num_input_tokens_seen": 135729256, "step": 201395 }, { "epoch": 4.9202355068038015, "grad_norm": 0.0024521774612367153, "learning_rate": 1.551029259286074e-09, "loss": 0.0, "num_input_tokens_seen": 135732392, "step": 201400 }, { "epoch": 4.920357657635648, "grad_norm": 4.0178052586270496e-05, "learning_rate": 1.546285099871647e-09, "loss": 0.0, "num_input_tokens_seen": 135735848, "step": 201405 }, { "epoch": 4.920479808467496, "grad_norm": 0.00014618277782574296, "learning_rate": 1.5415482014811132e-09, "loss": 0.0, "num_input_tokens_seen": 135739240, "step": 201410 }, { "epoch": 4.920601959299343, "grad_norm": 0.0004449485568329692, "learning_rate": 1.5368185641490005e-09, "loss": 0.0, "num_input_tokens_seen": 135742440, "step": 201415 }, { "epoch": 4.92072411013119, "grad_norm": 0.0021590932738035917, "learning_rate": 1.532096187909726e-09, "loss": 0.0, "num_input_tokens_seen": 135745384, "step": 201420 }, { "epoch": 4.920846260963037, "grad_norm": 7.301468576770276e-05, "learning_rate": 1.5273810727975955e-09, "loss": 0.0, "num_input_tokens_seen": 135748584, "step": 201425 }, { "epoch": 4.920968411794885, "grad_norm": 0.19861288368701935, "learning_rate": 1.522673218846915e-09, "loss": 0.0, "num_input_tokens_seen": 135752104, "step": 201430 }, { "epoch": 4.921090562626731, "grad_norm": 0.6534940004348755, "learning_rate": 1.5179726260918791e-09, "loss": 0.0002, "num_input_tokens_seen": 135755112, "step": 201435 }, { "epoch": 4.921212713458579, "grad_norm": 0.00027803939883597195, "learning_rate": 1.5132792945666827e-09, "loss": 0.0002, "num_input_tokens_seen": 135758376, "step": 201440 }, { "epoch": 4.921334864290426, "grad_norm": 0.0002275543665746227, "learning_rate": 1.5085932243055211e-09, "loss": 0.0, "num_input_tokens_seen": 135761320, "step": 201445 }, { "epoch": 4.921457015122273, "grad_norm": 0.00024673750158399343, "learning_rate": 1.5039144153424777e-09, "loss": 0.0436, "num_input_tokens_seen": 135764712, "step": 201450 }, { "epoch": 4.92157916595412, "grad_norm": 0.0010997720528393984, "learning_rate": 1.4992428677115255e-09, "loss": 0.0, "num_input_tokens_seen": 135767912, "step": 201455 }, { "epoch": 4.921701316785967, "grad_norm": 0.0006602886132895947, "learning_rate": 1.4945785814465262e-09, "loss": 0.0, "num_input_tokens_seen": 135771432, "step": 201460 }, { "epoch": 4.9218234676178145, "grad_norm": 3.607894541346468e-05, "learning_rate": 1.4899215565816748e-09, "loss": 0.0, "num_input_tokens_seen": 135774504, "step": 201465 }, { "epoch": 4.921945618449661, "grad_norm": 0.0029618015978485346, "learning_rate": 1.485271793150611e-09, "loss": 0.0, "num_input_tokens_seen": 135777832, "step": 201470 }, { "epoch": 4.922067769281509, "grad_norm": 0.00795203447341919, "learning_rate": 1.4806292911871965e-09, "loss": 0.0, "num_input_tokens_seen": 135781480, "step": 201475 }, { "epoch": 4.922189920113356, "grad_norm": 0.0001877306931419298, "learning_rate": 1.4759940507251822e-09, "loss": 0.0, "num_input_tokens_seen": 135785256, "step": 201480 }, { "epoch": 4.922312070945203, "grad_norm": 0.0031501068733632565, "learning_rate": 1.47136607179843e-09, "loss": 0.0002, "num_input_tokens_seen": 135788968, "step": 201485 }, { "epoch": 4.92243422177705, "grad_norm": 0.005074269603937864, "learning_rate": 1.4667453544403573e-09, "loss": 0.0, "num_input_tokens_seen": 135791784, "step": 201490 }, { "epoch": 4.922556372608898, "grad_norm": 0.000645491003524512, "learning_rate": 1.4621318986847154e-09, "loss": 0.0, "num_input_tokens_seen": 135794984, "step": 201495 }, { "epoch": 4.922678523440744, "grad_norm": 8.181369776139036e-05, "learning_rate": 1.4575257045650325e-09, "loss": 0.0, "num_input_tokens_seen": 135798568, "step": 201500 }, { "epoch": 4.922800674272592, "grad_norm": 0.007807936519384384, "learning_rate": 1.4529267721148375e-09, "loss": 0.0037, "num_input_tokens_seen": 135801704, "step": 201505 }, { "epoch": 4.922922825104439, "grad_norm": 2.3001324734650552e-05, "learning_rate": 1.4483351013675482e-09, "loss": 0.0, "num_input_tokens_seen": 135805288, "step": 201510 }, { "epoch": 4.923044975936286, "grad_norm": 0.029084760695695877, "learning_rate": 1.4437506923564714e-09, "loss": 0.0, "num_input_tokens_seen": 135809128, "step": 201515 }, { "epoch": 4.923167126768133, "grad_norm": 0.0006325517897494137, "learning_rate": 1.4391735451150245e-09, "loss": 0.0, "num_input_tokens_seen": 135812200, "step": 201520 }, { "epoch": 4.923289277599981, "grad_norm": 0.00041451648576185107, "learning_rate": 1.4346036596765142e-09, "loss": 0.0, "num_input_tokens_seen": 135815912, "step": 201525 }, { "epoch": 4.9234114284318276, "grad_norm": 7.652008935110644e-05, "learning_rate": 1.4300410360741365e-09, "loss": 0.0, "num_input_tokens_seen": 135819880, "step": 201530 }, { "epoch": 4.923533579263674, "grad_norm": 0.0031949025578796864, "learning_rate": 1.425485674341087e-09, "loss": 0.0, "num_input_tokens_seen": 135823400, "step": 201535 }, { "epoch": 4.923655730095522, "grad_norm": 0.00017632555682212114, "learning_rate": 1.4209375745105613e-09, "loss": 0.0, "num_input_tokens_seen": 135826408, "step": 201540 }, { "epoch": 4.92377788092737, "grad_norm": 0.0007741831941530108, "learning_rate": 1.4163967366154217e-09, "loss": 0.0, "num_input_tokens_seen": 135829928, "step": 201545 }, { "epoch": 4.923900031759216, "grad_norm": 0.0008055263315327466, "learning_rate": 1.4118631606889752e-09, "loss": 0.0, "num_input_tokens_seen": 135832808, "step": 201550 }, { "epoch": 4.924022182591063, "grad_norm": 4.663014260586351e-05, "learning_rate": 1.4073368467639735e-09, "loss": 0.0, "num_input_tokens_seen": 135836072, "step": 201555 }, { "epoch": 4.924144333422911, "grad_norm": 0.002874054480344057, "learning_rate": 1.40281779487339e-09, "loss": 0.0, "num_input_tokens_seen": 135839720, "step": 201560 }, { "epoch": 4.924266484254757, "grad_norm": 0.0008985823951661587, "learning_rate": 1.3983060050500872e-09, "loss": 0.0, "num_input_tokens_seen": 135843688, "step": 201565 }, { "epoch": 4.924388635086605, "grad_norm": 0.0002772827574517578, "learning_rate": 1.393801477327039e-09, "loss": 0.0003, "num_input_tokens_seen": 135847528, "step": 201570 }, { "epoch": 4.924510785918452, "grad_norm": 0.0003808287438005209, "learning_rate": 1.3893042117367748e-09, "loss": 0.0, "num_input_tokens_seen": 135851048, "step": 201575 }, { "epoch": 4.9246329367502995, "grad_norm": 0.07541102170944214, "learning_rate": 1.3848142083120462e-09, "loss": 0.0, "num_input_tokens_seen": 135854120, "step": 201580 }, { "epoch": 4.924755087582146, "grad_norm": 0.0004984051920473576, "learning_rate": 1.3803314670856047e-09, "loss": 0.0, "num_input_tokens_seen": 135857192, "step": 201585 }, { "epoch": 4.924877238413994, "grad_norm": 0.0004298131098039448, "learning_rate": 1.3758559880898691e-09, "loss": 0.0, "num_input_tokens_seen": 135861160, "step": 201590 }, { "epoch": 4.924999389245841, "grad_norm": 0.00010070007556350902, "learning_rate": 1.3713877713575905e-09, "loss": 0.0, "num_input_tokens_seen": 135864232, "step": 201595 }, { "epoch": 4.925121540077688, "grad_norm": 0.0006499238079413772, "learning_rate": 1.366926816921188e-09, "loss": 0.0, "num_input_tokens_seen": 135867752, "step": 201600 }, { "epoch": 4.925243690909535, "grad_norm": 3.205785105819814e-05, "learning_rate": 1.3624731248130794e-09, "loss": 0.0, "num_input_tokens_seen": 135871080, "step": 201605 }, { "epoch": 4.925365841741383, "grad_norm": 0.0239634420722723, "learning_rate": 1.3580266950656837e-09, "loss": 0.0, "num_input_tokens_seen": 135874152, "step": 201610 }, { "epoch": 4.925487992573229, "grad_norm": 7.261833667755127, "learning_rate": 1.3535875277113085e-09, "loss": 0.0288, "num_input_tokens_seen": 135877672, "step": 201615 }, { "epoch": 4.925610143405077, "grad_norm": 1.7677119103609584e-05, "learning_rate": 1.349155622782261e-09, "loss": 0.0, "num_input_tokens_seen": 135881000, "step": 201620 }, { "epoch": 4.925732294236924, "grad_norm": 1.3271970601635985e-05, "learning_rate": 1.3447309803107376e-09, "loss": 0.0, "num_input_tokens_seen": 135884520, "step": 201625 }, { "epoch": 4.9258544450687705, "grad_norm": 0.00042806309647858143, "learning_rate": 1.340313600328935e-09, "loss": 0.0, "num_input_tokens_seen": 135887912, "step": 201630 }, { "epoch": 4.925976595900618, "grad_norm": 8.053508645389229e-05, "learning_rate": 1.3359034828689385e-09, "loss": 0.0, "num_input_tokens_seen": 135890920, "step": 201635 }, { "epoch": 4.926098746732466, "grad_norm": 0.002819041023030877, "learning_rate": 1.3315006279629448e-09, "loss": 0.0, "num_input_tokens_seen": 135894120, "step": 201640 }, { "epoch": 4.9262208975643125, "grad_norm": 0.0016093713929876685, "learning_rate": 1.327105035642817e-09, "loss": 0.0, "num_input_tokens_seen": 135897576, "step": 201645 }, { "epoch": 4.926343048396159, "grad_norm": 0.006175595335662365, "learning_rate": 1.3227167059406407e-09, "loss": 0.0, "num_input_tokens_seen": 135900776, "step": 201650 }, { "epoch": 4.926465199228007, "grad_norm": 0.00012842906289733946, "learning_rate": 1.3183356388882794e-09, "loss": 0.0, "num_input_tokens_seen": 135904360, "step": 201655 }, { "epoch": 4.926587350059854, "grad_norm": 9.985136421164498e-05, "learning_rate": 1.3139618345175962e-09, "loss": 0.0, "num_input_tokens_seen": 135907496, "step": 201660 }, { "epoch": 4.926709500891701, "grad_norm": 0.0026760785840451717, "learning_rate": 1.3095952928603438e-09, "loss": 0.0, "num_input_tokens_seen": 135911016, "step": 201665 }, { "epoch": 4.926831651723548, "grad_norm": 2.7883741495315917e-05, "learning_rate": 1.3052360139483853e-09, "loss": 0.0, "num_input_tokens_seen": 135914344, "step": 201670 }, { "epoch": 4.926953802555396, "grad_norm": 0.00010994355397997424, "learning_rate": 1.3008839978133623e-09, "loss": 0.0, "num_input_tokens_seen": 135917480, "step": 201675 }, { "epoch": 4.927075953387242, "grad_norm": 0.00031039112946018577, "learning_rate": 1.296539244486916e-09, "loss": 0.0, "num_input_tokens_seen": 135921192, "step": 201680 }, { "epoch": 4.92719810421909, "grad_norm": 9.488489740760997e-05, "learning_rate": 1.292201754000688e-09, "loss": 0.0, "num_input_tokens_seen": 135925416, "step": 201685 }, { "epoch": 4.927320255050937, "grad_norm": 0.04363443702459335, "learning_rate": 1.2878715263860973e-09, "loss": 0.0, "num_input_tokens_seen": 135928488, "step": 201690 }, { "epoch": 4.927442405882784, "grad_norm": 0.000588393013458699, "learning_rate": 1.2835485616748964e-09, "loss": 0.0, "num_input_tokens_seen": 135931624, "step": 201695 }, { "epoch": 4.927564556714631, "grad_norm": 0.0011282862396910787, "learning_rate": 1.2792328598981716e-09, "loss": 0.0, "num_input_tokens_seen": 135935400, "step": 201700 }, { "epoch": 4.927686707546479, "grad_norm": 0.0069139981642365456, "learning_rate": 1.2749244210875643e-09, "loss": 0.0, "num_input_tokens_seen": 135938728, "step": 201705 }, { "epoch": 4.9278088583783255, "grad_norm": 0.003317581955343485, "learning_rate": 1.2706232452743826e-09, "loss": 0.0, "num_input_tokens_seen": 135941992, "step": 201710 }, { "epoch": 4.927931009210173, "grad_norm": 0.00010448665852891281, "learning_rate": 1.2663293324897128e-09, "loss": 0.0, "num_input_tokens_seen": 135945640, "step": 201715 }, { "epoch": 4.92805316004202, "grad_norm": 0.00011394867760827765, "learning_rate": 1.2620426827650854e-09, "loss": 0.0, "num_input_tokens_seen": 135948904, "step": 201720 }, { "epoch": 4.928175310873867, "grad_norm": 0.0020071258768439293, "learning_rate": 1.2577632961313644e-09, "loss": 0.0002, "num_input_tokens_seen": 135952360, "step": 201725 }, { "epoch": 4.928297461705714, "grad_norm": 0.0017316938610747457, "learning_rate": 1.2534911726199693e-09, "loss": 0.0, "num_input_tokens_seen": 135955688, "step": 201730 }, { "epoch": 4.928419612537561, "grad_norm": 0.0001321783784078434, "learning_rate": 1.2492263122616532e-09, "loss": 0.0, "num_input_tokens_seen": 135959080, "step": 201735 }, { "epoch": 4.928541763369409, "grad_norm": 2.647805195010733e-05, "learning_rate": 1.2449687150877242e-09, "loss": 0.0, "num_input_tokens_seen": 135963048, "step": 201740 }, { "epoch": 4.928663914201255, "grad_norm": 0.00029094170895405114, "learning_rate": 1.2407183811289357e-09, "loss": 0.0006, "num_input_tokens_seen": 135966120, "step": 201745 }, { "epoch": 4.928786065033103, "grad_norm": 0.0003574473666958511, "learning_rate": 1.2364753104163738e-09, "loss": 0.0, "num_input_tokens_seen": 135969448, "step": 201750 }, { "epoch": 4.92890821586495, "grad_norm": 0.0008297132444567978, "learning_rate": 1.232239502980681e-09, "loss": 0.0146, "num_input_tokens_seen": 135972520, "step": 201755 }, { "epoch": 4.929030366696797, "grad_norm": 0.0002502483839634806, "learning_rate": 1.228010958852832e-09, "loss": 0.0, "num_input_tokens_seen": 135975784, "step": 201760 }, { "epoch": 4.929152517528644, "grad_norm": 2.7652360586216673e-05, "learning_rate": 1.2237896780635803e-09, "loss": 0.0, "num_input_tokens_seen": 135978984, "step": 201765 }, { "epoch": 4.929274668360492, "grad_norm": 0.0005179181462153792, "learning_rate": 1.2195756606434571e-09, "loss": 0.0, "num_input_tokens_seen": 135982248, "step": 201770 }, { "epoch": 4.9293968191923385, "grad_norm": 0.0005535013624466956, "learning_rate": 1.2153689066233263e-09, "loss": 0.0, "num_input_tokens_seen": 135985960, "step": 201775 }, { "epoch": 4.929518970024186, "grad_norm": 0.00016883248463273048, "learning_rate": 1.2111694160336083e-09, "loss": 0.0, "num_input_tokens_seen": 135989800, "step": 201780 }, { "epoch": 4.929641120856033, "grad_norm": 0.0014849066501483321, "learning_rate": 1.2069771889049452e-09, "loss": 0.0, "num_input_tokens_seen": 135993384, "step": 201785 }, { "epoch": 4.9297632716878805, "grad_norm": 0.00024154149286914617, "learning_rate": 1.202792225267757e-09, "loss": 0.0, "num_input_tokens_seen": 135997032, "step": 201790 }, { "epoch": 4.929885422519727, "grad_norm": 0.001064109499566257, "learning_rate": 1.1986145251524637e-09, "loss": 0.0, "num_input_tokens_seen": 136000360, "step": 201795 }, { "epoch": 4.930007573351575, "grad_norm": 8.710243128007278e-05, "learning_rate": 1.1944440885895968e-09, "loss": 0.0, "num_input_tokens_seen": 136003560, "step": 201800 }, { "epoch": 4.930129724183422, "grad_norm": 1.2830115338147152e-05, "learning_rate": 1.190280915609354e-09, "loss": 0.0, "num_input_tokens_seen": 136006824, "step": 201805 }, { "epoch": 4.930251875015269, "grad_norm": 5.2895011322107166e-05, "learning_rate": 1.1861250062419336e-09, "loss": 0.0, "num_input_tokens_seen": 136010280, "step": 201810 }, { "epoch": 4.930374025847116, "grad_norm": 4.9812042561825365e-05, "learning_rate": 1.1819763605177557e-09, "loss": 0.0, "num_input_tokens_seen": 136013352, "step": 201815 }, { "epoch": 4.930496176678963, "grad_norm": 0.000558437081053853, "learning_rate": 1.1778349784669073e-09, "loss": 0.0, "num_input_tokens_seen": 136017064, "step": 201820 }, { "epoch": 4.93061832751081, "grad_norm": 0.0009660838986746967, "learning_rate": 1.1737008601194754e-09, "loss": 0.0, "num_input_tokens_seen": 136020648, "step": 201825 }, { "epoch": 4.930740478342657, "grad_norm": 8.856524073053151e-05, "learning_rate": 1.169574005505547e-09, "loss": 0.0, "num_input_tokens_seen": 136024168, "step": 201830 }, { "epoch": 4.930862629174505, "grad_norm": 0.00048212928231805563, "learning_rate": 1.1654544146550982e-09, "loss": 0.0, "num_input_tokens_seen": 136027432, "step": 201835 }, { "epoch": 4.9309847800063515, "grad_norm": 0.004406995605677366, "learning_rate": 1.161342087598105e-09, "loss": 0.0, "num_input_tokens_seen": 136030952, "step": 201840 }, { "epoch": 4.931106930838199, "grad_norm": 0.0005184956826269627, "learning_rate": 1.1572370243645434e-09, "loss": 0.0, "num_input_tokens_seen": 136035048, "step": 201845 }, { "epoch": 4.931229081670046, "grad_norm": 0.00019828364020213485, "learning_rate": 1.1531392249841675e-09, "loss": 0.0, "num_input_tokens_seen": 136038632, "step": 201850 }, { "epoch": 4.9313512325018936, "grad_norm": 0.0024534957483410835, "learning_rate": 1.1490486894868422e-09, "loss": 0.0, "num_input_tokens_seen": 136042088, "step": 201855 }, { "epoch": 4.93147338333374, "grad_norm": 0.00020765556837432086, "learning_rate": 1.1449654179022105e-09, "loss": 0.0, "num_input_tokens_seen": 136045416, "step": 201860 }, { "epoch": 4.931595534165588, "grad_norm": 0.00010033969738287851, "learning_rate": 1.1408894102601374e-09, "loss": 0.0, "num_input_tokens_seen": 136048552, "step": 201865 }, { "epoch": 4.931717684997435, "grad_norm": 0.0007195353973656893, "learning_rate": 1.1368206665901548e-09, "loss": 0.0, "num_input_tokens_seen": 136052200, "step": 201870 }, { "epoch": 4.931839835829282, "grad_norm": 0.02347908727824688, "learning_rate": 1.1327591869219055e-09, "loss": 0.0, "num_input_tokens_seen": 136055720, "step": 201875 }, { "epoch": 4.931961986661129, "grad_norm": 0.006182366982102394, "learning_rate": 1.1287049712849217e-09, "loss": 0.0, "num_input_tokens_seen": 136058728, "step": 201880 }, { "epoch": 4.932084137492977, "grad_norm": 7.819590246072039e-05, "learning_rate": 1.1246580197086242e-09, "loss": 0.0, "num_input_tokens_seen": 136061992, "step": 201885 }, { "epoch": 4.932206288324823, "grad_norm": 0.0008825138211250305, "learning_rate": 1.120618332222434e-09, "loss": 0.0, "num_input_tokens_seen": 136065192, "step": 201890 }, { "epoch": 4.93232843915667, "grad_norm": 0.00010310118523193523, "learning_rate": 1.1165859088558826e-09, "loss": 0.0, "num_input_tokens_seen": 136068584, "step": 201895 }, { "epoch": 4.932450589988518, "grad_norm": 0.006327376700937748, "learning_rate": 1.1125607496380584e-09, "loss": 0.0, "num_input_tokens_seen": 136071848, "step": 201900 }, { "epoch": 4.9325727408203655, "grad_norm": 0.00017610739450901747, "learning_rate": 1.108542854598382e-09, "loss": 0.0, "num_input_tokens_seen": 136075304, "step": 201905 }, { "epoch": 4.932694891652212, "grad_norm": 5.935440185567131e-06, "learning_rate": 1.1045322237660527e-09, "loss": 0.0, "num_input_tokens_seen": 136078312, "step": 201910 }, { "epoch": 4.932817042484059, "grad_norm": 0.0005160932778380811, "learning_rate": 1.1005288571702687e-09, "loss": 0.0, "num_input_tokens_seen": 136081896, "step": 201915 }, { "epoch": 4.932939193315907, "grad_norm": 0.0001272395602427423, "learning_rate": 1.0965327548401183e-09, "loss": 0.0, "num_input_tokens_seen": 136085480, "step": 201920 }, { "epoch": 4.933061344147753, "grad_norm": 0.0070088147185742855, "learning_rate": 1.092543916804689e-09, "loss": 0.0, "num_input_tokens_seen": 136089128, "step": 201925 }, { "epoch": 4.933183494979601, "grad_norm": 2.4680461137904786e-05, "learning_rate": 1.088562343092847e-09, "loss": 0.0, "num_input_tokens_seen": 136092584, "step": 201930 }, { "epoch": 4.933305645811448, "grad_norm": 0.0004222989082336426, "learning_rate": 1.084588033733791e-09, "loss": 0.0, "num_input_tokens_seen": 136095976, "step": 201935 }, { "epoch": 4.933427796643295, "grad_norm": 7.98075197963044e-05, "learning_rate": 1.0806209887561646e-09, "loss": 0.0, "num_input_tokens_seen": 136099624, "step": 201940 }, { "epoch": 4.933549947475142, "grad_norm": 0.0002973111695609987, "learning_rate": 1.0766612081889448e-09, "loss": 0.0, "num_input_tokens_seen": 136103272, "step": 201945 }, { "epoch": 4.93367209830699, "grad_norm": 0.015840444713830948, "learning_rate": 1.0727086920609973e-09, "loss": 0.0, "num_input_tokens_seen": 136106472, "step": 201950 }, { "epoch": 4.9337942491388365, "grad_norm": 0.0004685459425672889, "learning_rate": 1.068763440400966e-09, "loss": 0.0, "num_input_tokens_seen": 136109864, "step": 201955 }, { "epoch": 4.933916399970684, "grad_norm": 2.5634884877945296e-05, "learning_rate": 1.0648254532376055e-09, "loss": 0.0, "num_input_tokens_seen": 136113192, "step": 201960 }, { "epoch": 4.934038550802531, "grad_norm": 2.3012027668301016e-05, "learning_rate": 1.0608947305994487e-09, "loss": 0.0, "num_input_tokens_seen": 136116456, "step": 201965 }, { "epoch": 4.9341607016343785, "grad_norm": 0.0007491564028896391, "learning_rate": 1.0569712725151392e-09, "loss": 0.0, "num_input_tokens_seen": 136119848, "step": 201970 }, { "epoch": 4.934282852466225, "grad_norm": 0.0005802642554044724, "learning_rate": 1.0530550790132098e-09, "loss": 0.0, "num_input_tokens_seen": 136122664, "step": 201975 }, { "epoch": 4.934405003298073, "grad_norm": 6.311324978014454e-05, "learning_rate": 1.0491461501221932e-09, "loss": 0.0279, "num_input_tokens_seen": 136126056, "step": 201980 }, { "epoch": 4.93452715412992, "grad_norm": 2.978726479341276e-05, "learning_rate": 1.0452444858705113e-09, "loss": 0.0, "num_input_tokens_seen": 136129832, "step": 201985 }, { "epoch": 4.934649304961766, "grad_norm": 0.002146832412108779, "learning_rate": 1.0413500862864743e-09, "loss": 0.0, "num_input_tokens_seen": 136132968, "step": 201990 }, { "epoch": 4.934771455793614, "grad_norm": 0.0019067926332354546, "learning_rate": 1.0374629513983935e-09, "loss": 0.0, "num_input_tokens_seen": 136136488, "step": 201995 }, { "epoch": 4.934893606625462, "grad_norm": 2.646358552738093e-05, "learning_rate": 1.0335830812345792e-09, "loss": 0.0, "num_input_tokens_seen": 136140136, "step": 202000 }, { "epoch": 4.935015757457308, "grad_norm": 0.0009238035418093204, "learning_rate": 1.0297104758232311e-09, "loss": 0.0, "num_input_tokens_seen": 136143272, "step": 202005 }, { "epoch": 4.935137908289155, "grad_norm": 0.0005739349289797246, "learning_rate": 1.0258451351925491e-09, "loss": 0.0, "num_input_tokens_seen": 136146408, "step": 202010 }, { "epoch": 4.935260059121003, "grad_norm": 4.488255399337504e-06, "learning_rate": 1.0219870593706215e-09, "loss": 0.0, "num_input_tokens_seen": 136149480, "step": 202015 }, { "epoch": 4.9353822099528495, "grad_norm": 0.006313847843557596, "learning_rate": 1.0181362483854262e-09, "loss": 0.0, "num_input_tokens_seen": 136152616, "step": 202020 }, { "epoch": 4.935504360784697, "grad_norm": 0.00014504387218039483, "learning_rate": 1.0142927022650516e-09, "loss": 0.0, "num_input_tokens_seen": 136155688, "step": 202025 }, { "epoch": 4.935626511616544, "grad_norm": 0.00022232808987610042, "learning_rate": 1.0104564210374756e-09, "loss": 0.0, "num_input_tokens_seen": 136159016, "step": 202030 }, { "epoch": 4.9357486624483915, "grad_norm": 0.0006274926709011197, "learning_rate": 1.0066274047305645e-09, "loss": 0.0, "num_input_tokens_seen": 136162600, "step": 202035 }, { "epoch": 4.935870813280238, "grad_norm": 0.00015691977750975639, "learning_rate": 1.0028056533720742e-09, "loss": 0.0, "num_input_tokens_seen": 136165864, "step": 202040 }, { "epoch": 4.935992964112086, "grad_norm": 0.0005563534214161336, "learning_rate": 9.98991166989982e-10, "loss": 0.0, "num_input_tokens_seen": 136169128, "step": 202045 }, { "epoch": 4.936115114943933, "grad_norm": 0.0008726372034288943, "learning_rate": 9.951839456119327e-10, "loss": 0.0, "num_input_tokens_seen": 136173224, "step": 202050 }, { "epoch": 4.93623726577578, "grad_norm": 0.0030929571948945522, "learning_rate": 9.913839892654596e-10, "loss": 0.0, "num_input_tokens_seen": 136176936, "step": 202055 }, { "epoch": 4.936359416607627, "grad_norm": 0.00030830607283860445, "learning_rate": 9.875912979784296e-10, "loss": 0.0, "num_input_tokens_seen": 136180392, "step": 202060 }, { "epoch": 4.936481567439475, "grad_norm": 3.1983137887436897e-05, "learning_rate": 9.83805871778376e-10, "loss": 0.0, "num_input_tokens_seen": 136183400, "step": 202065 }, { "epoch": 4.936603718271321, "grad_norm": 0.00014813434972893447, "learning_rate": 9.800277106927213e-10, "loss": 0.0, "num_input_tokens_seen": 136186792, "step": 202070 }, { "epoch": 4.936725869103169, "grad_norm": 0.0002679832396097481, "learning_rate": 9.762568147491102e-10, "loss": 0.0, "num_input_tokens_seen": 136190376, "step": 202075 }, { "epoch": 4.936848019935016, "grad_norm": 4.157804505666718e-05, "learning_rate": 9.72493183974743e-10, "loss": 0.0, "num_input_tokens_seen": 136193384, "step": 202080 }, { "epoch": 4.9369701707668625, "grad_norm": 0.0003346680023241788, "learning_rate": 9.687368183972644e-10, "loss": 0.0, "num_input_tokens_seen": 136196840, "step": 202085 }, { "epoch": 4.93709232159871, "grad_norm": 1.67577982210787e-05, "learning_rate": 9.649877180437637e-10, "loss": 0.0, "num_input_tokens_seen": 136200360, "step": 202090 }, { "epoch": 4.937214472430557, "grad_norm": 0.009562824852764606, "learning_rate": 9.612458829415527e-10, "loss": 0.0, "num_input_tokens_seen": 136203944, "step": 202095 }, { "epoch": 4.9373366232624045, "grad_norm": 0.001202791347168386, "learning_rate": 9.575113131178315e-10, "loss": 0.0, "num_input_tokens_seen": 136207528, "step": 202100 }, { "epoch": 4.937458774094251, "grad_norm": 0.0003408331540413201, "learning_rate": 9.537840085998006e-10, "loss": 0.041, "num_input_tokens_seen": 136210472, "step": 202105 }, { "epoch": 4.937580924926099, "grad_norm": 0.00019882139167748392, "learning_rate": 9.500639694146606e-10, "loss": 0.0, "num_input_tokens_seen": 136213544, "step": 202110 }, { "epoch": 4.937703075757946, "grad_norm": 1.4465913409367204e-05, "learning_rate": 9.46351195589279e-10, "loss": 0.0, "num_input_tokens_seen": 136216616, "step": 202115 }, { "epoch": 4.937825226589793, "grad_norm": 7.180091051850468e-05, "learning_rate": 9.426456871508559e-10, "loss": 0.0, "num_input_tokens_seen": 136220008, "step": 202120 }, { "epoch": 4.93794737742164, "grad_norm": 1.3721350114792585e-05, "learning_rate": 9.38947444126148e-10, "loss": 0.0, "num_input_tokens_seen": 136223080, "step": 202125 }, { "epoch": 4.938069528253488, "grad_norm": 0.0034140758216381073, "learning_rate": 9.352564665421337e-10, "loss": 0.0, "num_input_tokens_seen": 136226664, "step": 202130 }, { "epoch": 4.938191679085334, "grad_norm": 0.0001253626251127571, "learning_rate": 9.315727544256801e-10, "loss": 0.0, "num_input_tokens_seen": 136229928, "step": 202135 }, { "epoch": 4.938313829917182, "grad_norm": 0.0001573254558024928, "learning_rate": 9.27896307803433e-10, "loss": 0.0, "num_input_tokens_seen": 136232936, "step": 202140 }, { "epoch": 4.938435980749029, "grad_norm": 0.00011776157043641433, "learning_rate": 9.242271267023705e-10, "loss": 0.0, "num_input_tokens_seen": 136236264, "step": 202145 }, { "epoch": 4.938558131580876, "grad_norm": 0.0010260837152600288, "learning_rate": 9.20565211149027e-10, "loss": 0.0002, "num_input_tokens_seen": 136239400, "step": 202150 }, { "epoch": 4.938680282412723, "grad_norm": 9.85598744591698e-05, "learning_rate": 9.169105611699369e-10, "loss": 0.0, "num_input_tokens_seen": 136242536, "step": 202155 }, { "epoch": 4.93880243324457, "grad_norm": 3.4223503462271765e-05, "learning_rate": 9.132631767919674e-10, "loss": 0.0, "num_input_tokens_seen": 136245736, "step": 202160 }, { "epoch": 4.9389245840764175, "grad_norm": 9.328880878456403e-06, "learning_rate": 9.096230580413201e-10, "loss": 0.0, "num_input_tokens_seen": 136249064, "step": 202165 }, { "epoch": 4.939046734908265, "grad_norm": 0.0008164794999174774, "learning_rate": 9.05990204944751e-10, "loss": 0.0, "num_input_tokens_seen": 136252456, "step": 202170 }, { "epoch": 4.939168885740112, "grad_norm": 0.00014976067177485675, "learning_rate": 9.023646175284616e-10, "loss": 0.0, "num_input_tokens_seen": 136256104, "step": 202175 }, { "epoch": 4.939291036571959, "grad_norm": 2.89030449494021e-05, "learning_rate": 8.987462958189862e-10, "loss": 0.0, "num_input_tokens_seen": 136259432, "step": 202180 }, { "epoch": 4.939413187403806, "grad_norm": 0.0013129940489307046, "learning_rate": 8.95135239842415e-10, "loss": 0.0, "num_input_tokens_seen": 136262888, "step": 202185 }, { "epoch": 4.939535338235653, "grad_norm": 0.01688556745648384, "learning_rate": 8.915314496252824e-10, "loss": 0.0, "num_input_tokens_seen": 136265896, "step": 202190 }, { "epoch": 4.939657489067501, "grad_norm": 0.0009134452557191253, "learning_rate": 8.879349251935675e-10, "loss": 0.0, "num_input_tokens_seen": 136269288, "step": 202195 }, { "epoch": 4.939779639899347, "grad_norm": 0.0002523370203562081, "learning_rate": 8.843456665735827e-10, "loss": 0.0, "num_input_tokens_seen": 136272552, "step": 202200 }, { "epoch": 4.939901790731195, "grad_norm": 0.00024008109176065773, "learning_rate": 8.807636737913071e-10, "loss": 0.0, "num_input_tokens_seen": 136275688, "step": 202205 }, { "epoch": 4.940023941563042, "grad_norm": 0.0004295228864066303, "learning_rate": 8.771889468728311e-10, "loss": 0.0, "num_input_tokens_seen": 136278632, "step": 202210 }, { "epoch": 4.940146092394889, "grad_norm": 3.370600097696297e-05, "learning_rate": 8.736214858442448e-10, "loss": 0.0, "num_input_tokens_seen": 136282152, "step": 202215 }, { "epoch": 4.940268243226736, "grad_norm": 7.857001764932647e-05, "learning_rate": 8.700612907314164e-10, "loss": 0.0, "num_input_tokens_seen": 136285160, "step": 202220 }, { "epoch": 4.940390394058584, "grad_norm": 4.169629391981289e-05, "learning_rate": 8.665083615602142e-10, "loss": 0.0576, "num_input_tokens_seen": 136288424, "step": 202225 }, { "epoch": 4.940512544890431, "grad_norm": 5.379470530897379e-05, "learning_rate": 8.629626983565064e-10, "loss": 0.1, "num_input_tokens_seen": 136291560, "step": 202230 }, { "epoch": 4.940634695722278, "grad_norm": 0.0005903999553993344, "learning_rate": 8.5942430114605e-10, "loss": 0.0, "num_input_tokens_seen": 136294376, "step": 202235 }, { "epoch": 4.940756846554125, "grad_norm": 1.558191615913529e-05, "learning_rate": 8.558931699546023e-10, "loss": 0.0, "num_input_tokens_seen": 136297384, "step": 202240 }, { "epoch": 4.940878997385973, "grad_norm": 0.0004315730766393244, "learning_rate": 8.523693048078096e-10, "loss": 0.0, "num_input_tokens_seen": 136300968, "step": 202245 }, { "epoch": 4.941001148217819, "grad_norm": 0.005034905392676592, "learning_rate": 8.488527057313177e-10, "loss": 0.0, "num_input_tokens_seen": 136304552, "step": 202250 }, { "epoch": 4.941123299049666, "grad_norm": 2.2010641259839758e-05, "learning_rate": 8.45343372750773e-10, "loss": 0.0, "num_input_tokens_seen": 136308520, "step": 202255 }, { "epoch": 4.941245449881514, "grad_norm": 0.00024630461120978, "learning_rate": 8.418413058915997e-10, "loss": 0.0, "num_input_tokens_seen": 136312104, "step": 202260 }, { "epoch": 4.941367600713361, "grad_norm": 5.707953096134588e-05, "learning_rate": 8.383465051792216e-10, "loss": 0.0002, "num_input_tokens_seen": 136315304, "step": 202265 }, { "epoch": 4.941489751545208, "grad_norm": 0.0013829541858285666, "learning_rate": 8.34858970639285e-10, "loss": 0.0, "num_input_tokens_seen": 136318312, "step": 202270 }, { "epoch": 4.941611902377055, "grad_norm": 0.0006188564002513885, "learning_rate": 8.31378702296881e-10, "loss": 0.0376, "num_input_tokens_seen": 136322024, "step": 202275 }, { "epoch": 4.9417340532089025, "grad_norm": 0.0002772464358713478, "learning_rate": 8.279057001774336e-10, "loss": 0.0, "num_input_tokens_seen": 136325160, "step": 202280 }, { "epoch": 4.941856204040749, "grad_norm": 0.00021161598851904273, "learning_rate": 8.244399643062561e-10, "loss": 0.0001, "num_input_tokens_seen": 136328296, "step": 202285 }, { "epoch": 4.941978354872597, "grad_norm": 2.8740387278958224e-05, "learning_rate": 8.209814947084392e-10, "loss": 0.0, "num_input_tokens_seen": 136332584, "step": 202290 }, { "epoch": 4.942100505704444, "grad_norm": 0.00011233813711442053, "learning_rate": 8.175302914092963e-10, "loss": 0.0, "num_input_tokens_seen": 136335912, "step": 202295 }, { "epoch": 4.942222656536291, "grad_norm": 4.50963998446241e-05, "learning_rate": 8.140863544336963e-10, "loss": 0.0, "num_input_tokens_seen": 136338856, "step": 202300 }, { "epoch": 4.942344807368138, "grad_norm": 0.04743233323097229, "learning_rate": 8.106496838069521e-10, "loss": 0.0, "num_input_tokens_seen": 136341928, "step": 202305 }, { "epoch": 4.942466958199986, "grad_norm": 0.00042162369936704636, "learning_rate": 8.072202795538219e-10, "loss": 0.0, "num_input_tokens_seen": 136345256, "step": 202310 }, { "epoch": 4.942589109031832, "grad_norm": 2.6135880034416914e-05, "learning_rate": 8.037981416992857e-10, "loss": 0.0, "num_input_tokens_seen": 136348456, "step": 202315 }, { "epoch": 4.94271125986368, "grad_norm": 0.0002569279167801142, "learning_rate": 8.003832702683233e-10, "loss": 0.0, "num_input_tokens_seen": 136351720, "step": 202320 }, { "epoch": 4.942833410695527, "grad_norm": 0.0001678541739238426, "learning_rate": 7.969756652858039e-10, "loss": 0.0, "num_input_tokens_seen": 136355752, "step": 202325 }, { "epoch": 4.942955561527374, "grad_norm": 4.449135303730145e-05, "learning_rate": 7.935753267763745e-10, "loss": 0.0, "num_input_tokens_seen": 136359016, "step": 202330 }, { "epoch": 4.943077712359221, "grad_norm": 2.2052936401451007e-05, "learning_rate": 7.901822547647929e-10, "loss": 0.0855, "num_input_tokens_seen": 136361704, "step": 202335 }, { "epoch": 4.943199863191069, "grad_norm": 0.0033455921802669764, "learning_rate": 7.867964492758172e-10, "loss": 0.0, "num_input_tokens_seen": 136365096, "step": 202340 }, { "epoch": 4.9433220140229155, "grad_norm": 0.0006569712422788143, "learning_rate": 7.834179103339833e-10, "loss": 0.0, "num_input_tokens_seen": 136368744, "step": 202345 }, { "epoch": 4.943444164854762, "grad_norm": 101.92816162109375, "learning_rate": 7.800466379638271e-10, "loss": 0.0501, "num_input_tokens_seen": 136372392, "step": 202350 }, { "epoch": 4.94356631568661, "grad_norm": 0.00010263787407893687, "learning_rate": 7.766826321899955e-10, "loss": 0.0, "num_input_tokens_seen": 136376168, "step": 202355 }, { "epoch": 4.943688466518457, "grad_norm": 0.002447111066430807, "learning_rate": 7.733258930369135e-10, "loss": 0.0, "num_input_tokens_seen": 136379368, "step": 202360 }, { "epoch": 4.943810617350304, "grad_norm": 0.00013913783186580986, "learning_rate": 7.69976420528895e-10, "loss": 0.0, "num_input_tokens_seen": 136382888, "step": 202365 }, { "epoch": 4.943932768182151, "grad_norm": 0.0009353599161840975, "learning_rate": 7.666342146904759e-10, "loss": 0.0, "num_input_tokens_seen": 136386472, "step": 202370 }, { "epoch": 4.944054919013999, "grad_norm": 0.0001008975159493275, "learning_rate": 7.632992755457479e-10, "loss": 0.0, "num_input_tokens_seen": 136389864, "step": 202375 }, { "epoch": 4.944177069845845, "grad_norm": 0.016027770936489105, "learning_rate": 7.599716031191361e-10, "loss": 0.0, "num_input_tokens_seen": 136393512, "step": 202380 }, { "epoch": 4.944299220677693, "grad_norm": 0.0003744078567251563, "learning_rate": 7.566511974347322e-10, "loss": 0.0, "num_input_tokens_seen": 136396712, "step": 202385 }, { "epoch": 4.94442137150954, "grad_norm": 9.113190026255324e-05, "learning_rate": 7.533380585167393e-10, "loss": 0.0001, "num_input_tokens_seen": 136400104, "step": 202390 }, { "epoch": 4.944543522341387, "grad_norm": 0.00021912145894020796, "learning_rate": 7.50032186389249e-10, "loss": 0.0, "num_input_tokens_seen": 136403432, "step": 202395 }, { "epoch": 4.944665673173234, "grad_norm": 0.00018315493070986122, "learning_rate": 7.467335810762421e-10, "loss": 0.0, "num_input_tokens_seen": 136406888, "step": 202400 }, { "epoch": 4.944787824005082, "grad_norm": 5.3936601034365594e-06, "learning_rate": 7.434422426018105e-10, "loss": 0.0, "num_input_tokens_seen": 136410280, "step": 202405 }, { "epoch": 4.9449099748369285, "grad_norm": 0.0018063209718093276, "learning_rate": 7.401581709898241e-10, "loss": 0.0, "num_input_tokens_seen": 136413544, "step": 202410 }, { "epoch": 4.945032125668776, "grad_norm": 0.0017041267128661275, "learning_rate": 7.368813662641527e-10, "loss": 0.0, "num_input_tokens_seen": 136416872, "step": 202415 }, { "epoch": 4.945154276500623, "grad_norm": 0.0022793838288635015, "learning_rate": 7.336118284486659e-10, "loss": 0.0, "num_input_tokens_seen": 136420008, "step": 202420 }, { "epoch": 4.94527642733247, "grad_norm": 0.005071424413472414, "learning_rate": 7.303495575671226e-10, "loss": 0.0, "num_input_tokens_seen": 136423656, "step": 202425 }, { "epoch": 4.945398578164317, "grad_norm": 0.000549265940207988, "learning_rate": 7.270945536431705e-10, "loss": 0.0, "num_input_tokens_seen": 136427048, "step": 202430 }, { "epoch": 4.945520728996165, "grad_norm": 6.238095375010744e-05, "learning_rate": 7.238468167006795e-10, "loss": 0.0, "num_input_tokens_seen": 136432552, "step": 202435 }, { "epoch": 4.945642879828012, "grad_norm": 4.154796260991134e-05, "learning_rate": 7.206063467630752e-10, "loss": 0.0, "num_input_tokens_seen": 136435816, "step": 202440 }, { "epoch": 4.945765030659858, "grad_norm": 0.0002437532675685361, "learning_rate": 7.173731438540054e-10, "loss": 0.0, "num_input_tokens_seen": 136439080, "step": 202445 }, { "epoch": 4.945887181491706, "grad_norm": 0.00020424068497959524, "learning_rate": 7.141472079970068e-10, "loss": 0.0, "num_input_tokens_seen": 136442024, "step": 202450 }, { "epoch": 4.946009332323553, "grad_norm": 9.218160266755149e-05, "learning_rate": 7.109285392155051e-10, "loss": 0.0, "num_input_tokens_seen": 136445352, "step": 202455 }, { "epoch": 4.9461314831554, "grad_norm": 9.915697592077777e-05, "learning_rate": 7.077171375329261e-10, "loss": 0.0, "num_input_tokens_seen": 136448616, "step": 202460 }, { "epoch": 4.946253633987247, "grad_norm": 0.000816051266156137, "learning_rate": 7.045130029725843e-10, "loss": 0.0, "num_input_tokens_seen": 136451816, "step": 202465 }, { "epoch": 4.946375784819095, "grad_norm": 0.09224054962396622, "learning_rate": 7.013161355577945e-10, "loss": 0.0, "num_input_tokens_seen": 136455720, "step": 202470 }, { "epoch": 4.9464979356509415, "grad_norm": 0.0026561652775853872, "learning_rate": 6.981265353117605e-10, "loss": 0.0, "num_input_tokens_seen": 136459304, "step": 202475 }, { "epoch": 4.946620086482789, "grad_norm": 0.032723743468523026, "learning_rate": 6.949442022577967e-10, "loss": 0.0, "num_input_tokens_seen": 136462632, "step": 202480 }, { "epoch": 4.946742237314636, "grad_norm": 9.901898010866717e-06, "learning_rate": 6.917691364188849e-10, "loss": 0.0, "num_input_tokens_seen": 136466152, "step": 202485 }, { "epoch": 4.9468643881464835, "grad_norm": 0.44688522815704346, "learning_rate": 6.886013378183397e-10, "loss": 0.0001, "num_input_tokens_seen": 136469352, "step": 202490 }, { "epoch": 4.94698653897833, "grad_norm": 2.6838359190151095e-05, "learning_rate": 6.854408064790318e-10, "loss": 0.0, "num_input_tokens_seen": 136473512, "step": 202495 }, { "epoch": 4.947108689810178, "grad_norm": 0.0001793210831237957, "learning_rate": 6.822875424239427e-10, "loss": 0.0, "num_input_tokens_seen": 136477416, "step": 202500 }, { "epoch": 4.947230840642025, "grad_norm": 6.435633258661255e-05, "learning_rate": 6.79141545676054e-10, "loss": 0.0, "num_input_tokens_seen": 136480616, "step": 202505 }, { "epoch": 4.947352991473872, "grad_norm": 0.00011103512952104211, "learning_rate": 6.760028162582365e-10, "loss": 0.0, "num_input_tokens_seen": 136483752, "step": 202510 }, { "epoch": 4.947475142305719, "grad_norm": 0.00047665010788477957, "learning_rate": 6.728713541933606e-10, "loss": 0.0, "num_input_tokens_seen": 136486760, "step": 202515 }, { "epoch": 4.947597293137566, "grad_norm": 0.00012407048780005425, "learning_rate": 6.697471595040749e-10, "loss": 0.0, "num_input_tokens_seen": 136490088, "step": 202520 }, { "epoch": 4.947719443969413, "grad_norm": 9.188240073854104e-05, "learning_rate": 6.6663023221325e-10, "loss": 0.0, "num_input_tokens_seen": 136493928, "step": 202525 }, { "epoch": 4.947841594801261, "grad_norm": 0.0026419502682983875, "learning_rate": 6.635205723434234e-10, "loss": 0.0, "num_input_tokens_seen": 136497832, "step": 202530 }, { "epoch": 4.947963745633108, "grad_norm": 2.2201460524229333e-05, "learning_rate": 6.604181799172437e-10, "loss": 0.0, "num_input_tokens_seen": 136500968, "step": 202535 }, { "epoch": 4.948085896464955, "grad_norm": 0.0008757863542996347, "learning_rate": 6.573230549573594e-10, "loss": 0.0, "num_input_tokens_seen": 136504296, "step": 202540 }, { "epoch": 4.948208047296802, "grad_norm": 0.0021094870753586292, "learning_rate": 6.54235197486197e-10, "loss": 0.0, "num_input_tokens_seen": 136508008, "step": 202545 }, { "epoch": 4.948330198128649, "grad_norm": 0.00016692510689608753, "learning_rate": 6.511546075261831e-10, "loss": 0.0, "num_input_tokens_seen": 136511656, "step": 202550 }, { "epoch": 4.948452348960497, "grad_norm": 0.0003622740041464567, "learning_rate": 6.480812850997442e-10, "loss": 0.0, "num_input_tokens_seen": 136514856, "step": 202555 }, { "epoch": 4.948574499792343, "grad_norm": 0.00023636213154532015, "learning_rate": 6.450152302293066e-10, "loss": 0.0, "num_input_tokens_seen": 136518888, "step": 202560 }, { "epoch": 4.948696650624191, "grad_norm": 0.0003470900119282305, "learning_rate": 6.41956442937186e-10, "loss": 0.0, "num_input_tokens_seen": 136522344, "step": 202565 }, { "epoch": 4.948818801456038, "grad_norm": 0.005127554759383202, "learning_rate": 6.389049232454757e-10, "loss": 0.0, "num_input_tokens_seen": 136525544, "step": 202570 }, { "epoch": 4.948940952287885, "grad_norm": 6.562341877724975e-05, "learning_rate": 6.358606711763803e-10, "loss": 0.0, "num_input_tokens_seen": 136528872, "step": 202575 }, { "epoch": 4.949063103119732, "grad_norm": 0.0006907072965987027, "learning_rate": 6.328236867522152e-10, "loss": 0.0, "num_input_tokens_seen": 136532264, "step": 202580 }, { "epoch": 4.94918525395158, "grad_norm": 21.064266204833984, "learning_rate": 6.297939699948518e-10, "loss": 0.0615, "num_input_tokens_seen": 136535272, "step": 202585 }, { "epoch": 4.9493074047834265, "grad_norm": 6.251333252293989e-05, "learning_rate": 6.267715209264945e-10, "loss": 0.0, "num_input_tokens_seen": 136538664, "step": 202590 }, { "epoch": 4.949429555615274, "grad_norm": 0.0009637015173211694, "learning_rate": 6.237563395690149e-10, "loss": 0.0, "num_input_tokens_seen": 136542120, "step": 202595 }, { "epoch": 4.949551706447121, "grad_norm": 0.0023319462779909372, "learning_rate": 6.207484259443952e-10, "loss": 0.0, "num_input_tokens_seen": 136545768, "step": 202600 }, { "epoch": 4.9496738572789685, "grad_norm": 0.2298898547887802, "learning_rate": 6.177477800745067e-10, "loss": 0.0, "num_input_tokens_seen": 136548968, "step": 202605 }, { "epoch": 4.949796008110815, "grad_norm": 0.0007136244676075876, "learning_rate": 6.147544019812212e-10, "loss": 0.0, "num_input_tokens_seen": 136552168, "step": 202610 }, { "epoch": 4.949918158942662, "grad_norm": 0.00026461650850251317, "learning_rate": 6.117682916861877e-10, "loss": 0.0, "num_input_tokens_seen": 136555432, "step": 202615 }, { "epoch": 4.95004030977451, "grad_norm": 8.239582530222833e-05, "learning_rate": 6.087894492111667e-10, "loss": 0.0, "num_input_tokens_seen": 136558952, "step": 202620 }, { "epoch": 4.950162460606357, "grad_norm": 0.00012572153354994953, "learning_rate": 6.058178745778076e-10, "loss": 0.0, "num_input_tokens_seen": 136562216, "step": 202625 }, { "epoch": 4.950284611438204, "grad_norm": 0.00047976983478292823, "learning_rate": 6.028535678077595e-10, "loss": 0.0, "num_input_tokens_seen": 136565864, "step": 202630 }, { "epoch": 4.950406762270051, "grad_norm": 0.004869820084422827, "learning_rate": 5.998965289225611e-10, "loss": 0.0, "num_input_tokens_seen": 136569128, "step": 202635 }, { "epoch": 4.950528913101898, "grad_norm": 4.379753227112815e-05, "learning_rate": 5.969467579437504e-10, "loss": 0.0, "num_input_tokens_seen": 136572200, "step": 202640 }, { "epoch": 4.950651063933745, "grad_norm": 1.0504995771043468e-05, "learning_rate": 5.940042548927548e-10, "loss": 0.0, "num_input_tokens_seen": 136575592, "step": 202645 }, { "epoch": 4.950773214765593, "grad_norm": 6.727211439283565e-05, "learning_rate": 5.910690197908908e-10, "loss": 0.0, "num_input_tokens_seen": 136578920, "step": 202650 }, { "epoch": 4.9508953655974395, "grad_norm": 4.358491423772648e-05, "learning_rate": 5.881410526595854e-10, "loss": 0.0, "num_input_tokens_seen": 136581992, "step": 202655 }, { "epoch": 4.951017516429287, "grad_norm": 0.00016454195429105312, "learning_rate": 5.85220353520266e-10, "loss": 0.0, "num_input_tokens_seen": 136585576, "step": 202660 }, { "epoch": 4.951139667261134, "grad_norm": 5.1774670282611623e-05, "learning_rate": 5.823069223939159e-10, "loss": 0.0, "num_input_tokens_seen": 136589480, "step": 202665 }, { "epoch": 4.9512618180929815, "grad_norm": 0.000128003244753927, "learning_rate": 5.794007593018512e-10, "loss": 0.0, "num_input_tokens_seen": 136592808, "step": 202670 }, { "epoch": 4.951383968924828, "grad_norm": 0.0002087876491714269, "learning_rate": 5.765018642652775e-10, "loss": 0.0, "num_input_tokens_seen": 136596200, "step": 202675 }, { "epoch": 4.951506119756676, "grad_norm": 0.001459281425923109, "learning_rate": 5.736102373050666e-10, "loss": 0.0, "num_input_tokens_seen": 136599848, "step": 202680 }, { "epoch": 4.951628270588523, "grad_norm": 5.585969483945519e-05, "learning_rate": 5.707258784424241e-10, "loss": 0.0, "num_input_tokens_seen": 136603048, "step": 202685 }, { "epoch": 4.95175042142037, "grad_norm": 0.0025811134837567806, "learning_rate": 5.678487876983329e-10, "loss": 0.0, "num_input_tokens_seen": 136607080, "step": 202690 }, { "epoch": 4.951872572252217, "grad_norm": 0.0004729589563794434, "learning_rate": 5.649789650936654e-10, "loss": 0.0, "num_input_tokens_seen": 136610344, "step": 202695 }, { "epoch": 4.951994723084065, "grad_norm": 0.00126547587569803, "learning_rate": 5.621164106491827e-10, "loss": 0.0, "num_input_tokens_seen": 136613800, "step": 202700 }, { "epoch": 4.952116873915911, "grad_norm": 0.00017267995281144977, "learning_rate": 5.592611243858681e-10, "loss": 0.0, "num_input_tokens_seen": 136617064, "step": 202705 }, { "epoch": 4.952239024747758, "grad_norm": 0.0011236423160880804, "learning_rate": 5.564131063244826e-10, "loss": 0.0, "num_input_tokens_seen": 136620392, "step": 202710 }, { "epoch": 4.952361175579606, "grad_norm": 0.0035760272294282913, "learning_rate": 5.535723564855654e-10, "loss": 0.0346, "num_input_tokens_seen": 136623528, "step": 202715 }, { "epoch": 4.9524833264114525, "grad_norm": 0.04028363898396492, "learning_rate": 5.507388748899889e-10, "loss": 0.0, "num_input_tokens_seen": 136626728, "step": 202720 }, { "epoch": 4.9526054772433, "grad_norm": 8.850402082316577e-05, "learning_rate": 5.479126615581808e-10, "loss": 0.0, "num_input_tokens_seen": 136630120, "step": 202725 }, { "epoch": 4.952727628075147, "grad_norm": 8.48881700221682e-06, "learning_rate": 5.450937165109026e-10, "loss": 0.0437, "num_input_tokens_seen": 136633448, "step": 202730 }, { "epoch": 4.9528497789069945, "grad_norm": 0.0008506285957992077, "learning_rate": 5.422820397683603e-10, "loss": 0.0, "num_input_tokens_seen": 136636648, "step": 202735 }, { "epoch": 4.952971929738841, "grad_norm": 0.0005138792330399156, "learning_rate": 5.394776313512039e-10, "loss": 0.0, "num_input_tokens_seen": 136640424, "step": 202740 }, { "epoch": 4.953094080570689, "grad_norm": 0.0001250112400157377, "learning_rate": 5.366804912798617e-10, "loss": 0.0, "num_input_tokens_seen": 136644008, "step": 202745 }, { "epoch": 4.953216231402536, "grad_norm": 0.0009678230853751302, "learning_rate": 5.338906195745396e-10, "loss": 0.0, "num_input_tokens_seen": 136647528, "step": 202750 }, { "epoch": 4.953338382234383, "grad_norm": 0.0006666900007985532, "learning_rate": 5.311080162556658e-10, "loss": 0.0, "num_input_tokens_seen": 136651048, "step": 202755 }, { "epoch": 4.95346053306623, "grad_norm": 0.0007440054323524237, "learning_rate": 5.283326813433353e-10, "loss": 0.0, "num_input_tokens_seen": 136654568, "step": 202760 }, { "epoch": 4.953582683898078, "grad_norm": 0.006778331473469734, "learning_rate": 5.255646148577542e-10, "loss": 0.0, "num_input_tokens_seen": 136658152, "step": 202765 }, { "epoch": 4.953704834729924, "grad_norm": 4.8525103920837864e-05, "learning_rate": 5.228038168191284e-10, "loss": 0.0, "num_input_tokens_seen": 136661608, "step": 202770 }, { "epoch": 4.953826985561772, "grad_norm": 0.0004143420956097543, "learning_rate": 5.200502872475531e-10, "loss": 0.0, "num_input_tokens_seen": 136664744, "step": 202775 }, { "epoch": 4.953949136393619, "grad_norm": 2.905308429035358e-05, "learning_rate": 5.173040261629014e-10, "loss": 0.0, "num_input_tokens_seen": 136668584, "step": 202780 }, { "epoch": 4.9540712872254655, "grad_norm": 0.0058879847638309, "learning_rate": 5.145650335853791e-10, "loss": 0.0, "num_input_tokens_seen": 136671912, "step": 202785 }, { "epoch": 4.954193438057313, "grad_norm": 0.00014649657532572746, "learning_rate": 5.118333095346372e-10, "loss": 0.0, "num_input_tokens_seen": 136675368, "step": 202790 }, { "epoch": 4.954315588889161, "grad_norm": 1.4093015124672092e-05, "learning_rate": 5.091088540307708e-10, "loss": 0.0, "num_input_tokens_seen": 136678888, "step": 202795 }, { "epoch": 4.9544377397210075, "grad_norm": 0.0002563645539339632, "learning_rate": 5.06391667093431e-10, "loss": 0.0, "num_input_tokens_seen": 136682600, "step": 202800 }, { "epoch": 4.954559890552854, "grad_norm": 0.0004114148614462465, "learning_rate": 5.036817487424905e-10, "loss": 0.0, "num_input_tokens_seen": 136686184, "step": 202805 }, { "epoch": 4.954682041384702, "grad_norm": 0.0030616470612585545, "learning_rate": 5.009790989974893e-10, "loss": 0.0, "num_input_tokens_seen": 136689320, "step": 202810 }, { "epoch": 4.954804192216549, "grad_norm": 0.00034004944609478116, "learning_rate": 4.982837178783006e-10, "loss": 0.0, "num_input_tokens_seen": 136692584, "step": 202815 }, { "epoch": 4.954926343048396, "grad_norm": 0.0015535084530711174, "learning_rate": 4.955956054044641e-10, "loss": 0.0001, "num_input_tokens_seen": 136696104, "step": 202820 }, { "epoch": 4.955048493880243, "grad_norm": 0.007306914310902357, "learning_rate": 4.929147615954088e-10, "loss": 0.0, "num_input_tokens_seen": 136699496, "step": 202825 }, { "epoch": 4.955170644712091, "grad_norm": 0.00029027010896243155, "learning_rate": 4.902411864707856e-10, "loss": 0.0, "num_input_tokens_seen": 136703080, "step": 202830 }, { "epoch": 4.955292795543937, "grad_norm": 0.00010325389303034171, "learning_rate": 4.875748800499124e-10, "loss": 0.0, "num_input_tokens_seen": 136706536, "step": 202835 }, { "epoch": 4.955414946375785, "grad_norm": 0.00011646476195892319, "learning_rate": 4.849158423522181e-10, "loss": 0.0, "num_input_tokens_seen": 136709608, "step": 202840 }, { "epoch": 4.955537097207632, "grad_norm": 0.0016139474464580417, "learning_rate": 4.822640733971317e-10, "loss": 0.0, "num_input_tokens_seen": 136712872, "step": 202845 }, { "epoch": 4.955659248039479, "grad_norm": 0.0004955488257110119, "learning_rate": 4.796195732038599e-10, "loss": 0.0, "num_input_tokens_seen": 136715752, "step": 202850 }, { "epoch": 4.955781398871326, "grad_norm": 0.0006088154623284936, "learning_rate": 4.769823417914987e-10, "loss": 0.0, "num_input_tokens_seen": 136718952, "step": 202855 }, { "epoch": 4.955903549703174, "grad_norm": 0.0019494992448017001, "learning_rate": 4.743523791794768e-10, "loss": 0.0, "num_input_tokens_seen": 136722344, "step": 202860 }, { "epoch": 4.956025700535021, "grad_norm": 0.00047076281043700874, "learning_rate": 4.717296853867791e-10, "loss": 0.0, "num_input_tokens_seen": 136725608, "step": 202865 }, { "epoch": 4.956147851366868, "grad_norm": 9.878385753836483e-05, "learning_rate": 4.691142604325016e-10, "loss": 0.0, "num_input_tokens_seen": 136729384, "step": 202870 }, { "epoch": 4.956270002198715, "grad_norm": 0.0019373642280697823, "learning_rate": 4.665061043356289e-10, "loss": 0.0, "num_input_tokens_seen": 136732584, "step": 202875 }, { "epoch": 4.956392153030562, "grad_norm": 0.0004523659299593419, "learning_rate": 4.639052171152569e-10, "loss": 0.0, "num_input_tokens_seen": 136735976, "step": 202880 }, { "epoch": 4.956514303862409, "grad_norm": 2.4888262487365864e-05, "learning_rate": 4.6131159879014834e-10, "loss": 0.0, "num_input_tokens_seen": 136739432, "step": 202885 }, { "epoch": 4.956636454694257, "grad_norm": 0.001991266617551446, "learning_rate": 4.5872524937917713e-10, "loss": 0.0, "num_input_tokens_seen": 136742760, "step": 202890 }, { "epoch": 4.956758605526104, "grad_norm": 5.5790322221582755e-05, "learning_rate": 4.5614616890121693e-10, "loss": 0.0, "num_input_tokens_seen": 136746088, "step": 202895 }, { "epoch": 4.9568807563579504, "grad_norm": 0.004070554859936237, "learning_rate": 4.535743573750306e-10, "loss": 0.0, "num_input_tokens_seen": 136749544, "step": 202900 }, { "epoch": 4.957002907189798, "grad_norm": 0.0001542905520182103, "learning_rate": 4.5100981481938085e-10, "loss": 0.0, "num_input_tokens_seen": 136753064, "step": 202905 }, { "epoch": 4.957125058021645, "grad_norm": 0.0006492988322861493, "learning_rate": 4.484525412526974e-10, "loss": 0.0717, "num_input_tokens_seen": 136756136, "step": 202910 }, { "epoch": 4.9572472088534925, "grad_norm": 0.0013533460441976786, "learning_rate": 4.4590253669385404e-10, "loss": 0.0043, "num_input_tokens_seen": 136759336, "step": 202915 }, { "epoch": 4.957369359685339, "grad_norm": 1.4056084182811901e-05, "learning_rate": 4.4335980116116946e-10, "loss": 0.0, "num_input_tokens_seen": 136762728, "step": 202920 }, { "epoch": 4.957491510517187, "grad_norm": 5.709950710297562e-05, "learning_rate": 4.4082433467318436e-10, "loss": 0.0, "num_input_tokens_seen": 136765992, "step": 202925 }, { "epoch": 4.957613661349034, "grad_norm": 0.002161990851163864, "learning_rate": 4.382961372484395e-10, "loss": 0.0, "num_input_tokens_seen": 136770024, "step": 202930 }, { "epoch": 4.957735812180881, "grad_norm": 1.5122742297535297e-05, "learning_rate": 4.3577520890525353e-10, "loss": 0.0001, "num_input_tokens_seen": 136773160, "step": 202935 }, { "epoch": 4.957857963012728, "grad_norm": 0.001000594929791987, "learning_rate": 4.332615496619452e-10, "loss": 0.0, "num_input_tokens_seen": 136776872, "step": 202940 }, { "epoch": 4.957980113844576, "grad_norm": 0.0017755540320649743, "learning_rate": 4.3075515953683306e-10, "loss": 0.0, "num_input_tokens_seen": 136780648, "step": 202945 }, { "epoch": 4.958102264676422, "grad_norm": 6.09001363045536e-05, "learning_rate": 4.2825603854801385e-10, "loss": 0.0, "num_input_tokens_seen": 136783912, "step": 202950 }, { "epoch": 4.95822441550827, "grad_norm": 0.0006439242861233652, "learning_rate": 4.257641867139172e-10, "loss": 0.0001, "num_input_tokens_seen": 136787048, "step": 202955 }, { "epoch": 4.958346566340117, "grad_norm": 2.781920375127811e-05, "learning_rate": 4.2327960405241783e-10, "loss": 0.0, "num_input_tokens_seen": 136790952, "step": 202960 }, { "epoch": 4.958468717171964, "grad_norm": 0.0003997894236817956, "learning_rate": 4.2080229058172325e-10, "loss": 0.0, "num_input_tokens_seen": 136794600, "step": 202965 }, { "epoch": 4.958590868003811, "grad_norm": 0.03980407118797302, "learning_rate": 4.183322463198191e-10, "loss": 0.0, "num_input_tokens_seen": 136797992, "step": 202970 }, { "epoch": 4.958713018835658, "grad_norm": 4.830466423300095e-06, "learning_rate": 4.1586947128458006e-10, "loss": 0.0, "num_input_tokens_seen": 136801640, "step": 202975 }, { "epoch": 4.9588351696675055, "grad_norm": 0.033779554069042206, "learning_rate": 4.134139654941027e-10, "loss": 0.0, "num_input_tokens_seen": 136804968, "step": 202980 }, { "epoch": 4.958957320499352, "grad_norm": 5.306170351104811e-05, "learning_rate": 4.109657289660395e-10, "loss": 0.0, "num_input_tokens_seen": 136807848, "step": 202985 }, { "epoch": 4.9590794713312, "grad_norm": 0.00020911532919853926, "learning_rate": 4.085247617183762e-10, "loss": 0.0, "num_input_tokens_seen": 136811368, "step": 202990 }, { "epoch": 4.959201622163047, "grad_norm": 0.00023600812710355967, "learning_rate": 4.0609106376876537e-10, "loss": 0.0, "num_input_tokens_seen": 136814376, "step": 202995 }, { "epoch": 4.959323772994894, "grad_norm": 9.761666296981275e-05, "learning_rate": 4.036646351348594e-10, "loss": 0.0, "num_input_tokens_seen": 136817704, "step": 203000 }, { "epoch": 4.959445923826741, "grad_norm": 1.3856853001925629e-05, "learning_rate": 4.012454758344219e-10, "loss": 0.0, "num_input_tokens_seen": 136821096, "step": 203005 }, { "epoch": 4.959568074658589, "grad_norm": 4.735983020509593e-05, "learning_rate": 3.988335858849945e-10, "loss": 0.0, "num_input_tokens_seen": 136825320, "step": 203010 }, { "epoch": 4.959690225490435, "grad_norm": 0.002790002152323723, "learning_rate": 3.964289653040076e-10, "loss": 0.0, "num_input_tokens_seen": 136828520, "step": 203015 }, { "epoch": 4.959812376322283, "grad_norm": 0.001175225363112986, "learning_rate": 3.940316141091138e-10, "loss": 0.0, "num_input_tokens_seen": 136831784, "step": 203020 }, { "epoch": 4.95993452715413, "grad_norm": 0.002540030749514699, "learning_rate": 3.9164153231774353e-10, "loss": 0.0, "num_input_tokens_seen": 136835688, "step": 203025 }, { "epoch": 4.960056677985977, "grad_norm": 0.0004663171130232513, "learning_rate": 3.8925871994710536e-10, "loss": 0.0738, "num_input_tokens_seen": 136838760, "step": 203030 }, { "epoch": 4.960178828817824, "grad_norm": 0.005429753102362156, "learning_rate": 3.868831770147407e-10, "loss": 0.0, "num_input_tokens_seen": 136842024, "step": 203035 }, { "epoch": 4.960300979649672, "grad_norm": 0.00017984755686484277, "learning_rate": 3.8451490353774706e-10, "loss": 0.0, "num_input_tokens_seen": 136845736, "step": 203040 }, { "epoch": 4.9604231304815185, "grad_norm": 3.470600495347753e-05, "learning_rate": 3.8215389953355494e-10, "loss": 0.0, "num_input_tokens_seen": 136849128, "step": 203045 }, { "epoch": 4.960545281313365, "grad_norm": 0.005800565704703331, "learning_rate": 3.7980016501903966e-10, "loss": 0.0, "num_input_tokens_seen": 136852520, "step": 203050 }, { "epoch": 4.960667432145213, "grad_norm": 0.00013009316171519458, "learning_rate": 3.774537000116318e-10, "loss": 0.0001, "num_input_tokens_seen": 136856360, "step": 203055 }, { "epoch": 4.9607895829770605, "grad_norm": 0.0001301606243941933, "learning_rate": 3.7511450452809565e-10, "loss": 0.0, "num_input_tokens_seen": 136859432, "step": 203060 }, { "epoch": 4.960911733808907, "grad_norm": 0.00011180521687492728, "learning_rate": 3.727825785857508e-10, "loss": 0.0, "num_input_tokens_seen": 136862504, "step": 203065 }, { "epoch": 4.961033884640754, "grad_norm": 0.00028714933432638645, "learning_rate": 3.704579222012505e-10, "loss": 0.0, "num_input_tokens_seen": 136865320, "step": 203070 }, { "epoch": 4.961156035472602, "grad_norm": 5.161049557500519e-05, "learning_rate": 3.681405353916922e-10, "loss": 0.0, "num_input_tokens_seen": 136868520, "step": 203075 }, { "epoch": 4.961278186304448, "grad_norm": 0.00584859075024724, "learning_rate": 3.658304181739513e-10, "loss": 0.0397, "num_input_tokens_seen": 136871720, "step": 203080 }, { "epoch": 4.961400337136296, "grad_norm": 0.0003809731570072472, "learning_rate": 3.635275705646812e-10, "loss": 0.0, "num_input_tokens_seen": 136875176, "step": 203085 }, { "epoch": 4.961522487968143, "grad_norm": 0.001918514259159565, "learning_rate": 3.612319925807572e-10, "loss": 0.0, "num_input_tokens_seen": 136878440, "step": 203090 }, { "epoch": 4.96164463879999, "grad_norm": 0.002260788343846798, "learning_rate": 3.589436842388327e-10, "loss": 0.0, "num_input_tokens_seen": 136881512, "step": 203095 }, { "epoch": 4.961766789631837, "grad_norm": 0.0006037589628249407, "learning_rate": 3.56662645555561e-10, "loss": 0.0003, "num_input_tokens_seen": 136884392, "step": 203100 }, { "epoch": 4.961888940463685, "grad_norm": 0.002064619679003954, "learning_rate": 3.543888765473735e-10, "loss": 0.0, "num_input_tokens_seen": 136887592, "step": 203105 }, { "epoch": 4.9620110912955315, "grad_norm": 0.0017396878683939576, "learning_rate": 3.521223772311455e-10, "loss": 0.0, "num_input_tokens_seen": 136890600, "step": 203110 }, { "epoch": 4.962133242127379, "grad_norm": 0.0003977522428613156, "learning_rate": 3.498631476229752e-10, "loss": 0.0, "num_input_tokens_seen": 136894056, "step": 203115 }, { "epoch": 4.962255392959226, "grad_norm": 0.005513790063560009, "learning_rate": 3.47611187739516e-10, "loss": 0.0, "num_input_tokens_seen": 136897448, "step": 203120 }, { "epoch": 4.9623775437910735, "grad_norm": 0.00010466863022884354, "learning_rate": 3.453664975971993e-10, "loss": 0.0, "num_input_tokens_seen": 136900840, "step": 203125 }, { "epoch": 4.96249969462292, "grad_norm": 6.562995258718729e-05, "learning_rate": 3.4312907721212316e-10, "loss": 0.0, "num_input_tokens_seen": 136904104, "step": 203130 }, { "epoch": 4.962621845454768, "grad_norm": 2.7876367312273942e-05, "learning_rate": 3.4089892660082997e-10, "loss": 0.0513, "num_input_tokens_seen": 136907752, "step": 203135 }, { "epoch": 4.962743996286615, "grad_norm": 0.00010341319284634665, "learning_rate": 3.38676045779307e-10, "loss": 0.0, "num_input_tokens_seen": 136911400, "step": 203140 }, { "epoch": 4.962866147118461, "grad_norm": 0.0016331837978214025, "learning_rate": 3.364604347637634e-10, "loss": 0.0, "num_input_tokens_seen": 136914472, "step": 203145 }, { "epoch": 4.962988297950309, "grad_norm": 0.00025190794258378446, "learning_rate": 3.342520935704085e-10, "loss": 0.0, "num_input_tokens_seen": 136918056, "step": 203150 }, { "epoch": 4.963110448782157, "grad_norm": 8.690000686328858e-05, "learning_rate": 3.3205102221534054e-10, "loss": 0.0, "num_input_tokens_seen": 136921576, "step": 203155 }, { "epoch": 4.963232599614003, "grad_norm": 0.0013419648166745901, "learning_rate": 3.2985722071432465e-10, "loss": 0.0, "num_input_tokens_seen": 136925032, "step": 203160 }, { "epoch": 4.96335475044585, "grad_norm": 0.0007638754323124886, "learning_rate": 3.276706890835701e-10, "loss": 0.0, "num_input_tokens_seen": 136928232, "step": 203165 }, { "epoch": 4.963476901277698, "grad_norm": 5.1917631935793906e-05, "learning_rate": 3.2549142733884203e-10, "loss": 0.0, "num_input_tokens_seen": 136931688, "step": 203170 }, { "epoch": 4.9635990521095446, "grad_norm": 0.00024687673430889845, "learning_rate": 3.2331943549601673e-10, "loss": 0.0, "num_input_tokens_seen": 136935016, "step": 203175 }, { "epoch": 4.963721202941392, "grad_norm": 0.31552761793136597, "learning_rate": 3.211547135708592e-10, "loss": 0.0001, "num_input_tokens_seen": 136938408, "step": 203180 }, { "epoch": 4.963843353773239, "grad_norm": 0.00021553314581979066, "learning_rate": 3.1899726157913476e-10, "loss": 0.0, "num_input_tokens_seen": 136941736, "step": 203185 }, { "epoch": 4.963965504605087, "grad_norm": 0.00010820927855093032, "learning_rate": 3.168470795366085e-10, "loss": 0.0, "num_input_tokens_seen": 136945128, "step": 203190 }, { "epoch": 4.964087655436933, "grad_norm": 8.269635873148218e-05, "learning_rate": 3.1470416745882353e-10, "loss": 0.0, "num_input_tokens_seen": 136948456, "step": 203195 }, { "epoch": 4.964209806268781, "grad_norm": 1.1428928701207042e-05, "learning_rate": 3.1256852536143407e-10, "loss": 0.0, "num_input_tokens_seen": 136951464, "step": 203200 }, { "epoch": 4.964331957100628, "grad_norm": 0.0011048450833186507, "learning_rate": 3.1044015325987217e-10, "loss": 0.0, "num_input_tokens_seen": 136954664, "step": 203205 }, { "epoch": 4.964454107932475, "grad_norm": 0.000956613221205771, "learning_rate": 3.0831905116968093e-10, "loss": 0.0, "num_input_tokens_seen": 136957928, "step": 203210 }, { "epoch": 4.964576258764322, "grad_norm": 3.0835537472739816e-05, "learning_rate": 3.062052191062925e-10, "loss": 0.0, "num_input_tokens_seen": 136961640, "step": 203215 }, { "epoch": 4.96469840959617, "grad_norm": 0.0006256560445763171, "learning_rate": 3.040986570851389e-10, "loss": 0.0, "num_input_tokens_seen": 136965032, "step": 203220 }, { "epoch": 4.9648205604280164, "grad_norm": 0.0019232897320762277, "learning_rate": 3.019993651213193e-10, "loss": 0.0, "num_input_tokens_seen": 136968872, "step": 203225 }, { "epoch": 4.964942711259864, "grad_norm": 8.002047252375633e-05, "learning_rate": 2.999073432303767e-10, "loss": 0.0, "num_input_tokens_seen": 136972584, "step": 203230 }, { "epoch": 4.965064862091711, "grad_norm": 9.228551061823964e-05, "learning_rate": 2.9782259142729913e-10, "loss": 0.0001, "num_input_tokens_seen": 136976232, "step": 203235 }, { "epoch": 4.965187012923558, "grad_norm": 0.00013471365673467517, "learning_rate": 2.957451097274077e-10, "loss": 0.0, "num_input_tokens_seen": 136980136, "step": 203240 }, { "epoch": 4.965309163755405, "grad_norm": 0.0009636140894144773, "learning_rate": 2.9367489814569044e-10, "loss": 0.0, "num_input_tokens_seen": 136983208, "step": 203245 }, { "epoch": 4.965431314587252, "grad_norm": 9.399078408023342e-06, "learning_rate": 2.9161195669735736e-10, "loss": 0.0, "num_input_tokens_seen": 136986920, "step": 203250 }, { "epoch": 4.9655534654191, "grad_norm": 0.0016986504197120667, "learning_rate": 2.8955628539717447e-10, "loss": 0.0, "num_input_tokens_seen": 136989864, "step": 203255 }, { "epoch": 4.965675616250946, "grad_norm": 0.00014009641017764807, "learning_rate": 2.8750788426035175e-10, "loss": 0.0, "num_input_tokens_seen": 136993192, "step": 203260 }, { "epoch": 4.965797767082794, "grad_norm": 0.00019577737839426845, "learning_rate": 2.854667533015442e-10, "loss": 0.0, "num_input_tokens_seen": 136996264, "step": 203265 }, { "epoch": 4.965919917914641, "grad_norm": 5.1051170885330066e-05, "learning_rate": 2.834328925358509e-10, "loss": 0.0, "num_input_tokens_seen": 136999528, "step": 203270 }, { "epoch": 4.966042068746488, "grad_norm": 0.0007095492910593748, "learning_rate": 2.814063019778157e-10, "loss": 0.0, "num_input_tokens_seen": 137002536, "step": 203275 }, { "epoch": 4.966164219578335, "grad_norm": 0.0004173467750661075, "learning_rate": 2.7938698164231556e-10, "loss": 0.0, "num_input_tokens_seen": 137005544, "step": 203280 }, { "epoch": 4.966286370410183, "grad_norm": 0.00018406417802907526, "learning_rate": 2.773749315440055e-10, "loss": 0.0234, "num_input_tokens_seen": 137008808, "step": 203285 }, { "epoch": 4.9664085212420295, "grad_norm": 0.0005171273369342089, "learning_rate": 2.753701516975404e-10, "loss": 0.0, "num_input_tokens_seen": 137012008, "step": 203290 }, { "epoch": 4.966530672073877, "grad_norm": 0.0019284519366919994, "learning_rate": 2.7337264211746427e-10, "loss": 0.0, "num_input_tokens_seen": 137015272, "step": 203295 }, { "epoch": 4.966652822905724, "grad_norm": 0.0029952942859381437, "learning_rate": 2.713824028183209e-10, "loss": 0.0, "num_input_tokens_seen": 137018280, "step": 203300 }, { "epoch": 4.9667749737375715, "grad_norm": 1.635344779060688e-05, "learning_rate": 2.693994338145433e-10, "loss": 0.0, "num_input_tokens_seen": 137021864, "step": 203305 }, { "epoch": 4.966897124569418, "grad_norm": 7.378828013315797e-05, "learning_rate": 2.6742373512056435e-10, "loss": 0.0, "num_input_tokens_seen": 137025640, "step": 203310 }, { "epoch": 4.967019275401266, "grad_norm": 0.00014544688747264445, "learning_rate": 2.6545530675081695e-10, "loss": 0.0, "num_input_tokens_seen": 137028968, "step": 203315 }, { "epoch": 4.967141426233113, "grad_norm": 3.913308682967909e-05, "learning_rate": 2.6349414871962297e-10, "loss": 0.0008, "num_input_tokens_seen": 137032104, "step": 203320 }, { "epoch": 4.96726357706496, "grad_norm": 0.00011158483539475128, "learning_rate": 2.615402610411932e-10, "loss": 0.0, "num_input_tokens_seen": 137035624, "step": 203325 }, { "epoch": 4.967385727896807, "grad_norm": 1.9497132598189637e-05, "learning_rate": 2.595936437296276e-10, "loss": 0.0, "num_input_tokens_seen": 137039016, "step": 203330 }, { "epoch": 4.967507878728654, "grad_norm": 1.9880870240740478e-05, "learning_rate": 2.576542967993589e-10, "loss": 0.0, "num_input_tokens_seen": 137042344, "step": 203335 }, { "epoch": 4.967630029560501, "grad_norm": 0.0005132934893481433, "learning_rate": 2.55722220264154e-10, "loss": 0.0, "num_input_tokens_seen": 137045672, "step": 203340 }, { "epoch": 4.967752180392348, "grad_norm": 0.000174630869878456, "learning_rate": 2.5379741413833475e-10, "loss": 0.0, "num_input_tokens_seen": 137049128, "step": 203345 }, { "epoch": 4.967874331224196, "grad_norm": 0.0001575258356751874, "learning_rate": 2.5187987843577897e-10, "loss": 0.0, "num_input_tokens_seen": 137052904, "step": 203350 }, { "epoch": 4.9679964820560425, "grad_norm": 6.655186734860763e-05, "learning_rate": 2.499696131704754e-10, "loss": 0.0, "num_input_tokens_seen": 137056616, "step": 203355 }, { "epoch": 4.96811863288789, "grad_norm": 5.819553734909277e-06, "learning_rate": 2.4806661835630185e-10, "loss": 0.0, "num_input_tokens_seen": 137060008, "step": 203360 }, { "epoch": 4.968240783719737, "grad_norm": 0.013717192225158215, "learning_rate": 2.461708940070251e-10, "loss": 0.0, "num_input_tokens_seen": 137063208, "step": 203365 }, { "epoch": 4.9683629345515845, "grad_norm": 0.00012144362699473277, "learning_rate": 2.4428244013652287e-10, "loss": 0.0, "num_input_tokens_seen": 137066728, "step": 203370 }, { "epoch": 4.968485085383431, "grad_norm": 0.00012399266415741295, "learning_rate": 2.4240125675856206e-10, "loss": 0.0, "num_input_tokens_seen": 137070056, "step": 203375 }, { "epoch": 4.968607236215279, "grad_norm": 0.003136326791718602, "learning_rate": 2.405273438866873e-10, "loss": 0.0, "num_input_tokens_seen": 137073192, "step": 203380 }, { "epoch": 4.968729387047126, "grad_norm": 0.0038233192171901464, "learning_rate": 2.3866070153466534e-10, "loss": 0.0008, "num_input_tokens_seen": 137076264, "step": 203385 }, { "epoch": 4.968851537878973, "grad_norm": 0.0008583197486586869, "learning_rate": 2.368013297159299e-10, "loss": 0.0, "num_input_tokens_seen": 137079528, "step": 203390 }, { "epoch": 4.96897368871082, "grad_norm": 5.924261131440289e-05, "learning_rate": 2.349492284441368e-10, "loss": 0.0, "num_input_tokens_seen": 137082728, "step": 203395 }, { "epoch": 4.969095839542668, "grad_norm": 0.0002450200554449111, "learning_rate": 2.331043977327196e-10, "loss": 0.0, "num_input_tokens_seen": 137086376, "step": 203400 }, { "epoch": 4.969217990374514, "grad_norm": 0.0019818120636045933, "learning_rate": 2.312668375950011e-10, "loss": 0.0, "num_input_tokens_seen": 137089704, "step": 203405 }, { "epoch": 4.969340141206361, "grad_norm": 3.149250187561847e-05, "learning_rate": 2.2943654804441493e-10, "loss": 0.0, "num_input_tokens_seen": 137093160, "step": 203410 }, { "epoch": 4.969462292038209, "grad_norm": 0.0010760186705738306, "learning_rate": 2.2761352909428377e-10, "loss": 0.0, "num_input_tokens_seen": 137096552, "step": 203415 }, { "epoch": 4.969584442870056, "grad_norm": 2.086194399453234e-05, "learning_rate": 2.2579778075793031e-10, "loss": 0.0, "num_input_tokens_seen": 137099816, "step": 203420 }, { "epoch": 4.969706593701903, "grad_norm": 1.81776522367727e-05, "learning_rate": 2.2398930304834417e-10, "loss": 0.0, "num_input_tokens_seen": 137103208, "step": 203425 }, { "epoch": 4.96982874453375, "grad_norm": 0.00011152262595715001, "learning_rate": 2.2218809597895906e-10, "loss": 0.0, "num_input_tokens_seen": 137106600, "step": 203430 }, { "epoch": 4.9699508953655975, "grad_norm": 0.0004978838260285556, "learning_rate": 2.203941595626535e-10, "loss": 0.0, "num_input_tokens_seen": 137110056, "step": 203435 }, { "epoch": 4.970073046197444, "grad_norm": 0.000971376255620271, "learning_rate": 2.186074938125282e-10, "loss": 0.0, "num_input_tokens_seen": 137113320, "step": 203440 }, { "epoch": 4.970195197029292, "grad_norm": 0.00025585308321751654, "learning_rate": 2.1682809874168373e-10, "loss": 0.0, "num_input_tokens_seen": 137116456, "step": 203445 }, { "epoch": 4.970317347861139, "grad_norm": 4.208882819511928e-05, "learning_rate": 2.150559743628877e-10, "loss": 0.0, "num_input_tokens_seen": 137119784, "step": 203450 }, { "epoch": 4.970439498692986, "grad_norm": 0.00013479522021953017, "learning_rate": 2.132911206891297e-10, "loss": 0.0, "num_input_tokens_seen": 137123368, "step": 203455 }, { "epoch": 4.970561649524833, "grad_norm": 0.0004235265660099685, "learning_rate": 2.115335377332883e-10, "loss": 0.0, "num_input_tokens_seen": 137126824, "step": 203460 }, { "epoch": 4.970683800356681, "grad_norm": 2.3727705411147326e-05, "learning_rate": 2.0978322550802007e-10, "loss": 0.0, "num_input_tokens_seen": 137130280, "step": 203465 }, { "epoch": 4.970805951188527, "grad_norm": 0.0004997532814741135, "learning_rate": 2.080401840262036e-10, "loss": 0.0, "num_input_tokens_seen": 137133736, "step": 203470 }, { "epoch": 4.970928102020375, "grad_norm": 0.0019565869588404894, "learning_rate": 2.063044133003844e-10, "loss": 0.0, "num_input_tokens_seen": 137137128, "step": 203475 }, { "epoch": 4.971050252852222, "grad_norm": 0.0008852098835632205, "learning_rate": 2.04575913343219e-10, "loss": 0.0, "num_input_tokens_seen": 137140456, "step": 203480 }, { "epoch": 4.971172403684069, "grad_norm": 4.9233127356274053e-05, "learning_rate": 2.0285468416725294e-10, "loss": 0.0, "num_input_tokens_seen": 137143656, "step": 203485 }, { "epoch": 4.971294554515916, "grad_norm": 9.417891851626337e-05, "learning_rate": 2.0114072578503172e-10, "loss": 0.0001, "num_input_tokens_seen": 137146792, "step": 203490 }, { "epoch": 4.971416705347764, "grad_norm": 10.676913261413574, "learning_rate": 1.9943403820910086e-10, "loss": 0.0011, "num_input_tokens_seen": 137149800, "step": 203495 }, { "epoch": 4.9715388561796106, "grad_norm": 0.0032459113281220198, "learning_rate": 1.9773462145178387e-10, "loss": 0.0, "num_input_tokens_seen": 137153064, "step": 203500 }, { "epoch": 4.971661007011457, "grad_norm": 0.0002514548250474036, "learning_rate": 1.960424755254042e-10, "loss": 0.0, "num_input_tokens_seen": 137156328, "step": 203505 }, { "epoch": 4.971783157843305, "grad_norm": 2.9430975700961426e-05, "learning_rate": 1.9435760044239635e-10, "loss": 0.0, "num_input_tokens_seen": 137159400, "step": 203510 }, { "epoch": 4.971905308675153, "grad_norm": 0.0017755551962181926, "learning_rate": 1.9267999621486174e-10, "loss": 0.0, "num_input_tokens_seen": 137162600, "step": 203515 }, { "epoch": 4.972027459506999, "grad_norm": 0.00012700166553258896, "learning_rate": 1.9100966285512388e-10, "loss": 0.0, "num_input_tokens_seen": 137166568, "step": 203520 }, { "epoch": 4.972149610338846, "grad_norm": 0.00025107708643190563, "learning_rate": 1.8934660037528417e-10, "loss": 0.0, "num_input_tokens_seen": 137170088, "step": 203525 }, { "epoch": 4.972271761170694, "grad_norm": 9.3789476522943e-06, "learning_rate": 1.8769080878744402e-10, "loss": 0.0, "num_input_tokens_seen": 137173608, "step": 203530 }, { "epoch": 4.97239391200254, "grad_norm": 0.00017873231263365597, "learning_rate": 1.860422881035939e-10, "loss": 0.0, "num_input_tokens_seen": 137178984, "step": 203535 }, { "epoch": 4.972516062834388, "grad_norm": 0.009690695442259312, "learning_rate": 1.8440103833572417e-10, "loss": 0.0, "num_input_tokens_seen": 137182056, "step": 203540 }, { "epoch": 4.972638213666235, "grad_norm": 0.001154441386461258, "learning_rate": 1.8276705949593629e-10, "loss": 0.0, "num_input_tokens_seen": 137185064, "step": 203545 }, { "epoch": 4.9727603644980825, "grad_norm": 7.696470675000455e-06, "learning_rate": 1.8114035159588758e-10, "loss": 0.0224, "num_input_tokens_seen": 137187944, "step": 203550 }, { "epoch": 4.972882515329929, "grad_norm": 0.0008099049446173012, "learning_rate": 1.7952091464756846e-10, "loss": 0.0, "num_input_tokens_seen": 137190952, "step": 203555 }, { "epoch": 4.973004666161777, "grad_norm": 8.639370207674801e-05, "learning_rate": 1.7790874866263628e-10, "loss": 0.0, "num_input_tokens_seen": 137194408, "step": 203560 }, { "epoch": 4.973126816993624, "grad_norm": 0.0008555944077670574, "learning_rate": 1.7630385365285938e-10, "loss": 0.0, "num_input_tokens_seen": 137197736, "step": 203565 }, { "epoch": 4.973248967825471, "grad_norm": 0.0013645697617903352, "learning_rate": 1.7470622962989511e-10, "loss": 0.0, "num_input_tokens_seen": 137201064, "step": 203570 }, { "epoch": 4.973371118657318, "grad_norm": 5.793929813080467e-05, "learning_rate": 1.7311587660551186e-10, "loss": 0.0, "num_input_tokens_seen": 137204456, "step": 203575 }, { "epoch": 4.973493269489166, "grad_norm": 0.0002381923550274223, "learning_rate": 1.7153279459103386e-10, "loss": 0.0, "num_input_tokens_seen": 137208168, "step": 203580 }, { "epoch": 4.973615420321012, "grad_norm": 0.001816401956602931, "learning_rate": 1.699569835981185e-10, "loss": 0.0, "num_input_tokens_seen": 137211304, "step": 203585 }, { "epoch": 4.97373757115286, "grad_norm": 8.881182293407619e-05, "learning_rate": 1.6838844363820103e-10, "loss": 0.0, "num_input_tokens_seen": 137214696, "step": 203590 }, { "epoch": 4.973859721984707, "grad_norm": 0.016806993633508682, "learning_rate": 1.668271747227168e-10, "loss": 0.0, "num_input_tokens_seen": 137218408, "step": 203595 }, { "epoch": 4.9739818728165535, "grad_norm": 0.01145798061043024, "learning_rate": 1.6527317686299002e-10, "loss": 0.0, "num_input_tokens_seen": 137221864, "step": 203600 }, { "epoch": 4.974104023648401, "grad_norm": 2.2194937628228217e-05, "learning_rate": 1.63726450070234e-10, "loss": 0.0, "num_input_tokens_seen": 137225192, "step": 203605 }, { "epoch": 4.974226174480248, "grad_norm": 0.003558725118637085, "learning_rate": 1.62186994355884e-10, "loss": 0.0, "num_input_tokens_seen": 137228392, "step": 203610 }, { "epoch": 4.9743483253120955, "grad_norm": 0.0013230282347649336, "learning_rate": 1.6065480973104228e-10, "loss": 0.0, "num_input_tokens_seen": 137232296, "step": 203615 }, { "epoch": 4.974470476143942, "grad_norm": 0.001185271656140685, "learning_rate": 1.5912989620681107e-10, "loss": 0.0, "num_input_tokens_seen": 137235304, "step": 203620 }, { "epoch": 4.97459262697579, "grad_norm": 0.001854298054240644, "learning_rate": 1.5761225379429255e-10, "loss": 0.0, "num_input_tokens_seen": 137238120, "step": 203625 }, { "epoch": 4.974714777807637, "grad_norm": 2.3406746549881063e-05, "learning_rate": 1.56101882504478e-10, "loss": 0.0, "num_input_tokens_seen": 137241320, "step": 203630 }, { "epoch": 4.974836928639484, "grad_norm": 0.00031124529778026044, "learning_rate": 1.5459878234846958e-10, "loss": 0.0, "num_input_tokens_seen": 137245160, "step": 203635 }, { "epoch": 4.974959079471331, "grad_norm": 0.0002666858781594783, "learning_rate": 1.5310295333725853e-10, "loss": 0.0, "num_input_tokens_seen": 137248616, "step": 203640 }, { "epoch": 4.975081230303179, "grad_norm": 0.0014295884175226092, "learning_rate": 1.5161439548150301e-10, "loss": 0.0001, "num_input_tokens_seen": 137251688, "step": 203645 }, { "epoch": 4.975203381135025, "grad_norm": 0.024229461327195168, "learning_rate": 1.501331087920832e-10, "loss": 0.0, "num_input_tokens_seen": 137255016, "step": 203650 }, { "epoch": 4.975325531966873, "grad_norm": 0.006210799794644117, "learning_rate": 1.4865909327987924e-10, "loss": 0.0, "num_input_tokens_seen": 137258408, "step": 203655 }, { "epoch": 4.97544768279872, "grad_norm": 5.4780171922175214e-05, "learning_rate": 1.4719234895566034e-10, "loss": 0.0, "num_input_tokens_seen": 137261480, "step": 203660 }, { "epoch": 4.975569833630567, "grad_norm": 4.419705874170177e-06, "learning_rate": 1.457328758298626e-10, "loss": 0.0, "num_input_tokens_seen": 137264808, "step": 203665 }, { "epoch": 4.975691984462414, "grad_norm": 0.004317359533160925, "learning_rate": 1.4428067391325515e-10, "loss": 0.0, "num_input_tokens_seen": 137268328, "step": 203670 }, { "epoch": 4.975814135294261, "grad_norm": 8.27832191134803e-05, "learning_rate": 1.4283574321627413e-10, "loss": 0.0001, "num_input_tokens_seen": 137271464, "step": 203675 }, { "epoch": 4.9759362861261085, "grad_norm": 0.0003701518871821463, "learning_rate": 1.4139808374968864e-10, "loss": 0.0, "num_input_tokens_seen": 137274472, "step": 203680 }, { "epoch": 4.976058436957956, "grad_norm": 0.00020802194194402546, "learning_rate": 1.3996769552371279e-10, "loss": 0.0, "num_input_tokens_seen": 137277928, "step": 203685 }, { "epoch": 4.976180587789803, "grad_norm": 0.00158477493096143, "learning_rate": 1.3854457854878265e-10, "loss": 0.0, "num_input_tokens_seen": 137281320, "step": 203690 }, { "epoch": 4.97630273862165, "grad_norm": 2.6810001145349815e-05, "learning_rate": 1.3712873283533433e-10, "loss": 0.0, "num_input_tokens_seen": 137284648, "step": 203695 }, { "epoch": 4.976424889453497, "grad_norm": 0.0019791999366134405, "learning_rate": 1.3572015839358187e-10, "loss": 0.0, "num_input_tokens_seen": 137288232, "step": 203700 }, { "epoch": 4.976547040285344, "grad_norm": 0.0001736970734782517, "learning_rate": 1.3431885523385034e-10, "loss": 0.0, "num_input_tokens_seen": 137291432, "step": 203705 }, { "epoch": 4.976669191117192, "grad_norm": 6.91135719534941e-05, "learning_rate": 1.329248233662428e-10, "loss": 0.0, "num_input_tokens_seen": 137294760, "step": 203710 }, { "epoch": 4.976791341949038, "grad_norm": 0.00016507727559655905, "learning_rate": 1.3153806280097323e-10, "loss": 0.0, "num_input_tokens_seen": 137298280, "step": 203715 }, { "epoch": 4.976913492780886, "grad_norm": 7.36595347916591e-06, "learning_rate": 1.3015857354803372e-10, "loss": 0.0, "num_input_tokens_seen": 137301608, "step": 203720 }, { "epoch": 4.977035643612733, "grad_norm": 0.005928873550146818, "learning_rate": 1.2878635561752726e-10, "loss": 0.0, "num_input_tokens_seen": 137304488, "step": 203725 }, { "epoch": 4.97715779444458, "grad_norm": 1.852934292401187e-05, "learning_rate": 1.2742140901944587e-10, "loss": 0.0, "num_input_tokens_seen": 137308520, "step": 203730 }, { "epoch": 4.977279945276427, "grad_norm": 6.925003253854811e-05, "learning_rate": 1.2606373376367052e-10, "loss": 0.0, "num_input_tokens_seen": 137312040, "step": 203735 }, { "epoch": 4.977402096108275, "grad_norm": 0.00015649801935069263, "learning_rate": 1.2471332986008222e-10, "loss": 0.0, "num_input_tokens_seen": 137315560, "step": 203740 }, { "epoch": 4.9775242469401215, "grad_norm": 6.381532875820994e-05, "learning_rate": 1.233701973185619e-10, "loss": 0.0, "num_input_tokens_seen": 137318952, "step": 203745 }, { "epoch": 4.977646397771969, "grad_norm": 3.099068271694705e-05, "learning_rate": 1.2203433614876858e-10, "loss": 0.0, "num_input_tokens_seen": 137322408, "step": 203750 }, { "epoch": 4.977768548603816, "grad_norm": 0.00028246158035472035, "learning_rate": 1.2070574636058318e-10, "loss": 0.0, "num_input_tokens_seen": 137325800, "step": 203755 }, { "epoch": 4.9778906994356635, "grad_norm": 8.343757508555427e-05, "learning_rate": 1.1938442796344263e-10, "loss": 0.0, "num_input_tokens_seen": 137329448, "step": 203760 }, { "epoch": 4.97801285026751, "grad_norm": 0.000519098830409348, "learning_rate": 1.1807038096711685e-10, "loss": 0.0, "num_input_tokens_seen": 137332776, "step": 203765 }, { "epoch": 4.978135001099357, "grad_norm": 0.00046904810005798936, "learning_rate": 1.1676360538115381e-10, "loss": 0.0, "num_input_tokens_seen": 137336808, "step": 203770 }, { "epoch": 4.978257151931205, "grad_norm": 0.0006193334120325744, "learning_rate": 1.154641012149904e-10, "loss": 0.0, "num_input_tokens_seen": 137340392, "step": 203775 }, { "epoch": 4.978379302763052, "grad_norm": 0.00027628708630800247, "learning_rate": 1.1417186847806349e-10, "loss": 0.0, "num_input_tokens_seen": 137343592, "step": 203780 }, { "epoch": 4.978501453594899, "grad_norm": 0.0007421595510095358, "learning_rate": 1.12886907179921e-10, "loss": 0.0, "num_input_tokens_seen": 137347496, "step": 203785 }, { "epoch": 4.978623604426746, "grad_norm": 0.0011555387172847986, "learning_rate": 1.1160921732977779e-10, "loss": 0.0, "num_input_tokens_seen": 137350760, "step": 203790 }, { "epoch": 4.978745755258593, "grad_norm": 0.0014732476556673646, "learning_rate": 1.1033879893684872e-10, "loss": 0.0, "num_input_tokens_seen": 137354216, "step": 203795 }, { "epoch": 4.97886790609044, "grad_norm": 0.0026653846725821495, "learning_rate": 1.0907565201057067e-10, "loss": 0.0, "num_input_tokens_seen": 137357416, "step": 203800 }, { "epoch": 4.978990056922288, "grad_norm": 0.00010707018373068422, "learning_rate": 1.0781977655993645e-10, "loss": 0.0, "num_input_tokens_seen": 137360424, "step": 203805 }, { "epoch": 4.9791122077541345, "grad_norm": 0.0009073491673916578, "learning_rate": 1.0657117259427195e-10, "loss": 0.0, "num_input_tokens_seen": 137363368, "step": 203810 }, { "epoch": 4.979234358585982, "grad_norm": 0.0016944017261266708, "learning_rate": 1.0532984012256995e-10, "loss": 0.0, "num_input_tokens_seen": 137367208, "step": 203815 }, { "epoch": 4.979356509417829, "grad_norm": 0.00020370505808386952, "learning_rate": 1.0409577915382328e-10, "loss": 0.0, "num_input_tokens_seen": 137370344, "step": 203820 }, { "epoch": 4.979478660249677, "grad_norm": 3.5763987398240715e-05, "learning_rate": 1.0286898969702473e-10, "loss": 0.1607, "num_input_tokens_seen": 137373480, "step": 203825 }, { "epoch": 4.979600811081523, "grad_norm": 0.00012906512711197138, "learning_rate": 1.016494717610561e-10, "loss": 0.0, "num_input_tokens_seen": 137377000, "step": 203830 }, { "epoch": 4.979722961913371, "grad_norm": 0.0001898586779134348, "learning_rate": 1.0043722535491018e-10, "loss": 0.0, "num_input_tokens_seen": 137380712, "step": 203835 }, { "epoch": 4.979845112745218, "grad_norm": 0.0006985256914049387, "learning_rate": 9.92322504872467e-11, "loss": 0.0, "num_input_tokens_seen": 137384360, "step": 203840 }, { "epoch": 4.979967263577065, "grad_norm": 0.001462116721086204, "learning_rate": 9.803454716694748e-11, "loss": 0.0, "num_input_tokens_seen": 137387560, "step": 203845 }, { "epoch": 4.980089414408912, "grad_norm": 0.0007516284240409732, "learning_rate": 9.684411540267224e-11, "loss": 0.0, "num_input_tokens_seen": 137391016, "step": 203850 }, { "epoch": 4.98021156524076, "grad_norm": 0.003561106277629733, "learning_rate": 9.566095520308071e-11, "loss": 0.0, "num_input_tokens_seen": 137394472, "step": 203855 }, { "epoch": 4.980333716072606, "grad_norm": 0.0006031613447703421, "learning_rate": 9.448506657683264e-11, "loss": 0.0, "num_input_tokens_seen": 137397352, "step": 203860 }, { "epoch": 4.980455866904453, "grad_norm": 0.15565507113933563, "learning_rate": 9.331644953236573e-11, "loss": 0.0001, "num_input_tokens_seen": 137400808, "step": 203865 }, { "epoch": 4.980578017736301, "grad_norm": 2.2073681975598447e-05, "learning_rate": 9.21551040783397e-11, "loss": 0.0, "num_input_tokens_seen": 137403880, "step": 203870 }, { "epoch": 4.980700168568148, "grad_norm": 1.1446640201029368e-05, "learning_rate": 9.100103022297023e-11, "loss": 0.0, "num_input_tokens_seen": 137406824, "step": 203875 }, { "epoch": 4.980822319399995, "grad_norm": 0.00042873574420809746, "learning_rate": 8.985422797491704e-11, "loss": 0.0002, "num_input_tokens_seen": 137410216, "step": 203880 }, { "epoch": 4.980944470231842, "grad_norm": 0.000481167109683156, "learning_rate": 8.871469734228477e-11, "loss": 0.0, "num_input_tokens_seen": 137414056, "step": 203885 }, { "epoch": 4.98106662106369, "grad_norm": 6.701985694235191e-05, "learning_rate": 8.758243833351109e-11, "loss": 0.0, "num_input_tokens_seen": 137417192, "step": 203890 }, { "epoch": 4.981188771895536, "grad_norm": 0.0001698703272268176, "learning_rate": 8.645745095681167e-11, "loss": 0.0, "num_input_tokens_seen": 137420264, "step": 203895 }, { "epoch": 4.981310922727384, "grad_norm": 0.00016953774320427328, "learning_rate": 8.533973522029114e-11, "loss": 0.0, "num_input_tokens_seen": 137423400, "step": 203900 }, { "epoch": 4.981433073559231, "grad_norm": 0.000789462763350457, "learning_rate": 8.422929113216515e-11, "loss": 0.0, "num_input_tokens_seen": 137426408, "step": 203905 }, { "epoch": 4.981555224391078, "grad_norm": 0.001563973375596106, "learning_rate": 8.312611870042729e-11, "loss": 0.0, "num_input_tokens_seen": 137430184, "step": 203910 }, { "epoch": 4.981677375222925, "grad_norm": 0.992311954498291, "learning_rate": 8.203021793318221e-11, "loss": 0.0002, "num_input_tokens_seen": 137433448, "step": 203915 }, { "epoch": 4.981799526054773, "grad_norm": 0.00013686691818293184, "learning_rate": 8.094158883831248e-11, "loss": 0.0, "num_input_tokens_seen": 137437032, "step": 203920 }, { "epoch": 4.9819216768866195, "grad_norm": 0.0002493971842341125, "learning_rate": 7.98602314238117e-11, "loss": 0.0, "num_input_tokens_seen": 137440680, "step": 203925 }, { "epoch": 4.982043827718467, "grad_norm": 0.0005622510798275471, "learning_rate": 7.878614569745146e-11, "loss": 0.0, "num_input_tokens_seen": 137443688, "step": 203930 }, { "epoch": 4.982165978550314, "grad_norm": 8.922885172069073e-05, "learning_rate": 7.771933166722532e-11, "loss": 0.0001, "num_input_tokens_seen": 137447144, "step": 203935 }, { "epoch": 4.9822881293821615, "grad_norm": 9.467336349189281e-05, "learning_rate": 7.665978934068285e-11, "loss": 0.0, "num_input_tokens_seen": 137450344, "step": 203940 }, { "epoch": 4.982410280214008, "grad_norm": 1.9983550373581238e-05, "learning_rate": 7.560751872559557e-11, "loss": 0.0, "num_input_tokens_seen": 137453800, "step": 203945 }, { "epoch": 4.982532431045856, "grad_norm": 0.0004079849168192595, "learning_rate": 7.456251982973505e-11, "loss": 0.0, "num_input_tokens_seen": 137456808, "step": 203950 }, { "epoch": 4.982654581877703, "grad_norm": 3.181111242156476e-05, "learning_rate": 7.352479266053979e-11, "loss": 0.0, "num_input_tokens_seen": 137460072, "step": 203955 }, { "epoch": 4.982776732709549, "grad_norm": 0.000325191009324044, "learning_rate": 7.249433722567033e-11, "loss": 0.0, "num_input_tokens_seen": 137463720, "step": 203960 }, { "epoch": 4.982898883541397, "grad_norm": 4.00619137508329e-05, "learning_rate": 7.147115353245414e-11, "loss": 0.0, "num_input_tokens_seen": 137467560, "step": 203965 }, { "epoch": 4.983021034373244, "grad_norm": 0.017873620614409447, "learning_rate": 7.045524158855176e-11, "loss": 0.0, "num_input_tokens_seen": 137470888, "step": 203970 }, { "epoch": 4.983143185205091, "grad_norm": 0.00022479586186818779, "learning_rate": 6.944660140117964e-11, "loss": 0.0, "num_input_tokens_seen": 137474088, "step": 203975 }, { "epoch": 4.983265336036938, "grad_norm": 0.014547290280461311, "learning_rate": 6.844523297777627e-11, "loss": 0.0, "num_input_tokens_seen": 137477160, "step": 203980 }, { "epoch": 4.983387486868786, "grad_norm": 5.238172161625698e-05, "learning_rate": 6.745113632566912e-11, "loss": 0.0, "num_input_tokens_seen": 137480552, "step": 203985 }, { "epoch": 4.9835096377006325, "grad_norm": 0.0010335510596632957, "learning_rate": 6.64643114518526e-11, "loss": 0.0, "num_input_tokens_seen": 137484072, "step": 203990 }, { "epoch": 4.98363178853248, "grad_norm": 7.121911039575934e-05, "learning_rate": 6.548475836376521e-11, "loss": 0.0, "num_input_tokens_seen": 137487464, "step": 203995 }, { "epoch": 4.983753939364327, "grad_norm": 0.0017879261868074536, "learning_rate": 6.451247706840136e-11, "loss": 0.0, "num_input_tokens_seen": 137491560, "step": 204000 }, { "epoch": 4.9838760901961745, "grad_norm": 0.0008008818840608001, "learning_rate": 6.354746757286645e-11, "loss": 0.0, "num_input_tokens_seen": 137494632, "step": 204005 }, { "epoch": 4.983998241028021, "grad_norm": 0.002612438052892685, "learning_rate": 6.258972988415489e-11, "loss": 0.0, "num_input_tokens_seen": 137497896, "step": 204010 }, { "epoch": 4.984120391859869, "grad_norm": 4.5757096813758835e-05, "learning_rate": 6.16392640091501e-11, "loss": 0.0, "num_input_tokens_seen": 137501288, "step": 204015 }, { "epoch": 4.984242542691716, "grad_norm": 0.00022237258963286877, "learning_rate": 6.069606995495746e-11, "loss": 0.0, "num_input_tokens_seen": 137504680, "step": 204020 }, { "epoch": 4.984364693523563, "grad_norm": 0.00012107265501981601, "learning_rate": 5.976014772834937e-11, "loss": 0.0, "num_input_tokens_seen": 137507688, "step": 204025 }, { "epoch": 4.98448684435541, "grad_norm": 5.114965824759565e-05, "learning_rate": 5.883149733609816e-11, "loss": 0.0, "num_input_tokens_seen": 137510824, "step": 204030 }, { "epoch": 4.984608995187257, "grad_norm": 2.346815927012358e-05, "learning_rate": 5.7910118784976204e-11, "loss": 0.0, "num_input_tokens_seen": 137514088, "step": 204035 }, { "epoch": 4.984731146019104, "grad_norm": 8.091831841738895e-05, "learning_rate": 5.699601208164484e-11, "loss": 0.0, "num_input_tokens_seen": 137517928, "step": 204040 }, { "epoch": 4.984853296850952, "grad_norm": 0.002588754054158926, "learning_rate": 5.6089177232765406e-11, "loss": 0.0, "num_input_tokens_seen": 137521128, "step": 204045 }, { "epoch": 4.984975447682799, "grad_norm": 0.00010856986773433164, "learning_rate": 5.518961424499924e-11, "loss": 0.0492, "num_input_tokens_seen": 137524584, "step": 204050 }, { "epoch": 4.9850975985146455, "grad_norm": 0.0001354777195956558, "learning_rate": 5.4297323124896655e-11, "loss": 0.0, "num_input_tokens_seen": 137528296, "step": 204055 }, { "epoch": 4.985219749346493, "grad_norm": 1.4176566764945164e-05, "learning_rate": 5.341230387878593e-11, "loss": 0.0, "num_input_tokens_seen": 137532456, "step": 204060 }, { "epoch": 4.98534190017834, "grad_norm": 0.00031324996962212026, "learning_rate": 5.253455651332839e-11, "loss": 0.0, "num_input_tokens_seen": 137535784, "step": 204065 }, { "epoch": 4.9854640510101875, "grad_norm": 0.00043233123142272234, "learning_rate": 5.166408103474129e-11, "loss": 0.0, "num_input_tokens_seen": 137538920, "step": 204070 }, { "epoch": 4.985586201842034, "grad_norm": 0.0011373285669833422, "learning_rate": 5.080087744946393e-11, "loss": 0.0, "num_input_tokens_seen": 137541992, "step": 204075 }, { "epoch": 4.985708352673882, "grad_norm": 0.0024712735321372747, "learning_rate": 4.994494576360253e-11, "loss": 0.0, "num_input_tokens_seen": 137545960, "step": 204080 }, { "epoch": 4.985830503505729, "grad_norm": 1.2956945283804089e-05, "learning_rate": 4.909628598359639e-11, "loss": 0.0, "num_input_tokens_seen": 137549352, "step": 204085 }, { "epoch": 4.985952654337576, "grad_norm": 1.887368671305012e-05, "learning_rate": 4.82548981154407e-11, "loss": 0.0, "num_input_tokens_seen": 137552488, "step": 204090 }, { "epoch": 4.986074805169423, "grad_norm": 9.684948963695206e-06, "learning_rate": 4.742078216535272e-11, "loss": 0.0, "num_input_tokens_seen": 137555816, "step": 204095 }, { "epoch": 4.986196956001271, "grad_norm": 0.0011596221011132002, "learning_rate": 4.6593938139438685e-11, "loss": 0.0, "num_input_tokens_seen": 137559976, "step": 204100 }, { "epoch": 4.986319106833117, "grad_norm": 0.0002005387214012444, "learning_rate": 4.577436604358276e-11, "loss": 0.0, "num_input_tokens_seen": 137563368, "step": 204105 }, { "epoch": 4.986441257664965, "grad_norm": 2.454880450386554e-05, "learning_rate": 4.496206588378015e-11, "loss": 0.0, "num_input_tokens_seen": 137566760, "step": 204110 }, { "epoch": 4.986563408496812, "grad_norm": 0.00022712937789037824, "learning_rate": 4.4157037666026075e-11, "loss": 0.0, "num_input_tokens_seen": 137570088, "step": 204115 }, { "epoch": 4.986685559328659, "grad_norm": 0.0013797935098409653, "learning_rate": 4.335928139609368e-11, "loss": 0.0, "num_input_tokens_seen": 137573928, "step": 204120 }, { "epoch": 4.986807710160506, "grad_norm": 0.00011030215682694688, "learning_rate": 4.2568797079867156e-11, "loss": 0.0003, "num_input_tokens_seen": 137577128, "step": 204125 }, { "epoch": 4.986929860992353, "grad_norm": 0.025388803333044052, "learning_rate": 4.1785584723008635e-11, "loss": 0.0, "num_input_tokens_seen": 137580264, "step": 204130 }, { "epoch": 4.9870520118242005, "grad_norm": 1.8925420590676367e-05, "learning_rate": 4.100964433118026e-11, "loss": 0.0, "num_input_tokens_seen": 137583464, "step": 204135 }, { "epoch": 4.987174162656048, "grad_norm": 0.0001776775170583278, "learning_rate": 4.0240975910155186e-11, "loss": 0.0, "num_input_tokens_seen": 137586728, "step": 204140 }, { "epoch": 4.987296313487895, "grad_norm": 0.023028668016195297, "learning_rate": 3.9479579465373504e-11, "loss": 0.0, "num_input_tokens_seen": 137589864, "step": 204145 }, { "epoch": 4.987418464319742, "grad_norm": 0.0003330595209263265, "learning_rate": 3.8725455002608377e-11, "loss": 0.0, "num_input_tokens_seen": 137593064, "step": 204150 }, { "epoch": 4.987540615151589, "grad_norm": 0.000414291862398386, "learning_rate": 3.797860252707785e-11, "loss": 0.0, "num_input_tokens_seen": 137596520, "step": 204155 }, { "epoch": 4.987662765983436, "grad_norm": 0.00011527969763847068, "learning_rate": 3.7239022044333045e-11, "loss": 0.0, "num_input_tokens_seen": 137599720, "step": 204160 }, { "epoch": 4.987784916815284, "grad_norm": 0.00047657510731369257, "learning_rate": 3.6506713559703025e-11, "loss": 0.0, "num_input_tokens_seen": 137605352, "step": 204165 }, { "epoch": 4.98790706764713, "grad_norm": 3.753498094738461e-05, "learning_rate": 3.578167707862789e-11, "loss": 0.0, "num_input_tokens_seen": 137608360, "step": 204170 }, { "epoch": 4.988029218478978, "grad_norm": 0.00023259302543010563, "learning_rate": 3.506391260621466e-11, "loss": 0.0, "num_input_tokens_seen": 137611816, "step": 204175 }, { "epoch": 4.988151369310825, "grad_norm": 0.002862380351871252, "learning_rate": 3.4353420147903435e-11, "loss": 0.0, "num_input_tokens_seen": 137615144, "step": 204180 }, { "epoch": 4.9882735201426724, "grad_norm": 0.0003798987891059369, "learning_rate": 3.365019970869021e-11, "loss": 0.0, "num_input_tokens_seen": 137618536, "step": 204185 }, { "epoch": 4.988395670974519, "grad_norm": 0.0195639468729496, "learning_rate": 3.295425129368201e-11, "loss": 0.0, "num_input_tokens_seen": 137621800, "step": 204190 }, { "epoch": 4.988517821806367, "grad_norm": 2.1513849787879735e-05, "learning_rate": 3.226557490798587e-11, "loss": 0.0, "num_input_tokens_seen": 137624872, "step": 204195 }, { "epoch": 4.988639972638214, "grad_norm": 5.544390660361387e-05, "learning_rate": 3.158417055670881e-11, "loss": 0.0579, "num_input_tokens_seen": 137627944, "step": 204200 }, { "epoch": 4.988762123470061, "grad_norm": 0.00016343161405529827, "learning_rate": 3.0910038244624794e-11, "loss": 0.0, "num_input_tokens_seen": 137631592, "step": 204205 }, { "epoch": 4.988884274301908, "grad_norm": 0.0017932208720594645, "learning_rate": 3.0243177976729816e-11, "loss": 0.0, "num_input_tokens_seen": 137634664, "step": 204210 }, { "epoch": 4.989006425133756, "grad_norm": 0.0006000014836899936, "learning_rate": 2.9583589757908863e-11, "loss": 0.0, "num_input_tokens_seen": 137637800, "step": 204215 }, { "epoch": 4.989128575965602, "grad_norm": 1.5821640772628598e-05, "learning_rate": 2.893127359282488e-11, "loss": 0.0, "num_input_tokens_seen": 137641512, "step": 204220 }, { "epoch": 4.989250726797449, "grad_norm": 0.00039087023469619453, "learning_rate": 2.8286229486362833e-11, "loss": 0.0, "num_input_tokens_seen": 137644648, "step": 204225 }, { "epoch": 4.989372877629297, "grad_norm": 60.79288864135742, "learning_rate": 2.764845744318567e-11, "loss": 0.0763, "num_input_tokens_seen": 137648232, "step": 204230 }, { "epoch": 4.9894950284611435, "grad_norm": 9.158341708825901e-05, "learning_rate": 2.701795746795632e-11, "loss": 0.0, "num_input_tokens_seen": 137651752, "step": 204235 }, { "epoch": 4.989617179292991, "grad_norm": 0.00457747234031558, "learning_rate": 2.6394729565115682e-11, "loss": 0.0, "num_input_tokens_seen": 137655016, "step": 204240 }, { "epoch": 4.989739330124838, "grad_norm": 0.00026916858041659, "learning_rate": 2.5778773739326687e-11, "loss": 0.0, "num_input_tokens_seen": 137658664, "step": 204245 }, { "epoch": 4.9898614809566855, "grad_norm": 0.00012310463353060186, "learning_rate": 2.517008999503023e-11, "loss": 0.0, "num_input_tokens_seen": 137662440, "step": 204250 }, { "epoch": 4.989983631788532, "grad_norm": 0.00021796536748297513, "learning_rate": 2.4568678336667203e-11, "loss": 0.0, "num_input_tokens_seen": 137666344, "step": 204255 }, { "epoch": 4.99010578262038, "grad_norm": 0.006101370323449373, "learning_rate": 2.3974538768567475e-11, "loss": 0.0, "num_input_tokens_seen": 137669544, "step": 204260 }, { "epoch": 4.990227933452227, "grad_norm": 0.00022258379613049328, "learning_rate": 2.338767129517194e-11, "loss": 0.0, "num_input_tokens_seen": 137672744, "step": 204265 }, { "epoch": 4.990350084284074, "grad_norm": 0.0004926729016005993, "learning_rate": 2.280807592058842e-11, "loss": 0.112, "num_input_tokens_seen": 137676200, "step": 204270 }, { "epoch": 4.990472235115921, "grad_norm": 2.890416180889588e-05, "learning_rate": 2.2235752649146787e-11, "loss": 0.0, "num_input_tokens_seen": 137679592, "step": 204275 }, { "epoch": 4.990594385947769, "grad_norm": 0.0013556944904848933, "learning_rate": 2.1670701484954866e-11, "loss": 0.0, "num_input_tokens_seen": 137682984, "step": 204280 }, { "epoch": 4.990716536779615, "grad_norm": 0.08208898454904556, "learning_rate": 2.1112922432120482e-11, "loss": 0.0, "num_input_tokens_seen": 137686056, "step": 204285 }, { "epoch": 4.990838687611463, "grad_norm": 0.000291264004772529, "learning_rate": 2.056241549475146e-11, "loss": 0.0, "num_input_tokens_seen": 137689192, "step": 204290 }, { "epoch": 4.99096083844331, "grad_norm": 0.007216623052954674, "learning_rate": 2.0019180676733582e-11, "loss": 0.0, "num_input_tokens_seen": 137692648, "step": 204295 }, { "epoch": 4.9910829892751565, "grad_norm": 0.0006970735266804695, "learning_rate": 1.948321798217467e-11, "loss": 0.0, "num_input_tokens_seen": 137695848, "step": 204300 }, { "epoch": 4.991205140107004, "grad_norm": 1.272992722078925e-05, "learning_rate": 1.8954527414849488e-11, "loss": 0.0, "num_input_tokens_seen": 137698856, "step": 204305 }, { "epoch": 4.991327290938852, "grad_norm": 1.9237311789765954e-05, "learning_rate": 1.843310897875483e-11, "loss": 0.0, "num_input_tokens_seen": 137702120, "step": 204310 }, { "epoch": 4.9914494417706985, "grad_norm": 0.0004398747405502945, "learning_rate": 1.7918962677443418e-11, "loss": 0.0, "num_input_tokens_seen": 137705704, "step": 204315 }, { "epoch": 4.991571592602545, "grad_norm": 7.332904351642355e-05, "learning_rate": 1.741208851491205e-11, "loss": 0.0, "num_input_tokens_seen": 137709096, "step": 204320 }, { "epoch": 4.991693743434393, "grad_norm": 6.562047929037362e-05, "learning_rate": 1.6912486494602417e-11, "loss": 0.0, "num_input_tokens_seen": 137712744, "step": 204325 }, { "epoch": 4.99181589426624, "grad_norm": 0.0015275224577635527, "learning_rate": 1.6420156620289283e-11, "loss": 0.0, "num_input_tokens_seen": 137715880, "step": 204330 }, { "epoch": 4.991938045098087, "grad_norm": 0.04927076771855354, "learning_rate": 1.593509889563638e-11, "loss": 0.0, "num_input_tokens_seen": 137719528, "step": 204335 }, { "epoch": 4.992060195929934, "grad_norm": 0.0021134349517524242, "learning_rate": 1.5457313323974374e-11, "loss": 0.0, "num_input_tokens_seen": 137722856, "step": 204340 }, { "epoch": 4.992182346761782, "grad_norm": 0.0008005241979844868, "learning_rate": 1.4986799908855985e-11, "loss": 0.0, "num_input_tokens_seen": 137726312, "step": 204345 }, { "epoch": 4.992304497593628, "grad_norm": 6.598847539862618e-05, "learning_rate": 1.45235586537229e-11, "loss": 0.0, "num_input_tokens_seen": 137729576, "step": 204350 }, { "epoch": 4.992426648425476, "grad_norm": 12.575538635253906, "learning_rate": 1.4067589562016812e-11, "loss": 0.0302, "num_input_tokens_seen": 137732584, "step": 204355 }, { "epoch": 4.992548799257323, "grad_norm": 0.001489661866798997, "learning_rate": 1.361889263695737e-11, "loss": 0.0, "num_input_tokens_seen": 137735976, "step": 204360 }, { "epoch": 4.99267095008917, "grad_norm": 0.002886560745537281, "learning_rate": 1.3177467881764214e-11, "loss": 0.0, "num_input_tokens_seen": 137739944, "step": 204365 }, { "epoch": 4.992793100921017, "grad_norm": 0.00042491499334573746, "learning_rate": 1.274331529976802e-11, "loss": 0.0, "num_input_tokens_seen": 137743336, "step": 204370 }, { "epoch": 4.992915251752865, "grad_norm": 1.1034517228836194e-05, "learning_rate": 1.2316434893966387e-11, "loss": 0.0, "num_input_tokens_seen": 137746728, "step": 204375 }, { "epoch": 4.9930374025847115, "grad_norm": 0.00016755687829572707, "learning_rate": 1.1896826667689985e-11, "loss": 0.0, "num_input_tokens_seen": 137750312, "step": 204380 }, { "epoch": 4.993159553416559, "grad_norm": 0.00024167363881133497, "learning_rate": 1.1484490623825394e-11, "loss": 0.0, "num_input_tokens_seen": 137753192, "step": 204385 }, { "epoch": 4.993281704248406, "grad_norm": 2.5365121473441832e-05, "learning_rate": 1.1079426765370215e-11, "loss": 0.0, "num_input_tokens_seen": 137756328, "step": 204390 }, { "epoch": 4.993403855080253, "grad_norm": 0.0004141927638556808, "learning_rate": 1.068163509532205e-11, "loss": 0.0, "num_input_tokens_seen": 137759656, "step": 204395 }, { "epoch": 4.9935260059121, "grad_norm": 0.00025063075008802116, "learning_rate": 1.0291115616567481e-11, "loss": 0.0, "num_input_tokens_seen": 137762792, "step": 204400 }, { "epoch": 4.993648156743948, "grad_norm": 1.626476841920521e-05, "learning_rate": 9.907868331882063e-12, "loss": 0.0, "num_input_tokens_seen": 137766056, "step": 204405 }, { "epoch": 4.993770307575795, "grad_norm": 0.0001084671457647346, "learning_rate": 9.5318932442634e-12, "loss": 0.0, "num_input_tokens_seen": 137769512, "step": 204410 }, { "epoch": 4.993892458407641, "grad_norm": 0.0017103019636124372, "learning_rate": 9.163190356153982e-12, "loss": 0.0, "num_input_tokens_seen": 137772904, "step": 204415 }, { "epoch": 4.994014609239489, "grad_norm": 0.0003728297306224704, "learning_rate": 8.80175967044039e-12, "loss": 0.0, "num_input_tokens_seen": 137775912, "step": 204420 }, { "epoch": 4.994136760071336, "grad_norm": 9.055395639734343e-05, "learning_rate": 8.447601189676135e-12, "loss": 0.0, "num_input_tokens_seen": 137779496, "step": 204425 }, { "epoch": 4.994258910903183, "grad_norm": 0.0002516876847948879, "learning_rate": 8.100714916414731e-12, "loss": 0.0, "num_input_tokens_seen": 137782504, "step": 204430 }, { "epoch": 4.99438106173503, "grad_norm": 0.0004563323745969683, "learning_rate": 7.76110085320969e-12, "loss": 0.0, "num_input_tokens_seen": 137785896, "step": 204435 }, { "epoch": 4.994503212566878, "grad_norm": 0.0009458388667553663, "learning_rate": 7.428759002614527e-12, "loss": 0.0, "num_input_tokens_seen": 137789928, "step": 204440 }, { "epoch": 4.9946253633987245, "grad_norm": 0.056152962148189545, "learning_rate": 7.103689366849686e-12, "loss": 0.0, "num_input_tokens_seen": 137793512, "step": 204445 }, { "epoch": 4.994747514230572, "grad_norm": 0.0003752903430722654, "learning_rate": 6.7858919484686805e-12, "loss": 0.0, "num_input_tokens_seen": 137796776, "step": 204450 }, { "epoch": 4.994869665062419, "grad_norm": 9.508341463515535e-05, "learning_rate": 6.4753667496919575e-12, "loss": 0.0, "num_input_tokens_seen": 137800168, "step": 204455 }, { "epoch": 4.9949918158942666, "grad_norm": 0.000608145899605006, "learning_rate": 6.172113772850984e-12, "loss": 0.0, "num_input_tokens_seen": 137803816, "step": 204460 }, { "epoch": 4.995113966726113, "grad_norm": 0.0006833765655755997, "learning_rate": 5.876133019944163e-12, "loss": 0.0, "num_input_tokens_seen": 137806888, "step": 204465 }, { "epoch": 4.995236117557961, "grad_norm": 0.00020582752767950296, "learning_rate": 5.587424493413984e-12, "loss": 0.0, "num_input_tokens_seen": 137810088, "step": 204470 }, { "epoch": 4.995358268389808, "grad_norm": 0.0001288704515900463, "learning_rate": 5.3059881951478254e-12, "loss": 0.0, "num_input_tokens_seen": 137813608, "step": 204475 }, { "epoch": 4.995480419221655, "grad_norm": 0.00013208498421590775, "learning_rate": 5.031824127255113e-12, "loss": 0.0, "num_input_tokens_seen": 137817064, "step": 204480 }, { "epoch": 4.995602570053502, "grad_norm": 0.0015713643515482545, "learning_rate": 4.764932291734247e-12, "loss": 0.0, "num_input_tokens_seen": 137820264, "step": 204485 }, { "epoch": 4.995724720885349, "grad_norm": 0.00032959875534288585, "learning_rate": 4.505312690583629e-12, "loss": 0.0, "num_input_tokens_seen": 137824040, "step": 204490 }, { "epoch": 4.995846871717196, "grad_norm": 0.002318650484085083, "learning_rate": 4.252965325579616e-12, "loss": 0.0, "num_input_tokens_seen": 137826920, "step": 204495 }, { "epoch": 4.995969022549043, "grad_norm": 6.434229726437479e-05, "learning_rate": 4.0078901987206096e-12, "loss": 0.0, "num_input_tokens_seen": 137830568, "step": 204500 }, { "epoch": 4.996091173380891, "grad_norm": 0.0012508381623774767, "learning_rate": 3.770087311560921e-12, "loss": 0.0, "num_input_tokens_seen": 137834088, "step": 204505 }, { "epoch": 4.996213324212738, "grad_norm": 0.0002872292825486511, "learning_rate": 3.5395566659879307e-12, "loss": 0.0, "num_input_tokens_seen": 137837160, "step": 204510 }, { "epoch": 4.996335475044585, "grad_norm": 0.0004731950757559389, "learning_rate": 3.3162982636669722e-12, "loss": 0.0, "num_input_tokens_seen": 137840424, "step": 204515 }, { "epoch": 4.996457625876432, "grad_norm": 0.011159982532262802, "learning_rate": 3.1003121061523583e-12, "loss": 0.0, "num_input_tokens_seen": 137843944, "step": 204520 }, { "epoch": 4.99657977670828, "grad_norm": 0.00016125261026900262, "learning_rate": 2.8915981951094236e-12, "loss": 0.0, "num_input_tokens_seen": 137847080, "step": 204525 }, { "epoch": 4.996701927540126, "grad_norm": 0.00041134297498501837, "learning_rate": 2.690156531981458e-12, "loss": 0.0, "num_input_tokens_seen": 137850792, "step": 204530 }, { "epoch": 4.996824078371974, "grad_norm": 0.000439604016719386, "learning_rate": 2.4959871183227732e-12, "loss": 0.0, "num_input_tokens_seen": 137853928, "step": 204535 }, { "epoch": 4.996946229203821, "grad_norm": 0.002617679536342621, "learning_rate": 2.309089955354615e-12, "loss": 0.0, "num_input_tokens_seen": 137856936, "step": 204540 }, { "epoch": 4.997068380035668, "grad_norm": 7.568387081846595e-05, "learning_rate": 2.1294650446312955e-12, "loss": 0.0, "num_input_tokens_seen": 137860584, "step": 204545 }, { "epoch": 4.997190530867515, "grad_norm": 0.0002223595802206546, "learning_rate": 1.9571123873740602e-12, "loss": 0.0, "num_input_tokens_seen": 137863976, "step": 204550 }, { "epoch": 4.997312681699363, "grad_norm": 4.211713530821726e-05, "learning_rate": 1.7920319849151766e-12, "loss": 0.0, "num_input_tokens_seen": 137867496, "step": 204555 }, { "epoch": 4.9974348325312095, "grad_norm": 0.00966684427112341, "learning_rate": 1.6342238382538453e-12, "loss": 0.0, "num_input_tokens_seen": 137871272, "step": 204560 }, { "epoch": 4.997556983363056, "grad_norm": 0.0009443532908335328, "learning_rate": 1.4836879488333564e-12, "loss": 0.0, "num_input_tokens_seen": 137874280, "step": 204565 }, { "epoch": 4.997679134194904, "grad_norm": 7.629985702806152e-06, "learning_rate": 1.340424317430866e-12, "loss": 0.0, "num_input_tokens_seen": 137877672, "step": 204570 }, { "epoch": 4.9978012850267515, "grad_norm": 5.480439995153574e-06, "learning_rate": 1.2044329453786417e-12, "loss": 0.0, "num_input_tokens_seen": 137881192, "step": 204575 }, { "epoch": 4.997923435858598, "grad_norm": 3.845514220301993e-05, "learning_rate": 1.0757138334538396e-12, "loss": 0.0, "num_input_tokens_seen": 137884712, "step": 204580 }, { "epoch": 4.998045586690445, "grad_norm": 0.0005065813893452287, "learning_rate": 9.542669827666827e-13, "loss": 0.0, "num_input_tokens_seen": 137887720, "step": 204585 }, { "epoch": 4.998167737522293, "grad_norm": 1.7385242244927213e-05, "learning_rate": 8.400923939833049e-13, "loss": 0.0, "num_input_tokens_seen": 137891240, "step": 204590 }, { "epoch": 4.998289888354139, "grad_norm": 0.004977858159691095, "learning_rate": 7.331900682139292e-13, "loss": 0.0, "num_input_tokens_seen": 137894696, "step": 204595 }, { "epoch": 4.998412039185987, "grad_norm": 0.0001772203977452591, "learning_rate": 6.335600059026447e-13, "loss": 0.0, "num_input_tokens_seen": 137898152, "step": 204600 }, { "epoch": 4.998534190017834, "grad_norm": 0.00018388996249996126, "learning_rate": 5.412022080486523e-13, "loss": 0.0, "num_input_tokens_seen": 137901288, "step": 204605 }, { "epoch": 4.998656340849681, "grad_norm": 0.0003622965596150607, "learning_rate": 4.561166752070633e-13, "loss": 0.0, "num_input_tokens_seen": 137905000, "step": 204610 }, { "epoch": 4.998778491681528, "grad_norm": 1.2093821169401053e-05, "learning_rate": 3.7830340804401173e-13, "loss": 0.0, "num_input_tokens_seen": 137909032, "step": 204615 }, { "epoch": 4.998900642513376, "grad_norm": 0.11611104011535645, "learning_rate": 3.07762407114609e-13, "loss": 0.0, "num_input_tokens_seen": 137912168, "step": 204620 }, { "epoch": 4.9990227933452225, "grad_norm": 0.004503564443439245, "learning_rate": 2.444936728629443e-13, "loss": 0.0, "num_input_tokens_seen": 137915560, "step": 204625 }, { "epoch": 4.99914494417707, "grad_norm": 3.479938823147677e-05, "learning_rate": 1.884972058441292e-13, "loss": 0.0, "num_input_tokens_seen": 137918888, "step": 204630 }, { "epoch": 4.999267095008917, "grad_norm": 0.00011234768317081034, "learning_rate": 1.3977300639123058e-13, "loss": 0.0, "num_input_tokens_seen": 137922344, "step": 204635 }, { "epoch": 4.9993892458407645, "grad_norm": 0.0003669550933409482, "learning_rate": 9.832107483731533e-14, "loss": 0.0, "num_input_tokens_seen": 137925800, "step": 204640 }, { "epoch": 4.999511396672611, "grad_norm": 0.0006305554416030645, "learning_rate": 6.414141162647269e-14, "loss": 0.0, "num_input_tokens_seen": 137929256, "step": 204645 }, { "epoch": 4.999633547504459, "grad_norm": 0.00038496809429489076, "learning_rate": 3.723401686972494e-14, "loss": 0.0, "num_input_tokens_seen": 137932264, "step": 204650 }, { "epoch": 4.999755698336306, "grad_norm": 0.0001226778404088691, "learning_rate": 1.7598890900138997e-14, "loss": 0.0, "num_input_tokens_seen": 137935272, "step": 204655 }, { "epoch": 4.999877849168152, "grad_norm": 0.00022551536676473916, "learning_rate": 5.236033606692558e-15, "loss": 0.0013, "num_input_tokens_seen": 137938536, "step": 204660 }, { "epoch": 5.0, "grad_norm": 0.000892703770659864, "learning_rate": 1.454454334748334e-16, "loss": 0.0, "num_input_tokens_seen": 137941664, "step": 204665 }, { "epoch": 5.0, "num_input_tokens_seen": 137941664, "step": 204665, "total_flos": 8.054243640264622e+17, "train_loss": 0.052412337061547826, "train_runtime": 15894.9677, "train_samples_per_second": 103.008, "train_steps_per_second": 12.876 } ], "logging_steps": 5, "max_steps": 204665, "num_input_tokens_seen": 137941664, "num_train_epochs": 5, "save_steps": 10234, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.054243640264622e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }