{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.6268656716417915, "eval_steps": 500, "global_step": 620, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007462686567164179, "grad_norm": 7.744417324010015, "learning_rate": 0.0, "loss": 0.8532977104187012, "num_tokens": 940199.0, "step": 1 }, { "epoch": 0.014925373134328358, "grad_norm": 7.534592349315775, "learning_rate": 1.5789473684210526e-06, "loss": 0.870805561542511, "num_tokens": 1940958.0, "step": 2 }, { "epoch": 0.022388059701492536, "grad_norm": 7.301629258268991, "learning_rate": 3.157894736842105e-06, "loss": 0.8422647714614868, "num_tokens": 2857380.0, "step": 3 }, { "epoch": 0.029850746268656716, "grad_norm": 6.897211503812214, "learning_rate": 4.736842105263158e-06, "loss": 0.8292515277862549, "num_tokens": 3696403.0, "step": 4 }, { "epoch": 0.03731343283582089, "grad_norm": 4.507105826947362, "learning_rate": 6.31578947368421e-06, "loss": 0.7875182628631592, "num_tokens": 4528235.0, "step": 5 }, { "epoch": 0.04477611940298507, "grad_norm": 2.305703326092364, "learning_rate": 7.894736842105263e-06, "loss": 0.7126146554946899, "num_tokens": 5554672.0, "step": 6 }, { "epoch": 0.05223880597014925, "grad_norm": 1.921229796923974, "learning_rate": 9.473684210526315e-06, "loss": 0.6916477680206299, "num_tokens": 6423132.0, "step": 7 }, { "epoch": 0.05970149253731343, "grad_norm": 2.0416384054663053, "learning_rate": 1.1052631578947368e-05, "loss": 0.6471172571182251, "num_tokens": 7201644.0, "step": 8 }, { "epoch": 0.06716417910447761, "grad_norm": 2.239115666581825, "learning_rate": 1.263157894736842e-05, "loss": 0.6206663846969604, "num_tokens": 8128715.0, "step": 9 }, { "epoch": 0.07462686567164178, "grad_norm": 2.1623779063703825, "learning_rate": 1.4210526315789473e-05, "loss": 0.6805848479270935, "num_tokens": 9074027.0, "step": 10 }, { "epoch": 0.08208955223880597, "grad_norm": 1.357823714265532, "learning_rate": 1.5789473684210526e-05, "loss": 0.5907301902770996, "num_tokens": 9950641.0, "step": 11 }, { "epoch": 0.08955223880597014, "grad_norm": 1.2436697073761152, "learning_rate": 1.736842105263158e-05, "loss": 0.6134575009346008, "num_tokens": 10885057.0, "step": 12 }, { "epoch": 0.09701492537313433, "grad_norm": 1.0124569249363744, "learning_rate": 1.894736842105263e-05, "loss": 0.5782807469367981, "num_tokens": 11697963.0, "step": 13 }, { "epoch": 0.1044776119402985, "grad_norm": 0.7569239607127325, "learning_rate": 2.0526315789473685e-05, "loss": 0.5509419441223145, "num_tokens": 12632602.0, "step": 14 }, { "epoch": 0.11194029850746269, "grad_norm": 0.5817534527891748, "learning_rate": 2.2105263157894736e-05, "loss": 0.532228410243988, "num_tokens": 13568889.0, "step": 15 }, { "epoch": 0.11940298507462686, "grad_norm": 0.7057373397195236, "learning_rate": 2.368421052631579e-05, "loss": 0.5408649444580078, "num_tokens": 14534242.0, "step": 16 }, { "epoch": 0.12686567164179105, "grad_norm": 0.6046224645124905, "learning_rate": 2.526315789473684e-05, "loss": 0.5322834253311157, "num_tokens": 15435946.0, "step": 17 }, { "epoch": 0.13432835820895522, "grad_norm": 0.49331973808673285, "learning_rate": 2.6842105263157896e-05, "loss": 0.5015720725059509, "num_tokens": 16352267.0, "step": 18 }, { "epoch": 0.1417910447761194, "grad_norm": 0.5002360432437354, "learning_rate": 2.8421052631578946e-05, "loss": 0.507352888584137, "num_tokens": 17277422.0, "step": 19 }, { "epoch": 0.14925373134328357, "grad_norm": 0.5278153522403675, "learning_rate": 3e-05, "loss": 0.5295838713645935, "num_tokens": 18270697.0, "step": 20 }, { "epoch": 0.15671641791044777, "grad_norm": 0.504428457093509, "learning_rate": 2.9999815560649025e-05, "loss": 0.4966413080692291, "num_tokens": 19308555.0, "step": 21 }, { "epoch": 0.16417910447761194, "grad_norm": 0.46271144160592165, "learning_rate": 2.9999262247635783e-05, "loss": 0.47084784507751465, "num_tokens": 20162797.0, "step": 22 }, { "epoch": 0.17164179104477612, "grad_norm": 0.4874776182121212, "learning_rate": 2.9998340076079188e-05, "loss": 0.4917251765727997, "num_tokens": 20981106.0, "step": 23 }, { "epoch": 0.1791044776119403, "grad_norm": 0.4431461732541396, "learning_rate": 2.9997049071176987e-05, "loss": 0.4785962998867035, "num_tokens": 21858000.0, "step": 24 }, { "epoch": 0.1865671641791045, "grad_norm": 0.387861494751977, "learning_rate": 2.9995389268205035e-05, "loss": 0.4448994994163513, "num_tokens": 22793285.0, "step": 25 }, { "epoch": 0.19402985074626866, "grad_norm": 0.4318496289646886, "learning_rate": 2.9993360712516377e-05, "loss": 0.5124952793121338, "num_tokens": 23723801.0, "step": 26 }, { "epoch": 0.20149253731343283, "grad_norm": 0.37498700509106286, "learning_rate": 2.999096345953997e-05, "loss": 0.4689701795578003, "num_tokens": 24725740.0, "step": 27 }, { "epoch": 0.208955223880597, "grad_norm": 0.46758132187976276, "learning_rate": 2.9988197574779187e-05, "loss": 0.5058130621910095, "num_tokens": 25730725.0, "step": 28 }, { "epoch": 0.21641791044776118, "grad_norm": 0.4541769806648773, "learning_rate": 2.998506313381003e-05, "loss": 0.4760160744190216, "num_tokens": 26557776.0, "step": 29 }, { "epoch": 0.22388059701492538, "grad_norm": 0.4285196017188533, "learning_rate": 2.998156022227906e-05, "loss": 0.46224498748779297, "num_tokens": 27504251.0, "step": 30 }, { "epoch": 0.23134328358208955, "grad_norm": 0.47777177042995506, "learning_rate": 2.9977688935901042e-05, "loss": 0.45890241861343384, "num_tokens": 28534541.0, "step": 31 }, { "epoch": 0.23880597014925373, "grad_norm": 0.4677469755237667, "learning_rate": 2.997344938045636e-05, "loss": 0.48648908734321594, "num_tokens": 29341502.0, "step": 32 }, { "epoch": 0.2462686567164179, "grad_norm": 0.417614303033266, "learning_rate": 2.99688416717881e-05, "loss": 0.4909588694572449, "num_tokens": 30211822.0, "step": 33 }, { "epoch": 0.2537313432835821, "grad_norm": 0.41160216727121024, "learning_rate": 2.9963865935798904e-05, "loss": 0.470625102519989, "num_tokens": 31102775.0, "step": 34 }, { "epoch": 0.26119402985074625, "grad_norm": 0.3679274109247649, "learning_rate": 2.995852230844751e-05, "loss": 0.45474812388420105, "num_tokens": 31898227.0, "step": 35 }, { "epoch": 0.26865671641791045, "grad_norm": 0.38798639827332265, "learning_rate": 2.9952810935745055e-05, "loss": 0.44148534536361694, "num_tokens": 32541892.0, "step": 36 }, { "epoch": 0.27611940298507465, "grad_norm": 0.3837062370088247, "learning_rate": 2.9946731973751076e-05, "loss": 0.47073429822921753, "num_tokens": 33543040.0, "step": 37 }, { "epoch": 0.2835820895522388, "grad_norm": 0.35454060994897946, "learning_rate": 2.9940285588569244e-05, "loss": 0.4520432949066162, "num_tokens": 34505224.0, "step": 38 }, { "epoch": 0.291044776119403, "grad_norm": 0.47976493136494663, "learning_rate": 2.993347195634284e-05, "loss": 0.490145206451416, "num_tokens": 35411826.0, "step": 39 }, { "epoch": 0.29850746268656714, "grad_norm": 0.3473597577972683, "learning_rate": 2.992629126324992e-05, "loss": 0.48773276805877686, "num_tokens": 36307345.0, "step": 40 }, { "epoch": 0.30597014925373134, "grad_norm": 0.3849018293538417, "learning_rate": 2.9918743705498237e-05, "loss": 0.4593764543533325, "num_tokens": 37196875.0, "step": 41 }, { "epoch": 0.31343283582089554, "grad_norm": 0.32778453758129855, "learning_rate": 2.9910829489319903e-05, "loss": 0.4493025243282318, "num_tokens": 38112193.0, "step": 42 }, { "epoch": 0.3208955223880597, "grad_norm": 0.38874846877448016, "learning_rate": 2.9902548830965703e-05, "loss": 0.44223347306251526, "num_tokens": 38855918.0, "step": 43 }, { "epoch": 0.3283582089552239, "grad_norm": 0.3600626273878142, "learning_rate": 2.9893901956699236e-05, "loss": 0.4619264602661133, "num_tokens": 39833215.0, "step": 44 }, { "epoch": 0.3358208955223881, "grad_norm": 0.4252489485804584, "learning_rate": 2.9884889102790703e-05, "loss": 0.47333118319511414, "num_tokens": 40760145.0, "step": 45 }, { "epoch": 0.34328358208955223, "grad_norm": 0.44355227925846324, "learning_rate": 2.9875510515510472e-05, "loss": 0.4684419631958008, "num_tokens": 41745749.0, "step": 46 }, { "epoch": 0.35074626865671643, "grad_norm": 0.40090080870615863, "learning_rate": 2.986576645112232e-05, "loss": 0.45152851939201355, "num_tokens": 42686630.0, "step": 47 }, { "epoch": 0.3582089552238806, "grad_norm": 0.3968769871577976, "learning_rate": 2.9855657175876453e-05, "loss": 0.46956488490104675, "num_tokens": 43510586.0, "step": 48 }, { "epoch": 0.3656716417910448, "grad_norm": 0.34818131786283646, "learning_rate": 2.9845182966002236e-05, "loss": 0.43737900257110596, "num_tokens": 44362248.0, "step": 49 }, { "epoch": 0.373134328358209, "grad_norm": 0.37174081071440807, "learning_rate": 2.983434410770063e-05, "loss": 0.4198949337005615, "num_tokens": 45216722.0, "step": 50 }, { "epoch": 0.3805970149253731, "grad_norm": 0.3771227721970023, "learning_rate": 2.9823140897136368e-05, "loss": 0.43871694803237915, "num_tokens": 46010142.0, "step": 51 }, { "epoch": 0.3880597014925373, "grad_norm": 0.40827091621620715, "learning_rate": 2.981157364042988e-05, "loss": 0.43513864278793335, "num_tokens": 46858670.0, "step": 52 }, { "epoch": 0.39552238805970147, "grad_norm": 0.37817640704637673, "learning_rate": 2.9799642653648915e-05, "loss": 0.4714231491088867, "num_tokens": 47836905.0, "step": 53 }, { "epoch": 0.40298507462686567, "grad_norm": 0.4521767636779311, "learning_rate": 2.9787348262799917e-05, "loss": 0.46958601474761963, "num_tokens": 48836237.0, "step": 54 }, { "epoch": 0.41044776119402987, "grad_norm": 0.3996917386608209, "learning_rate": 2.9774690803819092e-05, "loss": 0.4700014591217041, "num_tokens": 49860153.0, "step": 55 }, { "epoch": 0.417910447761194, "grad_norm": 0.3486187929603249, "learning_rate": 2.976167062256327e-05, "loss": 0.4338191747665405, "num_tokens": 50786110.0, "step": 56 }, { "epoch": 0.4253731343283582, "grad_norm": 0.32937938308672743, "learning_rate": 2.9748288074800414e-05, "loss": 0.41941165924072266, "num_tokens": 51790390.0, "step": 57 }, { "epoch": 0.43283582089552236, "grad_norm": 0.34924110154041565, "learning_rate": 2.9734543526199922e-05, "loss": 0.457973837852478, "num_tokens": 52742397.0, "step": 58 }, { "epoch": 0.44029850746268656, "grad_norm": 0.410698318336265, "learning_rate": 2.9720437352322618e-05, "loss": 0.47605207562446594, "num_tokens": 53673114.0, "step": 59 }, { "epoch": 0.44776119402985076, "grad_norm": 0.32459119196055575, "learning_rate": 2.9705969938610523e-05, "loss": 0.4107760787010193, "num_tokens": 54566889.0, "step": 60 }, { "epoch": 0.4552238805970149, "grad_norm": 0.356687155861432, "learning_rate": 2.9691141680376277e-05, "loss": 0.4515986740589142, "num_tokens": 55460491.0, "step": 61 }, { "epoch": 0.4626865671641791, "grad_norm": 0.34110692115474495, "learning_rate": 2.9675952982792383e-05, "loss": 0.4474300444126129, "num_tokens": 56520990.0, "step": 62 }, { "epoch": 0.4701492537313433, "grad_norm": 0.4001841133306117, "learning_rate": 2.9660404260880092e-05, "loss": 0.446544885635376, "num_tokens": 57422206.0, "step": 63 }, { "epoch": 0.47761194029850745, "grad_norm": 0.3580957652668953, "learning_rate": 2.964449593949811e-05, "loss": 0.47310975193977356, "num_tokens": 58260720.0, "step": 64 }, { "epoch": 0.48507462686567165, "grad_norm": 0.4285689076758677, "learning_rate": 2.9628228453330938e-05, "loss": 0.4611589014530182, "num_tokens": 59123617.0, "step": 65 }, { "epoch": 0.4925373134328358, "grad_norm": 0.37944004231248546, "learning_rate": 2.9611602246877044e-05, "loss": 0.43839746713638306, "num_tokens": 60033505.0, "step": 66 }, { "epoch": 0.5, "grad_norm": 0.4058895912331529, "learning_rate": 2.9594617774436683e-05, "loss": 0.4206322133541107, "num_tokens": 60974452.0, "step": 67 }, { "epoch": 0.5074626865671642, "grad_norm": 0.3573247245646442, "learning_rate": 2.957727550009949e-05, "loss": 0.4404195547103882, "num_tokens": 61913977.0, "step": 68 }, { "epoch": 0.5149253731343284, "grad_norm": 0.3517188453992546, "learning_rate": 2.9559575897731815e-05, "loss": 0.4638599753379822, "num_tokens": 62800879.0, "step": 69 }, { "epoch": 0.5223880597014925, "grad_norm": 0.38237698336509157, "learning_rate": 2.9541519450963753e-05, "loss": 0.4506247341632843, "num_tokens": 63823183.0, "step": 70 }, { "epoch": 0.5298507462686567, "grad_norm": 0.34718947718626825, "learning_rate": 2.9523106653175947e-05, "loss": 0.43822404742240906, "num_tokens": 64700087.0, "step": 71 }, { "epoch": 0.5373134328358209, "grad_norm": 0.38571581429627394, "learning_rate": 2.9504338007486096e-05, "loss": 0.41369548439979553, "num_tokens": 65533415.0, "step": 72 }, { "epoch": 0.5447761194029851, "grad_norm": 0.4179062195109546, "learning_rate": 2.948521402673521e-05, "loss": 0.4132109582424164, "num_tokens": 66411589.0, "step": 73 }, { "epoch": 0.5522388059701493, "grad_norm": 0.3884838709698021, "learning_rate": 2.9465735233473607e-05, "loss": 0.4519786536693573, "num_tokens": 67203675.0, "step": 74 }, { "epoch": 0.5597014925373134, "grad_norm": 0.37292728239052736, "learning_rate": 2.9445902159946608e-05, "loss": 0.4415651559829712, "num_tokens": 68056574.0, "step": 75 }, { "epoch": 0.5671641791044776, "grad_norm": 0.3498292196022338, "learning_rate": 2.942571534808003e-05, "loss": 0.4234054684638977, "num_tokens": 68907426.0, "step": 76 }, { "epoch": 0.5746268656716418, "grad_norm": 0.3282849607678631, "learning_rate": 2.9405175349465346e-05, "loss": 0.43461883068084717, "num_tokens": 69817179.0, "step": 77 }, { "epoch": 0.582089552238806, "grad_norm": 0.35116807662765037, "learning_rate": 2.938428272534464e-05, "loss": 0.45615193247795105, "num_tokens": 70803003.0, "step": 78 }, { "epoch": 0.5895522388059702, "grad_norm": 0.3329914149262814, "learning_rate": 2.9363038046595242e-05, "loss": 0.41635048389434814, "num_tokens": 71708353.0, "step": 79 }, { "epoch": 0.5970149253731343, "grad_norm": 0.38045704306263595, "learning_rate": 2.9341441893714155e-05, "loss": 0.43726855516433716, "num_tokens": 72587706.0, "step": 80 }, { "epoch": 0.6044776119402985, "grad_norm": 0.35924349265893135, "learning_rate": 2.9319494856802178e-05, "loss": 0.4230448007583618, "num_tokens": 73605832.0, "step": 81 }, { "epoch": 0.6119402985074627, "grad_norm": 0.38619247621111746, "learning_rate": 2.9297197535547806e-05, "loss": 0.4357215464115143, "num_tokens": 74512496.0, "step": 82 }, { "epoch": 0.6194029850746269, "grad_norm": 0.39604323481100373, "learning_rate": 2.9274550539210795e-05, "loss": 0.4608227014541626, "num_tokens": 75428481.0, "step": 83 }, { "epoch": 0.6268656716417911, "grad_norm": 0.37443244592149005, "learning_rate": 2.925155448660557e-05, "loss": 0.4334092140197754, "num_tokens": 76292706.0, "step": 84 }, { "epoch": 0.6343283582089553, "grad_norm": 0.331225928550454, "learning_rate": 2.9228210006084278e-05, "loss": 0.42100948095321655, "num_tokens": 77209633.0, "step": 85 }, { "epoch": 0.6417910447761194, "grad_norm": 0.3733284174499636, "learning_rate": 2.9204517735519638e-05, "loss": 0.42018914222717285, "num_tokens": 78063420.0, "step": 86 }, { "epoch": 0.6492537313432836, "grad_norm": 0.32698643343045436, "learning_rate": 2.91804783222875e-05, "loss": 0.4293556809425354, "num_tokens": 78870397.0, "step": 87 }, { "epoch": 0.6567164179104478, "grad_norm": 0.3702901089838646, "learning_rate": 2.915609242324917e-05, "loss": 0.43072593212127686, "num_tokens": 79871666.0, "step": 88 }, { "epoch": 0.664179104477612, "grad_norm": 0.37258166652948627, "learning_rate": 2.913136070473344e-05, "loss": 0.42400825023651123, "num_tokens": 80712206.0, "step": 89 }, { "epoch": 0.6716417910447762, "grad_norm": 0.3474241060283533, "learning_rate": 2.9106283842518404e-05, "loss": 0.4022632837295532, "num_tokens": 81538216.0, "step": 90 }, { "epoch": 0.6791044776119403, "grad_norm": 0.3241352741578233, "learning_rate": 2.9080862521812974e-05, "loss": 0.4167214035987854, "num_tokens": 82585839.0, "step": 91 }, { "epoch": 0.6865671641791045, "grad_norm": 0.3810979193242223, "learning_rate": 2.9055097437238178e-05, "loss": 0.424973726272583, "num_tokens": 83427449.0, "step": 92 }, { "epoch": 0.6940298507462687, "grad_norm": 0.3649031636927641, "learning_rate": 2.9028989292808156e-05, "loss": 0.4390385150909424, "num_tokens": 84449388.0, "step": 93 }, { "epoch": 0.7014925373134329, "grad_norm": 0.28994598738953636, "learning_rate": 2.9002538801910943e-05, "loss": 0.4120522141456604, "num_tokens": 85256514.0, "step": 94 }, { "epoch": 0.7089552238805971, "grad_norm": 0.33708866035801577, "learning_rate": 2.897574668728896e-05, "loss": 0.4396127164363861, "num_tokens": 86165960.0, "step": 95 }, { "epoch": 0.7164179104477612, "grad_norm": 0.33927189407215896, "learning_rate": 2.894861368101929e-05, "loss": 0.4281761050224304, "num_tokens": 86982659.0, "step": 96 }, { "epoch": 0.7238805970149254, "grad_norm": 0.31050046707178475, "learning_rate": 2.892114052449363e-05, "loss": 0.42657923698425293, "num_tokens": 87931000.0, "step": 97 }, { "epoch": 0.7313432835820896, "grad_norm": 0.3648336319576507, "learning_rate": 2.8893327968398085e-05, "loss": 0.4396938681602478, "num_tokens": 88689701.0, "step": 98 }, { "epoch": 0.7388059701492538, "grad_norm": 0.32123414861291977, "learning_rate": 2.886517677269263e-05, "loss": 0.4277549386024475, "num_tokens": 89547645.0, "step": 99 }, { "epoch": 0.746268656716418, "grad_norm": 0.35178070684423185, "learning_rate": 2.883668770659033e-05, "loss": 0.42951005697250366, "num_tokens": 90297517.0, "step": 100 }, { "epoch": 0.753731343283582, "grad_norm": 0.3404454736543532, "learning_rate": 2.8807861548536364e-05, "loss": 0.42362749576568604, "num_tokens": 91186856.0, "step": 101 }, { "epoch": 0.7611940298507462, "grad_norm": 0.3294687134617137, "learning_rate": 2.8778699086186704e-05, "loss": 0.43012386560440063, "num_tokens": 91987232.0, "step": 102 }, { "epoch": 0.7686567164179104, "grad_norm": 0.3613861468433532, "learning_rate": 2.8749201116386635e-05, "loss": 0.46676358580589294, "num_tokens": 92898696.0, "step": 103 }, { "epoch": 0.7761194029850746, "grad_norm": 0.32525203161057137, "learning_rate": 2.871936844514895e-05, "loss": 0.4271778464317322, "num_tokens": 93791120.0, "step": 104 }, { "epoch": 0.7835820895522388, "grad_norm": 0.3645223492290418, "learning_rate": 2.8689201887631954e-05, "loss": 0.4019509553909302, "num_tokens": 94639289.0, "step": 105 }, { "epoch": 0.7910447761194029, "grad_norm": 0.35120891749306765, "learning_rate": 2.8658702268117166e-05, "loss": 0.47020262479782104, "num_tokens": 95400207.0, "step": 106 }, { "epoch": 0.7985074626865671, "grad_norm": 0.3897429998289724, "learning_rate": 2.8627870419986818e-05, "loss": 0.45215320587158203, "num_tokens": 96227104.0, "step": 107 }, { "epoch": 0.8059701492537313, "grad_norm": 0.3551261237711927, "learning_rate": 2.859670718570107e-05, "loss": 0.41790810227394104, "num_tokens": 97056588.0, "step": 108 }, { "epoch": 0.8134328358208955, "grad_norm": 0.4085760278992768, "learning_rate": 2.8565213416774984e-05, "loss": 0.43688803911209106, "num_tokens": 97944111.0, "step": 109 }, { "epoch": 0.8208955223880597, "grad_norm": 0.34538986828654805, "learning_rate": 2.8533389973755266e-05, "loss": 0.40269792079925537, "num_tokens": 98816920.0, "step": 110 }, { "epoch": 0.8283582089552238, "grad_norm": 0.3680387468305633, "learning_rate": 2.8501237726196767e-05, "loss": 0.4414367079734802, "num_tokens": 99773832.0, "step": 111 }, { "epoch": 0.835820895522388, "grad_norm": 0.3132024744536474, "learning_rate": 2.846875755263869e-05, "loss": 0.44121602177619934, "num_tokens": 100805832.0, "step": 112 }, { "epoch": 0.8432835820895522, "grad_norm": 0.3212832093670825, "learning_rate": 2.843595034058062e-05, "loss": 0.43163514137268066, "num_tokens": 101747939.0, "step": 113 }, { "epoch": 0.8507462686567164, "grad_norm": 0.3377699333103733, "learning_rate": 2.8402816986458235e-05, "loss": 0.45706361532211304, "num_tokens": 102733715.0, "step": 114 }, { "epoch": 0.8582089552238806, "grad_norm": 0.3020084528652058, "learning_rate": 2.836935839561885e-05, "loss": 0.40077459812164307, "num_tokens": 103577969.0, "step": 115 }, { "epoch": 0.8656716417910447, "grad_norm": 0.3487492894550424, "learning_rate": 2.833557548229665e-05, "loss": 0.4227057695388794, "num_tokens": 104507837.0, "step": 116 }, { "epoch": 0.8731343283582089, "grad_norm": 0.3476991142190051, "learning_rate": 2.8301469169587724e-05, "loss": 0.4556281566619873, "num_tokens": 105482901.0, "step": 117 }, { "epoch": 0.8805970149253731, "grad_norm": 0.328015796780554, "learning_rate": 2.826704038942485e-05, "loss": 0.42667752504348755, "num_tokens": 106441176.0, "step": 118 }, { "epoch": 0.8880597014925373, "grad_norm": 0.34794554907206476, "learning_rate": 2.8232290082551994e-05, "loss": 0.4443303048610687, "num_tokens": 107265870.0, "step": 119 }, { "epoch": 0.8955223880597015, "grad_norm": 0.3207190436944611, "learning_rate": 2.819721919849865e-05, "loss": 0.43958723545074463, "num_tokens": 108146690.0, "step": 120 }, { "epoch": 0.9029850746268657, "grad_norm": 0.36544124775156067, "learning_rate": 2.8161828695553876e-05, "loss": 0.4427248537540436, "num_tokens": 109034402.0, "step": 121 }, { "epoch": 0.9104477611940298, "grad_norm": 0.35253344355491567, "learning_rate": 2.812611954074009e-05, "loss": 0.4511459469795227, "num_tokens": 109989572.0, "step": 122 }, { "epoch": 0.917910447761194, "grad_norm": 0.3419502161009737, "learning_rate": 2.8090092709786683e-05, "loss": 0.45898139476776123, "num_tokens": 110969334.0, "step": 123 }, { "epoch": 0.9253731343283582, "grad_norm": 0.3994476102816512, "learning_rate": 2.8053749187103323e-05, "loss": 0.4459114372730255, "num_tokens": 111844990.0, "step": 124 }, { "epoch": 0.9328358208955224, "grad_norm": 0.37335441467558017, "learning_rate": 2.801708996575309e-05, "loss": 0.43445926904678345, "num_tokens": 112800888.0, "step": 125 }, { "epoch": 0.9402985074626866, "grad_norm": 0.3148170264896714, "learning_rate": 2.7980116047425318e-05, "loss": 0.4525066912174225, "num_tokens": 113857610.0, "step": 126 }, { "epoch": 0.9477611940298507, "grad_norm": 0.3540516068525593, "learning_rate": 2.7942828442408225e-05, "loss": 0.42399919033050537, "num_tokens": 114800904.0, "step": 127 }, { "epoch": 0.9552238805970149, "grad_norm": 0.329278225140609, "learning_rate": 2.7905228169561314e-05, "loss": 0.43032482266426086, "num_tokens": 115759913.0, "step": 128 }, { "epoch": 0.9626865671641791, "grad_norm": 0.3630319590251905, "learning_rate": 2.786731625628754e-05, "loss": 0.44865018129348755, "num_tokens": 116624191.0, "step": 129 }, { "epoch": 0.9701492537313433, "grad_norm": 0.34043245011086026, "learning_rate": 2.7829093738505223e-05, "loss": 0.4354362189769745, "num_tokens": 117499418.0, "step": 130 }, { "epoch": 0.9776119402985075, "grad_norm": 0.3222353349021393, "learning_rate": 2.7790561660619757e-05, "loss": 0.4167882204055786, "num_tokens": 118329517.0, "step": 131 }, { "epoch": 0.9850746268656716, "grad_norm": 0.33759104962145015, "learning_rate": 2.7751721075495062e-05, "loss": 0.4432622492313385, "num_tokens": 119221343.0, "step": 132 }, { "epoch": 0.9925373134328358, "grad_norm": 0.3033715752060908, "learning_rate": 2.7712573044424797e-05, "loss": 0.4342583119869232, "num_tokens": 120123659.0, "step": 133 }, { "epoch": 1.0, "grad_norm": 0.3157095104018222, "learning_rate": 2.7673118637103414e-05, "loss": 0.43080803751945496, "num_tokens": 121054976.0, "step": 134 }, { "epoch": 1.007462686567164, "grad_norm": 0.4365911200440399, "learning_rate": 2.7633358931596875e-05, "loss": 0.39168182015419006, "num_tokens": 121995409.0, "step": 135 }, { "epoch": 1.0149253731343284, "grad_norm": 0.34729870538048124, "learning_rate": 2.7593295014313222e-05, "loss": 0.3802366852760315, "num_tokens": 122823226.0, "step": 136 }, { "epoch": 1.0223880597014925, "grad_norm": 0.36891237319998677, "learning_rate": 2.755292797997288e-05, "loss": 0.370537668466568, "num_tokens": 123660597.0, "step": 137 }, { "epoch": 1.0298507462686568, "grad_norm": 0.4473631385211834, "learning_rate": 2.751225893157876e-05, "loss": 0.3735314905643463, "num_tokens": 124554146.0, "step": 138 }, { "epoch": 1.037313432835821, "grad_norm": 0.3699400370687646, "learning_rate": 2.7471288980386104e-05, "loss": 0.3833698034286499, "num_tokens": 125332236.0, "step": 139 }, { "epoch": 1.044776119402985, "grad_norm": 0.3383566258765907, "learning_rate": 2.743001924587213e-05, "loss": 0.36771178245544434, "num_tokens": 126310236.0, "step": 140 }, { "epoch": 1.0522388059701493, "grad_norm": 0.34187462479662406, "learning_rate": 2.738845085570543e-05, "loss": 0.37672191858291626, "num_tokens": 127218706.0, "step": 141 }, { "epoch": 1.0597014925373134, "grad_norm": 0.3483444921381018, "learning_rate": 2.734658494571519e-05, "loss": 0.38160958886146545, "num_tokens": 128057825.0, "step": 142 }, { "epoch": 1.0671641791044777, "grad_norm": 0.3614945464912025, "learning_rate": 2.73044226598601e-05, "loss": 0.37473732233047485, "num_tokens": 128955736.0, "step": 143 }, { "epoch": 1.0746268656716418, "grad_norm": 0.36395086131367427, "learning_rate": 2.7261965150197148e-05, "loss": 0.3781934380531311, "num_tokens": 129777788.0, "step": 144 }, { "epoch": 1.0820895522388059, "grad_norm": 0.38520112404335904, "learning_rate": 2.7219213576850122e-05, "loss": 0.37962204217910767, "num_tokens": 130659960.0, "step": 145 }, { "epoch": 1.0895522388059702, "grad_norm": 0.4078777110059471, "learning_rate": 2.7176169107977898e-05, "loss": 0.38424360752105713, "num_tokens": 131550221.0, "step": 146 }, { "epoch": 1.0970149253731343, "grad_norm": 0.36815785695334224, "learning_rate": 2.713283291974253e-05, "loss": 0.38741737604141235, "num_tokens": 132486469.0, "step": 147 }, { "epoch": 1.1044776119402986, "grad_norm": 0.3233706872052398, "learning_rate": 2.7089206196277132e-05, "loss": 0.36474981904029846, "num_tokens": 133366950.0, "step": 148 }, { "epoch": 1.1119402985074627, "grad_norm": 0.3322677249511474, "learning_rate": 2.704529012965348e-05, "loss": 0.3808598518371582, "num_tokens": 134285043.0, "step": 149 }, { "epoch": 1.1194029850746268, "grad_norm": 0.339456725306424, "learning_rate": 2.7001085919849477e-05, "loss": 0.35642245411872864, "num_tokens": 135136228.0, "step": 150 }, { "epoch": 1.126865671641791, "grad_norm": 0.3226137136335262, "learning_rate": 2.6956594774716346e-05, "loss": 0.3718845844268799, "num_tokens": 136013129.0, "step": 151 }, { "epoch": 1.1343283582089552, "grad_norm": 0.3327685091410092, "learning_rate": 2.691181790994564e-05, "loss": 0.3985145092010498, "num_tokens": 136978716.0, "step": 152 }, { "epoch": 1.1417910447761195, "grad_norm": 0.33120353900381816, "learning_rate": 2.6866756549035997e-05, "loss": 0.3699200451374054, "num_tokens": 137957110.0, "step": 153 }, { "epoch": 1.1492537313432836, "grad_norm": 0.3043199924636294, "learning_rate": 2.6821411923259747e-05, "loss": 0.3767678737640381, "num_tokens": 138894209.0, "step": 154 }, { "epoch": 1.1567164179104479, "grad_norm": 0.33866074681411823, "learning_rate": 2.677578527162923e-05, "loss": 0.3994665741920471, "num_tokens": 139925878.0, "step": 155 }, { "epoch": 1.164179104477612, "grad_norm": 0.3339471075019717, "learning_rate": 2.672987784086297e-05, "loss": 0.37443894147872925, "num_tokens": 140844266.0, "step": 156 }, { "epoch": 1.171641791044776, "grad_norm": 0.34303437215557886, "learning_rate": 2.66836908853516e-05, "loss": 0.37581557035446167, "num_tokens": 141685264.0, "step": 157 }, { "epoch": 1.1791044776119404, "grad_norm": 0.29691716745960073, "learning_rate": 2.6637225667123567e-05, "loss": 0.3585776090621948, "num_tokens": 142607439.0, "step": 158 }, { "epoch": 1.1865671641791045, "grad_norm": 0.36115800096975614, "learning_rate": 2.659048345581068e-05, "loss": 0.3523404598236084, "num_tokens": 143442522.0, "step": 159 }, { "epoch": 1.1940298507462686, "grad_norm": 0.3854051146002361, "learning_rate": 2.654346552861341e-05, "loss": 0.3825865685939789, "num_tokens": 144356683.0, "step": 160 }, { "epoch": 1.2014925373134329, "grad_norm": 0.37239720041712515, "learning_rate": 2.6496173170265967e-05, "loss": 0.38340622186660767, "num_tokens": 145164747.0, "step": 161 }, { "epoch": 1.208955223880597, "grad_norm": 0.32579199473078013, "learning_rate": 2.6448607673001228e-05, "loss": 0.37306541204452515, "num_tokens": 145974438.0, "step": 162 }, { "epoch": 1.2164179104477613, "grad_norm": 0.3018547733296397, "learning_rate": 2.6400770336515403e-05, "loss": 0.35844796895980835, "num_tokens": 146897553.0, "step": 163 }, { "epoch": 1.2238805970149254, "grad_norm": 0.3718792094427089, "learning_rate": 2.6352662467932535e-05, "loss": 0.4024726152420044, "num_tokens": 147706235.0, "step": 164 }, { "epoch": 1.2313432835820897, "grad_norm": 0.2976198356156792, "learning_rate": 2.6304285381768785e-05, "loss": 0.3483440577983856, "num_tokens": 148638477.0, "step": 165 }, { "epoch": 1.2388059701492538, "grad_norm": 0.40262421884910277, "learning_rate": 2.6255640399896502e-05, "loss": 0.37967991828918457, "num_tokens": 149598765.0, "step": 166 }, { "epoch": 1.2462686567164178, "grad_norm": 0.3506490786312828, "learning_rate": 2.620672885150811e-05, "loss": 0.3896668553352356, "num_tokens": 150499813.0, "step": 167 }, { "epoch": 1.2537313432835822, "grad_norm": 0.37867461356808607, "learning_rate": 2.61575520730798e-05, "loss": 0.3811056315898895, "num_tokens": 151406909.0, "step": 168 }, { "epoch": 1.2611940298507462, "grad_norm": 0.34652476711835556, "learning_rate": 2.6108111408334992e-05, "loss": 0.4021441638469696, "num_tokens": 152358615.0, "step": 169 }, { "epoch": 1.2686567164179103, "grad_norm": 0.3443693872016931, "learning_rate": 2.6058408208207623e-05, "loss": 0.3495699167251587, "num_tokens": 153140245.0, "step": 170 }, { "epoch": 1.2761194029850746, "grad_norm": 0.3199055027860086, "learning_rate": 2.600844383080525e-05, "loss": 0.39528757333755493, "num_tokens": 154065403.0, "step": 171 }, { "epoch": 1.2835820895522387, "grad_norm": 0.38260512944207237, "learning_rate": 2.595821964137192e-05, "loss": 0.3903374969959259, "num_tokens": 155004060.0, "step": 172 }, { "epoch": 1.291044776119403, "grad_norm": 0.32094103983604383, "learning_rate": 2.590773701225089e-05, "loss": 0.38557156920433044, "num_tokens": 155894044.0, "step": 173 }, { "epoch": 1.2985074626865671, "grad_norm": 0.3191554521618694, "learning_rate": 2.585699732284708e-05, "loss": 0.3759213089942932, "num_tokens": 156891753.0, "step": 174 }, { "epoch": 1.3059701492537314, "grad_norm": 0.32242700469585533, "learning_rate": 2.580600195958945e-05, "loss": 0.37213414907455444, "num_tokens": 157781264.0, "step": 175 }, { "epoch": 1.3134328358208955, "grad_norm": 0.3284517036436859, "learning_rate": 2.5754752315893065e-05, "loss": 0.378812313079834, "num_tokens": 158729371.0, "step": 176 }, { "epoch": 1.3208955223880596, "grad_norm": 0.3415023419708296, "learning_rate": 2.5703249792121037e-05, "loss": 0.3865644931793213, "num_tokens": 159723929.0, "step": 177 }, { "epoch": 1.328358208955224, "grad_norm": 0.3343050420961093, "learning_rate": 2.5651495795546263e-05, "loss": 0.4062744081020355, "num_tokens": 160655042.0, "step": 178 }, { "epoch": 1.335820895522388, "grad_norm": 0.5981815975526952, "learning_rate": 2.5599491740312972e-05, "loss": 0.37754061818122864, "num_tokens": 161689806.0, "step": 179 }, { "epoch": 1.3432835820895521, "grad_norm": 0.35234814112990026, "learning_rate": 2.5547239047398078e-05, "loss": 0.3692866861820221, "num_tokens": 162672971.0, "step": 180 }, { "epoch": 1.3507462686567164, "grad_norm": 0.32830063055017134, "learning_rate": 2.5494739144572368e-05, "loss": 0.35535305738449097, "num_tokens": 163606727.0, "step": 181 }, { "epoch": 1.3582089552238805, "grad_norm": 0.3451645432424477, "learning_rate": 2.544199346636147e-05, "loss": 0.38066795468330383, "num_tokens": 164379724.0, "step": 182 }, { "epoch": 1.3656716417910448, "grad_norm": 0.36363681244248197, "learning_rate": 2.5389003454006667e-05, "loss": 0.380257785320282, "num_tokens": 165282114.0, "step": 183 }, { "epoch": 1.373134328358209, "grad_norm": 0.3192538780146095, "learning_rate": 2.533577055542551e-05, "loss": 0.3674117922782898, "num_tokens": 166184652.0, "step": 184 }, { "epoch": 1.3805970149253732, "grad_norm": 0.33313618040811743, "learning_rate": 2.5282296225172267e-05, "loss": 0.36746978759765625, "num_tokens": 167131883.0, "step": 185 }, { "epoch": 1.3880597014925373, "grad_norm": 0.3670551777933176, "learning_rate": 2.522858192439815e-05, "loss": 0.40295130014419556, "num_tokens": 168105786.0, "step": 186 }, { "epoch": 1.3955223880597014, "grad_norm": 0.3475964519943968, "learning_rate": 2.5174629120811432e-05, "loss": 0.38296568393707275, "num_tokens": 168981965.0, "step": 187 }, { "epoch": 1.4029850746268657, "grad_norm": 0.3556039194849401, "learning_rate": 2.512043928863731e-05, "loss": 0.38510382175445557, "num_tokens": 169813930.0, "step": 188 }, { "epoch": 1.4104477611940298, "grad_norm": 0.32738176960414617, "learning_rate": 2.5066013908577625e-05, "loss": 0.356991708278656, "num_tokens": 170803921.0, "step": 189 }, { "epoch": 1.417910447761194, "grad_norm": 0.3545590302027483, "learning_rate": 2.501135446777042e-05, "loss": 0.3816283941268921, "num_tokens": 171568584.0, "step": 190 }, { "epoch": 1.4253731343283582, "grad_norm": 0.33317616623937235, "learning_rate": 2.4956462459749297e-05, "loss": 0.36903613805770874, "num_tokens": 172302686.0, "step": 191 }, { "epoch": 1.4328358208955223, "grad_norm": 0.3581041627669198, "learning_rate": 2.4901339384402598e-05, "loss": 0.40988194942474365, "num_tokens": 173251435.0, "step": 192 }, { "epoch": 1.4402985074626866, "grad_norm": 0.3987362939905261, "learning_rate": 2.4845986747932434e-05, "loss": 0.3909692168235779, "num_tokens": 174154926.0, "step": 193 }, { "epoch": 1.4477611940298507, "grad_norm": 0.4522210758422187, "learning_rate": 2.4790406062813526e-05, "loss": 0.40102025866508484, "num_tokens": 174968736.0, "step": 194 }, { "epoch": 1.455223880597015, "grad_norm": 0.3447348513379396, "learning_rate": 2.4734598847751868e-05, "loss": 0.3985745310783386, "num_tokens": 175993671.0, "step": 195 }, { "epoch": 1.462686567164179, "grad_norm": 0.30700265731423365, "learning_rate": 2.4678566627643243e-05, "loss": 0.37859317660331726, "num_tokens": 176965410.0, "step": 196 }, { "epoch": 1.4701492537313432, "grad_norm": 0.34463758170682973, "learning_rate": 2.462231093353155e-05, "loss": 0.4219540059566498, "num_tokens": 177894815.0, "step": 197 }, { "epoch": 1.4776119402985075, "grad_norm": 0.3101163888412067, "learning_rate": 2.4565833302566967e-05, "loss": 0.3521503210067749, "num_tokens": 178840660.0, "step": 198 }, { "epoch": 1.4850746268656716, "grad_norm": 0.34884755051979194, "learning_rate": 2.4509135277963953e-05, "loss": 0.3874298632144928, "num_tokens": 179786009.0, "step": 199 }, { "epoch": 1.4925373134328357, "grad_norm": 0.3068857508105448, "learning_rate": 2.445221840895908e-05, "loss": 0.34809160232543945, "num_tokens": 180680467.0, "step": 200 }, { "epoch": 1.5, "grad_norm": 0.33736718002624627, "learning_rate": 2.43950842507687e-05, "loss": 0.38442444801330566, "num_tokens": 181598316.0, "step": 201 }, { "epoch": 1.5074626865671643, "grad_norm": 0.3701582076339982, "learning_rate": 2.4337734364546455e-05, "loss": 0.38641679286956787, "num_tokens": 182458909.0, "step": 202 }, { "epoch": 1.5149253731343284, "grad_norm": 0.43633118208871485, "learning_rate": 2.4280170317340602e-05, "loss": 0.3791668117046356, "num_tokens": 183258199.0, "step": 203 }, { "epoch": 1.5223880597014925, "grad_norm": 0.3471858102435004, "learning_rate": 2.4222393682051225e-05, "loss": 0.38509491086006165, "num_tokens": 184223376.0, "step": 204 }, { "epoch": 1.5298507462686568, "grad_norm": 0.3457150792550615, "learning_rate": 2.4164406037387226e-05, "loss": 0.40659117698669434, "num_tokens": 185129043.0, "step": 205 }, { "epoch": 1.537313432835821, "grad_norm": 0.4042975807556774, "learning_rate": 2.4106208967823205e-05, "loss": 0.386791467666626, "num_tokens": 186025421.0, "step": 206 }, { "epoch": 1.544776119402985, "grad_norm": 0.32459079771864724, "learning_rate": 2.4047804063556156e-05, "loss": 0.3690309226512909, "num_tokens": 186963319.0, "step": 207 }, { "epoch": 1.5522388059701493, "grad_norm": 0.3392685539840793, "learning_rate": 2.3989192920462032e-05, "loss": 0.3927544951438904, "num_tokens": 187973354.0, "step": 208 }, { "epoch": 1.5597014925373134, "grad_norm": 0.32438229385759354, "learning_rate": 2.3930377140052118e-05, "loss": 0.3521687984466553, "num_tokens": 188705328.0, "step": 209 }, { "epoch": 1.5671641791044775, "grad_norm": 0.3418923460834205, "learning_rate": 2.3871358329429282e-05, "loss": 0.39543381333351135, "num_tokens": 189538934.0, "step": 210 }, { "epoch": 1.5746268656716418, "grad_norm": 0.32558989276658784, "learning_rate": 2.3812138101244062e-05, "loss": 0.3742252588272095, "num_tokens": 190336903.0, "step": 211 }, { "epoch": 1.582089552238806, "grad_norm": 0.35255218420418694, "learning_rate": 2.37527180736506e-05, "loss": 0.40875107049942017, "num_tokens": 191168843.0, "step": 212 }, { "epoch": 1.5895522388059702, "grad_norm": 0.31783452945012386, "learning_rate": 2.3693099870262425e-05, "loss": 0.3772295117378235, "num_tokens": 192111363.0, "step": 213 }, { "epoch": 1.5970149253731343, "grad_norm": 0.3156291557535895, "learning_rate": 2.363328512010809e-05, "loss": 0.39021003246307373, "num_tokens": 193103746.0, "step": 214 }, { "epoch": 1.6044776119402986, "grad_norm": 0.32761123022827565, "learning_rate": 2.3573275457586658e-05, "loss": 0.38943108916282654, "num_tokens": 193981563.0, "step": 215 }, { "epoch": 1.6119402985074627, "grad_norm": 0.3337068007026254, "learning_rate": 2.3513072522423058e-05, "loss": 0.3988877236843109, "num_tokens": 194834592.0, "step": 216 }, { "epoch": 1.6194029850746268, "grad_norm": 0.3080942622353808, "learning_rate": 2.3452677959623254e-05, "loss": 0.3594892621040344, "num_tokens": 195762991.0, "step": 217 }, { "epoch": 1.626865671641791, "grad_norm": 0.29606722446920497, "learning_rate": 2.3392093419429313e-05, "loss": 0.37819525599479675, "num_tokens": 196736861.0, "step": 218 }, { "epoch": 1.6343283582089554, "grad_norm": 0.31290197305267825, "learning_rate": 2.333132055727431e-05, "loss": 0.386009156703949, "num_tokens": 197626724.0, "step": 219 }, { "epoch": 1.6417910447761193, "grad_norm": 0.3200855389837665, "learning_rate": 2.32703610337371e-05, "loss": 0.40475738048553467, "num_tokens": 198637589.0, "step": 220 }, { "epoch": 1.6492537313432836, "grad_norm": 0.2866817447413364, "learning_rate": 2.320921651449694e-05, "loss": 0.39424002170562744, "num_tokens": 199563255.0, "step": 221 }, { "epoch": 1.6567164179104479, "grad_norm": 0.4885347371955867, "learning_rate": 2.3147888670287962e-05, "loss": 0.3826729953289032, "num_tokens": 200461303.0, "step": 222 }, { "epoch": 1.664179104477612, "grad_norm": 0.3109683935111661, "learning_rate": 2.3086379176853553e-05, "loss": 0.40459978580474854, "num_tokens": 201369977.0, "step": 223 }, { "epoch": 1.671641791044776, "grad_norm": 0.25809457402969005, "learning_rate": 2.3024689714900524e-05, "loss": 0.35879969596862793, "num_tokens": 202278503.0, "step": 224 }, { "epoch": 1.6791044776119404, "grad_norm": 0.3025695421124313, "learning_rate": 2.296282197005322e-05, "loss": 0.35284388065338135, "num_tokens": 203242720.0, "step": 225 }, { "epoch": 1.6865671641791045, "grad_norm": 0.30640518076000706, "learning_rate": 2.2900777632807456e-05, "loss": 0.37256160378456116, "num_tokens": 204150301.0, "step": 226 }, { "epoch": 1.6940298507462686, "grad_norm": 0.3140380062192946, "learning_rate": 2.283855839848431e-05, "loss": 0.37972885370254517, "num_tokens": 205093558.0, "step": 227 }, { "epoch": 1.7014925373134329, "grad_norm": 0.34946442818041484, "learning_rate": 2.2776165967183807e-05, "loss": 0.39244264364242554, "num_tokens": 205970210.0, "step": 228 }, { "epoch": 1.7089552238805972, "grad_norm": 0.32538438973624206, "learning_rate": 2.2713602043738475e-05, "loss": 0.39682289958000183, "num_tokens": 206859291.0, "step": 229 }, { "epoch": 1.716417910447761, "grad_norm": 0.3124091616900136, "learning_rate": 2.2650868337666746e-05, "loss": 0.3859510123729706, "num_tokens": 207786446.0, "step": 230 }, { "epoch": 1.7238805970149254, "grad_norm": 0.3423691973747688, "learning_rate": 2.2587966563126255e-05, "loss": 0.3976070284843445, "num_tokens": 208698287.0, "step": 231 }, { "epoch": 1.7313432835820897, "grad_norm": 0.3134320041738064, "learning_rate": 2.2524898438867004e-05, "loss": 0.3667559325695038, "num_tokens": 209548343.0, "step": 232 }, { "epoch": 1.7388059701492538, "grad_norm": 0.33617188210180216, "learning_rate": 2.2461665688184372e-05, "loss": 0.3952285945415497, "num_tokens": 210468969.0, "step": 233 }, { "epoch": 1.7462686567164178, "grad_norm": 0.2947903980731328, "learning_rate": 2.2398270038872083e-05, "loss": 0.40012168884277344, "num_tokens": 211457470.0, "step": 234 }, { "epoch": 1.7537313432835822, "grad_norm": 0.3086024831747328, "learning_rate": 2.233471322317492e-05, "loss": 0.38004422187805176, "num_tokens": 212347451.0, "step": 235 }, { "epoch": 1.7611940298507462, "grad_norm": 0.29270776401429416, "learning_rate": 2.227099697774146e-05, "loss": 0.37762215733528137, "num_tokens": 213190706.0, "step": 236 }, { "epoch": 1.7686567164179103, "grad_norm": 0.35194638120625044, "learning_rate": 2.2207123043576585e-05, "loss": 0.3850764036178589, "num_tokens": 214169074.0, "step": 237 }, { "epoch": 1.7761194029850746, "grad_norm": 0.30551926833119664, "learning_rate": 2.2143093165993916e-05, "loss": 0.395663321018219, "num_tokens": 215188427.0, "step": 238 }, { "epoch": 1.783582089552239, "grad_norm": 0.33662162397203393, "learning_rate": 2.2078909094568133e-05, "loss": 0.3957657814025879, "num_tokens": 216080767.0, "step": 239 }, { "epoch": 1.7910447761194028, "grad_norm": 0.34783223375914446, "learning_rate": 2.2014572583087155e-05, "loss": 0.390730082988739, "num_tokens": 216995394.0, "step": 240 }, { "epoch": 1.7985074626865671, "grad_norm": 0.3130872530548468, "learning_rate": 2.1950085389504232e-05, "loss": 0.3682572841644287, "num_tokens": 217866020.0, "step": 241 }, { "epoch": 1.8059701492537314, "grad_norm": 0.3489897287487041, "learning_rate": 2.18854492758899e-05, "loss": 0.3791583180427551, "num_tokens": 218680341.0, "step": 242 }, { "epoch": 1.8134328358208955, "grad_norm": 0.28800056676846153, "learning_rate": 2.182066600838384e-05, "loss": 0.39488768577575684, "num_tokens": 219550948.0, "step": 243 }, { "epoch": 1.8208955223880596, "grad_norm": 0.35235893169992594, "learning_rate": 2.1755737357146618e-05, "loss": 0.37826257944107056, "num_tokens": 220517125.0, "step": 244 }, { "epoch": 1.828358208955224, "grad_norm": 0.3108059485175432, "learning_rate": 2.169066509631132e-05, "loss": 0.3689156770706177, "num_tokens": 221365026.0, "step": 245 }, { "epoch": 1.835820895522388, "grad_norm": 0.27688834966994996, "learning_rate": 2.162545100393505e-05, "loss": 0.34449559450149536, "num_tokens": 222233736.0, "step": 246 }, { "epoch": 1.8432835820895521, "grad_norm": 0.3559202619871652, "learning_rate": 2.1560096861950396e-05, "loss": 0.41038885712623596, "num_tokens": 223222809.0, "step": 247 }, { "epoch": 1.8507462686567164, "grad_norm": 0.32206069093634854, "learning_rate": 2.1494604456116695e-05, "loss": 0.3931525945663452, "num_tokens": 224116326.0, "step": 248 }, { "epoch": 1.8582089552238807, "grad_norm": 0.32036384873450585, "learning_rate": 2.1428975575971243e-05, "loss": 0.3952087461948395, "num_tokens": 225107686.0, "step": 249 }, { "epoch": 1.8656716417910446, "grad_norm": 0.2750788343679779, "learning_rate": 2.1363212014780432e-05, "loss": 0.3948509097099304, "num_tokens": 226126493.0, "step": 250 }, { "epoch": 1.873134328358209, "grad_norm": 0.3546848770246566, "learning_rate": 2.1297315569490704e-05, "loss": 0.38538211584091187, "num_tokens": 227002265.0, "step": 251 }, { "epoch": 1.8805970149253732, "grad_norm": 0.31987168628076534, "learning_rate": 2.123128804067949e-05, "loss": 0.3849794268608093, "num_tokens": 227879194.0, "step": 252 }, { "epoch": 1.8880597014925373, "grad_norm": 0.35226690474895933, "learning_rate": 2.1165131232505973e-05, "loss": 0.40667471289634705, "num_tokens": 228849840.0, "step": 253 }, { "epoch": 1.8955223880597014, "grad_norm": 0.3517377549019829, "learning_rate": 2.1098846952661833e-05, "loss": 0.36520224809646606, "num_tokens": 229755841.0, "step": 254 }, { "epoch": 1.9029850746268657, "grad_norm": 0.31196447519845827, "learning_rate": 2.1032437012321812e-05, "loss": 0.37600016593933105, "num_tokens": 230531378.0, "step": 255 }, { "epoch": 1.9104477611940298, "grad_norm": 0.2994021775242901, "learning_rate": 2.0965903226094246e-05, "loss": 0.35384806990623474, "num_tokens": 231462516.0, "step": 256 }, { "epoch": 1.917910447761194, "grad_norm": 0.3160465003726717, "learning_rate": 2.08992474119715e-05, "loss": 0.3978261649608612, "num_tokens": 232355925.0, "step": 257 }, { "epoch": 1.9253731343283582, "grad_norm": 0.3261497602811777, "learning_rate": 2.0832471391280234e-05, "loss": 0.40133193135261536, "num_tokens": 233194033.0, "step": 258 }, { "epoch": 1.9328358208955225, "grad_norm": 0.2915382309557714, "learning_rate": 2.0765576988631707e-05, "loss": 0.36901217699050903, "num_tokens": 234098698.0, "step": 259 }, { "epoch": 1.9402985074626866, "grad_norm": 0.3301749887472271, "learning_rate": 2.0698566031871877e-05, "loss": 0.38775068521499634, "num_tokens": 235139771.0, "step": 260 }, { "epoch": 1.9477611940298507, "grad_norm": 0.32435823220698096, "learning_rate": 2.063144035203146e-05, "loss": 0.37508994340896606, "num_tokens": 235974035.0, "step": 261 }, { "epoch": 1.955223880597015, "grad_norm": 0.3051639407042942, "learning_rate": 2.0564201783275908e-05, "loss": 0.3903445601463318, "num_tokens": 236882822.0, "step": 262 }, { "epoch": 1.962686567164179, "grad_norm": 0.3254047024560983, "learning_rate": 2.0496852162855303e-05, "loss": 0.40096017718315125, "num_tokens": 237883798.0, "step": 263 }, { "epoch": 1.9701492537313432, "grad_norm": 0.3010384436296043, "learning_rate": 2.0429393331054122e-05, "loss": 0.3954760432243347, "num_tokens": 238711038.0, "step": 264 }, { "epoch": 1.9776119402985075, "grad_norm": 0.2897130131386432, "learning_rate": 2.0361827131140988e-05, "loss": 0.3967036008834839, "num_tokens": 239602771.0, "step": 265 }, { "epoch": 1.9850746268656716, "grad_norm": 0.2957878042115852, "learning_rate": 2.0294155409318273e-05, "loss": 0.3834611177444458, "num_tokens": 240344316.0, "step": 266 }, { "epoch": 1.9925373134328357, "grad_norm": 0.28585034773563434, "learning_rate": 2.022638001467168e-05, "loss": 0.36557599902153015, "num_tokens": 241222304.0, "step": 267 }, { "epoch": 2.0, "grad_norm": 0.31387177455183296, "learning_rate": 2.0158502799119694e-05, "loss": 0.3776703178882599, "num_tokens": 242128094.0, "step": 268 }, { "epoch": 2.0074626865671643, "grad_norm": 0.4118428271810675, "learning_rate": 2.0090525617362995e-05, "loss": 0.35364389419555664, "num_tokens": 243032329.0, "step": 269 }, { "epoch": 2.014925373134328, "grad_norm": 0.3402616783384963, "learning_rate": 2.002245032683378e-05, "loss": 0.3219972252845764, "num_tokens": 243800954.0, "step": 270 }, { "epoch": 2.0223880597014925, "grad_norm": 0.37707313442099644, "learning_rate": 1.9954278787644977e-05, "loss": 0.3484679162502289, "num_tokens": 244806412.0, "step": 271 }, { "epoch": 2.029850746268657, "grad_norm": 0.39396702522268606, "learning_rate": 1.988601286253949e-05, "loss": 0.3331984877586365, "num_tokens": 245802398.0, "step": 272 }, { "epoch": 2.0373134328358207, "grad_norm": 0.40195308083874304, "learning_rate": 1.9817654416839217e-05, "loss": 0.3107374608516693, "num_tokens": 246739297.0, "step": 273 }, { "epoch": 2.044776119402985, "grad_norm": 0.35623399811084044, "learning_rate": 1.9749205318394146e-05, "loss": 0.3448570966720581, "num_tokens": 247613231.0, "step": 274 }, { "epoch": 2.0522388059701493, "grad_norm": 0.36546920760892426, "learning_rate": 1.9680667437531283e-05, "loss": 0.332324355840683, "num_tokens": 248481475.0, "step": 275 }, { "epoch": 2.0597014925373136, "grad_norm": 0.34431898026591723, "learning_rate": 1.961204264700355e-05, "loss": 0.3348411023616791, "num_tokens": 249300938.0, "step": 276 }, { "epoch": 2.0671641791044775, "grad_norm": 0.35727153061507005, "learning_rate": 1.954333282193863e-05, "loss": 0.33406710624694824, "num_tokens": 250171263.0, "step": 277 }, { "epoch": 2.074626865671642, "grad_norm": 0.3250548689859224, "learning_rate": 1.9474539839787713e-05, "loss": 0.3140842020511627, "num_tokens": 251071115.0, "step": 278 }, { "epoch": 2.082089552238806, "grad_norm": 0.3059178018916231, "learning_rate": 1.9405665580274205e-05, "loss": 0.32764101028442383, "num_tokens": 251961398.0, "step": 279 }, { "epoch": 2.08955223880597, "grad_norm": 0.3265488394498236, "learning_rate": 1.9336711925342357e-05, "loss": 0.31429940462112427, "num_tokens": 252775080.0, "step": 280 }, { "epoch": 2.0970149253731343, "grad_norm": 0.3033003365838648, "learning_rate": 1.926768075910586e-05, "loss": 0.3364748954772949, "num_tokens": 253678902.0, "step": 281 }, { "epoch": 2.1044776119402986, "grad_norm": 0.3218060266076608, "learning_rate": 1.919857396779633e-05, "loss": 0.34063756465911865, "num_tokens": 254547582.0, "step": 282 }, { "epoch": 2.111940298507463, "grad_norm": 0.28752356415270264, "learning_rate": 1.9129393439711812e-05, "loss": 0.3032745122909546, "num_tokens": 255299741.0, "step": 283 }, { "epoch": 2.1194029850746268, "grad_norm": 0.3114027725704962, "learning_rate": 1.906014106516515e-05, "loss": 0.323519766330719, "num_tokens": 256183942.0, "step": 284 }, { "epoch": 2.126865671641791, "grad_norm": 0.35567716347702344, "learning_rate": 1.899081873643235e-05, "loss": 0.3606981635093689, "num_tokens": 257098083.0, "step": 285 }, { "epoch": 2.1343283582089554, "grad_norm": 0.31558423890531895, "learning_rate": 1.8921428347700853e-05, "loss": 0.33504611253738403, "num_tokens": 258138577.0, "step": 286 }, { "epoch": 2.1417910447761193, "grad_norm": 0.34224186580930754, "learning_rate": 1.8851971795017822e-05, "loss": 0.326399028301239, "num_tokens": 258888036.0, "step": 287 }, { "epoch": 2.1492537313432836, "grad_norm": 0.30575598315812624, "learning_rate": 1.8782450976238294e-05, "loss": 0.3074103593826294, "num_tokens": 259766509.0, "step": 288 }, { "epoch": 2.156716417910448, "grad_norm": 0.3205831945487892, "learning_rate": 1.8712867790973317e-05, "loss": 0.33759474754333496, "num_tokens": 260610097.0, "step": 289 }, { "epoch": 2.1641791044776117, "grad_norm": 0.3023776868912514, "learning_rate": 1.86432241405381e-05, "loss": 0.3334404230117798, "num_tokens": 261447212.0, "step": 290 }, { "epoch": 2.171641791044776, "grad_norm": 0.30838933870298346, "learning_rate": 1.8573521927900004e-05, "loss": 0.32669875025749207, "num_tokens": 262481613.0, "step": 291 }, { "epoch": 2.1791044776119404, "grad_norm": 0.31402266902142234, "learning_rate": 1.850376305762655e-05, "loss": 0.35277265310287476, "num_tokens": 263536437.0, "step": 292 }, { "epoch": 2.1865671641791047, "grad_norm": 0.31931309491882254, "learning_rate": 1.843394943583342e-05, "loss": 0.32963383197784424, "num_tokens": 264379962.0, "step": 293 }, { "epoch": 2.1940298507462686, "grad_norm": 0.34845358198148824, "learning_rate": 1.836408297013232e-05, "loss": 0.3339906334877014, "num_tokens": 265196630.0, "step": 294 }, { "epoch": 2.201492537313433, "grad_norm": 0.3046594968746612, "learning_rate": 1.8294165569578902e-05, "loss": 0.33100634813308716, "num_tokens": 266192395.0, "step": 295 }, { "epoch": 2.208955223880597, "grad_norm": 0.30699215790098994, "learning_rate": 1.8224199144620557e-05, "loss": 0.33232712745666504, "num_tokens": 267198691.0, "step": 296 }, { "epoch": 2.216417910447761, "grad_norm": 0.29968857683346356, "learning_rate": 1.8154185607044267e-05, "loss": 0.3363949656486511, "num_tokens": 268129026.0, "step": 297 }, { "epoch": 2.2238805970149254, "grad_norm": 0.2805025168393364, "learning_rate": 1.8084126869924304e-05, "loss": 0.32357555627822876, "num_tokens": 269034104.0, "step": 298 }, { "epoch": 2.2313432835820897, "grad_norm": 0.3030010408610569, "learning_rate": 1.801402484757001e-05, "loss": 0.31561556458473206, "num_tokens": 269856471.0, "step": 299 }, { "epoch": 2.2388059701492535, "grad_norm": 0.3195759723269335, "learning_rate": 1.794388145547346e-05, "loss": 0.34712180495262146, "num_tokens": 270737041.0, "step": 300 }, { "epoch": 2.246268656716418, "grad_norm": 0.3002118422900145, "learning_rate": 1.7873698610257117e-05, "loss": 0.35004639625549316, "num_tokens": 271655450.0, "step": 301 }, { "epoch": 2.253731343283582, "grad_norm": 0.28294722266034306, "learning_rate": 1.7803478229621504e-05, "loss": 0.3119392395019531, "num_tokens": 272452734.0, "step": 302 }, { "epoch": 2.2611940298507465, "grad_norm": 0.3751958180610849, "learning_rate": 1.773322223229275e-05, "loss": 0.3349981904029846, "num_tokens": 273321732.0, "step": 303 }, { "epoch": 2.2686567164179103, "grad_norm": 0.29383426672277096, "learning_rate": 1.766293253797021e-05, "loss": 0.3226167857646942, "num_tokens": 274159747.0, "step": 304 }, { "epoch": 2.2761194029850746, "grad_norm": 0.3225857187342117, "learning_rate": 1.7592611067273947e-05, "loss": 0.34066349267959595, "num_tokens": 275031559.0, "step": 305 }, { "epoch": 2.283582089552239, "grad_norm": 0.30673283102679866, "learning_rate": 1.7522259741692343e-05, "loss": 0.33413374423980713, "num_tokens": 275930932.0, "step": 306 }, { "epoch": 2.291044776119403, "grad_norm": 0.29177063821827953, "learning_rate": 1.7451880483529507e-05, "loss": 0.308035671710968, "num_tokens": 276741084.0, "step": 307 }, { "epoch": 2.298507462686567, "grad_norm": 0.28427282903782, "learning_rate": 1.7381475215852805e-05, "loss": 0.3250593841075897, "num_tokens": 277707588.0, "step": 308 }, { "epoch": 2.3059701492537314, "grad_norm": 0.2971627244171146, "learning_rate": 1.7311045862440298e-05, "loss": 0.32269105315208435, "num_tokens": 278703194.0, "step": 309 }, { "epoch": 2.3134328358208958, "grad_norm": 0.31555926494620046, "learning_rate": 1.724059434772816e-05, "loss": 0.32977578043937683, "num_tokens": 279491539.0, "step": 310 }, { "epoch": 2.3208955223880596, "grad_norm": 0.36417590081584483, "learning_rate": 1.7170122596758127e-05, "loss": 0.33532094955444336, "num_tokens": 280324630.0, "step": 311 }, { "epoch": 2.328358208955224, "grad_norm": 0.3105391565464644, "learning_rate": 1.7099632535124854e-05, "loss": 0.3156779408454895, "num_tokens": 281308248.0, "step": 312 }, { "epoch": 2.3358208955223883, "grad_norm": 0.2749240967299516, "learning_rate": 1.702912608892335e-05, "loss": 0.31482142210006714, "num_tokens": 282221715.0, "step": 313 }, { "epoch": 2.343283582089552, "grad_norm": 0.29515283586141233, "learning_rate": 1.6958605184696297e-05, "loss": 0.32622820138931274, "num_tokens": 283077823.0, "step": 314 }, { "epoch": 2.3507462686567164, "grad_norm": 0.2968928416618244, "learning_rate": 1.688807174938145e-05, "loss": 0.3397972583770752, "num_tokens": 284064121.0, "step": 315 }, { "epoch": 2.3582089552238807, "grad_norm": 0.29827012810037, "learning_rate": 1.681752771025896e-05, "loss": 0.3332856297492981, "num_tokens": 285080424.0, "step": 316 }, { "epoch": 2.3656716417910446, "grad_norm": 0.3039705351616898, "learning_rate": 1.674697499489872e-05, "loss": 0.33647334575653076, "num_tokens": 286006199.0, "step": 317 }, { "epoch": 2.373134328358209, "grad_norm": 0.30141732787530867, "learning_rate": 1.6676415531107706e-05, "loss": 0.3342139720916748, "num_tokens": 286965514.0, "step": 318 }, { "epoch": 2.3805970149253732, "grad_norm": 0.2956530210848347, "learning_rate": 1.6605851246877272e-05, "loss": 0.3201013207435608, "num_tokens": 287842439.0, "step": 319 }, { "epoch": 2.388059701492537, "grad_norm": 0.2940637214016598, "learning_rate": 1.65352840703305e-05, "loss": 0.3377227783203125, "num_tokens": 288763923.0, "step": 320 }, { "epoch": 2.3955223880597014, "grad_norm": 0.2832996218159561, "learning_rate": 1.64647159296695e-05, "loss": 0.3385891020298004, "num_tokens": 289625254.0, "step": 321 }, { "epoch": 2.4029850746268657, "grad_norm": 0.3190448740603143, "learning_rate": 1.6394148753122734e-05, "loss": 0.33053308725357056, "num_tokens": 290474425.0, "step": 322 }, { "epoch": 2.41044776119403, "grad_norm": 0.3096387349106184, "learning_rate": 1.63235844688923e-05, "loss": 0.3427371680736542, "num_tokens": 291335951.0, "step": 323 }, { "epoch": 2.417910447761194, "grad_norm": 0.27491757978825115, "learning_rate": 1.6253025005101283e-05, "loss": 0.3303934931755066, "num_tokens": 292257658.0, "step": 324 }, { "epoch": 2.425373134328358, "grad_norm": 0.3066415534546823, "learning_rate": 1.6182472289741043e-05, "loss": 0.36399906873703003, "num_tokens": 293162733.0, "step": 325 }, { "epoch": 2.4328358208955225, "grad_norm": 0.29140134988200495, "learning_rate": 1.611192825061855e-05, "loss": 0.3504979610443115, "num_tokens": 294199419.0, "step": 326 }, { "epoch": 2.4402985074626864, "grad_norm": 0.2895038992576744, "learning_rate": 1.604139481530371e-05, "loss": 0.35671094059944153, "num_tokens": 295163721.0, "step": 327 }, { "epoch": 2.4477611940298507, "grad_norm": 0.2871110161885208, "learning_rate": 1.5970873911076654e-05, "loss": 0.3230712115764618, "num_tokens": 296048485.0, "step": 328 }, { "epoch": 2.455223880597015, "grad_norm": 0.29355297538880015, "learning_rate": 1.590036746487515e-05, "loss": 0.32808297872543335, "num_tokens": 296905697.0, "step": 329 }, { "epoch": 2.4626865671641793, "grad_norm": 0.2951986083005398, "learning_rate": 1.5829877403241875e-05, "loss": 0.3399554491043091, "num_tokens": 297837804.0, "step": 330 }, { "epoch": 2.470149253731343, "grad_norm": 0.29400144153471874, "learning_rate": 1.5759405652271843e-05, "loss": 0.33751606941223145, "num_tokens": 298822600.0, "step": 331 }, { "epoch": 2.4776119402985075, "grad_norm": 0.31657313495586964, "learning_rate": 1.5688954137559705e-05, "loss": 0.35242465138435364, "num_tokens": 299764042.0, "step": 332 }, { "epoch": 2.485074626865672, "grad_norm": 0.2716779461194812, "learning_rate": 1.5618524784147197e-05, "loss": 0.3363187313079834, "num_tokens": 300754135.0, "step": 333 }, { "epoch": 2.4925373134328357, "grad_norm": 0.29632964201216716, "learning_rate": 1.5548119516470496e-05, "loss": 0.3306392431259155, "num_tokens": 301644488.0, "step": 334 }, { "epoch": 2.5, "grad_norm": 0.27058966408716395, "learning_rate": 1.547774025830766e-05, "loss": 0.31814491748809814, "num_tokens": 302538046.0, "step": 335 }, { "epoch": 2.5074626865671643, "grad_norm": 0.3055781987611692, "learning_rate": 1.5407388932726056e-05, "loss": 0.3387256860733032, "num_tokens": 303333898.0, "step": 336 }, { "epoch": 2.5149253731343286, "grad_norm": 0.27960594879506695, "learning_rate": 1.53370674620298e-05, "loss": 0.33688774704933167, "num_tokens": 304300483.0, "step": 337 }, { "epoch": 2.5223880597014925, "grad_norm": 0.3168292150331439, "learning_rate": 1.526677776770725e-05, "loss": 0.34352821111679077, "num_tokens": 305176138.0, "step": 338 }, { "epoch": 2.529850746268657, "grad_norm": 0.40646030417402895, "learning_rate": 1.5196521770378498e-05, "loss": 0.3636009693145752, "num_tokens": 306092248.0, "step": 339 }, { "epoch": 2.5373134328358207, "grad_norm": 0.34451254015626254, "learning_rate": 1.5126301389742889e-05, "loss": 0.3361930251121521, "num_tokens": 306939786.0, "step": 340 }, { "epoch": 2.544776119402985, "grad_norm": 0.301191033765102, "learning_rate": 1.5056118544526552e-05, "loss": 0.34493589401245117, "num_tokens": 307676111.0, "step": 341 }, { "epoch": 2.5522388059701493, "grad_norm": 0.3179102063972407, "learning_rate": 1.4985975152429998e-05, "loss": 0.35899001359939575, "num_tokens": 308557757.0, "step": 342 }, { "epoch": 2.5597014925373136, "grad_norm": 0.3171583352212965, "learning_rate": 1.4915873130075704e-05, "loss": 0.3521921634674072, "num_tokens": 309465811.0, "step": 343 }, { "epoch": 2.5671641791044775, "grad_norm": 0.3150146379597525, "learning_rate": 1.484581439295574e-05, "loss": 0.3577362895011902, "num_tokens": 310383391.0, "step": 344 }, { "epoch": 2.574626865671642, "grad_norm": 0.3129412764272567, "learning_rate": 1.4775800855379447e-05, "loss": 0.33559077978134155, "num_tokens": 311194322.0, "step": 345 }, { "epoch": 2.582089552238806, "grad_norm": 0.3119289397564452, "learning_rate": 1.4705834430421109e-05, "loss": 0.3442152142524719, "num_tokens": 312296357.0, "step": 346 }, { "epoch": 2.58955223880597, "grad_norm": 0.30424883379817386, "learning_rate": 1.4635917029867686e-05, "loss": 0.3301926851272583, "num_tokens": 313212850.0, "step": 347 }, { "epoch": 2.5970149253731343, "grad_norm": 0.2888937621740727, "learning_rate": 1.4566050564166585e-05, "loss": 0.3173384368419647, "num_tokens": 314136793.0, "step": 348 }, { "epoch": 2.6044776119402986, "grad_norm": 0.3181049412726844, "learning_rate": 1.4496236942373452e-05, "loss": 0.33396849036216736, "num_tokens": 315103742.0, "step": 349 }, { "epoch": 2.611940298507463, "grad_norm": 0.285849757305298, "learning_rate": 1.4426478072100001e-05, "loss": 0.3271850645542145, "num_tokens": 315911989.0, "step": 350 }, { "epoch": 2.6194029850746268, "grad_norm": 0.30462072449655236, "learning_rate": 1.4356775859461898e-05, "loss": 0.3309672474861145, "num_tokens": 316818398.0, "step": 351 }, { "epoch": 2.626865671641791, "grad_norm": 0.28242189700779124, "learning_rate": 1.4287132209026686e-05, "loss": 0.3406432271003723, "num_tokens": 317815953.0, "step": 352 }, { "epoch": 2.6343283582089554, "grad_norm": 0.30367294764460456, "learning_rate": 1.4217549023761713e-05, "loss": 0.33886873722076416, "num_tokens": 318782328.0, "step": 353 }, { "epoch": 2.6417910447761193, "grad_norm": 0.2986406693990765, "learning_rate": 1.4148028204982184e-05, "loss": 0.3135310113430023, "num_tokens": 319721759.0, "step": 354 }, { "epoch": 2.6492537313432836, "grad_norm": 0.26230985959231096, "learning_rate": 1.407857165229915e-05, "loss": 0.3319952189922333, "num_tokens": 320632767.0, "step": 355 }, { "epoch": 2.656716417910448, "grad_norm": 0.29286908776153336, "learning_rate": 1.4009181263567659e-05, "loss": 0.33468297123908997, "num_tokens": 321567293.0, "step": 356 }, { "epoch": 2.664179104477612, "grad_norm": 0.2655369943810491, "learning_rate": 1.3939858934834851e-05, "loss": 0.31415632367134094, "num_tokens": 322466432.0, "step": 357 }, { "epoch": 2.671641791044776, "grad_norm": 0.29776986063827793, "learning_rate": 1.3870606560288188e-05, "loss": 0.32620397210121155, "num_tokens": 323416159.0, "step": 358 }, { "epoch": 2.6791044776119404, "grad_norm": 0.2888554358463497, "learning_rate": 1.3801426032203668e-05, "loss": 0.3294253945350647, "num_tokens": 324280115.0, "step": 359 }, { "epoch": 2.6865671641791042, "grad_norm": 0.2743974222493521, "learning_rate": 1.3732319240894143e-05, "loss": 0.33846813440322876, "num_tokens": 325182095.0, "step": 360 }, { "epoch": 2.6940298507462686, "grad_norm": 0.28798464786719813, "learning_rate": 1.3663288074657639e-05, "loss": 0.32448339462280273, "num_tokens": 326171068.0, "step": 361 }, { "epoch": 2.701492537313433, "grad_norm": 0.24943230534603614, "learning_rate": 1.3594334419725797e-05, "loss": 0.3398998975753784, "num_tokens": 327115635.0, "step": 362 }, { "epoch": 2.708955223880597, "grad_norm": 0.2855896503061799, "learning_rate": 1.3525460160212284e-05, "loss": 0.3351544141769409, "num_tokens": 328060133.0, "step": 363 }, { "epoch": 2.716417910447761, "grad_norm": 0.2981015005997933, "learning_rate": 1.3456667178061365e-05, "loss": 0.3235108256340027, "num_tokens": 328868585.0, "step": 364 }, { "epoch": 2.7238805970149254, "grad_norm": 0.3017533668551756, "learning_rate": 1.3387957352996446e-05, "loss": 0.34303897619247437, "num_tokens": 329676478.0, "step": 365 }, { "epoch": 2.7313432835820897, "grad_norm": 0.2793280893422549, "learning_rate": 1.3319332562468716e-05, "loss": 0.3332846164703369, "num_tokens": 330487275.0, "step": 366 }, { "epoch": 2.7388059701492535, "grad_norm": 0.272656727703741, "learning_rate": 1.3250794681605853e-05, "loss": 0.3316395878791809, "num_tokens": 331339930.0, "step": 367 }, { "epoch": 2.746268656716418, "grad_norm": 0.2742961273683912, "learning_rate": 1.3182345583160782e-05, "loss": 0.3241080045700073, "num_tokens": 332357238.0, "step": 368 }, { "epoch": 2.753731343283582, "grad_norm": 0.26762807579168846, "learning_rate": 1.3113987137460514e-05, "loss": 0.331865131855011, "num_tokens": 333294492.0, "step": 369 }, { "epoch": 2.7611940298507465, "grad_norm": 0.28820933208703176, "learning_rate": 1.3045721212355023e-05, "loss": 0.35760703682899475, "num_tokens": 334107753.0, "step": 370 }, { "epoch": 2.7686567164179103, "grad_norm": 0.26979220761978373, "learning_rate": 1.2977549673166228e-05, "loss": 0.3278617858886719, "num_tokens": 334989082.0, "step": 371 }, { "epoch": 2.7761194029850746, "grad_norm": 0.27879196286904034, "learning_rate": 1.2909474382637006e-05, "loss": 0.33124369382858276, "num_tokens": 335901082.0, "step": 372 }, { "epoch": 2.783582089552239, "grad_norm": 0.2463893540212004, "learning_rate": 1.2841497200880305e-05, "loss": 0.32943689823150635, "num_tokens": 336958851.0, "step": 373 }, { "epoch": 2.791044776119403, "grad_norm": 0.27477456461332017, "learning_rate": 1.2773619985328323e-05, "loss": 0.3239135444164276, "num_tokens": 337786409.0, "step": 374 }, { "epoch": 2.798507462686567, "grad_norm": 0.28497614476087085, "learning_rate": 1.2705844590681726e-05, "loss": 0.3271849453449249, "num_tokens": 338694981.0, "step": 375 }, { "epoch": 2.8059701492537314, "grad_norm": 0.2777009652008523, "learning_rate": 1.2638172868859015e-05, "loss": 0.31704217195510864, "num_tokens": 339501927.0, "step": 376 }, { "epoch": 2.8134328358208958, "grad_norm": 0.30766613572700274, "learning_rate": 1.2570606668945877e-05, "loss": 0.35138726234436035, "num_tokens": 340423876.0, "step": 377 }, { "epoch": 2.8208955223880596, "grad_norm": 0.24806225558937847, "learning_rate": 1.2503147837144702e-05, "loss": 0.31420814990997314, "num_tokens": 341285598.0, "step": 378 }, { "epoch": 2.828358208955224, "grad_norm": 0.2847378795760287, "learning_rate": 1.2435798216724094e-05, "loss": 0.32901105284690857, "num_tokens": 342213168.0, "step": 379 }, { "epoch": 2.835820895522388, "grad_norm": 0.27949658273947187, "learning_rate": 1.2368559647968544e-05, "loss": 0.34290027618408203, "num_tokens": 343216441.0, "step": 380 }, { "epoch": 2.843283582089552, "grad_norm": 0.27303724081659647, "learning_rate": 1.2301433968128127e-05, "loss": 0.3377082645893097, "num_tokens": 344164273.0, "step": 381 }, { "epoch": 2.8507462686567164, "grad_norm": 0.28475093005317836, "learning_rate": 1.2234423011368292e-05, "loss": 0.3300044536590576, "num_tokens": 345034929.0, "step": 382 }, { "epoch": 2.8582089552238807, "grad_norm": 0.280519214961473, "learning_rate": 1.2167528608719768e-05, "loss": 0.3426816463470459, "num_tokens": 345822215.0, "step": 383 }, { "epoch": 2.8656716417910446, "grad_norm": 0.27103397464423407, "learning_rate": 1.2100752588028507e-05, "loss": 0.33561939001083374, "num_tokens": 346779144.0, "step": 384 }, { "epoch": 2.873134328358209, "grad_norm": 0.26428076882187357, "learning_rate": 1.2034096773905753e-05, "loss": 0.3420035243034363, "num_tokens": 347750581.0, "step": 385 }, { "epoch": 2.8805970149253732, "grad_norm": 0.30501014590148545, "learning_rate": 1.196756298767819e-05, "loss": 0.33400657773017883, "num_tokens": 348809613.0, "step": 386 }, { "epoch": 2.888059701492537, "grad_norm": 0.24697890321618382, "learning_rate": 1.1901153047338168e-05, "loss": 0.3329269289970398, "num_tokens": 349843341.0, "step": 387 }, { "epoch": 2.8955223880597014, "grad_norm": 0.266310278361214, "learning_rate": 1.1834868767494028e-05, "loss": 0.3315233588218689, "num_tokens": 350686011.0, "step": 388 }, { "epoch": 2.9029850746268657, "grad_norm": 0.25685951719776035, "learning_rate": 1.1768711959320512e-05, "loss": 0.3367440104484558, "num_tokens": 351603297.0, "step": 389 }, { "epoch": 2.91044776119403, "grad_norm": 0.3604332672305553, "learning_rate": 1.1702684430509298e-05, "loss": 0.35349708795547485, "num_tokens": 352566195.0, "step": 390 }, { "epoch": 2.917910447761194, "grad_norm": 0.2935692512218851, "learning_rate": 1.1636787985219572e-05, "loss": 0.3288194537162781, "num_tokens": 353185236.0, "step": 391 }, { "epoch": 2.925373134328358, "grad_norm": 0.2815859488295857, "learning_rate": 1.1571024424028761e-05, "loss": 0.339729905128479, "num_tokens": 354050628.0, "step": 392 }, { "epoch": 2.9328358208955225, "grad_norm": 0.275808180586563, "learning_rate": 1.1505395543883313e-05, "loss": 0.3455864489078522, "num_tokens": 354968219.0, "step": 393 }, { "epoch": 2.9402985074626864, "grad_norm": 0.25420785215211034, "learning_rate": 1.143990313804961e-05, "loss": 0.33193981647491455, "num_tokens": 355907268.0, "step": 394 }, { "epoch": 2.9477611940298507, "grad_norm": 0.2700179741152324, "learning_rate": 1.1374548996064953e-05, "loss": 0.32135769724845886, "num_tokens": 356786243.0, "step": 395 }, { "epoch": 2.955223880597015, "grad_norm": 0.2881492550060451, "learning_rate": 1.1309334903688686e-05, "loss": 0.33170467615127563, "num_tokens": 357742891.0, "step": 396 }, { "epoch": 2.9626865671641793, "grad_norm": 0.267258769627609, "learning_rate": 1.1244262642853383e-05, "loss": 0.3263099193572998, "num_tokens": 358521016.0, "step": 397 }, { "epoch": 2.970149253731343, "grad_norm": 0.27461845486227027, "learning_rate": 1.1179333991616162e-05, "loss": 0.31942278146743774, "num_tokens": 359455120.0, "step": 398 }, { "epoch": 2.9776119402985075, "grad_norm": 0.28304959654627004, "learning_rate": 1.1114550724110105e-05, "loss": 0.3328409790992737, "num_tokens": 360361804.0, "step": 399 }, { "epoch": 2.9850746268656714, "grad_norm": 0.25788972512908753, "learning_rate": 1.1049914610495772e-05, "loss": 0.3321342468261719, "num_tokens": 361424683.0, "step": 400 }, { "epoch": 2.9925373134328357, "grad_norm": 0.28331510950003724, "learning_rate": 1.0985427416912853e-05, "loss": 0.33656731247901917, "num_tokens": 362323989.0, "step": 401 }, { "epoch": 3.0, "grad_norm": 0.3020936447432793, "learning_rate": 1.0921090905431871e-05, "loss": 0.33412468433380127, "num_tokens": 363125328.0, "step": 402 }, { "epoch": 3.0074626865671643, "grad_norm": 0.3799465813612812, "learning_rate": 1.0856906834006088e-05, "loss": 0.2873135805130005, "num_tokens": 363894208.0, "step": 403 }, { "epoch": 3.014925373134328, "grad_norm": 0.36967050447420124, "learning_rate": 1.079287695642342e-05, "loss": 0.2959785461425781, "num_tokens": 364737491.0, "step": 404 }, { "epoch": 3.0223880597014925, "grad_norm": 0.29461417590711114, "learning_rate": 1.0729003022258542e-05, "loss": 0.29170793294906616, "num_tokens": 365722761.0, "step": 405 }, { "epoch": 3.029850746268657, "grad_norm": 0.4116803087373601, "learning_rate": 1.0665286776825081e-05, "loss": 0.30883458256721497, "num_tokens": 366512957.0, "step": 406 }, { "epoch": 3.0373134328358207, "grad_norm": 0.46157786514533145, "learning_rate": 1.0601729961127924e-05, "loss": 0.30715805292129517, "num_tokens": 367415626.0, "step": 407 }, { "epoch": 3.044776119402985, "grad_norm": 0.4187156114489574, "learning_rate": 1.0538334311815627e-05, "loss": 0.31521543860435486, "num_tokens": 368197609.0, "step": 408 }, { "epoch": 3.0522388059701493, "grad_norm": 0.3101860773424548, "learning_rate": 1.0475101561133e-05, "loss": 0.2965121269226074, "num_tokens": 369065047.0, "step": 409 }, { "epoch": 3.0597014925373136, "grad_norm": 0.33242255182112, "learning_rate": 1.0412033436873744e-05, "loss": 0.2895386815071106, "num_tokens": 370031828.0, "step": 410 }, { "epoch": 3.0671641791044775, "grad_norm": 0.33876504585540845, "learning_rate": 1.0349131662333255e-05, "loss": 0.3026469647884369, "num_tokens": 370964850.0, "step": 411 }, { "epoch": 3.074626865671642, "grad_norm": 0.30688475151658845, "learning_rate": 1.0286397956261533e-05, "loss": 0.2771751582622528, "num_tokens": 371789883.0, "step": 412 }, { "epoch": 3.082089552238806, "grad_norm": 0.29360163117574556, "learning_rate": 1.0223834032816198e-05, "loss": 0.3152085840702057, "num_tokens": 372663206.0, "step": 413 }, { "epoch": 3.08955223880597, "grad_norm": 0.31015233222700656, "learning_rate": 1.0161441601515695e-05, "loss": 0.2951708137989044, "num_tokens": 373488698.0, "step": 414 }, { "epoch": 3.0970149253731343, "grad_norm": 0.3610813886577992, "learning_rate": 1.0099222367192547e-05, "loss": 0.3165642321109772, "num_tokens": 374309008.0, "step": 415 }, { "epoch": 3.1044776119402986, "grad_norm": 0.3051632607105892, "learning_rate": 1.0037178029946785e-05, "loss": 0.2940051853656769, "num_tokens": 375243569.0, "step": 416 }, { "epoch": 3.111940298507463, "grad_norm": 0.28089972063969026, "learning_rate": 9.975310285099484e-06, "loss": 0.30177193880081177, "num_tokens": 376203415.0, "step": 417 }, { "epoch": 3.1194029850746268, "grad_norm": 0.25054754752277697, "learning_rate": 9.913620823146451e-06, "loss": 0.2875446081161499, "num_tokens": 377153421.0, "step": 418 }, { "epoch": 3.126865671641791, "grad_norm": 0.26784113270221377, "learning_rate": 9.852111329712039e-06, "loss": 0.30190229415893555, "num_tokens": 378087276.0, "step": 419 }, { "epoch": 3.1343283582089554, "grad_norm": 0.2708038955849966, "learning_rate": 9.790783485503063e-06, "loss": 0.27638930082321167, "num_tokens": 378977281.0, "step": 420 }, { "epoch": 3.1417910447761193, "grad_norm": 0.29895669435540717, "learning_rate": 9.729638966262907e-06, "loss": 0.29848071932792664, "num_tokens": 379899880.0, "step": 421 }, { "epoch": 3.1492537313432836, "grad_norm": 0.2554802919348738, "learning_rate": 9.668679442725697e-06, "loss": 0.27390313148498535, "num_tokens": 380749969.0, "step": 422 }, { "epoch": 3.156716417910448, "grad_norm": 0.3050926022188745, "learning_rate": 9.607906580570695e-06, "loss": 0.2757868468761444, "num_tokens": 381625559.0, "step": 423 }, { "epoch": 3.1641791044776117, "grad_norm": 0.2679289456473412, "learning_rate": 9.54732204037675e-06, "loss": 0.284029483795166, "num_tokens": 382524515.0, "step": 424 }, { "epoch": 3.171641791044776, "grad_norm": 0.3337923481748472, "learning_rate": 9.486927477576948e-06, "loss": 0.2807900905609131, "num_tokens": 383460945.0, "step": 425 }, { "epoch": 3.1791044776119404, "grad_norm": 0.26612009858515995, "learning_rate": 9.426724542413345e-06, "loss": 0.273318886756897, "num_tokens": 384264130.0, "step": 426 }, { "epoch": 3.1865671641791047, "grad_norm": 0.3524087992151756, "learning_rate": 9.366714879891915e-06, "loss": 0.3047345280647278, "num_tokens": 385268579.0, "step": 427 }, { "epoch": 3.1940298507462686, "grad_norm": 0.27293635594516935, "learning_rate": 9.306900129737579e-06, "loss": 0.2729998230934143, "num_tokens": 386028916.0, "step": 428 }, { "epoch": 3.201492537313433, "grad_norm": 0.2758767208976419, "learning_rate": 9.2472819263494e-06, "loss": 0.2999764680862427, "num_tokens": 386990098.0, "step": 429 }, { "epoch": 3.208955223880597, "grad_norm": 0.2600988814863155, "learning_rate": 9.187861898755944e-06, "loss": 0.28329452872276306, "num_tokens": 387863679.0, "step": 430 }, { "epoch": 3.216417910447761, "grad_norm": 0.2692034268302841, "learning_rate": 9.128641670570722e-06, "loss": 0.29894596338272095, "num_tokens": 388670162.0, "step": 431 }, { "epoch": 3.2238805970149254, "grad_norm": 0.26505479450449865, "learning_rate": 9.069622859947886e-06, "loss": 0.28377240896224976, "num_tokens": 389520124.0, "step": 432 }, { "epoch": 3.2313432835820897, "grad_norm": 0.27522585261540833, "learning_rate": 9.010807079537969e-06, "loss": 0.30390995740890503, "num_tokens": 390462131.0, "step": 433 }, { "epoch": 3.2388059701492535, "grad_norm": 0.2716269898933067, "learning_rate": 8.952195936443843e-06, "loss": 0.28739655017852783, "num_tokens": 391294528.0, "step": 434 }, { "epoch": 3.246268656716418, "grad_norm": 0.24102893105996573, "learning_rate": 8.893791032176798e-06, "loss": 0.27352797985076904, "num_tokens": 392296640.0, "step": 435 }, { "epoch": 3.253731343283582, "grad_norm": 0.2648291528044443, "learning_rate": 8.835593962612773e-06, "loss": 0.2909316122531891, "num_tokens": 393156418.0, "step": 436 }, { "epoch": 3.2611940298507465, "grad_norm": 0.28306499151263154, "learning_rate": 8.777606317948772e-06, "loss": 0.2992030084133148, "num_tokens": 394033667.0, "step": 437 }, { "epoch": 3.2686567164179103, "grad_norm": 0.2716112435631178, "learning_rate": 8.719829682659399e-06, "loss": 0.2813768982887268, "num_tokens": 394903535.0, "step": 438 }, { "epoch": 3.2761194029850746, "grad_norm": 0.27512395684649504, "learning_rate": 8.662265635453547e-06, "loss": 0.29536497592926025, "num_tokens": 395846549.0, "step": 439 }, { "epoch": 3.283582089552239, "grad_norm": 0.2989586716757814, "learning_rate": 8.604915749231298e-06, "loss": 0.2988872826099396, "num_tokens": 396737205.0, "step": 440 }, { "epoch": 3.291044776119403, "grad_norm": 0.2883428566171811, "learning_rate": 8.54778159104092e-06, "loss": 0.2863343358039856, "num_tokens": 397630057.0, "step": 441 }, { "epoch": 3.298507462686567, "grad_norm": 0.28465714524566543, "learning_rate": 8.490864722036045e-06, "loss": 0.29591017961502075, "num_tokens": 398582978.0, "step": 442 }, { "epoch": 3.3059701492537314, "grad_norm": 0.27001645219378384, "learning_rate": 8.434166697433034e-06, "loss": 0.28916236758232117, "num_tokens": 399421334.0, "step": 443 }, { "epoch": 3.3134328358208958, "grad_norm": 0.2636502279817121, "learning_rate": 8.377689066468452e-06, "loss": 0.2919909954071045, "num_tokens": 400404286.0, "step": 444 }, { "epoch": 3.3208955223880596, "grad_norm": 0.26433758679315084, "learning_rate": 8.321433372356756e-06, "loss": 0.29081422090530396, "num_tokens": 401357244.0, "step": 445 }, { "epoch": 3.328358208955224, "grad_norm": 0.2615424811114582, "learning_rate": 8.26540115224813e-06, "loss": 0.29471999406814575, "num_tokens": 402272150.0, "step": 446 }, { "epoch": 3.3358208955223883, "grad_norm": 0.2774834257770196, "learning_rate": 8.209593937186475e-06, "loss": 0.3036431670188904, "num_tokens": 403288360.0, "step": 447 }, { "epoch": 3.343283582089552, "grad_norm": 0.2615528181357639, "learning_rate": 8.154013252067565e-06, "loss": 0.28283798694610596, "num_tokens": 404201834.0, "step": 448 }, { "epoch": 3.3507462686567164, "grad_norm": 0.26020037614375063, "learning_rate": 8.098660615597401e-06, "loss": 0.2982422113418579, "num_tokens": 405227526.0, "step": 449 }, { "epoch": 3.3582089552238807, "grad_norm": 0.24108662753109747, "learning_rate": 8.043537540250705e-06, "loss": 0.2861343026161194, "num_tokens": 406200774.0, "step": 450 }, { "epoch": 3.3656716417910446, "grad_norm": 0.2739099235918602, "learning_rate": 7.988645532229581e-06, "loss": 0.2993728816509247, "num_tokens": 407124735.0, "step": 451 }, { "epoch": 3.373134328358209, "grad_norm": 0.247255433646386, "learning_rate": 7.933986091422379e-06, "loss": 0.26630109548568726, "num_tokens": 407967520.0, "step": 452 }, { "epoch": 3.3805970149253732, "grad_norm": 0.26491478815853126, "learning_rate": 7.879560711362696e-06, "loss": 0.2873428463935852, "num_tokens": 408873357.0, "step": 453 }, { "epoch": 3.388059701492537, "grad_norm": 0.2894069285105355, "learning_rate": 7.825370879188569e-06, "loss": 0.28855782747268677, "num_tokens": 409780883.0, "step": 454 }, { "epoch": 3.3955223880597014, "grad_norm": 0.2492281531289753, "learning_rate": 7.771418075601852e-06, "loss": 0.28437167406082153, "num_tokens": 410746305.0, "step": 455 }, { "epoch": 3.4029850746268657, "grad_norm": 0.24355852796907684, "learning_rate": 7.71770377482774e-06, "loss": 0.27994096279144287, "num_tokens": 411680660.0, "step": 456 }, { "epoch": 3.41044776119403, "grad_norm": 0.2700214986931186, "learning_rate": 7.664229444574492e-06, "loss": 0.2921644449234009, "num_tokens": 412605533.0, "step": 457 }, { "epoch": 3.417910447761194, "grad_norm": 0.26147389049318864, "learning_rate": 7.610996545993334e-06, "loss": 0.2780182957649231, "num_tokens": 413578521.0, "step": 458 }, { "epoch": 3.425373134328358, "grad_norm": 0.29826386143822425, "learning_rate": 7.558006533638531e-06, "loss": 0.2961535155773163, "num_tokens": 414502174.0, "step": 459 }, { "epoch": 3.4328358208955225, "grad_norm": 0.3037556792472721, "learning_rate": 7.505260855427631e-06, "loss": 0.2871173024177551, "num_tokens": 415404496.0, "step": 460 }, { "epoch": 3.4402985074626864, "grad_norm": 0.27538452886466275, "learning_rate": 7.452760952601926e-06, "loss": 0.29723048210144043, "num_tokens": 416329218.0, "step": 461 }, { "epoch": 3.4477611940298507, "grad_norm": 0.27152400208894184, "learning_rate": 7.400508259687034e-06, "loss": 0.28178274631500244, "num_tokens": 417169036.0, "step": 462 }, { "epoch": 3.455223880597015, "grad_norm": 0.26056261717807916, "learning_rate": 7.3485042044537425e-06, "loss": 0.28464025259017944, "num_tokens": 418088260.0, "step": 463 }, { "epoch": 3.4626865671641793, "grad_norm": 0.27386142278491205, "learning_rate": 7.296750207878967e-06, "loss": 0.29148146510124207, "num_tokens": 418913562.0, "step": 464 }, { "epoch": 3.470149253731343, "grad_norm": 0.2854422365181894, "learning_rate": 7.2452476841069365e-06, "loss": 0.30438661575317383, "num_tokens": 419816417.0, "step": 465 }, { "epoch": 3.4776119402985075, "grad_norm": 0.2555489664737189, "learning_rate": 7.193998040410553e-06, "loss": 0.3044406771659851, "num_tokens": 420724216.0, "step": 466 }, { "epoch": 3.485074626865672, "grad_norm": 0.25726240443773163, "learning_rate": 7.143002677152923e-06, "loss": 0.28696900606155396, "num_tokens": 421711967.0, "step": 467 }, { "epoch": 3.4925373134328357, "grad_norm": 0.23933852458046884, "learning_rate": 7.092262987749115e-06, "loss": 0.28907179832458496, "num_tokens": 422655373.0, "step": 468 }, { "epoch": 3.5, "grad_norm": 0.25521858687923527, "learning_rate": 7.041780358628076e-06, "loss": 0.2952384948730469, "num_tokens": 423645388.0, "step": 469 }, { "epoch": 3.5074626865671643, "grad_norm": 0.2611107763482694, "learning_rate": 6.991556169194752e-06, "loss": 0.29364389181137085, "num_tokens": 424569069.0, "step": 470 }, { "epoch": 3.5149253731343286, "grad_norm": 0.25648032981059116, "learning_rate": 6.941591791792378e-06, "loss": 0.29367825388908386, "num_tokens": 425492189.0, "step": 471 }, { "epoch": 3.5223880597014925, "grad_norm": 0.2622850225964596, "learning_rate": 6.8918885916650105e-06, "loss": 0.29189831018447876, "num_tokens": 426458959.0, "step": 472 }, { "epoch": 3.529850746268657, "grad_norm": 0.2570163822819945, "learning_rate": 6.842447926920199e-06, "loss": 0.2819617688655853, "num_tokens": 427323105.0, "step": 473 }, { "epoch": 3.5373134328358207, "grad_norm": 0.29549281458059196, "learning_rate": 6.793271148491887e-06, "loss": 0.303572416305542, "num_tokens": 428282716.0, "step": 474 }, { "epoch": 3.544776119402985, "grad_norm": 0.2802879385897247, "learning_rate": 6.7443596001035025e-06, "loss": 0.31654465198516846, "num_tokens": 429252409.0, "step": 475 }, { "epoch": 3.5522388059701493, "grad_norm": 0.24794767420280742, "learning_rate": 6.6957146182312175e-06, "loss": 0.2985188364982605, "num_tokens": 430179989.0, "step": 476 }, { "epoch": 3.5597014925373136, "grad_norm": 0.26530731793672097, "learning_rate": 6.647337532067467e-06, "loss": 0.2864232063293457, "num_tokens": 431131078.0, "step": 477 }, { "epoch": 3.5671641791044775, "grad_norm": 0.26007394216176255, "learning_rate": 6.599229663484598e-06, "loss": 0.31048181653022766, "num_tokens": 432118357.0, "step": 478 }, { "epoch": 3.574626865671642, "grad_norm": 0.2561363907813142, "learning_rate": 6.551392326998776e-06, "loss": 0.29227665066719055, "num_tokens": 432981468.0, "step": 479 }, { "epoch": 3.582089552238806, "grad_norm": 0.2560064424004792, "learning_rate": 6.503826829734035e-06, "loss": 0.2897188663482666, "num_tokens": 433892112.0, "step": 480 }, { "epoch": 3.58955223880597, "grad_norm": 0.25775269914474314, "learning_rate": 6.456534471386594e-06, "loss": 0.2899354100227356, "num_tokens": 434806511.0, "step": 481 }, { "epoch": 3.5970149253731343, "grad_norm": 0.2763280796455374, "learning_rate": 6.409516544189322e-06, "loss": 0.294207900762558, "num_tokens": 435709840.0, "step": 482 }, { "epoch": 3.6044776119402986, "grad_norm": 0.26039260807281084, "learning_rate": 6.362774332876438e-06, "loss": 0.2990114390850067, "num_tokens": 436640115.0, "step": 483 }, { "epoch": 3.611940298507463, "grad_norm": 0.2527009190257145, "learning_rate": 6.316309114648409e-06, "loss": 0.2699679732322693, "num_tokens": 437494545.0, "step": 484 }, { "epoch": 3.6194029850746268, "grad_norm": 0.2736279490033172, "learning_rate": 6.270122159137033e-06, "loss": 0.2987067401409149, "num_tokens": 438288906.0, "step": 485 }, { "epoch": 3.626865671641791, "grad_norm": 0.2789173799100359, "learning_rate": 6.2242147283707714e-06, "loss": 0.3188440203666687, "num_tokens": 439102139.0, "step": 486 }, { "epoch": 3.6343283582089554, "grad_norm": 0.24253033921125058, "learning_rate": 6.178588076740253e-06, "loss": 0.2938775420188904, "num_tokens": 439996247.0, "step": 487 }, { "epoch": 3.6417910447761193, "grad_norm": 0.2638488313889946, "learning_rate": 6.133243450964005e-06, "loss": 0.299264132976532, "num_tokens": 440863036.0, "step": 488 }, { "epoch": 3.6492537313432836, "grad_norm": 0.2862685346779505, "learning_rate": 6.088182090054364e-06, "loss": 0.29607367515563965, "num_tokens": 441618331.0, "step": 489 }, { "epoch": 3.656716417910448, "grad_norm": 0.27289917809516917, "learning_rate": 6.043405225283654e-06, "loss": 0.2921777367591858, "num_tokens": 442361717.0, "step": 490 }, { "epoch": 3.664179104477612, "grad_norm": 0.2537367234076187, "learning_rate": 5.998914080150525e-06, "loss": 0.2836867570877075, "num_tokens": 443313769.0, "step": 491 }, { "epoch": 3.671641791044776, "grad_norm": 0.26170955211126384, "learning_rate": 5.9547098703465215e-06, "loss": 0.30563318729400635, "num_tokens": 444314596.0, "step": 492 }, { "epoch": 3.6791044776119404, "grad_norm": 0.2693301068014662, "learning_rate": 5.910793803722873e-06, "loss": 0.29311275482177734, "num_tokens": 445237263.0, "step": 493 }, { "epoch": 3.6865671641791042, "grad_norm": 0.27762349435994677, "learning_rate": 5.867167080257471e-06, "loss": 0.29791638255119324, "num_tokens": 446151590.0, "step": 494 }, { "epoch": 3.6940298507462686, "grad_norm": 0.2701314496245139, "learning_rate": 5.823830892022107e-06, "loss": 0.3165101408958435, "num_tokens": 447040490.0, "step": 495 }, { "epoch": 3.701492537313433, "grad_norm": 0.27274243095008927, "learning_rate": 5.780786423149879e-06, "loss": 0.32390397787094116, "num_tokens": 447930938.0, "step": 496 }, { "epoch": 3.708955223880597, "grad_norm": 0.3607925974135692, "learning_rate": 5.738034849802852e-06, "loss": 0.2941335439682007, "num_tokens": 448795073.0, "step": 497 }, { "epoch": 3.716417910447761, "grad_norm": 0.27114314101622733, "learning_rate": 5.695577340139905e-06, "loss": 0.29179757833480835, "num_tokens": 449748272.0, "step": 498 }, { "epoch": 3.7238805970149254, "grad_norm": 0.2676978851481763, "learning_rate": 5.653415054284816e-06, "loss": 0.30068930983543396, "num_tokens": 450716521.0, "step": 499 }, { "epoch": 3.7313432835820897, "grad_norm": 0.26294336472483293, "learning_rate": 5.611549144294568e-06, "loss": 0.2907962203025818, "num_tokens": 451536750.0, "step": 500 }, { "epoch": 3.7388059701492535, "grad_norm": 0.2546485590984235, "learning_rate": 5.569980754127872e-06, "loss": 0.2873173952102661, "num_tokens": 452509967.0, "step": 501 }, { "epoch": 3.746268656716418, "grad_norm": 0.24938478382421467, "learning_rate": 5.5287110196138985e-06, "loss": 0.2843964993953705, "num_tokens": 453499953.0, "step": 502 }, { "epoch": 3.753731343283582, "grad_norm": 0.24634142699097625, "learning_rate": 5.487741068421242e-06, "loss": 0.295748770236969, "num_tokens": 454428619.0, "step": 503 }, { "epoch": 3.7611940298507465, "grad_norm": 0.24888257131984212, "learning_rate": 5.447072020027122e-06, "loss": 0.2946910858154297, "num_tokens": 455343533.0, "step": 504 }, { "epoch": 3.7686567164179103, "grad_norm": 0.2368545844700678, "learning_rate": 5.406704985686782e-06, "loss": 0.27735936641693115, "num_tokens": 456246016.0, "step": 505 }, { "epoch": 3.7761194029850746, "grad_norm": 0.27241818855184635, "learning_rate": 5.366641068403126e-06, "loss": 0.3016122579574585, "num_tokens": 457104506.0, "step": 506 }, { "epoch": 3.783582089552239, "grad_norm": 0.2590785342630335, "learning_rate": 5.326881362896588e-06, "loss": 0.3151727020740509, "num_tokens": 458003785.0, "step": 507 }, { "epoch": 3.791044776119403, "grad_norm": 0.25242642100086654, "learning_rate": 5.287426955575205e-06, "loss": 0.2941104769706726, "num_tokens": 458840614.0, "step": 508 }, { "epoch": 3.798507462686567, "grad_norm": 0.25216547714604487, "learning_rate": 5.24827892450494e-06, "loss": 0.28807011246681213, "num_tokens": 459707587.0, "step": 509 }, { "epoch": 3.8059701492537314, "grad_norm": 0.23923122289508578, "learning_rate": 5.209438339380242e-06, "loss": 0.2823304533958435, "num_tokens": 460629686.0, "step": 510 }, { "epoch": 3.8134328358208958, "grad_norm": 0.2657855590710968, "learning_rate": 5.170906261494776e-06, "loss": 0.2919255197048187, "num_tokens": 461544147.0, "step": 511 }, { "epoch": 3.8208955223880596, "grad_norm": 0.2626472514066274, "learning_rate": 5.132683743712462e-06, "loss": 0.29430970549583435, "num_tokens": 462477850.0, "step": 512 }, { "epoch": 3.828358208955224, "grad_norm": 0.2566886419848628, "learning_rate": 5.094771830438689e-06, "loss": 0.2987692952156067, "num_tokens": 463412060.0, "step": 513 }, { "epoch": 3.835820895522388, "grad_norm": 0.2484881857541934, "learning_rate": 5.057171557591777e-06, "loss": 0.2915360927581787, "num_tokens": 464308740.0, "step": 514 }, { "epoch": 3.843283582089552, "grad_norm": 0.24381344201474844, "learning_rate": 5.019883952574686e-06, "loss": 0.28436267375946045, "num_tokens": 465265384.0, "step": 515 }, { "epoch": 3.8507462686567164, "grad_norm": 0.2481908860876439, "learning_rate": 4.98291003424691e-06, "loss": 0.28611573576927185, "num_tokens": 466226494.0, "step": 516 }, { "epoch": 3.8582089552238807, "grad_norm": 0.23871319999253146, "learning_rate": 4.946250812896678e-06, "loss": 0.2998065948486328, "num_tokens": 467259239.0, "step": 517 }, { "epoch": 3.8656716417910446, "grad_norm": 0.24799658125418186, "learning_rate": 4.909907290213321e-06, "loss": 0.2929803729057312, "num_tokens": 468142586.0, "step": 518 }, { "epoch": 3.873134328358209, "grad_norm": 0.2570907705948353, "learning_rate": 4.873880459259913e-06, "loss": 0.2957007884979248, "num_tokens": 468951581.0, "step": 519 }, { "epoch": 3.8805970149253732, "grad_norm": 0.2616713052030643, "learning_rate": 4.838171304446129e-06, "loss": 0.3021651804447174, "num_tokens": 469861165.0, "step": 520 }, { "epoch": 3.888059701492537, "grad_norm": 0.3010016938124609, "learning_rate": 4.80278080150135e-06, "loss": 0.308903306722641, "num_tokens": 470804718.0, "step": 521 }, { "epoch": 3.8955223880597014, "grad_norm": 0.24926265212747228, "learning_rate": 4.767709917448009e-06, "loss": 0.30023178458213806, "num_tokens": 471749228.0, "step": 522 }, { "epoch": 3.9029850746268657, "grad_norm": 0.24761743456611565, "learning_rate": 4.732959610575154e-06, "loss": 0.2946227788925171, "num_tokens": 472698930.0, "step": 523 }, { "epoch": 3.91044776119403, "grad_norm": 0.22825850031566985, "learning_rate": 4.698530830412276e-06, "loss": 0.2835308611392975, "num_tokens": 473565553.0, "step": 524 }, { "epoch": 3.917910447761194, "grad_norm": 0.25239434326625193, "learning_rate": 4.664424517703353e-06, "loss": 0.3003775477409363, "num_tokens": 474425434.0, "step": 525 }, { "epoch": 3.925373134328358, "grad_norm": 0.24946733272255223, "learning_rate": 4.630641604381151e-06, "loss": 0.3032747507095337, "num_tokens": 475400550.0, "step": 526 }, { "epoch": 3.9328358208955225, "grad_norm": 0.23900706286857004, "learning_rate": 4.597183013541764e-06, "loss": 0.3009137809276581, "num_tokens": 476322074.0, "step": 527 }, { "epoch": 3.9402985074626864, "grad_norm": 0.23647944717460595, "learning_rate": 4.564049659419379e-06, "loss": 0.2712666392326355, "num_tokens": 477127686.0, "step": 528 }, { "epoch": 3.9477611940298507, "grad_norm": 0.24461158477231615, "learning_rate": 4.531242447361308e-06, "loss": 0.2808017432689667, "num_tokens": 477992768.0, "step": 529 }, { "epoch": 3.955223880597015, "grad_norm": 0.25868694964779676, "learning_rate": 4.498762273803233e-06, "loss": 0.3064419627189636, "num_tokens": 478818611.0, "step": 530 }, { "epoch": 3.9626865671641793, "grad_norm": 0.2384742960079164, "learning_rate": 4.4666100262447335e-06, "loss": 0.28597795963287354, "num_tokens": 479757992.0, "step": 531 }, { "epoch": 3.970149253731343, "grad_norm": 0.23968331135860904, "learning_rate": 4.434786583225018e-06, "loss": 0.28608185052871704, "num_tokens": 480686770.0, "step": 532 }, { "epoch": 3.9776119402985075, "grad_norm": 0.23668555789215315, "learning_rate": 4.403292814298932e-06, "loss": 0.2850901782512665, "num_tokens": 481556474.0, "step": 533 }, { "epoch": 3.9850746268656714, "grad_norm": 0.25182124727383254, "learning_rate": 4.372129580013179e-06, "loss": 0.29344847798347473, "num_tokens": 482402398.0, "step": 534 }, { "epoch": 3.9925373134328357, "grad_norm": 0.26240005117001564, "learning_rate": 4.341297731882833e-06, "loss": 0.28991544246673584, "num_tokens": 483144226.0, "step": 535 }, { "epoch": 4.0, "grad_norm": 0.2394553338374243, "learning_rate": 4.31079811236805e-06, "loss": 0.28979605436325073, "num_tokens": 484171179.0, "step": 536 }, { "epoch": 4.007462686567164, "grad_norm": 0.33867608284366874, "learning_rate": 4.280631554851052e-06, "loss": 0.261859267950058, "num_tokens": 484964422.0, "step": 537 }, { "epoch": 4.014925373134329, "grad_norm": 0.3387690029521035, "learning_rate": 4.250798883613371e-06, "loss": 0.258260041475296, "num_tokens": 485911398.0, "step": 538 }, { "epoch": 4.022388059701493, "grad_norm": 0.31295687889359947, "learning_rate": 4.221300913813297e-06, "loss": 0.26438719034194946, "num_tokens": 486765516.0, "step": 539 }, { "epoch": 4.029850746268656, "grad_norm": 0.2533693528515132, "learning_rate": 4.192138451463637e-06, "loss": 0.26276901364326477, "num_tokens": 487755450.0, "step": 540 }, { "epoch": 4.037313432835821, "grad_norm": 0.27941924345032165, "learning_rate": 4.163312293409668e-06, "loss": 0.2743380069732666, "num_tokens": 488596501.0, "step": 541 }, { "epoch": 4.044776119402985, "grad_norm": 0.3119420012284113, "learning_rate": 4.134823227307376e-06, "loss": 0.27551499009132385, "num_tokens": 489333987.0, "step": 542 }, { "epoch": 4.052238805970149, "grad_norm": 0.3376015245186099, "learning_rate": 4.1066720316019176e-06, "loss": 0.2677218019962311, "num_tokens": 490271866.0, "step": 543 }, { "epoch": 4.059701492537314, "grad_norm": 0.31476158712266056, "learning_rate": 4.0788594755063754e-06, "loss": 0.2655893564224243, "num_tokens": 491167672.0, "step": 544 }, { "epoch": 4.067164179104478, "grad_norm": 0.2801726675903428, "learning_rate": 4.051386318980717e-06, "loss": 0.2636064291000366, "num_tokens": 492117374.0, "step": 545 }, { "epoch": 4.074626865671641, "grad_norm": 0.2642554446164588, "learning_rate": 4.024253312711041e-06, "loss": 0.2632978558540344, "num_tokens": 493064577.0, "step": 546 }, { "epoch": 4.082089552238806, "grad_norm": 0.2340921814939966, "learning_rate": 3.99746119808906e-06, "loss": 0.2561931908130646, "num_tokens": 494008196.0, "step": 547 }, { "epoch": 4.08955223880597, "grad_norm": 0.24746722464151832, "learning_rate": 3.971010707191848e-06, "loss": 0.2665466368198395, "num_tokens": 495010032.0, "step": 548 }, { "epoch": 4.097014925373134, "grad_norm": 0.28750263458503306, "learning_rate": 3.9449025627618256e-06, "loss": 0.2657792568206787, "num_tokens": 495771485.0, "step": 549 }, { "epoch": 4.104477611940299, "grad_norm": 0.25981920943022424, "learning_rate": 3.919137478187027e-06, "loss": 0.2730734050273895, "num_tokens": 496704001.0, "step": 550 }, { "epoch": 4.111940298507463, "grad_norm": 0.26506589650257595, "learning_rate": 3.893716157481598e-06, "loss": 0.26241227984428406, "num_tokens": 497580217.0, "step": 551 }, { "epoch": 4.119402985074627, "grad_norm": 0.28902536946390145, "learning_rate": 3.868639295266562e-06, "loss": 0.27827292680740356, "num_tokens": 498399947.0, "step": 552 }, { "epoch": 4.126865671641791, "grad_norm": 0.2305613889202318, "learning_rate": 3.8439075767508304e-06, "loss": 0.25871434807777405, "num_tokens": 499337510.0, "step": 553 }, { "epoch": 4.134328358208955, "grad_norm": 0.2543579464580596, "learning_rate": 3.819521677712498e-06, "loss": 0.26276665925979614, "num_tokens": 500211058.0, "step": 554 }, { "epoch": 4.141791044776119, "grad_norm": 0.2603404639875204, "learning_rate": 3.7954822644803612e-06, "loss": 0.27976810932159424, "num_tokens": 501239171.0, "step": 555 }, { "epoch": 4.149253731343284, "grad_norm": 0.24399135581961698, "learning_rate": 3.7717899939157227e-06, "loss": 0.2695601284503937, "num_tokens": 502320140.0, "step": 556 }, { "epoch": 4.156716417910448, "grad_norm": 0.2506194739658917, "learning_rate": 3.748445513394432e-06, "loss": 0.2601467967033386, "num_tokens": 503200601.0, "step": 557 }, { "epoch": 4.164179104477612, "grad_norm": 0.24780405901365382, "learning_rate": 3.7254494607892062e-06, "loss": 0.2658926248550415, "num_tokens": 504111915.0, "step": 558 }, { "epoch": 4.1716417910447765, "grad_norm": 0.25277522040944933, "learning_rate": 3.7028024644521974e-06, "loss": 0.26618829369544983, "num_tokens": 951575.0, "step": 559 }, { "epoch": 4.17910447761194, "grad_norm": 0.260926277591076, "learning_rate": 3.6805051431978215e-06, "loss": 0.2764492630958557, "num_tokens": 1870368.0, "step": 560 }, { "epoch": 4.186567164179104, "grad_norm": 0.24151138917904563, "learning_rate": 3.6585581062858515e-06, "loss": 0.26785239577293396, "num_tokens": 2827046.0, "step": 561 }, { "epoch": 4.1940298507462686, "grad_norm": 0.24384225850500896, "learning_rate": 3.636961953404763e-06, "loss": 0.26912403106689453, "num_tokens": 3739973.0, "step": 562 }, { "epoch": 4.201492537313433, "grad_norm": 0.2916626614705674, "learning_rate": 3.615717274655364e-06, "loss": 0.26528483629226685, "num_tokens": 4518704.0, "step": 563 }, { "epoch": 4.208955223880597, "grad_norm": 0.24960926879350168, "learning_rate": 3.5948246505346537e-06, "loss": 0.27783459424972534, "num_tokens": 5501253.0, "step": 564 }, { "epoch": 4.2164179104477615, "grad_norm": 0.25681267819662723, "learning_rate": 3.5742846519199715e-06, "loss": 0.27307459712028503, "num_tokens": 6402302.0, "step": 565 }, { "epoch": 4.223880597014926, "grad_norm": 0.2412629050166804, "learning_rate": 3.5540978400533933e-06, "loss": 0.264928936958313, "num_tokens": 7296048.0, "step": 566 }, { "epoch": 4.231343283582089, "grad_norm": 0.26007426064530514, "learning_rate": 3.5342647665263963e-06, "loss": 0.27285411953926086, "num_tokens": 8246217.0, "step": 567 }, { "epoch": 4.2388059701492535, "grad_norm": 0.2505447033271199, "learning_rate": 3.514785973264789e-06, "loss": 0.2539595663547516, "num_tokens": 9030493.0, "step": 568 }, { "epoch": 4.246268656716418, "grad_norm": 0.24939677987959621, "learning_rate": 3.495661992513905e-06, "loss": 0.273257315158844, "num_tokens": 9936844.0, "step": 569 }, { "epoch": 4.253731343283582, "grad_norm": 0.25283551407816135, "learning_rate": 3.476893346824055e-06, "loss": 0.2572386562824249, "num_tokens": 10836976.0, "step": 570 }, { "epoch": 4.2611940298507465, "grad_norm": 0.25014049995931353, "learning_rate": 3.4584805490362493e-06, "loss": 0.27239200472831726, "num_tokens": 11812223.0, "step": 571 }, { "epoch": 4.268656716417911, "grad_norm": 0.2565763851261565, "learning_rate": 3.4404241022681873e-06, "loss": 0.26448339223861694, "num_tokens": 12615614.0, "step": 572 }, { "epoch": 4.276119402985074, "grad_norm": 0.25289117775054565, "learning_rate": 3.42272449990051e-06, "loss": 0.29063016176223755, "num_tokens": 13567548.0, "step": 573 }, { "epoch": 4.2835820895522385, "grad_norm": 0.25823345228475075, "learning_rate": 3.40538222556332e-06, "loss": 0.27311235666275024, "num_tokens": 14395131.0, "step": 574 }, { "epoch": 4.291044776119403, "grad_norm": 0.23315641988846117, "learning_rate": 3.388397753122957e-06, "loss": 0.25236693024635315, "num_tokens": 15335598.0, "step": 575 }, { "epoch": 4.298507462686567, "grad_norm": 0.2841401512615274, "learning_rate": 3.3717715466690624e-06, "loss": 0.2869318723678589, "num_tokens": 16179341.0, "step": 576 }, { "epoch": 4.3059701492537314, "grad_norm": 0.25632145802021455, "learning_rate": 3.3555040605018935e-06, "loss": 0.26220396161079407, "num_tokens": 16988671.0, "step": 577 }, { "epoch": 4.313432835820896, "grad_norm": 0.26924823560517036, "learning_rate": 3.339595739119909e-06, "loss": 0.28524714708328247, "num_tokens": 17818903.0, "step": 578 }, { "epoch": 4.32089552238806, "grad_norm": 0.24597376079056055, "learning_rate": 3.3240470172076226e-06, "loss": 0.25928568840026855, "num_tokens": 18686514.0, "step": 579 }, { "epoch": 4.3283582089552235, "grad_norm": 0.2296054299641554, "learning_rate": 3.3088583196237253e-06, "loss": 0.2673494219779968, "num_tokens": 19710461.0, "step": 580 }, { "epoch": 4.335820895522388, "grad_norm": 0.303772974273409, "learning_rate": 3.294030061389481e-06, "loss": 0.29324933886528015, "num_tokens": 20505162.0, "step": 581 }, { "epoch": 4.343283582089552, "grad_norm": 0.24075458576098716, "learning_rate": 3.2795626476773833e-06, "loss": 0.2494013011455536, "num_tokens": 21440460.0, "step": 582 }, { "epoch": 4.350746268656716, "grad_norm": 0.26414061441007297, "learning_rate": 3.2654564738000822e-06, "loss": 0.28142672777175903, "num_tokens": 22250398.0, "step": 583 }, { "epoch": 4.358208955223881, "grad_norm": 0.22985556392550052, "learning_rate": 3.2517119251995873e-06, "loss": 0.2574723958969116, "num_tokens": 23184740.0, "step": 584 }, { "epoch": 4.365671641791045, "grad_norm": 0.2340946834808678, "learning_rate": 3.2383293774367286e-06, "loss": 0.262751042842865, "num_tokens": 24111398.0, "step": 585 }, { "epoch": 4.373134328358209, "grad_norm": 0.2622850655813035, "learning_rate": 3.225309196180906e-06, "loss": 0.26962852478027344, "num_tokens": 24935442.0, "step": 586 }, { "epoch": 4.380597014925373, "grad_norm": 0.30896829245037205, "learning_rate": 3.212651737200086e-06, "loss": 0.2718137502670288, "num_tokens": 25850666.0, "step": 587 }, { "epoch": 4.388059701492537, "grad_norm": 0.24656349669035904, "learning_rate": 3.200357346351084e-06, "loss": 0.2535630166530609, "num_tokens": 26632303.0, "step": 588 }, { "epoch": 4.395522388059701, "grad_norm": 0.2478490440504693, "learning_rate": 3.188426359570121e-06, "loss": 0.2648570239543915, "num_tokens": 27523524.0, "step": 589 }, { "epoch": 4.402985074626866, "grad_norm": 0.2467537368918543, "learning_rate": 3.176859102863631e-06, "loss": 0.268078088760376, "num_tokens": 28364038.0, "step": 590 }, { "epoch": 4.41044776119403, "grad_norm": 0.24022496953126724, "learning_rate": 3.16565589229937e-06, "loss": 0.2637268900871277, "num_tokens": 29254874.0, "step": 591 }, { "epoch": 4.417910447761194, "grad_norm": 0.25520142610516455, "learning_rate": 3.1548170339977626e-06, "loss": 0.27608251571655273, "num_tokens": 30099118.0, "step": 592 }, { "epoch": 4.425373134328359, "grad_norm": 0.25418541542220713, "learning_rate": 3.144342824123548e-06, "loss": 0.27403631806373596, "num_tokens": 30937080.0, "step": 593 }, { "epoch": 4.432835820895522, "grad_norm": 0.32216925679050706, "learning_rate": 3.134233548877684e-06, "loss": 0.2749292850494385, "num_tokens": 31868459.0, "step": 594 }, { "epoch": 4.440298507462686, "grad_norm": 0.23633842342693723, "learning_rate": 3.1244894844895307e-06, "loss": 0.26009055972099304, "num_tokens": 32844776.0, "step": 595 }, { "epoch": 4.447761194029851, "grad_norm": 0.23167846669851885, "learning_rate": 3.115110897209297e-06, "loss": 0.25624188780784607, "num_tokens": 33800215.0, "step": 596 }, { "epoch": 4.455223880597015, "grad_norm": 0.31853695227310724, "learning_rate": 3.1060980433007674e-06, "loss": 0.26650676131248474, "num_tokens": 34652575.0, "step": 597 }, { "epoch": 4.462686567164179, "grad_norm": 0.2474619333740578, "learning_rate": 3.0974511690342995e-06, "loss": 0.26506173610687256, "num_tokens": 35526076.0, "step": 598 }, { "epoch": 4.470149253731344, "grad_norm": 0.2379051889253177, "learning_rate": 3.089170510680101e-06, "loss": 0.2590046525001526, "num_tokens": 36465383.0, "step": 599 }, { "epoch": 4.477611940298507, "grad_norm": 0.24103925822435351, "learning_rate": 3.0812562945017625e-06, "loss": 0.26156845688819885, "num_tokens": 37402609.0, "step": 600 }, { "epoch": 4.485074626865671, "grad_norm": 0.24478825299983173, "learning_rate": 3.0737087367500848e-06, "loss": 0.26436761021614075, "num_tokens": 38372549.0, "step": 601 }, { "epoch": 4.492537313432836, "grad_norm": 0.25588847680085197, "learning_rate": 3.066528043657163e-06, "loss": 0.2649264335632324, "num_tokens": 39258770.0, "step": 602 }, { "epoch": 4.5, "grad_norm": 0.2462865755078873, "learning_rate": 3.0597144114307577e-06, "loss": 0.2759783864021301, "num_tokens": 40167992.0, "step": 603 }, { "epoch": 4.507462686567164, "grad_norm": 0.24099167658546947, "learning_rate": 3.0532680262489272e-06, "loss": 0.2647096812725067, "num_tokens": 41103593.0, "step": 604 }, { "epoch": 4.514925373134329, "grad_norm": 0.26228388440102607, "learning_rate": 3.047189064254947e-06, "loss": 0.2846449017524719, "num_tokens": 41964920.0, "step": 605 }, { "epoch": 4.522388059701493, "grad_norm": 0.24399175109648016, "learning_rate": 3.0414776915524926e-06, "loss": 0.2578504979610443, "num_tokens": 42832698.0, "step": 606 }, { "epoch": 4.529850746268656, "grad_norm": 0.24557003170358038, "learning_rate": 3.0361340642010974e-06, "loss": 0.2687520980834961, "num_tokens": 43751841.0, "step": 607 }, { "epoch": 4.537313432835821, "grad_norm": 0.2457169444504286, "learning_rate": 3.0311583282119004e-06, "loss": 0.2654935121536255, "num_tokens": 44670570.0, "step": 608 }, { "epoch": 4.544776119402985, "grad_norm": 0.23344187463481425, "learning_rate": 3.026550619543641e-06, "loss": 0.2680796980857849, "num_tokens": 45565349.0, "step": 609 }, { "epoch": 4.552238805970149, "grad_norm": 0.2616978203803085, "learning_rate": 3.0223110640989607e-06, "loss": 0.2733978033065796, "num_tokens": 46334877.0, "step": 610 }, { "epoch": 4.559701492537314, "grad_norm": 0.24402710769793126, "learning_rate": 3.0184397777209436e-06, "loss": 0.26678377389907837, "num_tokens": 47197933.0, "step": 611 }, { "epoch": 4.567164179104478, "grad_norm": 0.23596997365787184, "learning_rate": 3.0149368661899707e-06, "loss": 0.2666507959365845, "num_tokens": 48185966.0, "step": 612 }, { "epoch": 4.574626865671641, "grad_norm": 0.26072871992164276, "learning_rate": 3.0118024252208146e-06, "loss": 0.2727803587913513, "num_tokens": 49053041.0, "step": 613 }, { "epoch": 4.582089552238806, "grad_norm": 0.2429680853323204, "learning_rate": 3.0090365404600324e-06, "loss": 0.27436989545822144, "num_tokens": 49972669.0, "step": 614 }, { "epoch": 4.58955223880597, "grad_norm": 0.2492201703405157, "learning_rate": 3.0066392874836254e-06, "loss": 0.2650463581085205, "num_tokens": 50759258.0, "step": 615 }, { "epoch": 4.597014925373134, "grad_norm": 0.23159753484897908, "learning_rate": 3.004610731794965e-06, "loss": 0.2537558376789093, "num_tokens": 51687796.0, "step": 616 }, { "epoch": 4.604477611940299, "grad_norm": 0.23805277832433672, "learning_rate": 3.002950928823016e-06, "loss": 0.26197919249534607, "num_tokens": 52660231.0, "step": 617 }, { "epoch": 4.611940298507463, "grad_norm": 0.24026810337813148, "learning_rate": 3.001659923920811e-06, "loss": 0.2531256675720215, "num_tokens": 53529194.0, "step": 618 }, { "epoch": 4.619402985074627, "grad_norm": 0.258077064890661, "learning_rate": 3.0007377523642196e-06, "loss": 0.26511213183403015, "num_tokens": 54455687.0, "step": 619 }, { "epoch": 4.6268656716417915, "grad_norm": 0.24117669708783973, "learning_rate": 3.0001844393509754e-06, "loss": 0.2814059257507324, "num_tokens": 55475962.0, "step": 620 }, { "epoch": 4.6268656716417915, "step": 620, "total_flos": 829282868854784.0, "train_loss": 0.02679153286641644, "train_runtime": 1845.6941, "train_samples_per_second": 10.749, "train_steps_per_second": 0.336 } ], "logging_steps": 1, "max_steps": 620, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 62, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 829282868854784.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }