Files
Qwen3-4B-Thinking-2507-SFT-tr5/trainer_state.json
ModelHub XC ace6e29fbe 初始化项目,由ModelHub XC社区提供模型
Model: edbeeching/Qwen3-4B-Thinking-2507-SFT-tr5
Source: Original Platform
2026-06-20 17:34:31 +08:00

5004 lines
133 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.6268656716417915,
"eval_steps": 500,
"global_step": 620,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007462686567164179,
"grad_norm": 7.744417324010015,
"learning_rate": 0.0,
"loss": 0.8532977104187012,
"num_tokens": 940199.0,
"step": 1
},
{
"epoch": 0.014925373134328358,
"grad_norm": 7.534592349315775,
"learning_rate": 1.5789473684210526e-06,
"loss": 0.870805561542511,
"num_tokens": 1940958.0,
"step": 2
},
{
"epoch": 0.022388059701492536,
"grad_norm": 7.301629258268991,
"learning_rate": 3.157894736842105e-06,
"loss": 0.8422647714614868,
"num_tokens": 2857380.0,
"step": 3
},
{
"epoch": 0.029850746268656716,
"grad_norm": 6.897211503812214,
"learning_rate": 4.736842105263158e-06,
"loss": 0.8292515277862549,
"num_tokens": 3696403.0,
"step": 4
},
{
"epoch": 0.03731343283582089,
"grad_norm": 4.507105826947362,
"learning_rate": 6.31578947368421e-06,
"loss": 0.7875182628631592,
"num_tokens": 4528235.0,
"step": 5
},
{
"epoch": 0.04477611940298507,
"grad_norm": 2.305703326092364,
"learning_rate": 7.894736842105263e-06,
"loss": 0.7126146554946899,
"num_tokens": 5554672.0,
"step": 6
},
{
"epoch": 0.05223880597014925,
"grad_norm": 1.921229796923974,
"learning_rate": 9.473684210526315e-06,
"loss": 0.6916477680206299,
"num_tokens": 6423132.0,
"step": 7
},
{
"epoch": 0.05970149253731343,
"grad_norm": 2.0416384054663053,
"learning_rate": 1.1052631578947368e-05,
"loss": 0.6471172571182251,
"num_tokens": 7201644.0,
"step": 8
},
{
"epoch": 0.06716417910447761,
"grad_norm": 2.239115666581825,
"learning_rate": 1.263157894736842e-05,
"loss": 0.6206663846969604,
"num_tokens": 8128715.0,
"step": 9
},
{
"epoch": 0.07462686567164178,
"grad_norm": 2.1623779063703825,
"learning_rate": 1.4210526315789473e-05,
"loss": 0.6805848479270935,
"num_tokens": 9074027.0,
"step": 10
},
{
"epoch": 0.08208955223880597,
"grad_norm": 1.357823714265532,
"learning_rate": 1.5789473684210526e-05,
"loss": 0.5907301902770996,
"num_tokens": 9950641.0,
"step": 11
},
{
"epoch": 0.08955223880597014,
"grad_norm": 1.2436697073761152,
"learning_rate": 1.736842105263158e-05,
"loss": 0.6134575009346008,
"num_tokens": 10885057.0,
"step": 12
},
{
"epoch": 0.09701492537313433,
"grad_norm": 1.0124569249363744,
"learning_rate": 1.894736842105263e-05,
"loss": 0.5782807469367981,
"num_tokens": 11697963.0,
"step": 13
},
{
"epoch": 0.1044776119402985,
"grad_norm": 0.7569239607127325,
"learning_rate": 2.0526315789473685e-05,
"loss": 0.5509419441223145,
"num_tokens": 12632602.0,
"step": 14
},
{
"epoch": 0.11194029850746269,
"grad_norm": 0.5817534527891748,
"learning_rate": 2.2105263157894736e-05,
"loss": 0.532228410243988,
"num_tokens": 13568889.0,
"step": 15
},
{
"epoch": 0.11940298507462686,
"grad_norm": 0.7057373397195236,
"learning_rate": 2.368421052631579e-05,
"loss": 0.5408649444580078,
"num_tokens": 14534242.0,
"step": 16
},
{
"epoch": 0.12686567164179105,
"grad_norm": 0.6046224645124905,
"learning_rate": 2.526315789473684e-05,
"loss": 0.5322834253311157,
"num_tokens": 15435946.0,
"step": 17
},
{
"epoch": 0.13432835820895522,
"grad_norm": 0.49331973808673285,
"learning_rate": 2.6842105263157896e-05,
"loss": 0.5015720725059509,
"num_tokens": 16352267.0,
"step": 18
},
{
"epoch": 0.1417910447761194,
"grad_norm": 0.5002360432437354,
"learning_rate": 2.8421052631578946e-05,
"loss": 0.507352888584137,
"num_tokens": 17277422.0,
"step": 19
},
{
"epoch": 0.14925373134328357,
"grad_norm": 0.5278153522403675,
"learning_rate": 3e-05,
"loss": 0.5295838713645935,
"num_tokens": 18270697.0,
"step": 20
},
{
"epoch": 0.15671641791044777,
"grad_norm": 0.504428457093509,
"learning_rate": 2.9999815560649025e-05,
"loss": 0.4966413080692291,
"num_tokens": 19308555.0,
"step": 21
},
{
"epoch": 0.16417910447761194,
"grad_norm": 0.46271144160592165,
"learning_rate": 2.9999262247635783e-05,
"loss": 0.47084784507751465,
"num_tokens": 20162797.0,
"step": 22
},
{
"epoch": 0.17164179104477612,
"grad_norm": 0.4874776182121212,
"learning_rate": 2.9998340076079188e-05,
"loss": 0.4917251765727997,
"num_tokens": 20981106.0,
"step": 23
},
{
"epoch": 0.1791044776119403,
"grad_norm": 0.4431461732541396,
"learning_rate": 2.9997049071176987e-05,
"loss": 0.4785962998867035,
"num_tokens": 21858000.0,
"step": 24
},
{
"epoch": 0.1865671641791045,
"grad_norm": 0.387861494751977,
"learning_rate": 2.9995389268205035e-05,
"loss": 0.4448994994163513,
"num_tokens": 22793285.0,
"step": 25
},
{
"epoch": 0.19402985074626866,
"grad_norm": 0.4318496289646886,
"learning_rate": 2.9993360712516377e-05,
"loss": 0.5124952793121338,
"num_tokens": 23723801.0,
"step": 26
},
{
"epoch": 0.20149253731343283,
"grad_norm": 0.37498700509106286,
"learning_rate": 2.999096345953997e-05,
"loss": 0.4689701795578003,
"num_tokens": 24725740.0,
"step": 27
},
{
"epoch": 0.208955223880597,
"grad_norm": 0.46758132187976276,
"learning_rate": 2.9988197574779187e-05,
"loss": 0.5058130621910095,
"num_tokens": 25730725.0,
"step": 28
},
{
"epoch": 0.21641791044776118,
"grad_norm": 0.4541769806648773,
"learning_rate": 2.998506313381003e-05,
"loss": 0.4760160744190216,
"num_tokens": 26557776.0,
"step": 29
},
{
"epoch": 0.22388059701492538,
"grad_norm": 0.4285196017188533,
"learning_rate": 2.998156022227906e-05,
"loss": 0.46224498748779297,
"num_tokens": 27504251.0,
"step": 30
},
{
"epoch": 0.23134328358208955,
"grad_norm": 0.47777177042995506,
"learning_rate": 2.9977688935901042e-05,
"loss": 0.45890241861343384,
"num_tokens": 28534541.0,
"step": 31
},
{
"epoch": 0.23880597014925373,
"grad_norm": 0.4677469755237667,
"learning_rate": 2.997344938045636e-05,
"loss": 0.48648908734321594,
"num_tokens": 29341502.0,
"step": 32
},
{
"epoch": 0.2462686567164179,
"grad_norm": 0.417614303033266,
"learning_rate": 2.99688416717881e-05,
"loss": 0.4909588694572449,
"num_tokens": 30211822.0,
"step": 33
},
{
"epoch": 0.2537313432835821,
"grad_norm": 0.41160216727121024,
"learning_rate": 2.9963865935798904e-05,
"loss": 0.470625102519989,
"num_tokens": 31102775.0,
"step": 34
},
{
"epoch": 0.26119402985074625,
"grad_norm": 0.3679274109247649,
"learning_rate": 2.995852230844751e-05,
"loss": 0.45474812388420105,
"num_tokens": 31898227.0,
"step": 35
},
{
"epoch": 0.26865671641791045,
"grad_norm": 0.38798639827332265,
"learning_rate": 2.9952810935745055e-05,
"loss": 0.44148534536361694,
"num_tokens": 32541892.0,
"step": 36
},
{
"epoch": 0.27611940298507465,
"grad_norm": 0.3837062370088247,
"learning_rate": 2.9946731973751076e-05,
"loss": 0.47073429822921753,
"num_tokens": 33543040.0,
"step": 37
},
{
"epoch": 0.2835820895522388,
"grad_norm": 0.35454060994897946,
"learning_rate": 2.9940285588569244e-05,
"loss": 0.4520432949066162,
"num_tokens": 34505224.0,
"step": 38
},
{
"epoch": 0.291044776119403,
"grad_norm": 0.47976493136494663,
"learning_rate": 2.993347195634284e-05,
"loss": 0.490145206451416,
"num_tokens": 35411826.0,
"step": 39
},
{
"epoch": 0.29850746268656714,
"grad_norm": 0.3473597577972683,
"learning_rate": 2.992629126324992e-05,
"loss": 0.48773276805877686,
"num_tokens": 36307345.0,
"step": 40
},
{
"epoch": 0.30597014925373134,
"grad_norm": 0.3849018293538417,
"learning_rate": 2.9918743705498237e-05,
"loss": 0.4593764543533325,
"num_tokens": 37196875.0,
"step": 41
},
{
"epoch": 0.31343283582089554,
"grad_norm": 0.32778453758129855,
"learning_rate": 2.9910829489319903e-05,
"loss": 0.4493025243282318,
"num_tokens": 38112193.0,
"step": 42
},
{
"epoch": 0.3208955223880597,
"grad_norm": 0.38874846877448016,
"learning_rate": 2.9902548830965703e-05,
"loss": 0.44223347306251526,
"num_tokens": 38855918.0,
"step": 43
},
{
"epoch": 0.3283582089552239,
"grad_norm": 0.3600626273878142,
"learning_rate": 2.9893901956699236e-05,
"loss": 0.4619264602661133,
"num_tokens": 39833215.0,
"step": 44
},
{
"epoch": 0.3358208955223881,
"grad_norm": 0.4252489485804584,
"learning_rate": 2.9884889102790703e-05,
"loss": 0.47333118319511414,
"num_tokens": 40760145.0,
"step": 45
},
{
"epoch": 0.34328358208955223,
"grad_norm": 0.44355227925846324,
"learning_rate": 2.9875510515510472e-05,
"loss": 0.4684419631958008,
"num_tokens": 41745749.0,
"step": 46
},
{
"epoch": 0.35074626865671643,
"grad_norm": 0.40090080870615863,
"learning_rate": 2.986576645112232e-05,
"loss": 0.45152851939201355,
"num_tokens": 42686630.0,
"step": 47
},
{
"epoch": 0.3582089552238806,
"grad_norm": 0.3968769871577976,
"learning_rate": 2.9855657175876453e-05,
"loss": 0.46956488490104675,
"num_tokens": 43510586.0,
"step": 48
},
{
"epoch": 0.3656716417910448,
"grad_norm": 0.34818131786283646,
"learning_rate": 2.9845182966002236e-05,
"loss": 0.43737900257110596,
"num_tokens": 44362248.0,
"step": 49
},
{
"epoch": 0.373134328358209,
"grad_norm": 0.37174081071440807,
"learning_rate": 2.983434410770063e-05,
"loss": 0.4198949337005615,
"num_tokens": 45216722.0,
"step": 50
},
{
"epoch": 0.3805970149253731,
"grad_norm": 0.3771227721970023,
"learning_rate": 2.9823140897136368e-05,
"loss": 0.43871694803237915,
"num_tokens": 46010142.0,
"step": 51
},
{
"epoch": 0.3880597014925373,
"grad_norm": 0.40827091621620715,
"learning_rate": 2.981157364042988e-05,
"loss": 0.43513864278793335,
"num_tokens": 46858670.0,
"step": 52
},
{
"epoch": 0.39552238805970147,
"grad_norm": 0.37817640704637673,
"learning_rate": 2.9799642653648915e-05,
"loss": 0.4714231491088867,
"num_tokens": 47836905.0,
"step": 53
},
{
"epoch": 0.40298507462686567,
"grad_norm": 0.4521767636779311,
"learning_rate": 2.9787348262799917e-05,
"loss": 0.46958601474761963,
"num_tokens": 48836237.0,
"step": 54
},
{
"epoch": 0.41044776119402987,
"grad_norm": 0.3996917386608209,
"learning_rate": 2.9774690803819092e-05,
"loss": 0.4700014591217041,
"num_tokens": 49860153.0,
"step": 55
},
{
"epoch": 0.417910447761194,
"grad_norm": 0.3486187929603249,
"learning_rate": 2.976167062256327e-05,
"loss": 0.4338191747665405,
"num_tokens": 50786110.0,
"step": 56
},
{
"epoch": 0.4253731343283582,
"grad_norm": 0.32937938308672743,
"learning_rate": 2.9748288074800414e-05,
"loss": 0.41941165924072266,
"num_tokens": 51790390.0,
"step": 57
},
{
"epoch": 0.43283582089552236,
"grad_norm": 0.34924110154041565,
"learning_rate": 2.9734543526199922e-05,
"loss": 0.457973837852478,
"num_tokens": 52742397.0,
"step": 58
},
{
"epoch": 0.44029850746268656,
"grad_norm": 0.410698318336265,
"learning_rate": 2.9720437352322618e-05,
"loss": 0.47605207562446594,
"num_tokens": 53673114.0,
"step": 59
},
{
"epoch": 0.44776119402985076,
"grad_norm": 0.32459119196055575,
"learning_rate": 2.9705969938610523e-05,
"loss": 0.4107760787010193,
"num_tokens": 54566889.0,
"step": 60
},
{
"epoch": 0.4552238805970149,
"grad_norm": 0.356687155861432,
"learning_rate": 2.9691141680376277e-05,
"loss": 0.4515986740589142,
"num_tokens": 55460491.0,
"step": 61
},
{
"epoch": 0.4626865671641791,
"grad_norm": 0.34110692115474495,
"learning_rate": 2.9675952982792383e-05,
"loss": 0.4474300444126129,
"num_tokens": 56520990.0,
"step": 62
},
{
"epoch": 0.4701492537313433,
"grad_norm": 0.4001841133306117,
"learning_rate": 2.9660404260880092e-05,
"loss": 0.446544885635376,
"num_tokens": 57422206.0,
"step": 63
},
{
"epoch": 0.47761194029850745,
"grad_norm": 0.3580957652668953,
"learning_rate": 2.964449593949811e-05,
"loss": 0.47310975193977356,
"num_tokens": 58260720.0,
"step": 64
},
{
"epoch": 0.48507462686567165,
"grad_norm": 0.4285689076758677,
"learning_rate": 2.9628228453330938e-05,
"loss": 0.4611589014530182,
"num_tokens": 59123617.0,
"step": 65
},
{
"epoch": 0.4925373134328358,
"grad_norm": 0.37944004231248546,
"learning_rate": 2.9611602246877044e-05,
"loss": 0.43839746713638306,
"num_tokens": 60033505.0,
"step": 66
},
{
"epoch": 0.5,
"grad_norm": 0.4058895912331529,
"learning_rate": 2.9594617774436683e-05,
"loss": 0.4206322133541107,
"num_tokens": 60974452.0,
"step": 67
},
{
"epoch": 0.5074626865671642,
"grad_norm": 0.3573247245646442,
"learning_rate": 2.957727550009949e-05,
"loss": 0.4404195547103882,
"num_tokens": 61913977.0,
"step": 68
},
{
"epoch": 0.5149253731343284,
"grad_norm": 0.3517188453992546,
"learning_rate": 2.9559575897731815e-05,
"loss": 0.4638599753379822,
"num_tokens": 62800879.0,
"step": 69
},
{
"epoch": 0.5223880597014925,
"grad_norm": 0.38237698336509157,
"learning_rate": 2.9541519450963753e-05,
"loss": 0.4506247341632843,
"num_tokens": 63823183.0,
"step": 70
},
{
"epoch": 0.5298507462686567,
"grad_norm": 0.34718947718626825,
"learning_rate": 2.9523106653175947e-05,
"loss": 0.43822404742240906,
"num_tokens": 64700087.0,
"step": 71
},
{
"epoch": 0.5373134328358209,
"grad_norm": 0.38571581429627394,
"learning_rate": 2.9504338007486096e-05,
"loss": 0.41369548439979553,
"num_tokens": 65533415.0,
"step": 72
},
{
"epoch": 0.5447761194029851,
"grad_norm": 0.4179062195109546,
"learning_rate": 2.948521402673521e-05,
"loss": 0.4132109582424164,
"num_tokens": 66411589.0,
"step": 73
},
{
"epoch": 0.5522388059701493,
"grad_norm": 0.3884838709698021,
"learning_rate": 2.9465735233473607e-05,
"loss": 0.4519786536693573,
"num_tokens": 67203675.0,
"step": 74
},
{
"epoch": 0.5597014925373134,
"grad_norm": 0.37292728239052736,
"learning_rate": 2.9445902159946608e-05,
"loss": 0.4415651559829712,
"num_tokens": 68056574.0,
"step": 75
},
{
"epoch": 0.5671641791044776,
"grad_norm": 0.3498292196022338,
"learning_rate": 2.942571534808003e-05,
"loss": 0.4234054684638977,
"num_tokens": 68907426.0,
"step": 76
},
{
"epoch": 0.5746268656716418,
"grad_norm": 0.3282849607678631,
"learning_rate": 2.9405175349465346e-05,
"loss": 0.43461883068084717,
"num_tokens": 69817179.0,
"step": 77
},
{
"epoch": 0.582089552238806,
"grad_norm": 0.35116807662765037,
"learning_rate": 2.938428272534464e-05,
"loss": 0.45615193247795105,
"num_tokens": 70803003.0,
"step": 78
},
{
"epoch": 0.5895522388059702,
"grad_norm": 0.3329914149262814,
"learning_rate": 2.9363038046595242e-05,
"loss": 0.41635048389434814,
"num_tokens": 71708353.0,
"step": 79
},
{
"epoch": 0.5970149253731343,
"grad_norm": 0.38045704306263595,
"learning_rate": 2.9341441893714155e-05,
"loss": 0.43726855516433716,
"num_tokens": 72587706.0,
"step": 80
},
{
"epoch": 0.6044776119402985,
"grad_norm": 0.35924349265893135,
"learning_rate": 2.9319494856802178e-05,
"loss": 0.4230448007583618,
"num_tokens": 73605832.0,
"step": 81
},
{
"epoch": 0.6119402985074627,
"grad_norm": 0.38619247621111746,
"learning_rate": 2.9297197535547806e-05,
"loss": 0.4357215464115143,
"num_tokens": 74512496.0,
"step": 82
},
{
"epoch": 0.6194029850746269,
"grad_norm": 0.39604323481100373,
"learning_rate": 2.9274550539210795e-05,
"loss": 0.4608227014541626,
"num_tokens": 75428481.0,
"step": 83
},
{
"epoch": 0.6268656716417911,
"grad_norm": 0.37443244592149005,
"learning_rate": 2.925155448660557e-05,
"loss": 0.4334092140197754,
"num_tokens": 76292706.0,
"step": 84
},
{
"epoch": 0.6343283582089553,
"grad_norm": 0.331225928550454,
"learning_rate": 2.9228210006084278e-05,
"loss": 0.42100948095321655,
"num_tokens": 77209633.0,
"step": 85
},
{
"epoch": 0.6417910447761194,
"grad_norm": 0.3733284174499636,
"learning_rate": 2.9204517735519638e-05,
"loss": 0.42018914222717285,
"num_tokens": 78063420.0,
"step": 86
},
{
"epoch": 0.6492537313432836,
"grad_norm": 0.32698643343045436,
"learning_rate": 2.91804783222875e-05,
"loss": 0.4293556809425354,
"num_tokens": 78870397.0,
"step": 87
},
{
"epoch": 0.6567164179104478,
"grad_norm": 0.3702901089838646,
"learning_rate": 2.915609242324917e-05,
"loss": 0.43072593212127686,
"num_tokens": 79871666.0,
"step": 88
},
{
"epoch": 0.664179104477612,
"grad_norm": 0.37258166652948627,
"learning_rate": 2.913136070473344e-05,
"loss": 0.42400825023651123,
"num_tokens": 80712206.0,
"step": 89
},
{
"epoch": 0.6716417910447762,
"grad_norm": 0.3474241060283533,
"learning_rate": 2.9106283842518404e-05,
"loss": 0.4022632837295532,
"num_tokens": 81538216.0,
"step": 90
},
{
"epoch": 0.6791044776119403,
"grad_norm": 0.3241352741578233,
"learning_rate": 2.9080862521812974e-05,
"loss": 0.4167214035987854,
"num_tokens": 82585839.0,
"step": 91
},
{
"epoch": 0.6865671641791045,
"grad_norm": 0.3810979193242223,
"learning_rate": 2.9055097437238178e-05,
"loss": 0.424973726272583,
"num_tokens": 83427449.0,
"step": 92
},
{
"epoch": 0.6940298507462687,
"grad_norm": 0.3649031636927641,
"learning_rate": 2.9028989292808156e-05,
"loss": 0.4390385150909424,
"num_tokens": 84449388.0,
"step": 93
},
{
"epoch": 0.7014925373134329,
"grad_norm": 0.28994598738953636,
"learning_rate": 2.9002538801910943e-05,
"loss": 0.4120522141456604,
"num_tokens": 85256514.0,
"step": 94
},
{
"epoch": 0.7089552238805971,
"grad_norm": 0.33708866035801577,
"learning_rate": 2.897574668728896e-05,
"loss": 0.4396127164363861,
"num_tokens": 86165960.0,
"step": 95
},
{
"epoch": 0.7164179104477612,
"grad_norm": 0.33927189407215896,
"learning_rate": 2.894861368101929e-05,
"loss": 0.4281761050224304,
"num_tokens": 86982659.0,
"step": 96
},
{
"epoch": 0.7238805970149254,
"grad_norm": 0.31050046707178475,
"learning_rate": 2.892114052449363e-05,
"loss": 0.42657923698425293,
"num_tokens": 87931000.0,
"step": 97
},
{
"epoch": 0.7313432835820896,
"grad_norm": 0.3648336319576507,
"learning_rate": 2.8893327968398085e-05,
"loss": 0.4396938681602478,
"num_tokens": 88689701.0,
"step": 98
},
{
"epoch": 0.7388059701492538,
"grad_norm": 0.32123414861291977,
"learning_rate": 2.886517677269263e-05,
"loss": 0.4277549386024475,
"num_tokens": 89547645.0,
"step": 99
},
{
"epoch": 0.746268656716418,
"grad_norm": 0.35178070684423185,
"learning_rate": 2.883668770659033e-05,
"loss": 0.42951005697250366,
"num_tokens": 90297517.0,
"step": 100
},
{
"epoch": 0.753731343283582,
"grad_norm": 0.3404454736543532,
"learning_rate": 2.8807861548536364e-05,
"loss": 0.42362749576568604,
"num_tokens": 91186856.0,
"step": 101
},
{
"epoch": 0.7611940298507462,
"grad_norm": 0.3294687134617137,
"learning_rate": 2.8778699086186704e-05,
"loss": 0.43012386560440063,
"num_tokens": 91987232.0,
"step": 102
},
{
"epoch": 0.7686567164179104,
"grad_norm": 0.3613861468433532,
"learning_rate": 2.8749201116386635e-05,
"loss": 0.46676358580589294,
"num_tokens": 92898696.0,
"step": 103
},
{
"epoch": 0.7761194029850746,
"grad_norm": 0.32525203161057137,
"learning_rate": 2.871936844514895e-05,
"loss": 0.4271778464317322,
"num_tokens": 93791120.0,
"step": 104
},
{
"epoch": 0.7835820895522388,
"grad_norm": 0.3645223492290418,
"learning_rate": 2.8689201887631954e-05,
"loss": 0.4019509553909302,
"num_tokens": 94639289.0,
"step": 105
},
{
"epoch": 0.7910447761194029,
"grad_norm": 0.35120891749306765,
"learning_rate": 2.8658702268117166e-05,
"loss": 0.47020262479782104,
"num_tokens": 95400207.0,
"step": 106
},
{
"epoch": 0.7985074626865671,
"grad_norm": 0.3897429998289724,
"learning_rate": 2.8627870419986818e-05,
"loss": 0.45215320587158203,
"num_tokens": 96227104.0,
"step": 107
},
{
"epoch": 0.8059701492537313,
"grad_norm": 0.3551261237711927,
"learning_rate": 2.859670718570107e-05,
"loss": 0.41790810227394104,
"num_tokens": 97056588.0,
"step": 108
},
{
"epoch": 0.8134328358208955,
"grad_norm": 0.4085760278992768,
"learning_rate": 2.8565213416774984e-05,
"loss": 0.43688803911209106,
"num_tokens": 97944111.0,
"step": 109
},
{
"epoch": 0.8208955223880597,
"grad_norm": 0.34538986828654805,
"learning_rate": 2.8533389973755266e-05,
"loss": 0.40269792079925537,
"num_tokens": 98816920.0,
"step": 110
},
{
"epoch": 0.8283582089552238,
"grad_norm": 0.3680387468305633,
"learning_rate": 2.8501237726196767e-05,
"loss": 0.4414367079734802,
"num_tokens": 99773832.0,
"step": 111
},
{
"epoch": 0.835820895522388,
"grad_norm": 0.3132024744536474,
"learning_rate": 2.846875755263869e-05,
"loss": 0.44121602177619934,
"num_tokens": 100805832.0,
"step": 112
},
{
"epoch": 0.8432835820895522,
"grad_norm": 0.3212832093670825,
"learning_rate": 2.843595034058062e-05,
"loss": 0.43163514137268066,
"num_tokens": 101747939.0,
"step": 113
},
{
"epoch": 0.8507462686567164,
"grad_norm": 0.3377699333103733,
"learning_rate": 2.8402816986458235e-05,
"loss": 0.45706361532211304,
"num_tokens": 102733715.0,
"step": 114
},
{
"epoch": 0.8582089552238806,
"grad_norm": 0.3020084528652058,
"learning_rate": 2.836935839561885e-05,
"loss": 0.40077459812164307,
"num_tokens": 103577969.0,
"step": 115
},
{
"epoch": 0.8656716417910447,
"grad_norm": 0.3487492894550424,
"learning_rate": 2.833557548229665e-05,
"loss": 0.4227057695388794,
"num_tokens": 104507837.0,
"step": 116
},
{
"epoch": 0.8731343283582089,
"grad_norm": 0.3476991142190051,
"learning_rate": 2.8301469169587724e-05,
"loss": 0.4556281566619873,
"num_tokens": 105482901.0,
"step": 117
},
{
"epoch": 0.8805970149253731,
"grad_norm": 0.328015796780554,
"learning_rate": 2.826704038942485e-05,
"loss": 0.42667752504348755,
"num_tokens": 106441176.0,
"step": 118
},
{
"epoch": 0.8880597014925373,
"grad_norm": 0.34794554907206476,
"learning_rate": 2.8232290082551994e-05,
"loss": 0.4443303048610687,
"num_tokens": 107265870.0,
"step": 119
},
{
"epoch": 0.8955223880597015,
"grad_norm": 0.3207190436944611,
"learning_rate": 2.819721919849865e-05,
"loss": 0.43958723545074463,
"num_tokens": 108146690.0,
"step": 120
},
{
"epoch": 0.9029850746268657,
"grad_norm": 0.36544124775156067,
"learning_rate": 2.8161828695553876e-05,
"loss": 0.4427248537540436,
"num_tokens": 109034402.0,
"step": 121
},
{
"epoch": 0.9104477611940298,
"grad_norm": 0.35253344355491567,
"learning_rate": 2.812611954074009e-05,
"loss": 0.4511459469795227,
"num_tokens": 109989572.0,
"step": 122
},
{
"epoch": 0.917910447761194,
"grad_norm": 0.3419502161009737,
"learning_rate": 2.8090092709786683e-05,
"loss": 0.45898139476776123,
"num_tokens": 110969334.0,
"step": 123
},
{
"epoch": 0.9253731343283582,
"grad_norm": 0.3994476102816512,
"learning_rate": 2.8053749187103323e-05,
"loss": 0.4459114372730255,
"num_tokens": 111844990.0,
"step": 124
},
{
"epoch": 0.9328358208955224,
"grad_norm": 0.37335441467558017,
"learning_rate": 2.801708996575309e-05,
"loss": 0.43445926904678345,
"num_tokens": 112800888.0,
"step": 125
},
{
"epoch": 0.9402985074626866,
"grad_norm": 0.3148170264896714,
"learning_rate": 2.7980116047425318e-05,
"loss": 0.4525066912174225,
"num_tokens": 113857610.0,
"step": 126
},
{
"epoch": 0.9477611940298507,
"grad_norm": 0.3540516068525593,
"learning_rate": 2.7942828442408225e-05,
"loss": 0.42399919033050537,
"num_tokens": 114800904.0,
"step": 127
},
{
"epoch": 0.9552238805970149,
"grad_norm": 0.329278225140609,
"learning_rate": 2.7905228169561314e-05,
"loss": 0.43032482266426086,
"num_tokens": 115759913.0,
"step": 128
},
{
"epoch": 0.9626865671641791,
"grad_norm": 0.3630319590251905,
"learning_rate": 2.786731625628754e-05,
"loss": 0.44865018129348755,
"num_tokens": 116624191.0,
"step": 129
},
{
"epoch": 0.9701492537313433,
"grad_norm": 0.34043245011086026,
"learning_rate": 2.7829093738505223e-05,
"loss": 0.4354362189769745,
"num_tokens": 117499418.0,
"step": 130
},
{
"epoch": 0.9776119402985075,
"grad_norm": 0.3222353349021393,
"learning_rate": 2.7790561660619757e-05,
"loss": 0.4167882204055786,
"num_tokens": 118329517.0,
"step": 131
},
{
"epoch": 0.9850746268656716,
"grad_norm": 0.33759104962145015,
"learning_rate": 2.7751721075495062e-05,
"loss": 0.4432622492313385,
"num_tokens": 119221343.0,
"step": 132
},
{
"epoch": 0.9925373134328358,
"grad_norm": 0.3033715752060908,
"learning_rate": 2.7712573044424797e-05,
"loss": 0.4342583119869232,
"num_tokens": 120123659.0,
"step": 133
},
{
"epoch": 1.0,
"grad_norm": 0.3157095104018222,
"learning_rate": 2.7673118637103414e-05,
"loss": 0.43080803751945496,
"num_tokens": 121054976.0,
"step": 134
},
{
"epoch": 1.007462686567164,
"grad_norm": 0.4365911200440399,
"learning_rate": 2.7633358931596875e-05,
"loss": 0.39168182015419006,
"num_tokens": 121995409.0,
"step": 135
},
{
"epoch": 1.0149253731343284,
"grad_norm": 0.34729870538048124,
"learning_rate": 2.7593295014313222e-05,
"loss": 0.3802366852760315,
"num_tokens": 122823226.0,
"step": 136
},
{
"epoch": 1.0223880597014925,
"grad_norm": 0.36891237319998677,
"learning_rate": 2.755292797997288e-05,
"loss": 0.370537668466568,
"num_tokens": 123660597.0,
"step": 137
},
{
"epoch": 1.0298507462686568,
"grad_norm": 0.4473631385211834,
"learning_rate": 2.751225893157876e-05,
"loss": 0.3735314905643463,
"num_tokens": 124554146.0,
"step": 138
},
{
"epoch": 1.037313432835821,
"grad_norm": 0.3699400370687646,
"learning_rate": 2.7471288980386104e-05,
"loss": 0.3833698034286499,
"num_tokens": 125332236.0,
"step": 139
},
{
"epoch": 1.044776119402985,
"grad_norm": 0.3383566258765907,
"learning_rate": 2.743001924587213e-05,
"loss": 0.36771178245544434,
"num_tokens": 126310236.0,
"step": 140
},
{
"epoch": 1.0522388059701493,
"grad_norm": 0.34187462479662406,
"learning_rate": 2.738845085570543e-05,
"loss": 0.37672191858291626,
"num_tokens": 127218706.0,
"step": 141
},
{
"epoch": 1.0597014925373134,
"grad_norm": 0.3483444921381018,
"learning_rate": 2.734658494571519e-05,
"loss": 0.38160958886146545,
"num_tokens": 128057825.0,
"step": 142
},
{
"epoch": 1.0671641791044777,
"grad_norm": 0.3614945464912025,
"learning_rate": 2.73044226598601e-05,
"loss": 0.37473732233047485,
"num_tokens": 128955736.0,
"step": 143
},
{
"epoch": 1.0746268656716418,
"grad_norm": 0.36395086131367427,
"learning_rate": 2.7261965150197148e-05,
"loss": 0.3781934380531311,
"num_tokens": 129777788.0,
"step": 144
},
{
"epoch": 1.0820895522388059,
"grad_norm": 0.38520112404335904,
"learning_rate": 2.7219213576850122e-05,
"loss": 0.37962204217910767,
"num_tokens": 130659960.0,
"step": 145
},
{
"epoch": 1.0895522388059702,
"grad_norm": 0.4078777110059471,
"learning_rate": 2.7176169107977898e-05,
"loss": 0.38424360752105713,
"num_tokens": 131550221.0,
"step": 146
},
{
"epoch": 1.0970149253731343,
"grad_norm": 0.36815785695334224,
"learning_rate": 2.713283291974253e-05,
"loss": 0.38741737604141235,
"num_tokens": 132486469.0,
"step": 147
},
{
"epoch": 1.1044776119402986,
"grad_norm": 0.3233706872052398,
"learning_rate": 2.7089206196277132e-05,
"loss": 0.36474981904029846,
"num_tokens": 133366950.0,
"step": 148
},
{
"epoch": 1.1119402985074627,
"grad_norm": 0.3322677249511474,
"learning_rate": 2.704529012965348e-05,
"loss": 0.3808598518371582,
"num_tokens": 134285043.0,
"step": 149
},
{
"epoch": 1.1194029850746268,
"grad_norm": 0.339456725306424,
"learning_rate": 2.7001085919849477e-05,
"loss": 0.35642245411872864,
"num_tokens": 135136228.0,
"step": 150
},
{
"epoch": 1.126865671641791,
"grad_norm": 0.3226137136335262,
"learning_rate": 2.6956594774716346e-05,
"loss": 0.3718845844268799,
"num_tokens": 136013129.0,
"step": 151
},
{
"epoch": 1.1343283582089552,
"grad_norm": 0.3327685091410092,
"learning_rate": 2.691181790994564e-05,
"loss": 0.3985145092010498,
"num_tokens": 136978716.0,
"step": 152
},
{
"epoch": 1.1417910447761195,
"grad_norm": 0.33120353900381816,
"learning_rate": 2.6866756549035997e-05,
"loss": 0.3699200451374054,
"num_tokens": 137957110.0,
"step": 153
},
{
"epoch": 1.1492537313432836,
"grad_norm": 0.3043199924636294,
"learning_rate": 2.6821411923259747e-05,
"loss": 0.3767678737640381,
"num_tokens": 138894209.0,
"step": 154
},
{
"epoch": 1.1567164179104479,
"grad_norm": 0.33866074681411823,
"learning_rate": 2.677578527162923e-05,
"loss": 0.3994665741920471,
"num_tokens": 139925878.0,
"step": 155
},
{
"epoch": 1.164179104477612,
"grad_norm": 0.3339471075019717,
"learning_rate": 2.672987784086297e-05,
"loss": 0.37443894147872925,
"num_tokens": 140844266.0,
"step": 156
},
{
"epoch": 1.171641791044776,
"grad_norm": 0.34303437215557886,
"learning_rate": 2.66836908853516e-05,
"loss": 0.37581557035446167,
"num_tokens": 141685264.0,
"step": 157
},
{
"epoch": 1.1791044776119404,
"grad_norm": 0.29691716745960073,
"learning_rate": 2.6637225667123567e-05,
"loss": 0.3585776090621948,
"num_tokens": 142607439.0,
"step": 158
},
{
"epoch": 1.1865671641791045,
"grad_norm": 0.36115800096975614,
"learning_rate": 2.659048345581068e-05,
"loss": 0.3523404598236084,
"num_tokens": 143442522.0,
"step": 159
},
{
"epoch": 1.1940298507462686,
"grad_norm": 0.3854051146002361,
"learning_rate": 2.654346552861341e-05,
"loss": 0.3825865685939789,
"num_tokens": 144356683.0,
"step": 160
},
{
"epoch": 1.2014925373134329,
"grad_norm": 0.37239720041712515,
"learning_rate": 2.6496173170265967e-05,
"loss": 0.38340622186660767,
"num_tokens": 145164747.0,
"step": 161
},
{
"epoch": 1.208955223880597,
"grad_norm": 0.32579199473078013,
"learning_rate": 2.6448607673001228e-05,
"loss": 0.37306541204452515,
"num_tokens": 145974438.0,
"step": 162
},
{
"epoch": 1.2164179104477613,
"grad_norm": 0.3018547733296397,
"learning_rate": 2.6400770336515403e-05,
"loss": 0.35844796895980835,
"num_tokens": 146897553.0,
"step": 163
},
{
"epoch": 1.2238805970149254,
"grad_norm": 0.3718792094427089,
"learning_rate": 2.6352662467932535e-05,
"loss": 0.4024726152420044,
"num_tokens": 147706235.0,
"step": 164
},
{
"epoch": 1.2313432835820897,
"grad_norm": 0.2976198356156792,
"learning_rate": 2.6304285381768785e-05,
"loss": 0.3483440577983856,
"num_tokens": 148638477.0,
"step": 165
},
{
"epoch": 1.2388059701492538,
"grad_norm": 0.40262421884910277,
"learning_rate": 2.6255640399896502e-05,
"loss": 0.37967991828918457,
"num_tokens": 149598765.0,
"step": 166
},
{
"epoch": 1.2462686567164178,
"grad_norm": 0.3506490786312828,
"learning_rate": 2.620672885150811e-05,
"loss": 0.3896668553352356,
"num_tokens": 150499813.0,
"step": 167
},
{
"epoch": 1.2537313432835822,
"grad_norm": 0.37867461356808607,
"learning_rate": 2.61575520730798e-05,
"loss": 0.3811056315898895,
"num_tokens": 151406909.0,
"step": 168
},
{
"epoch": 1.2611940298507462,
"grad_norm": 0.34652476711835556,
"learning_rate": 2.6108111408334992e-05,
"loss": 0.4021441638469696,
"num_tokens": 152358615.0,
"step": 169
},
{
"epoch": 1.2686567164179103,
"grad_norm": 0.3443693872016931,
"learning_rate": 2.6058408208207623e-05,
"loss": 0.3495699167251587,
"num_tokens": 153140245.0,
"step": 170
},
{
"epoch": 1.2761194029850746,
"grad_norm": 0.3199055027860086,
"learning_rate": 2.600844383080525e-05,
"loss": 0.39528757333755493,
"num_tokens": 154065403.0,
"step": 171
},
{
"epoch": 1.2835820895522387,
"grad_norm": 0.38260512944207237,
"learning_rate": 2.595821964137192e-05,
"loss": 0.3903374969959259,
"num_tokens": 155004060.0,
"step": 172
},
{
"epoch": 1.291044776119403,
"grad_norm": 0.32094103983604383,
"learning_rate": 2.590773701225089e-05,
"loss": 0.38557156920433044,
"num_tokens": 155894044.0,
"step": 173
},
{
"epoch": 1.2985074626865671,
"grad_norm": 0.3191554521618694,
"learning_rate": 2.585699732284708e-05,
"loss": 0.3759213089942932,
"num_tokens": 156891753.0,
"step": 174
},
{
"epoch": 1.3059701492537314,
"grad_norm": 0.32242700469585533,
"learning_rate": 2.580600195958945e-05,
"loss": 0.37213414907455444,
"num_tokens": 157781264.0,
"step": 175
},
{
"epoch": 1.3134328358208955,
"grad_norm": 0.3284517036436859,
"learning_rate": 2.5754752315893065e-05,
"loss": 0.378812313079834,
"num_tokens": 158729371.0,
"step": 176
},
{
"epoch": 1.3208955223880596,
"grad_norm": 0.3415023419708296,
"learning_rate": 2.5703249792121037e-05,
"loss": 0.3865644931793213,
"num_tokens": 159723929.0,
"step": 177
},
{
"epoch": 1.328358208955224,
"grad_norm": 0.3343050420961093,
"learning_rate": 2.5651495795546263e-05,
"loss": 0.4062744081020355,
"num_tokens": 160655042.0,
"step": 178
},
{
"epoch": 1.335820895522388,
"grad_norm": 0.5981815975526952,
"learning_rate": 2.5599491740312972e-05,
"loss": 0.37754061818122864,
"num_tokens": 161689806.0,
"step": 179
},
{
"epoch": 1.3432835820895521,
"grad_norm": 0.35234814112990026,
"learning_rate": 2.5547239047398078e-05,
"loss": 0.3692866861820221,
"num_tokens": 162672971.0,
"step": 180
},
{
"epoch": 1.3507462686567164,
"grad_norm": 0.32830063055017134,
"learning_rate": 2.5494739144572368e-05,
"loss": 0.35535305738449097,
"num_tokens": 163606727.0,
"step": 181
},
{
"epoch": 1.3582089552238805,
"grad_norm": 0.3451645432424477,
"learning_rate": 2.544199346636147e-05,
"loss": 0.38066795468330383,
"num_tokens": 164379724.0,
"step": 182
},
{
"epoch": 1.3656716417910448,
"grad_norm": 0.36363681244248197,
"learning_rate": 2.5389003454006667e-05,
"loss": 0.380257785320282,
"num_tokens": 165282114.0,
"step": 183
},
{
"epoch": 1.373134328358209,
"grad_norm": 0.3192538780146095,
"learning_rate": 2.533577055542551e-05,
"loss": 0.3674117922782898,
"num_tokens": 166184652.0,
"step": 184
},
{
"epoch": 1.3805970149253732,
"grad_norm": 0.33313618040811743,
"learning_rate": 2.5282296225172267e-05,
"loss": 0.36746978759765625,
"num_tokens": 167131883.0,
"step": 185
},
{
"epoch": 1.3880597014925373,
"grad_norm": 0.3670551777933176,
"learning_rate": 2.522858192439815e-05,
"loss": 0.40295130014419556,
"num_tokens": 168105786.0,
"step": 186
},
{
"epoch": 1.3955223880597014,
"grad_norm": 0.3475964519943968,
"learning_rate": 2.5174629120811432e-05,
"loss": 0.38296568393707275,
"num_tokens": 168981965.0,
"step": 187
},
{
"epoch": 1.4029850746268657,
"grad_norm": 0.3556039194849401,
"learning_rate": 2.512043928863731e-05,
"loss": 0.38510382175445557,
"num_tokens": 169813930.0,
"step": 188
},
{
"epoch": 1.4104477611940298,
"grad_norm": 0.32738176960414617,
"learning_rate": 2.5066013908577625e-05,
"loss": 0.356991708278656,
"num_tokens": 170803921.0,
"step": 189
},
{
"epoch": 1.417910447761194,
"grad_norm": 0.3545590302027483,
"learning_rate": 2.501135446777042e-05,
"loss": 0.3816283941268921,
"num_tokens": 171568584.0,
"step": 190
},
{
"epoch": 1.4253731343283582,
"grad_norm": 0.33317616623937235,
"learning_rate": 2.4956462459749297e-05,
"loss": 0.36903613805770874,
"num_tokens": 172302686.0,
"step": 191
},
{
"epoch": 1.4328358208955223,
"grad_norm": 0.3581041627669198,
"learning_rate": 2.4901339384402598e-05,
"loss": 0.40988194942474365,
"num_tokens": 173251435.0,
"step": 192
},
{
"epoch": 1.4402985074626866,
"grad_norm": 0.3987362939905261,
"learning_rate": 2.4845986747932434e-05,
"loss": 0.3909692168235779,
"num_tokens": 174154926.0,
"step": 193
},
{
"epoch": 1.4477611940298507,
"grad_norm": 0.4522210758422187,
"learning_rate": 2.4790406062813526e-05,
"loss": 0.40102025866508484,
"num_tokens": 174968736.0,
"step": 194
},
{
"epoch": 1.455223880597015,
"grad_norm": 0.3447348513379396,
"learning_rate": 2.4734598847751868e-05,
"loss": 0.3985745310783386,
"num_tokens": 175993671.0,
"step": 195
},
{
"epoch": 1.462686567164179,
"grad_norm": 0.30700265731423365,
"learning_rate": 2.4678566627643243e-05,
"loss": 0.37859317660331726,
"num_tokens": 176965410.0,
"step": 196
},
{
"epoch": 1.4701492537313432,
"grad_norm": 0.34463758170682973,
"learning_rate": 2.462231093353155e-05,
"loss": 0.4219540059566498,
"num_tokens": 177894815.0,
"step": 197
},
{
"epoch": 1.4776119402985075,
"grad_norm": 0.3101163888412067,
"learning_rate": 2.4565833302566967e-05,
"loss": 0.3521503210067749,
"num_tokens": 178840660.0,
"step": 198
},
{
"epoch": 1.4850746268656716,
"grad_norm": 0.34884755051979194,
"learning_rate": 2.4509135277963953e-05,
"loss": 0.3874298632144928,
"num_tokens": 179786009.0,
"step": 199
},
{
"epoch": 1.4925373134328357,
"grad_norm": 0.3068857508105448,
"learning_rate": 2.445221840895908e-05,
"loss": 0.34809160232543945,
"num_tokens": 180680467.0,
"step": 200
},
{
"epoch": 1.5,
"grad_norm": 0.33736718002624627,
"learning_rate": 2.43950842507687e-05,
"loss": 0.38442444801330566,
"num_tokens": 181598316.0,
"step": 201
},
{
"epoch": 1.5074626865671643,
"grad_norm": 0.3701582076339982,
"learning_rate": 2.4337734364546455e-05,
"loss": 0.38641679286956787,
"num_tokens": 182458909.0,
"step": 202
},
{
"epoch": 1.5149253731343284,
"grad_norm": 0.43633118208871485,
"learning_rate": 2.4280170317340602e-05,
"loss": 0.3791668117046356,
"num_tokens": 183258199.0,
"step": 203
},
{
"epoch": 1.5223880597014925,
"grad_norm": 0.3471858102435004,
"learning_rate": 2.4222393682051225e-05,
"loss": 0.38509491086006165,
"num_tokens": 184223376.0,
"step": 204
},
{
"epoch": 1.5298507462686568,
"grad_norm": 0.3457150792550615,
"learning_rate": 2.4164406037387226e-05,
"loss": 0.40659117698669434,
"num_tokens": 185129043.0,
"step": 205
},
{
"epoch": 1.537313432835821,
"grad_norm": 0.4042975807556774,
"learning_rate": 2.4106208967823205e-05,
"loss": 0.386791467666626,
"num_tokens": 186025421.0,
"step": 206
},
{
"epoch": 1.544776119402985,
"grad_norm": 0.32459079771864724,
"learning_rate": 2.4047804063556156e-05,
"loss": 0.3690309226512909,
"num_tokens": 186963319.0,
"step": 207
},
{
"epoch": 1.5522388059701493,
"grad_norm": 0.3392685539840793,
"learning_rate": 2.3989192920462032e-05,
"loss": 0.3927544951438904,
"num_tokens": 187973354.0,
"step": 208
},
{
"epoch": 1.5597014925373134,
"grad_norm": 0.32438229385759354,
"learning_rate": 2.3930377140052118e-05,
"loss": 0.3521687984466553,
"num_tokens": 188705328.0,
"step": 209
},
{
"epoch": 1.5671641791044775,
"grad_norm": 0.3418923460834205,
"learning_rate": 2.3871358329429282e-05,
"loss": 0.39543381333351135,
"num_tokens": 189538934.0,
"step": 210
},
{
"epoch": 1.5746268656716418,
"grad_norm": 0.32558989276658784,
"learning_rate": 2.3812138101244062e-05,
"loss": 0.3742252588272095,
"num_tokens": 190336903.0,
"step": 211
},
{
"epoch": 1.582089552238806,
"grad_norm": 0.35255218420418694,
"learning_rate": 2.37527180736506e-05,
"loss": 0.40875107049942017,
"num_tokens": 191168843.0,
"step": 212
},
{
"epoch": 1.5895522388059702,
"grad_norm": 0.31783452945012386,
"learning_rate": 2.3693099870262425e-05,
"loss": 0.3772295117378235,
"num_tokens": 192111363.0,
"step": 213
},
{
"epoch": 1.5970149253731343,
"grad_norm": 0.3156291557535895,
"learning_rate": 2.363328512010809e-05,
"loss": 0.39021003246307373,
"num_tokens": 193103746.0,
"step": 214
},
{
"epoch": 1.6044776119402986,
"grad_norm": 0.32761123022827565,
"learning_rate": 2.3573275457586658e-05,
"loss": 0.38943108916282654,
"num_tokens": 193981563.0,
"step": 215
},
{
"epoch": 1.6119402985074627,
"grad_norm": 0.3337068007026254,
"learning_rate": 2.3513072522423058e-05,
"loss": 0.3988877236843109,
"num_tokens": 194834592.0,
"step": 216
},
{
"epoch": 1.6194029850746268,
"grad_norm": 0.3080942622353808,
"learning_rate": 2.3452677959623254e-05,
"loss": 0.3594892621040344,
"num_tokens": 195762991.0,
"step": 217
},
{
"epoch": 1.626865671641791,
"grad_norm": 0.29606722446920497,
"learning_rate": 2.3392093419429313e-05,
"loss": 0.37819525599479675,
"num_tokens": 196736861.0,
"step": 218
},
{
"epoch": 1.6343283582089554,
"grad_norm": 0.31290197305267825,
"learning_rate": 2.333132055727431e-05,
"loss": 0.386009156703949,
"num_tokens": 197626724.0,
"step": 219
},
{
"epoch": 1.6417910447761193,
"grad_norm": 0.3200855389837665,
"learning_rate": 2.32703610337371e-05,
"loss": 0.40475738048553467,
"num_tokens": 198637589.0,
"step": 220
},
{
"epoch": 1.6492537313432836,
"grad_norm": 0.2866817447413364,
"learning_rate": 2.320921651449694e-05,
"loss": 0.39424002170562744,
"num_tokens": 199563255.0,
"step": 221
},
{
"epoch": 1.6567164179104479,
"grad_norm": 0.4885347371955867,
"learning_rate": 2.3147888670287962e-05,
"loss": 0.3826729953289032,
"num_tokens": 200461303.0,
"step": 222
},
{
"epoch": 1.664179104477612,
"grad_norm": 0.3109683935111661,
"learning_rate": 2.3086379176853553e-05,
"loss": 0.40459978580474854,
"num_tokens": 201369977.0,
"step": 223
},
{
"epoch": 1.671641791044776,
"grad_norm": 0.25809457402969005,
"learning_rate": 2.3024689714900524e-05,
"loss": 0.35879969596862793,
"num_tokens": 202278503.0,
"step": 224
},
{
"epoch": 1.6791044776119404,
"grad_norm": 0.3025695421124313,
"learning_rate": 2.296282197005322e-05,
"loss": 0.35284388065338135,
"num_tokens": 203242720.0,
"step": 225
},
{
"epoch": 1.6865671641791045,
"grad_norm": 0.30640518076000706,
"learning_rate": 2.2900777632807456e-05,
"loss": 0.37256160378456116,
"num_tokens": 204150301.0,
"step": 226
},
{
"epoch": 1.6940298507462686,
"grad_norm": 0.3140380062192946,
"learning_rate": 2.283855839848431e-05,
"loss": 0.37972885370254517,
"num_tokens": 205093558.0,
"step": 227
},
{
"epoch": 1.7014925373134329,
"grad_norm": 0.34946442818041484,
"learning_rate": 2.2776165967183807e-05,
"loss": 0.39244264364242554,
"num_tokens": 205970210.0,
"step": 228
},
{
"epoch": 1.7089552238805972,
"grad_norm": 0.32538438973624206,
"learning_rate": 2.2713602043738475e-05,
"loss": 0.39682289958000183,
"num_tokens": 206859291.0,
"step": 229
},
{
"epoch": 1.716417910447761,
"grad_norm": 0.3124091616900136,
"learning_rate": 2.2650868337666746e-05,
"loss": 0.3859510123729706,
"num_tokens": 207786446.0,
"step": 230
},
{
"epoch": 1.7238805970149254,
"grad_norm": 0.3423691973747688,
"learning_rate": 2.2587966563126255e-05,
"loss": 0.3976070284843445,
"num_tokens": 208698287.0,
"step": 231
},
{
"epoch": 1.7313432835820897,
"grad_norm": 0.3134320041738064,
"learning_rate": 2.2524898438867004e-05,
"loss": 0.3667559325695038,
"num_tokens": 209548343.0,
"step": 232
},
{
"epoch": 1.7388059701492538,
"grad_norm": 0.33617188210180216,
"learning_rate": 2.2461665688184372e-05,
"loss": 0.3952285945415497,
"num_tokens": 210468969.0,
"step": 233
},
{
"epoch": 1.7462686567164178,
"grad_norm": 0.2947903980731328,
"learning_rate": 2.2398270038872083e-05,
"loss": 0.40012168884277344,
"num_tokens": 211457470.0,
"step": 234
},
{
"epoch": 1.7537313432835822,
"grad_norm": 0.3086024831747328,
"learning_rate": 2.233471322317492e-05,
"loss": 0.38004422187805176,
"num_tokens": 212347451.0,
"step": 235
},
{
"epoch": 1.7611940298507462,
"grad_norm": 0.29270776401429416,
"learning_rate": 2.227099697774146e-05,
"loss": 0.37762215733528137,
"num_tokens": 213190706.0,
"step": 236
},
{
"epoch": 1.7686567164179103,
"grad_norm": 0.35194638120625044,
"learning_rate": 2.2207123043576585e-05,
"loss": 0.3850764036178589,
"num_tokens": 214169074.0,
"step": 237
},
{
"epoch": 1.7761194029850746,
"grad_norm": 0.30551926833119664,
"learning_rate": 2.2143093165993916e-05,
"loss": 0.395663321018219,
"num_tokens": 215188427.0,
"step": 238
},
{
"epoch": 1.783582089552239,
"grad_norm": 0.33662162397203393,
"learning_rate": 2.2078909094568133e-05,
"loss": 0.3957657814025879,
"num_tokens": 216080767.0,
"step": 239
},
{
"epoch": 1.7910447761194028,
"grad_norm": 0.34783223375914446,
"learning_rate": 2.2014572583087155e-05,
"loss": 0.390730082988739,
"num_tokens": 216995394.0,
"step": 240
},
{
"epoch": 1.7985074626865671,
"grad_norm": 0.3130872530548468,
"learning_rate": 2.1950085389504232e-05,
"loss": 0.3682572841644287,
"num_tokens": 217866020.0,
"step": 241
},
{
"epoch": 1.8059701492537314,
"grad_norm": 0.3489897287487041,
"learning_rate": 2.18854492758899e-05,
"loss": 0.3791583180427551,
"num_tokens": 218680341.0,
"step": 242
},
{
"epoch": 1.8134328358208955,
"grad_norm": 0.28800056676846153,
"learning_rate": 2.182066600838384e-05,
"loss": 0.39488768577575684,
"num_tokens": 219550948.0,
"step": 243
},
{
"epoch": 1.8208955223880596,
"grad_norm": 0.35235893169992594,
"learning_rate": 2.1755737357146618e-05,
"loss": 0.37826257944107056,
"num_tokens": 220517125.0,
"step": 244
},
{
"epoch": 1.828358208955224,
"grad_norm": 0.3108059485175432,
"learning_rate": 2.169066509631132e-05,
"loss": 0.3689156770706177,
"num_tokens": 221365026.0,
"step": 245
},
{
"epoch": 1.835820895522388,
"grad_norm": 0.27688834966994996,
"learning_rate": 2.162545100393505e-05,
"loss": 0.34449559450149536,
"num_tokens": 222233736.0,
"step": 246
},
{
"epoch": 1.8432835820895521,
"grad_norm": 0.3559202619871652,
"learning_rate": 2.1560096861950396e-05,
"loss": 0.41038885712623596,
"num_tokens": 223222809.0,
"step": 247
},
{
"epoch": 1.8507462686567164,
"grad_norm": 0.32206069093634854,
"learning_rate": 2.1494604456116695e-05,
"loss": 0.3931525945663452,
"num_tokens": 224116326.0,
"step": 248
},
{
"epoch": 1.8582089552238807,
"grad_norm": 0.32036384873450585,
"learning_rate": 2.1428975575971243e-05,
"loss": 0.3952087461948395,
"num_tokens": 225107686.0,
"step": 249
},
{
"epoch": 1.8656716417910446,
"grad_norm": 0.2750788343679779,
"learning_rate": 2.1363212014780432e-05,
"loss": 0.3948509097099304,
"num_tokens": 226126493.0,
"step": 250
},
{
"epoch": 1.873134328358209,
"grad_norm": 0.3546848770246566,
"learning_rate": 2.1297315569490704e-05,
"loss": 0.38538211584091187,
"num_tokens": 227002265.0,
"step": 251
},
{
"epoch": 1.8805970149253732,
"grad_norm": 0.31987168628076534,
"learning_rate": 2.123128804067949e-05,
"loss": 0.3849794268608093,
"num_tokens": 227879194.0,
"step": 252
},
{
"epoch": 1.8880597014925373,
"grad_norm": 0.35226690474895933,
"learning_rate": 2.1165131232505973e-05,
"loss": 0.40667471289634705,
"num_tokens": 228849840.0,
"step": 253
},
{
"epoch": 1.8955223880597014,
"grad_norm": 0.3517377549019829,
"learning_rate": 2.1098846952661833e-05,
"loss": 0.36520224809646606,
"num_tokens": 229755841.0,
"step": 254
},
{
"epoch": 1.9029850746268657,
"grad_norm": 0.31196447519845827,
"learning_rate": 2.1032437012321812e-05,
"loss": 0.37600016593933105,
"num_tokens": 230531378.0,
"step": 255
},
{
"epoch": 1.9104477611940298,
"grad_norm": 0.2994021775242901,
"learning_rate": 2.0965903226094246e-05,
"loss": 0.35384806990623474,
"num_tokens": 231462516.0,
"step": 256
},
{
"epoch": 1.917910447761194,
"grad_norm": 0.3160465003726717,
"learning_rate": 2.08992474119715e-05,
"loss": 0.3978261649608612,
"num_tokens": 232355925.0,
"step": 257
},
{
"epoch": 1.9253731343283582,
"grad_norm": 0.3261497602811777,
"learning_rate": 2.0832471391280234e-05,
"loss": 0.40133193135261536,
"num_tokens": 233194033.0,
"step": 258
},
{
"epoch": 1.9328358208955225,
"grad_norm": 0.2915382309557714,
"learning_rate": 2.0765576988631707e-05,
"loss": 0.36901217699050903,
"num_tokens": 234098698.0,
"step": 259
},
{
"epoch": 1.9402985074626866,
"grad_norm": 0.3301749887472271,
"learning_rate": 2.0698566031871877e-05,
"loss": 0.38775068521499634,
"num_tokens": 235139771.0,
"step": 260
},
{
"epoch": 1.9477611940298507,
"grad_norm": 0.32435823220698096,
"learning_rate": 2.063144035203146e-05,
"loss": 0.37508994340896606,
"num_tokens": 235974035.0,
"step": 261
},
{
"epoch": 1.955223880597015,
"grad_norm": 0.3051639407042942,
"learning_rate": 2.0564201783275908e-05,
"loss": 0.3903445601463318,
"num_tokens": 236882822.0,
"step": 262
},
{
"epoch": 1.962686567164179,
"grad_norm": 0.3254047024560983,
"learning_rate": 2.0496852162855303e-05,
"loss": 0.40096017718315125,
"num_tokens": 237883798.0,
"step": 263
},
{
"epoch": 1.9701492537313432,
"grad_norm": 0.3010384436296043,
"learning_rate": 2.0429393331054122e-05,
"loss": 0.3954760432243347,
"num_tokens": 238711038.0,
"step": 264
},
{
"epoch": 1.9776119402985075,
"grad_norm": 0.2897130131386432,
"learning_rate": 2.0361827131140988e-05,
"loss": 0.3967036008834839,
"num_tokens": 239602771.0,
"step": 265
},
{
"epoch": 1.9850746268656716,
"grad_norm": 0.2957878042115852,
"learning_rate": 2.0294155409318273e-05,
"loss": 0.3834611177444458,
"num_tokens": 240344316.0,
"step": 266
},
{
"epoch": 1.9925373134328357,
"grad_norm": 0.28585034773563434,
"learning_rate": 2.022638001467168e-05,
"loss": 0.36557599902153015,
"num_tokens": 241222304.0,
"step": 267
},
{
"epoch": 2.0,
"grad_norm": 0.31387177455183296,
"learning_rate": 2.0158502799119694e-05,
"loss": 0.3776703178882599,
"num_tokens": 242128094.0,
"step": 268
},
{
"epoch": 2.0074626865671643,
"grad_norm": 0.4118428271810675,
"learning_rate": 2.0090525617362995e-05,
"loss": 0.35364389419555664,
"num_tokens": 243032329.0,
"step": 269
},
{
"epoch": 2.014925373134328,
"grad_norm": 0.3402616783384963,
"learning_rate": 2.002245032683378e-05,
"loss": 0.3219972252845764,
"num_tokens": 243800954.0,
"step": 270
},
{
"epoch": 2.0223880597014925,
"grad_norm": 0.37707313442099644,
"learning_rate": 1.9954278787644977e-05,
"loss": 0.3484679162502289,
"num_tokens": 244806412.0,
"step": 271
},
{
"epoch": 2.029850746268657,
"grad_norm": 0.39396702522268606,
"learning_rate": 1.988601286253949e-05,
"loss": 0.3331984877586365,
"num_tokens": 245802398.0,
"step": 272
},
{
"epoch": 2.0373134328358207,
"grad_norm": 0.40195308083874304,
"learning_rate": 1.9817654416839217e-05,
"loss": 0.3107374608516693,
"num_tokens": 246739297.0,
"step": 273
},
{
"epoch": 2.044776119402985,
"grad_norm": 0.35623399811084044,
"learning_rate": 1.9749205318394146e-05,
"loss": 0.3448570966720581,
"num_tokens": 247613231.0,
"step": 274
},
{
"epoch": 2.0522388059701493,
"grad_norm": 0.36546920760892426,
"learning_rate": 1.9680667437531283e-05,
"loss": 0.332324355840683,
"num_tokens": 248481475.0,
"step": 275
},
{
"epoch": 2.0597014925373136,
"grad_norm": 0.34431898026591723,
"learning_rate": 1.961204264700355e-05,
"loss": 0.3348411023616791,
"num_tokens": 249300938.0,
"step": 276
},
{
"epoch": 2.0671641791044775,
"grad_norm": 0.35727153061507005,
"learning_rate": 1.954333282193863e-05,
"loss": 0.33406710624694824,
"num_tokens": 250171263.0,
"step": 277
},
{
"epoch": 2.074626865671642,
"grad_norm": 0.3250548689859224,
"learning_rate": 1.9474539839787713e-05,
"loss": 0.3140842020511627,
"num_tokens": 251071115.0,
"step": 278
},
{
"epoch": 2.082089552238806,
"grad_norm": 0.3059178018916231,
"learning_rate": 1.9405665580274205e-05,
"loss": 0.32764101028442383,
"num_tokens": 251961398.0,
"step": 279
},
{
"epoch": 2.08955223880597,
"grad_norm": 0.3265488394498236,
"learning_rate": 1.9336711925342357e-05,
"loss": 0.31429940462112427,
"num_tokens": 252775080.0,
"step": 280
},
{
"epoch": 2.0970149253731343,
"grad_norm": 0.3033003365838648,
"learning_rate": 1.926768075910586e-05,
"loss": 0.3364748954772949,
"num_tokens": 253678902.0,
"step": 281
},
{
"epoch": 2.1044776119402986,
"grad_norm": 0.3218060266076608,
"learning_rate": 1.919857396779633e-05,
"loss": 0.34063756465911865,
"num_tokens": 254547582.0,
"step": 282
},
{
"epoch": 2.111940298507463,
"grad_norm": 0.28752356415270264,
"learning_rate": 1.9129393439711812e-05,
"loss": 0.3032745122909546,
"num_tokens": 255299741.0,
"step": 283
},
{
"epoch": 2.1194029850746268,
"grad_norm": 0.3114027725704962,
"learning_rate": 1.906014106516515e-05,
"loss": 0.323519766330719,
"num_tokens": 256183942.0,
"step": 284
},
{
"epoch": 2.126865671641791,
"grad_norm": 0.35567716347702344,
"learning_rate": 1.899081873643235e-05,
"loss": 0.3606981635093689,
"num_tokens": 257098083.0,
"step": 285
},
{
"epoch": 2.1343283582089554,
"grad_norm": 0.31558423890531895,
"learning_rate": 1.8921428347700853e-05,
"loss": 0.33504611253738403,
"num_tokens": 258138577.0,
"step": 286
},
{
"epoch": 2.1417910447761193,
"grad_norm": 0.34224186580930754,
"learning_rate": 1.8851971795017822e-05,
"loss": 0.326399028301239,
"num_tokens": 258888036.0,
"step": 287
},
{
"epoch": 2.1492537313432836,
"grad_norm": 0.30575598315812624,
"learning_rate": 1.8782450976238294e-05,
"loss": 0.3074103593826294,
"num_tokens": 259766509.0,
"step": 288
},
{
"epoch": 2.156716417910448,
"grad_norm": 0.3205831945487892,
"learning_rate": 1.8712867790973317e-05,
"loss": 0.33759474754333496,
"num_tokens": 260610097.0,
"step": 289
},
{
"epoch": 2.1641791044776117,
"grad_norm": 0.3023776868912514,
"learning_rate": 1.86432241405381e-05,
"loss": 0.3334404230117798,
"num_tokens": 261447212.0,
"step": 290
},
{
"epoch": 2.171641791044776,
"grad_norm": 0.30838933870298346,
"learning_rate": 1.8573521927900004e-05,
"loss": 0.32669875025749207,
"num_tokens": 262481613.0,
"step": 291
},
{
"epoch": 2.1791044776119404,
"grad_norm": 0.31402266902142234,
"learning_rate": 1.850376305762655e-05,
"loss": 0.35277265310287476,
"num_tokens": 263536437.0,
"step": 292
},
{
"epoch": 2.1865671641791047,
"grad_norm": 0.31931309491882254,
"learning_rate": 1.843394943583342e-05,
"loss": 0.32963383197784424,
"num_tokens": 264379962.0,
"step": 293
},
{
"epoch": 2.1940298507462686,
"grad_norm": 0.34845358198148824,
"learning_rate": 1.836408297013232e-05,
"loss": 0.3339906334877014,
"num_tokens": 265196630.0,
"step": 294
},
{
"epoch": 2.201492537313433,
"grad_norm": 0.3046594968746612,
"learning_rate": 1.8294165569578902e-05,
"loss": 0.33100634813308716,
"num_tokens": 266192395.0,
"step": 295
},
{
"epoch": 2.208955223880597,
"grad_norm": 0.30699215790098994,
"learning_rate": 1.8224199144620557e-05,
"loss": 0.33232712745666504,
"num_tokens": 267198691.0,
"step": 296
},
{
"epoch": 2.216417910447761,
"grad_norm": 0.29968857683346356,
"learning_rate": 1.8154185607044267e-05,
"loss": 0.3363949656486511,
"num_tokens": 268129026.0,
"step": 297
},
{
"epoch": 2.2238805970149254,
"grad_norm": 0.2805025168393364,
"learning_rate": 1.8084126869924304e-05,
"loss": 0.32357555627822876,
"num_tokens": 269034104.0,
"step": 298
},
{
"epoch": 2.2313432835820897,
"grad_norm": 0.3030010408610569,
"learning_rate": 1.801402484757001e-05,
"loss": 0.31561556458473206,
"num_tokens": 269856471.0,
"step": 299
},
{
"epoch": 2.2388059701492535,
"grad_norm": 0.3195759723269335,
"learning_rate": 1.794388145547346e-05,
"loss": 0.34712180495262146,
"num_tokens": 270737041.0,
"step": 300
},
{
"epoch": 2.246268656716418,
"grad_norm": 0.3002118422900145,
"learning_rate": 1.7873698610257117e-05,
"loss": 0.35004639625549316,
"num_tokens": 271655450.0,
"step": 301
},
{
"epoch": 2.253731343283582,
"grad_norm": 0.28294722266034306,
"learning_rate": 1.7803478229621504e-05,
"loss": 0.3119392395019531,
"num_tokens": 272452734.0,
"step": 302
},
{
"epoch": 2.2611940298507465,
"grad_norm": 0.3751958180610849,
"learning_rate": 1.773322223229275e-05,
"loss": 0.3349981904029846,
"num_tokens": 273321732.0,
"step": 303
},
{
"epoch": 2.2686567164179103,
"grad_norm": 0.29383426672277096,
"learning_rate": 1.766293253797021e-05,
"loss": 0.3226167857646942,
"num_tokens": 274159747.0,
"step": 304
},
{
"epoch": 2.2761194029850746,
"grad_norm": 0.3225857187342117,
"learning_rate": 1.7592611067273947e-05,
"loss": 0.34066349267959595,
"num_tokens": 275031559.0,
"step": 305
},
{
"epoch": 2.283582089552239,
"grad_norm": 0.30673283102679866,
"learning_rate": 1.7522259741692343e-05,
"loss": 0.33413374423980713,
"num_tokens": 275930932.0,
"step": 306
},
{
"epoch": 2.291044776119403,
"grad_norm": 0.29177063821827953,
"learning_rate": 1.7451880483529507e-05,
"loss": 0.308035671710968,
"num_tokens": 276741084.0,
"step": 307
},
{
"epoch": 2.298507462686567,
"grad_norm": 0.28427282903782,
"learning_rate": 1.7381475215852805e-05,
"loss": 0.3250593841075897,
"num_tokens": 277707588.0,
"step": 308
},
{
"epoch": 2.3059701492537314,
"grad_norm": 0.2971627244171146,
"learning_rate": 1.7311045862440298e-05,
"loss": 0.32269105315208435,
"num_tokens": 278703194.0,
"step": 309
},
{
"epoch": 2.3134328358208958,
"grad_norm": 0.31555926494620046,
"learning_rate": 1.724059434772816e-05,
"loss": 0.32977578043937683,
"num_tokens": 279491539.0,
"step": 310
},
{
"epoch": 2.3208955223880596,
"grad_norm": 0.36417590081584483,
"learning_rate": 1.7170122596758127e-05,
"loss": 0.33532094955444336,
"num_tokens": 280324630.0,
"step": 311
},
{
"epoch": 2.328358208955224,
"grad_norm": 0.3105391565464644,
"learning_rate": 1.7099632535124854e-05,
"loss": 0.3156779408454895,
"num_tokens": 281308248.0,
"step": 312
},
{
"epoch": 2.3358208955223883,
"grad_norm": 0.2749240967299516,
"learning_rate": 1.702912608892335e-05,
"loss": 0.31482142210006714,
"num_tokens": 282221715.0,
"step": 313
},
{
"epoch": 2.343283582089552,
"grad_norm": 0.29515283586141233,
"learning_rate": 1.6958605184696297e-05,
"loss": 0.32622820138931274,
"num_tokens": 283077823.0,
"step": 314
},
{
"epoch": 2.3507462686567164,
"grad_norm": 0.2968928416618244,
"learning_rate": 1.688807174938145e-05,
"loss": 0.3397972583770752,
"num_tokens": 284064121.0,
"step": 315
},
{
"epoch": 2.3582089552238807,
"grad_norm": 0.29827012810037,
"learning_rate": 1.681752771025896e-05,
"loss": 0.3332856297492981,
"num_tokens": 285080424.0,
"step": 316
},
{
"epoch": 2.3656716417910446,
"grad_norm": 0.3039705351616898,
"learning_rate": 1.674697499489872e-05,
"loss": 0.33647334575653076,
"num_tokens": 286006199.0,
"step": 317
},
{
"epoch": 2.373134328358209,
"grad_norm": 0.30141732787530867,
"learning_rate": 1.6676415531107706e-05,
"loss": 0.3342139720916748,
"num_tokens": 286965514.0,
"step": 318
},
{
"epoch": 2.3805970149253732,
"grad_norm": 0.2956530210848347,
"learning_rate": 1.6605851246877272e-05,
"loss": 0.3201013207435608,
"num_tokens": 287842439.0,
"step": 319
},
{
"epoch": 2.388059701492537,
"grad_norm": 0.2940637214016598,
"learning_rate": 1.65352840703305e-05,
"loss": 0.3377227783203125,
"num_tokens": 288763923.0,
"step": 320
},
{
"epoch": 2.3955223880597014,
"grad_norm": 0.2832996218159561,
"learning_rate": 1.64647159296695e-05,
"loss": 0.3385891020298004,
"num_tokens": 289625254.0,
"step": 321
},
{
"epoch": 2.4029850746268657,
"grad_norm": 0.3190448740603143,
"learning_rate": 1.6394148753122734e-05,
"loss": 0.33053308725357056,
"num_tokens": 290474425.0,
"step": 322
},
{
"epoch": 2.41044776119403,
"grad_norm": 0.3096387349106184,
"learning_rate": 1.63235844688923e-05,
"loss": 0.3427371680736542,
"num_tokens": 291335951.0,
"step": 323
},
{
"epoch": 2.417910447761194,
"grad_norm": 0.27491757978825115,
"learning_rate": 1.6253025005101283e-05,
"loss": 0.3303934931755066,
"num_tokens": 292257658.0,
"step": 324
},
{
"epoch": 2.425373134328358,
"grad_norm": 0.3066415534546823,
"learning_rate": 1.6182472289741043e-05,
"loss": 0.36399906873703003,
"num_tokens": 293162733.0,
"step": 325
},
{
"epoch": 2.4328358208955225,
"grad_norm": 0.29140134988200495,
"learning_rate": 1.611192825061855e-05,
"loss": 0.3504979610443115,
"num_tokens": 294199419.0,
"step": 326
},
{
"epoch": 2.4402985074626864,
"grad_norm": 0.2895038992576744,
"learning_rate": 1.604139481530371e-05,
"loss": 0.35671094059944153,
"num_tokens": 295163721.0,
"step": 327
},
{
"epoch": 2.4477611940298507,
"grad_norm": 0.2871110161885208,
"learning_rate": 1.5970873911076654e-05,
"loss": 0.3230712115764618,
"num_tokens": 296048485.0,
"step": 328
},
{
"epoch": 2.455223880597015,
"grad_norm": 0.29355297538880015,
"learning_rate": 1.590036746487515e-05,
"loss": 0.32808297872543335,
"num_tokens": 296905697.0,
"step": 329
},
{
"epoch": 2.4626865671641793,
"grad_norm": 0.2951986083005398,
"learning_rate": 1.5829877403241875e-05,
"loss": 0.3399554491043091,
"num_tokens": 297837804.0,
"step": 330
},
{
"epoch": 2.470149253731343,
"grad_norm": 0.29400144153471874,
"learning_rate": 1.5759405652271843e-05,
"loss": 0.33751606941223145,
"num_tokens": 298822600.0,
"step": 331
},
{
"epoch": 2.4776119402985075,
"grad_norm": 0.31657313495586964,
"learning_rate": 1.5688954137559705e-05,
"loss": 0.35242465138435364,
"num_tokens": 299764042.0,
"step": 332
},
{
"epoch": 2.485074626865672,
"grad_norm": 0.2716779461194812,
"learning_rate": 1.5618524784147197e-05,
"loss": 0.3363187313079834,
"num_tokens": 300754135.0,
"step": 333
},
{
"epoch": 2.4925373134328357,
"grad_norm": 0.29632964201216716,
"learning_rate": 1.5548119516470496e-05,
"loss": 0.3306392431259155,
"num_tokens": 301644488.0,
"step": 334
},
{
"epoch": 2.5,
"grad_norm": 0.27058966408716395,
"learning_rate": 1.547774025830766e-05,
"loss": 0.31814491748809814,
"num_tokens": 302538046.0,
"step": 335
},
{
"epoch": 2.5074626865671643,
"grad_norm": 0.3055781987611692,
"learning_rate": 1.5407388932726056e-05,
"loss": 0.3387256860733032,
"num_tokens": 303333898.0,
"step": 336
},
{
"epoch": 2.5149253731343286,
"grad_norm": 0.27960594879506695,
"learning_rate": 1.53370674620298e-05,
"loss": 0.33688774704933167,
"num_tokens": 304300483.0,
"step": 337
},
{
"epoch": 2.5223880597014925,
"grad_norm": 0.3168292150331439,
"learning_rate": 1.526677776770725e-05,
"loss": 0.34352821111679077,
"num_tokens": 305176138.0,
"step": 338
},
{
"epoch": 2.529850746268657,
"grad_norm": 0.40646030417402895,
"learning_rate": 1.5196521770378498e-05,
"loss": 0.3636009693145752,
"num_tokens": 306092248.0,
"step": 339
},
{
"epoch": 2.5373134328358207,
"grad_norm": 0.34451254015626254,
"learning_rate": 1.5126301389742889e-05,
"loss": 0.3361930251121521,
"num_tokens": 306939786.0,
"step": 340
},
{
"epoch": 2.544776119402985,
"grad_norm": 0.301191033765102,
"learning_rate": 1.5056118544526552e-05,
"loss": 0.34493589401245117,
"num_tokens": 307676111.0,
"step": 341
},
{
"epoch": 2.5522388059701493,
"grad_norm": 0.3179102063972407,
"learning_rate": 1.4985975152429998e-05,
"loss": 0.35899001359939575,
"num_tokens": 308557757.0,
"step": 342
},
{
"epoch": 2.5597014925373136,
"grad_norm": 0.3171583352212965,
"learning_rate": 1.4915873130075704e-05,
"loss": 0.3521921634674072,
"num_tokens": 309465811.0,
"step": 343
},
{
"epoch": 2.5671641791044775,
"grad_norm": 0.3150146379597525,
"learning_rate": 1.484581439295574e-05,
"loss": 0.3577362895011902,
"num_tokens": 310383391.0,
"step": 344
},
{
"epoch": 2.574626865671642,
"grad_norm": 0.3129412764272567,
"learning_rate": 1.4775800855379447e-05,
"loss": 0.33559077978134155,
"num_tokens": 311194322.0,
"step": 345
},
{
"epoch": 2.582089552238806,
"grad_norm": 0.3119289397564452,
"learning_rate": 1.4705834430421109e-05,
"loss": 0.3442152142524719,
"num_tokens": 312296357.0,
"step": 346
},
{
"epoch": 2.58955223880597,
"grad_norm": 0.30424883379817386,
"learning_rate": 1.4635917029867686e-05,
"loss": 0.3301926851272583,
"num_tokens": 313212850.0,
"step": 347
},
{
"epoch": 2.5970149253731343,
"grad_norm": 0.2888937621740727,
"learning_rate": 1.4566050564166585e-05,
"loss": 0.3173384368419647,
"num_tokens": 314136793.0,
"step": 348
},
{
"epoch": 2.6044776119402986,
"grad_norm": 0.3181049412726844,
"learning_rate": 1.4496236942373452e-05,
"loss": 0.33396849036216736,
"num_tokens": 315103742.0,
"step": 349
},
{
"epoch": 2.611940298507463,
"grad_norm": 0.285849757305298,
"learning_rate": 1.4426478072100001e-05,
"loss": 0.3271850645542145,
"num_tokens": 315911989.0,
"step": 350
},
{
"epoch": 2.6194029850746268,
"grad_norm": 0.30462072449655236,
"learning_rate": 1.4356775859461898e-05,
"loss": 0.3309672474861145,
"num_tokens": 316818398.0,
"step": 351
},
{
"epoch": 2.626865671641791,
"grad_norm": 0.28242189700779124,
"learning_rate": 1.4287132209026686e-05,
"loss": 0.3406432271003723,
"num_tokens": 317815953.0,
"step": 352
},
{
"epoch": 2.6343283582089554,
"grad_norm": 0.30367294764460456,
"learning_rate": 1.4217549023761713e-05,
"loss": 0.33886873722076416,
"num_tokens": 318782328.0,
"step": 353
},
{
"epoch": 2.6417910447761193,
"grad_norm": 0.2986406693990765,
"learning_rate": 1.4148028204982184e-05,
"loss": 0.3135310113430023,
"num_tokens": 319721759.0,
"step": 354
},
{
"epoch": 2.6492537313432836,
"grad_norm": 0.26230985959231096,
"learning_rate": 1.407857165229915e-05,
"loss": 0.3319952189922333,
"num_tokens": 320632767.0,
"step": 355
},
{
"epoch": 2.656716417910448,
"grad_norm": 0.29286908776153336,
"learning_rate": 1.4009181263567659e-05,
"loss": 0.33468297123908997,
"num_tokens": 321567293.0,
"step": 356
},
{
"epoch": 2.664179104477612,
"grad_norm": 0.2655369943810491,
"learning_rate": 1.3939858934834851e-05,
"loss": 0.31415632367134094,
"num_tokens": 322466432.0,
"step": 357
},
{
"epoch": 2.671641791044776,
"grad_norm": 0.29776986063827793,
"learning_rate": 1.3870606560288188e-05,
"loss": 0.32620397210121155,
"num_tokens": 323416159.0,
"step": 358
},
{
"epoch": 2.6791044776119404,
"grad_norm": 0.2888554358463497,
"learning_rate": 1.3801426032203668e-05,
"loss": 0.3294253945350647,
"num_tokens": 324280115.0,
"step": 359
},
{
"epoch": 2.6865671641791042,
"grad_norm": 0.2743974222493521,
"learning_rate": 1.3732319240894143e-05,
"loss": 0.33846813440322876,
"num_tokens": 325182095.0,
"step": 360
},
{
"epoch": 2.6940298507462686,
"grad_norm": 0.28798464786719813,
"learning_rate": 1.3663288074657639e-05,
"loss": 0.32448339462280273,
"num_tokens": 326171068.0,
"step": 361
},
{
"epoch": 2.701492537313433,
"grad_norm": 0.24943230534603614,
"learning_rate": 1.3594334419725797e-05,
"loss": 0.3398998975753784,
"num_tokens": 327115635.0,
"step": 362
},
{
"epoch": 2.708955223880597,
"grad_norm": 0.2855896503061799,
"learning_rate": 1.3525460160212284e-05,
"loss": 0.3351544141769409,
"num_tokens": 328060133.0,
"step": 363
},
{
"epoch": 2.716417910447761,
"grad_norm": 0.2981015005997933,
"learning_rate": 1.3456667178061365e-05,
"loss": 0.3235108256340027,
"num_tokens": 328868585.0,
"step": 364
},
{
"epoch": 2.7238805970149254,
"grad_norm": 0.3017533668551756,
"learning_rate": 1.3387957352996446e-05,
"loss": 0.34303897619247437,
"num_tokens": 329676478.0,
"step": 365
},
{
"epoch": 2.7313432835820897,
"grad_norm": 0.2793280893422549,
"learning_rate": 1.3319332562468716e-05,
"loss": 0.3332846164703369,
"num_tokens": 330487275.0,
"step": 366
},
{
"epoch": 2.7388059701492535,
"grad_norm": 0.272656727703741,
"learning_rate": 1.3250794681605853e-05,
"loss": 0.3316395878791809,
"num_tokens": 331339930.0,
"step": 367
},
{
"epoch": 2.746268656716418,
"grad_norm": 0.2742961273683912,
"learning_rate": 1.3182345583160782e-05,
"loss": 0.3241080045700073,
"num_tokens": 332357238.0,
"step": 368
},
{
"epoch": 2.753731343283582,
"grad_norm": 0.26762807579168846,
"learning_rate": 1.3113987137460514e-05,
"loss": 0.331865131855011,
"num_tokens": 333294492.0,
"step": 369
},
{
"epoch": 2.7611940298507465,
"grad_norm": 0.28820933208703176,
"learning_rate": 1.3045721212355023e-05,
"loss": 0.35760703682899475,
"num_tokens": 334107753.0,
"step": 370
},
{
"epoch": 2.7686567164179103,
"grad_norm": 0.26979220761978373,
"learning_rate": 1.2977549673166228e-05,
"loss": 0.3278617858886719,
"num_tokens": 334989082.0,
"step": 371
},
{
"epoch": 2.7761194029850746,
"grad_norm": 0.27879196286904034,
"learning_rate": 1.2909474382637006e-05,
"loss": 0.33124369382858276,
"num_tokens": 335901082.0,
"step": 372
},
{
"epoch": 2.783582089552239,
"grad_norm": 0.2463893540212004,
"learning_rate": 1.2841497200880305e-05,
"loss": 0.32943689823150635,
"num_tokens": 336958851.0,
"step": 373
},
{
"epoch": 2.791044776119403,
"grad_norm": 0.27477456461332017,
"learning_rate": 1.2773619985328323e-05,
"loss": 0.3239135444164276,
"num_tokens": 337786409.0,
"step": 374
},
{
"epoch": 2.798507462686567,
"grad_norm": 0.28497614476087085,
"learning_rate": 1.2705844590681726e-05,
"loss": 0.3271849453449249,
"num_tokens": 338694981.0,
"step": 375
},
{
"epoch": 2.8059701492537314,
"grad_norm": 0.2777009652008523,
"learning_rate": 1.2638172868859015e-05,
"loss": 0.31704217195510864,
"num_tokens": 339501927.0,
"step": 376
},
{
"epoch": 2.8134328358208958,
"grad_norm": 0.30766613572700274,
"learning_rate": 1.2570606668945877e-05,
"loss": 0.35138726234436035,
"num_tokens": 340423876.0,
"step": 377
},
{
"epoch": 2.8208955223880596,
"grad_norm": 0.24806225558937847,
"learning_rate": 1.2503147837144702e-05,
"loss": 0.31420814990997314,
"num_tokens": 341285598.0,
"step": 378
},
{
"epoch": 2.828358208955224,
"grad_norm": 0.2847378795760287,
"learning_rate": 1.2435798216724094e-05,
"loss": 0.32901105284690857,
"num_tokens": 342213168.0,
"step": 379
},
{
"epoch": 2.835820895522388,
"grad_norm": 0.27949658273947187,
"learning_rate": 1.2368559647968544e-05,
"loss": 0.34290027618408203,
"num_tokens": 343216441.0,
"step": 380
},
{
"epoch": 2.843283582089552,
"grad_norm": 0.27303724081659647,
"learning_rate": 1.2301433968128127e-05,
"loss": 0.3377082645893097,
"num_tokens": 344164273.0,
"step": 381
},
{
"epoch": 2.8507462686567164,
"grad_norm": 0.28475093005317836,
"learning_rate": 1.2234423011368292e-05,
"loss": 0.3300044536590576,
"num_tokens": 345034929.0,
"step": 382
},
{
"epoch": 2.8582089552238807,
"grad_norm": 0.280519214961473,
"learning_rate": 1.2167528608719768e-05,
"loss": 0.3426816463470459,
"num_tokens": 345822215.0,
"step": 383
},
{
"epoch": 2.8656716417910446,
"grad_norm": 0.27103397464423407,
"learning_rate": 1.2100752588028507e-05,
"loss": 0.33561939001083374,
"num_tokens": 346779144.0,
"step": 384
},
{
"epoch": 2.873134328358209,
"grad_norm": 0.26428076882187357,
"learning_rate": 1.2034096773905753e-05,
"loss": 0.3420035243034363,
"num_tokens": 347750581.0,
"step": 385
},
{
"epoch": 2.8805970149253732,
"grad_norm": 0.30501014590148545,
"learning_rate": 1.196756298767819e-05,
"loss": 0.33400657773017883,
"num_tokens": 348809613.0,
"step": 386
},
{
"epoch": 2.888059701492537,
"grad_norm": 0.24697890321618382,
"learning_rate": 1.1901153047338168e-05,
"loss": 0.3329269289970398,
"num_tokens": 349843341.0,
"step": 387
},
{
"epoch": 2.8955223880597014,
"grad_norm": 0.266310278361214,
"learning_rate": 1.1834868767494028e-05,
"loss": 0.3315233588218689,
"num_tokens": 350686011.0,
"step": 388
},
{
"epoch": 2.9029850746268657,
"grad_norm": 0.25685951719776035,
"learning_rate": 1.1768711959320512e-05,
"loss": 0.3367440104484558,
"num_tokens": 351603297.0,
"step": 389
},
{
"epoch": 2.91044776119403,
"grad_norm": 0.3604332672305553,
"learning_rate": 1.1702684430509298e-05,
"loss": 0.35349708795547485,
"num_tokens": 352566195.0,
"step": 390
},
{
"epoch": 2.917910447761194,
"grad_norm": 0.2935692512218851,
"learning_rate": 1.1636787985219572e-05,
"loss": 0.3288194537162781,
"num_tokens": 353185236.0,
"step": 391
},
{
"epoch": 2.925373134328358,
"grad_norm": 0.2815859488295857,
"learning_rate": 1.1571024424028761e-05,
"loss": 0.339729905128479,
"num_tokens": 354050628.0,
"step": 392
},
{
"epoch": 2.9328358208955225,
"grad_norm": 0.275808180586563,
"learning_rate": 1.1505395543883313e-05,
"loss": 0.3455864489078522,
"num_tokens": 354968219.0,
"step": 393
},
{
"epoch": 2.9402985074626864,
"grad_norm": 0.25420785215211034,
"learning_rate": 1.143990313804961e-05,
"loss": 0.33193981647491455,
"num_tokens": 355907268.0,
"step": 394
},
{
"epoch": 2.9477611940298507,
"grad_norm": 0.2700179741152324,
"learning_rate": 1.1374548996064953e-05,
"loss": 0.32135769724845886,
"num_tokens": 356786243.0,
"step": 395
},
{
"epoch": 2.955223880597015,
"grad_norm": 0.2881492550060451,
"learning_rate": 1.1309334903688686e-05,
"loss": 0.33170467615127563,
"num_tokens": 357742891.0,
"step": 396
},
{
"epoch": 2.9626865671641793,
"grad_norm": 0.267258769627609,
"learning_rate": 1.1244262642853383e-05,
"loss": 0.3263099193572998,
"num_tokens": 358521016.0,
"step": 397
},
{
"epoch": 2.970149253731343,
"grad_norm": 0.27461845486227027,
"learning_rate": 1.1179333991616162e-05,
"loss": 0.31942278146743774,
"num_tokens": 359455120.0,
"step": 398
},
{
"epoch": 2.9776119402985075,
"grad_norm": 0.28304959654627004,
"learning_rate": 1.1114550724110105e-05,
"loss": 0.3328409790992737,
"num_tokens": 360361804.0,
"step": 399
},
{
"epoch": 2.9850746268656714,
"grad_norm": 0.25788972512908753,
"learning_rate": 1.1049914610495772e-05,
"loss": 0.3321342468261719,
"num_tokens": 361424683.0,
"step": 400
},
{
"epoch": 2.9925373134328357,
"grad_norm": 0.28331510950003724,
"learning_rate": 1.0985427416912853e-05,
"loss": 0.33656731247901917,
"num_tokens": 362323989.0,
"step": 401
},
{
"epoch": 3.0,
"grad_norm": 0.3020936447432793,
"learning_rate": 1.0921090905431871e-05,
"loss": 0.33412468433380127,
"num_tokens": 363125328.0,
"step": 402
},
{
"epoch": 3.0074626865671643,
"grad_norm": 0.3799465813612812,
"learning_rate": 1.0856906834006088e-05,
"loss": 0.2873135805130005,
"num_tokens": 363894208.0,
"step": 403
},
{
"epoch": 3.014925373134328,
"grad_norm": 0.36967050447420124,
"learning_rate": 1.079287695642342e-05,
"loss": 0.2959785461425781,
"num_tokens": 364737491.0,
"step": 404
},
{
"epoch": 3.0223880597014925,
"grad_norm": 0.29461417590711114,
"learning_rate": 1.0729003022258542e-05,
"loss": 0.29170793294906616,
"num_tokens": 365722761.0,
"step": 405
},
{
"epoch": 3.029850746268657,
"grad_norm": 0.4116803087373601,
"learning_rate": 1.0665286776825081e-05,
"loss": 0.30883458256721497,
"num_tokens": 366512957.0,
"step": 406
},
{
"epoch": 3.0373134328358207,
"grad_norm": 0.46157786514533145,
"learning_rate": 1.0601729961127924e-05,
"loss": 0.30715805292129517,
"num_tokens": 367415626.0,
"step": 407
},
{
"epoch": 3.044776119402985,
"grad_norm": 0.4187156114489574,
"learning_rate": 1.0538334311815627e-05,
"loss": 0.31521543860435486,
"num_tokens": 368197609.0,
"step": 408
},
{
"epoch": 3.0522388059701493,
"grad_norm": 0.3101860773424548,
"learning_rate": 1.0475101561133e-05,
"loss": 0.2965121269226074,
"num_tokens": 369065047.0,
"step": 409
},
{
"epoch": 3.0597014925373136,
"grad_norm": 0.33242255182112,
"learning_rate": 1.0412033436873744e-05,
"loss": 0.2895386815071106,
"num_tokens": 370031828.0,
"step": 410
},
{
"epoch": 3.0671641791044775,
"grad_norm": 0.33876504585540845,
"learning_rate": 1.0349131662333255e-05,
"loss": 0.3026469647884369,
"num_tokens": 370964850.0,
"step": 411
},
{
"epoch": 3.074626865671642,
"grad_norm": 0.30688475151658845,
"learning_rate": 1.0286397956261533e-05,
"loss": 0.2771751582622528,
"num_tokens": 371789883.0,
"step": 412
},
{
"epoch": 3.082089552238806,
"grad_norm": 0.29360163117574556,
"learning_rate": 1.0223834032816198e-05,
"loss": 0.3152085840702057,
"num_tokens": 372663206.0,
"step": 413
},
{
"epoch": 3.08955223880597,
"grad_norm": 0.31015233222700656,
"learning_rate": 1.0161441601515695e-05,
"loss": 0.2951708137989044,
"num_tokens": 373488698.0,
"step": 414
},
{
"epoch": 3.0970149253731343,
"grad_norm": 0.3610813886577992,
"learning_rate": 1.0099222367192547e-05,
"loss": 0.3165642321109772,
"num_tokens": 374309008.0,
"step": 415
},
{
"epoch": 3.1044776119402986,
"grad_norm": 0.3051632607105892,
"learning_rate": 1.0037178029946785e-05,
"loss": 0.2940051853656769,
"num_tokens": 375243569.0,
"step": 416
},
{
"epoch": 3.111940298507463,
"grad_norm": 0.28089972063969026,
"learning_rate": 9.975310285099484e-06,
"loss": 0.30177193880081177,
"num_tokens": 376203415.0,
"step": 417
},
{
"epoch": 3.1194029850746268,
"grad_norm": 0.25054754752277697,
"learning_rate": 9.913620823146451e-06,
"loss": 0.2875446081161499,
"num_tokens": 377153421.0,
"step": 418
},
{
"epoch": 3.126865671641791,
"grad_norm": 0.26784113270221377,
"learning_rate": 9.852111329712039e-06,
"loss": 0.30190229415893555,
"num_tokens": 378087276.0,
"step": 419
},
{
"epoch": 3.1343283582089554,
"grad_norm": 0.2708038955849966,
"learning_rate": 9.790783485503063e-06,
"loss": 0.27638930082321167,
"num_tokens": 378977281.0,
"step": 420
},
{
"epoch": 3.1417910447761193,
"grad_norm": 0.29895669435540717,
"learning_rate": 9.729638966262907e-06,
"loss": 0.29848071932792664,
"num_tokens": 379899880.0,
"step": 421
},
{
"epoch": 3.1492537313432836,
"grad_norm": 0.2554802919348738,
"learning_rate": 9.668679442725697e-06,
"loss": 0.27390313148498535,
"num_tokens": 380749969.0,
"step": 422
},
{
"epoch": 3.156716417910448,
"grad_norm": 0.3050926022188745,
"learning_rate": 9.607906580570695e-06,
"loss": 0.2757868468761444,
"num_tokens": 381625559.0,
"step": 423
},
{
"epoch": 3.1641791044776117,
"grad_norm": 0.2679289456473412,
"learning_rate": 9.54732204037675e-06,
"loss": 0.284029483795166,
"num_tokens": 382524515.0,
"step": 424
},
{
"epoch": 3.171641791044776,
"grad_norm": 0.3337923481748472,
"learning_rate": 9.486927477576948e-06,
"loss": 0.2807900905609131,
"num_tokens": 383460945.0,
"step": 425
},
{
"epoch": 3.1791044776119404,
"grad_norm": 0.26612009858515995,
"learning_rate": 9.426724542413345e-06,
"loss": 0.273318886756897,
"num_tokens": 384264130.0,
"step": 426
},
{
"epoch": 3.1865671641791047,
"grad_norm": 0.3524087992151756,
"learning_rate": 9.366714879891915e-06,
"loss": 0.3047345280647278,
"num_tokens": 385268579.0,
"step": 427
},
{
"epoch": 3.1940298507462686,
"grad_norm": 0.27293635594516935,
"learning_rate": 9.306900129737579e-06,
"loss": 0.2729998230934143,
"num_tokens": 386028916.0,
"step": 428
},
{
"epoch": 3.201492537313433,
"grad_norm": 0.2758767208976419,
"learning_rate": 9.2472819263494e-06,
"loss": 0.2999764680862427,
"num_tokens": 386990098.0,
"step": 429
},
{
"epoch": 3.208955223880597,
"grad_norm": 0.2600988814863155,
"learning_rate": 9.187861898755944e-06,
"loss": 0.28329452872276306,
"num_tokens": 387863679.0,
"step": 430
},
{
"epoch": 3.216417910447761,
"grad_norm": 0.2692034268302841,
"learning_rate": 9.128641670570722e-06,
"loss": 0.29894596338272095,
"num_tokens": 388670162.0,
"step": 431
},
{
"epoch": 3.2238805970149254,
"grad_norm": 0.26505479450449865,
"learning_rate": 9.069622859947886e-06,
"loss": 0.28377240896224976,
"num_tokens": 389520124.0,
"step": 432
},
{
"epoch": 3.2313432835820897,
"grad_norm": 0.27522585261540833,
"learning_rate": 9.010807079537969e-06,
"loss": 0.30390995740890503,
"num_tokens": 390462131.0,
"step": 433
},
{
"epoch": 3.2388059701492535,
"grad_norm": 0.2716269898933067,
"learning_rate": 8.952195936443843e-06,
"loss": 0.28739655017852783,
"num_tokens": 391294528.0,
"step": 434
},
{
"epoch": 3.246268656716418,
"grad_norm": 0.24102893105996573,
"learning_rate": 8.893791032176798e-06,
"loss": 0.27352797985076904,
"num_tokens": 392296640.0,
"step": 435
},
{
"epoch": 3.253731343283582,
"grad_norm": 0.2648291528044443,
"learning_rate": 8.835593962612773e-06,
"loss": 0.2909316122531891,
"num_tokens": 393156418.0,
"step": 436
},
{
"epoch": 3.2611940298507465,
"grad_norm": 0.28306499151263154,
"learning_rate": 8.777606317948772e-06,
"loss": 0.2992030084133148,
"num_tokens": 394033667.0,
"step": 437
},
{
"epoch": 3.2686567164179103,
"grad_norm": 0.2716112435631178,
"learning_rate": 8.719829682659399e-06,
"loss": 0.2813768982887268,
"num_tokens": 394903535.0,
"step": 438
},
{
"epoch": 3.2761194029850746,
"grad_norm": 0.27512395684649504,
"learning_rate": 8.662265635453547e-06,
"loss": 0.29536497592926025,
"num_tokens": 395846549.0,
"step": 439
},
{
"epoch": 3.283582089552239,
"grad_norm": 0.2989586716757814,
"learning_rate": 8.604915749231298e-06,
"loss": 0.2988872826099396,
"num_tokens": 396737205.0,
"step": 440
},
{
"epoch": 3.291044776119403,
"grad_norm": 0.2883428566171811,
"learning_rate": 8.54778159104092e-06,
"loss": 0.2863343358039856,
"num_tokens": 397630057.0,
"step": 441
},
{
"epoch": 3.298507462686567,
"grad_norm": 0.28465714524566543,
"learning_rate": 8.490864722036045e-06,
"loss": 0.29591017961502075,
"num_tokens": 398582978.0,
"step": 442
},
{
"epoch": 3.3059701492537314,
"grad_norm": 0.27001645219378384,
"learning_rate": 8.434166697433034e-06,
"loss": 0.28916236758232117,
"num_tokens": 399421334.0,
"step": 443
},
{
"epoch": 3.3134328358208958,
"grad_norm": 0.2636502279817121,
"learning_rate": 8.377689066468452e-06,
"loss": 0.2919909954071045,
"num_tokens": 400404286.0,
"step": 444
},
{
"epoch": 3.3208955223880596,
"grad_norm": 0.26433758679315084,
"learning_rate": 8.321433372356756e-06,
"loss": 0.29081422090530396,
"num_tokens": 401357244.0,
"step": 445
},
{
"epoch": 3.328358208955224,
"grad_norm": 0.2615424811114582,
"learning_rate": 8.26540115224813e-06,
"loss": 0.29471999406814575,
"num_tokens": 402272150.0,
"step": 446
},
{
"epoch": 3.3358208955223883,
"grad_norm": 0.2774834257770196,
"learning_rate": 8.209593937186475e-06,
"loss": 0.3036431670188904,
"num_tokens": 403288360.0,
"step": 447
},
{
"epoch": 3.343283582089552,
"grad_norm": 0.2615528181357639,
"learning_rate": 8.154013252067565e-06,
"loss": 0.28283798694610596,
"num_tokens": 404201834.0,
"step": 448
},
{
"epoch": 3.3507462686567164,
"grad_norm": 0.26020037614375063,
"learning_rate": 8.098660615597401e-06,
"loss": 0.2982422113418579,
"num_tokens": 405227526.0,
"step": 449
},
{
"epoch": 3.3582089552238807,
"grad_norm": 0.24108662753109747,
"learning_rate": 8.043537540250705e-06,
"loss": 0.2861343026161194,
"num_tokens": 406200774.0,
"step": 450
},
{
"epoch": 3.3656716417910446,
"grad_norm": 0.2739099235918602,
"learning_rate": 7.988645532229581e-06,
"loss": 0.2993728816509247,
"num_tokens": 407124735.0,
"step": 451
},
{
"epoch": 3.373134328358209,
"grad_norm": 0.247255433646386,
"learning_rate": 7.933986091422379e-06,
"loss": 0.26630109548568726,
"num_tokens": 407967520.0,
"step": 452
},
{
"epoch": 3.3805970149253732,
"grad_norm": 0.26491478815853126,
"learning_rate": 7.879560711362696e-06,
"loss": 0.2873428463935852,
"num_tokens": 408873357.0,
"step": 453
},
{
"epoch": 3.388059701492537,
"grad_norm": 0.2894069285105355,
"learning_rate": 7.825370879188569e-06,
"loss": 0.28855782747268677,
"num_tokens": 409780883.0,
"step": 454
},
{
"epoch": 3.3955223880597014,
"grad_norm": 0.2492281531289753,
"learning_rate": 7.771418075601852e-06,
"loss": 0.28437167406082153,
"num_tokens": 410746305.0,
"step": 455
},
{
"epoch": 3.4029850746268657,
"grad_norm": 0.24355852796907684,
"learning_rate": 7.71770377482774e-06,
"loss": 0.27994096279144287,
"num_tokens": 411680660.0,
"step": 456
},
{
"epoch": 3.41044776119403,
"grad_norm": 0.2700214986931186,
"learning_rate": 7.664229444574492e-06,
"loss": 0.2921644449234009,
"num_tokens": 412605533.0,
"step": 457
},
{
"epoch": 3.417910447761194,
"grad_norm": 0.26147389049318864,
"learning_rate": 7.610996545993334e-06,
"loss": 0.2780182957649231,
"num_tokens": 413578521.0,
"step": 458
},
{
"epoch": 3.425373134328358,
"grad_norm": 0.29826386143822425,
"learning_rate": 7.558006533638531e-06,
"loss": 0.2961535155773163,
"num_tokens": 414502174.0,
"step": 459
},
{
"epoch": 3.4328358208955225,
"grad_norm": 0.3037556792472721,
"learning_rate": 7.505260855427631e-06,
"loss": 0.2871173024177551,
"num_tokens": 415404496.0,
"step": 460
},
{
"epoch": 3.4402985074626864,
"grad_norm": 0.27538452886466275,
"learning_rate": 7.452760952601926e-06,
"loss": 0.29723048210144043,
"num_tokens": 416329218.0,
"step": 461
},
{
"epoch": 3.4477611940298507,
"grad_norm": 0.27152400208894184,
"learning_rate": 7.400508259687034e-06,
"loss": 0.28178274631500244,
"num_tokens": 417169036.0,
"step": 462
},
{
"epoch": 3.455223880597015,
"grad_norm": 0.26056261717807916,
"learning_rate": 7.3485042044537425e-06,
"loss": 0.28464025259017944,
"num_tokens": 418088260.0,
"step": 463
},
{
"epoch": 3.4626865671641793,
"grad_norm": 0.27386142278491205,
"learning_rate": 7.296750207878967e-06,
"loss": 0.29148146510124207,
"num_tokens": 418913562.0,
"step": 464
},
{
"epoch": 3.470149253731343,
"grad_norm": 0.2854422365181894,
"learning_rate": 7.2452476841069365e-06,
"loss": 0.30438661575317383,
"num_tokens": 419816417.0,
"step": 465
},
{
"epoch": 3.4776119402985075,
"grad_norm": 0.2555489664737189,
"learning_rate": 7.193998040410553e-06,
"loss": 0.3044406771659851,
"num_tokens": 420724216.0,
"step": 466
},
{
"epoch": 3.485074626865672,
"grad_norm": 0.25726240443773163,
"learning_rate": 7.143002677152923e-06,
"loss": 0.28696900606155396,
"num_tokens": 421711967.0,
"step": 467
},
{
"epoch": 3.4925373134328357,
"grad_norm": 0.23933852458046884,
"learning_rate": 7.092262987749115e-06,
"loss": 0.28907179832458496,
"num_tokens": 422655373.0,
"step": 468
},
{
"epoch": 3.5,
"grad_norm": 0.25521858687923527,
"learning_rate": 7.041780358628076e-06,
"loss": 0.2952384948730469,
"num_tokens": 423645388.0,
"step": 469
},
{
"epoch": 3.5074626865671643,
"grad_norm": 0.2611107763482694,
"learning_rate": 6.991556169194752e-06,
"loss": 0.29364389181137085,
"num_tokens": 424569069.0,
"step": 470
},
{
"epoch": 3.5149253731343286,
"grad_norm": 0.25648032981059116,
"learning_rate": 6.941591791792378e-06,
"loss": 0.29367825388908386,
"num_tokens": 425492189.0,
"step": 471
},
{
"epoch": 3.5223880597014925,
"grad_norm": 0.2622850225964596,
"learning_rate": 6.8918885916650105e-06,
"loss": 0.29189831018447876,
"num_tokens": 426458959.0,
"step": 472
},
{
"epoch": 3.529850746268657,
"grad_norm": 0.2570163822819945,
"learning_rate": 6.842447926920199e-06,
"loss": 0.2819617688655853,
"num_tokens": 427323105.0,
"step": 473
},
{
"epoch": 3.5373134328358207,
"grad_norm": 0.29549281458059196,
"learning_rate": 6.793271148491887e-06,
"loss": 0.303572416305542,
"num_tokens": 428282716.0,
"step": 474
},
{
"epoch": 3.544776119402985,
"grad_norm": 0.2802879385897247,
"learning_rate": 6.7443596001035025e-06,
"loss": 0.31654465198516846,
"num_tokens": 429252409.0,
"step": 475
},
{
"epoch": 3.5522388059701493,
"grad_norm": 0.24794767420280742,
"learning_rate": 6.6957146182312175e-06,
"loss": 0.2985188364982605,
"num_tokens": 430179989.0,
"step": 476
},
{
"epoch": 3.5597014925373136,
"grad_norm": 0.26530731793672097,
"learning_rate": 6.647337532067467e-06,
"loss": 0.2864232063293457,
"num_tokens": 431131078.0,
"step": 477
},
{
"epoch": 3.5671641791044775,
"grad_norm": 0.26007394216176255,
"learning_rate": 6.599229663484598e-06,
"loss": 0.31048181653022766,
"num_tokens": 432118357.0,
"step": 478
},
{
"epoch": 3.574626865671642,
"grad_norm": 0.2561363907813142,
"learning_rate": 6.551392326998776e-06,
"loss": 0.29227665066719055,
"num_tokens": 432981468.0,
"step": 479
},
{
"epoch": 3.582089552238806,
"grad_norm": 0.2560064424004792,
"learning_rate": 6.503826829734035e-06,
"loss": 0.2897188663482666,
"num_tokens": 433892112.0,
"step": 480
},
{
"epoch": 3.58955223880597,
"grad_norm": 0.25775269914474314,
"learning_rate": 6.456534471386594e-06,
"loss": 0.2899354100227356,
"num_tokens": 434806511.0,
"step": 481
},
{
"epoch": 3.5970149253731343,
"grad_norm": 0.2763280796455374,
"learning_rate": 6.409516544189322e-06,
"loss": 0.294207900762558,
"num_tokens": 435709840.0,
"step": 482
},
{
"epoch": 3.6044776119402986,
"grad_norm": 0.26039260807281084,
"learning_rate": 6.362774332876438e-06,
"loss": 0.2990114390850067,
"num_tokens": 436640115.0,
"step": 483
},
{
"epoch": 3.611940298507463,
"grad_norm": 0.2527009190257145,
"learning_rate": 6.316309114648409e-06,
"loss": 0.2699679732322693,
"num_tokens": 437494545.0,
"step": 484
},
{
"epoch": 3.6194029850746268,
"grad_norm": 0.2736279490033172,
"learning_rate": 6.270122159137033e-06,
"loss": 0.2987067401409149,
"num_tokens": 438288906.0,
"step": 485
},
{
"epoch": 3.626865671641791,
"grad_norm": 0.2789173799100359,
"learning_rate": 6.2242147283707714e-06,
"loss": 0.3188440203666687,
"num_tokens": 439102139.0,
"step": 486
},
{
"epoch": 3.6343283582089554,
"grad_norm": 0.24253033921125058,
"learning_rate": 6.178588076740253e-06,
"loss": 0.2938775420188904,
"num_tokens": 439996247.0,
"step": 487
},
{
"epoch": 3.6417910447761193,
"grad_norm": 0.2638488313889946,
"learning_rate": 6.133243450964005e-06,
"loss": 0.299264132976532,
"num_tokens": 440863036.0,
"step": 488
},
{
"epoch": 3.6492537313432836,
"grad_norm": 0.2862685346779505,
"learning_rate": 6.088182090054364e-06,
"loss": 0.29607367515563965,
"num_tokens": 441618331.0,
"step": 489
},
{
"epoch": 3.656716417910448,
"grad_norm": 0.27289917809516917,
"learning_rate": 6.043405225283654e-06,
"loss": 0.2921777367591858,
"num_tokens": 442361717.0,
"step": 490
},
{
"epoch": 3.664179104477612,
"grad_norm": 0.2537367234076187,
"learning_rate": 5.998914080150525e-06,
"loss": 0.2836867570877075,
"num_tokens": 443313769.0,
"step": 491
},
{
"epoch": 3.671641791044776,
"grad_norm": 0.26170955211126384,
"learning_rate": 5.9547098703465215e-06,
"loss": 0.30563318729400635,
"num_tokens": 444314596.0,
"step": 492
},
{
"epoch": 3.6791044776119404,
"grad_norm": 0.2693301068014662,
"learning_rate": 5.910793803722873e-06,
"loss": 0.29311275482177734,
"num_tokens": 445237263.0,
"step": 493
},
{
"epoch": 3.6865671641791042,
"grad_norm": 0.27762349435994677,
"learning_rate": 5.867167080257471e-06,
"loss": 0.29791638255119324,
"num_tokens": 446151590.0,
"step": 494
},
{
"epoch": 3.6940298507462686,
"grad_norm": 0.2701314496245139,
"learning_rate": 5.823830892022107e-06,
"loss": 0.3165101408958435,
"num_tokens": 447040490.0,
"step": 495
},
{
"epoch": 3.701492537313433,
"grad_norm": 0.27274243095008927,
"learning_rate": 5.780786423149879e-06,
"loss": 0.32390397787094116,
"num_tokens": 447930938.0,
"step": 496
},
{
"epoch": 3.708955223880597,
"grad_norm": 0.3607925974135692,
"learning_rate": 5.738034849802852e-06,
"loss": 0.2941335439682007,
"num_tokens": 448795073.0,
"step": 497
},
{
"epoch": 3.716417910447761,
"grad_norm": 0.27114314101622733,
"learning_rate": 5.695577340139905e-06,
"loss": 0.29179757833480835,
"num_tokens": 449748272.0,
"step": 498
},
{
"epoch": 3.7238805970149254,
"grad_norm": 0.2676978851481763,
"learning_rate": 5.653415054284816e-06,
"loss": 0.30068930983543396,
"num_tokens": 450716521.0,
"step": 499
},
{
"epoch": 3.7313432835820897,
"grad_norm": 0.26294336472483293,
"learning_rate": 5.611549144294568e-06,
"loss": 0.2907962203025818,
"num_tokens": 451536750.0,
"step": 500
},
{
"epoch": 3.7388059701492535,
"grad_norm": 0.2546485590984235,
"learning_rate": 5.569980754127872e-06,
"loss": 0.2873173952102661,
"num_tokens": 452509967.0,
"step": 501
},
{
"epoch": 3.746268656716418,
"grad_norm": 0.24938478382421467,
"learning_rate": 5.5287110196138985e-06,
"loss": 0.2843964993953705,
"num_tokens": 453499953.0,
"step": 502
},
{
"epoch": 3.753731343283582,
"grad_norm": 0.24634142699097625,
"learning_rate": 5.487741068421242e-06,
"loss": 0.295748770236969,
"num_tokens": 454428619.0,
"step": 503
},
{
"epoch": 3.7611940298507465,
"grad_norm": 0.24888257131984212,
"learning_rate": 5.447072020027122e-06,
"loss": 0.2946910858154297,
"num_tokens": 455343533.0,
"step": 504
},
{
"epoch": 3.7686567164179103,
"grad_norm": 0.2368545844700678,
"learning_rate": 5.406704985686782e-06,
"loss": 0.27735936641693115,
"num_tokens": 456246016.0,
"step": 505
},
{
"epoch": 3.7761194029850746,
"grad_norm": 0.27241818855184635,
"learning_rate": 5.366641068403126e-06,
"loss": 0.3016122579574585,
"num_tokens": 457104506.0,
"step": 506
},
{
"epoch": 3.783582089552239,
"grad_norm": 0.2590785342630335,
"learning_rate": 5.326881362896588e-06,
"loss": 0.3151727020740509,
"num_tokens": 458003785.0,
"step": 507
},
{
"epoch": 3.791044776119403,
"grad_norm": 0.25242642100086654,
"learning_rate": 5.287426955575205e-06,
"loss": 0.2941104769706726,
"num_tokens": 458840614.0,
"step": 508
},
{
"epoch": 3.798507462686567,
"grad_norm": 0.25216547714604487,
"learning_rate": 5.24827892450494e-06,
"loss": 0.28807011246681213,
"num_tokens": 459707587.0,
"step": 509
},
{
"epoch": 3.8059701492537314,
"grad_norm": 0.23923122289508578,
"learning_rate": 5.209438339380242e-06,
"loss": 0.2823304533958435,
"num_tokens": 460629686.0,
"step": 510
},
{
"epoch": 3.8134328358208958,
"grad_norm": 0.2657855590710968,
"learning_rate": 5.170906261494776e-06,
"loss": 0.2919255197048187,
"num_tokens": 461544147.0,
"step": 511
},
{
"epoch": 3.8208955223880596,
"grad_norm": 0.2626472514066274,
"learning_rate": 5.132683743712462e-06,
"loss": 0.29430970549583435,
"num_tokens": 462477850.0,
"step": 512
},
{
"epoch": 3.828358208955224,
"grad_norm": 0.2566886419848628,
"learning_rate": 5.094771830438689e-06,
"loss": 0.2987692952156067,
"num_tokens": 463412060.0,
"step": 513
},
{
"epoch": 3.835820895522388,
"grad_norm": 0.2484881857541934,
"learning_rate": 5.057171557591777e-06,
"loss": 0.2915360927581787,
"num_tokens": 464308740.0,
"step": 514
},
{
"epoch": 3.843283582089552,
"grad_norm": 0.24381344201474844,
"learning_rate": 5.019883952574686e-06,
"loss": 0.28436267375946045,
"num_tokens": 465265384.0,
"step": 515
},
{
"epoch": 3.8507462686567164,
"grad_norm": 0.2481908860876439,
"learning_rate": 4.98291003424691e-06,
"loss": 0.28611573576927185,
"num_tokens": 466226494.0,
"step": 516
},
{
"epoch": 3.8582089552238807,
"grad_norm": 0.23871319999253146,
"learning_rate": 4.946250812896678e-06,
"loss": 0.2998065948486328,
"num_tokens": 467259239.0,
"step": 517
},
{
"epoch": 3.8656716417910446,
"grad_norm": 0.24799658125418186,
"learning_rate": 4.909907290213321e-06,
"loss": 0.2929803729057312,
"num_tokens": 468142586.0,
"step": 518
},
{
"epoch": 3.873134328358209,
"grad_norm": 0.2570907705948353,
"learning_rate": 4.873880459259913e-06,
"loss": 0.2957007884979248,
"num_tokens": 468951581.0,
"step": 519
},
{
"epoch": 3.8805970149253732,
"grad_norm": 0.2616713052030643,
"learning_rate": 4.838171304446129e-06,
"loss": 0.3021651804447174,
"num_tokens": 469861165.0,
"step": 520
},
{
"epoch": 3.888059701492537,
"grad_norm": 0.3010016938124609,
"learning_rate": 4.80278080150135e-06,
"loss": 0.308903306722641,
"num_tokens": 470804718.0,
"step": 521
},
{
"epoch": 3.8955223880597014,
"grad_norm": 0.24926265212747228,
"learning_rate": 4.767709917448009e-06,
"loss": 0.30023178458213806,
"num_tokens": 471749228.0,
"step": 522
},
{
"epoch": 3.9029850746268657,
"grad_norm": 0.24761743456611565,
"learning_rate": 4.732959610575154e-06,
"loss": 0.2946227788925171,
"num_tokens": 472698930.0,
"step": 523
},
{
"epoch": 3.91044776119403,
"grad_norm": 0.22825850031566985,
"learning_rate": 4.698530830412276e-06,
"loss": 0.2835308611392975,
"num_tokens": 473565553.0,
"step": 524
},
{
"epoch": 3.917910447761194,
"grad_norm": 0.25239434326625193,
"learning_rate": 4.664424517703353e-06,
"loss": 0.3003775477409363,
"num_tokens": 474425434.0,
"step": 525
},
{
"epoch": 3.925373134328358,
"grad_norm": 0.24946733272255223,
"learning_rate": 4.630641604381151e-06,
"loss": 0.3032747507095337,
"num_tokens": 475400550.0,
"step": 526
},
{
"epoch": 3.9328358208955225,
"grad_norm": 0.23900706286857004,
"learning_rate": 4.597183013541764e-06,
"loss": 0.3009137809276581,
"num_tokens": 476322074.0,
"step": 527
},
{
"epoch": 3.9402985074626864,
"grad_norm": 0.23647944717460595,
"learning_rate": 4.564049659419379e-06,
"loss": 0.2712666392326355,
"num_tokens": 477127686.0,
"step": 528
},
{
"epoch": 3.9477611940298507,
"grad_norm": 0.24461158477231615,
"learning_rate": 4.531242447361308e-06,
"loss": 0.2808017432689667,
"num_tokens": 477992768.0,
"step": 529
},
{
"epoch": 3.955223880597015,
"grad_norm": 0.25868694964779676,
"learning_rate": 4.498762273803233e-06,
"loss": 0.3064419627189636,
"num_tokens": 478818611.0,
"step": 530
},
{
"epoch": 3.9626865671641793,
"grad_norm": 0.2384742960079164,
"learning_rate": 4.4666100262447335e-06,
"loss": 0.28597795963287354,
"num_tokens": 479757992.0,
"step": 531
},
{
"epoch": 3.970149253731343,
"grad_norm": 0.23968331135860904,
"learning_rate": 4.434786583225018e-06,
"loss": 0.28608185052871704,
"num_tokens": 480686770.0,
"step": 532
},
{
"epoch": 3.9776119402985075,
"grad_norm": 0.23668555789215315,
"learning_rate": 4.403292814298932e-06,
"loss": 0.2850901782512665,
"num_tokens": 481556474.0,
"step": 533
},
{
"epoch": 3.9850746268656714,
"grad_norm": 0.25182124727383254,
"learning_rate": 4.372129580013179e-06,
"loss": 0.29344847798347473,
"num_tokens": 482402398.0,
"step": 534
},
{
"epoch": 3.9925373134328357,
"grad_norm": 0.26240005117001564,
"learning_rate": 4.341297731882833e-06,
"loss": 0.28991544246673584,
"num_tokens": 483144226.0,
"step": 535
},
{
"epoch": 4.0,
"grad_norm": 0.2394553338374243,
"learning_rate": 4.31079811236805e-06,
"loss": 0.28979605436325073,
"num_tokens": 484171179.0,
"step": 536
},
{
"epoch": 4.007462686567164,
"grad_norm": 0.33867608284366874,
"learning_rate": 4.280631554851052e-06,
"loss": 0.261859267950058,
"num_tokens": 484964422.0,
"step": 537
},
{
"epoch": 4.014925373134329,
"grad_norm": 0.3387690029521035,
"learning_rate": 4.250798883613371e-06,
"loss": 0.258260041475296,
"num_tokens": 485911398.0,
"step": 538
},
{
"epoch": 4.022388059701493,
"grad_norm": 0.31295687889359947,
"learning_rate": 4.221300913813297e-06,
"loss": 0.26438719034194946,
"num_tokens": 486765516.0,
"step": 539
},
{
"epoch": 4.029850746268656,
"grad_norm": 0.2533693528515132,
"learning_rate": 4.192138451463637e-06,
"loss": 0.26276901364326477,
"num_tokens": 487755450.0,
"step": 540
},
{
"epoch": 4.037313432835821,
"grad_norm": 0.27941924345032165,
"learning_rate": 4.163312293409668e-06,
"loss": 0.2743380069732666,
"num_tokens": 488596501.0,
"step": 541
},
{
"epoch": 4.044776119402985,
"grad_norm": 0.3119420012284113,
"learning_rate": 4.134823227307376e-06,
"loss": 0.27551499009132385,
"num_tokens": 489333987.0,
"step": 542
},
{
"epoch": 4.052238805970149,
"grad_norm": 0.3376015245186099,
"learning_rate": 4.1066720316019176e-06,
"loss": 0.2677218019962311,
"num_tokens": 490271866.0,
"step": 543
},
{
"epoch": 4.059701492537314,
"grad_norm": 0.31476158712266056,
"learning_rate": 4.0788594755063754e-06,
"loss": 0.2655893564224243,
"num_tokens": 491167672.0,
"step": 544
},
{
"epoch": 4.067164179104478,
"grad_norm": 0.2801726675903428,
"learning_rate": 4.051386318980717e-06,
"loss": 0.2636064291000366,
"num_tokens": 492117374.0,
"step": 545
},
{
"epoch": 4.074626865671641,
"grad_norm": 0.2642554446164588,
"learning_rate": 4.024253312711041e-06,
"loss": 0.2632978558540344,
"num_tokens": 493064577.0,
"step": 546
},
{
"epoch": 4.082089552238806,
"grad_norm": 0.2340921814939966,
"learning_rate": 3.99746119808906e-06,
"loss": 0.2561931908130646,
"num_tokens": 494008196.0,
"step": 547
},
{
"epoch": 4.08955223880597,
"grad_norm": 0.24746722464151832,
"learning_rate": 3.971010707191848e-06,
"loss": 0.2665466368198395,
"num_tokens": 495010032.0,
"step": 548
},
{
"epoch": 4.097014925373134,
"grad_norm": 0.28750263458503306,
"learning_rate": 3.9449025627618256e-06,
"loss": 0.2657792568206787,
"num_tokens": 495771485.0,
"step": 549
},
{
"epoch": 4.104477611940299,
"grad_norm": 0.25981920943022424,
"learning_rate": 3.919137478187027e-06,
"loss": 0.2730734050273895,
"num_tokens": 496704001.0,
"step": 550
},
{
"epoch": 4.111940298507463,
"grad_norm": 0.26506589650257595,
"learning_rate": 3.893716157481598e-06,
"loss": 0.26241227984428406,
"num_tokens": 497580217.0,
"step": 551
},
{
"epoch": 4.119402985074627,
"grad_norm": 0.28902536946390145,
"learning_rate": 3.868639295266562e-06,
"loss": 0.27827292680740356,
"num_tokens": 498399947.0,
"step": 552
},
{
"epoch": 4.126865671641791,
"grad_norm": 0.2305613889202318,
"learning_rate": 3.8439075767508304e-06,
"loss": 0.25871434807777405,
"num_tokens": 499337510.0,
"step": 553
},
{
"epoch": 4.134328358208955,
"grad_norm": 0.2543579464580596,
"learning_rate": 3.819521677712498e-06,
"loss": 0.26276665925979614,
"num_tokens": 500211058.0,
"step": 554
},
{
"epoch": 4.141791044776119,
"grad_norm": 0.2603404639875204,
"learning_rate": 3.7954822644803612e-06,
"loss": 0.27976810932159424,
"num_tokens": 501239171.0,
"step": 555
},
{
"epoch": 4.149253731343284,
"grad_norm": 0.24399135581961698,
"learning_rate": 3.7717899939157227e-06,
"loss": 0.2695601284503937,
"num_tokens": 502320140.0,
"step": 556
},
{
"epoch": 4.156716417910448,
"grad_norm": 0.2506194739658917,
"learning_rate": 3.748445513394432e-06,
"loss": 0.2601467967033386,
"num_tokens": 503200601.0,
"step": 557
},
{
"epoch": 4.164179104477612,
"grad_norm": 0.24780405901365382,
"learning_rate": 3.7254494607892062e-06,
"loss": 0.2658926248550415,
"num_tokens": 504111915.0,
"step": 558
},
{
"epoch": 4.1716417910447765,
"grad_norm": 0.25277522040944933,
"learning_rate": 3.7028024644521974e-06,
"loss": 0.26618829369544983,
"num_tokens": 951575.0,
"step": 559
},
{
"epoch": 4.17910447761194,
"grad_norm": 0.260926277591076,
"learning_rate": 3.6805051431978215e-06,
"loss": 0.2764492630958557,
"num_tokens": 1870368.0,
"step": 560
},
{
"epoch": 4.186567164179104,
"grad_norm": 0.24151138917904563,
"learning_rate": 3.6585581062858515e-06,
"loss": 0.26785239577293396,
"num_tokens": 2827046.0,
"step": 561
},
{
"epoch": 4.1940298507462686,
"grad_norm": 0.24384225850500896,
"learning_rate": 3.636961953404763e-06,
"loss": 0.26912403106689453,
"num_tokens": 3739973.0,
"step": 562
},
{
"epoch": 4.201492537313433,
"grad_norm": 0.2916626614705674,
"learning_rate": 3.615717274655364e-06,
"loss": 0.26528483629226685,
"num_tokens": 4518704.0,
"step": 563
},
{
"epoch": 4.208955223880597,
"grad_norm": 0.24960926879350168,
"learning_rate": 3.5948246505346537e-06,
"loss": 0.27783459424972534,
"num_tokens": 5501253.0,
"step": 564
},
{
"epoch": 4.2164179104477615,
"grad_norm": 0.25681267819662723,
"learning_rate": 3.5742846519199715e-06,
"loss": 0.27307459712028503,
"num_tokens": 6402302.0,
"step": 565
},
{
"epoch": 4.223880597014926,
"grad_norm": 0.2412629050166804,
"learning_rate": 3.5540978400533933e-06,
"loss": 0.264928936958313,
"num_tokens": 7296048.0,
"step": 566
},
{
"epoch": 4.231343283582089,
"grad_norm": 0.26007426064530514,
"learning_rate": 3.5342647665263963e-06,
"loss": 0.27285411953926086,
"num_tokens": 8246217.0,
"step": 567
},
{
"epoch": 4.2388059701492535,
"grad_norm": 0.2505447033271199,
"learning_rate": 3.514785973264789e-06,
"loss": 0.2539595663547516,
"num_tokens": 9030493.0,
"step": 568
},
{
"epoch": 4.246268656716418,
"grad_norm": 0.24939677987959621,
"learning_rate": 3.495661992513905e-06,
"loss": 0.273257315158844,
"num_tokens": 9936844.0,
"step": 569
},
{
"epoch": 4.253731343283582,
"grad_norm": 0.25283551407816135,
"learning_rate": 3.476893346824055e-06,
"loss": 0.2572386562824249,
"num_tokens": 10836976.0,
"step": 570
},
{
"epoch": 4.2611940298507465,
"grad_norm": 0.25014049995931353,
"learning_rate": 3.4584805490362493e-06,
"loss": 0.27239200472831726,
"num_tokens": 11812223.0,
"step": 571
},
{
"epoch": 4.268656716417911,
"grad_norm": 0.2565763851261565,
"learning_rate": 3.4404241022681873e-06,
"loss": 0.26448339223861694,
"num_tokens": 12615614.0,
"step": 572
},
{
"epoch": 4.276119402985074,
"grad_norm": 0.25289117775054565,
"learning_rate": 3.42272449990051e-06,
"loss": 0.29063016176223755,
"num_tokens": 13567548.0,
"step": 573
},
{
"epoch": 4.2835820895522385,
"grad_norm": 0.25823345228475075,
"learning_rate": 3.40538222556332e-06,
"loss": 0.27311235666275024,
"num_tokens": 14395131.0,
"step": 574
},
{
"epoch": 4.291044776119403,
"grad_norm": 0.23315641988846117,
"learning_rate": 3.388397753122957e-06,
"loss": 0.25236693024635315,
"num_tokens": 15335598.0,
"step": 575
},
{
"epoch": 4.298507462686567,
"grad_norm": 0.2841401512615274,
"learning_rate": 3.3717715466690624e-06,
"loss": 0.2869318723678589,
"num_tokens": 16179341.0,
"step": 576
},
{
"epoch": 4.3059701492537314,
"grad_norm": 0.25632145802021455,
"learning_rate": 3.3555040605018935e-06,
"loss": 0.26220396161079407,
"num_tokens": 16988671.0,
"step": 577
},
{
"epoch": 4.313432835820896,
"grad_norm": 0.26924823560517036,
"learning_rate": 3.339595739119909e-06,
"loss": 0.28524714708328247,
"num_tokens": 17818903.0,
"step": 578
},
{
"epoch": 4.32089552238806,
"grad_norm": 0.24597376079056055,
"learning_rate": 3.3240470172076226e-06,
"loss": 0.25928568840026855,
"num_tokens": 18686514.0,
"step": 579
},
{
"epoch": 4.3283582089552235,
"grad_norm": 0.2296054299641554,
"learning_rate": 3.3088583196237253e-06,
"loss": 0.2673494219779968,
"num_tokens": 19710461.0,
"step": 580
},
{
"epoch": 4.335820895522388,
"grad_norm": 0.303772974273409,
"learning_rate": 3.294030061389481e-06,
"loss": 0.29324933886528015,
"num_tokens": 20505162.0,
"step": 581
},
{
"epoch": 4.343283582089552,
"grad_norm": 0.24075458576098716,
"learning_rate": 3.2795626476773833e-06,
"loss": 0.2494013011455536,
"num_tokens": 21440460.0,
"step": 582
},
{
"epoch": 4.350746268656716,
"grad_norm": 0.26414061441007297,
"learning_rate": 3.2654564738000822e-06,
"loss": 0.28142672777175903,
"num_tokens": 22250398.0,
"step": 583
},
{
"epoch": 4.358208955223881,
"grad_norm": 0.22985556392550052,
"learning_rate": 3.2517119251995873e-06,
"loss": 0.2574723958969116,
"num_tokens": 23184740.0,
"step": 584
},
{
"epoch": 4.365671641791045,
"grad_norm": 0.2340946834808678,
"learning_rate": 3.2383293774367286e-06,
"loss": 0.262751042842865,
"num_tokens": 24111398.0,
"step": 585
},
{
"epoch": 4.373134328358209,
"grad_norm": 0.2622850655813035,
"learning_rate": 3.225309196180906e-06,
"loss": 0.26962852478027344,
"num_tokens": 24935442.0,
"step": 586
},
{
"epoch": 4.380597014925373,
"grad_norm": 0.30896829245037205,
"learning_rate": 3.212651737200086e-06,
"loss": 0.2718137502670288,
"num_tokens": 25850666.0,
"step": 587
},
{
"epoch": 4.388059701492537,
"grad_norm": 0.24656349669035904,
"learning_rate": 3.200357346351084e-06,
"loss": 0.2535630166530609,
"num_tokens": 26632303.0,
"step": 588
},
{
"epoch": 4.395522388059701,
"grad_norm": 0.2478490440504693,
"learning_rate": 3.188426359570121e-06,
"loss": 0.2648570239543915,
"num_tokens": 27523524.0,
"step": 589
},
{
"epoch": 4.402985074626866,
"grad_norm": 0.2467537368918543,
"learning_rate": 3.176859102863631e-06,
"loss": 0.268078088760376,
"num_tokens": 28364038.0,
"step": 590
},
{
"epoch": 4.41044776119403,
"grad_norm": 0.24022496953126724,
"learning_rate": 3.16565589229937e-06,
"loss": 0.2637268900871277,
"num_tokens": 29254874.0,
"step": 591
},
{
"epoch": 4.417910447761194,
"grad_norm": 0.25520142610516455,
"learning_rate": 3.1548170339977626e-06,
"loss": 0.27608251571655273,
"num_tokens": 30099118.0,
"step": 592
},
{
"epoch": 4.425373134328359,
"grad_norm": 0.25418541542220713,
"learning_rate": 3.144342824123548e-06,
"loss": 0.27403631806373596,
"num_tokens": 30937080.0,
"step": 593
},
{
"epoch": 4.432835820895522,
"grad_norm": 0.32216925679050706,
"learning_rate": 3.134233548877684e-06,
"loss": 0.2749292850494385,
"num_tokens": 31868459.0,
"step": 594
},
{
"epoch": 4.440298507462686,
"grad_norm": 0.23633842342693723,
"learning_rate": 3.1244894844895307e-06,
"loss": 0.26009055972099304,
"num_tokens": 32844776.0,
"step": 595
},
{
"epoch": 4.447761194029851,
"grad_norm": 0.23167846669851885,
"learning_rate": 3.115110897209297e-06,
"loss": 0.25624188780784607,
"num_tokens": 33800215.0,
"step": 596
},
{
"epoch": 4.455223880597015,
"grad_norm": 0.31853695227310724,
"learning_rate": 3.1060980433007674e-06,
"loss": 0.26650676131248474,
"num_tokens": 34652575.0,
"step": 597
},
{
"epoch": 4.462686567164179,
"grad_norm": 0.2474619333740578,
"learning_rate": 3.0974511690342995e-06,
"loss": 0.26506173610687256,
"num_tokens": 35526076.0,
"step": 598
},
{
"epoch": 4.470149253731344,
"grad_norm": 0.2379051889253177,
"learning_rate": 3.089170510680101e-06,
"loss": 0.2590046525001526,
"num_tokens": 36465383.0,
"step": 599
},
{
"epoch": 4.477611940298507,
"grad_norm": 0.24103925822435351,
"learning_rate": 3.0812562945017625e-06,
"loss": 0.26156845688819885,
"num_tokens": 37402609.0,
"step": 600
},
{
"epoch": 4.485074626865671,
"grad_norm": 0.24478825299983173,
"learning_rate": 3.0737087367500848e-06,
"loss": 0.26436761021614075,
"num_tokens": 38372549.0,
"step": 601
},
{
"epoch": 4.492537313432836,
"grad_norm": 0.25588847680085197,
"learning_rate": 3.066528043657163e-06,
"loss": 0.2649264335632324,
"num_tokens": 39258770.0,
"step": 602
},
{
"epoch": 4.5,
"grad_norm": 0.2462865755078873,
"learning_rate": 3.0597144114307577e-06,
"loss": 0.2759783864021301,
"num_tokens": 40167992.0,
"step": 603
},
{
"epoch": 4.507462686567164,
"grad_norm": 0.24099167658546947,
"learning_rate": 3.0532680262489272e-06,
"loss": 0.2647096812725067,
"num_tokens": 41103593.0,
"step": 604
},
{
"epoch": 4.514925373134329,
"grad_norm": 0.26228388440102607,
"learning_rate": 3.047189064254947e-06,
"loss": 0.2846449017524719,
"num_tokens": 41964920.0,
"step": 605
},
{
"epoch": 4.522388059701493,
"grad_norm": 0.24399175109648016,
"learning_rate": 3.0414776915524926e-06,
"loss": 0.2578504979610443,
"num_tokens": 42832698.0,
"step": 606
},
{
"epoch": 4.529850746268656,
"grad_norm": 0.24557003170358038,
"learning_rate": 3.0361340642010974e-06,
"loss": 0.2687520980834961,
"num_tokens": 43751841.0,
"step": 607
},
{
"epoch": 4.537313432835821,
"grad_norm": 0.2457169444504286,
"learning_rate": 3.0311583282119004e-06,
"loss": 0.2654935121536255,
"num_tokens": 44670570.0,
"step": 608
},
{
"epoch": 4.544776119402985,
"grad_norm": 0.23344187463481425,
"learning_rate": 3.026550619543641e-06,
"loss": 0.2680796980857849,
"num_tokens": 45565349.0,
"step": 609
},
{
"epoch": 4.552238805970149,
"grad_norm": 0.2616978203803085,
"learning_rate": 3.0223110640989607e-06,
"loss": 0.2733978033065796,
"num_tokens": 46334877.0,
"step": 610
},
{
"epoch": 4.559701492537314,
"grad_norm": 0.24402710769793126,
"learning_rate": 3.0184397777209436e-06,
"loss": 0.26678377389907837,
"num_tokens": 47197933.0,
"step": 611
},
{
"epoch": 4.567164179104478,
"grad_norm": 0.23596997365787184,
"learning_rate": 3.0149368661899707e-06,
"loss": 0.2666507959365845,
"num_tokens": 48185966.0,
"step": 612
},
{
"epoch": 4.574626865671641,
"grad_norm": 0.26072871992164276,
"learning_rate": 3.0118024252208146e-06,
"loss": 0.2727803587913513,
"num_tokens": 49053041.0,
"step": 613
},
{
"epoch": 4.582089552238806,
"grad_norm": 0.2429680853323204,
"learning_rate": 3.0090365404600324e-06,
"loss": 0.27436989545822144,
"num_tokens": 49972669.0,
"step": 614
},
{
"epoch": 4.58955223880597,
"grad_norm": 0.2492201703405157,
"learning_rate": 3.0066392874836254e-06,
"loss": 0.2650463581085205,
"num_tokens": 50759258.0,
"step": 615
},
{
"epoch": 4.597014925373134,
"grad_norm": 0.23159753484897908,
"learning_rate": 3.004610731794965e-06,
"loss": 0.2537558376789093,
"num_tokens": 51687796.0,
"step": 616
},
{
"epoch": 4.604477611940299,
"grad_norm": 0.23805277832433672,
"learning_rate": 3.002950928823016e-06,
"loss": 0.26197919249534607,
"num_tokens": 52660231.0,
"step": 617
},
{
"epoch": 4.611940298507463,
"grad_norm": 0.24026810337813148,
"learning_rate": 3.001659923920811e-06,
"loss": 0.2531256675720215,
"num_tokens": 53529194.0,
"step": 618
},
{
"epoch": 4.619402985074627,
"grad_norm": 0.258077064890661,
"learning_rate": 3.0007377523642196e-06,
"loss": 0.26511213183403015,
"num_tokens": 54455687.0,
"step": 619
},
{
"epoch": 4.6268656716417915,
"grad_norm": 0.24117669708783973,
"learning_rate": 3.0001844393509754e-06,
"loss": 0.2814059257507324,
"num_tokens": 55475962.0,
"step": 620
},
{
"epoch": 4.6268656716417915,
"step": 620,
"total_flos": 829282868854784.0,
"train_loss": 0.02679153286641644,
"train_runtime": 1845.6941,
"train_samples_per_second": 10.749,
"train_steps_per_second": 0.336
}
],
"logging_steps": 1,
"max_steps": 620,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 62,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 829282868854784.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}