commit 1b73f7ee7a3a87a8cbc1fb0b72c0964439a8825c Author: ModelHub XC Date: Mon Apr 20 19:35:02 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: Josephgflowers/TinyLlama_v1.1_math_code-world-test-1 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..21b3632 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,49 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +tokenizer.json filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..9880a72 --- /dev/null +++ b/README.md @@ -0,0 +1,5 @@ +--- +license: mit +--- +This model is trained off of Josephgflowers/TinyLlama-v1.1-Agent-Rag-Nerd-v1 which is trained off of the tinyllama v1.1. This model was trained using the tinyllama chat format, zephyr. +I trained this model using a vast amount of textbooks are all you need style synthetic datasets as well as some common chat datasets and the Cinder dataset. \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..d8ab457 --- /dev/null +++ b/config.json @@ -0,0 +1,29 @@ +{ + "_name_or_path": "Josephgflowers/TinyLlama-Cinder-Agent-v1", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 5632, + "max_position_embeddings": 2048, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 22, + "num_key_value_heads": 4, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.42.0.dev0", + "use_cache": false, + "vocab_size": 32000 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..bbeeda1 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "text-generation", "allow_remote": true} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..b800caf --- /dev/null +++ b/generation_config.json @@ -0,0 +1,7 @@ +{ + "bos_token_id": 1, + "eos_token_id": 2, + "max_length": 2048, + "pad_token_id": 0, + "transformers_version": "4.42.0.dev0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..30cfb76 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c6bd20809e816078ca78e1c0d6e593c0cef4c9d7734c45f9c7b165a84135eea +size 2200119664 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..492d4b2 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..5d592ae --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf467c9e0f536bda271283c6ef85eb1a943e3196b621c8a912d64953b205df83 +size 1842795 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..1d53d16 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,43 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..4904878 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,5549 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 15767, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001268472125325046, + "grad_norm": 0.8813716769218445, + "learning_rate": 4.993657639373375e-05, + "loss": 0.8575, + "step": 20 + }, + { + "epoch": 0.002536944250650092, + "grad_norm": 0.941986083984375, + "learning_rate": 4.98731527874675e-05, + "loss": 0.8299, + "step": 40 + }, + { + "epoch": 0.003805416375975138, + "grad_norm": 0.8295786380767822, + "learning_rate": 4.980972918120125e-05, + "loss": 0.8491, + "step": 60 + }, + { + "epoch": 0.005073888501300184, + "grad_norm": 0.7794184684753418, + "learning_rate": 4.9749476755248304e-05, + "loss": 0.8294, + "step": 80 + }, + { + "epoch": 0.00634236062662523, + "grad_norm": 0.8349530100822449, + "learning_rate": 4.9686053148982053e-05, + "loss": 0.8443, + "step": 100 + }, + { + "epoch": 0.007610832751950276, + "grad_norm": 0.8651715517044067, + "learning_rate": 4.96226295427158e-05, + "loss": 0.817, + "step": 120 + }, + { + "epoch": 0.008879304877275321, + "grad_norm": 0.8325722217559814, + "learning_rate": 4.955920593644955e-05, + "loss": 0.7915, + "step": 140 + }, + { + "epoch": 0.010147777002600368, + "grad_norm": 0.8690646886825562, + "learning_rate": 4.94957823301833e-05, + "loss": 0.8673, + "step": 160 + }, + { + "epoch": 0.011416249127925413, + "grad_norm": 0.8512411117553711, + "learning_rate": 4.9432358723917043e-05, + "loss": 0.8621, + "step": 180 + }, + { + "epoch": 0.01268472125325046, + "grad_norm": 0.8262362480163574, + "learning_rate": 4.936893511765079e-05, + "loss": 0.8387, + "step": 200 + }, + { + "epoch": 0.013953193378575505, + "grad_norm": 0.8982943892478943, + "learning_rate": 4.930551151138454e-05, + "loss": 0.8199, + "step": 220 + }, + { + "epoch": 0.015221665503900552, + "grad_norm": 1.024151086807251, + "learning_rate": 4.92452590854316e-05, + "loss": 0.8289, + "step": 240 + }, + { + "epoch": 0.016490137629225597, + "grad_norm": 0.8547594547271729, + "learning_rate": 4.918183547916535e-05, + "loss": 0.8457, + "step": 260 + }, + { + "epoch": 0.017758609754550642, + "grad_norm": 0.8753955364227295, + "learning_rate": 4.9118411872899095e-05, + "loss": 0.8213, + "step": 280 + }, + { + "epoch": 0.01902708187987569, + "grad_norm": 0.8638611435890198, + "learning_rate": 4.9054988266632844e-05, + "loss": 0.8542, + "step": 300 + }, + { + "epoch": 0.020295554005200736, + "grad_norm": 0.9179531931877136, + "learning_rate": 4.899156466036659e-05, + "loss": 0.8362, + "step": 320 + }, + { + "epoch": 0.02156402613052578, + "grad_norm": 0.9708409905433655, + "learning_rate": 4.892814105410034e-05, + "loss": 0.8485, + "step": 340 + }, + { + "epoch": 0.022832498255850826, + "grad_norm": 0.9580267667770386, + "learning_rate": 4.8864717447834085e-05, + "loss": 0.8234, + "step": 360 + }, + { + "epoch": 0.024100970381175875, + "grad_norm": 0.8894557356834412, + "learning_rate": 4.8801293841567834e-05, + "loss": 0.8227, + "step": 380 + }, + { + "epoch": 0.02536944250650092, + "grad_norm": 0.8640332221984863, + "learning_rate": 4.873787023530158e-05, + "loss": 0.8695, + "step": 400 + }, + { + "epoch": 0.026637914631825965, + "grad_norm": 0.9173359870910645, + "learning_rate": 4.867444662903533e-05, + "loss": 0.8427, + "step": 420 + }, + { + "epoch": 0.02790638675715101, + "grad_norm": 0.8366764187812805, + "learning_rate": 4.8611023022769075e-05, + "loss": 0.8248, + "step": 440 + }, + { + "epoch": 0.02917485888247606, + "grad_norm": 0.801325798034668, + "learning_rate": 4.8547599416502824e-05, + "loss": 0.8649, + "step": 460 + }, + { + "epoch": 0.030443331007801104, + "grad_norm": 1.0194541215896606, + "learning_rate": 4.848417581023657e-05, + "loss": 0.7695, + "step": 480 + }, + { + "epoch": 0.03171180313312615, + "grad_norm": 0.8728939294815063, + "learning_rate": 4.842075220397032e-05, + "loss": 0.8076, + "step": 500 + }, + { + "epoch": 0.032980275258451194, + "grad_norm": 0.9340566992759705, + "learning_rate": 4.8357328597704065e-05, + "loss": 0.8078, + "step": 520 + }, + { + "epoch": 0.03424874738377624, + "grad_norm": 0.8570923209190369, + "learning_rate": 4.8293904991437814e-05, + "loss": 0.85, + "step": 540 + }, + { + "epoch": 0.035517219509101285, + "grad_norm": 0.7447881698608398, + "learning_rate": 4.823048138517156e-05, + "loss": 0.864, + "step": 560 + }, + { + "epoch": 0.03678569163442633, + "grad_norm": 0.9067574143409729, + "learning_rate": 4.816705777890531e-05, + "loss": 0.8536, + "step": 580 + }, + { + "epoch": 0.03805416375975138, + "grad_norm": 1.009699821472168, + "learning_rate": 4.8103634172639055e-05, + "loss": 0.8023, + "step": 600 + }, + { + "epoch": 0.039322635885076423, + "grad_norm": 0.9121712446212769, + "learning_rate": 4.8040210566372804e-05, + "loss": 0.8643, + "step": 620 + }, + { + "epoch": 0.04059110801040147, + "grad_norm": 0.8111468553543091, + "learning_rate": 4.797678696010656e-05, + "loss": 0.8364, + "step": 640 + }, + { + "epoch": 0.04185958013572652, + "grad_norm": 0.8392479419708252, + "learning_rate": 4.79133633538403e-05, + "loss": 0.8694, + "step": 660 + }, + { + "epoch": 0.04312805226105156, + "grad_norm": 1.0781140327453613, + "learning_rate": 4.784993974757405e-05, + "loss": 0.8613, + "step": 680 + }, + { + "epoch": 0.04439652438637661, + "grad_norm": 1.0560787916183472, + "learning_rate": 4.7786516141307794e-05, + "loss": 0.8348, + "step": 700 + }, + { + "epoch": 0.04566499651170165, + "grad_norm": 0.9052237868309021, + "learning_rate": 4.772309253504155e-05, + "loss": 0.8389, + "step": 720 + }, + { + "epoch": 0.0469334686370267, + "grad_norm": 0.698861300945282, + "learning_rate": 4.765966892877529e-05, + "loss": 0.8556, + "step": 740 + }, + { + "epoch": 0.04820194076235175, + "grad_norm": 0.8764225840568542, + "learning_rate": 4.759624532250904e-05, + "loss": 0.8294, + "step": 760 + }, + { + "epoch": 0.04947041288767679, + "grad_norm": 0.9750523567199707, + "learning_rate": 4.7532821716242784e-05, + "loss": 0.8109, + "step": 780 + }, + { + "epoch": 0.05073888501300184, + "grad_norm": 0.931287407875061, + "learning_rate": 4.746939810997654e-05, + "loss": 0.8256, + "step": 800 + }, + { + "epoch": 0.05200735713832689, + "grad_norm": 0.8329412341117859, + "learning_rate": 4.740597450371028e-05, + "loss": 0.8394, + "step": 820 + }, + { + "epoch": 0.05327582926365193, + "grad_norm": 0.8462470173835754, + "learning_rate": 4.734255089744403e-05, + "loss": 0.8445, + "step": 840 + }, + { + "epoch": 0.05454430138897698, + "grad_norm": 0.8757215142250061, + "learning_rate": 4.7279127291177774e-05, + "loss": 0.8467, + "step": 860 + }, + { + "epoch": 0.05581277351430202, + "grad_norm": 0.8820921182632446, + "learning_rate": 4.721570368491153e-05, + "loss": 0.8415, + "step": 880 + }, + { + "epoch": 0.05708124563962707, + "grad_norm": 0.9306124448776245, + "learning_rate": 4.715228007864527e-05, + "loss": 0.8488, + "step": 900 + }, + { + "epoch": 0.05834971776495212, + "grad_norm": 0.9606575965881348, + "learning_rate": 4.708885647237902e-05, + "loss": 0.8495, + "step": 920 + }, + { + "epoch": 0.05961818989027716, + "grad_norm": 0.7876096963882446, + "learning_rate": 4.702543286611277e-05, + "loss": 0.8364, + "step": 940 + }, + { + "epoch": 0.06088666201560221, + "grad_norm": 1.05940842628479, + "learning_rate": 4.696200925984652e-05, + "loss": 0.856, + "step": 960 + }, + { + "epoch": 0.06215513414092725, + "grad_norm": 0.8594347238540649, + "learning_rate": 4.689858565358026e-05, + "loss": 0.8114, + "step": 980 + }, + { + "epoch": 0.0634236062662523, + "grad_norm": 0.8395501971244812, + "learning_rate": 4.683516204731401e-05, + "loss": 0.8645, + "step": 1000 + }, + { + "epoch": 0.06469207839157734, + "grad_norm": 0.9333510994911194, + "learning_rate": 4.677173844104776e-05, + "loss": 0.8196, + "step": 1020 + }, + { + "epoch": 0.06596055051690239, + "grad_norm": 0.8195106983184814, + "learning_rate": 4.670831483478151e-05, + "loss": 0.8586, + "step": 1040 + }, + { + "epoch": 0.06722902264222744, + "grad_norm": 0.9601035118103027, + "learning_rate": 4.664489122851525e-05, + "loss": 0.8884, + "step": 1060 + }, + { + "epoch": 0.06849749476755249, + "grad_norm": 1.006823182106018, + "learning_rate": 4.6581467622249e-05, + "loss": 0.8301, + "step": 1080 + }, + { + "epoch": 0.06976596689287753, + "grad_norm": 0.9135039448738098, + "learning_rate": 4.651804401598275e-05, + "loss": 0.8267, + "step": 1100 + }, + { + "epoch": 0.07103443901820257, + "grad_norm": 0.929247260093689, + "learning_rate": 4.64546204097165e-05, + "loss": 0.8569, + "step": 1120 + }, + { + "epoch": 0.07230291114352762, + "grad_norm": 0.8837388753890991, + "learning_rate": 4.639119680345025e-05, + "loss": 0.8236, + "step": 1140 + }, + { + "epoch": 0.07357138326885267, + "grad_norm": 0.9246069192886353, + "learning_rate": 4.63309443774973e-05, + "loss": 0.8736, + "step": 1160 + }, + { + "epoch": 0.07483985539417772, + "grad_norm": 0.803130567073822, + "learning_rate": 4.626752077123105e-05, + "loss": 0.841, + "step": 1180 + }, + { + "epoch": 0.07610832751950276, + "grad_norm": 0.8640844225883484, + "learning_rate": 4.62040971649648e-05, + "loss": 0.8171, + "step": 1200 + }, + { + "epoch": 0.0773767996448278, + "grad_norm": 0.7979689240455627, + "learning_rate": 4.614067355869855e-05, + "loss": 0.8282, + "step": 1220 + }, + { + "epoch": 0.07864527177015285, + "grad_norm": 0.9452027678489685, + "learning_rate": 4.60772499524323e-05, + "loss": 0.8724, + "step": 1240 + }, + { + "epoch": 0.0799137438954779, + "grad_norm": 0.8859258890151978, + "learning_rate": 4.601382634616604e-05, + "loss": 0.8825, + "step": 1260 + }, + { + "epoch": 0.08118221602080294, + "grad_norm": 0.7669050693511963, + "learning_rate": 4.595040273989979e-05, + "loss": 0.7727, + "step": 1280 + }, + { + "epoch": 0.08245068814612799, + "grad_norm": 0.9593196511268616, + "learning_rate": 4.588697913363354e-05, + "loss": 0.8217, + "step": 1300 + }, + { + "epoch": 0.08371916027145304, + "grad_norm": 0.9165802597999573, + "learning_rate": 4.582355552736729e-05, + "loss": 0.8288, + "step": 1320 + }, + { + "epoch": 0.08498763239677808, + "grad_norm": 0.8536924123764038, + "learning_rate": 4.576013192110103e-05, + "loss": 0.85, + "step": 1340 + }, + { + "epoch": 0.08625610452210312, + "grad_norm": 1.2818777561187744, + "learning_rate": 4.569670831483479e-05, + "loss": 0.8595, + "step": 1360 + }, + { + "epoch": 0.08752457664742817, + "grad_norm": 0.8300599455833435, + "learning_rate": 4.563328470856853e-05, + "loss": 0.8843, + "step": 1380 + }, + { + "epoch": 0.08879304877275322, + "grad_norm": 0.8837544322013855, + "learning_rate": 4.556986110230228e-05, + "loss": 0.869, + "step": 1400 + }, + { + "epoch": 0.09006152089807827, + "grad_norm": 0.8470463156700134, + "learning_rate": 4.550643749603602e-05, + "loss": 0.8748, + "step": 1420 + }, + { + "epoch": 0.0913299930234033, + "grad_norm": 0.9581688046455383, + "learning_rate": 4.544301388976978e-05, + "loss": 0.8545, + "step": 1440 + }, + { + "epoch": 0.09259846514872835, + "grad_norm": 0.9453362822532654, + "learning_rate": 4.537959028350352e-05, + "loss": 0.8134, + "step": 1460 + }, + { + "epoch": 0.0938669372740534, + "grad_norm": 1.0060471296310425, + "learning_rate": 4.531616667723727e-05, + "loss": 0.8136, + "step": 1480 + }, + { + "epoch": 0.09513540939937845, + "grad_norm": 1.0802257061004639, + "learning_rate": 4.525274307097101e-05, + "loss": 0.8548, + "step": 1500 + }, + { + "epoch": 0.0964038815247035, + "grad_norm": 0.8368780612945557, + "learning_rate": 4.518931946470477e-05, + "loss": 0.8709, + "step": 1520 + }, + { + "epoch": 0.09767235365002853, + "grad_norm": 0.8320122361183167, + "learning_rate": 4.512589585843851e-05, + "loss": 0.8181, + "step": 1540 + }, + { + "epoch": 0.09894082577535358, + "grad_norm": 0.8374115824699402, + "learning_rate": 4.506247225217226e-05, + "loss": 0.8549, + "step": 1560 + }, + { + "epoch": 0.10020929790067863, + "grad_norm": 0.7658076882362366, + "learning_rate": 4.4999048645906e-05, + "loss": 0.8453, + "step": 1580 + }, + { + "epoch": 0.10147777002600368, + "grad_norm": 0.8223689198493958, + "learning_rate": 4.493562503963976e-05, + "loss": 0.9502, + "step": 1600 + }, + { + "epoch": 0.10274624215132873, + "grad_norm": 0.8949286341667175, + "learning_rate": 4.487220143337351e-05, + "loss": 0.8565, + "step": 1620 + }, + { + "epoch": 0.10401471427665378, + "grad_norm": 0.9013976454734802, + "learning_rate": 4.480877782710725e-05, + "loss": 0.8313, + "step": 1640 + }, + { + "epoch": 0.10528318640197881, + "grad_norm": 0.7802047729492188, + "learning_rate": 4.4745354220841e-05, + "loss": 0.8598, + "step": 1660 + }, + { + "epoch": 0.10655165852730386, + "grad_norm": 1.0458945035934448, + "learning_rate": 4.468193061457475e-05, + "loss": 0.8358, + "step": 1680 + }, + { + "epoch": 0.10782013065262891, + "grad_norm": 0.8241732716560364, + "learning_rate": 4.46185070083085e-05, + "loss": 0.8473, + "step": 1700 + }, + { + "epoch": 0.10908860277795396, + "grad_norm": 0.794696569442749, + "learning_rate": 4.455508340204224e-05, + "loss": 0.8557, + "step": 1720 + }, + { + "epoch": 0.110357074903279, + "grad_norm": 0.9596499800682068, + "learning_rate": 4.449165979577599e-05, + "loss": 0.8727, + "step": 1740 + }, + { + "epoch": 0.11162554702860404, + "grad_norm": 0.789816677570343, + "learning_rate": 4.442823618950974e-05, + "loss": 0.8475, + "step": 1760 + }, + { + "epoch": 0.11289401915392909, + "grad_norm": 0.8604167103767395, + "learning_rate": 4.436481258324349e-05, + "loss": 0.8426, + "step": 1780 + }, + { + "epoch": 0.11416249127925414, + "grad_norm": 0.7762212157249451, + "learning_rate": 4.430138897697723e-05, + "loss": 0.8532, + "step": 1800 + }, + { + "epoch": 0.11543096340457919, + "grad_norm": 0.7988696694374084, + "learning_rate": 4.423796537071098e-05, + "loss": 0.8398, + "step": 1820 + }, + { + "epoch": 0.11669943552990424, + "grad_norm": 0.7536550164222717, + "learning_rate": 4.417454176444473e-05, + "loss": 0.8669, + "step": 1840 + }, + { + "epoch": 0.11796790765522927, + "grad_norm": 0.8086602687835693, + "learning_rate": 4.411111815817848e-05, + "loss": 0.858, + "step": 1860 + }, + { + "epoch": 0.11923637978055432, + "grad_norm": 0.8665058612823486, + "learning_rate": 4.404769455191222e-05, + "loss": 0.8445, + "step": 1880 + }, + { + "epoch": 0.12050485190587937, + "grad_norm": 0.9934934973716736, + "learning_rate": 4.398427094564597e-05, + "loss": 0.8548, + "step": 1900 + }, + { + "epoch": 0.12177332403120442, + "grad_norm": 1.0823296308517456, + "learning_rate": 4.392084733937972e-05, + "loss": 0.8598, + "step": 1920 + }, + { + "epoch": 0.12304179615652946, + "grad_norm": 0.767232358455658, + "learning_rate": 4.385742373311347e-05, + "loss": 0.8375, + "step": 1940 + }, + { + "epoch": 0.1243102682818545, + "grad_norm": 0.9011818170547485, + "learning_rate": 4.379400012684721e-05, + "loss": 0.8322, + "step": 1960 + }, + { + "epoch": 0.12557874040717956, + "grad_norm": 0.7959470152854919, + "learning_rate": 4.373057652058096e-05, + "loss": 0.8742, + "step": 1980 + }, + { + "epoch": 0.1268472125325046, + "grad_norm": 0.7924261093139648, + "learning_rate": 4.366715291431471e-05, + "loss": 0.854, + "step": 2000 + }, + { + "epoch": 0.12811568465782963, + "grad_norm": 0.7666494250297546, + "learning_rate": 4.360372930804846e-05, + "loss": 0.8068, + "step": 2020 + }, + { + "epoch": 0.12938415678315468, + "grad_norm": 0.7881539463996887, + "learning_rate": 4.354030570178221e-05, + "loss": 0.8483, + "step": 2040 + }, + { + "epoch": 0.13065262890847973, + "grad_norm": 0.8898656964302063, + "learning_rate": 4.3476882095515956e-05, + "loss": 0.8818, + "step": 2060 + }, + { + "epoch": 0.13192110103380478, + "grad_norm": 0.8294678330421448, + "learning_rate": 4.3413458489249705e-05, + "loss": 0.8118, + "step": 2080 + }, + { + "epoch": 0.13318957315912983, + "grad_norm": 0.867928147315979, + "learning_rate": 4.335003488298345e-05, + "loss": 0.8602, + "step": 2100 + }, + { + "epoch": 0.13445804528445487, + "grad_norm": 0.8367668390274048, + "learning_rate": 4.32866112767172e-05, + "loss": 0.8859, + "step": 2120 + }, + { + "epoch": 0.13572651740977992, + "grad_norm": 0.8341355919837952, + "learning_rate": 4.3223187670450946e-05, + "loss": 0.8484, + "step": 2140 + }, + { + "epoch": 0.13699498953510497, + "grad_norm": 0.8836013674736023, + "learning_rate": 4.3159764064184695e-05, + "loss": 0.851, + "step": 2160 + }, + { + "epoch": 0.13826346166043002, + "grad_norm": 0.7626996040344238, + "learning_rate": 4.309634045791844e-05, + "loss": 0.8308, + "step": 2180 + }, + { + "epoch": 0.13953193378575507, + "grad_norm": 0.926589846611023, + "learning_rate": 4.303291685165219e-05, + "loss": 0.8699, + "step": 2200 + }, + { + "epoch": 0.14080040591108012, + "grad_norm": 0.79881751537323, + "learning_rate": 4.2969493245385936e-05, + "loss": 0.8215, + "step": 2220 + }, + { + "epoch": 0.14206887803640514, + "grad_norm": 0.96977299451828, + "learning_rate": 4.2906069639119685e-05, + "loss": 0.8218, + "step": 2240 + }, + { + "epoch": 0.1433373501617302, + "grad_norm": 0.786342442035675, + "learning_rate": 4.284264603285343e-05, + "loss": 0.8128, + "step": 2260 + }, + { + "epoch": 0.14460582228705524, + "grad_norm": 0.8864126205444336, + "learning_rate": 4.277922242658718e-05, + "loss": 0.8931, + "step": 2280 + }, + { + "epoch": 0.14587429441238028, + "grad_norm": 0.9522060751914978, + "learning_rate": 4.2715798820320926e-05, + "loss": 0.8592, + "step": 2300 + }, + { + "epoch": 0.14714276653770533, + "grad_norm": 0.7771050333976746, + "learning_rate": 4.2652375214054675e-05, + "loss": 0.8473, + "step": 2320 + }, + { + "epoch": 0.14841123866303038, + "grad_norm": 0.9344542026519775, + "learning_rate": 4.258895160778842e-05, + "loss": 0.8611, + "step": 2340 + }, + { + "epoch": 0.14967971078835543, + "grad_norm": 0.8358260989189148, + "learning_rate": 4.252552800152217e-05, + "loss": 0.8689, + "step": 2360 + }, + { + "epoch": 0.15094818291368048, + "grad_norm": 0.9203112125396729, + "learning_rate": 4.2462104395255916e-05, + "loss": 0.8701, + "step": 2380 + }, + { + "epoch": 0.15221665503900553, + "grad_norm": 0.87474125623703, + "learning_rate": 4.2398680788989665e-05, + "loss": 0.8441, + "step": 2400 + }, + { + "epoch": 0.15348512716433058, + "grad_norm": 0.838341474533081, + "learning_rate": 4.233525718272341e-05, + "loss": 0.8402, + "step": 2420 + }, + { + "epoch": 0.1547535992896556, + "grad_norm": 0.9312511682510376, + "learning_rate": 4.227183357645716e-05, + "loss": 0.8594, + "step": 2440 + }, + { + "epoch": 0.15602207141498065, + "grad_norm": 0.8382570147514343, + "learning_rate": 4.220840997019091e-05, + "loss": 0.8629, + "step": 2460 + }, + { + "epoch": 0.1572905435403057, + "grad_norm": 0.7457637190818787, + "learning_rate": 4.2144986363924655e-05, + "loss": 0.8518, + "step": 2480 + }, + { + "epoch": 0.15855901566563074, + "grad_norm": 0.9622685313224792, + "learning_rate": 4.2081562757658405e-05, + "loss": 0.8555, + "step": 2500 + }, + { + "epoch": 0.1598274877909558, + "grad_norm": 0.8078732490539551, + "learning_rate": 4.201813915139215e-05, + "loss": 0.8499, + "step": 2520 + }, + { + "epoch": 0.16109595991628084, + "grad_norm": 0.9720255136489868, + "learning_rate": 4.19547155451259e-05, + "loss": 0.8273, + "step": 2540 + }, + { + "epoch": 0.1623644320416059, + "grad_norm": 0.86973637342453, + "learning_rate": 4.1891291938859645e-05, + "loss": 0.8212, + "step": 2560 + }, + { + "epoch": 0.16363290416693094, + "grad_norm": 0.7209818363189697, + "learning_rate": 4.1827868332593395e-05, + "loss": 0.8207, + "step": 2580 + }, + { + "epoch": 0.16490137629225599, + "grad_norm": 0.8860530853271484, + "learning_rate": 4.176444472632714e-05, + "loss": 0.8513, + "step": 2600 + }, + { + "epoch": 0.16616984841758103, + "grad_norm": 0.8513688445091248, + "learning_rate": 4.170102112006089e-05, + "loss": 0.8581, + "step": 2620 + }, + { + "epoch": 0.16743832054290608, + "grad_norm": 0.805892825126648, + "learning_rate": 4.1637597513794635e-05, + "loss": 0.8173, + "step": 2640 + }, + { + "epoch": 0.1687067926682311, + "grad_norm": 0.7897241115570068, + "learning_rate": 4.1574173907528385e-05, + "loss": 0.8358, + "step": 2660 + }, + { + "epoch": 0.16997526479355615, + "grad_norm": 0.8170486092567444, + "learning_rate": 4.151075030126213e-05, + "loss": 0.8259, + "step": 2680 + }, + { + "epoch": 0.1712437369188812, + "grad_norm": 0.993430495262146, + "learning_rate": 4.144732669499588e-05, + "loss": 0.8485, + "step": 2700 + }, + { + "epoch": 0.17251220904420625, + "grad_norm": 0.8168458342552185, + "learning_rate": 4.1383903088729625e-05, + "loss": 0.8446, + "step": 2720 + }, + { + "epoch": 0.1737806811695313, + "grad_norm": 0.9205940365791321, + "learning_rate": 4.1320479482463375e-05, + "loss": 0.8119, + "step": 2740 + }, + { + "epoch": 0.17504915329485635, + "grad_norm": 0.7091718912124634, + "learning_rate": 4.1257055876197124e-05, + "loss": 0.8672, + "step": 2760 + }, + { + "epoch": 0.1763176254201814, + "grad_norm": 0.7693396210670471, + "learning_rate": 4.119363226993087e-05, + "loss": 0.8285, + "step": 2780 + }, + { + "epoch": 0.17758609754550644, + "grad_norm": 1.0046180486679077, + "learning_rate": 4.1130208663664615e-05, + "loss": 0.8405, + "step": 2800 + }, + { + "epoch": 0.1788545696708315, + "grad_norm": 0.8736918568611145, + "learning_rate": 4.1066785057398365e-05, + "loss": 0.838, + "step": 2820 + }, + { + "epoch": 0.18012304179615654, + "grad_norm": 0.9117953777313232, + "learning_rate": 4.1003361451132114e-05, + "loss": 0.866, + "step": 2840 + }, + { + "epoch": 0.18139151392148156, + "grad_norm": 0.7822251319885254, + "learning_rate": 4.093993784486586e-05, + "loss": 0.9023, + "step": 2860 + }, + { + "epoch": 0.1826599860468066, + "grad_norm": 0.8296000361442566, + "learning_rate": 4.087651423859961e-05, + "loss": 0.8215, + "step": 2880 + }, + { + "epoch": 0.18392845817213166, + "grad_norm": 0.814051628112793, + "learning_rate": 4.0813090632333355e-05, + "loss": 0.8362, + "step": 2900 + }, + { + "epoch": 0.1851969302974567, + "grad_norm": 0.8576933145523071, + "learning_rate": 4.0749667026067104e-05, + "loss": 0.8584, + "step": 2920 + }, + { + "epoch": 0.18646540242278176, + "grad_norm": 1.1711878776550293, + "learning_rate": 4.068624341980085e-05, + "loss": 0.8411, + "step": 2940 + }, + { + "epoch": 0.1877338745481068, + "grad_norm": 0.8704431056976318, + "learning_rate": 4.06228198135346e-05, + "loss": 0.8548, + "step": 2960 + }, + { + "epoch": 0.18900234667343185, + "grad_norm": 0.8817090392112732, + "learning_rate": 4.0559396207268345e-05, + "loss": 0.8284, + "step": 2980 + }, + { + "epoch": 0.1902708187987569, + "grad_norm": 0.8044705390930176, + "learning_rate": 4.0495972601002094e-05, + "loss": 0.8242, + "step": 3000 + }, + { + "epoch": 0.19153929092408195, + "grad_norm": 0.7911235690116882, + "learning_rate": 4.043254899473584e-05, + "loss": 0.8239, + "step": 3020 + }, + { + "epoch": 0.192807763049407, + "grad_norm": 0.8657433390617371, + "learning_rate": 4.036912538846959e-05, + "loss": 0.8538, + "step": 3040 + }, + { + "epoch": 0.19407623517473205, + "grad_norm": 0.7675552368164062, + "learning_rate": 4.0305701782203335e-05, + "loss": 0.8036, + "step": 3060 + }, + { + "epoch": 0.19534470730005707, + "grad_norm": 0.8326247930526733, + "learning_rate": 4.0242278175937084e-05, + "loss": 0.8808, + "step": 3080 + }, + { + "epoch": 0.19661317942538212, + "grad_norm": 0.9370586276054382, + "learning_rate": 4.017885456967083e-05, + "loss": 0.8826, + "step": 3100 + }, + { + "epoch": 0.19788165155070717, + "grad_norm": 1.0956268310546875, + "learning_rate": 4.011543096340458e-05, + "loss": 0.8272, + "step": 3120 + }, + { + "epoch": 0.19915012367603221, + "grad_norm": 0.8846516609191895, + "learning_rate": 4.0052007357138325e-05, + "loss": 0.7996, + "step": 3140 + }, + { + "epoch": 0.20041859580135726, + "grad_norm": 0.8572809100151062, + "learning_rate": 3.998858375087208e-05, + "loss": 0.8098, + "step": 3160 + }, + { + "epoch": 0.2016870679266823, + "grad_norm": 0.9393579959869385, + "learning_rate": 3.992516014460582e-05, + "loss": 0.8076, + "step": 3180 + }, + { + "epoch": 0.20295554005200736, + "grad_norm": 0.7605802416801453, + "learning_rate": 3.986173653833957e-05, + "loss": 0.8284, + "step": 3200 + }, + { + "epoch": 0.2042240121773324, + "grad_norm": 0.6856004595756531, + "learning_rate": 3.9798312932073315e-05, + "loss": 0.8277, + "step": 3220 + }, + { + "epoch": 0.20549248430265746, + "grad_norm": 0.889754593372345, + "learning_rate": 3.973488932580707e-05, + "loss": 0.863, + "step": 3240 + }, + { + "epoch": 0.2067609564279825, + "grad_norm": 0.6881526112556458, + "learning_rate": 3.967146571954081e-05, + "loss": 0.8966, + "step": 3260 + }, + { + "epoch": 0.20802942855330755, + "grad_norm": 0.9273505806922913, + "learning_rate": 3.960804211327456e-05, + "loss": 0.867, + "step": 3280 + }, + { + "epoch": 0.20929790067863258, + "grad_norm": 0.9193258285522461, + "learning_rate": 3.954461850700831e-05, + "loss": 0.8131, + "step": 3300 + }, + { + "epoch": 0.21056637280395762, + "grad_norm": 0.9435883164405823, + "learning_rate": 3.948119490074206e-05, + "loss": 0.8066, + "step": 3320 + }, + { + "epoch": 0.21183484492928267, + "grad_norm": 0.8440527319908142, + "learning_rate": 3.941777129447581e-05, + "loss": 0.8195, + "step": 3340 + }, + { + "epoch": 0.21310331705460772, + "grad_norm": 0.7809598445892334, + "learning_rate": 3.935434768820955e-05, + "loss": 0.8503, + "step": 3360 + }, + { + "epoch": 0.21437178917993277, + "grad_norm": 0.7411904335021973, + "learning_rate": 3.92909240819433e-05, + "loss": 0.8055, + "step": 3380 + }, + { + "epoch": 0.21564026130525782, + "grad_norm": 0.9117131233215332, + "learning_rate": 3.922750047567705e-05, + "loss": 0.8789, + "step": 3400 + }, + { + "epoch": 0.21690873343058287, + "grad_norm": 0.816663920879364, + "learning_rate": 3.91640768694108e-05, + "loss": 0.8311, + "step": 3420 + }, + { + "epoch": 0.21817720555590792, + "grad_norm": 0.7956539988517761, + "learning_rate": 3.910065326314454e-05, + "loss": 0.8475, + "step": 3440 + }, + { + "epoch": 0.21944567768123296, + "grad_norm": 0.858045756816864, + "learning_rate": 3.903722965687829e-05, + "loss": 0.8234, + "step": 3460 + }, + { + "epoch": 0.220714149806558, + "grad_norm": 0.8653853535652161, + "learning_rate": 3.897380605061204e-05, + "loss": 0.8363, + "step": 3480 + }, + { + "epoch": 0.22198262193188303, + "grad_norm": 0.8181082606315613, + "learning_rate": 3.891038244434579e-05, + "loss": 0.8224, + "step": 3500 + }, + { + "epoch": 0.22325109405720808, + "grad_norm": 0.8543662428855896, + "learning_rate": 3.884695883807953e-05, + "loss": 0.8572, + "step": 3520 + }, + { + "epoch": 0.22451956618253313, + "grad_norm": 0.8624857068061829, + "learning_rate": 3.878353523181328e-05, + "loss": 0.822, + "step": 3540 + }, + { + "epoch": 0.22578803830785818, + "grad_norm": 0.8224851489067078, + "learning_rate": 3.872011162554703e-05, + "loss": 0.8651, + "step": 3560 + }, + { + "epoch": 0.22705651043318323, + "grad_norm": 0.7411419749259949, + "learning_rate": 3.865668801928078e-05, + "loss": 0.8125, + "step": 3580 + }, + { + "epoch": 0.22832498255850828, + "grad_norm": 0.7959814667701721, + "learning_rate": 3.859326441301452e-05, + "loss": 0.8178, + "step": 3600 + }, + { + "epoch": 0.22959345468383333, + "grad_norm": 0.8626113533973694, + "learning_rate": 3.852984080674827e-05, + "loss": 0.8471, + "step": 3620 + }, + { + "epoch": 0.23086192680915837, + "grad_norm": 0.8034262657165527, + "learning_rate": 3.846641720048202e-05, + "loss": 0.8552, + "step": 3640 + }, + { + "epoch": 0.23213039893448342, + "grad_norm": 0.7086619734764099, + "learning_rate": 3.840299359421577e-05, + "loss": 0.8164, + "step": 3660 + }, + { + "epoch": 0.23339887105980847, + "grad_norm": 0.8688384890556335, + "learning_rate": 3.833956998794951e-05, + "loss": 0.8591, + "step": 3680 + }, + { + "epoch": 0.23466734318513352, + "grad_norm": 0.8770636916160583, + "learning_rate": 3.827614638168326e-05, + "loss": 0.835, + "step": 3700 + }, + { + "epoch": 0.23593581531045854, + "grad_norm": 0.6980867981910706, + "learning_rate": 3.821272277541702e-05, + "loss": 0.8467, + "step": 3720 + }, + { + "epoch": 0.2372042874357836, + "grad_norm": 0.7267412543296814, + "learning_rate": 3.814929916915076e-05, + "loss": 0.8238, + "step": 3740 + }, + { + "epoch": 0.23847275956110864, + "grad_norm": 0.7703206539154053, + "learning_rate": 3.808587556288451e-05, + "loss": 0.8233, + "step": 3760 + }, + { + "epoch": 0.2397412316864337, + "grad_norm": 0.8153693079948425, + "learning_rate": 3.802245195661825e-05, + "loss": 0.8361, + "step": 3780 + }, + { + "epoch": 0.24100970381175874, + "grad_norm": 0.7741461992263794, + "learning_rate": 3.795902835035201e-05, + "loss": 0.9083, + "step": 3800 + }, + { + "epoch": 0.24227817593708378, + "grad_norm": 0.8532843589782715, + "learning_rate": 3.789560474408575e-05, + "loss": 0.8692, + "step": 3820 + }, + { + "epoch": 0.24354664806240883, + "grad_norm": 0.8939579725265503, + "learning_rate": 3.78321811378195e-05, + "loss": 0.8336, + "step": 3840 + }, + { + "epoch": 0.24481512018773388, + "grad_norm": 0.8242042660713196, + "learning_rate": 3.776875753155325e-05, + "loss": 0.7705, + "step": 3860 + }, + { + "epoch": 0.24608359231305893, + "grad_norm": 0.9386515021324158, + "learning_rate": 3.7705333925287e-05, + "loss": 0.8021, + "step": 3880 + }, + { + "epoch": 0.24735206443838398, + "grad_norm": 0.9312074184417725, + "learning_rate": 3.764191031902074e-05, + "loss": 0.8113, + "step": 3900 + }, + { + "epoch": 0.248620536563709, + "grad_norm": 0.8644290566444397, + "learning_rate": 3.757848671275449e-05, + "loss": 0.8336, + "step": 3920 + }, + { + "epoch": 0.24988900868903405, + "grad_norm": 0.7512555122375488, + "learning_rate": 3.751506310648824e-05, + "loss": 0.832, + "step": 3940 + }, + { + "epoch": 0.2511574808143591, + "grad_norm": 0.8139438629150391, + "learning_rate": 3.745163950022199e-05, + "loss": 0.8583, + "step": 3960 + }, + { + "epoch": 0.2524259529396842, + "grad_norm": 0.800618588924408, + "learning_rate": 3.738821589395573e-05, + "loss": 0.8822, + "step": 3980 + }, + { + "epoch": 0.2536944250650092, + "grad_norm": 0.8025221228599548, + "learning_rate": 3.732479228768948e-05, + "loss": 0.8326, + "step": 4000 + }, + { + "epoch": 0.25496289719033427, + "grad_norm": 0.9008484482765198, + "learning_rate": 3.726136868142323e-05, + "loss": 0.8686, + "step": 4020 + }, + { + "epoch": 0.25623136931565926, + "grad_norm": 0.8681203126907349, + "learning_rate": 3.719794507515698e-05, + "loss": 0.8329, + "step": 4040 + }, + { + "epoch": 0.2574998414409843, + "grad_norm": 0.7322081327438354, + "learning_rate": 3.713452146889072e-05, + "loss": 0.8128, + "step": 4060 + }, + { + "epoch": 0.25876831356630936, + "grad_norm": 0.8935046792030334, + "learning_rate": 3.707109786262447e-05, + "loss": 0.8317, + "step": 4080 + }, + { + "epoch": 0.2600367856916344, + "grad_norm": 0.8813959360122681, + "learning_rate": 3.700767425635822e-05, + "loss": 0.855, + "step": 4100 + }, + { + "epoch": 0.26130525781695946, + "grad_norm": 0.7657369375228882, + "learning_rate": 3.694425065009197e-05, + "loss": 0.7865, + "step": 4120 + }, + { + "epoch": 0.2625737299422845, + "grad_norm": 0.7684091329574585, + "learning_rate": 3.688082704382572e-05, + "loss": 0.8039, + "step": 4140 + }, + { + "epoch": 0.26384220206760955, + "grad_norm": 0.7692010998725891, + "learning_rate": 3.681740343755946e-05, + "loss": 0.841, + "step": 4160 + }, + { + "epoch": 0.2651106741929346, + "grad_norm": 0.8434869050979614, + "learning_rate": 3.675397983129321e-05, + "loss": 0.831, + "step": 4180 + }, + { + "epoch": 0.26637914631825965, + "grad_norm": 0.7663161158561707, + "learning_rate": 3.669055622502696e-05, + "loss": 0.7749, + "step": 4200 + }, + { + "epoch": 0.2676476184435847, + "grad_norm": 0.8157145380973816, + "learning_rate": 3.662713261876071e-05, + "loss": 0.8067, + "step": 4220 + }, + { + "epoch": 0.26891609056890975, + "grad_norm": 0.8220170140266418, + "learning_rate": 3.656370901249445e-05, + "loss": 0.7872, + "step": 4240 + }, + { + "epoch": 0.2701845626942348, + "grad_norm": 0.9565273523330688, + "learning_rate": 3.6500285406228205e-05, + "loss": 0.8294, + "step": 4260 + }, + { + "epoch": 0.27145303481955985, + "grad_norm": 0.775265634059906, + "learning_rate": 3.643686179996195e-05, + "loss": 0.8129, + "step": 4280 + }, + { + "epoch": 0.2727215069448849, + "grad_norm": 0.9544994235038757, + "learning_rate": 3.63734381936957e-05, + "loss": 0.8212, + "step": 4300 + }, + { + "epoch": 0.27398997907020994, + "grad_norm": 1.041999340057373, + "learning_rate": 3.631001458742944e-05, + "loss": 0.8178, + "step": 4320 + }, + { + "epoch": 0.275258451195535, + "grad_norm": 0.8267019987106323, + "learning_rate": 3.6246590981163195e-05, + "loss": 0.8306, + "step": 4340 + }, + { + "epoch": 0.27652692332086004, + "grad_norm": 0.849429190158844, + "learning_rate": 3.618316737489694e-05, + "loss": 0.8179, + "step": 4360 + }, + { + "epoch": 0.2777953954461851, + "grad_norm": 0.8038565516471863, + "learning_rate": 3.611974376863069e-05, + "loss": 0.82, + "step": 4380 + }, + { + "epoch": 0.27906386757151014, + "grad_norm": 0.7065672874450684, + "learning_rate": 3.605632016236443e-05, + "loss": 0.8313, + "step": 4400 + }, + { + "epoch": 0.2803323396968352, + "grad_norm": 0.8574305772781372, + "learning_rate": 3.5992896556098185e-05, + "loss": 0.8529, + "step": 4420 + }, + { + "epoch": 0.28160081182216024, + "grad_norm": 0.8780136108398438, + "learning_rate": 3.592947294983193e-05, + "loss": 0.8181, + "step": 4440 + }, + { + "epoch": 0.28286928394748523, + "grad_norm": 0.8531150817871094, + "learning_rate": 3.586604934356568e-05, + "loss": 0.8044, + "step": 4460 + }, + { + "epoch": 0.2841377560728103, + "grad_norm": 0.807064414024353, + "learning_rate": 3.580262573729942e-05, + "loss": 0.8388, + "step": 4480 + }, + { + "epoch": 0.2854062281981353, + "grad_norm": 0.8407668471336365, + "learning_rate": 3.5739202131033175e-05, + "loss": 0.8579, + "step": 4500 + }, + { + "epoch": 0.2866747003234604, + "grad_norm": 0.9750702977180481, + "learning_rate": 3.567577852476692e-05, + "loss": 0.8606, + "step": 4520 + }, + { + "epoch": 0.2879431724487854, + "grad_norm": 0.8305932283401489, + "learning_rate": 3.561235491850067e-05, + "loss": 0.8312, + "step": 4540 + }, + { + "epoch": 0.28921164457411047, + "grad_norm": 0.7954509854316711, + "learning_rate": 3.5548931312234416e-05, + "loss": 0.8504, + "step": 4560 + }, + { + "epoch": 0.2904801166994355, + "grad_norm": 0.8049436211585999, + "learning_rate": 3.5485507705968165e-05, + "loss": 0.887, + "step": 4580 + }, + { + "epoch": 0.29174858882476057, + "grad_norm": 0.7284989953041077, + "learning_rate": 3.5422084099701914e-05, + "loss": 0.811, + "step": 4600 + }, + { + "epoch": 0.2930170609500856, + "grad_norm": 0.8093234300613403, + "learning_rate": 3.535866049343566e-05, + "loss": 0.8339, + "step": 4620 + }, + { + "epoch": 0.29428553307541067, + "grad_norm": 0.7123083472251892, + "learning_rate": 3.5295236887169406e-05, + "loss": 0.8251, + "step": 4640 + }, + { + "epoch": 0.2955540052007357, + "grad_norm": 0.8694972395896912, + "learning_rate": 3.5231813280903155e-05, + "loss": 0.8166, + "step": 4660 + }, + { + "epoch": 0.29682247732606076, + "grad_norm": 0.8641796708106995, + "learning_rate": 3.5168389674636904e-05, + "loss": 0.811, + "step": 4680 + }, + { + "epoch": 0.2980909494513858, + "grad_norm": 0.8516160249710083, + "learning_rate": 3.510496606837065e-05, + "loss": 0.8965, + "step": 4700 + }, + { + "epoch": 0.29935942157671086, + "grad_norm": 0.866295337677002, + "learning_rate": 3.5041542462104396e-05, + "loss": 0.8301, + "step": 4720 + }, + { + "epoch": 0.3006278937020359, + "grad_norm": 0.8344794511795044, + "learning_rate": 3.4978118855838145e-05, + "loss": 0.8512, + "step": 4740 + }, + { + "epoch": 0.30189636582736096, + "grad_norm": 0.8196772933006287, + "learning_rate": 3.4914695249571894e-05, + "loss": 0.8471, + "step": 4760 + }, + { + "epoch": 0.303164837952686, + "grad_norm": 0.66361004114151, + "learning_rate": 3.485127164330564e-05, + "loss": 0.8528, + "step": 4780 + }, + { + "epoch": 0.30443331007801105, + "grad_norm": 0.8254550099372864, + "learning_rate": 3.4787848037039386e-05, + "loss": 0.8133, + "step": 4800 + }, + { + "epoch": 0.3057017822033361, + "grad_norm": 0.7867299914360046, + "learning_rate": 3.4724424430773135e-05, + "loss": 0.8183, + "step": 4820 + }, + { + "epoch": 0.30697025432866115, + "grad_norm": 1.0029250383377075, + "learning_rate": 3.4661000824506884e-05, + "loss": 0.8192, + "step": 4840 + }, + { + "epoch": 0.3082387264539862, + "grad_norm": 0.8651610612869263, + "learning_rate": 3.459757721824063e-05, + "loss": 0.8755, + "step": 4860 + }, + { + "epoch": 0.3095071985793112, + "grad_norm": 0.8182082772254944, + "learning_rate": 3.4534153611974376e-05, + "loss": 0.8661, + "step": 4880 + }, + { + "epoch": 0.31077567070463624, + "grad_norm": 1.0455305576324463, + "learning_rate": 3.4470730005708125e-05, + "loss": 0.8134, + "step": 4900 + }, + { + "epoch": 0.3120441428299613, + "grad_norm": 0.7838363647460938, + "learning_rate": 3.4407306399441874e-05, + "loss": 0.8531, + "step": 4920 + }, + { + "epoch": 0.31331261495528634, + "grad_norm": 0.712868332862854, + "learning_rate": 3.434388279317562e-05, + "loss": 0.8109, + "step": 4940 + }, + { + "epoch": 0.3145810870806114, + "grad_norm": 0.8088375329971313, + "learning_rate": 3.428045918690937e-05, + "loss": 0.8717, + "step": 4960 + }, + { + "epoch": 0.31584955920593644, + "grad_norm": 0.8567003607749939, + "learning_rate": 3.421703558064312e-05, + "loss": 0.7997, + "step": 4980 + }, + { + "epoch": 0.3171180313312615, + "grad_norm": 0.7826852202415466, + "learning_rate": 3.4153611974376864e-05, + "loss": 0.8071, + "step": 5000 + }, + { + "epoch": 0.31838650345658653, + "grad_norm": 0.7257752418518066, + "learning_rate": 3.4090188368110613e-05, + "loss": 0.8053, + "step": 5020 + }, + { + "epoch": 0.3196549755819116, + "grad_norm": 0.9757511019706726, + "learning_rate": 3.402676476184436e-05, + "loss": 0.8195, + "step": 5040 + }, + { + "epoch": 0.32092344770723663, + "grad_norm": 0.9173797369003296, + "learning_rate": 3.396334115557811e-05, + "loss": 0.8662, + "step": 5060 + }, + { + "epoch": 0.3221919198325617, + "grad_norm": 0.8483538627624512, + "learning_rate": 3.3899917549311854e-05, + "loss": 0.8339, + "step": 5080 + }, + { + "epoch": 0.32346039195788673, + "grad_norm": 1.0109443664550781, + "learning_rate": 3.3836493943045604e-05, + "loss": 0.8731, + "step": 5100 + }, + { + "epoch": 0.3247288640832118, + "grad_norm": 0.8521518707275391, + "learning_rate": 3.377307033677935e-05, + "loss": 0.8367, + "step": 5120 + }, + { + "epoch": 0.3259973362085368, + "grad_norm": 0.8792763352394104, + "learning_rate": 3.37096467305131e-05, + "loss": 0.857, + "step": 5140 + }, + { + "epoch": 0.3272658083338619, + "grad_norm": 0.7366037368774414, + "learning_rate": 3.3646223124246844e-05, + "loss": 0.8556, + "step": 5160 + }, + { + "epoch": 0.3285342804591869, + "grad_norm": 0.9895220994949341, + "learning_rate": 3.3582799517980594e-05, + "loss": 0.885, + "step": 5180 + }, + { + "epoch": 0.32980275258451197, + "grad_norm": 0.8205326199531555, + "learning_rate": 3.351937591171434e-05, + "loss": 0.7703, + "step": 5200 + }, + { + "epoch": 0.331071224709837, + "grad_norm": 0.7155152559280396, + "learning_rate": 3.345595230544809e-05, + "loss": 0.8479, + "step": 5220 + }, + { + "epoch": 0.33233969683516207, + "grad_norm": 0.7578288912773132, + "learning_rate": 3.3392528699181834e-05, + "loss": 0.8019, + "step": 5240 + }, + { + "epoch": 0.3336081689604871, + "grad_norm": 0.8876450657844543, + "learning_rate": 3.3329105092915584e-05, + "loss": 0.8644, + "step": 5260 + }, + { + "epoch": 0.33487664108581217, + "grad_norm": 0.8280789256095886, + "learning_rate": 3.326568148664933e-05, + "loss": 0.8358, + "step": 5280 + }, + { + "epoch": 0.33614511321113716, + "grad_norm": 0.8563920855522156, + "learning_rate": 3.320225788038308e-05, + "loss": 0.8908, + "step": 5300 + }, + { + "epoch": 0.3374135853364622, + "grad_norm": 0.8536137342453003, + "learning_rate": 3.3138834274116824e-05, + "loss": 0.8188, + "step": 5320 + }, + { + "epoch": 0.33868205746178726, + "grad_norm": 0.909870982170105, + "learning_rate": 3.3075410667850574e-05, + "loss": 0.8175, + "step": 5340 + }, + { + "epoch": 0.3399505295871123, + "grad_norm": 0.7365782260894775, + "learning_rate": 3.301198706158432e-05, + "loss": 0.8463, + "step": 5360 + }, + { + "epoch": 0.34121900171243735, + "grad_norm": 0.7502683997154236, + "learning_rate": 3.294856345531807e-05, + "loss": 0.7959, + "step": 5380 + }, + { + "epoch": 0.3424874738377624, + "grad_norm": 0.6690531373023987, + "learning_rate": 3.288513984905182e-05, + "loss": 0.8266, + "step": 5400 + }, + { + "epoch": 0.34375594596308745, + "grad_norm": 0.9006823301315308, + "learning_rate": 3.2821716242785564e-05, + "loss": 0.8397, + "step": 5420 + }, + { + "epoch": 0.3450244180884125, + "grad_norm": 0.826954185962677, + "learning_rate": 3.275829263651932e-05, + "loss": 0.8022, + "step": 5440 + }, + { + "epoch": 0.34629289021373755, + "grad_norm": 0.8345193266868591, + "learning_rate": 3.269486903025306e-05, + "loss": 0.8148, + "step": 5460 + }, + { + "epoch": 0.3475613623390626, + "grad_norm": 0.7929914593696594, + "learning_rate": 3.263144542398681e-05, + "loss": 0.8073, + "step": 5480 + }, + { + "epoch": 0.34882983446438764, + "grad_norm": 0.9218955636024475, + "learning_rate": 3.2568021817720554e-05, + "loss": 0.8576, + "step": 5500 + }, + { + "epoch": 0.3500983065897127, + "grad_norm": 0.8164006471633911, + "learning_rate": 3.250459821145431e-05, + "loss": 0.8434, + "step": 5520 + }, + { + "epoch": 0.35136677871503774, + "grad_norm": 0.7511663436889648, + "learning_rate": 3.244117460518805e-05, + "loss": 0.7878, + "step": 5540 + }, + { + "epoch": 0.3526352508403628, + "grad_norm": 0.8193197846412659, + "learning_rate": 3.23777509989218e-05, + "loss": 0.8608, + "step": 5560 + }, + { + "epoch": 0.35390372296568784, + "grad_norm": 0.9112285375595093, + "learning_rate": 3.2314327392655544e-05, + "loss": 0.8404, + "step": 5580 + }, + { + "epoch": 0.3551721950910129, + "grad_norm": 0.75201416015625, + "learning_rate": 3.22509037863893e-05, + "loss": 0.813, + "step": 5600 + }, + { + "epoch": 0.35644066721633794, + "grad_norm": 0.8154107332229614, + "learning_rate": 3.218748018012304e-05, + "loss": 0.8083, + "step": 5620 + }, + { + "epoch": 0.357709139341663, + "grad_norm": 0.7872757315635681, + "learning_rate": 3.212405657385679e-05, + "loss": 0.8358, + "step": 5640 + }, + { + "epoch": 0.35897761146698803, + "grad_norm": 0.8861322999000549, + "learning_rate": 3.206063296759054e-05, + "loss": 0.8328, + "step": 5660 + }, + { + "epoch": 0.3602460835923131, + "grad_norm": 0.7487745881080627, + "learning_rate": 3.199720936132429e-05, + "loss": 0.8626, + "step": 5680 + }, + { + "epoch": 0.36151455571763813, + "grad_norm": 0.9134072065353394, + "learning_rate": 3.193378575505803e-05, + "loss": 0.8187, + "step": 5700 + }, + { + "epoch": 0.3627830278429631, + "grad_norm": 0.8179683089256287, + "learning_rate": 3.187036214879178e-05, + "loss": 0.8223, + "step": 5720 + }, + { + "epoch": 0.3640514999682882, + "grad_norm": 0.9501960873603821, + "learning_rate": 3.180693854252553e-05, + "loss": 0.8064, + "step": 5740 + }, + { + "epoch": 0.3653199720936132, + "grad_norm": 0.8398934006690979, + "learning_rate": 3.174351493625928e-05, + "loss": 0.8279, + "step": 5760 + }, + { + "epoch": 0.36658844421893827, + "grad_norm": 0.7718421816825867, + "learning_rate": 3.168009132999302e-05, + "loss": 0.8453, + "step": 5780 + }, + { + "epoch": 0.3678569163442633, + "grad_norm": 0.7935000061988831, + "learning_rate": 3.161666772372677e-05, + "loss": 0.8292, + "step": 5800 + }, + { + "epoch": 0.36912538846958837, + "grad_norm": 0.8383910655975342, + "learning_rate": 3.155324411746052e-05, + "loss": 0.8116, + "step": 5820 + }, + { + "epoch": 0.3703938605949134, + "grad_norm": 0.7147135734558105, + "learning_rate": 3.148982051119427e-05, + "loss": 0.8146, + "step": 5840 + }, + { + "epoch": 0.37166233272023846, + "grad_norm": 0.792220950126648, + "learning_rate": 3.142639690492802e-05, + "loss": 0.8278, + "step": 5860 + }, + { + "epoch": 0.3729308048455635, + "grad_norm": 0.9010721445083618, + "learning_rate": 3.136297329866176e-05, + "loss": 0.8217, + "step": 5880 + }, + { + "epoch": 0.37419927697088856, + "grad_norm": 0.7762110233306885, + "learning_rate": 3.129954969239551e-05, + "loss": 0.7813, + "step": 5900 + }, + { + "epoch": 0.3754677490962136, + "grad_norm": 0.8827633261680603, + "learning_rate": 3.123612608612926e-05, + "loss": 0.8087, + "step": 5920 + }, + { + "epoch": 0.37673622122153866, + "grad_norm": 0.7577320337295532, + "learning_rate": 3.117270247986301e-05, + "loss": 0.8198, + "step": 5940 + }, + { + "epoch": 0.3780046933468637, + "grad_norm": 0.8777590990066528, + "learning_rate": 3.110927887359675e-05, + "loss": 0.8144, + "step": 5960 + }, + { + "epoch": 0.37927316547218876, + "grad_norm": 0.8621464967727661, + "learning_rate": 3.10458552673305e-05, + "loss": 0.837, + "step": 5980 + }, + { + "epoch": 0.3805416375975138, + "grad_norm": 0.842014491558075, + "learning_rate": 3.098243166106425e-05, + "loss": 0.7945, + "step": 6000 + }, + { + "epoch": 0.38181010972283885, + "grad_norm": 0.8439661860466003, + "learning_rate": 3.0919008054798e-05, + "loss": 0.7907, + "step": 6020 + }, + { + "epoch": 0.3830785818481639, + "grad_norm": 0.7277771830558777, + "learning_rate": 3.085558444853174e-05, + "loss": 0.7792, + "step": 6040 + }, + { + "epoch": 0.38434705397348895, + "grad_norm": 0.8808379173278809, + "learning_rate": 3.07921608422655e-05, + "loss": 0.8356, + "step": 6060 + }, + { + "epoch": 0.385615526098814, + "grad_norm": 0.7495381832122803, + "learning_rate": 3.072873723599924e-05, + "loss": 0.8252, + "step": 6080 + }, + { + "epoch": 0.38688399822413905, + "grad_norm": 0.8698447942733765, + "learning_rate": 3.066531362973299e-05, + "loss": 0.8633, + "step": 6100 + }, + { + "epoch": 0.3881524703494641, + "grad_norm": 0.8586138486862183, + "learning_rate": 3.060189002346673e-05, + "loss": 0.8542, + "step": 6120 + }, + { + "epoch": 0.3894209424747891, + "grad_norm": 0.8375261425971985, + "learning_rate": 3.053846641720049e-05, + "loss": 0.8353, + "step": 6140 + }, + { + "epoch": 0.39068941460011414, + "grad_norm": 0.723111093044281, + "learning_rate": 3.047504281093423e-05, + "loss": 0.8011, + "step": 6160 + }, + { + "epoch": 0.3919578867254392, + "grad_norm": 0.7700281143188477, + "learning_rate": 3.041161920466798e-05, + "loss": 0.8309, + "step": 6180 + }, + { + "epoch": 0.39322635885076423, + "grad_norm": 0.8734796643257141, + "learning_rate": 3.0348195598401725e-05, + "loss": 0.7967, + "step": 6200 + }, + { + "epoch": 0.3944948309760893, + "grad_norm": 0.8601865172386169, + "learning_rate": 3.0284771992135474e-05, + "loss": 0.819, + "step": 6220 + }, + { + "epoch": 0.39576330310141433, + "grad_norm": 0.8753730058670044, + "learning_rate": 3.022134838586922e-05, + "loss": 0.814, + "step": 6240 + }, + { + "epoch": 0.3970317752267394, + "grad_norm": 0.8561723232269287, + "learning_rate": 3.015792477960297e-05, + "loss": 0.8073, + "step": 6260 + }, + { + "epoch": 0.39830024735206443, + "grad_norm": 0.8156526684761047, + "learning_rate": 3.009450117333672e-05, + "loss": 0.8706, + "step": 6280 + }, + { + "epoch": 0.3995687194773895, + "grad_norm": 0.7783714532852173, + "learning_rate": 3.0031077567070464e-05, + "loss": 0.7929, + "step": 6300 + }, + { + "epoch": 0.4008371916027145, + "grad_norm": 0.7870660424232483, + "learning_rate": 2.9967653960804216e-05, + "loss": 0.8169, + "step": 6320 + }, + { + "epoch": 0.4021056637280396, + "grad_norm": 0.9100777506828308, + "learning_rate": 2.990423035453796e-05, + "loss": 0.8365, + "step": 6340 + }, + { + "epoch": 0.4033741358533646, + "grad_norm": 0.8847957849502563, + "learning_rate": 2.984080674827171e-05, + "loss": 0.8178, + "step": 6360 + }, + { + "epoch": 0.4046426079786897, + "grad_norm": 0.9754513502120972, + "learning_rate": 2.9777383142005454e-05, + "loss": 0.8457, + "step": 6380 + }, + { + "epoch": 0.4059110801040147, + "grad_norm": 0.8685561418533325, + "learning_rate": 2.9713959535739206e-05, + "loss": 0.823, + "step": 6400 + }, + { + "epoch": 0.40717955222933977, + "grad_norm": 0.8493009209632874, + "learning_rate": 2.965053592947295e-05, + "loss": 0.8221, + "step": 6420 + }, + { + "epoch": 0.4084480243546648, + "grad_norm": 0.954006016254425, + "learning_rate": 2.95871123232067e-05, + "loss": 0.811, + "step": 6440 + }, + { + "epoch": 0.40971649647998987, + "grad_norm": 11.963293075561523, + "learning_rate": 2.9523688716940444e-05, + "loss": 0.8032, + "step": 6460 + }, + { + "epoch": 0.4109849686053149, + "grad_norm": 0.9225347638130188, + "learning_rate": 2.9460265110674196e-05, + "loss": 0.8645, + "step": 6480 + }, + { + "epoch": 0.41225344073063996, + "grad_norm": 0.7878516316413879, + "learning_rate": 2.939684150440794e-05, + "loss": 0.8303, + "step": 6500 + }, + { + "epoch": 0.413521912855965, + "grad_norm": 0.815546989440918, + "learning_rate": 2.933341789814169e-05, + "loss": 0.7873, + "step": 6520 + }, + { + "epoch": 0.41479038498129006, + "grad_norm": 0.7072951197624207, + "learning_rate": 2.9269994291875437e-05, + "loss": 0.8231, + "step": 6540 + }, + { + "epoch": 0.4160588571066151, + "grad_norm": 0.7275887131690979, + "learning_rate": 2.9206570685609186e-05, + "loss": 0.8445, + "step": 6560 + }, + { + "epoch": 0.4173273292319401, + "grad_norm": 0.7888057827949524, + "learning_rate": 2.9143147079342932e-05, + "loss": 0.7819, + "step": 6580 + }, + { + "epoch": 0.41859580135726515, + "grad_norm": 0.8133323788642883, + "learning_rate": 2.907972347307668e-05, + "loss": 0.7918, + "step": 6600 + }, + { + "epoch": 0.4198642734825902, + "grad_norm": 0.9344608187675476, + "learning_rate": 2.9016299866810427e-05, + "loss": 0.805, + "step": 6620 + }, + { + "epoch": 0.42113274560791525, + "grad_norm": 0.9130797386169434, + "learning_rate": 2.8952876260544176e-05, + "loss": 0.808, + "step": 6640 + }, + { + "epoch": 0.4224012177332403, + "grad_norm": 0.8647783398628235, + "learning_rate": 2.8889452654277922e-05, + "loss": 0.8027, + "step": 6660 + }, + { + "epoch": 0.42366968985856535, + "grad_norm": 0.8025421500205994, + "learning_rate": 2.882602904801167e-05, + "loss": 0.811, + "step": 6680 + }, + { + "epoch": 0.4249381619838904, + "grad_norm": 0.7249786853790283, + "learning_rate": 2.876260544174542e-05, + "loss": 0.7597, + "step": 6700 + }, + { + "epoch": 0.42620663410921544, + "grad_norm": 0.7876623868942261, + "learning_rate": 2.8699181835479166e-05, + "loss": 0.8297, + "step": 6720 + }, + { + "epoch": 0.4274751062345405, + "grad_norm": 0.8284019827842712, + "learning_rate": 2.8635758229212916e-05, + "loss": 0.7985, + "step": 6740 + }, + { + "epoch": 0.42874357835986554, + "grad_norm": 0.7855024337768555, + "learning_rate": 2.857233462294666e-05, + "loss": 0.8414, + "step": 6760 + }, + { + "epoch": 0.4300120504851906, + "grad_norm": 0.789946436882019, + "learning_rate": 2.850891101668041e-05, + "loss": 0.8307, + "step": 6780 + }, + { + "epoch": 0.43128052261051564, + "grad_norm": 0.7722301483154297, + "learning_rate": 2.8445487410414156e-05, + "loss": 0.8101, + "step": 6800 + }, + { + "epoch": 0.4325489947358407, + "grad_norm": 0.8557891249656677, + "learning_rate": 2.8382063804147906e-05, + "loss": 0.846, + "step": 6820 + }, + { + "epoch": 0.43381746686116573, + "grad_norm": 0.8216169476509094, + "learning_rate": 2.831864019788165e-05, + "loss": 0.7966, + "step": 6840 + }, + { + "epoch": 0.4350859389864908, + "grad_norm": 0.87419593334198, + "learning_rate": 2.82552165916154e-05, + "loss": 0.7777, + "step": 6860 + }, + { + "epoch": 0.43635441111181583, + "grad_norm": 0.8956803679466248, + "learning_rate": 2.8191792985349146e-05, + "loss": 0.8066, + "step": 6880 + }, + { + "epoch": 0.4376228832371409, + "grad_norm": 0.8264901638031006, + "learning_rate": 2.8128369379082896e-05, + "loss": 0.8489, + "step": 6900 + }, + { + "epoch": 0.43889135536246593, + "grad_norm": 0.7960401773452759, + "learning_rate": 2.806494577281664e-05, + "loss": 0.8257, + "step": 6920 + }, + { + "epoch": 0.440159827487791, + "grad_norm": 0.7691190838813782, + "learning_rate": 2.8001522166550394e-05, + "loss": 0.8365, + "step": 6940 + }, + { + "epoch": 0.441428299613116, + "grad_norm": 0.7433714270591736, + "learning_rate": 2.7938098560284136e-05, + "loss": 0.7755, + "step": 6960 + }, + { + "epoch": 0.4426967717384411, + "grad_norm": 0.7270233035087585, + "learning_rate": 2.787467495401789e-05, + "loss": 0.808, + "step": 6980 + }, + { + "epoch": 0.44396524386376607, + "grad_norm": 0.7907856106758118, + "learning_rate": 2.781125134775163e-05, + "loss": 0.8038, + "step": 7000 + }, + { + "epoch": 0.4452337159890911, + "grad_norm": 0.7421363592147827, + "learning_rate": 2.7747827741485384e-05, + "loss": 0.8058, + "step": 7020 + }, + { + "epoch": 0.44650218811441617, + "grad_norm": 0.8635361790657043, + "learning_rate": 2.7684404135219126e-05, + "loss": 0.786, + "step": 7040 + }, + { + "epoch": 0.4477706602397412, + "grad_norm": 0.9545580744743347, + "learning_rate": 2.762098052895288e-05, + "loss": 0.8431, + "step": 7060 + }, + { + "epoch": 0.44903913236506626, + "grad_norm": 0.9529020190238953, + "learning_rate": 2.755755692268662e-05, + "loss": 0.8082, + "step": 7080 + }, + { + "epoch": 0.4503076044903913, + "grad_norm": 0.8344403505325317, + "learning_rate": 2.7494133316420374e-05, + "loss": 0.8031, + "step": 7100 + }, + { + "epoch": 0.45157607661571636, + "grad_norm": 0.8490266799926758, + "learning_rate": 2.7430709710154123e-05, + "loss": 0.7952, + "step": 7120 + }, + { + "epoch": 0.4528445487410414, + "grad_norm": 0.8201053738594055, + "learning_rate": 2.736728610388787e-05, + "loss": 0.8057, + "step": 7140 + }, + { + "epoch": 0.45411302086636646, + "grad_norm": 0.7206814289093018, + "learning_rate": 2.7303862497621618e-05, + "loss": 0.7907, + "step": 7160 + }, + { + "epoch": 0.4553814929916915, + "grad_norm": 0.7151837944984436, + "learning_rate": 2.7240438891355364e-05, + "loss": 0.7989, + "step": 7180 + }, + { + "epoch": 0.45664996511701655, + "grad_norm": 0.73557049036026, + "learning_rate": 2.7177015285089113e-05, + "loss": 0.7887, + "step": 7200 + }, + { + "epoch": 0.4579184372423416, + "grad_norm": 0.8012831807136536, + "learning_rate": 2.711359167882286e-05, + "loss": 0.7963, + "step": 7220 + }, + { + "epoch": 0.45918690936766665, + "grad_norm": 0.7544090747833252, + "learning_rate": 2.7050168072556608e-05, + "loss": 0.8021, + "step": 7240 + }, + { + "epoch": 0.4604553814929917, + "grad_norm": 0.8027962446212769, + "learning_rate": 2.6986744466290354e-05, + "loss": 0.8346, + "step": 7260 + }, + { + "epoch": 0.46172385361831675, + "grad_norm": 0.8969400525093079, + "learning_rate": 2.6923320860024103e-05, + "loss": 0.8362, + "step": 7280 + }, + { + "epoch": 0.4629923257436418, + "grad_norm": 0.8505738377571106, + "learning_rate": 2.685989725375785e-05, + "loss": 0.8253, + "step": 7300 + }, + { + "epoch": 0.46426079786896685, + "grad_norm": 0.8005324602127075, + "learning_rate": 2.6796473647491598e-05, + "loss": 0.8133, + "step": 7320 + }, + { + "epoch": 0.4655292699942919, + "grad_norm": 0.8265887498855591, + "learning_rate": 2.6733050041225344e-05, + "loss": 0.7974, + "step": 7340 + }, + { + "epoch": 0.46679774211961694, + "grad_norm": 0.8310080170631409, + "learning_rate": 2.6669626434959093e-05, + "loss": 0.8311, + "step": 7360 + }, + { + "epoch": 0.468066214244942, + "grad_norm": 0.8837007284164429, + "learning_rate": 2.660620282869284e-05, + "loss": 0.8086, + "step": 7380 + }, + { + "epoch": 0.46933468637026704, + "grad_norm": 0.8574205040931702, + "learning_rate": 2.6542779222426588e-05, + "loss": 0.8045, + "step": 7400 + }, + { + "epoch": 0.47060315849559203, + "grad_norm": 0.7924466729164124, + "learning_rate": 2.6479355616160334e-05, + "loss": 0.831, + "step": 7420 + }, + { + "epoch": 0.4718716306209171, + "grad_norm": 0.8812252283096313, + "learning_rate": 2.6415932009894083e-05, + "loss": 0.8414, + "step": 7440 + }, + { + "epoch": 0.47314010274624213, + "grad_norm": 0.8879112601280212, + "learning_rate": 2.635250840362783e-05, + "loss": 0.8037, + "step": 7460 + }, + { + "epoch": 0.4744085748715672, + "grad_norm": 0.8532351851463318, + "learning_rate": 2.6289084797361578e-05, + "loss": 0.7961, + "step": 7480 + }, + { + "epoch": 0.4756770469968922, + "grad_norm": 0.7800135016441345, + "learning_rate": 2.6225661191095324e-05, + "loss": 0.8158, + "step": 7500 + }, + { + "epoch": 0.4769455191222173, + "grad_norm": 0.8264251947402954, + "learning_rate": 2.6162237584829073e-05, + "loss": 0.7902, + "step": 7520 + }, + { + "epoch": 0.4782139912475423, + "grad_norm": 0.9216287136077881, + "learning_rate": 2.6098813978562826e-05, + "loss": 0.8387, + "step": 7540 + }, + { + "epoch": 0.4794824633728674, + "grad_norm": 0.8331848382949829, + "learning_rate": 2.6035390372296568e-05, + "loss": 0.8309, + "step": 7560 + }, + { + "epoch": 0.4807509354981924, + "grad_norm": 0.7791485786437988, + "learning_rate": 2.597196676603032e-05, + "loss": 0.7954, + "step": 7580 + }, + { + "epoch": 0.48201940762351747, + "grad_norm": 0.8223782777786255, + "learning_rate": 2.5908543159764063e-05, + "loss": 0.8012, + "step": 7600 + }, + { + "epoch": 0.4832878797488425, + "grad_norm": 0.7362112998962402, + "learning_rate": 2.5845119553497816e-05, + "loss": 0.8548, + "step": 7620 + }, + { + "epoch": 0.48455635187416757, + "grad_norm": 0.9084497094154358, + "learning_rate": 2.578169594723156e-05, + "loss": 0.7889, + "step": 7640 + }, + { + "epoch": 0.4858248239994926, + "grad_norm": 0.8268200755119324, + "learning_rate": 2.571827234096531e-05, + "loss": 0.778, + "step": 7660 + }, + { + "epoch": 0.48709329612481767, + "grad_norm": 0.683861494064331, + "learning_rate": 2.5654848734699057e-05, + "loss": 0.8079, + "step": 7680 + }, + { + "epoch": 0.4883617682501427, + "grad_norm": 0.9449877142906189, + "learning_rate": 2.5591425128432806e-05, + "loss": 0.7967, + "step": 7700 + }, + { + "epoch": 0.48963024037546776, + "grad_norm": 0.8445014357566833, + "learning_rate": 2.552800152216655e-05, + "loss": 0.7774, + "step": 7720 + }, + { + "epoch": 0.4908987125007928, + "grad_norm": 0.811717689037323, + "learning_rate": 2.54645779159003e-05, + "loss": 0.7659, + "step": 7740 + }, + { + "epoch": 0.49216718462611786, + "grad_norm": 0.7867732644081116, + "learning_rate": 2.5401154309634047e-05, + "loss": 0.8539, + "step": 7760 + }, + { + "epoch": 0.4934356567514429, + "grad_norm": 0.7808852791786194, + "learning_rate": 2.5337730703367796e-05, + "loss": 0.8294, + "step": 7780 + }, + { + "epoch": 0.49470412887676796, + "grad_norm": 0.8998913168907166, + "learning_rate": 2.527430709710154e-05, + "loss": 0.8507, + "step": 7800 + }, + { + "epoch": 0.495972601002093, + "grad_norm": 0.7990160584449768, + "learning_rate": 2.521088349083529e-05, + "loss": 0.7261, + "step": 7820 + }, + { + "epoch": 0.497241073127418, + "grad_norm": 0.7361629605293274, + "learning_rate": 2.5147459884569037e-05, + "loss": 0.8058, + "step": 7840 + }, + { + "epoch": 0.49850954525274305, + "grad_norm": 0.849870502948761, + "learning_rate": 2.5084036278302786e-05, + "loss": 0.7708, + "step": 7860 + }, + { + "epoch": 0.4997780173780681, + "grad_norm": 0.7667344808578491, + "learning_rate": 2.502061267203653e-05, + "loss": 0.818, + "step": 7880 + }, + { + "epoch": 0.5010464895033931, + "grad_norm": 0.800609827041626, + "learning_rate": 2.495718906577028e-05, + "loss": 0.8248, + "step": 7900 + }, + { + "epoch": 0.5023149616287182, + "grad_norm": 0.7429226040840149, + "learning_rate": 2.489376545950403e-05, + "loss": 0.8395, + "step": 7920 + }, + { + "epoch": 0.5035834337540432, + "grad_norm": 0.7970502972602844, + "learning_rate": 2.4830341853237776e-05, + "loss": 0.8182, + "step": 7940 + }, + { + "epoch": 0.5048519058793683, + "grad_norm": 1.7285821437835693, + "learning_rate": 2.4766918246971525e-05, + "loss": 0.8241, + "step": 7960 + }, + { + "epoch": 0.5061203780046933, + "grad_norm": 0.8314895629882812, + "learning_rate": 2.470349464070527e-05, + "loss": 0.8017, + "step": 7980 + }, + { + "epoch": 0.5073888501300184, + "grad_norm": 0.9516363143920898, + "learning_rate": 2.464007103443902e-05, + "loss": 0.8395, + "step": 8000 + }, + { + "epoch": 0.5086573222553434, + "grad_norm": 0.8164798617362976, + "learning_rate": 2.4576647428172766e-05, + "loss": 0.8033, + "step": 8020 + }, + { + "epoch": 0.5099257943806685, + "grad_norm": 0.7700650691986084, + "learning_rate": 2.4513223821906515e-05, + "loss": 0.779, + "step": 8040 + }, + { + "epoch": 0.5111942665059935, + "grad_norm": 0.8437737226486206, + "learning_rate": 2.444980021564026e-05, + "loss": 0.8112, + "step": 8060 + }, + { + "epoch": 0.5124627386313185, + "grad_norm": 0.8371322751045227, + "learning_rate": 2.438637660937401e-05, + "loss": 0.8152, + "step": 8080 + }, + { + "epoch": 0.5137312107566436, + "grad_norm": 0.8382763862609863, + "learning_rate": 2.4322953003107756e-05, + "loss": 0.8414, + "step": 8100 + }, + { + "epoch": 0.5149996828819686, + "grad_norm": 0.9525557160377502, + "learning_rate": 2.4259529396841505e-05, + "loss": 0.8091, + "step": 8120 + }, + { + "epoch": 0.5162681550072937, + "grad_norm": 0.7620564699172974, + "learning_rate": 2.419610579057525e-05, + "loss": 0.7981, + "step": 8140 + }, + { + "epoch": 0.5175366271326187, + "grad_norm": 0.8722305297851562, + "learning_rate": 2.4132682184309003e-05, + "loss": 0.8079, + "step": 8160 + }, + { + "epoch": 0.5188050992579438, + "grad_norm": 0.8774722218513489, + "learning_rate": 2.406925857804275e-05, + "loss": 0.7937, + "step": 8180 + }, + { + "epoch": 0.5200735713832688, + "grad_norm": 0.7515254616737366, + "learning_rate": 2.40058349717765e-05, + "loss": 0.8134, + "step": 8200 + }, + { + "epoch": 0.5213420435085939, + "grad_norm": 0.8385280966758728, + "learning_rate": 2.3942411365510244e-05, + "loss": 0.8064, + "step": 8220 + }, + { + "epoch": 0.5226105156339189, + "grad_norm": 0.8530700206756592, + "learning_rate": 2.3878987759243994e-05, + "loss": 0.8249, + "step": 8240 + }, + { + "epoch": 0.523878987759244, + "grad_norm": 0.7866977453231812, + "learning_rate": 2.381556415297774e-05, + "loss": 0.803, + "step": 8260 + }, + { + "epoch": 0.525147459884569, + "grad_norm": 0.8509036302566528, + "learning_rate": 2.375214054671149e-05, + "loss": 0.8307, + "step": 8280 + }, + { + "epoch": 0.5264159320098941, + "grad_norm": 0.8268348574638367, + "learning_rate": 2.3688716940445234e-05, + "loss": 0.8205, + "step": 8300 + }, + { + "epoch": 0.5276844041352191, + "grad_norm": 0.9176819920539856, + "learning_rate": 2.3625293334178984e-05, + "loss": 0.7774, + "step": 8320 + }, + { + "epoch": 0.5289528762605442, + "grad_norm": 0.758176326751709, + "learning_rate": 2.356186972791273e-05, + "loss": 0.8067, + "step": 8340 + }, + { + "epoch": 0.5302213483858692, + "grad_norm": 0.7369076609611511, + "learning_rate": 2.349844612164648e-05, + "loss": 0.7673, + "step": 8360 + }, + { + "epoch": 0.5314898205111943, + "grad_norm": 0.8413040041923523, + "learning_rate": 2.3435022515380224e-05, + "loss": 0.8289, + "step": 8380 + }, + { + "epoch": 0.5327582926365193, + "grad_norm": 0.8975269794464111, + "learning_rate": 2.3371598909113974e-05, + "loss": 0.8097, + "step": 8400 + }, + { + "epoch": 0.5340267647618444, + "grad_norm": 0.8501763343811035, + "learning_rate": 2.330817530284772e-05, + "loss": 0.7867, + "step": 8420 + }, + { + "epoch": 0.5352952368871694, + "grad_norm": 0.9364180564880371, + "learning_rate": 2.324475169658147e-05, + "loss": 0.7925, + "step": 8440 + }, + { + "epoch": 0.5365637090124945, + "grad_norm": 0.6347882151603699, + "learning_rate": 2.3181328090315214e-05, + "loss": 0.7635, + "step": 8460 + }, + { + "epoch": 0.5378321811378195, + "grad_norm": 0.8539864420890808, + "learning_rate": 2.3117904484048964e-05, + "loss": 0.7993, + "step": 8480 + }, + { + "epoch": 0.5391006532631445, + "grad_norm": 0.8893634080886841, + "learning_rate": 2.305448087778271e-05, + "loss": 0.7809, + "step": 8500 + }, + { + "epoch": 0.5403691253884696, + "grad_norm": 0.7993662357330322, + "learning_rate": 2.299105727151646e-05, + "loss": 0.8639, + "step": 8520 + }, + { + "epoch": 0.5416375975137946, + "grad_norm": 0.8157054781913757, + "learning_rate": 2.2927633665250208e-05, + "loss": 0.8121, + "step": 8540 + }, + { + "epoch": 0.5429060696391197, + "grad_norm": 0.8141036033630371, + "learning_rate": 2.2864210058983954e-05, + "loss": 0.7748, + "step": 8560 + }, + { + "epoch": 0.5441745417644447, + "grad_norm": 0.8311188220977783, + "learning_rate": 2.2803957633031015e-05, + "loss": 0.8182, + "step": 8580 + }, + { + "epoch": 0.5454430138897698, + "grad_norm": 0.8923128247261047, + "learning_rate": 2.274053402676476e-05, + "loss": 0.7841, + "step": 8600 + }, + { + "epoch": 0.5467114860150948, + "grad_norm": 0.8246520757675171, + "learning_rate": 2.267711042049851e-05, + "loss": 0.8146, + "step": 8620 + }, + { + "epoch": 0.5479799581404199, + "grad_norm": 0.8469933271408081, + "learning_rate": 2.261368681423226e-05, + "loss": 0.817, + "step": 8640 + }, + { + "epoch": 0.5492484302657449, + "grad_norm": 0.8211717009544373, + "learning_rate": 2.2550263207966005e-05, + "loss": 0.8082, + "step": 8660 + }, + { + "epoch": 0.55051690239107, + "grad_norm": 0.9137957692146301, + "learning_rate": 2.2486839601699754e-05, + "loss": 0.7691, + "step": 8680 + }, + { + "epoch": 0.551785374516395, + "grad_norm": 0.8431654572486877, + "learning_rate": 2.2423415995433503e-05, + "loss": 0.812, + "step": 8700 + }, + { + "epoch": 0.5530538466417201, + "grad_norm": 0.9029563069343567, + "learning_rate": 2.235999238916725e-05, + "loss": 0.8667, + "step": 8720 + }, + { + "epoch": 0.5543223187670451, + "grad_norm": 0.8180502653121948, + "learning_rate": 2.2296568782900998e-05, + "loss": 0.8171, + "step": 8740 + }, + { + "epoch": 0.5555907908923702, + "grad_norm": 0.844530463218689, + "learning_rate": 2.2233145176634744e-05, + "loss": 0.7784, + "step": 8760 + }, + { + "epoch": 0.5568592630176952, + "grad_norm": 0.7153404355049133, + "learning_rate": 2.2169721570368493e-05, + "loss": 0.7891, + "step": 8780 + }, + { + "epoch": 0.5581277351430203, + "grad_norm": 0.7020410895347595, + "learning_rate": 2.2106297964102242e-05, + "loss": 0.8042, + "step": 8800 + }, + { + "epoch": 0.5593962072683453, + "grad_norm": 0.7581042647361755, + "learning_rate": 2.2042874357835988e-05, + "loss": 0.8337, + "step": 8820 + }, + { + "epoch": 0.5606646793936704, + "grad_norm": 0.7392009496688843, + "learning_rate": 2.1979450751569737e-05, + "loss": 0.8151, + "step": 8840 + }, + { + "epoch": 0.5619331515189954, + "grad_norm": 0.8381578326225281, + "learning_rate": 2.1916027145303483e-05, + "loss": 0.7923, + "step": 8860 + }, + { + "epoch": 0.5632016236443205, + "grad_norm": 1.0505058765411377, + "learning_rate": 2.1852603539037232e-05, + "loss": 0.8117, + "step": 8880 + }, + { + "epoch": 0.5644700957696455, + "grad_norm": 0.67955082654953, + "learning_rate": 2.1789179932770978e-05, + "loss": 0.7921, + "step": 8900 + }, + { + "epoch": 0.5657385678949705, + "grad_norm": 0.798687219619751, + "learning_rate": 2.1725756326504727e-05, + "loss": 0.8038, + "step": 8920 + }, + { + "epoch": 0.5670070400202956, + "grad_norm": 0.989431619644165, + "learning_rate": 2.1662332720238473e-05, + "loss": 0.8317, + "step": 8940 + }, + { + "epoch": 0.5682755121456206, + "grad_norm": 0.8161944150924683, + "learning_rate": 2.1598909113972222e-05, + "loss": 0.8222, + "step": 8960 + }, + { + "epoch": 0.5695439842709457, + "grad_norm": 0.8795542120933533, + "learning_rate": 2.1535485507705968e-05, + "loss": 0.7717, + "step": 8980 + }, + { + "epoch": 0.5708124563962707, + "grad_norm": 0.7453576326370239, + "learning_rate": 2.1472061901439717e-05, + "loss": 0.805, + "step": 9000 + }, + { + "epoch": 0.5720809285215958, + "grad_norm": 0.8081907033920288, + "learning_rate": 2.1408638295173463e-05, + "loss": 0.7757, + "step": 9020 + }, + { + "epoch": 0.5733494006469207, + "grad_norm": 0.7817357778549194, + "learning_rate": 2.1345214688907212e-05, + "loss": 0.7528, + "step": 9040 + }, + { + "epoch": 0.5746178727722459, + "grad_norm": 0.8645827770233154, + "learning_rate": 2.1281791082640958e-05, + "loss": 0.7713, + "step": 9060 + }, + { + "epoch": 0.5758863448975708, + "grad_norm": 0.8567843437194824, + "learning_rate": 2.1218367476374707e-05, + "loss": 0.7447, + "step": 9080 + }, + { + "epoch": 0.577154817022896, + "grad_norm": 0.7494439482688904, + "learning_rate": 2.1154943870108453e-05, + "loss": 0.7661, + "step": 9100 + }, + { + "epoch": 0.5784232891482209, + "grad_norm": 0.8079215884208679, + "learning_rate": 2.1091520263842206e-05, + "loss": 0.8119, + "step": 9120 + }, + { + "epoch": 0.579691761273546, + "grad_norm": 0.9019980430603027, + "learning_rate": 2.102809665757595e-05, + "loss": 0.8125, + "step": 9140 + }, + { + "epoch": 0.580960233398871, + "grad_norm": 0.7452351450920105, + "learning_rate": 2.09646730513097e-05, + "loss": 0.7691, + "step": 9160 + }, + { + "epoch": 0.5822287055241961, + "grad_norm": 0.7727750539779663, + "learning_rate": 2.0901249445043446e-05, + "loss": 0.7925, + "step": 9180 + }, + { + "epoch": 0.5834971776495211, + "grad_norm": 0.7486307621002197, + "learning_rate": 2.0837825838777196e-05, + "loss": 0.7668, + "step": 9200 + }, + { + "epoch": 0.5847656497748462, + "grad_norm": 0.8719222545623779, + "learning_rate": 2.077440223251094e-05, + "loss": 0.8196, + "step": 9220 + }, + { + "epoch": 0.5860341219001712, + "grad_norm": 0.7641133069992065, + "learning_rate": 2.071097862624469e-05, + "loss": 0.7559, + "step": 9240 + }, + { + "epoch": 0.5873025940254963, + "grad_norm": 0.8036416172981262, + "learning_rate": 2.0647555019978436e-05, + "loss": 0.8285, + "step": 9260 + }, + { + "epoch": 0.5885710661508213, + "grad_norm": 0.8614276051521301, + "learning_rate": 2.0584131413712186e-05, + "loss": 0.7951, + "step": 9280 + }, + { + "epoch": 0.5898395382761464, + "grad_norm": 0.8406545519828796, + "learning_rate": 2.052070780744593e-05, + "loss": 0.7714, + "step": 9300 + }, + { + "epoch": 0.5911080104014714, + "grad_norm": 0.9403005838394165, + "learning_rate": 2.045728420117968e-05, + "loss": 0.7999, + "step": 9320 + }, + { + "epoch": 0.5923764825267964, + "grad_norm": 0.8395708799362183, + "learning_rate": 2.0393860594913426e-05, + "loss": 0.8069, + "step": 9340 + }, + { + "epoch": 0.5936449546521215, + "grad_norm": 0.8432602286338806, + "learning_rate": 2.0330436988647176e-05, + "loss": 0.8189, + "step": 9360 + }, + { + "epoch": 0.5949134267774465, + "grad_norm": 0.7362537980079651, + "learning_rate": 2.026701338238092e-05, + "loss": 0.8069, + "step": 9380 + }, + { + "epoch": 0.5961818989027716, + "grad_norm": 0.7601738572120667, + "learning_rate": 2.020358977611467e-05, + "loss": 0.7449, + "step": 9400 + }, + { + "epoch": 0.5974503710280966, + "grad_norm": 0.8012720346450806, + "learning_rate": 2.0140166169848416e-05, + "loss": 0.7831, + "step": 9420 + }, + { + "epoch": 0.5987188431534217, + "grad_norm": 0.7670310139656067, + "learning_rate": 2.0076742563582166e-05, + "loss": 0.8393, + "step": 9440 + }, + { + "epoch": 0.5999873152787467, + "grad_norm": 0.8244422674179077, + "learning_rate": 2.001331895731591e-05, + "loss": 0.7709, + "step": 9460 + }, + { + "epoch": 0.6012557874040718, + "grad_norm": 0.7943612933158875, + "learning_rate": 1.994989535104966e-05, + "loss": 0.819, + "step": 9480 + }, + { + "epoch": 0.6025242595293968, + "grad_norm": 0.9540635347366333, + "learning_rate": 1.9886471744783406e-05, + "loss": 0.7899, + "step": 9500 + }, + { + "epoch": 0.6037927316547219, + "grad_norm": 0.9198821783065796, + "learning_rate": 1.9823048138517156e-05, + "loss": 0.8239, + "step": 9520 + }, + { + "epoch": 0.6050612037800469, + "grad_norm": 0.7837796807289124, + "learning_rate": 1.9759624532250905e-05, + "loss": 0.7929, + "step": 9540 + }, + { + "epoch": 0.606329675905372, + "grad_norm": 0.8205187320709229, + "learning_rate": 1.9696200925984654e-05, + "loss": 0.819, + "step": 9560 + }, + { + "epoch": 0.607598148030697, + "grad_norm": 0.8532772064208984, + "learning_rate": 1.96327773197184e-05, + "loss": 0.7455, + "step": 9580 + }, + { + "epoch": 0.6088666201560221, + "grad_norm": 0.8524623513221741, + "learning_rate": 1.956935371345215e-05, + "loss": 0.8089, + "step": 9600 + }, + { + "epoch": 0.6101350922813471, + "grad_norm": 0.8614479899406433, + "learning_rate": 1.9505930107185895e-05, + "loss": 0.8059, + "step": 9620 + }, + { + "epoch": 0.6114035644066722, + "grad_norm": 0.7598078846931458, + "learning_rate": 1.9442506500919644e-05, + "loss": 0.7832, + "step": 9640 + }, + { + "epoch": 0.6126720365319972, + "grad_norm": 0.809009850025177, + "learning_rate": 1.937908289465339e-05, + "loss": 0.7259, + "step": 9660 + }, + { + "epoch": 0.6139405086573223, + "grad_norm": 0.7381779551506042, + "learning_rate": 1.931565928838714e-05, + "loss": 0.7676, + "step": 9680 + }, + { + "epoch": 0.6152089807826473, + "grad_norm": 0.8887180685997009, + "learning_rate": 1.9252235682120888e-05, + "loss": 0.8126, + "step": 9700 + }, + { + "epoch": 0.6164774529079724, + "grad_norm": 0.7270573973655701, + "learning_rate": 1.9188812075854634e-05, + "loss": 0.7786, + "step": 9720 + }, + { + "epoch": 0.6177459250332974, + "grad_norm": 0.7978057861328125, + "learning_rate": 1.9125388469588383e-05, + "loss": 0.8281, + "step": 9740 + }, + { + "epoch": 0.6190143971586224, + "grad_norm": 0.8202372789382935, + "learning_rate": 1.906196486332213e-05, + "loss": 0.7656, + "step": 9760 + }, + { + "epoch": 0.6202828692839475, + "grad_norm": 0.9720300436019897, + "learning_rate": 1.8998541257055878e-05, + "loss": 0.7881, + "step": 9780 + }, + { + "epoch": 0.6215513414092725, + "grad_norm": 0.9297833442687988, + "learning_rate": 1.8935117650789624e-05, + "loss": 0.8237, + "step": 9800 + }, + { + "epoch": 0.6228198135345976, + "grad_norm": 0.7593715190887451, + "learning_rate": 1.8871694044523373e-05, + "loss": 0.7574, + "step": 9820 + }, + { + "epoch": 0.6240882856599226, + "grad_norm": 0.8537524938583374, + "learning_rate": 1.880827043825712e-05, + "loss": 0.7969, + "step": 9840 + }, + { + "epoch": 0.6253567577852477, + "grad_norm": 0.770918607711792, + "learning_rate": 1.8744846831990868e-05, + "loss": 0.7894, + "step": 9860 + }, + { + "epoch": 0.6266252299105727, + "grad_norm": 0.7605695724487305, + "learning_rate": 1.8681423225724614e-05, + "loss": 0.782, + "step": 9880 + }, + { + "epoch": 0.6278937020358978, + "grad_norm": 0.8978208303451538, + "learning_rate": 1.8617999619458363e-05, + "loss": 0.7979, + "step": 9900 + }, + { + "epoch": 0.6291621741612228, + "grad_norm": 0.7393850088119507, + "learning_rate": 1.855457601319211e-05, + "loss": 0.8139, + "step": 9920 + }, + { + "epoch": 0.6304306462865479, + "grad_norm": 0.7255131602287292, + "learning_rate": 1.8491152406925858e-05, + "loss": 0.7802, + "step": 9940 + }, + { + "epoch": 0.6316991184118729, + "grad_norm": 0.7080028653144836, + "learning_rate": 1.8427728800659607e-05, + "loss": 0.8059, + "step": 9960 + }, + { + "epoch": 0.632967590537198, + "grad_norm": 0.8282076716423035, + "learning_rate": 1.8364305194393357e-05, + "loss": 0.7905, + "step": 9980 + }, + { + "epoch": 0.634236062662523, + "grad_norm": 0.8741589784622192, + "learning_rate": 1.8300881588127102e-05, + "loss": 0.8174, + "step": 10000 + }, + { + "epoch": 0.6355045347878481, + "grad_norm": 0.7435175776481628, + "learning_rate": 1.823745798186085e-05, + "loss": 0.7678, + "step": 10020 + }, + { + "epoch": 0.6367730069131731, + "grad_norm": 0.7347603440284729, + "learning_rate": 1.817720555590791e-05, + "loss": 0.8225, + "step": 10040 + }, + { + "epoch": 0.6380414790384982, + "grad_norm": 0.8974965214729309, + "learning_rate": 1.811378194964166e-05, + "loss": 0.7766, + "step": 10060 + }, + { + "epoch": 0.6393099511638232, + "grad_norm": 0.7255268692970276, + "learning_rate": 1.8050358343375408e-05, + "loss": 0.8, + "step": 10080 + }, + { + "epoch": 0.6405784232891483, + "grad_norm": 0.7062020897865295, + "learning_rate": 1.7986934737109154e-05, + "loss": 0.7088, + "step": 10100 + }, + { + "epoch": 0.6418468954144733, + "grad_norm": 0.8076253533363342, + "learning_rate": 1.7923511130842903e-05, + "loss": 0.8152, + "step": 10120 + }, + { + "epoch": 0.6431153675397984, + "grad_norm": 0.8340699672698975, + "learning_rate": 1.786008752457665e-05, + "loss": 0.7985, + "step": 10140 + }, + { + "epoch": 0.6443838396651234, + "grad_norm": 0.7522137761116028, + "learning_rate": 1.7796663918310398e-05, + "loss": 0.7704, + "step": 10160 + }, + { + "epoch": 0.6456523117904484, + "grad_norm": 0.8227932453155518, + "learning_rate": 1.7733240312044144e-05, + "loss": 0.828, + "step": 10180 + }, + { + "epoch": 0.6469207839157735, + "grad_norm": 0.7742383480072021, + "learning_rate": 1.7669816705777893e-05, + "loss": 0.7893, + "step": 10200 + }, + { + "epoch": 0.6481892560410984, + "grad_norm": 0.7038094401359558, + "learning_rate": 1.760639309951164e-05, + "loss": 0.7699, + "step": 10220 + }, + { + "epoch": 0.6494577281664236, + "grad_norm": 0.8382614850997925, + "learning_rate": 1.7542969493245388e-05, + "loss": 0.8254, + "step": 10240 + }, + { + "epoch": 0.6507262002917485, + "grad_norm": 0.9173989295959473, + "learning_rate": 1.7479545886979134e-05, + "loss": 0.7603, + "step": 10260 + }, + { + "epoch": 0.6519946724170737, + "grad_norm": 0.7602284550666809, + "learning_rate": 1.7416122280712883e-05, + "loss": 0.8019, + "step": 10280 + }, + { + "epoch": 0.6532631445423986, + "grad_norm": 0.8012353181838989, + "learning_rate": 1.735269867444663e-05, + "loss": 0.7944, + "step": 10300 + }, + { + "epoch": 0.6545316166677237, + "grad_norm": 0.8844314217567444, + "learning_rate": 1.7289275068180378e-05, + "loss": 0.7785, + "step": 10320 + }, + { + "epoch": 0.6558000887930487, + "grad_norm": 0.7556779980659485, + "learning_rate": 1.7225851461914124e-05, + "loss": 0.7597, + "step": 10340 + }, + { + "epoch": 0.6570685609183738, + "grad_norm": 0.8446857929229736, + "learning_rate": 1.7162427855647873e-05, + "loss": 0.7941, + "step": 10360 + }, + { + "epoch": 0.6583370330436988, + "grad_norm": 0.7313318848609924, + "learning_rate": 1.709900424938162e-05, + "loss": 0.8017, + "step": 10380 + }, + { + "epoch": 0.6596055051690239, + "grad_norm": 0.8298467397689819, + "learning_rate": 1.7035580643115368e-05, + "loss": 0.7869, + "step": 10400 + }, + { + "epoch": 0.6608739772943489, + "grad_norm": 0.8003538846969604, + "learning_rate": 1.6972157036849114e-05, + "loss": 0.8002, + "step": 10420 + }, + { + "epoch": 0.662142449419674, + "grad_norm": 0.7555122971534729, + "learning_rate": 1.6908733430582863e-05, + "loss": 0.7632, + "step": 10440 + }, + { + "epoch": 0.663410921544999, + "grad_norm": 0.7712675333023071, + "learning_rate": 1.684530982431661e-05, + "loss": 0.7441, + "step": 10460 + }, + { + "epoch": 0.6646793936703241, + "grad_norm": 0.6845158338546753, + "learning_rate": 1.6781886218050358e-05, + "loss": 0.7384, + "step": 10480 + }, + { + "epoch": 0.6659478657956491, + "grad_norm": 0.8500059843063354, + "learning_rate": 1.6718462611784107e-05, + "loss": 0.8052, + "step": 10500 + }, + { + "epoch": 0.6672163379209742, + "grad_norm": 0.759861946105957, + "learning_rate": 1.6655039005517856e-05, + "loss": 0.828, + "step": 10520 + }, + { + "epoch": 0.6684848100462992, + "grad_norm": 0.7759114503860474, + "learning_rate": 1.6591615399251602e-05, + "loss": 0.7719, + "step": 10540 + }, + { + "epoch": 0.6697532821716243, + "grad_norm": 0.8368454575538635, + "learning_rate": 1.652819179298535e-05, + "loss": 0.8383, + "step": 10560 + }, + { + "epoch": 0.6710217542969493, + "grad_norm": 0.8691524267196655, + "learning_rate": 1.6464768186719097e-05, + "loss": 0.7822, + "step": 10580 + }, + { + "epoch": 0.6722902264222743, + "grad_norm": 0.8464477062225342, + "learning_rate": 1.6401344580452846e-05, + "loss": 0.7802, + "step": 10600 + }, + { + "epoch": 0.6735586985475994, + "grad_norm": 0.796231210231781, + "learning_rate": 1.6337920974186592e-05, + "loss": 0.7821, + "step": 10620 + }, + { + "epoch": 0.6748271706729244, + "grad_norm": 0.7409220933914185, + "learning_rate": 1.627449736792034e-05, + "loss": 0.8203, + "step": 10640 + }, + { + "epoch": 0.6760956427982495, + "grad_norm": 0.7760050892829895, + "learning_rate": 1.6211073761654087e-05, + "loss": 0.7673, + "step": 10660 + }, + { + "epoch": 0.6773641149235745, + "grad_norm": 0.7795297503471375, + "learning_rate": 1.6147650155387836e-05, + "loss": 0.7545, + "step": 10680 + }, + { + "epoch": 0.6786325870488996, + "grad_norm": 0.8562922477722168, + "learning_rate": 1.6084226549121585e-05, + "loss": 0.7744, + "step": 10700 + }, + { + "epoch": 0.6799010591742246, + "grad_norm": 0.8879472613334656, + "learning_rate": 1.602080294285533e-05, + "loss": 0.7775, + "step": 10720 + }, + { + "epoch": 0.6811695312995497, + "grad_norm": 0.674929141998291, + "learning_rate": 1.595737933658908e-05, + "loss": 0.8155, + "step": 10740 + }, + { + "epoch": 0.6824380034248747, + "grad_norm": 0.8436025381088257, + "learning_rate": 1.5893955730322826e-05, + "loss": 0.7855, + "step": 10760 + }, + { + "epoch": 0.6837064755501998, + "grad_norm": 0.7950330972671509, + "learning_rate": 1.5830532124056575e-05, + "loss": 0.8171, + "step": 10780 + }, + { + "epoch": 0.6849749476755248, + "grad_norm": 0.7402753233909607, + "learning_rate": 1.576710851779032e-05, + "loss": 0.7543, + "step": 10800 + }, + { + "epoch": 0.6862434198008499, + "grad_norm": 0.7969671487808228, + "learning_rate": 1.570368491152407e-05, + "loss": 0.811, + "step": 10820 + }, + { + "epoch": 0.6875118919261749, + "grad_norm": 0.9241589307785034, + "learning_rate": 1.5640261305257816e-05, + "loss": 0.8234, + "step": 10840 + }, + { + "epoch": 0.6887803640515, + "grad_norm": 0.8808215260505676, + "learning_rate": 1.5576837698991565e-05, + "loss": 0.7646, + "step": 10860 + }, + { + "epoch": 0.690048836176825, + "grad_norm": 0.7900111675262451, + "learning_rate": 1.551341409272531e-05, + "loss": 0.7461, + "step": 10880 + }, + { + "epoch": 0.6913173083021501, + "grad_norm": 0.9008402824401855, + "learning_rate": 1.544999048645906e-05, + "loss": 0.7693, + "step": 10900 + }, + { + "epoch": 0.6925857804274751, + "grad_norm": 0.925081729888916, + "learning_rate": 1.538656688019281e-05, + "loss": 0.8174, + "step": 10920 + }, + { + "epoch": 0.6938542525528002, + "grad_norm": 0.8141810297966003, + "learning_rate": 1.532314327392656e-05, + "loss": 0.8009, + "step": 10940 + }, + { + "epoch": 0.6951227246781252, + "grad_norm": 0.8973850011825562, + "learning_rate": 1.5259719667660305e-05, + "loss": 0.7731, + "step": 10960 + }, + { + "epoch": 0.6963911968034503, + "grad_norm": 0.7652609348297119, + "learning_rate": 1.5196296061394052e-05, + "loss": 0.7751, + "step": 10980 + }, + { + "epoch": 0.6976596689287753, + "grad_norm": 0.8361225724220276, + "learning_rate": 1.51328724551278e-05, + "loss": 0.7551, + "step": 11000 + }, + { + "epoch": 0.6989281410541003, + "grad_norm": 0.7935757040977478, + "learning_rate": 1.5069448848861547e-05, + "loss": 0.8139, + "step": 11020 + }, + { + "epoch": 0.7001966131794254, + "grad_norm": 0.7135019898414612, + "learning_rate": 1.5006025242595295e-05, + "loss": 0.7768, + "step": 11040 + }, + { + "epoch": 0.7014650853047504, + "grad_norm": 0.811869204044342, + "learning_rate": 1.4942601636329042e-05, + "loss": 0.7697, + "step": 11060 + }, + { + "epoch": 0.7027335574300755, + "grad_norm": 0.9030170440673828, + "learning_rate": 1.487917803006279e-05, + "loss": 0.8206, + "step": 11080 + }, + { + "epoch": 0.7040020295554005, + "grad_norm": 0.765082836151123, + "learning_rate": 1.4815754423796537e-05, + "loss": 0.8204, + "step": 11100 + }, + { + "epoch": 0.7052705016807256, + "grad_norm": 0.7715885639190674, + "learning_rate": 1.4752330817530285e-05, + "loss": 0.7718, + "step": 11120 + }, + { + "epoch": 0.7065389738060506, + "grad_norm": 0.7729353904724121, + "learning_rate": 1.4688907211264034e-05, + "loss": 0.8165, + "step": 11140 + }, + { + "epoch": 0.7078074459313757, + "grad_norm": 0.6622787117958069, + "learning_rate": 1.4625483604997781e-05, + "loss": 0.7618, + "step": 11160 + }, + { + "epoch": 0.7090759180567007, + "grad_norm": 0.820572555065155, + "learning_rate": 1.4562059998731529e-05, + "loss": 0.7887, + "step": 11180 + }, + { + "epoch": 0.7103443901820258, + "grad_norm": 0.7710301876068115, + "learning_rate": 1.4498636392465276e-05, + "loss": 0.7712, + "step": 11200 + }, + { + "epoch": 0.7116128623073508, + "grad_norm": 0.8138539791107178, + "learning_rate": 1.4435212786199024e-05, + "loss": 0.7989, + "step": 11220 + }, + { + "epoch": 0.7128813344326759, + "grad_norm": 0.7800792455673218, + "learning_rate": 1.4371789179932771e-05, + "loss": 0.7641, + "step": 11240 + }, + { + "epoch": 0.7141498065580009, + "grad_norm": 0.809686005115509, + "learning_rate": 1.4308365573666519e-05, + "loss": 0.8053, + "step": 11260 + }, + { + "epoch": 0.715418278683326, + "grad_norm": 0.8002369403839111, + "learning_rate": 1.4244941967400266e-05, + "loss": 0.8044, + "step": 11280 + }, + { + "epoch": 0.716686750808651, + "grad_norm": 0.8907930850982666, + "learning_rate": 1.4181518361134014e-05, + "loss": 0.7896, + "step": 11300 + }, + { + "epoch": 0.7179552229339761, + "grad_norm": 0.8205035328865051, + "learning_rate": 1.4118094754867761e-05, + "loss": 0.7609, + "step": 11320 + }, + { + "epoch": 0.7192236950593011, + "grad_norm": 0.7667264342308044, + "learning_rate": 1.4054671148601512e-05, + "loss": 0.8022, + "step": 11340 + }, + { + "epoch": 0.7204921671846262, + "grad_norm": 0.7035322189331055, + "learning_rate": 1.399124754233526e-05, + "loss": 0.7969, + "step": 11360 + }, + { + "epoch": 0.7217606393099512, + "grad_norm": 0.7853593230247498, + "learning_rate": 1.3927823936069007e-05, + "loss": 0.7839, + "step": 11380 + }, + { + "epoch": 0.7230291114352763, + "grad_norm": 0.9023504853248596, + "learning_rate": 1.3864400329802755e-05, + "loss": 0.7867, + "step": 11400 + }, + { + "epoch": 0.7242975835606013, + "grad_norm": 0.8038562536239624, + "learning_rate": 1.3800976723536502e-05, + "loss": 0.76, + "step": 11420 + }, + { + "epoch": 0.7255660556859262, + "grad_norm": 0.8277421593666077, + "learning_rate": 1.373755311727025e-05, + "loss": 0.8377, + "step": 11440 + }, + { + "epoch": 0.7268345278112514, + "grad_norm": 0.7307552099227905, + "learning_rate": 1.3674129511003997e-05, + "loss": 0.7962, + "step": 11460 + }, + { + "epoch": 0.7281029999365763, + "grad_norm": 0.7248812913894653, + "learning_rate": 1.3610705904737745e-05, + "loss": 0.7655, + "step": 11480 + }, + { + "epoch": 0.7293714720619014, + "grad_norm": 0.8142716288566589, + "learning_rate": 1.3547282298471492e-05, + "loss": 0.7874, + "step": 11500 + }, + { + "epoch": 0.7306399441872264, + "grad_norm": 0.8528370261192322, + "learning_rate": 1.348385869220524e-05, + "loss": 0.7478, + "step": 11520 + }, + { + "epoch": 0.7319084163125515, + "grad_norm": 0.7856337428092957, + "learning_rate": 1.3420435085938987e-05, + "loss": 0.7682, + "step": 11540 + }, + { + "epoch": 0.7331768884378765, + "grad_norm": 0.8709967136383057, + "learning_rate": 1.3357011479672735e-05, + "loss": 0.7951, + "step": 11560 + }, + { + "epoch": 0.7344453605632016, + "grad_norm": 0.7879327535629272, + "learning_rate": 1.3293587873406482e-05, + "loss": 0.8052, + "step": 11580 + }, + { + "epoch": 0.7357138326885266, + "grad_norm": 0.7124823331832886, + "learning_rate": 1.323016426714023e-05, + "loss": 0.7847, + "step": 11600 + }, + { + "epoch": 0.7369823048138517, + "grad_norm": 0.8641963601112366, + "learning_rate": 1.3166740660873977e-05, + "loss": 0.7953, + "step": 11620 + }, + { + "epoch": 0.7382507769391767, + "grad_norm": 0.777748167514801, + "learning_rate": 1.3103317054607725e-05, + "loss": 0.7786, + "step": 11640 + }, + { + "epoch": 0.7395192490645018, + "grad_norm": 0.9086549878120422, + "learning_rate": 1.3039893448341472e-05, + "loss": 0.7954, + "step": 11660 + }, + { + "epoch": 0.7407877211898268, + "grad_norm": 0.7550273537635803, + "learning_rate": 1.297646984207522e-05, + "loss": 0.7679, + "step": 11680 + }, + { + "epoch": 0.7420561933151519, + "grad_norm": 0.8174465894699097, + "learning_rate": 1.2913046235808967e-05, + "loss": 0.7829, + "step": 11700 + }, + { + "epoch": 0.7433246654404769, + "grad_norm": 0.8319543600082397, + "learning_rate": 1.2849622629542715e-05, + "loss": 0.7592, + "step": 11720 + }, + { + "epoch": 0.744593137565802, + "grad_norm": 0.7246963381767273, + "learning_rate": 1.2786199023276462e-05, + "loss": 0.7925, + "step": 11740 + }, + { + "epoch": 0.745861609691127, + "grad_norm": 0.7811394333839417, + "learning_rate": 1.2722775417010213e-05, + "loss": 0.8063, + "step": 11760 + }, + { + "epoch": 0.7471300818164521, + "grad_norm": 0.9180453419685364, + "learning_rate": 1.266252299105727e-05, + "loss": 0.8391, + "step": 11780 + }, + { + "epoch": 0.7483985539417771, + "grad_norm": 0.6986908912658691, + "learning_rate": 1.2599099384791018e-05, + "loss": 0.7773, + "step": 11800 + }, + { + "epoch": 0.7496670260671022, + "grad_norm": 0.8293908834457397, + "learning_rate": 1.2535675778524766e-05, + "loss": 0.7888, + "step": 11820 + }, + { + "epoch": 0.7509354981924272, + "grad_norm": 0.9369567036628723, + "learning_rate": 1.2472252172258515e-05, + "loss": 0.772, + "step": 11840 + }, + { + "epoch": 0.7522039703177522, + "grad_norm": 0.884286105632782, + "learning_rate": 1.2408828565992263e-05, + "loss": 0.7806, + "step": 11860 + }, + { + "epoch": 0.7534724424430773, + "grad_norm": 0.749497652053833, + "learning_rate": 1.234540495972601e-05, + "loss": 0.7501, + "step": 11880 + }, + { + "epoch": 0.7547409145684023, + "grad_norm": 0.6741966605186462, + "learning_rate": 1.2281981353459758e-05, + "loss": 0.7672, + "step": 11900 + }, + { + "epoch": 0.7560093866937274, + "grad_norm": 0.8107251524925232, + "learning_rate": 1.2218557747193505e-05, + "loss": 0.7765, + "step": 11920 + }, + { + "epoch": 0.7572778588190524, + "grad_norm": 0.9146373867988586, + "learning_rate": 1.2155134140927253e-05, + "loss": 0.7462, + "step": 11940 + }, + { + "epoch": 0.7585463309443775, + "grad_norm": 0.9027043581008911, + "learning_rate": 1.2091710534661e-05, + "loss": 0.7819, + "step": 11960 + }, + { + "epoch": 0.7598148030697025, + "grad_norm": 0.7713417410850525, + "learning_rate": 1.202828692839475e-05, + "loss": 0.7684, + "step": 11980 + }, + { + "epoch": 0.7610832751950276, + "grad_norm": 0.8822270631790161, + "learning_rate": 1.1964863322128497e-05, + "loss": 0.7524, + "step": 12000 + }, + { + "epoch": 0.7623517473203526, + "grad_norm": 0.8402985334396362, + "learning_rate": 1.1901439715862244e-05, + "loss": 0.795, + "step": 12020 + }, + { + "epoch": 0.7636202194456777, + "grad_norm": 0.7556558847427368, + "learning_rate": 1.1838016109595992e-05, + "loss": 0.773, + "step": 12040 + }, + { + "epoch": 0.7648886915710027, + "grad_norm": 0.7098413705825806, + "learning_rate": 1.177459250332974e-05, + "loss": 0.7888, + "step": 12060 + }, + { + "epoch": 0.7661571636963278, + "grad_norm": 0.6865963935852051, + "learning_rate": 1.1711168897063487e-05, + "loss": 0.7844, + "step": 12080 + }, + { + "epoch": 0.7674256358216528, + "grad_norm": 0.9354507923126221, + "learning_rate": 1.1647745290797234e-05, + "loss": 0.8064, + "step": 12100 + }, + { + "epoch": 0.7686941079469779, + "grad_norm": 0.8983631134033203, + "learning_rate": 1.1584321684530983e-05, + "loss": 0.759, + "step": 12120 + }, + { + "epoch": 0.7699625800723029, + "grad_norm": 0.9061852693557739, + "learning_rate": 1.1520898078264731e-05, + "loss": 0.7959, + "step": 12140 + }, + { + "epoch": 0.771231052197628, + "grad_norm": 0.8606493473052979, + "learning_rate": 1.1457474471998478e-05, + "loss": 0.7926, + "step": 12160 + }, + { + "epoch": 0.772499524322953, + "grad_norm": 0.9167592525482178, + "learning_rate": 1.1394050865732226e-05, + "loss": 0.7813, + "step": 12180 + }, + { + "epoch": 0.7737679964482781, + "grad_norm": 0.866223931312561, + "learning_rate": 1.1330627259465975e-05, + "loss": 0.7827, + "step": 12200 + }, + { + "epoch": 0.7750364685736031, + "grad_norm": 0.720427930355072, + "learning_rate": 1.1267203653199723e-05, + "loss": 0.813, + "step": 12220 + }, + { + "epoch": 0.7763049406989282, + "grad_norm": 0.8172628283500671, + "learning_rate": 1.120378004693347e-05, + "loss": 0.7733, + "step": 12240 + }, + { + "epoch": 0.7775734128242532, + "grad_norm": 0.741121768951416, + "learning_rate": 1.1140356440667218e-05, + "loss": 0.7434, + "step": 12260 + }, + { + "epoch": 0.7788418849495782, + "grad_norm": 0.723564624786377, + "learning_rate": 1.1076932834400965e-05, + "loss": 0.7789, + "step": 12280 + }, + { + "epoch": 0.7801103570749033, + "grad_norm": 0.9289072155952454, + "learning_rate": 1.1013509228134713e-05, + "loss": 0.7894, + "step": 12300 + }, + { + "epoch": 0.7813788292002283, + "grad_norm": 0.8132310509681702, + "learning_rate": 1.095008562186846e-05, + "loss": 0.8153, + "step": 12320 + }, + { + "epoch": 0.7826473013255534, + "grad_norm": 0.967943549156189, + "learning_rate": 1.0886662015602208e-05, + "loss": 0.7821, + "step": 12340 + }, + { + "epoch": 0.7839157734508784, + "grad_norm": 0.7738404273986816, + "learning_rate": 1.0823238409335955e-05, + "loss": 0.7855, + "step": 12360 + }, + { + "epoch": 0.7851842455762035, + "grad_norm": 0.8411769270896912, + "learning_rate": 1.0759814803069703e-05, + "loss": 0.7988, + "step": 12380 + }, + { + "epoch": 0.7864527177015285, + "grad_norm": 0.8962435722351074, + "learning_rate": 1.0696391196803452e-05, + "loss": 0.8321, + "step": 12400 + }, + { + "epoch": 0.7877211898268536, + "grad_norm": 0.7484604716300964, + "learning_rate": 1.06329675905372e-05, + "loss": 0.7686, + "step": 12420 + }, + { + "epoch": 0.7889896619521786, + "grad_norm": 0.802546501159668, + "learning_rate": 1.0569543984270947e-05, + "loss": 0.7904, + "step": 12440 + }, + { + "epoch": 0.7902581340775037, + "grad_norm": 0.7103933691978455, + "learning_rate": 1.0506120378004694e-05, + "loss": 0.756, + "step": 12460 + }, + { + "epoch": 0.7915266062028287, + "grad_norm": 0.6866100430488586, + "learning_rate": 1.0442696771738442e-05, + "loss": 0.7736, + "step": 12480 + }, + { + "epoch": 0.7927950783281538, + "grad_norm": 0.7697407603263855, + "learning_rate": 1.037927316547219e-05, + "loss": 0.7796, + "step": 12500 + }, + { + "epoch": 0.7940635504534788, + "grad_norm": 0.8658385276794434, + "learning_rate": 1.0315849559205937e-05, + "loss": 0.787, + "step": 12520 + }, + { + "epoch": 0.7953320225788039, + "grad_norm": 0.8282449245452881, + "learning_rate": 1.0252425952939684e-05, + "loss": 0.7666, + "step": 12540 + }, + { + "epoch": 0.7966004947041289, + "grad_norm": 0.8376625776290894, + "learning_rate": 1.0189002346673432e-05, + "loss": 0.7313, + "step": 12560 + }, + { + "epoch": 0.797868966829454, + "grad_norm": 0.8002750277519226, + "learning_rate": 1.012557874040718e-05, + "loss": 0.8, + "step": 12580 + }, + { + "epoch": 0.799137438954779, + "grad_norm": 0.7849326729774475, + "learning_rate": 1.0062155134140929e-05, + "loss": 0.7761, + "step": 12600 + }, + { + "epoch": 0.8004059110801041, + "grad_norm": 0.8501541018486023, + "learning_rate": 9.998731527874676e-06, + "loss": 0.7845, + "step": 12620 + }, + { + "epoch": 0.801674383205429, + "grad_norm": 0.7818264365196228, + "learning_rate": 9.935307921608424e-06, + "loss": 0.7669, + "step": 12640 + }, + { + "epoch": 0.8029428553307542, + "grad_norm": 0.9117188453674316, + "learning_rate": 9.871884315342171e-06, + "loss": 0.7578, + "step": 12660 + }, + { + "epoch": 0.8042113274560792, + "grad_norm": 0.8058929443359375, + "learning_rate": 9.808460709075919e-06, + "loss": 0.7676, + "step": 12680 + }, + { + "epoch": 0.8054797995814041, + "grad_norm": 0.8033195734024048, + "learning_rate": 9.745037102809666e-06, + "loss": 0.8151, + "step": 12700 + }, + { + "epoch": 0.8067482717067292, + "grad_norm": 0.898897647857666, + "learning_rate": 9.681613496543414e-06, + "loss": 0.8267, + "step": 12720 + }, + { + "epoch": 0.8080167438320542, + "grad_norm": 0.9970609545707703, + "learning_rate": 9.618189890277161e-06, + "loss": 0.8075, + "step": 12740 + }, + { + "epoch": 0.8092852159573793, + "grad_norm": 0.904344916343689, + "learning_rate": 9.554766284010909e-06, + "loss": 0.8, + "step": 12760 + }, + { + "epoch": 0.8105536880827043, + "grad_norm": 0.8318148255348206, + "learning_rate": 9.491342677744656e-06, + "loss": 0.8027, + "step": 12780 + }, + { + "epoch": 0.8118221602080294, + "grad_norm": 0.8471246957778931, + "learning_rate": 9.427919071478404e-06, + "loss": 0.7587, + "step": 12800 + }, + { + "epoch": 0.8130906323333544, + "grad_norm": 0.7848266363143921, + "learning_rate": 9.364495465212153e-06, + "loss": 0.7499, + "step": 12820 + }, + { + "epoch": 0.8143591044586795, + "grad_norm": 0.9037428498268127, + "learning_rate": 9.3010718589459e-06, + "loss": 0.7865, + "step": 12840 + }, + { + "epoch": 0.8156275765840045, + "grad_norm": 0.7049270868301392, + "learning_rate": 9.237648252679648e-06, + "loss": 0.817, + "step": 12860 + }, + { + "epoch": 0.8168960487093296, + "grad_norm": 0.7613449096679688, + "learning_rate": 9.174224646413395e-06, + "loss": 0.806, + "step": 12880 + }, + { + "epoch": 0.8181645208346546, + "grad_norm": 0.7704141139984131, + "learning_rate": 9.110801040147143e-06, + "loss": 0.7307, + "step": 12900 + }, + { + "epoch": 0.8194329929599797, + "grad_norm": 0.707279622554779, + "learning_rate": 9.04737743388089e-06, + "loss": 0.7838, + "step": 12920 + }, + { + "epoch": 0.8207014650853047, + "grad_norm": 0.7817753553390503, + "learning_rate": 8.983953827614638e-06, + "loss": 0.7994, + "step": 12940 + }, + { + "epoch": 0.8219699372106298, + "grad_norm": 0.8321487307548523, + "learning_rate": 8.920530221348385e-06, + "loss": 0.7843, + "step": 12960 + }, + { + "epoch": 0.8232384093359548, + "grad_norm": 0.799281120300293, + "learning_rate": 8.857106615082133e-06, + "loss": 0.771, + "step": 12980 + }, + { + "epoch": 0.8245068814612799, + "grad_norm": 0.8843486309051514, + "learning_rate": 8.79368300881588e-06, + "loss": 0.8269, + "step": 13000 + }, + { + "epoch": 0.8257753535866049, + "grad_norm": 0.6699514985084534, + "learning_rate": 8.73025940254963e-06, + "loss": 0.7398, + "step": 13020 + }, + { + "epoch": 0.82704382571193, + "grad_norm": 0.7868858575820923, + "learning_rate": 8.666835796283377e-06, + "loss": 0.7779, + "step": 13040 + }, + { + "epoch": 0.828312297837255, + "grad_norm": 0.8733574151992798, + "learning_rate": 8.603412190017125e-06, + "loss": 0.7977, + "step": 13060 + }, + { + "epoch": 0.8295807699625801, + "grad_norm": 0.7439238429069519, + "learning_rate": 8.539988583750872e-06, + "loss": 0.7587, + "step": 13080 + }, + { + "epoch": 0.8308492420879051, + "grad_norm": 0.8214549422264099, + "learning_rate": 8.476564977484621e-06, + "loss": 0.8181, + "step": 13100 + }, + { + "epoch": 0.8321177142132302, + "grad_norm": 0.8577607870101929, + "learning_rate": 8.413141371218369e-06, + "loss": 0.793, + "step": 13120 + }, + { + "epoch": 0.8333861863385552, + "grad_norm": 0.6957492828369141, + "learning_rate": 8.349717764952116e-06, + "loss": 0.755, + "step": 13140 + }, + { + "epoch": 0.8346546584638802, + "grad_norm": 0.981088399887085, + "learning_rate": 8.286294158685864e-06, + "loss": 0.7769, + "step": 13160 + }, + { + "epoch": 0.8359231305892053, + "grad_norm": 0.7333866357803345, + "learning_rate": 8.222870552419611e-06, + "loss": 0.7536, + "step": 13180 + }, + { + "epoch": 0.8371916027145303, + "grad_norm": 0.8152589201927185, + "learning_rate": 8.159446946153359e-06, + "loss": 0.7497, + "step": 13200 + }, + { + "epoch": 0.8384600748398554, + "grad_norm": 0.8962567448616028, + "learning_rate": 8.096023339887106e-06, + "loss": 0.7838, + "step": 13220 + }, + { + "epoch": 0.8397285469651804, + "grad_norm": 0.6861271262168884, + "learning_rate": 8.032599733620855e-06, + "loss": 0.777, + "step": 13240 + }, + { + "epoch": 0.8409970190905055, + "grad_norm": 0.7273656725883484, + "learning_rate": 7.969176127354603e-06, + "loss": 0.7345, + "step": 13260 + }, + { + "epoch": 0.8422654912158305, + "grad_norm": 0.7643877267837524, + "learning_rate": 7.90575252108835e-06, + "loss": 0.7837, + "step": 13280 + }, + { + "epoch": 0.8435339633411556, + "grad_norm": 0.695196270942688, + "learning_rate": 7.842328914822098e-06, + "loss": 0.7361, + "step": 13300 + }, + { + "epoch": 0.8448024354664806, + "grad_norm": 0.6783697009086609, + "learning_rate": 7.778905308555845e-06, + "loss": 0.7315, + "step": 13320 + }, + { + "epoch": 0.8460709075918057, + "grad_norm": 0.8633202314376831, + "learning_rate": 7.715481702289593e-06, + "loss": 0.7713, + "step": 13340 + }, + { + "epoch": 0.8473393797171307, + "grad_norm": 0.7902844548225403, + "learning_rate": 7.65205809602334e-06, + "loss": 0.7732, + "step": 13360 + }, + { + "epoch": 0.8486078518424558, + "grad_norm": 1.1263785362243652, + "learning_rate": 7.588634489757088e-06, + "loss": 0.7549, + "step": 13380 + }, + { + "epoch": 0.8498763239677808, + "grad_norm": 0.7141507863998413, + "learning_rate": 7.5252108834908354e-06, + "loss": 0.7498, + "step": 13400 + }, + { + "epoch": 0.8511447960931059, + "grad_norm": 0.7436708211898804, + "learning_rate": 7.461787277224583e-06, + "loss": 0.7525, + "step": 13420 + }, + { + "epoch": 0.8524132682184309, + "grad_norm": 0.7840022444725037, + "learning_rate": 7.3983636709583304e-06, + "loss": 0.7658, + "step": 13440 + }, + { + "epoch": 0.853681740343756, + "grad_norm": 0.7560069561004639, + "learning_rate": 7.33494006469208e-06, + "loss": 0.7946, + "step": 13460 + }, + { + "epoch": 0.854950212469081, + "grad_norm": 0.7361060976982117, + "learning_rate": 7.271516458425827e-06, + "loss": 0.7722, + "step": 13480 + }, + { + "epoch": 0.8562186845944061, + "grad_norm": 0.8141864538192749, + "learning_rate": 7.208092852159575e-06, + "loss": 0.751, + "step": 13500 + }, + { + "epoch": 0.8574871567197311, + "grad_norm": 0.7860879898071289, + "learning_rate": 7.144669245893322e-06, + "loss": 0.7401, + "step": 13520 + }, + { + "epoch": 0.8587556288450562, + "grad_norm": 1.1111942529678345, + "learning_rate": 7.08124563962707e-06, + "loss": 0.803, + "step": 13540 + }, + { + "epoch": 0.8600241009703812, + "grad_norm": 0.7983526587486267, + "learning_rate": 7.017822033360817e-06, + "loss": 0.7963, + "step": 13560 + }, + { + "epoch": 0.8612925730957062, + "grad_norm": 0.7415090799331665, + "learning_rate": 6.954398427094565e-06, + "loss": 0.7997, + "step": 13580 + }, + { + "epoch": 0.8625610452210313, + "grad_norm": 0.8375813364982605, + "learning_rate": 6.890974820828312e-06, + "loss": 0.8052, + "step": 13600 + }, + { + "epoch": 0.8638295173463563, + "grad_norm": 0.8622868657112122, + "learning_rate": 6.82755121456206e-06, + "loss": 0.7816, + "step": 13620 + }, + { + "epoch": 0.8650979894716814, + "grad_norm": 0.9229819774627686, + "learning_rate": 6.764127608295807e-06, + "loss": 0.7558, + "step": 13640 + }, + { + "epoch": 0.8663664615970064, + "grad_norm": 0.8334788084030151, + "learning_rate": 6.700704002029556e-06, + "loss": 0.7648, + "step": 13660 + }, + { + "epoch": 0.8676349337223315, + "grad_norm": 0.8499231338500977, + "learning_rate": 6.637280395763304e-06, + "loss": 0.7954, + "step": 13680 + }, + { + "epoch": 0.8689034058476565, + "grad_norm": 0.8455031514167786, + "learning_rate": 6.573856789497051e-06, + "loss": 0.7973, + "step": 13700 + }, + { + "epoch": 0.8701718779729816, + "grad_norm": 0.7882249355316162, + "learning_rate": 6.510433183230799e-06, + "loss": 0.8031, + "step": 13720 + }, + { + "epoch": 0.8714403500983066, + "grad_norm": 0.7251647114753723, + "learning_rate": 6.447009576964546e-06, + "loss": 0.7581, + "step": 13740 + }, + { + "epoch": 0.8727088222236317, + "grad_norm": 0.8899182081222534, + "learning_rate": 6.383585970698294e-06, + "loss": 0.7981, + "step": 13760 + }, + { + "epoch": 0.8739772943489567, + "grad_norm": 0.7742959260940552, + "learning_rate": 6.320162364432042e-06, + "loss": 0.7606, + "step": 13780 + }, + { + "epoch": 0.8752457664742818, + "grad_norm": 0.7180908918380737, + "learning_rate": 6.25673875816579e-06, + "loss": 0.8301, + "step": 13800 + }, + { + "epoch": 0.8765142385996068, + "grad_norm": 0.7653104066848755, + "learning_rate": 6.193315151899537e-06, + "loss": 0.8017, + "step": 13820 + }, + { + "epoch": 0.8777827107249319, + "grad_norm": 0.802506148815155, + "learning_rate": 6.1298915456332856e-06, + "loss": 0.7634, + "step": 13840 + }, + { + "epoch": 0.8790511828502569, + "grad_norm": 0.882520318031311, + "learning_rate": 6.066467939367033e-06, + "loss": 0.777, + "step": 13860 + }, + { + "epoch": 0.880319654975582, + "grad_norm": 0.7464948892593384, + "learning_rate": 6.0030443331007806e-06, + "loss": 0.7495, + "step": 13880 + }, + { + "epoch": 0.881588127100907, + "grad_norm": 0.7769840359687805, + "learning_rate": 5.939620726834528e-06, + "loss": 0.7562, + "step": 13900 + }, + { + "epoch": 0.882856599226232, + "grad_norm": 0.9281843304634094, + "learning_rate": 5.8761971205682756e-06, + "loss": 0.7845, + "step": 13920 + }, + { + "epoch": 0.884125071351557, + "grad_norm": 0.801986813545227, + "learning_rate": 5.812773514302024e-06, + "loss": 0.7301, + "step": 13940 + }, + { + "epoch": 0.8853935434768821, + "grad_norm": 0.8647619485855103, + "learning_rate": 5.749349908035771e-06, + "loss": 0.7693, + "step": 13960 + }, + { + "epoch": 0.8866620156022071, + "grad_norm": 0.8235803246498108, + "learning_rate": 5.685926301769519e-06, + "loss": 0.7972, + "step": 13980 + }, + { + "epoch": 0.8879304877275321, + "grad_norm": 0.8338538408279419, + "learning_rate": 5.622502695503266e-06, + "loss": 0.7951, + "step": 14000 + }, + { + "epoch": 0.8891989598528572, + "grad_norm": 0.704741895198822, + "learning_rate": 5.559079089237014e-06, + "loss": 0.7446, + "step": 14020 + }, + { + "epoch": 0.8904674319781822, + "grad_norm": 0.7731455564498901, + "learning_rate": 5.495655482970762e-06, + "loss": 0.7266, + "step": 14040 + }, + { + "epoch": 0.8917359041035073, + "grad_norm": 0.779869794845581, + "learning_rate": 5.43223187670451e-06, + "loss": 0.7638, + "step": 14060 + }, + { + "epoch": 0.8930043762288323, + "grad_norm": 0.7645334005355835, + "learning_rate": 5.368808270438257e-06, + "loss": 0.7868, + "step": 14080 + }, + { + "epoch": 0.8942728483541574, + "grad_norm": 0.8010347485542297, + "learning_rate": 5.305384664172005e-06, + "loss": 0.8038, + "step": 14100 + }, + { + "epoch": 0.8955413204794824, + "grad_norm": 0.830556333065033, + "learning_rate": 5.241961057905752e-06, + "loss": 0.7774, + "step": 14120 + }, + { + "epoch": 0.8968097926048075, + "grad_norm": 0.9386767745018005, + "learning_rate": 5.178537451639501e-06, + "loss": 0.7591, + "step": 14140 + }, + { + "epoch": 0.8980782647301325, + "grad_norm": 0.8357464671134949, + "learning_rate": 5.115113845373248e-06, + "loss": 0.7642, + "step": 14160 + }, + { + "epoch": 0.8993467368554576, + "grad_norm": 0.7423475384712219, + "learning_rate": 5.051690239106996e-06, + "loss": 0.7586, + "step": 14180 + }, + { + "epoch": 0.9006152089807826, + "grad_norm": 0.8751846551895142, + "learning_rate": 4.988266632840743e-06, + "loss": 0.7536, + "step": 14200 + }, + { + "epoch": 0.9018836811061077, + "grad_norm": 0.8088937997817993, + "learning_rate": 4.924843026574491e-06, + "loss": 0.7747, + "step": 14220 + }, + { + "epoch": 0.9031521532314327, + "grad_norm": 0.7831218242645264, + "learning_rate": 4.861419420308239e-06, + "loss": 0.7741, + "step": 14240 + }, + { + "epoch": 0.9044206253567578, + "grad_norm": 0.8346021175384521, + "learning_rate": 4.7979958140419865e-06, + "loss": 0.8034, + "step": 14260 + }, + { + "epoch": 0.9056890974820828, + "grad_norm": 0.7575668692588806, + "learning_rate": 4.734572207775735e-06, + "loss": 0.7866, + "step": 14280 + }, + { + "epoch": 0.9069575696074079, + "grad_norm": 0.8374447822570801, + "learning_rate": 4.671148601509482e-06, + "loss": 0.7808, + "step": 14300 + }, + { + "epoch": 0.9082260417327329, + "grad_norm": 0.7750478982925415, + "learning_rate": 4.60772499524323e-06, + "loss": 0.7628, + "step": 14320 + }, + { + "epoch": 0.909494513858058, + "grad_norm": 0.8722181916236877, + "learning_rate": 4.54747256929029e-06, + "loss": 0.7744, + "step": 14340 + }, + { + "epoch": 0.910762985983383, + "grad_norm": 0.8774197101593018, + "learning_rate": 4.484048963024038e-06, + "loss": 0.8039, + "step": 14360 + }, + { + "epoch": 0.9120314581087081, + "grad_norm": 0.677240788936615, + "learning_rate": 4.420625356757786e-06, + "loss": 0.7648, + "step": 14380 + }, + { + "epoch": 0.9132999302340331, + "grad_norm": 0.8468155264854431, + "learning_rate": 4.357201750491533e-06, + "loss": 0.7494, + "step": 14400 + }, + { + "epoch": 0.9145684023593581, + "grad_norm": 0.8869547247886658, + "learning_rate": 4.293778144225281e-06, + "loss": 0.7713, + "step": 14420 + }, + { + "epoch": 0.9158368744846832, + "grad_norm": 0.8307056427001953, + "learning_rate": 4.230354537959028e-06, + "loss": 0.7784, + "step": 14440 + }, + { + "epoch": 0.9171053466100082, + "grad_norm": 0.8972311019897461, + "learning_rate": 4.166930931692777e-06, + "loss": 0.7653, + "step": 14460 + }, + { + "epoch": 0.9183738187353333, + "grad_norm": 0.7319260835647583, + "learning_rate": 4.103507325426524e-06, + "loss": 0.7802, + "step": 14480 + }, + { + "epoch": 0.9196422908606583, + "grad_norm": 0.8915144801139832, + "learning_rate": 4.040083719160272e-06, + "loss": 0.7709, + "step": 14500 + }, + { + "epoch": 0.9209107629859834, + "grad_norm": 1.0568279027938843, + "learning_rate": 3.976660112894019e-06, + "loss": 0.7961, + "step": 14520 + }, + { + "epoch": 0.9221792351113084, + "grad_norm": 0.9176843166351318, + "learning_rate": 3.913236506627767e-06, + "loss": 0.7758, + "step": 14540 + }, + { + "epoch": 0.9234477072366335, + "grad_norm": 0.8266422152519226, + "learning_rate": 3.849812900361515e-06, + "loss": 0.7896, + "step": 14560 + }, + { + "epoch": 0.9247161793619585, + "grad_norm": 0.8376505970954895, + "learning_rate": 3.7863892940952626e-06, + "loss": 0.7529, + "step": 14580 + }, + { + "epoch": 0.9259846514872836, + "grad_norm": 0.7567136287689209, + "learning_rate": 3.72296568782901e-06, + "loss": 0.7361, + "step": 14600 + }, + { + "epoch": 0.9272531236126086, + "grad_norm": 0.8554660677909851, + "learning_rate": 3.6595420815627576e-06, + "loss": 0.7808, + "step": 14620 + }, + { + "epoch": 0.9285215957379337, + "grad_norm": 0.8139016032218933, + "learning_rate": 3.5961184752965055e-06, + "loss": 0.7743, + "step": 14640 + }, + { + "epoch": 0.9297900678632587, + "grad_norm": 0.8457810282707214, + "learning_rate": 3.532694869030253e-06, + "loss": 0.7431, + "step": 14660 + }, + { + "epoch": 0.9310585399885838, + "grad_norm": 0.7392221093177795, + "learning_rate": 3.469271262764001e-06, + "loss": 0.7678, + "step": 14680 + }, + { + "epoch": 0.9323270121139088, + "grad_norm": 0.74676913022995, + "learning_rate": 3.405847656497749e-06, + "loss": 0.7739, + "step": 14700 + }, + { + "epoch": 0.9335954842392339, + "grad_norm": 0.9203459620475769, + "learning_rate": 3.3424240502314964e-06, + "loss": 0.7746, + "step": 14720 + }, + { + "epoch": 0.9348639563645589, + "grad_norm": 0.8364453911781311, + "learning_rate": 3.279000443965244e-06, + "loss": 0.8001, + "step": 14740 + }, + { + "epoch": 0.936132428489884, + "grad_norm": 0.8923636078834534, + "learning_rate": 3.2155768376989914e-06, + "loss": 0.7831, + "step": 14760 + }, + { + "epoch": 0.937400900615209, + "grad_norm": 0.8334746360778809, + "learning_rate": 3.1521532314327397e-06, + "loss": 0.7731, + "step": 14780 + }, + { + "epoch": 0.9386693727405341, + "grad_norm": 0.7588502764701843, + "learning_rate": 3.0887296251664873e-06, + "loss": 0.7625, + "step": 14800 + }, + { + "epoch": 0.9399378448658591, + "grad_norm": 0.7889774441719055, + "learning_rate": 3.0253060189002348e-06, + "loss": 0.7354, + "step": 14820 + }, + { + "epoch": 0.9412063169911841, + "grad_norm": 0.7171176671981812, + "learning_rate": 2.9618824126339823e-06, + "loss": 0.7341, + "step": 14840 + }, + { + "epoch": 0.9424747891165092, + "grad_norm": 0.9590219259262085, + "learning_rate": 2.89845880636773e-06, + "loss": 0.8243, + "step": 14860 + }, + { + "epoch": 0.9437432612418342, + "grad_norm": 0.8210035562515259, + "learning_rate": 2.8350352001014777e-06, + "loss": 0.7863, + "step": 14880 + }, + { + "epoch": 0.9450117333671593, + "grad_norm": 0.9011463522911072, + "learning_rate": 2.7716115938352256e-06, + "loss": 0.7325, + "step": 14900 + }, + { + "epoch": 0.9462802054924843, + "grad_norm": 0.8472886681556702, + "learning_rate": 2.7081879875689735e-06, + "loss": 0.7409, + "step": 14920 + }, + { + "epoch": 0.9475486776178094, + "grad_norm": 0.8105041980743408, + "learning_rate": 2.644764381302721e-06, + "loss": 0.7598, + "step": 14940 + }, + { + "epoch": 0.9488171497431344, + "grad_norm": 0.9121299386024475, + "learning_rate": 2.581340775036469e-06, + "loss": 0.8026, + "step": 14960 + }, + { + "epoch": 0.9500856218684595, + "grad_norm": 0.7272994518280029, + "learning_rate": 2.5179171687702165e-06, + "loss": 0.771, + "step": 14980 + }, + { + "epoch": 0.9513540939937845, + "grad_norm": 0.9553162455558777, + "learning_rate": 2.454493562503964e-06, + "loss": 0.7562, + "step": 15000 + }, + { + "epoch": 0.9526225661191096, + "grad_norm": 0.775790810585022, + "learning_rate": 2.391069956237712e-06, + "loss": 0.7682, + "step": 15020 + }, + { + "epoch": 0.9538910382444346, + "grad_norm": 0.7874183654785156, + "learning_rate": 2.3276463499714594e-06, + "loss": 0.7706, + "step": 15040 + }, + { + "epoch": 0.9551595103697597, + "grad_norm": 0.9175487756729126, + "learning_rate": 2.2642227437052073e-06, + "loss": 0.7904, + "step": 15060 + }, + { + "epoch": 0.9564279824950846, + "grad_norm": 0.7636487483978271, + "learning_rate": 2.200799137438955e-06, + "loss": 0.7568, + "step": 15080 + }, + { + "epoch": 0.9576964546204098, + "grad_norm": 0.8234472870826721, + "learning_rate": 2.1373755311727023e-06, + "loss": 0.7902, + "step": 15100 + }, + { + "epoch": 0.9589649267457347, + "grad_norm": 0.7153974175453186, + "learning_rate": 2.0739519249064502e-06, + "loss": 0.7305, + "step": 15120 + }, + { + "epoch": 0.9602333988710599, + "grad_norm": 0.8177540302276611, + "learning_rate": 2.010528318640198e-06, + "loss": 0.7584, + "step": 15140 + }, + { + "epoch": 0.9615018709963848, + "grad_norm": 0.7617529630661011, + "learning_rate": 1.9471047123739457e-06, + "loss": 0.74, + "step": 15160 + }, + { + "epoch": 0.96277034312171, + "grad_norm": 0.8607798218727112, + "learning_rate": 1.8836811061076934e-06, + "loss": 0.7867, + "step": 15180 + }, + { + "epoch": 0.9640388152470349, + "grad_norm": 0.7388313412666321, + "learning_rate": 1.820257499841441e-06, + "loss": 0.7561, + "step": 15200 + }, + { + "epoch": 0.96530728737236, + "grad_norm": 0.8492052555084229, + "learning_rate": 1.756833893575189e-06, + "loss": 0.7769, + "step": 15220 + }, + { + "epoch": 0.966575759497685, + "grad_norm": 0.7551003694534302, + "learning_rate": 1.6934102873089365e-06, + "loss": 0.7641, + "step": 15240 + }, + { + "epoch": 0.96784423162301, + "grad_norm": 0.8290461897850037, + "learning_rate": 1.629986681042684e-06, + "loss": 0.7537, + "step": 15260 + }, + { + "epoch": 0.9691127037483351, + "grad_norm": 0.8409222960472107, + "learning_rate": 1.566563074776432e-06, + "loss": 0.7591, + "step": 15280 + }, + { + "epoch": 0.9703811758736601, + "grad_norm": 0.7970009446144104, + "learning_rate": 1.5031394685101795e-06, + "loss": 0.7763, + "step": 15300 + }, + { + "epoch": 0.9716496479989852, + "grad_norm": 0.8285433650016785, + "learning_rate": 1.4397158622439274e-06, + "loss": 0.7357, + "step": 15320 + }, + { + "epoch": 0.9729181201243102, + "grad_norm": 0.8585550785064697, + "learning_rate": 1.376292255977675e-06, + "loss": 0.7706, + "step": 15340 + }, + { + "epoch": 0.9741865922496353, + "grad_norm": 0.7604003548622131, + "learning_rate": 1.3128686497114226e-06, + "loss": 0.7455, + "step": 15360 + }, + { + "epoch": 0.9754550643749603, + "grad_norm": 0.7896905541419983, + "learning_rate": 1.2494450434451703e-06, + "loss": 0.7824, + "step": 15380 + }, + { + "epoch": 0.9767235365002854, + "grad_norm": 0.8073090314865112, + "learning_rate": 1.186021437178918e-06, + "loss": 0.7836, + "step": 15400 + }, + { + "epoch": 0.9779920086256104, + "grad_norm": 0.8942602276802063, + "learning_rate": 1.1225978309126657e-06, + "loss": 0.7993, + "step": 15420 + }, + { + "epoch": 0.9792604807509355, + "grad_norm": 0.7417428493499756, + "learning_rate": 1.0591742246464135e-06, + "loss": 0.7828, + "step": 15440 + }, + { + "epoch": 0.9805289528762605, + "grad_norm": 0.7453452944755554, + "learning_rate": 9.957506183801612e-07, + "loss": 0.7589, + "step": 15460 + }, + { + "epoch": 0.9817974250015856, + "grad_norm": 0.7909823656082153, + "learning_rate": 9.323270121139089e-07, + "loss": 0.7776, + "step": 15480 + }, + { + "epoch": 0.9830658971269106, + "grad_norm": 0.8445958495140076, + "learning_rate": 8.689034058476566e-07, + "loss": 0.8027, + "step": 15500 + }, + { + "epoch": 0.9843343692522357, + "grad_norm": 0.8094016313552856, + "learning_rate": 8.054797995814042e-07, + "loss": 0.7426, + "step": 15520 + }, + { + "epoch": 0.9856028413775607, + "grad_norm": 0.9366889595985413, + "learning_rate": 7.420561933151519e-07, + "loss": 0.7754, + "step": 15540 + }, + { + "epoch": 0.9868713135028858, + "grad_norm": 0.8186490535736084, + "learning_rate": 6.786325870488996e-07, + "loss": 0.7699, + "step": 15560 + }, + { + "epoch": 0.9881397856282108, + "grad_norm": 0.7232887148857117, + "learning_rate": 6.152089807826473e-07, + "loss": 0.7384, + "step": 15580 + }, + { + "epoch": 0.9894082577535359, + "grad_norm": 0.7465324401855469, + "learning_rate": 5.517853745163951e-07, + "loss": 0.775, + "step": 15600 + }, + { + "epoch": 0.9906767298788609, + "grad_norm": 0.8372469544410706, + "learning_rate": 4.883617682501427e-07, + "loss": 0.7497, + "step": 15620 + }, + { + "epoch": 0.991945202004186, + "grad_norm": 0.8273568749427795, + "learning_rate": 4.2493816198389043e-07, + "loss": 0.7738, + "step": 15640 + }, + { + "epoch": 0.993213674129511, + "grad_norm": 0.7157850861549377, + "learning_rate": 3.6151455571763815e-07, + "loss": 0.7398, + "step": 15660 + }, + { + "epoch": 0.994482146254836, + "grad_norm": 0.8117349147796631, + "learning_rate": 2.980909494513858e-07, + "loss": 0.8053, + "step": 15680 + }, + { + "epoch": 0.9957506183801611, + "grad_norm": 0.869315505027771, + "learning_rate": 2.3466734318513352e-07, + "loss": 0.7819, + "step": 15700 + }, + { + "epoch": 0.9970190905054861, + "grad_norm": 0.7544413208961487, + "learning_rate": 1.712437369188812e-07, + "loss": 0.805, + "step": 15720 + }, + { + "epoch": 0.9982875626308112, + "grad_norm": 0.9937105178833008, + "learning_rate": 1.0782013065262892e-07, + "loss": 0.7808, + "step": 15740 + }, + { + "epoch": 0.9995560347561362, + "grad_norm": 0.7431527972221375, + "learning_rate": 4.439652438637661e-08, + "loss": 0.7574, + "step": 15760 + } + ], + "logging_steps": 20, + "max_steps": 15767, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 15767, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.607614351420162e+18, + "train_batch_size": 18, + "trial_name": null, + "trial_params": null +}