From 53414de0a0875c2c520fdd79a558b1610549f8fe Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Sat, 20 Jun 2026 18:01:22 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: deqing/llama-300M-v3-muon-original Source: Original Platform --- .gitattributes | 38 + README.md | 61 + ckpt-8-2b/config.json | 32 + ckpt-8-2b/generation_config.json | 11 + ckpt-8-2b/model.safetensors | 3 + ckpt-8-2b/optimizer.pt | 3 + ckpt-8-2b/rng_state_0.pth | 3 + ckpt-8-2b/rng_state_1.pth | 3 + ckpt-8-2b/scheduler.pt | 3 + ckpt-8-2b/tokenizer.json | 3 + ckpt-8-2b/tokenizer_config.json | 13 + ckpt-8-2b/trainer_state.json | 15953 +++++++++++++++++++++++++++ ckpt-8-2b/training_args.bin | 3 + config.json | 32 + final_model/config.json | 32 + final_model/generation_config.json | 11 + final_model/model.safetensors | 3 + final_model/tokenizer.json | 3 + final_model/tokenizer_config.json | 13 + final_model/training_args.bin | 3 + generation_config.json | 11 + model.safetensors | 3 + tokenizer.json | 3 + tokenizer_config.json | 13 + training_args.bin | 3 + 25 files changed, 16259 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 ckpt-8-2b/config.json create mode 100644 ckpt-8-2b/generation_config.json create mode 100644 ckpt-8-2b/model.safetensors create mode 100644 ckpt-8-2b/optimizer.pt create mode 100644 ckpt-8-2b/rng_state_0.pth create mode 100644 ckpt-8-2b/rng_state_1.pth create mode 100644 ckpt-8-2b/scheduler.pt create mode 100644 ckpt-8-2b/tokenizer.json create mode 100644 ckpt-8-2b/tokenizer_config.json create mode 100644 ckpt-8-2b/trainer_state.json create mode 100644 ckpt-8-2b/training_args.bin create mode 100644 config.json create mode 100644 final_model/config.json create mode 100644 final_model/generation_config.json create mode 100644 final_model/model.safetensors create mode 100644 final_model/tokenizer.json create mode 100644 final_model/tokenizer_config.json create mode 100644 final_model/training_args.bin create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..096ecec --- /dev/null +++ b/.gitattributes @@ -0,0 +1,38 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +ckpt-8-2b/tokenizer.json filter=lfs diff=lfs merge=lfs -text +final_model/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..d9a2ef5 --- /dev/null +++ b/README.md @@ -0,0 +1,61 @@ +--- +library_name: transformers +base_model: llama_small_config.json +tags: +- generated_from_trainer +model-index: +- name: llama-300M-v3-muon-original + results: [] +--- + + + +# llama-300M-v3-muon-original + +This model is a fine-tuned version of [llama_small_config.json](https://huggingface.co/llama_small_config.json) on an unknown dataset. +It achieves the following results on the evaluation set: +- Loss: 2.9168 +- Num Input Tokens Seen: 9437184000 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 0.0003 +- train_batch_size: 32 +- eval_batch_size: 32 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 2 +- gradient_accumulation_steps: 8 +- total_train_batch_size: 512 +- total_eval_batch_size: 64 +- optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.95) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_steps: 500 +- num_epochs: 1 + +### Training results + + + +### Framework versions + +- Transformers 5.1.0 +- Pytorch 2.8.0+cu128 +- Datasets 4.0.0 +- Tokenizers 0.22.2 diff --git a/ckpt-8-2b/config.json b/ckpt-8-2b/config.json new file mode 100644 index 0000000..897792f --- /dev/null +++ b/ckpt-8-2b/config.json @@ -0,0 +1,32 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "float32", + "eos_token_id": 128001, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "max_position_embeddings": 1024, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 16, + "num_hidden_layers": 12, + "num_key_value_heads": 8, + "pad_token_id": null, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_parameters": { + "rope_theta": 500000.0, + "rope_type": "default" + }, + "tie_word_embeddings": true, + "transformers_version": "5.1.0", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/ckpt-8-2b/generation_config.json b/ckpt-8-2b/generation_config.json new file mode 100644 index 0000000..2e5031c --- /dev/null +++ b/ckpt-8-2b/generation_config.json @@ -0,0 +1,11 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "eos_token_id": [ + 128001 + ], + "output_attentions": false, + "output_hidden_states": false, + "transformers_version": "5.1.0", + "use_cache": true +} diff --git a/ckpt-8-2b/model.safetensors b/ckpt-8-2b/model.safetensors new file mode 100644 index 0000000..68a9d3e --- /dev/null +++ b/ckpt-8-2b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abfda4052945da85b8bd9f55c60eb50c55a95fdf4e8c1210cb87b3d0a409821c +size 1280426144 diff --git a/ckpt-8-2b/optimizer.pt b/ckpt-8-2b/optimizer.pt new file mode 100644 index 0000000..5638cf5 --- /dev/null +++ b/ckpt-8-2b/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5c1ebb7de49bedbc90aa6ec27679886da478a3393e987c392022a0ef5d23465 +size 1805894027 diff --git a/ckpt-8-2b/rng_state_0.pth b/ckpt-8-2b/rng_state_0.pth new file mode 100644 index 0000000..91593cc --- /dev/null +++ b/ckpt-8-2b/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62bf4109bb1f7996aaa86885445885c3970978e71d9de55d598ead736b2e0340 +size 14917 diff --git a/ckpt-8-2b/rng_state_1.pth b/ckpt-8-2b/rng_state_1.pth new file mode 100644 index 0000000..5834cc2 --- /dev/null +++ b/ckpt-8-2b/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2295db4a59f5fea920e34eddf64f5118dc09c7be8e89e630d91edc1eb7bd870a +size 14917 diff --git a/ckpt-8-2b/scheduler.pt b/ckpt-8-2b/scheduler.pt new file mode 100644 index 0000000..f2cfb74 --- /dev/null +++ b/ckpt-8-2b/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f112c282e72bbf4b8b37c678e3cbb0a0f4fde1d204545a79f9aeadb1cb38f8ab +size 1465 diff --git a/ckpt-8-2b/tokenizer.json b/ckpt-8-2b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/ckpt-8-2b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/ckpt-8-2b/tokenizer_config.json b/ckpt-8-2b/tokenizer_config.json new file mode 100644 index 0000000..f7213f2 --- /dev/null +++ b/ckpt-8-2b/tokenizer_config.json @@ -0,0 +1,13 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "tokenizer_class": "TokenizersBackend" +} diff --git a/ckpt-8-2b/trainer_state.json b/ckpt-8-2b/trainer_state.json new file mode 100644 index 0000000..3395bb2 --- /dev/null +++ b/ckpt-8-2b/trainer_state.json @@ -0,0 +1,15953 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8463973592359101, + "eval_steps": 500, + "global_step": 15641, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005411401823642415, + "grad_norm": 3.381409168243408, + "learning_rate": 8.999999999999999e-05, + "loss": 11.904257202148438, + "num_input_tokens_seen": 5242880, + "step": 10, + "train_runtime": 98.6668, + "train_tokens_per_second": 53137.211 + }, + { + "epoch": 0.001082280364728483, + "grad_norm": 3.160391092300415, + "learning_rate": 0.00019, + "loss": 11.522220611572266, + "num_input_tokens_seen": 10485760, + "step": 20, + "train_runtime": 143.6354, + "train_tokens_per_second": 73002.603 + }, + { + "epoch": 0.0016234205470927244, + "grad_norm": 2.81247878074646, + "learning_rate": 0.00029, + "loss": 10.88598403930664, + "num_input_tokens_seen": 15728640, + "step": 30, + "train_runtime": 188.573, + "train_tokens_per_second": 83408.77 + }, + { + "epoch": 0.002164560729456966, + "grad_norm": 2.100623846054077, + "learning_rate": 0.00039, + "loss": 10.268418884277343, + "num_input_tokens_seen": 20971520, + "step": 40, + "train_runtime": 233.5046, + "train_tokens_per_second": 89812.025 + }, + { + "epoch": 0.002705700911821207, + "grad_norm": 1.6848973035812378, + "learning_rate": 0.00049, + "loss": 9.741734313964844, + "num_input_tokens_seen": 26214400, + "step": 50, + "train_runtime": 278.4485, + "train_tokens_per_second": 94144.504 + }, + { + "epoch": 0.003246841094185449, + "grad_norm": 2.3044769763946533, + "learning_rate": 0.00059, + "loss": 9.239765167236328, + "num_input_tokens_seen": 31457280, + "step": 60, + "train_runtime": 323.3989, + "train_tokens_per_second": 97270.822 + }, + { + "epoch": 0.00378798127654969, + "grad_norm": 1.6891608238220215, + "learning_rate": 0.0006900000000000001, + "loss": 8.75671157836914, + "num_input_tokens_seen": 36700160, + "step": 70, + "train_runtime": 368.3824, + "train_tokens_per_second": 99625.16 + }, + { + "epoch": 0.004329121458913932, + "grad_norm": 1.304526448249817, + "learning_rate": 0.00079, + "loss": 8.304008483886719, + "num_input_tokens_seen": 41943040, + "step": 80, + "train_runtime": 413.3956, + "train_tokens_per_second": 101459.807 + }, + { + "epoch": 0.004870261641278173, + "grad_norm": 0.9729027152061462, + "learning_rate": 0.00089, + "loss": 7.881130981445312, + "num_input_tokens_seen": 47185920, + "step": 90, + "train_runtime": 458.4234, + "train_tokens_per_second": 102930.863 + }, + { + "epoch": 0.005411401823642414, + "grad_norm": 0.8812423944473267, + "learning_rate": 0.00099, + "loss": 7.507221984863281, + "num_input_tokens_seen": 52428800, + "step": 100, + "train_runtime": 503.4688, + "train_tokens_per_second": 104135.16 + }, + { + "epoch": 0.005952542006006656, + "grad_norm": 0.8653954863548279, + "learning_rate": 0.00109, + "loss": 7.197725677490235, + "num_input_tokens_seen": 57671680, + "step": 110, + "train_runtime": 548.5148, + "train_tokens_per_second": 105141.522 + }, + { + "epoch": 0.006493682188370898, + "grad_norm": 0.6527077555656433, + "learning_rate": 0.0011899999999999999, + "loss": 6.962340545654297, + "num_input_tokens_seen": 62914560, + "step": 120, + "train_runtime": 593.6028, + "train_tokens_per_second": 105987.634 + }, + { + "epoch": 0.007034822370735139, + "grad_norm": 0.9212841987609863, + "learning_rate": 0.0012900000000000001, + "loss": 6.748843383789063, + "num_input_tokens_seen": 68157440, + "step": 130, + "train_runtime": 638.7162, + "train_tokens_per_second": 106710.057 + }, + { + "epoch": 0.00757596255309938, + "grad_norm": 0.8431305885314941, + "learning_rate": 0.0013900000000000002, + "loss": 6.572750854492187, + "num_input_tokens_seen": 73400320, + "step": 140, + "train_runtime": 683.8844, + "train_tokens_per_second": 107328.553 + }, + { + "epoch": 0.008117102735463622, + "grad_norm": 1.0355322360992432, + "learning_rate": 0.00149, + "loss": 6.391636657714844, + "num_input_tokens_seen": 78643200, + "step": 150, + "train_runtime": 729.0444, + "train_tokens_per_second": 107871.613 + }, + { + "epoch": 0.008658242917827864, + "grad_norm": 1.3339749574661255, + "learning_rate": 0.00159, + "loss": 6.223532104492188, + "num_input_tokens_seen": 83886080, + "step": 160, + "train_runtime": 774.2149, + "train_tokens_per_second": 108349.867 + }, + { + "epoch": 0.009199383100192105, + "grad_norm": 1.152486801147461, + "learning_rate": 0.00169, + "loss": 6.081370162963867, + "num_input_tokens_seen": 89128960, + "step": 170, + "train_runtime": 819.4009, + "train_tokens_per_second": 108773.326 + }, + { + "epoch": 0.009740523282556346, + "grad_norm": 1.163500189781189, + "learning_rate": 0.00179, + "loss": 5.940819549560547, + "num_input_tokens_seen": 94371840, + "step": 180, + "train_runtime": 864.5859, + "train_tokens_per_second": 109152.654 + }, + { + "epoch": 0.010281663464920588, + "grad_norm": 1.2408533096313477, + "learning_rate": 0.00189, + "loss": 5.812583160400391, + "num_input_tokens_seen": 99614720, + "step": 190, + "train_runtime": 909.7569, + "train_tokens_per_second": 109495.979 + }, + { + "epoch": 0.010822803647284829, + "grad_norm": 1.1574287414550781, + "learning_rate": 0.00199, + "loss": 5.6905670166015625, + "num_input_tokens_seen": 104857600, + "step": 200, + "train_runtime": 954.9646, + "train_tokens_per_second": 109802.599 + }, + { + "epoch": 0.011363943829649071, + "grad_norm": 1.296819806098938, + "learning_rate": 0.00209, + "loss": 5.591656494140625, + "num_input_tokens_seen": 110100480, + "step": 210, + "train_runtime": 1000.1348, + "train_tokens_per_second": 110085.64 + }, + { + "epoch": 0.011905084012013312, + "grad_norm": 1.0654325485229492, + "learning_rate": 0.00219, + "loss": 5.485440444946289, + "num_input_tokens_seen": 115343360, + "step": 220, + "train_runtime": 1045.3265, + "train_tokens_per_second": 110341.942 + }, + { + "epoch": 0.012446224194377553, + "grad_norm": 1.1002130508422852, + "learning_rate": 0.00229, + "loss": 5.387198257446289, + "num_input_tokens_seen": 120586240, + "step": 230, + "train_runtime": 1090.5293, + "train_tokens_per_second": 110575.879 + }, + { + "epoch": 0.012987364376741795, + "grad_norm": 1.0023939609527588, + "learning_rate": 0.0023899999999999998, + "loss": 5.29501953125, + "num_input_tokens_seen": 125829120, + "step": 240, + "train_runtime": 1135.7379, + "train_tokens_per_second": 110790.637 + }, + { + "epoch": 0.013528504559106036, + "grad_norm": 0.8933797478675842, + "learning_rate": 0.00249, + "loss": 5.21459846496582, + "num_input_tokens_seen": 131072000, + "step": 250, + "train_runtime": 1180.9487, + "train_tokens_per_second": 110988.737 + }, + { + "epoch": 0.014069644741470278, + "grad_norm": 1.0700093507766724, + "learning_rate": 0.0025900000000000003, + "loss": 5.131219482421875, + "num_input_tokens_seen": 136314880, + "step": 260, + "train_runtime": 1226.1688, + "train_tokens_per_second": 111171.384 + }, + { + "epoch": 0.01461078492383452, + "grad_norm": 1.194157600402832, + "learning_rate": 0.00269, + "loss": 5.058176422119141, + "num_input_tokens_seen": 141557760, + "step": 270, + "train_runtime": 1271.4137, + "train_tokens_per_second": 111338.863 + }, + { + "epoch": 0.01515192510619876, + "grad_norm": 1.097806453704834, + "learning_rate": 0.0027900000000000004, + "loss": 4.985312652587891, + "num_input_tokens_seen": 146800640, + "step": 280, + "train_runtime": 1316.6654, + "train_tokens_per_second": 111494.264 + }, + { + "epoch": 0.015693065288563002, + "grad_norm": 0.9698807001113892, + "learning_rate": 0.0028899999999999998, + "loss": 4.905867004394532, + "num_input_tokens_seen": 152043520, + "step": 290, + "train_runtime": 1361.9146, + "train_tokens_per_second": 111639.537 + }, + { + "epoch": 0.016234205470927243, + "grad_norm": 0.9057841300964355, + "learning_rate": 0.00299, + "loss": 4.847021102905273, + "num_input_tokens_seen": 157286400, + "step": 300, + "train_runtime": 1407.157, + "train_tokens_per_second": 111776.016 + }, + { + "epoch": 0.016775345653291484, + "grad_norm": 0.9901854395866394, + "learning_rate": 0.00309, + "loss": 4.7939613342285154, + "num_input_tokens_seen": 162529280, + "step": 310, + "train_runtime": 1452.4002, + "train_tokens_per_second": 111903.927 + }, + { + "epoch": 0.017316485835655728, + "grad_norm": 0.9903898239135742, + "learning_rate": 0.00319, + "loss": 4.720642852783203, + "num_input_tokens_seen": 167772160, + "step": 320, + "train_runtime": 1497.6772, + "train_tokens_per_second": 112021.577 + }, + { + "epoch": 0.01785762601801997, + "grad_norm": 1.001007318496704, + "learning_rate": 0.0032900000000000004, + "loss": 4.6716564178466795, + "num_input_tokens_seen": 173015040, + "step": 330, + "train_runtime": 1542.9302, + "train_tokens_per_second": 112134.066 + }, + { + "epoch": 0.01839876620038421, + "grad_norm": 0.9968867897987366, + "learning_rate": 0.0033900000000000002, + "loss": 4.6183723449707035, + "num_input_tokens_seen": 178257920, + "step": 340, + "train_runtime": 1588.1903, + "train_tokens_per_second": 112239.647 + }, + { + "epoch": 0.01893990638274845, + "grad_norm": 0.9285950064659119, + "learning_rate": 0.00349, + "loss": 4.572103881835938, + "num_input_tokens_seen": 183500800, + "step": 350, + "train_runtime": 1633.4577, + "train_tokens_per_second": 112338.875 + }, + { + "epoch": 0.01948104656511269, + "grad_norm": 0.989262044429779, + "learning_rate": 0.00359, + "loss": 4.529151153564453, + "num_input_tokens_seen": 188743680, + "step": 360, + "train_runtime": 1678.7231, + "train_tokens_per_second": 112432.882 + }, + { + "epoch": 0.020022186747476935, + "grad_norm": 1.0208923816680908, + "learning_rate": 0.00369, + "loss": 4.4996803283691404, + "num_input_tokens_seen": 193986560, + "step": 370, + "train_runtime": 1724.0055, + "train_tokens_per_second": 112520.846 + }, + { + "epoch": 0.020563326929841176, + "grad_norm": 0.9494571089744568, + "learning_rate": 0.00379, + "loss": 4.4542186737060545, + "num_input_tokens_seen": 199229440, + "step": 380, + "train_runtime": 1769.3003, + "train_tokens_per_second": 112603.517 + }, + { + "epoch": 0.021104467112205417, + "grad_norm": 0.7988581657409668, + "learning_rate": 0.0038900000000000002, + "loss": 4.431841278076172, + "num_input_tokens_seen": 204472320, + "step": 390, + "train_runtime": 1818.3866, + "train_tokens_per_second": 112447.111 + }, + { + "epoch": 0.021645607294569658, + "grad_norm": 0.832046389579773, + "learning_rate": 0.0039900000000000005, + "loss": 4.396000671386719, + "num_input_tokens_seen": 209715200, + "step": 400, + "train_runtime": 1863.6718, + "train_tokens_per_second": 112527.967 + }, + { + "epoch": 0.0221867474769339, + "grad_norm": 0.8342320919036865, + "learning_rate": 0.00409, + "loss": 4.37408332824707, + "num_input_tokens_seen": 214958080, + "step": 410, + "train_runtime": 1909.0219, + "train_tokens_per_second": 112601.162 + }, + { + "epoch": 0.022727887659298143, + "grad_norm": 0.9766927361488342, + "learning_rate": 0.00419, + "loss": 4.35020637512207, + "num_input_tokens_seen": 220200960, + "step": 420, + "train_runtime": 1954.3817, + "train_tokens_per_second": 112670.397 + }, + { + "epoch": 0.023269027841662383, + "grad_norm": 0.8501082062721252, + "learning_rate": 0.00429, + "loss": 4.312299346923828, + "num_input_tokens_seen": 225443840, + "step": 430, + "train_runtime": 1999.72, + "train_tokens_per_second": 112737.702 + }, + { + "epoch": 0.023810168024026624, + "grad_norm": 0.8430765867233276, + "learning_rate": 0.00439, + "loss": 4.310842895507813, + "num_input_tokens_seen": 230686720, + "step": 440, + "train_runtime": 2045.0616, + "train_tokens_per_second": 112801.843 + }, + { + "epoch": 0.024351308206390865, + "grad_norm": 0.7848499417304993, + "learning_rate": 0.00449, + "loss": 4.2807456970214846, + "num_input_tokens_seen": 235929600, + "step": 450, + "train_runtime": 2090.4316, + "train_tokens_per_second": 112861.668 + }, + { + "epoch": 0.024892448388755106, + "grad_norm": 0.9066799879074097, + "learning_rate": 0.00459, + "loss": 4.257038879394531, + "num_input_tokens_seen": 241172480, + "step": 460, + "train_runtime": 2135.8061, + "train_tokens_per_second": 112918.713 + }, + { + "epoch": 0.02543358857111935, + "grad_norm": 0.7888091802597046, + "learning_rate": 0.00469, + "loss": 4.235981750488281, + "num_input_tokens_seen": 246415360, + "step": 470, + "train_runtime": 2181.1434, + "train_tokens_per_second": 112975.315 + }, + { + "epoch": 0.02597472875348359, + "grad_norm": 0.6987936496734619, + "learning_rate": 0.00479, + "loss": 4.215058135986328, + "num_input_tokens_seen": 251658240, + "step": 480, + "train_runtime": 2226.5022, + "train_tokens_per_second": 113028.517 + }, + { + "epoch": 0.02651586893584783, + "grad_norm": 0.8686115741729736, + "learning_rate": 0.00489, + "loss": 4.219595336914063, + "num_input_tokens_seen": 256901120, + "step": 490, + "train_runtime": 2271.8741, + "train_tokens_per_second": 113078.945 + }, + { + "epoch": 0.027057009118212072, + "grad_norm": 0.9207416772842407, + "learning_rate": 0.0049900000000000005, + "loss": 4.1927734375, + "num_input_tokens_seen": 262144000, + "step": 500, + "train_runtime": 2317.2129, + "train_tokens_per_second": 113129.008 + }, + { + "epoch": 0.027057009118212072, + "eval_loss": 4.101890563964844, + "eval_runtime": 2.0015, + "eval_samples_per_second": 249.812, + "eval_steps_per_second": 3.997, + "num_input_tokens_seen": 262144000, + "step": 500 + }, + { + "epoch": 0.027598149300576313, + "grad_norm": 0.7904194593429565, + "learning_rate": 0.0049999972179955365, + "loss": 4.170513916015625, + "num_input_tokens_seen": 267386880, + "step": 510, + "train_runtime": 2364.5371, + "train_tokens_per_second": 113082.126 + }, + { + "epoch": 0.028139289482940557, + "grad_norm": 0.63487708568573, + "learning_rate": 0.004999987601198816, + "loss": 4.152132034301758, + "num_input_tokens_seen": 272629760, + "step": 520, + "train_runtime": 2409.8408, + "train_tokens_per_second": 113131.854 + }, + { + "epoch": 0.028680429665304798, + "grad_norm": 0.519443154335022, + "learning_rate": 0.0049999711152934586, + "loss": 4.145978164672852, + "num_input_tokens_seen": 277872640, + "step": 530, + "train_runtime": 2455.1664, + "train_tokens_per_second": 113178.739 + }, + { + "epoch": 0.02922156984766904, + "grad_norm": 0.5551373362541199, + "learning_rate": 0.004999947760329793, + "loss": 4.118004608154297, + "num_input_tokens_seen": 283115520, + "step": 540, + "train_runtime": 2500.4657, + "train_tokens_per_second": 113225.118 + }, + { + "epoch": 0.02976271003003328, + "grad_norm": 0.48466068506240845, + "learning_rate": 0.004999917536379122, + "loss": 4.0990447998046875, + "num_input_tokens_seen": 288358400, + "step": 550, + "train_runtime": 2545.7721, + "train_tokens_per_second": 113269.528 + }, + { + "epoch": 0.03030385021239752, + "grad_norm": 0.4300881624221802, + "learning_rate": 0.004999880443533718, + "loss": 4.095553207397461, + "num_input_tokens_seen": 293601280, + "step": 560, + "train_runtime": 2591.05, + "train_tokens_per_second": 113313.63 + }, + { + "epoch": 0.030844990394761764, + "grad_norm": 0.3266729414463043, + "learning_rate": 0.004999836481906822, + "loss": 4.074318313598633, + "num_input_tokens_seen": 298844160, + "step": 570, + "train_runtime": 2636.411, + "train_tokens_per_second": 113352.647 + }, + { + "epoch": 0.031386130577126005, + "grad_norm": 0.34210285544395447, + "learning_rate": 0.004999785651632649, + "loss": 4.055056762695313, + "num_input_tokens_seen": 304087040, + "step": 580, + "train_runtime": 2681.696, + "train_tokens_per_second": 113393.554 + }, + { + "epoch": 0.03192727075949025, + "grad_norm": 0.3171045482158661, + "learning_rate": 0.004999727952866382, + "loss": 4.028103637695312, + "num_input_tokens_seen": 309329920, + "step": 590, + "train_runtime": 2726.9513, + "train_tokens_per_second": 113434.342 + }, + { + "epoch": 0.032468410941854486, + "grad_norm": 0.28656497597694397, + "learning_rate": 0.00499966338578417, + "loss": 4.014062118530274, + "num_input_tokens_seen": 314572800, + "step": 600, + "train_runtime": 2772.2472, + "train_tokens_per_second": 113472.131 + }, + { + "epoch": 0.03300955112421873, + "grad_norm": 0.31004276871681213, + "learning_rate": 0.004999591950583134, + "loss": 4.000431060791016, + "num_input_tokens_seen": 319815680, + "step": 610, + "train_runtime": 2817.5313, + "train_tokens_per_second": 113509.186 + }, + { + "epoch": 0.03355069130658297, + "grad_norm": 0.29579785466194153, + "learning_rate": 0.004999513647481364, + "loss": 3.9810386657714845, + "num_input_tokens_seen": 325058560, + "step": 620, + "train_runtime": 2862.8161, + "train_tokens_per_second": 113545.036 + }, + { + "epoch": 0.03409183148894721, + "grad_norm": 0.28329184651374817, + "learning_rate": 0.0049994284767179145, + "loss": 3.975200653076172, + "num_input_tokens_seen": 330301440, + "step": 630, + "train_runtime": 2908.1102, + "train_tokens_per_second": 113579.411 + }, + { + "epoch": 0.034632971671311456, + "grad_norm": 0.2848559319972992, + "learning_rate": 0.004999336438552809, + "loss": 3.9574630737304686, + "num_input_tokens_seen": 335544320, + "step": 640, + "train_runtime": 2953.403, + "train_tokens_per_second": 113612.776 + }, + { + "epoch": 0.035174111853675694, + "grad_norm": 0.2778968811035156, + "learning_rate": 0.004999237533267034, + "loss": 3.951917266845703, + "num_input_tokens_seen": 340787200, + "step": 650, + "train_runtime": 2998.7048, + "train_tokens_per_second": 113644.799 + }, + { + "epoch": 0.03571525203603994, + "grad_norm": 0.28124260902404785, + "learning_rate": 0.004999131761162544, + "loss": 3.93038330078125, + "num_input_tokens_seen": 346030080, + "step": 660, + "train_runtime": 3044.0205, + "train_tokens_per_second": 113675.344 + }, + { + "epoch": 0.036256392218404175, + "grad_norm": 0.25421732664108276, + "learning_rate": 0.004999019122562258, + "loss": 3.9207611083984375, + "num_input_tokens_seen": 351272960, + "step": 670, + "train_runtime": 3089.3299, + "train_tokens_per_second": 113705.227 + }, + { + "epoch": 0.03679753240076842, + "grad_norm": 0.2740730345249176, + "learning_rate": 0.0049988996178100525, + "loss": 3.91453857421875, + "num_input_tokens_seen": 356515840, + "step": 680, + "train_runtime": 3134.6017, + "train_tokens_per_second": 113735.61 + }, + { + "epoch": 0.037338672583132664, + "grad_norm": 0.2670656740665436, + "learning_rate": 0.004998773247270772, + "loss": 3.884227752685547, + "num_input_tokens_seen": 361758720, + "step": 690, + "train_runtime": 3179.9122, + "train_tokens_per_second": 113763.746 + }, + { + "epoch": 0.0378798127654969, + "grad_norm": 0.2549172341823578, + "learning_rate": 0.004998640011330221, + "loss": 3.880903625488281, + "num_input_tokens_seen": 367001600, + "step": 700, + "train_runtime": 3225.2126, + "train_tokens_per_second": 113791.443 + }, + { + "epoch": 0.038420952947861145, + "grad_norm": 0.23274943232536316, + "learning_rate": 0.004998499910395162, + "loss": 3.8808818817138673, + "num_input_tokens_seen": 372244480, + "step": 710, + "train_runtime": 3270.4782, + "train_tokens_per_second": 113819.588 + }, + { + "epoch": 0.03896209313022538, + "grad_norm": 0.2661728858947754, + "learning_rate": 0.004998352944893316, + "loss": 3.860551452636719, + "num_input_tokens_seen": 377487360, + "step": 720, + "train_runtime": 3315.7715, + "train_tokens_per_second": 113846.012 + }, + { + "epoch": 0.039503233312589627, + "grad_norm": 0.27070483565330505, + "learning_rate": 0.004998199115273362, + "loss": 3.8578773498535157, + "num_input_tokens_seen": 382730240, + "step": 730, + "train_runtime": 3361.0384, + "train_tokens_per_second": 113872.616 + }, + { + "epoch": 0.04004437349495387, + "grad_norm": 0.2620537281036377, + "learning_rate": 0.004998038422004937, + "loss": 3.8334423065185548, + "num_input_tokens_seen": 387973120, + "step": 740, + "train_runtime": 3406.3177, + "train_tokens_per_second": 113898.102 + }, + { + "epoch": 0.04058551367731811, + "grad_norm": 0.24665935337543488, + "learning_rate": 0.004997870865578627, + "loss": 3.830191802978516, + "num_input_tokens_seen": 393216000, + "step": 750, + "train_runtime": 3451.6094, + "train_tokens_per_second": 113922.508 + }, + { + "epoch": 0.04112665385968235, + "grad_norm": 0.3058369755744934, + "learning_rate": 0.004997696446505975, + "loss": 3.81226806640625, + "num_input_tokens_seen": 398458880, + "step": 760, + "train_runtime": 3496.8514, + "train_tokens_per_second": 113947.901 + }, + { + "epoch": 0.04166779404204659, + "grad_norm": 0.24344538152217865, + "learning_rate": 0.004997515165319476, + "loss": 3.8191978454589846, + "num_input_tokens_seen": 403701760, + "step": 770, + "train_runtime": 3545.7622, + "train_tokens_per_second": 113854.718 + }, + { + "epoch": 0.042208934224410834, + "grad_norm": 0.26970189809799194, + "learning_rate": 0.004997327022572571, + "loss": 3.794965362548828, + "num_input_tokens_seen": 408944640, + "step": 780, + "train_runtime": 3591.0695, + "train_tokens_per_second": 113878.231 + }, + { + "epoch": 0.04275007440677508, + "grad_norm": 0.2699701189994812, + "learning_rate": 0.0049971320188396525, + "loss": 3.7990867614746096, + "num_input_tokens_seen": 414187520, + "step": 790, + "train_runtime": 3636.3454, + "train_tokens_per_second": 113902.14 + }, + { + "epoch": 0.043291214589139315, + "grad_norm": 0.24337078630924225, + "learning_rate": 0.004996930154716057, + "loss": 3.795510101318359, + "num_input_tokens_seen": 419430400, + "step": 800, + "train_runtime": 3681.6305, + "train_tokens_per_second": 113925.175 + }, + { + "epoch": 0.04383235477150356, + "grad_norm": 0.24991652369499207, + "learning_rate": 0.004996721430818068, + "loss": 3.7792850494384767, + "num_input_tokens_seen": 424673280, + "step": 810, + "train_runtime": 3726.9273, + "train_tokens_per_second": 113947.293 + }, + { + "epoch": 0.0443734949538678, + "grad_norm": 0.22850197553634644, + "learning_rate": 0.004996505847782908, + "loss": 3.7752288818359374, + "num_input_tokens_seen": 429916160, + "step": 820, + "train_runtime": 3772.1962, + "train_tokens_per_second": 113969.725 + }, + { + "epoch": 0.04491463513623204, + "grad_norm": 0.24704036116600037, + "learning_rate": 0.004996283406268743, + "loss": 3.7673095703125, + "num_input_tokens_seen": 435159040, + "step": 830, + "train_runtime": 3817.4555, + "train_tokens_per_second": 113991.908 + }, + { + "epoch": 0.045455775318596285, + "grad_norm": 0.24149645864963531, + "learning_rate": 0.004996054106954677, + "loss": 3.767901611328125, + "num_input_tokens_seen": 440401920, + "step": 840, + "train_runtime": 3862.7306, + "train_tokens_per_second": 114013.106 + }, + { + "epoch": 0.04599691550096052, + "grad_norm": 0.26389098167419434, + "learning_rate": 0.004995817950540749, + "loss": 3.765447998046875, + "num_input_tokens_seen": 445644800, + "step": 850, + "train_runtime": 3908.0129, + "train_tokens_per_second": 114033.605 + }, + { + "epoch": 0.04653805568332477, + "grad_norm": 0.2389504611492157, + "learning_rate": 0.004995574937747936, + "loss": 3.7446453094482424, + "num_input_tokens_seen": 450887680, + "step": 860, + "train_runtime": 3953.2772, + "train_tokens_per_second": 114054.151 + }, + { + "epoch": 0.047079195865689004, + "grad_norm": 0.21696795523166656, + "learning_rate": 0.0049953250693181425, + "loss": 3.7382736206054688, + "num_input_tokens_seen": 456130560, + "step": 870, + "train_runtime": 3998.5453, + "train_tokens_per_second": 114074.125 + }, + { + "epoch": 0.04762033604805325, + "grad_norm": 0.23217777907848358, + "learning_rate": 0.004995068346014207, + "loss": 3.7418495178222657, + "num_input_tokens_seen": 461373440, + "step": 880, + "train_runtime": 4043.8212, + "train_tokens_per_second": 114093.431 + }, + { + "epoch": 0.04816147623041749, + "grad_norm": 0.25520190596580505, + "learning_rate": 0.004994804768619892, + "loss": 3.7273784637451173, + "num_input_tokens_seen": 466616320, + "step": 890, + "train_runtime": 4089.1251, + "train_tokens_per_second": 114111.53 + }, + { + "epoch": 0.04870261641278173, + "grad_norm": 0.2495919018983841, + "learning_rate": 0.004994534337939889, + "loss": 3.7182594299316407, + "num_input_tokens_seen": 471859200, + "step": 900, + "train_runtime": 4134.3995, + "train_tokens_per_second": 114130.045 + }, + { + "epoch": 0.049243756595145974, + "grad_norm": 0.2571962773799896, + "learning_rate": 0.00499425705479981, + "loss": 3.7261619567871094, + "num_input_tokens_seen": 477102080, + "step": 910, + "train_runtime": 4179.6624, + "train_tokens_per_second": 114148.472 + }, + { + "epoch": 0.04978489677751021, + "grad_norm": 0.2216644585132599, + "learning_rate": 0.004993972920046188, + "loss": 3.705414581298828, + "num_input_tokens_seen": 482344960, + "step": 920, + "train_runtime": 4224.9503, + "train_tokens_per_second": 114165.831 + }, + { + "epoch": 0.050326036959874455, + "grad_norm": 0.2777004539966583, + "learning_rate": 0.004993681934546471, + "loss": 3.707286834716797, + "num_input_tokens_seen": 487587840, + "step": 930, + "train_runtime": 4270.2223, + "train_tokens_per_second": 114183.246 + }, + { + "epoch": 0.0508671771422387, + "grad_norm": 0.23501209914684296, + "learning_rate": 0.004993384099189028, + "loss": 3.7012203216552733, + "num_input_tokens_seen": 492830720, + "step": 940, + "train_runtime": 4315.4919, + "train_tokens_per_second": 114200.358 + }, + { + "epoch": 0.05140831732460294, + "grad_norm": 0.2504929304122925, + "learning_rate": 0.004993079414883134, + "loss": 3.7007171630859377, + "num_input_tokens_seen": 498073600, + "step": 950, + "train_runtime": 4360.758, + "train_tokens_per_second": 114217.207 + }, + { + "epoch": 0.05194945750696718, + "grad_norm": 0.265903502702713, + "learning_rate": 0.004992767882558976, + "loss": 3.6977813720703123, + "num_input_tokens_seen": 503316480, + "step": 960, + "train_runtime": 4406.0254, + "train_tokens_per_second": 114233.676 + }, + { + "epoch": 0.05249059768933142, + "grad_norm": 0.22946324944496155, + "learning_rate": 0.00499244950316765, + "loss": 3.6873912811279297, + "num_input_tokens_seen": 508559360, + "step": 970, + "train_runtime": 4451.29, + "train_tokens_per_second": 114249.883 + }, + { + "epoch": 0.05303173787169566, + "grad_norm": 0.2554706633090973, + "learning_rate": 0.004992124277681152, + "loss": 3.6791450500488283, + "num_input_tokens_seen": 513802240, + "step": 980, + "train_runtime": 4496.5462, + "train_tokens_per_second": 114265.975 + }, + { + "epoch": 0.05357287805405991, + "grad_norm": 0.22852079570293427, + "learning_rate": 0.004991792207092381, + "loss": 3.677058792114258, + "num_input_tokens_seen": 519045120, + "step": 990, + "train_runtime": 4541.8028, + "train_tokens_per_second": 114281.739 + }, + { + "epoch": 0.054114018236424144, + "grad_norm": 0.24798494577407837, + "learning_rate": 0.004991453292415134, + "loss": 3.657318115234375, + "num_input_tokens_seen": 524288000, + "step": 1000, + "train_runtime": 4587.0445, + "train_tokens_per_second": 114297.561 + }, + { + "epoch": 0.054114018236424144, + "eval_loss": 3.6001899242401123, + "eval_runtime": 1.9848, + "eval_samples_per_second": 251.913, + "eval_steps_per_second": 4.031, + "num_input_tokens_seen": 524288000, + "step": 1000 + }, + { + "epoch": 0.05465515841878839, + "grad_norm": 0.223563551902771, + "learning_rate": 0.0049911075346841, + "loss": 3.666912841796875, + "num_input_tokens_seen": 529530880, + "step": 1010, + "train_runtime": 4637.4751, + "train_tokens_per_second": 114185.17 + }, + { + "epoch": 0.055196298601152625, + "grad_norm": 0.24604271352291107, + "learning_rate": 0.004990754934954863, + "loss": 3.6610164642333984, + "num_input_tokens_seen": 534773760, + "step": 1020, + "train_runtime": 4682.7302, + "train_tokens_per_second": 114201.276 + }, + { + "epoch": 0.05573743878351687, + "grad_norm": 0.2436058074235916, + "learning_rate": 0.004990395494303893, + "loss": 3.6538921356201173, + "num_input_tokens_seen": 540016640, + "step": 1030, + "train_runtime": 4727.9737, + "train_tokens_per_second": 114217.353 + }, + { + "epoch": 0.056278578965881114, + "grad_norm": 0.24788981676101685, + "learning_rate": 0.004990029213828546, + "loss": 3.6453926086425783, + "num_input_tokens_seen": 545259520, + "step": 1040, + "train_runtime": 4773.2764, + "train_tokens_per_second": 114231.708 + }, + { + "epoch": 0.05681971914824535, + "grad_norm": 0.2355376034975052, + "learning_rate": 0.00498965609464706, + "loss": 3.653607940673828, + "num_input_tokens_seen": 550502400, + "step": 1050, + "train_runtime": 4818.5445, + "train_tokens_per_second": 114246.615 + }, + { + "epoch": 0.057360859330609595, + "grad_norm": 0.24511760473251343, + "learning_rate": 0.0049892761378985484, + "loss": 3.655783462524414, + "num_input_tokens_seen": 555745280, + "step": 1060, + "train_runtime": 4863.8191, + "train_tokens_per_second": 114261.091 + }, + { + "epoch": 0.05790199951297383, + "grad_norm": 0.2463475465774536, + "learning_rate": 0.004988889344743005, + "loss": 3.6497840881347656, + "num_input_tokens_seen": 560988160, + "step": 1070, + "train_runtime": 4909.1151, + "train_tokens_per_second": 114274.804 + }, + { + "epoch": 0.05844313969533808, + "grad_norm": 0.24649877846240997, + "learning_rate": 0.00498849571636129, + "loss": 3.6289398193359377, + "num_input_tokens_seen": 566231040, + "step": 1080, + "train_runtime": 4954.3585, + "train_tokens_per_second": 114289.476 + }, + { + "epoch": 0.05898427987770232, + "grad_norm": 0.21440814435482025, + "learning_rate": 0.004988095253955132, + "loss": 3.6420303344726563, + "num_input_tokens_seen": 571473920, + "step": 1090, + "train_runtime": 4999.6131, + "train_tokens_per_second": 114303.629 + }, + { + "epoch": 0.05952542006006656, + "grad_norm": 0.23143576085567474, + "learning_rate": 0.004987687958747124, + "loss": 3.636464309692383, + "num_input_tokens_seen": 576716800, + "step": 1100, + "train_runtime": 5044.8489, + "train_tokens_per_second": 114317.952 + }, + { + "epoch": 0.0600665602424308, + "grad_norm": 0.216554194688797, + "learning_rate": 0.0049872738319807226, + "loss": 3.6284786224365235, + "num_input_tokens_seen": 581959680, + "step": 1110, + "train_runtime": 5090.1116, + "train_tokens_per_second": 114331.419 + }, + { + "epoch": 0.06060770042479504, + "grad_norm": 0.21454273164272308, + "learning_rate": 0.004986852874920234, + "loss": 3.628643035888672, + "num_input_tokens_seen": 587202560, + "step": 1120, + "train_runtime": 5135.379, + "train_tokens_per_second": 114344.542 + }, + { + "epoch": 0.061148840607159284, + "grad_norm": 0.22195634245872498, + "learning_rate": 0.004986425088850824, + "loss": 3.6224212646484375, + "num_input_tokens_seen": 592445440, + "step": 1130, + "train_runtime": 5180.6463, + "train_tokens_per_second": 114357.438 + }, + { + "epoch": 0.06168998078952353, + "grad_norm": 0.23462195694446564, + "learning_rate": 0.004985990475078501, + "loss": 3.614238739013672, + "num_input_tokens_seen": 597688320, + "step": 1140, + "train_runtime": 5225.8696, + "train_tokens_per_second": 114371.074 + }, + { + "epoch": 0.062231120971887766, + "grad_norm": 0.2454216629266739, + "learning_rate": 0.004985549034930123, + "loss": 3.6097618103027345, + "num_input_tokens_seen": 602931200, + "step": 1150, + "train_runtime": 5274.713, + "train_tokens_per_second": 114305.973 + }, + { + "epoch": 0.06277226115425201, + "grad_norm": 0.22363615036010742, + "learning_rate": 0.004985100769753384, + "loss": 3.605723571777344, + "num_input_tokens_seen": 608174080, + "step": 1160, + "train_runtime": 5319.9954, + "train_tokens_per_second": 114318.535 + }, + { + "epoch": 0.06331340133661625, + "grad_norm": 0.2078346163034439, + "learning_rate": 0.00498464568091682, + "loss": 3.602735900878906, + "num_input_tokens_seen": 613416960, + "step": 1170, + "train_runtime": 5365.2147, + "train_tokens_per_second": 114332.23 + }, + { + "epoch": 0.0638545415189805, + "grad_norm": 0.21972794830799103, + "learning_rate": 0.004984183769809795, + "loss": 3.598741912841797, + "num_input_tokens_seen": 618659840, + "step": 1180, + "train_runtime": 5410.4312, + "train_tokens_per_second": 114345.754 + }, + { + "epoch": 0.06439568170134473, + "grad_norm": 0.2427527755498886, + "learning_rate": 0.0049837150378425005, + "loss": 3.596208190917969, + "num_input_tokens_seen": 623902720, + "step": 1190, + "train_runtime": 5455.6665, + "train_tokens_per_second": 114358.662 + }, + { + "epoch": 0.06493682188370897, + "grad_norm": 0.2279594987630844, + "learning_rate": 0.004983239486445956, + "loss": 3.59366455078125, + "num_input_tokens_seen": 629145600, + "step": 1200, + "train_runtime": 5500.922, + "train_tokens_per_second": 114370.936 + }, + { + "epoch": 0.06547796206607322, + "grad_norm": 0.23130950331687927, + "learning_rate": 0.004982757117071998, + "loss": 3.592302703857422, + "num_input_tokens_seen": 634388480, + "step": 1210, + "train_runtime": 5546.1769, + "train_tokens_per_second": 114383.024 + }, + { + "epoch": 0.06601910224843746, + "grad_norm": 0.2286449670791626, + "learning_rate": 0.004982267931193276, + "loss": 3.5859790802001954, + "num_input_tokens_seen": 639631360, + "step": 1220, + "train_runtime": 5591.4258, + "train_tokens_per_second": 114395.038 + }, + { + "epoch": 0.0665602424308017, + "grad_norm": 0.23779889941215515, + "learning_rate": 0.004981771930303254, + "loss": 3.586525726318359, + "num_input_tokens_seen": 644874240, + "step": 1230, + "train_runtime": 5636.6531, + "train_tokens_per_second": 114407.297 + }, + { + "epoch": 0.06710138261316594, + "grad_norm": 0.23324504494667053, + "learning_rate": 0.004981269115916199, + "loss": 3.579142379760742, + "num_input_tokens_seen": 650117120, + "step": 1240, + "train_runtime": 5681.8961, + "train_tokens_per_second": 114419.044 + }, + { + "epoch": 0.06764252279553018, + "grad_norm": 0.2067607045173645, + "learning_rate": 0.004980759489567181, + "loss": 3.5813358306884764, + "num_input_tokens_seen": 655360000, + "step": 1250, + "train_runtime": 5727.162, + "train_tokens_per_second": 114430.148 + }, + { + "epoch": 0.06818366297789442, + "grad_norm": 0.20190924406051636, + "learning_rate": 0.004980243052812064, + "loss": 3.572435760498047, + "num_input_tokens_seen": 660602880, + "step": 1260, + "train_runtime": 5772.4268, + "train_tokens_per_second": 114441.102 + }, + { + "epoch": 0.06872480316025867, + "grad_norm": 0.19773253798484802, + "learning_rate": 0.004979719807227508, + "loss": 3.5610916137695314, + "num_input_tokens_seen": 665845760, + "step": 1270, + "train_runtime": 5817.687, + "train_tokens_per_second": 114451.974 + }, + { + "epoch": 0.06926594334262291, + "grad_norm": 0.21706561744213104, + "learning_rate": 0.004979189754410956, + "loss": 3.5655101776123046, + "num_input_tokens_seen": 671088640, + "step": 1280, + "train_runtime": 5862.9264, + "train_tokens_per_second": 114463.084 + }, + { + "epoch": 0.06980708352498714, + "grad_norm": 0.23492570221424103, + "learning_rate": 0.004978652895980635, + "loss": 3.571335220336914, + "num_input_tokens_seen": 676331520, + "step": 1290, + "train_runtime": 5908.1779, + "train_tokens_per_second": 114473.792 + }, + { + "epoch": 0.07034822370735139, + "grad_norm": 0.23728297650814056, + "learning_rate": 0.004978109233575551, + "loss": 3.5683116912841797, + "num_input_tokens_seen": 681574400, + "step": 1300, + "train_runtime": 5953.4117, + "train_tokens_per_second": 114484.674 + }, + { + "epoch": 0.07088936388971563, + "grad_norm": 0.2128531038761139, + "learning_rate": 0.0049775587688554775, + "loss": 3.553203582763672, + "num_input_tokens_seen": 686817280, + "step": 1310, + "train_runtime": 5998.6316, + "train_tokens_per_second": 114495.659 + }, + { + "epoch": 0.07143050407207988, + "grad_norm": 0.22945411503314972, + "learning_rate": 0.004977001503500959, + "loss": 3.5565677642822267, + "num_input_tokens_seen": 692060160, + "step": 1320, + "train_runtime": 6043.8634, + "train_tokens_per_second": 114506.254 + }, + { + "epoch": 0.07197164425444412, + "grad_norm": 0.21733802556991577, + "learning_rate": 0.004976437439213302, + "loss": 3.5509429931640626, + "num_input_tokens_seen": 697303040, + "step": 1330, + "train_runtime": 6089.0954, + "train_tokens_per_second": 114516.688 + }, + { + "epoch": 0.07251278443680835, + "grad_norm": 0.24347279965877533, + "learning_rate": 0.004975866577714568, + "loss": 3.54642333984375, + "num_input_tokens_seen": 702545920, + "step": 1340, + "train_runtime": 6134.3055, + "train_tokens_per_second": 114527.376 + }, + { + "epoch": 0.0730539246191726, + "grad_norm": 0.21520718932151794, + "learning_rate": 0.004975288920747571, + "loss": 3.550141143798828, + "num_input_tokens_seen": 707788800, + "step": 1350, + "train_runtime": 6179.5539, + "train_tokens_per_second": 114537.201 + }, + { + "epoch": 0.07359506480153684, + "grad_norm": 0.23248061537742615, + "learning_rate": 0.0049747044700758705, + "loss": 3.5488357543945312, + "num_input_tokens_seen": 713031680, + "step": 1360, + "train_runtime": 6224.7742, + "train_tokens_per_second": 114547.397 + }, + { + "epoch": 0.07413620498390108, + "grad_norm": 0.23462453484535217, + "learning_rate": 0.004974113227483768, + "loss": 3.5416290283203127, + "num_input_tokens_seen": 718274560, + "step": 1370, + "train_runtime": 6269.9891, + "train_tokens_per_second": 114557.544 + }, + { + "epoch": 0.07467734516626533, + "grad_norm": 0.2253153920173645, + "learning_rate": 0.004973515194776301, + "loss": 3.540643310546875, + "num_input_tokens_seen": 723517440, + "step": 1380, + "train_runtime": 6315.2141, + "train_tokens_per_second": 114567.365 + }, + { + "epoch": 0.07521848534862956, + "grad_norm": 0.21088625490665436, + "learning_rate": 0.0049729103737792355, + "loss": 3.543656921386719, + "num_input_tokens_seen": 728760320, + "step": 1390, + "train_runtime": 6360.4473, + "train_tokens_per_second": 114576.898 + }, + { + "epoch": 0.0757596255309938, + "grad_norm": 0.2161734253168106, + "learning_rate": 0.00497229876633906, + "loss": 3.5383941650390627, + "num_input_tokens_seen": 734003200, + "step": 1400, + "train_runtime": 6405.6811, + "train_tokens_per_second": 114586.285 + }, + { + "epoch": 0.07630076571335805, + "grad_norm": 0.20709098875522614, + "learning_rate": 0.004971680374322986, + "loss": 3.5313690185546873, + "num_input_tokens_seen": 739246080, + "step": 1410, + "train_runtime": 6450.8882, + "train_tokens_per_second": 114596.015 + }, + { + "epoch": 0.07684190589572229, + "grad_norm": 0.20399479568004608, + "learning_rate": 0.004971055199618935, + "loss": 3.525136184692383, + "num_input_tokens_seen": 744488960, + "step": 1420, + "train_runtime": 6496.1128, + "train_tokens_per_second": 114605.3 + }, + { + "epoch": 0.07738304607808653, + "grad_norm": 0.21809037029743195, + "learning_rate": 0.004970423244135538, + "loss": 3.53038330078125, + "num_input_tokens_seen": 749731840, + "step": 1430, + "train_runtime": 6541.3447, + "train_tokens_per_second": 114614.33 + }, + { + "epoch": 0.07792418626045076, + "grad_norm": 0.22500748932361603, + "learning_rate": 0.004969784509802125, + "loss": 3.5225593566894533, + "num_input_tokens_seen": 754974720, + "step": 1440, + "train_runtime": 6586.5647, + "train_tokens_per_second": 114623.442 + }, + { + "epoch": 0.07846532644281501, + "grad_norm": 0.22614286839962006, + "learning_rate": 0.0049691389985687204, + "loss": 3.5291175842285156, + "num_input_tokens_seen": 760217600, + "step": 1450, + "train_runtime": 6631.7972, + "train_tokens_per_second": 114632.215 + }, + { + "epoch": 0.07900646662517925, + "grad_norm": 0.2029477208852768, + "learning_rate": 0.004968486712406044, + "loss": 3.5189224243164063, + "num_input_tokens_seen": 765460480, + "step": 1460, + "train_runtime": 6677.0236, + "train_tokens_per_second": 114640.973 + }, + { + "epoch": 0.0795476068075435, + "grad_norm": 0.2242126613855362, + "learning_rate": 0.004967827653305494, + "loss": 3.504582977294922, + "num_input_tokens_seen": 770703360, + "step": 1470, + "train_runtime": 6722.2724, + "train_tokens_per_second": 114649.232 + }, + { + "epoch": 0.08008874698990774, + "grad_norm": 0.2245851755142212, + "learning_rate": 0.004967161823279147, + "loss": 3.5151710510253906, + "num_input_tokens_seen": 775946240, + "step": 1480, + "train_runtime": 6767.5092, + "train_tokens_per_second": 114657.582 + }, + { + "epoch": 0.08062988717227197, + "grad_norm": 0.21762171387672424, + "learning_rate": 0.004966489224359752, + "loss": 3.510267639160156, + "num_input_tokens_seen": 781189120, + "step": 1490, + "train_runtime": 6812.7609, + "train_tokens_per_second": 114665.571 + }, + { + "epoch": 0.08117102735463622, + "grad_norm": 0.22595250606536865, + "learning_rate": 0.0049658098586007225, + "loss": 3.515250396728516, + "num_input_tokens_seen": 786432000, + "step": 1500, + "train_runtime": 6857.9968, + "train_tokens_per_second": 114673.719 + }, + { + "epoch": 0.08117102735463622, + "eval_loss": 3.439197540283203, + "eval_runtime": 1.986, + "eval_samples_per_second": 251.759, + "eval_steps_per_second": 4.028, + "num_input_tokens_seen": 786432000, + "step": 1500 + }, + { + "epoch": 0.08171216753700046, + "grad_norm": 0.2041054517030716, + "learning_rate": 0.00496512372807613, + "loss": 3.5051536560058594, + "num_input_tokens_seen": 791674880, + "step": 1510, + "train_runtime": 6905.2279, + "train_tokens_per_second": 114648.624 + }, + { + "epoch": 0.0822533077193647, + "grad_norm": 0.20704089105129242, + "learning_rate": 0.004964430834880702, + "loss": 3.498210906982422, + "num_input_tokens_seen": 796917760, + "step": 1520, + "train_runtime": 6950.4841, + "train_tokens_per_second": 114656.439 + }, + { + "epoch": 0.08279444790172895, + "grad_norm": 0.2235393226146698, + "learning_rate": 0.0049637311811298055, + "loss": 3.510853958129883, + "num_input_tokens_seen": 802160640, + "step": 1530, + "train_runtime": 6999.4133, + "train_tokens_per_second": 114603.982 + }, + { + "epoch": 0.08333558808409318, + "grad_norm": 0.19351601600646973, + "learning_rate": 0.004963024768959454, + "loss": 3.4970939636230467, + "num_input_tokens_seen": 807403520, + "step": 1540, + "train_runtime": 7044.6706, + "train_tokens_per_second": 114611.962 + }, + { + "epoch": 0.08387672826645742, + "grad_norm": 0.2104959934949875, + "learning_rate": 0.0049623116005262915, + "loss": 3.5016387939453124, + "num_input_tokens_seen": 812646400, + "step": 1550, + "train_runtime": 7089.915, + "train_tokens_per_second": 114620.048 + }, + { + "epoch": 0.08441786844882167, + "grad_norm": 0.25421494245529175, + "learning_rate": 0.004961591678007588, + "loss": 3.50089111328125, + "num_input_tokens_seen": 817889280, + "step": 1560, + "train_runtime": 7135.1609, + "train_tokens_per_second": 114628.008 + }, + { + "epoch": 0.08495900863118591, + "grad_norm": 0.2085292786359787, + "learning_rate": 0.004960865003601232, + "loss": 3.5003082275390627, + "num_input_tokens_seen": 823132160, + "step": 1570, + "train_runtime": 7180.3935, + "train_tokens_per_second": 114636.079 + }, + { + "epoch": 0.08550014881355016, + "grad_norm": 0.2042287439107895, + "learning_rate": 0.00496013157952573, + "loss": 3.495660400390625, + "num_input_tokens_seen": 828375040, + "step": 1580, + "train_runtime": 7225.6594, + "train_tokens_per_second": 114643.521 + }, + { + "epoch": 0.08604128899591439, + "grad_norm": 0.21099670231342316, + "learning_rate": 0.004959391408020191, + "loss": 3.4938674926757813, + "num_input_tokens_seen": 833617920, + "step": 1590, + "train_runtime": 7270.8988, + "train_tokens_per_second": 114651.29 + }, + { + "epoch": 0.08658242917827863, + "grad_norm": 0.19382232427597046, + "learning_rate": 0.004958644491344324, + "loss": 3.4875198364257813, + "num_input_tokens_seen": 838860800, + "step": 1600, + "train_runtime": 7316.1586, + "train_tokens_per_second": 114658.64 + }, + { + "epoch": 0.08712356936064287, + "grad_norm": 0.20325426757335663, + "learning_rate": 0.0049578908317784295, + "loss": 3.487265777587891, + "num_input_tokens_seen": 844103680, + "step": 1610, + "train_runtime": 7361.3918, + "train_tokens_per_second": 114666.316 + }, + { + "epoch": 0.08766470954300712, + "grad_norm": 0.21367783844470978, + "learning_rate": 0.004957130431623399, + "loss": 3.4908119201660157, + "num_input_tokens_seen": 849346560, + "step": 1620, + "train_runtime": 7406.6443, + "train_tokens_per_second": 114673.6 + }, + { + "epoch": 0.08820584972537136, + "grad_norm": 0.19166916608810425, + "learning_rate": 0.004956363293200697, + "loss": 3.478108215332031, + "num_input_tokens_seen": 854589440, + "step": 1630, + "train_runtime": 7451.8697, + "train_tokens_per_second": 114681.211 + }, + { + "epoch": 0.0887469899077356, + "grad_norm": 0.22475136816501617, + "learning_rate": 0.004955589418852363, + "loss": 3.4743488311767576, + "num_input_tokens_seen": 859832320, + "step": 1640, + "train_runtime": 7497.1227, + "train_tokens_per_second": 114688.308 + }, + { + "epoch": 0.08928813009009984, + "grad_norm": 0.22001579403877258, + "learning_rate": 0.004954808810940998, + "loss": 3.481397247314453, + "num_input_tokens_seen": 865075200, + "step": 1650, + "train_runtime": 7542.3425, + "train_tokens_per_second": 114695.826 + }, + { + "epoch": 0.08982927027246408, + "grad_norm": 0.20330502092838287, + "learning_rate": 0.0049540214718497635, + "loss": 3.47830810546875, + "num_input_tokens_seen": 870318080, + "step": 1660, + "train_runtime": 7587.5621, + "train_tokens_per_second": 114703.256 + }, + { + "epoch": 0.09037041045482833, + "grad_norm": 0.21012984216213226, + "learning_rate": 0.00495322740398237, + "loss": 3.470731735229492, + "num_input_tokens_seen": 875560960, + "step": 1670, + "train_runtime": 7632.7891, + "train_tokens_per_second": 114710.488 + }, + { + "epoch": 0.09091155063719257, + "grad_norm": 0.20543314516544342, + "learning_rate": 0.004952426609763068, + "loss": 3.4727859497070312, + "num_input_tokens_seen": 880803840, + "step": 1680, + "train_runtime": 7678.0113, + "train_tokens_per_second": 114717.706 + }, + { + "epoch": 0.0914526908195568, + "grad_norm": 0.2099304497241974, + "learning_rate": 0.004951619091636649, + "loss": 3.462004852294922, + "num_input_tokens_seen": 886046720, + "step": 1690, + "train_runtime": 7723.225, + "train_tokens_per_second": 114724.965 + }, + { + "epoch": 0.09199383100192104, + "grad_norm": 0.19785360991954803, + "learning_rate": 0.004950804852068425, + "loss": 3.468863677978516, + "num_input_tokens_seen": 891289600, + "step": 1700, + "train_runtime": 7768.4346, + "train_tokens_per_second": 114732.202 + }, + { + "epoch": 0.09253497118428529, + "grad_norm": 0.20223312079906464, + "learning_rate": 0.004949983893544234, + "loss": 3.47713623046875, + "num_input_tokens_seen": 896532480, + "step": 1710, + "train_runtime": 7813.6346, + "train_tokens_per_second": 114739.494 + }, + { + "epoch": 0.09307611136664953, + "grad_norm": 0.21073880791664124, + "learning_rate": 0.004949156218570423, + "loss": 3.4744213104248045, + "num_input_tokens_seen": 901775360, + "step": 1720, + "train_runtime": 7858.8265, + "train_tokens_per_second": 114746.822 + }, + { + "epoch": 0.09361725154901378, + "grad_norm": 0.21444642543792725, + "learning_rate": 0.004948321829673847, + "loss": 3.4606704711914062, + "num_input_tokens_seen": 907018240, + "step": 1730, + "train_runtime": 7904.0563, + "train_tokens_per_second": 114753.514 + }, + { + "epoch": 0.09415839173137801, + "grad_norm": 0.21360410749912262, + "learning_rate": 0.004947480729401857, + "loss": 3.468334197998047, + "num_input_tokens_seen": 912261120, + "step": 1740, + "train_runtime": 7949.2813, + "train_tokens_per_second": 114760.201 + }, + { + "epoch": 0.09469953191374225, + "grad_norm": 0.20467202365398407, + "learning_rate": 0.0049466329203222935, + "loss": 3.451313018798828, + "num_input_tokens_seen": 917504000, + "step": 1750, + "train_runtime": 7994.4951, + "train_tokens_per_second": 114766.973 + }, + { + "epoch": 0.0952406720961065, + "grad_norm": 0.2081199437379837, + "learning_rate": 0.004945778405023478, + "loss": 3.4613468170166017, + "num_input_tokens_seen": 922746880, + "step": 1760, + "train_runtime": 8039.7393, + "train_tokens_per_second": 114773.235 + }, + { + "epoch": 0.09578181227847074, + "grad_norm": 0.2067294865846634, + "learning_rate": 0.004944917186114206, + "loss": 3.4611587524414062, + "num_input_tokens_seen": 927989760, + "step": 1770, + "train_runtime": 8084.9796, + "train_tokens_per_second": 114779.481 + }, + { + "epoch": 0.09632295246083498, + "grad_norm": 0.19931091368198395, + "learning_rate": 0.00494404926622374, + "loss": 3.462744140625, + "num_input_tokens_seen": 933232640, + "step": 1780, + "train_runtime": 8130.2373, + "train_tokens_per_second": 114785.413 + }, + { + "epoch": 0.09686409264319921, + "grad_norm": 0.213568776845932, + "learning_rate": 0.004943174648001798, + "loss": 3.456349182128906, + "num_input_tokens_seen": 938475520, + "step": 1790, + "train_runtime": 8175.469, + "train_tokens_per_second": 114791.644 + }, + { + "epoch": 0.09740523282556346, + "grad_norm": 0.20064200460910797, + "learning_rate": 0.004942293334118552, + "loss": 3.4558643341064452, + "num_input_tokens_seen": 943718400, + "step": 1800, + "train_runtime": 8220.7059, + "train_tokens_per_second": 114797.732 + }, + { + "epoch": 0.0979463730079277, + "grad_norm": 0.21110571920871735, + "learning_rate": 0.004941405327264611, + "loss": 3.4533897399902345, + "num_input_tokens_seen": 948961280, + "step": 1810, + "train_runtime": 8265.9542, + "train_tokens_per_second": 114803.598 + }, + { + "epoch": 0.09848751319029195, + "grad_norm": 0.20624655485153198, + "learning_rate": 0.0049405106301510186, + "loss": 3.4500003814697267, + "num_input_tokens_seen": 954204160, + "step": 1820, + "train_runtime": 8311.1987, + "train_tokens_per_second": 114809.45 + }, + { + "epoch": 0.09902865337265619, + "grad_norm": 0.20945154130458832, + "learning_rate": 0.004939609245509244, + "loss": 3.440562057495117, + "num_input_tokens_seen": 959447040, + "step": 1830, + "train_runtime": 8356.4084, + "train_tokens_per_second": 114815.719 + }, + { + "epoch": 0.09956979355502042, + "grad_norm": 0.20618219673633575, + "learning_rate": 0.004938701176091175, + "loss": 3.4402488708496093, + "num_input_tokens_seen": 964689920, + "step": 1840, + "train_runtime": 8401.6104, + "train_tokens_per_second": 114822.025 + }, + { + "epoch": 0.10011093373738467, + "grad_norm": 0.21744751930236816, + "learning_rate": 0.004937786424669103, + "loss": 3.447218322753906, + "num_input_tokens_seen": 969932800, + "step": 1850, + "train_runtime": 8446.8197, + "train_tokens_per_second": 114828.165 + }, + { + "epoch": 0.10065207391974891, + "grad_norm": 0.207778662443161, + "learning_rate": 0.004936864994035724, + "loss": 3.4344856262207033, + "num_input_tokens_seen": 975175680, + "step": 1860, + "train_runtime": 8492.045, + "train_tokens_per_second": 114834.022 + }, + { + "epoch": 0.10119321410211315, + "grad_norm": 0.20873455703258514, + "learning_rate": 0.004935936887004123, + "loss": 3.4340728759765624, + "num_input_tokens_seen": 980418560, + "step": 1870, + "train_runtime": 8537.2574, + "train_tokens_per_second": 114839.99 + }, + { + "epoch": 0.1017343542844774, + "grad_norm": 0.21933899819850922, + "learning_rate": 0.004935002106407768, + "loss": 3.431113433837891, + "num_input_tokens_seen": 985661440, + "step": 1880, + "train_runtime": 8582.5067, + "train_tokens_per_second": 114845.403 + }, + { + "epoch": 0.10227549446684163, + "grad_norm": 0.19961251318454742, + "learning_rate": 0.0049340606551005, + "loss": 3.4356346130371094, + "num_input_tokens_seen": 990904320, + "step": 1890, + "train_runtime": 8627.7245, + "train_tokens_per_second": 114851.177 + }, + { + "epoch": 0.10281663464920587, + "grad_norm": 0.1902250349521637, + "learning_rate": 0.004933112535956529, + "loss": 3.432623291015625, + "num_input_tokens_seen": 996147200, + "step": 1900, + "train_runtime": 8672.9813, + "train_tokens_per_second": 114856.376 + }, + { + "epoch": 0.10335777483157012, + "grad_norm": 0.1946999877691269, + "learning_rate": 0.004932157751870416, + "loss": 3.435283660888672, + "num_input_tokens_seen": 1001390080, + "step": 1910, + "train_runtime": 8722.0751, + "train_tokens_per_second": 114810.99 + }, + { + "epoch": 0.10389891501393436, + "grad_norm": 0.21359668672084808, + "learning_rate": 0.004931196305757076, + "loss": 3.4397598266601563, + "num_input_tokens_seen": 1006632960, + "step": 1920, + "train_runtime": 8767.3142, + "train_tokens_per_second": 114816.572 + }, + { + "epoch": 0.1044400551962986, + "grad_norm": 0.188863143324852, + "learning_rate": 0.004930228200551757, + "loss": 3.428334045410156, + "num_input_tokens_seen": 1011875840, + "step": 1930, + "train_runtime": 8812.53, + "train_tokens_per_second": 114822.399 + }, + { + "epoch": 0.10498119537866284, + "grad_norm": 0.2043711096048355, + "learning_rate": 0.0049292534392100405, + "loss": 3.428396987915039, + "num_input_tokens_seen": 1017118720, + "step": 1940, + "train_runtime": 8857.7449, + "train_tokens_per_second": 114828.179 + }, + { + "epoch": 0.10552233556102708, + "grad_norm": 0.18800680339336395, + "learning_rate": 0.00492827202470783, + "loss": 3.423938751220703, + "num_input_tokens_seen": 1022361600, + "step": 1950, + "train_runtime": 8902.9768, + "train_tokens_per_second": 114833.682 + }, + { + "epoch": 0.10606347574339133, + "grad_norm": 0.20674094557762146, + "learning_rate": 0.004927283960041336, + "loss": 3.4255210876464846, + "num_input_tokens_seen": 1027604480, + "step": 1960, + "train_runtime": 8948.2078, + "train_tokens_per_second": 114839.14 + }, + { + "epoch": 0.10660461592575557, + "grad_norm": 0.19658301770687103, + "learning_rate": 0.004926289248227076, + "loss": 3.422502899169922, + "num_input_tokens_seen": 1032847360, + "step": 1970, + "train_runtime": 8993.4331, + "train_tokens_per_second": 114844.614 + }, + { + "epoch": 0.10714575610811981, + "grad_norm": 0.20629730820655823, + "learning_rate": 0.00492528789230186, + "loss": 3.419141387939453, + "num_input_tokens_seen": 1038090240, + "step": 1980, + "train_runtime": 9038.6403, + "train_tokens_per_second": 114850.266 + }, + { + "epoch": 0.10768689629048404, + "grad_norm": 0.20946621894836426, + "learning_rate": 0.00492427989532278, + "loss": 3.4206031799316405, + "num_input_tokens_seen": 1043333120, + "step": 1990, + "train_runtime": 9083.8627, + "train_tokens_per_second": 114855.668 + }, + { + "epoch": 0.10822803647284829, + "grad_norm": 0.2047351747751236, + "learning_rate": 0.004923265260367205, + "loss": 3.421718978881836, + "num_input_tokens_seen": 1048576000, + "step": 2000, + "train_runtime": 9129.0866, + "train_tokens_per_second": 114860.998 + }, + { + "epoch": 0.10822803647284829, + "eval_loss": 3.355583429336548, + "eval_runtime": 1.9828, + "eval_samples_per_second": 252.171, + "eval_steps_per_second": 4.035, + "num_input_tokens_seen": 1048576000, + "step": 2000 + }, + { + "epoch": 0.10876917665521253, + "grad_norm": 0.18940046429634094, + "learning_rate": 0.004922243990532769, + "loss": 3.4131790161132813, + "num_input_tokens_seen": 1053818880, + "step": 2010, + "train_runtime": 9178.8208, + "train_tokens_per_second": 114809.833 + }, + { + "epoch": 0.10931031683757678, + "grad_norm": 0.2015410214662552, + "learning_rate": 0.004921216088937362, + "loss": 3.433843994140625, + "num_input_tokens_seen": 1059061760, + "step": 2020, + "train_runtime": 9224.0155, + "train_tokens_per_second": 114815.696 + }, + { + "epoch": 0.10985145701994102, + "grad_norm": 0.20246025919914246, + "learning_rate": 0.0049201815587191205, + "loss": 3.4257015228271483, + "num_input_tokens_seen": 1064304640, + "step": 2030, + "train_runtime": 9269.2191, + "train_tokens_per_second": 114821.392 + }, + { + "epoch": 0.11039259720230525, + "grad_norm": 0.1984010487794876, + "learning_rate": 0.0049191404030364165, + "loss": 3.407004547119141, + "num_input_tokens_seen": 1069547520, + "step": 2040, + "train_runtime": 9314.4158, + "train_tokens_per_second": 114827.118 + }, + { + "epoch": 0.1109337373846695, + "grad_norm": 0.2240104079246521, + "learning_rate": 0.0049180926250678506, + "loss": 3.413028335571289, + "num_input_tokens_seen": 1074790400, + "step": 2050, + "train_runtime": 9359.6399, + "train_tokens_per_second": 114832.452 + }, + { + "epoch": 0.11147487756703374, + "grad_norm": 0.20743729174137115, + "learning_rate": 0.004917038228012243, + "loss": 3.413587188720703, + "num_input_tokens_seen": 1080033280, + "step": 2060, + "train_runtime": 9404.8754, + "train_tokens_per_second": 114837.596 + }, + { + "epoch": 0.11201601774939798, + "grad_norm": 0.18610326945781708, + "learning_rate": 0.004915977215088616, + "loss": 3.4143035888671873, + "num_input_tokens_seen": 1085276160, + "step": 2070, + "train_runtime": 9450.1196, + "train_tokens_per_second": 114842.584 + }, + { + "epoch": 0.11255715793176223, + "grad_norm": 0.18289707601070404, + "learning_rate": 0.004914909589536196, + "loss": 3.4013748168945312, + "num_input_tokens_seen": 1090519040, + "step": 2080, + "train_runtime": 9495.3594, + "train_tokens_per_second": 114847.579 + }, + { + "epoch": 0.11309829811412646, + "grad_norm": 0.20693431794643402, + "learning_rate": 0.0049138353546143935, + "loss": 3.420492172241211, + "num_input_tokens_seen": 1095761920, + "step": 2090, + "train_runtime": 9540.5793, + "train_tokens_per_second": 114852.766 + }, + { + "epoch": 0.1136394382964907, + "grad_norm": 0.1881825178861618, + "learning_rate": 0.0049127545136027975, + "loss": 3.4042373657226563, + "num_input_tokens_seen": 1101004800, + "step": 2100, + "train_runtime": 9585.774, + "train_tokens_per_second": 114858.206 + }, + { + "epoch": 0.11418057847885495, + "grad_norm": 0.20557381212711334, + "learning_rate": 0.004911667069801167, + "loss": 3.395760345458984, + "num_input_tokens_seen": 1106247680, + "step": 2110, + "train_runtime": 9630.9862, + "train_tokens_per_second": 114863.385 + }, + { + "epoch": 0.11472171866121919, + "grad_norm": 0.20688888430595398, + "learning_rate": 0.004910573026529419, + "loss": 3.3946189880371094, + "num_input_tokens_seen": 1111490560, + "step": 2120, + "train_runtime": 9676.207, + "train_tokens_per_second": 114868.415 + }, + { + "epoch": 0.11526285884358344, + "grad_norm": 0.1814773976802826, + "learning_rate": 0.004909472387127615, + "loss": 3.405241775512695, + "num_input_tokens_seen": 1116733440, + "step": 2130, + "train_runtime": 9721.4134, + "train_tokens_per_second": 114873.568 + }, + { + "epoch": 0.11580399902594767, + "grad_norm": 0.1865544617176056, + "learning_rate": 0.004908365154955957, + "loss": 3.4098495483398437, + "num_input_tokens_seen": 1121976320, + "step": 2140, + "train_runtime": 9766.6099, + "train_tokens_per_second": 114878.789 + }, + { + "epoch": 0.11634513920831191, + "grad_norm": 0.2032928168773651, + "learning_rate": 0.004907251333394776, + "loss": 3.4024234771728517, + "num_input_tokens_seen": 1127219200, + "step": 2150, + "train_runtime": 9811.8225, + "train_tokens_per_second": 114883.774 + }, + { + "epoch": 0.11688627939067615, + "grad_norm": 0.1904987096786499, + "learning_rate": 0.004906130925844515, + "loss": 3.3986663818359375, + "num_input_tokens_seen": 1132462080, + "step": 2160, + "train_runtime": 9857.0319, + "train_tokens_per_second": 114888.751 + }, + { + "epoch": 0.1174274195730404, + "grad_norm": 0.1815112829208374, + "learning_rate": 0.004905003935725728, + "loss": 3.3947410583496094, + "num_input_tokens_seen": 1137704960, + "step": 2170, + "train_runtime": 9902.2357, + "train_tokens_per_second": 114893.746 + }, + { + "epoch": 0.11796855975540464, + "grad_norm": 0.20339980721473694, + "learning_rate": 0.004903870366479064, + "loss": 3.3956260681152344, + "num_input_tokens_seen": 1142947840, + "step": 2180, + "train_runtime": 9947.4535, + "train_tokens_per_second": 114898.536 + }, + { + "epoch": 0.11850969993776887, + "grad_norm": 0.18381614983081818, + "learning_rate": 0.004902730221565258, + "loss": 3.3980743408203127, + "num_input_tokens_seen": 1148190720, + "step": 2190, + "train_runtime": 9992.6437, + "train_tokens_per_second": 114903.599 + }, + { + "epoch": 0.11905084012013312, + "grad_norm": 0.19806860387325287, + "learning_rate": 0.004901583504465119, + "loss": 3.393767547607422, + "num_input_tokens_seen": 1153433600, + "step": 2200, + "train_runtime": 10037.8153, + "train_tokens_per_second": 114908.829 + }, + { + "epoch": 0.11959198030249736, + "grad_norm": 0.20329956710338593, + "learning_rate": 0.004900430218679523, + "loss": 3.3944183349609376, + "num_input_tokens_seen": 1158676480, + "step": 2210, + "train_runtime": 10083.038, + "train_tokens_per_second": 114913.43 + }, + { + "epoch": 0.1201331204848616, + "grad_norm": 0.2080426961183548, + "learning_rate": 0.004899270367729398, + "loss": 3.3978126525878904, + "num_input_tokens_seen": 1163919360, + "step": 2220, + "train_runtime": 10128.2721, + "train_tokens_per_second": 114917.86 + }, + { + "epoch": 0.12067426066722585, + "grad_norm": 0.19686874747276306, + "learning_rate": 0.004898103955155715, + "loss": 3.395246124267578, + "num_input_tokens_seen": 1169162240, + "step": 2230, + "train_runtime": 10173.5097, + "train_tokens_per_second": 114922.212 + }, + { + "epoch": 0.12121540084959008, + "grad_norm": 0.20237593352794647, + "learning_rate": 0.004896930984519478, + "loss": 3.3845314025878905, + "num_input_tokens_seen": 1174405120, + "step": 2240, + "train_runtime": 10218.7376, + "train_tokens_per_second": 114926.634 + }, + { + "epoch": 0.12175654103195432, + "grad_norm": 0.19075846672058105, + "learning_rate": 0.004895751459401713, + "loss": 3.380054473876953, + "num_input_tokens_seen": 1179648000, + "step": 2250, + "train_runtime": 10263.9453, + "train_tokens_per_second": 114931.243 + }, + { + "epoch": 0.12229768121431857, + "grad_norm": 0.18929868936538696, + "learning_rate": 0.004894565383403456, + "loss": 3.3817626953125, + "num_input_tokens_seen": 1184890880, + "step": 2260, + "train_runtime": 10309.1491, + "train_tokens_per_second": 114935.857 + }, + { + "epoch": 0.12283882139668281, + "grad_norm": 0.18654604256153107, + "learning_rate": 0.0048933727601457415, + "loss": 3.3808876037597657, + "num_input_tokens_seen": 1190133760, + "step": 2270, + "train_runtime": 10354.365, + "train_tokens_per_second": 114940.294 + }, + { + "epoch": 0.12337996157904706, + "grad_norm": 0.19212667644023895, + "learning_rate": 0.004892173593269593, + "loss": 3.378282928466797, + "num_input_tokens_seen": 1195376640, + "step": 2280, + "train_runtime": 10399.5899, + "train_tokens_per_second": 114944.594 + }, + { + "epoch": 0.12392110176141129, + "grad_norm": 0.19903384149074554, + "learning_rate": 0.004890967886436014, + "loss": 3.384090042114258, + "num_input_tokens_seen": 1200619520, + "step": 2290, + "train_runtime": 10448.5527, + "train_tokens_per_second": 114907.735 + }, + { + "epoch": 0.12446224194377553, + "grad_norm": 0.19387711584568024, + "learning_rate": 0.004889755643325971, + "loss": 3.380754089355469, + "num_input_tokens_seen": 1205862400, + "step": 2300, + "train_runtime": 10493.7881, + "train_tokens_per_second": 114912.021 + }, + { + "epoch": 0.1250033821261398, + "grad_norm": 0.20909279584884644, + "learning_rate": 0.0048885368676403855, + "loss": 3.3727947235107423, + "num_input_tokens_seen": 1211105280, + "step": 2310, + "train_runtime": 10539.0351, + "train_tokens_per_second": 114916.145 + }, + { + "epoch": 0.12554452230850402, + "grad_norm": 0.17621192336082458, + "learning_rate": 0.004887311563100124, + "loss": 3.384077453613281, + "num_input_tokens_seen": 1216348160, + "step": 2320, + "train_runtime": 10584.2319, + "train_tokens_per_second": 114920.777 + }, + { + "epoch": 0.12608566249086825, + "grad_norm": 0.19513387978076935, + "learning_rate": 0.004886079733445985, + "loss": 3.378644561767578, + "num_input_tokens_seen": 1221591040, + "step": 2330, + "train_runtime": 10629.451, + "train_tokens_per_second": 114925.13 + }, + { + "epoch": 0.1266268026732325, + "grad_norm": 0.19339337944984436, + "learning_rate": 0.004884841382438689, + "loss": 3.3802566528320312, + "num_input_tokens_seen": 1226833920, + "step": 2340, + "train_runtime": 10674.6651, + "train_tokens_per_second": 114929.5 + }, + { + "epoch": 0.12716794285559674, + "grad_norm": 0.20770232379436493, + "learning_rate": 0.004883596513858863, + "loss": 3.3678009033203127, + "num_input_tokens_seen": 1232076800, + "step": 2350, + "train_runtime": 10719.8522, + "train_tokens_per_second": 114934.122 + }, + { + "epoch": 0.127709083037961, + "grad_norm": 0.19512751698493958, + "learning_rate": 0.004882345131507035, + "loss": 3.3827003479003905, + "num_input_tokens_seen": 1237319680, + "step": 2360, + "train_runtime": 10765.0276, + "train_tokens_per_second": 114938.831 + }, + { + "epoch": 0.12825022322032523, + "grad_norm": 0.1894627958536148, + "learning_rate": 0.004881087239203616, + "loss": 3.377857208251953, + "num_input_tokens_seen": 1242562560, + "step": 2370, + "train_runtime": 10810.2272, + "train_tokens_per_second": 114943.242 + }, + { + "epoch": 0.12879136340268946, + "grad_norm": 0.18233883380889893, + "learning_rate": 0.004879822840788895, + "loss": 3.370525360107422, + "num_input_tokens_seen": 1247805440, + "step": 2380, + "train_runtime": 10855.4267, + "train_tokens_per_second": 114947.618 + }, + { + "epoch": 0.12933250358505372, + "grad_norm": 0.1876172125339508, + "learning_rate": 0.00487855194012302, + "loss": 3.3757583618164064, + "num_input_tokens_seen": 1253048320, + "step": 2390, + "train_runtime": 10900.6108, + "train_tokens_per_second": 114952.12 + }, + { + "epoch": 0.12987364376741795, + "grad_norm": 0.19061093032360077, + "learning_rate": 0.0048772745410859955, + "loss": 3.371135711669922, + "num_input_tokens_seen": 1258291200, + "step": 2400, + "train_runtime": 10945.8277, + "train_tokens_per_second": 114956.24 + }, + { + "epoch": 0.1304147839497822, + "grad_norm": 0.19320231676101685, + "learning_rate": 0.004875990647577659, + "loss": 3.376973342895508, + "num_input_tokens_seen": 1263534080, + "step": 2410, + "train_runtime": 10991.0522, + "train_tokens_per_second": 114960.247 + }, + { + "epoch": 0.13095592413214643, + "grad_norm": 0.18665140867233276, + "learning_rate": 0.004874700263517679, + "loss": 3.371229553222656, + "num_input_tokens_seen": 1268776960, + "step": 2420, + "train_runtime": 11036.2334, + "train_tokens_per_second": 114964.673 + }, + { + "epoch": 0.13149706431451066, + "grad_norm": 0.18199113011360168, + "learning_rate": 0.004873403392845541, + "loss": 3.361619567871094, + "num_input_tokens_seen": 1274019840, + "step": 2430, + "train_runtime": 11081.4325, + "train_tokens_per_second": 114968.877 + }, + { + "epoch": 0.13203820449687492, + "grad_norm": 0.19316934049129486, + "learning_rate": 0.004872100039520528, + "loss": 3.360996627807617, + "num_input_tokens_seen": 1279262720, + "step": 2440, + "train_runtime": 11126.6221, + "train_tokens_per_second": 114973.144 + }, + { + "epoch": 0.13257934467923915, + "grad_norm": 0.185124471783638, + "learning_rate": 0.00487079020752172, + "loss": 3.366575241088867, + "num_input_tokens_seen": 1284505600, + "step": 2450, + "train_runtime": 11171.8004, + "train_tokens_per_second": 114977.493 + }, + { + "epoch": 0.1331204848616034, + "grad_norm": 0.18804939091205597, + "learning_rate": 0.004869473900847973, + "loss": 3.3575817108154298, + "num_input_tokens_seen": 1289748480, + "step": 2460, + "train_runtime": 11217.0182, + "train_tokens_per_second": 114981.402 + }, + { + "epoch": 0.13366162504396764, + "grad_norm": 0.19206225872039795, + "learning_rate": 0.004868151123517911, + "loss": 3.3654083251953124, + "num_input_tokens_seen": 1294991360, + "step": 2470, + "train_runtime": 11262.2319, + "train_tokens_per_second": 114985.322 + }, + { + "epoch": 0.13420276522633187, + "grad_norm": 0.1901652067899704, + "learning_rate": 0.004866821879569913, + "loss": 3.3583431243896484, + "num_input_tokens_seen": 1300234240, + "step": 2480, + "train_runtime": 11307.4375, + "train_tokens_per_second": 114989.293 + }, + { + "epoch": 0.13474390540869613, + "grad_norm": 0.1906082034111023, + "learning_rate": 0.004865486173062098, + "loss": 3.3592803955078123, + "num_input_tokens_seen": 1305477120, + "step": 2490, + "train_runtime": 11352.6608, + "train_tokens_per_second": 114993.053 + }, + { + "epoch": 0.13528504559106036, + "grad_norm": 0.17934362590312958, + "learning_rate": 0.004864144008072318, + "loss": 3.3405136108398437, + "num_input_tokens_seen": 1310720000, + "step": 2500, + "train_runtime": 11397.8619, + "train_tokens_per_second": 114997.007 + }, + { + "epoch": 0.13528504559106036, + "eval_loss": 3.2980093955993652, + "eval_runtime": 1.9858, + "eval_samples_per_second": 251.785, + "eval_steps_per_second": 4.029, + "num_input_tokens_seen": 1310720000, + "step": 2500 + }, + { + "epoch": 0.13582618577342462, + "grad_norm": 0.18693317472934723, + "learning_rate": 0.00486279538869814, + "loss": 3.367817687988281, + "num_input_tokens_seen": 1315962880, + "step": 2510, + "train_runtime": 11445.0335, + "train_tokens_per_second": 114981.13 + }, + { + "epoch": 0.13636732595578885, + "grad_norm": 0.18467511236667633, + "learning_rate": 0.004861440319056837, + "loss": 3.355504608154297, + "num_input_tokens_seen": 1321205760, + "step": 2520, + "train_runtime": 11490.248, + "train_tokens_per_second": 114984.965 + }, + { + "epoch": 0.13690846613815308, + "grad_norm": 0.17405100166797638, + "learning_rate": 0.004860078803285375, + "loss": 3.3486671447753906, + "num_input_tokens_seen": 1326448640, + "step": 2530, + "train_runtime": 11535.4773, + "train_tokens_per_second": 114988.622 + }, + { + "epoch": 0.13744960632051734, + "grad_norm": 0.1991710066795349, + "learning_rate": 0.0048587108455403994, + "loss": 3.3470123291015623, + "num_input_tokens_seen": 1331691520, + "step": 2540, + "train_runtime": 11580.7091, + "train_tokens_per_second": 114992.226 + }, + { + "epoch": 0.13799074650288157, + "grad_norm": 0.19981589913368225, + "learning_rate": 0.004857336449998221, + "loss": 3.355559539794922, + "num_input_tokens_seen": 1336934400, + "step": 2550, + "train_runtime": 11625.9198, + "train_tokens_per_second": 114996.011 + }, + { + "epoch": 0.13853188668524583, + "grad_norm": 0.19031056761741638, + "learning_rate": 0.004855955620854806, + "loss": 3.359702301025391, + "num_input_tokens_seen": 1342177280, + "step": 2560, + "train_runtime": 11671.1393, + "train_tokens_per_second": 114999.68 + }, + { + "epoch": 0.13907302686761006, + "grad_norm": 0.1825476437807083, + "learning_rate": 0.004854568362325763, + "loss": 3.3532974243164064, + "num_input_tokens_seen": 1347420160, + "step": 2570, + "train_runtime": 11716.3486, + "train_tokens_per_second": 115003.42 + }, + { + "epoch": 0.13961416704997429, + "grad_norm": 0.1943996399641037, + "learning_rate": 0.004853174678646328, + "loss": 3.3549442291259766, + "num_input_tokens_seen": 1352663040, + "step": 2580, + "train_runtime": 11761.5665, + "train_tokens_per_second": 115007.048 + }, + { + "epoch": 0.14015530723233854, + "grad_norm": 0.18165603280067444, + "learning_rate": 0.004851774574071355, + "loss": 3.345872497558594, + "num_input_tokens_seen": 1357905920, + "step": 2590, + "train_runtime": 11806.7776, + "train_tokens_per_second": 115010.714 + }, + { + "epoch": 0.14069644741470277, + "grad_norm": 0.19906878471374512, + "learning_rate": 0.004850368052875296, + "loss": 3.3501548767089844, + "num_input_tokens_seen": 1363148800, + "step": 2600, + "train_runtime": 11851.978, + "train_tokens_per_second": 115014.456 + }, + { + "epoch": 0.14123758759706703, + "grad_norm": 0.1889800876379013, + "learning_rate": 0.004848955119352198, + "loss": 3.357212448120117, + "num_input_tokens_seen": 1368391680, + "step": 2610, + "train_runtime": 11897.1651, + "train_tokens_per_second": 115018.298 + }, + { + "epoch": 0.14177872777943126, + "grad_norm": 0.1907270848751068, + "learning_rate": 0.00484753577781568, + "loss": 3.3405483245849608, + "num_input_tokens_seen": 1373634560, + "step": 2620, + "train_runtime": 11942.3695, + "train_tokens_per_second": 115021.945 + }, + { + "epoch": 0.1423198679617955, + "grad_norm": 0.18622975051403046, + "learning_rate": 0.004846110032598928, + "loss": 3.344770050048828, + "num_input_tokens_seen": 1378877440, + "step": 2630, + "train_runtime": 11987.559, + "train_tokens_per_second": 115025.706 + }, + { + "epoch": 0.14286100814415975, + "grad_norm": 0.20059405267238617, + "learning_rate": 0.004844677888054675, + "loss": 3.344530487060547, + "num_input_tokens_seen": 1384120320, + "step": 2640, + "train_runtime": 12032.7386, + "train_tokens_per_second": 115029.535 + }, + { + "epoch": 0.14340214832652398, + "grad_norm": 0.18230022490024567, + "learning_rate": 0.004843239348555194, + "loss": 3.340105438232422, + "num_input_tokens_seen": 1389363200, + "step": 2650, + "train_runtime": 12077.95, + "train_tokens_per_second": 115033.032 + }, + { + "epoch": 0.14394328850888824, + "grad_norm": 0.1814001351594925, + "learning_rate": 0.004841794418492279, + "loss": 3.3359622955322266, + "num_input_tokens_seen": 1394606080, + "step": 2660, + "train_runtime": 12123.1531, + "train_tokens_per_second": 115036.581 + }, + { + "epoch": 0.14448442869125247, + "grad_norm": 0.20956675708293915, + "learning_rate": 0.004840343102277236, + "loss": 3.3457298278808594, + "num_input_tokens_seen": 1399848960, + "step": 2670, + "train_runtime": 12168.3992, + "train_tokens_per_second": 115039.697 + }, + { + "epoch": 0.1450255688736167, + "grad_norm": 0.19602610170841217, + "learning_rate": 0.004838885404340865, + "loss": 3.337678909301758, + "num_input_tokens_seen": 1405091840, + "step": 2680, + "train_runtime": 12217.5728, + "train_tokens_per_second": 115005.809 + }, + { + "epoch": 0.14556670905598096, + "grad_norm": 0.18408642709255219, + "learning_rate": 0.00483742132913345, + "loss": 3.3393791198730467, + "num_input_tokens_seen": 1410334720, + "step": 2690, + "train_runtime": 12262.7736, + "train_tokens_per_second": 115009.439 + }, + { + "epoch": 0.1461078492383452, + "grad_norm": 0.21065284311771393, + "learning_rate": 0.00483595088112475, + "loss": 3.3346370697021483, + "num_input_tokens_seen": 1415577600, + "step": 2700, + "train_runtime": 12307.9818, + "train_tokens_per_second": 115012.974 + }, + { + "epoch": 0.14664898942070945, + "grad_norm": 0.18873485922813416, + "learning_rate": 0.00483447406480397, + "loss": 3.33388671875, + "num_input_tokens_seen": 1420820480, + "step": 2710, + "train_runtime": 12353.2004, + "train_tokens_per_second": 115016.387 + }, + { + "epoch": 0.14719012960307368, + "grad_norm": 0.18760992586612701, + "learning_rate": 0.004832990884679764, + "loss": 3.3374618530273437, + "num_input_tokens_seen": 1426063360, + "step": 2720, + "train_runtime": 12398.393, + "train_tokens_per_second": 115020.016 + }, + { + "epoch": 0.1477312697854379, + "grad_norm": 0.192196786403656, + "learning_rate": 0.004831501345280215, + "loss": 3.3331283569335937, + "num_input_tokens_seen": 1431306240, + "step": 2730, + "train_runtime": 12443.5889, + "train_tokens_per_second": 115023.588 + }, + { + "epoch": 0.14827240996780217, + "grad_norm": 0.18086469173431396, + "learning_rate": 0.004830005451152815, + "loss": 3.342273712158203, + "num_input_tokens_seen": 1436549120, + "step": 2740, + "train_runtime": 12488.8002, + "train_tokens_per_second": 115026.992 + }, + { + "epoch": 0.1488135501501664, + "grad_norm": 0.20178896188735962, + "learning_rate": 0.004828503206864461, + "loss": 3.340282440185547, + "num_input_tokens_seen": 1441792000, + "step": 2750, + "train_runtime": 12534.0299, + "train_tokens_per_second": 115030.203 + }, + { + "epoch": 0.14935469033253065, + "grad_norm": 0.1862727403640747, + "learning_rate": 0.004826994617001436, + "loss": 3.333884048461914, + "num_input_tokens_seen": 1447034880, + "step": 2760, + "train_runtime": 12579.232, + "train_tokens_per_second": 115033.643 + }, + { + "epoch": 0.14989583051489488, + "grad_norm": 0.18692044913768768, + "learning_rate": 0.004825479686169395, + "loss": 3.3313224792480467, + "num_input_tokens_seen": 1452277760, + "step": 2770, + "train_runtime": 12624.4525, + "train_tokens_per_second": 115036.89 + }, + { + "epoch": 0.15043697069725911, + "grad_norm": 0.19887301325798035, + "learning_rate": 0.004823958418993353, + "loss": 3.3318561553955077, + "num_input_tokens_seen": 1457520640, + "step": 2780, + "train_runtime": 12669.6266, + "train_tokens_per_second": 115040.536 + }, + { + "epoch": 0.15097811087962337, + "grad_norm": 0.19694387912750244, + "learning_rate": 0.004822430820117667, + "loss": 3.324603271484375, + "num_input_tokens_seen": 1462763520, + "step": 2790, + "train_runtime": 12714.8383, + "train_tokens_per_second": 115043.816 + }, + { + "epoch": 0.1515192510619876, + "grad_norm": 0.18864646553993225, + "learning_rate": 0.0048208968942060285, + "loss": 3.329520416259766, + "num_input_tokens_seen": 1468006400, + "step": 2800, + "train_runtime": 12760.0555, + "train_tokens_per_second": 115047.023 + }, + { + "epoch": 0.15206039124435186, + "grad_norm": 0.19097571074962616, + "learning_rate": 0.004819356645941442, + "loss": 3.334062194824219, + "num_input_tokens_seen": 1473249280, + "step": 2810, + "train_runtime": 12805.2599, + "train_tokens_per_second": 115050.323 + }, + { + "epoch": 0.1526015314267161, + "grad_norm": 0.18825189769268036, + "learning_rate": 0.004817810080026213, + "loss": 3.3339500427246094, + "num_input_tokens_seen": 1478492160, + "step": 2820, + "train_runtime": 12850.4647, + "train_tokens_per_second": 115053.595 + }, + { + "epoch": 0.15314267160908032, + "grad_norm": 0.17853769659996033, + "learning_rate": 0.004816257201181937, + "loss": 3.3271289825439454, + "num_input_tokens_seen": 1483735040, + "step": 2830, + "train_runtime": 12895.6915, + "train_tokens_per_second": 115056.648 + }, + { + "epoch": 0.15368381179144458, + "grad_norm": 0.1959368884563446, + "learning_rate": 0.004814698014149483, + "loss": 3.3293079376220702, + "num_input_tokens_seen": 1488977920, + "step": 2840, + "train_runtime": 12940.9039, + "train_tokens_per_second": 115059.808 + }, + { + "epoch": 0.1542249519738088, + "grad_norm": 0.1917344182729721, + "learning_rate": 0.0048131325236889745, + "loss": 3.3289634704589846, + "num_input_tokens_seen": 1494220800, + "step": 2850, + "train_runtime": 12986.1305, + "train_tokens_per_second": 115062.82 + }, + { + "epoch": 0.15476609215617307, + "grad_norm": 0.18901459872722626, + "learning_rate": 0.004811560734579785, + "loss": 3.3151206970214844, + "num_input_tokens_seen": 1499463680, + "step": 2860, + "train_runtime": 13031.3423, + "train_tokens_per_second": 115065.943 + }, + { + "epoch": 0.1553072323385373, + "grad_norm": 0.1884569674730301, + "learning_rate": 0.004809982651620513, + "loss": 3.321660614013672, + "num_input_tokens_seen": 1504706560, + "step": 2870, + "train_runtime": 13076.553, + "train_tokens_per_second": 115069.052 + }, + { + "epoch": 0.15584837252090153, + "grad_norm": 0.1960553228855133, + "learning_rate": 0.004808398279628971, + "loss": 3.326691436767578, + "num_input_tokens_seen": 1509949440, + "step": 2880, + "train_runtime": 13121.764, + "train_tokens_per_second": 115072.138 + }, + { + "epoch": 0.1563895127032658, + "grad_norm": 0.19503499567508698, + "learning_rate": 0.004806807623442178, + "loss": 3.321258544921875, + "num_input_tokens_seen": 1515192320, + "step": 2890, + "train_runtime": 13166.9951, + "train_tokens_per_second": 115075.027 + }, + { + "epoch": 0.15693065288563002, + "grad_norm": 0.19287334382534027, + "learning_rate": 0.004805210687916331, + "loss": 3.3227684020996096, + "num_input_tokens_seen": 1520435200, + "step": 2900, + "train_runtime": 13212.1814, + "train_tokens_per_second": 115078.287 + }, + { + "epoch": 0.15747179306799428, + "grad_norm": 0.19813010096549988, + "learning_rate": 0.004803607477926801, + "loss": 3.3109420776367187, + "num_input_tokens_seen": 1525678080, + "step": 2910, + "train_runtime": 13257.3873, + "train_tokens_per_second": 115081.354 + }, + { + "epoch": 0.1580129332503585, + "grad_norm": 0.19769278168678284, + "learning_rate": 0.004801997998368116, + "loss": 3.317332458496094, + "num_input_tokens_seen": 1530920960, + "step": 2920, + "train_runtime": 13302.5829, + "train_tokens_per_second": 115084.489 + }, + { + "epoch": 0.15855407343272274, + "grad_norm": 0.21613162755966187, + "learning_rate": 0.0048003822541539416, + "loss": 3.3125213623046874, + "num_input_tokens_seen": 1536163840, + "step": 2930, + "train_runtime": 13347.7691, + "train_tokens_per_second": 115087.684 + }, + { + "epoch": 0.159095213615087, + "grad_norm": 0.19285152852535248, + "learning_rate": 0.004798760250217072, + "loss": 3.3247020721435545, + "num_input_tokens_seen": 1541406720, + "step": 2940, + "train_runtime": 13392.9574, + "train_tokens_per_second": 115090.84 + }, + { + "epoch": 0.15963635379745122, + "grad_norm": 0.19629855453968048, + "learning_rate": 0.004797131991509409, + "loss": 3.3183937072753906, + "num_input_tokens_seen": 1546649600, + "step": 2950, + "train_runtime": 13438.1692, + "train_tokens_per_second": 115093.773 + }, + { + "epoch": 0.16017749397981548, + "grad_norm": 0.1971038430929184, + "learning_rate": 0.004795497483001952, + "loss": 3.3157825469970703, + "num_input_tokens_seen": 1551892480, + "step": 2960, + "train_runtime": 13483.3787, + "train_tokens_per_second": 115096.706 + }, + { + "epoch": 0.1607186341621797, + "grad_norm": 0.2136721909046173, + "learning_rate": 0.0047938567296847805, + "loss": 3.3181556701660155, + "num_input_tokens_seen": 1557135360, + "step": 2970, + "train_runtime": 13528.5747, + "train_tokens_per_second": 115099.734 + }, + { + "epoch": 0.16125977434454394, + "grad_norm": 0.18783020973205566, + "learning_rate": 0.004792209736567038, + "loss": 3.3050804138183594, + "num_input_tokens_seen": 1562378240, + "step": 2980, + "train_runtime": 13573.7919, + "train_tokens_per_second": 115102.563 + }, + { + "epoch": 0.1618009145269082, + "grad_norm": 0.17761750519275665, + "learning_rate": 0.0047905565086769205, + "loss": 3.313432312011719, + "num_input_tokens_seen": 1567621120, + "step": 2990, + "train_runtime": 13618.9923, + "train_tokens_per_second": 115105.515 + }, + { + "epoch": 0.16234205470927243, + "grad_norm": 0.1785641759634018, + "learning_rate": 0.004788897051061655, + "loss": 3.317774200439453, + "num_input_tokens_seen": 1572864000, + "step": 3000, + "train_runtime": 13664.2112, + "train_tokens_per_second": 115108.291 + }, + { + "epoch": 0.16234205470927243, + "eval_loss": 3.2526497840881348, + "eval_runtime": 1.9863, + "eval_samples_per_second": 251.723, + "eval_steps_per_second": 4.028, + "num_input_tokens_seen": 1572864000, + "step": 3000 + }, + { + "epoch": 0.1628831948916367, + "grad_norm": 0.19272948801517487, + "learning_rate": 0.004787231368787491, + "loss": 3.3128257751464845, + "num_input_tokens_seen": 1578106880, + "step": 3010, + "train_runtime": 13714.1711, + "train_tokens_per_second": 115071.255 + }, + { + "epoch": 0.16342433507400092, + "grad_norm": 0.1745939403772354, + "learning_rate": 0.004785559466939679, + "loss": 3.31363525390625, + "num_input_tokens_seen": 1583349760, + "step": 3020, + "train_runtime": 13759.3609, + "train_tokens_per_second": 115074.368 + }, + { + "epoch": 0.16396547525636515, + "grad_norm": 0.20123089849948883, + "learning_rate": 0.0047838813506224575, + "loss": 3.3179275512695314, + "num_input_tokens_seen": 1588592640, + "step": 3030, + "train_runtime": 13804.5657, + "train_tokens_per_second": 115077.335 + }, + { + "epoch": 0.1645066154387294, + "grad_norm": 0.19240304827690125, + "learning_rate": 0.004782197024959039, + "loss": 3.3164352416992187, + "num_input_tokens_seen": 1593835520, + "step": 3040, + "train_runtime": 13849.7747, + "train_tokens_per_second": 115080.249 + }, + { + "epoch": 0.16504775562109364, + "grad_norm": 0.19316141307353973, + "learning_rate": 0.004780506495091593, + "loss": 3.316120147705078, + "num_input_tokens_seen": 1599078400, + "step": 3050, + "train_runtime": 13894.9935, + "train_tokens_per_second": 115083.062 + }, + { + "epoch": 0.1655888958034579, + "grad_norm": 0.19889311492443085, + "learning_rate": 0.004778809766181229, + "loss": 3.3089508056640624, + "num_input_tokens_seen": 1604321280, + "step": 3060, + "train_runtime": 13943.9286, + "train_tokens_per_second": 115055.185 + }, + { + "epoch": 0.16613003598582213, + "grad_norm": 0.19427727162837982, + "learning_rate": 0.004777106843407982, + "loss": 3.3107887268066407, + "num_input_tokens_seen": 1609564160, + "step": 3070, + "train_runtime": 13989.1559, + "train_tokens_per_second": 115057.99 + }, + { + "epoch": 0.16667117616818636, + "grad_norm": 0.1867746263742447, + "learning_rate": 0.004775397731970797, + "loss": 3.306330108642578, + "num_input_tokens_seen": 1614807040, + "step": 3080, + "train_runtime": 14034.3821, + "train_tokens_per_second": 115060.787 + }, + { + "epoch": 0.16721231635055062, + "grad_norm": 0.18061718344688416, + "learning_rate": 0.0047736824370875125, + "loss": 3.3135826110839846, + "num_input_tokens_seen": 1620049920, + "step": 3090, + "train_runtime": 14079.6171, + "train_tokens_per_second": 115063.493 + }, + { + "epoch": 0.16775345653291485, + "grad_norm": 0.1992795616388321, + "learning_rate": 0.004771960963994845, + "loss": 3.2958747863769533, + "num_input_tokens_seen": 1625292800, + "step": 3100, + "train_runtime": 14124.8552, + "train_tokens_per_second": 115066.157 + }, + { + "epoch": 0.1682945967152791, + "grad_norm": 0.1932191550731659, + "learning_rate": 0.004770233317948373, + "loss": 3.305771255493164, + "num_input_tokens_seen": 1630535680, + "step": 3110, + "train_runtime": 14170.1029, + "train_tokens_per_second": 115068.725 + }, + { + "epoch": 0.16883573689764333, + "grad_norm": 0.18210622668266296, + "learning_rate": 0.00476849950422252, + "loss": 3.309395599365234, + "num_input_tokens_seen": 1635778560, + "step": 3120, + "train_runtime": 14215.3361, + "train_tokens_per_second": 115071.396 + }, + { + "epoch": 0.16937687708000757, + "grad_norm": 0.19367872178554535, + "learning_rate": 0.004766759528110539, + "loss": 3.302986907958984, + "num_input_tokens_seen": 1641021440, + "step": 3130, + "train_runtime": 14260.5763, + "train_tokens_per_second": 115073.992 + }, + { + "epoch": 0.16991801726237182, + "grad_norm": 0.19194577634334564, + "learning_rate": 0.004765013394924499, + "loss": 3.304148864746094, + "num_input_tokens_seen": 1646264320, + "step": 3140, + "train_runtime": 14305.8047, + "train_tokens_per_second": 115076.667 + }, + { + "epoch": 0.17045915744473605, + "grad_norm": 0.18965749442577362, + "learning_rate": 0.0047632611099952624, + "loss": 3.298334503173828, + "num_input_tokens_seen": 1651507200, + "step": 3150, + "train_runtime": 14351.0274, + "train_tokens_per_second": 115079.371 + }, + { + "epoch": 0.1710002976271003, + "grad_norm": 0.17816977202892303, + "learning_rate": 0.004761502678672474, + "loss": 3.300872802734375, + "num_input_tokens_seen": 1656750080, + "step": 3160, + "train_runtime": 14396.2711, + "train_tokens_per_second": 115081.889 + }, + { + "epoch": 0.17154143780946454, + "grad_norm": 0.19343526661396027, + "learning_rate": 0.004759738106324546, + "loss": 3.2991104125976562, + "num_input_tokens_seen": 1661992960, + "step": 3170, + "train_runtime": 14441.5146, + "train_tokens_per_second": 115084.394 + }, + { + "epoch": 0.17208257799182877, + "grad_norm": 0.18815293908119202, + "learning_rate": 0.004757967398338635, + "loss": 3.307733154296875, + "num_input_tokens_seen": 1667235840, + "step": 3180, + "train_runtime": 14486.7506, + "train_tokens_per_second": 115086.943 + }, + { + "epoch": 0.17262371817419303, + "grad_norm": 0.18241587281227112, + "learning_rate": 0.004756190560120631, + "loss": 3.2984477996826174, + "num_input_tokens_seen": 1672478720, + "step": 3190, + "train_runtime": 14531.9861, + "train_tokens_per_second": 115089.48 + }, + { + "epoch": 0.17316485835655726, + "grad_norm": 0.18176402151584625, + "learning_rate": 0.00475440759709514, + "loss": 3.300640869140625, + "num_input_tokens_seen": 1677721600, + "step": 3200, + "train_runtime": 14577.2147, + "train_tokens_per_second": 115092.055 + }, + { + "epoch": 0.17370599853892152, + "grad_norm": 0.20022694766521454, + "learning_rate": 0.004752618514705466, + "loss": 3.300579071044922, + "num_input_tokens_seen": 1682964480, + "step": 3210, + "train_runtime": 14622.4411, + "train_tokens_per_second": 115094.632 + }, + { + "epoch": 0.17424713872128575, + "grad_norm": 0.18792809545993805, + "learning_rate": 0.0047508233184135945, + "loss": 3.295984649658203, + "num_input_tokens_seen": 1688207360, + "step": 3220, + "train_runtime": 14667.6619, + "train_tokens_per_second": 115097.237 + }, + { + "epoch": 0.17478827890364998, + "grad_norm": 0.200827494263649, + "learning_rate": 0.0047490220137001785, + "loss": 3.2906261444091798, + "num_input_tokens_seen": 1693450240, + "step": 3230, + "train_runtime": 14712.8844, + "train_tokens_per_second": 115099.813 + }, + { + "epoch": 0.17532941908601424, + "grad_norm": 0.19141127169132233, + "learning_rate": 0.004747214606064517, + "loss": 3.2837890625, + "num_input_tokens_seen": 1698693120, + "step": 3240, + "train_runtime": 14758.1057, + "train_tokens_per_second": 115102.382 + }, + { + "epoch": 0.17587055926837847, + "grad_norm": 0.18976351618766785, + "learning_rate": 0.0047454011010245436, + "loss": 3.287107467651367, + "num_input_tokens_seen": 1703936000, + "step": 3250, + "train_runtime": 14803.3273, + "train_tokens_per_second": 115104.933 + }, + { + "epoch": 0.17641169945074273, + "grad_norm": 0.19698546826839447, + "learning_rate": 0.004743581504116804, + "loss": 3.2882354736328123, + "num_input_tokens_seen": 1709178880, + "step": 3260, + "train_runtime": 14848.5302, + "train_tokens_per_second": 115107.614 + }, + { + "epoch": 0.17695283963310696, + "grad_norm": 0.17822493612766266, + "learning_rate": 0.004741755820896446, + "loss": 3.2927810668945314, + "num_input_tokens_seen": 1714421760, + "step": 3270, + "train_runtime": 14893.7537, + "train_tokens_per_second": 115110.119 + }, + { + "epoch": 0.1774939798154712, + "grad_norm": 0.1720447987318039, + "learning_rate": 0.004739924056937195, + "loss": 3.2904899597167967, + "num_input_tokens_seen": 1719664640, + "step": 3280, + "train_runtime": 14938.9717, + "train_tokens_per_second": 115112.652 + }, + { + "epoch": 0.17803511999783544, + "grad_norm": 0.18303626775741577, + "learning_rate": 0.004738086217831344, + "loss": 3.28282470703125, + "num_input_tokens_seen": 1724907520, + "step": 3290, + "train_runtime": 14984.1992, + "train_tokens_per_second": 115115.096 + }, + { + "epoch": 0.17857626018019968, + "grad_norm": 0.176763117313385, + "learning_rate": 0.004736242309189728, + "loss": 3.286945343017578, + "num_input_tokens_seen": 1730150400, + "step": 3300, + "train_runtime": 15029.4297, + "train_tokens_per_second": 115117.502 + }, + { + "epoch": 0.17911740036256393, + "grad_norm": 0.19218850135803223, + "learning_rate": 0.004734392336641718, + "loss": 3.290885162353516, + "num_input_tokens_seen": 1735393280, + "step": 3310, + "train_runtime": 15074.6639, + "train_tokens_per_second": 115119.866 + }, + { + "epoch": 0.17965854054492816, + "grad_norm": 0.180914968252182, + "learning_rate": 0.004732536305835194, + "loss": 3.2893463134765626, + "num_input_tokens_seen": 1740636160, + "step": 3320, + "train_runtime": 15119.9044, + "train_tokens_per_second": 115122.167 + }, + { + "epoch": 0.1801996807272924, + "grad_norm": 0.1835494488477707, + "learning_rate": 0.0047306742224365326, + "loss": 3.2857479095458983, + "num_input_tokens_seen": 1745879040, + "step": 3330, + "train_runtime": 15165.1285, + "train_tokens_per_second": 115124.579 + }, + { + "epoch": 0.18074082090965665, + "grad_norm": 0.1805170625448227, + "learning_rate": 0.004728806092130589, + "loss": 3.2880077362060547, + "num_input_tokens_seen": 1751121920, + "step": 3340, + "train_runtime": 15210.3276, + "train_tokens_per_second": 115127.166 + }, + { + "epoch": 0.18128196109202088, + "grad_norm": 0.18228840827941895, + "learning_rate": 0.00472693192062068, + "loss": 3.286875915527344, + "num_input_tokens_seen": 1756364800, + "step": 3350, + "train_runtime": 15255.5307, + "train_tokens_per_second": 115129.709 + }, + { + "epoch": 0.18182310127438514, + "grad_norm": 0.20272916555404663, + "learning_rate": 0.0047250517136285634, + "loss": 3.2986392974853516, + "num_input_tokens_seen": 1761607680, + "step": 3360, + "train_runtime": 15300.7449, + "train_tokens_per_second": 115132.151 + }, + { + "epoch": 0.18236424145674937, + "grad_norm": 0.17199651896953583, + "learning_rate": 0.0047231654768944255, + "loss": 3.2849578857421875, + "num_input_tokens_seen": 1766850560, + "step": 3370, + "train_runtime": 15345.9591, + "train_tokens_per_second": 115134.58 + }, + { + "epoch": 0.1829053816391136, + "grad_norm": 0.18118058145046234, + "learning_rate": 0.00472127321617686, + "loss": 3.2900650024414064, + "num_input_tokens_seen": 1772093440, + "step": 3380, + "train_runtime": 15391.1925, + "train_tokens_per_second": 115136.851 + }, + { + "epoch": 0.18344652182147786, + "grad_norm": 0.19814889132976532, + "learning_rate": 0.004719374937252852, + "loss": 3.280558776855469, + "num_input_tokens_seen": 1777336320, + "step": 3390, + "train_runtime": 15436.4025, + "train_tokens_per_second": 115139.283 + }, + { + "epoch": 0.1839876620038421, + "grad_norm": 0.2015380561351776, + "learning_rate": 0.00471747064591776, + "loss": 3.30006103515625, + "num_input_tokens_seen": 1782579200, + "step": 3400, + "train_runtime": 15481.5971, + "train_tokens_per_second": 115141.816 + }, + { + "epoch": 0.18452880218620635, + "grad_norm": 0.16767387092113495, + "learning_rate": 0.0047155603479852965, + "loss": 3.2787837982177734, + "num_input_tokens_seen": 1787822080, + "step": 3410, + "train_runtime": 15526.8015, + "train_tokens_per_second": 115144.261 + }, + { + "epoch": 0.18506994236857058, + "grad_norm": 0.169756680727005, + "learning_rate": 0.0047136440492875145, + "loss": 3.283340072631836, + "num_input_tokens_seen": 1793064960, + "step": 3420, + "train_runtime": 15572.0514, + "train_tokens_per_second": 115146.355 + }, + { + "epoch": 0.1856110825509348, + "grad_norm": 0.18903492391109467, + "learning_rate": 0.004711721755674787, + "loss": 3.289379119873047, + "num_input_tokens_seen": 1798307840, + "step": 3430, + "train_runtime": 15617.2557, + "train_tokens_per_second": 115148.774 + }, + { + "epoch": 0.18615222273329907, + "grad_norm": 0.19553808867931366, + "learning_rate": 0.004709793473015785, + "loss": 3.277596664428711, + "num_input_tokens_seen": 1803550720, + "step": 3440, + "train_runtime": 15666.2718, + "train_tokens_per_second": 115123.16 + }, + { + "epoch": 0.1866933629156633, + "grad_norm": 0.17524783313274384, + "learning_rate": 0.004707859207197468, + "loss": 3.272700881958008, + "num_input_tokens_seen": 1808793600, + "step": 3450, + "train_runtime": 15711.4431, + "train_tokens_per_second": 115125.873 + }, + { + "epoch": 0.18723450309802755, + "grad_norm": 0.1725703924894333, + "learning_rate": 0.004705918964125061, + "loss": 3.2771453857421875, + "num_input_tokens_seen": 1814036480, + "step": 3460, + "train_runtime": 15756.6281, + "train_tokens_per_second": 115128.47 + }, + { + "epoch": 0.18777564328039179, + "grad_norm": 0.18361718952655792, + "learning_rate": 0.004703972749722038, + "loss": 3.2812034606933596, + "num_input_tokens_seen": 1819279360, + "step": 3470, + "train_runtime": 15801.8225, + "train_tokens_per_second": 115130.983 + }, + { + "epoch": 0.18831678346275602, + "grad_norm": 0.18993116915225983, + "learning_rate": 0.004702020569930098, + "loss": 3.2690109252929687, + "num_input_tokens_seen": 1824522240, + "step": 3480, + "train_runtime": 15847.0133, + "train_tokens_per_second": 115133.508 + }, + { + "epoch": 0.18885792364512027, + "grad_norm": 0.1982622891664505, + "learning_rate": 0.004700062430709161, + "loss": 3.2883895874023437, + "num_input_tokens_seen": 1829765120, + "step": 3490, + "train_runtime": 15892.1956, + "train_tokens_per_second": 115136.081 + }, + { + "epoch": 0.1893990638274845, + "grad_norm": 0.1953168362379074, + "learning_rate": 0.004698098338037333, + "loss": 3.2819141387939452, + "num_input_tokens_seen": 1835008000, + "step": 3500, + "train_runtime": 15937.3587, + "train_tokens_per_second": 115138.778 + }, + { + "epoch": 0.1893990638274845, + "eval_loss": 3.2193782329559326, + "eval_runtime": 1.9829, + "eval_samples_per_second": 252.151, + "eval_steps_per_second": 4.034, + "num_input_tokens_seen": 1835008000, + "step": 3500 + }, + { + "epoch": 0.18994020400984876, + "grad_norm": 0.17765532433986664, + "learning_rate": 0.004696128297910899, + "loss": 3.2748733520507813, + "num_input_tokens_seen": 1840250880, + "step": 3510, + "train_runtime": 15984.532, + "train_tokens_per_second": 115126.979 + }, + { + "epoch": 0.190481344192213, + "grad_norm": 0.1692020744085312, + "learning_rate": 0.0046941523163443015, + "loss": 3.282354736328125, + "num_input_tokens_seen": 1845493760, + "step": 3520, + "train_runtime": 16029.707, + "train_tokens_per_second": 115129.601 + }, + { + "epoch": 0.19102248437457722, + "grad_norm": 0.17890560626983643, + "learning_rate": 0.00469217039937012, + "loss": 3.27266845703125, + "num_input_tokens_seen": 1850736640, + "step": 3530, + "train_runtime": 16074.8942, + "train_tokens_per_second": 115132.119 + }, + { + "epoch": 0.19156362455694148, + "grad_norm": 0.17925257980823517, + "learning_rate": 0.004690182553039058, + "loss": 3.28330078125, + "num_input_tokens_seen": 1855979520, + "step": 3540, + "train_runtime": 16120.1066, + "train_tokens_per_second": 115134.445 + }, + { + "epoch": 0.1921047647393057, + "grad_norm": 0.1788860410451889, + "learning_rate": 0.004688188783419917, + "loss": 3.2885406494140623, + "num_input_tokens_seen": 1861222400, + "step": 3550, + "train_runtime": 16165.2878, + "train_tokens_per_second": 115136.979 + }, + { + "epoch": 0.19264590492166997, + "grad_norm": 0.1811930388212204, + "learning_rate": 0.004686189096599585, + "loss": 3.2768978118896483, + "num_input_tokens_seen": 1866465280, + "step": 3560, + "train_runtime": 16210.475, + "train_tokens_per_second": 115139.456 + }, + { + "epoch": 0.1931870451040342, + "grad_norm": 0.2033502757549286, + "learning_rate": 0.004684183498683013, + "loss": 3.2799072265625, + "num_input_tokens_seen": 1871708160, + "step": 3570, + "train_runtime": 16255.6518, + "train_tokens_per_second": 115141.994 + }, + { + "epoch": 0.19372818528639843, + "grad_norm": 0.1871407926082611, + "learning_rate": 0.0046821719957932, + "loss": 3.2745807647705076, + "num_input_tokens_seen": 1876951040, + "step": 3580, + "train_runtime": 16300.8306, + "train_tokens_per_second": 115144.503 + }, + { + "epoch": 0.1942693254687627, + "grad_norm": 0.18156413733959198, + "learning_rate": 0.004680154594071171, + "loss": 3.275892639160156, + "num_input_tokens_seen": 1882193920, + "step": 3590, + "train_runtime": 16346.0348, + "train_tokens_per_second": 115146.819 + }, + { + "epoch": 0.19481046565112692, + "grad_norm": 0.17189612984657288, + "learning_rate": 0.004678131299675962, + "loss": 3.278411102294922, + "num_input_tokens_seen": 1887436800, + "step": 3600, + "train_runtime": 16391.2075, + "train_tokens_per_second": 115149.344 + }, + { + "epoch": 0.19535160583349118, + "grad_norm": 0.18602769076824188, + "learning_rate": 0.004676102118784596, + "loss": 3.2600128173828127, + "num_input_tokens_seen": 1892679680, + "step": 3610, + "train_runtime": 16436.3754, + "train_tokens_per_second": 115151.89 + }, + { + "epoch": 0.1958927460158554, + "grad_norm": 0.18247896432876587, + "learning_rate": 0.0046740670575920705, + "loss": 3.263835906982422, + "num_input_tokens_seen": 1897922560, + "step": 3620, + "train_runtime": 16481.5414, + "train_tokens_per_second": 115154.434 + }, + { + "epoch": 0.19643388619821964, + "grad_norm": 0.17899157106876373, + "learning_rate": 0.004672026122311332, + "loss": 3.266416549682617, + "num_input_tokens_seen": 1903165440, + "step": 3630, + "train_runtime": 16526.6863, + "train_tokens_per_second": 115157.111 + }, + { + "epoch": 0.1969750263805839, + "grad_norm": 0.19543124735355377, + "learning_rate": 0.004669979319173264, + "loss": 3.261871337890625, + "num_input_tokens_seen": 1908408320, + "step": 3640, + "train_runtime": 16571.8633, + "train_tokens_per_second": 115159.55 + }, + { + "epoch": 0.19751616656294813, + "grad_norm": 0.18458126485347748, + "learning_rate": 0.004667926654426661, + "loss": 3.2731971740722656, + "num_input_tokens_seen": 1913651200, + "step": 3650, + "train_runtime": 16617.0658, + "train_tokens_per_second": 115161.799 + }, + { + "epoch": 0.19805730674531238, + "grad_norm": 0.18683847784996033, + "learning_rate": 0.004665868134338213, + "loss": 3.2641891479492187, + "num_input_tokens_seen": 1918894080, + "step": 3660, + "train_runtime": 16662.2485, + "train_tokens_per_second": 115164.173 + }, + { + "epoch": 0.19859844692767661, + "grad_norm": 0.18460538983345032, + "learning_rate": 0.00466380376519249, + "loss": 3.261058044433594, + "num_input_tokens_seen": 1924136960, + "step": 3670, + "train_runtime": 16707.4259, + "train_tokens_per_second": 115166.572 + }, + { + "epoch": 0.19913958711004084, + "grad_norm": 0.17181532084941864, + "learning_rate": 0.004661733553291914, + "loss": 3.2611160278320312, + "num_input_tokens_seen": 1929379840, + "step": 3680, + "train_runtime": 16752.6562, + "train_tokens_per_second": 115168.593 + }, + { + "epoch": 0.1996807272924051, + "grad_norm": 0.19158703088760376, + "learning_rate": 0.004659657504956747, + "loss": 3.2646514892578127, + "num_input_tokens_seen": 1934622720, + "step": 3690, + "train_runtime": 16797.8998, + "train_tokens_per_second": 115170.512 + }, + { + "epoch": 0.20022186747476933, + "grad_norm": 0.18142655491828918, + "learning_rate": 0.004657575626525069, + "loss": 3.258336639404297, + "num_input_tokens_seen": 1939865600, + "step": 3700, + "train_runtime": 16843.1356, + "train_tokens_per_second": 115172.474 + }, + { + "epoch": 0.2007630076571336, + "grad_norm": 0.1888807713985443, + "learning_rate": 0.00465548792435276, + "loss": 3.256613922119141, + "num_input_tokens_seen": 1945108480, + "step": 3710, + "train_runtime": 16888.3732, + "train_tokens_per_second": 115174.414 + }, + { + "epoch": 0.20130414783949782, + "grad_norm": 0.17357957363128662, + "learning_rate": 0.004653394404813478, + "loss": 3.2642303466796876, + "num_input_tokens_seen": 1950351360, + "step": 3720, + "train_runtime": 16933.6199, + "train_tokens_per_second": 115176.281 + }, + { + "epoch": 0.20184528802186205, + "grad_norm": 0.17942315340042114, + "learning_rate": 0.004651295074298641, + "loss": 3.254298782348633, + "num_input_tokens_seen": 1955594240, + "step": 3730, + "train_runtime": 16978.845, + "train_tokens_per_second": 115178.284 + }, + { + "epoch": 0.2023864282042263, + "grad_norm": 0.17983372509479523, + "learning_rate": 0.00464918993921741, + "loss": 3.2564628601074217, + "num_input_tokens_seen": 1960837120, + "step": 3740, + "train_runtime": 17024.0569, + "train_tokens_per_second": 115180.367 + }, + { + "epoch": 0.20292756838659054, + "grad_norm": 0.19154661893844604, + "learning_rate": 0.004647079005996664, + "loss": 3.2626083374023436, + "num_input_tokens_seen": 1966080000, + "step": 3750, + "train_runtime": 17069.2756, + "train_tokens_per_second": 115182.392 + }, + { + "epoch": 0.2034687085689548, + "grad_norm": 0.16907712817192078, + "learning_rate": 0.0046449622810809865, + "loss": 3.2560802459716798, + "num_input_tokens_seen": 1971322880, + "step": 3760, + "train_runtime": 17114.512, + "train_tokens_per_second": 115184.288 + }, + { + "epoch": 0.20400984875131903, + "grad_norm": 0.1877511590719223, + "learning_rate": 0.004642839770932641, + "loss": 3.2611919403076173, + "num_input_tokens_seen": 1976565760, + "step": 3770, + "train_runtime": 17159.7356, + "train_tokens_per_second": 115186.26 + }, + { + "epoch": 0.20455098893368326, + "grad_norm": 0.1924838423728943, + "learning_rate": 0.004640711482031552, + "loss": 3.259069061279297, + "num_input_tokens_seen": 1981808640, + "step": 3780, + "train_runtime": 17204.9712, + "train_tokens_per_second": 115188.14 + }, + { + "epoch": 0.20509212911604752, + "grad_norm": 0.17791348695755005, + "learning_rate": 0.00463857742087529, + "loss": 3.2603363037109374, + "num_input_tokens_seen": 1987051520, + "step": 3790, + "train_runtime": 17250.1988, + "train_tokens_per_second": 115190.065 + }, + { + "epoch": 0.20563326929841175, + "grad_norm": 0.18873880803585052, + "learning_rate": 0.004636437593979043, + "loss": 3.260697937011719, + "num_input_tokens_seen": 1992294400, + "step": 3800, + "train_runtime": 17295.4319, + "train_tokens_per_second": 115191.943 + }, + { + "epoch": 0.206174409480776, + "grad_norm": 0.1765436977148056, + "learning_rate": 0.004634292007875606, + "loss": 3.25205078125, + "num_input_tokens_seen": 1997537280, + "step": 3810, + "train_runtime": 17340.6638, + "train_tokens_per_second": 115193.819 + }, + { + "epoch": 0.20671554966314024, + "grad_norm": 0.17367282509803772, + "learning_rate": 0.004632140669115353, + "loss": 3.2628250122070312, + "num_input_tokens_seen": 2002780160, + "step": 3820, + "train_runtime": 17390.4197, + "train_tokens_per_second": 115165.717 + }, + { + "epoch": 0.20725668984550447, + "grad_norm": 0.1866482049226761, + "learning_rate": 0.004629983584266224, + "loss": 3.255748748779297, + "num_input_tokens_seen": 2008023040, + "step": 3830, + "train_runtime": 17435.5785, + "train_tokens_per_second": 115168.134 + }, + { + "epoch": 0.20779783002786872, + "grad_norm": 0.18709656596183777, + "learning_rate": 0.004627820759913699, + "loss": 3.2663009643554686, + "num_input_tokens_seen": 2013265920, + "step": 3840, + "train_runtime": 17480.7483, + "train_tokens_per_second": 115170.465 + }, + { + "epoch": 0.20833897021023295, + "grad_norm": 0.19228561222553253, + "learning_rate": 0.0046256522026607814, + "loss": 3.2513301849365233, + "num_input_tokens_seen": 2018508800, + "step": 3850, + "train_runtime": 17525.9241, + "train_tokens_per_second": 115172.746 + }, + { + "epoch": 0.2088801103925972, + "grad_norm": 0.17255409061908722, + "learning_rate": 0.004623477919127976, + "loss": 3.243180847167969, + "num_input_tokens_seen": 2023751680, + "step": 3860, + "train_runtime": 17571.1102, + "train_tokens_per_second": 115174.947 + }, + { + "epoch": 0.20942125057496144, + "grad_norm": 0.176405668258667, + "learning_rate": 0.004621297915953271, + "loss": 3.2499061584472657, + "num_input_tokens_seen": 2028994560, + "step": 3870, + "train_runtime": 17616.3069, + "train_tokens_per_second": 115177.067 + }, + { + "epoch": 0.20996239075732567, + "grad_norm": 0.17931312322616577, + "learning_rate": 0.004619112199792115, + "loss": 3.263928985595703, + "num_input_tokens_seen": 2034237440, + "step": 3880, + "train_runtime": 17661.4804, + "train_tokens_per_second": 115179.328 + }, + { + "epoch": 0.21050353093968993, + "grad_norm": 0.1927679032087326, + "learning_rate": 0.004616920777317401, + "loss": 3.243641662597656, + "num_input_tokens_seen": 2039480320, + "step": 3890, + "train_runtime": 17706.6588, + "train_tokens_per_second": 115181.545 + }, + { + "epoch": 0.21104467112205416, + "grad_norm": 0.1834532767534256, + "learning_rate": 0.00461472365521944, + "loss": 3.2529090881347655, + "num_input_tokens_seen": 2044723200, + "step": 3900, + "train_runtime": 17751.8407, + "train_tokens_per_second": 115183.728 + }, + { + "epoch": 0.21158581130441842, + "grad_norm": 0.18204724788665771, + "learning_rate": 0.004612520840205942, + "loss": 3.252873992919922, + "num_input_tokens_seen": 2049966080, + "step": 3910, + "train_runtime": 17797.0199, + "train_tokens_per_second": 115185.918 + }, + { + "epoch": 0.21212695148678265, + "grad_norm": 0.17959408462047577, + "learning_rate": 0.0046103123390020045, + "loss": 3.2571083068847657, + "num_input_tokens_seen": 2055208960, + "step": 3920, + "train_runtime": 17842.2041, + "train_tokens_per_second": 115188.065 + }, + { + "epoch": 0.21266809166914688, + "grad_norm": 0.18271717429161072, + "learning_rate": 0.004608098158350076, + "loss": 3.2583240509033202, + "num_input_tokens_seen": 2060451840, + "step": 3930, + "train_runtime": 17887.3836, + "train_tokens_per_second": 115190.23 + }, + { + "epoch": 0.21320923185151114, + "grad_norm": 0.1708153486251831, + "learning_rate": 0.004605878305009951, + "loss": 3.2490577697753906, + "num_input_tokens_seen": 2065694720, + "step": 3940, + "train_runtime": 17932.5711, + "train_tokens_per_second": 115192.334 + }, + { + "epoch": 0.21375037203387537, + "grad_norm": 0.17891845107078552, + "learning_rate": 0.004603652785758739, + "loss": 3.253165435791016, + "num_input_tokens_seen": 2070937600, + "step": 3950, + "train_runtime": 17977.7786, + "train_tokens_per_second": 115194.299 + }, + { + "epoch": 0.21429151221623963, + "grad_norm": 0.19264911115169525, + "learning_rate": 0.0046014216073908465, + "loss": 3.252245330810547, + "num_input_tokens_seen": 2076180480, + "step": 3960, + "train_runtime": 18022.9578, + "train_tokens_per_second": 115196.434 + }, + { + "epoch": 0.21483265239860386, + "grad_norm": 0.17727237939834595, + "learning_rate": 0.00459918477671796, + "loss": 3.2557418823242186, + "num_input_tokens_seen": 2081423360, + "step": 3970, + "train_runtime": 18068.1382, + "train_tokens_per_second": 115198.552 + }, + { + "epoch": 0.2153737925809681, + "grad_norm": 0.18832355737686157, + "learning_rate": 0.00459694230056902, + "loss": 3.2547958374023436, + "num_input_tokens_seen": 2086666240, + "step": 3980, + "train_runtime": 18113.3099, + "train_tokens_per_second": 115200.714 + }, + { + "epoch": 0.21591493276333235, + "grad_norm": 0.1745108813047409, + "learning_rate": 0.004594694185790203, + "loss": 3.2427162170410155, + "num_input_tokens_seen": 2091909120, + "step": 3990, + "train_runtime": 18158.476, + "train_tokens_per_second": 115202.901 + }, + { + "epoch": 0.21645607294569658, + "grad_norm": 0.18806034326553345, + "learning_rate": 0.004592440439244901, + "loss": 3.247505950927734, + "num_input_tokens_seen": 2097152000, + "step": 4000, + "train_runtime": 18203.6569, + "train_tokens_per_second": 115204.984 + }, + { + "epoch": 0.21645607294569658, + "eval_loss": 3.1910831928253174, + "eval_runtime": 1.9924, + "eval_samples_per_second": 250.957, + "eval_steps_per_second": 4.015, + "num_input_tokens_seen": 2097152000, + "step": 4000 + }, + { + "epoch": 0.21699721312806083, + "grad_norm": 0.18182304501533508, + "learning_rate": 0.004590181067813696, + "loss": 3.2401611328125, + "num_input_tokens_seen": 2102394880, + "step": 4010, + "train_runtime": 18253.3021, + "train_tokens_per_second": 115178.879 + }, + { + "epoch": 0.21753835331042506, + "grad_norm": 0.19632118940353394, + "learning_rate": 0.004587916078394347, + "loss": 3.248242950439453, + "num_input_tokens_seen": 2107637760, + "step": 4020, + "train_runtime": 18298.4585, + "train_tokens_per_second": 115181.164 + }, + { + "epoch": 0.2180794934927893, + "grad_norm": 0.17511795461177826, + "learning_rate": 0.004585645477901763, + "loss": 3.2442108154296876, + "num_input_tokens_seen": 2112880640, + "step": 4030, + "train_runtime": 18343.616, + "train_tokens_per_second": 115183.432 + }, + { + "epoch": 0.21862063367515355, + "grad_norm": 0.18919962644577026, + "learning_rate": 0.004583369273267981, + "loss": 3.2474128723144533, + "num_input_tokens_seen": 2118123520, + "step": 4040, + "train_runtime": 18388.8196, + "train_tokens_per_second": 115185.399 + }, + { + "epoch": 0.21916177385751778, + "grad_norm": 0.1883443295955658, + "learning_rate": 0.00458108747144215, + "loss": 3.2397232055664062, + "num_input_tokens_seen": 2123366400, + "step": 4050, + "train_runtime": 18433.9903, + "train_tokens_per_second": 115187.562 + }, + { + "epoch": 0.21970291403988204, + "grad_norm": 0.16874343156814575, + "learning_rate": 0.004578800079390506, + "loss": 3.243609619140625, + "num_input_tokens_seen": 2128609280, + "step": 4060, + "train_runtime": 18479.1743, + "train_tokens_per_second": 115189.632 + }, + { + "epoch": 0.22024405422224627, + "grad_norm": 0.1780671924352646, + "learning_rate": 0.004576507104096353, + "loss": 3.249961090087891, + "num_input_tokens_seen": 2133852160, + "step": 4070, + "train_runtime": 18524.3424, + "train_tokens_per_second": 115191.79 + }, + { + "epoch": 0.2207851944046105, + "grad_norm": 0.1814623475074768, + "learning_rate": 0.0045742085525600365, + "loss": 3.247069549560547, + "num_input_tokens_seen": 2139095040, + "step": 4080, + "train_runtime": 18569.5288, + "train_tokens_per_second": 115193.825 + }, + { + "epoch": 0.22132633458697476, + "grad_norm": 0.18335077166557312, + "learning_rate": 0.004571904431798931, + "loss": 3.241147994995117, + "num_input_tokens_seen": 2144337920, + "step": 4090, + "train_runtime": 18614.7052, + "train_tokens_per_second": 115195.911 + }, + { + "epoch": 0.221867474769339, + "grad_norm": 0.16724441945552826, + "learning_rate": 0.004569594748847409, + "loss": 3.24347038269043, + "num_input_tokens_seen": 2149580800, + "step": 4100, + "train_runtime": 18659.889, + "train_tokens_per_second": 115197.942 + }, + { + "epoch": 0.22240861495170325, + "grad_norm": 0.17200608551502228, + "learning_rate": 0.004567279510756828, + "loss": 3.2341545104980467, + "num_input_tokens_seen": 2154823680, + "step": 4110, + "train_runtime": 18705.0717, + "train_tokens_per_second": 115199.969 + }, + { + "epoch": 0.22294975513406748, + "grad_norm": 0.178621307015419, + "learning_rate": 0.0045649587245955026, + "loss": 3.2321949005126953, + "num_input_tokens_seen": 2160066560, + "step": 4120, + "train_runtime": 18750.2407, + "train_tokens_per_second": 115202.071 + }, + { + "epoch": 0.2234908953164317, + "grad_norm": 0.18516632914543152, + "learning_rate": 0.0045626323974486864, + "loss": 3.238597869873047, + "num_input_tokens_seen": 2165309440, + "step": 4130, + "train_runtime": 18795.4162, + "train_tokens_per_second": 115204.123 + }, + { + "epoch": 0.22403203549879597, + "grad_norm": 0.20164692401885986, + "learning_rate": 0.004560300536418549, + "loss": 3.237165832519531, + "num_input_tokens_seen": 2170552320, + "step": 4140, + "train_runtime": 18840.5926, + "train_tokens_per_second": 115206.16 + }, + { + "epoch": 0.2245731756811602, + "grad_norm": 0.1872573047876358, + "learning_rate": 0.004557963148624155, + "loss": 3.2406959533691406, + "num_input_tokens_seen": 2175795200, + "step": 4150, + "train_runtime": 18885.7554, + "train_tokens_per_second": 115208.269 + }, + { + "epoch": 0.22511431586352446, + "grad_norm": 0.1811392605304718, + "learning_rate": 0.0045556202412014414, + "loss": 3.235840606689453, + "num_input_tokens_seen": 2181038080, + "step": 4160, + "train_runtime": 18930.9167, + "train_tokens_per_second": 115210.379 + }, + { + "epoch": 0.22565545604588869, + "grad_norm": 0.1595960259437561, + "learning_rate": 0.0045532718213031976, + "loss": 3.2397125244140623, + "num_input_tokens_seen": 2186280960, + "step": 4170, + "train_runtime": 18976.0748, + "train_tokens_per_second": 115212.497 + }, + { + "epoch": 0.22619659622825292, + "grad_norm": 0.16895633935928345, + "learning_rate": 0.00455091789609904, + "loss": 3.2353279113769533, + "num_input_tokens_seen": 2191523840, + "step": 4180, + "train_runtime": 19021.2321, + "train_tokens_per_second": 115214.61 + }, + { + "epoch": 0.22673773641061717, + "grad_norm": 0.17124150693416595, + "learning_rate": 0.004548558472775396, + "loss": 3.2387535095214846, + "num_input_tokens_seen": 2196766720, + "step": 4190, + "train_runtime": 19066.399, + "train_tokens_per_second": 115216.656 + }, + { + "epoch": 0.2272788765929814, + "grad_norm": 0.1730462908744812, + "learning_rate": 0.004546193558535476, + "loss": 3.228282165527344, + "num_input_tokens_seen": 2202009600, + "step": 4200, + "train_runtime": 19115.1785, + "train_tokens_per_second": 115196.916 + }, + { + "epoch": 0.22782001677534566, + "grad_norm": 0.2116994708776474, + "learning_rate": 0.004543823160599253, + "loss": 3.228871154785156, + "num_input_tokens_seen": 2207252480, + "step": 4210, + "train_runtime": 19160.343, + "train_tokens_per_second": 115199.007 + }, + { + "epoch": 0.2283611569577099, + "grad_norm": 0.1870228499174118, + "learning_rate": 0.004541447286203444, + "loss": 3.2268039703369142, + "num_input_tokens_seen": 2212495360, + "step": 4220, + "train_runtime": 19205.5057, + "train_tokens_per_second": 115201.099 + }, + { + "epoch": 0.22890229714007412, + "grad_norm": 0.18021926283836365, + "learning_rate": 0.004539065942601484, + "loss": 3.2385711669921875, + "num_input_tokens_seen": 2217738240, + "step": 4230, + "train_runtime": 19250.6698, + "train_tokens_per_second": 115203.173 + }, + { + "epoch": 0.22944343732243838, + "grad_norm": 0.178096741437912, + "learning_rate": 0.004536679137063506, + "loss": 3.2425048828125, + "num_input_tokens_seen": 2222981120, + "step": 4240, + "train_runtime": 19295.8409, + "train_tokens_per_second": 115205.195 + }, + { + "epoch": 0.2299845775048026, + "grad_norm": 0.17963331937789917, + "learning_rate": 0.004534286876876316, + "loss": 3.2272270202636717, + "num_input_tokens_seen": 2228224000, + "step": 4250, + "train_runtime": 19341.0226, + "train_tokens_per_second": 115207.145 + }, + { + "epoch": 0.23052571768716687, + "grad_norm": 0.1644730269908905, + "learning_rate": 0.004531889169343374, + "loss": 3.232299041748047, + "num_input_tokens_seen": 2233466880, + "step": 4260, + "train_runtime": 19386.1815, + "train_tokens_per_second": 115209.221 + }, + { + "epoch": 0.2310668578695311, + "grad_norm": 0.18202030658721924, + "learning_rate": 0.004529486021784774, + "loss": 3.232588195800781, + "num_input_tokens_seen": 2238709760, + "step": 4270, + "train_runtime": 19431.3552, + "train_tokens_per_second": 115211.201 + }, + { + "epoch": 0.23160799805189533, + "grad_norm": 0.1674603968858719, + "learning_rate": 0.004527077441537213, + "loss": 3.2268638610839844, + "num_input_tokens_seen": 2243952640, + "step": 4280, + "train_runtime": 19476.5366, + "train_tokens_per_second": 115213.125 + }, + { + "epoch": 0.2321491382342596, + "grad_norm": 0.17482970654964447, + "learning_rate": 0.004524663435953974, + "loss": 3.231060791015625, + "num_input_tokens_seen": 2249195520, + "step": 4290, + "train_runtime": 19521.6994, + "train_tokens_per_second": 115215.15 + }, + { + "epoch": 0.23269027841662382, + "grad_norm": 0.1693650186061859, + "learning_rate": 0.004522244012404908, + "loss": 3.2219474792480467, + "num_input_tokens_seen": 2254438400, + "step": 4300, + "train_runtime": 19566.837, + "train_tokens_per_second": 115217.314 + }, + { + "epoch": 0.23323141859898808, + "grad_norm": 0.16282694041728973, + "learning_rate": 0.004519819178276401, + "loss": 3.214075469970703, + "num_input_tokens_seen": 2259681280, + "step": 4310, + "train_runtime": 19611.992, + "train_tokens_per_second": 115219.366 + }, + { + "epoch": 0.2337725587813523, + "grad_norm": 0.17166836559772491, + "learning_rate": 0.004517388940971363, + "loss": 3.229071044921875, + "num_input_tokens_seen": 2264924160, + "step": 4320, + "train_runtime": 19657.1365, + "train_tokens_per_second": 115221.47 + }, + { + "epoch": 0.23431369896371654, + "grad_norm": 0.1811853051185608, + "learning_rate": 0.004514953307909195, + "loss": 3.2278045654296874, + "num_input_tokens_seen": 2270167040, + "step": 4330, + "train_runtime": 19702.2886, + "train_tokens_per_second": 115223.52 + }, + { + "epoch": 0.2348548391460808, + "grad_norm": 0.18831849098205566, + "learning_rate": 0.0045125122865257725, + "loss": 3.2335960388183596, + "num_input_tokens_seen": 2275409920, + "step": 4340, + "train_runtime": 19747.4554, + "train_tokens_per_second": 115225.475 + }, + { + "epoch": 0.23539597932844503, + "grad_norm": 0.18200556933879852, + "learning_rate": 0.004510065884273422, + "loss": 3.230799102783203, + "num_input_tokens_seen": 2280652800, + "step": 4350, + "train_runtime": 19792.6287, + "train_tokens_per_second": 115227.383 + }, + { + "epoch": 0.23593711951080928, + "grad_norm": 0.18054424226284027, + "learning_rate": 0.004507614108620896, + "loss": 3.2332107543945314, + "num_input_tokens_seen": 2285895680, + "step": 4360, + "train_runtime": 19837.7879, + "train_tokens_per_second": 115229.364 + }, + { + "epoch": 0.23647825969317351, + "grad_norm": 0.17672619223594666, + "learning_rate": 0.004505156967053355, + "loss": 3.229229736328125, + "num_input_tokens_seen": 2291138560, + "step": 4370, + "train_runtime": 19882.9214, + "train_tokens_per_second": 115231.485 + }, + { + "epoch": 0.23701939987553775, + "grad_norm": 0.18023458123207092, + "learning_rate": 0.004502694467072336, + "loss": 3.221567916870117, + "num_input_tokens_seen": 2296381440, + "step": 4380, + "train_runtime": 19928.0618, + "train_tokens_per_second": 115233.557 + }, + { + "epoch": 0.237560540057902, + "grad_norm": 0.18084236979484558, + "learning_rate": 0.0045002266161957415, + "loss": 3.2244552612304687, + "num_input_tokens_seen": 2301624320, + "step": 4390, + "train_runtime": 19973.1892, + "train_tokens_per_second": 115235.694 + }, + { + "epoch": 0.23810168024026623, + "grad_norm": 0.17632804811000824, + "learning_rate": 0.004497753421957804, + "loss": 3.2264179229736327, + "num_input_tokens_seen": 2306867200, + "step": 4400, + "train_runtime": 20018.2945, + "train_tokens_per_second": 115237.949 + }, + { + "epoch": 0.2386428204226305, + "grad_norm": 0.18496030569076538, + "learning_rate": 0.004495274891909074, + "loss": 3.2306861877441406, + "num_input_tokens_seen": 2312110080, + "step": 4410, + "train_runtime": 20063.4387, + "train_tokens_per_second": 115239.97 + }, + { + "epoch": 0.23918396060499472, + "grad_norm": 0.19217975437641144, + "learning_rate": 0.004492791033616388, + "loss": 3.2278289794921875, + "num_input_tokens_seen": 2317352960, + "step": 4420, + "train_runtime": 20108.5903, + "train_tokens_per_second": 115241.94 + }, + { + "epoch": 0.23972510078735895, + "grad_norm": 0.17978626489639282, + "learning_rate": 0.004490301854662851, + "loss": 3.222820281982422, + "num_input_tokens_seen": 2322595840, + "step": 4430, + "train_runtime": 20153.7635, + "train_tokens_per_second": 115243.778 + }, + { + "epoch": 0.2402662409697232, + "grad_norm": 0.1925116330385208, + "learning_rate": 0.0044878073626478145, + "loss": 3.216511535644531, + "num_input_tokens_seen": 2327838720, + "step": 4440, + "train_runtime": 20198.9336, + "train_tokens_per_second": 115245.625 + }, + { + "epoch": 0.24080738115208744, + "grad_norm": 0.1764633059501648, + "learning_rate": 0.004485307565186844, + "loss": 3.2247901916503907, + "num_input_tokens_seen": 2333081600, + "step": 4450, + "train_runtime": 20244.1177, + "train_tokens_per_second": 115247.383 + }, + { + "epoch": 0.2413485213344517, + "grad_norm": 0.18181581795215607, + "learning_rate": 0.0044828024699117095, + "loss": 3.2144775390625, + "num_input_tokens_seen": 2338324480, + "step": 4460, + "train_runtime": 20289.2915, + "train_tokens_per_second": 115249.193 + }, + { + "epoch": 0.24188966151681593, + "grad_norm": 0.1797225922346115, + "learning_rate": 0.0044802920844703486, + "loss": 3.2179054260253905, + "num_input_tokens_seen": 2343567360, + "step": 4470, + "train_runtime": 20334.4596, + "train_tokens_per_second": 115251.028 + }, + { + "epoch": 0.24243080169918016, + "grad_norm": 0.17131617665290833, + "learning_rate": 0.004477776416526856, + "loss": 3.2136348724365233, + "num_input_tokens_seen": 2348810240, + "step": 4480, + "train_runtime": 20379.6251, + "train_tokens_per_second": 115252.868 + }, + { + "epoch": 0.24297194188154442, + "grad_norm": 0.17490802705287933, + "learning_rate": 0.004475255473761447, + "loss": 3.223601531982422, + "num_input_tokens_seen": 2354053120, + "step": 4490, + "train_runtime": 20424.7932, + "train_tokens_per_second": 115254.686 + }, + { + "epoch": 0.24351308206390865, + "grad_norm": 0.18434712290763855, + "learning_rate": 0.004472729263870446, + "loss": 3.219706726074219, + "num_input_tokens_seen": 2359296000, + "step": 4500, + "train_runtime": 20469.9487, + "train_tokens_per_second": 115256.566 + }, + { + "epoch": 0.24351308206390865, + "eval_loss": 3.1681437492370605, + "eval_runtime": 1.9847, + "eval_samples_per_second": 251.921, + "eval_steps_per_second": 4.031, + "num_input_tokens_seen": 2359296000, + "step": 4500 + }, + { + "epoch": 0.2440542222462729, + "grad_norm": 0.18588702380657196, + "learning_rate": 0.0044701977945662535, + "loss": 3.2231178283691406, + "num_input_tokens_seen": 2364538880, + "step": 4510, + "train_runtime": 20517.077, + "train_tokens_per_second": 115247.356 + }, + { + "epoch": 0.24459536242863714, + "grad_norm": 0.1629338264465332, + "learning_rate": 0.004467661073577332, + "loss": 3.2203128814697264, + "num_input_tokens_seen": 2369781760, + "step": 4520, + "train_runtime": 20562.2414, + "train_tokens_per_second": 115249.195 + }, + { + "epoch": 0.24513650261100137, + "grad_norm": 0.19198022782802582, + "learning_rate": 0.00446511910864817, + "loss": 3.2169837951660156, + "num_input_tokens_seen": 2375024640, + "step": 4530, + "train_runtime": 20607.3939, + "train_tokens_per_second": 115251.092 + }, + { + "epoch": 0.24567764279336562, + "grad_norm": 0.17990648746490479, + "learning_rate": 0.004462571907539273, + "loss": 3.2237472534179688, + "num_input_tokens_seen": 2380267520, + "step": 4540, + "train_runtime": 20652.5476, + "train_tokens_per_second": 115252.974 + }, + { + "epoch": 0.24621878297572986, + "grad_norm": 0.17402444779872894, + "learning_rate": 0.004460019478027127, + "loss": 3.2200748443603517, + "num_input_tokens_seen": 2385510400, + "step": 4550, + "train_runtime": 20697.698, + "train_tokens_per_second": 115254.865 + }, + { + "epoch": 0.2467599231580941, + "grad_norm": 0.17017342150211334, + "learning_rate": 0.004457461827904183, + "loss": 3.2241039276123047, + "num_input_tokens_seen": 2390753280, + "step": 4560, + "train_runtime": 20742.8484, + "train_tokens_per_second": 115256.749 + }, + { + "epoch": 0.24730106334045834, + "grad_norm": 0.17307622730731964, + "learning_rate": 0.004454898964978828, + "loss": 3.2237174987792967, + "num_input_tokens_seen": 2395996160, + "step": 4570, + "train_runtime": 20788.0181, + "train_tokens_per_second": 115258.518 + }, + { + "epoch": 0.24784220352282257, + "grad_norm": 0.18426761031150818, + "learning_rate": 0.004452330897075365, + "loss": 3.2148464202880858, + "num_input_tokens_seen": 2401239040, + "step": 4580, + "train_runtime": 20836.8458, + "train_tokens_per_second": 115240.045 + }, + { + "epoch": 0.24838334370518683, + "grad_norm": 0.18935158848762512, + "learning_rate": 0.004449757632033987, + "loss": 3.2203189849853517, + "num_input_tokens_seen": 2406481920, + "step": 4590, + "train_runtime": 20882.0043, + "train_tokens_per_second": 115241.903 + }, + { + "epoch": 0.24892448388755106, + "grad_norm": 0.17916643619537354, + "learning_rate": 0.004447179177710755, + "loss": 3.220214080810547, + "num_input_tokens_seen": 2411724800, + "step": 4600, + "train_runtime": 20927.1721, + "train_tokens_per_second": 115243.703 + }, + { + "epoch": 0.24946562406991532, + "grad_norm": 0.1714351922273636, + "learning_rate": 0.0044445955419775696, + "loss": 3.2130130767822265, + "num_input_tokens_seen": 2416967680, + "step": 4610, + "train_runtime": 20972.3937, + "train_tokens_per_second": 115245.199 + }, + { + "epoch": 0.2500067642522796, + "grad_norm": 0.17399680614471436, + "learning_rate": 0.004442006732722152, + "loss": 3.2150115966796875, + "num_input_tokens_seen": 2422210560, + "step": 4620, + "train_runtime": 21017.5875, + "train_tokens_per_second": 115246.841 + }, + { + "epoch": 0.2505479044346438, + "grad_norm": 0.18769405782222748, + "learning_rate": 0.00443941275784802, + "loss": 3.2187454223632814, + "num_input_tokens_seen": 2427453440, + "step": 4630, + "train_runtime": 21062.785, + "train_tokens_per_second": 115248.456 + }, + { + "epoch": 0.25108904461700804, + "grad_norm": 0.1909925937652588, + "learning_rate": 0.004436813625274458, + "loss": 3.228108215332031, + "num_input_tokens_seen": 2432696320, + "step": 4640, + "train_runtime": 21107.9893, + "train_tokens_per_second": 115250.026 + }, + { + "epoch": 0.25163018479937227, + "grad_norm": 0.16732154786586761, + "learning_rate": 0.004434209342936497, + "loss": 3.213469314575195, + "num_input_tokens_seen": 2437939200, + "step": 4650, + "train_runtime": 21153.1981, + "train_tokens_per_second": 115251.566 + }, + { + "epoch": 0.2521713249817365, + "grad_norm": 0.17457814514636993, + "learning_rate": 0.0044315999187848915, + "loss": 3.224944305419922, + "num_input_tokens_seen": 2443182080, + "step": 4660, + "train_runtime": 21198.4016, + "train_tokens_per_second": 115253.127 + }, + { + "epoch": 0.2527124651641008, + "grad_norm": 0.192255899310112, + "learning_rate": 0.004428985360786096, + "loss": 3.227398681640625, + "num_input_tokens_seen": 2448424960, + "step": 4670, + "train_runtime": 21243.6232, + "train_tokens_per_second": 115254.584 + }, + { + "epoch": 0.253253605346465, + "grad_norm": 0.17784617841243744, + "learning_rate": 0.004426365676922234, + "loss": 3.2128623962402343, + "num_input_tokens_seen": 2453667840, + "step": 4680, + "train_runtime": 21288.8398, + "train_tokens_per_second": 115256.062 + }, + { + "epoch": 0.25379474552882925, + "grad_norm": 0.17195752263069153, + "learning_rate": 0.00442374087519108, + "loss": 3.2142982482910156, + "num_input_tokens_seen": 2458910720, + "step": 4690, + "train_runtime": 21334.0638, + "train_tokens_per_second": 115257.493 + }, + { + "epoch": 0.2543358857111935, + "grad_norm": 0.1722942292690277, + "learning_rate": 0.004421110963606032, + "loss": 3.210185241699219, + "num_input_tokens_seen": 2464153600, + "step": 4700, + "train_runtime": 21379.267, + "train_tokens_per_second": 115259.031 + }, + { + "epoch": 0.2548770258935577, + "grad_norm": 0.16966107487678528, + "learning_rate": 0.00441847595019609, + "loss": 3.2123428344726563, + "num_input_tokens_seen": 2469396480, + "step": 4710, + "train_runtime": 21424.483, + "train_tokens_per_second": 115260.493 + }, + { + "epoch": 0.255418166075922, + "grad_norm": 0.18033796548843384, + "learning_rate": 0.004415835843005828, + "loss": 3.2065505981445312, + "num_input_tokens_seen": 2474639360, + "step": 4720, + "train_runtime": 21469.6877, + "train_tokens_per_second": 115262.01 + }, + { + "epoch": 0.2559593062582862, + "grad_norm": 0.18272215127944946, + "learning_rate": 0.004413190650095373, + "loss": 3.2069171905517577, + "num_input_tokens_seen": 2479882240, + "step": 4730, + "train_runtime": 21514.9, + "train_tokens_per_second": 115263.48 + }, + { + "epoch": 0.25650044644065045, + "grad_norm": 0.17386901378631592, + "learning_rate": 0.004410540379540377, + "loss": 3.2177162170410156, + "num_input_tokens_seen": 2485125120, + "step": 4740, + "train_runtime": 21560.142, + "train_tokens_per_second": 115264.784 + }, + { + "epoch": 0.2570415866230147, + "grad_norm": 0.17471922934055328, + "learning_rate": 0.0044078850394319935, + "loss": 3.2096931457519533, + "num_input_tokens_seen": 2490368000, + "step": 4750, + "train_runtime": 21605.3695, + "train_tokens_per_second": 115266.161 + }, + { + "epoch": 0.2575827268053789, + "grad_norm": 0.188929483294487, + "learning_rate": 0.004405224637876854, + "loss": 3.215177536010742, + "num_input_tokens_seen": 2495610880, + "step": 4760, + "train_runtime": 21650.575, + "train_tokens_per_second": 115267.649 + }, + { + "epoch": 0.2581238669877432, + "grad_norm": 0.18201977014541626, + "learning_rate": 0.0044025591829970415, + "loss": 3.2025718688964844, + "num_input_tokens_seen": 2500853760, + "step": 4770, + "train_runtime": 21695.7857, + "train_tokens_per_second": 115269.103 + }, + { + "epoch": 0.25866500717010743, + "grad_norm": 0.18745562434196472, + "learning_rate": 0.004399888682930069, + "loss": 3.2124725341796876, + "num_input_tokens_seen": 2506096640, + "step": 4780, + "train_runtime": 21740.9904, + "train_tokens_per_second": 115270.583 + }, + { + "epoch": 0.25920614735247166, + "grad_norm": 0.18076451122760773, + "learning_rate": 0.004397213145828847, + "loss": 3.2005435943603517, + "num_input_tokens_seen": 2511339520, + "step": 4790, + "train_runtime": 21786.1967, + "train_tokens_per_second": 115272.049 + }, + { + "epoch": 0.2597472875348359, + "grad_norm": 0.16596098244190216, + "learning_rate": 0.004394532579861671, + "loss": 3.197236251831055, + "num_input_tokens_seen": 2516582400, + "step": 4800, + "train_runtime": 21831.4029, + "train_tokens_per_second": 115273.508 + }, + { + "epoch": 0.2602884277172001, + "grad_norm": 0.17128406465053558, + "learning_rate": 0.004391846993212182, + "loss": 3.2089080810546875, + "num_input_tokens_seen": 2521825280, + "step": 4810, + "train_runtime": 21876.6005, + "train_tokens_per_second": 115275.007 + }, + { + "epoch": 0.2608295678995644, + "grad_norm": 0.17832306027412415, + "learning_rate": 0.004389156394079355, + "loss": 3.202547073364258, + "num_input_tokens_seen": 2527068160, + "step": 4820, + "train_runtime": 21921.8037, + "train_tokens_per_second": 115276.471 + }, + { + "epoch": 0.26137070808192864, + "grad_norm": 0.16681405901908875, + "learning_rate": 0.004386460790677465, + "loss": 3.2106822967529296, + "num_input_tokens_seen": 2532311040, + "step": 4830, + "train_runtime": 21967.0048, + "train_tokens_per_second": 115277.939 + }, + { + "epoch": 0.26191184826429287, + "grad_norm": 0.17566899955272675, + "learning_rate": 0.004383760191236065, + "loss": 3.2070526123046874, + "num_input_tokens_seen": 2537553920, + "step": 4840, + "train_runtime": 22012.208, + "train_tokens_per_second": 115279.39 + }, + { + "epoch": 0.2624529884466571, + "grad_norm": 0.17574016749858856, + "learning_rate": 0.00438105460399996, + "loss": 3.203447723388672, + "num_input_tokens_seen": 2542796800, + "step": 4850, + "train_runtime": 22057.4092, + "train_tokens_per_second": 115280.846 + }, + { + "epoch": 0.26299412862902133, + "grad_norm": 0.16241556406021118, + "learning_rate": 0.004378344037229184, + "loss": 3.2026832580566404, + "num_input_tokens_seen": 2548039680, + "step": 4860, + "train_runtime": 22102.6211, + "train_tokens_per_second": 115282.24 + }, + { + "epoch": 0.2635352688113856, + "grad_norm": 0.1805507242679596, + "learning_rate": 0.004375628499198973, + "loss": 3.2010284423828126, + "num_input_tokens_seen": 2553282560, + "step": 4870, + "train_runtime": 22147.8116, + "train_tokens_per_second": 115283.74 + }, + { + "epoch": 0.26407640899374984, + "grad_norm": 0.16756032407283783, + "learning_rate": 0.004372907998199739, + "loss": 3.2070991516113283, + "num_input_tokens_seen": 2558525440, + "step": 4880, + "train_runtime": 22192.9705, + "train_tokens_per_second": 115285.398 + }, + { + "epoch": 0.2646175491761141, + "grad_norm": 0.18972600996494293, + "learning_rate": 0.004370182542537047, + "loss": 3.214699554443359, + "num_input_tokens_seen": 2563768320, + "step": 4890, + "train_runtime": 22238.1209, + "train_tokens_per_second": 115287.094 + }, + { + "epoch": 0.2651586893584783, + "grad_norm": 0.1896647959947586, + "learning_rate": 0.004367452140531587, + "loss": 3.205576705932617, + "num_input_tokens_seen": 2569011200, + "step": 4900, + "train_runtime": 22283.3129, + "train_tokens_per_second": 115288.566 + }, + { + "epoch": 0.26569982954084254, + "grad_norm": 0.18498484790325165, + "learning_rate": 0.004364716800519152, + "loss": 3.2080978393554687, + "num_input_tokens_seen": 2574254080, + "step": 4910, + "train_runtime": 22328.4859, + "train_tokens_per_second": 115290.132 + }, + { + "epoch": 0.2662409697232068, + "grad_norm": 0.1854403018951416, + "learning_rate": 0.0043619765308506074, + "loss": 3.203238677978516, + "num_input_tokens_seen": 2579496960, + "step": 4920, + "train_runtime": 22373.6522, + "train_tokens_per_second": 115291.725 + }, + { + "epoch": 0.26678210990557105, + "grad_norm": 0.1691334992647171, + "learning_rate": 0.004359231339891872, + "loss": 3.1914302825927736, + "num_input_tokens_seen": 2584739840, + "step": 4930, + "train_runtime": 22418.8106, + "train_tokens_per_second": 115293.353 + }, + { + "epoch": 0.2673232500879353, + "grad_norm": 0.17332448065280914, + "learning_rate": 0.004356481236023887, + "loss": 3.2087932586669923, + "num_input_tokens_seen": 2589982720, + "step": 4940, + "train_runtime": 22463.9738, + "train_tokens_per_second": 115294.949 + }, + { + "epoch": 0.2678643902702995, + "grad_norm": 0.1679113507270813, + "learning_rate": 0.004353726227642593, + "loss": 3.2014122009277344, + "num_input_tokens_seen": 2595225600, + "step": 4950, + "train_runtime": 22509.1287, + "train_tokens_per_second": 115296.582 + }, + { + "epoch": 0.26840553045266374, + "grad_norm": 0.16913928091526031, + "learning_rate": 0.004350966323158903, + "loss": 3.1890819549560545, + "num_input_tokens_seen": 2600468480, + "step": 4960, + "train_runtime": 22554.2873, + "train_tokens_per_second": 115298.189 + }, + { + "epoch": 0.26894667063502803, + "grad_norm": 0.16906581819057465, + "learning_rate": 0.00434820153099868, + "loss": 3.202825927734375, + "num_input_tokens_seen": 2605711360, + "step": 4970, + "train_runtime": 22602.9949, + "train_tokens_per_second": 115281.686 + }, + { + "epoch": 0.26948781081739226, + "grad_norm": 0.16878265142440796, + "learning_rate": 0.004345431859602706, + "loss": 3.200624465942383, + "num_input_tokens_seen": 2610954240, + "step": 4980, + "train_runtime": 22648.1981, + "train_tokens_per_second": 115283.089 + }, + { + "epoch": 0.2700289509997565, + "grad_norm": 0.1862846463918686, + "learning_rate": 0.004342657317426662, + "loss": 3.206439971923828, + "num_input_tokens_seen": 2616197120, + "step": 4990, + "train_runtime": 22693.3935, + "train_tokens_per_second": 115284.526 + }, + { + "epoch": 0.2705700911821207, + "grad_norm": 0.16954657435417175, + "learning_rate": 0.004339877912941097, + "loss": 3.199533462524414, + "num_input_tokens_seen": 2621440000, + "step": 5000, + "train_runtime": 22738.6005, + "train_tokens_per_second": 115285.899 + }, + { + "epoch": 0.2705700911821207, + "eval_loss": 3.146559715270996, + "eval_runtime": 1.9859, + "eval_samples_per_second": 251.773, + "eval_steps_per_second": 4.028, + "num_input_tokens_seen": 2621440000, + "step": 5000 + }, + { + "epoch": 0.27111123136448495, + "grad_norm": 0.1746288388967514, + "learning_rate": 0.004337093654631402, + "loss": 3.195170593261719, + "num_input_tokens_seen": 2626682880, + "step": 5010, + "train_runtime": 22788.1861, + "train_tokens_per_second": 115265.114 + }, + { + "epoch": 0.27165237154684924, + "grad_norm": 0.182390496134758, + "learning_rate": 0.004334304550997793, + "loss": 3.184975433349609, + "num_input_tokens_seen": 2631925760, + "step": 5020, + "train_runtime": 22833.4175, + "train_tokens_per_second": 115266.397 + }, + { + "epoch": 0.27219351172921347, + "grad_norm": 0.18103830516338348, + "learning_rate": 0.004331510610555275, + "loss": 3.190489959716797, + "num_input_tokens_seen": 2637168640, + "step": 5030, + "train_runtime": 22878.6263, + "train_tokens_per_second": 115267.788 + }, + { + "epoch": 0.2727346519115777, + "grad_norm": 0.1782936155796051, + "learning_rate": 0.004328711841833618, + "loss": 3.196137237548828, + "num_input_tokens_seen": 2642411520, + "step": 5040, + "train_runtime": 22923.8218, + "train_tokens_per_second": 115269.24 + }, + { + "epoch": 0.2732757920939419, + "grad_norm": 0.185542032122612, + "learning_rate": 0.0043259082533773354, + "loss": 3.190313720703125, + "num_input_tokens_seen": 2647654400, + "step": 5050, + "train_runtime": 22969.0255, + "train_tokens_per_second": 115270.646 + }, + { + "epoch": 0.27381693227630616, + "grad_norm": 0.16143307089805603, + "learning_rate": 0.0043230998537456536, + "loss": 3.2025264739990233, + "num_input_tokens_seen": 2652897280, + "step": 5060, + "train_runtime": 23014.1965, + "train_tokens_per_second": 115272.209 + }, + { + "epoch": 0.27435807245867044, + "grad_norm": 0.16813282668590546, + "learning_rate": 0.004320286651512486, + "loss": 3.1958364486694335, + "num_input_tokens_seen": 2658140160, + "step": 5070, + "train_runtime": 23059.3886, + "train_tokens_per_second": 115273.662 + }, + { + "epoch": 0.2748992126410347, + "grad_norm": 0.18112315237522125, + "learning_rate": 0.004317468655266412, + "loss": 3.194669723510742, + "num_input_tokens_seen": 2663383040, + "step": 5080, + "train_runtime": 23104.5863, + "train_tokens_per_second": 115275.08 + }, + { + "epoch": 0.2754403528233989, + "grad_norm": 0.18187521398067474, + "learning_rate": 0.004314645873610643, + "loss": 3.1878196716308596, + "num_input_tokens_seen": 2668625920, + "step": 5090, + "train_runtime": 23149.7951, + "train_tokens_per_second": 115276.438 + }, + { + "epoch": 0.27598149300576313, + "grad_norm": 0.16804195940494537, + "learning_rate": 0.004311818315163001, + "loss": 3.2023330688476563, + "num_input_tokens_seen": 2673868800, + "step": 5100, + "train_runtime": 23194.9829, + "train_tokens_per_second": 115277.895 + }, + { + "epoch": 0.27652263318812736, + "grad_norm": 0.16169899702072144, + "learning_rate": 0.004308985988555892, + "loss": 3.195353889465332, + "num_input_tokens_seen": 2679111680, + "step": 5110, + "train_runtime": 23240.1686, + "train_tokens_per_second": 115279.356 + }, + { + "epoch": 0.27706377337049165, + "grad_norm": 0.1690625697374344, + "learning_rate": 0.004306148902436281, + "loss": 3.1894439697265624, + "num_input_tokens_seen": 2684354560, + "step": 5120, + "train_runtime": 23285.3699, + "train_tokens_per_second": 115280.735 + }, + { + "epoch": 0.2776049135528559, + "grad_norm": 0.1880822330713272, + "learning_rate": 0.00430330706546566, + "loss": 3.1982452392578127, + "num_input_tokens_seen": 2689597440, + "step": 5130, + "train_runtime": 23330.5682, + "train_tokens_per_second": 115282.123 + }, + { + "epoch": 0.2781460537352201, + "grad_norm": 0.1790066808462143, + "learning_rate": 0.004300460486320026, + "loss": 3.1980308532714843, + "num_input_tokens_seen": 2694840320, + "step": 5140, + "train_runtime": 23375.7756, + "train_tokens_per_second": 115283.461 + }, + { + "epoch": 0.27868719391758434, + "grad_norm": 0.16601233184337616, + "learning_rate": 0.004297609173689855, + "loss": 3.197714996337891, + "num_input_tokens_seen": 2700083200, + "step": 5150, + "train_runtime": 23420.9835, + "train_tokens_per_second": 115284.792 + }, + { + "epoch": 0.27922833409994857, + "grad_norm": 0.16672180593013763, + "learning_rate": 0.0042947531362800715, + "loss": 3.1988187789916993, + "num_input_tokens_seen": 2705326080, + "step": 5160, + "train_runtime": 23466.1808, + "train_tokens_per_second": 115286.169 + }, + { + "epoch": 0.27976947428231286, + "grad_norm": 0.19832877814769745, + "learning_rate": 0.00429189238281003, + "loss": 3.1931121826171873, + "num_input_tokens_seen": 2710568960, + "step": 5170, + "train_runtime": 23511.3921, + "train_tokens_per_second": 115287.472 + }, + { + "epoch": 0.2803106144646771, + "grad_norm": 0.1923927664756775, + "learning_rate": 0.004289026922013475, + "loss": 3.1957611083984374, + "num_input_tokens_seen": 2715811840, + "step": 5180, + "train_runtime": 23556.5915, + "train_tokens_per_second": 115288.829 + }, + { + "epoch": 0.2808517546470413, + "grad_norm": 0.17779354751110077, + "learning_rate": 0.00428615676263853, + "loss": 3.181416702270508, + "num_input_tokens_seen": 2721054720, + "step": 5190, + "train_runtime": 23601.8043, + "train_tokens_per_second": 115290.115 + }, + { + "epoch": 0.28139289482940555, + "grad_norm": 0.16895556449890137, + "learning_rate": 0.004283281913447657, + "loss": 3.1839942932128906, + "num_input_tokens_seen": 2726297600, + "step": 5200, + "train_runtime": 23647.0206, + "train_tokens_per_second": 115291.379 + }, + { + "epoch": 0.2819340350117698, + "grad_norm": 0.17021089792251587, + "learning_rate": 0.004280402383217639, + "loss": 3.193735122680664, + "num_input_tokens_seen": 2731540480, + "step": 5210, + "train_runtime": 23692.2429, + "train_tokens_per_second": 115292.608 + }, + { + "epoch": 0.28247517519413406, + "grad_norm": 0.16943930089473724, + "learning_rate": 0.00427751818073955, + "loss": 3.1817481994628904, + "num_input_tokens_seen": 2736783360, + "step": 5220, + "train_runtime": 23737.4424, + "train_tokens_per_second": 115293.944 + }, + { + "epoch": 0.2830163153764983, + "grad_norm": 0.15319055318832397, + "learning_rate": 0.004274629314818728, + "loss": 3.1803112030029297, + "num_input_tokens_seen": 2742026240, + "step": 5230, + "train_runtime": 23782.6783, + "train_tokens_per_second": 115295.099 + }, + { + "epoch": 0.2835574555588625, + "grad_norm": 0.1702287793159485, + "learning_rate": 0.004271735794274746, + "loss": 3.1876094818115233, + "num_input_tokens_seen": 2747269120, + "step": 5240, + "train_runtime": 23827.881, + "train_tokens_per_second": 115296.409 + }, + { + "epoch": 0.28409859574122676, + "grad_norm": 0.18369406461715698, + "learning_rate": 0.00426883762794139, + "loss": 3.1819345474243166, + "num_input_tokens_seen": 2752512000, + "step": 5250, + "train_runtime": 23873.0945, + "train_tokens_per_second": 115297.663 + }, + { + "epoch": 0.284639735923591, + "grad_norm": 0.1792212277650833, + "learning_rate": 0.004265934824666628, + "loss": 3.1884128570556642, + "num_input_tokens_seen": 2757754880, + "step": 5260, + "train_runtime": 23918.3193, + "train_tokens_per_second": 115298.857 + }, + { + "epoch": 0.28518087610595527, + "grad_norm": 0.17727087438106537, + "learning_rate": 0.0042630273933125865, + "loss": 3.194817543029785, + "num_input_tokens_seen": 2762997760, + "step": 5270, + "train_runtime": 23963.5296, + "train_tokens_per_second": 115300.117 + }, + { + "epoch": 0.2857220162883195, + "grad_norm": 0.15836812555789948, + "learning_rate": 0.004260115342755518, + "loss": 3.1808521270751955, + "num_input_tokens_seen": 2768240640, + "step": 5280, + "train_runtime": 24008.7153, + "train_tokens_per_second": 115301.49 + }, + { + "epoch": 0.28626315647068373, + "grad_norm": 0.17416301369667053, + "learning_rate": 0.00425719868188578, + "loss": 3.1919151306152345, + "num_input_tokens_seen": 2773483520, + "step": 5290, + "train_runtime": 24053.9251, + "train_tokens_per_second": 115302.742 + }, + { + "epoch": 0.28680429665304796, + "grad_norm": 0.16871845722198486, + "learning_rate": 0.004254277419607802, + "loss": 3.182635498046875, + "num_input_tokens_seen": 2778726400, + "step": 5300, + "train_runtime": 24099.1331, + "train_tokens_per_second": 115303.998 + }, + { + "epoch": 0.2873454368354122, + "grad_norm": 0.1787181943655014, + "learning_rate": 0.004251351564840067, + "loss": 3.18890495300293, + "num_input_tokens_seen": 2783969280, + "step": 5310, + "train_runtime": 24144.3426, + "train_tokens_per_second": 115305.243 + }, + { + "epoch": 0.2878865770177765, + "grad_norm": 0.1912972331047058, + "learning_rate": 0.00424842112651507, + "loss": 3.1834373474121094, + "num_input_tokens_seen": 2789212160, + "step": 5320, + "train_runtime": 24189.5491, + "train_tokens_per_second": 115306.496 + }, + { + "epoch": 0.2884277172001407, + "grad_norm": 0.16879980266094208, + "learning_rate": 0.004245486113579308, + "loss": 3.1814502716064452, + "num_input_tokens_seen": 2794455040, + "step": 5330, + "train_runtime": 24234.754, + "train_tokens_per_second": 115307.753 + }, + { + "epoch": 0.28896885738250494, + "grad_norm": 0.17917132377624512, + "learning_rate": 0.00424254653499324, + "loss": 3.188125228881836, + "num_input_tokens_seen": 2799697920, + "step": 5340, + "train_runtime": 24279.9641, + "train_tokens_per_second": 115308.981 + }, + { + "epoch": 0.28950999756486917, + "grad_norm": 0.17311090230941772, + "learning_rate": 0.004239602399731263, + "loss": 3.1844112396240236, + "num_input_tokens_seen": 2804940800, + "step": 5350, + "train_runtime": 24328.7709, + "train_tokens_per_second": 115293.157 + }, + { + "epoch": 0.2900511377472334, + "grad_norm": 0.17229342460632324, + "learning_rate": 0.004236653716781689, + "loss": 3.185770797729492, + "num_input_tokens_seen": 2810183680, + "step": 5360, + "train_runtime": 24373.9674, + "train_tokens_per_second": 115294.471 + }, + { + "epoch": 0.2905922779295977, + "grad_norm": 0.180856391787529, + "learning_rate": 0.0042337004951467075, + "loss": 3.1889812469482424, + "num_input_tokens_seen": 2815426560, + "step": 5370, + "train_runtime": 24419.1733, + "train_tokens_per_second": 115295.736 + }, + { + "epoch": 0.2911334181119619, + "grad_norm": 0.16839343309402466, + "learning_rate": 0.004230742743842371, + "loss": 3.1733203887939454, + "num_input_tokens_seen": 2820669440, + "step": 5380, + "train_runtime": 24464.3893, + "train_tokens_per_second": 115296.949 + }, + { + "epoch": 0.29167455829432615, + "grad_norm": 0.16889749467372894, + "learning_rate": 0.004227780471898559, + "loss": 3.1818462371826173, + "num_input_tokens_seen": 2825912320, + "step": 5390, + "train_runtime": 24509.5858, + "train_tokens_per_second": 115298.249 + }, + { + "epoch": 0.2922156984766904, + "grad_norm": 0.17744433879852295, + "learning_rate": 0.004224813688358949, + "loss": 3.1864446640014648, + "num_input_tokens_seen": 2831155200, + "step": 5400, + "train_runtime": 24554.7949, + "train_tokens_per_second": 115299.485 + }, + { + "epoch": 0.2927568386590546, + "grad_norm": 0.1737280935049057, + "learning_rate": 0.004221842402280996, + "loss": 3.180088424682617, + "num_input_tokens_seen": 2836398080, + "step": 5410, + "train_runtime": 24599.9993, + "train_tokens_per_second": 115300.738 + }, + { + "epoch": 0.2932979788414189, + "grad_norm": 0.16631047427654266, + "learning_rate": 0.004218866622735898, + "loss": 3.175667572021484, + "num_input_tokens_seen": 2841640960, + "step": 5420, + "train_runtime": 24645.2212, + "train_tokens_per_second": 115301.905 + }, + { + "epoch": 0.2938391190237831, + "grad_norm": 0.17272046208381653, + "learning_rate": 0.004215886358808577, + "loss": 3.185796546936035, + "num_input_tokens_seen": 2846883840, + "step": 5430, + "train_runtime": 24690.432, + "train_tokens_per_second": 115303.12 + }, + { + "epoch": 0.29438025920614735, + "grad_norm": 0.1690651923418045, + "learning_rate": 0.004212901619597638, + "loss": 3.1886520385742188, + "num_input_tokens_seen": 2852126720, + "step": 5440, + "train_runtime": 24735.6453, + "train_tokens_per_second": 115304.318 + }, + { + "epoch": 0.2949213993885116, + "grad_norm": 0.19146323204040527, + "learning_rate": 0.0042099124142153535, + "loss": 3.1789478302001952, + "num_input_tokens_seen": 2857369600, + "step": 5450, + "train_runtime": 24780.8456, + "train_tokens_per_second": 115305.573 + }, + { + "epoch": 0.2954625395708758, + "grad_norm": 0.1788649708032608, + "learning_rate": 0.00420691875178763, + "loss": 3.1887844085693358, + "num_input_tokens_seen": 2862612480, + "step": 5460, + "train_runtime": 24826.0467, + "train_tokens_per_second": 115306.819 + }, + { + "epoch": 0.2960036797532401, + "grad_norm": 0.19091546535491943, + "learning_rate": 0.004203920641453982, + "loss": 3.175608253479004, + "num_input_tokens_seen": 2867855360, + "step": 5470, + "train_runtime": 24871.2591, + "train_tokens_per_second": 115308.009 + }, + { + "epoch": 0.29654481993560433, + "grad_norm": 0.16818441450595856, + "learning_rate": 0.004200918092367501, + "loss": 3.1859344482421874, + "num_input_tokens_seen": 2873098240, + "step": 5480, + "train_runtime": 24916.485, + "train_tokens_per_second": 115309.131 + }, + { + "epoch": 0.29708596011796856, + "grad_norm": 0.1913134902715683, + "learning_rate": 0.0041979111136948325, + "loss": 3.1723804473876953, + "num_input_tokens_seen": 2878341120, + "step": 5490, + "train_runtime": 24961.6704, + "train_tokens_per_second": 115310.437 + }, + { + "epoch": 0.2976271003003328, + "grad_norm": 0.18261617422103882, + "learning_rate": 0.004194899714616144, + "loss": 3.179214286804199, + "num_input_tokens_seen": 2883584000, + "step": 5500, + "train_runtime": 25006.8704, + "train_tokens_per_second": 115311.67 + }, + { + "epoch": 0.2976271003003328, + "eval_loss": 3.126129388809204, + "eval_runtime": 1.9962, + "eval_samples_per_second": 250.471, + "eval_steps_per_second": 4.008, + "num_input_tokens_seen": 2883584000, + "step": 5500 + }, + { + "epoch": 0.298168240482697, + "grad_norm": 0.18416427075862885, + "learning_rate": 0.004191883904325097, + "loss": 3.1846160888671875, + "num_input_tokens_seen": 2888826880, + "step": 5510, + "train_runtime": 25054.1224, + "train_tokens_per_second": 115303.455 + }, + { + "epoch": 0.2987093806650613, + "grad_norm": 0.16038469970226288, + "learning_rate": 0.004188863692028823, + "loss": 3.180740737915039, + "num_input_tokens_seen": 2894069760, + "step": 5520, + "train_runtime": 25099.3557, + "train_tokens_per_second": 115304.544 + }, + { + "epoch": 0.29925052084742554, + "grad_norm": 0.16605685651302338, + "learning_rate": 0.004185839086947891, + "loss": 3.1796802520751952, + "num_input_tokens_seen": 2899312640, + "step": 5530, + "train_runtime": 25144.6135, + "train_tokens_per_second": 115305.516 + }, + { + "epoch": 0.29979166102978977, + "grad_norm": 0.1819118857383728, + "learning_rate": 0.004182810098316281, + "loss": 3.1764299392700197, + "num_input_tokens_seen": 2904555520, + "step": 5540, + "train_runtime": 25189.8702, + "train_tokens_per_second": 115306.49 + }, + { + "epoch": 0.300332801212154, + "grad_norm": 0.1876569390296936, + "learning_rate": 0.004179776735381355, + "loss": 3.18255500793457, + "num_input_tokens_seen": 2909798400, + "step": 5550, + "train_runtime": 25235.1612, + "train_tokens_per_second": 115307.304 + }, + { + "epoch": 0.30087394139451823, + "grad_norm": 0.1661430448293686, + "learning_rate": 0.004176739007403832, + "loss": 3.172201156616211, + "num_input_tokens_seen": 2915041280, + "step": 5560, + "train_runtime": 25280.4455, + "train_tokens_per_second": 115308.145 + }, + { + "epoch": 0.3014150815768825, + "grad_norm": 0.17655618488788605, + "learning_rate": 0.004173696923657755, + "loss": 3.17954158782959, + "num_input_tokens_seen": 2920284160, + "step": 5570, + "train_runtime": 25325.7247, + "train_tokens_per_second": 115309.007 + }, + { + "epoch": 0.30195622175924675, + "grad_norm": 0.17908194661140442, + "learning_rate": 0.0041706504934304655, + "loss": 3.1723983764648436, + "num_input_tokens_seen": 2925527040, + "step": 5580, + "train_runtime": 25370.9962, + "train_tokens_per_second": 115309.9 + }, + { + "epoch": 0.302497361941611, + "grad_norm": 0.17515423893928528, + "learning_rate": 0.004167599726022575, + "loss": 3.183238220214844, + "num_input_tokens_seen": 2930769920, + "step": 5590, + "train_runtime": 25416.2839, + "train_tokens_per_second": 115310.717 + }, + { + "epoch": 0.3030385021239752, + "grad_norm": 0.1749441921710968, + "learning_rate": 0.004164544630747937, + "loss": 3.185963821411133, + "num_input_tokens_seen": 2936012800, + "step": 5600, + "train_runtime": 25461.5455, + "train_tokens_per_second": 115311.649 + }, + { + "epoch": 0.30357964230633944, + "grad_norm": 0.1578006148338318, + "learning_rate": 0.004161485216933615, + "loss": 3.177383041381836, + "num_input_tokens_seen": 2941255680, + "step": 5610, + "train_runtime": 25506.8309, + "train_tokens_per_second": 115312.47 + }, + { + "epoch": 0.3041207824887037, + "grad_norm": 0.1903323382139206, + "learning_rate": 0.00415842149391986, + "loss": 3.179554748535156, + "num_input_tokens_seen": 2946498560, + "step": 5620, + "train_runtime": 25552.099, + "train_tokens_per_second": 115313.367 + }, + { + "epoch": 0.30466192267106795, + "grad_norm": 0.16383005678653717, + "learning_rate": 0.004155353471060077, + "loss": 3.160336494445801, + "num_input_tokens_seen": 2951741440, + "step": 5630, + "train_runtime": 25597.3865, + "train_tokens_per_second": 115314.172 + }, + { + "epoch": 0.3052030628534322, + "grad_norm": 0.1735740303993225, + "learning_rate": 0.004152281157720798, + "loss": 3.172795867919922, + "num_input_tokens_seen": 2956984320, + "step": 5640, + "train_runtime": 25642.6481, + "train_tokens_per_second": 115315.092 + }, + { + "epoch": 0.3057442030357964, + "grad_norm": 0.19910795986652374, + "learning_rate": 0.004149204563281657, + "loss": 3.1711971282958986, + "num_input_tokens_seen": 2962227200, + "step": 5650, + "train_runtime": 25687.9012, + "train_tokens_per_second": 115316.046 + }, + { + "epoch": 0.30628534321816064, + "grad_norm": 0.18566472828388214, + "learning_rate": 0.004146123697135352, + "loss": 3.177423095703125, + "num_input_tokens_seen": 2967470080, + "step": 5660, + "train_runtime": 25733.1722, + "train_tokens_per_second": 115316.917 + }, + { + "epoch": 0.30682648340052493, + "grad_norm": 0.16724054515361786, + "learning_rate": 0.004143038568687626, + "loss": 3.174397277832031, + "num_input_tokens_seen": 2972712960, + "step": 5670, + "train_runtime": 25778.4366, + "train_tokens_per_second": 115317.814 + }, + { + "epoch": 0.30736762358288916, + "grad_norm": 0.18052591383457184, + "learning_rate": 0.004139949187357236, + "loss": 3.172323226928711, + "num_input_tokens_seen": 2977955840, + "step": 5680, + "train_runtime": 25823.6944, + "train_tokens_per_second": 115318.738 + }, + { + "epoch": 0.3079087637652534, + "grad_norm": 0.1707129180431366, + "learning_rate": 0.004136855562575921, + "loss": 3.1627834320068358, + "num_input_tokens_seen": 2983198720, + "step": 5690, + "train_runtime": 25868.9566, + "train_tokens_per_second": 115319.638 + }, + { + "epoch": 0.3084499039476176, + "grad_norm": 0.18003937602043152, + "learning_rate": 0.004133757703788374, + "loss": 3.175765609741211, + "num_input_tokens_seen": 2988441600, + "step": 5700, + "train_runtime": 25914.2132, + "train_tokens_per_second": 115320.561 + }, + { + "epoch": 0.30899104412998185, + "grad_norm": 0.17585334181785583, + "learning_rate": 0.004130655620452215, + "loss": 3.1611761093139648, + "num_input_tokens_seen": 2993684480, + "step": 5710, + "train_runtime": 25959.4637, + "train_tokens_per_second": 115321.507 + }, + { + "epoch": 0.30953218431234614, + "grad_norm": 0.17584700882434845, + "learning_rate": 0.004127549322037963, + "loss": 3.1710134506225587, + "num_input_tokens_seen": 2998927360, + "step": 5720, + "train_runtime": 26004.7204, + "train_tokens_per_second": 115322.423 + }, + { + "epoch": 0.31007332449471037, + "grad_norm": 0.1671862006187439, + "learning_rate": 0.004124438818029003, + "loss": 3.171963691711426, + "num_input_tokens_seen": 3004170240, + "step": 5730, + "train_runtime": 26053.4016, + "train_tokens_per_second": 115308.177 + }, + { + "epoch": 0.3106144646770746, + "grad_norm": 0.17120610177516937, + "learning_rate": 0.004121324117921561, + "loss": 3.171039581298828, + "num_input_tokens_seen": 3009413120, + "step": 5740, + "train_runtime": 26098.5909, + "train_tokens_per_second": 115309.41 + }, + { + "epoch": 0.31115560485943883, + "grad_norm": 0.17267778515815735, + "learning_rate": 0.004118205231224675, + "loss": 3.1711191177368163, + "num_input_tokens_seen": 3014656000, + "step": 5750, + "train_runtime": 26143.7653, + "train_tokens_per_second": 115310.705 + }, + { + "epoch": 0.31169674504180306, + "grad_norm": 0.17473942041397095, + "learning_rate": 0.004115082167460159, + "loss": 3.1646095275878907, + "num_input_tokens_seen": 3019898880, + "step": 5760, + "train_runtime": 26188.9631, + "train_tokens_per_second": 115311.892 + }, + { + "epoch": 0.31223788522416734, + "grad_norm": 0.17137861251831055, + "learning_rate": 0.004111954936162586, + "loss": 3.1746740341186523, + "num_input_tokens_seen": 3025141760, + "step": 5770, + "train_runtime": 26234.3115, + "train_tokens_per_second": 115312.413 + }, + { + "epoch": 0.3127790254065316, + "grad_norm": 0.16885042190551758, + "learning_rate": 0.004108823546879249, + "loss": 3.162841033935547, + "num_input_tokens_seen": 3030384640, + "step": 5780, + "train_runtime": 26279.5704, + "train_tokens_per_second": 115313.325 + }, + { + "epoch": 0.3133201655888958, + "grad_norm": 0.15897022187709808, + "learning_rate": 0.004105688009170134, + "loss": 3.1719465255737305, + "num_input_tokens_seen": 3035627520, + "step": 5790, + "train_runtime": 26324.8012, + "train_tokens_per_second": 115314.357 + }, + { + "epoch": 0.31386130577126004, + "grad_norm": 0.1866680085659027, + "learning_rate": 0.004102548332607894, + "loss": 3.1683422088623048, + "num_input_tokens_seen": 3040870400, + "step": 5800, + "train_runtime": 26370.0195, + "train_tokens_per_second": 115315.44 + }, + { + "epoch": 0.31440244595362427, + "grad_norm": 0.18191276490688324, + "learning_rate": 0.004099404526777816, + "loss": 3.1652973175048826, + "num_input_tokens_seen": 3046113280, + "step": 5810, + "train_runtime": 26415.2343, + "train_tokens_per_second": 115316.535 + }, + { + "epoch": 0.31494358613598855, + "grad_norm": 0.16286683082580566, + "learning_rate": 0.004096256601277797, + "loss": 3.1653570175170898, + "num_input_tokens_seen": 3051356160, + "step": 5820, + "train_runtime": 26460.4377, + "train_tokens_per_second": 115317.675 + }, + { + "epoch": 0.3154847263183528, + "grad_norm": 0.15786544978618622, + "learning_rate": 0.004093104565718307, + "loss": 3.171334457397461, + "num_input_tokens_seen": 3056599040, + "step": 5830, + "train_runtime": 26505.6409, + "train_tokens_per_second": 115318.813 + }, + { + "epoch": 0.316025866500717, + "grad_norm": 0.16940993070602417, + "learning_rate": 0.0040899484297223666, + "loss": 3.16903076171875, + "num_input_tokens_seen": 3061841920, + "step": 5840, + "train_runtime": 26550.8652, + "train_tokens_per_second": 115319.855 + }, + { + "epoch": 0.31656700668308124, + "grad_norm": 0.1778353452682495, + "learning_rate": 0.004086788202925512, + "loss": 3.163807678222656, + "num_input_tokens_seen": 3067084800, + "step": 5850, + "train_runtime": 26596.0801, + "train_tokens_per_second": 115320.934 + }, + { + "epoch": 0.3171081468654455, + "grad_norm": 0.18578499555587769, + "learning_rate": 0.004083623894975773, + "loss": 3.1687942504882813, + "num_input_tokens_seen": 3072327680, + "step": 5860, + "train_runtime": 26641.289, + "train_tokens_per_second": 115322.036 + }, + { + "epoch": 0.31764928704780976, + "grad_norm": 0.17534473538398743, + "learning_rate": 0.004080455515533633, + "loss": 3.1645458221435545, + "num_input_tokens_seen": 3077570560, + "step": 5870, + "train_runtime": 26686.5065, + "train_tokens_per_second": 115323.096 + }, + { + "epoch": 0.318190427230174, + "grad_norm": 0.16227850317955017, + "learning_rate": 0.004077283074272012, + "loss": 3.1695529937744142, + "num_input_tokens_seen": 3082813440, + "step": 5880, + "train_runtime": 26731.6901, + "train_tokens_per_second": 115324.3 + }, + { + "epoch": 0.3187315674125382, + "grad_norm": 0.17972981929779053, + "learning_rate": 0.004074106580876226, + "loss": 3.164577102661133, + "num_input_tokens_seen": 3088056320, + "step": 5890, + "train_runtime": 26776.8465, + "train_tokens_per_second": 115325.616 + }, + { + "epoch": 0.31927270759490245, + "grad_norm": 0.17186778783798218, + "learning_rate": 0.0040709260450439615, + "loss": 3.168431854248047, + "num_input_tokens_seen": 3093299200, + "step": 5900, + "train_runtime": 26822.0301, + "train_tokens_per_second": 115326.811 + }, + { + "epoch": 0.3198138477772667, + "grad_norm": 0.16803112626075745, + "learning_rate": 0.0040677414764852485, + "loss": 3.1673011779785156, + "num_input_tokens_seen": 3098542080, + "step": 5910, + "train_runtime": 26867.197, + "train_tokens_per_second": 115328.074 + }, + { + "epoch": 0.32035498795963097, + "grad_norm": 0.16622225940227509, + "learning_rate": 0.00406455288492243, + "loss": 3.156739616394043, + "num_input_tokens_seen": 3103784960, + "step": 5920, + "train_runtime": 26912.3879, + "train_tokens_per_second": 115329.229 + }, + { + "epoch": 0.3208961281419952, + "grad_norm": 0.18976053595542908, + "learning_rate": 0.004061360280090129, + "loss": 3.166844940185547, + "num_input_tokens_seen": 3109027840, + "step": 5930, + "train_runtime": 26957.5834, + "train_tokens_per_second": 115330.361 + }, + { + "epoch": 0.3214372683243594, + "grad_norm": 0.16867531836032867, + "learning_rate": 0.00405816367173522, + "loss": 3.1626731872558596, + "num_input_tokens_seen": 3114270720, + "step": 5940, + "train_runtime": 27002.7901, + "train_tokens_per_second": 115331.442 + }, + { + "epoch": 0.32197840850672366, + "grad_norm": 0.20071354508399963, + "learning_rate": 0.004054963069616803, + "loss": 3.169915199279785, + "num_input_tokens_seen": 3119513600, + "step": 5950, + "train_runtime": 27047.9883, + "train_tokens_per_second": 115332.555 + }, + { + "epoch": 0.3225195486890879, + "grad_norm": 0.16498495638370514, + "learning_rate": 0.0040517584835061664, + "loss": 3.1712413787841798, + "num_input_tokens_seen": 3124756480, + "step": 5960, + "train_runtime": 27093.2042, + "train_tokens_per_second": 115333.589 + }, + { + "epoch": 0.3230606888714522, + "grad_norm": 0.17592206597328186, + "learning_rate": 0.004048549923186767, + "loss": 3.1624687194824217, + "num_input_tokens_seen": 3129999360, + "step": 5970, + "train_runtime": 27138.4223, + "train_tokens_per_second": 115334.61 + }, + { + "epoch": 0.3236018290538164, + "grad_norm": 0.15470415353775024, + "learning_rate": 0.00404533739845419, + "loss": 3.155242347717285, + "num_input_tokens_seen": 3135242240, + "step": 5980, + "train_runtime": 27183.6528, + "train_tokens_per_second": 115335.575 + }, + { + "epoch": 0.32414296923618063, + "grad_norm": 0.16501109302043915, + "learning_rate": 0.004042120919116126, + "loss": 3.1598865509033205, + "num_input_tokens_seen": 3140485120, + "step": 5990, + "train_runtime": 27228.8867, + "train_tokens_per_second": 115336.523 + }, + { + "epoch": 0.32468410941854486, + "grad_norm": 0.16781945526599884, + "learning_rate": 0.004038900494992339, + "loss": 3.157525634765625, + "num_input_tokens_seen": 3145728000, + "step": 6000, + "train_runtime": 27274.108, + "train_tokens_per_second": 115337.521 + }, + { + "epoch": 0.32468410941854486, + "eval_loss": 3.111185073852539, + "eval_runtime": 1.9872, + "eval_samples_per_second": 251.614, + "eval_steps_per_second": 4.026, + "num_input_tokens_seen": 3145728000, + "step": 6000 + }, + { + "epoch": 0.3252252496009091, + "grad_norm": 0.18414868414402008, + "learning_rate": 0.004035676135914636, + "loss": 3.170181655883789, + "num_input_tokens_seen": 3150970880, + "step": 6010, + "train_runtime": 27323.9049, + "train_tokens_per_second": 115319.201 + }, + { + "epoch": 0.3257663897832734, + "grad_norm": 0.1616990864276886, + "learning_rate": 0.004032447851726835, + "loss": 3.1585414886474608, + "num_input_tokens_seen": 3156213760, + "step": 6020, + "train_runtime": 27369.1149, + "train_tokens_per_second": 115320.272 + }, + { + "epoch": 0.3263075299656376, + "grad_norm": 0.16582000255584717, + "learning_rate": 0.004029215652284741, + "loss": 3.1622276306152344, + "num_input_tokens_seen": 3161456640, + "step": 6030, + "train_runtime": 27414.3296, + "train_tokens_per_second": 115321.319 + }, + { + "epoch": 0.32684867014800184, + "grad_norm": 0.17380478978157043, + "learning_rate": 0.00402597954745611, + "loss": 3.1608341217041014, + "num_input_tokens_seen": 3166699520, + "step": 6040, + "train_runtime": 27459.5638, + "train_tokens_per_second": 115322.281 + }, + { + "epoch": 0.32738981033036607, + "grad_norm": 0.18764927983283997, + "learning_rate": 0.00402273954712062, + "loss": 3.1758914947509767, + "num_input_tokens_seen": 3171942400, + "step": 6050, + "train_runtime": 27504.7658, + "train_tokens_per_second": 115323.374 + }, + { + "epoch": 0.3279309505127303, + "grad_norm": 0.1659294068813324, + "learning_rate": 0.004019495661169844, + "loss": 3.1681026458740233, + "num_input_tokens_seen": 3177185280, + "step": 6060, + "train_runtime": 27549.978, + "train_tokens_per_second": 115324.422 + }, + { + "epoch": 0.3284720906950946, + "grad_norm": 0.15407103300094604, + "learning_rate": 0.004016247899507217, + "loss": 3.1617177963256835, + "num_input_tokens_seen": 3182428160, + "step": 6070, + "train_runtime": 27595.2039, + "train_tokens_per_second": 115325.409 + }, + { + "epoch": 0.3290132308774588, + "grad_norm": 0.17896398901939392, + "learning_rate": 0.004012996272048004, + "loss": 3.163351631164551, + "num_input_tokens_seen": 3187671040, + "step": 6080, + "train_runtime": 27640.4032, + "train_tokens_per_second": 115326.503 + }, + { + "epoch": 0.32955437105982305, + "grad_norm": 0.17344997823238373, + "learning_rate": 0.004009740788719276, + "loss": 3.153501510620117, + "num_input_tokens_seen": 3192913920, + "step": 6090, + "train_runtime": 27685.6168, + "train_tokens_per_second": 115327.534 + }, + { + "epoch": 0.3300955112421873, + "grad_norm": 0.16279980540275574, + "learning_rate": 0.004006481459459872, + "loss": 3.160162162780762, + "num_input_tokens_seen": 3198156800, + "step": 6100, + "train_runtime": 27730.837, + "train_tokens_per_second": 115328.535 + }, + { + "epoch": 0.3306366514245515, + "grad_norm": 0.16896025836467743, + "learning_rate": 0.0040032182942203775, + "loss": 3.158255767822266, + "num_input_tokens_seen": 3203399680, + "step": 6110, + "train_runtime": 27779.614, + "train_tokens_per_second": 115314.766 + }, + { + "epoch": 0.3311777916069158, + "grad_norm": 0.18168415129184723, + "learning_rate": 0.003999951302963083, + "loss": 3.156180000305176, + "num_input_tokens_seen": 3208642560, + "step": 6120, + "train_runtime": 27824.7398, + "train_tokens_per_second": 115316.175 + }, + { + "epoch": 0.33171893178928, + "grad_norm": 0.17479564249515533, + "learning_rate": 0.003996680495661963, + "loss": 3.155413818359375, + "num_input_tokens_seen": 3213885440, + "step": 6130, + "train_runtime": 27869.8597, + "train_tokens_per_second": 115317.604 + }, + { + "epoch": 0.33226007197164426, + "grad_norm": 0.16649708151817322, + "learning_rate": 0.003993405882302642, + "loss": 3.162016677856445, + "num_input_tokens_seen": 3219128320, + "step": 6140, + "train_runtime": 27914.9889, + "train_tokens_per_second": 115318.99 + }, + { + "epoch": 0.3328012121540085, + "grad_norm": 0.17866738140583038, + "learning_rate": 0.003990127472882364, + "loss": 3.1546072006225585, + "num_input_tokens_seen": 3224371200, + "step": 6150, + "train_runtime": 27960.113, + "train_tokens_per_second": 115320.392 + }, + { + "epoch": 0.3333423523363727, + "grad_norm": 0.15289874374866486, + "learning_rate": 0.0039868452774099615, + "loss": 3.1471332550048827, + "num_input_tokens_seen": 3229614080, + "step": 6160, + "train_runtime": 28005.2392, + "train_tokens_per_second": 115321.782 + }, + { + "epoch": 0.333883492518737, + "grad_norm": 0.16488930583000183, + "learning_rate": 0.003983559305905828, + "loss": 3.1540958404541017, + "num_input_tokens_seen": 3234856960, + "step": 6170, + "train_runtime": 28050.3655, + "train_tokens_per_second": 115323.166 + }, + { + "epoch": 0.33442463270110123, + "grad_norm": 0.17347821593284607, + "learning_rate": 0.003980269568401881, + "loss": 3.153203010559082, + "num_input_tokens_seen": 3240099840, + "step": 6180, + "train_runtime": 28095.5018, + "train_tokens_per_second": 115324.505 + }, + { + "epoch": 0.33496577288346546, + "grad_norm": 0.16901230812072754, + "learning_rate": 0.00397697607494154, + "loss": 3.153574752807617, + "num_input_tokens_seen": 3245342720, + "step": 6190, + "train_runtime": 28140.6287, + "train_tokens_per_second": 115325.878 + }, + { + "epoch": 0.3355069130658297, + "grad_norm": 0.1725231409072876, + "learning_rate": 0.0039736788355796875, + "loss": 3.1607025146484373, + "num_input_tokens_seen": 3250585600, + "step": 6200, + "train_runtime": 28185.7568, + "train_tokens_per_second": 115327.242 + }, + { + "epoch": 0.3360480532481939, + "grad_norm": 0.17325183749198914, + "learning_rate": 0.003970377860382644, + "loss": 3.147405242919922, + "num_input_tokens_seen": 3255828480, + "step": 6210, + "train_runtime": 28230.8843, + "train_tokens_per_second": 115328.604 + }, + { + "epoch": 0.3365891934305582, + "grad_norm": 0.1715451180934906, + "learning_rate": 0.003967073159428135, + "loss": 3.150386428833008, + "num_input_tokens_seen": 3261071360, + "step": 6220, + "train_runtime": 28276.018, + "train_tokens_per_second": 115329.936 + }, + { + "epoch": 0.33713033361292244, + "grad_norm": 0.16657474637031555, + "learning_rate": 0.003963764742805262, + "loss": 3.1559564590454103, + "num_input_tokens_seen": 3266314240, + "step": 6230, + "train_runtime": 28321.1527, + "train_tokens_per_second": 115331.261 + }, + { + "epoch": 0.33767147379528667, + "grad_norm": 0.17756827175617218, + "learning_rate": 0.003960452620614465, + "loss": 3.1532052993774413, + "num_input_tokens_seen": 3271557120, + "step": 6240, + "train_runtime": 28366.2774, + "train_tokens_per_second": 115332.621 + }, + { + "epoch": 0.3382126139776509, + "grad_norm": 0.16704502701759338, + "learning_rate": 0.003957136802967503, + "loss": 3.145302581787109, + "num_input_tokens_seen": 3276800000, + "step": 6250, + "train_runtime": 28411.4119, + "train_tokens_per_second": 115333.937 + }, + { + "epoch": 0.33875375416001513, + "grad_norm": 0.16609609127044678, + "learning_rate": 0.003953817299987416, + "loss": 3.157614898681641, + "num_input_tokens_seen": 3282042880, + "step": 6260, + "train_runtime": 28456.5404, + "train_tokens_per_second": 115335.274 + }, + { + "epoch": 0.3392948943423794, + "grad_norm": 0.17776867747306824, + "learning_rate": 0.003950494121808493, + "loss": 3.1511157989501952, + "num_input_tokens_seen": 3287285760, + "step": 6270, + "train_runtime": 28501.6688, + "train_tokens_per_second": 115336.607 + }, + { + "epoch": 0.33983603452474365, + "grad_norm": 0.16619160771369934, + "learning_rate": 0.003947167278576242, + "loss": 3.1576236724853515, + "num_input_tokens_seen": 3292528640, + "step": 6280, + "train_runtime": 28546.8015, + "train_tokens_per_second": 115337.917 + }, + { + "epoch": 0.3403771747071079, + "grad_norm": 0.17923958599567413, + "learning_rate": 0.003943836780447365, + "loss": 3.1528648376464843, + "num_input_tokens_seen": 3297771520, + "step": 6290, + "train_runtime": 28591.9231, + "train_tokens_per_second": 115339.269 + }, + { + "epoch": 0.3409183148894721, + "grad_norm": 0.16474676132202148, + "learning_rate": 0.003940502637589718, + "loss": 3.1509103775024414, + "num_input_tokens_seen": 3303014400, + "step": 6300, + "train_runtime": 28637.0641, + "train_tokens_per_second": 115340.539 + }, + { + "epoch": 0.34145945507183634, + "grad_norm": 0.1639336794614792, + "learning_rate": 0.0039371648601822865, + "loss": 3.155986785888672, + "num_input_tokens_seen": 3308257280, + "step": 6310, + "train_runtime": 28682.1884, + "train_tokens_per_second": 115341.871 + }, + { + "epoch": 0.3420005952542006, + "grad_norm": 0.17124713957309723, + "learning_rate": 0.003933823458415151, + "loss": 3.147997283935547, + "num_input_tokens_seen": 3313500160, + "step": 6320, + "train_runtime": 28727.3095, + "train_tokens_per_second": 115343.212 + }, + { + "epoch": 0.34254173543656485, + "grad_norm": 0.17230060696601868, + "learning_rate": 0.003930478442489458, + "loss": 3.1527957916259766, + "num_input_tokens_seen": 3318743040, + "step": 6330, + "train_runtime": 28772.44, + "train_tokens_per_second": 115344.512 + }, + { + "epoch": 0.3430828756189291, + "grad_norm": 0.1681806594133377, + "learning_rate": 0.003927129822617386, + "loss": 3.1512054443359374, + "num_input_tokens_seen": 3323985920, + "step": 6340, + "train_runtime": 28817.6293, + "train_tokens_per_second": 115345.572 + }, + { + "epoch": 0.3436240158012933, + "grad_norm": 0.17435091733932495, + "learning_rate": 0.003923777609022119, + "loss": 3.153603744506836, + "num_input_tokens_seen": 3329228800, + "step": 6350, + "train_runtime": 28862.8169, + "train_tokens_per_second": 115346.635 + }, + { + "epoch": 0.34416515598365754, + "grad_norm": 0.1703469306230545, + "learning_rate": 0.00392042181193781, + "loss": 3.142818069458008, + "num_input_tokens_seen": 3334471680, + "step": 6360, + "train_runtime": 28907.9957, + "train_tokens_per_second": 115347.73 + }, + { + "epoch": 0.34470629616602183, + "grad_norm": 0.1682499647140503, + "learning_rate": 0.0039170624416095525, + "loss": 3.1417423248291017, + "num_input_tokens_seen": 3339714560, + "step": 6370, + "train_runtime": 28953.1644, + "train_tokens_per_second": 115348.862 + }, + { + "epoch": 0.34524743634838606, + "grad_norm": 0.16802842915058136, + "learning_rate": 0.0039136995082933515, + "loss": 3.1456912994384765, + "num_input_tokens_seen": 3344957440, + "step": 6380, + "train_runtime": 28998.3264, + "train_tokens_per_second": 115350.017 + }, + { + "epoch": 0.3457885765307503, + "grad_norm": 0.1582358479499817, + "learning_rate": 0.003910333022256086, + "loss": 3.1438793182373046, + "num_input_tokens_seen": 3350200320, + "step": 6390, + "train_runtime": 29043.4985, + "train_tokens_per_second": 115351.128 + }, + { + "epoch": 0.3463297167131145, + "grad_norm": 0.16883233189582825, + "learning_rate": 0.003906962993775483, + "loss": 3.1468482971191407, + "num_input_tokens_seen": 3355443200, + "step": 6400, + "train_runtime": 29088.66, + "train_tokens_per_second": 115352.278 + }, + { + "epoch": 0.34687085689547875, + "grad_norm": 0.18867318332195282, + "learning_rate": 0.0039035894331400853, + "loss": 3.147420883178711, + "num_input_tokens_seen": 3360686080, + "step": 6410, + "train_runtime": 29133.8253, + "train_tokens_per_second": 115353.409 + }, + { + "epoch": 0.34741199707784304, + "grad_norm": 0.16323506832122803, + "learning_rate": 0.0039002123506492177, + "loss": 3.145482063293457, + "num_input_tokens_seen": 3365928960, + "step": 6420, + "train_runtime": 29179.0336, + "train_tokens_per_second": 115354.367 + }, + { + "epoch": 0.34795313726020727, + "grad_norm": 0.1756802797317505, + "learning_rate": 0.003896831756612958, + "loss": 3.1475906372070312, + "num_input_tokens_seen": 3371171840, + "step": 6430, + "train_runtime": 29224.2308, + "train_tokens_per_second": 115355.366 + }, + { + "epoch": 0.3484942774425715, + "grad_norm": 0.17158783972263336, + "learning_rate": 0.0038934476613521037, + "loss": 3.142435073852539, + "num_input_tokens_seen": 3376414720, + "step": 6440, + "train_runtime": 29269.4011, + "train_tokens_per_second": 115356.467 + }, + { + "epoch": 0.34903541762493573, + "grad_norm": 0.16574952006340027, + "learning_rate": 0.0038900600751981436, + "loss": 3.1459327697753907, + "num_input_tokens_seen": 3381657600, + "step": 6450, + "train_runtime": 29314.5687, + "train_tokens_per_second": 115357.577 + }, + { + "epoch": 0.34957655780729996, + "grad_norm": 0.16016115248203278, + "learning_rate": 0.0038866690084932206, + "loss": 3.1540714263916017, + "num_input_tokens_seen": 3386900480, + "step": 6460, + "train_runtime": 29359.7508, + "train_tokens_per_second": 115358.625 + }, + { + "epoch": 0.35011769798966424, + "grad_norm": 0.1590614914894104, + "learning_rate": 0.0038832744715901063, + "loss": 3.138327789306641, + "num_input_tokens_seen": 3392143360, + "step": 6470, + "train_runtime": 29404.9917, + "train_tokens_per_second": 115359.439 + }, + { + "epoch": 0.3506588381720285, + "grad_norm": 0.1668478101491928, + "learning_rate": 0.003879876474852164, + "loss": 3.1390443801879884, + "num_input_tokens_seen": 3397386240, + "step": 6480, + "train_runtime": 29450.2102, + "train_tokens_per_second": 115360.339 + }, + { + "epoch": 0.3511999783543927, + "grad_norm": 0.16614961624145508, + "learning_rate": 0.0038764750286533244, + "loss": 3.1493562698364257, + "num_input_tokens_seen": 3402629120, + "step": 6490, + "train_runtime": 29498.9151, + "train_tokens_per_second": 115347.602 + }, + { + "epoch": 0.35174111853675694, + "grad_norm": 0.1770559698343277, + "learning_rate": 0.003873070143378044, + "loss": 3.1434371948242186, + "num_input_tokens_seen": 3407872000, + "step": 6500, + "train_runtime": 29544.0364, + "train_tokens_per_second": 115348.896 + }, + { + "epoch": 0.35174111853675694, + "eval_loss": 3.0966169834136963, + "eval_runtime": 1.9851, + "eval_samples_per_second": 251.881, + "eval_steps_per_second": 4.03, + "num_input_tokens_seen": 3407872000, + "step": 6500 + }, + { + "epoch": 0.35228225871912117, + "grad_norm": 0.1724107414484024, + "learning_rate": 0.0038696618294212816, + "loss": 3.1477359771728515, + "num_input_tokens_seen": 3413114880, + "step": 6510, + "train_runtime": 29591.1684, + "train_tokens_per_second": 115342.349 + }, + { + "epoch": 0.35282339890148545, + "grad_norm": 0.17597156763076782, + "learning_rate": 0.0038662500971884633, + "loss": 3.1492542266845702, + "num_input_tokens_seen": 3418357760, + "step": 6520, + "train_runtime": 29636.3254, + "train_tokens_per_second": 115343.509 + }, + { + "epoch": 0.3533645390838497, + "grad_norm": 0.1612919569015503, + "learning_rate": 0.0038628349570954497, + "loss": 3.1426467895507812, + "num_input_tokens_seen": 3423600640, + "step": 6530, + "train_runtime": 29681.4655, + "train_tokens_per_second": 115344.73 + }, + { + "epoch": 0.3539056792662139, + "grad_norm": 0.16101430356502533, + "learning_rate": 0.0038594164195685076, + "loss": 3.137646484375, + "num_input_tokens_seen": 3428843520, + "step": 6540, + "train_runtime": 29726.6035, + "train_tokens_per_second": 115345.957 + }, + { + "epoch": 0.35444681944857814, + "grad_norm": 0.17293353378772736, + "learning_rate": 0.003855994495044273, + "loss": 3.1470672607421877, + "num_input_tokens_seen": 3434086400, + "step": 6550, + "train_runtime": 29771.7425, + "train_tokens_per_second": 115347.175 + }, + { + "epoch": 0.3549879596309424, + "grad_norm": 0.18171222507953644, + "learning_rate": 0.0038525691939697267, + "loss": 3.1423971176147463, + "num_input_tokens_seen": 3439329280, + "step": 6560, + "train_runtime": 29816.873, + "train_tokens_per_second": 115348.423 + }, + { + "epoch": 0.35552909981330666, + "grad_norm": 0.17078061401844025, + "learning_rate": 0.0038491405268021523, + "loss": 3.1396827697753906, + "num_input_tokens_seen": 3444572160, + "step": 6570, + "train_runtime": 29862.0878, + "train_tokens_per_second": 115349.341 + }, + { + "epoch": 0.3560702399956709, + "grad_norm": 0.17867809534072876, + "learning_rate": 0.0038457085040091155, + "loss": 3.1499147415161133, + "num_input_tokens_seen": 3449815040, + "step": 6580, + "train_runtime": 29907.2427, + "train_tokens_per_second": 115350.488 + }, + { + "epoch": 0.3566113801780351, + "grad_norm": 0.15178236365318298, + "learning_rate": 0.003842273136068423, + "loss": 3.13470344543457, + "num_input_tokens_seen": 3455057920, + "step": 6590, + "train_runtime": 29952.42, + "train_tokens_per_second": 115351.545 + }, + { + "epoch": 0.35715252036039935, + "grad_norm": 0.17382913827896118, + "learning_rate": 0.0038388344334680936, + "loss": 3.1436153411865235, + "num_input_tokens_seen": 3460300800, + "step": 6600, + "train_runtime": 29997.6461, + "train_tokens_per_second": 115352.411 + }, + { + "epoch": 0.3576936605427636, + "grad_norm": 0.17544035613536835, + "learning_rate": 0.0038353924067063313, + "loss": 3.1381744384765624, + "num_input_tokens_seen": 3465543680, + "step": 6610, + "train_runtime": 30042.8233, + "train_tokens_per_second": 115353.462 + }, + { + "epoch": 0.35823480072512787, + "grad_norm": 0.15095841884613037, + "learning_rate": 0.003831947066291482, + "loss": 3.1344669342041014, + "num_input_tokens_seen": 3470786560, + "step": 6620, + "train_runtime": 30088.0009, + "train_tokens_per_second": 115354.509 + }, + { + "epoch": 0.3587759409074921, + "grad_norm": 0.16399560868740082, + "learning_rate": 0.0038284984227420146, + "loss": 3.134235382080078, + "num_input_tokens_seen": 3476029440, + "step": 6630, + "train_runtime": 30133.1894, + "train_tokens_per_second": 115355.51 + }, + { + "epoch": 0.3593170810898563, + "grad_norm": 0.18398840725421906, + "learning_rate": 0.003825046486586477, + "loss": 3.131580924987793, + "num_input_tokens_seen": 3481272320, + "step": 6640, + "train_runtime": 30178.3732, + "train_tokens_per_second": 115356.527 + }, + { + "epoch": 0.35985822127222056, + "grad_norm": 0.16813096404075623, + "learning_rate": 0.0038215912683634726, + "loss": 3.1448497772216797, + "num_input_tokens_seen": 3486515200, + "step": 6650, + "train_runtime": 30223.5423, + "train_tokens_per_second": 115357.596 + }, + { + "epoch": 0.3603993614545848, + "grad_norm": 0.1649860441684723, + "learning_rate": 0.003818132778621623, + "loss": 3.14077091217041, + "num_input_tokens_seen": 3491758080, + "step": 6660, + "train_runtime": 30268.7194, + "train_tokens_per_second": 115358.633 + }, + { + "epoch": 0.3609405016369491, + "grad_norm": 0.17575252056121826, + "learning_rate": 0.0038146710279195386, + "loss": 3.1330080032348633, + "num_input_tokens_seen": 3497000960, + "step": 6670, + "train_runtime": 30313.9788, + "train_tokens_per_second": 115359.352 + }, + { + "epoch": 0.3614816418193133, + "grad_norm": 0.1742008924484253, + "learning_rate": 0.003811206026825786, + "loss": 3.155079460144043, + "num_input_tokens_seen": 3502243840, + "step": 6680, + "train_runtime": 30359.1553, + "train_tokens_per_second": 115360.385 + }, + { + "epoch": 0.36202278200167753, + "grad_norm": 0.1799112856388092, + "learning_rate": 0.0038077377859188524, + "loss": 3.1288970947265624, + "num_input_tokens_seen": 3507486720, + "step": 6690, + "train_runtime": 30404.3262, + "train_tokens_per_second": 115361.436 + }, + { + "epoch": 0.36256392218404176, + "grad_norm": 0.16728277504444122, + "learning_rate": 0.003804266315787119, + "loss": 3.137259864807129, + "num_input_tokens_seen": 3512729600, + "step": 6700, + "train_runtime": 30449.5017, + "train_tokens_per_second": 115362.466 + }, + { + "epoch": 0.363105062366406, + "grad_norm": 0.1766940951347351, + "learning_rate": 0.0038007916270288234, + "loss": 3.1414379119873046, + "num_input_tokens_seen": 3517972480, + "step": 6710, + "train_runtime": 30494.6728, + "train_tokens_per_second": 115363.51 + }, + { + "epoch": 0.3636462025487703, + "grad_norm": 0.17950496077537537, + "learning_rate": 0.0037973137302520312, + "loss": 3.141128730773926, + "num_input_tokens_seen": 3523215360, + "step": 6720, + "train_runtime": 30539.8417, + "train_tokens_per_second": 115364.559 + }, + { + "epoch": 0.3641873427311345, + "grad_norm": 0.17668098211288452, + "learning_rate": 0.003793832636074601, + "loss": 3.1354911804199217, + "num_input_tokens_seen": 3528458240, + "step": 6730, + "train_runtime": 30585.0013, + "train_tokens_per_second": 115365.64 + }, + { + "epoch": 0.36472848291349874, + "grad_norm": 0.17323218286037445, + "learning_rate": 0.0037903483551241534, + "loss": 3.1416683197021484, + "num_input_tokens_seen": 3533701120, + "step": 6740, + "train_runtime": 30630.1549, + "train_tokens_per_second": 115366.74 + }, + { + "epoch": 0.36526962309586297, + "grad_norm": 0.1715293824672699, + "learning_rate": 0.003786860898038038, + "loss": 3.133253288269043, + "num_input_tokens_seen": 3538944000, + "step": 6750, + "train_runtime": 30675.3114, + "train_tokens_per_second": 115367.826 + }, + { + "epoch": 0.3658107632782272, + "grad_norm": 0.16131816804409027, + "learning_rate": 0.0037833702754633005, + "loss": 3.137991714477539, + "num_input_tokens_seen": 3544186880, + "step": 6760, + "train_runtime": 30720.4583, + "train_tokens_per_second": 115368.945 + }, + { + "epoch": 0.3663519034605915, + "grad_norm": 0.16405366361141205, + "learning_rate": 0.003779876498056652, + "loss": 3.149972152709961, + "num_input_tokens_seen": 3549429760, + "step": 6770, + "train_runtime": 30765.5763, + "train_tokens_per_second": 115370.17 + }, + { + "epoch": 0.3668930436429557, + "grad_norm": 0.1677146553993225, + "learning_rate": 0.0037763795764844317, + "loss": 3.1432748794555665, + "num_input_tokens_seen": 3554672640, + "step": 6780, + "train_runtime": 30810.7138, + "train_tokens_per_second": 115371.317 + }, + { + "epoch": 0.36743418382531995, + "grad_norm": 0.1701316237449646, + "learning_rate": 0.003772879521422583, + "loss": 3.138026809692383, + "num_input_tokens_seen": 3559915520, + "step": 6790, + "train_runtime": 30855.8357, + "train_tokens_per_second": 115372.52 + }, + { + "epoch": 0.3679753240076842, + "grad_norm": 0.1724764108657837, + "learning_rate": 0.0037693763435566125, + "loss": 3.1394069671630858, + "num_input_tokens_seen": 3565158400, + "step": 6800, + "train_runtime": 30900.9517, + "train_tokens_per_second": 115373.741 + }, + { + "epoch": 0.3685164641900484, + "grad_norm": 0.16157887876033783, + "learning_rate": 0.00376587005358156, + "loss": 3.124007797241211, + "num_input_tokens_seen": 3570401280, + "step": 6810, + "train_runtime": 30946.0772, + "train_tokens_per_second": 115374.923 + }, + { + "epoch": 0.3690576043724127, + "grad_norm": 0.16729003190994263, + "learning_rate": 0.0037623606622019675, + "loss": 3.122986602783203, + "num_input_tokens_seen": 3575644160, + "step": 6820, + "train_runtime": 30991.3846, + "train_tokens_per_second": 115375.425 + }, + { + "epoch": 0.3695987445547769, + "grad_norm": 0.17239217460155487, + "learning_rate": 0.003758848180131846, + "loss": 3.1259433746337892, + "num_input_tokens_seen": 3580887040, + "step": 6830, + "train_runtime": 31036.5265, + "train_tokens_per_second": 115376.54 + }, + { + "epoch": 0.37013988473714116, + "grad_norm": 0.1540314108133316, + "learning_rate": 0.003755332618094642, + "loss": 3.128913688659668, + "num_input_tokens_seen": 3586129920, + "step": 6840, + "train_runtime": 31081.6974, + "train_tokens_per_second": 115377.544 + }, + { + "epoch": 0.3706810249195054, + "grad_norm": 0.16670770943164825, + "learning_rate": 0.0037518139868232036, + "loss": 3.1437910079956053, + "num_input_tokens_seen": 3591372800, + "step": 6850, + "train_runtime": 31126.8444, + "train_tokens_per_second": 115378.634 + }, + { + "epoch": 0.3712221651018696, + "grad_norm": 0.16100816428661346, + "learning_rate": 0.0037482922970597512, + "loss": 3.1303838729858398, + "num_input_tokens_seen": 3596615680, + "step": 6860, + "train_runtime": 31172.0038, + "train_tokens_per_second": 115379.675 + }, + { + "epoch": 0.3717633052842339, + "grad_norm": 0.1720798909664154, + "learning_rate": 0.0037447675595558417, + "loss": 3.139808464050293, + "num_input_tokens_seen": 3601858560, + "step": 6870, + "train_runtime": 31220.5874, + "train_tokens_per_second": 115368.059 + }, + { + "epoch": 0.37230444546659813, + "grad_norm": 0.15832237899303436, + "learning_rate": 0.0037412397850723356, + "loss": 3.1387088775634764, + "num_input_tokens_seen": 3607101440, + "step": 6880, + "train_runtime": 31265.7548, + "train_tokens_per_second": 115369.082 + }, + { + "epoch": 0.37284558564896236, + "grad_norm": 0.16572092473506927, + "learning_rate": 0.0037377089843793664, + "loss": 3.136234092712402, + "num_input_tokens_seen": 3612344320, + "step": 6890, + "train_runtime": 31310.8828, + "train_tokens_per_second": 115370.248 + }, + { + "epoch": 0.3733867258313266, + "grad_norm": 0.16967612504959106, + "learning_rate": 0.0037341751682563075, + "loss": 3.1306957244873046, + "num_input_tokens_seen": 3617587200, + "step": 6900, + "train_runtime": 31356.0169, + "train_tokens_per_second": 115371.388 + }, + { + "epoch": 0.3739278660136908, + "grad_norm": 0.16561359167099, + "learning_rate": 0.0037306383474917356, + "loss": 3.128021240234375, + "num_input_tokens_seen": 3622830080, + "step": 6910, + "train_runtime": 31401.1695, + "train_tokens_per_second": 115372.457 + }, + { + "epoch": 0.3744690061960551, + "grad_norm": 0.16602273285388947, + "learning_rate": 0.0037270985328834013, + "loss": 3.125231170654297, + "num_input_tokens_seen": 3628072960, + "step": 6920, + "train_runtime": 31446.3403, + "train_tokens_per_second": 115373.456 + }, + { + "epoch": 0.37501014637841934, + "grad_norm": 0.15461350977420807, + "learning_rate": 0.0037235557352381975, + "loss": 3.1283363342285155, + "num_input_tokens_seen": 3633315840, + "step": 6930, + "train_runtime": 31491.4936, + "train_tokens_per_second": 115374.516 + }, + { + "epoch": 0.37555128656078357, + "grad_norm": 0.17157427966594696, + "learning_rate": 0.003720009965372121, + "loss": 3.136751174926758, + "num_input_tokens_seen": 3638558720, + "step": 6940, + "train_runtime": 31536.6325, + "train_tokens_per_second": 115375.626 + }, + { + "epoch": 0.3760924267431478, + "grad_norm": 0.15815427899360657, + "learning_rate": 0.0037164612341102445, + "loss": 3.1335182189941406, + "num_input_tokens_seen": 3643801600, + "step": 6950, + "train_runtime": 31581.7854, + "train_tokens_per_second": 115376.682 + }, + { + "epoch": 0.37663356692551203, + "grad_norm": 0.16368745267391205, + "learning_rate": 0.003712909552286681, + "loss": 3.1299674987792967, + "num_input_tokens_seen": 3649044480, + "step": 6960, + "train_runtime": 31626.953, + "train_tokens_per_second": 115377.681 + }, + { + "epoch": 0.3771747071078763, + "grad_norm": 0.17233121395111084, + "learning_rate": 0.003709354930744553, + "loss": 3.1409616470336914, + "num_input_tokens_seen": 3654287360, + "step": 6970, + "train_runtime": 31672.1101, + "train_tokens_per_second": 115378.715 + }, + { + "epoch": 0.37771584729024055, + "grad_norm": 0.1784183382987976, + "learning_rate": 0.0037057973803359553, + "loss": 3.1445953369140627, + "num_input_tokens_seen": 3659530240, + "step": 6980, + "train_runtime": 31717.2675, + "train_tokens_per_second": 115379.745 + }, + { + "epoch": 0.3782569874726048, + "grad_norm": 0.1589273363351822, + "learning_rate": 0.003702236911921925, + "loss": 3.1336727142333984, + "num_input_tokens_seen": 3664773120, + "step": 6990, + "train_runtime": 31762.4428, + "train_tokens_per_second": 115380.707 + }, + { + "epoch": 0.378798127654969, + "grad_norm": 0.16604717075824738, + "learning_rate": 0.00369867353637241, + "loss": 3.125100326538086, + "num_input_tokens_seen": 3670016000, + "step": 7000, + "train_runtime": 31807.6091, + "train_tokens_per_second": 115381.7 + }, + { + "epoch": 0.378798127654969, + "eval_loss": 3.082562208175659, + "eval_runtime": 1.983, + "eval_samples_per_second": 252.143, + "eval_steps_per_second": 4.034, + "num_input_tokens_seen": 3670016000, + "step": 7000 + }, + { + "epoch": 0.37933926783733324, + "grad_norm": 0.16016067564487457, + "learning_rate": 0.003695107264566231, + "loss": 3.132742691040039, + "num_input_tokens_seen": 3675258880, + "step": 7010, + "train_runtime": 31857.0893, + "train_tokens_per_second": 115367.065 + }, + { + "epoch": 0.3798804080196975, + "grad_norm": 0.17284226417541504, + "learning_rate": 0.003691538107391052, + "loss": 3.1309505462646485, + "num_input_tokens_seen": 3680501760, + "step": 7020, + "train_runtime": 31902.2704, + "train_tokens_per_second": 115368.02 + }, + { + "epoch": 0.38042154820206175, + "grad_norm": 0.16180108487606049, + "learning_rate": 0.0036879660757433465, + "loss": 3.1276824951171873, + "num_input_tokens_seen": 3685744640, + "step": 7030, + "train_runtime": 31947.4422, + "train_tokens_per_second": 115369.006 + }, + { + "epoch": 0.380962688384426, + "grad_norm": 0.16350635886192322, + "learning_rate": 0.0036843911805283613, + "loss": 3.127395248413086, + "num_input_tokens_seen": 3690987520, + "step": 7040, + "train_runtime": 31992.5853, + "train_tokens_per_second": 115370.092 + }, + { + "epoch": 0.3815038285667902, + "grad_norm": 0.15854142606258392, + "learning_rate": 0.0036808134326600872, + "loss": 3.1203243255615236, + "num_input_tokens_seen": 3696230400, + "step": 7050, + "train_runtime": 32037.7375, + "train_tokens_per_second": 115371.143 + }, + { + "epoch": 0.38204496874915445, + "grad_norm": 0.1765364557504654, + "learning_rate": 0.0036772328430612245, + "loss": 3.1236772537231445, + "num_input_tokens_seen": 3701473280, + "step": 7060, + "train_runtime": 32082.8987, + "train_tokens_per_second": 115372.159 + }, + { + "epoch": 0.38258610893151873, + "grad_norm": 0.16590341925621033, + "learning_rate": 0.0036736494226631486, + "loss": 3.1179275512695312, + "num_input_tokens_seen": 3706716160, + "step": 7070, + "train_runtime": 32128.0461, + "train_tokens_per_second": 115373.221 + }, + { + "epoch": 0.38312724911388296, + "grad_norm": 0.1656789630651474, + "learning_rate": 0.0036700631824058763, + "loss": 3.1220640182495116, + "num_input_tokens_seen": 3711959040, + "step": 7080, + "train_runtime": 32173.2014, + "train_tokens_per_second": 115374.252 + }, + { + "epoch": 0.3836683892962472, + "grad_norm": 0.18290071189403534, + "learning_rate": 0.003666474133238036, + "loss": 3.130259704589844, + "num_input_tokens_seen": 3717201920, + "step": 7090, + "train_runtime": 32218.3695, + "train_tokens_per_second": 115375.234 + }, + { + "epoch": 0.3842095294786114, + "grad_norm": 0.1678554117679596, + "learning_rate": 0.003662882286116827, + "loss": 3.128999137878418, + "num_input_tokens_seen": 3722444800, + "step": 7100, + "train_runtime": 32263.5278, + "train_tokens_per_second": 115376.248 + }, + { + "epoch": 0.38475066966097565, + "grad_norm": 0.16328170895576477, + "learning_rate": 0.0036592876520079956, + "loss": 3.1096935272216797, + "num_input_tokens_seen": 3727687680, + "step": 7110, + "train_runtime": 32308.6892, + "train_tokens_per_second": 115377.249 + }, + { + "epoch": 0.38529180984333994, + "grad_norm": 0.16377384960651398, + "learning_rate": 0.0036556902418857927, + "loss": 3.1283496856689452, + "num_input_tokens_seen": 3732930560, + "step": 7120, + "train_runtime": 32353.8348, + "train_tokens_per_second": 115378.303 + }, + { + "epoch": 0.38583295002570417, + "grad_norm": 0.17365527153015137, + "learning_rate": 0.0036520900667329475, + "loss": 3.1340274810791016, + "num_input_tokens_seen": 3738173440, + "step": 7130, + "train_runtime": 32398.9948, + "train_tokens_per_second": 115379.303 + }, + { + "epoch": 0.3863740902080684, + "grad_norm": 0.17289578914642334, + "learning_rate": 0.003648487137540628, + "loss": 3.126075553894043, + "num_input_tokens_seen": 3743416320, + "step": 7140, + "train_runtime": 32444.1388, + "train_tokens_per_second": 115380.357 + }, + { + "epoch": 0.38691523039043263, + "grad_norm": 0.1867065280675888, + "learning_rate": 0.003644881465308411, + "loss": 3.1279239654541016, + "num_input_tokens_seen": 3748659200, + "step": 7150, + "train_runtime": 32489.3038, + "train_tokens_per_second": 115381.334 + }, + { + "epoch": 0.38745637057279686, + "grad_norm": 0.16090157628059387, + "learning_rate": 0.003641273061044249, + "loss": 3.126418685913086, + "num_input_tokens_seen": 3753902080, + "step": 7160, + "train_runtime": 32534.4706, + "train_tokens_per_second": 115382.301 + }, + { + "epoch": 0.38799751075516115, + "grad_norm": 0.16933725774288177, + "learning_rate": 0.003637661935764434, + "loss": 3.1228607177734373, + "num_input_tokens_seen": 3759144960, + "step": 7170, + "train_runtime": 32579.6304, + "train_tokens_per_second": 115383.29 + }, + { + "epoch": 0.3885386509375254, + "grad_norm": 0.16463743150234222, + "learning_rate": 0.003634048100493565, + "loss": 3.1265775680541994, + "num_input_tokens_seen": 3764387840, + "step": 7180, + "train_runtime": 32624.7971, + "train_tokens_per_second": 115384.253 + }, + { + "epoch": 0.3890797911198896, + "grad_norm": 0.15814442932605743, + "learning_rate": 0.003630431566264515, + "loss": 3.126376724243164, + "num_input_tokens_seen": 3769630720, + "step": 7190, + "train_runtime": 32669.9527, + "train_tokens_per_second": 115385.252 + }, + { + "epoch": 0.38962093130225384, + "grad_norm": 0.16953812539577484, + "learning_rate": 0.0036268123441183966, + "loss": 3.1293899536132814, + "num_input_tokens_seen": 3774873600, + "step": 7200, + "train_runtime": 32715.1316, + "train_tokens_per_second": 115386.166 + }, + { + "epoch": 0.39016207148461807, + "grad_norm": 0.18077914416790009, + "learning_rate": 0.003623190445104527, + "loss": 3.130533218383789, + "num_input_tokens_seen": 3780116480, + "step": 7210, + "train_runtime": 32760.3295, + "train_tokens_per_second": 115387.01 + }, + { + "epoch": 0.39070321166698235, + "grad_norm": 0.17073588073253632, + "learning_rate": 0.003619565880280401, + "loss": 3.1266639709472654, + "num_input_tokens_seen": 3785359360, + "step": 7220, + "train_runtime": 32805.4983, + "train_tokens_per_second": 115387.955 + }, + { + "epoch": 0.3912443518493466, + "grad_norm": 0.16945651173591614, + "learning_rate": 0.0036159386607116446, + "loss": 3.1234695434570314, + "num_input_tokens_seen": 3790602240, + "step": 7230, + "train_runtime": 32850.6502, + "train_tokens_per_second": 115388.956 + }, + { + "epoch": 0.3917854920317108, + "grad_norm": 0.17761710286140442, + "learning_rate": 0.0036123087974719937, + "loss": 3.127792739868164, + "num_input_tokens_seen": 3795845120, + "step": 7240, + "train_runtime": 32895.8256, + "train_tokens_per_second": 115389.873 + }, + { + "epoch": 0.39232663221407504, + "grad_norm": 0.16878648102283478, + "learning_rate": 0.0036086763016432545, + "loss": 3.120273208618164, + "num_input_tokens_seen": 3801088000, + "step": 7250, + "train_runtime": 32945.144, + "train_tokens_per_second": 115376.275 + }, + { + "epoch": 0.3928677723964393, + "grad_norm": 0.15386980772018433, + "learning_rate": 0.0036050411843152686, + "loss": 3.1222068786621096, + "num_input_tokens_seen": 3806330880, + "step": 7260, + "train_runtime": 32990.288, + "train_tokens_per_second": 115377.316 + }, + { + "epoch": 0.39340891257880356, + "grad_norm": 0.16980594396591187, + "learning_rate": 0.0036014034565858824, + "loss": 3.1281028747558595, + "num_input_tokens_seen": 3811573760, + "step": 7270, + "train_runtime": 33035.4429, + "train_tokens_per_second": 115378.316 + }, + { + "epoch": 0.3939500527611678, + "grad_norm": 0.17536021769046783, + "learning_rate": 0.003597763129560911, + "loss": 3.1235652923583985, + "num_input_tokens_seen": 3816816640, + "step": 7280, + "train_runtime": 33080.605, + "train_tokens_per_second": 115379.288 + }, + { + "epoch": 0.394491192943532, + "grad_norm": 0.1680123209953308, + "learning_rate": 0.0035941202143541053, + "loss": 3.123764991760254, + "num_input_tokens_seen": 3822059520, + "step": 7290, + "train_runtime": 33125.7503, + "train_tokens_per_second": 115380.315 + }, + { + "epoch": 0.39503233312589625, + "grad_norm": 0.15840236842632294, + "learning_rate": 0.003590474722087118, + "loss": 3.124995803833008, + "num_input_tokens_seen": 3827302400, + "step": 7300, + "train_runtime": 33170.9067, + "train_tokens_per_second": 115381.302 + }, + { + "epoch": 0.3955734733082605, + "grad_norm": 0.1702660471200943, + "learning_rate": 0.00358682666388947, + "loss": 3.1230545043945312, + "num_input_tokens_seen": 3832545280, + "step": 7310, + "train_runtime": 33216.0627, + "train_tokens_per_second": 115382.287 + }, + { + "epoch": 0.39611461349062477, + "grad_norm": 0.14530692994594574, + "learning_rate": 0.003583176050898514, + "loss": 3.1195556640625, + "num_input_tokens_seen": 3837788160, + "step": 7320, + "train_runtime": 33261.2169, + "train_tokens_per_second": 115383.276 + }, + { + "epoch": 0.396655753672989, + "grad_norm": 0.16137973964214325, + "learning_rate": 0.003579522894259404, + "loss": 3.122934341430664, + "num_input_tokens_seen": 3843031040, + "step": 7330, + "train_runtime": 33306.3711, + "train_tokens_per_second": 115384.262 + }, + { + "epoch": 0.39719689385535323, + "grad_norm": 0.17957496643066406, + "learning_rate": 0.0035758672051250597, + "loss": 3.118304443359375, + "num_input_tokens_seen": 3848273920, + "step": 7340, + "train_runtime": 33351.4951, + "train_tokens_per_second": 115385.35 + }, + { + "epoch": 0.39773803403771746, + "grad_norm": 0.1619359254837036, + "learning_rate": 0.003572208994656131, + "loss": 3.126445007324219, + "num_input_tokens_seen": 3853516800, + "step": 7350, + "train_runtime": 33396.6238, + "train_tokens_per_second": 115386.418 + }, + { + "epoch": 0.3982791742200817, + "grad_norm": 0.17734915018081665, + "learning_rate": 0.003568548274020967, + "loss": 3.1167884826660157, + "num_input_tokens_seen": 3858759680, + "step": 7360, + "train_runtime": 33441.7562, + "train_tokens_per_second": 115387.471 + }, + { + "epoch": 0.398820314402446, + "grad_norm": 0.17586900293827057, + "learning_rate": 0.0035648850543955773, + "loss": 3.1228519439697267, + "num_input_tokens_seen": 3864002560, + "step": 7370, + "train_runtime": 33486.9063, + "train_tokens_per_second": 115388.46 + }, + { + "epoch": 0.3993614545848102, + "grad_norm": 0.17276950180530548, + "learning_rate": 0.0035612193469636054, + "loss": 3.1270915985107424, + "num_input_tokens_seen": 3869245440, + "step": 7380, + "train_runtime": 33532.0567, + "train_tokens_per_second": 115389.446 + }, + { + "epoch": 0.39990259476717444, + "grad_norm": 0.1578545719385147, + "learning_rate": 0.0035575511629162876, + "loss": 3.102129364013672, + "num_input_tokens_seen": 3874488320, + "step": 7390, + "train_runtime": 33577.2022, + "train_tokens_per_second": 115390.445 + }, + { + "epoch": 0.40044373494953867, + "grad_norm": 0.15498770773410797, + "learning_rate": 0.0035538805134524183, + "loss": 3.115239715576172, + "num_input_tokens_seen": 3879731200, + "step": 7400, + "train_runtime": 33622.363, + "train_tokens_per_second": 115391.39 + }, + { + "epoch": 0.4009848751319029, + "grad_norm": 0.15868115425109863, + "learning_rate": 0.0035502074097783242, + "loss": 3.1181896209716795, + "num_input_tokens_seen": 3884974080, + "step": 7410, + "train_runtime": 33667.5163, + "train_tokens_per_second": 115392.358 + }, + { + "epoch": 0.4015260153142672, + "grad_norm": 0.1605597585439682, + "learning_rate": 0.0035465318631078204, + "loss": 3.113156318664551, + "num_input_tokens_seen": 3890216960, + "step": 7420, + "train_runtime": 33712.6623, + "train_tokens_per_second": 115393.348 + }, + { + "epoch": 0.4020671554966314, + "grad_norm": 0.17280755937099457, + "learning_rate": 0.003542853884662183, + "loss": 3.1183053970336916, + "num_input_tokens_seen": 3895459840, + "step": 7430, + "train_runtime": 33757.8255, + "train_tokens_per_second": 115394.276 + }, + { + "epoch": 0.40260829567899564, + "grad_norm": 0.16187331080436707, + "learning_rate": 0.0035391734856701092, + "loss": 3.1163970947265627, + "num_input_tokens_seen": 3900702720, + "step": 7440, + "train_runtime": 33802.989, + "train_tokens_per_second": 115395.201 + }, + { + "epoch": 0.4031494358613599, + "grad_norm": 0.1724129021167755, + "learning_rate": 0.0035354906773676894, + "loss": 3.1170070648193358, + "num_input_tokens_seen": 3905945600, + "step": 7450, + "train_runtime": 33848.1517, + "train_tokens_per_second": 115396.127 + }, + { + "epoch": 0.4036905760437241, + "grad_norm": 0.17225228250026703, + "learning_rate": 0.003531805470998366, + "loss": 3.110821533203125, + "num_input_tokens_seen": 3911188480, + "step": 7460, + "train_runtime": 33893.3266, + "train_tokens_per_second": 115397.008 + }, + { + "epoch": 0.4042317162260884, + "grad_norm": 0.1592818796634674, + "learning_rate": 0.0035281178778129073, + "loss": 3.116873931884766, + "num_input_tokens_seen": 3916431360, + "step": 7470, + "train_runtime": 33938.5013, + "train_tokens_per_second": 115397.888 + }, + { + "epoch": 0.4047728564084526, + "grad_norm": 0.1658582091331482, + "learning_rate": 0.0035244279090693633, + "loss": 3.1268436431884767, + "num_input_tokens_seen": 3921674240, + "step": 7480, + "train_runtime": 33983.671, + "train_tokens_per_second": 115398.782 + }, + { + "epoch": 0.40531399659081685, + "grad_norm": 0.14836189150810242, + "learning_rate": 0.00352073557603304, + "loss": 3.114876556396484, + "num_input_tokens_seen": 3926917120, + "step": 7490, + "train_runtime": 34028.8257, + "train_tokens_per_second": 115399.725 + }, + { + "epoch": 0.4058551367731811, + "grad_norm": 0.16045086085796356, + "learning_rate": 0.0035170408899764605, + "loss": 3.1156852722167967, + "num_input_tokens_seen": 3932160000, + "step": 7500, + "train_runtime": 34073.9726, + "train_tokens_per_second": 115400.692 + }, + { + "epoch": 0.4058551367731811, + "eval_loss": 3.0682461261749268, + "eval_runtime": 1.9852, + "eval_samples_per_second": 251.858, + "eval_steps_per_second": 4.03, + "num_input_tokens_seen": 3932160000, + "step": 7500 + }, + { + "epoch": 0.4063962769555453, + "grad_norm": 0.16971535980701447, + "learning_rate": 0.0035133438621793296, + "loss": 3.1024160385131836, + "num_input_tokens_seen": 3937402880, + "step": 7510, + "train_runtime": 34121.1044, + "train_tokens_per_second": 115394.943 + }, + { + "epoch": 0.4069374171379096, + "grad_norm": 0.16736076772212982, + "learning_rate": 0.003509644503928506, + "loss": 3.1206098556518556, + "num_input_tokens_seen": 3942645760, + "step": 7520, + "train_runtime": 34166.2706, + "train_tokens_per_second": 115395.848 + }, + { + "epoch": 0.4074785573202738, + "grad_norm": 0.16113705933094025, + "learning_rate": 0.0035059428265179567, + "loss": 3.117937469482422, + "num_input_tokens_seen": 3947888640, + "step": 7530, + "train_runtime": 34211.4099, + "train_tokens_per_second": 115396.841 + }, + { + "epoch": 0.40801969750263806, + "grad_norm": 0.17517107725143433, + "learning_rate": 0.0035022388412487356, + "loss": 3.1136932373046875, + "num_input_tokens_seen": 3953131520, + "step": 7540, + "train_runtime": 34256.533, + "train_tokens_per_second": 115397.887 + }, + { + "epoch": 0.4085608376850023, + "grad_norm": 0.18709343671798706, + "learning_rate": 0.003498532559428938, + "loss": 3.125676918029785, + "num_input_tokens_seen": 3958374400, + "step": 7550, + "train_runtime": 34301.6505, + "train_tokens_per_second": 115398.949 + }, + { + "epoch": 0.4091019778673665, + "grad_norm": 0.1633439064025879, + "learning_rate": 0.0034948239923736713, + "loss": 3.1128585815429686, + "num_input_tokens_seen": 3963617280, + "step": 7560, + "train_runtime": 34346.7672, + "train_tokens_per_second": 115400.01 + }, + { + "epoch": 0.4096431180497308, + "grad_norm": 0.16776174306869507, + "learning_rate": 0.0034911131514050214, + "loss": 3.114968681335449, + "num_input_tokens_seen": 3968860160, + "step": 7570, + "train_runtime": 34391.881, + "train_tokens_per_second": 115401.078 + }, + { + "epoch": 0.41018425823209503, + "grad_norm": 0.17015814781188965, + "learning_rate": 0.0034874000478520148, + "loss": 3.1098609924316407, + "num_input_tokens_seen": 3974103040, + "step": 7580, + "train_runtime": 34437.0464, + "train_tokens_per_second": 115401.971 + }, + { + "epoch": 0.41072539841445926, + "grad_norm": 0.1705334633588791, + "learning_rate": 0.0034836846930505843, + "loss": 3.1172601699829103, + "num_input_tokens_seen": 3979345920, + "step": 7590, + "train_runtime": 34482.2174, + "train_tokens_per_second": 115402.843 + }, + { + "epoch": 0.4112665385968235, + "grad_norm": 0.17283746600151062, + "learning_rate": 0.0034799670983435395, + "loss": 3.1093212127685548, + "num_input_tokens_seen": 3984588800, + "step": 7600, + "train_runtime": 34527.3958, + "train_tokens_per_second": 115403.688 + }, + { + "epoch": 0.4118076787791877, + "grad_norm": 0.1661679744720459, + "learning_rate": 0.003476247275080524, + "loss": 3.114109992980957, + "num_input_tokens_seen": 3989831680, + "step": 7610, + "train_runtime": 34572.5667, + "train_tokens_per_second": 115404.555 + }, + { + "epoch": 0.412348818961552, + "grad_norm": 0.16221173107624054, + "learning_rate": 0.003472525234617988, + "loss": 3.1130563735961916, + "num_input_tokens_seen": 3995074560, + "step": 7620, + "train_runtime": 34617.7625, + "train_tokens_per_second": 115405.337 + }, + { + "epoch": 0.41288995914391624, + "grad_norm": 0.16985613107681274, + "learning_rate": 0.0034688009883191507, + "loss": 3.1183204650878906, + "num_input_tokens_seen": 4000317440, + "step": 7630, + "train_runtime": 34662.948, + "train_tokens_per_second": 115406.152 + }, + { + "epoch": 0.41343109932628047, + "grad_norm": 0.15718990564346313, + "learning_rate": 0.003465074547553963, + "loss": 3.1192548751831053, + "num_input_tokens_seen": 4005560320, + "step": 7640, + "train_runtime": 34711.7967, + "train_tokens_per_second": 115394.785 + }, + { + "epoch": 0.4139722395086447, + "grad_norm": 0.16134141385555267, + "learning_rate": 0.0034613459236990775, + "loss": 3.1101545333862304, + "num_input_tokens_seen": 4010803200, + "step": 7650, + "train_runtime": 34756.9452, + "train_tokens_per_second": 115395.734 + }, + { + "epoch": 0.41451337969100893, + "grad_norm": 0.16892403364181519, + "learning_rate": 0.0034576151281378127, + "loss": 3.103810691833496, + "num_input_tokens_seen": 4016046080, + "step": 7660, + "train_runtime": 34802.1069, + "train_tokens_per_second": 115396.637 + }, + { + "epoch": 0.4150545198733732, + "grad_norm": 0.15722833573818207, + "learning_rate": 0.003453882172260114, + "loss": 3.109886360168457, + "num_input_tokens_seen": 4021288960, + "step": 7670, + "train_runtime": 34847.275, + "train_tokens_per_second": 115397.516 + }, + { + "epoch": 0.41559566005573745, + "grad_norm": 0.16605538129806519, + "learning_rate": 0.0034501470674625258, + "loss": 3.110805892944336, + "num_input_tokens_seen": 4026531840, + "step": 7680, + "train_runtime": 34892.47, + "train_tokens_per_second": 115398.303 + }, + { + "epoch": 0.4161368002381017, + "grad_norm": 0.1643964648246765, + "learning_rate": 0.003446409825148149, + "loss": 3.11865348815918, + "num_input_tokens_seen": 4031774720, + "step": 7690, + "train_runtime": 34937.6366, + "train_tokens_per_second": 115399.183 + }, + { + "epoch": 0.4166779404204659, + "grad_norm": 0.17231661081314087, + "learning_rate": 0.003442670456726614, + "loss": 3.117427444458008, + "num_input_tokens_seen": 4037017600, + "step": 7700, + "train_runtime": 34982.8067, + "train_tokens_per_second": 115400.049 + }, + { + "epoch": 0.41721908060283014, + "grad_norm": 0.16913042962551117, + "learning_rate": 0.0034389289736140405, + "loss": 3.1114864349365234, + "num_input_tokens_seen": 4042260480, + "step": 7710, + "train_runtime": 35027.9883, + "train_tokens_per_second": 115400.874 + }, + { + "epoch": 0.4177602207851944, + "grad_norm": 0.16182249784469604, + "learning_rate": 0.0034351853872330042, + "loss": 3.107219696044922, + "num_input_tokens_seen": 4047503360, + "step": 7720, + "train_runtime": 35073.1627, + "train_tokens_per_second": 115401.722 + }, + { + "epoch": 0.41830136096755866, + "grad_norm": 0.15614280104637146, + "learning_rate": 0.003431439709012501, + "loss": 3.10361385345459, + "num_input_tokens_seen": 4052746240, + "step": 7730, + "train_runtime": 35118.3339, + "train_tokens_per_second": 115402.577 + }, + { + "epoch": 0.4188425011499229, + "grad_norm": 0.16172853112220764, + "learning_rate": 0.003427691950387916, + "loss": 3.10665225982666, + "num_input_tokens_seen": 4057989120, + "step": 7740, + "train_runtime": 35163.5186, + "train_tokens_per_second": 115403.386 + }, + { + "epoch": 0.4193836413322871, + "grad_norm": 0.1584424078464508, + "learning_rate": 0.0034239421228009826, + "loss": 3.109303665161133, + "num_input_tokens_seen": 4063232000, + "step": 7750, + "train_runtime": 35208.6903, + "train_tokens_per_second": 115404.236 + }, + { + "epoch": 0.41992478151465135, + "grad_norm": 0.15736353397369385, + "learning_rate": 0.0034201902376997523, + "loss": 3.1072481155395506, + "num_input_tokens_seen": 4068474880, + "step": 7760, + "train_runtime": 35253.8805, + "train_tokens_per_second": 115405.023 + }, + { + "epoch": 0.42046592169701563, + "grad_norm": 0.158221036195755, + "learning_rate": 0.0034164363065385577, + "loss": 3.107033920288086, + "num_input_tokens_seen": 4073717760, + "step": 7770, + "train_runtime": 35299.0377, + "train_tokens_per_second": 115405.915 + }, + { + "epoch": 0.42100706187937986, + "grad_norm": 0.16100963950157166, + "learning_rate": 0.0034126803407779783, + "loss": 3.102493667602539, + "num_input_tokens_seen": 4078960640, + "step": 7780, + "train_runtime": 35344.2177, + "train_tokens_per_second": 115406.732 + }, + { + "epoch": 0.4215482020617441, + "grad_norm": 0.15508411824703217, + "learning_rate": 0.0034089223518848043, + "loss": 3.110720634460449, + "num_input_tokens_seen": 4084203520, + "step": 7790, + "train_runtime": 35389.3807, + "train_tokens_per_second": 115407.601 + }, + { + "epoch": 0.4220893422441083, + "grad_norm": 0.16234534978866577, + "learning_rate": 0.0034051623513320028, + "loss": 3.116852378845215, + "num_input_tokens_seen": 4089446400, + "step": 7800, + "train_runtime": 35434.5473, + "train_tokens_per_second": 115408.456 + }, + { + "epoch": 0.42263048242647255, + "grad_norm": 0.15150156617164612, + "learning_rate": 0.003401400350598683, + "loss": 3.110218048095703, + "num_input_tokens_seen": 4094689280, + "step": 7810, + "train_runtime": 35479.7081, + "train_tokens_per_second": 115409.328 + }, + { + "epoch": 0.42317162260883684, + "grad_norm": 0.16316647827625275, + "learning_rate": 0.0033976363611700608, + "loss": 3.099168395996094, + "num_input_tokens_seen": 4099932160, + "step": 7820, + "train_runtime": 35524.9004, + "train_tokens_per_second": 115410.096 + }, + { + "epoch": 0.42371276279120107, + "grad_norm": 0.15622437000274658, + "learning_rate": 0.00339387039453742, + "loss": 3.1079681396484373, + "num_input_tokens_seen": 4105175040, + "step": 7830, + "train_runtime": 35570.568, + "train_tokens_per_second": 115409.319 + }, + { + "epoch": 0.4242539029735653, + "grad_norm": 0.1611352562904358, + "learning_rate": 0.0033901024621980865, + "loss": 3.1027732849121095, + "num_input_tokens_seen": 4110417920, + "step": 7840, + "train_runtime": 35628.7933, + "train_tokens_per_second": 115367.868 + }, + { + "epoch": 0.42479504315592953, + "grad_norm": 0.1534154713153839, + "learning_rate": 0.0033863325756553824, + "loss": 3.1010990142822266, + "num_input_tokens_seen": 4115660800, + "step": 7850, + "train_runtime": 35677.6783, + "train_tokens_per_second": 115356.744 + }, + { + "epoch": 0.42533618333829376, + "grad_norm": 0.16484984755516052, + "learning_rate": 0.0033825607464185994, + "loss": 3.0935718536376955, + "num_input_tokens_seen": 4120903680, + "step": 7860, + "train_runtime": 35722.8483, + "train_tokens_per_second": 115357.646 + }, + { + "epoch": 0.42587732352065805, + "grad_norm": 0.15278859436511993, + "learning_rate": 0.0033787869860029576, + "loss": 3.095734405517578, + "num_input_tokens_seen": 4126146560, + "step": 7870, + "train_runtime": 35768.0118, + "train_tokens_per_second": 115358.566 + }, + { + "epoch": 0.4264184637030223, + "grad_norm": 0.16884206235408783, + "learning_rate": 0.003375011305929574, + "loss": 3.1056522369384765, + "num_input_tokens_seen": 4131389440, + "step": 7880, + "train_runtime": 35813.1554, + "train_tokens_per_second": 115359.549 + }, + { + "epoch": 0.4269596038853865, + "grad_norm": 0.15963584184646606, + "learning_rate": 0.003371233717725426, + "loss": 3.1040569305419923, + "num_input_tokens_seen": 4136632320, + "step": 7890, + "train_runtime": 35858.3104, + "train_tokens_per_second": 115360.492 + }, + { + "epoch": 0.42750074406775074, + "grad_norm": 0.1541411578655243, + "learning_rate": 0.0033674542329233175, + "loss": 3.1086753845214843, + "num_input_tokens_seen": 4141875200, + "step": 7900, + "train_runtime": 35903.4547, + "train_tokens_per_second": 115361.467 + }, + { + "epoch": 0.42804188425011497, + "grad_norm": 0.16819094121456146, + "learning_rate": 0.003363672863061842, + "loss": 3.108404350280762, + "num_input_tokens_seen": 4147118080, + "step": 7910, + "train_runtime": 35948.5895, + "train_tokens_per_second": 115362.47 + }, + { + "epoch": 0.42858302443247925, + "grad_norm": 0.15858127176761627, + "learning_rate": 0.003359889619685346, + "loss": 3.1061111450195313, + "num_input_tokens_seen": 4152360960, + "step": 7920, + "train_runtime": 35993.7102, + "train_tokens_per_second": 115363.516 + }, + { + "epoch": 0.4291241646148435, + "grad_norm": 0.15731550753116608, + "learning_rate": 0.003356104514343899, + "loss": 3.1057785034179686, + "num_input_tokens_seen": 4157603840, + "step": 7930, + "train_runtime": 36038.862, + "train_tokens_per_second": 115364.46 + }, + { + "epoch": 0.4296653047972077, + "grad_norm": 0.14610068500041962, + "learning_rate": 0.0033523175585932524, + "loss": 3.09300537109375, + "num_input_tokens_seen": 4162846720, + "step": 7940, + "train_runtime": 36084.0029, + "train_tokens_per_second": 115365.436 + }, + { + "epoch": 0.43020644497957194, + "grad_norm": 0.1672324687242508, + "learning_rate": 0.003348528763994809, + "loss": 3.1017438888549806, + "num_input_tokens_seen": 4168089600, + "step": 7950, + "train_runtime": 36129.1342, + "train_tokens_per_second": 115366.44 + }, + { + "epoch": 0.4307475851619362, + "grad_norm": 0.1700795590877533, + "learning_rate": 0.003344738142115583, + "loss": 3.0958410263061524, + "num_input_tokens_seen": 4173332480, + "step": 7960, + "train_runtime": 36174.3238, + "train_tokens_per_second": 115367.256 + }, + { + "epoch": 0.43128872534430046, + "grad_norm": 0.15165534615516663, + "learning_rate": 0.00334094570452817, + "loss": 3.101241874694824, + "num_input_tokens_seen": 4178575360, + "step": 7970, + "train_runtime": 36219.5195, + "train_tokens_per_second": 115368.051 + }, + { + "epoch": 0.4318298655266647, + "grad_norm": 0.1584347039461136, + "learning_rate": 0.0033371514628107073, + "loss": 3.101197052001953, + "num_input_tokens_seen": 4183818240, + "step": 7980, + "train_runtime": 36264.6831, + "train_tokens_per_second": 115368.945 + }, + { + "epoch": 0.4323710057090289, + "grad_norm": 0.15928910672664642, + "learning_rate": 0.0033333554285468387, + "loss": 3.1082935333251953, + "num_input_tokens_seen": 4189061120, + "step": 7990, + "train_runtime": 36309.973, + "train_tokens_per_second": 115369.436 + }, + { + "epoch": 0.43291214589139315, + "grad_norm": 0.15439751744270325, + "learning_rate": 0.003329557613325685, + "loss": 3.1111793518066406, + "num_input_tokens_seen": 4194304000, + "step": 8000, + "train_runtime": 36355.1728, + "train_tokens_per_second": 115370.212 + }, + { + "epoch": 0.43291214589139315, + "eval_loss": 3.0542824268341064, + "eval_runtime": 1.9899, + "eval_samples_per_second": 251.266, + "eval_steps_per_second": 4.02, + "num_input_tokens_seen": 4194304000, + "step": 8000 + }, + { + "epoch": 0.4334532860737574, + "grad_norm": 0.1620936542749405, + "learning_rate": 0.0033257580287417987, + "loss": 3.1044567108154295, + "num_input_tokens_seen": 4199546880, + "step": 8010, + "train_runtime": 36404.8051, + "train_tokens_per_second": 115356.939 + }, + { + "epoch": 0.43399442625612167, + "grad_norm": 0.16753153502941132, + "learning_rate": 0.0033219566863951383, + "loss": 3.0971731185913085, + "num_input_tokens_seen": 4204789760, + "step": 8020, + "train_runtime": 36454.2975, + "train_tokens_per_second": 115344.144 + }, + { + "epoch": 0.4345355664384859, + "grad_norm": 0.15557527542114258, + "learning_rate": 0.0033181535978910265, + "loss": 3.099981689453125, + "num_input_tokens_seen": 4210032640, + "step": 8030, + "train_runtime": 36499.5202, + "train_tokens_per_second": 115344.876 + }, + { + "epoch": 0.43507670662085013, + "grad_norm": 0.15290997922420502, + "learning_rate": 0.0033143487748401174, + "loss": 3.1018728256225585, + "num_input_tokens_seen": 4215275520, + "step": 8040, + "train_runtime": 36544.6707, + "train_tokens_per_second": 115345.834 + }, + { + "epoch": 0.43561784680321436, + "grad_norm": 0.15225110948085785, + "learning_rate": 0.0033105422288583616, + "loss": 3.09820671081543, + "num_input_tokens_seen": 4220518400, + "step": 8050, + "train_runtime": 36589.8576, + "train_tokens_per_second": 115346.675 + }, + { + "epoch": 0.4361589869855786, + "grad_norm": 0.16044707596302032, + "learning_rate": 0.003306733971566968, + "loss": 3.1036590576171874, + "num_input_tokens_seen": 4225761280, + "step": 8060, + "train_runtime": 36635.0375, + "train_tokens_per_second": 115347.535 + }, + { + "epoch": 0.4367001271679429, + "grad_norm": 0.1795279085636139, + "learning_rate": 0.0033029240145923708, + "loss": 3.102092170715332, + "num_input_tokens_seen": 4231004160, + "step": 8070, + "train_runtime": 36680.2289, + "train_tokens_per_second": 115348.357 + }, + { + "epoch": 0.4372412673503071, + "grad_norm": 0.16536639630794525, + "learning_rate": 0.003299112369566194, + "loss": 3.101215934753418, + "num_input_tokens_seen": 4236247040, + "step": 8080, + "train_runtime": 36725.4044, + "train_tokens_per_second": 115349.228 + }, + { + "epoch": 0.43778240753267134, + "grad_norm": 0.1551489681005478, + "learning_rate": 0.003295299048125215, + "loss": 3.1048954010009764, + "num_input_tokens_seen": 4241489920, + "step": 8090, + "train_runtime": 36770.5602, + "train_tokens_per_second": 115350.158 + }, + { + "epoch": 0.43832354771503557, + "grad_norm": 0.15728254616260529, + "learning_rate": 0.0032914840619113267, + "loss": 3.0963891983032226, + "num_input_tokens_seen": 4246732800, + "step": 8100, + "train_runtime": 36815.7389, + "train_tokens_per_second": 115351.014 + }, + { + "epoch": 0.4388646878973998, + "grad_norm": 0.16317813098430634, + "learning_rate": 0.0032876674225715092, + "loss": 3.095835876464844, + "num_input_tokens_seen": 4251975680, + "step": 8110, + "train_runtime": 36860.9041, + "train_tokens_per_second": 115351.91 + }, + { + "epoch": 0.4394058280797641, + "grad_norm": 0.16513289511203766, + "learning_rate": 0.0032838491417577845, + "loss": 3.100272369384766, + "num_input_tokens_seen": 4257218560, + "step": 8120, + "train_runtime": 36906.0614, + "train_tokens_per_second": 115352.828 + }, + { + "epoch": 0.4399469682621283, + "grad_norm": 0.154524028301239, + "learning_rate": 0.003280029231127189, + "loss": 3.1007152557373048, + "num_input_tokens_seen": 4262461440, + "step": 8130, + "train_runtime": 36951.2345, + "train_tokens_per_second": 115353.695 + }, + { + "epoch": 0.44048810844449254, + "grad_norm": 0.16192783415317535, + "learning_rate": 0.003276207702341735, + "loss": 3.1067665100097654, + "num_input_tokens_seen": 4267704320, + "step": 8140, + "train_runtime": 36996.3989, + "train_tokens_per_second": 115354.587 + }, + { + "epoch": 0.4410292486268568, + "grad_norm": 0.1726672351360321, + "learning_rate": 0.003272384567068373, + "loss": 3.098089027404785, + "num_input_tokens_seen": 4272947200, + "step": 8150, + "train_runtime": 37041.5682, + "train_tokens_per_second": 115355.462 + }, + { + "epoch": 0.441570388809221, + "grad_norm": 0.14850319921970367, + "learning_rate": 0.00326855983697896, + "loss": 3.0921985626220705, + "num_input_tokens_seen": 4278190080, + "step": 8160, + "train_runtime": 37086.741, + "train_tokens_per_second": 115356.323 + }, + { + "epoch": 0.4421115289915853, + "grad_norm": 0.15166330337524414, + "learning_rate": 0.0032647335237502195, + "loss": 3.101424789428711, + "num_input_tokens_seen": 4283432960, + "step": 8170, + "train_runtime": 37131.9267, + "train_tokens_per_second": 115357.142 + }, + { + "epoch": 0.4426526691739495, + "grad_norm": 0.15639446675777435, + "learning_rate": 0.0032609056390637114, + "loss": 3.098773193359375, + "num_input_tokens_seen": 4288675840, + "step": 8180, + "train_runtime": 37177.0966, + "train_tokens_per_second": 115358.009 + }, + { + "epoch": 0.44319380935631375, + "grad_norm": 0.1532983034849167, + "learning_rate": 0.003257076194605791, + "loss": 3.1019330978393556, + "num_input_tokens_seen": 4293918720, + "step": 8190, + "train_runtime": 37222.2634, + "train_tokens_per_second": 115358.883 + }, + { + "epoch": 0.443734949538678, + "grad_norm": 0.17717613279819489, + "learning_rate": 0.0032532452020675763, + "loss": 3.099607467651367, + "num_input_tokens_seen": 4299161600, + "step": 8200, + "train_runtime": 37267.4247, + "train_tokens_per_second": 115359.772 + }, + { + "epoch": 0.4442760897210422, + "grad_norm": 0.15027566254138947, + "learning_rate": 0.00324941267314491, + "loss": 3.107021713256836, + "num_input_tokens_seen": 4304404480, + "step": 8210, + "train_runtime": 37312.5891, + "train_tokens_per_second": 115360.649 + }, + { + "epoch": 0.4448172299034065, + "grad_norm": 0.16586042940616608, + "learning_rate": 0.0032455786195383285, + "loss": 3.0993444442749025, + "num_input_tokens_seen": 4309647360, + "step": 8220, + "train_runtime": 37357.762, + "train_tokens_per_second": 115361.497 + }, + { + "epoch": 0.4453583700857707, + "grad_norm": 0.16227947175502777, + "learning_rate": 0.00324174305295302, + "loss": 3.0918336868286134, + "num_input_tokens_seen": 4314890240, + "step": 8230, + "train_runtime": 37402.9165, + "train_tokens_per_second": 115362.401 + }, + { + "epoch": 0.44589951026813496, + "grad_norm": 0.16855213046073914, + "learning_rate": 0.0032379059850987926, + "loss": 3.0997894287109373, + "num_input_tokens_seen": 4320133120, + "step": 8240, + "train_runtime": 37448.08, + "train_tokens_per_second": 115363.274 + }, + { + "epoch": 0.4464406504504992, + "grad_norm": 0.15484896302223206, + "learning_rate": 0.003234067427690039, + "loss": 3.0965702056884767, + "num_input_tokens_seen": 4325376000, + "step": 8250, + "train_runtime": 37493.242, + "train_tokens_per_second": 115364.15 + }, + { + "epoch": 0.4469817906328634, + "grad_norm": 0.16154596209526062, + "learning_rate": 0.0032302273924456966, + "loss": 3.0933055877685547, + "num_input_tokens_seen": 4330618880, + "step": 8260, + "train_runtime": 37538.3815, + "train_tokens_per_second": 115365.093 + }, + { + "epoch": 0.4475229308152277, + "grad_norm": 0.1575249284505844, + "learning_rate": 0.003226385891089219, + "loss": 3.0924747467041014, + "num_input_tokens_seen": 4335861760, + "step": 8270, + "train_runtime": 37583.5299, + "train_tokens_per_second": 115366.007 + }, + { + "epoch": 0.44806407099759193, + "grad_norm": 0.160599023103714, + "learning_rate": 0.0032225429353485296, + "loss": 3.096691131591797, + "num_input_tokens_seen": 4341104640, + "step": 8280, + "train_runtime": 37628.6879, + "train_tokens_per_second": 115366.889 + }, + { + "epoch": 0.44860521117995616, + "grad_norm": 0.15803949534893036, + "learning_rate": 0.003218698536955999, + "loss": 3.1002126693725587, + "num_input_tokens_seen": 4346347520, + "step": 8290, + "train_runtime": 37673.8359, + "train_tokens_per_second": 115367.799 + }, + { + "epoch": 0.4491463513623204, + "grad_norm": 0.1458759903907776, + "learning_rate": 0.0032148527076483963, + "loss": 3.0890472412109373, + "num_input_tokens_seen": 4351590400, + "step": 8300, + "train_runtime": 37718.9873, + "train_tokens_per_second": 115368.697 + }, + { + "epoch": 0.4496874915446846, + "grad_norm": 0.15396055579185486, + "learning_rate": 0.0032110054591668624, + "loss": 3.0894855499267577, + "num_input_tokens_seen": 4356833280, + "step": 8310, + "train_runtime": 37764.1751, + "train_tokens_per_second": 115369.481 + }, + { + "epoch": 0.4502286317270489, + "grad_norm": 0.15826280415058136, + "learning_rate": 0.0032071568032568704, + "loss": 3.1003223419189454, + "num_input_tokens_seen": 4362076160, + "step": 8320, + "train_runtime": 37809.3568, + "train_tokens_per_second": 115370.282 + }, + { + "epoch": 0.45076977190941314, + "grad_norm": 0.16446684300899506, + "learning_rate": 0.003203306751668188, + "loss": 3.093168258666992, + "num_input_tokens_seen": 4367319040, + "step": 8330, + "train_runtime": 37854.5343, + "train_tokens_per_second": 115371.094 + }, + { + "epoch": 0.45131091209177737, + "grad_norm": 0.1580137461423874, + "learning_rate": 0.0031994553161548474, + "loss": 3.101323699951172, + "num_input_tokens_seen": 4372561920, + "step": 8340, + "train_runtime": 37899.7112, + "train_tokens_per_second": 115371.906 + }, + { + "epoch": 0.4518520522741416, + "grad_norm": 0.15007439255714417, + "learning_rate": 0.003195602508475103, + "loss": 3.0974876403808596, + "num_input_tokens_seen": 4377804800, + "step": 8350, + "train_runtime": 37944.8699, + "train_tokens_per_second": 115372.771 + }, + { + "epoch": 0.45239319245650583, + "grad_norm": 0.16612504422664642, + "learning_rate": 0.0031917483403914, + "loss": 3.097567558288574, + "num_input_tokens_seen": 4383047680, + "step": 8360, + "train_runtime": 37990.0345, + "train_tokens_per_second": 115373.617 + }, + { + "epoch": 0.4529343326388701, + "grad_norm": 0.152009978890419, + "learning_rate": 0.0031878928236703354, + "loss": 3.09008674621582, + "num_input_tokens_seen": 4388290560, + "step": 8370, + "train_runtime": 38035.2108, + "train_tokens_per_second": 115374.425 + }, + { + "epoch": 0.45347547282123435, + "grad_norm": 0.14635391533374786, + "learning_rate": 0.003184035970082625, + "loss": 3.0835281372070313, + "num_input_tokens_seen": 4393533440, + "step": 8380, + "train_runtime": 38080.3814, + "train_tokens_per_second": 115375.248 + }, + { + "epoch": 0.4540166130035986, + "grad_norm": 0.16529026627540588, + "learning_rate": 0.0031801777914030657, + "loss": 3.0935291290283202, + "num_input_tokens_seen": 4398776320, + "step": 8390, + "train_runtime": 38125.5235, + "train_tokens_per_second": 115376.155 + }, + { + "epoch": 0.4545577531859628, + "grad_norm": 0.15716882050037384, + "learning_rate": 0.003176318299410499, + "loss": 3.0900102615356446, + "num_input_tokens_seen": 4404019200, + "step": 8400, + "train_runtime": 38175.1087, + "train_tokens_per_second": 115363.632 + }, + { + "epoch": 0.45509889336832704, + "grad_norm": 0.15623128414154053, + "learning_rate": 0.003172457505887777, + "loss": 3.0833271026611326, + "num_input_tokens_seen": 4409262080, + "step": 8410, + "train_runtime": 38220.3198, + "train_tokens_per_second": 115364.343 + }, + { + "epoch": 0.4556400335506913, + "grad_norm": 0.1591528207063675, + "learning_rate": 0.0031685954226217234, + "loss": 3.0901105880737303, + "num_input_tokens_seen": 4414504960, + "step": 8420, + "train_runtime": 38265.5513, + "train_tokens_per_second": 115364.99 + }, + { + "epoch": 0.45618117373305556, + "grad_norm": 0.16141286492347717, + "learning_rate": 0.003164732061403102, + "loss": 3.0906259536743166, + "num_input_tokens_seen": 4419747840, + "step": 8430, + "train_runtime": 38310.7985, + "train_tokens_per_second": 115365.589 + }, + { + "epoch": 0.4567223139154198, + "grad_norm": 0.15204599499702454, + "learning_rate": 0.0031608674340265768, + "loss": 3.084097671508789, + "num_input_tokens_seen": 4424990720, + "step": 8440, + "train_runtime": 38356.0563, + "train_tokens_per_second": 115366.154 + }, + { + "epoch": 0.457263454097784, + "grad_norm": 0.15592141449451447, + "learning_rate": 0.003157001552290677, + "loss": 3.0875980377197267, + "num_input_tokens_seen": 4430233600, + "step": 8450, + "train_runtime": 38401.2772, + "train_tokens_per_second": 115366.829 + }, + { + "epoch": 0.45780459428014825, + "grad_norm": 0.15805137157440186, + "learning_rate": 0.0031531344279977615, + "loss": 3.0840667724609374, + "num_input_tokens_seen": 4435476480, + "step": 8460, + "train_runtime": 38446.5182, + "train_tokens_per_second": 115367.443 + }, + { + "epoch": 0.45834573446251253, + "grad_norm": 0.1569671630859375, + "learning_rate": 0.003149266072953983, + "loss": 3.095382308959961, + "num_input_tokens_seen": 4440719360, + "step": 8470, + "train_runtime": 38491.7485, + "train_tokens_per_second": 115368.086 + }, + { + "epoch": 0.45888687464487676, + "grad_norm": 0.15263330936431885, + "learning_rate": 0.0031453964989692517, + "loss": 3.0909893035888674, + "num_input_tokens_seen": 4445962240, + "step": 8480, + "train_runtime": 38536.9781, + "train_tokens_per_second": 115368.73 + }, + { + "epoch": 0.459428014827241, + "grad_norm": 0.16741898655891418, + "learning_rate": 0.0031415257178571986, + "loss": 3.091363525390625, + "num_input_tokens_seen": 4451205120, + "step": 8490, + "train_runtime": 38582.2145, + "train_tokens_per_second": 115369.353 + }, + { + "epoch": 0.4599691550096052, + "grad_norm": 0.1621858924627304, + "learning_rate": 0.0031376537414351414, + "loss": 3.0860706329345704, + "num_input_tokens_seen": 4456448000, + "step": 8500, + "train_runtime": 38627.4547, + "train_tokens_per_second": 115369.963 + }, + { + "epoch": 0.4599691550096052, + "eval_loss": 3.044252634048462, + "eval_runtime": 1.9941, + "eval_samples_per_second": 250.745, + "eval_steps_per_second": 4.012, + "num_input_tokens_seen": 4456448000, + "step": 8500 + }, + { + "epoch": 0.46051029519196945, + "grad_norm": 0.1537708044052124, + "learning_rate": 0.0031337805815240443, + "loss": 3.0971357345581056, + "num_input_tokens_seen": 4461690880, + "step": 8510, + "train_runtime": 38674.6947, + "train_tokens_per_second": 115364.605 + }, + { + "epoch": 0.46105143537433374, + "grad_norm": 0.1561785489320755, + "learning_rate": 0.0031299062499484886, + "loss": 3.095275115966797, + "num_input_tokens_seen": 4466933760, + "step": 8520, + "train_runtime": 38719.9322, + "train_tokens_per_second": 115365.227 + }, + { + "epoch": 0.46159257555669797, + "grad_norm": 0.15904352068901062, + "learning_rate": 0.0031260307585366277, + "loss": 3.093882942199707, + "num_input_tokens_seen": 4472176640, + "step": 8530, + "train_runtime": 38765.1761, + "train_tokens_per_second": 115365.828 + }, + { + "epoch": 0.4621337157390622, + "grad_norm": 0.1586320400238037, + "learning_rate": 0.00312215411912016, + "loss": 3.0862545013427733, + "num_input_tokens_seen": 4477419520, + "step": 8540, + "train_runtime": 38810.3531, + "train_tokens_per_second": 115366.627 + }, + { + "epoch": 0.46267485592142643, + "grad_norm": 0.15955059230327606, + "learning_rate": 0.003118276343534288, + "loss": 3.09029598236084, + "num_input_tokens_seen": 4482662400, + "step": 8550, + "train_runtime": 38855.5252, + "train_tokens_per_second": 115367.438 + }, + { + "epoch": 0.46321599610379066, + "grad_norm": 0.16844794154167175, + "learning_rate": 0.0031143974436176804, + "loss": 3.08276252746582, + "num_input_tokens_seen": 4487905280, + "step": 8560, + "train_runtime": 38900.7107, + "train_tokens_per_second": 115368.208 + }, + { + "epoch": 0.46375713628615495, + "grad_norm": 0.15490221977233887, + "learning_rate": 0.003110517431212442, + "loss": 3.096157455444336, + "num_input_tokens_seen": 4493148160, + "step": 8570, + "train_runtime": 38945.895, + "train_tokens_per_second": 115368.979 + }, + { + "epoch": 0.4642982764685192, + "grad_norm": 0.1643703430891037, + "learning_rate": 0.0031066363181640705, + "loss": 3.094961929321289, + "num_input_tokens_seen": 4498391040, + "step": 8580, + "train_runtime": 38991.0775, + "train_tokens_per_second": 115369.754 + }, + { + "epoch": 0.4648394166508834, + "grad_norm": 0.16452111303806305, + "learning_rate": 0.003102754116321427, + "loss": 3.0949285507202147, + "num_input_tokens_seen": 4503633920, + "step": 8590, + "train_runtime": 39036.2496, + "train_tokens_per_second": 115370.558 + }, + { + "epoch": 0.46538055683324764, + "grad_norm": 0.15409614145755768, + "learning_rate": 0.003098870837536694, + "loss": 3.083492660522461, + "num_input_tokens_seen": 4508876800, + "step": 8600, + "train_runtime": 39081.4455, + "train_tokens_per_second": 115371.29 + }, + { + "epoch": 0.46592169701561187, + "grad_norm": 0.16283227503299713, + "learning_rate": 0.0030949864936653444, + "loss": 3.0859600067138673, + "num_input_tokens_seen": 4514119680, + "step": 8610, + "train_runtime": 39126.6271, + "train_tokens_per_second": 115372.063 + }, + { + "epoch": 0.46646283719797615, + "grad_norm": 0.15932060778141022, + "learning_rate": 0.0030911010965660995, + "loss": 3.0858314514160154, + "num_input_tokens_seen": 4519362560, + "step": 8620, + "train_runtime": 39171.8041, + "train_tokens_per_second": 115372.847 + }, + { + "epoch": 0.4670039773803404, + "grad_norm": 0.15321630239486694, + "learning_rate": 0.0030872146581008993, + "loss": 3.0855281829833983, + "num_input_tokens_seen": 4524605440, + "step": 8630, + "train_runtime": 39217.0067, + "train_tokens_per_second": 115373.554 + }, + { + "epoch": 0.4675451175627046, + "grad_norm": 0.15142542123794556, + "learning_rate": 0.0030833271901348604, + "loss": 3.0922718048095703, + "num_input_tokens_seen": 4529848320, + "step": 8640, + "train_runtime": 39262.1839, + "train_tokens_per_second": 115374.334 + }, + { + "epoch": 0.46808625774506885, + "grad_norm": 0.15679921209812164, + "learning_rate": 0.0030794387045362448, + "loss": 3.089971923828125, + "num_input_tokens_seen": 4535091200, + "step": 8650, + "train_runtime": 39307.3714, + "train_tokens_per_second": 115375.082 + }, + { + "epoch": 0.4686273979274331, + "grad_norm": 0.149771049618721, + "learning_rate": 0.0030755492131764196, + "loss": 3.0910947799682615, + "num_input_tokens_seen": 4540334080, + "step": 8660, + "train_runtime": 39352.6168, + "train_tokens_per_second": 115375.659 + }, + { + "epoch": 0.46916853810979736, + "grad_norm": 0.15804412961006165, + "learning_rate": 0.003071658727929823, + "loss": 3.096923065185547, + "num_input_tokens_seen": 4545576960, + "step": 8670, + "train_runtime": 39397.8863, + "train_tokens_per_second": 115376.163 + }, + { + "epoch": 0.4697096782921616, + "grad_norm": 0.17401130497455597, + "learning_rate": 0.003067767260673929, + "loss": 3.0941158294677735, + "num_input_tokens_seen": 4550819840, + "step": 8680, + "train_runtime": 39443.148, + "train_tokens_per_second": 115376.69 + }, + { + "epoch": 0.4702508184745258, + "grad_norm": 0.16347981989383698, + "learning_rate": 0.003063874823289205, + "loss": 3.0893718719482424, + "num_input_tokens_seen": 4556062720, + "step": 8690, + "train_runtime": 39488.3966, + "train_tokens_per_second": 115377.253 + }, + { + "epoch": 0.47079195865689005, + "grad_norm": 0.16059865057468414, + "learning_rate": 0.003059981427659086, + "loss": 3.0792430877685546, + "num_input_tokens_seen": 4561305600, + "step": 8700, + "train_runtime": 39533.6411, + "train_tokens_per_second": 115377.827 + }, + { + "epoch": 0.4713330988392543, + "grad_norm": 0.15623228251934052, + "learning_rate": 0.0030560870856699285, + "loss": 3.0796392440795897, + "num_input_tokens_seen": 4566548480, + "step": 8710, + "train_runtime": 39578.8987, + "train_tokens_per_second": 115378.361 + }, + { + "epoch": 0.47187423902161857, + "grad_norm": 0.17271380126476288, + "learning_rate": 0.003052191809210979, + "loss": 3.0749179840087892, + "num_input_tokens_seen": 4571791360, + "step": 8720, + "train_runtime": 39624.1426, + "train_tokens_per_second": 115378.935 + }, + { + "epoch": 0.4724153792039828, + "grad_norm": 0.1617797613143921, + "learning_rate": 0.0030482956101743385, + "loss": 3.077177047729492, + "num_input_tokens_seen": 4577034240, + "step": 8730, + "train_runtime": 39669.3874, + "train_tokens_per_second": 115379.504 + }, + { + "epoch": 0.47295651938634703, + "grad_norm": 0.15339480340480804, + "learning_rate": 0.0030443985004549234, + "loss": 3.0854717254638673, + "num_input_tokens_seen": 4582277120, + "step": 8740, + "train_runtime": 39714.6736, + "train_tokens_per_second": 115379.952 + }, + { + "epoch": 0.47349765956871126, + "grad_norm": 0.1538633406162262, + "learning_rate": 0.00304050049195043, + "loss": 3.0906457901000977, + "num_input_tokens_seen": 4587520000, + "step": 8750, + "train_runtime": 39759.8518, + "train_tokens_per_second": 115380.712 + }, + { + "epoch": 0.4740387997510755, + "grad_norm": 0.16446056962013245, + "learning_rate": 0.0030366015965612976, + "loss": 3.0834827423095703, + "num_input_tokens_seen": 4592762880, + "step": 8760, + "train_runtime": 39805.0284, + "train_tokens_per_second": 115381.475 + }, + { + "epoch": 0.4745799399334398, + "grad_norm": 0.15852907299995422, + "learning_rate": 0.003032701826190677, + "loss": 3.077737808227539, + "num_input_tokens_seen": 4598005760, + "step": 8770, + "train_runtime": 39850.2201, + "train_tokens_per_second": 115382.192 + }, + { + "epoch": 0.475121080115804, + "grad_norm": 0.15762916207313538, + "learning_rate": 0.003028801192744386, + "loss": 3.074782943725586, + "num_input_tokens_seen": 4603248640, + "step": 8780, + "train_runtime": 39899.2419, + "train_tokens_per_second": 115371.832 + }, + { + "epoch": 0.47566222029816824, + "grad_norm": 0.1619185209274292, + "learning_rate": 0.0030248997081308788, + "loss": 3.0825977325439453, + "num_input_tokens_seen": 4608491520, + "step": 8790, + "train_runtime": 39944.5192, + "train_tokens_per_second": 115372.312 + }, + { + "epoch": 0.47620336048053247, + "grad_norm": 0.16285711526870728, + "learning_rate": 0.0030209973842612097, + "loss": 3.080776405334473, + "num_input_tokens_seen": 4613734400, + "step": 8800, + "train_runtime": 39989.7251, + "train_tokens_per_second": 115372.996 + }, + { + "epoch": 0.4767445006628967, + "grad_norm": 0.17198577523231506, + "learning_rate": 0.003017094233048994, + "loss": 3.0829303741455076, + "num_input_tokens_seen": 4618977280, + "step": 8810, + "train_runtime": 40034.9384, + "train_tokens_per_second": 115373.658 + }, + { + "epoch": 0.477285640845261, + "grad_norm": 0.16893431544303894, + "learning_rate": 0.003013190266410372, + "loss": 3.0930507659912108, + "num_input_tokens_seen": 4624220160, + "step": 8820, + "train_runtime": 40080.146, + "train_tokens_per_second": 115374.334 + }, + { + "epoch": 0.4778267810276252, + "grad_norm": 0.1534847915172577, + "learning_rate": 0.003009285496263973, + "loss": 3.086047554016113, + "num_input_tokens_seen": 4629463040, + "step": 8830, + "train_runtime": 40125.3234, + "train_tokens_per_second": 115375.096 + }, + { + "epoch": 0.47836792120998944, + "grad_norm": 0.16068360209465027, + "learning_rate": 0.003005379934530884, + "loss": 3.0864025115966798, + "num_input_tokens_seen": 4634705920, + "step": 8840, + "train_runtime": 40170.4754, + "train_tokens_per_second": 115375.929 + }, + { + "epoch": 0.4789090613923537, + "grad_norm": 0.1436556577682495, + "learning_rate": 0.003001473593134602, + "loss": 3.0830524444580076, + "num_input_tokens_seen": 4639948800, + "step": 8850, + "train_runtime": 40215.6345, + "train_tokens_per_second": 115376.74 + }, + { + "epoch": 0.4794502015747179, + "grad_norm": 0.14522488415241241, + "learning_rate": 0.0029975664840010104, + "loss": 3.0799121856689453, + "num_input_tokens_seen": 4645191680, + "step": 8860, + "train_runtime": 40260.7882, + "train_tokens_per_second": 115377.564 + }, + { + "epoch": 0.4799913417570822, + "grad_norm": 0.16195201873779297, + "learning_rate": 0.002993658619058331, + "loss": 3.071552848815918, + "num_input_tokens_seen": 4650434560, + "step": 8870, + "train_runtime": 40305.9418, + "train_tokens_per_second": 115378.387 + }, + { + "epoch": 0.4805324819394464, + "grad_norm": 0.14948424696922302, + "learning_rate": 0.0029897500102370974, + "loss": 3.0818138122558594, + "num_input_tokens_seen": 4655677440, + "step": 8880, + "train_runtime": 40351.0976, + "train_tokens_per_second": 115379.202 + }, + { + "epoch": 0.48107362212181065, + "grad_norm": 0.1461019665002823, + "learning_rate": 0.0029858406694701117, + "loss": 3.082274627685547, + "num_input_tokens_seen": 4660920320, + "step": 8890, + "train_runtime": 40396.2463, + "train_tokens_per_second": 115380.035 + }, + { + "epoch": 0.4816147623041749, + "grad_norm": 0.1501043438911438, + "learning_rate": 0.0029819306086924127, + "loss": 3.083462142944336, + "num_input_tokens_seen": 4666163200, + "step": 8900, + "train_runtime": 40441.3988, + "train_tokens_per_second": 115380.856 + }, + { + "epoch": 0.4821559024865391, + "grad_norm": 0.15929782390594482, + "learning_rate": 0.002978019839841233, + "loss": 3.0869064331054688, + "num_input_tokens_seen": 4671406080, + "step": 8910, + "train_runtime": 40486.5499, + "train_tokens_per_second": 115381.678 + }, + { + "epoch": 0.4826970426689034, + "grad_norm": 0.15982982516288757, + "learning_rate": 0.002974108374855974, + "loss": 3.082635688781738, + "num_input_tokens_seen": 4676648960, + "step": 8920, + "train_runtime": 40531.6946, + "train_tokens_per_second": 115382.518 + }, + { + "epoch": 0.48323818285126763, + "grad_norm": 0.15398921072483063, + "learning_rate": 0.0029701962256781555, + "loss": 3.069881820678711, + "num_input_tokens_seen": 4681891840, + "step": 8930, + "train_runtime": 40576.8387, + "train_tokens_per_second": 115383.356 + }, + { + "epoch": 0.48377932303363186, + "grad_norm": 0.16303351521492004, + "learning_rate": 0.0029662834042513903, + "loss": 3.078609085083008, + "num_input_tokens_seen": 4687134720, + "step": 8940, + "train_runtime": 40621.9936, + "train_tokens_per_second": 115384.163 + }, + { + "epoch": 0.4843204632159961, + "grad_norm": 0.16261766850948334, + "learning_rate": 0.0029623699225213417, + "loss": 3.072034454345703, + "num_input_tokens_seen": 4692377600, + "step": 8950, + "train_runtime": 40667.1772, + "train_tokens_per_second": 115384.886 + }, + { + "epoch": 0.4848616033983603, + "grad_norm": 0.14818759262561798, + "learning_rate": 0.002958455792435689, + "loss": 3.077336883544922, + "num_input_tokens_seen": 4697620480, + "step": 8960, + "train_runtime": 40712.3367, + "train_tokens_per_second": 115385.676 + }, + { + "epoch": 0.4854027435807246, + "grad_norm": 0.1536484807729721, + "learning_rate": 0.002954541025944093, + "loss": 3.0622703552246096, + "num_input_tokens_seen": 4702863360, + "step": 8970, + "train_runtime": 40757.516, + "train_tokens_per_second": 115386.408 + }, + { + "epoch": 0.48594388376308884, + "grad_norm": 0.1563226282596588, + "learning_rate": 0.002950625634998154, + "loss": 3.0665721893310547, + "num_input_tokens_seen": 4708106240, + "step": 8980, + "train_runtime": 40802.7032, + "train_tokens_per_second": 115387.116 + }, + { + "epoch": 0.48648502394545307, + "grad_norm": 0.15256533026695251, + "learning_rate": 0.0029467096315513802, + "loss": 3.0700511932373047, + "num_input_tokens_seen": 4713349120, + "step": 8990, + "train_runtime": 40847.8776, + "train_tokens_per_second": 115387.859 + }, + { + "epoch": 0.4870261641278173, + "grad_norm": 0.1605551838874817, + "learning_rate": 0.0029427930275591515, + "loss": 3.076490592956543, + "num_input_tokens_seen": 4718592000, + "step": 9000, + "train_runtime": 40893.0303, + "train_tokens_per_second": 115388.661 + }, + { + "epoch": 0.4870261641278173, + "eval_loss": 3.034029483795166, + "eval_runtime": 1.9847, + "eval_samples_per_second": 251.93, + "eval_steps_per_second": 4.031, + "num_input_tokens_seen": 4718592000, + "step": 9000 + }, + { + "epoch": 0.4875673043101815, + "grad_norm": 0.16099503636360168, + "learning_rate": 0.0029388758349786787, + "loss": 3.081180953979492, + "num_input_tokens_seen": 4723834880, + "step": 9010, + "train_runtime": 40942.5164, + "train_tokens_per_second": 115377.248 + }, + { + "epoch": 0.4881084444925458, + "grad_norm": 0.1605864018201828, + "learning_rate": 0.0029349580657689707, + "loss": 3.0802078247070312, + "num_input_tokens_seen": 4729077760, + "step": 9020, + "train_runtime": 40987.6712, + "train_tokens_per_second": 115378.054 + }, + { + "epoch": 0.48864958467491004, + "grad_norm": 0.15125182271003723, + "learning_rate": 0.0029310397318907965, + "loss": 3.090005111694336, + "num_input_tokens_seen": 4734320640, + "step": 9030, + "train_runtime": 41032.8355, + "train_tokens_per_second": 115378.832 + }, + { + "epoch": 0.4891907248572743, + "grad_norm": 0.16026070713996887, + "learning_rate": 0.002927120845306649, + "loss": 3.087236785888672, + "num_input_tokens_seen": 4739563520, + "step": 9040, + "train_runtime": 41077.9854, + "train_tokens_per_second": 115379.649 + }, + { + "epoch": 0.4897318650396385, + "grad_norm": 0.1533481776714325, + "learning_rate": 0.0029232014179807098, + "loss": 3.0772159576416014, + "num_input_tokens_seen": 4744806400, + "step": 9050, + "train_runtime": 41123.1477, + "train_tokens_per_second": 115380.429 + }, + { + "epoch": 0.49027300522200273, + "grad_norm": 0.16315831243991852, + "learning_rate": 0.002919281461878809, + "loss": 3.080950927734375, + "num_input_tokens_seen": 4750049280, + "step": 9060, + "train_runtime": 41168.3038, + "train_tokens_per_second": 115381.224 + }, + { + "epoch": 0.490814145404367, + "grad_norm": 0.16064807772636414, + "learning_rate": 0.0029153609889683934, + "loss": 3.077931594848633, + "num_input_tokens_seen": 4755292160, + "step": 9070, + "train_runtime": 41213.4562, + "train_tokens_per_second": 115382.028 + }, + { + "epoch": 0.49135528558673125, + "grad_norm": 0.15968522429466248, + "learning_rate": 0.0029114400112184857, + "loss": 3.0715621948242187, + "num_input_tokens_seen": 4760535040, + "step": 9080, + "train_runtime": 41258.6304, + "train_tokens_per_second": 115382.769 + }, + { + "epoch": 0.4918964257690955, + "grad_norm": 0.21646282076835632, + "learning_rate": 0.0029075185405996497, + "loss": 3.070268249511719, + "num_input_tokens_seen": 4765777920, + "step": 9090, + "train_runtime": 41303.7917, + "train_tokens_per_second": 115383.545 + }, + { + "epoch": 0.4924375659514597, + "grad_norm": 0.14861001074314117, + "learning_rate": 0.0029035965890839566, + "loss": 3.0772144317626955, + "num_input_tokens_seen": 4771020800, + "step": 9100, + "train_runtime": 41348.9385, + "train_tokens_per_second": 115384.36 + }, + { + "epoch": 0.49297870613382394, + "grad_norm": 0.15690362453460693, + "learning_rate": 0.0028996741686449427, + "loss": 3.079457092285156, + "num_input_tokens_seen": 4776263680, + "step": 9110, + "train_runtime": 41394.0963, + "train_tokens_per_second": 115385.142 + }, + { + "epoch": 0.4935198463161882, + "grad_norm": 0.1561357080936432, + "learning_rate": 0.0028957512912575777, + "loss": 3.081951141357422, + "num_input_tokens_seen": 4781506560, + "step": 9120, + "train_runtime": 41439.2561, + "train_tokens_per_second": 115385.917 + }, + { + "epoch": 0.49406098649855246, + "grad_norm": 0.15572723746299744, + "learning_rate": 0.002891827968898225, + "loss": 3.0684499740600586, + "num_input_tokens_seen": 4786749440, + "step": 9130, + "train_runtime": 41484.4245, + "train_tokens_per_second": 115386.666 + }, + { + "epoch": 0.4946021266809167, + "grad_norm": 0.14807617664337158, + "learning_rate": 0.0028879042135446092, + "loss": 3.0712486267089845, + "num_input_tokens_seen": 4791992320, + "step": 9140, + "train_runtime": 41529.5787, + "train_tokens_per_second": 115387.453 + }, + { + "epoch": 0.4951432668632809, + "grad_norm": 0.15159912407398224, + "learning_rate": 0.0028839800371757724, + "loss": 3.0685661315917967, + "num_input_tokens_seen": 4797235200, + "step": 9150, + "train_runtime": 41574.7343, + "train_tokens_per_second": 115388.235 + }, + { + "epoch": 0.49568440704564515, + "grad_norm": 0.15283788740634918, + "learning_rate": 0.0028800554517720467, + "loss": 3.066938591003418, + "num_input_tokens_seen": 4802478080, + "step": 9160, + "train_runtime": 41623.4446, + "train_tokens_per_second": 115379.16 + }, + { + "epoch": 0.49622554722800943, + "grad_norm": 0.14532612264156342, + "learning_rate": 0.0028761304693150093, + "loss": 3.0726764678955076, + "num_input_tokens_seen": 4807720960, + "step": 9170, + "train_runtime": 41668.6161, + "train_tokens_per_second": 115379.905 + }, + { + "epoch": 0.49676668741037366, + "grad_norm": 0.15642227232456207, + "learning_rate": 0.0028722051017874514, + "loss": 3.075974464416504, + "num_input_tokens_seen": 4812963840, + "step": 9180, + "train_runtime": 41713.7758, + "train_tokens_per_second": 115380.681 + }, + { + "epoch": 0.4973078275927379, + "grad_norm": 0.15629522502422333, + "learning_rate": 0.00286827936117334, + "loss": 3.0699131011962892, + "num_input_tokens_seen": 4818206720, + "step": 9190, + "train_runtime": 41758.9432, + "train_tokens_per_second": 115381.433 + }, + { + "epoch": 0.4978489677751021, + "grad_norm": 0.15175525844097137, + "learning_rate": 0.00286435325945778, + "loss": 3.0690542221069337, + "num_input_tokens_seen": 4823449600, + "step": 9200, + "train_runtime": 41804.1291, + "train_tokens_per_second": 115382.133 + }, + { + "epoch": 0.49839010795746636, + "grad_norm": 0.14598695933818817, + "learning_rate": 0.0028604268086269793, + "loss": 3.072031021118164, + "num_input_tokens_seen": 4828692480, + "step": 9210, + "train_runtime": 41849.2968, + "train_tokens_per_second": 115382.882 + }, + { + "epoch": 0.49893124813983064, + "grad_norm": 0.14812366664409637, + "learning_rate": 0.0028565000206682125, + "loss": 3.074822998046875, + "num_input_tokens_seen": 4833935360, + "step": 9220, + "train_runtime": 41894.4632, + "train_tokens_per_second": 115383.633 + }, + { + "epoch": 0.49947238832219487, + "grad_norm": 0.1581128090620041, + "learning_rate": 0.0028525729075697813, + "loss": 3.071183967590332, + "num_input_tokens_seen": 4839178240, + "step": 9230, + "train_runtime": 41939.6385, + "train_tokens_per_second": 115384.357 + }, + { + "epoch": 0.5000135285045592, + "grad_norm": 0.160576730966568, + "learning_rate": 0.002848645481320983, + "loss": 3.079146385192871, + "num_input_tokens_seen": 4844421120, + "step": 9240, + "train_runtime": 41984.8089, + "train_tokens_per_second": 115385.094 + }, + { + "epoch": 0.5005546686869233, + "grad_norm": 0.1423659771680832, + "learning_rate": 0.002844717753912068, + "loss": 3.0759227752685545, + "num_input_tokens_seen": 4849664000, + "step": 9250, + "train_runtime": 42029.9847, + "train_tokens_per_second": 115385.814 + }, + { + "epoch": 0.5010958088692876, + "grad_norm": 0.14177994430065155, + "learning_rate": 0.0028407897373342074, + "loss": 3.076811599731445, + "num_input_tokens_seen": 4854906880, + "step": 9260, + "train_runtime": 42075.1549, + "train_tokens_per_second": 115386.548 + }, + { + "epoch": 0.5016369490516518, + "grad_norm": 0.14039497077465057, + "learning_rate": 0.002836861443579456, + "loss": 3.0762613296508787, + "num_input_tokens_seen": 4860149760, + "step": 9270, + "train_runtime": 42120.3369, + "train_tokens_per_second": 115387.248 + }, + { + "epoch": 0.5021780892340161, + "grad_norm": 0.15003980696201324, + "learning_rate": 0.0028329328846407125, + "loss": 3.0661956787109377, + "num_input_tokens_seen": 4865392640, + "step": 9280, + "train_runtime": 42165.525, + "train_tokens_per_second": 115387.93 + }, + { + "epoch": 0.5027192294163804, + "grad_norm": 0.1639668196439743, + "learning_rate": 0.0028290040725116876, + "loss": 3.077253723144531, + "num_input_tokens_seen": 4870635520, + "step": 9290, + "train_runtime": 42210.7142, + "train_tokens_per_second": 115388.607 + }, + { + "epoch": 0.5032603695987445, + "grad_norm": 0.15424971282482147, + "learning_rate": 0.002825075019186865, + "loss": 3.0679557800292967, + "num_input_tokens_seen": 4875878400, + "step": 9300, + "train_runtime": 42255.9028, + "train_tokens_per_second": 115389.285 + }, + { + "epoch": 0.5038015097811088, + "grad_norm": 0.1511068195104599, + "learning_rate": 0.0028211457366614607, + "loss": 3.0695865631103514, + "num_input_tokens_seen": 4881121280, + "step": 9310, + "train_runtime": 42301.0768, + "train_tokens_per_second": 115390.001 + }, + { + "epoch": 0.504342649963473, + "grad_norm": 0.15356752276420593, + "learning_rate": 0.002817216236931397, + "loss": 3.073322296142578, + "num_input_tokens_seen": 4886364160, + "step": 9320, + "train_runtime": 42346.2459, + "train_tokens_per_second": 115390.728 + }, + { + "epoch": 0.5048837901458373, + "grad_norm": 0.14986173808574677, + "learning_rate": 0.002813286531993253, + "loss": 3.07531681060791, + "num_input_tokens_seen": 4891607040, + "step": 9330, + "train_runtime": 42391.4328, + "train_tokens_per_second": 115391.406 + }, + { + "epoch": 0.5054249303282016, + "grad_norm": 0.14537614583969116, + "learning_rate": 0.0028093566338442395, + "loss": 3.0746026992797852, + "num_input_tokens_seen": 4896849920, + "step": 9340, + "train_runtime": 42436.5896, + "train_tokens_per_second": 115392.164 + }, + { + "epoch": 0.5059660705105657, + "grad_norm": 0.15007568895816803, + "learning_rate": 0.0028054265544821522, + "loss": 3.0845333099365235, + "num_input_tokens_seen": 4902092800, + "step": 9350, + "train_runtime": 42481.7468, + "train_tokens_per_second": 115392.92 + }, + { + "epoch": 0.50650721069293, + "grad_norm": 0.15155982971191406, + "learning_rate": 0.0028014963059053446, + "loss": 3.0744888305664064, + "num_input_tokens_seen": 4907335680, + "step": 9360, + "train_runtime": 42526.8939, + "train_tokens_per_second": 115393.701 + }, + { + "epoch": 0.5070483508752942, + "grad_norm": 0.15760909020900726, + "learning_rate": 0.002797565900112684, + "loss": 3.0650793075561524, + "num_input_tokens_seen": 4912578560, + "step": 9370, + "train_runtime": 42572.0476, + "train_tokens_per_second": 115394.463 + }, + { + "epoch": 0.5075894910576585, + "grad_norm": 0.156438410282135, + "learning_rate": 0.0027936353491035183, + "loss": 3.0668895721435545, + "num_input_tokens_seen": 4917821440, + "step": 9380, + "train_runtime": 42617.1956, + "train_tokens_per_second": 115395.238 + }, + { + "epoch": 0.5081306312400228, + "grad_norm": 0.16406750679016113, + "learning_rate": 0.0027897046648776395, + "loss": 3.061408042907715, + "num_input_tokens_seen": 4923064320, + "step": 9390, + "train_runtime": 42662.3399, + "train_tokens_per_second": 115396.022 + }, + { + "epoch": 0.508671771422387, + "grad_norm": 0.13937264680862427, + "learning_rate": 0.002785773859435245, + "loss": 3.069793128967285, + "num_input_tokens_seen": 4928307200, + "step": 9400, + "train_runtime": 42707.5213, + "train_tokens_per_second": 115396.704 + }, + { + "epoch": 0.5092129116047512, + "grad_norm": 0.15395483374595642, + "learning_rate": 0.0027818429447769044, + "loss": 3.071869659423828, + "num_input_tokens_seen": 4933550080, + "step": 9410, + "train_runtime": 42752.6998, + "train_tokens_per_second": 115397.392 + }, + { + "epoch": 0.5097540517871154, + "grad_norm": 0.15119874477386475, + "learning_rate": 0.0027779119329035167, + "loss": 3.067423629760742, + "num_input_tokens_seen": 4938792960, + "step": 9420, + "train_runtime": 42797.8655, + "train_tokens_per_second": 115398.114 + }, + { + "epoch": 0.5102951919694797, + "grad_norm": 0.1615118384361267, + "learning_rate": 0.002773980835816284, + "loss": 3.0653512954711912, + "num_input_tokens_seen": 4944035840, + "step": 9430, + "train_runtime": 42843.0316, + "train_tokens_per_second": 115398.833 + }, + { + "epoch": 0.510836332151844, + "grad_norm": 0.16538918018341064, + "learning_rate": 0.0027700496655166614, + "loss": 3.067237663269043, + "num_input_tokens_seen": 4949278720, + "step": 9440, + "train_runtime": 42888.2044, + "train_tokens_per_second": 115399.532 + }, + { + "epoch": 0.5113774723342082, + "grad_norm": 0.14385169744491577, + "learning_rate": 0.002766118434006332, + "loss": 3.078049087524414, + "num_input_tokens_seen": 4954521600, + "step": 9450, + "train_runtime": 42933.3584, + "train_tokens_per_second": 115400.281 + }, + { + "epoch": 0.5119186125165724, + "grad_norm": 0.15073780715465546, + "learning_rate": 0.0027621871532871657, + "loss": 3.06368350982666, + "num_input_tokens_seen": 4959764480, + "step": 9460, + "train_runtime": 42978.5149, + "train_tokens_per_second": 115401.02 + }, + { + "epoch": 0.5124597526989366, + "grad_norm": 0.15621446073055267, + "learning_rate": 0.0027582558353611802, + "loss": 3.0653354644775392, + "num_input_tokens_seen": 4965007360, + "step": 9470, + "train_runtime": 43023.685, + "train_tokens_per_second": 115401.722 + }, + { + "epoch": 0.5130008928813009, + "grad_norm": 0.15570929646492004, + "learning_rate": 0.0027543244922305105, + "loss": 3.0613819122314454, + "num_input_tokens_seen": 4970250240, + "step": 9480, + "train_runtime": 43068.8573, + "train_tokens_per_second": 115402.417 + }, + { + "epoch": 0.5135420330636652, + "grad_norm": 0.1389538198709488, + "learning_rate": 0.0027503931358973644, + "loss": 3.0687282562255858, + "num_input_tokens_seen": 4975493120, + "step": 9490, + "train_runtime": 43114.0132, + "train_tokens_per_second": 115403.154 + }, + { + "epoch": 0.5140831732460294, + "grad_norm": 0.15439286828041077, + "learning_rate": 0.002746461778363992, + "loss": 3.0685733795166015, + "num_input_tokens_seen": 4980736000, + "step": 9500, + "train_runtime": 43159.1877, + "train_tokens_per_second": 115403.84 + }, + { + "epoch": 0.5140831732460294, + "eval_loss": 3.021251916885376, + "eval_runtime": 1.9829, + "eval_samples_per_second": 252.155, + "eval_steps_per_second": 4.034, + "num_input_tokens_seen": 4980736000, + "step": 9500 + }, + { + "epoch": 0.5146243134283937, + "grad_norm": 0.16106902062892914, + "learning_rate": 0.0027425304316326484, + "loss": 3.076310729980469, + "num_input_tokens_seen": 4985978880, + "step": 9510, + "train_runtime": 43206.3354, + "train_tokens_per_second": 115399.254 + }, + { + "epoch": 0.5151654536107578, + "grad_norm": 0.16451045870780945, + "learning_rate": 0.0027385991077055532, + "loss": 3.0650386810302734, + "num_input_tokens_seen": 4991221760, + "step": 9520, + "train_runtime": 43251.4993, + "train_tokens_per_second": 115399.971 + }, + { + "epoch": 0.5157065937931221, + "grad_norm": 0.141593337059021, + "learning_rate": 0.002734667818584858, + "loss": 3.0678850173950196, + "num_input_tokens_seen": 4996464640, + "step": 9530, + "train_runtime": 43296.675, + "train_tokens_per_second": 115400.655 + }, + { + "epoch": 0.5162477339754864, + "grad_norm": 0.15153639018535614, + "learning_rate": 0.002730736576272606, + "loss": 3.0637826919555664, + "num_input_tokens_seen": 5001707520, + "step": 9540, + "train_runtime": 43345.4631, + "train_tokens_per_second": 115391.719 + }, + { + "epoch": 0.5167888741578506, + "grad_norm": 0.1510300487279892, + "learning_rate": 0.0027268053927707015, + "loss": 3.066213607788086, + "num_input_tokens_seen": 5006950400, + "step": 9550, + "train_runtime": 43390.6485, + "train_tokens_per_second": 115392.385 + }, + { + "epoch": 0.5173300143402149, + "grad_norm": 0.14986655116081238, + "learning_rate": 0.0027228742800808657, + "loss": 3.069229507446289, + "num_input_tokens_seen": 5012193280, + "step": 9560, + "train_runtime": 43435.8008, + "train_tokens_per_second": 115393.136 + }, + { + "epoch": 0.517871154522579, + "grad_norm": 0.15248462557792664, + "learning_rate": 0.002718943250204604, + "loss": 3.0567092895507812, + "num_input_tokens_seen": 5017436160, + "step": 9570, + "train_runtime": 43480.9543, + "train_tokens_per_second": 115393.883 + }, + { + "epoch": 0.5184122947049433, + "grad_norm": 0.1401718556880951, + "learning_rate": 0.0027150123151431717, + "loss": 3.0642112731933593, + "num_input_tokens_seen": 5022679040, + "step": 9580, + "train_runtime": 43526.1319, + "train_tokens_per_second": 115394.565 + }, + { + "epoch": 0.5189534348873076, + "grad_norm": 0.1594904363155365, + "learning_rate": 0.002711081486897532, + "loss": 3.077428436279297, + "num_input_tokens_seen": 5027921920, + "step": 9590, + "train_runtime": 43571.3028, + "train_tokens_per_second": 115395.262 + }, + { + "epoch": 0.5194945750696718, + "grad_norm": 0.16368508338928223, + "learning_rate": 0.0027071507774683217, + "loss": 3.0642780303955077, + "num_input_tokens_seen": 5033164800, + "step": 9600, + "train_runtime": 43616.4759, + "train_tokens_per_second": 115395.953 + }, + { + "epoch": 0.5200357152520361, + "grad_norm": 0.15867675840854645, + "learning_rate": 0.0027032201988558165, + "loss": 3.056943893432617, + "num_input_tokens_seen": 5038407680, + "step": 9610, + "train_runtime": 43661.6654, + "train_tokens_per_second": 115396.599 + }, + { + "epoch": 0.5205768554344002, + "grad_norm": 0.14183756709098816, + "learning_rate": 0.0026992897630598927, + "loss": 3.0706558227539062, + "num_input_tokens_seen": 5043650560, + "step": 9620, + "train_runtime": 43706.8375, + "train_tokens_per_second": 115397.289 + }, + { + "epoch": 0.5211179956167645, + "grad_norm": 0.15698400139808655, + "learning_rate": 0.002695359482079989, + "loss": 3.0621952056884765, + "num_input_tokens_seen": 5048893440, + "step": 9630, + "train_runtime": 43752.0038, + "train_tokens_per_second": 115397.993 + }, + { + "epoch": 0.5216591357991288, + "grad_norm": 0.15258745849132538, + "learning_rate": 0.002691429367915072, + "loss": 3.0683521270751952, + "num_input_tokens_seen": 5054136320, + "step": 9640, + "train_runtime": 43797.1921, + "train_tokens_per_second": 115398.638 + }, + { + "epoch": 0.522200275981493, + "grad_norm": 0.1572786569595337, + "learning_rate": 0.0026874994325636016, + "loss": 3.0657506942749024, + "num_input_tokens_seen": 5059379200, + "step": 9650, + "train_runtime": 43842.367, + "train_tokens_per_second": 115399.317 + }, + { + "epoch": 0.5227414161638573, + "grad_norm": 0.1440490484237671, + "learning_rate": 0.002683569688023488, + "loss": 3.057964324951172, + "num_input_tokens_seen": 5064622080, + "step": 9660, + "train_runtime": 43887.5251, + "train_tokens_per_second": 115400.038 + }, + { + "epoch": 0.5232825563462215, + "grad_norm": 0.1437375247478485, + "learning_rate": 0.002679640146292061, + "loss": 3.067335510253906, + "num_input_tokens_seen": 5069864960, + "step": 9670, + "train_runtime": 43932.6993, + "train_tokens_per_second": 115400.716 + }, + { + "epoch": 0.5238236965285857, + "grad_norm": 0.14964227378368378, + "learning_rate": 0.0026757108193660294, + "loss": 3.0661109924316405, + "num_input_tokens_seen": 5075107840, + "step": 9680, + "train_runtime": 43977.8845, + "train_tokens_per_second": 115401.364 + }, + { + "epoch": 0.52436483671095, + "grad_norm": 0.15807990729808807, + "learning_rate": 0.0026717817192414496, + "loss": 3.0581291198730467, + "num_input_tokens_seen": 5080350720, + "step": 9690, + "train_runtime": 44023.0423, + "train_tokens_per_second": 115402.082 + }, + { + "epoch": 0.5249059768933142, + "grad_norm": 0.1513347625732422, + "learning_rate": 0.0026678528579136833, + "loss": 3.0648067474365233, + "num_input_tokens_seen": 5085593600, + "step": 9700, + "train_runtime": 44068.1745, + "train_tokens_per_second": 115402.865 + }, + { + "epoch": 0.5254471170756785, + "grad_norm": 0.13990284502506256, + "learning_rate": 0.002663924247377361, + "loss": 3.06469841003418, + "num_input_tokens_seen": 5090836480, + "step": 9710, + "train_runtime": 44113.2866, + "train_tokens_per_second": 115403.7 + }, + { + "epoch": 0.5259882572580427, + "grad_norm": 0.16054664552211761, + "learning_rate": 0.002659995899626353, + "loss": 3.070522689819336, + "num_input_tokens_seen": 5096079360, + "step": 9720, + "train_runtime": 44158.4184, + "train_tokens_per_second": 115404.481 + }, + { + "epoch": 0.5265293974404069, + "grad_norm": 0.15888893604278564, + "learning_rate": 0.0026560678266537223, + "loss": 3.061862564086914, + "num_input_tokens_seen": 5101322240, + "step": 9730, + "train_runtime": 44203.5783, + "train_tokens_per_second": 115405.188 + }, + { + "epoch": 0.5270705376227712, + "grad_norm": 0.1485973447561264, + "learning_rate": 0.002652140040451696, + "loss": 3.0686100006103514, + "num_input_tokens_seen": 5106565120, + "step": 9740, + "train_runtime": 44248.7463, + "train_tokens_per_second": 115405.871 + }, + { + "epoch": 0.5276116778051354, + "grad_norm": 0.1576089709997177, + "learning_rate": 0.002648212553011623, + "loss": 3.062734603881836, + "num_input_tokens_seen": 5111808000, + "step": 9750, + "train_runtime": 44293.9122, + "train_tokens_per_second": 115406.559 + }, + { + "epoch": 0.5281528179874997, + "grad_norm": 0.14466626942157745, + "learning_rate": 0.0026442853763239444, + "loss": 3.0534576416015624, + "num_input_tokens_seen": 5117050880, + "step": 9760, + "train_runtime": 44339.0625, + "train_tokens_per_second": 115407.286 + }, + { + "epoch": 0.5286939581698639, + "grad_norm": 0.13870450854301453, + "learning_rate": 0.0026403585223781483, + "loss": 3.0488052368164062, + "num_input_tokens_seen": 5122293760, + "step": 9770, + "train_runtime": 44384.2076, + "train_tokens_per_second": 115408.025 + }, + { + "epoch": 0.5292350983522282, + "grad_norm": 0.15322810411453247, + "learning_rate": 0.0026364320031627385, + "loss": 3.056787109375, + "num_input_tokens_seen": 5127536640, + "step": 9780, + "train_runtime": 44429.3763, + "train_tokens_per_second": 115408.702 + }, + { + "epoch": 0.5297762385345924, + "grad_norm": 0.1526278853416443, + "learning_rate": 0.0026325058306652, + "loss": 3.068254089355469, + "num_input_tokens_seen": 5132779520, + "step": 9790, + "train_runtime": 44474.5291, + "train_tokens_per_second": 115409.418 + }, + { + "epoch": 0.5303173787169566, + "grad_norm": 0.14957252144813538, + "learning_rate": 0.002628580016871954, + "loss": 3.0630029678344726, + "num_input_tokens_seen": 5138022400, + "step": 9800, + "train_runtime": 44519.6838, + "train_tokens_per_second": 115410.128 + }, + { + "epoch": 0.5308585188993209, + "grad_norm": 0.1606789082288742, + "learning_rate": 0.002624654573768332, + "loss": 3.0618038177490234, + "num_input_tokens_seen": 5143265280, + "step": 9810, + "train_runtime": 44564.8296, + "train_tokens_per_second": 115410.859 + }, + { + "epoch": 0.5313996590816851, + "grad_norm": 0.148385152220726, + "learning_rate": 0.002620729513338529, + "loss": 3.06335334777832, + "num_input_tokens_seen": 5148508160, + "step": 9820, + "train_runtime": 44609.9754, + "train_tokens_per_second": 115411.589 + }, + { + "epoch": 0.5319407992640494, + "grad_norm": 0.1520962417125702, + "learning_rate": 0.002616804847565574, + "loss": 3.061989593505859, + "num_input_tokens_seen": 5153751040, + "step": 9830, + "train_runtime": 44655.138, + "train_tokens_per_second": 115412.275 + }, + { + "epoch": 0.5324819394464136, + "grad_norm": 0.14803871512413025, + "learning_rate": 0.002612880588431294, + "loss": 3.062520980834961, + "num_input_tokens_seen": 5158993920, + "step": 9840, + "train_runtime": 44700.2918, + "train_tokens_per_second": 115412.981 + }, + { + "epoch": 0.5330230796287778, + "grad_norm": 0.14693519473075867, + "learning_rate": 0.002608956747916268, + "loss": 3.053732681274414, + "num_input_tokens_seen": 5164236800, + "step": 9850, + "train_runtime": 44745.4454, + "train_tokens_per_second": 115413.686 + }, + { + "epoch": 0.5335642198111421, + "grad_norm": 0.14247646927833557, + "learning_rate": 0.0026050333379998014, + "loss": 3.0687253952026365, + "num_input_tokens_seen": 5169479680, + "step": 9860, + "train_runtime": 44790.614, + "train_tokens_per_second": 115414.352 + }, + { + "epoch": 0.5341053599935063, + "grad_norm": 0.1414949595928192, + "learning_rate": 0.0026011103706598867, + "loss": 3.0667953491210938, + "num_input_tokens_seen": 5174722560, + "step": 9870, + "train_runtime": 44835.7697, + "train_tokens_per_second": 115415.049 + }, + { + "epoch": 0.5346465001758706, + "grad_norm": 0.15308037400245667, + "learning_rate": 0.00259718785787316, + "loss": 3.0632091522216798, + "num_input_tokens_seen": 5179965440, + "step": 9880, + "train_runtime": 44880.9299, + "train_tokens_per_second": 115415.733 + }, + { + "epoch": 0.5351876403582349, + "grad_norm": 0.1401015669107437, + "learning_rate": 0.002593265811614872, + "loss": 3.054189682006836, + "num_input_tokens_seen": 5185208320, + "step": 9890, + "train_runtime": 44926.1001, + "train_tokens_per_second": 115416.391 + }, + { + "epoch": 0.535728780540599, + "grad_norm": 0.15150010585784912, + "learning_rate": 0.0025893442438588523, + "loss": 3.0516624450683594, + "num_input_tokens_seen": 5190451200, + "step": 9900, + "train_runtime": 44971.2488, + "train_tokens_per_second": 115417.102 + }, + { + "epoch": 0.5362699207229633, + "grad_norm": 0.17257611453533173, + "learning_rate": 0.0025854231665774653, + "loss": 3.0537059783935545, + "num_input_tokens_seen": 5195694080, + "step": 9910, + "train_runtime": 45016.4036, + "train_tokens_per_second": 115417.796 + }, + { + "epoch": 0.5368110609053275, + "grad_norm": 0.14506685733795166, + "learning_rate": 0.002581502591741579, + "loss": 3.0568138122558595, + "num_input_tokens_seen": 5200936960, + "step": 9920, + "train_runtime": 45065.2284, + "train_tokens_per_second": 115409.089 + }, + { + "epoch": 0.5373522010876918, + "grad_norm": 0.14898552000522614, + "learning_rate": 0.002577582531320528, + "loss": 3.0490861892700196, + "num_input_tokens_seen": 5206179840, + "step": 9930, + "train_runtime": 45110.3898, + "train_tokens_per_second": 115409.773 + }, + { + "epoch": 0.5378933412700561, + "grad_norm": 0.14676256477832794, + "learning_rate": 0.0025736629972820785, + "loss": 3.067533493041992, + "num_input_tokens_seen": 5211422720, + "step": 9940, + "train_runtime": 45155.5461, + "train_tokens_per_second": 115410.468 + }, + { + "epoch": 0.5384344814524202, + "grad_norm": 0.15546375513076782, + "learning_rate": 0.002569744001592385, + "loss": 3.053817367553711, + "num_input_tokens_seen": 5216665600, + "step": 9950, + "train_runtime": 45200.7043, + "train_tokens_per_second": 115411.157 + }, + { + "epoch": 0.5389756216347845, + "grad_norm": 0.16900601983070374, + "learning_rate": 0.002565825556215962, + "loss": 3.062000274658203, + "num_input_tokens_seen": 5221908480, + "step": 9960, + "train_runtime": 45245.886, + "train_tokens_per_second": 115411.785 + }, + { + "epoch": 0.5395167618171487, + "grad_norm": 0.14602519571781158, + "learning_rate": 0.0025619076731156444, + "loss": 3.0598079681396486, + "num_input_tokens_seen": 5227151360, + "step": 9970, + "train_runtime": 45291.0267, + "train_tokens_per_second": 115412.516 + }, + { + "epoch": 0.540057901999513, + "grad_norm": 0.15503665804862976, + "learning_rate": 0.002557990364252547, + "loss": 3.047740936279297, + "num_input_tokens_seen": 5232394240, + "step": 9980, + "train_runtime": 45336.1788, + "train_tokens_per_second": 115413.217 + }, + { + "epoch": 0.5405990421818773, + "grad_norm": 0.15822327136993408, + "learning_rate": 0.0025540736415860343, + "loss": 3.0622173309326173, + "num_input_tokens_seen": 5237637120, + "step": 9990, + "train_runtime": 45381.3445, + "train_tokens_per_second": 115413.882 + }, + { + "epoch": 0.5411401823642414, + "grad_norm": 0.13620983064174652, + "learning_rate": 0.0025501575170736803, + "loss": 3.0480823516845703, + "num_input_tokens_seen": 5242880000, + "step": 10000, + "train_runtime": 45426.5099, + "train_tokens_per_second": 115414.546 + }, + { + "epoch": 0.5411401823642414, + "eval_loss": 3.0108466148376465, + "eval_runtime": 1.9863, + "eval_samples_per_second": 251.722, + "eval_steps_per_second": 4.028, + "num_input_tokens_seen": 5242880000, + "step": 10000 + }, + { + "epoch": 0.5416813225466057, + "grad_norm": 0.14582324028015137, + "learning_rate": 0.002546242002671233, + "loss": 3.0488231658935545, + "num_input_tokens_seen": 5248122880, + "step": 10010, + "train_runtime": 45475.9888, + "train_tokens_per_second": 115404.261 + }, + { + "epoch": 0.5422224627289699, + "grad_norm": 0.14079852402210236, + "learning_rate": 0.0025423271103325786, + "loss": 3.0604705810546875, + "num_input_tokens_seen": 5253365760, + "step": 10020, + "train_runtime": 45521.0949, + "train_tokens_per_second": 115405.083 + }, + { + "epoch": 0.5427636029113342, + "grad_norm": 0.15606586635112762, + "learning_rate": 0.002538412852009702, + "loss": 3.0583091735839845, + "num_input_tokens_seen": 5258608640, + "step": 10030, + "train_runtime": 45566.226, + "train_tokens_per_second": 115405.841 + }, + { + "epoch": 0.5433047430936985, + "grad_norm": 0.14329582452774048, + "learning_rate": 0.002534499239652654, + "loss": 3.0502853393554688, + "num_input_tokens_seen": 5263851520, + "step": 10040, + "train_runtime": 45611.3663, + "train_tokens_per_second": 115406.574 + }, + { + "epoch": 0.5438458832760626, + "grad_norm": 0.14503344893455505, + "learning_rate": 0.0025305862852095145, + "loss": 3.0582489013671874, + "num_input_tokens_seen": 5269094400, + "step": 10050, + "train_runtime": 45656.5231, + "train_tokens_per_second": 115407.264 + }, + { + "epoch": 0.5443870234584269, + "grad_norm": 0.153029665350914, + "learning_rate": 0.002526674000626352, + "loss": 3.052793502807617, + "num_input_tokens_seen": 5274337280, + "step": 10060, + "train_runtime": 45701.6761, + "train_tokens_per_second": 115407.962 + }, + { + "epoch": 0.5449281636407911, + "grad_norm": 0.14204776287078857, + "learning_rate": 0.00252276239784719, + "loss": 3.052510643005371, + "num_input_tokens_seen": 5279580160, + "step": 10070, + "train_runtime": 45746.8203, + "train_tokens_per_second": 115408.68 + }, + { + "epoch": 0.5454693038231554, + "grad_norm": 0.14915040135383606, + "learning_rate": 0.0025188514888139757, + "loss": 3.058329391479492, + "num_input_tokens_seen": 5284823040, + "step": 10080, + "train_runtime": 45791.9522, + "train_tokens_per_second": 115409.429 + }, + { + "epoch": 0.5460104440055197, + "grad_norm": 0.14046527445316315, + "learning_rate": 0.0025149412854665316, + "loss": 3.0549495697021483, + "num_input_tokens_seen": 5290065920, + "step": 10090, + "train_runtime": 45837.0921, + "train_tokens_per_second": 115410.155 + }, + { + "epoch": 0.5465515841878839, + "grad_norm": 0.15560267865657806, + "learning_rate": 0.0025110317997425295, + "loss": 3.0544879913330076, + "num_input_tokens_seen": 5295308800, + "step": 10100, + "train_runtime": 45882.2338, + "train_tokens_per_second": 115410.876 + }, + { + "epoch": 0.5470927243702481, + "grad_norm": 0.15250231325626373, + "learning_rate": 0.002507123043577449, + "loss": 3.0573678970336915, + "num_input_tokens_seen": 5300551680, + "step": 10110, + "train_runtime": 45927.3738, + "train_tokens_per_second": 115411.6 + }, + { + "epoch": 0.5476338645526123, + "grad_norm": 0.13873432576656342, + "learning_rate": 0.002503215028904543, + "loss": 3.0521045684814454, + "num_input_tokens_seen": 5305794560, + "step": 10120, + "train_runtime": 45972.5283, + "train_tokens_per_second": 115412.285 + }, + { + "epoch": 0.5481750047349766, + "grad_norm": 0.15664595365524292, + "learning_rate": 0.0024993077676548014, + "loss": 3.0536930084228517, + "num_input_tokens_seen": 5311037440, + "step": 10130, + "train_runtime": 46017.6749, + "train_tokens_per_second": 115412.99 + }, + { + "epoch": 0.5487161449173409, + "grad_norm": 0.15226289629936218, + "learning_rate": 0.002495401271756911, + "loss": 3.0586917877197264, + "num_input_tokens_seen": 5316280320, + "step": 10140, + "train_runtime": 46062.8137, + "train_tokens_per_second": 115413.712 + }, + { + "epoch": 0.5492572850997051, + "grad_norm": 0.14132562279701233, + "learning_rate": 0.0024914955531372264, + "loss": 3.0555648803710938, + "num_input_tokens_seen": 5321523200, + "step": 10150, + "train_runtime": 46107.9535, + "train_tokens_per_second": 115414.43 + }, + { + "epoch": 0.5497984252820693, + "grad_norm": 0.15198439359664917, + "learning_rate": 0.002487590623719726, + "loss": 3.0600481033325195, + "num_input_tokens_seen": 5326766080, + "step": 10160, + "train_runtime": 46153.0991, + "train_tokens_per_second": 115415.133 + }, + { + "epoch": 0.5503395654644335, + "grad_norm": 0.1487119495868683, + "learning_rate": 0.002483686495425979, + "loss": 3.0562034606933595, + "num_input_tokens_seen": 5332008960, + "step": 10170, + "train_runtime": 46198.2435, + "train_tokens_per_second": 115415.837 + }, + { + "epoch": 0.5508807056467978, + "grad_norm": 0.1500309705734253, + "learning_rate": 0.00247978318017511, + "loss": 3.0558704376220702, + "num_input_tokens_seen": 5337251840, + "step": 10180, + "train_runtime": 46243.3932, + "train_tokens_per_second": 115416.527 + }, + { + "epoch": 0.5514218458291621, + "grad_norm": 0.14477477967739105, + "learning_rate": 0.0024758806898837614, + "loss": 3.0584625244140624, + "num_input_tokens_seen": 5342494720, + "step": 10190, + "train_runtime": 46288.5341, + "train_tokens_per_second": 115417.237 + }, + { + "epoch": 0.5519629860115263, + "grad_norm": 0.14965343475341797, + "learning_rate": 0.0024719790364660555, + "loss": 3.053845024108887, + "num_input_tokens_seen": 5347737600, + "step": 10200, + "train_runtime": 46333.6743, + "train_tokens_per_second": 115417.948 + }, + { + "epoch": 0.5525041261938906, + "grad_norm": 0.14595621824264526, + "learning_rate": 0.002468078231833561, + "loss": 3.0468162536621093, + "num_input_tokens_seen": 5352980480, + "step": 10210, + "train_runtime": 46378.8185, + "train_tokens_per_second": 115418.647 + }, + { + "epoch": 0.5530452663762547, + "grad_norm": 0.15934494137763977, + "learning_rate": 0.002464178287895256, + "loss": 3.0428611755371096, + "num_input_tokens_seen": 5358223360, + "step": 10220, + "train_runtime": 46423.9466, + "train_tokens_per_second": 115419.385 + }, + { + "epoch": 0.553586406558619, + "grad_norm": 0.14214660227298737, + "learning_rate": 0.002460279216557488, + "loss": 3.0542884826660157, + "num_input_tokens_seen": 5363466240, + "step": 10230, + "train_runtime": 46469.0816, + "train_tokens_per_second": 115420.104 + }, + { + "epoch": 0.5541275467409833, + "grad_norm": 0.16061817109584808, + "learning_rate": 0.0024563810297239448, + "loss": 3.0611974716186525, + "num_input_tokens_seen": 5368709120, + "step": 10240, + "train_runtime": 46514.2276, + "train_tokens_per_second": 115420.795 + }, + { + "epoch": 0.5546686869233475, + "grad_norm": 0.14743222296237946, + "learning_rate": 0.0024524837392956088, + "loss": 3.0409524917602537, + "num_input_tokens_seen": 5373952000, + "step": 10250, + "train_runtime": 46559.373, + "train_tokens_per_second": 115421.486 + }, + { + "epoch": 0.5552098271057118, + "grad_norm": 0.145145446062088, + "learning_rate": 0.0024485873571707313, + "loss": 3.0503875732421877, + "num_input_tokens_seen": 5379194880, + "step": 10260, + "train_runtime": 46604.5364, + "train_tokens_per_second": 115422.13 + }, + { + "epoch": 0.5557509672880759, + "grad_norm": 0.13785313069820404, + "learning_rate": 0.0024446918952447856, + "loss": 3.051102066040039, + "num_input_tokens_seen": 5384437760, + "step": 10270, + "train_runtime": 46649.714, + "train_tokens_per_second": 115422.739 + }, + { + "epoch": 0.5562921074704402, + "grad_norm": 0.15741367638111115, + "learning_rate": 0.002440797365410437, + "loss": 3.0524486541748046, + "num_input_tokens_seen": 5389680640, + "step": 10280, + "train_runtime": 46694.8766, + "train_tokens_per_second": 115423.383 + }, + { + "epoch": 0.5568332476528045, + "grad_norm": 0.14624185860157013, + "learning_rate": 0.002436903779557509, + "loss": 3.041734313964844, + "num_input_tokens_seen": 5394923520, + "step": 10290, + "train_runtime": 46740.04, + "train_tokens_per_second": 115424.024 + }, + { + "epoch": 0.5573743878351687, + "grad_norm": 0.15357740223407745, + "learning_rate": 0.002433011149572938, + "loss": 3.05377254486084, + "num_input_tokens_seen": 5400166400, + "step": 10300, + "train_runtime": 46785.1999, + "train_tokens_per_second": 115424.673 + }, + { + "epoch": 0.557915528017533, + "grad_norm": 0.1404609978199005, + "learning_rate": 0.002429119487340744, + "loss": 3.0517080307006834, + "num_input_tokens_seen": 5405409280, + "step": 10310, + "train_runtime": 46834.0267, + "train_tokens_per_second": 115416.283 + }, + { + "epoch": 0.5584566681998971, + "grad_norm": 0.13460350036621094, + "learning_rate": 0.0024252288047419933, + "loss": 3.047005462646484, + "num_input_tokens_seen": 5410652160, + "step": 10320, + "train_runtime": 46879.179, + "train_tokens_per_second": 115416.956 + }, + { + "epoch": 0.5589978083822614, + "grad_norm": 0.13896240293979645, + "learning_rate": 0.002421339113654761, + "loss": 3.0483970642089844, + "num_input_tokens_seen": 5415895040, + "step": 10330, + "train_runtime": 46924.3535, + "train_tokens_per_second": 115417.574 + }, + { + "epoch": 0.5595389485646257, + "grad_norm": 0.1555888056755066, + "learning_rate": 0.0024174504259540965, + "loss": 3.045535087585449, + "num_input_tokens_seen": 5421137920, + "step": 10340, + "train_runtime": 46969.5132, + "train_tokens_per_second": 115418.227 + }, + { + "epoch": 0.5600800887469899, + "grad_norm": 0.1442544311285019, + "learning_rate": 0.002413562753511982, + "loss": 3.0400226593017576, + "num_input_tokens_seen": 5426380800, + "step": 10350, + "train_runtime": 47014.6917, + "train_tokens_per_second": 115418.832 + }, + { + "epoch": 0.5606212289293542, + "grad_norm": 0.16144470870494843, + "learning_rate": 0.002409676108197302, + "loss": 3.044460678100586, + "num_input_tokens_seen": 5431623680, + "step": 10360, + "train_runtime": 47059.8715, + "train_tokens_per_second": 115419.433 + }, + { + "epoch": 0.5611623691117184, + "grad_norm": 0.1435036063194275, + "learning_rate": 0.0024057905018758097, + "loss": 3.051218032836914, + "num_input_tokens_seen": 5436866560, + "step": 10370, + "train_runtime": 47105.0206, + "train_tokens_per_second": 115420.108 + }, + { + "epoch": 0.5617035092940826, + "grad_norm": 0.14437657594680786, + "learning_rate": 0.0024019059464100794, + "loss": 3.049814987182617, + "num_input_tokens_seen": 5442109440, + "step": 10380, + "train_runtime": 47150.1709, + "train_tokens_per_second": 115420.779 + }, + { + "epoch": 0.5622446494764469, + "grad_norm": 0.1483716368675232, + "learning_rate": 0.0023980224536594803, + "loss": 3.051362991333008, + "num_input_tokens_seen": 5447352320, + "step": 10390, + "train_runtime": 47195.3171, + "train_tokens_per_second": 115421.458 + }, + { + "epoch": 0.5627857896588111, + "grad_norm": 0.1392650008201599, + "learning_rate": 0.002394140035480139, + "loss": 3.05356502532959, + "num_input_tokens_seen": 5452595200, + "step": 10400, + "train_runtime": 47240.4745, + "train_tokens_per_second": 115422.109 + }, + { + "epoch": 0.5633269298411754, + "grad_norm": 0.13990668952465057, + "learning_rate": 0.002390258703724898, + "loss": 3.053313064575195, + "num_input_tokens_seen": 5457838080, + "step": 10410, + "train_runtime": 47285.6159, + "train_tokens_per_second": 115422.798 + }, + { + "epoch": 0.5638680700235396, + "grad_norm": 0.1470736563205719, + "learning_rate": 0.002386378470243285, + "loss": 3.050541305541992, + "num_input_tokens_seen": 5463080960, + "step": 10420, + "train_runtime": 47330.7575, + "train_tokens_per_second": 115423.485 + }, + { + "epoch": 0.5644092102059038, + "grad_norm": 0.15061776340007782, + "learning_rate": 0.0023824993468814734, + "loss": 3.0488460540771483, + "num_input_tokens_seen": 5468323840, + "step": 10430, + "train_runtime": 47375.9168, + "train_tokens_per_second": 115424.127 + }, + { + "epoch": 0.5649503503882681, + "grad_norm": 0.14553044736385345, + "learning_rate": 0.0023786213454822496, + "loss": 3.0426799774169924, + "num_input_tokens_seen": 5473566720, + "step": 10440, + "train_runtime": 47421.0774, + "train_tokens_per_second": 115424.765 + }, + { + "epoch": 0.5654914905706323, + "grad_norm": 0.1505778431892395, + "learning_rate": 0.002374744477884974, + "loss": 3.0493221282958984, + "num_input_tokens_seen": 5478809600, + "step": 10450, + "train_runtime": 47466.2416, + "train_tokens_per_second": 115425.393 + }, + { + "epoch": 0.5660326307529966, + "grad_norm": 0.14359861612319946, + "learning_rate": 0.002370868755925543, + "loss": 3.048199272155762, + "num_input_tokens_seen": 5484052480, + "step": 10460, + "train_runtime": 47511.3938, + "train_tokens_per_second": 115426.049 + }, + { + "epoch": 0.5665737709353608, + "grad_norm": 0.1411399245262146, + "learning_rate": 0.0023669941914363597, + "loss": 3.0590206146240235, + "num_input_tokens_seen": 5489295360, + "step": 10470, + "train_runtime": 47556.5734, + "train_tokens_per_second": 115426.638 + }, + { + "epoch": 0.567114911117725, + "grad_norm": 0.14005140960216522, + "learning_rate": 0.0023631207962462905, + "loss": 3.052465057373047, + "num_input_tokens_seen": 5494538240, + "step": 10480, + "train_runtime": 47601.7335, + "train_tokens_per_second": 115427.272 + }, + { + "epoch": 0.5676560513000893, + "grad_norm": 0.15281735360622406, + "learning_rate": 0.0023592485821806314, + "loss": 3.0543212890625, + "num_input_tokens_seen": 5499781120, + "step": 10490, + "train_runtime": 47646.8804, + "train_tokens_per_second": 115427.937 + }, + { + "epoch": 0.5681971914824535, + "grad_norm": 0.15110628306865692, + "learning_rate": 0.0023553775610610744, + "loss": 3.0445037841796876, + "num_input_tokens_seen": 5505024000, + "step": 10500, + "train_runtime": 47692.0533, + "train_tokens_per_second": 115428.538 + }, + { + "epoch": 0.5681971914824535, + "eval_loss": 3.0027830600738525, + "eval_runtime": 1.9832, + "eval_samples_per_second": 252.115, + "eval_steps_per_second": 4.034, + "num_input_tokens_seen": 5505024000, + "step": 10500 + }, + { + "epoch": 0.5687383316648178, + "grad_norm": 0.14011938869953156, + "learning_rate": 0.0023515077447056705, + "loss": 3.0531822204589845, + "num_input_tokens_seen": 5510266880, + "step": 10510, + "train_runtime": 47739.2068, + "train_tokens_per_second": 115424.349 + }, + { + "epoch": 0.569279471847182, + "grad_norm": 0.14612512290477753, + "learning_rate": 0.002347639144928789, + "loss": 3.051645278930664, + "num_input_tokens_seen": 5515509760, + "step": 10520, + "train_runtime": 47784.361, + "train_tokens_per_second": 115424.998 + }, + { + "epoch": 0.5698206120295463, + "grad_norm": 0.15726540982723236, + "learning_rate": 0.0023437717735410872, + "loss": 3.0477500915527345, + "num_input_tokens_seen": 5520752640, + "step": 10530, + "train_runtime": 47829.5028, + "train_tokens_per_second": 115425.675 + }, + { + "epoch": 0.5703617522119105, + "grad_norm": 0.14600247144699097, + "learning_rate": 0.002339905642349474, + "loss": 3.0487768173217775, + "num_input_tokens_seen": 5525995520, + "step": 10540, + "train_runtime": 47874.6792, + "train_tokens_per_second": 115426.267 + }, + { + "epoch": 0.5709028923942747, + "grad_norm": 0.1526118814945221, + "learning_rate": 0.0023360407631570685, + "loss": 3.0494321823120116, + "num_input_tokens_seen": 5531238400, + "step": 10550, + "train_runtime": 47919.8527, + "train_tokens_per_second": 115426.866 + }, + { + "epoch": 0.571444032576639, + "grad_norm": 0.14721056818962097, + "learning_rate": 0.0023321771477631693, + "loss": 3.046247100830078, + "num_input_tokens_seen": 5536481280, + "step": 10560, + "train_runtime": 47965.0143, + "train_tokens_per_second": 115427.492 + }, + { + "epoch": 0.5719851727590032, + "grad_norm": 0.15114431083202362, + "learning_rate": 0.0023283148079632156, + "loss": 3.0407901763916017, + "num_input_tokens_seen": 5541724160, + "step": 10570, + "train_runtime": 48010.1749, + "train_tokens_per_second": 115428.119 + }, + { + "epoch": 0.5725263129413675, + "grad_norm": 0.14631570875644684, + "learning_rate": 0.0023244537555487544, + "loss": 3.0476711273193358, + "num_input_tokens_seen": 5546967040, + "step": 10580, + "train_runtime": 48055.3433, + "train_tokens_per_second": 115428.726 + }, + { + "epoch": 0.5730674531237318, + "grad_norm": 0.14861121773719788, + "learning_rate": 0.0023205940023074013, + "loss": 3.049782562255859, + "num_input_tokens_seen": 5552209920, + "step": 10590, + "train_runtime": 48100.5075, + "train_tokens_per_second": 115429.342 + }, + { + "epoch": 0.5736085933060959, + "grad_norm": 0.13904227316379547, + "learning_rate": 0.002316735560022804, + "loss": 3.055135726928711, + "num_input_tokens_seen": 5557452800, + "step": 10600, + "train_runtime": 48145.6693, + "train_tokens_per_second": 115429.962 + }, + { + "epoch": 0.5741497334884602, + "grad_norm": 0.13765645027160645, + "learning_rate": 0.00231287844047461, + "loss": 3.047852325439453, + "num_input_tokens_seen": 5562695680, + "step": 10610, + "train_runtime": 48190.8309, + "train_tokens_per_second": 115430.582 + }, + { + "epoch": 0.5746908736708244, + "grad_norm": 0.14210833609104156, + "learning_rate": 0.0023090226554384288, + "loss": 3.0472042083740236, + "num_input_tokens_seen": 5567938560, + "step": 10620, + "train_runtime": 48235.9849, + "train_tokens_per_second": 115431.22 + }, + { + "epoch": 0.5752320138531887, + "grad_norm": 0.149732306599617, + "learning_rate": 0.0023051682166857937, + "loss": 3.0454326629638673, + "num_input_tokens_seen": 5573181440, + "step": 10630, + "train_runtime": 48281.1275, + "train_tokens_per_second": 115431.883 + }, + { + "epoch": 0.575773154035553, + "grad_norm": 0.1392926275730133, + "learning_rate": 0.002301315135984128, + "loss": 3.0390705108642577, + "num_input_tokens_seen": 5578424320, + "step": 10640, + "train_runtime": 48326.2477, + "train_tokens_per_second": 115432.598 + }, + { + "epoch": 0.5763142942179171, + "grad_norm": 0.13122397661209106, + "learning_rate": 0.0022974634250967113, + "loss": 3.036616897583008, + "num_input_tokens_seen": 5583667200, + "step": 10650, + "train_runtime": 48371.3879, + "train_tokens_per_second": 115433.264 + }, + { + "epoch": 0.5768554344002814, + "grad_norm": 0.14650028944015503, + "learning_rate": 0.0022936130957826395, + "loss": 3.04638786315918, + "num_input_tokens_seen": 5588910080, + "step": 10660, + "train_runtime": 48416.5301, + "train_tokens_per_second": 115433.924 + }, + { + "epoch": 0.5773965745826456, + "grad_norm": 0.13940243422985077, + "learning_rate": 0.002289764159796791, + "loss": 3.049785614013672, + "num_input_tokens_seen": 5594152960, + "step": 10670, + "train_runtime": 48461.6807, + "train_tokens_per_second": 115434.564 + }, + { + "epoch": 0.5779377147650099, + "grad_norm": 0.13686427474021912, + "learning_rate": 0.0022859166288897895, + "loss": 3.0434268951416015, + "num_input_tokens_seen": 5599395840, + "step": 10680, + "train_runtime": 48506.8227, + "train_tokens_per_second": 115435.222 + }, + { + "epoch": 0.5784788549473742, + "grad_norm": 0.14600345492362976, + "learning_rate": 0.0022820705148079703, + "loss": 3.052047538757324, + "num_input_tokens_seen": 5604638720, + "step": 10690, + "train_runtime": 48555.616, + "train_tokens_per_second": 115427.198 + }, + { + "epoch": 0.5790199951297383, + "grad_norm": 0.14253467321395874, + "learning_rate": 0.0022782258292933432, + "loss": 3.0317237854003904, + "num_input_tokens_seen": 5609881600, + "step": 10700, + "train_runtime": 48600.7657, + "train_tokens_per_second": 115427.844 + }, + { + "epoch": 0.5795611353121026, + "grad_norm": 0.13422608375549316, + "learning_rate": 0.0022743825840835542, + "loss": 3.038676071166992, + "num_input_tokens_seen": 5615124480, + "step": 10710, + "train_runtime": 48645.891, + "train_tokens_per_second": 115428.546 + }, + { + "epoch": 0.5801022754944668, + "grad_norm": 0.14621081948280334, + "learning_rate": 0.0022705407909118574, + "loss": 3.0488845825195314, + "num_input_tokens_seen": 5620367360, + "step": 10720, + "train_runtime": 48691.0214, + "train_tokens_per_second": 115429.235 + }, + { + "epoch": 0.5806434156768311, + "grad_norm": 0.14328445494174957, + "learning_rate": 0.002266700461507069, + "loss": 3.039694595336914, + "num_input_tokens_seen": 5625610240, + "step": 10730, + "train_runtime": 48736.1623, + "train_tokens_per_second": 115429.898 + }, + { + "epoch": 0.5811845558591954, + "grad_norm": 0.1407247930765152, + "learning_rate": 0.0022628616075935377, + "loss": 3.0443794250488283, + "num_input_tokens_seen": 5630853120, + "step": 10740, + "train_runtime": 48781.2843, + "train_tokens_per_second": 115430.604 + }, + { + "epoch": 0.5817256960415595, + "grad_norm": 0.15159763395786285, + "learning_rate": 0.0022590242408911066, + "loss": 3.0392004013061524, + "num_input_tokens_seen": 5636096000, + "step": 10750, + "train_runtime": 48826.4182, + "train_tokens_per_second": 115431.281 + }, + { + "epoch": 0.5822668362239238, + "grad_norm": 0.14982502162456512, + "learning_rate": 0.0022551883731150822, + "loss": 3.041204833984375, + "num_input_tokens_seen": 5641338880, + "step": 10760, + "train_runtime": 48871.5523, + "train_tokens_per_second": 115431.956 + }, + { + "epoch": 0.582807976406288, + "grad_norm": 0.14223578572273254, + "learning_rate": 0.0022513540159761927, + "loss": 3.058414840698242, + "num_input_tokens_seen": 5646581760, + "step": 10770, + "train_runtime": 48916.6973, + "train_tokens_per_second": 115432.604 + }, + { + "epoch": 0.5833491165886523, + "grad_norm": 0.14026111364364624, + "learning_rate": 0.0022475211811805508, + "loss": 3.040976715087891, + "num_input_tokens_seen": 5651824640, + "step": 10780, + "train_runtime": 48961.8335, + "train_tokens_per_second": 115433.272 + }, + { + "epoch": 0.5838902567710166, + "grad_norm": 0.1421726644039154, + "learning_rate": 0.0022436898804296273, + "loss": 3.0329113006591797, + "num_input_tokens_seen": 5657067520, + "step": 10790, + "train_runtime": 49006.9777, + "train_tokens_per_second": 115433.92 + }, + { + "epoch": 0.5844313969533808, + "grad_norm": 0.14624913036823273, + "learning_rate": 0.0022398601254202074, + "loss": 3.0412059783935548, + "num_input_tokens_seen": 5662310400, + "step": 10800, + "train_runtime": 49052.1103, + "train_tokens_per_second": 115434.593 + }, + { + "epoch": 0.584972537135745, + "grad_norm": 0.14961951971054077, + "learning_rate": 0.0022360319278443555, + "loss": 3.039783477783203, + "num_input_tokens_seen": 5667553280, + "step": 10810, + "train_runtime": 49097.2475, + "train_tokens_per_second": 115435.255 + }, + { + "epoch": 0.5855136773181092, + "grad_norm": 0.13819707930088043, + "learning_rate": 0.0022322052993893828, + "loss": 3.0379779815673826, + "num_input_tokens_seen": 5672796160, + "step": 10820, + "train_runtime": 49142.4002, + "train_tokens_per_second": 115435.879 + }, + { + "epoch": 0.5860548175004735, + "grad_norm": 0.1506834328174591, + "learning_rate": 0.002228380251737811, + "loss": 3.038629341125488, + "num_input_tokens_seen": 5678039040, + "step": 10830, + "train_runtime": 49187.5417, + "train_tokens_per_second": 115436.528 + }, + { + "epoch": 0.5865959576828378, + "grad_norm": 0.1385858803987503, + "learning_rate": 0.0022245567965673346, + "loss": 3.0388534545898436, + "num_input_tokens_seen": 5683281920, + "step": 10840, + "train_runtime": 49232.6855, + "train_tokens_per_second": 115437.171 + }, + { + "epoch": 0.587137097865202, + "grad_norm": 0.14153380692005157, + "learning_rate": 0.002220734945550785, + "loss": 3.040701675415039, + "num_input_tokens_seen": 5688524800, + "step": 10850, + "train_runtime": 49277.8278, + "train_tokens_per_second": 115437.816 + }, + { + "epoch": 0.5876782380475662, + "grad_norm": 0.1447627693414688, + "learning_rate": 0.002216914710356098, + "loss": 3.0347267150878907, + "num_input_tokens_seen": 5693767680, + "step": 10860, + "train_runtime": 49322.9704, + "train_tokens_per_second": 115438.459 + }, + { + "epoch": 0.5882193782299304, + "grad_norm": 0.15700754523277283, + "learning_rate": 0.0022130961026462772, + "loss": 3.0408071517944335, + "num_input_tokens_seen": 5699010560, + "step": 10870, + "train_runtime": 49368.1084, + "train_tokens_per_second": 115439.111 + }, + { + "epoch": 0.5887605184122947, + "grad_norm": 0.1373981535434723, + "learning_rate": 0.002209279134079355, + "loss": 3.0413372039794924, + "num_input_tokens_seen": 5704253440, + "step": 10880, + "train_runtime": 49413.2385, + "train_tokens_per_second": 115439.781 + }, + { + "epoch": 0.589301658594659, + "grad_norm": 0.13982577621936798, + "learning_rate": 0.0022054638163083607, + "loss": 3.0364784240722655, + "num_input_tokens_seen": 5709496320, + "step": 10890, + "train_runtime": 49458.3538, + "train_tokens_per_second": 115440.484 + }, + { + "epoch": 0.5898427987770232, + "grad_norm": 0.1471603363752365, + "learning_rate": 0.0022016501609812846, + "loss": 3.02860107421875, + "num_input_tokens_seen": 5714739200, + "step": 10900, + "train_runtime": 49503.4854, + "train_tokens_per_second": 115441.148 + }, + { + "epoch": 0.5903839389593875, + "grad_norm": 0.140976682305336, + "learning_rate": 0.002197838179741041, + "loss": 3.048592948913574, + "num_input_tokens_seen": 5719982080, + "step": 10910, + "train_runtime": 49548.6066, + "train_tokens_per_second": 115441.835 + }, + { + "epoch": 0.5909250791417516, + "grad_norm": 0.1391858160495758, + "learning_rate": 0.0021940278842254336, + "loss": 3.0438766479492188, + "num_input_tokens_seen": 5725224960, + "step": 10920, + "train_runtime": 49593.7284, + "train_tokens_per_second": 115442.519 + }, + { + "epoch": 0.5914662193241159, + "grad_norm": 0.13984552025794983, + "learning_rate": 0.0021902192860671172, + "loss": 3.032778739929199, + "num_input_tokens_seen": 5730467840, + "step": 10930, + "train_runtime": 49638.8587, + "train_tokens_per_second": 115443.183 + }, + { + "epoch": 0.5920073595064802, + "grad_norm": 0.15091544389724731, + "learning_rate": 0.0021864123968935696, + "loss": 3.0441143035888674, + "num_input_tokens_seen": 5735710720, + "step": 10940, + "train_runtime": 49683.9877, + "train_tokens_per_second": 115443.848 + }, + { + "epoch": 0.5925484996888444, + "grad_norm": 0.1554540991783142, + "learning_rate": 0.0021826072283270465, + "loss": 3.028913116455078, + "num_input_tokens_seen": 5740953600, + "step": 10950, + "train_runtime": 49729.1048, + "train_tokens_per_second": 115444.539 + }, + { + "epoch": 0.5930896398712087, + "grad_norm": 0.14302626252174377, + "learning_rate": 0.0021788037919845526, + "loss": 3.0385337829589845, + "num_input_tokens_seen": 5746196480, + "step": 10960, + "train_runtime": 49774.2324, + "train_tokens_per_second": 115445.205 + }, + { + "epoch": 0.5936307800535728, + "grad_norm": 0.14910683035850525, + "learning_rate": 0.0021750020994778054, + "loss": 3.0436506271362305, + "num_input_tokens_seen": 5751439360, + "step": 10970, + "train_runtime": 49819.3456, + "train_tokens_per_second": 115445.903 + }, + { + "epoch": 0.5941719202359371, + "grad_norm": 0.15283723175525665, + "learning_rate": 0.002171202162413195, + "loss": 3.047803497314453, + "num_input_tokens_seen": 5756682240, + "step": 10980, + "train_runtime": 49864.471, + "train_tokens_per_second": 115446.572 + }, + { + "epoch": 0.5947130604183014, + "grad_norm": 0.14589117467403412, + "learning_rate": 0.002167403992391757, + "loss": 3.0425289154052733, + "num_input_tokens_seen": 5761925120, + "step": 10990, + "train_runtime": 49909.6019, + "train_tokens_per_second": 115447.227 + }, + { + "epoch": 0.5952542006006656, + "grad_norm": 0.1394151747226715, + "learning_rate": 0.0021636076010091276, + "loss": 3.0472259521484375, + "num_input_tokens_seen": 5767168000, + "step": 11000, + "train_runtime": 49954.7308, + "train_tokens_per_second": 115447.885 + }, + { + "epoch": 0.5952542006006656, + "eval_loss": 2.9926395416259766, + "eval_runtime": 1.9832, + "eval_samples_per_second": 252.121, + "eval_steps_per_second": 4.034, + "num_input_tokens_seen": 5767168000, + "step": 11000 + }, + { + "epoch": 0.5957953407830299, + "grad_norm": 0.1461019068956375, + "learning_rate": 0.002159812999855516, + "loss": 3.034767913818359, + "num_input_tokens_seen": 5772410880, + "step": 11010, + "train_runtime": 50004.2941, + "train_tokens_per_second": 115438.304 + }, + { + "epoch": 0.596336480965394, + "grad_norm": 0.13988643884658813, + "learning_rate": 0.002156020200515666, + "loss": 3.0288986206054687, + "num_input_tokens_seen": 5777653760, + "step": 11020, + "train_runtime": 50049.449, + "train_tokens_per_second": 115438.908 + }, + { + "epoch": 0.5968776211477583, + "grad_norm": 0.13354118168354034, + "learning_rate": 0.002152229214568817, + "loss": 3.0315704345703125, + "num_input_tokens_seen": 5782896640, + "step": 11030, + "train_runtime": 50094.5853, + "train_tokens_per_second": 115439.555 + }, + { + "epoch": 0.5974187613301226, + "grad_norm": 0.1455395370721817, + "learning_rate": 0.0021484400535886766, + "loss": 3.0255619049072267, + "num_input_tokens_seen": 5788139520, + "step": 11040, + "train_runtime": 50139.709, + "train_tokens_per_second": 115440.23 + }, + { + "epoch": 0.5979599015124868, + "grad_norm": 0.1527973711490631, + "learning_rate": 0.002144652729143379, + "loss": 3.0323816299438477, + "num_input_tokens_seen": 5793382400, + "step": 11050, + "train_runtime": 50184.8687, + "train_tokens_per_second": 115440.82 + }, + { + "epoch": 0.5985010416948511, + "grad_norm": 0.14457052946090698, + "learning_rate": 0.0021408672527954502, + "loss": 3.0245555877685546, + "num_input_tokens_seen": 5798625280, + "step": 11060, + "train_runtime": 50230.0184, + "train_tokens_per_second": 115441.432 + }, + { + "epoch": 0.5990421818772153, + "grad_norm": 0.1389371007680893, + "learning_rate": 0.0021370836361017764, + "loss": 3.036094856262207, + "num_input_tokens_seen": 5803868160, + "step": 11070, + "train_runtime": 50278.8251, + "train_tokens_per_second": 115433.647 + }, + { + "epoch": 0.5995833220595795, + "grad_norm": 0.15234215557575226, + "learning_rate": 0.002133301890613565, + "loss": 3.0295217514038084, + "num_input_tokens_seen": 5809111040, + "step": 11080, + "train_runtime": 50323.981, + "train_tokens_per_second": 115434.251 + }, + { + "epoch": 0.6001244622419438, + "grad_norm": 0.1413789838552475, + "learning_rate": 0.002129522027876311, + "loss": 3.021541404724121, + "num_input_tokens_seen": 5814353920, + "step": 11090, + "train_runtime": 50369.1453, + "train_tokens_per_second": 115434.834 + }, + { + "epoch": 0.600665602424308, + "grad_norm": 0.15141098201274872, + "learning_rate": 0.0021257440594297607, + "loss": 3.026825714111328, + "num_input_tokens_seen": 5819596800, + "step": 11100, + "train_runtime": 50414.335, + "train_tokens_per_second": 115435.358 + }, + { + "epoch": 0.6012067426066723, + "grad_norm": 0.1444009244441986, + "learning_rate": 0.00212196799680788, + "loss": 3.033803939819336, + "num_input_tokens_seen": 5824839680, + "step": 11110, + "train_runtime": 50459.5415, + "train_tokens_per_second": 115435.842 + }, + { + "epoch": 0.6017478827890365, + "grad_norm": 0.14855210483074188, + "learning_rate": 0.002118193851538812, + "loss": 3.0400081634521485, + "num_input_tokens_seen": 5830082560, + "step": 11120, + "train_runtime": 50504.7336, + "train_tokens_per_second": 115436.359 + }, + { + "epoch": 0.6022890229714007, + "grad_norm": 0.13804234564304352, + "learning_rate": 0.002114421635144851, + "loss": 3.0301578521728514, + "num_input_tokens_seen": 5835325440, + "step": 11130, + "train_runtime": 50549.9225, + "train_tokens_per_second": 115436.882 + }, + { + "epoch": 0.602830163153765, + "grad_norm": 0.14875908195972443, + "learning_rate": 0.0021106513591423967, + "loss": 3.032312774658203, + "num_input_tokens_seen": 5840568320, + "step": 11140, + "train_runtime": 50595.1041, + "train_tokens_per_second": 115437.421 + }, + { + "epoch": 0.6033713033361292, + "grad_norm": 0.14444148540496826, + "learning_rate": 0.0021068830350419315, + "loss": 3.038595199584961, + "num_input_tokens_seen": 5845811200, + "step": 11150, + "train_runtime": 50640.2767, + "train_tokens_per_second": 115437.979 + }, + { + "epoch": 0.6039124435184935, + "grad_norm": 0.14319837093353271, + "learning_rate": 0.002103116674347975, + "loss": 3.0365222930908202, + "num_input_tokens_seen": 5851054080, + "step": 11160, + "train_runtime": 50685.4574, + "train_tokens_per_second": 115438.518 + }, + { + "epoch": 0.6044535837008577, + "grad_norm": 0.1528901755809784, + "learning_rate": 0.002099352288559052, + "loss": 3.0367916107177733, + "num_input_tokens_seen": 5856296960, + "step": 11170, + "train_runtime": 50730.6306, + "train_tokens_per_second": 115439.073 + }, + { + "epoch": 0.604994723883222, + "grad_norm": 0.1397976279258728, + "learning_rate": 0.002095589889167659, + "loss": 3.026215744018555, + "num_input_tokens_seen": 5861539840, + "step": 11180, + "train_runtime": 50775.8006, + "train_tokens_per_second": 115439.634 + }, + { + "epoch": 0.6055358640655862, + "grad_norm": 0.1472213864326477, + "learning_rate": 0.0020918294876602294, + "loss": 3.0274309158325194, + "num_input_tokens_seen": 5866782720, + "step": 11190, + "train_runtime": 50820.9715, + "train_tokens_per_second": 115440.192 + }, + { + "epoch": 0.6060770042479504, + "grad_norm": 0.14902907609939575, + "learning_rate": 0.0020880710955170955, + "loss": 3.0351707458496096, + "num_input_tokens_seen": 5872025600, + "step": 11200, + "train_runtime": 50866.1392, + "train_tokens_per_second": 115440.757 + }, + { + "epoch": 0.6066181444303147, + "grad_norm": 0.1408633142709732, + "learning_rate": 0.0020843147242124555, + "loss": 3.029071807861328, + "num_input_tokens_seen": 5877268480, + "step": 11210, + "train_runtime": 50911.3053, + "train_tokens_per_second": 115441.324 + }, + { + "epoch": 0.6071592846126789, + "grad_norm": 0.14094239473342896, + "learning_rate": 0.0020805603852143383, + "loss": 3.032915496826172, + "num_input_tokens_seen": 5882511360, + "step": 11220, + "train_runtime": 50956.4731, + "train_tokens_per_second": 115441.886 + }, + { + "epoch": 0.6077004247950432, + "grad_norm": 0.14471176266670227, + "learning_rate": 0.0020768080899845687, + "loss": 3.0328413009643556, + "num_input_tokens_seen": 5887754240, + "step": 11230, + "train_runtime": 51001.6333, + "train_tokens_per_second": 115442.465 + }, + { + "epoch": 0.6082415649774074, + "grad_norm": 0.1309700757265091, + "learning_rate": 0.00207305784997873, + "loss": 3.0344516754150392, + "num_input_tokens_seen": 5892997120, + "step": 11240, + "train_runtime": 51046.7842, + "train_tokens_per_second": 115443.063 + }, + { + "epoch": 0.6087827051597716, + "grad_norm": 0.14067348837852478, + "learning_rate": 0.0020693096766461333, + "loss": 3.0316375732421874, + "num_input_tokens_seen": 5898240000, + "step": 11250, + "train_runtime": 51091.9272, + "train_tokens_per_second": 115443.678 + }, + { + "epoch": 0.6093238453421359, + "grad_norm": 0.14874261617660522, + "learning_rate": 0.00206556358142978, + "loss": 3.0260826110839845, + "num_input_tokens_seen": 5903482880, + "step": 11260, + "train_runtime": 51137.0716, + "train_tokens_per_second": 115444.289 + }, + { + "epoch": 0.6098649855245001, + "grad_norm": 0.1435764729976654, + "learning_rate": 0.002061819575766326, + "loss": 3.0409059524536133, + "num_input_tokens_seen": 5908725760, + "step": 11270, + "train_runtime": 51182.2082, + "train_tokens_per_second": 115444.917 + }, + { + "epoch": 0.6104061257068644, + "grad_norm": 0.1484087109565735, + "learning_rate": 0.002058077671086047, + "loss": 3.0283117294311523, + "num_input_tokens_seen": 5913968640, + "step": 11280, + "train_runtime": 51227.347, + "train_tokens_per_second": 115445.538 + }, + { + "epoch": 0.6109472658892287, + "grad_norm": 0.140775665640831, + "learning_rate": 0.002054337878812808, + "loss": 3.026752471923828, + "num_input_tokens_seen": 5919211520, + "step": 11290, + "train_runtime": 51272.4758, + "train_tokens_per_second": 115446.181 + }, + { + "epoch": 0.6114884060715928, + "grad_norm": 0.14487655460834503, + "learning_rate": 0.002050600210364022, + "loss": 3.0381233215332033, + "num_input_tokens_seen": 5924454400, + "step": 11300, + "train_runtime": 51317.6163, + "train_tokens_per_second": 115446.796 + }, + { + "epoch": 0.6120295462539571, + "grad_norm": 0.13244298100471497, + "learning_rate": 0.0020468646771506184, + "loss": 3.037242889404297, + "num_input_tokens_seen": 5929697280, + "step": 11310, + "train_runtime": 51362.7685, + "train_tokens_per_second": 115447.384 + }, + { + "epoch": 0.6125706864363213, + "grad_norm": 0.13805389404296875, + "learning_rate": 0.002043131290577007, + "loss": 3.034191703796387, + "num_input_tokens_seen": 5934940160, + "step": 11320, + "train_runtime": 51407.9443, + "train_tokens_per_second": 115447.919 + }, + { + "epoch": 0.6131118266186856, + "grad_norm": 0.13927042484283447, + "learning_rate": 0.002039400062041048, + "loss": 3.0405059814453126, + "num_input_tokens_seen": 5940183040, + "step": 11330, + "train_runtime": 51453.1187, + "train_tokens_per_second": 115448.455 + }, + { + "epoch": 0.6136529668010499, + "grad_norm": 0.13962484896183014, + "learning_rate": 0.0020356710029340096, + "loss": 3.0331016540527345, + "num_input_tokens_seen": 5945425920, + "step": 11340, + "train_runtime": 51498.2896, + "train_tokens_per_second": 115448.998 + }, + { + "epoch": 0.614194106983414, + "grad_norm": 0.2101336121559143, + "learning_rate": 0.0020319441246405357, + "loss": 3.028001594543457, + "num_input_tokens_seen": 5950668800, + "step": 11350, + "train_runtime": 51543.4451, + "train_tokens_per_second": 115449.574 + }, + { + "epoch": 0.6147352471657783, + "grad_norm": 0.14625418186187744, + "learning_rate": 0.0020282194385386173, + "loss": 3.0344852447509765, + "num_input_tokens_seen": 5955911680, + "step": 11360, + "train_runtime": 51588.6188, + "train_tokens_per_second": 115450.109 + }, + { + "epoch": 0.6152763873481425, + "grad_norm": 0.1353636085987091, + "learning_rate": 0.002024496955999548, + "loss": 3.0306270599365233, + "num_input_tokens_seen": 5961154560, + "step": 11370, + "train_runtime": 51633.7613, + "train_tokens_per_second": 115450.713 + }, + { + "epoch": 0.6158175275305068, + "grad_norm": 0.1368403434753418, + "learning_rate": 0.0020207766883878955, + "loss": 3.0311580657958985, + "num_input_tokens_seen": 5966397440, + "step": 11380, + "train_runtime": 51678.9232, + "train_tokens_per_second": 115451.272 + }, + { + "epoch": 0.6163586677128711, + "grad_norm": 0.14600516855716705, + "learning_rate": 0.0020170586470614656, + "loss": 3.0117847442626955, + "num_input_tokens_seen": 5971640320, + "step": 11390, + "train_runtime": 51724.0749, + "train_tokens_per_second": 115451.853 + }, + { + "epoch": 0.6168998078952352, + "grad_norm": 0.14564567804336548, + "learning_rate": 0.002013342843371269, + "loss": 3.037702941894531, + "num_input_tokens_seen": 5976883200, + "step": 11400, + "train_runtime": 51769.2278, + "train_tokens_per_second": 115452.431 + }, + { + "epoch": 0.6174409480775995, + "grad_norm": 0.1405801773071289, + "learning_rate": 0.0020096292886614825, + "loss": 3.0343984603881835, + "num_input_tokens_seen": 5982126080, + "step": 11410, + "train_runtime": 51814.3769, + "train_tokens_per_second": 115453.016 + }, + { + "epoch": 0.6179820882599637, + "grad_norm": 0.14390794932842255, + "learning_rate": 0.002005917994269417, + "loss": 3.023337173461914, + "num_input_tokens_seen": 5987368960, + "step": 11420, + "train_runtime": 51859.5123, + "train_tokens_per_second": 115453.63 + }, + { + "epoch": 0.618523228442328, + "grad_norm": 0.14093852043151855, + "learning_rate": 0.0020022089715254847, + "loss": 3.0304771423339845, + "num_input_tokens_seen": 5992611840, + "step": 11430, + "train_runtime": 51904.644, + "train_tokens_per_second": 115454.252 + }, + { + "epoch": 0.6190643686246923, + "grad_norm": 0.1447506844997406, + "learning_rate": 0.001998502231753161, + "loss": 3.030156898498535, + "num_input_tokens_seen": 5997854720, + "step": 11440, + "train_runtime": 51949.7701, + "train_tokens_per_second": 115454.885 + }, + { + "epoch": 0.6196055088070564, + "grad_norm": 0.1445121169090271, + "learning_rate": 0.001994797786268952, + "loss": 3.0251228332519533, + "num_input_tokens_seen": 6003097600, + "step": 11450, + "train_runtime": 51998.5254, + "train_tokens_per_second": 115447.458 + }, + { + "epoch": 0.6201466489894207, + "grad_norm": 0.15314067900180817, + "learning_rate": 0.0019910956463823587, + "loss": 3.022572135925293, + "num_input_tokens_seen": 6008340480, + "step": 11460, + "train_runtime": 52043.6251, + "train_tokens_per_second": 115448.155 + }, + { + "epoch": 0.6206877891717849, + "grad_norm": 0.1409369558095932, + "learning_rate": 0.0019873958233958444, + "loss": 3.024155044555664, + "num_input_tokens_seen": 6013583360, + "step": 11470, + "train_runtime": 52088.7237, + "train_tokens_per_second": 115448.852 + }, + { + "epoch": 0.6212289293541492, + "grad_norm": 0.15012867748737335, + "learning_rate": 0.0019836983286047995, + "loss": 3.0334211349487306, + "num_input_tokens_seen": 6018826240, + "step": 11480, + "train_runtime": 52133.8249, + "train_tokens_per_second": 115449.543 + }, + { + "epoch": 0.6217700695365135, + "grad_norm": 0.14085648953914642, + "learning_rate": 0.0019800031732975032, + "loss": 3.0264703750610353, + "num_input_tokens_seen": 6024069120, + "step": 11490, + "train_runtime": 52178.9281, + "train_tokens_per_second": 115450.228 + }, + { + "epoch": 0.6223112097188777, + "grad_norm": 0.14266955852508545, + "learning_rate": 0.001976310368755096, + "loss": 3.032570648193359, + "num_input_tokens_seen": 6029312000, + "step": 11500, + "train_runtime": 52224.0378, + "train_tokens_per_second": 115450.897 + }, + { + "epoch": 0.6223112097188777, + "eval_loss": 2.984975814819336, + "eval_runtime": 1.9819, + "eval_samples_per_second": 252.288, + "eval_steps_per_second": 4.037, + "num_input_tokens_seen": 6029312000, + "step": 11500 + }, + { + "epoch": 0.6228523499012419, + "grad_norm": 0.13947011530399323, + "learning_rate": 0.001972619926251541, + "loss": 3.0404077529907227, + "num_input_tokens_seen": 6034554880, + "step": 11510, + "train_runtime": 52271.1258, + "train_tokens_per_second": 115447.195 + }, + { + "epoch": 0.6233934900836061, + "grad_norm": 0.1446669101715088, + "learning_rate": 0.001968931857053588, + "loss": 3.021891784667969, + "num_input_tokens_seen": 6039797760, + "step": 11520, + "train_runtime": 52316.2406, + "train_tokens_per_second": 115447.855 + }, + { + "epoch": 0.6239346302659704, + "grad_norm": 0.13946829736232758, + "learning_rate": 0.0019652461724207425, + "loss": 3.0241966247558594, + "num_input_tokens_seen": 6045040640, + "step": 11530, + "train_runtime": 52361.3587, + "train_tokens_per_second": 115448.506 + }, + { + "epoch": 0.6244757704483347, + "grad_norm": 0.14458313584327698, + "learning_rate": 0.0019615628836052324, + "loss": 3.0141645431518556, + "num_input_tokens_seen": 6050283520, + "step": 11540, + "train_runtime": 52406.4606, + "train_tokens_per_second": 115449.192 + }, + { + "epoch": 0.6250169106306989, + "grad_norm": 0.14436115324497223, + "learning_rate": 0.0019578820018519663, + "loss": 3.0331525802612305, + "num_input_tokens_seen": 6055526400, + "step": 11550, + "train_runtime": 52451.5704, + "train_tokens_per_second": 115449.859 + }, + { + "epoch": 0.6255580508130631, + "grad_norm": 0.14012649655342102, + "learning_rate": 0.0019542035383985083, + "loss": 3.043803405761719, + "num_input_tokens_seen": 6060769280, + "step": 11560, + "train_runtime": 52496.6939, + "train_tokens_per_second": 115450.495 + }, + { + "epoch": 0.6260991909954273, + "grad_norm": 0.14854469895362854, + "learning_rate": 0.0019505275044750371, + "loss": 3.0200592041015626, + "num_input_tokens_seen": 6066012160, + "step": 11570, + "train_runtime": 52541.8152, + "train_tokens_per_second": 115451.134 + }, + { + "epoch": 0.6266403311777916, + "grad_norm": 0.15853960812091827, + "learning_rate": 0.0019468539113043166, + "loss": 3.020526885986328, + "num_input_tokens_seen": 6071255040, + "step": 11580, + "train_runtime": 52586.931, + "train_tokens_per_second": 115451.785 + }, + { + "epoch": 0.6271814713601559, + "grad_norm": 0.14197298884391785, + "learning_rate": 0.0019431827701016575, + "loss": 3.0370616912841797, + "num_input_tokens_seen": 6076497920, + "step": 11590, + "train_runtime": 52632.0665, + "train_tokens_per_second": 115452.391 + }, + { + "epoch": 0.6277226115425201, + "grad_norm": 0.1305466592311859, + "learning_rate": 0.0019395140920748827, + "loss": 3.023914337158203, + "num_input_tokens_seen": 6081740800, + "step": 11600, + "train_runtime": 52677.2162, + "train_tokens_per_second": 115452.965 + }, + { + "epoch": 0.6282637517248844, + "grad_norm": 0.1447763293981552, + "learning_rate": 0.0019358478884243008, + "loss": 3.024199676513672, + "num_input_tokens_seen": 6086983680, + "step": 11610, + "train_runtime": 52722.3572, + "train_tokens_per_second": 115453.557 + }, + { + "epoch": 0.6288048919072485, + "grad_norm": 0.14126408100128174, + "learning_rate": 0.0019321841703426608, + "loss": 3.022255706787109, + "num_input_tokens_seen": 6092226560, + "step": 11620, + "train_runtime": 52767.4813, + "train_tokens_per_second": 115454.185 + }, + { + "epoch": 0.6293460320896128, + "grad_norm": 0.1334850788116455, + "learning_rate": 0.0019285229490151263, + "loss": 3.0233287811279297, + "num_input_tokens_seen": 6097469440, + "step": 11630, + "train_runtime": 52812.6435, + "train_tokens_per_second": 115454.729 + }, + { + "epoch": 0.6298871722719771, + "grad_norm": 0.14635370671749115, + "learning_rate": 0.0019248642356192365, + "loss": 3.03590087890625, + "num_input_tokens_seen": 6102712320, + "step": 11640, + "train_runtime": 52857.8692, + "train_tokens_per_second": 115455.133 + }, + { + "epoch": 0.6304283124543413, + "grad_norm": 0.13621026277542114, + "learning_rate": 0.0019212080413248762, + "loss": 3.023410415649414, + "num_input_tokens_seen": 6107955200, + "step": 11650, + "train_runtime": 52903.1239, + "train_tokens_per_second": 115455.473 + }, + { + "epoch": 0.6309694526367056, + "grad_norm": 0.14006845653057098, + "learning_rate": 0.0019175543772942383, + "loss": 3.020222473144531, + "num_input_tokens_seen": 6113198080, + "step": 11660, + "train_runtime": 52948.2709, + "train_tokens_per_second": 115456.047 + }, + { + "epoch": 0.6315105928190697, + "grad_norm": 0.13746832311153412, + "learning_rate": 0.0019139032546817902, + "loss": 3.0225994110107424, + "num_input_tokens_seen": 6118440960, + "step": 11670, + "train_runtime": 52993.4226, + "train_tokens_per_second": 115456.611 + }, + { + "epoch": 0.632051733001434, + "grad_norm": 0.13812102377414703, + "learning_rate": 0.0019102546846342411, + "loss": 3.0324447631835936, + "num_input_tokens_seen": 6123683840, + "step": 11680, + "train_runtime": 53038.5588, + "train_tokens_per_second": 115457.207 + }, + { + "epoch": 0.6325928731837983, + "grad_norm": 0.14019303023815155, + "learning_rate": 0.0019066086782905097, + "loss": 3.022325897216797, + "num_input_tokens_seen": 6128926720, + "step": 11690, + "train_runtime": 53083.7143, + "train_tokens_per_second": 115457.76 + }, + { + "epoch": 0.6331340133661625, + "grad_norm": 0.1436738818883896, + "learning_rate": 0.0019029652467816838, + "loss": 3.0244091033935545, + "num_input_tokens_seen": 6134169600, + "step": 11700, + "train_runtime": 53128.868, + "train_tokens_per_second": 115458.315 + }, + { + "epoch": 0.6336751535485268, + "grad_norm": 0.14594176411628723, + "learning_rate": 0.0018993244012309913, + "loss": 3.025048828125, + "num_input_tokens_seen": 6139412480, + "step": 11710, + "train_runtime": 53173.9948, + "train_tokens_per_second": 115458.929 + }, + { + "epoch": 0.634216293730891, + "grad_norm": 0.15385597944259644, + "learning_rate": 0.0018956861527537688, + "loss": 3.0213130950927733, + "num_input_tokens_seen": 6144655360, + "step": 11720, + "train_runtime": 53219.1405, + "train_tokens_per_second": 115459.5 + }, + { + "epoch": 0.6347574339132552, + "grad_norm": 0.14445240795612335, + "learning_rate": 0.0018920505124574195, + "loss": 3.029845428466797, + "num_input_tokens_seen": 6149898240, + "step": 11730, + "train_runtime": 53264.2928, + "train_tokens_per_second": 115460.056 + }, + { + "epoch": 0.6352985740956195, + "grad_norm": 0.1384369432926178, + "learning_rate": 0.001888417491441387, + "loss": 3.0266345977783202, + "num_input_tokens_seen": 6155141120, + "step": 11740, + "train_runtime": 53309.4606, + "train_tokens_per_second": 115460.578 + }, + { + "epoch": 0.6358397142779837, + "grad_norm": 0.14229294657707214, + "learning_rate": 0.0018847871007971163, + "loss": 3.017131042480469, + "num_input_tokens_seen": 6160384000, + "step": 11750, + "train_runtime": 53354.6359, + "train_tokens_per_second": 115461.082 + }, + { + "epoch": 0.636380854460348, + "grad_norm": 0.14127928018569946, + "learning_rate": 0.0018811593516080234, + "loss": 3.021234703063965, + "num_input_tokens_seen": 6165626880, + "step": 11760, + "train_runtime": 53399.8028, + "train_tokens_per_second": 115461.604 + }, + { + "epoch": 0.6369219946427122, + "grad_norm": 0.13989216089248657, + "learning_rate": 0.0018775342549494606, + "loss": 3.0207067489624024, + "num_input_tokens_seen": 6170869760, + "step": 11770, + "train_runtime": 53444.9593, + "train_tokens_per_second": 115462.147 + }, + { + "epoch": 0.6374631348250764, + "grad_norm": 0.141075000166893, + "learning_rate": 0.0018739118218886802, + "loss": 3.017308807373047, + "num_input_tokens_seen": 6176112640, + "step": 11780, + "train_runtime": 53490.1129, + "train_tokens_per_second": 115462.696 + }, + { + "epoch": 0.6380042750074407, + "grad_norm": 0.1446276307106018, + "learning_rate": 0.0018702920634848035, + "loss": 3.0272090911865233, + "num_input_tokens_seen": 6181355520, + "step": 11790, + "train_runtime": 53535.2546, + "train_tokens_per_second": 115463.269 + }, + { + "epoch": 0.6385454151898049, + "grad_norm": 0.14022940397262573, + "learning_rate": 0.001866674990788788, + "loss": 3.0206020355224608, + "num_input_tokens_seen": 6186598400, + "step": 11800, + "train_runtime": 53580.432, + "train_tokens_per_second": 115463.765 + }, + { + "epoch": 0.6390865553721692, + "grad_norm": 0.1391611397266388, + "learning_rate": 0.0018630606148433892, + "loss": 3.0259307861328124, + "num_input_tokens_seen": 6191841280, + "step": 11810, + "train_runtime": 53625.6025, + "train_tokens_per_second": 115464.274 + }, + { + "epoch": 0.6396276955545334, + "grad_norm": 0.1375313252210617, + "learning_rate": 0.0018594489466831293, + "loss": 3.019388198852539, + "num_input_tokens_seen": 6197084160, + "step": 11820, + "train_runtime": 53670.752, + "train_tokens_per_second": 115464.828 + }, + { + "epoch": 0.6401688357368976, + "grad_norm": 0.13326410949230194, + "learning_rate": 0.0018558399973342677, + "loss": 3.0195072174072264, + "num_input_tokens_seen": 6202327040, + "step": 11830, + "train_runtime": 53719.4728, + "train_tokens_per_second": 115457.705 + }, + { + "epoch": 0.6407099759192619, + "grad_norm": 0.1425514966249466, + "learning_rate": 0.0018522337778147586, + "loss": 3.012344741821289, + "num_input_tokens_seen": 6207569920, + "step": 11840, + "train_runtime": 53764.8643, + "train_tokens_per_second": 115457.744 + }, + { + "epoch": 0.6412511161016261, + "grad_norm": 0.14373312890529633, + "learning_rate": 0.001848630299134224, + "loss": 3.0200828552246093, + "num_input_tokens_seen": 6212812800, + "step": 11850, + "train_runtime": 53810.7174, + "train_tokens_per_second": 115456.792 + }, + { + "epoch": 0.6417922562839904, + "grad_norm": 0.14393045008182526, + "learning_rate": 0.0018450295722939214, + "loss": 3.0205759048461913, + "num_input_tokens_seen": 6218055680, + "step": 11860, + "train_runtime": 53856.71, + "train_tokens_per_second": 115455.543 + }, + { + "epoch": 0.6423333964663546, + "grad_norm": 0.13831470906734467, + "learning_rate": 0.0018414316082867015, + "loss": 3.018105697631836, + "num_input_tokens_seen": 6223298560, + "step": 11870, + "train_runtime": 53902.8725, + "train_tokens_per_second": 115453.932 + }, + { + "epoch": 0.6428745366487189, + "grad_norm": 0.14368562400341034, + "learning_rate": 0.0018378364180969837, + "loss": 3.0205171585083006, + "num_input_tokens_seen": 6228541440, + "step": 11880, + "train_runtime": 53949.0344, + "train_tokens_per_second": 115452.325 + }, + { + "epoch": 0.6434156768310831, + "grad_norm": 0.134961798787117, + "learning_rate": 0.0018342440127007181, + "loss": 3.0208873748779297, + "num_input_tokens_seen": 6233784320, + "step": 11890, + "train_runtime": 53994.8144, + "train_tokens_per_second": 115451.537 + }, + { + "epoch": 0.6439568170134473, + "grad_norm": 0.139762744307518, + "learning_rate": 0.0018306544030653531, + "loss": 3.0138370513916017, + "num_input_tokens_seen": 6239027200, + "step": 11900, + "train_runtime": 54040.0794, + "train_tokens_per_second": 115451.851 + }, + { + "epoch": 0.6444979571958116, + "grad_norm": 0.15458019077777863, + "learning_rate": 0.0018270676001498033, + "loss": 3.025080108642578, + "num_input_tokens_seen": 6244270080, + "step": 11910, + "train_runtime": 54085.3315, + "train_tokens_per_second": 115452.192 + }, + { + "epoch": 0.6450390973781758, + "grad_norm": 0.13538894057273865, + "learning_rate": 0.001823483614904411, + "loss": 3.016307830810547, + "num_input_tokens_seen": 6249512960, + "step": 11920, + "train_runtime": 54130.6042, + "train_tokens_per_second": 115452.489 + }, + { + "epoch": 0.6455802375605401, + "grad_norm": 0.13436593115329742, + "learning_rate": 0.0018199024582709177, + "loss": 3.0229183197021485, + "num_input_tokens_seen": 6254755840, + "step": 11930, + "train_runtime": 54175.8479, + "train_tokens_per_second": 115452.846 + }, + { + "epoch": 0.6461213777429043, + "grad_norm": 0.1262059211730957, + "learning_rate": 0.0018163241411824327, + "loss": 3.0243408203125, + "num_input_tokens_seen": 6259998720, + "step": 11940, + "train_runtime": 54221.0877, + "train_tokens_per_second": 115453.212 + }, + { + "epoch": 0.6466625179252685, + "grad_norm": 0.14077694714069366, + "learning_rate": 0.0018127486745633914, + "loss": 3.009103775024414, + "num_input_tokens_seen": 6265241600, + "step": 11950, + "train_runtime": 54266.3714, + "train_tokens_per_second": 115453.483 + }, + { + "epoch": 0.6472036581076328, + "grad_norm": 0.14338544011116028, + "learning_rate": 0.001809176069329529, + "loss": 3.019987106323242, + "num_input_tokens_seen": 6270484480, + "step": 11960, + "train_runtime": 54311.6327, + "train_tokens_per_second": 115453.802 + }, + { + "epoch": 0.647744798289997, + "grad_norm": 0.1309393346309662, + "learning_rate": 0.001805606336387845, + "loss": 3.0178783416748045, + "num_input_tokens_seen": 6275727360, + "step": 11970, + "train_runtime": 54356.8873, + "train_tokens_per_second": 115454.134 + }, + { + "epoch": 0.6482859384723613, + "grad_norm": 0.1303347647190094, + "learning_rate": 0.0018020394866365714, + "loss": 3.0253570556640623, + "num_input_tokens_seen": 6280970240, + "step": 11980, + "train_runtime": 54402.1335, + "train_tokens_per_second": 115454.484 + }, + { + "epoch": 0.6488270786547256, + "grad_norm": 0.14178113639354706, + "learning_rate": 0.0017984755309651346, + "loss": 3.0267719268798827, + "num_input_tokens_seen": 6286213120, + "step": 11990, + "train_runtime": 54447.3835, + "train_tokens_per_second": 115454.825 + }, + { + "epoch": 0.6493682188370897, + "grad_norm": 0.1430656611919403, + "learning_rate": 0.0017949144802541274, + "loss": 3.0143644332885744, + "num_input_tokens_seen": 6291456000, + "step": 12000, + "train_runtime": 54492.6535, + "train_tokens_per_second": 115455.123 + }, + { + "epoch": 0.6493682188370897, + "eval_loss": 2.9761710166931152, + "eval_runtime": 1.9875, + "eval_samples_per_second": 251.575, + "eval_steps_per_second": 4.025, + "num_input_tokens_seen": 6291456000, + "step": 12000 + }, + { + "epoch": 0.649909359019454, + "grad_norm": 0.14500592648983002, + "learning_rate": 0.0017913563453752746, + "loss": 3.018670654296875, + "num_input_tokens_seen": 6296698880, + "step": 12010, + "train_runtime": 54542.2937, + "train_tokens_per_second": 115446.169 + }, + { + "epoch": 0.6504504992018182, + "grad_norm": 0.1448933333158493, + "learning_rate": 0.0017878011371913977, + "loss": 3.0202388763427734, + "num_input_tokens_seen": 6301941760, + "step": 12020, + "train_runtime": 54587.5091, + "train_tokens_per_second": 115446.59 + }, + { + "epoch": 0.6509916393841825, + "grad_norm": 0.1533222645521164, + "learning_rate": 0.0017842488665563833, + "loss": 3.025776672363281, + "num_input_tokens_seen": 6307184640, + "step": 12030, + "train_runtime": 54632.7175, + "train_tokens_per_second": 115447.024 + }, + { + "epoch": 0.6515327795665468, + "grad_norm": 0.13312490284442902, + "learning_rate": 0.0017806995443151524, + "loss": 3.0187503814697267, + "num_input_tokens_seen": 6312427520, + "step": 12040, + "train_runtime": 54677.8786, + "train_tokens_per_second": 115447.557 + }, + { + "epoch": 0.6520739197489109, + "grad_norm": 0.13797084987163544, + "learning_rate": 0.0017771531813036206, + "loss": 3.019959259033203, + "num_input_tokens_seen": 6317670400, + "step": 12050, + "train_runtime": 54723.0506, + "train_tokens_per_second": 115448.067 + }, + { + "epoch": 0.6526150599312752, + "grad_norm": 0.13628187775611877, + "learning_rate": 0.0017736097883486713, + "loss": 3.012210655212402, + "num_input_tokens_seen": 6322913280, + "step": 12060, + "train_runtime": 54768.2437, + "train_tokens_per_second": 115448.531 + }, + { + "epoch": 0.6531562001136394, + "grad_norm": 0.13764619827270508, + "learning_rate": 0.001770069376268119, + "loss": 3.0185993194580076, + "num_input_tokens_seen": 6328156160, + "step": 12070, + "train_runtime": 54813.436, + "train_tokens_per_second": 115448.996 + }, + { + "epoch": 0.6536973402960037, + "grad_norm": 0.14087094366550446, + "learning_rate": 0.001766531955870682, + "loss": 3.0167076110839846, + "num_input_tokens_seen": 6333399040, + "step": 12080, + "train_runtime": 54858.6498, + "train_tokens_per_second": 115449.415 + }, + { + "epoch": 0.654238480478368, + "grad_norm": 0.13622906804084778, + "learning_rate": 0.0017629975379559405, + "loss": 3.021717643737793, + "num_input_tokens_seen": 6338641920, + "step": 12090, + "train_runtime": 54903.8659, + "train_tokens_per_second": 115449.829 + }, + { + "epoch": 0.6547796206607321, + "grad_norm": 0.13120146095752716, + "learning_rate": 0.001759466133314308, + "loss": 3.0197391510009766, + "num_input_tokens_seen": 6343884800, + "step": 12100, + "train_runtime": 54949.0889, + "train_tokens_per_second": 115450.227 + }, + { + "epoch": 0.6553207608430964, + "grad_norm": 0.139594167470932, + "learning_rate": 0.001755937752727003, + "loss": 3.0223533630371096, + "num_input_tokens_seen": 6349127680, + "step": 12110, + "train_runtime": 54994.2934, + "train_tokens_per_second": 115450.664 + }, + { + "epoch": 0.6558619010254606, + "grad_norm": 0.15013989806175232, + "learning_rate": 0.001752412406966008, + "loss": 3.0148881912231444, + "num_input_tokens_seen": 6354370560, + "step": 12120, + "train_runtime": 55039.5071, + "train_tokens_per_second": 115451.08 + }, + { + "epoch": 0.6564030412078249, + "grad_norm": 0.13876710832118988, + "learning_rate": 0.0017488901067940416, + "loss": 3.0114933013916017, + "num_input_tokens_seen": 6359613440, + "step": 12130, + "train_runtime": 55084.7162, + "train_tokens_per_second": 115451.506 + }, + { + "epoch": 0.6569441813901892, + "grad_norm": 0.13125662505626678, + "learning_rate": 0.0017453708629645238, + "loss": 3.004977226257324, + "num_input_tokens_seen": 6364856320, + "step": 12140, + "train_runtime": 55129.9188, + "train_tokens_per_second": 115451.944 + }, + { + "epoch": 0.6574853215725533, + "grad_norm": 0.14310745894908905, + "learning_rate": 0.0017418546862215448, + "loss": 3.0219293594360352, + "num_input_tokens_seen": 6370099200, + "step": 12150, + "train_runtime": 55175.1468, + "train_tokens_per_second": 115452.329 + }, + { + "epoch": 0.6580264617549176, + "grad_norm": 0.1343064159154892, + "learning_rate": 0.0017383415872998303, + "loss": 3.017044258117676, + "num_input_tokens_seen": 6375342080, + "step": 12160, + "train_runtime": 55220.3693, + "train_tokens_per_second": 115452.724 + }, + { + "epoch": 0.6585676019372818, + "grad_norm": 0.13533759117126465, + "learning_rate": 0.0017348315769247086, + "loss": 3.0149707794189453, + "num_input_tokens_seen": 6380584960, + "step": 12170, + "train_runtime": 55265.5973, + "train_tokens_per_second": 115453.108 + }, + { + "epoch": 0.6591087421196461, + "grad_norm": 0.1386122703552246, + "learning_rate": 0.0017313246658120804, + "loss": 3.0143962860107423, + "num_input_tokens_seen": 6385827840, + "step": 12180, + "train_runtime": 55310.8039, + "train_tokens_per_second": 115453.535 + }, + { + "epoch": 0.6596498823020104, + "grad_norm": 0.1343347579240799, + "learning_rate": 0.0017278208646683856, + "loss": 3.0179080963134766, + "num_input_tokens_seen": 6391070720, + "step": 12190, + "train_runtime": 55356.0418, + "train_tokens_per_second": 115453.896 + }, + { + "epoch": 0.6601910224843746, + "grad_norm": 0.14169900119304657, + "learning_rate": 0.0017243201841905666, + "loss": 3.0247045516967774, + "num_input_tokens_seen": 6396313600, + "step": 12200, + "train_runtime": 55401.2599, + "train_tokens_per_second": 115454.299 + }, + { + "epoch": 0.6607321626667388, + "grad_norm": 0.13514377176761627, + "learning_rate": 0.0017208226350660391, + "loss": 3.0104536056518554, + "num_input_tokens_seen": 6401556480, + "step": 12210, + "train_runtime": 55450.8592, + "train_tokens_per_second": 115445.578 + }, + { + "epoch": 0.661273302849103, + "grad_norm": 0.13756819069385529, + "learning_rate": 0.0017173282279726609, + "loss": 3.0194664001464844, + "num_input_tokens_seen": 6406799360, + "step": 12220, + "train_runtime": 55496.1437, + "train_tokens_per_second": 115445.848 + }, + { + "epoch": 0.6618144430314673, + "grad_norm": 0.13056276738643646, + "learning_rate": 0.0017138369735786954, + "loss": 3.0248437881469727, + "num_input_tokens_seen": 6412042240, + "step": 12230, + "train_runtime": 55541.3669, + "train_tokens_per_second": 115446.245 + }, + { + "epoch": 0.6623555832138316, + "grad_norm": 0.13981449604034424, + "learning_rate": 0.0017103488825427826, + "loss": 3.0129575729370117, + "num_input_tokens_seen": 6417285120, + "step": 12240, + "train_runtime": 55586.7139, + "train_tokens_per_second": 115446.384 + }, + { + "epoch": 0.6628967233961958, + "grad_norm": 0.1439344733953476, + "learning_rate": 0.0017068639655139026, + "loss": 3.022663116455078, + "num_input_tokens_seen": 6422528000, + "step": 12250, + "train_runtime": 55632.0146, + "train_tokens_per_second": 115446.619 + }, + { + "epoch": 0.66343786357856, + "grad_norm": 0.15030835568904877, + "learning_rate": 0.001703382233131348, + "loss": 3.012424850463867, + "num_input_tokens_seen": 6427770880, + "step": 12260, + "train_runtime": 55677.2926, + "train_tokens_per_second": 115446.901 + }, + { + "epoch": 0.6639790037609242, + "grad_norm": 0.13960550725460052, + "learning_rate": 0.0016999036960246871, + "loss": 3.0081478118896485, + "num_input_tokens_seen": 6433013760, + "step": 12270, + "train_runtime": 55722.6028, + "train_tokens_per_second": 115447.115 + }, + { + "epoch": 0.6645201439432885, + "grad_norm": 0.13627994060516357, + "learning_rate": 0.0016964283648137329, + "loss": 3.0084842681884765, + "num_input_tokens_seen": 6438256640, + "step": 12280, + "train_runtime": 55767.8798, + "train_tokens_per_second": 115447.398 + }, + { + "epoch": 0.6650612841256528, + "grad_norm": 0.14768123626708984, + "learning_rate": 0.0016929562501085123, + "loss": 3.013652801513672, + "num_input_tokens_seen": 6443499520, + "step": 12290, + "train_runtime": 55813.1427, + "train_tokens_per_second": 115447.71 + }, + { + "epoch": 0.665602424308017, + "grad_norm": 0.14207823574543, + "learning_rate": 0.0016894873625092333, + "loss": 3.0111804962158204, + "num_input_tokens_seen": 6448742400, + "step": 12300, + "train_runtime": 55858.4112, + "train_tokens_per_second": 115448.01 + }, + { + "epoch": 0.6661435644903813, + "grad_norm": 0.1379329413175583, + "learning_rate": 0.0016860217126062479, + "loss": 3.0187799453735353, + "num_input_tokens_seen": 6453985280, + "step": 12310, + "train_runtime": 55903.6646, + "train_tokens_per_second": 115448.34 + }, + { + "epoch": 0.6666847046727454, + "grad_norm": 0.14401622116565704, + "learning_rate": 0.0016825593109800264, + "loss": 3.0228382110595704, + "num_input_tokens_seen": 6459228160, + "step": 12320, + "train_runtime": 55948.9475, + "train_tokens_per_second": 115448.609 + }, + { + "epoch": 0.6672258448551097, + "grad_norm": 0.12955419719219208, + "learning_rate": 0.0016791001682011227, + "loss": 3.0097047805786135, + "num_input_tokens_seen": 6464471040, + "step": 12330, + "train_runtime": 55994.314, + "train_tokens_per_second": 115448.705 + }, + { + "epoch": 0.667766985037474, + "grad_norm": 0.14710277318954468, + "learning_rate": 0.0016756442948301386, + "loss": 3.0169065475463865, + "num_input_tokens_seen": 6469713920, + "step": 12340, + "train_runtime": 56039.7555, + "train_tokens_per_second": 115448.646 + }, + { + "epoch": 0.6683081252198382, + "grad_norm": 0.1326688975095749, + "learning_rate": 0.0016721917014176982, + "loss": 3.009653663635254, + "num_input_tokens_seen": 6474956800, + "step": 12350, + "train_runtime": 56085.2141, + "train_tokens_per_second": 115448.553 + }, + { + "epoch": 0.6688492654022025, + "grad_norm": 0.13903285562992096, + "learning_rate": 0.0016687423985044109, + "loss": 3.019660758972168, + "num_input_tokens_seen": 6480199680, + "step": 12360, + "train_runtime": 56130.7366, + "train_tokens_per_second": 115448.328 + }, + { + "epoch": 0.6693904055845666, + "grad_norm": 0.13976383209228516, + "learning_rate": 0.0016652963966208385, + "loss": 3.0172367095947266, + "num_input_tokens_seen": 6485442560, + "step": 12370, + "train_runtime": 56176.2788, + "train_tokens_per_second": 115448.063 + }, + { + "epoch": 0.6699315457669309, + "grad_norm": 0.13633348047733307, + "learning_rate": 0.0016618537062874665, + "loss": 3.004638671875, + "num_input_tokens_seen": 6490685440, + "step": 12380, + "train_runtime": 56221.949, + "train_tokens_per_second": 115447.535 + }, + { + "epoch": 0.6704726859492952, + "grad_norm": 0.14074033498764038, + "learning_rate": 0.001658414338014669, + "loss": 3.019020843505859, + "num_input_tokens_seen": 6495928320, + "step": 12390, + "train_runtime": 56267.7615, + "train_tokens_per_second": 115446.717 + }, + { + "epoch": 0.6710138261316594, + "grad_norm": 0.1326296180486679, + "learning_rate": 0.0016549783023026808, + "loss": 3.0110851287841798, + "num_input_tokens_seen": 6501171200, + "step": 12400, + "train_runtime": 56313.0806, + "train_tokens_per_second": 115446.911 + }, + { + "epoch": 0.6715549663140237, + "grad_norm": 0.13860943913459778, + "learning_rate": 0.001651545609641561, + "loss": 3.0090118408203126, + "num_input_tokens_seen": 6506414080, + "step": 12410, + "train_runtime": 56358.3912, + "train_tokens_per_second": 115447.122 + }, + { + "epoch": 0.6720961064963878, + "grad_norm": 0.1410975605249405, + "learning_rate": 0.0016481162705111604, + "loss": 3.0008705139160154, + "num_input_tokens_seen": 6511656960, + "step": 12420, + "train_runtime": 56403.6982, + "train_tokens_per_second": 115447.341 + }, + { + "epoch": 0.6726372466787521, + "grad_norm": 0.13546454906463623, + "learning_rate": 0.0016446902953810964, + "loss": 3.013086700439453, + "num_input_tokens_seen": 6516899840, + "step": 12430, + "train_runtime": 56448.9891, + "train_tokens_per_second": 115447.592 + }, + { + "epoch": 0.6731783868611164, + "grad_norm": 0.13547931611537933, + "learning_rate": 0.0016412676947107113, + "loss": 3.004133605957031, + "num_input_tokens_seen": 6522142720, + "step": 12440, + "train_runtime": 56494.2857, + "train_tokens_per_second": 115447.831 + }, + { + "epoch": 0.6737195270434806, + "grad_norm": 0.13898716866970062, + "learning_rate": 0.0016378484789490479, + "loss": 3.015100860595703, + "num_input_tokens_seen": 6527385600, + "step": 12450, + "train_runtime": 56539.5755, + "train_tokens_per_second": 115448.083 + }, + { + "epoch": 0.6742606672258449, + "grad_norm": 0.1385628879070282, + "learning_rate": 0.0016344326585348147, + "loss": 3.018421936035156, + "num_input_tokens_seen": 6532628480, + "step": 12460, + "train_runtime": 56584.8917, + "train_tokens_per_second": 115448.281 + }, + { + "epoch": 0.674801807408209, + "grad_norm": 0.13880495727062225, + "learning_rate": 0.001631020243896355, + "loss": 3.0016693115234374, + "num_input_tokens_seen": 6537871360, + "step": 12470, + "train_runtime": 56630.196, + "train_tokens_per_second": 115448.503 + }, + { + "epoch": 0.6753429475905733, + "grad_norm": 0.1371801793575287, + "learning_rate": 0.0016276112454516134, + "loss": 3.0135356903076174, + "num_input_tokens_seen": 6543114240, + "step": 12480, + "train_runtime": 56675.5074, + "train_tokens_per_second": 115448.71 + }, + { + "epoch": 0.6758840877729376, + "grad_norm": 0.1398102194070816, + "learning_rate": 0.001624205673608104, + "loss": 3.0212148666381835, + "num_input_tokens_seen": 6548357120, + "step": 12490, + "train_runtime": 56720.8046, + "train_tokens_per_second": 115448.946 + }, + { + "epoch": 0.6764252279553018, + "grad_norm": 0.1300211250782013, + "learning_rate": 0.0016208035387628825, + "loss": 3.0142328262329103, + "num_input_tokens_seen": 6553600000, + "step": 12500, + "train_runtime": 56766.0883, + "train_tokens_per_second": 115449.209 + }, + { + "epoch": 0.6764252279553018, + "eval_loss": 2.968597412109375, + "eval_runtime": 1.9925, + "eval_samples_per_second": 250.941, + "eval_steps_per_second": 4.015, + "num_input_tokens_seen": 6553600000, + "step": 12500 + }, + { + "epoch": 0.6769663681376661, + "grad_norm": 0.14369215071201324, + "learning_rate": 0.0016174048513025103, + "loss": 3.0048513412475586, + "num_input_tokens_seen": 6558842880, + "step": 12510, + "train_runtime": 56813.3987, + "train_tokens_per_second": 115445.353 + }, + { + "epoch": 0.6775075083200303, + "grad_norm": 0.14692343771457672, + "learning_rate": 0.0016140096216030232, + "loss": 3.0137935638427735, + "num_input_tokens_seen": 6564085760, + "step": 12520, + "train_runtime": 56858.6904, + "train_tokens_per_second": 115445.602 + }, + { + "epoch": 0.6780486485023945, + "grad_norm": 0.14028270542621613, + "learning_rate": 0.0016106178600299001, + "loss": 3.010356140136719, + "num_input_tokens_seen": 6569328640, + "step": 12530, + "train_runtime": 56903.9761, + "train_tokens_per_second": 115445.863 + }, + { + "epoch": 0.6785897886847588, + "grad_norm": 0.12822629511356354, + "learning_rate": 0.0016072295769380353, + "loss": 3.0003124237060548, + "num_input_tokens_seen": 6574571520, + "step": 12540, + "train_runtime": 56949.26, + "train_tokens_per_second": 115446.127 + }, + { + "epoch": 0.679130928867123, + "grad_norm": 0.1369100958108902, + "learning_rate": 0.0016038447826716993, + "loss": 3.0066249847412108, + "num_input_tokens_seen": 6579814400, + "step": 12550, + "train_runtime": 56994.5681, + "train_tokens_per_second": 115446.342 + }, + { + "epoch": 0.6796720690494873, + "grad_norm": 0.14047878980636597, + "learning_rate": 0.001600463487564515, + "loss": 3.0145965576171876, + "num_input_tokens_seen": 6585057280, + "step": 12560, + "train_runtime": 57039.861, + "train_tokens_per_second": 115446.587 + }, + { + "epoch": 0.6802132092318515, + "grad_norm": 0.14242438971996307, + "learning_rate": 0.001597085701939419, + "loss": 3.0166095733642577, + "num_input_tokens_seen": 6590300160, + "step": 12570, + "train_runtime": 57085.1398, + "train_tokens_per_second": 115446.86 + }, + { + "epoch": 0.6807543494142158, + "grad_norm": 0.1383470743894577, + "learning_rate": 0.0015937114361086369, + "loss": 3.0075637817382814, + "num_input_tokens_seen": 6595543040, + "step": 12580, + "train_runtime": 57130.4343, + "train_tokens_per_second": 115447.101 + }, + { + "epoch": 0.68129548959658, + "grad_norm": 0.1291186362504959, + "learning_rate": 0.0015903407003736466, + "loss": 3.01377010345459, + "num_input_tokens_seen": 6600785920, + "step": 12590, + "train_runtime": 57180.0264, + "train_tokens_per_second": 115438.665 + }, + { + "epoch": 0.6818366297789442, + "grad_norm": 0.13580311834812164, + "learning_rate": 0.0015869735050251489, + "loss": 3.0099231719970705, + "num_input_tokens_seen": 6606028800, + "step": 12600, + "train_runtime": 57225.3092, + "train_tokens_per_second": 115438.936 + }, + { + "epoch": 0.6823777699613085, + "grad_norm": 0.1437922716140747, + "learning_rate": 0.0015836098603430357, + "loss": 3.0034923553466797, + "num_input_tokens_seen": 6611271680, + "step": 12610, + "train_runtime": 57270.5349, + "train_tokens_per_second": 115439.321 + }, + { + "epoch": 0.6829189101436727, + "grad_norm": 0.13526742160320282, + "learning_rate": 0.0015802497765963614, + "loss": 3.00305061340332, + "num_input_tokens_seen": 6616514560, + "step": 12620, + "train_runtime": 57315.7589, + "train_tokens_per_second": 115439.709 + }, + { + "epoch": 0.683460050326037, + "grad_norm": 0.1404607594013214, + "learning_rate": 0.0015768932640433059, + "loss": 3.0041690826416017, + "num_input_tokens_seen": 6621757440, + "step": 12630, + "train_runtime": 57360.9936, + "train_tokens_per_second": 115440.076 + }, + { + "epoch": 0.6840011905084012, + "grad_norm": 0.13756705820560455, + "learning_rate": 0.0015735403329311469, + "loss": 2.9982038497924806, + "num_input_tokens_seen": 6627000320, + "step": 12640, + "train_runtime": 57406.2268, + "train_tokens_per_second": 115440.444 + }, + { + "epoch": 0.6845423306907654, + "grad_norm": 0.14006656408309937, + "learning_rate": 0.0015701909934962305, + "loss": 3.009762763977051, + "num_input_tokens_seen": 6632243200, + "step": 12650, + "train_runtime": 57451.4583, + "train_tokens_per_second": 115440.816 + }, + { + "epoch": 0.6850834708731297, + "grad_norm": 0.13317948579788208, + "learning_rate": 0.001566845255963934, + "loss": 3.0151742935180663, + "num_input_tokens_seen": 6637486080, + "step": 12660, + "train_runtime": 57496.7057, + "train_tokens_per_second": 115441.154 + }, + { + "epoch": 0.6856246110554939, + "grad_norm": 0.13669337332248688, + "learning_rate": 0.0015635031305486417, + "loss": 3.000714874267578, + "num_input_tokens_seen": 6642728960, + "step": 12670, + "train_runtime": 57541.9394, + "train_tokens_per_second": 115441.52 + }, + { + "epoch": 0.6861657512378582, + "grad_norm": 0.13967348635196686, + "learning_rate": 0.0015601646274537087, + "loss": 3.0043874740600587, + "num_input_tokens_seen": 6647971840, + "step": 12680, + "train_runtime": 57587.1773, + "train_tokens_per_second": 115441.877 + }, + { + "epoch": 0.6867068914202225, + "grad_norm": 0.13815197348594666, + "learning_rate": 0.0015568297568714312, + "loss": 3.010976219177246, + "num_input_tokens_seen": 6653214720, + "step": 12690, + "train_runtime": 57632.4045, + "train_tokens_per_second": 115442.255 + }, + { + "epoch": 0.6872480316025866, + "grad_norm": 0.1381223499774933, + "learning_rate": 0.001553498528983015, + "loss": 3.013303756713867, + "num_input_tokens_seen": 6658457600, + "step": 12700, + "train_runtime": 57677.6438, + "train_tokens_per_second": 115442.608 + }, + { + "epoch": 0.6877891717849509, + "grad_norm": 0.13350199162960052, + "learning_rate": 0.0015501709539585454, + "loss": 3.012788009643555, + "num_input_tokens_seen": 6663700480, + "step": 12710, + "train_runtime": 57722.8853, + "train_tokens_per_second": 115442.956 + }, + { + "epoch": 0.6883303119673151, + "grad_norm": 0.13979476690292358, + "learning_rate": 0.0015468470419569564, + "loss": 3.0098241806030273, + "num_input_tokens_seen": 6668943360, + "step": 12720, + "train_runtime": 57768.1112, + "train_tokens_per_second": 115443.334 + }, + { + "epoch": 0.6888714521496794, + "grad_norm": 0.13748957216739655, + "learning_rate": 0.0015435268031259992, + "loss": 3.009090805053711, + "num_input_tokens_seen": 6674186240, + "step": 12730, + "train_runtime": 57813.3636, + "train_tokens_per_second": 115443.659 + }, + { + "epoch": 0.6894125923320437, + "grad_norm": 0.13561367988586426, + "learning_rate": 0.0015402102476022095, + "loss": 3.008078765869141, + "num_input_tokens_seen": 6679429120, + "step": 12740, + "train_runtime": 57858.572, + "train_tokens_per_second": 115444.072 + }, + { + "epoch": 0.6899537325144078, + "grad_norm": 0.12914767861366272, + "learning_rate": 0.0015368973855108782, + "loss": 3.0018003463745115, + "num_input_tokens_seen": 6684672000, + "step": 12750, + "train_runtime": 57903.8186, + "train_tokens_per_second": 115444.407 + }, + { + "epoch": 0.6904948726967721, + "grad_norm": 0.14038655161857605, + "learning_rate": 0.0015335882269660217, + "loss": 3.004079818725586, + "num_input_tokens_seen": 6689914880, + "step": 12760, + "train_runtime": 57949.0509, + "train_tokens_per_second": 115444.771 + }, + { + "epoch": 0.6910360128791363, + "grad_norm": 0.13866056501865387, + "learning_rate": 0.001530282782070348, + "loss": 3.009323310852051, + "num_input_tokens_seen": 6695157760, + "step": 12770, + "train_runtime": 57994.2931, + "train_tokens_per_second": 115445.114 + }, + { + "epoch": 0.6915771530615006, + "grad_norm": 0.1286270171403885, + "learning_rate": 0.001526981060915229, + "loss": 3.000651550292969, + "num_input_tokens_seen": 6700400640, + "step": 12780, + "train_runtime": 58039.518, + "train_tokens_per_second": 115445.491 + }, + { + "epoch": 0.6921182932438649, + "grad_norm": 0.13248993456363678, + "learning_rate": 0.0015236830735806679, + "loss": 3.0101812362670897, + "num_input_tokens_seen": 6705643520, + "step": 12790, + "train_runtime": 58084.7779, + "train_tokens_per_second": 115445.798 + }, + { + "epoch": 0.692659433426229, + "grad_norm": 0.1369810700416565, + "learning_rate": 0.0015203888301352675, + "loss": 3.004811477661133, + "num_input_tokens_seen": 6710886400, + "step": 12800, + "train_runtime": 58130.0044, + "train_tokens_per_second": 115446.171 + }, + { + "epoch": 0.6932005736085933, + "grad_norm": 0.14264971017837524, + "learning_rate": 0.001517098340636202, + "loss": 3.010848808288574, + "num_input_tokens_seen": 6716129280, + "step": 12810, + "train_runtime": 58175.241, + "train_tokens_per_second": 115446.523 + }, + { + "epoch": 0.6937417137909575, + "grad_norm": 0.1406365931034088, + "learning_rate": 0.0015138116151291825, + "loss": 3.0090103149414062, + "num_input_tokens_seen": 6721372160, + "step": 12820, + "train_runtime": 58220.4724, + "train_tokens_per_second": 115446.885 + }, + { + "epoch": 0.6942828539733218, + "grad_norm": 0.13356050848960876, + "learning_rate": 0.0015105286636484334, + "loss": 2.999258613586426, + "num_input_tokens_seen": 6726615040, + "step": 12830, + "train_runtime": 58265.7054, + "train_tokens_per_second": 115447.243 + }, + { + "epoch": 0.6948239941556861, + "grad_norm": 0.13091513514518738, + "learning_rate": 0.001507249496216654, + "loss": 3.005986785888672, + "num_input_tokens_seen": 6731857920, + "step": 12840, + "train_runtime": 58310.9354, + "train_tokens_per_second": 115447.606 + }, + { + "epoch": 0.6953651343380503, + "grad_norm": 0.1335466355085373, + "learning_rate": 0.0015039741228449904, + "loss": 2.9974597930908202, + "num_input_tokens_seen": 6737100800, + "step": 12850, + "train_runtime": 58356.1736, + "train_tokens_per_second": 115447.953 + }, + { + "epoch": 0.6959062745204145, + "grad_norm": 0.1375114917755127, + "learning_rate": 0.0015007025535330083, + "loss": 3.0074440002441407, + "num_input_tokens_seen": 6742343680, + "step": 12860, + "train_runtime": 58401.3717, + "train_tokens_per_second": 115448.379 + }, + { + "epoch": 0.6964474147027787, + "grad_norm": 0.15171852707862854, + "learning_rate": 0.001497434798268658, + "loss": 2.996272659301758, + "num_input_tokens_seen": 6747586560, + "step": 12870, + "train_runtime": 58446.5932, + "train_tokens_per_second": 115448.757 + }, + { + "epoch": 0.696988554885143, + "grad_norm": 0.13725285232067108, + "learning_rate": 0.0014941708670282445, + "loss": 3.0174352645874025, + "num_input_tokens_seen": 6752829440, + "step": 12880, + "train_runtime": 58491.8411, + "train_tokens_per_second": 115449.083 + }, + { + "epoch": 0.6975296950675073, + "grad_norm": 0.1326073855161667, + "learning_rate": 0.0014909107697764006, + "loss": 3.006754684448242, + "num_input_tokens_seen": 6758072320, + "step": 12890, + "train_runtime": 58537.0682, + "train_tokens_per_second": 115449.45 + }, + { + "epoch": 0.6980708352498715, + "grad_norm": 0.1453487128019333, + "learning_rate": 0.0014876545164660543, + "loss": 3.003281021118164, + "num_input_tokens_seen": 6763315200, + "step": 12900, + "train_runtime": 58582.3109, + "train_tokens_per_second": 115449.785 + }, + { + "epoch": 0.6986119754322357, + "grad_norm": 0.13233183324337006, + "learning_rate": 0.001484402117038397, + "loss": 3.0117160797119142, + "num_input_tokens_seen": 6768558080, + "step": 12910, + "train_runtime": 58627.5472, + "train_tokens_per_second": 115450.132 + }, + { + "epoch": 0.6991531156145999, + "grad_norm": 0.1383819729089737, + "learning_rate": 0.0014811535814228522, + "loss": 3.0003276824951173, + "num_input_tokens_seen": 6773800960, + "step": 12920, + "train_runtime": 58672.7881, + "train_tokens_per_second": 115450.47 + }, + { + "epoch": 0.6996942557969642, + "grad_norm": 0.13273653388023376, + "learning_rate": 0.0014779089195370515, + "loss": 3.006727600097656, + "num_input_tokens_seen": 6779043840, + "step": 12930, + "train_runtime": 58718.0154, + "train_tokens_per_second": 115450.834 + }, + { + "epoch": 0.7002353959793285, + "grad_norm": 0.13412410020828247, + "learning_rate": 0.0014746681412867993, + "loss": 2.9990608215332033, + "num_input_tokens_seen": 6784286720, + "step": 12940, + "train_runtime": 58763.2242, + "train_tokens_per_second": 115451.234 + }, + { + "epoch": 0.7007765361616927, + "grad_norm": 0.13567864894866943, + "learning_rate": 0.0014714312565660412, + "loss": 3.001424789428711, + "num_input_tokens_seen": 6789529600, + "step": 12950, + "train_runtime": 58808.4491, + "train_tokens_per_second": 115451.601 + }, + { + "epoch": 0.701317676344057, + "grad_norm": 0.12947793304920197, + "learning_rate": 0.0014681982752568368, + "loss": 2.9996448516845704, + "num_input_tokens_seen": 6794772480, + "step": 12960, + "train_runtime": 58853.6594, + "train_tokens_per_second": 115451.997 + }, + { + "epoch": 0.7018588165264211, + "grad_norm": 0.1319398730993271, + "learning_rate": 0.001464969207229331, + "loss": 3.0077224731445313, + "num_input_tokens_seen": 6800015360, + "step": 12970, + "train_runtime": 58898.8938, + "train_tokens_per_second": 115452.344 + }, + { + "epoch": 0.7023999567087854, + "grad_norm": 0.14026153087615967, + "learning_rate": 0.0014617440623417178, + "loss": 2.999114227294922, + "num_input_tokens_seen": 6805258240, + "step": 12980, + "train_runtime": 58948.6295, + "train_tokens_per_second": 115443.875 + }, + { + "epoch": 0.7029410968911497, + "grad_norm": 0.14495210349559784, + "learning_rate": 0.0014585228504402185, + "loss": 3.005875015258789, + "num_input_tokens_seen": 6810501120, + "step": 12990, + "train_runtime": 58994.0959, + "train_tokens_per_second": 115443.775 + }, + { + "epoch": 0.7034822370735139, + "grad_norm": 0.13643252849578857, + "learning_rate": 0.001455305581359043, + "loss": 2.997660255432129, + "num_input_tokens_seen": 6815744000, + "step": 13000, + "train_runtime": 59039.5206, + "train_tokens_per_second": 115443.756 + }, + { + "epoch": 0.7034822370735139, + "eval_loss": 2.960465669631958, + "eval_runtime": 1.987, + "eval_samples_per_second": 251.641, + "eval_steps_per_second": 4.026, + "num_input_tokens_seen": 6815744000, + "step": 13000 + }, + { + "epoch": 0.7040233772558782, + "grad_norm": 0.130798801779747, + "learning_rate": 0.001452092264920367, + "loss": 3.0002573013305662, + "num_input_tokens_seen": 6820986880, + "step": 13010, + "train_runtime": 59089.599, + "train_tokens_per_second": 115434.645 + }, + { + "epoch": 0.7045645174382423, + "grad_norm": 0.13077320158481598, + "learning_rate": 0.001448882910934297, + "loss": 3.00850830078125, + "num_input_tokens_seen": 6826229760, + "step": 13020, + "train_runtime": 59135.0207, + "train_tokens_per_second": 115434.639 + }, + { + "epoch": 0.7051056576206066, + "grad_norm": 0.14131614565849304, + "learning_rate": 0.0014456775291988434, + "loss": 3.0077110290527345, + "num_input_tokens_seen": 6831472640, + "step": 13030, + "train_runtime": 59180.4577, + "train_tokens_per_second": 115434.603 + }, + { + "epoch": 0.7056467978029709, + "grad_norm": 0.13815636932849884, + "learning_rate": 0.0014424761294998883, + "loss": 3.00131778717041, + "num_input_tokens_seen": 6836715520, + "step": 13040, + "train_runtime": 59225.9087, + "train_tokens_per_second": 115434.54 + }, + { + "epoch": 0.7061879379853351, + "grad_norm": 0.1329071819782257, + "learning_rate": 0.0014392787216111597, + "loss": 2.994339370727539, + "num_input_tokens_seen": 6841958400, + "step": 13050, + "train_runtime": 59271.3336, + "train_tokens_per_second": 115434.528 + }, + { + "epoch": 0.7067290781676994, + "grad_norm": 0.13561072945594788, + "learning_rate": 0.0014360853152941958, + "loss": 3.0034358978271483, + "num_input_tokens_seen": 6847201280, + "step": 13060, + "train_runtime": 59316.7359, + "train_tokens_per_second": 115434.56 + }, + { + "epoch": 0.7072702183500635, + "grad_norm": 0.13618333637714386, + "learning_rate": 0.0014328959202983182, + "loss": 3.0087270736694336, + "num_input_tokens_seen": 6852444160, + "step": 13070, + "train_runtime": 59362.09, + "train_tokens_per_second": 115434.685 + }, + { + "epoch": 0.7078113585324278, + "grad_norm": 0.1365492194890976, + "learning_rate": 0.0014297105463606044, + "loss": 3.0061859130859374, + "num_input_tokens_seen": 6857687040, + "step": 13080, + "train_runtime": 59407.4452, + "train_tokens_per_second": 115434.808 + }, + { + "epoch": 0.7083524987147921, + "grad_norm": 0.13774985074996948, + "learning_rate": 0.001426529203205853, + "loss": 3.010288429260254, + "num_input_tokens_seen": 6862929920, + "step": 13090, + "train_runtime": 59452.8193, + "train_tokens_per_second": 115434.894 + }, + { + "epoch": 0.7088936388971563, + "grad_norm": 0.1349509209394455, + "learning_rate": 0.00142335190054656, + "loss": 3.000904846191406, + "num_input_tokens_seen": 6868172800, + "step": 13100, + "train_runtime": 59498.1377, + "train_tokens_per_second": 115435.089 + }, + { + "epoch": 0.7094347790795206, + "grad_norm": 0.1314682513475418, + "learning_rate": 0.0014201786480828838, + "loss": 3.0022382736206055, + "num_input_tokens_seen": 6873415680, + "step": 13110, + "train_runtime": 59543.4355, + "train_tokens_per_second": 115435.322 + }, + { + "epoch": 0.7099759192618847, + "grad_norm": 0.14362597465515137, + "learning_rate": 0.0014170094555026182, + "loss": 2.9901851654052733, + "num_input_tokens_seen": 6878658560, + "step": 13120, + "train_runtime": 59588.6836, + "train_tokens_per_second": 115435.652 + }, + { + "epoch": 0.710517059444249, + "grad_norm": 0.13301101326942444, + "learning_rate": 0.0014138443324811618, + "loss": 3.0021732330322264, + "num_input_tokens_seen": 6883901440, + "step": 13130, + "train_runtime": 59633.9351, + "train_tokens_per_second": 115435.975 + }, + { + "epoch": 0.7110581996266133, + "grad_norm": 0.13076400756835938, + "learning_rate": 0.0014106832886814891, + "loss": 3.0049604415893554, + "num_input_tokens_seen": 6889144320, + "step": 13140, + "train_runtime": 59679.1572, + "train_tokens_per_second": 115436.354 + }, + { + "epoch": 0.7115993398089775, + "grad_norm": 0.13057680428028107, + "learning_rate": 0.0014075263337541223, + "loss": 3.009153938293457, + "num_input_tokens_seen": 6894387200, + "step": 13150, + "train_runtime": 59724.3952, + "train_tokens_per_second": 115436.702 + }, + { + "epoch": 0.7121404799913418, + "grad_norm": 0.13498692214488983, + "learning_rate": 0.0014043734773370997, + "loss": 2.996112060546875, + "num_input_tokens_seen": 6899630080, + "step": 13160, + "train_runtime": 59769.5992, + "train_tokens_per_second": 115437.115 + }, + { + "epoch": 0.712681620173706, + "grad_norm": 0.13407272100448608, + "learning_rate": 0.0014012247290559466, + "loss": 3.0008213043212892, + "num_input_tokens_seen": 6904872960, + "step": 13170, + "train_runtime": 59814.8054, + "train_tokens_per_second": 115437.523 + }, + { + "epoch": 0.7132227603560702, + "grad_norm": 0.14042150974273682, + "learning_rate": 0.0013980800985236468, + "loss": 2.9953586578369142, + "num_input_tokens_seen": 6910115840, + "step": 13180, + "train_runtime": 59859.9779, + "train_tokens_per_second": 115437.995 + }, + { + "epoch": 0.7137639005384345, + "grad_norm": 0.13807494938373566, + "learning_rate": 0.0013949395953406127, + "loss": 2.9886444091796873, + "num_input_tokens_seen": 6915358720, + "step": 13190, + "train_runtime": 59905.1537, + "train_tokens_per_second": 115438.461 + }, + { + "epoch": 0.7143050407207987, + "grad_norm": 0.13666392862796783, + "learning_rate": 0.0013918032290946552, + "loss": 3.0074825286865234, + "num_input_tokens_seen": 6920601600, + "step": 13200, + "train_runtime": 59950.322, + "train_tokens_per_second": 115438.94 + }, + { + "epoch": 0.714846180903163, + "grad_norm": 0.12777790427207947, + "learning_rate": 0.0013886710093609566, + "loss": 2.9995635986328124, + "num_input_tokens_seen": 6925844480, + "step": 13210, + "train_runtime": 59995.4811, + "train_tokens_per_second": 115439.436 + }, + { + "epoch": 0.7153873210855272, + "grad_norm": 0.13057056069374084, + "learning_rate": 0.0013855429457020408, + "loss": 2.993345260620117, + "num_input_tokens_seen": 6931087360, + "step": 13220, + "train_runtime": 60040.6669, + "train_tokens_per_second": 115439.88 + }, + { + "epoch": 0.7159284612678914, + "grad_norm": 0.13309696316719055, + "learning_rate": 0.0013824190476677417, + "loss": 2.9962528228759764, + "num_input_tokens_seen": 6936330240, + "step": 13230, + "train_runtime": 60085.8338, + "train_tokens_per_second": 115440.359 + }, + { + "epoch": 0.7164696014502557, + "grad_norm": 0.13253308832645416, + "learning_rate": 0.0013792993247951752, + "loss": 3.001760482788086, + "num_input_tokens_seen": 6941573120, + "step": 13240, + "train_runtime": 60130.9838, + "train_tokens_per_second": 115440.871 + }, + { + "epoch": 0.7170107416326199, + "grad_norm": 0.14509917795658112, + "learning_rate": 0.001376183786608712, + "loss": 2.999083137512207, + "num_input_tokens_seen": 6946816000, + "step": 13250, + "train_runtime": 60176.1243, + "train_tokens_per_second": 115441.399 + }, + { + "epoch": 0.7175518818149842, + "grad_norm": 0.13013510406017303, + "learning_rate": 0.001373072442619947, + "loss": 3.0021896362304688, + "num_input_tokens_seen": 6952058880, + "step": 13260, + "train_runtime": 60221.2777, + "train_tokens_per_second": 115441.903 + }, + { + "epoch": 0.7180930219973484, + "grad_norm": 0.1433565616607666, + "learning_rate": 0.0013699653023276715, + "loss": 2.999072265625, + "num_input_tokens_seen": 6957301760, + "step": 13270, + "train_runtime": 60266.4098, + "train_tokens_per_second": 115442.446 + }, + { + "epoch": 0.7186341621797127, + "grad_norm": 0.13696636259555817, + "learning_rate": 0.0013668623752178402, + "loss": 2.991237258911133, + "num_input_tokens_seen": 6962544640, + "step": 13280, + "train_runtime": 60311.5633, + "train_tokens_per_second": 115442.948 + }, + { + "epoch": 0.7191753023620769, + "grad_norm": 0.134785458445549, + "learning_rate": 0.0013637636707635485, + "loss": 3.002344512939453, + "num_input_tokens_seen": 6967787520, + "step": 13290, + "train_runtime": 60356.7015, + "train_tokens_per_second": 115443.478 + }, + { + "epoch": 0.7197164425444411, + "grad_norm": 0.13965272903442383, + "learning_rate": 0.0013606691984249973, + "loss": 2.9921356201171876, + "num_input_tokens_seen": 6973030400, + "step": 13300, + "train_runtime": 60401.8497, + "train_tokens_per_second": 115443.988 + }, + { + "epoch": 0.7202575827268054, + "grad_norm": 0.1369258165359497, + "learning_rate": 0.0013575789676494676, + "loss": 2.9890642166137695, + "num_input_tokens_seen": 6978273280, + "step": 13310, + "train_runtime": 60447.02, + "train_tokens_per_second": 115444.455 + }, + { + "epoch": 0.7207987229091696, + "grad_norm": 0.1361692249774933, + "learning_rate": 0.0013544929878712931, + "loss": 3.0067501068115234, + "num_input_tokens_seen": 6983516160, + "step": 13320, + "train_runtime": 60492.1531, + "train_tokens_per_second": 115444.993 + }, + { + "epoch": 0.7213398630915339, + "grad_norm": 0.13645213842391968, + "learning_rate": 0.0013514112685118279, + "loss": 2.99460506439209, + "num_input_tokens_seen": 6988759040, + "step": 13330, + "train_runtime": 60537.2701, + "train_tokens_per_second": 115445.56 + }, + { + "epoch": 0.7218810032738981, + "grad_norm": 0.13640370965003967, + "learning_rate": 0.0013483338189794198, + "loss": 3.0064407348632813, + "num_input_tokens_seen": 6994001920, + "step": 13340, + "train_runtime": 60582.4237, + "train_tokens_per_second": 115446.057 + }, + { + "epoch": 0.7224221434562623, + "grad_norm": 0.13847370445728302, + "learning_rate": 0.0013452606486693793, + "loss": 2.990389823913574, + "num_input_tokens_seen": 6999244800, + "step": 13350, + "train_runtime": 60627.5832, + "train_tokens_per_second": 115446.542 + }, + { + "epoch": 0.7229632836386266, + "grad_norm": 0.14565610885620117, + "learning_rate": 0.001342191766963955, + "loss": 2.9985219955444338, + "num_input_tokens_seen": 7004487680, + "step": 13360, + "train_runtime": 60676.4805, + "train_tokens_per_second": 115439.914 + }, + { + "epoch": 0.7235044238209908, + "grad_norm": 0.13583402335643768, + "learning_rate": 0.0013391271832323016, + "loss": 3.000563049316406, + "num_input_tokens_seen": 7009730560, + "step": 13370, + "train_runtime": 60721.6176, + "train_tokens_per_second": 115440.445 + }, + { + "epoch": 0.7240455640033551, + "grad_norm": 0.13164934515953064, + "learning_rate": 0.0013360669068304526, + "loss": 2.993762969970703, + "num_input_tokens_seen": 7014973440, + "step": 13380, + "train_runtime": 60766.7453, + "train_tokens_per_second": 115440.993 + }, + { + "epoch": 0.7245867041857194, + "grad_norm": 0.13159868121147156, + "learning_rate": 0.001333010947101289, + "loss": 2.9905731201171877, + "num_input_tokens_seen": 7020216320, + "step": 13390, + "train_runtime": 60811.8966, + "train_tokens_per_second": 115441.496 + }, + { + "epoch": 0.7251278443680835, + "grad_norm": 0.1346818059682846, + "learning_rate": 0.001329959313374518, + "loss": 3.002712631225586, + "num_input_tokens_seen": 7025459200, + "step": 13400, + "train_runtime": 60857.0386, + "train_tokens_per_second": 115442.016 + }, + { + "epoch": 0.7256689845504478, + "grad_norm": 0.1322467029094696, + "learning_rate": 0.0013269120149666353, + "loss": 2.9997226715087892, + "num_input_tokens_seen": 7030702080, + "step": 13410, + "train_runtime": 60902.1814, + "train_tokens_per_second": 115442.533 + }, + { + "epoch": 0.726210124732812, + "grad_norm": 0.13496780395507812, + "learning_rate": 0.0013238690611809029, + "loss": 3.00130615234375, + "num_input_tokens_seen": 7035944960, + "step": 13420, + "train_runtime": 60947.3114, + "train_tokens_per_second": 115443.074 + }, + { + "epoch": 0.7267512649151763, + "grad_norm": 0.13476966321468353, + "learning_rate": 0.0013208304613073197, + "loss": 2.9966285705566404, + "num_input_tokens_seen": 7041187840, + "step": 13430, + "train_runtime": 60992.4581, + "train_tokens_per_second": 115443.582 + }, + { + "epoch": 0.7272924050975406, + "grad_norm": 0.13049598038196564, + "learning_rate": 0.0013177962246225905, + "loss": 3.0012109756469725, + "num_input_tokens_seen": 7046430720, + "step": 13440, + "train_runtime": 61037.614, + "train_tokens_per_second": 115444.072 + }, + { + "epoch": 0.7278335452799047, + "grad_norm": 0.1286519169807434, + "learning_rate": 0.0013147663603901006, + "loss": 2.9998191833496093, + "num_input_tokens_seen": 7051673600, + "step": 13450, + "train_runtime": 61082.7378, + "train_tokens_per_second": 115444.622 + }, + { + "epoch": 0.728374685462269, + "grad_norm": 0.13326317071914673, + "learning_rate": 0.0013117408778598853, + "loss": 2.980904769897461, + "num_input_tokens_seen": 7056916480, + "step": 13460, + "train_runtime": 61127.8727, + "train_tokens_per_second": 115445.151 + }, + { + "epoch": 0.7289158256446332, + "grad_norm": 0.13441520929336548, + "learning_rate": 0.001308719786268604, + "loss": 3.0028324127197266, + "num_input_tokens_seen": 7062159360, + "step": 13470, + "train_runtime": 61173.0008, + "train_tokens_per_second": 115445.691 + }, + { + "epoch": 0.7294569658269975, + "grad_norm": 0.13160498440265656, + "learning_rate": 0.0013057030948395115, + "loss": 2.990519332885742, + "num_input_tokens_seen": 7067402240, + "step": 13480, + "train_runtime": 61218.1024, + "train_tokens_per_second": 115446.281 + }, + { + "epoch": 0.7299981060093618, + "grad_norm": 0.13775858283042908, + "learning_rate": 0.001302690812782427, + "loss": 3.006916046142578, + "num_input_tokens_seen": 7072645120, + "step": 13490, + "train_runtime": 61263.2414, + "train_tokens_per_second": 115446.799 + }, + { + "epoch": 0.7305392461917259, + "grad_norm": 0.13651160895824432, + "learning_rate": 0.0012996829492937084, + "loss": 3.000609016418457, + "num_input_tokens_seen": 7077888000, + "step": 13500, + "train_runtime": 61308.388, + "train_tokens_per_second": 115447.302 + }, + { + "epoch": 0.7305392461917259, + "eval_loss": 2.9539315700531006, + "eval_runtime": 1.9872, + "eval_samples_per_second": 251.611, + "eval_steps_per_second": 4.026, + "num_input_tokens_seen": 7077888000, + "step": 13500 + }, + { + "epoch": 0.7310803863740902, + "grad_norm": 0.1339404284954071, + "learning_rate": 0.001296679513556226, + "loss": 2.9880565643310546, + "num_input_tokens_seen": 7083130880, + "step": 13510, + "train_runtime": 61355.5007, + "train_tokens_per_second": 115444.105 + }, + { + "epoch": 0.7316215265564544, + "grad_norm": 0.1354180872440338, + "learning_rate": 0.0012936805147393292, + "loss": 2.9919578552246096, + "num_input_tokens_seen": 7088373760, + "step": 13520, + "train_runtime": 61400.641, + "train_tokens_per_second": 115444.622 + }, + { + "epoch": 0.7321626667388187, + "grad_norm": 0.13503789901733398, + "learning_rate": 0.0012906859619988247, + "loss": 2.99132080078125, + "num_input_tokens_seen": 7093616640, + "step": 13530, + "train_runtime": 61445.7513, + "train_tokens_per_second": 115445.193 + }, + { + "epoch": 0.732703806921183, + "grad_norm": 0.13498766720294952, + "learning_rate": 0.0012876958644769446, + "loss": 2.9880552291870117, + "num_input_tokens_seen": 7098859520, + "step": 13540, + "train_runtime": 61490.8935, + "train_tokens_per_second": 115445.704 + }, + { + "epoch": 0.7332449471035472, + "grad_norm": 0.13910213112831116, + "learning_rate": 0.0012847102313023185, + "loss": 2.996448516845703, + "num_input_tokens_seen": 7104102400, + "step": 13550, + "train_runtime": 61536.0395, + "train_tokens_per_second": 115446.208 + }, + { + "epoch": 0.7337860872859114, + "grad_norm": 0.13978877663612366, + "learning_rate": 0.0012817290715899468, + "loss": 2.9948408126831056, + "num_input_tokens_seen": 7109345280, + "step": 13560, + "train_runtime": 61581.1749, + "train_tokens_per_second": 115446.73 + }, + { + "epoch": 0.7343272274682756, + "grad_norm": 0.12929198145866394, + "learning_rate": 0.0012787523944411728, + "loss": 2.990352821350098, + "num_input_tokens_seen": 7114588160, + "step": 13570, + "train_runtime": 61626.3208, + "train_tokens_per_second": 115447.232 + }, + { + "epoch": 0.7348683676506399, + "grad_norm": 0.12884965538978577, + "learning_rate": 0.001275780208943655, + "loss": 2.9938125610351562, + "num_input_tokens_seen": 7119831040, + "step": 13580, + "train_runtime": 61671.467, + "train_tokens_per_second": 115447.733 + }, + { + "epoch": 0.7354095078330042, + "grad_norm": 0.13231875002384186, + "learning_rate": 0.0012728125241713403, + "loss": 2.9899265289306642, + "num_input_tokens_seen": 7125073920, + "step": 13590, + "train_runtime": 61716.5949, + "train_tokens_per_second": 115448.267 + }, + { + "epoch": 0.7359506480153684, + "grad_norm": 0.13000380992889404, + "learning_rate": 0.001269849349184432, + "loss": 2.997477722167969, + "num_input_tokens_seen": 7130316800, + "step": 13600, + "train_runtime": 61761.7628, + "train_tokens_per_second": 115448.725 + }, + { + "epoch": 0.7364917881977326, + "grad_norm": 0.13756293058395386, + "learning_rate": 0.0012668906930293686, + "loss": 2.9921825408935545, + "num_input_tokens_seen": 7135559680, + "step": 13610, + "train_runtime": 61806.8862, + "train_tokens_per_second": 115449.266 + }, + { + "epoch": 0.7370329283800968, + "grad_norm": 0.134871244430542, + "learning_rate": 0.0012639365647387907, + "loss": 2.991608238220215, + "num_input_tokens_seen": 7140802560, + "step": 13620, + "train_runtime": 61852.0353, + "train_tokens_per_second": 115449.759 + }, + { + "epoch": 0.7375740685624611, + "grad_norm": 0.13307398557662964, + "learning_rate": 0.0012609869733315145, + "loss": 2.994303512573242, + "num_input_tokens_seen": 7146045440, + "step": 13630, + "train_runtime": 61897.1942, + "train_tokens_per_second": 115450.232 + }, + { + "epoch": 0.7381152087448254, + "grad_norm": 0.1326708197593689, + "learning_rate": 0.0012580419278125086, + "loss": 2.9904823303222656, + "num_input_tokens_seen": 7151288320, + "step": 13640, + "train_runtime": 61942.3523, + "train_tokens_per_second": 115450.706 + }, + { + "epoch": 0.7386563489271896, + "grad_norm": 0.13145731389522552, + "learning_rate": 0.0012551014371728615, + "loss": 2.991769790649414, + "num_input_tokens_seen": 7156531200, + "step": 13650, + "train_runtime": 61987.491, + "train_tokens_per_second": 115451.216 + }, + { + "epoch": 0.7391974891095539, + "grad_norm": 0.13033975660800934, + "learning_rate": 0.0012521655103897556, + "loss": 2.9962963104248046, + "num_input_tokens_seen": 7161774080, + "step": 13660, + "train_runtime": 62032.6128, + "train_tokens_per_second": 115451.756 + }, + { + "epoch": 0.739738629291918, + "grad_norm": 0.13624544441699982, + "learning_rate": 0.0012492341564264394, + "loss": 2.9916343688964844, + "num_input_tokens_seen": 7167016960, + "step": 13670, + "train_runtime": 62077.7496, + "train_tokens_per_second": 115452.268 + }, + { + "epoch": 0.7402797694742823, + "grad_norm": 0.12694226205348969, + "learning_rate": 0.0012463073842322032, + "loss": 2.9956790924072267, + "num_input_tokens_seen": 7172259840, + "step": 13680, + "train_runtime": 62122.8901, + "train_tokens_per_second": 115452.772 + }, + { + "epoch": 0.7408209096566466, + "grad_norm": 0.14218159019947052, + "learning_rate": 0.0012433852027423462, + "loss": 2.9924745559692383, + "num_input_tokens_seen": 7177502720, + "step": 13690, + "train_runtime": 62168.0831, + "train_tokens_per_second": 115453.177 + }, + { + "epoch": 0.7413620498390108, + "grad_norm": 0.13965629041194916, + "learning_rate": 0.0012404676208781556, + "loss": 2.9898683547973635, + "num_input_tokens_seen": 7182745600, + "step": 13700, + "train_runtime": 62213.3158, + "train_tokens_per_second": 115453.509 + }, + { + "epoch": 0.7419031900213751, + "grad_norm": 0.13439473509788513, + "learning_rate": 0.0012375546475468736, + "loss": 2.99302978515625, + "num_input_tokens_seen": 7187988480, + "step": 13710, + "train_runtime": 62258.5518, + "train_tokens_per_second": 115453.834 + }, + { + "epoch": 0.7424443302037392, + "grad_norm": 0.13322672247886658, + "learning_rate": 0.0012346462916416746, + "loss": 2.9867807388305665, + "num_input_tokens_seen": 7193231360, + "step": 13720, + "train_runtime": 62303.7184, + "train_tokens_per_second": 115454.287 + }, + { + "epoch": 0.7429854703861035, + "grad_norm": 0.13469451665878296, + "learning_rate": 0.001231742562041635, + "loss": 2.9933212280273436, + "num_input_tokens_seen": 7198474240, + "step": 13730, + "train_runtime": 62348.8665, + "train_tokens_per_second": 115454.773 + }, + { + "epoch": 0.7435266105684678, + "grad_norm": 0.1325179785490036, + "learning_rate": 0.001228843467611706, + "loss": 2.9945892333984374, + "num_input_tokens_seen": 7203717120, + "step": 13740, + "train_runtime": 62397.9384, + "train_tokens_per_second": 115447.999 + }, + { + "epoch": 0.744067750750832, + "grad_norm": 0.1386304348707199, + "learning_rate": 0.0012259490172026927, + "loss": 2.989889907836914, + "num_input_tokens_seen": 7208960000, + "step": 13750, + "train_runtime": 62443.1321, + "train_tokens_per_second": 115448.405 + }, + { + "epoch": 0.7446088909331963, + "grad_norm": 0.13061648607254028, + "learning_rate": 0.0012230592196512174, + "loss": 2.986536407470703, + "num_input_tokens_seen": 7214202880, + "step": 13760, + "train_runtime": 62488.3343, + "train_tokens_per_second": 115448.795 + }, + { + "epoch": 0.7451500311155604, + "grad_norm": 0.12978407740592957, + "learning_rate": 0.0012201740837796992, + "loss": 2.9931753158569334, + "num_input_tokens_seen": 7219445760, + "step": 13770, + "train_runtime": 62533.544, + "train_tokens_per_second": 115449.17 + }, + { + "epoch": 0.7456911712979247, + "grad_norm": 0.12974348664283752, + "learning_rate": 0.0012172936183963243, + "loss": 2.98385009765625, + "num_input_tokens_seen": 7224688640, + "step": 13780, + "train_runtime": 62578.7317, + "train_tokens_per_second": 115449.586 + }, + { + "epoch": 0.746232311480289, + "grad_norm": 0.1361524909734726, + "learning_rate": 0.0012144178322950217, + "loss": 2.996071624755859, + "num_input_tokens_seen": 7229931520, + "step": 13790, + "train_runtime": 62623.945, + "train_tokens_per_second": 115449.953 + }, + { + "epoch": 0.7467734516626532, + "grad_norm": 0.12753413617610931, + "learning_rate": 0.0012115467342554353, + "loss": 2.989743232727051, + "num_input_tokens_seen": 7235174400, + "step": 13800, + "train_runtime": 62669.1454, + "train_tokens_per_second": 115450.344 + }, + { + "epoch": 0.7473145918450175, + "grad_norm": 0.1313578486442566, + "learning_rate": 0.0012086803330428942, + "loss": 2.9922863006591798, + "num_input_tokens_seen": 7240417280, + "step": 13810, + "train_runtime": 62714.3608, + "train_tokens_per_second": 115450.707 + }, + { + "epoch": 0.7478557320273816, + "grad_norm": 0.13242116570472717, + "learning_rate": 0.0012058186374083889, + "loss": 2.9887691497802735, + "num_input_tokens_seen": 7245660160, + "step": 13820, + "train_runtime": 62759.5959, + "train_tokens_per_second": 115451.033 + }, + { + "epoch": 0.7483968722097459, + "grad_norm": 0.1344103366136551, + "learning_rate": 0.0012029616560885453, + "loss": 2.989380645751953, + "num_input_tokens_seen": 7250903040, + "step": 13830, + "train_runtime": 62804.8179, + "train_tokens_per_second": 115451.382 + }, + { + "epoch": 0.7489380123921102, + "grad_norm": 0.13286016881465912, + "learning_rate": 0.001200109397805595, + "loss": 2.9872367858886717, + "num_input_tokens_seen": 7256145920, + "step": 13840, + "train_runtime": 62850.0273, + "train_tokens_per_second": 115451.754 + }, + { + "epoch": 0.7494791525744744, + "grad_norm": 0.13758355379104614, + "learning_rate": 0.0011972618712673526, + "loss": 2.9894548416137696, + "num_input_tokens_seen": 7261388800, + "step": 13850, + "train_runtime": 62895.244, + "train_tokens_per_second": 115452.113 + }, + { + "epoch": 0.7500202927568387, + "grad_norm": 0.13310939073562622, + "learning_rate": 0.0011944190851671855, + "loss": 2.980154800415039, + "num_input_tokens_seen": 7266631680, + "step": 13860, + "train_runtime": 62940.4589, + "train_tokens_per_second": 115452.474 + }, + { + "epoch": 0.7505614329392029, + "grad_norm": 0.13724195957183838, + "learning_rate": 0.0011915810481839884, + "loss": 2.9957542419433594, + "num_input_tokens_seen": 7271874560, + "step": 13870, + "train_runtime": 62985.6674, + "train_tokens_per_second": 115452.846 + }, + { + "epoch": 0.7511025731215671, + "grad_norm": 0.13776428997516632, + "learning_rate": 0.0011887477689821579, + "loss": 2.9919281005859375, + "num_input_tokens_seen": 7277117440, + "step": 13880, + "train_runtime": 63030.8734, + "train_tokens_per_second": 115453.222 + }, + { + "epoch": 0.7516437133039314, + "grad_norm": 0.13441872596740723, + "learning_rate": 0.001185919256211564, + "loss": 2.9903282165527343, + "num_input_tokens_seen": 7282360320, + "step": 13890, + "train_runtime": 63076.0694, + "train_tokens_per_second": 115453.616 + }, + { + "epoch": 0.7521848534862956, + "grad_norm": 0.14160217344760895, + "learning_rate": 0.001183095518507527, + "loss": 2.9950998306274412, + "num_input_tokens_seen": 7287603200, + "step": 13900, + "train_runtime": 63121.2819, + "train_tokens_per_second": 115453.98 + }, + { + "epoch": 0.7527259936686599, + "grad_norm": 0.13321471214294434, + "learning_rate": 0.001180276564490789, + "loss": 2.9867202758789064, + "num_input_tokens_seen": 7292846080, + "step": 13910, + "train_runtime": 63166.4818, + "train_tokens_per_second": 115454.366 + }, + { + "epoch": 0.7532671338510241, + "grad_norm": 0.13260754942893982, + "learning_rate": 0.001177462402767485, + "loss": 2.9936323165893555, + "num_input_tokens_seen": 7298088960, + "step": 13920, + "train_runtime": 63211.6992, + "train_tokens_per_second": 115454.719 + }, + { + "epoch": 0.7538082740333883, + "grad_norm": 0.13385504484176636, + "learning_rate": 0.0011746530419291235, + "loss": 2.9826412200927734, + "num_input_tokens_seen": 7303331840, + "step": 13930, + "train_runtime": 63256.8908, + "train_tokens_per_second": 115455.119 + }, + { + "epoch": 0.7543494142157526, + "grad_norm": 0.1354595571756363, + "learning_rate": 0.0011718484905525526, + "loss": 2.9921710968017576, + "num_input_tokens_seen": 7308574720, + "step": 13940, + "train_runtime": 63302.0738, + "train_tokens_per_second": 115455.534 + }, + { + "epoch": 0.7548905543981168, + "grad_norm": 0.13242025673389435, + "learning_rate": 0.0011690487571999377, + "loss": 2.9915000915527346, + "num_input_tokens_seen": 7313817600, + "step": 13950, + "train_runtime": 63347.2678, + "train_tokens_per_second": 115455.928 + }, + { + "epoch": 0.7554316945804811, + "grad_norm": 0.1303345412015915, + "learning_rate": 0.0011662538504187375, + "loss": 2.992412567138672, + "num_input_tokens_seen": 7319060480, + "step": 13960, + "train_runtime": 63392.4687, + "train_tokens_per_second": 115456.309 + }, + { + "epoch": 0.7559728347628453, + "grad_norm": 0.1336052417755127, + "learning_rate": 0.0011634637787416738, + "loss": 2.9856544494628907, + "num_input_tokens_seen": 7324303360, + "step": 13970, + "train_runtime": 63437.6413, + "train_tokens_per_second": 115456.742 + }, + { + "epoch": 0.7565139749452096, + "grad_norm": 0.13160865008831024, + "learning_rate": 0.0011606785506867066, + "loss": 2.990740966796875, + "num_input_tokens_seen": 7329546240, + "step": 13980, + "train_runtime": 63482.8312, + "train_tokens_per_second": 115457.142 + }, + { + "epoch": 0.7570551151275738, + "grad_norm": 0.132036030292511, + "learning_rate": 0.0011578981747570086, + "loss": 2.9869890213012695, + "num_input_tokens_seen": 7334789120, + "step": 13990, + "train_runtime": 63528.0172, + "train_tokens_per_second": 115457.548 + }, + { + "epoch": 0.757596255309938, + "grad_norm": 0.13680653274059296, + "learning_rate": 0.0011551226594409406, + "loss": 2.9875946044921875, + "num_input_tokens_seen": 7340032000, + "step": 14000, + "train_runtime": 63573.1915, + "train_tokens_per_second": 115457.976 + }, + { + "epoch": 0.757596255309938, + "eval_loss": 2.948127031326294, + "eval_runtime": 1.9851, + "eval_samples_per_second": 251.872, + "eval_steps_per_second": 4.03, + "num_input_tokens_seen": 7340032000, + "step": 14000 + }, + { + "epoch": 0.7581373954923023, + "grad_norm": 0.1333727240562439, + "learning_rate": 0.0011523520132120217, + "loss": 2.9936281204223634, + "num_input_tokens_seen": 7345274880, + "step": 14010, + "train_runtime": 63622.81, + "train_tokens_per_second": 115450.337 + }, + { + "epoch": 0.7586785356746665, + "grad_norm": 0.13183613121509552, + "learning_rate": 0.0011495862445289092, + "loss": 2.9838493347167967, + "num_input_tokens_seen": 7350517760, + "step": 14020, + "train_runtime": 63667.9625, + "train_tokens_per_second": 115450.809 + }, + { + "epoch": 0.7592196758570308, + "grad_norm": 0.13663019239902496, + "learning_rate": 0.0011468253618353661, + "loss": 2.9881641387939455, + "num_input_tokens_seen": 7355760640, + "step": 14030, + "train_runtime": 63713.121, + "train_tokens_per_second": 115451.269 + }, + { + "epoch": 0.759760816039395, + "grad_norm": 0.1334005743265152, + "learning_rate": 0.0011440693735602413, + "loss": 2.9827747344970703, + "num_input_tokens_seen": 7361003520, + "step": 14040, + "train_runtime": 63758.2642, + "train_tokens_per_second": 115451.755 + }, + { + "epoch": 0.7603019562217592, + "grad_norm": 0.1363915055990219, + "learning_rate": 0.0011413182881174402, + "loss": 2.976375961303711, + "num_input_tokens_seen": 7366246400, + "step": 14050, + "train_runtime": 63803.3929, + "train_tokens_per_second": 115452.268 + }, + { + "epoch": 0.7608430964041235, + "grad_norm": 0.13721340894699097, + "learning_rate": 0.0011385721139058986, + "loss": 3.0018871307373045, + "num_input_tokens_seen": 7371489280, + "step": 14060, + "train_runtime": 63848.5329, + "train_tokens_per_second": 115452.759 + }, + { + "epoch": 0.7613842365864877, + "grad_norm": 0.13170303404331207, + "learning_rate": 0.0011358308593095617, + "loss": 2.9844949722290037, + "num_input_tokens_seen": 7376732160, + "step": 14070, + "train_runtime": 63893.6665, + "train_tokens_per_second": 115453.261 + }, + { + "epoch": 0.761925376768852, + "grad_norm": 0.13645039498806, + "learning_rate": 0.0011330945326973533, + "loss": 2.9850318908691404, + "num_input_tokens_seen": 7381975040, + "step": 14080, + "train_runtime": 63938.7823, + "train_tokens_per_second": 115453.795 + }, + { + "epoch": 0.7624665169512163, + "grad_norm": 0.1297563761472702, + "learning_rate": 0.0011303631424231526, + "loss": 2.9895225524902345, + "num_input_tokens_seen": 7387217920, + "step": 14090, + "train_runtime": 63983.9157, + "train_tokens_per_second": 115454.296 + }, + { + "epoch": 0.7630076571335804, + "grad_norm": 0.13698382675647736, + "learning_rate": 0.0011276366968257677, + "loss": 2.9852466583251953, + "num_input_tokens_seen": 7392460800, + "step": 14100, + "train_runtime": 64029.0446, + "train_tokens_per_second": 115454.804 + }, + { + "epoch": 0.7635487973159447, + "grad_norm": 0.12868466973304749, + "learning_rate": 0.001124915204228913, + "loss": 2.982627105712891, + "num_input_tokens_seen": 7397703680, + "step": 14110, + "train_runtime": 64074.169, + "train_tokens_per_second": 115455.32 + }, + { + "epoch": 0.7640899374983089, + "grad_norm": 0.13413524627685547, + "learning_rate": 0.0011221986729411787, + "loss": 2.982726287841797, + "num_input_tokens_seen": 7402946560, + "step": 14120, + "train_runtime": 64123.0569, + "train_tokens_per_second": 115449.059 + }, + { + "epoch": 0.7646310776806732, + "grad_norm": 0.13302487134933472, + "learning_rate": 0.0011194871112560113, + "loss": 2.9999317169189452, + "num_input_tokens_seen": 7408189440, + "step": 14130, + "train_runtime": 64168.1991, + "train_tokens_per_second": 115449.546 + }, + { + "epoch": 0.7651722178630375, + "grad_norm": 0.13595032691955566, + "learning_rate": 0.001116780527451682, + "loss": 2.986163330078125, + "num_input_tokens_seen": 7413432320, + "step": 14140, + "train_runtime": 64213.3563, + "train_tokens_per_second": 115450.005 + }, + { + "epoch": 0.7657133580454016, + "grad_norm": 0.12740519642829895, + "learning_rate": 0.0011140789297912688, + "loss": 2.9861852645874025, + "num_input_tokens_seen": 7418675200, + "step": 14150, + "train_runtime": 64258.4713, + "train_tokens_per_second": 115450.54 + }, + { + "epoch": 0.7662544982277659, + "grad_norm": 0.13032016158103943, + "learning_rate": 0.0011113823265226242, + "loss": 2.9914901733398436, + "num_input_tokens_seen": 7423918080, + "step": 14160, + "train_runtime": 64303.6051, + "train_tokens_per_second": 115451.04 + }, + { + "epoch": 0.7667956384101301, + "grad_norm": 0.12856240570545197, + "learning_rate": 0.0011086907258783525, + "loss": 2.99139404296875, + "num_input_tokens_seen": 7429160960, + "step": 14170, + "train_runtime": 64348.7292, + "train_tokens_per_second": 115451.557 + }, + { + "epoch": 0.7673367785924944, + "grad_norm": 0.1300676167011261, + "learning_rate": 0.001106004136075789, + "loss": 2.980759620666504, + "num_input_tokens_seen": 7434403840, + "step": 14180, + "train_runtime": 64393.8763, + "train_tokens_per_second": 115452.032 + }, + { + "epoch": 0.7678779187748587, + "grad_norm": 0.13340207934379578, + "learning_rate": 0.0011033225653169676, + "loss": 2.979547882080078, + "num_input_tokens_seen": 7439646720, + "step": 14190, + "train_runtime": 64439.0196, + "train_tokens_per_second": 115452.513 + }, + { + "epoch": 0.7684190589572228, + "grad_norm": 0.1270836591720581, + "learning_rate": 0.0011006460217886007, + "loss": 2.9818099975585937, + "num_input_tokens_seen": 7444889600, + "step": 14200, + "train_runtime": 64484.1553, + "train_tokens_per_second": 115453.006 + }, + { + "epoch": 0.7689601991395871, + "grad_norm": 0.1316118985414505, + "learning_rate": 0.001097974513662052, + "loss": 2.9830299377441407, + "num_input_tokens_seen": 7450132480, + "step": 14210, + "train_runtime": 64529.2695, + "train_tokens_per_second": 115453.538 + }, + { + "epoch": 0.7695013393219513, + "grad_norm": 0.13914352655410767, + "learning_rate": 0.0010953080490933129, + "loss": 2.9925983428955076, + "num_input_tokens_seen": 7455375360, + "step": 14220, + "train_runtime": 64574.3994, + "train_tokens_per_second": 115454.041 + }, + { + "epoch": 0.7700424795043156, + "grad_norm": 0.13164092600345612, + "learning_rate": 0.0010926466362229787, + "loss": 2.9863054275512697, + "num_input_tokens_seen": 7460618240, + "step": 14230, + "train_runtime": 64619.5117, + "train_tokens_per_second": 115454.575 + }, + { + "epoch": 0.7705836196866799, + "grad_norm": 0.1346326619386673, + "learning_rate": 0.001089990283176218, + "loss": 2.9905773162841798, + "num_input_tokens_seen": 7465861120, + "step": 14240, + "train_runtime": 64664.6395, + "train_tokens_per_second": 115455.08 + }, + { + "epoch": 0.771124759869044, + "grad_norm": 0.1283544898033142, + "learning_rate": 0.0010873389980627568, + "loss": 2.9964345932006835, + "num_input_tokens_seen": 7471104000, + "step": 14250, + "train_runtime": 64709.798, + "train_tokens_per_second": 115455.53 + }, + { + "epoch": 0.7716659000514083, + "grad_norm": 0.13457883894443512, + "learning_rate": 0.0010846927889768454, + "loss": 2.9865245819091797, + "num_input_tokens_seen": 7476346880, + "step": 14260, + "train_runtime": 64754.9357, + "train_tokens_per_second": 115456.016 + }, + { + "epoch": 0.7722070402337725, + "grad_norm": 0.13008961081504822, + "learning_rate": 0.0010820516639972377, + "loss": 2.9932941436767577, + "num_input_tokens_seen": 7481589760, + "step": 14270, + "train_runtime": 64800.0796, + "train_tokens_per_second": 115456.49 + }, + { + "epoch": 0.7727481804161368, + "grad_norm": 0.13576596975326538, + "learning_rate": 0.0010794156311871674, + "loss": 2.975057601928711, + "num_input_tokens_seen": 7486832640, + "step": 14280, + "train_runtime": 64845.2255, + "train_tokens_per_second": 115456.96 + }, + { + "epoch": 0.7732893205985011, + "grad_norm": 0.13501375913619995, + "learning_rate": 0.0010767846985943225, + "loss": 2.983927536010742, + "num_input_tokens_seen": 7492075520, + "step": 14290, + "train_runtime": 64890.3622, + "train_tokens_per_second": 115457.446 + }, + { + "epoch": 0.7738304607808653, + "grad_norm": 0.1284349411725998, + "learning_rate": 0.0010741588742508182, + "loss": 2.994318199157715, + "num_input_tokens_seen": 7497318400, + "step": 14300, + "train_runtime": 64935.5045, + "train_tokens_per_second": 115457.922 + }, + { + "epoch": 0.7743716009632295, + "grad_norm": 0.13406863808631897, + "learning_rate": 0.0010715381661731754, + "loss": 2.9812191009521483, + "num_input_tokens_seen": 7502561280, + "step": 14310, + "train_runtime": 64980.6813, + "train_tokens_per_second": 115458.335 + }, + { + "epoch": 0.7749127411455937, + "grad_norm": 0.1352129429578781, + "learning_rate": 0.0010689225823622948, + "loss": 2.9968055725097655, + "num_input_tokens_seen": 7507804160, + "step": 14320, + "train_runtime": 65025.8721, + "train_tokens_per_second": 115458.723 + }, + { + "epoch": 0.775453881327958, + "grad_norm": 0.13681240379810333, + "learning_rate": 0.0010663121308034337, + "loss": 2.984090805053711, + "num_input_tokens_seen": 7513047040, + "step": 14330, + "train_runtime": 65071.0195, + "train_tokens_per_second": 115459.188 + }, + { + "epoch": 0.7759950215103223, + "grad_norm": 0.12757869064807892, + "learning_rate": 0.0010637068194661817, + "loss": 2.9872867584228517, + "num_input_tokens_seen": 7518289920, + "step": 14340, + "train_runtime": 65116.166, + "train_tokens_per_second": 115459.653 + }, + { + "epoch": 0.7765361616926865, + "grad_norm": 0.1297658532857895, + "learning_rate": 0.0010611066563044331, + "loss": 2.987481689453125, + "num_input_tokens_seen": 7523532800, + "step": 14350, + "train_runtime": 65161.3132, + "train_tokens_per_second": 115460.116 + }, + { + "epoch": 0.7770773018750508, + "grad_norm": 0.13100814819335938, + "learning_rate": 0.0010585116492563672, + "loss": 2.984407424926758, + "num_input_tokens_seen": 7528775680, + "step": 14360, + "train_runtime": 65206.4518, + "train_tokens_per_second": 115460.594 + }, + { + "epoch": 0.7776184420574149, + "grad_norm": 0.13708344101905823, + "learning_rate": 0.0010559218062444215, + "loss": 2.9803342819213867, + "num_input_tokens_seen": 7534018560, + "step": 14370, + "train_runtime": 65251.6135, + "train_tokens_per_second": 115461.031 + }, + { + "epoch": 0.7781595822397792, + "grad_norm": 0.13270463049411774, + "learning_rate": 0.001053337135175266, + "loss": 2.9783748626708983, + "num_input_tokens_seen": 7539261440, + "step": 14380, + "train_runtime": 65296.782, + "train_tokens_per_second": 115461.455 + }, + { + "epoch": 0.7787007224221435, + "grad_norm": 0.1348678022623062, + "learning_rate": 0.001050757643939784, + "loss": 2.985927963256836, + "num_input_tokens_seen": 7544504320, + "step": 14390, + "train_runtime": 65341.9205, + "train_tokens_per_second": 115461.931 + }, + { + "epoch": 0.7792418626045077, + "grad_norm": 0.1359061747789383, + "learning_rate": 0.0010481833404130433, + "loss": 2.977262496948242, + "num_input_tokens_seen": 7549747200, + "step": 14400, + "train_runtime": 65387.0473, + "train_tokens_per_second": 115462.427 + }, + { + "epoch": 0.779783002786872, + "grad_norm": 0.13489292562007904, + "learning_rate": 0.0010456142324542742, + "loss": 2.9768039703369142, + "num_input_tokens_seen": 7554990080, + "step": 14410, + "train_runtime": 65432.1998, + "train_tokens_per_second": 115462.878 + }, + { + "epoch": 0.7803241429692361, + "grad_norm": 0.13529463112354279, + "learning_rate": 0.001043050327906844, + "loss": 2.992759132385254, + "num_input_tokens_seen": 7560232960, + "step": 14420, + "train_runtime": 65477.3624, + "train_tokens_per_second": 115463.31 + }, + { + "epoch": 0.7808652831516004, + "grad_norm": 0.13989658653736115, + "learning_rate": 0.0010404916345982372, + "loss": 2.9861518859863283, + "num_input_tokens_seen": 7565475840, + "step": 14430, + "train_runtime": 65522.5287, + "train_tokens_per_second": 115463.734 + }, + { + "epoch": 0.7814064233339647, + "grad_norm": 0.13800008594989777, + "learning_rate": 0.0010379381603400246, + "loss": 2.983747100830078, + "num_input_tokens_seen": 7570718720, + "step": 14440, + "train_runtime": 65567.6879, + "train_tokens_per_second": 115464.171 + }, + { + "epoch": 0.7819475635163289, + "grad_norm": 0.14410988986492157, + "learning_rate": 0.0010353899129278482, + "loss": 2.986704444885254, + "num_input_tokens_seen": 7575961600, + "step": 14450, + "train_runtime": 65612.8209, + "train_tokens_per_second": 115464.653 + }, + { + "epoch": 0.7824887036986932, + "grad_norm": 0.13409604132175446, + "learning_rate": 0.0010328469001413872, + "loss": 2.9869441986083984, + "num_input_tokens_seen": 7581204480, + "step": 14460, + "train_runtime": 65657.9605, + "train_tokens_per_second": 115465.123 + }, + { + "epoch": 0.7830298438810573, + "grad_norm": 0.13234242796897888, + "learning_rate": 0.0010303091297443453, + "loss": 2.9890289306640625, + "num_input_tokens_seen": 7586447360, + "step": 14470, + "train_runtime": 65703.0949, + "train_tokens_per_second": 115465.601 + }, + { + "epoch": 0.7835709840634216, + "grad_norm": 0.13398636877536774, + "learning_rate": 0.001027776609484418, + "loss": 2.9826473236083983, + "num_input_tokens_seen": 7591690240, + "step": 14480, + "train_runtime": 65748.2396, + "train_tokens_per_second": 115466.061 + }, + { + "epoch": 0.7841121242457859, + "grad_norm": 0.13305144011974335, + "learning_rate": 0.0010252493470932719, + "loss": 2.9864757537841795, + "num_input_tokens_seen": 7596933120, + "step": 14490, + "train_runtime": 65793.3795, + "train_tokens_per_second": 115466.528 + }, + { + "epoch": 0.7846532644281501, + "grad_norm": 0.13172990083694458, + "learning_rate": 0.0010227273502865237, + "loss": 2.9912540435791017, + "num_input_tokens_seen": 7602176000, + "step": 14500, + "train_runtime": 65842.395, + "train_tokens_per_second": 115460.199 + }, + { + "epoch": 0.7846532644281501, + "eval_loss": 2.9429469108581543, + "eval_runtime": 1.9893, + "eval_samples_per_second": 251.343, + "eval_steps_per_second": 4.021, + "num_input_tokens_seen": 7602176000, + "step": 14500 + }, + { + "epoch": 0.7851944046105144, + "grad_norm": 0.13013876974582672, + "learning_rate": 0.0010202106267637142, + "loss": 2.9870655059814455, + "num_input_tokens_seen": 7607418880, + "step": 14510, + "train_runtime": 65889.5594, + "train_tokens_per_second": 115457.122 + }, + { + "epoch": 0.7857355447928785, + "grad_norm": 0.14158159494400024, + "learning_rate": 0.001017699184208284, + "loss": 2.9855068206787108, + "num_input_tokens_seen": 7612661760, + "step": 14520, + "train_runtime": 65934.7235, + "train_tokens_per_second": 115457.552 + }, + { + "epoch": 0.7862766849752428, + "grad_norm": 0.12904150784015656, + "learning_rate": 0.001015193030287551, + "loss": 2.9784725189208983, + "num_input_tokens_seen": 7617904640, + "step": 14530, + "train_runtime": 65979.8789, + "train_tokens_per_second": 115457.997 + }, + { + "epoch": 0.7868178251576071, + "grad_norm": 0.1475485861301422, + "learning_rate": 0.0010126921726526892, + "loss": 2.9963218688964846, + "num_input_tokens_seen": 7623147520, + "step": 14540, + "train_runtime": 66025.0052, + "train_tokens_per_second": 115458.492 + }, + { + "epoch": 0.7873589653399713, + "grad_norm": 0.13277380168437958, + "learning_rate": 0.0010101966189387007, + "loss": 2.9872737884521485, + "num_input_tokens_seen": 7628390400, + "step": 14550, + "train_runtime": 66070.1575, + "train_tokens_per_second": 115458.941 + }, + { + "epoch": 0.7879001055223356, + "grad_norm": 0.13506442308425903, + "learning_rate": 0.0010077063767643974, + "loss": 2.9895917892456056, + "num_input_tokens_seen": 7633633280, + "step": 14560, + "train_runtime": 66115.3068, + "train_tokens_per_second": 115459.394 + }, + { + "epoch": 0.7884412457046998, + "grad_norm": 0.13273315131664276, + "learning_rate": 0.0010052214537323724, + "loss": 2.9872600555419924, + "num_input_tokens_seen": 7638876160, + "step": 14570, + "train_runtime": 66160.4452, + "train_tokens_per_second": 115459.866 + }, + { + "epoch": 0.788982385887064, + "grad_norm": 0.1311519294977188, + "learning_rate": 0.0010027418574289832, + "loss": 2.9747976303100585, + "num_input_tokens_seen": 7644119040, + "step": 14580, + "train_runtime": 66205.59, + "train_tokens_per_second": 115460.326 + }, + { + "epoch": 0.7895235260694283, + "grad_norm": 0.13237175345420837, + "learning_rate": 0.0010002675954243225, + "loss": 2.9707094192504884, + "num_input_tokens_seen": 7649361920, + "step": 14590, + "train_runtime": 66250.7308, + "train_tokens_per_second": 115460.793 + }, + { + "epoch": 0.7900646662517925, + "grad_norm": 0.13623256981372833, + "learning_rate": 0.0009977986752721967, + "loss": 2.9789360046386717, + "num_input_tokens_seen": 7654604800, + "step": 14600, + "train_runtime": 66295.8847, + "train_tokens_per_second": 115461.236 + }, + { + "epoch": 0.7906058064341568, + "grad_norm": 0.13563480973243713, + "learning_rate": 0.0009953351045101087, + "loss": 2.976993942260742, + "num_input_tokens_seen": 7659847680, + "step": 14610, + "train_runtime": 66341.0194, + "train_tokens_per_second": 115461.712 + }, + { + "epoch": 0.791146946616521, + "grad_norm": 0.1308317333459854, + "learning_rate": 0.000992876890659225, + "loss": 2.9876148223876955, + "num_input_tokens_seen": 7665090560, + "step": 14620, + "train_runtime": 66386.152, + "train_tokens_per_second": 115462.191 + }, + { + "epoch": 0.7916880867988852, + "grad_norm": 0.12994542717933655, + "learning_rate": 0.0009904240412243594, + "loss": 2.989145278930664, + "num_input_tokens_seen": 7670333440, + "step": 14630, + "train_runtime": 66431.2999, + "train_tokens_per_second": 115462.643 + }, + { + "epoch": 0.7922292269812495, + "grad_norm": 0.13062526285648346, + "learning_rate": 0.0009879765636939479, + "loss": 2.9790761947631834, + "num_input_tokens_seen": 7675576320, + "step": 14640, + "train_runtime": 66476.4455, + "train_tokens_per_second": 115463.098 + }, + { + "epoch": 0.7927703671636137, + "grad_norm": 0.13198526203632355, + "learning_rate": 0.0009855344655400273, + "loss": 2.991826629638672, + "num_input_tokens_seen": 7680819200, + "step": 14650, + "train_runtime": 66521.5925, + "train_tokens_per_second": 115463.55 + }, + { + "epoch": 0.793311507345978, + "grad_norm": 0.12981140613555908, + "learning_rate": 0.0009830977542182112, + "loss": 2.97564754486084, + "num_input_tokens_seen": 7686062080, + "step": 14660, + "train_runtime": 66566.7229, + "train_tokens_per_second": 115464.03 + }, + { + "epoch": 0.7938526475283422, + "grad_norm": 0.13640232384204865, + "learning_rate": 0.0009806664371676665, + "loss": 2.9895370483398436, + "num_input_tokens_seen": 7691304960, + "step": 14670, + "train_runtime": 66611.843, + "train_tokens_per_second": 115464.527 + }, + { + "epoch": 0.7943937877107065, + "grad_norm": 0.13942649960517883, + "learning_rate": 0.0009782405218110937, + "loss": 2.983687973022461, + "num_input_tokens_seen": 7696547840, + "step": 14680, + "train_runtime": 66656.9717, + "train_tokens_per_second": 115465.009 + }, + { + "epoch": 0.7949349278930707, + "grad_norm": 0.13253772258758545, + "learning_rate": 0.0009758200155546995, + "loss": 2.9805246353149415, + "num_input_tokens_seen": 7701790720, + "step": 14690, + "train_runtime": 66702.1127, + "train_tokens_per_second": 115465.469 + }, + { + "epoch": 0.7954760680754349, + "grad_norm": 0.14124181866645813, + "learning_rate": 0.000973404925788178, + "loss": 2.9745468139648437, + "num_input_tokens_seen": 7707033600, + "step": 14700, + "train_runtime": 66747.2598, + "train_tokens_per_second": 115465.918 + }, + { + "epoch": 0.7960172082577992, + "grad_norm": 0.14020085334777832, + "learning_rate": 0.0009709952598846878, + "loss": 2.978104019165039, + "num_input_tokens_seen": 7712276480, + "step": 14710, + "train_runtime": 66792.381, + "train_tokens_per_second": 115466.411 + }, + { + "epoch": 0.7965583484401634, + "grad_norm": 0.14543874561786652, + "learning_rate": 0.0009685910252008282, + "loss": 2.972671890258789, + "num_input_tokens_seen": 7717519360, + "step": 14720, + "train_runtime": 66837.5213, + "train_tokens_per_second": 115466.87 + }, + { + "epoch": 0.7970994886225277, + "grad_norm": 0.1361764669418335, + "learning_rate": 0.0009661922290766168, + "loss": 2.979312515258789, + "num_input_tokens_seen": 7722762240, + "step": 14730, + "train_runtime": 66882.6798, + "train_tokens_per_second": 115467.297 + }, + { + "epoch": 0.797640628804892, + "grad_norm": 0.1359523981809616, + "learning_rate": 0.000963798878835467, + "loss": 2.9832695007324217, + "num_input_tokens_seen": 7728005120, + "step": 14740, + "train_runtime": 66927.821, + "train_tokens_per_second": 115467.753 + }, + { + "epoch": 0.7981817689872561, + "grad_norm": 0.1312197595834732, + "learning_rate": 0.0009614109817841685, + "loss": 2.988373565673828, + "num_input_tokens_seen": 7733248000, + "step": 14750, + "train_runtime": 66972.9704, + "train_tokens_per_second": 115468.195 + }, + { + "epoch": 0.7987229091696204, + "grad_norm": 0.1324051469564438, + "learning_rate": 0.00095902854521286, + "loss": 2.9794536590576173, + "num_input_tokens_seen": 7738490880, + "step": 14760, + "train_runtime": 67018.1103, + "train_tokens_per_second": 115468.652 + }, + { + "epoch": 0.7992640493519846, + "grad_norm": 0.13141310214996338, + "learning_rate": 0.0009566515763950114, + "loss": 2.979531097412109, + "num_input_tokens_seen": 7743733760, + "step": 14770, + "train_runtime": 67063.2657, + "train_tokens_per_second": 115469.083 + }, + { + "epoch": 0.7998051895343489, + "grad_norm": 0.13311649858951569, + "learning_rate": 0.0009542800825873985, + "loss": 2.978958511352539, + "num_input_tokens_seen": 7748976640, + "step": 14780, + "train_runtime": 67108.4044, + "train_tokens_per_second": 115469.541 + }, + { + "epoch": 0.8003463297167132, + "grad_norm": 0.1344899833202362, + "learning_rate": 0.0009519140710300836, + "loss": 2.9761631011962892, + "num_input_tokens_seen": 7754219520, + "step": 14790, + "train_runtime": 67153.558, + "train_tokens_per_second": 115469.973 + }, + { + "epoch": 0.8008874698990773, + "grad_norm": 0.1314343363046646, + "learning_rate": 0.0009495535489463907, + "loss": 2.9750953674316407, + "num_input_tokens_seen": 7759462400, + "step": 14800, + "train_runtime": 67198.7114, + "train_tokens_per_second": 115470.405 + }, + { + "epoch": 0.8014286100814416, + "grad_norm": 0.13687878847122192, + "learning_rate": 0.0009471985235428848, + "loss": 2.977894973754883, + "num_input_tokens_seen": 7764705280, + "step": 14810, + "train_runtime": 67243.8512, + "train_tokens_per_second": 115470.859 + }, + { + "epoch": 0.8019697502638058, + "grad_norm": 0.13268278539180756, + "learning_rate": 0.0009448490020093504, + "loss": 2.983228302001953, + "num_input_tokens_seen": 7769948160, + "step": 14820, + "train_runtime": 67288.9927, + "train_tokens_per_second": 115471.31 + }, + { + "epoch": 0.8025108904461701, + "grad_norm": 0.13738638162612915, + "learning_rate": 0.0009425049915187695, + "loss": 2.98532657623291, + "num_input_tokens_seen": 7775191040, + "step": 14830, + "train_runtime": 67334.146, + "train_tokens_per_second": 115471.741 + }, + { + "epoch": 0.8030520306285344, + "grad_norm": 0.13537852466106415, + "learning_rate": 0.0009401664992272974, + "loss": 2.9814353942871095, + "num_input_tokens_seen": 7780433920, + "step": 14840, + "train_runtime": 67379.3084, + "train_tokens_per_second": 115472.155 + }, + { + "epoch": 0.8035931708108985, + "grad_norm": 0.13461166620254517, + "learning_rate": 0.0009378335322742428, + "loss": 2.988892364501953, + "num_input_tokens_seen": 7785676800, + "step": 14850, + "train_runtime": 67424.4589, + "train_tokens_per_second": 115472.589 + }, + { + "epoch": 0.8041343109932628, + "grad_norm": 0.1397952139377594, + "learning_rate": 0.0009355060977820479, + "loss": 2.981852149963379, + "num_input_tokens_seen": 7790919680, + "step": 14860, + "train_runtime": 67469.6089, + "train_tokens_per_second": 115473.023 + }, + { + "epoch": 0.804675451175627, + "grad_norm": 0.13720718026161194, + "learning_rate": 0.000933184202856262, + "loss": 2.9753461837768556, + "num_input_tokens_seen": 7796162560, + "step": 14870, + "train_runtime": 67514.7478, + "train_tokens_per_second": 115473.475 + }, + { + "epoch": 0.8052165913579913, + "grad_norm": 0.13194413483142853, + "learning_rate": 0.0009308678545855248, + "loss": 2.98673038482666, + "num_input_tokens_seen": 7801405440, + "step": 14880, + "train_runtime": 67563.706, + "train_tokens_per_second": 115467.4 + }, + { + "epoch": 0.8057577315403556, + "grad_norm": 0.13509796559810638, + "learning_rate": 0.0009285570600415394, + "loss": 2.9741546630859377, + "num_input_tokens_seen": 7806648320, + "step": 14890, + "train_runtime": 67608.8064, + "train_tokens_per_second": 115467.921 + }, + { + "epoch": 0.8062988717227197, + "grad_norm": 0.13570842146873474, + "learning_rate": 0.0009262518262790568, + "loss": 2.9908029556274416, + "num_input_tokens_seen": 7811891200, + "step": 14900, + "train_runtime": 67653.9237, + "train_tokens_per_second": 115468.413 + }, + { + "epoch": 0.806840011905084, + "grad_norm": 0.1328882873058319, + "learning_rate": 0.0009239521603358486, + "loss": 2.9901811599731447, + "num_input_tokens_seen": 7817134080, + "step": 14910, + "train_runtime": 67699.0266, + "train_tokens_per_second": 115468.929 + }, + { + "epoch": 0.8073811520874482, + "grad_norm": 0.13037438690662384, + "learning_rate": 0.0009216580692326891, + "loss": 2.9751874923706056, + "num_input_tokens_seen": 7822376960, + "step": 14920, + "train_runtime": 67744.1354, + "train_tokens_per_second": 115469.434 + }, + { + "epoch": 0.8079222922698125, + "grad_norm": 0.13509000837802887, + "learning_rate": 0.0009193695599733333, + "loss": 2.9760356903076173, + "num_input_tokens_seen": 7827619840, + "step": 14930, + "train_runtime": 67789.236, + "train_tokens_per_second": 115469.952 + }, + { + "epoch": 0.8084634324521768, + "grad_norm": 0.13353431224822998, + "learning_rate": 0.0009170866395444952, + "loss": 2.979950714111328, + "num_input_tokens_seen": 7832862720, + "step": 14940, + "train_runtime": 67834.3595, + "train_tokens_per_second": 115470.431 + }, + { + "epoch": 0.809004572634541, + "grad_norm": 0.13296596705913544, + "learning_rate": 0.0009148093149158249, + "loss": 2.9780080795288084, + "num_input_tokens_seen": 7838105600, + "step": 14950, + "train_runtime": 67879.4629, + "train_tokens_per_second": 115470.943 + }, + { + "epoch": 0.8095457128169052, + "grad_norm": 0.13199231028556824, + "learning_rate": 0.0009125375930398896, + "loss": 2.976139450073242, + "num_input_tokens_seen": 7843348480, + "step": 14960, + "train_runtime": 67924.5642, + "train_tokens_per_second": 115471.458 + }, + { + "epoch": 0.8100868529992694, + "grad_norm": 0.1304149031639099, + "learning_rate": 0.0009102714808521528, + "loss": 2.9799163818359373, + "num_input_tokens_seen": 7848591360, + "step": 14970, + "train_runtime": 67969.6467, + "train_tokens_per_second": 115472.005 + }, + { + "epoch": 0.8106279931816337, + "grad_norm": 0.13312670588493347, + "learning_rate": 0.0009080109852709498, + "loss": 2.9826412200927734, + "num_input_tokens_seen": 7853834240, + "step": 14980, + "train_runtime": 68014.7473, + "train_tokens_per_second": 115472.52 + }, + { + "epoch": 0.811169133363998, + "grad_norm": 0.13625964522361755, + "learning_rate": 0.0009057561131974695, + "loss": 2.974313735961914, + "num_input_tokens_seen": 7859077120, + "step": 14990, + "train_runtime": 68059.848, + "train_tokens_per_second": 115473.034 + }, + { + "epoch": 0.8117102735463622, + "grad_norm": 0.13586074113845825, + "learning_rate": 0.000903506871515734, + "loss": 2.9799150466918944, + "num_input_tokens_seen": 7864320000, + "step": 15000, + "train_runtime": 68104.9508, + "train_tokens_per_second": 115473.544 + }, + { + "epoch": 0.8117102735463622, + "eval_loss": 2.9381465911865234, + "eval_runtime": 1.9846, + "eval_samples_per_second": 251.945, + "eval_steps_per_second": 4.031, + "num_input_tokens_seen": 7864320000, + "step": 15000 + }, + { + "epoch": 0.8122514137287264, + "grad_norm": 0.13391871750354767, + "learning_rate": 0.0009012632670925736, + "loss": 2.972438430786133, + "num_input_tokens_seen": 7869562880, + "step": 15010, + "train_runtime": 68154.5217, + "train_tokens_per_second": 115466.482 + }, + { + "epoch": 0.8127925539110906, + "grad_norm": 0.13467305898666382, + "learning_rate": 0.0008990253067776095, + "loss": 2.9732336044311523, + "num_input_tokens_seen": 7874805760, + "step": 15020, + "train_runtime": 68199.7002, + "train_tokens_per_second": 115466.868 + }, + { + "epoch": 0.8133336940934549, + "grad_norm": 0.13371260464191437, + "learning_rate": 0.0008967929974032304, + "loss": 2.9756675720214845, + "num_input_tokens_seen": 7880048640, + "step": 15030, + "train_runtime": 68244.8815, + "train_tokens_per_second": 115467.248 + }, + { + "epoch": 0.8138748342758192, + "grad_norm": 0.13191363215446472, + "learning_rate": 0.0008945663457845765, + "loss": 2.9834621429443358, + "num_input_tokens_seen": 7885291520, + "step": 15040, + "train_runtime": 68290.0502, + "train_tokens_per_second": 115467.649 + }, + { + "epoch": 0.8144159744581834, + "grad_norm": 0.1310187131166458, + "learning_rate": 0.0008923453587195116, + "loss": 2.9787324905395507, + "num_input_tokens_seen": 7890534400, + "step": 15050, + "train_runtime": 68335.2323, + "train_tokens_per_second": 115468.026 + }, + { + "epoch": 0.8149571146405477, + "grad_norm": 0.13005271553993225, + "learning_rate": 0.0008901300429886064, + "loss": 2.9818572998046875, + "num_input_tokens_seen": 7895777280, + "step": 15060, + "train_runtime": 68380.4424, + "train_tokens_per_second": 115468.356 + }, + { + "epoch": 0.8154982548229118, + "grad_norm": 0.13187964260578156, + "learning_rate": 0.0008879204053551192, + "loss": 2.9841533660888673, + "num_input_tokens_seen": 7901020160, + "step": 15070, + "train_runtime": 68425.6233, + "train_tokens_per_second": 115468.735 + }, + { + "epoch": 0.8160393950052761, + "grad_norm": 0.12774254381656647, + "learning_rate": 0.0008857164525649706, + "loss": 2.9738176345825194, + "num_input_tokens_seen": 7906263040, + "step": 15080, + "train_runtime": 68470.8074, + "train_tokens_per_second": 115469.108 + }, + { + "epoch": 0.8165805351876404, + "grad_norm": 0.13418236374855042, + "learning_rate": 0.0008835181913467284, + "loss": 2.9698516845703127, + "num_input_tokens_seen": 7911505920, + "step": 15090, + "train_runtime": 68516.0039, + "train_tokens_per_second": 115469.459 + }, + { + "epoch": 0.8171216753700046, + "grad_norm": 0.13305585086345673, + "learning_rate": 0.000881325628411582, + "loss": 2.9800113677978515, + "num_input_tokens_seen": 7916748800, + "step": 15100, + "train_runtime": 68561.1978, + "train_tokens_per_second": 115469.815 + }, + { + "epoch": 0.8176628155523689, + "grad_norm": 0.1298227459192276, + "learning_rate": 0.0008791387704533261, + "loss": 2.9894580841064453, + "num_input_tokens_seen": 7921991680, + "step": 15110, + "train_runtime": 68606.3897, + "train_tokens_per_second": 115470.173 + }, + { + "epoch": 0.818203955734733, + "grad_norm": 0.13746146857738495, + "learning_rate": 0.0008769576241483369, + "loss": 2.969521903991699, + "num_input_tokens_seen": 7927234560, + "step": 15120, + "train_runtime": 68651.5837, + "train_tokens_per_second": 115470.527 + }, + { + "epoch": 0.8187450959170973, + "grad_norm": 0.1307765245437622, + "learning_rate": 0.0008747821961555536, + "loss": 2.9746829986572267, + "num_input_tokens_seen": 7932477440, + "step": 15130, + "train_runtime": 68696.7803, + "train_tokens_per_second": 115470.877 + }, + { + "epoch": 0.8192862360994616, + "grad_norm": 0.12932413816452026, + "learning_rate": 0.0008726124931164572, + "loss": 2.980904388427734, + "num_input_tokens_seen": 7937720320, + "step": 15140, + "train_runtime": 68741.9605, + "train_tokens_per_second": 115471.253 + }, + { + "epoch": 0.8198273762818258, + "grad_norm": 0.13145951926708221, + "learning_rate": 0.0008704485216550531, + "loss": 2.977578544616699, + "num_input_tokens_seen": 7942963200, + "step": 15150, + "train_runtime": 68787.1491, + "train_tokens_per_second": 115471.615 + }, + { + "epoch": 0.8203685164641901, + "grad_norm": 0.13109584152698517, + "learning_rate": 0.0008682902883778457, + "loss": 2.973899078369141, + "num_input_tokens_seen": 7948206080, + "step": 15160, + "train_runtime": 68832.3314, + "train_tokens_per_second": 115471.987 + }, + { + "epoch": 0.8209096566465542, + "grad_norm": 0.1269070953130722, + "learning_rate": 0.0008661377998738207, + "loss": 2.9858329772949217, + "num_input_tokens_seen": 7953448960, + "step": 15170, + "train_runtime": 68877.5165, + "train_tokens_per_second": 115472.354 + }, + { + "epoch": 0.8214507968289185, + "grad_norm": 0.13239699602127075, + "learning_rate": 0.0008639910627144282, + "loss": 2.9783477783203125, + "num_input_tokens_seen": 7958691840, + "step": 15180, + "train_runtime": 68922.6959, + "train_tokens_per_second": 115472.73 + }, + { + "epoch": 0.8219919370112828, + "grad_norm": 0.129794642329216, + "learning_rate": 0.0008618500834535568, + "loss": 2.9712141036987303, + "num_input_tokens_seen": 7963934720, + "step": 15190, + "train_runtime": 68967.862, + "train_tokens_per_second": 115473.128 + }, + { + "epoch": 0.822533077193647, + "grad_norm": 0.13771747052669525, + "learning_rate": 0.0008597148686275189, + "loss": 2.984314727783203, + "num_input_tokens_seen": 7969177600, + "step": 15200, + "train_runtime": 69013.0362, + "train_tokens_per_second": 115473.511 + }, + { + "epoch": 0.8230742173760113, + "grad_norm": 0.13398458063602448, + "learning_rate": 0.0008575854247550258, + "loss": 2.9714584350585938, + "num_input_tokens_seen": 7974420480, + "step": 15210, + "train_runtime": 69058.1959, + "train_tokens_per_second": 115473.918 + }, + { + "epoch": 0.8236153575583754, + "grad_norm": 0.13028761744499207, + "learning_rate": 0.0008554617583371726, + "loss": 2.9726911544799806, + "num_input_tokens_seen": 7979663360, + "step": 15220, + "train_runtime": 69103.3538, + "train_tokens_per_second": 115474.328 + }, + { + "epoch": 0.8241564977407397, + "grad_norm": 0.13187240064144135, + "learning_rate": 0.0008533438758574152, + "loss": 2.9737316131591798, + "num_input_tokens_seen": 7984906240, + "step": 15230, + "train_runtime": 69148.515, + "train_tokens_per_second": 115474.732 + }, + { + "epoch": 0.824697637923104, + "grad_norm": 0.13035008311271667, + "learning_rate": 0.0008512317837815503, + "loss": 2.9657833099365236, + "num_input_tokens_seen": 7990149120, + "step": 15240, + "train_runtime": 69193.6841, + "train_tokens_per_second": 115475.122 + }, + { + "epoch": 0.8252387781054682, + "grad_norm": 0.1308414787054062, + "learning_rate": 0.0008491254885576988, + "loss": 2.968144416809082, + "num_input_tokens_seen": 7995392000, + "step": 15250, + "train_runtime": 69238.862, + "train_tokens_per_second": 115475.497 + }, + { + "epoch": 0.8257799182878325, + "grad_norm": 0.1312231868505478, + "learning_rate": 0.0008470249966162835, + "loss": 2.9749370574951173, + "num_input_tokens_seen": 8000634880, + "step": 15260, + "train_runtime": 69287.9095, + "train_tokens_per_second": 115469.422 + }, + { + "epoch": 0.8263210584701967, + "grad_norm": 0.13507384061813354, + "learning_rate": 0.0008449303143700088, + "loss": 2.9808319091796873, + "num_input_tokens_seen": 8005877760, + "step": 15270, + "train_runtime": 69333.0664, + "train_tokens_per_second": 115469.835 + }, + { + "epoch": 0.8268621986525609, + "grad_norm": 0.12942056357860565, + "learning_rate": 0.0008428414482138435, + "loss": 2.969392776489258, + "num_input_tokens_seen": 8011120640, + "step": 15280, + "train_runtime": 69378.1613, + "train_tokens_per_second": 115470.351 + }, + { + "epoch": 0.8274033388349252, + "grad_norm": 0.12837563455104828, + "learning_rate": 0.0008407584045250001, + "loss": 2.979315185546875, + "num_input_tokens_seen": 8016363520, + "step": 15290, + "train_runtime": 69423.2721, + "train_tokens_per_second": 115470.84 + }, + { + "epoch": 0.8279444790172894, + "grad_norm": 0.13300900161266327, + "learning_rate": 0.0008386811896629143, + "loss": 2.9644968032836916, + "num_input_tokens_seen": 8021606400, + "step": 15300, + "train_runtime": 69468.3762, + "train_tokens_per_second": 115471.339 + }, + { + "epoch": 0.8284856191996537, + "grad_norm": 0.12836603820323944, + "learning_rate": 0.0008366098099692285, + "loss": 2.972013473510742, + "num_input_tokens_seen": 8026849280, + "step": 15310, + "train_runtime": 69513.475, + "train_tokens_per_second": 115471.846 + }, + { + "epoch": 0.8290267593820179, + "grad_norm": 0.12967608869075775, + "learning_rate": 0.0008345442717677699, + "loss": 2.9776493072509767, + "num_input_tokens_seen": 8032092160, + "step": 15320, + "train_runtime": 69558.5739, + "train_tokens_per_second": 115472.352 + }, + { + "epoch": 0.8295678995643821, + "grad_norm": 0.12830476462841034, + "learning_rate": 0.0008324845813645304, + "loss": 2.9773494720458986, + "num_input_tokens_seen": 8037335040, + "step": 15330, + "train_runtime": 69603.6687, + "train_tokens_per_second": 115472.865 + }, + { + "epoch": 0.8301090397467464, + "grad_norm": 0.13105891644954681, + "learning_rate": 0.0008304307450476511, + "loss": 2.9748680114746096, + "num_input_tokens_seen": 8042577920, + "step": 15340, + "train_runtime": 69648.769, + "train_tokens_per_second": 115473.368 + }, + { + "epoch": 0.8306501799291106, + "grad_norm": 0.1301373690366745, + "learning_rate": 0.0008283827690873988, + "loss": 2.9727630615234375, + "num_input_tokens_seen": 8047820800, + "step": 15350, + "train_runtime": 69693.862, + "train_tokens_per_second": 115473.882 + }, + { + "epoch": 0.8311913201114749, + "grad_norm": 0.13162434101104736, + "learning_rate": 0.0008263406597361503, + "loss": 2.978099822998047, + "num_input_tokens_seen": 8053063680, + "step": 15360, + "train_runtime": 69738.9614, + "train_tokens_per_second": 115474.385 + }, + { + "epoch": 0.8317324602938391, + "grad_norm": 0.13288192451000214, + "learning_rate": 0.0008243044232283723, + "loss": 2.9758016586303713, + "num_input_tokens_seen": 8058306560, + "step": 15370, + "train_runtime": 69784.0695, + "train_tokens_per_second": 115474.873 + }, + { + "epoch": 0.8322736004762034, + "grad_norm": 0.136215478181839, + "learning_rate": 0.0008222740657806005, + "loss": 2.976166915893555, + "num_input_tokens_seen": 8063549440, + "step": 15380, + "train_runtime": 69829.1841, + "train_tokens_per_second": 115475.35 + }, + { + "epoch": 0.8328147406585676, + "grad_norm": 0.12879818677902222, + "learning_rate": 0.000820249593591422, + "loss": 2.9633615493774412, + "num_input_tokens_seen": 8068792320, + "step": 15390, + "train_runtime": 69874.3003, + "train_tokens_per_second": 115475.823 + }, + { + "epoch": 0.8333558808409318, + "grad_norm": 0.1428280621767044, + "learning_rate": 0.0008182310128414587, + "loss": 2.9798999786376954, + "num_input_tokens_seen": 8074035200, + "step": 15400, + "train_runtime": 69919.3861, + "train_tokens_per_second": 115476.346 + }, + { + "epoch": 0.8338970210232961, + "grad_norm": 0.1359853297472, + "learning_rate": 0.0008162183296933439, + "loss": 2.968707275390625, + "num_input_tokens_seen": 8079278080, + "step": 15410, + "train_runtime": 69964.4955, + "train_tokens_per_second": 115476.829 + }, + { + "epoch": 0.8344381612056603, + "grad_norm": 0.13050523400306702, + "learning_rate": 0.0008142115502917066, + "loss": 2.973996162414551, + "num_input_tokens_seen": 8084520960, + "step": 15420, + "train_runtime": 70009.6056, + "train_tokens_per_second": 115477.31 + }, + { + "epoch": 0.8349793013880246, + "grad_norm": 0.13029220700263977, + "learning_rate": 0.0008122106807631529, + "loss": 2.9792009353637696, + "num_input_tokens_seen": 8089763840, + "step": 15430, + "train_runtime": 70054.706, + "train_tokens_per_second": 115477.807 + }, + { + "epoch": 0.8355204415703888, + "grad_norm": 0.13232028484344482, + "learning_rate": 0.0008102157272162447, + "loss": 2.9753578186035154, + "num_input_tokens_seen": 8095006720, + "step": 15440, + "train_runtime": 70099.8205, + "train_tokens_per_second": 115478.28 + }, + { + "epoch": 0.836061581752753, + "grad_norm": 0.13095484673976898, + "learning_rate": 0.0008082266957414837, + "loss": 2.97320671081543, + "num_input_tokens_seen": 8100249600, + "step": 15450, + "train_runtime": 70144.9322, + "train_tokens_per_second": 115478.757 + }, + { + "epoch": 0.8366027219351173, + "grad_norm": 0.13523340225219727, + "learning_rate": 0.0008062435924112902, + "loss": 2.9681285858154296, + "num_input_tokens_seen": 8105492480, + "step": 15460, + "train_runtime": 70190.0213, + "train_tokens_per_second": 115479.271 + }, + { + "epoch": 0.8371438621174815, + "grad_norm": 0.13670340180397034, + "learning_rate": 0.0008042664232799893, + "loss": 2.9674022674560545, + "num_input_tokens_seen": 8110735360, + "step": 15470, + "train_runtime": 70235.1367, + "train_tokens_per_second": 115479.741 + }, + { + "epoch": 0.8376850022998458, + "grad_norm": 0.12936244904994965, + "learning_rate": 0.0008022951943837868, + "loss": 2.966217041015625, + "num_input_tokens_seen": 8115978240, + "step": 15480, + "train_runtime": 70280.2433, + "train_tokens_per_second": 115480.224 + }, + { + "epoch": 0.8382261424822101, + "grad_norm": 0.14200405776500702, + "learning_rate": 0.0008003299117407532, + "loss": 2.978799247741699, + "num_input_tokens_seen": 8121221120, + "step": 15490, + "train_runtime": 70325.3302, + "train_tokens_per_second": 115480.739 + }, + { + "epoch": 0.8387672826645742, + "grad_norm": 0.12791140377521515, + "learning_rate": 0.0007983705813508069, + "loss": 2.971164321899414, + "num_input_tokens_seen": 8126464000, + "step": 15500, + "train_runtime": 70370.4812, + "train_tokens_per_second": 115481.149 + }, + { + "epoch": 0.8387672826645742, + "eval_loss": 2.9325733184814453, + "eval_runtime": 1.9901, + "eval_samples_per_second": 251.238, + "eval_steps_per_second": 4.02, + "num_input_tokens_seen": 8126464000, + "step": 15500 + }, + { + "epoch": 0.8393084228469385, + "grad_norm": 0.1335526406764984, + "learning_rate": 0.0007964172091956926, + "loss": 2.9691984176635744, + "num_input_tokens_seen": 8131706880, + "step": 15510, + "train_runtime": 70417.588, + "train_tokens_per_second": 115478.35 + }, + { + "epoch": 0.8398495630293027, + "grad_norm": 0.13724961876869202, + "learning_rate": 0.0007944698012389664, + "loss": 2.9696407318115234, + "num_input_tokens_seen": 8136949760, + "step": 15520, + "train_runtime": 70462.6835, + "train_tokens_per_second": 115478.851 + }, + { + "epoch": 0.840390703211667, + "grad_norm": 0.13106457889080048, + "learning_rate": 0.0007925283634259745, + "loss": 2.964072036743164, + "num_input_tokens_seen": 8142192640, + "step": 15530, + "train_runtime": 70507.7742, + "train_tokens_per_second": 115479.36 + }, + { + "epoch": 0.8409318433940313, + "grad_norm": 0.1346583068370819, + "learning_rate": 0.000790592901683838, + "loss": 2.9721302032470702, + "num_input_tokens_seen": 8147435520, + "step": 15540, + "train_runtime": 70552.8789, + "train_tokens_per_second": 115479.845 + }, + { + "epoch": 0.8414729835763954, + "grad_norm": 0.12788882851600647, + "learning_rate": 0.0007886634219214321, + "loss": 2.9774459838867187, + "num_input_tokens_seen": 8152678400, + "step": 15550, + "train_runtime": 70597.9816, + "train_tokens_per_second": 115480.333 + }, + { + "epoch": 0.8420141237587597, + "grad_norm": 0.1323845237493515, + "learning_rate": 0.0007867399300293693, + "loss": 2.971846008300781, + "num_input_tokens_seen": 8157921280, + "step": 15560, + "train_runtime": 70643.081, + "train_tokens_per_second": 115480.825 + }, + { + "epoch": 0.8425552639411239, + "grad_norm": 0.132669135928154, + "learning_rate": 0.0007848224318799821, + "loss": 2.9736881256103516, + "num_input_tokens_seen": 8163164160, + "step": 15570, + "train_runtime": 70688.1702, + "train_tokens_per_second": 115481.334 + }, + { + "epoch": 0.8430964041234882, + "grad_norm": 0.1315847635269165, + "learning_rate": 0.0007829109333273051, + "loss": 2.9581043243408205, + "num_input_tokens_seen": 8168407040, + "step": 15580, + "train_runtime": 70733.2527, + "train_tokens_per_second": 115481.852 + }, + { + "epoch": 0.8436375443058525, + "grad_norm": 0.13508620858192444, + "learning_rate": 0.0007810054402070547, + "loss": 2.967576789855957, + "num_input_tokens_seen": 8173649920, + "step": 15590, + "train_runtime": 70778.3173, + "train_tokens_per_second": 115482.4 + }, + { + "epoch": 0.8441786844882166, + "grad_norm": 0.13094158470630646, + "learning_rate": 0.0007791059583366134, + "loss": 2.969736671447754, + "num_input_tokens_seen": 8178892800, + "step": 15600, + "train_runtime": 70823.3875, + "train_tokens_per_second": 115482.937 + }, + { + "epoch": 0.8447198246705809, + "grad_norm": 0.13293389976024628, + "learning_rate": 0.0007772124935150125, + "loss": 2.9740530014038087, + "num_input_tokens_seen": 8184135680, + "step": 15610, + "train_runtime": 70868.5107, + "train_tokens_per_second": 115483.387 + }, + { + "epoch": 0.8452609648529451, + "grad_norm": 0.12885726988315582, + "learning_rate": 0.0007753250515229127, + "loss": 2.9699680328369142, + "num_input_tokens_seen": 8189378560, + "step": 15620, + "train_runtime": 70913.6516, + "train_tokens_per_second": 115483.808 + }, + { + "epoch": 0.8458021050353094, + "grad_norm": 0.13280688226222992, + "learning_rate": 0.0007734436381225877, + "loss": 2.9740190505981445, + "num_input_tokens_seen": 8194621440, + "step": 15630, + "train_runtime": 70958.7738, + "train_tokens_per_second": 115484.259 + }, + { + "epoch": 0.8463432452176737, + "grad_norm": 0.13439851999282837, + "learning_rate": 0.0007715682590579061, + "loss": 2.975991439819336, + "num_input_tokens_seen": 8199864320, + "step": 15640, + "train_runtime": 71003.8731, + "train_tokens_per_second": 115484.747 + } + ], + "logging_steps": 10, + "max_steps": 18480, + "num_input_tokens_seen": 8200388608, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.287888719514173e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/ckpt-8-2b/training_args.bin b/ckpt-8-2b/training_args.bin new file mode 100644 index 0000000..16e4372 --- /dev/null +++ b/ckpt-8-2b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:555e861805d23ffd849ea370cfdde89fa1a892e3e3fbecfaf0bd36e61ec5b39e +size 5329 diff --git a/config.json b/config.json new file mode 100644 index 0000000..897792f --- /dev/null +++ b/config.json @@ -0,0 +1,32 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "float32", + "eos_token_id": 128001, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "max_position_embeddings": 1024, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 16, + "num_hidden_layers": 12, + "num_key_value_heads": 8, + "pad_token_id": null, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_parameters": { + "rope_theta": 500000.0, + "rope_type": "default" + }, + "tie_word_embeddings": true, + "transformers_version": "5.1.0", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/final_model/config.json b/final_model/config.json new file mode 100644 index 0000000..897792f --- /dev/null +++ b/final_model/config.json @@ -0,0 +1,32 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "float32", + "eos_token_id": 128001, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "max_position_embeddings": 1024, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 16, + "num_hidden_layers": 12, + "num_key_value_heads": 8, + "pad_token_id": null, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_parameters": { + "rope_theta": 500000.0, + "rope_type": "default" + }, + "tie_word_embeddings": true, + "transformers_version": "5.1.0", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/final_model/generation_config.json b/final_model/generation_config.json new file mode 100644 index 0000000..2e5031c --- /dev/null +++ b/final_model/generation_config.json @@ -0,0 +1,11 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "eos_token_id": [ + 128001 + ], + "output_attentions": false, + "output_hidden_states": false, + "transformers_version": "5.1.0", + "use_cache": true +} diff --git a/final_model/model.safetensors b/final_model/model.safetensors new file mode 100644 index 0000000..a1267cb --- /dev/null +++ b/final_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5a9d2251a5222a6ef34db40724af477af333931a2d133484fba13f535f7e590 +size 1280426144 diff --git a/final_model/tokenizer.json b/final_model/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/final_model/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/final_model/tokenizer_config.json b/final_model/tokenizer_config.json new file mode 100644 index 0000000..f7213f2 --- /dev/null +++ b/final_model/tokenizer_config.json @@ -0,0 +1,13 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "tokenizer_class": "TokenizersBackend" +} diff --git a/final_model/training_args.bin b/final_model/training_args.bin new file mode 100644 index 0000000..16e4372 --- /dev/null +++ b/final_model/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:555e861805d23ffd849ea370cfdde89fa1a892e3e3fbecfaf0bd36e61ec5b39e +size 5329 diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..2e5031c --- /dev/null +++ b/generation_config.json @@ -0,0 +1,11 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "eos_token_id": [ + 128001 + ], + "output_attentions": false, + "output_hidden_states": false, + "transformers_version": "5.1.0", + "use_cache": true +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..a1267cb --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5a9d2251a5222a6ef34db40724af477af333931a2d133484fba13f535f7e590 +size 1280426144 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..f7213f2 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,13 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "tokenizer_class": "TokenizersBackend" +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..16e4372 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:555e861805d23ffd849ea370cfdde89fa1a892e3e3fbecfaf0bd36e61ec5b39e +size 5329