commit 0b30b36285b9913d7c9318748eb2e5a3396baaa5
Author: ModelHub XC <noreply@modelhub.org.cn>
Date:   Fri May 1 07:27:32 2026 +0800

    初始化项目，由ModelHub XC社区提供模型
    
    Model: magnifi/magnifi-module-classifier-04-17-relabelled-upsampled
    Source: Original Platform

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..52373fe
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,36 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..778704e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,151 @@
+---
+library_name: transformers
+license: apache-2.0
+base_model: Tifin-Sage/magnifi-classifier-01-05-search-agent-3-epochs-3k-unknown-errors
+tags:
+- axolotl
+- generated_from_trainer
+datasets:
+- Tifin-Sage/magnifi-module-classifier-04-17-relabelled-upsampled
+model-index:
+- name: magnifi-module-classifier-04-17-relabelled-upsampled
+  results: []
+---
+
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+
+[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
+<details><summary>See axolotl config</summary>
+
+axolotl version: `0.16.0.dev0`
+```yaml
+base_model: Tifin-Sage/magnifi-classifier-01-05-search-agent-3-epochs-3k-unknown-errors
+hub_model_id: Tifin-Sage/magnifi-module-classifier-04-17-relabelled-upsampled
+
+load_in_8bit: false
+load_in_4bit: false
+strict: false
+
+chat_template: qwen3
+datasets:
+  - path: Tifin-Sage/magnifi-module-classifier-04-17-relabelled-upsampled
+    type: chat_template
+    split: train
+    field_messages: messages
+    message_property_mappings:
+      role: role
+      content: content
+
+val_set_size: 0.1
+
+output_dir: /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/
+dataset_prepared_path: /workspace/data/datasets_prepared/magnifi-module-classifier-04-17-relabelled-upsampled
+
+sequence_len: 16000
+sample_packing: true
+eval_sample_packing: true
+
+
+wandb_project: sage-classifier
+wandb_entity:
+wandb_watch:
+wandb_name: magnifi-module-classifier-04-17-relabelled-upsampled
+wandb_log_model: 
+
+gradient_accumulation_steps: 1
+micro_batch_size: 1
+num_epochs: 2
+optimizer: adamw_torch_fused
+lr_scheduler: cosine
+learning_rate: 2e-5
+
+bf16: auto
+tf32: true
+
+resume_from_checkpoint:
+logging_steps: 1
+
+evals_per_epoch: 2
+saves_per_epoch: 1
+
+warmup_ratio: 0.1
+weight_decay: 0.0
+fsdp:
+  - full_shard
+  - auto_wrap
+
+fsdp_config:
+  fsdp_version: 2
+  fsdp_offload_params: false
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_transformer_layer_cls_to_wrap: Qwen3DecoderLayer
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_sharding_strategy: FULL_SHARD
+  fsdp_reshard_after_forward: true
+  fsdp_activation_checkpointing: true
+
+special_tokens:
+
+```
+
+</details><br>
+
+# magnifi-module-classifier-04-17-relabelled-upsampled
+
+This model is a fine-tuned version of [Tifin-Sage/magnifi-classifier-01-05-search-agent-3-epochs-3k-unknown-errors](https://huggingface.co/Tifin-Sage/magnifi-classifier-01-05-search-agent-3-epochs-3k-unknown-errors) on the Tifin-Sage/magnifi-module-classifier-04-17-relabelled-upsampled dataset.
+It achieves the following results on the evaluation set:
+- Loss: 0.2227
+- Ppl: 1.2494
+- Memory/max Active (gib): 34.91
+- Memory/max Allocated (gib): 34.91
+- Memory/device Reserved (gib): 57.25
+
+## Model description
+
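+A Qwen3 causal language model (`Qwen3ForCausalLM`, 36 layers, hidden size 2560, bfloat16 weights) fine-tuned with Axolotl from the base checkpoint named above; see the Axolotl config at the top of this card for the full training setup.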
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
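+Trained on the `train` split of the `Tifin-Sage/magnifi-module-classifier-04-17-relabelled-upsampled` dataset, with 10% held out as the evaluation set (`val_set_size: 0.1` in the Axolotl config).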
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 2e-05
+- train_batch_size: 1
+- eval_batch_size: 1
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 2
+- total_train_batch_size: 2
+- total_eval_batch_size: 2
+- optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 47
+- training_steps: 478
+
+### Training results
+
+| Training Loss | Epoch  | Step | Validation Loss | Ppl    | Active (gib) | Allocated (gib) | Reserved (gib) |
+|:-------------:|:------:|:----:|:---------------:|:------:|:------------:|:---------------:|:--------------:|
+| No log        | 0      | 0    | 0.2049          | 1.2275 | 27.41        | 27.41           | 30.62          |
+| 0.2339        | 0.5    | 120  | 0.2288          | 1.2571 | 34.91        | 34.91           | 59.04          |
+| 0.2290        | 1.0    | 240  | 0.2166          | 1.2419 | 34.91        | 34.91           | 57.54          |
+| 0.0898        | 1.5    | 360  | 0.2251          | 1.2524 | 34.91        | 34.91           | 57.54          |
+| 0.1331        | 1.9917 | 478  | 0.2227          | 1.2494 | 34.91        | 34.91           | 57.25          |
+
+
+### Framework versions
+
+- Transformers 5.5.4
+- Pytorch 2.10.0+cu128
+- Datasets 4.8.4
+- Tokenizers 0.22.2
diff --git a/chat_template.jinja b/chat_template.jinja
new file mode 100644
index 0000000..77ea906
--- /dev/null
+++ b/chat_template.jinja
@@ -0,0 +1,93 @@
+{%- if tools %}{#- With tools: open a system turn, prepend the first message's content when its role is system, then list each tool as JSON inside <tools> tags followed by the tool-call format instructions. #}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}{#- Without tools: emit the first message as the system turn only when its role is system. #}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- ns carries values assigned inside the loops below; last_query_index starts at the final message index. #}
+{#- Resolve real_last_index: use the render-context value when it is defined and not none, otherwise messages|length - 1. It is compared with loop.index0 below to decide whether an assistant turn always gets a <think> block. #}
+{%- if real_last_index is defined and real_last_index is not none %}
+    {%- set ns.real_last_index = real_last_index %}
+{%- else %}
+    {%- set ns.real_last_index = messages|length - 1 %}
+{%- endif %}
+{%- for message in messages[::-1] %}{#- Scan newest-to-oldest for the latest user message that is not wrapped in <tool_response> tags and record its index as last_query_index. #}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}{#- User turns, and system turns other than the first message, are emitted verbatim. #}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content if message.content else '' %}{#- Null or missing content (e.g. a tool-call-only turn) is treated as empty text instead of failing the render. #}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}{#- No explicit reasoning_content: split inline reasoning from the answer at </think>. #}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- A <think> block is written only for assistant turns after last_query_index: always at real_last_index, elsewhere only when reasoning text is non-empty. Earlier turns get the answer text alone. #}
+            {%- if loop.index0 == ns.real_last_index or (loop.index0 != ns.real_last_index and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}{#- Tool calls follow the content, one <tool_call> block each; a newline separates them from non-empty content and from each other. #}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}{#- Accept both function-wrapped and flat tool-call objects. #}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}{#- String arguments are emitted as given; other values are serialized with tojson. #}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Runs of consecutive tool messages share a single user turn, each wrapped in <tool_response> tags. #}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}{#- Generation prompt: open an assistant turn, then a closed empty <think> block when enable_thinking is defined and false, otherwise an opening <think> tag. #}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- else %}
+        {{- '<think>\n\n' }}
+    {%- endif %}
+{%- endif %}
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..f0b779c
--- /dev/null
+++ b/config.json
@@ -0,0 +1,72 @@
+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 9728,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 262144,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 5000000,
+  "rope_parameters": {
+    "rope_theta": 5000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.5.4",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
diff --git a/debug.log b/debug.log
new file mode 100644
index 0000000..619309c
--- /dev/null
+++ b/debug.log
@@ -0,0 +1,3 @@
+Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]Fetching 2 files:  50%|█████     | 1/2 [00:05<00:05,  5.40s/it]Fetching 2 files: 100%|██████████| 2/2 [00:05<00:00,  2.70s/it]
+Loading weights:   0%|          | 0/399 [00:00<?, ?it/s]Loading weights:  76%|███████▌  | 303/399 [00:00<00:00, 3028.93it/s]Loading weights: 100%|██████████| 399/399 [00:00<00:00, 2952.58it/s]
+[2026-04-17 02:09:12,900] [WARNING] [accelerate.utils.dataclasses.__post_init__:1992] [PID:9433] sync_module_states is obsolete in FSDP2, as it is not needed anymore.Setting sync_module_states to None.
diff --git a/generation_config.json b/generation_config.json
new file mode 100644
index 0000000..52148fd
--- /dev/null
+++ b/generation_config.json
@@ -0,0 +1,12 @@
+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "5.5.4"
+}
diff --git a/model.safetensors b/model.safetensors
new file mode 100644
index 0000000..2f2fc1c
--- /dev/null
+++ b/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1219eab4d5ec42272f6e3d86fdc743c22587d91c005646cb1d193ed7f3aca45
+size 8822894520
diff --git a/tokenizer.json b/tokenizer.json
new file mode 100644
index 0000000..c7afbed
--- /dev/null
+++ b/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000..d397b7b
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,14 @@
+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "is_local": false,
+  "model_max_length": 1010000,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}
diff --git a/train.log b/train.log
new file mode 100644
index 0000000..453b331
--- /dev/null
+++ b/train.log
@@ -0,0 +1,2396 @@
+The following values were not passed to `accelerate launch` and had defaults used instead:
+	`--num_processes` was set to a value of `2`
+		More than one GPU was found, enabling multi-GPU training.
+		If this was unintended please pass in `--num_processes=1`.
+	`--num_machines` was set to a value of `1`
+	`--mixed_precision` was set to a value of `'no'`
+	`--dynamo_backend` was set to a value of `'no'`
+To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`.
+[2026-04-17 02:08:45,271] [WARNING] [torchao] Skipping import of cpp extensions due to incompatible torch version. Please upgrade to torch >= 2.11.0 (found 2.10.0+cu128).
+[2026-04-17 02:08:45,439] [WARNING] [torchao] Skipping import of cpp extensions due to incompatible torch version. Please upgrade to torch >= 2.11.0 (found 2.10.0+cu128).
+[2026-04-17 02:08:47,222] [WARNING] [axolotl.utils.schemas.validation] sample_packing without flash, sdp, xformers, sage, or flex attention does not handle cross sample decontamination.
+[2026-04-17 02:08:47,223] [INFO] [axolotl.utils.schemas.validation] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing
+[2026-04-17 02:08:47,223] [WARNING] [axolotl.utils.schemas.validation] Configuring FSDP fields with the `fsdp_` prefix is deprecated. Please omit the `fsdp_` prefix from the any fields in `fsdp_config`.
+[2026-04-17 02:08:47,467] [INFO] [axolotl.cli.config] config:
+{
+  "activation_offloading": false,
+  "axolotl_config_path": "/workspace/data/sage-classifier-train-scripts/qwen3/fft/qwen3-4B-train-v1-6-no-liger-flex-magnifi-module-classifier-04-17-relabelled-upsampled.yml",
+  "base_model": "Tifin-Sage/magnifi-classifier-01-05-search-agent-3-epochs-3k-unknown-errors",
+  "base_model_config": "Tifin-Sage/magnifi-classifier-01-05-search-agent-3-epochs-3k-unknown-errors",
+  "batch_size": 2,
+  "bf16": true,
+  "capabilities": {
+    "bf16": true,
+    "compute_capability": "sm_80",
+    "fp8": false,
+    "n_gpu": 2,
+    "n_node": 1,
+    "tf32": true
+  },
+  "chat_template": "qwen3",
+  "context_parallel_size": 1,
+  "dataloader_num_workers": 2,
+  "dataloader_pin_memory": true,
+  "dataloader_prefetch_factor": 256,
+  "dataset_num_proc": 128,
+  "dataset_prepared_path": "/workspace/data/datasets_prepared/magnifi-module-classifier-04-17-relabelled-upsampled",
+  "datasets": [
+    {
+      "chat_template": "tokenizer_default",
+      "field_messages": "messages",
+      "message_property_mappings": {
+        "content": "content",
+        "role": "role"
+      },
+      "path": "Tifin-Sage/magnifi-module-classifier-04-17-relabelled-upsampled",
+      "split": "train",
+      "trust_remote_code": false,
+      "type": "chat_template"
+    }
+  ],
+  "ddp": true,
+  "device": "cuda:0",
+  "device_map": {
+    "": 0
+  },
+  "dion_rank_fraction": 1.0,
+  "dion_rank_multiple_of": 1,
+  "eaft_alpha": 1.0,
+  "eaft_k": 20,
+  "env_capabilities": {
+    "torch_version": "2.10.0"
+  },
+  "eval_batch_size": 1,
+  "eval_causal_lm_metrics": [
+    "sacrebleu",
+    "comet",
+    "ter",
+    "chrf"
+  ],
+  "eval_max_new_tokens": 128,
+  "eval_sample_packing": true,
+  "eval_steps": 0.25,
+  "eval_table_size": 0,
+  "evals_per_epoch": 2,
+  "experimental_skip_move_to_device": true,
+  "fp16": false,
+  "fsdp": [
+    "full_shard",
+    "auto_wrap"
+  ],
+  "fsdp_config": {
+    "activation_checkpointing": true,
+    "auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
+    "cpu_ram_efficient_loading": true,
+    "fsdp_version": 2,
+    "offload_params": false,
+    "reshard_after_forward": true,
+    "state_dict_type": "FULL_STATE_DICT",
+    "transformer_layer_cls_to_wrap": "Qwen3DecoderLayer"
+  },
+  "fsdp_version": 2,
+  "generate_samples": false,
+  "generation_do_sample": true,
+  "generation_max_new_tokens": 50,
+  "generation_prompt_ratio": 0.5,
+  "generation_temperature": 0.7,
+  "gradient_accumulation_steps": 1,
+  "gradient_checkpointing": false,
+  "hub_model_id": "Tifin-Sage/magnifi-module-classifier-04-17-relabelled-upsampled",
+  "include_tkps": true,
+  "layer_offloading": false,
+  "learning_rate": 2e-05,
+  "lisa_layers_attribute": "model.layers",
+  "load_best_model_at_end": false,
+  "load_in_4bit": false,
+  "load_in_8bit": false,
+  "local_rank": 0,
+  "logging_steps": 1,
+  "lora_dropout": 0.0,
+  "loraplus_lr_embedding": 1e-06,
+  "lr_scheduler": "cosine",
+  "mean_resizing_embeddings": false,
+  "merge_method": "memory_efficient",
+  "micro_batch_size": 1,
+  "model_config_type": "qwen3",
+  "num_epochs": 2.0,
+  "num_generation_samples": 3,
+  "optimizer": "adamw_torch_fused",
+  "otel_metrics_host": "localhost",
+  "otel_metrics_port": 8000,
+  "output_dir": "/workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/",
+  "pad_to_sequence_len": true,
+  "pretrain_multipack_attn": true,
+  "profiler_steps_start": 0,
+  "qlora_sharded_model_loading": false,
+  "quantize_moe_experts": false,
+  "ray_num_workers": 1,
+  "resources_per_worker": {
+    "GPU": 1
+  },
+  "sample_packing": true,
+  "sample_packing_bin_size": 200,
+  "sample_packing_group_size": 100000,
+  "save_only_model": false,
+  "save_safetensors": true,
+  "save_steps": 0.5,
+  "saves_per_epoch": 1,
+  "sequence_len": 16000,
+  "shuffle_before_merging_datasets": false,
+  "shuffle_merged_datasets": true,
+  "skip_prepare_dataset": false,
+  "streaming_multipack_buffer_size": 10000,
+  "strict": false,
+  "tensor_parallel_size": 1,
+  "tf32": true,
+  "tiled_mlp_use_original_mlp": true,
+  "tokenizer_config": "Tifin-Sage/magnifi-classifier-01-05-search-agent-3-epochs-3k-unknown-errors",
+  "tokenizer_save_jinja_files": true,
+  "torch_dtype": "torch.bfloat16",
+  "train_on_inputs": false,
+  "trl": {
+    "async_prefetch": false,
+    "log_completions": false,
+    "mask_truncated_completions": false,
+    "ref_model_mixup_alpha": 0.9,
+    "ref_model_sync_steps": 64,
+    "replay_buffer_size": 0,
+    "replay_recompute_logps": true,
+    "reroll_max_groups": 1,
+    "reroll_start_fraction": 1.0,
+    "reward_num_workers": 1,
+    "scale_rewards": true,
+    "skip_zero_advantage_batches": true,
+    "sync_ref_model": false,
+    "use_data_producer": false,
+    "use_vllm": false,
+    "vllm_lora_sync": false,
+    "vllm_server_host": "0.0.0.0",
+    "vllm_server_port": 8000
+  },
+  "use_otel_metrics": false,
+  "use_ray": false,
+  "use_wandb": true,
+  "val_set_size": 0.1,
+  "vllm": {
+    "device": "auto",
+    "dtype": "auto",
+    "gpu_memory_utilization": 0.9,
+    "host": "0.0.0.0",
+    "port": 8000
+  },
+  "wandb_name": "magnifi-module-classifier-04-17-relabelled-upsampled",
+  "wandb_project": "sage-classifier",
+  "warmup_ratio": 0.1,
+  "weight_decay": 0.0,
+  "world_size": 2
+}
+[2026-04-17 02:08:51,607] [INFO] [axolotl.utils.data.shared] Loading prepared dataset from disk at /workspace/data/datasets_prepared/magnifi-module-classifier-04-17-relabelled-upsampled/6241b9d0f4bdccc4ed4f52e5adefd1bc...
+[Gloo] Rank [Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1
+0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1
+[2026-04-17 02:08:57,019] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [54, 54]
+[2026-04-17 02:08:57,129] [INFO] [axolotl.utils.trainer] sample_packing_eff_est across ranks: [0.9649779796600342, 0.9649779796600342]
+[2026-04-17 02:09:01,870] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [478, 478]
+[2026-04-17 02:09:01,872] [INFO] [axolotl.utils.trainer] sample_packing_eff_est across ranks: [0.969444751739502, 0.9775572419166565]
+[2026-04-17 02:09:01,874] [INFO] [axolotl.utils.data.sft] Maximum number of steps set at 478
+[2026-04-17 02:09:03,028] [INFO] [axolotl.loaders.patch_manager] Applying multipack dataloader patch for sample packing...
+Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]Fetching 2 files:  50%|█████     | 1/2 [00:05<00:05,  5.40s/it]Fetching 2 files: 100%|██████████| 2/2 [00:05<00:00,  2.70s/it]
+Fetching 2 files:  50%|█████     | 1/2 [00:05<00:05,  5.42s/it]Fetching 2 files: 100%|██████████| 2/2 [00:05<00:00,  2.71s/it]
+Loading weights:   0%|          | 0/399 [00:00<?, ?it/s]Loading weights:  76%|███████▌  | 303/399 [00:00<00:00, 3028.93it/s]Loading weights: 100%|██████████| 399/399 [00:00<00:00, 2952.58it/s]
+Loading weights:   0%|          | 0/399 [00:00<?, ?it/s]Loading weights:  83%|████████▎ | 331/399 [00:00<00:00, 3306.31it/s]Loading weights: 100%|██████████| 399/399 [00:00<00:00, 3610.06it/s]
+[2026-04-17 02:09:10,437] [INFO] [axolotl.loaders.model] Converting modules to torch.bfloat16
+[2026-04-17 02:09:12,900] [WARNING] [accelerate.utils.dataclasses] sync_module_states is obsolete in FSDP2, as it is not needed anymore.Setting sync_module_states to None.
+[2026-04-17 02:09:13,195] [WARNING] [accelerate.utils.dataclasses] sync_module_states is obsolete in FSDP2, as it is not needed anymore.Setting sync_module_states to None.
+[2026-04-17 02:09:13,778] [INFO] [axolotl.train] Pre-saving tokenizer to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/...
+[2026-04-17 02:09:13,930] [INFO] [axolotl.train] Pre-saving model config to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/...
+[2026-04-17 02:09:13,980] [INFO] [axolotl.train] Starting trainer...
+[2026-04-17 02:09:19,915] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [480, 480]
+[2026-04-17 02:09:20,072] [INFO] [axolotl.monkeypatch.accelerate.fsdp2] Broadcasting full state dict to all ranks...
+wandb: [wandb.login()] Loaded credentials for https://api.wandb.ai from WANDB_API_KEY.
+wandb: Currently logged in as: subhanandh-t (subhanandh-t-tifin) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+wandb: setting up run 4dxwgvhh
+wandb: Tracking run with wandb version 0.26.0
+wandb: Run data is saved locally in /workspace/wandb/run-20260417_020921-4dxwgvhh
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run magnifi-module-classifier-04-17-relabelled-upsampled
+wandb: ⭐️ View project at https://wandb.ai/subhanandh-t-tifin/sage-classifier
+wandb: 🚀 View run at https://wandb.ai/subhanandh-t-tifin/sage-classifier/runs/4dxwgvhh
+wandb: WARNING Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
+wandb: WARNING Symlinked 1 file into the W&B run directory; call wandb.save again to sync new files.
+[2026-04-17 02:09:24,132] [INFO] [axolotl.utils.callbacks] The Axolotl config has been saved to the WandB run under files.
+  0%|          | 0/478 [00:00<?, ?it/s][2026-04-17 02:09:24,137] [INFO] [axolotl.core.trainers.base] Running evaluation step...
+[2026-04-17 02:09:29,462] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [54, 54]
+
+  0%|          | 0/27 [00:00<?, ?it/s]
+  7%|▋         | 2/27 [00:01<00:18,  1.37it/s]
+ 11%|█         | 3/27 [00:04<00:37,  1.57s/it]
+ 15%|█▍        | 4/27 [00:06<00:46,  2.01s/it]
+ 19%|█▊        | 5/27 [00:09<00:49,  2.27s/it]
+ 22%|██▏       | 6/27 [00:12<00:51,  2.43s/it]
+ 26%|██▌       | 7/27 [00:15<00:50,  2.54s/it]
+ 30%|██▉       | 8/27 [00:17<00:49,  2.60s/it]
+ 33%|███▎      | 9/27 [00:20<00:47,  2.65s/it]
+ 37%|███▋      | 10/27 [00:23<00:45,  2.68s/it]
+ 41%|████      | 11/27 [00:26<00:43,  2.70s/it]
+ 44%|████▍     | 12/27 [00:28<00:39,  2.63s/it]
+ 48%|████▊     | 13/27 [00:31<00:38,  2.76s/it]
+ 52%|█████▏    | 14/27 [00:34<00:35,  2.77s/it]
+ 56%|█████▌    | 15/27 [00:37<00:33,  2.76s/it]
+ 59%|█████▉    | 16/27 [00:40<00:30,  2.76s/it]
+ 63%|██████▎   | 17/27 [00:42<00:27,  2.76s/it]
+ 67%|██████▋   | 18/27 [00:45<00:24,  2.76s/it]
+ 70%|███████   | 19/27 [00:48<00:22,  2.76s/it]
+ 74%|███████▍  | 20/27 [00:51<00:19,  2.77s/it]
+ 78%|███████▊  | 21/27 [00:53<00:16,  2.68s/it]
+ 81%|████████▏ | 22/27 [00:56<00:13,  2.79s/it]
+ 85%|████████▌ | 23/27 [00:59<00:11,  2.78s/it]
+ 89%|████████▉ | 24/27 [01:02<00:08,  2.77s/it]
+ 93%|█████████▎| 25/27 [01:04<00:05,  2.77s/it]
+ 96%|█████████▋| 26/27 [01:07<00:02,  2.76s/it]
+100%|██████████| 27/27 [01:10<00:00,  2.79s/it]                                       
+                                               {'eval_loss': '0.2049', 'eval_runtime': '75.78', 'eval_samples_per_second': '2.758', 'eval_steps_per_second': '1.386', 'eval_ppl': '1.227', 'memory/max_active (GiB)': '27.41', 'memory/max_allocated (GiB)': '27.41', 'memory/device_reserved (GiB)': '30.62', 'epoch': 0}
+  0%|          | 0/478 [01:21<?, ?it/s]
+100%|██████████| 27/27 [01:11<00:00,  2.79s/it]
+                                                 0%|          | 1/478 [01:37<12:54:11, 97.38s/it]                                                  {'loss': '0.1978', 'grad_norm': '6.188', 'learning_rate': '0', 'ppl': '1.219', 'memory/max_active (GiB)': '37.9', 'memory/max_allocated (GiB)': '37.9', 'memory/device_reserved (GiB)': '49.99', 'tokens/train_per_sec_per_gpu': '22.08', 'tokens/total': 32000, 'tokens/trainable': 687, 'epoch': '0.004167'}
+  0%|          | 1/478 [01:37<12:54:11, 97.38s/it]  0%|          | 2/478 [01:52<6:27:59, 48.91s/it]                                                  {'loss': '0.2119', 'grad_norm': '6.781', 'learning_rate': '4.255e-07', 'ppl': '1.236', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.84', 'tokens/total': 64000, 'tokens/trainable': 1310, 'epoch': '0.008333'}
+  0%|          | 2/478 [01:52<6:27:59, 48.91s/it]  1%|          | 3/478 [02:07<4:24:28, 33.41s/it]                                                 {'loss': '0.1929', 'grad_norm': '7.031', 'learning_rate': '8.511e-07', 'ppl': '1.213', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.47', 'tokens/total': 96000, 'tokens/trainable': 1862, 'epoch': '0.0125'}
+  1%|          | 3/478 [02:07<4:24:28, 33.41s/it]  1%|          | 4/478 [02:22<3:26:25, 26.13s/it]                                                 {'loss': '0.1885', 'grad_norm': '8.938', 'learning_rate': '1.277e-06', 'ppl': '1.207', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.06', 'tokens/total': 128000, 'tokens/trainable': 2402, 'epoch': '0.01667'}
+  1%|          | 4/478 [02:22<3:26:25, 26.13s/it]  1%|          | 5/478 [02:37<2:54:16, 22.11s/it]                                                 {'loss': '0.1914', 'grad_norm': '6.75', 'learning_rate': '1.702e-06', 'ppl': '1.211', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.36', 'tokens/total': 160000, 'tokens/trainable': 2951, 'epoch': '0.02083'}
+  1%|          | 5/478 [02:37<2:54:16, 22.11s/it]  1%|▏         | 6/478 [02:52<2:34:49, 19.68s/it]                                                 {'loss': '0.1206', 'grad_norm': '4.875', 'learning_rate': '2.128e-06', 'ppl': '1.128', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.83', 'tokens/total': 192000, 'tokens/trainable': 3574, 'epoch': '0.025'}
+  1%|▏         | 6/478 [02:52<2:34:49, 19.68s/it]  1%|▏         | 7/478 [03:07<2:22:25, 18.14s/it]                                                 {'loss': '0.2275', 'grad_norm': '9', 'learning_rate': '2.553e-06', 'ppl': '1.256', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.33', 'tokens/total': 224000, 'tokens/trainable': 4152, 'epoch': '0.02917'}
+  1%|▏         | 7/478 [03:07<2:22:25, 18.14s/it]  2%|▏         | 8/478 [03:22<2:14:12, 17.13s/it]                                                 {'loss': '0.2021', 'grad_norm': '7.312', 'learning_rate': '2.979e-06', 'ppl': '1.224', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.73', 'tokens/total': 256000, 'tokens/trainable': 4742, 'epoch': '0.03333'}
+  2%|▏         | 8/478 [03:22<2:14:12, 17.13s/it]  2%|▏         | 9/478 [03:37<2:08:38, 16.46s/it]                                                 {'loss': '0.1592', 'grad_norm': '4.969', 'learning_rate': '3.404e-06', 'ppl': '1.173', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.34', 'tokens/total': 288000, 'tokens/trainable': 5410, 'epoch': '0.0375'}
+  2%|▏         | 9/478 [03:37<2:08:38, 16.46s/it]  2%|▏         | 10/478 [03:52<2:04:46, 16.00s/it]                                                  {'loss': '0.1953', 'grad_norm': '5.938', 'learning_rate': '3.83e-06', 'ppl': '1.216', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.77', 'tokens/total': 320000, 'tokens/trainable': 6031, 'epoch': '0.04167'}
+  2%|▏         | 10/478 [03:52<2:04:46, 16.00s/it]  2%|▏         | 11/478 [04:07<2:02:05, 15.69s/it]                                                  {'loss': '0.1826', 'grad_norm': '5.5', 'learning_rate': '4.255e-06', 'ppl': '1.2', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.52', 'tokens/total': 352000, 'tokens/trainable': 6615, 'epoch': '0.04583'}
+  2%|▏         | 11/478 [04:07<2:02:05, 15.69s/it]  3%|▎         | 12/478 [04:22<2:00:07, 15.47s/it]                                                  {'loss': '0.2266', 'grad_norm': '6.594', 'learning_rate': '4.681e-06', 'ppl': '1.254', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.01', 'tokens/total': 384000, 'tokens/trainable': 7213, 'epoch': '0.05'}
+  3%|▎         | 12/478 [04:22<2:00:07, 15.47s/it]  3%|▎         | 13/478 [04:37<1:58:41, 15.32s/it]                                                  {'loss': '0.248', 'grad_norm': '7.031', 'learning_rate': '5.106e-06', 'ppl': '1.282', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.44', 'tokens/total': 416000, 'tokens/trainable': 7764, 'epoch': '0.05417'}
+  3%|▎         | 13/478 [04:37<1:58:41, 15.32s/it]  3%|▎         | 14/478 [04:52<1:57:38, 15.21s/it]                                                  {'loss': '0.1992', 'grad_norm': '6.344', 'learning_rate': '5.532e-06', 'ppl': '1.22', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.93', 'tokens/total': 448000, 'tokens/trainable': 8330, 'epoch': '0.05833'}
+  3%|▎         | 14/478 [04:52<1:57:38, 15.21s/it]  3%|▎         | 15/478 [05:06<1:56:49, 15.14s/it]                                                  {'loss': '0.1826', 'grad_norm': '7.469', 'learning_rate': '5.957e-06', 'ppl': '1.2', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.44', 'tokens/total': 480000, 'tokens/trainable': 9001, 'epoch': '0.0625'}
+  3%|▎         | 15/478 [05:07<1:56:49, 15.14s/it]  3%|▎         | 16/478 [05:21<1:56:10, 15.09s/it]                                                  {'loss': '0.1689', 'grad_norm': '5.688', 'learning_rate': '6.383e-06', 'ppl': '1.184', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.78', 'tokens/total': 512000, 'tokens/trainable': 9652, 'epoch': '0.06667'}
+  3%|▎         | 16/478 [05:21<1:56:10, 15.09s/it]  4%|▎         | 17/478 [05:36<1:55:34, 15.04s/it]                                                  {'loss': '0.1113', 'grad_norm': '6.594', 'learning_rate': '6.809e-06', 'ppl': '1.118', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '23.16', 'tokens/total': 544000, 'tokens/trainable': 10343, 'epoch': '0.07083'}
+  4%|▎         | 17/478 [05:36<1:55:34, 15.04s/it]  4%|▍         | 18/478 [05:51<1:55:09, 15.02s/it]                                                  {'loss': '0.1919', 'grad_norm': '6.281', 'learning_rate': '7.234e-06', 'ppl': '1.212', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.71', 'tokens/total': 576000, 'tokens/trainable': 11022, 'epoch': '0.075'}
+  4%|▍         | 18/478 [05:51<1:55:09, 15.02s/it]  4%|▍         | 19/478 [06:06<1:54:47, 15.00s/it]                                                  {'loss': '0.2617', 'grad_norm': '7.594', 'learning_rate': '7.66e-06', 'ppl': '1.299', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.8', 'tokens/total': 608000, 'tokens/trainable': 11584, 'epoch': '0.07917'}
+  4%|▍         | 19/478 [06:06<1:54:47, 15.00s/it]  4%|▍         | 20/478 [06:21<1:54:27, 14.99s/it]                                                  {'loss': '0.1699', 'grad_norm': '6.531', 'learning_rate': '8.085e-06', 'ppl': '1.185', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.31', 'tokens/total': 640000, 'tokens/trainable': 12221, 'epoch': '0.08333'}
+  4%|▍         | 20/478 [06:21<1:54:27, 14.99s/it]  4%|▍         | 21/478 [06:36<1:54:08, 14.99s/it]                                                  {'loss': '0.1992', 'grad_norm': '7.25', 'learning_rate': '8.511e-06', 'ppl': '1.22', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.05', 'tokens/total': 672000, 'tokens/trainable': 12880, 'epoch': '0.0875'}
+  4%|▍         | 21/478 [06:36<1:54:08, 14.99s/it]  5%|▍         | 22/478 [06:51<1:53:51, 14.98s/it]                                                  {'loss': '0.2231', 'grad_norm': '6.562', 'learning_rate': '8.936e-06', 'ppl': '1.25', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.91', 'tokens/total': 704000, 'tokens/trainable': 13505, 'epoch': '0.09167'}
+  5%|▍         | 22/478 [06:51<1:53:51, 14.98s/it]  5%|▍         | 23/478 [07:06<1:53:32, 14.97s/it]                                                  {'loss': '0.2583', 'grad_norm': '8.5', 'learning_rate': '9.362e-06', 'ppl': '1.295', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.84', 'tokens/total': 736000, 'tokens/trainable': 14187, 'epoch': '0.09583'}
+  5%|▍         | 23/478 [07:06<1:53:32, 14.97s/it]  5%|▌         | 24/478 [07:21<1:53:16, 14.97s/it]                                                  {'loss': '0.1807', 'grad_norm': '7.062', 'learning_rate': '9.787e-06', 'ppl': '1.198', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '16.49', 'tokens/total': 768000, 'tokens/trainable': 14680, 'epoch': '0.1'}
+  5%|▌         | 24/478 [07:21<1:53:16, 14.97s/it]  5%|▌         | 25/478 [07:36<1:53:01, 14.97s/it]                                                  {'loss': '0.2788', 'grad_norm': '6.625', 'learning_rate': '1.021e-05', 'ppl': '1.322', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.58', 'tokens/total': 800000, 'tokens/trainable': 15325, 'epoch': '0.1042'}
+  5%|▌         | 25/478 [07:36<1:53:01, 14.97s/it]  5%|▌         | 26/478 [07:51<1:52:46, 14.97s/it]                                                  {'loss': '0.1494', 'grad_norm': '4.75', 'learning_rate': '1.064e-05', 'ppl': '1.161', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.33', 'tokens/total': 832000, 'tokens/trainable': 15903, 'epoch': '0.1083'}
+  5%|▌         | 26/478 [07:51<1:52:46, 14.97s/it]  6%|▌         | 27/478 [08:06<1:52:31, 14.97s/it]                                                  {'loss': '0.1714', 'grad_norm': '5.812', 'learning_rate': '1.106e-05', 'ppl': '1.187', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.14', 'tokens/total': 864000, 'tokens/trainable': 16535, 'epoch': '0.1125'}
+  6%|▌         | 27/478 [08:06<1:52:31, 14.97s/it]  6%|▌         | 28/478 [08:21<1:52:13, 14.96s/it]                                                  {'loss': '0.2041', 'grad_norm': '6.062', 'learning_rate': '1.149e-05', 'ppl': '1.226', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.99', 'tokens/total': 896000, 'tokens/trainable': 17221, 'epoch': '0.1167'}
+  6%|▌         | 28/478 [08:21<1:52:13, 14.96s/it]  6%|▌         | 29/478 [08:36<1:51:59, 14.96s/it]                                                  {'loss': '0.1694', 'grad_norm': '4.969', 'learning_rate': '1.191e-05', 'ppl': '1.185', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.54', 'tokens/total': 928000, 'tokens/trainable': 17895, 'epoch': '0.1208'}
+  6%|▌         | 29/478 [08:36<1:51:59, 14.96s/it]  6%|▋         | 30/478 [08:51<1:51:45, 14.97s/it]                                                  {'loss': '0.1841', 'grad_norm': '5.438', 'learning_rate': '1.234e-05', 'ppl': '1.202', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '24.18', 'tokens/total': 960000, 'tokens/trainable': 18618, 'epoch': '0.125'}
+  6%|▋         | 30/478 [08:51<1:51:45, 14.97s/it]  6%|▋         | 31/478 [09:06<1:51:30, 14.97s/it]                                                  {'loss': '0.1812', 'grad_norm': '5.281', 'learning_rate': '1.277e-05', 'ppl': '1.199', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.14', 'tokens/total': 992000, 'tokens/trainable': 19250, 'epoch': '0.1292'}
+  6%|▋         | 31/478 [09:06<1:51:30, 14.97s/it]  7%|▋         | 32/478 [09:21<1:51:16, 14.97s/it]                                                  {'loss': '0.2358', 'grad_norm': '5.875', 'learning_rate': '1.319e-05', 'ppl': '1.266', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.54', 'tokens/total': 1024000, 'tokens/trainable': 19924, 'epoch': '0.1333'}
+  7%|▋         | 32/478 [09:21<1:51:16, 14.97s/it]  7%|▋         | 33/478 [09:36<1:51:01, 14.97s/it]                                                  {'loss': '0.1631', 'grad_norm': '5.312', 'learning_rate': '1.362e-05', 'ppl': '1.177', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.76', 'tokens/total': 1056000, 'tokens/trainable': 20545, 'epoch': '0.1375'}
+  7%|▋         | 33/478 [09:36<1:51:01, 14.97s/it]  7%|▋         | 34/478 [09:51<1:50:46, 14.97s/it]                                                  {'loss': '0.269', 'grad_norm': '6.281', 'learning_rate': '1.404e-05', 'ppl': '1.309', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.13', 'tokens/total': 1088000, 'tokens/trainable': 21147, 'epoch': '0.1417'}
+  7%|▋         | 34/478 [09:51<1:50:46, 14.97s/it]  7%|▋         | 35/478 [10:06<1:50:32, 14.97s/it]                                                  {'loss': '0.2339', 'grad_norm': '7', 'learning_rate': '1.447e-05', 'ppl': '1.264', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '23.01', 'tokens/total': 1120000, 'tokens/trainable': 21835, 'epoch': '0.1458'}
+  7%|▋         | 35/478 [10:06<1:50:32, 14.97s/it]  8%|▊         | 36/478 [10:21<1:50:17, 14.97s/it]                                                  {'loss': '0.1953', 'grad_norm': '5.5', 'learning_rate': '1.489e-05', 'ppl': '1.216', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.4', 'tokens/total': 1152000, 'tokens/trainable': 22505, 'epoch': '0.15'}
+  8%|▊         | 36/478 [10:21<1:50:17, 14.97s/it]  8%|▊         | 37/478 [10:36<1:50:02, 14.97s/it]                                                  {'loss': '0.1743', 'grad_norm': '5.344', 'learning_rate': '1.532e-05', 'ppl': '1.19', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.26', 'tokens/total': 1184000, 'tokens/trainable': 23081, 'epoch': '0.1542'}
+  8%|▊         | 37/478 [10:36<1:50:02, 14.97s/it]  8%|▊         | 38/478 [10:51<1:49:51, 14.98s/it]                                                  {'loss': '0.2637', 'grad_norm': '6.031', 'learning_rate': '1.574e-05', 'ppl': '1.302', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.96', 'tokens/total': 1216000, 'tokens/trainable': 23769, 'epoch': '0.1583'}
+  8%|▊         | 38/478 [10:51<1:49:51, 14.98s/it]  8%|▊         | 39/478 [11:06<1:49:32, 14.97s/it]                                                  {'loss': '0.1641', 'grad_norm': '4.688', 'learning_rate': '1.617e-05', 'ppl': '1.178', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '24.92', 'tokens/total': 1248000, 'tokens/trainable': 24513, 'epoch': '0.1625'}
+  8%|▊         | 39/478 [11:06<1:49:32, 14.97s/it]  8%|▊         | 40/478 [11:21<1:49:17, 14.97s/it]                                                  {'loss': '0.1709', 'grad_norm': '4.938', 'learning_rate': '1.66e-05', 'ppl': '1.186', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.32', 'tokens/total': 1280000, 'tokens/trainable': 25121, 'epoch': '0.1667'}
+  8%|▊         | 40/478 [11:21<1:49:17, 14.97s/it]  9%|▊         | 41/478 [11:36<1:49:02, 14.97s/it]                                                  {'loss': '0.1924', 'grad_norm': '5.562', 'learning_rate': '1.702e-05', 'ppl': '1.212', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.4', 'tokens/total': 1312000, 'tokens/trainable': 25761, 'epoch': '0.1708'}
+  9%|▊         | 41/478 [11:36<1:49:02, 14.97s/it]  9%|▉         | 42/478 [11:51<1:48:46, 14.97s/it]                                                  {'loss': '0.1841', 'grad_norm': '4.969', 'learning_rate': '1.745e-05', 'ppl': '1.202', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '24.32', 'tokens/total': 1344000, 'tokens/trainable': 26488, 'epoch': '0.175'}
+  9%|▉         | 42/478 [11:51<1:48:46, 14.97s/it]  9%|▉         | 43/478 [12:06<1:48:28, 14.96s/it]                                                  {'loss': '0.2368', 'grad_norm': '5.562', 'learning_rate': '1.787e-05', 'ppl': '1.267', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.31', 'tokens/total': 1376000, 'tokens/trainable': 27154, 'epoch': '0.1792'}
+  9%|▉         | 43/478 [12:06<1:48:28, 14.96s/it]  9%|▉         | 44/478 [12:21<1:48:14, 14.96s/it]                                                  {'loss': '0.2148', 'grad_norm': '5.625', 'learning_rate': '1.83e-05', 'ppl': '1.24', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.67', 'tokens/total': 1408000, 'tokens/trainable': 27802, 'epoch': '0.1833'}
+  9%|▉         | 44/478 [12:21<1:48:14, 14.96s/it]  9%|▉         | 45/478 [12:35<1:48:00, 14.97s/it]                                                  {'loss': '0.228', 'grad_norm': '5.906', 'learning_rate': '1.872e-05', 'ppl': '1.256', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.5', 'tokens/total': 1440000, 'tokens/trainable': 28475, 'epoch': '0.1875'}
+  9%|▉         | 45/478 [12:36<1:48:00, 14.97s/it] 10%|▉         | 46/478 [12:50<1:47:45, 14.97s/it]                                                  {'loss': '0.1973', 'grad_norm': '5.75', 'learning_rate': '1.915e-05', 'ppl': '1.218', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.88', 'tokens/total': 1472000, 'tokens/trainable': 29099, 'epoch': '0.1917'}
+ 10%|▉         | 46/478 [12:50<1:47:45, 14.97s/it] 10%|▉         | 47/478 [13:05<1:47:29, 14.97s/it]                                                  {'loss': '0.1963', 'grad_norm': '5.312', 'learning_rate': '1.957e-05', 'ppl': '1.217', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.48', 'tokens/total': 1504000, 'tokens/trainable': 29771, 'epoch': '0.1958'}
+ 10%|▉         | 47/478 [13:05<1:47:29, 14.97s/it] 10%|█         | 48/478 [13:20<1:47:15, 14.97s/it]                                                  {'loss': '0.2627', 'grad_norm': '7.156', 'learning_rate': '2e-05', 'ppl': '1.3', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.63', 'tokens/total': 1536000, 'tokens/trainable': 30328, 'epoch': '0.2'}
+ 10%|█         | 48/478 [13:20<1:47:15, 14.97s/it] 10%|█         | 49/478 [13:35<1:47:00, 14.97s/it]                                                  {'loss': '0.2505', 'grad_norm': '7.438', 'learning_rate': '2e-05', 'ppl': '1.285', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.57', 'tokens/total': 1568000, 'tokens/trainable': 30943, 'epoch': '0.2042'}
+ 10%|█         | 49/478 [13:35<1:47:00, 14.97s/it] 10%|█         | 50/478 [13:50<1:46:44, 14.96s/it]                                                  {'loss': '0.2026', 'grad_norm': '6.75', 'learning_rate': '2e-05', 'ppl': '1.225', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.94', 'tokens/total': 1600000, 'tokens/trainable': 31509, 'epoch': '0.2083'}
+ 10%|█         | 50/478 [13:50<1:46:44, 14.96s/it] 11%|█         | 51/478 [14:05<1:46:29, 14.96s/it]                                                  {'loss': '0.2368', 'grad_norm': '6.281', 'learning_rate': '2e-05', 'ppl': '1.267', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.24', 'tokens/total': 1632000, 'tokens/trainable': 32054, 'epoch': '0.2125'}
+ 11%|█         | 51/478 [14:05<1:46:29, 14.96s/it] 11%|█         | 52/478 [14:20<1:46:06, 14.94s/it]                                                  {'loss': '0.2344', 'grad_norm': '6.281', 'learning_rate': '2e-05', 'ppl': '1.264', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.65', 'tokens/total': 1664000, 'tokens/trainable': 32639, 'epoch': '0.2167'}
+ 11%|█         | 52/478 [14:20<1:46:06, 14.94s/it] 11%|█         | 53/478 [14:35<1:45:54, 14.95s/it]                                                  {'loss': '0.2012', 'grad_norm': '5.188', 'learning_rate': '1.999e-05', 'ppl': '1.223', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.51', 'tokens/total': 1696000, 'tokens/trainable': 33282, 'epoch': '0.2208'}
+ 11%|█         | 53/478 [14:35<1:45:54, 14.95s/it] 11%|█▏        | 54/478 [14:50<1:45:37, 14.95s/it]                                                  {'loss': '0.2617', 'grad_norm': '5.656', 'learning_rate': '1.999e-05', 'ppl': '1.299', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.75', 'tokens/total': 1728000, 'tokens/trainable': 33931, 'epoch': '0.225'}
+ 11%|█▏        | 54/478 [14:50<1:45:37, 14.95s/it] 12%|█▏        | 55/478 [15:05<1:45:22, 14.95s/it]                                                  {'loss': '0.207', 'grad_norm': '5.312', 'learning_rate': '1.999e-05', 'ppl': '1.23', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '23.32', 'tokens/total': 1760000, 'tokens/trainable': 34627, 'epoch': '0.2292'}
+ 12%|█▏        | 55/478 [15:05<1:45:22, 14.95s/it] 12%|█▏        | 56/478 [15:20<1:45:09, 14.95s/it]                                                  {'loss': '0.1846', 'grad_norm': '5.812', 'learning_rate': '1.998e-05', 'ppl': '1.203', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.67', 'tokens/total': 1792000, 'tokens/trainable': 35185, 'epoch': '0.2333'}
+ 12%|█▏        | 56/478 [15:20<1:45:09, 14.95s/it] 12%|█▏        | 57/478 [15:35<1:44:53, 14.95s/it]                                                  {'loss': '0.207', 'grad_norm': '5.594', 'learning_rate': '1.998e-05', 'ppl': '1.23', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.99', 'tokens/total': 1824000, 'tokens/trainable': 35871, 'epoch': '0.2375'}
+ 12%|█▏        | 57/478 [15:35<1:44:53, 14.95s/it] 12%|█▏        | 58/478 [15:50<1:44:40, 14.95s/it]                                                  {'loss': '0.2261', 'grad_norm': '6', 'learning_rate': '1.997e-05', 'ppl': '1.254', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.53', 'tokens/total': 1856000, 'tokens/trainable': 36425, 'epoch': '0.2417'}
+ 12%|█▏        | 58/478 [15:50<1:44:40, 14.95s/it] 12%|█▏        | 59/478 [16:05<1:44:27, 14.96s/it]                                                  {'loss': '0.1909', 'grad_norm': '6.75', 'learning_rate': '1.997e-05', 'ppl': '1.21', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.18', 'tokens/total': 1888000, 'tokens/trainable': 37028, 'epoch': '0.2458'}
+ 12%|█▏        | 59/478 [16:05<1:44:27, 14.96s/it] 13%|█▎        | 60/478 [16:20<1:44:13, 14.96s/it]                                                  {'loss': '0.2793', 'grad_norm': '7.062', 'learning_rate': '1.996e-05', 'ppl': '1.322', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.88', 'tokens/total': 1920000, 'tokens/trainable': 37712, 'epoch': '0.25'}
+ 13%|█▎        | 60/478 [16:20<1:44:13, 14.96s/it] 13%|█▎        | 61/478 [16:35<1:43:51, 14.94s/it]                                                  {'loss': '0.2065', 'grad_norm': '5.125', 'learning_rate': '1.996e-05', 'ppl': '1.229', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '23.98', 'tokens/total': 1952000, 'tokens/trainable': 38426, 'epoch': '0.2542'}
+ 13%|█▎        | 61/478 [16:35<1:43:51, 14.94s/it] 13%|█▎        | 62/478 [16:50<1:43:40, 14.95s/it]                                                  {'loss': '0.1821', 'grad_norm': '5.156', 'learning_rate': '1.995e-05', 'ppl': '1.2', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.67', 'tokens/total': 1984000, 'tokens/trainable': 39044, 'epoch': '0.2583'}
+ 13%|█▎        | 62/478 [16:50<1:43:40, 14.95s/it] 13%|█▎        | 63/478 [17:05<1:43:26, 14.96s/it]                                                  {'loss': '0.2495', 'grad_norm': '6.406', 'learning_rate': '1.994e-05', 'ppl': '1.283', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.31', 'tokens/total': 2016000, 'tokens/trainable': 39651, 'epoch': '0.2625'}
+ 13%|█▎        | 63/478 [17:05<1:43:26, 14.96s/it] 13%|█▎        | 64/478 [17:20<1:43:12, 14.96s/it]                                                  {'loss': '0.2432', 'grad_norm': '5.375', 'learning_rate': '1.993e-05', 'ppl': '1.275', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.92', 'tokens/total': 2048000, 'tokens/trainable': 40336, 'epoch': '0.2667'}
+ 13%|█▎        | 64/478 [17:20<1:43:12, 14.96s/it] 14%|█▎        | 65/478 [17:35<1:42:58, 14.96s/it]                                                  {'loss': '0.1392', 'grad_norm': '4.469', 'learning_rate': '1.992e-05', 'ppl': '1.149', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.03', 'tokens/total': 2080000, 'tokens/trainable': 40875, 'epoch': '0.2708'}
+ 14%|█▎        | 65/478 [17:35<1:42:58, 14.96s/it] 14%|█▍        | 66/478 [17:50<1:42:45, 14.96s/it]                                                  {'loss': '0.2319', 'grad_norm': '5.562', 'learning_rate': '1.991e-05', 'ppl': '1.261', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.24', 'tokens/total': 2112000, 'tokens/trainable': 41510, 'epoch': '0.275'}
+ 14%|█▍        | 66/478 [17:50<1:42:45, 14.96s/it] 14%|█▍        | 67/478 [18:05<1:42:31, 14.97s/it]                                                  {'loss': '0.189', 'grad_norm': '4.938', 'learning_rate': '1.99e-05', 'ppl': '1.208', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.07', 'tokens/total': 2144000, 'tokens/trainable': 42170, 'epoch': '0.2792'}
+ 14%|█▍        | 67/478 [18:05<1:42:31, 14.97s/it] 14%|█▍        | 68/478 [18:20<1:42:16, 14.97s/it]                                                  {'loss': '0.2231', 'grad_norm': '7.812', 'learning_rate': '1.989e-05', 'ppl': '1.25', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '16.62', 'tokens/total': 2176000, 'tokens/trainable': 42667, 'epoch': '0.2833'}
+ 14%|█▍        | 68/478 [18:20<1:42:16, 14.97s/it] 14%|█▍        | 69/478 [18:34<1:42:01, 14.97s/it]                                                  {'loss': '0.1948', 'grad_norm': '5.219', 'learning_rate': '1.988e-05', 'ppl': '1.215', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '17.96', 'tokens/total': 2208000, 'tokens/trainable': 43204, 'epoch': '0.2875'}
+ 14%|█▍        | 69/478 [18:34<1:42:01, 14.97s/it] 15%|█▍        | 70/478 [18:49<1:41:46, 14.97s/it]                                                  {'loss': '0.146', 'grad_norm': '4.156', 'learning_rate': '1.987e-05', 'ppl': '1.157', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.14', 'tokens/total': 2240000, 'tokens/trainable': 43866, 'epoch': '0.2917'}
+ 15%|█▍        | 70/478 [18:49<1:41:46, 14.97s/it] 15%|█▍        | 71/478 [19:04<1:41:31, 14.97s/it]                                                  {'loss': '0.2588', 'grad_norm': '5.969', 'learning_rate': '1.986e-05', 'ppl': '1.295', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.85', 'tokens/total': 2272000, 'tokens/trainable': 44549, 'epoch': '0.2958'}
+ 15%|█▍        | 71/478 [19:04<1:41:31, 14.97s/it] 15%|█▌        | 72/478 [19:19<1:41:16, 14.97s/it]                                                  {'loss': '0.1719', 'grad_norm': '4.938', 'learning_rate': '1.985e-05', 'ppl': '1.188', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.27', 'tokens/total': 2304000, 'tokens/trainable': 45155, 'epoch': '0.3'}
+ 15%|█▌        | 72/478 [19:19<1:41:16, 14.97s/it] 15%|█▌        | 73/478 [19:34<1:41:02, 14.97s/it]                                                  {'loss': '0.1885', 'grad_norm': '5.781', 'learning_rate': '1.983e-05', 'ppl': '1.207', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.49', 'tokens/total': 2336000, 'tokens/trainable': 45738, 'epoch': '0.3042'}
+ 15%|█▌        | 73/478 [19:34<1:41:02, 14.97s/it] 15%|█▌        | 74/478 [19:49<1:40:43, 14.96s/it]                                                  {'loss': '0.2168', 'grad_norm': '6.219', 'learning_rate': '1.982e-05', 'ppl': '1.242', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.08', 'tokens/total': 2368000, 'tokens/trainable': 46367, 'epoch': '0.3083'}
+ 15%|█▌        | 74/478 [19:49<1:40:43, 14.96s/it] 16%|█▌        | 75/478 [20:04<1:40:29, 14.96s/it]                                                  {'loss': '0.1719', 'grad_norm': '5.375', 'learning_rate': '1.981e-05', 'ppl': '1.188', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.66', 'tokens/total': 2400000, 'tokens/trainable': 46955, 'epoch': '0.3125'}
+ 16%|█▌        | 75/478 [20:04<1:40:29, 14.96s/it] 16%|█▌        | 76/478 [20:19<1:40:16, 14.97s/it]                                                  {'loss': '0.2334', 'grad_norm': '6.188', 'learning_rate': '1.979e-05', 'ppl': '1.263', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.3', 'tokens/total': 2432000, 'tokens/trainable': 47562, 'epoch': '0.3167'}
+ 16%|█▌        | 76/478 [20:19<1:40:16, 14.97s/it] 16%|█▌        | 77/478 [20:34<1:40:01, 14.97s/it]                                                  {'loss': '0.3389', 'grad_norm': '7.906', 'learning_rate': '1.978e-05', 'ppl': '1.403', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19', 'tokens/total': 2464000, 'tokens/trainable': 48130, 'epoch': '0.3208'}
+ 16%|█▌        | 77/478 [20:34<1:40:01, 14.97s/it] 16%|█▋        | 78/478 [20:49<1:39:50, 14.98s/it]                                                  {'loss': '0.1982', 'grad_norm': '4.531', 'learning_rate': '1.976e-05', 'ppl': '1.219', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.93', 'tokens/total': 2496000, 'tokens/trainable': 48787, 'epoch': '0.325'}
+ 16%|█▋        | 78/478 [20:49<1:39:50, 14.98s/it] 17%|█▋        | 79/478 [21:04<1:39:34, 14.97s/it]                                                  {'loss': '0.2163', 'grad_norm': '5.594', 'learning_rate': '1.975e-05', 'ppl': '1.241', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21', 'tokens/total': 2528000, 'tokens/trainable': 49415, 'epoch': '0.3292'}
+ 17%|█▋        | 79/478 [21:04<1:39:34, 14.97s/it] 17%|█▋        | 80/478 [21:19<1:39:15, 14.96s/it]                                                  {'loss': '0.1899', 'grad_norm': '4.906', 'learning_rate': '1.973e-05', 'ppl': '1.209', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.99', 'tokens/total': 2560000, 'tokens/trainable': 50071, 'epoch': '0.3333'}
+ 17%|█▋        | 80/478 [21:19<1:39:15, 14.96s/it] 17%|█▋        | 81/478 [21:34<1:38:58, 14.96s/it]                                                  {'loss': '0.2627', 'grad_norm': '6.5', 'learning_rate': '1.971e-05', 'ppl': '1.3', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.61', 'tokens/total': 2592000, 'tokens/trainable': 50746, 'epoch': '0.3375'}
+ 17%|█▋        | 81/478 [21:34<1:38:58, 14.96s/it] 17%|█▋        | 82/478 [21:49<1:38:46, 14.96s/it]                                                  {'loss': '0.1797', 'grad_norm': '5.031', 'learning_rate': '1.969e-05', 'ppl': '1.197', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.52', 'tokens/total': 2624000, 'tokens/trainable': 51360, 'epoch': '0.3417'}
+ 17%|█▋        | 82/478 [21:49<1:38:46, 14.96s/it] 17%|█▋        | 83/478 [22:04<1:38:30, 14.96s/it]                                                  {'loss': '0.1689', 'grad_norm': '7.531', 'learning_rate': '1.968e-05', 'ppl': '1.184', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '16.22', 'tokens/total': 2656000, 'tokens/trainable': 51845, 'epoch': '0.3458'}
+ 17%|█▋        | 83/478 [22:04<1:38:30, 14.96s/it] 18%|█▊        | 84/478 [22:19<1:38:15, 14.96s/it]                                                  {'loss': '0.1943', 'grad_norm': '5.688', 'learning_rate': '1.966e-05', 'ppl': '1.214', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.68', 'tokens/total': 2688000, 'tokens/trainable': 52523, 'epoch': '0.35'}
+ 18%|█▊        | 84/478 [22:19<1:38:15, 14.96s/it] 18%|█▊        | 85/478 [22:34<1:38:00, 14.96s/it]                                                  {'loss': '0.1772', 'grad_norm': '5.344', 'learning_rate': '1.964e-05', 'ppl': '1.194', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.51', 'tokens/total': 2720000, 'tokens/trainable': 53106, 'epoch': '0.3542'}
+ 18%|█▊        | 85/478 [22:34<1:38:00, 14.96s/it] 18%|█▊        | 86/478 [22:49<1:37:47, 14.97s/it]                                                  {'loss': '0.2812', 'grad_norm': '7.031', 'learning_rate': '1.962e-05', 'ppl': '1.325', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '17.42', 'tokens/total': 2752000, 'tokens/trainable': 53627, 'epoch': '0.3583'}
+ 18%|█▊        | 86/478 [22:49<1:37:47, 14.97s/it] 18%|█▊        | 87/478 [23:04<1:37:32, 14.97s/it]                                                  {'loss': '0.1904', 'grad_norm': '5.344', 'learning_rate': '1.96e-05', 'ppl': '1.21', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.8', 'tokens/total': 2784000, 'tokens/trainable': 54249, 'epoch': '0.3625'}
+ 18%|█▊        | 87/478 [23:04<1:37:32, 14.97s/it] 18%|█▊        | 88/478 [23:19<1:37:16, 14.97s/it]                                                  {'loss': '0.2026', 'grad_norm': '5.344', 'learning_rate': '1.958e-05', 'ppl': '1.225', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '23.22', 'tokens/total': 2816000, 'tokens/trainable': 54943, 'epoch': '0.3667'}
+ 18%|█▊        | 88/478 [23:19<1:37:16, 14.97s/it] 19%|█▊        | 89/478 [23:34<1:37:02, 14.97s/it]                                                  {'loss': '0.23', 'grad_norm': '7.156', 'learning_rate': '1.956e-05', 'ppl': '1.259', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.03', 'tokens/total': 2848000, 'tokens/trainable': 55572, 'epoch': '0.3708'}
+ 19%|█▊        | 89/478 [23:34<1:37:02, 14.97s/it] 19%|█▉        | 90/478 [23:49<1:36:47, 14.97s/it]                                                  {'loss': '0.2236', 'grad_norm': '5.625', 'learning_rate': '1.954e-05', 'ppl': '1.251', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '23.91', 'tokens/total': 2880000, 'tokens/trainable': 56287, 'epoch': '0.375'}
+ 19%|█▉        | 90/478 [23:49<1:36:47, 14.97s/it] 19%|█▉        | 91/478 [24:04<1:36:32, 14.97s/it]                                                  {'loss': '0.2427', 'grad_norm': '6.5', 'learning_rate': '1.951e-05', 'ppl': '1.275', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.5', 'tokens/total': 2912000, 'tokens/trainable': 56900, 'epoch': '0.3792'}
+ 19%|█▉        | 91/478 [24:04<1:36:32, 14.97s/it] 19%|█▉        | 92/478 [24:19<1:36:17, 14.97s/it]                                                  {'loss': '0.2754', 'grad_norm': '6.562', 'learning_rate': '1.949e-05', 'ppl': '1.317', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.6', 'tokens/total': 2944000, 'tokens/trainable': 57456, 'epoch': '0.3833'}
+ 19%|█▉        | 92/478 [24:19<1:36:17, 14.97s/it] 19%|█▉        | 93/478 [24:34<1:36:02, 14.97s/it]                                                  {'loss': '0.2202', 'grad_norm': '5.438', 'learning_rate': '1.947e-05', 'ppl': '1.246', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.98', 'tokens/total': 2976000, 'tokens/trainable': 58113, 'epoch': '0.3875'}
+ 19%|█▉        | 93/478 [24:34<1:36:02, 14.97s/it] 20%|█▉        | 94/478 [24:49<1:35:46, 14.97s/it]                                                  {'loss': '0.1865', 'grad_norm': '5.844', 'learning_rate': '1.944e-05', 'ppl': '1.205', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '17.47', 'tokens/total': 3008000, 'tokens/trainable': 58635, 'epoch': '0.3917'}
+ 20%|█▉        | 94/478 [24:49<1:35:46, 14.97s/it] 20%|█▉        | 95/478 [25:04<1:35:33, 14.97s/it]                                                  {'loss': '0.2656', 'grad_norm': '6.562', 'learning_rate': '1.942e-05', 'ppl': '1.304', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '16.09', 'tokens/total': 3040000, 'tokens/trainable': 59116, 'epoch': '0.3958'}
+ 20%|█▉        | 95/478 [25:04<1:35:33, 14.97s/it] 20%|██        | 96/478 [25:19<1:35:18, 14.97s/it]                                                  {'loss': '0.1924', 'grad_norm': '6.875', 'learning_rate': '1.939e-05', 'ppl': '1.212', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '16.69', 'tokens/total': 3072000, 'tokens/trainable': 59615, 'epoch': '0.4'}
+ 20%|██        | 96/478 [25:19<1:35:18, 14.97s/it] 20%|██        | 97/478 [25:34<1:35:04, 14.97s/it]                                                  {'loss': '0.2271', 'grad_norm': '7.094', 'learning_rate': '1.937e-05', 'ppl': '1.255', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.66', 'tokens/total': 3104000, 'tokens/trainable': 60233, 'epoch': '0.4042'}
+ 20%|██        | 97/478 [25:34<1:35:04, 14.97s/it] 21%|██        | 98/478 [25:49<1:34:49, 14.97s/it]                                                  {'loss': '0.2163', 'grad_norm': '5.062', 'learning_rate': '1.934e-05', 'ppl': '1.241', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.87', 'tokens/total': 3136000, 'tokens/trainable': 60857, 'epoch': '0.4083'}
+ 21%|██        | 98/478 [25:49<1:34:49, 14.97s/it] 21%|██        | 99/478 [26:03<1:34:33, 14.97s/it]                                                  {'loss': '0.1621', 'grad_norm': '4.938', 'learning_rate': '1.932e-05', 'ppl': '1.176', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.88', 'tokens/total': 3168000, 'tokens/trainable': 61541, 'epoch': '0.4125'}
+ 21%|██        | 99/478 [26:04<1:34:33, 14.97s/it] 21%|██        | 100/478 [26:18<1:34:18, 14.97s/it]                                                   {'loss': '0.2051', 'grad_norm': '5.219', 'learning_rate': '1.929e-05', 'ppl': '1.228', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.57', 'tokens/total': 3200000, 'tokens/trainable': 62156, 'epoch': '0.4167'}
+ 21%|██        | 100/478 [26:18<1:34:18, 14.97s/it] 21%|██        | 101/478 [26:33<1:34:02, 14.97s/it]                                                   {'loss': '0.2344', 'grad_norm': '6.625', 'learning_rate': '1.926e-05', 'ppl': '1.264', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.6', 'tokens/total': 3232000, 'tokens/trainable': 62712, 'epoch': '0.4208'}
+ 21%|██        | 101/478 [26:33<1:34:02, 14.97s/it] 21%|██▏       | 102/478 [26:48<1:33:46, 14.96s/it]                                                   {'loss': '0.1924', 'grad_norm': '5.812', 'learning_rate': '1.924e-05', 'ppl': '1.212', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.25', 'tokens/total': 3264000, 'tokens/trainable': 63347, 'epoch': '0.425'}
+ 21%|██▏       | 102/478 [26:48<1:33:46, 14.96s/it] 22%|██▏       | 103/478 [27:03<1:33:31, 14.96s/it]                                                   {'loss': '0.1865', 'grad_norm': '5', 'learning_rate': '1.921e-05', 'ppl': '1.205', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.4', 'tokens/total': 3296000, 'tokens/trainable': 63927, 'epoch': '0.4292'}
+ 22%|██▏       | 103/478 [27:03<1:33:31, 14.96s/it] 22%|██▏       | 104/478 [27:18<1:33:17, 14.97s/it]                                                   {'loss': '0.2471', 'grad_norm': '5.438', 'learning_rate': '1.918e-05', 'ppl': '1.28', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.87', 'tokens/total': 3328000, 'tokens/trainable': 64551, 'epoch': '0.4333'}
+ 22%|██▏       | 104/478 [27:18<1:33:17, 14.97s/it] 22%|██▏       | 105/478 [27:33<1:33:02, 14.97s/it]                                                   {'loss': '0.1685', 'grad_norm': '5.125', 'learning_rate': '1.915e-05', 'ppl': '1.183', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.44', 'tokens/total': 3360000, 'tokens/trainable': 65192, 'epoch': '0.4375'}
+ 22%|██▏       | 105/478 [27:33<1:33:02, 14.97s/it] 22%|██▏       | 106/478 [27:48<1:32:47, 14.97s/it]                                                   {'loss': '0.1958', 'grad_norm': '5.219', 'learning_rate': '1.912e-05', 'ppl': '1.216', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '23.29', 'tokens/total': 3392000, 'tokens/trainable': 65888, 'epoch': '0.4417'}
+ 22%|██▏       | 106/478 [27:48<1:32:47, 14.97s/it] 22%|██▏       | 107/478 [28:03<1:32:32, 14.97s/it]                                                   {'loss': '0.2749', 'grad_norm': '6.094', 'learning_rate': '1.909e-05', 'ppl': '1.316', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.91', 'tokens/total': 3424000, 'tokens/trainable': 66513, 'epoch': '0.4458'}
+ 22%|██▏       | 107/478 [28:03<1:32:32, 14.97s/it] 23%|██▎       | 108/478 [28:18<1:32:17, 14.97s/it]                                                   {'loss': '0.1978', 'grad_norm': '4.844', 'learning_rate': '1.906e-05', 'ppl': '1.219', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.4', 'tokens/total': 3456000, 'tokens/trainable': 67123, 'epoch': '0.45'}
+ 23%|██▎       | 108/478 [28:18<1:32:17, 14.97s/it] 23%|██▎       | 109/478 [28:33<1:32:02, 14.97s/it]                                                   {'loss': '0.2812', 'grad_norm': '6.406', 'learning_rate': '1.903e-05', 'ppl': '1.325', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.87', 'tokens/total': 3488000, 'tokens/trainable': 67687, 'epoch': '0.4542'}
+ 23%|██▎       | 109/478 [28:33<1:32:02, 14.97s/it] 23%|██▎       | 110/478 [28:48<1:31:47, 14.97s/it]                                                   {'loss': '0.2578', 'grad_norm': '6.062', 'learning_rate': '1.9e-05', 'ppl': '1.294', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.61', 'tokens/total': 3520000, 'tokens/trainable': 68333, 'epoch': '0.4583'}
+ 23%|██▎       | 110/478 [28:48<1:31:47, 14.97s/it] 23%|██▎       | 111/478 [29:03<1:31:39, 14.99s/it]                                                   {'loss': '0.1865', 'grad_norm': '4.438', 'learning_rate': '1.896e-05', 'ppl': '1.205', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.88', 'tokens/total': 3552000, 'tokens/trainable': 68990, 'epoch': '0.4625'}
+ 23%|██▎       | 111/478 [29:03<1:31:39, 14.99s/it] 23%|██▎       | 112/478 [29:18<1:31:22, 14.98s/it]                                                   {'loss': '0.2842', 'grad_norm': '6.375', 'learning_rate': '1.893e-05', 'ppl': '1.329', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.87', 'tokens/total': 3584000, 'tokens/trainable': 69554, 'epoch': '0.4667'}
+ 23%|██▎       | 112/478 [29:18<1:31:22, 14.98s/it] 24%|██▎       | 113/478 [29:33<1:30:58, 14.96s/it]                                                   {'loss': '0.29', 'grad_norm': '7.375', 'learning_rate': '1.89e-05', 'ppl': '1.336', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.7', 'tokens/total': 3616000, 'tokens/trainable': 70200, 'epoch': '0.4708'}
+ 24%|██▎       | 113/478 [29:33<1:30:58, 14.96s/it] 24%|██▍       | 114/478 [29:48<1:30:44, 14.96s/it]                                                   {'loss': '0.1782', 'grad_norm': '9.062', 'learning_rate': '1.886e-05', 'ppl': '1.195', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.59', 'tokens/total': 3648000, 'tokens/trainable': 70875, 'epoch': '0.475'}
+ 24%|██▍       | 114/478 [29:48<1:30:44, 14.96s/it] 24%|██▍       | 115/478 [30:03<1:30:30, 14.96s/it]                                                   {'loss': '0.2656', 'grad_norm': '5.594', 'learning_rate': '1.883e-05', 'ppl': '1.304', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.14', 'tokens/total': 3680000, 'tokens/trainable': 71477, 'epoch': '0.4792'}
+ 24%|██▍       | 115/478 [30:03<1:30:30, 14.96s/it] 24%|██▍       | 116/478 [30:18<1:30:16, 14.96s/it]                                                   {'loss': '0.2251', 'grad_norm': '4.688', 'learning_rate': '1.88e-05', 'ppl': '1.252', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.03', 'tokens/total': 3712000, 'tokens/trainable': 72016, 'epoch': '0.4833'}
+ 24%|██▍       | 116/478 [30:18<1:30:16, 14.96s/it] 24%|██▍       | 117/478 [30:33<1:30:01, 14.96s/it]                                                   {'loss': '0.2319', 'grad_norm': '5.438', 'learning_rate': '1.876e-05', 'ppl': '1.261', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.68', 'tokens/total': 3744000, 'tokens/trainable': 72664, 'epoch': '0.4875'}
+ 24%|██▍       | 117/478 [30:33<1:30:01, 14.96s/it] 25%|██▍       | 118/478 [30:48<1:29:49, 14.97s/it]                                                   {'loss': '0.1895', 'grad_norm': '5.281', 'learning_rate': '1.873e-05', 'ppl': '1.209', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.71', 'tokens/total': 3776000, 'tokens/trainable': 73314, 'epoch': '0.4917'}
+ 25%|██▍       | 118/478 [30:48<1:29:49, 14.97s/it] 25%|██▍       | 119/478 [31:03<1:29:33, 14.97s/it]                                                   {'loss': '0.1973', 'grad_norm': '6.719', 'learning_rate': '1.869e-05', 'ppl': '1.218', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.98', 'tokens/total': 3808000, 'tokens/trainable': 74001, 'epoch': '0.4958'}
+ 25%|██▍       | 119/478 [31:03<1:29:33, 14.97s/it] 25%|██▌       | 120/478 [31:18<1:29:18, 14.97s/it]                                                   {'loss': '0.2339', 'grad_norm': '4.969', 'learning_rate': '1.865e-05', 'ppl': '1.264', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.25', 'tokens/total': 3840000, 'tokens/trainable': 74666, 'epoch': '0.5'}
+ 25%|██▌       | 120/478 [31:18<1:29:18, 14.97s/it][2026-04-17 02:40:42,441] [INFO] [axolotl.core.trainers.base] Running evaluation step...
+[2026-04-17 02:40:50,590] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [54, 54]
+
+  0%|          | 0/27 [00:00<?, ?it/s]
+  7%|▋         | 2/27 [00:02<00:35,  1.41s/it]
+ 11%|█         | 3/27 [00:05<00:47,  1.97s/it]
+ 15%|█▍        | 4/27 [00:08<00:51,  2.26s/it]
+ 19%|█▊        | 5/27 [00:11<00:53,  2.43s/it]
+ 22%|██▏       | 6/27 [00:13<00:53,  2.54s/it]
+ 26%|██▌       | 7/27 [00:16<00:52,  2.61s/it]
+ 30%|██▉       | 8/27 [00:19<00:50,  2.65s/it]
+ 33%|███▎      | 9/27 [00:22<00:48,  2.68s/it]
+ 37%|███▋      | 10/27 [00:24<00:45,  2.71s/it]
+ 41%|████      | 11/27 [00:27<00:43,  2.72s/it]
+ 44%|████▍     | 12/27 [00:30<00:39,  2.64s/it]
+ 48%|████▊     | 13/27 [00:33<00:38,  2.76s/it]
+ 52%|█████▏    | 14/27 [00:35<00:35,  2.76s/it]
+ 56%|█████▌    | 15/27 [00:38<00:33,  2.76s/it]
+ 59%|█████▉    | 16/27 [00:41<00:30,  2.76s/it]
+ 63%|██████▎   | 17/27 [00:44<00:27,  2.76s/it]
+ 67%|██████▋   | 18/27 [00:46<00:24,  2.76s/it]
+ 70%|███████   | 19/27 [00:49<00:22,  2.76s/it]
+ 74%|███████▍  | 20/27 [00:52<00:19,  2.75s/it]
+ 78%|███████▊  | 21/27 [00:54<00:16,  2.67s/it]
+ 81%|████████▏ | 22/27 [00:57<00:13,  2.78s/it]
+ 85%|████████▌ | 23/27 [01:00<00:11,  2.77s/it]
+ 89%|████████▉ | 24/27 [01:03<00:08,  2.77s/it]
+ 93%|█████████▎| 25/27 [01:06<00:05,  2.76s/it]
+ 96%|█████████▋| 26/27 [01:08<00:02,  2.76s/it]
+100%|██████████| 27/27 [01:11<00:00,  2.79s/it]                                                   
+                                               {'eval_loss': '0.2288', 'eval_runtime': '75.12', 'eval_samples_per_second': '2.782', 'eval_steps_per_second': '1.398', 'eval_ppl': '1.257', 'memory/max_active (GiB)': '34.91', 'memory/max_allocated (GiB)': '34.91', 'memory/device_reserved (GiB)': '59.04', 'epoch': '0.5', 'tokens/train_per_sec_per_gpu': '0'}
+ 25%|██▌       | 120/478 [32:41<1:29:18, 14.97s/it]
+100%|██████████| 27/27 [01:13<00:00,  2.79s/it]
+                                                25%|██▌       | 121/478 [32:56<3:57:42, 39.95s/it]                                                   {'loss': '0.2651', 'grad_norm': '6.125', 'learning_rate': '1.862e-05', 'ppl': '1.304', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.61', 'tokens/total': 3872000, 'tokens/trainable': 75192, 'epoch': '0.5042'}
+ 25%|██▌       | 121/478 [32:56<3:57:42, 39.95s/it] 26%|██▌       | 122/478 [33:11<3:12:32, 32.45s/it]                                                   {'loss': '0.2109', 'grad_norm': '5.812', 'learning_rate': '1.858e-05', 'ppl': '1.235', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.68', 'tokens/total': 3904000, 'tokens/trainable': 75780, 'epoch': '0.5083'}
+ 26%|██▌       | 122/478 [33:11<3:12:32, 32.45s/it] 26%|██▌       | 123/478 [33:26<2:40:56, 27.20s/it]                                                   {'loss': '0.2168', 'grad_norm': '5.438', 'learning_rate': '1.854e-05', 'ppl': '1.242', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.31', 'tokens/total': 3936000, 'tokens/trainable': 76297, 'epoch': '0.5125'}
+ 26%|██▌       | 123/478 [33:26<2:40:56, 27.20s/it] 26%|██▌       | 124/478 [33:41<2:18:48, 23.53s/it]                                                   {'loss': '0.2153', 'grad_norm': '5.344', 'learning_rate': '1.85e-05', 'ppl': '1.24', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.41', 'tokens/total': 3968000, 'tokens/trainable': 76817, 'epoch': '0.5167'}
+ 26%|██▌       | 124/478 [33:41<2:18:48, 23.53s/it] 26%|██▌       | 125/478 [33:56<2:03:16, 20.95s/it]                                                   {'loss': '0.2075', 'grad_norm': '8.938', 'learning_rate': '1.847e-05', 'ppl': '1.231', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.12', 'tokens/total': 4000000, 'tokens/trainable': 77418, 'epoch': '0.5208'}
+ 26%|██▌       | 125/478 [33:56<2:03:16, 20.95s/it] 26%|██▋       | 126/478 [34:11<1:52:21, 19.15s/it]                                                   {'loss': '0.2148', 'grad_norm': '5.344', 'learning_rate': '1.843e-05', 'ppl': '1.24', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.9', 'tokens/total': 4032000, 'tokens/trainable': 78072, 'epoch': '0.525'}
+ 26%|██▋       | 126/478 [34:11<1:52:21, 19.15s/it] 27%|██▋       | 127/478 [34:26<1:44:40, 17.89s/it]                                                   {'loss': '0.2383', 'grad_norm': '5.594', 'learning_rate': '1.839e-05', 'ppl': '1.269', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.32', 'tokens/total': 4064000, 'tokens/trainable': 78709, 'epoch': '0.5292'}
+ 27%|██▋       | 127/478 [34:26<1:44:40, 17.89s/it] 27%|██▋       | 128/478 [34:41<1:39:13, 17.01s/it]                                                   {'loss': '0.2544', 'grad_norm': '5.969', 'learning_rate': '1.835e-05', 'ppl': '1.29', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.15', 'tokens/total': 4096000, 'tokens/trainable': 79251, 'epoch': '0.5333'}
+ 27%|██▋       | 128/478 [34:41<1:39:13, 17.01s/it] 27%|██▋       | 129/478 [34:56<1:35:21, 16.39s/it]                                                   {'loss': '0.2046', 'grad_norm': '5.156', 'learning_rate': '1.831e-05', 'ppl': '1.227', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.95', 'tokens/total': 4128000, 'tokens/trainable': 79847, 'epoch': '0.5375'}
+ 27%|██▋       | 129/478 [34:56<1:35:21, 16.39s/it] 27%|██▋       | 130/478 [35:11<1:32:35, 15.96s/it]                                                   {'loss': '0.2026', 'grad_norm': '4.969', 'learning_rate': '1.827e-05', 'ppl': '1.225', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.98', 'tokens/total': 4160000, 'tokens/trainable': 80444, 'epoch': '0.5417'}
+ 27%|██▋       | 130/478 [35:11<1:32:35, 15.96s/it] 27%|██▋       | 131/478 [35:26<1:30:33, 15.66s/it]                                                   {'loss': '0.2222', 'grad_norm': '5.688', 'learning_rate': '1.823e-05', 'ppl': '1.249', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.91', 'tokens/total': 4192000, 'tokens/trainable': 80979, 'epoch': '0.5458'}
+ 27%|██▋       | 131/478 [35:26<1:30:33, 15.66s/it] 28%|██▊       | 132/478 [35:41<1:29:04, 15.45s/it]                                                   {'loss': '0.2271', 'grad_norm': '5.656', 'learning_rate': '1.818e-05', 'ppl': '1.255', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.49', 'tokens/total': 4224000, 'tokens/trainable': 81531, 'epoch': '0.55'}
+ 28%|██▊       | 132/478 [35:41<1:29:04, 15.45s/it] 28%|██▊       | 133/478 [35:55<1:27:55, 15.29s/it]                                                   {'loss': '0.2725', 'grad_norm': '4.938', 'learning_rate': '1.814e-05', 'ppl': '1.313', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.24', 'tokens/total': 4256000, 'tokens/trainable': 82224, 'epoch': '0.5542'}
+ 28%|██▊       | 133/478 [35:55<1:27:55, 15.29s/it] 28%|██▊       | 134/478 [36:10<1:27:05, 15.19s/it]                                                   {'loss': '0.2539', 'grad_norm': '6.031', 'learning_rate': '1.81e-05', 'ppl': '1.289', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.04', 'tokens/total': 4288000, 'tokens/trainable': 82912, 'epoch': '0.5583'}
+ 28%|██▊       | 134/478 [36:10<1:27:05, 15.19s/it] 28%|██▊       | 135/478 [36:25<1:26:25, 15.12s/it]                                                   {'loss': '0.2368', 'grad_norm': '5.312', 'learning_rate': '1.806e-05', 'ppl': '1.267', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.44', 'tokens/total': 4320000, 'tokens/trainable': 83582, 'epoch': '0.5625'}
+ 28%|██▊       | 135/478 [36:25<1:26:25, 15.12s/it] 28%|██▊       | 136/478 [36:40<1:25:53, 15.07s/it]                                                   {'loss': '0.2617', 'grad_norm': '7.75', 'learning_rate': '1.801e-05', 'ppl': '1.299', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.19', 'tokens/total': 4352000, 'tokens/trainable': 84185, 'epoch': '0.5667'}
+ 28%|██▊       | 136/478 [36:40<1:25:53, 15.07s/it] 29%|██▊       | 137/478 [36:55<1:25:24, 15.03s/it]                                                   {'loss': '0.2041', 'grad_norm': '4.906', 'learning_rate': '1.797e-05', 'ppl': '1.226', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.79', 'tokens/total': 4384000, 'tokens/trainable': 84805, 'epoch': '0.5708'}
+ 29%|██▊       | 137/478 [36:55<1:25:24, 15.03s/it] 29%|██▉       | 138/478 [37:10<1:25:01, 15.01s/it]                                                   {'loss': '0.2881', 'grad_norm': '7.125', 'learning_rate': '1.792e-05', 'ppl': '1.334', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.32', 'tokens/total': 4416000, 'tokens/trainable': 85382, 'epoch': '0.575'}
+ 29%|██▉       | 138/478 [37:10<1:25:01, 15.01s/it] 29%|██▉       | 139/478 [37:25<1:24:41, 14.99s/it]                                                   {'loss': '0.1733', 'grad_norm': '4.5', 'learning_rate': '1.788e-05', 'ppl': '1.189', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.12', 'tokens/total': 4448000, 'tokens/trainable': 85983, 'epoch': '0.5792'}
+ 29%|██▉       | 139/478 [37:25<1:24:41, 14.99s/it] 29%|██▉       | 140/478 [37:40<1:24:22, 14.98s/it]                                                   {'loss': '0.2285', 'grad_norm': '5.844', 'learning_rate': '1.783e-05', 'ppl': '1.257', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '16.68', 'tokens/total': 4480000, 'tokens/trainable': 86481, 'epoch': '0.5833'}
+ 29%|██▉       | 140/478 [37:40<1:24:22, 14.98s/it] 29%|██▉       | 141/478 [37:55<1:23:58, 14.95s/it]                                                   {'loss': '0.2144', 'grad_norm': '5.438', 'learning_rate': '1.779e-05', 'ppl': '1.239', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.69', 'tokens/total': 4512000, 'tokens/trainable': 87126, 'epoch': '0.5875'}
+ 29%|██▉       | 141/478 [37:55<1:23:58, 14.95s/it] 30%|██▉       | 142/478 [38:10<1:23:42, 14.95s/it]                                                   {'loss': '0.2754', 'grad_norm': '6.188', 'learning_rate': '1.774e-05', 'ppl': '1.317', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.84', 'tokens/total': 4544000, 'tokens/trainable': 87778, 'epoch': '0.5917'}
+ 30%|██▉       | 142/478 [38:10<1:23:42, 14.95s/it] 30%|██▉       | 143/478 [38:25<1:23:28, 14.95s/it]                                                   {'loss': '0.2407', 'grad_norm': '5.375', 'learning_rate': '1.77e-05', 'ppl': '1.272', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.08', 'tokens/total': 4576000, 'tokens/trainable': 88408, 'epoch': '0.5958'}
+ 30%|██▉       | 143/478 [38:25<1:23:28, 14.95s/it] 30%|███       | 144/478 [38:40<1:23:14, 14.95s/it]                                                   {'loss': '0.176', 'grad_norm': '5.844', 'learning_rate': '1.765e-05', 'ppl': '1.192', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.75', 'tokens/total': 4608000, 'tokens/trainable': 88968, 'epoch': '0.6'}
+ 30%|███       | 144/478 [38:40<1:23:14, 14.95s/it] 30%|███       | 145/478 [38:55<1:22:58, 14.95s/it]                                                   {'loss': '0.1982', 'grad_norm': '4.781', 'learning_rate': '1.76e-05', 'ppl': '1.219', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.33', 'tokens/total': 4640000, 'tokens/trainable': 89545, 'epoch': '0.6042'}
+ 30%|███       | 145/478 [38:55<1:22:58, 14.95s/it] 31%|███       | 146/478 [39:10<1:22:41, 14.94s/it]                                                   {'loss': '0.1758', 'grad_norm': '5.562', 'learning_rate': '1.756e-05', 'ppl': '1.192', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.85', 'tokens/total': 4672000, 'tokens/trainable': 90107, 'epoch': '0.6083'}
+ 31%|███       | 146/478 [39:10<1:22:41, 14.94s/it] 31%|███       | 147/478 [39:25<1:22:27, 14.95s/it]                                                   {'loss': '0.2261', 'grad_norm': '5.438', 'learning_rate': '1.751e-05', 'ppl': '1.254', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.66', 'tokens/total': 4704000, 'tokens/trainable': 90724, 'epoch': '0.6125'}
+ 31%|███       | 147/478 [39:25<1:22:27, 14.95s/it] 31%|███       | 148/478 [39:40<1:22:12, 14.95s/it]                                                   {'loss': '0.2046', 'grad_norm': '5.438', 'learning_rate': '1.746e-05', 'ppl': '1.227', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.3', 'tokens/total': 4736000, 'tokens/trainable': 91390, 'epoch': '0.6167'}
+ 31%|███       | 148/478 [39:40<1:22:12, 14.95s/it] 31%|███       | 149/478 [39:55<1:21:58, 14.95s/it]                                                   {'loss': '0.2109', 'grad_norm': '5.375', 'learning_rate': '1.741e-05', 'ppl': '1.235', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.79', 'tokens/total': 4768000, 'tokens/trainable': 91981, 'epoch': '0.6208'}
+ 31%|███       | 149/478 [39:55<1:21:58, 14.95s/it] 31%|███▏      | 150/478 [40:10<1:21:43, 14.95s/it]                                                   {'loss': '0.208', 'grad_norm': '5.812', 'learning_rate': '1.736e-05', 'ppl': '1.231', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.79', 'tokens/total': 4800000, 'tokens/trainable': 92542, 'epoch': '0.625'}
+ 31%|███▏      | 150/478 [40:10<1:21:43, 14.95s/it] 32%|███▏      | 151/478 [40:24<1:21:28, 14.95s/it]                                                   {'loss': '0.2383', 'grad_norm': '5.625', 'learning_rate': '1.731e-05', 'ppl': '1.269', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.89', 'tokens/total': 4832000, 'tokens/trainable': 93106, 'epoch': '0.6292'}
+ 32%|███▏      | 151/478 [40:24<1:21:28, 14.95s/it] 32%|███▏      | 152/478 [40:39<1:21:13, 14.95s/it]                                                   {'loss': '0.2788', 'grad_norm': '7.312', 'learning_rate': '1.726e-05', 'ppl': '1.322', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.64', 'tokens/total': 4864000, 'tokens/trainable': 93782, 'epoch': '0.6333'}
+ 32%|███▏      | 152/478 [40:39<1:21:13, 14.95s/it] 32%|███▏      | 153/478 [40:54<1:21:02, 14.96s/it]                                                   {'loss': '0.2266', 'grad_norm': '5.344', 'learning_rate': '1.721e-05', 'ppl': '1.254', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.77', 'tokens/total': 4896000, 'tokens/trainable': 94464, 'epoch': '0.6375'}
+ 32%|███▏      | 153/478 [40:54<1:21:02, 14.96s/it] 32%|███▏      | 154/478 [41:09<1:20:47, 14.96s/it]                                                   {'loss': '0.1919', 'grad_norm': '4.656', 'learning_rate': '1.716e-05', 'ppl': '1.212', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.29', 'tokens/total': 4928000, 'tokens/trainable': 95070, 'epoch': '0.6417'}
+ 32%|███▏      | 154/478 [41:09<1:20:47, 14.96s/it] 32%|███▏      | 155/478 [41:24<1:20:32, 14.96s/it]                                                   {'loss': '0.2554', 'grad_norm': '5.844', 'learning_rate': '1.711e-05', 'ppl': '1.291', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.57', 'tokens/total': 4960000, 'tokens/trainable': 95655, 'epoch': '0.6458'}
+ 32%|███▏      | 155/478 [41:24<1:20:32, 14.96s/it] 33%|███▎      | 156/478 [41:39<1:20:14, 14.95s/it]                                                   {'loss': '0.2178', 'grad_norm': '4.375', 'learning_rate': '1.706e-05', 'ppl': '1.243', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.26', 'tokens/total': 4992000, 'tokens/trainable': 96349, 'epoch': '0.65'}
+ 33%|███▎      | 156/478 [41:39<1:20:14, 14.95s/it] 33%|███▎      | 157/478 [41:54<1:19:59, 14.95s/it]                                                   {'loss': '0.2354', 'grad_norm': '4.625', 'learning_rate': '1.701e-05', 'ppl': '1.265', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.3', 'tokens/total': 5024000, 'tokens/trainable': 97015, 'epoch': '0.6542'}
+ 33%|███▎      | 157/478 [41:54<1:19:59, 14.95s/it] 33%|███▎      | 158/478 [42:09<1:19:44, 14.95s/it]                                                   {'loss': '0.2778', 'grad_norm': '5.625', 'learning_rate': '1.695e-05', 'ppl': '1.32', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.6', 'tokens/total': 5056000, 'tokens/trainable': 97660, 'epoch': '0.6583'}
+ 33%|███▎      | 158/478 [42:09<1:19:44, 14.95s/it] 33%|███▎      | 159/478 [42:24<1:19:29, 14.95s/it]                                                   {'loss': '0.1919', 'grad_norm': '6.25', 'learning_rate': '1.69e-05', 'ppl': '1.212', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.29', 'tokens/total': 5088000, 'tokens/trainable': 98206, 'epoch': '0.6625'}
+ 33%|███▎      | 159/478 [42:24<1:19:29, 14.95s/it] 33%|███▎      | 160/478 [42:39<1:19:15, 14.95s/it]                                                   {'loss': '0.209', 'grad_norm': '4.875', 'learning_rate': '1.685e-05', 'ppl': '1.232', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.23', 'tokens/total': 5120000, 'tokens/trainable': 98900, 'epoch': '0.6667'}
+ 33%|███▎      | 160/478 [42:39<1:19:15, 14.95s/it] 34%|███▎      | 161/478 [42:54<1:18:59, 14.95s/it]                                                   {'loss': '0.2231', 'grad_norm': '5.594', 'learning_rate': '1.68e-05', 'ppl': '1.25', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.42', 'tokens/total': 5152000, 'tokens/trainable': 99420, 'epoch': '0.6708'}
+ 34%|███▎      | 161/478 [42:54<1:18:59, 14.95s/it] 34%|███▍      | 162/478 [43:09<1:18:44, 14.95s/it]                                                   {'loss': '0.2354', 'grad_norm': '4.938', 'learning_rate': '1.674e-05', 'ppl': '1.265', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.09', 'tokens/total': 5184000, 'tokens/trainable': 100050, 'epoch': '0.675'}
+ 34%|███▍      | 162/478 [43:09<1:18:44, 14.95s/it] 34%|███▍      | 163/478 [43:24<1:18:29, 14.95s/it]                                                   {'loss': '0.2231', 'grad_norm': '4.656', 'learning_rate': '1.669e-05', 'ppl': '1.25', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.64', 'tokens/total': 5216000, 'tokens/trainable': 100696, 'epoch': '0.6792'}
+ 34%|███▍      | 163/478 [43:24<1:18:29, 14.95s/it] 34%|███▍      | 164/478 [43:39<1:18:14, 14.95s/it]                                                   {'loss': '0.2285', 'grad_norm': '6', 'learning_rate': '1.663e-05', 'ppl': '1.257', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '16.67', 'tokens/total': 5248000, 'tokens/trainable': 101194, 'epoch': '0.6833'}
+ 34%|███▍      | 164/478 [43:39<1:18:14, 14.95s/it] 35%|███▍      | 165/478 [43:54<1:18:00, 14.95s/it]                                                   {'loss': '0.2773', 'grad_norm': '5.438', 'learning_rate': '1.658e-05', 'ppl': '1.32', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.59', 'tokens/total': 5280000, 'tokens/trainable': 101869, 'epoch': '0.6875'}
+ 35%|███▍      | 165/478 [43:54<1:18:00, 14.95s/it] 35%|███▍      | 166/478 [44:09<1:17:45, 14.95s/it]                                                   {'loss': '0.1777', 'grad_norm': '5.156', 'learning_rate': '1.652e-05', 'ppl': '1.195', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.58', 'tokens/total': 5312000, 'tokens/trainable': 102424, 'epoch': '0.6917'}
+ 35%|███▍      | 166/478 [44:09<1:17:45, 14.95s/it] 35%|███▍      | 167/478 [44:24<1:17:28, 14.95s/it]                                                   {'loss': '0.2383', 'grad_norm': '6.719', 'learning_rate': '1.647e-05', 'ppl': '1.269', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.03', 'tokens/total': 5344000, 'tokens/trainable': 103051, 'epoch': '0.6958'}
+ 35%|███▍      | 167/478 [44:24<1:17:28, 14.95s/it] 35%|███▌      | 168/478 [44:39<1:17:13, 14.95s/it]                                                   {'loss': '0.2178', 'grad_norm': '5.844', 'learning_rate': '1.641e-05', 'ppl': '1.243', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.22', 'tokens/total': 5376000, 'tokens/trainable': 103625, 'epoch': '0.7'}
+ 35%|███▌      | 168/478 [44:39<1:17:13, 14.95s/it] 35%|███▌      | 169/478 [44:54<1:16:59, 14.95s/it]                                                   {'loss': '0.2124', 'grad_norm': '7.688', 'learning_rate': '1.636e-05', 'ppl': '1.237', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.02', 'tokens/total': 5408000, 'tokens/trainable': 104253, 'epoch': '0.7042'}
+ 35%|███▌      | 169/478 [44:54<1:16:59, 14.95s/it] 36%|███▌      | 170/478 [45:09<1:16:45, 14.95s/it]                                                   {'loss': '0.1899', 'grad_norm': '5', 'learning_rate': '1.63e-05', 'ppl': '1.209', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.95', 'tokens/total': 5440000, 'tokens/trainable': 104879, 'epoch': '0.7083'}
+ 36%|███▌      | 170/478 [45:09<1:16:45, 14.95s/it] 36%|███▌      | 171/478 [45:24<1:16:30, 14.95s/it]                                                   {'loss': '0.2271', 'grad_norm': '5.406', 'learning_rate': '1.624e-05', 'ppl': '1.255', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.79', 'tokens/total': 5472000, 'tokens/trainable': 105500, 'epoch': '0.7125'}
+ 36%|███▌      | 171/478 [45:24<1:16:30, 14.95s/it] 36%|███▌      | 172/478 [45:38<1:16:15, 14.95s/it]                                                   {'loss': '0.3076', 'grad_norm': '6', 'learning_rate': '1.619e-05', 'ppl': '1.36', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.07', 'tokens/total': 5504000, 'tokens/trainable': 106159, 'epoch': '0.7167'}
+ 36%|███▌      | 172/478 [45:38<1:16:15, 14.95s/it] 36%|███▌      | 173/478 [45:53<1:15:59, 14.95s/it]                                                   {'loss': '0.2109', 'grad_norm': '5.906', 'learning_rate': '1.613e-05', 'ppl': '1.235', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.8', 'tokens/total': 5536000, 'tokens/trainable': 106780, 'epoch': '0.7208'}
+ 36%|███▌      | 173/478 [45:53<1:15:59, 14.95s/it] 36%|███▋      | 174/478 [46:08<1:15:51, 14.97s/it]                                                   {'loss': '0.2563', 'grad_norm': '5.219', 'learning_rate': '1.607e-05', 'ppl': '1.292', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '24.75', 'tokens/total': 5568000, 'tokens/trainable': 107522, 'epoch': '0.725'}
+ 36%|███▋      | 174/478 [46:08<1:15:51, 14.97s/it] 37%|███▋      | 175/478 [46:23<1:15:34, 14.97s/it]                                                   {'loss': '0.2666', 'grad_norm': '5.25', 'learning_rate': '1.601e-05', 'ppl': '1.306', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21', 'tokens/total': 5600000, 'tokens/trainable': 108149, 'epoch': '0.7292'}
+ 37%|███▋      | 175/478 [46:23<1:15:34, 14.97s/it] 37%|███▋      | 176/478 [46:38<1:15:17, 14.96s/it]                                                   {'loss': '0.3096', 'grad_norm': '5.406', 'learning_rate': '1.595e-05', 'ppl': '1.363', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '24.3', 'tokens/total': 5632000, 'tokens/trainable': 108874, 'epoch': '0.7333'}
+ 37%|███▋      | 176/478 [46:38<1:15:17, 14.96s/it] 37%|███▋      | 177/478 [46:53<1:15:02, 14.96s/it]                                                   {'loss': '0.2432', 'grad_norm': '6.125', 'learning_rate': '1.59e-05', 'ppl': '1.275', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.38', 'tokens/total': 5664000, 'tokens/trainable': 109483, 'epoch': '0.7375'}
+ 37%|███▋      | 177/478 [46:53<1:15:02, 14.96s/it] 37%|███▋      | 178/478 [47:08<1:14:46, 14.96s/it]                                                   {'loss': '0.1968', 'grad_norm': '4.75', 'learning_rate': '1.584e-05', 'ppl': '1.217', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.87', 'tokens/total': 5696000, 'tokens/trainable': 110166, 'epoch': '0.7417'}
+ 37%|███▋      | 178/478 [47:08<1:14:46, 14.96s/it] 37%|███▋      | 179/478 [47:23<1:14:31, 14.96s/it]                                                   {'loss': '0.1934', 'grad_norm': '5.656', 'learning_rate': '1.578e-05', 'ppl': '1.213', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.58', 'tokens/total': 5728000, 'tokens/trainable': 110691, 'epoch': '0.7458'}
+ 37%|███▋      | 179/478 [47:23<1:14:31, 14.96s/it] 38%|███▊      | 180/478 [47:38<1:14:16, 14.95s/it]                                                   {'loss': '0.2036', 'grad_norm': '4.781', 'learning_rate': '1.572e-05', 'ppl': '1.226', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.46', 'tokens/total': 5760000, 'tokens/trainable': 111272, 'epoch': '0.75'}
+ 38%|███▊      | 180/478 [47:38<1:14:16, 14.95s/it] 38%|███▊      | 181/478 [47:53<1:14:01, 14.95s/it]                                                   {'loss': '0.2769', 'grad_norm': '6.219', 'learning_rate': '1.566e-05', 'ppl': '1.319', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.86', 'tokens/total': 5792000, 'tokens/trainable': 111895, 'epoch': '0.7542'}
+ 38%|███▊      | 181/478 [47:53<1:14:01, 14.95s/it] 38%|███▊      | 182/478 [48:08<1:13:46, 14.95s/it]                                                   {'loss': '0.1821', 'grad_norm': '4.594', 'learning_rate': '1.56e-05', 'ppl': '1.2', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.58', 'tokens/total': 5824000, 'tokens/trainable': 112450, 'epoch': '0.7583'}
+ 38%|███▊      | 182/478 [48:08<1:13:46, 14.95s/it] 38%|███▊      | 183/478 [48:23<1:13:31, 14.95s/it]                                                   {'loss': '0.2163', 'grad_norm': '5.031', 'learning_rate': '1.554e-05', 'ppl': '1.241', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.02', 'tokens/total': 5856000, 'tokens/trainable': 113018, 'epoch': '0.7625'}
+ 38%|███▊      | 183/478 [48:23<1:13:31, 14.95s/it] 38%|███▊      | 184/478 [48:38<1:13:15, 14.95s/it]                                                   {'loss': '0.1885', 'grad_norm': '4.406', 'learning_rate': '1.548e-05', 'ppl': '1.207', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.1', 'tokens/total': 5888000, 'tokens/trainable': 113648, 'epoch': '0.7667'}
+ 38%|███▊      | 184/478 [48:38<1:13:15, 14.95s/it] 39%|███▊      | 185/478 [48:53<1:13:01, 14.95s/it]                                                   {'loss': '0.1938', 'grad_norm': '5.812', 'learning_rate': '1.541e-05', 'ppl': '1.214', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.74', 'tokens/total': 5920000, 'tokens/trainable': 114178, 'epoch': '0.7708'}
+ 39%|███▊      | 185/478 [48:53<1:13:01, 14.95s/it] 39%|███▉      | 186/478 [49:08<1:12:46, 14.95s/it]                                                   {'loss': '0.2041', 'grad_norm': '6.5', 'learning_rate': '1.535e-05', 'ppl': '1.226', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.26', 'tokens/total': 5952000, 'tokens/trainable': 114843, 'epoch': '0.775'}
+ 39%|███▉      | 186/478 [49:08<1:12:46, 14.95s/it] 39%|███▉      | 187/478 [49:23<1:12:31, 14.95s/it]                                                   {'loss': '0.2329', 'grad_norm': '6.531', 'learning_rate': '1.529e-05', 'ppl': '1.262', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.22', 'tokens/total': 5984000, 'tokens/trainable': 115447, 'epoch': '0.7792'}
+ 39%|███▉      | 187/478 [49:23<1:12:31, 14.95s/it] 39%|███▉      | 188/478 [49:38<1:12:16, 14.95s/it]                                                   {'loss': '0.1675', 'grad_norm': '4.219', 'learning_rate': '1.523e-05', 'ppl': '1.182', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.43', 'tokens/total': 6016000, 'tokens/trainable': 116087, 'epoch': '0.7833'}
+ 39%|███▉      | 188/478 [49:38<1:12:16, 14.95s/it] 40%|███▉      | 189/478 [49:53<1:12:01, 14.95s/it]                                                   {'loss': '0.1831', 'grad_norm': '6.25', 'learning_rate': '1.517e-05', 'ppl': '1.201', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.99', 'tokens/total': 6048000, 'tokens/trainable': 116714, 'epoch': '0.7875'}
+ 40%|███▉      | 189/478 [49:53<1:12:01, 14.95s/it] 40%|███▉      | 190/478 [50:08<1:11:47, 14.96s/it]                                                   {'loss': '0.2251', 'grad_norm': '6', 'learning_rate': '1.51e-05', 'ppl': '1.252', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.75', 'tokens/total': 6080000, 'tokens/trainable': 117364, 'epoch': '0.7917'}
+ 40%|███▉      | 190/478 [50:08<1:11:47, 14.96s/it] 40%|███▉      | 191/478 [50:23<1:11:33, 14.96s/it]                                                   {'loss': '0.2207', 'grad_norm': '5.125', 'learning_rate': '1.504e-05', 'ppl': '1.247', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23', 'tokens/total': 6112000, 'tokens/trainable': 118052, 'epoch': '0.7958'}
+ 40%|███▉      | 191/478 [50:23<1:11:33, 14.96s/it] 40%|████      | 192/478 [50:38<1:11:18, 14.96s/it]                                                   {'loss': '0.2251', 'grad_norm': '5.281', 'learning_rate': '1.498e-05', 'ppl': '1.252', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.53', 'tokens/total': 6144000, 'tokens/trainable': 118695, 'epoch': '0.8'}
+ 40%|████      | 192/478 [50:38<1:11:18, 14.96s/it] 40%|████      | 193/478 [50:53<1:11:06, 14.97s/it]                                                   {'loss': '0.1963', 'grad_norm': '4.969', 'learning_rate': '1.492e-05', 'ppl': '1.217', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.1', 'tokens/total': 6176000, 'tokens/trainable': 119357, 'epoch': '0.8042'}
+ 40%|████      | 193/478 [50:53<1:11:06, 14.97s/it] 41%|████      | 194/478 [51:08<1:10:44, 14.95s/it]                                                   {'loss': '0.2544', 'grad_norm': '6.562', 'learning_rate': '1.485e-05', 'ppl': '1.29', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.29', 'tokens/total': 6208000, 'tokens/trainable': 120020, 'epoch': '0.8083'}
+ 41%|████      | 194/478 [51:08<1:10:44, 14.95s/it] 41%|████      | 195/478 [51:22<1:10:29, 14.95s/it]                                                   {'loss': '0.1855', 'grad_norm': '4.75', 'learning_rate': '1.479e-05', 'ppl': '1.204', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.68', 'tokens/total': 6240000, 'tokens/trainable': 120697, 'epoch': '0.8125'}
+ 41%|████      | 195/478 [51:22<1:10:29, 14.95s/it] 41%|████      | 196/478 [51:37<1:10:13, 14.94s/it]                                                   {'loss': '0.2305', 'grad_norm': '4.844', 'learning_rate': '1.472e-05', 'ppl': '1.259', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.5', 'tokens/total': 6272000, 'tokens/trainable': 121368, 'epoch': '0.8167'}
+ 41%|████      | 196/478 [51:37<1:10:13, 14.94s/it] 41%|████      | 197/478 [51:52<1:10:00, 14.95s/it]                                                   {'loss': '0.2656', 'grad_norm': '5.969', 'learning_rate': '1.466e-05', 'ppl': '1.304', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.08', 'tokens/total': 6304000, 'tokens/trainable': 121938, 'epoch': '0.8208'}
+ 41%|████      | 197/478 [51:52<1:10:00, 14.95s/it] 41%|████▏     | 198/478 [52:07<1:09:45, 14.95s/it]                                                   {'loss': '0.2202', 'grad_norm': '6.125', 'learning_rate': '1.46e-05', 'ppl': '1.246', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.7', 'tokens/total': 6336000, 'tokens/trainable': 122586, 'epoch': '0.825'}
+ 41%|████▏     | 198/478 [52:07<1:09:45, 14.95s/it] 42%|████▏     | 199/478 [52:22<1:09:30, 14.95s/it]                                                   {'loss': '0.2485', 'grad_norm': '7.156', 'learning_rate': '1.453e-05', 'ppl': '1.282', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.35', 'tokens/total': 6368000, 'tokens/trainable': 123164, 'epoch': '0.8292'}
+ 42%|████▏     | 199/478 [52:22<1:09:30, 14.95s/it] 42%|████▏     | 200/478 [52:37<1:09:16, 14.95s/it]                                                   {'loss': '0.2705', 'grad_norm': '6.5', 'learning_rate': '1.447e-05', 'ppl': '1.311', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.86', 'tokens/total': 6400000, 'tokens/trainable': 123787, 'epoch': '0.8333'}
+ 42%|████▏     | 200/478 [52:37<1:09:16, 14.95s/it] 42%|████▏     | 201/478 [52:52<1:09:01, 14.95s/it]                                                   {'loss': '0.2153', 'grad_norm': '5.312', 'learning_rate': '1.44e-05', 'ppl': '1.24', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.86', 'tokens/total': 6432000, 'tokens/trainable': 124410, 'epoch': '0.8375'}
+ 42%|████▏     | 201/478 [52:52<1:09:01, 14.95s/it] 42%|████▏     | 202/478 [53:07<1:08:46, 14.95s/it]                                                   {'loss': '0.2012', 'grad_norm': '5.531', 'learning_rate': '1.433e-05', 'ppl': '1.223', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.95', 'tokens/total': 6464000, 'tokens/trainable': 124976, 'epoch': '0.8417'}
+ 42%|████▏     | 202/478 [53:07<1:08:46, 14.95s/it] 42%|████▏     | 203/478 [53:22<1:08:32, 14.95s/it]                                                   {'loss': '0.2217', 'grad_norm': '5.812', 'learning_rate': '1.427e-05', 'ppl': '1.248', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.09', 'tokens/total': 6496000, 'tokens/trainable': 125606, 'epoch': '0.8458'}
+ 42%|████▏     | 203/478 [53:22<1:08:32, 14.95s/it] 43%|████▎     | 204/478 [53:37<1:08:17, 14.95s/it]                                                   {'loss': '0.2534', 'grad_norm': '5.469', 'learning_rate': '1.42e-05', 'ppl': '1.288', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.2', 'tokens/total': 6528000, 'tokens/trainable': 126299, 'epoch': '0.85'}
+ 43%|████▎     | 204/478 [53:37<1:08:17, 14.95s/it] 43%|████▎     | 205/478 [53:52<1:08:00, 14.95s/it]                                                   {'loss': '0.1675', 'grad_norm': '4.281', 'learning_rate': '1.414e-05', 'ppl': '1.182', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.46', 'tokens/total': 6560000, 'tokens/trainable': 126939, 'epoch': '0.8542'}
+ 43%|████▎     | 205/478 [53:52<1:08:00, 14.95s/it] 43%|████▎     | 206/478 [54:07<1:07:45, 14.95s/it]                                                   {'loss': '0.1685', 'grad_norm': '4.281', 'learning_rate': '1.407e-05', 'ppl': '1.183', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.8', 'tokens/total': 6592000, 'tokens/trainable': 127590, 'epoch': '0.8583'}
+ 43%|████▎     | 206/478 [54:07<1:07:45, 14.95s/it] 43%|████▎     | 207/478 [54:22<1:07:31, 14.95s/it]                                                   {'loss': '0.229', 'grad_norm': '5.531', 'learning_rate': '1.4e-05', 'ppl': '1.257', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.74', 'tokens/total': 6624000, 'tokens/trainable': 128299, 'epoch': '0.8625'}
+ 43%|████▎     | 207/478 [54:22<1:07:31, 14.95s/it] 44%|████▎     | 208/478 [54:37<1:07:16, 14.95s/it]                                                   {'loss': '0.1836', 'grad_norm': '7.562', 'learning_rate': '1.394e-05', 'ppl': '1.202', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.96', 'tokens/total': 6656000, 'tokens/trainable': 128895, 'epoch': '0.8667'}
+ 44%|████▎     | 208/478 [54:37<1:07:16, 14.95s/it] 44%|████▎     | 209/478 [54:52<1:06:56, 14.93s/it]                                                   {'loss': '0.2495', 'grad_norm': '5.031', 'learning_rate': '1.387e-05', 'ppl': '1.283', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.14', 'tokens/total': 6688000, 'tokens/trainable': 129583, 'epoch': '0.8708'}
+ 44%|████▎     | 209/478 [54:52<1:06:56, 14.93s/it] 44%|████▍     | 210/478 [55:07<1:06:43, 14.94s/it]                                                   {'loss': '0.1885', 'grad_norm': '4.531', 'learning_rate': '1.38e-05', 'ppl': '1.207', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.06', 'tokens/total': 6720000, 'tokens/trainable': 130212, 'epoch': '0.875'}
+ 44%|████▍     | 210/478 [55:07<1:06:43, 14.94s/it] 44%|████▍     | 211/478 [55:22<1:07:09, 15.09s/it]                                                   {'loss': '0.1689', 'grad_norm': '4.281', 'learning_rate': '1.373e-05', 'ppl': '1.184', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.63', 'tokens/total': 6752000, 'tokens/trainable': 130880, 'epoch': '0.8792'}
+ 44%|████▍     | 211/478 [55:22<1:07:09, 15.09s/it] 44%|████▍     | 212/478 [55:37<1:06:38, 15.03s/it]                                                   {'loss': '0.2295', 'grad_norm': '4.906', 'learning_rate': '1.367e-05', 'ppl': '1.258', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.62', 'tokens/total': 6784000, 'tokens/trainable': 131523, 'epoch': '0.8833'}
+ 44%|████▍     | 212/478 [55:37<1:06:38, 15.03s/it] 45%|████▍     | 213/478 [55:52<1:06:16, 15.00s/it]                                                   {'loss': '0.2222', 'grad_norm': '4.969', 'learning_rate': '1.36e-05', 'ppl': '1.249', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.21', 'tokens/total': 6816000, 'tokens/trainable': 132186, 'epoch': '0.8875'}
+ 45%|████▍     | 213/478 [55:52<1:06:16, 15.00s/it] 45%|████▍     | 214/478 [56:07<1:05:56, 14.99s/it]                                                   {'loss': '0.2119', 'grad_norm': '5', 'learning_rate': '1.353e-05', 'ppl': '1.236', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '16.88', 'tokens/total': 6848000, 'tokens/trainable': 132690, 'epoch': '0.8917'}
+ 45%|████▍     | 214/478 [56:07<1:05:56, 14.99s/it] 45%|████▍     | 215/478 [56:22<1:05:38, 14.98s/it]                                                   {'loss': '0.1875', 'grad_norm': '5.219', 'learning_rate': '1.346e-05', 'ppl': '1.206', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.29', 'tokens/total': 6880000, 'tokens/trainable': 133296, 'epoch': '0.8958'}
+ 45%|████▍     | 215/478 [56:22<1:05:38, 14.98s/it] 45%|████▌     | 216/478 [56:37<1:05:21, 14.97s/it]                                                   {'loss': '0.2109', 'grad_norm': '4.906', 'learning_rate': '1.339e-05', 'ppl': '1.235', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.47', 'tokens/total': 6912000, 'tokens/trainable': 133937, 'epoch': '0.9'}
+ 45%|████▌     | 216/478 [56:37<1:05:21, 14.97s/it] 45%|████▌     | 217/478 [56:52<1:05:04, 14.96s/it]                                                   {'loss': '0.2026', 'grad_norm': '5.781', 'learning_rate': '1.332e-05', 'ppl': '1.225', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.3', 'tokens/total': 6944000, 'tokens/trainable': 134573, 'epoch': '0.9042'}
+ 45%|████▌     | 217/478 [56:52<1:05:04, 14.96s/it] 46%|████▌     | 218/478 [57:07<1:04:47, 14.95s/it]                                                   {'loss': '0.1709', 'grad_norm': '3.938', 'learning_rate': '1.326e-05', 'ppl': '1.186', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.43', 'tokens/total': 6976000, 'tokens/trainable': 135242, 'epoch': '0.9083'}
+ 46%|████▌     | 218/478 [57:07<1:04:47, 14.95s/it] 46%|████▌     | 219/478 [57:22<1:04:32, 14.95s/it]                                                   {'loss': '0.1785', 'grad_norm': '4.594', 'learning_rate': '1.319e-05', 'ppl': '1.195', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.04', 'tokens/total': 7008000, 'tokens/trainable': 135900, 'epoch': '0.9125'}
+ 46%|████▌     | 219/478 [57:22<1:04:32, 14.95s/it] 46%|████▌     | 220/478 [57:37<1:04:17, 14.95s/it]                                                   {'loss': '0.188', 'grad_norm': '4.688', 'learning_rate': '1.312e-05', 'ppl': '1.207', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.67', 'tokens/total': 7040000, 'tokens/trainable': 136577, 'epoch': '0.9167'}
+ 46%|████▌     | 220/478 [57:37<1:04:17, 14.95s/it] 46%|████▌     | 221/478 [57:51<1:04:00, 14.94s/it]                                                   {'loss': '0.2163', 'grad_norm': '5.906', 'learning_rate': '1.305e-05', 'ppl': '1.241', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.88', 'tokens/total': 7072000, 'tokens/trainable': 137289, 'epoch': '0.9208'}
+ 46%|████▌     | 221/478 [57:51<1:04:00, 14.94s/it] 46%|████▋     | 222/478 [58:06<1:03:45, 14.95s/it]                                                   {'loss': '0.2183', 'grad_norm': '5.125', 'learning_rate': '1.298e-05', 'ppl': '1.244', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.47', 'tokens/total': 7104000, 'tokens/trainable': 137900, 'epoch': '0.925'}
+ 46%|████▋     | 222/478 [58:06<1:03:45, 14.95s/it] 47%|████▋     | 223/478 [58:21<1:03:31, 14.95s/it]                                                   {'loss': '0.2695', 'grad_norm': '6.469', 'learning_rate': '1.291e-05', 'ppl': '1.309', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.2', 'tokens/total': 7136000, 'tokens/trainable': 138533, 'epoch': '0.9292'}
+ 47%|████▋     | 223/478 [58:21<1:03:31, 14.95s/it] 47%|████▋     | 224/478 [58:36<1:03:16, 14.95s/it]                                                   {'loss': '0.1685', 'grad_norm': '5.625', 'learning_rate': '1.284e-05', 'ppl': '1.183', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.9', 'tokens/total': 7168000, 'tokens/trainable': 139157, 'epoch': '0.9333'}
+ 47%|████▋     | 224/478 [58:36<1:03:16, 14.95s/it] 47%|████▋     | 225/478 [58:51<1:03:02, 14.95s/it]                                                   {'loss': '0.1929', 'grad_norm': '4.969', 'learning_rate': '1.277e-05', 'ppl': '1.213', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.86', 'tokens/total': 7200000, 'tokens/trainable': 139810, 'epoch': '0.9375'}
+ 47%|████▋     | 225/478 [58:51<1:03:02, 14.95s/it] 47%|████▋     | 226/478 [59:06<1:02:47, 14.95s/it]                                                   {'loss': '0.2168', 'grad_norm': '5.719', 'learning_rate': '1.27e-05', 'ppl': '1.242', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.69', 'tokens/total': 7232000, 'tokens/trainable': 140398, 'epoch': '0.9417'}
+ 47%|████▋     | 226/478 [59:06<1:02:47, 14.95s/it] 47%|████▋     | 227/478 [59:21<1:02:32, 14.95s/it]                                                   {'loss': '0.2075', 'grad_norm': '5.344', 'learning_rate': '1.263e-05', 'ppl': '1.231', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.47', 'tokens/total': 7264000, 'tokens/trainable': 141069, 'epoch': '0.9458'}
+ 47%|████▋     | 227/478 [59:21<1:02:32, 14.95s/it] 48%|████▊     | 228/478 [59:36<1:02:17, 14.95s/it]                                                   {'loss': '0.2222', 'grad_norm': '4.969', 'learning_rate': '1.256e-05', 'ppl': '1.249', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.02', 'tokens/total': 7296000, 'tokens/trainable': 141697, 'epoch': '0.95'}
+ 48%|████▊     | 228/478 [59:36<1:02:17, 14.95s/it] 48%|████▊     | 229/478 [59:51<1:02:02, 14.95s/it]                                                   {'loss': '0.2402', 'grad_norm': '5.469', 'learning_rate': '1.249e-05', 'ppl': '1.272', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.82', 'tokens/total': 7328000, 'tokens/trainable': 142229, 'epoch': '0.9542'}
+ 48%|████▊     | 229/478 [59:51<1:02:02, 14.95s/it] 48%|████▊     | 230/478 [1:00:06<1:01:47, 14.95s/it]                                                     {'loss': '0.2148', 'grad_norm': '5.625', 'learning_rate': '1.242e-05', 'ppl': '1.24', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.49', 'tokens/total': 7360000, 'tokens/trainable': 142841, 'epoch': '0.9583'}
+ 48%|████▊     | 230/478 [1:00:06<1:01:47, 14.95s/it] 48%|████▊     | 231/478 [1:00:21<1:01:32, 14.95s/it]                                                     {'loss': '0.188', 'grad_norm': '5.344', 'learning_rate': '1.235e-05', 'ppl': '1.207', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.23', 'tokens/total': 7392000, 'tokens/trainable': 143475, 'epoch': '0.9625'}
+ 48%|████▊     | 231/478 [1:00:21<1:01:32, 14.95s/it] 49%|████▊     | 232/478 [1:00:36<1:01:17, 14.95s/it]                                                     {'loss': '0.2285', 'grad_norm': '5.969', 'learning_rate': '1.228e-05', 'ppl': '1.257', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.81', 'tokens/total': 7424000, 'tokens/trainable': 144007, 'epoch': '0.9667'}
+ 49%|████▊     | 232/478 [1:00:36<1:01:17, 14.95s/it] 49%|████▊     | 233/478 [1:00:51<1:01:05, 14.96s/it]                                                     {'loss': '0.1855', 'grad_norm': '5.375', 'learning_rate': '1.22e-05', 'ppl': '1.204', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.28', 'tokens/total': 7456000, 'tokens/trainable': 144584, 'epoch': '0.9708'}
+ 49%|████▊     | 233/478 [1:00:51<1:01:05, 14.96s/it] 49%|████▉     | 234/478 [1:01:06<1:00:48, 14.95s/it]                                                     {'loss': '0.1929', 'grad_norm': '4.344', 'learning_rate': '1.213e-05', 'ppl': '1.213', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.1', 'tokens/total': 7488000, 'tokens/trainable': 145273, 'epoch': '0.975'}
+ 49%|████▉     | 234/478 [1:01:06<1:00:48, 14.95s/it] 49%|████▉     | 235/478 [1:01:21<1:00:33, 14.95s/it]                                                     {'loss': '0.1831', 'grad_norm': '5.031', 'learning_rate': '1.206e-05', 'ppl': '1.201', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.3', 'tokens/total': 7520000, 'tokens/trainable': 145879, 'epoch': '0.9792'}
+ 49%|████▉     | 235/478 [1:01:21<1:00:33, 14.95s/it] 49%|████▉     | 236/478 [1:01:36<1:00:18, 14.95s/it]                                                     {'loss': '0.2319', 'grad_norm': '5.719', 'learning_rate': '1.199e-05', 'ppl': '1.261', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.52', 'tokens/total': 7552000, 'tokens/trainable': 146552, 'epoch': '0.9833'}
+ 49%|████▉     | 236/478 [1:01:36<1:00:18, 14.95s/it] 50%|████▉     | 237/478 [1:01:51<1:00:03, 14.95s/it]                                                     {'loss': '0.251', 'grad_norm': '5.812', 'learning_rate': '1.192e-05', 'ppl': '1.285', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.9', 'tokens/total': 7584000, 'tokens/trainable': 147206, 'epoch': '0.9875'}
+ 50%|████▉     | 237/478 [1:01:51<1:00:03, 14.95s/it] 50%|████▉     | 238/478 [1:02:06<59:48, 14.95s/it]                                                     {'loss': '0.2085', 'grad_norm': '5.406', 'learning_rate': '1.185e-05', 'ppl': '1.232', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.62', 'tokens/total': 7616000, 'tokens/trainable': 147792, 'epoch': '0.9917'}
+ 50%|████▉     | 238/478 [1:02:06<59:48, 14.95s/it] 50%|█████     | 239/478 [1:02:21<59:33, 14.95s/it]                                                   {'loss': '0.2236', 'grad_norm': '6.562', 'learning_rate': '1.178e-05', 'ppl': '1.251', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.16', 'tokens/total': 7648000, 'tokens/trainable': 148424, 'epoch': '0.9958'}
+ 50%|█████     | 239/478 [1:02:21<59:33, 14.95s/it][2026-04-17 03:11:51,731] [INFO] [axolotl.core.trainers.base] Saving model checkpoint to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/checkpoint-239
+
+Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]
+Writing model shards: 100%|██████████| 1/1 [00:22<00:00, 22.99s/it]Writing model shards: 100%|██████████| 1/1 [00:22<00:00, 22.99s/it]
+ 50%|█████     | 240/478 [1:04:39<3:26:42, 52.11s/it]                                                     {'loss': '0.229', 'grad_norm': '5.062', 'learning_rate': '1.17e-05', 'ppl': '1.257', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.67', 'tokens/total': 7680000, 'tokens/trainable': 149041, 'epoch': '1'}
+ 50%|█████     | 240/478 [1:04:39<3:26:42, 52.11s/it][2026-04-17 03:14:04,074] [INFO] [axolotl.core.trainers.base] Running evaluation step...
+[2026-04-17 03:14:12,814] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [54, 54]
+
+  0%|          | 0/27 [00:00<?, ?it/s]
+  7%|▋         | 2/27 [00:02<00:35,  1.41s/it]
+ 11%|█         | 3/27 [00:05<00:47,  1.97s/it]
+ 15%|█▍        | 4/27 [00:08<00:51,  2.26s/it]
+ 19%|█▊        | 5/27 [00:11<00:53,  2.43s/it]
+ 22%|██▏       | 6/27 [00:13<00:53,  2.54s/it]
+ 26%|██▌       | 7/27 [00:16<00:52,  2.61s/it]
+ 30%|██▉       | 8/27 [00:19<00:50,  2.66s/it]
+ 33%|███▎      | 9/27 [00:22<00:48,  2.68s/it]
+ 37%|███▋      | 10/27 [00:24<00:45,  2.70s/it]
+ 41%|████      | 11/27 [00:27<00:43,  2.72s/it]
+ 44%|████▍     | 12/27 [00:30<00:39,  2.64s/it]
+ 48%|████▊     | 13/27 [00:33<00:38,  2.76s/it]
+ 52%|█████▏    | 14/27 [00:35<00:35,  2.76s/it]
+ 56%|█████▌    | 15/27 [00:38<00:33,  2.76s/it]
+ 59%|█████▉    | 16/27 [00:41<00:30,  2.76s/it]
+ 63%|██████▎   | 17/27 [00:44<00:27,  2.75s/it]
+ 67%|██████▋   | 18/27 [00:46<00:24,  2.75s/it]
+ 70%|███████   | 19/27 [00:49<00:22,  2.75s/it]
+ 74%|███████▍  | 20/27 [00:52<00:19,  2.75s/it]
+ 78%|███████▊  | 21/27 [00:54<00:16,  2.67s/it]
+ 81%|████████▏ | 22/27 [00:57<00:13,  2.78s/it]
+ 85%|████████▌ | 23/27 [01:00<00:11,  2.77s/it]
+ 89%|████████▉ | 24/27 [01:03<00:08,  2.76s/it]
+ 93%|█████████▎| 25/27 [01:06<00:05,  2.76s/it]
+ 96%|█████████▋| 26/27 [01:08<00:02,  2.76s/it]
+100%|██████████| 27/27 [01:11<00:00,  2.84s/it]                                                     
+                                               {'eval_loss': '0.2166', 'eval_runtime': '75.42', 'eval_samples_per_second': '2.771', 'eval_steps_per_second': '1.392', 'eval_ppl': '1.242', 'memory/max_active (GiB)': '34.91', 'memory/max_allocated (GiB)': '34.91', 'memory/device_reserved (GiB)': '57.54', 'epoch': '1', 'tokens/train_per_sec_per_gpu': '0'}
+ 50%|█████     | 240/478 [1:06:04<3:26:42, 52.11s/it]
+100%|██████████| 27/27 [01:13<00:00,  2.84s/it]
+                                                50%|█████     | 241/478 [1:06:20<4:22:47, 66.53s/it]                                                     {'loss': '0.2046', 'grad_norm': '5.375', 'learning_rate': '1.163e-05', 'ppl': '1.227', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.95', 'tokens/total': 7712000, 'tokens/trainable': 149728, 'epoch': '1.004'}
+ 50%|█████     | 241/478 [1:06:20<4:22:47, 66.53s/it] 51%|█████     | 242/478 [1:06:35<3:20:49, 51.06s/it]                                                     {'loss': '0.2031', 'grad_norm': '5.406', 'learning_rate': '1.156e-05', 'ppl': '1.225', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.85', 'tokens/total': 7744000, 'tokens/trainable': 150351, 'epoch': '1.008'}
+ 51%|█████     | 242/478 [1:06:35<3:20:49, 51.06s/it] 51%|█████     | 243/478 [1:06:50<2:37:33, 40.23s/it]                                                     {'loss': '0.1758', 'grad_norm': '5', 'learning_rate': '1.149e-05', 'ppl': '1.192', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.47', 'tokens/total': 7776000, 'tokens/trainable': 150903, 'epoch': '1.012'}
+ 51%|█████     | 243/478 [1:06:50<2:37:33, 40.23s/it] 51%|█████     | 244/478 [1:07:04<2:07:19, 32.65s/it]                                                     {'loss': '0.1514', 'grad_norm': '4.969', 'learning_rate': '1.142e-05', 'ppl': '1.163', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.07', 'tokens/total': 7808000, 'tokens/trainable': 151443, 'epoch': '1.017'}
+ 51%|█████     | 244/478 [1:07:04<2:07:19, 32.65s/it] 51%|█████▏    | 245/478 [1:07:19<1:46:10, 27.34s/it]                                                     {'loss': '0.186', 'grad_norm': '4.875', 'learning_rate': '1.134e-05', 'ppl': '1.204', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.38', 'tokens/total': 7840000, 'tokens/trainable': 151992, 'epoch': '1.021'}
+ 51%|█████▏    | 245/478 [1:07:19<1:46:10, 27.34s/it] 51%|█████▏    | 246/478 [1:07:34<1:31:21, 23.63s/it]                                                     {'loss': '0.1411', 'grad_norm': '3.984', 'learning_rate': '1.127e-05', 'ppl': '1.152', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.85', 'tokens/total': 7872000, 'tokens/trainable': 152615, 'epoch': '1.025'}
+ 51%|█████▏    | 246/478 [1:07:34<1:31:21, 23.63s/it] 52%|█████▏    | 247/478 [1:07:49<1:20:56, 21.02s/it]                                                     {'loss': '0.2158', 'grad_norm': '6.188', 'learning_rate': '1.12e-05', 'ppl': '1.241', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.35', 'tokens/total': 7904000, 'tokens/trainable': 153193, 'epoch': '1.029'}
+ 52%|█████▏    | 247/478 [1:07:49<1:20:56, 21.02s/it] 52%|█████▏    | 248/478 [1:08:04<1:13:37, 19.20s/it]                                                     {'loss': '0.1768', 'grad_norm': '4.625', 'learning_rate': '1.113e-05', 'ppl': '1.193', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.75', 'tokens/total': 7936000, 'tokens/trainable': 153783, 'epoch': '1.033'}
+ 52%|█████▏    | 248/478 [1:08:04<1:13:37, 19.20s/it] 52%|█████▏    | 249/478 [1:08:19<1:08:26, 17.93s/it]                                                     {'loss': '0.1704', 'grad_norm': '4.625', 'learning_rate': '1.105e-05', 'ppl': '1.186', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.35', 'tokens/total': 7968000, 'tokens/trainable': 154451, 'epoch': '1.038'}
+ 52%|█████▏    | 249/478 [1:08:19<1:08:26, 17.93s/it] 52%|█████▏    | 250/478 [1:08:34<1:04:44, 17.04s/it]                                                     {'loss': '0.186', 'grad_norm': '4.688', 'learning_rate': '1.098e-05', 'ppl': '1.204', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.79', 'tokens/total': 8000000, 'tokens/trainable': 155072, 'epoch': '1.042'}
+ 52%|█████▏    | 250/478 [1:08:34<1:04:44, 17.04s/it] 53%|█████▎    | 251/478 [1:08:49<1:02:06, 16.42s/it]                                                     {'loss': '0.1641', 'grad_norm': '4.125', 'learning_rate': '1.091e-05', 'ppl': '1.178', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.54', 'tokens/total': 8032000, 'tokens/trainable': 155656, 'epoch': '1.046'}
+ 53%|█████▎    | 251/478 [1:08:49<1:02:06, 16.42s/it] 53%|█████▎    | 252/478 [1:09:04<1:00:10, 15.98s/it]                                                     {'loss': '0.1631', 'grad_norm': '4.5', 'learning_rate': '1.084e-05', 'ppl': '1.177', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.02', 'tokens/total': 8064000, 'tokens/trainable': 156254, 'epoch': '1.05'}
+ 53%|█████▎    | 252/478 [1:09:04<1:00:10, 15.98s/it] 53%|█████▎    | 253/478 [1:09:19<58:45, 15.67s/it]                                                     {'loss': '0.2163', 'grad_norm': '4.969', 'learning_rate': '1.076e-05', 'ppl': '1.241', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.45', 'tokens/total': 8096000, 'tokens/trainable': 156805, 'epoch': '1.054'}
+ 53%|█████▎    | 253/478 [1:09:19<58:45, 15.67s/it] 53%|█████▎    | 254/478 [1:09:34<57:42, 15.46s/it]                                                   {'loss': '0.1792', 'grad_norm': '4.625', 'learning_rate': '1.069e-05', 'ppl': '1.196', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.95', 'tokens/total': 8128000, 'tokens/trainable': 157371, 'epoch': '1.058'}
+ 53%|█████▎    | 254/478 [1:09:34<57:42, 15.46s/it] 53%|█████▎    | 255/478 [1:09:49<56:53, 15.31s/it]                                                   {'loss': '0.1528', 'grad_norm': '5.031', 'learning_rate': '1.062e-05', 'ppl': '1.165', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.46', 'tokens/total': 8160000, 'tokens/trainable': 158042, 'epoch': '1.062'}
+ 53%|█████▎    | 255/478 [1:09:49<56:53, 15.31s/it] 54%|█████▎    | 256/478 [1:10:04<56:14, 15.20s/it]                                                   {'loss': '0.1448', 'grad_norm': '4.219', 'learning_rate': '1.055e-05', 'ppl': '1.156', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.79', 'tokens/total': 8192000, 'tokens/trainable': 158693, 'epoch': '1.067'}
+ 54%|█████▎    | 256/478 [1:10:04<56:14, 15.20s/it] 54%|█████▍    | 257/478 [1:10:19<55:41, 15.12s/it]                                                   {'loss': '0.09375', 'grad_norm': '3.844', 'learning_rate': '1.047e-05', 'ppl': '1.098', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.17', 'tokens/total': 8224000, 'tokens/trainable': 159384, 'epoch': '1.071'}
+ 54%|█████▍    | 257/478 [1:10:19<55:41, 15.12s/it] 54%|█████▍    | 258/478 [1:10:34<55:16, 15.07s/it]                                                   {'loss': '0.1543', 'grad_norm': '4.625', 'learning_rate': '1.04e-05', 'ppl': '1.167', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.72', 'tokens/total': 8256000, 'tokens/trainable': 160063, 'epoch': '1.075'}
+ 54%|█████▍    | 258/478 [1:10:34<55:16, 15.07s/it] 54%|█████▍    | 259/478 [1:10:49<54:54, 15.05s/it]                                                   {'loss': '0.187', 'grad_norm': '5.406', 'learning_rate': '1.033e-05', 'ppl': '1.206', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.78', 'tokens/total': 8288000, 'tokens/trainable': 160625, 'epoch': '1.079'}
+ 54%|█████▍    | 259/478 [1:10:49<54:54, 15.05s/it] 54%|█████▍    | 260/478 [1:11:04<54:33, 15.02s/it]                                                   {'loss': '0.1433', 'grad_norm': '4.312', 'learning_rate': '1.026e-05', 'ppl': '1.154', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.33', 'tokens/total': 8320000, 'tokens/trainable': 161262, 'epoch': '1.083'}
+ 54%|█████▍    | 260/478 [1:11:04<54:33, 15.02s/it] 55%|█████▍    | 261/478 [1:11:19<54:49, 15.16s/it]                                                   {'loss': '0.1436', 'grad_norm': '4.25', 'learning_rate': '1.018e-05', 'ppl': '1.154', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.3', 'tokens/total': 8352000, 'tokens/trainable': 161921, 'epoch': '1.087'}
+ 55%|█████▍    | 261/478 [1:11:19<54:49, 15.16s/it] 55%|█████▍    | 262/478 [1:11:34<54:21, 15.10s/it]                                                   {'loss': '0.168', 'grad_norm': '4.5', 'learning_rate': '1.011e-05', 'ppl': '1.183', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.92', 'tokens/total': 8384000, 'tokens/trainable': 162546, 'epoch': '1.092'}
+ 55%|█████▍    | 262/478 [1:11:34<54:21, 15.10s/it] 55%|█████▌    | 263/478 [1:11:49<53:55, 15.05s/it]                                                   {'loss': '0.1802', 'grad_norm': '5.188', 'learning_rate': '1.004e-05', 'ppl': '1.197', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.87', 'tokens/total': 8416000, 'tokens/trainable': 163228, 'epoch': '1.096'}
+ 55%|█████▌    | 263/478 [1:11:49<53:55, 15.05s/it] 55%|█████▌    | 264/478 [1:12:04<53:34, 15.02s/it]                                                   {'loss': '0.1252', 'grad_norm': '4.281', 'learning_rate': '9.964e-06', 'ppl': '1.133', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '16.5', 'tokens/total': 8448000, 'tokens/trainable': 163721, 'epoch': '1.1'}
+ 55%|█████▌    | 264/478 [1:12:04<53:34, 15.02s/it] 55%|█████▌    | 265/478 [1:12:19<53:15, 15.00s/it]                                                   {'loss': '0.1909', 'grad_norm': '4.844', 'learning_rate': '9.891e-06', 'ppl': '1.21', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.59', 'tokens/total': 8480000, 'tokens/trainable': 164366, 'epoch': '1.104'}
+ 55%|█████▌    | 265/478 [1:12:19<53:15, 15.00s/it] 56%|█████▌    | 266/478 [1:12:34<52:57, 14.99s/it]                                                   {'loss': '0.1145', 'grad_norm': '4.062', 'learning_rate': '9.818e-06', 'ppl': '1.121', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.35', 'tokens/total': 8512000, 'tokens/trainable': 164944, 'epoch': '1.108'}
+ 56%|█████▌    | 266/478 [1:12:34<52:57, 14.99s/it] 56%|█████▌    | 267/478 [1:12:49<52:40, 14.98s/it]                                                   {'loss': '0.1013', 'grad_norm': '3.5', 'learning_rate': '9.745e-06', 'ppl': '1.107', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.17', 'tokens/total': 8544000, 'tokens/trainable': 165576, 'epoch': '1.113'}
+ 56%|█████▌    | 267/478 [1:12:49<52:40, 14.98s/it] 56%|█████▌    | 268/478 [1:13:04<52:22, 14.96s/it]                                                   {'loss': '0.1455', 'grad_norm': '4.281', 'learning_rate': '9.672e-06', 'ppl': '1.157', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.01', 'tokens/total': 8576000, 'tokens/trainable': 166262, 'epoch': '1.117'}
+ 56%|█████▌    | 268/478 [1:13:04<52:22, 14.96s/it] 56%|█████▋    | 269/478 [1:13:19<52:06, 14.96s/it]                                                   {'loss': '0.1101', 'grad_norm': '3.75', 'learning_rate': '9.599e-06', 'ppl': '1.116', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.57', 'tokens/total': 8608000, 'tokens/trainable': 166936, 'epoch': '1.121'}
+ 56%|█████▋    | 269/478 [1:13:19<52:06, 14.96s/it] 56%|█████▋    | 270/478 [1:13:34<51:51, 14.96s/it]                                                   {'loss': '0.1221', 'grad_norm': '4.031', 'learning_rate': '9.526e-06', 'ppl': '1.13', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '24.2', 'tokens/total': 8640000, 'tokens/trainable': 167659, 'epoch': '1.125'}
+ 56%|█████▋    | 270/478 [1:13:34<51:51, 14.96s/it] 57%|█████▋    | 271/478 [1:13:49<51:36, 14.96s/it]                                                   {'loss': '0.09717', 'grad_norm': '4', 'learning_rate': '9.454e-06', 'ppl': '1.102', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.16', 'tokens/total': 8672000, 'tokens/trainable': 168291, 'epoch': '1.129'}
+ 57%|█████▋    | 271/478 [1:13:49<51:36, 14.96s/it] 57%|█████▋    | 272/478 [1:14:04<51:21, 14.96s/it]                                                   {'loss': '0.123', 'grad_norm': '3.75', 'learning_rate': '9.381e-06', 'ppl': '1.131', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.56', 'tokens/total': 8704000, 'tokens/trainable': 168965, 'epoch': '1.133'}
+ 57%|█████▋    | 272/478 [1:14:04<51:21, 14.96s/it] 57%|█████▋    | 273/478 [1:14:19<51:06, 14.96s/it]                                                   {'loss': '0.1218', 'grad_norm': '4.906', 'learning_rate': '9.308e-06', 'ppl': '1.13', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.79', 'tokens/total': 8736000, 'tokens/trainable': 169586, 'epoch': '1.137'}
+ 57%|█████▋    | 273/478 [1:14:19<51:06, 14.96s/it] 57%|█████▋    | 274/478 [1:14:34<50:50, 14.96s/it]                                                   {'loss': '0.1606', 'grad_norm': '5.156', 'learning_rate': '9.235e-06', 'ppl': '1.174', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.16', 'tokens/total': 8768000, 'tokens/trainable': 170188, 'epoch': '1.142'}
+ 57%|█████▋    | 274/478 [1:14:34<50:50, 14.96s/it] 58%|█████▊    | 275/478 [1:14:49<50:35, 14.95s/it]                                                   {'loss': '0.1206', 'grad_norm': '4.062', 'learning_rate': '9.163e-06', 'ppl': '1.128', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.04', 'tokens/total': 8800000, 'tokens/trainable': 170876, 'epoch': '1.146'}
+ 58%|█████▊    | 275/478 [1:14:49<50:35, 14.95s/it] 58%|█████▊    | 276/478 [1:15:04<50:20, 14.95s/it]                                                   {'loss': '0.123', 'grad_norm': '4.781', 'learning_rate': '9.09e-06', 'ppl': '1.131', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.44', 'tokens/total': 8832000, 'tokens/trainable': 171546, 'epoch': '1.15'}
+ 58%|█████▊    | 276/478 [1:15:04<50:20, 14.95s/it] 58%|█████▊    | 277/478 [1:15:18<50:05, 14.95s/it]                                                   {'loss': '0.08105', 'grad_norm': '3.578', 'learning_rate': '9.018e-06', 'ppl': '1.084', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.29', 'tokens/total': 8864000, 'tokens/trainable': 172122, 'epoch': '1.154'}
+ 58%|█████▊    | 277/478 [1:15:19<50:05, 14.95s/it] 58%|█████▊    | 278/478 [1:15:33<49:50, 14.95s/it]                                                   {'loss': '0.1572', 'grad_norm': '4.594', 'learning_rate': '8.945e-06', 'ppl': '1.17', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.03', 'tokens/total': 8896000, 'tokens/trainable': 172810, 'epoch': '1.158'}
+ 58%|█████▊    | 278/478 [1:15:33<49:50, 14.95s/it] 58%|█████▊    | 279/478 [1:15:48<49:34, 14.95s/it]                                                   {'loss': '0.09741', 'grad_norm': '3.922', 'learning_rate': '8.873e-06', 'ppl': '1.102', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '24.94', 'tokens/total': 8928000, 'tokens/trainable': 173554, 'epoch': '1.163'}
+ 58%|█████▊    | 279/478 [1:15:48<49:34, 14.95s/it] 59%|█████▊    | 280/478 [1:16:03<49:20, 14.95s/it]                                                   {'loss': '0.09204', 'grad_norm': '3.609', 'learning_rate': '8.8e-06', 'ppl': '1.096', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.34', 'tokens/total': 8960000, 'tokens/trainable': 174162, 'epoch': '1.167'}
+ 59%|█████▊    | 280/478 [1:16:03<49:20, 14.95s/it] 59%|█████▉    | 281/478 [1:16:18<49:05, 14.95s/it]                                                   {'loss': '0.09448', 'grad_norm': '3.656', 'learning_rate': '8.728e-06', 'ppl': '1.099', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.42', 'tokens/total': 8992000, 'tokens/trainable': 174802, 'epoch': '1.171'}
+ 59%|█████▉    | 281/478 [1:16:18<49:05, 14.95s/it] 59%|█████▉    | 282/478 [1:16:33<48:50, 14.95s/it]                                                   {'loss': '0.09424', 'grad_norm': '3.75', 'learning_rate': '8.656e-06', 'ppl': '1.099', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '24.35', 'tokens/total': 9024000, 'tokens/trainable': 175529, 'epoch': '1.175'}
+ 59%|█████▉    | 282/478 [1:16:33<48:50, 14.95s/it] 59%|█████▉    | 283/478 [1:16:48<48:34, 14.95s/it]                                                   {'loss': '0.1201', 'grad_norm': '4.125', 'learning_rate': '8.583e-06', 'ppl': '1.128', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.32', 'tokens/total': 9056000, 'tokens/trainable': 176195, 'epoch': '1.179'}
+ 59%|█████▉    | 283/478 [1:16:48<48:34, 14.95s/it] 59%|█████▉    | 284/478 [1:17:03<48:20, 14.95s/it]                                                   {'loss': '0.1104', 'grad_norm': '3.953', 'learning_rate': '8.511e-06', 'ppl': '1.117', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.7', 'tokens/total': 9088000, 'tokens/trainable': 176843, 'epoch': '1.183'}
+ 59%|█████▉    | 284/478 [1:17:03<48:20, 14.95s/it] 60%|█████▉    | 285/478 [1:17:18<48:05, 14.95s/it]                                                   {'loss': '0.1064', 'grad_norm': '4.281', 'learning_rate': '8.439e-06', 'ppl': '1.112', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.53', 'tokens/total': 9120000, 'tokens/trainable': 177516, 'epoch': '1.188'}
+ 60%|█████▉    | 285/478 [1:17:18<48:05, 14.95s/it] 60%|█████▉    | 286/478 [1:17:33<47:51, 14.95s/it]                                                   {'loss': '0.1108', 'grad_norm': '4.344', 'learning_rate': '8.367e-06', 'ppl': '1.117', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.88', 'tokens/total': 9152000, 'tokens/trainable': 178140, 'epoch': '1.192'}
+ 60%|█████▉    | 286/478 [1:17:33<47:51, 14.95s/it] 60%|██████    | 287/478 [1:17:48<47:36, 14.95s/it]                                                   {'loss': '0.09253', 'grad_norm': '3.922', 'learning_rate': '8.295e-06', 'ppl': '1.097', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.5', 'tokens/total': 9184000, 'tokens/trainable': 178812, 'epoch': '1.196'}
+ 60%|██████    | 287/478 [1:17:48<47:36, 14.95s/it] 60%|██████    | 288/478 [1:18:03<47:21, 14.96s/it]                                                   {'loss': '0.1304', 'grad_norm': '5.625', 'learning_rate': '8.224e-06', 'ppl': '1.139', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.64', 'tokens/total': 9216000, 'tokens/trainable': 179369, 'epoch': '1.2'}
+ 60%|██████    | 288/478 [1:18:03<47:21, 14.96s/it] 60%|██████    | 289/478 [1:18:18<47:06, 14.96s/it]                                                   {'loss': '0.1108', 'grad_norm': '4.625', 'learning_rate': '8.152e-06', 'ppl': '1.117', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.58', 'tokens/total': 9248000, 'tokens/trainable': 179984, 'epoch': '1.204'}
+ 60%|██████    | 289/478 [1:18:18<47:06, 14.96s/it] 61%|██████    | 290/478 [1:18:33<46:51, 14.96s/it]                                                   {'loss': '0.09741', 'grad_norm': '5.594', 'learning_rate': '8.08e-06', 'ppl': '1.102', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.95', 'tokens/total': 9280000, 'tokens/trainable': 180550, 'epoch': '1.208'}
+ 61%|██████    | 290/478 [1:18:33<46:51, 14.96s/it] 61%|██████    | 291/478 [1:18:48<46:36, 14.95s/it]                                                   {'loss': '0.08936', 'grad_norm': '4.219', 'learning_rate': '8.009e-06', 'ppl': '1.093', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.25', 'tokens/total': 9312000, 'tokens/trainable': 181095, 'epoch': '1.212'}
+ 61%|██████    | 291/478 [1:18:48<46:36, 14.95s/it] 61%|██████    | 292/478 [1:19:03<46:21, 14.95s/it]                                                   {'loss': '0.134', 'grad_norm': '5.094', 'learning_rate': '7.938e-06', 'ppl': '1.143', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.67', 'tokens/total': 9344000, 'tokens/trainable': 181680, 'epoch': '1.217'}
+ 61%|██████    | 292/478 [1:19:03<46:21, 14.95s/it] 61%|██████▏   | 293/478 [1:19:18<46:06, 14.95s/it]                                                   {'loss': '0.08301', 'grad_norm': '4.469', 'learning_rate': '7.866e-06', 'ppl': '1.087', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.52', 'tokens/total': 9376000, 'tokens/trainable': 182323, 'epoch': '1.221'}
+ 61%|██████▏   | 293/478 [1:19:18<46:06, 14.95s/it] 62%|██████▏   | 294/478 [1:19:33<45:50, 14.95s/it]                                                   {'loss': '0.1326', 'grad_norm': '4.75', 'learning_rate': '7.795e-06', 'ppl': '1.142', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.77', 'tokens/total': 9408000, 'tokens/trainable': 182972, 'epoch': '1.225'}
+ 62%|██████▏   | 294/478 [1:19:33<45:50, 14.95s/it] 62%|██████▏   | 295/478 [1:19:48<45:34, 14.94s/it]                                                   {'loss': '0.09399', 'grad_norm': '5.062', 'learning_rate': '7.724e-06', 'ppl': '1.099', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.34', 'tokens/total': 9440000, 'tokens/trainable': 183668, 'epoch': '1.229'}
+ 62%|██████▏   | 295/478 [1:19:48<45:34, 14.94s/it] 62%|██████▏   | 296/478 [1:20:03<45:19, 14.94s/it]                                                   {'loss': '0.05933', 'grad_norm': '4.625', 'learning_rate': '7.653e-06', 'ppl': '1.061', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.68', 'tokens/total': 9472000, 'tokens/trainable': 184226, 'epoch': '1.233'}
+ 62%|██████▏   | 296/478 [1:20:03<45:19, 14.94s/it] 62%|██████▏   | 297/478 [1:20:17<45:04, 14.94s/it]                                                   {'loss': '0.1252', 'grad_norm': '5.25', 'learning_rate': '7.582e-06', 'ppl': '1.133', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23', 'tokens/total': 9504000, 'tokens/trainable': 184912, 'epoch': '1.238'}
+ 62%|██████▏   | 297/478 [1:20:17<45:04, 14.94s/it] 62%|██████▏   | 298/478 [1:20:32<44:49, 14.94s/it]                                                   {'loss': '0.1157', 'grad_norm': '5.531', 'learning_rate': '7.512e-06', 'ppl': '1.123', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.55', 'tokens/total': 9536000, 'tokens/trainable': 185466, 'epoch': '1.242'}
+ 62%|██████▏   | 298/478 [1:20:32<44:49, 14.94s/it] 63%|██████▎   | 299/478 [1:20:47<44:37, 14.96s/it]                                                   {'loss': '0.07788', 'grad_norm': '4.562', 'learning_rate': '7.441e-06', 'ppl': '1.081', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.14', 'tokens/total': 9568000, 'tokens/trainable': 186069, 'epoch': '1.246'}
+ 63%|██████▎   | 299/478 [1:20:47<44:37, 14.96s/it] 63%|██████▎   | 300/478 [1:21:02<44:22, 14.96s/it]                                                   {'loss': '0.1213', 'grad_norm': '5.469', 'learning_rate': '7.371e-06', 'ppl': '1.129', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.9', 'tokens/total': 9600000, 'tokens/trainable': 186753, 'epoch': '1.25'}
+ 63%|██████▎   | 300/478 [1:21:02<44:22, 14.96s/it] 63%|██████▎   | 301/478 [1:21:17<44:03, 14.94s/it]                                                   {'loss': '0.08154', 'grad_norm': '3.797', 'learning_rate': '7.301e-06', 'ppl': '1.085', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '24.01', 'tokens/total': 9632000, 'tokens/trainable': 187467, 'epoch': '1.254'}
+ 63%|██████▎   | 301/478 [1:21:17<44:03, 14.94s/it] 63%|██████▎   | 302/478 [1:21:32<43:50, 14.94s/it]                                                   {'loss': '0.0835', 'grad_norm': '4.406', 'learning_rate': '7.23e-06', 'ppl': '1.087', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.69', 'tokens/total': 9664000, 'tokens/trainable': 188085, 'epoch': '1.258'}
+ 63%|██████▎   | 302/478 [1:21:32<43:50, 14.94s/it] 63%|██████▎   | 303/478 [1:21:47<43:36, 14.95s/it]                                                   {'loss': '0.1338', 'grad_norm': '5.219', 'learning_rate': '7.16e-06', 'ppl': '1.143', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.31', 'tokens/total': 9696000, 'tokens/trainable': 188692, 'epoch': '1.262'}
+ 63%|██████▎   | 303/478 [1:21:47<43:36, 14.95s/it] 64%|██████▎   | 304/478 [1:22:02<43:21, 14.95s/it]                                                   {'loss': '0.1152', 'grad_norm': '4.344', 'learning_rate': '7.091e-06', 'ppl': '1.122', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.93', 'tokens/total': 9728000, 'tokens/trainable': 189377, 'epoch': '1.267'}
+ 64%|██████▎   | 304/478 [1:22:02<43:21, 14.95s/it] 64%|██████▍   | 305/478 [1:22:17<43:06, 14.95s/it]                                                   {'loss': '0.04266', 'grad_norm': '3.094', 'learning_rate': '7.021e-06', 'ppl': '1.044', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.04', 'tokens/total': 9760000, 'tokens/trainable': 189916, 'epoch': '1.271'}
+ 64%|██████▍   | 305/478 [1:22:17<43:06, 14.95s/it] 64%|██████▍   | 306/478 [1:22:32<42:52, 14.95s/it]                                                   {'loss': '0.09497', 'grad_norm': '4.875', 'learning_rate': '6.951e-06', 'ppl': '1.1', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.25', 'tokens/total': 9792000, 'tokens/trainable': 190551, 'epoch': '1.275'}
+ 64%|██████▍   | 306/478 [1:22:32<42:52, 14.95s/it] 64%|██████▍   | 307/478 [1:22:47<42:37, 14.95s/it]                                                   {'loss': '0.08691', 'grad_norm': '3.906', 'learning_rate': '6.882e-06', 'ppl': '1.091', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.1', 'tokens/total': 9824000, 'tokens/trainable': 191211, 'epoch': '1.279'}
+ 64%|██████▍   | 307/478 [1:22:47<42:37, 14.95s/it] 64%|██████▍   | 308/478 [1:23:02<42:22, 14.95s/it]                                                   {'loss': '0.1401', 'grad_norm': '5.781', 'learning_rate': '6.813e-06', 'ppl': '1.15', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '16.63', 'tokens/total': 9856000, 'tokens/trainable': 191708, 'epoch': '1.283'}
+ 64%|██████▍   | 308/478 [1:23:02<42:22, 14.95s/it] 65%|██████▍   | 309/478 [1:23:17<42:07, 14.96s/it]                                                   {'loss': '0.0658', 'grad_norm': '3.594', 'learning_rate': '6.744e-06', 'ppl': '1.068', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.97', 'tokens/total': 9888000, 'tokens/trainable': 192245, 'epoch': '1.288'}
+ 65%|██████▍   | 309/478 [1:23:17<42:07, 14.96s/it] 65%|██████▍   | 310/478 [1:23:32<41:52, 14.96s/it]                                                   {'loss': '0.07153', 'grad_norm': '3.516', 'learning_rate': '6.675e-06', 'ppl': '1.074', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.16', 'tokens/total': 9920000, 'tokens/trainable': 192907, 'epoch': '1.292'}
+ 65%|██████▍   | 310/478 [1:23:32<41:52, 14.96s/it] 65%|██████▌   | 311/478 [1:23:47<41:37, 14.96s/it]                                                   {'loss': '0.1274', 'grad_norm': '4.625', 'learning_rate': '6.606e-06', 'ppl': '1.136', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.87', 'tokens/total': 9952000, 'tokens/trainable': 193590, 'epoch': '1.296'}
+ 65%|██████▌   | 311/478 [1:23:47<41:37, 14.96s/it] 65%|██████▌   | 312/478 [1:24:02<41:23, 14.96s/it]                                                   {'loss': '0.0614', 'grad_norm': '3.797', 'learning_rate': '6.538e-06', 'ppl': '1.063', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.28', 'tokens/total': 9984000, 'tokens/trainable': 194196, 'epoch': '1.3'}
+ 65%|██████▌   | 312/478 [1:24:02<41:23, 14.96s/it] 65%|██████▌   | 313/478 [1:24:17<41:08, 14.96s/it]                                                   {'loss': '0.06641', 'grad_norm': '5.344', 'learning_rate': '6.47e-06', 'ppl': '1.069', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.52', 'tokens/total': 10016000, 'tokens/trainable': 194779, 'epoch': '1.304'}
+ 65%|██████▌   | 313/478 [1:24:17<41:08, 14.96s/it] 66%|██████▌   | 314/478 [1:24:32<40:52, 14.95s/it]                                                   {'loss': '0.09595', 'grad_norm': '5.312', 'learning_rate': '6.402e-06', 'ppl': '1.101', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.09', 'tokens/total': 10048000, 'tokens/trainable': 195408, 'epoch': '1.308'}
+ 66%|██████▌   | 314/478 [1:24:32<40:52, 14.95s/it] 66%|██████▌   | 315/478 [1:24:47<40:37, 14.95s/it]                                                   {'loss': '0.06055', 'grad_norm': '3.203', 'learning_rate': '6.334e-06', 'ppl': '1.062', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.68', 'tokens/total': 10080000, 'tokens/trainable': 195996, 'epoch': '1.312'}
+ 66%|██████▌   | 315/478 [1:24:47<40:37, 14.95s/it] 66%|██████▌   | 316/478 [1:25:02<40:22, 14.95s/it]                                                   {'loss': '0.08887', 'grad_norm': '3.938', 'learning_rate': '6.266e-06', 'ppl': '1.093', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.32', 'tokens/total': 10112000, 'tokens/trainable': 196603, 'epoch': '1.317'}
+ 66%|██████▌   | 316/478 [1:25:02<40:22, 14.95s/it] 66%|██████▋   | 317/478 [1:25:17<40:07, 14.95s/it]                                                   {'loss': '0.1387', 'grad_norm': '6.281', 'learning_rate': '6.198e-06', 'ppl': '1.149', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.02', 'tokens/total': 10144000, 'tokens/trainable': 197171, 'epoch': '1.321'}
+ 66%|██████▋   | 317/478 [1:25:17<40:07, 14.95s/it] 67%|██████▋   | 318/478 [1:25:32<39:52, 14.95s/it]                                                   {'loss': '0.07373', 'grad_norm': '2.984', 'learning_rate': '6.131e-06', 'ppl': '1.077', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22', 'tokens/total': 10176000, 'tokens/trainable': 197828, 'epoch': '1.325'}
+ 67%|██████▋   | 318/478 [1:25:32<39:52, 14.95s/it] 67%|██████▋   | 319/478 [1:25:46<39:37, 14.95s/it]                                                   {'loss': '0.0835', 'grad_norm': '4.781', 'learning_rate': '6.064e-06', 'ppl': '1.087', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.03', 'tokens/total': 10208000, 'tokens/trainable': 198456, 'epoch': '1.329'}
+ 67%|██████▋   | 319/478 [1:25:46<39:37, 14.95s/it] 67%|██████▋   | 320/478 [1:26:01<39:21, 14.94s/it]                                                   {'loss': '0.07983', 'grad_norm': '4.688', 'learning_rate': '5.997e-06', 'ppl': '1.083', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22', 'tokens/total': 10240000, 'tokens/trainable': 199112, 'epoch': '1.333'}
+ 67%|██████▋   | 320/478 [1:26:01<39:21, 14.94s/it] 67%|██████▋   | 321/478 [1:26:16<39:05, 14.94s/it]                                                   {'loss': '0.1011', 'grad_norm': '5.469', 'learning_rate': '5.93e-06', 'ppl': '1.106', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.63', 'tokens/total': 10272000, 'tokens/trainable': 199787, 'epoch': '1.337'}
+ 67%|██████▋   | 321/478 [1:26:16<39:05, 14.94s/it] 67%|██████▋   | 322/478 [1:26:31<38:51, 14.95s/it]                                                   {'loss': '0.0813', 'grad_norm': '4.906', 'learning_rate': '5.864e-06', 'ppl': '1.085', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.55', 'tokens/total': 10304000, 'tokens/trainable': 200401, 'epoch': '1.342'}
+ 67%|██████▋   | 322/478 [1:26:31<38:51, 14.95s/it] 68%|██████▊   | 323/478 [1:26:46<38:36, 14.95s/it]                                                   {'loss': '0.06006', 'grad_norm': '3.531', 'learning_rate': '5.798e-06', 'ppl': '1.062', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '16.24', 'tokens/total': 10336000, 'tokens/trainable': 200886, 'epoch': '1.346'}
+ 68%|██████▊   | 323/478 [1:26:46<38:36, 14.95s/it] 68%|██████▊   | 324/478 [1:27:01<38:22, 14.95s/it]                                                   {'loss': '0.08911', 'grad_norm': '4.469', 'learning_rate': '5.732e-06', 'ppl': '1.093', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.7', 'tokens/total': 10368000, 'tokens/trainable': 201564, 'epoch': '1.35'}
+ 68%|██████▊   | 324/478 [1:27:01<38:22, 14.95s/it] 68%|██████▊   | 325/478 [1:27:16<38:06, 14.95s/it]                                                   {'loss': '0.06714', 'grad_norm': '4.344', 'learning_rate': '5.666e-06', 'ppl': '1.069', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.53', 'tokens/total': 10400000, 'tokens/trainable': 202147, 'epoch': '1.354'}
+ 68%|██████▊   | 325/478 [1:27:16<38:06, 14.95s/it] 68%|██████▊   | 326/478 [1:27:31<37:52, 14.95s/it]                                                   {'loss': '0.1201', 'grad_norm': '5.5', 'learning_rate': '5.6e-06', 'ppl': '1.128', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.44', 'tokens/total': 10432000, 'tokens/trainable': 202668, 'epoch': '1.358'}
+ 68%|██████▊   | 326/478 [1:27:31<37:52, 14.95s/it] 68%|██████▊   | 327/478 [1:27:46<37:37, 14.95s/it]                                                   {'loss': '0.09717', 'grad_norm': '4', 'learning_rate': '5.535e-06', 'ppl': '1.102', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.83', 'tokens/total': 10464000, 'tokens/trainable': 203290, 'epoch': '1.363'}
+ 68%|██████▊   | 327/478 [1:27:46<37:37, 14.95s/it] 69%|██████▊   | 328/478 [1:28:01<37:22, 14.95s/it]                                                   {'loss': '0.08154', 'grad_norm': '4.406', 'learning_rate': '5.47e-06', 'ppl': '1.085', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.24', 'tokens/total': 10496000, 'tokens/trainable': 203984, 'epoch': '1.367'}
+ 69%|██████▊   | 328/478 [1:28:01<37:22, 14.95s/it] 69%|██████▉   | 329/478 [1:28:16<37:07, 14.95s/it]                                                   {'loss': '0.09863', 'grad_norm': '4.188', 'learning_rate': '5.405e-06', 'ppl': '1.104', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.06', 'tokens/total': 10528000, 'tokens/trainable': 204613, 'epoch': '1.371'}
+ 69%|██████▉   | 329/478 [1:28:16<37:07, 14.95s/it] 69%|██████▉   | 330/478 [1:28:31<36:52, 14.95s/it]                                                   {'loss': '0.1023', 'grad_norm': '4.531', 'learning_rate': '5.34e-06', 'ppl': '1.108', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.94', 'tokens/total': 10560000, 'tokens/trainable': 205328, 'epoch': '1.375'}
+ 69%|██████▉   | 330/478 [1:28:31<36:52, 14.95s/it] 69%|██████▉   | 331/478 [1:28:46<36:37, 14.95s/it]                                                   {'loss': '0.125', 'grad_norm': '5.469', 'learning_rate': '5.276e-06', 'ppl': '1.133', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.52', 'tokens/total': 10592000, 'tokens/trainable': 205941, 'epoch': '1.379'}
+ 69%|██████▉   | 331/478 [1:28:46<36:37, 14.95s/it] 69%|██████▉   | 332/478 [1:29:01<36:23, 14.95s/it]                                                   {'loss': '0.09619', 'grad_norm': '4.906', 'learning_rate': '5.212e-06', 'ppl': '1.101', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.62', 'tokens/total': 10624000, 'tokens/trainable': 206497, 'epoch': '1.383'}
+ 69%|██████▉   | 332/478 [1:29:01<36:23, 14.95s/it] 70%|██████▉   | 333/478 [1:29:16<36:08, 14.95s/it]                                                   {'loss': '0.1157', 'grad_norm': '4.875', 'learning_rate': '5.148e-06', 'ppl': '1.123', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22', 'tokens/total': 10656000, 'tokens/trainable': 207154, 'epoch': '1.387'}
+ 70%|██████▉   | 333/478 [1:29:16<36:08, 14.95s/it] 70%|██████▉   | 334/478 [1:29:31<35:52, 14.95s/it]                                                   {'loss': '0.07983', 'grad_norm': '5.969', 'learning_rate': '5.084e-06', 'ppl': '1.083', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.48', 'tokens/total': 10688000, 'tokens/trainable': 207676, 'epoch': '1.392'}
+ 70%|██████▉   | 334/478 [1:29:31<35:52, 14.95s/it] 70%|███████   | 335/478 [1:29:46<35:38, 14.95s/it]                                                   {'loss': '0.1125', 'grad_norm': '5.75', 'learning_rate': '5.021e-06', 'ppl': '1.119', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '16.1', 'tokens/total': 10720000, 'tokens/trainable': 208157, 'epoch': '1.396'}
+ 70%|███████   | 335/478 [1:29:46<35:38, 14.95s/it] 70%|███████   | 336/478 [1:30:01<35:23, 14.95s/it]                                                   {'loss': '0.08423', 'grad_norm': '5.094', 'learning_rate': '4.958e-06', 'ppl': '1.088', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '16.7', 'tokens/total': 10752000, 'tokens/trainable': 208656, 'epoch': '1.4'}
+ 70%|███████   | 336/478 [1:30:01<35:23, 14.95s/it] 71%|███████   | 337/478 [1:30:16<35:08, 14.95s/it]                                                   {'loss': '0.1116', 'grad_norm': '5.281', 'learning_rate': '4.895e-06', 'ppl': '1.118', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.7', 'tokens/total': 10784000, 'tokens/trainable': 209274, 'epoch': '1.404'}
+ 71%|███████   | 337/478 [1:30:16<35:08, 14.95s/it] 71%|███████   | 338/478 [1:30:31<34:53, 14.95s/it]                                                   {'loss': '0.07593', 'grad_norm': '4.062', 'learning_rate': '4.833e-06', 'ppl': '1.079', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.89', 'tokens/total': 10816000, 'tokens/trainable': 209898, 'epoch': '1.408'}
+ 71%|███████   | 338/478 [1:30:31<34:53, 14.95s/it] 71%|███████   | 339/478 [1:30:45<34:39, 14.96s/it]                                                   {'loss': '0.06348', 'grad_norm': '3.125', 'learning_rate': '4.77e-06', 'ppl': '1.066', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.86', 'tokens/total': 10848000, 'tokens/trainable': 210582, 'epoch': '1.413'}
+ 71%|███████   | 339/478 [1:30:46<34:39, 14.96s/it] 71%|███████   | 340/478 [1:31:00<34:24, 14.96s/it]                                                   {'loss': '0.09473', 'grad_norm': '4.094', 'learning_rate': '4.708e-06', 'ppl': '1.099', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.6', 'tokens/total': 10880000, 'tokens/trainable': 211197, 'epoch': '1.417'}
+ 71%|███████   | 340/478 [1:31:00<34:24, 14.96s/it] 71%|███████▏  | 341/478 [1:31:15<34:08, 14.96s/it]                                                   {'loss': '0.1062', 'grad_norm': '5.125', 'learning_rate': '4.647e-06', 'ppl': '1.112', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.61', 'tokens/total': 10912000, 'tokens/trainable': 211753, 'epoch': '1.421'}
+ 71%|███████▏  | 341/478 [1:31:15<34:08, 14.96s/it] 72%|███████▏  | 342/478 [1:31:30<33:54, 14.96s/it]                                                   {'loss': '0.08618', 'grad_norm': '4.469', 'learning_rate': '4.585e-06', 'ppl': '1.09', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.26', 'tokens/total': 10944000, 'tokens/trainable': 212388, 'epoch': '1.425'}
+ 72%|███████▏  | 342/478 [1:31:30<33:54, 14.96s/it] 72%|███████▏  | 343/478 [1:31:45<33:38, 14.96s/it]                                                   {'loss': '0.07764', 'grad_norm': '5.062', 'learning_rate': '4.524e-06', 'ppl': '1.081', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.42', 'tokens/total': 10976000, 'tokens/trainable': 212968, 'epoch': '1.429'}
+ 72%|███████▏  | 343/478 [1:31:45<33:38, 14.96s/it] 72%|███████▏  | 344/478 [1:32:00<33:24, 14.96s/it]                                                   {'loss': '0.1353', 'grad_norm': '5.312', 'learning_rate': '4.463e-06', 'ppl': '1.145', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.88', 'tokens/total': 11008000, 'tokens/trainable': 213592, 'epoch': '1.433'}
+ 72%|███████▏  | 344/478 [1:32:00<33:24, 14.96s/it] 72%|███████▏  | 345/478 [1:32:15<33:08, 14.95s/it]                                                   {'loss': '0.07617', 'grad_norm': '3.844', 'learning_rate': '4.403e-06', 'ppl': '1.079', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.47', 'tokens/total': 11040000, 'tokens/trainable': 214233, 'epoch': '1.438'}
+ 72%|███████▏  | 345/478 [1:32:15<33:08, 14.95s/it] 72%|███████▏  | 346/478 [1:32:30<32:53, 14.95s/it]                                                   {'loss': '0.08276', 'grad_norm': '4.844', 'learning_rate': '4.342e-06', 'ppl': '1.086', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.3', 'tokens/total': 11072000, 'tokens/trainable': 214929, 'epoch': '1.442'}
+ 72%|███████▏  | 346/478 [1:32:30<32:53, 14.95s/it] 73%|███████▎  | 347/478 [1:32:45<32:38, 14.95s/it]                                                   {'loss': '0.1309', 'grad_norm': '5.031', 'learning_rate': '4.282e-06', 'ppl': '1.14', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.93', 'tokens/total': 11104000, 'tokens/trainable': 215554, 'epoch': '1.446'}
+ 73%|███████▎  | 347/478 [1:32:45<32:38, 14.95s/it] 73%|███████▎  | 348/478 [1:33:00<32:23, 14.95s/it]                                                   {'loss': '0.0874', 'grad_norm': '4.156', 'learning_rate': '4.223e-06', 'ppl': '1.091', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.43', 'tokens/total': 11136000, 'tokens/trainable': 216164, 'epoch': '1.45'}
+ 73%|███████▎  | 348/478 [1:33:00<32:23, 14.95s/it] 73%|███████▎  | 349/478 [1:33:15<32:08, 14.95s/it]                                                   {'loss': '0.1606', 'grad_norm': '8', 'learning_rate': '4.164e-06', 'ppl': '1.174', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.89', 'tokens/total': 11168000, 'tokens/trainable': 216728, 'epoch': '1.454'}
+ 73%|███████▎  | 349/478 [1:33:15<32:08, 14.95s/it] 73%|███████▎  | 350/478 [1:33:30<31:53, 14.95s/it]                                                   {'loss': '0.1328', 'grad_norm': '5.344', 'learning_rate': '4.104e-06', 'ppl': '1.142', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.63', 'tokens/total': 11200000, 'tokens/trainable': 217374, 'epoch': '1.458'}
+ 73%|███████▎  | 350/478 [1:33:30<31:53, 14.95s/it] 73%|███████▎  | 351/478 [1:33:45<31:39, 14.95s/it]                                                   {'loss': '0.07727', 'grad_norm': '3.109', 'learning_rate': '4.046e-06', 'ppl': '1.08', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.99', 'tokens/total': 11232000, 'tokens/trainable': 218031, 'epoch': '1.462'}
+ 73%|███████▎  | 351/478 [1:33:45<31:39, 14.95s/it] 74%|███████▎  | 352/478 [1:34:00<31:24, 14.95s/it]                                                   {'loss': '0.1194', 'grad_norm': '5.219', 'learning_rate': '3.987e-06', 'ppl': '1.127', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.88', 'tokens/total': 11264000, 'tokens/trainable': 218595, 'epoch': '1.467'}
+ 74%|███████▎  | 352/478 [1:34:00<31:24, 14.95s/it] 74%|███████▍  | 353/478 [1:34:15<31:06, 14.94s/it]                                                   {'loss': '0.1445', 'grad_norm': '5.031', 'learning_rate': '3.929e-06', 'ppl': '1.155', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.72', 'tokens/total': 11296000, 'tokens/trainable': 219241, 'epoch': '1.471'}
+ 74%|███████▍  | 353/478 [1:34:15<31:06, 14.94s/it] 74%|███████▍  | 354/478 [1:34:30<30:52, 14.94s/it]                                                   {'loss': '0.0697', 'grad_norm': '4.438', 'learning_rate': '3.872e-06', 'ppl': '1.072', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.61', 'tokens/total': 11328000, 'tokens/trainable': 219916, 'epoch': '1.475'}
+ 74%|███████▍  | 354/478 [1:34:30<30:52, 14.94s/it] 74%|███████▍  | 355/478 [1:34:45<30:38, 14.94s/it]                                                   {'loss': '0.1052', 'grad_norm': '5.062', 'learning_rate': '3.814e-06', 'ppl': '1.111', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.15', 'tokens/total': 11360000, 'tokens/trainable': 220518, 'epoch': '1.479'}
+ 74%|███████▍  | 355/478 [1:34:45<30:38, 14.94s/it] 74%|███████▍  | 356/478 [1:35:00<30:23, 14.95s/it]                                                   {'loss': '0.08936', 'grad_norm': '4.531', 'learning_rate': '3.757e-06', 'ppl': '1.093', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.04', 'tokens/total': 11392000, 'tokens/trainable': 221057, 'epoch': '1.483'}
+ 74%|███████▍  | 356/478 [1:35:00<30:23, 14.95s/it] 75%|███████▍  | 357/478 [1:35:15<30:08, 14.95s/it]                                                   {'loss': '0.1221', 'grad_norm': '4.969', 'learning_rate': '3.7e-06', 'ppl': '1.13', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.69', 'tokens/total': 11424000, 'tokens/trainable': 221705, 'epoch': '1.488'}
+ 75%|███████▍  | 357/478 [1:35:15<30:08, 14.95s/it] 75%|███████▍  | 358/478 [1:35:30<29:54, 14.95s/it]                                                   {'loss': '0.09595', 'grad_norm': '4.25', 'learning_rate': '3.644e-06', 'ppl': '1.101', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.76', 'tokens/total': 11456000, 'tokens/trainable': 222355, 'epoch': '1.492'}
+ 75%|███████▍  | 358/478 [1:35:30<29:54, 14.95s/it] 75%|███████▌  | 359/478 [1:35:45<29:39, 14.95s/it]                                                   {'loss': '0.1006', 'grad_norm': '5.406', 'learning_rate': '3.588e-06', 'ppl': '1.106', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.99', 'tokens/total': 11488000, 'tokens/trainable': 223042, 'epoch': '1.496'}
+ 75%|███████▌  | 359/478 [1:35:45<29:39, 14.95s/it] 75%|███████▌  | 360/478 [1:35:59<29:24, 14.95s/it]                                                   {'loss': '0.08984', 'grad_norm': '4.344', 'learning_rate': '3.532e-06', 'ppl': '1.094', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.26', 'tokens/total': 11520000, 'tokens/trainable': 223707, 'epoch': '1.5'}
+ 75%|███████▌  | 360/478 [1:35:59<29:24, 14.95s/it][2026-04-17 03:45:24,126] [INFO] [axolotl.core.trainers.base] Running evaluation step...
+[2026-04-17 03:45:31,509] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [54, 54]
+
+  0%|          | 0/27 [00:00<?, ?it/s]
+  7%|▋         | 2/27 [00:02<00:35,  1.41s/it]
+ 11%|█         | 3/27 [00:05<00:47,  1.97s/it]
+ 15%|█▍        | 4/27 [00:08<00:51,  2.26s/it]
+ 19%|█▊        | 5/27 [00:11<00:53,  2.43s/it]
+ 22%|██▏       | 6/27 [00:13<00:53,  2.54s/it]
+ 26%|██▌       | 7/27 [00:16<00:52,  2.61s/it]
+ 30%|██▉       | 8/27 [00:19<00:50,  2.65s/it]
+ 33%|███▎      | 9/27 [00:22<00:48,  2.68s/it]
+ 37%|███▋      | 10/27 [00:24<00:45,  2.70s/it]
+ 41%|████      | 11/27 [00:27<00:43,  2.72s/it]
+ 44%|████▍     | 12/27 [00:30<00:39,  2.64s/it]
+ 48%|████▊     | 13/27 [00:33<00:38,  2.76s/it]
+ 52%|█████▏    | 14/27 [00:35<00:35,  2.76s/it]
+ 56%|█████▌    | 15/27 [00:38<00:33,  2.76s/it]
+ 59%|█████▉    | 16/27 [00:41<00:30,  2.76s/it]
+ 63%|██████▎   | 17/27 [00:44<00:27,  2.75s/it]
+ 67%|██████▋   | 18/27 [00:46<00:24,  2.75s/it]
+ 70%|███████   | 19/27 [00:49<00:22,  2.75s/it]
+ 74%|███████▍  | 20/27 [00:52<00:19,  2.75s/it]
+ 78%|███████▊  | 21/27 [00:54<00:15,  2.67s/it]
+ 81%|████████▏ | 22/27 [00:57<00:13,  2.78s/it]
+ 85%|████████▌ | 23/27 [01:00<00:11,  2.77s/it]
+ 89%|████████▉ | 24/27 [01:03<00:08,  2.77s/it]
+ 93%|█████████▎| 25/27 [01:06<00:05,  2.76s/it]
+ 96%|█████████▋| 26/27 [01:08<00:02,  2.76s/it]
+100%|██████████| 27/27 [01:11<00:00,  2.79s/it]                                                   
+                                               {'eval_loss': '0.2251', 'eval_runtime': '75.08', 'eval_samples_per_second': '2.784', 'eval_steps_per_second': '1.398', 'eval_ppl': '1.252', 'memory/max_active (GiB)': '34.91', 'memory/max_allocated (GiB)': '34.91', 'memory/device_reserved (GiB)': '57.54', 'epoch': '1.5', 'tokens/train_per_sec_per_gpu': '0'}
+ 75%|███████▌  | 360/478 [1:37:22<29:24, 14.95s/it]
+100%|██████████| 27/27 [01:13<00:00,  2.79s/it]
+                                                76%|███████▌  | 361/478 [1:37:37<1:17:25, 39.71s/it]                                                     {'loss': '0.09912', 'grad_norm': '4.531', 'learning_rate': '3.476e-06', 'ppl': '1.104', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.6', 'tokens/total': 11552000, 'tokens/trainable': 224233, 'epoch': '1.504'}
+ 76%|███████▌  | 361/478 [1:37:37<1:17:25, 39.71s/it] 76%|███████▌  | 362/478 [1:37:52<1:02:24, 32.28s/it]                                                     {'loss': '0.08459', 'grad_norm': '4.062', 'learning_rate': '3.421e-06', 'ppl': '1.088', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.69', 'tokens/total': 11584000, 'tokens/trainable': 224821, 'epoch': '1.508'}
+ 76%|███████▌  | 362/478 [1:37:52<1:02:24, 32.28s/it] 76%|███████▌  | 363/478 [1:38:07<51:55, 27.09s/it]                                                     {'loss': '0.1008', 'grad_norm': '4.812', 'learning_rate': '3.367e-06', 'ppl': '1.106', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.31', 'tokens/total': 11616000, 'tokens/trainable': 225338, 'epoch': '1.512'}
+ 76%|███████▌  | 363/478 [1:38:07<51:55, 27.09s/it] 76%|███████▌  | 364/478 [1:38:22<44:32, 23.45s/it]                                                   {'loss': '0.1174', 'grad_norm': '5.188', 'learning_rate': '3.312e-06', 'ppl': '1.125', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.41', 'tokens/total': 11648000, 'tokens/trainable': 225858, 'epoch': '1.517'}
+ 76%|███████▌  | 364/478 [1:38:22<44:32, 23.45s/it] 76%|███████▋  | 365/478 [1:38:37<39:21, 20.90s/it]                                                   {'loss': '0.1138', 'grad_norm': '5.594', 'learning_rate': '3.258e-06', 'ppl': '1.12', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.13', 'tokens/total': 11680000, 'tokens/trainable': 226459, 'epoch': '1.521'}
+ 76%|███████▋  | 365/478 [1:38:37<39:21, 20.90s/it] 77%|███████▋  | 366/478 [1:38:52<35:40, 19.11s/it]                                                   {'loss': '0.1018', 'grad_norm': '4.125', 'learning_rate': '3.205e-06', 'ppl': '1.107', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.9', 'tokens/total': 11712000, 'tokens/trainable': 227113, 'epoch': '1.525'}
+ 77%|███████▋  | 366/478 [1:38:52<35:40, 19.11s/it] 77%|███████▋  | 367/478 [1:39:07<33:02, 17.86s/it]                                                   {'loss': '0.09009', 'grad_norm': '4.031', 'learning_rate': '3.151e-06', 'ppl': '1.094', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.33', 'tokens/total': 11744000, 'tokens/trainable': 227750, 'epoch': '1.529'}
+ 77%|███████▋  | 367/478 [1:39:07<33:02, 17.86s/it] 77%|███████▋  | 368/478 [1:39:22<31:09, 16.99s/it]                                                   {'loss': '0.09216', 'grad_norm': '4.25', 'learning_rate': '3.098e-06', 'ppl': '1.097', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.14', 'tokens/total': 11776000, 'tokens/trainable': 228292, 'epoch': '1.533'}
+ 77%|███████▋  | 368/478 [1:39:22<31:09, 16.99s/it] 77%|███████▋  | 369/478 [1:39:37<29:45, 16.38s/it]                                                   {'loss': '0.115', 'grad_norm': '5.594', 'learning_rate': '3.046e-06', 'ppl': '1.122', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.96', 'tokens/total': 11808000, 'tokens/trainable': 228888, 'epoch': '1.538'}
+ 77%|███████▋  | 369/478 [1:39:37<29:45, 16.38s/it] 77%|███████▋  | 370/478 [1:39:51<28:42, 15.95s/it]                                                   {'loss': '0.09424', 'grad_norm': '4.344', 'learning_rate': '2.994e-06', 'ppl': '1.099', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20', 'tokens/total': 11840000, 'tokens/trainable': 229485, 'epoch': '1.542'}
+ 77%|███████▋  | 370/478 [1:39:52<28:42, 15.95s/it] 78%|███████▊  | 371/478 [1:40:06<27:54, 15.65s/it]                                                   {'loss': '0.1067', 'grad_norm': '6.469', 'learning_rate': '2.942e-06', 'ppl': '1.113', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.93', 'tokens/total': 11872000, 'tokens/trainable': 230020, 'epoch': '1.546'}
+ 78%|███████▊  | 371/478 [1:40:06<27:54, 15.65s/it] 78%|███████▊  | 372/478 [1:40:21<27:16, 15.44s/it]                                                   {'loss': '0.1021', 'grad_norm': '5.469', 'learning_rate': '2.89e-06', 'ppl': '1.107', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.49', 'tokens/total': 11904000, 'tokens/trainable': 230572, 'epoch': '1.55'}
+ 78%|███████▊  | 372/478 [1:40:21<27:16, 15.44s/it] 78%|███████▊  | 373/478 [1:40:36<26:44, 15.28s/it]                                                   {'loss': '0.1387', 'grad_norm': '4.656', 'learning_rate': '2.839e-06', 'ppl': '1.149', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.25', 'tokens/total': 11936000, 'tokens/trainable': 231265, 'epoch': '1.554'}
+ 78%|███████▊  | 373/478 [1:40:36<26:44, 15.28s/it] 78%|███████▊  | 374/478 [1:40:51<26:20, 15.20s/it]                                                   {'loss': '0.137', 'grad_norm': '5.344', 'learning_rate': '2.789e-06', 'ppl': '1.147', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.97', 'tokens/total': 11968000, 'tokens/trainable': 231953, 'epoch': '1.558'}
+ 78%|███████▊  | 374/478 [1:40:51<26:20, 15.20s/it] 78%|███████▊  | 375/478 [1:41:06<25:57, 15.12s/it]                                                   {'loss': '0.09937', 'grad_norm': '3.984', 'learning_rate': '2.738e-06', 'ppl': '1.104', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.44', 'tokens/total': 12000000, 'tokens/trainable': 232623, 'epoch': '1.562'}
+ 78%|███████▊  | 375/478 [1:41:06<25:57, 15.12s/it] 79%|███████▊  | 376/478 [1:41:21<25:37, 15.07s/it]                                                   {'loss': '0.116', 'grad_norm': '4.875', 'learning_rate': '2.688e-06', 'ppl': '1.123', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.19', 'tokens/total': 12032000, 'tokens/trainable': 233226, 'epoch': '1.567'}
+ 79%|███████▊  | 376/478 [1:41:21<25:37, 15.07s/it] 79%|███████▉  | 377/478 [1:41:36<25:17, 15.03s/it]                                                   {'loss': '0.1116', 'grad_norm': '5.531', 'learning_rate': '2.639e-06', 'ppl': '1.118', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.79', 'tokens/total': 12064000, 'tokens/trainable': 233846, 'epoch': '1.571'}
+ 79%|███████▉  | 377/478 [1:41:36<25:17, 15.03s/it] 79%|███████▉  | 378/478 [1:41:51<25:00, 15.01s/it]                                                   {'loss': '0.1494', 'grad_norm': '6.625', 'learning_rate': '2.59e-06', 'ppl': '1.161', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.32', 'tokens/total': 12096000, 'tokens/trainable': 234423, 'epoch': '1.575'}
+ 79%|███████▉  | 378/478 [1:41:51<25:00, 15.01s/it] 79%|███████▉  | 379/478 [1:42:06<24:43, 14.99s/it]                                                   {'loss': '0.07104', 'grad_norm': '3.75', 'learning_rate': '2.541e-06', 'ppl': '1.074', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.13', 'tokens/total': 12128000, 'tokens/trainable': 235024, 'epoch': '1.579'}
+ 79%|███████▉  | 379/478 [1:42:06<24:43, 14.99s/it] 79%|███████▉  | 380/478 [1:42:21<24:27, 14.98s/it]                                                   {'loss': '0.09961', 'grad_norm': '5.688', 'learning_rate': '2.493e-06', 'ppl': '1.105', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '16.68', 'tokens/total': 12160000, 'tokens/trainable': 235522, 'epoch': '1.583'}
+ 79%|███████▉  | 380/478 [1:42:21<24:27, 14.98s/it] 80%|███████▉  | 381/478 [1:42:36<24:10, 14.95s/it]                                                   {'loss': '0.1062', 'grad_norm': '4.531', 'learning_rate': '2.445e-06', 'ppl': '1.112', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.69', 'tokens/total': 12192000, 'tokens/trainable': 236167, 'epoch': '1.587'}
+ 80%|███████▉  | 381/478 [1:42:36<24:10, 14.95s/it] 80%|███████▉  | 382/478 [1:42:51<23:55, 14.95s/it]                                                   {'loss': '0.1431', 'grad_norm': '5.75', 'learning_rate': '2.397e-06', 'ppl': '1.154', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.84', 'tokens/total': 12224000, 'tokens/trainable': 236819, 'epoch': '1.592'}
+ 80%|███████▉  | 382/478 [1:42:51<23:55, 14.95s/it] 80%|████████  | 383/478 [1:43:06<23:40, 14.95s/it]                                                   {'loss': '0.1384', 'grad_norm': '5.062', 'learning_rate': '2.35e-06', 'ppl': '1.148', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.09', 'tokens/total': 12256000, 'tokens/trainable': 237449, 'epoch': '1.596'}
+ 80%|████████  | 383/478 [1:43:06<23:40, 14.95s/it] 80%|████████  | 384/478 [1:43:21<23:25, 14.95s/it]                                                   {'loss': '0.09216', 'grad_norm': '4.125', 'learning_rate': '2.303e-06', 'ppl': '1.097', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.75', 'tokens/total': 12288000, 'tokens/trainable': 238009, 'epoch': '1.6'}
+ 80%|████████  | 384/478 [1:43:21<23:25, 14.95s/it] 81%|████████  | 385/478 [1:43:36<23:10, 14.95s/it]                                                   {'loss': '0.09546', 'grad_norm': '4.75', 'learning_rate': '2.257e-06', 'ppl': '1.1', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.33', 'tokens/total': 12320000, 'tokens/trainable': 238586, 'epoch': '1.604'}
+ 81%|████████  | 385/478 [1:43:36<23:10, 14.95s/it] 81%|████████  | 386/478 [1:43:51<22:54, 14.94s/it]                                                   {'loss': '0.06152', 'grad_norm': '3.656', 'learning_rate': '2.211e-06', 'ppl': '1.063', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.85', 'tokens/total': 12352000, 'tokens/trainable': 239148, 'epoch': '1.608'}
+ 81%|████████  | 386/478 [1:43:51<22:54, 14.94s/it] 81%|████████  | 387/478 [1:44:06<22:39, 14.94s/it]                                                   {'loss': '0.09692', 'grad_norm': '4.656', 'learning_rate': '2.165e-06', 'ppl': '1.102', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.66', 'tokens/total': 12384000, 'tokens/trainable': 239765, 'epoch': '1.613'}
+ 81%|████████  | 387/478 [1:44:06<22:39, 14.94s/it] 81%|████████  | 388/478 [1:44:20<22:25, 14.95s/it]                                                   {'loss': '0.1025', 'grad_norm': '4.562', 'learning_rate': '2.12e-06', 'ppl': '1.108', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.3', 'tokens/total': 12416000, 'tokens/trainable': 240431, 'epoch': '1.617'}
+ 81%|████████  | 388/478 [1:44:21<22:25, 14.95s/it] 81%|████████▏ | 389/478 [1:44:35<22:10, 14.95s/it]                                                   {'loss': '0.09546', 'grad_norm': '4.188', 'learning_rate': '2.076e-06', 'ppl': '1.1', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.79', 'tokens/total': 12448000, 'tokens/trainable': 241022, 'epoch': '1.621'}
+ 81%|████████▏ | 389/478 [1:44:35<22:10, 14.95s/it] 82%|████████▏ | 390/478 [1:44:50<21:55, 14.95s/it]                                                   {'loss': '0.07373', 'grad_norm': '3.828', 'learning_rate': '2.031e-06', 'ppl': '1.077', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.79', 'tokens/total': 12480000, 'tokens/trainable': 241583, 'epoch': '1.625'}
+ 82%|████████▏ | 390/478 [1:44:50<21:55, 14.95s/it] 82%|████████▏ | 391/478 [1:45:05<21:40, 14.95s/it]                                                   {'loss': '0.09595', 'grad_norm': '3.969', 'learning_rate': '1.988e-06', 'ppl': '1.101', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.89', 'tokens/total': 12512000, 'tokens/trainable': 242147, 'epoch': '1.629'}
+ 82%|████████▏ | 391/478 [1:45:05<21:40, 14.95s/it] 82%|████████▏ | 392/478 [1:45:20<21:25, 14.95s/it]                                                   {'loss': '0.1494', 'grad_norm': '5.188', 'learning_rate': '1.944e-06', 'ppl': '1.161', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.65', 'tokens/total': 12544000, 'tokens/trainable': 242823, 'epoch': '1.633'}
+ 82%|████████▏ | 392/478 [1:45:20<21:25, 14.95s/it] 82%|████████▏ | 393/478 [1:45:35<21:10, 14.95s/it]                                                   {'loss': '0.1069', 'grad_norm': '5.5', 'learning_rate': '1.901e-06', 'ppl': '1.113', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.85', 'tokens/total': 12576000, 'tokens/trainable': 243505, 'epoch': '1.637'}
+ 82%|████████▏ | 393/478 [1:45:35<21:10, 14.95s/it] 82%|████████▏ | 394/478 [1:45:50<20:55, 14.95s/it]                                                   {'loss': '0.08936', 'grad_norm': '5.531', 'learning_rate': '1.859e-06', 'ppl': '1.093', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.29', 'tokens/total': 12608000, 'tokens/trainable': 244111, 'epoch': '1.642'}
+ 82%|████████▏ | 394/478 [1:45:50<20:55, 14.95s/it] 83%|████████▎ | 395/478 [1:46:05<20:40, 14.95s/it]                                                   {'loss': '0.1116', 'grad_norm': '4.312', 'learning_rate': '1.817e-06', 'ppl': '1.118', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.6', 'tokens/total': 12640000, 'tokens/trainable': 244696, 'epoch': '1.646'}
+ 83%|████████▎ | 395/478 [1:46:05<20:40, 14.95s/it] 83%|████████▎ | 396/478 [1:46:20<20:25, 14.94s/it]                                                   {'loss': '0.1196', 'grad_norm': '4.875', 'learning_rate': '1.775e-06', 'ppl': '1.127', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.28', 'tokens/total': 12672000, 'tokens/trainable': 245390, 'epoch': '1.65'}
+ 83%|████████▎ | 396/478 [1:46:20<20:25, 14.94s/it] 83%|████████▎ | 397/478 [1:46:35<20:10, 14.95s/it]                                                   {'loss': '0.1062', 'grad_norm': '3.922', 'learning_rate': '1.734e-06', 'ppl': '1.112', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.29', 'tokens/total': 12704000, 'tokens/trainable': 246056, 'epoch': '1.654'}
+ 83%|████████▎ | 397/478 [1:46:35<20:10, 14.95s/it] 83%|████████▎ | 398/478 [1:46:50<19:55, 14.95s/it]                                                   {'loss': '0.124', 'grad_norm': '4.469', 'learning_rate': '1.693e-06', 'ppl': '1.132', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.6', 'tokens/total': 12736000, 'tokens/trainable': 246701, 'epoch': '1.658'}
+ 83%|████████▎ | 398/478 [1:46:50<19:55, 14.95s/it] 83%|████████▎ | 399/478 [1:47:05<19:40, 14.95s/it]                                                   {'loss': '0.07373', 'grad_norm': '4.344', 'learning_rate': '1.653e-06', 'ppl': '1.077', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.28', 'tokens/total': 12768000, 'tokens/trainable': 247247, 'epoch': '1.663'}
+ 83%|████████▎ | 399/478 [1:47:05<19:40, 14.95s/it] 84%|████████▎ | 400/478 [1:47:20<19:25, 14.95s/it]                                                   {'loss': '0.08398', 'grad_norm': '3.781', 'learning_rate': '1.613e-06', 'ppl': '1.088', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.24', 'tokens/total': 12800000, 'tokens/trainable': 247941, 'epoch': '1.667'}
+ 84%|████████▎ | 400/478 [1:47:20<19:25, 14.95s/it] 84%|████████▍ | 401/478 [1:47:35<19:11, 14.95s/it]                                                   {'loss': '0.09668', 'grad_norm': '5.562', 'learning_rate': '1.573e-06', 'ppl': '1.102', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.41', 'tokens/total': 12832000, 'tokens/trainable': 248461, 'epoch': '1.671'}
+ 84%|████████▍ | 401/478 [1:47:35<19:11, 14.95s/it] 84%|████████▍ | 402/478 [1:47:50<18:56, 14.95s/it]                                                   {'loss': '0.1016', 'grad_norm': '4.344', 'learning_rate': '1.534e-06', 'ppl': '1.107', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.1', 'tokens/total': 12864000, 'tokens/trainable': 249091, 'epoch': '1.675'}
+ 84%|████████▍ | 402/478 [1:47:50<18:56, 14.95s/it] 84%|████████▍ | 403/478 [1:48:05<18:41, 14.95s/it]                                                   {'loss': '0.127', 'grad_norm': '5.969', 'learning_rate': '1.496e-06', 'ppl': '1.135', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.63', 'tokens/total': 12896000, 'tokens/trainable': 249737, 'epoch': '1.679'}
+ 84%|████████▍ | 403/478 [1:48:05<18:41, 14.95s/it] 85%|████████▍ | 404/478 [1:48:20<18:26, 14.95s/it]                                                   {'loss': '0.09387', 'grad_norm': '4.5', 'learning_rate': '1.457e-06', 'ppl': '1.098', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '16.68', 'tokens/total': 12928000, 'tokens/trainable': 250235, 'epoch': '1.683'}
+ 85%|████████▍ | 404/478 [1:48:20<18:26, 14.95s/it] 85%|████████▍ | 405/478 [1:48:35<18:11, 14.95s/it]                                                   {'loss': '0.1504', 'grad_norm': '6.469', 'learning_rate': '1.42e-06', 'ppl': '1.162', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.6', 'tokens/total': 12960000, 'tokens/trainable': 250910, 'epoch': '1.688'}
+ 85%|████████▍ | 405/478 [1:48:35<18:11, 14.95s/it] 85%|████████▍ | 406/478 [1:48:50<17:56, 14.95s/it]                                                   {'loss': '0.08569', 'grad_norm': '5.438', 'learning_rate': '1.383e-06', 'ppl': '1.089', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.58', 'tokens/total': 12992000, 'tokens/trainable': 251465, 'epoch': '1.692'}
+ 85%|████████▍ | 406/478 [1:48:50<17:56, 14.95s/it] 85%|████████▌ | 407/478 [1:49:04<17:41, 14.94s/it]                                                   {'loss': '0.1079', 'grad_norm': '4.562', 'learning_rate': '1.346e-06', 'ppl': '1.114', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.03', 'tokens/total': 13024000, 'tokens/trainable': 252092, 'epoch': '1.696'}
+ 85%|████████▌ | 407/478 [1:49:05<17:41, 14.94s/it] 85%|████████▌ | 408/478 [1:49:19<17:26, 14.94s/it]                                                   {'loss': '0.105', 'grad_norm': '4.719', 'learning_rate': '1.31e-06', 'ppl': '1.111', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.23', 'tokens/total': 13056000, 'tokens/trainable': 252666, 'epoch': '1.7'}
+ 85%|████████▌ | 408/478 [1:49:19<17:26, 14.94s/it] 86%|████████▌ | 409/478 [1:49:34<17:11, 14.95s/it]                                                   {'loss': '0.1016', 'grad_norm': '5.094', 'learning_rate': '1.274e-06', 'ppl': '1.107', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.03', 'tokens/total': 13088000, 'tokens/trainable': 253294, 'epoch': '1.704'}
+ 86%|████████▌ | 409/478 [1:49:34<17:11, 14.95s/it] 86%|████████▌ | 410/478 [1:49:49<16:56, 14.95s/it]                                                   {'loss': '0.1023', 'grad_norm': '4.938', 'learning_rate': '1.238e-06', 'ppl': '1.108', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.95', 'tokens/total': 13120000, 'tokens/trainable': 253920, 'epoch': '1.708'}
+ 86%|████████▌ | 410/478 [1:49:49<16:56, 14.95s/it] 86%|████████▌ | 411/478 [1:50:04<16:42, 14.96s/it]                                                   {'loss': '0.1167', 'grad_norm': '5.812', 'learning_rate': '1.203e-06', 'ppl': '1.124', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.77', 'tokens/total': 13152000, 'tokens/trainable': 254541, 'epoch': '1.712'}
+ 86%|████████▌ | 411/478 [1:50:04<16:42, 14.96s/it] 86%|████████▌ | 412/478 [1:50:19<16:26, 14.95s/it]                                                   {'loss': '0.1777', 'grad_norm': '5.562', 'learning_rate': '1.169e-06', 'ppl': '1.195', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.07', 'tokens/total': 13184000, 'tokens/trainable': 255200, 'epoch': '1.717'}
+ 86%|████████▌ | 412/478 [1:50:19<16:26, 14.95s/it] 86%|████████▋ | 413/478 [1:50:34<16:12, 14.95s/it]                                                   {'loss': '0.1094', 'grad_norm': '4.25', 'learning_rate': '1.135e-06', 'ppl': '1.116', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.79', 'tokens/total': 13216000, 'tokens/trainable': 255821, 'epoch': '1.721'}
+ 86%|████████▋ | 413/478 [1:50:34<16:12, 14.95s/it] 87%|████████▋ | 414/478 [1:50:49<15:57, 14.96s/it]                                                   {'loss': '0.1216', 'grad_norm': '5', 'learning_rate': '1.102e-06', 'ppl': '1.129', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '24.83', 'tokens/total': 13248000, 'tokens/trainable': 256563, 'epoch': '1.725'}
+ 87%|████████▋ | 414/478 [1:50:49<15:57, 14.96s/it] 87%|████████▋ | 415/478 [1:51:04<15:42, 14.95s/it]                                                   {'loss': '0.1398', 'grad_norm': '5', 'learning_rate': '1.069e-06', 'ppl': '1.15', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21', 'tokens/total': 13280000, 'tokens/trainable': 257190, 'epoch': '1.729'}
+ 87%|████████▋ | 415/478 [1:51:04<15:42, 14.95s/it] 87%|████████▋ | 416/478 [1:51:19<15:26, 14.95s/it]                                                   {'loss': '0.1499', 'grad_norm': '5.219', 'learning_rate': '1.036e-06', 'ppl': '1.162', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '24.32', 'tokens/total': 13312000, 'tokens/trainable': 257915, 'epoch': '1.733'}
+ 87%|████████▋ | 416/478 [1:51:19<15:26, 14.95s/it] 87%|████████▋ | 417/478 [1:51:34<15:11, 14.95s/it]                                                   {'loss': '0.1335', 'grad_norm': '5.312', 'learning_rate': '1.004e-06', 'ppl': '1.143', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.39', 'tokens/total': 13344000, 'tokens/trainable': 258524, 'epoch': '1.738'}
+ 87%|████████▋ | 417/478 [1:51:34<15:11, 14.95s/it] 87%|████████▋ | 418/478 [1:51:49<14:56, 14.95s/it]                                                   {'loss': '0.1001', 'grad_norm': '3.969', 'learning_rate': '9.723e-07', 'ppl': '1.105', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.87', 'tokens/total': 13376000, 'tokens/trainable': 259207, 'epoch': '1.742'}
+ 87%|████████▋ | 418/478 [1:51:49<14:56, 14.95s/it] 88%|████████▊ | 419/478 [1:52:04<14:41, 14.95s/it]                                                   {'loss': '0.09399', 'grad_norm': '4.562', 'learning_rate': '9.412e-07', 'ppl': '1.099', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.58', 'tokens/total': 13408000, 'tokens/trainable': 259732, 'epoch': '1.746'}
+ 88%|████████▊ | 419/478 [1:52:04<14:41, 14.95s/it] 88%|████████▊ | 420/478 [1:52:19<14:26, 14.95s/it]                                                   {'loss': '0.08618', 'grad_norm': '4.438', 'learning_rate': '9.106e-07', 'ppl': '1.09', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.46', 'tokens/total': 13440000, 'tokens/trainable': 260313, 'epoch': '1.75'}
+ 88%|████████▊ | 420/478 [1:52:19<14:26, 14.95s/it] 88%|████████▊ | 421/478 [1:52:34<14:12, 14.95s/it]                                                   {'loss': '0.1147', 'grad_norm': '4.719', 'learning_rate': '8.804e-07', 'ppl': '1.122', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.85', 'tokens/total': 13472000, 'tokens/trainable': 260936, 'epoch': '1.754'}
+ 88%|████████▊ | 421/478 [1:52:34<14:12, 14.95s/it] 88%|████████▊ | 422/478 [1:52:49<13:57, 14.95s/it]                                                   {'loss': '0.08655', 'grad_norm': '4.156', 'learning_rate': '8.508e-07', 'ppl': '1.09', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.58', 'tokens/total': 13504000, 'tokens/trainable': 261491, 'epoch': '1.758'}
+ 88%|████████▊ | 422/478 [1:52:49<13:57, 14.95s/it] 88%|████████▊ | 423/478 [1:53:04<13:42, 14.95s/it]                                                   {'loss': '0.07715', 'grad_norm': '3.297', 'learning_rate': '8.216e-07', 'ppl': '1.08', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.03', 'tokens/total': 13536000, 'tokens/trainable': 262059, 'epoch': '1.762'}
+ 88%|████████▊ | 423/478 [1:53:04<13:42, 14.95s/it] 89%|████████▊ | 424/478 [1:53:19<13:27, 14.95s/it]                                                   {'loss': '0.1002', 'grad_norm': '4.531', 'learning_rate': '7.929e-07', 'ppl': '1.105', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.1', 'tokens/total': 13568000, 'tokens/trainable': 262689, 'epoch': '1.767'}
+ 89%|████████▊ | 424/478 [1:53:19<13:27, 14.95s/it] 89%|████████▉ | 425/478 [1:53:34<13:12, 14.95s/it]                                                   {'loss': '0.08423', 'grad_norm': '4.406', 'learning_rate': '7.647e-07', 'ppl': '1.088', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.76', 'tokens/total': 13600000, 'tokens/trainable': 263219, 'epoch': '1.771'}
+ 89%|████████▉ | 425/478 [1:53:34<13:12, 14.95s/it] 89%|████████▉ | 426/478 [1:53:49<12:57, 14.95s/it]                                                   {'loss': '0.1345', 'grad_norm': '6.062', 'learning_rate': '7.37e-07', 'ppl': '1.144', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.27', 'tokens/total': 13632000, 'tokens/trainable': 263884, 'epoch': '1.775'}
+ 89%|████████▉ | 426/478 [1:53:49<12:57, 14.95s/it] 89%|████████▉ | 427/478 [1:54:04<12:42, 14.95s/it]                                                   {'loss': '0.1025', 'grad_norm': '4.938', 'learning_rate': '7.098e-07', 'ppl': '1.108', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.23', 'tokens/total': 13664000, 'tokens/trainable': 264488, 'epoch': '1.779'}
+ 89%|████████▉ | 427/478 [1:54:04<12:42, 14.95s/it] 90%|████████▉ | 428/478 [1:54:18<12:27, 14.95s/it]                                                   {'loss': '0.07617', 'grad_norm': '3.781', 'learning_rate': '6.83e-07', 'ppl': '1.079', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.44', 'tokens/total': 13696000, 'tokens/trainable': 265128, 'epoch': '1.783'}
+ 90%|████████▉ | 428/478 [1:54:18<12:27, 14.95s/it] 90%|████████▉ | 429/478 [1:54:33<12:12, 14.95s/it]                                                   {'loss': '0.1064', 'grad_norm': '4.469', 'learning_rate': '6.568e-07', 'ppl': '1.112', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21', 'tokens/total': 13728000, 'tokens/trainable': 265755, 'epoch': '1.788'}
+ 90%|████████▉ | 429/478 [1:54:33<12:12, 14.95s/it] 90%|████████▉ | 430/478 [1:54:48<11:57, 14.95s/it]                                                   {'loss': '0.09375', 'grad_norm': '4.375', 'learning_rate': '6.311e-07', 'ppl': '1.098', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.77', 'tokens/total': 13760000, 'tokens/trainable': 266405, 'epoch': '1.792'}
+ 90%|████████▉ | 430/478 [1:54:48<11:57, 14.95s/it] 90%|█████████ | 431/478 [1:55:03<11:42, 14.95s/it]                                                   {'loss': '0.1077', 'grad_norm': '4.219', 'learning_rate': '6.058e-07', 'ppl': '1.114', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.04', 'tokens/total': 13792000, 'tokens/trainable': 267093, 'epoch': '1.796'}
+ 90%|█████████ | 431/478 [1:55:03<11:42, 14.95s/it] 90%|█████████ | 432/478 [1:55:18<11:27, 14.95s/it]                                                   {'loss': '0.1057', 'grad_norm': '4.594', 'learning_rate': '5.811e-07', 'ppl': '1.111', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.54', 'tokens/total': 13824000, 'tokens/trainable': 267736, 'epoch': '1.8'}
+ 90%|█████████ | 432/478 [1:55:18<11:27, 14.95s/it] 91%|█████████ | 433/478 [1:55:33<11:12, 14.95s/it]                                                   {'loss': '0.1113', 'grad_norm': '5.094', 'learning_rate': '5.569e-07', 'ppl': '1.118', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.17', 'tokens/total': 13856000, 'tokens/trainable': 268398, 'epoch': '1.804'}
+ 91%|█████████ | 433/478 [1:55:33<11:12, 14.95s/it] 91%|█████████ | 434/478 [1:55:48<10:56, 14.93s/it]                                                   {'loss': '0.1201', 'grad_norm': '4.969', 'learning_rate': '5.331e-07', 'ppl': '1.128', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.31', 'tokens/total': 13888000, 'tokens/trainable': 269061, 'epoch': '1.808'}
+ 91%|█████████ | 434/478 [1:55:48<10:56, 14.93s/it] 91%|█████████ | 435/478 [1:56:03<10:42, 14.93s/it]                                                   {'loss': '0.104', 'grad_norm': '4.406', 'learning_rate': '5.099e-07', 'ppl': '1.11', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.69', 'tokens/total': 13920000, 'tokens/trainable': 269738, 'epoch': '1.812'}
+ 91%|█████████ | 435/478 [1:56:03<10:42, 14.93s/it] 91%|█████████ | 436/478 [1:56:18<10:27, 14.93s/it]                                                   {'loss': '0.1233', 'grad_norm': '4.281', 'learning_rate': '4.872e-07', 'ppl': '1.131', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.51', 'tokens/total': 13952000, 'tokens/trainable': 270409, 'epoch': '1.817'}
+ 91%|█████████ | 436/478 [1:56:18<10:27, 14.93s/it] 91%|█████████▏| 437/478 [1:56:33<10:12, 14.93s/it]                                                   {'loss': '0.1187', 'grad_norm': '5.219', 'learning_rate': '4.65e-07', 'ppl': '1.126', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.09', 'tokens/total': 13984000, 'tokens/trainable': 270979, 'epoch': '1.821'}
+ 91%|█████████▏| 437/478 [1:56:33<10:12, 14.93s/it] 92%|█████████▏| 438/478 [1:56:48<09:57, 14.94s/it]                                                   {'loss': '0.1045', 'grad_norm': '5', 'learning_rate': '4.432e-07', 'ppl': '1.11', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.7', 'tokens/total': 14016000, 'tokens/trainable': 271627, 'epoch': '1.825'}
+ 92%|█████████▏| 438/478 [1:56:48<09:57, 14.94s/it] 92%|█████████▏| 439/478 [1:57:03<09:44, 15.00s/it]                                                   {'loss': '0.1008', 'grad_norm': '5', 'learning_rate': '4.22e-07', 'ppl': '1.106', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.12', 'tokens/total': 14048000, 'tokens/trainable': 272205, 'epoch': '1.829'}
+ 92%|█████████▏| 439/478 [1:57:03<09:44, 15.00s/it] 92%|█████████▏| 440/478 [1:57:18<09:29, 14.98s/it]                                                   {'loss': '0.1465', 'grad_norm': '5.156', 'learning_rate': '4.013e-07', 'ppl': '1.158', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.87', 'tokens/total': 14080000, 'tokens/trainable': 272828, 'epoch': '1.833'}
+ 92%|█████████▏| 440/478 [1:57:18<09:29, 14.98s/it] 92%|█████████▏| 441/478 [1:57:33<09:13, 14.97s/it]                                                   {'loss': '0.1189', 'grad_norm': '4.219', 'learning_rate': '3.812e-07', 'ppl': '1.126', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.87', 'tokens/total': 14112000, 'tokens/trainable': 273451, 'epoch': '1.837'}
+ 92%|█████████▏| 441/478 [1:57:33<09:13, 14.97s/it] 92%|█████████▏| 442/478 [1:57:48<08:58, 14.97s/it]                                                   {'loss': '0.09351', 'grad_norm': '4.188', 'learning_rate': '3.615e-07', 'ppl': '1.098', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.96', 'tokens/total': 14144000, 'tokens/trainable': 274017, 'epoch': '1.842'}
+ 92%|█████████▏| 442/478 [1:57:48<08:58, 14.97s/it] 93%|█████████▎| 443/478 [1:58:03<08:43, 14.96s/it]                                                   {'loss': '0.1255', 'grad_norm': '4.656', 'learning_rate': '3.423e-07', 'ppl': '1.134', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.1', 'tokens/total': 14176000, 'tokens/trainable': 274647, 'epoch': '1.846'}
+ 93%|█████████▎| 443/478 [1:58:03<08:43, 14.96s/it] 93%|█████████▎| 444/478 [1:58:18<08:28, 14.96s/it]                                                   {'loss': '0.1572', 'grad_norm': '5.25', 'learning_rate': '3.237e-07', 'ppl': '1.17', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.21', 'tokens/total': 14208000, 'tokens/trainable': 275340, 'epoch': '1.85'}
+ 93%|█████████▎| 444/478 [1:58:18<08:28, 14.96s/it] 93%|█████████▎| 445/478 [1:58:33<08:13, 14.95s/it]                                                   {'loss': '0.0918', 'grad_norm': '3.75', 'learning_rate': '3.055e-07', 'ppl': '1.096', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.47', 'tokens/total': 14240000, 'tokens/trainable': 275980, 'epoch': '1.854'}
+ 93%|█████████▎| 445/478 [1:58:33<08:13, 14.95s/it] 93%|█████████▎| 446/478 [1:58:48<07:58, 14.95s/it]                                                   {'loss': '0.07227', 'grad_norm': '3.5', 'learning_rate': '2.879e-07', 'ppl': '1.075', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.79', 'tokens/total': 14272000, 'tokens/trainable': 276631, 'epoch': '1.858'}
+ 93%|█████████▎| 446/478 [1:58:48<07:58, 14.95s/it] 94%|█████████▎| 447/478 [1:59:03<07:43, 14.95s/it]                                                   {'loss': '0.137', 'grad_norm': '4.969', 'learning_rate': '2.708e-07', 'ppl': '1.147', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.74', 'tokens/total': 14304000, 'tokens/trainable': 277340, 'epoch': '1.863'}
+ 94%|█████████▎| 447/478 [1:59:03<07:43, 14.95s/it] 94%|█████████▎| 448/478 [1:59:17<07:28, 14.95s/it]                                                   {'loss': '0.1152', 'grad_norm': '4.75', 'learning_rate': '2.542e-07', 'ppl': '1.122', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.96', 'tokens/total': 14336000, 'tokens/trainable': 277936, 'epoch': '1.867'}
+ 94%|█████████▎| 448/478 [1:59:17<07:28, 14.95s/it] 94%|█████████▍| 449/478 [1:59:32<07:12, 14.93s/it]                                                   {'loss': '0.167', 'grad_norm': '5.812', 'learning_rate': '2.381e-07', 'ppl': '1.182', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.15', 'tokens/total': 14368000, 'tokens/trainable': 278624, 'epoch': '1.871'}
+ 94%|█████████▍| 449/478 [1:59:32<07:12, 14.93s/it] 94%|█████████▍| 450/478 [1:59:47<06:58, 14.93s/it]                                                   {'loss': '0.09229', 'grad_norm': '4.312', 'learning_rate': '2.226e-07', 'ppl': '1.097', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.07', 'tokens/total': 14400000, 'tokens/trainable': 279253, 'epoch': '1.875'}
+ 94%|█████████▍| 450/478 [1:59:47<06:58, 14.93s/it] 94%|█████████▍| 451/478 [2:00:02<06:43, 14.93s/it]                                                   {'loss': '0.0874', 'grad_norm': '3.5', 'learning_rate': '2.076e-07', 'ppl': '1.091', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.39', 'tokens/total': 14432000, 'tokens/trainable': 279921, 'epoch': '1.879'}
+ 94%|█████████▍| 451/478 [2:00:02<06:43, 14.93s/it] 95%|█████████▍| 452/478 [2:00:17<06:27, 14.92s/it]                                                   {'loss': '0.1311', 'grad_norm': '4.469', 'learning_rate': '1.93e-07', 'ppl': '1.14', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.63', 'tokens/total': 14464000, 'tokens/trainable': 280564, 'epoch': '1.883'}
+ 95%|█████████▍| 452/478 [2:00:17<06:27, 14.92s/it] 95%|█████████▍| 453/478 [2:00:32<06:13, 14.93s/it]                                                   {'loss': '0.1213', 'grad_norm': '4.562', 'learning_rate': '1.79e-07', 'ppl': '1.129', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.21', 'tokens/total': 14496000, 'tokens/trainable': 281227, 'epoch': '1.887'}
+ 95%|█████████▍| 453/478 [2:00:32<06:13, 14.93s/it] 95%|█████████▍| 454/478 [2:00:47<05:58, 14.94s/it]                                                   {'loss': '0.1208', 'grad_norm': '5.375', 'learning_rate': '1.656e-07', 'ppl': '1.128', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '16.85', 'tokens/total': 14528000, 'tokens/trainable': 281731, 'epoch': '1.892'}
+ 95%|█████████▍| 454/478 [2:00:47<05:58, 14.94s/it] 95%|█████████▌| 455/478 [2:01:02<05:43, 14.94s/it]                                                   {'loss': '0.1101', 'grad_norm': '4.938', 'learning_rate': '1.526e-07', 'ppl': '1.116', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.3', 'tokens/total': 14560000, 'tokens/trainable': 282337, 'epoch': '1.896'}
+ 95%|█████████▌| 455/478 [2:01:02<05:43, 14.94s/it] 95%|█████████▌| 456/478 [2:01:17<05:28, 14.94s/it]                                                   {'loss': '0.1323', 'grad_norm': '4.969', 'learning_rate': '1.402e-07', 'ppl': '1.141', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.47', 'tokens/total': 14592000, 'tokens/trainable': 282978, 'epoch': '1.9'}
+ 95%|█████████▌| 456/478 [2:01:17<05:28, 14.94s/it] 96%|█████████▌| 457/478 [2:01:32<05:13, 14.94s/it]                                                   {'loss': '0.1028', 'grad_norm': '4.594', 'learning_rate': '1.283e-07', 'ppl': '1.108', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.3', 'tokens/total': 14624000, 'tokens/trainable': 283614, 'epoch': '1.904'}
+ 96%|█████████▌| 457/478 [2:01:32<05:13, 14.94s/it] 96%|█████████▌| 458/478 [2:01:47<04:58, 14.94s/it]                                                   {'loss': '0.1025', 'grad_norm': '4.156', 'learning_rate': '1.169e-07', 'ppl': '1.108', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.45', 'tokens/total': 14656000, 'tokens/trainable': 284283, 'epoch': '1.908'}
+ 96%|█████████▌| 458/478 [2:01:47<04:58, 14.94s/it] 96%|█████████▌| 459/478 [2:02:02<04:43, 14.94s/it]                                                   {'loss': '0.0918', 'grad_norm': '4.281', 'learning_rate': '1.061e-07', 'ppl': '1.096', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.04', 'tokens/total': 14688000, 'tokens/trainable': 284941, 'epoch': '1.913'}
+ 96%|█████████▌| 459/478 [2:02:02<04:43, 14.94s/it] 96%|█████████▌| 460/478 [2:02:17<04:28, 14.94s/it]                                                   {'loss': '0.08789', 'grad_norm': '4.656', 'learning_rate': '9.575e-08', 'ppl': '1.092', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.68', 'tokens/total': 14720000, 'tokens/trainable': 285618, 'epoch': '1.917'}
+ 96%|█████████▌| 460/478 [2:02:17<04:28, 14.94s/it] 96%|█████████▋| 461/478 [2:02:32<04:13, 14.93s/it]                                                   {'loss': '0.1421', 'grad_norm': '5.469', 'learning_rate': '8.595e-08', 'ppl': '1.153', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.9', 'tokens/total': 14752000, 'tokens/trainable': 286330, 'epoch': '1.921'}
+ 96%|█████████▋| 461/478 [2:02:32<04:13, 14.93s/it] 97%|█████████▋| 462/478 [2:02:47<03:58, 14.94s/it]                                                   {'loss': '0.1292', 'grad_norm': '4.75', 'learning_rate': '7.668e-08', 'ppl': '1.138', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.47', 'tokens/total': 14784000, 'tokens/trainable': 286941, 'epoch': '1.925'}
+ 97%|█████████▋| 462/478 [2:02:47<03:58, 14.94s/it] 97%|█████████▋| 463/478 [2:03:02<03:46, 15.13s/it]                                                   {'loss': '0.1533', 'grad_norm': '5.406', 'learning_rate': '6.793e-08', 'ppl': '1.166', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.2', 'tokens/total': 14816000, 'tokens/trainable': 287574, 'epoch': '1.929'}
+ 97%|█████████▋| 463/478 [2:03:02<03:46, 15.13s/it] 97%|█████████▋| 464/478 [2:03:17<03:31, 15.08s/it]                                                   {'loss': '0.09521', 'grad_norm': '3.891', 'learning_rate': '5.971e-08', 'ppl': '1.1', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.9', 'tokens/total': 14848000, 'tokens/trainable': 288198, 'epoch': '1.933'}
+ 97%|█████████▋| 464/478 [2:03:17<03:31, 15.08s/it] 97%|█████████▋| 465/478 [2:03:32<03:15, 15.04s/it]                                                   {'loss': '0.1111', 'grad_norm': '4.344', 'learning_rate': '5.202e-08', 'ppl': '1.117', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.87', 'tokens/total': 14880000, 'tokens/trainable': 288851, 'epoch': '1.938'}
+ 97%|█████████▋| 465/478 [2:03:32<03:15, 15.04s/it] 97%|█████████▋| 466/478 [2:03:47<03:00, 15.07s/it]                                                   {'loss': '0.1223', 'grad_norm': '5.156', 'learning_rate': '4.486e-08', 'ppl': '1.13', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.42', 'tokens/total': 14912000, 'tokens/trainable': 289439, 'epoch': '1.942'}
+ 97%|█████████▋| 466/478 [2:03:47<03:00, 15.07s/it] 98%|█████████▊| 467/478 [2:04:02<02:45, 15.04s/it]                                                   {'loss': '0.1265', 'grad_norm': '4.75', 'learning_rate': '3.823e-08', 'ppl': '1.135', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.47', 'tokens/total': 14944000, 'tokens/trainable': 290110, 'epoch': '1.946'}
+ 98%|█████████▊| 467/478 [2:04:02<02:45, 15.04s/it] 98%|█████████▊| 468/478 [2:04:17<02:30, 15.01s/it]                                                   {'loss': '0.1294', 'grad_norm': '5.344', 'learning_rate': '3.213e-08', 'ppl': '1.138', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.03', 'tokens/total': 14976000, 'tokens/trainable': 290738, 'epoch': '1.95'}
+ 98%|█████████▊| 468/478 [2:04:17<02:30, 15.01s/it] 98%|█████████▊| 469/478 [2:04:32<02:14, 14.99s/it]                                                   {'loss': '0.1489', 'grad_norm': '5.562', 'learning_rate': '2.655e-08', 'ppl': '1.161', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.81', 'tokens/total': 15008000, 'tokens/trainable': 291270, 'epoch': '1.954'}
+ 98%|█████████▊| 469/478 [2:04:32<02:14, 14.99s/it] 98%|█████████▊| 470/478 [2:04:47<01:59, 14.98s/it]                                                   {'loss': '0.1387', 'grad_norm': '5.312', 'learning_rate': '2.151e-08', 'ppl': '1.149', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.5', 'tokens/total': 15040000, 'tokens/trainable': 291882, 'epoch': '1.958'}
+ 98%|█████████▊| 470/478 [2:04:47<01:59, 14.98s/it] 99%|█████████▊| 471/478 [2:05:02<01:44, 14.97s/it]                                                   {'loss': '0.1108', 'grad_norm': '4.594', 'learning_rate': '1.7e-08', 'ppl': '1.117', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.23', 'tokens/total': 15072000, 'tokens/trainable': 292516, 'epoch': '1.962'}
+ 99%|█████████▊| 471/478 [2:05:02<01:44, 14.97s/it] 99%|█████████▊| 472/478 [2:05:17<01:29, 14.96s/it]                                                   {'loss': '0.1416', 'grad_norm': '5.562', 'learning_rate': '1.301e-08', 'ppl': '1.152', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.82', 'tokens/total': 15104000, 'tokens/trainable': 293048, 'epoch': '1.967'}
+ 99%|█████████▊| 472/478 [2:05:17<01:29, 14.96s/it] 99%|█████████▉| 473/478 [2:05:32<01:14, 14.96s/it]                                                   {'loss': '0.1084', 'grad_norm': '4.75', 'learning_rate': '9.562e-09', 'ppl': '1.114', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.33', 'tokens/total': 15136000, 'tokens/trainable': 293625, 'epoch': '1.971'}
+ 99%|█████████▉| 473/478 [2:05:32<01:14, 14.96s/it] 99%|█████████▉| 474/478 [2:05:47<00:59, 14.95s/it]                                                   {'loss': '0.1045', 'grad_norm': '4.344', 'learning_rate': '6.641e-09', 'ppl': '1.11', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.11', 'tokens/total': 15168000, 'tokens/trainable': 294314, 'epoch': '1.975'}
+ 99%|█████████▉| 474/478 [2:05:47<00:59, 14.95s/it] 99%|█████████▉| 475/478 [2:06:02<00:44, 14.95s/it]                                                   {'loss': '0.1055', 'grad_norm': '4.812', 'learning_rate': '4.25e-09', 'ppl': '1.111', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.31', 'tokens/total': 15200000, 'tokens/trainable': 294920, 'epoch': '1.979'}
+ 99%|█████████▉| 475/478 [2:06:02<00:44, 14.95s/it]100%|█████████▉| 476/478 [2:06:17<00:29, 14.95s/it]                                                   {'loss': '0.1462', 'grad_norm': '5.719', 'learning_rate': '2.391e-09', 'ppl': '1.157', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.54', 'tokens/total': 15232000, 'tokens/trainable': 295593, 'epoch': '1.983'}
+100%|█████████▉| 476/478 [2:06:17<00:29, 14.95s/it]100%|█████████▉| 477/478 [2:06:32<00:14, 14.95s/it]                                                   {'loss': '0.1807', 'grad_norm': '6.406', 'learning_rate': '1.063e-09', 'ppl': '1.198', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.9', 'tokens/total': 15264000, 'tokens/trainable': 296247, 'epoch': '1.988'}
+100%|█████████▉| 477/478 [2:06:32<00:14, 14.95s/it]100%|██████████| 478/478 [2:06:47<00:00, 14.95s/it]                                                   {'loss': '0.1331', 'grad_norm': '5.906', 'learning_rate': '2.657e-10', 'ppl': '1.142', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.63', 'tokens/total': 15296000, 'tokens/trainable': 296833, 'epoch': '1.992'}
+100%|██████████| 478/478 [2:06:47<00:00, 14.95s/it][2026-04-17 04:16:11,186] [INFO] [axolotl.core.trainers.base] Running evaluation step...
+[2026-04-17 04:16:19,065] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [54, 54]
+
+  0%|          | 0/27 [00:00<?, ?it/s]
+  7%|▋         | 2/27 [00:02<00:35,  1.41s/it]
+ 11%|█         | 3/27 [00:05<00:47,  1.96s/it]
+ 15%|█▍        | 4/27 [00:08<00:51,  2.26s/it]
+ 19%|█▊        | 5/27 [00:11<00:53,  2.43s/it]
+ 22%|██▏       | 6/27 [00:13<00:53,  2.54s/it]
+ 26%|██▌       | 7/27 [00:16<00:52,  2.61s/it]
+ 30%|██▉       | 8/27 [00:19<00:50,  2.65s/it]
+ 33%|███▎      | 9/27 [00:22<00:48,  2.68s/it]
+ 37%|███▋      | 10/27 [00:24<00:45,  2.70s/it]
+ 41%|████      | 11/27 [00:27<00:43,  2.72s/it]
+ 44%|████▍     | 12/27 [00:30<00:39,  2.64s/it]
+ 48%|████▊     | 13/27 [00:33<00:38,  2.76s/it]
+ 52%|█████▏    | 14/27 [00:35<00:35,  2.76s/it]
+ 56%|█████▌    | 15/27 [00:38<00:33,  2.76s/it]
+ 59%|█████▉    | 16/27 [00:41<00:30,  2.76s/it]
+ 63%|██████▎   | 17/27 [00:44<00:27,  2.75s/it]
+ 67%|██████▋   | 18/27 [00:46<00:24,  2.75s/it]
+ 70%|███████   | 19/27 [00:49<00:22,  2.75s/it]
+ 74%|███████▍  | 20/27 [00:52<00:19,  2.75s/it]
+ 78%|███████▊  | 21/27 [00:54<00:16,  2.67s/it]
+ 81%|████████▏ | 22/27 [00:57<00:13,  2.78s/it]
+ 85%|████████▌ | 23/27 [01:00<00:11,  2.77s/it]
+ 89%|████████▉ | 24/27 [01:03<00:08,  2.77s/it]
+ 93%|█████████▎| 25/27 [01:06<00:05,  2.76s/it]
+ 96%|█████████▋| 26/27 [01:08<00:02,  2.76s/it]
+100%|██████████| 27/27 [01:11<00:00,  2.79s/it]                                                   
+                                               {'eval_loss': '0.2227', 'eval_runtime': '75.1', 'eval_samples_per_second': '2.783', 'eval_steps_per_second': '1.398', 'eval_ppl': '1.249', 'memory/max_active (GiB)': '34.91', 'memory/max_allocated (GiB)': '34.91', 'memory/device_reserved (GiB)': '57.25', 'epoch': '1.992', 'tokens/train_per_sec_per_gpu': '0'}
+100%|██████████| 478/478 [2:08:10<00:00, 14.95s/it]
+100%|██████████| 27/27 [01:13<00:00,  2.79s/it]
+                                               [2026-04-17 04:17:40,668] [INFO] [axolotl.core.trainers.base] Saving model checkpoint to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/checkpoint-478
+
+Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]
+Writing model shards: 100%|██████████| 1/1 [00:20<00:00, 20.91s/it]Writing model shards: 100%|██████████| 1/1 [00:20<00:00, 20.91s/it]
+                                                   {'train_runtime': '7816', 'train_samples_per_second': '0.122', 'train_steps_per_second': '0.061', 'train_loss': '0.1648', 'memory/max_active (GiB)': '12.01', 'memory/max_allocated (GiB)': '12.01', 'memory/device_reserved (GiB)': '12.11', 'epoch': '1.992', 'tokens/train_per_sec_per_gpu': '0'}
+100%|██████████| 478/478 [2:10:13<00:00, 14.95s/it]100%|██████████| 478/478 [2:10:13<00:00, 16.35s/it]
+[2026-04-17 04:20:34,131] [INFO] [axolotl.train] Training completed! Saving trained model to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/.
+[2026-04-17 04:20:39,898] [INFO] [axolotl.core.trainers.base] Saving model checkpoint to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/
+Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]Writing model shards: 100%|██████████| 1/1 [00:22<00:00, 22.07s/it]Writing model shards: 100%|██████████| 1/1 [00:22<00:00, 22.07s/it]
+[2026-04-17 04:21:08,433] [INFO] [axolotl.core.trainers.base] Saving model checkpoint to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/
+Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]Writing model shards: 100%|██████████| 1/1 [00:20<00:00, 20.34s/it]Writing model shards: 100%|██████████| 1/1 [00:20<00:00, 20.35s/it]
+Processing Files (0 / 0)      : |          |  0.00B /  0.00B            
+New Data Upload               : |          |  0.00B /  0.00B            
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:   2%|▏         |  184MB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:   2%|▏         |  184MB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :   2%|▏         |  195MB / 8.83GB,   ???B/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:   4%|▍         |  336MB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :   4%|▍         |  347MB / 8.83GB,  758MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:   5%|▌         |  464MB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :   5%|▌         |  475MB / 8.83GB,  701MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:   7%|▋         |  576MB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :   7%|▋         |  587MB / 8.83GB,  653MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:   8%|▊         |  680MB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :   8%|▊         |  691MB / 8.83GB,  620MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  10%|█         |  912MB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  11%|█         |  976MB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  13%|█▎        | 1.12GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  13%|█▎        | 1.13GB / 8.83GB,  668MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  14%|█▍        | 1.22GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  14%|█▍        | 1.23GB / 8.83GB,  645MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  15%|█▍        | 1.30GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  15%|█▍        | 1.32GB / 8.83GB,  622MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  16%|█▌        | 1.40GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  16%|█▌        | 1.41GB / 8.83GB,  608MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  17%|█▋        | 1.51GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  17%|█▋        | 1.52GB / 8.83GB,  604MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  18%|█▊        | 1.61GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  18%|█▊        | 1.62GB / 8.83GB,  593MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  19%|█▉        | 1.70GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  19%|█▉        | 1.72GB / 8.83GB,  585MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  21%|██        | 1.82GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  21%|██        | 1.83GB / 8.83GB,  583MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  22%|██▏       | 1.92GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  22%|██▏       | 1.93GB / 8.83GB,  579MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  23%|██▎       | 2.02GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  23%|██▎       | 2.04GB / 8.83GB,  575MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  24%|██▍       | 2.11GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  24%|██▍       | 2.12GB / 8.83GB,  567MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  25%|██▌       | 2.21GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  25%|██▌       | 2.22GB / 8.83GB,  562MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  26%|██▌       | 2.30GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  26%|██▌       | 2.32GB / 8.83GB,  558MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  27%|██▋       | 2.41GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  27%|██▋       | 2.42GB / 8.83GB,  556MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  28%|██▊       | 2.50GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  28%|██▊       | 2.52GB / 8.83GB,  552MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  30%|██▉       | 2.62GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  30%|██▉       | 2.63GB / 8.83GB,  553MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  31%|███       | 2.71GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  31%|███       | 2.72GB / 8.83GB,  550MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  32%|███▏      | 2.82GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  32%|███▏      | 2.83GB / 8.83GB,  548MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  33%|███▎      | 2.91GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  33%|███▎      | 2.92GB / 8.83GB,  546MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  34%|███▍      | 3.01GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  34%|███▍      | 3.02GB / 8.83GB,  543MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  35%|███▌      | 3.11GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  35%|███▌      | 3.12GB / 8.83GB,  542MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  37%|███▋      | 3.22GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  37%|███▋      | 3.24GB / 8.83GB,  543MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  38%|███▊      | 3.33GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  38%|███▊      | 3.34GB / 8.83GB,  542MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  39%|███▉      | 3.43GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  39%|███▉      | 3.44GB / 8.83GB,  541MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  40%|████      | 3.54GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  40%|████      | 3.56GB / 8.83GB,  542MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  41%|████▏     | 3.65GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  41%|████▏     | 3.66GB / 8.83GB,  541MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  43%|████▎     | 3.76GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  43%|████▎     | 3.77GB / 8.83GB,  542MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  44%|████▍     | 3.86GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  44%|████▍     | 3.88GB / 8.83GB,  541MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  45%|████▌     | 3.98GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  45%|████▌     | 4.00GB / 8.83GB,  543MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  47%|████▋     | 4.10GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  47%|████▋     | 4.12GB / 8.83GB,  544MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  48%|████▊     | 4.22GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  48%|████▊     | 4.23GB / 8.83GB,  545MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  49%|████▉     | 4.34GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  49%|████▉     | 4.36GB / 8.83GB,  547MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  51%|█████     | 4.46GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  51%|█████     | 4.48GB / 8.83GB,  549MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  52%|█████▏    | 4.58GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  52%|█████▏    | 4.59GB / 8.83GB,  549MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  53%|█████▎    | 4.70GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  53%|█████▎    | 4.71GB / 8.83GB,  550MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  54%|█████▍    | 4.81GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  55%|█████▍    | 4.82GB / 8.83GB,  550MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  56%|█████▌    | 4.93GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  56%|█████▌    | 4.94GB / 8.83GB,  552MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  57%|█████▋    | 5.05GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  57%|█████▋    | 5.06GB / 8.83GB,  553MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  58%|█████▊    | 5.16GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  59%|█████▊    | 5.17GB / 8.83GB,  553MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  60%|█████▉    | 5.28GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  60%|█████▉    | 5.29GB / 8.83GB,  554MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  61%|██████    | 5.39GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  61%|██████    | 5.40GB / 8.83GB,  554MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  62%|██████▏   | 5.51GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  63%|██████▎   | 5.52GB / 8.83GB,  555MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  64%|██████▎   | 5.62GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  64%|██████▍   | 5.64GB / 8.83GB,  555MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  65%|██████▌   | 5.74GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  65%|██████▌   | 5.76GB / 8.83GB,  556MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  66%|██████▋   | 5.86GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  67%|██████▋   | 5.88GB / 8.83GB,  557MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  68%|██████▊   | 5.98GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  68%|██████▊   | 5.99GB / 8.83GB,  553MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  69%|██████▉   | 6.10GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  69%|██████▉   | 6.11GB / 8.83GB,  552MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  70%|███████   | 6.22GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  70%|███████   | 6.23GB / 8.83GB,  553MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  72%|███████▏  | 6.34GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  72%|███████▏  | 6.35GB / 8.83GB,  554MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  73%|███████▎  | 6.46GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  73%|███████▎  | 6.47GB / 8.83GB,  550MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  75%|███████▍  | 6.58GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  75%|███████▍  | 6.59GB / 8.83GB,  549MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  76%|███████▌  | 6.70GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  76%|███████▌  | 6.71GB / 8.83GB,  547MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  77%|███████▋  | 6.81GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  77%|███████▋  | 6.82GB / 8.83GB,  548MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  79%|███████▊  | 6.93GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  79%|███████▊  | 6.94GB / 8.83GB,  551MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  80%|███████▉  | 7.05GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  80%|███████▉  | 7.06GB / 8.83GB,  554MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  81%|████████▏ | 7.18GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  81%|████████▏ | 7.19GB / 8.83GB,  555MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  83%|████████▎ | 7.29GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  83%|████████▎ | 7.30GB / 8.83GB,  557MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  84%|████████▍ | 7.42GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  84%|████████▍ | 7.43GB / 8.83GB,  560MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  85%|████████▌ | 7.53GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  85%|████████▌ | 7.54GB / 8.83GB,  560MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  87%|████████▋ | 7.65GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  87%|████████▋ | 7.66GB / 8.83GB,  562MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  88%|████████▊ | 7.77GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  88%|████████▊ | 7.78GB / 8.83GB,  563MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  89%|████████▉ | 7.89GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  89%|████████▉ | 7.90GB / 8.83GB,  566MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  91%|█████████ | 8.01GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  91%|█████████ | 8.02GB / 8.83GB,  569MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  92%|█████████▏| 8.13GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  92%|█████████▏| 8.14GB / 8.83GB,  571MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  93%|█████████▎| 8.24GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  93%|█████████▎| 8.25GB / 8.83GB,  572MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  95%|█████████▌| 8.38GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  95%|█████████▌| 8.40GB / 8.83GB,  576MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  97%|█████████▋| 8.52GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  97%|█████████▋| 8.53GB / 8.83GB,  579MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  98%|█████████▊| 8.63GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  98%|█████████▊| 8.64GB / 8.83GB,  580MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  99%|█████████▉| 8.75GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  99%|█████████▉| 8.76GB / 8.83GB,  582MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (3 / 3)      : 100%|██████████| 8.83GB / 8.83GB,  580MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (3 / 3)      : 100%|██████████| 8.83GB / 8.83GB,  506MB/s  
+New Data Upload               : |          |  0.00B /  0.00B,  0.00B/s  
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            
+[2026-04-17 04:21:54,742] [INFO] [axolotl.train] Model successfully saved to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/
+[2026-04-17 04:22:00,408] [INFO] [axolotl.core.trainers.base] Saving model checkpoint to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/
+Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]Writing model shards: 100%|██████████| 1/1 [00:22<00:00, 22.09s/it]Writing model shards: 100%|██████████| 1/1 [00:22<00:00, 22.09s/it]
+Processing Files (0 / 0)      : |          |  0.00B /  0.00B            
+New Data Upload               : |          |  0.00B /  0.00B            
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:   2%|▏         |  160MB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:   2%|▏         |  160MB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :   2%|▏         |  171MB / 8.83GB,   ???B/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:   3%|▎         |  248MB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :   3%|▎         |  259MB / 8.83GB,  440MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:   4%|▍         |  352MB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :   4%|▍         |  363MB / 8.83GB,  480MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:   5%|▌         |  464MB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :   5%|▌         |  475MB / 8.83GB,  507MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:   7%|▋         |  576MB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :   7%|▋         |  587MB / 8.83GB,  520MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:   8%|▊         |  680MB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :   8%|▊         |  691MB / 8.83GB,  520MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:   9%|▉         |  784MB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :   9%|▉         |  795MB / 8.83GB,  520MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  10%|█         |  888MB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  10%|█         |  899MB / 8.83GB,  520MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  11%|█         |  984MB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  11%|█▏        |  995MB / 8.83GB,  515MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  12%|█▏        | 1.09GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  12%|█▏        | 1.10GB / 8.83GB,  515MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  13%|█▎        | 1.18GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  14%|█▎        | 1.20GB / 8.83GB,  512MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  15%|█▍        | 1.29GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  15%|█▍        | 1.30GB / 8.83GB,  513MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  16%|█▌        | 1.39GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  16%|█▌        | 1.40GB / 8.83GB,  513MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  17%|█▋        | 1.50GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  17%|█▋        | 1.51GB / 8.83GB,  514MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  18%|█▊        | 1.61GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  18%|█▊        | 1.62GB / 8.83GB,  517MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  19%|█▉        | 1.72GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  20%|█▉        | 1.73GB / 8.83GB,  520MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  21%|██        | 1.82GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  21%|██        | 1.83GB / 8.83GB,  517MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  22%|██▏       | 1.91GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  22%|██▏       | 1.92GB / 8.83GB,  515MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  23%|██▎       | 2.01GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  23%|██▎       | 2.02GB / 8.83GB,  513MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  24%|██▍       | 2.11GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  24%|██▍       | 2.12GB / 8.83GB,  514MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  25%|██▌       | 2.21GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  25%|██▌       | 2.22GB / 8.83GB,  512MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  26%|██▌       | 2.30GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  26%|██▌       | 2.32GB / 8.83GB,  511MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  27%|██▋       | 2.40GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  27%|██▋       | 2.41GB / 8.83GB,  509MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  28%|██▊       | 2.50GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  28%|██▊       | 2.52GB / 8.83GB,  510MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  29%|██▉       | 2.60GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  30%|██▉       | 2.61GB / 8.83GB,  508MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  31%|███       | 2.72GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  31%|███       | 2.73GB / 8.83GB,  512MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  32%|███▏      | 2.84GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  32%|███▏      | 2.85GB / 8.83GB,  515MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  33%|███▎      | 2.94GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  33%|███▎      | 2.96GB / 8.83GB,  516MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  35%|███▍      | 3.06GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  35%|███▍      | 3.07GB / 8.83GB,  517MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  36%|███▌      | 3.16GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  36%|███▌      | 3.17GB / 8.83GB,  517MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  37%|███▋      | 3.27GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  37%|███▋      | 3.28GB / 8.83GB,  519MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  38%|███▊      | 3.38GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  38%|███▊      | 3.39GB / 8.83GB,  519MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  40%|███▉      | 3.49GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  40%|███▉      | 3.50GB / 8.83GB,  520MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  41%|████      | 3.60GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  41%|████      | 3.61GB / 8.83GB,  521MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  42%|████▏     | 3.73GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  42%|████▏     | 3.74GB / 8.83GB,  525MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  44%|████▎     | 3.84GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  44%|████▎     | 3.85GB / 8.83GB,  526MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  45%|████▍     | 3.96GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  45%|████▍     | 3.97GB / 8.83GB,  528MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  46%|████▌     | 4.07GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  46%|████▌     | 4.08GB / 8.83GB,  529MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  48%|████▊     | 4.19GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  48%|████▊     | 4.20GB / 8.83GB,  531MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  49%|████▉     | 4.34GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  49%|████▉     | 4.36GB / 8.83GB,  536MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  51%|█████     | 4.46GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  51%|█████     | 4.47GB / 8.83GB,  537MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  52%|█████▏    | 4.58GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  52%|█████▏    | 4.59GB / 8.83GB,  539MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  53%|█████▎    | 4.69GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  53%|█████▎    | 4.70GB / 8.83GB,  539MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  55%|█████▍    | 4.82GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  55%|█████▍    | 4.83GB / 8.83GB,  541MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  56%|█████▌    | 4.94GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  56%|█████▌    | 4.95GB / 8.83GB,  543MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  57%|█████▋    | 5.06GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  57%|█████▋    | 5.07GB / 8.83GB,  544MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  59%|█████▊    | 5.17GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  59%|█████▊    | 5.18GB / 8.83GB,  544MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  60%|█████▉    | 5.29GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  60%|█████▉    | 5.30GB / 8.83GB,  546MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  61%|██████    | 5.40GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  61%|██████▏   | 5.41GB / 8.83GB,  546MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  63%|██████▎   | 5.54GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  63%|██████▎   | 5.55GB / 8.83GB,  549MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  64%|██████▍   | 5.66GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  64%|██████▍   | 5.68GB / 8.83GB,  550MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  66%|██████▌   | 5.78GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  66%|██████▌   | 5.80GB / 8.83GB,  551MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  67%|██████▋   | 5.90GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  67%|██████▋   | 5.92GB / 8.83GB,  555MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  68%|██████▊   | 6.02GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  68%|██████▊   | 6.03GB / 8.83GB,  555MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  70%|██████▉   | 6.14GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  70%|██████▉   | 6.15GB / 8.83GB,  556MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  71%|███████   | 6.26GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  71%|███████   | 6.27GB / 8.83GB,  557MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  72%|███████▏  | 6.38GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  72%|███████▏  | 6.39GB / 8.83GB,  558MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  74%|███████▎  | 6.49GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  74%|███████▎  | 6.50GB / 8.83GB,  559MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  75%|███████▍  | 6.61GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  75%|███████▍  | 6.62GB / 8.83GB,  561MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  76%|███████▌  | 6.72GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  76%|███████▌  | 6.73GB / 8.83GB,  562MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  78%|███████▊  | 6.84GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  78%|███████▊  | 6.85GB / 8.83GB,  564MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  79%|███████▉  | 6.95GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  79%|███████▉  | 6.96GB / 8.83GB,  566MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  80%|████████  | 7.06GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  80%|████████  | 7.08GB / 8.83GB,  566MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  82%|████████▏ | 7.19GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  82%|████████▏ | 7.20GB / 8.83GB,  569MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  90%|█████████ | 7.98GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  91%|█████████ | 7.99GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  91%|█████████ | 7.99GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  91%|█████████ | 8.00GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  92%|█████████▏| 8.12GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  92%|█████████▏| 8.13GB / 8.83GB,  609MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  94%|█████████▎| 8.26GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  94%|█████████▎| 8.27GB / 8.83GB,  613MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  95%|█████████▌| 8.41GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  95%|█████████▌| 8.42GB / 8.83GB,  617MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  97%|█████████▋| 8.53GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  97%|█████████▋| 8.54GB / 8.83GB,  620MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  98%|█████████▊| 8.64GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  98%|█████████▊| 8.65GB / 8.83GB,  621MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors:  99%|█████████▉| 8.76GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  99%|█████████▉| 8.77GB / 8.83GB,  624MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (3 / 3)      : 100%|██████████| 8.83GB / 8.83GB,  620MB/s  
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            
+
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+
+
+  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            
+
+
+
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (3 / 3)      : 100%|██████████| 8.83GB / 8.83GB,  577MB/s  
+New Data Upload               : |          |  0.00B /  0.00B,  0.00B/s  
+  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
+  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            
+  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            
diff --git a/training_args.bin b/training_args.bin
new file mode 100644
index 0000000..b877b5a
--- /dev/null
+++ b/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e35ace8ebd6eb47f46c9f2ae726e7bee9b52ecff4cb1c196eb1476bb2bdc7140
+size 11985